diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,41711 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9998487674547563, + "eval_steps": 500, + "global_step": 29754, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1782.354232835545, + "learning_rate": 3.3602150537634413e-09, + "loss": 17.1486, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1724.0339589676632, + "learning_rate": 1.6801075268817205e-08, + "loss": 16.7835, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 1700.514961570837, + "learning_rate": 3.360215053763441e-08, + "loss": 16.6795, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 1570.7438431775615, + "learning_rate": 5.040322580645161e-08, + "loss": 16.2985, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 1602.1277018435649, + "learning_rate": 6.720430107526882e-08, + "loss": 16.4868, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 1499.0619348435034, + "learning_rate": 8.400537634408603e-08, + "loss": 15.7543, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 1438.121413693714, + "learning_rate": 1.0080645161290322e-07, + "loss": 14.2784, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 1205.847939284157, + "learning_rate": 1.1760752688172043e-07, + "loss": 13.4497, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 1081.138946537131, + "learning_rate": 1.3440860215053764e-07, + "loss": 11.4945, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 1122.9535780227677, + "learning_rate": 1.5120967741935485e-07, + "loss": 10.4741, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 915.8074736571381, + "learning_rate": 1.6801075268817206e-07, + "loss": 9.3298, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 485.2026272196928, + "learning_rate": 1.8481182795698927e-07, + "loss": 7.5624, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 413.4948329618694, + "learning_rate": 2.0161290322580645e-07, + "loss": 6.5282, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 341.22189353250485, + "learning_rate": 2.1841397849462368e-07, + "loss": 5.8548, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 314.30197693693736, + "learning_rate": 2.3521505376344087e-07, + "loss": 5.4052, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 219.09034450096308, + "learning_rate": 2.520161290322581e-07, + "loss": 4.9707, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 243.8000015899658, + "learning_rate": 2.688172043010753e-07, + "loss": 4.7539, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 158.48368358483174, + "learning_rate": 2.856182795698925e-07, + "loss": 4.4491, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 177.99221919611344, + "learning_rate": 3.024193548387097e-07, + "loss": 4.0891, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 147.33085776107916, + "learning_rate": 3.192204301075269e-07, + "loss": 3.8303, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 118.84914037685353, + "learning_rate": 3.360215053763441e-07, + "loss": 3.5773, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 115.65208604223525, + "learning_rate": 3.528225806451614e-07, + "loss": 3.2914, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 117.31300158152648, + "learning_rate": 3.6962365591397853e-07, + "loss": 2.9765, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 105.60650363440705, + "learning_rate": 3.8642473118279574e-07, + "loss": 3.0645, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 97.2173603475482, + "learning_rate": 4.032258064516129e-07, + "loss": 2.6624, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 77.426266243747, + "learning_rate": 4.200268817204301e-07, + "loss": 2.689, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 76.25148368706107, + "learning_rate": 4.3682795698924737e-07, + "loss": 2.581, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 60.52769003302683, + "learning_rate": 4.536290322580646e-07, + "loss": 2.49, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 50.57180850107596, + "learning_rate": 4.7043010752688173e-07, + "loss": 2.4519, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 62.69948504309535, + "learning_rate": 4.872311827956989e-07, + "loss": 2.3989, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 47.631212194728114, + "learning_rate": 5.040322580645161e-07, + "loss": 2.2853, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 42.17283021141972, + "learning_rate": 5.208333333333334e-07, + "loss": 2.3543, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 34.08359958874836, + "learning_rate": 5.376344086021506e-07, + "loss": 2.1276, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 40.674430325440596, + "learning_rate": 5.544354838709678e-07, + "loss": 2.1955, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 31.948809198091773, + "learning_rate": 5.71236559139785e-07, + "loss": 2.0281, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 40.27205465031299, + "learning_rate": 5.880376344086022e-07, + "loss": 1.9945, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 30.44956418233564, + "learning_rate": 6.048387096774194e-07, + "loss": 2.0563, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 25.864985022985085, + "learning_rate": 6.216397849462366e-07, + "loss": 2.0562, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 48.57307017843553, + "learning_rate": 6.384408602150538e-07, + "loss": 2.0814, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 34.41894528967244, + "learning_rate": 6.55241935483871e-07, + "loss": 2.0217, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 25.687554758258678, + "learning_rate": 6.720430107526882e-07, + "loss": 1.9102, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 22.2588126225637, + "learning_rate": 6.888440860215053e-07, + "loss": 1.8607, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 26.021175684312663, + "learning_rate": 7.056451612903228e-07, + "loss": 1.8601, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 24.55833963633963, + "learning_rate": 7.224462365591399e-07, + "loss": 1.8857, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 24.853385573955414, + "learning_rate": 7.392473118279571e-07, + "loss": 1.9314, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 24.241953706581974, + "learning_rate": 7.560483870967743e-07, + "loss": 1.8315, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 30.963286237241633, + "learning_rate": 7.728494623655915e-07, + "loss": 1.9002, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 22.03379591891388, + "learning_rate": 7.896505376344087e-07, + "loss": 1.884, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 24.578647797403292, + "learning_rate": 8.064516129032258e-07, + "loss": 1.8904, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 21.216487036162114, + "learning_rate": 8.23252688172043e-07, + "loss": 1.8219, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 20.861193581911664, + "learning_rate": 8.400537634408602e-07, + "loss": 1.6311, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 24.299865558645777, + "learning_rate": 8.568548387096774e-07, + "loss": 1.8558, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 24.746355187719427, + "learning_rate": 8.736559139784947e-07, + "loss": 1.7711, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 21.940626478094718, + "learning_rate": 8.904569892473119e-07, + "loss": 1.6859, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 19.6483613759701, + "learning_rate": 9.072580645161292e-07, + "loss": 1.7343, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 19.89611142642267, + "learning_rate": 9.240591397849464e-07, + "loss": 1.6938, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 25.001859350266486, + "learning_rate": 9.408602150537635e-07, + "loss": 1.738, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 17.066884810710853, + "learning_rate": 9.576612903225808e-07, + "loss": 1.783, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 24.174269785302588, + "learning_rate": 9.744623655913979e-07, + "loss": 1.8442, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 18.12810335166148, + "learning_rate": 9.91263440860215e-07, + "loss": 1.7999, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 20.191857447339, + "learning_rate": 1.0080645161290323e-06, + "loss": 1.7081, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 19.815036999646132, + "learning_rate": 1.0248655913978496e-06, + "loss": 1.6522, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 15.835894404829006, + "learning_rate": 1.0416666666666667e-06, + "loss": 1.6562, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 17.63269929975477, + "learning_rate": 1.058467741935484e-06, + "loss": 1.6752, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 24.828660268995748, + "learning_rate": 1.0752688172043011e-06, + "loss": 1.5975, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 22.533688396632005, + "learning_rate": 1.0920698924731184e-06, + "loss": 1.6904, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 19.855998449745552, + "learning_rate": 1.1088709677419356e-06, + "loss": 1.6313, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 33.592956121234906, + "learning_rate": 1.1256720430107527e-06, + "loss": 1.7334, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 17.355880614952422, + "learning_rate": 1.14247311827957e-06, + "loss": 1.6071, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 17.581379694577958, + "learning_rate": 1.159274193548387e-06, + "loss": 1.6236, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 14.977879814802337, + "learning_rate": 1.1760752688172044e-06, + "loss": 1.5829, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 14.8664963134277, + "learning_rate": 1.1928763440860217e-06, + "loss": 1.695, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 20.709276914746113, + "learning_rate": 1.2096774193548388e-06, + "loss": 1.6185, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 17.65158447171455, + "learning_rate": 1.2264784946236561e-06, + "loss": 1.5907, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 20.428125598199816, + "learning_rate": 1.2432795698924732e-06, + "loss": 1.5583, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 17.84519660574699, + "learning_rate": 1.2600806451612903e-06, + "loss": 1.5405, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 13.767453102040555, + "learning_rate": 1.2768817204301076e-06, + "loss": 1.5337, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 14.818777203371633, + "learning_rate": 1.2936827956989247e-06, + "loss": 1.6202, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 15.367040069929933, + "learning_rate": 1.310483870967742e-06, + "loss": 1.5684, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 14.598127290678107, + "learning_rate": 1.3272849462365592e-06, + "loss": 1.5376, + "step": 395 + }, + { + "epoch": 0.04, + "grad_norm": 17.719109719831028, + "learning_rate": 1.3440860215053765e-06, + "loss": 1.4921, + "step": 400 + }, + { + "epoch": 0.04, + "grad_norm": 13.86472842753203, + "learning_rate": 1.3608870967741936e-06, + "loss": 1.5219, + "step": 405 + }, + { + "epoch": 0.04, + "grad_norm": 16.638436674699083, + "learning_rate": 1.3776881720430107e-06, + "loss": 1.5141, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 16.282678581255507, + "learning_rate": 1.394489247311828e-06, + "loss": 1.4588, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 13.90193197057512, + "learning_rate": 1.4112903225806455e-06, + "loss": 1.5744, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 14.65628343176628, + "learning_rate": 1.4280913978494626e-06, + "loss": 1.4718, + "step": 425 + }, + { + "epoch": 0.04, + "grad_norm": 14.634571109448922, + "learning_rate": 1.4448924731182797e-06, + "loss": 1.5239, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 13.749287700582828, + "learning_rate": 1.461693548387097e-06, + "loss": 1.4566, + "step": 435 + }, + { + "epoch": 0.04, + "grad_norm": 14.547106603930775, + "learning_rate": 1.4784946236559141e-06, + "loss": 1.5024, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 17.773314653888498, + "learning_rate": 1.4952956989247315e-06, + "loss": 1.6064, + "step": 445 + }, + { + "epoch": 0.05, + "grad_norm": 15.139400485945263, + "learning_rate": 1.5120967741935486e-06, + "loss": 1.5204, + "step": 450 + }, + { + "epoch": 0.05, + "grad_norm": 25.575817089248652, + "learning_rate": 1.5288978494623657e-06, + "loss": 1.4468, + "step": 455 + }, + { + "epoch": 0.05, + "grad_norm": 16.012505506084903, + "learning_rate": 1.545698924731183e-06, + "loss": 1.4547, + "step": 460 + }, + { + "epoch": 0.05, + "grad_norm": 17.410132583317328, + "learning_rate": 1.5625e-06, + "loss": 1.446, + "step": 465 + }, + { + "epoch": 0.05, + "grad_norm": 14.868370038078499, + "learning_rate": 1.5793010752688174e-06, + "loss": 1.5959, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 14.812418592555467, + "learning_rate": 1.5961021505376345e-06, + "loss": 1.4709, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 15.905386132006193, + "learning_rate": 1.6129032258064516e-06, + "loss": 1.5097, + "step": 480 + }, + { + "epoch": 0.05, + "grad_norm": 14.946674581803094, + "learning_rate": 1.629704301075269e-06, + "loss": 1.4466, + "step": 485 + }, + { + "epoch": 0.05, + "grad_norm": 13.214144758770525, + "learning_rate": 1.646505376344086e-06, + "loss": 1.4626, + "step": 490 + }, + { + "epoch": 0.05, + "grad_norm": 20.39110267796856, + "learning_rate": 1.6633064516129033e-06, + "loss": 1.5832, + "step": 495 + }, + { + "epoch": 0.05, + "grad_norm": 14.768822449697835, + "learning_rate": 1.6801075268817204e-06, + "loss": 1.4171, + "step": 500 + }, + { + "epoch": 0.05, + "grad_norm": 15.346879819966862, + "learning_rate": 1.6969086021505377e-06, + "loss": 1.4682, + "step": 505 + }, + { + "epoch": 0.05, + "grad_norm": 11.776476285237981, + "learning_rate": 1.7137096774193548e-06, + "loss": 1.4112, + "step": 510 + }, + { + "epoch": 0.05, + "grad_norm": 14.468416925660526, + "learning_rate": 1.7305107526881724e-06, + "loss": 1.3957, + "step": 515 + }, + { + "epoch": 0.05, + "grad_norm": 16.81879140532043, + "learning_rate": 1.7473118279569895e-06, + "loss": 1.4286, + "step": 520 + }, + { + "epoch": 0.05, + "grad_norm": 22.268339884303696, + "learning_rate": 1.7641129032258068e-06, + "loss": 1.4274, + "step": 525 + }, + { + "epoch": 0.05, + "grad_norm": 16.974470923302114, + "learning_rate": 1.7809139784946239e-06, + "loss": 1.5043, + "step": 530 + }, + { + "epoch": 0.05, + "grad_norm": 16.01147151337686, + "learning_rate": 1.797715053763441e-06, + "loss": 1.4032, + "step": 535 + }, + { + "epoch": 0.05, + "grad_norm": 22.10572306421554, + "learning_rate": 1.8145161290322583e-06, + "loss": 1.4333, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 12.640677257163176, + "learning_rate": 1.8313172043010754e-06, + "loss": 1.4681, + "step": 545 + }, + { + "epoch": 0.06, + "grad_norm": 26.564187554827637, + "learning_rate": 1.8481182795698927e-06, + "loss": 1.4594, + "step": 550 + }, + { + "epoch": 0.06, + "grad_norm": 15.08193320266031, + "learning_rate": 1.8649193548387098e-06, + "loss": 1.4362, + "step": 555 + }, + { + "epoch": 0.06, + "grad_norm": 34.685212861993335, + "learning_rate": 1.881720430107527e-06, + "loss": 1.471, + "step": 560 + }, + { + "epoch": 0.06, + "grad_norm": 43.100869084203225, + "learning_rate": 1.8985215053763442e-06, + "loss": 1.4642, + "step": 565 + }, + { + "epoch": 0.06, + "grad_norm": 15.559894937589828, + "learning_rate": 1.9153225806451616e-06, + "loss": 1.3963, + "step": 570 + }, + { + "epoch": 0.06, + "grad_norm": 38.2485410412665, + "learning_rate": 1.9321236559139787e-06, + "loss": 1.4386, + "step": 575 + }, + { + "epoch": 0.06, + "grad_norm": 30.95124448720418, + "learning_rate": 1.9489247311827958e-06, + "loss": 1.4151, + "step": 580 + }, + { + "epoch": 0.06, + "grad_norm": 51.70839528402327, + "learning_rate": 1.965725806451613e-06, + "loss": 1.4209, + "step": 585 + }, + { + "epoch": 0.06, + "grad_norm": 23.196005284612998, + "learning_rate": 1.98252688172043e-06, + "loss": 1.4012, + "step": 590 + }, + { + "epoch": 0.06, + "grad_norm": 27.1622839764402, + "learning_rate": 1.9993279569892475e-06, + "loss": 1.3261, + "step": 595 + }, + { + "epoch": 0.06, + "grad_norm": 13.43452977396187, + "learning_rate": 2.0161290322580646e-06, + "loss": 1.3766, + "step": 600 + }, + { + "epoch": 0.06, + "grad_norm": 12.033787281630863, + "learning_rate": 2.032930107526882e-06, + "loss": 1.4099, + "step": 605 + }, + { + "epoch": 0.06, + "grad_norm": 13.955820585990788, + "learning_rate": 2.0497311827956992e-06, + "loss": 1.3794, + "step": 610 + }, + { + "epoch": 0.06, + "grad_norm": 15.461768151074137, + "learning_rate": 2.0665322580645163e-06, + "loss": 1.4485, + "step": 615 + }, + { + "epoch": 0.06, + "grad_norm": 19.64480417285225, + "learning_rate": 2.0833333333333334e-06, + "loss": 1.3828, + "step": 620 + }, + { + "epoch": 0.06, + "grad_norm": 13.057917363440225, + "learning_rate": 2.100134408602151e-06, + "loss": 1.4158, + "step": 625 + }, + { + "epoch": 0.06, + "grad_norm": 23.390563850096793, + "learning_rate": 2.116935483870968e-06, + "loss": 1.4126, + "step": 630 + }, + { + "epoch": 0.06, + "grad_norm": 27.856058877726337, + "learning_rate": 2.133736559139785e-06, + "loss": 1.3153, + "step": 635 + }, + { + "epoch": 0.06, + "grad_norm": 15.627432697331793, + "learning_rate": 2.1505376344086023e-06, + "loss": 1.3693, + "step": 640 + }, + { + "epoch": 0.07, + "grad_norm": 17.018250112690833, + "learning_rate": 2.1673387096774194e-06, + "loss": 1.3771, + "step": 645 + }, + { + "epoch": 0.07, + "grad_norm": 24.363765549799407, + "learning_rate": 2.184139784946237e-06, + "loss": 1.4001, + "step": 650 + }, + { + "epoch": 0.07, + "grad_norm": 16.8886205337284, + "learning_rate": 2.200940860215054e-06, + "loss": 1.3851, + "step": 655 + }, + { + "epoch": 0.07, + "grad_norm": 12.200499403924693, + "learning_rate": 2.217741935483871e-06, + "loss": 1.3914, + "step": 660 + }, + { + "epoch": 0.07, + "grad_norm": 22.506933751518403, + "learning_rate": 2.234543010752688e-06, + "loss": 1.3736, + "step": 665 + }, + { + "epoch": 0.07, + "grad_norm": 21.168953266789856, + "learning_rate": 2.2513440860215053e-06, + "loss": 1.349, + "step": 670 + }, + { + "epoch": 0.07, + "grad_norm": 58.48769280628009, + "learning_rate": 2.268145161290323e-06, + "loss": 1.4582, + "step": 675 + }, + { + "epoch": 0.07, + "grad_norm": 34.01636176936607, + "learning_rate": 2.28494623655914e-06, + "loss": 1.4758, + "step": 680 + }, + { + "epoch": 0.07, + "grad_norm": 25.025355540731983, + "learning_rate": 2.301747311827957e-06, + "loss": 1.3476, + "step": 685 + }, + { + "epoch": 0.07, + "grad_norm": 14.751839601928431, + "learning_rate": 2.318548387096774e-06, + "loss": 1.3785, + "step": 690 + }, + { + "epoch": 0.07, + "grad_norm": 12.071805821551914, + "learning_rate": 2.3353494623655917e-06, + "loss": 1.3635, + "step": 695 + }, + { + "epoch": 0.07, + "grad_norm": 20.602958992356196, + "learning_rate": 2.3521505376344088e-06, + "loss": 1.3845, + "step": 700 + }, + { + "epoch": 0.07, + "grad_norm": 17.385890285083427, + "learning_rate": 2.3689516129032263e-06, + "loss": 1.2857, + "step": 705 + }, + { + "epoch": 0.07, + "grad_norm": 12.215016267194583, + "learning_rate": 2.3857526881720434e-06, + "loss": 1.3015, + "step": 710 + }, + { + "epoch": 0.07, + "grad_norm": 33.48226663382944, + "learning_rate": 2.4025537634408605e-06, + "loss": 1.4045, + "step": 715 + }, + { + "epoch": 0.07, + "grad_norm": 23.942731192443755, + "learning_rate": 2.4193548387096776e-06, + "loss": 1.3649, + "step": 720 + }, + { + "epoch": 0.07, + "grad_norm": 11.558052499204832, + "learning_rate": 2.4361559139784947e-06, + "loss": 1.3811, + "step": 725 + }, + { + "epoch": 0.07, + "grad_norm": 18.69964334303326, + "learning_rate": 2.4529569892473122e-06, + "loss": 1.3416, + "step": 730 + }, + { + "epoch": 0.07, + "grad_norm": 13.177544261859945, + "learning_rate": 2.4697580645161293e-06, + "loss": 1.3243, + "step": 735 + }, + { + "epoch": 0.07, + "grad_norm": 12.803122016869308, + "learning_rate": 2.4865591397849464e-06, + "loss": 1.371, + "step": 740 + }, + { + "epoch": 0.08, + "grad_norm": 23.51671956073884, + "learning_rate": 2.503360215053764e-06, + "loss": 1.3652, + "step": 745 + }, + { + "epoch": 0.08, + "grad_norm": 15.72972602267457, + "learning_rate": 2.5201612903225806e-06, + "loss": 1.3826, + "step": 750 + }, + { + "epoch": 0.08, + "grad_norm": 11.865572453001107, + "learning_rate": 2.536962365591398e-06, + "loss": 1.387, + "step": 755 + }, + { + "epoch": 0.08, + "grad_norm": 21.938389282777255, + "learning_rate": 2.5537634408602153e-06, + "loss": 1.3031, + "step": 760 + }, + { + "epoch": 0.08, + "grad_norm": 14.036068148740473, + "learning_rate": 2.570564516129033e-06, + "loss": 1.392, + "step": 765 + }, + { + "epoch": 0.08, + "grad_norm": 15.200040828281905, + "learning_rate": 2.5873655913978495e-06, + "loss": 1.4077, + "step": 770 + }, + { + "epoch": 0.08, + "grad_norm": 16.776949710823484, + "learning_rate": 2.604166666666667e-06, + "loss": 1.3833, + "step": 775 + }, + { + "epoch": 0.08, + "grad_norm": 14.947572172197686, + "learning_rate": 2.620967741935484e-06, + "loss": 1.4015, + "step": 780 + }, + { + "epoch": 0.08, + "grad_norm": 76.93992013037158, + "learning_rate": 2.6377688172043016e-06, + "loss": 1.4051, + "step": 785 + }, + { + "epoch": 0.08, + "grad_norm": 12.942371317725181, + "learning_rate": 2.6545698924731183e-06, + "loss": 1.348, + "step": 790 + }, + { + "epoch": 0.08, + "grad_norm": 13.086369840086844, + "learning_rate": 2.671370967741936e-06, + "loss": 1.3453, + "step": 795 + }, + { + "epoch": 0.08, + "grad_norm": 35.89028186175731, + "learning_rate": 2.688172043010753e-06, + "loss": 1.3807, + "step": 800 + }, + { + "epoch": 0.08, + "grad_norm": 29.63422412243611, + "learning_rate": 2.70497311827957e-06, + "loss": 1.3625, + "step": 805 + }, + { + "epoch": 0.08, + "grad_norm": 55.32993232323523, + "learning_rate": 2.721774193548387e-06, + "loss": 1.4005, + "step": 810 + }, + { + "epoch": 0.08, + "grad_norm": 15.984066997357118, + "learning_rate": 2.7385752688172047e-06, + "loss": 1.3381, + "step": 815 + }, + { + "epoch": 0.08, + "grad_norm": 25.472146032597063, + "learning_rate": 2.7553763440860214e-06, + "loss": 1.4245, + "step": 820 + }, + { + "epoch": 0.08, + "grad_norm": 15.457358515963906, + "learning_rate": 2.772177419354839e-06, + "loss": 1.3625, + "step": 825 + }, + { + "epoch": 0.08, + "grad_norm": 16.759070196363787, + "learning_rate": 2.788978494623656e-06, + "loss": 1.448, + "step": 830 + }, + { + "epoch": 0.08, + "grad_norm": 40.79061945114763, + "learning_rate": 2.8057795698924735e-06, + "loss": 1.3464, + "step": 835 + }, + { + "epoch": 0.08, + "grad_norm": 25.161756708332057, + "learning_rate": 2.822580645161291e-06, + "loss": 1.3722, + "step": 840 + }, + { + "epoch": 0.09, + "grad_norm": 25.188019881756727, + "learning_rate": 2.8393817204301077e-06, + "loss": 1.2966, + "step": 845 + }, + { + "epoch": 0.09, + "grad_norm": 20.235137203610662, + "learning_rate": 2.8561827956989252e-06, + "loss": 1.3905, + "step": 850 + }, + { + "epoch": 0.09, + "grad_norm": 36.617132909309895, + "learning_rate": 2.872983870967742e-06, + "loss": 1.3609, + "step": 855 + }, + { + "epoch": 0.09, + "grad_norm": 29.49169037802908, + "learning_rate": 2.8897849462365594e-06, + "loss": 1.3619, + "step": 860 + }, + { + "epoch": 0.09, + "grad_norm": 46.26563028512804, + "learning_rate": 2.9065860215053765e-06, + "loss": 1.3855, + "step": 865 + }, + { + "epoch": 0.09, + "grad_norm": 22.0915541742763, + "learning_rate": 2.923387096774194e-06, + "loss": 1.4284, + "step": 870 + }, + { + "epoch": 0.09, + "grad_norm": 35.015078004933514, + "learning_rate": 2.9401881720430108e-06, + "loss": 1.3365, + "step": 875 + }, + { + "epoch": 0.09, + "grad_norm": 95.54066941374312, + "learning_rate": 2.9569892473118283e-06, + "loss": 1.3615, + "step": 880 + }, + { + "epoch": 0.09, + "grad_norm": 69.29704172941422, + "learning_rate": 2.9737903225806454e-06, + "loss": 1.3591, + "step": 885 + }, + { + "epoch": 0.09, + "grad_norm": 28.783068659669784, + "learning_rate": 2.990591397849463e-06, + "loss": 1.3271, + "step": 890 + }, + { + "epoch": 0.09, + "grad_norm": 18.165110584313982, + "learning_rate": 3.0073924731182796e-06, + "loss": 1.2884, + "step": 895 + }, + { + "epoch": 0.09, + "grad_norm": 41.205307526496874, + "learning_rate": 3.024193548387097e-06, + "loss": 1.3995, + "step": 900 + }, + { + "epoch": 0.09, + "grad_norm": 40.001454194930204, + "learning_rate": 3.0409946236559142e-06, + "loss": 1.3513, + "step": 905 + }, + { + "epoch": 0.09, + "grad_norm": 15.56121470148507, + "learning_rate": 3.0577956989247313e-06, + "loss": 1.2726, + "step": 910 + }, + { + "epoch": 0.09, + "grad_norm": 13.766228304604502, + "learning_rate": 3.0745967741935484e-06, + "loss": 1.3304, + "step": 915 + }, + { + "epoch": 0.09, + "grad_norm": 16.262090744904206, + "learning_rate": 3.091397849462366e-06, + "loss": 1.3634, + "step": 920 + }, + { + "epoch": 0.09, + "grad_norm": 20.491180802304278, + "learning_rate": 3.1081989247311826e-06, + "loss": 1.3582, + "step": 925 + }, + { + "epoch": 0.09, + "grad_norm": 44.89752185269729, + "learning_rate": 3.125e-06, + "loss": 1.3173, + "step": 930 + }, + { + "epoch": 0.09, + "grad_norm": 35.91291644151354, + "learning_rate": 3.1418010752688177e-06, + "loss": 1.3348, + "step": 935 + }, + { + "epoch": 0.09, + "grad_norm": 11.238116458693074, + "learning_rate": 3.1586021505376348e-06, + "loss": 1.3507, + "step": 940 + }, + { + "epoch": 0.1, + "grad_norm": 66.27238623554678, + "learning_rate": 3.1754032258064523e-06, + "loss": 1.4242, + "step": 945 + }, + { + "epoch": 0.1, + "grad_norm": 23.33319086370242, + "learning_rate": 3.192204301075269e-06, + "loss": 1.3167, + "step": 950 + }, + { + "epoch": 0.1, + "grad_norm": 27.96170215869396, + "learning_rate": 3.2090053763440865e-06, + "loss": 1.3044, + "step": 955 + }, + { + "epoch": 0.1, + "grad_norm": 24.96629022725591, + "learning_rate": 3.225806451612903e-06, + "loss": 1.3122, + "step": 960 + }, + { + "epoch": 0.1, + "grad_norm": 31.97296302595778, + "learning_rate": 3.2426075268817207e-06, + "loss": 1.3263, + "step": 965 + }, + { + "epoch": 0.1, + "grad_norm": 40.56869211119147, + "learning_rate": 3.259408602150538e-06, + "loss": 1.3216, + "step": 970 + }, + { + "epoch": 0.1, + "grad_norm": 48.86733327981514, + "learning_rate": 3.2762096774193553e-06, + "loss": 1.3852, + "step": 975 + }, + { + "epoch": 0.1, + "grad_norm": 48.8589827825722, + "learning_rate": 3.293010752688172e-06, + "loss": 1.3942, + "step": 980 + }, + { + "epoch": 0.1, + "grad_norm": 105.92172275966337, + "learning_rate": 3.3098118279569895e-06, + "loss": 1.331, + "step": 985 + }, + { + "epoch": 0.1, + "grad_norm": 79.25673823625935, + "learning_rate": 3.3266129032258067e-06, + "loss": 1.4146, + "step": 990 + }, + { + "epoch": 0.1, + "grad_norm": 22.753954756085893, + "learning_rate": 3.343413978494624e-06, + "loss": 1.3425, + "step": 995 + }, + { + "epoch": 0.1, + "grad_norm": 17.938951839801916, + "learning_rate": 3.360215053763441e-06, + "loss": 1.2828, + "step": 1000 + }, + { + "epoch": 0.1, + "grad_norm": 40.93281890650386, + "learning_rate": 3.3770161290322584e-06, + "loss": 1.4161, + "step": 1005 + }, + { + "epoch": 0.1, + "grad_norm": 23.016726816711316, + "learning_rate": 3.3938172043010755e-06, + "loss": 1.342, + "step": 1010 + }, + { + "epoch": 0.1, + "grad_norm": 19.694780338872466, + "learning_rate": 3.4106182795698926e-06, + "loss": 1.2934, + "step": 1015 + }, + { + "epoch": 0.1, + "grad_norm": 12.232488558864302, + "learning_rate": 3.4274193548387097e-06, + "loss": 1.4007, + "step": 1020 + }, + { + "epoch": 0.1, + "grad_norm": 14.56739533203392, + "learning_rate": 3.4442204301075272e-06, + "loss": 1.2597, + "step": 1025 + }, + { + "epoch": 0.1, + "grad_norm": 17.55384322451927, + "learning_rate": 3.4610215053763447e-06, + "loss": 1.3139, + "step": 1030 + }, + { + "epoch": 0.1, + "grad_norm": 14.51771613755045, + "learning_rate": 3.4778225806451614e-06, + "loss": 1.2369, + "step": 1035 + }, + { + "epoch": 0.1, + "grad_norm": 12.547628388697412, + "learning_rate": 3.494623655913979e-06, + "loss": 1.3077, + "step": 1040 + }, + { + "epoch": 0.11, + "grad_norm": 21.253857468288704, + "learning_rate": 3.511424731182796e-06, + "loss": 1.2945, + "step": 1045 + }, + { + "epoch": 0.11, + "grad_norm": 32.2978178380615, + "learning_rate": 3.5282258064516136e-06, + "loss": 1.3445, + "step": 1050 + }, + { + "epoch": 0.11, + "grad_norm": 11.768307513613498, + "learning_rate": 3.5450268817204303e-06, + "loss": 1.3081, + "step": 1055 + }, + { + "epoch": 0.11, + "grad_norm": 15.387095517003354, + "learning_rate": 3.5618279569892478e-06, + "loss": 1.2361, + "step": 1060 + }, + { + "epoch": 0.11, + "grad_norm": 14.17899433065435, + "learning_rate": 3.578629032258065e-06, + "loss": 1.307, + "step": 1065 + }, + { + "epoch": 0.11, + "grad_norm": 19.757988892716817, + "learning_rate": 3.595430107526882e-06, + "loss": 1.3318, + "step": 1070 + }, + { + "epoch": 0.11, + "grad_norm": 14.618417336002926, + "learning_rate": 3.612231182795699e-06, + "loss": 1.311, + "step": 1075 + }, + { + "epoch": 0.11, + "grad_norm": 11.71596301767083, + "learning_rate": 3.6290322580645166e-06, + "loss": 1.226, + "step": 1080 + }, + { + "epoch": 0.11, + "grad_norm": 11.068963240572275, + "learning_rate": 3.6458333333333333e-06, + "loss": 1.2774, + "step": 1085 + }, + { + "epoch": 0.11, + "grad_norm": 12.802355495211414, + "learning_rate": 3.662634408602151e-06, + "loss": 1.3052, + "step": 1090 + }, + { + "epoch": 0.11, + "grad_norm": 15.684098983890863, + "learning_rate": 3.679435483870968e-06, + "loss": 1.2614, + "step": 1095 + }, + { + "epoch": 0.11, + "grad_norm": 11.099864032228766, + "learning_rate": 3.6962365591397855e-06, + "loss": 1.2375, + "step": 1100 + }, + { + "epoch": 0.11, + "grad_norm": 44.01005527420453, + "learning_rate": 3.713037634408602e-06, + "loss": 1.269, + "step": 1105 + }, + { + "epoch": 0.11, + "grad_norm": 13.69498598023566, + "learning_rate": 3.7298387096774197e-06, + "loss": 1.3259, + "step": 1110 + }, + { + "epoch": 0.11, + "grad_norm": 22.87277001637021, + "learning_rate": 3.7466397849462368e-06, + "loss": 1.2521, + "step": 1115 + }, + { + "epoch": 0.11, + "grad_norm": 12.733526400008255, + "learning_rate": 3.763440860215054e-06, + "loss": 1.3009, + "step": 1120 + }, + { + "epoch": 0.11, + "grad_norm": 17.430350713728973, + "learning_rate": 3.7802419354838714e-06, + "loss": 1.2388, + "step": 1125 + }, + { + "epoch": 0.11, + "grad_norm": 15.85124486151365, + "learning_rate": 3.7970430107526885e-06, + "loss": 1.3255, + "step": 1130 + }, + { + "epoch": 0.11, + "grad_norm": 11.2973639532908, + "learning_rate": 3.813844086021506e-06, + "loss": 1.2868, + "step": 1135 + }, + { + "epoch": 0.11, + "grad_norm": 16.044475090556347, + "learning_rate": 3.830645161290323e-06, + "loss": 1.2921, + "step": 1140 + }, + { + "epoch": 0.12, + "grad_norm": 11.419443385517743, + "learning_rate": 3.84744623655914e-06, + "loss": 1.3027, + "step": 1145 + }, + { + "epoch": 0.12, + "grad_norm": 9.408727766295128, + "learning_rate": 3.864247311827957e-06, + "loss": 1.2636, + "step": 1150 + }, + { + "epoch": 0.12, + "grad_norm": 28.714229113569143, + "learning_rate": 3.8810483870967744e-06, + "loss": 1.3094, + "step": 1155 + }, + { + "epoch": 0.12, + "grad_norm": 19.468610643519202, + "learning_rate": 3.8978494623655915e-06, + "loss": 1.295, + "step": 1160 + }, + { + "epoch": 0.12, + "grad_norm": 22.147649353738434, + "learning_rate": 3.914650537634409e-06, + "loss": 1.268, + "step": 1165 + }, + { + "epoch": 0.12, + "grad_norm": 21.815639864910924, + "learning_rate": 3.931451612903226e-06, + "loss": 1.3022, + "step": 1170 + }, + { + "epoch": 0.12, + "grad_norm": 10.131219829800257, + "learning_rate": 3.948252688172044e-06, + "loss": 1.3033, + "step": 1175 + }, + { + "epoch": 0.12, + "grad_norm": 40.422268139360455, + "learning_rate": 3.96505376344086e-06, + "loss": 1.2916, + "step": 1180 + }, + { + "epoch": 0.12, + "grad_norm": 13.18996572855082, + "learning_rate": 3.981854838709678e-06, + "loss": 1.2089, + "step": 1185 + }, + { + "epoch": 0.12, + "grad_norm": 34.97991531805703, + "learning_rate": 3.998655913978495e-06, + "loss": 1.25, + "step": 1190 + }, + { + "epoch": 0.12, + "grad_norm": 24.84307661290493, + "learning_rate": 4.015456989247312e-06, + "loss": 1.3503, + "step": 1195 + }, + { + "epoch": 0.12, + "grad_norm": 23.814660056097257, + "learning_rate": 4.032258064516129e-06, + "loss": 1.2976, + "step": 1200 + }, + { + "epoch": 0.12, + "grad_norm": 48.66684356663973, + "learning_rate": 4.049059139784946e-06, + "loss": 1.278, + "step": 1205 + }, + { + "epoch": 0.12, + "grad_norm": 16.743111074822284, + "learning_rate": 4.065860215053764e-06, + "loss": 1.2592, + "step": 1210 + }, + { + "epoch": 0.12, + "grad_norm": 24.74596115728974, + "learning_rate": 4.082661290322581e-06, + "loss": 1.2825, + "step": 1215 + }, + { + "epoch": 0.12, + "grad_norm": 34.36014978790517, + "learning_rate": 4.0994623655913985e-06, + "loss": 1.2459, + "step": 1220 + }, + { + "epoch": 0.12, + "grad_norm": 20.999547137931017, + "learning_rate": 4.1162634408602156e-06, + "loss": 1.2121, + "step": 1225 + }, + { + "epoch": 0.12, + "grad_norm": 12.868497455108407, + "learning_rate": 4.133064516129033e-06, + "loss": 1.2768, + "step": 1230 + }, + { + "epoch": 0.12, + "grad_norm": 30.434809468480765, + "learning_rate": 4.14986559139785e-06, + "loss": 1.2675, + "step": 1235 + }, + { + "epoch": 0.13, + "grad_norm": 11.616266135748583, + "learning_rate": 4.166666666666667e-06, + "loss": 1.2608, + "step": 1240 + }, + { + "epoch": 0.13, + "grad_norm": 12.532446205798287, + "learning_rate": 4.183467741935484e-06, + "loss": 1.3375, + "step": 1245 + }, + { + "epoch": 0.13, + "grad_norm": 21.560371038383625, + "learning_rate": 4.200268817204302e-06, + "loss": 1.2329, + "step": 1250 + }, + { + "epoch": 0.13, + "grad_norm": 13.724212892297027, + "learning_rate": 4.217069892473118e-06, + "loss": 1.2697, + "step": 1255 + }, + { + "epoch": 0.13, + "grad_norm": 22.2529932746503, + "learning_rate": 4.233870967741936e-06, + "loss": 1.2645, + "step": 1260 + }, + { + "epoch": 0.13, + "grad_norm": 13.558716472596469, + "learning_rate": 4.250672043010753e-06, + "loss": 1.2197, + "step": 1265 + }, + { + "epoch": 0.13, + "grad_norm": 29.247532948847013, + "learning_rate": 4.26747311827957e-06, + "loss": 1.2919, + "step": 1270 + }, + { + "epoch": 0.13, + "grad_norm": 11.804159591275033, + "learning_rate": 4.2842741935483874e-06, + "loss": 1.3064, + "step": 1275 + }, + { + "epoch": 0.13, + "grad_norm": 21.409715922052758, + "learning_rate": 4.3010752688172045e-06, + "loss": 1.2718, + "step": 1280 + }, + { + "epoch": 0.13, + "grad_norm": 14.77842104466719, + "learning_rate": 4.317876344086022e-06, + "loss": 1.3095, + "step": 1285 + }, + { + "epoch": 0.13, + "grad_norm": 15.685031119948517, + "learning_rate": 4.334677419354839e-06, + "loss": 1.2832, + "step": 1290 + }, + { + "epoch": 0.13, + "grad_norm": 21.385894858494375, + "learning_rate": 4.351478494623656e-06, + "loss": 1.3237, + "step": 1295 + }, + { + "epoch": 0.13, + "grad_norm": 13.723338453835758, + "learning_rate": 4.368279569892474e-06, + "loss": 1.2666, + "step": 1300 + }, + { + "epoch": 0.13, + "grad_norm": 12.439591311170961, + "learning_rate": 4.385080645161291e-06, + "loss": 1.2909, + "step": 1305 + }, + { + "epoch": 0.13, + "grad_norm": 10.514215609608113, + "learning_rate": 4.401881720430108e-06, + "loss": 1.2327, + "step": 1310 + }, + { + "epoch": 0.13, + "grad_norm": 17.845827804092984, + "learning_rate": 4.418682795698925e-06, + "loss": 1.2617, + "step": 1315 + }, + { + "epoch": 0.13, + "grad_norm": 9.823360940605603, + "learning_rate": 4.435483870967742e-06, + "loss": 1.2915, + "step": 1320 + }, + { + "epoch": 0.13, + "grad_norm": 11.472975718436162, + "learning_rate": 4.452284946236559e-06, + "loss": 1.302, + "step": 1325 + }, + { + "epoch": 0.13, + "grad_norm": 11.284002557459615, + "learning_rate": 4.469086021505376e-06, + "loss": 1.2788, + "step": 1330 + }, + { + "epoch": 0.13, + "grad_norm": 38.27911520285089, + "learning_rate": 4.485887096774194e-06, + "loss": 1.2552, + "step": 1335 + }, + { + "epoch": 0.14, + "grad_norm": 9.432535268516432, + "learning_rate": 4.502688172043011e-06, + "loss": 1.2759, + "step": 1340 + }, + { + "epoch": 0.14, + "grad_norm": 10.98747211347199, + "learning_rate": 4.5194892473118286e-06, + "loss": 1.2844, + "step": 1345 + }, + { + "epoch": 0.14, + "grad_norm": 12.511423262582975, + "learning_rate": 4.536290322580646e-06, + "loss": 1.2283, + "step": 1350 + }, + { + "epoch": 0.14, + "grad_norm": 20.514439149634352, + "learning_rate": 4.553091397849463e-06, + "loss": 1.2353, + "step": 1355 + }, + { + "epoch": 0.14, + "grad_norm": 16.015991119714123, + "learning_rate": 4.56989247311828e-06, + "loss": 1.2959, + "step": 1360 + }, + { + "epoch": 0.14, + "grad_norm": 10.106660893981587, + "learning_rate": 4.586693548387097e-06, + "loss": 1.2239, + "step": 1365 + }, + { + "epoch": 0.14, + "grad_norm": 29.853730062938347, + "learning_rate": 4.603494623655914e-06, + "loss": 1.3194, + "step": 1370 + }, + { + "epoch": 0.14, + "grad_norm": 14.685737996052852, + "learning_rate": 4.620295698924732e-06, + "loss": 1.2148, + "step": 1375 + }, + { + "epoch": 0.14, + "grad_norm": 10.877331149990967, + "learning_rate": 4.637096774193548e-06, + "loss": 1.2045, + "step": 1380 + }, + { + "epoch": 0.14, + "grad_norm": 11.008218814296683, + "learning_rate": 4.653897849462366e-06, + "loss": 1.2477, + "step": 1385 + }, + { + "epoch": 0.14, + "grad_norm": 22.743570797984013, + "learning_rate": 4.670698924731183e-06, + "loss": 1.2775, + "step": 1390 + }, + { + "epoch": 0.14, + "grad_norm": 29.524974297593605, + "learning_rate": 4.6875000000000004e-06, + "loss": 1.299, + "step": 1395 + }, + { + "epoch": 0.14, + "grad_norm": 19.37930037376844, + "learning_rate": 4.7043010752688175e-06, + "loss": 1.3329, + "step": 1400 + }, + { + "epoch": 0.14, + "grad_norm": 12.598571626831996, + "learning_rate": 4.721102150537635e-06, + "loss": 1.2942, + "step": 1405 + }, + { + "epoch": 0.14, + "grad_norm": 12.752457322963501, + "learning_rate": 4.737903225806453e-06, + "loss": 1.2408, + "step": 1410 + }, + { + "epoch": 0.14, + "grad_norm": 12.210288312721424, + "learning_rate": 4.754704301075269e-06, + "loss": 1.2762, + "step": 1415 + }, + { + "epoch": 0.14, + "grad_norm": 15.028431784981668, + "learning_rate": 4.771505376344087e-06, + "loss": 1.2817, + "step": 1420 + }, + { + "epoch": 0.14, + "grad_norm": 14.993762858927266, + "learning_rate": 4.788306451612904e-06, + "loss": 1.2521, + "step": 1425 + }, + { + "epoch": 0.14, + "grad_norm": 26.11557096134634, + "learning_rate": 4.805107526881721e-06, + "loss": 1.2275, + "step": 1430 + }, + { + "epoch": 0.14, + "grad_norm": 23.808664429913215, + "learning_rate": 4.821908602150538e-06, + "loss": 1.2729, + "step": 1435 + }, + { + "epoch": 0.15, + "grad_norm": 13.214083146280673, + "learning_rate": 4.838709677419355e-06, + "loss": 1.2103, + "step": 1440 + }, + { + "epoch": 0.15, + "grad_norm": 14.950068491419977, + "learning_rate": 4.855510752688172e-06, + "loss": 1.2407, + "step": 1445 + }, + { + "epoch": 0.15, + "grad_norm": 49.77700236183305, + "learning_rate": 4.872311827956989e-06, + "loss": 1.2599, + "step": 1450 + }, + { + "epoch": 0.15, + "grad_norm": 12.2907307607193, + "learning_rate": 4.8891129032258065e-06, + "loss": 1.2365, + "step": 1455 + }, + { + "epoch": 0.15, + "grad_norm": 35.24775519440709, + "learning_rate": 4.9059139784946245e-06, + "loss": 1.2727, + "step": 1460 + }, + { + "epoch": 0.15, + "grad_norm": 19.209125211904805, + "learning_rate": 4.922715053763441e-06, + "loss": 1.2516, + "step": 1465 + }, + { + "epoch": 0.15, + "grad_norm": 13.378215907461524, + "learning_rate": 4.939516129032259e-06, + "loss": 1.2488, + "step": 1470 + }, + { + "epoch": 0.15, + "grad_norm": 17.6633476819897, + "learning_rate": 4.956317204301076e-06, + "loss": 1.2668, + "step": 1475 + }, + { + "epoch": 0.15, + "grad_norm": 10.740924281703688, + "learning_rate": 4.973118279569893e-06, + "loss": 1.249, + "step": 1480 + }, + { + "epoch": 0.15, + "grad_norm": 14.726705345231204, + "learning_rate": 4.98991935483871e-06, + "loss": 1.3307, + "step": 1485 + }, + { + "epoch": 0.15, + "grad_norm": 14.59599163185456, + "learning_rate": 5.006720430107528e-06, + "loss": 1.2854, + "step": 1490 + }, + { + "epoch": 0.15, + "grad_norm": 41.090146378069534, + "learning_rate": 5.023521505376344e-06, + "loss": 1.291, + "step": 1495 + }, + { + "epoch": 0.15, + "grad_norm": 11.10786448778689, + "learning_rate": 5.040322580645161e-06, + "loss": 1.2843, + "step": 1500 + }, + { + "epoch": 0.15, + "grad_norm": 16.353102002707647, + "learning_rate": 5.057123655913979e-06, + "loss": 1.2556, + "step": 1505 + }, + { + "epoch": 0.15, + "grad_norm": 15.89417888688282, + "learning_rate": 5.073924731182796e-06, + "loss": 1.2582, + "step": 1510 + }, + { + "epoch": 0.15, + "grad_norm": 38.41649367763352, + "learning_rate": 5.090725806451613e-06, + "loss": 1.3358, + "step": 1515 + }, + { + "epoch": 0.15, + "grad_norm": 13.107954803233616, + "learning_rate": 5.1075268817204305e-06, + "loss": 1.28, + "step": 1520 + }, + { + "epoch": 0.15, + "grad_norm": 11.825179318943986, + "learning_rate": 5.124327956989248e-06, + "loss": 1.2516, + "step": 1525 + }, + { + "epoch": 0.15, + "grad_norm": 26.492711678315192, + "learning_rate": 5.141129032258066e-06, + "loss": 1.2652, + "step": 1530 + }, + { + "epoch": 0.15, + "grad_norm": 17.328853885466696, + "learning_rate": 5.157930107526882e-06, + "loss": 1.2965, + "step": 1535 + }, + { + "epoch": 0.16, + "grad_norm": 17.50828288344384, + "learning_rate": 5.174731182795699e-06, + "loss": 1.2399, + "step": 1540 + }, + { + "epoch": 0.16, + "grad_norm": 11.950092920119129, + "learning_rate": 5.191532258064517e-06, + "loss": 1.3137, + "step": 1545 + }, + { + "epoch": 0.16, + "grad_norm": 20.052669655145927, + "learning_rate": 5.208333333333334e-06, + "loss": 1.3046, + "step": 1550 + }, + { + "epoch": 0.16, + "grad_norm": 53.370358081765325, + "learning_rate": 5.22513440860215e-06, + "loss": 1.2716, + "step": 1555 + }, + { + "epoch": 0.16, + "grad_norm": 23.64171905195358, + "learning_rate": 5.241935483870968e-06, + "loss": 1.2413, + "step": 1560 + }, + { + "epoch": 0.16, + "grad_norm": 18.501262717248586, + "learning_rate": 5.258736559139785e-06, + "loss": 1.3243, + "step": 1565 + }, + { + "epoch": 0.16, + "grad_norm": 46.31409454126866, + "learning_rate": 5.275537634408603e-06, + "loss": 1.2501, + "step": 1570 + }, + { + "epoch": 0.16, + "grad_norm": 24.436697787509363, + "learning_rate": 5.2923387096774195e-06, + "loss": 1.2446, + "step": 1575 + }, + { + "epoch": 0.16, + "grad_norm": 15.211955398526683, + "learning_rate": 5.309139784946237e-06, + "loss": 1.2485, + "step": 1580 + }, + { + "epoch": 0.16, + "grad_norm": 51.17545641781786, + "learning_rate": 5.3259408602150546e-06, + "loss": 1.2664, + "step": 1585 + }, + { + "epoch": 0.16, + "grad_norm": 22.73729158273849, + "learning_rate": 5.342741935483872e-06, + "loss": 1.2712, + "step": 1590 + }, + { + "epoch": 0.16, + "grad_norm": 14.40335244214581, + "learning_rate": 5.359543010752689e-06, + "loss": 1.2636, + "step": 1595 + }, + { + "epoch": 0.16, + "grad_norm": 10.214516983760586, + "learning_rate": 5.376344086021506e-06, + "loss": 1.2158, + "step": 1600 + }, + { + "epoch": 0.16, + "grad_norm": 10.34966113388582, + "learning_rate": 5.393145161290323e-06, + "loss": 1.2513, + "step": 1605 + }, + { + "epoch": 0.16, + "grad_norm": 27.012632212205652, + "learning_rate": 5.40994623655914e-06, + "loss": 1.2153, + "step": 1610 + }, + { + "epoch": 0.16, + "grad_norm": 66.26220201569467, + "learning_rate": 5.426747311827958e-06, + "loss": 1.2941, + "step": 1615 + }, + { + "epoch": 0.16, + "grad_norm": 14.447517864939536, + "learning_rate": 5.443548387096774e-06, + "loss": 1.2104, + "step": 1620 + }, + { + "epoch": 0.16, + "grad_norm": 53.03873905700024, + "learning_rate": 5.460349462365591e-06, + "loss": 1.2912, + "step": 1625 + }, + { + "epoch": 0.16, + "grad_norm": 82.47953703104959, + "learning_rate": 5.477150537634409e-06, + "loss": 1.2963, + "step": 1630 + }, + { + "epoch": 0.16, + "grad_norm": 59.35832189434322, + "learning_rate": 5.4939516129032264e-06, + "loss": 1.299, + "step": 1635 + }, + { + "epoch": 0.17, + "grad_norm": 21.23456282221694, + "learning_rate": 5.510752688172043e-06, + "loss": 1.2955, + "step": 1640 + }, + { + "epoch": 0.17, + "grad_norm": 17.208909499987218, + "learning_rate": 5.527553763440861e-06, + "loss": 1.2827, + "step": 1645 + }, + { + "epoch": 0.17, + "grad_norm": 17.814579537509662, + "learning_rate": 5.544354838709678e-06, + "loss": 1.2782, + "step": 1650 + }, + { + "epoch": 0.17, + "grad_norm": 44.84766228129635, + "learning_rate": 5.561155913978496e-06, + "loss": 1.3388, + "step": 1655 + }, + { + "epoch": 0.17, + "grad_norm": 80.02157305544218, + "learning_rate": 5.577956989247312e-06, + "loss": 1.3387, + "step": 1660 + }, + { + "epoch": 0.17, + "grad_norm": 47.30693371027271, + "learning_rate": 5.594758064516129e-06, + "loss": 1.2853, + "step": 1665 + }, + { + "epoch": 0.17, + "grad_norm": 31.111971822450105, + "learning_rate": 5.611559139784947e-06, + "loss": 1.3207, + "step": 1670 + }, + { + "epoch": 0.17, + "grad_norm": 29.323472551959778, + "learning_rate": 5.628360215053764e-06, + "loss": 1.2522, + "step": 1675 + }, + { + "epoch": 0.17, + "grad_norm": 40.92036524504599, + "learning_rate": 5.645161290322582e-06, + "loss": 1.3008, + "step": 1680 + }, + { + "epoch": 0.17, + "grad_norm": 33.7231465176679, + "learning_rate": 5.661962365591398e-06, + "loss": 1.2796, + "step": 1685 + }, + { + "epoch": 0.17, + "grad_norm": 11.506027765246968, + "learning_rate": 5.678763440860215e-06, + "loss": 1.2689, + "step": 1690 + }, + { + "epoch": 0.17, + "grad_norm": 23.309943493618878, + "learning_rate": 5.6955645161290325e-06, + "loss": 1.2803, + "step": 1695 + }, + { + "epoch": 0.17, + "grad_norm": 14.260279103971566, + "learning_rate": 5.7123655913978505e-06, + "loss": 1.2479, + "step": 1700 + }, + { + "epoch": 0.17, + "grad_norm": 17.417965761132848, + "learning_rate": 5.729166666666667e-06, + "loss": 1.2659, + "step": 1705 + }, + { + "epoch": 0.17, + "grad_norm": 19.069359196115766, + "learning_rate": 5.745967741935484e-06, + "loss": 1.258, + "step": 1710 + }, + { + "epoch": 0.17, + "grad_norm": 31.862010501733916, + "learning_rate": 5.762768817204302e-06, + "loss": 1.2745, + "step": 1715 + }, + { + "epoch": 0.17, + "grad_norm": 14.171126284799604, + "learning_rate": 5.779569892473119e-06, + "loss": 1.2201, + "step": 1720 + }, + { + "epoch": 0.17, + "grad_norm": 10.839283951323784, + "learning_rate": 5.796370967741935e-06, + "loss": 1.2394, + "step": 1725 + }, + { + "epoch": 0.17, + "grad_norm": 28.093728374613843, + "learning_rate": 5.813172043010753e-06, + "loss": 1.2638, + "step": 1730 + }, + { + "epoch": 0.17, + "grad_norm": 44.58712437099615, + "learning_rate": 5.82997311827957e-06, + "loss": 1.263, + "step": 1735 + }, + { + "epoch": 0.18, + "grad_norm": 11.611237724562416, + "learning_rate": 5.846774193548388e-06, + "loss": 1.2396, + "step": 1740 + }, + { + "epoch": 0.18, + "grad_norm": 10.721134129684025, + "learning_rate": 5.863575268817204e-06, + "loss": 1.2612, + "step": 1745 + }, + { + "epoch": 0.18, + "grad_norm": 28.526923438499946, + "learning_rate": 5.8803763440860215e-06, + "loss": 1.2348, + "step": 1750 + }, + { + "epoch": 0.18, + "grad_norm": 28.344913656755676, + "learning_rate": 5.8971774193548394e-06, + "loss": 1.2826, + "step": 1755 + }, + { + "epoch": 0.18, + "grad_norm": 11.471422006525902, + "learning_rate": 5.9139784946236566e-06, + "loss": 1.2943, + "step": 1760 + }, + { + "epoch": 0.18, + "grad_norm": 10.434390641845038, + "learning_rate": 5.930779569892473e-06, + "loss": 1.2229, + "step": 1765 + }, + { + "epoch": 0.18, + "grad_norm": 16.75539696310857, + "learning_rate": 5.947580645161291e-06, + "loss": 1.2526, + "step": 1770 + }, + { + "epoch": 0.18, + "grad_norm": 12.3925293626586, + "learning_rate": 5.964381720430108e-06, + "loss": 1.2394, + "step": 1775 + }, + { + "epoch": 0.18, + "grad_norm": 12.786185248449616, + "learning_rate": 5.981182795698926e-06, + "loss": 1.2927, + "step": 1780 + }, + { + "epoch": 0.18, + "grad_norm": 28.326800741035534, + "learning_rate": 5.997983870967743e-06, + "loss": 1.3106, + "step": 1785 + }, + { + "epoch": 0.18, + "grad_norm": 13.276235173795225, + "learning_rate": 6.014784946236559e-06, + "loss": 1.2011, + "step": 1790 + }, + { + "epoch": 0.18, + "grad_norm": 19.043491768486806, + "learning_rate": 6.031586021505377e-06, + "loss": 1.31, + "step": 1795 + }, + { + "epoch": 0.18, + "grad_norm": 12.982912118702751, + "learning_rate": 6.048387096774194e-06, + "loss": 1.1834, + "step": 1800 + }, + { + "epoch": 0.18, + "grad_norm": 23.46029378615589, + "learning_rate": 6.065188172043011e-06, + "loss": 1.2058, + "step": 1805 + }, + { + "epoch": 0.18, + "grad_norm": 9.56457697533073, + "learning_rate": 6.0819892473118284e-06, + "loss": 1.2209, + "step": 1810 + }, + { + "epoch": 0.18, + "grad_norm": 20.007818323470907, + "learning_rate": 6.0987903225806455e-06, + "loss": 1.2084, + "step": 1815 + }, + { + "epoch": 0.18, + "grad_norm": 30.888852345480156, + "learning_rate": 6.115591397849463e-06, + "loss": 1.2878, + "step": 1820 + }, + { + "epoch": 0.18, + "grad_norm": 11.206797920592592, + "learning_rate": 6.132392473118281e-06, + "loss": 1.256, + "step": 1825 + }, + { + "epoch": 0.18, + "grad_norm": 41.77653719952749, + "learning_rate": 6.149193548387097e-06, + "loss": 1.2143, + "step": 1830 + }, + { + "epoch": 0.19, + "grad_norm": 22.773443987679865, + "learning_rate": 6.165994623655914e-06, + "loss": 1.2212, + "step": 1835 + }, + { + "epoch": 0.19, + "grad_norm": 15.003527818874442, + "learning_rate": 6.182795698924732e-06, + "loss": 1.1451, + "step": 1840 + }, + { + "epoch": 0.19, + "grad_norm": 10.832295518728609, + "learning_rate": 6.199596774193549e-06, + "loss": 1.2371, + "step": 1845 + }, + { + "epoch": 0.19, + "grad_norm": 14.106996556562182, + "learning_rate": 6.216397849462365e-06, + "loss": 1.2564, + "step": 1850 + }, + { + "epoch": 0.19, + "grad_norm": 11.6321798945971, + "learning_rate": 6.233198924731183e-06, + "loss": 1.235, + "step": 1855 + }, + { + "epoch": 0.19, + "grad_norm": 12.246030421588882, + "learning_rate": 6.25e-06, + "loss": 1.3035, + "step": 1860 + }, + { + "epoch": 0.19, + "grad_norm": 14.37717478205693, + "learning_rate": 6.266801075268818e-06, + "loss": 1.2778, + "step": 1865 + }, + { + "epoch": 0.19, + "grad_norm": 15.24027128069132, + "learning_rate": 6.283602150537635e-06, + "loss": 1.272, + "step": 1870 + }, + { + "epoch": 0.19, + "grad_norm": 34.70380358345755, + "learning_rate": 6.300403225806452e-06, + "loss": 1.2512, + "step": 1875 + }, + { + "epoch": 0.19, + "grad_norm": 46.86598098882556, + "learning_rate": 6.3172043010752696e-06, + "loss": 1.2413, + "step": 1880 + }, + { + "epoch": 0.19, + "grad_norm": 24.5265225085656, + "learning_rate": 6.334005376344087e-06, + "loss": 1.2549, + "step": 1885 + }, + { + "epoch": 0.19, + "grad_norm": 12.176692968830055, + "learning_rate": 6.350806451612905e-06, + "loss": 1.2243, + "step": 1890 + }, + { + "epoch": 0.19, + "grad_norm": 12.623066798462252, + "learning_rate": 6.367607526881721e-06, + "loss": 1.2203, + "step": 1895 + }, + { + "epoch": 0.19, + "grad_norm": 20.506001827294856, + "learning_rate": 6.384408602150538e-06, + "loss": 1.2448, + "step": 1900 + }, + { + "epoch": 0.19, + "grad_norm": 24.346786364555726, + "learning_rate": 6.401209677419356e-06, + "loss": 1.1681, + "step": 1905 + }, + { + "epoch": 0.19, + "grad_norm": 22.37322556152353, + "learning_rate": 6.418010752688173e-06, + "loss": 1.2679, + "step": 1910 + }, + { + "epoch": 0.19, + "grad_norm": 18.538859354210924, + "learning_rate": 6.434811827956989e-06, + "loss": 1.225, + "step": 1915 + }, + { + "epoch": 0.19, + "grad_norm": 34.54548609395492, + "learning_rate": 6.451612903225806e-06, + "loss": 1.2446, + "step": 1920 + }, + { + "epoch": 0.19, + "grad_norm": 10.82382489549032, + "learning_rate": 6.468413978494624e-06, + "loss": 1.2507, + "step": 1925 + }, + { + "epoch": 0.19, + "grad_norm": 19.144353601820267, + "learning_rate": 6.4852150537634414e-06, + "loss": 1.2938, + "step": 1930 + }, + { + "epoch": 0.2, + "grad_norm": 29.58265058763573, + "learning_rate": 6.502016129032258e-06, + "loss": 1.2952, + "step": 1935 + }, + { + "epoch": 0.2, + "grad_norm": 9.721343926876305, + "learning_rate": 6.518817204301076e-06, + "loss": 1.1828, + "step": 1940 + }, + { + "epoch": 0.2, + "grad_norm": 10.514261400823013, + "learning_rate": 6.535618279569893e-06, + "loss": 1.2053, + "step": 1945 + }, + { + "epoch": 0.2, + "grad_norm": 15.533730854979197, + "learning_rate": 6.552419354838711e-06, + "loss": 1.2568, + "step": 1950 + }, + { + "epoch": 0.2, + "grad_norm": 16.386926044621365, + "learning_rate": 6.569220430107528e-06, + "loss": 1.2712, + "step": 1955 + }, + { + "epoch": 0.2, + "grad_norm": 9.035649299385856, + "learning_rate": 6.586021505376344e-06, + "loss": 1.1855, + "step": 1960 + }, + { + "epoch": 0.2, + "grad_norm": 19.69163264701267, + "learning_rate": 6.602822580645162e-06, + "loss": 1.2761, + "step": 1965 + }, + { + "epoch": 0.2, + "grad_norm": 37.0944311318195, + "learning_rate": 6.619623655913979e-06, + "loss": 1.2577, + "step": 1970 + }, + { + "epoch": 0.2, + "grad_norm": 9.865249614590498, + "learning_rate": 6.636424731182797e-06, + "loss": 1.26, + "step": 1975 + }, + { + "epoch": 0.2, + "grad_norm": 11.904200524254145, + "learning_rate": 6.653225806451613e-06, + "loss": 1.2764, + "step": 1980 + }, + { + "epoch": 0.2, + "grad_norm": 12.933867176670313, + "learning_rate": 6.67002688172043e-06, + "loss": 1.2721, + "step": 1985 + }, + { + "epoch": 0.2, + "grad_norm": 29.639862278436, + "learning_rate": 6.686827956989248e-06, + "loss": 1.2208, + "step": 1990 + }, + { + "epoch": 0.2, + "grad_norm": 25.54302227116381, + "learning_rate": 6.7036290322580655e-06, + "loss": 1.2142, + "step": 1995 + }, + { + "epoch": 0.2, + "grad_norm": 13.655438913627023, + "learning_rate": 6.720430107526882e-06, + "loss": 1.2235, + "step": 2000 + }, + { + "epoch": 0.2, + "grad_norm": 18.24343043881259, + "learning_rate": 6.7372311827957e-06, + "loss": 1.2186, + "step": 2005 + }, + { + "epoch": 0.2, + "grad_norm": 12.287545958285396, + "learning_rate": 6.754032258064517e-06, + "loss": 1.2351, + "step": 2010 + }, + { + "epoch": 0.2, + "grad_norm": 27.61706184500475, + "learning_rate": 6.770833333333334e-06, + "loss": 1.219, + "step": 2015 + }, + { + "epoch": 0.2, + "grad_norm": 12.006823598473522, + "learning_rate": 6.787634408602151e-06, + "loss": 1.2106, + "step": 2020 + }, + { + "epoch": 0.2, + "grad_norm": 24.899259234998013, + "learning_rate": 6.804435483870968e-06, + "loss": 1.2471, + "step": 2025 + }, + { + "epoch": 0.2, + "grad_norm": 108.13820601181524, + "learning_rate": 6.821236559139785e-06, + "loss": 1.3069, + "step": 2030 + }, + { + "epoch": 0.21, + "grad_norm": 51.41482122176862, + "learning_rate": 6.838037634408603e-06, + "loss": 1.274, + "step": 2035 + }, + { + "epoch": 0.21, + "grad_norm": 27.969160983681736, + "learning_rate": 6.854838709677419e-06, + "loss": 1.2421, + "step": 2040 + }, + { + "epoch": 0.21, + "grad_norm": 116.52262746448919, + "learning_rate": 6.8716397849462365e-06, + "loss": 1.209, + "step": 2045 + }, + { + "epoch": 0.21, + "grad_norm": 28.9605266075955, + "learning_rate": 6.8884408602150544e-06, + "loss": 1.2727, + "step": 2050 + }, + { + "epoch": 0.21, + "grad_norm": 20.262447992796503, + "learning_rate": 6.9052419354838715e-06, + "loss": 1.2644, + "step": 2055 + }, + { + "epoch": 0.21, + "grad_norm": 12.79893422071457, + "learning_rate": 6.9220430107526895e-06, + "loss": 1.2442, + "step": 2060 + }, + { + "epoch": 0.21, + "grad_norm": 23.597737721499232, + "learning_rate": 6.938844086021506e-06, + "loss": 1.241, + "step": 2065 + }, + { + "epoch": 0.21, + "grad_norm": 24.518983087782463, + "learning_rate": 6.955645161290323e-06, + "loss": 1.2307, + "step": 2070 + }, + { + "epoch": 0.21, + "grad_norm": 20.560253477289546, + "learning_rate": 6.972446236559141e-06, + "loss": 1.2691, + "step": 2075 + }, + { + "epoch": 0.21, + "grad_norm": 25.848118116745248, + "learning_rate": 6.989247311827958e-06, + "loss": 1.2724, + "step": 2080 + }, + { + "epoch": 0.21, + "grad_norm": 9.914019924510955, + "learning_rate": 7.006048387096774e-06, + "loss": 1.2748, + "step": 2085 + }, + { + "epoch": 0.21, + "grad_norm": 10.925737866573701, + "learning_rate": 7.022849462365592e-06, + "loss": 1.2468, + "step": 2090 + }, + { + "epoch": 0.21, + "grad_norm": 10.000718814137622, + "learning_rate": 7.039650537634409e-06, + "loss": 1.2004, + "step": 2095 + }, + { + "epoch": 0.21, + "grad_norm": 14.595339324971437, + "learning_rate": 7.056451612903227e-06, + "loss": 1.2976, + "step": 2100 + }, + { + "epoch": 0.21, + "grad_norm": 15.463529208058105, + "learning_rate": 7.073252688172043e-06, + "loss": 1.2366, + "step": 2105 + }, + { + "epoch": 0.21, + "grad_norm": 29.45323360640321, + "learning_rate": 7.0900537634408605e-06, + "loss": 1.2691, + "step": 2110 + }, + { + "epoch": 0.21, + "grad_norm": 12.03636155828545, + "learning_rate": 7.1068548387096785e-06, + "loss": 1.2626, + "step": 2115 + }, + { + "epoch": 0.21, + "grad_norm": 41.205485411736404, + "learning_rate": 7.1236559139784956e-06, + "loss": 1.2255, + "step": 2120 + }, + { + "epoch": 0.21, + "grad_norm": 19.8354928734948, + "learning_rate": 7.140456989247312e-06, + "loss": 1.2627, + "step": 2125 + }, + { + "epoch": 0.21, + "grad_norm": 13.143130418752412, + "learning_rate": 7.15725806451613e-06, + "loss": 1.2636, + "step": 2130 + }, + { + "epoch": 0.22, + "grad_norm": 26.329639735822322, + "learning_rate": 7.174059139784947e-06, + "loss": 1.2913, + "step": 2135 + }, + { + "epoch": 0.22, + "grad_norm": 10.53974742117649, + "learning_rate": 7.190860215053764e-06, + "loss": 1.1796, + "step": 2140 + }, + { + "epoch": 0.22, + "grad_norm": 19.81150442809384, + "learning_rate": 7.207661290322582e-06, + "loss": 1.2697, + "step": 2145 + }, + { + "epoch": 0.22, + "grad_norm": 41.27431702908246, + "learning_rate": 7.224462365591398e-06, + "loss": 1.2155, + "step": 2150 + }, + { + "epoch": 0.22, + "grad_norm": 17.502773503666184, + "learning_rate": 7.241263440860215e-06, + "loss": 1.2091, + "step": 2155 + }, + { + "epoch": 0.22, + "grad_norm": 9.51563458004467, + "learning_rate": 7.258064516129033e-06, + "loss": 1.2635, + "step": 2160 + }, + { + "epoch": 0.22, + "grad_norm": 15.575762923082294, + "learning_rate": 7.27486559139785e-06, + "loss": 1.221, + "step": 2165 + }, + { + "epoch": 0.22, + "grad_norm": 21.278627675604866, + "learning_rate": 7.291666666666667e-06, + "loss": 1.2411, + "step": 2170 + }, + { + "epoch": 0.22, + "grad_norm": 13.04465876557275, + "learning_rate": 7.3084677419354845e-06, + "loss": 1.2134, + "step": 2175 + }, + { + "epoch": 0.22, + "grad_norm": 11.011104679604054, + "learning_rate": 7.325268817204302e-06, + "loss": 1.2422, + "step": 2180 + }, + { + "epoch": 0.22, + "grad_norm": 16.353363056360596, + "learning_rate": 7.34206989247312e-06, + "loss": 1.2202, + "step": 2185 + }, + { + "epoch": 0.22, + "grad_norm": 12.87880319538541, + "learning_rate": 7.358870967741936e-06, + "loss": 1.2097, + "step": 2190 + }, + { + "epoch": 0.22, + "grad_norm": 11.533815983023235, + "learning_rate": 7.375672043010753e-06, + "loss": 1.2266, + "step": 2195 + }, + { + "epoch": 0.22, + "grad_norm": 55.96590550959414, + "learning_rate": 7.392473118279571e-06, + "loss": 1.2446, + "step": 2200 + }, + { + "epoch": 0.22, + "grad_norm": 27.6433561291497, + "learning_rate": 7.409274193548388e-06, + "loss": 1.2762, + "step": 2205 + }, + { + "epoch": 0.22, + "grad_norm": 49.82007468136901, + "learning_rate": 7.426075268817204e-06, + "loss": 1.2283, + "step": 2210 + }, + { + "epoch": 0.22, + "grad_norm": 14.84247480602059, + "learning_rate": 7.442876344086022e-06, + "loss": 1.2353, + "step": 2215 + }, + { + "epoch": 0.22, + "grad_norm": 13.413311841272398, + "learning_rate": 7.459677419354839e-06, + "loss": 1.2622, + "step": 2220 + }, + { + "epoch": 0.22, + "grad_norm": 40.40763721338431, + "learning_rate": 7.476478494623656e-06, + "loss": 1.2936, + "step": 2225 + }, + { + "epoch": 0.22, + "grad_norm": 82.94010690130378, + "learning_rate": 7.4932795698924735e-06, + "loss": 1.2799, + "step": 2230 + }, + { + "epoch": 0.23, + "grad_norm": 40.39126638765991, + "learning_rate": 7.510080645161291e-06, + "loss": 1.3345, + "step": 2235 + }, + { + "epoch": 0.23, + "grad_norm": 47.39627347260159, + "learning_rate": 7.526881720430108e-06, + "loss": 1.228, + "step": 2240 + }, + { + "epoch": 0.23, + "grad_norm": 11.629121665795358, + "learning_rate": 7.543682795698926e-06, + "loss": 1.2331, + "step": 2245 + }, + { + "epoch": 0.23, + "grad_norm": 15.306221861015311, + "learning_rate": 7.560483870967743e-06, + "loss": 1.2713, + "step": 2250 + }, + { + "epoch": 0.23, + "grad_norm": 24.542736233239204, + "learning_rate": 7.577284946236559e-06, + "loss": 1.2892, + "step": 2255 + }, + { + "epoch": 0.23, + "grad_norm": 21.07591070120843, + "learning_rate": 7.594086021505377e-06, + "loss": 1.2692, + "step": 2260 + }, + { + "epoch": 0.23, + "grad_norm": 29.56250640158638, + "learning_rate": 7.610887096774194e-06, + "loss": 1.2271, + "step": 2265 + }, + { + "epoch": 0.23, + "grad_norm": 46.59319817819226, + "learning_rate": 7.627688172043012e-06, + "loss": 1.2268, + "step": 2270 + }, + { + "epoch": 0.23, + "grad_norm": 84.07647878086206, + "learning_rate": 7.644489247311827e-06, + "loss": 1.2491, + "step": 2275 + }, + { + "epoch": 0.23, + "grad_norm": 42.82187183604961, + "learning_rate": 7.661290322580646e-06, + "loss": 1.2686, + "step": 2280 + }, + { + "epoch": 0.23, + "grad_norm": 63.93417339646263, + "learning_rate": 7.678091397849463e-06, + "loss": 1.2721, + "step": 2285 + }, + { + "epoch": 0.23, + "grad_norm": 33.91029033440181, + "learning_rate": 7.69489247311828e-06, + "loss": 1.2616, + "step": 2290 + }, + { + "epoch": 0.23, + "grad_norm": 9.582134518943782, + "learning_rate": 7.711693548387098e-06, + "loss": 1.2684, + "step": 2295 + }, + { + "epoch": 0.23, + "grad_norm": 24.52154465326118, + "learning_rate": 7.728494623655915e-06, + "loss": 1.268, + "step": 2300 + }, + { + "epoch": 0.23, + "grad_norm": 39.68127286315339, + "learning_rate": 7.745295698924732e-06, + "loss": 1.2984, + "step": 2305 + }, + { + "epoch": 0.23, + "grad_norm": 64.21209850893193, + "learning_rate": 7.762096774193549e-06, + "loss": 1.3044, + "step": 2310 + }, + { + "epoch": 0.23, + "grad_norm": 51.574901468650154, + "learning_rate": 7.778897849462366e-06, + "loss": 1.2065, + "step": 2315 + }, + { + "epoch": 0.23, + "grad_norm": 29.1347166115766, + "learning_rate": 7.795698924731183e-06, + "loss": 1.2155, + "step": 2320 + }, + { + "epoch": 0.23, + "grad_norm": 14.815362147149694, + "learning_rate": 7.8125e-06, + "loss": 1.2377, + "step": 2325 + }, + { + "epoch": 0.23, + "grad_norm": 31.023853624949705, + "learning_rate": 7.829301075268817e-06, + "loss": 1.2757, + "step": 2330 + }, + { + "epoch": 0.24, + "grad_norm": 11.259718591127633, + "learning_rate": 7.846102150537636e-06, + "loss": 1.2178, + "step": 2335 + }, + { + "epoch": 0.24, + "grad_norm": 10.77824590902307, + "learning_rate": 7.862903225806451e-06, + "loss": 1.27, + "step": 2340 + }, + { + "epoch": 0.24, + "grad_norm": 26.383993695953727, + "learning_rate": 7.879704301075269e-06, + "loss": 1.2079, + "step": 2345 + }, + { + "epoch": 0.24, + "grad_norm": 26.736318983340148, + "learning_rate": 7.896505376344087e-06, + "loss": 1.2078, + "step": 2350 + }, + { + "epoch": 0.24, + "grad_norm": 27.43805184607868, + "learning_rate": 7.913306451612904e-06, + "loss": 1.2061, + "step": 2355 + }, + { + "epoch": 0.24, + "grad_norm": 10.12846401107268, + "learning_rate": 7.93010752688172e-06, + "loss": 1.2931, + "step": 2360 + }, + { + "epoch": 0.24, + "grad_norm": 8.723056959950561, + "learning_rate": 7.946908602150539e-06, + "loss": 1.1951, + "step": 2365 + }, + { + "epoch": 0.24, + "grad_norm": 8.066035570374572, + "learning_rate": 7.963709677419356e-06, + "loss": 1.1917, + "step": 2370 + }, + { + "epoch": 0.24, + "grad_norm": 16.15126653319653, + "learning_rate": 7.980510752688173e-06, + "loss": 1.2501, + "step": 2375 + }, + { + "epoch": 0.24, + "grad_norm": 9.825376500667879, + "learning_rate": 7.99731182795699e-06, + "loss": 1.2303, + "step": 2380 + }, + { + "epoch": 0.24, + "grad_norm": 15.19258369455789, + "learning_rate": 8.014112903225807e-06, + "loss": 1.2147, + "step": 2385 + }, + { + "epoch": 0.24, + "grad_norm": 18.237466445781678, + "learning_rate": 8.030913978494624e-06, + "loss": 1.1802, + "step": 2390 + }, + { + "epoch": 0.24, + "grad_norm": 28.40527681526738, + "learning_rate": 8.047715053763441e-06, + "loss": 1.262, + "step": 2395 + }, + { + "epoch": 0.24, + "grad_norm": 9.153614338557704, + "learning_rate": 8.064516129032258e-06, + "loss": 1.2535, + "step": 2400 + }, + { + "epoch": 0.24, + "grad_norm": 18.013184380961132, + "learning_rate": 8.081317204301075e-06, + "loss": 1.3406, + "step": 2405 + }, + { + "epoch": 0.24, + "grad_norm": 11.551706178969816, + "learning_rate": 8.098118279569893e-06, + "loss": 1.2798, + "step": 2410 + }, + { + "epoch": 0.24, + "grad_norm": 29.301817149313784, + "learning_rate": 8.114919354838711e-06, + "loss": 1.2021, + "step": 2415 + }, + { + "epoch": 0.24, + "grad_norm": 14.604552659024082, + "learning_rate": 8.131720430107529e-06, + "loss": 1.2193, + "step": 2420 + }, + { + "epoch": 0.24, + "grad_norm": 13.661579135545585, + "learning_rate": 8.148521505376344e-06, + "loss": 1.1753, + "step": 2425 + }, + { + "epoch": 0.24, + "grad_norm": 17.732871735188713, + "learning_rate": 8.165322580645163e-06, + "loss": 1.1896, + "step": 2430 + }, + { + "epoch": 0.25, + "grad_norm": 21.94264730965134, + "learning_rate": 8.18212365591398e-06, + "loss": 1.294, + "step": 2435 + }, + { + "epoch": 0.25, + "grad_norm": 8.98862683040616, + "learning_rate": 8.198924731182797e-06, + "loss": 1.3597, + "step": 2440 + }, + { + "epoch": 0.25, + "grad_norm": 9.348562191116157, + "learning_rate": 8.215725806451614e-06, + "loss": 1.1981, + "step": 2445 + }, + { + "epoch": 0.25, + "grad_norm": 12.382309784539512, + "learning_rate": 8.232526881720431e-06, + "loss": 1.197, + "step": 2450 + }, + { + "epoch": 0.25, + "grad_norm": 12.802639889690079, + "learning_rate": 8.249327956989248e-06, + "loss": 1.2518, + "step": 2455 + }, + { + "epoch": 0.25, + "grad_norm": 9.105120008248589, + "learning_rate": 8.266129032258065e-06, + "loss": 1.2412, + "step": 2460 + }, + { + "epoch": 0.25, + "grad_norm": 22.5005191369109, + "learning_rate": 8.282930107526882e-06, + "loss": 1.2095, + "step": 2465 + }, + { + "epoch": 0.25, + "grad_norm": 55.06496286963062, + "learning_rate": 8.2997311827957e-06, + "loss": 1.2292, + "step": 2470 + }, + { + "epoch": 0.25, + "grad_norm": 25.40056685381794, + "learning_rate": 8.316532258064517e-06, + "loss": 1.2229, + "step": 2475 + }, + { + "epoch": 0.25, + "grad_norm": 19.75732050971706, + "learning_rate": 8.333333333333334e-06, + "loss": 1.276, + "step": 2480 + }, + { + "epoch": 0.25, + "grad_norm": 19.259188424158435, + "learning_rate": 8.35013440860215e-06, + "loss": 1.2295, + "step": 2485 + }, + { + "epoch": 0.25, + "grad_norm": 48.034449754252755, + "learning_rate": 8.366935483870968e-06, + "loss": 1.2694, + "step": 2490 + }, + { + "epoch": 0.25, + "grad_norm": 27.913204135569178, + "learning_rate": 8.383736559139785e-06, + "loss": 1.2747, + "step": 2495 + }, + { + "epoch": 0.25, + "grad_norm": 34.71178286331895, + "learning_rate": 8.400537634408604e-06, + "loss": 1.2141, + "step": 2500 + }, + { + "epoch": 0.25, + "grad_norm": 19.01748344569644, + "learning_rate": 8.41733870967742e-06, + "loss": 1.2222, + "step": 2505 + }, + { + "epoch": 0.25, + "grad_norm": 10.43993883230521, + "learning_rate": 8.434139784946236e-06, + "loss": 1.258, + "step": 2510 + }, + { + "epoch": 0.25, + "grad_norm": 38.56997728761633, + "learning_rate": 8.450940860215055e-06, + "loss": 1.2001, + "step": 2515 + }, + { + "epoch": 0.25, + "grad_norm": 22.647653948196986, + "learning_rate": 8.467741935483872e-06, + "loss": 1.2565, + "step": 2520 + }, + { + "epoch": 0.25, + "grad_norm": 31.804400232690394, + "learning_rate": 8.48454301075269e-06, + "loss": 1.2916, + "step": 2525 + }, + { + "epoch": 0.26, + "grad_norm": 10.568518363297464, + "learning_rate": 8.501344086021506e-06, + "loss": 1.2009, + "step": 2530 + }, + { + "epoch": 0.26, + "grad_norm": 13.26372201231781, + "learning_rate": 8.518145161290324e-06, + "loss": 1.209, + "step": 2535 + }, + { + "epoch": 0.26, + "grad_norm": 46.54550273923697, + "learning_rate": 8.53494623655914e-06, + "loss": 1.245, + "step": 2540 + }, + { + "epoch": 0.26, + "grad_norm": 9.219462432954233, + "learning_rate": 8.551747311827958e-06, + "loss": 1.2544, + "step": 2545 + }, + { + "epoch": 0.26, + "grad_norm": 44.110924602079656, + "learning_rate": 8.568548387096775e-06, + "loss": 1.2368, + "step": 2550 + }, + { + "epoch": 0.26, + "grad_norm": 43.35815365202579, + "learning_rate": 8.585349462365592e-06, + "loss": 1.2318, + "step": 2555 + }, + { + "epoch": 0.26, + "grad_norm": 11.019119762296853, + "learning_rate": 8.602150537634409e-06, + "loss": 1.2579, + "step": 2560 + }, + { + "epoch": 0.26, + "grad_norm": 13.597771671863768, + "learning_rate": 8.618951612903226e-06, + "loss": 1.255, + "step": 2565 + }, + { + "epoch": 0.26, + "grad_norm": 50.13693425901624, + "learning_rate": 8.635752688172043e-06, + "loss": 1.256, + "step": 2570 + }, + { + "epoch": 0.26, + "grad_norm": 7.904065398841818, + "learning_rate": 8.65255376344086e-06, + "loss": 1.2039, + "step": 2575 + }, + { + "epoch": 0.26, + "grad_norm": 37.45772381933171, + "learning_rate": 8.669354838709677e-06, + "loss": 1.2229, + "step": 2580 + }, + { + "epoch": 0.26, + "grad_norm": 39.27057013113243, + "learning_rate": 8.686155913978496e-06, + "loss": 1.2191, + "step": 2585 + }, + { + "epoch": 0.26, + "grad_norm": 70.60558760616007, + "learning_rate": 8.702956989247312e-06, + "loss": 1.3142, + "step": 2590 + }, + { + "epoch": 0.26, + "grad_norm": 28.19659398732191, + "learning_rate": 8.719758064516129e-06, + "loss": 1.2259, + "step": 2595 + }, + { + "epoch": 0.26, + "grad_norm": 23.9566064510682, + "learning_rate": 8.736559139784948e-06, + "loss": 1.2543, + "step": 2600 + }, + { + "epoch": 0.26, + "grad_norm": 15.671051560916663, + "learning_rate": 8.753360215053765e-06, + "loss": 1.2193, + "step": 2605 + }, + { + "epoch": 0.26, + "grad_norm": 29.838824406491774, + "learning_rate": 8.770161290322582e-06, + "loss": 1.2308, + "step": 2610 + }, + { + "epoch": 0.26, + "grad_norm": 8.987977176453866, + "learning_rate": 8.786962365591399e-06, + "loss": 1.2298, + "step": 2615 + }, + { + "epoch": 0.26, + "grad_norm": 16.52133772623778, + "learning_rate": 8.803763440860216e-06, + "loss": 1.2272, + "step": 2620 + }, + { + "epoch": 0.26, + "grad_norm": 15.693819052666122, + "learning_rate": 8.820564516129033e-06, + "loss": 1.2678, + "step": 2625 + }, + { + "epoch": 0.27, + "grad_norm": 20.076796566784243, + "learning_rate": 8.83736559139785e-06, + "loss": 1.1967, + "step": 2630 + }, + { + "epoch": 0.27, + "grad_norm": 29.791845399978822, + "learning_rate": 8.854166666666667e-06, + "loss": 1.2163, + "step": 2635 + }, + { + "epoch": 0.27, + "grad_norm": 47.53203127521539, + "learning_rate": 8.870967741935484e-06, + "loss": 1.2407, + "step": 2640 + }, + { + "epoch": 0.27, + "grad_norm": 16.00670809300897, + "learning_rate": 8.887768817204302e-06, + "loss": 1.238, + "step": 2645 + }, + { + "epoch": 0.27, + "grad_norm": 13.108573334947934, + "learning_rate": 8.904569892473119e-06, + "loss": 1.2901, + "step": 2650 + }, + { + "epoch": 0.27, + "grad_norm": 16.094412129400425, + "learning_rate": 8.921370967741936e-06, + "loss": 1.1964, + "step": 2655 + }, + { + "epoch": 0.27, + "grad_norm": 11.235105946719376, + "learning_rate": 8.938172043010753e-06, + "loss": 1.245, + "step": 2660 + }, + { + "epoch": 0.27, + "grad_norm": 41.31431089609605, + "learning_rate": 8.95497311827957e-06, + "loss": 1.2624, + "step": 2665 + }, + { + "epoch": 0.27, + "grad_norm": 16.115421164459278, + "learning_rate": 8.971774193548389e-06, + "loss": 1.231, + "step": 2670 + }, + { + "epoch": 0.27, + "grad_norm": 55.68637807130677, + "learning_rate": 8.988575268817204e-06, + "loss": 1.2537, + "step": 2675 + }, + { + "epoch": 0.27, + "grad_norm": 44.42132523032244, + "learning_rate": 9.005376344086021e-06, + "loss": 1.1876, + "step": 2680 + }, + { + "epoch": 0.27, + "grad_norm": 27.949890360806133, + "learning_rate": 9.02217741935484e-06, + "loss": 1.2117, + "step": 2685 + }, + { + "epoch": 0.27, + "grad_norm": 40.761220118311314, + "learning_rate": 9.038978494623657e-06, + "loss": 1.2633, + "step": 2690 + }, + { + "epoch": 0.27, + "grad_norm": 20.224082082802013, + "learning_rate": 9.055779569892473e-06, + "loss": 1.2541, + "step": 2695 + }, + { + "epoch": 0.27, + "grad_norm": 17.094096668262384, + "learning_rate": 9.072580645161291e-06, + "loss": 1.2348, + "step": 2700 + }, + { + "epoch": 0.27, + "grad_norm": 60.1962371836327, + "learning_rate": 9.089381720430108e-06, + "loss": 1.2331, + "step": 2705 + }, + { + "epoch": 0.27, + "grad_norm": 55.94215746637798, + "learning_rate": 9.106182795698926e-06, + "loss": 1.2352, + "step": 2710 + }, + { + "epoch": 0.27, + "grad_norm": 32.57124388066799, + "learning_rate": 9.122983870967743e-06, + "loss": 1.2323, + "step": 2715 + }, + { + "epoch": 0.27, + "grad_norm": 16.941148434081548, + "learning_rate": 9.13978494623656e-06, + "loss": 1.2388, + "step": 2720 + }, + { + "epoch": 0.27, + "grad_norm": 18.389045889903844, + "learning_rate": 9.156586021505377e-06, + "loss": 1.2326, + "step": 2725 + }, + { + "epoch": 0.28, + "grad_norm": 16.3662470293946, + "learning_rate": 9.173387096774194e-06, + "loss": 1.2977, + "step": 2730 + }, + { + "epoch": 0.28, + "grad_norm": 14.143682290674668, + "learning_rate": 9.190188172043013e-06, + "loss": 1.2299, + "step": 2735 + }, + { + "epoch": 0.28, + "grad_norm": 10.488199757167923, + "learning_rate": 9.206989247311828e-06, + "loss": 1.1949, + "step": 2740 + }, + { + "epoch": 0.28, + "grad_norm": 15.543690391067857, + "learning_rate": 9.223790322580645e-06, + "loss": 1.285, + "step": 2745 + }, + { + "epoch": 0.28, + "grad_norm": 37.70309866093175, + "learning_rate": 9.240591397849464e-06, + "loss": 1.2336, + "step": 2750 + }, + { + "epoch": 0.28, + "grad_norm": 62.688716746409945, + "learning_rate": 9.257392473118281e-06, + "loss": 1.2581, + "step": 2755 + }, + { + "epoch": 0.28, + "grad_norm": 37.72106543644531, + "learning_rate": 9.274193548387097e-06, + "loss": 1.2729, + "step": 2760 + }, + { + "epoch": 0.28, + "grad_norm": 56.0751796884353, + "learning_rate": 9.290994623655915e-06, + "loss": 1.2609, + "step": 2765 + }, + { + "epoch": 0.28, + "grad_norm": 25.60819582600334, + "learning_rate": 9.307795698924732e-06, + "loss": 1.22, + "step": 2770 + }, + { + "epoch": 0.28, + "grad_norm": 15.017430243129686, + "learning_rate": 9.32459677419355e-06, + "loss": 1.2838, + "step": 2775 + }, + { + "epoch": 0.28, + "grad_norm": 21.030906077378877, + "learning_rate": 9.341397849462367e-06, + "loss": 1.2826, + "step": 2780 + }, + { + "epoch": 0.28, + "grad_norm": 28.879404465573838, + "learning_rate": 9.358198924731184e-06, + "loss": 1.252, + "step": 2785 + }, + { + "epoch": 0.28, + "grad_norm": 19.44653324996414, + "learning_rate": 9.375000000000001e-06, + "loss": 1.2714, + "step": 2790 + }, + { + "epoch": 0.28, + "grad_norm": 10.247990145821044, + "learning_rate": 9.391801075268818e-06, + "loss": 1.2259, + "step": 2795 + }, + { + "epoch": 0.28, + "grad_norm": 24.325834288611475, + "learning_rate": 9.408602150537635e-06, + "loss": 1.2544, + "step": 2800 + }, + { + "epoch": 0.28, + "grad_norm": 14.962373850237869, + "learning_rate": 9.425403225806452e-06, + "loss": 1.2342, + "step": 2805 + }, + { + "epoch": 0.28, + "grad_norm": 20.103796795195294, + "learning_rate": 9.44220430107527e-06, + "loss": 1.2428, + "step": 2810 + }, + { + "epoch": 0.28, + "grad_norm": 65.72278478898748, + "learning_rate": 9.459005376344086e-06, + "loss": 1.2779, + "step": 2815 + }, + { + "epoch": 0.28, + "grad_norm": 34.57590700018222, + "learning_rate": 9.475806451612905e-06, + "loss": 1.2214, + "step": 2820 + }, + { + "epoch": 0.28, + "grad_norm": 29.87845279871248, + "learning_rate": 9.49260752688172e-06, + "loss": 1.2607, + "step": 2825 + }, + { + "epoch": 0.29, + "grad_norm": 15.99531248344474, + "learning_rate": 9.509408602150538e-06, + "loss": 1.215, + "step": 2830 + }, + { + "epoch": 0.29, + "grad_norm": 40.937307170959855, + "learning_rate": 9.526209677419356e-06, + "loss": 1.2376, + "step": 2835 + }, + { + "epoch": 0.29, + "grad_norm": 24.801802543062614, + "learning_rate": 9.543010752688174e-06, + "loss": 1.2416, + "step": 2840 + }, + { + "epoch": 0.29, + "grad_norm": 14.843620697232266, + "learning_rate": 9.559811827956989e-06, + "loss": 1.2758, + "step": 2845 + }, + { + "epoch": 0.29, + "grad_norm": 25.767959275553633, + "learning_rate": 9.576612903225808e-06, + "loss": 1.2924, + "step": 2850 + }, + { + "epoch": 0.29, + "grad_norm": 24.835210656953024, + "learning_rate": 9.593413978494625e-06, + "loss": 1.3021, + "step": 2855 + }, + { + "epoch": 0.29, + "grad_norm": 14.811252228116862, + "learning_rate": 9.610215053763442e-06, + "loss": 1.1958, + "step": 2860 + }, + { + "epoch": 0.29, + "grad_norm": 8.84580463225306, + "learning_rate": 9.627016129032259e-06, + "loss": 1.2299, + "step": 2865 + }, + { + "epoch": 0.29, + "grad_norm": 35.50525850048963, + "learning_rate": 9.643817204301076e-06, + "loss": 1.2543, + "step": 2870 + }, + { + "epoch": 0.29, + "grad_norm": 17.821644675146317, + "learning_rate": 9.660618279569893e-06, + "loss": 1.2512, + "step": 2875 + }, + { + "epoch": 0.29, + "grad_norm": 25.16120140943554, + "learning_rate": 9.67741935483871e-06, + "loss": 1.2644, + "step": 2880 + }, + { + "epoch": 0.29, + "grad_norm": 26.443037158380964, + "learning_rate": 9.694220430107528e-06, + "loss": 1.2873, + "step": 2885 + }, + { + "epoch": 0.29, + "grad_norm": 32.58614186448375, + "learning_rate": 9.711021505376345e-06, + "loss": 1.2056, + "step": 2890 + }, + { + "epoch": 0.29, + "grad_norm": 28.009823108590062, + "learning_rate": 9.727822580645162e-06, + "loss": 1.2312, + "step": 2895 + }, + { + "epoch": 0.29, + "grad_norm": 29.593172996117087, + "learning_rate": 9.744623655913979e-06, + "loss": 1.2385, + "step": 2900 + }, + { + "epoch": 0.29, + "grad_norm": 46.463353284382364, + "learning_rate": 9.761424731182798e-06, + "loss": 1.2203, + "step": 2905 + }, + { + "epoch": 0.29, + "grad_norm": 14.522172617728398, + "learning_rate": 9.778225806451613e-06, + "loss": 1.2851, + "step": 2910 + }, + { + "epoch": 0.29, + "grad_norm": 21.8644130205428, + "learning_rate": 9.79502688172043e-06, + "loss": 1.3051, + "step": 2915 + }, + { + "epoch": 0.29, + "grad_norm": 60.03291649051756, + "learning_rate": 9.811827956989249e-06, + "loss": 1.2196, + "step": 2920 + }, + { + "epoch": 0.29, + "grad_norm": 60.594682600646706, + "learning_rate": 9.828629032258066e-06, + "loss": 1.2187, + "step": 2925 + }, + { + "epoch": 0.3, + "grad_norm": 8.381452691629116, + "learning_rate": 9.845430107526881e-06, + "loss": 1.2154, + "step": 2930 + }, + { + "epoch": 0.3, + "grad_norm": 10.090385181932243, + "learning_rate": 9.8622311827957e-06, + "loss": 1.2283, + "step": 2935 + }, + { + "epoch": 0.3, + "grad_norm": 18.85732363275535, + "learning_rate": 9.879032258064517e-06, + "loss": 1.3106, + "step": 2940 + }, + { + "epoch": 0.3, + "grad_norm": 39.278581764228036, + "learning_rate": 9.895833333333334e-06, + "loss": 1.2291, + "step": 2945 + }, + { + "epoch": 0.3, + "grad_norm": 44.68729775499523, + "learning_rate": 9.912634408602152e-06, + "loss": 1.2338, + "step": 2950 + }, + { + "epoch": 0.3, + "grad_norm": 43.45808222006517, + "learning_rate": 9.929435483870969e-06, + "loss": 1.3069, + "step": 2955 + }, + { + "epoch": 0.3, + "grad_norm": 17.02806886416131, + "learning_rate": 9.946236559139786e-06, + "loss": 1.2378, + "step": 2960 + }, + { + "epoch": 0.3, + "grad_norm": 35.73834294646412, + "learning_rate": 9.963037634408603e-06, + "loss": 1.2714, + "step": 2965 + }, + { + "epoch": 0.3, + "grad_norm": 24.782915964923706, + "learning_rate": 9.97983870967742e-06, + "loss": 1.2359, + "step": 2970 + }, + { + "epoch": 0.3, + "grad_norm": 70.72793777002627, + "learning_rate": 9.996639784946237e-06, + "loss": 1.3251, + "step": 2975 + }, + { + "epoch": 0.3, + "grad_norm": 57.16942627102183, + "learning_rate": 9.999999449441523e-06, + "loss": 1.3049, + "step": 2980 + }, + { + "epoch": 0.3, + "grad_norm": 94.53740449854857, + "learning_rate": 9.999997212797916e-06, + "loss": 1.2459, + "step": 2985 + }, + { + "epoch": 0.3, + "grad_norm": 111.95977469538862, + "learning_rate": 9.999993255660043e-06, + "loss": 1.3163, + "step": 2990 + }, + { + "epoch": 0.3, + "grad_norm": 15.031804726431162, + "learning_rate": 9.999987578029268e-06, + "loss": 1.3024, + "step": 2995 + }, + { + "epoch": 0.3, + "grad_norm": 71.96322221490063, + "learning_rate": 9.999980179907541e-06, + "loss": 1.299, + "step": 3000 + }, + { + "epoch": 0.3, + "grad_norm": 51.906863459767855, + "learning_rate": 9.999971061297411e-06, + "loss": 1.3545, + "step": 3005 + }, + { + "epoch": 0.3, + "grad_norm": 41.68798761739742, + "learning_rate": 9.999960222202014e-06, + "loss": 1.2937, + "step": 3010 + }, + { + "epoch": 0.3, + "grad_norm": 46.196693267941605, + "learning_rate": 9.99994766262508e-06, + "loss": 1.291, + "step": 3015 + }, + { + "epoch": 0.3, + "grad_norm": 83.24411008845641, + "learning_rate": 9.99993338257093e-06, + "loss": 1.3184, + "step": 3020 + }, + { + "epoch": 0.3, + "grad_norm": 79.47916536696067, + "learning_rate": 9.999917382044479e-06, + "loss": 1.3597, + "step": 3025 + }, + { + "epoch": 0.31, + "grad_norm": 47.364807117651814, + "learning_rate": 9.999899661051232e-06, + "loss": 1.2771, + "step": 3030 + }, + { + "epoch": 0.31, + "grad_norm": 8.949929395733275, + "learning_rate": 9.999880219597287e-06, + "loss": 1.2681, + "step": 3035 + }, + { + "epoch": 0.31, + "grad_norm": 36.05513683071444, + "learning_rate": 9.999859057689336e-06, + "loss": 1.2577, + "step": 3040 + }, + { + "epoch": 0.31, + "grad_norm": 49.1723693913278, + "learning_rate": 9.999836175334657e-06, + "loss": 1.2624, + "step": 3045 + }, + { + "epoch": 0.31, + "grad_norm": 19.818863595628407, + "learning_rate": 9.999811572541125e-06, + "loss": 1.252, + "step": 3050 + }, + { + "epoch": 0.31, + "grad_norm": 65.3095406563796, + "learning_rate": 9.999785249317207e-06, + "loss": 1.2392, + "step": 3055 + }, + { + "epoch": 0.31, + "grad_norm": 33.44323503436429, + "learning_rate": 9.999757205671959e-06, + "loss": 1.2445, + "step": 3060 + }, + { + "epoch": 0.31, + "grad_norm": 54.9536579150963, + "learning_rate": 9.999727441615032e-06, + "loss": 1.3022, + "step": 3065 + }, + { + "epoch": 0.31, + "grad_norm": 21.625118717845428, + "learning_rate": 9.99969595715667e-06, + "loss": 1.2172, + "step": 3070 + }, + { + "epoch": 0.31, + "grad_norm": 53.04058016138585, + "learning_rate": 9.999662752307702e-06, + "loss": 1.2891, + "step": 3075 + }, + { + "epoch": 0.31, + "grad_norm": 41.0790648049123, + "learning_rate": 9.999627827079557e-06, + "loss": 1.2606, + "step": 3080 + }, + { + "epoch": 0.31, + "grad_norm": 12.949311683171679, + "learning_rate": 9.999591181484251e-06, + "loss": 1.2433, + "step": 3085 + }, + { + "epoch": 0.31, + "grad_norm": 13.606680721081727, + "learning_rate": 9.999552815534396e-06, + "loss": 1.2767, + "step": 3090 + }, + { + "epoch": 0.31, + "grad_norm": 23.548287110125504, + "learning_rate": 9.999512729243191e-06, + "loss": 1.2329, + "step": 3095 + }, + { + "epoch": 0.31, + "grad_norm": 11.659213956234366, + "learning_rate": 9.999470922624432e-06, + "loss": 1.2118, + "step": 3100 + }, + { + "epoch": 0.31, + "grad_norm": 11.109026198379695, + "learning_rate": 9.999427395692502e-06, + "loss": 1.271, + "step": 3105 + }, + { + "epoch": 0.31, + "grad_norm": 16.205675455931562, + "learning_rate": 9.999382148462382e-06, + "loss": 1.2228, + "step": 3110 + }, + { + "epoch": 0.31, + "grad_norm": 13.84615719190485, + "learning_rate": 9.99933518094964e-06, + "loss": 1.2587, + "step": 3115 + }, + { + "epoch": 0.31, + "grad_norm": 12.473676379339645, + "learning_rate": 9.999286493170435e-06, + "loss": 1.2854, + "step": 3120 + }, + { + "epoch": 0.32, + "grad_norm": 9.029530584732344, + "learning_rate": 9.999236085141523e-06, + "loss": 1.2422, + "step": 3125 + }, + { + "epoch": 0.32, + "grad_norm": 8.574237532934099, + "learning_rate": 9.999183956880252e-06, + "loss": 1.1645, + "step": 3130 + }, + { + "epoch": 0.32, + "grad_norm": 9.224399893537099, + "learning_rate": 9.999130108404553e-06, + "loss": 1.2224, + "step": 3135 + }, + { + "epoch": 0.32, + "grad_norm": 8.540594477353714, + "learning_rate": 9.99907453973296e-06, + "loss": 1.238, + "step": 3140 + }, + { + "epoch": 0.32, + "grad_norm": 25.11320923926726, + "learning_rate": 9.999017250884591e-06, + "loss": 1.2602, + "step": 3145 + }, + { + "epoch": 0.32, + "grad_norm": 14.434342173799088, + "learning_rate": 9.998958241879163e-06, + "loss": 1.2327, + "step": 3150 + }, + { + "epoch": 0.32, + "grad_norm": 23.92510726176057, + "learning_rate": 9.998897512736977e-06, + "loss": 1.2771, + "step": 3155 + }, + { + "epoch": 0.32, + "grad_norm": 11.27586016590783, + "learning_rate": 9.998835063478928e-06, + "loss": 1.2186, + "step": 3160 + }, + { + "epoch": 0.32, + "grad_norm": 17.206208499134267, + "learning_rate": 9.998770894126513e-06, + "loss": 1.244, + "step": 3165 + }, + { + "epoch": 0.32, + "grad_norm": 9.071587368592734, + "learning_rate": 9.998705004701805e-06, + "loss": 1.2746, + "step": 3170 + }, + { + "epoch": 0.32, + "grad_norm": 8.851985458784839, + "learning_rate": 9.998637395227481e-06, + "loss": 1.2293, + "step": 3175 + }, + { + "epoch": 0.32, + "grad_norm": 11.125235158488461, + "learning_rate": 9.998568065726804e-06, + "loss": 1.2395, + "step": 3180 + }, + { + "epoch": 0.32, + "grad_norm": 14.459385418108235, + "learning_rate": 9.998497016223628e-06, + "loss": 1.2507, + "step": 3185 + }, + { + "epoch": 0.32, + "grad_norm": 8.723762738270437, + "learning_rate": 9.998424246742403e-06, + "loss": 1.2461, + "step": 3190 + }, + { + "epoch": 0.32, + "grad_norm": 9.560316300456964, + "learning_rate": 9.99834975730817e-06, + "loss": 1.1793, + "step": 3195 + }, + { + "epoch": 0.32, + "grad_norm": 17.330824182062663, + "learning_rate": 9.998273547946557e-06, + "loss": 1.2471, + "step": 3200 + }, + { + "epoch": 0.32, + "grad_norm": 8.256225012715516, + "learning_rate": 9.998195618683793e-06, + "loss": 1.2439, + "step": 3205 + }, + { + "epoch": 0.32, + "grad_norm": 13.292485899881653, + "learning_rate": 9.99811596954669e-06, + "loss": 1.1854, + "step": 3210 + }, + { + "epoch": 0.32, + "grad_norm": 8.509099993425679, + "learning_rate": 9.998034600562654e-06, + "loss": 1.2467, + "step": 3215 + }, + { + "epoch": 0.32, + "grad_norm": 12.822666642474209, + "learning_rate": 9.997951511759686e-06, + "loss": 1.321, + "step": 3220 + }, + { + "epoch": 0.33, + "grad_norm": 8.367649827222804, + "learning_rate": 9.997866703166376e-06, + "loss": 1.2055, + "step": 3225 + }, + { + "epoch": 0.33, + "grad_norm": 7.915246360335449, + "learning_rate": 9.997780174811908e-06, + "loss": 1.2468, + "step": 3230 + }, + { + "epoch": 0.33, + "grad_norm": 7.3235819296260605, + "learning_rate": 9.997691926726055e-06, + "loss": 1.2301, + "step": 3235 + }, + { + "epoch": 0.33, + "grad_norm": 7.881549658254508, + "learning_rate": 9.997601958939185e-06, + "loss": 1.2476, + "step": 3240 + }, + { + "epoch": 0.33, + "grad_norm": 7.774689870813962, + "learning_rate": 9.997510271482252e-06, + "loss": 1.2587, + "step": 3245 + }, + { + "epoch": 0.33, + "grad_norm": 23.119656881210187, + "learning_rate": 9.997416864386808e-06, + "loss": 1.2944, + "step": 3250 + }, + { + "epoch": 0.33, + "grad_norm": 44.43248294796607, + "learning_rate": 9.997321737684994e-06, + "loss": 1.2861, + "step": 3255 + }, + { + "epoch": 0.33, + "grad_norm": 47.999482127159396, + "learning_rate": 9.997224891409542e-06, + "loss": 1.2578, + "step": 3260 + }, + { + "epoch": 0.33, + "grad_norm": 70.59257075033224, + "learning_rate": 9.99712632559378e-06, + "loss": 1.2717, + "step": 3265 + }, + { + "epoch": 0.33, + "grad_norm": 23.71641327422162, + "learning_rate": 9.997026040271623e-06, + "loss": 1.2947, + "step": 3270 + }, + { + "epoch": 0.33, + "grad_norm": 22.5621081905156, + "learning_rate": 9.996924035477575e-06, + "loss": 1.2861, + "step": 3275 + }, + { + "epoch": 0.33, + "grad_norm": 11.337136579692809, + "learning_rate": 9.996820311246741e-06, + "loss": 1.2452, + "step": 3280 + }, + { + "epoch": 0.33, + "grad_norm": 38.92682377208474, + "learning_rate": 9.99671486761481e-06, + "loss": 1.227, + "step": 3285 + }, + { + "epoch": 0.33, + "grad_norm": 39.12070470753764, + "learning_rate": 9.996607704618067e-06, + "loss": 1.2175, + "step": 3290 + }, + { + "epoch": 0.33, + "grad_norm": 55.06863641189487, + "learning_rate": 9.996498822293383e-06, + "loss": 1.261, + "step": 3295 + }, + { + "epoch": 0.33, + "grad_norm": 53.5917064046568, + "learning_rate": 9.996388220678226e-06, + "loss": 1.267, + "step": 3300 + }, + { + "epoch": 0.33, + "grad_norm": 22.7001638237988, + "learning_rate": 9.996275899810657e-06, + "loss": 1.2679, + "step": 3305 + }, + { + "epoch": 0.33, + "grad_norm": 26.240695732587145, + "learning_rate": 9.996161859729322e-06, + "loss": 1.2276, + "step": 3310 + }, + { + "epoch": 0.33, + "grad_norm": 28.864109624837717, + "learning_rate": 9.99604610047346e-06, + "loss": 1.2622, + "step": 3315 + }, + { + "epoch": 0.33, + "grad_norm": 14.181745192592954, + "learning_rate": 9.99592862208291e-06, + "loss": 1.2353, + "step": 3320 + }, + { + "epoch": 0.34, + "grad_norm": 31.135738640073715, + "learning_rate": 9.99580942459809e-06, + "loss": 1.2259, + "step": 3325 + }, + { + "epoch": 0.34, + "grad_norm": 14.694094175623361, + "learning_rate": 9.995688508060022e-06, + "loss": 1.2638, + "step": 3330 + }, + { + "epoch": 0.34, + "grad_norm": 14.336896391555797, + "learning_rate": 9.995565872510306e-06, + "loss": 1.2791, + "step": 3335 + }, + { + "epoch": 0.34, + "grad_norm": 67.85777726137141, + "learning_rate": 9.995441517991149e-06, + "loss": 1.2656, + "step": 3340 + }, + { + "epoch": 0.34, + "grad_norm": 49.402079926275604, + "learning_rate": 9.995315444545332e-06, + "loss": 1.2348, + "step": 3345 + }, + { + "epoch": 0.34, + "grad_norm": 104.59190696680739, + "learning_rate": 9.995187652216245e-06, + "loss": 1.3082, + "step": 3350 + }, + { + "epoch": 0.34, + "grad_norm": 48.01718012292942, + "learning_rate": 9.995058141047857e-06, + "loss": 1.2673, + "step": 3355 + }, + { + "epoch": 0.34, + "grad_norm": 33.64789850910811, + "learning_rate": 9.994926911084734e-06, + "loss": 1.2607, + "step": 3360 + }, + { + "epoch": 0.34, + "grad_norm": 31.926457677825184, + "learning_rate": 9.994793962372032e-06, + "loss": 1.3047, + "step": 3365 + }, + { + "epoch": 0.34, + "grad_norm": 36.43634702069511, + "learning_rate": 9.994659294955496e-06, + "loss": 1.2635, + "step": 3370 + }, + { + "epoch": 0.34, + "grad_norm": 22.716948822292796, + "learning_rate": 9.994522908881468e-06, + "loss": 1.2718, + "step": 3375 + }, + { + "epoch": 0.34, + "grad_norm": 15.93812477910786, + "learning_rate": 9.994384804196877e-06, + "loss": 1.2071, + "step": 3380 + }, + { + "epoch": 0.34, + "grad_norm": 7.4307138152681285, + "learning_rate": 9.994244980949246e-06, + "loss": 1.2283, + "step": 3385 + }, + { + "epoch": 0.34, + "grad_norm": 26.716814730179262, + "learning_rate": 9.994103439186686e-06, + "loss": 1.2311, + "step": 3390 + }, + { + "epoch": 0.34, + "grad_norm": 13.57535376565661, + "learning_rate": 9.993960178957904e-06, + "loss": 1.2416, + "step": 3395 + }, + { + "epoch": 0.34, + "grad_norm": 9.489410995507601, + "learning_rate": 9.993815200312194e-06, + "loss": 1.2401, + "step": 3400 + }, + { + "epoch": 0.34, + "grad_norm": 8.963757789326618, + "learning_rate": 9.993668503299443e-06, + "loss": 1.2223, + "step": 3405 + }, + { + "epoch": 0.34, + "grad_norm": 11.731516475386249, + "learning_rate": 9.993520087970128e-06, + "loss": 1.2622, + "step": 3410 + }, + { + "epoch": 0.34, + "grad_norm": 15.199217760337424, + "learning_rate": 9.993369954375322e-06, + "loss": 1.2187, + "step": 3415 + }, + { + "epoch": 0.34, + "grad_norm": 20.61144183377057, + "learning_rate": 9.993218102566683e-06, + "loss": 1.2648, + "step": 3420 + }, + { + "epoch": 0.35, + "grad_norm": 7.007496284136545, + "learning_rate": 9.993064532596464e-06, + "loss": 1.2809, + "step": 3425 + }, + { + "epoch": 0.35, + "grad_norm": 8.051987396263192, + "learning_rate": 9.992909244517507e-06, + "loss": 1.2683, + "step": 3430 + }, + { + "epoch": 0.35, + "grad_norm": 6.8353867234768675, + "learning_rate": 9.99275223838325e-06, + "loss": 1.2712, + "step": 3435 + }, + { + "epoch": 0.35, + "grad_norm": 10.769209591212741, + "learning_rate": 9.992593514247716e-06, + "loss": 1.2244, + "step": 3440 + }, + { + "epoch": 0.35, + "grad_norm": 9.723137752045876, + "learning_rate": 9.992433072165521e-06, + "loss": 1.2055, + "step": 3445 + }, + { + "epoch": 0.35, + "grad_norm": 7.580879719381715, + "learning_rate": 9.992270912191875e-06, + "loss": 1.214, + "step": 3450 + }, + { + "epoch": 0.35, + "grad_norm": 8.348028440545775, + "learning_rate": 9.992107034382576e-06, + "loss": 1.2512, + "step": 3455 + }, + { + "epoch": 0.35, + "grad_norm": 9.119279921782843, + "learning_rate": 9.991941438794016e-06, + "loss": 1.2072, + "step": 3460 + }, + { + "epoch": 0.35, + "grad_norm": 11.524058585005298, + "learning_rate": 9.991774125483173e-06, + "loss": 1.2263, + "step": 3465 + }, + { + "epoch": 0.35, + "grad_norm": 9.444283873220677, + "learning_rate": 9.991605094507621e-06, + "loss": 1.2625, + "step": 3470 + }, + { + "epoch": 0.35, + "grad_norm": 7.837669847558407, + "learning_rate": 9.991434345925525e-06, + "loss": 1.2565, + "step": 3475 + }, + { + "epoch": 0.35, + "grad_norm": 27.594282860281425, + "learning_rate": 9.991261879795637e-06, + "loss": 1.2006, + "step": 3480 + }, + { + "epoch": 0.35, + "grad_norm": 9.791241576434466, + "learning_rate": 9.991087696177304e-06, + "loss": 1.2963, + "step": 3485 + }, + { + "epoch": 0.35, + "grad_norm": 30.203839734682212, + "learning_rate": 9.990911795130461e-06, + "loss": 1.2561, + "step": 3490 + }, + { + "epoch": 0.35, + "grad_norm": 98.23200362278807, + "learning_rate": 9.990734176715635e-06, + "loss": 1.2461, + "step": 3495 + }, + { + "epoch": 0.35, + "grad_norm": 27.271840312622874, + "learning_rate": 9.990554840993948e-06, + "loss": 1.3156, + "step": 3500 + }, + { + "epoch": 0.35, + "grad_norm": 61.93216058953879, + "learning_rate": 9.990373788027105e-06, + "loss": 1.3108, + "step": 3505 + }, + { + "epoch": 0.35, + "grad_norm": 15.847753788646035, + "learning_rate": 9.990191017877408e-06, + "loss": 1.2783, + "step": 3510 + }, + { + "epoch": 0.35, + "grad_norm": 16.410406057080326, + "learning_rate": 9.99000653060775e-06, + "loss": 1.2319, + "step": 3515 + }, + { + "epoch": 0.35, + "grad_norm": 17.31825759414051, + "learning_rate": 9.989820326281608e-06, + "loss": 1.2944, + "step": 3520 + }, + { + "epoch": 0.36, + "grad_norm": 24.686023979520524, + "learning_rate": 9.989632404963058e-06, + "loss": 1.2311, + "step": 3525 + }, + { + "epoch": 0.36, + "grad_norm": 11.056066379950447, + "learning_rate": 9.989442766716766e-06, + "loss": 1.2642, + "step": 3530 + }, + { + "epoch": 0.36, + "grad_norm": 8.193503192165913, + "learning_rate": 9.98925141160798e-06, + "loss": 1.229, + "step": 3535 + }, + { + "epoch": 0.36, + "grad_norm": 12.672413486305313, + "learning_rate": 9.98905833970255e-06, + "loss": 1.2523, + "step": 3540 + }, + { + "epoch": 0.36, + "grad_norm": 9.013438018268829, + "learning_rate": 9.98886355106691e-06, + "loss": 1.2177, + "step": 3545 + }, + { + "epoch": 0.36, + "grad_norm": 9.739984602942702, + "learning_rate": 9.988667045768088e-06, + "loss": 1.3048, + "step": 3550 + }, + { + "epoch": 0.36, + "grad_norm": 14.313811608656064, + "learning_rate": 9.988468823873701e-06, + "loss": 1.2297, + "step": 3555 + }, + { + "epoch": 0.36, + "grad_norm": 13.080105167095407, + "learning_rate": 9.988268885451954e-06, + "loss": 1.2716, + "step": 3560 + }, + { + "epoch": 0.36, + "grad_norm": 12.052177241976224, + "learning_rate": 9.988067230571648e-06, + "loss": 1.2084, + "step": 3565 + }, + { + "epoch": 0.36, + "grad_norm": 43.26819159375231, + "learning_rate": 9.987863859302174e-06, + "loss": 1.2811, + "step": 3570 + }, + { + "epoch": 0.36, + "grad_norm": 31.906852111691876, + "learning_rate": 9.987658771713508e-06, + "loss": 1.2278, + "step": 3575 + }, + { + "epoch": 0.36, + "grad_norm": 32.0926758251754, + "learning_rate": 9.987451967876222e-06, + "loss": 1.1654, + "step": 3580 + }, + { + "epoch": 0.36, + "grad_norm": 12.778055781974132, + "learning_rate": 9.987243447861479e-06, + "loss": 1.2623, + "step": 3585 + }, + { + "epoch": 0.36, + "grad_norm": 26.73272187671943, + "learning_rate": 9.987033211741028e-06, + "loss": 1.1936, + "step": 3590 + }, + { + "epoch": 0.36, + "grad_norm": 23.712272553679398, + "learning_rate": 9.986821259587214e-06, + "loss": 1.2293, + "step": 3595 + }, + { + "epoch": 0.36, + "grad_norm": 25.72299647520127, + "learning_rate": 9.986607591472966e-06, + "loss": 1.2311, + "step": 3600 + }, + { + "epoch": 0.36, + "grad_norm": 13.615736190580805, + "learning_rate": 9.98639220747181e-06, + "loss": 1.3012, + "step": 3605 + }, + { + "epoch": 0.36, + "grad_norm": 17.95268313491177, + "learning_rate": 9.986175107657855e-06, + "loss": 1.2166, + "step": 3610 + }, + { + "epoch": 0.36, + "grad_norm": 19.109081494313262, + "learning_rate": 9.985956292105809e-06, + "loss": 1.21, + "step": 3615 + }, + { + "epoch": 0.36, + "grad_norm": 10.46050026278034, + "learning_rate": 9.985735760890966e-06, + "loss": 1.2424, + "step": 3620 + }, + { + "epoch": 0.37, + "grad_norm": 9.548313419301463, + "learning_rate": 9.985513514089209e-06, + "loss": 1.2031, + "step": 3625 + }, + { + "epoch": 0.37, + "grad_norm": 11.367090000488474, + "learning_rate": 9.985289551777014e-06, + "loss": 1.2292, + "step": 3630 + }, + { + "epoch": 0.37, + "grad_norm": 11.359290128056417, + "learning_rate": 9.985063874031444e-06, + "loss": 1.2018, + "step": 3635 + }, + { + "epoch": 0.37, + "grad_norm": 25.69776468028513, + "learning_rate": 9.984836480930157e-06, + "loss": 1.246, + "step": 3640 + }, + { + "epoch": 0.37, + "grad_norm": 26.738006219526483, + "learning_rate": 9.9846073725514e-06, + "loss": 1.232, + "step": 3645 + }, + { + "epoch": 0.37, + "grad_norm": 27.98023302821429, + "learning_rate": 9.984376548974005e-06, + "loss": 1.1696, + "step": 3650 + }, + { + "epoch": 0.37, + "grad_norm": 14.861073552576952, + "learning_rate": 9.984144010277402e-06, + "loss": 1.2144, + "step": 3655 + }, + { + "epoch": 0.37, + "grad_norm": 7.373906157122731, + "learning_rate": 9.983909756541603e-06, + "loss": 1.2201, + "step": 3660 + }, + { + "epoch": 0.37, + "grad_norm": 14.88970548441138, + "learning_rate": 9.983673787847218e-06, + "loss": 1.2754, + "step": 3665 + }, + { + "epoch": 0.37, + "grad_norm": 11.941159431577093, + "learning_rate": 9.983436104275443e-06, + "loss": 1.1831, + "step": 3670 + }, + { + "epoch": 0.37, + "grad_norm": 10.114563134319768, + "learning_rate": 9.983196705908063e-06, + "loss": 1.3016, + "step": 3675 + }, + { + "epoch": 0.37, + "grad_norm": 7.319121830353334, + "learning_rate": 9.982955592827456e-06, + "loss": 1.1637, + "step": 3680 + }, + { + "epoch": 0.37, + "grad_norm": 15.652938083479674, + "learning_rate": 9.98271276511659e-06, + "loss": 1.2296, + "step": 3685 + }, + { + "epoch": 0.37, + "grad_norm": 16.455205422152094, + "learning_rate": 9.982468222859021e-06, + "loss": 1.2656, + "step": 3690 + }, + { + "epoch": 0.37, + "grad_norm": 13.94661713597918, + "learning_rate": 9.982221966138895e-06, + "loss": 1.2311, + "step": 3695 + }, + { + "epoch": 0.37, + "grad_norm": 12.960873556702204, + "learning_rate": 9.981973995040948e-06, + "loss": 1.2198, + "step": 3700 + }, + { + "epoch": 0.37, + "grad_norm": 22.09218480259108, + "learning_rate": 9.981724309650507e-06, + "loss": 1.323, + "step": 3705 + }, + { + "epoch": 0.37, + "grad_norm": 8.0080144446023, + "learning_rate": 9.981472910053492e-06, + "loss": 1.2343, + "step": 3710 + }, + { + "epoch": 0.37, + "grad_norm": 21.145227700843645, + "learning_rate": 9.981219796336403e-06, + "loss": 1.2497, + "step": 3715 + }, + { + "epoch": 0.38, + "grad_norm": 19.79162580341351, + "learning_rate": 9.980964968586342e-06, + "loss": 1.2426, + "step": 3720 + }, + { + "epoch": 0.38, + "grad_norm": 9.497505379032013, + "learning_rate": 9.980708426890993e-06, + "loss": 1.1945, + "step": 3725 + }, + { + "epoch": 0.38, + "grad_norm": 10.618953280914997, + "learning_rate": 9.98045017133863e-06, + "loss": 1.2648, + "step": 3730 + }, + { + "epoch": 0.38, + "grad_norm": 18.349019252093978, + "learning_rate": 9.980190202018121e-06, + "loss": 1.2245, + "step": 3735 + }, + { + "epoch": 0.38, + "grad_norm": 12.344019064583094, + "learning_rate": 9.979928519018919e-06, + "loss": 1.2064, + "step": 3740 + }, + { + "epoch": 0.38, + "grad_norm": 14.945729051308529, + "learning_rate": 9.97966512243107e-06, + "loss": 1.1961, + "step": 3745 + }, + { + "epoch": 0.38, + "grad_norm": 9.531189055193323, + "learning_rate": 9.97940001234521e-06, + "loss": 1.2092, + "step": 3750 + }, + { + "epoch": 0.38, + "grad_norm": 23.213046385730106, + "learning_rate": 9.97913318885256e-06, + "loss": 1.2197, + "step": 3755 + }, + { + "epoch": 0.38, + "grad_norm": 32.725060083495215, + "learning_rate": 9.978864652044936e-06, + "loss": 1.2902, + "step": 3760 + }, + { + "epoch": 0.38, + "grad_norm": 43.98473480275498, + "learning_rate": 9.97859440201474e-06, + "loss": 1.2233, + "step": 3765 + }, + { + "epoch": 0.38, + "grad_norm": 42.96571294466646, + "learning_rate": 9.978322438854966e-06, + "loss": 1.2434, + "step": 3770 + }, + { + "epoch": 0.38, + "grad_norm": 67.18105533450216, + "learning_rate": 9.978048762659195e-06, + "loss": 1.2261, + "step": 3775 + }, + { + "epoch": 0.38, + "grad_norm": 6.791734699201156, + "learning_rate": 9.977773373521601e-06, + "loss": 1.2142, + "step": 3780 + }, + { + "epoch": 0.38, + "grad_norm": 10.897995141075134, + "learning_rate": 9.977496271536943e-06, + "loss": 1.2249, + "step": 3785 + }, + { + "epoch": 0.38, + "grad_norm": 14.497407187298952, + "learning_rate": 9.977217456800572e-06, + "loss": 1.2123, + "step": 3790 + }, + { + "epoch": 0.38, + "grad_norm": 7.736211949879045, + "learning_rate": 9.976936929408427e-06, + "loss": 1.2156, + "step": 3795 + }, + { + "epoch": 0.38, + "grad_norm": 18.303746441739822, + "learning_rate": 9.976654689457038e-06, + "loss": 1.2863, + "step": 3800 + }, + { + "epoch": 0.38, + "grad_norm": 9.829497062940318, + "learning_rate": 9.976370737043525e-06, + "loss": 1.2618, + "step": 3805 + }, + { + "epoch": 0.38, + "grad_norm": 25.263446092793245, + "learning_rate": 9.976085072265593e-06, + "loss": 1.2526, + "step": 3810 + }, + { + "epoch": 0.38, + "grad_norm": 35.3377923043493, + "learning_rate": 9.975797695221542e-06, + "loss": 1.2453, + "step": 3815 + }, + { + "epoch": 0.39, + "grad_norm": 17.464297585102695, + "learning_rate": 9.975508606010254e-06, + "loss": 1.2531, + "step": 3820 + }, + { + "epoch": 0.39, + "grad_norm": 23.692828808280154, + "learning_rate": 9.975217804731208e-06, + "loss": 1.2235, + "step": 3825 + }, + { + "epoch": 0.39, + "grad_norm": 11.668925213714026, + "learning_rate": 9.974925291484468e-06, + "loss": 1.2151, + "step": 3830 + }, + { + "epoch": 0.39, + "grad_norm": 12.866655427075113, + "learning_rate": 9.974631066370685e-06, + "loss": 1.2726, + "step": 3835 + }, + { + "epoch": 0.39, + "grad_norm": 25.841814770895773, + "learning_rate": 9.974335129491107e-06, + "loss": 1.27, + "step": 3840 + }, + { + "epoch": 0.39, + "grad_norm": 23.100541426504538, + "learning_rate": 9.974037480947558e-06, + "loss": 1.2535, + "step": 3845 + }, + { + "epoch": 0.39, + "grad_norm": 64.29530761193791, + "learning_rate": 9.973738120842465e-06, + "loss": 1.2538, + "step": 3850 + }, + { + "epoch": 0.39, + "grad_norm": 41.529956070053096, + "learning_rate": 9.973437049278833e-06, + "loss": 1.2561, + "step": 3855 + }, + { + "epoch": 0.39, + "grad_norm": 9.71277426315383, + "learning_rate": 9.973134266360265e-06, + "loss": 1.2695, + "step": 3860 + }, + { + "epoch": 0.39, + "grad_norm": 8.312591268093923, + "learning_rate": 9.972829772190944e-06, + "loss": 1.236, + "step": 3865 + }, + { + "epoch": 0.39, + "grad_norm": 7.681689247726292, + "learning_rate": 9.972523566875649e-06, + "loss": 1.2536, + "step": 3870 + }, + { + "epoch": 0.39, + "grad_norm": 26.235793994940313, + "learning_rate": 9.972215650519743e-06, + "loss": 1.3028, + "step": 3875 + }, + { + "epoch": 0.39, + "grad_norm": 15.132056462810064, + "learning_rate": 9.97190602322918e-06, + "loss": 1.1961, + "step": 3880 + }, + { + "epoch": 0.39, + "grad_norm": 32.44650553471024, + "learning_rate": 9.971594685110507e-06, + "loss": 1.3053, + "step": 3885 + }, + { + "epoch": 0.39, + "grad_norm": 16.110785173854836, + "learning_rate": 9.971281636270847e-06, + "loss": 1.2593, + "step": 3890 + }, + { + "epoch": 0.39, + "grad_norm": 10.444171464337078, + "learning_rate": 9.970966876817928e-06, + "loss": 1.2561, + "step": 3895 + }, + { + "epoch": 0.39, + "grad_norm": 25.17838705462199, + "learning_rate": 9.970650406860051e-06, + "loss": 1.2029, + "step": 3900 + }, + { + "epoch": 0.39, + "grad_norm": 7.115984906641157, + "learning_rate": 9.970332226506118e-06, + "loss": 1.2493, + "step": 3905 + }, + { + "epoch": 0.39, + "grad_norm": 19.70047927862016, + "learning_rate": 9.970012335865612e-06, + "loss": 1.1869, + "step": 3910 + }, + { + "epoch": 0.39, + "grad_norm": 18.038800651626424, + "learning_rate": 9.96969073504861e-06, + "loss": 1.2424, + "step": 3915 + }, + { + "epoch": 0.4, + "grad_norm": 8.15861746144962, + "learning_rate": 9.96936742416577e-06, + "loss": 1.1838, + "step": 3920 + }, + { + "epoch": 0.4, + "grad_norm": 12.484113761097328, + "learning_rate": 9.969042403328348e-06, + "loss": 1.2171, + "step": 3925 + }, + { + "epoch": 0.4, + "grad_norm": 8.110356553423042, + "learning_rate": 9.96871567264818e-06, + "loss": 1.2518, + "step": 3930 + }, + { + "epoch": 0.4, + "grad_norm": 17.73363097364326, + "learning_rate": 9.968387232237695e-06, + "loss": 1.2789, + "step": 3935 + }, + { + "epoch": 0.4, + "grad_norm": 27.686523594105953, + "learning_rate": 9.968057082209909e-06, + "loss": 1.244, + "step": 3940 + }, + { + "epoch": 0.4, + "grad_norm": 11.820108672992458, + "learning_rate": 9.967725222678426e-06, + "loss": 1.2913, + "step": 3945 + }, + { + "epoch": 0.4, + "grad_norm": 19.67650247073177, + "learning_rate": 9.967391653757438e-06, + "loss": 1.2102, + "step": 3950 + }, + { + "epoch": 0.4, + "grad_norm": 18.73171602233947, + "learning_rate": 9.967056375561726e-06, + "loss": 1.1982, + "step": 3955 + }, + { + "epoch": 0.4, + "grad_norm": 24.675528714132533, + "learning_rate": 9.966719388206661e-06, + "loss": 1.2475, + "step": 3960 + }, + { + "epoch": 0.4, + "grad_norm": 45.25930029609521, + "learning_rate": 9.966380691808197e-06, + "loss": 1.2594, + "step": 3965 + }, + { + "epoch": 0.4, + "grad_norm": 87.67463131497611, + "learning_rate": 9.96604028648288e-06, + "loss": 1.2446, + "step": 3970 + }, + { + "epoch": 0.4, + "grad_norm": 7.474598522395306, + "learning_rate": 9.965698172347843e-06, + "loss": 1.2846, + "step": 3975 + }, + { + "epoch": 0.4, + "grad_norm": 56.77411416314017, + "learning_rate": 9.96535434952081e-06, + "loss": 1.252, + "step": 3980 + }, + { + "epoch": 0.4, + "grad_norm": 46.8997659190866, + "learning_rate": 9.965008818120088e-06, + "loss": 1.2369, + "step": 3985 + }, + { + "epoch": 0.4, + "grad_norm": 27.77528218311658, + "learning_rate": 9.964661578264571e-06, + "loss": 1.2456, + "step": 3990 + }, + { + "epoch": 0.4, + "grad_norm": 18.546895946518553, + "learning_rate": 9.964312630073749e-06, + "loss": 1.2298, + "step": 3995 + }, + { + "epoch": 0.4, + "grad_norm": 9.279947297055188, + "learning_rate": 9.963961973667691e-06, + "loss": 1.1957, + "step": 4000 + }, + { + "epoch": 0.4, + "grad_norm": 51.66062157374361, + "learning_rate": 9.96360960916706e-06, + "loss": 1.2408, + "step": 4005 + }, + { + "epoch": 0.4, + "grad_norm": 38.003334883771586, + "learning_rate": 9.963255536693103e-06, + "loss": 1.2566, + "step": 4010 + }, + { + "epoch": 0.4, + "grad_norm": 36.49216818644186, + "learning_rate": 9.962899756367657e-06, + "loss": 1.2388, + "step": 4015 + }, + { + "epoch": 0.41, + "grad_norm": 37.691359368487895, + "learning_rate": 9.962542268313144e-06, + "loss": 1.2522, + "step": 4020 + }, + { + "epoch": 0.41, + "grad_norm": 13.338664665110976, + "learning_rate": 9.962183072652577e-06, + "loss": 1.2367, + "step": 4025 + }, + { + "epoch": 0.41, + "grad_norm": 9.087845748300886, + "learning_rate": 9.961822169509552e-06, + "loss": 1.2158, + "step": 4030 + }, + { + "epoch": 0.41, + "grad_norm": 38.570589682996335, + "learning_rate": 9.96145955900826e-06, + "loss": 1.2555, + "step": 4035 + }, + { + "epoch": 0.41, + "grad_norm": 27.693672064187652, + "learning_rate": 9.96109524127347e-06, + "loss": 1.206, + "step": 4040 + }, + { + "epoch": 0.41, + "grad_norm": 21.133340119549086, + "learning_rate": 9.960729216430549e-06, + "loss": 1.269, + "step": 4045 + }, + { + "epoch": 0.41, + "grad_norm": 47.127690639583996, + "learning_rate": 9.96036148460544e-06, + "loss": 1.255, + "step": 4050 + }, + { + "epoch": 0.41, + "grad_norm": 13.301073562064612, + "learning_rate": 9.959992045924682e-06, + "loss": 1.2739, + "step": 4055 + }, + { + "epoch": 0.41, + "grad_norm": 11.609836658516967, + "learning_rate": 9.9596209005154e-06, + "loss": 1.3207, + "step": 4060 + }, + { + "epoch": 0.41, + "grad_norm": 33.995550626776534, + "learning_rate": 9.959248048505304e-06, + "loss": 1.2085, + "step": 4065 + }, + { + "epoch": 0.41, + "grad_norm": 59.88588971982497, + "learning_rate": 9.958873490022688e-06, + "loss": 1.2102, + "step": 4070 + }, + { + "epoch": 0.41, + "grad_norm": 30.261967435183816, + "learning_rate": 9.958497225196441e-06, + "loss": 1.2009, + "step": 4075 + }, + { + "epoch": 0.41, + "grad_norm": 20.236600833942322, + "learning_rate": 9.958119254156036e-06, + "loss": 1.2641, + "step": 4080 + }, + { + "epoch": 0.41, + "grad_norm": 9.097576830169164, + "learning_rate": 9.95773957703153e-06, + "loss": 1.2568, + "step": 4085 + }, + { + "epoch": 0.41, + "grad_norm": 28.322236699356623, + "learning_rate": 9.957358193953573e-06, + "loss": 1.2436, + "step": 4090 + }, + { + "epoch": 0.41, + "grad_norm": 12.21633921097602, + "learning_rate": 9.956975105053395e-06, + "loss": 1.2931, + "step": 4095 + }, + { + "epoch": 0.41, + "grad_norm": 10.400387277003981, + "learning_rate": 9.956590310462817e-06, + "loss": 1.2489, + "step": 4100 + }, + { + "epoch": 0.41, + "grad_norm": 8.919282255738858, + "learning_rate": 9.956203810314248e-06, + "loss": 1.2455, + "step": 4105 + }, + { + "epoch": 0.41, + "grad_norm": 11.852815217381016, + "learning_rate": 9.955815604740682e-06, + "loss": 1.2237, + "step": 4110 + }, + { + "epoch": 0.41, + "grad_norm": 9.280350390136551, + "learning_rate": 9.955425693875699e-06, + "loss": 1.2531, + "step": 4115 + }, + { + "epoch": 0.42, + "grad_norm": 7.206007829007667, + "learning_rate": 9.955034077853466e-06, + "loss": 1.1968, + "step": 4120 + }, + { + "epoch": 0.42, + "grad_norm": 8.565179622370264, + "learning_rate": 9.954640756808743e-06, + "loss": 1.2529, + "step": 4125 + }, + { + "epoch": 0.42, + "grad_norm": 17.6317617587089, + "learning_rate": 9.954245730876866e-06, + "loss": 1.2129, + "step": 4130 + }, + { + "epoch": 0.42, + "grad_norm": 18.47417695536017, + "learning_rate": 9.953849000193764e-06, + "loss": 1.233, + "step": 4135 + }, + { + "epoch": 0.42, + "grad_norm": 23.358928325643685, + "learning_rate": 9.953450564895955e-06, + "loss": 1.2425, + "step": 4140 + }, + { + "epoch": 0.42, + "grad_norm": 38.85271406815992, + "learning_rate": 9.953050425120537e-06, + "loss": 1.2653, + "step": 4145 + }, + { + "epoch": 0.42, + "grad_norm": 26.5549459705775, + "learning_rate": 9.952648581005196e-06, + "loss": 1.2167, + "step": 4150 + }, + { + "epoch": 0.42, + "grad_norm": 7.9806651207861155, + "learning_rate": 9.95224503268821e-06, + "loss": 1.2159, + "step": 4155 + }, + { + "epoch": 0.42, + "grad_norm": 9.204088379198158, + "learning_rate": 9.951839780308439e-06, + "loss": 1.1693, + "step": 4160 + }, + { + "epoch": 0.42, + "grad_norm": 34.843347127473734, + "learning_rate": 9.951432824005328e-06, + "loss": 1.2514, + "step": 4165 + }, + { + "epoch": 0.42, + "grad_norm": 11.431225825049113, + "learning_rate": 9.951024163918913e-06, + "loss": 1.2578, + "step": 4170 + }, + { + "epoch": 0.42, + "grad_norm": 11.273926092048018, + "learning_rate": 9.95061380018981e-06, + "loss": 1.1902, + "step": 4175 + }, + { + "epoch": 0.42, + "grad_norm": 10.601147365831144, + "learning_rate": 9.950201732959228e-06, + "loss": 1.2412, + "step": 4180 + }, + { + "epoch": 0.42, + "grad_norm": 39.646460393382455, + "learning_rate": 9.949787962368957e-06, + "loss": 1.2466, + "step": 4185 + }, + { + "epoch": 0.42, + "grad_norm": 23.64410488897537, + "learning_rate": 9.949372488561377e-06, + "loss": 1.1957, + "step": 4190 + }, + { + "epoch": 0.42, + "grad_norm": 10.449811055571177, + "learning_rate": 9.94895531167945e-06, + "loss": 1.228, + "step": 4195 + }, + { + "epoch": 0.42, + "grad_norm": 11.967857762143987, + "learning_rate": 9.948536431866726e-06, + "loss": 1.2249, + "step": 4200 + }, + { + "epoch": 0.42, + "grad_norm": 16.977410140447304, + "learning_rate": 9.948115849267344e-06, + "loss": 1.2819, + "step": 4205 + }, + { + "epoch": 0.42, + "grad_norm": 13.49390147390754, + "learning_rate": 9.947693564026025e-06, + "loss": 1.252, + "step": 4210 + }, + { + "epoch": 0.42, + "grad_norm": 14.579440419526714, + "learning_rate": 9.947269576288074e-06, + "loss": 1.2551, + "step": 4215 + }, + { + "epoch": 0.43, + "grad_norm": 23.073349590631008, + "learning_rate": 9.946843886199387e-06, + "loss": 1.2799, + "step": 4220 + }, + { + "epoch": 0.43, + "grad_norm": 33.080214220661595, + "learning_rate": 9.946416493906445e-06, + "loss": 1.219, + "step": 4225 + }, + { + "epoch": 0.43, + "grad_norm": 17.340182315915886, + "learning_rate": 9.94598739955631e-06, + "loss": 1.2418, + "step": 4230 + }, + { + "epoch": 0.43, + "grad_norm": 7.732259768036213, + "learning_rate": 9.945556603296636e-06, + "loss": 1.2616, + "step": 4235 + }, + { + "epoch": 0.43, + "grad_norm": 12.220849733614314, + "learning_rate": 9.945124105275658e-06, + "loss": 1.2264, + "step": 4240 + }, + { + "epoch": 0.43, + "grad_norm": 7.415288492518843, + "learning_rate": 9.9446899056422e-06, + "loss": 1.2292, + "step": 4245 + }, + { + "epoch": 0.43, + "grad_norm": 11.51497427087494, + "learning_rate": 9.944254004545666e-06, + "loss": 1.2158, + "step": 4250 + }, + { + "epoch": 0.43, + "grad_norm": 45.84960290897499, + "learning_rate": 9.943816402136053e-06, + "loss": 1.185, + "step": 4255 + }, + { + "epoch": 0.43, + "grad_norm": 51.5481831854867, + "learning_rate": 9.943377098563936e-06, + "loss": 1.2144, + "step": 4260 + }, + { + "epoch": 0.43, + "grad_norm": 26.0885083725824, + "learning_rate": 9.942936093980482e-06, + "loss": 1.2406, + "step": 4265 + }, + { + "epoch": 0.43, + "grad_norm": 52.86487745850491, + "learning_rate": 9.94249338853744e-06, + "loss": 1.2343, + "step": 4270 + }, + { + "epoch": 0.43, + "grad_norm": 32.085728312012584, + "learning_rate": 9.942048982387142e-06, + "loss": 1.2485, + "step": 4275 + }, + { + "epoch": 0.43, + "grad_norm": 10.147507968619474, + "learning_rate": 9.94160287568251e-06, + "loss": 1.2447, + "step": 4280 + }, + { + "epoch": 0.43, + "grad_norm": 11.746530392136881, + "learning_rate": 9.941155068577049e-06, + "loss": 1.2367, + "step": 4285 + }, + { + "epoch": 0.43, + "grad_norm": 38.852565860595654, + "learning_rate": 9.940705561224847e-06, + "loss": 1.2278, + "step": 4290 + }, + { + "epoch": 0.43, + "grad_norm": 51.92516409644444, + "learning_rate": 9.940254353780581e-06, + "loss": 1.2651, + "step": 4295 + }, + { + "epoch": 0.43, + "grad_norm": 43.46149206212037, + "learning_rate": 9.939801446399511e-06, + "loss": 1.2728, + "step": 4300 + }, + { + "epoch": 0.43, + "grad_norm": 26.44387192644677, + "learning_rate": 9.93934683923748e-06, + "loss": 1.1896, + "step": 4305 + }, + { + "epoch": 0.43, + "grad_norm": 37.746036203691254, + "learning_rate": 9.93889053245092e-06, + "loss": 1.2588, + "step": 4310 + }, + { + "epoch": 0.44, + "grad_norm": 12.520173237581492, + "learning_rate": 9.938432526196844e-06, + "loss": 1.1869, + "step": 4315 + }, + { + "epoch": 0.44, + "grad_norm": 10.467561168011299, + "learning_rate": 9.937972820632854e-06, + "loss": 1.2904, + "step": 4320 + }, + { + "epoch": 0.44, + "grad_norm": 18.609641849366643, + "learning_rate": 9.937511415917132e-06, + "loss": 1.2455, + "step": 4325 + }, + { + "epoch": 0.44, + "grad_norm": 14.00022249781794, + "learning_rate": 9.937048312208448e-06, + "loss": 1.2635, + "step": 4330 + }, + { + "epoch": 0.44, + "grad_norm": 19.548844065392103, + "learning_rate": 9.936583509666154e-06, + "loss": 1.2521, + "step": 4335 + }, + { + "epoch": 0.44, + "grad_norm": 10.740443268777065, + "learning_rate": 9.93611700845019e-06, + "loss": 1.214, + "step": 4340 + }, + { + "epoch": 0.44, + "grad_norm": 15.354451049628311, + "learning_rate": 9.93564880872108e-06, + "loss": 1.2474, + "step": 4345 + }, + { + "epoch": 0.44, + "grad_norm": 13.354353870738024, + "learning_rate": 9.935178910639927e-06, + "loss": 1.2202, + "step": 4350 + }, + { + "epoch": 0.44, + "grad_norm": 11.461666361813926, + "learning_rate": 9.934707314368424e-06, + "loss": 1.1891, + "step": 4355 + }, + { + "epoch": 0.44, + "grad_norm": 26.26375051175625, + "learning_rate": 9.934234020068847e-06, + "loss": 1.2721, + "step": 4360 + }, + { + "epoch": 0.44, + "grad_norm": 19.66123236538432, + "learning_rate": 9.933759027904058e-06, + "loss": 1.1961, + "step": 4365 + }, + { + "epoch": 0.44, + "grad_norm": 9.849872593058937, + "learning_rate": 9.9332823380375e-06, + "loss": 1.191, + "step": 4370 + }, + { + "epoch": 0.44, + "grad_norm": 10.52585365597777, + "learning_rate": 9.932803950633199e-06, + "loss": 1.2106, + "step": 4375 + }, + { + "epoch": 0.44, + "grad_norm": 23.505146587479103, + "learning_rate": 9.93232386585577e-06, + "loss": 1.2883, + "step": 4380 + }, + { + "epoch": 0.44, + "grad_norm": 20.704178651242195, + "learning_rate": 9.931842083870413e-06, + "loss": 1.2569, + "step": 4385 + }, + { + "epoch": 0.44, + "grad_norm": 21.662070835592704, + "learning_rate": 9.931358604842902e-06, + "loss": 1.2289, + "step": 4390 + }, + { + "epoch": 0.44, + "grad_norm": 13.059661247937026, + "learning_rate": 9.930873428939607e-06, + "loss": 1.2457, + "step": 4395 + }, + { + "epoch": 0.44, + "grad_norm": 12.51351005431564, + "learning_rate": 9.930386556327475e-06, + "loss": 1.2639, + "step": 4400 + }, + { + "epoch": 0.44, + "grad_norm": 25.232301350217114, + "learning_rate": 9.929897987174038e-06, + "loss": 1.1957, + "step": 4405 + }, + { + "epoch": 0.44, + "grad_norm": 15.350420590665204, + "learning_rate": 9.92940772164741e-06, + "loss": 1.2223, + "step": 4410 + }, + { + "epoch": 0.45, + "grad_norm": 11.311765119921033, + "learning_rate": 9.928915759916295e-06, + "loss": 1.2363, + "step": 4415 + }, + { + "epoch": 0.45, + "grad_norm": 8.127480756243877, + "learning_rate": 9.928422102149974e-06, + "loss": 1.2429, + "step": 4420 + }, + { + "epoch": 0.45, + "grad_norm": 7.699872791778428, + "learning_rate": 9.927926748518316e-06, + "loss": 1.2488, + "step": 4425 + }, + { + "epoch": 0.45, + "grad_norm": 11.510205983564731, + "learning_rate": 9.927429699191768e-06, + "loss": 1.2004, + "step": 4430 + }, + { + "epoch": 0.45, + "grad_norm": 31.498092704192338, + "learning_rate": 9.926930954341368e-06, + "loss": 1.2013, + "step": 4435 + }, + { + "epoch": 0.45, + "grad_norm": 23.41388595460819, + "learning_rate": 9.926430514138734e-06, + "loss": 1.2054, + "step": 4440 + }, + { + "epoch": 0.45, + "grad_norm": 33.179614627446334, + "learning_rate": 9.925928378756064e-06, + "loss": 1.2418, + "step": 4445 + }, + { + "epoch": 0.45, + "grad_norm": 15.854169811315563, + "learning_rate": 9.925424548366142e-06, + "loss": 1.2694, + "step": 4450 + }, + { + "epoch": 0.45, + "grad_norm": 22.130365073256336, + "learning_rate": 9.92491902314234e-06, + "loss": 1.2199, + "step": 4455 + }, + { + "epoch": 0.45, + "grad_norm": 26.12787060881701, + "learning_rate": 9.924411803258604e-06, + "loss": 1.2743, + "step": 4460 + }, + { + "epoch": 0.45, + "grad_norm": 44.41777719215755, + "learning_rate": 9.923902888889472e-06, + "loss": 1.2235, + "step": 4465 + }, + { + "epoch": 0.45, + "grad_norm": 68.4105816881735, + "learning_rate": 9.923392280210056e-06, + "loss": 1.2475, + "step": 4470 + }, + { + "epoch": 0.45, + "grad_norm": 65.17654692737699, + "learning_rate": 9.922879977396062e-06, + "loss": 1.2378, + "step": 4475 + }, + { + "epoch": 0.45, + "grad_norm": 15.007706083704576, + "learning_rate": 9.922365980623768e-06, + "loss": 1.3025, + "step": 4480 + }, + { + "epoch": 0.45, + "grad_norm": 33.37928601370577, + "learning_rate": 9.92185029007004e-06, + "loss": 1.2351, + "step": 4485 + }, + { + "epoch": 0.45, + "grad_norm": 49.204195129373645, + "learning_rate": 9.92133290591233e-06, + "loss": 1.2301, + "step": 4490 + }, + { + "epoch": 0.45, + "grad_norm": 20.865406533566258, + "learning_rate": 9.920813828328668e-06, + "loss": 1.2425, + "step": 4495 + }, + { + "epoch": 0.45, + "grad_norm": 9.484545945850263, + "learning_rate": 9.920293057497665e-06, + "loss": 1.2036, + "step": 4500 + }, + { + "epoch": 0.45, + "grad_norm": 10.112272089623518, + "learning_rate": 9.919770593598523e-06, + "loss": 1.1751, + "step": 4505 + }, + { + "epoch": 0.45, + "grad_norm": 16.091124986461764, + "learning_rate": 9.919246436811017e-06, + "loss": 1.1866, + "step": 4510 + }, + { + "epoch": 0.46, + "grad_norm": 10.14013552151064, + "learning_rate": 9.918720587315512e-06, + "loss": 1.2246, + "step": 4515 + }, + { + "epoch": 0.46, + "grad_norm": 15.0306005576408, + "learning_rate": 9.918193045292949e-06, + "loss": 1.2514, + "step": 4520 + }, + { + "epoch": 0.46, + "grad_norm": 8.245379854317187, + "learning_rate": 9.917663810924858e-06, + "loss": 1.1635, + "step": 4525 + }, + { + "epoch": 0.46, + "grad_norm": 20.92733969341995, + "learning_rate": 9.917132884393346e-06, + "loss": 1.2316, + "step": 4530 + }, + { + "epoch": 0.46, + "grad_norm": 8.617987539724332, + "learning_rate": 9.916600265881104e-06, + "loss": 1.2483, + "step": 4535 + }, + { + "epoch": 0.46, + "grad_norm": 7.6733268929963545, + "learning_rate": 9.916065955571408e-06, + "loss": 1.2179, + "step": 4540 + }, + { + "epoch": 0.46, + "grad_norm": 15.242155444345359, + "learning_rate": 9.915529953648111e-06, + "loss": 1.2364, + "step": 4545 + }, + { + "epoch": 0.46, + "grad_norm": 7.546232213228472, + "learning_rate": 9.914992260295653e-06, + "loss": 1.2006, + "step": 4550 + }, + { + "epoch": 0.46, + "grad_norm": 8.412672944813984, + "learning_rate": 9.914452875699053e-06, + "loss": 1.2184, + "step": 4555 + }, + { + "epoch": 0.46, + "grad_norm": 9.453619543099387, + "learning_rate": 9.91391180004391e-06, + "loss": 1.2225, + "step": 4560 + }, + { + "epoch": 0.46, + "grad_norm": 15.707388871143154, + "learning_rate": 9.913369033516412e-06, + "loss": 1.2034, + "step": 4565 + }, + { + "epoch": 0.46, + "grad_norm": 16.46315709092553, + "learning_rate": 9.912824576303321e-06, + "loss": 1.2387, + "step": 4570 + }, + { + "epoch": 0.46, + "grad_norm": 10.270494960714121, + "learning_rate": 9.912278428591986e-06, + "loss": 1.2646, + "step": 4575 + }, + { + "epoch": 0.46, + "grad_norm": 11.417488967738182, + "learning_rate": 9.911730590570335e-06, + "loss": 1.2333, + "step": 4580 + }, + { + "epoch": 0.46, + "grad_norm": 28.781749944510707, + "learning_rate": 9.91118106242688e-06, + "loss": 1.2128, + "step": 4585 + }, + { + "epoch": 0.46, + "grad_norm": 33.17270676630123, + "learning_rate": 9.910629844350711e-06, + "loss": 1.2591, + "step": 4590 + }, + { + "epoch": 0.46, + "grad_norm": 27.21735457760835, + "learning_rate": 9.910076936531503e-06, + "loss": 1.1947, + "step": 4595 + }, + { + "epoch": 0.46, + "grad_norm": 44.91419841319632, + "learning_rate": 9.909522339159511e-06, + "loss": 1.2802, + "step": 4600 + }, + { + "epoch": 0.46, + "grad_norm": 21.381704300692164, + "learning_rate": 9.908966052425573e-06, + "loss": 1.2412, + "step": 4605 + }, + { + "epoch": 0.46, + "grad_norm": 133.4976979325375, + "learning_rate": 9.908408076521104e-06, + "loss": 1.2974, + "step": 4610 + }, + { + "epoch": 0.47, + "grad_norm": 104.40804477999271, + "learning_rate": 9.907848411638102e-06, + "loss": 1.2562, + "step": 4615 + }, + { + "epoch": 0.47, + "grad_norm": 43.1718486242428, + "learning_rate": 9.90728705796915e-06, + "loss": 1.2858, + "step": 4620 + }, + { + "epoch": 0.47, + "grad_norm": 10.430405281503546, + "learning_rate": 9.90672401570741e-06, + "loss": 1.2349, + "step": 4625 + }, + { + "epoch": 0.47, + "grad_norm": 28.408859457680723, + "learning_rate": 9.906159285046622e-06, + "loss": 1.2663, + "step": 4630 + }, + { + "epoch": 0.47, + "grad_norm": 35.80208086924776, + "learning_rate": 9.905592866181108e-06, + "loss": 1.2238, + "step": 4635 + }, + { + "epoch": 0.47, + "grad_norm": 55.64747717171744, + "learning_rate": 9.905024759305777e-06, + "loss": 1.2932, + "step": 4640 + }, + { + "epoch": 0.47, + "grad_norm": 25.653632126635838, + "learning_rate": 9.904454964616108e-06, + "loss": 1.2248, + "step": 4645 + }, + { + "epoch": 0.47, + "grad_norm": 89.7506396373891, + "learning_rate": 9.903883482308172e-06, + "loss": 1.2288, + "step": 4650 + }, + { + "epoch": 0.47, + "grad_norm": 84.92599413538284, + "learning_rate": 9.903310312578613e-06, + "loss": 1.2751, + "step": 4655 + }, + { + "epoch": 0.47, + "grad_norm": 40.33396768812129, + "learning_rate": 9.902735455624658e-06, + "loss": 1.2513, + "step": 4660 + }, + { + "epoch": 0.47, + "grad_norm": 44.693246410015014, + "learning_rate": 9.902158911644117e-06, + "loss": 1.222, + "step": 4665 + }, + { + "epoch": 0.47, + "grad_norm": 36.40728587256452, + "learning_rate": 9.901580680835377e-06, + "loss": 1.247, + "step": 4670 + }, + { + "epoch": 0.47, + "grad_norm": 11.767226773863197, + "learning_rate": 9.901000763397404e-06, + "loss": 1.2207, + "step": 4675 + }, + { + "epoch": 0.47, + "grad_norm": 33.84826992392133, + "learning_rate": 9.900419159529751e-06, + "loss": 1.2387, + "step": 4680 + }, + { + "epoch": 0.47, + "grad_norm": 21.084937111619066, + "learning_rate": 9.899835869432544e-06, + "loss": 1.2599, + "step": 4685 + }, + { + "epoch": 0.47, + "grad_norm": 13.903378794691731, + "learning_rate": 9.899250893306496e-06, + "loss": 1.2094, + "step": 4690 + }, + { + "epoch": 0.47, + "grad_norm": 12.651747205568551, + "learning_rate": 9.898664231352894e-06, + "loss": 1.278, + "step": 4695 + }, + { + "epoch": 0.47, + "grad_norm": 18.91091257408942, + "learning_rate": 9.898075883773609e-06, + "loss": 1.2486, + "step": 4700 + }, + { + "epoch": 0.47, + "grad_norm": 15.666073931133331, + "learning_rate": 9.897485850771092e-06, + "loss": 1.2789, + "step": 4705 + }, + { + "epoch": 0.47, + "grad_norm": 20.233336917533183, + "learning_rate": 9.89689413254837e-06, + "loss": 1.2588, + "step": 4710 + }, + { + "epoch": 0.48, + "grad_norm": 48.842048881189655, + "learning_rate": 9.896300729309051e-06, + "loss": 1.1803, + "step": 4715 + }, + { + "epoch": 0.48, + "grad_norm": 48.32400165414345, + "learning_rate": 9.895705641257332e-06, + "loss": 1.2072, + "step": 4720 + }, + { + "epoch": 0.48, + "grad_norm": 15.576826950538285, + "learning_rate": 9.895108868597976e-06, + "loss": 1.2672, + "step": 4725 + }, + { + "epoch": 0.48, + "grad_norm": 57.056727168245324, + "learning_rate": 9.894510411536335e-06, + "loss": 1.2345, + "step": 4730 + }, + { + "epoch": 0.48, + "grad_norm": 45.82864349568885, + "learning_rate": 9.893910270278335e-06, + "loss": 1.2594, + "step": 4735 + }, + { + "epoch": 0.48, + "grad_norm": 53.472664284359006, + "learning_rate": 9.893308445030485e-06, + "loss": 1.2658, + "step": 4740 + }, + { + "epoch": 0.48, + "grad_norm": 51.09707449624854, + "learning_rate": 9.892704935999872e-06, + "loss": 1.2381, + "step": 4745 + }, + { + "epoch": 0.48, + "grad_norm": 52.06497551634407, + "learning_rate": 9.892099743394165e-06, + "loss": 1.2305, + "step": 4750 + }, + { + "epoch": 0.48, + "grad_norm": 19.917155144998816, + "learning_rate": 9.89149286742161e-06, + "loss": 1.248, + "step": 4755 + }, + { + "epoch": 0.48, + "grad_norm": 33.93858162026787, + "learning_rate": 9.89088430829103e-06, + "loss": 1.1996, + "step": 4760 + }, + { + "epoch": 0.48, + "grad_norm": 24.056306625196044, + "learning_rate": 9.890274066211828e-06, + "loss": 1.2323, + "step": 4765 + }, + { + "epoch": 0.48, + "grad_norm": 36.916877403006154, + "learning_rate": 9.889662141393994e-06, + "loss": 1.2677, + "step": 4770 + }, + { + "epoch": 0.48, + "grad_norm": 49.26865152084359, + "learning_rate": 9.889048534048088e-06, + "loss": 1.2934, + "step": 4775 + }, + { + "epoch": 0.48, + "grad_norm": 36.128655809311724, + "learning_rate": 9.888433244385248e-06, + "loss": 1.2324, + "step": 4780 + }, + { + "epoch": 0.48, + "grad_norm": 33.15784894308101, + "learning_rate": 9.8878162726172e-06, + "loss": 1.2033, + "step": 4785 + }, + { + "epoch": 0.48, + "grad_norm": 9.0821031794577, + "learning_rate": 9.887197618956239e-06, + "loss": 1.2938, + "step": 4790 + }, + { + "epoch": 0.48, + "grad_norm": 18.63229833800865, + "learning_rate": 9.886577283615247e-06, + "loss": 1.2084, + "step": 4795 + }, + { + "epoch": 0.48, + "grad_norm": 34.363919504880776, + "learning_rate": 9.885955266807677e-06, + "loss": 1.2019, + "step": 4800 + }, + { + "epoch": 0.48, + "grad_norm": 23.32077401063675, + "learning_rate": 9.885331568747569e-06, + "loss": 1.2544, + "step": 4805 + }, + { + "epoch": 0.48, + "grad_norm": 27.802670446738542, + "learning_rate": 9.884706189649532e-06, + "loss": 1.1967, + "step": 4810 + }, + { + "epoch": 0.49, + "grad_norm": 8.71809098143742, + "learning_rate": 9.88407912972876e-06, + "loss": 1.2215, + "step": 4815 + }, + { + "epoch": 0.49, + "grad_norm": 14.621038663696202, + "learning_rate": 9.883450389201025e-06, + "loss": 1.2621, + "step": 4820 + }, + { + "epoch": 0.49, + "grad_norm": 34.66887135697982, + "learning_rate": 9.882819968282678e-06, + "loss": 1.2593, + "step": 4825 + }, + { + "epoch": 0.49, + "grad_norm": 18.02489044655485, + "learning_rate": 9.88218786719064e-06, + "loss": 1.2477, + "step": 4830 + }, + { + "epoch": 0.49, + "grad_norm": 11.21532524157301, + "learning_rate": 9.881554086142422e-06, + "loss": 1.278, + "step": 4835 + }, + { + "epoch": 0.49, + "grad_norm": 39.71229395992522, + "learning_rate": 9.880918625356103e-06, + "loss": 1.2432, + "step": 4840 + }, + { + "epoch": 0.49, + "grad_norm": 38.8748764564693, + "learning_rate": 9.88028148505035e-06, + "loss": 1.24, + "step": 4845 + }, + { + "epoch": 0.49, + "grad_norm": 19.839325112997184, + "learning_rate": 9.879642665444397e-06, + "loss": 1.2117, + "step": 4850 + }, + { + "epoch": 0.49, + "grad_norm": 79.07900220844354, + "learning_rate": 9.879002166758065e-06, + "loss": 1.2215, + "step": 4855 + }, + { + "epoch": 0.49, + "grad_norm": 23.67166694685957, + "learning_rate": 9.878359989211747e-06, + "loss": 1.2823, + "step": 4860 + }, + { + "epoch": 0.49, + "grad_norm": 11.273654423274305, + "learning_rate": 9.877716133026415e-06, + "loss": 1.3155, + "step": 4865 + }, + { + "epoch": 0.49, + "grad_norm": 8.417875140663172, + "learning_rate": 9.87707059842362e-06, + "loss": 1.187, + "step": 4870 + }, + { + "epoch": 0.49, + "grad_norm": 9.901533696869453, + "learning_rate": 9.876423385625492e-06, + "loss": 1.2881, + "step": 4875 + }, + { + "epoch": 0.49, + "grad_norm": 8.667402310523359, + "learning_rate": 9.875774494854734e-06, + "loss": 1.263, + "step": 4880 + }, + { + "epoch": 0.49, + "grad_norm": 8.67718694052718, + "learning_rate": 9.875123926334629e-06, + "loss": 1.2246, + "step": 4885 + }, + { + "epoch": 0.49, + "grad_norm": 18.615445503902517, + "learning_rate": 9.874471680289037e-06, + "loss": 1.2496, + "step": 4890 + }, + { + "epoch": 0.49, + "grad_norm": 10.202333722795672, + "learning_rate": 9.873817756942396e-06, + "loss": 1.2405, + "step": 4895 + }, + { + "epoch": 0.49, + "grad_norm": 9.941122551177786, + "learning_rate": 9.873162156519718e-06, + "loss": 1.2547, + "step": 4900 + }, + { + "epoch": 0.49, + "grad_norm": 8.333362690140278, + "learning_rate": 9.872504879246598e-06, + "loss": 1.257, + "step": 4905 + }, + { + "epoch": 0.5, + "grad_norm": 22.836006708807865, + "learning_rate": 9.871845925349204e-06, + "loss": 1.2134, + "step": 4910 + }, + { + "epoch": 0.5, + "grad_norm": 29.536939423968438, + "learning_rate": 9.871185295054278e-06, + "loss": 1.217, + "step": 4915 + }, + { + "epoch": 0.5, + "grad_norm": 25.05820903087772, + "learning_rate": 9.870522988589146e-06, + "loss": 1.2439, + "step": 4920 + }, + { + "epoch": 0.5, + "grad_norm": 23.98113010207636, + "learning_rate": 9.869859006181705e-06, + "loss": 1.1887, + "step": 4925 + }, + { + "epoch": 0.5, + "grad_norm": 23.53745644035191, + "learning_rate": 9.86919334806043e-06, + "loss": 1.233, + "step": 4930 + }, + { + "epoch": 0.5, + "grad_norm": 15.311372888712697, + "learning_rate": 9.868526014454377e-06, + "loss": 1.2674, + "step": 4935 + }, + { + "epoch": 0.5, + "grad_norm": 12.363781317584587, + "learning_rate": 9.867857005593172e-06, + "loss": 1.2111, + "step": 4940 + }, + { + "epoch": 0.5, + "grad_norm": 12.179319863792337, + "learning_rate": 9.86718632170702e-06, + "loss": 1.2123, + "step": 4945 + }, + { + "epoch": 0.5, + "grad_norm": 6.505244786041763, + "learning_rate": 9.866513963026703e-06, + "loss": 1.2404, + "step": 4950 + }, + { + "epoch": 0.5, + "grad_norm": 6.854134017392649, + "learning_rate": 9.865839929783578e-06, + "loss": 1.2601, + "step": 4955 + }, + { + "epoch": 0.5, + "grad_norm": 9.191892520579293, + "learning_rate": 9.865164222209583e-06, + "loss": 1.2635, + "step": 4960 + }, + { + "epoch": 0.5, + "grad_norm": 6.751273585672897, + "learning_rate": 9.864486840537225e-06, + "loss": 1.1951, + "step": 4965 + }, + { + "epoch": 0.5, + "grad_norm": 44.06145173052812, + "learning_rate": 9.863807784999591e-06, + "loss": 1.2202, + "step": 4970 + }, + { + "epoch": 0.5, + "grad_norm": 11.736756270544875, + "learning_rate": 9.863127055830343e-06, + "loss": 1.2326, + "step": 4975 + }, + { + "epoch": 0.5, + "grad_norm": 11.145462317727578, + "learning_rate": 9.862444653263723e-06, + "loss": 1.2715, + "step": 4980 + }, + { + "epoch": 0.5, + "grad_norm": 14.699736433964649, + "learning_rate": 9.861760577534538e-06, + "loss": 1.2768, + "step": 4985 + }, + { + "epoch": 0.5, + "grad_norm": 8.020798950138495, + "learning_rate": 9.861074828878184e-06, + "loss": 1.254, + "step": 4990 + }, + { + "epoch": 0.5, + "grad_norm": 20.66731748752754, + "learning_rate": 9.860387407530625e-06, + "loss": 1.2259, + "step": 4995 + }, + { + "epoch": 0.5, + "grad_norm": 9.659673761001347, + "learning_rate": 9.859698313728399e-06, + "loss": 1.2653, + "step": 5000 + }, + { + "epoch": 0.5, + "grad_norm": 10.695606595819575, + "learning_rate": 9.859007547708626e-06, + "loss": 1.2337, + "step": 5005 + }, + { + "epoch": 0.51, + "grad_norm": 11.328135911399114, + "learning_rate": 9.858315109708998e-06, + "loss": 1.2404, + "step": 5010 + }, + { + "epoch": 0.51, + "grad_norm": 15.11731390695412, + "learning_rate": 9.857620999967778e-06, + "loss": 1.2627, + "step": 5015 + }, + { + "epoch": 0.51, + "grad_norm": 20.37594952930069, + "learning_rate": 9.856925218723814e-06, + "loss": 1.2325, + "step": 5020 + }, + { + "epoch": 0.51, + "grad_norm": 23.392796017990797, + "learning_rate": 9.85622776621652e-06, + "loss": 1.2049, + "step": 5025 + }, + { + "epoch": 0.51, + "grad_norm": 46.26122007063707, + "learning_rate": 9.85552864268589e-06, + "loss": 1.2259, + "step": 5030 + }, + { + "epoch": 0.51, + "grad_norm": 14.075205946208943, + "learning_rate": 9.854827848372492e-06, + "loss": 1.2915, + "step": 5035 + }, + { + "epoch": 0.51, + "grad_norm": 8.041877762390996, + "learning_rate": 9.854125383517468e-06, + "loss": 1.2339, + "step": 5040 + }, + { + "epoch": 0.51, + "grad_norm": 18.90087269271391, + "learning_rate": 9.853421248362536e-06, + "loss": 1.2436, + "step": 5045 + }, + { + "epoch": 0.51, + "grad_norm": 10.086319639257932, + "learning_rate": 9.852715443149988e-06, + "loss": 1.2494, + "step": 5050 + }, + { + "epoch": 0.51, + "grad_norm": 14.108264527082493, + "learning_rate": 9.852007968122691e-06, + "loss": 1.213, + "step": 5055 + }, + { + "epoch": 0.51, + "grad_norm": 6.833632396667184, + "learning_rate": 9.851298823524086e-06, + "loss": 1.2191, + "step": 5060 + }, + { + "epoch": 0.51, + "grad_norm": 16.616780902137627, + "learning_rate": 9.85058800959819e-06, + "loss": 1.2347, + "step": 5065 + }, + { + "epoch": 0.51, + "grad_norm": 53.27961975081988, + "learning_rate": 9.849875526589592e-06, + "loss": 1.2327, + "step": 5070 + }, + { + "epoch": 0.51, + "grad_norm": 9.000337199179235, + "learning_rate": 9.849161374743456e-06, + "loss": 1.2462, + "step": 5075 + }, + { + "epoch": 0.51, + "grad_norm": 12.697995505646016, + "learning_rate": 9.848445554305525e-06, + "loss": 1.255, + "step": 5080 + }, + { + "epoch": 0.51, + "grad_norm": 10.537173940270334, + "learning_rate": 9.847728065522109e-06, + "loss": 1.2691, + "step": 5085 + }, + { + "epoch": 0.51, + "grad_norm": 8.881764103267859, + "learning_rate": 9.847008908640096e-06, + "loss": 1.2048, + "step": 5090 + }, + { + "epoch": 0.51, + "grad_norm": 10.444448898642385, + "learning_rate": 9.846288083906945e-06, + "loss": 1.2537, + "step": 5095 + }, + { + "epoch": 0.51, + "grad_norm": 23.459583080559593, + "learning_rate": 9.845565591570696e-06, + "loss": 1.2382, + "step": 5100 + }, + { + "epoch": 0.51, + "grad_norm": 12.53518476827716, + "learning_rate": 9.844841431879953e-06, + "loss": 1.2616, + "step": 5105 + }, + { + "epoch": 0.52, + "grad_norm": 7.742081524029113, + "learning_rate": 9.8441156050839e-06, + "loss": 1.2426, + "step": 5110 + }, + { + "epoch": 0.52, + "grad_norm": 7.190738080721399, + "learning_rate": 9.843388111432295e-06, + "loss": 1.2427, + "step": 5115 + }, + { + "epoch": 0.52, + "grad_norm": 16.59878231203815, + "learning_rate": 9.842658951175468e-06, + "loss": 1.2566, + "step": 5120 + }, + { + "epoch": 0.52, + "grad_norm": 8.645578107701345, + "learning_rate": 9.84192812456432e-06, + "loss": 1.1824, + "step": 5125 + }, + { + "epoch": 0.52, + "grad_norm": 8.559822039639393, + "learning_rate": 9.841195631850329e-06, + "loss": 1.2168, + "step": 5130 + }, + { + "epoch": 0.52, + "grad_norm": 9.02789966316085, + "learning_rate": 9.840461473285545e-06, + "loss": 1.2199, + "step": 5135 + }, + { + "epoch": 0.52, + "grad_norm": 7.593998708996688, + "learning_rate": 9.83972564912259e-06, + "loss": 1.2275, + "step": 5140 + }, + { + "epoch": 0.52, + "grad_norm": 15.418060402702537, + "learning_rate": 9.838988159614662e-06, + "loss": 1.2532, + "step": 5145 + }, + { + "epoch": 0.52, + "grad_norm": 7.902976645073543, + "learning_rate": 9.838249005015532e-06, + "loss": 1.2601, + "step": 5150 + }, + { + "epoch": 0.52, + "grad_norm": 8.003923073859536, + "learning_rate": 9.837508185579539e-06, + "loss": 1.2247, + "step": 5155 + }, + { + "epoch": 0.52, + "grad_norm": 27.50304902900661, + "learning_rate": 9.8367657015616e-06, + "loss": 1.318, + "step": 5160 + }, + { + "epoch": 0.52, + "grad_norm": 10.691673740264005, + "learning_rate": 9.836021553217204e-06, + "loss": 1.2053, + "step": 5165 + }, + { + "epoch": 0.52, + "grad_norm": 39.807525373760335, + "learning_rate": 9.835275740802407e-06, + "loss": 1.2445, + "step": 5170 + }, + { + "epoch": 0.52, + "grad_norm": 54.93907310262677, + "learning_rate": 9.834528264573848e-06, + "loss": 1.2496, + "step": 5175 + }, + { + "epoch": 0.52, + "grad_norm": 29.959308679991757, + "learning_rate": 9.833779124788732e-06, + "loss": 1.213, + "step": 5180 + }, + { + "epoch": 0.52, + "grad_norm": 30.341779174554887, + "learning_rate": 9.833028321704833e-06, + "loss": 1.241, + "step": 5185 + }, + { + "epoch": 0.52, + "grad_norm": 7.11403595375024, + "learning_rate": 9.832275855580506e-06, + "loss": 1.2206, + "step": 5190 + }, + { + "epoch": 0.52, + "grad_norm": 8.211458334912457, + "learning_rate": 9.831521726674673e-06, + "loss": 1.2358, + "step": 5195 + }, + { + "epoch": 0.52, + "grad_norm": 55.873465004227356, + "learning_rate": 9.83076593524683e-06, + "loss": 1.2454, + "step": 5200 + }, + { + "epoch": 0.52, + "grad_norm": 14.95501751361242, + "learning_rate": 9.830008481557039e-06, + "loss": 1.265, + "step": 5205 + }, + { + "epoch": 0.53, + "grad_norm": 26.831116982855633, + "learning_rate": 9.829249365865944e-06, + "loss": 1.1869, + "step": 5210 + }, + { + "epoch": 0.53, + "grad_norm": 13.272876439024357, + "learning_rate": 9.828488588434755e-06, + "loss": 1.2621, + "step": 5215 + }, + { + "epoch": 0.53, + "grad_norm": 14.744067112409168, + "learning_rate": 9.827726149525254e-06, + "loss": 1.2149, + "step": 5220 + }, + { + "epoch": 0.53, + "grad_norm": 18.587763110752242, + "learning_rate": 9.826962049399797e-06, + "loss": 1.1707, + "step": 5225 + }, + { + "epoch": 0.53, + "grad_norm": 46.80257048909383, + "learning_rate": 9.826196288321308e-06, + "loss": 1.2159, + "step": 5230 + }, + { + "epoch": 0.53, + "grad_norm": 30.462258854027745, + "learning_rate": 9.825428866553286e-06, + "loss": 1.2172, + "step": 5235 + }, + { + "epoch": 0.53, + "grad_norm": 12.872961826576143, + "learning_rate": 9.824659784359801e-06, + "loss": 1.2454, + "step": 5240 + }, + { + "epoch": 0.53, + "grad_norm": 18.107407756464607, + "learning_rate": 9.823889042005491e-06, + "loss": 1.2975, + "step": 5245 + }, + { + "epoch": 0.53, + "grad_norm": 32.772802733577855, + "learning_rate": 9.82311663975557e-06, + "loss": 1.2728, + "step": 5250 + }, + { + "epoch": 0.53, + "grad_norm": 16.778002093988857, + "learning_rate": 9.822342577875818e-06, + "loss": 1.2828, + "step": 5255 + }, + { + "epoch": 0.53, + "grad_norm": 20.179978174318997, + "learning_rate": 9.821566856632591e-06, + "loss": 1.2224, + "step": 5260 + }, + { + "epoch": 0.53, + "grad_norm": 14.651830319926578, + "learning_rate": 9.820789476292815e-06, + "loss": 1.2582, + "step": 5265 + }, + { + "epoch": 0.53, + "grad_norm": 23.417263418666796, + "learning_rate": 9.820010437123985e-06, + "loss": 1.2399, + "step": 5270 + }, + { + "epoch": 0.53, + "grad_norm": 16.117470831131186, + "learning_rate": 9.819229739394167e-06, + "loss": 1.2411, + "step": 5275 + }, + { + "epoch": 0.53, + "grad_norm": 28.815770897493955, + "learning_rate": 9.818447383371999e-06, + "loss": 1.2659, + "step": 5280 + }, + { + "epoch": 0.53, + "grad_norm": 12.109159903975375, + "learning_rate": 9.817663369326687e-06, + "loss": 1.2177, + "step": 5285 + }, + { + "epoch": 0.53, + "grad_norm": 16.63369205165027, + "learning_rate": 9.816877697528011e-06, + "loss": 1.2485, + "step": 5290 + }, + { + "epoch": 0.53, + "grad_norm": 11.89167586242028, + "learning_rate": 9.81609036824632e-06, + "loss": 1.2576, + "step": 5295 + }, + { + "epoch": 0.53, + "grad_norm": 36.62989419884296, + "learning_rate": 9.815301381752535e-06, + "loss": 1.2026, + "step": 5300 + }, + { + "epoch": 0.53, + "grad_norm": 13.802536810868858, + "learning_rate": 9.814510738318142e-06, + "loss": 1.2953, + "step": 5305 + }, + { + "epoch": 0.54, + "grad_norm": 8.092988919751438, + "learning_rate": 9.813718438215202e-06, + "loss": 1.2312, + "step": 5310 + }, + { + "epoch": 0.54, + "grad_norm": 14.4301077730165, + "learning_rate": 9.812924481716347e-06, + "loss": 1.2257, + "step": 5315 + }, + { + "epoch": 0.54, + "grad_norm": 7.0347139548726805, + "learning_rate": 9.812128869094773e-06, + "loss": 1.2851, + "step": 5320 + }, + { + "epoch": 0.54, + "grad_norm": 15.166163833725394, + "learning_rate": 9.811331600624251e-06, + "loss": 1.1624, + "step": 5325 + }, + { + "epoch": 0.54, + "grad_norm": 9.087763297173032, + "learning_rate": 9.810532676579123e-06, + "loss": 1.2585, + "step": 5330 + }, + { + "epoch": 0.54, + "grad_norm": 17.94304547276539, + "learning_rate": 9.809732097234295e-06, + "loss": 1.2302, + "step": 5335 + }, + { + "epoch": 0.54, + "grad_norm": 7.76779948855747, + "learning_rate": 9.808929862865244e-06, + "loss": 1.2467, + "step": 5340 + }, + { + "epoch": 0.54, + "grad_norm": 27.146992171033162, + "learning_rate": 9.808125973748021e-06, + "loss": 1.239, + "step": 5345 + }, + { + "epoch": 0.54, + "grad_norm": 69.52604053781388, + "learning_rate": 9.807320430159245e-06, + "loss": 1.2823, + "step": 5350 + }, + { + "epoch": 0.54, + "grad_norm": 22.185999195630405, + "learning_rate": 9.806513232376097e-06, + "loss": 1.2242, + "step": 5355 + }, + { + "epoch": 0.54, + "grad_norm": 39.44914138709512, + "learning_rate": 9.805704380676338e-06, + "loss": 1.2085, + "step": 5360 + }, + { + "epoch": 0.54, + "grad_norm": 12.255961127537361, + "learning_rate": 9.804893875338292e-06, + "loss": 1.1902, + "step": 5365 + }, + { + "epoch": 0.54, + "grad_norm": 31.670780533498856, + "learning_rate": 9.804081716640852e-06, + "loss": 1.2313, + "step": 5370 + }, + { + "epoch": 0.54, + "grad_norm": 13.131983864366022, + "learning_rate": 9.803267904863483e-06, + "loss": 1.215, + "step": 5375 + }, + { + "epoch": 0.54, + "grad_norm": 10.312170515341023, + "learning_rate": 9.802452440286215e-06, + "loss": 1.2263, + "step": 5380 + }, + { + "epoch": 0.54, + "grad_norm": 15.20883072214561, + "learning_rate": 9.801635323189648e-06, + "loss": 1.3343, + "step": 5385 + }, + { + "epoch": 0.54, + "grad_norm": 21.616483837568865, + "learning_rate": 9.800816553854952e-06, + "loss": 1.2645, + "step": 5390 + }, + { + "epoch": 0.54, + "grad_norm": 8.544994897236405, + "learning_rate": 9.799996132563867e-06, + "loss": 1.2552, + "step": 5395 + }, + { + "epoch": 0.54, + "grad_norm": 22.379944292734105, + "learning_rate": 9.799174059598697e-06, + "loss": 1.2214, + "step": 5400 + }, + { + "epoch": 0.54, + "grad_norm": 28.820684171774843, + "learning_rate": 9.798350335242318e-06, + "loss": 1.2612, + "step": 5405 + }, + { + "epoch": 0.55, + "grad_norm": 17.531294494626554, + "learning_rate": 9.797524959778169e-06, + "loss": 1.2082, + "step": 5410 + }, + { + "epoch": 0.55, + "grad_norm": 7.5919105729720595, + "learning_rate": 9.796697933490265e-06, + "loss": 1.2545, + "step": 5415 + }, + { + "epoch": 0.55, + "grad_norm": 7.756273626218609, + "learning_rate": 9.795869256663183e-06, + "loss": 1.2734, + "step": 5420 + }, + { + "epoch": 0.55, + "grad_norm": 29.31089089420677, + "learning_rate": 9.79503892958207e-06, + "loss": 1.2371, + "step": 5425 + }, + { + "epoch": 0.55, + "grad_norm": 19.237086210719905, + "learning_rate": 9.79420695253264e-06, + "loss": 1.1942, + "step": 5430 + }, + { + "epoch": 0.55, + "grad_norm": 43.10425467271122, + "learning_rate": 9.79337332580118e-06, + "loss": 1.2834, + "step": 5435 + }, + { + "epoch": 0.55, + "grad_norm": 14.207577647989709, + "learning_rate": 9.792538049674536e-06, + "loss": 1.2432, + "step": 5440 + }, + { + "epoch": 0.55, + "grad_norm": 15.049823339098804, + "learning_rate": 9.791701124440123e-06, + "loss": 1.3175, + "step": 5445 + }, + { + "epoch": 0.55, + "grad_norm": 44.32935270103052, + "learning_rate": 9.790862550385933e-06, + "loss": 1.2502, + "step": 5450 + }, + { + "epoch": 0.55, + "grad_norm": 36.82462495372997, + "learning_rate": 9.790022327800515e-06, + "loss": 1.2129, + "step": 5455 + }, + { + "epoch": 0.55, + "grad_norm": 26.76749530527562, + "learning_rate": 9.789180456972989e-06, + "loss": 1.2581, + "step": 5460 + }, + { + "epoch": 0.55, + "grad_norm": 71.981834353475, + "learning_rate": 9.788336938193041e-06, + "loss": 1.2926, + "step": 5465 + }, + { + "epoch": 0.55, + "grad_norm": 69.81416038234114, + "learning_rate": 9.787491771750925e-06, + "loss": 1.3102, + "step": 5470 + }, + { + "epoch": 0.55, + "grad_norm": 53.959547812668724, + "learning_rate": 9.786644957937466e-06, + "loss": 1.3059, + "step": 5475 + }, + { + "epoch": 0.55, + "grad_norm": 124.99102839739543, + "learning_rate": 9.785796497044047e-06, + "loss": 1.2209, + "step": 5480 + }, + { + "epoch": 0.55, + "grad_norm": 35.57188832728497, + "learning_rate": 9.784946389362624e-06, + "loss": 1.248, + "step": 5485 + }, + { + "epoch": 0.55, + "grad_norm": 98.88287778453991, + "learning_rate": 9.784094635185718e-06, + "loss": 1.3008, + "step": 5490 + }, + { + "epoch": 0.55, + "grad_norm": 120.06635252889072, + "learning_rate": 9.783241234806417e-06, + "loss": 1.2917, + "step": 5495 + }, + { + "epoch": 0.55, + "grad_norm": 47.268439703258316, + "learning_rate": 9.782386188518378e-06, + "loss": 1.3227, + "step": 5500 + }, + { + "epoch": 0.56, + "grad_norm": 16.03822977666455, + "learning_rate": 9.781529496615819e-06, + "loss": 1.2413, + "step": 5505 + }, + { + "epoch": 0.56, + "grad_norm": 17.75968374946111, + "learning_rate": 9.780671159393525e-06, + "loss": 1.2753, + "step": 5510 + }, + { + "epoch": 0.56, + "grad_norm": 12.653797529576986, + "learning_rate": 9.779811177146854e-06, + "loss": 1.1932, + "step": 5515 + }, + { + "epoch": 0.56, + "grad_norm": 21.42135697910021, + "learning_rate": 9.778949550171719e-06, + "loss": 1.2838, + "step": 5520 + }, + { + "epoch": 0.56, + "grad_norm": 19.442380055841458, + "learning_rate": 9.77808627876461e-06, + "loss": 1.2718, + "step": 5525 + }, + { + "epoch": 0.56, + "grad_norm": 25.571013974374363, + "learning_rate": 9.777221363222576e-06, + "loss": 1.2893, + "step": 5530 + }, + { + "epoch": 0.56, + "grad_norm": 11.107924256021452, + "learning_rate": 9.776354803843233e-06, + "loss": 1.223, + "step": 5535 + }, + { + "epoch": 0.56, + "grad_norm": 22.823538370422042, + "learning_rate": 9.775486600924765e-06, + "loss": 1.205, + "step": 5540 + }, + { + "epoch": 0.56, + "grad_norm": 10.079573929956554, + "learning_rate": 9.774616754765918e-06, + "loss": 1.2629, + "step": 5545 + }, + { + "epoch": 0.56, + "grad_norm": 19.370464473154634, + "learning_rate": 9.773745265666006e-06, + "loss": 1.2299, + "step": 5550 + }, + { + "epoch": 0.56, + "grad_norm": 10.160721101979744, + "learning_rate": 9.772872133924907e-06, + "loss": 1.2204, + "step": 5555 + }, + { + "epoch": 0.56, + "grad_norm": 24.96434652482283, + "learning_rate": 9.771997359843066e-06, + "loss": 1.2244, + "step": 5560 + }, + { + "epoch": 0.56, + "grad_norm": 27.007797934357004, + "learning_rate": 9.771120943721492e-06, + "loss": 1.2095, + "step": 5565 + }, + { + "epoch": 0.56, + "grad_norm": 7.687110756133366, + "learning_rate": 9.770242885861757e-06, + "loss": 1.1947, + "step": 5570 + }, + { + "epoch": 0.56, + "grad_norm": 7.129469968585911, + "learning_rate": 9.769363186566e-06, + "loss": 1.2495, + "step": 5575 + }, + { + "epoch": 0.56, + "grad_norm": 15.770422506749258, + "learning_rate": 9.76848184613693e-06, + "loss": 1.2225, + "step": 5580 + }, + { + "epoch": 0.56, + "grad_norm": 22.39907626509249, + "learning_rate": 9.767598864877808e-06, + "loss": 1.1946, + "step": 5585 + }, + { + "epoch": 0.56, + "grad_norm": 23.564102543690783, + "learning_rate": 9.76671424309247e-06, + "loss": 1.2236, + "step": 5590 + }, + { + "epoch": 0.56, + "grad_norm": 13.498061833519296, + "learning_rate": 9.765827981085314e-06, + "loss": 1.2342, + "step": 5595 + }, + { + "epoch": 0.56, + "grad_norm": 14.157141212968256, + "learning_rate": 9.764940079161302e-06, + "loss": 1.265, + "step": 5600 + }, + { + "epoch": 0.57, + "grad_norm": 11.234968885987945, + "learning_rate": 9.76405053762596e-06, + "loss": 1.2341, + "step": 5605 + }, + { + "epoch": 0.57, + "grad_norm": 10.741718694760706, + "learning_rate": 9.76315935678538e-06, + "loss": 1.1982, + "step": 5610 + }, + { + "epoch": 0.57, + "grad_norm": 8.162597237447422, + "learning_rate": 9.76226653694621e-06, + "loss": 1.2361, + "step": 5615 + }, + { + "epoch": 0.57, + "grad_norm": 21.02779574214582, + "learning_rate": 9.761372078415675e-06, + "loss": 1.2287, + "step": 5620 + }, + { + "epoch": 0.57, + "grad_norm": 18.851378631057123, + "learning_rate": 9.760475981501558e-06, + "loss": 1.2333, + "step": 5625 + }, + { + "epoch": 0.57, + "grad_norm": 9.78763285674776, + "learning_rate": 9.7595782465122e-06, + "loss": 1.2298, + "step": 5630 + }, + { + "epoch": 0.57, + "grad_norm": 7.69984737716517, + "learning_rate": 9.758678873756515e-06, + "loss": 1.2666, + "step": 5635 + }, + { + "epoch": 0.57, + "grad_norm": 11.042149705146223, + "learning_rate": 9.757777863543973e-06, + "loss": 1.2436, + "step": 5640 + }, + { + "epoch": 0.57, + "grad_norm": 8.11977096613918, + "learning_rate": 9.756875216184614e-06, + "loss": 1.2466, + "step": 5645 + }, + { + "epoch": 0.57, + "grad_norm": 9.664705978775967, + "learning_rate": 9.755970931989035e-06, + "loss": 1.2387, + "step": 5650 + }, + { + "epoch": 0.57, + "grad_norm": 13.252407580225176, + "learning_rate": 9.755065011268401e-06, + "loss": 1.2514, + "step": 5655 + }, + { + "epoch": 0.57, + "grad_norm": 11.59180227816643, + "learning_rate": 9.754157454334439e-06, + "loss": 1.2036, + "step": 5660 + }, + { + "epoch": 0.57, + "grad_norm": 12.266010042715896, + "learning_rate": 9.753248261499437e-06, + "loss": 1.2018, + "step": 5665 + }, + { + "epoch": 0.57, + "grad_norm": 19.68909605489335, + "learning_rate": 9.752337433076248e-06, + "loss": 1.2048, + "step": 5670 + }, + { + "epoch": 0.57, + "grad_norm": 44.27561117531063, + "learning_rate": 9.751424969378286e-06, + "loss": 1.2434, + "step": 5675 + }, + { + "epoch": 0.57, + "grad_norm": 15.319303141711027, + "learning_rate": 9.750510870719532e-06, + "loss": 1.1879, + "step": 5680 + }, + { + "epoch": 0.57, + "grad_norm": 25.15578221338594, + "learning_rate": 9.749595137414525e-06, + "loss": 1.2388, + "step": 5685 + }, + { + "epoch": 0.57, + "grad_norm": 14.321282031004424, + "learning_rate": 9.748677769778368e-06, + "loss": 1.2015, + "step": 5690 + }, + { + "epoch": 0.57, + "grad_norm": 27.291663057795933, + "learning_rate": 9.747758768126724e-06, + "loss": 1.2821, + "step": 5695 + }, + { + "epoch": 0.57, + "grad_norm": 8.251920040152498, + "learning_rate": 9.746838132775823e-06, + "loss": 1.1904, + "step": 5700 + }, + { + "epoch": 0.58, + "grad_norm": 8.673801034324477, + "learning_rate": 9.745915864042455e-06, + "loss": 1.212, + "step": 5705 + }, + { + "epoch": 0.58, + "grad_norm": 8.009051061546915, + "learning_rate": 9.744991962243971e-06, + "loss": 1.2011, + "step": 5710 + }, + { + "epoch": 0.58, + "grad_norm": 16.415166435895298, + "learning_rate": 9.744066427698285e-06, + "loss": 1.2115, + "step": 5715 + }, + { + "epoch": 0.58, + "grad_norm": 7.864972467275857, + "learning_rate": 9.743139260723871e-06, + "loss": 1.2226, + "step": 5720 + }, + { + "epoch": 0.58, + "grad_norm": 9.83282651933836, + "learning_rate": 9.74221046163977e-06, + "loss": 1.3015, + "step": 5725 + }, + { + "epoch": 0.58, + "grad_norm": 20.517980595445845, + "learning_rate": 9.741280030765576e-06, + "loss": 1.2031, + "step": 5730 + }, + { + "epoch": 0.58, + "grad_norm": 10.334819861708224, + "learning_rate": 9.740347968421453e-06, + "loss": 1.2512, + "step": 5735 + }, + { + "epoch": 0.58, + "grad_norm": 19.394363447308308, + "learning_rate": 9.739414274928121e-06, + "loss": 1.3025, + "step": 5740 + }, + { + "epoch": 0.58, + "grad_norm": 9.250141257436102, + "learning_rate": 9.738478950606864e-06, + "loss": 1.2673, + "step": 5745 + }, + { + "epoch": 0.58, + "grad_norm": 25.409589436587748, + "learning_rate": 9.737541995779526e-06, + "loss": 1.1938, + "step": 5750 + }, + { + "epoch": 0.58, + "grad_norm": 8.857813684638622, + "learning_rate": 9.736603410768513e-06, + "loss": 1.1694, + "step": 5755 + }, + { + "epoch": 0.58, + "grad_norm": 7.8243029109924285, + "learning_rate": 9.735663195896789e-06, + "loss": 1.1978, + "step": 5760 + }, + { + "epoch": 0.58, + "grad_norm": 7.954461871946666, + "learning_rate": 9.734721351487881e-06, + "loss": 1.2346, + "step": 5765 + }, + { + "epoch": 0.58, + "grad_norm": 6.316762828559021, + "learning_rate": 9.73377787786588e-06, + "loss": 1.2735, + "step": 5770 + }, + { + "epoch": 0.58, + "grad_norm": 13.68701531419754, + "learning_rate": 9.732832775355434e-06, + "loss": 1.2395, + "step": 5775 + }, + { + "epoch": 0.58, + "grad_norm": 21.106624359950015, + "learning_rate": 9.731886044281748e-06, + "loss": 1.2553, + "step": 5780 + }, + { + "epoch": 0.58, + "grad_norm": 26.419039922771095, + "learning_rate": 9.730937684970594e-06, + "loss": 1.1864, + "step": 5785 + }, + { + "epoch": 0.58, + "grad_norm": 14.47606213050339, + "learning_rate": 9.729987697748303e-06, + "loss": 1.2524, + "step": 5790 + }, + { + "epoch": 0.58, + "grad_norm": 11.793629060293263, + "learning_rate": 9.72903608294176e-06, + "loss": 1.2426, + "step": 5795 + }, + { + "epoch": 0.58, + "grad_norm": 7.5793849168018435, + "learning_rate": 9.72808284087842e-06, + "loss": 1.2307, + "step": 5800 + }, + { + "epoch": 0.59, + "grad_norm": 32.128228531886656, + "learning_rate": 9.727127971886289e-06, + "loss": 1.2282, + "step": 5805 + }, + { + "epoch": 0.59, + "grad_norm": 9.432836679461422, + "learning_rate": 9.726171476293937e-06, + "loss": 1.2244, + "step": 5810 + }, + { + "epoch": 0.59, + "grad_norm": 19.641558757565917, + "learning_rate": 9.725213354430496e-06, + "loss": 1.2558, + "step": 5815 + }, + { + "epoch": 0.59, + "grad_norm": 26.33319425625301, + "learning_rate": 9.724253606625651e-06, + "loss": 1.2333, + "step": 5820 + }, + { + "epoch": 0.59, + "grad_norm": 51.359947539006086, + "learning_rate": 9.723292233209653e-06, + "loss": 1.2724, + "step": 5825 + }, + { + "epoch": 0.59, + "grad_norm": 32.29157944242356, + "learning_rate": 9.72232923451331e-06, + "loss": 1.2224, + "step": 5830 + }, + { + "epoch": 0.59, + "grad_norm": 8.222712003517096, + "learning_rate": 9.721364610867988e-06, + "loss": 1.2052, + "step": 5835 + }, + { + "epoch": 0.59, + "grad_norm": 12.545359419012799, + "learning_rate": 9.72039836260561e-06, + "loss": 1.2601, + "step": 5840 + }, + { + "epoch": 0.59, + "grad_norm": 10.516808421680913, + "learning_rate": 9.719430490058666e-06, + "loss": 1.2155, + "step": 5845 + }, + { + "epoch": 0.59, + "grad_norm": 22.072674421278542, + "learning_rate": 9.718460993560197e-06, + "loss": 1.2381, + "step": 5850 + }, + { + "epoch": 0.59, + "grad_norm": 26.640128221824423, + "learning_rate": 9.717489873443807e-06, + "loss": 1.2868, + "step": 5855 + }, + { + "epoch": 0.59, + "grad_norm": 15.269708168787664, + "learning_rate": 9.716517130043658e-06, + "loss": 1.2279, + "step": 5860 + }, + { + "epoch": 0.59, + "grad_norm": 16.005739462293075, + "learning_rate": 9.715542763694469e-06, + "loss": 1.2376, + "step": 5865 + }, + { + "epoch": 0.59, + "grad_norm": 12.050689181115803, + "learning_rate": 9.714566774731518e-06, + "loss": 1.2021, + "step": 5870 + }, + { + "epoch": 0.59, + "grad_norm": 12.528858851920072, + "learning_rate": 9.713589163490645e-06, + "loss": 1.2167, + "step": 5875 + }, + { + "epoch": 0.59, + "grad_norm": 9.533475071584181, + "learning_rate": 9.71260993030824e-06, + "loss": 1.2132, + "step": 5880 + }, + { + "epoch": 0.59, + "grad_norm": 11.651551555020754, + "learning_rate": 9.71162907552126e-06, + "loss": 1.2411, + "step": 5885 + }, + { + "epoch": 0.59, + "grad_norm": 9.091995680595648, + "learning_rate": 9.710646599467215e-06, + "loss": 1.3136, + "step": 5890 + }, + { + "epoch": 0.59, + "grad_norm": 7.703057172358534, + "learning_rate": 9.709662502484177e-06, + "loss": 1.2139, + "step": 5895 + }, + { + "epoch": 0.59, + "grad_norm": 7.568655647470476, + "learning_rate": 9.708676784910767e-06, + "loss": 1.2019, + "step": 5900 + }, + { + "epoch": 0.6, + "grad_norm": 9.499941306187813, + "learning_rate": 9.707689447086174e-06, + "loss": 1.2124, + "step": 5905 + }, + { + "epoch": 0.6, + "grad_norm": 10.596113261143561, + "learning_rate": 9.706700489350137e-06, + "loss": 1.2569, + "step": 5910 + }, + { + "epoch": 0.6, + "grad_norm": 8.712435257088414, + "learning_rate": 9.705709912042959e-06, + "loss": 1.2312, + "step": 5915 + }, + { + "epoch": 0.6, + "grad_norm": 15.448047445432763, + "learning_rate": 9.704717715505494e-06, + "loss": 1.2094, + "step": 5920 + }, + { + "epoch": 0.6, + "grad_norm": 10.309107814440093, + "learning_rate": 9.703723900079156e-06, + "loss": 1.2249, + "step": 5925 + }, + { + "epoch": 0.6, + "grad_norm": 7.665381140645497, + "learning_rate": 9.702728466105918e-06, + "loss": 1.2193, + "step": 5930 + }, + { + "epoch": 0.6, + "grad_norm": 7.949250005352762, + "learning_rate": 9.701731413928305e-06, + "loss": 1.2466, + "step": 5935 + }, + { + "epoch": 0.6, + "grad_norm": 44.720747917861885, + "learning_rate": 9.700732743889402e-06, + "loss": 1.2538, + "step": 5940 + }, + { + "epoch": 0.6, + "grad_norm": 37.26154099664765, + "learning_rate": 9.699732456332855e-06, + "loss": 1.2931, + "step": 5945 + }, + { + "epoch": 0.6, + "grad_norm": 60.81777030005192, + "learning_rate": 9.698730551602857e-06, + "loss": 1.229, + "step": 5950 + }, + { + "epoch": 0.6, + "grad_norm": 32.24160414710352, + "learning_rate": 9.697727030044165e-06, + "loss": 1.2347, + "step": 5955 + }, + { + "epoch": 0.6, + "grad_norm": 7.549838623118922, + "learning_rate": 9.696721892002086e-06, + "loss": 1.2633, + "step": 5960 + }, + { + "epoch": 0.6, + "grad_norm": 8.840685661279561, + "learning_rate": 9.695715137822491e-06, + "loss": 1.2155, + "step": 5965 + }, + { + "epoch": 0.6, + "grad_norm": 45.466885900243135, + "learning_rate": 9.694706767851803e-06, + "loss": 1.2183, + "step": 5970 + }, + { + "epoch": 0.6, + "grad_norm": 47.74491389796742, + "learning_rate": 9.693696782436999e-06, + "loss": 1.2244, + "step": 5975 + }, + { + "epoch": 0.6, + "grad_norm": 38.78516801788493, + "learning_rate": 9.692685181925616e-06, + "loss": 1.2745, + "step": 5980 + }, + { + "epoch": 0.6, + "grad_norm": 8.448991128639108, + "learning_rate": 9.691671966665743e-06, + "loss": 1.2304, + "step": 5985 + }, + { + "epoch": 0.6, + "grad_norm": 8.417128118229789, + "learning_rate": 9.690657137006028e-06, + "loss": 1.2243, + "step": 5990 + }, + { + "epoch": 0.6, + "grad_norm": 11.355053182742228, + "learning_rate": 9.689640693295673e-06, + "loss": 1.2213, + "step": 5995 + }, + { + "epoch": 0.6, + "grad_norm": 10.028209869242007, + "learning_rate": 9.688622635884434e-06, + "loss": 1.2339, + "step": 6000 + }, + { + "epoch": 0.61, + "grad_norm": 10.668657050541347, + "learning_rate": 9.687602965122624e-06, + "loss": 1.2313, + "step": 6005 + }, + { + "epoch": 0.61, + "grad_norm": 11.902714837352619, + "learning_rate": 9.686581681361112e-06, + "loss": 1.2749, + "step": 6010 + }, + { + "epoch": 0.61, + "grad_norm": 14.917042572282254, + "learning_rate": 9.685558784951318e-06, + "loss": 1.1797, + "step": 6015 + }, + { + "epoch": 0.61, + "grad_norm": 15.440952998521315, + "learning_rate": 9.684534276245222e-06, + "loss": 1.2392, + "step": 6020 + }, + { + "epoch": 0.61, + "grad_norm": 28.600345521150423, + "learning_rate": 9.683508155595355e-06, + "loss": 1.2236, + "step": 6025 + }, + { + "epoch": 0.61, + "grad_norm": 55.11319802817678, + "learning_rate": 9.682480423354805e-06, + "loss": 1.206, + "step": 6030 + }, + { + "epoch": 0.61, + "grad_norm": 60.53880107700804, + "learning_rate": 9.681451079877214e-06, + "loss": 1.2893, + "step": 6035 + }, + { + "epoch": 0.61, + "grad_norm": 21.035566080355625, + "learning_rate": 9.680420125516779e-06, + "loss": 1.2402, + "step": 6040 + }, + { + "epoch": 0.61, + "grad_norm": 15.617150535642468, + "learning_rate": 9.679387560628247e-06, + "loss": 1.206, + "step": 6045 + }, + { + "epoch": 0.61, + "grad_norm": 18.093646024119007, + "learning_rate": 9.678353385566926e-06, + "loss": 1.2368, + "step": 6050 + }, + { + "epoch": 0.61, + "grad_norm": 18.733954738163668, + "learning_rate": 9.677317600688674e-06, + "loss": 1.2561, + "step": 6055 + }, + { + "epoch": 0.61, + "grad_norm": 11.341747092645615, + "learning_rate": 9.676280206349902e-06, + "loss": 1.2127, + "step": 6060 + }, + { + "epoch": 0.61, + "grad_norm": 8.062867324416539, + "learning_rate": 9.675241202907577e-06, + "loss": 1.2512, + "step": 6065 + }, + { + "epoch": 0.61, + "grad_norm": 7.630871536778502, + "learning_rate": 9.674200590719219e-06, + "loss": 1.1599, + "step": 6070 + }, + { + "epoch": 0.61, + "grad_norm": 10.54550227669717, + "learning_rate": 9.673158370142902e-06, + "loss": 1.158, + "step": 6075 + }, + { + "epoch": 0.61, + "grad_norm": 17.44815630750628, + "learning_rate": 9.672114541537255e-06, + "loss": 1.2572, + "step": 6080 + }, + { + "epoch": 0.61, + "grad_norm": 8.579969741515313, + "learning_rate": 9.671069105261457e-06, + "loss": 1.1934, + "step": 6085 + }, + { + "epoch": 0.61, + "grad_norm": 6.899566769739347, + "learning_rate": 9.67002206167524e-06, + "loss": 1.26, + "step": 6090 + }, + { + "epoch": 0.61, + "grad_norm": 8.65903029035593, + "learning_rate": 9.66897341113889e-06, + "loss": 1.2176, + "step": 6095 + }, + { + "epoch": 0.62, + "grad_norm": 11.406093802458411, + "learning_rate": 9.667923154013252e-06, + "loss": 1.2592, + "step": 6100 + }, + { + "epoch": 0.62, + "grad_norm": 16.11693527850338, + "learning_rate": 9.666871290659715e-06, + "loss": 1.219, + "step": 6105 + }, + { + "epoch": 0.62, + "grad_norm": 15.848587868111656, + "learning_rate": 9.665817821440223e-06, + "loss": 1.2077, + "step": 6110 + }, + { + "epoch": 0.62, + "grad_norm": 11.238908843851462, + "learning_rate": 9.664762746717274e-06, + "loss": 1.2573, + "step": 6115 + }, + { + "epoch": 0.62, + "grad_norm": 12.335079682624038, + "learning_rate": 9.66370606685392e-06, + "loss": 1.2615, + "step": 6120 + }, + { + "epoch": 0.62, + "grad_norm": 12.274431760147126, + "learning_rate": 9.662647782213763e-06, + "loss": 1.2101, + "step": 6125 + }, + { + "epoch": 0.62, + "grad_norm": 9.259435803547051, + "learning_rate": 9.661587893160957e-06, + "loss": 1.2227, + "step": 6130 + }, + { + "epoch": 0.62, + "grad_norm": 18.325180356389684, + "learning_rate": 9.66052640006021e-06, + "loss": 1.2377, + "step": 6135 + }, + { + "epoch": 0.62, + "grad_norm": 10.132396290240596, + "learning_rate": 9.659463303276779e-06, + "loss": 1.2625, + "step": 6140 + }, + { + "epoch": 0.62, + "grad_norm": 7.713191427068931, + "learning_rate": 9.658398603176478e-06, + "loss": 1.2974, + "step": 6145 + }, + { + "epoch": 0.62, + "grad_norm": 17.338937889405493, + "learning_rate": 9.657332300125665e-06, + "loss": 1.2185, + "step": 6150 + }, + { + "epoch": 0.62, + "grad_norm": 21.81946959153514, + "learning_rate": 9.656264394491256e-06, + "loss": 1.1804, + "step": 6155 + }, + { + "epoch": 0.62, + "grad_norm": 17.97123339745943, + "learning_rate": 9.655194886640715e-06, + "loss": 1.2721, + "step": 6160 + }, + { + "epoch": 0.62, + "grad_norm": 20.83391204567327, + "learning_rate": 9.654123776942061e-06, + "loss": 1.2028, + "step": 6165 + }, + { + "epoch": 0.62, + "grad_norm": 18.73028944994978, + "learning_rate": 9.653051065763862e-06, + "loss": 1.3421, + "step": 6170 + }, + { + "epoch": 0.62, + "grad_norm": 53.7545975218364, + "learning_rate": 9.651976753475234e-06, + "loss": 1.2165, + "step": 6175 + }, + { + "epoch": 0.62, + "grad_norm": 77.10926771492795, + "learning_rate": 9.650900840445848e-06, + "loss": 1.2407, + "step": 6180 + }, + { + "epoch": 0.62, + "grad_norm": 14.79236867444254, + "learning_rate": 9.649823327045924e-06, + "loss": 1.2756, + "step": 6185 + }, + { + "epoch": 0.62, + "grad_norm": 17.08348453792601, + "learning_rate": 9.648744213646236e-06, + "loss": 1.2452, + "step": 6190 + }, + { + "epoch": 0.62, + "grad_norm": 37.571486457997665, + "learning_rate": 9.647663500618105e-06, + "loss": 1.2712, + "step": 6195 + }, + { + "epoch": 0.63, + "grad_norm": 62.85049842589145, + "learning_rate": 9.6465811883334e-06, + "loss": 1.2964, + "step": 6200 + }, + { + "epoch": 0.63, + "grad_norm": 38.57779119365313, + "learning_rate": 9.645497277164547e-06, + "loss": 1.2494, + "step": 6205 + }, + { + "epoch": 0.63, + "grad_norm": 20.42096778530192, + "learning_rate": 9.644411767484518e-06, + "loss": 1.2838, + "step": 6210 + }, + { + "epoch": 0.63, + "grad_norm": 22.478502839401152, + "learning_rate": 9.643324659666835e-06, + "loss": 1.2418, + "step": 6215 + }, + { + "epoch": 0.63, + "grad_norm": 22.76077350421602, + "learning_rate": 9.642235954085572e-06, + "loss": 1.2485, + "step": 6220 + }, + { + "epoch": 0.63, + "grad_norm": 16.03143192056874, + "learning_rate": 9.641145651115353e-06, + "loss": 1.2521, + "step": 6225 + }, + { + "epoch": 0.63, + "grad_norm": 11.175516763267444, + "learning_rate": 9.640053751131346e-06, + "loss": 1.2065, + "step": 6230 + }, + { + "epoch": 0.63, + "grad_norm": 28.09344221326846, + "learning_rate": 9.638960254509275e-06, + "loss": 1.2107, + "step": 6235 + }, + { + "epoch": 0.63, + "grad_norm": 13.04588546999759, + "learning_rate": 9.637865161625413e-06, + "loss": 1.2309, + "step": 6240 + }, + { + "epoch": 0.63, + "grad_norm": 10.280353040372706, + "learning_rate": 9.636768472856576e-06, + "loss": 1.2752, + "step": 6245 + }, + { + "epoch": 0.63, + "grad_norm": 11.312458044057468, + "learning_rate": 9.635670188580137e-06, + "loss": 1.2841, + "step": 6250 + }, + { + "epoch": 0.63, + "grad_norm": 18.780242029323798, + "learning_rate": 9.634570309174014e-06, + "loss": 1.265, + "step": 6255 + }, + { + "epoch": 0.63, + "grad_norm": 10.44977312785592, + "learning_rate": 9.633468835016675e-06, + "loss": 1.2838, + "step": 6260 + }, + { + "epoch": 0.63, + "grad_norm": 17.267491130984855, + "learning_rate": 9.632365766487135e-06, + "loss": 1.2211, + "step": 6265 + }, + { + "epoch": 0.63, + "grad_norm": 35.17393142764635, + "learning_rate": 9.631261103964958e-06, + "loss": 1.2804, + "step": 6270 + }, + { + "epoch": 0.63, + "grad_norm": 37.53819962798613, + "learning_rate": 9.63015484783026e-06, + "loss": 1.2865, + "step": 6275 + }, + { + "epoch": 0.63, + "grad_norm": 34.51336920403124, + "learning_rate": 9.6290469984637e-06, + "loss": 1.2361, + "step": 6280 + }, + { + "epoch": 0.63, + "grad_norm": 26.10458429883579, + "learning_rate": 9.62793755624649e-06, + "loss": 1.2768, + "step": 6285 + }, + { + "epoch": 0.63, + "grad_norm": 34.46040241880211, + "learning_rate": 9.626826521560387e-06, + "loss": 1.2453, + "step": 6290 + }, + { + "epoch": 0.63, + "grad_norm": 88.14491517650183, + "learning_rate": 9.625713894787696e-06, + "loss": 1.2459, + "step": 6295 + }, + { + "epoch": 0.64, + "grad_norm": 76.31252087250449, + "learning_rate": 9.624599676311273e-06, + "loss": 1.2018, + "step": 6300 + }, + { + "epoch": 0.64, + "grad_norm": 28.64535600337029, + "learning_rate": 9.623483866514517e-06, + "loss": 1.2601, + "step": 6305 + }, + { + "epoch": 0.64, + "grad_norm": 18.035308610589407, + "learning_rate": 9.622366465781378e-06, + "loss": 1.2424, + "step": 6310 + }, + { + "epoch": 0.64, + "grad_norm": 12.908505676676313, + "learning_rate": 9.621247474496357e-06, + "loss": 1.2472, + "step": 6315 + }, + { + "epoch": 0.64, + "grad_norm": 18.115144941515627, + "learning_rate": 9.620126893044491e-06, + "loss": 1.2438, + "step": 6320 + }, + { + "epoch": 0.64, + "grad_norm": 37.6967841689743, + "learning_rate": 9.619004721811372e-06, + "loss": 1.2521, + "step": 6325 + }, + { + "epoch": 0.64, + "grad_norm": 23.183543834786384, + "learning_rate": 9.617880961183143e-06, + "loss": 1.2517, + "step": 6330 + }, + { + "epoch": 0.64, + "grad_norm": 21.772660530997452, + "learning_rate": 9.616755611546484e-06, + "loss": 1.2042, + "step": 6335 + }, + { + "epoch": 0.64, + "grad_norm": 8.213710289509011, + "learning_rate": 9.615628673288629e-06, + "loss": 1.2467, + "step": 6340 + }, + { + "epoch": 0.64, + "grad_norm": 17.00506589316768, + "learning_rate": 9.614500146797356e-06, + "loss": 1.2252, + "step": 6345 + }, + { + "epoch": 0.64, + "grad_norm": 7.687034743993428, + "learning_rate": 9.61337003246099e-06, + "loss": 1.2222, + "step": 6350 + }, + { + "epoch": 0.64, + "grad_norm": 10.589184643064405, + "learning_rate": 9.612238330668401e-06, + "loss": 1.1862, + "step": 6355 + }, + { + "epoch": 0.64, + "grad_norm": 8.600735030357516, + "learning_rate": 9.61110504180901e-06, + "loss": 1.2781, + "step": 6360 + }, + { + "epoch": 0.64, + "grad_norm": 7.895062163861428, + "learning_rate": 9.609970166272777e-06, + "loss": 1.2641, + "step": 6365 + }, + { + "epoch": 0.64, + "grad_norm": 9.94101027761609, + "learning_rate": 9.608833704450213e-06, + "loss": 1.1981, + "step": 6370 + }, + { + "epoch": 0.64, + "grad_norm": 8.047485735007022, + "learning_rate": 9.607695656732375e-06, + "loss": 1.2508, + "step": 6375 + }, + { + "epoch": 0.64, + "grad_norm": 7.586597449322517, + "learning_rate": 9.60655602351086e-06, + "loss": 1.2631, + "step": 6380 + }, + { + "epoch": 0.64, + "grad_norm": 8.42166296432965, + "learning_rate": 9.605414805177817e-06, + "loss": 1.2241, + "step": 6385 + }, + { + "epoch": 0.64, + "grad_norm": 7.152314852090806, + "learning_rate": 9.604272002125938e-06, + "loss": 1.2114, + "step": 6390 + }, + { + "epoch": 0.64, + "grad_norm": 9.068297972056287, + "learning_rate": 9.603127614748461e-06, + "loss": 1.2552, + "step": 6395 + }, + { + "epoch": 0.65, + "grad_norm": 19.653506664350946, + "learning_rate": 9.601981643439168e-06, + "loss": 1.259, + "step": 6400 + }, + { + "epoch": 0.65, + "grad_norm": 41.18724376017987, + "learning_rate": 9.600834088592388e-06, + "loss": 1.2159, + "step": 6405 + }, + { + "epoch": 0.65, + "grad_norm": 24.876109952152202, + "learning_rate": 9.599684950602991e-06, + "loss": 1.2188, + "step": 6410 + }, + { + "epoch": 0.65, + "grad_norm": 22.76694558163489, + "learning_rate": 9.598534229866398e-06, + "loss": 1.2662, + "step": 6415 + }, + { + "epoch": 0.65, + "grad_norm": 17.60411778830573, + "learning_rate": 9.597381926778567e-06, + "loss": 1.2528, + "step": 6420 + }, + { + "epoch": 0.65, + "grad_norm": 24.16780324595837, + "learning_rate": 9.596228041736007e-06, + "loss": 1.2151, + "step": 6425 + }, + { + "epoch": 0.65, + "grad_norm": 16.10966313520157, + "learning_rate": 9.595072575135767e-06, + "loss": 1.2488, + "step": 6430 + }, + { + "epoch": 0.65, + "grad_norm": 12.37016194782002, + "learning_rate": 9.593915527375443e-06, + "loss": 1.2451, + "step": 6435 + }, + { + "epoch": 0.65, + "grad_norm": 30.760442676404576, + "learning_rate": 9.592756898853173e-06, + "loss": 1.1768, + "step": 6440 + }, + { + "epoch": 0.65, + "grad_norm": 7.9327806692135825, + "learning_rate": 9.591596689967642e-06, + "loss": 1.239, + "step": 6445 + }, + { + "epoch": 0.65, + "grad_norm": 7.935892823030629, + "learning_rate": 9.590434901118073e-06, + "loss": 1.2067, + "step": 6450 + }, + { + "epoch": 0.65, + "grad_norm": 10.38240903059879, + "learning_rate": 9.58927153270424e-06, + "loss": 1.2602, + "step": 6455 + }, + { + "epoch": 0.65, + "grad_norm": 15.125354639455605, + "learning_rate": 9.588106585126457e-06, + "loss": 1.2293, + "step": 6460 + }, + { + "epoch": 0.65, + "grad_norm": 9.382238333633055, + "learning_rate": 9.58694005878558e-06, + "loss": 1.3058, + "step": 6465 + }, + { + "epoch": 0.65, + "grad_norm": 26.545333628441504, + "learning_rate": 9.58577195408301e-06, + "loss": 1.2527, + "step": 6470 + }, + { + "epoch": 0.65, + "grad_norm": 9.298505320684617, + "learning_rate": 9.584602271420688e-06, + "loss": 1.275, + "step": 6475 + }, + { + "epoch": 0.65, + "grad_norm": 26.275263338563576, + "learning_rate": 9.583431011201105e-06, + "loss": 1.251, + "step": 6480 + }, + { + "epoch": 0.65, + "grad_norm": 9.408209269007875, + "learning_rate": 9.58225817382729e-06, + "loss": 1.2217, + "step": 6485 + }, + { + "epoch": 0.65, + "grad_norm": 30.065555129882664, + "learning_rate": 9.581083759702813e-06, + "loss": 1.2567, + "step": 6490 + }, + { + "epoch": 0.65, + "grad_norm": 27.407127676629674, + "learning_rate": 9.579907769231789e-06, + "loss": 1.2801, + "step": 6495 + }, + { + "epoch": 0.66, + "grad_norm": 28.56496645801653, + "learning_rate": 9.578730202818875e-06, + "loss": 1.2509, + "step": 6500 + }, + { + "epoch": 0.66, + "grad_norm": 17.31986863958474, + "learning_rate": 9.577551060869274e-06, + "loss": 1.2163, + "step": 6505 + }, + { + "epoch": 0.66, + "grad_norm": 9.653806228046228, + "learning_rate": 9.576370343788723e-06, + "loss": 1.2312, + "step": 6510 + }, + { + "epoch": 0.66, + "grad_norm": 13.001146930549856, + "learning_rate": 9.57518805198351e-06, + "loss": 1.2487, + "step": 6515 + }, + { + "epoch": 0.66, + "grad_norm": 11.961593528674674, + "learning_rate": 9.574004185860456e-06, + "loss": 1.2569, + "step": 6520 + }, + { + "epoch": 0.66, + "grad_norm": 36.483473798179794, + "learning_rate": 9.57281874582693e-06, + "loss": 1.2644, + "step": 6525 + }, + { + "epoch": 0.66, + "grad_norm": 39.88278761239571, + "learning_rate": 9.571631732290842e-06, + "loss": 1.2123, + "step": 6530 + }, + { + "epoch": 0.66, + "grad_norm": 30.69636858528973, + "learning_rate": 9.570443145660643e-06, + "loss": 1.2888, + "step": 6535 + }, + { + "epoch": 0.66, + "grad_norm": 16.863130870943063, + "learning_rate": 9.56925298634532e-06, + "loss": 1.2528, + "step": 6540 + }, + { + "epoch": 0.66, + "grad_norm": 23.332506732937507, + "learning_rate": 9.568061254754411e-06, + "loss": 1.244, + "step": 6545 + }, + { + "epoch": 0.66, + "grad_norm": 21.000864386202174, + "learning_rate": 9.566867951297985e-06, + "loss": 1.1825, + "step": 6550 + }, + { + "epoch": 0.66, + "grad_norm": 24.943736587546084, + "learning_rate": 9.56567307638666e-06, + "loss": 1.2365, + "step": 6555 + }, + { + "epoch": 0.66, + "grad_norm": 26.197989638859376, + "learning_rate": 9.56447663043159e-06, + "loss": 1.2024, + "step": 6560 + }, + { + "epoch": 0.66, + "grad_norm": 27.33670350537627, + "learning_rate": 9.56327861384447e-06, + "loss": 1.2344, + "step": 6565 + }, + { + "epoch": 0.66, + "grad_norm": 54.36381056789312, + "learning_rate": 9.56207902703754e-06, + "loss": 1.2252, + "step": 6570 + }, + { + "epoch": 0.66, + "grad_norm": 24.11108123688655, + "learning_rate": 9.560877870423571e-06, + "loss": 1.255, + "step": 6575 + }, + { + "epoch": 0.66, + "grad_norm": 26.91323668325551, + "learning_rate": 9.559675144415884e-06, + "loss": 1.2607, + "step": 6580 + }, + { + "epoch": 0.66, + "grad_norm": 18.21113053277181, + "learning_rate": 9.558470849428336e-06, + "loss": 1.2771, + "step": 6585 + }, + { + "epoch": 0.66, + "grad_norm": 21.156681839865353, + "learning_rate": 9.557264985875322e-06, + "loss": 1.2141, + "step": 6590 + }, + { + "epoch": 0.66, + "grad_norm": 14.857850606300715, + "learning_rate": 9.556057554171779e-06, + "loss": 1.2938, + "step": 6595 + }, + { + "epoch": 0.67, + "grad_norm": 36.77440472712633, + "learning_rate": 9.554848554733183e-06, + "loss": 1.2396, + "step": 6600 + }, + { + "epoch": 0.67, + "grad_norm": 32.7688105252039, + "learning_rate": 9.55363798797555e-06, + "loss": 1.2142, + "step": 6605 + }, + { + "epoch": 0.67, + "grad_norm": 18.13894292347681, + "learning_rate": 9.552425854315434e-06, + "loss": 1.2874, + "step": 6610 + }, + { + "epoch": 0.67, + "grad_norm": 26.93915633805332, + "learning_rate": 9.55121215416993e-06, + "loss": 1.2832, + "step": 6615 + }, + { + "epoch": 0.67, + "grad_norm": 18.665374233208155, + "learning_rate": 9.549996887956669e-06, + "loss": 1.2471, + "step": 6620 + }, + { + "epoch": 0.67, + "grad_norm": 14.125730711800594, + "learning_rate": 9.548780056093826e-06, + "loss": 1.2374, + "step": 6625 + }, + { + "epoch": 0.67, + "grad_norm": 18.90762837288638, + "learning_rate": 9.547561659000112e-06, + "loss": 1.2515, + "step": 6630 + }, + { + "epoch": 0.67, + "grad_norm": 22.7067280652705, + "learning_rate": 9.546341697094772e-06, + "loss": 1.2299, + "step": 6635 + }, + { + "epoch": 0.67, + "grad_norm": 24.480892629449738, + "learning_rate": 9.545120170797596e-06, + "loss": 1.2745, + "step": 6640 + }, + { + "epoch": 0.67, + "grad_norm": 13.729366330188203, + "learning_rate": 9.543897080528912e-06, + "loss": 1.2389, + "step": 6645 + }, + { + "epoch": 0.67, + "grad_norm": 9.27919718275337, + "learning_rate": 9.542672426709582e-06, + "loss": 1.2478, + "step": 6650 + }, + { + "epoch": 0.67, + "grad_norm": 29.536054001370225, + "learning_rate": 9.54144620976101e-06, + "loss": 1.208, + "step": 6655 + }, + { + "epoch": 0.67, + "grad_norm": 9.512285015383293, + "learning_rate": 9.540218430105133e-06, + "loss": 1.227, + "step": 6660 + }, + { + "epoch": 0.67, + "grad_norm": 7.868686602501446, + "learning_rate": 9.53898908816443e-06, + "loss": 1.2358, + "step": 6665 + }, + { + "epoch": 0.67, + "grad_norm": 6.812767303201554, + "learning_rate": 9.537758184361919e-06, + "loss": 1.2234, + "step": 6670 + }, + { + "epoch": 0.67, + "grad_norm": 6.52552803155819, + "learning_rate": 9.536525719121151e-06, + "loss": 1.256, + "step": 6675 + }, + { + "epoch": 0.67, + "grad_norm": 8.192657871379609, + "learning_rate": 9.535291692866214e-06, + "loss": 1.2279, + "step": 6680 + }, + { + "epoch": 0.67, + "grad_norm": 7.97303139318268, + "learning_rate": 9.534056106021739e-06, + "loss": 1.2419, + "step": 6685 + }, + { + "epoch": 0.67, + "grad_norm": 8.820777015333706, + "learning_rate": 9.532818959012885e-06, + "loss": 1.2114, + "step": 6690 + }, + { + "epoch": 0.68, + "grad_norm": 18.200670156862532, + "learning_rate": 9.53158025226536e-06, + "loss": 1.2167, + "step": 6695 + }, + { + "epoch": 0.68, + "grad_norm": 19.482322599930992, + "learning_rate": 9.530339986205398e-06, + "loss": 1.2416, + "step": 6700 + }, + { + "epoch": 0.68, + "grad_norm": 23.067243107377486, + "learning_rate": 9.529098161259774e-06, + "loss": 1.2141, + "step": 6705 + }, + { + "epoch": 0.68, + "grad_norm": 6.4198088592960145, + "learning_rate": 9.527854777855797e-06, + "loss": 1.2327, + "step": 6710 + }, + { + "epoch": 0.68, + "grad_norm": 8.785443206784343, + "learning_rate": 9.526609836421316e-06, + "loss": 1.226, + "step": 6715 + }, + { + "epoch": 0.68, + "grad_norm": 20.46431809622519, + "learning_rate": 9.525363337384715e-06, + "loss": 1.2453, + "step": 6720 + }, + { + "epoch": 0.68, + "grad_norm": 36.130670249795706, + "learning_rate": 9.52411528117491e-06, + "loss": 1.2554, + "step": 6725 + }, + { + "epoch": 0.68, + "grad_norm": 22.559710806107194, + "learning_rate": 9.522865668221357e-06, + "loss": 1.2525, + "step": 6730 + }, + { + "epoch": 0.68, + "grad_norm": 6.352798022234042, + "learning_rate": 9.52161449895405e-06, + "loss": 1.2245, + "step": 6735 + }, + { + "epoch": 0.68, + "grad_norm": 15.847839472599194, + "learning_rate": 9.52036177380351e-06, + "loss": 1.256, + "step": 6740 + }, + { + "epoch": 0.68, + "grad_norm": 9.453488936634951, + "learning_rate": 9.519107493200803e-06, + "loss": 1.2318, + "step": 6745 + }, + { + "epoch": 0.68, + "grad_norm": 11.633528345767568, + "learning_rate": 9.517851657577523e-06, + "loss": 1.2229, + "step": 6750 + }, + { + "epoch": 0.68, + "grad_norm": 10.626581824133057, + "learning_rate": 9.516594267365804e-06, + "loss": 1.2158, + "step": 6755 + }, + { + "epoch": 0.68, + "grad_norm": 11.393721825328813, + "learning_rate": 9.51533532299831e-06, + "loss": 1.2719, + "step": 6760 + }, + { + "epoch": 0.68, + "grad_norm": 9.769313015181671, + "learning_rate": 9.514074824908245e-06, + "loss": 1.3028, + "step": 6765 + }, + { + "epoch": 0.68, + "grad_norm": 20.33308617227934, + "learning_rate": 9.512812773529343e-06, + "loss": 1.2591, + "step": 6770 + }, + { + "epoch": 0.68, + "grad_norm": 11.166131697654846, + "learning_rate": 9.511549169295877e-06, + "loss": 1.1871, + "step": 6775 + }, + { + "epoch": 0.68, + "grad_norm": 14.136358746864397, + "learning_rate": 9.51028401264265e-06, + "loss": 1.2101, + "step": 6780 + }, + { + "epoch": 0.68, + "grad_norm": 13.390290412429662, + "learning_rate": 9.509017304005003e-06, + "loss": 1.195, + "step": 6785 + }, + { + "epoch": 0.68, + "grad_norm": 9.224827613504818, + "learning_rate": 9.507749043818806e-06, + "loss": 1.218, + "step": 6790 + }, + { + "epoch": 0.69, + "grad_norm": 47.04593173534263, + "learning_rate": 9.506479232520472e-06, + "loss": 1.2103, + "step": 6795 + }, + { + "epoch": 0.69, + "grad_norm": 14.859674831678463, + "learning_rate": 9.505207870546935e-06, + "loss": 1.1938, + "step": 6800 + }, + { + "epoch": 0.69, + "grad_norm": 31.86713786138509, + "learning_rate": 9.503934958335674e-06, + "loss": 1.1739, + "step": 6805 + }, + { + "epoch": 0.69, + "grad_norm": 45.09655897499265, + "learning_rate": 9.502660496324695e-06, + "loss": 1.2677, + "step": 6810 + }, + { + "epoch": 0.69, + "grad_norm": 22.36183306586514, + "learning_rate": 9.501384484952542e-06, + "loss": 1.2735, + "step": 6815 + }, + { + "epoch": 0.69, + "grad_norm": 9.55254057571248, + "learning_rate": 9.500106924658286e-06, + "loss": 1.2156, + "step": 6820 + }, + { + "epoch": 0.69, + "grad_norm": 9.712501636130115, + "learning_rate": 9.498827815881535e-06, + "loss": 1.2817, + "step": 6825 + }, + { + "epoch": 0.69, + "grad_norm": 18.22147105174755, + "learning_rate": 9.497547159062429e-06, + "loss": 1.1614, + "step": 6830 + }, + { + "epoch": 0.69, + "grad_norm": 16.77428145176956, + "learning_rate": 9.496264954641642e-06, + "loss": 1.184, + "step": 6835 + }, + { + "epoch": 0.69, + "grad_norm": 21.482036685862674, + "learning_rate": 9.494981203060377e-06, + "loss": 1.2329, + "step": 6840 + }, + { + "epoch": 0.69, + "grad_norm": 30.82452357899338, + "learning_rate": 9.493695904760374e-06, + "loss": 1.2574, + "step": 6845 + }, + { + "epoch": 0.69, + "grad_norm": 49.07622638487179, + "learning_rate": 9.492409060183902e-06, + "loss": 1.215, + "step": 6850 + }, + { + "epoch": 0.69, + "grad_norm": 27.852350571005495, + "learning_rate": 9.491120669773764e-06, + "loss": 1.253, + "step": 6855 + }, + { + "epoch": 0.69, + "grad_norm": 21.832099841552093, + "learning_rate": 9.489830733973294e-06, + "loss": 1.2532, + "step": 6860 + }, + { + "epoch": 0.69, + "grad_norm": 9.263626058875476, + "learning_rate": 9.488539253226355e-06, + "loss": 1.2408, + "step": 6865 + }, + { + "epoch": 0.69, + "grad_norm": 14.705639458146393, + "learning_rate": 9.487246227977344e-06, + "loss": 1.2532, + "step": 6870 + }, + { + "epoch": 0.69, + "grad_norm": 20.862808964869487, + "learning_rate": 9.485951658671195e-06, + "loss": 1.2053, + "step": 6875 + }, + { + "epoch": 0.69, + "grad_norm": 25.52529554846823, + "learning_rate": 9.484655545753365e-06, + "loss": 1.2055, + "step": 6880 + }, + { + "epoch": 0.69, + "grad_norm": 27.135176135004095, + "learning_rate": 9.483357889669844e-06, + "loss": 1.2336, + "step": 6885 + }, + { + "epoch": 0.69, + "grad_norm": 13.919200196550955, + "learning_rate": 9.482058690867155e-06, + "loss": 1.222, + "step": 6890 + }, + { + "epoch": 0.7, + "grad_norm": 8.228121962617985, + "learning_rate": 9.480757949792352e-06, + "loss": 1.2375, + "step": 6895 + }, + { + "epoch": 0.7, + "grad_norm": 38.56471924185, + "learning_rate": 9.479455666893017e-06, + "loss": 1.2782, + "step": 6900 + }, + { + "epoch": 0.7, + "grad_norm": 16.839326193627322, + "learning_rate": 9.478151842617266e-06, + "loss": 1.2418, + "step": 6905 + }, + { + "epoch": 0.7, + "grad_norm": 25.0169370675844, + "learning_rate": 9.476846477413744e-06, + "loss": 1.2698, + "step": 6910 + }, + { + "epoch": 0.7, + "grad_norm": 20.44364527322342, + "learning_rate": 9.475539571731623e-06, + "loss": 1.2483, + "step": 6915 + }, + { + "epoch": 0.7, + "grad_norm": 33.7266187137183, + "learning_rate": 9.474231126020611e-06, + "loss": 1.2221, + "step": 6920 + }, + { + "epoch": 0.7, + "grad_norm": 20.670169197792678, + "learning_rate": 9.472921140730942e-06, + "loss": 1.2104, + "step": 6925 + }, + { + "epoch": 0.7, + "grad_norm": 7.691279573789632, + "learning_rate": 9.47160961631338e-06, + "loss": 1.245, + "step": 6930 + }, + { + "epoch": 0.7, + "grad_norm": 8.08452734229773, + "learning_rate": 9.470296553219221e-06, + "loss": 1.2224, + "step": 6935 + }, + { + "epoch": 0.7, + "grad_norm": 9.076040025569132, + "learning_rate": 9.468981951900288e-06, + "loss": 1.2418, + "step": 6940 + }, + { + "epoch": 0.7, + "grad_norm": 8.83344240330592, + "learning_rate": 9.467665812808933e-06, + "loss": 1.232, + "step": 6945 + }, + { + "epoch": 0.7, + "grad_norm": 13.906164830629537, + "learning_rate": 9.466348136398038e-06, + "loss": 1.2528, + "step": 6950 + }, + { + "epoch": 0.7, + "grad_norm": 7.360240095907476, + "learning_rate": 9.465028923121016e-06, + "loss": 1.2909, + "step": 6955 + }, + { + "epoch": 0.7, + "grad_norm": 9.91868384199132, + "learning_rate": 9.463708173431808e-06, + "loss": 1.2611, + "step": 6960 + }, + { + "epoch": 0.7, + "grad_norm": 42.551322031638165, + "learning_rate": 9.462385887784878e-06, + "loss": 1.2037, + "step": 6965 + }, + { + "epoch": 0.7, + "grad_norm": 15.858468890492471, + "learning_rate": 9.461062066635227e-06, + "loss": 1.2246, + "step": 6970 + }, + { + "epoch": 0.7, + "grad_norm": 46.32101587841062, + "learning_rate": 9.45973671043838e-06, + "loss": 1.2431, + "step": 6975 + }, + { + "epoch": 0.7, + "grad_norm": 27.18892845859127, + "learning_rate": 9.45840981965039e-06, + "loss": 1.1757, + "step": 6980 + }, + { + "epoch": 0.7, + "grad_norm": 41.3387922376486, + "learning_rate": 9.457081394727839e-06, + "loss": 1.2043, + "step": 6985 + }, + { + "epoch": 0.7, + "grad_norm": 74.34153904649125, + "learning_rate": 9.455751436127838e-06, + "loss": 1.2271, + "step": 6990 + }, + { + "epoch": 0.71, + "grad_norm": 47.880845285006515, + "learning_rate": 9.454419944308023e-06, + "loss": 1.289, + "step": 6995 + }, + { + "epoch": 0.71, + "grad_norm": 8.31019170814755, + "learning_rate": 9.45308691972656e-06, + "loss": 1.2575, + "step": 7000 + }, + { + "epoch": 0.71, + "grad_norm": 35.09321220970617, + "learning_rate": 9.451752362842142e-06, + "loss": 1.2396, + "step": 7005 + }, + { + "epoch": 0.71, + "grad_norm": 45.95642775325224, + "learning_rate": 9.450416274113984e-06, + "loss": 1.2274, + "step": 7010 + }, + { + "epoch": 0.71, + "grad_norm": 20.437604930753526, + "learning_rate": 9.44907865400184e-06, + "loss": 1.2589, + "step": 7015 + }, + { + "epoch": 0.71, + "grad_norm": 110.20869446046459, + "learning_rate": 9.447739502965981e-06, + "loss": 1.26, + "step": 7020 + }, + { + "epoch": 0.71, + "grad_norm": 55.010558102721724, + "learning_rate": 9.446398821467207e-06, + "loss": 1.27, + "step": 7025 + }, + { + "epoch": 0.71, + "grad_norm": 37.421245925011334, + "learning_rate": 9.445056609966843e-06, + "loss": 1.2753, + "step": 7030 + }, + { + "epoch": 0.71, + "grad_norm": 58.58172405386906, + "learning_rate": 9.443712868926747e-06, + "loss": 1.2242, + "step": 7035 + }, + { + "epoch": 0.71, + "grad_norm": 108.11313414129204, + "learning_rate": 9.442367598809296e-06, + "loss": 1.3262, + "step": 7040 + }, + { + "epoch": 0.71, + "grad_norm": 53.60255208092043, + "learning_rate": 9.441020800077398e-06, + "loss": 1.2836, + "step": 7045 + }, + { + "epoch": 0.71, + "grad_norm": 39.63944842158645, + "learning_rate": 9.439672473194484e-06, + "loss": 1.2709, + "step": 7050 + }, + { + "epoch": 0.71, + "grad_norm": 58.28398000888765, + "learning_rate": 9.438322618624514e-06, + "loss": 1.284, + "step": 7055 + }, + { + "epoch": 0.71, + "grad_norm": 47.30731552146769, + "learning_rate": 9.436971236831966e-06, + "loss": 1.2984, + "step": 7060 + }, + { + "epoch": 0.71, + "grad_norm": 35.50017341642059, + "learning_rate": 9.435618328281856e-06, + "loss": 1.2511, + "step": 7065 + }, + { + "epoch": 0.71, + "grad_norm": 19.540606098701666, + "learning_rate": 9.434263893439717e-06, + "loss": 1.2829, + "step": 7070 + }, + { + "epoch": 0.71, + "grad_norm": 15.46016735290377, + "learning_rate": 9.432907932771604e-06, + "loss": 1.2504, + "step": 7075 + }, + { + "epoch": 0.71, + "grad_norm": 10.716698511129454, + "learning_rate": 9.431550446744109e-06, + "loss": 1.2351, + "step": 7080 + }, + { + "epoch": 0.71, + "grad_norm": 22.396806981332013, + "learning_rate": 9.430191435824335e-06, + "loss": 1.2902, + "step": 7085 + }, + { + "epoch": 0.71, + "grad_norm": 6.093918197190177, + "learning_rate": 9.42883090047992e-06, + "loss": 1.2596, + "step": 7090 + }, + { + "epoch": 0.72, + "grad_norm": 13.438683461999606, + "learning_rate": 9.427468841179025e-06, + "loss": 1.2259, + "step": 7095 + }, + { + "epoch": 0.72, + "grad_norm": 20.475889676398804, + "learning_rate": 9.426105258390326e-06, + "loss": 1.1868, + "step": 7100 + }, + { + "epoch": 0.72, + "grad_norm": 9.25807562795457, + "learning_rate": 9.424740152583037e-06, + "loss": 1.2288, + "step": 7105 + }, + { + "epoch": 0.72, + "grad_norm": 14.92935578434538, + "learning_rate": 9.423373524226888e-06, + "loss": 1.2415, + "step": 7110 + }, + { + "epoch": 0.72, + "grad_norm": 17.035920194295393, + "learning_rate": 9.422005373792134e-06, + "loss": 1.214, + "step": 7115 + }, + { + "epoch": 0.72, + "grad_norm": 8.952893664109181, + "learning_rate": 9.420635701749553e-06, + "loss": 1.2441, + "step": 7120 + }, + { + "epoch": 0.72, + "grad_norm": 10.24312828879759, + "learning_rate": 9.41926450857045e-06, + "loss": 1.2545, + "step": 7125 + }, + { + "epoch": 0.72, + "grad_norm": 17.895593284970484, + "learning_rate": 9.41789179472665e-06, + "loss": 1.2717, + "step": 7130 + }, + { + "epoch": 0.72, + "grad_norm": 25.915344582786382, + "learning_rate": 9.416517560690505e-06, + "loss": 1.2114, + "step": 7135 + }, + { + "epoch": 0.72, + "grad_norm": 28.063154100394737, + "learning_rate": 9.415141806934885e-06, + "loss": 1.1602, + "step": 7140 + }, + { + "epoch": 0.72, + "grad_norm": 60.96186172132934, + "learning_rate": 9.413764533933186e-06, + "loss": 1.2729, + "step": 7145 + }, + { + "epoch": 0.72, + "grad_norm": 34.2103281919088, + "learning_rate": 9.412385742159325e-06, + "loss": 1.1829, + "step": 7150 + }, + { + "epoch": 0.72, + "grad_norm": 31.55595638712618, + "learning_rate": 9.411005432087745e-06, + "loss": 1.2225, + "step": 7155 + }, + { + "epoch": 0.72, + "grad_norm": 16.44544536704509, + "learning_rate": 9.409623604193409e-06, + "loss": 1.1842, + "step": 7160 + }, + { + "epoch": 0.72, + "grad_norm": 19.01930018142275, + "learning_rate": 9.408240258951803e-06, + "loss": 1.234, + "step": 7165 + }, + { + "epoch": 0.72, + "grad_norm": 7.31729099988006, + "learning_rate": 9.406855396838934e-06, + "loss": 1.2533, + "step": 7170 + }, + { + "epoch": 0.72, + "grad_norm": 12.758603793971947, + "learning_rate": 9.405469018331333e-06, + "loss": 1.2281, + "step": 7175 + }, + { + "epoch": 0.72, + "grad_norm": 9.729019158963341, + "learning_rate": 9.404081123906048e-06, + "loss": 1.2155, + "step": 7180 + }, + { + "epoch": 0.72, + "grad_norm": 7.591024723043679, + "learning_rate": 9.402691714040658e-06, + "loss": 1.2419, + "step": 7185 + }, + { + "epoch": 0.72, + "grad_norm": 9.408729873579752, + "learning_rate": 9.401300789213251e-06, + "loss": 1.2615, + "step": 7190 + }, + { + "epoch": 0.73, + "grad_norm": 5.981977669792066, + "learning_rate": 9.399908349902448e-06, + "loss": 1.2266, + "step": 7195 + }, + { + "epoch": 0.73, + "grad_norm": 20.35521316100864, + "learning_rate": 9.398514396587383e-06, + "loss": 1.2822, + "step": 7200 + }, + { + "epoch": 0.73, + "grad_norm": 21.621038660870372, + "learning_rate": 9.397118929747716e-06, + "loss": 1.2243, + "step": 7205 + }, + { + "epoch": 0.73, + "grad_norm": 7.1169999232885415, + "learning_rate": 9.395721949863626e-06, + "loss": 1.2136, + "step": 7210 + }, + { + "epoch": 0.73, + "grad_norm": 14.05921741295788, + "learning_rate": 9.39432345741581e-06, + "loss": 1.2538, + "step": 7215 + }, + { + "epoch": 0.73, + "grad_norm": 15.0163186370097, + "learning_rate": 9.39292345288549e-06, + "loss": 1.1999, + "step": 7220 + }, + { + "epoch": 0.73, + "grad_norm": 9.509276797106333, + "learning_rate": 9.391521936754405e-06, + "loss": 1.1838, + "step": 7225 + }, + { + "epoch": 0.73, + "grad_norm": 9.682679767995246, + "learning_rate": 9.390118909504816e-06, + "loss": 1.2345, + "step": 7230 + }, + { + "epoch": 0.73, + "grad_norm": 8.298106706947552, + "learning_rate": 9.388714371619504e-06, + "loss": 1.2146, + "step": 7235 + }, + { + "epoch": 0.73, + "grad_norm": 32.68014420140746, + "learning_rate": 9.387308323581767e-06, + "loss": 1.2208, + "step": 7240 + }, + { + "epoch": 0.73, + "grad_norm": 32.94240281632185, + "learning_rate": 9.385900765875428e-06, + "loss": 1.2308, + "step": 7245 + }, + { + "epoch": 0.73, + "grad_norm": 11.20941662216896, + "learning_rate": 9.384491698984824e-06, + "loss": 1.2556, + "step": 7250 + }, + { + "epoch": 0.73, + "grad_norm": 44.34305604752469, + "learning_rate": 9.383081123394812e-06, + "loss": 1.2736, + "step": 7255 + }, + { + "epoch": 0.73, + "grad_norm": 25.712638034601298, + "learning_rate": 9.381669039590774e-06, + "loss": 1.2039, + "step": 7260 + }, + { + "epoch": 0.73, + "grad_norm": 48.74925265761191, + "learning_rate": 9.380255448058605e-06, + "loss": 1.2437, + "step": 7265 + }, + { + "epoch": 0.73, + "grad_norm": 57.63732263239281, + "learning_rate": 9.378840349284719e-06, + "loss": 1.2407, + "step": 7270 + }, + { + "epoch": 0.73, + "grad_norm": 8.103386894214017, + "learning_rate": 9.377423743756052e-06, + "loss": 1.2584, + "step": 7275 + }, + { + "epoch": 0.73, + "grad_norm": 23.587086106979573, + "learning_rate": 9.376005631960054e-06, + "loss": 1.1959, + "step": 7280 + }, + { + "epoch": 0.73, + "grad_norm": 47.89273622586558, + "learning_rate": 9.374586014384698e-06, + "loss": 1.2304, + "step": 7285 + }, + { + "epoch": 0.73, + "grad_norm": 14.207434380149833, + "learning_rate": 9.373164891518474e-06, + "loss": 1.2026, + "step": 7290 + }, + { + "epoch": 0.74, + "grad_norm": 25.565965293698454, + "learning_rate": 9.371742263850386e-06, + "loss": 1.2605, + "step": 7295 + }, + { + "epoch": 0.74, + "grad_norm": 12.987314783270103, + "learning_rate": 9.370318131869962e-06, + "loss": 1.2852, + "step": 7300 + }, + { + "epoch": 0.74, + "grad_norm": 8.170501768501227, + "learning_rate": 9.368892496067242e-06, + "loss": 1.2447, + "step": 7305 + }, + { + "epoch": 0.74, + "grad_norm": 24.983723833865042, + "learning_rate": 9.367465356932786e-06, + "loss": 1.2606, + "step": 7310 + }, + { + "epoch": 0.74, + "grad_norm": 10.967512176058351, + "learning_rate": 9.366036714957673e-06, + "loss": 1.2554, + "step": 7315 + }, + { + "epoch": 0.74, + "grad_norm": 7.222476940885444, + "learning_rate": 9.364606570633496e-06, + "loss": 1.2277, + "step": 7320 + }, + { + "epoch": 0.74, + "grad_norm": 8.599376103937072, + "learning_rate": 9.363174924452368e-06, + "loss": 1.2546, + "step": 7325 + }, + { + "epoch": 0.74, + "grad_norm": 8.74402561354435, + "learning_rate": 9.361741776906914e-06, + "loss": 1.2431, + "step": 7330 + }, + { + "epoch": 0.74, + "grad_norm": 9.350739519335928, + "learning_rate": 9.360307128490282e-06, + "loss": 1.2834, + "step": 7335 + }, + { + "epoch": 0.74, + "grad_norm": 58.73070927096001, + "learning_rate": 9.358870979696132e-06, + "loss": 1.2686, + "step": 7340 + }, + { + "epoch": 0.74, + "grad_norm": 9.974912177329822, + "learning_rate": 9.35743333101864e-06, + "loss": 1.1898, + "step": 7345 + }, + { + "epoch": 0.74, + "grad_norm": 17.78339694738643, + "learning_rate": 9.355994182952501e-06, + "loss": 1.2043, + "step": 7350 + }, + { + "epoch": 0.74, + "grad_norm": 18.31437657156583, + "learning_rate": 9.354553535992923e-06, + "loss": 1.2304, + "step": 7355 + }, + { + "epoch": 0.74, + "grad_norm": 20.665349554537844, + "learning_rate": 9.353111390635634e-06, + "loss": 1.189, + "step": 7360 + }, + { + "epoch": 0.74, + "grad_norm": 19.05751323569656, + "learning_rate": 9.351667747376874e-06, + "loss": 1.2734, + "step": 7365 + }, + { + "epoch": 0.74, + "grad_norm": 8.933435798473003, + "learning_rate": 9.350222606713396e-06, + "loss": 1.1759, + "step": 7370 + }, + { + "epoch": 0.74, + "grad_norm": 8.62867231923937, + "learning_rate": 9.348775969142475e-06, + "loss": 1.2595, + "step": 7375 + }, + { + "epoch": 0.74, + "grad_norm": 8.329043149847656, + "learning_rate": 9.347327835161897e-06, + "loss": 1.2506, + "step": 7380 + }, + { + "epoch": 0.74, + "grad_norm": 8.351497862234899, + "learning_rate": 9.345878205269962e-06, + "loss": 1.253, + "step": 7385 + }, + { + "epoch": 0.75, + "grad_norm": 11.719251752013907, + "learning_rate": 9.344427079965487e-06, + "loss": 1.2536, + "step": 7390 + }, + { + "epoch": 0.75, + "grad_norm": 6.942666744527387, + "learning_rate": 9.342974459747804e-06, + "loss": 1.26, + "step": 7395 + }, + { + "epoch": 0.75, + "grad_norm": 18.285466703612645, + "learning_rate": 9.341520345116759e-06, + "loss": 1.18, + "step": 7400 + }, + { + "epoch": 0.75, + "grad_norm": 30.12228972307043, + "learning_rate": 9.34006473657271e-06, + "loss": 1.2179, + "step": 7405 + }, + { + "epoch": 0.75, + "grad_norm": 12.21589668238483, + "learning_rate": 9.338607634616528e-06, + "loss": 1.2136, + "step": 7410 + }, + { + "epoch": 0.75, + "grad_norm": 12.842802840102594, + "learning_rate": 9.337149039749603e-06, + "loss": 1.1953, + "step": 7415 + }, + { + "epoch": 0.75, + "grad_norm": 10.58284469305697, + "learning_rate": 9.335688952473836e-06, + "loss": 1.2499, + "step": 7420 + }, + { + "epoch": 0.75, + "grad_norm": 11.622043712284013, + "learning_rate": 9.334227373291642e-06, + "loss": 1.2617, + "step": 7425 + }, + { + "epoch": 0.75, + "grad_norm": 11.883722967582399, + "learning_rate": 9.33276430270595e-06, + "loss": 1.2706, + "step": 7430 + }, + { + "epoch": 0.75, + "grad_norm": 10.60015415599963, + "learning_rate": 9.331299741220196e-06, + "loss": 1.2676, + "step": 7435 + }, + { + "epoch": 0.75, + "grad_norm": 13.111803702436317, + "learning_rate": 9.329833689338342e-06, + "loss": 1.2369, + "step": 7440 + }, + { + "epoch": 0.75, + "grad_norm": 17.055944336562472, + "learning_rate": 9.32836614756485e-06, + "loss": 1.259, + "step": 7445 + }, + { + "epoch": 0.75, + "grad_norm": 28.08428867001668, + "learning_rate": 9.326897116404698e-06, + "loss": 1.2645, + "step": 7450 + }, + { + "epoch": 0.75, + "grad_norm": 7.352690250363281, + "learning_rate": 9.325426596363382e-06, + "loss": 1.2373, + "step": 7455 + }, + { + "epoch": 0.75, + "grad_norm": 11.900462142361196, + "learning_rate": 9.323954587946907e-06, + "loss": 1.186, + "step": 7460 + }, + { + "epoch": 0.75, + "grad_norm": 9.566500090186759, + "learning_rate": 9.322481091661788e-06, + "loss": 1.2952, + "step": 7465 + }, + { + "epoch": 0.75, + "grad_norm": 9.033891289696, + "learning_rate": 9.321006108015053e-06, + "loss": 1.272, + "step": 7470 + }, + { + "epoch": 0.75, + "grad_norm": 14.899753735922808, + "learning_rate": 9.319529637514244e-06, + "loss": 1.2528, + "step": 7475 + }, + { + "epoch": 0.75, + "grad_norm": 9.255310223264777, + "learning_rate": 9.318051680667412e-06, + "loss": 1.2085, + "step": 7480 + }, + { + "epoch": 0.75, + "grad_norm": 17.864126836858343, + "learning_rate": 9.316572237983119e-06, + "loss": 1.2642, + "step": 7485 + }, + { + "epoch": 0.76, + "grad_norm": 6.551752495088672, + "learning_rate": 9.315091309970444e-06, + "loss": 1.2041, + "step": 7490 + }, + { + "epoch": 0.76, + "grad_norm": 22.80093972121296, + "learning_rate": 9.31360889713897e-06, + "loss": 1.2334, + "step": 7495 + }, + { + "epoch": 0.76, + "grad_norm": 28.90301544894272, + "learning_rate": 9.312124999998796e-06, + "loss": 1.2479, + "step": 7500 + }, + { + "epoch": 0.76, + "grad_norm": 23.575320478557227, + "learning_rate": 9.310639619060525e-06, + "loss": 1.1864, + "step": 7505 + }, + { + "epoch": 0.76, + "grad_norm": 27.62402399391801, + "learning_rate": 9.30915275483528e-06, + "loss": 1.2981, + "step": 7510 + }, + { + "epoch": 0.76, + "grad_norm": 29.52538342623024, + "learning_rate": 9.307664407834687e-06, + "loss": 1.2367, + "step": 7515 + }, + { + "epoch": 0.76, + "grad_norm": 27.250936315036952, + "learning_rate": 9.306174578570886e-06, + "loss": 1.2476, + "step": 7520 + }, + { + "epoch": 0.76, + "grad_norm": 29.535063998267166, + "learning_rate": 9.304683267556526e-06, + "loss": 1.253, + "step": 7525 + }, + { + "epoch": 0.76, + "grad_norm": 28.251195584812745, + "learning_rate": 9.303190475304765e-06, + "loss": 1.2189, + "step": 7530 + }, + { + "epoch": 0.76, + "grad_norm": 14.902149710792362, + "learning_rate": 9.301696202329271e-06, + "loss": 1.2406, + "step": 7535 + }, + { + "epoch": 0.76, + "grad_norm": 40.43104040021645, + "learning_rate": 9.300200449144222e-06, + "loss": 1.2063, + "step": 7540 + }, + { + "epoch": 0.76, + "grad_norm": 10.089939602148963, + "learning_rate": 9.298703216264306e-06, + "loss": 1.2279, + "step": 7545 + }, + { + "epoch": 0.76, + "grad_norm": 16.76947542440218, + "learning_rate": 9.29720450420472e-06, + "loss": 1.2208, + "step": 7550 + }, + { + "epoch": 0.76, + "grad_norm": 8.613305081206319, + "learning_rate": 9.295704313481167e-06, + "loss": 1.2397, + "step": 7555 + }, + { + "epoch": 0.76, + "grad_norm": 29.797823991883156, + "learning_rate": 9.294202644609863e-06, + "loss": 1.1968, + "step": 7560 + }, + { + "epoch": 0.76, + "grad_norm": 9.489259290104659, + "learning_rate": 9.292699498107529e-06, + "loss": 1.2035, + "step": 7565 + }, + { + "epoch": 0.76, + "grad_norm": 17.944090986437367, + "learning_rate": 9.291194874491401e-06, + "loss": 1.2632, + "step": 7570 + }, + { + "epoch": 0.76, + "grad_norm": 21.959976388311894, + "learning_rate": 9.289688774279213e-06, + "loss": 1.2442, + "step": 7575 + }, + { + "epoch": 0.76, + "grad_norm": 11.984896080723992, + "learning_rate": 9.288181197989215e-06, + "loss": 1.24, + "step": 7580 + }, + { + "epoch": 0.76, + "grad_norm": 11.296258510485956, + "learning_rate": 9.286672146140162e-06, + "loss": 1.1646, + "step": 7585 + }, + { + "epoch": 0.77, + "grad_norm": 8.409430913426574, + "learning_rate": 9.28516161925132e-06, + "loss": 1.2424, + "step": 7590 + }, + { + "epoch": 0.77, + "grad_norm": 7.815724731539182, + "learning_rate": 9.283649617842455e-06, + "loss": 1.1497, + "step": 7595 + }, + { + "epoch": 0.77, + "grad_norm": 9.245525477882799, + "learning_rate": 9.282136142433849e-06, + "loss": 1.2134, + "step": 7600 + }, + { + "epoch": 0.77, + "grad_norm": 8.055682706845232, + "learning_rate": 9.280621193546286e-06, + "loss": 1.2681, + "step": 7605 + }, + { + "epoch": 0.77, + "grad_norm": 9.08566928740703, + "learning_rate": 9.279104771701059e-06, + "loss": 1.2022, + "step": 7610 + }, + { + "epoch": 0.77, + "grad_norm": 8.766664575908408, + "learning_rate": 9.277586877419967e-06, + "loss": 1.2152, + "step": 7615 + }, + { + "epoch": 0.77, + "grad_norm": 13.087774417016558, + "learning_rate": 9.276067511225318e-06, + "loss": 1.2306, + "step": 7620 + }, + { + "epoch": 0.77, + "grad_norm": 29.673522022975185, + "learning_rate": 9.274546673639919e-06, + "loss": 1.2166, + "step": 7625 + }, + { + "epoch": 0.77, + "grad_norm": 51.539591817311056, + "learning_rate": 9.273024365187093e-06, + "loss": 1.2284, + "step": 7630 + }, + { + "epoch": 0.77, + "grad_norm": 12.140019429296594, + "learning_rate": 9.271500586390666e-06, + "loss": 1.2434, + "step": 7635 + }, + { + "epoch": 0.77, + "grad_norm": 26.036718584654587, + "learning_rate": 9.269975337774967e-06, + "loss": 1.1555, + "step": 7640 + }, + { + "epoch": 0.77, + "grad_norm": 12.191433366044787, + "learning_rate": 9.268448619864832e-06, + "loss": 1.2142, + "step": 7645 + }, + { + "epoch": 0.77, + "grad_norm": 16.33184538805689, + "learning_rate": 9.266920433185603e-06, + "loss": 1.2552, + "step": 7650 + }, + { + "epoch": 0.77, + "grad_norm": 25.21325091607401, + "learning_rate": 9.265390778263129e-06, + "loss": 1.2298, + "step": 7655 + }, + { + "epoch": 0.77, + "grad_norm": 7.156688433644524, + "learning_rate": 9.263859655623761e-06, + "loss": 1.2132, + "step": 7660 + }, + { + "epoch": 0.77, + "grad_norm": 11.300367798956218, + "learning_rate": 9.262327065794358e-06, + "loss": 1.2519, + "step": 7665 + }, + { + "epoch": 0.77, + "grad_norm": 9.169919324760624, + "learning_rate": 9.260793009302284e-06, + "loss": 1.2031, + "step": 7670 + }, + { + "epoch": 0.77, + "grad_norm": 25.815209942981188, + "learning_rate": 9.259257486675404e-06, + "loss": 1.2059, + "step": 7675 + }, + { + "epoch": 0.77, + "grad_norm": 23.533425674532772, + "learning_rate": 9.25772049844209e-06, + "loss": 1.2389, + "step": 7680 + }, + { + "epoch": 0.77, + "grad_norm": 8.829696761260031, + "learning_rate": 9.256182045131222e-06, + "loss": 1.1899, + "step": 7685 + }, + { + "epoch": 0.78, + "grad_norm": 11.084671135829076, + "learning_rate": 9.254642127272175e-06, + "loss": 1.2283, + "step": 7690 + }, + { + "epoch": 0.78, + "grad_norm": 11.66814246775726, + "learning_rate": 9.253100745394836e-06, + "loss": 1.2656, + "step": 7695 + }, + { + "epoch": 0.78, + "grad_norm": 8.733947935037307, + "learning_rate": 9.251557900029593e-06, + "loss": 1.2077, + "step": 7700 + }, + { + "epoch": 0.78, + "grad_norm": 9.323735837842106, + "learning_rate": 9.250013591707339e-06, + "loss": 1.2519, + "step": 7705 + }, + { + "epoch": 0.78, + "grad_norm": 10.768122637522222, + "learning_rate": 9.248467820959467e-06, + "loss": 1.2063, + "step": 7710 + }, + { + "epoch": 0.78, + "grad_norm": 17.952374205143265, + "learning_rate": 9.246920588317873e-06, + "loss": 1.2455, + "step": 7715 + }, + { + "epoch": 0.78, + "grad_norm": 22.129056751142848, + "learning_rate": 9.245371894314962e-06, + "loss": 1.2426, + "step": 7720 + }, + { + "epoch": 0.78, + "grad_norm": 14.67588349421575, + "learning_rate": 9.243821739483638e-06, + "loss": 1.227, + "step": 7725 + }, + { + "epoch": 0.78, + "grad_norm": 77.09409480945052, + "learning_rate": 9.242270124357306e-06, + "loss": 1.1896, + "step": 7730 + }, + { + "epoch": 0.78, + "grad_norm": 17.986491603701044, + "learning_rate": 9.240717049469874e-06, + "loss": 1.2003, + "step": 7735 + }, + { + "epoch": 0.78, + "grad_norm": 29.014583020401524, + "learning_rate": 9.239162515355759e-06, + "loss": 1.2147, + "step": 7740 + }, + { + "epoch": 0.78, + "grad_norm": 9.372813185319659, + "learning_rate": 9.23760652254987e-06, + "loss": 1.2414, + "step": 7745 + }, + { + "epoch": 0.78, + "grad_norm": 11.178809016800125, + "learning_rate": 9.236049071587623e-06, + "loss": 1.2296, + "step": 7750 + }, + { + "epoch": 0.78, + "grad_norm": 20.288685383202317, + "learning_rate": 9.234490163004938e-06, + "loss": 1.2335, + "step": 7755 + }, + { + "epoch": 0.78, + "grad_norm": 38.59795381962454, + "learning_rate": 9.232929797338231e-06, + "loss": 1.2161, + "step": 7760 + }, + { + "epoch": 0.78, + "grad_norm": 11.725619510243467, + "learning_rate": 9.231367975124425e-06, + "loss": 1.2327, + "step": 7765 + }, + { + "epoch": 0.78, + "grad_norm": 19.0956332935042, + "learning_rate": 9.229804696900938e-06, + "loss": 1.2172, + "step": 7770 + }, + { + "epoch": 0.78, + "grad_norm": 7.363008188568402, + "learning_rate": 9.228239963205697e-06, + "loss": 1.2235, + "step": 7775 + }, + { + "epoch": 0.78, + "grad_norm": 12.919405963264193, + "learning_rate": 9.226673774577123e-06, + "loss": 1.2012, + "step": 7780 + }, + { + "epoch": 0.78, + "grad_norm": 9.370760427375297, + "learning_rate": 9.225106131554138e-06, + "loss": 1.2742, + "step": 7785 + }, + { + "epoch": 0.79, + "grad_norm": 10.574753334317899, + "learning_rate": 9.22353703467617e-06, + "loss": 1.2251, + "step": 7790 + }, + { + "epoch": 0.79, + "grad_norm": 9.919204055521005, + "learning_rate": 9.221966484483143e-06, + "loss": 1.2137, + "step": 7795 + }, + { + "epoch": 0.79, + "grad_norm": 11.117727775224042, + "learning_rate": 9.22039448151548e-06, + "loss": 1.2701, + "step": 7800 + }, + { + "epoch": 0.79, + "grad_norm": 8.821250378982779, + "learning_rate": 9.218821026314106e-06, + "loss": 1.2434, + "step": 7805 + }, + { + "epoch": 0.79, + "grad_norm": 10.83726533956292, + "learning_rate": 9.217246119420449e-06, + "loss": 1.2364, + "step": 7810 + }, + { + "epoch": 0.79, + "grad_norm": 25.25069526526832, + "learning_rate": 9.215669761376428e-06, + "loss": 1.2415, + "step": 7815 + }, + { + "epoch": 0.79, + "grad_norm": 27.832030681337162, + "learning_rate": 9.214091952724469e-06, + "loss": 1.2075, + "step": 7820 + }, + { + "epoch": 0.79, + "grad_norm": 31.352547290909595, + "learning_rate": 9.212512694007494e-06, + "loss": 1.2429, + "step": 7825 + }, + { + "epoch": 0.79, + "grad_norm": 59.52508502860254, + "learning_rate": 9.210931985768924e-06, + "loss": 1.2324, + "step": 7830 + }, + { + "epoch": 0.79, + "grad_norm": 42.29197901671513, + "learning_rate": 9.209349828552681e-06, + "loss": 1.2324, + "step": 7835 + }, + { + "epoch": 0.79, + "grad_norm": 58.09267680215089, + "learning_rate": 9.207766222903182e-06, + "loss": 1.281, + "step": 7840 + }, + { + "epoch": 0.79, + "grad_norm": 11.99424188929102, + "learning_rate": 9.206181169365345e-06, + "loss": 1.211, + "step": 7845 + }, + { + "epoch": 0.79, + "grad_norm": 37.79255731226824, + "learning_rate": 9.204594668484584e-06, + "loss": 1.2649, + "step": 7850 + }, + { + "epoch": 0.79, + "grad_norm": 32.152434144982294, + "learning_rate": 9.203006720806813e-06, + "loss": 1.288, + "step": 7855 + }, + { + "epoch": 0.79, + "grad_norm": 11.160381783938107, + "learning_rate": 9.201417326878444e-06, + "loss": 1.2589, + "step": 7860 + }, + { + "epoch": 0.79, + "grad_norm": 24.820525431369962, + "learning_rate": 9.199826487246386e-06, + "loss": 1.2355, + "step": 7865 + }, + { + "epoch": 0.79, + "grad_norm": 10.610850830909609, + "learning_rate": 9.198234202458045e-06, + "loss": 1.2876, + "step": 7870 + }, + { + "epoch": 0.79, + "grad_norm": 12.256979389282636, + "learning_rate": 9.196640473061325e-06, + "loss": 1.2339, + "step": 7875 + }, + { + "epoch": 0.79, + "grad_norm": 7.769905681918171, + "learning_rate": 9.195045299604626e-06, + "loss": 1.237, + "step": 7880 + }, + { + "epoch": 0.79, + "grad_norm": 6.99103804897849, + "learning_rate": 9.193448682636846e-06, + "loss": 1.2418, + "step": 7885 + }, + { + "epoch": 0.8, + "grad_norm": 10.75457384662508, + "learning_rate": 9.19185062270738e-06, + "loss": 1.2511, + "step": 7890 + }, + { + "epoch": 0.8, + "grad_norm": 29.757096051723074, + "learning_rate": 9.190251120366118e-06, + "loss": 1.2335, + "step": 7895 + }, + { + "epoch": 0.8, + "grad_norm": 42.73464975981768, + "learning_rate": 9.188650176163448e-06, + "loss": 1.2636, + "step": 7900 + }, + { + "epoch": 0.8, + "grad_norm": 15.765238752923903, + "learning_rate": 9.187047790650252e-06, + "loss": 1.2848, + "step": 7905 + }, + { + "epoch": 0.8, + "grad_norm": 9.11348257275226, + "learning_rate": 9.185443964377911e-06, + "loss": 1.2391, + "step": 7910 + }, + { + "epoch": 0.8, + "grad_norm": 9.159478032563705, + "learning_rate": 9.1838386978983e-06, + "loss": 1.2208, + "step": 7915 + }, + { + "epoch": 0.8, + "grad_norm": 18.609278312997958, + "learning_rate": 9.18223199176379e-06, + "loss": 1.2904, + "step": 7920 + }, + { + "epoch": 0.8, + "grad_norm": 25.709645643557472, + "learning_rate": 9.180623846527244e-06, + "loss": 1.2852, + "step": 7925 + }, + { + "epoch": 0.8, + "grad_norm": 11.165928500885084, + "learning_rate": 9.179014262742027e-06, + "loss": 1.2282, + "step": 7930 + }, + { + "epoch": 0.8, + "grad_norm": 12.092276441061669, + "learning_rate": 9.177403240961993e-06, + "loss": 1.2677, + "step": 7935 + }, + { + "epoch": 0.8, + "grad_norm": 6.391862355681053, + "learning_rate": 9.175790781741493e-06, + "loss": 1.2481, + "step": 7940 + }, + { + "epoch": 0.8, + "grad_norm": 6.279028816900641, + "learning_rate": 9.174176885635373e-06, + "loss": 1.244, + "step": 7945 + }, + { + "epoch": 0.8, + "grad_norm": 8.923061253265221, + "learning_rate": 9.172561553198974e-06, + "loss": 1.184, + "step": 7950 + }, + { + "epoch": 0.8, + "grad_norm": 15.690954537377602, + "learning_rate": 9.17094478498813e-06, + "loss": 1.2314, + "step": 7955 + }, + { + "epoch": 0.8, + "grad_norm": 6.831051956873171, + "learning_rate": 9.16932658155917e-06, + "loss": 1.2684, + "step": 7960 + }, + { + "epoch": 0.8, + "grad_norm": 11.512000309343122, + "learning_rate": 9.167706943468916e-06, + "loss": 1.217, + "step": 7965 + }, + { + "epoch": 0.8, + "grad_norm": 9.104503401851176, + "learning_rate": 9.16608587127468e-06, + "loss": 1.1921, + "step": 7970 + }, + { + "epoch": 0.8, + "grad_norm": 31.74318596256183, + "learning_rate": 9.164463365534277e-06, + "loss": 1.2401, + "step": 7975 + }, + { + "epoch": 0.8, + "grad_norm": 10.413034406609874, + "learning_rate": 9.162839426806007e-06, + "loss": 1.2366, + "step": 7980 + }, + { + "epoch": 0.81, + "grad_norm": 16.185415863724284, + "learning_rate": 9.161214055648667e-06, + "loss": 1.2459, + "step": 7985 + }, + { + "epoch": 0.81, + "grad_norm": 16.188366129864246, + "learning_rate": 9.159587252621545e-06, + "loss": 1.2086, + "step": 7990 + }, + { + "epoch": 0.81, + "grad_norm": 17.707374452769816, + "learning_rate": 9.157959018284421e-06, + "loss": 1.1813, + "step": 7995 + }, + { + "epoch": 0.81, + "grad_norm": 16.354744724838543, + "learning_rate": 9.15632935319757e-06, + "loss": 1.2413, + "step": 8000 + }, + { + "epoch": 0.81, + "grad_norm": 22.372889845701312, + "learning_rate": 9.15469825792176e-06, + "loss": 1.2657, + "step": 8005 + }, + { + "epoch": 0.81, + "grad_norm": 9.73702140543741, + "learning_rate": 9.153065733018247e-06, + "loss": 1.2369, + "step": 8010 + }, + { + "epoch": 0.81, + "grad_norm": 10.564815088861721, + "learning_rate": 9.15143177904878e-06, + "loss": 1.2148, + "step": 8015 + }, + { + "epoch": 0.81, + "grad_norm": 16.606622739126877, + "learning_rate": 9.149796396575606e-06, + "loss": 1.2042, + "step": 8020 + }, + { + "epoch": 0.81, + "grad_norm": 12.368792590493465, + "learning_rate": 9.148159586161454e-06, + "loss": 1.2278, + "step": 8025 + }, + { + "epoch": 0.81, + "grad_norm": 13.079034496857842, + "learning_rate": 9.146521348369549e-06, + "loss": 1.236, + "step": 8030 + }, + { + "epoch": 0.81, + "grad_norm": 14.571565172228567, + "learning_rate": 9.14488168376361e-06, + "loss": 1.2529, + "step": 8035 + }, + { + "epoch": 0.81, + "grad_norm": 8.846307319390768, + "learning_rate": 9.143240592907842e-06, + "loss": 1.1753, + "step": 8040 + }, + { + "epoch": 0.81, + "grad_norm": 12.865132020941735, + "learning_rate": 9.141598076366942e-06, + "loss": 1.1693, + "step": 8045 + }, + { + "epoch": 0.81, + "grad_norm": 8.827255680234257, + "learning_rate": 9.139954134706102e-06, + "loss": 1.2104, + "step": 8050 + }, + { + "epoch": 0.81, + "grad_norm": 30.57885664997976, + "learning_rate": 9.138308768490998e-06, + "loss": 1.3031, + "step": 8055 + }, + { + "epoch": 0.81, + "grad_norm": 6.607095764384038, + "learning_rate": 9.136661978287799e-06, + "loss": 1.2254, + "step": 8060 + }, + { + "epoch": 0.81, + "grad_norm": 10.89714796529194, + "learning_rate": 9.135013764663163e-06, + "loss": 1.2506, + "step": 8065 + }, + { + "epoch": 0.81, + "grad_norm": 6.236539667441939, + "learning_rate": 9.133364128184242e-06, + "loss": 1.2887, + "step": 8070 + }, + { + "epoch": 0.81, + "grad_norm": 19.598284133793374, + "learning_rate": 9.131713069418671e-06, + "loss": 1.218, + "step": 8075 + }, + { + "epoch": 0.81, + "grad_norm": 11.89993729959541, + "learning_rate": 9.130060588934578e-06, + "loss": 1.2071, + "step": 8080 + }, + { + "epoch": 0.82, + "grad_norm": 11.775313601119954, + "learning_rate": 9.128406687300582e-06, + "loss": 1.2395, + "step": 8085 + }, + { + "epoch": 0.82, + "grad_norm": 27.60652757738097, + "learning_rate": 9.12675136508579e-06, + "loss": 1.1755, + "step": 8090 + }, + { + "epoch": 0.82, + "grad_norm": 31.59229218037076, + "learning_rate": 9.125094622859791e-06, + "loss": 1.2039, + "step": 8095 + }, + { + "epoch": 0.82, + "grad_norm": 38.65978472538587, + "learning_rate": 9.123436461192674e-06, + "loss": 1.2323, + "step": 8100 + }, + { + "epoch": 0.82, + "grad_norm": 65.80980214245291, + "learning_rate": 9.12177688065501e-06, + "loss": 1.2492, + "step": 8105 + }, + { + "epoch": 0.82, + "grad_norm": 59.28577568454184, + "learning_rate": 9.120115881817857e-06, + "loss": 1.2893, + "step": 8110 + }, + { + "epoch": 0.82, + "grad_norm": 27.688683401321523, + "learning_rate": 9.118453465252764e-06, + "loss": 1.2641, + "step": 8115 + }, + { + "epoch": 0.82, + "grad_norm": 13.70332351800936, + "learning_rate": 9.116789631531769e-06, + "loss": 1.181, + "step": 8120 + }, + { + "epoch": 0.82, + "grad_norm": 60.407514738702325, + "learning_rate": 9.115124381227392e-06, + "loss": 1.2316, + "step": 8125 + }, + { + "epoch": 0.82, + "grad_norm": 18.859070082123754, + "learning_rate": 9.113457714912646e-06, + "loss": 1.2146, + "step": 8130 + }, + { + "epoch": 0.82, + "grad_norm": 42.63500580644595, + "learning_rate": 9.111789633161029e-06, + "loss": 1.2345, + "step": 8135 + }, + { + "epoch": 0.82, + "grad_norm": 23.14253982357684, + "learning_rate": 9.110120136546528e-06, + "loss": 1.2374, + "step": 8140 + }, + { + "epoch": 0.82, + "grad_norm": 6.973157486553424, + "learning_rate": 9.108449225643612e-06, + "loss": 1.2557, + "step": 8145 + }, + { + "epoch": 0.82, + "grad_norm": 21.157793547602015, + "learning_rate": 9.10677690102724e-06, + "loss": 1.2004, + "step": 8150 + }, + { + "epoch": 0.82, + "grad_norm": 18.801783703870495, + "learning_rate": 9.105103163272862e-06, + "loss": 1.2679, + "step": 8155 + }, + { + "epoch": 0.82, + "grad_norm": 13.444503620625873, + "learning_rate": 9.103428012956406e-06, + "loss": 1.228, + "step": 8160 + }, + { + "epoch": 0.82, + "grad_norm": 26.61347370151708, + "learning_rate": 9.101751450654289e-06, + "loss": 1.2171, + "step": 8165 + }, + { + "epoch": 0.82, + "grad_norm": 27.953381232035486, + "learning_rate": 9.100073476943415e-06, + "loss": 1.2415, + "step": 8170 + }, + { + "epoch": 0.82, + "grad_norm": 20.37772713019263, + "learning_rate": 9.098394092401174e-06, + "loss": 1.2158, + "step": 8175 + }, + { + "epoch": 0.82, + "grad_norm": 16.193271631825727, + "learning_rate": 9.096713297605439e-06, + "loss": 1.2632, + "step": 8180 + }, + { + "epoch": 0.83, + "grad_norm": 11.45725368852007, + "learning_rate": 9.095031093134574e-06, + "loss": 1.2415, + "step": 8185 + }, + { + "epoch": 0.83, + "grad_norm": 25.74841599073145, + "learning_rate": 9.093347479567419e-06, + "loss": 1.2874, + "step": 8190 + }, + { + "epoch": 0.83, + "grad_norm": 7.071323002953288, + "learning_rate": 9.091662457483305e-06, + "loss": 1.2536, + "step": 8195 + }, + { + "epoch": 0.83, + "grad_norm": 23.679119240128742, + "learning_rate": 9.08997602746205e-06, + "loss": 1.2502, + "step": 8200 + }, + { + "epoch": 0.83, + "grad_norm": 11.481301766638277, + "learning_rate": 9.088288190083949e-06, + "loss": 1.2335, + "step": 8205 + }, + { + "epoch": 0.83, + "grad_norm": 11.789345499913125, + "learning_rate": 9.086598945929787e-06, + "loss": 1.2299, + "step": 8210 + }, + { + "epoch": 0.83, + "grad_norm": 31.442603486590052, + "learning_rate": 9.08490829558083e-06, + "loss": 1.2039, + "step": 8215 + }, + { + "epoch": 0.83, + "grad_norm": 31.453182996449176, + "learning_rate": 9.083216239618831e-06, + "loss": 1.2333, + "step": 8220 + }, + { + "epoch": 0.83, + "grad_norm": 16.40557681087287, + "learning_rate": 9.081522778626022e-06, + "loss": 1.2445, + "step": 8225 + }, + { + "epoch": 0.83, + "grad_norm": 41.965222999552864, + "learning_rate": 9.079827913185126e-06, + "loss": 1.2503, + "step": 8230 + }, + { + "epoch": 0.83, + "grad_norm": 33.22209321149795, + "learning_rate": 9.07813164387934e-06, + "loss": 1.2313, + "step": 8235 + }, + { + "epoch": 0.83, + "grad_norm": 81.53473188534082, + "learning_rate": 9.07643397129235e-06, + "loss": 1.2446, + "step": 8240 + }, + { + "epoch": 0.83, + "grad_norm": 120.40449446440331, + "learning_rate": 9.074734896008326e-06, + "loss": 1.2962, + "step": 8245 + }, + { + "epoch": 0.83, + "grad_norm": 65.36493591941586, + "learning_rate": 9.073034418611915e-06, + "loss": 1.2235, + "step": 8250 + }, + { + "epoch": 0.83, + "grad_norm": 40.4010764969845, + "learning_rate": 9.071332539688248e-06, + "loss": 1.2348, + "step": 8255 + }, + { + "epoch": 0.83, + "grad_norm": 9.64350332332248, + "learning_rate": 9.069629259822947e-06, + "loss": 1.2535, + "step": 8260 + }, + { + "epoch": 0.83, + "grad_norm": 46.141321205756675, + "learning_rate": 9.067924579602102e-06, + "loss": 1.2254, + "step": 8265 + }, + { + "epoch": 0.83, + "grad_norm": 22.800453773851473, + "learning_rate": 9.066218499612296e-06, + "loss": 1.2383, + "step": 8270 + }, + { + "epoch": 0.83, + "grad_norm": 32.825239418145735, + "learning_rate": 9.064511020440587e-06, + "loss": 1.1774, + "step": 8275 + }, + { + "epoch": 0.83, + "grad_norm": 13.849725814097734, + "learning_rate": 9.062802142674519e-06, + "loss": 1.2256, + "step": 8280 + }, + { + "epoch": 0.84, + "grad_norm": 36.209761027309284, + "learning_rate": 9.061091866902112e-06, + "loss": 1.171, + "step": 8285 + }, + { + "epoch": 0.84, + "grad_norm": 25.1646410223101, + "learning_rate": 9.059380193711873e-06, + "loss": 1.277, + "step": 8290 + }, + { + "epoch": 0.84, + "grad_norm": 17.127930781671427, + "learning_rate": 9.057667123692788e-06, + "loss": 1.2592, + "step": 8295 + }, + { + "epoch": 0.84, + "grad_norm": 17.664642683604065, + "learning_rate": 9.05595265743432e-06, + "loss": 1.2532, + "step": 8300 + }, + { + "epoch": 0.84, + "grad_norm": 11.241837976622543, + "learning_rate": 9.054236795526416e-06, + "loss": 1.2686, + "step": 8305 + }, + { + "epoch": 0.84, + "grad_norm": 8.854922348067305, + "learning_rate": 9.052519538559505e-06, + "loss": 1.247, + "step": 8310 + }, + { + "epoch": 0.84, + "grad_norm": 8.39256460774842, + "learning_rate": 9.050800887124492e-06, + "loss": 1.2109, + "step": 8315 + }, + { + "epoch": 0.84, + "grad_norm": 8.011338239652282, + "learning_rate": 9.04908084181276e-06, + "loss": 1.242, + "step": 8320 + }, + { + "epoch": 0.84, + "grad_norm": 9.290562496694669, + "learning_rate": 9.047359403216177e-06, + "loss": 1.2043, + "step": 8325 + }, + { + "epoch": 0.84, + "grad_norm": 20.502784890291515, + "learning_rate": 9.04563657192709e-06, + "loss": 1.2437, + "step": 8330 + }, + { + "epoch": 0.84, + "grad_norm": 19.85465910133973, + "learning_rate": 9.043912348538324e-06, + "loss": 1.2447, + "step": 8335 + }, + { + "epoch": 0.84, + "grad_norm": 21.084864859510795, + "learning_rate": 9.04218673364318e-06, + "loss": 1.2518, + "step": 8340 + }, + { + "epoch": 0.84, + "grad_norm": 8.3528319487047, + "learning_rate": 9.040459727835442e-06, + "loss": 1.1677, + "step": 8345 + }, + { + "epoch": 0.84, + "grad_norm": 9.499465506508274, + "learning_rate": 9.038731331709371e-06, + "loss": 1.2745, + "step": 8350 + }, + { + "epoch": 0.84, + "grad_norm": 22.8566905996232, + "learning_rate": 9.037001545859706e-06, + "loss": 1.226, + "step": 8355 + }, + { + "epoch": 0.84, + "grad_norm": 20.300693565803126, + "learning_rate": 9.035270370881666e-06, + "loss": 1.1771, + "step": 8360 + }, + { + "epoch": 0.84, + "grad_norm": 22.637425395129974, + "learning_rate": 9.033537807370943e-06, + "loss": 1.1876, + "step": 8365 + }, + { + "epoch": 0.84, + "grad_norm": 91.96974249962282, + "learning_rate": 9.031803855923715e-06, + "loss": 1.2614, + "step": 8370 + }, + { + "epoch": 0.84, + "grad_norm": 69.88547886649809, + "learning_rate": 9.03006851713663e-06, + "loss": 1.2672, + "step": 8375 + }, + { + "epoch": 0.84, + "grad_norm": 10.499419152260945, + "learning_rate": 9.028331791606819e-06, + "loss": 1.1971, + "step": 8380 + }, + { + "epoch": 0.85, + "grad_norm": 18.395960923241695, + "learning_rate": 9.026593679931885e-06, + "loss": 1.2513, + "step": 8385 + }, + { + "epoch": 0.85, + "grad_norm": 11.08015527528984, + "learning_rate": 9.02485418270991e-06, + "loss": 1.2314, + "step": 8390 + }, + { + "epoch": 0.85, + "grad_norm": 23.025894946322406, + "learning_rate": 9.023113300539457e-06, + "loss": 1.2254, + "step": 8395 + }, + { + "epoch": 0.85, + "grad_norm": 93.79636474761914, + "learning_rate": 9.021371034019559e-06, + "loss": 1.2282, + "step": 8400 + }, + { + "epoch": 0.85, + "grad_norm": 28.079331258559648, + "learning_rate": 9.019627383749728e-06, + "loss": 1.2048, + "step": 8405 + }, + { + "epoch": 0.85, + "grad_norm": 60.348869043870465, + "learning_rate": 9.017882350329955e-06, + "loss": 1.3066, + "step": 8410 + }, + { + "epoch": 0.85, + "grad_norm": 33.47748076849988, + "learning_rate": 9.016135934360703e-06, + "loss": 1.2841, + "step": 8415 + }, + { + "epoch": 0.85, + "grad_norm": 32.28164414314165, + "learning_rate": 9.014388136442912e-06, + "loss": 1.2336, + "step": 8420 + }, + { + "epoch": 0.85, + "grad_norm": 12.344102265445358, + "learning_rate": 9.012638957177994e-06, + "loss": 1.2027, + "step": 8425 + }, + { + "epoch": 0.85, + "grad_norm": 7.961658100350237, + "learning_rate": 9.010888397167848e-06, + "loss": 1.227, + "step": 8430 + }, + { + "epoch": 0.85, + "grad_norm": 30.993547565692637, + "learning_rate": 9.009136457014833e-06, + "loss": 1.1944, + "step": 8435 + }, + { + "epoch": 0.85, + "grad_norm": 10.533667423823333, + "learning_rate": 9.007383137321793e-06, + "loss": 1.1939, + "step": 8440 + }, + { + "epoch": 0.85, + "grad_norm": 20.641505087389255, + "learning_rate": 9.005628438692042e-06, + "loss": 1.1807, + "step": 8445 + }, + { + "epoch": 0.85, + "grad_norm": 27.084097419574157, + "learning_rate": 9.003872361729371e-06, + "loss": 1.2487, + "step": 8450 + }, + { + "epoch": 0.85, + "grad_norm": 55.487939119016765, + "learning_rate": 9.002114907038045e-06, + "loss": 1.2511, + "step": 8455 + }, + { + "epoch": 0.85, + "grad_norm": 38.647380685218614, + "learning_rate": 9.000356075222803e-06, + "loss": 1.2469, + "step": 8460 + }, + { + "epoch": 0.85, + "grad_norm": 18.655068951279034, + "learning_rate": 8.998595866888855e-06, + "loss": 1.2072, + "step": 8465 + }, + { + "epoch": 0.85, + "grad_norm": 14.370284045382212, + "learning_rate": 8.996834282641889e-06, + "loss": 1.2035, + "step": 8470 + }, + { + "epoch": 0.85, + "grad_norm": 10.700022848670333, + "learning_rate": 8.995071323088063e-06, + "loss": 1.1962, + "step": 8475 + }, + { + "epoch": 0.85, + "grad_norm": 19.48844904276957, + "learning_rate": 8.99330698883401e-06, + "loss": 1.2056, + "step": 8480 + }, + { + "epoch": 0.86, + "grad_norm": 20.600346419580166, + "learning_rate": 8.991541280486838e-06, + "loss": 1.2016, + "step": 8485 + }, + { + "epoch": 0.86, + "grad_norm": 8.50302243666553, + "learning_rate": 8.989774198654123e-06, + "loss": 1.2685, + "step": 8490 + }, + { + "epoch": 0.86, + "grad_norm": 8.320021532955366, + "learning_rate": 8.988005743943916e-06, + "loss": 1.228, + "step": 8495 + }, + { + "epoch": 0.86, + "grad_norm": 8.707022281835492, + "learning_rate": 8.986235916964742e-06, + "loss": 1.2113, + "step": 8500 + }, + { + "epoch": 0.86, + "grad_norm": 11.868031314799666, + "learning_rate": 8.984464718325596e-06, + "loss": 1.2128, + "step": 8505 + }, + { + "epoch": 0.86, + "grad_norm": 14.321884298163736, + "learning_rate": 8.982692148635947e-06, + "loss": 1.1707, + "step": 8510 + }, + { + "epoch": 0.86, + "grad_norm": 11.031382634589063, + "learning_rate": 8.980918208505734e-06, + "loss": 1.1996, + "step": 8515 + }, + { + "epoch": 0.86, + "grad_norm": 7.593011499567609, + "learning_rate": 8.979142898545366e-06, + "loss": 1.2297, + "step": 8520 + }, + { + "epoch": 0.86, + "grad_norm": 17.85682476258015, + "learning_rate": 8.977366219365728e-06, + "loss": 1.2096, + "step": 8525 + }, + { + "epoch": 0.86, + "grad_norm": 17.337151667033524, + "learning_rate": 8.975588171578172e-06, + "loss": 1.2184, + "step": 8530 + }, + { + "epoch": 0.86, + "grad_norm": 19.574082092030107, + "learning_rate": 8.973808755794525e-06, + "loss": 1.2287, + "step": 8535 + }, + { + "epoch": 0.86, + "grad_norm": 10.340481711809604, + "learning_rate": 8.972027972627078e-06, + "loss": 1.1706, + "step": 8540 + }, + { + "epoch": 0.86, + "grad_norm": 16.560002968647215, + "learning_rate": 8.970245822688601e-06, + "loss": 1.2048, + "step": 8545 + }, + { + "epoch": 0.86, + "grad_norm": 14.707635187199964, + "learning_rate": 8.968462306592328e-06, + "loss": 1.195, + "step": 8550 + }, + { + "epoch": 0.86, + "grad_norm": 11.796462917778843, + "learning_rate": 8.966677424951966e-06, + "loss": 1.168, + "step": 8555 + }, + { + "epoch": 0.86, + "grad_norm": 6.260137171097879, + "learning_rate": 8.964891178381691e-06, + "loss": 1.2253, + "step": 8560 + }, + { + "epoch": 0.86, + "grad_norm": 25.65853338757763, + "learning_rate": 8.963103567496148e-06, + "loss": 1.344, + "step": 8565 + }, + { + "epoch": 0.86, + "grad_norm": 7.153776235558424, + "learning_rate": 8.961314592910452e-06, + "loss": 1.1815, + "step": 8570 + }, + { + "epoch": 0.86, + "grad_norm": 21.907940778880246, + "learning_rate": 8.959524255240189e-06, + "loss": 1.2651, + "step": 8575 + }, + { + "epoch": 0.87, + "grad_norm": 22.521581700435902, + "learning_rate": 8.95773255510141e-06, + "loss": 1.1989, + "step": 8580 + }, + { + "epoch": 0.87, + "grad_norm": 7.53693385733085, + "learning_rate": 8.95593949311064e-06, + "loss": 1.1989, + "step": 8585 + }, + { + "epoch": 0.87, + "grad_norm": 34.65238445292161, + "learning_rate": 8.95414506988487e-06, + "loss": 1.2189, + "step": 8590 + }, + { + "epoch": 0.87, + "grad_norm": 13.09106999665172, + "learning_rate": 8.952349286041556e-06, + "loss": 1.2075, + "step": 8595 + }, + { + "epoch": 0.87, + "grad_norm": 46.468132754859965, + "learning_rate": 8.950552142198628e-06, + "loss": 1.1656, + "step": 8600 + }, + { + "epoch": 0.87, + "grad_norm": 26.328771913319503, + "learning_rate": 8.948753638974482e-06, + "loss": 1.2187, + "step": 8605 + }, + { + "epoch": 0.87, + "grad_norm": 9.830332243757532, + "learning_rate": 8.94695377698798e-06, + "loss": 1.2091, + "step": 8610 + }, + { + "epoch": 0.87, + "grad_norm": 53.041771160912965, + "learning_rate": 8.945152556858452e-06, + "loss": 1.2352, + "step": 8615 + }, + { + "epoch": 0.87, + "grad_norm": 49.80951360537134, + "learning_rate": 8.9433499792057e-06, + "loss": 1.262, + "step": 8620 + }, + { + "epoch": 0.87, + "grad_norm": 95.03977633541976, + "learning_rate": 8.941546044649985e-06, + "loss": 1.2122, + "step": 8625 + }, + { + "epoch": 0.87, + "grad_norm": 18.152155839898636, + "learning_rate": 8.93974075381204e-06, + "loss": 1.1913, + "step": 8630 + }, + { + "epoch": 0.87, + "grad_norm": 16.884864637091642, + "learning_rate": 8.937934107313065e-06, + "loss": 1.1907, + "step": 8635 + }, + { + "epoch": 0.87, + "grad_norm": 21.758968553236418, + "learning_rate": 8.936126105774725e-06, + "loss": 1.2776, + "step": 8640 + }, + { + "epoch": 0.87, + "grad_norm": 65.77897117650656, + "learning_rate": 8.93431674981915e-06, + "loss": 1.2564, + "step": 8645 + }, + { + "epoch": 0.87, + "grad_norm": 37.48469408393138, + "learning_rate": 8.93250604006894e-06, + "loss": 1.2701, + "step": 8650 + }, + { + "epoch": 0.87, + "grad_norm": 20.5819566041243, + "learning_rate": 8.930693977147157e-06, + "loss": 1.2082, + "step": 8655 + }, + { + "epoch": 0.87, + "grad_norm": 11.838012542117974, + "learning_rate": 8.928880561677333e-06, + "loss": 1.2151, + "step": 8660 + }, + { + "epoch": 0.87, + "grad_norm": 28.934816659757903, + "learning_rate": 8.927065794283458e-06, + "loss": 1.2108, + "step": 8665 + }, + { + "epoch": 0.87, + "grad_norm": 30.015851158231182, + "learning_rate": 8.925249675589995e-06, + "loss": 1.207, + "step": 8670 + }, + { + "epoch": 0.87, + "grad_norm": 18.05708344697593, + "learning_rate": 8.923432206221868e-06, + "loss": 1.2508, + "step": 8675 + }, + { + "epoch": 0.88, + "grad_norm": 13.071404346946036, + "learning_rate": 8.921613386804465e-06, + "loss": 1.25, + "step": 8680 + }, + { + "epoch": 0.88, + "grad_norm": 10.1873966208491, + "learning_rate": 8.919793217963642e-06, + "loss": 1.1895, + "step": 8685 + }, + { + "epoch": 0.88, + "grad_norm": 86.5409264169609, + "learning_rate": 8.917971700325717e-06, + "loss": 1.2177, + "step": 8690 + }, + { + "epoch": 0.88, + "grad_norm": 48.8292154136072, + "learning_rate": 8.91614883451747e-06, + "loss": 1.2553, + "step": 8695 + }, + { + "epoch": 0.88, + "grad_norm": 10.72765388759314, + "learning_rate": 8.914324621166151e-06, + "loss": 1.2569, + "step": 8700 + }, + { + "epoch": 0.88, + "grad_norm": 14.99381671321517, + "learning_rate": 8.912499060899469e-06, + "loss": 1.2478, + "step": 8705 + }, + { + "epoch": 0.88, + "grad_norm": 23.038501634929837, + "learning_rate": 8.910672154345596e-06, + "loss": 1.2592, + "step": 8710 + }, + { + "epoch": 0.88, + "grad_norm": 86.91259519907479, + "learning_rate": 8.90884390213317e-06, + "loss": 1.2319, + "step": 8715 + }, + { + "epoch": 0.88, + "grad_norm": 72.00559372046911, + "learning_rate": 8.907014304891291e-06, + "loss": 1.1971, + "step": 8720 + }, + { + "epoch": 0.88, + "grad_norm": 52.695079478540215, + "learning_rate": 8.905183363249521e-06, + "loss": 1.2671, + "step": 8725 + }, + { + "epoch": 0.88, + "grad_norm": 21.294026987914467, + "learning_rate": 8.903351077837885e-06, + "loss": 1.2787, + "step": 8730 + }, + { + "epoch": 0.88, + "grad_norm": 26.77369152335356, + "learning_rate": 8.901517449286871e-06, + "loss": 1.241, + "step": 8735 + }, + { + "epoch": 0.88, + "grad_norm": 50.896197869777964, + "learning_rate": 8.89968247822743e-06, + "loss": 1.2331, + "step": 8740 + }, + { + "epoch": 0.88, + "grad_norm": 31.150932876356467, + "learning_rate": 8.897846165290974e-06, + "loss": 1.2359, + "step": 8745 + }, + { + "epoch": 0.88, + "grad_norm": 17.45636798929398, + "learning_rate": 8.896008511109373e-06, + "loss": 1.2213, + "step": 8750 + }, + { + "epoch": 0.88, + "grad_norm": 34.80863089297944, + "learning_rate": 8.894169516314966e-06, + "loss": 1.2084, + "step": 8755 + }, + { + "epoch": 0.88, + "grad_norm": 5.824283701844934, + "learning_rate": 8.892329181540547e-06, + "loss": 1.2012, + "step": 8760 + }, + { + "epoch": 0.88, + "grad_norm": 20.875788115200002, + "learning_rate": 8.890487507419375e-06, + "loss": 1.1967, + "step": 8765 + }, + { + "epoch": 0.88, + "grad_norm": 10.419811862747704, + "learning_rate": 8.888644494585167e-06, + "loss": 1.2338, + "step": 8770 + }, + { + "epoch": 0.88, + "grad_norm": 11.537355999648662, + "learning_rate": 8.886800143672103e-06, + "loss": 1.2357, + "step": 8775 + }, + { + "epoch": 0.89, + "grad_norm": 16.75738643127998, + "learning_rate": 8.884954455314822e-06, + "loss": 1.2745, + "step": 8780 + }, + { + "epoch": 0.89, + "grad_norm": 7.229991642182542, + "learning_rate": 8.883107430148422e-06, + "loss": 1.2192, + "step": 8785 + }, + { + "epoch": 0.89, + "grad_norm": 9.311212308408923, + "learning_rate": 8.881259068808466e-06, + "loss": 1.239, + "step": 8790 + }, + { + "epoch": 0.89, + "grad_norm": 9.739353701546275, + "learning_rate": 8.879409371930969e-06, + "loss": 1.1639, + "step": 8795 + }, + { + "epoch": 0.89, + "grad_norm": 20.381728011219263, + "learning_rate": 8.877558340152414e-06, + "loss": 1.2314, + "step": 8800 + }, + { + "epoch": 0.89, + "grad_norm": 10.386860710130472, + "learning_rate": 8.875705974109738e-06, + "loss": 1.2133, + "step": 8805 + }, + { + "epoch": 0.89, + "grad_norm": 19.316580467726308, + "learning_rate": 8.873852274440337e-06, + "loss": 1.234, + "step": 8810 + }, + { + "epoch": 0.89, + "grad_norm": 6.727198524509611, + "learning_rate": 8.87199724178207e-06, + "loss": 1.2077, + "step": 8815 + }, + { + "epoch": 0.89, + "grad_norm": 9.958332741842913, + "learning_rate": 8.87014087677325e-06, + "loss": 1.2547, + "step": 8820 + }, + { + "epoch": 0.89, + "grad_norm": 13.067840849747313, + "learning_rate": 8.868283180052648e-06, + "loss": 1.227, + "step": 8825 + }, + { + "epoch": 0.89, + "grad_norm": 23.86742366801994, + "learning_rate": 8.866424152259501e-06, + "loss": 1.2002, + "step": 8830 + }, + { + "epoch": 0.89, + "grad_norm": 10.579342887032047, + "learning_rate": 8.864563794033496e-06, + "loss": 1.226, + "step": 8835 + }, + { + "epoch": 0.89, + "grad_norm": 32.68986457966961, + "learning_rate": 8.86270210601478e-06, + "loss": 1.2414, + "step": 8840 + }, + { + "epoch": 0.89, + "grad_norm": 14.087093156037234, + "learning_rate": 8.860839088843958e-06, + "loss": 1.2746, + "step": 8845 + }, + { + "epoch": 0.89, + "grad_norm": 22.212447048092486, + "learning_rate": 8.858974743162095e-06, + "loss": 1.2399, + "step": 8850 + }, + { + "epoch": 0.89, + "grad_norm": 9.496678463979974, + "learning_rate": 8.857109069610706e-06, + "loss": 1.1913, + "step": 8855 + }, + { + "epoch": 0.89, + "grad_norm": 16.26558139753938, + "learning_rate": 8.85524206883177e-06, + "loss": 1.2259, + "step": 8860 + }, + { + "epoch": 0.89, + "grad_norm": 15.567739935203726, + "learning_rate": 8.853373741467724e-06, + "loss": 1.2014, + "step": 8865 + }, + { + "epoch": 0.89, + "grad_norm": 7.243019396023581, + "learning_rate": 8.85150408816145e-06, + "loss": 1.1706, + "step": 8870 + }, + { + "epoch": 0.89, + "grad_norm": 12.90760732651893, + "learning_rate": 8.8496331095563e-06, + "loss": 1.231, + "step": 8875 + }, + { + "epoch": 0.9, + "grad_norm": 6.272240575737805, + "learning_rate": 8.847760806296072e-06, + "loss": 1.1923, + "step": 8880 + }, + { + "epoch": 0.9, + "grad_norm": 9.876889907445083, + "learning_rate": 8.845887179025029e-06, + "loss": 1.2276, + "step": 8885 + }, + { + "epoch": 0.9, + "grad_norm": 9.256098634241363, + "learning_rate": 8.844012228387879e-06, + "loss": 1.2183, + "step": 8890 + }, + { + "epoch": 0.9, + "grad_norm": 8.472058437265845, + "learning_rate": 8.84213595502979e-06, + "loss": 1.2006, + "step": 8895 + }, + { + "epoch": 0.9, + "grad_norm": 10.871714887277207, + "learning_rate": 8.840258359596392e-06, + "loss": 1.2144, + "step": 8900 + }, + { + "epoch": 0.9, + "grad_norm": 9.80383694900924, + "learning_rate": 8.83837944273376e-06, + "loss": 1.2478, + "step": 8905 + }, + { + "epoch": 0.9, + "grad_norm": 6.231795176525273, + "learning_rate": 8.836499205088426e-06, + "loss": 1.1868, + "step": 8910 + }, + { + "epoch": 0.9, + "grad_norm": 10.804885651323739, + "learning_rate": 8.834617647307383e-06, + "loss": 1.24, + "step": 8915 + }, + { + "epoch": 0.9, + "grad_norm": 6.386285962788818, + "learning_rate": 8.832734770038068e-06, + "loss": 1.2363, + "step": 8920 + }, + { + "epoch": 0.9, + "grad_norm": 7.478128391048558, + "learning_rate": 8.830850573928378e-06, + "loss": 1.2555, + "step": 8925 + }, + { + "epoch": 0.9, + "grad_norm": 8.276309193456074, + "learning_rate": 8.828965059626666e-06, + "loss": 1.2382, + "step": 8930 + }, + { + "epoch": 0.9, + "grad_norm": 20.538374321553064, + "learning_rate": 8.827078227781734e-06, + "loss": 1.2285, + "step": 8935 + }, + { + "epoch": 0.9, + "grad_norm": 12.22973284606407, + "learning_rate": 8.825190079042839e-06, + "loss": 1.1935, + "step": 8940 + }, + { + "epoch": 0.9, + "grad_norm": 10.121712987282244, + "learning_rate": 8.823300614059692e-06, + "loss": 1.2082, + "step": 8945 + }, + { + "epoch": 0.9, + "grad_norm": 18.410891172450526, + "learning_rate": 8.821409833482454e-06, + "loss": 1.1824, + "step": 8950 + }, + { + "epoch": 0.9, + "grad_norm": 23.697063264981985, + "learning_rate": 8.819517737961742e-06, + "loss": 1.2121, + "step": 8955 + }, + { + "epoch": 0.9, + "grad_norm": 22.61499589372635, + "learning_rate": 8.817624328148627e-06, + "loss": 1.2562, + "step": 8960 + }, + { + "epoch": 0.9, + "grad_norm": 62.12866389191985, + "learning_rate": 8.815729604694624e-06, + "loss": 1.24, + "step": 8965 + }, + { + "epoch": 0.9, + "grad_norm": 15.910338303371752, + "learning_rate": 8.813833568251708e-06, + "loss": 1.2244, + "step": 8970 + }, + { + "epoch": 0.9, + "grad_norm": 40.97912228130415, + "learning_rate": 8.811936219472306e-06, + "loss": 1.2409, + "step": 8975 + }, + { + "epoch": 0.91, + "grad_norm": 185.10887018584373, + "learning_rate": 8.810037559009292e-06, + "loss": 1.2355, + "step": 8980 + }, + { + "epoch": 0.91, + "grad_norm": 124.19054332408055, + "learning_rate": 8.808137587515992e-06, + "loss": 1.2855, + "step": 8985 + }, + { + "epoch": 0.91, + "grad_norm": 68.21812846024108, + "learning_rate": 8.806236305646185e-06, + "loss": 1.3029, + "step": 8990 + }, + { + "epoch": 0.91, + "grad_norm": 26.761045112155376, + "learning_rate": 8.8043337140541e-06, + "loss": 1.3163, + "step": 8995 + }, + { + "epoch": 0.91, + "grad_norm": 49.033990563804736, + "learning_rate": 8.80242981339442e-06, + "loss": 1.2956, + "step": 9000 + }, + { + "epoch": 0.91, + "grad_norm": 21.560566087023556, + "learning_rate": 8.800524604322272e-06, + "loss": 1.2232, + "step": 9005 + }, + { + "epoch": 0.91, + "grad_norm": 16.567735923502728, + "learning_rate": 8.798618087493237e-06, + "loss": 1.2081, + "step": 9010 + }, + { + "epoch": 0.91, + "grad_norm": 14.993297853077632, + "learning_rate": 8.796710263563345e-06, + "loss": 1.2857, + "step": 9015 + }, + { + "epoch": 0.91, + "grad_norm": 13.15594183914007, + "learning_rate": 8.794801133189079e-06, + "loss": 1.2217, + "step": 9020 + }, + { + "epoch": 0.91, + "grad_norm": 7.100897474881501, + "learning_rate": 8.792890697027367e-06, + "loss": 1.1967, + "step": 9025 + }, + { + "epoch": 0.91, + "grad_norm": 14.349787053381577, + "learning_rate": 8.79097895573559e-06, + "loss": 1.2492, + "step": 9030 + }, + { + "epoch": 0.91, + "grad_norm": 54.46800846028436, + "learning_rate": 8.789065909971574e-06, + "loss": 1.2935, + "step": 9035 + }, + { + "epoch": 0.91, + "grad_norm": 58.03382374458775, + "learning_rate": 8.787151560393597e-06, + "loss": 1.2109, + "step": 9040 + }, + { + "epoch": 0.91, + "grad_norm": 15.560042500814026, + "learning_rate": 8.785235907660385e-06, + "loss": 1.2293, + "step": 9045 + }, + { + "epoch": 0.91, + "grad_norm": 7.94549857333933, + "learning_rate": 8.783318952431114e-06, + "loss": 1.2127, + "step": 9050 + }, + { + "epoch": 0.91, + "grad_norm": 19.386889362913447, + "learning_rate": 8.781400695365405e-06, + "loss": 1.235, + "step": 9055 + }, + { + "epoch": 0.91, + "grad_norm": 10.863732384837983, + "learning_rate": 8.779481137123327e-06, + "loss": 1.2258, + "step": 9060 + }, + { + "epoch": 0.91, + "grad_norm": 34.821485508785, + "learning_rate": 8.7775602783654e-06, + "loss": 1.2098, + "step": 9065 + }, + { + "epoch": 0.91, + "grad_norm": 21.310302561173884, + "learning_rate": 8.77563811975259e-06, + "loss": 1.2393, + "step": 9070 + }, + { + "epoch": 0.91, + "grad_norm": 7.004806784226558, + "learning_rate": 8.77371466194631e-06, + "loss": 1.2415, + "step": 9075 + }, + { + "epoch": 0.92, + "grad_norm": 23.93245509241067, + "learning_rate": 8.771789905608416e-06, + "loss": 1.2573, + "step": 9080 + }, + { + "epoch": 0.92, + "grad_norm": 25.861977789777615, + "learning_rate": 8.769863851401219e-06, + "loss": 1.2743, + "step": 9085 + }, + { + "epoch": 0.92, + "grad_norm": 10.755736619167987, + "learning_rate": 8.767936499987473e-06, + "loss": 1.2376, + "step": 9090 + }, + { + "epoch": 0.92, + "grad_norm": 17.211424345701793, + "learning_rate": 8.766007852030373e-06, + "loss": 1.2578, + "step": 9095 + }, + { + "epoch": 0.92, + "grad_norm": 7.148920549885833, + "learning_rate": 8.764077908193571e-06, + "loss": 1.1645, + "step": 9100 + }, + { + "epoch": 0.92, + "grad_norm": 7.930321153879788, + "learning_rate": 8.762146669141156e-06, + "loss": 1.1894, + "step": 9105 + }, + { + "epoch": 0.92, + "grad_norm": 7.029191550269419, + "learning_rate": 8.760214135537663e-06, + "loss": 1.1963, + "step": 9110 + }, + { + "epoch": 0.92, + "grad_norm": 9.684012600037251, + "learning_rate": 8.758280308048079e-06, + "loss": 1.2488, + "step": 9115 + }, + { + "epoch": 0.92, + "grad_norm": 25.043411822630535, + "learning_rate": 8.75634518733783e-06, + "loss": 1.2259, + "step": 9120 + }, + { + "epoch": 0.92, + "grad_norm": 20.978717883249782, + "learning_rate": 8.754408774072791e-06, + "loss": 1.2128, + "step": 9125 + }, + { + "epoch": 0.92, + "grad_norm": 13.364554753385073, + "learning_rate": 8.752471068919277e-06, + "loss": 1.274, + "step": 9130 + }, + { + "epoch": 0.92, + "grad_norm": 7.852807191673204, + "learning_rate": 8.750532072544053e-06, + "loss": 1.269, + "step": 9135 + }, + { + "epoch": 0.92, + "grad_norm": 6.0768275552660755, + "learning_rate": 8.748591785614327e-06, + "loss": 1.2448, + "step": 9140 + }, + { + "epoch": 0.92, + "grad_norm": 7.895062035573548, + "learning_rate": 8.746650208797745e-06, + "loss": 1.237, + "step": 9145 + }, + { + "epoch": 0.92, + "grad_norm": 18.688327529636332, + "learning_rate": 8.744707342762406e-06, + "loss": 1.2442, + "step": 9150 + }, + { + "epoch": 0.92, + "grad_norm": 11.773096278719505, + "learning_rate": 8.742763188176845e-06, + "loss": 1.2289, + "step": 9155 + }, + { + "epoch": 0.92, + "grad_norm": 7.539377156107374, + "learning_rate": 8.740817745710049e-06, + "loss": 1.2787, + "step": 9160 + }, + { + "epoch": 0.92, + "grad_norm": 30.669054910974296, + "learning_rate": 8.738871016031438e-06, + "loss": 1.1643, + "step": 9165 + }, + { + "epoch": 0.92, + "grad_norm": 13.535314129564306, + "learning_rate": 8.73692299981088e-06, + "loss": 1.2147, + "step": 9170 + }, + { + "epoch": 0.93, + "grad_norm": 8.407433647498465, + "learning_rate": 8.734973697718689e-06, + "loss": 1.241, + "step": 9175 + }, + { + "epoch": 0.93, + "grad_norm": 11.41579698367922, + "learning_rate": 8.733023110425616e-06, + "loss": 1.2248, + "step": 9180 + }, + { + "epoch": 0.93, + "grad_norm": 17.82990665062719, + "learning_rate": 8.731071238602855e-06, + "loss": 1.2382, + "step": 9185 + }, + { + "epoch": 0.93, + "grad_norm": 6.384785763412171, + "learning_rate": 8.729118082922044e-06, + "loss": 1.2981, + "step": 9190 + }, + { + "epoch": 0.93, + "grad_norm": 12.22399926884024, + "learning_rate": 8.727163644055263e-06, + "loss": 1.2077, + "step": 9195 + }, + { + "epoch": 0.93, + "grad_norm": 10.248519938538731, + "learning_rate": 8.725207922675032e-06, + "loss": 1.1737, + "step": 9200 + }, + { + "epoch": 0.93, + "grad_norm": 20.2802850223925, + "learning_rate": 8.723250919454313e-06, + "loss": 1.2105, + "step": 9205 + }, + { + "epoch": 0.93, + "grad_norm": 9.164284527748045, + "learning_rate": 8.721292635066509e-06, + "loss": 1.1661, + "step": 9210 + }, + { + "epoch": 0.93, + "grad_norm": 7.118906619701703, + "learning_rate": 8.719333070185463e-06, + "loss": 1.1894, + "step": 9215 + }, + { + "epoch": 0.93, + "grad_norm": 8.555578503418722, + "learning_rate": 8.71737222548546e-06, + "loss": 1.2029, + "step": 9220 + }, + { + "epoch": 0.93, + "grad_norm": 10.287993256735366, + "learning_rate": 8.715410101641225e-06, + "loss": 1.239, + "step": 9225 + }, + { + "epoch": 0.93, + "grad_norm": 13.616509952185027, + "learning_rate": 8.71344669932792e-06, + "loss": 1.2277, + "step": 9230 + }, + { + "epoch": 0.93, + "grad_norm": 10.266653583169361, + "learning_rate": 8.711482019221157e-06, + "loss": 1.2479, + "step": 9235 + }, + { + "epoch": 0.93, + "grad_norm": 12.831512190806212, + "learning_rate": 8.709516061996973e-06, + "loss": 1.2383, + "step": 9240 + }, + { + "epoch": 0.93, + "grad_norm": 7.612810498848723, + "learning_rate": 8.707548828331856e-06, + "loss": 1.1593, + "step": 9245 + }, + { + "epoch": 0.93, + "grad_norm": 12.042277080028542, + "learning_rate": 8.705580318902728e-06, + "loss": 1.2517, + "step": 9250 + }, + { + "epoch": 0.93, + "grad_norm": 7.223576413834543, + "learning_rate": 8.703610534386952e-06, + "loss": 1.2395, + "step": 9255 + }, + { + "epoch": 0.93, + "grad_norm": 14.846067819244304, + "learning_rate": 8.70163947546233e-06, + "loss": 1.1662, + "step": 9260 + }, + { + "epoch": 0.93, + "grad_norm": 19.750262940528504, + "learning_rate": 8.699667142807096e-06, + "loss": 1.2339, + "step": 9265 + }, + { + "epoch": 0.93, + "grad_norm": 6.201453699126599, + "learning_rate": 8.697693537099935e-06, + "loss": 1.218, + "step": 9270 + }, + { + "epoch": 0.94, + "grad_norm": 14.547911380471197, + "learning_rate": 8.695718659019957e-06, + "loss": 1.2112, + "step": 9275 + }, + { + "epoch": 0.94, + "grad_norm": 10.642260834594786, + "learning_rate": 8.69374250924672e-06, + "loss": 1.2476, + "step": 9280 + }, + { + "epoch": 0.94, + "grad_norm": 17.92848949067963, + "learning_rate": 8.691765088460214e-06, + "loss": 1.193, + "step": 9285 + }, + { + "epoch": 0.94, + "grad_norm": 27.8880184994152, + "learning_rate": 8.689786397340866e-06, + "loss": 1.2467, + "step": 9290 + }, + { + "epoch": 0.94, + "grad_norm": 33.71047112323762, + "learning_rate": 8.687806436569544e-06, + "loss": 1.2282, + "step": 9295 + }, + { + "epoch": 0.94, + "grad_norm": 30.28245393486485, + "learning_rate": 8.685825206827549e-06, + "loss": 1.1944, + "step": 9300 + }, + { + "epoch": 0.94, + "grad_norm": 11.01931733205294, + "learning_rate": 8.68384270879662e-06, + "loss": 1.1915, + "step": 9305 + }, + { + "epoch": 0.94, + "grad_norm": 9.301290633165818, + "learning_rate": 8.681858943158934e-06, + "loss": 1.218, + "step": 9310 + }, + { + "epoch": 0.94, + "grad_norm": 59.894527194127896, + "learning_rate": 8.6798739105971e-06, + "loss": 1.2276, + "step": 9315 + }, + { + "epoch": 0.94, + "grad_norm": 60.59704754802048, + "learning_rate": 8.677887611794171e-06, + "loss": 1.2136, + "step": 9320 + }, + { + "epoch": 0.94, + "grad_norm": 31.229948653849934, + "learning_rate": 8.675900047433628e-06, + "loss": 1.2148, + "step": 9325 + }, + { + "epoch": 0.94, + "grad_norm": 9.751502635285522, + "learning_rate": 8.673911218199387e-06, + "loss": 1.2416, + "step": 9330 + }, + { + "epoch": 0.94, + "grad_norm": 27.156635876069526, + "learning_rate": 8.671921124775807e-06, + "loss": 1.2311, + "step": 9335 + }, + { + "epoch": 0.94, + "grad_norm": 28.13243527055898, + "learning_rate": 8.669929767847673e-06, + "loss": 1.2309, + "step": 9340 + }, + { + "epoch": 0.94, + "grad_norm": 21.92555759390989, + "learning_rate": 8.667937148100211e-06, + "loss": 1.2145, + "step": 9345 + }, + { + "epoch": 0.94, + "grad_norm": 16.51310034385813, + "learning_rate": 8.665943266219081e-06, + "loss": 1.273, + "step": 9350 + }, + { + "epoch": 0.94, + "grad_norm": 60.47307916368799, + "learning_rate": 8.663948122890376e-06, + "loss": 1.2216, + "step": 9355 + }, + { + "epoch": 0.94, + "grad_norm": 27.728669094488204, + "learning_rate": 8.661951718800618e-06, + "loss": 1.2355, + "step": 9360 + }, + { + "epoch": 0.94, + "grad_norm": 12.484033499295984, + "learning_rate": 8.659954054636774e-06, + "loss": 1.2516, + "step": 9365 + }, + { + "epoch": 0.94, + "grad_norm": 8.625017951128116, + "learning_rate": 8.657955131086234e-06, + "loss": 1.1943, + "step": 9370 + }, + { + "epoch": 0.95, + "grad_norm": 8.072922518170913, + "learning_rate": 8.655954948836826e-06, + "loss": 1.2521, + "step": 9375 + }, + { + "epoch": 0.95, + "grad_norm": 20.147834103743097, + "learning_rate": 8.653953508576813e-06, + "loss": 1.2457, + "step": 9380 + }, + { + "epoch": 0.95, + "grad_norm": 8.187688426230986, + "learning_rate": 8.65195081099489e-06, + "loss": 1.2233, + "step": 9385 + }, + { + "epoch": 0.95, + "grad_norm": 5.52875596393606, + "learning_rate": 8.649946856780178e-06, + "loss": 1.1896, + "step": 9390 + }, + { + "epoch": 0.95, + "grad_norm": 5.634859897158842, + "learning_rate": 8.64794164662224e-06, + "loss": 1.2319, + "step": 9395 + }, + { + "epoch": 0.95, + "grad_norm": 7.724500236711942, + "learning_rate": 8.645935181211065e-06, + "loss": 1.185, + "step": 9400 + }, + { + "epoch": 0.95, + "grad_norm": 6.001379200664274, + "learning_rate": 8.643927461237076e-06, + "loss": 1.1959, + "step": 9405 + }, + { + "epoch": 0.95, + "grad_norm": 21.132559590260563, + "learning_rate": 8.641918487391129e-06, + "loss": 1.2037, + "step": 9410 + }, + { + "epoch": 0.95, + "grad_norm": 8.29694004153204, + "learning_rate": 8.639908260364506e-06, + "loss": 1.2135, + "step": 9415 + }, + { + "epoch": 0.95, + "grad_norm": 16.94645524463928, + "learning_rate": 8.637896780848932e-06, + "loss": 1.1843, + "step": 9420 + }, + { + "epoch": 0.95, + "grad_norm": 33.47190874015566, + "learning_rate": 8.635884049536548e-06, + "loss": 1.2311, + "step": 9425 + }, + { + "epoch": 0.95, + "grad_norm": 23.42170464750877, + "learning_rate": 8.633870067119934e-06, + "loss": 1.2217, + "step": 9430 + }, + { + "epoch": 0.95, + "grad_norm": 18.017418447170066, + "learning_rate": 8.631854834292102e-06, + "loss": 1.2402, + "step": 9435 + }, + { + "epoch": 0.95, + "grad_norm": 13.972512670568745, + "learning_rate": 8.629838351746488e-06, + "loss": 1.1981, + "step": 9440 + }, + { + "epoch": 0.95, + "grad_norm": 6.461769001581415, + "learning_rate": 8.627820620176967e-06, + "loss": 1.2776, + "step": 9445 + }, + { + "epoch": 0.95, + "grad_norm": 10.786493970019889, + "learning_rate": 8.625801640277835e-06, + "loss": 1.1868, + "step": 9450 + }, + { + "epoch": 0.95, + "grad_norm": 29.018649268371522, + "learning_rate": 8.62378141274382e-06, + "loss": 1.1648, + "step": 9455 + }, + { + "epoch": 0.95, + "grad_norm": 45.83096642377824, + "learning_rate": 8.621759938270085e-06, + "loss": 1.204, + "step": 9460 + }, + { + "epoch": 0.95, + "grad_norm": 37.56036797387887, + "learning_rate": 8.619737217552213e-06, + "loss": 1.2293, + "step": 9465 + }, + { + "epoch": 0.95, + "grad_norm": 17.85963464956369, + "learning_rate": 8.617713251286221e-06, + "loss": 1.2041, + "step": 9470 + }, + { + "epoch": 0.96, + "grad_norm": 28.08967115530905, + "learning_rate": 8.615688040168554e-06, + "loss": 1.2017, + "step": 9475 + }, + { + "epoch": 0.96, + "grad_norm": 12.488225184800497, + "learning_rate": 8.613661584896088e-06, + "loss": 1.1837, + "step": 9480 + }, + { + "epoch": 0.96, + "grad_norm": 12.232954988608356, + "learning_rate": 8.61163388616612e-06, + "loss": 1.2455, + "step": 9485 + }, + { + "epoch": 0.96, + "grad_norm": 9.295162473894017, + "learning_rate": 8.609604944676382e-06, + "loss": 1.1843, + "step": 9490 + }, + { + "epoch": 0.96, + "grad_norm": 6.202453090548019, + "learning_rate": 8.607574761125029e-06, + "loss": 1.1921, + "step": 9495 + }, + { + "epoch": 0.96, + "grad_norm": 11.589082214759323, + "learning_rate": 8.605543336210648e-06, + "loss": 1.2447, + "step": 9500 + }, + { + "epoch": 0.96, + "grad_norm": 21.868788541707016, + "learning_rate": 8.603510670632248e-06, + "loss": 1.1866, + "step": 9505 + }, + { + "epoch": 0.96, + "grad_norm": 15.43176697786205, + "learning_rate": 8.601476765089267e-06, + "loss": 1.2228, + "step": 9510 + }, + { + "epoch": 0.96, + "grad_norm": 25.64496682281605, + "learning_rate": 8.599441620281573e-06, + "loss": 1.188, + "step": 9515 + }, + { + "epoch": 0.96, + "grad_norm": 29.764310160845312, + "learning_rate": 8.597405236909454e-06, + "loss": 1.2362, + "step": 9520 + }, + { + "epoch": 0.96, + "grad_norm": 32.23508806917358, + "learning_rate": 8.59536761567363e-06, + "loss": 1.1883, + "step": 9525 + }, + { + "epoch": 0.96, + "grad_norm": 17.162439336224594, + "learning_rate": 8.593328757275244e-06, + "loss": 1.2109, + "step": 9530 + }, + { + "epoch": 0.96, + "grad_norm": 7.8497243633031735, + "learning_rate": 8.591288662415863e-06, + "loss": 1.1849, + "step": 9535 + }, + { + "epoch": 0.96, + "grad_norm": 7.325698207035451, + "learning_rate": 8.589247331797482e-06, + "loss": 1.2294, + "step": 9540 + }, + { + "epoch": 0.96, + "grad_norm": 8.684046114236223, + "learning_rate": 8.587204766122524e-06, + "loss": 1.2249, + "step": 9545 + }, + { + "epoch": 0.96, + "grad_norm": 5.780997127520526, + "learning_rate": 8.585160966093832e-06, + "loss": 1.1899, + "step": 9550 + }, + { + "epoch": 0.96, + "grad_norm": 10.076217961018143, + "learning_rate": 8.583115932414677e-06, + "loss": 1.1961, + "step": 9555 + }, + { + "epoch": 0.96, + "grad_norm": 11.558273083924005, + "learning_rate": 8.58106966578875e-06, + "loss": 1.1607, + "step": 9560 + }, + { + "epoch": 0.96, + "grad_norm": 10.86979617003217, + "learning_rate": 8.579022166920172e-06, + "loss": 1.2633, + "step": 9565 + }, + { + "epoch": 0.96, + "grad_norm": 25.659561988607937, + "learning_rate": 8.576973436513485e-06, + "loss": 1.1857, + "step": 9570 + }, + { + "epoch": 0.97, + "grad_norm": 46.048303619796755, + "learning_rate": 8.574923475273656e-06, + "loss": 1.2248, + "step": 9575 + }, + { + "epoch": 0.97, + "grad_norm": 7.727155163521066, + "learning_rate": 8.572872283906071e-06, + "loss": 1.2353, + "step": 9580 + }, + { + "epoch": 0.97, + "grad_norm": 10.94732184496867, + "learning_rate": 8.570819863116546e-06, + "loss": 1.1864, + "step": 9585 + }, + { + "epoch": 0.97, + "grad_norm": 20.209059290959434, + "learning_rate": 8.56876621361132e-06, + "loss": 1.2085, + "step": 9590 + }, + { + "epoch": 0.97, + "grad_norm": 15.880908707190672, + "learning_rate": 8.566711336097045e-06, + "loss": 1.2717, + "step": 9595 + }, + { + "epoch": 0.97, + "grad_norm": 12.070944510809447, + "learning_rate": 8.564655231280808e-06, + "loss": 1.2236, + "step": 9600 + }, + { + "epoch": 0.97, + "grad_norm": 34.890685751008064, + "learning_rate": 8.562597899870107e-06, + "loss": 1.1806, + "step": 9605 + }, + { + "epoch": 0.97, + "grad_norm": 17.757711801324973, + "learning_rate": 8.560539342572874e-06, + "loss": 1.1937, + "step": 9610 + }, + { + "epoch": 0.97, + "grad_norm": 26.934656558915368, + "learning_rate": 8.558479560097455e-06, + "loss": 1.2282, + "step": 9615 + }, + { + "epoch": 0.97, + "grad_norm": 10.051031400680982, + "learning_rate": 8.556418553152617e-06, + "loss": 1.199, + "step": 9620 + }, + { + "epoch": 0.97, + "grad_norm": 6.064718410605512, + "learning_rate": 8.55435632244755e-06, + "loss": 1.1695, + "step": 9625 + }, + { + "epoch": 0.97, + "grad_norm": 7.585082014770853, + "learning_rate": 8.552292868691869e-06, + "loss": 1.184, + "step": 9630 + }, + { + "epoch": 0.97, + "grad_norm": 10.378246934152191, + "learning_rate": 8.550228192595604e-06, + "loss": 1.2185, + "step": 9635 + }, + { + "epoch": 0.97, + "grad_norm": 12.97285764971995, + "learning_rate": 8.54816229486921e-06, + "loss": 1.195, + "step": 9640 + }, + { + "epoch": 0.97, + "grad_norm": 12.008911383877557, + "learning_rate": 8.546095176223559e-06, + "loss": 1.1942, + "step": 9645 + }, + { + "epoch": 0.97, + "grad_norm": 6.92183947353392, + "learning_rate": 8.544026837369944e-06, + "loss": 1.2008, + "step": 9650 + }, + { + "epoch": 0.97, + "grad_norm": 7.115687819170043, + "learning_rate": 8.54195727902008e-06, + "loss": 1.2601, + "step": 9655 + }, + { + "epoch": 0.97, + "grad_norm": 12.63085545181519, + "learning_rate": 8.539886501886096e-06, + "loss": 1.245, + "step": 9660 + }, + { + "epoch": 0.97, + "grad_norm": 6.0638585143837895, + "learning_rate": 8.537814506680552e-06, + "loss": 1.2002, + "step": 9665 + }, + { + "epoch": 0.97, + "grad_norm": 16.479375704581546, + "learning_rate": 8.535741294116413e-06, + "loss": 1.2232, + "step": 9670 + }, + { + "epoch": 0.98, + "grad_norm": 17.557551719070158, + "learning_rate": 8.533666864907072e-06, + "loss": 1.1973, + "step": 9675 + }, + { + "epoch": 0.98, + "grad_norm": 10.673967352600394, + "learning_rate": 8.531591219766337e-06, + "loss": 1.2523, + "step": 9680 + }, + { + "epoch": 0.98, + "grad_norm": 15.669799825253532, + "learning_rate": 8.52951435940844e-06, + "loss": 1.2175, + "step": 9685 + }, + { + "epoch": 0.98, + "grad_norm": 20.04964319057395, + "learning_rate": 8.527436284548019e-06, + "loss": 1.1941, + "step": 9690 + }, + { + "epoch": 0.98, + "grad_norm": 10.990033812811166, + "learning_rate": 8.525356995900142e-06, + "loss": 1.1702, + "step": 9695 + }, + { + "epoch": 0.98, + "grad_norm": 16.20945799802136, + "learning_rate": 8.523276494180293e-06, + "loss": 1.1931, + "step": 9700 + }, + { + "epoch": 0.98, + "grad_norm": 12.84362195949145, + "learning_rate": 8.521194780104364e-06, + "loss": 1.2362, + "step": 9705 + }, + { + "epoch": 0.98, + "grad_norm": 12.616349187919386, + "learning_rate": 8.519111854388676e-06, + "loss": 1.1801, + "step": 9710 + }, + { + "epoch": 0.98, + "grad_norm": 18.920360038100625, + "learning_rate": 8.51702771774996e-06, + "loss": 1.2, + "step": 9715 + }, + { + "epoch": 0.98, + "grad_norm": 31.170767115548916, + "learning_rate": 8.514942370905364e-06, + "loss": 1.2017, + "step": 9720 + }, + { + "epoch": 0.98, + "grad_norm": 24.598565989478793, + "learning_rate": 8.512855814572456e-06, + "loss": 1.1829, + "step": 9725 + }, + { + "epoch": 0.98, + "grad_norm": 16.275465737808293, + "learning_rate": 8.510768049469218e-06, + "loss": 1.2021, + "step": 9730 + }, + { + "epoch": 0.98, + "grad_norm": 72.26265395071671, + "learning_rate": 8.508679076314048e-06, + "loss": 1.2837, + "step": 9735 + }, + { + "epoch": 0.98, + "grad_norm": 31.72169190093813, + "learning_rate": 8.506588895825758e-06, + "loss": 1.2686, + "step": 9740 + }, + { + "epoch": 0.98, + "grad_norm": 176.50463584566126, + "learning_rate": 8.504497508723578e-06, + "loss": 1.2294, + "step": 9745 + }, + { + "epoch": 0.98, + "grad_norm": 58.2525129534981, + "learning_rate": 8.502404915727154e-06, + "loss": 1.2327, + "step": 9750 + }, + { + "epoch": 0.98, + "grad_norm": 44.345100118213004, + "learning_rate": 8.50031111755654e-06, + "loss": 1.2719, + "step": 9755 + }, + { + "epoch": 0.98, + "grad_norm": 58.80090588929816, + "learning_rate": 8.498216114932217e-06, + "loss": 1.225, + "step": 9760 + }, + { + "epoch": 0.98, + "grad_norm": 17.63017726386444, + "learning_rate": 8.49611990857507e-06, + "loss": 1.2465, + "step": 9765 + }, + { + "epoch": 0.99, + "grad_norm": 20.442324359928453, + "learning_rate": 8.4940224992064e-06, + "loss": 1.264, + "step": 9770 + }, + { + "epoch": 0.99, + "grad_norm": 11.434910677344071, + "learning_rate": 8.491923887547926e-06, + "loss": 1.1754, + "step": 9775 + }, + { + "epoch": 0.99, + "grad_norm": 17.22452900332348, + "learning_rate": 8.489824074321778e-06, + "loss": 1.2591, + "step": 9780 + }, + { + "epoch": 0.99, + "grad_norm": 15.660771484878797, + "learning_rate": 8.487723060250498e-06, + "loss": 1.1941, + "step": 9785 + }, + { + "epoch": 0.99, + "grad_norm": 5.793455215195363, + "learning_rate": 8.485620846057044e-06, + "loss": 1.2331, + "step": 9790 + }, + { + "epoch": 0.99, + "grad_norm": 20.189243088111986, + "learning_rate": 8.483517432464788e-06, + "loss": 1.1827, + "step": 9795 + }, + { + "epoch": 0.99, + "grad_norm": 31.80617544301878, + "learning_rate": 8.481412820197508e-06, + "loss": 1.209, + "step": 9800 + }, + { + "epoch": 0.99, + "grad_norm": 51.16240905871924, + "learning_rate": 8.479307009979403e-06, + "loss": 1.1993, + "step": 9805 + }, + { + "epoch": 0.99, + "grad_norm": 37.645660326349336, + "learning_rate": 8.47720000253508e-06, + "loss": 1.2541, + "step": 9810 + }, + { + "epoch": 0.99, + "grad_norm": 13.537646193886891, + "learning_rate": 8.475091798589556e-06, + "loss": 1.2196, + "step": 9815 + }, + { + "epoch": 0.99, + "grad_norm": 74.70213748980201, + "learning_rate": 8.472982398868263e-06, + "loss": 1.2352, + "step": 9820 + }, + { + "epoch": 0.99, + "grad_norm": 135.7574431462861, + "learning_rate": 8.470871804097045e-06, + "loss": 1.2056, + "step": 9825 + }, + { + "epoch": 0.99, + "grad_norm": 57.281387936043224, + "learning_rate": 8.468760015002153e-06, + "loss": 1.1798, + "step": 9830 + }, + { + "epoch": 0.99, + "grad_norm": 49.41630425177251, + "learning_rate": 8.466647032310253e-06, + "loss": 1.2587, + "step": 9835 + }, + { + "epoch": 0.99, + "grad_norm": 24.117672854707138, + "learning_rate": 8.46453285674842e-06, + "loss": 1.2449, + "step": 9840 + }, + { + "epoch": 0.99, + "grad_norm": 14.016753129993532, + "learning_rate": 8.46241748904414e-06, + "loss": 1.2152, + "step": 9845 + }, + { + "epoch": 0.99, + "grad_norm": 9.079832042525334, + "learning_rate": 8.46030092992531e-06, + "loss": 1.2344, + "step": 9850 + }, + { + "epoch": 0.99, + "grad_norm": 16.341759458322716, + "learning_rate": 8.458183180120235e-06, + "loss": 1.2856, + "step": 9855 + }, + { + "epoch": 0.99, + "grad_norm": 10.787416825397322, + "learning_rate": 8.456064240357628e-06, + "loss": 1.2745, + "step": 9860 + }, + { + "epoch": 0.99, + "grad_norm": 45.30087761027042, + "learning_rate": 8.45394411136662e-06, + "loss": 1.1844, + "step": 9865 + }, + { + "epoch": 1.0, + "grad_norm": 35.027289288363285, + "learning_rate": 8.451822793876742e-06, + "loss": 1.2217, + "step": 9870 + }, + { + "epoch": 1.0, + "grad_norm": 19.68667017662464, + "learning_rate": 8.449700288617935e-06, + "loss": 1.2125, + "step": 9875 + }, + { + "epoch": 1.0, + "grad_norm": 18.34371606158347, + "learning_rate": 8.447576596320558e-06, + "loss": 1.2053, + "step": 9880 + }, + { + "epoch": 1.0, + "grad_norm": 18.141594774196527, + "learning_rate": 8.445451717715363e-06, + "loss": 1.164, + "step": 9885 + }, + { + "epoch": 1.0, + "grad_norm": 25.351385895760103, + "learning_rate": 8.443325653533525e-06, + "loss": 1.2194, + "step": 9890 + }, + { + "epoch": 1.0, + "grad_norm": 16.307025203392016, + "learning_rate": 8.441198404506616e-06, + "loss": 1.1969, + "step": 9895 + }, + { + "epoch": 1.0, + "grad_norm": 12.175196142826138, + "learning_rate": 8.439069971366625e-06, + "loss": 1.1987, + "step": 9900 + }, + { + "epoch": 1.0, + "grad_norm": 14.931195059799597, + "learning_rate": 8.436940354845939e-06, + "loss": 1.2042, + "step": 9905 + }, + { + "epoch": 1.0, + "grad_norm": 11.002472470006175, + "learning_rate": 8.434809555677361e-06, + "loss": 1.2303, + "step": 9910 + }, + { + "epoch": 1.0, + "grad_norm": 12.290291630079329, + "learning_rate": 8.432677574594095e-06, + "loss": 1.1862, + "step": 9915 + }, + { + "epoch": 1.0, + "eval_loss": 1.2109025716781616, + "eval_runtime": 25.7471, + "eval_samples_per_second": 31.305, + "eval_steps_per_second": 3.923, + "step": 9918 + }, + { + "epoch": 1.0, + "grad_norm": 10.829768346435534, + "learning_rate": 8.430544412329756e-06, + "loss": 1.1478, + "step": 9920 + }, + { + "epoch": 1.0, + "grad_norm": 8.970145587847068, + "learning_rate": 8.42841006961836e-06, + "loss": 0.9985, + "step": 9925 + }, + { + "epoch": 1.0, + "grad_norm": 12.882498532354235, + "learning_rate": 8.426274547194332e-06, + "loss": 1.0612, + "step": 9930 + }, + { + "epoch": 1.0, + "grad_norm": 10.97482649042033, + "learning_rate": 8.424137845792509e-06, + "loss": 0.9763, + "step": 9935 + }, + { + "epoch": 1.0, + "grad_norm": 11.76466163919948, + "learning_rate": 8.42199996614812e-06, + "loss": 1.0238, + "step": 9940 + }, + { + "epoch": 1.0, + "grad_norm": 9.112399675573785, + "learning_rate": 8.41986090899681e-06, + "loss": 1.0398, + "step": 9945 + }, + { + "epoch": 1.0, + "grad_norm": 6.213687218919592, + "learning_rate": 8.417720675074633e-06, + "loss": 1.051, + "step": 9950 + }, + { + "epoch": 1.0, + "grad_norm": 9.663983135535817, + "learning_rate": 8.415579265118032e-06, + "loss": 1.0585, + "step": 9955 + }, + { + "epoch": 1.0, + "grad_norm": 5.359268922806681, + "learning_rate": 8.413436679863868e-06, + "loss": 0.9919, + "step": 9960 + }, + { + "epoch": 1.0, + "grad_norm": 8.245242498527361, + "learning_rate": 8.411292920049403e-06, + "loss": 1.0313, + "step": 9965 + }, + { + "epoch": 1.01, + "grad_norm": 9.768501766101803, + "learning_rate": 8.409147986412302e-06, + "loss": 0.9857, + "step": 9970 + }, + { + "epoch": 1.01, + "grad_norm": 20.84868605870357, + "learning_rate": 8.407001879690637e-06, + "loss": 1.0668, + "step": 9975 + }, + { + "epoch": 1.01, + "grad_norm": 10.832758318711273, + "learning_rate": 8.404854600622878e-06, + "loss": 1.061, + "step": 9980 + }, + { + "epoch": 1.01, + "grad_norm": 28.323129537818932, + "learning_rate": 8.402706149947903e-06, + "loss": 1.0443, + "step": 9985 + }, + { + "epoch": 1.01, + "grad_norm": 32.08304774339511, + "learning_rate": 8.40055652840499e-06, + "loss": 1.102, + "step": 9990 + }, + { + "epoch": 1.01, + "grad_norm": 13.982485513234868, + "learning_rate": 8.398405736733827e-06, + "loss": 1.0058, + "step": 9995 + }, + { + "epoch": 1.01, + "grad_norm": 9.166678030612518, + "learning_rate": 8.396253775674495e-06, + "loss": 0.9781, + "step": 10000 + }, + { + "epoch": 1.01, + "grad_norm": 24.744865722386095, + "learning_rate": 8.39410064596748e-06, + "loss": 1.054, + "step": 10005 + }, + { + "epoch": 1.01, + "grad_norm": 33.39987986980466, + "learning_rate": 8.391946348353676e-06, + "loss": 1.0492, + "step": 10010 + }, + { + "epoch": 1.01, + "grad_norm": 16.197408664885675, + "learning_rate": 8.389790883574374e-06, + "loss": 1.0285, + "step": 10015 + }, + { + "epoch": 1.01, + "grad_norm": 6.478049205668278, + "learning_rate": 8.387634252371267e-06, + "loss": 1.0178, + "step": 10020 + }, + { + "epoch": 1.01, + "grad_norm": 11.22929308446771, + "learning_rate": 8.385476455486447e-06, + "loss": 1.0622, + "step": 10025 + }, + { + "epoch": 1.01, + "grad_norm": 6.37257185988015, + "learning_rate": 8.383317493662412e-06, + "loss": 1.0177, + "step": 10030 + }, + { + "epoch": 1.01, + "grad_norm": 34.315595057653006, + "learning_rate": 8.381157367642062e-06, + "loss": 1.0417, + "step": 10035 + }, + { + "epoch": 1.01, + "grad_norm": 18.402346346049598, + "learning_rate": 8.37899607816869e-06, + "loss": 1.052, + "step": 10040 + }, + { + "epoch": 1.01, + "grad_norm": 11.337683054348759, + "learning_rate": 8.376833625985994e-06, + "loss": 1.062, + "step": 10045 + }, + { + "epoch": 1.01, + "grad_norm": 20.548054154408476, + "learning_rate": 8.374670011838072e-06, + "loss": 1.031, + "step": 10050 + }, + { + "epoch": 1.01, + "grad_norm": 17.210150969500315, + "learning_rate": 8.372505236469424e-06, + "loss": 1.0524, + "step": 10055 + }, + { + "epoch": 1.01, + "grad_norm": 12.613329324259729, + "learning_rate": 8.370339300624943e-06, + "loss": 1.0022, + "step": 10060 + }, + { + "epoch": 1.01, + "grad_norm": 7.485345502725573, + "learning_rate": 8.36817220504993e-06, + "loss": 0.9975, + "step": 10065 + }, + { + "epoch": 1.02, + "grad_norm": 7.671073845035285, + "learning_rate": 8.366003950490076e-06, + "loss": 1.0465, + "step": 10070 + }, + { + "epoch": 1.02, + "grad_norm": 8.982752291924099, + "learning_rate": 8.36383453769148e-06, + "loss": 1.0219, + "step": 10075 + }, + { + "epoch": 1.02, + "grad_norm": 14.116848311765736, + "learning_rate": 8.36166396740063e-06, + "loss": 1.0371, + "step": 10080 + }, + { + "epoch": 1.02, + "grad_norm": 9.768301556799226, + "learning_rate": 8.35949224036442e-06, + "loss": 1.0401, + "step": 10085 + }, + { + "epoch": 1.02, + "grad_norm": 13.02668779107601, + "learning_rate": 8.35731935733014e-06, + "loss": 0.9951, + "step": 10090 + }, + { + "epoch": 1.02, + "grad_norm": 11.426296707679093, + "learning_rate": 8.355145319045475e-06, + "loss": 1.0167, + "step": 10095 + }, + { + "epoch": 1.02, + "grad_norm": 16.00541707616357, + "learning_rate": 8.35297012625851e-06, + "loss": 0.9962, + "step": 10100 + }, + { + "epoch": 1.02, + "grad_norm": 6.588001957684907, + "learning_rate": 8.350793779717727e-06, + "loss": 1.0284, + "step": 10105 + }, + { + "epoch": 1.02, + "grad_norm": 6.013644599649537, + "learning_rate": 8.348616280172006e-06, + "loss": 1.0332, + "step": 10110 + }, + { + "epoch": 1.02, + "grad_norm": 19.298108799226043, + "learning_rate": 8.346437628370621e-06, + "loss": 1.0447, + "step": 10115 + }, + { + "epoch": 1.02, + "grad_norm": 7.074142078355483, + "learning_rate": 8.344257825063242e-06, + "loss": 1.0473, + "step": 10120 + }, + { + "epoch": 1.02, + "grad_norm": 6.3778894218672, + "learning_rate": 8.342076870999943e-06, + "loss": 0.9901, + "step": 10125 + }, + { + "epoch": 1.02, + "grad_norm": 7.1894231376209685, + "learning_rate": 8.339894766931183e-06, + "loss": 1.0352, + "step": 10130 + }, + { + "epoch": 1.02, + "grad_norm": 15.009292576306457, + "learning_rate": 8.337711513607823e-06, + "loss": 0.9798, + "step": 10135 + }, + { + "epoch": 1.02, + "grad_norm": 12.916583311141933, + "learning_rate": 8.335527111781121e-06, + "loss": 1.0658, + "step": 10140 + }, + { + "epoch": 1.02, + "grad_norm": 8.335359800171835, + "learning_rate": 8.333341562202724e-06, + "loss": 1.0424, + "step": 10145 + }, + { + "epoch": 1.02, + "grad_norm": 16.405141871430068, + "learning_rate": 8.33115486562468e-06, + "loss": 1.0568, + "step": 10150 + }, + { + "epoch": 1.02, + "grad_norm": 15.878882635298778, + "learning_rate": 8.328967022799427e-06, + "loss": 1.0359, + "step": 10155 + }, + { + "epoch": 1.02, + "grad_norm": 7.011786564450279, + "learning_rate": 8.326778034479802e-06, + "loss": 1.0394, + "step": 10160 + }, + { + "epoch": 1.02, + "grad_norm": 6.231043280092075, + "learning_rate": 8.324587901419033e-06, + "loss": 1.0051, + "step": 10165 + }, + { + "epoch": 1.03, + "grad_norm": 6.168059044544621, + "learning_rate": 8.322396624370741e-06, + "loss": 1.0036, + "step": 10170 + }, + { + "epoch": 1.03, + "grad_norm": 10.939236046593278, + "learning_rate": 8.320204204088946e-06, + "loss": 1.0453, + "step": 10175 + }, + { + "epoch": 1.03, + "grad_norm": 5.345591223655666, + "learning_rate": 8.318010641328053e-06, + "loss": 1.0282, + "step": 10180 + }, + { + "epoch": 1.03, + "grad_norm": 18.90314250251431, + "learning_rate": 8.31581593684287e-06, + "loss": 0.9976, + "step": 10185 + }, + { + "epoch": 1.03, + "grad_norm": 18.084551690785258, + "learning_rate": 8.313620091388588e-06, + "loss": 1.0772, + "step": 10190 + }, + { + "epoch": 1.03, + "grad_norm": 9.684884049853789, + "learning_rate": 8.311423105720799e-06, + "loss": 1.0162, + "step": 10195 + }, + { + "epoch": 1.03, + "grad_norm": 7.549694639447544, + "learning_rate": 8.30922498059548e-06, + "loss": 0.987, + "step": 10200 + }, + { + "epoch": 1.03, + "grad_norm": 7.143518433232426, + "learning_rate": 8.307025716769008e-06, + "loss": 1.0323, + "step": 10205 + }, + { + "epoch": 1.03, + "grad_norm": 8.884891662639422, + "learning_rate": 8.304825314998147e-06, + "loss": 0.9754, + "step": 10210 + }, + { + "epoch": 1.03, + "grad_norm": 12.655469405362567, + "learning_rate": 8.302623776040047e-06, + "loss": 1.0069, + "step": 10215 + }, + { + "epoch": 1.03, + "grad_norm": 14.075675562314656, + "learning_rate": 8.300421100652263e-06, + "loss": 1.0739, + "step": 10220 + }, + { + "epoch": 1.03, + "grad_norm": 18.065814927344455, + "learning_rate": 8.298217289592729e-06, + "loss": 1.0376, + "step": 10225 + }, + { + "epoch": 1.03, + "grad_norm": 54.61351609242025, + "learning_rate": 8.296012343619777e-06, + "loss": 1.1097, + "step": 10230 + }, + { + "epoch": 1.03, + "grad_norm": 9.53814558868182, + "learning_rate": 8.293806263492126e-06, + "loss": 1.0131, + "step": 10235 + }, + { + "epoch": 1.03, + "grad_norm": 11.438272279146418, + "learning_rate": 8.291599049968885e-06, + "loss": 1.0001, + "step": 10240 + }, + { + "epoch": 1.03, + "grad_norm": 9.00508276425213, + "learning_rate": 8.289390703809554e-06, + "loss": 1.0835, + "step": 10245 + }, + { + "epoch": 1.03, + "grad_norm": 9.395690192570587, + "learning_rate": 8.287181225774025e-06, + "loss": 1.051, + "step": 10250 + }, + { + "epoch": 1.03, + "grad_norm": 10.050017934085144, + "learning_rate": 8.284970616622575e-06, + "loss": 0.9877, + "step": 10255 + }, + { + "epoch": 1.03, + "grad_norm": 6.029724756383363, + "learning_rate": 8.282758877115873e-06, + "loss": 1.0171, + "step": 10260 + }, + { + "epoch": 1.03, + "grad_norm": 11.208701180126377, + "learning_rate": 8.280546008014979e-06, + "loss": 1.0231, + "step": 10265 + }, + { + "epoch": 1.04, + "grad_norm": 12.966732406844317, + "learning_rate": 8.278332010081333e-06, + "loss": 1.0141, + "step": 10270 + }, + { + "epoch": 1.04, + "grad_norm": 7.461619182024871, + "learning_rate": 8.276116884076777e-06, + "loss": 1.0603, + "step": 10275 + }, + { + "epoch": 1.04, + "grad_norm": 13.178804487830194, + "learning_rate": 8.273900630763529e-06, + "loss": 1.0469, + "step": 10280 + }, + { + "epoch": 1.04, + "grad_norm": 15.349058379616233, + "learning_rate": 8.2716832509042e-06, + "loss": 0.9975, + "step": 10285 + }, + { + "epoch": 1.04, + "grad_norm": 19.535910730995127, + "learning_rate": 8.26946474526179e-06, + "loss": 1.0539, + "step": 10290 + }, + { + "epoch": 1.04, + "grad_norm": 11.929726128486617, + "learning_rate": 8.267245114599684e-06, + "loss": 1.0546, + "step": 10295 + }, + { + "epoch": 1.04, + "grad_norm": 12.196993975441478, + "learning_rate": 8.265024359681655e-06, + "loss": 0.9936, + "step": 10300 + }, + { + "epoch": 1.04, + "grad_norm": 7.850173272592965, + "learning_rate": 8.262802481271861e-06, + "loss": 1.041, + "step": 10305 + }, + { + "epoch": 1.04, + "grad_norm": 7.691774593198793, + "learning_rate": 8.26057948013485e-06, + "loss": 1.0955, + "step": 10310 + }, + { + "epoch": 1.04, + "grad_norm": 7.1729861230834295, + "learning_rate": 8.258355357035555e-06, + "loss": 1.0714, + "step": 10315 + }, + { + "epoch": 1.04, + "grad_norm": 8.238572105953923, + "learning_rate": 8.256130112739293e-06, + "loss": 1.0458, + "step": 10320 + }, + { + "epoch": 1.04, + "grad_norm": 5.624595274100203, + "learning_rate": 8.253903748011769e-06, + "loss": 1.0115, + "step": 10325 + }, + { + "epoch": 1.04, + "grad_norm": 17.442304927152296, + "learning_rate": 8.251676263619074e-06, + "loss": 1.0122, + "step": 10330 + }, + { + "epoch": 1.04, + "grad_norm": 6.0748278749495785, + "learning_rate": 8.249447660327681e-06, + "loss": 1.0554, + "step": 10335 + }, + { + "epoch": 1.04, + "grad_norm": 6.41122547786913, + "learning_rate": 8.247217938904453e-06, + "loss": 1.0389, + "step": 10340 + }, + { + "epoch": 1.04, + "grad_norm": 14.149361527527443, + "learning_rate": 8.244987100116632e-06, + "loss": 1.066, + "step": 10345 + }, + { + "epoch": 1.04, + "grad_norm": 20.382488116202655, + "learning_rate": 8.242755144731851e-06, + "loss": 1.0616, + "step": 10350 + }, + { + "epoch": 1.04, + "grad_norm": 5.805046554209081, + "learning_rate": 8.240522073518122e-06, + "loss": 1.0178, + "step": 10355 + }, + { + "epoch": 1.04, + "grad_norm": 11.771420372480122, + "learning_rate": 8.23828788724384e-06, + "loss": 1.0467, + "step": 10360 + }, + { + "epoch": 1.05, + "grad_norm": 39.87695906960376, + "learning_rate": 8.236052586677792e-06, + "loss": 1.0675, + "step": 10365 + }, + { + "epoch": 1.05, + "grad_norm": 26.86833571951981, + "learning_rate": 8.233816172589139e-06, + "loss": 1.0649, + "step": 10370 + }, + { + "epoch": 1.05, + "grad_norm": 6.884515060507346, + "learning_rate": 8.231578645747428e-06, + "loss": 1.0472, + "step": 10375 + }, + { + "epoch": 1.05, + "grad_norm": 23.93658008919047, + "learning_rate": 8.229340006922592e-06, + "loss": 1.0236, + "step": 10380 + }, + { + "epoch": 1.05, + "grad_norm": 9.962471938855685, + "learning_rate": 8.227100256884945e-06, + "loss": 1.0748, + "step": 10385 + }, + { + "epoch": 1.05, + "grad_norm": 9.589553264943177, + "learning_rate": 8.224859396405183e-06, + "loss": 1.0212, + "step": 10390 + }, + { + "epoch": 1.05, + "grad_norm": 27.241871994524665, + "learning_rate": 8.22261742625438e-06, + "loss": 1.0397, + "step": 10395 + }, + { + "epoch": 1.05, + "grad_norm": 35.438534496728494, + "learning_rate": 8.220374347204001e-06, + "loss": 1.0519, + "step": 10400 + }, + { + "epoch": 1.05, + "grad_norm": 63.37721092054966, + "learning_rate": 8.218130160025883e-06, + "loss": 1.0447, + "step": 10405 + }, + { + "epoch": 1.05, + "grad_norm": 46.362651376223255, + "learning_rate": 8.215884865492252e-06, + "loss": 1.0396, + "step": 10410 + }, + { + "epoch": 1.05, + "grad_norm": 86.81240093881469, + "learning_rate": 8.21363846437571e-06, + "loss": 1.0687, + "step": 10415 + }, + { + "epoch": 1.05, + "grad_norm": 38.108316598515685, + "learning_rate": 8.211390957449242e-06, + "loss": 1.0728, + "step": 10420 + }, + { + "epoch": 1.05, + "grad_norm": 33.206193958643894, + "learning_rate": 8.20914234548621e-06, + "loss": 1.0477, + "step": 10425 + }, + { + "epoch": 1.05, + "grad_norm": 6.020883745985015, + "learning_rate": 8.206892629260366e-06, + "loss": 1.0067, + "step": 10430 + }, + { + "epoch": 1.05, + "grad_norm": 16.45834233869216, + "learning_rate": 8.204641809545829e-06, + "loss": 1.0321, + "step": 10435 + }, + { + "epoch": 1.05, + "grad_norm": 22.878622903566985, + "learning_rate": 8.202389887117106e-06, + "loss": 1.0424, + "step": 10440 + }, + { + "epoch": 1.05, + "grad_norm": 9.914121611087722, + "learning_rate": 8.200136862749083e-06, + "loss": 1.0094, + "step": 10445 + }, + { + "epoch": 1.05, + "grad_norm": 6.991708402704003, + "learning_rate": 8.19788273721702e-06, + "loss": 1.08, + "step": 10450 + }, + { + "epoch": 1.05, + "grad_norm": 43.71924056121477, + "learning_rate": 8.195627511296562e-06, + "loss": 1.0456, + "step": 10455 + }, + { + "epoch": 1.05, + "grad_norm": 74.34005505033582, + "learning_rate": 8.19337118576373e-06, + "loss": 1.0431, + "step": 10460 + }, + { + "epoch": 1.06, + "grad_norm": 26.077817865542375, + "learning_rate": 8.191113761394924e-06, + "loss": 1.0635, + "step": 10465 + }, + { + "epoch": 1.06, + "grad_norm": 9.513061881158686, + "learning_rate": 8.188855238966918e-06, + "loss": 1.0094, + "step": 10470 + }, + { + "epoch": 1.06, + "grad_norm": 27.962173814297504, + "learning_rate": 8.186595619256872e-06, + "loss": 1.0165, + "step": 10475 + }, + { + "epoch": 1.06, + "grad_norm": 9.287680421146506, + "learning_rate": 8.184334903042317e-06, + "loss": 1.0481, + "step": 10480 + }, + { + "epoch": 1.06, + "grad_norm": 18.770639134900765, + "learning_rate": 8.182073091101162e-06, + "loss": 1.0454, + "step": 10485 + }, + { + "epoch": 1.06, + "grad_norm": 10.44461465762257, + "learning_rate": 8.179810184211697e-06, + "loss": 1.0993, + "step": 10490 + }, + { + "epoch": 1.06, + "grad_norm": 14.95781697961938, + "learning_rate": 8.177546183152584e-06, + "loss": 1.0544, + "step": 10495 + }, + { + "epoch": 1.06, + "grad_norm": 9.94690016611478, + "learning_rate": 8.175281088702865e-06, + "loss": 1.0863, + "step": 10500 + }, + { + "epoch": 1.06, + "grad_norm": 6.348800475665943, + "learning_rate": 8.173014901641955e-06, + "loss": 1.079, + "step": 10505 + }, + { + "epoch": 1.06, + "grad_norm": 15.869679015605106, + "learning_rate": 8.170747622749648e-06, + "loss": 1.029, + "step": 10510 + }, + { + "epoch": 1.06, + "grad_norm": 6.568519455147601, + "learning_rate": 8.168479252806111e-06, + "loss": 1.069, + "step": 10515 + }, + { + "epoch": 1.06, + "grad_norm": 6.191276539726678, + "learning_rate": 8.166209792591893e-06, + "loss": 1.0137, + "step": 10520 + }, + { + "epoch": 1.06, + "grad_norm": 17.364086529976788, + "learning_rate": 8.163939242887909e-06, + "loss": 1.0729, + "step": 10525 + }, + { + "epoch": 1.06, + "grad_norm": 11.752862921532925, + "learning_rate": 8.161667604475452e-06, + "loss": 1.0175, + "step": 10530 + }, + { + "epoch": 1.06, + "grad_norm": 8.257352736574848, + "learning_rate": 8.15939487813619e-06, + "loss": 1.0339, + "step": 10535 + }, + { + "epoch": 1.06, + "grad_norm": 21.954806474603796, + "learning_rate": 8.157121064652171e-06, + "loss": 1.0565, + "step": 10540 + }, + { + "epoch": 1.06, + "grad_norm": 10.781500694131484, + "learning_rate": 8.154846164805807e-06, + "loss": 1.0432, + "step": 10545 + }, + { + "epoch": 1.06, + "grad_norm": 22.378219693704544, + "learning_rate": 8.152570179379893e-06, + "loss": 1.0174, + "step": 10550 + }, + { + "epoch": 1.06, + "grad_norm": 5.7171653504915385, + "learning_rate": 8.15029310915759e-06, + "loss": 1.0416, + "step": 10555 + }, + { + "epoch": 1.06, + "grad_norm": 11.155624751237, + "learning_rate": 8.148014954922438e-06, + "loss": 1.0438, + "step": 10560 + }, + { + "epoch": 1.07, + "grad_norm": 9.151260055253163, + "learning_rate": 8.145735717458344e-06, + "loss": 0.9877, + "step": 10565 + }, + { + "epoch": 1.07, + "grad_norm": 7.068599316429127, + "learning_rate": 8.143455397549597e-06, + "loss": 1.0721, + "step": 10570 + }, + { + "epoch": 1.07, + "grad_norm": 13.183688049917007, + "learning_rate": 8.14117399598085e-06, + "loss": 1.0617, + "step": 10575 + }, + { + "epoch": 1.07, + "grad_norm": 16.8120516390695, + "learning_rate": 8.13889151353713e-06, + "loss": 1.0236, + "step": 10580 + }, + { + "epoch": 1.07, + "grad_norm": 6.849510964885902, + "learning_rate": 8.13660795100384e-06, + "loss": 1.0305, + "step": 10585 + }, + { + "epoch": 1.07, + "grad_norm": 6.130947304232826, + "learning_rate": 8.134323309166747e-06, + "loss": 1.0564, + "step": 10590 + }, + { + "epoch": 1.07, + "grad_norm": 5.548870019213742, + "learning_rate": 8.132037588811998e-06, + "loss": 1.0183, + "step": 10595 + }, + { + "epoch": 1.07, + "grad_norm": 35.41958649859262, + "learning_rate": 8.129750790726108e-06, + "loss": 1.0407, + "step": 10600 + }, + { + "epoch": 1.07, + "grad_norm": 20.220901648050884, + "learning_rate": 8.127462915695958e-06, + "loss": 1.0309, + "step": 10605 + }, + { + "epoch": 1.07, + "grad_norm": 14.431443131862906, + "learning_rate": 8.125173964508806e-06, + "loss": 0.9904, + "step": 10610 + }, + { + "epoch": 1.07, + "grad_norm": 14.018049652826944, + "learning_rate": 8.12288393795228e-06, + "loss": 1.05, + "step": 10615 + }, + { + "epoch": 1.07, + "grad_norm": 8.331762857681298, + "learning_rate": 8.120592836814372e-06, + "loss": 0.9944, + "step": 10620 + }, + { + "epoch": 1.07, + "grad_norm": 10.694322359008826, + "learning_rate": 8.118300661883447e-06, + "loss": 1.0141, + "step": 10625 + }, + { + "epoch": 1.07, + "grad_norm": 7.139637852801844, + "learning_rate": 8.116007413948246e-06, + "loss": 1.0605, + "step": 10630 + }, + { + "epoch": 1.07, + "grad_norm": 27.75660179197848, + "learning_rate": 8.113713093797868e-06, + "loss": 1.0805, + "step": 10635 + }, + { + "epoch": 1.07, + "grad_norm": 23.475321565301055, + "learning_rate": 8.111417702221789e-06, + "loss": 1.0264, + "step": 10640 + }, + { + "epoch": 1.07, + "grad_norm": 21.672369017405423, + "learning_rate": 8.109121240009851e-06, + "loss": 1.0534, + "step": 10645 + }, + { + "epoch": 1.07, + "grad_norm": 6.602144775695785, + "learning_rate": 8.106823707952263e-06, + "loss": 1.0419, + "step": 10650 + }, + { + "epoch": 1.07, + "grad_norm": 9.930638541831872, + "learning_rate": 8.104525106839606e-06, + "loss": 1.0507, + "step": 10655 + }, + { + "epoch": 1.07, + "grad_norm": 21.190413953730527, + "learning_rate": 8.102225437462824e-06, + "loss": 1.0411, + "step": 10660 + }, + { + "epoch": 1.08, + "grad_norm": 20.32191528592284, + "learning_rate": 8.09992470061323e-06, + "loss": 1.0641, + "step": 10665 + }, + { + "epoch": 1.08, + "grad_norm": 11.930762358782742, + "learning_rate": 8.097622897082512e-06, + "loss": 1.0278, + "step": 10670 + }, + { + "epoch": 1.08, + "grad_norm": 34.26451847010071, + "learning_rate": 8.09532002766271e-06, + "loss": 1.0161, + "step": 10675 + }, + { + "epoch": 1.08, + "grad_norm": 54.58971847600707, + "learning_rate": 8.093016093146242e-06, + "loss": 1.0074, + "step": 10680 + }, + { + "epoch": 1.08, + "grad_norm": 21.47990163607273, + "learning_rate": 8.090711094325891e-06, + "loss": 1.0681, + "step": 10685 + }, + { + "epoch": 1.08, + "grad_norm": 7.386616762935336, + "learning_rate": 8.088405031994807e-06, + "loss": 1.0295, + "step": 10690 + }, + { + "epoch": 1.08, + "grad_norm": 27.232856368628745, + "learning_rate": 8.0860979069465e-06, + "loss": 1.0289, + "step": 10695 + }, + { + "epoch": 1.08, + "grad_norm": 9.453651242681508, + "learning_rate": 8.083789719974851e-06, + "loss": 1.0618, + "step": 10700 + }, + { + "epoch": 1.08, + "grad_norm": 6.668346991769979, + "learning_rate": 8.081480471874104e-06, + "loss": 1.0612, + "step": 10705 + }, + { + "epoch": 1.08, + "grad_norm": 6.643278409339298, + "learning_rate": 8.079170163438868e-06, + "loss": 1.0549, + "step": 10710 + }, + { + "epoch": 1.08, + "grad_norm": 13.331052850098343, + "learning_rate": 8.076858795464122e-06, + "loss": 1.0276, + "step": 10715 + }, + { + "epoch": 1.08, + "grad_norm": 10.186490783994433, + "learning_rate": 8.074546368745203e-06, + "loss": 1.0888, + "step": 10720 + }, + { + "epoch": 1.08, + "grad_norm": 18.851520851508756, + "learning_rate": 8.072232884077816e-06, + "loss": 1.0049, + "step": 10725 + }, + { + "epoch": 1.08, + "grad_norm": 43.982567273199294, + "learning_rate": 8.069918342258026e-06, + "loss": 1.0366, + "step": 10730 + }, + { + "epoch": 1.08, + "grad_norm": 15.929804138237563, + "learning_rate": 8.067602744082268e-06, + "loss": 1.0421, + "step": 10735 + }, + { + "epoch": 1.08, + "grad_norm": 8.961094668401225, + "learning_rate": 8.065286090347335e-06, + "loss": 1.0774, + "step": 10740 + }, + { + "epoch": 1.08, + "grad_norm": 24.28551240410909, + "learning_rate": 8.062968381850386e-06, + "loss": 1.0076, + "step": 10745 + }, + { + "epoch": 1.08, + "grad_norm": 10.853929888653179, + "learning_rate": 8.060649619388942e-06, + "loss": 1.0128, + "step": 10750 + }, + { + "epoch": 1.08, + "grad_norm": 8.068621266149146, + "learning_rate": 8.058329803760887e-06, + "loss": 1.0167, + "step": 10755 + }, + { + "epoch": 1.08, + "grad_norm": 19.775858823799506, + "learning_rate": 8.056008935764469e-06, + "loss": 1.0708, + "step": 10760 + }, + { + "epoch": 1.09, + "grad_norm": 28.512827810024064, + "learning_rate": 8.053687016198292e-06, + "loss": 1.0837, + "step": 10765 + }, + { + "epoch": 1.09, + "grad_norm": 23.63028805052106, + "learning_rate": 8.051364045861332e-06, + "loss": 1.0027, + "step": 10770 + }, + { + "epoch": 1.09, + "grad_norm": 8.396814053507557, + "learning_rate": 8.049040025552919e-06, + "loss": 1.0051, + "step": 10775 + }, + { + "epoch": 1.09, + "grad_norm": 33.01296800751487, + "learning_rate": 8.046714956072744e-06, + "loss": 1.0547, + "step": 10780 + }, + { + "epoch": 1.09, + "grad_norm": 38.4056182742984, + "learning_rate": 8.044388838220863e-06, + "loss": 1.0533, + "step": 10785 + }, + { + "epoch": 1.09, + "grad_norm": 5.4678980940781985, + "learning_rate": 8.04206167279769e-06, + "loss": 1.0223, + "step": 10790 + }, + { + "epoch": 1.09, + "grad_norm": 16.31958176301968, + "learning_rate": 8.039733460604002e-06, + "loss": 1.0467, + "step": 10795 + }, + { + "epoch": 1.09, + "grad_norm": 29.78757054516638, + "learning_rate": 8.037404202440932e-06, + "loss": 1.1068, + "step": 10800 + }, + { + "epoch": 1.09, + "grad_norm": 9.351514786042655, + "learning_rate": 8.035073899109978e-06, + "loss": 1.055, + "step": 10805 + }, + { + "epoch": 1.09, + "grad_norm": 16.13405242140683, + "learning_rate": 8.032742551412994e-06, + "loss": 1.0259, + "step": 10810 + }, + { + "epoch": 1.09, + "grad_norm": 23.518528190560342, + "learning_rate": 8.030410160152193e-06, + "loss": 1.0932, + "step": 10815 + }, + { + "epoch": 1.09, + "grad_norm": 39.96884044404456, + "learning_rate": 8.028076726130152e-06, + "loss": 1.037, + "step": 10820 + }, + { + "epoch": 1.09, + "grad_norm": 27.79746347759069, + "learning_rate": 8.0257422501498e-06, + "loss": 1.0445, + "step": 10825 + }, + { + "epoch": 1.09, + "grad_norm": 41.14827167125988, + "learning_rate": 8.023406733014432e-06, + "loss": 1.12, + "step": 10830 + }, + { + "epoch": 1.09, + "grad_norm": 41.22766937081543, + "learning_rate": 8.021070175527693e-06, + "loss": 1.0478, + "step": 10835 + }, + { + "epoch": 1.09, + "grad_norm": 8.38343686690903, + "learning_rate": 8.018732578493593e-06, + "loss": 1.0177, + "step": 10840 + }, + { + "epoch": 1.09, + "grad_norm": 7.7279491440924115, + "learning_rate": 8.016393942716495e-06, + "loss": 1.0698, + "step": 10845 + }, + { + "epoch": 1.09, + "grad_norm": 10.603661113936377, + "learning_rate": 8.014054269001124e-06, + "loss": 1.0172, + "step": 10850 + }, + { + "epoch": 1.09, + "grad_norm": 6.832556522153212, + "learning_rate": 8.011713558152556e-06, + "loss": 1.0496, + "step": 10855 + }, + { + "epoch": 1.09, + "grad_norm": 8.44330576857254, + "learning_rate": 8.00937181097623e-06, + "loss": 1.0259, + "step": 10860 + }, + { + "epoch": 1.1, + "grad_norm": 19.961190197259683, + "learning_rate": 8.00702902827794e-06, + "loss": 1.0294, + "step": 10865 + }, + { + "epoch": 1.1, + "grad_norm": 7.772105224394243, + "learning_rate": 8.004685210863831e-06, + "loss": 1.0449, + "step": 10870 + }, + { + "epoch": 1.1, + "grad_norm": 15.284396898913977, + "learning_rate": 8.00234035954041e-06, + "loss": 1.0185, + "step": 10875 + }, + { + "epoch": 1.1, + "grad_norm": 29.785290427174154, + "learning_rate": 7.999994475114541e-06, + "loss": 0.9921, + "step": 10880 + }, + { + "epoch": 1.1, + "grad_norm": 13.357731611223983, + "learning_rate": 7.997647558393438e-06, + "loss": 1.0381, + "step": 10885 + }, + { + "epoch": 1.1, + "grad_norm": 5.406314279732902, + "learning_rate": 7.995299610184673e-06, + "loss": 1.0861, + "step": 10890 + }, + { + "epoch": 1.1, + "grad_norm": 17.625404656509023, + "learning_rate": 7.992950631296172e-06, + "loss": 1.0584, + "step": 10895 + }, + { + "epoch": 1.1, + "grad_norm": 34.69791299273681, + "learning_rate": 7.990600622536217e-06, + "loss": 1.0001, + "step": 10900 + }, + { + "epoch": 1.1, + "grad_norm": 16.127613634453, + "learning_rate": 7.988249584713447e-06, + "loss": 1.1006, + "step": 10905 + }, + { + "epoch": 1.1, + "grad_norm": 13.244903524023991, + "learning_rate": 7.985897518636847e-06, + "loss": 1.1066, + "step": 10910 + }, + { + "epoch": 1.1, + "grad_norm": 18.772660505535132, + "learning_rate": 7.983544425115763e-06, + "loss": 1.0398, + "step": 10915 + }, + { + "epoch": 1.1, + "grad_norm": 6.461138582086308, + "learning_rate": 7.981190304959891e-06, + "loss": 1.066, + "step": 10920 + }, + { + "epoch": 1.1, + "grad_norm": 26.736545490527853, + "learning_rate": 7.978835158979285e-06, + "loss": 1.0661, + "step": 10925 + }, + { + "epoch": 1.1, + "grad_norm": 18.0084969505494, + "learning_rate": 7.976478987984345e-06, + "loss": 1.02, + "step": 10930 + }, + { + "epoch": 1.1, + "grad_norm": 25.307357719882972, + "learning_rate": 7.974121792785826e-06, + "loss": 1.0413, + "step": 10935 + }, + { + "epoch": 1.1, + "grad_norm": 13.887929931927594, + "learning_rate": 7.97176357419484e-06, + "loss": 1.0631, + "step": 10940 + }, + { + "epoch": 1.1, + "grad_norm": 8.280734504239222, + "learning_rate": 7.969404333022846e-06, + "loss": 1.0711, + "step": 10945 + }, + { + "epoch": 1.1, + "grad_norm": 22.243380183936026, + "learning_rate": 7.967044070081658e-06, + "loss": 1.0771, + "step": 10950 + }, + { + "epoch": 1.1, + "grad_norm": 32.605761545212125, + "learning_rate": 7.96468278618344e-06, + "loss": 1.0772, + "step": 10955 + }, + { + "epoch": 1.11, + "grad_norm": 10.284422751807178, + "learning_rate": 7.962320482140704e-06, + "loss": 1.0444, + "step": 10960 + }, + { + "epoch": 1.11, + "grad_norm": 5.770567621626649, + "learning_rate": 7.959957158766323e-06, + "loss": 1.0371, + "step": 10965 + }, + { + "epoch": 1.11, + "grad_norm": 6.864176917653522, + "learning_rate": 7.957592816873509e-06, + "loss": 1.0122, + "step": 10970 + }, + { + "epoch": 1.11, + "grad_norm": 9.70443716107041, + "learning_rate": 7.95522745727583e-06, + "loss": 1.0528, + "step": 10975 + }, + { + "epoch": 1.11, + "grad_norm": 6.24545368926349, + "learning_rate": 7.952861080787209e-06, + "loss": 1.0377, + "step": 10980 + }, + { + "epoch": 1.11, + "grad_norm": 7.757228862705906, + "learning_rate": 7.950493688221907e-06, + "loss": 1.0669, + "step": 10985 + }, + { + "epoch": 1.11, + "grad_norm": 12.116188153519241, + "learning_rate": 7.948125280394548e-06, + "loss": 1.0545, + "step": 10990 + }, + { + "epoch": 1.11, + "grad_norm": 5.717967247580755, + "learning_rate": 7.945755858120095e-06, + "loss": 1.0256, + "step": 10995 + }, + { + "epoch": 1.11, + "grad_norm": 5.836371646522697, + "learning_rate": 7.943385422213862e-06, + "loss": 1.0441, + "step": 11000 + }, + { + "epoch": 1.11, + "grad_norm": 30.217546600081967, + "learning_rate": 7.941013973491518e-06, + "loss": 1.0654, + "step": 11005 + }, + { + "epoch": 1.11, + "grad_norm": 40.932544472644175, + "learning_rate": 7.938641512769077e-06, + "loss": 1.0572, + "step": 11010 + }, + { + "epoch": 1.11, + "grad_norm": 22.72804791336364, + "learning_rate": 7.936268040862895e-06, + "loss": 1.0351, + "step": 11015 + }, + { + "epoch": 1.11, + "grad_norm": 15.044052151674721, + "learning_rate": 7.933893558589687e-06, + "loss": 1.0369, + "step": 11020 + }, + { + "epoch": 1.11, + "grad_norm": 8.747254479015107, + "learning_rate": 7.931518066766506e-06, + "loss": 1.0475, + "step": 11025 + }, + { + "epoch": 1.11, + "grad_norm": 7.632051553686237, + "learning_rate": 7.92914156621076e-06, + "loss": 1.035, + "step": 11030 + }, + { + "epoch": 1.11, + "grad_norm": 7.1829450330708955, + "learning_rate": 7.926764057740198e-06, + "loss": 1.0905, + "step": 11035 + }, + { + "epoch": 1.11, + "grad_norm": 26.81836768913999, + "learning_rate": 7.924385542172919e-06, + "loss": 1.0201, + "step": 11040 + }, + { + "epoch": 1.11, + "grad_norm": 17.98999389891583, + "learning_rate": 7.922006020327365e-06, + "loss": 1.0039, + "step": 11045 + }, + { + "epoch": 1.11, + "grad_norm": 21.47333072000413, + "learning_rate": 7.919625493022335e-06, + "loss": 1.0052, + "step": 11050 + }, + { + "epoch": 1.11, + "grad_norm": 9.565135655710572, + "learning_rate": 7.917243961076959e-06, + "loss": 1.0425, + "step": 11055 + }, + { + "epoch": 1.12, + "grad_norm": 10.493952061391708, + "learning_rate": 7.914861425310723e-06, + "loss": 1.0527, + "step": 11060 + }, + { + "epoch": 1.12, + "grad_norm": 15.28996338406018, + "learning_rate": 7.912477886543454e-06, + "loss": 1.1012, + "step": 11065 + }, + { + "epoch": 1.12, + "grad_norm": 5.786521467272293, + "learning_rate": 7.910093345595326e-06, + "loss": 1.064, + "step": 11070 + }, + { + "epoch": 1.12, + "grad_norm": 17.638360793188944, + "learning_rate": 7.907707803286856e-06, + "loss": 1.0817, + "step": 11075 + }, + { + "epoch": 1.12, + "grad_norm": 11.579350487528567, + "learning_rate": 7.90532126043891e-06, + "loss": 1.0424, + "step": 11080 + }, + { + "epoch": 1.12, + "grad_norm": 16.179693273682382, + "learning_rate": 7.902933717872693e-06, + "loss": 1.0705, + "step": 11085 + }, + { + "epoch": 1.12, + "grad_norm": 18.410102516344413, + "learning_rate": 7.900545176409756e-06, + "loss": 1.0398, + "step": 11090 + }, + { + "epoch": 1.12, + "grad_norm": 9.49004821436449, + "learning_rate": 7.898155636871995e-06, + "loss": 0.9996, + "step": 11095 + }, + { + "epoch": 1.12, + "grad_norm": 12.33850196492309, + "learning_rate": 7.895765100081646e-06, + "loss": 1.0006, + "step": 11100 + }, + { + "epoch": 1.12, + "grad_norm": 10.626847004913168, + "learning_rate": 7.893373566861292e-06, + "loss": 1.0345, + "step": 11105 + }, + { + "epoch": 1.12, + "grad_norm": 7.446076591813217, + "learning_rate": 7.890981038033859e-06, + "loss": 1.0387, + "step": 11110 + }, + { + "epoch": 1.12, + "grad_norm": 8.060957710750417, + "learning_rate": 7.88858751442261e-06, + "loss": 1.026, + "step": 11115 + }, + { + "epoch": 1.12, + "grad_norm": 7.228268268058988, + "learning_rate": 7.886192996851157e-06, + "loss": 1.0508, + "step": 11120 + }, + { + "epoch": 1.12, + "grad_norm": 7.8075659425069, + "learning_rate": 7.88379748614345e-06, + "loss": 1.077, + "step": 11125 + }, + { + "epoch": 1.12, + "grad_norm": 6.819613988104902, + "learning_rate": 7.881400983123781e-06, + "loss": 1.0307, + "step": 11130 + }, + { + "epoch": 1.12, + "grad_norm": 9.468776605044408, + "learning_rate": 7.879003488616788e-06, + "loss": 1.0522, + "step": 11135 + }, + { + "epoch": 1.12, + "grad_norm": 6.1731817046492745, + "learning_rate": 7.876605003447443e-06, + "loss": 1.0664, + "step": 11140 + }, + { + "epoch": 1.12, + "grad_norm": 5.531644225900602, + "learning_rate": 7.874205528441066e-06, + "loss": 1.0354, + "step": 11145 + }, + { + "epoch": 1.12, + "grad_norm": 13.930059962381911, + "learning_rate": 7.871805064423308e-06, + "loss": 1.0302, + "step": 11150 + }, + { + "epoch": 1.12, + "grad_norm": 23.65862826045937, + "learning_rate": 7.869403612220174e-06, + "loss": 1.0645, + "step": 11155 + }, + { + "epoch": 1.13, + "grad_norm": 14.90337054778474, + "learning_rate": 7.867001172657995e-06, + "loss": 1.0685, + "step": 11160 + }, + { + "epoch": 1.13, + "grad_norm": 17.389451509530925, + "learning_rate": 7.864597746563453e-06, + "loss": 1.0228, + "step": 11165 + }, + { + "epoch": 1.13, + "grad_norm": 19.688390048504406, + "learning_rate": 7.862193334763562e-06, + "loss": 1.0236, + "step": 11170 + }, + { + "epoch": 1.13, + "grad_norm": 15.867171483942066, + "learning_rate": 7.859787938085677e-06, + "loss": 1.098, + "step": 11175 + }, + { + "epoch": 1.13, + "grad_norm": 9.273633495798792, + "learning_rate": 7.857381557357495e-06, + "loss": 1.0321, + "step": 11180 + }, + { + "epoch": 1.13, + "grad_norm": 30.61533719738187, + "learning_rate": 7.854974193407047e-06, + "loss": 1.0454, + "step": 11185 + }, + { + "epoch": 1.13, + "grad_norm": 18.968282250852365, + "learning_rate": 7.852565847062706e-06, + "loss": 1.0331, + "step": 11190 + }, + { + "epoch": 1.13, + "grad_norm": 28.827140133606875, + "learning_rate": 7.850156519153183e-06, + "loss": 1.024, + "step": 11195 + }, + { + "epoch": 1.13, + "grad_norm": 36.740794689535356, + "learning_rate": 7.847746210507522e-06, + "loss": 1.099, + "step": 11200 + }, + { + "epoch": 1.13, + "grad_norm": 44.494900973755506, + "learning_rate": 7.845334921955111e-06, + "loss": 1.0322, + "step": 11205 + }, + { + "epoch": 1.13, + "grad_norm": 42.45845432983005, + "learning_rate": 7.842922654325672e-06, + "loss": 1.05, + "step": 11210 + }, + { + "epoch": 1.13, + "grad_norm": 7.523642147761985, + "learning_rate": 7.84050940844926e-06, + "loss": 1.0771, + "step": 11215 + }, + { + "epoch": 1.13, + "grad_norm": 21.394059843462124, + "learning_rate": 7.838095185156276e-06, + "loss": 1.0405, + "step": 11220 + }, + { + "epoch": 1.13, + "grad_norm": 9.283677153672595, + "learning_rate": 7.83567998527745e-06, + "loss": 1.0834, + "step": 11225 + }, + { + "epoch": 1.13, + "grad_norm": 28.72774136482472, + "learning_rate": 7.833263809643848e-06, + "loss": 1.0269, + "step": 11230 + }, + { + "epoch": 1.13, + "grad_norm": 19.66660259770272, + "learning_rate": 7.830846659086876e-06, + "loss": 1.0759, + "step": 11235 + }, + { + "epoch": 1.13, + "grad_norm": 11.813658613230237, + "learning_rate": 7.82842853443827e-06, + "loss": 1.0506, + "step": 11240 + }, + { + "epoch": 1.13, + "grad_norm": 7.133952082068999, + "learning_rate": 7.826009436530109e-06, + "loss": 1.0687, + "step": 11245 + }, + { + "epoch": 1.13, + "grad_norm": 7.930521893875791, + "learning_rate": 7.823589366194799e-06, + "loss": 1.043, + "step": 11250 + }, + { + "epoch": 1.13, + "grad_norm": 9.752854297463534, + "learning_rate": 7.821168324265088e-06, + "loss": 1.0623, + "step": 11255 + }, + { + "epoch": 1.14, + "grad_norm": 17.145263152954865, + "learning_rate": 7.818746311574047e-06, + "loss": 1.0475, + "step": 11260 + }, + { + "epoch": 1.14, + "grad_norm": 7.308207081197901, + "learning_rate": 7.816323328955094e-06, + "loss": 1.0341, + "step": 11265 + }, + { + "epoch": 1.14, + "grad_norm": 5.357752176077241, + "learning_rate": 7.813899377241973e-06, + "loss": 1.0696, + "step": 11270 + }, + { + "epoch": 1.14, + "grad_norm": 35.70239091708709, + "learning_rate": 7.811474457268765e-06, + "loss": 1.0979, + "step": 11275 + }, + { + "epoch": 1.14, + "grad_norm": 9.081859081902147, + "learning_rate": 7.809048569869881e-06, + "loss": 1.018, + "step": 11280 + }, + { + "epoch": 1.14, + "grad_norm": 12.144620324513742, + "learning_rate": 7.806621715880066e-06, + "loss": 1.0702, + "step": 11285 + }, + { + "epoch": 1.14, + "grad_norm": 6.676098661137126, + "learning_rate": 7.804193896134402e-06, + "loss": 1.0519, + "step": 11290 + }, + { + "epoch": 1.14, + "grad_norm": 13.957609131873946, + "learning_rate": 7.801765111468295e-06, + "loss": 1.065, + "step": 11295 + }, + { + "epoch": 1.14, + "grad_norm": 30.804669147183866, + "learning_rate": 7.799335362717488e-06, + "loss": 1.0803, + "step": 11300 + }, + { + "epoch": 1.14, + "grad_norm": 8.972976576259954, + "learning_rate": 7.796904650718058e-06, + "loss": 1.0595, + "step": 11305 + }, + { + "epoch": 1.14, + "grad_norm": 14.42614691099274, + "learning_rate": 7.794472976306409e-06, + "loss": 1.1013, + "step": 11310 + }, + { + "epoch": 1.14, + "grad_norm": 7.3877898822812, + "learning_rate": 7.792040340319275e-06, + "loss": 1.0484, + "step": 11315 + }, + { + "epoch": 1.14, + "grad_norm": 6.5635085577917245, + "learning_rate": 7.78960674359373e-06, + "loss": 1.0075, + "step": 11320 + }, + { + "epoch": 1.14, + "grad_norm": 6.7544240823692085, + "learning_rate": 7.787172186967169e-06, + "loss": 1.0101, + "step": 11325 + }, + { + "epoch": 1.14, + "grad_norm": 12.521578068619176, + "learning_rate": 7.784736671277318e-06, + "loss": 1.0461, + "step": 11330 + }, + { + "epoch": 1.14, + "grad_norm": 7.301669315226013, + "learning_rate": 7.782300197362241e-06, + "loss": 0.9991, + "step": 11335 + }, + { + "epoch": 1.14, + "grad_norm": 10.220694186657157, + "learning_rate": 7.77986276606032e-06, + "loss": 1.0877, + "step": 11340 + }, + { + "epoch": 1.14, + "grad_norm": 7.972744230181513, + "learning_rate": 7.777424378210279e-06, + "loss": 1.0245, + "step": 11345 + }, + { + "epoch": 1.14, + "grad_norm": 15.665389124617688, + "learning_rate": 7.77498503465116e-06, + "loss": 1.0289, + "step": 11350 + }, + { + "epoch": 1.14, + "grad_norm": 22.08979678059866, + "learning_rate": 7.772544736222345e-06, + "loss": 1.0317, + "step": 11355 + }, + { + "epoch": 1.15, + "grad_norm": 26.602186669253843, + "learning_rate": 7.77010348376353e-06, + "loss": 1.024, + "step": 11360 + }, + { + "epoch": 1.15, + "grad_norm": 27.038319449125172, + "learning_rate": 7.767661278114754e-06, + "loss": 1.015, + "step": 11365 + }, + { + "epoch": 1.15, + "grad_norm": 32.94987066921108, + "learning_rate": 7.765218120116376e-06, + "loss": 1.0769, + "step": 11370 + }, + { + "epoch": 1.15, + "grad_norm": 23.66418718772051, + "learning_rate": 7.762774010609083e-06, + "loss": 1.0451, + "step": 11375 + }, + { + "epoch": 1.15, + "grad_norm": 32.45744837740459, + "learning_rate": 7.760328950433892e-06, + "loss": 1.0282, + "step": 11380 + }, + { + "epoch": 1.15, + "grad_norm": 29.766784519041305, + "learning_rate": 7.757882940432145e-06, + "loss": 1.0397, + "step": 11385 + }, + { + "epoch": 1.15, + "grad_norm": 6.7197884991075405, + "learning_rate": 7.755435981445513e-06, + "loss": 1.0311, + "step": 11390 + }, + { + "epoch": 1.15, + "grad_norm": 23.359189631838863, + "learning_rate": 7.752988074315991e-06, + "loss": 1.0214, + "step": 11395 + }, + { + "epoch": 1.15, + "grad_norm": 23.444606363008706, + "learning_rate": 7.750539219885902e-06, + "loss": 1.0268, + "step": 11400 + }, + { + "epoch": 1.15, + "grad_norm": 14.199249449692479, + "learning_rate": 7.748089418997895e-06, + "loss": 1.018, + "step": 11405 + }, + { + "epoch": 1.15, + "grad_norm": 10.347617826238135, + "learning_rate": 7.745638672494944e-06, + "loss": 1.0613, + "step": 11410 + }, + { + "epoch": 1.15, + "grad_norm": 8.417151079608356, + "learning_rate": 7.743186981220348e-06, + "loss": 1.061, + "step": 11415 + }, + { + "epoch": 1.15, + "grad_norm": 12.959055237876981, + "learning_rate": 7.740734346017733e-06, + "loss": 1.056, + "step": 11420 + }, + { + "epoch": 1.15, + "grad_norm": 15.176059141528643, + "learning_rate": 7.738280767731045e-06, + "loss": 0.9817, + "step": 11425 + }, + { + "epoch": 1.15, + "grad_norm": 6.2690785308256105, + "learning_rate": 7.73582624720456e-06, + "loss": 1.0171, + "step": 11430 + }, + { + "epoch": 1.15, + "grad_norm": 8.486811724837144, + "learning_rate": 7.733370785282879e-06, + "loss": 1.0282, + "step": 11435 + }, + { + "epoch": 1.15, + "grad_norm": 11.823763052543553, + "learning_rate": 7.730914382810919e-06, + "loss": 1.0827, + "step": 11440 + }, + { + "epoch": 1.15, + "grad_norm": 10.257250829565304, + "learning_rate": 7.72845704063393e-06, + "loss": 1.0779, + "step": 11445 + }, + { + "epoch": 1.15, + "grad_norm": 9.893179851041156, + "learning_rate": 7.725998759597478e-06, + "loss": 1.0631, + "step": 11450 + }, + { + "epoch": 1.15, + "grad_norm": 5.05310239761301, + "learning_rate": 7.723539540547455e-06, + "loss": 1.087, + "step": 11455 + }, + { + "epoch": 1.16, + "grad_norm": 7.568081314755192, + "learning_rate": 7.72107938433008e-06, + "loss": 1.0407, + "step": 11460 + }, + { + "epoch": 1.16, + "grad_norm": 12.30451232867941, + "learning_rate": 7.718618291791887e-06, + "loss": 1.0147, + "step": 11465 + }, + { + "epoch": 1.16, + "grad_norm": 5.581416562349123, + "learning_rate": 7.716156263779736e-06, + "loss": 1.0496, + "step": 11470 + }, + { + "epoch": 1.16, + "grad_norm": 18.265076701434122, + "learning_rate": 7.71369330114081e-06, + "loss": 1.0413, + "step": 11475 + }, + { + "epoch": 1.16, + "grad_norm": 6.449712305653418, + "learning_rate": 7.71122940472261e-06, + "loss": 1.0169, + "step": 11480 + }, + { + "epoch": 1.16, + "grad_norm": 6.470154640544608, + "learning_rate": 7.708764575372962e-06, + "loss": 1.0046, + "step": 11485 + }, + { + "epoch": 1.16, + "grad_norm": 11.209267270155115, + "learning_rate": 7.70629881394001e-06, + "loss": 1.0392, + "step": 11490 + }, + { + "epoch": 1.16, + "grad_norm": 14.664536173796234, + "learning_rate": 7.703832121272221e-06, + "loss": 1.0161, + "step": 11495 + }, + { + "epoch": 1.16, + "grad_norm": 6.939112700472921, + "learning_rate": 7.701364498218381e-06, + "loss": 1.0694, + "step": 11500 + }, + { + "epoch": 1.16, + "grad_norm": 17.043229302231182, + "learning_rate": 7.698895945627597e-06, + "loss": 0.9974, + "step": 11505 + }, + { + "epoch": 1.16, + "grad_norm": 23.990478258309913, + "learning_rate": 7.696426464349299e-06, + "loss": 1.0676, + "step": 11510 + }, + { + "epoch": 1.16, + "grad_norm": 25.65013648668628, + "learning_rate": 7.693956055233227e-06, + "loss": 1.068, + "step": 11515 + }, + { + "epoch": 1.16, + "grad_norm": 10.336450698290124, + "learning_rate": 7.69148471912945e-06, + "loss": 1.0261, + "step": 11520 + }, + { + "epoch": 1.16, + "grad_norm": 7.209808655252218, + "learning_rate": 7.689012456888352e-06, + "loss": 1.0366, + "step": 11525 + }, + { + "epoch": 1.16, + "grad_norm": 8.653020039541587, + "learning_rate": 7.686539269360636e-06, + "loss": 1.0248, + "step": 11530 + }, + { + "epoch": 1.16, + "grad_norm": 21.89442345749155, + "learning_rate": 7.684065157397323e-06, + "loss": 1.0303, + "step": 11535 + }, + { + "epoch": 1.16, + "grad_norm": 13.183307252692156, + "learning_rate": 7.681590121849754e-06, + "loss": 1.0576, + "step": 11540 + }, + { + "epoch": 1.16, + "grad_norm": 10.55522118711657, + "learning_rate": 7.679114163569584e-06, + "loss": 0.9881, + "step": 11545 + }, + { + "epoch": 1.16, + "grad_norm": 8.645009341875197, + "learning_rate": 7.67663728340879e-06, + "loss": 1.0538, + "step": 11550 + }, + { + "epoch": 1.16, + "grad_norm": 10.288234650084943, + "learning_rate": 7.674159482219663e-06, + "loss": 1.0916, + "step": 11555 + }, + { + "epoch": 1.17, + "grad_norm": 6.323944478111165, + "learning_rate": 7.671680760854812e-06, + "loss": 1.0389, + "step": 11560 + }, + { + "epoch": 1.17, + "grad_norm": 10.28682974878779, + "learning_rate": 7.669201120167164e-06, + "loss": 1.0077, + "step": 11565 + }, + { + "epoch": 1.17, + "grad_norm": 12.965117672513008, + "learning_rate": 7.666720561009959e-06, + "loss": 1.0556, + "step": 11570 + }, + { + "epoch": 1.17, + "grad_norm": 12.879913896859524, + "learning_rate": 7.664239084236756e-06, + "loss": 1.0431, + "step": 11575 + }, + { + "epoch": 1.17, + "grad_norm": 10.09897871614661, + "learning_rate": 7.661756690701429e-06, + "loss": 1.0235, + "step": 11580 + }, + { + "epoch": 1.17, + "grad_norm": 6.193103071161919, + "learning_rate": 7.659273381258165e-06, + "loss": 1.0652, + "step": 11585 + }, + { + "epoch": 1.17, + "grad_norm": 36.002717652272395, + "learning_rate": 7.656789156761473e-06, + "loss": 1.0214, + "step": 11590 + }, + { + "epoch": 1.17, + "grad_norm": 27.389027634831674, + "learning_rate": 7.654304018066169e-06, + "loss": 1.0712, + "step": 11595 + }, + { + "epoch": 1.17, + "grad_norm": 29.295790763413898, + "learning_rate": 7.651817966027387e-06, + "loss": 1.0563, + "step": 11600 + }, + { + "epoch": 1.17, + "grad_norm": 78.5986973673433, + "learning_rate": 7.649331001500576e-06, + "loss": 1.0787, + "step": 11605 + }, + { + "epoch": 1.17, + "grad_norm": 20.747085214926916, + "learning_rate": 7.646843125341498e-06, + "loss": 1.0394, + "step": 11610 + }, + { + "epoch": 1.17, + "grad_norm": 42.82699540343177, + "learning_rate": 7.644354338406229e-06, + "loss": 1.0461, + "step": 11615 + }, + { + "epoch": 1.17, + "grad_norm": 38.36576292944948, + "learning_rate": 7.641864641551157e-06, + "loss": 1.0601, + "step": 11620 + }, + { + "epoch": 1.17, + "grad_norm": 42.49809837717228, + "learning_rate": 7.639374035632984e-06, + "loss": 1.0101, + "step": 11625 + }, + { + "epoch": 1.17, + "grad_norm": 21.982321503938692, + "learning_rate": 7.636882521508728e-06, + "loss": 1.0013, + "step": 11630 + }, + { + "epoch": 1.17, + "grad_norm": 31.605933081763478, + "learning_rate": 7.634390100035716e-06, + "loss": 1.0223, + "step": 11635 + }, + { + "epoch": 1.17, + "grad_norm": 11.527420466489241, + "learning_rate": 7.631896772071585e-06, + "loss": 1.0067, + "step": 11640 + }, + { + "epoch": 1.17, + "grad_norm": 57.497324855910215, + "learning_rate": 7.629402538474288e-06, + "loss": 1.0984, + "step": 11645 + }, + { + "epoch": 1.17, + "grad_norm": 70.73824629974315, + "learning_rate": 7.626907400102089e-06, + "loss": 1.0807, + "step": 11650 + }, + { + "epoch": 1.18, + "grad_norm": 38.39492524899637, + "learning_rate": 7.624411357813564e-06, + "loss": 1.021, + "step": 11655 + }, + { + "epoch": 1.18, + "grad_norm": 23.23834490509645, + "learning_rate": 7.621914412467597e-06, + "loss": 1.0724, + "step": 11660 + }, + { + "epoch": 1.18, + "grad_norm": 13.626825151703164, + "learning_rate": 7.6194165649233834e-06, + "loss": 1.0526, + "step": 11665 + }, + { + "epoch": 1.18, + "grad_norm": 13.050704659349623, + "learning_rate": 7.616917816040433e-06, + "loss": 1.058, + "step": 11670 + }, + { + "epoch": 1.18, + "grad_norm": 25.018079860661217, + "learning_rate": 7.614418166678562e-06, + "loss": 1.0797, + "step": 11675 + }, + { + "epoch": 1.18, + "grad_norm": 19.988606047033, + "learning_rate": 7.611917617697896e-06, + "loss": 1.0386, + "step": 11680 + }, + { + "epoch": 1.18, + "grad_norm": 8.389834101700366, + "learning_rate": 7.609416169958872e-06, + "loss": 1.0451, + "step": 11685 + }, + { + "epoch": 1.18, + "grad_norm": 14.369819552441536, + "learning_rate": 7.606913824322238e-06, + "loss": 1.013, + "step": 11690 + }, + { + "epoch": 1.18, + "grad_norm": 14.873428629131478, + "learning_rate": 7.604410581649045e-06, + "loss": 1.0511, + "step": 11695 + }, + { + "epoch": 1.18, + "grad_norm": 6.226147562203659, + "learning_rate": 7.601906442800658e-06, + "loss": 1.0711, + "step": 11700 + }, + { + "epoch": 1.18, + "grad_norm": 13.70917967690263, + "learning_rate": 7.599401408638751e-06, + "loss": 1.0148, + "step": 11705 + }, + { + "epoch": 1.18, + "grad_norm": 6.286083740875113, + "learning_rate": 7.5968954800253015e-06, + "loss": 1.0611, + "step": 11710 + }, + { + "epoch": 1.18, + "grad_norm": 15.715994147646205, + "learning_rate": 7.594388657822596e-06, + "loss": 1.0474, + "step": 11715 + }, + { + "epoch": 1.18, + "grad_norm": 8.514305533684258, + "learning_rate": 7.591880942893234e-06, + "loss": 1.0829, + "step": 11720 + }, + { + "epoch": 1.18, + "grad_norm": 13.13120395760116, + "learning_rate": 7.5893723361001135e-06, + "loss": 1.0444, + "step": 11725 + }, + { + "epoch": 1.18, + "grad_norm": 11.178578396505047, + "learning_rate": 7.586862838306445e-06, + "loss": 1.0816, + "step": 11730 + }, + { + "epoch": 1.18, + "grad_norm": 12.746854313765677, + "learning_rate": 7.5843524503757445e-06, + "loss": 0.9941, + "step": 11735 + }, + { + "epoch": 1.18, + "grad_norm": 11.078364992056647, + "learning_rate": 7.581841173171835e-06, + "loss": 1.0006, + "step": 11740 + }, + { + "epoch": 1.18, + "grad_norm": 12.740739798927343, + "learning_rate": 7.579329007558843e-06, + "loss": 1.0418, + "step": 11745 + }, + { + "epoch": 1.18, + "grad_norm": 6.539722370722517, + "learning_rate": 7.576815954401204e-06, + "loss": 1.0112, + "step": 11750 + }, + { + "epoch": 1.19, + "grad_norm": 5.299343963705468, + "learning_rate": 7.574302014563656e-06, + "loss": 1.0479, + "step": 11755 + }, + { + "epoch": 1.19, + "grad_norm": 29.875561566756147, + "learning_rate": 7.571787188911243e-06, + "loss": 1.0701, + "step": 11760 + }, + { + "epoch": 1.19, + "grad_norm": 25.05908382103154, + "learning_rate": 7.569271478309315e-06, + "loss": 1.0341, + "step": 11765 + }, + { + "epoch": 1.19, + "grad_norm": 15.601761644761815, + "learning_rate": 7.566754883623525e-06, + "loss": 1.0087, + "step": 11770 + }, + { + "epoch": 1.19, + "grad_norm": 17.813865904730815, + "learning_rate": 7.564237405719831e-06, + "loss": 1.004, + "step": 11775 + }, + { + "epoch": 1.19, + "grad_norm": 17.25236748256092, + "learning_rate": 7.561719045464495e-06, + "loss": 1.081, + "step": 11780 + }, + { + "epoch": 1.19, + "grad_norm": 27.333124687407285, + "learning_rate": 7.5591998037240825e-06, + "loss": 1.0157, + "step": 11785 + }, + { + "epoch": 1.19, + "grad_norm": 22.089282057926013, + "learning_rate": 7.556679681365462e-06, + "loss": 1.0206, + "step": 11790 + }, + { + "epoch": 1.19, + "grad_norm": 39.214568937531084, + "learning_rate": 7.554158679255805e-06, + "loss": 1.0419, + "step": 11795 + }, + { + "epoch": 1.19, + "grad_norm": 26.746819599838243, + "learning_rate": 7.551636798262585e-06, + "loss": 1.0797, + "step": 11800 + }, + { + "epoch": 1.19, + "grad_norm": 25.734360047448632, + "learning_rate": 7.549114039253581e-06, + "loss": 1.0184, + "step": 11805 + }, + { + "epoch": 1.19, + "grad_norm": 45.72851511737219, + "learning_rate": 7.546590403096871e-06, + "loss": 1.0573, + "step": 11810 + }, + { + "epoch": 1.19, + "grad_norm": 9.937427370671701, + "learning_rate": 7.544065890660834e-06, + "loss": 1.0428, + "step": 11815 + }, + { + "epoch": 1.19, + "grad_norm": 56.33537705264059, + "learning_rate": 7.541540502814154e-06, + "loss": 1.0469, + "step": 11820 + }, + { + "epoch": 1.19, + "grad_norm": 33.71330192354937, + "learning_rate": 7.539014240425816e-06, + "loss": 1.0631, + "step": 11825 + }, + { + "epoch": 1.19, + "grad_norm": 8.407018181834788, + "learning_rate": 7.536487104365102e-06, + "loss": 1.0612, + "step": 11830 + }, + { + "epoch": 1.19, + "grad_norm": 29.552370346206306, + "learning_rate": 7.533959095501597e-06, + "loss": 1.0555, + "step": 11835 + }, + { + "epoch": 1.19, + "grad_norm": 9.965135513648303, + "learning_rate": 7.531430214705189e-06, + "loss": 1.0246, + "step": 11840 + }, + { + "epoch": 1.19, + "grad_norm": 37.11431532290853, + "learning_rate": 7.528900462846062e-06, + "loss": 1.0686, + "step": 11845 + }, + { + "epoch": 1.19, + "grad_norm": 11.840035222559349, + "learning_rate": 7.526369840794699e-06, + "loss": 1.0106, + "step": 11850 + }, + { + "epoch": 1.2, + "grad_norm": 6.281527677159895, + "learning_rate": 7.523838349421889e-06, + "loss": 1.028, + "step": 11855 + }, + { + "epoch": 1.2, + "grad_norm": 12.020245250646116, + "learning_rate": 7.5213059895987124e-06, + "loss": 1.0179, + "step": 11860 + }, + { + "epoch": 1.2, + "grad_norm": 10.354434431670976, + "learning_rate": 7.518772762196553e-06, + "loss": 1.0332, + "step": 11865 + }, + { + "epoch": 1.2, + "grad_norm": 25.30003234851166, + "learning_rate": 7.516238668087092e-06, + "loss": 1.0493, + "step": 11870 + }, + { + "epoch": 1.2, + "grad_norm": 14.778100727598297, + "learning_rate": 7.513703708142309e-06, + "loss": 1.0585, + "step": 11875 + }, + { + "epoch": 1.2, + "grad_norm": 5.594776106824048, + "learning_rate": 7.5111678832344806e-06, + "loss": 1.0452, + "step": 11880 + }, + { + "epoch": 1.2, + "grad_norm": 19.96266800576846, + "learning_rate": 7.508631194236182e-06, + "loss": 1.0303, + "step": 11885 + }, + { + "epoch": 1.2, + "grad_norm": 10.193457442652853, + "learning_rate": 7.506093642020286e-06, + "loss": 1.0045, + "step": 11890 + }, + { + "epoch": 1.2, + "grad_norm": 7.817856979567341, + "learning_rate": 7.50355522745996e-06, + "loss": 1.02, + "step": 11895 + }, + { + "epoch": 1.2, + "grad_norm": 14.320404713505287, + "learning_rate": 7.501015951428673e-06, + "loss": 1.0394, + "step": 11900 + }, + { + "epoch": 1.2, + "grad_norm": 5.896707491370909, + "learning_rate": 7.498475814800184e-06, + "loss": 1.0197, + "step": 11905 + }, + { + "epoch": 1.2, + "grad_norm": 23.15339531944063, + "learning_rate": 7.495934818448555e-06, + "loss": 1.0546, + "step": 11910 + }, + { + "epoch": 1.2, + "grad_norm": 21.27469954026058, + "learning_rate": 7.493392963248138e-06, + "loss": 1.0375, + "step": 11915 + }, + { + "epoch": 1.2, + "grad_norm": 18.41665952623711, + "learning_rate": 7.490850250073585e-06, + "loss": 1.0324, + "step": 11920 + }, + { + "epoch": 1.2, + "grad_norm": 20.31198194365891, + "learning_rate": 7.488306679799838e-06, + "loss": 1.0249, + "step": 11925 + }, + { + "epoch": 1.2, + "grad_norm": 7.6231025150440415, + "learning_rate": 7.4857622533021415e-06, + "loss": 1.014, + "step": 11930 + }, + { + "epoch": 1.2, + "grad_norm": 16.329948465900326, + "learning_rate": 7.483216971456027e-06, + "loss": 1.0423, + "step": 11935 + }, + { + "epoch": 1.2, + "grad_norm": 10.185718359298546, + "learning_rate": 7.480670835137324e-06, + "loss": 1.0605, + "step": 11940 + }, + { + "epoch": 1.2, + "grad_norm": 11.293068193481355, + "learning_rate": 7.478123845222155e-06, + "loss": 0.9933, + "step": 11945 + }, + { + "epoch": 1.2, + "grad_norm": 21.82002169605213, + "learning_rate": 7.47557600258694e-06, + "loss": 1.0965, + "step": 11950 + }, + { + "epoch": 1.21, + "grad_norm": 21.896520825672138, + "learning_rate": 7.4730273081083845e-06, + "loss": 1.0227, + "step": 11955 + }, + { + "epoch": 1.21, + "grad_norm": 14.134215363044147, + "learning_rate": 7.470477762663495e-06, + "loss": 0.9876, + "step": 11960 + }, + { + "epoch": 1.21, + "grad_norm": 5.438427314214314, + "learning_rate": 7.467927367129568e-06, + "loss": 1.0533, + "step": 11965 + }, + { + "epoch": 1.21, + "grad_norm": 6.797106966731714, + "learning_rate": 7.4653761223841906e-06, + "loss": 1.0289, + "step": 11970 + }, + { + "epoch": 1.21, + "grad_norm": 6.665798452998185, + "learning_rate": 7.462824029305243e-06, + "loss": 1.0613, + "step": 11975 + }, + { + "epoch": 1.21, + "grad_norm": 7.152409531921233, + "learning_rate": 7.460271088770902e-06, + "loss": 1.0811, + "step": 11980 + }, + { + "epoch": 1.21, + "grad_norm": 12.889424672209163, + "learning_rate": 7.457717301659626e-06, + "loss": 1.0123, + "step": 11985 + }, + { + "epoch": 1.21, + "grad_norm": 14.443733425576188, + "learning_rate": 7.455162668850175e-06, + "loss": 1.0179, + "step": 11990 + }, + { + "epoch": 1.21, + "grad_norm": 9.578724563179945, + "learning_rate": 7.452607191221596e-06, + "loss": 1.043, + "step": 11995 + }, + { + "epoch": 1.21, + "grad_norm": 20.593865569307333, + "learning_rate": 7.450050869653224e-06, + "loss": 0.9908, + "step": 12000 + }, + { + "epoch": 1.21, + "grad_norm": 42.410456039081524, + "learning_rate": 7.4474937050246865e-06, + "loss": 1.0648, + "step": 12005 + }, + { + "epoch": 1.21, + "grad_norm": 32.589676758330064, + "learning_rate": 7.444935698215905e-06, + "loss": 1.0305, + "step": 12010 + }, + { + "epoch": 1.21, + "grad_norm": 16.916616819955006, + "learning_rate": 7.442376850107083e-06, + "loss": 1.0317, + "step": 12015 + }, + { + "epoch": 1.21, + "grad_norm": 6.563970452745615, + "learning_rate": 7.439817161578721e-06, + "loss": 1.0074, + "step": 12020 + }, + { + "epoch": 1.21, + "grad_norm": 14.369544319819505, + "learning_rate": 7.437256633511603e-06, + "loss": 1.0282, + "step": 12025 + }, + { + "epoch": 1.21, + "grad_norm": 81.02214877177478, + "learning_rate": 7.434695266786807e-06, + "loss": 1.0539, + "step": 12030 + }, + { + "epoch": 1.21, + "grad_norm": 63.362129758959256, + "learning_rate": 7.432133062285693e-06, + "loss": 1.0339, + "step": 12035 + }, + { + "epoch": 1.21, + "grad_norm": 18.466818473382034, + "learning_rate": 7.429570020889916e-06, + "loss": 1.123, + "step": 12040 + }, + { + "epoch": 1.21, + "grad_norm": 117.99242146591413, + "learning_rate": 7.4270061434814156e-06, + "loss": 1.1451, + "step": 12045 + }, + { + "epoch": 1.21, + "grad_norm": 26.428118691401227, + "learning_rate": 7.424441430942418e-06, + "loss": 1.062, + "step": 12050 + }, + { + "epoch": 1.22, + "grad_norm": 61.68523063344123, + "learning_rate": 7.421875884155442e-06, + "loss": 1.0528, + "step": 12055 + }, + { + "epoch": 1.22, + "grad_norm": 16.310893704321423, + "learning_rate": 7.419309504003286e-06, + "loss": 1.009, + "step": 12060 + }, + { + "epoch": 1.22, + "grad_norm": 12.134186371943398, + "learning_rate": 7.416742291369041e-06, + "loss": 1.092, + "step": 12065 + }, + { + "epoch": 1.22, + "grad_norm": 8.07378676429573, + "learning_rate": 7.414174247136081e-06, + "loss": 1.0399, + "step": 12070 + }, + { + "epoch": 1.22, + "grad_norm": 8.895453691924525, + "learning_rate": 7.411605372188068e-06, + "loss": 1.057, + "step": 12075 + }, + { + "epoch": 1.22, + "grad_norm": 11.816493232257297, + "learning_rate": 7.409035667408951e-06, + "loss": 1.0485, + "step": 12080 + }, + { + "epoch": 1.22, + "grad_norm": 10.004634418174367, + "learning_rate": 7.406465133682961e-06, + "loss": 1.0245, + "step": 12085 + }, + { + "epoch": 1.22, + "grad_norm": 10.262160758749364, + "learning_rate": 7.403893771894618e-06, + "loss": 1.0168, + "step": 12090 + }, + { + "epoch": 1.22, + "grad_norm": 83.46486853809938, + "learning_rate": 7.401321582928722e-06, + "loss": 1.0558, + "step": 12095 + }, + { + "epoch": 1.22, + "grad_norm": 45.59415566925905, + "learning_rate": 7.398748567670364e-06, + "loss": 1.0833, + "step": 12100 + }, + { + "epoch": 1.22, + "grad_norm": 10.413525481182806, + "learning_rate": 7.396174727004915e-06, + "loss": 1.0297, + "step": 12105 + }, + { + "epoch": 1.22, + "grad_norm": 36.060354029203495, + "learning_rate": 7.393600061818031e-06, + "loss": 0.9854, + "step": 12110 + }, + { + "epoch": 1.22, + "grad_norm": 8.742621477803867, + "learning_rate": 7.391024572995652e-06, + "loss": 1.0082, + "step": 12115 + }, + { + "epoch": 1.22, + "grad_norm": 9.899628942463798, + "learning_rate": 7.388448261424002e-06, + "loss": 1.026, + "step": 12120 + }, + { + "epoch": 1.22, + "grad_norm": 8.47051525142764, + "learning_rate": 7.385871127989584e-06, + "loss": 1.0435, + "step": 12125 + }, + { + "epoch": 1.22, + "grad_norm": 8.032898832474729, + "learning_rate": 7.383293173579192e-06, + "loss": 1.0847, + "step": 12130 + }, + { + "epoch": 1.22, + "grad_norm": 8.214473601243663, + "learning_rate": 7.380714399079894e-06, + "loss": 1.0917, + "step": 12135 + }, + { + "epoch": 1.22, + "grad_norm": 7.741330271398156, + "learning_rate": 7.378134805379046e-06, + "loss": 1.0164, + "step": 12140 + }, + { + "epoch": 1.22, + "grad_norm": 7.721926747859045, + "learning_rate": 7.375554393364281e-06, + "loss": 1.0236, + "step": 12145 + }, + { + "epoch": 1.22, + "grad_norm": 18.241684309958316, + "learning_rate": 7.372973163923521e-06, + "loss": 1.0521, + "step": 12150 + }, + { + "epoch": 1.23, + "grad_norm": 7.768640195269594, + "learning_rate": 7.37039111794496e-06, + "loss": 1.0152, + "step": 12155 + }, + { + "epoch": 1.23, + "grad_norm": 16.11180816202643, + "learning_rate": 7.3678082563170795e-06, + "loss": 1.0445, + "step": 12160 + }, + { + "epoch": 1.23, + "grad_norm": 13.321856476735322, + "learning_rate": 7.36522457992864e-06, + "loss": 1.0724, + "step": 12165 + }, + { + "epoch": 1.23, + "grad_norm": 6.679558986447608, + "learning_rate": 7.36264008966868e-06, + "loss": 1.0118, + "step": 12170 + }, + { + "epoch": 1.23, + "grad_norm": 9.763045702610519, + "learning_rate": 7.360054786426523e-06, + "loss": 1.0341, + "step": 12175 + }, + { + "epoch": 1.23, + "grad_norm": 5.664202828772289, + "learning_rate": 7.357468671091769e-06, + "loss": 1.0163, + "step": 12180 + }, + { + "epoch": 1.23, + "grad_norm": 8.474044749820074, + "learning_rate": 7.354881744554295e-06, + "loss": 1.0537, + "step": 12185 + }, + { + "epoch": 1.23, + "grad_norm": 7.806941862617317, + "learning_rate": 7.352294007704264e-06, + "loss": 1.0285, + "step": 12190 + }, + { + "epoch": 1.23, + "grad_norm": 11.439091677523939, + "learning_rate": 7.349705461432111e-06, + "loss": 1.0277, + "step": 12195 + }, + { + "epoch": 1.23, + "grad_norm": 5.9110356966133075, + "learning_rate": 7.347116106628552e-06, + "loss": 1.0517, + "step": 12200 + }, + { + "epoch": 1.23, + "grad_norm": 21.628650555715147, + "learning_rate": 7.344525944184583e-06, + "loss": 1.0052, + "step": 12205 + }, + { + "epoch": 1.23, + "grad_norm": 16.59051396975955, + "learning_rate": 7.341934974991475e-06, + "loss": 1.0597, + "step": 12210 + }, + { + "epoch": 1.23, + "grad_norm": 12.506905837854545, + "learning_rate": 7.339343199940781e-06, + "loss": 1.0487, + "step": 12215 + }, + { + "epoch": 1.23, + "grad_norm": 29.591091767578742, + "learning_rate": 7.336750619924324e-06, + "loss": 1.0879, + "step": 12220 + }, + { + "epoch": 1.23, + "grad_norm": 18.500775694028768, + "learning_rate": 7.3341572358342106e-06, + "loss": 0.9942, + "step": 12225 + }, + { + "epoch": 1.23, + "grad_norm": 9.698100785554281, + "learning_rate": 7.331563048562823e-06, + "loss": 0.995, + "step": 12230 + }, + { + "epoch": 1.23, + "grad_norm": 8.678619222102471, + "learning_rate": 7.328968059002816e-06, + "loss": 1.0036, + "step": 12235 + }, + { + "epoch": 1.23, + "grad_norm": 21.717058220261013, + "learning_rate": 7.326372268047126e-06, + "loss": 1.0141, + "step": 12240 + }, + { + "epoch": 1.23, + "grad_norm": 32.58218285532491, + "learning_rate": 7.32377567658896e-06, + "loss": 1.0866, + "step": 12245 + }, + { + "epoch": 1.24, + "grad_norm": 22.211976172679357, + "learning_rate": 7.321178285521803e-06, + "loss": 1.048, + "step": 12250 + }, + { + "epoch": 1.24, + "grad_norm": 8.174005536221587, + "learning_rate": 7.318580095739414e-06, + "loss": 1.0712, + "step": 12255 + }, + { + "epoch": 1.24, + "grad_norm": 8.402187876841412, + "learning_rate": 7.31598110813583e-06, + "loss": 1.0803, + "step": 12260 + }, + { + "epoch": 1.24, + "grad_norm": 12.961387598255712, + "learning_rate": 7.313381323605358e-06, + "loss": 1.0511, + "step": 12265 + }, + { + "epoch": 1.24, + "grad_norm": 10.117317501652218, + "learning_rate": 7.310780743042582e-06, + "loss": 1.0348, + "step": 12270 + }, + { + "epoch": 1.24, + "grad_norm": 14.799743957011334, + "learning_rate": 7.30817936734236e-06, + "loss": 1.0166, + "step": 12275 + }, + { + "epoch": 1.24, + "grad_norm": 6.5633985549972955, + "learning_rate": 7.30557719739982e-06, + "loss": 1.0362, + "step": 12280 + }, + { + "epoch": 1.24, + "grad_norm": 9.630547954930222, + "learning_rate": 7.302974234110371e-06, + "loss": 1.047, + "step": 12285 + }, + { + "epoch": 1.24, + "grad_norm": 9.977244441347235, + "learning_rate": 7.300370478369687e-06, + "loss": 1.0569, + "step": 12290 + }, + { + "epoch": 1.24, + "grad_norm": 12.767745576845352, + "learning_rate": 7.297765931073718e-06, + "loss": 1.0362, + "step": 12295 + }, + { + "epoch": 1.24, + "grad_norm": 15.83769986316688, + "learning_rate": 7.295160593118687e-06, + "loss": 1.0446, + "step": 12300 + }, + { + "epoch": 1.24, + "grad_norm": 7.351824272242793, + "learning_rate": 7.2925544654010885e-06, + "loss": 1.0334, + "step": 12305 + }, + { + "epoch": 1.24, + "grad_norm": 8.969839115109984, + "learning_rate": 7.289947548817687e-06, + "loss": 1.0491, + "step": 12310 + }, + { + "epoch": 1.24, + "grad_norm": 17.136217622195936, + "learning_rate": 7.287339844265522e-06, + "loss": 1.0099, + "step": 12315 + }, + { + "epoch": 1.24, + "grad_norm": 7.247329863483515, + "learning_rate": 7.284731352641903e-06, + "loss": 1.0323, + "step": 12320 + }, + { + "epoch": 1.24, + "grad_norm": 7.78098142765102, + "learning_rate": 7.282122074844404e-06, + "loss": 1.0202, + "step": 12325 + }, + { + "epoch": 1.24, + "grad_norm": 9.18913385981386, + "learning_rate": 7.2795120117708804e-06, + "loss": 1.0699, + "step": 12330 + }, + { + "epoch": 1.24, + "grad_norm": 14.796562129763549, + "learning_rate": 7.276901164319452e-06, + "loss": 1.0205, + "step": 12335 + }, + { + "epoch": 1.24, + "grad_norm": 11.081581545913867, + "learning_rate": 7.274289533388505e-06, + "loss": 0.9852, + "step": 12340 + }, + { + "epoch": 1.24, + "grad_norm": 5.599993939832297, + "learning_rate": 7.2716771198767035e-06, + "loss": 1.0266, + "step": 12345 + }, + { + "epoch": 1.25, + "grad_norm": 12.122890189257696, + "learning_rate": 7.269063924682974e-06, + "loss": 1.001, + "step": 12350 + }, + { + "epoch": 1.25, + "grad_norm": 8.483029398431478, + "learning_rate": 7.266449948706517e-06, + "loss": 1.0569, + "step": 12355 + }, + { + "epoch": 1.25, + "grad_norm": 6.532734085560354, + "learning_rate": 7.263835192846795e-06, + "loss": 1.0619, + "step": 12360 + }, + { + "epoch": 1.25, + "grad_norm": 10.057663962553224, + "learning_rate": 7.2612196580035465e-06, + "loss": 1.0185, + "step": 12365 + }, + { + "epoch": 1.25, + "grad_norm": 6.063995399514143, + "learning_rate": 7.258603345076773e-06, + "loss": 1.0223, + "step": 12370 + }, + { + "epoch": 1.25, + "grad_norm": 20.492031391508416, + "learning_rate": 7.255986254966747e-06, + "loss": 1.0152, + "step": 12375 + }, + { + "epoch": 1.25, + "grad_norm": 10.980889415213772, + "learning_rate": 7.253368388574004e-06, + "loss": 1.0853, + "step": 12380 + }, + { + "epoch": 1.25, + "grad_norm": 6.932420119536721, + "learning_rate": 7.2507497467993515e-06, + "loss": 1.0281, + "step": 12385 + }, + { + "epoch": 1.25, + "grad_norm": 17.442263891628073, + "learning_rate": 7.24813033054386e-06, + "loss": 1.0653, + "step": 12390 + }, + { + "epoch": 1.25, + "grad_norm": 19.04890079244909, + "learning_rate": 7.2455101407088694e-06, + "loss": 1.0582, + "step": 12395 + }, + { + "epoch": 1.25, + "grad_norm": 10.76606447908117, + "learning_rate": 7.242889178195984e-06, + "loss": 1.0594, + "step": 12400 + }, + { + "epoch": 1.25, + "grad_norm": 20.407517494631122, + "learning_rate": 7.240267443907074e-06, + "loss": 1.0406, + "step": 12405 + }, + { + "epoch": 1.25, + "grad_norm": 6.844117091500154, + "learning_rate": 7.237644938744277e-06, + "loss": 1.028, + "step": 12410 + }, + { + "epoch": 1.25, + "grad_norm": 5.081519520518418, + "learning_rate": 7.235021663609995e-06, + "loss": 1.04, + "step": 12415 + }, + { + "epoch": 1.25, + "grad_norm": 10.90657997718108, + "learning_rate": 7.232397619406891e-06, + "loss": 1.0339, + "step": 12420 + }, + { + "epoch": 1.25, + "grad_norm": 6.055211671272745, + "learning_rate": 7.2297728070379e-06, + "loss": 1.0916, + "step": 12425 + }, + { + "epoch": 1.25, + "grad_norm": 5.292463026715902, + "learning_rate": 7.227147227406215e-06, + "loss": 1.119, + "step": 12430 + }, + { + "epoch": 1.25, + "grad_norm": 8.40782131860608, + "learning_rate": 7.224520881415296e-06, + "loss": 1.0729, + "step": 12435 + }, + { + "epoch": 1.25, + "grad_norm": 9.861234186251867, + "learning_rate": 7.221893769968866e-06, + "loss": 1.0636, + "step": 12440 + }, + { + "epoch": 1.25, + "grad_norm": 5.435769423839574, + "learning_rate": 7.219265893970913e-06, + "loss": 1.0333, + "step": 12445 + }, + { + "epoch": 1.26, + "grad_norm": 10.968760917967472, + "learning_rate": 7.216637254325685e-06, + "loss": 1.0419, + "step": 12450 + }, + { + "epoch": 1.26, + "grad_norm": 10.53215039683486, + "learning_rate": 7.214007851937696e-06, + "loss": 1.0314, + "step": 12455 + }, + { + "epoch": 1.26, + "grad_norm": 12.877044861260606, + "learning_rate": 7.21137768771172e-06, + "loss": 1.019, + "step": 12460 + }, + { + "epoch": 1.26, + "grad_norm": 9.287028573377304, + "learning_rate": 7.208746762552792e-06, + "loss": 0.9964, + "step": 12465 + }, + { + "epoch": 1.26, + "grad_norm": 11.675474516635187, + "learning_rate": 7.206115077366211e-06, + "loss": 0.9874, + "step": 12470 + }, + { + "epoch": 1.26, + "grad_norm": 6.109227137571904, + "learning_rate": 7.203482633057542e-06, + "loss": 1.0754, + "step": 12475 + }, + { + "epoch": 1.26, + "grad_norm": 7.617850345781858, + "learning_rate": 7.200849430532603e-06, + "loss": 1.0351, + "step": 12480 + }, + { + "epoch": 1.26, + "grad_norm": 9.918311738342016, + "learning_rate": 7.198215470697476e-06, + "loss": 1.0259, + "step": 12485 + }, + { + "epoch": 1.26, + "grad_norm": 12.622793750849116, + "learning_rate": 7.195580754458506e-06, + "loss": 1.0338, + "step": 12490 + }, + { + "epoch": 1.26, + "grad_norm": 10.284078241578221, + "learning_rate": 7.192945282722295e-06, + "loss": 0.9951, + "step": 12495 + }, + { + "epoch": 1.26, + "grad_norm": 5.934648099905459, + "learning_rate": 7.190309056395707e-06, + "loss": 1.0494, + "step": 12500 + }, + { + "epoch": 1.26, + "grad_norm": 27.260267090753327, + "learning_rate": 7.187672076385866e-06, + "loss": 1.0635, + "step": 12505 + }, + { + "epoch": 1.26, + "grad_norm": 31.740758025053612, + "learning_rate": 7.18503434360015e-06, + "loss": 1.0887, + "step": 12510 + }, + { + "epoch": 1.26, + "grad_norm": 7.88891262912335, + "learning_rate": 7.182395858946205e-06, + "loss": 1.0174, + "step": 12515 + }, + { + "epoch": 1.26, + "grad_norm": 25.142187362474495, + "learning_rate": 7.1797566233319295e-06, + "loss": 1.0363, + "step": 12520 + }, + { + "epoch": 1.26, + "grad_norm": 6.329384696868719, + "learning_rate": 7.177116637665481e-06, + "loss": 1.0686, + "step": 12525 + }, + { + "epoch": 1.26, + "grad_norm": 25.886128600810267, + "learning_rate": 7.174475902855277e-06, + "loss": 1.0288, + "step": 12530 + }, + { + "epoch": 1.26, + "grad_norm": 28.974868937037098, + "learning_rate": 7.171834419809993e-06, + "loss": 1.0078, + "step": 12535 + }, + { + "epoch": 1.26, + "grad_norm": 18.79249425342462, + "learning_rate": 7.169192189438558e-06, + "loss": 1.0652, + "step": 12540 + }, + { + "epoch": 1.26, + "grad_norm": 6.237221329610547, + "learning_rate": 7.166549212650162e-06, + "loss": 1.018, + "step": 12545 + }, + { + "epoch": 1.27, + "grad_norm": 13.120941011725753, + "learning_rate": 7.163905490354254e-06, + "loss": 1.0207, + "step": 12550 + }, + { + "epoch": 1.27, + "grad_norm": 27.040197948817184, + "learning_rate": 7.16126102346053e-06, + "loss": 1.0338, + "step": 12555 + }, + { + "epoch": 1.27, + "grad_norm": 5.480407746930139, + "learning_rate": 7.158615812878954e-06, + "loss": 1.0465, + "step": 12560 + }, + { + "epoch": 1.27, + "grad_norm": 16.480070729256095, + "learning_rate": 7.155969859519739e-06, + "loss": 1.0365, + "step": 12565 + }, + { + "epoch": 1.27, + "grad_norm": 18.25539248289707, + "learning_rate": 7.153323164293353e-06, + "loss": 1.071, + "step": 12570 + }, + { + "epoch": 1.27, + "grad_norm": 7.285603597985358, + "learning_rate": 7.150675728110525e-06, + "loss": 1.024, + "step": 12575 + }, + { + "epoch": 1.27, + "grad_norm": 22.656511072501885, + "learning_rate": 7.148027551882232e-06, + "loss": 1.0349, + "step": 12580 + }, + { + "epoch": 1.27, + "grad_norm": 25.51686516079161, + "learning_rate": 7.14537863651971e-06, + "loss": 1.0305, + "step": 12585 + }, + { + "epoch": 1.27, + "grad_norm": 36.296584926767956, + "learning_rate": 7.142728982934448e-06, + "loss": 1.0582, + "step": 12590 + }, + { + "epoch": 1.27, + "grad_norm": 9.6989012937694, + "learning_rate": 7.140078592038188e-06, + "loss": 1.0583, + "step": 12595 + }, + { + "epoch": 1.27, + "grad_norm": 12.384304551882453, + "learning_rate": 7.137427464742931e-06, + "loss": 1.0389, + "step": 12600 + }, + { + "epoch": 1.27, + "grad_norm": 8.185076794077693, + "learning_rate": 7.134775601960924e-06, + "loss": 0.9996, + "step": 12605 + }, + { + "epoch": 1.27, + "grad_norm": 11.546886645297697, + "learning_rate": 7.1321230046046704e-06, + "loss": 1.0152, + "step": 12610 + }, + { + "epoch": 1.27, + "grad_norm": 7.036397979494588, + "learning_rate": 7.129469673586928e-06, + "loss": 1.0489, + "step": 12615 + }, + { + "epoch": 1.27, + "grad_norm": 7.682921144015696, + "learning_rate": 7.126815609820705e-06, + "loss": 1.0305, + "step": 12620 + }, + { + "epoch": 1.27, + "grad_norm": 16.658399193206822, + "learning_rate": 7.124160814219262e-06, + "loss": 1.0618, + "step": 12625 + }, + { + "epoch": 1.27, + "grad_norm": 13.945032393377005, + "learning_rate": 7.1215052876961126e-06, + "loss": 1.0678, + "step": 12630 + }, + { + "epoch": 1.27, + "grad_norm": 7.638107952258533, + "learning_rate": 7.118849031165018e-06, + "loss": 1.0623, + "step": 12635 + }, + { + "epoch": 1.27, + "grad_norm": 6.605684454562965, + "learning_rate": 7.116192045539996e-06, + "loss": 1.0466, + "step": 12640 + }, + { + "epoch": 1.27, + "grad_norm": 9.750111366604184, + "learning_rate": 7.113534331735313e-06, + "loss": 1.0526, + "step": 12645 + }, + { + "epoch": 1.28, + "grad_norm": 12.176878081409408, + "learning_rate": 7.110875890665485e-06, + "loss": 1.0297, + "step": 12650 + }, + { + "epoch": 1.28, + "grad_norm": 7.296951539236836, + "learning_rate": 7.1082167232452785e-06, + "loss": 1.0467, + "step": 12655 + }, + { + "epoch": 1.28, + "grad_norm": 15.97042566401193, + "learning_rate": 7.105556830389713e-06, + "loss": 1.0344, + "step": 12660 + }, + { + "epoch": 1.28, + "grad_norm": 7.3407772774995985, + "learning_rate": 7.102896213014051e-06, + "loss": 1.0328, + "step": 12665 + }, + { + "epoch": 1.28, + "grad_norm": 15.607682097535177, + "learning_rate": 7.100234872033811e-06, + "loss": 1.0044, + "step": 12670 + }, + { + "epoch": 1.28, + "grad_norm": 40.043840619621726, + "learning_rate": 7.097572808364759e-06, + "loss": 1.0728, + "step": 12675 + }, + { + "epoch": 1.28, + "grad_norm": 27.527787667760947, + "learning_rate": 7.094910022922905e-06, + "loss": 1.0092, + "step": 12680 + }, + { + "epoch": 1.28, + "grad_norm": 12.531847109483222, + "learning_rate": 7.092246516624513e-06, + "loss": 1.0342, + "step": 12685 + }, + { + "epoch": 1.28, + "grad_norm": 75.7379895208296, + "learning_rate": 7.089582290386095e-06, + "loss": 1.0137, + "step": 12690 + }, + { + "epoch": 1.28, + "grad_norm": 64.10051819695862, + "learning_rate": 7.086917345124406e-06, + "loss": 1.0882, + "step": 12695 + }, + { + "epoch": 1.28, + "grad_norm": 41.26126004999705, + "learning_rate": 7.084251681756451e-06, + "loss": 1.0554, + "step": 12700 + }, + { + "epoch": 1.28, + "grad_norm": 10.337770024854704, + "learning_rate": 7.081585301199483e-06, + "loss": 1.0163, + "step": 12705 + }, + { + "epoch": 1.28, + "grad_norm": 13.561604971280826, + "learning_rate": 7.078918204371003e-06, + "loss": 1.0499, + "step": 12710 + }, + { + "epoch": 1.28, + "grad_norm": 15.69201828658827, + "learning_rate": 7.076250392188752e-06, + "loss": 1.0125, + "step": 12715 + }, + { + "epoch": 1.28, + "grad_norm": 7.476788419263224, + "learning_rate": 7.073581865570724e-06, + "loss": 1.0923, + "step": 12720 + }, + { + "epoch": 1.28, + "grad_norm": 14.125474784180392, + "learning_rate": 7.070912625435158e-06, + "loss": 1.0808, + "step": 12725 + }, + { + "epoch": 1.28, + "grad_norm": 7.270760693192891, + "learning_rate": 7.0682426727005345e-06, + "loss": 0.9814, + "step": 12730 + }, + { + "epoch": 1.28, + "grad_norm": 9.556102509114638, + "learning_rate": 7.065572008285584e-06, + "loss": 1.0297, + "step": 12735 + }, + { + "epoch": 1.28, + "grad_norm": 9.606222908410741, + "learning_rate": 7.062900633109277e-06, + "loss": 1.0731, + "step": 12740 + }, + { + "epoch": 1.28, + "grad_norm": 8.49691656987773, + "learning_rate": 7.060228548090833e-06, + "loss": 1.0223, + "step": 12745 + }, + { + "epoch": 1.29, + "grad_norm": 10.925933039779347, + "learning_rate": 7.057555754149713e-06, + "loss": 0.9925, + "step": 12750 + }, + { + "epoch": 1.29, + "grad_norm": 13.128472275615595, + "learning_rate": 7.054882252205624e-06, + "loss": 1.0409, + "step": 12755 + }, + { + "epoch": 1.29, + "grad_norm": 7.126108692814256, + "learning_rate": 7.052208043178514e-06, + "loss": 1.0924, + "step": 12760 + }, + { + "epoch": 1.29, + "grad_norm": 10.21221054468124, + "learning_rate": 7.049533127988576e-06, + "loss": 1.0358, + "step": 12765 + }, + { + "epoch": 1.29, + "grad_norm": 7.1484499062030125, + "learning_rate": 7.046857507556247e-06, + "loss": 1.0567, + "step": 12770 + }, + { + "epoch": 1.29, + "grad_norm": 38.45334141445022, + "learning_rate": 7.0441811828022045e-06, + "loss": 1.0411, + "step": 12775 + }, + { + "epoch": 1.29, + "grad_norm": 25.874324500701785, + "learning_rate": 7.041504154647369e-06, + "loss": 1.0286, + "step": 12780 + }, + { + "epoch": 1.29, + "grad_norm": 15.270163623871788, + "learning_rate": 7.038826424012904e-06, + "loss": 1.0301, + "step": 12785 + }, + { + "epoch": 1.29, + "grad_norm": 7.999079089209222, + "learning_rate": 7.036147991820215e-06, + "loss": 1.0585, + "step": 12790 + }, + { + "epoch": 1.29, + "grad_norm": 24.206478065357793, + "learning_rate": 7.033468858990944e-06, + "loss": 1.0118, + "step": 12795 + }, + { + "epoch": 1.29, + "grad_norm": 15.89523978208532, + "learning_rate": 7.030789026446984e-06, + "loss": 0.9707, + "step": 12800 + }, + { + "epoch": 1.29, + "grad_norm": 8.747962327773623, + "learning_rate": 7.028108495110457e-06, + "loss": 1.0732, + "step": 12805 + }, + { + "epoch": 1.29, + "grad_norm": 23.8698334255745, + "learning_rate": 7.025427265903735e-06, + "loss": 1.0465, + "step": 12810 + }, + { + "epoch": 1.29, + "grad_norm": 10.05917059719966, + "learning_rate": 7.022745339749426e-06, + "loss": 1.0668, + "step": 12815 + }, + { + "epoch": 1.29, + "grad_norm": 18.915197740297916, + "learning_rate": 7.020062717570376e-06, + "loss": 1.0105, + "step": 12820 + }, + { + "epoch": 1.29, + "grad_norm": 20.876299249569453, + "learning_rate": 7.017379400289675e-06, + "loss": 1.0386, + "step": 12825 + }, + { + "epoch": 1.29, + "grad_norm": 6.659062012775592, + "learning_rate": 7.01469538883065e-06, + "loss": 1.0328, + "step": 12830 + }, + { + "epoch": 1.29, + "grad_norm": 6.95405327987556, + "learning_rate": 7.012010684116865e-06, + "loss": 1.025, + "step": 12835 + }, + { + "epoch": 1.29, + "grad_norm": 9.551544747776951, + "learning_rate": 7.009325287072125e-06, + "loss": 1.0814, + "step": 12840 + }, + { + "epoch": 1.3, + "grad_norm": 26.97255684330524, + "learning_rate": 7.006639198620473e-06, + "loss": 0.9959, + "step": 12845 + }, + { + "epoch": 1.3, + "grad_norm": 37.41006235833254, + "learning_rate": 7.00395241968619e-06, + "loss": 1.0512, + "step": 12850 + }, + { + "epoch": 1.3, + "grad_norm": 20.749955587173165, + "learning_rate": 7.001264951193793e-06, + "loss": 1.0193, + "step": 12855 + }, + { + "epoch": 1.3, + "grad_norm": 9.119022299789663, + "learning_rate": 6.998576794068037e-06, + "loss": 0.9929, + "step": 12860 + }, + { + "epoch": 1.3, + "grad_norm": 17.41580671020243, + "learning_rate": 6.995887949233917e-06, + "loss": 1.0261, + "step": 12865 + }, + { + "epoch": 1.3, + "grad_norm": 21.30687944722217, + "learning_rate": 6.993198417616658e-06, + "loss": 1.0262, + "step": 12870 + }, + { + "epoch": 1.3, + "grad_norm": 17.579780848368085, + "learning_rate": 6.990508200141728e-06, + "loss": 0.9938, + "step": 12875 + }, + { + "epoch": 1.3, + "grad_norm": 15.586664085842704, + "learning_rate": 6.987817297734828e-06, + "loss": 1.0025, + "step": 12880 + }, + { + "epoch": 1.3, + "grad_norm": 15.894681746357387, + "learning_rate": 6.985125711321894e-06, + "loss": 1.0483, + "step": 12885 + }, + { + "epoch": 1.3, + "grad_norm": 16.365915987102852, + "learning_rate": 6.982433441829097e-06, + "loss": 1.0512, + "step": 12890 + }, + { + "epoch": 1.3, + "grad_norm": 14.951731138083796, + "learning_rate": 6.979740490182849e-06, + "loss": 1.0282, + "step": 12895 + }, + { + "epoch": 1.3, + "grad_norm": 7.217412059486604, + "learning_rate": 6.977046857309788e-06, + "loss": 1.0322, + "step": 12900 + }, + { + "epoch": 1.3, + "grad_norm": 8.806719211080662, + "learning_rate": 6.974352544136791e-06, + "loss": 1.045, + "step": 12905 + }, + { + "epoch": 1.3, + "grad_norm": 5.92757104630876, + "learning_rate": 6.971657551590971e-06, + "loss": 1.0264, + "step": 12910 + }, + { + "epoch": 1.3, + "grad_norm": 15.25657133532226, + "learning_rate": 6.968961880599668e-06, + "loss": 1.0021, + "step": 12915 + }, + { + "epoch": 1.3, + "grad_norm": 14.516066347033293, + "learning_rate": 6.9662655320904636e-06, + "loss": 1.0564, + "step": 12920 + }, + { + "epoch": 1.3, + "grad_norm": 9.063912907532362, + "learning_rate": 6.96356850699117e-06, + "loss": 1.0061, + "step": 12925 + }, + { + "epoch": 1.3, + "grad_norm": 15.391986135466741, + "learning_rate": 6.960870806229826e-06, + "loss": 1.0156, + "step": 12930 + }, + { + "epoch": 1.3, + "grad_norm": 6.738347589270799, + "learning_rate": 6.958172430734711e-06, + "loss": 1.0248, + "step": 12935 + }, + { + "epoch": 1.3, + "grad_norm": 7.753620709514878, + "learning_rate": 6.955473381434332e-06, + "loss": 1.03, + "step": 12940 + }, + { + "epoch": 1.31, + "grad_norm": 21.557169202777263, + "learning_rate": 6.952773659257431e-06, + "loss": 1.0125, + "step": 12945 + }, + { + "epoch": 1.31, + "grad_norm": 6.965167026749218, + "learning_rate": 6.95007326513298e-06, + "loss": 1.0619, + "step": 12950 + }, + { + "epoch": 1.31, + "grad_norm": 7.265830292021931, + "learning_rate": 6.94737219999018e-06, + "loss": 1.0392, + "step": 12955 + }, + { + "epoch": 1.31, + "grad_norm": 23.52081806715469, + "learning_rate": 6.9446704647584665e-06, + "loss": 1.0245, + "step": 12960 + }, + { + "epoch": 1.31, + "grad_norm": 12.871420177875759, + "learning_rate": 6.9419680603675026e-06, + "loss": 0.9975, + "step": 12965 + }, + { + "epoch": 1.31, + "grad_norm": 11.222317702243371, + "learning_rate": 6.9392649877471855e-06, + "loss": 1.0536, + "step": 12970 + }, + { + "epoch": 1.31, + "grad_norm": 42.25272051987114, + "learning_rate": 6.936561247827635e-06, + "loss": 1.0668, + "step": 12975 + }, + { + "epoch": 1.31, + "grad_norm": 13.740236086629558, + "learning_rate": 6.93385684153921e-06, + "loss": 1.0125, + "step": 12980 + }, + { + "epoch": 1.31, + "grad_norm": 18.645595937400724, + "learning_rate": 6.931151769812496e-06, + "loss": 1.0658, + "step": 12985 + }, + { + "epoch": 1.31, + "grad_norm": 15.141950619913802, + "learning_rate": 6.928446033578299e-06, + "loss": 1.0382, + "step": 12990 + }, + { + "epoch": 1.31, + "grad_norm": 11.023119671967667, + "learning_rate": 6.925739633767664e-06, + "loss": 1.0282, + "step": 12995 + }, + { + "epoch": 1.31, + "grad_norm": 24.068304301841803, + "learning_rate": 6.923032571311863e-06, + "loss": 1.0067, + "step": 13000 + }, + { + "epoch": 1.31, + "grad_norm": 27.996209165869637, + "learning_rate": 6.920324847142388e-06, + "loss": 1.0103, + "step": 13005 + }, + { + "epoch": 1.31, + "grad_norm": 6.417572565792332, + "learning_rate": 6.917616462190968e-06, + "loss": 1.0201, + "step": 13010 + }, + { + "epoch": 1.31, + "grad_norm": 7.657853453808597, + "learning_rate": 6.914907417389556e-06, + "loss": 1.0766, + "step": 13015 + }, + { + "epoch": 1.31, + "grad_norm": 10.65758461824999, + "learning_rate": 6.9121977136703285e-06, + "loss": 1.0146, + "step": 13020 + }, + { + "epoch": 1.31, + "grad_norm": 8.007045939186899, + "learning_rate": 6.9094873519656955e-06, + "loss": 0.9793, + "step": 13025 + }, + { + "epoch": 1.31, + "grad_norm": 15.578988873364581, + "learning_rate": 6.90677633320829e-06, + "loss": 1.0458, + "step": 13030 + }, + { + "epoch": 1.31, + "grad_norm": 16.951781002259686, + "learning_rate": 6.904064658330967e-06, + "loss": 1.0555, + "step": 13035 + }, + { + "epoch": 1.31, + "grad_norm": 16.35412417238803, + "learning_rate": 6.901352328266814e-06, + "loss": 1.0519, + "step": 13040 + }, + { + "epoch": 1.32, + "grad_norm": 7.959471511350942, + "learning_rate": 6.898639343949141e-06, + "loss": 1.0015, + "step": 13045 + }, + { + "epoch": 1.32, + "grad_norm": 15.821702563866582, + "learning_rate": 6.895925706311484e-06, + "loss": 1.0324, + "step": 13050 + }, + { + "epoch": 1.32, + "grad_norm": 8.07143200791322, + "learning_rate": 6.893211416287601e-06, + "loss": 1.0619, + "step": 13055 + }, + { + "epoch": 1.32, + "grad_norm": 7.163118330139236, + "learning_rate": 6.890496474811478e-06, + "loss": 1.0037, + "step": 13060 + }, + { + "epoch": 1.32, + "grad_norm": 22.921157404139223, + "learning_rate": 6.887780882817325e-06, + "loss": 1.0314, + "step": 13065 + }, + { + "epoch": 1.32, + "grad_norm": 21.488001146763192, + "learning_rate": 6.885064641239572e-06, + "loss": 1.0426, + "step": 13070 + }, + { + "epoch": 1.32, + "grad_norm": 14.64297873012137, + "learning_rate": 6.882347751012877e-06, + "loss": 1.0084, + "step": 13075 + }, + { + "epoch": 1.32, + "grad_norm": 7.848258797644815, + "learning_rate": 6.879630213072119e-06, + "loss": 1.0571, + "step": 13080 + }, + { + "epoch": 1.32, + "grad_norm": 8.16142770751216, + "learning_rate": 6.8769120283524e-06, + "loss": 0.9836, + "step": 13085 + }, + { + "epoch": 1.32, + "grad_norm": 17.09570378515828, + "learning_rate": 6.874193197789044e-06, + "loss": 1.009, + "step": 13090 + }, + { + "epoch": 1.32, + "grad_norm": 22.433742219274283, + "learning_rate": 6.8714737223176e-06, + "loss": 1.024, + "step": 13095 + }, + { + "epoch": 1.32, + "grad_norm": 5.383163564586508, + "learning_rate": 6.868753602873834e-06, + "loss": 1.0429, + "step": 13100 + }, + { + "epoch": 1.32, + "grad_norm": 8.936132776012842, + "learning_rate": 6.866032840393738e-06, + "loss": 1.0228, + "step": 13105 + }, + { + "epoch": 1.32, + "grad_norm": 6.030396329550592, + "learning_rate": 6.863311435813525e-06, + "loss": 1.0008, + "step": 13110 + }, + { + "epoch": 1.32, + "grad_norm": 11.397392977750727, + "learning_rate": 6.860589390069626e-06, + "loss": 1.0177, + "step": 13115 + }, + { + "epoch": 1.32, + "grad_norm": 18.55638265726068, + "learning_rate": 6.857866704098695e-06, + "loss": 1.0323, + "step": 13120 + }, + { + "epoch": 1.32, + "grad_norm": 9.559656051710714, + "learning_rate": 6.8551433788376066e-06, + "loss": 1.0315, + "step": 13125 + }, + { + "epoch": 1.32, + "grad_norm": 13.641928541047324, + "learning_rate": 6.852419415223451e-06, + "loss": 1.0941, + "step": 13130 + }, + { + "epoch": 1.32, + "grad_norm": 8.212436486827265, + "learning_rate": 6.849694814193546e-06, + "loss": 1.0541, + "step": 13135 + }, + { + "epoch": 1.32, + "grad_norm": 5.8214796520446495, + "learning_rate": 6.846969576685422e-06, + "loss": 1.0366, + "step": 13140 + }, + { + "epoch": 1.33, + "grad_norm": 8.157033991262095, + "learning_rate": 6.84424370363683e-06, + "loss": 0.9949, + "step": 13145 + }, + { + "epoch": 1.33, + "grad_norm": 14.45530704947713, + "learning_rate": 6.841517195985741e-06, + "loss": 1.0393, + "step": 13150 + }, + { + "epoch": 1.33, + "grad_norm": 16.143450026136613, + "learning_rate": 6.838790054670345e-06, + "loss": 1.0291, + "step": 13155 + }, + { + "epoch": 1.33, + "grad_norm": 16.624898335910864, + "learning_rate": 6.836062280629046e-06, + "loss": 1.0026, + "step": 13160 + }, + { + "epoch": 1.33, + "grad_norm": 18.84600348155945, + "learning_rate": 6.83333387480047e-06, + "loss": 1.018, + "step": 13165 + }, + { + "epoch": 1.33, + "grad_norm": 43.80741925650718, + "learning_rate": 6.830604838123459e-06, + "loss": 1.0479, + "step": 13170 + }, + { + "epoch": 1.33, + "grad_norm": 41.92711636866129, + "learning_rate": 6.827875171537071e-06, + "loss": 1.0578, + "step": 13175 + }, + { + "epoch": 1.33, + "grad_norm": 19.51673300351281, + "learning_rate": 6.8251448759805824e-06, + "loss": 1.0711, + "step": 13180 + }, + { + "epoch": 1.33, + "grad_norm": 8.556515933677582, + "learning_rate": 6.8224139523934865e-06, + "loss": 1.0446, + "step": 13185 + }, + { + "epoch": 1.33, + "grad_norm": 37.85927373587871, + "learning_rate": 6.81968240171549e-06, + "loss": 1.0247, + "step": 13190 + }, + { + "epoch": 1.33, + "grad_norm": 18.662322953416247, + "learning_rate": 6.816950224886515e-06, + "loss": 1.0237, + "step": 13195 + }, + { + "epoch": 1.33, + "grad_norm": 7.907269676994511, + "learning_rate": 6.814217422846705e-06, + "loss": 1.0179, + "step": 13200 + }, + { + "epoch": 1.33, + "grad_norm": 26.99504945572047, + "learning_rate": 6.811483996536412e-06, + "loss": 1.006, + "step": 13205 + }, + { + "epoch": 1.33, + "grad_norm": 6.444229002874957, + "learning_rate": 6.808749946896206e-06, + "loss": 1.0098, + "step": 13210 + }, + { + "epoch": 1.33, + "grad_norm": 6.473057195996981, + "learning_rate": 6.80601527486687e-06, + "loss": 1.0325, + "step": 13215 + }, + { + "epoch": 1.33, + "grad_norm": 14.804280215536732, + "learning_rate": 6.8032799813894055e-06, + "loss": 1.0363, + "step": 13220 + }, + { + "epoch": 1.33, + "grad_norm": 6.313675131537839, + "learning_rate": 6.800544067405019e-06, + "loss": 1.0658, + "step": 13225 + }, + { + "epoch": 1.33, + "grad_norm": 5.706735278471638, + "learning_rate": 6.7978075338551395e-06, + "loss": 1.057, + "step": 13230 + }, + { + "epoch": 1.33, + "grad_norm": 15.756023104740889, + "learning_rate": 6.795070381681405e-06, + "loss": 1.0309, + "step": 13235 + }, + { + "epoch": 1.33, + "grad_norm": 22.676044196697127, + "learning_rate": 6.792332611825667e-06, + "loss": 1.0114, + "step": 13240 + }, + { + "epoch": 1.34, + "grad_norm": 26.87964038992477, + "learning_rate": 6.789594225229987e-06, + "loss": 1.0055, + "step": 13245 + }, + { + "epoch": 1.34, + "grad_norm": 10.512523423043234, + "learning_rate": 6.7868552228366455e-06, + "loss": 1.0165, + "step": 13250 + }, + { + "epoch": 1.34, + "grad_norm": 23.665933617476504, + "learning_rate": 6.784115605588129e-06, + "loss": 1.0666, + "step": 13255 + }, + { + "epoch": 1.34, + "grad_norm": 47.00172811271067, + "learning_rate": 6.781375374427135e-06, + "loss": 1.0533, + "step": 13260 + }, + { + "epoch": 1.34, + "grad_norm": 50.63177323507287, + "learning_rate": 6.778634530296577e-06, + "loss": 1.0132, + "step": 13265 + }, + { + "epoch": 1.34, + "grad_norm": 12.536518733773919, + "learning_rate": 6.775893074139575e-06, + "loss": 1.0746, + "step": 13270 + }, + { + "epoch": 1.34, + "grad_norm": 29.53133532836771, + "learning_rate": 6.773151006899462e-06, + "loss": 1.0575, + "step": 13275 + }, + { + "epoch": 1.34, + "grad_norm": 77.40998959421712, + "learning_rate": 6.770408329519783e-06, + "loss": 1.0598, + "step": 13280 + }, + { + "epoch": 1.34, + "grad_norm": 30.24829150706806, + "learning_rate": 6.767665042944287e-06, + "loss": 1.0107, + "step": 13285 + }, + { + "epoch": 1.34, + "grad_norm": 13.192949203694534, + "learning_rate": 6.764921148116938e-06, + "loss": 1.0108, + "step": 13290 + }, + { + "epoch": 1.34, + "grad_norm": 7.9501053589897985, + "learning_rate": 6.7621766459819095e-06, + "loss": 1.0401, + "step": 13295 + }, + { + "epoch": 1.34, + "grad_norm": 6.111409883932251, + "learning_rate": 6.759431537483578e-06, + "loss": 1.0443, + "step": 13300 + }, + { + "epoch": 1.34, + "grad_norm": 8.679607540016216, + "learning_rate": 6.756685823566537e-06, + "loss": 1.0365, + "step": 13305 + }, + { + "epoch": 1.34, + "grad_norm": 11.376957592481851, + "learning_rate": 6.753939505175581e-06, + "loss": 1.0523, + "step": 13310 + }, + { + "epoch": 1.34, + "grad_norm": 7.391994052623102, + "learning_rate": 6.751192583255716e-06, + "loss": 1.0234, + "step": 13315 + }, + { + "epoch": 1.34, + "grad_norm": 19.930521794582923, + "learning_rate": 6.748445058752155e-06, + "loss": 1.0303, + "step": 13320 + }, + { + "epoch": 1.34, + "grad_norm": 7.669913265990586, + "learning_rate": 6.745696932610322e-06, + "loss": 1.0876, + "step": 13325 + }, + { + "epoch": 1.34, + "grad_norm": 7.146589085411706, + "learning_rate": 6.742948205775839e-06, + "loss": 1.0389, + "step": 13330 + }, + { + "epoch": 1.34, + "grad_norm": 6.971442314488811, + "learning_rate": 6.740198879194544e-06, + "loss": 1.0264, + "step": 13335 + }, + { + "epoch": 1.34, + "grad_norm": 15.669283894212775, + "learning_rate": 6.7374489538124775e-06, + "loss": 1.0593, + "step": 13340 + }, + { + "epoch": 1.35, + "grad_norm": 12.953616910642543, + "learning_rate": 6.734698430575885e-06, + "loss": 1.0129, + "step": 13345 + }, + { + "epoch": 1.35, + "grad_norm": 11.523231959616394, + "learning_rate": 6.731947310431219e-06, + "loss": 1.0135, + "step": 13350 + }, + { + "epoch": 1.35, + "grad_norm": 17.804972747545044, + "learning_rate": 6.7291955943251385e-06, + "loss": 1.0272, + "step": 13355 + }, + { + "epoch": 1.35, + "grad_norm": 10.953005179019751, + "learning_rate": 6.726443283204506e-06, + "loss": 1.0357, + "step": 13360 + }, + { + "epoch": 1.35, + "grad_norm": 6.35695685522286, + "learning_rate": 6.723690378016387e-06, + "loss": 0.9987, + "step": 13365 + }, + { + "epoch": 1.35, + "grad_norm": 12.64247175046237, + "learning_rate": 6.720936879708055e-06, + "loss": 0.9996, + "step": 13370 + }, + { + "epoch": 1.35, + "grad_norm": 12.212790234072378, + "learning_rate": 6.718182789226988e-06, + "loss": 1.0104, + "step": 13375 + }, + { + "epoch": 1.35, + "grad_norm": 5.377406455852372, + "learning_rate": 6.715428107520864e-06, + "loss": 1.0282, + "step": 13380 + }, + { + "epoch": 1.35, + "grad_norm": 16.39394240132829, + "learning_rate": 6.712672835537566e-06, + "loss": 1.049, + "step": 13385 + }, + { + "epoch": 1.35, + "grad_norm": 13.035697471945214, + "learning_rate": 6.709916974225181e-06, + "loss": 1.0153, + "step": 13390 + }, + { + "epoch": 1.35, + "grad_norm": 9.64975431761266, + "learning_rate": 6.707160524532e-06, + "loss": 1.0552, + "step": 13395 + }, + { + "epoch": 1.35, + "grad_norm": 17.294643581025277, + "learning_rate": 6.70440348740651e-06, + "loss": 1.073, + "step": 13400 + }, + { + "epoch": 1.35, + "grad_norm": 18.488330345940135, + "learning_rate": 6.70164586379741e-06, + "loss": 1.0568, + "step": 13405 + }, + { + "epoch": 1.35, + "grad_norm": 13.870119310854673, + "learning_rate": 6.698887654653593e-06, + "loss": 1.0285, + "step": 13410 + }, + { + "epoch": 1.35, + "grad_norm": 13.912999058837164, + "learning_rate": 6.6961288609241555e-06, + "loss": 1.0441, + "step": 13415 + }, + { + "epoch": 1.35, + "grad_norm": 26.74928942560138, + "learning_rate": 6.693369483558399e-06, + "loss": 1.0799, + "step": 13420 + }, + { + "epoch": 1.35, + "grad_norm": 8.9684024815037, + "learning_rate": 6.69060952350582e-06, + "loss": 1.016, + "step": 13425 + }, + { + "epoch": 1.35, + "grad_norm": 9.46064987134036, + "learning_rate": 6.687848981716118e-06, + "loss": 0.9943, + "step": 13430 + }, + { + "epoch": 1.35, + "grad_norm": 7.169127122095071, + "learning_rate": 6.6850878591391945e-06, + "loss": 1.0552, + "step": 13435 + }, + { + "epoch": 1.36, + "grad_norm": 5.165354843115252, + "learning_rate": 6.682326156725147e-06, + "loss": 1.0508, + "step": 13440 + }, + { + "epoch": 1.36, + "grad_norm": 6.2432862349534695, + "learning_rate": 6.679563875424276e-06, + "loss": 1.0393, + "step": 13445 + }, + { + "epoch": 1.36, + "grad_norm": 10.935217332921848, + "learning_rate": 6.67680101618708e-06, + "loss": 1.0513, + "step": 13450 + }, + { + "epoch": 1.36, + "grad_norm": 6.607537224645915, + "learning_rate": 6.674037579964256e-06, + "loss": 1.0162, + "step": 13455 + }, + { + "epoch": 1.36, + "grad_norm": 15.120510077099906, + "learning_rate": 6.671273567706699e-06, + "loss": 1.0126, + "step": 13460 + }, + { + "epoch": 1.36, + "grad_norm": 7.850181977228456, + "learning_rate": 6.668508980365505e-06, + "loss": 1.0038, + "step": 13465 + }, + { + "epoch": 1.36, + "grad_norm": 7.137039326824293, + "learning_rate": 6.665743818891963e-06, + "loss": 1.0238, + "step": 13470 + }, + { + "epoch": 1.36, + "grad_norm": 11.516447441123454, + "learning_rate": 6.662978084237565e-06, + "loss": 1.0241, + "step": 13475 + }, + { + "epoch": 1.36, + "grad_norm": 27.20639222893311, + "learning_rate": 6.6602117773539954e-06, + "loss": 0.9892, + "step": 13480 + }, + { + "epoch": 1.36, + "grad_norm": 19.739972267618672, + "learning_rate": 6.65744489919314e-06, + "loss": 1.0032, + "step": 13485 + }, + { + "epoch": 1.36, + "grad_norm": 21.551587919524792, + "learning_rate": 6.654677450707077e-06, + "loss": 1.0572, + "step": 13490 + }, + { + "epoch": 1.36, + "grad_norm": 6.2760398499502985, + "learning_rate": 6.651909432848083e-06, + "loss": 1.0114, + "step": 13495 + }, + { + "epoch": 1.36, + "grad_norm": 9.902126713444899, + "learning_rate": 6.6491408465686324e-06, + "loss": 1.1127, + "step": 13500 + }, + { + "epoch": 1.36, + "grad_norm": 12.188235790272506, + "learning_rate": 6.646371692821391e-06, + "loss": 0.9909, + "step": 13505 + }, + { + "epoch": 1.36, + "grad_norm": 12.421880256038474, + "learning_rate": 6.6436019725592215e-06, + "loss": 1.0017, + "step": 13510 + }, + { + "epoch": 1.36, + "grad_norm": 7.010103002828106, + "learning_rate": 6.640831686735186e-06, + "loss": 1.0216, + "step": 13515 + }, + { + "epoch": 1.36, + "grad_norm": 6.120440748200015, + "learning_rate": 6.638060836302531e-06, + "loss": 1.0354, + "step": 13520 + }, + { + "epoch": 1.36, + "grad_norm": 13.263586848738726, + "learning_rate": 6.635289422214708e-06, + "loss": 1.0607, + "step": 13525 + }, + { + "epoch": 1.36, + "grad_norm": 16.134589255668278, + "learning_rate": 6.632517445425357e-06, + "loss": 1.011, + "step": 13530 + }, + { + "epoch": 1.36, + "grad_norm": 12.013205345946286, + "learning_rate": 6.6297449068883125e-06, + "loss": 1.0028, + "step": 13535 + }, + { + "epoch": 1.37, + "grad_norm": 10.790604203729266, + "learning_rate": 6.6269718075576e-06, + "loss": 1.0237, + "step": 13540 + }, + { + "epoch": 1.37, + "grad_norm": 7.71188736972473, + "learning_rate": 6.624198148387446e-06, + "loss": 1.0279, + "step": 13545 + }, + { + "epoch": 1.37, + "grad_norm": 7.169575350843859, + "learning_rate": 6.621423930332258e-06, + "loss": 1.0798, + "step": 13550 + }, + { + "epoch": 1.37, + "grad_norm": 11.090805139004452, + "learning_rate": 6.618649154346645e-06, + "loss": 1.0083, + "step": 13555 + }, + { + "epoch": 1.37, + "grad_norm": 11.675451134812084, + "learning_rate": 6.615873821385404e-06, + "loss": 1.0365, + "step": 13560 + }, + { + "epoch": 1.37, + "grad_norm": 8.993575443756557, + "learning_rate": 6.613097932403524e-06, + "loss": 0.9776, + "step": 13565 + }, + { + "epoch": 1.37, + "grad_norm": 5.518707514073316, + "learning_rate": 6.610321488356186e-06, + "loss": 1.0298, + "step": 13570 + }, + { + "epoch": 1.37, + "grad_norm": 16.39762762125881, + "learning_rate": 6.607544490198763e-06, + "loss": 1.011, + "step": 13575 + }, + { + "epoch": 1.37, + "grad_norm": 19.01928544072449, + "learning_rate": 6.6047669388868155e-06, + "loss": 1.0303, + "step": 13580 + }, + { + "epoch": 1.37, + "grad_norm": 27.189320273451663, + "learning_rate": 6.601988835376096e-06, + "loss": 1.0292, + "step": 13585 + }, + { + "epoch": 1.37, + "grad_norm": 39.491268467130276, + "learning_rate": 6.599210180622551e-06, + "loss": 1.0243, + "step": 13590 + }, + { + "epoch": 1.37, + "grad_norm": 26.732490527271786, + "learning_rate": 6.5964309755823075e-06, + "loss": 1.0122, + "step": 13595 + }, + { + "epoch": 1.37, + "grad_norm": 15.135663071949276, + "learning_rate": 6.593651221211691e-06, + "loss": 1.0185, + "step": 13600 + }, + { + "epoch": 1.37, + "grad_norm": 7.233287153650046, + "learning_rate": 6.590870918467214e-06, + "loss": 0.971, + "step": 13605 + }, + { + "epoch": 1.37, + "grad_norm": 6.495907008406722, + "learning_rate": 6.588090068305569e-06, + "loss": 1.0558, + "step": 13610 + }, + { + "epoch": 1.37, + "grad_norm": 44.42453635403032, + "learning_rate": 6.58530867168365e-06, + "loss": 0.9835, + "step": 13615 + }, + { + "epoch": 1.37, + "grad_norm": 28.144903089867967, + "learning_rate": 6.582526729558533e-06, + "loss": 1.0596, + "step": 13620 + }, + { + "epoch": 1.37, + "grad_norm": 12.024692701567258, + "learning_rate": 6.579744242887478e-06, + "loss": 1.0064, + "step": 13625 + }, + { + "epoch": 1.37, + "grad_norm": 6.463744260982136, + "learning_rate": 6.576961212627938e-06, + "loss": 1.0177, + "step": 13630 + }, + { + "epoch": 1.37, + "grad_norm": 8.95274300642296, + "learning_rate": 6.574177639737551e-06, + "loss": 1.0668, + "step": 13635 + }, + { + "epoch": 1.38, + "grad_norm": 8.42371767681941, + "learning_rate": 6.571393525174142e-06, + "loss": 1.0396, + "step": 13640 + }, + { + "epoch": 1.38, + "grad_norm": 11.357186063769456, + "learning_rate": 6.568608869895722e-06, + "loss": 1.0231, + "step": 13645 + }, + { + "epoch": 1.38, + "grad_norm": 12.015801963572391, + "learning_rate": 6.565823674860487e-06, + "loss": 1.0472, + "step": 13650 + }, + { + "epoch": 1.38, + "grad_norm": 14.069250439384112, + "learning_rate": 6.563037941026822e-06, + "loss": 0.981, + "step": 13655 + }, + { + "epoch": 1.38, + "grad_norm": 24.130766918714176, + "learning_rate": 6.560251669353294e-06, + "loss": 1.0042, + "step": 13660 + }, + { + "epoch": 1.38, + "grad_norm": 7.899136249675347, + "learning_rate": 6.557464860798657e-06, + "loss": 1.0442, + "step": 13665 + }, + { + "epoch": 1.38, + "grad_norm": 8.27856661226327, + "learning_rate": 6.554677516321849e-06, + "loss": 1.026, + "step": 13670 + }, + { + "epoch": 1.38, + "grad_norm": 13.703392661787985, + "learning_rate": 6.551889636881992e-06, + "loss": 1.0072, + "step": 13675 + }, + { + "epoch": 1.38, + "grad_norm": 14.521669786912787, + "learning_rate": 6.549101223438394e-06, + "loss": 0.9798, + "step": 13680 + }, + { + "epoch": 1.38, + "grad_norm": 8.132208029869739, + "learning_rate": 6.546312276950544e-06, + "loss": 1.0406, + "step": 13685 + }, + { + "epoch": 1.38, + "grad_norm": 5.337966994467242, + "learning_rate": 6.5435227983781155e-06, + "loss": 1.0336, + "step": 13690 + }, + { + "epoch": 1.38, + "grad_norm": 10.018133752379166, + "learning_rate": 6.540732788680968e-06, + "loss": 1.0596, + "step": 13695 + }, + { + "epoch": 1.38, + "grad_norm": 8.981410299463178, + "learning_rate": 6.537942248819139e-06, + "loss": 1.052, + "step": 13700 + }, + { + "epoch": 1.38, + "grad_norm": 13.807726025401056, + "learning_rate": 6.53515117975285e-06, + "loss": 1.0451, + "step": 13705 + }, + { + "epoch": 1.38, + "grad_norm": 12.258256090032113, + "learning_rate": 6.532359582442509e-06, + "loss": 1.0257, + "step": 13710 + }, + { + "epoch": 1.38, + "grad_norm": 13.830220039221603, + "learning_rate": 6.529567457848697e-06, + "loss": 1.0574, + "step": 13715 + }, + { + "epoch": 1.38, + "grad_norm": 14.951624879155915, + "learning_rate": 6.526774806932184e-06, + "loss": 1.053, + "step": 13720 + }, + { + "epoch": 1.38, + "grad_norm": 10.10751827537318, + "learning_rate": 6.523981630653918e-06, + "loss": 1.0139, + "step": 13725 + }, + { + "epoch": 1.38, + "grad_norm": 7.374129464914586, + "learning_rate": 6.521187929975028e-06, + "loss": 1.0713, + "step": 13730 + }, + { + "epoch": 1.38, + "grad_norm": 8.969936644729158, + "learning_rate": 6.518393705856826e-06, + "loss": 1.0491, + "step": 13735 + }, + { + "epoch": 1.39, + "grad_norm": 17.11796650989927, + "learning_rate": 6.515598959260798e-06, + "loss": 1.0005, + "step": 13740 + }, + { + "epoch": 1.39, + "grad_norm": 9.871941947917126, + "learning_rate": 6.512803691148618e-06, + "loss": 1.0545, + "step": 13745 + }, + { + "epoch": 1.39, + "grad_norm": 14.622679603307276, + "learning_rate": 6.510007902482132e-06, + "loss": 1.051, + "step": 13750 + }, + { + "epoch": 1.39, + "grad_norm": 61.39754193248918, + "learning_rate": 6.507211594223367e-06, + "loss": 0.9929, + "step": 13755 + }, + { + "epoch": 1.39, + "grad_norm": 17.488750011595506, + "learning_rate": 6.504414767334535e-06, + "loss": 0.9972, + "step": 13760 + }, + { + "epoch": 1.39, + "grad_norm": 10.12429589602954, + "learning_rate": 6.501617422778016e-06, + "loss": 1.0398, + "step": 13765 + }, + { + "epoch": 1.39, + "grad_norm": 9.879413296776791, + "learning_rate": 6.498819561516376e-06, + "loss": 0.9726, + "step": 13770 + }, + { + "epoch": 1.39, + "grad_norm": 23.75657216326405, + "learning_rate": 6.4960211845123574e-06, + "loss": 1.0689, + "step": 13775 + }, + { + "epoch": 1.39, + "grad_norm": 11.45006080589518, + "learning_rate": 6.493222292728878e-06, + "loss": 1.0506, + "step": 13780 + }, + { + "epoch": 1.39, + "grad_norm": 12.991676780315553, + "learning_rate": 6.490422887129034e-06, + "loss": 1.0286, + "step": 13785 + }, + { + "epoch": 1.39, + "grad_norm": 16.18143877800714, + "learning_rate": 6.487622968676098e-06, + "loss": 1.0247, + "step": 13790 + }, + { + "epoch": 1.39, + "grad_norm": 6.793232464061429, + "learning_rate": 6.48482253833352e-06, + "loss": 1.0389, + "step": 13795 + }, + { + "epoch": 1.39, + "grad_norm": 7.608126922471388, + "learning_rate": 6.482021597064923e-06, + "loss": 1.0548, + "step": 13800 + }, + { + "epoch": 1.39, + "grad_norm": 10.140751653201086, + "learning_rate": 6.479220145834111e-06, + "loss": 1.0124, + "step": 13805 + }, + { + "epoch": 1.39, + "grad_norm": 5.505552673280925, + "learning_rate": 6.47641818560506e-06, + "loss": 1.0583, + "step": 13810 + }, + { + "epoch": 1.39, + "grad_norm": 6.627856677353117, + "learning_rate": 6.47361571734192e-06, + "loss": 1.0085, + "step": 13815 + }, + { + "epoch": 1.39, + "grad_norm": 8.41666298648295, + "learning_rate": 6.470812742009021e-06, + "loss": 0.9837, + "step": 13820 + }, + { + "epoch": 1.39, + "grad_norm": 10.724829915820955, + "learning_rate": 6.468009260570861e-06, + "loss": 1.0064, + "step": 13825 + }, + { + "epoch": 1.39, + "grad_norm": 15.376678289487328, + "learning_rate": 6.4652052739921165e-06, + "loss": 1.0592, + "step": 13830 + }, + { + "epoch": 1.39, + "grad_norm": 6.893900152860663, + "learning_rate": 6.462400783237638e-06, + "loss": 0.9976, + "step": 13835 + }, + { + "epoch": 1.4, + "grad_norm": 5.700056112704488, + "learning_rate": 6.459595789272446e-06, + "loss": 0.9963, + "step": 13840 + }, + { + "epoch": 1.4, + "grad_norm": 7.192370893705413, + "learning_rate": 6.456790293061735e-06, + "loss": 1.0182, + "step": 13845 + }, + { + "epoch": 1.4, + "grad_norm": 16.006904290572265, + "learning_rate": 6.453984295570879e-06, + "loss": 1.038, + "step": 13850 + }, + { + "epoch": 1.4, + "grad_norm": 17.040487629761007, + "learning_rate": 6.451177797765414e-06, + "loss": 1.0622, + "step": 13855 + }, + { + "epoch": 1.4, + "grad_norm": 11.209879361348221, + "learning_rate": 6.4483708006110546e-06, + "loss": 1.0178, + "step": 13860 + }, + { + "epoch": 1.4, + "grad_norm": 29.12635124130482, + "learning_rate": 6.445563305073685e-06, + "loss": 0.996, + "step": 13865 + }, + { + "epoch": 1.4, + "grad_norm": 30.88874303744498, + "learning_rate": 6.4427553121193665e-06, + "loss": 1.0448, + "step": 13870 + }, + { + "epoch": 1.4, + "grad_norm": 31.535720727730226, + "learning_rate": 6.4399468227143204e-06, + "loss": 1.0054, + "step": 13875 + }, + { + "epoch": 1.4, + "grad_norm": 15.864907947690293, + "learning_rate": 6.437137837824947e-06, + "loss": 1.0738, + "step": 13880 + }, + { + "epoch": 1.4, + "grad_norm": 30.114081794271012, + "learning_rate": 6.434328358417819e-06, + "loss": 1.0386, + "step": 13885 + }, + { + "epoch": 1.4, + "grad_norm": 23.122982889635825, + "learning_rate": 6.43151838545967e-06, + "loss": 1.0401, + "step": 13890 + }, + { + "epoch": 1.4, + "grad_norm": 45.08967697873334, + "learning_rate": 6.428707919917412e-06, + "loss": 1.0247, + "step": 13895 + }, + { + "epoch": 1.4, + "grad_norm": 8.635129933519156, + "learning_rate": 6.425896962758126e-06, + "loss": 1.0081, + "step": 13900 + }, + { + "epoch": 1.4, + "grad_norm": 12.054772921652399, + "learning_rate": 6.423085514949055e-06, + "loss": 1.0032, + "step": 13905 + }, + { + "epoch": 1.4, + "grad_norm": 16.190495259014995, + "learning_rate": 6.420273577457617e-06, + "loss": 1.0165, + "step": 13910 + }, + { + "epoch": 1.4, + "grad_norm": 29.90283692177583, + "learning_rate": 6.417461151251399e-06, + "loss": 1.1068, + "step": 13915 + }, + { + "epoch": 1.4, + "grad_norm": 23.32826749738727, + "learning_rate": 6.414648237298151e-06, + "loss": 1.0314, + "step": 13920 + }, + { + "epoch": 1.4, + "grad_norm": 17.451426247775657, + "learning_rate": 6.411834836565797e-06, + "loss": 1.018, + "step": 13925 + }, + { + "epoch": 1.4, + "grad_norm": 5.1216586854247375, + "learning_rate": 6.409020950022424e-06, + "loss": 1.0708, + "step": 13930 + }, + { + "epoch": 1.4, + "grad_norm": 7.443637085100315, + "learning_rate": 6.406206578636288e-06, + "loss": 1.0659, + "step": 13935 + }, + { + "epoch": 1.41, + "grad_norm": 20.288942801522673, + "learning_rate": 6.403391723375812e-06, + "loss": 1.0014, + "step": 13940 + }, + { + "epoch": 1.41, + "grad_norm": 18.847174012916593, + "learning_rate": 6.400576385209583e-06, + "loss": 0.9968, + "step": 13945 + }, + { + "epoch": 1.41, + "grad_norm": 27.161722995764585, + "learning_rate": 6.397760565106358e-06, + "loss": 1.0816, + "step": 13950 + }, + { + "epoch": 1.41, + "grad_norm": 11.175698198589409, + "learning_rate": 6.394944264035057e-06, + "loss": 1.0533, + "step": 13955 + }, + { + "epoch": 1.41, + "grad_norm": 6.558573764246493, + "learning_rate": 6.3921274829647685e-06, + "loss": 0.9715, + "step": 13960 + }, + { + "epoch": 1.41, + "grad_norm": 10.821086188227273, + "learning_rate": 6.389310222864742e-06, + "loss": 1.022, + "step": 13965 + }, + { + "epoch": 1.41, + "grad_norm": 10.366573020455437, + "learning_rate": 6.386492484704394e-06, + "loss": 1.0075, + "step": 13970 + }, + { + "epoch": 1.41, + "grad_norm": 6.897677438642832, + "learning_rate": 6.3836742694533084e-06, + "loss": 1.0126, + "step": 13975 + }, + { + "epoch": 1.41, + "grad_norm": 9.255974567583273, + "learning_rate": 6.380855578081227e-06, + "loss": 1.0278, + "step": 13980 + }, + { + "epoch": 1.41, + "grad_norm": 6.620443852704517, + "learning_rate": 6.378036411558058e-06, + "loss": 1.0286, + "step": 13985 + }, + { + "epoch": 1.41, + "grad_norm": 8.237252298755164, + "learning_rate": 6.3752167708538795e-06, + "loss": 1.0324, + "step": 13990 + }, + { + "epoch": 1.41, + "grad_norm": 15.148748108774727, + "learning_rate": 6.3723966569389215e-06, + "loss": 0.9985, + "step": 13995 + }, + { + "epoch": 1.41, + "grad_norm": 8.73717599860813, + "learning_rate": 6.369576070783585e-06, + "loss": 1.0606, + "step": 14000 + }, + { + "epoch": 1.41, + "grad_norm": 5.626033599591908, + "learning_rate": 6.366755013358431e-06, + "loss": 1.0858, + "step": 14005 + }, + { + "epoch": 1.41, + "grad_norm": 13.170414197647256, + "learning_rate": 6.3639334856341824e-06, + "loss": 1.038, + "step": 14010 + }, + { + "epoch": 1.41, + "grad_norm": 14.102484949514457, + "learning_rate": 6.361111488581724e-06, + "loss": 0.9809, + "step": 14015 + }, + { + "epoch": 1.41, + "grad_norm": 28.116597064326502, + "learning_rate": 6.358289023172102e-06, + "loss": 1.0353, + "step": 14020 + }, + { + "epoch": 1.41, + "grad_norm": 38.69362673107948, + "learning_rate": 6.355466090376526e-06, + "loss": 1.0562, + "step": 14025 + }, + { + "epoch": 1.41, + "grad_norm": 47.16918166965689, + "learning_rate": 6.35264269116636e-06, + "loss": 1.0332, + "step": 14030 + }, + { + "epoch": 1.42, + "grad_norm": 17.533420893919676, + "learning_rate": 6.349818826513137e-06, + "loss": 1.0296, + "step": 14035 + }, + { + "epoch": 1.42, + "grad_norm": 11.442797392178308, + "learning_rate": 6.346994497388545e-06, + "loss": 1.0713, + "step": 14040 + }, + { + "epoch": 1.42, + "grad_norm": 16.04787718380431, + "learning_rate": 6.344169704764433e-06, + "loss": 1.0068, + "step": 14045 + }, + { + "epoch": 1.42, + "grad_norm": 10.064768698412545, + "learning_rate": 6.341344449612811e-06, + "loss": 1.0295, + "step": 14050 + }, + { + "epoch": 1.42, + "grad_norm": 8.302200586498188, + "learning_rate": 6.338518732905844e-06, + "loss": 1.0236, + "step": 14055 + }, + { + "epoch": 1.42, + "grad_norm": 21.329774898188415, + "learning_rate": 6.3356925556158586e-06, + "loss": 1.058, + "step": 14060 + }, + { + "epoch": 1.42, + "grad_norm": 29.134880862354656, + "learning_rate": 6.332865918715341e-06, + "loss": 1.0199, + "step": 14065 + }, + { + "epoch": 1.42, + "grad_norm": 32.23187860799201, + "learning_rate": 6.330038823176935e-06, + "loss": 1.048, + "step": 14070 + }, + { + "epoch": 1.42, + "grad_norm": 37.799283905948954, + "learning_rate": 6.327211269973439e-06, + "loss": 1.0262, + "step": 14075 + }, + { + "epoch": 1.42, + "grad_norm": 12.704421125137417, + "learning_rate": 6.324383260077813e-06, + "loss": 1.0037, + "step": 14080 + }, + { + "epoch": 1.42, + "grad_norm": 7.9294961136118, + "learning_rate": 6.321554794463173e-06, + "loss": 1.0039, + "step": 14085 + }, + { + "epoch": 1.42, + "grad_norm": 6.850289258704223, + "learning_rate": 6.318725874102789e-06, + "loss": 1.0635, + "step": 14090 + }, + { + "epoch": 1.42, + "grad_norm": 11.226221318889067, + "learning_rate": 6.315896499970091e-06, + "loss": 0.9888, + "step": 14095 + }, + { + "epoch": 1.42, + "grad_norm": 12.130575799026504, + "learning_rate": 6.313066673038666e-06, + "loss": 1.0261, + "step": 14100 + }, + { + "epoch": 1.42, + "grad_norm": 9.480941531356995, + "learning_rate": 6.310236394282252e-06, + "loss": 1.0188, + "step": 14105 + }, + { + "epoch": 1.42, + "grad_norm": 7.185087851042856, + "learning_rate": 6.307405664674745e-06, + "loss": 1.0114, + "step": 14110 + }, + { + "epoch": 1.42, + "grad_norm": 6.910149014895005, + "learning_rate": 6.304574485190199e-06, + "loss": 1.0491, + "step": 14115 + }, + { + "epoch": 1.42, + "grad_norm": 5.739412132695425, + "learning_rate": 6.301742856802819e-06, + "loss": 0.9905, + "step": 14120 + }, + { + "epoch": 1.42, + "grad_norm": 7.0777644734152, + "learning_rate": 6.298910780486964e-06, + "loss": 1.021, + "step": 14125 + }, + { + "epoch": 1.42, + "grad_norm": 5.950787765281109, + "learning_rate": 6.29607825721715e-06, + "loss": 1.036, + "step": 14130 + }, + { + "epoch": 1.43, + "grad_norm": 10.30674289285351, + "learning_rate": 6.2932452879680475e-06, + "loss": 1.0568, + "step": 14135 + }, + { + "epoch": 1.43, + "grad_norm": 8.131594426838518, + "learning_rate": 6.290411873714475e-06, + "loss": 1.0144, + "step": 14140 + }, + { + "epoch": 1.43, + "grad_norm": 8.093782127641017, + "learning_rate": 6.287578015431409e-06, + "loss": 0.9836, + "step": 14145 + }, + { + "epoch": 1.43, + "grad_norm": 10.006982750133712, + "learning_rate": 6.284743714093979e-06, + "loss": 1.072, + "step": 14150 + }, + { + "epoch": 1.43, + "grad_norm": 5.524437370717353, + "learning_rate": 6.281908970677463e-06, + "loss": 1.0174, + "step": 14155 + }, + { + "epoch": 1.43, + "grad_norm": 18.981786541530653, + "learning_rate": 6.279073786157293e-06, + "loss": 0.9756, + "step": 14160 + }, + { + "epoch": 1.43, + "grad_norm": 8.638712127684034, + "learning_rate": 6.276238161509058e-06, + "loss": 1.0631, + "step": 14165 + }, + { + "epoch": 1.43, + "grad_norm": 10.830997597851976, + "learning_rate": 6.273402097708488e-06, + "loss": 1.0142, + "step": 14170 + }, + { + "epoch": 1.43, + "grad_norm": 9.764014225155503, + "learning_rate": 6.270565595731475e-06, + "loss": 1.0186, + "step": 14175 + }, + { + "epoch": 1.43, + "grad_norm": 13.199229045647108, + "learning_rate": 6.267728656554053e-06, + "loss": 0.9966, + "step": 14180 + }, + { + "epoch": 1.43, + "grad_norm": 18.008084456261397, + "learning_rate": 6.26489128115241e-06, + "loss": 1.0303, + "step": 14185 + }, + { + "epoch": 1.43, + "grad_norm": 21.477202925427804, + "learning_rate": 6.262053470502886e-06, + "loss": 0.9913, + "step": 14190 + }, + { + "epoch": 1.43, + "grad_norm": 6.387465020085141, + "learning_rate": 6.259215225581968e-06, + "loss": 1.0221, + "step": 14195 + }, + { + "epoch": 1.43, + "grad_norm": 19.397843353781308, + "learning_rate": 6.256376547366294e-06, + "loss": 1.0107, + "step": 14200 + }, + { + "epoch": 1.43, + "grad_norm": 8.135092010729808, + "learning_rate": 6.25353743683265e-06, + "loss": 1.0878, + "step": 14205 + }, + { + "epoch": 1.43, + "grad_norm": 16.12569114899103, + "learning_rate": 6.250697894957971e-06, + "loss": 0.9939, + "step": 14210 + }, + { + "epoch": 1.43, + "grad_norm": 5.956171692699573, + "learning_rate": 6.247857922719341e-06, + "loss": 1.0283, + "step": 14215 + }, + { + "epoch": 1.43, + "grad_norm": 11.561231576101571, + "learning_rate": 6.24501752109399e-06, + "loss": 1.0195, + "step": 14220 + }, + { + "epoch": 1.43, + "grad_norm": 5.562739598868662, + "learning_rate": 6.242176691059302e-06, + "loss": 1.0614, + "step": 14225 + }, + { + "epoch": 1.43, + "grad_norm": 9.959521963283766, + "learning_rate": 6.239335433592799e-06, + "loss": 1.0354, + "step": 14230 + }, + { + "epoch": 1.44, + "grad_norm": 14.3662667179511, + "learning_rate": 6.2364937496721575e-06, + "loss": 1.0758, + "step": 14235 + }, + { + "epoch": 1.44, + "grad_norm": 12.379812977336815, + "learning_rate": 6.233651640275199e-06, + "loss": 1.009, + "step": 14240 + }, + { + "epoch": 1.44, + "grad_norm": 13.883131577198084, + "learning_rate": 6.2308091063798895e-06, + "loss": 1.0122, + "step": 14245 + }, + { + "epoch": 1.44, + "grad_norm": 6.14167183566433, + "learning_rate": 6.227966148964339e-06, + "loss": 1.018, + "step": 14250 + }, + { + "epoch": 1.44, + "grad_norm": 13.308290140967893, + "learning_rate": 6.2251227690068125e-06, + "loss": 1.0288, + "step": 14255 + }, + { + "epoch": 1.44, + "grad_norm": 14.664006828581652, + "learning_rate": 6.22227896748571e-06, + "loss": 0.9823, + "step": 14260 + }, + { + "epoch": 1.44, + "grad_norm": 9.49020895969274, + "learning_rate": 6.219434745379582e-06, + "loss": 1.0258, + "step": 14265 + }, + { + "epoch": 1.44, + "grad_norm": 12.663898407814036, + "learning_rate": 6.216590103667124e-06, + "loss": 1.0244, + "step": 14270 + }, + { + "epoch": 1.44, + "grad_norm": 6.217872837879128, + "learning_rate": 6.213745043327172e-06, + "loss": 1.0175, + "step": 14275 + }, + { + "epoch": 1.44, + "grad_norm": 7.949280368143053, + "learning_rate": 6.210899565338711e-06, + "loss": 1.001, + "step": 14280 + }, + { + "epoch": 1.44, + "grad_norm": 30.094188801468604, + "learning_rate": 6.208053670680864e-06, + "loss": 1.048, + "step": 14285 + }, + { + "epoch": 1.44, + "grad_norm": 20.347516427833664, + "learning_rate": 6.205207360332906e-06, + "loss": 0.9853, + "step": 14290 + }, + { + "epoch": 1.44, + "grad_norm": 28.21486187237754, + "learning_rate": 6.202360635274243e-06, + "loss": 1.0146, + "step": 14295 + }, + { + "epoch": 1.44, + "grad_norm": 12.837680300255675, + "learning_rate": 6.199513496484436e-06, + "loss": 0.9886, + "step": 14300 + }, + { + "epoch": 1.44, + "grad_norm": 7.318146291443392, + "learning_rate": 6.19666594494318e-06, + "loss": 1.0477, + "step": 14305 + }, + { + "epoch": 1.44, + "grad_norm": 8.860643957291648, + "learning_rate": 6.193817981630314e-06, + "loss": 1.0713, + "step": 14310 + }, + { + "epoch": 1.44, + "grad_norm": 17.737601081480946, + "learning_rate": 6.190969607525823e-06, + "loss": 1.0437, + "step": 14315 + }, + { + "epoch": 1.44, + "grad_norm": 11.679315730182553, + "learning_rate": 6.188120823609826e-06, + "loss": 1.0078, + "step": 14320 + }, + { + "epoch": 1.44, + "grad_norm": 6.776719899109231, + "learning_rate": 6.185271630862588e-06, + "loss": 1.0335, + "step": 14325 + }, + { + "epoch": 1.44, + "grad_norm": 7.095041730152499, + "learning_rate": 6.1824220302645165e-06, + "loss": 1.0284, + "step": 14330 + }, + { + "epoch": 1.45, + "grad_norm": 9.704614074308678, + "learning_rate": 6.179572022796151e-06, + "loss": 1.0643, + "step": 14335 + }, + { + "epoch": 1.45, + "grad_norm": 6.134028494062003, + "learning_rate": 6.17672160943818e-06, + "loss": 0.9957, + "step": 14340 + }, + { + "epoch": 1.45, + "grad_norm": 7.781740867825054, + "learning_rate": 6.173870791171428e-06, + "loss": 1.0073, + "step": 14345 + }, + { + "epoch": 1.45, + "grad_norm": 5.8775956473541, + "learning_rate": 6.171019568976857e-06, + "loss": 0.9979, + "step": 14350 + }, + { + "epoch": 1.45, + "grad_norm": 9.887612818763753, + "learning_rate": 6.16816794383557e-06, + "loss": 1.0681, + "step": 14355 + }, + { + "epoch": 1.45, + "grad_norm": 10.619252812637434, + "learning_rate": 6.1653159167288115e-06, + "loss": 1.0361, + "step": 14360 + }, + { + "epoch": 1.45, + "grad_norm": 16.324555085616012, + "learning_rate": 6.162463488637957e-06, + "loss": 1.0245, + "step": 14365 + }, + { + "epoch": 1.45, + "grad_norm": 9.315834199483762, + "learning_rate": 6.159610660544526e-06, + "loss": 1.0556, + "step": 14370 + }, + { + "epoch": 1.45, + "grad_norm": 6.9917183655457675, + "learning_rate": 6.156757433430176e-06, + "loss": 1.051, + "step": 14375 + }, + { + "epoch": 1.45, + "grad_norm": 10.626563270273751, + "learning_rate": 6.153903808276698e-06, + "loss": 1.0146, + "step": 14380 + }, + { + "epoch": 1.45, + "grad_norm": 9.43592684093022, + "learning_rate": 6.151049786066021e-06, + "loss": 1.0138, + "step": 14385 + }, + { + "epoch": 1.45, + "grad_norm": 11.265247039986576, + "learning_rate": 6.148195367780211e-06, + "loss": 1.0057, + "step": 14390 + }, + { + "epoch": 1.45, + "grad_norm": 8.458256378539899, + "learning_rate": 6.145340554401473e-06, + "loss": 1.0076, + "step": 14395 + }, + { + "epoch": 1.45, + "grad_norm": 5.2129788646983375, + "learning_rate": 6.1424853469121436e-06, + "loss": 1.0102, + "step": 14400 + }, + { + "epoch": 1.45, + "grad_norm": 6.904045579179168, + "learning_rate": 6.139629746294698e-06, + "loss": 1.0414, + "step": 14405 + }, + { + "epoch": 1.45, + "grad_norm": 12.254639251979803, + "learning_rate": 6.1367737535317455e-06, + "loss": 1.0348, + "step": 14410 + }, + { + "epoch": 1.45, + "grad_norm": 24.60089000632015, + "learning_rate": 6.133917369606028e-06, + "loss": 1.0041, + "step": 14415 + }, + { + "epoch": 1.45, + "grad_norm": 25.11777311760514, + "learning_rate": 6.131060595500429e-06, + "loss": 1.0148, + "step": 14420 + }, + { + "epoch": 1.45, + "grad_norm": 13.428895668678699, + "learning_rate": 6.128203432197959e-06, + "loss": 1.0127, + "step": 14425 + }, + { + "epoch": 1.45, + "grad_norm": 15.744034932615966, + "learning_rate": 6.125345880681763e-06, + "loss": 0.9718, + "step": 14430 + }, + { + "epoch": 1.46, + "grad_norm": 9.820188753420734, + "learning_rate": 6.1224879419351265e-06, + "loss": 1.0571, + "step": 14435 + }, + { + "epoch": 1.46, + "grad_norm": 7.721814644490256, + "learning_rate": 6.119629616941462e-06, + "loss": 1.0497, + "step": 14440 + }, + { + "epoch": 1.46, + "grad_norm": 10.363061957577708, + "learning_rate": 6.116770906684315e-06, + "loss": 1.0061, + "step": 14445 + }, + { + "epoch": 1.46, + "grad_norm": 11.604748909394797, + "learning_rate": 6.113911812147364e-06, + "loss": 1.0434, + "step": 14450 + }, + { + "epoch": 1.46, + "grad_norm": 8.423518434542537, + "learning_rate": 6.1110523343144245e-06, + "loss": 0.9749, + "step": 14455 + }, + { + "epoch": 1.46, + "grad_norm": 5.908629896757577, + "learning_rate": 6.108192474169438e-06, + "loss": 1.0043, + "step": 14460 + }, + { + "epoch": 1.46, + "grad_norm": 14.146388089823004, + "learning_rate": 6.105332232696478e-06, + "loss": 1.023, + "step": 14465 + }, + { + "epoch": 1.46, + "grad_norm": 6.164722658583353, + "learning_rate": 6.1024716108797536e-06, + "loss": 1.0099, + "step": 14470 + }, + { + "epoch": 1.46, + "grad_norm": 9.13629495963452, + "learning_rate": 6.0996106097036005e-06, + "loss": 1.0635, + "step": 14475 + }, + { + "epoch": 1.46, + "grad_norm": 7.800462873014633, + "learning_rate": 6.096749230152486e-06, + "loss": 1.0582, + "step": 14480 + }, + { + "epoch": 1.46, + "grad_norm": 16.102963227939757, + "learning_rate": 6.093887473211011e-06, + "loss": 1.0162, + "step": 14485 + }, + { + "epoch": 1.46, + "grad_norm": 6.403683996342634, + "learning_rate": 6.0910253398638986e-06, + "loss": 0.9756, + "step": 14490 + }, + { + "epoch": 1.46, + "grad_norm": 7.1986272105334885, + "learning_rate": 6.08816283109601e-06, + "loss": 1.0398, + "step": 14495 + }, + { + "epoch": 1.46, + "grad_norm": 10.499142846058897, + "learning_rate": 6.085299947892331e-06, + "loss": 1.0325, + "step": 14500 + }, + { + "epoch": 1.46, + "grad_norm": 6.443651726931194, + "learning_rate": 6.082436691237977e-06, + "loss": 1.0316, + "step": 14505 + }, + { + "epoch": 1.46, + "grad_norm": 5.599973887823316, + "learning_rate": 6.079573062118192e-06, + "loss": 0.9952, + "step": 14510 + }, + { + "epoch": 1.46, + "grad_norm": 9.28820497848877, + "learning_rate": 6.076709061518345e-06, + "loss": 0.9898, + "step": 14515 + }, + { + "epoch": 1.46, + "grad_norm": 9.505366057732097, + "learning_rate": 6.07384469042394e-06, + "loss": 1.0557, + "step": 14520 + }, + { + "epoch": 1.46, + "grad_norm": 6.316945997554746, + "learning_rate": 6.070979949820601e-06, + "loss": 0.9976, + "step": 14525 + }, + { + "epoch": 1.46, + "grad_norm": 9.461804826693626, + "learning_rate": 6.068114840694085e-06, + "loss": 1.0098, + "step": 14530 + }, + { + "epoch": 1.47, + "grad_norm": 10.95271979417951, + "learning_rate": 6.065249364030274e-06, + "loss": 1.0732, + "step": 14535 + }, + { + "epoch": 1.47, + "grad_norm": 5.253690343477784, + "learning_rate": 6.062383520815173e-06, + "loss": 1.0354, + "step": 14540 + }, + { + "epoch": 1.47, + "grad_norm": 12.420956881537123, + "learning_rate": 6.059517312034916e-06, + "loss": 1.0241, + "step": 14545 + }, + { + "epoch": 1.47, + "grad_norm": 9.335665009559412, + "learning_rate": 6.056650738675765e-06, + "loss": 1.039, + "step": 14550 + }, + { + "epoch": 1.47, + "grad_norm": 6.074575706713052, + "learning_rate": 6.053783801724104e-06, + "loss": 1.026, + "step": 14555 + }, + { + "epoch": 1.47, + "grad_norm": 8.785882460179483, + "learning_rate": 6.050916502166444e-06, + "loss": 1.0531, + "step": 14560 + }, + { + "epoch": 1.47, + "grad_norm": 12.30238163595188, + "learning_rate": 6.048048840989419e-06, + "loss": 1.0576, + "step": 14565 + }, + { + "epoch": 1.47, + "grad_norm": 7.506627651398785, + "learning_rate": 6.045180819179788e-06, + "loss": 1.072, + "step": 14570 + }, + { + "epoch": 1.47, + "grad_norm": 11.198950377669895, + "learning_rate": 6.042312437724436e-06, + "loss": 1.0364, + "step": 14575 + }, + { + "epoch": 1.47, + "grad_norm": 13.304318128207594, + "learning_rate": 6.03944369761037e-06, + "loss": 1.0088, + "step": 14580 + }, + { + "epoch": 1.47, + "grad_norm": 7.41513228240807, + "learning_rate": 6.036574599824719e-06, + "loss": 1.0097, + "step": 14585 + }, + { + "epoch": 1.47, + "grad_norm": 15.723566109044413, + "learning_rate": 6.033705145354739e-06, + "loss": 0.9864, + "step": 14590 + }, + { + "epoch": 1.47, + "grad_norm": 13.220611774840002, + "learning_rate": 6.0308353351878066e-06, + "loss": 1.0636, + "step": 14595 + }, + { + "epoch": 1.47, + "grad_norm": 28.326960131834948, + "learning_rate": 6.0279651703114195e-06, + "loss": 1.0307, + "step": 14600 + }, + { + "epoch": 1.47, + "grad_norm": 21.15053825545421, + "learning_rate": 6.0250946517131995e-06, + "loss": 1.0315, + "step": 14605 + }, + { + "epoch": 1.47, + "grad_norm": 19.95053160025602, + "learning_rate": 6.0222237803808895e-06, + "loss": 1.0259, + "step": 14610 + }, + { + "epoch": 1.47, + "grad_norm": 23.706207217394876, + "learning_rate": 6.01935255730235e-06, + "loss": 1.0002, + "step": 14615 + }, + { + "epoch": 1.47, + "grad_norm": 12.140859339836725, + "learning_rate": 6.016480983465572e-06, + "loss": 1.0286, + "step": 14620 + }, + { + "epoch": 1.47, + "grad_norm": 7.605635830489675, + "learning_rate": 6.013609059858657e-06, + "loss": 1.0206, + "step": 14625 + }, + { + "epoch": 1.48, + "grad_norm": 11.045248244303234, + "learning_rate": 6.010736787469833e-06, + "loss": 1.0038, + "step": 14630 + }, + { + "epoch": 1.48, + "grad_norm": 13.588855130318489, + "learning_rate": 6.007864167287447e-06, + "loss": 1.034, + "step": 14635 + }, + { + "epoch": 1.48, + "grad_norm": 6.09002337269585, + "learning_rate": 6.004991200299962e-06, + "loss": 0.9912, + "step": 14640 + }, + { + "epoch": 1.48, + "grad_norm": 20.41358760134712, + "learning_rate": 6.0021178874959665e-06, + "loss": 1.0801, + "step": 14645 + }, + { + "epoch": 1.48, + "grad_norm": 14.779793206949819, + "learning_rate": 5.999244229864162e-06, + "loss": 1.0057, + "step": 14650 + }, + { + "epoch": 1.48, + "grad_norm": 9.243732728824146, + "learning_rate": 5.996370228393371e-06, + "loss": 1.0523, + "step": 14655 + }, + { + "epoch": 1.48, + "grad_norm": 12.525122559658719, + "learning_rate": 5.993495884072538e-06, + "loss": 1.0188, + "step": 14660 + }, + { + "epoch": 1.48, + "grad_norm": 16.96403205642408, + "learning_rate": 5.99062119789072e-06, + "loss": 1.0091, + "step": 14665 + }, + { + "epoch": 1.48, + "grad_norm": 17.281253296587717, + "learning_rate": 5.987746170837093e-06, + "loss": 1.0365, + "step": 14670 + }, + { + "epoch": 1.48, + "grad_norm": 5.516804753459836, + "learning_rate": 5.984870803900953e-06, + "loss": 1.0164, + "step": 14675 + }, + { + "epoch": 1.48, + "grad_norm": 6.549956303475277, + "learning_rate": 5.98199509807171e-06, + "loss": 1.0113, + "step": 14680 + }, + { + "epoch": 1.48, + "grad_norm": 5.293675045161462, + "learning_rate": 5.979119054338891e-06, + "loss": 1.0273, + "step": 14685 + }, + { + "epoch": 1.48, + "grad_norm": 13.75212362247369, + "learning_rate": 5.976242673692141e-06, + "loss": 1.0119, + "step": 14690 + }, + { + "epoch": 1.48, + "grad_norm": 12.283894924775016, + "learning_rate": 5.973365957121219e-06, + "loss": 1.0087, + "step": 14695 + }, + { + "epoch": 1.48, + "grad_norm": 17.99199079388364, + "learning_rate": 5.970488905616001e-06, + "loss": 0.9819, + "step": 14700 + }, + { + "epoch": 1.48, + "grad_norm": 8.31338495628191, + "learning_rate": 5.967611520166479e-06, + "loss": 0.9973, + "step": 14705 + }, + { + "epoch": 1.48, + "grad_norm": 15.28664721508452, + "learning_rate": 5.964733801762754e-06, + "loss": 1.0251, + "step": 14710 + }, + { + "epoch": 1.48, + "grad_norm": 16.402366788820295, + "learning_rate": 5.961855751395052e-06, + "loss": 1.0345, + "step": 14715 + }, + { + "epoch": 1.48, + "grad_norm": 7.971032257036951, + "learning_rate": 5.9589773700537035e-06, + "loss": 1.0322, + "step": 14720 + }, + { + "epoch": 1.48, + "grad_norm": 10.61183751369133, + "learning_rate": 5.956098658729157e-06, + "loss": 1.0088, + "step": 14725 + }, + { + "epoch": 1.49, + "grad_norm": 7.953844617439651, + "learning_rate": 5.953219618411976e-06, + "loss": 1.0001, + "step": 14730 + }, + { + "epoch": 1.49, + "grad_norm": 7.375623072845399, + "learning_rate": 5.950340250092835e-06, + "loss": 1.0226, + "step": 14735 + }, + { + "epoch": 1.49, + "grad_norm": 8.030057617795244, + "learning_rate": 5.947460554762522e-06, + "loss": 0.9634, + "step": 14740 + }, + { + "epoch": 1.49, + "grad_norm": 13.009853891348671, + "learning_rate": 5.944580533411935e-06, + "loss": 1.0118, + "step": 14745 + }, + { + "epoch": 1.49, + "grad_norm": 28.990989603600664, + "learning_rate": 5.9417001870320925e-06, + "loss": 1.0167, + "step": 14750 + }, + { + "epoch": 1.49, + "grad_norm": 6.118613775864574, + "learning_rate": 5.9388195166141114e-06, + "loss": 1.0082, + "step": 14755 + }, + { + "epoch": 1.49, + "grad_norm": 13.273742102437229, + "learning_rate": 5.935938523149234e-06, + "loss": 1.0389, + "step": 14760 + }, + { + "epoch": 1.49, + "grad_norm": 39.17306160606416, + "learning_rate": 5.933057207628804e-06, + "loss": 1.0493, + "step": 14765 + }, + { + "epoch": 1.49, + "grad_norm": 24.024743927701977, + "learning_rate": 5.930175571044279e-06, + "loss": 0.9837, + "step": 14770 + }, + { + "epoch": 1.49, + "grad_norm": 10.705609986527344, + "learning_rate": 5.927293614387229e-06, + "loss": 1.0272, + "step": 14775 + }, + { + "epoch": 1.49, + "grad_norm": 23.230859114470846, + "learning_rate": 5.924411338649332e-06, + "loss": 1.0593, + "step": 14780 + }, + { + "epoch": 1.49, + "grad_norm": 35.492015860924646, + "learning_rate": 5.921528744822376e-06, + "loss": 1.0298, + "step": 14785 + }, + { + "epoch": 1.49, + "grad_norm": 9.562825349710867, + "learning_rate": 5.918645833898257e-06, + "loss": 1.0212, + "step": 14790 + }, + { + "epoch": 1.49, + "grad_norm": 11.27956387033416, + "learning_rate": 5.915762606868987e-06, + "loss": 1.0027, + "step": 14795 + }, + { + "epoch": 1.49, + "grad_norm": 26.21674719448401, + "learning_rate": 5.912879064726679e-06, + "loss": 0.98, + "step": 14800 + }, + { + "epoch": 1.49, + "grad_norm": 8.700641274635641, + "learning_rate": 5.909995208463555e-06, + "loss": 1.0159, + "step": 14805 + }, + { + "epoch": 1.49, + "grad_norm": 25.915543721817727, + "learning_rate": 5.907111039071949e-06, + "loss": 1.0333, + "step": 14810 + }, + { + "epoch": 1.49, + "grad_norm": 10.689572006386276, + "learning_rate": 5.9042265575443015e-06, + "loss": 1.024, + "step": 14815 + }, + { + "epoch": 1.49, + "grad_norm": 17.410061579336677, + "learning_rate": 5.901341764873159e-06, + "loss": 1.0091, + "step": 14820 + }, + { + "epoch": 1.49, + "grad_norm": 20.74331671098658, + "learning_rate": 5.898456662051175e-06, + "loss": 0.9996, + "step": 14825 + }, + { + "epoch": 1.5, + "grad_norm": 16.260279666494277, + "learning_rate": 5.895571250071114e-06, + "loss": 1.0082, + "step": 14830 + }, + { + "epoch": 1.5, + "grad_norm": 6.515962590425934, + "learning_rate": 5.892685529925839e-06, + "loss": 1.0382, + "step": 14835 + }, + { + "epoch": 1.5, + "grad_norm": 21.656660220190645, + "learning_rate": 5.889799502608326e-06, + "loss": 1.0119, + "step": 14840 + }, + { + "epoch": 1.5, + "grad_norm": 13.943941394119102, + "learning_rate": 5.886913169111655e-06, + "loss": 1.0148, + "step": 14845 + }, + { + "epoch": 1.5, + "grad_norm": 5.059556966773174, + "learning_rate": 5.88402653042901e-06, + "loss": 1.0267, + "step": 14850 + }, + { + "epoch": 1.5, + "grad_norm": 10.230521020102254, + "learning_rate": 5.881139587553679e-06, + "loss": 1.0459, + "step": 14855 + }, + { + "epoch": 1.5, + "grad_norm": 7.029114904796708, + "learning_rate": 5.878252341479058e-06, + "loss": 1.0302, + "step": 14860 + }, + { + "epoch": 1.5, + "grad_norm": 7.191819892272424, + "learning_rate": 5.8753647931986455e-06, + "loss": 1.0411, + "step": 14865 + }, + { + "epoch": 1.5, + "grad_norm": 10.431883483873603, + "learning_rate": 5.872476943706043e-06, + "loss": 1.0362, + "step": 14870 + }, + { + "epoch": 1.5, + "grad_norm": 7.225020541416042, + "learning_rate": 5.869588793994958e-06, + "loss": 1.0278, + "step": 14875 + }, + { + "epoch": 1.5, + "grad_norm": 6.718840666658366, + "learning_rate": 5.8667003450592e-06, + "loss": 0.9902, + "step": 14880 + }, + { + "epoch": 1.5, + "grad_norm": 16.306096075768597, + "learning_rate": 5.863811597892679e-06, + "loss": 1.0415, + "step": 14885 + }, + { + "epoch": 1.5, + "grad_norm": 17.330385188221122, + "learning_rate": 5.860922553489416e-06, + "loss": 1.045, + "step": 14890 + }, + { + "epoch": 1.5, + "grad_norm": 10.898400490052744, + "learning_rate": 5.858033212843521e-06, + "loss": 1.0119, + "step": 14895 + }, + { + "epoch": 1.5, + "grad_norm": 7.821759514758661, + "learning_rate": 5.855143576949218e-06, + "loss": 1.1096, + "step": 14900 + }, + { + "epoch": 1.5, + "grad_norm": 16.145556238240854, + "learning_rate": 5.852253646800829e-06, + "loss": 1.0223, + "step": 14905 + }, + { + "epoch": 1.5, + "grad_norm": 11.538354365516687, + "learning_rate": 5.849363423392771e-06, + "loss": 1.0003, + "step": 14910 + }, + { + "epoch": 1.5, + "grad_norm": 8.376499493234194, + "learning_rate": 5.846472907719571e-06, + "loss": 1.0383, + "step": 14915 + }, + { + "epoch": 1.5, + "grad_norm": 15.640406948927293, + "learning_rate": 5.843582100775854e-06, + "loss": 1.0545, + "step": 14920 + }, + { + "epoch": 1.5, + "grad_norm": 41.71062596817106, + "learning_rate": 5.84069100355634e-06, + "loss": 1.0365, + "step": 14925 + }, + { + "epoch": 1.51, + "grad_norm": 36.440213900509036, + "learning_rate": 5.837799617055855e-06, + "loss": 1.0287, + "step": 14930 + }, + { + "epoch": 1.51, + "grad_norm": 13.231312091439358, + "learning_rate": 5.834907942269323e-06, + "loss": 1.0196, + "step": 14935 + }, + { + "epoch": 1.51, + "grad_norm": 6.67987282339489, + "learning_rate": 5.832015980191763e-06, + "loss": 1.0247, + "step": 14940 + }, + { + "epoch": 1.51, + "grad_norm": 6.027745730413337, + "learning_rate": 5.8291237318183e-06, + "loss": 1.0395, + "step": 14945 + }, + { + "epoch": 1.51, + "grad_norm": 8.944458259992423, + "learning_rate": 5.826231198144153e-06, + "loss": 0.9978, + "step": 14950 + }, + { + "epoch": 1.51, + "grad_norm": 7.375857405507643, + "learning_rate": 5.82333838016464e-06, + "loss": 1.0039, + "step": 14955 + }, + { + "epoch": 1.51, + "grad_norm": 29.02177122478957, + "learning_rate": 5.820445278875177e-06, + "loss": 1.011, + "step": 14960 + }, + { + "epoch": 1.51, + "grad_norm": 12.960662825766184, + "learning_rate": 5.817551895271277e-06, + "loss": 1.036, + "step": 14965 + }, + { + "epoch": 1.51, + "grad_norm": 6.955402914365195, + "learning_rate": 5.81465823034855e-06, + "loss": 0.9873, + "step": 14970 + }, + { + "epoch": 1.51, + "grad_norm": 19.03775796460291, + "learning_rate": 5.811764285102704e-06, + "loss": 1.002, + "step": 14975 + }, + { + "epoch": 1.51, + "grad_norm": 20.372957156704015, + "learning_rate": 5.8088700605295445e-06, + "loss": 1.0141, + "step": 14980 + }, + { + "epoch": 1.51, + "grad_norm": 5.777200256501984, + "learning_rate": 5.805975557624969e-06, + "loss": 1.0528, + "step": 14985 + }, + { + "epoch": 1.51, + "grad_norm": 48.67917693510359, + "learning_rate": 5.803080777384973e-06, + "loss": 1.0117, + "step": 14990 + }, + { + "epoch": 1.51, + "grad_norm": 9.497038082024737, + "learning_rate": 5.800185720805649e-06, + "loss": 0.9775, + "step": 14995 + }, + { + "epoch": 1.51, + "grad_norm": 23.166715738992746, + "learning_rate": 5.797290388883183e-06, + "loss": 0.9993, + "step": 15000 + }, + { + "epoch": 1.51, + "grad_norm": 13.057863642501305, + "learning_rate": 5.794394782613855e-06, + "loss": 1.001, + "step": 15005 + }, + { + "epoch": 1.51, + "grad_norm": 6.830435844917975, + "learning_rate": 5.7914989029940414e-06, + "loss": 1.0265, + "step": 15010 + }, + { + "epoch": 1.51, + "grad_norm": 35.212613349302586, + "learning_rate": 5.788602751020211e-06, + "loss": 1.0209, + "step": 15015 + }, + { + "epoch": 1.51, + "grad_norm": 17.05136078336535, + "learning_rate": 5.785706327688929e-06, + "loss": 1.0196, + "step": 15020 + }, + { + "epoch": 1.51, + "grad_norm": 15.07256425645445, + "learning_rate": 5.782809633996849e-06, + "loss": 0.9761, + "step": 15025 + }, + { + "epoch": 1.52, + "grad_norm": 25.085596348207954, + "learning_rate": 5.7799126709407215e-06, + "loss": 1.0027, + "step": 15030 + }, + { + "epoch": 1.52, + "grad_norm": 6.371754516177725, + "learning_rate": 5.777015439517389e-06, + "loss": 1.0181, + "step": 15035 + }, + { + "epoch": 1.52, + "grad_norm": 5.5333864685009395, + "learning_rate": 5.774117940723784e-06, + "loss": 1.0047, + "step": 15040 + }, + { + "epoch": 1.52, + "grad_norm": 7.500730030934907, + "learning_rate": 5.771220175556938e-06, + "loss": 1.0336, + "step": 15045 + }, + { + "epoch": 1.52, + "grad_norm": 9.709625588493253, + "learning_rate": 5.768322145013964e-06, + "loss": 1.0299, + "step": 15050 + }, + { + "epoch": 1.52, + "grad_norm": 11.338952099488806, + "learning_rate": 5.765423850092073e-06, + "loss": 1.036, + "step": 15055 + }, + { + "epoch": 1.52, + "grad_norm": 5.169962548517268, + "learning_rate": 5.762525291788569e-06, + "loss": 0.982, + "step": 15060 + }, + { + "epoch": 1.52, + "grad_norm": 5.900902537411488, + "learning_rate": 5.759626471100839e-06, + "loss": 0.9946, + "step": 15065 + }, + { + "epoch": 1.52, + "grad_norm": 7.1707931642179235, + "learning_rate": 5.756727389026365e-06, + "loss": 1.0523, + "step": 15070 + }, + { + "epoch": 1.52, + "grad_norm": 6.353224816483072, + "learning_rate": 5.753828046562721e-06, + "loss": 0.9908, + "step": 15075 + }, + { + "epoch": 1.52, + "grad_norm": 13.573037894562587, + "learning_rate": 5.7509284447075644e-06, + "loss": 0.997, + "step": 15080 + }, + { + "epoch": 1.52, + "grad_norm": 11.837813817381356, + "learning_rate": 5.748028584458648e-06, + "loss": 1.0719, + "step": 15085 + }, + { + "epoch": 1.52, + "grad_norm": 12.876634308775737, + "learning_rate": 5.745128466813811e-06, + "loss": 1.0037, + "step": 15090 + }, + { + "epoch": 1.52, + "grad_norm": 11.539536593451036, + "learning_rate": 5.742228092770978e-06, + "loss": 1.0185, + "step": 15095 + }, + { + "epoch": 1.52, + "grad_norm": 33.397164571504376, + "learning_rate": 5.739327463328168e-06, + "loss": 1.0098, + "step": 15100 + }, + { + "epoch": 1.52, + "grad_norm": 29.183823725475442, + "learning_rate": 5.736426579483486e-06, + "loss": 1.0048, + "step": 15105 + }, + { + "epoch": 1.52, + "grad_norm": 24.587574963191283, + "learning_rate": 5.733525442235119e-06, + "loss": 1.0098, + "step": 15110 + }, + { + "epoch": 1.52, + "grad_norm": 6.947952312742559, + "learning_rate": 5.730624052581349e-06, + "loss": 1.0179, + "step": 15115 + }, + { + "epoch": 1.52, + "grad_norm": 10.89284380916069, + "learning_rate": 5.727722411520541e-06, + "loss": 0.9783, + "step": 15120 + }, + { + "epoch": 1.52, + "grad_norm": 18.396496552929893, + "learning_rate": 5.724820520051145e-06, + "loss": 1.0223, + "step": 15125 + }, + { + "epoch": 1.53, + "grad_norm": 23.512104397954204, + "learning_rate": 5.7219183791717004e-06, + "loss": 0.998, + "step": 15130 + }, + { + "epoch": 1.53, + "grad_norm": 8.491487800240288, + "learning_rate": 5.719015989880831e-06, + "loss": 1.0189, + "step": 15135 + }, + { + "epoch": 1.53, + "grad_norm": 7.884338842606663, + "learning_rate": 5.716113353177247e-06, + "loss": 1.0263, + "step": 15140 + }, + { + "epoch": 1.53, + "grad_norm": 6.324532652016312, + "learning_rate": 5.713210470059743e-06, + "loss": 1.0832, + "step": 15145 + }, + { + "epoch": 1.53, + "grad_norm": 7.037924639353008, + "learning_rate": 5.710307341527196e-06, + "loss": 1.0221, + "step": 15150 + }, + { + "epoch": 1.53, + "grad_norm": 5.74501667455677, + "learning_rate": 5.707403968578572e-06, + "loss": 1.0076, + "step": 15155 + }, + { + "epoch": 1.53, + "grad_norm": 7.5396246138440945, + "learning_rate": 5.704500352212918e-06, + "loss": 1.067, + "step": 15160 + }, + { + "epoch": 1.53, + "grad_norm": 8.767467544066514, + "learning_rate": 5.701596493429363e-06, + "loss": 0.9922, + "step": 15165 + }, + { + "epoch": 1.53, + "grad_norm": 9.52735107254121, + "learning_rate": 5.6986923932271285e-06, + "loss": 1.0567, + "step": 15170 + }, + { + "epoch": 1.53, + "grad_norm": 5.0504532995719345, + "learning_rate": 5.695788052605507e-06, + "loss": 0.9651, + "step": 15175 + }, + { + "epoch": 1.53, + "grad_norm": 13.310908479402293, + "learning_rate": 5.69288347256388e-06, + "loss": 1.038, + "step": 15180 + }, + { + "epoch": 1.53, + "grad_norm": 7.202300147414778, + "learning_rate": 5.689978654101715e-06, + "loss": 0.9734, + "step": 15185 + }, + { + "epoch": 1.53, + "grad_norm": 7.184742770785585, + "learning_rate": 5.687073598218551e-06, + "loss": 1.0026, + "step": 15190 + }, + { + "epoch": 1.53, + "grad_norm": 6.543331062085483, + "learning_rate": 5.68416830591402e-06, + "loss": 1.0343, + "step": 15195 + }, + { + "epoch": 1.53, + "grad_norm": 18.40410285840536, + "learning_rate": 5.681262778187828e-06, + "loss": 1.0112, + "step": 15200 + }, + { + "epoch": 1.53, + "grad_norm": 8.552560720211156, + "learning_rate": 5.678357016039764e-06, + "loss": 0.9836, + "step": 15205 + }, + { + "epoch": 1.53, + "grad_norm": 15.137282707856617, + "learning_rate": 5.675451020469699e-06, + "loss": 1.0329, + "step": 15210 + }, + { + "epoch": 1.53, + "grad_norm": 12.269031411588143, + "learning_rate": 5.672544792477584e-06, + "loss": 0.9694, + "step": 15215 + }, + { + "epoch": 1.53, + "grad_norm": 8.489519557977571, + "learning_rate": 5.669638333063448e-06, + "loss": 0.9874, + "step": 15220 + }, + { + "epoch": 1.54, + "grad_norm": 13.159324332143338, + "learning_rate": 5.666731643227399e-06, + "loss": 0.9907, + "step": 15225 + }, + { + "epoch": 1.54, + "grad_norm": 18.394066611868105, + "learning_rate": 5.663824723969631e-06, + "loss": 0.9899, + "step": 15230 + }, + { + "epoch": 1.54, + "grad_norm": 18.369779247598046, + "learning_rate": 5.6609175762904075e-06, + "loss": 0.9992, + "step": 15235 + }, + { + "epoch": 1.54, + "grad_norm": 7.373663946948557, + "learning_rate": 5.658010201190078e-06, + "loss": 1.0451, + "step": 15240 + }, + { + "epoch": 1.54, + "grad_norm": 11.41526122202614, + "learning_rate": 5.655102599669068e-06, + "loss": 1.0139, + "step": 15245 + }, + { + "epoch": 1.54, + "grad_norm": 6.543128116619436, + "learning_rate": 5.652194772727878e-06, + "loss": 1.0553, + "step": 15250 + }, + { + "epoch": 1.54, + "grad_norm": 9.985843352021122, + "learning_rate": 5.649286721367089e-06, + "loss": 1.0342, + "step": 15255 + }, + { + "epoch": 1.54, + "grad_norm": 6.4618556132775264, + "learning_rate": 5.646378446587362e-06, + "loss": 1.0222, + "step": 15260 + }, + { + "epoch": 1.54, + "grad_norm": 6.018828072452745, + "learning_rate": 5.643469949389426e-06, + "loss": 1.0022, + "step": 15265 + }, + { + "epoch": 1.54, + "grad_norm": 21.408005050217124, + "learning_rate": 5.640561230774095e-06, + "loss": 1.0343, + "step": 15270 + }, + { + "epoch": 1.54, + "grad_norm": 27.216153751199354, + "learning_rate": 5.6376522917422584e-06, + "loss": 1.0465, + "step": 15275 + }, + { + "epoch": 1.54, + "grad_norm": 6.033256161413907, + "learning_rate": 5.634743133294877e-06, + "loss": 1.0771, + "step": 15280 + }, + { + "epoch": 1.54, + "grad_norm": 11.203846527256655, + "learning_rate": 5.631833756432988e-06, + "loss": 1.0364, + "step": 15285 + }, + { + "epoch": 1.54, + "grad_norm": 11.766700739594345, + "learning_rate": 5.628924162157707e-06, + "loss": 1.002, + "step": 15290 + }, + { + "epoch": 1.54, + "grad_norm": 13.70182695862929, + "learning_rate": 5.6260143514702245e-06, + "loss": 1.0249, + "step": 15295 + }, + { + "epoch": 1.54, + "grad_norm": 6.402225039197062, + "learning_rate": 5.6231043253718e-06, + "loss": 1.0014, + "step": 15300 + }, + { + "epoch": 1.54, + "grad_norm": 12.84456051133836, + "learning_rate": 5.620194084863771e-06, + "loss": 1.0315, + "step": 15305 + }, + { + "epoch": 1.54, + "grad_norm": 7.13298034214026, + "learning_rate": 5.617283630947553e-06, + "loss": 1.0206, + "step": 15310 + }, + { + "epoch": 1.54, + "grad_norm": 6.80736730732881, + "learning_rate": 5.614372964624625e-06, + "loss": 1.0183, + "step": 15315 + }, + { + "epoch": 1.54, + "grad_norm": 12.23622607920322, + "learning_rate": 5.6114620868965466e-06, + "loss": 1.0323, + "step": 15320 + }, + { + "epoch": 1.55, + "grad_norm": 11.502210869656103, + "learning_rate": 5.608550998764949e-06, + "loss": 1.0368, + "step": 15325 + }, + { + "epoch": 1.55, + "grad_norm": 5.627686206640388, + "learning_rate": 5.605639701231533e-06, + "loss": 0.9889, + "step": 15330 + }, + { + "epoch": 1.55, + "grad_norm": 4.981370174837831, + "learning_rate": 5.602728195298075e-06, + "loss": 0.9927, + "step": 15335 + }, + { + "epoch": 1.55, + "grad_norm": 5.225736903417539, + "learning_rate": 5.5998164819664215e-06, + "loss": 1.0384, + "step": 15340 + }, + { + "epoch": 1.55, + "grad_norm": 10.113727655453436, + "learning_rate": 5.596904562238488e-06, + "loss": 1.0588, + "step": 15345 + }, + { + "epoch": 1.55, + "grad_norm": 18.306783354210584, + "learning_rate": 5.593992437116264e-06, + "loss": 1.0228, + "step": 15350 + }, + { + "epoch": 1.55, + "grad_norm": 15.44502839539049, + "learning_rate": 5.5910801076018115e-06, + "loss": 0.9847, + "step": 15355 + }, + { + "epoch": 1.55, + "grad_norm": 6.849547074253466, + "learning_rate": 5.5881675746972585e-06, + "loss": 1.0247, + "step": 15360 + }, + { + "epoch": 1.55, + "grad_norm": 9.89612174609036, + "learning_rate": 5.585254839404804e-06, + "loss": 0.9802, + "step": 15365 + }, + { + "epoch": 1.55, + "grad_norm": 6.112813776458878, + "learning_rate": 5.582341902726719e-06, + "loss": 1.0123, + "step": 15370 + }, + { + "epoch": 1.55, + "grad_norm": 6.294517000571133, + "learning_rate": 5.57942876566534e-06, + "loss": 1.007, + "step": 15375 + }, + { + "epoch": 1.55, + "grad_norm": 5.6312243252068495, + "learning_rate": 5.576515429223077e-06, + "loss": 0.9867, + "step": 15380 + }, + { + "epoch": 1.55, + "grad_norm": 6.158982603092975, + "learning_rate": 5.5736018944024065e-06, + "loss": 0.9994, + "step": 15385 + }, + { + "epoch": 1.55, + "grad_norm": 8.584956074326355, + "learning_rate": 5.5706881622058704e-06, + "loss": 1.0102, + "step": 15390 + }, + { + "epoch": 1.55, + "grad_norm": 5.795599395721448, + "learning_rate": 5.567774233636083e-06, + "loss": 1.0619, + "step": 15395 + }, + { + "epoch": 1.55, + "grad_norm": 8.595874741508421, + "learning_rate": 5.564860109695726e-06, + "loss": 1.0089, + "step": 15400 + }, + { + "epoch": 1.55, + "grad_norm": 6.457989774283112, + "learning_rate": 5.561945791387543e-06, + "loss": 0.9862, + "step": 15405 + }, + { + "epoch": 1.55, + "grad_norm": 5.683225599592505, + "learning_rate": 5.55903127971435e-06, + "loss": 1.0738, + "step": 15410 + }, + { + "epoch": 1.55, + "grad_norm": 9.491692917430592, + "learning_rate": 5.556116575679028e-06, + "loss": 1.0031, + "step": 15415 + }, + { + "epoch": 1.55, + "grad_norm": 13.804822153310992, + "learning_rate": 5.553201680284523e-06, + "loss": 1.042, + "step": 15420 + }, + { + "epoch": 1.56, + "grad_norm": 5.054414839126639, + "learning_rate": 5.550286594533848e-06, + "loss": 1.0369, + "step": 15425 + }, + { + "epoch": 1.56, + "grad_norm": 10.795255075267063, + "learning_rate": 5.547371319430083e-06, + "loss": 1.0149, + "step": 15430 + }, + { + "epoch": 1.56, + "grad_norm": 8.29537836031942, + "learning_rate": 5.544455855976369e-06, + "loss": 1.0145, + "step": 15435 + }, + { + "epoch": 1.56, + "grad_norm": 8.764843303631773, + "learning_rate": 5.541540205175914e-06, + "loss": 0.9834, + "step": 15440 + }, + { + "epoch": 1.56, + "grad_norm": 24.348022899174037, + "learning_rate": 5.538624368031992e-06, + "loss": 1.059, + "step": 15445 + }, + { + "epoch": 1.56, + "grad_norm": 42.15851586163241, + "learning_rate": 5.535708345547939e-06, + "loss": 1.0674, + "step": 15450 + }, + { + "epoch": 1.56, + "grad_norm": 14.398321831145607, + "learning_rate": 5.5327921387271565e-06, + "loss": 1.0084, + "step": 15455 + }, + { + "epoch": 1.56, + "grad_norm": 12.41292785937368, + "learning_rate": 5.529875748573109e-06, + "loss": 0.9958, + "step": 15460 + }, + { + "epoch": 1.56, + "grad_norm": 31.31049057929927, + "learning_rate": 5.52695917608932e-06, + "loss": 1.0856, + "step": 15465 + }, + { + "epoch": 1.56, + "grad_norm": 17.824836231611595, + "learning_rate": 5.5240424222793836e-06, + "loss": 1.0009, + "step": 15470 + }, + { + "epoch": 1.56, + "grad_norm": 12.923284968966275, + "learning_rate": 5.521125488146951e-06, + "loss": 1.0319, + "step": 15475 + }, + { + "epoch": 1.56, + "grad_norm": 25.84663898577755, + "learning_rate": 5.5182083746957334e-06, + "loss": 1.0175, + "step": 15480 + }, + { + "epoch": 1.56, + "grad_norm": 8.9546403529747, + "learning_rate": 5.51529108292951e-06, + "loss": 1.0446, + "step": 15485 + }, + { + "epoch": 1.56, + "grad_norm": 5.7153867003786205, + "learning_rate": 5.512373613852117e-06, + "loss": 1.0295, + "step": 15490 + }, + { + "epoch": 1.56, + "grad_norm": 12.35043341931898, + "learning_rate": 5.5094559684674545e-06, + "loss": 1.0182, + "step": 15495 + }, + { + "epoch": 1.56, + "grad_norm": 7.7129822787587905, + "learning_rate": 5.506538147779478e-06, + "loss": 0.9995, + "step": 15500 + }, + { + "epoch": 1.56, + "grad_norm": 5.9358006177255, + "learning_rate": 5.503620152792208e-06, + "loss": 0.9772, + "step": 15505 + }, + { + "epoch": 1.56, + "grad_norm": 8.139260154917016, + "learning_rate": 5.500701984509727e-06, + "loss": 1.0554, + "step": 15510 + }, + { + "epoch": 1.56, + "grad_norm": 8.548520446037127, + "learning_rate": 5.497783643936169e-06, + "loss": 1.0202, + "step": 15515 + }, + { + "epoch": 1.56, + "grad_norm": 6.310244375627785, + "learning_rate": 5.494865132075735e-06, + "loss": 0.9979, + "step": 15520 + }, + { + "epoch": 1.57, + "grad_norm": 12.879052737594693, + "learning_rate": 5.491946449932683e-06, + "loss": 1.074, + "step": 15525 + }, + { + "epoch": 1.57, + "grad_norm": 25.970468165459245, + "learning_rate": 5.489027598511327e-06, + "loss": 1.0097, + "step": 15530 + }, + { + "epoch": 1.57, + "grad_norm": 16.811799979001137, + "learning_rate": 5.48610857881604e-06, + "loss": 0.9936, + "step": 15535 + }, + { + "epoch": 1.57, + "grad_norm": 5.571587571136714, + "learning_rate": 5.483189391851258e-06, + "loss": 0.9873, + "step": 15540 + }, + { + "epoch": 1.57, + "grad_norm": 9.477857504477589, + "learning_rate": 5.480270038621466e-06, + "loss": 0.9792, + "step": 15545 + }, + { + "epoch": 1.57, + "grad_norm": 6.107218873468646, + "learning_rate": 5.4773505201312125e-06, + "loss": 1.0711, + "step": 15550 + }, + { + "epoch": 1.57, + "grad_norm": 15.503662574518724, + "learning_rate": 5.474430837385102e-06, + "loss": 0.9318, + "step": 15555 + }, + { + "epoch": 1.57, + "grad_norm": 23.904668772962744, + "learning_rate": 5.471510991387792e-06, + "loss": 1.001, + "step": 15560 + }, + { + "epoch": 1.57, + "grad_norm": 9.82295750301058, + "learning_rate": 5.4685909831439995e-06, + "loss": 0.9886, + "step": 15565 + }, + { + "epoch": 1.57, + "grad_norm": 13.798088528044294, + "learning_rate": 5.4656708136584994e-06, + "loss": 1.0119, + "step": 15570 + }, + { + "epoch": 1.57, + "grad_norm": 18.4122532450321, + "learning_rate": 5.462750483936116e-06, + "loss": 1.0103, + "step": 15575 + }, + { + "epoch": 1.57, + "grad_norm": 8.189506784850868, + "learning_rate": 5.459829994981732e-06, + "loss": 1.0242, + "step": 15580 + }, + { + "epoch": 1.57, + "grad_norm": 6.985499988081523, + "learning_rate": 5.456909347800289e-06, + "loss": 1.0173, + "step": 15585 + }, + { + "epoch": 1.57, + "grad_norm": 12.435487548336896, + "learning_rate": 5.453988543396773e-06, + "loss": 1.0303, + "step": 15590 + }, + { + "epoch": 1.57, + "grad_norm": 11.9345012211472, + "learning_rate": 5.451067582776233e-06, + "loss": 1.0322, + "step": 15595 + }, + { + "epoch": 1.57, + "grad_norm": 6.203630037662816, + "learning_rate": 5.44814646694377e-06, + "loss": 1.0119, + "step": 15600 + }, + { + "epoch": 1.57, + "grad_norm": 5.700017667723494, + "learning_rate": 5.445225196904536e-06, + "loss": 1.0121, + "step": 15605 + }, + { + "epoch": 1.57, + "grad_norm": 5.727849581352104, + "learning_rate": 5.4423037736637354e-06, + "loss": 0.9896, + "step": 15610 + }, + { + "epoch": 1.57, + "grad_norm": 6.293525246706005, + "learning_rate": 5.4393821982266296e-06, + "loss": 0.988, + "step": 15615 + }, + { + "epoch": 1.57, + "grad_norm": 16.412605276421097, + "learning_rate": 5.436460471598528e-06, + "loss": 1.0571, + "step": 15620 + }, + { + "epoch": 1.58, + "grad_norm": 5.940978612544219, + "learning_rate": 5.433538594784796e-06, + "loss": 1.0098, + "step": 15625 + }, + { + "epoch": 1.58, + "grad_norm": 8.230723437166615, + "learning_rate": 5.4306165687908485e-06, + "loss": 1.0514, + "step": 15630 + }, + { + "epoch": 1.58, + "grad_norm": 7.221237771261272, + "learning_rate": 5.4276943946221494e-06, + "loss": 1.0028, + "step": 15635 + }, + { + "epoch": 1.58, + "grad_norm": 10.657311547680987, + "learning_rate": 5.424772073284218e-06, + "loss": 1.0432, + "step": 15640 + }, + { + "epoch": 1.58, + "grad_norm": 5.577141376755671, + "learning_rate": 5.421849605782622e-06, + "loss": 0.9769, + "step": 15645 + }, + { + "epoch": 1.58, + "grad_norm": 6.860599441000725, + "learning_rate": 5.418926993122979e-06, + "loss": 1.0185, + "step": 15650 + }, + { + "epoch": 1.58, + "grad_norm": 12.177441626567626, + "learning_rate": 5.4160042363109585e-06, + "loss": 0.9806, + "step": 15655 + }, + { + "epoch": 1.58, + "grad_norm": 9.185151466420002, + "learning_rate": 5.413081336352278e-06, + "loss": 1.0278, + "step": 15660 + }, + { + "epoch": 1.58, + "grad_norm": 10.775833291063558, + "learning_rate": 5.410158294252704e-06, + "loss": 1.0019, + "step": 15665 + }, + { + "epoch": 1.58, + "grad_norm": 9.249542375778276, + "learning_rate": 5.4072351110180525e-06, + "loss": 1.0343, + "step": 15670 + }, + { + "epoch": 1.58, + "grad_norm": 7.721745645435944, + "learning_rate": 5.404311787654189e-06, + "loss": 1.028, + "step": 15675 + }, + { + "epoch": 1.58, + "grad_norm": 5.587955552947999, + "learning_rate": 5.401388325167024e-06, + "loss": 1.0212, + "step": 15680 + }, + { + "epoch": 1.58, + "grad_norm": 5.269292112699187, + "learning_rate": 5.39846472456252e-06, + "loss": 1.0303, + "step": 15685 + }, + { + "epoch": 1.58, + "grad_norm": 6.921660261357361, + "learning_rate": 5.3955409868466845e-06, + "loss": 0.9814, + "step": 15690 + }, + { + "epoch": 1.58, + "grad_norm": 7.739963976484743, + "learning_rate": 5.392617113025576e-06, + "loss": 1.0304, + "step": 15695 + }, + { + "epoch": 1.58, + "grad_norm": 8.29286986619866, + "learning_rate": 5.38969310410529e-06, + "loss": 1.0466, + "step": 15700 + }, + { + "epoch": 1.58, + "grad_norm": 10.065776472665721, + "learning_rate": 5.386768961091981e-06, + "loss": 0.9981, + "step": 15705 + }, + { + "epoch": 1.58, + "grad_norm": 16.73112924832002, + "learning_rate": 5.3838446849918425e-06, + "loss": 1.0516, + "step": 15710 + }, + { + "epoch": 1.58, + "grad_norm": 10.114222624670669, + "learning_rate": 5.3809202768111135e-06, + "loss": 0.9975, + "step": 15715 + }, + { + "epoch": 1.58, + "grad_norm": 20.69821552756159, + "learning_rate": 5.377995737556081e-06, + "loss": 1.0127, + "step": 15720 + }, + { + "epoch": 1.59, + "grad_norm": 27.485717435936895, + "learning_rate": 5.375071068233077e-06, + "loss": 1.042, + "step": 15725 + }, + { + "epoch": 1.59, + "grad_norm": 13.7777365075493, + "learning_rate": 5.372146269848476e-06, + "loss": 1.0256, + "step": 15730 + }, + { + "epoch": 1.59, + "grad_norm": 9.602185252863674, + "learning_rate": 5.3692213434086995e-06, + "loss": 1.0205, + "step": 15735 + }, + { + "epoch": 1.59, + "grad_norm": 7.2234703348929346, + "learning_rate": 5.366296289920211e-06, + "loss": 0.9911, + "step": 15740 + }, + { + "epoch": 1.59, + "grad_norm": 8.4351426890756, + "learning_rate": 5.363371110389519e-06, + "loss": 1.0269, + "step": 15745 + }, + { + "epoch": 1.59, + "grad_norm": 6.786363333398277, + "learning_rate": 5.360445805823174e-06, + "loss": 0.9598, + "step": 15750 + }, + { + "epoch": 1.59, + "grad_norm": 4.97525024859164, + "learning_rate": 5.357520377227774e-06, + "loss": 0.9675, + "step": 15755 + }, + { + "epoch": 1.59, + "grad_norm": 9.086909538105326, + "learning_rate": 5.354594825609952e-06, + "loss": 0.9931, + "step": 15760 + }, + { + "epoch": 1.59, + "grad_norm": 10.159060953047028, + "learning_rate": 5.351669151976389e-06, + "loss": 1.0362, + "step": 15765 + }, + { + "epoch": 1.59, + "grad_norm": 7.916611390361543, + "learning_rate": 5.348743357333808e-06, + "loss": 1.0337, + "step": 15770 + }, + { + "epoch": 1.59, + "grad_norm": 8.805481262074903, + "learning_rate": 5.34581744268897e-06, + "loss": 1.0247, + "step": 15775 + }, + { + "epoch": 1.59, + "grad_norm": 26.194657589325068, + "learning_rate": 5.34289140904868e-06, + "loss": 0.9847, + "step": 15780 + }, + { + "epoch": 1.59, + "grad_norm": 14.415964782837731, + "learning_rate": 5.339965257419784e-06, + "loss": 1.0127, + "step": 15785 + }, + { + "epoch": 1.59, + "grad_norm": 9.071868588891041, + "learning_rate": 5.3370389888091675e-06, + "loss": 1.0205, + "step": 15790 + }, + { + "epoch": 1.59, + "grad_norm": 6.9732842913514395, + "learning_rate": 5.334112604223757e-06, + "loss": 1.0129, + "step": 15795 + }, + { + "epoch": 1.59, + "grad_norm": 17.997310476911395, + "learning_rate": 5.331186104670518e-06, + "loss": 0.992, + "step": 15800 + }, + { + "epoch": 1.59, + "grad_norm": 14.085613485435752, + "learning_rate": 5.328259491156458e-06, + "loss": 0.9577, + "step": 15805 + }, + { + "epoch": 1.59, + "grad_norm": 12.168853592166277, + "learning_rate": 5.325332764688619e-06, + "loss": 0.9916, + "step": 15810 + }, + { + "epoch": 1.59, + "grad_norm": 8.324216349508182, + "learning_rate": 5.322405926274087e-06, + "loss": 1.0157, + "step": 15815 + }, + { + "epoch": 1.59, + "grad_norm": 17.376033806430183, + "learning_rate": 5.319478976919984e-06, + "loss": 1.0294, + "step": 15820 + }, + { + "epoch": 1.6, + "grad_norm": 7.579910155353343, + "learning_rate": 5.31655191763347e-06, + "loss": 1.0061, + "step": 15825 + }, + { + "epoch": 1.6, + "grad_norm": 5.2035533184641825, + "learning_rate": 5.313624749421743e-06, + "loss": 1.0234, + "step": 15830 + }, + { + "epoch": 1.6, + "grad_norm": 5.575174310526426, + "learning_rate": 5.31069747329204e-06, + "loss": 0.971, + "step": 15835 + }, + { + "epoch": 1.6, + "grad_norm": 11.968812764711304, + "learning_rate": 5.307770090251633e-06, + "loss": 1.0269, + "step": 15840 + }, + { + "epoch": 1.6, + "grad_norm": 13.944910548216217, + "learning_rate": 5.3048426013078315e-06, + "loss": 1.0075, + "step": 15845 + }, + { + "epoch": 1.6, + "grad_norm": 9.062487695154594, + "learning_rate": 5.301915007467982e-06, + "loss": 1.0303, + "step": 15850 + }, + { + "epoch": 1.6, + "grad_norm": 8.147205587050276, + "learning_rate": 5.298987309739467e-06, + "loss": 0.9552, + "step": 15855 + }, + { + "epoch": 1.6, + "grad_norm": 10.092372722717185, + "learning_rate": 5.296059509129704e-06, + "loss": 1.0003, + "step": 15860 + }, + { + "epoch": 1.6, + "grad_norm": 10.38372461132063, + "learning_rate": 5.293131606646148e-06, + "loss": 0.9896, + "step": 15865 + }, + { + "epoch": 1.6, + "grad_norm": 13.089507838844908, + "learning_rate": 5.290203603296285e-06, + "loss": 1.0019, + "step": 15870 + }, + { + "epoch": 1.6, + "grad_norm": 9.69511411954582, + "learning_rate": 5.287275500087639e-06, + "loss": 1.024, + "step": 15875 + }, + { + "epoch": 1.6, + "grad_norm": 13.584071369622801, + "learning_rate": 5.284347298027769e-06, + "loss": 0.9924, + "step": 15880 + }, + { + "epoch": 1.6, + "grad_norm": 5.611077298524117, + "learning_rate": 5.281418998124264e-06, + "loss": 1.0062, + "step": 15885 + }, + { + "epoch": 1.6, + "grad_norm": 13.579750167447928, + "learning_rate": 5.278490601384752e-06, + "loss": 1.002, + "step": 15890 + }, + { + "epoch": 1.6, + "grad_norm": 13.858087425142072, + "learning_rate": 5.275562108816889e-06, + "loss": 1.003, + "step": 15895 + }, + { + "epoch": 1.6, + "grad_norm": 10.62984739944366, + "learning_rate": 5.2726335214283675e-06, + "loss": 1.0321, + "step": 15900 + }, + { + "epoch": 1.6, + "grad_norm": 15.914241867641904, + "learning_rate": 5.269704840226911e-06, + "loss": 0.9902, + "step": 15905 + }, + { + "epoch": 1.6, + "grad_norm": 11.49404515966052, + "learning_rate": 5.266776066220278e-06, + "loss": 1.0109, + "step": 15910 + }, + { + "epoch": 1.6, + "grad_norm": 10.567038240824985, + "learning_rate": 5.2638472004162545e-06, + "loss": 1.0079, + "step": 15915 + }, + { + "epoch": 1.61, + "grad_norm": 6.009631683944291, + "learning_rate": 5.260918243822662e-06, + "loss": 1.0159, + "step": 15920 + }, + { + "epoch": 1.61, + "grad_norm": 10.12079307982005, + "learning_rate": 5.257989197447352e-06, + "loss": 1.0071, + "step": 15925 + }, + { + "epoch": 1.61, + "grad_norm": 8.507796359581, + "learning_rate": 5.255060062298204e-06, + "loss": 0.9743, + "step": 15930 + }, + { + "epoch": 1.61, + "grad_norm": 7.731754541730131, + "learning_rate": 5.252130839383133e-06, + "loss": 1.0074, + "step": 15935 + }, + { + "epoch": 1.61, + "grad_norm": 6.589248267275763, + "learning_rate": 5.249201529710079e-06, + "loss": 1.0055, + "step": 15940 + }, + { + "epoch": 1.61, + "grad_norm": 5.784506805665072, + "learning_rate": 5.24627213428702e-06, + "loss": 0.9896, + "step": 15945 + }, + { + "epoch": 1.61, + "grad_norm": 9.764375372843931, + "learning_rate": 5.243342654121953e-06, + "loss": 1.007, + "step": 15950 + }, + { + "epoch": 1.61, + "grad_norm": 12.327918830755468, + "learning_rate": 5.24041309022291e-06, + "loss": 0.974, + "step": 15955 + }, + { + "epoch": 1.61, + "grad_norm": 11.952405391443223, + "learning_rate": 5.237483443597954e-06, + "loss": 0.9841, + "step": 15960 + }, + { + "epoch": 1.61, + "grad_norm": 17.890437029989172, + "learning_rate": 5.234553715255171e-06, + "loss": 0.9778, + "step": 15965 + }, + { + "epoch": 1.61, + "grad_norm": 18.01316538586824, + "learning_rate": 5.231623906202677e-06, + "loss": 1.0248, + "step": 15970 + }, + { + "epoch": 1.61, + "grad_norm": 15.853916748104757, + "learning_rate": 5.228694017448621e-06, + "loss": 1.0252, + "step": 15975 + }, + { + "epoch": 1.61, + "grad_norm": 6.036828572797781, + "learning_rate": 5.2257640500011704e-06, + "loss": 1.0223, + "step": 15980 + }, + { + "epoch": 1.61, + "grad_norm": 5.557133960109277, + "learning_rate": 5.222834004868527e-06, + "loss": 0.9986, + "step": 15985 + }, + { + "epoch": 1.61, + "grad_norm": 11.190170767718795, + "learning_rate": 5.219903883058916e-06, + "loss": 0.9722, + "step": 15990 + }, + { + "epoch": 1.61, + "grad_norm": 6.968707961937117, + "learning_rate": 5.216973685580586e-06, + "loss": 1.0007, + "step": 15995 + }, + { + "epoch": 1.61, + "grad_norm": 5.695239415229787, + "learning_rate": 5.214043413441819e-06, + "loss": 1.0044, + "step": 16000 + }, + { + "epoch": 1.61, + "grad_norm": 6.32502215343701, + "learning_rate": 5.21111306765092e-06, + "loss": 1.0325, + "step": 16005 + }, + { + "epoch": 1.61, + "grad_norm": 7.064153141750366, + "learning_rate": 5.208182649216213e-06, + "loss": 1.0135, + "step": 16010 + }, + { + "epoch": 1.61, + "grad_norm": 25.031161662044592, + "learning_rate": 5.205252159146057e-06, + "loss": 0.987, + "step": 16015 + }, + { + "epoch": 1.62, + "grad_norm": 26.71026392221403, + "learning_rate": 5.202321598448829e-06, + "loss": 1.0106, + "step": 16020 + }, + { + "epoch": 1.62, + "grad_norm": 6.861465699216564, + "learning_rate": 5.1993909681329325e-06, + "loss": 1.0096, + "step": 16025 + }, + { + "epoch": 1.62, + "grad_norm": 12.95376418351282, + "learning_rate": 5.196460269206794e-06, + "loss": 1.0328, + "step": 16030 + }, + { + "epoch": 1.62, + "grad_norm": 10.814262958608504, + "learning_rate": 5.193529502678865e-06, + "loss": 1.0316, + "step": 16035 + }, + { + "epoch": 1.62, + "grad_norm": 7.929773352393471, + "learning_rate": 5.190598669557618e-06, + "loss": 1.0064, + "step": 16040 + }, + { + "epoch": 1.62, + "grad_norm": 9.322807476578738, + "learning_rate": 5.187667770851552e-06, + "loss": 1.074, + "step": 16045 + }, + { + "epoch": 1.62, + "grad_norm": 19.974955659049545, + "learning_rate": 5.184736807569185e-06, + "loss": 1.0602, + "step": 16050 + }, + { + "epoch": 1.62, + "grad_norm": 6.79221091060615, + "learning_rate": 5.1818057807190584e-06, + "loss": 0.9562, + "step": 16055 + }, + { + "epoch": 1.62, + "grad_norm": 12.40831543670176, + "learning_rate": 5.178874691309736e-06, + "loss": 0.9686, + "step": 16060 + }, + { + "epoch": 1.62, + "grad_norm": 9.394481238822914, + "learning_rate": 5.175943540349804e-06, + "loss": 1.0069, + "step": 16065 + }, + { + "epoch": 1.62, + "grad_norm": 26.236056559541815, + "learning_rate": 5.173012328847867e-06, + "loss": 1.0099, + "step": 16070 + }, + { + "epoch": 1.62, + "grad_norm": 32.35843255079721, + "learning_rate": 5.170081057812553e-06, + "loss": 1.0193, + "step": 16075 + }, + { + "epoch": 1.62, + "grad_norm": 39.151728665016165, + "learning_rate": 5.167149728252511e-06, + "loss": 1.0183, + "step": 16080 + }, + { + "epoch": 1.62, + "grad_norm": 36.6253507644504, + "learning_rate": 5.164218341176405e-06, + "loss": 1.0064, + "step": 16085 + }, + { + "epoch": 1.62, + "grad_norm": 10.653327592026665, + "learning_rate": 5.161286897592925e-06, + "loss": 1.0032, + "step": 16090 + }, + { + "epoch": 1.62, + "grad_norm": 40.64514818536792, + "learning_rate": 5.1583553985107785e-06, + "loss": 1.0449, + "step": 16095 + }, + { + "epoch": 1.62, + "grad_norm": 7.140483573782942, + "learning_rate": 5.15542384493869e-06, + "loss": 0.9884, + "step": 16100 + }, + { + "epoch": 1.62, + "grad_norm": 10.083417716052871, + "learning_rate": 5.152492237885405e-06, + "loss": 0.9715, + "step": 16105 + }, + { + "epoch": 1.62, + "grad_norm": 8.234421458690278, + "learning_rate": 5.149560578359687e-06, + "loss": 1.0033, + "step": 16110 + }, + { + "epoch": 1.62, + "grad_norm": 36.12020745998665, + "learning_rate": 5.146628867370316e-06, + "loss": 0.9946, + "step": 16115 + }, + { + "epoch": 1.63, + "grad_norm": 43.702150113062764, + "learning_rate": 5.143697105926092e-06, + "loss": 1.0268, + "step": 16120 + }, + { + "epoch": 1.63, + "grad_norm": 26.741514849915372, + "learning_rate": 5.140765295035832e-06, + "loss": 0.9801, + "step": 16125 + }, + { + "epoch": 1.63, + "grad_norm": 19.76751637787017, + "learning_rate": 5.137833435708368e-06, + "loss": 1.0012, + "step": 16130 + }, + { + "epoch": 1.63, + "grad_norm": 7.159552963703967, + "learning_rate": 5.13490152895255e-06, + "loss": 0.9978, + "step": 16135 + }, + { + "epoch": 1.63, + "grad_norm": 30.926519715727775, + "learning_rate": 5.131969575777245e-06, + "loss": 0.9883, + "step": 16140 + }, + { + "epoch": 1.63, + "grad_norm": 25.473821960921036, + "learning_rate": 5.129037577191335e-06, + "loss": 1.0405, + "step": 16145 + }, + { + "epoch": 1.63, + "grad_norm": 28.72089383268889, + "learning_rate": 5.126105534203717e-06, + "loss": 1.029, + "step": 16150 + }, + { + "epoch": 1.63, + "grad_norm": 5.484114156842334, + "learning_rate": 5.123173447823308e-06, + "loss": 1.0108, + "step": 16155 + }, + { + "epoch": 1.63, + "grad_norm": 11.445040855239327, + "learning_rate": 5.120241319059031e-06, + "loss": 1.0394, + "step": 16160 + }, + { + "epoch": 1.63, + "grad_norm": 46.8445866363268, + "learning_rate": 5.11730914891983e-06, + "loss": 1.0269, + "step": 16165 + }, + { + "epoch": 1.63, + "grad_norm": 7.866166214418564, + "learning_rate": 5.114376938414665e-06, + "loss": 1.0182, + "step": 16170 + }, + { + "epoch": 1.63, + "grad_norm": 9.67096279704452, + "learning_rate": 5.1114446885525046e-06, + "loss": 1.0391, + "step": 16175 + }, + { + "epoch": 1.63, + "grad_norm": 8.017881971905954, + "learning_rate": 5.108512400342332e-06, + "loss": 1.0013, + "step": 16180 + }, + { + "epoch": 1.63, + "grad_norm": 7.040845594445138, + "learning_rate": 5.105580074793146e-06, + "loss": 0.9716, + "step": 16185 + }, + { + "epoch": 1.63, + "grad_norm": 5.396904757008385, + "learning_rate": 5.102647712913958e-06, + "loss": 1.0135, + "step": 16190 + }, + { + "epoch": 1.63, + "grad_norm": 6.974532267997327, + "learning_rate": 5.09971531571379e-06, + "loss": 0.9845, + "step": 16195 + }, + { + "epoch": 1.63, + "grad_norm": 7.373761571462232, + "learning_rate": 5.096782884201676e-06, + "loss": 0.9918, + "step": 16200 + }, + { + "epoch": 1.63, + "grad_norm": 8.083849869450802, + "learning_rate": 5.093850419386667e-06, + "loss": 1.0365, + "step": 16205 + }, + { + "epoch": 1.63, + "grad_norm": 8.544053197681697, + "learning_rate": 5.090917922277815e-06, + "loss": 1.0435, + "step": 16210 + }, + { + "epoch": 1.63, + "grad_norm": 12.317444178392435, + "learning_rate": 5.087985393884194e-06, + "loss": 0.9998, + "step": 16215 + }, + { + "epoch": 1.64, + "grad_norm": 6.149487955374466, + "learning_rate": 5.0850528352148846e-06, + "loss": 0.9818, + "step": 16220 + }, + { + "epoch": 1.64, + "grad_norm": 8.896241226392624, + "learning_rate": 5.082120247278973e-06, + "loss": 1.0041, + "step": 16225 + }, + { + "epoch": 1.64, + "grad_norm": 12.43839173359589, + "learning_rate": 5.079187631085564e-06, + "loss": 1.0192, + "step": 16230 + }, + { + "epoch": 1.64, + "grad_norm": 5.759400539341507, + "learning_rate": 5.076254987643767e-06, + "loss": 0.9977, + "step": 16235 + }, + { + "epoch": 1.64, + "grad_norm": 8.259653918952248, + "learning_rate": 5.073322317962701e-06, + "loss": 1.0643, + "step": 16240 + }, + { + "epoch": 1.64, + "grad_norm": 6.969349385953867, + "learning_rate": 5.070389623051496e-06, + "loss": 0.9602, + "step": 16245 + }, + { + "epoch": 1.64, + "grad_norm": 13.913741738579871, + "learning_rate": 5.067456903919289e-06, + "loss": 0.9951, + "step": 16250 + }, + { + "epoch": 1.64, + "grad_norm": 9.38234563013056, + "learning_rate": 5.0645241615752246e-06, + "loss": 0.973, + "step": 16255 + }, + { + "epoch": 1.64, + "grad_norm": 7.500639363718501, + "learning_rate": 5.0615913970284594e-06, + "loss": 0.9942, + "step": 16260 + }, + { + "epoch": 1.64, + "grad_norm": 12.000246455537164, + "learning_rate": 5.058658611288153e-06, + "loss": 0.9913, + "step": 16265 + }, + { + "epoch": 1.64, + "grad_norm": 25.602213423387873, + "learning_rate": 5.055725805363474e-06, + "loss": 0.9963, + "step": 16270 + }, + { + "epoch": 1.64, + "grad_norm": 14.792623271639881, + "learning_rate": 5.052792980263598e-06, + "loss": 1.0323, + "step": 16275 + }, + { + "epoch": 1.64, + "grad_norm": 7.58658288712669, + "learning_rate": 5.04986013699771e-06, + "loss": 1.043, + "step": 16280 + }, + { + "epoch": 1.64, + "grad_norm": 21.70761618039746, + "learning_rate": 5.046927276574994e-06, + "loss": 1.0099, + "step": 16285 + }, + { + "epoch": 1.64, + "grad_norm": 19.85364796172033, + "learning_rate": 5.043994400004648e-06, + "loss": 1.0249, + "step": 16290 + }, + { + "epoch": 1.64, + "grad_norm": 10.427242979146856, + "learning_rate": 5.041061508295872e-06, + "loss": 1.0583, + "step": 16295 + }, + { + "epoch": 1.64, + "grad_norm": 6.200635499849241, + "learning_rate": 5.038128602457868e-06, + "loss": 1.0306, + "step": 16300 + }, + { + "epoch": 1.64, + "grad_norm": 9.09617656159858, + "learning_rate": 5.0351956834998504e-06, + "loss": 0.9774, + "step": 16305 + }, + { + "epoch": 1.64, + "grad_norm": 7.715743797183803, + "learning_rate": 5.032262752431031e-06, + "loss": 1.0376, + "step": 16310 + }, + { + "epoch": 1.64, + "grad_norm": 6.819222953107973, + "learning_rate": 5.029329810260629e-06, + "loss": 1.0369, + "step": 16315 + }, + { + "epoch": 1.65, + "grad_norm": 6.80103133398059, + "learning_rate": 5.026396857997867e-06, + "loss": 0.9847, + "step": 16320 + }, + { + "epoch": 1.65, + "grad_norm": 5.888074167871263, + "learning_rate": 5.023463896651972e-06, + "loss": 0.9626, + "step": 16325 + }, + { + "epoch": 1.65, + "grad_norm": 5.705718315442812, + "learning_rate": 5.020530927232173e-06, + "loss": 0.9942, + "step": 16330 + }, + { + "epoch": 1.65, + "grad_norm": 5.41260032660405, + "learning_rate": 5.017597950747701e-06, + "loss": 0.9902, + "step": 16335 + }, + { + "epoch": 1.65, + "grad_norm": 8.255042843953591, + "learning_rate": 5.014664968207791e-06, + "loss": 1.073, + "step": 16340 + }, + { + "epoch": 1.65, + "grad_norm": 7.170200899510636, + "learning_rate": 5.01173198062168e-06, + "loss": 1.011, + "step": 16345 + }, + { + "epoch": 1.65, + "grad_norm": 6.790745864427179, + "learning_rate": 5.008798988998605e-06, + "loss": 1.0245, + "step": 16350 + }, + { + "epoch": 1.65, + "grad_norm": 7.923644495165293, + "learning_rate": 5.005865994347805e-06, + "loss": 0.9935, + "step": 16355 + }, + { + "epoch": 1.65, + "grad_norm": 11.578460220226166, + "learning_rate": 5.002932997678524e-06, + "loss": 1.0125, + "step": 16360 + }, + { + "epoch": 1.65, + "grad_norm": 6.753338369687813, + "learning_rate": 5e-06, + "loss": 0.956, + "step": 16365 + }, + { + "epoch": 1.65, + "grad_norm": 8.70576280011481, + "learning_rate": 4.997067002321478e-06, + "loss": 0.9988, + "step": 16370 + }, + { + "epoch": 1.65, + "grad_norm": 6.817150720143112, + "learning_rate": 4.994134005652196e-06, + "loss": 0.9805, + "step": 16375 + }, + { + "epoch": 1.65, + "grad_norm": 8.45500405763143, + "learning_rate": 4.991201011001398e-06, + "loss": 0.9904, + "step": 16380 + }, + { + "epoch": 1.65, + "grad_norm": 6.819425098234933, + "learning_rate": 4.988268019378322e-06, + "loss": 1.0371, + "step": 16385 + }, + { + "epoch": 1.65, + "grad_norm": 5.1871866152562625, + "learning_rate": 4.9853350317922105e-06, + "loss": 1.0262, + "step": 16390 + }, + { + "epoch": 1.65, + "grad_norm": 27.438805065280388, + "learning_rate": 4.9824020492523e-06, + "loss": 1.0582, + "step": 16395 + }, + { + "epoch": 1.65, + "grad_norm": 9.955853598441074, + "learning_rate": 4.979469072767829e-06, + "loss": 0.9541, + "step": 16400 + }, + { + "epoch": 1.65, + "grad_norm": 8.369021538950003, + "learning_rate": 4.976536103348029e-06, + "loss": 1.0106, + "step": 16405 + }, + { + "epoch": 1.65, + "grad_norm": 17.15791993164313, + "learning_rate": 4.973603142002134e-06, + "loss": 0.9559, + "step": 16410 + }, + { + "epoch": 1.65, + "grad_norm": 11.55419850396348, + "learning_rate": 4.9706701897393715e-06, + "loss": 1.0312, + "step": 16415 + }, + { + "epoch": 1.66, + "grad_norm": 5.541969961872092, + "learning_rate": 4.967737247568971e-06, + "loss": 1.048, + "step": 16420 + }, + { + "epoch": 1.66, + "grad_norm": 14.509974176445898, + "learning_rate": 4.964804316500151e-06, + "loss": 0.9561, + "step": 16425 + }, + { + "epoch": 1.66, + "grad_norm": 6.444622668215799, + "learning_rate": 4.961871397542133e-06, + "loss": 1.0031, + "step": 16430 + }, + { + "epoch": 1.66, + "grad_norm": 20.414380824876183, + "learning_rate": 4.95893849170413e-06, + "loss": 0.9993, + "step": 16435 + }, + { + "epoch": 1.66, + "grad_norm": 12.008312874134626, + "learning_rate": 4.956005599995354e-06, + "loss": 1.0188, + "step": 16440 + }, + { + "epoch": 1.66, + "grad_norm": 11.690679034928507, + "learning_rate": 4.953072723425007e-06, + "loss": 1.0049, + "step": 16445 + }, + { + "epoch": 1.66, + "grad_norm": 7.541363194151192, + "learning_rate": 4.950139863002293e-06, + "loss": 1.0243, + "step": 16450 + }, + { + "epoch": 1.66, + "grad_norm": 8.849347533184488, + "learning_rate": 4.947207019736403e-06, + "loss": 0.9921, + "step": 16455 + }, + { + "epoch": 1.66, + "grad_norm": 15.513672027070383, + "learning_rate": 4.944274194636528e-06, + "loss": 1.0574, + "step": 16460 + }, + { + "epoch": 1.66, + "grad_norm": 5.615937012219703, + "learning_rate": 4.941341388711849e-06, + "loss": 1.0081, + "step": 16465 + }, + { + "epoch": 1.66, + "grad_norm": 9.633247658658282, + "learning_rate": 4.938408602971543e-06, + "loss": 0.9871, + "step": 16470 + }, + { + "epoch": 1.66, + "grad_norm": 5.3010119117651735, + "learning_rate": 4.935475838424775e-06, + "loss": 0.9994, + "step": 16475 + }, + { + "epoch": 1.66, + "grad_norm": 7.134598686246772, + "learning_rate": 4.932543096080712e-06, + "loss": 1.0322, + "step": 16480 + }, + { + "epoch": 1.66, + "grad_norm": 10.047466822582225, + "learning_rate": 4.929610376948505e-06, + "loss": 0.9992, + "step": 16485 + }, + { + "epoch": 1.66, + "grad_norm": 5.05062321545772, + "learning_rate": 4.9266776820373e-06, + "loss": 1.0001, + "step": 16490 + }, + { + "epoch": 1.66, + "grad_norm": 6.863782270164636, + "learning_rate": 4.923745012356235e-06, + "loss": 1.0463, + "step": 16495 + }, + { + "epoch": 1.66, + "grad_norm": 9.068238812118306, + "learning_rate": 4.920812368914438e-06, + "loss": 1.0067, + "step": 16500 + }, + { + "epoch": 1.66, + "grad_norm": 16.288893208069187, + "learning_rate": 4.917879752721029e-06, + "loss": 1.0203, + "step": 16505 + }, + { + "epoch": 1.66, + "grad_norm": 26.670296805020982, + "learning_rate": 4.914947164785118e-06, + "loss": 0.9884, + "step": 16510 + }, + { + "epoch": 1.67, + "grad_norm": 6.97630357097606, + "learning_rate": 4.9120146061158084e-06, + "loss": 1.0277, + "step": 16515 + }, + { + "epoch": 1.67, + "grad_norm": 5.184246433267411, + "learning_rate": 4.909082077722186e-06, + "loss": 0.9844, + "step": 16520 + }, + { + "epoch": 1.67, + "grad_norm": 6.626723164196851, + "learning_rate": 4.906149580613336e-06, + "loss": 1.0073, + "step": 16525 + }, + { + "epoch": 1.67, + "grad_norm": 4.8935410810085385, + "learning_rate": 4.903217115798325e-06, + "loss": 1.0257, + "step": 16530 + }, + { + "epoch": 1.67, + "grad_norm": 6.181904513877318, + "learning_rate": 4.900284684286213e-06, + "loss": 1.0207, + "step": 16535 + }, + { + "epoch": 1.67, + "grad_norm": 8.826227520357035, + "learning_rate": 4.897352287086043e-06, + "loss": 1.006, + "step": 16540 + }, + { + "epoch": 1.67, + "grad_norm": 13.01286550419651, + "learning_rate": 4.894419925206856e-06, + "loss": 0.9993, + "step": 16545 + }, + { + "epoch": 1.67, + "grad_norm": 7.8796431874940085, + "learning_rate": 4.8914875996576695e-06, + "loss": 1.0159, + "step": 16550 + }, + { + "epoch": 1.67, + "grad_norm": 10.036244702718315, + "learning_rate": 4.888555311447498e-06, + "loss": 1.0214, + "step": 16555 + }, + { + "epoch": 1.67, + "grad_norm": 5.9240077190368465, + "learning_rate": 4.885623061585337e-06, + "loss": 1.0047, + "step": 16560 + }, + { + "epoch": 1.67, + "grad_norm": 5.430152786339917, + "learning_rate": 4.8826908510801715e-06, + "loss": 0.9777, + "step": 16565 + }, + { + "epoch": 1.67, + "grad_norm": 5.585109078393798, + "learning_rate": 4.87975868094097e-06, + "loss": 1.009, + "step": 16570 + }, + { + "epoch": 1.67, + "grad_norm": 9.150641168623665, + "learning_rate": 4.876826552176695e-06, + "loss": 1.0202, + "step": 16575 + }, + { + "epoch": 1.67, + "grad_norm": 5.935491981122044, + "learning_rate": 4.873894465796283e-06, + "loss": 0.9867, + "step": 16580 + }, + { + "epoch": 1.67, + "grad_norm": 9.611867042469141, + "learning_rate": 4.8709624228086665e-06, + "loss": 1.0331, + "step": 16585 + }, + { + "epoch": 1.67, + "grad_norm": 5.611371344783553, + "learning_rate": 4.868030424222756e-06, + "loss": 1.0573, + "step": 16590 + }, + { + "epoch": 1.67, + "grad_norm": 32.63255101680103, + "learning_rate": 4.865098471047452e-06, + "loss": 0.9927, + "step": 16595 + }, + { + "epoch": 1.67, + "grad_norm": 19.81342606970215, + "learning_rate": 4.862166564291633e-06, + "loss": 1.0103, + "step": 16600 + }, + { + "epoch": 1.67, + "grad_norm": 10.499850254147836, + "learning_rate": 4.859234704964169e-06, + "loss": 1.0191, + "step": 16605 + }, + { + "epoch": 1.67, + "grad_norm": 10.785974686070636, + "learning_rate": 4.856302894073908e-06, + "loss": 1.0459, + "step": 16610 + }, + { + "epoch": 1.68, + "grad_norm": 6.036809757407398, + "learning_rate": 4.853371132629685e-06, + "loss": 1.019, + "step": 16615 + }, + { + "epoch": 1.68, + "grad_norm": 7.013408316700486, + "learning_rate": 4.8504394216403145e-06, + "loss": 0.9935, + "step": 16620 + }, + { + "epoch": 1.68, + "grad_norm": 5.371096164511118, + "learning_rate": 4.8475077621145965e-06, + "loss": 1.0574, + "step": 16625 + }, + { + "epoch": 1.68, + "grad_norm": 5.6340660387069, + "learning_rate": 4.844576155061313e-06, + "loss": 1.0259, + "step": 16630 + }, + { + "epoch": 1.68, + "grad_norm": 5.442063040672251, + "learning_rate": 4.841644601489222e-06, + "loss": 0.9876, + "step": 16635 + }, + { + "epoch": 1.68, + "grad_norm": 5.244679072108467, + "learning_rate": 4.8387131024070775e-06, + "loss": 1.0156, + "step": 16640 + }, + { + "epoch": 1.68, + "grad_norm": 9.89473569238001, + "learning_rate": 4.835781658823597e-06, + "loss": 1.008, + "step": 16645 + }, + { + "epoch": 1.68, + "grad_norm": 11.662598637323155, + "learning_rate": 4.832850271747493e-06, + "loss": 1.043, + "step": 16650 + }, + { + "epoch": 1.68, + "grad_norm": 5.7050325167017855, + "learning_rate": 4.829918942187449e-06, + "loss": 0.9853, + "step": 16655 + }, + { + "epoch": 1.68, + "grad_norm": 8.552096546462334, + "learning_rate": 4.826987671152136e-06, + "loss": 0.9813, + "step": 16660 + }, + { + "epoch": 1.68, + "grad_norm": 9.231427286814666, + "learning_rate": 4.8240564596501976e-06, + "loss": 1.0655, + "step": 16665 + }, + { + "epoch": 1.68, + "grad_norm": 10.973960826013965, + "learning_rate": 4.821125308690267e-06, + "loss": 1.0352, + "step": 16670 + }, + { + "epoch": 1.68, + "grad_norm": 5.115569281328436, + "learning_rate": 4.818194219280943e-06, + "loss": 0.991, + "step": 16675 + }, + { + "epoch": 1.68, + "grad_norm": 18.37398551239263, + "learning_rate": 4.815263192430818e-06, + "loss": 1.022, + "step": 16680 + }, + { + "epoch": 1.68, + "grad_norm": 9.116026171727619, + "learning_rate": 4.81233222914845e-06, + "loss": 1.0154, + "step": 16685 + }, + { + "epoch": 1.68, + "grad_norm": 9.88286013001611, + "learning_rate": 4.809401330442384e-06, + "loss": 0.985, + "step": 16690 + }, + { + "epoch": 1.68, + "grad_norm": 10.794187658035513, + "learning_rate": 4.806470497321135e-06, + "loss": 0.9761, + "step": 16695 + }, + { + "epoch": 1.68, + "grad_norm": 15.911140347032589, + "learning_rate": 4.803539730793207e-06, + "loss": 1.0205, + "step": 16700 + }, + { + "epoch": 1.68, + "grad_norm": 25.425147897472836, + "learning_rate": 4.8006090318670675e-06, + "loss": 1.0148, + "step": 16705 + }, + { + "epoch": 1.68, + "grad_norm": 8.894959658277786, + "learning_rate": 4.797678401551172e-06, + "loss": 1.046, + "step": 16710 + }, + { + "epoch": 1.69, + "grad_norm": 6.720123354590075, + "learning_rate": 4.794747840853943e-06, + "loss": 0.9894, + "step": 16715 + }, + { + "epoch": 1.69, + "grad_norm": 9.45932942523887, + "learning_rate": 4.791817350783788e-06, + "loss": 1.0095, + "step": 16720 + }, + { + "epoch": 1.69, + "grad_norm": 7.482267961424742, + "learning_rate": 4.7888869323490805e-06, + "loss": 1.0267, + "step": 16725 + }, + { + "epoch": 1.69, + "grad_norm": 7.750108166742298, + "learning_rate": 4.785956586558182e-06, + "loss": 1.01, + "step": 16730 + }, + { + "epoch": 1.69, + "grad_norm": 7.196071199273943, + "learning_rate": 4.783026314419414e-06, + "loss": 1.0019, + "step": 16735 + }, + { + "epoch": 1.69, + "grad_norm": 7.908790043567544, + "learning_rate": 4.780096116941087e-06, + "loss": 1.0099, + "step": 16740 + }, + { + "epoch": 1.69, + "grad_norm": 6.650041814165618, + "learning_rate": 4.777165995131473e-06, + "loss": 1.0018, + "step": 16745 + }, + { + "epoch": 1.69, + "grad_norm": 17.682189805974183, + "learning_rate": 4.77423594999883e-06, + "loss": 1.0463, + "step": 16750 + }, + { + "epoch": 1.69, + "grad_norm": 5.3288850258912115, + "learning_rate": 4.771305982551381e-06, + "loss": 0.9903, + "step": 16755 + }, + { + "epoch": 1.69, + "grad_norm": 26.904240163387783, + "learning_rate": 4.7683760937973235e-06, + "loss": 1.0455, + "step": 16760 + }, + { + "epoch": 1.69, + "grad_norm": 40.74926258668739, + "learning_rate": 4.7654462847448316e-06, + "loss": 1.0471, + "step": 16765 + }, + { + "epoch": 1.69, + "grad_norm": 8.09515246288266, + "learning_rate": 4.762516556402048e-06, + "loss": 0.9861, + "step": 16770 + }, + { + "epoch": 1.69, + "grad_norm": 23.91326969614342, + "learning_rate": 4.759586909777092e-06, + "loss": 0.9822, + "step": 16775 + }, + { + "epoch": 1.69, + "grad_norm": 5.47668710150236, + "learning_rate": 4.75665734587805e-06, + "loss": 1.0411, + "step": 16780 + }, + { + "epoch": 1.69, + "grad_norm": 25.25118285825452, + "learning_rate": 4.753727865712983e-06, + "loss": 0.9719, + "step": 16785 + }, + { + "epoch": 1.69, + "grad_norm": 8.448283026301109, + "learning_rate": 4.750798470289922e-06, + "loss": 1.0017, + "step": 16790 + }, + { + "epoch": 1.69, + "grad_norm": 14.430106955844881, + "learning_rate": 4.74786916061687e-06, + "loss": 0.973, + "step": 16795 + }, + { + "epoch": 1.69, + "grad_norm": 5.983024090698553, + "learning_rate": 4.744939937701797e-06, + "loss": 1.0874, + "step": 16800 + }, + { + "epoch": 1.69, + "grad_norm": 48.062147831281614, + "learning_rate": 4.74201080255265e-06, + "loss": 0.9621, + "step": 16805 + }, + { + "epoch": 1.69, + "grad_norm": 8.270595501676446, + "learning_rate": 4.739081756177339e-06, + "loss": 0.9702, + "step": 16810 + }, + { + "epoch": 1.7, + "grad_norm": 13.539937937696077, + "learning_rate": 4.736152799583746e-06, + "loss": 0.9727, + "step": 16815 + }, + { + "epoch": 1.7, + "grad_norm": 10.640223121696199, + "learning_rate": 4.733223933779723e-06, + "loss": 1.0139, + "step": 16820 + }, + { + "epoch": 1.7, + "grad_norm": 10.791427548679868, + "learning_rate": 4.73029515977309e-06, + "loss": 1.0287, + "step": 16825 + }, + { + "epoch": 1.7, + "grad_norm": 20.742115000163974, + "learning_rate": 4.727366478571633e-06, + "loss": 1.0327, + "step": 16830 + }, + { + "epoch": 1.7, + "grad_norm": 6.944895323419959, + "learning_rate": 4.724437891183112e-06, + "loss": 0.9897, + "step": 16835 + }, + { + "epoch": 1.7, + "grad_norm": 13.526581874208533, + "learning_rate": 4.721509398615249e-06, + "loss": 0.9737, + "step": 16840 + }, + { + "epoch": 1.7, + "grad_norm": 9.652363648702273, + "learning_rate": 4.718581001875737e-06, + "loss": 1.0013, + "step": 16845 + }, + { + "epoch": 1.7, + "grad_norm": 5.278136771376039, + "learning_rate": 4.715652701972233e-06, + "loss": 0.9977, + "step": 16850 + }, + { + "epoch": 1.7, + "grad_norm": 5.61950884273999, + "learning_rate": 4.712724499912362e-06, + "loss": 1.0335, + "step": 16855 + }, + { + "epoch": 1.7, + "grad_norm": 7.070495461354499, + "learning_rate": 4.709796396703715e-06, + "loss": 0.942, + "step": 16860 + }, + { + "epoch": 1.7, + "grad_norm": 13.222729282641115, + "learning_rate": 4.706868393353854e-06, + "loss": 0.9613, + "step": 16865 + }, + { + "epoch": 1.7, + "grad_norm": 5.940206553289965, + "learning_rate": 4.703940490870296e-06, + "loss": 1.0139, + "step": 16870 + }, + { + "epoch": 1.7, + "grad_norm": 6.654456357112275, + "learning_rate": 4.701012690260534e-06, + "loss": 1.0285, + "step": 16875 + }, + { + "epoch": 1.7, + "grad_norm": 7.150880433070066, + "learning_rate": 4.69808499253202e-06, + "loss": 0.9728, + "step": 16880 + }, + { + "epoch": 1.7, + "grad_norm": 6.347270980874507, + "learning_rate": 4.695157398692171e-06, + "loss": 1.0049, + "step": 16885 + }, + { + "epoch": 1.7, + "grad_norm": 8.608674093571464, + "learning_rate": 4.69222990974837e-06, + "loss": 0.9961, + "step": 16890 + }, + { + "epoch": 1.7, + "grad_norm": 10.716642295325675, + "learning_rate": 4.689302526707961e-06, + "loss": 0.9758, + "step": 16895 + }, + { + "epoch": 1.7, + "grad_norm": 22.581713134228753, + "learning_rate": 4.686375250578259e-06, + "loss": 0.9866, + "step": 16900 + }, + { + "epoch": 1.7, + "grad_norm": 16.87550070685061, + "learning_rate": 4.683448082366532e-06, + "loss": 1.0443, + "step": 16905 + }, + { + "epoch": 1.7, + "grad_norm": 5.306310736277971, + "learning_rate": 4.680521023080018e-06, + "loss": 1.0307, + "step": 16910 + }, + { + "epoch": 1.71, + "grad_norm": 14.584978128407352, + "learning_rate": 4.677594073725915e-06, + "loss": 1.0003, + "step": 16915 + }, + { + "epoch": 1.71, + "grad_norm": 16.88851693358926, + "learning_rate": 4.674667235311384e-06, + "loss": 1.0235, + "step": 16920 + }, + { + "epoch": 1.71, + "grad_norm": 30.054488713226664, + "learning_rate": 4.6717405088435445e-06, + "loss": 0.9955, + "step": 16925 + }, + { + "epoch": 1.71, + "grad_norm": 13.621390724593681, + "learning_rate": 4.668813895329483e-06, + "loss": 1.0425, + "step": 16930 + }, + { + "epoch": 1.71, + "grad_norm": 11.266159283078437, + "learning_rate": 4.6658873957762445e-06, + "loss": 1.0256, + "step": 16935 + }, + { + "epoch": 1.71, + "grad_norm": 5.748902310168273, + "learning_rate": 4.662961011190835e-06, + "loss": 0.9987, + "step": 16940 + }, + { + "epoch": 1.71, + "grad_norm": 5.719847757693675, + "learning_rate": 4.660034742580218e-06, + "loss": 0.9898, + "step": 16945 + }, + { + "epoch": 1.71, + "grad_norm": 7.953366372779367, + "learning_rate": 4.657108590951323e-06, + "loss": 0.9953, + "step": 16950 + }, + { + "epoch": 1.71, + "grad_norm": 13.829895806922655, + "learning_rate": 4.654182557311031e-06, + "loss": 1.0197, + "step": 16955 + }, + { + "epoch": 1.71, + "grad_norm": 9.987562304626971, + "learning_rate": 4.651256642666194e-06, + "loss": 1.0042, + "step": 16960 + }, + { + "epoch": 1.71, + "grad_norm": 22.439513059825465, + "learning_rate": 4.648330848023611e-06, + "loss": 1.0018, + "step": 16965 + }, + { + "epoch": 1.71, + "grad_norm": 6.647793940331765, + "learning_rate": 4.645405174390049e-06, + "loss": 1.0185, + "step": 16970 + }, + { + "epoch": 1.71, + "grad_norm": 9.646874515262727, + "learning_rate": 4.642479622772227e-06, + "loss": 0.9786, + "step": 16975 + }, + { + "epoch": 1.71, + "grad_norm": 10.023487564809672, + "learning_rate": 4.639554194176827e-06, + "loss": 1.0247, + "step": 16980 + }, + { + "epoch": 1.71, + "grad_norm": 5.715892801327604, + "learning_rate": 4.636628889610481e-06, + "loss": 1.0001, + "step": 16985 + }, + { + "epoch": 1.71, + "grad_norm": 18.11156588494887, + "learning_rate": 4.63370371007979e-06, + "loss": 1.0271, + "step": 16990 + }, + { + "epoch": 1.71, + "grad_norm": 22.52875419448177, + "learning_rate": 4.630778656591301e-06, + "loss": 0.9778, + "step": 16995 + }, + { + "epoch": 1.71, + "grad_norm": 10.726171691868798, + "learning_rate": 4.6278537301515256e-06, + "loss": 0.967, + "step": 17000 + }, + { + "epoch": 1.71, + "grad_norm": 23.2971827059768, + "learning_rate": 4.624928931766924e-06, + "loss": 1.0191, + "step": 17005 + }, + { + "epoch": 1.71, + "grad_norm": 9.843956527421028, + "learning_rate": 4.62200426244392e-06, + "loss": 1.0643, + "step": 17010 + }, + { + "epoch": 1.72, + "grad_norm": 28.33813073646957, + "learning_rate": 4.619079723188889e-06, + "loss": 1.001, + "step": 17015 + }, + { + "epoch": 1.72, + "grad_norm": 26.971320195799127, + "learning_rate": 4.616155315008159e-06, + "loss": 1.0223, + "step": 17020 + }, + { + "epoch": 1.72, + "grad_norm": 7.941925646816498, + "learning_rate": 4.6132310389080205e-06, + "loss": 1.0409, + "step": 17025 + }, + { + "epoch": 1.72, + "grad_norm": 9.057853691194898, + "learning_rate": 4.610306895894711e-06, + "loss": 1.0512, + "step": 17030 + }, + { + "epoch": 1.72, + "grad_norm": 14.217344829324652, + "learning_rate": 4.607382886974428e-06, + "loss": 0.9383, + "step": 17035 + }, + { + "epoch": 1.72, + "grad_norm": 7.626275290185591, + "learning_rate": 4.604459013153316e-06, + "loss": 0.9898, + "step": 17040 + }, + { + "epoch": 1.72, + "grad_norm": 6.647514563751912, + "learning_rate": 4.601535275437482e-06, + "loss": 0.987, + "step": 17045 + }, + { + "epoch": 1.72, + "grad_norm": 8.78296404389408, + "learning_rate": 4.598611674832977e-06, + "loss": 1.033, + "step": 17050 + }, + { + "epoch": 1.72, + "grad_norm": 5.24405677217857, + "learning_rate": 4.595688212345814e-06, + "loss": 1.022, + "step": 17055 + }, + { + "epoch": 1.72, + "grad_norm": 15.862725670052217, + "learning_rate": 4.592764888981948e-06, + "loss": 1.0246, + "step": 17060 + }, + { + "epoch": 1.72, + "grad_norm": 9.559220803954123, + "learning_rate": 4.589841705747298e-06, + "loss": 0.9701, + "step": 17065 + }, + { + "epoch": 1.72, + "grad_norm": 7.9863393651332055, + "learning_rate": 4.586918663647723e-06, + "loss": 1.0066, + "step": 17070 + }, + { + "epoch": 1.72, + "grad_norm": 8.517380585020579, + "learning_rate": 4.583995763689043e-06, + "loss": 0.994, + "step": 17075 + }, + { + "epoch": 1.72, + "grad_norm": 17.356856092450435, + "learning_rate": 4.581073006877021e-06, + "loss": 0.9981, + "step": 17080 + }, + { + "epoch": 1.72, + "grad_norm": 7.891020181152935, + "learning_rate": 4.578150394217379e-06, + "loss": 0.9576, + "step": 17085 + }, + { + "epoch": 1.72, + "grad_norm": 6.722072676995166, + "learning_rate": 4.575227926715783e-06, + "loss": 0.9896, + "step": 17090 + }, + { + "epoch": 1.72, + "grad_norm": 10.102300230727144, + "learning_rate": 4.572305605377852e-06, + "loss": 0.9656, + "step": 17095 + }, + { + "epoch": 1.72, + "grad_norm": 12.58749119120385, + "learning_rate": 4.569383431209153e-06, + "loss": 0.9775, + "step": 17100 + }, + { + "epoch": 1.72, + "grad_norm": 7.591727232593307, + "learning_rate": 4.5664614052152056e-06, + "loss": 0.9955, + "step": 17105 + }, + { + "epoch": 1.73, + "grad_norm": 8.519427533241876, + "learning_rate": 4.5635395284014714e-06, + "loss": 0.9577, + "step": 17110 + }, + { + "epoch": 1.73, + "grad_norm": 9.03651336287879, + "learning_rate": 4.560617801773371e-06, + "loss": 0.9762, + "step": 17115 + }, + { + "epoch": 1.73, + "grad_norm": 5.72300338152943, + "learning_rate": 4.557696226336265e-06, + "loss": 0.9808, + "step": 17120 + }, + { + "epoch": 1.73, + "grad_norm": 10.967811199066373, + "learning_rate": 4.554774803095467e-06, + "loss": 0.9577, + "step": 17125 + }, + { + "epoch": 1.73, + "grad_norm": 20.01930160997758, + "learning_rate": 4.551853533056231e-06, + "loss": 0.9873, + "step": 17130 + }, + { + "epoch": 1.73, + "grad_norm": 6.00087246915051, + "learning_rate": 4.548932417223768e-06, + "loss": 0.9588, + "step": 17135 + }, + { + "epoch": 1.73, + "grad_norm": 18.4835082019977, + "learning_rate": 4.546011456603229e-06, + "loss": 1.0342, + "step": 17140 + }, + { + "epoch": 1.73, + "grad_norm": 10.10114229581885, + "learning_rate": 4.543090652199713e-06, + "loss": 0.9772, + "step": 17145 + }, + { + "epoch": 1.73, + "grad_norm": 10.719237844124875, + "learning_rate": 4.540170005018269e-06, + "loss": 0.9921, + "step": 17150 + }, + { + "epoch": 1.73, + "grad_norm": 38.72733323117287, + "learning_rate": 4.537249516063886e-06, + "loss": 1.0065, + "step": 17155 + }, + { + "epoch": 1.73, + "grad_norm": 18.655554225283357, + "learning_rate": 4.534329186341503e-06, + "loss": 1.0126, + "step": 17160 + }, + { + "epoch": 1.73, + "grad_norm": 6.583481223929379, + "learning_rate": 4.531409016856001e-06, + "loss": 1.0153, + "step": 17165 + }, + { + "epoch": 1.73, + "grad_norm": 17.630329560996714, + "learning_rate": 4.528489008612211e-06, + "loss": 0.9809, + "step": 17170 + }, + { + "epoch": 1.73, + "grad_norm": 19.1968269463294, + "learning_rate": 4.5255691626149e-06, + "loss": 1.0535, + "step": 17175 + }, + { + "epoch": 1.73, + "grad_norm": 7.514058147973323, + "learning_rate": 4.52264947986879e-06, + "loss": 1.0545, + "step": 17180 + }, + { + "epoch": 1.73, + "grad_norm": 37.06831696340187, + "learning_rate": 4.5197299613785355e-06, + "loss": 0.9911, + "step": 17185 + }, + { + "epoch": 1.73, + "grad_norm": 18.270934411108115, + "learning_rate": 4.516810608148744e-06, + "loss": 1.0212, + "step": 17190 + }, + { + "epoch": 1.73, + "grad_norm": 5.937408149311286, + "learning_rate": 4.5138914211839605e-06, + "loss": 0.9908, + "step": 17195 + }, + { + "epoch": 1.73, + "grad_norm": 13.308259612931357, + "learning_rate": 4.510972401488675e-06, + "loss": 1.0033, + "step": 17200 + }, + { + "epoch": 1.73, + "grad_norm": 5.430432297635973, + "learning_rate": 4.5080535500673175e-06, + "loss": 1.0123, + "step": 17205 + }, + { + "epoch": 1.74, + "grad_norm": 5.6980510237613915, + "learning_rate": 4.505134867924266e-06, + "loss": 1.0147, + "step": 17210 + }, + { + "epoch": 1.74, + "grad_norm": 6.142734641134295, + "learning_rate": 4.502216356063831e-06, + "loss": 0.9704, + "step": 17215 + }, + { + "epoch": 1.74, + "grad_norm": 6.755769632860416, + "learning_rate": 4.499298015490275e-06, + "loss": 0.9695, + "step": 17220 + }, + { + "epoch": 1.74, + "grad_norm": 7.707542636435186, + "learning_rate": 4.496379847207792e-06, + "loss": 1.0009, + "step": 17225 + }, + { + "epoch": 1.74, + "grad_norm": 17.107552234617984, + "learning_rate": 4.493461852220524e-06, + "loss": 1.0208, + "step": 17230 + }, + { + "epoch": 1.74, + "grad_norm": 17.540283502674953, + "learning_rate": 4.490544031532546e-06, + "loss": 0.9819, + "step": 17235 + }, + { + "epoch": 1.74, + "grad_norm": 7.8252134143751455, + "learning_rate": 4.487626386147884e-06, + "loss": 1.0146, + "step": 17240 + }, + { + "epoch": 1.74, + "grad_norm": 14.68033910105838, + "learning_rate": 4.48470891707049e-06, + "loss": 1.0048, + "step": 17245 + }, + { + "epoch": 1.74, + "grad_norm": 15.716785015753265, + "learning_rate": 4.481791625304267e-06, + "loss": 1.0389, + "step": 17250 + }, + { + "epoch": 1.74, + "grad_norm": 13.748595545093872, + "learning_rate": 4.478874511853051e-06, + "loss": 0.9763, + "step": 17255 + }, + { + "epoch": 1.74, + "grad_norm": 7.787979772469156, + "learning_rate": 4.475957577720617e-06, + "loss": 0.9967, + "step": 17260 + }, + { + "epoch": 1.74, + "grad_norm": 8.623623483227947, + "learning_rate": 4.473040823910681e-06, + "loss": 1.051, + "step": 17265 + }, + { + "epoch": 1.74, + "grad_norm": 7.899670736913321, + "learning_rate": 4.4701242514268925e-06, + "loss": 1.0022, + "step": 17270 + }, + { + "epoch": 1.74, + "grad_norm": 6.208714786916086, + "learning_rate": 4.467207861272846e-06, + "loss": 1.0399, + "step": 17275 + }, + { + "epoch": 1.74, + "grad_norm": 22.947292874334988, + "learning_rate": 4.464291654452062e-06, + "loss": 1.0067, + "step": 17280 + }, + { + "epoch": 1.74, + "grad_norm": 14.629056253001238, + "learning_rate": 4.46137563196801e-06, + "loss": 0.9718, + "step": 17285 + }, + { + "epoch": 1.74, + "grad_norm": 12.206480636864152, + "learning_rate": 4.458459794824088e-06, + "loss": 0.9993, + "step": 17290 + }, + { + "epoch": 1.74, + "grad_norm": 4.803311960029917, + "learning_rate": 4.455544144023635e-06, + "loss": 0.9975, + "step": 17295 + }, + { + "epoch": 1.74, + "grad_norm": 8.348121825749278, + "learning_rate": 4.452628680569919e-06, + "loss": 0.9796, + "step": 17300 + }, + { + "epoch": 1.74, + "grad_norm": 6.288188813858446, + "learning_rate": 4.449713405466154e-06, + "loss": 0.9764, + "step": 17305 + }, + { + "epoch": 1.75, + "grad_norm": 6.461168584339323, + "learning_rate": 4.446798319715478e-06, + "loss": 0.9525, + "step": 17310 + }, + { + "epoch": 1.75, + "grad_norm": 6.714426835122893, + "learning_rate": 4.443883424320974e-06, + "loss": 1.0277, + "step": 17315 + }, + { + "epoch": 1.75, + "grad_norm": 6.288258486608056, + "learning_rate": 4.440968720285651e-06, + "loss": 0.9845, + "step": 17320 + }, + { + "epoch": 1.75, + "grad_norm": 4.975300294674621, + "learning_rate": 4.438054208612459e-06, + "loss": 0.9864, + "step": 17325 + }, + { + "epoch": 1.75, + "grad_norm": 13.98775495317326, + "learning_rate": 4.435139890304274e-06, + "loss": 1.0285, + "step": 17330 + }, + { + "epoch": 1.75, + "grad_norm": 8.307857755049275, + "learning_rate": 4.4322257663639186e-06, + "loss": 0.9848, + "step": 17335 + }, + { + "epoch": 1.75, + "grad_norm": 6.699145118913506, + "learning_rate": 4.4293118377941295e-06, + "loss": 1.0009, + "step": 17340 + }, + { + "epoch": 1.75, + "grad_norm": 8.902087498401224, + "learning_rate": 4.426398105597595e-06, + "loss": 1.0017, + "step": 17345 + }, + { + "epoch": 1.75, + "grad_norm": 15.285934871439602, + "learning_rate": 4.423484570776923e-06, + "loss": 0.9993, + "step": 17350 + }, + { + "epoch": 1.75, + "grad_norm": 10.455064209350546, + "learning_rate": 4.420571234334661e-06, + "loss": 1.0108, + "step": 17355 + }, + { + "epoch": 1.75, + "grad_norm": 6.4557331429864115, + "learning_rate": 4.417658097273282e-06, + "loss": 0.9708, + "step": 17360 + }, + { + "epoch": 1.75, + "grad_norm": 6.605121792110902, + "learning_rate": 4.414745160595198e-06, + "loss": 0.9765, + "step": 17365 + }, + { + "epoch": 1.75, + "grad_norm": 5.619675388451704, + "learning_rate": 4.411832425302742e-06, + "loss": 1.0003, + "step": 17370 + }, + { + "epoch": 1.75, + "grad_norm": 12.416456959181561, + "learning_rate": 4.408919892398189e-06, + "loss": 0.9591, + "step": 17375 + }, + { + "epoch": 1.75, + "grad_norm": 5.601776990215138, + "learning_rate": 4.406007562883736e-06, + "loss": 1.0323, + "step": 17380 + }, + { + "epoch": 1.75, + "grad_norm": 5.827378110856624, + "learning_rate": 4.403095437761514e-06, + "loss": 0.9753, + "step": 17385 + }, + { + "epoch": 1.75, + "grad_norm": 10.286835453274794, + "learning_rate": 4.400183518033579e-06, + "loss": 0.9774, + "step": 17390 + }, + { + "epoch": 1.75, + "grad_norm": 10.106658867927537, + "learning_rate": 4.397271804701926e-06, + "loss": 0.9667, + "step": 17395 + }, + { + "epoch": 1.75, + "grad_norm": 10.385230411470802, + "learning_rate": 4.394360298768469e-06, + "loss": 0.9837, + "step": 17400 + }, + { + "epoch": 1.75, + "grad_norm": 10.907928573203082, + "learning_rate": 4.391449001235052e-06, + "loss": 0.9549, + "step": 17405 + }, + { + "epoch": 1.76, + "grad_norm": 9.133982511906204, + "learning_rate": 4.388537913103454e-06, + "loss": 1.0296, + "step": 17410 + }, + { + "epoch": 1.76, + "grad_norm": 7.0215145597485265, + "learning_rate": 4.385627035375377e-06, + "loss": 1.0016, + "step": 17415 + }, + { + "epoch": 1.76, + "grad_norm": 10.266710216270274, + "learning_rate": 4.38271636905245e-06, + "loss": 1.0154, + "step": 17420 + }, + { + "epoch": 1.76, + "grad_norm": 7.678286453039773, + "learning_rate": 4.3798059151362295e-06, + "loss": 0.9848, + "step": 17425 + }, + { + "epoch": 1.76, + "grad_norm": 6.549703960442, + "learning_rate": 4.3768956746282026e-06, + "loss": 1.0005, + "step": 17430 + }, + { + "epoch": 1.76, + "grad_norm": 29.5138649807499, + "learning_rate": 4.373985648529777e-06, + "loss": 0.9626, + "step": 17435 + }, + { + "epoch": 1.76, + "grad_norm": 5.480192541997506, + "learning_rate": 4.371075837842294e-06, + "loss": 0.9756, + "step": 17440 + }, + { + "epoch": 1.76, + "grad_norm": 14.568325674625413, + "learning_rate": 4.368166243567013e-06, + "loss": 1.006, + "step": 17445 + }, + { + "epoch": 1.76, + "grad_norm": 12.115886297919321, + "learning_rate": 4.365256866705126e-06, + "loss": 0.9651, + "step": 17450 + }, + { + "epoch": 1.76, + "grad_norm": 5.275895266537964, + "learning_rate": 4.362347708257743e-06, + "loss": 0.962, + "step": 17455 + }, + { + "epoch": 1.76, + "grad_norm": 7.259529619695632, + "learning_rate": 4.359438769225906e-06, + "loss": 1.0017, + "step": 17460 + }, + { + "epoch": 1.76, + "grad_norm": 5.857933774426776, + "learning_rate": 4.3565300506105745e-06, + "loss": 0.9852, + "step": 17465 + }, + { + "epoch": 1.76, + "grad_norm": 6.629255297123736, + "learning_rate": 4.3536215534126404e-06, + "loss": 0.9695, + "step": 17470 + }, + { + "epoch": 1.76, + "grad_norm": 5.3825534811967355, + "learning_rate": 4.350713278632911e-06, + "loss": 0.9809, + "step": 17475 + }, + { + "epoch": 1.76, + "grad_norm": 5.394150217809622, + "learning_rate": 4.3478052272721234e-06, + "loss": 0.9942, + "step": 17480 + }, + { + "epoch": 1.76, + "grad_norm": 6.22363972614146, + "learning_rate": 4.344897400330933e-06, + "loss": 0.9782, + "step": 17485 + }, + { + "epoch": 1.76, + "grad_norm": 9.202572061556888, + "learning_rate": 4.341989798809923e-06, + "loss": 0.9811, + "step": 17490 + }, + { + "epoch": 1.76, + "grad_norm": 6.541440915741455, + "learning_rate": 4.339082423709592e-06, + "loss": 0.9589, + "step": 17495 + }, + { + "epoch": 1.76, + "grad_norm": 5.146462234492245, + "learning_rate": 4.33617527603037e-06, + "loss": 0.9939, + "step": 17500 + }, + { + "epoch": 1.76, + "grad_norm": 5.2775085198018195, + "learning_rate": 4.3332683567726e-06, + "loss": 0.9663, + "step": 17505 + }, + { + "epoch": 1.77, + "grad_norm": 13.61356411742695, + "learning_rate": 4.330361666936555e-06, + "loss": 0.9924, + "step": 17510 + }, + { + "epoch": 1.77, + "grad_norm": 16.77351936985382, + "learning_rate": 4.327455207522417e-06, + "loss": 1.0308, + "step": 17515 + }, + { + "epoch": 1.77, + "grad_norm": 14.68995331942081, + "learning_rate": 4.3245489795303025e-06, + "loss": 1.0281, + "step": 17520 + }, + { + "epoch": 1.77, + "grad_norm": 15.917899439391954, + "learning_rate": 4.321642983960238e-06, + "loss": 0.9828, + "step": 17525 + }, + { + "epoch": 1.77, + "grad_norm": 6.713513046333969, + "learning_rate": 4.318737221812173e-06, + "loss": 1.0316, + "step": 17530 + }, + { + "epoch": 1.77, + "grad_norm": 7.220594770385739, + "learning_rate": 4.315831694085982e-06, + "loss": 0.9288, + "step": 17535 + }, + { + "epoch": 1.77, + "grad_norm": 9.360087324298462, + "learning_rate": 4.31292640178145e-06, + "loss": 0.9691, + "step": 17540 + }, + { + "epoch": 1.77, + "grad_norm": 14.104011531131553, + "learning_rate": 4.310021345898288e-06, + "loss": 0.9856, + "step": 17545 + }, + { + "epoch": 1.77, + "grad_norm": 6.394733795721605, + "learning_rate": 4.307116527436121e-06, + "loss": 0.9483, + "step": 17550 + }, + { + "epoch": 1.77, + "grad_norm": 5.6137553038406685, + "learning_rate": 4.304211947394496e-06, + "loss": 0.9874, + "step": 17555 + }, + { + "epoch": 1.77, + "grad_norm": 17.249399564508376, + "learning_rate": 4.301307606772873e-06, + "loss": 0.9847, + "step": 17560 + }, + { + "epoch": 1.77, + "grad_norm": 6.970658585095424, + "learning_rate": 4.298403506570638e-06, + "loss": 0.992, + "step": 17565 + }, + { + "epoch": 1.77, + "grad_norm": 20.018871333149153, + "learning_rate": 4.295499647787085e-06, + "loss": 0.9623, + "step": 17570 + }, + { + "epoch": 1.77, + "grad_norm": 8.33446100071459, + "learning_rate": 4.292596031421431e-06, + "loss": 0.9659, + "step": 17575 + }, + { + "epoch": 1.77, + "grad_norm": 5.655812375433922, + "learning_rate": 4.289692658472806e-06, + "loss": 0.9549, + "step": 17580 + }, + { + "epoch": 1.77, + "grad_norm": 16.006061782628333, + "learning_rate": 4.28678952994026e-06, + "loss": 0.978, + "step": 17585 + }, + { + "epoch": 1.77, + "grad_norm": 10.31919048127266, + "learning_rate": 4.283886646822753e-06, + "loss": 0.9939, + "step": 17590 + }, + { + "epoch": 1.77, + "grad_norm": 4.898513394142511, + "learning_rate": 4.2809840101191695e-06, + "loss": 0.9559, + "step": 17595 + }, + { + "epoch": 1.77, + "grad_norm": 22.10211628788823, + "learning_rate": 4.2780816208282995e-06, + "loss": 0.9865, + "step": 17600 + }, + { + "epoch": 1.77, + "grad_norm": 13.142924272084487, + "learning_rate": 4.275179479948856e-06, + "loss": 0.9863, + "step": 17605 + }, + { + "epoch": 1.78, + "grad_norm": 43.70523431723558, + "learning_rate": 4.272277588479461e-06, + "loss": 0.9565, + "step": 17610 + }, + { + "epoch": 1.78, + "grad_norm": 22.507175546613677, + "learning_rate": 4.269375947418652e-06, + "loss": 0.9726, + "step": 17615 + }, + { + "epoch": 1.78, + "grad_norm": 17.30146559379677, + "learning_rate": 4.2664745577648806e-06, + "loss": 1.0624, + "step": 17620 + }, + { + "epoch": 1.78, + "grad_norm": 25.06644250902353, + "learning_rate": 4.2635734205165155e-06, + "loss": 1.0045, + "step": 17625 + }, + { + "epoch": 1.78, + "grad_norm": 8.910629056259536, + "learning_rate": 4.260672536671832e-06, + "loss": 0.9652, + "step": 17630 + }, + { + "epoch": 1.78, + "grad_norm": 15.806887353642972, + "learning_rate": 4.257771907229023e-06, + "loss": 0.969, + "step": 17635 + }, + { + "epoch": 1.78, + "grad_norm": 5.1094952043062385, + "learning_rate": 4.254871533186191e-06, + "loss": 1.003, + "step": 17640 + }, + { + "epoch": 1.78, + "grad_norm": 20.289905829013996, + "learning_rate": 4.2519714155413534e-06, + "loss": 1.0062, + "step": 17645 + }, + { + "epoch": 1.78, + "grad_norm": 6.23840709833831, + "learning_rate": 4.249071555292438e-06, + "loss": 0.964, + "step": 17650 + }, + { + "epoch": 1.78, + "grad_norm": 37.41253753110922, + "learning_rate": 4.246171953437281e-06, + "loss": 1.0014, + "step": 17655 + }, + { + "epoch": 1.78, + "grad_norm": 6.888293950500067, + "learning_rate": 4.2432726109736365e-06, + "loss": 0.9665, + "step": 17660 + }, + { + "epoch": 1.78, + "grad_norm": 5.644402360932366, + "learning_rate": 4.240373528899163e-06, + "loss": 0.9564, + "step": 17665 + }, + { + "epoch": 1.78, + "grad_norm": 6.026582771228052, + "learning_rate": 4.237474708211434e-06, + "loss": 0.9931, + "step": 17670 + }, + { + "epoch": 1.78, + "grad_norm": 10.193252882210064, + "learning_rate": 4.234576149907928e-06, + "loss": 1.0125, + "step": 17675 + }, + { + "epoch": 1.78, + "grad_norm": 13.234158378729202, + "learning_rate": 4.2316778549860395e-06, + "loss": 0.9782, + "step": 17680 + }, + { + "epoch": 1.78, + "grad_norm": 7.290440088877322, + "learning_rate": 4.228779824443065e-06, + "loss": 1.0064, + "step": 17685 + }, + { + "epoch": 1.78, + "grad_norm": 11.854939976517208, + "learning_rate": 4.225882059276217e-06, + "loss": 1.0144, + "step": 17690 + }, + { + "epoch": 1.78, + "grad_norm": 6.235898705036122, + "learning_rate": 4.222984560482614e-06, + "loss": 0.9722, + "step": 17695 + }, + { + "epoch": 1.78, + "grad_norm": 18.231748698058492, + "learning_rate": 4.220087329059281e-06, + "loss": 0.9929, + "step": 17700 + }, + { + "epoch": 1.79, + "grad_norm": 6.612949224990907, + "learning_rate": 4.2171903660031535e-06, + "loss": 0.9802, + "step": 17705 + }, + { + "epoch": 1.79, + "grad_norm": 18.64871835788384, + "learning_rate": 4.214293672311073e-06, + "loss": 1.0134, + "step": 17710 + }, + { + "epoch": 1.79, + "grad_norm": 27.804267649031544, + "learning_rate": 4.211397248979789e-06, + "loss": 0.9994, + "step": 17715 + }, + { + "epoch": 1.79, + "grad_norm": 6.232331289327623, + "learning_rate": 4.208501097005959e-06, + "loss": 0.9865, + "step": 17720 + }, + { + "epoch": 1.79, + "grad_norm": 7.36852730763666, + "learning_rate": 4.205605217386145e-06, + "loss": 0.9561, + "step": 17725 + }, + { + "epoch": 1.79, + "grad_norm": 5.900038904042403, + "learning_rate": 4.202709611116818e-06, + "loss": 0.996, + "step": 17730 + }, + { + "epoch": 1.79, + "grad_norm": 29.413539099777385, + "learning_rate": 4.1998142791943515e-06, + "loss": 1.0042, + "step": 17735 + }, + { + "epoch": 1.79, + "grad_norm": 11.892005516743971, + "learning_rate": 4.196919222615029e-06, + "loss": 1.0257, + "step": 17740 + }, + { + "epoch": 1.79, + "grad_norm": 6.966387701303805, + "learning_rate": 4.194024442375032e-06, + "loss": 0.9833, + "step": 17745 + }, + { + "epoch": 1.79, + "grad_norm": 8.231042995618557, + "learning_rate": 4.191129939470457e-06, + "loss": 0.9778, + "step": 17750 + }, + { + "epoch": 1.79, + "grad_norm": 16.639381946636583, + "learning_rate": 4.188235714897296e-06, + "loss": 0.9758, + "step": 17755 + }, + { + "epoch": 1.79, + "grad_norm": 5.232707031871469, + "learning_rate": 4.185341769651451e-06, + "loss": 0.9965, + "step": 17760 + }, + { + "epoch": 1.79, + "grad_norm": 4.799057501236086, + "learning_rate": 4.1824481047287244e-06, + "loss": 0.9631, + "step": 17765 + }, + { + "epoch": 1.79, + "grad_norm": 18.93211267186369, + "learning_rate": 4.179554721124825e-06, + "loss": 1.0005, + "step": 17770 + }, + { + "epoch": 1.79, + "grad_norm": 6.342829499951731, + "learning_rate": 4.17666161983536e-06, + "loss": 0.9595, + "step": 17775 + }, + { + "epoch": 1.79, + "grad_norm": 7.298086003533476, + "learning_rate": 4.1737688018558476e-06, + "loss": 1.0069, + "step": 17780 + }, + { + "epoch": 1.79, + "grad_norm": 9.309454412926847, + "learning_rate": 4.170876268181703e-06, + "loss": 0.9926, + "step": 17785 + }, + { + "epoch": 1.79, + "grad_norm": 8.48349237724496, + "learning_rate": 4.167984019808238e-06, + "loss": 0.9817, + "step": 17790 + }, + { + "epoch": 1.79, + "grad_norm": 7.696426460645399, + "learning_rate": 4.165092057730681e-06, + "loss": 1.0428, + "step": 17795 + }, + { + "epoch": 1.79, + "grad_norm": 5.215360712972935, + "learning_rate": 4.1622003829441474e-06, + "loss": 0.987, + "step": 17800 + }, + { + "epoch": 1.8, + "grad_norm": 5.644196565159671, + "learning_rate": 4.1593089964436625e-06, + "loss": 0.977, + "step": 17805 + }, + { + "epoch": 1.8, + "grad_norm": 5.107974839971601, + "learning_rate": 4.156417899224147e-06, + "loss": 0.9941, + "step": 17810 + }, + { + "epoch": 1.8, + "grad_norm": 5.301233945541358, + "learning_rate": 4.153527092280431e-06, + "loss": 1.0134, + "step": 17815 + }, + { + "epoch": 1.8, + "grad_norm": 7.220120395735668, + "learning_rate": 4.15063657660723e-06, + "loss": 1.0002, + "step": 17820 + }, + { + "epoch": 1.8, + "grad_norm": 15.811694128818516, + "learning_rate": 4.147746353199174e-06, + "loss": 0.9888, + "step": 17825 + }, + { + "epoch": 1.8, + "grad_norm": 12.713258453209393, + "learning_rate": 4.144856423050783e-06, + "loss": 0.9514, + "step": 17830 + }, + { + "epoch": 1.8, + "grad_norm": 10.175345447281716, + "learning_rate": 4.14196678715648e-06, + "loss": 0.9518, + "step": 17835 + }, + { + "epoch": 1.8, + "grad_norm": 7.196682747430522, + "learning_rate": 4.139077446510585e-06, + "loss": 1.0259, + "step": 17840 + }, + { + "epoch": 1.8, + "grad_norm": 10.721920938939746, + "learning_rate": 4.136188402107322e-06, + "loss": 0.9831, + "step": 17845 + }, + { + "epoch": 1.8, + "grad_norm": 16.312284928694034, + "learning_rate": 4.133299654940801e-06, + "loss": 0.9578, + "step": 17850 + }, + { + "epoch": 1.8, + "grad_norm": 8.727399474249944, + "learning_rate": 4.130411206005043e-06, + "loss": 1.0198, + "step": 17855 + }, + { + "epoch": 1.8, + "grad_norm": 8.203930546045322, + "learning_rate": 4.127523056293958e-06, + "loss": 1.0151, + "step": 17860 + }, + { + "epoch": 1.8, + "grad_norm": 6.905780390122556, + "learning_rate": 4.124635206801356e-06, + "loss": 0.9554, + "step": 17865 + }, + { + "epoch": 1.8, + "grad_norm": 14.270094254344347, + "learning_rate": 4.121747658520942e-06, + "loss": 0.9901, + "step": 17870 + }, + { + "epoch": 1.8, + "grad_norm": 24.142268165349467, + "learning_rate": 4.1188604124463224e-06, + "loss": 1.0144, + "step": 17875 + }, + { + "epoch": 1.8, + "grad_norm": 6.472023048193844, + "learning_rate": 4.115973469570991e-06, + "loss": 0.9952, + "step": 17880 + }, + { + "epoch": 1.8, + "grad_norm": 7.9995434492814255, + "learning_rate": 4.113086830888346e-06, + "loss": 1.0237, + "step": 17885 + }, + { + "epoch": 1.8, + "grad_norm": 6.160793829862443, + "learning_rate": 4.1102004973916744e-06, + "loss": 0.9899, + "step": 17890 + }, + { + "epoch": 1.8, + "grad_norm": 9.314565949813037, + "learning_rate": 4.107314470074163e-06, + "loss": 0.9722, + "step": 17895 + }, + { + "epoch": 1.8, + "grad_norm": 11.753414499732502, + "learning_rate": 4.104428749928887e-06, + "loss": 0.9672, + "step": 17900 + }, + { + "epoch": 1.81, + "grad_norm": 11.277581642321612, + "learning_rate": 4.101543337948826e-06, + "loss": 0.9993, + "step": 17905 + }, + { + "epoch": 1.81, + "grad_norm": 9.02303347969287, + "learning_rate": 4.0986582351268445e-06, + "loss": 0.9497, + "step": 17910 + }, + { + "epoch": 1.81, + "grad_norm": 22.410472744076394, + "learning_rate": 4.0957734424557e-06, + "loss": 1.0122, + "step": 17915 + }, + { + "epoch": 1.81, + "grad_norm": 25.44505269635536, + "learning_rate": 4.092888960928053e-06, + "loss": 0.9531, + "step": 17920 + }, + { + "epoch": 1.81, + "grad_norm": 6.758375815804422, + "learning_rate": 4.090004791536447e-06, + "loss": 1.0082, + "step": 17925 + }, + { + "epoch": 1.81, + "grad_norm": 7.558547984958102, + "learning_rate": 4.0871209352733244e-06, + "loss": 1.0401, + "step": 17930 + }, + { + "epoch": 1.81, + "grad_norm": 4.957332675445743, + "learning_rate": 4.084237393131015e-06, + "loss": 1.0092, + "step": 17935 + }, + { + "epoch": 1.81, + "grad_norm": 5.890734746420278, + "learning_rate": 4.081354166101744e-06, + "loss": 0.9885, + "step": 17940 + }, + { + "epoch": 1.81, + "grad_norm": 6.54145049689193, + "learning_rate": 4.078471255177626e-06, + "loss": 0.981, + "step": 17945 + }, + { + "epoch": 1.81, + "grad_norm": 23.910132442804926, + "learning_rate": 4.075588661350671e-06, + "loss": 1.0282, + "step": 17950 + }, + { + "epoch": 1.81, + "grad_norm": 5.24292684826029, + "learning_rate": 4.072706385612773e-06, + "loss": 0.9975, + "step": 17955 + }, + { + "epoch": 1.81, + "grad_norm": 19.115389829935094, + "learning_rate": 4.069824428955724e-06, + "loss": 0.9957, + "step": 17960 + }, + { + "epoch": 1.81, + "grad_norm": 4.945914484492379, + "learning_rate": 4.066942792371198e-06, + "loss": 0.9641, + "step": 17965 + }, + { + "epoch": 1.81, + "grad_norm": 10.521339685687494, + "learning_rate": 4.064061476850769e-06, + "loss": 0.9963, + "step": 17970 + }, + { + "epoch": 1.81, + "grad_norm": 5.894472343268108, + "learning_rate": 4.0611804833858885e-06, + "loss": 1.0377, + "step": 17975 + }, + { + "epoch": 1.81, + "grad_norm": 13.576633426606861, + "learning_rate": 4.05829981296791e-06, + "loss": 0.9905, + "step": 17980 + }, + { + "epoch": 1.81, + "grad_norm": 24.05532145531544, + "learning_rate": 4.055419466588064e-06, + "loss": 1.0365, + "step": 17985 + }, + { + "epoch": 1.81, + "grad_norm": 6.015289000674614, + "learning_rate": 4.05253944523748e-06, + "loss": 0.9566, + "step": 17990 + }, + { + "epoch": 1.81, + "grad_norm": 8.899950989787076, + "learning_rate": 4.049659749907165e-06, + "loss": 1.0058, + "step": 17995 + }, + { + "epoch": 1.81, + "grad_norm": 7.243197684418895, + "learning_rate": 4.046780381588025e-06, + "loss": 1.0171, + "step": 18000 + }, + { + "epoch": 1.82, + "grad_norm": 6.474829652542478, + "learning_rate": 4.043901341270843e-06, + "loss": 0.9419, + "step": 18005 + }, + { + "epoch": 1.82, + "grad_norm": 6.102354302507712, + "learning_rate": 4.041022629946298e-06, + "loss": 0.9974, + "step": 18010 + }, + { + "epoch": 1.82, + "grad_norm": 6.581577273091294, + "learning_rate": 4.038144248604949e-06, + "loss": 0.9694, + "step": 18015 + }, + { + "epoch": 1.82, + "grad_norm": 5.7366265831491345, + "learning_rate": 4.035266198237247e-06, + "loss": 1.0029, + "step": 18020 + }, + { + "epoch": 1.82, + "grad_norm": 11.450022518303586, + "learning_rate": 4.032388479833522e-06, + "loss": 0.9699, + "step": 18025 + }, + { + "epoch": 1.82, + "grad_norm": 21.668572220867887, + "learning_rate": 4.029511094384001e-06, + "loss": 0.9723, + "step": 18030 + }, + { + "epoch": 1.82, + "grad_norm": 9.873050740877582, + "learning_rate": 4.026634042878782e-06, + "loss": 0.9608, + "step": 18035 + }, + { + "epoch": 1.82, + "grad_norm": 4.936992079591061, + "learning_rate": 4.02375732630786e-06, + "loss": 0.9841, + "step": 18040 + }, + { + "epoch": 1.82, + "grad_norm": 16.356020950439156, + "learning_rate": 4.020880945661111e-06, + "loss": 0.9837, + "step": 18045 + }, + { + "epoch": 1.82, + "grad_norm": 5.460548595726973, + "learning_rate": 4.0180049019282916e-06, + "loss": 0.9915, + "step": 18050 + }, + { + "epoch": 1.82, + "grad_norm": 7.417525899749307, + "learning_rate": 4.015129196099049e-06, + "loss": 1.0166, + "step": 18055 + }, + { + "epoch": 1.82, + "grad_norm": 10.20607296161159, + "learning_rate": 4.012253829162908e-06, + "loss": 0.9829, + "step": 18060 + }, + { + "epoch": 1.82, + "grad_norm": 4.930258224866766, + "learning_rate": 4.0093788021092826e-06, + "loss": 0.991, + "step": 18065 + }, + { + "epoch": 1.82, + "grad_norm": 7.401745585192158, + "learning_rate": 4.006504115927463e-06, + "loss": 1.0106, + "step": 18070 + }, + { + "epoch": 1.82, + "grad_norm": 7.793097611353607, + "learning_rate": 4.0036297716066295e-06, + "loss": 1.0181, + "step": 18075 + }, + { + "epoch": 1.82, + "grad_norm": 6.639441474692842, + "learning_rate": 4.0007557701358405e-06, + "loss": 0.9854, + "step": 18080 + }, + { + "epoch": 1.82, + "grad_norm": 4.979689799633668, + "learning_rate": 3.997882112504036e-06, + "loss": 0.9966, + "step": 18085 + }, + { + "epoch": 1.82, + "grad_norm": 18.30019551431489, + "learning_rate": 3.995008799700039e-06, + "loss": 0.9399, + "step": 18090 + }, + { + "epoch": 1.82, + "grad_norm": 9.010236910045, + "learning_rate": 3.992135832712555e-06, + "loss": 0.973, + "step": 18095 + }, + { + "epoch": 1.82, + "grad_norm": 14.357174340154753, + "learning_rate": 3.9892632125301664e-06, + "loss": 0.9703, + "step": 18100 + }, + { + "epoch": 1.83, + "grad_norm": 6.800647275884193, + "learning_rate": 3.986390940141344e-06, + "loss": 1.0361, + "step": 18105 + }, + { + "epoch": 1.83, + "grad_norm": 10.66733919132119, + "learning_rate": 3.983519016534429e-06, + "loss": 0.9551, + "step": 18110 + }, + { + "epoch": 1.83, + "grad_norm": 10.08796648741792, + "learning_rate": 3.980647442697651e-06, + "loss": 0.9354, + "step": 18115 + }, + { + "epoch": 1.83, + "grad_norm": 4.8986084699880825, + "learning_rate": 3.977776219619113e-06, + "loss": 0.9813, + "step": 18120 + }, + { + "epoch": 1.83, + "grad_norm": 4.779198222654169, + "learning_rate": 3.974905348286803e-06, + "loss": 1.004, + "step": 18125 + }, + { + "epoch": 1.83, + "grad_norm": 11.207525881092016, + "learning_rate": 3.9720348296885805e-06, + "loss": 0.9679, + "step": 18130 + }, + { + "epoch": 1.83, + "grad_norm": 12.756377143981297, + "learning_rate": 3.969164664812194e-06, + "loss": 0.9359, + "step": 18135 + }, + { + "epoch": 1.83, + "grad_norm": 7.282128100521323, + "learning_rate": 3.966294854645261e-06, + "loss": 0.943, + "step": 18140 + }, + { + "epoch": 1.83, + "grad_norm": 8.343543523016564, + "learning_rate": 3.963425400175282e-06, + "loss": 0.9858, + "step": 18145 + }, + { + "epoch": 1.83, + "grad_norm": 14.959195280855614, + "learning_rate": 3.960556302389632e-06, + "loss": 0.9732, + "step": 18150 + }, + { + "epoch": 1.83, + "grad_norm": 5.4525478859237255, + "learning_rate": 3.957687562275566e-06, + "loss": 0.9777, + "step": 18155 + }, + { + "epoch": 1.83, + "grad_norm": 6.0295155709128565, + "learning_rate": 3.954819180820213e-06, + "loss": 1.0042, + "step": 18160 + }, + { + "epoch": 1.83, + "grad_norm": 5.130634304881189, + "learning_rate": 3.951951159010583e-06, + "loss": 0.9856, + "step": 18165 + }, + { + "epoch": 1.83, + "grad_norm": 9.513383734699328, + "learning_rate": 3.949083497833558e-06, + "loss": 0.9726, + "step": 18170 + }, + { + "epoch": 1.83, + "grad_norm": 5.615005606728285, + "learning_rate": 3.946216198275897e-06, + "loss": 0.9881, + "step": 18175 + }, + { + "epoch": 1.83, + "grad_norm": 5.967220890591053, + "learning_rate": 3.943349261324237e-06, + "loss": 0.9645, + "step": 18180 + }, + { + "epoch": 1.83, + "grad_norm": 8.03922077754858, + "learning_rate": 3.940482687965085e-06, + "loss": 1.0003, + "step": 18185 + }, + { + "epoch": 1.83, + "grad_norm": 9.307895107855172, + "learning_rate": 3.937616479184831e-06, + "loss": 0.9933, + "step": 18190 + }, + { + "epoch": 1.83, + "grad_norm": 18.744700072412474, + "learning_rate": 3.934750635969728e-06, + "loss": 0.9654, + "step": 18195 + }, + { + "epoch": 1.83, + "grad_norm": 11.952335132138158, + "learning_rate": 3.931885159305916e-06, + "loss": 1.0467, + "step": 18200 + }, + { + "epoch": 1.84, + "grad_norm": 15.245737402574543, + "learning_rate": 3.9290200501794e-06, + "loss": 0.9912, + "step": 18205 + }, + { + "epoch": 1.84, + "grad_norm": 8.934123527505806, + "learning_rate": 3.926155309576063e-06, + "loss": 0.9621, + "step": 18210 + }, + { + "epoch": 1.84, + "grad_norm": 6.822756484981494, + "learning_rate": 3.923290938481657e-06, + "loss": 0.9913, + "step": 18215 + }, + { + "epoch": 1.84, + "grad_norm": 7.3094605135069015, + "learning_rate": 3.920426937881812e-06, + "loss": 1.0135, + "step": 18220 + }, + { + "epoch": 1.84, + "grad_norm": 8.234278005327626, + "learning_rate": 3.917563308762024e-06, + "loss": 1.024, + "step": 18225 + }, + { + "epoch": 1.84, + "grad_norm": 14.143253937630613, + "learning_rate": 3.9147000521076695e-06, + "loss": 1.0049, + "step": 18230 + }, + { + "epoch": 1.84, + "grad_norm": 8.190834230959963, + "learning_rate": 3.9118371689039905e-06, + "loss": 1.0115, + "step": 18235 + }, + { + "epoch": 1.84, + "grad_norm": 7.641550092853329, + "learning_rate": 3.908974660136102e-06, + "loss": 0.9952, + "step": 18240 + }, + { + "epoch": 1.84, + "grad_norm": 8.542498068012963, + "learning_rate": 3.906112526788991e-06, + "loss": 0.9804, + "step": 18245 + }, + { + "epoch": 1.84, + "grad_norm": 12.868475394958528, + "learning_rate": 3.903250769847515e-06, + "loss": 1.0248, + "step": 18250 + }, + { + "epoch": 1.84, + "grad_norm": 10.90066644420374, + "learning_rate": 3.9003893902964e-06, + "loss": 0.9451, + "step": 18255 + }, + { + "epoch": 1.84, + "grad_norm": 27.968509274783298, + "learning_rate": 3.897528389120247e-06, + "loss": 0.9905, + "step": 18260 + }, + { + "epoch": 1.84, + "grad_norm": 8.71773024020709, + "learning_rate": 3.894667767303523e-06, + "loss": 0.9664, + "step": 18265 + }, + { + "epoch": 1.84, + "grad_norm": 9.158395474912115, + "learning_rate": 3.891807525830564e-06, + "loss": 1.0286, + "step": 18270 + }, + { + "epoch": 1.84, + "grad_norm": 5.322537717175965, + "learning_rate": 3.888947665685576e-06, + "loss": 1.0008, + "step": 18275 + }, + { + "epoch": 1.84, + "grad_norm": 5.961458444995351, + "learning_rate": 3.8860881878526365e-06, + "loss": 0.9912, + "step": 18280 + }, + { + "epoch": 1.84, + "grad_norm": 7.145722663367105, + "learning_rate": 3.883229093315686e-06, + "loss": 1.0161, + "step": 18285 + }, + { + "epoch": 1.84, + "grad_norm": 5.074742049472983, + "learning_rate": 3.880370383058539e-06, + "loss": 0.9978, + "step": 18290 + }, + { + "epoch": 1.84, + "grad_norm": 4.970232085990174, + "learning_rate": 3.877512058064876e-06, + "loss": 0.9639, + "step": 18295 + }, + { + "epoch": 1.85, + "grad_norm": 8.412604579770981, + "learning_rate": 3.8746541193182375e-06, + "loss": 1.0341, + "step": 18300 + }, + { + "epoch": 1.85, + "grad_norm": 8.680626298857828, + "learning_rate": 3.871796567802044e-06, + "loss": 0.9469, + "step": 18305 + }, + { + "epoch": 1.85, + "grad_norm": 4.796765209774283, + "learning_rate": 3.868939404499573e-06, + "loss": 0.942, + "step": 18310 + }, + { + "epoch": 1.85, + "grad_norm": 17.149810215370284, + "learning_rate": 3.8660826303939745e-06, + "loss": 0.9614, + "step": 18315 + }, + { + "epoch": 1.85, + "grad_norm": 21.602876098567382, + "learning_rate": 3.863226246468256e-06, + "loss": 1.0041, + "step": 18320 + }, + { + "epoch": 1.85, + "grad_norm": 5.5164022506358705, + "learning_rate": 3.860370253705304e-06, + "loss": 0.9953, + "step": 18325 + }, + { + "epoch": 1.85, + "grad_norm": 6.346002360848893, + "learning_rate": 3.857514653087857e-06, + "loss": 0.9913, + "step": 18330 + }, + { + "epoch": 1.85, + "grad_norm": 5.696269583932353, + "learning_rate": 3.854659445598529e-06, + "loss": 0.9501, + "step": 18335 + }, + { + "epoch": 1.85, + "grad_norm": 13.613799479606872, + "learning_rate": 3.85180463221979e-06, + "loss": 0.9782, + "step": 18340 + }, + { + "epoch": 1.85, + "grad_norm": 5.906200366057462, + "learning_rate": 3.848950213933982e-06, + "loss": 0.9995, + "step": 18345 + }, + { + "epoch": 1.85, + "grad_norm": 9.985103632638992, + "learning_rate": 3.846096191723303e-06, + "loss": 1.0518, + "step": 18350 + }, + { + "epoch": 1.85, + "grad_norm": 7.450855140786176, + "learning_rate": 3.843242566569826e-06, + "loss": 1.0015, + "step": 18355 + }, + { + "epoch": 1.85, + "grad_norm": 7.17042311212853, + "learning_rate": 3.840389339455474e-06, + "loss": 0.9941, + "step": 18360 + }, + { + "epoch": 1.85, + "grad_norm": 6.19466857070708, + "learning_rate": 3.837536511362045e-06, + "loss": 0.9699, + "step": 18365 + }, + { + "epoch": 1.85, + "grad_norm": 10.901155997440986, + "learning_rate": 3.834684083271191e-06, + "loss": 0.9548, + "step": 18370 + }, + { + "epoch": 1.85, + "grad_norm": 19.352982354625738, + "learning_rate": 3.831832056164431e-06, + "loss": 0.9756, + "step": 18375 + }, + { + "epoch": 1.85, + "grad_norm": 6.70998177042718, + "learning_rate": 3.8289804310231434e-06, + "loss": 1.0063, + "step": 18380 + }, + { + "epoch": 1.85, + "grad_norm": 25.9351320245653, + "learning_rate": 3.826129208828573e-06, + "loss": 0.9915, + "step": 18385 + }, + { + "epoch": 1.85, + "grad_norm": 5.987625138296798, + "learning_rate": 3.82327839056182e-06, + "loss": 0.9812, + "step": 18390 + }, + { + "epoch": 1.85, + "grad_norm": 13.41267269943096, + "learning_rate": 3.8204279772038495e-06, + "loss": 0.9616, + "step": 18395 + }, + { + "epoch": 1.86, + "grad_norm": 5.9943162502046015, + "learning_rate": 3.817577969735485e-06, + "loss": 0.9764, + "step": 18400 + }, + { + "epoch": 1.86, + "grad_norm": 5.261829409492166, + "learning_rate": 3.8147283691374124e-06, + "loss": 0.9682, + "step": 18405 + }, + { + "epoch": 1.86, + "grad_norm": 6.718804953227591, + "learning_rate": 3.811879176390174e-06, + "loss": 0.9882, + "step": 18410 + }, + { + "epoch": 1.86, + "grad_norm": 7.7449916557750855, + "learning_rate": 3.8090303924741784e-06, + "loss": 1.0081, + "step": 18415 + }, + { + "epoch": 1.86, + "grad_norm": 19.46955376180154, + "learning_rate": 3.8061820183696857e-06, + "loss": 0.9514, + "step": 18420 + }, + { + "epoch": 1.86, + "grad_norm": 6.0383605638679505, + "learning_rate": 3.803334055056822e-06, + "loss": 0.9281, + "step": 18425 + }, + { + "epoch": 1.86, + "grad_norm": 7.50864837032302, + "learning_rate": 3.8004865035155662e-06, + "loss": 0.9906, + "step": 18430 + }, + { + "epoch": 1.86, + "grad_norm": 4.858201015924298, + "learning_rate": 3.797639364725758e-06, + "loss": 0.928, + "step": 18435 + }, + { + "epoch": 1.86, + "grad_norm": 12.82481969041467, + "learning_rate": 3.7947926396670975e-06, + "loss": 0.9848, + "step": 18440 + }, + { + "epoch": 1.86, + "grad_norm": 6.861048594296718, + "learning_rate": 3.791946329319136e-06, + "loss": 0.9813, + "step": 18445 + }, + { + "epoch": 1.86, + "grad_norm": 6.975396611645755, + "learning_rate": 3.7891004346612925e-06, + "loss": 0.9723, + "step": 18450 + }, + { + "epoch": 1.86, + "grad_norm": 20.182853926554543, + "learning_rate": 3.786254956672829e-06, + "loss": 0.9931, + "step": 18455 + }, + { + "epoch": 1.86, + "grad_norm": 13.837429575125533, + "learning_rate": 3.7834098963328784e-06, + "loss": 0.9738, + "step": 18460 + }, + { + "epoch": 1.86, + "grad_norm": 5.216565893064262, + "learning_rate": 3.7805652546204193e-06, + "loss": 0.9949, + "step": 18465 + }, + { + "epoch": 1.86, + "grad_norm": 11.355112575571148, + "learning_rate": 3.777721032514292e-06, + "loss": 0.9918, + "step": 18470 + }, + { + "epoch": 1.86, + "grad_norm": 6.094017204819921, + "learning_rate": 3.7748772309931887e-06, + "loss": 0.9381, + "step": 18475 + }, + { + "epoch": 1.86, + "grad_norm": 12.374752647571407, + "learning_rate": 3.7720338510356623e-06, + "loss": 0.9747, + "step": 18480 + }, + { + "epoch": 1.86, + "grad_norm": 5.5966625675434125, + "learning_rate": 3.7691908936201126e-06, + "loss": 0.9333, + "step": 18485 + }, + { + "epoch": 1.86, + "grad_norm": 5.305843062394117, + "learning_rate": 3.7663483597248023e-06, + "loss": 0.9548, + "step": 18490 + }, + { + "epoch": 1.86, + "grad_norm": 9.114471686460963, + "learning_rate": 3.7635062503278425e-06, + "loss": 1.0119, + "step": 18495 + }, + { + "epoch": 1.87, + "grad_norm": 5.816051820282711, + "learning_rate": 3.7606645664072016e-06, + "loss": 0.9708, + "step": 18500 + }, + { + "epoch": 1.87, + "grad_norm": 9.49667170674841, + "learning_rate": 3.7578233089406984e-06, + "loss": 0.9589, + "step": 18505 + }, + { + "epoch": 1.87, + "grad_norm": 5.761558730111664, + "learning_rate": 3.754982478906011e-06, + "loss": 0.9756, + "step": 18510 + }, + { + "epoch": 1.87, + "grad_norm": 10.967302332582332, + "learning_rate": 3.75214207728066e-06, + "loss": 0.9411, + "step": 18515 + }, + { + "epoch": 1.87, + "grad_norm": 5.116253224622286, + "learning_rate": 3.7493021050420308e-06, + "loss": 0.9907, + "step": 18520 + }, + { + "epoch": 1.87, + "grad_norm": 24.120440143339238, + "learning_rate": 3.746462563167351e-06, + "loss": 0.9318, + "step": 18525 + }, + { + "epoch": 1.87, + "grad_norm": 10.41418126468614, + "learning_rate": 3.7436234526337077e-06, + "loss": 0.9539, + "step": 18530 + }, + { + "epoch": 1.87, + "grad_norm": 6.873694965923555, + "learning_rate": 3.740784774418032e-06, + "loss": 1.0013, + "step": 18535 + }, + { + "epoch": 1.87, + "grad_norm": 21.799567648823835, + "learning_rate": 3.7379465294971164e-06, + "loss": 1.0051, + "step": 18540 + }, + { + "epoch": 1.87, + "grad_norm": 7.05982153807311, + "learning_rate": 3.7351087188475904e-06, + "loss": 0.9677, + "step": 18545 + }, + { + "epoch": 1.87, + "grad_norm": 32.014472362161165, + "learning_rate": 3.732271343445949e-06, + "loss": 0.9886, + "step": 18550 + }, + { + "epoch": 1.87, + "grad_norm": 8.146122922065649, + "learning_rate": 3.729434404268527e-06, + "loss": 0.9481, + "step": 18555 + }, + { + "epoch": 1.87, + "grad_norm": 17.909251593077418, + "learning_rate": 3.726597902291512e-06, + "loss": 1.0004, + "step": 18560 + }, + { + "epoch": 1.87, + "grad_norm": 6.9456126927850335, + "learning_rate": 3.7237618384909446e-06, + "loss": 0.9513, + "step": 18565 + }, + { + "epoch": 1.87, + "grad_norm": 7.361620530332281, + "learning_rate": 3.720926213842707e-06, + "loss": 0.9708, + "step": 18570 + }, + { + "epoch": 1.87, + "grad_norm": 7.811469339075542, + "learning_rate": 3.7180910293225404e-06, + "loss": 1.0052, + "step": 18575 + }, + { + "epoch": 1.87, + "grad_norm": 14.513163674141524, + "learning_rate": 3.715256285906023e-06, + "loss": 1.0281, + "step": 18580 + }, + { + "epoch": 1.87, + "grad_norm": 10.774195223815436, + "learning_rate": 3.712421984568593e-06, + "loss": 1.0158, + "step": 18585 + }, + { + "epoch": 1.87, + "grad_norm": 4.960282979334194, + "learning_rate": 3.7095881262855267e-06, + "loss": 0.9662, + "step": 18590 + }, + { + "epoch": 1.87, + "grad_norm": 7.978757212662668, + "learning_rate": 3.7067547120319554e-06, + "loss": 0.9774, + "step": 18595 + }, + { + "epoch": 1.88, + "grad_norm": 6.034834780122345, + "learning_rate": 3.7039217427828513e-06, + "loss": 0.9752, + "step": 18600 + }, + { + "epoch": 1.88, + "grad_norm": 15.004516099561922, + "learning_rate": 3.701089219513038e-06, + "loss": 0.9734, + "step": 18605 + }, + { + "epoch": 1.88, + "grad_norm": 9.402085580760845, + "learning_rate": 3.6982571431971824e-06, + "loss": 1.0251, + "step": 18610 + }, + { + "epoch": 1.88, + "grad_norm": 23.56500346844539, + "learning_rate": 3.695425514809802e-06, + "loss": 0.9798, + "step": 18615 + }, + { + "epoch": 1.88, + "grad_norm": 7.346783743132763, + "learning_rate": 3.692594335325255e-06, + "loss": 1.0118, + "step": 18620 + }, + { + "epoch": 1.88, + "grad_norm": 10.117378850707686, + "learning_rate": 3.68976360571775e-06, + "loss": 0.9884, + "step": 18625 + }, + { + "epoch": 1.88, + "grad_norm": 8.6563959867572, + "learning_rate": 3.686933326961335e-06, + "loss": 0.9778, + "step": 18630 + }, + { + "epoch": 1.88, + "grad_norm": 4.718362628760115, + "learning_rate": 3.6841035000299104e-06, + "loss": 0.9712, + "step": 18635 + }, + { + "epoch": 1.88, + "grad_norm": 11.901977228691198, + "learning_rate": 3.681274125897212e-06, + "loss": 1.0074, + "step": 18640 + }, + { + "epoch": 1.88, + "grad_norm": 5.376219367954663, + "learning_rate": 3.6784452055368293e-06, + "loss": 0.9927, + "step": 18645 + }, + { + "epoch": 1.88, + "grad_norm": 7.261738860162723, + "learning_rate": 3.6756167399221875e-06, + "loss": 0.9726, + "step": 18650 + }, + { + "epoch": 1.88, + "grad_norm": 11.322575516017178, + "learning_rate": 3.6727887300265623e-06, + "loss": 0.9967, + "step": 18655 + }, + { + "epoch": 1.88, + "grad_norm": 6.396128479592039, + "learning_rate": 3.669961176823065e-06, + "loss": 1.0004, + "step": 18660 + }, + { + "epoch": 1.88, + "grad_norm": 5.572859527055436, + "learning_rate": 3.66713408128466e-06, + "loss": 0.9574, + "step": 18665 + }, + { + "epoch": 1.88, + "grad_norm": 5.566152355775521, + "learning_rate": 3.664307444384141e-06, + "loss": 0.944, + "step": 18670 + }, + { + "epoch": 1.88, + "grad_norm": 6.814616898071068, + "learning_rate": 3.661481267094157e-06, + "loss": 0.9645, + "step": 18675 + }, + { + "epoch": 1.88, + "grad_norm": 7.4491753760139545, + "learning_rate": 3.658655550387191e-06, + "loss": 0.971, + "step": 18680 + }, + { + "epoch": 1.88, + "grad_norm": 9.650043757641736, + "learning_rate": 3.655830295235567e-06, + "loss": 0.994, + "step": 18685 + }, + { + "epoch": 1.88, + "grad_norm": 4.950233639909765, + "learning_rate": 3.6530055026114564e-06, + "loss": 0.9695, + "step": 18690 + }, + { + "epoch": 1.88, + "grad_norm": 10.336979569142324, + "learning_rate": 3.6501811734868644e-06, + "loss": 0.9697, + "step": 18695 + }, + { + "epoch": 1.89, + "grad_norm": 7.485667474533593, + "learning_rate": 3.6473573088336424e-06, + "loss": 0.9788, + "step": 18700 + }, + { + "epoch": 1.89, + "grad_norm": 11.737953968885742, + "learning_rate": 3.6445339096234765e-06, + "loss": 0.9528, + "step": 18705 + }, + { + "epoch": 1.89, + "grad_norm": 11.420294666661196, + "learning_rate": 3.6417109768279004e-06, + "loss": 0.9939, + "step": 18710 + }, + { + "epoch": 1.89, + "grad_norm": 18.063368906404644, + "learning_rate": 3.638888511418278e-06, + "loss": 0.9813, + "step": 18715 + }, + { + "epoch": 1.89, + "grad_norm": 8.256498293961954, + "learning_rate": 3.63606651436582e-06, + "loss": 0.946, + "step": 18720 + }, + { + "epoch": 1.89, + "grad_norm": 11.775343917226046, + "learning_rate": 3.633244986641571e-06, + "loss": 1.0092, + "step": 18725 + }, + { + "epoch": 1.89, + "grad_norm": 9.327990203243036, + "learning_rate": 3.630423929216417e-06, + "loss": 0.9956, + "step": 18730 + }, + { + "epoch": 1.89, + "grad_norm": 10.684176316275288, + "learning_rate": 3.6276033430610793e-06, + "loss": 0.9586, + "step": 18735 + }, + { + "epoch": 1.89, + "grad_norm": 8.370734052190713, + "learning_rate": 3.6247832291461226e-06, + "loss": 0.9927, + "step": 18740 + }, + { + "epoch": 1.89, + "grad_norm": 8.603847329973096, + "learning_rate": 3.621963588441942e-06, + "loss": 0.9735, + "step": 18745 + }, + { + "epoch": 1.89, + "grad_norm": 14.70117140483938, + "learning_rate": 3.6191444219187753e-06, + "loss": 0.913, + "step": 18750 + }, + { + "epoch": 1.89, + "grad_norm": 7.259241140398004, + "learning_rate": 3.6163257305466937e-06, + "loss": 1.0054, + "step": 18755 + }, + { + "epoch": 1.89, + "grad_norm": 5.831064986749914, + "learning_rate": 3.6135075152956074e-06, + "loss": 0.9717, + "step": 18760 + }, + { + "epoch": 1.89, + "grad_norm": 5.268942377717585, + "learning_rate": 3.610689777135259e-06, + "loss": 0.9348, + "step": 18765 + }, + { + "epoch": 1.89, + "grad_norm": 6.077651255287886, + "learning_rate": 3.6078725170352323e-06, + "loss": 0.9825, + "step": 18770 + }, + { + "epoch": 1.89, + "grad_norm": 6.9833842942122235, + "learning_rate": 3.605055735964943e-06, + "loss": 0.9822, + "step": 18775 + }, + { + "epoch": 1.89, + "grad_norm": 4.798592166369944, + "learning_rate": 3.6022394348936436e-06, + "loss": 0.9778, + "step": 18780 + }, + { + "epoch": 1.89, + "grad_norm": 5.34851456599785, + "learning_rate": 3.5994236147904182e-06, + "loss": 0.9623, + "step": 18785 + }, + { + "epoch": 1.89, + "grad_norm": 6.684841424069073, + "learning_rate": 3.5966082766241904e-06, + "loss": 0.9787, + "step": 18790 + }, + { + "epoch": 1.89, + "grad_norm": 11.479047892827804, + "learning_rate": 3.593793421363712e-06, + "loss": 0.9816, + "step": 18795 + }, + { + "epoch": 1.9, + "grad_norm": 7.040538898804096, + "learning_rate": 3.5909790499775765e-06, + "loss": 0.9905, + "step": 18800 + }, + { + "epoch": 1.9, + "grad_norm": 5.958641544227882, + "learning_rate": 3.588165163434203e-06, + "loss": 0.9728, + "step": 18805 + }, + { + "epoch": 1.9, + "grad_norm": 8.054065495967254, + "learning_rate": 3.5853517627018495e-06, + "loss": 0.9511, + "step": 18810 + }, + { + "epoch": 1.9, + "grad_norm": 5.890771687952744, + "learning_rate": 3.5825388487486034e-06, + "loss": 0.995, + "step": 18815 + }, + { + "epoch": 1.9, + "grad_norm": 5.0730729877051965, + "learning_rate": 3.5797264225423846e-06, + "loss": 0.9923, + "step": 18820 + }, + { + "epoch": 1.9, + "grad_norm": 6.636257745017197, + "learning_rate": 3.576914485050948e-06, + "loss": 0.9761, + "step": 18825 + }, + { + "epoch": 1.9, + "grad_norm": 6.803076414174225, + "learning_rate": 3.5741030372418763e-06, + "loss": 0.9786, + "step": 18830 + }, + { + "epoch": 1.9, + "grad_norm": 12.692665844872918, + "learning_rate": 3.571292080082589e-06, + "loss": 0.9742, + "step": 18835 + }, + { + "epoch": 1.9, + "grad_norm": 6.049139761426119, + "learning_rate": 3.568481614540332e-06, + "loss": 0.961, + "step": 18840 + }, + { + "epoch": 1.9, + "grad_norm": 7.999274994098982, + "learning_rate": 3.5656716415821848e-06, + "loss": 0.9545, + "step": 18845 + }, + { + "epoch": 1.9, + "grad_norm": 7.444020322274279, + "learning_rate": 3.5628621621750546e-06, + "loss": 0.9868, + "step": 18850 + }, + { + "epoch": 1.9, + "grad_norm": 6.935898973123527, + "learning_rate": 3.560053177285683e-06, + "loss": 1.0229, + "step": 18855 + }, + { + "epoch": 1.9, + "grad_norm": 8.652059327456081, + "learning_rate": 3.5572446878806356e-06, + "loss": 0.9985, + "step": 18860 + }, + { + "epoch": 1.9, + "grad_norm": 5.470800195437701, + "learning_rate": 3.5544366949263154e-06, + "loss": 0.974, + "step": 18865 + }, + { + "epoch": 1.9, + "grad_norm": 5.540500894353744, + "learning_rate": 3.5516291993889463e-06, + "loss": 0.9712, + "step": 18870 + }, + { + "epoch": 1.9, + "grad_norm": 4.903932248809819, + "learning_rate": 3.548822202234588e-06, + "loss": 1.0247, + "step": 18875 + }, + { + "epoch": 1.9, + "grad_norm": 5.255685901803376, + "learning_rate": 3.546015704429123e-06, + "loss": 0.9893, + "step": 18880 + }, + { + "epoch": 1.9, + "grad_norm": 7.7442920466087815, + "learning_rate": 3.5432097069382652e-06, + "loss": 0.9865, + "step": 18885 + }, + { + "epoch": 1.9, + "grad_norm": 6.451898088603783, + "learning_rate": 3.540404210727555e-06, + "loss": 1.0302, + "step": 18890 + }, + { + "epoch": 1.91, + "grad_norm": 5.078002477744629, + "learning_rate": 3.5375992167623635e-06, + "loss": 0.9517, + "step": 18895 + }, + { + "epoch": 1.91, + "grad_norm": 6.8441422291882414, + "learning_rate": 3.534794726007884e-06, + "loss": 1.0022, + "step": 18900 + }, + { + "epoch": 1.91, + "grad_norm": 4.820516107934559, + "learning_rate": 3.531990739429141e-06, + "loss": 0.9455, + "step": 18905 + }, + { + "epoch": 1.91, + "grad_norm": 6.986413654613702, + "learning_rate": 3.52918725799098e-06, + "loss": 0.9679, + "step": 18910 + }, + { + "epoch": 1.91, + "grad_norm": 5.527089802422374, + "learning_rate": 3.5263842826580813e-06, + "loss": 0.9618, + "step": 18915 + }, + { + "epoch": 1.91, + "grad_norm": 5.444281911919659, + "learning_rate": 3.523581814394941e-06, + "loss": 0.993, + "step": 18920 + }, + { + "epoch": 1.91, + "grad_norm": 6.782005747340329, + "learning_rate": 3.52077985416589e-06, + "loss": 0.9753, + "step": 18925 + }, + { + "epoch": 1.91, + "grad_norm": 8.74891044127738, + "learning_rate": 3.517978402935077e-06, + "loss": 0.9723, + "step": 18930 + }, + { + "epoch": 1.91, + "grad_norm": 9.758541435324947, + "learning_rate": 3.5151774616664825e-06, + "loss": 0.9603, + "step": 18935 + }, + { + "epoch": 1.91, + "grad_norm": 7.337683199317084, + "learning_rate": 3.5123770313239035e-06, + "loss": 0.9969, + "step": 18940 + }, + { + "epoch": 1.91, + "grad_norm": 6.297019442567877, + "learning_rate": 3.5095771128709678e-06, + "loss": 0.9789, + "step": 18945 + }, + { + "epoch": 1.91, + "grad_norm": 4.913586941747046, + "learning_rate": 3.506777707271124e-06, + "loss": 0.9492, + "step": 18950 + }, + { + "epoch": 1.91, + "grad_norm": 15.048817929045729, + "learning_rate": 3.503978815487643e-06, + "loss": 1.0014, + "step": 18955 + }, + { + "epoch": 1.91, + "grad_norm": 9.630358819181739, + "learning_rate": 3.501180438483626e-06, + "loss": 0.9702, + "step": 18960 + }, + { + "epoch": 1.91, + "grad_norm": 7.888623974867344, + "learning_rate": 3.4983825772219858e-06, + "loss": 0.9936, + "step": 18965 + }, + { + "epoch": 1.91, + "grad_norm": 4.889481060326786, + "learning_rate": 3.4955852326654683e-06, + "loss": 0.987, + "step": 18970 + }, + { + "epoch": 1.91, + "grad_norm": 7.851092223043249, + "learning_rate": 3.4927884057766344e-06, + "loss": 0.9532, + "step": 18975 + }, + { + "epoch": 1.91, + "grad_norm": 21.96452203997625, + "learning_rate": 3.4899920975178714e-06, + "loss": 0.9801, + "step": 18980 + }, + { + "epoch": 1.91, + "grad_norm": 16.662043974615656, + "learning_rate": 3.4871963088513833e-06, + "loss": 0.9786, + "step": 18985 + }, + { + "epoch": 1.91, + "grad_norm": 5.3324544231113515, + "learning_rate": 3.4844010407392035e-06, + "loss": 0.9684, + "step": 18990 + }, + { + "epoch": 1.92, + "grad_norm": 8.410229456737007, + "learning_rate": 3.481606294143175e-06, + "loss": 0.9266, + "step": 18995 + }, + { + "epoch": 1.92, + "grad_norm": 7.723097172662212, + "learning_rate": 3.4788120700249726e-06, + "loss": 0.967, + "step": 19000 + }, + { + "epoch": 1.92, + "grad_norm": 6.042198027911561, + "learning_rate": 3.476018369346083e-06, + "loss": 1.0244, + "step": 19005 + }, + { + "epoch": 1.92, + "grad_norm": 10.48724264317641, + "learning_rate": 3.4732251930678184e-06, + "loss": 0.9755, + "step": 19010 + }, + { + "epoch": 1.92, + "grad_norm": 6.070287251272303, + "learning_rate": 3.470432542151304e-06, + "loss": 0.9775, + "step": 19015 + }, + { + "epoch": 1.92, + "grad_norm": 7.767740288674974, + "learning_rate": 3.467640417557493e-06, + "loss": 0.983, + "step": 19020 + }, + { + "epoch": 1.92, + "grad_norm": 6.745129466956522, + "learning_rate": 3.4648488202471497e-06, + "loss": 0.9818, + "step": 19025 + }, + { + "epoch": 1.92, + "grad_norm": 12.653727961877179, + "learning_rate": 3.4620577511808623e-06, + "loss": 0.9817, + "step": 19030 + }, + { + "epoch": 1.92, + "grad_norm": 9.797091615802747, + "learning_rate": 3.459267211319033e-06, + "loss": 1.0586, + "step": 19035 + }, + { + "epoch": 1.92, + "grad_norm": 11.293391404977797, + "learning_rate": 3.456477201621885e-06, + "loss": 0.9574, + "step": 19040 + }, + { + "epoch": 1.92, + "grad_norm": 5.905429069319764, + "learning_rate": 3.4536877230494562e-06, + "loss": 0.9757, + "step": 19045 + }, + { + "epoch": 1.92, + "grad_norm": 5.478601961947565, + "learning_rate": 3.4508987765616075e-06, + "loss": 0.9561, + "step": 19050 + }, + { + "epoch": 1.92, + "grad_norm": 12.63894396760112, + "learning_rate": 3.4481103631180087e-06, + "loss": 0.9583, + "step": 19055 + }, + { + "epoch": 1.92, + "grad_norm": 6.789057388553128, + "learning_rate": 3.4453224836781526e-06, + "loss": 1.0379, + "step": 19060 + }, + { + "epoch": 1.92, + "grad_norm": 7.728543474370344, + "learning_rate": 3.442535139201345e-06, + "loss": 0.9697, + "step": 19065 + }, + { + "epoch": 1.92, + "grad_norm": 8.968113376227555, + "learning_rate": 3.439748330646707e-06, + "loss": 0.9938, + "step": 19070 + }, + { + "epoch": 1.92, + "grad_norm": 11.356592255128312, + "learning_rate": 3.43696205897318e-06, + "loss": 0.9788, + "step": 19075 + }, + { + "epoch": 1.92, + "grad_norm": 15.07420793847312, + "learning_rate": 3.434176325139513e-06, + "loss": 0.9548, + "step": 19080 + }, + { + "epoch": 1.92, + "grad_norm": 12.565583992430048, + "learning_rate": 3.4313911301042813e-06, + "loss": 1.0289, + "step": 19085 + }, + { + "epoch": 1.92, + "grad_norm": 12.439259933471051, + "learning_rate": 3.4286064748258597e-06, + "loss": 0.9554, + "step": 19090 + }, + { + "epoch": 1.93, + "grad_norm": 12.951354380025263, + "learning_rate": 3.4258223602624507e-06, + "loss": 1.0033, + "step": 19095 + }, + { + "epoch": 1.93, + "grad_norm": 7.20930073969771, + "learning_rate": 3.4230387873720637e-06, + "loss": 0.9792, + "step": 19100 + }, + { + "epoch": 1.93, + "grad_norm": 7.283083435209268, + "learning_rate": 3.4202557571125245e-06, + "loss": 0.9718, + "step": 19105 + }, + { + "epoch": 1.93, + "grad_norm": 5.211683982481918, + "learning_rate": 3.417473270441468e-06, + "loss": 0.9909, + "step": 19110 + }, + { + "epoch": 1.93, + "grad_norm": 12.417928327129392, + "learning_rate": 3.414691328316352e-06, + "loss": 0.9361, + "step": 19115 + }, + { + "epoch": 1.93, + "grad_norm": 5.890075038626392, + "learning_rate": 3.4119099316944315e-06, + "loss": 0.9219, + "step": 19120 + }, + { + "epoch": 1.93, + "grad_norm": 15.139042437693465, + "learning_rate": 3.4091290815327893e-06, + "loss": 0.9656, + "step": 19125 + }, + { + "epoch": 1.93, + "grad_norm": 6.149139570733197, + "learning_rate": 3.40634877878831e-06, + "loss": 0.9365, + "step": 19130 + }, + { + "epoch": 1.93, + "grad_norm": 14.084743495998689, + "learning_rate": 3.4035690244176946e-06, + "loss": 0.9949, + "step": 19135 + }, + { + "epoch": 1.93, + "grad_norm": 6.122993624208971, + "learning_rate": 3.4007898193774503e-06, + "loss": 0.9489, + "step": 19140 + }, + { + "epoch": 1.93, + "grad_norm": 6.45783174606669, + "learning_rate": 3.3980111646239056e-06, + "loss": 0.9574, + "step": 19145 + }, + { + "epoch": 1.93, + "grad_norm": 17.3945960690069, + "learning_rate": 3.3952330611131857e-06, + "loss": 1.0276, + "step": 19150 + }, + { + "epoch": 1.93, + "grad_norm": 9.510608770178116, + "learning_rate": 3.392455509801239e-06, + "loss": 0.9959, + "step": 19155 + }, + { + "epoch": 1.93, + "grad_norm": 5.704575515763283, + "learning_rate": 3.3896785116438145e-06, + "loss": 0.9894, + "step": 19160 + }, + { + "epoch": 1.93, + "grad_norm": 10.801312030785498, + "learning_rate": 3.3869020675964777e-06, + "loss": 1.0108, + "step": 19165 + }, + { + "epoch": 1.93, + "grad_norm": 8.595579495569428, + "learning_rate": 3.3841261786145963e-06, + "loss": 0.9596, + "step": 19170 + }, + { + "epoch": 1.93, + "grad_norm": 5.823529261050941, + "learning_rate": 3.3813508456533566e-06, + "loss": 1.0178, + "step": 19175 + }, + { + "epoch": 1.93, + "grad_norm": 11.252521117776325, + "learning_rate": 3.3785760696677424e-06, + "loss": 0.966, + "step": 19180 + }, + { + "epoch": 1.93, + "grad_norm": 8.811292947117277, + "learning_rate": 3.3758018516125556e-06, + "loss": 0.9748, + "step": 19185 + }, + { + "epoch": 1.93, + "grad_norm": 4.916508964969845, + "learning_rate": 3.3730281924423993e-06, + "loss": 0.9212, + "step": 19190 + }, + { + "epoch": 1.94, + "grad_norm": 26.873127984778893, + "learning_rate": 3.370255093111689e-06, + "loss": 0.9369, + "step": 19195 + }, + { + "epoch": 1.94, + "grad_norm": 14.070726015621453, + "learning_rate": 3.367482554574645e-06, + "loss": 1.0289, + "step": 19200 + }, + { + "epoch": 1.94, + "grad_norm": 5.370934105921357, + "learning_rate": 3.364710577785293e-06, + "loss": 0.9708, + "step": 19205 + }, + { + "epoch": 1.94, + "grad_norm": 4.821227512347984, + "learning_rate": 3.361939163697471e-06, + "loss": 0.9436, + "step": 19210 + }, + { + "epoch": 1.94, + "grad_norm": 6.814182159827276, + "learning_rate": 3.3591683132648156e-06, + "loss": 0.9094, + "step": 19215 + }, + { + "epoch": 1.94, + "grad_norm": 7.957443256059856, + "learning_rate": 3.3563980274407793e-06, + "loss": 0.9984, + "step": 19220 + }, + { + "epoch": 1.94, + "grad_norm": 6.935732607255355, + "learning_rate": 3.3536283071786103e-06, + "loss": 1.0013, + "step": 19225 + }, + { + "epoch": 1.94, + "grad_norm": 11.855885109582275, + "learning_rate": 3.3508591534313697e-06, + "loss": 0.9671, + "step": 19230 + }, + { + "epoch": 1.94, + "grad_norm": 5.1394970315511355, + "learning_rate": 3.3480905671519183e-06, + "loss": 0.9849, + "step": 19235 + }, + { + "epoch": 1.94, + "grad_norm": 5.813034310776297, + "learning_rate": 3.3453225492929255e-06, + "loss": 1.0069, + "step": 19240 + }, + { + "epoch": 1.94, + "grad_norm": 6.7170498460960255, + "learning_rate": 3.3425551008068613e-06, + "loss": 0.9643, + "step": 19245 + }, + { + "epoch": 1.94, + "grad_norm": 6.363155157270702, + "learning_rate": 3.339788222646006e-06, + "loss": 0.9958, + "step": 19250 + }, + { + "epoch": 1.94, + "grad_norm": 11.452011498593757, + "learning_rate": 3.3370219157624363e-06, + "loss": 1.0531, + "step": 19255 + }, + { + "epoch": 1.94, + "grad_norm": 5.577825743787345, + "learning_rate": 3.334256181108039e-06, + "loss": 0.9736, + "step": 19260 + }, + { + "epoch": 1.94, + "grad_norm": 16.352973183288714, + "learning_rate": 3.331491019634496e-06, + "loss": 0.987, + "step": 19265 + }, + { + "epoch": 1.94, + "grad_norm": 8.162757418382164, + "learning_rate": 3.3287264322933023e-06, + "loss": 0.9249, + "step": 19270 + }, + { + "epoch": 1.94, + "grad_norm": 20.131892472404356, + "learning_rate": 3.3259624200357444e-06, + "loss": 0.979, + "step": 19275 + }, + { + "epoch": 1.94, + "grad_norm": 5.198175991509779, + "learning_rate": 3.323198983812921e-06, + "loss": 0.953, + "step": 19280 + }, + { + "epoch": 1.94, + "grad_norm": 5.437106390168473, + "learning_rate": 3.3204361245757242e-06, + "loss": 0.9871, + "step": 19285 + }, + { + "epoch": 1.94, + "grad_norm": 7.5062723968359535, + "learning_rate": 3.3176738432748547e-06, + "loss": 1.0045, + "step": 19290 + }, + { + "epoch": 1.95, + "grad_norm": 5.170068060454293, + "learning_rate": 3.3149121408608063e-06, + "loss": 0.9387, + "step": 19295 + }, + { + "epoch": 1.95, + "grad_norm": 13.939999849704586, + "learning_rate": 3.3121510182838833e-06, + "loss": 0.9707, + "step": 19300 + }, + { + "epoch": 1.95, + "grad_norm": 13.588659684745384, + "learning_rate": 3.3093904764941804e-06, + "loss": 1.0645, + "step": 19305 + }, + { + "epoch": 1.95, + "grad_norm": 5.260854066268349, + "learning_rate": 3.306630516441602e-06, + "loss": 1.0016, + "step": 19310 + }, + { + "epoch": 1.95, + "grad_norm": 5.626575610265271, + "learning_rate": 3.3038711390758436e-06, + "loss": 0.9482, + "step": 19315 + }, + { + "epoch": 1.95, + "grad_norm": 6.659383220388749, + "learning_rate": 3.3011123453464088e-06, + "loss": 0.9535, + "step": 19320 + }, + { + "epoch": 1.95, + "grad_norm": 17.15180703898682, + "learning_rate": 3.298354136202592e-06, + "loss": 0.9649, + "step": 19325 + }, + { + "epoch": 1.95, + "grad_norm": 5.692113458158882, + "learning_rate": 3.2955965125934913e-06, + "loss": 0.9702, + "step": 19330 + }, + { + "epoch": 1.95, + "grad_norm": 5.830959718212891, + "learning_rate": 3.2928394754680037e-06, + "loss": 0.9656, + "step": 19335 + }, + { + "epoch": 1.95, + "grad_norm": 6.298851888358607, + "learning_rate": 3.29008302577482e-06, + "loss": 1.0396, + "step": 19340 + }, + { + "epoch": 1.95, + "grad_norm": 10.562326510048589, + "learning_rate": 3.287327164462436e-06, + "loss": 0.9582, + "step": 19345 + }, + { + "epoch": 1.95, + "grad_norm": 7.80680385409349, + "learning_rate": 3.284571892479138e-06, + "loss": 0.9311, + "step": 19350 + }, + { + "epoch": 1.95, + "grad_norm": 7.586249629628685, + "learning_rate": 3.281817210773014e-06, + "loss": 0.9851, + "step": 19355 + }, + { + "epoch": 1.95, + "grad_norm": 7.72997718645396, + "learning_rate": 3.279063120291946e-06, + "loss": 0.9776, + "step": 19360 + }, + { + "epoch": 1.95, + "grad_norm": 5.134953220297068, + "learning_rate": 3.2763096219836153e-06, + "loss": 1.0368, + "step": 19365 + }, + { + "epoch": 1.95, + "grad_norm": 20.821681569213887, + "learning_rate": 3.273556716795496e-06, + "loss": 0.9414, + "step": 19370 + }, + { + "epoch": 1.95, + "grad_norm": 7.563927082040128, + "learning_rate": 3.270804405674863e-06, + "loss": 0.968, + "step": 19375 + }, + { + "epoch": 1.95, + "grad_norm": 15.408947715661137, + "learning_rate": 3.2680526895687813e-06, + "loss": 0.9527, + "step": 19380 + }, + { + "epoch": 1.95, + "grad_norm": 16.582628546020413, + "learning_rate": 3.2653015694241166e-06, + "loss": 0.9667, + "step": 19385 + }, + { + "epoch": 1.95, + "grad_norm": 6.582224579638049, + "learning_rate": 3.2625510461875238e-06, + "loss": 0.9966, + "step": 19390 + }, + { + "epoch": 1.96, + "grad_norm": 5.985900200531787, + "learning_rate": 3.259801120805457e-06, + "loss": 0.9541, + "step": 19395 + }, + { + "epoch": 1.96, + "grad_norm": 6.757010012138148, + "learning_rate": 3.257051794224161e-06, + "loss": 0.9379, + "step": 19400 + }, + { + "epoch": 1.96, + "grad_norm": 5.71433219862553, + "learning_rate": 3.25430306738968e-06, + "loss": 0.944, + "step": 19405 + }, + { + "epoch": 1.96, + "grad_norm": 6.147343628896916, + "learning_rate": 3.2515549412478446e-06, + "loss": 0.9509, + "step": 19410 + }, + { + "epoch": 1.96, + "grad_norm": 6.409899027959961, + "learning_rate": 3.2488074167442852e-06, + "loss": 0.9639, + "step": 19415 + }, + { + "epoch": 1.96, + "grad_norm": 6.426059907250027, + "learning_rate": 3.24606049482442e-06, + "loss": 0.9638, + "step": 19420 + }, + { + "epoch": 1.96, + "grad_norm": 6.580312941706315, + "learning_rate": 3.2433141764334654e-06, + "loss": 0.914, + "step": 19425 + }, + { + "epoch": 1.96, + "grad_norm": 20.71500348245436, + "learning_rate": 3.2405684625164213e-06, + "loss": 0.9696, + "step": 19430 + }, + { + "epoch": 1.96, + "grad_norm": 9.001431989529735, + "learning_rate": 3.2378233540180913e-06, + "loss": 0.938, + "step": 19435 + }, + { + "epoch": 1.96, + "grad_norm": 25.185513924432797, + "learning_rate": 3.2350788518830617e-06, + "loss": 0.9514, + "step": 19440 + }, + { + "epoch": 1.96, + "grad_norm": 7.790210054110683, + "learning_rate": 3.232334957055714e-06, + "loss": 0.9617, + "step": 19445 + }, + { + "epoch": 1.96, + "grad_norm": 8.369432007064662, + "learning_rate": 3.2295916704802194e-06, + "loss": 0.9363, + "step": 19450 + }, + { + "epoch": 1.96, + "grad_norm": 5.8460514741934, + "learning_rate": 3.2268489931005385e-06, + "loss": 0.9848, + "step": 19455 + }, + { + "epoch": 1.96, + "grad_norm": 14.77954889982426, + "learning_rate": 3.2241069258604273e-06, + "loss": 0.9394, + "step": 19460 + }, + { + "epoch": 1.96, + "grad_norm": 4.971049752470999, + "learning_rate": 3.2213654697034246e-06, + "loss": 0.9953, + "step": 19465 + }, + { + "epoch": 1.96, + "grad_norm": 9.583984995729754, + "learning_rate": 3.218624625572867e-06, + "loss": 0.9636, + "step": 19470 + }, + { + "epoch": 1.96, + "grad_norm": 12.056659589103473, + "learning_rate": 3.2158843944118735e-06, + "loss": 1.0098, + "step": 19475 + }, + { + "epoch": 1.96, + "grad_norm": 5.1372502108362665, + "learning_rate": 3.213144777163356e-06, + "loss": 0.9576, + "step": 19480 + }, + { + "epoch": 1.96, + "grad_norm": 8.742973523888802, + "learning_rate": 3.2104057747700133e-06, + "loss": 1.012, + "step": 19485 + }, + { + "epoch": 1.97, + "grad_norm": 4.8678377346229755, + "learning_rate": 3.207667388174336e-06, + "loss": 0.9343, + "step": 19490 + }, + { + "epoch": 1.97, + "grad_norm": 4.993960911914766, + "learning_rate": 3.204929618318596e-06, + "loss": 0.9503, + "step": 19495 + }, + { + "epoch": 1.97, + "grad_norm": 7.897187382469572, + "learning_rate": 3.2021924661448617e-06, + "loss": 0.9685, + "step": 19500 + }, + { + "epoch": 1.97, + "grad_norm": 8.06865849915388, + "learning_rate": 3.199455932594982e-06, + "loss": 0.9807, + "step": 19505 + }, + { + "epoch": 1.97, + "grad_norm": 5.445746655511793, + "learning_rate": 3.1967200186105975e-06, + "loss": 0.9827, + "step": 19510 + }, + { + "epoch": 1.97, + "grad_norm": 8.303285449271074, + "learning_rate": 3.19398472513313e-06, + "loss": 0.9557, + "step": 19515 + }, + { + "epoch": 1.97, + "grad_norm": 5.096354209566563, + "learning_rate": 3.191250053103796e-06, + "loss": 1.0006, + "step": 19520 + }, + { + "epoch": 1.97, + "grad_norm": 9.044059243281657, + "learning_rate": 3.1885160034635885e-06, + "loss": 0.9825, + "step": 19525 + }, + { + "epoch": 1.97, + "grad_norm": 8.238962956982464, + "learning_rate": 3.185782577153296e-06, + "loss": 0.986, + "step": 19530 + }, + { + "epoch": 1.97, + "grad_norm": 8.294757392118468, + "learning_rate": 3.183049775113485e-06, + "loss": 0.947, + "step": 19535 + }, + { + "epoch": 1.97, + "grad_norm": 10.90709327082817, + "learning_rate": 3.1803175982845124e-06, + "loss": 0.9887, + "step": 19540 + }, + { + "epoch": 1.97, + "grad_norm": 7.1987411375622825, + "learning_rate": 3.1775860476065147e-06, + "loss": 0.9221, + "step": 19545 + }, + { + "epoch": 1.97, + "grad_norm": 5.084108859024125, + "learning_rate": 3.1748551240194192e-06, + "loss": 0.992, + "step": 19550 + }, + { + "epoch": 1.97, + "grad_norm": 4.930419426018079, + "learning_rate": 3.1721248284629293e-06, + "loss": 0.9964, + "step": 19555 + }, + { + "epoch": 1.97, + "grad_norm": 5.018806857675019, + "learning_rate": 3.169395161876543e-06, + "loss": 0.9697, + "step": 19560 + }, + { + "epoch": 1.97, + "grad_norm": 9.210110614847695, + "learning_rate": 3.1666661251995313e-06, + "loss": 0.9453, + "step": 19565 + }, + { + "epoch": 1.97, + "grad_norm": 5.061761204356091, + "learning_rate": 3.1639377193709564e-06, + "loss": 0.9771, + "step": 19570 + }, + { + "epoch": 1.97, + "grad_norm": 5.907670275453538, + "learning_rate": 3.1612099453296575e-06, + "loss": 0.9937, + "step": 19575 + }, + { + "epoch": 1.97, + "grad_norm": 6.268881229619832, + "learning_rate": 3.158482804014261e-06, + "loss": 0.9415, + "step": 19580 + }, + { + "epoch": 1.97, + "grad_norm": 6.410057104629809, + "learning_rate": 3.1557562963631728e-06, + "loss": 0.9235, + "step": 19585 + }, + { + "epoch": 1.98, + "grad_norm": 6.225154512292546, + "learning_rate": 3.153030423314579e-06, + "loss": 0.9581, + "step": 19590 + }, + { + "epoch": 1.98, + "grad_norm": 7.5527394175980005, + "learning_rate": 3.150305185806457e-06, + "loss": 0.9861, + "step": 19595 + }, + { + "epoch": 1.98, + "grad_norm": 5.332123472990733, + "learning_rate": 3.1475805847765497e-06, + "loss": 0.993, + "step": 19600 + }, + { + "epoch": 1.98, + "grad_norm": 10.367055598439357, + "learning_rate": 3.1448566211623964e-06, + "loss": 1.009, + "step": 19605 + }, + { + "epoch": 1.98, + "grad_norm": 6.04981083164468, + "learning_rate": 3.1421332959013063e-06, + "loss": 0.9901, + "step": 19610 + }, + { + "epoch": 1.98, + "grad_norm": 7.850619560943523, + "learning_rate": 3.1394106099303766e-06, + "loss": 0.9711, + "step": 19615 + }, + { + "epoch": 1.98, + "grad_norm": 5.190983677356039, + "learning_rate": 3.1366885641864764e-06, + "loss": 0.974, + "step": 19620 + }, + { + "epoch": 1.98, + "grad_norm": 12.919928057436149, + "learning_rate": 3.1339671596062642e-06, + "loss": 0.9464, + "step": 19625 + }, + { + "epoch": 1.98, + "grad_norm": 11.208963956216042, + "learning_rate": 3.1312463971261675e-06, + "loss": 0.9159, + "step": 19630 + }, + { + "epoch": 1.98, + "grad_norm": 10.998212896605247, + "learning_rate": 3.1285262776824033e-06, + "loss": 0.918, + "step": 19635 + }, + { + "epoch": 1.98, + "grad_norm": 6.484371252595751, + "learning_rate": 3.125806802210958e-06, + "loss": 0.9785, + "step": 19640 + }, + { + "epoch": 1.98, + "grad_norm": 10.07006168798367, + "learning_rate": 3.1230879716476027e-06, + "loss": 0.9834, + "step": 19645 + }, + { + "epoch": 1.98, + "grad_norm": 8.719973146813349, + "learning_rate": 3.1203697869278815e-06, + "loss": 0.9671, + "step": 19650 + }, + { + "epoch": 1.98, + "grad_norm": 9.906113261291717, + "learning_rate": 3.117652248987125e-06, + "loss": 0.9734, + "step": 19655 + }, + { + "epoch": 1.98, + "grad_norm": 6.461870871497509, + "learning_rate": 3.1149353587604282e-06, + "loss": 0.9461, + "step": 19660 + }, + { + "epoch": 1.98, + "grad_norm": 5.962793748592961, + "learning_rate": 3.1122191171826765e-06, + "loss": 0.974, + "step": 19665 + }, + { + "epoch": 1.98, + "grad_norm": 4.555865802855921, + "learning_rate": 3.109503525188522e-06, + "loss": 0.9898, + "step": 19670 + }, + { + "epoch": 1.98, + "grad_norm": 8.682060107258698, + "learning_rate": 3.1067885837124003e-06, + "loss": 0.9276, + "step": 19675 + }, + { + "epoch": 1.98, + "grad_norm": 8.964687635868797, + "learning_rate": 3.104074293688517e-06, + "loss": 0.9366, + "step": 19680 + }, + { + "epoch": 1.98, + "grad_norm": 6.943606394724827, + "learning_rate": 3.1013606560508606e-06, + "loss": 0.9712, + "step": 19685 + }, + { + "epoch": 1.99, + "grad_norm": 15.623945247762496, + "learning_rate": 3.098647671733187e-06, + "loss": 0.9687, + "step": 19690 + }, + { + "epoch": 1.99, + "grad_norm": 5.5336412205470475, + "learning_rate": 3.0959353416690354e-06, + "loss": 0.9732, + "step": 19695 + }, + { + "epoch": 1.99, + "grad_norm": 21.16146004449491, + "learning_rate": 3.0932236667917128e-06, + "loss": 0.989, + "step": 19700 + }, + { + "epoch": 1.99, + "grad_norm": 8.607128787122527, + "learning_rate": 3.0905126480343057e-06, + "loss": 0.9715, + "step": 19705 + }, + { + "epoch": 1.99, + "grad_norm": 5.665532346332486, + "learning_rate": 3.087802286329673e-06, + "loss": 0.9226, + "step": 19710 + }, + { + "epoch": 1.99, + "grad_norm": 11.992099425488396, + "learning_rate": 3.085092582610446e-06, + "loss": 0.9462, + "step": 19715 + }, + { + "epoch": 1.99, + "grad_norm": 5.808277797298123, + "learning_rate": 3.0823835378090344e-06, + "loss": 1.006, + "step": 19720 + }, + { + "epoch": 1.99, + "grad_norm": 5.481945211686297, + "learning_rate": 3.0796751528576137e-06, + "loss": 0.9797, + "step": 19725 + }, + { + "epoch": 1.99, + "grad_norm": 5.420100717721782, + "learning_rate": 3.07696742868814e-06, + "loss": 0.9554, + "step": 19730 + }, + { + "epoch": 1.99, + "grad_norm": 6.261498471883031, + "learning_rate": 3.074260366232337e-06, + "loss": 0.9386, + "step": 19735 + }, + { + "epoch": 1.99, + "grad_norm": 7.262869495772408, + "learning_rate": 3.071553966421703e-06, + "loss": 0.9791, + "step": 19740 + }, + { + "epoch": 1.99, + "grad_norm": 8.01776762367467, + "learning_rate": 3.068848230187506e-06, + "loss": 0.9706, + "step": 19745 + }, + { + "epoch": 1.99, + "grad_norm": 16.350092017822924, + "learning_rate": 3.0661431584607917e-06, + "loss": 0.9205, + "step": 19750 + }, + { + "epoch": 1.99, + "grad_norm": 9.02862210737073, + "learning_rate": 3.063438752172366e-06, + "loss": 0.9602, + "step": 19755 + }, + { + "epoch": 1.99, + "grad_norm": 9.624701360428293, + "learning_rate": 3.060735012252818e-06, + "loss": 0.9675, + "step": 19760 + }, + { + "epoch": 1.99, + "grad_norm": 10.022757663300407, + "learning_rate": 3.058031939632499e-06, + "loss": 0.9546, + "step": 19765 + }, + { + "epoch": 1.99, + "grad_norm": 11.299124192747938, + "learning_rate": 3.0553295352415356e-06, + "loss": 0.9571, + "step": 19770 + }, + { + "epoch": 1.99, + "grad_norm": 6.8246198096307005, + "learning_rate": 3.0526278000098208e-06, + "loss": 0.9435, + "step": 19775 + }, + { + "epoch": 1.99, + "grad_norm": 18.1215474019368, + "learning_rate": 3.0499267348670225e-06, + "loss": 0.9766, + "step": 19780 + }, + { + "epoch": 1.99, + "grad_norm": 7.868272802586499, + "learning_rate": 3.047226340742569e-06, + "loss": 0.9472, + "step": 19785 + }, + { + "epoch": 2.0, + "grad_norm": 5.174261883529199, + "learning_rate": 3.0445266185656685e-06, + "loss": 0.9222, + "step": 19790 + }, + { + "epoch": 2.0, + "grad_norm": 5.064743033532646, + "learning_rate": 3.04182756926529e-06, + "loss": 0.986, + "step": 19795 + }, + { + "epoch": 2.0, + "grad_norm": 5.872229895463509, + "learning_rate": 3.0391291937701755e-06, + "loss": 0.9669, + "step": 19800 + }, + { + "epoch": 2.0, + "grad_norm": 5.4018475622049, + "learning_rate": 3.0364314930088312e-06, + "loss": 0.9524, + "step": 19805 + }, + { + "epoch": 2.0, + "grad_norm": 25.0373690123754, + "learning_rate": 3.0337344679095373e-06, + "loss": 0.9465, + "step": 19810 + }, + { + "epoch": 2.0, + "grad_norm": 10.512944296977459, + "learning_rate": 3.0310381194003312e-06, + "loss": 1.0136, + "step": 19815 + }, + { + "epoch": 2.0, + "grad_norm": 26.48421656057948, + "learning_rate": 3.0283424484090306e-06, + "loss": 0.9404, + "step": 19820 + }, + { + "epoch": 2.0, + "grad_norm": 11.807803981551405, + "learning_rate": 3.0256474558632093e-06, + "loss": 0.9365, + "step": 19825 + }, + { + "epoch": 2.0, + "grad_norm": 4.631833071132508, + "learning_rate": 3.022953142690214e-06, + "loss": 0.9636, + "step": 19830 + }, + { + "epoch": 2.0, + "grad_norm": 5.990611013751563, + "learning_rate": 3.0202595098171527e-06, + "loss": 0.9332, + "step": 19835 + }, + { + "epoch": 2.0, + "eval_loss": 1.0710164308547974, + "eval_runtime": 25.431, + "eval_samples_per_second": 31.694, + "eval_steps_per_second": 3.972, + "step": 19837 + }, + { + "epoch": 2.0, + "grad_norm": 10.07847082531632, + "learning_rate": 3.0175665581709035e-06, + "loss": 0.8202, + "step": 19840 + }, + { + "epoch": 2.0, + "grad_norm": 8.554418428839272, + "learning_rate": 3.014874288678109e-06, + "loss": 0.708, + "step": 19845 + }, + { + "epoch": 2.0, + "grad_norm": 5.866761074974754, + "learning_rate": 3.012182702265173e-06, + "loss": 0.6866, + "step": 19850 + }, + { + "epoch": 2.0, + "grad_norm": 5.599634041133625, + "learning_rate": 3.0094917998582737e-06, + "loss": 0.693, + "step": 19855 + }, + { + "epoch": 2.0, + "grad_norm": 8.495053387713696, + "learning_rate": 3.006801582383343e-06, + "loss": 0.6998, + "step": 19860 + }, + { + "epoch": 2.0, + "grad_norm": 17.031744168199783, + "learning_rate": 3.0041120507660854e-06, + "loss": 0.6845, + "step": 19865 + }, + { + "epoch": 2.0, + "grad_norm": 5.8690008816324, + "learning_rate": 3.0014232059319636e-06, + "loss": 0.6585, + "step": 19870 + }, + { + "epoch": 2.0, + "grad_norm": 5.786581846929866, + "learning_rate": 2.9987350488062088e-06, + "loss": 0.7137, + "step": 19875 + }, + { + "epoch": 2.0, + "grad_norm": 4.322402483787055, + "learning_rate": 2.9960475803138105e-06, + "loss": 0.6975, + "step": 19880 + }, + { + "epoch": 2.0, + "grad_norm": 4.447412321402005, + "learning_rate": 2.993360801379528e-06, + "loss": 0.6839, + "step": 19885 + }, + { + "epoch": 2.01, + "grad_norm": 6.652872882405284, + "learning_rate": 2.9906747129278756e-06, + "loss": 0.6754, + "step": 19890 + }, + { + "epoch": 2.01, + "grad_norm": 5.959284438123414, + "learning_rate": 2.9879893158831368e-06, + "loss": 0.7099, + "step": 19895 + }, + { + "epoch": 2.01, + "grad_norm": 11.276126342509745, + "learning_rate": 2.985304611169352e-06, + "loss": 0.7156, + "step": 19900 + }, + { + "epoch": 2.01, + "grad_norm": 6.270239118964066, + "learning_rate": 2.9826205997103264e-06, + "loss": 0.7028, + "step": 19905 + }, + { + "epoch": 2.01, + "grad_norm": 5.870637246845116, + "learning_rate": 2.979937282429625e-06, + "loss": 0.6825, + "step": 19910 + }, + { + "epoch": 2.01, + "grad_norm": 6.8819565680372135, + "learning_rate": 2.9772546602505758e-06, + "loss": 0.6809, + "step": 19915 + }, + { + "epoch": 2.01, + "grad_norm": 5.735775614992868, + "learning_rate": 2.974572734096266e-06, + "loss": 0.6958, + "step": 19920 + }, + { + "epoch": 2.01, + "grad_norm": 5.125437627956788, + "learning_rate": 2.9718915048895442e-06, + "loss": 0.7067, + "step": 19925 + }, + { + "epoch": 2.01, + "grad_norm": 5.669127427169447, + "learning_rate": 2.9692109735530183e-06, + "loss": 0.6444, + "step": 19930 + }, + { + "epoch": 2.01, + "grad_norm": 12.186094264136873, + "learning_rate": 2.966531141009057e-06, + "loss": 0.6882, + "step": 19935 + }, + { + "epoch": 2.01, + "grad_norm": 16.473372562403895, + "learning_rate": 2.9638520081797862e-06, + "loss": 0.6951, + "step": 19940 + }, + { + "epoch": 2.01, + "grad_norm": 5.946284342434288, + "learning_rate": 2.9611735759870963e-06, + "loss": 0.7039, + "step": 19945 + }, + { + "epoch": 2.01, + "grad_norm": 14.49126548825676, + "learning_rate": 2.9584958453526315e-06, + "loss": 0.7073, + "step": 19950 + }, + { + "epoch": 2.01, + "grad_norm": 21.897553417108384, + "learning_rate": 2.955818817197797e-06, + "loss": 0.6855, + "step": 19955 + }, + { + "epoch": 2.01, + "grad_norm": 5.509726325927025, + "learning_rate": 2.9531424924437535e-06, + "loss": 0.6387, + "step": 19960 + }, + { + "epoch": 2.01, + "grad_norm": 14.647114585979262, + "learning_rate": 2.950466872011425e-06, + "loss": 0.7173, + "step": 19965 + }, + { + "epoch": 2.01, + "grad_norm": 19.553340817495304, + "learning_rate": 2.9477919568214886e-06, + "loss": 0.7056, + "step": 19970 + }, + { + "epoch": 2.01, + "grad_norm": 7.480061284531499, + "learning_rate": 2.9451177477943772e-06, + "loss": 0.6694, + "step": 19975 + }, + { + "epoch": 2.01, + "grad_norm": 4.73237332843965, + "learning_rate": 2.9424442458502884e-06, + "loss": 0.6902, + "step": 19980 + }, + { + "epoch": 2.01, + "grad_norm": 4.257596743266556, + "learning_rate": 2.939771451909168e-06, + "loss": 0.6621, + "step": 19985 + }, + { + "epoch": 2.02, + "grad_norm": 7.007980217929251, + "learning_rate": 2.937099366890725e-06, + "loss": 0.6742, + "step": 19990 + }, + { + "epoch": 2.02, + "grad_norm": 6.684722771405095, + "learning_rate": 2.934427991714418e-06, + "loss": 0.6924, + "step": 19995 + }, + { + "epoch": 2.02, + "grad_norm": 6.459688562079147, + "learning_rate": 2.931757327299467e-06, + "loss": 0.6764, + "step": 20000 + }, + { + "epoch": 2.02, + "grad_norm": 7.166925967390257, + "learning_rate": 2.929087374564843e-06, + "loss": 0.7016, + "step": 20005 + }, + { + "epoch": 2.02, + "grad_norm": 5.216661904637062, + "learning_rate": 2.9264181344292775e-06, + "loss": 0.688, + "step": 20010 + }, + { + "epoch": 2.02, + "grad_norm": 4.799432873080406, + "learning_rate": 2.9237496078112493e-06, + "loss": 0.6843, + "step": 20015 + }, + { + "epoch": 2.02, + "grad_norm": 4.919357986122606, + "learning_rate": 2.921081795629e-06, + "loss": 0.6726, + "step": 20020 + }, + { + "epoch": 2.02, + "grad_norm": 8.222286865949679, + "learning_rate": 2.918414698800517e-06, + "loss": 0.6846, + "step": 20025 + }, + { + "epoch": 2.02, + "grad_norm": 4.497129639848889, + "learning_rate": 2.91574831824355e-06, + "loss": 0.6942, + "step": 20030 + }, + { + "epoch": 2.02, + "grad_norm": 4.741374508164342, + "learning_rate": 2.913082654875596e-06, + "loss": 0.722, + "step": 20035 + }, + { + "epoch": 2.02, + "grad_norm": 4.86965920427053, + "learning_rate": 2.9104177096139073e-06, + "loss": 0.6915, + "step": 20040 + }, + { + "epoch": 2.02, + "grad_norm": 9.343518561020797, + "learning_rate": 2.9077534833754866e-06, + "loss": 0.685, + "step": 20045 + }, + { + "epoch": 2.02, + "grad_norm": 5.7595663507858905, + "learning_rate": 2.9050899770770964e-06, + "loss": 0.6682, + "step": 20050 + }, + { + "epoch": 2.02, + "grad_norm": 5.569107304442746, + "learning_rate": 2.902427191635242e-06, + "loss": 0.6701, + "step": 20055 + }, + { + "epoch": 2.02, + "grad_norm": 4.941058043606878, + "learning_rate": 2.8997651279661893e-06, + "loss": 0.6646, + "step": 20060 + }, + { + "epoch": 2.02, + "grad_norm": 5.2021071826259995, + "learning_rate": 2.8971037869859503e-06, + "loss": 0.6754, + "step": 20065 + }, + { + "epoch": 2.02, + "grad_norm": 23.181178572182155, + "learning_rate": 2.8944431696102898e-06, + "loss": 0.6937, + "step": 20070 + }, + { + "epoch": 2.02, + "grad_norm": 8.262592504800057, + "learning_rate": 2.8917832767547215e-06, + "loss": 0.6674, + "step": 20075 + }, + { + "epoch": 2.02, + "grad_norm": 8.445147355846368, + "learning_rate": 2.8891241093345163e-06, + "loss": 0.7094, + "step": 20080 + }, + { + "epoch": 2.03, + "grad_norm": 5.449205286580153, + "learning_rate": 2.886465668264686e-06, + "loss": 0.6716, + "step": 20085 + }, + { + "epoch": 2.03, + "grad_norm": 15.904881029885123, + "learning_rate": 2.8838079544600038e-06, + "loss": 0.6855, + "step": 20090 + }, + { + "epoch": 2.03, + "grad_norm": 7.317897665887902, + "learning_rate": 2.881150968834983e-06, + "loss": 0.6476, + "step": 20095 + }, + { + "epoch": 2.03, + "grad_norm": 22.960176156048963, + "learning_rate": 2.87849471230389e-06, + "loss": 0.6739, + "step": 20100 + }, + { + "epoch": 2.03, + "grad_norm": 12.81723029382713, + "learning_rate": 2.87583918578074e-06, + "loss": 0.7035, + "step": 20105 + }, + { + "epoch": 2.03, + "grad_norm": 6.429570917860615, + "learning_rate": 2.8731843901792955e-06, + "loss": 0.6859, + "step": 20110 + }, + { + "epoch": 2.03, + "grad_norm": 16.864528516068667, + "learning_rate": 2.8705303264130734e-06, + "loss": 0.6718, + "step": 20115 + }, + { + "epoch": 2.03, + "grad_norm": 5.239644650544476, + "learning_rate": 2.86787699539533e-06, + "loss": 0.655, + "step": 20120 + }, + { + "epoch": 2.03, + "grad_norm": 5.1466601613579686, + "learning_rate": 2.8652243980390778e-06, + "loss": 0.7119, + "step": 20125 + }, + { + "epoch": 2.03, + "grad_norm": 6.394354689435088, + "learning_rate": 2.862572535257071e-06, + "loss": 0.7041, + "step": 20130 + }, + { + "epoch": 2.03, + "grad_norm": 19.978558529974322, + "learning_rate": 2.8599214079618137e-06, + "loss": 0.6777, + "step": 20135 + }, + { + "epoch": 2.03, + "grad_norm": 5.013234273687217, + "learning_rate": 2.857271017065554e-06, + "loss": 0.6787, + "step": 20140 + }, + { + "epoch": 2.03, + "grad_norm": 7.637030733686688, + "learning_rate": 2.854621363480292e-06, + "loss": 0.7093, + "step": 20145 + }, + { + "epoch": 2.03, + "grad_norm": 6.422025071823492, + "learning_rate": 2.851972448117769e-06, + "loss": 0.7019, + "step": 20150 + }, + { + "epoch": 2.03, + "grad_norm": 9.61149127644432, + "learning_rate": 2.8493242718894765e-06, + "loss": 0.6949, + "step": 20155 + }, + { + "epoch": 2.03, + "grad_norm": 6.036412049392237, + "learning_rate": 2.8466768357066475e-06, + "loss": 0.6843, + "step": 20160 + }, + { + "epoch": 2.03, + "grad_norm": 12.36780256872287, + "learning_rate": 2.844030140480263e-06, + "loss": 0.7137, + "step": 20165 + }, + { + "epoch": 2.03, + "grad_norm": 5.8956494643916875, + "learning_rate": 2.8413841871210456e-06, + "loss": 0.6885, + "step": 20170 + }, + { + "epoch": 2.03, + "grad_norm": 5.501107518944112, + "learning_rate": 2.83873897653947e-06, + "loss": 0.6672, + "step": 20175 + }, + { + "epoch": 2.03, + "grad_norm": 5.34334702345557, + "learning_rate": 2.836094509645747e-06, + "loss": 0.6899, + "step": 20180 + }, + { + "epoch": 2.04, + "grad_norm": 7.976060684480714, + "learning_rate": 2.8334507873498373e-06, + "loss": 0.6979, + "step": 20185 + }, + { + "epoch": 2.04, + "grad_norm": 4.765949760846961, + "learning_rate": 2.8308078105614433e-06, + "loss": 0.6678, + "step": 20190 + }, + { + "epoch": 2.04, + "grad_norm": 5.483903474155636, + "learning_rate": 2.8281655801900098e-06, + "loss": 0.6894, + "step": 20195 + }, + { + "epoch": 2.04, + "grad_norm": 5.11059505030005, + "learning_rate": 2.8255240971447227e-06, + "loss": 0.6772, + "step": 20200 + }, + { + "epoch": 2.04, + "grad_norm": 4.625996552622085, + "learning_rate": 2.8228833623345205e-06, + "loss": 0.71, + "step": 20205 + }, + { + "epoch": 2.04, + "grad_norm": 5.572562001223029, + "learning_rate": 2.820243376668071e-06, + "loss": 0.6595, + "step": 20210 + }, + { + "epoch": 2.04, + "grad_norm": 5.759923303950589, + "learning_rate": 2.817604141053796e-06, + "loss": 0.7204, + "step": 20215 + }, + { + "epoch": 2.04, + "grad_norm": 6.061007775325533, + "learning_rate": 2.8149656563998513e-06, + "loss": 0.6661, + "step": 20220 + }, + { + "epoch": 2.04, + "grad_norm": 4.473550927111044, + "learning_rate": 2.8123279236141378e-06, + "loss": 0.6577, + "step": 20225 + }, + { + "epoch": 2.04, + "grad_norm": 6.747404862096733, + "learning_rate": 2.809690943604295e-06, + "loss": 0.6944, + "step": 20230 + }, + { + "epoch": 2.04, + "grad_norm": 12.424278453040909, + "learning_rate": 2.8070547172777063e-06, + "loss": 0.6621, + "step": 20235 + }, + { + "epoch": 2.04, + "grad_norm": 5.392454233816305, + "learning_rate": 2.8044192455414955e-06, + "loss": 0.675, + "step": 20240 + }, + { + "epoch": 2.04, + "grad_norm": 4.785097695138289, + "learning_rate": 2.8017845293025243e-06, + "loss": 0.6736, + "step": 20245 + }, + { + "epoch": 2.04, + "grad_norm": 6.860239130549238, + "learning_rate": 2.7991505694673983e-06, + "loss": 0.7173, + "step": 20250 + }, + { + "epoch": 2.04, + "grad_norm": 5.029730559126252, + "learning_rate": 2.7965173669424596e-06, + "loss": 0.6879, + "step": 20255 + }, + { + "epoch": 2.04, + "grad_norm": 9.141573323808583, + "learning_rate": 2.7938849226337905e-06, + "loss": 0.6831, + "step": 20260 + }, + { + "epoch": 2.04, + "grad_norm": 4.803515943071768, + "learning_rate": 2.79125323744721e-06, + "loss": 0.6667, + "step": 20265 + }, + { + "epoch": 2.04, + "grad_norm": 4.768831960299563, + "learning_rate": 2.7886223122882833e-06, + "loss": 0.6589, + "step": 20270 + }, + { + "epoch": 2.04, + "grad_norm": 5.363161119443387, + "learning_rate": 2.7859921480623048e-06, + "loss": 0.6545, + "step": 20275 + }, + { + "epoch": 2.04, + "grad_norm": 6.738430484972305, + "learning_rate": 2.783362745674315e-06, + "loss": 0.6556, + "step": 20280 + }, + { + "epoch": 2.05, + "grad_norm": 5.157022650903519, + "learning_rate": 2.780734106029088e-06, + "loss": 0.6538, + "step": 20285 + }, + { + "epoch": 2.05, + "grad_norm": 8.027913613784868, + "learning_rate": 2.7781062300311357e-06, + "loss": 0.6428, + "step": 20290 + }, + { + "epoch": 2.05, + "grad_norm": 7.702997038291723, + "learning_rate": 2.7754791185847045e-06, + "loss": 0.6968, + "step": 20295 + }, + { + "epoch": 2.05, + "grad_norm": 4.703632225890155, + "learning_rate": 2.7728527725937866e-06, + "loss": 0.7008, + "step": 20300 + }, + { + "epoch": 2.05, + "grad_norm": 4.326646266181257, + "learning_rate": 2.7702271929621004e-06, + "loss": 0.6359, + "step": 20305 + }, + { + "epoch": 2.05, + "grad_norm": 7.596957107489121, + "learning_rate": 2.7676023805931096e-06, + "loss": 0.6543, + "step": 20310 + }, + { + "epoch": 2.05, + "grad_norm": 4.4689293897811275, + "learning_rate": 2.764978336390007e-06, + "loss": 0.6543, + "step": 20315 + }, + { + "epoch": 2.05, + "grad_norm": 4.592063799128114, + "learning_rate": 2.762355061255724e-06, + "loss": 0.6456, + "step": 20320 + }, + { + "epoch": 2.05, + "grad_norm": 8.724734726353038, + "learning_rate": 2.7597325560929254e-06, + "loss": 0.6652, + "step": 20325 + }, + { + "epoch": 2.05, + "grad_norm": 4.5520198384323365, + "learning_rate": 2.7571108218040172e-06, + "loss": 0.6667, + "step": 20330 + }, + { + "epoch": 2.05, + "grad_norm": 4.7006276737748305, + "learning_rate": 2.7544898592911305e-06, + "loss": 0.6714, + "step": 20335 + }, + { + "epoch": 2.05, + "grad_norm": 7.035710997650199, + "learning_rate": 2.7518696694561402e-06, + "loss": 0.663, + "step": 20340 + }, + { + "epoch": 2.05, + "grad_norm": 5.984785241111086, + "learning_rate": 2.7492502532006498e-06, + "loss": 0.6972, + "step": 20345 + }, + { + "epoch": 2.05, + "grad_norm": 12.48809346091373, + "learning_rate": 2.746631611425997e-06, + "loss": 0.6877, + "step": 20350 + }, + { + "epoch": 2.05, + "grad_norm": 4.822505788690949, + "learning_rate": 2.744013745033256e-06, + "loss": 0.6788, + "step": 20355 + }, + { + "epoch": 2.05, + "grad_norm": 8.309126432519314, + "learning_rate": 2.7413966549232274e-06, + "loss": 0.6215, + "step": 20360 + }, + { + "epoch": 2.05, + "grad_norm": 9.376856836130756, + "learning_rate": 2.7387803419964547e-06, + "loss": 0.6749, + "step": 20365 + }, + { + "epoch": 2.05, + "grad_norm": 4.955550699410014, + "learning_rate": 2.736164807153205e-06, + "loss": 0.6753, + "step": 20370 + }, + { + "epoch": 2.05, + "grad_norm": 12.158760477300898, + "learning_rate": 2.7335500512934844e-06, + "loss": 0.657, + "step": 20375 + }, + { + "epoch": 2.05, + "grad_norm": 7.839000626662179, + "learning_rate": 2.7309360753170267e-06, + "loss": 0.698, + "step": 20380 + }, + { + "epoch": 2.06, + "grad_norm": 4.465886089969563, + "learning_rate": 2.7283228801232985e-06, + "loss": 0.6795, + "step": 20385 + }, + { + "epoch": 2.06, + "grad_norm": 11.638003524690347, + "learning_rate": 2.725710466611495e-06, + "loss": 0.6578, + "step": 20390 + }, + { + "epoch": 2.06, + "grad_norm": 15.555120482903076, + "learning_rate": 2.7230988356805502e-06, + "loss": 0.6611, + "step": 20395 + }, + { + "epoch": 2.06, + "grad_norm": 6.007908263827729, + "learning_rate": 2.7204879882291195e-06, + "loss": 0.711, + "step": 20400 + }, + { + "epoch": 2.06, + "grad_norm": 9.52058789317427, + "learning_rate": 2.7178779251555963e-06, + "loss": 0.686, + "step": 20405 + }, + { + "epoch": 2.06, + "grad_norm": 8.290434833099377, + "learning_rate": 2.7152686473581e-06, + "loss": 0.6597, + "step": 20410 + }, + { + "epoch": 2.06, + "grad_norm": 6.865850991334291, + "learning_rate": 2.71266015573448e-06, + "loss": 0.6921, + "step": 20415 + }, + { + "epoch": 2.06, + "grad_norm": 7.093957021899156, + "learning_rate": 2.7100524511823134e-06, + "loss": 0.6589, + "step": 20420 + }, + { + "epoch": 2.06, + "grad_norm": 12.990284071684462, + "learning_rate": 2.7074455345989128e-06, + "loss": 0.6583, + "step": 20425 + }, + { + "epoch": 2.06, + "grad_norm": 8.045778809474987, + "learning_rate": 2.7048394068813127e-06, + "loss": 0.6724, + "step": 20430 + }, + { + "epoch": 2.06, + "grad_norm": 5.497086333725814, + "learning_rate": 2.7022340689262827e-06, + "loss": 0.6372, + "step": 20435 + }, + { + "epoch": 2.06, + "grad_norm": 12.37157407740874, + "learning_rate": 2.6996295216303147e-06, + "loss": 0.6974, + "step": 20440 + }, + { + "epoch": 2.06, + "grad_norm": 8.893442888000223, + "learning_rate": 2.697025765889631e-06, + "loss": 0.6669, + "step": 20445 + }, + { + "epoch": 2.06, + "grad_norm": 5.408198005557214, + "learning_rate": 2.6944228026001795e-06, + "loss": 0.6511, + "step": 20450 + }, + { + "epoch": 2.06, + "grad_norm": 5.167101911384638, + "learning_rate": 2.6918206326576424e-06, + "loss": 0.6442, + "step": 20455 + }, + { + "epoch": 2.06, + "grad_norm": 4.727500287228669, + "learning_rate": 2.6892192569574183e-06, + "loss": 0.6843, + "step": 20460 + }, + { + "epoch": 2.06, + "grad_norm": 4.898389179651797, + "learning_rate": 2.6866186763946426e-06, + "loss": 0.6685, + "step": 20465 + }, + { + "epoch": 2.06, + "grad_norm": 5.218251143268944, + "learning_rate": 2.684018891864172e-06, + "loss": 0.6747, + "step": 20470 + }, + { + "epoch": 2.06, + "grad_norm": 4.851991764395169, + "learning_rate": 2.681419904260587e-06, + "loss": 0.6963, + "step": 20475 + }, + { + "epoch": 2.06, + "grad_norm": 6.618853238599201, + "learning_rate": 2.6788217144781995e-06, + "loss": 0.6651, + "step": 20480 + }, + { + "epoch": 2.07, + "grad_norm": 4.94027051266893, + "learning_rate": 2.6762243234110415e-06, + "loss": 0.688, + "step": 20485 + }, + { + "epoch": 2.07, + "grad_norm": 8.675509761541878, + "learning_rate": 2.6736277319528757e-06, + "loss": 0.7261, + "step": 20490 + }, + { + "epoch": 2.07, + "grad_norm": 8.93388076006707, + "learning_rate": 2.6710319409971837e-06, + "loss": 0.7099, + "step": 20495 + }, + { + "epoch": 2.07, + "grad_norm": 5.709835978825093, + "learning_rate": 2.66843695143718e-06, + "loss": 0.7102, + "step": 20500 + }, + { + "epoch": 2.07, + "grad_norm": 5.807707734312976, + "learning_rate": 2.6658427641657903e-06, + "loss": 0.6731, + "step": 20505 + }, + { + "epoch": 2.07, + "grad_norm": 4.902988562763788, + "learning_rate": 2.6632493800756787e-06, + "loss": 0.6528, + "step": 20510 + }, + { + "epoch": 2.07, + "grad_norm": 6.921829168490332, + "learning_rate": 2.6606568000592214e-06, + "loss": 0.697, + "step": 20515 + }, + { + "epoch": 2.07, + "grad_norm": 5.402784949687757, + "learning_rate": 2.658065025008526e-06, + "loss": 0.6611, + "step": 20520 + }, + { + "epoch": 2.07, + "grad_norm": 4.516261871930669, + "learning_rate": 2.655474055815418e-06, + "loss": 0.6624, + "step": 20525 + }, + { + "epoch": 2.07, + "grad_norm": 5.161141461701427, + "learning_rate": 2.6528838933714514e-06, + "loss": 0.6762, + "step": 20530 + }, + { + "epoch": 2.07, + "grad_norm": 4.941845927863708, + "learning_rate": 2.6502945385678913e-06, + "loss": 0.6946, + "step": 20535 + }, + { + "epoch": 2.07, + "grad_norm": 5.659199780653852, + "learning_rate": 2.6477059922957384e-06, + "loss": 0.6783, + "step": 20540 + }, + { + "epoch": 2.07, + "grad_norm": 7.124417827033684, + "learning_rate": 2.645118255445705e-06, + "loss": 0.6573, + "step": 20545 + }, + { + "epoch": 2.07, + "grad_norm": 4.85559312595928, + "learning_rate": 2.642531328908232e-06, + "loss": 0.6704, + "step": 20550 + }, + { + "epoch": 2.07, + "grad_norm": 4.770810220827577, + "learning_rate": 2.639945213573476e-06, + "loss": 0.6672, + "step": 20555 + }, + { + "epoch": 2.07, + "grad_norm": 6.784653605858367, + "learning_rate": 2.637359910331322e-06, + "loss": 0.6597, + "step": 20560 + }, + { + "epoch": 2.07, + "grad_norm": 5.517438292447058, + "learning_rate": 2.6347754200713616e-06, + "loss": 0.6762, + "step": 20565 + }, + { + "epoch": 2.07, + "grad_norm": 7.121687089288357, + "learning_rate": 2.6321917436829226e-06, + "loss": 0.676, + "step": 20570 + }, + { + "epoch": 2.07, + "grad_norm": 5.757356735754648, + "learning_rate": 2.6296088820550402e-06, + "loss": 0.6445, + "step": 20575 + }, + { + "epoch": 2.07, + "grad_norm": 9.034457247639295, + "learning_rate": 2.6270268360764807e-06, + "loss": 0.689, + "step": 20580 + }, + { + "epoch": 2.08, + "grad_norm": 5.2361163711605565, + "learning_rate": 2.6244456066357176e-06, + "loss": 0.6888, + "step": 20585 + }, + { + "epoch": 2.08, + "grad_norm": 4.735153898853402, + "learning_rate": 2.6218651946209543e-06, + "loss": 0.7077, + "step": 20590 + }, + { + "epoch": 2.08, + "grad_norm": 6.132323732813553, + "learning_rate": 2.619285600920107e-06, + "loss": 0.6814, + "step": 20595 + }, + { + "epoch": 2.08, + "grad_norm": 4.56491215584217, + "learning_rate": 2.6167068264208096e-06, + "loss": 0.6309, + "step": 20600 + }, + { + "epoch": 2.08, + "grad_norm": 4.420679368081689, + "learning_rate": 2.6141288720104153e-06, + "loss": 0.6567, + "step": 20605 + }, + { + "epoch": 2.08, + "grad_norm": 4.678522405810549, + "learning_rate": 2.611551738576e-06, + "loss": 0.6773, + "step": 20610 + }, + { + "epoch": 2.08, + "grad_norm": 4.886865696835705, + "learning_rate": 2.6089754270043493e-06, + "loss": 0.6665, + "step": 20615 + }, + { + "epoch": 2.08, + "grad_norm": 5.956127785995881, + "learning_rate": 2.6063999381819686e-06, + "loss": 0.6539, + "step": 20620 + }, + { + "epoch": 2.08, + "grad_norm": 13.519226867216945, + "learning_rate": 2.6038252729950877e-06, + "loss": 0.695, + "step": 20625 + }, + { + "epoch": 2.08, + "grad_norm": 4.7267256993890925, + "learning_rate": 2.601251432329637e-06, + "loss": 0.6967, + "step": 20630 + }, + { + "epoch": 2.08, + "grad_norm": 4.823013090894927, + "learning_rate": 2.5986784170712797e-06, + "loss": 0.7371, + "step": 20635 + }, + { + "epoch": 2.08, + "grad_norm": 4.526737098118042, + "learning_rate": 2.5961062281053838e-06, + "loss": 0.6591, + "step": 20640 + }, + { + "epoch": 2.08, + "grad_norm": 9.339338277834594, + "learning_rate": 2.59353486631704e-06, + "loss": 0.6972, + "step": 20645 + }, + { + "epoch": 2.08, + "grad_norm": 7.65317210480881, + "learning_rate": 2.5909643325910495e-06, + "loss": 0.6889, + "step": 20650 + }, + { + "epoch": 2.08, + "grad_norm": 5.956832370518871, + "learning_rate": 2.588394627811934e-06, + "loss": 0.709, + "step": 20655 + }, + { + "epoch": 2.08, + "grad_norm": 5.067865571460534, + "learning_rate": 2.5858257528639208e-06, + "loss": 0.713, + "step": 20660 + }, + { + "epoch": 2.08, + "grad_norm": 7.12917377194101, + "learning_rate": 2.583257708630962e-06, + "loss": 0.626, + "step": 20665 + }, + { + "epoch": 2.08, + "grad_norm": 5.907861023980704, + "learning_rate": 2.580690495996715e-06, + "loss": 0.6633, + "step": 20670 + }, + { + "epoch": 2.08, + "grad_norm": 7.7192338329918355, + "learning_rate": 2.578124115844559e-06, + "loss": 0.6593, + "step": 20675 + }, + { + "epoch": 2.08, + "grad_norm": 10.403310083930933, + "learning_rate": 2.5755585690575808e-06, + "loss": 0.6623, + "step": 20680 + }, + { + "epoch": 2.09, + "grad_norm": 5.040667531817449, + "learning_rate": 2.572993856518587e-06, + "loss": 0.6759, + "step": 20685 + }, + { + "epoch": 2.09, + "grad_norm": 5.139888145277165, + "learning_rate": 2.570429979110085e-06, + "loss": 0.6965, + "step": 20690 + }, + { + "epoch": 2.09, + "grad_norm": 12.342407045299352, + "learning_rate": 2.5678669377143082e-06, + "loss": 0.6686, + "step": 20695 + }, + { + "epoch": 2.09, + "grad_norm": 8.900244543868103, + "learning_rate": 2.5653047332131943e-06, + "loss": 0.6473, + "step": 20700 + }, + { + "epoch": 2.09, + "grad_norm": 5.137426285227035, + "learning_rate": 2.562743366488397e-06, + "loss": 0.6422, + "step": 20705 + }, + { + "epoch": 2.09, + "grad_norm": 6.895202332079677, + "learning_rate": 2.560182838421279e-06, + "loss": 0.633, + "step": 20710 + }, + { + "epoch": 2.09, + "grad_norm": 4.653129772734897, + "learning_rate": 2.5576231498929187e-06, + "loss": 0.6608, + "step": 20715 + }, + { + "epoch": 2.09, + "grad_norm": 6.0133160033021715, + "learning_rate": 2.5550643017840966e-06, + "loss": 0.6672, + "step": 20720 + }, + { + "epoch": 2.09, + "grad_norm": 5.671373077072103, + "learning_rate": 2.5525062949753143e-06, + "loss": 0.6669, + "step": 20725 + }, + { + "epoch": 2.09, + "grad_norm": 5.510727721272181, + "learning_rate": 2.549949130346777e-06, + "loss": 0.6768, + "step": 20730 + }, + { + "epoch": 2.09, + "grad_norm": 4.834607513541353, + "learning_rate": 2.547392808778405e-06, + "loss": 0.7069, + "step": 20735 + }, + { + "epoch": 2.09, + "grad_norm": 4.60320911064297, + "learning_rate": 2.544837331149826e-06, + "loss": 0.6845, + "step": 20740 + }, + { + "epoch": 2.09, + "grad_norm": 4.333675816136024, + "learning_rate": 2.542282698340376e-06, + "loss": 0.6757, + "step": 20745 + }, + { + "epoch": 2.09, + "grad_norm": 4.605720986577917, + "learning_rate": 2.5397289112291016e-06, + "loss": 0.6295, + "step": 20750 + }, + { + "epoch": 2.09, + "grad_norm": 9.79014322646118, + "learning_rate": 2.5371759706947575e-06, + "loss": 0.6647, + "step": 20755 + }, + { + "epoch": 2.09, + "grad_norm": 8.1075613008192, + "learning_rate": 2.5346238776158115e-06, + "loss": 0.6826, + "step": 20760 + }, + { + "epoch": 2.09, + "grad_norm": 6.475051681081209, + "learning_rate": 2.532072632870433e-06, + "loss": 0.6881, + "step": 20765 + }, + { + "epoch": 2.09, + "grad_norm": 4.665453634646515, + "learning_rate": 2.5295222373365056e-06, + "loss": 0.6741, + "step": 20770 + }, + { + "epoch": 2.09, + "grad_norm": 5.6573931101364465, + "learning_rate": 2.526972691891617e-06, + "loss": 0.661, + "step": 20775 + }, + { + "epoch": 2.1, + "grad_norm": 8.297920740915522, + "learning_rate": 2.5244239974130637e-06, + "loss": 0.6929, + "step": 20780 + }, + { + "epoch": 2.1, + "grad_norm": 6.549276471188224, + "learning_rate": 2.5218761547778457e-06, + "loss": 0.6911, + "step": 20785 + }, + { + "epoch": 2.1, + "grad_norm": 5.854245160085682, + "learning_rate": 2.519329164862678e-06, + "loss": 0.6927, + "step": 20790 + }, + { + "epoch": 2.1, + "grad_norm": 4.596145059140424, + "learning_rate": 2.5167830285439743e-06, + "loss": 0.6333, + "step": 20795 + }, + { + "epoch": 2.1, + "grad_norm": 6.27937273157563, + "learning_rate": 2.5142377466978598e-06, + "loss": 0.688, + "step": 20800 + }, + { + "epoch": 2.1, + "grad_norm": 8.834287310597626, + "learning_rate": 2.5116933202001627e-06, + "loss": 0.683, + "step": 20805 + }, + { + "epoch": 2.1, + "grad_norm": 6.1546276439154175, + "learning_rate": 2.509149749926417e-06, + "loss": 0.6488, + "step": 20810 + }, + { + "epoch": 2.1, + "grad_norm": 4.963786781997544, + "learning_rate": 2.5066070367518624e-06, + "loss": 0.6578, + "step": 20815 + }, + { + "epoch": 2.1, + "grad_norm": 4.901247568524524, + "learning_rate": 2.5040651815514465e-06, + "loss": 0.6575, + "step": 20820 + }, + { + "epoch": 2.1, + "grad_norm": 4.8133936919905835, + "learning_rate": 2.5015241851998156e-06, + "loss": 0.6654, + "step": 20825 + }, + { + "epoch": 2.1, + "grad_norm": 9.8028683121827, + "learning_rate": 2.4989840485713287e-06, + "loss": 0.6362, + "step": 20830 + }, + { + "epoch": 2.1, + "grad_norm": 11.757108818862323, + "learning_rate": 2.4964447725400417e-06, + "loss": 0.6458, + "step": 20835 + }, + { + "epoch": 2.1, + "grad_norm": 5.1336993367309995, + "learning_rate": 2.493906357979717e-06, + "loss": 0.6796, + "step": 20840 + }, + { + "epoch": 2.1, + "grad_norm": 4.6868173155668975, + "learning_rate": 2.491368805763819e-06, + "loss": 0.6749, + "step": 20845 + }, + { + "epoch": 2.1, + "grad_norm": 6.181757307196955, + "learning_rate": 2.488832116765521e-06, + "loss": 0.6935, + "step": 20850 + }, + { + "epoch": 2.1, + "grad_norm": 4.879231867941652, + "learning_rate": 2.486296291857691e-06, + "loss": 0.6466, + "step": 20855 + }, + { + "epoch": 2.1, + "grad_norm": 5.210616346246946, + "learning_rate": 2.4837613319129082e-06, + "loss": 0.6493, + "step": 20860 + }, + { + "epoch": 2.1, + "grad_norm": 5.092954661412917, + "learning_rate": 2.481227237803448e-06, + "loss": 0.686, + "step": 20865 + }, + { + "epoch": 2.1, + "grad_norm": 5.902697895006476, + "learning_rate": 2.4786940104012897e-06, + "loss": 0.6926, + "step": 20870 + }, + { + "epoch": 2.1, + "grad_norm": 7.0086173784932, + "learning_rate": 2.4761616505781137e-06, + "loss": 0.6361, + "step": 20875 + }, + { + "epoch": 2.11, + "grad_norm": 5.193635820587068, + "learning_rate": 2.473630159205302e-06, + "loss": 0.6792, + "step": 20880 + }, + { + "epoch": 2.11, + "grad_norm": 6.165631840813355, + "learning_rate": 2.471099537153941e-06, + "loss": 0.654, + "step": 20885 + }, + { + "epoch": 2.11, + "grad_norm": 4.861817478106466, + "learning_rate": 2.468569785294812e-06, + "loss": 0.6763, + "step": 20890 + }, + { + "epoch": 2.11, + "grad_norm": 5.372317983836856, + "learning_rate": 2.466040904498404e-06, + "loss": 0.6674, + "step": 20895 + }, + { + "epoch": 2.11, + "grad_norm": 4.755747744451209, + "learning_rate": 2.463512895634901e-06, + "loss": 0.6508, + "step": 20900 + }, + { + "epoch": 2.11, + "grad_norm": 5.040387676166835, + "learning_rate": 2.460985759574187e-06, + "loss": 0.6563, + "step": 20905 + }, + { + "epoch": 2.11, + "grad_norm": 4.968898382731428, + "learning_rate": 2.4584594971858473e-06, + "loss": 0.6549, + "step": 20910 + }, + { + "epoch": 2.11, + "grad_norm": 8.786369127215906, + "learning_rate": 2.455934109339168e-06, + "loss": 0.6701, + "step": 20915 + }, + { + "epoch": 2.11, + "grad_norm": 5.362652153951624, + "learning_rate": 2.453409596903131e-06, + "loss": 0.6565, + "step": 20920 + }, + { + "epoch": 2.11, + "grad_norm": 5.037224859768098, + "learning_rate": 2.4508859607464203e-06, + "loss": 0.69, + "step": 20925 + }, + { + "epoch": 2.11, + "grad_norm": 6.183266255986752, + "learning_rate": 2.4483632017374167e-06, + "loss": 0.6915, + "step": 20930 + }, + { + "epoch": 2.11, + "grad_norm": 4.745109900608311, + "learning_rate": 2.445841320744198e-06, + "loss": 0.705, + "step": 20935 + }, + { + "epoch": 2.11, + "grad_norm": 10.187762151918701, + "learning_rate": 2.443320318634539e-06, + "loss": 0.6788, + "step": 20940 + }, + { + "epoch": 2.11, + "grad_norm": 5.243981756066541, + "learning_rate": 2.4408001962759187e-06, + "loss": 0.6579, + "step": 20945 + }, + { + "epoch": 2.11, + "grad_norm": 5.436368028087012, + "learning_rate": 2.4382809545355046e-06, + "loss": 0.6857, + "step": 20950 + }, + { + "epoch": 2.11, + "grad_norm": 11.060669721055994, + "learning_rate": 2.4357625942801694e-06, + "loss": 0.658, + "step": 20955 + }, + { + "epoch": 2.11, + "grad_norm": 7.9540665812641596, + "learning_rate": 2.4332451163764765e-06, + "loss": 0.6839, + "step": 20960 + }, + { + "epoch": 2.11, + "grad_norm": 4.879592711398314, + "learning_rate": 2.4307285216906866e-06, + "loss": 0.6852, + "step": 20965 + }, + { + "epoch": 2.11, + "grad_norm": 9.395266201019412, + "learning_rate": 2.4282128110887575e-06, + "loss": 0.6396, + "step": 20970 + }, + { + "epoch": 2.11, + "grad_norm": 10.715951603539619, + "learning_rate": 2.4256979854363453e-06, + "loss": 0.6795, + "step": 20975 + }, + { + "epoch": 2.12, + "grad_norm": 5.113152703376329, + "learning_rate": 2.423184045598796e-06, + "loss": 0.6761, + "step": 20980 + }, + { + "epoch": 2.12, + "grad_norm": 8.185557904992484, + "learning_rate": 2.420670992441157e-06, + "loss": 0.7158, + "step": 20985 + }, + { + "epoch": 2.12, + "grad_norm": 5.8309932391804375, + "learning_rate": 2.4181588268281656e-06, + "loss": 0.6775, + "step": 20990 + }, + { + "epoch": 2.12, + "grad_norm": 5.352726397807516, + "learning_rate": 2.4156475496242563e-06, + "loss": 0.6462, + "step": 20995 + }, + { + "epoch": 2.12, + "grad_norm": 5.074028530793119, + "learning_rate": 2.413137161693557e-06, + "loss": 0.6394, + "step": 21000 + }, + { + "epoch": 2.12, + "grad_norm": 4.806118859847987, + "learning_rate": 2.410627663899888e-06, + "loss": 0.6965, + "step": 21005 + }, + { + "epoch": 2.12, + "grad_norm": 7.303205623748911, + "learning_rate": 2.4081190571067688e-06, + "loss": 0.6612, + "step": 21010 + }, + { + "epoch": 2.12, + "grad_norm": 6.7635101057624665, + "learning_rate": 2.4056113421774036e-06, + "loss": 0.6534, + "step": 21015 + }, + { + "epoch": 2.12, + "grad_norm": 6.0749282619508, + "learning_rate": 2.4031045199746998e-06, + "loss": 0.7072, + "step": 21020 + }, + { + "epoch": 2.12, + "grad_norm": 4.909661154884081, + "learning_rate": 2.4005985913612507e-06, + "loss": 0.6823, + "step": 21025 + }, + { + "epoch": 2.12, + "grad_norm": 13.926500818645366, + "learning_rate": 2.3980935571993435e-06, + "loss": 0.6779, + "step": 21030 + }, + { + "epoch": 2.12, + "grad_norm": 9.879990313993776, + "learning_rate": 2.3955894183509566e-06, + "loss": 0.685, + "step": 21035 + }, + { + "epoch": 2.12, + "grad_norm": 7.49919898506855, + "learning_rate": 2.3930861756777648e-06, + "loss": 0.6762, + "step": 21040 + }, + { + "epoch": 2.12, + "grad_norm": 4.825971877972342, + "learning_rate": 2.390583830041128e-06, + "loss": 0.6662, + "step": 21045 + }, + { + "epoch": 2.12, + "grad_norm": 4.836307842150723, + "learning_rate": 2.3880823823021056e-06, + "loss": 0.7139, + "step": 21050 + }, + { + "epoch": 2.12, + "grad_norm": 4.8872685890080385, + "learning_rate": 2.38558183332144e-06, + "loss": 0.6456, + "step": 21055 + }, + { + "epoch": 2.12, + "grad_norm": 4.874725834505177, + "learning_rate": 2.3830821839595684e-06, + "loss": 0.7174, + "step": 21060 + }, + { + "epoch": 2.12, + "grad_norm": 5.197819594155209, + "learning_rate": 2.3805834350766165e-06, + "loss": 0.6593, + "step": 21065 + }, + { + "epoch": 2.12, + "grad_norm": 5.793047824829741, + "learning_rate": 2.3780855875324047e-06, + "loss": 0.6978, + "step": 21070 + }, + { + "epoch": 2.12, + "grad_norm": 4.829755969981174, + "learning_rate": 2.375588642186436e-06, + "loss": 0.663, + "step": 21075 + }, + { + "epoch": 2.13, + "grad_norm": 8.64441800256596, + "learning_rate": 2.373092599897911e-06, + "loss": 0.6674, + "step": 21080 + }, + { + "epoch": 2.13, + "grad_norm": 10.047361648257937, + "learning_rate": 2.3705974615257133e-06, + "loss": 0.7065, + "step": 21085 + }, + { + "epoch": 2.13, + "grad_norm": 9.95270047732342, + "learning_rate": 2.3681032279284173e-06, + "loss": 0.7028, + "step": 21090 + }, + { + "epoch": 2.13, + "grad_norm": 5.204323988308421, + "learning_rate": 2.365609899964285e-06, + "loss": 0.6651, + "step": 21095 + }, + { + "epoch": 2.13, + "grad_norm": 5.271678402978469, + "learning_rate": 2.3631174784912723e-06, + "loss": 0.6666, + "step": 21100 + }, + { + "epoch": 2.13, + "grad_norm": 5.063459933192673, + "learning_rate": 2.3606259643670148e-06, + "loss": 0.6482, + "step": 21105 + }, + { + "epoch": 2.13, + "grad_norm": 5.437559796028627, + "learning_rate": 2.3581353584488437e-06, + "loss": 0.6586, + "step": 21110 + }, + { + "epoch": 2.13, + "grad_norm": 8.825669261536088, + "learning_rate": 2.355645661593773e-06, + "loss": 0.641, + "step": 21115 + }, + { + "epoch": 2.13, + "grad_norm": 4.667516137702388, + "learning_rate": 2.3531568746585036e-06, + "loss": 0.6698, + "step": 21120 + }, + { + "epoch": 2.13, + "grad_norm": 5.988785674417255, + "learning_rate": 2.3506689984994265e-06, + "loss": 0.6739, + "step": 21125 + }, + { + "epoch": 2.13, + "grad_norm": 4.727327892790064, + "learning_rate": 2.3481820339726142e-06, + "loss": 0.6624, + "step": 21130 + }, + { + "epoch": 2.13, + "grad_norm": 4.8673197563247435, + "learning_rate": 2.345695981933833e-06, + "loss": 0.6868, + "step": 21135 + }, + { + "epoch": 2.13, + "grad_norm": 4.490877814256852, + "learning_rate": 2.3432108432385274e-06, + "loss": 0.6523, + "step": 21140 + }, + { + "epoch": 2.13, + "grad_norm": 9.706242854874962, + "learning_rate": 2.3407266187418354e-06, + "loss": 0.6702, + "step": 21145 + }, + { + "epoch": 2.13, + "grad_norm": 4.630820647418488, + "learning_rate": 2.338243309298574e-06, + "loss": 0.6594, + "step": 21150 + }, + { + "epoch": 2.13, + "grad_norm": 5.784604042705804, + "learning_rate": 2.3357609157632473e-06, + "loss": 0.734, + "step": 21155 + }, + { + "epoch": 2.13, + "grad_norm": 5.414196654666086, + "learning_rate": 2.3332794389900434e-06, + "loss": 0.6885, + "step": 21160 + }, + { + "epoch": 2.13, + "grad_norm": 7.1018738978504, + "learning_rate": 2.3307988798328384e-06, + "loss": 0.7051, + "step": 21165 + }, + { + "epoch": 2.13, + "grad_norm": 4.906674381033124, + "learning_rate": 2.3283192391451885e-06, + "loss": 0.6619, + "step": 21170 + }, + { + "epoch": 2.13, + "grad_norm": 4.469245284173604, + "learning_rate": 2.3258405177803386e-06, + "loss": 0.6902, + "step": 21175 + }, + { + "epoch": 2.14, + "grad_norm": 6.014548678716852, + "learning_rate": 2.323362716591212e-06, + "loss": 0.6439, + "step": 21180 + }, + { + "epoch": 2.14, + "grad_norm": 4.395091828408313, + "learning_rate": 2.320885836430418e-06, + "loss": 0.6858, + "step": 21185 + }, + { + "epoch": 2.14, + "grad_norm": 4.338385671125915, + "learning_rate": 2.3184098781502473e-06, + "loss": 0.6797, + "step": 21190 + }, + { + "epoch": 2.14, + "grad_norm": 4.440486546092224, + "learning_rate": 2.3159348426026784e-06, + "loss": 0.6535, + "step": 21195 + }, + { + "epoch": 2.14, + "grad_norm": 5.014703336012933, + "learning_rate": 2.313460730639364e-06, + "loss": 0.6694, + "step": 21200 + }, + { + "epoch": 2.14, + "grad_norm": 6.0268000255109815, + "learning_rate": 2.3109875431116485e-06, + "loss": 0.6431, + "step": 21205 + }, + { + "epoch": 2.14, + "grad_norm": 7.294111410480581, + "learning_rate": 2.308515280870551e-06, + "loss": 0.6721, + "step": 21210 + }, + { + "epoch": 2.14, + "grad_norm": 4.596658628899066, + "learning_rate": 2.3060439447667755e-06, + "loss": 0.64, + "step": 21215 + }, + { + "epoch": 2.14, + "grad_norm": 5.359281340269409, + "learning_rate": 2.3035735356507027e-06, + "loss": 0.6634, + "step": 21220 + }, + { + "epoch": 2.14, + "grad_norm": 8.878999044428964, + "learning_rate": 2.3011040543724035e-06, + "loss": 0.6545, + "step": 21225 + }, + { + "epoch": 2.14, + "grad_norm": 5.7306168814791745, + "learning_rate": 2.2986355017816194e-06, + "loss": 0.7132, + "step": 21230 + }, + { + "epoch": 2.14, + "grad_norm": 6.440527418729273, + "learning_rate": 2.2961678787277803e-06, + "loss": 0.6431, + "step": 21235 + }, + { + "epoch": 2.14, + "grad_norm": 6.095974682993984, + "learning_rate": 2.2937011860599924e-06, + "loss": 0.6931, + "step": 21240 + }, + { + "epoch": 2.14, + "grad_norm": 7.959130597361517, + "learning_rate": 2.2912354246270406e-06, + "loss": 0.6827, + "step": 21245 + }, + { + "epoch": 2.14, + "grad_norm": 4.820875700283996, + "learning_rate": 2.2887705952773933e-06, + "loss": 0.6873, + "step": 21250 + }, + { + "epoch": 2.14, + "grad_norm": 5.121696731093606, + "learning_rate": 2.286306698859192e-06, + "loss": 0.6799, + "step": 21255 + }, + { + "epoch": 2.14, + "grad_norm": 10.373115426094676, + "learning_rate": 2.2838437362202653e-06, + "loss": 0.7123, + "step": 21260 + }, + { + "epoch": 2.14, + "grad_norm": 11.701640475740954, + "learning_rate": 2.2813817082081135e-06, + "loss": 0.6797, + "step": 21265 + }, + { + "epoch": 2.14, + "grad_norm": 6.262566730988645, + "learning_rate": 2.278920615669921e-06, + "loss": 0.7041, + "step": 21270 + }, + { + "epoch": 2.14, + "grad_norm": 8.108524486232833, + "learning_rate": 2.276460459452546e-06, + "loss": 0.6351, + "step": 21275 + }, + { + "epoch": 2.15, + "grad_norm": 9.204313406839686, + "learning_rate": 2.2740012404025253e-06, + "loss": 0.6517, + "step": 21280 + }, + { + "epoch": 2.15, + "grad_norm": 4.876521529639379, + "learning_rate": 2.2715429593660722e-06, + "loss": 0.6841, + "step": 21285 + }, + { + "epoch": 2.15, + "grad_norm": 6.1636493192147785, + "learning_rate": 2.269085617189083e-06, + "loss": 0.6706, + "step": 21290 + }, + { + "epoch": 2.15, + "grad_norm": 7.354043706285142, + "learning_rate": 2.266629214717122e-06, + "loss": 0.6583, + "step": 21295 + }, + { + "epoch": 2.15, + "grad_norm": 4.656344874179418, + "learning_rate": 2.2641737527954405e-06, + "loss": 0.6794, + "step": 21300 + }, + { + "epoch": 2.15, + "grad_norm": 15.927842070258235, + "learning_rate": 2.261719232268957e-06, + "loss": 0.6469, + "step": 21305 + }, + { + "epoch": 2.15, + "grad_norm": 11.574361597789768, + "learning_rate": 2.2592656539822706e-06, + "loss": 0.67, + "step": 21310 + }, + { + "epoch": 2.15, + "grad_norm": 4.458625032162326, + "learning_rate": 2.256813018779653e-06, + "loss": 0.656, + "step": 21315 + }, + { + "epoch": 2.15, + "grad_norm": 15.504041246695413, + "learning_rate": 2.254361327505057e-06, + "loss": 0.6734, + "step": 21320 + }, + { + "epoch": 2.15, + "grad_norm": 4.920432068704808, + "learning_rate": 2.251910581002104e-06, + "loss": 0.6801, + "step": 21325 + }, + { + "epoch": 2.15, + "grad_norm": 4.977985796519088, + "learning_rate": 2.2494607801140977e-06, + "loss": 0.6701, + "step": 21330 + }, + { + "epoch": 2.15, + "grad_norm": 5.521974127049712, + "learning_rate": 2.2470119256840102e-06, + "loss": 0.6597, + "step": 21335 + }, + { + "epoch": 2.15, + "grad_norm": 6.2108291136682245, + "learning_rate": 2.2445640185544887e-06, + "loss": 0.6738, + "step": 21340 + }, + { + "epoch": 2.15, + "grad_norm": 5.578370170317782, + "learning_rate": 2.2421170595678554e-06, + "loss": 0.6956, + "step": 21345 + }, + { + "epoch": 2.15, + "grad_norm": 5.455209357833909, + "learning_rate": 2.2396710495661096e-06, + "loss": 0.6872, + "step": 21350 + }, + { + "epoch": 2.15, + "grad_norm": 4.932387669873154, + "learning_rate": 2.2372259893909177e-06, + "loss": 0.6931, + "step": 21355 + }, + { + "epoch": 2.15, + "grad_norm": 7.383305059308782, + "learning_rate": 2.234781879883625e-06, + "loss": 0.6765, + "step": 21360 + }, + { + "epoch": 2.15, + "grad_norm": 4.588357606015773, + "learning_rate": 2.2323387218852472e-06, + "loss": 0.6468, + "step": 21365 + }, + { + "epoch": 2.15, + "grad_norm": 6.9486101441179855, + "learning_rate": 2.229896516236472e-06, + "loss": 0.7126, + "step": 21370 + }, + { + "epoch": 2.16, + "grad_norm": 6.831380879476231, + "learning_rate": 2.227455263777657e-06, + "loss": 0.6925, + "step": 21375 + }, + { + "epoch": 2.16, + "grad_norm": 14.772545065182918, + "learning_rate": 2.22501496534884e-06, + "loss": 0.6487, + "step": 21380 + }, + { + "epoch": 2.16, + "grad_norm": 6.355784462134895, + "learning_rate": 2.2225756217897232e-06, + "loss": 0.6881, + "step": 21385 + }, + { + "epoch": 2.16, + "grad_norm": 10.180633022588685, + "learning_rate": 2.2201372339396797e-06, + "loss": 0.7179, + "step": 21390 + }, + { + "epoch": 2.16, + "grad_norm": 4.906657785907644, + "learning_rate": 2.2176998026377615e-06, + "loss": 0.6773, + "step": 21395 + }, + { + "epoch": 2.16, + "grad_norm": 5.973334214591822, + "learning_rate": 2.2152633287226836e-06, + "loss": 0.6862, + "step": 21400 + }, + { + "epoch": 2.16, + "grad_norm": 5.857519742476949, + "learning_rate": 2.2128278130328346e-06, + "loss": 0.6675, + "step": 21405 + }, + { + "epoch": 2.16, + "grad_norm": 5.331979105826995, + "learning_rate": 2.2103932564062706e-06, + "loss": 0.6346, + "step": 21410 + }, + { + "epoch": 2.16, + "grad_norm": 6.850241441793481, + "learning_rate": 2.207959659680726e-06, + "loss": 0.6459, + "step": 21415 + }, + { + "epoch": 2.16, + "grad_norm": 4.663222394532015, + "learning_rate": 2.2055270236935927e-06, + "loss": 0.6659, + "step": 21420 + }, + { + "epoch": 2.16, + "grad_norm": 6.714394073555861, + "learning_rate": 2.2030953492819435e-06, + "loss": 0.6453, + "step": 21425 + }, + { + "epoch": 2.16, + "grad_norm": 6.421109528249489, + "learning_rate": 2.2006646372825138e-06, + "loss": 0.6648, + "step": 21430 + }, + { + "epoch": 2.16, + "grad_norm": 6.463981661729482, + "learning_rate": 2.198234888531708e-06, + "loss": 0.6567, + "step": 21435 + }, + { + "epoch": 2.16, + "grad_norm": 5.074988279798165, + "learning_rate": 2.1958061038656e-06, + "loss": 0.6448, + "step": 21440 + }, + { + "epoch": 2.16, + "grad_norm": 4.73004122724363, + "learning_rate": 2.193378284119934e-06, + "loss": 0.6768, + "step": 21445 + }, + { + "epoch": 2.16, + "grad_norm": 5.659328962468312, + "learning_rate": 2.190951430130119e-06, + "loss": 0.6546, + "step": 21450 + }, + { + "epoch": 2.16, + "grad_norm": 4.614004198385022, + "learning_rate": 2.188525542731236e-06, + "loss": 0.6502, + "step": 21455 + }, + { + "epoch": 2.16, + "grad_norm": 6.497775382681463, + "learning_rate": 2.1861006227580276e-06, + "loss": 0.7165, + "step": 21460 + }, + { + "epoch": 2.16, + "grad_norm": 4.542848170679907, + "learning_rate": 2.1836766710449077e-06, + "loss": 0.6685, + "step": 21465 + }, + { + "epoch": 2.16, + "grad_norm": 4.649025832208719, + "learning_rate": 2.1812536884259537e-06, + "loss": 0.6761, + "step": 21470 + }, + { + "epoch": 2.17, + "grad_norm": 4.801979439674982, + "learning_rate": 2.178831675734915e-06, + "loss": 0.6833, + "step": 21475 + }, + { + "epoch": 2.17, + "grad_norm": 4.821275013329016, + "learning_rate": 2.1764106338052005e-06, + "loss": 0.6756, + "step": 21480 + }, + { + "epoch": 2.17, + "grad_norm": 4.596268528847123, + "learning_rate": 2.1739905634698916e-06, + "loss": 0.6768, + "step": 21485 + }, + { + "epoch": 2.17, + "grad_norm": 8.495715820222872, + "learning_rate": 2.171571465561731e-06, + "loss": 0.6667, + "step": 21490 + }, + { + "epoch": 2.17, + "grad_norm": 4.776997921508625, + "learning_rate": 2.169153340913127e-06, + "loss": 0.6944, + "step": 21495 + }, + { + "epoch": 2.17, + "grad_norm": 5.506795003368372, + "learning_rate": 2.1667361903561534e-06, + "loss": 0.6646, + "step": 21500 + }, + { + "epoch": 2.17, + "grad_norm": 7.392404856257032, + "learning_rate": 2.1643200147225523e-06, + "loss": 0.6782, + "step": 21505 + }, + { + "epoch": 2.17, + "grad_norm": 5.477503623359234, + "learning_rate": 2.1619048148437256e-06, + "loss": 0.6601, + "step": 21510 + }, + { + "epoch": 2.17, + "grad_norm": 4.524025133682762, + "learning_rate": 2.1594905915507397e-06, + "loss": 0.6839, + "step": 21515 + }, + { + "epoch": 2.17, + "grad_norm": 4.615708947012337, + "learning_rate": 2.1570773456743305e-06, + "loss": 0.6494, + "step": 21520 + }, + { + "epoch": 2.17, + "grad_norm": 5.095074229625856, + "learning_rate": 2.1546650780448907e-06, + "loss": 0.6696, + "step": 21525 + }, + { + "epoch": 2.17, + "grad_norm": 6.528698210233045, + "learning_rate": 2.15225378949248e-06, + "loss": 0.6469, + "step": 21530 + }, + { + "epoch": 2.17, + "grad_norm": 7.061796993105184, + "learning_rate": 2.149843480846819e-06, + "loss": 0.6828, + "step": 21535 + }, + { + "epoch": 2.17, + "grad_norm": 4.608756274550869, + "learning_rate": 2.1474341529372955e-06, + "loss": 0.6519, + "step": 21540 + }, + { + "epoch": 2.17, + "grad_norm": 7.6559465404075455, + "learning_rate": 2.1450258065929536e-06, + "loss": 0.6905, + "step": 21545 + }, + { + "epoch": 2.17, + "grad_norm": 4.611780806892584, + "learning_rate": 2.1426184426425073e-06, + "loss": 0.6165, + "step": 21550 + }, + { + "epoch": 2.17, + "grad_norm": 4.775707939635718, + "learning_rate": 2.1402120619143254e-06, + "loss": 0.6955, + "step": 21555 + }, + { + "epoch": 2.17, + "grad_norm": 4.858396353644519, + "learning_rate": 2.137806665236441e-06, + "loss": 0.6687, + "step": 21560 + }, + { + "epoch": 2.17, + "grad_norm": 4.800894336334279, + "learning_rate": 2.135402253436548e-06, + "loss": 0.6648, + "step": 21565 + }, + { + "epoch": 2.17, + "grad_norm": 4.6098765956852015, + "learning_rate": 2.1329988273420055e-06, + "loss": 0.6861, + "step": 21570 + }, + { + "epoch": 2.18, + "grad_norm": 8.6380129196337, + "learning_rate": 2.1305963877798265e-06, + "loss": 0.6491, + "step": 21575 + }, + { + "epoch": 2.18, + "grad_norm": 5.510593341834106, + "learning_rate": 2.128194935576692e-06, + "loss": 0.6633, + "step": 21580 + }, + { + "epoch": 2.18, + "grad_norm": 5.012673378533721, + "learning_rate": 2.1257944715589363e-06, + "loss": 0.6853, + "step": 21585 + }, + { + "epoch": 2.18, + "grad_norm": 5.940600401787411, + "learning_rate": 2.1233949965525585e-06, + "loss": 0.6819, + "step": 21590 + }, + { + "epoch": 2.18, + "grad_norm": 8.435816299728263, + "learning_rate": 2.120996511383213e-06, + "loss": 0.6768, + "step": 21595 + }, + { + "epoch": 2.18, + "grad_norm": 7.472662445866399, + "learning_rate": 2.1185990168762193e-06, + "loss": 0.6378, + "step": 21600 + }, + { + "epoch": 2.18, + "grad_norm": 5.412920367145333, + "learning_rate": 2.1162025138565505e-06, + "loss": 0.691, + "step": 21605 + }, + { + "epoch": 2.18, + "grad_norm": 4.563470399698418, + "learning_rate": 2.1138070031488445e-06, + "loss": 0.7107, + "step": 21610 + }, + { + "epoch": 2.18, + "grad_norm": 5.746862880742315, + "learning_rate": 2.1114124855773915e-06, + "loss": 0.6786, + "step": 21615 + }, + { + "epoch": 2.18, + "grad_norm": 5.143965721903495, + "learning_rate": 2.1090189619661437e-06, + "loss": 0.651, + "step": 21620 + }, + { + "epoch": 2.18, + "grad_norm": 5.909432206635309, + "learning_rate": 2.1066264331387084e-06, + "loss": 0.6533, + "step": 21625 + }, + { + "epoch": 2.18, + "grad_norm": 5.953769822474178, + "learning_rate": 2.104234899918355e-06, + "loss": 0.6591, + "step": 21630 + }, + { + "epoch": 2.18, + "grad_norm": 4.648977379946307, + "learning_rate": 2.101844363128007e-06, + "loss": 0.6816, + "step": 21635 + }, + { + "epoch": 2.18, + "grad_norm": 7.201087140446208, + "learning_rate": 2.099454823590244e-06, + "loss": 0.6705, + "step": 21640 + }, + { + "epoch": 2.18, + "grad_norm": 4.959974141273814, + "learning_rate": 2.0970662821273074e-06, + "loss": 0.6469, + "step": 21645 + }, + { + "epoch": 2.18, + "grad_norm": 4.664286681471639, + "learning_rate": 2.094678739561091e-06, + "loss": 0.6646, + "step": 21650 + }, + { + "epoch": 2.18, + "grad_norm": 6.485570321475418, + "learning_rate": 2.092292196713145e-06, + "loss": 0.6523, + "step": 21655 + }, + { + "epoch": 2.18, + "grad_norm": 5.368909470789177, + "learning_rate": 2.0899066544046754e-06, + "loss": 0.6451, + "step": 21660 + }, + { + "epoch": 2.18, + "grad_norm": 4.97877059094573, + "learning_rate": 2.087522113456548e-06, + "loss": 0.674, + "step": 21665 + }, + { + "epoch": 2.18, + "grad_norm": 6.508107160264964, + "learning_rate": 2.085138574689278e-06, + "loss": 0.6405, + "step": 21670 + }, + { + "epoch": 2.19, + "grad_norm": 5.801473722359958, + "learning_rate": 2.082756038923042e-06, + "loss": 0.6738, + "step": 21675 + }, + { + "epoch": 2.19, + "grad_norm": 4.701163787304521, + "learning_rate": 2.080374506977667e-06, + "loss": 0.6915, + "step": 21680 + }, + { + "epoch": 2.19, + "grad_norm": 5.0583630847683345, + "learning_rate": 2.0779939796726358e-06, + "loss": 0.6789, + "step": 21685 + }, + { + "epoch": 2.19, + "grad_norm": 4.989863272716518, + "learning_rate": 2.075614457827083e-06, + "loss": 0.6593, + "step": 21690 + }, + { + "epoch": 2.19, + "grad_norm": 5.457297521696792, + "learning_rate": 2.073235942259804e-06, + "loss": 0.6472, + "step": 21695 + }, + { + "epoch": 2.19, + "grad_norm": 12.769915672849963, + "learning_rate": 2.0708584337892406e-06, + "loss": 0.6985, + "step": 21700 + }, + { + "epoch": 2.19, + "grad_norm": 6.0036448831642755, + "learning_rate": 2.0684819332334937e-06, + "loss": 0.6477, + "step": 21705 + }, + { + "epoch": 2.19, + "grad_norm": 5.353427945270811, + "learning_rate": 2.066106441410314e-06, + "loss": 0.716, + "step": 21710 + }, + { + "epoch": 2.19, + "grad_norm": 5.403640101289655, + "learning_rate": 2.0637319591371057e-06, + "loss": 0.6456, + "step": 21715 + }, + { + "epoch": 2.19, + "grad_norm": 4.799200550599031, + "learning_rate": 2.0613584872309238e-06, + "loss": 0.6445, + "step": 21720 + }, + { + "epoch": 2.19, + "grad_norm": 10.13065659228395, + "learning_rate": 2.058986026508482e-06, + "loss": 0.6908, + "step": 21725 + }, + { + "epoch": 2.19, + "grad_norm": 6.577943061340765, + "learning_rate": 2.0566145777861374e-06, + "loss": 0.6751, + "step": 21730 + }, + { + "epoch": 2.19, + "grad_norm": 6.263454894906639, + "learning_rate": 2.054244141879907e-06, + "loss": 0.6474, + "step": 21735 + }, + { + "epoch": 2.19, + "grad_norm": 4.782587443226552, + "learning_rate": 2.0518747196054533e-06, + "loss": 0.633, + "step": 21740 + }, + { + "epoch": 2.19, + "grad_norm": 4.484463749033461, + "learning_rate": 2.049506311778094e-06, + "loss": 0.6482, + "step": 21745 + }, + { + "epoch": 2.19, + "grad_norm": 5.316010453674646, + "learning_rate": 2.047138919212792e-06, + "loss": 0.6819, + "step": 21750 + }, + { + "epoch": 2.19, + "grad_norm": 6.308040713505686, + "learning_rate": 2.04477254272417e-06, + "loss": 0.6636, + "step": 21755 + }, + { + "epoch": 2.19, + "grad_norm": 4.735685322131239, + "learning_rate": 2.0424071831264913e-06, + "loss": 0.6882, + "step": 21760 + }, + { + "epoch": 2.19, + "grad_norm": 8.13529321248825, + "learning_rate": 2.0400428412336776e-06, + "loss": 0.6645, + "step": 21765 + }, + { + "epoch": 2.19, + "grad_norm": 9.906802535189406, + "learning_rate": 2.0376795178592973e-06, + "loss": 0.66, + "step": 21770 + }, + { + "epoch": 2.2, + "grad_norm": 6.087737709787373, + "learning_rate": 2.035317213816562e-06, + "loss": 0.6657, + "step": 21775 + }, + { + "epoch": 2.2, + "grad_norm": 4.619965083742663, + "learning_rate": 2.0329559299183438e-06, + "loss": 0.6584, + "step": 21780 + }, + { + "epoch": 2.2, + "grad_norm": 7.693608908068327, + "learning_rate": 2.0305956669771544e-06, + "loss": 0.692, + "step": 21785 + }, + { + "epoch": 2.2, + "grad_norm": 7.339632482464978, + "learning_rate": 2.0282364258051614e-06, + "loss": 0.6453, + "step": 21790 + }, + { + "epoch": 2.2, + "grad_norm": 4.938669128359225, + "learning_rate": 2.025878207214174e-06, + "loss": 0.6991, + "step": 21795 + }, + { + "epoch": 2.2, + "grad_norm": 5.791176392690036, + "learning_rate": 2.023521012015659e-06, + "loss": 0.6592, + "step": 21800 + }, + { + "epoch": 2.2, + "grad_norm": 5.366344972490452, + "learning_rate": 2.021164841020717e-06, + "loss": 0.6674, + "step": 21805 + }, + { + "epoch": 2.2, + "grad_norm": 5.120904543191494, + "learning_rate": 2.0188096950401097e-06, + "loss": 0.6761, + "step": 21810 + }, + { + "epoch": 2.2, + "grad_norm": 8.949775455094654, + "learning_rate": 2.016455574884238e-06, + "loss": 0.668, + "step": 21815 + }, + { + "epoch": 2.2, + "grad_norm": 6.4831923815287515, + "learning_rate": 2.0141024813631543e-06, + "loss": 0.6894, + "step": 21820 + }, + { + "epoch": 2.2, + "grad_norm": 8.6838611621769, + "learning_rate": 2.0117504152865535e-06, + "loss": 0.677, + "step": 21825 + }, + { + "epoch": 2.2, + "grad_norm": 11.12976987528081, + "learning_rate": 2.0093993774637844e-06, + "loss": 0.6721, + "step": 21830 + }, + { + "epoch": 2.2, + "grad_norm": 7.923204024712102, + "learning_rate": 2.007049368703829e-06, + "loss": 0.6547, + "step": 21835 + }, + { + "epoch": 2.2, + "grad_norm": 15.444648074512381, + "learning_rate": 2.0047003898153294e-06, + "loss": 0.7173, + "step": 21840 + }, + { + "epoch": 2.2, + "grad_norm": 4.512895550568181, + "learning_rate": 2.002352441606563e-06, + "loss": 0.6774, + "step": 21845 + }, + { + "epoch": 2.2, + "grad_norm": 5.120862716747655, + "learning_rate": 2.00000552488546e-06, + "loss": 0.6994, + "step": 21850 + }, + { + "epoch": 2.2, + "grad_norm": 6.1405050814060465, + "learning_rate": 1.9976596404595896e-06, + "loss": 0.6761, + "step": 21855 + }, + { + "epoch": 2.2, + "grad_norm": 6.729094027264364, + "learning_rate": 1.995314789136172e-06, + "loss": 0.6588, + "step": 21860 + }, + { + "epoch": 2.2, + "grad_norm": 4.843415886395163, + "learning_rate": 1.9929709717220618e-06, + "loss": 0.6514, + "step": 21865 + }, + { + "epoch": 2.2, + "grad_norm": 6.202932791193621, + "learning_rate": 1.9906281890237713e-06, + "loss": 0.6734, + "step": 21870 + }, + { + "epoch": 2.21, + "grad_norm": 4.752555156991824, + "learning_rate": 1.9882864418474433e-06, + "loss": 0.6801, + "step": 21875 + }, + { + "epoch": 2.21, + "grad_norm": 4.546389634068141, + "learning_rate": 1.985945730998877e-06, + "loss": 0.712, + "step": 21880 + }, + { + "epoch": 2.21, + "grad_norm": 4.450342124329078, + "learning_rate": 1.9836060572835043e-06, + "loss": 0.6513, + "step": 21885 + }, + { + "epoch": 2.21, + "grad_norm": 5.011460680897503, + "learning_rate": 1.981267421506409e-06, + "loss": 0.7039, + "step": 21890 + }, + { + "epoch": 2.21, + "grad_norm": 4.361626448788882, + "learning_rate": 1.9789298244723094e-06, + "loss": 0.647, + "step": 21895 + }, + { + "epoch": 2.21, + "grad_norm": 5.543949883889084, + "learning_rate": 1.9765932669855696e-06, + "loss": 0.7214, + "step": 21900 + }, + { + "epoch": 2.21, + "grad_norm": 4.629136822219546, + "learning_rate": 1.9742577498502015e-06, + "loss": 0.6905, + "step": 21905 + }, + { + "epoch": 2.21, + "grad_norm": 8.094072104483322, + "learning_rate": 1.9719232738698496e-06, + "loss": 0.7063, + "step": 21910 + }, + { + "epoch": 2.21, + "grad_norm": 6.180828173160776, + "learning_rate": 1.9695898398478087e-06, + "loss": 0.6723, + "step": 21915 + }, + { + "epoch": 2.21, + "grad_norm": 11.900414331532556, + "learning_rate": 1.9672574485870095e-06, + "loss": 0.6339, + "step": 21920 + }, + { + "epoch": 2.21, + "grad_norm": 13.887237517635425, + "learning_rate": 1.9649261008900256e-06, + "loss": 0.6656, + "step": 21925 + }, + { + "epoch": 2.21, + "grad_norm": 5.3677286120369345, + "learning_rate": 1.9625957975590697e-06, + "loss": 0.6835, + "step": 21930 + }, + { + "epoch": 2.21, + "grad_norm": 4.997891580541595, + "learning_rate": 1.9602665393960006e-06, + "loss": 0.6776, + "step": 21935 + }, + { + "epoch": 2.21, + "grad_norm": 13.034257581641024, + "learning_rate": 1.9579383272023106e-06, + "loss": 0.6598, + "step": 21940 + }, + { + "epoch": 2.21, + "grad_norm": 5.095295571247911, + "learning_rate": 1.9556111617791383e-06, + "loss": 0.6423, + "step": 21945 + }, + { + "epoch": 2.21, + "grad_norm": 4.779739155406581, + "learning_rate": 1.9532850439272576e-06, + "loss": 0.659, + "step": 21950 + }, + { + "epoch": 2.21, + "grad_norm": 17.447892921304916, + "learning_rate": 1.950959974447083e-06, + "loss": 0.6411, + "step": 21955 + }, + { + "epoch": 2.21, + "grad_norm": 5.38037099062299, + "learning_rate": 1.948635954138668e-06, + "loss": 0.6598, + "step": 21960 + }, + { + "epoch": 2.21, + "grad_norm": 6.985338264358882, + "learning_rate": 1.946312983801708e-06, + "loss": 0.6597, + "step": 21965 + }, + { + "epoch": 2.22, + "grad_norm": 5.06480244294857, + "learning_rate": 1.943991064235532e-06, + "loss": 0.6527, + "step": 21970 + }, + { + "epoch": 2.22, + "grad_norm": 6.539366017792932, + "learning_rate": 1.941670196239113e-06, + "loss": 0.6831, + "step": 21975 + }, + { + "epoch": 2.22, + "grad_norm": 4.784330135145143, + "learning_rate": 1.939350380611058e-06, + "loss": 0.6861, + "step": 21980 + }, + { + "epoch": 2.22, + "grad_norm": 7.570023660771087, + "learning_rate": 1.937031618149616e-06, + "loss": 0.6576, + "step": 21985 + }, + { + "epoch": 2.22, + "grad_norm": 4.7423422973082845, + "learning_rate": 1.9347139096526662e-06, + "loss": 0.6765, + "step": 21990 + }, + { + "epoch": 2.22, + "grad_norm": 6.434972787435978, + "learning_rate": 1.932397255917734e-06, + "loss": 0.6976, + "step": 21995 + }, + { + "epoch": 2.22, + "grad_norm": 5.393669477772615, + "learning_rate": 1.930081657741974e-06, + "loss": 0.6386, + "step": 22000 + }, + { + "epoch": 2.22, + "grad_norm": 5.324763834075317, + "learning_rate": 1.9277671159221858e-06, + "loss": 0.6698, + "step": 22005 + }, + { + "epoch": 2.22, + "grad_norm": 5.172778216238288, + "learning_rate": 1.925453631254796e-06, + "loss": 0.668, + "step": 22010 + }, + { + "epoch": 2.22, + "grad_norm": 10.775476470719338, + "learning_rate": 1.9231412045358794e-06, + "loss": 0.7001, + "step": 22015 + }, + { + "epoch": 2.22, + "grad_norm": 5.772612247891434, + "learning_rate": 1.920829836561134e-06, + "loss": 0.6343, + "step": 22020 + }, + { + "epoch": 2.22, + "grad_norm": 5.638808772116786, + "learning_rate": 1.9185195281258984e-06, + "loss": 0.6662, + "step": 22025 + }, + { + "epoch": 2.22, + "grad_norm": 8.477423201454245, + "learning_rate": 1.9162102800251526e-06, + "loss": 0.691, + "step": 22030 + }, + { + "epoch": 2.22, + "grad_norm": 6.001265825880683, + "learning_rate": 1.913902093053502e-06, + "loss": 0.7369, + "step": 22035 + }, + { + "epoch": 2.22, + "grad_norm": 5.695603228654765, + "learning_rate": 1.911594968005195e-06, + "loss": 0.6901, + "step": 22040 + }, + { + "epoch": 2.22, + "grad_norm": 4.699503683926651, + "learning_rate": 1.90928890567411e-06, + "loss": 0.6603, + "step": 22045 + }, + { + "epoch": 2.22, + "grad_norm": 5.246464717025585, + "learning_rate": 1.9069839068537605e-06, + "loss": 0.6683, + "step": 22050 + }, + { + "epoch": 2.22, + "grad_norm": 7.441835122688492, + "learning_rate": 1.9046799723372927e-06, + "loss": 0.6552, + "step": 22055 + }, + { + "epoch": 2.22, + "grad_norm": 6.807563202328714, + "learning_rate": 1.902377102917492e-06, + "loss": 0.6635, + "step": 22060 + }, + { + "epoch": 2.22, + "grad_norm": 4.53254206934949, + "learning_rate": 1.90007529938677e-06, + "loss": 0.6673, + "step": 22065 + }, + { + "epoch": 2.23, + "grad_norm": 6.36613438278603, + "learning_rate": 1.897774562537178e-06, + "loss": 0.6644, + "step": 22070 + }, + { + "epoch": 2.23, + "grad_norm": 8.838309800932239, + "learning_rate": 1.895474893160396e-06, + "loss": 0.7222, + "step": 22075 + }, + { + "epoch": 2.23, + "grad_norm": 7.298006664524484, + "learning_rate": 1.8931762920477387e-06, + "loss": 0.6583, + "step": 22080 + }, + { + "epoch": 2.23, + "grad_norm": 6.656911860726481, + "learning_rate": 1.89087875999015e-06, + "loss": 0.6445, + "step": 22085 + }, + { + "epoch": 2.23, + "grad_norm": 4.649023226724349, + "learning_rate": 1.888582297778212e-06, + "loss": 0.6448, + "step": 22090 + }, + { + "epoch": 2.23, + "grad_norm": 5.356781900163968, + "learning_rate": 1.8862869062021317e-06, + "loss": 0.6458, + "step": 22095 + }, + { + "epoch": 2.23, + "grad_norm": 5.5443744334239895, + "learning_rate": 1.8839925860517549e-06, + "loss": 0.6573, + "step": 22100 + }, + { + "epoch": 2.23, + "grad_norm": 4.735248749146872, + "learning_rate": 1.8816993381165533e-06, + "loss": 0.6369, + "step": 22105 + }, + { + "epoch": 2.23, + "grad_norm": 6.68724647456985, + "learning_rate": 1.8794071631856315e-06, + "loss": 0.6606, + "step": 22110 + }, + { + "epoch": 2.23, + "grad_norm": 4.519650423597918, + "learning_rate": 1.8771160620477219e-06, + "loss": 0.6397, + "step": 22115 + }, + { + "epoch": 2.23, + "grad_norm": 7.213776592218962, + "learning_rate": 1.8748260354911945e-06, + "loss": 0.6731, + "step": 22120 + }, + { + "epoch": 2.23, + "grad_norm": 8.309313240765013, + "learning_rate": 1.872537084304042e-06, + "loss": 0.6633, + "step": 22125 + }, + { + "epoch": 2.23, + "grad_norm": 5.250550954950561, + "learning_rate": 1.8702492092738934e-06, + "loss": 0.6962, + "step": 22130 + }, + { + "epoch": 2.23, + "grad_norm": 4.756675355745477, + "learning_rate": 1.8679624111880024e-06, + "loss": 0.6402, + "step": 22135 + }, + { + "epoch": 2.23, + "grad_norm": 6.987116484431764, + "learning_rate": 1.8656766908332542e-06, + "loss": 0.669, + "step": 22140 + }, + { + "epoch": 2.23, + "grad_norm": 5.662568308039815, + "learning_rate": 1.8633920489961615e-06, + "loss": 0.6795, + "step": 22145 + }, + { + "epoch": 2.23, + "grad_norm": 4.9315334222705465, + "learning_rate": 1.8611084864628708e-06, + "loss": 0.63, + "step": 22150 + }, + { + "epoch": 2.23, + "grad_norm": 7.919007732778563, + "learning_rate": 1.8588260040191518e-06, + "loss": 0.6346, + "step": 22155 + }, + { + "epoch": 2.23, + "grad_norm": 5.209612486019074, + "learning_rate": 1.856544602450403e-06, + "loss": 0.6401, + "step": 22160 + }, + { + "epoch": 2.23, + "grad_norm": 4.573632350427878, + "learning_rate": 1.8542642825416558e-06, + "loss": 0.6574, + "step": 22165 + }, + { + "epoch": 2.24, + "grad_norm": 5.75801656606795, + "learning_rate": 1.8519850450775646e-06, + "loss": 0.681, + "step": 22170 + }, + { + "epoch": 2.24, + "grad_norm": 4.473139584757566, + "learning_rate": 1.849706890842412e-06, + "loss": 0.64, + "step": 22175 + }, + { + "epoch": 2.24, + "grad_norm": 5.356833240856356, + "learning_rate": 1.8474298206201086e-06, + "loss": 0.6876, + "step": 22180 + }, + { + "epoch": 2.24, + "grad_norm": 5.005661901628179, + "learning_rate": 1.8451538351941938e-06, + "loss": 0.6857, + "step": 22185 + }, + { + "epoch": 2.24, + "grad_norm": 5.55621385776115, + "learning_rate": 1.84287893534783e-06, + "loss": 0.6653, + "step": 22190 + }, + { + "epoch": 2.24, + "grad_norm": 5.004544719422335, + "learning_rate": 1.8406051218638104e-06, + "loss": 0.6388, + "step": 22195 + }, + { + "epoch": 2.24, + "grad_norm": 5.650527884490754, + "learning_rate": 1.8383323955245513e-06, + "loss": 0.666, + "step": 22200 + }, + { + "epoch": 2.24, + "grad_norm": 5.510569408301339, + "learning_rate": 1.8360607571120948e-06, + "loss": 0.6684, + "step": 22205 + }, + { + "epoch": 2.24, + "grad_norm": 5.61974386639768, + "learning_rate": 1.8337902074081082e-06, + "loss": 0.669, + "step": 22210 + }, + { + "epoch": 2.24, + "grad_norm": 5.199501659759417, + "learning_rate": 1.831520747193889e-06, + "loss": 0.6624, + "step": 22215 + }, + { + "epoch": 2.24, + "grad_norm": 5.019792770858063, + "learning_rate": 1.8292523772503524e-06, + "loss": 0.6581, + "step": 22220 + }, + { + "epoch": 2.24, + "grad_norm": 8.958303016321787, + "learning_rate": 1.826985098358046e-06, + "loss": 0.6779, + "step": 22225 + }, + { + "epoch": 2.24, + "grad_norm": 7.503162405669565, + "learning_rate": 1.8247189112971374e-06, + "loss": 0.6477, + "step": 22230 + }, + { + "epoch": 2.24, + "grad_norm": 6.53591139782825, + "learning_rate": 1.8224538168474182e-06, + "loss": 0.686, + "step": 22235 + }, + { + "epoch": 2.24, + "grad_norm": 5.93514558069354, + "learning_rate": 1.820189815788304e-06, + "loss": 0.6986, + "step": 22240 + }, + { + "epoch": 2.24, + "grad_norm": 5.085521784062833, + "learning_rate": 1.8179269088988387e-06, + "loss": 0.6652, + "step": 22245 + }, + { + "epoch": 2.24, + "grad_norm": 4.759343061989954, + "learning_rate": 1.8156650969576832e-06, + "loss": 0.6996, + "step": 22250 + }, + { + "epoch": 2.24, + "grad_norm": 6.081130624986382, + "learning_rate": 1.8134043807431283e-06, + "loss": 0.6706, + "step": 22255 + }, + { + "epoch": 2.24, + "grad_norm": 4.646210752535273, + "learning_rate": 1.811144761033083e-06, + "loss": 0.6515, + "step": 22260 + }, + { + "epoch": 2.24, + "grad_norm": 5.462019258658007, + "learning_rate": 1.8088862386050786e-06, + "loss": 0.6332, + "step": 22265 + }, + { + "epoch": 2.25, + "grad_norm": 4.5455720420032675, + "learning_rate": 1.8066288142362704e-06, + "loss": 0.6632, + "step": 22270 + }, + { + "epoch": 2.25, + "grad_norm": 5.0054769315556555, + "learning_rate": 1.8043724887034392e-06, + "loss": 0.6543, + "step": 22275 + }, + { + "epoch": 2.25, + "grad_norm": 6.001193032887485, + "learning_rate": 1.802117262782982e-06, + "loss": 0.6798, + "step": 22280 + }, + { + "epoch": 2.25, + "grad_norm": 5.377042635357598, + "learning_rate": 1.799863137250919e-06, + "loss": 0.628, + "step": 22285 + }, + { + "epoch": 2.25, + "grad_norm": 6.247020646583163, + "learning_rate": 1.7976101128828955e-06, + "loss": 0.6841, + "step": 22290 + }, + { + "epoch": 2.25, + "grad_norm": 5.228469202329293, + "learning_rate": 1.7953581904541733e-06, + "loss": 0.6506, + "step": 22295 + }, + { + "epoch": 2.25, + "grad_norm": 6.332671657525808, + "learning_rate": 1.7931073707396373e-06, + "loss": 0.6919, + "step": 22300 + }, + { + "epoch": 2.25, + "grad_norm": 8.67677861390262, + "learning_rate": 1.7908576545137907e-06, + "loss": 0.6763, + "step": 22305 + }, + { + "epoch": 2.25, + "grad_norm": 5.955850137989377, + "learning_rate": 1.7886090425507612e-06, + "loss": 0.6497, + "step": 22310 + }, + { + "epoch": 2.25, + "grad_norm": 5.066982632939822, + "learning_rate": 1.7863615356242913e-06, + "loss": 0.6893, + "step": 22315 + }, + { + "epoch": 2.25, + "grad_norm": 4.719905492192519, + "learning_rate": 1.7841151345077495e-06, + "loss": 0.6588, + "step": 22320 + }, + { + "epoch": 2.25, + "grad_norm": 6.610428719794262, + "learning_rate": 1.7818698399741186e-06, + "loss": 0.6422, + "step": 22325 + }, + { + "epoch": 2.25, + "grad_norm": 8.213106162955256, + "learning_rate": 1.7796256527960021e-06, + "loss": 0.6617, + "step": 22330 + }, + { + "epoch": 2.25, + "grad_norm": 7.229355768945967, + "learning_rate": 1.7773825737456208e-06, + "loss": 0.6505, + "step": 22335 + }, + { + "epoch": 2.25, + "grad_norm": 5.012731623542265, + "learning_rate": 1.7751406035948193e-06, + "loss": 0.6619, + "step": 22340 + }, + { + "epoch": 2.25, + "grad_norm": 5.413531472969937, + "learning_rate": 1.7728997431150546e-06, + "loss": 0.6601, + "step": 22345 + }, + { + "epoch": 2.25, + "grad_norm": 5.726110813450331, + "learning_rate": 1.7706599930774077e-06, + "loss": 0.6832, + "step": 22350 + }, + { + "epoch": 2.25, + "grad_norm": 4.9672144231735045, + "learning_rate": 1.768421354252573e-06, + "loss": 0.654, + "step": 22355 + }, + { + "epoch": 2.25, + "grad_norm": 7.407604286978984, + "learning_rate": 1.7661838274108633e-06, + "loss": 0.7097, + "step": 22360 + }, + { + "epoch": 2.25, + "grad_norm": 5.588861929961504, + "learning_rate": 1.7639474133222085e-06, + "loss": 0.6319, + "step": 22365 + }, + { + "epoch": 2.26, + "grad_norm": 5.068515096985442, + "learning_rate": 1.7617121127561598e-06, + "loss": 0.6928, + "step": 22370 + }, + { + "epoch": 2.26, + "grad_norm": 4.565497603312442, + "learning_rate": 1.7594779264818783e-06, + "loss": 0.6595, + "step": 22375 + }, + { + "epoch": 2.26, + "grad_norm": 4.856303691873805, + "learning_rate": 1.7572448552681493e-06, + "loss": 0.6576, + "step": 22380 + }, + { + "epoch": 2.26, + "grad_norm": 5.032417280865937, + "learning_rate": 1.7550128998833681e-06, + "loss": 0.6665, + "step": 22385 + }, + { + "epoch": 2.26, + "grad_norm": 11.027154143621729, + "learning_rate": 1.7527820610955487e-06, + "loss": 0.6895, + "step": 22390 + }, + { + "epoch": 2.26, + "grad_norm": 5.220039667640416, + "learning_rate": 1.7505523396723189e-06, + "loss": 0.6352, + "step": 22395 + }, + { + "epoch": 2.26, + "grad_norm": 9.297318647388199, + "learning_rate": 1.7483237363809274e-06, + "loss": 0.6873, + "step": 22400 + }, + { + "epoch": 2.26, + "grad_norm": 4.900496581353949, + "learning_rate": 1.7460962519882324e-06, + "loss": 0.6739, + "step": 22405 + }, + { + "epoch": 2.26, + "grad_norm": 4.656133617830663, + "learning_rate": 1.7438698872607073e-06, + "loss": 0.6538, + "step": 22410 + }, + { + "epoch": 2.26, + "grad_norm": 5.577365486763903, + "learning_rate": 1.7416446429644462e-06, + "loss": 0.6357, + "step": 22415 + }, + { + "epoch": 2.26, + "grad_norm": 5.4964458786704755, + "learning_rate": 1.739420519865151e-06, + "loss": 0.6212, + "step": 22420 + }, + { + "epoch": 2.26, + "grad_norm": 5.718619521205935, + "learning_rate": 1.7371975187281408e-06, + "loss": 0.6586, + "step": 22425 + }, + { + "epoch": 2.26, + "grad_norm": 5.828469851925843, + "learning_rate": 1.7349756403183466e-06, + "loss": 0.6864, + "step": 22430 + }, + { + "epoch": 2.26, + "grad_norm": 4.822043565193145, + "learning_rate": 1.7327548854003174e-06, + "loss": 0.6553, + "step": 22435 + }, + { + "epoch": 2.26, + "grad_norm": 4.742210440858266, + "learning_rate": 1.7305352547382104e-06, + "loss": 0.6594, + "step": 22440 + }, + { + "epoch": 2.26, + "grad_norm": 5.268040093322078, + "learning_rate": 1.7283167490958008e-06, + "loss": 0.704, + "step": 22445 + }, + { + "epoch": 2.26, + "grad_norm": 5.3592022150205665, + "learning_rate": 1.726099369236473e-06, + "loss": 0.655, + "step": 22450 + }, + { + "epoch": 2.26, + "grad_norm": 5.995899113494076, + "learning_rate": 1.7238831159232257e-06, + "loss": 0.6753, + "step": 22455 + }, + { + "epoch": 2.26, + "grad_norm": 4.883862383907867, + "learning_rate": 1.7216679899186672e-06, + "loss": 0.6574, + "step": 22460 + }, + { + "epoch": 2.26, + "grad_norm": 4.639544085369703, + "learning_rate": 1.719453991985024e-06, + "loss": 0.6652, + "step": 22465 + }, + { + "epoch": 2.27, + "grad_norm": 5.225580088013155, + "learning_rate": 1.7172411228841268e-06, + "loss": 0.6618, + "step": 22470 + }, + { + "epoch": 2.27, + "grad_norm": 5.1915848336128985, + "learning_rate": 1.7150293833774257e-06, + "loss": 0.6785, + "step": 22475 + }, + { + "epoch": 2.27, + "grad_norm": 7.4886162606891284, + "learning_rate": 1.7128187742259766e-06, + "loss": 0.6523, + "step": 22480 + }, + { + "epoch": 2.27, + "grad_norm": 5.071153259933611, + "learning_rate": 1.7106092961904476e-06, + "loss": 0.6642, + "step": 22485 + }, + { + "epoch": 2.27, + "grad_norm": 5.821964514546382, + "learning_rate": 1.708400950031116e-06, + "loss": 0.6571, + "step": 22490 + }, + { + "epoch": 2.27, + "grad_norm": 6.567740592588277, + "learning_rate": 1.706193736507875e-06, + "loss": 0.6624, + "step": 22495 + }, + { + "epoch": 2.27, + "grad_norm": 5.930879935653045, + "learning_rate": 1.7039876563802227e-06, + "loss": 0.6651, + "step": 22500 + }, + { + "epoch": 2.27, + "grad_norm": 6.0767573323260935, + "learning_rate": 1.701782710407271e-06, + "loss": 0.7218, + "step": 22505 + }, + { + "epoch": 2.27, + "grad_norm": 4.6670914829207115, + "learning_rate": 1.699578899347738e-06, + "loss": 0.6669, + "step": 22510 + }, + { + "epoch": 2.27, + "grad_norm": 5.849989536360458, + "learning_rate": 1.697376223959954e-06, + "loss": 0.6287, + "step": 22515 + }, + { + "epoch": 2.27, + "grad_norm": 8.789381725465098, + "learning_rate": 1.6951746850018553e-06, + "loss": 0.6648, + "step": 22520 + }, + { + "epoch": 2.27, + "grad_norm": 7.480342356637824, + "learning_rate": 1.6929742832309926e-06, + "loss": 0.6871, + "step": 22525 + }, + { + "epoch": 2.27, + "grad_norm": 8.677024202708074, + "learning_rate": 1.6907750194045187e-06, + "loss": 0.672, + "step": 22530 + }, + { + "epoch": 2.27, + "grad_norm": 4.2619899057185044, + "learning_rate": 1.6885768942792018e-06, + "loss": 0.6492, + "step": 22535 + }, + { + "epoch": 2.27, + "grad_norm": 4.64342507169857, + "learning_rate": 1.6863799086114123e-06, + "loss": 0.6228, + "step": 22540 + }, + { + "epoch": 2.27, + "grad_norm": 4.816052603229862, + "learning_rate": 1.684184063157132e-06, + "loss": 0.6745, + "step": 22545 + }, + { + "epoch": 2.27, + "grad_norm": 5.13577322884629, + "learning_rate": 1.6819893586719482e-06, + "loss": 0.6422, + "step": 22550 + }, + { + "epoch": 2.27, + "grad_norm": 5.261141020345327, + "learning_rate": 1.6797957959110556e-06, + "loss": 0.6444, + "step": 22555 + }, + { + "epoch": 2.27, + "grad_norm": 13.338569930570511, + "learning_rate": 1.6776033756292602e-06, + "loss": 0.6916, + "step": 22560 + }, + { + "epoch": 2.28, + "grad_norm": 8.419978426971964, + "learning_rate": 1.675412098580968e-06, + "loss": 0.6775, + "step": 22565 + }, + { + "epoch": 2.28, + "grad_norm": 5.16182641300474, + "learning_rate": 1.6732219655201992e-06, + "loss": 0.6604, + "step": 22570 + }, + { + "epoch": 2.28, + "grad_norm": 5.597935147747176, + "learning_rate": 1.6710329772005745e-06, + "loss": 0.6568, + "step": 22575 + }, + { + "epoch": 2.28, + "grad_norm": 5.515372501062043, + "learning_rate": 1.668845134375323e-06, + "loss": 0.6796, + "step": 22580 + }, + { + "epoch": 2.28, + "grad_norm": 6.9916103673781, + "learning_rate": 1.6666584377972778e-06, + "loss": 0.6468, + "step": 22585 + }, + { + "epoch": 2.28, + "grad_norm": 5.799949235202191, + "learning_rate": 1.6644728882188816e-06, + "loss": 0.6541, + "step": 22590 + }, + { + "epoch": 2.28, + "grad_norm": 7.06369827344823, + "learning_rate": 1.6622884863921774e-06, + "loss": 0.645, + "step": 22595 + }, + { + "epoch": 2.28, + "grad_norm": 4.824620430723345, + "learning_rate": 1.6601052330688183e-06, + "loss": 0.6786, + "step": 22600 + }, + { + "epoch": 2.28, + "grad_norm": 5.431448980072174, + "learning_rate": 1.657923129000059e-06, + "loss": 0.6947, + "step": 22605 + }, + { + "epoch": 2.28, + "grad_norm": 5.597594890904755, + "learning_rate": 1.6557421749367587e-06, + "loss": 0.7003, + "step": 22610 + }, + { + "epoch": 2.28, + "grad_norm": 4.8169484999390715, + "learning_rate": 1.6535623716293803e-06, + "loss": 0.6554, + "step": 22615 + }, + { + "epoch": 2.28, + "grad_norm": 6.649438638850983, + "learning_rate": 1.6513837198279954e-06, + "loss": 0.6925, + "step": 22620 + }, + { + "epoch": 2.28, + "grad_norm": 8.363151737668417, + "learning_rate": 1.6492062202822724e-06, + "loss": 0.675, + "step": 22625 + }, + { + "epoch": 2.28, + "grad_norm": 6.638961681603949, + "learning_rate": 1.6470298737414902e-06, + "loss": 0.6805, + "step": 22630 + }, + { + "epoch": 2.28, + "grad_norm": 5.301544447481115, + "learning_rate": 1.6448546809545263e-06, + "loss": 0.6685, + "step": 22635 + }, + { + "epoch": 2.28, + "grad_norm": 4.82592619935771, + "learning_rate": 1.6426806426698616e-06, + "loss": 0.6311, + "step": 22640 + }, + { + "epoch": 2.28, + "grad_norm": 4.879463534993098, + "learning_rate": 1.6405077596355802e-06, + "loss": 0.676, + "step": 22645 + }, + { + "epoch": 2.28, + "grad_norm": 4.285753076248275, + "learning_rate": 1.6383360325993714e-06, + "loss": 0.6604, + "step": 22650 + }, + { + "epoch": 2.28, + "grad_norm": 6.806887277989003, + "learning_rate": 1.636165462308521e-06, + "loss": 0.7141, + "step": 22655 + }, + { + "epoch": 2.28, + "grad_norm": 4.653980698418252, + "learning_rate": 1.6339960495099243e-06, + "loss": 0.6709, + "step": 22660 + }, + { + "epoch": 2.29, + "grad_norm": 5.908926619764527, + "learning_rate": 1.6318277949500722e-06, + "loss": 0.6616, + "step": 22665 + }, + { + "epoch": 2.29, + "grad_norm": 6.916726063911858, + "learning_rate": 1.6296606993750585e-06, + "loss": 0.6744, + "step": 22670 + }, + { + "epoch": 2.29, + "grad_norm": 8.085829241125342, + "learning_rate": 1.6274947635305787e-06, + "loss": 0.6666, + "step": 22675 + }, + { + "epoch": 2.29, + "grad_norm": 5.298049943306973, + "learning_rate": 1.6253299881619288e-06, + "loss": 0.6646, + "step": 22680 + }, + { + "epoch": 2.29, + "grad_norm": 4.5751950869327045, + "learning_rate": 1.623166374014008e-06, + "loss": 0.6832, + "step": 22685 + }, + { + "epoch": 2.29, + "grad_norm": 5.355871168560011, + "learning_rate": 1.6210039218313111e-06, + "loss": 0.6712, + "step": 22690 + }, + { + "epoch": 2.29, + "grad_norm": 5.288403929339322, + "learning_rate": 1.6188426323579393e-06, + "loss": 0.6812, + "step": 22695 + }, + { + "epoch": 2.29, + "grad_norm": 4.5701276572794125, + "learning_rate": 1.616682506337588e-06, + "loss": 0.6712, + "step": 22700 + }, + { + "epoch": 2.29, + "grad_norm": 5.251012685634926, + "learning_rate": 1.6145235445135548e-06, + "loss": 0.7031, + "step": 22705 + }, + { + "epoch": 2.29, + "grad_norm": 5.5135520460923795, + "learning_rate": 1.6123657476287351e-06, + "loss": 0.659, + "step": 22710 + }, + { + "epoch": 2.29, + "grad_norm": 4.66950416575332, + "learning_rate": 1.6102091164256274e-06, + "loss": 0.6622, + "step": 22715 + }, + { + "epoch": 2.29, + "grad_norm": 5.036823131575185, + "learning_rate": 1.6080536516463236e-06, + "loss": 0.6851, + "step": 22720 + }, + { + "epoch": 2.29, + "grad_norm": 5.4755189801415325, + "learning_rate": 1.60589935403252e-06, + "loss": 0.6726, + "step": 22725 + }, + { + "epoch": 2.29, + "grad_norm": 6.375815713966879, + "learning_rate": 1.6037462243255075e-06, + "loss": 0.6622, + "step": 22730 + }, + { + "epoch": 2.29, + "grad_norm": 5.399869288391677, + "learning_rate": 1.6015942632661751e-06, + "loss": 0.6638, + "step": 22735 + }, + { + "epoch": 2.29, + "grad_norm": 5.892863779789559, + "learning_rate": 1.5994434715950092e-06, + "loss": 0.6664, + "step": 22740 + }, + { + "epoch": 2.29, + "grad_norm": 10.217792975059123, + "learning_rate": 1.5972938500520985e-06, + "loss": 0.6393, + "step": 22745 + }, + { + "epoch": 2.29, + "grad_norm": 5.600183963459423, + "learning_rate": 1.5951453993771226e-06, + "loss": 0.6497, + "step": 22750 + }, + { + "epoch": 2.29, + "grad_norm": 9.397161807213989, + "learning_rate": 1.5929981203093642e-06, + "loss": 0.6868, + "step": 22755 + }, + { + "epoch": 2.29, + "grad_norm": 6.338230143136408, + "learning_rate": 1.5908520135876981e-06, + "loss": 0.631, + "step": 22760 + }, + { + "epoch": 2.3, + "grad_norm": 9.48096451098916, + "learning_rate": 1.588707079950599e-06, + "loss": 0.6505, + "step": 22765 + }, + { + "epoch": 2.3, + "grad_norm": 4.999293793939398, + "learning_rate": 1.586563320136133e-06, + "loss": 0.637, + "step": 22770 + }, + { + "epoch": 2.3, + "grad_norm": 5.274154137351849, + "learning_rate": 1.5844207348819702e-06, + "loss": 0.6752, + "step": 22775 + }, + { + "epoch": 2.3, + "grad_norm": 6.788575808926672, + "learning_rate": 1.5822793249253687e-06, + "loss": 0.6621, + "step": 22780 + }, + { + "epoch": 2.3, + "grad_norm": 4.857226848457538, + "learning_rate": 1.5801390910031888e-06, + "loss": 0.6832, + "step": 22785 + }, + { + "epoch": 2.3, + "grad_norm": 7.658009144634473, + "learning_rate": 1.5780000338518813e-06, + "loss": 0.6528, + "step": 22790 + }, + { + "epoch": 2.3, + "grad_norm": 16.1108955367705, + "learning_rate": 1.5758621542074942e-06, + "loss": 0.6745, + "step": 22795 + }, + { + "epoch": 2.3, + "grad_norm": 13.716842477092792, + "learning_rate": 1.573725452805669e-06, + "loss": 0.6798, + "step": 22800 + }, + { + "epoch": 2.3, + "grad_norm": 12.84118164022468, + "learning_rate": 1.5715899303816413e-06, + "loss": 0.6628, + "step": 22805 + }, + { + "epoch": 2.3, + "grad_norm": 13.752749266021626, + "learning_rate": 1.569455587670246e-06, + "loss": 0.689, + "step": 22810 + }, + { + "epoch": 2.3, + "grad_norm": 5.108777183409205, + "learning_rate": 1.5673224254059045e-06, + "loss": 0.6336, + "step": 22815 + }, + { + "epoch": 2.3, + "grad_norm": 5.070827902531056, + "learning_rate": 1.5651904443226396e-06, + "loss": 0.6531, + "step": 22820 + }, + { + "epoch": 2.3, + "grad_norm": 6.386136834259939, + "learning_rate": 1.563059645154062e-06, + "loss": 0.6565, + "step": 22825 + }, + { + "epoch": 2.3, + "grad_norm": 7.73848076802254, + "learning_rate": 1.5609300286333785e-06, + "loss": 0.637, + "step": 22830 + }, + { + "epoch": 2.3, + "grad_norm": 4.383849619388004, + "learning_rate": 1.558801595493385e-06, + "loss": 0.6544, + "step": 22835 + }, + { + "epoch": 2.3, + "grad_norm": 4.528671138607547, + "learning_rate": 1.556674346466478e-06, + "loss": 0.6391, + "step": 22840 + }, + { + "epoch": 2.3, + "grad_norm": 7.863716125361647, + "learning_rate": 1.5545482822846375e-06, + "loss": 0.6687, + "step": 22845 + }, + { + "epoch": 2.3, + "grad_norm": 10.416901948032352, + "learning_rate": 1.5524234036794443e-06, + "loss": 0.6686, + "step": 22850 + }, + { + "epoch": 2.3, + "grad_norm": 12.080893725057612, + "learning_rate": 1.5502997113820651e-06, + "loss": 0.6806, + "step": 22855 + }, + { + "epoch": 2.3, + "grad_norm": 12.492772940957037, + "learning_rate": 1.5481772061232602e-06, + "loss": 0.6668, + "step": 22860 + }, + { + "epoch": 2.31, + "grad_norm": 6.6746567293411605, + "learning_rate": 1.5460558886333799e-06, + "loss": 0.6356, + "step": 22865 + }, + { + "epoch": 2.31, + "grad_norm": 5.517073475954265, + "learning_rate": 1.5439357596423715e-06, + "loss": 0.6996, + "step": 22870 + }, + { + "epoch": 2.31, + "grad_norm": 4.808254989184655, + "learning_rate": 1.5418168198797656e-06, + "loss": 0.664, + "step": 22875 + }, + { + "epoch": 2.31, + "grad_norm": 4.489570597294558, + "learning_rate": 1.5396990700746906e-06, + "loss": 0.6744, + "step": 22880 + }, + { + "epoch": 2.31, + "grad_norm": 5.180070947431376, + "learning_rate": 1.5375825109558606e-06, + "loss": 0.6738, + "step": 22885 + }, + { + "epoch": 2.31, + "grad_norm": 4.799182335651492, + "learning_rate": 1.5354671432515816e-06, + "loss": 0.6175, + "step": 22890 + }, + { + "epoch": 2.31, + "grad_norm": 4.500868500253037, + "learning_rate": 1.533352967689748e-06, + "loss": 0.6295, + "step": 22895 + }, + { + "epoch": 2.31, + "grad_norm": 4.943229695166293, + "learning_rate": 1.5312399849978483e-06, + "loss": 0.6333, + "step": 22900 + }, + { + "epoch": 2.31, + "grad_norm": 7.070154327116882, + "learning_rate": 1.5291281959029558e-06, + "loss": 0.6425, + "step": 22905 + }, + { + "epoch": 2.31, + "grad_norm": 4.857751537719575, + "learning_rate": 1.5270176011317373e-06, + "loss": 0.6621, + "step": 22910 + }, + { + "epoch": 2.31, + "grad_norm": 4.647239303953281, + "learning_rate": 1.5249082014104455e-06, + "loss": 0.646, + "step": 22915 + }, + { + "epoch": 2.31, + "grad_norm": 4.693460297092233, + "learning_rate": 1.5227999974649221e-06, + "loss": 0.6826, + "step": 22920 + }, + { + "epoch": 2.31, + "grad_norm": 4.809710792652713, + "learning_rate": 1.5206929900205987e-06, + "loss": 0.637, + "step": 22925 + }, + { + "epoch": 2.31, + "grad_norm": 6.580585052715756, + "learning_rate": 1.5185871798024925e-06, + "loss": 0.6762, + "step": 22930 + }, + { + "epoch": 2.31, + "grad_norm": 4.500498281272657, + "learning_rate": 1.516482567535214e-06, + "loss": 0.6469, + "step": 22935 + }, + { + "epoch": 2.31, + "grad_norm": 4.953886299364294, + "learning_rate": 1.514379153942956e-06, + "loss": 0.6251, + "step": 22940 + }, + { + "epoch": 2.31, + "grad_norm": 4.8641256553373, + "learning_rate": 1.5122769397495047e-06, + "loss": 0.6595, + "step": 22945 + }, + { + "epoch": 2.31, + "grad_norm": 4.563681811327853, + "learning_rate": 1.510175925678224e-06, + "loss": 0.6188, + "step": 22950 + }, + { + "epoch": 2.31, + "grad_norm": 5.174693521988725, + "learning_rate": 1.508076112452076e-06, + "loss": 0.6701, + "step": 22955 + }, + { + "epoch": 2.31, + "grad_norm": 4.501701102219561, + "learning_rate": 1.5059775007936006e-06, + "loss": 0.682, + "step": 22960 + }, + { + "epoch": 2.32, + "grad_norm": 5.772847198348246, + "learning_rate": 1.5038800914249319e-06, + "loss": 0.6474, + "step": 22965 + }, + { + "epoch": 2.32, + "grad_norm": 4.9431492869509865, + "learning_rate": 1.501783885067783e-06, + "loss": 0.6448, + "step": 22970 + }, + { + "epoch": 2.32, + "grad_norm": 5.472881197234141, + "learning_rate": 1.4996888824434613e-06, + "loss": 0.6552, + "step": 22975 + }, + { + "epoch": 2.32, + "grad_norm": 5.316621141713757, + "learning_rate": 1.4975950842728482e-06, + "loss": 0.6278, + "step": 22980 + }, + { + "epoch": 2.32, + "grad_norm": 4.813533380241714, + "learning_rate": 1.4955024912764239e-06, + "loss": 0.6656, + "step": 22985 + }, + { + "epoch": 2.32, + "grad_norm": 4.427506467377018, + "learning_rate": 1.4934111041742427e-06, + "loss": 0.629, + "step": 22990 + }, + { + "epoch": 2.32, + "grad_norm": 4.177308682078762, + "learning_rate": 1.4913209236859533e-06, + "loss": 0.6516, + "step": 22995 + }, + { + "epoch": 2.32, + "grad_norm": 4.602772286736399, + "learning_rate": 1.4892319505307817e-06, + "loss": 0.6848, + "step": 23000 + }, + { + "epoch": 2.32, + "grad_norm": 5.021566911989718, + "learning_rate": 1.4871441854275454e-06, + "loss": 0.6633, + "step": 23005 + }, + { + "epoch": 2.32, + "grad_norm": 5.1205395441480155, + "learning_rate": 1.4850576290946368e-06, + "loss": 0.6336, + "step": 23010 + }, + { + "epoch": 2.32, + "grad_norm": 5.050517972420797, + "learning_rate": 1.4829722822500425e-06, + "loss": 0.6554, + "step": 23015 + }, + { + "epoch": 2.32, + "grad_norm": 5.489818487186654, + "learning_rate": 1.480888145611325e-06, + "loss": 0.6831, + "step": 23020 + }, + { + "epoch": 2.32, + "grad_norm": 4.368923565018206, + "learning_rate": 1.478805219895637e-06, + "loss": 0.6764, + "step": 23025 + }, + { + "epoch": 2.32, + "grad_norm": 5.585422569645293, + "learning_rate": 1.4767235058197083e-06, + "loss": 0.6559, + "step": 23030 + }, + { + "epoch": 2.32, + "grad_norm": 7.324264673646722, + "learning_rate": 1.474643004099857e-06, + "loss": 0.6808, + "step": 23035 + }, + { + "epoch": 2.32, + "grad_norm": 5.070237790985046, + "learning_rate": 1.4725637154519817e-06, + "loss": 0.6641, + "step": 23040 + }, + { + "epoch": 2.32, + "grad_norm": 5.047696957897613, + "learning_rate": 1.4704856405915629e-06, + "loss": 0.6386, + "step": 23045 + }, + { + "epoch": 2.32, + "grad_norm": 4.5509977864925, + "learning_rate": 1.4684087802336639e-06, + "loss": 0.6647, + "step": 23050 + }, + { + "epoch": 2.32, + "grad_norm": 5.120281916761617, + "learning_rate": 1.466333135092929e-06, + "loss": 0.6543, + "step": 23055 + }, + { + "epoch": 2.32, + "grad_norm": 4.583061331252208, + "learning_rate": 1.4642587058835883e-06, + "loss": 0.6635, + "step": 23060 + }, + { + "epoch": 2.33, + "grad_norm": 7.757596000491188, + "learning_rate": 1.4621854933194491e-06, + "loss": 0.6824, + "step": 23065 + }, + { + "epoch": 2.33, + "grad_norm": 4.383801982904171, + "learning_rate": 1.4601134981139054e-06, + "loss": 0.6578, + "step": 23070 + }, + { + "epoch": 2.33, + "grad_norm": 5.3470348884541306, + "learning_rate": 1.4580427209799231e-06, + "loss": 0.6928, + "step": 23075 + }, + { + "epoch": 2.33, + "grad_norm": 5.149969443706589, + "learning_rate": 1.4559731626300589e-06, + "loss": 0.6617, + "step": 23080 + }, + { + "epoch": 2.33, + "grad_norm": 5.284858096905827, + "learning_rate": 1.4539048237764431e-06, + "loss": 0.6275, + "step": 23085 + }, + { + "epoch": 2.33, + "grad_norm": 4.693178152027507, + "learning_rate": 1.451837705130792e-06, + "loss": 0.6713, + "step": 23090 + }, + { + "epoch": 2.33, + "grad_norm": 5.203824484556633, + "learning_rate": 1.449771807404396e-06, + "loss": 0.6607, + "step": 23095 + }, + { + "epoch": 2.33, + "grad_norm": 5.086803924453811, + "learning_rate": 1.447707131308133e-06, + "loss": 0.7089, + "step": 23100 + }, + { + "epoch": 2.33, + "grad_norm": 5.049690406164953, + "learning_rate": 1.4456436775524507e-06, + "loss": 0.7033, + "step": 23105 + }, + { + "epoch": 2.33, + "grad_norm": 4.761020015525577, + "learning_rate": 1.4435814468473858e-06, + "loss": 0.6751, + "step": 23110 + }, + { + "epoch": 2.33, + "grad_norm": 9.406705510139943, + "learning_rate": 1.4415204399025462e-06, + "loss": 0.631, + "step": 23115 + }, + { + "epoch": 2.33, + "grad_norm": 5.127818797007829, + "learning_rate": 1.4394606574271264e-06, + "loss": 0.6809, + "step": 23120 + }, + { + "epoch": 2.33, + "grad_norm": 7.045815118104132, + "learning_rate": 1.437402100129892e-06, + "loss": 0.6879, + "step": 23125 + }, + { + "epoch": 2.33, + "grad_norm": 5.939354889411304, + "learning_rate": 1.4353447687191958e-06, + "loss": 0.6896, + "step": 23130 + }, + { + "epoch": 2.33, + "grad_norm": 5.292790763207718, + "learning_rate": 1.4332886639029564e-06, + "loss": 0.7029, + "step": 23135 + }, + { + "epoch": 2.33, + "grad_norm": 5.154024926520374, + "learning_rate": 1.4312337863886832e-06, + "loss": 0.6413, + "step": 23140 + }, + { + "epoch": 2.33, + "grad_norm": 4.647242944147119, + "learning_rate": 1.4291801368834534e-06, + "loss": 0.6296, + "step": 23145 + }, + { + "epoch": 2.33, + "grad_norm": 6.41781643535116, + "learning_rate": 1.42712771609393e-06, + "loss": 0.6493, + "step": 23150 + }, + { + "epoch": 2.33, + "grad_norm": 6.03442067665093, + "learning_rate": 1.4250765247263455e-06, + "loss": 0.6604, + "step": 23155 + }, + { + "epoch": 2.34, + "grad_norm": 4.550340133803211, + "learning_rate": 1.423026563486517e-06, + "loss": 0.6379, + "step": 23160 + }, + { + "epoch": 2.34, + "grad_norm": 5.03642193203217, + "learning_rate": 1.420977833079829e-06, + "loss": 0.6574, + "step": 23165 + }, + { + "epoch": 2.34, + "grad_norm": 5.5802584153495705, + "learning_rate": 1.4189303342112515e-06, + "loss": 0.6574, + "step": 23170 + }, + { + "epoch": 2.34, + "grad_norm": 4.571903557702888, + "learning_rate": 1.416884067585324e-06, + "loss": 0.6682, + "step": 23175 + }, + { + "epoch": 2.34, + "grad_norm": 4.723195708364407, + "learning_rate": 1.4148390339061686e-06, + "loss": 0.6845, + "step": 23180 + }, + { + "epoch": 2.34, + "grad_norm": 5.344770915417063, + "learning_rate": 1.412795233877477e-06, + "loss": 0.6471, + "step": 23185 + }, + { + "epoch": 2.34, + "grad_norm": 5.694346065527504, + "learning_rate": 1.4107526682025196e-06, + "loss": 0.6362, + "step": 23190 + }, + { + "epoch": 2.34, + "grad_norm": 4.6656465393345625, + "learning_rate": 1.4087113375841405e-06, + "loss": 0.626, + "step": 23195 + }, + { + "epoch": 2.34, + "grad_norm": 7.125473741431489, + "learning_rate": 1.4066712427247585e-06, + "loss": 0.65, + "step": 23200 + }, + { + "epoch": 2.34, + "grad_norm": 5.056846590080103, + "learning_rate": 1.404632384326372e-06, + "loss": 0.6943, + "step": 23205 + }, + { + "epoch": 2.34, + "grad_norm": 4.925503705896461, + "learning_rate": 1.4025947630905462e-06, + "loss": 0.6557, + "step": 23210 + }, + { + "epoch": 2.34, + "grad_norm": 6.065099671118211, + "learning_rate": 1.400558379718428e-06, + "loss": 0.7129, + "step": 23215 + }, + { + "epoch": 2.34, + "grad_norm": 5.1811074220151765, + "learning_rate": 1.3985232349107337e-06, + "loss": 0.6796, + "step": 23220 + }, + { + "epoch": 2.34, + "grad_norm": 4.636562868475947, + "learning_rate": 1.3964893293677544e-06, + "loss": 0.6606, + "step": 23225 + }, + { + "epoch": 2.34, + "grad_norm": 5.413028304997516, + "learning_rate": 1.3944566637893532e-06, + "loss": 0.667, + "step": 23230 + }, + { + "epoch": 2.34, + "grad_norm": 5.872599984362106, + "learning_rate": 1.392425238874972e-06, + "loss": 0.6637, + "step": 23235 + }, + { + "epoch": 2.34, + "grad_norm": 6.7251617213033485, + "learning_rate": 1.3903950553236195e-06, + "loss": 0.6672, + "step": 23240 + }, + { + "epoch": 2.34, + "grad_norm": 6.3003296095887995, + "learning_rate": 1.3883661138338816e-06, + "loss": 0.6466, + "step": 23245 + }, + { + "epoch": 2.34, + "grad_norm": 5.494566292087462, + "learning_rate": 1.386338415103915e-06, + "loss": 0.6855, + "step": 23250 + }, + { + "epoch": 2.34, + "grad_norm": 5.15293869738178, + "learning_rate": 1.384311959831448e-06, + "loss": 0.6595, + "step": 23255 + }, + { + "epoch": 2.35, + "grad_norm": 4.260877143165857, + "learning_rate": 1.3822867487137808e-06, + "loss": 0.6825, + "step": 23260 + }, + { + "epoch": 2.35, + "grad_norm": 4.584211338372017, + "learning_rate": 1.38026278244779e-06, + "loss": 0.6439, + "step": 23265 + }, + { + "epoch": 2.35, + "grad_norm": 7.742745689669459, + "learning_rate": 1.3782400617299164e-06, + "loss": 0.6723, + "step": 23270 + }, + { + "epoch": 2.35, + "grad_norm": 4.726444616139492, + "learning_rate": 1.37621858725618e-06, + "loss": 0.6351, + "step": 23275 + }, + { + "epoch": 2.35, + "grad_norm": 4.850540870101814, + "learning_rate": 1.374198359722167e-06, + "loss": 0.6377, + "step": 23280 + }, + { + "epoch": 2.35, + "grad_norm": 4.95117451315653, + "learning_rate": 1.3721793798230348e-06, + "loss": 0.6468, + "step": 23285 + }, + { + "epoch": 2.35, + "grad_norm": 5.481108016903852, + "learning_rate": 1.370161648253512e-06, + "loss": 0.6457, + "step": 23290 + }, + { + "epoch": 2.35, + "grad_norm": 4.867437599763508, + "learning_rate": 1.3681451657079003e-06, + "loss": 0.7072, + "step": 23295 + }, + { + "epoch": 2.35, + "grad_norm": 5.348183075500717, + "learning_rate": 1.3661299328800671e-06, + "loss": 0.6474, + "step": 23300 + }, + { + "epoch": 2.35, + "grad_norm": 4.566750840451449, + "learning_rate": 1.3641159504634543e-06, + "loss": 0.6517, + "step": 23305 + }, + { + "epoch": 2.35, + "grad_norm": 4.349563812986563, + "learning_rate": 1.3621032191510708e-06, + "loss": 0.6582, + "step": 23310 + }, + { + "epoch": 2.35, + "grad_norm": 5.230110035485306, + "learning_rate": 1.3600917396354946e-06, + "loss": 0.6464, + "step": 23315 + }, + { + "epoch": 2.35, + "grad_norm": 10.85878257847084, + "learning_rate": 1.3580815126088742e-06, + "loss": 0.6586, + "step": 23320 + }, + { + "epoch": 2.35, + "grad_norm": 11.345914802277472, + "learning_rate": 1.356072538762926e-06, + "loss": 0.643, + "step": 23325 + }, + { + "epoch": 2.35, + "grad_norm": 4.8404459088455924, + "learning_rate": 1.3540648187889377e-06, + "loss": 0.6631, + "step": 23330 + }, + { + "epoch": 2.35, + "grad_norm": 4.5794802817734865, + "learning_rate": 1.352058353377762e-06, + "loss": 0.6473, + "step": 23335 + }, + { + "epoch": 2.35, + "grad_norm": 5.9389158480385, + "learning_rate": 1.3500531432198239e-06, + "loss": 0.6806, + "step": 23340 + }, + { + "epoch": 2.35, + "grad_norm": 6.3870434258097735, + "learning_rate": 1.3480491890051128e-06, + "loss": 0.6266, + "step": 23345 + }, + { + "epoch": 2.35, + "grad_norm": 4.826035233529999, + "learning_rate": 1.3460464914231886e-06, + "loss": 0.6739, + "step": 23350 + }, + { + "epoch": 2.35, + "grad_norm": 4.647906550361867, + "learning_rate": 1.3440450511631748e-06, + "loss": 0.669, + "step": 23355 + }, + { + "epoch": 2.36, + "grad_norm": 4.637883983239464, + "learning_rate": 1.3420448689137683e-06, + "loss": 0.6564, + "step": 23360 + }, + { + "epoch": 2.36, + "grad_norm": 5.5577202461210335, + "learning_rate": 1.3400459453632274e-06, + "loss": 0.6824, + "step": 23365 + }, + { + "epoch": 2.36, + "grad_norm": 4.9500063154271485, + "learning_rate": 1.3380482811993827e-06, + "loss": 0.6521, + "step": 23370 + }, + { + "epoch": 2.36, + "grad_norm": 5.150985099315612, + "learning_rate": 1.3360518771096264e-06, + "loss": 0.6485, + "step": 23375 + }, + { + "epoch": 2.36, + "grad_norm": 5.006946347892576, + "learning_rate": 1.3340567337809203e-06, + "loss": 0.6921, + "step": 23380 + }, + { + "epoch": 2.36, + "grad_norm": 5.375026650050526, + "learning_rate": 1.332062851899789e-06, + "loss": 0.6219, + "step": 23385 + }, + { + "epoch": 2.36, + "grad_norm": 4.841278654003525, + "learning_rate": 1.3300702321523285e-06, + "loss": 0.6747, + "step": 23390 + }, + { + "epoch": 2.36, + "grad_norm": 4.856888018203015, + "learning_rate": 1.3280788752241945e-06, + "loss": 0.6781, + "step": 23395 + }, + { + "epoch": 2.36, + "grad_norm": 5.087734511284156, + "learning_rate": 1.326088781800614e-06, + "loss": 0.7146, + "step": 23400 + }, + { + "epoch": 2.36, + "grad_norm": 4.674249998453914, + "learning_rate": 1.3240999525663745e-06, + "loss": 0.6532, + "step": 23405 + }, + { + "epoch": 2.36, + "grad_norm": 7.303622485507553, + "learning_rate": 1.3221123882058308e-06, + "loss": 0.6753, + "step": 23410 + }, + { + "epoch": 2.36, + "grad_norm": 6.834287356147514, + "learning_rate": 1.3201260894028994e-06, + "loss": 0.6871, + "step": 23415 + }, + { + "epoch": 2.36, + "grad_norm": 4.456355924902777, + "learning_rate": 1.318141056841068e-06, + "loss": 0.6214, + "step": 23420 + }, + { + "epoch": 2.36, + "grad_norm": 5.193507840480104, + "learning_rate": 1.3161572912033804e-06, + "loss": 0.6692, + "step": 23425 + }, + { + "epoch": 2.36, + "grad_norm": 5.216069177319419, + "learning_rate": 1.3141747931724525e-06, + "loss": 0.6571, + "step": 23430 + }, + { + "epoch": 2.36, + "grad_norm": 5.731333771254167, + "learning_rate": 1.3121935634304579e-06, + "loss": 0.6906, + "step": 23435 + }, + { + "epoch": 2.36, + "grad_norm": 4.787135408765722, + "learning_rate": 1.310213602659135e-06, + "loss": 0.6366, + "step": 23440 + }, + { + "epoch": 2.36, + "grad_norm": 5.078790304953358, + "learning_rate": 1.308234911539788e-06, + "loss": 0.6694, + "step": 23445 + }, + { + "epoch": 2.36, + "grad_norm": 4.6198193220233525, + "learning_rate": 1.3062574907532804e-06, + "loss": 0.6594, + "step": 23450 + }, + { + "epoch": 2.36, + "grad_norm": 5.8832634578953735, + "learning_rate": 1.3042813409800436e-06, + "loss": 0.6642, + "step": 23455 + }, + { + "epoch": 2.37, + "grad_norm": 4.567256783115041, + "learning_rate": 1.3023064629000658e-06, + "loss": 0.6852, + "step": 23460 + }, + { + "epoch": 2.37, + "grad_norm": 6.4284491630095015, + "learning_rate": 1.3003328571929047e-06, + "loss": 0.6544, + "step": 23465 + }, + { + "epoch": 2.37, + "grad_norm": 4.960176754434972, + "learning_rate": 1.2983605245376729e-06, + "loss": 0.6995, + "step": 23470 + }, + { + "epoch": 2.37, + "grad_norm": 5.14495847600321, + "learning_rate": 1.2963894656130498e-06, + "loss": 0.6466, + "step": 23475 + }, + { + "epoch": 2.37, + "grad_norm": 5.411771484130297, + "learning_rate": 1.2944196810972726e-06, + "loss": 0.6451, + "step": 23480 + }, + { + "epoch": 2.37, + "grad_norm": 6.403061557609689, + "learning_rate": 1.2924511716681454e-06, + "loss": 0.6707, + "step": 23485 + }, + { + "epoch": 2.37, + "grad_norm": 9.665497217526969, + "learning_rate": 1.290483938003027e-06, + "loss": 0.6474, + "step": 23490 + }, + { + "epoch": 2.37, + "grad_norm": 6.353025034488214, + "learning_rate": 1.2885179807788445e-06, + "loss": 0.6332, + "step": 23495 + }, + { + "epoch": 2.37, + "grad_norm": 4.23222585755923, + "learning_rate": 1.2865533006720799e-06, + "loss": 0.6703, + "step": 23500 + }, + { + "epoch": 2.37, + "grad_norm": 10.189597357989905, + "learning_rate": 1.284589898358778e-06, + "loss": 0.6521, + "step": 23505 + }, + { + "epoch": 2.37, + "grad_norm": 5.614627502483765, + "learning_rate": 1.2826277745145416e-06, + "loss": 0.6625, + "step": 23510 + }, + { + "epoch": 2.37, + "grad_norm": 6.686704893298934, + "learning_rate": 1.2806669298145391e-06, + "loss": 0.6506, + "step": 23515 + }, + { + "epoch": 2.37, + "grad_norm": 4.7921094655898955, + "learning_rate": 1.2787073649334919e-06, + "loss": 0.679, + "step": 23520 + }, + { + "epoch": 2.37, + "grad_norm": 6.666328142877876, + "learning_rate": 1.2767490805456878e-06, + "loss": 0.6433, + "step": 23525 + }, + { + "epoch": 2.37, + "grad_norm": 5.584125934385775, + "learning_rate": 1.2747920773249689e-06, + "loss": 0.6884, + "step": 23530 + }, + { + "epoch": 2.37, + "grad_norm": 7.078045665988331, + "learning_rate": 1.2728363559447388e-06, + "loss": 0.6599, + "step": 23535 + }, + { + "epoch": 2.37, + "grad_norm": 4.828946933390636, + "learning_rate": 1.2708819170779568e-06, + "loss": 0.6351, + "step": 23540 + }, + { + "epoch": 2.37, + "grad_norm": 4.93210162599946, + "learning_rate": 1.2689287613971467e-06, + "loss": 0.6515, + "step": 23545 + }, + { + "epoch": 2.37, + "grad_norm": 6.907369932396423, + "learning_rate": 1.266976889574385e-06, + "loss": 0.6519, + "step": 23550 + }, + { + "epoch": 2.37, + "grad_norm": 4.496370017842469, + "learning_rate": 1.265026302281312e-06, + "loss": 0.6496, + "step": 23555 + }, + { + "epoch": 2.38, + "grad_norm": 4.370476330349814, + "learning_rate": 1.2630770001891207e-06, + "loss": 0.6559, + "step": 23560 + }, + { + "epoch": 2.38, + "grad_norm": 4.738527745570117, + "learning_rate": 1.2611289839685643e-06, + "loss": 0.652, + "step": 23565 + }, + { + "epoch": 2.38, + "grad_norm": 4.5919750707047156, + "learning_rate": 1.2591822542899535e-06, + "loss": 0.6782, + "step": 23570 + }, + { + "epoch": 2.38, + "grad_norm": 6.649660615172721, + "learning_rate": 1.257236811823155e-06, + "loss": 0.6538, + "step": 23575 + }, + { + "epoch": 2.38, + "grad_norm": 4.520625159182393, + "learning_rate": 1.2552926572375957e-06, + "loss": 0.6689, + "step": 23580 + }, + { + "epoch": 2.38, + "grad_norm": 6.9033900446678516, + "learning_rate": 1.2533497912022558e-06, + "loss": 0.6437, + "step": 23585 + }, + { + "epoch": 2.38, + "grad_norm": 9.766767179662107, + "learning_rate": 1.2514082143856748e-06, + "loss": 0.6579, + "step": 23590 + }, + { + "epoch": 2.38, + "grad_norm": 6.433224776492155, + "learning_rate": 1.2494679274559473e-06, + "loss": 0.6443, + "step": 23595 + }, + { + "epoch": 2.38, + "grad_norm": 9.979552939552086, + "learning_rate": 1.2475289310807243e-06, + "loss": 0.6721, + "step": 23600 + }, + { + "epoch": 2.38, + "grad_norm": 15.67763557490779, + "learning_rate": 1.2455912259272101e-06, + "loss": 0.6534, + "step": 23605 + }, + { + "epoch": 2.38, + "grad_norm": 6.332743212638864, + "learning_rate": 1.2436548126621706e-06, + "loss": 0.6616, + "step": 23610 + }, + { + "epoch": 2.38, + "grad_norm": 7.280536435536539, + "learning_rate": 1.241719691951921e-06, + "loss": 0.6559, + "step": 23615 + }, + { + "epoch": 2.38, + "grad_norm": 5.5144024446355715, + "learning_rate": 1.2397858644623372e-06, + "loss": 0.6806, + "step": 23620 + }, + { + "epoch": 2.38, + "grad_norm": 6.825569952036882, + "learning_rate": 1.2378533308588465e-06, + "loss": 0.6852, + "step": 23625 + }, + { + "epoch": 2.38, + "grad_norm": 5.720916702390011, + "learning_rate": 1.2359220918064307e-06, + "loss": 0.642, + "step": 23630 + }, + { + "epoch": 2.38, + "grad_norm": 4.798257914773307, + "learning_rate": 1.2339921479696272e-06, + "loss": 0.6712, + "step": 23635 + }, + { + "epoch": 2.38, + "grad_norm": 5.585871975942351, + "learning_rate": 1.2320635000125297e-06, + "loss": 0.6744, + "step": 23640 + }, + { + "epoch": 2.38, + "grad_norm": 6.093392040206783, + "learning_rate": 1.2301361485987817e-06, + "loss": 0.6983, + "step": 23645 + }, + { + "epoch": 2.38, + "grad_norm": 6.9083491018896055, + "learning_rate": 1.2282100943915854e-06, + "loss": 0.7137, + "step": 23650 + }, + { + "epoch": 2.38, + "grad_norm": 5.209742991514805, + "learning_rate": 1.2262853380536938e-06, + "loss": 0.66, + "step": 23655 + }, + { + "epoch": 2.39, + "grad_norm": 4.855657115362898, + "learning_rate": 1.2243618802474127e-06, + "loss": 0.6402, + "step": 23660 + }, + { + "epoch": 2.39, + "grad_norm": 5.611373859685703, + "learning_rate": 1.2224397216346012e-06, + "loss": 0.6706, + "step": 23665 + }, + { + "epoch": 2.39, + "grad_norm": 4.840925762847756, + "learning_rate": 1.2205188628766745e-06, + "loss": 0.6681, + "step": 23670 + }, + { + "epoch": 2.39, + "grad_norm": 5.042033497059596, + "learning_rate": 1.2185993046345961e-06, + "loss": 0.6468, + "step": 23675 + }, + { + "epoch": 2.39, + "grad_norm": 4.79871160771968, + "learning_rate": 1.2166810475688867e-06, + "loss": 0.6741, + "step": 23680 + }, + { + "epoch": 2.39, + "grad_norm": 5.105323448947047, + "learning_rate": 1.214764092339616e-06, + "loss": 0.6735, + "step": 23685 + }, + { + "epoch": 2.39, + "grad_norm": 4.891121972055124, + "learning_rate": 1.212848439606405e-06, + "loss": 0.6561, + "step": 23690 + }, + { + "epoch": 2.39, + "grad_norm": 6.4686279997552765, + "learning_rate": 1.2109340900284289e-06, + "loss": 0.6541, + "step": 23695 + }, + { + "epoch": 2.39, + "grad_norm": 5.01156044355106, + "learning_rate": 1.2090210442644124e-06, + "loss": 0.668, + "step": 23700 + }, + { + "epoch": 2.39, + "grad_norm": 5.618000263976089, + "learning_rate": 1.2071093029726343e-06, + "loss": 0.6481, + "step": 23705 + }, + { + "epoch": 2.39, + "grad_norm": 5.163482808177356, + "learning_rate": 1.205198866810922e-06, + "loss": 0.6409, + "step": 23710 + }, + { + "epoch": 2.39, + "grad_norm": 4.790103543725066, + "learning_rate": 1.2032897364366563e-06, + "loss": 0.6461, + "step": 23715 + }, + { + "epoch": 2.39, + "grad_norm": 5.230244969450162, + "learning_rate": 1.2013819125067661e-06, + "loss": 0.6942, + "step": 23720 + }, + { + "epoch": 2.39, + "grad_norm": 4.56863451913176, + "learning_rate": 1.199475395677731e-06, + "loss": 0.6497, + "step": 23725 + }, + { + "epoch": 2.39, + "grad_norm": 4.490697737307678, + "learning_rate": 1.1975701866055816e-06, + "loss": 0.6548, + "step": 23730 + }, + { + "epoch": 2.39, + "grad_norm": 5.674980329651185, + "learning_rate": 1.1956662859459006e-06, + "loss": 0.6524, + "step": 23735 + }, + { + "epoch": 2.39, + "grad_norm": 9.130828897789995, + "learning_rate": 1.1937636943538155e-06, + "loss": 0.6814, + "step": 23740 + }, + { + "epoch": 2.39, + "grad_norm": 5.096759628684305, + "learning_rate": 1.1918624124840094e-06, + "loss": 0.6951, + "step": 23745 + }, + { + "epoch": 2.39, + "grad_norm": 4.732147767802345, + "learning_rate": 1.18996244099071e-06, + "loss": 0.6996, + "step": 23750 + }, + { + "epoch": 2.4, + "grad_norm": 4.852807030794389, + "learning_rate": 1.1880637805276956e-06, + "loss": 0.6602, + "step": 23755 + }, + { + "epoch": 2.4, + "grad_norm": 7.124165341584474, + "learning_rate": 1.1861664317482918e-06, + "loss": 0.6541, + "step": 23760 + }, + { + "epoch": 2.4, + "grad_norm": 6.067019003192417, + "learning_rate": 1.184270395305378e-06, + "loss": 0.6576, + "step": 23765 + }, + { + "epoch": 2.4, + "grad_norm": 6.195958907393889, + "learning_rate": 1.182375671851375e-06, + "loss": 0.6619, + "step": 23770 + }, + { + "epoch": 2.4, + "grad_norm": 5.072963402580256, + "learning_rate": 1.1804822620382584e-06, + "loss": 0.6233, + "step": 23775 + }, + { + "epoch": 2.4, + "grad_norm": 5.2949230601384825, + "learning_rate": 1.1785901665175474e-06, + "loss": 0.6862, + "step": 23780 + }, + { + "epoch": 2.4, + "grad_norm": 5.679220448866176, + "learning_rate": 1.1766993859403109e-06, + "loss": 0.6397, + "step": 23785 + }, + { + "epoch": 2.4, + "grad_norm": 6.529126893550042, + "learning_rate": 1.1748099209571618e-06, + "loss": 0.6602, + "step": 23790 + }, + { + "epoch": 2.4, + "grad_norm": 7.105973681550202, + "learning_rate": 1.1729217722182673e-06, + "loss": 0.6719, + "step": 23795 + }, + { + "epoch": 2.4, + "grad_norm": 7.280664552972269, + "learning_rate": 1.171034940373334e-06, + "loss": 0.6734, + "step": 23800 + }, + { + "epoch": 2.4, + "grad_norm": 5.513459079235484, + "learning_rate": 1.1691494260716225e-06, + "loss": 0.6807, + "step": 23805 + }, + { + "epoch": 2.4, + "grad_norm": 7.795157957580197, + "learning_rate": 1.1672652299619342e-06, + "loss": 0.6232, + "step": 23810 + }, + { + "epoch": 2.4, + "grad_norm": 8.094072639673021, + "learning_rate": 1.1653823526926195e-06, + "loss": 0.6686, + "step": 23815 + }, + { + "epoch": 2.4, + "grad_norm": 5.185770027064905, + "learning_rate": 1.163500794911575e-06, + "loss": 0.6832, + "step": 23820 + }, + { + "epoch": 2.4, + "grad_norm": 5.688434083755331, + "learning_rate": 1.1616205572662415e-06, + "loss": 0.6613, + "step": 23825 + }, + { + "epoch": 2.4, + "grad_norm": 5.044012018706139, + "learning_rate": 1.1597416404036095e-06, + "loss": 0.7053, + "step": 23830 + }, + { + "epoch": 2.4, + "grad_norm": 4.448281320498033, + "learning_rate": 1.1578640449702095e-06, + "loss": 0.6427, + "step": 23835 + }, + { + "epoch": 2.4, + "grad_norm": 4.846487787867139, + "learning_rate": 1.1559877716121232e-06, + "loss": 0.6528, + "step": 23840 + }, + { + "epoch": 2.4, + "grad_norm": 4.610950244909516, + "learning_rate": 1.1541128209749735e-06, + "loss": 0.6585, + "step": 23845 + }, + { + "epoch": 2.4, + "grad_norm": 4.877553176280549, + "learning_rate": 1.1522391937039284e-06, + "loss": 0.6696, + "step": 23850 + }, + { + "epoch": 2.41, + "grad_norm": 5.477651738532338, + "learning_rate": 1.150366890443701e-06, + "loss": 0.677, + "step": 23855 + }, + { + "epoch": 2.41, + "grad_norm": 4.863572635163428, + "learning_rate": 1.1484959118385507e-06, + "loss": 0.6449, + "step": 23860 + }, + { + "epoch": 2.41, + "grad_norm": 4.7215699626198155, + "learning_rate": 1.1466262585322773e-06, + "loss": 0.6645, + "step": 23865 + }, + { + "epoch": 2.41, + "grad_norm": 8.375682442605772, + "learning_rate": 1.1447579311682294e-06, + "loss": 0.7003, + "step": 23870 + }, + { + "epoch": 2.41, + "grad_norm": 5.609454163336808, + "learning_rate": 1.1428909303892955e-06, + "loss": 0.6503, + "step": 23875 + }, + { + "epoch": 2.41, + "grad_norm": 4.961901802189119, + "learning_rate": 1.1410252568379082e-06, + "loss": 0.6505, + "step": 23880 + }, + { + "epoch": 2.41, + "grad_norm": 5.013798412490743, + "learning_rate": 1.1391609111560425e-06, + "loss": 0.6742, + "step": 23885 + }, + { + "epoch": 2.41, + "grad_norm": 5.366997262349579, + "learning_rate": 1.137297893985222e-06, + "loss": 0.6679, + "step": 23890 + }, + { + "epoch": 2.41, + "grad_norm": 6.604333797071156, + "learning_rate": 1.1354362059665047e-06, + "loss": 0.6788, + "step": 23895 + }, + { + "epoch": 2.41, + "grad_norm": 4.409950786328532, + "learning_rate": 1.1335758477404995e-06, + "loss": 0.6651, + "step": 23900 + }, + { + "epoch": 2.41, + "grad_norm": 4.340362687340669, + "learning_rate": 1.1317168199473522e-06, + "loss": 0.6428, + "step": 23905 + }, + { + "epoch": 2.41, + "grad_norm": 4.850769904421837, + "learning_rate": 1.1298591232267524e-06, + "loss": 0.6648, + "step": 23910 + }, + { + "epoch": 2.41, + "grad_norm": 5.969602368637574, + "learning_rate": 1.1280027582179309e-06, + "loss": 0.6632, + "step": 23915 + }, + { + "epoch": 2.41, + "grad_norm": 6.396984680311234, + "learning_rate": 1.1261477255596632e-06, + "loss": 0.659, + "step": 23920 + }, + { + "epoch": 2.41, + "grad_norm": 6.951125667931925, + "learning_rate": 1.124294025890262e-06, + "loss": 0.6385, + "step": 23925 + }, + { + "epoch": 2.41, + "grad_norm": 5.376467123518012, + "learning_rate": 1.1224416598475857e-06, + "loss": 0.666, + "step": 23930 + }, + { + "epoch": 2.41, + "grad_norm": 5.42513092423465, + "learning_rate": 1.1205906280690315e-06, + "loss": 0.6845, + "step": 23935 + }, + { + "epoch": 2.41, + "grad_norm": 7.187259221881291, + "learning_rate": 1.1187409311915365e-06, + "loss": 0.6453, + "step": 23940 + }, + { + "epoch": 2.41, + "grad_norm": 4.725902531276005, + "learning_rate": 1.1168925698515782e-06, + "loss": 0.688, + "step": 23945 + }, + { + "epoch": 2.41, + "grad_norm": 4.6032004663512955, + "learning_rate": 1.11504554468518e-06, + "loss": 0.6641, + "step": 23950 + }, + { + "epoch": 2.42, + "grad_norm": 8.473231294951365, + "learning_rate": 1.113199856327899e-06, + "loss": 0.6495, + "step": 23955 + }, + { + "epoch": 2.42, + "grad_norm": 5.635391217197278, + "learning_rate": 1.1113555054148335e-06, + "loss": 0.6375, + "step": 23960 + }, + { + "epoch": 2.42, + "grad_norm": 4.718512384709406, + "learning_rate": 1.1095124925806262e-06, + "loss": 0.6804, + "step": 23965 + }, + { + "epoch": 2.42, + "grad_norm": 4.954687232537294, + "learning_rate": 1.1076708184594542e-06, + "loss": 0.6424, + "step": 23970 + }, + { + "epoch": 2.42, + "grad_norm": 5.237282573011134, + "learning_rate": 1.1058304836850358e-06, + "loss": 0.6636, + "step": 23975 + }, + { + "epoch": 2.42, + "grad_norm": 4.847006852076549, + "learning_rate": 1.1039914888906278e-06, + "loss": 0.6557, + "step": 23980 + }, + { + "epoch": 2.42, + "grad_norm": 5.255883899216501, + "learning_rate": 1.1021538347090282e-06, + "loss": 0.6363, + "step": 23985 + }, + { + "epoch": 2.42, + "grad_norm": 4.797507804413398, + "learning_rate": 1.1003175217725699e-06, + "loss": 0.6141, + "step": 23990 + }, + { + "epoch": 2.42, + "grad_norm": 5.190261202725136, + "learning_rate": 1.098482550713129e-06, + "loss": 0.6663, + "step": 23995 + }, + { + "epoch": 2.42, + "grad_norm": 7.210572111259796, + "learning_rate": 1.0966489221621162e-06, + "loss": 0.651, + "step": 24000 + }, + { + "epoch": 2.42, + "grad_norm": 5.273773088540418, + "learning_rate": 1.0948166367504814e-06, + "loss": 0.6401, + "step": 24005 + }, + { + "epoch": 2.42, + "grad_norm": 6.767623899763042, + "learning_rate": 1.0929856951087104e-06, + "loss": 0.6333, + "step": 24010 + }, + { + "epoch": 2.42, + "grad_norm": 6.412046448881226, + "learning_rate": 1.0911560978668312e-06, + "loss": 0.6715, + "step": 24015 + }, + { + "epoch": 2.42, + "grad_norm": 6.078105296172169, + "learning_rate": 1.0893278456544043e-06, + "loss": 0.6443, + "step": 24020 + }, + { + "epoch": 2.42, + "grad_norm": 5.674597763088829, + "learning_rate": 1.087500939100532e-06, + "loss": 0.6508, + "step": 24025 + }, + { + "epoch": 2.42, + "grad_norm": 5.110649682914834, + "learning_rate": 1.0856753788338498e-06, + "loss": 0.6628, + "step": 24030 + }, + { + "epoch": 2.42, + "grad_norm": 4.889614638207886, + "learning_rate": 1.0838511654825306e-06, + "loss": 0.673, + "step": 24035 + }, + { + "epoch": 2.42, + "grad_norm": 11.984844405841715, + "learning_rate": 1.0820282996742842e-06, + "loss": 0.6379, + "step": 24040 + }, + { + "epoch": 2.42, + "grad_norm": 6.518256321449587, + "learning_rate": 1.0802067820363588e-06, + "loss": 0.6502, + "step": 24045 + }, + { + "epoch": 2.42, + "grad_norm": 4.845221841902757, + "learning_rate": 1.0783866131955345e-06, + "loss": 0.6563, + "step": 24050 + }, + { + "epoch": 2.43, + "grad_norm": 5.6040059079289595, + "learning_rate": 1.0765677937781327e-06, + "loss": 0.6574, + "step": 24055 + }, + { + "epoch": 2.43, + "grad_norm": 6.081550821628119, + "learning_rate": 1.0747503244100055e-06, + "loss": 0.6582, + "step": 24060 + }, + { + "epoch": 2.43, + "grad_norm": 6.16529242847246, + "learning_rate": 1.0729342057165427e-06, + "loss": 0.6523, + "step": 24065 + }, + { + "epoch": 2.43, + "grad_norm": 5.544136274685921, + "learning_rate": 1.0711194383226675e-06, + "loss": 0.6813, + "step": 24070 + }, + { + "epoch": 2.43, + "grad_norm": 5.539291616022181, + "learning_rate": 1.0693060228528424e-06, + "loss": 0.66, + "step": 24075 + }, + { + "epoch": 2.43, + "grad_norm": 4.528425012110296, + "learning_rate": 1.0674939599310607e-06, + "loss": 0.6513, + "step": 24080 + }, + { + "epoch": 2.43, + "grad_norm": 5.497645977169182, + "learning_rate": 1.06568325018085e-06, + "loss": 0.6785, + "step": 24085 + }, + { + "epoch": 2.43, + "grad_norm": 5.452333913167858, + "learning_rate": 1.063873894225278e-06, + "loss": 0.6291, + "step": 24090 + }, + { + "epoch": 2.43, + "grad_norm": 7.584321380477787, + "learning_rate": 1.062065892686937e-06, + "loss": 0.6179, + "step": 24095 + }, + { + "epoch": 2.43, + "grad_norm": 5.582992398770889, + "learning_rate": 1.0602592461879623e-06, + "loss": 0.6542, + "step": 24100 + }, + { + "epoch": 2.43, + "grad_norm": 5.267445725604313, + "learning_rate": 1.0584539553500167e-06, + "loss": 0.6883, + "step": 24105 + }, + { + "epoch": 2.43, + "grad_norm": 5.106951719637822, + "learning_rate": 1.056650020794302e-06, + "loss": 0.6289, + "step": 24110 + }, + { + "epoch": 2.43, + "grad_norm": 7.748623451954727, + "learning_rate": 1.0548474431415473e-06, + "loss": 0.6364, + "step": 24115 + }, + { + "epoch": 2.43, + "grad_norm": 4.443000217909867, + "learning_rate": 1.0530462230120209e-06, + "loss": 0.6766, + "step": 24120 + }, + { + "epoch": 2.43, + "grad_norm": 4.48550594637477, + "learning_rate": 1.0512463610255191e-06, + "loss": 0.6367, + "step": 24125 + }, + { + "epoch": 2.43, + "grad_norm": 7.411209327002572, + "learning_rate": 1.049447857801373e-06, + "loss": 0.6673, + "step": 24130 + }, + { + "epoch": 2.43, + "grad_norm": 24.461037766404242, + "learning_rate": 1.0476507139584447e-06, + "loss": 0.6432, + "step": 24135 + }, + { + "epoch": 2.43, + "grad_norm": 5.983197491207409, + "learning_rate": 1.0458549301151317e-06, + "loss": 0.6388, + "step": 24140 + }, + { + "epoch": 2.43, + "grad_norm": 4.714272084787362, + "learning_rate": 1.04406050688936e-06, + "loss": 0.6924, + "step": 24145 + }, + { + "epoch": 2.43, + "grad_norm": 5.18344029670136, + "learning_rate": 1.0422674448985904e-06, + "loss": 0.6532, + "step": 24150 + }, + { + "epoch": 2.44, + "grad_norm": 5.14167289300526, + "learning_rate": 1.040475744759813e-06, + "loss": 0.6907, + "step": 24155 + }, + { + "epoch": 2.44, + "grad_norm": 6.647224608466981, + "learning_rate": 1.0386854070895496e-06, + "loss": 0.6835, + "step": 24160 + }, + { + "epoch": 2.44, + "grad_norm": 7.015265097882171, + "learning_rate": 1.0368964325038533e-06, + "loss": 0.6691, + "step": 24165 + }, + { + "epoch": 2.44, + "grad_norm": 4.5761073878121215, + "learning_rate": 1.035108821618311e-06, + "loss": 0.635, + "step": 24170 + }, + { + "epoch": 2.44, + "grad_norm": 5.162102016764382, + "learning_rate": 1.0333225750480341e-06, + "loss": 0.6435, + "step": 24175 + }, + { + "epoch": 2.44, + "grad_norm": 6.211445872418304, + "learning_rate": 1.0315376934076725e-06, + "loss": 0.6869, + "step": 24180 + }, + { + "epoch": 2.44, + "grad_norm": 6.234258386467219, + "learning_rate": 1.0297541773114e-06, + "loss": 0.6333, + "step": 24185 + }, + { + "epoch": 2.44, + "grad_norm": 5.055423005378473, + "learning_rate": 1.0279720273729232e-06, + "loss": 0.6543, + "step": 24190 + }, + { + "epoch": 2.44, + "grad_norm": 4.32115384536653, + "learning_rate": 1.0261912442054767e-06, + "loss": 0.6433, + "step": 24195 + }, + { + "epoch": 2.44, + "grad_norm": 4.849032712997024, + "learning_rate": 1.024411828421829e-06, + "loss": 0.66, + "step": 24200 + }, + { + "epoch": 2.44, + "grad_norm": 5.072989418040996, + "learning_rate": 1.022633780634274e-06, + "loss": 0.68, + "step": 24205 + }, + { + "epoch": 2.44, + "grad_norm": 5.483775285184719, + "learning_rate": 1.0208571014546347e-06, + "loss": 0.6556, + "step": 24210 + }, + { + "epoch": 2.44, + "grad_norm": 4.745063361912836, + "learning_rate": 1.0190817914942696e-06, + "loss": 0.6549, + "step": 24215 + }, + { + "epoch": 2.44, + "grad_norm": 6.906242377889271, + "learning_rate": 1.0173078513640543e-06, + "loss": 0.6336, + "step": 24220 + }, + { + "epoch": 2.44, + "grad_norm": 6.337955797386548, + "learning_rate": 1.0155352816744058e-06, + "loss": 0.6436, + "step": 24225 + }, + { + "epoch": 2.44, + "grad_norm": 6.599573279630294, + "learning_rate": 1.013764083035259e-06, + "loss": 0.6662, + "step": 24230 + }, + { + "epoch": 2.44, + "grad_norm": 4.345380794791357, + "learning_rate": 1.0119942560560858e-06, + "loss": 0.6526, + "step": 24235 + }, + { + "epoch": 2.44, + "grad_norm": 4.940158221488668, + "learning_rate": 1.0102258013458783e-06, + "loss": 0.6329, + "step": 24240 + }, + { + "epoch": 2.44, + "grad_norm": 5.484670311151196, + "learning_rate": 1.0084587195131652e-06, + "loss": 0.6431, + "step": 24245 + }, + { + "epoch": 2.44, + "grad_norm": 4.733477820239863, + "learning_rate": 1.0066930111659911e-06, + "loss": 0.66, + "step": 24250 + }, + { + "epoch": 2.45, + "grad_norm": 6.399131133773677, + "learning_rate": 1.0049286769119393e-06, + "loss": 0.665, + "step": 24255 + }, + { + "epoch": 2.45, + "grad_norm": 5.436299381570852, + "learning_rate": 1.0031657173581126e-06, + "loss": 0.6253, + "step": 24260 + }, + { + "epoch": 2.45, + "grad_norm": 5.653731055188008, + "learning_rate": 1.0014041331111467e-06, + "loss": 0.6347, + "step": 24265 + }, + { + "epoch": 2.45, + "grad_norm": 4.907820309286419, + "learning_rate": 9.996439247771983e-07, + "loss": 0.6569, + "step": 24270 + }, + { + "epoch": 2.45, + "grad_norm": 4.544214480872975, + "learning_rate": 9.97885092961957e-07, + "loss": 0.6536, + "step": 24275 + }, + { + "epoch": 2.45, + "grad_norm": 5.587307184447402, + "learning_rate": 9.961276382706297e-07, + "loss": 0.669, + "step": 24280 + }, + { + "epoch": 2.45, + "grad_norm": 5.151800180599868, + "learning_rate": 9.943715613079602e-07, + "loss": 0.6471, + "step": 24285 + }, + { + "epoch": 2.45, + "grad_norm": 4.647118728799816, + "learning_rate": 9.926168626782085e-07, + "loss": 0.6501, + "step": 24290 + }, + { + "epoch": 2.45, + "grad_norm": 6.864846064764639, + "learning_rate": 9.908635429851688e-07, + "loss": 0.6601, + "step": 24295 + }, + { + "epoch": 2.45, + "grad_norm": 4.941681400199244, + "learning_rate": 9.891116028321528e-07, + "loss": 0.657, + "step": 24300 + }, + { + "epoch": 2.45, + "grad_norm": 7.459932839259485, + "learning_rate": 9.873610428220065e-07, + "loss": 0.6518, + "step": 24305 + }, + { + "epoch": 2.45, + "grad_norm": 10.689099853989275, + "learning_rate": 9.856118635570905e-07, + "loss": 0.675, + "step": 24310 + }, + { + "epoch": 2.45, + "grad_norm": 8.415278394684226, + "learning_rate": 9.83864065639299e-07, + "loss": 0.6749, + "step": 24315 + }, + { + "epoch": 2.45, + "grad_norm": 8.166749219171328, + "learning_rate": 9.821176496700453e-07, + "loss": 0.6639, + "step": 24320 + }, + { + "epoch": 2.45, + "grad_norm": 4.802799710730936, + "learning_rate": 9.803726162502726e-07, + "loss": 0.6423, + "step": 24325 + }, + { + "epoch": 2.45, + "grad_norm": 4.814502203551193, + "learning_rate": 9.786289659804415e-07, + "loss": 0.6593, + "step": 24330 + }, + { + "epoch": 2.45, + "grad_norm": 7.235234383606917, + "learning_rate": 9.768866994605452e-07, + "loss": 0.6661, + "step": 24335 + }, + { + "epoch": 2.45, + "grad_norm": 5.895525185606499, + "learning_rate": 9.75145817290092e-07, + "loss": 0.6807, + "step": 24340 + }, + { + "epoch": 2.45, + "grad_norm": 6.272616856347768, + "learning_rate": 9.73406320068117e-07, + "loss": 0.6332, + "step": 24345 + }, + { + "epoch": 2.46, + "grad_norm": 7.1266478704623015, + "learning_rate": 9.716682083931834e-07, + "loss": 0.6698, + "step": 24350 + }, + { + "epoch": 2.46, + "grad_norm": 4.686239580595646, + "learning_rate": 9.699314828633704e-07, + "loss": 0.6389, + "step": 24355 + }, + { + "epoch": 2.46, + "grad_norm": 6.269954961352537, + "learning_rate": 9.68196144076286e-07, + "loss": 0.6258, + "step": 24360 + }, + { + "epoch": 2.46, + "grad_norm": 4.586232695798104, + "learning_rate": 9.66462192629058e-07, + "loss": 0.6671, + "step": 24365 + }, + { + "epoch": 2.46, + "grad_norm": 4.715433525036328, + "learning_rate": 9.647296291183367e-07, + "loss": 0.6313, + "step": 24370 + }, + { + "epoch": 2.46, + "grad_norm": 4.60639124847477, + "learning_rate": 9.629984541402947e-07, + "loss": 0.6235, + "step": 24375 + }, + { + "epoch": 2.46, + "grad_norm": 4.911709687109333, + "learning_rate": 9.612686682906302e-07, + "loss": 0.6797, + "step": 24380 + }, + { + "epoch": 2.46, + "grad_norm": 4.824313647744851, + "learning_rate": 9.59540272164558e-07, + "loss": 0.6612, + "step": 24385 + }, + { + "epoch": 2.46, + "grad_norm": 4.787688302758495, + "learning_rate": 9.578132663568207e-07, + "loss": 0.6451, + "step": 24390 + }, + { + "epoch": 2.46, + "grad_norm": 4.836387498664002, + "learning_rate": 9.560876514616775e-07, + "loss": 0.6509, + "step": 24395 + }, + { + "epoch": 2.46, + "grad_norm": 4.868549786245656, + "learning_rate": 9.54363428072911e-07, + "loss": 0.6472, + "step": 24400 + }, + { + "epoch": 2.46, + "grad_norm": 6.530863320667234, + "learning_rate": 9.526405967838237e-07, + "loss": 0.6555, + "step": 24405 + }, + { + "epoch": 2.46, + "grad_norm": 4.391324501270228, + "learning_rate": 9.509191581872424e-07, + "loss": 0.6357, + "step": 24410 + }, + { + "epoch": 2.46, + "grad_norm": 4.6127302095667, + "learning_rate": 9.491991128755107e-07, + "loss": 0.6515, + "step": 24415 + }, + { + "epoch": 2.46, + "grad_norm": 4.712671864213921, + "learning_rate": 9.474804614404959e-07, + "loss": 0.6574, + "step": 24420 + }, + { + "epoch": 2.46, + "grad_norm": 4.847014489297407, + "learning_rate": 9.457632044735832e-07, + "loss": 0.6373, + "step": 24425 + }, + { + "epoch": 2.46, + "grad_norm": 6.670228672865111, + "learning_rate": 9.440473425656815e-07, + "loss": 0.6506, + "step": 24430 + }, + { + "epoch": 2.46, + "grad_norm": 4.431668423806074, + "learning_rate": 9.423328763072131e-07, + "loss": 0.6459, + "step": 24435 + }, + { + "epoch": 2.46, + "grad_norm": 5.034693170576436, + "learning_rate": 9.406198062881278e-07, + "loss": 0.6449, + "step": 24440 + }, + { + "epoch": 2.46, + "grad_norm": 4.678252267266481, + "learning_rate": 9.389081330978883e-07, + "loss": 0.6589, + "step": 24445 + }, + { + "epoch": 2.47, + "grad_norm": 6.156093510328251, + "learning_rate": 9.371978573254831e-07, + "loss": 0.6928, + "step": 24450 + }, + { + "epoch": 2.47, + "grad_norm": 8.083625790363284, + "learning_rate": 9.354889795594135e-07, + "loss": 0.6822, + "step": 24455 + }, + { + "epoch": 2.47, + "grad_norm": 5.270059897769438, + "learning_rate": 9.33781500387706e-07, + "loss": 0.6475, + "step": 24460 + }, + { + "epoch": 2.47, + "grad_norm": 5.445776137690954, + "learning_rate": 9.320754203978999e-07, + "loss": 0.6577, + "step": 24465 + }, + { + "epoch": 2.47, + "grad_norm": 4.724132080897397, + "learning_rate": 9.303707401770545e-07, + "loss": 0.6127, + "step": 24470 + }, + { + "epoch": 2.47, + "grad_norm": 4.683063811483321, + "learning_rate": 9.286674603117524e-07, + "loss": 0.6603, + "step": 24475 + }, + { + "epoch": 2.47, + "grad_norm": 4.257766289785195, + "learning_rate": 9.269655813880868e-07, + "loss": 0.6006, + "step": 24480 + }, + { + "epoch": 2.47, + "grad_norm": 5.429484324997625, + "learning_rate": 9.25265103991676e-07, + "loss": 0.6603, + "step": 24485 + }, + { + "epoch": 2.47, + "grad_norm": 4.8748023788537385, + "learning_rate": 9.235660287076509e-07, + "loss": 0.6937, + "step": 24490 + }, + { + "epoch": 2.47, + "grad_norm": 4.573077935697372, + "learning_rate": 9.218683561206621e-07, + "loss": 0.6657, + "step": 24495 + }, + { + "epoch": 2.47, + "grad_norm": 5.0768056663720245, + "learning_rate": 9.201720868148756e-07, + "loss": 0.6426, + "step": 24500 + }, + { + "epoch": 2.47, + "grad_norm": 5.132452067770202, + "learning_rate": 9.184772213739784e-07, + "loss": 0.6173, + "step": 24505 + }, + { + "epoch": 2.47, + "grad_norm": 5.8371733786325946, + "learning_rate": 9.167837603811702e-07, + "loss": 0.6421, + "step": 24510 + }, + { + "epoch": 2.47, + "grad_norm": 11.925534485149385, + "learning_rate": 9.15091704419171e-07, + "loss": 0.6504, + "step": 24515 + }, + { + "epoch": 2.47, + "grad_norm": 7.885909463985392, + "learning_rate": 9.134010540702148e-07, + "loss": 0.6602, + "step": 24520 + }, + { + "epoch": 2.47, + "grad_norm": 6.120338319418244, + "learning_rate": 9.117118099160526e-07, + "loss": 0.6821, + "step": 24525 + }, + { + "epoch": 2.47, + "grad_norm": 6.798152916059508, + "learning_rate": 9.100239725379512e-07, + "loss": 0.6748, + "step": 24530 + }, + { + "epoch": 2.47, + "grad_norm": 4.905722956211278, + "learning_rate": 9.083375425166952e-07, + "loss": 0.6423, + "step": 24535 + }, + { + "epoch": 2.47, + "grad_norm": 6.674599790896468, + "learning_rate": 9.066525204325816e-07, + "loss": 0.6314, + "step": 24540 + }, + { + "epoch": 2.47, + "grad_norm": 4.787652908868984, + "learning_rate": 9.049689068654277e-07, + "loss": 0.6461, + "step": 24545 + }, + { + "epoch": 2.48, + "grad_norm": 4.648534073284891, + "learning_rate": 9.032867023945613e-07, + "loss": 0.6802, + "step": 24550 + }, + { + "epoch": 2.48, + "grad_norm": 4.963734604380744, + "learning_rate": 9.016059075988282e-07, + "loss": 0.6637, + "step": 24555 + }, + { + "epoch": 2.48, + "grad_norm": 4.815717399319649, + "learning_rate": 8.999265230565863e-07, + "loss": 0.666, + "step": 24560 + }, + { + "epoch": 2.48, + "grad_norm": 4.534025588775262, + "learning_rate": 8.982485493457133e-07, + "loss": 0.6598, + "step": 24565 + }, + { + "epoch": 2.48, + "grad_norm": 5.243752686300016, + "learning_rate": 8.965719870435951e-07, + "loss": 0.679, + "step": 24570 + }, + { + "epoch": 2.48, + "grad_norm": 5.169525277173541, + "learning_rate": 8.948968367271388e-07, + "loss": 0.6735, + "step": 24575 + }, + { + "epoch": 2.48, + "grad_norm": 4.372567152329839, + "learning_rate": 8.932230989727598e-07, + "loss": 0.6407, + "step": 24580 + }, + { + "epoch": 2.48, + "grad_norm": 4.503524583394472, + "learning_rate": 8.915507743563901e-07, + "loss": 0.6268, + "step": 24585 + }, + { + "epoch": 2.48, + "grad_norm": 4.582464553338467, + "learning_rate": 8.89879863453475e-07, + "loss": 0.6439, + "step": 24590 + }, + { + "epoch": 2.48, + "grad_norm": 4.4729040322625515, + "learning_rate": 8.882103668389719e-07, + "loss": 0.6668, + "step": 24595 + }, + { + "epoch": 2.48, + "grad_norm": 6.280305513971691, + "learning_rate": 8.865422850873556e-07, + "loss": 0.6826, + "step": 24600 + }, + { + "epoch": 2.48, + "grad_norm": 4.931605865332295, + "learning_rate": 8.848756187726093e-07, + "loss": 0.6705, + "step": 24605 + }, + { + "epoch": 2.48, + "grad_norm": 5.2975829385507325, + "learning_rate": 8.83210368468233e-07, + "loss": 0.6587, + "step": 24610 + }, + { + "epoch": 2.48, + "grad_norm": 6.032825354671978, + "learning_rate": 8.815465347472374e-07, + "loss": 0.657, + "step": 24615 + }, + { + "epoch": 2.48, + "grad_norm": 4.7918494728805, + "learning_rate": 8.798841181821449e-07, + "loss": 0.6433, + "step": 24620 + }, + { + "epoch": 2.48, + "grad_norm": 4.6352119324579295, + "learning_rate": 8.782231193449914e-07, + "loss": 0.6052, + "step": 24625 + }, + { + "epoch": 2.48, + "grad_norm": 4.468768095821291, + "learning_rate": 8.765635388073274e-07, + "loss": 0.6506, + "step": 24630 + }, + { + "epoch": 2.48, + "grad_norm": 6.1714584235928, + "learning_rate": 8.749053771402094e-07, + "loss": 0.6813, + "step": 24635 + }, + { + "epoch": 2.48, + "grad_norm": 5.405067333949699, + "learning_rate": 8.732486349142127e-07, + "loss": 0.6597, + "step": 24640 + }, + { + "epoch": 2.48, + "grad_norm": 5.641166645937143, + "learning_rate": 8.715933126994192e-07, + "loss": 0.6686, + "step": 24645 + }, + { + "epoch": 2.49, + "grad_norm": 6.540422303816012, + "learning_rate": 8.699394110654241e-07, + "loss": 0.6277, + "step": 24650 + }, + { + "epoch": 2.49, + "grad_norm": 8.926265663207731, + "learning_rate": 8.682869305813312e-07, + "loss": 0.6619, + "step": 24655 + }, + { + "epoch": 2.49, + "grad_norm": 4.522604459042804, + "learning_rate": 8.666358718157608e-07, + "loss": 0.6753, + "step": 24660 + }, + { + "epoch": 2.49, + "grad_norm": 4.8097535901666495, + "learning_rate": 8.649862353368377e-07, + "loss": 0.6776, + "step": 24665 + }, + { + "epoch": 2.49, + "grad_norm": 5.158174973798557, + "learning_rate": 8.633380217122028e-07, + "loss": 0.6765, + "step": 24670 + }, + { + "epoch": 2.49, + "grad_norm": 4.610447525325233, + "learning_rate": 8.616912315090037e-07, + "loss": 0.6734, + "step": 24675 + }, + { + "epoch": 2.49, + "grad_norm": 4.612845668220532, + "learning_rate": 8.600458652938997e-07, + "loss": 0.6521, + "step": 24680 + }, + { + "epoch": 2.49, + "grad_norm": 4.578334838267179, + "learning_rate": 8.584019236330576e-07, + "loss": 0.6442, + "step": 24685 + }, + { + "epoch": 2.49, + "grad_norm": 5.757414987867542, + "learning_rate": 8.567594070921592e-07, + "loss": 0.6377, + "step": 24690 + }, + { + "epoch": 2.49, + "grad_norm": 5.545040668554131, + "learning_rate": 8.551183162363907e-07, + "loss": 0.6203, + "step": 24695 + }, + { + "epoch": 2.49, + "grad_norm": 5.18825691136152, + "learning_rate": 8.534786516304516e-07, + "loss": 0.6589, + "step": 24700 + }, + { + "epoch": 2.49, + "grad_norm": 4.63240360899482, + "learning_rate": 8.518404138385483e-07, + "loss": 0.6737, + "step": 24705 + }, + { + "epoch": 2.49, + "grad_norm": 5.375068745954727, + "learning_rate": 8.502036034243965e-07, + "loss": 0.6233, + "step": 24710 + }, + { + "epoch": 2.49, + "grad_norm": 4.711918972811405, + "learning_rate": 8.485682209512203e-07, + "loss": 0.6563, + "step": 24715 + }, + { + "epoch": 2.49, + "grad_norm": 5.740392950733569, + "learning_rate": 8.469342669817548e-07, + "loss": 0.6779, + "step": 24720 + }, + { + "epoch": 2.49, + "grad_norm": 4.782537893732716, + "learning_rate": 8.453017420782422e-07, + "loss": 0.6634, + "step": 24725 + }, + { + "epoch": 2.49, + "grad_norm": 5.0966574448446185, + "learning_rate": 8.436706468024303e-07, + "loss": 0.67, + "step": 24730 + }, + { + "epoch": 2.49, + "grad_norm": 4.884923814880293, + "learning_rate": 8.420409817155806e-07, + "loss": 0.6687, + "step": 24735 + }, + { + "epoch": 2.49, + "grad_norm": 4.659544018108539, + "learning_rate": 8.404127473784574e-07, + "loss": 0.6462, + "step": 24740 + }, + { + "epoch": 2.49, + "grad_norm": 4.709118306156166, + "learning_rate": 8.387859443513353e-07, + "loss": 0.679, + "step": 24745 + }, + { + "epoch": 2.5, + "grad_norm": 4.928574888944737, + "learning_rate": 8.371605731939935e-07, + "loss": 0.6381, + "step": 24750 + }, + { + "epoch": 2.5, + "grad_norm": 5.063956339138812, + "learning_rate": 8.355366344657245e-07, + "loss": 0.6671, + "step": 24755 + }, + { + "epoch": 2.5, + "grad_norm": 4.831064742072634, + "learning_rate": 8.339141287253205e-07, + "loss": 0.6376, + "step": 24760 + }, + { + "epoch": 2.5, + "grad_norm": 4.496507983904041, + "learning_rate": 8.322930565310867e-07, + "loss": 0.6523, + "step": 24765 + }, + { + "epoch": 2.5, + "grad_norm": 5.897423559897414, + "learning_rate": 8.306734184408316e-07, + "loss": 0.654, + "step": 24770 + }, + { + "epoch": 2.5, + "grad_norm": 4.895179239345184, + "learning_rate": 8.290552150118714e-07, + "loss": 0.6587, + "step": 24775 + }, + { + "epoch": 2.5, + "grad_norm": 5.01025021028483, + "learning_rate": 8.274384468010266e-07, + "loss": 0.6599, + "step": 24780 + }, + { + "epoch": 2.5, + "grad_norm": 5.621504495377497, + "learning_rate": 8.258231143646284e-07, + "loss": 0.6536, + "step": 24785 + }, + { + "epoch": 2.5, + "grad_norm": 5.015773076364035, + "learning_rate": 8.242092182585082e-07, + "loss": 0.6886, + "step": 24790 + }, + { + "epoch": 2.5, + "grad_norm": 7.427442287654225, + "learning_rate": 8.225967590380091e-07, + "loss": 0.6637, + "step": 24795 + }, + { + "epoch": 2.5, + "grad_norm": 6.137015803961235, + "learning_rate": 8.209857372579749e-07, + "loss": 0.6189, + "step": 24800 + }, + { + "epoch": 2.5, + "grad_norm": 4.886094258233801, + "learning_rate": 8.193761534727579e-07, + "loss": 0.6334, + "step": 24805 + }, + { + "epoch": 2.5, + "grad_norm": 5.027983452333498, + "learning_rate": 8.177680082362116e-07, + "loss": 0.6403, + "step": 24810 + }, + { + "epoch": 2.5, + "grad_norm": 4.498959374655465, + "learning_rate": 8.161613021017006e-07, + "loss": 0.665, + "step": 24815 + }, + { + "epoch": 2.5, + "grad_norm": 7.442887102085162, + "learning_rate": 8.145560356220883e-07, + "loss": 0.6512, + "step": 24820 + }, + { + "epoch": 2.5, + "grad_norm": 6.059385265435554, + "learning_rate": 8.129522093497483e-07, + "loss": 0.661, + "step": 24825 + }, + { + "epoch": 2.5, + "grad_norm": 5.89352893079638, + "learning_rate": 8.11349823836553e-07, + "loss": 0.6473, + "step": 24830 + }, + { + "epoch": 2.5, + "grad_norm": 5.557086942989607, + "learning_rate": 8.097488796338837e-07, + "loss": 0.6246, + "step": 24835 + }, + { + "epoch": 2.5, + "grad_norm": 4.471587700245278, + "learning_rate": 8.081493772926208e-07, + "loss": 0.6219, + "step": 24840 + }, + { + "epoch": 2.5, + "grad_norm": 6.8867025140941545, + "learning_rate": 8.06551317363155e-07, + "loss": 0.6267, + "step": 24845 + }, + { + "epoch": 2.51, + "grad_norm": 4.849642771433342, + "learning_rate": 8.049547003953756e-07, + "loss": 0.6704, + "step": 24850 + }, + { + "epoch": 2.51, + "grad_norm": 4.900048710600587, + "learning_rate": 8.033595269386762e-07, + "loss": 0.7352, + "step": 24855 + }, + { + "epoch": 2.51, + "grad_norm": 4.458521029193161, + "learning_rate": 8.017657975419563e-07, + "loss": 0.6711, + "step": 24860 + }, + { + "epoch": 2.51, + "grad_norm": 7.234623918704893, + "learning_rate": 8.001735127536153e-07, + "loss": 0.6682, + "step": 24865 + }, + { + "epoch": 2.51, + "grad_norm": 5.8119753199973125, + "learning_rate": 7.985826731215579e-07, + "loss": 0.6526, + "step": 24870 + }, + { + "epoch": 2.51, + "grad_norm": 4.786871359170267, + "learning_rate": 7.969932791931883e-07, + "loss": 0.6588, + "step": 24875 + }, + { + "epoch": 2.51, + "grad_norm": 4.888432868770594, + "learning_rate": 7.954053315154181e-07, + "loss": 0.6491, + "step": 24880 + }, + { + "epoch": 2.51, + "grad_norm": 4.7430510735248745, + "learning_rate": 7.938188306346567e-07, + "loss": 0.6593, + "step": 24885 + }, + { + "epoch": 2.51, + "grad_norm": 6.1545479704772434, + "learning_rate": 7.922337770968192e-07, + "loss": 0.6435, + "step": 24890 + }, + { + "epoch": 2.51, + "grad_norm": 4.849948977490544, + "learning_rate": 7.906501714473203e-07, + "loss": 0.6504, + "step": 24895 + }, + { + "epoch": 2.51, + "grad_norm": 9.081958242724612, + "learning_rate": 7.89068014231077e-07, + "loss": 0.6467, + "step": 24900 + }, + { + "epoch": 2.51, + "grad_norm": 5.470923465659294, + "learning_rate": 7.874873059925064e-07, + "loss": 0.6437, + "step": 24905 + }, + { + "epoch": 2.51, + "grad_norm": 4.654415047763868, + "learning_rate": 7.85908047275532e-07, + "loss": 0.6388, + "step": 24910 + }, + { + "epoch": 2.51, + "grad_norm": 6.073545061463334, + "learning_rate": 7.843302386235724e-07, + "loss": 0.6478, + "step": 24915 + }, + { + "epoch": 2.51, + "grad_norm": 5.715609234994534, + "learning_rate": 7.827538805795526e-07, + "loss": 0.6644, + "step": 24920 + }, + { + "epoch": 2.51, + "grad_norm": 5.964046487206026, + "learning_rate": 7.811789736858943e-07, + "loss": 0.6411, + "step": 24925 + }, + { + "epoch": 2.51, + "grad_norm": 6.154862219711928, + "learning_rate": 7.796055184845219e-07, + "loss": 0.6658, + "step": 24930 + }, + { + "epoch": 2.51, + "grad_norm": 4.905050013439889, + "learning_rate": 7.780335155168584e-07, + "loss": 0.6531, + "step": 24935 + }, + { + "epoch": 2.51, + "grad_norm": 4.581144513340311, + "learning_rate": 7.764629653238309e-07, + "loss": 0.661, + "step": 24940 + }, + { + "epoch": 2.51, + "grad_norm": 5.231113583542581, + "learning_rate": 7.748938684458623e-07, + "loss": 0.7024, + "step": 24945 + }, + { + "epoch": 2.52, + "grad_norm": 8.860678825782125, + "learning_rate": 7.733262254228791e-07, + "loss": 0.6368, + "step": 24950 + }, + { + "epoch": 2.52, + "grad_norm": 5.975033156518935, + "learning_rate": 7.717600367943046e-07, + "loss": 0.6649, + "step": 24955 + }, + { + "epoch": 2.52, + "grad_norm": 8.01741938313855, + "learning_rate": 7.701953030990628e-07, + "loss": 0.658, + "step": 24960 + }, + { + "epoch": 2.52, + "grad_norm": 5.3034097032783265, + "learning_rate": 7.686320248755763e-07, + "loss": 0.6758, + "step": 24965 + }, + { + "epoch": 2.52, + "grad_norm": 7.27663529194628, + "learning_rate": 7.670702026617699e-07, + "loss": 0.6453, + "step": 24970 + }, + { + "epoch": 2.52, + "grad_norm": 9.33105884995202, + "learning_rate": 7.655098369950643e-07, + "loss": 0.6452, + "step": 24975 + }, + { + "epoch": 2.52, + "grad_norm": 5.392173457773588, + "learning_rate": 7.639509284123776e-07, + "loss": 0.6375, + "step": 24980 + }, + { + "epoch": 2.52, + "grad_norm": 5.3268626627599165, + "learning_rate": 7.623934774501318e-07, + "loss": 0.6545, + "step": 24985 + }, + { + "epoch": 2.52, + "grad_norm": 4.913510886945881, + "learning_rate": 7.608374846442429e-07, + "loss": 0.6389, + "step": 24990 + }, + { + "epoch": 2.52, + "grad_norm": 4.420185217342942, + "learning_rate": 7.592829505301269e-07, + "loss": 0.6462, + "step": 24995 + }, + { + "epoch": 2.52, + "grad_norm": 4.50249743438501, + "learning_rate": 7.577298756426959e-07, + "loss": 0.6746, + "step": 25000 + }, + { + "epoch": 2.52, + "grad_norm": 5.406885599331131, + "learning_rate": 7.561782605163642e-07, + "loss": 0.6584, + "step": 25005 + }, + { + "epoch": 2.52, + "grad_norm": 4.776451399414106, + "learning_rate": 7.546281056850386e-07, + "loss": 0.647, + "step": 25010 + }, + { + "epoch": 2.52, + "grad_norm": 4.647823902193932, + "learning_rate": 7.530794116821282e-07, + "loss": 0.6189, + "step": 25015 + }, + { + "epoch": 2.52, + "grad_norm": 8.680773430815062, + "learning_rate": 7.515321790405356e-07, + "loss": 0.6375, + "step": 25020 + }, + { + "epoch": 2.52, + "grad_norm": 4.969723876599489, + "learning_rate": 7.499864082926627e-07, + "loss": 0.6222, + "step": 25025 + }, + { + "epoch": 2.52, + "grad_norm": 4.758278915875785, + "learning_rate": 7.484420999704067e-07, + "loss": 0.6381, + "step": 25030 + }, + { + "epoch": 2.52, + "grad_norm": 4.498832604608828, + "learning_rate": 7.46899254605164e-07, + "loss": 0.6355, + "step": 25035 + }, + { + "epoch": 2.52, + "grad_norm": 6.443240026602288, + "learning_rate": 7.453578727278249e-07, + "loss": 0.6575, + "step": 25040 + }, + { + "epoch": 2.53, + "grad_norm": 4.819968215010338, + "learning_rate": 7.43817954868779e-07, + "loss": 0.6548, + "step": 25045 + }, + { + "epoch": 2.53, + "grad_norm": 5.28827657407379, + "learning_rate": 7.422795015579098e-07, + "loss": 0.619, + "step": 25050 + }, + { + "epoch": 2.53, + "grad_norm": 4.686959750334854, + "learning_rate": 7.407425133245971e-07, + "loss": 0.6313, + "step": 25055 + }, + { + "epoch": 2.53, + "grad_norm": 8.081679427774791, + "learning_rate": 7.392069906977167e-07, + "loss": 0.651, + "step": 25060 + }, + { + "epoch": 2.53, + "grad_norm": 5.242750449479009, + "learning_rate": 7.376729342056427e-07, + "loss": 0.6706, + "step": 25065 + }, + { + "epoch": 2.53, + "grad_norm": 4.727060169319375, + "learning_rate": 7.361403443762399e-07, + "loss": 0.664, + "step": 25070 + }, + { + "epoch": 2.53, + "grad_norm": 5.85291155153888, + "learning_rate": 7.346092217368727e-07, + "loss": 0.6669, + "step": 25075 + }, + { + "epoch": 2.53, + "grad_norm": 4.782691470831053, + "learning_rate": 7.330795668143992e-07, + "loss": 0.6496, + "step": 25080 + }, + { + "epoch": 2.53, + "grad_norm": 5.031602082846597, + "learning_rate": 7.315513801351709e-07, + "loss": 0.6241, + "step": 25085 + }, + { + "epoch": 2.53, + "grad_norm": 6.13748043404483, + "learning_rate": 7.300246622250345e-07, + "loss": 0.6558, + "step": 25090 + }, + { + "epoch": 2.53, + "grad_norm": 5.022918393131529, + "learning_rate": 7.284994136093348e-07, + "loss": 0.6203, + "step": 25095 + }, + { + "epoch": 2.53, + "grad_norm": 4.577534653944751, + "learning_rate": 7.269756348129064e-07, + "loss": 0.6342, + "step": 25100 + }, + { + "epoch": 2.53, + "grad_norm": 6.3508017293347105, + "learning_rate": 7.254533263600816e-07, + "loss": 0.6571, + "step": 25105 + }, + { + "epoch": 2.53, + "grad_norm": 4.6107215829957715, + "learning_rate": 7.239324887746846e-07, + "loss": 0.6478, + "step": 25110 + }, + { + "epoch": 2.53, + "grad_norm": 4.9246172109137705, + "learning_rate": 7.22413122580034e-07, + "loss": 0.6753, + "step": 25115 + }, + { + "epoch": 2.53, + "grad_norm": 4.752569415648657, + "learning_rate": 7.208952282989423e-07, + "loss": 0.6719, + "step": 25120 + }, + { + "epoch": 2.53, + "grad_norm": 6.074928665972307, + "learning_rate": 7.193788064537149e-07, + "loss": 0.6584, + "step": 25125 + }, + { + "epoch": 2.53, + "grad_norm": 5.459360160030664, + "learning_rate": 7.178638575661523e-07, + "loss": 0.6757, + "step": 25130 + }, + { + "epoch": 2.53, + "grad_norm": 4.9071075389801475, + "learning_rate": 7.163503821575457e-07, + "loss": 0.6371, + "step": 25135 + }, + { + "epoch": 2.53, + "grad_norm": 4.469886611538309, + "learning_rate": 7.14838380748682e-07, + "loss": 0.651, + "step": 25140 + }, + { + "epoch": 2.54, + "grad_norm": 4.255804557272553, + "learning_rate": 7.133278538598388e-07, + "loss": 0.6691, + "step": 25145 + }, + { + "epoch": 2.54, + "grad_norm": 5.76982918005449, + "learning_rate": 7.118188020107869e-07, + "loss": 0.6618, + "step": 25150 + }, + { + "epoch": 2.54, + "grad_norm": 6.35076015863343, + "learning_rate": 7.103112257207884e-07, + "loss": 0.617, + "step": 25155 + }, + { + "epoch": 2.54, + "grad_norm": 5.487834302585936, + "learning_rate": 7.088051255086015e-07, + "loss": 0.6986, + "step": 25160 + }, + { + "epoch": 2.54, + "grad_norm": 8.413840768505342, + "learning_rate": 7.073005018924706e-07, + "loss": 0.694, + "step": 25165 + }, + { + "epoch": 2.54, + "grad_norm": 5.55955416421179, + "learning_rate": 7.057973553901387e-07, + "loss": 0.6594, + "step": 25170 + }, + { + "epoch": 2.54, + "grad_norm": 4.613757922631881, + "learning_rate": 7.04295686518835e-07, + "loss": 0.6504, + "step": 25175 + }, + { + "epoch": 2.54, + "grad_norm": 6.900243725454976, + "learning_rate": 7.027954957952827e-07, + "loss": 0.6405, + "step": 25180 + }, + { + "epoch": 2.54, + "grad_norm": 6.027603532032837, + "learning_rate": 7.012967837356949e-07, + "loss": 0.6339, + "step": 25185 + }, + { + "epoch": 2.54, + "grad_norm": 7.712147105387408, + "learning_rate": 6.997995508557792e-07, + "loss": 0.6172, + "step": 25190 + }, + { + "epoch": 2.54, + "grad_norm": 5.581397598376956, + "learning_rate": 6.983037976707297e-07, + "loss": 0.6563, + "step": 25195 + }, + { + "epoch": 2.54, + "grad_norm": 7.936731592965554, + "learning_rate": 6.968095246952361e-07, + "loss": 0.6421, + "step": 25200 + }, + { + "epoch": 2.54, + "grad_norm": 6.692962308108754, + "learning_rate": 6.953167324434751e-07, + "loss": 0.6399, + "step": 25205 + }, + { + "epoch": 2.54, + "grad_norm": 11.914079098706027, + "learning_rate": 6.938254214291152e-07, + "loss": 0.6798, + "step": 25210 + }, + { + "epoch": 2.54, + "grad_norm": 4.763857783003577, + "learning_rate": 6.92335592165313e-07, + "loss": 0.6922, + "step": 25215 + }, + { + "epoch": 2.54, + "grad_norm": 4.452148035606174, + "learning_rate": 6.908472451647213e-07, + "loss": 0.6676, + "step": 25220 + }, + { + "epoch": 2.54, + "grad_norm": 6.178984053236935, + "learning_rate": 6.89360380939475e-07, + "loss": 0.6449, + "step": 25225 + }, + { + "epoch": 2.54, + "grad_norm": 4.261025469317658, + "learning_rate": 6.878750000012058e-07, + "loss": 0.6287, + "step": 25230 + }, + { + "epoch": 2.54, + "grad_norm": 5.163820142436162, + "learning_rate": 6.863911028610309e-07, + "loss": 0.6342, + "step": 25235 + }, + { + "epoch": 2.54, + "grad_norm": 5.186245405153009, + "learning_rate": 6.849086900295571e-07, + "loss": 0.6386, + "step": 25240 + }, + { + "epoch": 2.55, + "grad_norm": 4.625364394762081, + "learning_rate": 6.834277620168817e-07, + "loss": 0.628, + "step": 25245 + }, + { + "epoch": 2.55, + "grad_norm": 4.309413384246584, + "learning_rate": 6.819483193325898e-07, + "loss": 0.6031, + "step": 25250 + }, + { + "epoch": 2.55, + "grad_norm": 5.5437712430174235, + "learning_rate": 6.804703624857578e-07, + "loss": 0.6801, + "step": 25255 + }, + { + "epoch": 2.55, + "grad_norm": 5.719041204609119, + "learning_rate": 6.789938919849475e-07, + "loss": 0.6371, + "step": 25260 + }, + { + "epoch": 2.55, + "grad_norm": 5.4311190616552825, + "learning_rate": 6.775189083382128e-07, + "loss": 0.6481, + "step": 25265 + }, + { + "epoch": 2.55, + "grad_norm": 4.49817950043537, + "learning_rate": 6.760454120530935e-07, + "loss": 0.6764, + "step": 25270 + }, + { + "epoch": 2.55, + "grad_norm": 5.075615059252561, + "learning_rate": 6.74573403636618e-07, + "loss": 0.6722, + "step": 25275 + }, + { + "epoch": 2.55, + "grad_norm": 4.756692975556284, + "learning_rate": 6.731028835953024e-07, + "loss": 0.6153, + "step": 25280 + }, + { + "epoch": 2.55, + "grad_norm": 5.111532811336681, + "learning_rate": 6.716338524351523e-07, + "loss": 0.6435, + "step": 25285 + }, + { + "epoch": 2.55, + "grad_norm": 5.235658015353194, + "learning_rate": 6.701663106616591e-07, + "loss": 0.6366, + "step": 25290 + }, + { + "epoch": 2.55, + "grad_norm": 8.998525285202195, + "learning_rate": 6.687002587798036e-07, + "loss": 0.617, + "step": 25295 + }, + { + "epoch": 2.55, + "grad_norm": 5.636753103897285, + "learning_rate": 6.672356972940519e-07, + "loss": 0.6578, + "step": 25300 + }, + { + "epoch": 2.55, + "grad_norm": 5.007653059949343, + "learning_rate": 6.657726267083592e-07, + "loss": 0.6836, + "step": 25305 + }, + { + "epoch": 2.55, + "grad_norm": 7.4049423574361946, + "learning_rate": 6.643110475261644e-07, + "loss": 0.6093, + "step": 25310 + }, + { + "epoch": 2.55, + "grad_norm": 4.575892380855971, + "learning_rate": 6.62850960250398e-07, + "loss": 0.6632, + "step": 25315 + }, + { + "epoch": 2.55, + "grad_norm": 5.247916695481189, + "learning_rate": 6.613923653834731e-07, + "loss": 0.6536, + "step": 25320 + }, + { + "epoch": 2.55, + "grad_norm": 4.71368785157455, + "learning_rate": 6.599352634272921e-07, + "loss": 0.6091, + "step": 25325 + }, + { + "epoch": 2.55, + "grad_norm": 5.105286131354021, + "learning_rate": 6.584796548832422e-07, + "loss": 0.6343, + "step": 25330 + }, + { + "epoch": 2.55, + "grad_norm": 6.713066779444538, + "learning_rate": 6.570255402521964e-07, + "loss": 0.6458, + "step": 25335 + }, + { + "epoch": 2.55, + "grad_norm": 4.530370525092025, + "learning_rate": 6.555729200345123e-07, + "loss": 0.6326, + "step": 25340 + }, + { + "epoch": 2.56, + "grad_norm": 7.959540168806156, + "learning_rate": 6.541217947300388e-07, + "loss": 0.6574, + "step": 25345 + }, + { + "epoch": 2.56, + "grad_norm": 4.631356069842528, + "learning_rate": 6.52672164838104e-07, + "loss": 0.6537, + "step": 25350 + }, + { + "epoch": 2.56, + "grad_norm": 7.655849920407781, + "learning_rate": 6.512240308575257e-07, + "loss": 0.6484, + "step": 25355 + }, + { + "epoch": 2.56, + "grad_norm": 5.14779961172738, + "learning_rate": 6.497773932866064e-07, + "loss": 0.634, + "step": 25360 + }, + { + "epoch": 2.56, + "grad_norm": 6.745994577184643, + "learning_rate": 6.483322526231284e-07, + "loss": 0.651, + "step": 25365 + }, + { + "epoch": 2.56, + "grad_norm": 9.284058608425205, + "learning_rate": 6.468886093643673e-07, + "loss": 0.6718, + "step": 25370 + }, + { + "epoch": 2.56, + "grad_norm": 4.781063229895192, + "learning_rate": 6.454464640070767e-07, + "loss": 0.6623, + "step": 25375 + }, + { + "epoch": 2.56, + "grad_norm": 4.43704953047697, + "learning_rate": 6.440058170475006e-07, + "loss": 0.6134, + "step": 25380 + }, + { + "epoch": 2.56, + "grad_norm": 4.953656567496981, + "learning_rate": 6.425666689813603e-07, + "loss": 0.7096, + "step": 25385 + }, + { + "epoch": 2.56, + "grad_norm": 4.7737790556705, + "learning_rate": 6.411290203038705e-07, + "loss": 0.643, + "step": 25390 + }, + { + "epoch": 2.56, + "grad_norm": 4.713787102142757, + "learning_rate": 6.396928715097189e-07, + "loss": 0.6776, + "step": 25395 + }, + { + "epoch": 2.56, + "grad_norm": 4.8241511109663975, + "learning_rate": 6.382582230930868e-07, + "loss": 0.6387, + "step": 25400 + }, + { + "epoch": 2.56, + "grad_norm": 5.215920059201571, + "learning_rate": 6.368250755476329e-07, + "loss": 0.6091, + "step": 25405 + }, + { + "epoch": 2.56, + "grad_norm": 6.24655780189847, + "learning_rate": 6.353934293665049e-07, + "loss": 0.6465, + "step": 25410 + }, + { + "epoch": 2.56, + "grad_norm": 4.51722974692802, + "learning_rate": 6.339632850423272e-07, + "loss": 0.6697, + "step": 25415 + }, + { + "epoch": 2.56, + "grad_norm": 4.942691073595876, + "learning_rate": 6.325346430672158e-07, + "loss": 0.6949, + "step": 25420 + }, + { + "epoch": 2.56, + "grad_norm": 5.600010750073224, + "learning_rate": 6.311075039327602e-07, + "loss": 0.6358, + "step": 25425 + }, + { + "epoch": 2.56, + "grad_norm": 7.194071088902065, + "learning_rate": 6.296818681300409e-07, + "loss": 0.633, + "step": 25430 + }, + { + "epoch": 2.56, + "grad_norm": 4.352558847451886, + "learning_rate": 6.28257736149615e-07, + "loss": 0.6241, + "step": 25435 + }, + { + "epoch": 2.56, + "grad_norm": 4.656036452931519, + "learning_rate": 6.268351084815283e-07, + "loss": 0.6695, + "step": 25440 + }, + { + "epoch": 2.57, + "grad_norm": 4.692429799378556, + "learning_rate": 6.254139856153024e-07, + "loss": 0.6111, + "step": 25445 + }, + { + "epoch": 2.57, + "grad_norm": 4.833729995327521, + "learning_rate": 6.23994368039948e-07, + "loss": 0.6356, + "step": 25450 + }, + { + "epoch": 2.57, + "grad_norm": 4.541053282950694, + "learning_rate": 6.225762562439503e-07, + "loss": 0.6815, + "step": 25455 + }, + { + "epoch": 2.57, + "grad_norm": 4.63160893567276, + "learning_rate": 6.211596507152823e-07, + "loss": 0.6209, + "step": 25460 + }, + { + "epoch": 2.57, + "grad_norm": 4.7630558895319615, + "learning_rate": 6.197445519413958e-07, + "loss": 0.6881, + "step": 25465 + }, + { + "epoch": 2.57, + "grad_norm": 4.555410308682889, + "learning_rate": 6.183309604092258e-07, + "loss": 0.632, + "step": 25470 + }, + { + "epoch": 2.57, + "grad_norm": 4.86849347035945, + "learning_rate": 6.16918876605187e-07, + "loss": 0.6498, + "step": 25475 + }, + { + "epoch": 2.57, + "grad_norm": 4.873602369763425, + "learning_rate": 6.155083010151769e-07, + "loss": 0.6766, + "step": 25480 + }, + { + "epoch": 2.57, + "grad_norm": 4.367303692125798, + "learning_rate": 6.140992341245728e-07, + "loss": 0.6312, + "step": 25485 + }, + { + "epoch": 2.57, + "grad_norm": 4.862852100253972, + "learning_rate": 6.126916764182334e-07, + "loss": 0.6281, + "step": 25490 + }, + { + "epoch": 2.57, + "grad_norm": 5.515087993078539, + "learning_rate": 6.11285628380498e-07, + "loss": 0.6453, + "step": 25495 + }, + { + "epoch": 2.57, + "grad_norm": 4.2591560119771845, + "learning_rate": 6.098810904951847e-07, + "loss": 0.6283, + "step": 25500 + }, + { + "epoch": 2.57, + "grad_norm": 4.87669698541844, + "learning_rate": 6.084780632455967e-07, + "loss": 0.6248, + "step": 25505 + }, + { + "epoch": 2.57, + "grad_norm": 4.576326511788119, + "learning_rate": 6.070765471145113e-07, + "loss": 0.6219, + "step": 25510 + }, + { + "epoch": 2.57, + "grad_norm": 5.332526356057729, + "learning_rate": 6.056765425841921e-07, + "loss": 0.667, + "step": 25515 + }, + { + "epoch": 2.57, + "grad_norm": 4.774005536488563, + "learning_rate": 6.04278050136376e-07, + "loss": 0.633, + "step": 25520 + }, + { + "epoch": 2.57, + "grad_norm": 8.115837864694559, + "learning_rate": 6.028810702522853e-07, + "loss": 0.6713, + "step": 25525 + }, + { + "epoch": 2.57, + "grad_norm": 5.133344507166991, + "learning_rate": 6.014856034126177e-07, + "loss": 0.6668, + "step": 25530 + }, + { + "epoch": 2.57, + "grad_norm": 4.432610346496629, + "learning_rate": 6.000916500975534e-07, + "loss": 0.6277, + "step": 25535 + }, + { + "epoch": 2.57, + "grad_norm": 5.360887456370527, + "learning_rate": 5.986992107867495e-07, + "loss": 0.6635, + "step": 25540 + }, + { + "epoch": 2.58, + "grad_norm": 4.656144876230373, + "learning_rate": 5.973082859593448e-07, + "loss": 0.6342, + "step": 25545 + }, + { + "epoch": 2.58, + "grad_norm": 4.42985086767463, + "learning_rate": 5.959188760939527e-07, + "loss": 0.6248, + "step": 25550 + }, + { + "epoch": 2.58, + "grad_norm": 4.887388152336578, + "learning_rate": 5.945309816686695e-07, + "loss": 0.588, + "step": 25555 + }, + { + "epoch": 2.58, + "grad_norm": 4.9624664483204315, + "learning_rate": 5.931446031610666e-07, + "loss": 0.6716, + "step": 25560 + }, + { + "epoch": 2.58, + "grad_norm": 5.040677312192779, + "learning_rate": 5.917597410481979e-07, + "loss": 0.6604, + "step": 25565 + }, + { + "epoch": 2.58, + "grad_norm": 4.935228644214927, + "learning_rate": 5.903763958065912e-07, + "loss": 0.6344, + "step": 25570 + }, + { + "epoch": 2.58, + "grad_norm": 4.952775601502335, + "learning_rate": 5.889945679122566e-07, + "loss": 0.6559, + "step": 25575 + }, + { + "epoch": 2.58, + "grad_norm": 4.7423239151175185, + "learning_rate": 5.876142578406763e-07, + "loss": 0.6531, + "step": 25580 + }, + { + "epoch": 2.58, + "grad_norm": 4.254082493619603, + "learning_rate": 5.862354660668168e-07, + "loss": 0.6331, + "step": 25585 + }, + { + "epoch": 2.58, + "grad_norm": 4.633593931063916, + "learning_rate": 5.848581930651165e-07, + "loss": 0.6595, + "step": 25590 + }, + { + "epoch": 2.58, + "grad_norm": 4.51468653861724, + "learning_rate": 5.834824393094962e-07, + "loss": 0.668, + "step": 25595 + }, + { + "epoch": 2.58, + "grad_norm": 6.2054497461118, + "learning_rate": 5.821082052733495e-07, + "loss": 0.6203, + "step": 25600 + }, + { + "epoch": 2.58, + "grad_norm": 6.41402199262695, + "learning_rate": 5.807354914295516e-07, + "loss": 0.6508, + "step": 25605 + }, + { + "epoch": 2.58, + "grad_norm": 5.13511331457535, + "learning_rate": 5.793642982504477e-07, + "loss": 0.6391, + "step": 25610 + }, + { + "epoch": 2.58, + "grad_norm": 5.619860066420089, + "learning_rate": 5.779946262078683e-07, + "loss": 0.6209, + "step": 25615 + }, + { + "epoch": 2.58, + "grad_norm": 5.702009815902476, + "learning_rate": 5.766264757731144e-07, + "loss": 0.6334, + "step": 25620 + }, + { + "epoch": 2.58, + "grad_norm": 7.854644332692116, + "learning_rate": 5.752598474169641e-07, + "loss": 0.6912, + "step": 25625 + }, + { + "epoch": 2.58, + "grad_norm": 4.9694105036745775, + "learning_rate": 5.738947416096752e-07, + "loss": 0.6628, + "step": 25630 + }, + { + "epoch": 2.58, + "grad_norm": 4.500559211637084, + "learning_rate": 5.725311588209786e-07, + "loss": 0.6253, + "step": 25635 + }, + { + "epoch": 2.59, + "grad_norm": 4.848868708725839, + "learning_rate": 5.711690995200814e-07, + "loss": 0.6545, + "step": 25640 + }, + { + "epoch": 2.59, + "grad_norm": 5.647854890983991, + "learning_rate": 5.698085641756657e-07, + "loss": 0.6283, + "step": 25645 + }, + { + "epoch": 2.59, + "grad_norm": 5.390714116328605, + "learning_rate": 5.684495532558931e-07, + "loss": 0.6334, + "step": 25650 + }, + { + "epoch": 2.59, + "grad_norm": 5.771148030368524, + "learning_rate": 5.670920672283958e-07, + "loss": 0.6491, + "step": 25655 + }, + { + "epoch": 2.59, + "grad_norm": 4.782194409574925, + "learning_rate": 5.657361065602846e-07, + "loss": 0.6233, + "step": 25660 + }, + { + "epoch": 2.59, + "grad_norm": 5.897256882115723, + "learning_rate": 5.643816717181444e-07, + "loss": 0.6263, + "step": 25665 + }, + { + "epoch": 2.59, + "grad_norm": 4.424184580177064, + "learning_rate": 5.630287631680343e-07, + "loss": 0.5929, + "step": 25670 + }, + { + "epoch": 2.59, + "grad_norm": 4.881655967229124, + "learning_rate": 5.616773813754883e-07, + "loss": 0.6481, + "step": 25675 + }, + { + "epoch": 2.59, + "grad_norm": 4.506211542493089, + "learning_rate": 5.60327526805517e-07, + "loss": 0.6363, + "step": 25680 + }, + { + "epoch": 2.59, + "grad_norm": 7.092793877990308, + "learning_rate": 5.589791999226024e-07, + "loss": 0.6262, + "step": 25685 + }, + { + "epoch": 2.59, + "grad_norm": 4.5625815137116215, + "learning_rate": 5.576324011907041e-07, + "loss": 0.6374, + "step": 25690 + }, + { + "epoch": 2.59, + "grad_norm": 5.210948120306893, + "learning_rate": 5.562871310732543e-07, + "loss": 0.6637, + "step": 25695 + }, + { + "epoch": 2.59, + "grad_norm": 5.441147446353072, + "learning_rate": 5.54943390033158e-07, + "loss": 0.6427, + "step": 25700 + }, + { + "epoch": 2.59, + "grad_norm": 4.73805515667014, + "learning_rate": 5.53601178532795e-07, + "loss": 0.6232, + "step": 25705 + }, + { + "epoch": 2.59, + "grad_norm": 9.367015312850405, + "learning_rate": 5.522604970340201e-07, + "loss": 0.6698, + "step": 25710 + }, + { + "epoch": 2.59, + "grad_norm": 6.430392634994808, + "learning_rate": 5.509213459981594e-07, + "loss": 0.6715, + "step": 25715 + }, + { + "epoch": 2.59, + "grad_norm": 6.593853208661611, + "learning_rate": 5.495837258860154e-07, + "loss": 0.6764, + "step": 25720 + }, + { + "epoch": 2.59, + "grad_norm": 4.917194336277024, + "learning_rate": 5.482476371578605e-07, + "loss": 0.6576, + "step": 25725 + }, + { + "epoch": 2.59, + "grad_norm": 4.599302216384735, + "learning_rate": 5.469130802734418e-07, + "loss": 0.6801, + "step": 25730 + }, + { + "epoch": 2.59, + "grad_norm": 4.5781171029935255, + "learning_rate": 5.455800556919777e-07, + "loss": 0.628, + "step": 25735 + }, + { + "epoch": 2.6, + "grad_norm": 5.24417630576127, + "learning_rate": 5.442485638721635e-07, + "loss": 0.6454, + "step": 25740 + }, + { + "epoch": 2.6, + "grad_norm": 6.1990419917474675, + "learning_rate": 5.429186052721613e-07, + "loss": 0.6697, + "step": 25745 + }, + { + "epoch": 2.6, + "grad_norm": 4.870940237800007, + "learning_rate": 5.415901803496109e-07, + "loss": 0.6187, + "step": 25750 + }, + { + "epoch": 2.6, + "grad_norm": 5.990545890958486, + "learning_rate": 5.402632895616217e-07, + "loss": 0.6298, + "step": 25755 + }, + { + "epoch": 2.6, + "grad_norm": 6.069670700384858, + "learning_rate": 5.389379333647748e-07, + "loss": 0.6339, + "step": 25760 + }, + { + "epoch": 2.6, + "grad_norm": 5.133013881271903, + "learning_rate": 5.37614112215124e-07, + "loss": 0.6353, + "step": 25765 + }, + { + "epoch": 2.6, + "grad_norm": 4.550175486105931, + "learning_rate": 5.362918265681943e-07, + "loss": 0.5891, + "step": 25770 + }, + { + "epoch": 2.6, + "grad_norm": 5.150306726699754, + "learning_rate": 5.349710768789851e-07, + "loss": 0.6505, + "step": 25775 + }, + { + "epoch": 2.6, + "grad_norm": 4.704186001901201, + "learning_rate": 5.336518636019622e-07, + "loss": 0.6493, + "step": 25780 + }, + { + "epoch": 2.6, + "grad_norm": 4.631385357677084, + "learning_rate": 5.323341871910687e-07, + "loss": 0.6723, + "step": 25785 + }, + { + "epoch": 2.6, + "grad_norm": 4.732305304742764, + "learning_rate": 5.310180480997135e-07, + "loss": 0.6353, + "step": 25790 + }, + { + "epoch": 2.6, + "grad_norm": 5.81983535599789, + "learning_rate": 5.297034467807805e-07, + "loss": 0.6561, + "step": 25795 + }, + { + "epoch": 2.6, + "grad_norm": 4.4578611668626955, + "learning_rate": 5.2839038368662e-07, + "loss": 0.6382, + "step": 25800 + }, + { + "epoch": 2.6, + "grad_norm": 4.919902980430912, + "learning_rate": 5.27078859269059e-07, + "loss": 0.6479, + "step": 25805 + }, + { + "epoch": 2.6, + "grad_norm": 4.126133325799623, + "learning_rate": 5.257688739793892e-07, + "loss": 0.678, + "step": 25810 + }, + { + "epoch": 2.6, + "grad_norm": 6.436875044498097, + "learning_rate": 5.24460428268378e-07, + "loss": 0.6517, + "step": 25815 + }, + { + "epoch": 2.6, + "grad_norm": 5.790793229197409, + "learning_rate": 5.231535225862583e-07, + "loss": 0.6788, + "step": 25820 + }, + { + "epoch": 2.6, + "grad_norm": 4.505001576559171, + "learning_rate": 5.218481573827356e-07, + "loss": 0.6755, + "step": 25825 + }, + { + "epoch": 2.6, + "grad_norm": 4.509741158174838, + "learning_rate": 5.205443331069843e-07, + "loss": 0.6237, + "step": 25830 + }, + { + "epoch": 2.6, + "grad_norm": 10.540920818207695, + "learning_rate": 5.1924205020765e-07, + "loss": 0.6581, + "step": 25835 + }, + { + "epoch": 2.61, + "grad_norm": 4.4711223646163125, + "learning_rate": 5.179413091328461e-07, + "loss": 0.6207, + "step": 25840 + }, + { + "epoch": 2.61, + "grad_norm": 6.107980371352623, + "learning_rate": 5.166421103301572e-07, + "loss": 0.657, + "step": 25845 + }, + { + "epoch": 2.61, + "grad_norm": 4.71289938635372, + "learning_rate": 5.153444542466368e-07, + "loss": 0.6399, + "step": 25850 + }, + { + "epoch": 2.61, + "grad_norm": 5.43401069724985, + "learning_rate": 5.140483413288061e-07, + "loss": 0.6474, + "step": 25855 + }, + { + "epoch": 2.61, + "grad_norm": 4.692708943604419, + "learning_rate": 5.127537720226555e-07, + "loss": 0.6567, + "step": 25860 + }, + { + "epoch": 2.61, + "grad_norm": 5.929392791908258, + "learning_rate": 5.114607467736471e-07, + "loss": 0.6174, + "step": 25865 + }, + { + "epoch": 2.61, + "grad_norm": 4.493020788611299, + "learning_rate": 5.101692660267077e-07, + "loss": 0.6331, + "step": 25870 + }, + { + "epoch": 2.61, + "grad_norm": 7.050134354806047, + "learning_rate": 5.088793302262362e-07, + "loss": 0.6717, + "step": 25875 + }, + { + "epoch": 2.61, + "grad_norm": 4.760627520220874, + "learning_rate": 5.075909398160983e-07, + "loss": 0.6459, + "step": 25880 + }, + { + "epoch": 2.61, + "grad_norm": 4.579875926689988, + "learning_rate": 5.06304095239627e-07, + "loss": 0.6128, + "step": 25885 + }, + { + "epoch": 2.61, + "grad_norm": 4.695488079068494, + "learning_rate": 5.050187969396248e-07, + "loss": 0.6409, + "step": 25890 + }, + { + "epoch": 2.61, + "grad_norm": 4.584920910978887, + "learning_rate": 5.037350453583601e-07, + "loss": 0.6127, + "step": 25895 + }, + { + "epoch": 2.61, + "grad_norm": 4.733672284793962, + "learning_rate": 5.024528409375728e-07, + "loss": 0.6261, + "step": 25900 + }, + { + "epoch": 2.61, + "grad_norm": 5.334173651853007, + "learning_rate": 5.011721841184663e-07, + "loss": 0.6602, + "step": 25905 + }, + { + "epoch": 2.61, + "grad_norm": 4.659865730931912, + "learning_rate": 4.998930753417153e-07, + "loss": 0.6366, + "step": 25910 + }, + { + "epoch": 2.61, + "grad_norm": 7.149491262060584, + "learning_rate": 4.986155150474592e-07, + "loss": 0.6125, + "step": 25915 + }, + { + "epoch": 2.61, + "grad_norm": 5.661274683439253, + "learning_rate": 4.973395036753054e-07, + "loss": 0.6447, + "step": 25920 + }, + { + "epoch": 2.61, + "grad_norm": 5.369340660498729, + "learning_rate": 4.960650416643259e-07, + "loss": 0.6536, + "step": 25925 + }, + { + "epoch": 2.61, + "grad_norm": 4.4477401440865405, + "learning_rate": 4.947921294530656e-07, + "loss": 0.6509, + "step": 25930 + }, + { + "epoch": 2.61, + "grad_norm": 5.025337057836171, + "learning_rate": 4.935207674795295e-07, + "loss": 0.6458, + "step": 25935 + }, + { + "epoch": 2.62, + "grad_norm": 4.64075589139611, + "learning_rate": 4.922509561811939e-07, + "loss": 0.6374, + "step": 25940 + }, + { + "epoch": 2.62, + "grad_norm": 4.465561320407794, + "learning_rate": 4.909826959949988e-07, + "loss": 0.6422, + "step": 25945 + }, + { + "epoch": 2.62, + "grad_norm": 4.385235783038693, + "learning_rate": 4.897159873573515e-07, + "loss": 0.6421, + "step": 25950 + }, + { + "epoch": 2.62, + "grad_norm": 4.594086031534952, + "learning_rate": 4.884508307041241e-07, + "loss": 0.6374, + "step": 25955 + }, + { + "epoch": 2.62, + "grad_norm": 4.258903294857566, + "learning_rate": 4.871872264706579e-07, + "loss": 0.6678, + "step": 25960 + }, + { + "epoch": 2.62, + "grad_norm": 4.863058821194196, + "learning_rate": 4.859251750917559e-07, + "loss": 0.6825, + "step": 25965 + }, + { + "epoch": 2.62, + "grad_norm": 4.665283377330753, + "learning_rate": 4.846646770016905e-07, + "loss": 0.6431, + "step": 25970 + }, + { + "epoch": 2.62, + "grad_norm": 4.468217346387384, + "learning_rate": 4.834057326341973e-07, + "loss": 0.6687, + "step": 25975 + }, + { + "epoch": 2.62, + "grad_norm": 4.913611862161178, + "learning_rate": 4.821483424224776e-07, + "loss": 0.6269, + "step": 25980 + }, + { + "epoch": 2.62, + "grad_norm": 5.830245506259714, + "learning_rate": 4.808925067991977e-07, + "loss": 0.642, + "step": 25985 + }, + { + "epoch": 2.62, + "grad_norm": 5.179273168502301, + "learning_rate": 4.796382261964905e-07, + "loss": 0.647, + "step": 25990 + }, + { + "epoch": 2.62, + "grad_norm": 4.474430609458084, + "learning_rate": 4.78385501045951e-07, + "loss": 0.6752, + "step": 25995 + }, + { + "epoch": 2.62, + "grad_norm": 5.195319875581904, + "learning_rate": 4.771343317786431e-07, + "loss": 0.6482, + "step": 26000 + }, + { + "epoch": 2.62, + "grad_norm": 5.632248905230632, + "learning_rate": 4.75884718825092e-07, + "loss": 0.6448, + "step": 26005 + }, + { + "epoch": 2.62, + "grad_norm": 4.697644503611095, + "learning_rate": 4.7463666261528816e-07, + "loss": 0.6415, + "step": 26010 + }, + { + "epoch": 2.62, + "grad_norm": 4.47786641642741, + "learning_rate": 4.7339016357868586e-07, + "loss": 0.6067, + "step": 26015 + }, + { + "epoch": 2.62, + "grad_norm": 4.537017336696449, + "learning_rate": 4.7214522214420464e-07, + "loss": 0.6452, + "step": 26020 + }, + { + "epoch": 2.62, + "grad_norm": 5.8881464897238285, + "learning_rate": 4.7090183874022867e-07, + "loss": 0.6293, + "step": 26025 + }, + { + "epoch": 2.62, + "grad_norm": 7.987028504731956, + "learning_rate": 4.696600137946028e-07, + "loss": 0.6627, + "step": 26030 + }, + { + "epoch": 2.62, + "grad_norm": 4.634628482096437, + "learning_rate": 4.684197477346408e-07, + "loss": 0.6449, + "step": 26035 + }, + { + "epoch": 2.63, + "grad_norm": 4.980914909416849, + "learning_rate": 4.6718104098711525e-07, + "loss": 0.6715, + "step": 26040 + }, + { + "epoch": 2.63, + "grad_norm": 4.477828631881524, + "learning_rate": 4.65943893978264e-07, + "loss": 0.6841, + "step": 26045 + }, + { + "epoch": 2.63, + "grad_norm": 4.962689410012361, + "learning_rate": 4.6470830713378714e-07, + "loss": 0.6577, + "step": 26050 + }, + { + "epoch": 2.63, + "grad_norm": 4.410764725035844, + "learning_rate": 4.634742808788517e-07, + "loss": 0.6437, + "step": 26055 + }, + { + "epoch": 2.63, + "grad_norm": 5.205768291835934, + "learning_rate": 4.62241815638082e-07, + "loss": 0.6492, + "step": 26060 + }, + { + "epoch": 2.63, + "grad_norm": 4.529713580489901, + "learning_rate": 4.610109118355699e-07, + "loss": 0.6483, + "step": 26065 + }, + { + "epoch": 2.63, + "grad_norm": 5.108805923799643, + "learning_rate": 4.597815698948688e-07, + "loss": 0.6324, + "step": 26070 + }, + { + "epoch": 2.63, + "grad_norm": 4.6314997168050525, + "learning_rate": 4.585537902389925e-07, + "loss": 0.6332, + "step": 26075 + }, + { + "epoch": 2.63, + "grad_norm": 7.184210025979383, + "learning_rate": 4.5732757329041866e-07, + "loss": 0.6728, + "step": 26080 + }, + { + "epoch": 2.63, + "grad_norm": 4.714161295756465, + "learning_rate": 4.5610291947108866e-07, + "loss": 0.6529, + "step": 26085 + }, + { + "epoch": 2.63, + "grad_norm": 5.667064450386645, + "learning_rate": 4.548798292024037e-07, + "loss": 0.6738, + "step": 26090 + }, + { + "epoch": 2.63, + "grad_norm": 7.745480185868553, + "learning_rate": 4.536583029052294e-07, + "loss": 0.6351, + "step": 26095 + }, + { + "epoch": 2.63, + "grad_norm": 5.460932447787701, + "learning_rate": 4.5243834099989006e-07, + "loss": 0.6804, + "step": 26100 + }, + { + "epoch": 2.63, + "grad_norm": 4.212632081219248, + "learning_rate": 4.5121994390617484e-07, + "loss": 0.6317, + "step": 26105 + }, + { + "epoch": 2.63, + "grad_norm": 8.231950682387714, + "learning_rate": 4.5000311204333123e-07, + "loss": 0.6345, + "step": 26110 + }, + { + "epoch": 2.63, + "grad_norm": 6.870402774643485, + "learning_rate": 4.4878784583007207e-07, + "loss": 0.6527, + "step": 26115 + }, + { + "epoch": 2.63, + "grad_norm": 4.89773131939057, + "learning_rate": 4.4757414568456724e-07, + "loss": 0.6368, + "step": 26120 + }, + { + "epoch": 2.63, + "grad_norm": 4.53326405211866, + "learning_rate": 4.4636201202445164e-07, + "loss": 0.6441, + "step": 26125 + }, + { + "epoch": 2.63, + "grad_norm": 4.791953087780344, + "learning_rate": 4.4515144526681875e-07, + "loss": 0.6438, + "step": 26130 + }, + { + "epoch": 2.63, + "grad_norm": 5.825566433607306, + "learning_rate": 4.4394244582822264e-07, + "loss": 0.6512, + "step": 26135 + }, + { + "epoch": 2.64, + "grad_norm": 6.235658704216288, + "learning_rate": 4.4273501412468e-07, + "loss": 0.6547, + "step": 26140 + }, + { + "epoch": 2.64, + "grad_norm": 6.890768808912151, + "learning_rate": 4.4152915057166513e-07, + "loss": 0.6593, + "step": 26145 + }, + { + "epoch": 2.64, + "grad_norm": 4.795158812209058, + "learning_rate": 4.403248555841166e-07, + "loss": 0.6529, + "step": 26150 + }, + { + "epoch": 2.64, + "grad_norm": 5.131070693260541, + "learning_rate": 4.391221295764292e-07, + "loss": 0.6528, + "step": 26155 + }, + { + "epoch": 2.64, + "grad_norm": 4.710697756328377, + "learning_rate": 4.379209729624617e-07, + "loss": 0.6503, + "step": 26160 + }, + { + "epoch": 2.64, + "grad_norm": 4.6781511899293236, + "learning_rate": 4.367213861555308e-07, + "loss": 0.6672, + "step": 26165 + }, + { + "epoch": 2.64, + "grad_norm": 5.517593966636261, + "learning_rate": 4.355233695684119e-07, + "loss": 0.6221, + "step": 26170 + }, + { + "epoch": 2.64, + "grad_norm": 8.373524161316528, + "learning_rate": 4.343269236133413e-07, + "loss": 0.6879, + "step": 26175 + }, + { + "epoch": 2.64, + "grad_norm": 4.720161138755838, + "learning_rate": 4.331320487020163e-07, + "loss": 0.6572, + "step": 26180 + }, + { + "epoch": 2.64, + "grad_norm": 4.763630065649678, + "learning_rate": 4.319387452455903e-07, + "loss": 0.6046, + "step": 26185 + }, + { + "epoch": 2.64, + "grad_norm": 4.422303645853079, + "learning_rate": 4.3074701365468043e-07, + "loss": 0.6208, + "step": 26190 + }, + { + "epoch": 2.64, + "grad_norm": 4.433871515640049, + "learning_rate": 4.295568543393591e-07, + "loss": 0.6487, + "step": 26195 + }, + { + "epoch": 2.64, + "grad_norm": 4.931653971987222, + "learning_rate": 4.283682677091583e-07, + "loss": 0.6361, + "step": 26200 + }, + { + "epoch": 2.64, + "grad_norm": 4.890247256439932, + "learning_rate": 4.271812541730697e-07, + "loss": 0.7039, + "step": 26205 + }, + { + "epoch": 2.64, + "grad_norm": 4.698518626266278, + "learning_rate": 4.2599581413954485e-07, + "loss": 0.6728, + "step": 26210 + }, + { + "epoch": 2.64, + "grad_norm": 4.8057777852945, + "learning_rate": 4.2481194801649086e-07, + "loss": 0.6493, + "step": 26215 + }, + { + "epoch": 2.64, + "grad_norm": 5.169528262794991, + "learning_rate": 4.236296562112768e-07, + "loss": 0.6523, + "step": 26220 + }, + { + "epoch": 2.64, + "grad_norm": 5.521880829571063, + "learning_rate": 4.224489391307268e-07, + "loss": 0.6345, + "step": 26225 + }, + { + "epoch": 2.64, + "grad_norm": 5.596943705221121, + "learning_rate": 4.212697971811247e-07, + "loss": 0.6282, + "step": 26230 + }, + { + "epoch": 2.65, + "grad_norm": 4.40358921987457, + "learning_rate": 4.200922307682115e-07, + "loss": 0.6475, + "step": 26235 + }, + { + "epoch": 2.65, + "grad_norm": 4.424249521185416, + "learning_rate": 4.1891624029718856e-07, + "loss": 0.6432, + "step": 26240 + }, + { + "epoch": 2.65, + "grad_norm": 4.796678850595712, + "learning_rate": 4.1774182617271064e-07, + "loss": 0.6428, + "step": 26245 + }, + { + "epoch": 2.65, + "grad_norm": 4.6661345801770056, + "learning_rate": 4.165689887988944e-07, + "loss": 0.6626, + "step": 26250 + }, + { + "epoch": 2.65, + "grad_norm": 4.5806291067324105, + "learning_rate": 4.1539772857931203e-07, + "loss": 0.6459, + "step": 26255 + }, + { + "epoch": 2.65, + "grad_norm": 4.5650564431500085, + "learning_rate": 4.142280459169923e-07, + "loss": 0.6337, + "step": 26260 + }, + { + "epoch": 2.65, + "grad_norm": 4.73604227877278, + "learning_rate": 4.130599412144215e-07, + "loss": 0.6437, + "step": 26265 + }, + { + "epoch": 2.65, + "grad_norm": 4.586531802102478, + "learning_rate": 4.118934148735437e-07, + "loss": 0.653, + "step": 26270 + }, + { + "epoch": 2.65, + "grad_norm": 5.758452148639693, + "learning_rate": 4.107284672957601e-07, + "loss": 0.6437, + "step": 26275 + }, + { + "epoch": 2.65, + "grad_norm": 4.571298067242407, + "learning_rate": 4.095650988819272e-07, + "loss": 0.6772, + "step": 26280 + }, + { + "epoch": 2.65, + "grad_norm": 7.556138379592204, + "learning_rate": 4.084033100323598e-07, + "loss": 0.6234, + "step": 26285 + }, + { + "epoch": 2.65, + "grad_norm": 5.604219528719242, + "learning_rate": 4.072431011468286e-07, + "loss": 0.6023, + "step": 26290 + }, + { + "epoch": 2.65, + "grad_norm": 5.610685075191588, + "learning_rate": 4.0608447262455886e-07, + "loss": 0.6254, + "step": 26295 + }, + { + "epoch": 2.65, + "grad_norm": 4.604971165395091, + "learning_rate": 4.049274248642343e-07, + "loss": 0.6382, + "step": 26300 + }, + { + "epoch": 2.65, + "grad_norm": 5.805269762146627, + "learning_rate": 4.037719582639943e-07, + "loss": 0.6471, + "step": 26305 + }, + { + "epoch": 2.65, + "grad_norm": 5.928338049264352, + "learning_rate": 4.02618073221433e-07, + "loss": 0.6322, + "step": 26310 + }, + { + "epoch": 2.65, + "grad_norm": 4.635905096298693, + "learning_rate": 4.014657701336028e-07, + "loss": 0.6773, + "step": 26315 + }, + { + "epoch": 2.65, + "grad_norm": 4.460898503134962, + "learning_rate": 4.003150493970087e-07, + "loss": 0.6585, + "step": 26320 + }, + { + "epoch": 2.65, + "grad_norm": 4.856638872130659, + "learning_rate": 3.9916591140761294e-07, + "loss": 0.6428, + "step": 26325 + }, + { + "epoch": 2.65, + "grad_norm": 4.54106911842793, + "learning_rate": 3.98018356560832e-07, + "loss": 0.6197, + "step": 26330 + }, + { + "epoch": 2.66, + "grad_norm": 4.777343218581925, + "learning_rate": 3.9687238525153994e-07, + "loss": 0.6599, + "step": 26335 + }, + { + "epoch": 2.66, + "grad_norm": 5.6748165213887125, + "learning_rate": 3.9572799787406245e-07, + "loss": 0.6462, + "step": 26340 + }, + { + "epoch": 2.66, + "grad_norm": 4.5916775692590726, + "learning_rate": 3.945851948221846e-07, + "loss": 0.605, + "step": 26345 + }, + { + "epoch": 2.66, + "grad_norm": 4.306976299073246, + "learning_rate": 3.9344397648914234e-07, + "loss": 0.6697, + "step": 26350 + }, + { + "epoch": 2.66, + "grad_norm": 5.195639071441672, + "learning_rate": 3.9230434326762823e-07, + "loss": 0.6419, + "step": 26355 + }, + { + "epoch": 2.66, + "grad_norm": 4.581093545879182, + "learning_rate": 3.9116629554978747e-07, + "loss": 0.6413, + "step": 26360 + }, + { + "epoch": 2.66, + "grad_norm": 5.052458529720565, + "learning_rate": 3.9002983372722345e-07, + "loss": 0.6623, + "step": 26365 + }, + { + "epoch": 2.66, + "grad_norm": 4.697125648228998, + "learning_rate": 3.888949581909901e-07, + "loss": 0.6843, + "step": 26370 + }, + { + "epoch": 2.66, + "grad_norm": 5.660423087556206, + "learning_rate": 3.877616693315978e-07, + "loss": 0.6547, + "step": 26375 + }, + { + "epoch": 2.66, + "grad_norm": 4.394527486808878, + "learning_rate": 3.8662996753901063e-07, + "loss": 0.6739, + "step": 26380 + }, + { + "epoch": 2.66, + "grad_norm": 4.587506714746173, + "learning_rate": 3.8549985320264496e-07, + "loss": 0.6577, + "step": 26385 + }, + { + "epoch": 2.66, + "grad_norm": 4.525985421444638, + "learning_rate": 3.8437132671137245e-07, + "loss": 0.6433, + "step": 26390 + }, + { + "epoch": 2.66, + "grad_norm": 5.308533008713368, + "learning_rate": 3.8324438845351697e-07, + "loss": 0.6337, + "step": 26395 + }, + { + "epoch": 2.66, + "grad_norm": 4.600283096400497, + "learning_rate": 3.8211903881685887e-07, + "loss": 0.6838, + "step": 26400 + }, + { + "epoch": 2.66, + "grad_norm": 4.629575518720822, + "learning_rate": 3.8099527818862837e-07, + "loss": 0.6116, + "step": 26405 + }, + { + "epoch": 2.66, + "grad_norm": 4.946398314146315, + "learning_rate": 3.7987310695551115e-07, + "loss": 0.6687, + "step": 26410 + }, + { + "epoch": 2.66, + "grad_norm": 4.699259761441962, + "learning_rate": 3.787525255036456e-07, + "loss": 0.6748, + "step": 26415 + }, + { + "epoch": 2.66, + "grad_norm": 4.4768450040424606, + "learning_rate": 3.7763353421862215e-07, + "loss": 0.6404, + "step": 26420 + }, + { + "epoch": 2.66, + "grad_norm": 4.489871861176078, + "learning_rate": 3.7651613348548386e-07, + "loss": 0.6138, + "step": 26425 + }, + { + "epoch": 2.66, + "grad_norm": 4.413401018566667, + "learning_rate": 3.7540032368872937e-07, + "loss": 0.6692, + "step": 26430 + }, + { + "epoch": 2.67, + "grad_norm": 4.8938714822113205, + "learning_rate": 3.742861052123048e-07, + "loss": 0.5991, + "step": 26435 + }, + { + "epoch": 2.67, + "grad_norm": 5.467851521742816, + "learning_rate": 3.7317347843961514e-07, + "loss": 0.7053, + "step": 26440 + }, + { + "epoch": 2.67, + "grad_norm": 4.564001385017399, + "learning_rate": 3.7206244375351197e-07, + "loss": 0.6543, + "step": 26445 + }, + { + "epoch": 2.67, + "grad_norm": 5.099093322914692, + "learning_rate": 3.7095300153630167e-07, + "loss": 0.6714, + "step": 26450 + }, + { + "epoch": 2.67, + "grad_norm": 4.413616723694407, + "learning_rate": 3.6984515216974104e-07, + "loss": 0.6774, + "step": 26455 + }, + { + "epoch": 2.67, + "grad_norm": 4.746017427039399, + "learning_rate": 3.687388960350424e-07, + "loss": 0.6702, + "step": 26460 + }, + { + "epoch": 2.67, + "grad_norm": 6.302096269813245, + "learning_rate": 3.6763423351286576e-07, + "loss": 0.6302, + "step": 26465 + }, + { + "epoch": 2.67, + "grad_norm": 5.413759822215279, + "learning_rate": 3.6653116498332587e-07, + "loss": 0.6451, + "step": 26470 + }, + { + "epoch": 2.67, + "grad_norm": 4.924375370439851, + "learning_rate": 3.6542969082598576e-07, + "loss": 0.6791, + "step": 26475 + }, + { + "epoch": 2.67, + "grad_norm": 4.551790902336037, + "learning_rate": 3.6432981141986347e-07, + "loss": 0.6193, + "step": 26480 + }, + { + "epoch": 2.67, + "grad_norm": 4.447720030507796, + "learning_rate": 3.632315271434239e-07, + "loss": 0.6189, + "step": 26485 + }, + { + "epoch": 2.67, + "grad_norm": 4.80993642009209, + "learning_rate": 3.6213483837458873e-07, + "loss": 0.6334, + "step": 26490 + }, + { + "epoch": 2.67, + "grad_norm": 4.8623738488613215, + "learning_rate": 3.61039745490725e-07, + "loss": 0.6147, + "step": 26495 + }, + { + "epoch": 2.67, + "grad_norm": 5.022644888678823, + "learning_rate": 3.5994624886865445e-07, + "loss": 0.697, + "step": 26500 + }, + { + "epoch": 2.67, + "grad_norm": 12.240058964168647, + "learning_rate": 3.588543488846485e-07, + "loss": 0.6848, + "step": 26505 + }, + { + "epoch": 2.67, + "grad_norm": 4.456351689321203, + "learning_rate": 3.5776404591442824e-07, + "loss": 0.633, + "step": 26510 + }, + { + "epoch": 2.67, + "grad_norm": 5.244832953200636, + "learning_rate": 3.5667534033316466e-07, + "loss": 0.6951, + "step": 26515 + }, + { + "epoch": 2.67, + "grad_norm": 4.539647528153177, + "learning_rate": 3.5558823251548304e-07, + "loss": 0.6083, + "step": 26520 + }, + { + "epoch": 2.67, + "grad_norm": 5.573046024457291, + "learning_rate": 3.545027228354542e-07, + "loss": 0.6366, + "step": 26525 + }, + { + "epoch": 2.67, + "grad_norm": 4.955392603809859, + "learning_rate": 3.534188116666004e-07, + "loss": 0.681, + "step": 26530 + }, + { + "epoch": 2.68, + "grad_norm": 5.170833130307659, + "learning_rate": 3.523364993818978e-07, + "loss": 0.6342, + "step": 26535 + }, + { + "epoch": 2.68, + "grad_norm": 4.891291415673024, + "learning_rate": 3.512557863537647e-07, + "loss": 0.6489, + "step": 26540 + }, + { + "epoch": 2.68, + "grad_norm": 4.888406857967937, + "learning_rate": 3.5017667295407676e-07, + "loss": 0.6544, + "step": 26545 + }, + { + "epoch": 2.68, + "grad_norm": 4.449293637678464, + "learning_rate": 3.490991595541532e-07, + "loss": 0.6436, + "step": 26550 + }, + { + "epoch": 2.68, + "grad_norm": 4.652288939825329, + "learning_rate": 3.480232465247679e-07, + "loss": 0.6586, + "step": 26555 + }, + { + "epoch": 2.68, + "grad_norm": 6.060881608415734, + "learning_rate": 3.469489342361393e-07, + "loss": 0.6442, + "step": 26560 + }, + { + "epoch": 2.68, + "grad_norm": 4.798399988610016, + "learning_rate": 3.458762230579388e-07, + "loss": 0.6235, + "step": 26565 + }, + { + "epoch": 2.68, + "grad_norm": 4.838544020817497, + "learning_rate": 3.44805113359285e-07, + "loss": 0.652, + "step": 26570 + }, + { + "epoch": 2.68, + "grad_norm": 4.5975411064486185, + "learning_rate": 3.4373560550874543e-07, + "loss": 0.6477, + "step": 26575 + }, + { + "epoch": 2.68, + "grad_norm": 5.549271557696654, + "learning_rate": 3.426676998743361e-07, + "loss": 0.6937, + "step": 26580 + }, + { + "epoch": 2.68, + "grad_norm": 4.760921452630401, + "learning_rate": 3.416013968235238e-07, + "loss": 0.6432, + "step": 26585 + }, + { + "epoch": 2.68, + "grad_norm": 5.4813525599711355, + "learning_rate": 3.4053669672322096e-07, + "loss": 0.6855, + "step": 26590 + }, + { + "epoch": 2.68, + "grad_norm": 4.671748925862381, + "learning_rate": 3.3947359993979077e-07, + "loss": 0.6466, + "step": 26595 + }, + { + "epoch": 2.68, + "grad_norm": 4.543481218279123, + "learning_rate": 3.3841210683904393e-07, + "loss": 0.6411, + "step": 26600 + }, + { + "epoch": 2.68, + "grad_norm": 4.72674022959049, + "learning_rate": 3.3735221778623815e-07, + "loss": 0.6554, + "step": 26605 + }, + { + "epoch": 2.68, + "grad_norm": 5.098608945175818, + "learning_rate": 3.362939331460807e-07, + "loss": 0.6626, + "step": 26610 + }, + { + "epoch": 2.68, + "grad_norm": 4.787862398042563, + "learning_rate": 3.352372532827275e-07, + "loss": 0.6539, + "step": 26615 + }, + { + "epoch": 2.68, + "grad_norm": 4.632119559373765, + "learning_rate": 3.341821785597787e-07, + "loss": 0.6774, + "step": 26620 + }, + { + "epoch": 2.68, + "grad_norm": 4.783547536689469, + "learning_rate": 3.3312870934028685e-07, + "loss": 0.6352, + "step": 26625 + }, + { + "epoch": 2.68, + "grad_norm": 4.297647085341283, + "learning_rate": 3.3207684598674906e-07, + "loss": 0.674, + "step": 26630 + }, + { + "epoch": 2.69, + "grad_norm": 4.711494616548537, + "learning_rate": 3.310265888611097e-07, + "loss": 0.6873, + "step": 26635 + }, + { + "epoch": 2.69, + "grad_norm": 5.4983153795010775, + "learning_rate": 3.2997793832476146e-07, + "loss": 0.6039, + "step": 26640 + }, + { + "epoch": 2.69, + "grad_norm": 4.69938871439085, + "learning_rate": 3.2893089473854447e-07, + "loss": 0.6504, + "step": 26645 + }, + { + "epoch": 2.69, + "grad_norm": 4.5622571590941225, + "learning_rate": 3.2788545846274553e-07, + "loss": 0.6531, + "step": 26650 + }, + { + "epoch": 2.69, + "grad_norm": 6.100048240343806, + "learning_rate": 3.2684162985709757e-07, + "loss": 0.6577, + "step": 26655 + }, + { + "epoch": 2.69, + "grad_norm": 6.270853011278117, + "learning_rate": 3.2579940928078204e-07, + "loss": 0.6192, + "step": 26660 + }, + { + "epoch": 2.69, + "grad_norm": 4.928062910367746, + "learning_rate": 3.247587970924243e-07, + "loss": 0.6612, + "step": 26665 + }, + { + "epoch": 2.69, + "grad_norm": 8.925526303808986, + "learning_rate": 3.237197936501002e-07, + "loss": 0.6387, + "step": 26670 + }, + { + "epoch": 2.69, + "grad_norm": 6.50319252883846, + "learning_rate": 3.226823993113276e-07, + "loss": 0.6456, + "step": 26675 + }, + { + "epoch": 2.69, + "grad_norm": 4.515467389106821, + "learning_rate": 3.216466144330749e-07, + "loss": 0.6674, + "step": 26680 + }, + { + "epoch": 2.69, + "grad_norm": 4.683024789220738, + "learning_rate": 3.2061243937175304e-07, + "loss": 0.6754, + "step": 26685 + }, + { + "epoch": 2.69, + "grad_norm": 4.525353924162903, + "learning_rate": 3.195798744832235e-07, + "loss": 0.648, + "step": 26690 + }, + { + "epoch": 2.69, + "grad_norm": 4.236326022567566, + "learning_rate": 3.185489201227865e-07, + "loss": 0.6259, + "step": 26695 + }, + { + "epoch": 2.69, + "grad_norm": 7.172427585267906, + "learning_rate": 3.1751957664519606e-07, + "loss": 0.6693, + "step": 26700 + }, + { + "epoch": 2.69, + "grad_norm": 4.397112012371753, + "learning_rate": 3.164918444046461e-07, + "loss": 0.6263, + "step": 26705 + }, + { + "epoch": 2.69, + "grad_norm": 4.769481924908539, + "learning_rate": 3.154657237547798e-07, + "loss": 0.6951, + "step": 26710 + }, + { + "epoch": 2.69, + "grad_norm": 4.80030276412543, + "learning_rate": 3.144412150486831e-07, + "loss": 0.6363, + "step": 26715 + }, + { + "epoch": 2.69, + "grad_norm": 4.923919053517498, + "learning_rate": 3.134183186388906e-07, + "loss": 0.7015, + "step": 26720 + }, + { + "epoch": 2.69, + "grad_norm": 4.634432469691726, + "learning_rate": 3.1239703487737696e-07, + "loss": 0.6231, + "step": 26725 + }, + { + "epoch": 2.69, + "grad_norm": 4.6519209114755435, + "learning_rate": 3.1137736411556705e-07, + "loss": 0.6219, + "step": 26730 + }, + { + "epoch": 2.7, + "grad_norm": 4.644362142363348, + "learning_rate": 3.103593067043276e-07, + "loss": 0.6476, + "step": 26735 + }, + { + "epoch": 2.7, + "grad_norm": 5.0411318413340815, + "learning_rate": 3.093428629939721e-07, + "loss": 0.6551, + "step": 26740 + }, + { + "epoch": 2.7, + "grad_norm": 5.615899235277585, + "learning_rate": 3.0832803333425643e-07, + "loss": 0.6275, + "step": 26745 + }, + { + "epoch": 2.7, + "grad_norm": 5.031842335224812, + "learning_rate": 3.0731481807438513e-07, + "loss": 0.652, + "step": 26750 + }, + { + "epoch": 2.7, + "grad_norm": 4.7918819848872864, + "learning_rate": 3.063032175630015e-07, + "loss": 0.6294, + "step": 26755 + }, + { + "epoch": 2.7, + "grad_norm": 5.49577226983595, + "learning_rate": 3.052932321481983e-07, + "loss": 0.6443, + "step": 26760 + }, + { + "epoch": 2.7, + "grad_norm": 5.230227096478425, + "learning_rate": 3.0428486217750907e-07, + "loss": 0.6361, + "step": 26765 + }, + { + "epoch": 2.7, + "grad_norm": 4.67422220177396, + "learning_rate": 3.032781079979147e-07, + "loss": 0.6537, + "step": 26770 + }, + { + "epoch": 2.7, + "grad_norm": 5.363855352539699, + "learning_rate": 3.0227296995583797e-07, + "loss": 0.6151, + "step": 26775 + }, + { + "epoch": 2.7, + "grad_norm": 7.584201273966142, + "learning_rate": 3.012694483971446e-07, + "loss": 0.6498, + "step": 26780 + }, + { + "epoch": 2.7, + "grad_norm": 4.896712764016411, + "learning_rate": 3.0026754366714703e-07, + "loss": 0.6589, + "step": 26785 + }, + { + "epoch": 2.7, + "grad_norm": 5.134313710305708, + "learning_rate": 2.9926725611059747e-07, + "loss": 0.6656, + "step": 26790 + }, + { + "epoch": 2.7, + "grad_norm": 4.870412747383471, + "learning_rate": 2.982685860716966e-07, + "loss": 0.6446, + "step": 26795 + }, + { + "epoch": 2.7, + "grad_norm": 4.89087219092221, + "learning_rate": 2.9727153389408347e-07, + "loss": 0.6327, + "step": 26800 + }, + { + "epoch": 2.7, + "grad_norm": 4.952435309248969, + "learning_rate": 2.962760999208453e-07, + "loss": 0.665, + "step": 26805 + }, + { + "epoch": 2.7, + "grad_norm": 4.448454332529152, + "learning_rate": 2.95282284494508e-07, + "loss": 0.6392, + "step": 26810 + }, + { + "epoch": 2.7, + "grad_norm": 5.802846152453651, + "learning_rate": 2.9429008795704317e-07, + "loss": 0.6402, + "step": 26815 + }, + { + "epoch": 2.7, + "grad_norm": 5.889305219959239, + "learning_rate": 2.932995106498637e-07, + "loss": 0.6156, + "step": 26820 + }, + { + "epoch": 2.7, + "grad_norm": 4.515546156661981, + "learning_rate": 2.9231055291382816e-07, + "loss": 0.6196, + "step": 26825 + }, + { + "epoch": 2.71, + "grad_norm": 4.409786442384293, + "learning_rate": 2.9132321508923424e-07, + "loss": 0.615, + "step": 26830 + }, + { + "epoch": 2.71, + "grad_norm": 4.82496443011332, + "learning_rate": 2.903374975158257e-07, + "loss": 0.666, + "step": 26835 + }, + { + "epoch": 2.71, + "grad_norm": 5.321518340014227, + "learning_rate": 2.893534005327858e-07, + "loss": 0.6391, + "step": 26840 + }, + { + "epoch": 2.71, + "grad_norm": 4.6860386188295795, + "learning_rate": 2.883709244787414e-07, + "loss": 0.6422, + "step": 26845 + }, + { + "epoch": 2.71, + "grad_norm": 4.54933372316617, + "learning_rate": 2.8739006969176087e-07, + "loss": 0.6227, + "step": 26850 + }, + { + "epoch": 2.71, + "grad_norm": 4.550003769397981, + "learning_rate": 2.8641083650935765e-07, + "loss": 0.6237, + "step": 26855 + }, + { + "epoch": 2.71, + "grad_norm": 5.100015574627647, + "learning_rate": 2.854332252684827e-07, + "loss": 0.6396, + "step": 26860 + }, + { + "epoch": 2.71, + "grad_norm": 4.526484394100703, + "learning_rate": 2.844572363055326e-07, + "loss": 0.6314, + "step": 26865 + }, + { + "epoch": 2.71, + "grad_norm": 4.262718390081325, + "learning_rate": 2.8348286995634247e-07, + "loss": 0.6888, + "step": 26870 + }, + { + "epoch": 2.71, + "grad_norm": 5.466091994999804, + "learning_rate": 2.8251012655619417e-07, + "loss": 0.6439, + "step": 26875 + }, + { + "epoch": 2.71, + "grad_norm": 4.722071904748594, + "learning_rate": 2.815390064398038e-07, + "loss": 0.6373, + "step": 26880 + }, + { + "epoch": 2.71, + "grad_norm": 5.302036926701708, + "learning_rate": 2.8056950994133524e-07, + "loss": 0.6513, + "step": 26885 + }, + { + "epoch": 2.71, + "grad_norm": 4.580234389013792, + "learning_rate": 2.796016373943905e-07, + "loss": 0.6607, + "step": 26890 + }, + { + "epoch": 2.71, + "grad_norm": 5.854842515682165, + "learning_rate": 2.786353891320143e-07, + "loss": 0.6205, + "step": 26895 + }, + { + "epoch": 2.71, + "grad_norm": 5.375572061695649, + "learning_rate": 2.7767076548669004e-07, + "loss": 0.6762, + "step": 26900 + }, + { + "epoch": 2.71, + "grad_norm": 4.422668495277672, + "learning_rate": 2.7670776679034727e-07, + "loss": 0.6533, + "step": 26905 + }, + { + "epoch": 2.71, + "grad_norm": 6.482983874201208, + "learning_rate": 2.757463933743493e-07, + "loss": 0.6156, + "step": 26910 + }, + { + "epoch": 2.71, + "grad_norm": 5.274977240043379, + "learning_rate": 2.747866455695053e-07, + "loss": 0.6541, + "step": 26915 + }, + { + "epoch": 2.71, + "grad_norm": 4.630084332091941, + "learning_rate": 2.7382852370606337e-07, + "loss": 0.6466, + "step": 26920 + }, + { + "epoch": 2.71, + "grad_norm": 4.469241103432817, + "learning_rate": 2.7287202811371203e-07, + "loss": 0.6348, + "step": 26925 + }, + { + "epoch": 2.72, + "grad_norm": 6.203696119549017, + "learning_rate": 2.7191715912158187e-07, + "loss": 0.6029, + "step": 26930 + }, + { + "epoch": 2.72, + "grad_norm": 5.2311056726551985, + "learning_rate": 2.7096391705824067e-07, + "loss": 0.6521, + "step": 26935 + }, + { + "epoch": 2.72, + "grad_norm": 4.831648020213548, + "learning_rate": 2.700123022516998e-07, + "loss": 0.6684, + "step": 26940 + }, + { + "epoch": 2.72, + "grad_norm": 4.001125628731334, + "learning_rate": 2.6906231502940637e-07, + "loss": 0.6459, + "step": 26945 + }, + { + "epoch": 2.72, + "grad_norm": 6.581463486268398, + "learning_rate": 2.6811395571825327e-07, + "loss": 0.6374, + "step": 26950 + }, + { + "epoch": 2.72, + "grad_norm": 4.827504175309236, + "learning_rate": 2.671672246445678e-07, + "loss": 0.6118, + "step": 26955 + }, + { + "epoch": 2.72, + "grad_norm": 6.577015648069526, + "learning_rate": 2.662221221341199e-07, + "loss": 0.6472, + "step": 26960 + }, + { + "epoch": 2.72, + "grad_norm": 5.8216784748881185, + "learning_rate": 2.652786485121195e-07, + "loss": 0.6551, + "step": 26965 + }, + { + "epoch": 2.72, + "grad_norm": 4.206714745623948, + "learning_rate": 2.643368041032135e-07, + "loss": 0.6614, + "step": 26970 + }, + { + "epoch": 2.72, + "grad_norm": 4.9204799966670425, + "learning_rate": 2.6339658923148934e-07, + "loss": 0.65, + "step": 26975 + }, + { + "epoch": 2.72, + "grad_norm": 5.502378422522532, + "learning_rate": 2.624580042204755e-07, + "loss": 0.5966, + "step": 26980 + }, + { + "epoch": 2.72, + "grad_norm": 4.841075778399853, + "learning_rate": 2.615210493931369e-07, + "loss": 0.6508, + "step": 26985 + }, + { + "epoch": 2.72, + "grad_norm": 5.457993744591424, + "learning_rate": 2.605857250718802e-07, + "loss": 0.6604, + "step": 26990 + }, + { + "epoch": 2.72, + "grad_norm": 5.407642117421727, + "learning_rate": 2.596520315785489e-07, + "loss": 0.6132, + "step": 26995 + }, + { + "epoch": 2.72, + "grad_norm": 4.586783087603529, + "learning_rate": 2.5871996923442555e-07, + "loss": 0.6758, + "step": 27000 + }, + { + "epoch": 2.72, + "grad_norm": 5.3373244259396495, + "learning_rate": 2.5778953836023247e-07, + "loss": 0.6464, + "step": 27005 + }, + { + "epoch": 2.72, + "grad_norm": 4.6943349890765145, + "learning_rate": 2.5686073927612964e-07, + "loss": 0.6582, + "step": 27010 + }, + { + "epoch": 2.72, + "grad_norm": 5.550383608738202, + "learning_rate": 2.5593357230171645e-07, + "loss": 0.6317, + "step": 27015 + }, + { + "epoch": 2.72, + "grad_norm": 5.585911485678205, + "learning_rate": 2.550080377560299e-07, + "loss": 0.6652, + "step": 27020 + }, + { + "epoch": 2.72, + "grad_norm": 4.611901149160534, + "learning_rate": 2.540841359575458e-07, + "loss": 0.6434, + "step": 27025 + }, + { + "epoch": 2.73, + "grad_norm": 6.226867509992674, + "learning_rate": 2.5316186722417756e-07, + "loss": 0.6384, + "step": 27030 + }, + { + "epoch": 2.73, + "grad_norm": 4.889591104976683, + "learning_rate": 2.5224123187327753e-07, + "loss": 0.6282, + "step": 27035 + }, + { + "epoch": 2.73, + "grad_norm": 5.138212027862173, + "learning_rate": 2.5132223022163395e-07, + "loss": 0.6599, + "step": 27040 + }, + { + "epoch": 2.73, + "grad_norm": 4.678815642637734, + "learning_rate": 2.5040486258547603e-07, + "loss": 0.6327, + "step": 27045 + }, + { + "epoch": 2.73, + "grad_norm": 4.569598167796238, + "learning_rate": 2.4948912928046797e-07, + "loss": 0.6298, + "step": 27050 + }, + { + "epoch": 2.73, + "grad_norm": 5.7706974750619535, + "learning_rate": 2.4857503062171386e-07, + "loss": 0.6319, + "step": 27055 + }, + { + "epoch": 2.73, + "grad_norm": 6.3280113977072965, + "learning_rate": 2.4766256692375367e-07, + "loss": 0.6304, + "step": 27060 + }, + { + "epoch": 2.73, + "grad_norm": 5.076679797067638, + "learning_rate": 2.467517385005652e-07, + "loss": 0.6282, + "step": 27065 + }, + { + "epoch": 2.73, + "grad_norm": 5.1371346360423, + "learning_rate": 2.4584254566556276e-07, + "loss": 0.618, + "step": 27070 + }, + { + "epoch": 2.73, + "grad_norm": 4.86654090029525, + "learning_rate": 2.4493498873160046e-07, + "loss": 0.6152, + "step": 27075 + }, + { + "epoch": 2.73, + "grad_norm": 4.641826973860905, + "learning_rate": 2.440290680109664e-07, + "loss": 0.6607, + "step": 27080 + }, + { + "epoch": 2.73, + "grad_norm": 4.154592587557687, + "learning_rate": 2.4312478381538775e-07, + "loss": 0.6139, + "step": 27085 + }, + { + "epoch": 2.73, + "grad_norm": 4.593874514370757, + "learning_rate": 2.422221364560279e-07, + "loss": 0.6748, + "step": 27090 + }, + { + "epoch": 2.73, + "grad_norm": 4.655599640633073, + "learning_rate": 2.413211262434867e-07, + "loss": 0.6744, + "step": 27095 + }, + { + "epoch": 2.73, + "grad_norm": 4.777183373107945, + "learning_rate": 2.404217534878006e-07, + "loss": 0.6518, + "step": 27100 + }, + { + "epoch": 2.73, + "grad_norm": 4.5609775207126, + "learning_rate": 2.395240184984432e-07, + "loss": 0.6391, + "step": 27105 + }, + { + "epoch": 2.73, + "grad_norm": 4.5354817359318975, + "learning_rate": 2.3862792158432403e-07, + "loss": 0.6396, + "step": 27110 + }, + { + "epoch": 2.73, + "grad_norm": 4.840120970779168, + "learning_rate": 2.377334630537903e-07, + "loss": 0.6534, + "step": 27115 + }, + { + "epoch": 2.73, + "grad_norm": 4.754107720175892, + "learning_rate": 2.3684064321462309e-07, + "loss": 0.6446, + "step": 27120 + }, + { + "epoch": 2.73, + "grad_norm": 4.446069918589517, + "learning_rate": 2.3594946237404104e-07, + "loss": 0.6295, + "step": 27125 + }, + { + "epoch": 2.74, + "grad_norm": 5.89894032512749, + "learning_rate": 2.3505992083869832e-07, + "loss": 0.641, + "step": 27130 + }, + { + "epoch": 2.74, + "grad_norm": 4.866888053961888, + "learning_rate": 2.3417201891468677e-07, + "loss": 0.6797, + "step": 27135 + }, + { + "epoch": 2.74, + "grad_norm": 5.704806075752119, + "learning_rate": 2.332857569075303e-07, + "loss": 0.6402, + "step": 27140 + }, + { + "epoch": 2.74, + "grad_norm": 4.795521106834384, + "learning_rate": 2.3240113512219332e-07, + "loss": 0.6492, + "step": 27145 + }, + { + "epoch": 2.74, + "grad_norm": 4.505870048445661, + "learning_rate": 2.3151815386307175e-07, + "loss": 0.6641, + "step": 27150 + }, + { + "epoch": 2.74, + "grad_norm": 4.294647387657272, + "learning_rate": 2.3063681343399923e-07, + "loss": 0.6221, + "step": 27155 + }, + { + "epoch": 2.74, + "grad_norm": 4.379296908401169, + "learning_rate": 2.297571141382443e-07, + "loss": 0.6102, + "step": 27160 + }, + { + "epoch": 2.74, + "grad_norm": 6.542098819508065, + "learning_rate": 2.2887905627850926e-07, + "loss": 0.6615, + "step": 27165 + }, + { + "epoch": 2.74, + "grad_norm": 4.604680790205558, + "learning_rate": 2.2800264015693464e-07, + "loss": 0.6596, + "step": 27170 + }, + { + "epoch": 2.74, + "grad_norm": 4.17649037933092, + "learning_rate": 2.2712786607509308e-07, + "loss": 0.624, + "step": 27175 + }, + { + "epoch": 2.74, + "grad_norm": 4.553300967953392, + "learning_rate": 2.262547343339949e-07, + "loss": 0.6386, + "step": 27180 + }, + { + "epoch": 2.74, + "grad_norm": 4.575406320081005, + "learning_rate": 2.2538324523408318e-07, + "loss": 0.6476, + "step": 27185 + }, + { + "epoch": 2.74, + "grad_norm": 4.569759907083859, + "learning_rate": 2.2451339907523684e-07, + "loss": 0.6641, + "step": 27190 + }, + { + "epoch": 2.74, + "grad_norm": 4.901676613919331, + "learning_rate": 2.236451961567676e-07, + "loss": 0.6235, + "step": 27195 + }, + { + "epoch": 2.74, + "grad_norm": 4.735981562262451, + "learning_rate": 2.2277863677742538e-07, + "loss": 0.635, + "step": 27200 + }, + { + "epoch": 2.74, + "grad_norm": 5.4045401013505705, + "learning_rate": 2.2191372123539057e-07, + "loss": 0.622, + "step": 27205 + }, + { + "epoch": 2.74, + "grad_norm": 4.5875845615749595, + "learning_rate": 2.2105044982828173e-07, + "loss": 0.6534, + "step": 27210 + }, + { + "epoch": 2.74, + "grad_norm": 5.110692783266679, + "learning_rate": 2.2018882285314848e-07, + "loss": 0.6655, + "step": 27215 + }, + { + "epoch": 2.74, + "grad_norm": 4.163729086764986, + "learning_rate": 2.1932884060647585e-07, + "loss": 0.6263, + "step": 27220 + }, + { + "epoch": 2.74, + "grad_norm": 5.04769979373338, + "learning_rate": 2.1847050338418275e-07, + "loss": 0.6169, + "step": 27225 + }, + { + "epoch": 2.75, + "grad_norm": 5.0579975466535645, + "learning_rate": 2.1761381148162286e-07, + "loss": 0.6224, + "step": 27230 + }, + { + "epoch": 2.75, + "grad_norm": 5.477945345762049, + "learning_rate": 2.1675876519358264e-07, + "loss": 0.631, + "step": 27235 + }, + { + "epoch": 2.75, + "grad_norm": 4.502067958512376, + "learning_rate": 2.159053648142828e-07, + "loss": 0.6526, + "step": 27240 + }, + { + "epoch": 2.75, + "grad_norm": 4.645772831828346, + "learning_rate": 2.1505361063737795e-07, + "loss": 0.6892, + "step": 27245 + }, + { + "epoch": 2.75, + "grad_norm": 4.7562112410395585, + "learning_rate": 2.1420350295595527e-07, + "loss": 0.6234, + "step": 27250 + }, + { + "epoch": 2.75, + "grad_norm": 4.715893602665484, + "learning_rate": 2.1335504206253577e-07, + "loss": 0.6438, + "step": 27255 + }, + { + "epoch": 2.75, + "grad_norm": 4.788625339216968, + "learning_rate": 2.1250822824907535e-07, + "loss": 0.6887, + "step": 27260 + }, + { + "epoch": 2.75, + "grad_norm": 5.742321681343637, + "learning_rate": 2.116630618069604e-07, + "loss": 0.6219, + "step": 27265 + }, + { + "epoch": 2.75, + "grad_norm": 5.879362925821528, + "learning_rate": 2.108195430270127e-07, + "loss": 0.6526, + "step": 27270 + }, + { + "epoch": 2.75, + "grad_norm": 5.045335088378256, + "learning_rate": 2.0997767219948616e-07, + "loss": 0.6327, + "step": 27275 + }, + { + "epoch": 2.75, + "grad_norm": 5.009295957562433, + "learning_rate": 2.09137449614068e-07, + "loss": 0.6294, + "step": 27280 + }, + { + "epoch": 2.75, + "grad_norm": 4.434427769144761, + "learning_rate": 2.082988755598764e-07, + "loss": 0.6671, + "step": 27285 + }, + { + "epoch": 2.75, + "grad_norm": 4.990377036828536, + "learning_rate": 2.0746195032546657e-07, + "loss": 0.6371, + "step": 27290 + }, + { + "epoch": 2.75, + "grad_norm": 4.797263478041765, + "learning_rate": 2.0662667419882155e-07, + "loss": 0.6445, + "step": 27295 + }, + { + "epoch": 2.75, + "grad_norm": 5.001224407816903, + "learning_rate": 2.0579304746735917e-07, + "loss": 0.6829, + "step": 27300 + }, + { + "epoch": 2.75, + "grad_norm": 5.193277092738359, + "learning_rate": 2.049610704179311e-07, + "loss": 0.6176, + "step": 27305 + }, + { + "epoch": 2.75, + "grad_norm": 5.041487298048559, + "learning_rate": 2.0413074333681893e-07, + "loss": 0.6126, + "step": 27310 + }, + { + "epoch": 2.75, + "grad_norm": 7.129523974749876, + "learning_rate": 2.0330206650973683e-07, + "loss": 0.6541, + "step": 27315 + }, + { + "epoch": 2.75, + "grad_norm": 4.825415696213447, + "learning_rate": 2.024750402218323e-07, + "loss": 0.6706, + "step": 27320 + }, + { + "epoch": 2.75, + "grad_norm": 6.6591871746914695, + "learning_rate": 2.0164966475768432e-07, + "loss": 0.6622, + "step": 27325 + }, + { + "epoch": 2.76, + "grad_norm": 6.145263719644995, + "learning_rate": 2.0082594040130353e-07, + "loss": 0.635, + "step": 27330 + }, + { + "epoch": 2.76, + "grad_norm": 4.636647280013002, + "learning_rate": 2.000038674361332e-07, + "loss": 0.6217, + "step": 27335 + }, + { + "epoch": 2.76, + "grad_norm": 4.4739024748437695, + "learning_rate": 1.9918344614504814e-07, + "loss": 0.6514, + "step": 27340 + }, + { + "epoch": 2.76, + "grad_norm": 5.0127000776778585, + "learning_rate": 1.9836467681035365e-07, + "loss": 0.6368, + "step": 27345 + }, + { + "epoch": 2.76, + "grad_norm": 4.906114484345132, + "learning_rate": 1.975475597137866e-07, + "loss": 0.6439, + "step": 27350 + }, + { + "epoch": 2.76, + "grad_norm": 4.916876018127332, + "learning_rate": 1.967320951365187e-07, + "loss": 0.6158, + "step": 27355 + }, + { + "epoch": 2.76, + "grad_norm": 4.487110739372675, + "learning_rate": 1.959182833591483e-07, + "loss": 0.6318, + "step": 27360 + }, + { + "epoch": 2.76, + "grad_norm": 4.636937516626221, + "learning_rate": 1.951061246617092e-07, + "loss": 0.6577, + "step": 27365 + }, + { + "epoch": 2.76, + "grad_norm": 5.932916527673636, + "learning_rate": 1.942956193236628e-07, + "loss": 0.6169, + "step": 27370 + }, + { + "epoch": 2.76, + "grad_norm": 4.89725898238156, + "learning_rate": 1.934867676239044e-07, + "loss": 0.6313, + "step": 27375 + }, + { + "epoch": 2.76, + "grad_norm": 4.572865175199781, + "learning_rate": 1.926795698407574e-07, + "loss": 0.6603, + "step": 27380 + }, + { + "epoch": 2.76, + "grad_norm": 4.9158419371051005, + "learning_rate": 1.918740262519797e-07, + "loss": 0.625, + "step": 27385 + }, + { + "epoch": 2.76, + "grad_norm": 4.900828600336452, + "learning_rate": 1.9107013713475674e-07, + "loss": 0.6687, + "step": 27390 + }, + { + "epoch": 2.76, + "grad_norm": 5.230901880339584, + "learning_rate": 1.9026790276570728e-07, + "loss": 0.6119, + "step": 27395 + }, + { + "epoch": 2.76, + "grad_norm": 4.609157007664113, + "learning_rate": 1.8946732342087825e-07, + "loss": 0.6388, + "step": 27400 + }, + { + "epoch": 2.76, + "grad_norm": 4.811633485820632, + "learning_rate": 1.8866839937574876e-07, + "loss": 0.6433, + "step": 27405 + }, + { + "epoch": 2.76, + "grad_norm": 4.5620610799291015, + "learning_rate": 1.8787113090522723e-07, + "loss": 0.611, + "step": 27410 + }, + { + "epoch": 2.76, + "grad_norm": 5.112562968547693, + "learning_rate": 1.8707551828365368e-07, + "loss": 0.6381, + "step": 27415 + }, + { + "epoch": 2.76, + "grad_norm": 4.52468498356504, + "learning_rate": 1.8628156178479794e-07, + "loss": 0.6312, + "step": 27420 + }, + { + "epoch": 2.77, + "grad_norm": 5.258889515886162, + "learning_rate": 1.854892616818582e-07, + "loss": 0.6233, + "step": 27425 + }, + { + "epoch": 2.77, + "grad_norm": 6.628910817758399, + "learning_rate": 1.8469861824746583e-07, + "loss": 0.651, + "step": 27430 + }, + { + "epoch": 2.77, + "grad_norm": 4.163731183357664, + "learning_rate": 1.8390963175367983e-07, + "loss": 0.6219, + "step": 27435 + }, + { + "epoch": 2.77, + "grad_norm": 4.51936940513308, + "learning_rate": 1.831223024719897e-07, + "loss": 0.6593, + "step": 27440 + }, + { + "epoch": 2.77, + "grad_norm": 4.8868413077096315, + "learning_rate": 1.823366306733143e-07, + "loss": 0.6509, + "step": 27445 + }, + { + "epoch": 2.77, + "grad_norm": 4.137439186382453, + "learning_rate": 1.8155261662800295e-07, + "loss": 0.6079, + "step": 27450 + }, + { + "epoch": 2.77, + "grad_norm": 6.193024837189629, + "learning_rate": 1.8077026060583369e-07, + "loss": 0.6733, + "step": 27455 + }, + { + "epoch": 2.77, + "grad_norm": 4.470445592225861, + "learning_rate": 1.7998956287601565e-07, + "loss": 0.6343, + "step": 27460 + }, + { + "epoch": 2.77, + "grad_norm": 4.9839207959687695, + "learning_rate": 1.7921052370718505e-07, + "loss": 0.6557, + "step": 27465 + }, + { + "epoch": 2.77, + "grad_norm": 5.966319709791478, + "learning_rate": 1.7843314336740913e-07, + "loss": 0.6608, + "step": 27470 + }, + { + "epoch": 2.77, + "grad_norm": 4.612224889221658, + "learning_rate": 1.776574221241828e-07, + "loss": 0.6375, + "step": 27475 + }, + { + "epoch": 2.77, + "grad_norm": 5.1061979664003685, + "learning_rate": 1.7688336024443197e-07, + "loss": 0.6297, + "step": 27480 + }, + { + "epoch": 2.77, + "grad_norm": 4.620897017042376, + "learning_rate": 1.7611095799450973e-07, + "loss": 0.6672, + "step": 27485 + }, + { + "epoch": 2.77, + "grad_norm": 5.325307617224155, + "learning_rate": 1.753402156402001e-07, + "loss": 0.6339, + "step": 27490 + }, + { + "epoch": 2.77, + "grad_norm": 5.107182911004926, + "learning_rate": 1.7457113344671484e-07, + "loss": 0.6398, + "step": 27495 + }, + { + "epoch": 2.77, + "grad_norm": 6.10444693123381, + "learning_rate": 1.7380371167869282e-07, + "loss": 0.6638, + "step": 27500 + }, + { + "epoch": 2.77, + "grad_norm": 4.388963100605161, + "learning_rate": 1.730379506002039e-07, + "loss": 0.6182, + "step": 27505 + }, + { + "epoch": 2.77, + "grad_norm": 4.527852561965197, + "learning_rate": 1.7227385047474676e-07, + "loss": 0.6453, + "step": 27510 + }, + { + "epoch": 2.77, + "grad_norm": 4.947131925278648, + "learning_rate": 1.7151141156524554e-07, + "loss": 0.6488, + "step": 27515 + }, + { + "epoch": 2.77, + "grad_norm": 4.888298137702055, + "learning_rate": 1.70750634134057e-07, + "loss": 0.6201, + "step": 27520 + }, + { + "epoch": 2.78, + "grad_norm": 5.068734448012588, + "learning_rate": 1.6999151844296237e-07, + "loss": 0.6648, + "step": 27525 + }, + { + "epoch": 2.78, + "grad_norm": 5.719138525994361, + "learning_rate": 1.6923406475317316e-07, + "loss": 0.6649, + "step": 27530 + }, + { + "epoch": 2.78, + "grad_norm": 5.014641777763894, + "learning_rate": 1.684782733253276e-07, + "loss": 0.645, + "step": 27535 + }, + { + "epoch": 2.78, + "grad_norm": 4.405259394411325, + "learning_rate": 1.6772414441949436e-07, + "loss": 0.5895, + "step": 27540 + }, + { + "epoch": 2.78, + "grad_norm": 4.434647710812157, + "learning_rate": 1.6697167829516747e-07, + "loss": 0.6287, + "step": 27545 + }, + { + "epoch": 2.78, + "grad_norm": 4.570729319226043, + "learning_rate": 1.6622087521126983e-07, + "loss": 0.6165, + "step": 27550 + }, + { + "epoch": 2.78, + "grad_norm": 4.708876473272213, + "learning_rate": 1.6547173542615258e-07, + "loss": 0.6634, + "step": 27555 + }, + { + "epoch": 2.78, + "grad_norm": 5.443803641000985, + "learning_rate": 1.6472425919759338e-07, + "loss": 0.6735, + "step": 27560 + }, + { + "epoch": 2.78, + "grad_norm": 5.503813881852303, + "learning_rate": 1.6397844678279872e-07, + "loss": 0.6524, + "step": 27565 + }, + { + "epoch": 2.78, + "grad_norm": 5.262393504748039, + "learning_rate": 1.632342984384011e-07, + "loss": 0.6522, + "step": 27570 + }, + { + "epoch": 2.78, + "grad_norm": 5.533398944217278, + "learning_rate": 1.6249181442046235e-07, + "loss": 0.6556, + "step": 27575 + }, + { + "epoch": 2.78, + "grad_norm": 4.584461282149826, + "learning_rate": 1.6175099498446866e-07, + "loss": 0.6376, + "step": 27580 + }, + { + "epoch": 2.78, + "grad_norm": 4.740322548336197, + "learning_rate": 1.610118403853378e-07, + "loss": 0.659, + "step": 27585 + }, + { + "epoch": 2.78, + "grad_norm": 4.585743047346483, + "learning_rate": 1.6027435087741072e-07, + "loss": 0.6057, + "step": 27590 + }, + { + "epoch": 2.78, + "grad_norm": 6.2909688966884225, + "learning_rate": 1.5953852671445668e-07, + "loss": 0.654, + "step": 27595 + }, + { + "epoch": 2.78, + "grad_norm": 4.97205188031535, + "learning_rate": 1.5880436814967258e-07, + "loss": 0.627, + "step": 27600 + }, + { + "epoch": 2.78, + "grad_norm": 4.519227048096504, + "learning_rate": 1.5807187543568136e-07, + "loss": 0.6219, + "step": 27605 + }, + { + "epoch": 2.78, + "grad_norm": 4.361661472488166, + "learning_rate": 1.5734104882453305e-07, + "loss": 0.6861, + "step": 27610 + }, + { + "epoch": 2.78, + "grad_norm": 4.8666409971276074, + "learning_rate": 1.5661188856770537e-07, + "loss": 0.6247, + "step": 27615 + }, + { + "epoch": 2.78, + "grad_norm": 4.717116455709111, + "learning_rate": 1.558843949161004e-07, + "loss": 0.6532, + "step": 27620 + }, + { + "epoch": 2.79, + "grad_norm": 4.630714814407945, + "learning_rate": 1.5515856812004849e-07, + "loss": 0.6157, + "step": 27625 + }, + { + "epoch": 2.79, + "grad_norm": 5.28299970158232, + "learning_rate": 1.544344084293059e-07, + "loss": 0.6272, + "step": 27630 + }, + { + "epoch": 2.79, + "grad_norm": 6.239301778257726, + "learning_rate": 1.5371191609305502e-07, + "loss": 0.6476, + "step": 27635 + }, + { + "epoch": 2.79, + "grad_norm": 5.132571940847922, + "learning_rate": 1.5299109135990531e-07, + "loss": 0.6757, + "step": 27640 + }, + { + "epoch": 2.79, + "grad_norm": 6.163828640077864, + "learning_rate": 1.5227193447789167e-07, + "loss": 0.646, + "step": 27645 + }, + { + "epoch": 2.79, + "grad_norm": 5.623353690086022, + "learning_rate": 1.5155444569447565e-07, + "loss": 0.6297, + "step": 27650 + }, + { + "epoch": 2.79, + "grad_norm": 4.618225671423743, + "learning_rate": 1.5083862525654413e-07, + "loss": 0.6644, + "step": 27655 + }, + { + "epoch": 2.79, + "grad_norm": 4.8193840512997435, + "learning_rate": 1.5012447341040903e-07, + "loss": 0.6378, + "step": 27660 + }, + { + "epoch": 2.79, + "grad_norm": 4.872251860166495, + "learning_rate": 1.4941199040181152e-07, + "loss": 0.6605, + "step": 27665 + }, + { + "epoch": 2.79, + "grad_norm": 4.521733108340862, + "learning_rate": 1.4870117647591441e-07, + "loss": 0.6715, + "step": 27670 + }, + { + "epoch": 2.79, + "grad_norm": 5.034286716147694, + "learning_rate": 1.4799203187730982e-07, + "loss": 0.6315, + "step": 27675 + }, + { + "epoch": 2.79, + "grad_norm": 4.99830347398244, + "learning_rate": 1.4728455685001253e-07, + "loss": 0.6192, + "step": 27680 + }, + { + "epoch": 2.79, + "grad_norm": 5.923759561043896, + "learning_rate": 1.4657875163746448e-07, + "loss": 0.6361, + "step": 27685 + }, + { + "epoch": 2.79, + "grad_norm": 4.4891986428328545, + "learning_rate": 1.4587461648253253e-07, + "loss": 0.631, + "step": 27690 + }, + { + "epoch": 2.79, + "grad_norm": 4.620859160067768, + "learning_rate": 1.4517215162750841e-07, + "loss": 0.626, + "step": 27695 + }, + { + "epoch": 2.79, + "grad_norm": 4.350124688452219, + "learning_rate": 1.4447135731411044e-07, + "loss": 0.6229, + "step": 27700 + }, + { + "epoch": 2.79, + "grad_norm": 5.222093872008338, + "learning_rate": 1.4377223378348014e-07, + "loss": 0.6814, + "step": 27705 + }, + { + "epoch": 2.79, + "grad_norm": 4.804943687592125, + "learning_rate": 1.430747812761868e-07, + "loss": 0.6751, + "step": 27710 + }, + { + "epoch": 2.79, + "grad_norm": 4.434005038612656, + "learning_rate": 1.4237900003222172e-07, + "loss": 0.6487, + "step": 27715 + }, + { + "epoch": 2.79, + "grad_norm": 4.983682819390938, + "learning_rate": 1.4168489029100395e-07, + "loss": 0.6641, + "step": 27720 + }, + { + "epoch": 2.8, + "grad_norm": 4.826412766166672, + "learning_rate": 1.4099245229137414e-07, + "loss": 0.6253, + "step": 27725 + }, + { + "epoch": 2.8, + "grad_norm": 4.98851863031606, + "learning_rate": 1.4030168627160112e-07, + "loss": 0.6159, + "step": 27730 + }, + { + "epoch": 2.8, + "grad_norm": 4.966095621552294, + "learning_rate": 1.3961259246937642e-07, + "loss": 0.6586, + "step": 27735 + }, + { + "epoch": 2.8, + "grad_norm": 5.077498218523122, + "learning_rate": 1.3892517112181646e-07, + "loss": 0.6666, + "step": 27740 + }, + { + "epoch": 2.8, + "grad_norm": 4.4972011043076145, + "learning_rate": 1.3823942246546207e-07, + "loss": 0.6735, + "step": 27745 + }, + { + "epoch": 2.8, + "grad_norm": 4.6509709640658645, + "learning_rate": 1.3755534673627947e-07, + "loss": 0.6801, + "step": 27750 + }, + { + "epoch": 2.8, + "grad_norm": 4.454353433000559, + "learning_rate": 1.36872944169657e-07, + "loss": 0.6128, + "step": 27755 + }, + { + "epoch": 2.8, + "grad_norm": 5.563784938049496, + "learning_rate": 1.361922150004097e-07, + "loss": 0.6344, + "step": 27760 + }, + { + "epoch": 2.8, + "grad_norm": 4.688473347214066, + "learning_rate": 1.355131594627762e-07, + "loss": 0.6378, + "step": 27765 + }, + { + "epoch": 2.8, + "grad_norm": 4.675427454072095, + "learning_rate": 1.3483577779041802e-07, + "loss": 0.6397, + "step": 27770 + }, + { + "epoch": 2.8, + "grad_norm": 5.560500501070449, + "learning_rate": 1.3416007021642252e-07, + "loss": 0.6303, + "step": 27775 + }, + { + "epoch": 2.8, + "grad_norm": 5.279496533965027, + "learning_rate": 1.334860369732993e-07, + "loss": 0.6404, + "step": 27780 + }, + { + "epoch": 2.8, + "grad_norm": 4.514573270455693, + "learning_rate": 1.328136782929823e-07, + "loss": 0.6456, + "step": 27785 + }, + { + "epoch": 2.8, + "grad_norm": 4.712080775356427, + "learning_rate": 1.3214299440683032e-07, + "loss": 0.6441, + "step": 27790 + }, + { + "epoch": 2.8, + "grad_norm": 5.380594002720372, + "learning_rate": 1.3147398554562374e-07, + "loss": 0.6455, + "step": 27795 + }, + { + "epoch": 2.8, + "grad_norm": 4.474777394755265, + "learning_rate": 1.3080665193956954e-07, + "loss": 0.6088, + "step": 27800 + }, + { + "epoch": 2.8, + "grad_norm": 5.393544527385068, + "learning_rate": 1.301409938182968e-07, + "loss": 0.6282, + "step": 27805 + }, + { + "epoch": 2.8, + "grad_norm": 4.875236489949824, + "learning_rate": 1.294770114108551e-07, + "loss": 0.6368, + "step": 27810 + }, + { + "epoch": 2.8, + "grad_norm": 6.864909001772833, + "learning_rate": 1.2881470494572278e-07, + "loss": 0.6636, + "step": 27815 + }, + { + "epoch": 2.8, + "grad_norm": 5.021612888517245, + "learning_rate": 1.2815407465079754e-07, + "loss": 0.6444, + "step": 27820 + }, + { + "epoch": 2.81, + "grad_norm": 7.051741591293166, + "learning_rate": 1.2749512075340198e-07, + "loss": 0.6464, + "step": 27825 + }, + { + "epoch": 2.81, + "grad_norm": 5.710481817137641, + "learning_rate": 1.268378434802814e-07, + "loss": 0.6608, + "step": 27830 + }, + { + "epoch": 2.81, + "grad_norm": 4.617873854790824, + "learning_rate": 1.2618224305760596e-07, + "loss": 0.6828, + "step": 27835 + }, + { + "epoch": 2.81, + "grad_norm": 4.886618090636221, + "learning_rate": 1.2552831971096412e-07, + "loss": 0.6615, + "step": 27840 + }, + { + "epoch": 2.81, + "grad_norm": 4.696371454416478, + "learning_rate": 1.2487607366537258e-07, + "loss": 0.6118, + "step": 27845 + }, + { + "epoch": 2.81, + "grad_norm": 5.019821337329065, + "learning_rate": 1.2422550514526678e-07, + "loss": 0.6458, + "step": 27850 + }, + { + "epoch": 2.81, + "grad_norm": 4.665430937369177, + "learning_rate": 1.2357661437450873e-07, + "loss": 0.6386, + "step": 27855 + }, + { + "epoch": 2.81, + "grad_norm": 4.63196248195062, + "learning_rate": 1.229294015763799e-07, + "loss": 0.6203, + "step": 27860 + }, + { + "epoch": 2.81, + "grad_norm": 4.628889587290443, + "learning_rate": 1.2228386697358653e-07, + "loss": 0.6239, + "step": 27865 + }, + { + "epoch": 2.81, + "grad_norm": 4.365112314017564, + "learning_rate": 1.2164001078825482e-07, + "loss": 0.623, + "step": 27870 + }, + { + "epoch": 2.81, + "grad_norm": 5.258002898744488, + "learning_rate": 1.2099783324193647e-07, + "loss": 0.6571, + "step": 27875 + }, + { + "epoch": 2.81, + "grad_norm": 4.669424922639722, + "learning_rate": 1.2035733455560305e-07, + "loss": 0.6891, + "step": 27880 + }, + { + "epoch": 2.81, + "grad_norm": 4.869084130597134, + "learning_rate": 1.1971851494965104e-07, + "loss": 0.6252, + "step": 27885 + }, + { + "epoch": 2.81, + "grad_norm": 14.23946930823488, + "learning_rate": 1.1908137464389625e-07, + "loss": 0.651, + "step": 27890 + }, + { + "epoch": 2.81, + "grad_norm": 6.59501403796409, + "learning_rate": 1.1844591385757942e-07, + "loss": 0.6481, + "step": 27895 + }, + { + "epoch": 2.81, + "grad_norm": 4.9948682409578025, + "learning_rate": 1.178121328093601e-07, + "loss": 0.6465, + "step": 27900 + }, + { + "epoch": 2.81, + "grad_norm": 4.395043259501831, + "learning_rate": 1.1718003171732384e-07, + "loss": 0.6535, + "step": 27905 + }, + { + "epoch": 2.81, + "grad_norm": 5.313318406915275, + "learning_rate": 1.1654961079897442e-07, + "loss": 0.6322, + "step": 27910 + }, + { + "epoch": 2.81, + "grad_norm": 5.613602971142326, + "learning_rate": 1.1592087027123999e-07, + "loss": 0.6677, + "step": 27915 + }, + { + "epoch": 2.81, + "grad_norm": 5.073924506007503, + "learning_rate": 1.152938103504686e-07, + "loss": 0.6293, + "step": 27920 + }, + { + "epoch": 2.82, + "grad_norm": 4.616028361075712, + "learning_rate": 1.1466843125243322e-07, + "loss": 0.638, + "step": 27925 + }, + { + "epoch": 2.82, + "grad_norm": 5.381443254499906, + "learning_rate": 1.140447331923239e-07, + "loss": 0.6057, + "step": 27930 + }, + { + "epoch": 2.82, + "grad_norm": 4.644498220335952, + "learning_rate": 1.1342271638475455e-07, + "loss": 0.6175, + "step": 27935 + }, + { + "epoch": 2.82, + "grad_norm": 5.249951639838429, + "learning_rate": 1.1280238104376173e-07, + "loss": 0.6687, + "step": 27940 + }, + { + "epoch": 2.82, + "grad_norm": 4.78029838029608, + "learning_rate": 1.1218372738280137e-07, + "loss": 0.6233, + "step": 27945 + }, + { + "epoch": 2.82, + "grad_norm": 4.657499611806018, + "learning_rate": 1.115667556147526e-07, + "loss": 0.652, + "step": 27950 + }, + { + "epoch": 2.82, + "grad_norm": 5.236393726741324, + "learning_rate": 1.1095146595191397e-07, + "loss": 0.6533, + "step": 27955 + }, + { + "epoch": 2.82, + "grad_norm": 4.643644972620957, + "learning_rate": 1.1033785860600666e-07, + "loss": 0.6154, + "step": 27960 + }, + { + "epoch": 2.82, + "grad_norm": 4.558633798199325, + "learning_rate": 1.0972593378817176e-07, + "loss": 0.6651, + "step": 27965 + }, + { + "epoch": 2.82, + "grad_norm": 4.432431102445073, + "learning_rate": 1.0911569170897252e-07, + "loss": 0.6423, + "step": 27970 + }, + { + "epoch": 2.82, + "grad_norm": 5.460147093023334, + "learning_rate": 1.0850713257839152e-07, + "loss": 0.6485, + "step": 27975 + }, + { + "epoch": 2.82, + "grad_norm": 5.0266464235297015, + "learning_rate": 1.0790025660583514e-07, + "loss": 0.6477, + "step": 27980 + }, + { + "epoch": 2.82, + "grad_norm": 4.702953698317971, + "learning_rate": 1.0729506400012801e-07, + "loss": 0.6596, + "step": 27985 + }, + { + "epoch": 2.82, + "grad_norm": 4.1933909947174675, + "learning_rate": 1.0669155496951633e-07, + "loss": 0.5747, + "step": 27990 + }, + { + "epoch": 2.82, + "grad_norm": 4.7820636291726055, + "learning_rate": 1.0608972972166675e-07, + "loss": 0.6255, + "step": 27995 + }, + { + "epoch": 2.82, + "grad_norm": 5.026350823898908, + "learning_rate": 1.0548958846366697e-07, + "loss": 0.6381, + "step": 28000 + }, + { + "epoch": 2.82, + "grad_norm": 4.826278656901001, + "learning_rate": 1.0489113140202456e-07, + "loss": 0.6491, + "step": 28005 + }, + { + "epoch": 2.82, + "grad_norm": 5.047374508540337, + "learning_rate": 1.0429435874266869e-07, + "loss": 0.6206, + "step": 28010 + }, + { + "epoch": 2.82, + "grad_norm": 5.188749544431611, + "learning_rate": 1.0369927069094788e-07, + "loss": 0.6763, + "step": 28015 + }, + { + "epoch": 2.83, + "grad_norm": 5.4120748291853085, + "learning_rate": 1.0310586745163276e-07, + "loss": 0.6866, + "step": 28020 + }, + { + "epoch": 2.83, + "grad_norm": 5.1115261193344494, + "learning_rate": 1.0251414922891e-07, + "loss": 0.6488, + "step": 28025 + }, + { + "epoch": 2.83, + "grad_norm": 4.17494029759684, + "learning_rate": 1.0192411622639175e-07, + "loss": 0.596, + "step": 28030 + }, + { + "epoch": 2.83, + "grad_norm": 4.61738215353897, + "learning_rate": 1.0133576864710671e-07, + "loss": 0.6573, + "step": 28035 + }, + { + "epoch": 2.83, + "grad_norm": 4.420087106860671, + "learning_rate": 1.0074910669350457e-07, + "loss": 0.6262, + "step": 28040 + }, + { + "epoch": 2.83, + "grad_norm": 4.711847300503816, + "learning_rate": 1.0016413056745555e-07, + "loss": 0.6349, + "step": 28045 + }, + { + "epoch": 2.83, + "grad_norm": 4.972521433716798, + "learning_rate": 9.95808404702503e-08, + "loss": 0.6547, + "step": 28050 + }, + { + "epoch": 2.83, + "grad_norm": 5.537633311092984, + "learning_rate": 9.899923660259658e-08, + "loss": 0.6912, + "step": 28055 + }, + { + "epoch": 2.83, + "grad_norm": 5.051293484949038, + "learning_rate": 9.841931916462433e-08, + "loss": 0.6236, + "step": 28060 + }, + { + "epoch": 2.83, + "grad_norm": 5.493774218309113, + "learning_rate": 9.784108835588335e-08, + "loss": 0.6791, + "step": 28065 + }, + { + "epoch": 2.83, + "grad_norm": 4.576787830353115, + "learning_rate": 9.726454437534116e-08, + "loss": 0.6131, + "step": 28070 + }, + { + "epoch": 2.83, + "grad_norm": 4.630404316217037, + "learning_rate": 9.668968742138741e-08, + "loss": 0.6616, + "step": 28075 + }, + { + "epoch": 2.83, + "grad_norm": 4.891299088255928, + "learning_rate": 9.611651769182828e-08, + "loss": 0.6426, + "step": 28080 + }, + { + "epoch": 2.83, + "grad_norm": 4.5114788096308756, + "learning_rate": 9.554503538389215e-08, + "loss": 0.6562, + "step": 28085 + }, + { + "epoch": 2.83, + "grad_norm": 6.711102801133464, + "learning_rate": 9.497524069422448e-08, + "loss": 0.6548, + "step": 28090 + }, + { + "epoch": 2.83, + "grad_norm": 5.066223190114222, + "learning_rate": 9.440713381889233e-08, + "loss": 0.6809, + "step": 28095 + }, + { + "epoch": 2.83, + "grad_norm": 4.979158338870474, + "learning_rate": 9.384071495337932e-08, + "loss": 0.6261, + "step": 28100 + }, + { + "epoch": 2.83, + "grad_norm": 4.64668267133173, + "learning_rate": 9.327598429259122e-08, + "loss": 0.6389, + "step": 28105 + }, + { + "epoch": 2.83, + "grad_norm": 6.083600469147071, + "learning_rate": 9.271294203085035e-08, + "loss": 0.6395, + "step": 28110 + }, + { + "epoch": 2.83, + "grad_norm": 4.465458441123022, + "learning_rate": 9.215158836189897e-08, + "loss": 0.6326, + "step": 28115 + }, + { + "epoch": 2.84, + "grad_norm": 6.267730702068969, + "learning_rate": 9.159192347889755e-08, + "loss": 0.6629, + "step": 28120 + }, + { + "epoch": 2.84, + "grad_norm": 5.391159402020457, + "learning_rate": 9.103394757442818e-08, + "loss": 0.6537, + "step": 28125 + }, + { + "epoch": 2.84, + "grad_norm": 4.377867762700203, + "learning_rate": 9.047766084048837e-08, + "loss": 0.6667, + "step": 28130 + }, + { + "epoch": 2.84, + "grad_norm": 5.430428637787642, + "learning_rate": 8.992306346849721e-08, + "loss": 0.6451, + "step": 28135 + }, + { + "epoch": 2.84, + "grad_norm": 4.971622244790864, + "learning_rate": 8.93701556492893e-08, + "loss": 0.6494, + "step": 28140 + }, + { + "epoch": 2.84, + "grad_norm": 4.812306458518884, + "learning_rate": 8.881893757312132e-08, + "loss": 0.6946, + "step": 28145 + }, + { + "epoch": 2.84, + "grad_norm": 4.785301173240359, + "learning_rate": 8.826940942966544e-08, + "loss": 0.6429, + "step": 28150 + }, + { + "epoch": 2.84, + "grad_norm": 5.8853191729513386, + "learning_rate": 8.77215714080154e-08, + "loss": 0.6692, + "step": 28155 + }, + { + "epoch": 2.84, + "grad_norm": 5.361968654665912, + "learning_rate": 8.717542369667986e-08, + "loss": 0.6632, + "step": 28160 + }, + { + "epoch": 2.84, + "grad_norm": 4.786993702451831, + "learning_rate": 8.66309664835896e-08, + "loss": 0.6265, + "step": 28165 + }, + { + "epoch": 2.84, + "grad_norm": 4.65748116787266, + "learning_rate": 8.608819995609085e-08, + "loss": 0.6647, + "step": 28170 + }, + { + "epoch": 2.84, + "grad_norm": 4.762331116389162, + "learning_rate": 8.554712430094925e-08, + "loss": 0.672, + "step": 28175 + }, + { + "epoch": 2.84, + "grad_norm": 4.6782388279918115, + "learning_rate": 8.50077397043475e-08, + "loss": 0.6628, + "step": 28180 + }, + { + "epoch": 2.84, + "grad_norm": 4.888639645100182, + "learning_rate": 8.447004635188882e-08, + "loss": 0.638, + "step": 28185 + }, + { + "epoch": 2.84, + "grad_norm": 4.782015532131001, + "learning_rate": 8.393404442859243e-08, + "loss": 0.6418, + "step": 28190 + }, + { + "epoch": 2.84, + "grad_norm": 4.625853443548703, + "learning_rate": 8.33997341188958e-08, + "loss": 0.6081, + "step": 28195 + }, + { + "epoch": 2.84, + "grad_norm": 4.898338864474646, + "learning_rate": 8.286711560665461e-08, + "loss": 0.6188, + "step": 28200 + }, + { + "epoch": 2.84, + "grad_norm": 5.4494813409120475, + "learning_rate": 8.233618907514285e-08, + "loss": 0.6067, + "step": 28205 + }, + { + "epoch": 2.84, + "grad_norm": 4.97554379414742, + "learning_rate": 8.180695470705157e-08, + "loss": 0.6264, + "step": 28210 + }, + { + "epoch": 2.84, + "grad_norm": 4.384806368994531, + "learning_rate": 8.127941268448903e-08, + "loss": 0.6668, + "step": 28215 + }, + { + "epoch": 2.85, + "grad_norm": 4.543114379222686, + "learning_rate": 8.075356318898331e-08, + "loss": 0.6087, + "step": 28220 + }, + { + "epoch": 2.85, + "grad_norm": 4.960937328784922, + "learning_rate": 8.022940640147803e-08, + "loss": 0.657, + "step": 28225 + }, + { + "epoch": 2.85, + "grad_norm": 4.8604583820643255, + "learning_rate": 7.970694250233502e-08, + "loss": 0.6145, + "step": 28230 + }, + { + "epoch": 2.85, + "grad_norm": 5.0096396603393805, + "learning_rate": 7.91861716713338e-08, + "loss": 0.6769, + "step": 28235 + }, + { + "epoch": 2.85, + "grad_norm": 4.488894932987289, + "learning_rate": 7.866709408767104e-08, + "loss": 0.6487, + "step": 28240 + }, + { + "epoch": 2.85, + "grad_norm": 5.000269938677299, + "learning_rate": 7.814970992995996e-08, + "loss": 0.6416, + "step": 28245 + }, + { + "epoch": 2.85, + "grad_norm": 5.439658402094098, + "learning_rate": 7.763401937623371e-08, + "loss": 0.66, + "step": 28250 + }, + { + "epoch": 2.85, + "grad_norm": 4.211970429252045, + "learning_rate": 7.712002260393925e-08, + "loss": 0.5953, + "step": 28255 + }, + { + "epoch": 2.85, + "grad_norm": 5.0036940064815605, + "learning_rate": 7.660771978994397e-08, + "loss": 0.6751, + "step": 28260 + }, + { + "epoch": 2.85, + "grad_norm": 5.004269407828965, + "learning_rate": 7.609711111052964e-08, + "loss": 0.6117, + "step": 28265 + }, + { + "epoch": 2.85, + "grad_norm": 4.73802167825799, + "learning_rate": 7.558819674139628e-08, + "loss": 0.625, + "step": 28270 + }, + { + "epoch": 2.85, + "grad_norm": 4.513531142950939, + "learning_rate": 7.508097685766103e-08, + "loss": 0.6253, + "step": 28275 + }, + { + "epoch": 2.85, + "grad_norm": 4.782433144091685, + "learning_rate": 7.457545163385815e-08, + "loss": 0.6081, + "step": 28280 + }, + { + "epoch": 2.85, + "grad_norm": 4.122275711517098, + "learning_rate": 7.407162124393741e-08, + "loss": 0.6561, + "step": 28285 + }, + { + "epoch": 2.85, + "grad_norm": 4.99388262921669, + "learning_rate": 7.356948586126789e-08, + "loss": 0.6557, + "step": 28290 + }, + { + "epoch": 2.85, + "grad_norm": 4.40740378136391, + "learning_rate": 7.306904565863249e-08, + "loss": 0.6308, + "step": 28295 + }, + { + "epoch": 2.85, + "grad_norm": 4.894685108940628, + "learning_rate": 7.25703008082329e-08, + "loss": 0.6651, + "step": 28300 + }, + { + "epoch": 2.85, + "grad_norm": 4.512397581048909, + "learning_rate": 7.207325148168631e-08, + "loss": 0.6553, + "step": 28305 + }, + { + "epoch": 2.85, + "grad_norm": 4.426837513909502, + "learning_rate": 7.1577897850027e-08, + "loss": 0.6654, + "step": 28310 + }, + { + "epoch": 2.85, + "grad_norm": 4.644005315701208, + "learning_rate": 7.10842400837064e-08, + "loss": 0.6559, + "step": 28315 + }, + { + "epoch": 2.86, + "grad_norm": 4.608825459085924, + "learning_rate": 7.059227835259086e-08, + "loss": 0.6582, + "step": 28320 + }, + { + "epoch": 2.86, + "grad_norm": 4.95551288211116, + "learning_rate": 7.010201282596385e-08, + "loss": 0.6505, + "step": 28325 + }, + { + "epoch": 2.86, + "grad_norm": 6.002840384245793, + "learning_rate": 6.961344367252654e-08, + "loss": 0.6185, + "step": 28330 + }, + { + "epoch": 2.86, + "grad_norm": 6.739543685579368, + "learning_rate": 6.912657106039389e-08, + "loss": 0.6302, + "step": 28335 + }, + { + "epoch": 2.86, + "grad_norm": 4.368697153210344, + "learning_rate": 6.8641395157098e-08, + "loss": 0.656, + "step": 28340 + }, + { + "epoch": 2.86, + "grad_norm": 5.085692854459775, + "learning_rate": 6.815791612958866e-08, + "loss": 0.6457, + "step": 28345 + }, + { + "epoch": 2.86, + "grad_norm": 4.445170781602102, + "learning_rate": 6.767613414422946e-08, + "loss": 0.6764, + "step": 28350 + }, + { + "epoch": 2.86, + "grad_norm": 4.554420320419269, + "learning_rate": 6.719604936680168e-08, + "loss": 0.6405, + "step": 28355 + }, + { + "epoch": 2.86, + "grad_norm": 4.608501096926762, + "learning_rate": 6.671766196250207e-08, + "loss": 0.6301, + "step": 28360 + }, + { + "epoch": 2.86, + "grad_norm": 4.375838707436465, + "learning_rate": 6.624097209594338e-08, + "loss": 0.6133, + "step": 28365 + }, + { + "epoch": 2.86, + "grad_norm": 4.5510601164594044, + "learning_rate": 6.57659799311533e-08, + "loss": 0.648, + "step": 28370 + }, + { + "epoch": 2.86, + "grad_norm": 4.507057003173995, + "learning_rate": 6.529268563157775e-08, + "loss": 0.6167, + "step": 28375 + }, + { + "epoch": 2.86, + "grad_norm": 4.530000484593963, + "learning_rate": 6.48210893600748e-08, + "loss": 0.6149, + "step": 28380 + }, + { + "epoch": 2.86, + "grad_norm": 4.561958919883844, + "learning_rate": 6.435119127892187e-08, + "loss": 0.6024, + "step": 28385 + }, + { + "epoch": 2.86, + "grad_norm": 5.857815718909283, + "learning_rate": 6.388299154981014e-08, + "loss": 0.6474, + "step": 28390 + }, + { + "epoch": 2.86, + "grad_norm": 4.530181675162402, + "learning_rate": 6.341649033384633e-08, + "loss": 0.6614, + "step": 28395 + }, + { + "epoch": 2.86, + "grad_norm": 5.168044989441414, + "learning_rate": 6.295168779155315e-08, + "loss": 0.6458, + "step": 28400 + }, + { + "epoch": 2.86, + "grad_norm": 4.6901341662647695, + "learning_rate": 6.248858408286872e-08, + "loss": 0.6399, + "step": 28405 + }, + { + "epoch": 2.86, + "grad_norm": 4.4105278572727125, + "learning_rate": 6.202717936714675e-08, + "loss": 0.6443, + "step": 28410 + }, + { + "epoch": 2.86, + "grad_norm": 4.419717470241641, + "learning_rate": 6.15674738031563e-08, + "loss": 0.6177, + "step": 28415 + }, + { + "epoch": 2.87, + "grad_norm": 4.984848875763577, + "learning_rate": 6.110946754908087e-08, + "loss": 0.6572, + "step": 28420 + }, + { + "epoch": 2.87, + "grad_norm": 5.196230562299611, + "learning_rate": 6.065316076252103e-08, + "loss": 0.6326, + "step": 28425 + }, + { + "epoch": 2.87, + "grad_norm": 4.6588927974984395, + "learning_rate": 6.01985536004901e-08, + "loss": 0.6516, + "step": 28430 + }, + { + "epoch": 2.87, + "grad_norm": 4.515798517057653, + "learning_rate": 5.974564621941958e-08, + "loss": 0.6212, + "step": 28435 + }, + { + "epoch": 2.87, + "grad_norm": 4.920040257974786, + "learning_rate": 5.929443877515317e-08, + "loss": 0.6606, + "step": 28440 + }, + { + "epoch": 2.87, + "grad_norm": 5.288818752985311, + "learning_rate": 5.884493142295222e-08, + "loss": 0.6362, + "step": 28445 + }, + { + "epoch": 2.87, + "grad_norm": 4.6413709322516885, + "learning_rate": 5.839712431749078e-08, + "loss": 0.5756, + "step": 28450 + }, + { + "epoch": 2.87, + "grad_norm": 4.351117395647321, + "learning_rate": 5.7951017612858926e-08, + "loss": 0.6283, + "step": 28455 + }, + { + "epoch": 2.87, + "grad_norm": 4.561415313136726, + "learning_rate": 5.750661146256165e-08, + "loss": 0.6549, + "step": 28460 + }, + { + "epoch": 2.87, + "grad_norm": 5.008280245016394, + "learning_rate": 5.7063906019518277e-08, + "loss": 0.6482, + "step": 28465 + }, + { + "epoch": 2.87, + "grad_norm": 4.273730171002147, + "learning_rate": 5.6622901436064746e-08, + "loss": 0.5978, + "step": 28470 + }, + { + "epoch": 2.87, + "grad_norm": 4.5930350497445405, + "learning_rate": 5.6183597863948555e-08, + "loss": 0.6405, + "step": 28475 + }, + { + "epoch": 2.87, + "grad_norm": 5.114048875812452, + "learning_rate": 5.57459954543349e-08, + "loss": 0.6446, + "step": 28480 + }, + { + "epoch": 2.87, + "grad_norm": 5.072325169261744, + "learning_rate": 5.531009435780166e-08, + "loss": 0.6248, + "step": 28485 + }, + { + "epoch": 2.87, + "grad_norm": 4.572562477929448, + "learning_rate": 5.487589472434274e-08, + "loss": 0.6274, + "step": 28490 + }, + { + "epoch": 2.87, + "grad_norm": 6.4104757417857945, + "learning_rate": 5.444339670336474e-08, + "loss": 0.6602, + "step": 28495 + }, + { + "epoch": 2.87, + "grad_norm": 4.555281937287776, + "learning_rate": 5.401260044369028e-08, + "loss": 0.6843, + "step": 28500 + }, + { + "epoch": 2.87, + "grad_norm": 4.332120683708192, + "learning_rate": 5.358350609355634e-08, + "loss": 0.6547, + "step": 28505 + }, + { + "epoch": 2.87, + "grad_norm": 5.392595167719035, + "learning_rate": 5.31561138006137e-08, + "loss": 0.6257, + "step": 28510 + }, + { + "epoch": 2.87, + "grad_norm": 5.111162902561142, + "learning_rate": 5.2730423711927495e-08, + "loss": 0.6605, + "step": 28515 + }, + { + "epoch": 2.88, + "grad_norm": 5.485432804363226, + "learning_rate": 5.230643597397722e-08, + "loss": 0.6624, + "step": 28520 + }, + { + "epoch": 2.88, + "grad_norm": 4.467263676798901, + "learning_rate": 5.1884150732656737e-08, + "loss": 0.6322, + "step": 28525 + }, + { + "epoch": 2.88, + "grad_norm": 5.378733829067483, + "learning_rate": 5.146356813327369e-08, + "loss": 0.6203, + "step": 28530 + }, + { + "epoch": 2.88, + "grad_norm": 4.93470561841416, + "learning_rate": 5.104468832055065e-08, + "loss": 0.6239, + "step": 28535 + }, + { + "epoch": 2.88, + "grad_norm": 5.605602015042057, + "learning_rate": 5.062751143862399e-08, + "loss": 0.6922, + "step": 28540 + }, + { + "epoch": 2.88, + "grad_norm": 4.825177520864215, + "learning_rate": 5.021203763104332e-08, + "loss": 0.6379, + "step": 28545 + }, + { + "epoch": 2.88, + "grad_norm": 6.085782351837812, + "learning_rate": 4.979826704077262e-08, + "loss": 0.6138, + "step": 28550 + }, + { + "epoch": 2.88, + "grad_norm": 6.327960087836945, + "learning_rate": 4.9386199810190214e-08, + "loss": 0.6185, + "step": 28555 + }, + { + "epoch": 2.88, + "grad_norm": 4.725564634623508, + "learning_rate": 4.8975836081088246e-08, + "loss": 0.6457, + "step": 28560 + }, + { + "epoch": 2.88, + "grad_norm": 5.797051546102331, + "learning_rate": 4.856717599467209e-08, + "loss": 0.6204, + "step": 28565 + }, + { + "epoch": 2.88, + "grad_norm": 4.555268789277485, + "learning_rate": 4.816021969156148e-08, + "loss": 0.6455, + "step": 28570 + }, + { + "epoch": 2.88, + "grad_norm": 4.6302031379590485, + "learning_rate": 4.775496731178997e-08, + "loss": 0.6175, + "step": 28575 + }, + { + "epoch": 2.88, + "grad_norm": 4.398912320781518, + "learning_rate": 4.735141899480433e-08, + "loss": 0.6231, + "step": 28580 + }, + { + "epoch": 2.88, + "grad_norm": 5.553316084687461, + "learning_rate": 4.694957487946517e-08, + "loss": 0.6552, + "step": 28585 + }, + { + "epoch": 2.88, + "grad_norm": 4.666877163122385, + "learning_rate": 4.65494351040463e-08, + "loss": 0.6706, + "step": 28590 + }, + { + "epoch": 2.88, + "grad_norm": 4.5545756201040595, + "learning_rate": 4.615099980623594e-08, + "loss": 0.6599, + "step": 28595 + }, + { + "epoch": 2.88, + "grad_norm": 5.254769283834533, + "learning_rate": 4.575426912313441e-08, + "loss": 0.6596, + "step": 28600 + }, + { + "epoch": 2.88, + "grad_norm": 5.205245943354018, + "learning_rate": 4.5359243191258065e-08, + "loss": 0.6479, + "step": 28605 + }, + { + "epoch": 2.88, + "grad_norm": 4.379569371062987, + "learning_rate": 4.496592214653317e-08, + "loss": 0.6546, + "step": 28610 + }, + { + "epoch": 2.89, + "grad_norm": 4.586698231517767, + "learning_rate": 4.457430612430202e-08, + "loss": 0.6606, + "step": 28615 + }, + { + "epoch": 2.89, + "grad_norm": 4.551421074840228, + "learning_rate": 4.4184395259319055e-08, + "loss": 0.6152, + "step": 28620 + }, + { + "epoch": 2.89, + "grad_norm": 5.2776623277684696, + "learning_rate": 4.379618968575305e-08, + "loss": 0.6255, + "step": 28625 + }, + { + "epoch": 2.89, + "grad_norm": 5.500571494344823, + "learning_rate": 4.340968953718327e-08, + "loss": 0.6573, + "step": 28630 + }, + { + "epoch": 2.89, + "grad_norm": 4.57162691956062, + "learning_rate": 4.302489494660611e-08, + "loss": 0.6461, + "step": 28635 + }, + { + "epoch": 2.89, + "grad_norm": 5.088584468734806, + "learning_rate": 4.264180604642787e-08, + "loss": 0.6568, + "step": 28640 + }, + { + "epoch": 2.89, + "grad_norm": 4.835489483073978, + "learning_rate": 4.226042296846977e-08, + "loss": 0.6373, + "step": 28645 + }, + { + "epoch": 2.89, + "grad_norm": 4.995947787844424, + "learning_rate": 4.1880745843964065e-08, + "loss": 0.6473, + "step": 28650 + }, + { + "epoch": 2.89, + "grad_norm": 6.055346263069701, + "learning_rate": 4.150277480355902e-08, + "loss": 0.6614, + "step": 28655 + }, + { + "epoch": 2.89, + "grad_norm": 4.192967250079167, + "learning_rate": 4.1126509977312266e-08, + "loss": 0.5951, + "step": 28660 + }, + { + "epoch": 2.89, + "grad_norm": 5.1990557125341, + "learning_rate": 4.075195149469802e-08, + "loss": 0.6582, + "step": 28665 + }, + { + "epoch": 2.89, + "grad_norm": 4.760477686001706, + "learning_rate": 4.0379099484600396e-08, + "loss": 0.6702, + "step": 28670 + }, + { + "epoch": 2.89, + "grad_norm": 4.6232514064036, + "learning_rate": 4.000795407531788e-08, + "loss": 0.6625, + "step": 28675 + }, + { + "epoch": 2.89, + "grad_norm": 5.6543014868891825, + "learning_rate": 3.963851539456054e-08, + "loss": 0.6184, + "step": 28680 + }, + { + "epoch": 2.89, + "grad_norm": 5.579939773683967, + "learning_rate": 3.9270783569452794e-08, + "loss": 0.6706, + "step": 28685 + }, + { + "epoch": 2.89, + "grad_norm": 4.9146514121656235, + "learning_rate": 3.89047587265301e-08, + "loss": 0.6489, + "step": 28690 + }, + { + "epoch": 2.89, + "grad_norm": 5.571272333715706, + "learning_rate": 3.85404409917417e-08, + "loss": 0.6581, + "step": 28695 + }, + { + "epoch": 2.89, + "grad_norm": 4.444044399986175, + "learning_rate": 3.817783049044899e-08, + "loss": 0.6373, + "step": 28700 + }, + { + "epoch": 2.89, + "grad_norm": 4.679007030962427, + "learning_rate": 3.781692734742548e-08, + "loss": 0.6619, + "step": 28705 + }, + { + "epoch": 2.89, + "grad_norm": 4.484198835518726, + "learning_rate": 3.745773168685796e-08, + "loss": 0.654, + "step": 28710 + }, + { + "epoch": 2.9, + "grad_norm": 4.51583489447258, + "learning_rate": 3.7100243632344767e-08, + "loss": 0.627, + "step": 28715 + }, + { + "epoch": 2.9, + "grad_norm": 5.491214712554003, + "learning_rate": 3.674446330689807e-08, + "loss": 0.6442, + "step": 28720 + }, + { + "epoch": 2.9, + "grad_norm": 4.871538819942215, + "learning_rate": 3.639039083294105e-08, + "loss": 0.6495, + "step": 28725 + }, + { + "epoch": 2.9, + "grad_norm": 4.532573252897236, + "learning_rate": 3.603802633230957e-08, + "loss": 0.6503, + "step": 28730 + }, + { + "epoch": 2.9, + "grad_norm": 4.605378611351734, + "learning_rate": 3.5687369926252216e-08, + "loss": 0.6521, + "step": 28735 + }, + { + "epoch": 2.9, + "grad_norm": 4.854656221628359, + "learning_rate": 3.533842173542967e-08, + "loss": 0.6543, + "step": 28740 + }, + { + "epoch": 2.9, + "grad_norm": 5.566033551316717, + "learning_rate": 3.499118187991368e-08, + "loss": 0.6348, + "step": 28745 + }, + { + "epoch": 2.9, + "grad_norm": 4.536806510621569, + "learning_rate": 3.464565047919033e-08, + "loss": 0.6349, + "step": 28750 + }, + { + "epoch": 2.9, + "grad_norm": 4.573169257564416, + "learning_rate": 3.430182765215617e-08, + "loss": 0.6367, + "step": 28755 + }, + { + "epoch": 2.9, + "grad_norm": 5.124899593147868, + "learning_rate": 3.395971351712046e-08, + "loss": 0.622, + "step": 28760 + }, + { + "epoch": 2.9, + "grad_norm": 5.411106375879562, + "learning_rate": 3.361930819180404e-08, + "loss": 0.6182, + "step": 28765 + }, + { + "epoch": 2.9, + "grad_norm": 4.701918083291764, + "learning_rate": 3.328061179334041e-08, + "loss": 0.6617, + "step": 28770 + }, + { + "epoch": 2.9, + "grad_norm": 4.451394066867098, + "learning_rate": 3.294362443827415e-08, + "loss": 0.6506, + "step": 28775 + }, + { + "epoch": 2.9, + "grad_norm": 4.966870524830984, + "learning_rate": 3.260834624256304e-08, + "loss": 0.6466, + "step": 28780 + }, + { + "epoch": 2.9, + "grad_norm": 4.952005053396158, + "learning_rate": 3.227477732157536e-08, + "loss": 0.6261, + "step": 28785 + }, + { + "epoch": 2.9, + "grad_norm": 4.530753020996644, + "learning_rate": 3.1942917790092064e-08, + "loss": 0.6375, + "step": 28790 + }, + { + "epoch": 2.9, + "grad_norm": 4.760861224352346, + "learning_rate": 3.1612767762305706e-08, + "loss": 0.6499, + "step": 28795 + }, + { + "epoch": 2.9, + "grad_norm": 4.295427145204775, + "learning_rate": 3.1284327351820964e-08, + "loss": 0.6451, + "step": 28800 + }, + { + "epoch": 2.9, + "grad_norm": 4.941272981204676, + "learning_rate": 3.095759667165299e-08, + "loss": 0.6474, + "step": 28805 + }, + { + "epoch": 2.9, + "grad_norm": 5.0352796812221525, + "learning_rate": 3.0632575834230184e-08, + "loss": 0.6505, + "step": 28810 + }, + { + "epoch": 2.91, + "grad_norm": 4.187736867890282, + "learning_rate": 3.030926495139142e-08, + "loss": 0.6327, + "step": 28815 + }, + { + "epoch": 2.91, + "grad_norm": 4.441092784267658, + "learning_rate": 2.998766413438881e-08, + "loss": 0.6631, + "step": 28820 + }, + { + "epoch": 2.91, + "grad_norm": 4.575506974996182, + "learning_rate": 2.9667773493883833e-08, + "loss": 0.6408, + "step": 28825 + }, + { + "epoch": 2.91, + "grad_norm": 4.9177492504304015, + "learning_rate": 2.9349593139950116e-08, + "loss": 0.6361, + "step": 28830 + }, + { + "epoch": 2.91, + "grad_norm": 6.001552291691809, + "learning_rate": 2.903312318207452e-08, + "loss": 0.6109, + "step": 28835 + }, + { + "epoch": 2.91, + "grad_norm": 6.813831831113115, + "learning_rate": 2.8718363729153264e-08, + "loss": 0.5963, + "step": 28840 + }, + { + "epoch": 2.91, + "grad_norm": 4.465535623922916, + "learning_rate": 2.8405314889494718e-08, + "loss": 0.6458, + "step": 28845 + }, + { + "epoch": 2.91, + "grad_norm": 4.728693209432457, + "learning_rate": 2.8093976770819377e-08, + "loss": 0.6618, + "step": 28850 + }, + { + "epoch": 2.91, + "grad_norm": 4.665203123879959, + "learning_rate": 2.7784349480257654e-08, + "loss": 0.6137, + "step": 28855 + }, + { + "epoch": 2.91, + "grad_norm": 5.069120938493329, + "learning_rate": 2.747643312435211e-08, + "loss": 0.6607, + "step": 28860 + }, + { + "epoch": 2.91, + "grad_norm": 5.394533717027899, + "learning_rate": 2.717022780905687e-08, + "loss": 0.6679, + "step": 28865 + }, + { + "epoch": 2.91, + "grad_norm": 4.495963076629805, + "learning_rate": 2.686573363973599e-08, + "loss": 0.6757, + "step": 28870 + }, + { + "epoch": 2.91, + "grad_norm": 7.649924820960543, + "learning_rate": 2.6562950721167324e-08, + "loss": 0.6546, + "step": 28875 + }, + { + "epoch": 2.91, + "grad_norm": 5.541609873383282, + "learning_rate": 2.6261879157536417e-08, + "loss": 0.6147, + "step": 28880 + }, + { + "epoch": 2.91, + "grad_norm": 5.56383200283721, + "learning_rate": 2.5962519052442627e-08, + "loss": 0.6423, + "step": 28885 + }, + { + "epoch": 2.91, + "grad_norm": 4.566909873140656, + "learning_rate": 2.566487050889521e-08, + "loss": 0.6306, + "step": 28890 + }, + { + "epoch": 2.91, + "grad_norm": 5.792659340433287, + "learning_rate": 2.536893362931503e-08, + "loss": 0.6826, + "step": 28895 + }, + { + "epoch": 2.91, + "grad_norm": 4.966064284959946, + "learning_rate": 2.5074708515532843e-08, + "loss": 0.625, + "step": 28900 + }, + { + "epoch": 2.91, + "grad_norm": 5.277741643462846, + "learning_rate": 2.4782195268792663e-08, + "loss": 0.662, + "step": 28905 + }, + { + "epoch": 2.91, + "grad_norm": 4.4850580438407865, + "learning_rate": 2.449139398974676e-08, + "loss": 0.6188, + "step": 28910 + }, + { + "epoch": 2.92, + "grad_norm": 4.677554062649509, + "learning_rate": 2.4202304778460083e-08, + "loss": 0.6213, + "step": 28915 + }, + { + "epoch": 2.92, + "grad_norm": 4.664419136392676, + "learning_rate": 2.3914927734408068e-08, + "loss": 0.7018, + "step": 28920 + }, + { + "epoch": 2.92, + "grad_norm": 4.547867417991105, + "learning_rate": 2.3629262956476605e-08, + "loss": 0.6345, + "step": 28925 + }, + { + "epoch": 2.92, + "grad_norm": 4.245861658306631, + "learning_rate": 2.334531054296263e-08, + "loss": 0.6428, + "step": 28930 + }, + { + "epoch": 2.92, + "grad_norm": 5.527071288715775, + "learning_rate": 2.306307059157409e-08, + "loss": 0.6533, + "step": 28935 + }, + { + "epoch": 2.92, + "grad_norm": 5.01586437245096, + "learning_rate": 2.2782543199429407e-08, + "loss": 0.6258, + "step": 28940 + }, + { + "epoch": 2.92, + "grad_norm": 5.136739091567763, + "learning_rate": 2.250372846305804e-08, + "loss": 0.6246, + "step": 28945 + }, + { + "epoch": 2.92, + "grad_norm": 4.407009944313833, + "learning_rate": 2.222662647839935e-08, + "loss": 0.6838, + "step": 28950 + }, + { + "epoch": 2.92, + "grad_norm": 4.477006030237568, + "learning_rate": 2.1951237340804287e-08, + "loss": 0.6309, + "step": 28955 + }, + { + "epoch": 2.92, + "grad_norm": 4.92534364353323, + "learning_rate": 2.1677561145034275e-08, + "loss": 0.6635, + "step": 28960 + }, + { + "epoch": 2.92, + "grad_norm": 5.0919001765583545, + "learning_rate": 2.1405597985260096e-08, + "loss": 0.656, + "step": 28965 + }, + { + "epoch": 2.92, + "grad_norm": 4.620281001158985, + "learning_rate": 2.1135347955064666e-08, + "loss": 0.6142, + "step": 28970 + }, + { + "epoch": 2.92, + "grad_norm": 4.52863045661348, + "learning_rate": 2.0866811147440825e-08, + "loss": 0.6213, + "step": 28975 + }, + { + "epoch": 2.92, + "grad_norm": 4.328487665223623, + "learning_rate": 2.0599987654791877e-08, + "loss": 0.6211, + "step": 28980 + }, + { + "epoch": 2.92, + "grad_norm": 4.585527272224497, + "learning_rate": 2.0334877568930488e-08, + "loss": 0.6704, + "step": 28985 + }, + { + "epoch": 2.92, + "grad_norm": 4.881744284413331, + "learning_rate": 2.0071480981082025e-08, + "loss": 0.6598, + "step": 28990 + }, + { + "epoch": 2.92, + "grad_norm": 5.006635771364114, + "learning_rate": 1.980979798188065e-08, + "loss": 0.5994, + "step": 28995 + }, + { + "epoch": 2.92, + "grad_norm": 5.228311640387504, + "learning_rate": 1.9549828661371562e-08, + "loss": 0.6527, + "step": 29000 + }, + { + "epoch": 2.92, + "grad_norm": 5.266788354535082, + "learning_rate": 1.9291573109008754e-08, + "loss": 0.6247, + "step": 29005 + }, + { + "epoch": 2.92, + "grad_norm": 4.670303037452094, + "learning_rate": 1.903503141365892e-08, + "loss": 0.6328, + "step": 29010 + }, + { + "epoch": 2.93, + "grad_norm": 5.461366194656828, + "learning_rate": 1.8780203663597562e-08, + "loss": 0.6534, + "step": 29015 + }, + { + "epoch": 2.93, + "grad_norm": 4.666554491042867, + "learning_rate": 1.852708994651009e-08, + "loss": 0.6293, + "step": 29020 + }, + { + "epoch": 2.93, + "grad_norm": 4.600295377045515, + "learning_rate": 1.82756903494935e-08, + "loss": 0.6302, + "step": 29025 + }, + { + "epoch": 2.93, + "grad_norm": 5.060027863989442, + "learning_rate": 1.80260049590536e-08, + "loss": 0.6439, + "step": 29030 + }, + { + "epoch": 2.93, + "grad_norm": 4.7525193226534554, + "learning_rate": 1.7778033861106657e-08, + "loss": 0.6362, + "step": 29035 + }, + { + "epoch": 2.93, + "grad_norm": 5.346645034935819, + "learning_rate": 1.7531777140980534e-08, + "loss": 0.6455, + "step": 29040 + }, + { + "epoch": 2.93, + "grad_norm": 4.530727983211217, + "learning_rate": 1.7287234883410775e-08, + "loss": 0.6448, + "step": 29045 + }, + { + "epoch": 2.93, + "grad_norm": 4.409577499950719, + "learning_rate": 1.7044407172543964e-08, + "loss": 0.6295, + "step": 29050 + }, + { + "epoch": 2.93, + "grad_norm": 4.738767443957085, + "learning_rate": 1.6803294091937707e-08, + "loss": 0.6209, + "step": 29055 + }, + { + "epoch": 2.93, + "grad_norm": 4.795117614097678, + "learning_rate": 1.6563895724558966e-08, + "loss": 0.6724, + "step": 29060 + }, + { + "epoch": 2.93, + "grad_norm": 4.948788391002489, + "learning_rate": 1.632621215278296e-08, + "loss": 0.6555, + "step": 29065 + }, + { + "epoch": 2.93, + "grad_norm": 5.1906257152362905, + "learning_rate": 1.609024345839816e-08, + "loss": 0.6347, + "step": 29070 + }, + { + "epoch": 2.93, + "grad_norm": 4.519593142681205, + "learning_rate": 1.5855989722600164e-08, + "loss": 0.6477, + "step": 29075 + }, + { + "epoch": 2.93, + "grad_norm": 7.336439870075627, + "learning_rate": 1.5623451025995605e-08, + "loss": 0.6656, + "step": 29080 + }, + { + "epoch": 2.93, + "grad_norm": 5.152827841994058, + "learning_rate": 1.5392627448601038e-08, + "loss": 0.6836, + "step": 29085 + }, + { + "epoch": 2.93, + "grad_norm": 4.919461978854965, + "learning_rate": 1.5163519069842924e-08, + "loss": 0.641, + "step": 29090 + }, + { + "epoch": 2.93, + "grad_norm": 5.28278762714753, + "learning_rate": 1.4936125968555982e-08, + "loss": 0.6402, + "step": 29095 + }, + { + "epoch": 2.93, + "grad_norm": 4.664234099410667, + "learning_rate": 1.4710448222987617e-08, + "loss": 0.6693, + "step": 29100 + }, + { + "epoch": 2.93, + "grad_norm": 4.5214278499396485, + "learning_rate": 1.448648591079238e-08, + "loss": 0.6272, + "step": 29105 + }, + { + "epoch": 2.93, + "grad_norm": 5.322543422565335, + "learning_rate": 1.4264239109035293e-08, + "loss": 0.6445, + "step": 29110 + }, + { + "epoch": 2.94, + "grad_norm": 5.001090059520327, + "learning_rate": 1.4043707894191294e-08, + "loss": 0.6431, + "step": 29115 + }, + { + "epoch": 2.94, + "grad_norm": 4.858119709583894, + "learning_rate": 1.382489234214579e-08, + "loss": 0.6427, + "step": 29120 + }, + { + "epoch": 2.94, + "grad_norm": 4.12636538178491, + "learning_rate": 1.3607792528192443e-08, + "loss": 0.664, + "step": 29125 + }, + { + "epoch": 2.94, + "grad_norm": 4.381062848659886, + "learning_rate": 1.3392408527034828e-08, + "loss": 0.6266, + "step": 29130 + }, + { + "epoch": 2.94, + "grad_norm": 4.636254099359382, + "learning_rate": 1.3178740412786995e-08, + "loss": 0.6124, + "step": 29135 + }, + { + "epoch": 2.94, + "grad_norm": 5.89566771162359, + "learning_rate": 1.2966788258971797e-08, + "loss": 0.6728, + "step": 29140 + }, + { + "epoch": 2.94, + "grad_norm": 4.962272202172932, + "learning_rate": 1.275655213852145e-08, + "loss": 0.6557, + "step": 29145 + }, + { + "epoch": 2.94, + "grad_norm": 4.58680564099952, + "learning_rate": 1.2548032123777531e-08, + "loss": 0.6365, + "step": 29150 + }, + { + "epoch": 2.94, + "grad_norm": 4.482177696174969, + "learning_rate": 1.23412282864932e-08, + "loss": 0.6259, + "step": 29155 + }, + { + "epoch": 2.94, + "grad_norm": 4.556075353290765, + "learning_rate": 1.2136140697827647e-08, + "loss": 0.6113, + "step": 29160 + }, + { + "epoch": 2.94, + "grad_norm": 4.9236493920375715, + "learning_rate": 1.1932769428352753e-08, + "loss": 0.6693, + "step": 29165 + }, + { + "epoch": 2.94, + "grad_norm": 4.378265065373981, + "learning_rate": 1.1731114548046984e-08, + "loss": 0.6324, + "step": 29170 + }, + { + "epoch": 2.94, + "grad_norm": 5.684460127715717, + "learning_rate": 1.1531176126300948e-08, + "loss": 0.6659, + "step": 29175 + }, + { + "epoch": 2.94, + "grad_norm": 4.441044263152972, + "learning_rate": 1.1332954231912385e-08, + "loss": 0.656, + "step": 29180 + }, + { + "epoch": 2.94, + "grad_norm": 5.058056139113183, + "learning_rate": 1.1136448933090072e-08, + "loss": 0.6624, + "step": 29185 + }, + { + "epoch": 2.94, + "grad_norm": 6.786763547952368, + "learning_rate": 1.0941660297449919e-08, + "loss": 0.64, + "step": 29190 + }, + { + "epoch": 2.94, + "grad_norm": 4.766601665819574, + "learning_rate": 1.0748588392019976e-08, + "loss": 0.655, + "step": 29195 + }, + { + "epoch": 2.94, + "grad_norm": 4.627987229644329, + "learning_rate": 1.0557233283235436e-08, + "loss": 0.6978, + "step": 29200 + }, + { + "epoch": 2.94, + "grad_norm": 5.066294768413905, + "learning_rate": 1.0367595036941402e-08, + "loss": 0.6804, + "step": 29205 + }, + { + "epoch": 2.95, + "grad_norm": 5.36893000113714, + "learning_rate": 1.0179673718392347e-08, + "loss": 0.6507, + "step": 29210 + }, + { + "epoch": 2.95, + "grad_norm": 4.440729392593793, + "learning_rate": 9.993469392251542e-09, + "loss": 0.621, + "step": 29215 + }, + { + "epoch": 2.95, + "grad_norm": 5.157361517178861, + "learning_rate": 9.808982122591626e-09, + "loss": 0.6287, + "step": 29220 + }, + { + "epoch": 2.95, + "grad_norm": 5.203885568995115, + "learning_rate": 9.62621197289515e-09, + "loss": 0.6518, + "step": 29225 + }, + { + "epoch": 2.95, + "grad_norm": 4.6360921751758095, + "learning_rate": 9.445159006052918e-09, + "loss": 0.6565, + "step": 29230 + }, + { + "epoch": 2.95, + "grad_norm": 4.43492102869121, + "learning_rate": 9.26582328436454e-09, + "loss": 0.6023, + "step": 29235 + }, + { + "epoch": 2.95, + "grad_norm": 4.347866074081188, + "learning_rate": 9.088204869540096e-09, + "loss": 0.6087, + "step": 29240 + }, + { + "epoch": 2.95, + "grad_norm": 4.382927803473537, + "learning_rate": 8.912303822697366e-09, + "loss": 0.6657, + "step": 29245 + }, + { + "epoch": 2.95, + "grad_norm": 5.906534231293559, + "learning_rate": 8.738120204363487e-09, + "loss": 0.6746, + "step": 29250 + }, + { + "epoch": 2.95, + "grad_norm": 4.626001112094817, + "learning_rate": 8.565654074476071e-09, + "loss": 0.6432, + "step": 29255 + }, + { + "epoch": 2.95, + "grad_norm": 4.613228889261147, + "learning_rate": 8.394905492378758e-09, + "loss": 0.6669, + "step": 29260 + }, + { + "epoch": 2.95, + "grad_norm": 4.950407492252555, + "learning_rate": 8.225874516827325e-09, + "loss": 0.6557, + "step": 29265 + }, + { + "epoch": 2.95, + "grad_norm": 4.8617479981023735, + "learning_rate": 8.05856120598525e-09, + "loss": 0.64, + "step": 29270 + }, + { + "epoch": 2.95, + "grad_norm": 5.431349888918077, + "learning_rate": 7.892965617423698e-09, + "loss": 0.6653, + "step": 29275 + }, + { + "epoch": 2.95, + "grad_norm": 4.570237562146757, + "learning_rate": 7.729087808124868e-09, + "loss": 0.6714, + "step": 29280 + }, + { + "epoch": 2.95, + "grad_norm": 4.504146337990504, + "learning_rate": 7.56692783447921e-09, + "loss": 0.6527, + "step": 29285 + }, + { + "epoch": 2.95, + "grad_norm": 4.851906925953283, + "learning_rate": 7.406485752284864e-09, + "loss": 0.6225, + "step": 29290 + }, + { + "epoch": 2.95, + "grad_norm": 5.8402884513015625, + "learning_rate": 7.247761616750449e-09, + "loss": 0.6469, + "step": 29295 + }, + { + "epoch": 2.95, + "grad_norm": 4.4351542588978035, + "learning_rate": 7.0907554824922734e-09, + "loss": 0.6303, + "step": 29300 + }, + { + "epoch": 2.95, + "grad_norm": 4.64055372052512, + "learning_rate": 6.935467403536567e-09, + "loss": 0.6229, + "step": 29305 + }, + { + "epoch": 2.96, + "grad_norm": 4.894591370143973, + "learning_rate": 6.781897433317808e-09, + "loss": 0.7172, + "step": 29310 + }, + { + "epoch": 2.96, + "grad_norm": 4.798009572789954, + "learning_rate": 6.630045624678727e-09, + "loss": 0.6643, + "step": 29315 + }, + { + "epoch": 2.96, + "grad_norm": 4.366269183216473, + "learning_rate": 6.479912029872526e-09, + "loss": 0.6645, + "step": 29320 + }, + { + "epoch": 2.96, + "grad_norm": 5.12951703426055, + "learning_rate": 6.331496700558437e-09, + "loss": 0.6074, + "step": 29325 + }, + { + "epoch": 2.96, + "grad_norm": 4.854777420180117, + "learning_rate": 6.1847996878072745e-09, + "loss": 0.5839, + "step": 29330 + }, + { + "epoch": 2.96, + "grad_norm": 4.656105622679166, + "learning_rate": 6.039821042096439e-09, + "loss": 0.6215, + "step": 29335 + }, + { + "epoch": 2.96, + "grad_norm": 5.39358766372186, + "learning_rate": 5.896560813313801e-09, + "loss": 0.6232, + "step": 29340 + }, + { + "epoch": 2.96, + "grad_norm": 4.391871972235465, + "learning_rate": 5.755019050754373e-09, + "loss": 0.6248, + "step": 29345 + }, + { + "epoch": 2.96, + "grad_norm": 5.191262489667674, + "learning_rate": 5.6151958031230855e-09, + "loss": 0.6542, + "step": 29350 + }, + { + "epoch": 2.96, + "grad_norm": 4.596109269185588, + "learning_rate": 5.477091118532562e-09, + "loss": 0.637, + "step": 29355 + }, + { + "epoch": 2.96, + "grad_norm": 4.630244059530306, + "learning_rate": 5.34070504450479e-09, + "loss": 0.6157, + "step": 29360 + }, + { + "epoch": 2.96, + "grad_norm": 4.727987744073469, + "learning_rate": 5.2060376279700065e-09, + "loss": 0.6494, + "step": 29365 + }, + { + "epoch": 2.96, + "grad_norm": 4.508158304269665, + "learning_rate": 5.073088915267255e-09, + "loss": 0.647, + "step": 29370 + }, + { + "epoch": 2.96, + "grad_norm": 4.8063412094916815, + "learning_rate": 4.941858952143275e-09, + "loss": 0.6464, + "step": 29375 + }, + { + "epoch": 2.96, + "grad_norm": 4.577388459120201, + "learning_rate": 4.812347783755278e-09, + "loss": 0.6468, + "step": 29380 + }, + { + "epoch": 2.96, + "grad_norm": 5.693548063025151, + "learning_rate": 4.6845554546676164e-09, + "loss": 0.6603, + "step": 29385 + }, + { + "epoch": 2.96, + "grad_norm": 4.680480106578478, + "learning_rate": 4.558482008852894e-09, + "loss": 0.6685, + "step": 29390 + }, + { + "epoch": 2.96, + "grad_norm": 4.4956655850957965, + "learning_rate": 4.434127489693629e-09, + "loss": 0.6422, + "step": 29395 + }, + { + "epoch": 2.96, + "grad_norm": 4.334691883331565, + "learning_rate": 4.311491939979484e-09, + "loss": 0.666, + "step": 29400 + }, + { + "epoch": 2.96, + "grad_norm": 4.838608087764427, + "learning_rate": 4.190575401910036e-09, + "loss": 0.661, + "step": 29405 + }, + { + "epoch": 2.97, + "grad_norm": 7.5438973303204495, + "learning_rate": 4.071377917091446e-09, + "loss": 0.7081, + "step": 29410 + }, + { + "epoch": 2.97, + "grad_norm": 4.435445651075055, + "learning_rate": 3.95389952654035e-09, + "loss": 0.6411, + "step": 29415 + }, + { + "epoch": 2.97, + "grad_norm": 4.562288771628546, + "learning_rate": 3.838140270680524e-09, + "loss": 0.638, + "step": 29420 + }, + { + "epoch": 2.97, + "grad_norm": 6.483748229835458, + "learning_rate": 3.7241001893451035e-09, + "loss": 0.6452, + "step": 29425 + }, + { + "epoch": 2.97, + "grad_norm": 4.857673991541536, + "learning_rate": 3.611779321774367e-09, + "loss": 0.6119, + "step": 29430 + }, + { + "epoch": 2.97, + "grad_norm": 5.737987373281746, + "learning_rate": 3.501177706618508e-09, + "loss": 0.6412, + "step": 29435 + }, + { + "epoch": 2.97, + "grad_norm": 4.5811841822383785, + "learning_rate": 3.392295381935418e-09, + "loss": 0.6375, + "step": 29440 + }, + { + "epoch": 2.97, + "grad_norm": 4.978688056473059, + "learning_rate": 3.2851323851906812e-09, + "loss": 0.6638, + "step": 29445 + }, + { + "epoch": 2.97, + "grad_norm": 4.967654656326927, + "learning_rate": 3.179688753259802e-09, + "loss": 0.671, + "step": 29450 + }, + { + "epoch": 2.97, + "grad_norm": 4.528084776273715, + "learning_rate": 3.075964522425423e-09, + "loss": 0.6668, + "step": 29455 + }, + { + "epoch": 2.97, + "grad_norm": 5.7681723490814, + "learning_rate": 2.9739597283789946e-09, + "loss": 0.6578, + "step": 29460 + }, + { + "epoch": 2.97, + "grad_norm": 5.237788531018208, + "learning_rate": 2.8736744062202173e-09, + "loss": 0.6294, + "step": 29465 + }, + { + "epoch": 2.97, + "grad_norm": 5.369903138483945, + "learning_rate": 2.775108590457598e-09, + "loss": 0.6641, + "step": 29470 + }, + { + "epoch": 2.97, + "grad_norm": 4.349803726300211, + "learning_rate": 2.6782623150067857e-09, + "loss": 0.6181, + "step": 29475 + }, + { + "epoch": 2.97, + "grad_norm": 4.870894509436404, + "learning_rate": 2.583135613193344e-09, + "loss": 0.6202, + "step": 29480 + }, + { + "epoch": 2.97, + "grad_norm": 5.561984308798503, + "learning_rate": 2.4897285177494234e-09, + "loss": 0.6406, + "step": 29485 + }, + { + "epoch": 2.97, + "grad_norm": 4.970137297751924, + "learning_rate": 2.3980410608170914e-09, + "loss": 0.6224, + "step": 29490 + }, + { + "epoch": 2.97, + "grad_norm": 5.930025831692994, + "learning_rate": 2.3080732739455545e-09, + "loss": 0.6585, + "step": 29495 + }, + { + "epoch": 2.97, + "grad_norm": 6.05574857651503, + "learning_rate": 2.2198251880922726e-09, + "loss": 0.6353, + "step": 29500 + }, + { + "epoch": 2.97, + "grad_norm": 4.581775862440681, + "learning_rate": 2.1332968336240654e-09, + "loss": 0.6416, + "step": 29505 + }, + { + "epoch": 2.98, + "grad_norm": 4.370277333475543, + "learning_rate": 2.0484882403148943e-09, + "loss": 0.6805, + "step": 29510 + }, + { + "epoch": 2.98, + "grad_norm": 5.3428214713148945, + "learning_rate": 1.965399437347526e-09, + "loss": 0.6417, + "step": 29515 + }, + { + "epoch": 2.98, + "grad_norm": 4.8639952216571185, + "learning_rate": 1.8840304533118694e-09, + "loss": 0.6714, + "step": 29520 + }, + { + "epoch": 2.98, + "grad_norm": 5.333033103133333, + "learning_rate": 1.8043813162083036e-09, + "loss": 0.631, + "step": 29525 + }, + { + "epoch": 2.98, + "grad_norm": 4.919789592117598, + "learning_rate": 1.7264520534426844e-09, + "loss": 0.6757, + "step": 29530 + }, + { + "epoch": 2.98, + "grad_norm": 4.901987244613265, + "learning_rate": 1.6502426918313387e-09, + "loss": 0.6216, + "step": 29535 + }, + { + "epoch": 2.98, + "grad_norm": 4.979375934899607, + "learning_rate": 1.575753257597734e-09, + "loss": 0.6487, + "step": 29540 + }, + { + "epoch": 2.98, + "grad_norm": 4.651082696096523, + "learning_rate": 1.502983776373035e-09, + "loss": 0.6217, + "step": 29545 + }, + { + "epoch": 2.98, + "grad_norm": 4.466242260270571, + "learning_rate": 1.431934273197766e-09, + "loss": 0.6561, + "step": 29550 + }, + { + "epoch": 2.98, + "grad_norm": 4.667264339911075, + "learning_rate": 1.3626047725195934e-09, + "loss": 0.6647, + "step": 29555 + }, + { + "epoch": 2.98, + "grad_norm": 5.900916909243711, + "learning_rate": 1.2949952981949899e-09, + "loss": 0.6604, + "step": 29560 + }, + { + "epoch": 2.98, + "grad_norm": 4.553663231238028, + "learning_rate": 1.2291058734881234e-09, + "loss": 0.6462, + "step": 29565 + }, + { + "epoch": 2.98, + "grad_norm": 6.09318450504494, + "learning_rate": 1.1649365210714137e-09, + "loss": 0.6316, + "step": 29570 + }, + { + "epoch": 2.98, + "grad_norm": 4.814946633549386, + "learning_rate": 1.1024872630255311e-09, + "loss": 0.6211, + "step": 29575 + }, + { + "epoch": 2.98, + "grad_norm": 5.257440542630988, + "learning_rate": 1.0417581208393979e-09, + "loss": 0.6522, + "step": 29580 + }, + { + "epoch": 2.98, + "grad_norm": 4.669418086862083, + "learning_rate": 9.827491154096315e-10, + "loss": 0.6647, + "step": 29585 + }, + { + "epoch": 2.98, + "grad_norm": 4.696641863133449, + "learning_rate": 9.254602670416557e-10, + "loss": 0.6464, + "step": 29590 + }, + { + "epoch": 2.98, + "grad_norm": 4.579217604058113, + "learning_rate": 8.698915954480358e-10, + "loss": 0.619, + "step": 29595 + }, + { + "epoch": 2.98, + "grad_norm": 4.632195703141335, + "learning_rate": 8.160431197495878e-10, + "loss": 0.6176, + "step": 29600 + }, + { + "epoch": 2.98, + "grad_norm": 5.773687529065707, + "learning_rate": 7.639148584764889e-10, + "loss": 0.653, + "step": 29605 + }, + { + "epoch": 2.99, + "grad_norm": 4.402195898389152, + "learning_rate": 7.135068295649472e-10, + "loss": 0.6091, + "step": 29610 + }, + { + "epoch": 2.99, + "grad_norm": 4.683534156836418, + "learning_rate": 6.648190503610874e-10, + "loss": 0.6599, + "step": 29615 + }, + { + "epoch": 2.99, + "grad_norm": 5.25776544357083, + "learning_rate": 6.178515376181748e-10, + "loss": 0.6392, + "step": 29620 + }, + { + "epoch": 2.99, + "grad_norm": 4.690548389862812, + "learning_rate": 5.726043074977262e-10, + "loss": 0.6222, + "step": 29625 + }, + { + "epoch": 2.99, + "grad_norm": 5.384653894013326, + "learning_rate": 5.290773755689537e-10, + "loss": 0.6298, + "step": 29630 + }, + { + "epoch": 2.99, + "grad_norm": 4.766865932934473, + "learning_rate": 4.872707568093216e-10, + "loss": 0.6399, + "step": 29635 + }, + { + "epoch": 2.99, + "grad_norm": 6.264078290847827, + "learning_rate": 4.471844656050994e-10, + "loss": 0.6358, + "step": 29640 + }, + { + "epoch": 2.99, + "grad_norm": 5.789134124948843, + "learning_rate": 4.088185157491431e-10, + "loss": 0.6618, + "step": 29645 + }, + { + "epoch": 2.99, + "grad_norm": 5.066132528261486, + "learning_rate": 3.7217292044422484e-10, + "loss": 0.6361, + "step": 29650 + }, + { + "epoch": 2.99, + "grad_norm": 4.259020902234293, + "learning_rate": 3.372476922991474e-10, + "loss": 0.6293, + "step": 29655 + }, + { + "epoch": 2.99, + "grad_norm": 5.152132238576465, + "learning_rate": 3.040428433315201e-10, + "loss": 0.6497, + "step": 29660 + }, + { + "epoch": 2.99, + "grad_norm": 7.046910457321705, + "learning_rate": 2.7255838496775824e-10, + "loss": 0.6266, + "step": 29665 + }, + { + "epoch": 2.99, + "grad_norm": 4.462013754364072, + "learning_rate": 2.42794328041418e-10, + "loss": 0.6296, + "step": 29670 + }, + { + "epoch": 2.99, + "grad_norm": 4.547167003537467, + "learning_rate": 2.1475068279430688e-10, + "loss": 0.6, + "step": 29675 + }, + { + "epoch": 2.99, + "grad_norm": 4.4896715999147565, + "learning_rate": 1.884274588759283e-10, + "loss": 0.6708, + "step": 29680 + }, + { + "epoch": 2.99, + "grad_norm": 5.0500035500995555, + "learning_rate": 1.6382466534459184e-10, + "loss": 0.6508, + "step": 29685 + }, + { + "epoch": 2.99, + "grad_norm": 4.448348736657129, + "learning_rate": 1.4094231066574816e-10, + "loss": 0.6573, + "step": 29690 + }, + { + "epoch": 2.99, + "grad_norm": 4.191738441045034, + "learning_rate": 1.1978040271309887e-10, + "loss": 0.6245, + "step": 29695 + }, + { + "epoch": 2.99, + "grad_norm": 4.598097897991266, + "learning_rate": 1.0033894876859685e-10, + "loss": 0.6464, + "step": 29700 + }, + { + "epoch": 2.99, + "grad_norm": 4.963954418786347, + "learning_rate": 8.261795552189089e-11, + "loss": 0.6265, + "step": 29705 + }, + { + "epoch": 3.0, + "grad_norm": 4.68072279151534, + "learning_rate": 6.661742907088098e-11, + "loss": 0.6927, + "step": 29710 + }, + { + "epoch": 3.0, + "grad_norm": 5.287053203230731, + "learning_rate": 5.23373749217182e-11, + "loss": 0.6326, + "step": 29715 + }, + { + "epoch": 3.0, + "grad_norm": 4.513974310633034, + "learning_rate": 3.9777797987139435e-11, + "loss": 0.6449, + "step": 29720 + }, + { + "epoch": 3.0, + "grad_norm": 4.508111634160351, + "learning_rate": 2.893870258979803e-11, + "loss": 0.6469, + "step": 29725 + }, + { + "epoch": 3.0, + "grad_norm": 4.394271709957407, + "learning_rate": 1.982009245948824e-11, + "loss": 0.6508, + "step": 29730 + }, + { + "epoch": 3.0, + "grad_norm": 5.741623800409336, + "learning_rate": 1.2421970733145217e-11, + "loss": 0.6351, + "step": 29735 + }, + { + "epoch": 3.0, + "grad_norm": 4.517673242269964, + "learning_rate": 6.744339957065471e-12, + "loss": 0.6477, + "step": 29740 + }, + { + "epoch": 3.0, + "grad_norm": 4.693274488472275, + "learning_rate": 2.7872020846864135e-12, + "loss": 0.6142, + "step": 29745 + }, + { + "epoch": 3.0, + "grad_norm": 5.337435074057215, + "learning_rate": 5.505584776965833e-13, + "loss": 0.6744, + "step": 29750 + }, + { + "epoch": 3.0, + "eval_loss": 1.1189379692077637, + "eval_runtime": 25.4202, + "eval_samples_per_second": 31.707, + "eval_steps_per_second": 3.973, + "step": 29754 + }, + { + "epoch": 3.0, + "step": 29754, + "total_flos": 1024319199117312.0, + "train_loss": 1.0088004046184542, + "train_runtime": 70733.587, + "train_samples_per_second": 6.731, + "train_steps_per_second": 0.421 + } + ], + "logging_steps": 5, + "max_steps": 29754, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 1024319199117312.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}