{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999172801720573, "eval_steps": 2300, "global_step": 7555, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 9.696115493774414, "learning_rate": 4.347826086956522e-08, "loss": 1.6976, "step": 10 }, { "epoch": 0.0, "grad_norm": 9.331488609313965, "learning_rate": 8.695652173913044e-08, "loss": 1.711, "step": 20 }, { "epoch": 0.0, "grad_norm": 9.760047912597656, "learning_rate": 1.3043478260869566e-07, "loss": 1.7084, "step": 30 }, { "epoch": 0.01, "grad_norm": 8.302080154418945, "learning_rate": 1.7391304347826088e-07, "loss": 1.6865, "step": 40 }, { "epoch": 0.01, "grad_norm": 7.753058433532715, "learning_rate": 2.173913043478261e-07, "loss": 1.6633, "step": 50 }, { "epoch": 0.01, "grad_norm": 7.2695817947387695, "learning_rate": 2.608695652173913e-07, "loss": 1.654, "step": 60 }, { "epoch": 0.01, "grad_norm": 6.353436470031738, "learning_rate": 3.0434782608695656e-07, "loss": 1.6029, "step": 70 }, { "epoch": 0.01, "grad_norm": 5.489682674407959, "learning_rate": 3.4782608695652175e-07, "loss": 1.5619, "step": 80 }, { "epoch": 0.01, "grad_norm": 5.3502631187438965, "learning_rate": 3.91304347826087e-07, "loss": 1.522, "step": 90 }, { "epoch": 0.01, "grad_norm": 5.116358280181885, "learning_rate": 4.347826086956522e-07, "loss": 1.4462, "step": 100 }, { "epoch": 0.01, "grad_norm": 4.4354634284973145, "learning_rate": 4.782608695652174e-07, "loss": 1.4002, "step": 110 }, { "epoch": 0.02, "grad_norm": 5.6585693359375, "learning_rate": 5.217391304347826e-07, "loss": 1.3227, "step": 120 }, { "epoch": 0.02, "grad_norm": 4.757506847381592, "learning_rate": 5.652173913043478e-07, "loss": 1.2458, "step": 130 }, { "epoch": 0.02, "grad_norm": 4.914801120758057, "learning_rate": 6.086956521739131e-07, "loss": 1.2131, "step": 140 }, { "epoch": 0.02, "grad_norm": 9.453673362731934, "learning_rate": 6.521739130434783e-07, "loss": 1.15, "step": 150 }, { "epoch": 0.02, "grad_norm": 6.355442047119141, "learning_rate": 6.956521739130435e-07, "loss": 1.0977, "step": 160 }, { "epoch": 0.02, "grad_norm": 6.3270087242126465, "learning_rate": 7.391304347826088e-07, "loss": 1.0666, "step": 170 }, { "epoch": 0.02, "grad_norm": 6.119747638702393, "learning_rate": 7.82608695652174e-07, "loss": 1.044, "step": 180 }, { "epoch": 0.03, "grad_norm": 8.558585166931152, "learning_rate": 8.260869565217392e-07, "loss": 1.0215, "step": 190 }, { "epoch": 0.03, "grad_norm": 5.8033766746521, "learning_rate": 8.695652173913044e-07, "loss": 0.996, "step": 200 }, { "epoch": 0.03, "grad_norm": 6.511552810668945, "learning_rate": 9.130434782608697e-07, "loss": 0.9866, "step": 210 }, { "epoch": 0.03, "grad_norm": 7.597463607788086, "learning_rate": 9.565217391304349e-07, "loss": 0.9724, "step": 220 }, { "epoch": 0.03, "grad_norm": 6.662442207336426, "learning_rate": 1.0000000000000002e-06, "loss": 0.9711, "step": 230 }, { "epoch": 0.03, "grad_norm": 5.849515914916992, "learning_rate": 1.0434782608695653e-06, "loss": 0.9607, "step": 240 }, { "epoch": 0.03, "grad_norm": 6.360912322998047, "learning_rate": 1.0869565217391306e-06, "loss": 0.946, "step": 250 }, { "epoch": 0.03, "grad_norm": 4.921197891235352, "learning_rate": 1.1304347826086956e-06, "loss": 0.9355, "step": 260 }, { "epoch": 0.04, "grad_norm": 4.507955074310303, "learning_rate": 1.173913043478261e-06, "loss": 0.9248, "step": 270 }, { "epoch": 0.04, "grad_norm": 4.760683059692383, "learning_rate": 1.2173913043478262e-06, "loss": 0.9315, "step": 280 }, { "epoch": 0.04, "grad_norm": 5.035203456878662, "learning_rate": 1.2608695652173913e-06, "loss": 0.9209, "step": 290 }, { "epoch": 0.04, "grad_norm": 4.834679126739502, "learning_rate": 1.3043478260869566e-06, "loss": 0.9104, "step": 300 }, { "epoch": 0.04, "grad_norm": 3.9745352268218994, "learning_rate": 1.347826086956522e-06, "loss": 0.9025, "step": 310 }, { "epoch": 0.04, "grad_norm": 3.602553606033325, "learning_rate": 1.391304347826087e-06, "loss": 0.8954, "step": 320 }, { "epoch": 0.04, "grad_norm": 3.429502487182617, "learning_rate": 1.4347826086956523e-06, "loss": 0.889, "step": 330 }, { "epoch": 0.04, "grad_norm": 4.117110252380371, "learning_rate": 1.4782608695652176e-06, "loss": 0.8732, "step": 340 }, { "epoch": 0.05, "grad_norm": 3.424821615219116, "learning_rate": 1.521739130434783e-06, "loss": 0.8819, "step": 350 }, { "epoch": 0.05, "grad_norm": 4.870906829833984, "learning_rate": 1.565217391304348e-06, "loss": 0.8829, "step": 360 }, { "epoch": 0.05, "grad_norm": 6.471639156341553, "learning_rate": 1.608695652173913e-06, "loss": 0.8632, "step": 370 }, { "epoch": 0.05, "grad_norm": 4.399796962738037, "learning_rate": 1.6521739130434784e-06, "loss": 0.8543, "step": 380 }, { "epoch": 0.05, "grad_norm": 3.4351227283477783, "learning_rate": 1.6956521739130435e-06, "loss": 0.847, "step": 390 }, { "epoch": 0.05, "grad_norm": 3.1236326694488525, "learning_rate": 1.7391304347826088e-06, "loss": 0.8445, "step": 400 }, { "epoch": 0.05, "grad_norm": 3.5134594440460205, "learning_rate": 1.782608695652174e-06, "loss": 0.8377, "step": 410 }, { "epoch": 0.06, "grad_norm": 3.1803650856018066, "learning_rate": 1.8260869565217394e-06, "loss": 0.835, "step": 420 }, { "epoch": 0.06, "grad_norm": 5.259217739105225, "learning_rate": 1.8695652173913044e-06, "loss": 0.8368, "step": 430 }, { "epoch": 0.06, "grad_norm": 4.378271579742432, "learning_rate": 1.9130434782608697e-06, "loss": 0.8377, "step": 440 }, { "epoch": 0.06, "grad_norm": 3.6259055137634277, "learning_rate": 1.956521739130435e-06, "loss": 0.8383, "step": 450 }, { "epoch": 0.06, "grad_norm": 4.089799404144287, "learning_rate": 2.0000000000000003e-06, "loss": 0.8309, "step": 460 }, { "epoch": 0.06, "grad_norm": 3.041405200958252, "learning_rate": 2.0434782608695656e-06, "loss": 0.8178, "step": 470 }, { "epoch": 0.06, "grad_norm": 3.9156289100646973, "learning_rate": 2.0869565217391305e-06, "loss": 0.8091, "step": 480 }, { "epoch": 0.06, "grad_norm": 3.1488077640533447, "learning_rate": 2.130434782608696e-06, "loss": 0.8287, "step": 490 }, { "epoch": 0.07, "grad_norm": 3.1481711864471436, "learning_rate": 2.173913043478261e-06, "loss": 0.8109, "step": 500 }, { "epoch": 0.07, "grad_norm": 3.6406595706939697, "learning_rate": 2.2173913043478264e-06, "loss": 0.8071, "step": 510 }, { "epoch": 0.07, "grad_norm": 3.674004554748535, "learning_rate": 2.2608695652173913e-06, "loss": 0.8033, "step": 520 }, { "epoch": 0.07, "grad_norm": 2.566720485687256, "learning_rate": 2.3043478260869566e-06, "loss": 0.8138, "step": 530 }, { "epoch": 0.07, "grad_norm": 3.0154435634613037, "learning_rate": 2.347826086956522e-06, "loss": 0.8147, "step": 540 }, { "epoch": 0.07, "grad_norm": 3.2751858234405518, "learning_rate": 2.391304347826087e-06, "loss": 0.7845, "step": 550 }, { "epoch": 0.07, "grad_norm": 5.122045993804932, "learning_rate": 2.4347826086956525e-06, "loss": 0.7984, "step": 560 }, { "epoch": 0.08, "grad_norm": 4.417994499206543, "learning_rate": 2.4782608695652178e-06, "loss": 0.7981, "step": 570 }, { "epoch": 0.08, "grad_norm": 2.893526315689087, "learning_rate": 2.5217391304347826e-06, "loss": 0.799, "step": 580 }, { "epoch": 0.08, "grad_norm": 3.6591286659240723, "learning_rate": 2.5652173913043484e-06, "loss": 0.7965, "step": 590 }, { "epoch": 0.08, "grad_norm": 2.970930337905884, "learning_rate": 2.6086956521739132e-06, "loss": 0.8057, "step": 600 }, { "epoch": 0.08, "grad_norm": 2.9321494102478027, "learning_rate": 2.6521739130434785e-06, "loss": 0.7862, "step": 610 }, { "epoch": 0.08, "grad_norm": 3.5950493812561035, "learning_rate": 2.695652173913044e-06, "loss": 0.7844, "step": 620 }, { "epoch": 0.08, "grad_norm": 2.530301332473755, "learning_rate": 2.7391304347826087e-06, "loss": 0.7863, "step": 630 }, { "epoch": 0.08, "grad_norm": 2.930530548095703, "learning_rate": 2.782608695652174e-06, "loss": 0.7914, "step": 640 }, { "epoch": 0.09, "grad_norm": 2.9592342376708984, "learning_rate": 2.8260869565217393e-06, "loss": 0.7738, "step": 650 }, { "epoch": 0.09, "grad_norm": 2.425995349884033, "learning_rate": 2.8695652173913046e-06, "loss": 0.7941, "step": 660 }, { "epoch": 0.09, "grad_norm": 2.796645402908325, "learning_rate": 2.9130434782608695e-06, "loss": 0.7969, "step": 670 }, { "epoch": 0.09, "grad_norm": 2.917015314102173, "learning_rate": 2.956521739130435e-06, "loss": 0.7803, "step": 680 }, { "epoch": 0.09, "grad_norm": 2.644934892654419, "learning_rate": 3e-06, "loss": 0.7925, "step": 690 }, { "epoch": 0.09, "grad_norm": 3.2409515380859375, "learning_rate": 3.043478260869566e-06, "loss": 0.7863, "step": 700 }, { "epoch": 0.09, "grad_norm": 2.5315630435943604, "learning_rate": 3.0869565217391307e-06, "loss": 0.7798, "step": 710 }, { "epoch": 0.1, "grad_norm": 2.755002498626709, "learning_rate": 3.130434782608696e-06, "loss": 0.7736, "step": 720 }, { "epoch": 0.1, "grad_norm": 2.5441417694091797, "learning_rate": 3.1739130434782613e-06, "loss": 0.7653, "step": 730 }, { "epoch": 0.1, "grad_norm": 2.50203537940979, "learning_rate": 3.217391304347826e-06, "loss": 0.7751, "step": 740 }, { "epoch": 0.1, "grad_norm": 2.8558547496795654, "learning_rate": 3.2608695652173914e-06, "loss": 0.7832, "step": 750 }, { "epoch": 0.1, "grad_norm": 2.7601280212402344, "learning_rate": 3.3043478260869567e-06, "loss": 0.7865, "step": 760 }, { "epoch": 0.1, "grad_norm": 4.165918827056885, "learning_rate": 3.347826086956522e-06, "loss": 0.7804, "step": 770 }, { "epoch": 0.1, "grad_norm": 3.20556378364563, "learning_rate": 3.391304347826087e-06, "loss": 0.7787, "step": 780 }, { "epoch": 0.1, "grad_norm": 2.259490489959717, "learning_rate": 3.4347826086956526e-06, "loss": 0.7776, "step": 790 }, { "epoch": 0.11, "grad_norm": 2.482300281524658, "learning_rate": 3.4782608695652175e-06, "loss": 0.7909, "step": 800 }, { "epoch": 0.11, "grad_norm": 2.6870365142822266, "learning_rate": 3.5217391304347832e-06, "loss": 0.7747, "step": 810 }, { "epoch": 0.11, "grad_norm": 2.509525775909424, "learning_rate": 3.565217391304348e-06, "loss": 0.7675, "step": 820 }, { "epoch": 0.11, "grad_norm": 2.8357813358306885, "learning_rate": 3.6086956521739134e-06, "loss": 0.7525, "step": 830 }, { "epoch": 0.11, "grad_norm": 3.0093586444854736, "learning_rate": 3.6521739130434787e-06, "loss": 0.7871, "step": 840 }, { "epoch": 0.11, "grad_norm": 2.5198683738708496, "learning_rate": 3.6956521739130436e-06, "loss": 0.7602, "step": 850 }, { "epoch": 0.11, "grad_norm": 2.4067280292510986, "learning_rate": 3.739130434782609e-06, "loss": 0.7709, "step": 860 }, { "epoch": 0.12, "grad_norm": 2.968722343444824, "learning_rate": 3.782608695652174e-06, "loss": 0.7638, "step": 870 }, { "epoch": 0.12, "grad_norm": 2.5267333984375, "learning_rate": 3.8260869565217395e-06, "loss": 0.7618, "step": 880 }, { "epoch": 0.12, "grad_norm": 2.519435405731201, "learning_rate": 3.869565217391304e-06, "loss": 0.7862, "step": 890 }, { "epoch": 0.12, "grad_norm": 2.373142957687378, "learning_rate": 3.91304347826087e-06, "loss": 0.7785, "step": 900 }, { "epoch": 0.12, "grad_norm": 2.939995288848877, "learning_rate": 3.956521739130435e-06, "loss": 0.7752, "step": 910 }, { "epoch": 0.12, "grad_norm": 2.4909372329711914, "learning_rate": 4.000000000000001e-06, "loss": 0.779, "step": 920 }, { "epoch": 0.12, "grad_norm": 2.6996891498565674, "learning_rate": 4.0434782608695655e-06, "loss": 0.7735, "step": 930 }, { "epoch": 0.12, "grad_norm": 2.628506660461426, "learning_rate": 4.086956521739131e-06, "loss": 0.7551, "step": 940 }, { "epoch": 0.13, "grad_norm": 2.350477457046509, "learning_rate": 4.130434782608696e-06, "loss": 0.7685, "step": 950 }, { "epoch": 0.13, "grad_norm": 2.2088937759399414, "learning_rate": 4.173913043478261e-06, "loss": 0.7714, "step": 960 }, { "epoch": 0.13, "grad_norm": 3.055957555770874, "learning_rate": 4.217391304347827e-06, "loss": 0.7633, "step": 970 }, { "epoch": 0.13, "grad_norm": 2.987377882003784, "learning_rate": 4.260869565217392e-06, "loss": 0.7639, "step": 980 }, { "epoch": 0.13, "grad_norm": 2.8698835372924805, "learning_rate": 4.304347826086957e-06, "loss": 0.7748, "step": 990 }, { "epoch": 0.13, "grad_norm": 2.728653907775879, "learning_rate": 4.347826086956522e-06, "loss": 0.7531, "step": 1000 }, { "epoch": 0.13, "grad_norm": 3.0617196559906006, "learning_rate": 4.391304347826087e-06, "loss": 0.7637, "step": 1010 }, { "epoch": 0.13, "grad_norm": 9.645702362060547, "learning_rate": 4.434782608695653e-06, "loss": 0.7734, "step": 1020 }, { "epoch": 0.14, "grad_norm": 3.174217462539673, "learning_rate": 4.478260869565218e-06, "loss": 0.7637, "step": 1030 }, { "epoch": 0.14, "grad_norm": 2.565565586090088, "learning_rate": 4.5217391304347826e-06, "loss": 0.7616, "step": 1040 }, { "epoch": 0.14, "grad_norm": 2.3000173568725586, "learning_rate": 4.565217391304348e-06, "loss": 0.7605, "step": 1050 }, { "epoch": 0.14, "grad_norm": 2.2204582691192627, "learning_rate": 4.608695652173913e-06, "loss": 0.7613, "step": 1060 }, { "epoch": 0.14, "grad_norm": 2.5566813945770264, "learning_rate": 4.652173913043478e-06, "loss": 0.745, "step": 1070 }, { "epoch": 0.14, "grad_norm": 2.7924296855926514, "learning_rate": 4.695652173913044e-06, "loss": 0.7643, "step": 1080 }, { "epoch": 0.14, "grad_norm": 2.78627610206604, "learning_rate": 4.739130434782609e-06, "loss": 0.765, "step": 1090 }, { "epoch": 0.15, "grad_norm": 2.9410696029663086, "learning_rate": 4.782608695652174e-06, "loss": 0.7534, "step": 1100 }, { "epoch": 0.15, "grad_norm": 2.6935067176818848, "learning_rate": 4.826086956521739e-06, "loss": 0.7501, "step": 1110 }, { "epoch": 0.15, "grad_norm": 2.8043696880340576, "learning_rate": 4.869565217391305e-06, "loss": 0.7576, "step": 1120 }, { "epoch": 0.15, "grad_norm": 2.8394672870635986, "learning_rate": 4.91304347826087e-06, "loss": 0.7536, "step": 1130 }, { "epoch": 0.15, "grad_norm": 2.5857579708099365, "learning_rate": 4.9565217391304355e-06, "loss": 0.7545, "step": 1140 }, { "epoch": 0.15, "grad_norm": 2.707064151763916, "learning_rate": 5e-06, "loss": 0.7686, "step": 1150 }, { "epoch": 0.15, "grad_norm": 2.7348179817199707, "learning_rate": 5.043478260869565e-06, "loss": 0.7594, "step": 1160 }, { "epoch": 0.15, "grad_norm": 2.8637123107910156, "learning_rate": 5.08695652173913e-06, "loss": 0.7719, "step": 1170 }, { "epoch": 0.16, "grad_norm": 2.605658769607544, "learning_rate": 5.130434782608697e-06, "loss": 0.7609, "step": 1180 }, { "epoch": 0.16, "grad_norm": 2.538159132003784, "learning_rate": 5.173913043478262e-06, "loss": 0.7447, "step": 1190 }, { "epoch": 0.16, "grad_norm": 2.6438486576080322, "learning_rate": 5.2173913043478265e-06, "loss": 0.7474, "step": 1200 }, { "epoch": 0.16, "grad_norm": 2.5470008850097656, "learning_rate": 5.260869565217391e-06, "loss": 0.755, "step": 1210 }, { "epoch": 0.16, "grad_norm": 2.9301180839538574, "learning_rate": 5.304347826086957e-06, "loss": 0.7507, "step": 1220 }, { "epoch": 0.16, "grad_norm": 2.509558916091919, "learning_rate": 5.347826086956523e-06, "loss": 0.7568, "step": 1230 }, { "epoch": 0.16, "grad_norm": 2.386697292327881, "learning_rate": 5.391304347826088e-06, "loss": 0.7615, "step": 1240 }, { "epoch": 0.17, "grad_norm": 2.8356692790985107, "learning_rate": 5.4347826086956525e-06, "loss": 0.7498, "step": 1250 }, { "epoch": 0.17, "grad_norm": 2.812669038772583, "learning_rate": 5.478260869565217e-06, "loss": 0.7467, "step": 1260 }, { "epoch": 0.17, "grad_norm": 2.591529369354248, "learning_rate": 5.521739130434783e-06, "loss": 0.7537, "step": 1270 }, { "epoch": 0.17, "grad_norm": 2.7662971019744873, "learning_rate": 5.565217391304348e-06, "loss": 0.7508, "step": 1280 }, { "epoch": 0.17, "grad_norm": 2.4197189807891846, "learning_rate": 5.608695652173914e-06, "loss": 0.7415, "step": 1290 }, { "epoch": 0.17, "grad_norm": 2.972205877304077, "learning_rate": 5.652173913043479e-06, "loss": 0.7587, "step": 1300 }, { "epoch": 0.17, "grad_norm": 3.1979939937591553, "learning_rate": 5.695652173913044e-06, "loss": 0.7369, "step": 1310 }, { "epoch": 0.17, "grad_norm": 2.4516711235046387, "learning_rate": 5.739130434782609e-06, "loss": 0.7438, "step": 1320 }, { "epoch": 0.18, "grad_norm": 2.620466470718384, "learning_rate": 5.782608695652174e-06, "loss": 0.7465, "step": 1330 }, { "epoch": 0.18, "grad_norm": 2.7041525840759277, "learning_rate": 5.826086956521739e-06, "loss": 0.7522, "step": 1340 }, { "epoch": 0.18, "grad_norm": 2.6662778854370117, "learning_rate": 5.8695652173913055e-06, "loss": 0.7567, "step": 1350 }, { "epoch": 0.18, "grad_norm": 5.283855438232422, "learning_rate": 5.91304347826087e-06, "loss": 0.7307, "step": 1360 }, { "epoch": 0.18, "grad_norm": 2.551743745803833, "learning_rate": 5.956521739130435e-06, "loss": 0.7466, "step": 1370 }, { "epoch": 0.18, "grad_norm": 3.212951898574829, "learning_rate": 6e-06, "loss": 0.752, "step": 1380 }, { "epoch": 0.18, "grad_norm": 2.417921304702759, "learning_rate": 6.043478260869565e-06, "loss": 0.7531, "step": 1390 }, { "epoch": 0.19, "grad_norm": 2.751988649368286, "learning_rate": 6.086956521739132e-06, "loss": 0.7393, "step": 1400 }, { "epoch": 0.19, "grad_norm": 2.8188741207122803, "learning_rate": 6.1304347826086965e-06, "loss": 0.748, "step": 1410 }, { "epoch": 0.19, "grad_norm": 2.727118730545044, "learning_rate": 6.173913043478261e-06, "loss": 0.7404, "step": 1420 }, { "epoch": 0.19, "grad_norm": 2.9528021812438965, "learning_rate": 6.217391304347826e-06, "loss": 0.7476, "step": 1430 }, { "epoch": 0.19, "grad_norm": 3.184258460998535, "learning_rate": 6.260869565217392e-06, "loss": 0.7439, "step": 1440 }, { "epoch": 0.19, "grad_norm": 3.2081708908081055, "learning_rate": 6.304347826086958e-06, "loss": 0.7328, "step": 1450 }, { "epoch": 0.19, "grad_norm": 2.437472343444824, "learning_rate": 6.3478260869565225e-06, "loss": 0.7447, "step": 1460 }, { "epoch": 0.19, "grad_norm": 2.4201667308807373, "learning_rate": 6.391304347826087e-06, "loss": 0.7333, "step": 1470 }, { "epoch": 0.2, "grad_norm": 3.096134901046753, "learning_rate": 6.434782608695652e-06, "loss": 0.7592, "step": 1480 }, { "epoch": 0.2, "grad_norm": 2.744535446166992, "learning_rate": 6.478260869565218e-06, "loss": 0.7469, "step": 1490 }, { "epoch": 0.2, "grad_norm": 2.768773317337036, "learning_rate": 6.521739130434783e-06, "loss": 0.7434, "step": 1500 }, { "epoch": 0.2, "grad_norm": 3.7912373542785645, "learning_rate": 6.565217391304349e-06, "loss": 0.7597, "step": 1510 }, { "epoch": 0.2, "grad_norm": 3.1697614192962646, "learning_rate": 6.6086956521739135e-06, "loss": 0.7484, "step": 1520 }, { "epoch": 0.2, "grad_norm": 3.172487735748291, "learning_rate": 6.652173913043479e-06, "loss": 0.7327, "step": 1530 }, { "epoch": 0.2, "grad_norm": 2.5283539295196533, "learning_rate": 6.695652173913044e-06, "loss": 0.743, "step": 1540 }, { "epoch": 0.21, "grad_norm": 3.1751644611358643, "learning_rate": 6.739130434782609e-06, "loss": 0.723, "step": 1550 }, { "epoch": 0.21, "grad_norm": 2.524111747741699, "learning_rate": 6.782608695652174e-06, "loss": 0.7248, "step": 1560 }, { "epoch": 0.21, "grad_norm": 5.5174455642700195, "learning_rate": 6.8260869565217395e-06, "loss": 0.7399, "step": 1570 }, { "epoch": 0.21, "grad_norm": 2.582502841949463, "learning_rate": 6.869565217391305e-06, "loss": 0.7428, "step": 1580 }, { "epoch": 0.21, "grad_norm": 2.751222848892212, "learning_rate": 6.91304347826087e-06, "loss": 0.7353, "step": 1590 }, { "epoch": 0.21, "grad_norm": 2.983644485473633, "learning_rate": 6.956521739130435e-06, "loss": 0.753, "step": 1600 }, { "epoch": 0.21, "grad_norm": 2.416503667831421, "learning_rate": 7e-06, "loss": 0.7323, "step": 1610 }, { "epoch": 0.21, "grad_norm": 2.5844953060150146, "learning_rate": 7.0434782608695665e-06, "loss": 0.7514, "step": 1620 }, { "epoch": 0.22, "grad_norm": 2.449826717376709, "learning_rate": 7.086956521739131e-06, "loss": 0.7514, "step": 1630 }, { "epoch": 0.22, "grad_norm": 2.574061393737793, "learning_rate": 7.130434782608696e-06, "loss": 0.7418, "step": 1640 }, { "epoch": 0.22, "grad_norm": 2.707425355911255, "learning_rate": 7.173913043478261e-06, "loss": 0.7402, "step": 1650 }, { "epoch": 0.22, "grad_norm": 2.7220213413238525, "learning_rate": 7.217391304347827e-06, "loss": 0.7501, "step": 1660 }, { "epoch": 0.22, "grad_norm": 2.730178117752075, "learning_rate": 7.2608695652173925e-06, "loss": 0.7351, "step": 1670 }, { "epoch": 0.22, "grad_norm": 2.536191940307617, "learning_rate": 7.304347826086957e-06, "loss": 0.7362, "step": 1680 }, { "epoch": 0.22, "grad_norm": 2.93157958984375, "learning_rate": 7.347826086956522e-06, "loss": 0.7369, "step": 1690 }, { "epoch": 0.22, "grad_norm": 2.8212029933929443, "learning_rate": 7.391304347826087e-06, "loss": 0.7325, "step": 1700 }, { "epoch": 0.23, "grad_norm": 3.0014121532440186, "learning_rate": 7.434782608695653e-06, "loss": 0.7383, "step": 1710 }, { "epoch": 0.23, "grad_norm": 3.5077619552612305, "learning_rate": 7.478260869565218e-06, "loss": 0.7317, "step": 1720 }, { "epoch": 0.23, "grad_norm": 2.7584381103515625, "learning_rate": 7.5217391304347835e-06, "loss": 0.7214, "step": 1730 }, { "epoch": 0.23, "grad_norm": 3.4156510829925537, "learning_rate": 7.565217391304348e-06, "loss": 0.7367, "step": 1740 }, { "epoch": 0.23, "grad_norm": 3.4717941284179688, "learning_rate": 7.608695652173914e-06, "loss": 0.7234, "step": 1750 }, { "epoch": 0.23, "grad_norm": 2.6128644943237305, "learning_rate": 7.652173913043479e-06, "loss": 0.7486, "step": 1760 }, { "epoch": 0.23, "grad_norm": 2.3647897243499756, "learning_rate": 7.695652173913044e-06, "loss": 0.7186, "step": 1770 }, { "epoch": 0.24, "grad_norm": 2.6185524463653564, "learning_rate": 7.739130434782609e-06, "loss": 0.7405, "step": 1780 }, { "epoch": 0.24, "grad_norm": 3.2258949279785156, "learning_rate": 7.782608695652174e-06, "loss": 0.7399, "step": 1790 }, { "epoch": 0.24, "grad_norm": 3.954819679260254, "learning_rate": 7.82608695652174e-06, "loss": 0.7277, "step": 1800 }, { "epoch": 0.24, "grad_norm": 3.0589473247528076, "learning_rate": 7.869565217391305e-06, "loss": 0.7282, "step": 1810 }, { "epoch": 0.24, "grad_norm": 2.6480607986450195, "learning_rate": 7.91304347826087e-06, "loss": 0.7262, "step": 1820 }, { "epoch": 0.24, "grad_norm": 2.735381603240967, "learning_rate": 7.956521739130435e-06, "loss": 0.7216, "step": 1830 }, { "epoch": 0.24, "grad_norm": 5.60382080078125, "learning_rate": 8.000000000000001e-06, "loss": 0.7275, "step": 1840 }, { "epoch": 0.24, "grad_norm": 2.710845947265625, "learning_rate": 8.043478260869566e-06, "loss": 0.739, "step": 1850 }, { "epoch": 0.25, "grad_norm": 2.4441914558410645, "learning_rate": 8.086956521739131e-06, "loss": 0.7145, "step": 1860 }, { "epoch": 0.25, "grad_norm": 2.7932469844818115, "learning_rate": 8.130434782608696e-06, "loss": 0.7226, "step": 1870 }, { "epoch": 0.25, "grad_norm": 2.782019853591919, "learning_rate": 8.173913043478263e-06, "loss": 0.7139, "step": 1880 }, { "epoch": 0.25, "grad_norm": 3.049837350845337, "learning_rate": 8.217391304347827e-06, "loss": 0.7393, "step": 1890 }, { "epoch": 0.25, "grad_norm": 2.894196033477783, "learning_rate": 8.260869565217392e-06, "loss": 0.7163, "step": 1900 }, { "epoch": 0.25, "grad_norm": 2.4531071186065674, "learning_rate": 8.304347826086957e-06, "loss": 0.7204, "step": 1910 }, { "epoch": 0.25, "grad_norm": 3.320891857147217, "learning_rate": 8.347826086956522e-06, "loss": 0.7248, "step": 1920 }, { "epoch": 0.26, "grad_norm": 3.3719470500946045, "learning_rate": 8.391304347826089e-06, "loss": 0.7315, "step": 1930 }, { "epoch": 0.26, "grad_norm": 2.7417898178100586, "learning_rate": 8.434782608695653e-06, "loss": 0.7256, "step": 1940 }, { "epoch": 0.26, "grad_norm": 2.6355440616607666, "learning_rate": 8.478260869565218e-06, "loss": 0.7305, "step": 1950 }, { "epoch": 0.26, "grad_norm": 2.6812551021575928, "learning_rate": 8.521739130434783e-06, "loss": 0.7157, "step": 1960 }, { "epoch": 0.26, "grad_norm": 3.1449575424194336, "learning_rate": 8.56521739130435e-06, "loss": 0.7186, "step": 1970 }, { "epoch": 0.26, "grad_norm": 4.587336540222168, "learning_rate": 8.608695652173915e-06, "loss": 0.7241, "step": 1980 }, { "epoch": 0.26, "grad_norm": 3.474202871322632, "learning_rate": 8.65217391304348e-06, "loss": 0.7318, "step": 1990 }, { "epoch": 0.26, "grad_norm": 4.36326789855957, "learning_rate": 8.695652173913044e-06, "loss": 0.7336, "step": 2000 }, { "epoch": 0.27, "grad_norm": 2.8643243312835693, "learning_rate": 8.73913043478261e-06, "loss": 0.7282, "step": 2010 }, { "epoch": 0.27, "grad_norm": 2.8812708854675293, "learning_rate": 8.782608695652174e-06, "loss": 0.7158, "step": 2020 }, { "epoch": 0.27, "grad_norm": 2.9906275272369385, "learning_rate": 8.82608695652174e-06, "loss": 0.7163, "step": 2030 }, { "epoch": 0.27, "grad_norm": 2.6248161792755127, "learning_rate": 8.869565217391306e-06, "loss": 0.736, "step": 2040 }, { "epoch": 0.27, "grad_norm": 2.564918041229248, "learning_rate": 8.91304347826087e-06, "loss": 0.733, "step": 2050 }, { "epoch": 0.27, "grad_norm": 2.4388699531555176, "learning_rate": 8.956521739130435e-06, "loss": 0.7192, "step": 2060 }, { "epoch": 0.27, "grad_norm": 2.6738839149475098, "learning_rate": 9e-06, "loss": 0.7107, "step": 2070 }, { "epoch": 0.28, "grad_norm": 4.980138778686523, "learning_rate": 9.043478260869565e-06, "loss": 0.716, "step": 2080 }, { "epoch": 0.28, "grad_norm": 2.9591591358184814, "learning_rate": 9.086956521739132e-06, "loss": 0.7197, "step": 2090 }, { "epoch": 0.28, "grad_norm": 2.6318516731262207, "learning_rate": 9.130434782608697e-06, "loss": 0.7179, "step": 2100 }, { "epoch": 0.28, "grad_norm": 2.6253883838653564, "learning_rate": 9.173913043478261e-06, "loss": 0.7184, "step": 2110 }, { "epoch": 0.28, "grad_norm": 2.5605149269104004, "learning_rate": 9.217391304347826e-06, "loss": 0.7305, "step": 2120 }, { "epoch": 0.28, "grad_norm": 4.129536151885986, "learning_rate": 9.260869565217391e-06, "loss": 0.7088, "step": 2130 }, { "epoch": 0.28, "grad_norm": 2.9836220741271973, "learning_rate": 9.304347826086956e-06, "loss": 0.7239, "step": 2140 }, { "epoch": 0.28, "grad_norm": 2.778383731842041, "learning_rate": 9.347826086956523e-06, "loss": 0.7067, "step": 2150 }, { "epoch": 0.29, "grad_norm": 2.8585681915283203, "learning_rate": 9.391304347826087e-06, "loss": 0.7085, "step": 2160 }, { "epoch": 0.29, "grad_norm": 2.595531940460205, "learning_rate": 9.434782608695652e-06, "loss": 0.7101, "step": 2170 }, { "epoch": 0.29, "grad_norm": 3.7232606410980225, "learning_rate": 9.478260869565217e-06, "loss": 0.7333, "step": 2180 }, { "epoch": 0.29, "grad_norm": 2.381574869155884, "learning_rate": 9.521739130434784e-06, "loss": 0.7236, "step": 2190 }, { "epoch": 0.29, "grad_norm": 3.042024612426758, "learning_rate": 9.565217391304349e-06, "loss": 0.7261, "step": 2200 }, { "epoch": 0.29, "grad_norm": 2.2856943607330322, "learning_rate": 9.608695652173914e-06, "loss": 0.7021, "step": 2210 }, { "epoch": 0.29, "grad_norm": 3.454638719558716, "learning_rate": 9.652173913043478e-06, "loss": 0.711, "step": 2220 }, { "epoch": 0.3, "grad_norm": 2.605741500854492, "learning_rate": 9.695652173913043e-06, "loss": 0.7078, "step": 2230 }, { "epoch": 0.3, "grad_norm": 3.4367051124572754, "learning_rate": 9.73913043478261e-06, "loss": 0.7208, "step": 2240 }, { "epoch": 0.3, "grad_norm": 2.4078874588012695, "learning_rate": 9.782608695652175e-06, "loss": 0.7148, "step": 2250 }, { "epoch": 0.3, "grad_norm": 2.6727590560913086, "learning_rate": 9.82608695652174e-06, "loss": 0.7276, "step": 2260 }, { "epoch": 0.3, "grad_norm": 3.0248544216156006, "learning_rate": 9.869565217391304e-06, "loss": 0.7133, "step": 2270 }, { "epoch": 0.3, "grad_norm": 2.7144577503204346, "learning_rate": 9.913043478260871e-06, "loss": 0.7101, "step": 2280 }, { "epoch": 0.3, "grad_norm": 2.7497832775115967, "learning_rate": 9.956521739130436e-06, "loss": 0.692, "step": 2290 }, { "epoch": 0.3, "grad_norm": 2.990417957305908, "learning_rate": 1e-05, "loss": 0.6961, "step": 2300 }, { "epoch": 0.3, "eval_loss": 0.7328751683235168, "eval_runtime": 199.3608, "eval_samples_per_second": 55.176, "eval_steps_per_second": 6.897, "step": 2300 }, { "epoch": 0.31, "grad_norm": 2.8692731857299805, "learning_rate": 9.999994241637783e-06, "loss": 0.715, "step": 2310 }, { "epoch": 0.31, "grad_norm": 2.5786540508270264, "learning_rate": 9.999976966564394e-06, "loss": 0.7163, "step": 2320 }, { "epoch": 0.31, "grad_norm": 3.327714443206787, "learning_rate": 9.999948174819623e-06, "loss": 0.7135, "step": 2330 }, { "epoch": 0.31, "grad_norm": 2.9145894050598145, "learning_rate": 9.999907866469787e-06, "loss": 0.7054, "step": 2340 }, { "epoch": 0.31, "grad_norm": 2.5164294242858887, "learning_rate": 9.999856041607732e-06, "loss": 0.7149, "step": 2350 }, { "epoch": 0.31, "grad_norm": 3.212944984436035, "learning_rate": 9.999792700352826e-06, "loss": 0.7022, "step": 2360 }, { "epoch": 0.31, "grad_norm": 2.477055072784424, "learning_rate": 9.99971784285097e-06, "loss": 0.7057, "step": 2370 }, { "epoch": 0.31, "grad_norm": 2.563532590866089, "learning_rate": 9.99963146927458e-06, "loss": 0.7117, "step": 2380 }, { "epoch": 0.32, "grad_norm": 3.0468506813049316, "learning_rate": 9.999533579822611e-06, "loss": 0.7152, "step": 2390 }, { "epoch": 0.32, "grad_norm": 2.904538154602051, "learning_rate": 9.99942417472053e-06, "loss": 0.72, "step": 2400 }, { "epoch": 0.32, "grad_norm": 2.625180244445801, "learning_rate": 9.999303254220342e-06, "loss": 0.7097, "step": 2410 }, { "epoch": 0.32, "grad_norm": 2.9058775901794434, "learning_rate": 9.999170818600562e-06, "loss": 0.7254, "step": 2420 }, { "epoch": 0.32, "grad_norm": 3.0165164470672607, "learning_rate": 9.999026868166238e-06, "loss": 0.7132, "step": 2430 }, { "epoch": 0.32, "grad_norm": 4.671907424926758, "learning_rate": 9.998871403248936e-06, "loss": 0.7191, "step": 2440 }, { "epoch": 0.32, "grad_norm": 2.6668975353240967, "learning_rate": 9.998704424206747e-06, "loss": 0.7066, "step": 2450 }, { "epoch": 0.33, "grad_norm": 3.66217041015625, "learning_rate": 9.998525931424279e-06, "loss": 0.6917, "step": 2460 }, { "epoch": 0.33, "grad_norm": 2.8258302211761475, "learning_rate": 9.998335925312666e-06, "loss": 0.6889, "step": 2470 }, { "epoch": 0.33, "grad_norm": 2.8596441745758057, "learning_rate": 9.998134406309555e-06, "loss": 0.6997, "step": 2480 }, { "epoch": 0.33, "grad_norm": 2.9443325996398926, "learning_rate": 9.997921374879112e-06, "loss": 0.7082, "step": 2490 }, { "epoch": 0.33, "grad_norm": 4.481228828430176, "learning_rate": 9.997696831512027e-06, "loss": 0.7007, "step": 2500 }, { "epoch": 0.33, "grad_norm": 3.7334978580474854, "learning_rate": 9.997460776725497e-06, "loss": 0.708, "step": 2510 }, { "epoch": 0.33, "grad_norm": 2.843071937561035, "learning_rate": 9.997213211063236e-06, "loss": 0.7201, "step": 2520 }, { "epoch": 0.33, "grad_norm": 3.9357028007507324, "learning_rate": 9.99695413509548e-06, "loss": 0.7152, "step": 2530 }, { "epoch": 0.34, "grad_norm": 2.709559440612793, "learning_rate": 9.996683549418964e-06, "loss": 0.7071, "step": 2540 }, { "epoch": 0.34, "grad_norm": 3.1510403156280518, "learning_rate": 9.996401454656941e-06, "loss": 0.6963, "step": 2550 }, { "epoch": 0.34, "grad_norm": 2.642859697341919, "learning_rate": 9.996107851459175e-06, "loss": 0.7107, "step": 2560 }, { "epoch": 0.34, "grad_norm": 3.1064438819885254, "learning_rate": 9.995802740501933e-06, "loss": 0.7045, "step": 2570 }, { "epoch": 0.34, "grad_norm": 2.6383016109466553, "learning_rate": 9.995486122487992e-06, "loss": 0.6912, "step": 2580 }, { "epoch": 0.34, "grad_norm": 3.1221187114715576, "learning_rate": 9.995157998146633e-06, "loss": 0.7, "step": 2590 }, { "epoch": 0.34, "grad_norm": 2.3788633346557617, "learning_rate": 9.994818368233639e-06, "loss": 0.7152, "step": 2600 }, { "epoch": 0.35, "grad_norm": 2.5052289962768555, "learning_rate": 9.994467233531294e-06, "loss": 0.7041, "step": 2610 }, { "epoch": 0.35, "grad_norm": 2.9792628288269043, "learning_rate": 9.994104594848383e-06, "loss": 0.707, "step": 2620 }, { "epoch": 0.35, "grad_norm": 2.6907389163970947, "learning_rate": 9.993730453020187e-06, "loss": 0.6965, "step": 2630 }, { "epoch": 0.35, "grad_norm": 2.660902738571167, "learning_rate": 9.993344808908486e-06, "loss": 0.6978, "step": 2640 }, { "epoch": 0.35, "grad_norm": 2.5266551971435547, "learning_rate": 9.992947663401548e-06, "loss": 0.6938, "step": 2650 }, { "epoch": 0.35, "grad_norm": 2.86554217338562, "learning_rate": 9.99253901741414e-06, "loss": 0.7002, "step": 2660 }, { "epoch": 0.35, "grad_norm": 3.3354568481445312, "learning_rate": 9.992118871887513e-06, "loss": 0.7191, "step": 2670 }, { "epoch": 0.35, "grad_norm": 2.7831523418426514, "learning_rate": 9.991687227789407e-06, "loss": 0.7031, "step": 2680 }, { "epoch": 0.36, "grad_norm": 2.882188558578491, "learning_rate": 9.991244086114046e-06, "loss": 0.6944, "step": 2690 }, { "epoch": 0.36, "grad_norm": 2.857550859451294, "learning_rate": 9.990789447882136e-06, "loss": 0.694, "step": 2700 }, { "epoch": 0.36, "grad_norm": 2.44887638092041, "learning_rate": 9.990323314140872e-06, "loss": 0.7152, "step": 2710 }, { "epoch": 0.36, "grad_norm": 4.176514148712158, "learning_rate": 9.989845685963917e-06, "loss": 0.7048, "step": 2720 }, { "epoch": 0.36, "grad_norm": 2.6085798740386963, "learning_rate": 9.989356564451415e-06, "loss": 0.6918, "step": 2730 }, { "epoch": 0.36, "grad_norm": 2.8457624912261963, "learning_rate": 9.988855950729979e-06, "loss": 0.6992, "step": 2740 }, { "epoch": 0.36, "grad_norm": 2.9820759296417236, "learning_rate": 9.988343845952697e-06, "loss": 0.708, "step": 2750 }, { "epoch": 0.37, "grad_norm": 3.1150028705596924, "learning_rate": 9.987820251299121e-06, "loss": 0.6925, "step": 2760 }, { "epoch": 0.37, "grad_norm": 2.9244368076324463, "learning_rate": 9.987285167975274e-06, "loss": 0.6865, "step": 2770 }, { "epoch": 0.37, "grad_norm": 2.4057462215423584, "learning_rate": 9.986738597213633e-06, "loss": 0.7015, "step": 2780 }, { "epoch": 0.37, "grad_norm": 3.322909116744995, "learning_rate": 9.986180540273143e-06, "loss": 0.6832, "step": 2790 }, { "epoch": 0.37, "grad_norm": 3.0608110427856445, "learning_rate": 9.985610998439198e-06, "loss": 0.6943, "step": 2800 }, { "epoch": 0.37, "grad_norm": 4.037482261657715, "learning_rate": 9.98502997302365e-06, "loss": 0.6892, "step": 2810 }, { "epoch": 0.37, "grad_norm": 3.36313796043396, "learning_rate": 9.984437465364802e-06, "loss": 0.6965, "step": 2820 }, { "epoch": 0.37, "grad_norm": 2.5192835330963135, "learning_rate": 9.983833476827404e-06, "loss": 0.7066, "step": 2830 }, { "epoch": 0.38, "grad_norm": 2.7486279010772705, "learning_rate": 9.983218008802648e-06, "loss": 0.699, "step": 2840 }, { "epoch": 0.38, "grad_norm": 2.654358148574829, "learning_rate": 9.982591062708172e-06, "loss": 0.6979, "step": 2850 }, { "epoch": 0.38, "grad_norm": 3.2748641967773438, "learning_rate": 9.981952639988046e-06, "loss": 0.6991, "step": 2860 }, { "epoch": 0.38, "grad_norm": 2.578864812850952, "learning_rate": 9.98130274211278e-06, "loss": 0.7049, "step": 2870 }, { "epoch": 0.38, "grad_norm": 2.6727378368377686, "learning_rate": 9.98064137057931e-06, "loss": 0.7018, "step": 2880 }, { "epoch": 0.38, "grad_norm": 3.929105043411255, "learning_rate": 9.979968526911006e-06, "loss": 0.7024, "step": 2890 }, { "epoch": 0.38, "grad_norm": 3.152024030685425, "learning_rate": 9.979284212657658e-06, "loss": 0.6998, "step": 2900 }, { "epoch": 0.39, "grad_norm": 2.9024596214294434, "learning_rate": 9.978588429395475e-06, "loss": 0.6984, "step": 2910 }, { "epoch": 0.39, "grad_norm": 2.836294651031494, "learning_rate": 9.97788117872709e-06, "loss": 0.6908, "step": 2920 }, { "epoch": 0.39, "grad_norm": 2.5680007934570312, "learning_rate": 9.977162462281544e-06, "loss": 0.6976, "step": 2930 }, { "epoch": 0.39, "grad_norm": 2.9260566234588623, "learning_rate": 9.976432281714289e-06, "loss": 0.7054, "step": 2940 }, { "epoch": 0.39, "grad_norm": 2.2062673568725586, "learning_rate": 9.97569063870718e-06, "loss": 0.6856, "step": 2950 }, { "epoch": 0.39, "grad_norm": 4.240058422088623, "learning_rate": 9.97493753496848e-06, "loss": 0.7103, "step": 2960 }, { "epoch": 0.39, "grad_norm": 2.477383852005005, "learning_rate": 9.974172972232845e-06, "loss": 0.6985, "step": 2970 }, { "epoch": 0.39, "grad_norm": 3.088667392730713, "learning_rate": 9.973396952261327e-06, "loss": 0.6934, "step": 2980 }, { "epoch": 0.4, "grad_norm": 3.2433605194091797, "learning_rate": 9.972609476841368e-06, "loss": 0.6985, "step": 2990 }, { "epoch": 0.4, "grad_norm": 2.7958269119262695, "learning_rate": 9.971810547786794e-06, "loss": 0.6962, "step": 3000 }, { "epoch": 0.4, "grad_norm": 2.777493953704834, "learning_rate": 9.971000166937815e-06, "loss": 0.6986, "step": 3010 }, { "epoch": 0.4, "grad_norm": 3.0154001712799072, "learning_rate": 9.970178336161018e-06, "loss": 0.6812, "step": 3020 }, { "epoch": 0.4, "grad_norm": 2.762068033218384, "learning_rate": 9.969345057349365e-06, "loss": 0.6936, "step": 3030 }, { "epoch": 0.4, "grad_norm": 2.7011806964874268, "learning_rate": 9.96850033242218e-06, "loss": 0.6913, "step": 3040 }, { "epoch": 0.4, "grad_norm": 2.9354515075683594, "learning_rate": 9.967644163325157e-06, "loss": 0.6717, "step": 3050 }, { "epoch": 0.4, "grad_norm": 2.9377188682556152, "learning_rate": 9.96677655203035e-06, "loss": 0.703, "step": 3060 }, { "epoch": 0.41, "grad_norm": 3.62003231048584, "learning_rate": 9.965897500536167e-06, "loss": 0.6982, "step": 3070 }, { "epoch": 0.41, "grad_norm": 3.100145101547241, "learning_rate": 9.965007010867366e-06, "loss": 0.6869, "step": 3080 }, { "epoch": 0.41, "grad_norm": 3.1306726932525635, "learning_rate": 9.964105085075053e-06, "loss": 0.6998, "step": 3090 }, { "epoch": 0.41, "grad_norm": 2.943037986755371, "learning_rate": 9.963191725236672e-06, "loss": 0.6983, "step": 3100 }, { "epoch": 0.41, "grad_norm": 2.641789197921753, "learning_rate": 9.962266933456008e-06, "loss": 0.7036, "step": 3110 }, { "epoch": 0.41, "grad_norm": 2.6184604167938232, "learning_rate": 9.961330711863175e-06, "loss": 0.6847, "step": 3120 }, { "epoch": 0.41, "grad_norm": 3.769002914428711, "learning_rate": 9.960383062614614e-06, "loss": 0.6908, "step": 3130 }, { "epoch": 0.42, "grad_norm": 2.812992811203003, "learning_rate": 9.959423987893086e-06, "loss": 0.694, "step": 3140 }, { "epoch": 0.42, "grad_norm": 2.8881213665008545, "learning_rate": 9.958453489907673e-06, "loss": 0.6891, "step": 3150 }, { "epoch": 0.42, "grad_norm": 2.4800655841827393, "learning_rate": 9.957471570893767e-06, "loss": 0.6945, "step": 3160 }, { "epoch": 0.42, "grad_norm": 2.597376585006714, "learning_rate": 9.956478233113066e-06, "loss": 0.6879, "step": 3170 }, { "epoch": 0.42, "grad_norm": 2.7456142902374268, "learning_rate": 9.955473478853567e-06, "loss": 0.6835, "step": 3180 }, { "epoch": 0.42, "grad_norm": 3.177309513092041, "learning_rate": 9.954457310429569e-06, "loss": 0.6912, "step": 3190 }, { "epoch": 0.42, "grad_norm": 2.0441253185272217, "learning_rate": 9.953429730181653e-06, "loss": 0.6797, "step": 3200 }, { "epoch": 0.42, "grad_norm": 2.5114927291870117, "learning_rate": 9.952390740476698e-06, "loss": 0.6952, "step": 3210 }, { "epoch": 0.43, "grad_norm": 2.6748149394989014, "learning_rate": 9.951340343707852e-06, "loss": 0.6844, "step": 3220 }, { "epoch": 0.43, "grad_norm": 3.3017261028289795, "learning_rate": 9.95027854229454e-06, "loss": 0.6827, "step": 3230 }, { "epoch": 0.43, "grad_norm": 2.5286686420440674, "learning_rate": 9.94920533868246e-06, "loss": 0.6895, "step": 3240 }, { "epoch": 0.43, "grad_norm": 2.7270824909210205, "learning_rate": 9.948120735343566e-06, "loss": 0.6841, "step": 3250 }, { "epoch": 0.43, "grad_norm": 3.2802062034606934, "learning_rate": 9.947024734776076e-06, "loss": 0.6866, "step": 3260 }, { "epoch": 0.43, "grad_norm": 2.7635345458984375, "learning_rate": 9.945917339504457e-06, "loss": 0.702, "step": 3270 }, { "epoch": 0.43, "grad_norm": 2.5334112644195557, "learning_rate": 9.944798552079422e-06, "loss": 0.7038, "step": 3280 }, { "epoch": 0.44, "grad_norm": 3.57198429107666, "learning_rate": 9.943668375077926e-06, "loss": 0.6817, "step": 3290 }, { "epoch": 0.44, "grad_norm": 3.0424792766571045, "learning_rate": 9.942526811103153e-06, "loss": 0.6894, "step": 3300 }, { "epoch": 0.44, "grad_norm": 2.894702672958374, "learning_rate": 9.94137386278452e-06, "loss": 0.6905, "step": 3310 }, { "epoch": 0.44, "grad_norm": 3.1093649864196777, "learning_rate": 9.940209532777666e-06, "loss": 0.7126, "step": 3320 }, { "epoch": 0.44, "grad_norm": 2.57892107963562, "learning_rate": 9.939033823764443e-06, "loss": 0.6727, "step": 3330 }, { "epoch": 0.44, "grad_norm": 2.641435146331787, "learning_rate": 9.937846738452914e-06, "loss": 0.6899, "step": 3340 }, { "epoch": 0.44, "grad_norm": 2.499209403991699, "learning_rate": 9.93664827957735e-06, "loss": 0.6804, "step": 3350 }, { "epoch": 0.44, "grad_norm": 2.339439868927002, "learning_rate": 9.93543844989821e-06, "loss": 0.6684, "step": 3360 }, { "epoch": 0.45, "grad_norm": 3.1188161373138428, "learning_rate": 9.93421725220215e-06, "loss": 0.6885, "step": 3370 }, { "epoch": 0.45, "grad_norm": 2.80849552154541, "learning_rate": 9.932984689302012e-06, "loss": 0.6861, "step": 3380 }, { "epoch": 0.45, "grad_norm": 2.6763992309570312, "learning_rate": 9.93174076403681e-06, "loss": 0.6995, "step": 3390 }, { "epoch": 0.45, "grad_norm": 2.9029862880706787, "learning_rate": 9.930485479271735e-06, "loss": 0.6881, "step": 3400 }, { "epoch": 0.45, "grad_norm": 2.9763121604919434, "learning_rate": 9.929218837898143e-06, "loss": 0.6877, "step": 3410 }, { "epoch": 0.45, "grad_norm": 2.593538999557495, "learning_rate": 9.92794084283354e-06, "loss": 0.6901, "step": 3420 }, { "epoch": 0.45, "grad_norm": 3.309509038925171, "learning_rate": 9.926651497021595e-06, "loss": 0.6841, "step": 3430 }, { "epoch": 0.46, "grad_norm": 3.0623817443847656, "learning_rate": 9.925350803432112e-06, "loss": 0.664, "step": 3440 }, { "epoch": 0.46, "grad_norm": 4.454458236694336, "learning_rate": 9.924038765061042e-06, "loss": 0.6818, "step": 3450 }, { "epoch": 0.46, "grad_norm": 3.123232126235962, "learning_rate": 9.922715384930455e-06, "loss": 0.6685, "step": 3460 }, { "epoch": 0.46, "grad_norm": 3.939410448074341, "learning_rate": 9.921380666088558e-06, "loss": 0.6869, "step": 3470 }, { "epoch": 0.46, "grad_norm": 5.342121601104736, "learning_rate": 9.920034611609667e-06, "loss": 0.6801, "step": 3480 }, { "epoch": 0.46, "grad_norm": 3.224928379058838, "learning_rate": 9.918677224594207e-06, "loss": 0.6746, "step": 3490 }, { "epoch": 0.46, "grad_norm": 2.875549793243408, "learning_rate": 9.917308508168712e-06, "loss": 0.6964, "step": 3500 }, { "epoch": 0.46, "grad_norm": 3.0345466136932373, "learning_rate": 9.915928465485805e-06, "loss": 0.6727, "step": 3510 }, { "epoch": 0.47, "grad_norm": 2.9075253009796143, "learning_rate": 9.914537099724204e-06, "loss": 0.6823, "step": 3520 }, { "epoch": 0.47, "grad_norm": 3.458336591720581, "learning_rate": 9.913134414088698e-06, "loss": 0.6884, "step": 3530 }, { "epoch": 0.47, "grad_norm": 3.051724910736084, "learning_rate": 9.911720411810163e-06, "loss": 0.7009, "step": 3540 }, { "epoch": 0.47, "grad_norm": 3.8520309925079346, "learning_rate": 9.91029509614553e-06, "loss": 0.6858, "step": 3550 }, { "epoch": 0.47, "grad_norm": 3.14030122756958, "learning_rate": 9.908858470377793e-06, "loss": 0.6847, "step": 3560 }, { "epoch": 0.47, "grad_norm": 3.072479009628296, "learning_rate": 9.907410537815997e-06, "loss": 0.7003, "step": 3570 }, { "epoch": 0.47, "grad_norm": 4.00950813293457, "learning_rate": 9.905951301795231e-06, "loss": 0.673, "step": 3580 }, { "epoch": 0.48, "grad_norm": 2.5979695320129395, "learning_rate": 9.904480765676617e-06, "loss": 0.685, "step": 3590 }, { "epoch": 0.48, "grad_norm": 2.7463061809539795, "learning_rate": 9.902998932847308e-06, "loss": 0.6966, "step": 3600 }, { "epoch": 0.48, "grad_norm": 2.2453413009643555, "learning_rate": 9.901505806720474e-06, "loss": 0.6906, "step": 3610 }, { "epoch": 0.48, "grad_norm": 3.755852699279785, "learning_rate": 9.9000013907353e-06, "loss": 0.7008, "step": 3620 }, { "epoch": 0.48, "grad_norm": 3.0272433757781982, "learning_rate": 9.89848568835698e-06, "loss": 0.6912, "step": 3630 }, { "epoch": 0.48, "grad_norm": 2.8252182006835938, "learning_rate": 9.896958703076693e-06, "loss": 0.6806, "step": 3640 }, { "epoch": 0.48, "grad_norm": 3.7654056549072266, "learning_rate": 9.895420438411616e-06, "loss": 0.6778, "step": 3650 }, { "epoch": 0.48, "grad_norm": 2.8534739017486572, "learning_rate": 9.8938708979049e-06, "loss": 0.6842, "step": 3660 }, { "epoch": 0.49, "grad_norm": 3.0817458629608154, "learning_rate": 9.892310085125675e-06, "loss": 0.686, "step": 3670 }, { "epoch": 0.49, "grad_norm": 3.03659725189209, "learning_rate": 9.890738003669029e-06, "loss": 0.6858, "step": 3680 }, { "epoch": 0.49, "grad_norm": 3.211000919342041, "learning_rate": 9.889154657156008e-06, "loss": 0.6809, "step": 3690 }, { "epoch": 0.49, "grad_norm": 3.543001413345337, "learning_rate": 9.887560049233606e-06, "loss": 0.6956, "step": 3700 }, { "epoch": 0.49, "grad_norm": 2.351623058319092, "learning_rate": 9.885954183574753e-06, "loss": 0.678, "step": 3710 }, { "epoch": 0.49, "grad_norm": 3.029533624649048, "learning_rate": 9.884337063878313e-06, "loss": 0.6772, "step": 3720 }, { "epoch": 0.49, "grad_norm": 3.0510919094085693, "learning_rate": 9.882708693869071e-06, "loss": 0.6707, "step": 3730 }, { "epoch": 0.49, "grad_norm": 2.8181586265563965, "learning_rate": 9.881069077297724e-06, "loss": 0.6768, "step": 3740 }, { "epoch": 0.5, "grad_norm": 3.0697293281555176, "learning_rate": 9.879418217940872e-06, "loss": 0.6893, "step": 3750 }, { "epoch": 0.5, "grad_norm": 2.8345251083374023, "learning_rate": 9.877756119601018e-06, "loss": 0.7028, "step": 3760 }, { "epoch": 0.5, "grad_norm": 3.112302780151367, "learning_rate": 9.876082786106546e-06, "loss": 0.6914, "step": 3770 }, { "epoch": 0.5, "grad_norm": 2.735736608505249, "learning_rate": 9.87439822131172e-06, "loss": 0.6748, "step": 3780 }, { "epoch": 0.5, "grad_norm": 2.8680014610290527, "learning_rate": 9.87270242909667e-06, "loss": 0.6785, "step": 3790 }, { "epoch": 0.5, "grad_norm": 2.8195078372955322, "learning_rate": 9.870995413367397e-06, "loss": 0.675, "step": 3800 }, { "epoch": 0.5, "grad_norm": 2.9144508838653564, "learning_rate": 9.86927717805574e-06, "loss": 0.685, "step": 3810 }, { "epoch": 0.51, "grad_norm": 2.886000871658325, "learning_rate": 9.867547727119396e-06, "loss": 0.6904, "step": 3820 }, { "epoch": 0.51, "grad_norm": 3.047471284866333, "learning_rate": 9.865807064541878e-06, "loss": 0.6943, "step": 3830 }, { "epoch": 0.51, "grad_norm": 2.9526615142822266, "learning_rate": 9.864055194332538e-06, "loss": 0.6815, "step": 3840 }, { "epoch": 0.51, "grad_norm": 3.2787699699401855, "learning_rate": 9.862292120526536e-06, "loss": 0.6791, "step": 3850 }, { "epoch": 0.51, "grad_norm": 2.6856937408447266, "learning_rate": 9.860517847184837e-06, "loss": 0.6978, "step": 3860 }, { "epoch": 0.51, "grad_norm": 2.927518367767334, "learning_rate": 9.858732378394207e-06, "loss": 0.6904, "step": 3870 }, { "epoch": 0.51, "grad_norm": 4.556071758270264, "learning_rate": 9.856935718267196e-06, "loss": 0.6889, "step": 3880 }, { "epoch": 0.51, "grad_norm": 2.559556245803833, "learning_rate": 9.855127870942131e-06, "loss": 0.69, "step": 3890 }, { "epoch": 0.52, "grad_norm": 3.2897391319274902, "learning_rate": 9.85330884058311e-06, "loss": 0.6872, "step": 3900 }, { "epoch": 0.52, "grad_norm": 2.722827196121216, "learning_rate": 9.851478631379982e-06, "loss": 0.6865, "step": 3910 }, { "epoch": 0.52, "grad_norm": 3.322338581085205, "learning_rate": 9.849637247548356e-06, "loss": 0.6919, "step": 3920 }, { "epoch": 0.52, "grad_norm": 2.7876293659210205, "learning_rate": 9.847784693329571e-06, "loss": 0.6665, "step": 3930 }, { "epoch": 0.52, "grad_norm": 2.6033077239990234, "learning_rate": 9.845920972990702e-06, "loss": 0.6801, "step": 3940 }, { "epoch": 0.52, "grad_norm": 2.751955032348633, "learning_rate": 9.844046090824533e-06, "loss": 0.667, "step": 3950 }, { "epoch": 0.52, "grad_norm": 2.8029513359069824, "learning_rate": 9.842160051149568e-06, "loss": 0.6841, "step": 3960 }, { "epoch": 0.53, "grad_norm": 3.345020294189453, "learning_rate": 9.840262858310007e-06, "loss": 0.684, "step": 3970 }, { "epoch": 0.53, "grad_norm": 2.843904495239258, "learning_rate": 9.83835451667574e-06, "loss": 0.6878, "step": 3980 }, { "epoch": 0.53, "grad_norm": 2.4852325916290283, "learning_rate": 9.836435030642335e-06, "loss": 0.7087, "step": 3990 }, { "epoch": 0.53, "grad_norm": 3.3790409564971924, "learning_rate": 9.834504404631032e-06, "loss": 0.6913, "step": 4000 }, { "epoch": 0.53, "grad_norm": 3.407959461212158, "learning_rate": 9.832562643088724e-06, "loss": 0.6912, "step": 4010 }, { "epoch": 0.53, "grad_norm": 2.723505973815918, "learning_rate": 9.830609750487963e-06, "loss": 0.6927, "step": 4020 }, { "epoch": 0.53, "grad_norm": 3.1616594791412354, "learning_rate": 9.82864573132693e-06, "loss": 0.6714, "step": 4030 }, { "epoch": 0.53, "grad_norm": 3.0670251846313477, "learning_rate": 9.826670590129442e-06, "loss": 0.6685, "step": 4040 }, { "epoch": 0.54, "grad_norm": 2.4355831146240234, "learning_rate": 9.824684331444926e-06, "loss": 0.6839, "step": 4050 }, { "epoch": 0.54, "grad_norm": 2.7502357959747314, "learning_rate": 9.822686959848425e-06, "loss": 0.6925, "step": 4060 }, { "epoch": 0.54, "grad_norm": 3.705983877182007, "learning_rate": 9.820678479940573e-06, "loss": 0.6715, "step": 4070 }, { "epoch": 0.54, "grad_norm": 2.4137468338012695, "learning_rate": 9.818658896347591e-06, "loss": 0.6882, "step": 4080 }, { "epoch": 0.54, "grad_norm": 3.260124921798706, "learning_rate": 9.81662821372128e-06, "loss": 0.684, "step": 4090 }, { "epoch": 0.54, "grad_norm": 3.2827420234680176, "learning_rate": 9.814586436738998e-06, "loss": 0.675, "step": 4100 }, { "epoch": 0.54, "grad_norm": 2.2685201168060303, "learning_rate": 9.812533570103663e-06, "loss": 0.6636, "step": 4110 }, { "epoch": 0.55, "grad_norm": 3.1365833282470703, "learning_rate": 9.810469618543737e-06, "loss": 0.6911, "step": 4120 }, { "epoch": 0.55, "grad_norm": 2.9192795753479004, "learning_rate": 9.808394586813209e-06, "loss": 0.6955, "step": 4130 }, { "epoch": 0.55, "grad_norm": 2.4191370010375977, "learning_rate": 9.806308479691595e-06, "loss": 0.6769, "step": 4140 }, { "epoch": 0.55, "grad_norm": 4.024157524108887, "learning_rate": 9.804211301983919e-06, "loss": 0.6837, "step": 4150 }, { "epoch": 0.55, "grad_norm": 2.6173815727233887, "learning_rate": 9.802103058520704e-06, "loss": 0.6703, "step": 4160 }, { "epoch": 0.55, "grad_norm": 2.637032985687256, "learning_rate": 9.799983754157961e-06, "loss": 0.681, "step": 4170 }, { "epoch": 0.55, "grad_norm": 2.4752278327941895, "learning_rate": 9.797853393777182e-06, "loss": 0.6667, "step": 4180 }, { "epoch": 0.55, "grad_norm": 3.132769823074341, "learning_rate": 9.795711982285317e-06, "loss": 0.6903, "step": 4190 }, { "epoch": 0.56, "grad_norm": 2.7234723567962646, "learning_rate": 9.793559524614779e-06, "loss": 0.6763, "step": 4200 }, { "epoch": 0.56, "grad_norm": 2.6191039085388184, "learning_rate": 9.791396025723418e-06, "loss": 0.6732, "step": 4210 }, { "epoch": 0.56, "grad_norm": 2.8403711318969727, "learning_rate": 9.78922149059452e-06, "loss": 0.676, "step": 4220 }, { "epoch": 0.56, "grad_norm": 3.4012792110443115, "learning_rate": 9.787035924236789e-06, "loss": 0.6576, "step": 4230 }, { "epoch": 0.56, "grad_norm": 3.256134271621704, "learning_rate": 9.784839331684338e-06, "loss": 0.7017, "step": 4240 }, { "epoch": 0.56, "grad_norm": 2.8171510696411133, "learning_rate": 9.782631717996675e-06, "loss": 0.6764, "step": 4250 }, { "epoch": 0.56, "grad_norm": 3.248218297958374, "learning_rate": 9.780413088258698e-06, "loss": 0.6807, "step": 4260 }, { "epoch": 0.57, "grad_norm": 3.497915267944336, "learning_rate": 9.778183447580675e-06, "loss": 0.6714, "step": 4270 }, { "epoch": 0.57, "grad_norm": 3.140228509902954, "learning_rate": 9.775942801098241e-06, "loss": 0.7066, "step": 4280 }, { "epoch": 0.57, "grad_norm": 3.035100221633911, "learning_rate": 9.773691153972375e-06, "loss": 0.6803, "step": 4290 }, { "epoch": 0.57, "grad_norm": 2.7118821144104004, "learning_rate": 9.771428511389395e-06, "loss": 0.6755, "step": 4300 }, { "epoch": 0.57, "grad_norm": 3.035815477371216, "learning_rate": 9.76915487856095e-06, "loss": 0.6707, "step": 4310 }, { "epoch": 0.57, "grad_norm": 3.3597121238708496, "learning_rate": 9.766870260724e-06, "loss": 0.6781, "step": 4320 }, { "epoch": 0.57, "grad_norm": 3.151930809020996, "learning_rate": 9.764574663140807e-06, "loss": 0.6644, "step": 4330 }, { "epoch": 0.57, "grad_norm": 2.971132278442383, "learning_rate": 9.762268091098926e-06, "loss": 0.6747, "step": 4340 }, { "epoch": 0.58, "grad_norm": 2.8724772930145264, "learning_rate": 9.759950549911185e-06, "loss": 0.6802, "step": 4350 }, { "epoch": 0.58, "grad_norm": 3.2583391666412354, "learning_rate": 9.757622044915682e-06, "loss": 0.6958, "step": 4360 }, { "epoch": 0.58, "grad_norm": 2.8724751472473145, "learning_rate": 9.755282581475769e-06, "loss": 0.6673, "step": 4370 }, { "epoch": 0.58, "grad_norm": 2.53116774559021, "learning_rate": 9.752932164980033e-06, "loss": 0.6771, "step": 4380 }, { "epoch": 0.58, "grad_norm": 2.5583338737487793, "learning_rate": 9.750570800842298e-06, "loss": 0.6835, "step": 4390 }, { "epoch": 0.58, "grad_norm": 2.7628743648529053, "learning_rate": 9.748198494501598e-06, "loss": 0.6759, "step": 4400 }, { "epoch": 0.58, "grad_norm": 2.4657537937164307, "learning_rate": 9.74581525142217e-06, "loss": 0.6912, "step": 4410 }, { "epoch": 0.58, "grad_norm": 2.4423508644104004, "learning_rate": 9.74342107709345e-06, "loss": 0.6584, "step": 4420 }, { "epoch": 0.59, "grad_norm": 2.9297876358032227, "learning_rate": 9.741015977030046e-06, "loss": 0.6898, "step": 4430 }, { "epoch": 0.59, "grad_norm": 4.085515022277832, "learning_rate": 9.73859995677173e-06, "loss": 0.6586, "step": 4440 }, { "epoch": 0.59, "grad_norm": 3.312915802001953, "learning_rate": 9.736173021883433e-06, "loss": 0.6819, "step": 4450 }, { "epoch": 0.59, "grad_norm": 2.7743465900421143, "learning_rate": 9.733735177955219e-06, "loss": 0.6621, "step": 4460 }, { "epoch": 0.59, "grad_norm": 3.45530366897583, "learning_rate": 9.73128643060229e-06, "loss": 0.6838, "step": 4470 }, { "epoch": 0.59, "grad_norm": 2.886432409286499, "learning_rate": 9.728826785464948e-06, "loss": 0.6859, "step": 4480 }, { "epoch": 0.59, "grad_norm": 4.044523239135742, "learning_rate": 9.72635624820861e-06, "loss": 0.6832, "step": 4490 }, { "epoch": 0.6, "grad_norm": 2.746330499649048, "learning_rate": 9.72387482452377e-06, "loss": 0.6917, "step": 4500 }, { "epoch": 0.6, "grad_norm": 3.0390594005584717, "learning_rate": 9.72138252012601e-06, "loss": 0.6656, "step": 4510 }, { "epoch": 0.6, "grad_norm": 2.8167319297790527, "learning_rate": 9.71887934075596e-06, "loss": 0.6885, "step": 4520 }, { "epoch": 0.6, "grad_norm": 2.6761083602905273, "learning_rate": 9.716365292179309e-06, "loss": 0.6942, "step": 4530 }, { "epoch": 0.6, "grad_norm": 2.7335264682769775, "learning_rate": 9.713840380186774e-06, "loss": 0.684, "step": 4540 }, { "epoch": 0.6, "grad_norm": 3.458198070526123, "learning_rate": 9.711304610594104e-06, "loss": 0.692, "step": 4550 }, { "epoch": 0.6, "grad_norm": 2.902730703353882, "learning_rate": 9.708757989242046e-06, "loss": 0.6638, "step": 4560 }, { "epoch": 0.6, "grad_norm": 2.5396640300750732, "learning_rate": 9.706200521996348e-06, "loss": 0.69, "step": 4570 }, { "epoch": 0.61, "grad_norm": 2.7966129779815674, "learning_rate": 9.703632214747742e-06, "loss": 0.6832, "step": 4580 }, { "epoch": 0.61, "grad_norm": 2.746511697769165, "learning_rate": 9.701053073411923e-06, "loss": 0.6749, "step": 4590 }, { "epoch": 0.61, "grad_norm": 3.5651538372039795, "learning_rate": 9.698463103929542e-06, "loss": 0.6722, "step": 4600 }, { "epoch": 0.61, "eval_loss": 0.7272596955299377, "eval_runtime": 198.5906, "eval_samples_per_second": 55.39, "eval_steps_per_second": 6.924, "step": 4600 }, { "epoch": 0.61, "grad_norm": 2.358306407928467, "learning_rate": 9.695862312266195e-06, "loss": 0.6808, "step": 4610 }, { "epoch": 0.61, "grad_norm": 2.843318223953247, "learning_rate": 9.6932507044124e-06, "loss": 0.6852, "step": 4620 }, { "epoch": 0.61, "grad_norm": 2.2861886024475098, "learning_rate": 9.690628286383593e-06, "loss": 0.6736, "step": 4630 }, { "epoch": 0.61, "grad_norm": 2.2561638355255127, "learning_rate": 9.687995064220102e-06, "loss": 0.6789, "step": 4640 }, { "epoch": 0.62, "grad_norm": 2.8437678813934326, "learning_rate": 9.685351043987151e-06, "loss": 0.6758, "step": 4650 }, { "epoch": 0.62, "grad_norm": 2.547785758972168, "learning_rate": 9.682696231774829e-06, "loss": 0.6855, "step": 4660 }, { "epoch": 0.62, "grad_norm": 2.6620736122131348, "learning_rate": 9.680030633698083e-06, "loss": 0.6711, "step": 4670 }, { "epoch": 0.62, "grad_norm": 3.146510362625122, "learning_rate": 9.677354255896706e-06, "loss": 0.6641, "step": 4680 }, { "epoch": 0.62, "grad_norm": 2.8901259899139404, "learning_rate": 9.674667104535318e-06, "loss": 0.6898, "step": 4690 }, { "epoch": 0.62, "grad_norm": 2.88307523727417, "learning_rate": 9.671969185803357e-06, "loss": 0.6848, "step": 4700 }, { "epoch": 0.62, "grad_norm": 2.6471335887908936, "learning_rate": 9.669260505915057e-06, "loss": 0.668, "step": 4710 }, { "epoch": 0.62, "grad_norm": 2.9674232006073, "learning_rate": 9.666541071109446e-06, "loss": 0.6849, "step": 4720 }, { "epoch": 0.63, "grad_norm": 2.370706081390381, "learning_rate": 9.66381088765032e-06, "loss": 0.6819, "step": 4730 }, { "epoch": 0.63, "grad_norm": 2.7703073024749756, "learning_rate": 9.661069961826228e-06, "loss": 0.6674, "step": 4740 }, { "epoch": 0.63, "grad_norm": 2.9082021713256836, "learning_rate": 9.658318299950473e-06, "loss": 0.6833, "step": 4750 }, { "epoch": 0.63, "grad_norm": 3.0396716594696045, "learning_rate": 9.65555590836108e-06, "loss": 0.6657, "step": 4760 }, { "epoch": 0.63, "grad_norm": 2.304875373840332, "learning_rate": 9.652782793420789e-06, "loss": 0.6964, "step": 4770 }, { "epoch": 0.63, "grad_norm": 2.8251640796661377, "learning_rate": 9.64999896151704e-06, "loss": 0.6774, "step": 4780 }, { "epoch": 0.63, "grad_norm": 3.527707815170288, "learning_rate": 9.647204419061957e-06, "loss": 0.6778, "step": 4790 }, { "epoch": 0.64, "grad_norm": 2.566895008087158, "learning_rate": 9.644399172492337e-06, "loss": 0.681, "step": 4800 }, { "epoch": 0.64, "grad_norm": 3.0783915519714355, "learning_rate": 9.641583228269629e-06, "loss": 0.6744, "step": 4810 }, { "epoch": 0.64, "grad_norm": 2.982912302017212, "learning_rate": 9.638756592879923e-06, "loss": 0.6849, "step": 4820 }, { "epoch": 0.64, "grad_norm": 2.7487356662750244, "learning_rate": 9.635919272833938e-06, "loss": 0.6709, "step": 4830 }, { "epoch": 0.64, "grad_norm": 3.3017807006835938, "learning_rate": 9.633071274666998e-06, "loss": 0.6698, "step": 4840 }, { "epoch": 0.64, "grad_norm": 2.7575645446777344, "learning_rate": 9.630212604939026e-06, "loss": 0.6823, "step": 4850 }, { "epoch": 0.64, "grad_norm": 3.032663345336914, "learning_rate": 9.627343270234526e-06, "loss": 0.6754, "step": 4860 }, { "epoch": 0.64, "grad_norm": 2.4695844650268555, "learning_rate": 9.624463277162563e-06, "loss": 0.6793, "step": 4870 }, { "epoch": 0.65, "grad_norm": 2.7239301204681396, "learning_rate": 9.621572632356754e-06, "loss": 0.7041, "step": 4880 }, { "epoch": 0.65, "grad_norm": 2.497579336166382, "learning_rate": 9.618671342475252e-06, "loss": 0.694, "step": 4890 }, { "epoch": 0.65, "grad_norm": 2.7662250995635986, "learning_rate": 9.615759414200729e-06, "loss": 0.6739, "step": 4900 }, { "epoch": 0.65, "grad_norm": 3.1290366649627686, "learning_rate": 9.61283685424036e-06, "loss": 0.6802, "step": 4910 }, { "epoch": 0.65, "grad_norm": 2.9241154193878174, "learning_rate": 9.609903669325807e-06, "loss": 0.6859, "step": 4920 }, { "epoch": 0.65, "grad_norm": 2.5501949787139893, "learning_rate": 9.606959866213206e-06, "loss": 0.6608, "step": 4930 }, { "epoch": 0.65, "grad_norm": 3.447155475616455, "learning_rate": 9.604005451683154e-06, "loss": 0.6813, "step": 4940 }, { "epoch": 0.66, "grad_norm": 2.4963529109954834, "learning_rate": 9.601040432540684e-06, "loss": 0.6743, "step": 4950 }, { "epoch": 0.66, "grad_norm": 2.8612847328186035, "learning_rate": 9.598064815615259e-06, "loss": 0.6614, "step": 4960 }, { "epoch": 0.66, "grad_norm": 3.20935320854187, "learning_rate": 9.59507860776075e-06, "loss": 0.6781, "step": 4970 }, { "epoch": 0.66, "grad_norm": 2.885199546813965, "learning_rate": 9.592081815855425e-06, "loss": 0.6738, "step": 4980 }, { "epoch": 0.66, "grad_norm": 3.2153961658477783, "learning_rate": 9.589074446801928e-06, "loss": 0.68, "step": 4990 }, { "epoch": 0.66, "grad_norm": 2.5133445262908936, "learning_rate": 9.586056507527266e-06, "loss": 0.6822, "step": 5000 }, { "epoch": 0.66, "grad_norm": 3.0179901123046875, "learning_rate": 9.583028004982798e-06, "loss": 0.675, "step": 5010 }, { "epoch": 0.66, "grad_norm": 3.4788663387298584, "learning_rate": 9.579988946144205e-06, "loss": 0.6832, "step": 5020 }, { "epoch": 0.67, "grad_norm": 2.847745418548584, "learning_rate": 9.57693933801149e-06, "loss": 0.6662, "step": 5030 }, { "epoch": 0.67, "grad_norm": 2.8910322189331055, "learning_rate": 9.573879187608954e-06, "loss": 0.6732, "step": 5040 }, { "epoch": 0.67, "grad_norm": 2.582803726196289, "learning_rate": 9.570808501985176e-06, "loss": 0.6782, "step": 5050 }, { "epoch": 0.67, "grad_norm": 2.1921629905700684, "learning_rate": 9.567727288213005e-06, "loss": 0.6881, "step": 5060 }, { "epoch": 0.67, "grad_norm": 2.068142890930176, "learning_rate": 9.56463555338954e-06, "loss": 0.6717, "step": 5070 }, { "epoch": 0.67, "grad_norm": 2.0534846782684326, "learning_rate": 9.561533304636111e-06, "loss": 0.6575, "step": 5080 }, { "epoch": 0.67, "grad_norm": 2.8238446712493896, "learning_rate": 9.558420549098269e-06, "loss": 0.6842, "step": 5090 }, { "epoch": 0.67, "grad_norm": 3.434664249420166, "learning_rate": 9.55529729394576e-06, "loss": 0.6789, "step": 5100 }, { "epoch": 0.68, "grad_norm": 2.864886522293091, "learning_rate": 9.552163546372521e-06, "loss": 0.6707, "step": 5110 }, { "epoch": 0.68, "grad_norm": 2.919512987136841, "learning_rate": 9.549019313596652e-06, "loss": 0.675, "step": 5120 }, { "epoch": 0.68, "grad_norm": 2.967341899871826, "learning_rate": 9.545864602860406e-06, "loss": 0.6915, "step": 5130 }, { "epoch": 0.68, "grad_norm": 4.124980449676514, "learning_rate": 9.542699421430169e-06, "loss": 0.6707, "step": 5140 }, { "epoch": 0.68, "grad_norm": 2.38964581489563, "learning_rate": 9.539523776596446e-06, "loss": 0.6779, "step": 5150 }, { "epoch": 0.68, "grad_norm": 2.722057580947876, "learning_rate": 9.536337675673842e-06, "loss": 0.6912, "step": 5160 }, { "epoch": 0.68, "grad_norm": 4.076712131500244, "learning_rate": 9.533141126001048e-06, "loss": 0.6835, "step": 5170 }, { "epoch": 0.69, "grad_norm": 2.521733522415161, "learning_rate": 9.529934134940819e-06, "loss": 0.6741, "step": 5180 }, { "epoch": 0.69, "grad_norm": 2.929415464401245, "learning_rate": 9.526716709879961e-06, "loss": 0.6681, "step": 5190 }, { "epoch": 0.69, "grad_norm": 2.9488470554351807, "learning_rate": 9.523488858229313e-06, "loss": 0.6695, "step": 5200 }, { "epoch": 0.69, "grad_norm": 2.68656849861145, "learning_rate": 9.520250587423733e-06, "loss": 0.6791, "step": 5210 }, { "epoch": 0.69, "grad_norm": 2.4024336338043213, "learning_rate": 9.517001904922074e-06, "loss": 0.6861, "step": 5220 }, { "epoch": 0.69, "grad_norm": 2.476743221282959, "learning_rate": 9.513742818207173e-06, "loss": 0.6895, "step": 5230 }, { "epoch": 0.69, "grad_norm": 2.6581668853759766, "learning_rate": 9.510473334785828e-06, "loss": 0.677, "step": 5240 }, { "epoch": 0.69, "grad_norm": 2.596719264984131, "learning_rate": 9.507193462188791e-06, "loss": 0.6842, "step": 5250 }, { "epoch": 0.7, "grad_norm": 2.334949016571045, "learning_rate": 9.503903207970735e-06, "loss": 0.6732, "step": 5260 }, { "epoch": 0.7, "grad_norm": 2.867070436477661, "learning_rate": 9.500602579710256e-06, "loss": 0.6676, "step": 5270 }, { "epoch": 0.7, "grad_norm": 3.4152629375457764, "learning_rate": 9.497291585009834e-06, "loss": 0.6618, "step": 5280 }, { "epoch": 0.7, "grad_norm": 2.69191837310791, "learning_rate": 9.493970231495836e-06, "loss": 0.6822, "step": 5290 }, { "epoch": 0.7, "grad_norm": 3.0387730598449707, "learning_rate": 9.490638526818482e-06, "loss": 0.6809, "step": 5300 }, { "epoch": 0.7, "grad_norm": 2.516139268875122, "learning_rate": 9.487296478651838e-06, "loss": 0.682, "step": 5310 }, { "epoch": 0.7, "grad_norm": 2.8808937072753906, "learning_rate": 9.48394409469379e-06, "loss": 0.6829, "step": 5320 }, { "epoch": 0.71, "grad_norm": 3.1148269176483154, "learning_rate": 9.480581382666041e-06, "loss": 0.666, "step": 5330 }, { "epoch": 0.71, "grad_norm": 2.933945894241333, "learning_rate": 9.477208350314072e-06, "loss": 0.6554, "step": 5340 }, { "epoch": 0.71, "grad_norm": 2.9011828899383545, "learning_rate": 9.47382500540714e-06, "loss": 0.6724, "step": 5350 }, { "epoch": 0.71, "grad_norm": 3.7872607707977295, "learning_rate": 9.470431355738257e-06, "loss": 0.6785, "step": 5360 }, { "epoch": 0.71, "grad_norm": 2.19966197013855, "learning_rate": 9.467027409124167e-06, "loss": 0.6767, "step": 5370 }, { "epoch": 0.71, "grad_norm": 2.590169668197632, "learning_rate": 9.463613173405335e-06, "loss": 0.6587, "step": 5380 }, { "epoch": 0.71, "grad_norm": 3.284235954284668, "learning_rate": 9.460188656445921e-06, "loss": 0.6819, "step": 5390 }, { "epoch": 0.71, "grad_norm": 2.668703556060791, "learning_rate": 9.45675386613377e-06, "loss": 0.6675, "step": 5400 }, { "epoch": 0.72, "grad_norm": 3.212360382080078, "learning_rate": 9.453308810380388e-06, "loss": 0.6832, "step": 5410 }, { "epoch": 0.72, "grad_norm": 3.038121461868286, "learning_rate": 9.449853497120928e-06, "loss": 0.6987, "step": 5420 }, { "epoch": 0.72, "grad_norm": 2.2262122631073, "learning_rate": 9.446387934314167e-06, "loss": 0.6688, "step": 5430 }, { "epoch": 0.72, "grad_norm": 2.7394657135009766, "learning_rate": 9.442912129942491e-06, "loss": 0.6788, "step": 5440 }, { "epoch": 0.72, "grad_norm": 2.70332932472229, "learning_rate": 9.439426092011877e-06, "loss": 0.671, "step": 5450 }, { "epoch": 0.72, "grad_norm": 2.807642936706543, "learning_rate": 9.435929828551872e-06, "loss": 0.6748, "step": 5460 }, { "epoch": 0.72, "grad_norm": 3.0139355659484863, "learning_rate": 9.432423347615578e-06, "loss": 0.6723, "step": 5470 }, { "epoch": 0.73, "grad_norm": 3.2098162174224854, "learning_rate": 9.428906657279629e-06, "loss": 0.6717, "step": 5480 }, { "epoch": 0.73, "grad_norm": 2.6355557441711426, "learning_rate": 9.425379765644174e-06, "loss": 0.6816, "step": 5490 }, { "epoch": 0.73, "grad_norm": 2.0204873085021973, "learning_rate": 9.421842680832862e-06, "loss": 0.6671, "step": 5500 }, { "epoch": 0.73, "grad_norm": 3.1098544597625732, "learning_rate": 9.418295410992821e-06, "loss": 0.6911, "step": 5510 }, { "epoch": 0.73, "grad_norm": 3.0662097930908203, "learning_rate": 9.414737964294636e-06, "loss": 0.6846, "step": 5520 }, { "epoch": 0.73, "grad_norm": 3.711937665939331, "learning_rate": 9.411170348932333e-06, "loss": 0.6731, "step": 5530 }, { "epoch": 0.73, "grad_norm": 2.982342481613159, "learning_rate": 9.407592573123359e-06, "loss": 0.6747, "step": 5540 }, { "epoch": 0.73, "grad_norm": 2.78140926361084, "learning_rate": 9.40400464510857e-06, "loss": 0.6787, "step": 5550 }, { "epoch": 0.74, "grad_norm": 3.521254539489746, "learning_rate": 9.400406573152196e-06, "loss": 0.6891, "step": 5560 }, { "epoch": 0.74, "grad_norm": 2.704249620437622, "learning_rate": 9.396798365541841e-06, "loss": 0.6823, "step": 5570 }, { "epoch": 0.74, "grad_norm": 2.6704928874969482, "learning_rate": 9.393180030588454e-06, "loss": 0.6814, "step": 5580 }, { "epoch": 0.74, "grad_norm": 2.9716169834136963, "learning_rate": 9.389551576626303e-06, "loss": 0.6786, "step": 5590 }, { "epoch": 0.74, "grad_norm": 2.880004405975342, "learning_rate": 9.385913012012972e-06, "loss": 0.6775, "step": 5600 }, { "epoch": 0.74, "grad_norm": 2.5522663593292236, "learning_rate": 9.382264345129329e-06, "loss": 0.6827, "step": 5610 }, { "epoch": 0.74, "grad_norm": 2.8483026027679443, "learning_rate": 9.378605584379515e-06, "loss": 0.656, "step": 5620 }, { "epoch": 0.75, "grad_norm": 2.7561230659484863, "learning_rate": 9.374936738190913e-06, "loss": 0.6694, "step": 5630 }, { "epoch": 0.75, "grad_norm": 3.149887800216675, "learning_rate": 9.371257815014145e-06, "loss": 0.6782, "step": 5640 }, { "epoch": 0.75, "grad_norm": 2.629521369934082, "learning_rate": 9.367568823323039e-06, "loss": 0.6758, "step": 5650 }, { "epoch": 0.75, "grad_norm": 3.5836355686187744, "learning_rate": 9.363869771614615e-06, "loss": 0.6738, "step": 5660 }, { "epoch": 0.75, "grad_norm": 2.83819317817688, "learning_rate": 9.360160668409063e-06, "loss": 0.6734, "step": 5670 }, { "epoch": 0.75, "grad_norm": 2.5035979747772217, "learning_rate": 9.35644152224973e-06, "loss": 0.6804, "step": 5680 }, { "epoch": 0.75, "grad_norm": 2.4892172813415527, "learning_rate": 9.35271234170309e-06, "loss": 0.6599, "step": 5690 }, { "epoch": 0.75, "grad_norm": 3.170015811920166, "learning_rate": 9.348973135358734e-06, "loss": 0.6771, "step": 5700 }, { "epoch": 0.76, "grad_norm": 2.2458276748657227, "learning_rate": 9.345223911829343e-06, "loss": 0.6785, "step": 5710 }, { "epoch": 0.76, "grad_norm": 2.567450761795044, "learning_rate": 9.341464679750669e-06, "loss": 0.6732, "step": 5720 }, { "epoch": 0.76, "grad_norm": 2.638319730758667, "learning_rate": 9.337695447781525e-06, "loss": 0.6753, "step": 5730 }, { "epoch": 0.76, "grad_norm": 3.1988255977630615, "learning_rate": 9.333916224603747e-06, "loss": 0.6776, "step": 5740 }, { "epoch": 0.76, "grad_norm": 2.337787389755249, "learning_rate": 9.330127018922195e-06, "loss": 0.6766, "step": 5750 }, { "epoch": 0.76, "grad_norm": 2.9823319911956787, "learning_rate": 9.326327839464711e-06, "loss": 0.6749, "step": 5760 }, { "epoch": 0.76, "grad_norm": 2.39164662361145, "learning_rate": 9.322518694982119e-06, "loss": 0.6703, "step": 5770 }, { "epoch": 0.76, "grad_norm": 2.7940139770507812, "learning_rate": 9.318699594248192e-06, "loss": 0.6612, "step": 5780 }, { "epoch": 0.77, "grad_norm": 2.8293371200561523, "learning_rate": 9.314870546059636e-06, "loss": 0.6598, "step": 5790 }, { "epoch": 0.77, "grad_norm": 2.8672988414764404, "learning_rate": 9.311031559236067e-06, "loss": 0.6811, "step": 5800 }, { "epoch": 0.77, "grad_norm": 2.592622995376587, "learning_rate": 9.307182642620001e-06, "loss": 0.6857, "step": 5810 }, { "epoch": 0.77, "grad_norm": 3.7279140949249268, "learning_rate": 9.303323805076816e-06, "loss": 0.6606, "step": 5820 }, { "epoch": 0.77, "grad_norm": 3.0750820636749268, "learning_rate": 9.299455055494747e-06, "loss": 0.6766, "step": 5830 }, { "epoch": 0.77, "grad_norm": 3.0468876361846924, "learning_rate": 9.295576402784858e-06, "loss": 0.6675, "step": 5840 }, { "epoch": 0.77, "grad_norm": 3.2743566036224365, "learning_rate": 9.291687855881027e-06, "loss": 0.6842, "step": 5850 }, { "epoch": 0.78, "grad_norm": 3.0007123947143555, "learning_rate": 9.287789423739915e-06, "loss": 0.6631, "step": 5860 }, { "epoch": 0.78, "grad_norm": 2.861750602722168, "learning_rate": 9.283881115340957e-06, "loss": 0.6624, "step": 5870 }, { "epoch": 0.78, "grad_norm": 2.6511833667755127, "learning_rate": 9.279962939686333e-06, "loss": 0.6735, "step": 5880 }, { "epoch": 0.78, "grad_norm": 2.709005832672119, "learning_rate": 9.276034905800957e-06, "loss": 0.6769, "step": 5890 }, { "epoch": 0.78, "grad_norm": 3.223933696746826, "learning_rate": 9.272097022732444e-06, "loss": 0.6818, "step": 5900 }, { "epoch": 0.78, "grad_norm": 2.787783622741699, "learning_rate": 9.268149299551095e-06, "loss": 0.6856, "step": 5910 }, { "epoch": 0.78, "grad_norm": 3.5154056549072266, "learning_rate": 9.264191745349882e-06, "loss": 0.6682, "step": 5920 }, { "epoch": 0.78, "grad_norm": 3.194385528564453, "learning_rate": 9.260224369244414e-06, "loss": 0.6659, "step": 5930 }, { "epoch": 0.79, "grad_norm": 2.8617637157440186, "learning_rate": 9.256247180372927e-06, "loss": 0.6855, "step": 5940 }, { "epoch": 0.79, "grad_norm": 2.8789777755737305, "learning_rate": 9.252260187896257e-06, "loss": 0.6829, "step": 5950 }, { "epoch": 0.79, "grad_norm": 2.897092580795288, "learning_rate": 9.248263400997826e-06, "loss": 0.6744, "step": 5960 }, { "epoch": 0.79, "grad_norm": 2.86773943901062, "learning_rate": 9.244256828883611e-06, "loss": 0.6867, "step": 5970 }, { "epoch": 0.79, "grad_norm": 2.738723039627075, "learning_rate": 9.24024048078213e-06, "loss": 0.6718, "step": 5980 }, { "epoch": 0.79, "grad_norm": 2.6888554096221924, "learning_rate": 9.236214365944418e-06, "loss": 0.6711, "step": 5990 }, { "epoch": 0.79, "grad_norm": 3.0452592372894287, "learning_rate": 9.232178493644006e-06, "loss": 0.6816, "step": 6000 }, { "epoch": 0.8, "grad_norm": 2.9094676971435547, "learning_rate": 9.228132873176899e-06, "loss": 0.6817, "step": 6010 }, { "epoch": 0.8, "grad_norm": 2.685194492340088, "learning_rate": 9.224077513861556e-06, "loss": 0.6684, "step": 6020 }, { "epoch": 0.8, "grad_norm": 2.4088940620422363, "learning_rate": 9.22001242503887e-06, "loss": 0.6707, "step": 6030 }, { "epoch": 0.8, "grad_norm": 2.5745279788970947, "learning_rate": 9.21593761607214e-06, "loss": 0.6866, "step": 6040 }, { "epoch": 0.8, "grad_norm": 2.8593175411224365, "learning_rate": 9.211853096347059e-06, "loss": 0.6713, "step": 6050 }, { "epoch": 0.8, "grad_norm": 2.8683552742004395, "learning_rate": 9.207758875271683e-06, "loss": 0.6566, "step": 6060 }, { "epoch": 0.8, "grad_norm": 2.874685287475586, "learning_rate": 9.203654962276415e-06, "loss": 0.6791, "step": 6070 }, { "epoch": 0.8, "grad_norm": 2.6957099437713623, "learning_rate": 9.199541366813984e-06, "loss": 0.6688, "step": 6080 }, { "epoch": 0.81, "grad_norm": 3.0089595317840576, "learning_rate": 9.195418098359417e-06, "loss": 0.6708, "step": 6090 }, { "epoch": 0.81, "grad_norm": 2.352421760559082, "learning_rate": 9.191285166410023e-06, "loss": 0.6637, "step": 6100 }, { "epoch": 0.81, "grad_norm": 2.6234710216522217, "learning_rate": 9.18714258048537e-06, "loss": 0.6764, "step": 6110 }, { "epoch": 0.81, "grad_norm": 3.9939608573913574, "learning_rate": 9.182990350127265e-06, "loss": 0.6553, "step": 6120 }, { "epoch": 0.81, "grad_norm": 3.0175065994262695, "learning_rate": 9.178828484899724e-06, "loss": 0.6709, "step": 6130 }, { "epoch": 0.81, "grad_norm": 3.120433807373047, "learning_rate": 9.174656994388957e-06, "loss": 0.6739, "step": 6140 }, { "epoch": 0.81, "grad_norm": 2.466984748840332, "learning_rate": 9.170475888203348e-06, "loss": 0.6652, "step": 6150 }, { "epoch": 0.82, "grad_norm": 2.471644639968872, "learning_rate": 9.166285175973424e-06, "loss": 0.6822, "step": 6160 }, { "epoch": 0.82, "grad_norm": 2.809940814971924, "learning_rate": 9.16208486735184e-06, "loss": 0.6806, "step": 6170 }, { "epoch": 0.82, "grad_norm": 3.266305446624756, "learning_rate": 9.157874972013361e-06, "loss": 0.6742, "step": 6180 }, { "epoch": 0.82, "grad_norm": 2.68939208984375, "learning_rate": 9.153655499654824e-06, "loss": 0.6778, "step": 6190 }, { "epoch": 0.82, "grad_norm": 2.9182255268096924, "learning_rate": 9.149426459995127e-06, "loss": 0.6688, "step": 6200 }, { "epoch": 0.82, "grad_norm": 2.9277961254119873, "learning_rate": 9.145187862775208e-06, "loss": 0.676, "step": 6210 }, { "epoch": 0.82, "grad_norm": 2.709533214569092, "learning_rate": 9.140939717758022e-06, "loss": 0.6713, "step": 6220 }, { "epoch": 0.82, "grad_norm": 3.068077325820923, "learning_rate": 9.136682034728508e-06, "loss": 0.6623, "step": 6230 }, { "epoch": 0.83, "grad_norm": 2.503319263458252, "learning_rate": 9.13241482349358e-06, "loss": 0.686, "step": 6240 }, { "epoch": 0.83, "grad_norm": 2.557908773422241, "learning_rate": 9.128138093882098e-06, "loss": 0.674, "step": 6250 }, { "epoch": 0.83, "grad_norm": 2.8420028686523438, "learning_rate": 9.123851855744842e-06, "loss": 0.6606, "step": 6260 }, { "epoch": 0.83, "grad_norm": 3.4535250663757324, "learning_rate": 9.119556118954503e-06, "loss": 0.6702, "step": 6270 }, { "epoch": 0.83, "grad_norm": 2.663339138031006, "learning_rate": 9.115250893405637e-06, "loss": 0.6788, "step": 6280 }, { "epoch": 0.83, "grad_norm": 2.507500410079956, "learning_rate": 9.110936189014668e-06, "loss": 0.6631, "step": 6290 }, { "epoch": 0.83, "grad_norm": 2.51786732673645, "learning_rate": 9.106612015719845e-06, "loss": 0.6617, "step": 6300 }, { "epoch": 0.84, "grad_norm": 3.4296956062316895, "learning_rate": 9.102278383481235e-06, "loss": 0.6818, "step": 6310 }, { "epoch": 0.84, "grad_norm": 2.9477152824401855, "learning_rate": 9.097935302280682e-06, "loss": 0.6797, "step": 6320 }, { "epoch": 0.84, "grad_norm": 2.557518243789673, "learning_rate": 9.093582782121805e-06, "loss": 0.6741, "step": 6330 }, { "epoch": 0.84, "grad_norm": 2.746699571609497, "learning_rate": 9.089220833029957e-06, "loss": 0.6732, "step": 6340 }, { "epoch": 0.84, "grad_norm": 2.7738733291625977, "learning_rate": 9.08484946505221e-06, "loss": 0.672, "step": 6350 }, { "epoch": 0.84, "grad_norm": 3.2371561527252197, "learning_rate": 9.080468688257334e-06, "loss": 0.6836, "step": 6360 }, { "epoch": 0.84, "grad_norm": 2.3297362327575684, "learning_rate": 9.07607851273577e-06, "loss": 0.6739, "step": 6370 }, { "epoch": 0.84, "grad_norm": 2.604583740234375, "learning_rate": 9.0716789485996e-06, "loss": 0.6861, "step": 6380 }, { "epoch": 0.85, "grad_norm": 2.323979616165161, "learning_rate": 9.067270005982545e-06, "loss": 0.673, "step": 6390 }, { "epoch": 0.85, "grad_norm": 3.385627269744873, "learning_rate": 9.062851695039915e-06, "loss": 0.6733, "step": 6400 }, { "epoch": 0.85, "grad_norm": 3.809943914413452, "learning_rate": 9.058424025948609e-06, "loss": 0.6802, "step": 6410 }, { "epoch": 0.85, "grad_norm": 3.2964417934417725, "learning_rate": 9.053987008907071e-06, "loss": 0.6912, "step": 6420 }, { "epoch": 0.85, "grad_norm": 3.160759210586548, "learning_rate": 9.049540654135285e-06, "loss": 0.6672, "step": 6430 }, { "epoch": 0.85, "grad_norm": 3.6694741249084473, "learning_rate": 9.045084971874738e-06, "loss": 0.6843, "step": 6440 }, { "epoch": 0.85, "grad_norm": 3.6869866847991943, "learning_rate": 9.040619972388402e-06, "loss": 0.671, "step": 6450 }, { "epoch": 0.85, "grad_norm": 2.74727463722229, "learning_rate": 9.036145665960715e-06, "loss": 0.6783, "step": 6460 }, { "epoch": 0.86, "grad_norm": 2.88826322555542, "learning_rate": 9.03166206289754e-06, "loss": 0.6547, "step": 6470 }, { "epoch": 0.86, "grad_norm": 2.761207103729248, "learning_rate": 9.02716917352617e-06, "loss": 0.6739, "step": 6480 }, { "epoch": 0.86, "grad_norm": 2.660529375076294, "learning_rate": 9.022667008195273e-06, "loss": 0.6595, "step": 6490 }, { "epoch": 0.86, "grad_norm": 2.8409204483032227, "learning_rate": 9.018155577274891e-06, "loss": 0.6881, "step": 6500 }, { "epoch": 0.86, "grad_norm": 2.212315320968628, "learning_rate": 9.013634891156404e-06, "loss": 0.6872, "step": 6510 }, { "epoch": 0.86, "grad_norm": 3.3113622665405273, "learning_rate": 9.009104960252513e-06, "loss": 0.6761, "step": 6520 }, { "epoch": 0.86, "grad_norm": 2.797126293182373, "learning_rate": 9.004565794997209e-06, "loss": 0.6741, "step": 6530 }, { "epoch": 0.87, "grad_norm": 2.9894332885742188, "learning_rate": 9.000017405845755e-06, "loss": 0.6835, "step": 6540 }, { "epoch": 0.87, "grad_norm": 2.9207139015197754, "learning_rate": 8.995459803274664e-06, "loss": 0.674, "step": 6550 }, { "epoch": 0.87, "grad_norm": 2.9169623851776123, "learning_rate": 8.990892997781661e-06, "loss": 0.6419, "step": 6560 }, { "epoch": 0.87, "grad_norm": 2.9211723804473877, "learning_rate": 8.986316999885678e-06, "loss": 0.6581, "step": 6570 }, { "epoch": 0.87, "grad_norm": 2.616330623626709, "learning_rate": 8.981731820126816e-06, "loss": 0.6741, "step": 6580 }, { "epoch": 0.87, "grad_norm": 2.5580813884735107, "learning_rate": 8.977137469066321e-06, "loss": 0.6741, "step": 6590 }, { "epoch": 0.87, "grad_norm": 2.785909414291382, "learning_rate": 8.972533957286574e-06, "loss": 0.6784, "step": 6600 }, { "epoch": 0.87, "grad_norm": 3.340866804122925, "learning_rate": 8.967921295391046e-06, "loss": 0.6687, "step": 6610 }, { "epoch": 0.88, "grad_norm": 2.7479071617126465, "learning_rate": 8.963299494004292e-06, "loss": 0.6738, "step": 6620 }, { "epoch": 0.88, "grad_norm": 2.5367236137390137, "learning_rate": 8.958668563771911e-06, "loss": 0.6776, "step": 6630 }, { "epoch": 0.88, "grad_norm": 2.654205322265625, "learning_rate": 8.954028515360535e-06, "loss": 0.6664, "step": 6640 }, { "epoch": 0.88, "grad_norm": 2.851851463317871, "learning_rate": 8.949379359457795e-06, "loss": 0.6765, "step": 6650 }, { "epoch": 0.88, "grad_norm": 2.946833848953247, "learning_rate": 8.944721106772298e-06, "loss": 0.6642, "step": 6660 }, { "epoch": 0.88, "grad_norm": 3.0130770206451416, "learning_rate": 8.94005376803361e-06, "loss": 0.6775, "step": 6670 }, { "epoch": 0.88, "grad_norm": 3.278046131134033, "learning_rate": 8.935377353992222e-06, "loss": 0.6853, "step": 6680 }, { "epoch": 0.89, "grad_norm": 2.2684073448181152, "learning_rate": 8.930691875419525e-06, "loss": 0.6704, "step": 6690 }, { "epoch": 0.89, "grad_norm": 3.093383550643921, "learning_rate": 8.925997343107796e-06, "loss": 0.6665, "step": 6700 }, { "epoch": 0.89, "grad_norm": 2.568152666091919, "learning_rate": 8.921293767870157e-06, "loss": 0.6643, "step": 6710 }, { "epoch": 0.89, "grad_norm": 3.2096123695373535, "learning_rate": 8.91658116054057e-06, "loss": 0.679, "step": 6720 }, { "epoch": 0.89, "grad_norm": 2.912968397140503, "learning_rate": 8.91185953197379e-06, "loss": 0.6856, "step": 6730 }, { "epoch": 0.89, "grad_norm": 2.980971097946167, "learning_rate": 8.907128893045359e-06, "loss": 0.6672, "step": 6740 }, { "epoch": 0.89, "grad_norm": 3.0038115978240967, "learning_rate": 8.902389254651568e-06, "loss": 0.6738, "step": 6750 }, { "epoch": 0.89, "grad_norm": 3.099684000015259, "learning_rate": 8.897640627709441e-06, "loss": 0.6924, "step": 6760 }, { "epoch": 0.9, "grad_norm": 2.444758892059326, "learning_rate": 8.892883023156703e-06, "loss": 0.6689, "step": 6770 }, { "epoch": 0.9, "grad_norm": 2.7014451026916504, "learning_rate": 8.888116451951755e-06, "loss": 0.683, "step": 6780 }, { "epoch": 0.9, "grad_norm": 3.183800458908081, "learning_rate": 8.88334092507366e-06, "loss": 0.6799, "step": 6790 }, { "epoch": 0.9, "grad_norm": 2.8283169269561768, "learning_rate": 8.8785564535221e-06, "loss": 0.6546, "step": 6800 }, { "epoch": 0.9, "grad_norm": 2.629321575164795, "learning_rate": 8.873763048317363e-06, "loss": 0.6774, "step": 6810 }, { "epoch": 0.9, "grad_norm": 2.320882558822632, "learning_rate": 8.868960720500314e-06, "loss": 0.6646, "step": 6820 }, { "epoch": 0.9, "grad_norm": 3.840276002883911, "learning_rate": 8.86414948113237e-06, "loss": 0.6768, "step": 6830 }, { "epoch": 0.91, "grad_norm": 2.9596545696258545, "learning_rate": 8.85932934129548e-06, "loss": 0.6873, "step": 6840 }, { "epoch": 0.91, "grad_norm": 2.690166473388672, "learning_rate": 8.854500312092081e-06, "loss": 0.6867, "step": 6850 }, { "epoch": 0.91, "grad_norm": 3.0237040519714355, "learning_rate": 8.849662404645097e-06, "loss": 0.664, "step": 6860 }, { "epoch": 0.91, "grad_norm": 3.3142759799957275, "learning_rate": 8.844815630097896e-06, "loss": 0.6776, "step": 6870 }, { "epoch": 0.91, "grad_norm": 2.4475510120391846, "learning_rate": 8.839959999614272e-06, "loss": 0.6755, "step": 6880 }, { "epoch": 0.91, "grad_norm": 2.689811944961548, "learning_rate": 8.835095524378413e-06, "loss": 0.677, "step": 6890 }, { "epoch": 0.91, "grad_norm": 2.6768815517425537, "learning_rate": 8.83022221559489e-06, "loss": 0.6867, "step": 6900 }, { "epoch": 0.91, "eval_loss": 0.7290458083152771, "eval_runtime": 197.9624, "eval_samples_per_second": 55.566, "eval_steps_per_second": 6.946, "step": 6900 }, { "epoch": 0.91, "grad_norm": 2.7535643577575684, "learning_rate": 8.82534008448861e-06, "loss": 0.6751, "step": 6910 }, { "epoch": 0.92, "grad_norm": 2.4862465858459473, "learning_rate": 8.820449142304805e-06, "loss": 0.6745, "step": 6920 }, { "epoch": 0.92, "grad_norm": 2.895951747894287, "learning_rate": 8.815549400309002e-06, "loss": 0.6701, "step": 6930 }, { "epoch": 0.92, "grad_norm": 2.7666430473327637, "learning_rate": 8.810640869786994e-06, "loss": 0.6522, "step": 6940 }, { "epoch": 0.92, "grad_norm": 3.116729259490967, "learning_rate": 8.805723562044825e-06, "loss": 0.6613, "step": 6950 }, { "epoch": 0.92, "grad_norm": 2.8136725425720215, "learning_rate": 8.800797488408746e-06, "loss": 0.6739, "step": 6960 }, { "epoch": 0.92, "grad_norm": 2.925579309463501, "learning_rate": 8.795862660225205e-06, "loss": 0.655, "step": 6970 }, { "epoch": 0.92, "grad_norm": 2.7187774181365967, "learning_rate": 8.790919088860815e-06, "loss": 0.662, "step": 6980 }, { "epoch": 0.93, "grad_norm": 3.002408742904663, "learning_rate": 8.785966785702323e-06, "loss": 0.6677, "step": 6990 }, { "epoch": 0.93, "grad_norm": 2.7283411026000977, "learning_rate": 8.781005762156593e-06, "loss": 0.6775, "step": 7000 }, { "epoch": 0.93, "grad_norm": 2.955579996109009, "learning_rate": 8.776036029650573e-06, "loss": 0.6777, "step": 7010 }, { "epoch": 0.93, "grad_norm": 2.8140358924865723, "learning_rate": 8.77105759963127e-06, "loss": 0.6734, "step": 7020 }, { "epoch": 0.93, "grad_norm": 2.5485150814056396, "learning_rate": 8.766070483565726e-06, "loss": 0.6805, "step": 7030 }, { "epoch": 0.93, "grad_norm": 2.6847705841064453, "learning_rate": 8.76107469294099e-06, "loss": 0.6564, "step": 7040 }, { "epoch": 0.93, "grad_norm": 3.189195156097412, "learning_rate": 8.756070239264089e-06, "loss": 0.6794, "step": 7050 }, { "epoch": 0.93, "grad_norm": 2.842616319656372, "learning_rate": 8.75105713406201e-06, "loss": 0.6784, "step": 7060 }, { "epoch": 0.94, "grad_norm": 3.710517168045044, "learning_rate": 8.746035388881655e-06, "loss": 0.6786, "step": 7070 }, { "epoch": 0.94, "grad_norm": 2.5461089611053467, "learning_rate": 8.741005015289843e-06, "loss": 0.6865, "step": 7080 }, { "epoch": 0.94, "grad_norm": 3.374018669128418, "learning_rate": 8.735966024873257e-06, "loss": 0.6646, "step": 7090 }, { "epoch": 0.94, "grad_norm": 3.168987989425659, "learning_rate": 8.730918429238429e-06, "loss": 0.6818, "step": 7100 }, { "epoch": 0.94, "grad_norm": 3.4316864013671875, "learning_rate": 8.72586224001171e-06, "loss": 0.6523, "step": 7110 }, { "epoch": 0.94, "grad_norm": 3.518183946609497, "learning_rate": 8.720797468839255e-06, "loss": 0.6692, "step": 7120 }, { "epoch": 0.94, "grad_norm": 3.6533937454223633, "learning_rate": 8.715724127386971e-06, "loss": 0.6729, "step": 7130 }, { "epoch": 0.94, "grad_norm": 2.501375913619995, "learning_rate": 8.710642227340518e-06, "loss": 0.6692, "step": 7140 }, { "epoch": 0.95, "grad_norm": 3.3970723152160645, "learning_rate": 8.705551780405264e-06, "loss": 0.6726, "step": 7150 }, { "epoch": 0.95, "grad_norm": 2.5831968784332275, "learning_rate": 8.70045279830626e-06, "loss": 0.684, "step": 7160 }, { "epoch": 0.95, "grad_norm": 2.8925693035125732, "learning_rate": 8.695345292788223e-06, "loss": 0.6587, "step": 7170 }, { "epoch": 0.95, "grad_norm": 3.127180576324463, "learning_rate": 8.690229275615503e-06, "loss": 0.67, "step": 7180 }, { "epoch": 0.95, "grad_norm": 2.4886677265167236, "learning_rate": 8.685104758572047e-06, "loss": 0.6666, "step": 7190 }, { "epoch": 0.95, "grad_norm": 4.1692070960998535, "learning_rate": 8.679971753461388e-06, "loss": 0.6668, "step": 7200 }, { "epoch": 0.95, "grad_norm": 2.864614486694336, "learning_rate": 8.674830272106604e-06, "loss": 0.6658, "step": 7210 }, { "epoch": 0.96, "grad_norm": 3.166633129119873, "learning_rate": 8.669680326350303e-06, "loss": 0.6726, "step": 7220 }, { "epoch": 0.96, "grad_norm": 2.520665407180786, "learning_rate": 8.664521928054585e-06, "loss": 0.6693, "step": 7230 }, { "epoch": 0.96, "grad_norm": 3.273986339569092, "learning_rate": 8.659355089101021e-06, "loss": 0.6762, "step": 7240 }, { "epoch": 0.96, "grad_norm": 2.5142078399658203, "learning_rate": 8.65417982139062e-06, "loss": 0.6659, "step": 7250 }, { "epoch": 0.96, "grad_norm": 3.154919385910034, "learning_rate": 8.648996136843814e-06, "loss": 0.671, "step": 7260 }, { "epoch": 0.96, "grad_norm": 2.0966339111328125, "learning_rate": 8.643804047400412e-06, "loss": 0.6641, "step": 7270 }, { "epoch": 0.96, "grad_norm": 3.209606885910034, "learning_rate": 8.638603565019588e-06, "loss": 0.6677, "step": 7280 }, { "epoch": 0.96, "grad_norm": 2.470289468765259, "learning_rate": 8.633394701679847e-06, "loss": 0.6628, "step": 7290 }, { "epoch": 0.97, "grad_norm": 2.4616212844848633, "learning_rate": 8.628177469378995e-06, "loss": 0.6772, "step": 7300 }, { "epoch": 0.97, "grad_norm": 3.5428974628448486, "learning_rate": 8.622951880134122e-06, "loss": 0.6737, "step": 7310 }, { "epoch": 0.97, "grad_norm": 3.179137945175171, "learning_rate": 8.617717945981558e-06, "loss": 0.6855, "step": 7320 }, { "epoch": 0.97, "grad_norm": 3.3000833988189697, "learning_rate": 8.612475678976861e-06, "loss": 0.6805, "step": 7330 }, { "epoch": 0.97, "grad_norm": 2.7179107666015625, "learning_rate": 8.60722509119478e-06, "loss": 0.6742, "step": 7340 }, { "epoch": 0.97, "grad_norm": 2.58263897895813, "learning_rate": 8.601966194729228e-06, "loss": 0.6746, "step": 7350 }, { "epoch": 0.97, "grad_norm": 2.772865056991577, "learning_rate": 8.596699001693257e-06, "loss": 0.6624, "step": 7360 }, { "epoch": 0.98, "grad_norm": 2.8705618381500244, "learning_rate": 8.59142352421903e-06, "loss": 0.6768, "step": 7370 }, { "epoch": 0.98, "grad_norm": 2.4928457736968994, "learning_rate": 8.586139774457791e-06, "loss": 0.6582, "step": 7380 }, { "epoch": 0.98, "grad_norm": 2.9825477600097656, "learning_rate": 8.58084776457984e-06, "loss": 0.686, "step": 7390 }, { "epoch": 0.98, "grad_norm": 2.839447021484375, "learning_rate": 8.575547506774498e-06, "loss": 0.6847, "step": 7400 }, { "epoch": 0.98, "grad_norm": 4.549822807312012, "learning_rate": 8.570239013250089e-06, "loss": 0.6599, "step": 7410 }, { "epoch": 0.98, "grad_norm": 2.622835874557495, "learning_rate": 8.5649222962339e-06, "loss": 0.6447, "step": 7420 }, { "epoch": 0.98, "grad_norm": 2.558931350708008, "learning_rate": 8.559597367972168e-06, "loss": 0.6653, "step": 7430 }, { "epoch": 0.98, "grad_norm": 2.8210113048553467, "learning_rate": 8.554264240730042e-06, "loss": 0.671, "step": 7440 }, { "epoch": 0.99, "grad_norm": 2.7722108364105225, "learning_rate": 8.548922926791545e-06, "loss": 0.6872, "step": 7450 }, { "epoch": 0.99, "grad_norm": 2.9724175930023193, "learning_rate": 8.543573438459573e-06, "loss": 0.6752, "step": 7460 }, { "epoch": 0.99, "grad_norm": 2.862396717071533, "learning_rate": 8.538215788055839e-06, "loss": 0.6667, "step": 7470 }, { "epoch": 0.99, "grad_norm": 3.055391788482666, "learning_rate": 8.532849987920859e-06, "loss": 0.6695, "step": 7480 }, { "epoch": 0.99, "grad_norm": 2.379634141921997, "learning_rate": 8.527476050413922e-06, "loss": 0.6535, "step": 7490 }, { "epoch": 0.99, "grad_norm": 3.219191551208496, "learning_rate": 8.522093987913063e-06, "loss": 0.6617, "step": 7500 }, { "epoch": 0.99, "grad_norm": 3.1809182167053223, "learning_rate": 8.516703812815024e-06, "loss": 0.6672, "step": 7510 }, { "epoch": 1.0, "grad_norm": 2.790281057357788, "learning_rate": 8.511305537535238e-06, "loss": 0.6722, "step": 7520 }, { "epoch": 1.0, "grad_norm": 3.374549388885498, "learning_rate": 8.505899174507793e-06, "loss": 0.6622, "step": 7530 }, { "epoch": 1.0, "grad_norm": 2.377967119216919, "learning_rate": 8.500484736185412e-06, "loss": 0.6531, "step": 7540 }, { "epoch": 1.0, "grad_norm": 2.5226247310638428, "learning_rate": 8.49506223503941e-06, "loss": 0.6491, "step": 7550 } ], "logging_steps": 10, "max_steps": 23000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1, "total_flos": 5.621498429758977e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }