{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 452, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004424778761061947, "grad_norm": 1624.0, "learning_rate": 9.523809523809523e-06, "loss": 18.8236, "step": 1 }, { "epoch": 0.008849557522123894, "grad_norm": 1632.0, "learning_rate": 1.9047619047619046e-05, "loss": 18.845, "step": 2 }, { "epoch": 0.01327433628318584, "grad_norm": 1104.0, "learning_rate": 2.857142857142857e-05, "loss": 13.897, "step": 3 }, { "epoch": 0.017699115044247787, "grad_norm": 556.0, "learning_rate": 3.809523809523809e-05, "loss": 12.7695, "step": 4 }, { "epoch": 0.022123893805309734, "grad_norm": 352.0, "learning_rate": 4.761904761904762e-05, "loss": 12.0053, "step": 5 }, { "epoch": 0.02654867256637168, "grad_norm": 121.5, "learning_rate": 5.714285714285714e-05, "loss": 9.8363, "step": 6 }, { "epoch": 0.030973451327433628, "grad_norm": 93.0, "learning_rate": 6.666666666666667e-05, "loss": 8.3813, "step": 7 }, { "epoch": 0.035398230088495575, "grad_norm": 32.75, "learning_rate": 7.619047619047618e-05, "loss": 7.5272, "step": 8 }, { "epoch": 0.03982300884955752, "grad_norm": 54.75, "learning_rate": 8.571428571428571e-05, "loss": 7.1281, "step": 9 }, { "epoch": 0.04424778761061947, "grad_norm": 20.125, "learning_rate": 9.523809523809524e-05, "loss": 6.8148, "step": 10 }, { "epoch": 0.048672566371681415, "grad_norm": 43.25, "learning_rate": 0.00010476190476190477, "loss": 6.8122, "step": 11 }, { "epoch": 0.05309734513274336, "grad_norm": 7.46875, "learning_rate": 0.00011428571428571428, "loss": 6.4345, "step": 12 }, { "epoch": 0.05752212389380531, "grad_norm": 17.75, "learning_rate": 0.0001238095238095238, "loss": 6.4061, "step": 13 }, { "epoch": 0.061946902654867256, "grad_norm": 34.25, "learning_rate": 0.00013333333333333334, "loss": 6.4773, "step": 14 }, { "epoch": 0.06637168141592921, "grad_norm": 11.9375, "learning_rate": 0.00014285714285714287, "loss": 6.2721, "step": 15 }, { "epoch": 0.07079646017699115, "grad_norm": 13.25, "learning_rate": 0.00015238095238095237, "loss": 6.1657, "step": 16 }, { "epoch": 0.0752212389380531, "grad_norm": 10.125, "learning_rate": 0.00016190476190476192, "loss": 5.9112, "step": 17 }, { "epoch": 0.07964601769911504, "grad_norm": 13.875, "learning_rate": 0.00017142857142857143, "loss": 5.3251, "step": 18 }, { "epoch": 0.084070796460177, "grad_norm": 179.0, "learning_rate": 0.00018095238095238095, "loss": 7.1923, "step": 19 }, { "epoch": 0.08849557522123894, "grad_norm": 26.875, "learning_rate": 0.00019047619047619048, "loss": 4.1209, "step": 20 }, { "epoch": 0.09292035398230089, "grad_norm": 43.25, "learning_rate": 0.0002, "loss": 3.2117, "step": 21 }, { "epoch": 0.09734513274336283, "grad_norm": 11.25, "learning_rate": 0.00019999885675796823, "loss": 2.2957, "step": 22 }, { "epoch": 0.10176991150442478, "grad_norm": 9.25, "learning_rate": 0.00019999542705801296, "loss": 1.8052, "step": 23 }, { "epoch": 0.10619469026548672, "grad_norm": 5.75, "learning_rate": 0.0001999897109785537, "loss": 1.5344, "step": 24 }, { "epoch": 0.11061946902654868, "grad_norm": 2.234375, "learning_rate": 0.00019998170865028774, "loss": 1.3085, "step": 25 }, { "epoch": 0.11504424778761062, "grad_norm": 2.71875, "learning_rate": 0.00019997142025618701, "loss": 1.2024, "step": 26 }, { "epoch": 0.11946902654867257, "grad_norm": 2.09375, "learning_rate": 0.00019995884603149402, "loss": 1.1342, "step": 27 }, { "epoch": 0.12389380530973451, "grad_norm": 1.5234375, "learning_rate": 0.00019994398626371643, "loss": 1.073, "step": 28 }, { "epoch": 0.12831858407079647, "grad_norm": 1.6953125, "learning_rate": 0.00019992684129262038, "loss": 0.9815, "step": 29 }, { "epoch": 0.13274336283185842, "grad_norm": 1.28125, "learning_rate": 0.00019990741151022301, "loss": 0.9502, "step": 30 }, { "epoch": 0.13716814159292035, "grad_norm": 0.953125, "learning_rate": 0.00019988569736078312, "loss": 0.8713, "step": 31 }, { "epoch": 0.1415929203539823, "grad_norm": 1.0546875, "learning_rate": 0.00019986169934079135, "loss": 0.8431, "step": 32 }, { "epoch": 0.14601769911504425, "grad_norm": 0.734375, "learning_rate": 0.0001998354179989585, "loss": 0.7957, "step": 33 }, { "epoch": 0.1504424778761062, "grad_norm": 0.875, "learning_rate": 0.00019980685393620337, "loss": 0.8088, "step": 34 }, { "epoch": 0.15486725663716813, "grad_norm": 0.8046875, "learning_rate": 0.00019977600780563863, "loss": 0.7827, "step": 35 }, { "epoch": 0.1592920353982301, "grad_norm": 0.75390625, "learning_rate": 0.00019974288031255618, "loss": 0.7153, "step": 36 }, { "epoch": 0.16371681415929204, "grad_norm": 0.74609375, "learning_rate": 0.00019970747221441083, "loss": 0.7449, "step": 37 }, { "epoch": 0.168141592920354, "grad_norm": 0.69921875, "learning_rate": 0.00019966978432080316, "loss": 0.734, "step": 38 }, { "epoch": 0.17256637168141592, "grad_norm": 0.671875, "learning_rate": 0.00019962981749346078, "loss": 0.7157, "step": 39 }, { "epoch": 0.17699115044247787, "grad_norm": 0.640625, "learning_rate": 0.0001995875726462189, "loss": 0.733, "step": 40 }, { "epoch": 0.18141592920353983, "grad_norm": 0.65234375, "learning_rate": 0.00019954305074499916, "loss": 0.6717, "step": 41 }, { "epoch": 0.18584070796460178, "grad_norm": 0.57421875, "learning_rate": 0.00019949625280778777, "loss": 0.7242, "step": 42 }, { "epoch": 0.1902654867256637, "grad_norm": 0.625, "learning_rate": 0.00019944717990461207, "loss": 0.6803, "step": 43 }, { "epoch": 0.19469026548672566, "grad_norm": 0.578125, "learning_rate": 0.00019939583315751623, "loss": 0.6824, "step": 44 }, { "epoch": 0.19911504424778761, "grad_norm": 0.578125, "learning_rate": 0.0001993422137405354, "loss": 0.669, "step": 45 }, { "epoch": 0.20353982300884957, "grad_norm": 0.56640625, "learning_rate": 0.000199286322879669, "loss": 0.6621, "step": 46 }, { "epoch": 0.2079646017699115, "grad_norm": 0.609375, "learning_rate": 0.00019922816185285265, "loss": 0.6433, "step": 47 }, { "epoch": 0.21238938053097345, "grad_norm": 0.53515625, "learning_rate": 0.000199167731989929, "loss": 0.647, "step": 48 }, { "epoch": 0.2168141592920354, "grad_norm": 0.6171875, "learning_rate": 0.00019910503467261722, "loss": 0.642, "step": 49 }, { "epoch": 0.22123893805309736, "grad_norm": 0.53515625, "learning_rate": 0.00019904007133448147, "loss": 0.6283, "step": 50 }, { "epoch": 0.22566371681415928, "grad_norm": 0.6171875, "learning_rate": 0.0001989728434608981, "loss": 0.6374, "step": 51 }, { "epoch": 0.23008849557522124, "grad_norm": 0.60546875, "learning_rate": 0.00019890335258902178, "loss": 0.6671, "step": 52 }, { "epoch": 0.2345132743362832, "grad_norm": 0.515625, "learning_rate": 0.00019883160030775016, "loss": 0.6303, "step": 53 }, { "epoch": 0.23893805309734514, "grad_norm": 0.62890625, "learning_rate": 0.0001987575882576878, "loss": 0.6423, "step": 54 }, { "epoch": 0.24336283185840707, "grad_norm": 0.54296875, "learning_rate": 0.00019868131813110832, "loss": 0.6428, "step": 55 }, { "epoch": 0.24778761061946902, "grad_norm": 0.515625, "learning_rate": 0.0001986027916719161, "loss": 0.6105, "step": 56 }, { "epoch": 0.252212389380531, "grad_norm": 0.5390625, "learning_rate": 0.00019852201067560606, "loss": 0.6109, "step": 57 }, { "epoch": 0.25663716814159293, "grad_norm": 0.58984375, "learning_rate": 0.00019843897698922284, "loss": 0.5753, "step": 58 }, { "epoch": 0.2610619469026549, "grad_norm": 0.52734375, "learning_rate": 0.00019835369251131846, "loss": 0.607, "step": 59 }, { "epoch": 0.26548672566371684, "grad_norm": 0.515625, "learning_rate": 0.00019826615919190887, "loss": 0.6029, "step": 60 }, { "epoch": 0.26991150442477874, "grad_norm": 0.51171875, "learning_rate": 0.0001981763790324295, "loss": 0.5668, "step": 61 }, { "epoch": 0.2743362831858407, "grad_norm": 0.515625, "learning_rate": 0.00019808435408568938, "loss": 0.6282, "step": 62 }, { "epoch": 0.27876106194690264, "grad_norm": 0.515625, "learning_rate": 0.0001979900864558242, "loss": 0.5816, "step": 63 }, { "epoch": 0.2831858407079646, "grad_norm": 0.53125, "learning_rate": 0.0001978935782982484, "loss": 0.6071, "step": 64 }, { "epoch": 0.28761061946902655, "grad_norm": 0.48046875, "learning_rate": 0.00019779483181960557, "loss": 0.56, "step": 65 }, { "epoch": 0.2920353982300885, "grad_norm": 0.498046875, "learning_rate": 0.0001976938492777182, "loss": 0.6138, "step": 66 }, { "epoch": 0.29646017699115046, "grad_norm": 0.486328125, "learning_rate": 0.00019759063298153598, "loss": 0.5735, "step": 67 }, { "epoch": 0.3008849557522124, "grad_norm": 0.5234375, "learning_rate": 0.00019748518529108316, "loss": 0.5871, "step": 68 }, { "epoch": 0.3053097345132743, "grad_norm": 0.5234375, "learning_rate": 0.00019737750861740431, "loss": 0.6027, "step": 69 }, { "epoch": 0.30973451327433627, "grad_norm": 0.5, "learning_rate": 0.00019726760542250946, "loss": 0.6089, "step": 70 }, { "epoch": 0.3141592920353982, "grad_norm": 0.474609375, "learning_rate": 0.0001971554782193176, "loss": 0.5682, "step": 71 }, { "epoch": 0.3185840707964602, "grad_norm": 0.474609375, "learning_rate": 0.0001970411295715994, "loss": 0.5707, "step": 72 }, { "epoch": 0.3230088495575221, "grad_norm": 0.515625, "learning_rate": 0.00019692456209391846, "loss": 0.5949, "step": 73 }, { "epoch": 0.3274336283185841, "grad_norm": 0.52734375, "learning_rate": 0.00019680577845157155, "loss": 0.577, "step": 74 }, { "epoch": 0.33185840707964603, "grad_norm": 0.69140625, "learning_rate": 0.00019668478136052774, "loss": 0.5871, "step": 75 }, { "epoch": 0.336283185840708, "grad_norm": 0.5078125, "learning_rate": 0.00019656157358736626, "loss": 0.5763, "step": 76 }, { "epoch": 0.3407079646017699, "grad_norm": 0.498046875, "learning_rate": 0.0001964361579492132, "loss": 0.5875, "step": 77 }, { "epoch": 0.34513274336283184, "grad_norm": 0.45703125, "learning_rate": 0.00019630853731367713, "loss": 0.5811, "step": 78 }, { "epoch": 0.3495575221238938, "grad_norm": 0.5625, "learning_rate": 0.0001961787145987835, "loss": 0.6122, "step": 79 }, { "epoch": 0.35398230088495575, "grad_norm": 0.4765625, "learning_rate": 0.00019604669277290806, "loss": 0.5584, "step": 80 }, { "epoch": 0.3584070796460177, "grad_norm": 0.46875, "learning_rate": 0.0001959124748547088, "loss": 0.5629, "step": 81 }, { "epoch": 0.36283185840707965, "grad_norm": 0.50390625, "learning_rate": 0.00019577606391305702, "loss": 0.5758, "step": 82 }, { "epoch": 0.3672566371681416, "grad_norm": 0.478515625, "learning_rate": 0.0001956374630669672, "loss": 0.5383, "step": 83 }, { "epoch": 0.37168141592920356, "grad_norm": 0.46484375, "learning_rate": 0.00019549667548552556, "loss": 0.5869, "step": 84 }, { "epoch": 0.37610619469026546, "grad_norm": 0.51953125, "learning_rate": 0.00019535370438781766, "loss": 0.5642, "step": 85 }, { "epoch": 0.3805309734513274, "grad_norm": 0.498046875, "learning_rate": 0.000195208553042855, "loss": 0.5895, "step": 86 }, { "epoch": 0.38495575221238937, "grad_norm": 0.48828125, "learning_rate": 0.00019506122476949981, "loss": 0.5744, "step": 87 }, { "epoch": 0.3893805309734513, "grad_norm": 0.5078125, "learning_rate": 0.00019491172293638968, "loss": 0.5537, "step": 88 }, { "epoch": 0.3938053097345133, "grad_norm": 0.50390625, "learning_rate": 0.00019476005096186015, "loss": 0.5748, "step": 89 }, { "epoch": 0.39823008849557523, "grad_norm": 0.486328125, "learning_rate": 0.00019460621231386676, "loss": 0.5511, "step": 90 }, { "epoch": 0.4026548672566372, "grad_norm": 0.470703125, "learning_rate": 0.00019445021050990571, "loss": 0.5636, "step": 91 }, { "epoch": 0.40707964601769914, "grad_norm": 0.482421875, "learning_rate": 0.00019429204911693333, "loss": 0.5649, "step": 92 }, { "epoch": 0.41150442477876104, "grad_norm": 0.49609375, "learning_rate": 0.00019413173175128473, "loss": 0.5468, "step": 93 }, { "epoch": 0.415929203539823, "grad_norm": 0.45703125, "learning_rate": 0.00019396926207859084, "loss": 0.5347, "step": 94 }, { "epoch": 0.42035398230088494, "grad_norm": 0.5078125, "learning_rate": 0.00019380464381369493, "loss": 0.5667, "step": 95 }, { "epoch": 0.4247787610619469, "grad_norm": 0.490234375, "learning_rate": 0.0001936378807205673, "loss": 0.5644, "step": 96 }, { "epoch": 0.42920353982300885, "grad_norm": 0.4765625, "learning_rate": 0.00019346897661221956, "loss": 0.5384, "step": 97 }, { "epoch": 0.4336283185840708, "grad_norm": 0.47265625, "learning_rate": 0.00019329793535061723, "loss": 0.5569, "step": 98 }, { "epoch": 0.43805309734513276, "grad_norm": 0.48046875, "learning_rate": 0.0001931247608465915, "loss": 0.5494, "step": 99 }, { "epoch": 0.4424778761061947, "grad_norm": 0.427734375, "learning_rate": 0.00019294945705974973, "loss": 0.536, "step": 100 }, { "epoch": 0.4469026548672566, "grad_norm": 0.447265625, "learning_rate": 0.00019277202799838518, "loss": 0.5319, "step": 101 }, { "epoch": 0.45132743362831856, "grad_norm": 0.4921875, "learning_rate": 0.000192592477719385, "loss": 0.5649, "step": 102 }, { "epoch": 0.4557522123893805, "grad_norm": 0.474609375, "learning_rate": 0.00019241081032813772, "loss": 0.5652, "step": 103 }, { "epoch": 0.46017699115044247, "grad_norm": 0.462890625, "learning_rate": 0.00019222702997843927, "loss": 0.5222, "step": 104 }, { "epoch": 0.4646017699115044, "grad_norm": 0.4765625, "learning_rate": 0.00019204114087239806, "loss": 0.551, "step": 105 }, { "epoch": 0.4690265486725664, "grad_norm": 0.4921875, "learning_rate": 0.00019185314726033893, "loss": 0.555, "step": 106 }, { "epoch": 0.47345132743362833, "grad_norm": 0.470703125, "learning_rate": 0.00019166305344070578, "loss": 0.5404, "step": 107 }, { "epoch": 0.4778761061946903, "grad_norm": 0.494140625, "learning_rate": 0.0001914708637599636, "loss": 0.5462, "step": 108 }, { "epoch": 0.4823008849557522, "grad_norm": 0.79296875, "learning_rate": 0.0001912765826124987, "loss": 0.5532, "step": 109 }, { "epoch": 0.48672566371681414, "grad_norm": 0.55078125, "learning_rate": 0.0001910802144405186, "loss": 0.5417, "step": 110 }, { "epoch": 0.4911504424778761, "grad_norm": 0.5, "learning_rate": 0.0001908817637339503, "loss": 0.5418, "step": 111 }, { "epoch": 0.49557522123893805, "grad_norm": 0.515625, "learning_rate": 0.00019068123503033753, "loss": 0.5362, "step": 112 }, { "epoch": 0.5, "grad_norm": 0.5, "learning_rate": 0.00019047863291473717, "loss": 0.5512, "step": 113 }, { "epoch": 0.504424778761062, "grad_norm": 0.51171875, "learning_rate": 0.0001902739620196143, "loss": 0.5541, "step": 114 }, { "epoch": 0.5088495575221239, "grad_norm": 0.5390625, "learning_rate": 0.00019006722702473629, "loss": 0.5138, "step": 115 }, { "epoch": 0.5132743362831859, "grad_norm": 0.490234375, "learning_rate": 0.00018985843265706588, "loss": 0.5457, "step": 116 }, { "epoch": 0.5176991150442478, "grad_norm": 0.5390625, "learning_rate": 0.000189647583690653, "loss": 0.5161, "step": 117 }, { "epoch": 0.5221238938053098, "grad_norm": 0.48046875, "learning_rate": 0.0001894346849465257, "loss": 0.5442, "step": 118 }, { "epoch": 0.5265486725663717, "grad_norm": 0.4375, "learning_rate": 0.0001892197412925798, "loss": 0.5079, "step": 119 }, { "epoch": 0.5309734513274337, "grad_norm": 0.48828125, "learning_rate": 0.00018900275764346768, "loss": 0.4962, "step": 120 }, { "epoch": 0.5353982300884956, "grad_norm": 0.5078125, "learning_rate": 0.00018878373896048594, "loss": 0.5248, "step": 121 }, { "epoch": 0.5398230088495575, "grad_norm": 0.470703125, "learning_rate": 0.0001885626902514618, "loss": 0.5247, "step": 122 }, { "epoch": 0.5442477876106194, "grad_norm": 0.498046875, "learning_rate": 0.00018833961657063885, "loss": 0.5397, "step": 123 }, { "epoch": 0.5486725663716814, "grad_norm": 0.48046875, "learning_rate": 0.0001881145230185612, "loss": 0.5146, "step": 124 }, { "epoch": 0.5530973451327433, "grad_norm": 0.46875, "learning_rate": 0.00018788741474195706, "loss": 0.5654, "step": 125 }, { "epoch": 0.5575221238938053, "grad_norm": 0.45703125, "learning_rate": 0.00018765829693362095, "loss": 0.52, "step": 126 }, { "epoch": 0.5619469026548672, "grad_norm": 0.423828125, "learning_rate": 0.0001874271748322951, "loss": 0.5543, "step": 127 }, { "epoch": 0.5663716814159292, "grad_norm": 0.44140625, "learning_rate": 0.00018719405372254948, "loss": 0.5199, "step": 128 }, { "epoch": 0.5707964601769911, "grad_norm": 0.4765625, "learning_rate": 0.0001869589389346611, "loss": 0.5064, "step": 129 }, { "epoch": 0.5752212389380531, "grad_norm": 0.4921875, "learning_rate": 0.00018672183584449216, "loss": 0.5137, "step": 130 }, { "epoch": 0.5796460176991151, "grad_norm": 0.4921875, "learning_rate": 0.00018648274987336704, "loss": 0.5058, "step": 131 }, { "epoch": 0.584070796460177, "grad_norm": 0.458984375, "learning_rate": 0.00018624168648794832, "loss": 0.5264, "step": 132 }, { "epoch": 0.588495575221239, "grad_norm": 0.4296875, "learning_rate": 0.00018599865120011192, "loss": 0.5229, "step": 133 }, { "epoch": 0.5929203539823009, "grad_norm": 0.470703125, "learning_rate": 0.00018575364956682094, "loss": 0.546, "step": 134 }, { "epoch": 0.5973451327433629, "grad_norm": 0.48046875, "learning_rate": 0.00018550668718999872, "loss": 0.5311, "step": 135 }, { "epoch": 0.6017699115044248, "grad_norm": 0.47265625, "learning_rate": 0.00018525776971640062, "loss": 0.5043, "step": 136 }, { "epoch": 0.6061946902654868, "grad_norm": 0.46875, "learning_rate": 0.00018500690283748504, "loss": 0.4815, "step": 137 }, { "epoch": 0.6106194690265486, "grad_norm": 0.474609375, "learning_rate": 0.00018475409228928312, "loss": 0.517, "step": 138 }, { "epoch": 0.6150442477876106, "grad_norm": 0.5859375, "learning_rate": 0.0001844993438522678, "loss": 0.513, "step": 139 }, { "epoch": 0.6194690265486725, "grad_norm": 0.53125, "learning_rate": 0.00018424266335122152, "loss": 0.5296, "step": 140 }, { "epoch": 0.6238938053097345, "grad_norm": 0.5, "learning_rate": 0.000183984056655103, "loss": 0.5186, "step": 141 }, { "epoch": 0.6283185840707964, "grad_norm": 0.4921875, "learning_rate": 0.0001837235296769131, "loss": 0.5239, "step": 142 }, { "epoch": 0.6327433628318584, "grad_norm": 0.4765625, "learning_rate": 0.00018346108837355972, "loss": 0.5024, "step": 143 }, { "epoch": 0.6371681415929203, "grad_norm": 0.45703125, "learning_rate": 0.0001831967387457214, "loss": 0.5144, "step": 144 }, { "epoch": 0.6415929203539823, "grad_norm": 0.484375, "learning_rate": 0.00018293048683771024, "loss": 0.5342, "step": 145 }, { "epoch": 0.6460176991150443, "grad_norm": 0.462890625, "learning_rate": 0.00018266233873733375, "loss": 0.5143, "step": 146 }, { "epoch": 0.6504424778761062, "grad_norm": 0.439453125, "learning_rate": 0.00018239230057575542, "loss": 0.5271, "step": 147 }, { "epoch": 0.6548672566371682, "grad_norm": 0.46484375, "learning_rate": 0.00018212037852735486, "loss": 0.5185, "step": 148 }, { "epoch": 0.6592920353982301, "grad_norm": 0.435546875, "learning_rate": 0.00018184657880958635, "loss": 0.5237, "step": 149 }, { "epoch": 0.6637168141592921, "grad_norm": 0.494140625, "learning_rate": 0.00018157090768283678, "loss": 0.5137, "step": 150 }, { "epoch": 0.668141592920354, "grad_norm": 0.482421875, "learning_rate": 0.00018129337145028255, "loss": 0.5284, "step": 151 }, { "epoch": 0.672566371681416, "grad_norm": 0.431640625, "learning_rate": 0.00018101397645774539, "loss": 0.5252, "step": 152 }, { "epoch": 0.6769911504424779, "grad_norm": 0.435546875, "learning_rate": 0.00018073272909354727, "loss": 0.5105, "step": 153 }, { "epoch": 0.6814159292035398, "grad_norm": 0.47265625, "learning_rate": 0.00018044963578836435, "loss": 0.5384, "step": 154 }, { "epoch": 0.6858407079646017, "grad_norm": 0.4140625, "learning_rate": 0.00018016470301507995, "loss": 0.5086, "step": 155 }, { "epoch": 0.6902654867256637, "grad_norm": 0.435546875, "learning_rate": 0.00017987793728863651, "loss": 0.5227, "step": 156 }, { "epoch": 0.6946902654867256, "grad_norm": 0.44921875, "learning_rate": 0.00017958934516588667, "loss": 0.4982, "step": 157 }, { "epoch": 0.6991150442477876, "grad_norm": 0.44921875, "learning_rate": 0.00017929893324544332, "loss": 0.5109, "step": 158 }, { "epoch": 0.7035398230088495, "grad_norm": 0.439453125, "learning_rate": 0.00017900670816752874, "loss": 0.5242, "step": 159 }, { "epoch": 0.7079646017699115, "grad_norm": 0.4453125, "learning_rate": 0.00017871267661382278, "loss": 0.5019, "step": 160 }, { "epoch": 0.7123893805309734, "grad_norm": 0.40234375, "learning_rate": 0.00017841684530731005, "loss": 0.5048, "step": 161 }, { "epoch": 0.7168141592920354, "grad_norm": 0.392578125, "learning_rate": 0.0001781192210121262, "loss": 0.4957, "step": 162 }, { "epoch": 0.7212389380530974, "grad_norm": 0.455078125, "learning_rate": 0.00017781981053340337, "loss": 0.5035, "step": 163 }, { "epoch": 0.7256637168141593, "grad_norm": 0.451171875, "learning_rate": 0.00017751862071711442, "loss": 0.5161, "step": 164 }, { "epoch": 0.7300884955752213, "grad_norm": 0.42578125, "learning_rate": 0.00017721565844991643, "loss": 0.4838, "step": 165 }, { "epoch": 0.7345132743362832, "grad_norm": 0.46484375, "learning_rate": 0.00017691093065899346, "loss": 0.5135, "step": 166 }, { "epoch": 0.7389380530973452, "grad_norm": 0.462890625, "learning_rate": 0.0001766044443118978, "loss": 0.5174, "step": 167 }, { "epoch": 0.7433628318584071, "grad_norm": 0.41796875, "learning_rate": 0.00017629620641639103, "loss": 0.4982, "step": 168 }, { "epoch": 0.7477876106194691, "grad_norm": 0.494140625, "learning_rate": 0.0001759862240202834, "loss": 0.4902, "step": 169 }, { "epoch": 0.7522123893805309, "grad_norm": 0.451171875, "learning_rate": 0.00017567450421127306, "loss": 0.4858, "step": 170 }, { "epoch": 0.7566371681415929, "grad_norm": 0.4453125, "learning_rate": 0.0001753610541167838, "loss": 0.5178, "step": 171 }, { "epoch": 0.7610619469026548, "grad_norm": 0.453125, "learning_rate": 0.00017504588090380197, "loss": 0.5209, "step": 172 }, { "epoch": 0.7654867256637168, "grad_norm": 0.435546875, "learning_rate": 0.00017472899177871297, "loss": 0.5042, "step": 173 }, { "epoch": 0.7699115044247787, "grad_norm": 0.4609375, "learning_rate": 0.00017441039398713608, "loss": 0.539, "step": 174 }, { "epoch": 0.7743362831858407, "grad_norm": 0.423828125, "learning_rate": 0.00017409009481375904, "loss": 0.5053, "step": 175 }, { "epoch": 0.7787610619469026, "grad_norm": 0.439453125, "learning_rate": 0.0001737681015821714, "loss": 0.5357, "step": 176 }, { "epoch": 0.7831858407079646, "grad_norm": 0.44921875, "learning_rate": 0.00017344442165469714, "loss": 0.5108, "step": 177 }, { "epoch": 0.7876106194690266, "grad_norm": 0.435546875, "learning_rate": 0.00017311906243222614, "loss": 0.4909, "step": 178 }, { "epoch": 0.7920353982300885, "grad_norm": 0.443359375, "learning_rate": 0.0001727920313540452, "loss": 0.506, "step": 179 }, { "epoch": 0.7964601769911505, "grad_norm": 0.455078125, "learning_rate": 0.00017246333589766787, "loss": 0.5186, "step": 180 }, { "epoch": 0.8008849557522124, "grad_norm": 0.439453125, "learning_rate": 0.00017213298357866326, "loss": 0.4941, "step": 181 }, { "epoch": 0.8053097345132744, "grad_norm": 0.455078125, "learning_rate": 0.00017180098195048458, "loss": 0.4811, "step": 182 }, { "epoch": 0.8097345132743363, "grad_norm": 0.435546875, "learning_rate": 0.00017146733860429612, "loss": 0.5137, "step": 183 }, { "epoch": 0.8141592920353983, "grad_norm": 0.455078125, "learning_rate": 0.00017113206116879982, "loss": 0.5152, "step": 184 }, { "epoch": 0.8185840707964602, "grad_norm": 0.427734375, "learning_rate": 0.00017079515731006085, "loss": 0.5052, "step": 185 }, { "epoch": 0.8230088495575221, "grad_norm": 0.431640625, "learning_rate": 0.00017045663473133215, "loss": 0.5018, "step": 186 }, { "epoch": 0.827433628318584, "grad_norm": 0.4765625, "learning_rate": 0.00017011650117287866, "loss": 0.5055, "step": 187 }, { "epoch": 0.831858407079646, "grad_norm": 0.416015625, "learning_rate": 0.00016977476441179992, "loss": 0.5109, "step": 188 }, { "epoch": 0.8362831858407079, "grad_norm": 0.4140625, "learning_rate": 0.00016943143226185253, "loss": 0.4591, "step": 189 }, { "epoch": 0.8407079646017699, "grad_norm": 0.498046875, "learning_rate": 0.00016908651257327138, "loss": 0.5182, "step": 190 }, { "epoch": 0.8451327433628318, "grad_norm": 0.431640625, "learning_rate": 0.00016874001323259011, "loss": 0.4982, "step": 191 }, { "epoch": 0.8495575221238938, "grad_norm": 0.4296875, "learning_rate": 0.00016839194216246108, "loss": 0.4869, "step": 192 }, { "epoch": 0.8539823008849557, "grad_norm": 0.451171875, "learning_rate": 0.0001680423073214737, "loss": 0.4933, "step": 193 }, { "epoch": 0.8584070796460177, "grad_norm": 0.408203125, "learning_rate": 0.00016769111670397296, "loss": 0.5087, "step": 194 }, { "epoch": 0.8628318584070797, "grad_norm": 0.455078125, "learning_rate": 0.00016733837833987633, "loss": 0.5372, "step": 195 }, { "epoch": 0.8672566371681416, "grad_norm": 0.43359375, "learning_rate": 0.0001669841002944903, "loss": 0.5232, "step": 196 }, { "epoch": 0.8716814159292036, "grad_norm": 0.390625, "learning_rate": 0.00016662829066832596, "loss": 0.5069, "step": 197 }, { "epoch": 0.8761061946902655, "grad_norm": 0.42578125, "learning_rate": 0.00016627095759691362, "loss": 0.5029, "step": 198 }, { "epoch": 0.8805309734513275, "grad_norm": 0.42578125, "learning_rate": 0.0001659121092506171, "loss": 0.5079, "step": 199 }, { "epoch": 0.8849557522123894, "grad_norm": 0.419921875, "learning_rate": 0.00016555175383444655, "loss": 0.4895, "step": 200 }, { "epoch": 0.8893805309734514, "grad_norm": 0.443359375, "learning_rate": 0.00016518989958787126, "loss": 0.5173, "step": 201 }, { "epoch": 0.8938053097345132, "grad_norm": 0.42578125, "learning_rate": 0.00016482655478463083, "loss": 0.4811, "step": 202 }, { "epoch": 0.8982300884955752, "grad_norm": 0.412109375, "learning_rate": 0.00016446172773254629, "loss": 0.5187, "step": 203 }, { "epoch": 0.9026548672566371, "grad_norm": 0.4296875, "learning_rate": 0.00016409542677333006, "loss": 0.5053, "step": 204 }, { "epoch": 0.9070796460176991, "grad_norm": 0.431640625, "learning_rate": 0.0001637276602823952, "loss": 0.5353, "step": 205 }, { "epoch": 0.911504424778761, "grad_norm": 0.4453125, "learning_rate": 0.00016335843666866388, "loss": 0.4827, "step": 206 }, { "epoch": 0.915929203539823, "grad_norm": 0.4140625, "learning_rate": 0.00016298776437437523, "loss": 0.4936, "step": 207 }, { "epoch": 0.9203539823008849, "grad_norm": 0.443359375, "learning_rate": 0.0001626156518748922, "loss": 0.5207, "step": 208 }, { "epoch": 0.9247787610619469, "grad_norm": 0.423828125, "learning_rate": 0.0001622421076785077, "loss": 0.4725, "step": 209 }, { "epoch": 0.9292035398230089, "grad_norm": 0.408203125, "learning_rate": 0.00016186714032625035, "loss": 0.4816, "step": 210 }, { "epoch": 0.9336283185840708, "grad_norm": 0.419921875, "learning_rate": 0.00016149075839168886, "loss": 0.5515, "step": 211 }, { "epoch": 0.9380530973451328, "grad_norm": 0.41796875, "learning_rate": 0.0001611129704807362, "loss": 0.5059, "step": 212 }, { "epoch": 0.9424778761061947, "grad_norm": 0.443359375, "learning_rate": 0.0001607337852314527, "loss": 0.496, "step": 213 }, { "epoch": 0.9469026548672567, "grad_norm": 0.408203125, "learning_rate": 0.0001603532113138487, "loss": 0.484, "step": 214 }, { "epoch": 0.9513274336283186, "grad_norm": 0.41015625, "learning_rate": 0.00015997125742968617, "loss": 0.5204, "step": 215 }, { "epoch": 0.9557522123893806, "grad_norm": 0.40625, "learning_rate": 0.0001595879323122798, "loss": 0.4906, "step": 216 }, { "epoch": 0.9601769911504425, "grad_norm": 0.416015625, "learning_rate": 0.00015920324472629732, "loss": 0.492, "step": 217 }, { "epoch": 0.9646017699115044, "grad_norm": 0.412109375, "learning_rate": 0.00015881720346755905, "loss": 0.4574, "step": 218 }, { "epoch": 0.9690265486725663, "grad_norm": 0.4296875, "learning_rate": 0.00015842981736283686, "loss": 0.5018, "step": 219 }, { "epoch": 0.9734513274336283, "grad_norm": 0.419921875, "learning_rate": 0.00015804109526965232, "loss": 0.4766, "step": 220 }, { "epoch": 0.9778761061946902, "grad_norm": 0.384765625, "learning_rate": 0.0001576510460760741, "loss": 0.4928, "step": 221 }, { "epoch": 0.9823008849557522, "grad_norm": 0.400390625, "learning_rate": 0.0001572596787005149, "loss": 0.4919, "step": 222 }, { "epoch": 0.9867256637168141, "grad_norm": 0.41796875, "learning_rate": 0.00015686700209152738, "loss": 0.4737, "step": 223 }, { "epoch": 0.9911504424778761, "grad_norm": 0.423828125, "learning_rate": 0.00015647302522759962, "loss": 0.5073, "step": 224 }, { "epoch": 0.995575221238938, "grad_norm": 0.44140625, "learning_rate": 0.00015607775711694977, "loss": 0.4684, "step": 225 }, { "epoch": 1.0, "grad_norm": 0.466796875, "learning_rate": 0.0001556812067973203, "loss": 0.4789, "step": 226 }, { "epoch": 1.0, "eval_loss": 0.49126777052879333, "eval_runtime": 5.6505, "eval_samples_per_second": 148.306, "eval_steps_per_second": 18.583, "step": 226 }, { "epoch": 1.0044247787610618, "grad_norm": 0.4609375, "learning_rate": 0.00015528338333577101, "loss": 0.4303, "step": 227 }, { "epoch": 1.008849557522124, "grad_norm": 0.474609375, "learning_rate": 0.00015488429582847192, "loss": 0.4457, "step": 228 }, { "epoch": 1.0132743362831858, "grad_norm": 0.4921875, "learning_rate": 0.00015448395340049537, "loss": 0.4697, "step": 229 }, { "epoch": 1.0176991150442478, "grad_norm": 0.470703125, "learning_rate": 0.0001540823652056071, "loss": 0.4294, "step": 230 }, { "epoch": 1.0221238938053097, "grad_norm": 0.400390625, "learning_rate": 0.0001536795404260572, "loss": 0.4192, "step": 231 }, { "epoch": 1.0265486725663717, "grad_norm": 0.435546875, "learning_rate": 0.00015327548827237007, "loss": 0.4449, "step": 232 }, { "epoch": 1.0309734513274336, "grad_norm": 0.408203125, "learning_rate": 0.0001528702179831338, "loss": 0.4394, "step": 233 }, { "epoch": 1.0353982300884956, "grad_norm": 0.41015625, "learning_rate": 0.00015246373882478898, "loss": 0.435, "step": 234 }, { "epoch": 1.0398230088495575, "grad_norm": 0.42578125, "learning_rate": 0.0001520560600914168, "loss": 0.4531, "step": 235 }, { "epoch": 1.0442477876106195, "grad_norm": 0.4140625, "learning_rate": 0.00015164719110452652, "loss": 0.4443, "step": 236 }, { "epoch": 1.0486725663716814, "grad_norm": 0.419921875, "learning_rate": 0.0001512371412128424, "loss": 0.4459, "step": 237 }, { "epoch": 1.0530973451327434, "grad_norm": 0.431640625, "learning_rate": 0.00015082591979208976, "loss": 0.4355, "step": 238 }, { "epoch": 1.0575221238938053, "grad_norm": 0.474609375, "learning_rate": 0.00015041353624478093, "loss": 0.443, "step": 239 }, { "epoch": 1.0619469026548674, "grad_norm": 0.4140625, "learning_rate": 0.00015000000000000001, "loss": 0.4295, "step": 240 }, { "epoch": 1.0663716814159292, "grad_norm": 0.384765625, "learning_rate": 0.0001495853205131873, "loss": 0.4249, "step": 241 }, { "epoch": 1.0707964601769913, "grad_norm": 0.431640625, "learning_rate": 0.00014916950726592322, "loss": 0.4464, "step": 242 }, { "epoch": 1.075221238938053, "grad_norm": 0.453125, "learning_rate": 0.00014875256976571135, "loss": 0.4404, "step": 243 }, { "epoch": 1.079646017699115, "grad_norm": 0.42578125, "learning_rate": 0.00014833451754576123, "loss": 0.4294, "step": 244 }, { "epoch": 1.084070796460177, "grad_norm": 0.427734375, "learning_rate": 0.00014791536016477022, "loss": 0.4172, "step": 245 }, { "epoch": 1.0884955752212389, "grad_norm": 0.43359375, "learning_rate": 0.00014749510720670506, "loss": 0.4333, "step": 246 }, { "epoch": 1.092920353982301, "grad_norm": 0.416015625, "learning_rate": 0.00014707376828058263, "loss": 0.4522, "step": 247 }, { "epoch": 1.0973451327433628, "grad_norm": 0.427734375, "learning_rate": 0.00014665135302025035, "loss": 0.46, "step": 248 }, { "epoch": 1.1017699115044248, "grad_norm": 0.40234375, "learning_rate": 0.00014622787108416584, "loss": 0.42, "step": 249 }, { "epoch": 1.1061946902654867, "grad_norm": 0.43359375, "learning_rate": 0.00014580333215517607, "loss": 0.4367, "step": 250 }, { "epoch": 1.1106194690265487, "grad_norm": 0.40625, "learning_rate": 0.000145377745940296, "loss": 0.4305, "step": 251 }, { "epoch": 1.1150442477876106, "grad_norm": 0.384765625, "learning_rate": 0.00014495112217048658, "loss": 0.4518, "step": 252 }, { "epoch": 1.1194690265486726, "grad_norm": 0.44140625, "learning_rate": 0.00014452347060043237, "loss": 0.4227, "step": 253 }, { "epoch": 1.1238938053097345, "grad_norm": 0.431640625, "learning_rate": 0.00014409480100831834, "loss": 0.4336, "step": 254 }, { "epoch": 1.1283185840707965, "grad_norm": 0.40625, "learning_rate": 0.0001436651231956064, "loss": 0.4077, "step": 255 }, { "epoch": 1.1327433628318584, "grad_norm": 0.423828125, "learning_rate": 0.00014323444698681126, "loss": 0.426, "step": 256 }, { "epoch": 1.1371681415929205, "grad_norm": 0.421875, "learning_rate": 0.0001428027822292758, "loss": 0.4403, "step": 257 }, { "epoch": 1.1415929203539823, "grad_norm": 0.4609375, "learning_rate": 0.0001423701387929459, "loss": 0.443, "step": 258 }, { "epoch": 1.1460176991150441, "grad_norm": 0.4296875, "learning_rate": 0.0001419365265701448, "loss": 0.438, "step": 259 }, { "epoch": 1.1504424778761062, "grad_norm": 0.419921875, "learning_rate": 0.00014150195547534686, "loss": 0.4318, "step": 260 }, { "epoch": 1.154867256637168, "grad_norm": 0.408203125, "learning_rate": 0.0001410664354449509, "loss": 0.4413, "step": 261 }, { "epoch": 1.1592920353982301, "grad_norm": 0.392578125, "learning_rate": 0.00014062997643705306, "loss": 0.4325, "step": 262 }, { "epoch": 1.163716814159292, "grad_norm": 0.39453125, "learning_rate": 0.00014019258843121893, "loss": 0.4164, "step": 263 }, { "epoch": 1.168141592920354, "grad_norm": 0.431640625, "learning_rate": 0.0001397542814282556, "loss": 0.4393, "step": 264 }, { "epoch": 1.1725663716814159, "grad_norm": 0.423828125, "learning_rate": 0.00013931506544998283, "loss": 0.4346, "step": 265 }, { "epoch": 1.176991150442478, "grad_norm": 0.439453125, "learning_rate": 0.00013887495053900397, "loss": 0.4294, "step": 266 }, { "epoch": 1.1814159292035398, "grad_norm": 0.416015625, "learning_rate": 0.00013843394675847634, "loss": 0.4404, "step": 267 }, { "epoch": 1.1858407079646018, "grad_norm": 0.388671875, "learning_rate": 0.00013799206419188103, "loss": 0.4198, "step": 268 }, { "epoch": 1.1902654867256637, "grad_norm": 0.40234375, "learning_rate": 0.00013754931294279263, "loss": 0.4158, "step": 269 }, { "epoch": 1.1946902654867257, "grad_norm": 0.3984375, "learning_rate": 0.00013710570313464778, "loss": 0.4234, "step": 270 }, { "epoch": 1.1991150442477876, "grad_norm": 0.4140625, "learning_rate": 0.0001366612449105141, "loss": 0.4233, "step": 271 }, { "epoch": 1.2035398230088497, "grad_norm": 0.439453125, "learning_rate": 0.00013621594843285802, "loss": 0.4518, "step": 272 }, { "epoch": 1.2079646017699115, "grad_norm": 0.412109375, "learning_rate": 0.0001357698238833126, "loss": 0.4465, "step": 273 }, { "epoch": 1.2123893805309733, "grad_norm": 0.412109375, "learning_rate": 0.00013532288146244446, "loss": 0.4446, "step": 274 }, { "epoch": 1.2168141592920354, "grad_norm": 0.4140625, "learning_rate": 0.00013487513138952094, "loss": 0.4384, "step": 275 }, { "epoch": 1.2212389380530975, "grad_norm": 0.41015625, "learning_rate": 0.00013442658390227602, "loss": 0.4286, "step": 276 }, { "epoch": 1.2256637168141593, "grad_norm": 0.390625, "learning_rate": 0.00013397724925667657, "loss": 0.43, "step": 277 }, { "epoch": 1.2300884955752212, "grad_norm": 0.38671875, "learning_rate": 0.00013352713772668765, "loss": 0.4293, "step": 278 }, { "epoch": 1.2345132743362832, "grad_norm": 0.39453125, "learning_rate": 0.00013307625960403763, "loss": 0.4326, "step": 279 }, { "epoch": 1.238938053097345, "grad_norm": 0.3671875, "learning_rate": 0.00013262462519798293, "loss": 0.4241, "step": 280 }, { "epoch": 1.2433628318584071, "grad_norm": 0.3984375, "learning_rate": 0.00013217224483507228, "loss": 0.4336, "step": 281 }, { "epoch": 1.247787610619469, "grad_norm": 0.427734375, "learning_rate": 0.00013171912885891063, "loss": 0.4418, "step": 282 }, { "epoch": 1.252212389380531, "grad_norm": 0.416015625, "learning_rate": 0.00013126528762992247, "loss": 0.4258, "step": 283 }, { "epoch": 1.2566371681415929, "grad_norm": 0.37109375, "learning_rate": 0.00013081073152511525, "loss": 0.438, "step": 284 }, { "epoch": 1.261061946902655, "grad_norm": 0.40234375, "learning_rate": 0.00013035547093784186, "loss": 0.4364, "step": 285 }, { "epoch": 1.2654867256637168, "grad_norm": 0.392578125, "learning_rate": 0.00012989951627756304, "loss": 0.409, "step": 286 }, { "epoch": 1.2699115044247788, "grad_norm": 0.416015625, "learning_rate": 0.0001294428779696095, "loss": 0.4448, "step": 287 }, { "epoch": 1.2743362831858407, "grad_norm": 0.41015625, "learning_rate": 0.00012898556645494325, "loss": 0.4497, "step": 288 }, { "epoch": 1.2787610619469025, "grad_norm": 0.404296875, "learning_rate": 0.00012852759218991933, "loss": 0.4188, "step": 289 }, { "epoch": 1.2831858407079646, "grad_norm": 0.388671875, "learning_rate": 0.00012806896564604626, "loss": 0.4226, "step": 290 }, { "epoch": 1.2876106194690267, "grad_norm": 0.427734375, "learning_rate": 0.00012760969730974694, "loss": 0.4281, "step": 291 }, { "epoch": 1.2920353982300885, "grad_norm": 0.4375, "learning_rate": 0.00012714979768211853, "loss": 0.4626, "step": 292 }, { "epoch": 1.2964601769911503, "grad_norm": 0.416015625, "learning_rate": 0.0001266892772786929, "loss": 0.4292, "step": 293 }, { "epoch": 1.3008849557522124, "grad_norm": 0.412109375, "learning_rate": 0.00012622814662919561, "loss": 0.4309, "step": 294 }, { "epoch": 1.3053097345132743, "grad_norm": 0.404296875, "learning_rate": 0.0001257664162773055, "loss": 0.4283, "step": 295 }, { "epoch": 1.3097345132743363, "grad_norm": 0.408203125, "learning_rate": 0.00012530409678041343, "loss": 0.4431, "step": 296 }, { "epoch": 1.3141592920353982, "grad_norm": 0.400390625, "learning_rate": 0.00012484119870938103, "loss": 0.4419, "step": 297 }, { "epoch": 1.3185840707964602, "grad_norm": 0.380859375, "learning_rate": 0.00012437773264829897, "loss": 0.4513, "step": 298 }, { "epoch": 1.323008849557522, "grad_norm": 0.39453125, "learning_rate": 0.00012391370919424485, "loss": 0.4389, "step": 299 }, { "epoch": 1.3274336283185841, "grad_norm": 0.390625, "learning_rate": 0.00012344913895704097, "loss": 0.4495, "step": 300 }, { "epoch": 1.331858407079646, "grad_norm": 0.40234375, "learning_rate": 0.00012298403255901186, "loss": 0.4169, "step": 301 }, { "epoch": 1.336283185840708, "grad_norm": 0.388671875, "learning_rate": 0.00012251840063474108, "loss": 0.4368, "step": 302 }, { "epoch": 1.3407079646017699, "grad_norm": 0.38671875, "learning_rate": 0.00012205225383082843, "loss": 0.4198, "step": 303 }, { "epoch": 1.3451327433628317, "grad_norm": 0.419921875, "learning_rate": 0.00012158560280564626, "loss": 0.4386, "step": 304 }, { "epoch": 1.3495575221238938, "grad_norm": 0.412109375, "learning_rate": 0.00012111845822909596, "loss": 0.4164, "step": 305 }, { "epoch": 1.3539823008849559, "grad_norm": 0.408203125, "learning_rate": 0.00012065083078236374, "loss": 0.4568, "step": 306 }, { "epoch": 1.3584070796460177, "grad_norm": 0.40625, "learning_rate": 0.00012018273115767673, "loss": 0.4283, "step": 307 }, { "epoch": 1.3628318584070795, "grad_norm": 0.416015625, "learning_rate": 0.00011971417005805818, "loss": 0.4254, "step": 308 }, { "epoch": 1.3672566371681416, "grad_norm": 0.4140625, "learning_rate": 0.000119245158197083, "loss": 0.434, "step": 309 }, { "epoch": 1.3716814159292037, "grad_norm": 0.408203125, "learning_rate": 0.00011877570629863266, "loss": 0.4203, "step": 310 }, { "epoch": 1.3761061946902655, "grad_norm": 0.408203125, "learning_rate": 0.00011830582509664995, "loss": 0.4373, "step": 311 }, { "epoch": 1.3805309734513274, "grad_norm": 0.41796875, "learning_rate": 0.00011783552533489372, "loss": 0.4303, "step": 312 }, { "epoch": 1.3849557522123894, "grad_norm": 0.390625, "learning_rate": 0.00011736481776669306, "loss": 0.429, "step": 313 }, { "epoch": 1.3893805309734513, "grad_norm": 0.408203125, "learning_rate": 0.0001168937131547015, "loss": 0.4122, "step": 314 }, { "epoch": 1.3938053097345133, "grad_norm": 0.412109375, "learning_rate": 0.00011642222227065089, "loss": 0.4519, "step": 315 }, { "epoch": 1.3982300884955752, "grad_norm": 0.43359375, "learning_rate": 0.00011595035589510522, "loss": 0.4303, "step": 316 }, { "epoch": 1.4026548672566372, "grad_norm": 0.38671875, "learning_rate": 0.00011547812481721388, "loss": 0.4215, "step": 317 }, { "epoch": 1.407079646017699, "grad_norm": 0.408203125, "learning_rate": 0.00011500553983446527, "loss": 0.4276, "step": 318 }, { "epoch": 1.411504424778761, "grad_norm": 0.427734375, "learning_rate": 0.00011453261175243973, "loss": 0.4249, "step": 319 }, { "epoch": 1.415929203539823, "grad_norm": 0.392578125, "learning_rate": 0.00011405935138456241, "loss": 0.4168, "step": 320 }, { "epoch": 1.420353982300885, "grad_norm": 0.41015625, "learning_rate": 0.0001135857695518563, "loss": 0.4194, "step": 321 }, { "epoch": 1.424778761061947, "grad_norm": 0.4296875, "learning_rate": 0.00011311187708269442, "loss": 0.4154, "step": 322 }, { "epoch": 1.4292035398230087, "grad_norm": 0.412109375, "learning_rate": 0.00011263768481255264, "loss": 0.426, "step": 323 }, { "epoch": 1.4336283185840708, "grad_norm": 0.4140625, "learning_rate": 0.00011216320358376157, "loss": 0.4359, "step": 324 }, { "epoch": 1.4380530973451329, "grad_norm": 0.400390625, "learning_rate": 0.00011168844424525902, "loss": 0.412, "step": 325 }, { "epoch": 1.4424778761061947, "grad_norm": 0.3984375, "learning_rate": 0.00011121341765234146, "loss": 0.4418, "step": 326 }, { "epoch": 1.4469026548672566, "grad_norm": 0.390625, "learning_rate": 0.00011073813466641632, "loss": 0.4193, "step": 327 }, { "epoch": 1.4513274336283186, "grad_norm": 0.416015625, "learning_rate": 0.00011026260615475333, "loss": 0.4161, "step": 328 }, { "epoch": 1.4557522123893805, "grad_norm": 0.384765625, "learning_rate": 0.00010978684299023607, "loss": 0.4367, "step": 329 }, { "epoch": 1.4601769911504425, "grad_norm": 0.400390625, "learning_rate": 0.00010931085605111354, "loss": 0.4334, "step": 330 }, { "epoch": 1.4646017699115044, "grad_norm": 0.384765625, "learning_rate": 0.0001088346562207512, "loss": 0.4069, "step": 331 }, { "epoch": 1.4690265486725664, "grad_norm": 0.40234375, "learning_rate": 0.00010835825438738232, "loss": 0.4183, "step": 332 }, { "epoch": 1.4734513274336283, "grad_norm": 0.40234375, "learning_rate": 0.00010788166144385888, "loss": 0.4275, "step": 333 }, { "epoch": 1.4778761061946903, "grad_norm": 0.42578125, "learning_rate": 0.00010740488828740258, "loss": 0.4342, "step": 334 }, { "epoch": 1.4823008849557522, "grad_norm": 0.375, "learning_rate": 0.00010692794581935566, "loss": 0.4199, "step": 335 }, { "epoch": 1.4867256637168142, "grad_norm": 0.3984375, "learning_rate": 0.00010645084494493165, "loss": 0.4134, "step": 336 }, { "epoch": 1.491150442477876, "grad_norm": 0.404296875, "learning_rate": 0.00010597359657296602, "loss": 0.4057, "step": 337 }, { "epoch": 1.495575221238938, "grad_norm": 0.38671875, "learning_rate": 0.0001054962116156667, "loss": 0.4129, "step": 338 }, { "epoch": 1.5, "grad_norm": 0.392578125, "learning_rate": 0.00010501870098836473, "loss": 0.4337, "step": 339 }, { "epoch": 1.504424778761062, "grad_norm": 0.40234375, "learning_rate": 0.00010454107560926443, "loss": 0.4292, "step": 340 }, { "epoch": 1.508849557522124, "grad_norm": 0.3671875, "learning_rate": 0.00010406334639919403, "loss": 0.4074, "step": 341 }, { "epoch": 1.5132743362831858, "grad_norm": 0.427734375, "learning_rate": 0.00010358552428135575, "loss": 0.4187, "step": 342 }, { "epoch": 1.5176991150442478, "grad_norm": 0.427734375, "learning_rate": 0.0001031076201810762, "loss": 0.4427, "step": 343 }, { "epoch": 1.5221238938053099, "grad_norm": 0.3828125, "learning_rate": 0.00010262964502555643, "loss": 0.4249, "step": 344 }, { "epoch": 1.5265486725663717, "grad_norm": 0.37890625, "learning_rate": 0.00010215160974362223, "loss": 0.4132, "step": 345 }, { "epoch": 1.5309734513274336, "grad_norm": 0.4140625, "learning_rate": 0.00010167352526547416, "loss": 0.4181, "step": 346 }, { "epoch": 1.5353982300884956, "grad_norm": 0.408203125, "learning_rate": 0.00010119540252243755, "loss": 0.4282, "step": 347 }, { "epoch": 1.5398230088495575, "grad_norm": 0.38671875, "learning_rate": 0.00010071725244671282, "loss": 0.4314, "step": 348 }, { "epoch": 1.5442477876106193, "grad_norm": 0.396484375, "learning_rate": 0.00010023908597112514, "loss": 0.4404, "step": 349 }, { "epoch": 1.5486725663716814, "grad_norm": 0.4296875, "learning_rate": 9.976091402887487e-05, "loss": 0.4394, "step": 350 }, { "epoch": 1.5530973451327434, "grad_norm": 0.419921875, "learning_rate": 9.928274755328723e-05, "loss": 0.4282, "step": 351 }, { "epoch": 1.5575221238938053, "grad_norm": 0.392578125, "learning_rate": 9.880459747756247e-05, "loss": 0.4234, "step": 352 }, { "epoch": 1.5619469026548671, "grad_norm": 0.388671875, "learning_rate": 9.83264747345259e-05, "loss": 0.412, "step": 353 }, { "epoch": 1.5663716814159292, "grad_norm": 0.40234375, "learning_rate": 9.784839025637778e-05, "loss": 0.4258, "step": 354 }, { "epoch": 1.5707964601769913, "grad_norm": 0.453125, "learning_rate": 9.737035497444361e-05, "loss": 0.4149, "step": 355 }, { "epoch": 1.575221238938053, "grad_norm": 0.404296875, "learning_rate": 9.689237981892382e-05, "loss": 0.4095, "step": 356 }, { "epoch": 1.579646017699115, "grad_norm": 0.41015625, "learning_rate": 9.641447571864429e-05, "loss": 0.3994, "step": 357 }, { "epoch": 1.584070796460177, "grad_norm": 0.37109375, "learning_rate": 9.593665360080599e-05, "loss": 0.417, "step": 358 }, { "epoch": 1.588495575221239, "grad_norm": 0.408203125, "learning_rate": 9.545892439073562e-05, "loss": 0.4328, "step": 359 }, { "epoch": 1.592920353982301, "grad_norm": 0.4296875, "learning_rate": 9.49812990116353e-05, "loss": 0.4364, "step": 360 }, { "epoch": 1.5973451327433628, "grad_norm": 0.39453125, "learning_rate": 9.450378838433331e-05, "loss": 0.4098, "step": 361 }, { "epoch": 1.6017699115044248, "grad_norm": 0.408203125, "learning_rate": 9.4026403427034e-05, "loss": 0.4155, "step": 362 }, { "epoch": 1.606194690265487, "grad_norm": 0.375, "learning_rate": 9.354915505506839e-05, "loss": 0.4338, "step": 363 }, { "epoch": 1.6106194690265485, "grad_norm": 0.408203125, "learning_rate": 9.307205418064437e-05, "loss": 0.4326, "step": 364 }, { "epoch": 1.6150442477876106, "grad_norm": 0.396484375, "learning_rate": 9.259511171259746e-05, "loss": 0.4167, "step": 365 }, { "epoch": 1.6194690265486726, "grad_norm": 0.40625, "learning_rate": 9.211833855614114e-05, "loss": 0.4413, "step": 366 }, { "epoch": 1.6238938053097345, "grad_norm": 0.376953125, "learning_rate": 9.164174561261771e-05, "loss": 0.4223, "step": 367 }, { "epoch": 1.6283185840707963, "grad_norm": 0.388671875, "learning_rate": 9.116534377924883e-05, "loss": 0.4171, "step": 368 }, { "epoch": 1.6327433628318584, "grad_norm": 0.384765625, "learning_rate": 9.06891439488865e-05, "loss": 0.4042, "step": 369 }, { "epoch": 1.6371681415929205, "grad_norm": 0.39453125, "learning_rate": 9.021315700976395e-05, "loss": 0.4168, "step": 370 }, { "epoch": 1.6415929203539823, "grad_norm": 0.3828125, "learning_rate": 8.973739384524674e-05, "loss": 0.4121, "step": 371 }, { "epoch": 1.6460176991150441, "grad_norm": 0.357421875, "learning_rate": 8.92618653335837e-05, "loss": 0.4084, "step": 372 }, { "epoch": 1.6504424778761062, "grad_norm": 0.390625, "learning_rate": 8.878658234765858e-05, "loss": 0.41, "step": 373 }, { "epoch": 1.6548672566371683, "grad_norm": 0.369140625, "learning_rate": 8.831155575474102e-05, "loss": 0.4131, "step": 374 }, { "epoch": 1.6592920353982301, "grad_norm": 0.37109375, "learning_rate": 8.783679641623845e-05, "loss": 0.4277, "step": 375 }, { "epoch": 1.663716814159292, "grad_norm": 0.37890625, "learning_rate": 8.73623151874474e-05, "loss": 0.4201, "step": 376 }, { "epoch": 1.668141592920354, "grad_norm": 0.39453125, "learning_rate": 8.688812291730563e-05, "loss": 0.417, "step": 377 }, { "epoch": 1.672566371681416, "grad_norm": 0.37109375, "learning_rate": 8.641423044814374e-05, "loss": 0.4111, "step": 378 }, { "epoch": 1.676991150442478, "grad_norm": 0.37109375, "learning_rate": 8.59406486154376e-05, "loss": 0.404, "step": 379 }, { "epoch": 1.6814159292035398, "grad_norm": 0.40625, "learning_rate": 8.54673882475603e-05, "loss": 0.428, "step": 380 }, { "epoch": 1.6858407079646018, "grad_norm": 0.396484375, "learning_rate": 8.499446016553474e-05, "loss": 0.4051, "step": 381 }, { "epoch": 1.6902654867256637, "grad_norm": 0.384765625, "learning_rate": 8.452187518278613e-05, "loss": 0.4164, "step": 382 }, { "epoch": 1.6946902654867255, "grad_norm": 0.375, "learning_rate": 8.404964410489485e-05, "loss": 0.3898, "step": 383 }, { "epoch": 1.6991150442477876, "grad_norm": 0.40234375, "learning_rate": 8.357777772934913e-05, "loss": 0.4264, "step": 384 }, { "epoch": 1.7035398230088497, "grad_norm": 0.357421875, "learning_rate": 8.310628684529856e-05, "loss": 0.405, "step": 385 }, { "epoch": 1.7079646017699115, "grad_norm": 0.380859375, "learning_rate": 8.263518223330697e-05, "loss": 0.4261, "step": 386 }, { "epoch": 1.7123893805309733, "grad_norm": 0.376953125, "learning_rate": 8.216447466510631e-05, "loss": 0.4001, "step": 387 }, { "epoch": 1.7168141592920354, "grad_norm": 0.404296875, "learning_rate": 8.169417490335007e-05, "loss": 0.4212, "step": 388 }, { "epoch": 1.7212389380530975, "grad_norm": 0.37109375, "learning_rate": 8.122429370136739e-05, "loss": 0.4097, "step": 389 }, { "epoch": 1.7256637168141593, "grad_norm": 0.375, "learning_rate": 8.075484180291701e-05, "loss": 0.4133, "step": 390 }, { "epoch": 1.7300884955752212, "grad_norm": 0.3984375, "learning_rate": 8.028582994194185e-05, "loss": 0.4104, "step": 391 }, { "epoch": 1.7345132743362832, "grad_norm": 0.390625, "learning_rate": 7.981726884232328e-05, "loss": 0.4147, "step": 392 }, { "epoch": 1.7389380530973453, "grad_norm": 0.408203125, "learning_rate": 7.934916921763628e-05, "loss": 0.4097, "step": 393 }, { "epoch": 1.7433628318584071, "grad_norm": 0.39453125, "learning_rate": 7.888154177090405e-05, "loss": 0.4126, "step": 394 }, { "epoch": 1.747787610619469, "grad_norm": 0.390625, "learning_rate": 7.841439719435377e-05, "loss": 0.414, "step": 395 }, { "epoch": 1.752212389380531, "grad_norm": 0.388671875, "learning_rate": 7.79477461691716e-05, "loss": 0.4348, "step": 396 }, { "epoch": 1.7566371681415929, "grad_norm": 0.38671875, "learning_rate": 7.748159936525896e-05, "loss": 0.4122, "step": 397 }, { "epoch": 1.7610619469026547, "grad_norm": 0.380859375, "learning_rate": 7.701596744098818e-05, "loss": 0.4134, "step": 398 }, { "epoch": 1.7654867256637168, "grad_norm": 0.375, "learning_rate": 7.655086104295904e-05, "loss": 0.406, "step": 399 }, { "epoch": 1.7699115044247788, "grad_norm": 0.39453125, "learning_rate": 7.608629080575518e-05, "loss": 0.4179, "step": 400 }, { "epoch": 1.7743362831858407, "grad_norm": 0.3828125, "learning_rate": 7.562226735170106e-05, "loss": 0.427, "step": 401 }, { "epoch": 1.7787610619469025, "grad_norm": 0.408203125, "learning_rate": 7.5158801290619e-05, "loss": 0.4255, "step": 402 }, { "epoch": 1.7831858407079646, "grad_norm": 0.384765625, "learning_rate": 7.469590321958662e-05, "loss": 0.407, "step": 403 }, { "epoch": 1.7876106194690267, "grad_norm": 0.388671875, "learning_rate": 7.423358372269455e-05, "loss": 0.4208, "step": 404 }, { "epoch": 1.7920353982300885, "grad_norm": 0.37109375, "learning_rate": 7.377185337080442e-05, "loss": 0.4167, "step": 405 }, { "epoch": 1.7964601769911503, "grad_norm": 0.37890625, "learning_rate": 7.331072272130712e-05, "loss": 0.4207, "step": 406 }, { "epoch": 1.8008849557522124, "grad_norm": 0.384765625, "learning_rate": 7.285020231788149e-05, "loss": 0.4195, "step": 407 }, { "epoch": 1.8053097345132745, "grad_norm": 0.3828125, "learning_rate": 7.239030269025311e-05, "loss": 0.4033, "step": 408 }, { "epoch": 1.8097345132743363, "grad_norm": 0.36328125, "learning_rate": 7.193103435395378e-05, "loss": 0.4012, "step": 409 }, { "epoch": 1.8141592920353982, "grad_norm": 0.384765625, "learning_rate": 7.147240781008068e-05, "loss": 0.4151, "step": 410 }, { "epoch": 1.8185840707964602, "grad_norm": 0.37890625, "learning_rate": 7.101443354505678e-05, "loss": 0.4091, "step": 411 }, { "epoch": 1.823008849557522, "grad_norm": 0.392578125, "learning_rate": 7.055712203039055e-05, "loss": 0.453, "step": 412 }, { "epoch": 1.827433628318584, "grad_norm": 0.400390625, "learning_rate": 7.010048372243698e-05, "loss": 0.4153, "step": 413 }, { "epoch": 1.831858407079646, "grad_norm": 0.3671875, "learning_rate": 6.964452906215815e-05, "loss": 0.3963, "step": 414 }, { "epoch": 1.836283185840708, "grad_norm": 0.373046875, "learning_rate": 6.918926847488476e-05, "loss": 0.4217, "step": 415 }, { "epoch": 1.8407079646017699, "grad_norm": 0.37109375, "learning_rate": 6.873471237007754e-05, "loss": 0.408, "step": 416 }, { "epoch": 1.8451327433628317, "grad_norm": 0.384765625, "learning_rate": 6.82808711410894e-05, "loss": 0.4029, "step": 417 }, { "epoch": 1.8495575221238938, "grad_norm": 0.408203125, "learning_rate": 6.782775516492771e-05, "loss": 0.4386, "step": 418 }, { "epoch": 1.8539823008849559, "grad_norm": 0.373046875, "learning_rate": 6.73753748020171e-05, "loss": 0.4093, "step": 419 }, { "epoch": 1.8584070796460177, "grad_norm": 0.3828125, "learning_rate": 6.69237403959624e-05, "loss": 0.4261, "step": 420 }, { "epoch": 1.8628318584070795, "grad_norm": 0.37109375, "learning_rate": 6.64728622733124e-05, "loss": 0.3903, "step": 421 }, { "epoch": 1.8672566371681416, "grad_norm": 0.388671875, "learning_rate": 6.602275074332345e-05, "loss": 0.4108, "step": 422 }, { "epoch": 1.8716814159292037, "grad_norm": 0.3828125, "learning_rate": 6.5573416097724e-05, "loss": 0.4265, "step": 423 }, { "epoch": 1.8761061946902655, "grad_norm": 0.361328125, "learning_rate": 6.512486861047911e-05, "loss": 0.4117, "step": 424 }, { "epoch": 1.8805309734513274, "grad_norm": 0.384765625, "learning_rate": 6.467711853755558e-05, "loss": 0.4202, "step": 425 }, { "epoch": 1.8849557522123894, "grad_norm": 0.388671875, "learning_rate": 6.423017611668745e-05, "loss": 0.4054, "step": 426 }, { "epoch": 1.8893805309734515, "grad_norm": 0.376953125, "learning_rate": 6.378405156714202e-05, "loss": 0.4023, "step": 427 }, { "epoch": 1.893805309734513, "grad_norm": 0.37109375, "learning_rate": 6.333875508948593e-05, "loss": 0.4138, "step": 428 }, { "epoch": 1.8982300884955752, "grad_norm": 0.384765625, "learning_rate": 6.289429686535226e-05, "loss": 0.4308, "step": 429 }, { "epoch": 1.9026548672566372, "grad_norm": 0.404296875, "learning_rate": 6.245068705720739e-05, "loss": 0.4343, "step": 430 }, { "epoch": 1.907079646017699, "grad_norm": 0.37890625, "learning_rate": 6.200793580811896e-05, "loss": 0.4085, "step": 431 }, { "epoch": 1.911504424778761, "grad_norm": 0.41796875, "learning_rate": 6.15660532415237e-05, "loss": 0.4365, "step": 432 }, { "epoch": 1.915929203539823, "grad_norm": 0.400390625, "learning_rate": 6.112504946099604e-05, "loss": 0.4068, "step": 433 }, { "epoch": 1.920353982300885, "grad_norm": 0.38671875, "learning_rate": 6.0684934550017184e-05, "loss": 0.4157, "step": 434 }, { "epoch": 1.924778761061947, "grad_norm": 0.376953125, "learning_rate": 6.024571857174443e-05, "loss": 0.438, "step": 435 }, { "epoch": 1.9292035398230087, "grad_norm": 0.375, "learning_rate": 5.980741156878109e-05, "loss": 0.41, "step": 436 }, { "epoch": 1.9336283185840708, "grad_norm": 0.365234375, "learning_rate": 5.9370023562946986e-05, "loss": 0.4104, "step": 437 }, { "epoch": 1.9380530973451329, "grad_norm": 0.37890625, "learning_rate": 5.8933564555049105e-05, "loss": 0.4245, "step": 438 }, { "epoch": 1.9424778761061947, "grad_norm": 0.3671875, "learning_rate": 5.849804452465317e-05, "loss": 0.4005, "step": 439 }, { "epoch": 1.9469026548672566, "grad_norm": 0.375, "learning_rate": 5.806347342985521e-05, "loss": 0.3941, "step": 440 }, { "epoch": 1.9513274336283186, "grad_norm": 0.3671875, "learning_rate": 5.7629861207054136e-05, "loss": 0.4167, "step": 441 }, { "epoch": 1.9557522123893807, "grad_norm": 0.3671875, "learning_rate": 5.7197217770724245e-05, "loss": 0.4032, "step": 442 }, { "epoch": 1.9601769911504425, "grad_norm": 0.376953125, "learning_rate": 5.6765553013188766e-05, "loss": 0.408, "step": 443 }, { "epoch": 1.9646017699115044, "grad_norm": 0.3828125, "learning_rate": 5.633487680439361e-05, "loss": 0.4142, "step": 444 }, { "epoch": 1.9690265486725664, "grad_norm": 0.419921875, "learning_rate": 5.5905198991681695e-05, "loss": 0.4044, "step": 445 }, { "epoch": 1.9734513274336283, "grad_norm": 0.38671875, "learning_rate": 5.547652939956764e-05, "loss": 0.3855, "step": 446 }, { "epoch": 1.9778761061946901, "grad_norm": 0.365234375, "learning_rate": 5.5048877829513424e-05, "loss": 0.3921, "step": 447 }, { "epoch": 1.9823008849557522, "grad_norm": 0.39453125, "learning_rate": 5.462225405970401e-05, "loss": 0.4108, "step": 448 }, { "epoch": 1.9867256637168142, "grad_norm": 0.39453125, "learning_rate": 5.419666784482398e-05, "loss": 0.427, "step": 449 }, { "epoch": 1.991150442477876, "grad_norm": 0.396484375, "learning_rate": 5.3772128915834184e-05, "loss": 0.404, "step": 450 }, { "epoch": 1.995575221238938, "grad_norm": 0.408203125, "learning_rate": 5.3348646979749687e-05, "loss": 0.4006, "step": 451 }, { "epoch": 2.0, "grad_norm": 0.47265625, "learning_rate": 5.29262317194174e-05, "loss": 0.4206, "step": 452 }, { "epoch": 2.0, "eval_loss": 0.4563676416873932, "eval_runtime": 5.7045, "eval_samples_per_second": 146.901, "eval_steps_per_second": 18.406, "step": 452 } ], "logging_steps": 1, "max_steps": 678, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.948804377680282e+16, "train_batch_size": 60, "trial_name": null, "trial_params": null }