diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,18253 +1,66927 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.0, - "global_step": 3038, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 9555, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0, - "learning_rate": 2.173913043478261e-07, - "loss": 3.3606, + "epoch": 0.00010465724751439037, + "grad_norm": 62.41193288917299, + "learning_rate": 6.968641114982578e-08, + "loss": 2.0951, "step": 1 }, { - "epoch": 0.0, - "learning_rate": 4.347826086956522e-07, - "loss": 3.1084, + "epoch": 0.00020931449502878074, + "grad_norm": 62.41976397338786, + "learning_rate": 1.3937282229965157e-07, + "loss": 1.9349, "step": 2 }, { - "epoch": 0.0, - "learning_rate": 6.521739130434783e-07, - "loss": 2.9241, + "epoch": 0.0003139717425431711, + "grad_norm": 86.35511116310265, + "learning_rate": 2.090592334494774e-07, + "loss": 2.2482, "step": 3 }, { - "epoch": 0.0, - "learning_rate": 8.695652173913044e-07, - "loss": 2.7249, + "epoch": 0.0004186289900575615, + "grad_norm": 66.12787272745858, + "learning_rate": 2.7874564459930313e-07, + "loss": 1.9908, "step": 4 }, { - "epoch": 0.0, - "learning_rate": 1.0869565217391306e-06, - "loss": 3.0549, + "epoch": 0.0005232862375719519, + "grad_norm": 60.71641111176538, + "learning_rate": 3.4843205574912896e-07, + "loss": 1.9762, "step": 5 }, { - "epoch": 0.0, - "learning_rate": 1.3043478260869566e-06, - "loss": 3.4679, + "epoch": 0.0006279434850863422, + "grad_norm": 56.78291196095233, + "learning_rate": 4.181184668989548e-07, + "loss": 1.949, "step": 6 }, { - "epoch": 0.0, - "learning_rate": 1.521739130434783e-06, - "loss": 2.6214, + "epoch": 0.0007326007326007326, + "grad_norm": 38.93880117457696, + "learning_rate": 4.878048780487805e-07, + "loss": 1.9651, "step": 7 }, { - "epoch": 0.01, - "learning_rate": 1.7391304347826088e-06, - "loss": 3.1068, + "epoch": 0.000837257980115123, + "grad_norm": 52.09622789077548, + "learning_rate": 5.574912891986063e-07, + "loss": 1.8705, "step": 8 }, { - "epoch": 0.01, - "learning_rate": 1.956521739130435e-06, - "loss": 2.6991, + "epoch": 0.0009419152276295133, + "grad_norm": 49.92480355404063, + "learning_rate": 6.271777003484321e-07, + "loss": 1.8799, "step": 9 }, { - "epoch": 0.01, - "learning_rate": 2.173913043478261e-06, - "loss": 2.1599, + "epoch": 0.0010465724751439038, + "grad_norm": 32.20342595275959, + "learning_rate": 6.968641114982579e-07, + "loss": 1.6128, "step": 10 }, { - "epoch": 0.01, - "learning_rate": 2.391304347826087e-06, - "loss": 2.3008, + "epoch": 0.001151229722658294, + "grad_norm": 53.634110936725754, + "learning_rate": 7.665505226480836e-07, + "loss": 1.7474, "step": 11 }, { - "epoch": 0.01, - "learning_rate": 2.6086956521739132e-06, - "loss": 2.0458, + "epoch": 0.0012558869701726845, + "grad_norm": 36.607148713405984, + "learning_rate": 8.362369337979096e-07, + "loss": 1.7438, "step": 12 }, { - "epoch": 0.01, - "learning_rate": 2.8260869565217393e-06, - "loss": 2.1521, + "epoch": 0.0013605442176870747, + "grad_norm": 3.9957477431354564, + "learning_rate": 9.059233449477353e-07, + "loss": 0.7564, "step": 13 }, { - "epoch": 0.01, - "learning_rate": 3.043478260869566e-06, - "loss": 1.8656, + "epoch": 0.0014652014652014652, + "grad_norm": 20.58298531785394, + "learning_rate": 9.75609756097561e-07, + "loss": 1.5063, "step": 14 }, { - "epoch": 0.01, - "learning_rate": 3.2608695652173914e-06, - "loss": 1.9967, + "epoch": 0.0015698587127158557, + "grad_norm": 15.444768444817306, + "learning_rate": 1.045296167247387e-06, + "loss": 1.4041, "step": 15 }, { - "epoch": 0.01, - "learning_rate": 3.4782608695652175e-06, - "loss": 1.6188, + "epoch": 0.001674515960230246, + "grad_norm": 12.107828560978803, + "learning_rate": 1.1149825783972125e-06, + "loss": 1.3931, "step": 16 }, { - "epoch": 0.01, - "learning_rate": 3.6956521739130436e-06, - "loss": 1.6884, + "epoch": 0.0017791732077446364, + "grad_norm": 10.814057953845309, + "learning_rate": 1.1846689895470384e-06, + "loss": 1.3134, "step": 17 }, { - "epoch": 0.01, - "learning_rate": 3.91304347826087e-06, - "loss": 1.652, + "epoch": 0.0018838304552590266, + "grad_norm": 7.249315729621174, + "learning_rate": 1.2543554006968642e-06, + "loss": 1.3668, "step": 18 }, { - "epoch": 0.01, - "learning_rate": 4.130434782608696e-06, - "loss": 1.6944, + "epoch": 0.001988487702773417, + "grad_norm": 7.144445780657985, + "learning_rate": 1.32404181184669e-06, + "loss": 1.2966, "step": 19 }, { - "epoch": 0.01, - "learning_rate": 4.347826086956522e-06, - "loss": 1.7657, + "epoch": 0.0020931449502878076, + "grad_norm": 8.323575437600967, + "learning_rate": 1.3937282229965158e-06, + "loss": 1.2833, "step": 20 }, { - "epoch": 0.01, - "learning_rate": 4.565217391304348e-06, - "loss": 1.4748, + "epoch": 0.002197802197802198, + "grad_norm": 6.584038254790013, + "learning_rate": 1.4634146341463414e-06, + "loss": 1.3209, "step": 21 }, { - "epoch": 0.01, - "learning_rate": 4.782608695652174e-06, - "loss": 1.5941, + "epoch": 0.002302459445316588, + "grad_norm": 7.650578965105668, + "learning_rate": 1.5331010452961673e-06, + "loss": 1.185, "step": 22 }, { - "epoch": 0.02, - "learning_rate": 5e-06, - "loss": 1.5832, + "epoch": 0.0024071166928309787, + "grad_norm": 6.43808617250343, + "learning_rate": 1.602787456445993e-06, + "loss": 1.1619, "step": 23 }, { - "epoch": 0.02, - "learning_rate": 5.2173913043478265e-06, - "loss": 1.5005, + "epoch": 0.002511773940345369, + "grad_norm": 6.450238348514814, + "learning_rate": 1.6724738675958191e-06, + "loss": 1.1685, "step": 24 }, { - "epoch": 0.02, - "learning_rate": 5.4347826086956525e-06, - "loss": 1.6937, + "epoch": 0.0026164311878597592, + "grad_norm": 6.596585528409829, + "learning_rate": 1.742160278745645e-06, + "loss": 1.2266, "step": 25 }, { - "epoch": 0.02, - "learning_rate": 5.652173913043479e-06, - "loss": 1.747, + "epoch": 0.0027210884353741495, + "grad_norm": 6.701054827351807, + "learning_rate": 1.8118466898954705e-06, + "loss": 1.2315, "step": 26 }, { - "epoch": 0.02, - "learning_rate": 5.8695652173913055e-06, - "loss": 1.7165, + "epoch": 0.00282574568288854, + "grad_norm": 6.515865205471776, + "learning_rate": 1.8815331010452964e-06, + "loss": 1.2951, "step": 27 }, { - "epoch": 0.02, - "learning_rate": 6.086956521739132e-06, - "loss": 1.572, + "epoch": 0.0029304029304029304, + "grad_norm": 5.495232722232645, + "learning_rate": 1.951219512195122e-06, + "loss": 1.1448, "step": 28 }, { - "epoch": 0.02, - "learning_rate": 6.304347826086958e-06, - "loss": 1.3604, + "epoch": 0.0030350601779173206, + "grad_norm": 5.290875061084644, + "learning_rate": 2.020905923344948e-06, + "loss": 1.1407, "step": 29 }, { - "epoch": 0.02, - "learning_rate": 6.521739130434783e-06, - "loss": 1.3899, + "epoch": 0.0031397174254317113, + "grad_norm": 5.3025777909139045, + "learning_rate": 2.090592334494774e-06, + "loss": 1.127, "step": 30 }, { - "epoch": 0.02, - "learning_rate": 6.739130434782609e-06, - "loss": 1.3702, + "epoch": 0.0032443746729461016, + "grad_norm": 5.245351898269604, + "learning_rate": 2.1602787456445995e-06, + "loss": 1.1103, "step": 31 }, { - "epoch": 0.02, - "learning_rate": 6.956521739130435e-06, - "loss": 1.5494, + "epoch": 0.003349031920460492, + "grad_norm": 5.300255475950051, + "learning_rate": 2.229965156794425e-06, + "loss": 1.1917, "step": 32 }, { - "epoch": 0.02, - "learning_rate": 7.173913043478261e-06, - "loss": 1.474, + "epoch": 0.003453689167974882, + "grad_norm": 4.572317456714689, + "learning_rate": 2.299651567944251e-06, + "loss": 1.0874, "step": 33 }, { - "epoch": 0.02, - "learning_rate": 7.391304347826087e-06, - "loss": 1.4282, + "epoch": 0.0035583464154892728, + "grad_norm": 4.107821899876536, + "learning_rate": 2.3693379790940767e-06, + "loss": 1.0825, "step": 34 }, { - "epoch": 0.02, - "learning_rate": 7.608695652173914e-06, - "loss": 1.4724, + "epoch": 0.003663003663003663, + "grad_norm": 4.21060285666979, + "learning_rate": 2.4390243902439027e-06, + "loss": 1.2219, "step": 35 }, { - "epoch": 0.02, - "learning_rate": 7.82608695652174e-06, - "loss": 1.45, + "epoch": 0.0037676609105180532, + "grad_norm": 4.328566900505888, + "learning_rate": 2.5087108013937284e-06, + "loss": 1.0502, "step": 36 }, { - "epoch": 0.02, - "learning_rate": 8.043478260869566e-06, - "loss": 1.3026, + "epoch": 0.003872318158032444, + "grad_norm": 4.291665457341644, + "learning_rate": 2.578397212543554e-06, + "loss": 1.1426, "step": 37 }, { - "epoch": 0.03, - "learning_rate": 8.260869565217392e-06, - "loss": 1.5396, + "epoch": 0.003976975405546834, + "grad_norm": 3.8850933278156563, + "learning_rate": 2.64808362369338e-06, + "loss": 1.0759, "step": 38 }, { - "epoch": 0.03, - "learning_rate": 8.478260869565218e-06, - "loss": 1.4549, + "epoch": 0.004081632653061225, + "grad_norm": 4.413531892474245, + "learning_rate": 2.7177700348432056e-06, + "loss": 1.1848, "step": 39 }, { - "epoch": 0.03, - "learning_rate": 8.695652173913044e-06, - "loss": 1.5142, + "epoch": 0.004186289900575615, + "grad_norm": 4.844856614218897, + "learning_rate": 2.7874564459930316e-06, + "loss": 1.0984, "step": 40 }, { - "epoch": 0.03, - "learning_rate": 8.91304347826087e-06, - "loss": 1.5001, + "epoch": 0.004290947148090005, + "grad_norm": 4.748816166519876, + "learning_rate": 2.8571428571428573e-06, + "loss": 1.0605, "step": 41 }, { - "epoch": 0.03, - "learning_rate": 9.130434782608697e-06, - "loss": 1.4477, + "epoch": 0.004395604395604396, + "grad_norm": 4.777444061979255, + "learning_rate": 2.926829268292683e-06, + "loss": 1.2714, "step": 42 }, { - "epoch": 0.03, - "learning_rate": 9.347826086956523e-06, - "loss": 1.2861, + "epoch": 0.004500261643118786, + "grad_norm": 4.522554555361124, + "learning_rate": 2.996515679442509e-06, + "loss": 1.0385, "step": 43 }, { - "epoch": 0.03, - "learning_rate": 9.565217391304349e-06, - "loss": 1.3809, + "epoch": 0.004604918890633176, + "grad_norm": 4.075973193199012, + "learning_rate": 3.0662020905923345e-06, + "loss": 1.0521, "step": 44 }, { - "epoch": 0.03, - "learning_rate": 9.782608695652175e-06, - "loss": 1.6367, + "epoch": 0.004709576138147566, + "grad_norm": 4.3231014366459934, + "learning_rate": 3.13588850174216e-06, + "loss": 1.0546, "step": 45 }, { - "epoch": 0.03, - "learning_rate": 1e-05, - "loss": 1.4074, + "epoch": 0.0048142333856619575, + "grad_norm": 5.680111667722333, + "learning_rate": 3.205574912891986e-06, + "loss": 1.0781, "step": 46 }, { - "epoch": 0.03, - "learning_rate": 1.0217391304347829e-05, - "loss": 1.4236, + "epoch": 0.004918890633176348, + "grad_norm": 4.1612873974328854, + "learning_rate": 3.2752613240418118e-06, + "loss": 1.027, "step": 47 }, { - "epoch": 0.03, - "learning_rate": 1.0434782608695653e-05, - "loss": 1.6102, + "epoch": 0.005023547880690738, + "grad_norm": 4.992758604601489, + "learning_rate": 3.3449477351916382e-06, + "loss": 1.1227, "step": 48 }, { - "epoch": 0.03, - "learning_rate": 1.0652173913043479e-05, - "loss": 1.4612, + "epoch": 0.005128205128205128, + "grad_norm": 4.226754919676679, + "learning_rate": 3.414634146341464e-06, + "loss": 1.1298, "step": 49 }, { - "epoch": 0.03, - "learning_rate": 1.0869565217391305e-05, - "loss": 1.3934, + "epoch": 0.0052328623757195184, + "grad_norm": 5.514671205059567, + "learning_rate": 3.48432055749129e-06, + "loss": 1.1269, "step": 50 }, { - "epoch": 0.03, - "learning_rate": 1.1086956521739131e-05, - "loss": 1.4294, + "epoch": 0.005337519623233909, + "grad_norm": 4.351259557452271, + "learning_rate": 3.5540069686411155e-06, + "loss": 1.0444, "step": 51 }, { - "epoch": 0.03, - "learning_rate": 1.1304347826086957e-05, - "loss": 1.5092, + "epoch": 0.005442176870748299, + "grad_norm": 5.399366279585442, + "learning_rate": 3.623693379790941e-06, + "loss": 1.1222, "step": 52 }, { - "epoch": 0.03, - "learning_rate": 1.1521739130434783e-05, - "loss": 1.2208, + "epoch": 0.00554683411826269, + "grad_norm": 4.028877965786529, + "learning_rate": 3.693379790940767e-06, + "loss": 1.1525, "step": 53 }, { - "epoch": 0.04, - "learning_rate": 1.1739130434782611e-05, - "loss": 1.2975, + "epoch": 0.00565149136577708, + "grad_norm": 4.683154305540764, + "learning_rate": 3.7630662020905927e-06, + "loss": 0.9988, "step": 54 }, { - "epoch": 0.04, - "learning_rate": 1.1956521739130435e-05, - "loss": 1.4067, + "epoch": 0.0057561486132914706, + "grad_norm": 3.937666148096688, + "learning_rate": 3.832752613240418e-06, + "loss": 1.0655, "step": 55 }, { - "epoch": 0.04, - "learning_rate": 1.2173913043478263e-05, - "loss": 1.3488, + "epoch": 0.005860805860805861, + "grad_norm": 4.766869445012149, + "learning_rate": 3.902439024390244e-06, + "loss": 0.7926, "step": 56 }, { - "epoch": 0.04, - "learning_rate": 1.2391304347826088e-05, - "loss": 1.3825, + "epoch": 0.005965463108320251, + "grad_norm": 4.319598323365691, + "learning_rate": 3.97212543554007e-06, + "loss": 1.0314, "step": 57 }, { - "epoch": 0.04, - "learning_rate": 1.2608695652173915e-05, - "loss": 1.3548, + "epoch": 0.006070120355834641, + "grad_norm": 4.576774250115059, + "learning_rate": 4.041811846689896e-06, + "loss": 1.1618, "step": 58 }, { - "epoch": 0.04, - "learning_rate": 1.282608695652174e-05, - "loss": 1.4098, + "epoch": 0.0061747776033490315, + "grad_norm": 5.167500660632476, + "learning_rate": 4.111498257839722e-06, + "loss": 1.0677, "step": 59 }, { - "epoch": 0.04, - "learning_rate": 1.3043478260869566e-05, - "loss": 1.5195, + "epoch": 0.006279434850863423, + "grad_norm": 4.38060785735574, + "learning_rate": 4.181184668989548e-06, + "loss": 1.0157, "step": 60 }, { - "epoch": 0.04, - "learning_rate": 1.3260869565217392e-05, - "loss": 1.4409, + "epoch": 0.006384092098377813, + "grad_norm": 4.893113671421494, + "learning_rate": 4.250871080139373e-06, + "loss": 1.1794, "step": 61 }, { - "epoch": 0.04, - "learning_rate": 1.3478260869565218e-05, - "loss": 1.5297, + "epoch": 0.006488749345892203, + "grad_norm": 4.849041572456957, + "learning_rate": 4.320557491289199e-06, + "loss": 1.046, "step": 62 }, { - "epoch": 0.04, - "learning_rate": 1.3695652173913046e-05, - "loss": 1.5921, + "epoch": 0.006593406593406593, + "grad_norm": 3.991277540603915, + "learning_rate": 4.390243902439025e-06, + "loss": 1.0056, "step": 63 }, { - "epoch": 0.04, - "learning_rate": 1.391304347826087e-05, - "loss": 1.2743, + "epoch": 0.006698063840920984, + "grad_norm": 4.542337282469629, + "learning_rate": 4.45993031358885e-06, + "loss": 1.0251, "step": 64 }, { - "epoch": 0.04, - "learning_rate": 1.4130434782608698e-05, - "loss": 1.3369, + "epoch": 0.006802721088435374, + "grad_norm": 4.28521583781609, + "learning_rate": 4.529616724738676e-06, + "loss": 1.0708, "step": 65 }, { - "epoch": 0.04, - "learning_rate": 1.4347826086956522e-05, - "loss": 1.3681, + "epoch": 0.006907378335949764, + "grad_norm": 4.125637623705243, + "learning_rate": 4.599303135888502e-06, + "loss": 1.0472, "step": 66 }, { - "epoch": 0.04, - "learning_rate": 1.456521739130435e-05, - "loss": 1.4715, + "epoch": 0.007012035583464155, + "grad_norm": 5.268122050018008, + "learning_rate": 4.668989547038328e-06, + "loss": 0.99, "step": 67 }, { - "epoch": 0.04, - "learning_rate": 1.4782608695652174e-05, - "loss": 1.4458, + "epoch": 0.0071166928309785455, + "grad_norm": 5.0648320868276375, + "learning_rate": 4.738675958188153e-06, + "loss": 1.0457, "step": 68 }, { - "epoch": 0.05, - "learning_rate": 1.5000000000000002e-05, - "loss": 1.3767, + "epoch": 0.007221350078492936, + "grad_norm": 3.948886008013687, + "learning_rate": 4.8083623693379794e-06, + "loss": 1.1263, "step": 69 }, { - "epoch": 0.05, - "learning_rate": 1.5217391304347828e-05, - "loss": 1.4051, + "epoch": 0.007326007326007326, + "grad_norm": 4.195666188025583, + "learning_rate": 4.8780487804878055e-06, + "loss": 1.0227, "step": 70 }, { - "epoch": 0.05, - "learning_rate": 1.5434782608695654e-05, - "loss": 1.5299, + "epoch": 0.007430664573521716, + "grad_norm": 4.413992487953222, + "learning_rate": 4.947735191637631e-06, + "loss": 1.0551, "step": 71 }, { - "epoch": 0.05, - "learning_rate": 1.565217391304348e-05, - "loss": 1.4234, + "epoch": 0.0075353218210361065, + "grad_norm": 5.788427063687076, + "learning_rate": 5.017421602787457e-06, + "loss": 1.094, "step": 72 }, { - "epoch": 0.05, - "learning_rate": 1.5869565217391306e-05, - "loss": 1.4651, + "epoch": 0.007639979068550497, + "grad_norm": 4.474163687473138, + "learning_rate": 5.087108013937283e-06, + "loss": 1.0967, "step": 73 }, { - "epoch": 0.05, - "learning_rate": 1.6086956521739132e-05, - "loss": 1.6008, + "epoch": 0.007744636316064888, + "grad_norm": 4.512506707897861, + "learning_rate": 5.156794425087108e-06, + "loss": 1.1004, "step": 74 }, { - "epoch": 0.05, - "learning_rate": 1.630434782608696e-05, - "loss": 1.4812, + "epoch": 0.007849293563579277, + "grad_norm": 4.228651613816608, + "learning_rate": 5.226480836236935e-06, + "loss": 1.0283, "step": 75 }, { - "epoch": 0.05, - "learning_rate": 1.6521739130434785e-05, - "loss": 1.3406, + "epoch": 0.007953950811093667, + "grad_norm": 4.3737582716641175, + "learning_rate": 5.29616724738676e-06, + "loss": 0.9979, "step": 76 }, { - "epoch": 0.05, - "learning_rate": 1.673913043478261e-05, - "loss": 1.4521, + "epoch": 0.00805860805860806, + "grad_norm": 3.9566453802387094, + "learning_rate": 5.365853658536586e-06, + "loss": 1.0923, "step": 77 }, { - "epoch": 0.05, - "learning_rate": 1.6956521739130437e-05, - "loss": 1.5115, + "epoch": 0.00816326530612245, + "grad_norm": 4.071859232406034, + "learning_rate": 5.435540069686411e-06, + "loss": 1.0203, "step": 78 }, { - "epoch": 0.05, - "learning_rate": 1.7173913043478263e-05, - "loss": 1.3607, + "epoch": 0.00826792255363684, + "grad_norm": 4.0563972426988295, + "learning_rate": 5.505226480836237e-06, + "loss": 1.127, "step": 79 }, { - "epoch": 0.05, - "learning_rate": 1.739130434782609e-05, - "loss": 1.5419, + "epoch": 0.00837257980115123, + "grad_norm": 4.657288632168375, + "learning_rate": 5.574912891986063e-06, + "loss": 1.1196, "step": 80 }, { - "epoch": 0.05, - "learning_rate": 1.7608695652173915e-05, - "loss": 1.4382, + "epoch": 0.00847723704866562, + "grad_norm": 4.090819887573403, + "learning_rate": 5.644599303135889e-06, + "loss": 1.0944, "step": 81 }, { - "epoch": 0.05, - "learning_rate": 1.782608695652174e-05, - "loss": 1.4131, + "epoch": 0.00858189429618001, + "grad_norm": 4.482297147920703, + "learning_rate": 5.7142857142857145e-06, + "loss": 1.0254, "step": 82 }, { - "epoch": 0.05, - "learning_rate": 1.8043478260869567e-05, - "loss": 1.2361, + "epoch": 0.008686551543694401, + "grad_norm": 4.152492838084387, + "learning_rate": 5.7839721254355405e-06, + "loss": 1.0602, "step": 83 }, { - "epoch": 0.06, - "learning_rate": 1.8260869565217393e-05, - "loss": 1.3589, + "epoch": 0.008791208791208791, + "grad_norm": 4.327953715994116, + "learning_rate": 5.853658536585366e-06, + "loss": 0.978, "step": 84 }, { - "epoch": 0.06, - "learning_rate": 1.847826086956522e-05, - "loss": 1.511, + "epoch": 0.008895866038723181, + "grad_norm": 3.0360168594436097, + "learning_rate": 5.923344947735193e-06, + "loss": 0.7671, "step": 85 }, { - "epoch": 0.06, - "learning_rate": 1.8695652173913045e-05, - "loss": 1.4226, + "epoch": 0.009000523286237572, + "grad_norm": 4.211239355773984, + "learning_rate": 5.993031358885018e-06, + "loss": 1.0706, "step": 86 }, { - "epoch": 0.06, - "learning_rate": 1.891304347826087e-05, - "loss": 1.4287, + "epoch": 0.009105180533751962, + "grad_norm": 4.660606741232546, + "learning_rate": 6.062717770034844e-06, + "loss": 1.0697, "step": 87 }, { - "epoch": 0.06, - "learning_rate": 1.9130434782608697e-05, - "loss": 1.4446, + "epoch": 0.009209837781266352, + "grad_norm": 3.96480287896892, + "learning_rate": 6.132404181184669e-06, + "loss": 1.0302, "step": 88 }, { - "epoch": 0.06, - "learning_rate": 1.9347826086956523e-05, - "loss": 1.3839, + "epoch": 0.009314495028780742, + "grad_norm": 4.126939118609246, + "learning_rate": 6.202090592334495e-06, + "loss": 1.1167, "step": 89 }, { - "epoch": 0.06, - "learning_rate": 1.956521739130435e-05, - "loss": 1.4631, + "epoch": 0.009419152276295133, + "grad_norm": 4.87024424753776, + "learning_rate": 6.27177700348432e-06, + "loss": 0.993, "step": 90 }, { - "epoch": 0.06, - "learning_rate": 1.9782608695652176e-05, - "loss": 1.4006, + "epoch": 0.009523809523809525, + "grad_norm": 3.9263349606700695, + "learning_rate": 6.341463414634147e-06, + "loss": 1.0546, "step": 91 }, { - "epoch": 0.06, - "learning_rate": 2e-05, - "loss": 1.4534, + "epoch": 0.009628466771323915, + "grad_norm": 3.985792246955264, + "learning_rate": 6.411149825783972e-06, + "loss": 0.9981, "step": 92 }, { - "epoch": 0.06, - "learning_rate": 1.999999431403446e-05, - "loss": 1.5352, + "epoch": 0.009733124018838305, + "grad_norm": 3.924993510446252, + "learning_rate": 6.480836236933798e-06, + "loss": 0.9752, "step": 93 }, { - "epoch": 0.06, - "learning_rate": 1.9999977256144297e-05, - "loss": 1.4683, + "epoch": 0.009837781266352695, + "grad_norm": 4.292200608183251, + "learning_rate": 6.5505226480836235e-06, + "loss": 1.0762, "step": 94 }, { - "epoch": 0.06, - "learning_rate": 1.9999948826348907e-05, - "loss": 1.5199, + "epoch": 0.009942438513867086, + "grad_norm": 4.485480700913085, + "learning_rate": 6.62020905923345e-06, + "loss": 1.104, "step": 95 }, { - "epoch": 0.06, - "learning_rate": 1.9999909024680632e-05, - "loss": 1.398, + "epoch": 0.010047095761381476, + "grad_norm": 4.5984608130045235, + "learning_rate": 6.6898954703832765e-06, + "loss": 1.051, "step": 96 }, { - "epoch": 0.06, - "learning_rate": 1.9999857851184724e-05, - "loss": 1.5483, + "epoch": 0.010151753008895866, + "grad_norm": 4.449884370761991, + "learning_rate": 6.759581881533102e-06, + "loss": 0.9836, "step": 97 }, { - "epoch": 0.06, - "learning_rate": 1.9999795305919377e-05, - "loss": 1.4268, + "epoch": 0.010256410256410256, + "grad_norm": 4.1157967239658575, + "learning_rate": 6.829268292682928e-06, + "loss": 1.0645, "step": 98 }, { - "epoch": 0.07, - "learning_rate": 1.9999721388955722e-05, - "loss": 1.2935, + "epoch": 0.010361067503924647, + "grad_norm": 3.864292249565954, + "learning_rate": 6.898954703832753e-06, + "loss": 1.0302, "step": 99 }, { - "epoch": 0.07, - "learning_rate": 1.9999636100377818e-05, - "loss": 1.5527, + "epoch": 0.010465724751439037, + "grad_norm": 5.087581226627934, + "learning_rate": 6.96864111498258e-06, + "loss": 0.9327, "step": 100 }, { - "epoch": 0.07, - "learning_rate": 1.9999539440282648e-05, - "loss": 1.2354, + "epoch": 0.010570381998953427, + "grad_norm": 4.4876214527409655, + "learning_rate": 7.038327526132405e-06, + "loss": 1.0256, "step": 101 }, { - "epoch": 0.07, - "learning_rate": 1.999943140878014e-05, - "loss": 1.3866, + "epoch": 0.010675039246467817, + "grad_norm": 4.768444458649774, + "learning_rate": 7.108013937282231e-06, + "loss": 0.9977, "step": 102 }, { - "epoch": 0.07, - "learning_rate": 1.999931200599314e-05, - "loss": 1.4202, + "epoch": 0.010779696493982208, + "grad_norm": 4.8508061117938075, + "learning_rate": 7.177700348432056e-06, + "loss": 1.0055, "step": 103 }, { - "epoch": 0.07, - "learning_rate": 1.9999181232057437e-05, - "loss": 1.3781, + "epoch": 0.010884353741496598, + "grad_norm": 4.971725250363107, + "learning_rate": 7.247386759581882e-06, + "loss": 0.9967, "step": 104 }, { - "epoch": 0.07, - "learning_rate": 1.9999039087121746e-05, - "loss": 1.1009, + "epoch": 0.01098901098901099, + "grad_norm": 4.397737680675271, + "learning_rate": 7.317073170731707e-06, + "loss": 1.0356, "step": 105 }, { - "epoch": 0.07, - "learning_rate": 1.9998885571347712e-05, - "loss": 1.3709, + "epoch": 0.01109366823652538, + "grad_norm": 3.973868161061544, + "learning_rate": 7.386759581881534e-06, + "loss": 1.0277, "step": 106 }, { - "epoch": 0.07, - "learning_rate": 1.9998720684909913e-05, - "loss": 1.4178, + "epoch": 0.01119832548403977, + "grad_norm": 4.846756354395717, + "learning_rate": 7.4564459930313594e-06, + "loss": 1.113, "step": 107 }, { - "epoch": 0.07, - "learning_rate": 1.999854442799585e-05, - "loss": 1.2833, + "epoch": 0.01130298273155416, + "grad_norm": 4.364102429082418, + "learning_rate": 7.5261324041811855e-06, + "loss": 1.0448, "step": 108 }, { - "epoch": 0.07, - "learning_rate": 1.9998356800805973e-05, - "loss": 1.3406, + "epoch": 0.01140763997906855, + "grad_norm": 4.772968488932661, + "learning_rate": 7.595818815331011e-06, + "loss": 0.9974, "step": 109 }, { - "epoch": 0.07, - "learning_rate": 1.9998157803553638e-05, - "loss": 1.4118, + "epoch": 0.011512297226582941, + "grad_norm": 4.338651147105108, + "learning_rate": 7.665505226480837e-06, + "loss": 1.0383, "step": 110 }, { - "epoch": 0.07, - "learning_rate": 1.9997947436465157e-05, - "loss": 1.4097, + "epoch": 0.011616954474097331, + "grad_norm": 4.665177086985384, + "learning_rate": 7.735191637630662e-06, + "loss": 0.9858, "step": 111 }, { - "epoch": 0.07, - "learning_rate": 1.9997725699779747e-05, - "loss": 1.3415, + "epoch": 0.011721611721611722, + "grad_norm": 4.226627817628828, + "learning_rate": 7.804878048780489e-06, + "loss": 1.1107, "step": 112 }, { - "epoch": 0.07, - "learning_rate": 1.999749259374957e-05, - "loss": 1.4323, + "epoch": 0.011826268969126112, + "grad_norm": 4.365500077240299, + "learning_rate": 7.874564459930314e-06, + "loss": 0.9337, "step": 113 }, { - "epoch": 0.08, - "learning_rate": 1.999724811863971e-05, - "loss": 1.4016, + "epoch": 0.011930926216640502, + "grad_norm": 4.57360703127662, + "learning_rate": 7.94425087108014e-06, + "loss": 1.0088, "step": 114 }, { - "epoch": 0.08, - "learning_rate": 1.999699227472818e-05, - "loss": 1.4401, + "epoch": 0.012035583464154892, + "grad_norm": 4.672192335977975, + "learning_rate": 8.013937282229966e-06, + "loss": 1.0041, "step": 115 }, { - "epoch": 0.08, - "learning_rate": 1.9996725062305934e-05, - "loss": 1.4507, + "epoch": 0.012140240711669283, + "grad_norm": 4.678168288512447, + "learning_rate": 8.083623693379791e-06, + "loss": 1.0472, "step": 116 }, { - "epoch": 0.08, - "learning_rate": 1.999644648167684e-05, - "loss": 1.387, + "epoch": 0.012244897959183673, + "grad_norm": 4.804225775870655, + "learning_rate": 8.153310104529616e-06, + "loss": 1.0888, "step": 117 }, { - "epoch": 0.08, - "learning_rate": 1.9996156533157692e-05, - "loss": 1.2825, + "epoch": 0.012349555206698063, + "grad_norm": 4.403636503472787, + "learning_rate": 8.222996515679443e-06, + "loss": 1.13, "step": 118 }, { - "epoch": 0.08, - "learning_rate": 1.999585521707822e-05, - "loss": 1.2307, + "epoch": 0.012454212454212455, + "grad_norm": 4.208797982507968, + "learning_rate": 8.292682926829268e-06, + "loss": 1.0292, "step": 119 }, { - "epoch": 0.08, - "learning_rate": 1.999554253378108e-05, - "loss": 1.301, + "epoch": 0.012558869701726845, + "grad_norm": 5.510093081064813, + "learning_rate": 8.362369337979095e-06, + "loss": 1.0361, "step": 120 }, { - "epoch": 0.08, - "learning_rate": 1.9995218483621858e-05, - "loss": 1.302, + "epoch": 0.012663526949241236, + "grad_norm": 5.492522216479893, + "learning_rate": 8.43205574912892e-06, + "loss": 1.0391, "step": 121 }, { - "epoch": 0.08, - "learning_rate": 1.9994883066969055e-05, - "loss": 1.342, + "epoch": 0.012768184196755626, + "grad_norm": 4.227779135360482, + "learning_rate": 8.501742160278746e-06, + "loss": 1.1114, "step": 122 }, { - "epoch": 0.08, - "learning_rate": 1.9994536284204106e-05, - "loss": 1.5534, + "epoch": 0.012872841444270016, + "grad_norm": 3.8028192975551014, + "learning_rate": 8.571428571428571e-06, + "loss": 1.0596, "step": 123 }, { - "epoch": 0.08, - "learning_rate": 1.9994178135721372e-05, - "loss": 1.2546, + "epoch": 0.012977498691784406, + "grad_norm": 4.700182372142366, + "learning_rate": 8.641114982578398e-06, + "loss": 1.1663, "step": 124 }, { - "epoch": 0.08, - "learning_rate": 1.999380862192813e-05, - "loss": 1.3045, + "epoch": 0.013082155939298797, + "grad_norm": 4.393426240660029, + "learning_rate": 8.710801393728223e-06, + "loss": 0.9717, "step": 125 }, { - "epoch": 0.08, - "learning_rate": 1.99934277432446e-05, - "loss": 1.3729, + "epoch": 0.013186813186813187, + "grad_norm": 5.209722101990777, + "learning_rate": 8.78048780487805e-06, + "loss": 1.0548, "step": 126 }, { - "epoch": 0.08, - "learning_rate": 1.9993035500103904e-05, - "loss": 1.4059, + "epoch": 0.013291470434327577, + "grad_norm": 4.44296200002556, + "learning_rate": 8.850174216027875e-06, + "loss": 1.0316, "step": 127 }, { - "epoch": 0.08, - "learning_rate": 1.9992631892952108e-05, - "loss": 1.3092, + "epoch": 0.013396127681841967, + "grad_norm": 3.2269766969577525, + "learning_rate": 8.9198606271777e-06, + "loss": 0.7394, "step": 128 }, { - "epoch": 0.08, - "learning_rate": 1.999221692224818e-05, - "loss": 1.2541, + "epoch": 0.013500784929356358, + "grad_norm": 4.000784251757387, + "learning_rate": 8.989547038327527e-06, + "loss": 0.9277, "step": 129 }, { - "epoch": 0.09, - "learning_rate": 1.9991790588464034e-05, - "loss": 1.5616, + "epoch": 0.013605442176870748, + "grad_norm": 3.76241263382153, + "learning_rate": 9.059233449477352e-06, + "loss": 1.0673, "step": 130 }, { - "epoch": 0.09, - "learning_rate": 1.9991352892084483e-05, - "loss": 1.3117, + "epoch": 0.013710099424385138, + "grad_norm": 4.24231924403224, + "learning_rate": 9.12891986062718e-06, + "loss": 0.9685, "step": 131 }, { - "epoch": 0.09, - "learning_rate": 1.999090383360728e-05, - "loss": 1.4864, + "epoch": 0.013814756671899528, + "grad_norm": 2.3758209921949343, + "learning_rate": 9.198606271777004e-06, + "loss": 0.7123, "step": 132 }, { - "epoch": 0.09, - "learning_rate": 1.9990443413543086e-05, - "loss": 1.1912, + "epoch": 0.01391941391941392, + "grad_norm": 3.3277360996370966, + "learning_rate": 9.268292682926831e-06, + "loss": 1.0094, "step": 133 }, { - "epoch": 0.09, - "learning_rate": 1.998997163241549e-05, - "loss": 1.3293, + "epoch": 0.01402407116692831, + "grad_norm": 3.9700536041385353, + "learning_rate": 9.337979094076656e-06, + "loss": 1.029, "step": 134 }, { - "epoch": 0.09, - "learning_rate": 1.9989488490761e-05, - "loss": 1.3906, + "epoch": 0.0141287284144427, + "grad_norm": 3.8999757749740973, + "learning_rate": 9.407665505226482e-06, + "loss": 1.099, "step": 135 }, { - "epoch": 0.09, - "learning_rate": 1.9988993989129035e-05, - "loss": 1.2405, + "epoch": 0.014233385661957091, + "grad_norm": 4.007407594413442, + "learning_rate": 9.477351916376307e-06, + "loss": 0.9941, "step": 136 }, { - "epoch": 0.09, - "learning_rate": 1.9988488128081943e-05, - "loss": 1.4372, + "epoch": 0.014338042909471481, + "grad_norm": 4.372638054940945, + "learning_rate": 9.547038327526134e-06, + "loss": 0.998, "step": 137 }, { - "epoch": 0.09, - "learning_rate": 1.998797090819499e-05, - "loss": 1.2302, + "epoch": 0.014442700156985872, + "grad_norm": 3.781559177153234, + "learning_rate": 9.616724738675959e-06, + "loss": 1.0021, "step": 138 }, { - "epoch": 0.09, - "learning_rate": 1.9987442330056352e-05, - "loss": 1.3168, + "epoch": 0.014547357404500262, + "grad_norm": 3.8918297025335358, + "learning_rate": 9.686411149825786e-06, + "loss": 1.1862, "step": 139 }, { - "epoch": 0.09, - "learning_rate": 1.9986902394267117e-05, - "loss": 1.3076, + "epoch": 0.014652014652014652, + "grad_norm": 4.87861639008213, + "learning_rate": 9.756097560975611e-06, + "loss": 1.0808, "step": 140 }, { - "epoch": 0.09, - "learning_rate": 1.9986351101441308e-05, - "loss": 1.1699, + "epoch": 0.014756671899529042, + "grad_norm": 4.2959641485004365, + "learning_rate": 9.825783972125436e-06, + "loss": 1.0558, "step": 141 }, { - "epoch": 0.09, - "learning_rate": 1.9985788452205847e-05, - "loss": 1.3322, + "epoch": 0.014861329147043432, + "grad_norm": 3.6891019296746816, + "learning_rate": 9.895470383275261e-06, + "loss": 0.9425, "step": 142 }, { - "epoch": 0.09, - "learning_rate": 1.9985214447200566e-05, - "loss": 1.3678, + "epoch": 0.014965986394557823, + "grad_norm": 2.421295406110499, + "learning_rate": 9.965156794425088e-06, + "loss": 0.7397, "step": 143 }, { - "epoch": 0.09, - "learning_rate": 1.9984629087078234e-05, - "loss": 1.2653, + "epoch": 0.015070643642072213, + "grad_norm": 4.319425469746967, + "learning_rate": 1.0034843205574913e-05, + "loss": 1.0007, "step": 144 }, { - "epoch": 0.1, - "learning_rate": 1.998403237250451e-05, - "loss": 1.3188, + "epoch": 0.015175300889586603, + "grad_norm": 6.872837574225154, + "learning_rate": 1.0104529616724739e-05, + "loss": 1.0009, "step": 145 }, { - "epoch": 0.1, - "learning_rate": 1.9983424304157976e-05, - "loss": 1.4451, + "epoch": 0.015279958137100993, + "grad_norm": 4.1481133816472875, + "learning_rate": 1.0174216027874565e-05, + "loss": 1.1726, "step": 146 }, { - "epoch": 0.1, - "learning_rate": 1.998280488273012e-05, - "loss": 1.3479, + "epoch": 0.015384615384615385, + "grad_norm": 4.440751447995416, + "learning_rate": 1.024390243902439e-05, + "loss": 1.1265, "step": 147 }, { - "epoch": 0.1, - "learning_rate": 1.9982174108925346e-05, - "loss": 1.4162, + "epoch": 0.015489272632129776, + "grad_norm": 4.171124786786287, + "learning_rate": 1.0313588850174216e-05, + "loss": 1.0745, "step": 148 }, { - "epoch": 0.1, - "learning_rate": 1.998153198346097e-05, - "loss": 1.4135, + "epoch": 0.015593929879644166, + "grad_norm": 4.2679960358102385, + "learning_rate": 1.0383275261324041e-05, + "loss": 1.0301, "step": 149 }, { - "epoch": 0.1, - "learning_rate": 1.9980878507067207e-05, - "loss": 1.3905, + "epoch": 0.015698587127158554, + "grad_norm": 4.083629703686367, + "learning_rate": 1.045296167247387e-05, + "loss": 1.0725, "step": 150 }, { - "epoch": 0.1, - "learning_rate": 1.9980213680487186e-05, - "loss": 1.4447, + "epoch": 0.015803244374672946, + "grad_norm": 4.756910822842423, + "learning_rate": 1.0522648083623695e-05, + "loss": 1.1538, "step": 151 }, { - "epoch": 0.1, - "learning_rate": 1.9979537504476945e-05, - "loss": 1.4285, + "epoch": 0.015907901622187335, + "grad_norm": 4.175459761454693, + "learning_rate": 1.059233449477352e-05, + "loss": 1.0311, "step": 152 }, { - "epoch": 0.1, - "learning_rate": 1.997884997980543e-05, - "loss": 1.4105, + "epoch": 0.016012558869701727, + "grad_norm": 4.660499784764975, + "learning_rate": 1.0662020905923345e-05, + "loss": 1.0348, "step": 153 }, { - "epoch": 0.1, - "learning_rate": 1.9978151107254477e-05, - "loss": 1.2898, + "epoch": 0.01611721611721612, + "grad_norm": 5.477934194364109, + "learning_rate": 1.0731707317073172e-05, + "loss": 0.947, "step": 154 }, { - "epoch": 0.1, - "learning_rate": 1.9977440887618855e-05, - "loss": 1.3787, + "epoch": 0.016221873364730507, + "grad_norm": 3.8257154240108475, + "learning_rate": 1.0801393728222997e-05, + "loss": 1.0325, "step": 155 }, { - "epoch": 0.1, - "learning_rate": 1.9976719321706213e-05, - "loss": 1.419, + "epoch": 0.0163265306122449, + "grad_norm": 4.473013121990363, + "learning_rate": 1.0871080139372822e-05, + "loss": 1.0694, "step": 156 }, { - "epoch": 0.1, - "learning_rate": 1.997598641033711e-05, - "loss": 1.4784, + "epoch": 0.016431187859759288, + "grad_norm": 3.7487977727407107, + "learning_rate": 1.0940766550522648e-05, + "loss": 0.9867, "step": 157 }, { - "epoch": 0.1, - "learning_rate": 1.997524215434501e-05, - "loss": 1.4202, + "epoch": 0.01653584510727368, + "grad_norm": 3.6501719775560844, + "learning_rate": 1.1010452961672475e-05, + "loss": 1.0029, "step": 158 }, { - "epoch": 0.1, - "learning_rate": 1.9974486554576274e-05, - "loss": 1.2834, + "epoch": 0.01664050235478807, + "grad_norm": 4.465954207259012, + "learning_rate": 1.1080139372822301e-05, + "loss": 1.1428, "step": 159 }, { - "epoch": 0.11, - "learning_rate": 1.9973719611890164e-05, - "loss": 1.4078, + "epoch": 0.01674515960230246, + "grad_norm": 3.7784565534446597, + "learning_rate": 1.1149825783972127e-05, + "loss": 1.0845, "step": 160 }, { - "epoch": 0.11, - "learning_rate": 1.9972941327158848e-05, - "loss": 1.2659, + "epoch": 0.01684981684981685, + "grad_norm": 4.3826834909022825, + "learning_rate": 1.1219512195121953e-05, + "loss": 1.029, "step": 161 }, { - "epoch": 0.11, - "learning_rate": 1.997215170126738e-05, - "loss": 1.3769, + "epoch": 0.01695447409733124, + "grad_norm": 5.076983929836477, + "learning_rate": 1.1289198606271779e-05, + "loss": 0.9905, "step": 162 }, { - "epoch": 0.11, - "learning_rate": 1.9971350735113718e-05, - "loss": 1.3037, + "epoch": 0.01705913134484563, + "grad_norm": 3.87202080289898, + "learning_rate": 1.1358885017421604e-05, + "loss": 0.9471, "step": 163 }, { - "epoch": 0.11, - "learning_rate": 1.9970538429608716e-05, - "loss": 1.3615, + "epoch": 0.01716378859236002, + "grad_norm": 3.947159057053555, + "learning_rate": 1.1428571428571429e-05, + "loss": 1.0517, "step": 164 }, { - "epoch": 0.11, - "learning_rate": 1.996971478567612e-05, - "loss": 1.4036, + "epoch": 0.01726844583987441, + "grad_norm": 3.5664649583899886, + "learning_rate": 1.1498257839721256e-05, + "loss": 1.0465, "step": 165 }, { - "epoch": 0.11, - "learning_rate": 1.996887980425258e-05, - "loss": 1.2713, + "epoch": 0.017373103087388802, + "grad_norm": 3.889725720772939, + "learning_rate": 1.1567944250871081e-05, + "loss": 1.0653, "step": 166 }, { - "epoch": 0.11, - "learning_rate": 1.996803348628762e-05, - "loss": 1.3764, + "epoch": 0.01747776033490319, + "grad_norm": 3.9170851836947143, + "learning_rate": 1.1637630662020906e-05, + "loss": 1.034, "step": 167 }, { - "epoch": 0.11, - "learning_rate": 1.9967175832743677e-05, - "loss": 1.4328, + "epoch": 0.017582417582417582, + "grad_norm": 3.72251997612241, + "learning_rate": 1.1707317073170731e-05, + "loss": 1.0297, "step": 168 }, { - "epoch": 0.11, - "learning_rate": 1.9966306844596063e-05, - "loss": 1.4352, + "epoch": 0.017687074829931974, + "grad_norm": 3.576705734526832, + "learning_rate": 1.177700348432056e-05, + "loss": 1.0623, "step": 169 }, { - "epoch": 0.11, - "learning_rate": 1.9965426522832985e-05, - "loss": 1.2599, + "epoch": 0.017791732077446363, + "grad_norm": 3.263183539989399, + "learning_rate": 1.1846689895470385e-05, + "loss": 1.0284, "step": 170 }, { - "epoch": 0.11, - "learning_rate": 1.9964534868455544e-05, - "loss": 1.3638, + "epoch": 0.017896389324960755, + "grad_norm": 4.86530329059982, + "learning_rate": 1.191637630662021e-05, + "loss": 1.1147, "step": 171 }, { - "epoch": 0.11, - "learning_rate": 1.9963631882477716e-05, - "loss": 1.3307, + "epoch": 0.018001046572475143, + "grad_norm": 3.74319247595939, + "learning_rate": 1.1986062717770036e-05, + "loss": 1.0216, "step": 172 }, { - "epoch": 0.11, - "learning_rate": 1.9962717565926376e-05, - "loss": 1.36, + "epoch": 0.018105703819989535, + "grad_norm": 3.4535523228322274, + "learning_rate": 1.2055749128919862e-05, + "loss": 1.1057, "step": 173 }, { - "epoch": 0.11, - "learning_rate": 1.9961791919841275e-05, - "loss": 1.3391, + "epoch": 0.018210361067503924, + "grad_norm": 3.5433866410709784, + "learning_rate": 1.2125435540069688e-05, + "loss": 1.1274, "step": 174 }, { - "epoch": 0.12, - "learning_rate": 1.9960854945275055e-05, - "loss": 1.4955, + "epoch": 0.018315018315018316, + "grad_norm": 3.8964338737423567, + "learning_rate": 1.2195121951219513e-05, + "loss": 1.1093, "step": 175 }, { - "epoch": 0.12, - "learning_rate": 1.995990664329323e-05, - "loss": 1.4003, + "epoch": 0.018419675562532704, + "grad_norm": 3.7316530734886197, + "learning_rate": 1.2264808362369338e-05, + "loss": 1.0501, "step": 176 }, { - "epoch": 0.12, - "learning_rate": 1.9958947014974216e-05, - "loss": 1.2154, + "epoch": 0.018524332810047096, + "grad_norm": 3.8491712042277872, + "learning_rate": 1.2334494773519165e-05, + "loss": 1.1212, "step": 177 }, { - "epoch": 0.12, - "learning_rate": 1.9957976061409277e-05, - "loss": 1.3286, + "epoch": 0.018628990057561485, + "grad_norm": 3.4777164601747863, + "learning_rate": 1.240418118466899e-05, + "loss": 1.0654, "step": 178 }, { - "epoch": 0.12, - "learning_rate": 1.9956993783702594e-05, - "loss": 1.4478, + "epoch": 0.018733647305075877, + "grad_norm": 2.261327289656553, + "learning_rate": 1.2473867595818815e-05, + "loss": 0.7282, "step": 179 }, { - "epoch": 0.12, - "learning_rate": 1.9956000182971192e-05, - "loss": 1.2906, + "epoch": 0.018838304552590265, + "grad_norm": 3.552001161873253, + "learning_rate": 1.254355400696864e-05, + "loss": 0.9128, "step": 180 }, { - "epoch": 0.12, - "learning_rate": 1.9954995260344993e-05, - "loss": 1.1524, + "epoch": 0.018942961800104657, + "grad_norm": 5.287461211801188, + "learning_rate": 1.2613240418118469e-05, + "loss": 1.051, "step": 181 }, { - "epoch": 0.12, - "learning_rate": 1.995397901696679e-05, - "loss": 1.3923, + "epoch": 0.01904761904761905, + "grad_norm": 4.4864817842055436, + "learning_rate": 1.2682926829268294e-05, + "loss": 1.0427, "step": 182 }, { - "epoch": 0.12, - "learning_rate": 1.995295145399224e-05, - "loss": 1.3211, + "epoch": 0.019152276295133438, + "grad_norm": 4.588279167310623, + "learning_rate": 1.275261324041812e-05, + "loss": 1.0934, "step": 183 }, { - "epoch": 0.12, - "learning_rate": 1.9951912572589892e-05, - "loss": 1.3806, + "epoch": 0.01925693354264783, + "grad_norm": 5.248743013483407, + "learning_rate": 1.2822299651567945e-05, + "loss": 1.0224, "step": 184 }, { - "epoch": 0.12, - "learning_rate": 1.9950862373941145e-05, - "loss": 1.3799, + "epoch": 0.01936159079016222, + "grad_norm": 3.599591701450194, + "learning_rate": 1.2891986062717772e-05, + "loss": 0.942, "step": 185 }, { - "epoch": 0.12, - "learning_rate": 1.994980085924028e-05, - "loss": 1.3846, + "epoch": 0.01946624803767661, + "grad_norm": 4.22948244161219, + "learning_rate": 1.2961672473867597e-05, + "loss": 1.096, "step": 186 }, { - "epoch": 0.12, - "learning_rate": 1.994872802969445e-05, - "loss": 1.4632, + "epoch": 0.019570905285191, + "grad_norm": 3.883052580674137, + "learning_rate": 1.3031358885017422e-05, + "loss": 1.0867, "step": 187 }, { - "epoch": 0.12, - "learning_rate": 1.994764388652366e-05, - "loss": 1.3105, + "epoch": 0.01967556253270539, + "grad_norm": 3.9641257982365024, + "learning_rate": 1.3101045296167247e-05, + "loss": 1.0495, "step": 188 }, { - "epoch": 0.12, - "learning_rate": 1.9946548430960796e-05, - "loss": 1.3987, + "epoch": 0.01978021978021978, + "grad_norm": 3.98653960238745, + "learning_rate": 1.3170731707317076e-05, + "loss": 1.1492, "step": 189 }, { - "epoch": 0.13, - "learning_rate": 1.9945441664251605e-05, - "loss": 1.4065, + "epoch": 0.01988487702773417, + "grad_norm": 2.3679885080466554, + "learning_rate": 1.32404181184669e-05, + "loss": 0.7717, "step": 190 }, { - "epoch": 0.13, - "learning_rate": 1.9944323587654685e-05, - "loss": 1.4344, + "epoch": 0.01998953427524856, + "grad_norm": 4.239294493465942, + "learning_rate": 1.3310104529616726e-05, + "loss": 1.1124, "step": 191 }, { - "epoch": 0.13, - "learning_rate": 1.9943194202441514e-05, - "loss": 1.3795, + "epoch": 0.020094191522762952, + "grad_norm": 4.028830074043196, + "learning_rate": 1.3379790940766553e-05, + "loss": 1.1247, "step": 192 }, { - "epoch": 0.13, - "learning_rate": 1.9942053509896417e-05, - "loss": 1.552, + "epoch": 0.02019884877027734, + "grad_norm": 3.522476393811863, + "learning_rate": 1.3449477351916378e-05, + "loss": 1.0408, "step": 193 }, { - "epoch": 0.13, - "learning_rate": 1.9940901511316585e-05, - "loss": 1.3405, + "epoch": 0.020303506017791732, + "grad_norm": 3.420820328724481, + "learning_rate": 1.3519163763066203e-05, + "loss": 1.0528, "step": 194 }, { - "epoch": 0.13, - "learning_rate": 1.993973820801206e-05, - "loss": 1.3728, + "epoch": 0.02040816326530612, + "grad_norm": 3.855766622211791, + "learning_rate": 1.3588850174216028e-05, + "loss": 1.0484, "step": 195 }, { - "epoch": 0.13, - "learning_rate": 1.993856360130574e-05, - "loss": 1.4416, + "epoch": 0.020512820512820513, + "grad_norm": 4.09125354171716, + "learning_rate": 1.3658536585365855e-05, + "loss": 1.1015, "step": 196 }, { - "epoch": 0.13, - "learning_rate": 1.9937377692533382e-05, - "loss": 1.3926, + "epoch": 0.020617477760334905, + "grad_norm": 3.3241462228706653, + "learning_rate": 1.372822299651568e-05, + "loss": 1.1006, "step": 197 }, { - "epoch": 0.13, - "learning_rate": 1.99361804830436e-05, - "loss": 1.2651, + "epoch": 0.020722135007849293, + "grad_norm": 4.516135214785916, + "learning_rate": 1.3797909407665506e-05, + "loss": 1.0546, "step": 198 }, { - "epoch": 0.13, - "learning_rate": 1.9934971974197843e-05, - "loss": 1.2976, + "epoch": 0.020826792255363685, + "grad_norm": 3.3260375654673564, + "learning_rate": 1.3867595818815331e-05, + "loss": 1.1214, "step": 199 }, { - "epoch": 0.13, - "learning_rate": 1.993375216737042e-05, - "loss": 1.4121, + "epoch": 0.020931449502878074, + "grad_norm": 4.0907170025291935, + "learning_rate": 1.393728222996516e-05, + "loss": 1.0947, "step": 200 }, { - "epoch": 0.13, - "learning_rate": 1.9932521063948494e-05, - "loss": 1.276, + "epoch": 0.021036106750392466, + "grad_norm": 3.9206762011268337, + "learning_rate": 1.4006968641114985e-05, + "loss": 1.0005, "step": 201 }, { - "epoch": 0.13, - "learning_rate": 1.9931278665332057e-05, - "loss": 1.3322, + "epoch": 0.021140763997906854, + "grad_norm": 3.74058600438095, + "learning_rate": 1.407665505226481e-05, + "loss": 1.0507, "step": 202 }, { - "epoch": 0.13, - "learning_rate": 1.9930024972933966e-05, - "loss": 1.3596, + "epoch": 0.021245421245421246, + "grad_norm": 4.046674364114655, + "learning_rate": 1.4146341463414635e-05, + "loss": 1.0381, "step": 203 }, { - "epoch": 0.13, - "learning_rate": 1.9928759988179908e-05, - "loss": 1.4512, + "epoch": 0.021350078492935635, + "grad_norm": 4.070409637629066, + "learning_rate": 1.4216027874564462e-05, + "loss": 1.0869, "step": 204 }, { - "epoch": 0.13, - "learning_rate": 1.992748371250841e-05, - "loss": 1.4053, + "epoch": 0.021454735740450027, + "grad_norm": 2.477212456664358, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.7274, "step": 205 }, { - "epoch": 0.14, - "learning_rate": 1.992619614737085e-05, - "loss": 1.2646, + "epoch": 0.021559392987964415, + "grad_norm": 4.809989174382412, + "learning_rate": 1.4355400696864112e-05, + "loss": 0.9624, "step": 206 }, { - "epoch": 0.14, - "learning_rate": 1.9924897294231434e-05, - "loss": 1.5216, + "epoch": 0.021664050235478807, + "grad_norm": 4.855067955207067, + "learning_rate": 1.4425087108013938e-05, + "loss": 1.056, "step": 207 }, { - "epoch": 0.14, - "learning_rate": 1.9923587154567214e-05, - "loss": 1.4538, + "epoch": 0.021768707482993196, + "grad_norm": 4.666013151279942, + "learning_rate": 1.4494773519163764e-05, + "loss": 0.9354, "step": 208 }, { - "epoch": 0.14, - "learning_rate": 1.9922265729868065e-05, - "loss": 1.4267, + "epoch": 0.021873364730507588, + "grad_norm": 3.940057891989908, + "learning_rate": 1.456445993031359e-05, + "loss": 1.014, "step": 209 }, { - "epoch": 0.14, - "learning_rate": 1.992093302163671e-05, - "loss": 1.5343, + "epoch": 0.02197802197802198, + "grad_norm": 5.015568593900297, + "learning_rate": 1.4634146341463415e-05, + "loss": 1.0634, "step": 210 }, { - "epoch": 0.14, - "learning_rate": 1.9919589031388688e-05, - "loss": 1.3869, + "epoch": 0.022082679225536368, + "grad_norm": 3.5260271531233927, + "learning_rate": 1.470383275261324e-05, + "loss": 0.9887, "step": 211 }, { - "epoch": 0.14, - "learning_rate": 1.991823376065238e-05, - "loss": 1.3431, + "epoch": 0.02218733647305076, + "grad_norm": 4.142631353724366, + "learning_rate": 1.4773519163763069e-05, + "loss": 1.0409, "step": 212 }, { - "epoch": 0.14, - "learning_rate": 1.991686721096899e-05, - "loss": 1.2332, + "epoch": 0.02229199372056515, + "grad_norm": 4.18461357230588, + "learning_rate": 1.4843205574912894e-05, + "loss": 1.0713, "step": 213 }, { - "epoch": 0.14, - "learning_rate": 1.991548938389255e-05, - "loss": 1.4032, + "epoch": 0.02239665096807954, + "grad_norm": 4.157498562874197, + "learning_rate": 1.4912891986062719e-05, + "loss": 1.0933, "step": 214 }, { - "epoch": 0.14, - "learning_rate": 1.991410028098991e-05, - "loss": 1.3987, + "epoch": 0.02250130821559393, + "grad_norm": 4.027272133207346, + "learning_rate": 1.4982578397212544e-05, + "loss": 0.9743, "step": 215 }, { - "epoch": 0.14, - "learning_rate": 1.9912699903840754e-05, - "loss": 1.5462, + "epoch": 0.02260596546310832, + "grad_norm": 4.379374705129572, + "learning_rate": 1.5052264808362371e-05, + "loss": 1.0026, "step": 216 }, { - "epoch": 0.14, - "learning_rate": 1.991128825403758e-05, - "loss": 1.3819, + "epoch": 0.02271062271062271, + "grad_norm": 3.239907491980648, + "learning_rate": 1.5121951219512196e-05, + "loss": 1.0678, "step": 217 }, { - "epoch": 0.14, - "learning_rate": 1.9909865333185704e-05, - "loss": 1.3906, + "epoch": 0.0228152799581371, + "grad_norm": 4.0127649077633665, + "learning_rate": 1.5191637630662021e-05, + "loss": 1.1173, "step": 218 }, { - "epoch": 0.14, - "learning_rate": 1.9908431142903263e-05, - "loss": 1.4463, + "epoch": 0.02291993720565149, + "grad_norm": 4.264440457331995, + "learning_rate": 1.5261324041811848e-05, + "loss": 0.9743, "step": 219 }, { - "epoch": 0.14, - "learning_rate": 1.9906985684821208e-05, - "loss": 1.3995, + "epoch": 0.023024594453165882, + "grad_norm": 3.997508098429076, + "learning_rate": 1.5331010452961673e-05, + "loss": 1.0646, "step": 220 }, { - "epoch": 0.15, - "learning_rate": 1.9905528960583305e-05, - "loss": 1.1931, + "epoch": 0.02312925170068027, + "grad_norm": 3.47902625924093, + "learning_rate": 1.54006968641115e-05, + "loss": 0.9376, "step": 221 }, { - "epoch": 0.15, - "learning_rate": 1.990406097184613e-05, - "loss": 1.328, + "epoch": 0.023233908948194663, + "grad_norm": 4.175847455405455, + "learning_rate": 1.5470383275261324e-05, + "loss": 1.16, "step": 222 }, { - "epoch": 0.15, - "learning_rate": 1.9902581720279074e-05, - "loss": 1.3747, + "epoch": 0.02333856619570905, + "grad_norm": 3.8697528880278775, + "learning_rate": 1.554006968641115e-05, + "loss": 1.1035, "step": 223 }, { - "epoch": 0.15, - "learning_rate": 1.9901091207564326e-05, - "loss": 1.356, + "epoch": 0.023443223443223443, + "grad_norm": 3.7062219666594522, + "learning_rate": 1.5609756097560978e-05, + "loss": 1.0437, "step": 224 }, { - "epoch": 0.15, - "learning_rate": 1.9899589435396888e-05, - "loss": 1.5297, + "epoch": 0.023547880690737835, + "grad_norm": 3.604348929857401, + "learning_rate": 1.5679442508710803e-05, + "loss": 1.1209, "step": 225 }, { - "epoch": 0.15, - "learning_rate": 1.9898076405484565e-05, - "loss": 1.2664, + "epoch": 0.023652537938252224, + "grad_norm": 4.264573745725593, + "learning_rate": 1.5749128919860628e-05, + "loss": 1.0217, "step": 226 }, { - "epoch": 0.15, - "learning_rate": 1.9896552119547964e-05, - "loss": 1.3391, + "epoch": 0.023757195185766616, + "grad_norm": 5.445543416503218, + "learning_rate": 1.5818815331010456e-05, + "loss": 1.0786, "step": 227 }, { - "epoch": 0.15, - "learning_rate": 1.9895016579320495e-05, - "loss": 1.5459, + "epoch": 0.023861852433281004, + "grad_norm": 5.26347191325024, + "learning_rate": 1.588850174216028e-05, + "loss": 1.1497, "step": 228 }, { - "epoch": 0.15, - "learning_rate": 1.989346978654836e-05, - "loss": 1.3014, + "epoch": 0.023966509680795396, + "grad_norm": 3.5473340256833548, + "learning_rate": 1.5958188153310107e-05, + "loss": 0.9933, "step": 229 }, { - "epoch": 0.15, - "learning_rate": 1.9891911742990567e-05, - "loss": 1.4365, + "epoch": 0.024071166928309785, + "grad_norm": 3.654247972110555, + "learning_rate": 1.6027874564459932e-05, + "loss": 1.0457, "step": 230 }, { - "epoch": 0.15, - "learning_rate": 1.9890342450418907e-05, - "loss": 1.5066, + "epoch": 0.024175824175824177, + "grad_norm": 3.6750069709677047, + "learning_rate": 1.6097560975609757e-05, + "loss": 1.1164, "step": 231 }, { - "epoch": 0.15, - "learning_rate": 1.9888761910617967e-05, - "loss": 1.3875, + "epoch": 0.024280481423338565, + "grad_norm": 3.542624844100214, + "learning_rate": 1.6167247386759582e-05, + "loss": 1.1167, "step": 232 }, { - "epoch": 0.15, - "learning_rate": 1.9887170125385135e-05, - "loss": 1.3043, + "epoch": 0.024385138670852957, + "grad_norm": 3.9872277812389996, + "learning_rate": 1.6236933797909408e-05, + "loss": 1.1433, "step": 233 }, { - "epoch": 0.15, - "learning_rate": 1.9885567096530566e-05, - "loss": 1.3391, + "epoch": 0.024489795918367346, + "grad_norm": 3.7156048056910382, + "learning_rate": 1.6306620209059233e-05, + "loss": 1.0351, "step": 234 }, { - "epoch": 0.15, - "learning_rate": 1.9883952825877225e-05, - "loss": 1.51, + "epoch": 0.024594453165881738, + "grad_norm": 3.7816100969884348, + "learning_rate": 1.637630662020906e-05, + "loss": 1.0736, "step": 235 }, { - "epoch": 0.16, - "learning_rate": 1.988232731526084e-05, - "loss": 1.4932, + "epoch": 0.024699110413396126, + "grad_norm": 2.5076547499457362, + "learning_rate": 1.6445993031358887e-05, + "loss": 0.7651, "step": 236 }, { - "epoch": 0.16, - "learning_rate": 1.9880690566529933e-05, - "loss": 1.3678, + "epoch": 0.024803767660910518, + "grad_norm": 3.3947366530434184, + "learning_rate": 1.6515679442508712e-05, + "loss": 1.1833, "step": 237 }, { - "epoch": 0.16, - "learning_rate": 1.987904258154581e-05, - "loss": 1.4132, + "epoch": 0.02490842490842491, + "grad_norm": 3.9432345380153713, + "learning_rate": 1.6585365853658537e-05, + "loss": 1.0596, "step": 238 }, { - "epoch": 0.16, - "learning_rate": 1.987738336218254e-05, - "loss": 1.4784, + "epoch": 0.0250130821559393, + "grad_norm": 4.098078794977877, + "learning_rate": 1.6655052264808366e-05, + "loss": 1.1229, "step": 239 }, { - "epoch": 0.16, - "learning_rate": 1.9875712910326982e-05, - "loss": 1.2517, + "epoch": 0.02511773940345369, + "grad_norm": 3.9973498865337533, + "learning_rate": 1.672473867595819e-05, + "loss": 0.9674, "step": 240 }, { - "epoch": 0.16, - "learning_rate": 1.987403122787876e-05, - "loss": 1.3915, + "epoch": 0.02522239665096808, + "grad_norm": 3.5983933762145037, + "learning_rate": 1.6794425087108016e-05, + "loss": 1.2461, "step": 241 }, { - "epoch": 0.16, - "learning_rate": 1.9872338316750264e-05, - "loss": 1.3817, + "epoch": 0.02532705389848247, + "grad_norm": 3.8197581407276977, + "learning_rate": 1.686411149825784e-05, + "loss": 1.033, "step": 242 }, { - "epoch": 0.16, - "learning_rate": 1.9870634178866676e-05, - "loss": 1.3482, + "epoch": 0.02543171114599686, + "grad_norm": 2.3297919147992445, + "learning_rate": 1.6933797909407666e-05, + "loss": 0.7915, "step": 243 }, { - "epoch": 0.16, - "learning_rate": 1.986891881616592e-05, - "loss": 1.4669, + "epoch": 0.02553636839351125, + "grad_norm": 3.8568402426046853, + "learning_rate": 1.700348432055749e-05, + "loss": 1.1112, "step": 244 }, { - "epoch": 0.16, - "learning_rate": 1.9867192230598692e-05, - "loss": 1.521, + "epoch": 0.02564102564102564, + "grad_norm": 3.3775792696962434, + "learning_rate": 1.7073170731707317e-05, + "loss": 1.0562, "step": 245 }, { - "epoch": 0.16, - "learning_rate": 1.986545442412846e-05, - "loss": 1.3481, + "epoch": 0.025745682888540032, + "grad_norm": 3.8185082938369206, + "learning_rate": 1.7142857142857142e-05, + "loss": 1.2041, "step": 246 }, { - "epoch": 0.16, - "learning_rate": 1.9863705398731444e-05, - "loss": 1.2345, + "epoch": 0.02585034013605442, + "grad_norm": 3.946632969495481, + "learning_rate": 1.721254355400697e-05, + "loss": 1.1593, "step": 247 }, { - "epoch": 0.16, - "learning_rate": 1.986194515639662e-05, - "loss": 1.39, + "epoch": 0.025954997383568813, + "grad_norm": 3.4354194263721243, + "learning_rate": 1.7282229965156796e-05, + "loss": 1.0919, "step": 248 }, { - "epoch": 0.16, - "learning_rate": 1.986017369912573e-05, - "loss": 1.4151, + "epoch": 0.0260596546310832, + "grad_norm": 4.090271178228001, + "learning_rate": 1.735191637630662e-05, + "loss": 1.1536, "step": 249 }, { - "epoch": 0.16, - "learning_rate": 1.9858391028933258e-05, - "loss": 1.3328, + "epoch": 0.026164311878597593, + "grad_norm": 3.7495551186516574, + "learning_rate": 1.7421602787456446e-05, + "loss": 1.024, "step": 250 }, { - "epoch": 0.17, - "learning_rate": 1.9856597147846447e-05, - "loss": 1.3425, + "epoch": 0.02626896912611198, + "grad_norm": 3.682112510859972, + "learning_rate": 1.7491289198606275e-05, + "loss": 1.1665, "step": 251 }, { - "epoch": 0.17, - "learning_rate": 1.985479205790528e-05, - "loss": 1.485, + "epoch": 0.026373626373626374, + "grad_norm": 3.3296117604440783, + "learning_rate": 1.75609756097561e-05, + "loss": 1.0428, "step": 252 }, { - "epoch": 0.17, - "learning_rate": 1.98529757611625e-05, - "loss": 1.2616, + "epoch": 0.026478283621140766, + "grad_norm": 3.200300909223925, + "learning_rate": 1.7630662020905925e-05, + "loss": 1.0666, "step": 253 }, { - "epoch": 0.17, - "learning_rate": 1.9851148259683586e-05, - "loss": 1.1877, + "epoch": 0.026582940868655154, + "grad_norm": 3.747386619716652, + "learning_rate": 1.770034843205575e-05, + "loss": 1.154, "step": 254 }, { - "epoch": 0.17, - "learning_rate": 1.984930955554676e-05, - "loss": 1.489, + "epoch": 0.026687598116169546, + "grad_norm": 3.9792918121852483, + "learning_rate": 1.7770034843205575e-05, + "loss": 0.9988, "step": 255 }, { - "epoch": 0.17, - "learning_rate": 1.984745965084298e-05, - "loss": 1.4196, + "epoch": 0.026792255363683935, + "grad_norm": 3.1282408579177283, + "learning_rate": 1.78397212543554e-05, + "loss": 0.9897, "step": 256 }, { - "epoch": 0.17, - "learning_rate": 1.9845598547675945e-05, - "loss": 1.3344, + "epoch": 0.026896912611198327, + "grad_norm": 2.056362322374514, + "learning_rate": 1.7909407665505226e-05, + "loss": 0.7055, "step": 257 }, { - "epoch": 0.17, - "learning_rate": 1.9843726248162096e-05, - "loss": 1.3713, + "epoch": 0.027001569858712715, + "grad_norm": 2.5522966724650624, + "learning_rate": 1.7979094076655054e-05, + "loss": 0.8675, "step": 258 }, { - "epoch": 0.17, - "learning_rate": 1.984184275443059e-05, - "loss": 1.2307, + "epoch": 0.027106227106227107, + "grad_norm": 3.5782559255097457, + "learning_rate": 1.804878048780488e-05, + "loss": 1.0991, "step": 259 }, { - "epoch": 0.17, - "learning_rate": 1.983994806862333e-05, - "loss": 1.4117, + "epoch": 0.027210884353741496, + "grad_norm": 3.928265536667273, + "learning_rate": 1.8118466898954705e-05, + "loss": 1.05, "step": 260 }, { - "epoch": 0.17, - "learning_rate": 1.9838042192894935e-05, - "loss": 1.4123, + "epoch": 0.027315541601255888, + "grad_norm": 3.551876974379943, + "learning_rate": 1.818815331010453e-05, + "loss": 1.0506, "step": 261 }, { - "epoch": 0.17, - "learning_rate": 1.9836125129412756e-05, - "loss": 1.1802, + "epoch": 0.027420198848770276, + "grad_norm": 2.60968997475218, + "learning_rate": 1.825783972125436e-05, + "loss": 0.7637, "step": 262 }, { - "epoch": 0.17, - "learning_rate": 1.9834196880356863e-05, - "loss": 1.4824, + "epoch": 0.027524856096284668, + "grad_norm": 3.2023640410720504, + "learning_rate": 1.8327526132404184e-05, + "loss": 1.1014, "step": 263 }, { - "epoch": 0.17, - "learning_rate": 1.983225744792005e-05, - "loss": 1.3801, + "epoch": 0.027629513343799057, + "grad_norm": 4.254369956741867, + "learning_rate": 1.839721254355401e-05, + "loss": 1.1119, "step": 264 }, { - "epoch": 0.17, - "learning_rate": 1.9830306834307828e-05, - "loss": 1.4142, + "epoch": 0.02773417059131345, + "grad_norm": 4.456237791959698, + "learning_rate": 1.8466898954703834e-05, + "loss": 1.0861, "step": 265 }, { - "epoch": 0.18, - "learning_rate": 1.9828345041738413e-05, - "loss": 1.2993, + "epoch": 0.02783882783882784, + "grad_norm": 3.326487692184649, + "learning_rate": 1.8536585365853663e-05, + "loss": 1.1481, "step": 266 }, { - "epoch": 0.18, - "learning_rate": 1.9826372072442747e-05, - "loss": 1.24, + "epoch": 0.02794348508634223, + "grad_norm": 3.4486894326387985, + "learning_rate": 1.8606271777003488e-05, + "loss": 0.9798, "step": 267 }, { - "epoch": 0.18, - "learning_rate": 1.9824387928664478e-05, - "loss": 1.3416, + "epoch": 0.02804814233385662, + "grad_norm": 3.5440191404796875, + "learning_rate": 1.8675958188153313e-05, + "loss": 1.1801, "step": 268 }, { - "epoch": 0.18, - "learning_rate": 1.9822392612659964e-05, - "loss": 1.4188, + "epoch": 0.02815279958137101, + "grad_norm": 3.978193923401927, + "learning_rate": 1.8745644599303138e-05, + "loss": 1.0222, "step": 269 }, { - "epoch": 0.18, - "learning_rate": 1.9820386126698256e-05, - "loss": 1.401, + "epoch": 0.0282574568288854, + "grad_norm": 2.5739712495009224, + "learning_rate": 1.8815331010452963e-05, + "loss": 0.7676, "step": 270 }, { - "epoch": 0.18, - "learning_rate": 1.981836847306112e-05, - "loss": 1.262, + "epoch": 0.02836211407639979, + "grad_norm": 3.7240630754036523, + "learning_rate": 1.888501742160279e-05, + "loss": 1.0653, "step": 271 }, { - "epoch": 0.18, - "learning_rate": 1.981633965404302e-05, - "loss": 1.1779, + "epoch": 0.028466771323914182, + "grad_norm": 3.1834334045524337, + "learning_rate": 1.8954703832752614e-05, + "loss": 1.0548, "step": 272 }, { - "epoch": 0.18, - "learning_rate": 1.9814299671951113e-05, - "loss": 1.3676, + "epoch": 0.02857142857142857, + "grad_norm": 4.050102203011576, + "learning_rate": 1.902439024390244e-05, + "loss": 1.0478, "step": 273 }, { - "epoch": 0.18, - "learning_rate": 1.9812248529105254e-05, - "loss": 1.3851, + "epoch": 0.028676085818942963, + "grad_norm": 3.1797343765169783, + "learning_rate": 1.9094076655052267e-05, + "loss": 1.0334, "step": 274 }, { - "epoch": 0.18, - "learning_rate": 1.9810186227837986e-05, - "loss": 1.4388, + "epoch": 0.02878074306645735, + "grad_norm": 3.6752536373704454, + "learning_rate": 1.9163763066202093e-05, + "loss": 1.1476, "step": 275 }, { - "epoch": 0.18, - "learning_rate": 1.9808112770494546e-05, - "loss": 1.1772, + "epoch": 0.028885400313971743, + "grad_norm": 4.244905455321221, + "learning_rate": 1.9233449477351918e-05, + "loss": 1.0884, "step": 276 }, { - "epoch": 0.18, - "learning_rate": 1.9806028159432854e-05, - "loss": 1.2482, + "epoch": 0.02899005756148613, + "grad_norm": 2.548313029921991, + "learning_rate": 1.9303135888501743e-05, + "loss": 0.7592, "step": 277 }, { - "epoch": 0.18, - "learning_rate": 1.9803932397023512e-05, - "loss": 1.3222, + "epoch": 0.029094714809000524, + "grad_norm": 4.015070153207213, + "learning_rate": 1.937282229965157e-05, + "loss": 1.1131, "step": 278 }, { - "epoch": 0.18, - "learning_rate": 1.9801825485649816e-05, - "loss": 1.3487, + "epoch": 0.029199372056514912, + "grad_norm": 3.5031056198015382, + "learning_rate": 1.9442508710801397e-05, + "loss": 0.9966, "step": 279 }, { - "epoch": 0.18, - "learning_rate": 1.9799707427707723e-05, - "loss": 1.1701, + "epoch": 0.029304029304029304, + "grad_norm": 3.7962075539310467, + "learning_rate": 1.9512195121951222e-05, + "loss": 1.1491, "step": 280 }, { - "epoch": 0.18, - "learning_rate": 1.9797578225605875e-05, - "loss": 1.3803, + "epoch": 0.029408686551543696, + "grad_norm": 3.575956042142137, + "learning_rate": 1.9581881533101047e-05, + "loss": 1.0795, "step": 281 }, { - "epoch": 0.19, - "learning_rate": 1.979543788176559e-05, - "loss": 1.2782, + "epoch": 0.029513343799058084, + "grad_norm": 3.5922001990846497, + "learning_rate": 1.9651567944250872e-05, + "loss": 1.0633, "step": 282 }, { - "epoch": 0.19, - "learning_rate": 1.979328639862085e-05, - "loss": 1.4364, + "epoch": 0.029618001046572476, + "grad_norm": 3.176407389065215, + "learning_rate": 1.9721254355400697e-05, + "loss": 1.0095, "step": 283 }, { - "epoch": 0.19, - "learning_rate": 1.9791123778618303e-05, - "loss": 1.3674, + "epoch": 0.029722658294086865, + "grad_norm": 3.9198920216298063, + "learning_rate": 1.9790940766550523e-05, + "loss": 1.1247, "step": 284 }, { - "epoch": 0.19, - "learning_rate": 1.978895002421727e-05, - "loss": 1.3909, + "epoch": 0.029827315541601257, + "grad_norm": 3.7058792697424425, + "learning_rate": 1.9860627177700348e-05, + "loss": 1.0748, "step": 285 }, { - "epoch": 0.19, - "learning_rate": 1.978676513788973e-05, - "loss": 1.3825, + "epoch": 0.029931972789115645, + "grad_norm": 3.1240553549271075, + "learning_rate": 1.9930313588850176e-05, + "loss": 1.0794, "step": 286 }, { - "epoch": 0.19, - "learning_rate": 1.978456912212032e-05, - "loss": 1.2967, + "epoch": 0.030036630036630037, + "grad_norm": 4.161059734877122, + "learning_rate": 2e-05, + "loss": 1.2648, "step": 287 }, { - "epoch": 0.19, - "learning_rate": 1.9782361979406334e-05, - "loss": 1.2333, + "epoch": 0.030141287284144426, + "grad_norm": 3.292538569117115, + "learning_rate": 1.9999999425489863e-05, + "loss": 1.132, "step": 288 }, { - "epoch": 0.19, - "learning_rate": 1.9780143712257715e-05, - "loss": 1.3207, + "epoch": 0.030245944531658818, + "grad_norm": 3.3817728904500988, + "learning_rate": 1.9999997701959524e-05, + "loss": 1.119, "step": 289 }, { - "epoch": 0.19, - "learning_rate": 1.9777914323197063e-05, - "loss": 1.322, + "epoch": 0.030350601779173206, + "grad_norm": 3.698433893111136, + "learning_rate": 1.999999482940917e-05, + "loss": 1.1232, "step": 290 }, { - "epoch": 0.19, - "learning_rate": 1.977567381475963e-05, - "loss": 1.4313, + "epoch": 0.0304552590266876, + "grad_norm": 3.867287084222632, + "learning_rate": 1.9999990807839145e-05, + "loss": 1.0713, "step": 291 }, { - "epoch": 0.19, - "learning_rate": 1.97734221894933e-05, - "loss": 1.4255, + "epoch": 0.030559916274201987, + "grad_norm": 3.4207322827343276, + "learning_rate": 1.99999856372499e-05, + "loss": 1.131, "step": 292 }, { - "epoch": 0.19, - "learning_rate": 1.9771159449958612e-05, - "loss": 1.3717, + "epoch": 0.03066457352171638, + "grad_norm": 3.489327239181255, + "learning_rate": 1.9999979317642035e-05, + "loss": 1.0825, "step": 293 }, { - "epoch": 0.19, - "learning_rate": 1.9768885598728734e-05, - "loss": 1.2229, + "epoch": 0.03076923076923077, + "grad_norm": 4.019683220751094, + "learning_rate": 1.9999971849016274e-05, + "loss": 1.0214, "step": 294 }, { - "epoch": 0.19, - "learning_rate": 1.976660063838947e-05, - "loss": 1.4424, + "epoch": 0.03087388801674516, + "grad_norm": 3.58104376114611, + "learning_rate": 1.9999963231373474e-05, + "loss": 1.0346, "step": 295 }, { - "epoch": 0.19, - "learning_rate": 1.9764304571539266e-05, - "loss": 1.3519, + "epoch": 0.03097854526425955, + "grad_norm": 4.29557092301096, + "learning_rate": 1.9999953464714628e-05, + "loss": 1.1057, "step": 296 }, { - "epoch": 0.2, - "learning_rate": 1.9761997400789194e-05, - "loss": 1.3743, + "epoch": 0.03108320251177394, + "grad_norm": 4.031376664345659, + "learning_rate": 1.9999942549040857e-05, + "loss": 1.199, "step": 297 }, { - "epoch": 0.2, - "learning_rate": 1.9759679128762946e-05, - "loss": 1.3588, + "epoch": 0.031187859759288332, + "grad_norm": 3.2778277667278837, + "learning_rate": 1.9999930484353412e-05, + "loss": 1.1605, "step": 298 }, { - "epoch": 0.2, - "learning_rate": 1.9757349758096855e-05, - "loss": 1.3905, + "epoch": 0.031292517006802724, + "grad_norm": 3.57216344766695, + "learning_rate": 1.9999917270653686e-05, + "loss": 1.2141, "step": 299 }, { - "epoch": 0.2, - "learning_rate": 1.975500929143986e-05, - "loss": 1.4055, + "epoch": 0.03139717425431711, + "grad_norm": 3.32290052080157, + "learning_rate": 1.9999902907943195e-05, + "loss": 1.0385, "step": 300 }, { - "epoch": 0.2, - "learning_rate": 1.9752657731453523e-05, - "loss": 1.3259, + "epoch": 0.0315018315018315, + "grad_norm": 3.3614253151991718, + "learning_rate": 1.9999887396223584e-05, + "loss": 0.8076, "step": 301 }, { - "epoch": 0.2, - "learning_rate": 1.9750295080812025e-05, - "loss": 1.342, + "epoch": 0.03160648874934589, + "grad_norm": 3.999164186143186, + "learning_rate": 1.999987073549664e-05, + "loss": 1.1335, "step": 302 }, { - "epoch": 0.2, - "learning_rate": 1.974792134220215e-05, - "loss": 1.4263, + "epoch": 0.031711145996860285, + "grad_norm": 3.8434118240060937, + "learning_rate": 1.9999852925764277e-05, + "loss": 1.0446, "step": 303 }, { - "epoch": 0.2, - "learning_rate": 1.9745536518323304e-05, - "loss": 1.393, + "epoch": 0.03181580324437467, + "grad_norm": 4.069391175340414, + "learning_rate": 1.9999833967028542e-05, + "loss": 1.0591, "step": 304 }, { - "epoch": 0.2, - "learning_rate": 1.9743140611887487e-05, - "loss": 1.3359, + "epoch": 0.03192046049188906, + "grad_norm": 3.8316602155773998, + "learning_rate": 1.999981385929161e-05, + "loss": 1.1707, "step": 305 }, { - "epoch": 0.2, - "learning_rate": 1.974073362561931e-05, - "loss": 1.4202, + "epoch": 0.032025117739403454, + "grad_norm": 4.737562782836604, + "learning_rate": 1.99997926025558e-05, + "loss": 1.0614, "step": 306 }, { - "epoch": 0.2, - "learning_rate": 1.9738315562255983e-05, - "loss": 1.3547, + "epoch": 0.032129774986917846, + "grad_norm": 3.8265410957341515, + "learning_rate": 1.9999770196823544e-05, + "loss": 1.1765, "step": 307 }, { - "epoch": 0.2, - "learning_rate": 1.9735886424547306e-05, - "loss": 1.4709, + "epoch": 0.03223443223443224, + "grad_norm": 3.265062349948494, + "learning_rate": 1.999974664209742e-05, + "loss": 1.0048, "step": 308 }, { - "epoch": 0.2, - "learning_rate": 1.973344621525568e-05, - "loss": 1.2986, + "epoch": 0.03233908948194662, + "grad_norm": 3.642030345306258, + "learning_rate": 1.9999721938380133e-05, + "loss": 1.1304, "step": 309 }, { - "epoch": 0.2, - "learning_rate": 1.97309949371561e-05, - "loss": 1.2082, + "epoch": 0.032443746729461015, + "grad_norm": 3.4292052143727405, + "learning_rate": 1.999969608567453e-05, + "loss": 1.1584, "step": 310 }, { - "epoch": 0.2, - "learning_rate": 1.9728532593036132e-05, - "loss": 1.3314, + "epoch": 0.03254840397697541, + "grad_norm": 3.7985067604810565, + "learning_rate": 1.999966908398357e-05, + "loss": 1.0927, "step": 311 }, { - "epoch": 0.21, - "learning_rate": 1.972605918569594e-05, - "loss": 1.387, + "epoch": 0.0326530612244898, + "grad_norm": 4.314018944438203, + "learning_rate": 1.999964093331036e-05, + "loss": 1.0461, "step": 312 }, { - "epoch": 0.21, - "learning_rate": 1.972357471794827e-05, - "loss": 1.1319, + "epoch": 0.032757718472004184, + "grad_norm": 3.083704093004486, + "learning_rate": 1.9999611633658142e-05, + "loss": 1.0489, "step": 313 }, { - "epoch": 0.21, - "learning_rate": 1.972107919261844e-05, - "loss": 1.2984, + "epoch": 0.032862375719518576, + "grad_norm": 3.261286750192, + "learning_rate": 1.999958118503027e-05, + "loss": 1.0078, "step": 314 }, { - "epoch": 0.21, - "learning_rate": 1.9718572612544345e-05, - "loss": 1.4288, + "epoch": 0.03296703296703297, + "grad_norm": 3.4697246675242375, + "learning_rate": 1.9999549587430252e-05, + "loss": 1.1564, "step": 315 }, { - "epoch": 0.21, - "learning_rate": 1.9716054980576446e-05, - "loss": 1.213, + "epoch": 0.03307169021454736, + "grad_norm": 3.5137061405420478, + "learning_rate": 1.9999516840861714e-05, + "loss": 0.9702, "step": 316 }, { - "epoch": 0.21, - "learning_rate": 1.971352629957778e-05, - "loss": 1.3471, + "epoch": 0.033176347462061745, + "grad_norm": 3.7031729797465465, + "learning_rate": 1.9999482945328422e-05, + "loss": 1.1463, "step": 317 }, { - "epoch": 0.21, - "learning_rate": 1.9710986572423947e-05, - "loss": 1.2475, + "epoch": 0.03328100470957614, + "grad_norm": 3.526121821006833, + "learning_rate": 1.9999447900834266e-05, + "loss": 0.9697, "step": 318 }, { - "epoch": 0.21, - "learning_rate": 1.9708435802003105e-05, - "loss": 1.4886, + "epoch": 0.03338566195709053, + "grad_norm": 3.582049192765141, + "learning_rate": 1.9999411707383273e-05, + "loss": 1.1307, "step": 319 }, { - "epoch": 0.21, - "learning_rate": 1.9705873991215973e-05, - "loss": 1.3517, + "epoch": 0.03349031920460492, + "grad_norm": 3.71675942867392, + "learning_rate": 1.9999374364979608e-05, + "loss": 1.1452, "step": 320 }, { - "epoch": 0.21, - "learning_rate": 1.970330114297583e-05, - "loss": 1.4266, + "epoch": 0.033594976452119306, + "grad_norm": 3.5193147394190074, + "learning_rate": 1.9999335873627555e-05, + "loss": 1.1801, "step": 321 }, { - "epoch": 0.21, - "learning_rate": 1.970071726020849e-05, - "loss": 1.3975, + "epoch": 0.0336996336996337, + "grad_norm": 3.5695487111516617, + "learning_rate": 1.999929623333154e-05, + "loss": 1.0796, "step": 322 }, { - "epoch": 0.21, - "learning_rate": 1.969812234585233e-05, - "loss": 1.2978, + "epoch": 0.03380429094714809, + "grad_norm": 3.32206675867571, + "learning_rate": 1.999925544409612e-05, + "loss": 1.0898, "step": 323 }, { - "epoch": 0.21, - "learning_rate": 1.9695516402858277e-05, - "loss": 1.2647, + "epoch": 0.03390894819466248, + "grad_norm": 2.9565893128467153, + "learning_rate": 1.9999213505925975e-05, + "loss": 1.1213, "step": 324 }, { - "epoch": 0.21, - "learning_rate": 1.9692899434189786e-05, - "loss": 1.3324, + "epoch": 0.034013605442176874, + "grad_norm": 3.595043850818446, + "learning_rate": 1.9999170418825928e-05, + "loss": 1.1474, "step": 325 }, { - "epoch": 0.21, - "learning_rate": 1.969027144282285e-05, - "loss": 1.475, + "epoch": 0.03411826268969126, + "grad_norm": 3.003534456854358, + "learning_rate": 1.9999126182800932e-05, + "loss": 1.0788, "step": 326 }, { - "epoch": 0.22, - "learning_rate": 1.9687632431746014e-05, - "loss": 1.2854, + "epoch": 0.03422291993720565, + "grad_norm": 3.9622027216915914, + "learning_rate": 1.9999080797856063e-05, + "loss": 1.1529, "step": 327 }, { - "epoch": 0.22, - "learning_rate": 1.9684982403960332e-05, - "loss": 1.3711, + "epoch": 0.03432757718472004, + "grad_norm": 3.7438205661201955, + "learning_rate": 1.9999034263996543e-05, + "loss": 1.1211, "step": 328 }, { - "epoch": 0.22, - "learning_rate": 1.968232136247941e-05, - "loss": 1.3683, + "epoch": 0.034432234432234435, + "grad_norm": 3.1550464025240004, + "learning_rate": 1.9998986581227718e-05, + "loss": 1.1448, "step": 329 }, { - "epoch": 0.22, - "learning_rate": 1.9679649310329353e-05, - "loss": 1.3178, + "epoch": 0.03453689167974882, + "grad_norm": 3.0209980125708435, + "learning_rate": 1.999893774955506e-05, + "loss": 1.053, "step": 330 }, { - "epoch": 0.22, - "learning_rate": 1.9676966250548807e-05, - "loss": 1.3448, + "epoch": 0.03464154892726321, + "grad_norm": 3.214470861959865, + "learning_rate": 1.999888776898419e-05, + "loss": 1.0786, "step": 331 }, { - "epoch": 0.22, - "learning_rate": 1.967427218618893e-05, - "loss": 1.3387, + "epoch": 0.034746206174777604, + "grad_norm": 3.747250332756373, + "learning_rate": 1.9998836639520843e-05, + "loss": 1.1941, "step": 332 }, { - "epoch": 0.22, - "learning_rate": 1.967156712031339e-05, - "loss": 1.3689, + "epoch": 0.034850863422291996, + "grad_norm": 3.406359341493149, + "learning_rate": 1.9998784361170893e-05, + "loss": 1.1671, "step": 333 }, { - "epoch": 0.22, - "learning_rate": 1.966885105599837e-05, - "loss": 1.3894, + "epoch": 0.03495552066980638, + "grad_norm": 2.8938267258829944, + "learning_rate": 1.9998730933940355e-05, + "loss": 1.0199, "step": 334 }, { - "epoch": 0.22, - "learning_rate": 1.9666123996332564e-05, - "loss": 1.3845, + "epoch": 0.03506017791732077, + "grad_norm": 3.1924364336563045, + "learning_rate": 1.999867635783536e-05, + "loss": 1.133, "step": 335 }, { - "epoch": 0.22, - "learning_rate": 1.966338594441716e-05, - "loss": 1.3974, + "epoch": 0.035164835164835165, + "grad_norm": 3.0001944625011707, + "learning_rate": 1.9998620632862184e-05, + "loss": 1.066, "step": 336 }, { - "epoch": 0.22, - "learning_rate": 1.9660636903365854e-05, - "loss": 1.4133, + "epoch": 0.03526949241234956, + "grad_norm": 3.3739671593672447, + "learning_rate": 1.9998563759027228e-05, + "loss": 0.993, "step": 337 }, { - "epoch": 0.22, - "learning_rate": 1.9657876876304835e-05, - "loss": 1.2042, + "epoch": 0.03537414965986395, + "grad_norm": 3.0009291832792693, + "learning_rate": 1.9998505736337022e-05, + "loss": 1.0397, "step": 338 }, { - "epoch": 0.22, - "learning_rate": 1.965510586637279e-05, - "loss": 1.3591, + "epoch": 0.035478806907378334, + "grad_norm": 3.57452005557107, + "learning_rate": 1.9998446564798243e-05, + "loss": 1.1239, "step": 339 }, { - "epoch": 0.22, - "learning_rate": 1.965232387672089e-05, - "loss": 1.4954, + "epoch": 0.035583464154892726, + "grad_norm": 3.4334676273713898, + "learning_rate": 1.9998386244417686e-05, + "loss": 1.059, "step": 340 }, { - "epoch": 0.22, - "learning_rate": 1.9649530910512796e-05, - "loss": 1.3161, + "epoch": 0.03568812140240712, + "grad_norm": 3.4932921996468846, + "learning_rate": 1.9998324775202277e-05, + "loss": 1.0886, "step": 341 }, { - "epoch": 0.23, - "learning_rate": 1.9646726970924645e-05, - "loss": 1.374, + "epoch": 0.03579277864992151, + "grad_norm": 3.293255380782077, + "learning_rate": 1.9998262157159087e-05, + "loss": 0.8386, "step": 342 }, { - "epoch": 0.23, - "learning_rate": 1.9643912061145065e-05, - "loss": 1.4118, + "epoch": 0.035897435897435895, + "grad_norm": 3.7924349792602188, + "learning_rate": 1.99981983902953e-05, + "loss": 1.1097, "step": 343 }, { - "epoch": 0.23, - "learning_rate": 1.9641086184375148e-05, - "loss": 1.3582, + "epoch": 0.03600209314495029, + "grad_norm": 3.4574803613996297, + "learning_rate": 1.9998133474618254e-05, + "loss": 1.1045, "step": 344 }, { - "epoch": 0.23, - "learning_rate": 1.9638249343828463e-05, - "loss": 1.2793, + "epoch": 0.03610675039246468, + "grad_norm": 3.2469937969404272, + "learning_rate": 1.9998067410135403e-05, + "loss": 1.1738, "step": 345 }, { - "epoch": 0.23, - "learning_rate": 1.9635401542731043e-05, - "loss": 1.4125, + "epoch": 0.03621140763997907, + "grad_norm": 1.9876001648747574, + "learning_rate": 1.999800019685434e-05, + "loss": 0.7593, "step": 346 }, { - "epoch": 0.23, - "learning_rate": 1.963254278432139e-05, - "loss": 1.3889, + "epoch": 0.036316064887493456, + "grad_norm": 3.247944294880278, + "learning_rate": 1.9997931834782783e-05, + "loss": 1.0247, "step": 347 }, { - "epoch": 0.23, - "learning_rate": 1.9629673071850463e-05, - "loss": 1.2159, + "epoch": 0.03642072213500785, + "grad_norm": 2.30114299659168, + "learning_rate": 1.999786232392859e-05, + "loss": 0.7979, "step": 348 }, { - "epoch": 0.23, - "learning_rate": 1.962679240858168e-05, - "loss": 1.2363, + "epoch": 0.03652537938252224, + "grad_norm": 3.0984066897700964, + "learning_rate": 1.999779166429975e-05, + "loss": 1.0613, "step": 349 }, { - "epoch": 0.23, - "learning_rate": 1.9623900797790914e-05, - "loss": 1.2471, + "epoch": 0.03663003663003663, + "grad_norm": 4.166217793012304, + "learning_rate": 1.999771985590438e-05, + "loss": 1.1616, "step": 350 }, { - "epoch": 0.23, - "learning_rate": 1.9620998242766482e-05, - "loss": 1.3256, + "epoch": 0.036734693877551024, + "grad_norm": 3.8538711686650697, + "learning_rate": 1.999764689875073e-05, + "loss": 1.034, "step": 351 }, { - "epoch": 0.23, - "learning_rate": 1.9618084746809147e-05, - "loss": 1.2357, + "epoch": 0.03683935112506541, + "grad_norm": 3.2661014681676273, + "learning_rate": 1.9997572792847186e-05, + "loss": 1.0767, "step": 352 }, { - "epoch": 0.23, - "learning_rate": 1.9615160313232122e-05, - "loss": 1.4569, + "epoch": 0.0369440083725798, + "grad_norm": 3.1154642175823195, + "learning_rate": 1.999749753820226e-05, + "loss": 1.1105, "step": 353 }, { - "epoch": 0.23, - "learning_rate": 1.961222494536105e-05, - "loss": 1.4797, + "epoch": 0.03704866562009419, + "grad_norm": 2.72464228837991, + "learning_rate": 1.99974211348246e-05, + "loss": 1.0197, "step": 354 }, { - "epoch": 0.23, - "learning_rate": 1.960927864653401e-05, - "loss": 1.2745, + "epoch": 0.037153322867608585, + "grad_norm": 4.151004358135479, + "learning_rate": 1.9997343582722984e-05, + "loss": 1.0986, "step": 355 }, { - "epoch": 0.23, - "learning_rate": 1.9606321420101514e-05, - "loss": 1.3715, + "epoch": 0.03725798011512297, + "grad_norm": 3.2194916123198096, + "learning_rate": 1.9997264881906324e-05, + "loss": 0.9507, "step": 356 }, { - "epoch": 0.24, - "learning_rate": 1.96033532694265e-05, - "loss": 1.2943, + "epoch": 0.03736263736263736, + "grad_norm": 3.094166715267168, + "learning_rate": 1.9997185032383663e-05, + "loss": 1.0726, "step": 357 }, { - "epoch": 0.24, - "learning_rate": 1.960037419788433e-05, - "loss": 1.338, + "epoch": 0.037467294610151754, + "grad_norm": 2.858902354576879, + "learning_rate": 1.9997104034164176e-05, + "loss": 0.9607, "step": 358 }, { - "epoch": 0.24, - "learning_rate": 1.9597384208862776e-05, - "loss": 1.246, + "epoch": 0.037571951857666146, + "grad_norm": 3.59796942440944, + "learning_rate": 1.9997021887257166e-05, + "loss": 1.1792, "step": 359 }, { - "epoch": 0.24, - "learning_rate": 1.9594383305762045e-05, - "loss": 1.3237, + "epoch": 0.03767660910518053, + "grad_norm": 3.4966139920154546, + "learning_rate": 1.9996938591672076e-05, + "loss": 1.1609, "step": 360 }, { - "epoch": 0.24, - "learning_rate": 1.9591371491994735e-05, - "loss": 1.3279, + "epoch": 0.03778126635269492, + "grad_norm": 3.2932872126047164, + "learning_rate": 1.9996854147418477e-05, + "loss": 1.1843, "step": 361 }, { - "epoch": 0.24, - "learning_rate": 1.958834877098586e-05, - "loss": 1.3185, + "epoch": 0.037885923600209315, + "grad_norm": 3.1771915837350897, + "learning_rate": 1.999676855450607e-05, + "loss": 1.1945, "step": 362 }, { - "epoch": 0.24, - "learning_rate": 1.9585315146172843e-05, - "loss": 1.3534, + "epoch": 0.03799058084772371, + "grad_norm": 3.0394535417384465, + "learning_rate": 1.999668181294469e-05, + "loss": 1.1143, "step": 363 }, { - "epoch": 0.24, - "learning_rate": 1.9582270621005494e-05, - "loss": 1.1566, + "epoch": 0.0380952380952381, + "grad_norm": 2.8537817079806067, + "learning_rate": 1.9996593922744308e-05, + "loss": 0.9112, "step": 364 }, { - "epoch": 0.24, - "learning_rate": 1.9579215198946028e-05, - "loss": 1.2134, + "epoch": 0.038199895342752484, + "grad_norm": 3.322133711285726, + "learning_rate": 1.9996504883915017e-05, + "loss": 1.1074, "step": 365 }, { - "epoch": 0.24, - "learning_rate": 1.9576148883469054e-05, - "loss": 1.3722, + "epoch": 0.038304552590266876, + "grad_norm": 3.0632114432772037, + "learning_rate": 1.999641469646705e-05, + "loss": 1.0545, "step": 366 }, { - "epoch": 0.24, - "learning_rate": 1.9573071678061565e-05, - "loss": 1.3042, + "epoch": 0.03840920983778127, + "grad_norm": 3.7007048473022235, + "learning_rate": 1.999632336041077e-05, + "loss": 1.174, "step": 367 }, { - "epoch": 0.24, - "learning_rate": 1.956998358622293e-05, - "loss": 1.4297, + "epoch": 0.03851386708529566, + "grad_norm": 3.2674017560088946, + "learning_rate": 1.999623087575667e-05, + "loss": 1.0591, "step": 368 }, { - "epoch": 0.24, - "learning_rate": 1.9566884611464916e-05, - "loss": 1.3346, + "epoch": 0.038618524332810045, + "grad_norm": 3.522836020354182, + "learning_rate": 1.9996137242515376e-05, + "loss": 1.0767, "step": 369 }, { - "epoch": 0.24, - "learning_rate": 1.9563774757311646e-05, - "loss": 1.2136, + "epoch": 0.03872318158032444, + "grad_norm": 3.827590637468791, + "learning_rate": 1.9996042460697654e-05, + "loss": 1.046, "step": 370 }, { - "epoch": 0.24, - "learning_rate": 1.956065402729963e-05, - "loss": 1.4095, + "epoch": 0.03882783882783883, + "grad_norm": 3.301858757724581, + "learning_rate": 1.9995946530314384e-05, + "loss": 1.0542, "step": 371 }, { - "epoch": 0.24, - "learning_rate": 1.9557522424977745e-05, - "loss": 1.2655, + "epoch": 0.03893249607535322, + "grad_norm": 3.119223795376493, + "learning_rate": 1.9995849451376593e-05, + "loss": 1.1795, "step": 372 }, { - "epoch": 0.25, - "learning_rate": 1.9554379953907217e-05, - "loss": 1.3623, + "epoch": 0.039037153322867606, + "grad_norm": 3.623763948471002, + "learning_rate": 1.999575122389544e-05, + "loss": 1.0188, "step": 373 }, { - "epoch": 0.25, - "learning_rate": 1.955122661766165e-05, - "loss": 1.3615, + "epoch": 0.039141810570382, + "grad_norm": 2.947104901430164, + "learning_rate": 1.9995651847882208e-05, + "loss": 1.0813, "step": 374 }, { - "epoch": 0.25, - "learning_rate": 1.954806241982699e-05, - "loss": 1.4546, + "epoch": 0.03924646781789639, + "grad_norm": 3.284552361791963, + "learning_rate": 1.9995551323348314e-05, + "loss": 1.1506, "step": 375 }, { - "epoch": 0.25, - "learning_rate": 1.954488736400155e-05, - "loss": 1.5023, + "epoch": 0.03935112506541078, + "grad_norm": 3.4207057905425566, + "learning_rate": 1.999544965030531e-05, + "loss": 1.0482, "step": 376 }, { - "epoch": 0.25, - "learning_rate": 1.9541701453795976e-05, - "loss": 1.3962, + "epoch": 0.03945578231292517, + "grad_norm": 3.9718583768537226, + "learning_rate": 1.999534682876488e-05, + "loss": 1.107, "step": 377 }, { - "epoch": 0.25, - "learning_rate": 1.9538504692833263e-05, - "loss": 1.313, + "epoch": 0.03956043956043956, + "grad_norm": 3.4233946505392274, + "learning_rate": 1.9995242858738834e-05, + "loss": 1.055, "step": 378 }, { - "epoch": 0.25, - "learning_rate": 1.9535297084748747e-05, - "loss": 1.2892, + "epoch": 0.03966509680795395, + "grad_norm": 3.4939287719910905, + "learning_rate": 1.999513774023912e-05, + "loss": 1.1309, "step": 379 }, { - "epoch": 0.25, - "learning_rate": 1.9532078633190097e-05, - "loss": 1.3584, + "epoch": 0.03976975405546834, + "grad_norm": 2.846888454584744, + "learning_rate": 1.9995031473277822e-05, + "loss": 1.0401, "step": 380 }, { - "epoch": 0.25, - "learning_rate": 1.9528849341817316e-05, - "loss": 1.4247, + "epoch": 0.039874411302982735, + "grad_norm": 3.4884066840679826, + "learning_rate": 1.9994924057867142e-05, + "loss": 1.1144, "step": 381 }, { - "epoch": 0.25, - "learning_rate": 1.9525609214302727e-05, - "loss": 1.4444, + "epoch": 0.03997906855049712, + "grad_norm": 3.2953541736196095, + "learning_rate": 1.999481549401943e-05, + "loss": 1.0356, "step": 382 }, { - "epoch": 0.25, - "learning_rate": 1.9522358254330987e-05, - "loss": 1.3144, + "epoch": 0.04008372579801151, + "grad_norm": 3.8515521747365353, + "learning_rate": 1.999470578174715e-05, + "loss": 1.0333, "step": 383 }, { - "epoch": 0.25, - "learning_rate": 1.951909646559906e-05, - "loss": 1.4022, + "epoch": 0.040188383045525904, + "grad_norm": 3.862495676492481, + "learning_rate": 1.999459492106292e-05, + "loss": 1.0685, "step": 384 }, { - "epoch": 0.25, - "learning_rate": 1.951582385181623e-05, - "loss": 1.3433, + "epoch": 0.040293040293040296, + "grad_norm": 3.8607869501733596, + "learning_rate": 1.999448291197947e-05, + "loss": 1.0893, "step": 385 }, { - "epoch": 0.25, - "learning_rate": 1.9512540416704095e-05, - "loss": 1.4016, + "epoch": 0.04039769754055468, + "grad_norm": 3.108316063988891, + "learning_rate": 1.999436975450967e-05, + "loss": 1.129, "step": 386 }, { - "epoch": 0.25, - "learning_rate": 1.950924616399655e-05, - "loss": 1.3799, + "epoch": 0.04050235478806907, + "grad_norm": 3.1460718799511724, + "learning_rate": 1.9994255448666528e-05, + "loss": 1.0922, "step": 387 }, { - "epoch": 0.26, - "learning_rate": 1.9505941097439805e-05, - "loss": 1.4395, + "epoch": 0.040607012035583465, + "grad_norm": 4.815195078193566, + "learning_rate": 1.9994139994463174e-05, + "loss": 1.0287, "step": 388 }, { - "epoch": 0.26, - "learning_rate": 1.9502625220792345e-05, - "loss": 1.303, + "epoch": 0.04071166928309786, + "grad_norm": 3.2133489103871464, + "learning_rate": 1.9994023391912873e-05, + "loss": 0.9636, "step": 389 }, { - "epoch": 0.26, - "learning_rate": 1.9499298537824972e-05, - "loss": 1.3902, + "epoch": 0.04081632653061224, + "grad_norm": 3.0323586032384466, + "learning_rate": 1.9993905641029024e-05, + "loss": 1.1492, "step": 390 }, { - "epoch": 0.26, - "learning_rate": 1.949596105232077e-05, - "loss": 1.3047, + "epoch": 0.040920983778126634, + "grad_norm": 3.408180618813306, + "learning_rate": 1.999378674182516e-05, + "loss": 1.1212, "step": 391 }, { - "epoch": 0.26, - "learning_rate": 1.9492612768075094e-05, - "loss": 1.1657, + "epoch": 0.041025641025641026, + "grad_norm": 2.832485184741635, + "learning_rate": 1.9993666694314938e-05, + "loss": 1.1055, "step": 392 }, { - "epoch": 0.26, - "learning_rate": 1.9489253688895594e-05, - "loss": 1.4182, + "epoch": 0.04113029827315542, + "grad_norm": 3.3779814563246826, + "learning_rate": 1.999354549851215e-05, + "loss": 1.0783, "step": 393 }, { - "epoch": 0.26, - "learning_rate": 1.9485883818602194e-05, - "loss": 1.205, + "epoch": 0.04123495552066981, + "grad_norm": 3.15338999466506, + "learning_rate": 1.9993423154430732e-05, + "loss": 1.1143, "step": 394 }, { - "epoch": 0.26, - "learning_rate": 1.948250316102709e-05, - "loss": 1.3438, + "epoch": 0.041339612768184195, + "grad_norm": 3.651309843882222, + "learning_rate": 1.9993299662084734e-05, + "loss": 1.1113, "step": 395 }, { - "epoch": 0.26, - "learning_rate": 1.9479111720014737e-05, - "loss": 1.3099, + "epoch": 0.04144427001569859, + "grad_norm": 2.5508921394839152, + "learning_rate": 1.9993175021488343e-05, + "loss": 0.8452, "step": 396 }, { - "epoch": 0.26, - "learning_rate": 1.947570949942186e-05, - "loss": 1.346, + "epoch": 0.04154892726321298, + "grad_norm": 3.3357843841110077, + "learning_rate": 1.9993049232655882e-05, + "loss": 1.1295, "step": 397 }, { - "epoch": 0.26, - "learning_rate": 1.9472296503117437e-05, - "loss": 1.4133, + "epoch": 0.04165358451072737, + "grad_norm": 3.9695559727733483, + "learning_rate": 1.999292229560181e-05, + "loss": 0.9869, "step": 398 }, { - "epoch": 0.26, - "learning_rate": 1.9468872734982705e-05, - "loss": 1.5194, + "epoch": 0.041758241758241756, + "grad_norm": 3.7523305234520024, + "learning_rate": 1.9992794210340707e-05, + "loss": 1.1228, "step": 399 }, { - "epoch": 0.26, - "learning_rate": 1.9465438198911158e-05, - "loss": 1.3365, + "epoch": 0.04186289900575615, + "grad_norm": 3.5321821977005503, + "learning_rate": 1.999266497688729e-05, + "loss": 1.0244, "step": 400 }, { - "epoch": 0.26, - "learning_rate": 1.946199289880852e-05, - "loss": 1.3004, + "epoch": 0.04196755625327054, + "grad_norm": 3.433702518298409, + "learning_rate": 1.9992534595256414e-05, + "loss": 1.1635, "step": 401 }, { - "epoch": 0.26, - "learning_rate": 1.9458536838592764e-05, - "loss": 1.4811, + "epoch": 0.04207221350078493, + "grad_norm": 3.3240435185890482, + "learning_rate": 1.9992403065463053e-05, + "loss": 1.1308, "step": 402 }, { - "epoch": 0.27, - "learning_rate": 1.9455070022194094e-05, - "loss": 1.3259, + "epoch": 0.04217687074829932, + "grad_norm": 2.418715245595869, + "learning_rate": 1.9992270387522327e-05, + "loss": 0.7399, "step": 403 }, { - "epoch": 0.27, - "learning_rate": 1.9451592453554956e-05, - "loss": 1.2946, + "epoch": 0.04228152799581371, + "grad_norm": 3.4217501748246497, + "learning_rate": 1.999213656144947e-05, + "loss": 1.0257, "step": 404 }, { - "epoch": 0.27, - "learning_rate": 1.944810413663001e-05, - "loss": 1.2141, + "epoch": 0.0423861852433281, + "grad_norm": 3.5574880915366642, + "learning_rate": 1.9992001587259872e-05, + "loss": 1.1526, "step": 405 }, { - "epoch": 0.27, - "learning_rate": 1.9444605075386156e-05, - "loss": 1.4021, + "epoch": 0.04249084249084249, + "grad_norm": 3.439254247799819, + "learning_rate": 1.9991865464969035e-05, + "loss": 1.0449, "step": 406 }, { - "epoch": 0.27, - "learning_rate": 1.944109527380249e-05, - "loss": 1.455, + "epoch": 0.042595499738356885, + "grad_norm": 3.5204318338726273, + "learning_rate": 1.99917281945926e-05, + "loss": 1.1471, "step": 407 }, { - "epoch": 0.27, - "learning_rate": 1.9437574735870346e-05, - "loss": 1.4715, + "epoch": 0.04270015698587127, + "grad_norm": 3.858873349887819, + "learning_rate": 1.9991589776146335e-05, + "loss": 1.0934, "step": 408 }, { - "epoch": 0.27, - "learning_rate": 1.9434043465593247e-05, - "loss": 1.4539, + "epoch": 0.04280481423338566, + "grad_norm": 3.362234619886235, + "learning_rate": 1.9991450209646153e-05, + "loss": 1.1658, "step": 409 }, { - "epoch": 0.27, - "learning_rate": 1.9430501466986932e-05, - "loss": 1.4801, + "epoch": 0.042909471480900054, + "grad_norm": 3.220742237697856, + "learning_rate": 1.999130949510809e-05, + "loss": 1.1169, "step": 410 }, { - "epoch": 0.27, - "learning_rate": 1.9426948744079338e-05, - "loss": 1.2148, + "epoch": 0.043014128728414446, + "grad_norm": 2.9342623132519536, + "learning_rate": 1.999116763254831e-05, + "loss": 0.9709, "step": 411 }, { - "epoch": 0.27, - "learning_rate": 1.94233853009106e-05, - "loss": 1.2596, + "epoch": 0.04311878597592883, + "grad_norm": 3.0789423142360572, + "learning_rate": 1.9991024621983115e-05, + "loss": 1.0549, "step": 412 }, { - "epoch": 0.27, - "learning_rate": 1.9419811141533032e-05, - "loss": 1.2145, + "epoch": 0.04322344322344322, + "grad_norm": 3.5769673216250717, + "learning_rate": 1.9990880463428938e-05, + "loss": 1.0587, "step": 413 }, { - "epoch": 0.27, - "learning_rate": 1.9416226270011152e-05, - "loss": 1.2662, + "epoch": 0.043328100470957615, + "grad_norm": 3.387771571360907, + "learning_rate": 1.9990735156902337e-05, + "loss": 1.1727, "step": 414 }, { - "epoch": 0.27, - "learning_rate": 1.9412630690421648e-05, - "loss": 1.302, + "epoch": 0.043432757718472007, + "grad_norm": 3.8004457138302237, + "learning_rate": 1.9990588702420017e-05, + "loss": 1.0929, "step": 415 }, { - "epoch": 0.27, - "learning_rate": 1.940902440685339e-05, - "loss": 1.2591, + "epoch": 0.04353741496598639, + "grad_norm": 2.9911775987883606, + "learning_rate": 1.9990441099998802e-05, + "loss": 1.0605, "step": 416 }, { - "epoch": 0.27, - "learning_rate": 1.9405407423407414e-05, - "loss": 1.4317, + "epoch": 0.043642072213500784, + "grad_norm": 3.3225073464157875, + "learning_rate": 1.999029234965565e-05, + "loss": 1.101, "step": 417 }, { - "epoch": 0.28, - "learning_rate": 1.9401779744196934e-05, - "loss": 1.3283, + "epoch": 0.043746729461015176, + "grad_norm": 3.142201219435067, + "learning_rate": 1.9990142451407658e-05, + "loss": 1.1804, "step": 418 }, { - "epoch": 0.28, - "learning_rate": 1.9398141373347317e-05, - "loss": 1.2165, + "epoch": 0.04385138670852957, + "grad_norm": 3.2497520075753905, + "learning_rate": 1.9989991405272043e-05, + "loss": 1.09, "step": 419 }, { - "epoch": 0.28, - "learning_rate": 1.93944923149961e-05, - "loss": 1.3613, + "epoch": 0.04395604395604396, + "grad_norm": 2.8504351959056646, + "learning_rate": 1.9989839211266164e-05, + "loss": 1.089, "step": 420 }, { - "epoch": 0.28, - "learning_rate": 1.9390832573292964e-05, - "loss": 1.3744, + "epoch": 0.044060701203558345, + "grad_norm": 3.425903015712905, + "learning_rate": 1.998968586940751e-05, + "loss": 1.1222, "step": 421 }, { - "epoch": 0.28, - "learning_rate": 1.938716215239974e-05, - "loss": 1.1967, + "epoch": 0.044165358451072737, + "grad_norm": 3.12857526376575, + "learning_rate": 1.9989531379713697e-05, + "loss": 1.1248, "step": 422 }, { - "epoch": 0.28, - "learning_rate": 1.9383481056490402e-05, - "loss": 1.2715, + "epoch": 0.04427001569858713, + "grad_norm": 2.753045621554163, + "learning_rate": 1.998937574220248e-05, + "loss": 1.076, "step": 423 }, { - "epoch": 0.28, - "learning_rate": 1.9379789289751073e-05, - "loss": 1.3597, + "epoch": 0.04437467294610152, + "grad_norm": 3.9313106712974, + "learning_rate": 1.998921895689174e-05, + "loss": 0.9975, "step": 424 }, { - "epoch": 0.28, - "learning_rate": 1.9376086856380007e-05, - "loss": 1.4245, + "epoch": 0.044479330193615906, + "grad_norm": 3.1879760675576465, + "learning_rate": 1.9989061023799486e-05, + "loss": 1.0489, "step": 425 }, { - "epoch": 0.28, - "learning_rate": 1.937237376058758e-05, - "loss": 1.3464, + "epoch": 0.0445839874411303, + "grad_norm": 3.5604479042648447, + "learning_rate": 1.9988901942943874e-05, + "loss": 1.0179, "step": 426 }, { - "epoch": 0.28, - "learning_rate": 1.93686500065963e-05, - "loss": 1.4296, + "epoch": 0.04468864468864469, + "grad_norm": 3.4144953084922376, + "learning_rate": 1.998874171434318e-05, + "loss": 0.9328, "step": 427 }, { - "epoch": 0.28, - "learning_rate": 1.9364915598640796e-05, - "loss": 1.2627, + "epoch": 0.04479330193615908, + "grad_norm": 2.955908397360233, + "learning_rate": 1.998858033801581e-05, + "loss": 1.1155, "step": 428 }, { - "epoch": 0.28, - "learning_rate": 1.936117054096781e-05, - "loss": 1.2727, + "epoch": 0.044897959183673466, + "grad_norm": 3.6880350580445254, + "learning_rate": 1.9988417813980315e-05, + "loss": 1.1311, "step": 429 }, { - "epoch": 0.28, - "learning_rate": 1.9357414837836192e-05, - "loss": 1.2721, + "epoch": 0.04500261643118786, + "grad_norm": 3.3607014595989178, + "learning_rate": 1.9988254142255362e-05, + "loss": 1.1893, "step": 430 }, { - "epoch": 0.28, - "learning_rate": 1.935364849351691e-05, - "loss": 1.3247, + "epoch": 0.04510727367870225, + "grad_norm": 2.8727076333580395, + "learning_rate": 1.998808932285976e-05, + "loss": 0.9684, "step": 431 }, { - "epoch": 0.28, - "learning_rate": 1.934987151229302e-05, - "loss": 1.5809, + "epoch": 0.04521193092621664, + "grad_norm": 3.439241913580116, + "learning_rate": 1.9987923355812448e-05, + "loss": 1.0141, "step": 432 }, { - "epoch": 0.29, - "learning_rate": 1.934608389845968e-05, - "loss": 1.3678, + "epoch": 0.04531658817373103, + "grad_norm": 2.9285693240861783, + "learning_rate": 1.9987756241132494e-05, + "loss": 1.0593, "step": 433 }, { - "epoch": 0.29, - "learning_rate": 1.9342285656324135e-05, - "loss": 1.4506, + "epoch": 0.04542124542124542, + "grad_norm": 3.670926244435246, + "learning_rate": 1.99875879788391e-05, + "loss": 1.0491, "step": 434 }, { - "epoch": 0.29, - "learning_rate": 1.9338476790205726e-05, - "loss": 1.2935, + "epoch": 0.04552590266875981, + "grad_norm": 3.006014723154654, + "learning_rate": 1.99874185689516e-05, + "loss": 1.1722, "step": 435 }, { - "epoch": 0.29, - "learning_rate": 1.933465730443586e-05, - "loss": 1.1895, + "epoch": 0.0456305599162742, + "grad_norm": 3.7892151003300305, + "learning_rate": 1.9987248011489462e-05, + "loss": 1.0608, "step": 436 }, { - "epoch": 0.29, - "learning_rate": 1.9330827203358044e-05, - "loss": 1.248, + "epoch": 0.045735217163788595, + "grad_norm": 3.2332272253735765, + "learning_rate": 1.998707630647228e-05, + "loss": 1.1638, "step": 437 }, { - "epoch": 0.29, - "learning_rate": 1.932698649132783e-05, - "loss": 1.3905, + "epoch": 0.04583987441130298, + "grad_norm": 3.480839997662857, + "learning_rate": 1.998690345391978e-05, + "loss": 1.0832, "step": 438 }, { - "epoch": 0.29, - "learning_rate": 1.932313517271285e-05, - "loss": 1.3918, + "epoch": 0.04594453165881737, + "grad_norm": 3.8054324863562385, + "learning_rate": 1.9986729453851833e-05, + "loss": 1.0328, "step": 439 }, { - "epoch": 0.29, - "learning_rate": 1.9319273251892805e-05, - "loss": 1.3228, + "epoch": 0.046049188906331764, + "grad_norm": 3.8488459066381497, + "learning_rate": 1.998655430628843e-05, + "loss": 0.9417, "step": 440 }, { - "epoch": 0.29, - "learning_rate": 1.931540073325944e-05, - "loss": 1.4809, + "epoch": 0.046153846153846156, + "grad_norm": 3.1253018165861786, + "learning_rate": 1.9986378011249684e-05, + "loss": 1.1043, "step": 441 }, { - "epoch": 0.29, - "learning_rate": 1.9311517621216555e-05, - "loss": 1.4098, + "epoch": 0.04625850340136054, + "grad_norm": 3.5195863235640408, + "learning_rate": 1.9986200568755863e-05, + "loss": 1.1416, "step": 442 }, { - "epoch": 0.29, - "learning_rate": 1.930762392018e-05, - "loss": 1.2479, + "epoch": 0.04636316064887493, + "grad_norm": 3.38872021874277, + "learning_rate": 1.9986021978827353e-05, + "loss": 1.1378, "step": 443 }, { - "epoch": 0.29, - "learning_rate": 1.9303719634577668e-05, - "loss": 1.4779, + "epoch": 0.046467817896389325, + "grad_norm": 3.2198779898823475, + "learning_rate": 1.9985842241484678e-05, + "loss": 1.1032, "step": 444 }, { - "epoch": 0.29, - "learning_rate": 1.929980476884948e-05, - "loss": 1.2986, + "epoch": 0.04657247514390372, + "grad_norm": 3.037988731718675, + "learning_rate": 1.998566135674848e-05, + "loss": 1.0834, "step": 445 }, { - "epoch": 0.29, - "learning_rate": 1.92958793274474e-05, - "loss": 1.3807, + "epoch": 0.0466771323914181, + "grad_norm": 3.113016605036802, + "learning_rate": 1.998547932463955e-05, + "loss": 1.0245, "step": 446 }, { - "epoch": 0.29, - "learning_rate": 1.9291943314835406e-05, - "loss": 1.535, + "epoch": 0.046781789638932494, + "grad_norm": 2.981225578291425, + "learning_rate": 1.9985296145178803e-05, + "loss": 1.1454, "step": 447 }, { - "epoch": 0.29, - "learning_rate": 1.928799673548951e-05, - "loss": 1.1961, + "epoch": 0.046886446886446886, + "grad_norm": 3.2756449737549818, + "learning_rate": 1.998511181838729e-05, + "loss": 1.0624, "step": 448 }, { - "epoch": 0.3, - "learning_rate": 1.9284039593897734e-05, - "loss": 1.327, + "epoch": 0.04699110413396128, + "grad_norm": 3.2146017721872417, + "learning_rate": 1.9984926344286184e-05, + "loss": 1.121, "step": 449 }, { - "epoch": 0.3, - "learning_rate": 1.928007189456011e-05, - "loss": 1.5256, + "epoch": 0.04709576138147567, + "grad_norm": 3.6515983191470642, + "learning_rate": 1.99847397228968e-05, + "loss": 1.0705, "step": 450 }, { - "epoch": 0.3, - "learning_rate": 1.9276093641988684e-05, - "loss": 1.5381, + "epoch": 0.047200418628990055, + "grad_norm": 2.7925140240267536, + "learning_rate": 1.998455195424058e-05, + "loss": 1.0412, "step": 451 }, { - "epoch": 0.3, - "learning_rate": 1.9272104840707487e-05, - "loss": 1.323, + "epoch": 0.04730507587650445, + "grad_norm": 3.2058786484672184, + "learning_rate": 1.99843630383391e-05, + "loss": 1.0438, "step": 452 }, { - "epoch": 0.3, - "learning_rate": 1.9268105495252568e-05, - "loss": 1.3281, + "epoch": 0.04740973312401884, + "grad_norm": 3.206199705921322, + "learning_rate": 1.9984172975214068e-05, + "loss": 1.1241, "step": 453 }, { - "epoch": 0.3, - "learning_rate": 1.9264095610171944e-05, - "loss": 1.3403, + "epoch": 0.04751439037153323, + "grad_norm": 2.96917837023507, + "learning_rate": 1.998398176488732e-05, + "loss": 1.0307, "step": 454 }, { - "epoch": 0.3, - "learning_rate": 1.926007519002564e-05, - "loss": 1.3414, + "epoch": 0.047619047619047616, + "grad_norm": 2.2142798706812514, + "learning_rate": 1.9983789407380828e-05, + "loss": 0.7792, "step": 455 }, { - "epoch": 0.3, - "learning_rate": 1.925604423938564e-05, - "loss": 1.2793, + "epoch": 0.04772370486656201, + "grad_norm": 2.1393293337536754, + "learning_rate": 1.9983595902716693e-05, + "loss": 0.8014, "step": 456 }, { - "epoch": 0.3, - "learning_rate": 1.9252002762835917e-05, - "loss": 1.3011, + "epoch": 0.0478283621140764, + "grad_norm": 3.088242788316876, + "learning_rate": 1.998340125091715e-05, + "loss": 1.0111, "step": 457 }, { - "epoch": 0.3, - "learning_rate": 1.924795076497241e-05, - "loss": 1.2783, + "epoch": 0.04793301936159079, + "grad_norm": 3.001125565502091, + "learning_rate": 1.9983205452004566e-05, + "loss": 0.994, "step": 458 }, { - "epoch": 0.3, - "learning_rate": 1.924388825040303e-05, - "loss": 1.2557, + "epoch": 0.04803767660910518, + "grad_norm": 3.783192428283837, + "learning_rate": 1.9983008506001437e-05, + "loss": 1.1141, "step": 459 }, { - "epoch": 0.3, - "learning_rate": 1.9239815223747632e-05, - "loss": 1.3768, + "epoch": 0.04814233385661957, + "grad_norm": 3.004204145609777, + "learning_rate": 1.9982810412930393e-05, + "loss": 1.1247, "step": 460 }, { - "epoch": 0.3, - "learning_rate": 1.9235731689638035e-05, - "loss": 1.2036, + "epoch": 0.04824699110413396, + "grad_norm": 3.1685378042245147, + "learning_rate": 1.9982611172814197e-05, + "loss": 1.1299, "step": 461 }, { - "epoch": 0.3, - "learning_rate": 1.9231637652718004e-05, - "loss": 1.2012, + "epoch": 0.04835164835164835, + "grad_norm": 3.1587611211309907, + "learning_rate": 1.9982410785675735e-05, + "loss": 1.112, "step": 462 }, { - "epoch": 0.3, - "learning_rate": 1.9227533117643256e-05, - "loss": 1.4446, + "epoch": 0.048456305599162745, + "grad_norm": 3.458436913532201, + "learning_rate": 1.9982209251538043e-05, + "loss": 1.0351, "step": 463 }, { - "epoch": 0.31, - "learning_rate": 1.922341808908144e-05, - "loss": 1.3174, + "epoch": 0.04856096284667713, + "grad_norm": 2.969870331031836, + "learning_rate": 1.998200657042427e-05, + "loss": 1.1774, "step": 464 }, { - "epoch": 0.31, - "learning_rate": 1.921929257171213e-05, - "loss": 1.3249, + "epoch": 0.04866562009419152, + "grad_norm": 3.763085180450192, + "learning_rate": 1.9981802742357704e-05, + "loss": 1.0587, "step": 465 }, { - "epoch": 0.31, - "learning_rate": 1.9215156570226844e-05, - "loss": 1.255, + "epoch": 0.048770277341705914, + "grad_norm": 3.33040533060541, + "learning_rate": 1.998159776736177e-05, + "loss": 1.0296, "step": 466 }, { - "epoch": 0.31, - "learning_rate": 1.921101008932901e-05, - "loss": 1.2195, + "epoch": 0.048874934589220306, + "grad_norm": 3.290492374884369, + "learning_rate": 1.998139164546002e-05, + "loss": 1.2095, "step": 467 }, { - "epoch": 0.31, - "learning_rate": 1.920685313373398e-05, - "loss": 1.3089, + "epoch": 0.04897959183673469, + "grad_norm": 2.857263563106438, + "learning_rate": 1.998118437667613e-05, + "loss": 0.9923, "step": 468 }, { - "epoch": 0.31, - "learning_rate": 1.920268570816901e-05, - "loss": 1.2939, + "epoch": 0.04908424908424908, + "grad_norm": 3.053876121052991, + "learning_rate": 1.9980975961033925e-05, + "loss": 0.8369, "step": 469 }, { - "epoch": 0.31, - "learning_rate": 1.9198507817373274e-05, - "loss": 1.4276, + "epoch": 0.049188906331763475, + "grad_norm": 3.177534083313426, + "learning_rate": 1.998076639855735e-05, + "loss": 1.0123, "step": 470 }, { - "epoch": 0.31, - "learning_rate": 1.919431946609784e-05, - "loss": 1.3933, + "epoch": 0.04929356357927787, + "grad_norm": 4.140885802398993, + "learning_rate": 1.998055568927048e-05, + "loss": 0.9744, "step": 471 }, { - "epoch": 0.31, - "learning_rate": 1.9190120659105663e-05, - "loss": 1.3057, + "epoch": 0.04939822082679225, + "grad_norm": 2.9041664415140396, + "learning_rate": 1.9980343833197528e-05, + "loss": 1.0679, "step": 472 }, { - "epoch": 0.31, - "learning_rate": 1.9185911401171615e-05, - "loss": 1.3188, + "epoch": 0.049502878074306644, + "grad_norm": 2.942481635418397, + "learning_rate": 1.998013083036284e-05, + "loss": 1.099, "step": 473 }, { - "epoch": 0.31, - "learning_rate": 1.918169169708242e-05, - "loss": 1.2174, + "epoch": 0.049607535321821036, + "grad_norm": 2.9546879658340486, + "learning_rate": 1.9979916680790885e-05, + "loss": 1.0112, "step": 474 }, { - "epoch": 0.31, - "learning_rate": 1.91774615516367e-05, - "loss": 1.4041, + "epoch": 0.04971219256933543, + "grad_norm": 3.0965296597879113, + "learning_rate": 1.997970138450627e-05, + "loss": 1.0086, "step": 475 }, { - "epoch": 0.31, - "learning_rate": 1.9173220969644947e-05, - "loss": 1.1018, + "epoch": 0.04981684981684982, + "grad_norm": 3.855575845463165, + "learning_rate": 1.997948494153374e-05, + "loss": 1.1754, "step": 476 }, { - "epoch": 0.31, - "learning_rate": 1.916896995592953e-05, - "loss": 1.3083, + "epoch": 0.049921507064364205, + "grad_norm": 3.993215917358464, + "learning_rate": 1.997926735189816e-05, + "loss": 1.0746, "step": 477 }, { - "epoch": 0.31, - "learning_rate": 1.916470851532466e-05, - "loss": 1.1981, + "epoch": 0.0500261643118786, + "grad_norm": 3.0492290153570094, + "learning_rate": 1.9979048615624526e-05, + "loss": 1.1768, "step": 478 }, { - "epoch": 0.32, - "learning_rate": 1.9160436652676425e-05, - "loss": 1.4576, + "epoch": 0.05013082155939299, + "grad_norm": 3.6838636144975068, + "learning_rate": 1.997882873273798e-05, + "loss": 1.0034, "step": 479 }, { - "epoch": 0.32, - "learning_rate": 1.9156154372842757e-05, - "loss": 1.1978, + "epoch": 0.05023547880690738, + "grad_norm": 3.4794216494384926, + "learning_rate": 1.9978607703263783e-05, + "loss": 1.0218, "step": 480 }, { - "epoch": 0.32, - "learning_rate": 1.9151861680693432e-05, - "loss": 1.193, + "epoch": 0.050340136054421766, + "grad_norm": 3.241477320849195, + "learning_rate": 1.9978385527227334e-05, + "loss": 1.0337, "step": 481 }, { - "epoch": 0.32, - "learning_rate": 1.914755858111008e-05, - "loss": 1.2737, + "epoch": 0.05044479330193616, + "grad_norm": 3.6191060732119857, + "learning_rate": 1.9978162204654155e-05, + "loss": 1.1394, "step": 482 }, { - "epoch": 0.32, - "learning_rate": 1.914324507898614e-05, - "loss": 1.3568, + "epoch": 0.05054945054945055, + "grad_norm": 2.8341172658181253, + "learning_rate": 1.9977937735569915e-05, + "loss": 1.0184, "step": 483 }, { - "epoch": 0.32, - "learning_rate": 1.9138921179226908e-05, - "loss": 1.4216, + "epoch": 0.05065410779696494, + "grad_norm": 3.16341632485204, + "learning_rate": 1.99777121200004e-05, + "loss": 1.0631, "step": 484 }, { - "epoch": 0.32, - "learning_rate": 1.9134586886749492e-05, - "loss": 1.2753, + "epoch": 0.05075876504447933, + "grad_norm": 3.015431914842525, + "learning_rate": 1.9977485357971535e-05, + "loss": 1.2249, "step": 485 }, { - "epoch": 0.32, - "learning_rate": 1.9130242206482817e-05, - "loss": 1.3887, + "epoch": 0.05086342229199372, + "grad_norm": 15.10309744378473, + "learning_rate": 1.997725744950938e-05, + "loss": 1.0752, "step": 486 }, { - "epoch": 0.32, - "learning_rate": 1.912588714336762e-05, - "loss": 1.3849, + "epoch": 0.05096807953950811, + "grad_norm": 3.0329623555885776, + "learning_rate": 1.9977028394640113e-05, + "loss": 1.1592, "step": 487 }, { - "epoch": 0.32, - "learning_rate": 1.912152170235646e-05, - "loss": 1.3427, + "epoch": 0.0510727367870225, + "grad_norm": 2.993215052911289, + "learning_rate": 1.997679819339006e-05, + "loss": 1.1015, "step": 488 }, { - "epoch": 0.32, - "learning_rate": 1.9117145888413678e-05, - "loss": 1.3543, + "epoch": 0.05117739403453689, + "grad_norm": 3.028558942288711, + "learning_rate": 1.997656684578567e-05, + "loss": 1.0943, "step": 489 }, { - "epoch": 0.32, - "learning_rate": 1.911275970651542e-05, - "loss": 1.3034, + "epoch": 0.05128205128205128, + "grad_norm": 2.6575824106168677, + "learning_rate": 1.9976334351853522e-05, + "loss": 1.1031, "step": 490 }, { - "epoch": 0.32, - "learning_rate": 1.910836316164962e-05, - "loss": 1.4744, + "epoch": 0.05138670852956567, + "grad_norm": 3.262048948295798, + "learning_rate": 1.997610071162033e-05, + "loss": 1.0627, "step": 491 }, { - "epoch": 0.32, - "learning_rate": 1.9103956258816008e-05, - "loss": 1.3314, + "epoch": 0.051491365777080064, + "grad_norm": 2.942164825678646, + "learning_rate": 1.997586592511295e-05, + "loss": 1.1196, "step": 492 }, { - "epoch": 0.32, - "learning_rate": 1.909953900302607e-05, - "loss": 1.4215, + "epoch": 0.051596023024594456, + "grad_norm": 3.143793552468755, + "learning_rate": 1.997562999235835e-05, + "loss": 1.1187, "step": 493 }, { - "epoch": 0.33, - "learning_rate": 1.909511139930309e-05, - "loss": 1.2962, + "epoch": 0.05170068027210884, + "grad_norm": 3.6555595068602846, + "learning_rate": 1.997539291338364e-05, + "loss": 0.9745, "step": 494 }, { - "epoch": 0.33, - "learning_rate": 1.9090673452682108e-05, - "loss": 1.3132, + "epoch": 0.05180533751962323, + "grad_norm": 3.3509907787982494, + "learning_rate": 1.9975154688216064e-05, + "loss": 0.9672, "step": 495 }, { - "epoch": 0.33, - "learning_rate": 1.9086225168209917e-05, - "loss": 1.2975, + "epoch": 0.051909994767137625, + "grad_norm": 3.446789550285205, + "learning_rate": 1.997491531688299e-05, + "loss": 1.0936, "step": 496 }, { - "epoch": 0.33, - "learning_rate": 1.9081766550945084e-05, - "loss": 1.3207, + "epoch": 0.05201465201465202, + "grad_norm": 3.215446069445594, + "learning_rate": 1.9974674799411927e-05, + "loss": 1.1857, "step": 497 }, { - "epoch": 0.33, - "learning_rate": 1.907729760595791e-05, - "loss": 1.3353, + "epoch": 0.0521193092621664, + "grad_norm": 3.694397906430922, + "learning_rate": 1.9974433135830505e-05, + "loss": 1.0549, "step": 498 }, { - "epoch": 0.33, - "learning_rate": 1.9072818338330455e-05, - "loss": 1.2535, + "epoch": 0.052223966509680794, + "grad_norm": 3.5507510319349462, + "learning_rate": 1.9974190326166498e-05, + "loss": 1.0954, "step": 499 }, { - "epoch": 0.33, - "learning_rate": 1.9068328753156514e-05, - "loss": 1.0754, + "epoch": 0.052328623757195186, + "grad_norm": 2.911759573200471, + "learning_rate": 1.9973946370447804e-05, + "loss": 1.1315, "step": 500 }, { - "epoch": 0.33, - "learning_rate": 1.9063828855541605e-05, - "loss": 1.3487, + "epoch": 0.05243328100470958, + "grad_norm": 3.830707804003234, + "learning_rate": 1.9973701268702454e-05, + "loss": 1.1961, "step": 501 }, { - "epoch": 0.33, - "learning_rate": 1.9059318650602986e-05, - "loss": 1.4404, + "epoch": 0.05253793825222396, + "grad_norm": 3.5874003170282953, + "learning_rate": 1.9973455020958602e-05, + "loss": 1.1831, "step": 502 }, { - "epoch": 0.33, - "learning_rate": 1.905479814346962e-05, - "loss": 1.2397, + "epoch": 0.052642595499738355, + "grad_norm": 5.067523741996903, + "learning_rate": 1.9973207627244556e-05, + "loss": 1.06, "step": 503 }, { - "epoch": 0.33, - "learning_rate": 1.9050267339282214e-05, - "loss": 1.378, + "epoch": 0.05274725274725275, + "grad_norm": 3.126368066369804, + "learning_rate": 1.9972959087588734e-05, + "loss": 0.7553, "step": 504 }, { - "epoch": 0.33, - "learning_rate": 1.9045726243193155e-05, - "loss": 1.2465, + "epoch": 0.05285190999476714, + "grad_norm": 3.7926866203618923, + "learning_rate": 1.9972709402019696e-05, + "loss": 1.1616, "step": 505 }, { - "epoch": 0.33, - "learning_rate": 1.904117486036655e-05, - "loss": 1.3214, + "epoch": 0.05295656724228153, + "grad_norm": 3.098260850078986, + "learning_rate": 1.9972458570566134e-05, + "loss": 1.0706, "step": 506 }, { - "epoch": 0.33, - "learning_rate": 1.9036613195978198e-05, - "loss": 1.3161, + "epoch": 0.053061224489795916, + "grad_norm": 3.121342346390446, + "learning_rate": 1.9972206593256863e-05, + "loss": 1.0746, "step": 507 }, { - "epoch": 0.33, - "learning_rate": 1.9032041255215596e-05, - "loss": 1.2936, + "epoch": 0.05316588173731031, + "grad_norm": 2.3263594607795484, + "learning_rate": 1.997195347012084e-05, + "loss": 0.8619, "step": 508 }, { - "epoch": 0.34, - "learning_rate": 1.9027459043277923e-05, - "loss": 1.436, + "epoch": 0.0532705389848247, + "grad_norm": 4.313521927576008, + "learning_rate": 1.997169920118714e-05, + "loss": 0.9864, "step": 509 }, { - "epoch": 0.34, - "learning_rate": 1.902286656537603e-05, - "loss": 1.2758, + "epoch": 0.05337519623233909, + "grad_norm": 2.8736994783195966, + "learning_rate": 1.9971443786485e-05, + "loss": 1.1794, "step": 510 }, { - "epoch": 0.34, - "learning_rate": 1.9018263826732467e-05, - "loss": 1.3259, + "epoch": 0.05347985347985348, + "grad_norm": 2.67925787298318, + "learning_rate": 1.9971187226043746e-05, + "loss": 0.92, "step": 511 }, { - "epoch": 0.34, - "learning_rate": 1.9013650832581424e-05, - "loss": 1.3638, + "epoch": 0.05358451072736787, + "grad_norm": 2.7647374456372544, + "learning_rate": 1.997092951989287e-05, + "loss": 1.1168, "step": 512 }, { - "epoch": 0.34, - "learning_rate": 1.9009027588168773e-05, - "loss": 1.357, + "epoch": 0.05368916797488226, + "grad_norm": 3.207257190593513, + "learning_rate": 1.9970670668061977e-05, + "loss": 1.182, "step": 513 }, { - "epoch": 0.34, - "learning_rate": 1.900439409875203e-05, - "loss": 1.3922, + "epoch": 0.05379382522239665, + "grad_norm": 2.9763787397307384, + "learning_rate": 1.997041067058081e-05, + "loss": 1.0417, "step": 514 }, { - "epoch": 0.34, - "learning_rate": 1.899975036960037e-05, - "loss": 1.2587, + "epoch": 0.05389848246991104, + "grad_norm": 3.3432997678318634, + "learning_rate": 1.997014952747925e-05, + "loss": 1.051, "step": 515 }, { - "epoch": 0.34, - "learning_rate": 1.8995096405994614e-05, - "loss": 1.4294, + "epoch": 0.05400313971742543, + "grad_norm": 2.789610544658323, + "learning_rate": 1.996988723878729e-05, + "loss": 1.1173, "step": 516 }, { - "epoch": 0.34, - "learning_rate": 1.899043221322721e-05, - "loss": 1.244, + "epoch": 0.05410779696493982, + "grad_norm": 3.7956635314776, + "learning_rate": 1.9969623804535084e-05, + "loss": 1.0319, "step": 517 }, { - "epoch": 0.34, - "learning_rate": 1.898575779660225e-05, - "loss": 1.1213, + "epoch": 0.054212454212454214, + "grad_norm": 3.0432435518135597, + "learning_rate": 1.9969359224752884e-05, + "loss": 1.0832, "step": 518 }, { - "epoch": 0.34, - "learning_rate": 1.8981073161435447e-05, - "loss": 1.4039, + "epoch": 0.054317111459968606, + "grad_norm": 3.060370744438756, + "learning_rate": 1.9969093499471106e-05, + "loss": 1.0148, "step": 519 }, { - "epoch": 0.34, - "learning_rate": 1.897637831305414e-05, - "loss": 1.3606, + "epoch": 0.05442176870748299, + "grad_norm": 3.0599503263592265, + "learning_rate": 1.996882662872027e-05, + "loss": 0.9405, "step": 520 }, { - "epoch": 0.34, - "learning_rate": 1.8971673256797275e-05, - "loss": 1.4622, + "epoch": 0.05452642595499738, + "grad_norm": 2.8659069744524346, + "learning_rate": 1.996855861253105e-05, + "loss": 1.0298, "step": 521 }, { - "epoch": 0.34, - "learning_rate": 1.8966957998015403e-05, - "loss": 1.3161, + "epoch": 0.054631083202511775, + "grad_norm": 3.698825550046783, + "learning_rate": 1.9968289450934235e-05, + "loss": 1.163, "step": 522 }, { - "epoch": 0.34, - "learning_rate": 1.896223254207069e-05, - "loss": 1.3393, + "epoch": 0.05473574045002617, + "grad_norm": 3.0714368145204616, + "learning_rate": 1.9968019143960755e-05, + "loss": 1.0566, "step": 523 }, { - "epoch": 0.34, - "learning_rate": 1.89574968943369e-05, - "loss": 1.3398, + "epoch": 0.05484039769754055, + "grad_norm": 2.9908123101769912, + "learning_rate": 1.9967747691641667e-05, + "loss": 1.1947, "step": 524 }, { - "epoch": 0.35, - "learning_rate": 1.895275106019937e-05, - "loss": 1.462, + "epoch": 0.054945054945054944, + "grad_norm": 3.455634138607162, + "learning_rate": 1.996747509400816e-05, + "loss": 1.1672, "step": 525 }, { - "epoch": 0.35, - "learning_rate": 1.8947995045055023e-05, - "loss": 1.3711, + "epoch": 0.055049712192569336, + "grad_norm": 2.910243752165646, + "learning_rate": 1.9967201351091562e-05, + "loss": 1.1195, "step": 526 }, { - "epoch": 0.35, - "learning_rate": 1.894322885431238e-05, - "loss": 1.4027, + "epoch": 0.05515436944008373, + "grad_norm": 2.779316157067133, + "learning_rate": 1.9966926462923324e-05, + "loss": 0.8195, "step": 527 }, { - "epoch": 0.35, - "learning_rate": 1.8938452493391515e-05, - "loss": 1.4405, + "epoch": 0.05525902668759811, + "grad_norm": 3.1584634955840154, + "learning_rate": 1.996665042953503e-05, + "loss": 1.1175, "step": 528 }, { - "epoch": 0.35, - "learning_rate": 1.8933665967724072e-05, - "loss": 1.3696, + "epoch": 0.055363683935112505, + "grad_norm": 2.6746711631574938, + "learning_rate": 1.9966373250958395e-05, + "loss": 0.9986, "step": 529 }, { - "epoch": 0.35, - "learning_rate": 1.892886928275325e-05, - "loss": 1.3695, + "epoch": 0.0554683411826269, + "grad_norm": 2.956338657336674, + "learning_rate": 1.9966094927225272e-05, + "loss": 1.0119, "step": 530 }, { - "epoch": 0.35, - "learning_rate": 1.892406244393382e-05, - "loss": 1.2519, + "epoch": 0.05557299843014129, + "grad_norm": 3.287242958068802, + "learning_rate": 1.9965815458367633e-05, + "loss": 1.0788, "step": 531 }, { - "epoch": 0.35, - "learning_rate": 1.891924545673207e-05, - "loss": 1.438, + "epoch": 0.05567765567765568, + "grad_norm": 2.8717406338940283, + "learning_rate": 1.99655348444176e-05, + "loss": 1.1331, "step": 532 }, { - "epoch": 0.35, - "learning_rate": 1.891441832662586e-05, - "loss": 1.2753, + "epoch": 0.055782312925170066, + "grad_norm": 3.1316748048585477, + "learning_rate": 1.996525308540741e-05, + "loss": 1.0685, "step": 533 }, { - "epoch": 0.35, - "learning_rate": 1.8909581059104557e-05, - "loss": 1.3365, + "epoch": 0.05588697017268446, + "grad_norm": 3.1662562257885165, + "learning_rate": 1.996497018136944e-05, + "loss": 1.1702, "step": 534 }, { - "epoch": 0.35, - "learning_rate": 1.8904733659669072e-05, - "loss": 1.3209, + "epoch": 0.05599162742019885, + "grad_norm": 2.9159713047319022, + "learning_rate": 1.9964686132336193e-05, + "loss": 1.1428, "step": 535 }, { - "epoch": 0.35, - "learning_rate": 1.8899876133831835e-05, - "loss": 1.1781, + "epoch": 0.05609628466771324, + "grad_norm": 2.9197107556733863, + "learning_rate": 1.9964400938340312e-05, + "loss": 1.2099, "step": 536 }, { - "epoch": 0.35, - "learning_rate": 1.8895008487116793e-05, - "loss": 1.3314, + "epoch": 0.05620094191522763, + "grad_norm": 3.103396125023537, + "learning_rate": 1.996411459941456e-05, + "loss": 1.0532, "step": 537 }, { - "epoch": 0.35, - "learning_rate": 1.8890130725059396e-05, - "loss": 1.2742, + "epoch": 0.05630559916274202, + "grad_norm": 2.9755305851979985, + "learning_rate": 1.9963827115591843e-05, + "loss": 1.1459, "step": 538 }, { - "epoch": 0.35, - "learning_rate": 1.8885242853206606e-05, - "loss": 1.3207, + "epoch": 0.05641025641025641, + "grad_norm": 3.1958262641616835, + "learning_rate": 1.996353848690519e-05, + "loss": 1.1823, "step": 539 }, { - "epoch": 0.36, - "learning_rate": 1.888034487711687e-05, - "loss": 1.2713, + "epoch": 0.0565149136577708, + "grad_norm": 3.044198764425719, + "learning_rate": 1.9963248713387767e-05, + "loss": 0.9377, "step": 540 }, { - "epoch": 0.36, - "learning_rate": 1.8875436802360142e-05, - "loss": 1.4202, + "epoch": 0.05661957090528519, + "grad_norm": 2.7930578822333825, + "learning_rate": 1.9962957795072874e-05, + "loss": 0.9952, "step": 541 }, { - "epoch": 0.36, - "learning_rate": 1.8870518634517843e-05, - "loss": 1.3311, + "epoch": 0.05672422815279958, + "grad_norm": 2.9967717314841598, + "learning_rate": 1.9962665731993925e-05, + "loss": 1.1515, "step": 542 }, { - "epoch": 0.36, - "learning_rate": 1.8865590379182885e-05, - "loss": 1.3785, + "epoch": 0.05682888540031397, + "grad_norm": 3.0834478594381025, + "learning_rate": 1.9962372524184493e-05, + "loss": 1.1046, "step": 543 }, { - "epoch": 0.36, - "learning_rate": 1.8860652041959643e-05, - "loss": 1.3968, + "epoch": 0.056933542647828364, + "grad_norm": 3.0669052108361456, + "learning_rate": 1.996207817167826e-05, + "loss": 1.1256, "step": 544 }, { - "epoch": 0.36, - "learning_rate": 1.8855703628463957e-05, - "loss": 1.3965, + "epoch": 0.05703819989534275, + "grad_norm": 2.9144654203085496, + "learning_rate": 1.996178267450905e-05, + "loss": 1.0783, "step": 545 }, { - "epoch": 0.36, - "learning_rate": 1.8850745144323137e-05, - "loss": 1.2262, + "epoch": 0.05714285714285714, + "grad_norm": 2.9822664939930537, + "learning_rate": 1.9961486032710813e-05, + "loss": 1.1593, "step": 546 }, { - "epoch": 0.36, - "learning_rate": 1.884577659517593e-05, - "loss": 1.358, + "epoch": 0.05724751439037153, + "grad_norm": 3.1486295364529227, + "learning_rate": 1.9961188246317644e-05, + "loss": 1.0789, "step": 547 }, { - "epoch": 0.36, - "learning_rate": 1.884079798667254e-05, - "loss": 1.286, + "epoch": 0.057352171637885925, + "grad_norm": 2.7926732727072054, + "learning_rate": 1.9960889315363747e-05, + "loss": 1.1259, "step": 548 }, { - "epoch": 0.36, - "learning_rate": 1.88358093244746e-05, - "loss": 1.3762, + "epoch": 0.05745682888540032, + "grad_norm": 3.057281352342737, + "learning_rate": 1.9960589239883474e-05, + "loss": 1.1181, "step": 549 }, { - "epoch": 0.36, - "learning_rate": 1.883081061425519e-05, - "loss": 1.3745, + "epoch": 0.0575614861329147, + "grad_norm": 3.254756100588196, + "learning_rate": 1.996028801991131e-05, + "loss": 1.047, "step": 550 }, { - "epoch": 0.36, - "learning_rate": 1.882580186169881e-05, - "loss": 1.2543, + "epoch": 0.057666143380429094, + "grad_norm": 2.770630465752798, + "learning_rate": 1.9959985655481855e-05, + "loss": 1.053, "step": 551 }, { - "epoch": 0.36, - "learning_rate": 1.882078307250137e-05, - "loss": 1.3128, + "epoch": 0.057770800627943486, + "grad_norm": 2.816610825508362, + "learning_rate": 1.9959682146629862e-05, + "loss": 1.1418, "step": 552 }, { - "epoch": 0.36, - "learning_rate": 1.8815754252370205e-05, - "loss": 1.4083, + "epoch": 0.05787545787545788, + "grad_norm": 2.0688323215063646, + "learning_rate": 1.9959377493390198e-05, + "loss": 0.8396, "step": 553 }, { - "epoch": 0.36, - "learning_rate": 1.881071540702406e-05, - "loss": 1.4121, + "epoch": 0.05798011512297226, + "grad_norm": 2.1085687907562662, + "learning_rate": 1.995907169579787e-05, + "loss": 0.793, "step": 554 }, { - "epoch": 0.37, - "learning_rate": 1.8805666542193073e-05, - "loss": 1.2899, + "epoch": 0.058084772370486655, + "grad_norm": 3.06254545731126, + "learning_rate": 1.9958764753888015e-05, + "loss": 1.1242, "step": 555 }, { - "epoch": 0.37, - "learning_rate": 1.8800607663618774e-05, - "loss": 1.4122, + "epoch": 0.05818942961800105, + "grad_norm": 2.75776805013571, + "learning_rate": 1.99584566676959e-05, + "loss": 0.9819, "step": 556 }, { - "epoch": 0.37, - "learning_rate": 1.879553877705409e-05, - "loss": 1.2897, + "epoch": 0.05829408686551544, + "grad_norm": 3.275437113764389, + "learning_rate": 1.995814743725693e-05, + "loss": 1.1104, "step": 557 }, { - "epoch": 0.37, - "learning_rate": 1.8790459888263323e-05, - "loss": 1.3716, + "epoch": 0.058398744113029824, + "grad_norm": 3.203542046928147, + "learning_rate": 1.995783706260663e-05, + "loss": 1.0711, "step": 558 }, { - "epoch": 0.37, - "learning_rate": 1.8785371003022145e-05, - "loss": 1.2673, + "epoch": 0.058503401360544216, + "grad_norm": 3.3467386858572934, + "learning_rate": 1.9957525543780663e-05, + "loss": 1.1925, "step": 559 }, { - "epoch": 0.37, - "learning_rate": 1.8780272127117606e-05, - "loss": 1.2832, + "epoch": 0.05860805860805861, + "grad_norm": 2.855041752035653, + "learning_rate": 1.9957212880814826e-05, + "loss": 0.8485, "step": 560 }, { - "epoch": 0.37, - "learning_rate": 1.8775163266348116e-05, - "loss": 1.3121, + "epoch": 0.058712715855573, + "grad_norm": 3.4035834251768855, + "learning_rate": 1.9956899073745046e-05, + "loss": 0.9867, "step": 561 }, { - "epoch": 0.37, - "learning_rate": 1.877004442652343e-05, - "loss": 1.2356, + "epoch": 0.05881737310308739, + "grad_norm": 2.891412935735807, + "learning_rate": 1.9956584122607373e-05, + "loss": 1.1266, "step": 562 }, { - "epoch": 0.37, - "learning_rate": 1.8764915613464657e-05, - "loss": 1.2424, + "epoch": 0.05892203035060178, + "grad_norm": 2.741554961840439, + "learning_rate": 1.9956268027438006e-05, + "loss": 1.1316, "step": 563 }, { - "epoch": 0.37, - "learning_rate": 1.8759776833004256e-05, - "loss": 1.4182, + "epoch": 0.05902668759811617, + "grad_norm": 2.629765377645471, + "learning_rate": 1.9955950788273255e-05, + "loss": 0.9629, "step": 564 }, { - "epoch": 0.37, - "learning_rate": 1.8754628090986e-05, - "loss": 1.141, + "epoch": 0.05913134484563056, + "grad_norm": 3.303903521460224, + "learning_rate": 1.9955632405149577e-05, + "loss": 1.1504, "step": 565 }, { - "epoch": 0.37, - "learning_rate": 1.8749469393265016e-05, - "loss": 1.3906, + "epoch": 0.05923600209314495, + "grad_norm": 2.4920423274313244, + "learning_rate": 1.9955312878103554e-05, + "loss": 1.1054, "step": 566 }, { - "epoch": 0.37, - "learning_rate": 1.874430074570773e-05, - "loss": 1.2956, + "epoch": 0.05934065934065934, + "grad_norm": 2.4458587609353613, + "learning_rate": 1.9954992207171898e-05, + "loss": 0.8266, "step": 567 }, { - "epoch": 0.37, - "learning_rate": 1.87391221541919e-05, - "loss": 1.3609, + "epoch": 0.05944531658817373, + "grad_norm": 3.0320812785910274, + "learning_rate": 1.995467039239146e-05, + "loss": 1.0863, "step": 568 }, { - "epoch": 0.37, - "learning_rate": 1.873393362460658e-05, - "loss": 1.3083, + "epoch": 0.05954997383568812, + "grad_norm": 3.1180464676532025, + "learning_rate": 1.995434743379921e-05, + "loss": 1.0776, "step": 569 }, { - "epoch": 0.38, - "learning_rate": 1.8728735162852125e-05, - "loss": 1.4665, + "epoch": 0.059654631083202514, + "grad_norm": 2.0897952402688795, + "learning_rate": 1.995402333143226e-05, + "loss": 0.8537, "step": 570 }, { - "epoch": 0.38, - "learning_rate": 1.87235267748402e-05, - "loss": 1.3554, + "epoch": 0.0597592883307169, + "grad_norm": 3.3450036899077102, + "learning_rate": 1.9953698085327856e-05, + "loss": 1.0668, "step": 571 }, { - "epoch": 0.38, - "learning_rate": 1.8718308466493745e-05, - "loss": 1.3665, + "epoch": 0.05986394557823129, + "grad_norm": 3.719131888220057, + "learning_rate": 1.995337169552336e-05, + "loss": 1.1326, "step": 572 }, { - "epoch": 0.38, - "learning_rate": 1.871308024374698e-05, - "loss": 1.4273, + "epoch": 0.05996860282574568, + "grad_norm": 3.887030150352439, + "learning_rate": 1.9953044162056275e-05, + "loss": 1.0363, "step": 573 }, { - "epoch": 0.38, - "learning_rate": 1.8707842112545406e-05, - "loss": 1.344, + "epoch": 0.060073260073260075, + "grad_norm": 3.301925941503779, + "learning_rate": 1.9952715484964242e-05, + "loss": 1.1164, "step": 574 }, { - "epoch": 0.38, - "learning_rate": 1.870259407884579e-05, - "loss": 1.4037, + "epoch": 0.06017791732077447, + "grad_norm": 3.126271535901025, + "learning_rate": 1.9952385664285024e-05, + "loss": 0.9704, "step": 575 }, { - "epoch": 0.38, - "learning_rate": 1.8697336148616162e-05, - "loss": 1.3336, + "epoch": 0.06028257456828885, + "grad_norm": 3.0038883887884067, + "learning_rate": 1.9952054700056518e-05, + "loss": 1.0455, "step": 576 }, { - "epoch": 0.38, - "learning_rate": 1.86920683278358e-05, - "loss": 1.3447, + "epoch": 0.060387231815803244, + "grad_norm": 3.0353412809857003, + "learning_rate": 1.995172259231675e-05, + "loss": 1.126, "step": 577 }, { - "epoch": 0.38, - "learning_rate": 1.868679062249524e-05, - "loss": 1.3287, + "epoch": 0.060491889063317636, + "grad_norm": 3.041115370144133, + "learning_rate": 1.9951389341103885e-05, + "loss": 1.1008, "step": 578 }, { - "epoch": 0.38, - "learning_rate": 1.8681503038596248e-05, - "loss": 1.2865, + "epoch": 0.06059654631083203, + "grad_norm": 3.577476624319239, + "learning_rate": 1.995105494645621e-05, + "loss": 1.157, "step": 579 }, { - "epoch": 0.38, - "learning_rate": 1.8676205582151822e-05, - "loss": 1.1761, + "epoch": 0.06070120355834641, + "grad_norm": 3.1702638194747808, + "learning_rate": 1.9950719408412145e-05, + "loss": 1.0744, "step": 580 }, { - "epoch": 0.38, - "learning_rate": 1.8670898259186202e-05, - "loss": 1.2776, + "epoch": 0.060805860805860805, + "grad_norm": 2.6999043263976747, + "learning_rate": 1.9950382727010254e-05, + "loss": 1.0654, "step": 581 }, { - "epoch": 0.38, - "learning_rate": 1.8665581075734836e-05, - "loss": 1.2189, + "epoch": 0.0609105180533752, + "grad_norm": 2.5781649995482216, + "learning_rate": 1.9950044902289214e-05, + "loss": 0.7968, "step": 582 }, { - "epoch": 0.38, - "learning_rate": 1.866025403784439e-05, - "loss": 1.3302, + "epoch": 0.06101517530088959, + "grad_norm": 2.8874420445573032, + "learning_rate": 1.994970593428784e-05, + "loss": 1.1324, "step": 583 }, { - "epoch": 0.38, - "learning_rate": 1.865491715157273e-05, - "loss": 1.3992, + "epoch": 0.061119832548403974, + "grad_norm": 3.5373274248424806, + "learning_rate": 1.994936582304509e-05, + "loss": 1.0826, "step": 584 }, { - "epoch": 0.39, - "learning_rate": 1.864957042298893e-05, - "loss": 1.3607, + "epoch": 0.061224489795918366, + "grad_norm": 3.2429438002106203, + "learning_rate": 1.994902456860003e-05, + "loss": 1.0991, "step": 585 }, { - "epoch": 0.39, - "learning_rate": 1.864421385817325e-05, - "loss": 1.3804, + "epoch": 0.06132914704343276, + "grad_norm": 2.83907357900198, + "learning_rate": 1.9948682170991884e-05, + "loss": 1.0165, "step": 586 }, { - "epoch": 0.39, - "learning_rate": 1.8638847463217145e-05, - "loss": 1.2228, + "epoch": 0.06143380429094715, + "grad_norm": 2.756315965811872, + "learning_rate": 1.9948338630259988e-05, + "loss": 1.1076, "step": 587 }, { - "epoch": 0.39, - "learning_rate": 1.8633471244223232e-05, - "loss": 1.2283, + "epoch": 0.06153846153846154, + "grad_norm": 3.0042678927369333, + "learning_rate": 1.9947993946443814e-05, + "loss": 1.1311, "step": 588 }, { - "epoch": 0.39, - "learning_rate": 1.8628085207305325e-05, - "loss": 1.3609, + "epoch": 0.06164311878597593, + "grad_norm": 2.6664016639707975, + "learning_rate": 1.9947648119582973e-05, + "loss": 1.0378, "step": 589 }, { - "epoch": 0.39, - "learning_rate": 1.8622689358588373e-05, - "loss": 1.3056, + "epoch": 0.06174777603349032, + "grad_norm": 3.1499853449871784, + "learning_rate": 1.9947301149717194e-05, + "loss": 1.0647, "step": 590 }, { - "epoch": 0.39, - "learning_rate": 1.8617283704208507e-05, - "loss": 1.3889, + "epoch": 0.06185243328100471, + "grad_norm": 3.2629382002059293, + "learning_rate": 1.9946953036886346e-05, + "loss": 1.0228, "step": 591 }, { - "epoch": 0.39, - "learning_rate": 1.8611868250312994e-05, - "loss": 1.389, + "epoch": 0.0619570905285191, + "grad_norm": 3.129418827715554, + "learning_rate": 1.994660378113043e-05, + "loss": 1.1271, "step": 592 }, { - "epoch": 0.39, - "learning_rate": 1.860644300306026e-05, - "loss": 1.3262, + "epoch": 0.06206174777603349, + "grad_norm": 2.9554401950010414, + "learning_rate": 1.994625338248958e-05, + "loss": 1.1953, "step": 593 }, { - "epoch": 0.39, - "learning_rate": 1.8601007968619847e-05, - "loss": 1.3581, + "epoch": 0.06216640502354788, + "grad_norm": 3.2598690152125314, + "learning_rate": 1.994590184100405e-05, + "loss": 1.1584, "step": 594 }, { - "epoch": 0.39, - "learning_rate": 1.8595563153172448e-05, - "loss": 1.3574, + "epoch": 0.06227106227106227, + "grad_norm": 2.437511527082557, + "learning_rate": 1.9945549156714236e-05, + "loss": 0.8374, "step": 595 }, { - "epoch": 0.39, - "learning_rate": 1.8590108562909862e-05, - "loss": 1.1755, + "epoch": 0.062375719518576664, + "grad_norm": 2.6096105594208985, + "learning_rate": 1.994519532966066e-05, + "loss": 1.1617, "step": 596 }, { - "epoch": 0.39, - "learning_rate": 1.858464420403502e-05, - "loss": 1.4054, + "epoch": 0.06248037676609105, + "grad_norm": 3.421662115903477, + "learning_rate": 1.9944840359883983e-05, + "loss": 1.1564, "step": 597 }, { - "epoch": 0.39, - "learning_rate": 1.857917008276195e-05, - "loss": 1.3015, + "epoch": 0.06258503401360545, + "grad_norm": 1.9096028175870379, + "learning_rate": 1.9944484247424986e-05, + "loss": 0.7336, "step": 598 }, { - "epoch": 0.39, - "learning_rate": 1.857368620531578e-05, - "loss": 1.3561, + "epoch": 0.06268969126111983, + "grad_norm": 2.8077084469651816, + "learning_rate": 1.9944126992324592e-05, + "loss": 1.1911, "step": 599 }, { - "epoch": 0.39, - "learning_rate": 1.8568192577932746e-05, - "loss": 1.4332, + "epoch": 0.06279434850863422, + "grad_norm": 2.8556439666255695, + "learning_rate": 1.9943768594623844e-05, + "loss": 1.0634, "step": 600 }, { - "epoch": 0.4, - "learning_rate": 1.8562689206860155e-05, - "loss": 1.3722, + "epoch": 0.06289900575614861, + "grad_norm": 3.117743097234504, + "learning_rate": 1.994340905436393e-05, + "loss": 1.1191, "step": 601 }, { - "epoch": 0.4, - "learning_rate": 1.8557176098356406e-05, - "loss": 1.2927, + "epoch": 0.063003663003663, + "grad_norm": 3.1601015532368866, + "learning_rate": 1.9943048371586158e-05, + "loss": 0.9725, "step": 602 }, { - "epoch": 0.4, - "learning_rate": 1.8551653258690972e-05, - "loss": 1.4575, + "epoch": 0.0631083202511774, + "grad_norm": 2.208608976213274, + "learning_rate": 1.994268654633197e-05, + "loss": 0.7586, "step": 603 }, { - "epoch": 0.4, - "learning_rate": 1.854612069414438e-05, - "loss": 1.4379, + "epoch": 0.06321297749869179, + "grad_norm": 2.1158633727476026, + "learning_rate": 1.994232357864294e-05, + "loss": 0.7692, "step": 604 }, { - "epoch": 0.4, - "learning_rate": 1.8540578411008233e-05, - "loss": 1.3694, + "epoch": 0.06331763474620618, + "grad_norm": 2.9413326628625205, + "learning_rate": 1.9941959468560782e-05, + "loss": 1.1525, "step": 605 }, { - "epoch": 0.4, - "learning_rate": 1.8535026415585168e-05, - "loss": 1.2722, + "epoch": 0.06342229199372057, + "grad_norm": 2.8204049381108742, + "learning_rate": 1.9941594216127323e-05, + "loss": 0.872, "step": 606 }, { - "epoch": 0.4, - "learning_rate": 1.8529464714188888e-05, - "loss": 1.192, + "epoch": 0.06352694924123496, + "grad_norm": 3.190529808813044, + "learning_rate": 1.994122782138453e-05, + "loss": 1.1846, "step": 607 }, { - "epoch": 0.4, - "learning_rate": 1.852389331314411e-05, - "loss": 1.2234, + "epoch": 0.06363160648874934, + "grad_norm": 3.08211718908032, + "learning_rate": 1.9940860284374515e-05, + "loss": 0.9465, "step": 608 }, { - "epoch": 0.4, - "learning_rate": 1.85183122187866e-05, - "loss": 1.3328, + "epoch": 0.06373626373626373, + "grad_norm": 3.0616588635006123, + "learning_rate": 1.99404916051395e-05, + "loss": 0.9674, "step": 609 }, { - "epoch": 0.4, - "learning_rate": 1.8512721437463136e-05, - "loss": 1.2733, + "epoch": 0.06384092098377812, + "grad_norm": 2.9537487271769116, + "learning_rate": 1.9940121783721846e-05, + "loss": 1.1373, "step": 610 }, { - "epoch": 0.4, - "learning_rate": 1.8507120975531517e-05, - "loss": 1.3759, + "epoch": 0.06394557823129252, + "grad_norm": 2.9668339388811407, + "learning_rate": 1.9939750820164044e-05, + "loss": 1.0418, "step": 611 }, { - "epoch": 0.4, - "learning_rate": 1.850151083936055e-05, - "loss": 1.2952, + "epoch": 0.06405023547880691, + "grad_norm": 2.8848768351239658, + "learning_rate": 1.9939378714508728e-05, + "loss": 0.9834, "step": 612 }, { - "epoch": 0.4, - "learning_rate": 1.849589103533004e-05, - "loss": 1.3329, + "epoch": 0.0641548927263213, + "grad_norm": 2.759886281009913, + "learning_rate": 1.9939005466798648e-05, + "loss": 1.0321, "step": 613 }, { - "epoch": 0.4, - "learning_rate": 1.84902615698308e-05, - "loss": 1.4101, + "epoch": 0.06425954997383569, + "grad_norm": 3.0129757609858245, + "learning_rate": 1.9938631077076692e-05, + "loss": 1.1952, "step": 614 }, { - "epoch": 0.4, - "learning_rate": 1.8484622449264608e-05, - "loss": 1.3062, + "epoch": 0.06436420722135008, + "grad_norm": 2.573835926627507, + "learning_rate": 1.9938255545385875e-05, + "loss": 0.9685, "step": 615 }, { - "epoch": 0.41, - "learning_rate": 1.8478973680044237e-05, - "loss": 1.3987, + "epoch": 0.06446886446886448, + "grad_norm": 3.453838047274854, + "learning_rate": 1.993787887176935e-05, + "loss": 1.129, "step": 616 }, { - "epoch": 0.41, - "learning_rate": 1.8473315268593432e-05, - "loss": 1.3151, + "epoch": 0.06457352171637885, + "grad_norm": 3.1074759117744137, + "learning_rate": 1.9937501056270397e-05, + "loss": 1.0101, "step": 617 }, { - "epoch": 0.41, - "learning_rate": 1.846764722134689e-05, - "loss": 1.3956, + "epoch": 0.06467817896389325, + "grad_norm": 2.755027947394353, + "learning_rate": 1.9937122098932428e-05, + "loss": 0.9722, "step": 618 }, { - "epoch": 0.41, - "learning_rate": 1.8461969544750286e-05, - "loss": 1.392, + "epoch": 0.06478283621140764, + "grad_norm": 2.9013696059112224, + "learning_rate": 1.9936741999798985e-05, + "loss": 1.1728, "step": 619 }, { - "epoch": 0.41, - "learning_rate": 1.845628224526023e-05, - "loss": 1.5057, + "epoch": 0.06488749345892203, + "grad_norm": 3.1607523426322586, + "learning_rate": 1.993636075891374e-05, + "loss": 0.895, "step": 620 }, { - "epoch": 0.41, - "learning_rate": 1.8450585329344277e-05, - "loss": 1.2634, + "epoch": 0.06499215070643642, + "grad_norm": 3.1382276779001597, + "learning_rate": 1.9935978376320503e-05, + "loss": 1.1031, "step": 621 }, { - "epoch": 0.41, - "learning_rate": 1.8444878803480925e-05, - "loss": 1.3566, + "epoch": 0.06509680795395081, + "grad_norm": 2.52595770004853, + "learning_rate": 1.9935594852063208e-05, + "loss": 0.938, "step": 622 }, { - "epoch": 0.41, - "learning_rate": 1.8439162674159595e-05, - "loss": 1.3247, + "epoch": 0.0652014652014652, + "grad_norm": 2.650381943162927, + "learning_rate": 1.993521018618592e-05, + "loss": 1.0193, "step": 623 }, { - "epoch": 0.41, - "learning_rate": 1.8433436947880626e-05, - "loss": 1.3977, + "epoch": 0.0653061224489796, + "grad_norm": 2.651852123933312, + "learning_rate": 1.9934824378732846e-05, + "loss": 1.0853, "step": 624 }, { - "epoch": 0.41, - "learning_rate": 1.842770163115528e-05, - "loss": 1.3899, + "epoch": 0.06541077969649398, + "grad_norm": 3.306163119634711, + "learning_rate": 1.9934437429748306e-05, + "loss": 1.154, "step": 625 }, { - "epoch": 0.41, - "learning_rate": 1.842195673050572e-05, - "loss": 1.308, + "epoch": 0.06551543694400837, + "grad_norm": 2.784487003175816, + "learning_rate": 1.993404933927677e-05, + "loss": 1.0449, "step": 626 }, { - "epoch": 0.41, - "learning_rate": 1.8416202252465e-05, - "loss": 1.4328, + "epoch": 0.06562009419152276, + "grad_norm": 2.759785634839251, + "learning_rate": 1.9933660107362825e-05, + "loss": 1.0753, "step": 627 }, { - "epoch": 0.41, - "learning_rate": 1.841043820357708e-05, - "loss": 1.3289, + "epoch": 0.06572475143903715, + "grad_norm": 2.731464057889386, + "learning_rate": 1.9933269734051196e-05, + "loss": 0.9269, "step": 628 }, { - "epoch": 0.41, - "learning_rate": 1.8404664590396796e-05, - "loss": 1.3466, + "epoch": 0.06582940868655154, + "grad_norm": 2.837489067099457, + "learning_rate": 1.9932878219386738e-05, + "loss": 1.0685, "step": 629 }, { - "epoch": 0.41, - "learning_rate": 1.8398881419489854e-05, - "loss": 1.3224, + "epoch": 0.06593406593406594, + "grad_norm": 3.017220693367057, + "learning_rate": 1.9932485563414436e-05, + "loss": 1.1035, "step": 630 }, { - "epoch": 0.42, - "learning_rate": 1.8393088697432847e-05, - "loss": 1.3337, + "epoch": 0.06603872318158033, + "grad_norm": 3.3205197744653194, + "learning_rate": 1.9932091766179408e-05, + "loss": 1.1205, "step": 631 }, { - "epoch": 0.42, - "learning_rate": 1.838728643081321e-05, - "loss": 1.3094, + "epoch": 0.06614338042909472, + "grad_norm": 2.891684092310922, + "learning_rate": 1.99316968277269e-05, + "loss": 1.1477, "step": 632 }, { - "epoch": 0.42, - "learning_rate": 1.8381474626229245e-05, - "loss": 1.3409, + "epoch": 0.06624803767660911, + "grad_norm": 2.9884409256785447, + "learning_rate": 1.9931300748102294e-05, + "loss": 1.1761, "step": 633 }, { - "epoch": 0.42, - "learning_rate": 1.8375653290290093e-05, - "loss": 1.4036, + "epoch": 0.06635269492412349, + "grad_norm": 2.536315479596216, + "learning_rate": 1.99309035273511e-05, + "loss": 1.0411, "step": 634 }, { - "epoch": 0.42, - "learning_rate": 1.836982242961574e-05, - "loss": 1.266, + "epoch": 0.06645735217163788, + "grad_norm": 2.9476335588680365, + "learning_rate": 1.9930505165518958e-05, + "loss": 1.1562, "step": 635 }, { - "epoch": 0.42, - "learning_rate": 1.8363982050836996e-05, - "loss": 1.2164, + "epoch": 0.06656200941915227, + "grad_norm": 3.0500527548211704, + "learning_rate": 1.993010566265164e-05, + "loss": 1.1814, "step": 636 }, { - "epoch": 0.42, - "learning_rate": 1.835813216059551e-05, - "loss": 1.3615, + "epoch": 0.06666666666666667, + "grad_norm": 2.6707063897105905, + "learning_rate": 1.9929705018795055e-05, + "loss": 1.0591, "step": 637 }, { - "epoch": 0.42, - "learning_rate": 1.8352272765543723e-05, - "loss": 1.3961, + "epoch": 0.06677132391418106, + "grad_norm": 2.618480485695442, + "learning_rate": 1.9929303233995227e-05, + "loss": 1.0465, "step": 638 }, { - "epoch": 0.42, - "learning_rate": 1.8346403872344907e-05, - "loss": 1.3247, + "epoch": 0.06687598116169545, + "grad_norm": 2.66882534809794, + "learning_rate": 1.9928900308298334e-05, + "loss": 1.1071, "step": 639 }, { - "epoch": 0.42, - "learning_rate": 1.8340525487673128e-05, - "loss": 1.4376, + "epoch": 0.06698063840920984, + "grad_norm": 2.9292541219026496, + "learning_rate": 1.9928496241750667e-05, + "loss": 1.1123, "step": 640 }, { - "epoch": 0.42, - "learning_rate": 1.8334637618213238e-05, - "loss": 1.3172, + "epoch": 0.06708529565672423, + "grad_norm": 2.7357462801221546, + "learning_rate": 1.992809103439865e-05, + "loss": 1.132, "step": 641 }, { - "epoch": 0.42, - "learning_rate": 1.8328740270660887e-05, - "loss": 1.3366, + "epoch": 0.06718995290423861, + "grad_norm": 3.003236810512024, + "learning_rate": 1.9927684686288854e-05, + "loss": 1.1636, "step": 642 }, { - "epoch": 0.42, - "learning_rate": 1.83228334517225e-05, - "loss": 1.2715, + "epoch": 0.067294610151753, + "grad_norm": 3.313716591599714, + "learning_rate": 1.992727719746796e-05, + "loss": 1.0848, "step": 643 }, { - "epoch": 0.42, - "learning_rate": 1.831691716811526e-05, - "loss": 1.387, + "epoch": 0.0673992673992674, + "grad_norm": 3.0324704855965163, + "learning_rate": 1.9926868567982788e-05, + "loss": 1.177, "step": 644 }, { - "epoch": 0.42, - "learning_rate": 1.8310991426567137e-05, - "loss": 1.2501, + "epoch": 0.06750392464678179, + "grad_norm": 3.172131349022228, + "learning_rate": 1.9926458797880294e-05, + "loss": 0.8632, "step": 645 }, { - "epoch": 0.43, - "learning_rate": 1.8305056233816833e-05, - "loss": 1.1845, + "epoch": 0.06760858189429618, + "grad_norm": 3.0648739329108077, + "learning_rate": 1.9926047887207565e-05, + "loss": 1.0914, "step": 646 }, { - "epoch": 0.43, - "learning_rate": 1.8299111596613815e-05, - "loss": 1.4116, + "epoch": 0.06771323914181057, + "grad_norm": 2.4156718664411696, + "learning_rate": 1.992563583601181e-05, + "loss": 0.947, "step": 647 }, { - "epoch": 0.43, - "learning_rate": 1.8293157521718284e-05, - "loss": 1.2341, + "epoch": 0.06781789638932496, + "grad_norm": 2.7382356031716646, + "learning_rate": 1.9925222644340372e-05, + "loss": 1.0838, "step": 648 }, { - "epoch": 0.43, - "learning_rate": 1.8287194015901167e-05, - "loss": 1.2673, + "epoch": 0.06792255363683936, + "grad_norm": 2.9469770945822047, + "learning_rate": 1.9924808312240737e-05, + "loss": 1.1064, "step": 649 }, { - "epoch": 0.43, - "learning_rate": 1.8281221085944128e-05, - "loss": 1.3124, + "epoch": 0.06802721088435375, + "grad_norm": 3.1261406037905513, + "learning_rate": 1.9924392839760505e-05, + "loss": 1.1091, "step": 650 }, { - "epoch": 0.43, - "learning_rate": 1.827523873863954e-05, - "loss": 1.3819, + "epoch": 0.06813186813186813, + "grad_norm": 3.1877565587473575, + "learning_rate": 1.9923976226947417e-05, + "loss": 1.1858, "step": 651 }, { - "epoch": 0.43, - "learning_rate": 1.8269246980790482e-05, - "loss": 1.2699, + "epoch": 0.06823652537938252, + "grad_norm": 2.8216280798542885, + "learning_rate": 1.992355847384934e-05, + "loss": 1.0689, "step": 652 }, { - "epoch": 0.43, - "learning_rate": 1.8263245819210745e-05, - "loss": 1.4479, + "epoch": 0.06834118262689691, + "grad_norm": 2.9795946286363875, + "learning_rate": 1.9923139580514284e-05, + "loss": 1.0729, "step": 653 }, { - "epoch": 0.43, - "learning_rate": 1.825723526072481e-05, - "loss": 1.2992, + "epoch": 0.0684458398744113, + "grad_norm": 2.5206528985759324, + "learning_rate": 1.9922719546990367e-05, + "loss": 0.8577, "step": 654 }, { - "epoch": 0.43, - "learning_rate": 1.8251215312167833e-05, - "loss": 1.2057, + "epoch": 0.0685504971219257, + "grad_norm": 2.9454318451318455, + "learning_rate": 1.992229837332586e-05, + "loss": 0.8568, "step": 655 }, { - "epoch": 0.43, - "learning_rate": 1.8245185980385673e-05, - "loss": 1.2626, + "epoch": 0.06865515436944009, + "grad_norm": 2.774719223512651, + "learning_rate": 1.9921876059569158e-05, + "loss": 0.9953, "step": 656 }, { - "epoch": 0.43, - "learning_rate": 1.823914727223483e-05, - "loss": 1.2323, + "epoch": 0.06875981161695448, + "grad_norm": 3.19787444851232, + "learning_rate": 1.9921452605768783e-05, + "loss": 1.0818, "step": 657 }, { - "epoch": 0.43, - "learning_rate": 1.823309919458249e-05, - "loss": 1.2677, + "epoch": 0.06886446886446887, + "grad_norm": 3.4049791357976105, + "learning_rate": 1.9921028011973388e-05, + "loss": 1.1536, "step": 658 }, { - "epoch": 0.43, - "learning_rate": 1.8227041754306484e-05, - "loss": 1.3755, + "epoch": 0.06896912611198326, + "grad_norm": 2.557786514794364, + "learning_rate": 1.9920602278231765e-05, + "loss": 0.986, "step": 659 }, { - "epoch": 0.43, - "learning_rate": 1.822097495829529e-05, - "loss": 1.1518, + "epoch": 0.06907378335949764, + "grad_norm": 3.1241150572729808, + "learning_rate": 1.9920175404592827e-05, + "loss": 0.9868, "step": 660 }, { - "epoch": 0.44, - "learning_rate": 1.821489881344803e-05, - "loss": 1.3471, + "epoch": 0.06917844060701203, + "grad_norm": 3.2992236992022788, + "learning_rate": 1.9919747391105626e-05, + "loss": 1.0701, "step": 661 }, { - "epoch": 0.44, - "learning_rate": 1.8208813326674444e-05, - "loss": 1.2778, + "epoch": 0.06928309785452642, + "grad_norm": 3.4812568274144655, + "learning_rate": 1.991931823781934e-05, + "loss": 0.9236, "step": 662 }, { - "epoch": 0.44, - "learning_rate": 1.8202718504894917e-05, - "loss": 1.3122, + "epoch": 0.06938775510204082, + "grad_norm": 2.7811990200114276, + "learning_rate": 1.991888794478328e-05, + "loss": 1.0129, "step": 663 }, { - "epoch": 0.44, - "learning_rate": 1.8196614355040436e-05, - "loss": 1.4677, + "epoch": 0.06949241234955521, + "grad_norm": 3.0685242353855267, + "learning_rate": 1.9918456512046886e-05, + "loss": 1.1438, "step": 664 }, { - "epoch": 0.44, - "learning_rate": 1.819050088405259e-05, - "loss": 1.284, + "epoch": 0.0695970695970696, + "grad_norm": 2.6809973552138824, + "learning_rate": 1.9918023939659735e-05, + "loss": 1.0722, "step": 665 }, { - "epoch": 0.44, - "learning_rate": 1.818437809888359e-05, - "loss": 1.4112, + "epoch": 0.06970172684458399, + "grad_norm": 2.5126108916830545, + "learning_rate": 1.9917590227671523e-05, + "loss": 0.8222, "step": 666 }, { - "epoch": 0.44, - "learning_rate": 1.817824600649621e-05, - "loss": 1.3281, + "epoch": 0.06980638409209838, + "grad_norm": 3.1637408972668135, + "learning_rate": 1.9917155376132092e-05, + "loss": 1.0751, "step": 667 }, { - "epoch": 0.44, - "learning_rate": 1.8172104613863838e-05, - "loss": 1.329, + "epoch": 0.06991104133961276, + "grad_norm": 2.7641342208213695, + "learning_rate": 1.9916719385091404e-05, + "loss": 0.9868, "step": 668 }, { - "epoch": 0.44, - "learning_rate": 1.816595392797041e-05, - "loss": 1.1344, + "epoch": 0.07001569858712715, + "grad_norm": 3.369758095168238, + "learning_rate": 1.9916282254599556e-05, + "loss": 1.073, "step": 669 }, { - "epoch": 0.44, - "learning_rate": 1.8159793955810453e-05, - "loss": 1.4005, + "epoch": 0.07012035583464155, + "grad_norm": 2.9489363996307576, + "learning_rate": 1.9915843984706773e-05, + "loss": 1.0683, "step": 670 }, { - "epoch": 0.44, - "learning_rate": 1.815362470438904e-05, - "loss": 1.3454, + "epoch": 0.07022501308215594, + "grad_norm": 3.103782003060308, + "learning_rate": 1.9915404575463414e-05, + "loss": 1.0495, "step": 671 }, { - "epoch": 0.44, - "learning_rate": 1.8147446180721802e-05, - "loss": 1.3562, + "epoch": 0.07032967032967033, + "grad_norm": 3.442901028729499, + "learning_rate": 1.991496402691997e-05, + "loss": 1.0754, "step": 672 }, { - "epoch": 0.44, - "learning_rate": 1.8141258391834914e-05, - "loss": 1.2067, + "epoch": 0.07043432757718472, + "grad_norm": 2.861095313005916, + "learning_rate": 1.991452233912706e-05, + "loss": 1.0522, "step": 673 }, { - "epoch": 0.44, - "learning_rate": 1.8135061344765088e-05, - "loss": 1.2327, + "epoch": 0.07053898482469911, + "grad_norm": 2.9472040327867144, + "learning_rate": 1.9914079512135434e-05, + "loss": 1.1586, "step": 674 }, { - "epoch": 0.44, - "learning_rate": 1.812885504655956e-05, - "loss": 1.1144, + "epoch": 0.0706436420722135, + "grad_norm": 3.1812368240216657, + "learning_rate": 1.9913635545995972e-05, + "loss": 1.1145, "step": 675 }, { - "epoch": 0.45, - "learning_rate": 1.8122639504276096e-05, - "loss": 1.2406, + "epoch": 0.0707482993197279, + "grad_norm": 2.9277804198588666, + "learning_rate": 1.9913190440759693e-05, + "loss": 1.0576, "step": 676 }, { - "epoch": 0.45, - "learning_rate": 1.811641472498296e-05, - "loss": 1.3028, + "epoch": 0.07085295656724228, + "grad_norm": 3.271562835349517, + "learning_rate": 1.9912744196477732e-05, + "loss": 0.9771, "step": 677 }, { - "epoch": 0.45, - "learning_rate": 1.8110180715758934e-05, - "loss": 1.3957, + "epoch": 0.07095761381475667, + "grad_norm": 3.0718494789605604, + "learning_rate": 1.9912296813201372e-05, + "loss": 1.0435, "step": 678 }, { - "epoch": 0.45, - "learning_rate": 1.8103937483693288e-05, - "loss": 1.3159, + "epoch": 0.07106227106227106, + "grad_norm": 2.1086283397641576, + "learning_rate": 1.991184829098201e-05, + "loss": 0.8067, "step": 679 }, { - "epoch": 0.45, - "learning_rate": 1.809768503588578e-05, - "loss": 1.4038, + "epoch": 0.07116692830978545, + "grad_norm": 2.0279982104266643, + "learning_rate": 1.9911398629871187e-05, + "loss": 0.6897, "step": 680 }, { - "epoch": 0.45, - "learning_rate": 1.8091423379446658e-05, - "loss": 1.3685, + "epoch": 0.07127158555729984, + "grad_norm": 2.7850815226838184, + "learning_rate": 1.991094782992057e-05, + "loss": 1.0727, "step": 681 }, { - "epoch": 0.45, - "learning_rate": 1.8085152521496627e-05, - "loss": 1.3142, + "epoch": 0.07137624280481424, + "grad_norm": 2.7710574348613313, + "learning_rate": 1.9910495891181956e-05, + "loss": 1.1439, "step": 682 }, { - "epoch": 0.45, - "learning_rate": 1.8078872469166864e-05, - "loss": 1.3661, + "epoch": 0.07148090005232863, + "grad_norm": 2.970435833694052, + "learning_rate": 1.9910042813707272e-05, + "loss": 1.1369, "step": 683 }, { - "epoch": 0.45, - "learning_rate": 1.8072583229599004e-05, - "loss": 1.3521, + "epoch": 0.07158555729984302, + "grad_norm": 2.618973989205177, + "learning_rate": 1.990958859754858e-05, + "loss": 1.0571, "step": 684 }, { - "epoch": 0.45, - "learning_rate": 1.8066284809945128e-05, - "loss": 1.4548, + "epoch": 0.0716902145473574, + "grad_norm": 3.5278999615831323, + "learning_rate": 1.9909133242758066e-05, + "loss": 1.177, "step": 685 }, { - "epoch": 0.45, - "learning_rate": 1.8059977217367753e-05, - "loss": 1.2938, + "epoch": 0.07179487179487179, + "grad_norm": 3.266029063508325, + "learning_rate": 1.990867674938806e-05, + "loss": 0.9545, "step": 686 }, { - "epoch": 0.45, - "learning_rate": 1.8053660459039833e-05, - "loss": 1.2422, + "epoch": 0.07189952904238618, + "grad_norm": 3.0659580296418856, + "learning_rate": 1.9908219117491004e-05, + "loss": 1.0684, "step": 687 }, { - "epoch": 0.45, - "learning_rate": 1.8047334542144738e-05, - "loss": 1.3967, + "epoch": 0.07200418628990057, + "grad_norm": 3.164931997476543, + "learning_rate": 1.9907760347119485e-05, + "loss": 1.0845, "step": 688 }, { - "epoch": 0.45, - "learning_rate": 1.8040999473876258e-05, - "loss": 1.428, + "epoch": 0.07210884353741497, + "grad_norm": 2.8716658890938174, + "learning_rate": 1.990730043832622e-05, + "loss": 1.0957, "step": 689 }, { - "epoch": 0.45, - "learning_rate": 1.8034655261438588e-05, - "loss": 1.2242, + "epoch": 0.07221350078492936, + "grad_norm": 2.545281120610984, + "learning_rate": 1.990683939116405e-05, + "loss": 1.0906, "step": 690 }, { - "epoch": 0.45, - "learning_rate": 1.8028301912046325e-05, - "loss": 1.3014, + "epoch": 0.07231815803244375, + "grad_norm": 3.317532471187599, + "learning_rate": 1.9906377205685953e-05, + "loss": 1.1177, "step": 691 }, { - "epoch": 0.46, - "learning_rate": 1.8021939432924456e-05, - "loss": 1.3546, + "epoch": 0.07242281527995814, + "grad_norm": 3.2165985399160575, + "learning_rate": 1.9905913881945027e-05, + "loss": 1.0378, "step": 692 }, { - "epoch": 0.46, - "learning_rate": 1.8015567831308344e-05, - "loss": 1.3246, + "epoch": 0.07252747252747253, + "grad_norm": 2.8761721393997193, + "learning_rate": 1.990544941999452e-05, + "loss": 1.1571, "step": 693 }, { - "epoch": 0.46, - "learning_rate": 1.800918711444373e-05, - "loss": 1.3759, + "epoch": 0.07263212977498691, + "grad_norm": 3.4562167899000076, + "learning_rate": 1.9904983819887788e-05, + "loss": 0.9759, "step": 694 }, { - "epoch": 0.46, - "learning_rate": 1.8002797289586726e-05, - "loss": 1.3185, + "epoch": 0.0727367870225013, + "grad_norm": 4.0163161813819155, + "learning_rate": 1.9904517081678342e-05, + "loss": 1.0135, "step": 695 }, { - "epoch": 0.46, - "learning_rate": 1.7996398364003797e-05, - "loss": 1.3874, + "epoch": 0.0728414442700157, + "grad_norm": 2.999273333333365, + "learning_rate": 1.99040492054198e-05, + "loss": 1.1353, "step": 696 }, { - "epoch": 0.46, - "learning_rate": 1.7989990344971747e-05, - "loss": 1.277, + "epoch": 0.07294610151753009, + "grad_norm": 2.947720739435947, + "learning_rate": 1.990358019116593e-05, + "loss": 1.0866, "step": 697 }, { - "epoch": 0.46, - "learning_rate": 1.7983573239777746e-05, - "loss": 1.3898, + "epoch": 0.07305075876504448, + "grad_norm": 2.559467975109432, + "learning_rate": 1.9903110038970617e-05, + "loss": 1.1007, "step": 698 }, { - "epoch": 0.46, - "learning_rate": 1.797714705571927e-05, - "loss": 1.3666, + "epoch": 0.07315541601255887, + "grad_norm": 2.6168641864922084, + "learning_rate": 1.9902638748887886e-05, + "loss": 1.0792, "step": 699 }, { - "epoch": 0.46, - "learning_rate": 1.7970711800104138e-05, - "loss": 1.3797, + "epoch": 0.07326007326007326, + "grad_norm": 3.3899484327660367, + "learning_rate": 1.9902166320971888e-05, + "loss": 0.9802, "step": 700 }, { - "epoch": 0.46, - "learning_rate": 1.7964267480250476e-05, - "loss": 1.3042, + "epoch": 0.07336473050758766, + "grad_norm": 3.5929670401239706, + "learning_rate": 1.9901692755276905e-05, + "loss": 1.14, "step": 701 }, { - "epoch": 0.46, - "learning_rate": 1.7957814103486716e-05, - "loss": 1.2159, + "epoch": 0.07346938775510205, + "grad_norm": 3.5869697848385083, + "learning_rate": 1.9901218051857354e-05, + "loss": 1.0971, "step": 702 }, { - "epoch": 0.46, - "learning_rate": 1.7951351677151604e-05, - "loss": 1.3311, + "epoch": 0.07357404500261643, + "grad_norm": 2.74561495745866, + "learning_rate": 1.9900742210767773e-05, + "loss": 1.1434, "step": 703 }, { - "epoch": 0.46, - "learning_rate": 1.7944880208594156e-05, - "loss": 1.3292, + "epoch": 0.07367870225013082, + "grad_norm": 3.1605580372961306, + "learning_rate": 1.9900265232062844e-05, + "loss": 1.0261, "step": 704 }, { - "epoch": 0.46, - "learning_rate": 1.7938399705173687e-05, - "loss": 1.443, + "epoch": 0.07378335949764521, + "grad_norm": 3.0884788086378463, + "learning_rate": 1.9899787115797373e-05, + "loss": 1.0976, "step": 705 }, { - "epoch": 0.46, - "learning_rate": 1.7931910174259777e-05, - "loss": 1.148, + "epoch": 0.0738880167451596, + "grad_norm": 2.904397093628698, + "learning_rate": 1.9899307862026288e-05, + "loss": 1.0172, "step": 706 }, { - "epoch": 0.47, - "learning_rate": 1.792541162323228e-05, - "loss": 1.3056, + "epoch": 0.073992673992674, + "grad_norm": 2.550496674073018, + "learning_rate": 1.9898827470804663e-05, + "loss": 1.1443, "step": 707 }, { - "epoch": 0.47, - "learning_rate": 1.7918904059481303e-05, - "loss": 1.1624, + "epoch": 0.07409733124018839, + "grad_norm": 3.0699639273230828, + "learning_rate": 1.9898345942187695e-05, + "loss": 1.1312, "step": 708 }, { - "epoch": 0.47, - "learning_rate": 1.79123874904072e-05, - "loss": 1.2849, + "epoch": 0.07420198848770278, + "grad_norm": 3.151696553595531, + "learning_rate": 1.9897863276230712e-05, + "loss": 0.9484, "step": 709 }, { - "epoch": 0.47, - "learning_rate": 1.7905861923420568e-05, - "loss": 1.1711, + "epoch": 0.07430664573521717, + "grad_norm": 2.936271363452074, + "learning_rate": 1.9897379472989177e-05, + "loss": 1.0499, "step": 710 }, { - "epoch": 0.47, - "learning_rate": 1.7899327365942242e-05, - "loss": 1.3396, + "epoch": 0.07441130298273155, + "grad_norm": 3.427173808865134, + "learning_rate": 1.989689453251867e-05, + "loss": 1.0594, "step": 711 }, { - "epoch": 0.47, - "learning_rate": 1.7892783825403272e-05, - "loss": 1.3184, + "epoch": 0.07451596023024594, + "grad_norm": 2.7577070019256165, + "learning_rate": 1.9896408454874924e-05, + "loss": 1.0995, "step": 712 }, { - "epoch": 0.47, - "learning_rate": 1.788623130924493e-05, - "loss": 1.3957, + "epoch": 0.07462061747776033, + "grad_norm": 2.737358494436629, + "learning_rate": 1.989592124011378e-05, + "loss": 0.9775, "step": 713 }, { - "epoch": 0.47, - "learning_rate": 1.7879669824918685e-05, - "loss": 1.3386, + "epoch": 0.07472527472527472, + "grad_norm": 2.637102118855309, + "learning_rate": 1.9895432888291228e-05, + "loss": 1.1897, "step": 714 }, { - "epoch": 0.47, - "learning_rate": 1.7873099379886217e-05, - "loss": 1.2501, + "epoch": 0.07482993197278912, + "grad_norm": 2.8640305527444525, + "learning_rate": 1.9894943399463373e-05, + "loss": 1.068, "step": 715 }, { - "epoch": 0.47, - "learning_rate": 1.7866519981619396e-05, - "loss": 1.4069, + "epoch": 0.07493458922030351, + "grad_norm": 2.506044517062919, + "learning_rate": 1.9894452773686463e-05, + "loss": 1.1048, "step": 716 }, { - "epoch": 0.47, - "learning_rate": 1.7859931637600256e-05, - "loss": 1.4704, + "epoch": 0.0750392464678179, + "grad_norm": 2.731113694615565, + "learning_rate": 1.9893961011016873e-05, + "loss": 1.1107, "step": 717 }, { - "epoch": 0.47, - "learning_rate": 1.7853334355321025e-05, - "loss": 1.2778, + "epoch": 0.07514390371533229, + "grad_norm": 2.892316241004994, + "learning_rate": 1.9893468111511106e-05, + "loss": 1.0969, "step": 718 }, { - "epoch": 0.47, - "learning_rate": 1.784672814228409e-05, - "loss": 1.3288, + "epoch": 0.07524856096284668, + "grad_norm": 2.55870522176311, + "learning_rate": 1.98929740752258e-05, + "loss": 1.0443, "step": 719 }, { - "epoch": 0.47, - "learning_rate": 1.784011300600198e-05, - "loss": 1.4501, + "epoch": 0.07535321821036106, + "grad_norm": 2.9295733001233044, + "learning_rate": 1.989247890221771e-05, + "loss": 1.0639, "step": 720 }, { - "epoch": 0.47, - "learning_rate": 1.783348895399739e-05, - "loss": 1.3109, + "epoch": 0.07545787545787545, + "grad_norm": 3.0665452355998073, + "learning_rate": 1.9891982592543748e-05, + "loss": 1.1135, "step": 721 }, { - "epoch": 0.48, - "learning_rate": 1.7826855993803148e-05, - "loss": 1.2988, + "epoch": 0.07556253270538985, + "grad_norm": 2.8998275546803223, + "learning_rate": 1.9891485146260926e-05, + "loss": 0.9916, "step": 722 }, { - "epoch": 0.48, - "learning_rate": 1.7820214132962205e-05, - "loss": 1.4882, + "epoch": 0.07566718995290424, + "grad_norm": 2.5710268709337294, + "learning_rate": 1.989098656342641e-05, + "loss": 1.0469, "step": 723 }, { - "epoch": 0.48, - "learning_rate": 1.7813563379027644e-05, - "loss": 1.5883, + "epoch": 0.07577184720041863, + "grad_norm": 3.0608910480478535, + "learning_rate": 1.989048684409749e-05, + "loss": 1.1399, "step": 724 }, { - "epoch": 0.48, - "learning_rate": 1.7806903739562652e-05, - "loss": 1.2206, + "epoch": 0.07587650444793302, + "grad_norm": 2.8003344138495887, + "learning_rate": 1.988998598833158e-05, + "loss": 1.0059, "step": 725 }, { - "epoch": 0.48, - "learning_rate": 1.7800235222140525e-05, - "loss": 1.6747, + "epoch": 0.07598116169544741, + "grad_norm": 2.9559350963453785, + "learning_rate": 1.988948399618623e-05, + "loss": 1.1011, "step": 726 }, { - "epoch": 0.48, - "learning_rate": 1.7793557834344662e-05, - "loss": 1.2666, + "epoch": 0.0760858189429618, + "grad_norm": 2.6079665939299583, + "learning_rate": 1.988898086771912e-05, + "loss": 0.9801, "step": 727 }, { - "epoch": 0.48, - "learning_rate": 1.7786871583768536e-05, - "loss": 1.2639, + "epoch": 0.0761904761904762, + "grad_norm": 2.6799211618975143, + "learning_rate": 1.9888476602988063e-05, + "loss": 1.0884, "step": 728 }, { - "epoch": 0.48, - "learning_rate": 1.7780176478015706e-05, - "loss": 1.2828, + "epoch": 0.07629513343799058, + "grad_norm": 2.6807547704920665, + "learning_rate": 1.9887971202050996e-05, + "loss": 1.0336, "step": 729 }, { - "epoch": 0.48, - "learning_rate": 1.7773472524699803e-05, - "loss": 1.2552, + "epoch": 0.07639979068550497, + "grad_norm": 2.8884167531514233, + "learning_rate": 1.9887464664965997e-05, + "loss": 1.108, "step": 730 }, { - "epoch": 0.48, - "learning_rate": 1.7766759731444512e-05, - "loss": 1.2183, + "epoch": 0.07650444793301936, + "grad_norm": 3.1096081530713118, + "learning_rate": 1.9886956991791258e-05, + "loss": 1.0963, "step": 731 }, { - "epoch": 0.48, - "learning_rate": 1.776003810588358e-05, - "loss": 1.3489, + "epoch": 0.07660910518053375, + "grad_norm": 2.5776344588421156, + "learning_rate": 1.9886448182585122e-05, + "loss": 1.071, "step": 732 }, { - "epoch": 0.48, - "learning_rate": 1.7753307655660792e-05, - "loss": 1.3426, + "epoch": 0.07671376242804814, + "grad_norm": 2.7119013146953814, + "learning_rate": 1.9885938237406048e-05, + "loss": 1.0899, "step": 733 }, { - "epoch": 0.48, - "learning_rate": 1.7746568388429966e-05, - "loss": 1.3664, + "epoch": 0.07681841967556254, + "grad_norm": 2.5283853043631153, + "learning_rate": 1.9885427156312627e-05, + "loss": 1.1398, "step": 734 }, { - "epoch": 0.48, - "learning_rate": 1.7739820311854957e-05, - "loss": 1.2674, + "epoch": 0.07692307692307693, + "grad_norm": 2.563527440025152, + "learning_rate": 1.988491493936359e-05, + "loss": 1.0551, "step": 735 }, { - "epoch": 0.48, - "learning_rate": 1.7733063433609623e-05, - "loss": 1.4536, + "epoch": 0.07702773417059132, + "grad_norm": 2.4468041205415805, + "learning_rate": 1.9884401586617785e-05, + "loss": 1.1602, "step": 736 }, { - "epoch": 0.49, - "learning_rate": 1.7726297761377847e-05, - "loss": 1.2702, + "epoch": 0.0771323914181057, + "grad_norm": 3.40716823965318, + "learning_rate": 1.98838870981342e-05, + "loss": 1.0084, "step": 737 }, { - "epoch": 0.49, - "learning_rate": 1.7719523302853498e-05, - "loss": 1.3485, + "epoch": 0.07723704866562009, + "grad_norm": 2.9705969422751113, + "learning_rate": 1.988337147397195e-05, + "loss": 1.2072, "step": 738 }, { - "epoch": 0.49, - "learning_rate": 1.771274006574045e-05, - "loss": 1.4078, + "epoch": 0.07734170591313448, + "grad_norm": 2.4958733209485864, + "learning_rate": 1.9882854714190283e-05, + "loss": 0.8871, "step": 739 }, { - "epoch": 0.49, - "learning_rate": 1.7705948057752547e-05, - "loss": 1.1793, + "epoch": 0.07744636316064887, + "grad_norm": 2.890941680142748, + "learning_rate": 1.9882336818848576e-05, + "loss": 0.9567, "step": 740 }, { - "epoch": 0.49, - "learning_rate": 1.769914728661362e-05, - "loss": 1.3122, + "epoch": 0.07755102040816327, + "grad_norm": 3.002499996703523, + "learning_rate": 1.9881817788006335e-05, + "loss": 1.1373, "step": 741 }, { - "epoch": 0.49, - "learning_rate": 1.7692337760057454e-05, - "loss": 1.3285, + "epoch": 0.07765567765567766, + "grad_norm": 2.697898190678546, + "learning_rate": 1.9881297621723194e-05, + "loss": 0.9728, "step": 742 }, { - "epoch": 0.49, - "learning_rate": 1.76855194858278e-05, - "loss": 1.4051, + "epoch": 0.07776033490319205, + "grad_norm": 3.317056448747156, + "learning_rate": 1.9880776320058926e-05, + "loss": 1.0526, "step": 743 }, { - "epoch": 0.49, - "learning_rate": 1.767869247167835e-05, - "loss": 1.3203, + "epoch": 0.07786499215070644, + "grad_norm": 3.070014562054591, + "learning_rate": 1.988025388307343e-05, + "loss": 1.1524, "step": 744 }, { - "epoch": 0.49, - "learning_rate": 1.7671856725372737e-05, - "loss": 1.3042, + "epoch": 0.07796964939822083, + "grad_norm": 2.942240381438674, + "learning_rate": 1.9879730310826733e-05, + "loss": 1.1355, "step": 745 }, { - "epoch": 0.49, - "learning_rate": 1.7665012254684522e-05, - "loss": 1.2165, + "epoch": 0.07807430664573521, + "grad_norm": 2.4967345724379144, + "learning_rate": 1.9879205603379e-05, + "loss": 1.0319, "step": 746 }, { - "epoch": 0.49, - "learning_rate": 1.76581590673972e-05, - "loss": 1.3246, + "epoch": 0.0781789638932496, + "grad_norm": 2.5917380077833445, + "learning_rate": 1.9878679760790508e-05, + "loss": 0.8069, "step": 747 }, { - "epoch": 0.49, - "learning_rate": 1.7651297171304157e-05, - "loss": 1.3853, + "epoch": 0.078283621140764, + "grad_norm": 1.830900388771415, + "learning_rate": 1.9878152783121686e-05, + "loss": 0.7969, "step": 748 }, { - "epoch": 0.49, - "learning_rate": 1.7644426574208698e-05, - "loss": 1.3546, + "epoch": 0.07838827838827839, + "grad_norm": 3.1335346676454017, + "learning_rate": 1.9877624670433086e-05, + "loss": 1.1493, "step": 749 }, { - "epoch": 0.49, - "learning_rate": 1.763754728392402e-05, - "loss": 1.3478, + "epoch": 0.07849293563579278, + "grad_norm": 2.3535864496436356, + "learning_rate": 1.987709542278539e-05, + "loss": 0.8744, "step": 750 }, { - "epoch": 0.49, - "learning_rate": 1.7630659308273204e-05, - "loss": 1.2798, + "epoch": 0.07859759288330717, + "grad_norm": 2.9044705756712044, + "learning_rate": 1.9876565040239404e-05, + "loss": 0.9138, "step": 751 }, { - "epoch": 0.5, - "learning_rate": 1.7623762655089208e-05, - "loss": 1.2209, + "epoch": 0.07870225013082156, + "grad_norm": 3.197340259960328, + "learning_rate": 1.9876033522856072e-05, + "loss": 1.0667, "step": 752 }, { - "epoch": 0.5, - "learning_rate": 1.761685733221486e-05, - "loss": 1.3514, + "epoch": 0.07880690737833596, + "grad_norm": 2.672736829036044, + "learning_rate": 1.987550087069647e-05, + "loss": 1.0583, "step": 753 }, { - "epoch": 0.5, - "learning_rate": 1.7609943347502846e-05, - "loss": 1.3803, + "epoch": 0.07891156462585033, + "grad_norm": 2.9518811281187585, + "learning_rate": 1.9874967083821796e-05, + "loss": 1.0376, "step": 754 }, { - "epoch": 0.5, - "learning_rate": 1.7603020708815695e-05, - "loss": 1.3253, + "epoch": 0.07901622187336473, + "grad_norm": 3.0369965552675917, + "learning_rate": 1.9874432162293388e-05, + "loss": 1.0123, "step": 755 }, { - "epoch": 0.5, - "learning_rate": 1.7596089424025794e-05, - "loss": 1.2287, + "epoch": 0.07912087912087912, + "grad_norm": 2.527882866802051, + "learning_rate": 1.9873896106172705e-05, + "loss": 1.0585, "step": 756 }, { - "epoch": 0.5, - "learning_rate": 1.7589149501015345e-05, - "loss": 1.5018, + "epoch": 0.07922553636839351, + "grad_norm": 2.619647602913425, + "learning_rate": 1.9873358915521345e-05, + "loss": 0.9611, "step": 757 }, { - "epoch": 0.5, - "learning_rate": 1.758220094767638e-05, - "loss": 1.1544, + "epoch": 0.0793301936159079, + "grad_norm": 3.167480526301533, + "learning_rate": 1.9872820590401028e-05, + "loss": 1.0156, "step": 758 }, { - "epoch": 0.5, - "learning_rate": 1.757524377191075e-05, - "loss": 1.2132, + "epoch": 0.0794348508634223, + "grad_norm": 2.573985063306484, + "learning_rate": 1.9872281130873608e-05, + "loss": 1.0724, "step": 759 }, { - "epoch": 0.5, - "learning_rate": 1.7568277981630107e-05, - "loss": 1.3695, + "epoch": 0.07953950811093669, + "grad_norm": 2.946085000736679, + "learning_rate": 1.9871740537001078e-05, + "loss": 1.1423, "step": 760 }, { - "epoch": 0.5, - "learning_rate": 1.7561303584755903e-05, - "loss": 1.2031, + "epoch": 0.07964416535845108, + "grad_norm": 2.569944570278952, + "learning_rate": 1.9871198808845546e-05, + "loss": 1.0442, "step": 761 }, { - "epoch": 0.5, - "learning_rate": 1.7554320589219368e-05, - "loss": 1.285, + "epoch": 0.07974882260596547, + "grad_norm": 2.1772171170370136, + "learning_rate": 1.987065594646926e-05, + "loss": 1.0529, "step": 762 }, { - "epoch": 0.5, - "learning_rate": 1.754732900296151e-05, - "loss": 1.2214, + "epoch": 0.07985347985347985, + "grad_norm": 2.845399072310291, + "learning_rate": 1.9870111949934597e-05, + "loss": 1.0669, "step": 763 }, { - "epoch": 0.5, - "learning_rate": 1.754032883393313e-05, - "loss": 1.3069, + "epoch": 0.07995813710099424, + "grad_norm": 2.987591086180299, + "learning_rate": 1.9869566819304064e-05, + "loss": 1.1109, "step": 764 }, { - "epoch": 0.5, - "learning_rate": 1.7533320090094763e-05, - "loss": 1.3448, + "epoch": 0.08006279434850863, + "grad_norm": 2.9260073141957794, + "learning_rate": 1.986902055464029e-05, + "loss": 1.1047, "step": 765 }, { - "epoch": 0.5, - "learning_rate": 1.7526302779416703e-05, - "loss": 1.287, + "epoch": 0.08016745159602302, + "grad_norm": 2.978042723105865, + "learning_rate": 1.9868473156006055e-05, + "loss": 1.1091, "step": 766 }, { - "epoch": 0.5, - "learning_rate": 1.7519276909878988e-05, - "loss": 1.3828, + "epoch": 0.08027210884353742, + "grad_norm": 3.0411024163342995, + "learning_rate": 1.9867924623464244e-05, + "loss": 1.072, "step": 767 }, { - "epoch": 0.51, - "learning_rate": 1.751224248947139e-05, - "loss": 1.4209, + "epoch": 0.08037676609105181, + "grad_norm": 2.804164851030498, + "learning_rate": 1.986737495707789e-05, + "loss": 1.1031, "step": 768 }, { - "epoch": 0.51, - "learning_rate": 1.7505199526193398e-05, - "loss": 1.2545, + "epoch": 0.0804814233385662, + "grad_norm": 2.531079552784771, + "learning_rate": 1.986682415691015e-05, + "loss": 1.0565, "step": 769 }, { - "epoch": 0.51, - "learning_rate": 1.749814802805423e-05, - "loss": 1.3509, + "epoch": 0.08058608058608059, + "grad_norm": 3.015392851420843, + "learning_rate": 1.9866272223024314e-05, + "loss": 1.0387, "step": 770 }, { - "epoch": 0.51, - "learning_rate": 1.7491088003072798e-05, - "loss": 1.2408, + "epoch": 0.08069073783359498, + "grad_norm": 3.0741672862303258, + "learning_rate": 1.9865719155483794e-05, + "loss": 1.0325, "step": 771 }, { - "epoch": 0.51, - "learning_rate": 1.748401945927771e-05, - "loss": 1.3241, + "epoch": 0.08079539508110936, + "grad_norm": 2.6992994649398785, + "learning_rate": 1.9865164954352146e-05, + "loss": 1.0413, "step": 772 }, { - "epoch": 0.51, - "learning_rate": 1.7476942404707267e-05, - "loss": 1.2471, + "epoch": 0.08090005232862375, + "grad_norm": 2.652137381945092, + "learning_rate": 1.986460961969305e-05, + "loss": 1.0665, "step": 773 }, { - "epoch": 0.51, - "learning_rate": 1.746985684740945e-05, - "loss": 1.356, + "epoch": 0.08100470957613815, + "grad_norm": 2.8296867873205316, + "learning_rate": 1.9864053151570305e-05, + "loss": 1.1983, "step": 774 }, { - "epoch": 0.51, - "learning_rate": 1.74627627954419e-05, - "loss": 1.297, + "epoch": 0.08110936682365254, + "grad_norm": 2.6932652023688908, + "learning_rate": 1.986349555004786e-05, + "loss": 0.9712, "step": 775 }, { - "epoch": 0.51, - "learning_rate": 1.745566025687193e-05, - "loss": 1.3418, + "epoch": 0.08121402407116693, + "grad_norm": 2.8908231690550337, + "learning_rate": 1.9862936815189778e-05, + "loss": 1.0777, "step": 776 }, { - "epoch": 0.51, - "learning_rate": 1.74485492397765e-05, - "loss": 1.3253, + "epoch": 0.08131868131868132, + "grad_norm": 2.7585434811777327, + "learning_rate": 1.9862376947060263e-05, + "loss": 1.0458, "step": 777 }, { - "epoch": 0.51, - "learning_rate": 1.74414297522422e-05, - "loss": 1.3031, + "epoch": 0.08142333856619571, + "grad_norm": 3.4111164784807535, + "learning_rate": 1.9861815945723647e-05, + "loss": 1.0959, "step": 778 }, { - "epoch": 0.51, - "learning_rate": 1.7434301802365264e-05, - "loss": 1.3136, + "epoch": 0.0815279958137101, + "grad_norm": 2.794422350270696, + "learning_rate": 1.9861253811244383e-05, + "loss": 1.0158, "step": 779 }, { - "epoch": 0.51, - "learning_rate": 1.7427165398251557e-05, - "loss": 1.2255, + "epoch": 0.08163265306122448, + "grad_norm": 2.669728034578084, + "learning_rate": 1.9860690543687064e-05, + "loss": 1.0899, "step": 780 }, { - "epoch": 0.51, - "learning_rate": 1.7420020548016538e-05, - "loss": 1.2092, + "epoch": 0.08173731030873888, + "grad_norm": 2.8079261760048513, + "learning_rate": 1.9860126143116412e-05, + "loss": 1.1234, "step": 781 }, { - "epoch": 0.51, - "learning_rate": 1.7412867259785286e-05, - "loss": 1.3789, + "epoch": 0.08184196755625327, + "grad_norm": 2.926846571046833, + "learning_rate": 1.9859560609597282e-05, + "loss": 1.0468, "step": 782 }, { - "epoch": 0.52, - "learning_rate": 1.7405705541692476e-05, - "loss": 1.2396, + "epoch": 0.08194662480376766, + "grad_norm": 2.8972050163501213, + "learning_rate": 1.9858993943194648e-05, + "loss": 1.044, "step": 783 }, { - "epoch": 0.52, - "learning_rate": 1.7398535401882353e-05, - "loss": 1.2194, + "epoch": 0.08205128205128205, + "grad_norm": 2.718732850064825, + "learning_rate": 1.9858426143973623e-05, + "loss": 1.0915, "step": 784 }, { - "epoch": 0.52, - "learning_rate": 1.739135684850876e-05, - "loss": 1.3393, + "epoch": 0.08215593929879644, + "grad_norm": 2.509735712880021, + "learning_rate": 1.9857857211999452e-05, + "loss": 1.1174, "step": 785 }, { - "epoch": 0.52, - "learning_rate": 1.7384169889735094e-05, - "loss": 1.3454, + "epoch": 0.08226059654631084, + "grad_norm": 2.7443000185455464, + "learning_rate": 1.98572871473375e-05, + "loss": 0.8195, "step": 786 }, { - "epoch": 0.52, - "learning_rate": 1.7376974533734317e-05, - "loss": 1.3465, + "epoch": 0.08236525379382523, + "grad_norm": 3.129320040379283, + "learning_rate": 1.9856715950053272e-05, + "loss": 1.0873, "step": 787 }, { - "epoch": 0.52, - "learning_rate": 1.7369770788688937e-05, - "loss": 1.3755, + "epoch": 0.08246991104133962, + "grad_norm": 2.734612164978207, + "learning_rate": 1.9856143620212403e-05, + "loss": 1.1222, "step": 788 }, { - "epoch": 0.52, - "learning_rate": 1.7362558662791003e-05, - "loss": 1.2685, + "epoch": 0.082574568288854, + "grad_norm": 3.008168731676326, + "learning_rate": 1.9855570157880648e-05, + "loss": 0.9387, "step": 789 }, { - "epoch": 0.52, - "learning_rate": 1.7355338164242098e-05, - "loss": 1.2445, + "epoch": 0.08267922553636839, + "grad_norm": 3.1280456930254923, + "learning_rate": 1.9854995563123904e-05, + "loss": 1.1312, "step": 790 }, { - "epoch": 0.52, - "learning_rate": 1.734810930125332e-05, - "loss": 1.4126, + "epoch": 0.08278388278388278, + "grad_norm": 2.8486226186326338, + "learning_rate": 1.985441983600819e-05, + "loss": 0.9654, "step": 791 }, { - "epoch": 0.52, - "learning_rate": 1.734087208204528e-05, - "loss": 1.2169, + "epoch": 0.08288854003139717, + "grad_norm": 2.880542844835971, + "learning_rate": 1.985384297659966e-05, + "loss": 1.0636, "step": 792 }, { - "epoch": 0.52, - "learning_rate": 1.73336265148481e-05, - "loss": 1.3358, + "epoch": 0.08299319727891157, + "grad_norm": 2.7235931768471437, + "learning_rate": 1.98532649849646e-05, + "loss": 1.017, "step": 793 }, { - "epoch": 0.52, - "learning_rate": 1.7326372607901388e-05, - "loss": 1.364, + "epoch": 0.08309785452642596, + "grad_norm": 2.968388296598955, + "learning_rate": 1.9852685861169415e-05, + "loss": 1.2004, "step": 794 }, { - "epoch": 0.52, - "learning_rate": 1.7319110369454232e-05, - "loss": 1.309, + "epoch": 0.08320251177394035, + "grad_norm": 2.80143430250923, + "learning_rate": 1.9852105605280653e-05, + "loss": 0.9673, "step": 795 }, { - "epoch": 0.52, - "learning_rate": 1.7311839807765204e-05, - "loss": 1.2683, + "epoch": 0.08330716902145474, + "grad_norm": 2.579406765144872, + "learning_rate": 1.9851524217364982e-05, + "loss": 0.9932, "step": 796 }, { - "epoch": 0.52, - "learning_rate": 1.7304560931102334e-05, - "loss": 1.3465, + "epoch": 0.08341182626896912, + "grad_norm": 2.5521410566455285, + "learning_rate": 1.985094169748921e-05, + "loss": 1.0031, "step": 797 }, { - "epoch": 0.53, - "learning_rate": 1.729727374774311e-05, - "loss": 1.2255, + "epoch": 0.08351648351648351, + "grad_norm": 2.5987542618084145, + "learning_rate": 1.9850358045720268e-05, + "loss": 1.066, "step": 798 }, { - "epoch": 0.53, - "learning_rate": 1.7289978265974466e-05, - "loss": 1.2879, + "epoch": 0.0836211407639979, + "grad_norm": 2.6060537256654017, + "learning_rate": 1.9849773262125215e-05, + "loss": 1.0626, "step": 799 }, { - "epoch": 0.53, - "learning_rate": 1.728267449409278e-05, - "loss": 1.28, + "epoch": 0.0837257980115123, + "grad_norm": 2.7508431371390514, + "learning_rate": 1.9849187346771247e-05, + "loss": 1.0875, "step": 800 }, { - "epoch": 0.53, - "learning_rate": 1.7275362440403844e-05, - "loss": 1.226, + "epoch": 0.08383045525902669, + "grad_norm": 3.149432837745595, + "learning_rate": 1.9848600299725687e-05, + "loss": 1.0231, "step": 801 }, { - "epoch": 0.53, - "learning_rate": 1.7268042113222877e-05, - "loss": 1.3465, + "epoch": 0.08393511250654108, + "grad_norm": 3.0521072775807716, + "learning_rate": 1.9848012121055987e-05, + "loss": 1.0271, "step": 802 }, { - "epoch": 0.53, - "learning_rate": 1.726071352087451e-05, - "loss": 1.3905, + "epoch": 0.08403976975405547, + "grad_norm": 2.5717755802540085, + "learning_rate": 1.9847422810829732e-05, + "loss": 1.0108, "step": 803 }, { - "epoch": 0.53, - "learning_rate": 1.725337667169276e-05, - "loss": 1.3734, + "epoch": 0.08414442700156986, + "grad_norm": 2.574749430789081, + "learning_rate": 1.984683236911463e-05, + "loss": 1.0094, "step": 804 }, { - "epoch": 0.53, - "learning_rate": 1.7246031574021047e-05, - "loss": 1.3877, + "epoch": 0.08424908424908426, + "grad_norm": 2.761780907407162, + "learning_rate": 1.984624079597853e-05, + "loss": 0.9979, "step": 805 }, { - "epoch": 0.53, - "learning_rate": 1.723867823621216e-05, - "loss": 1.4236, + "epoch": 0.08435374149659863, + "grad_norm": 2.2659780949782076, + "learning_rate": 1.98456480914894e-05, + "loss": 1.0866, "step": 806 }, { - "epoch": 0.53, - "learning_rate": 1.7231316666628268e-05, - "loss": 1.4508, + "epoch": 0.08445839874411303, + "grad_norm": 2.880890790249313, + "learning_rate": 1.9845054255715345e-05, + "loss": 1.0858, "step": 807 }, { - "epoch": 0.53, - "learning_rate": 1.7223946873640897e-05, - "loss": 1.3736, + "epoch": 0.08456305599162742, + "grad_norm": 3.2637552119352087, + "learning_rate": 1.98444592887246e-05, + "loss": 1.177, "step": 808 }, { - "epoch": 0.53, - "learning_rate": 1.7216568865630924e-05, - "loss": 1.3627, + "epoch": 0.08466771323914181, + "grad_norm": 2.7805016938861216, + "learning_rate": 1.9843863190585527e-05, + "loss": 1.1337, "step": 809 }, { - "epoch": 0.53, - "learning_rate": 1.720918265098857e-05, - "loss": 1.3579, + "epoch": 0.0847723704866562, + "grad_norm": 2.9010068085027023, + "learning_rate": 1.9843265961366614e-05, + "loss": 1.0259, "step": 810 }, { - "epoch": 0.53, - "learning_rate": 1.720178823811338e-05, - "loss": 1.3208, + "epoch": 0.0848770277341706, + "grad_norm": 2.252289860307481, + "learning_rate": 1.984266760113649e-05, + "loss": 0.7456, "step": 811 }, { - "epoch": 0.53, - "learning_rate": 1.7194385635414242e-05, - "loss": 1.3973, + "epoch": 0.08498168498168499, + "grad_norm": 2.80087053004204, + "learning_rate": 1.9842068109963904e-05, + "loss": 1.1414, "step": 812 }, { - "epoch": 0.54, - "learning_rate": 1.718697485130934e-05, - "loss": 1.2564, + "epoch": 0.08508634222919938, + "grad_norm": 2.9850122512331247, + "learning_rate": 1.9841467487917743e-05, + "loss": 1.1755, "step": 813 }, { - "epoch": 0.54, - "learning_rate": 1.717955589422616e-05, - "loss": 1.1836, + "epoch": 0.08519099947671377, + "grad_norm": 3.2848807536761595, + "learning_rate": 1.9840865735067016e-05, + "loss": 1.0185, "step": 814 }, { - "epoch": 0.54, - "learning_rate": 1.7172128772601497e-05, - "loss": 1.3372, + "epoch": 0.08529565672422815, + "grad_norm": 2.8281996236712272, + "learning_rate": 1.9840262851480866e-05, + "loss": 1.1314, "step": 815 }, { - "epoch": 0.54, - "learning_rate": 1.7164693494881418e-05, - "loss": 1.3002, + "epoch": 0.08540031397174254, + "grad_norm": 2.979553704920702, + "learning_rate": 1.9839658837228562e-05, + "loss": 1.1369, "step": 816 }, { - "epoch": 0.54, - "learning_rate": 1.715725006952127e-05, - "loss": 1.3185, + "epoch": 0.08550497121925693, + "grad_norm": 2.8697433057340134, + "learning_rate": 1.9839053692379516e-05, + "loss": 1.0285, "step": 817 }, { - "epoch": 0.54, - "learning_rate": 1.7149798504985665e-05, - "loss": 1.2068, + "epoch": 0.08560962846677132, + "grad_norm": 3.4761142719041906, + "learning_rate": 1.9838447417003253e-05, + "loss": 1.0726, "step": 818 }, { - "epoch": 0.54, - "learning_rate": 1.7142338809748475e-05, - "loss": 1.3255, + "epoch": 0.08571428571428572, + "grad_norm": 2.6104944016292957, + "learning_rate": 1.983784001116944e-05, + "loss": 0.9732, "step": 819 }, { - "epoch": 0.54, - "learning_rate": 1.7134870992292808e-05, - "loss": 1.3848, + "epoch": 0.08581894296180011, + "grad_norm": 2.773752386393791, + "learning_rate": 1.9837231474947862e-05, + "loss": 1.0283, "step": 820 }, { - "epoch": 0.54, - "learning_rate": 1.712739506111102e-05, - "loss": 1.2077, + "epoch": 0.0859236002093145, + "grad_norm": 2.70939361192841, + "learning_rate": 1.9836621808408448e-05, + "loss": 1.0672, "step": 821 }, { - "epoch": 0.54, - "learning_rate": 1.7119911024704684e-05, - "loss": 1.346, + "epoch": 0.08602825745682889, + "grad_norm": 2.73253043675312, + "learning_rate": 1.9836011011621246e-05, + "loss": 0.9136, "step": 822 }, { - "epoch": 0.54, - "learning_rate": 1.7112418891584598e-05, - "loss": 1.2968, + "epoch": 0.08613291470434327, + "grad_norm": 2.5285447684085773, + "learning_rate": 1.983539908465644e-05, + "loss": 1.0069, "step": 823 }, { - "epoch": 0.54, - "learning_rate": 1.7104918670270763e-05, - "loss": 1.3476, + "epoch": 0.08623757195185766, + "grad_norm": 3.039707735225743, + "learning_rate": 1.983478602758434e-05, + "loss": 1.0909, "step": 824 }, { - "epoch": 0.54, - "learning_rate": 1.7097410369292376e-05, - "loss": 1.2234, + "epoch": 0.08634222919937205, + "grad_norm": 3.0060510240991913, + "learning_rate": 1.983417184047539e-05, + "loss": 1.0854, "step": 825 }, { - "epoch": 0.54, - "learning_rate": 1.708989399718783e-05, - "loss": 1.2995, + "epoch": 0.08644688644688645, + "grad_norm": 2.575407129448105, + "learning_rate": 1.9833556523400156e-05, + "loss": 0.9942, "step": 826 }, { - "epoch": 0.54, - "learning_rate": 1.708236956250469e-05, - "loss": 1.454, + "epoch": 0.08655154369440084, + "grad_norm": 2.399757949657678, + "learning_rate": 1.9832940076429342e-05, + "loss": 1.0323, "step": 827 }, { - "epoch": 0.55, - "learning_rate": 1.707483707379969e-05, - "loss": 1.1749, + "epoch": 0.08665620094191523, + "grad_norm": 2.6351017617726935, + "learning_rate": 1.9832322499633785e-05, + "loss": 1.0229, "step": 828 }, { - "epoch": 0.55, - "learning_rate": 1.706729653963872e-05, - "loss": 1.3245, + "epoch": 0.08676085818942962, + "grad_norm": 2.0539887200521685, + "learning_rate": 1.9831703793084436e-05, + "loss": 0.8446, "step": 829 }, { - "epoch": 0.55, - "learning_rate": 1.7059747968596836e-05, - "loss": 1.3509, + "epoch": 0.08686551543694401, + "grad_norm": 2.655675530823441, + "learning_rate": 1.9831083956852396e-05, + "loss": 1.0945, "step": 830 }, { - "epoch": 0.55, - "learning_rate": 1.705219136925821e-05, - "loss": 1.3059, + "epoch": 0.0869701726844584, + "grad_norm": 2.3301289659294224, + "learning_rate": 1.9830462991008875e-05, + "loss": 0.9982, "step": 831 }, { - "epoch": 0.55, - "learning_rate": 1.704462675021616e-05, - "loss": 1.2868, + "epoch": 0.08707482993197278, + "grad_norm": 1.9878115279486144, + "learning_rate": 1.982984089562523e-05, + "loss": 0.7846, "step": 832 }, { - "epoch": 0.55, - "learning_rate": 1.7037054120073115e-05, - "loss": 1.3464, + "epoch": 0.08717948717948718, + "grad_norm": 2.7308227759486043, + "learning_rate": 1.9829217670772936e-05, + "loss": 1.0973, "step": 833 }, { - "epoch": 0.55, - "learning_rate": 1.7029473487440617e-05, - "loss": 1.3552, + "epoch": 0.08728414442700157, + "grad_norm": 3.21302132263838, + "learning_rate": 1.9828593316523608e-05, + "loss": 1.1387, "step": 834 }, { - "epoch": 0.55, - "learning_rate": 1.7021884860939314e-05, - "loss": 1.2263, + "epoch": 0.08738880167451596, + "grad_norm": 2.580087307714874, + "learning_rate": 1.9827967832948985e-05, + "loss": 0.9605, "step": 835 }, { - "epoch": 0.55, - "learning_rate": 1.7014288249198937e-05, - "loss": 1.3225, + "epoch": 0.08749345892203035, + "grad_norm": 3.2690704122614505, + "learning_rate": 1.9827341220120933e-05, + "loss": 1.1851, "step": 836 }, { - "epoch": 0.55, - "learning_rate": 1.70066836608583e-05, - "loss": 1.3354, + "epoch": 0.08759811616954474, + "grad_norm": 2.4671010918720144, + "learning_rate": 1.982671347811145e-05, + "loss": 1.0144, "step": 837 }, { - "epoch": 0.55, - "learning_rate": 1.699907110456529e-05, - "loss": 1.2825, + "epoch": 0.08770277341705913, + "grad_norm": 2.3586665066642176, + "learning_rate": 1.982608460699267e-05, + "loss": 0.9452, "step": 838 }, { - "epoch": 0.55, - "learning_rate": 1.6991450588976853e-05, - "loss": 1.3966, + "epoch": 0.08780743066457353, + "grad_norm": 2.827484875847677, + "learning_rate": 1.982545460683685e-05, + "loss": 1.0465, "step": 839 }, { - "epoch": 0.55, - "learning_rate": 1.698382212275899e-05, - "loss": 1.2653, + "epoch": 0.08791208791208792, + "grad_norm": 2.8254407799166446, + "learning_rate": 1.9824823477716374e-05, + "loss": 0.9405, "step": 840 }, { - "epoch": 0.55, - "learning_rate": 1.697618571458673e-05, - "loss": 1.3246, + "epoch": 0.0880167451596023, + "grad_norm": 2.828527033050868, + "learning_rate": 1.982419121970377e-05, + "loss": 1.1592, "step": 841 }, { - "epoch": 0.55, - "learning_rate": 1.6968541373144157e-05, - "loss": 1.4638, + "epoch": 0.08812140240711669, + "grad_norm": 2.958013431535381, + "learning_rate": 1.9823557832871675e-05, + "loss": 1.1503, "step": 842 }, { - "epoch": 0.55, - "learning_rate": 1.696088910712435e-05, - "loss": 1.34, + "epoch": 0.08822605965463108, + "grad_norm": 2.5277477619914617, + "learning_rate": 1.982292331729287e-05, + "loss": 1.1166, "step": 843 }, { - "epoch": 0.56, - "learning_rate": 1.695322892522943e-05, - "loss": 1.3196, + "epoch": 0.08833071690214547, + "grad_norm": 2.8289824096892353, + "learning_rate": 1.982228767304026e-05, + "loss": 0.9632, "step": 844 }, { - "epoch": 0.56, - "learning_rate": 1.6945560836170485e-05, - "loss": 1.316, + "epoch": 0.08843537414965986, + "grad_norm": 2.5616715856813195, + "learning_rate": 1.9821650900186887e-05, + "loss": 0.963, "step": 845 }, { - "epoch": 0.56, - "learning_rate": 1.6937884848667625e-05, - "loss": 1.2138, + "epoch": 0.08854003139717426, + "grad_norm": 2.5986696555097586, + "learning_rate": 1.982101299880592e-05, + "loss": 1.1771, "step": 846 }, { - "epoch": 0.56, - "learning_rate": 1.6930200971449923e-05, - "loss": 1.2587, + "epoch": 0.08864468864468865, + "grad_norm": 3.051131850126143, + "learning_rate": 1.9820373968970642e-05, + "loss": 0.9058, "step": 847 }, { - "epoch": 0.56, - "learning_rate": 1.692250921325544e-05, - "loss": 1.2763, + "epoch": 0.08874934589220304, + "grad_norm": 2.7872842033896648, + "learning_rate": 1.9819733810754492e-05, + "loss": 1.0216, "step": 848 }, { - "epoch": 0.56, - "learning_rate": 1.691480958283118e-05, - "loss": 1.3115, + "epoch": 0.08885400313971742, + "grad_norm": 2.582106404341739, + "learning_rate": 1.981909252423102e-05, + "loss": 1.0599, "step": 849 }, { - "epoch": 0.56, - "learning_rate": 1.6907102088933118e-05, - "loss": 1.6282, + "epoch": 0.08895866038723181, + "grad_norm": 2.524366926584677, + "learning_rate": 1.981845010947391e-05, + "loss": 1.0088, "step": 850 }, { - "epoch": 0.56, - "learning_rate": 1.6899386740326164e-05, - "loss": 1.3779, + "epoch": 0.0890633176347462, + "grad_norm": 2.5675683272516445, + "learning_rate": 1.9817806566556982e-05, + "loss": 0.9454, "step": 851 }, { - "epoch": 0.56, - "learning_rate": 1.6891663545784146e-05, - "loss": 1.4226, + "epoch": 0.0891679748822606, + "grad_norm": 2.522430268689835, + "learning_rate": 1.9817161895554173e-05, + "loss": 1.0239, "step": 852 }, { - "epoch": 0.56, - "learning_rate": 1.6883932514089842e-05, - "loss": 1.2005, + "epoch": 0.08927263212977499, + "grad_norm": 2.7158212443276897, + "learning_rate": 1.9816516096539563e-05, + "loss": 0.9493, "step": 853 }, { - "epoch": 0.56, - "learning_rate": 1.6876193654034922e-05, - "loss": 1.3838, + "epoch": 0.08937728937728938, + "grad_norm": 2.6167231825894963, + "learning_rate": 1.981586916958735e-05, + "loss": 0.9662, "step": 854 }, { - "epoch": 0.56, - "learning_rate": 1.6868446974419962e-05, - "loss": 1.1105, + "epoch": 0.08948194662480377, + "grad_norm": 2.566987568415224, + "learning_rate": 1.981522111477187e-05, + "loss": 0.8303, "step": 855 }, { - "epoch": 0.56, - "learning_rate": 1.686069248405444e-05, - "loss": 1.4163, + "epoch": 0.08958660387231816, + "grad_norm": 1.9987336493251993, + "learning_rate": 1.981457193216759e-05, + "loss": 0.8232, "step": 856 }, { - "epoch": 0.56, - "learning_rate": 1.68529301917567e-05, - "loss": 1.3145, + "epoch": 0.08969126111983255, + "grad_norm": 3.165150991848205, + "learning_rate": 1.98139216218491e-05, + "loss": 1.0353, "step": 857 }, { - "epoch": 0.56, - "learning_rate": 1.6845160106353975e-05, - "loss": 1.2457, + "epoch": 0.08979591836734693, + "grad_norm": 2.7844496531584766, + "learning_rate": 1.9813270183891117e-05, + "loss": 1.1477, "step": 858 }, { - "epoch": 0.57, - "learning_rate": 1.6837382236682347e-05, - "loss": 1.2608, + "epoch": 0.08990057561486132, + "grad_norm": 2.8027853604323747, + "learning_rate": 1.98126176183685e-05, + "loss": 1.0745, "step": 859 }, { - "epoch": 0.57, - "learning_rate": 1.682959659158676e-05, - "loss": 1.1787, + "epoch": 0.09000523286237572, + "grad_norm": 2.741551828332874, + "learning_rate": 1.9811963925356222e-05, + "loss": 1.2044, "step": 860 }, { - "epoch": 0.57, - "learning_rate": 1.6821803179920996e-05, - "loss": 1.259, + "epoch": 0.09010989010989011, + "grad_norm": 2.2316560473076015, + "learning_rate": 1.98113091049294e-05, + "loss": 0.8208, "step": 861 }, { - "epoch": 0.57, - "learning_rate": 1.681400201054766e-05, - "loss": 1.3097, + "epoch": 0.0902145473574045, + "grad_norm": 2.8642542806528026, + "learning_rate": 1.9810653157163275e-05, + "loss": 1.1049, "step": 862 }, { - "epoch": 0.57, - "learning_rate": 1.68061930923382e-05, - "loss": 1.199, + "epoch": 0.09031920460491889, + "grad_norm": 2.974066935720496, + "learning_rate": 1.9809996082133208e-05, + "loss": 1.1108, "step": 863 }, { - "epoch": 0.57, - "learning_rate": 1.6798376434172862e-05, - "loss": 1.3635, + "epoch": 0.09042386185243328, + "grad_norm": 3.751229650832022, + "learning_rate": 1.980933787991471e-05, + "loss": 1.0713, "step": 864 }, { - "epoch": 0.57, - "learning_rate": 1.6790552044940692e-05, - "loss": 1.347, + "epoch": 0.09052851909994768, + "grad_norm": 2.355691840091666, + "learning_rate": 1.98086785505834e-05, + "loss": 1.0267, "step": 865 }, { - "epoch": 0.57, - "learning_rate": 1.678271993353953e-05, - "loss": 1.4459, + "epoch": 0.09063317634746205, + "grad_norm": 2.4093700564341782, + "learning_rate": 1.9808018094215044e-05, + "loss": 1.0784, "step": 866 }, { - "epoch": 0.57, - "learning_rate": 1.6774880108876005e-05, - "loss": 1.3616, + "epoch": 0.09073783359497645, + "grad_norm": 2.9768464412651965, + "learning_rate": 1.9807356510885526e-05, + "loss": 1.0033, "step": 867 }, { - "epoch": 0.57, - "learning_rate": 1.6767032579865503e-05, - "loss": 1.096, + "epoch": 0.09084249084249084, + "grad_norm": 2.930398030142532, + "learning_rate": 1.980669380067086e-05, + "loss": 1.048, "step": 868 }, { - "epoch": 0.57, - "learning_rate": 1.6759177355432186e-05, - "loss": 1.4208, + "epoch": 0.09094714809000523, + "grad_norm": 2.519451550821377, + "learning_rate": 1.9806029963647198e-05, + "loss": 1.0554, "step": 869 }, { - "epoch": 0.57, - "learning_rate": 1.6751314444508967e-05, - "loss": 1.2412, + "epoch": 0.09105180533751962, + "grad_norm": 2.7989080225534084, + "learning_rate": 1.9805364999890813e-05, + "loss": 1.0775, "step": 870 }, { - "epoch": 0.57, - "learning_rate": 1.674344385603748e-05, - "loss": 1.3455, + "epoch": 0.09115646258503401, + "grad_norm": 2.41370990986101, + "learning_rate": 1.980469890947811e-05, + "loss": 1.0857, "step": 871 }, { - "epoch": 0.57, - "learning_rate": 1.6735565598968114e-05, - "loss": 1.2871, + "epoch": 0.0912611198325484, + "grad_norm": 3.1596287961198146, + "learning_rate": 1.9804031692485633e-05, + "loss": 1.0765, "step": 872 }, { - "epoch": 0.57, - "learning_rate": 1.672767968225997e-05, - "loss": 1.2198, + "epoch": 0.0913657770800628, + "grad_norm": 2.584700642828629, + "learning_rate": 1.9803363348990035e-05, + "loss": 0.941, "step": 873 }, { - "epoch": 0.58, - "learning_rate": 1.6719786114880846e-05, - "loss": 1.2766, + "epoch": 0.09147043432757719, + "grad_norm": 2.8784994569744193, + "learning_rate": 1.9802693879068116e-05, + "loss": 1.1304, "step": 874 }, { - "epoch": 0.58, - "learning_rate": 1.6711884905807264e-05, - "loss": 1.2012, + "epoch": 0.09157509157509157, + "grad_norm": 2.849171759445592, + "learning_rate": 1.9802023282796797e-05, + "loss": 1.0429, "step": 875 }, { - "epoch": 0.58, - "learning_rate": 1.670397606402442e-05, - "loss": 1.4071, + "epoch": 0.09167974882260596, + "grad_norm": 3.3153795943487268, + "learning_rate": 1.980135156025313e-05, + "loss": 1.1432, "step": 876 }, { - "epoch": 0.58, - "learning_rate": 1.66960595985262e-05, - "loss": 1.429, + "epoch": 0.09178440607012035, + "grad_norm": 2.472442664010389, + "learning_rate": 1.9800678711514304e-05, + "loss": 1.0624, "step": 877 }, { - "epoch": 0.58, - "learning_rate": 1.6688135518315144e-05, - "loss": 1.3494, + "epoch": 0.09188906331763474, + "grad_norm": 2.7245355328223533, + "learning_rate": 1.9800004736657623e-05, + "loss": 1.0272, "step": 878 }, { - "epoch": 0.58, - "learning_rate": 1.668020383240247e-05, - "loss": 1.2628, + "epoch": 0.09199372056514914, + "grad_norm": 2.5078380754741847, + "learning_rate": 1.9799329635760533e-05, + "loss": 1.0269, "step": 879 }, { - "epoch": 0.58, - "learning_rate": 1.6672264549808033e-05, - "loss": 1.3206, + "epoch": 0.09209837781266353, + "grad_norm": 2.9844539652083815, + "learning_rate": 1.97986534089006e-05, + "loss": 0.9574, "step": 880 }, { - "epoch": 0.58, - "learning_rate": 1.6664317679560333e-05, - "loss": 1.3291, + "epoch": 0.09220303506017792, + "grad_norm": 3.058018108494758, + "learning_rate": 1.9797976056155527e-05, + "loss": 1.0867, "step": 881 }, { - "epoch": 0.58, - "learning_rate": 1.665636323069649e-05, - "loss": 1.4145, + "epoch": 0.09230769230769231, + "grad_norm": 2.675532794463805, + "learning_rate": 1.9797297577603144e-05, + "loss": 1.1262, "step": 882 }, { - "epoch": 0.58, - "learning_rate": 1.6648401212262253e-05, - "loss": 1.3661, + "epoch": 0.0924123495552067, + "grad_norm": 2.626017155020485, + "learning_rate": 1.979661797332141e-05, + "loss": 1.1043, "step": 883 }, { - "epoch": 0.58, - "learning_rate": 1.6640431633311975e-05, - "loss": 1.3263, + "epoch": 0.09251700680272108, + "grad_norm": 2.876484472128455, + "learning_rate": 1.9795937243388405e-05, + "loss": 1.1571, "step": 884 }, { - "epoch": 0.58, - "learning_rate": 1.6632454502908605e-05, - "loss": 1.3406, + "epoch": 0.09262166405023547, + "grad_norm": 2.5311189083204146, + "learning_rate": 1.9795255387882357e-05, + "loss": 1.0845, "step": 885 }, { - "epoch": 0.58, - "learning_rate": 1.662446983012368e-05, - "loss": 1.3074, + "epoch": 0.09272632129774987, + "grad_norm": 2.510089613368414, + "learning_rate": 1.9794572406881606e-05, + "loss": 1.0432, "step": 886 }, { - "epoch": 0.58, - "learning_rate": 1.6616477624037317e-05, - "loss": 1.3181, + "epoch": 0.09283097854526426, + "grad_norm": 3.2255430026028247, + "learning_rate": 1.979388830046463e-05, + "loss": 1.1879, "step": 887 }, { - "epoch": 0.58, - "learning_rate": 1.6608477893738197e-05, - "loss": 1.3829, + "epoch": 0.09293563579277865, + "grad_norm": 2.4203136160149716, + "learning_rate": 1.9793203068710036e-05, + "loss": 0.8051, "step": 888 }, { - "epoch": 0.59, - "learning_rate": 1.6600470648323554e-05, - "loss": 1.284, + "epoch": 0.09304029304029304, + "grad_norm": 2.8289265412585736, + "learning_rate": 1.9792516711696555e-05, + "loss": 0.9694, "step": 889 }, { - "epoch": 0.59, - "learning_rate": 1.6592455896899177e-05, - "loss": 1.259, + "epoch": 0.09314495028780743, + "grad_norm": 2.582322870935106, + "learning_rate": 1.9791829229503054e-05, + "loss": 1.0845, "step": 890 }, { - "epoch": 0.59, - "learning_rate": 1.658443364857939e-05, - "loss": 1.2656, + "epoch": 0.09324960753532183, + "grad_norm": 2.6892222694722427, + "learning_rate": 1.9791140622208523e-05, + "loss": 1.111, "step": 891 }, { - "epoch": 0.59, - "learning_rate": 1.6576403912487026e-05, - "loss": 1.2248, + "epoch": 0.0933542647828362, + "grad_norm": 3.0266338857528505, + "learning_rate": 1.9790450889892082e-05, + "loss": 1.0868, "step": 892 }, { - "epoch": 0.59, - "learning_rate": 1.656836669775346e-05, - "loss": 1.4117, + "epoch": 0.0934589220303506, + "grad_norm": 2.966278924481458, + "learning_rate": 1.9789760032632993e-05, + "loss": 1.1863, "step": 893 }, { - "epoch": 0.59, - "learning_rate": 1.6560322013518542e-05, - "loss": 1.3967, + "epoch": 0.09356357927786499, + "grad_norm": 2.6158992441922773, + "learning_rate": 1.9789068050510627e-05, + "loss": 1.1321, "step": 894 }, { - "epoch": 0.59, - "learning_rate": 1.6552269868930646e-05, - "loss": 1.3535, + "epoch": 0.09366823652537938, + "grad_norm": 3.081967659922265, + "learning_rate": 1.9788374943604496e-05, + "loss": 0.9553, "step": 895 }, { - "epoch": 0.59, - "learning_rate": 1.6544210273146608e-05, - "loss": 1.3346, + "epoch": 0.09377289377289377, + "grad_norm": 2.527503447093225, + "learning_rate": 1.9787680711994244e-05, + "loss": 1.0302, "step": 896 }, { - "epoch": 0.59, - "learning_rate": 1.6536143235331743e-05, - "loss": 1.4138, + "epoch": 0.09387755102040816, + "grad_norm": 2.7162559652765466, + "learning_rate": 1.9786985355759633e-05, + "loss": 1.0795, "step": 897 }, { - "epoch": 0.59, - "learning_rate": 1.6528068764659838e-05, - "loss": 1.3445, + "epoch": 0.09398220826792256, + "grad_norm": 2.329247727727232, + "learning_rate": 1.9786288874980567e-05, + "loss": 0.8319, "step": 898 }, { - "epoch": 0.59, - "learning_rate": 1.651998687031312e-05, - "loss": 1.2632, + "epoch": 0.09408686551543695, + "grad_norm": 2.775539898357401, + "learning_rate": 1.978559126973707e-05, + "loss": 1.1444, "step": 899 }, { - "epoch": 0.59, - "learning_rate": 1.6511897561482263e-05, - "loss": 1.1623, + "epoch": 0.09419152276295134, + "grad_norm": 2.9580563525888635, + "learning_rate": 1.97848925401093e-05, + "loss": 1.0485, "step": 900 }, { - "epoch": 0.59, - "learning_rate": 1.6503800847366373e-05, - "loss": 1.265, + "epoch": 0.09429618001046572, + "grad_norm": 2.5695675385307553, + "learning_rate": 1.978419268617754e-05, + "loss": 0.9955, "step": 901 }, { - "epoch": 0.59, - "learning_rate": 1.6495696737172982e-05, - "loss": 1.2231, + "epoch": 0.09440083725798011, + "grad_norm": 2.2552487789593623, + "learning_rate": 1.9783491708022203e-05, + "loss": 0.8651, "step": 902 }, { - "epoch": 0.59, - "learning_rate": 1.6487585240118025e-05, - "loss": 1.3099, + "epoch": 0.0945054945054945, + "grad_norm": 2.9417126110660554, + "learning_rate": 1.978278960572384e-05, + "loss": 1.1695, "step": 903 }, { - "epoch": 0.6, - "learning_rate": 1.647946636542584e-05, - "loss": 1.3066, + "epoch": 0.0946101517530089, + "grad_norm": 2.183916665081056, + "learning_rate": 1.9782086379363116e-05, + "loss": 0.9073, "step": 904 }, { - "epoch": 0.6, - "learning_rate": 1.6471340122329158e-05, - "loss": 1.2451, + "epoch": 0.09471480900052329, + "grad_norm": 2.843970018173232, + "learning_rate": 1.978138202902084e-05, + "loss": 1.1337, "step": 905 }, { - "epoch": 0.6, - "learning_rate": 1.6463206520069083e-05, - "loss": 1.2643, + "epoch": 0.09481946624803768, + "grad_norm": 2.5391065676287794, + "learning_rate": 1.9780676554777937e-05, + "loss": 1.0872, "step": 906 }, { - "epoch": 0.6, - "learning_rate": 1.6455065567895098e-05, - "loss": 1.3528, + "epoch": 0.09492412349555207, + "grad_norm": 2.580130909019054, + "learning_rate": 1.9779969956715473e-05, + "loss": 0.9439, "step": 907 }, { - "epoch": 0.6, - "learning_rate": 1.6446917275065032e-05, - "loss": 1.3333, + "epoch": 0.09502878074306646, + "grad_norm": 2.8311688424167016, + "learning_rate": 1.9779262234914635e-05, + "loss": 1.1965, "step": 908 }, { - "epoch": 0.6, - "learning_rate": 1.643876165084507e-05, - "loss": 1.2977, + "epoch": 0.09513343799058084, + "grad_norm": 3.140721287835419, + "learning_rate": 1.977855338945674e-05, + "loss": 1.0228, "step": 909 }, { - "epoch": 0.6, - "learning_rate": 1.6430598704509723e-05, - "loss": 1.3413, + "epoch": 0.09523809523809523, + "grad_norm": 2.4322100869774976, + "learning_rate": 1.977784342042323e-05, + "loss": 1.1553, "step": 910 }, { - "epoch": 0.6, - "learning_rate": 1.642242844534185e-05, - "loss": 1.2886, + "epoch": 0.09534275248560962, + "grad_norm": 2.5964369564075196, + "learning_rate": 1.9777132327895702e-05, + "loss": 1.0501, "step": 911 }, { - "epoch": 0.6, - "learning_rate": 1.6414250882632607e-05, - "loss": 1.236, + "epoch": 0.09544740973312402, + "grad_norm": 2.6073741523793186, + "learning_rate": 1.9776420111955842e-05, + "loss": 1.0065, "step": 912 }, { - "epoch": 0.6, - "learning_rate": 1.6406066025681462e-05, - "loss": 1.5691, + "epoch": 0.09555206698063841, + "grad_norm": 2.6220467436425863, + "learning_rate": 1.9775706772685495e-05, + "loss": 1.1565, "step": 913 }, { - "epoch": 0.6, - "learning_rate": 1.6397873883796183e-05, - "loss": 1.2697, + "epoch": 0.0956567242281528, + "grad_norm": 2.923213677461564, + "learning_rate": 1.977499231016662e-05, + "loss": 1.0593, "step": 914 }, { - "epoch": 0.6, - "learning_rate": 1.6389674466292812e-05, - "loss": 1.3026, + "epoch": 0.09576138147566719, + "grad_norm": 2.6380776247624027, + "learning_rate": 1.9774276724481316e-05, + "loss": 1.0011, "step": 915 }, { - "epoch": 0.6, - "learning_rate": 1.638146778249567e-05, - "loss": 1.1728, + "epoch": 0.09586603872318158, + "grad_norm": 3.8624650361902617, + "learning_rate": 1.9773560015711798e-05, + "loss": 1.0226, "step": 916 }, { - "epoch": 0.6, - "learning_rate": 1.637325384173734e-05, - "loss": 1.3402, + "epoch": 0.09597069597069598, + "grad_norm": 2.818754439571562, + "learning_rate": 1.977284218394042e-05, + "loss": 1.1649, "step": 917 }, { - "epoch": 0.6, - "learning_rate": 1.6365032653358665e-05, - "loss": 1.2947, + "epoch": 0.09607535321821035, + "grad_norm": 2.4474931176286128, + "learning_rate": 1.977212322924967e-05, + "loss": 1.0857, "step": 918 }, { - "epoch": 0.61, - "learning_rate": 1.635680422670872e-05, - "loss": 1.3534, + "epoch": 0.09618001046572475, + "grad_norm": 2.8206323427096165, + "learning_rate": 1.9771403151722147e-05, + "loss": 1.0433, "step": 919 }, { - "epoch": 0.61, - "learning_rate": 1.6348568571144816e-05, - "loss": 1.3345, + "epoch": 0.09628466771323914, + "grad_norm": 3.213099908339713, + "learning_rate": 1.9770681951440595e-05, + "loss": 1.1182, "step": 920 }, { - "epoch": 0.61, - "learning_rate": 1.634032569603248e-05, - "loss": 1.3136, + "epoch": 0.09638932496075353, + "grad_norm": 2.9662481224716974, + "learning_rate": 1.9769959628487876e-05, + "loss": 0.9635, "step": 921 }, { - "epoch": 0.61, - "learning_rate": 1.633207561074546e-05, - "loss": 1.2049, + "epoch": 0.09649398220826792, + "grad_norm": 4.016583647471507, + "learning_rate": 1.9769236182946993e-05, + "loss": 0.9521, "step": 922 }, { - "epoch": 0.61, - "learning_rate": 1.6323818324665693e-05, - "loss": 1.2287, + "epoch": 0.09659863945578231, + "grad_norm": 3.508939746867643, + "learning_rate": 1.9768511614901067e-05, + "loss": 1.2703, "step": 923 }, { - "epoch": 0.61, - "learning_rate": 1.63155538471833e-05, - "loss": 1.457, + "epoch": 0.0967032967032967, + "grad_norm": 2.953270773462452, + "learning_rate": 1.9767785924433354e-05, + "loss": 1.124, "step": 924 }, { - "epoch": 0.61, - "learning_rate": 1.6307282187696602e-05, - "loss": 1.236, + "epoch": 0.0968079539508111, + "grad_norm": 2.537786165701537, + "learning_rate": 1.9767059111627237e-05, + "loss": 1.0141, "step": 925 }, { - "epoch": 0.61, - "learning_rate": 1.629900335561206e-05, - "loss": 1.2837, + "epoch": 0.09691261119832549, + "grad_norm": 3.2226759688842415, + "learning_rate": 1.9766331176566226e-05, + "loss": 1.0785, "step": 926 }, { - "epoch": 0.61, - "learning_rate": 1.6290717360344312e-05, - "loss": 1.3094, + "epoch": 0.09701726844583987, + "grad_norm": 2.8316546956911512, + "learning_rate": 1.9765602119333967e-05, + "loss": 1.084, "step": 927 }, { - "epoch": 0.61, - "learning_rate": 1.6282424211316134e-05, - "loss": 1.2838, + "epoch": 0.09712192569335426, + "grad_norm": 2.8227008106460985, + "learning_rate": 1.9764871940014226e-05, + "loss": 1.0238, "step": 928 }, { - "epoch": 0.61, - "learning_rate": 1.6274123917958438e-05, - "loss": 1.2095, + "epoch": 0.09722658294086865, + "grad_norm": 2.6956185802209323, + "learning_rate": 1.97641406386909e-05, + "loss": 1.1413, "step": 929 }, { - "epoch": 0.61, - "learning_rate": 1.626581648971026e-05, - "loss": 1.2071, + "epoch": 0.09733124018838304, + "grad_norm": 3.011507745775884, + "learning_rate": 1.976340821544803e-05, + "loss": 1.0178, "step": 930 }, { - "epoch": 0.61, - "learning_rate": 1.625750193601875e-05, - "loss": 1.2638, + "epoch": 0.09743589743589744, + "grad_norm": 2.7090414787190547, + "learning_rate": 1.9762674670369757e-05, + "loss": 1.1175, "step": 931 }, { - "epoch": 0.61, - "learning_rate": 1.624918026633916e-05, - "loss": 1.2514, + "epoch": 0.09754055468341183, + "grad_norm": 2.9878646220501337, + "learning_rate": 1.9761940003540375e-05, + "loss": 1.0434, "step": 932 }, { - "epoch": 0.61, - "learning_rate": 1.6240851490134834e-05, - "loss": 1.1429, + "epoch": 0.09764521193092622, + "grad_norm": 2.9207377165620563, + "learning_rate": 1.9761204215044295e-05, + "loss": 0.9525, "step": 933 }, { - "epoch": 0.61, - "learning_rate": 1.6232515616877204e-05, - "loss": 1.3468, + "epoch": 0.09774986917844061, + "grad_norm": 2.7502875366492985, + "learning_rate": 1.9760467304966066e-05, + "loss": 0.9973, "step": 934 }, { - "epoch": 0.62, - "learning_rate": 1.6224172656045767e-05, - "loss": 1.3237, + "epoch": 0.09785452642595499, + "grad_norm": 2.866734557712832, + "learning_rate": 1.9759729273390356e-05, + "loss": 1.0749, "step": 935 }, { - "epoch": 0.62, - "learning_rate": 1.6215822617128075e-05, - "loss": 1.2862, + "epoch": 0.09795918367346938, + "grad_norm": 2.195362172452806, + "learning_rate": 1.9758990120401967e-05, + "loss": 0.7863, "step": 936 }, { - "epoch": 0.62, - "learning_rate": 1.620746550961974e-05, - "loss": 1.2863, + "epoch": 0.09806384092098377, + "grad_norm": 2.470173351806748, + "learning_rate": 1.975824984608583e-05, + "loss": 1.0308, "step": 937 }, { - "epoch": 0.62, - "learning_rate": 1.6199101343024405e-05, - "loss": 1.4152, + "epoch": 0.09816849816849817, + "grad_norm": 1.9811710309835036, + "learning_rate": 1.9757508450527006e-05, + "loss": 0.8232, "step": 938 }, { - "epoch": 0.62, - "learning_rate": 1.619073012685374e-05, - "loss": 1.4615, + "epoch": 0.09827315541601256, + "grad_norm": 2.608135387091775, + "learning_rate": 1.9756765933810674e-05, + "loss": 1.031, "step": 939 }, { - "epoch": 0.62, - "learning_rate": 1.618235187062744e-05, - "loss": 1.2388, + "epoch": 0.09837781266352695, + "grad_norm": 2.57500776512692, + "learning_rate": 1.9756022296022164e-05, + "loss": 1.1257, "step": 940 }, { - "epoch": 0.62, - "learning_rate": 1.6173966583873195e-05, - "loss": 1.2309, + "epoch": 0.09848246991104134, + "grad_norm": 2.976812024734902, + "learning_rate": 1.9755277537246914e-05, + "loss": 1.1147, "step": 941 }, { - "epoch": 0.62, - "learning_rate": 1.6165574276126702e-05, - "loss": 1.1803, + "epoch": 0.09858712715855573, + "grad_norm": 2.6608551364060307, + "learning_rate": 1.9754531657570495e-05, + "loss": 1.1327, "step": 942 }, { - "epoch": 0.62, - "learning_rate": 1.6157174956931627e-05, - "loss": 1.3408, + "epoch": 0.09869178440607013, + "grad_norm": 2.564080580950772, + "learning_rate": 1.9753784657078613e-05, + "loss": 1.0696, "step": 943 }, { - "epoch": 0.62, - "learning_rate": 1.6148768635839623e-05, - "loss": 1.3339, + "epoch": 0.0987964416535845, + "grad_norm": 2.3432953283718327, + "learning_rate": 1.9753036535857106e-05, + "loss": 1.0246, "step": 944 }, { - "epoch": 0.62, - "learning_rate": 1.6140355322410304e-05, - "loss": 1.3751, + "epoch": 0.0989010989010989, + "grad_norm": 2.9307230460788904, + "learning_rate": 1.9752287293991927e-05, + "loss": 1.0714, "step": 945 }, { - "epoch": 0.62, - "learning_rate": 1.6131935026211224e-05, - "loss": 1.3877, + "epoch": 0.09900575614861329, + "grad_norm": 3.2102158887761867, + "learning_rate": 1.9751536931569167e-05, + "loss": 1.198, "step": 946 }, { - "epoch": 0.62, - "learning_rate": 1.612350775681789e-05, - "loss": 1.207, + "epoch": 0.09911041339612768, + "grad_norm": 2.8495315326753645, + "learning_rate": 1.975078544867505e-05, + "loss": 1.0824, "step": 947 }, { - "epoch": 0.62, - "learning_rate": 1.611507352381373e-05, - "loss": 1.4494, + "epoch": 0.09921507064364207, + "grad_norm": 2.682593584386815, + "learning_rate": 1.9750032845395914e-05, + "loss": 1.0324, "step": 948 }, { - "epoch": 0.62, - "learning_rate": 1.6106632336790105e-05, - "loss": 1.3727, + "epoch": 0.09931972789115646, + "grad_norm": 2.8693006339538116, + "learning_rate": 1.9749279121818235e-05, + "loss": 1.1512, "step": 949 }, { - "epoch": 0.63, - "learning_rate": 1.609818420534627e-05, - "loss": 1.3039, + "epoch": 0.09942438513867086, + "grad_norm": 2.7517932371635774, + "learning_rate": 1.974852427802863e-05, + "loss": 0.9674, "step": 950 }, { - "epoch": 0.63, - "learning_rate": 1.608972913908938e-05, - "loss": 1.1756, + "epoch": 0.09952904238618525, + "grad_norm": 2.734800109801035, + "learning_rate": 1.9747768314113817e-05, + "loss": 1.0879, "step": 951 }, { - "epoch": 0.63, - "learning_rate": 1.608126714763448e-05, - "loss": 1.2914, + "epoch": 0.09963369963369964, + "grad_norm": 2.7033090734169893, + "learning_rate": 1.9747011230160664e-05, + "loss": 1.1549, "step": 952 }, { - "epoch": 0.63, - "learning_rate": 1.6072798240604487e-05, - "loss": 1.3273, + "epoch": 0.09973835688121402, + "grad_norm": 2.6253103443822683, + "learning_rate": 1.9746253026256164e-05, + "loss": 1.0787, "step": 953 }, { - "epoch": 0.63, - "learning_rate": 1.6064322427630186e-05, - "loss": 1.322, + "epoch": 0.09984301412872841, + "grad_norm": 3.125722150105614, + "learning_rate": 1.974549370248743e-05, + "loss": 1.1259, "step": 954 }, { - "epoch": 0.63, - "learning_rate": 1.6055839718350207e-05, - "loss": 1.1453, + "epoch": 0.0999476713762428, + "grad_norm": 2.7448374047244326, + "learning_rate": 1.9744733258941717e-05, + "loss": 0.9836, "step": 955 }, { - "epoch": 0.63, - "learning_rate": 1.6047350122411038e-05, - "loss": 1.3025, + "epoch": 0.1000523286237572, + "grad_norm": 2.955943924297256, + "learning_rate": 1.9743971695706398e-05, + "loss": 1.1255, "step": 956 }, { - "epoch": 0.63, - "learning_rate": 1.6038853649466983e-05, - "loss": 1.1693, + "epoch": 0.10015698587127159, + "grad_norm": 2.7395719342747897, + "learning_rate": 1.9743209012868977e-05, + "loss": 1.042, "step": 957 }, { - "epoch": 0.63, - "learning_rate": 1.6030350309180173e-05, - "loss": 1.2767, + "epoch": 0.10026164311878598, + "grad_norm": 2.5186535770163103, + "learning_rate": 1.974244521051709e-05, + "loss": 0.7976, "step": 958 }, { - "epoch": 0.63, - "learning_rate": 1.6021840111220548e-05, - "loss": 1.4647, + "epoch": 0.10036630036630037, + "grad_norm": 2.499288564217357, + "learning_rate": 1.9741680288738495e-05, + "loss": 1.0014, "step": 959 }, { - "epoch": 0.63, - "learning_rate": 1.6013323065265848e-05, - "loss": 1.2958, + "epoch": 0.10047095761381476, + "grad_norm": 2.8901978870749008, + "learning_rate": 1.9740914247621088e-05, + "loss": 1.074, "step": 960 }, { - "epoch": 0.63, - "learning_rate": 1.60047991810016e-05, - "loss": 1.3274, + "epoch": 0.10057561486132914, + "grad_norm": 2.624567677834044, + "learning_rate": 1.9740147087252887e-05, + "loss": 1.0408, "step": 961 }, { - "epoch": 0.63, - "learning_rate": 1.59962684681211e-05, - "loss": 1.325, + "epoch": 0.10068027210884353, + "grad_norm": 2.1866362133414827, + "learning_rate": 1.973937880772204e-05, + "loss": 1.0002, "step": 962 }, { - "epoch": 0.63, - "learning_rate": 1.5987730936325424e-05, - "loss": 1.3704, + "epoch": 0.10078492935635792, + "grad_norm": 2.3411382143212793, + "learning_rate": 1.9738609409116824e-05, + "loss": 0.8504, "step": 963 }, { - "epoch": 0.63, - "learning_rate": 1.597918659532339e-05, - "loss": 1.3307, + "epoch": 0.10088958660387232, + "grad_norm": 2.9089697665795864, + "learning_rate": 1.9737838891525647e-05, + "loss": 1.0074, "step": 964 }, { - "epoch": 0.64, - "learning_rate": 1.5970635454831565e-05, - "loss": 1.2188, + "epoch": 0.10099424385138671, + "grad_norm": 2.532467662978773, + "learning_rate": 1.973706725503704e-05, + "loss": 1.0635, "step": 965 }, { - "epoch": 0.64, - "learning_rate": 1.5962077524574248e-05, - "loss": 1.2714, + "epoch": 0.1010989010989011, + "grad_norm": 2.6762533062616125, + "learning_rate": 1.9736294499739667e-05, + "loss": 0.9866, "step": 966 }, { - "epoch": 0.64, - "learning_rate": 1.595351281428345e-05, - "loss": 1.2796, + "epoch": 0.10120355834641549, + "grad_norm": 3.0420458033024658, + "learning_rate": 1.9735520625722312e-05, + "loss": 1.0371, "step": 967 }, { - "epoch": 0.64, - "learning_rate": 1.5944941333698912e-05, - "loss": 1.2497, + "epoch": 0.10130821559392988, + "grad_norm": 2.5023937580187985, + "learning_rate": 1.9734745633073908e-05, + "loss": 1.1425, "step": 968 }, { - "epoch": 0.64, - "learning_rate": 1.5936363092568058e-05, - "loss": 1.244, + "epoch": 0.10141287284144428, + "grad_norm": 2.638941018679516, + "learning_rate": 1.9733969521883494e-05, + "loss": 1.0191, "step": 969 }, { - "epoch": 0.64, - "learning_rate": 1.5927778100646007e-05, - "loss": 1.212, + "epoch": 0.10151753008895865, + "grad_norm": 3.0955666704793576, + "learning_rate": 1.9733192292240252e-05, + "loss": 1.013, "step": 970 }, { - "epoch": 0.64, - "learning_rate": 1.5919186367695546e-05, - "loss": 1.3979, + "epoch": 0.10162218733647305, + "grad_norm": 2.3039556868285223, + "learning_rate": 1.973241394423348e-05, + "loss": 0.8819, "step": 971 }, { - "epoch": 0.64, - "learning_rate": 1.5910587903487142e-05, - "loss": 1.3326, + "epoch": 0.10172684458398744, + "grad_norm": 2.808971945311129, + "learning_rate": 1.973163447795262e-05, + "loss": 1.2111, "step": 972 }, { - "epoch": 0.64, - "learning_rate": 1.5901982717798903e-05, - "loss": 1.3511, + "epoch": 0.10183150183150183, + "grad_norm": 2.1370050360048927, + "learning_rate": 1.973085389348723e-05, + "loss": 0.7627, "step": 973 }, { - "epoch": 0.64, - "learning_rate": 1.5893370820416594e-05, - "loss": 1.3456, + "epoch": 0.10193615907901622, + "grad_norm": 2.597731049761627, + "learning_rate": 1.9730072190926996e-05, + "loss": 1.1669, "step": 974 }, { - "epoch": 0.64, - "learning_rate": 1.5884752221133597e-05, - "loss": 1.3462, + "epoch": 0.10204081632653061, + "grad_norm": 3.395661508370253, + "learning_rate": 1.9729289370361745e-05, + "loss": 0.9587, "step": 975 }, { - "epoch": 0.64, - "learning_rate": 1.587612692975093e-05, - "loss": 1.2286, + "epoch": 0.102145473574045, + "grad_norm": 3.0895550900923294, + "learning_rate": 1.9728505431881425e-05, + "loss": 0.9229, "step": 976 }, { - "epoch": 0.64, - "learning_rate": 1.5867494956077215e-05, - "loss": 1.3136, + "epoch": 0.1022501308215594, + "grad_norm": 2.458084346576055, + "learning_rate": 1.972772037557611e-05, + "loss": 1.0632, "step": 977 }, { - "epoch": 0.64, - "learning_rate": 1.585885630992867e-05, - "loss": 1.3715, + "epoch": 0.10235478806907378, + "grad_norm": 3.0516732311665313, + "learning_rate": 1.9726934201535998e-05, + "loss": 1.1159, "step": 978 }, { - "epoch": 0.64, - "learning_rate": 1.58502110011291e-05, - "loss": 1.2604, + "epoch": 0.10245944531658817, + "grad_norm": 2.354235000879386, + "learning_rate": 1.972614690985143e-05, + "loss": 0.9929, "step": 979 }, { - "epoch": 0.65, - "learning_rate": 1.5841559039509898e-05, - "loss": 1.1808, + "epoch": 0.10256410256410256, + "grad_norm": 3.2111492690142995, + "learning_rate": 1.9725358500612865e-05, + "loss": 0.9502, "step": 980 }, { - "epoch": 0.65, - "learning_rate": 1.5832900434910014e-05, - "loss": 1.2897, + "epoch": 0.10266875981161695, + "grad_norm": 3.0248285962435895, + "learning_rate": 1.9724568973910893e-05, + "loss": 1.0119, "step": 981 }, { - "epoch": 0.65, - "learning_rate": 1.5824235197175954e-05, - "loss": 1.2339, + "epoch": 0.10277341705913134, + "grad_norm": 2.770623468886491, + "learning_rate": 1.9723778329836235e-05, + "loss": 1.0712, "step": 982 }, { - "epoch": 0.65, - "learning_rate": 1.5815563336161756e-05, - "loss": 1.2399, + "epoch": 0.10287807430664574, + "grad_norm": 2.5692351103675364, + "learning_rate": 1.9722986568479734e-05, + "loss": 1.0985, "step": 983 }, { - "epoch": 0.65, - "learning_rate": 1.580688486172902e-05, - "loss": 1.3925, + "epoch": 0.10298273155416013, + "grad_norm": 2.8212239759961104, + "learning_rate": 1.9722193689932364e-05, + "loss": 1.0989, "step": 984 }, { - "epoch": 0.65, - "learning_rate": 1.5798199783746828e-05, - "loss": 1.3116, + "epoch": 0.10308738880167452, + "grad_norm": 2.7660547575806187, + "learning_rate": 1.9721399694285227e-05, + "loss": 1.0853, "step": 985 }, { - "epoch": 0.65, - "learning_rate": 1.5789508112091804e-05, - "loss": 1.4549, + "epoch": 0.10319204604918891, + "grad_norm": 2.7676555235602995, + "learning_rate": 1.9720604581629562e-05, + "loss": 1.1553, "step": 986 }, { - "epoch": 0.65, - "learning_rate": 1.578080985664805e-05, - "loss": 1.2233, + "epoch": 0.10329670329670329, + "grad_norm": 2.487671204295441, + "learning_rate": 1.9719808352056728e-05, + "loss": 0.9583, "step": 987 }, { - "epoch": 0.65, - "learning_rate": 1.5772105027307168e-05, - "loss": 1.1387, + "epoch": 0.10340136054421768, + "grad_norm": 2.637848541738491, + "learning_rate": 1.9719011005658203e-05, + "loss": 1.1547, "step": 988 }, { - "epoch": 0.65, - "learning_rate": 1.5763393633968223e-05, - "loss": 1.2636, + "epoch": 0.10350601779173207, + "grad_norm": 2.6148860251102524, + "learning_rate": 1.9718212542525616e-05, + "loss": 1.0602, "step": 989 }, { - "epoch": 0.65, - "learning_rate": 1.5754675686537756e-05, - "loss": 1.155, + "epoch": 0.10361067503924647, + "grad_norm": 2.512048499705755, + "learning_rate": 1.9717412962750704e-05, + "loss": 1.1129, "step": 990 }, { - "epoch": 0.65, - "learning_rate": 1.5745951194929756e-05, - "loss": 1.098, + "epoch": 0.10371533228676086, + "grad_norm": 2.9398186150069945, + "learning_rate": 1.9716612266425343e-05, + "loss": 1.0064, "step": 991 }, { - "epoch": 0.65, - "learning_rate": 1.5737220169065656e-05, - "loss": 1.1932, + "epoch": 0.10381998953427525, + "grad_norm": 2.5999608156502245, + "learning_rate": 1.971581045364154e-05, + "loss": 1.0484, "step": 992 }, { - "epoch": 0.65, - "learning_rate": 1.5728482618874314e-05, - "loss": 1.1976, + "epoch": 0.10392464678178964, + "grad_norm": 2.6014924463154743, + "learning_rate": 1.9715007524491413e-05, + "loss": 0.9897, "step": 993 }, { - "epoch": 0.65, - "learning_rate": 1.5719738554292017e-05, - "loss": 1.2005, + "epoch": 0.10402930402930403, + "grad_norm": 2.361645305711183, + "learning_rate": 1.9714203479067232e-05, + "loss": 1.1281, "step": 994 }, { - "epoch": 0.66, - "learning_rate": 1.571098798526245e-05, - "loss": 1.5259, + "epoch": 0.10413396127681843, + "grad_norm": 2.5112783788806685, + "learning_rate": 1.9713398317461377e-05, + "loss": 0.8673, "step": 995 }, { - "epoch": 0.66, - "learning_rate": 1.5702230921736705e-05, - "loss": 1.2184, + "epoch": 0.1042386185243328, + "grad_norm": 2.7601681785587373, + "learning_rate": 1.9712592039766364e-05, + "loss": 1.1427, "step": 996 }, { - "epoch": 0.66, - "learning_rate": 1.569346737367325e-05, - "loss": 1.3392, + "epoch": 0.1043432757718472, + "grad_norm": 2.9435423523320696, + "learning_rate": 1.9711784646074837e-05, + "loss": 1.0373, "step": 997 }, { - "epoch": 0.66, - "learning_rate": 1.5684697351037935e-05, - "loss": 1.3473, + "epoch": 0.10444793301936159, + "grad_norm": 2.513526513093683, + "learning_rate": 1.9710976136479565e-05, + "loss": 1.0433, "step": 998 }, { - "epoch": 0.66, - "learning_rate": 1.5675920863803967e-05, - "loss": 1.2805, + "epoch": 0.10455259026687598, + "grad_norm": 2.8188327082080575, + "learning_rate": 1.971016651107345e-05, + "loss": 1.0745, "step": 999 }, { - "epoch": 0.66, - "learning_rate": 1.566713792195191e-05, - "loss": 1.1129, + "epoch": 0.10465724751439037, + "grad_norm": 2.713657193992208, + "learning_rate": 1.9709355769949515e-05, + "loss": 1.1284, "step": 1000 }, { - "epoch": 0.66, - "learning_rate": 1.565834853546966e-05, - "loss": 1.2706, + "epoch": 0.10476190476190476, + "grad_norm": 3.0415166367388573, + "learning_rate": 1.9708543913200925e-05, + "loss": 1.0857, "step": 1001 }, { - "epoch": 0.66, - "learning_rate": 1.5649552714352448e-05, - "loss": 1.323, + "epoch": 0.10486656200941916, + "grad_norm": 3.0280839509415074, + "learning_rate": 1.9707730940920955e-05, + "loss": 1.1141, "step": 1002 }, { - "epoch": 0.66, - "learning_rate": 1.5640750468602822e-05, - "loss": 1.3073, + "epoch": 0.10497121925693355, + "grad_norm": 2.94662887338583, + "learning_rate": 1.970691685320302e-05, + "loss": 1.0994, "step": 1003 }, { - "epoch": 0.66, - "learning_rate": 1.563194180823064e-05, - "loss": 1.2237, + "epoch": 0.10507587650444793, + "grad_norm": 2.610110149296357, + "learning_rate": 1.9706101650140657e-05, + "loss": 1.1039, "step": 1004 }, { - "epoch": 0.66, - "learning_rate": 1.562312674325304e-05, - "loss": 1.3814, + "epoch": 0.10518053375196232, + "grad_norm": 3.192863824515813, + "learning_rate": 1.9705285331827544e-05, + "loss": 1.0594, "step": 1005 }, { - "epoch": 0.66, - "learning_rate": 1.5614305283694462e-05, - "loss": 1.3482, + "epoch": 0.10528519099947671, + "grad_norm": 3.0083558422084393, + "learning_rate": 1.9704467898357473e-05, + "loss": 0.9451, "step": 1006 }, { - "epoch": 0.66, - "learning_rate": 1.56054774395866e-05, - "loss": 1.2636, + "epoch": 0.1053898482469911, + "grad_norm": 2.692443979020099, + "learning_rate": 1.970364934982436e-05, + "loss": 1.0166, "step": 1007 }, { - "epoch": 0.66, - "learning_rate": 1.559664322096843e-05, - "loss": 1.2905, + "epoch": 0.1054945054945055, + "grad_norm": 2.473816517340895, + "learning_rate": 1.9702829686322272e-05, + "loss": 1.0046, "step": 1008 }, { - "epoch": 0.66, - "learning_rate": 1.5587802637886156e-05, - "loss": 1.2747, + "epoch": 0.10559916274201989, + "grad_norm": 2.5772774995170136, + "learning_rate": 1.970200890794538e-05, + "loss": 1.0931, "step": 1009 }, { - "epoch": 0.66, - "learning_rate": 1.557895570039323e-05, - "loss": 1.2749, + "epoch": 0.10570381998953428, + "grad_norm": 2.4344331305938423, + "learning_rate": 1.9701187014787994e-05, + "loss": 0.9071, "step": 1010 }, { - "epoch": 0.67, - "learning_rate": 1.5570102418550327e-05, - "loss": 1.2037, + "epoch": 0.10580847723704867, + "grad_norm": 2.49913692348178, + "learning_rate": 1.970036400694456e-05, + "loss": 1.0385, "step": 1011 }, { - "epoch": 0.67, - "learning_rate": 1.5561242802425337e-05, - "loss": 1.3082, + "epoch": 0.10591313448456306, + "grad_norm": 2.7676304166369197, + "learning_rate": 1.9699539884509632e-05, + "loss": 0.9947, "step": 1012 }, { - "epoch": 0.67, - "learning_rate": 1.5552376862093357e-05, - "loss": 1.1666, + "epoch": 0.10601779173207744, + "grad_norm": 2.653669677176504, + "learning_rate": 1.969871464757791e-05, + "loss": 1.0598, "step": 1013 }, { - "epoch": 0.67, - "learning_rate": 1.554350460763667e-05, - "loss": 1.1587, + "epoch": 0.10612244897959183, + "grad_norm": 3.1238605727049733, + "learning_rate": 1.9697888296244214e-05, + "loss": 1.1389, "step": 1014 }, { - "epoch": 0.67, - "learning_rate": 1.5534626049144753e-05, - "loss": 1.3303, + "epoch": 0.10622710622710622, + "grad_norm": 2.9772018563723672, + "learning_rate": 1.9697060830603495e-05, + "loss": 1.0753, "step": 1015 }, { - "epoch": 0.67, - "learning_rate": 1.552574119671423e-05, - "loss": 1.254, + "epoch": 0.10633176347462062, + "grad_norm": 2.668071509791539, + "learning_rate": 1.9696232250750823e-05, + "loss": 1.0608, "step": 1016 }, { - "epoch": 0.67, - "learning_rate": 1.55168500604489e-05, - "loss": 1.2916, + "epoch": 0.10643642072213501, + "grad_norm": 2.589387863961408, + "learning_rate": 1.9695402556781414e-05, + "loss": 1.0848, "step": 1017 }, { - "epoch": 0.67, - "learning_rate": 1.5507952650459697e-05, - "loss": 1.2834, + "epoch": 0.1065410779696494, + "grad_norm": 2.575997267076478, + "learning_rate": 1.9694571748790593e-05, + "loss": 1.0157, "step": 1018 }, { - "epoch": 0.67, - "learning_rate": 1.54990489768647e-05, - "loss": 1.2852, + "epoch": 0.10664573521716379, + "grad_norm": 2.4804280288072222, + "learning_rate": 1.9693739826873828e-05, + "loss": 0.9616, "step": 1019 }, { - "epoch": 0.67, - "learning_rate": 1.5490139049789097e-05, - "loss": 1.37, + "epoch": 0.10675039246467818, + "grad_norm": 2.8490573537756854, + "learning_rate": 1.9692906791126708e-05, + "loss": 1.0175, "step": 1020 }, { - "epoch": 0.67, - "learning_rate": 1.5481222879365205e-05, - "loss": 1.2925, + "epoch": 0.10685504971219256, + "grad_norm": 2.7446287985468367, + "learning_rate": 1.9692072641644945e-05, + "loss": 1.0971, "step": 1021 }, { - "epoch": 0.67, - "learning_rate": 1.5472300475732427e-05, - "loss": 1.3541, + "epoch": 0.10695970695970695, + "grad_norm": 3.0448502861545292, + "learning_rate": 1.9691237378524387e-05, + "loss": 1.014, "step": 1022 }, { - "epoch": 0.67, - "learning_rate": 1.546337184903726e-05, - "loss": 1.3664, + "epoch": 0.10706436420722135, + "grad_norm": 2.6319835236235622, + "learning_rate": 1.9690401001861006e-05, + "loss": 1.0825, "step": 1023 }, { - "epoch": 0.67, - "learning_rate": 1.5454437009433277e-05, - "loss": 1.2974, + "epoch": 0.10716902145473574, + "grad_norm": 2.4456143448333547, + "learning_rate": 1.9689563511750907e-05, + "loss": 1.1329, "step": 1024 }, { - "epoch": 0.67, - "learning_rate": 1.5445495967081114e-05, - "loss": 1.3451, + "epoch": 0.10727367870225013, + "grad_norm": 3.059839362265981, + "learning_rate": 1.9688724908290318e-05, + "loss": 1.1094, "step": 1025 }, { - "epoch": 0.68, - "learning_rate": 1.543654873214846e-05, - "loss": 1.4349, + "epoch": 0.10737833594976452, + "grad_norm": 2.4535767216133797, + "learning_rate": 1.9687885191575597e-05, + "loss": 0.8073, "step": 1026 }, { - "epoch": 0.68, - "learning_rate": 1.542759531481006e-05, - "loss": 1.2617, + "epoch": 0.10748299319727891, + "grad_norm": 2.2330132410114656, + "learning_rate": 1.9687044361703228e-05, + "loss": 0.9921, "step": 1027 }, { - "epoch": 0.68, - "learning_rate": 1.5418635725247666e-05, - "loss": 1.2663, + "epoch": 0.1075876504447933, + "grad_norm": 2.6139103762274876, + "learning_rate": 1.9686202418769823e-05, + "loss": 1.0012, "step": 1028 }, { - "epoch": 0.68, - "learning_rate": 1.540966997365007e-05, - "loss": 1.2458, + "epoch": 0.1076923076923077, + "grad_norm": 2.7091370591165576, + "learning_rate": 1.9685359362872124e-05, + "loss": 1.2318, "step": 1029 }, { - "epoch": 0.68, - "learning_rate": 1.540069807021306e-05, - "loss": 1.3168, + "epoch": 0.10779696493982208, + "grad_norm": 2.414700209712605, + "learning_rate": 1.9684515194107004e-05, + "loss": 1.0775, "step": 1030 }, { - "epoch": 0.68, - "learning_rate": 1.539172002513942e-05, - "loss": 1.286, + "epoch": 0.10790162218733647, + "grad_norm": 2.6192205052365236, + "learning_rate": 1.9683669912571452e-05, + "loss": 1.0712, "step": 1031 }, { - "epoch": 0.68, - "learning_rate": 1.538273584863893e-05, - "loss": 1.233, + "epoch": 0.10800627943485086, + "grad_norm": 2.737341412891626, + "learning_rate": 1.96828235183626e-05, + "loss": 1.0652, "step": 1032 }, { - "epoch": 0.68, - "learning_rate": 1.537374555092832e-05, - "loss": 1.3463, + "epoch": 0.10811093668236525, + "grad_norm": 2.465865978038424, + "learning_rate": 1.968197601157769e-05, + "loss": 1.124, "step": 1033 }, { - "epoch": 0.68, - "learning_rate": 1.5364749142231302e-05, - "loss": 1.3144, + "epoch": 0.10821559392987964, + "grad_norm": 2.4388907774406574, + "learning_rate": 1.968112739231411e-05, + "loss": 1.0664, "step": 1034 }, { - "epoch": 0.68, - "learning_rate": 1.5355746632778533e-05, - "loss": 1.2864, + "epoch": 0.10832025117739404, + "grad_norm": 2.412351278224597, + "learning_rate": 1.968027766066937e-05, + "loss": 1.0165, "step": 1035 }, { - "epoch": 0.68, - "learning_rate": 1.5346738032807603e-05, - "loss": 1.352, + "epoch": 0.10842490842490843, + "grad_norm": 3.061388366092205, + "learning_rate": 1.9679426816741102e-05, + "loss": 1.2182, "step": 1036 }, { - "epoch": 0.68, - "learning_rate": 1.5337723352563028e-05, - "loss": 1.2151, + "epoch": 0.10852956567242282, + "grad_norm": 2.674591671542277, + "learning_rate": 1.967857486062707e-05, + "loss": 0.8857, "step": 1037 }, { - "epoch": 0.68, - "learning_rate": 1.5328702602296238e-05, - "loss": 1.2613, + "epoch": 0.10863422291993721, + "grad_norm": 2.6778191321571, + "learning_rate": 1.9677721792425167e-05, + "loss": 1.1491, "step": 1038 }, { - "epoch": 0.68, - "learning_rate": 1.531967579226557e-05, - "loss": 1.1542, + "epoch": 0.10873888016745159, + "grad_norm": 2.591446579098128, + "learning_rate": 1.967686761223341e-05, + "loss": 0.9744, "step": 1039 }, { - "epoch": 0.68, - "learning_rate": 1.5310642932736253e-05, - "loss": 1.2407, + "epoch": 0.10884353741496598, + "grad_norm": 2.7917814478780314, + "learning_rate": 1.967601232014995e-05, + "loss": 0.9032, "step": 1040 }, { - "epoch": 0.69, - "learning_rate": 1.5301604033980387e-05, - "loss": 1.2442, + "epoch": 0.10894819466248037, + "grad_norm": 2.2711122801371517, + "learning_rate": 1.9675155916273057e-05, + "loss": 1.0988, "step": 1041 }, { - "epoch": 0.69, - "learning_rate": 1.529255910627695e-05, - "loss": 1.203, + "epoch": 0.10905285190999477, + "grad_norm": 2.6881776652627645, + "learning_rate": 1.9674298400701137e-05, + "loss": 0.997, "step": 1042 }, { - "epoch": 0.69, - "learning_rate": 1.5283508159911776e-05, - "loss": 1.3566, + "epoch": 0.10915750915750916, + "grad_norm": 2.4975909079275502, + "learning_rate": 1.9673439773532714e-05, + "loss": 0.9299, "step": 1043 }, { - "epoch": 0.69, - "learning_rate": 1.5274451205177526e-05, - "loss": 1.2837, + "epoch": 0.10926216640502355, + "grad_norm": 2.710402657049504, + "learning_rate": 1.9672580034866455e-05, + "loss": 1.0081, "step": 1044 }, { - "epoch": 0.69, - "learning_rate": 1.5265388252373712e-05, - "loss": 1.3364, + "epoch": 0.10936682365253794, + "grad_norm": 2.643549698589489, + "learning_rate": 1.9671719184801144e-05, + "loss": 1.0695, "step": 1045 }, { - "epoch": 0.69, - "learning_rate": 1.525631931180667e-05, - "loss": 1.2052, + "epoch": 0.10947148090005233, + "grad_norm": 2.7508161610052246, + "learning_rate": 1.9670857223435688e-05, + "loss": 1.112, "step": 1046 }, { - "epoch": 0.69, - "learning_rate": 1.524724439378953e-05, - "loss": 1.3054, + "epoch": 0.10957613814756671, + "grad_norm": 2.5165912885846113, + "learning_rate": 1.9669994150869134e-05, + "loss": 0.9325, "step": 1047 }, { - "epoch": 0.69, - "learning_rate": 1.5238163508642219e-05, - "loss": 1.224, + "epoch": 0.1096807953950811, + "grad_norm": 2.4710904066047616, + "learning_rate": 1.9669129967200648e-05, + "loss": 1.1537, "step": 1048 }, { - "epoch": 0.69, - "learning_rate": 1.5229076666691471e-05, - "loss": 1.4566, + "epoch": 0.1097854526425955, + "grad_norm": 3.0975055499064514, + "learning_rate": 1.9668264672529528e-05, + "loss": 1.0, "step": 1049 }, { - "epoch": 0.69, - "learning_rate": 1.5219983878270767e-05, - "loss": 1.3532, + "epoch": 0.10989010989010989, + "grad_norm": 2.253295288672016, + "learning_rate": 1.9667398266955195e-05, + "loss": 1.0463, "step": 1050 }, { - "epoch": 0.69, - "learning_rate": 1.5210885153720374e-05, - "loss": 1.26, + "epoch": 0.10999476713762428, + "grad_norm": 2.6293412785828014, + "learning_rate": 1.9666530750577207e-05, + "loss": 0.9453, "step": 1051 }, { - "epoch": 0.69, - "learning_rate": 1.520178050338729e-05, - "loss": 1.1311, + "epoch": 0.11009942438513867, + "grad_norm": 2.1033760444602727, + "learning_rate": 1.966566212349524e-05, + "loss": 0.9526, "step": 1052 }, { - "epoch": 0.69, - "learning_rate": 1.519266993762527e-05, - "loss": 1.3461, + "epoch": 0.11020408163265306, + "grad_norm": 2.5610804233496127, + "learning_rate": 1.9664792385809094e-05, + "loss": 0.9974, "step": 1053 }, { - "epoch": 0.69, - "learning_rate": 1.5183553466794778e-05, - "loss": 1.2327, + "epoch": 0.11030873888016746, + "grad_norm": 2.5769705235076934, + "learning_rate": 1.9663921537618717e-05, + "loss": 1.0322, "step": 1054 }, { - "epoch": 0.69, - "learning_rate": 1.5174431101263004e-05, - "loss": 1.4256, + "epoch": 0.11041339612768185, + "grad_norm": 2.4860849473694935, + "learning_rate": 1.9663049579024162e-05, + "loss": 1.0821, "step": 1055 }, { - "epoch": 0.7, - "learning_rate": 1.5165302851403843e-05, - "loss": 1.3932, + "epoch": 0.11051805337519623, + "grad_norm": 2.556645999664551, + "learning_rate": 1.966217651012562e-05, + "loss": 0.9631, "step": 1056 }, { - "epoch": 0.7, - "learning_rate": 1.5156168727597873e-05, - "loss": 1.288, + "epoch": 0.11062271062271062, + "grad_norm": 2.69622491307049, + "learning_rate": 1.9661302331023413e-05, + "loss": 0.963, "step": 1057 }, { - "epoch": 0.7, - "learning_rate": 1.5147028740232361e-05, - "loss": 1.3192, + "epoch": 0.11072736787022501, + "grad_norm": 2.322114587301298, + "learning_rate": 1.9660427041817982e-05, + "loss": 0.77, "step": 1058 }, { - "epoch": 0.7, - "learning_rate": 1.5137882899701234e-05, - "loss": 1.27, + "epoch": 0.1108320251177394, + "grad_norm": 2.086861967831109, + "learning_rate": 1.9659550642609898e-05, + "loss": 0.8174, "step": 1059 }, { - "epoch": 0.7, - "learning_rate": 1.5128731216405084e-05, - "loss": 1.2837, + "epoch": 0.1109366823652538, + "grad_norm": 2.5465767908188, + "learning_rate": 1.9658673133499868e-05, + "loss": 1.1169, "step": 1060 }, { - "epoch": 0.7, - "learning_rate": 1.5119573700751133e-05, - "loss": 1.4091, + "epoch": 0.11104133961276819, + "grad_norm": 2.3306210424010865, + "learning_rate": 1.9657794514588708e-05, + "loss": 1.0342, "step": 1061 }, { - "epoch": 0.7, - "learning_rate": 1.5110410363153256e-05, - "loss": 1.2189, + "epoch": 0.11114599686028258, + "grad_norm": 4.169058819176632, + "learning_rate": 1.9656914785977386e-05, + "loss": 1.0828, "step": 1062 }, { - "epoch": 0.7, - "learning_rate": 1.5101241214031928e-05, - "loss": 1.3311, + "epoch": 0.11125065410779697, + "grad_norm": 2.974406996118035, + "learning_rate": 1.9656033947766974e-05, + "loss": 1.1862, "step": 1063 }, { - "epoch": 0.7, - "learning_rate": 1.5092066263814245e-05, - "loss": 1.2795, + "epoch": 0.11135531135531136, + "grad_norm": 2.263717528826466, + "learning_rate": 1.965515200005869e-05, + "loss": 0.8976, "step": 1064 }, { - "epoch": 0.7, - "learning_rate": 1.5082885522933901e-05, - "loss": 1.4888, + "epoch": 0.11145996860282574, + "grad_norm": 2.1583188268439297, + "learning_rate": 1.9654268942953867e-05, + "loss": 0.9869, "step": 1065 }, { - "epoch": 0.7, - "learning_rate": 1.5073699001831168e-05, - "loss": 1.1529, + "epoch": 0.11156462585034013, + "grad_norm": 2.6538687038608715, + "learning_rate": 1.965338477655397e-05, + "loss": 1.015, "step": 1066 }, { - "epoch": 0.7, - "learning_rate": 1.506450671095289e-05, - "loss": 1.2273, + "epoch": 0.11166928309785452, + "grad_norm": 3.088871449306695, + "learning_rate": 1.9652499500960594e-05, + "loss": 1.0569, "step": 1067 }, { - "epoch": 0.7, - "learning_rate": 1.5055308660752486e-05, - "loss": 1.4096, + "epoch": 0.11177394034536892, + "grad_norm": 2.576832115600818, + "learning_rate": 1.9651613116275457e-05, + "loss": 0.8432, "step": 1068 }, { - "epoch": 0.7, - "learning_rate": 1.5046104861689911e-05, - "loss": 1.2175, + "epoch": 0.11187859759288331, + "grad_norm": 2.110328498949912, + "learning_rate": 1.9650725622600407e-05, + "loss": 0.9884, "step": 1069 }, { - "epoch": 0.7, - "learning_rate": 1.5036895324231662e-05, - "loss": 1.2721, + "epoch": 0.1119832548403977, + "grad_norm": 2.790857879613387, + "learning_rate": 1.964983702003742e-05, + "loss": 1.074, "step": 1070 }, { - "epoch": 0.71, - "learning_rate": 1.5027680058850757e-05, - "loss": 1.2757, + "epoch": 0.11208791208791209, + "grad_norm": 2.9160190675466158, + "learning_rate": 1.9648947308688594e-05, + "loss": 1.1648, "step": 1071 }, { - "epoch": 0.71, - "learning_rate": 1.501845907602674e-05, - "loss": 1.306, + "epoch": 0.11219256933542648, + "grad_norm": 2.122470818651794, + "learning_rate": 1.9648056488656166e-05, + "loss": 0.8107, "step": 1072 }, { - "epoch": 0.71, - "learning_rate": 1.5009232386245644e-05, - "loss": 1.3438, + "epoch": 0.11229722658294086, + "grad_norm": 2.579496972713126, + "learning_rate": 1.9647164560042486e-05, + "loss": 1.1268, "step": 1073 }, { - "epoch": 0.71, - "learning_rate": 1.5000000000000002e-05, - "loss": 1.2436, + "epoch": 0.11240188383045525, + "grad_norm": 3.122189191320992, + "learning_rate": 1.9646271522950043e-05, + "loss": 1.0926, "step": 1074 }, { - "epoch": 0.71, - "learning_rate": 1.4990761927788815e-05, - "loss": 1.333, + "epoch": 0.11250654107796965, + "grad_norm": 2.596905120190331, + "learning_rate": 1.9645377377481446e-05, + "loss": 1.0616, "step": 1075 }, { - "epoch": 0.71, - "learning_rate": 1.4981518180117558e-05, - "loss": 1.402, + "epoch": 0.11261119832548404, + "grad_norm": 2.851456857123207, + "learning_rate": 1.9644482123739434e-05, + "loss": 1.1331, "step": 1076 }, { - "epoch": 0.71, - "learning_rate": 1.497226876749815e-05, - "loss": 1.3782, + "epoch": 0.11271585557299843, + "grad_norm": 2.4375447703917223, + "learning_rate": 1.964358576182688e-05, + "loss": 0.9973, "step": 1077 }, { - "epoch": 0.71, - "learning_rate": 1.4963013700448969e-05, - "loss": 1.3064, + "epoch": 0.11282051282051282, + "grad_norm": 2.388922166429233, + "learning_rate": 1.9642688291846762e-05, + "loss": 0.911, "step": 1078 }, { - "epoch": 0.71, - "learning_rate": 1.4953752989494815e-05, - "loss": 1.2067, + "epoch": 0.11292517006802721, + "grad_norm": 2.6304965592524328, + "learning_rate": 1.9641789713902223e-05, + "loss": 1.0654, "step": 1079 }, { - "epoch": 0.71, - "learning_rate": 1.4944486645166892e-05, - "loss": 1.3255, + "epoch": 0.1130298273155416, + "grad_norm": 2.4302290806051277, + "learning_rate": 1.9640890028096492e-05, + "loss": 1.0569, "step": 1080 }, { - "epoch": 0.71, - "learning_rate": 1.4935214678002834e-05, - "loss": 1.3169, + "epoch": 0.113134484563056, + "grad_norm": 2.2285317412672745, + "learning_rate": 1.9639989234532955e-05, + "loss": 1.1056, "step": 1081 }, { - "epoch": 0.71, - "learning_rate": 1.4925937098546652e-05, - "loss": 1.2642, + "epoch": 0.11323914181057038, + "grad_norm": 2.5935496575040693, + "learning_rate": 1.9639087333315113e-05, + "loss": 1.0176, "step": 1082 }, { - "epoch": 0.71, - "learning_rate": 1.4916653917348751e-05, - "loss": 1.1457, + "epoch": 0.11334379905808477, + "grad_norm": 2.699922891159306, + "learning_rate": 1.9638184324546597e-05, + "loss": 1.1053, "step": 1083 }, { - "epoch": 0.71, - "learning_rate": 1.4907365144965896e-05, - "loss": 1.297, + "epoch": 0.11344845630559916, + "grad_norm": 3.2198916620549354, + "learning_rate": 1.963728020833116e-05, + "loss": 1.1036, "step": 1084 }, { - "epoch": 0.71, - "learning_rate": 1.4898070791961215e-05, - "loss": 1.2597, + "epoch": 0.11355311355311355, + "grad_norm": 2.6375927221507145, + "learning_rate": 1.963637498477269e-05, + "loss": 0.9988, "step": 1085 }, { - "epoch": 0.71, - "learning_rate": 1.4888770868904185e-05, - "loss": 1.3395, + "epoch": 0.11365777080062794, + "grad_norm": 2.938305615487851, + "learning_rate": 1.9635468653975203e-05, + "loss": 1.166, "step": 1086 }, { - "epoch": 0.72, - "learning_rate": 1.4879465386370613e-05, - "loss": 1.2254, + "epoch": 0.11376242804814234, + "grad_norm": 2.2156017547233295, + "learning_rate": 1.9634561216042834e-05, + "loss": 1.0057, "step": 1087 }, { - "epoch": 0.72, - "learning_rate": 1.487015435494263e-05, - "loss": 1.1784, + "epoch": 0.11386708529565673, + "grad_norm": 2.4105372096002755, + "learning_rate": 1.9633652671079854e-05, + "loss": 0.9225, "step": 1088 }, { - "epoch": 0.72, - "learning_rate": 1.4860837785208678e-05, - "loss": 1.2387, + "epoch": 0.11397174254317112, + "grad_norm": 2.412529782609449, + "learning_rate": 1.9632743019190646e-05, + "loss": 0.9356, "step": 1089 }, { - "epoch": 0.72, - "learning_rate": 1.4851515687763492e-05, - "loss": 1.2022, + "epoch": 0.1140763997906855, + "grad_norm": 2.738234258928923, + "learning_rate": 1.9631832260479736e-05, + "loss": 1.1259, "step": 1090 }, { - "epoch": 0.72, - "learning_rate": 1.48421880732081e-05, - "loss": 1.3362, + "epoch": 0.11418105703819989, + "grad_norm": 2.295890829748179, + "learning_rate": 1.963092039505178e-05, + "loss": 1.0476, "step": 1091 }, { - "epoch": 0.72, - "learning_rate": 1.48328549521498e-05, - "loss": 1.2714, + "epoch": 0.11428571428571428, + "grad_norm": 2.4190921434998574, + "learning_rate": 1.9630007423011543e-05, + "loss": 1.0351, "step": 1092 }, { - "epoch": 0.72, - "learning_rate": 1.4823516335202152e-05, - "loss": 1.2764, + "epoch": 0.11439037153322867, + "grad_norm": 2.5227441453952033, + "learning_rate": 1.962909334446393e-05, + "loss": 1.0712, "step": 1093 }, { - "epoch": 0.72, - "learning_rate": 1.4814172232984968e-05, - "loss": 1.1313, + "epoch": 0.11449502878074307, + "grad_norm": 2.5782201716213775, + "learning_rate": 1.9628178159513976e-05, + "loss": 0.8372, "step": 1094 }, { - "epoch": 0.72, - "learning_rate": 1.4804822656124302e-05, - "loss": 1.3175, + "epoch": 0.11459968602825746, + "grad_norm": 2.4279239527241367, + "learning_rate": 1.962726186826683e-05, + "loss": 1.0871, "step": 1095 }, { - "epoch": 0.72, - "learning_rate": 1.479546761525242e-05, - "loss": 1.2485, + "epoch": 0.11470434327577185, + "grad_norm": 2.2289981767762734, + "learning_rate": 1.962634447082778e-05, + "loss": 1.0551, "step": 1096 }, { - "epoch": 0.72, - "learning_rate": 1.4786107121007813e-05, - "loss": 1.3056, + "epoch": 0.11480900052328624, + "grad_norm": 2.7919927641730387, + "learning_rate": 1.9625425967302232e-05, + "loss": 1.0695, "step": 1097 }, { - "epoch": 0.72, - "learning_rate": 1.4776741184035171e-05, - "loss": 1.1867, + "epoch": 0.11491365777080063, + "grad_norm": 2.567716154128025, + "learning_rate": 1.962450635779573e-05, + "loss": 1.1617, "step": 1098 }, { - "epoch": 0.72, - "learning_rate": 1.476736981498537e-05, - "loss": 1.266, + "epoch": 0.11501831501831501, + "grad_norm": 2.449051302816616, + "learning_rate": 1.962358564241394e-05, + "loss": 1.0829, "step": 1099 }, { - "epoch": 0.72, - "learning_rate": 1.475799302451547e-05, - "loss": 1.227, + "epoch": 0.1151229722658294, + "grad_norm": 2.4898532560578728, + "learning_rate": 1.9622663821262647e-05, + "loss": 1.1857, "step": 1100 }, { - "epoch": 0.72, - "learning_rate": 1.4748610823288693e-05, - "loss": 1.3661, + "epoch": 0.1152276295133438, + "grad_norm": 2.508714185849526, + "learning_rate": 1.9621740894447776e-05, + "loss": 0.9801, "step": 1101 }, { - "epoch": 0.73, - "learning_rate": 1.4739223221974415e-05, - "loss": 1.3033, + "epoch": 0.11533228676085819, + "grad_norm": 2.864088004419296, + "learning_rate": 1.9620816862075365e-05, + "loss": 0.9741, "step": 1102 }, { - "epoch": 0.73, - "learning_rate": 1.4729830231248143e-05, - "loss": 1.2722, + "epoch": 0.11543694400837258, + "grad_norm": 2.2451938017369457, + "learning_rate": 1.9619891724251602e-05, + "loss": 0.9874, "step": 1103 }, { - "epoch": 0.73, - "learning_rate": 1.4720431861791531e-05, - "loss": 1.2511, + "epoch": 0.11554160125588697, + "grad_norm": 2.43522131047157, + "learning_rate": 1.961896548108277e-05, + "loss": 1.0198, "step": 1104 }, { - "epoch": 0.73, - "learning_rate": 1.4711028124292332e-05, - "loss": 1.2971, + "epoch": 0.11564625850340136, + "grad_norm": 2.6482111396981196, + "learning_rate": 1.961803813267531e-05, + "loss": 1.0065, "step": 1105 }, { - "epoch": 0.73, - "learning_rate": 1.4701619029444419e-05, - "loss": 1.348, + "epoch": 0.11575091575091576, + "grad_norm": 2.663006273262908, + "learning_rate": 1.961710967913577e-05, + "loss": 1.0731, "step": 1106 }, { - "epoch": 0.73, - "learning_rate": 1.4692204587947745e-05, - "loss": 1.3339, + "epoch": 0.11585557299843015, + "grad_norm": 2.2592261853498745, + "learning_rate": 1.9616180120570833e-05, + "loss": 1.0667, "step": 1107 }, { - "epoch": 0.73, - "learning_rate": 1.468278481050835e-05, - "loss": 1.1522, + "epoch": 0.11596023024594453, + "grad_norm": 2.5579103444641587, + "learning_rate": 1.9615249457087302e-05, + "loss": 1.0649, "step": 1108 }, { - "epoch": 0.73, - "learning_rate": 1.4673359707838338e-05, - "loss": 1.1803, + "epoch": 0.11606488749345892, + "grad_norm": 2.5681709092759846, + "learning_rate": 1.9614317688792118e-05, + "loss": 1.0629, "step": 1109 }, { - "epoch": 0.73, - "learning_rate": 1.4663929290655872e-05, - "loss": 1.1292, + "epoch": 0.11616954474097331, + "grad_norm": 2.7062909391638468, + "learning_rate": 1.9613384815792343e-05, + "loss": 1.0538, "step": 1110 }, { - "epoch": 0.73, - "learning_rate": 1.4654493569685157e-05, - "loss": 1.3618, + "epoch": 0.1162742019884877, + "grad_norm": 3.1506469607183423, + "learning_rate": 1.9612450838195164e-05, + "loss": 0.9414, "step": 1111 }, { - "epoch": 0.73, - "learning_rate": 1.464505255565643e-05, - "loss": 1.369, + "epoch": 0.1163788592360021, + "grad_norm": 2.771093053485509, + "learning_rate": 1.9611515756107895e-05, + "loss": 1.0721, "step": 1112 }, { - "epoch": 0.73, - "learning_rate": 1.4635606259305948e-05, - "loss": 1.2768, + "epoch": 0.11648351648351649, + "grad_norm": 2.6299240759159033, + "learning_rate": 1.9610579569637982e-05, + "loss": 1.0506, "step": 1113 }, { - "epoch": 0.73, - "learning_rate": 1.4626154691375974e-05, - "loss": 1.2776, + "epoch": 0.11658817373103088, + "grad_norm": 2.718685548324005, + "learning_rate": 1.9609642278892992e-05, + "loss": 1.0423, "step": 1114 }, { - "epoch": 0.73, - "learning_rate": 1.4616697862614766e-05, - "loss": 1.2922, + "epoch": 0.11669283097854527, + "grad_norm": 2.617103354933396, + "learning_rate": 1.9608703883980625e-05, + "loss": 1.135, "step": 1115 }, { - "epoch": 0.73, - "learning_rate": 1.4607235783776563e-05, - "loss": 1.3472, + "epoch": 0.11679748822605965, + "grad_norm": 2.8284262061059775, + "learning_rate": 1.9607764385008706e-05, + "loss": 0.9975, "step": 1116 }, { - "epoch": 0.74, - "learning_rate": 1.4597768465621575e-05, - "loss": 1.2253, + "epoch": 0.11690214547357404, + "grad_norm": 2.422530563865842, + "learning_rate": 1.9606823782085178e-05, + "loss": 0.9965, "step": 1117 }, { - "epoch": 0.74, - "learning_rate": 1.4588295918915978e-05, - "loss": 1.303, + "epoch": 0.11700680272108843, + "grad_norm": 2.7309355633540022, + "learning_rate": 1.9605882075318124e-05, + "loss": 1.0416, "step": 1118 }, { - "epoch": 0.74, - "learning_rate": 1.4578818154431878e-05, - "loss": 1.2062, + "epoch": 0.11711145996860282, + "grad_norm": 2.41740256806887, + "learning_rate": 1.9604939264815746e-05, + "loss": 1.091, "step": 1119 }, { - "epoch": 0.74, - "learning_rate": 1.4569335182947325e-05, - "loss": 1.3157, + "epoch": 0.11721611721611722, + "grad_norm": 2.6049047174023565, + "learning_rate": 1.960399535068638e-05, + "loss": 0.9203, "step": 1120 }, { - "epoch": 0.74, - "learning_rate": 1.4559847015246294e-05, - "loss": 1.3139, + "epoch": 0.11732077446363161, + "grad_norm": 2.352711031496301, + "learning_rate": 1.9603050333038473e-05, + "loss": 1.0542, "step": 1121 }, { - "epoch": 0.74, - "learning_rate": 1.4550353662118659e-05, - "loss": 1.423, + "epoch": 0.117425431711146, + "grad_norm": 3.3295305172916727, + "learning_rate": 1.9602104211980614e-05, + "loss": 0.9534, "step": 1122 }, { - "epoch": 0.74, - "learning_rate": 1.4540855134360198e-05, - "loss": 1.2684, + "epoch": 0.11753008895866039, + "grad_norm": 3.1546428345111197, + "learning_rate": 1.960115698762152e-05, + "loss": 1.0338, "step": 1123 }, { - "epoch": 0.74, - "learning_rate": 1.453135144277257e-05, - "loss": 1.4146, + "epoch": 0.11763474620617478, + "grad_norm": 2.728833024559098, + "learning_rate": 1.9600208660070022e-05, + "loss": 1.131, "step": 1124 }, { - "epoch": 0.74, - "learning_rate": 1.4521842598163309e-05, - "loss": 1.3044, + "epoch": 0.11773940345368916, + "grad_norm": 2.7185354106465236, + "learning_rate": 1.9599259229435088e-05, + "loss": 0.9256, "step": 1125 }, { - "epoch": 0.74, - "learning_rate": 1.4512328611345808e-05, - "loss": 1.2642, + "epoch": 0.11784406070120355, + "grad_norm": 2.498618364520084, + "learning_rate": 1.959830869582581e-05, + "loss": 1.0662, "step": 1126 }, { - "epoch": 0.74, - "learning_rate": 1.4502809493139305e-05, - "loss": 1.3398, + "epoch": 0.11794871794871795, + "grad_norm": 2.495040333087842, + "learning_rate": 1.9597357059351404e-05, + "loss": 1.0717, "step": 1127 }, { - "epoch": 0.74, - "learning_rate": 1.4493285254368878e-05, - "loss": 1.1626, + "epoch": 0.11805337519623234, + "grad_norm": 2.623375696161012, + "learning_rate": 1.9596404320121217e-05, + "loss": 1.0103, "step": 1128 }, { - "epoch": 0.74, - "learning_rate": 1.4483755905865423e-05, - "loss": 1.2641, + "epoch": 0.11815803244374673, + "grad_norm": 2.4391065346519496, + "learning_rate": 1.9595450478244718e-05, + "loss": 0.9587, "step": 1129 }, { - "epoch": 0.74, - "learning_rate": 1.4474221458465651e-05, - "loss": 1.3286, + "epoch": 0.11826268969126112, + "grad_norm": 2.663286662491619, + "learning_rate": 1.9594495533831507e-05, + "loss": 1.1976, "step": 1130 }, { - "epoch": 0.74, - "learning_rate": 1.4464681923012072e-05, - "loss": 1.4002, + "epoch": 0.11836734693877551, + "grad_norm": 2.5842407873088766, + "learning_rate": 1.959353948699131e-05, + "loss": 1.1649, "step": 1131 }, { - "epoch": 0.75, - "learning_rate": 1.4455137310352978e-05, - "loss": 1.1133, + "epoch": 0.1184720041862899, + "grad_norm": 2.5723829154298534, + "learning_rate": 1.959258233783398e-05, + "loss": 1.1408, "step": 1132 }, { - "epoch": 0.75, - "learning_rate": 1.4445587631342433e-05, - "loss": 1.3146, + "epoch": 0.11857666143380428, + "grad_norm": 2.378313766984001, + "learning_rate": 1.959162408646949e-05, + "loss": 1.0323, "step": 1133 }, { - "epoch": 0.75, - "learning_rate": 1.4436032896840271e-05, - "loss": 1.2476, + "epoch": 0.11868131868131868, + "grad_norm": 2.561592915095796, + "learning_rate": 1.9590664733007947e-05, + "loss": 1.0473, "step": 1134 }, { - "epoch": 0.75, - "learning_rate": 1.4426473117712071e-05, - "loss": 1.3524, + "epoch": 0.11878597592883307, + "grad_norm": 2.6784137838787765, + "learning_rate": 1.9589704277559586e-05, + "loss": 0.9144, "step": 1135 }, { - "epoch": 0.75, - "learning_rate": 1.4416908304829142e-05, - "loss": 1.2622, + "epoch": 0.11889063317634746, + "grad_norm": 2.4050436698772057, + "learning_rate": 1.958874272023476e-05, + "loss": 0.9927, "step": 1136 }, { - "epoch": 0.75, - "learning_rate": 1.4407338469068532e-05, - "loss": 1.2244, + "epoch": 0.11899529042386185, + "grad_norm": 2.787218387154808, + "learning_rate": 1.9587780061143965e-05, + "loss": 1.112, "step": 1137 }, { - "epoch": 0.75, - "learning_rate": 1.4397763621312985e-05, - "loss": 1.3472, + "epoch": 0.11909994767137624, + "grad_norm": 2.467244551948357, + "learning_rate": 1.9586816300397796e-05, + "loss": 1.0967, "step": 1138 }, { - "epoch": 0.75, - "learning_rate": 1.4388183772450957e-05, - "loss": 1.1446, + "epoch": 0.11920460491889064, + "grad_norm": 2.974817620992224, + "learning_rate": 1.9585851438107007e-05, + "loss": 1.0886, "step": 1139 }, { - "epoch": 0.75, - "learning_rate": 1.437859893337658e-05, - "loss": 1.2744, + "epoch": 0.11930926216640503, + "grad_norm": 2.7737239161164906, + "learning_rate": 1.958488547438245e-05, + "loss": 0.9261, "step": 1140 }, { - "epoch": 0.75, - "learning_rate": 1.4369009114989671e-05, - "loss": 1.2349, + "epoch": 0.11941391941391942, + "grad_norm": 3.0863679881725132, + "learning_rate": 1.958391840933512e-05, + "loss": 0.8894, "step": 1141 }, { - "epoch": 0.75, - "learning_rate": 1.4359414328195704e-05, - "loss": 1.2209, + "epoch": 0.1195185766614338, + "grad_norm": 2.2653693939706403, + "learning_rate": 1.958295024307614e-05, + "loss": 1.0348, "step": 1142 }, { - "epoch": 0.75, - "learning_rate": 1.4349814583905807e-05, - "loss": 1.3382, + "epoch": 0.11962323390894819, + "grad_norm": 2.754842275125979, + "learning_rate": 1.958198097571675e-05, + "loss": 1.075, "step": 1143 }, { - "epoch": 0.75, - "learning_rate": 1.434020989303674e-05, - "loss": 1.2584, + "epoch": 0.11972789115646258, + "grad_norm": 2.832206040807847, + "learning_rate": 1.9581010607368324e-05, + "loss": 1.104, "step": 1144 }, { - "epoch": 0.75, - "learning_rate": 1.4330600266510895e-05, - "loss": 1.3664, + "epoch": 0.11983254840397697, + "grad_norm": 2.6220443857258044, + "learning_rate": 1.958003913814235e-05, + "loss": 1.0461, "step": 1145 }, { - "epoch": 0.75, - "learning_rate": 1.4320985715256269e-05, - "loss": 1.162, + "epoch": 0.11993720565149137, + "grad_norm": 3.167797177471765, + "learning_rate": 1.957906656815046e-05, + "loss": 1.2079, "step": 1146 }, { - "epoch": 0.76, - "learning_rate": 1.4311366250206464e-05, - "loss": 1.3295, + "epoch": 0.12004186289900576, + "grad_norm": 2.5145650658616647, + "learning_rate": 1.9578092897504404e-05, + "loss": 1.1371, "step": 1147 }, { - "epoch": 0.76, - "learning_rate": 1.4301741882300673e-05, - "loss": 1.3086, + "epoch": 0.12014652014652015, + "grad_norm": 2.532990354557582, + "learning_rate": 1.957711812631606e-05, + "loss": 0.847, "step": 1148 }, { - "epoch": 0.76, - "learning_rate": 1.4292112622483653e-05, - "loss": 1.278, + "epoch": 0.12025117739403454, + "grad_norm": 2.498723144014304, + "learning_rate": 1.9576142254697422e-05, + "loss": 1.0768, "step": 1149 }, { - "epoch": 0.76, - "learning_rate": 1.4282478481705742e-05, - "loss": 1.2891, + "epoch": 0.12035583464154893, + "grad_norm": 2.8848332638507608, + "learning_rate": 1.957516528276063e-05, + "loss": 1.0935, "step": 1150 }, { - "epoch": 0.76, - "learning_rate": 1.4272839470922812e-05, - "loss": 1.2409, + "epoch": 0.12046049188906331, + "grad_norm": 2.407079189345069, + "learning_rate": 1.9574187210617935e-05, + "loss": 0.9884, "step": 1151 }, { - "epoch": 0.76, - "learning_rate": 1.426319560109628e-05, - "loss": 1.2973, + "epoch": 0.1205651491365777, + "grad_norm": 2.4789862581848325, + "learning_rate": 1.9573208038381722e-05, + "loss": 1.0612, "step": 1152 }, { - "epoch": 0.76, - "learning_rate": 1.4253546883193093e-05, - "loss": 1.2486, + "epoch": 0.1206698063840921, + "grad_norm": 2.634050308370591, + "learning_rate": 1.95722277661645e-05, + "loss": 1.055, "step": 1153 }, { - "epoch": 0.76, - "learning_rate": 1.42438933281857e-05, - "loss": 1.2693, + "epoch": 0.12077446363160649, + "grad_norm": 2.646401341815591, + "learning_rate": 1.95712463940789e-05, + "loss": 1.1474, "step": 1154 }, { - "epoch": 0.76, - "learning_rate": 1.4234234947052063e-05, - "loss": 1.2249, + "epoch": 0.12087912087912088, + "grad_norm": 2.8047167361076957, + "learning_rate": 1.9570263922237686e-05, + "loss": 1.0503, "step": 1155 }, { - "epoch": 0.76, - "learning_rate": 1.4224571750775625e-05, - "loss": 1.3395, + "epoch": 0.12098377812663527, + "grad_norm": 2.5741451106686317, + "learning_rate": 1.956928035075375e-05, + "loss": 0.9393, "step": 1156 }, { - "epoch": 0.76, - "learning_rate": 1.4214903750345298e-05, - "loss": 1.449, + "epoch": 0.12108843537414966, + "grad_norm": 2.5973811680350045, + "learning_rate": 1.9568295679740098e-05, + "loss": 1.1236, "step": 1157 }, { - "epoch": 0.76, - "learning_rate": 1.4205230956755479e-05, - "loss": 1.1161, + "epoch": 0.12119309262166406, + "grad_norm": 2.6113678867025616, + "learning_rate": 1.956730990930988e-05, + "loss": 1.0036, "step": 1158 }, { - "epoch": 0.76, - "learning_rate": 1.419555338100599e-05, - "loss": 1.3085, + "epoch": 0.12129774986917843, + "grad_norm": 2.763270013817754, + "learning_rate": 1.9566323039576353e-05, + "loss": 1.1911, "step": 1159 }, { - "epoch": 0.76, - "learning_rate": 1.4185871034102117e-05, - "loss": 1.351, + "epoch": 0.12140240711669283, + "grad_norm": 2.705626562419259, + "learning_rate": 1.956533507065292e-05, + "loss": 1.2495, "step": 1160 }, { - "epoch": 0.76, - "learning_rate": 1.4176183927054545e-05, - "loss": 1.3234, + "epoch": 0.12150706436420722, + "grad_norm": 2.5695027901999175, + "learning_rate": 1.9564346002653094e-05, + "loss": 1.0895, "step": 1161 }, { - "epoch": 0.76, - "learning_rate": 1.4166492070879389e-05, - "loss": 1.2037, + "epoch": 0.12161172161172161, + "grad_norm": 2.688239436865601, + "learning_rate": 1.9563355835690526e-05, + "loss": 1.066, "step": 1162 }, { - "epoch": 0.77, - "learning_rate": 1.4156795476598166e-05, - "loss": 1.3678, + "epoch": 0.121716378859236, + "grad_norm": 2.4721295753539847, + "learning_rate": 1.9562364569878985e-05, + "loss": 0.9474, "step": 1163 }, { - "epoch": 0.77, - "learning_rate": 1.4147094155237772e-05, - "loss": 1.3293, + "epoch": 0.1218210361067504, + "grad_norm": 2.9695025162758064, + "learning_rate": 1.9561372205332366e-05, + "loss": 1.1938, "step": 1164 }, { - "epoch": 0.77, - "learning_rate": 1.4137388117830486e-05, - "loss": 1.2505, + "epoch": 0.12192569335426479, + "grad_norm": 2.3922931991769927, + "learning_rate": 1.95603787421647e-05, + "loss": 0.931, "step": 1165 }, { - "epoch": 0.77, - "learning_rate": 1.4127677375413943e-05, - "loss": 1.1536, + "epoch": 0.12203035060177918, + "grad_norm": 2.533155414113076, + "learning_rate": 1.9559384180490138e-05, + "loss": 1.0848, "step": 1166 }, { - "epoch": 0.77, - "learning_rate": 1.4117961939031135e-05, - "loss": 1.2298, + "epoch": 0.12213500784929357, + "grad_norm": 2.7804617711740462, + "learning_rate": 1.9558388520422955e-05, + "loss": 1.1158, "step": 1167 }, { - "epoch": 0.77, - "learning_rate": 1.4108241819730384e-05, - "loss": 1.2324, + "epoch": 0.12223966509680795, + "grad_norm": 3.07185941955718, + "learning_rate": 1.955739176207755e-05, + "loss": 1.1685, "step": 1168 }, { - "epoch": 0.77, - "learning_rate": 1.409851702856535e-05, - "loss": 1.3091, + "epoch": 0.12234432234432234, + "grad_norm": 2.632035250919555, + "learning_rate": 1.955639390556846e-05, + "loss": 1.0409, "step": 1169 }, { - "epoch": 0.77, - "learning_rate": 1.4088787576594996e-05, - "loss": 1.2983, + "epoch": 0.12244897959183673, + "grad_norm": 2.573941413049372, + "learning_rate": 1.9555394951010337e-05, + "loss": 1.2533, "step": 1170 }, { - "epoch": 0.77, - "learning_rate": 1.4079053474883584e-05, - "loss": 1.1055, + "epoch": 0.12255363683935112, + "grad_norm": 2.5319580043800687, + "learning_rate": 1.9554394898517962e-05, + "loss": 1.1681, "step": 1171 }, { - "epoch": 0.77, - "learning_rate": 1.4069314734500674e-05, - "loss": 1.2983, + "epoch": 0.12265829408686552, + "grad_norm": 2.983393734826706, + "learning_rate": 1.9553393748206245e-05, + "loss": 1.0353, "step": 1172 }, { - "epoch": 0.77, - "learning_rate": 1.405957136652109e-05, - "loss": 1.4468, + "epoch": 0.12276295133437991, + "grad_norm": 2.5654991130101776, + "learning_rate": 1.955239150019022e-05, + "loss": 1.1485, "step": 1173 }, { - "epoch": 0.77, - "learning_rate": 1.4049823382024922e-05, - "loss": 1.3416, + "epoch": 0.1228676085818943, + "grad_norm": 2.577167856594292, + "learning_rate": 1.9551388154585045e-05, + "loss": 1.0843, "step": 1174 }, { - "epoch": 0.77, - "learning_rate": 1.4040070792097515e-05, - "loss": 1.3257, + "epoch": 0.12297226582940869, + "grad_norm": 2.672800031307482, + "learning_rate": 1.955038371150601e-05, + "loss": 1.0571, "step": 1175 }, { - "epoch": 0.77, - "learning_rate": 1.4030313607829444e-05, - "loss": 1.3353, + "epoch": 0.12307692307692308, + "grad_norm": 2.2255667081208133, + "learning_rate": 1.954937817106853e-05, + "loss": 0.9892, "step": 1176 }, { - "epoch": 0.77, - "learning_rate": 1.4020551840316515e-05, - "loss": 1.1875, + "epoch": 0.12318158032443746, + "grad_norm": 2.5034867735183304, + "learning_rate": 1.9548371533388134e-05, + "loss": 1.1986, "step": 1177 }, { - "epoch": 0.78, - "learning_rate": 1.4010785500659735e-05, - "loss": 1.3478, + "epoch": 0.12328623757195185, + "grad_norm": 2.8720211636190043, + "learning_rate": 1.954736379858049e-05, + "loss": 1.2097, "step": 1178 }, { - "epoch": 0.78, - "learning_rate": 1.4001014599965327e-05, - "loss": 1.3157, + "epoch": 0.12339089481946625, + "grad_norm": 2.8330847231206047, + "learning_rate": 1.9546354966761397e-05, + "loss": 1.115, "step": 1179 }, { - "epoch": 0.78, - "learning_rate": 1.3991239149344686e-05, - "loss": 1.1173, + "epoch": 0.12349555206698064, + "grad_norm": 2.4195461792808306, + "learning_rate": 1.9545345038046762e-05, + "loss": 1.0036, "step": 1180 }, { - "epoch": 0.78, - "learning_rate": 1.3981459159914392e-05, - "loss": 1.2161, + "epoch": 0.12360020931449503, + "grad_norm": 2.906126283855808, + "learning_rate": 1.9544334012552633e-05, + "loss": 1.0183, "step": 1181 }, { - "epoch": 0.78, - "learning_rate": 1.3971674642796178e-05, - "loss": 1.2952, + "epoch": 0.12370486656200942, + "grad_norm": 2.5436235281065698, + "learning_rate": 1.954332189039518e-05, + "loss": 0.9942, "step": 1182 }, { - "epoch": 0.78, - "learning_rate": 1.396188560911693e-05, - "loss": 1.2907, + "epoch": 0.12380952380952381, + "grad_norm": 2.826315997445539, + "learning_rate": 1.954230867169069e-05, + "loss": 1.2136, "step": 1183 }, { - "epoch": 0.78, - "learning_rate": 1.3952092070008669e-05, - "loss": 1.4054, + "epoch": 0.1239141810570382, + "grad_norm": 2.7260954242001514, + "learning_rate": 1.9541294356555594e-05, + "loss": 0.9941, "step": 1184 }, { - "epoch": 0.78, - "learning_rate": 1.3942294036608544e-05, - "loss": 1.2853, + "epoch": 0.12401883830455258, + "grad_norm": 3.1932984486388234, + "learning_rate": 1.9540278945106433e-05, + "loss": 1.065, "step": 1185 }, { - "epoch": 0.78, - "learning_rate": 1.3932491520058807e-05, - "loss": 1.2316, + "epoch": 0.12412349555206698, + "grad_norm": 2.3449880675934884, + "learning_rate": 1.953926243745988e-05, + "loss": 0.965, "step": 1186 }, { - "epoch": 0.78, - "learning_rate": 1.392268453150681e-05, - "loss": 1.3066, + "epoch": 0.12422815279958137, + "grad_norm": 2.475504471283617, + "learning_rate": 1.9538244833732737e-05, + "loss": 1.0389, "step": 1187 }, { - "epoch": 0.78, - "learning_rate": 1.3912873082105002e-05, - "loss": 1.2377, + "epoch": 0.12433281004709576, + "grad_norm": 2.648680996485242, + "learning_rate": 1.9537226134041925e-05, + "loss": 1.1165, "step": 1188 }, { - "epoch": 0.78, - "learning_rate": 1.3903057183010887e-05, - "loss": 1.3614, + "epoch": 0.12443746729461015, + "grad_norm": 2.658874814055567, + "learning_rate": 1.9536206338504494e-05, + "loss": 1.0662, "step": 1189 }, { - "epoch": 0.78, - "learning_rate": 1.3893236845387041e-05, - "loss": 1.3397, + "epoch": 0.12454212454212454, + "grad_norm": 2.486917672270425, + "learning_rate": 1.9535185447237626e-05, + "loss": 1.0065, "step": 1190 }, { - "epoch": 0.78, - "learning_rate": 1.3883412080401083e-05, - "loss": 1.1668, + "epoch": 0.12464678178963894, + "grad_norm": 2.522352267870716, + "learning_rate": 1.953416346035862e-05, + "loss": 1.0536, "step": 1191 }, { - "epoch": 0.78, - "learning_rate": 1.3873582899225672e-05, - "loss": 1.1924, + "epoch": 0.12475143903715333, + "grad_norm": 2.595719707912744, + "learning_rate": 1.9533140377984906e-05, + "loss": 1.0279, "step": 1192 }, { - "epoch": 0.79, - "learning_rate": 1.3863749313038484e-05, - "loss": 1.3731, + "epoch": 0.12485609628466772, + "grad_norm": 2.7586476277587315, + "learning_rate": 1.9532116200234034e-05, + "loss": 0.997, "step": 1193 }, { - "epoch": 0.79, - "learning_rate": 1.3853911333022199e-05, - "loss": 1.1934, + "epoch": 0.1249607535321821, + "grad_norm": 2.323954362886985, + "learning_rate": 1.9531090927223687e-05, + "loss": 1.0201, "step": 1194 }, { - "epoch": 0.79, - "learning_rate": 1.3844068970364506e-05, - "loss": 1.1635, + "epoch": 0.1250654107796965, + "grad_norm": 2.3865025964537656, + "learning_rate": 1.9530064559071672e-05, + "loss": 1.0528, "step": 1195 }, { - "epoch": 0.79, - "learning_rate": 1.383422223625807e-05, - "loss": 1.3358, + "epoch": 0.1251700680272109, + "grad_norm": 2.4307960677955838, + "learning_rate": 1.952903709589592e-05, + "loss": 1.0709, "step": 1196 }, { - "epoch": 0.79, - "learning_rate": 1.3824371141900533e-05, - "loss": 1.2897, + "epoch": 0.12527472527472527, + "grad_norm": 2.51314339549336, + "learning_rate": 1.9528008537814488e-05, + "loss": 1.1309, "step": 1197 }, { - "epoch": 0.79, - "learning_rate": 1.3814515698494484e-05, - "loss": 1.3208, + "epoch": 0.12537938252223965, + "grad_norm": 2.4769743325065594, + "learning_rate": 1.952697888494556e-05, + "loss": 1.144, "step": 1198 }, { - "epoch": 0.79, - "learning_rate": 1.3804655917247471e-05, - "loss": 1.2866, + "epoch": 0.12548403976975406, + "grad_norm": 2.358721280245034, + "learning_rate": 1.9525948137407443e-05, + "loss": 0.9865, "step": 1199 }, { - "epoch": 0.79, - "learning_rate": 1.3794791809371971e-05, - "loss": 1.0992, + "epoch": 0.12558869701726844, + "grad_norm": 2.3206179689553723, + "learning_rate": 1.9524916295318576e-05, + "loss": 1.0968, "step": 1200 }, { - "epoch": 0.79, - "learning_rate": 1.3784923386085373e-05, - "loss": 1.1884, + "epoch": 0.12569335426478284, + "grad_norm": 2.316497811222147, + "learning_rate": 1.9523883358797517e-05, + "loss": 0.9474, "step": 1201 }, { - "epoch": 0.79, - "learning_rate": 1.3775050658609988e-05, - "loss": 1.2635, + "epoch": 0.12579801151229722, + "grad_norm": 2.706485454764553, + "learning_rate": 1.9522849327962954e-05, + "loss": 1.0489, "step": 1202 }, { - "epoch": 0.79, - "learning_rate": 1.3765173638173004e-05, - "loss": 1.3466, + "epoch": 0.12590266875981163, + "grad_norm": 2.838226256388987, + "learning_rate": 1.9521814202933697e-05, + "loss": 1.1056, "step": 1203 }, { - "epoch": 0.79, - "learning_rate": 1.3755292336006512e-05, - "loss": 1.2894, + "epoch": 0.126007326007326, + "grad_norm": 2.6389275404903327, + "learning_rate": 1.952077798382869e-05, + "loss": 1.0294, "step": 1204 }, { - "epoch": 0.79, - "learning_rate": 1.3745406763347446e-05, - "loss": 1.237, + "epoch": 0.1261119832548404, + "grad_norm": 2.24218821930294, + "learning_rate": 1.9519740670766985e-05, + "loss": 1.059, "step": 1205 }, { - "epoch": 0.79, - "learning_rate": 1.3735516931437625e-05, - "loss": 1.1691, + "epoch": 0.1262166405023548, + "grad_norm": 2.3594941049558176, + "learning_rate": 1.951870226386778e-05, + "loss": 1.1141, "step": 1206 }, { - "epoch": 0.79, - "learning_rate": 1.3725622851523692e-05, - "loss": 1.367, + "epoch": 0.12632129774986917, + "grad_norm": 2.867410423081424, + "learning_rate": 1.951766276325039e-05, + "loss": 1.1167, "step": 1207 }, { - "epoch": 0.8, - "learning_rate": 1.3715724534857127e-05, - "loss": 1.1355, + "epoch": 0.12642595499738357, + "grad_norm": 2.360366489000132, + "learning_rate": 1.9516622169034253e-05, + "loss": 1.0121, "step": 1208 }, { - "epoch": 0.8, - "learning_rate": 1.3705821992694228e-05, - "loss": 1.4048, + "epoch": 0.12653061224489795, + "grad_norm": 2.8918870260240634, + "learning_rate": 1.951558048133894e-05, + "loss": 0.9879, "step": 1209 }, { - "epoch": 0.8, - "learning_rate": 1.3695915236296095e-05, - "loss": 1.225, + "epoch": 0.12663526949241236, + "grad_norm": 2.470183042311598, + "learning_rate": 1.951453770028414e-05, + "loss": 1.0716, "step": 1210 }, { - "epoch": 0.8, - "learning_rate": 1.3686004276928627e-05, - "loss": 1.2157, + "epoch": 0.12673992673992673, + "grad_norm": 2.5896489389181374, + "learning_rate": 1.9513493825989664e-05, + "loss": 1.078, "step": 1211 }, { - "epoch": 0.8, - "learning_rate": 1.3676089125862495e-05, - "loss": 1.2612, + "epoch": 0.12684458398744114, + "grad_norm": 2.476294549537629, + "learning_rate": 1.9512448858575466e-05, + "loss": 0.9433, "step": 1212 }, { - "epoch": 0.8, - "learning_rate": 1.3666169794373141e-05, - "loss": 1.2672, + "epoch": 0.12694924123495552, + "grad_norm": 2.678105912674675, + "learning_rate": 1.951140279816161e-05, + "loss": 1.0967, "step": 1213 }, { - "epoch": 0.8, - "learning_rate": 1.3656246293740767e-05, - "loss": 1.1888, + "epoch": 0.12705389848246992, + "grad_norm": 2.837977652142853, + "learning_rate": 1.951035564486829e-05, + "loss": 1.0051, "step": 1214 }, { - "epoch": 0.8, - "learning_rate": 1.36463186352503e-05, - "loss": 1.3704, + "epoch": 0.1271585557299843, + "grad_norm": 2.836479859442291, + "learning_rate": 1.950930739881583e-05, + "loss": 1.0707, "step": 1215 }, { - "epoch": 0.8, - "learning_rate": 1.3636386830191409e-05, - "loss": 1.2579, + "epoch": 0.12726321297749868, + "grad_norm": 2.2675840070291633, + "learning_rate": 1.9508258060124668e-05, + "loss": 1.0395, "step": 1216 }, { - "epoch": 0.8, - "learning_rate": 1.3626450889858476e-05, - "loss": 1.3641, + "epoch": 0.12736787022501309, + "grad_norm": 3.2573289605017974, + "learning_rate": 1.950720762891538e-05, + "loss": 1.1698, "step": 1217 }, { - "epoch": 0.8, - "learning_rate": 1.3616510825550582e-05, - "loss": 1.2377, + "epoch": 0.12747252747252746, + "grad_norm": 2.252961963832606, + "learning_rate": 1.9506156105308665e-05, + "loss": 0.8967, "step": 1218 }, { - "epoch": 0.8, - "learning_rate": 1.36065666485715e-05, - "loss": 1.3744, + "epoch": 0.12757718472004187, + "grad_norm": 1.8906880106435673, + "learning_rate": 1.950510348942534e-05, + "loss": 0.8001, "step": 1219 }, { - "epoch": 0.8, - "learning_rate": 1.3596618370229679e-05, - "loss": 1.3327, + "epoch": 0.12768184196755625, + "grad_norm": 2.5679268387367373, + "learning_rate": 1.9504049781386358e-05, + "loss": 1.0456, "step": 1220 }, { - "epoch": 0.8, - "learning_rate": 1.3586666001838231e-05, - "loss": 1.2221, + "epoch": 0.12778649921507065, + "grad_norm": 2.742942369429627, + "learning_rate": 1.9502994981312785e-05, + "loss": 1.0444, "step": 1221 }, { - "epoch": 0.8, - "learning_rate": 1.3576709554714923e-05, - "loss": 1.27, + "epoch": 0.12789115646258503, + "grad_norm": 2.5837062790653302, + "learning_rate": 1.950193908932583e-05, + "loss": 1.0361, "step": 1222 }, { - "epoch": 0.81, - "learning_rate": 1.356674904018216e-05, - "loss": 1.3184, + "epoch": 0.12799581371009944, + "grad_norm": 2.601717403532944, + "learning_rate": 1.9500882105546806e-05, + "loss": 0.9211, "step": 1223 }, { - "epoch": 0.81, - "learning_rate": 1.3556784469566968e-05, - "loss": 1.3058, + "epoch": 0.12810047095761382, + "grad_norm": 2.3596656749420317, + "learning_rate": 1.9499824030097173e-05, + "loss": 0.9949, "step": 1224 }, { - "epoch": 0.81, - "learning_rate": 1.3546815854200987e-05, - "loss": 1.297, + "epoch": 0.1282051282051282, + "grad_norm": 2.431051750501933, + "learning_rate": 1.9498764863098494e-05, + "loss": 1.111, "step": 1225 }, { - "epoch": 0.81, - "learning_rate": 1.353684320542046e-05, - "loss": 1.3612, + "epoch": 0.1283097854526426, + "grad_norm": 2.465871960006348, + "learning_rate": 1.949770460467248e-05, + "loss": 1.0887, "step": 1226 }, { - "epoch": 0.81, - "learning_rate": 1.3526866534566213e-05, - "loss": 1.2144, + "epoch": 0.12841444270015698, + "grad_norm": 2.3804735150810883, + "learning_rate": 1.9496643254940952e-05, + "loss": 1.0723, "step": 1227 }, { - "epoch": 0.81, - "learning_rate": 1.3516885852983645e-05, - "loss": 1.3782, + "epoch": 0.12851909994767138, + "grad_norm": 2.5375697941496265, + "learning_rate": 1.9495580814025864e-05, + "loss": 0.9679, "step": 1228 }, { - "epoch": 0.81, - "learning_rate": 1.3506901172022724e-05, - "loss": 1.3937, + "epoch": 0.12862375719518576, + "grad_norm": 2.3461082467161565, + "learning_rate": 1.949451728204929e-05, + "loss": 0.9353, "step": 1229 }, { - "epoch": 0.81, - "learning_rate": 1.3496912503037959e-05, - "loss": 1.3548, + "epoch": 0.12872841444270017, + "grad_norm": 2.770975522225647, + "learning_rate": 1.9493452659133437e-05, + "loss": 1.0304, "step": 1230 }, { - "epoch": 0.81, - "learning_rate": 1.3486919857388392e-05, - "loss": 1.3579, + "epoch": 0.12883307169021455, + "grad_norm": 2.3083829511711396, + "learning_rate": 1.9492386945400623e-05, + "loss": 0.9771, "step": 1231 }, { - "epoch": 0.81, - "learning_rate": 1.347692324643759e-05, - "loss": 1.2221, + "epoch": 0.12893772893772895, + "grad_norm": 2.697784666924789, + "learning_rate": 1.9491320140973305e-05, + "loss": 1.1808, "step": 1232 }, { - "epoch": 0.81, - "learning_rate": 1.3466922681553636e-05, - "loss": 1.2839, + "epoch": 0.12904238618524333, + "grad_norm": 2.8520274235768532, + "learning_rate": 1.9490252245974062e-05, + "loss": 1.0209, "step": 1233 }, { - "epoch": 0.81, - "learning_rate": 1.3456918174109102e-05, - "loss": 1.2777, + "epoch": 0.1291470434327577, + "grad_norm": 2.2734770595974276, + "learning_rate": 1.9489183260525603e-05, + "loss": 1.0474, "step": 1234 }, { - "epoch": 0.81, - "learning_rate": 1.344690973548104e-05, - "loss": 1.2537, + "epoch": 0.1292517006802721, + "grad_norm": 2.645718364092188, + "learning_rate": 1.9488113184750747e-05, + "loss": 1.064, "step": 1235 }, { - "epoch": 0.81, - "learning_rate": 1.3436897377050982e-05, - "loss": 1.3699, + "epoch": 0.1293563579277865, + "grad_norm": 2.41791085736045, + "learning_rate": 1.9487042018772452e-05, + "loss": 1.0731, "step": 1236 }, { - "epoch": 0.81, - "learning_rate": 1.342688111020491e-05, - "loss": 1.248, + "epoch": 0.1294610151753009, + "grad_norm": 2.527589108825978, + "learning_rate": 1.9485969762713797e-05, + "loss": 1.082, "step": 1237 }, { - "epoch": 0.82, - "learning_rate": 1.3416860946333256e-05, - "loss": 1.3886, + "epoch": 0.12956567242281528, + "grad_norm": 2.779752862819606, + "learning_rate": 1.9484896416697983e-05, + "loss": 0.9216, "step": 1238 }, { - "epoch": 0.82, - "learning_rate": 1.3406836896830884e-05, - "loss": 1.1917, + "epoch": 0.12967032967032968, + "grad_norm": 2.732412932188138, + "learning_rate": 1.9483821980848346e-05, + "loss": 1.1465, "step": 1239 }, { - "epoch": 0.82, - "learning_rate": 1.3396808973097066e-05, - "loss": 1.2582, + "epoch": 0.12977498691784406, + "grad_norm": 2.4729282241151935, + "learning_rate": 1.948274645528834e-05, + "loss": 0.9567, "step": 1240 }, { - "epoch": 0.82, - "learning_rate": 1.3386777186535495e-05, - "loss": 1.2666, + "epoch": 0.12987964416535844, + "grad_norm": 2.6626696165070056, + "learning_rate": 1.948166984014154e-05, + "loss": 0.9809, "step": 1241 }, { - "epoch": 0.82, - "learning_rate": 1.3376741548554248e-05, - "loss": 1.1695, + "epoch": 0.12998430141287284, + "grad_norm": 2.6256425374078676, + "learning_rate": 1.9480592135531654e-05, + "loss": 1.0356, "step": 1242 }, { - "epoch": 0.82, - "learning_rate": 1.3366702070565775e-05, - "loss": 1.2851, + "epoch": 0.13008895866038722, + "grad_norm": 2.692113072642092, + "learning_rate": 1.9479513341582513e-05, + "loss": 1.028, "step": 1243 }, { - "epoch": 0.82, - "learning_rate": 1.3356658763986919e-05, - "loss": 1.1393, + "epoch": 0.13019361590790163, + "grad_norm": 2.834736069467216, + "learning_rate": 1.947843345841807e-05, + "loss": 0.9586, "step": 1244 }, { - "epoch": 0.82, - "learning_rate": 1.334661164023884e-05, - "loss": 1.3551, + "epoch": 0.130298273155416, + "grad_norm": 2.7981103330661568, + "learning_rate": 1.947735248616241e-05, + "loss": 1.0169, "step": 1245 }, { - "epoch": 0.82, - "learning_rate": 1.3336560710747073e-05, - "loss": 1.2921, + "epoch": 0.1304029304029304, + "grad_norm": 2.6541080453909545, + "learning_rate": 1.9476270424939736e-05, + "loss": 1.0977, "step": 1246 }, { - "epoch": 0.82, - "learning_rate": 1.3326505986941456e-05, - "loss": 1.2649, + "epoch": 0.1305075876504448, + "grad_norm": 2.6432248199471005, + "learning_rate": 1.9475187274874382e-05, + "loss": 1.1162, "step": 1247 }, { - "epoch": 0.82, - "learning_rate": 1.3316447480256153e-05, - "loss": 1.4216, + "epoch": 0.1306122448979592, + "grad_norm": 2.655070618581444, + "learning_rate": 1.9474103036090795e-05, + "loss": 1.0413, "step": 1248 }, { - "epoch": 0.82, - "learning_rate": 1.3306385202129633e-05, - "loss": 1.2133, + "epoch": 0.13071690214547357, + "grad_norm": 3.197415853957612, + "learning_rate": 1.947301770871357e-05, + "loss": 1.0412, "step": 1249 }, { - "epoch": 0.82, - "learning_rate": 1.3296319164004646e-05, - "loss": 1.1427, + "epoch": 0.13082155939298795, + "grad_norm": 2.592069054029112, + "learning_rate": 1.9471931292867405e-05, + "loss": 1.0165, "step": 1250 }, { - "epoch": 0.82, - "learning_rate": 1.3286249377328222e-05, - "loss": 1.0525, + "epoch": 0.13092621664050236, + "grad_norm": 3.655152675264706, + "learning_rate": 1.9470843788677132e-05, + "loss": 0.9054, "step": 1251 }, { - "epoch": 0.82, - "learning_rate": 1.327617585355165e-05, - "loss": 1.2209, + "epoch": 0.13103087388801674, + "grad_norm": 2.882892532973182, + "learning_rate": 1.9469755196267706e-05, + "loss": 1.1792, "step": 1252 }, { - "epoch": 0.82, - "learning_rate": 1.3266098604130477e-05, - "loss": 1.2439, + "epoch": 0.13113553113553114, + "grad_norm": 2.6595064843196954, + "learning_rate": 1.9468665515764216e-05, + "loss": 1.0301, "step": 1253 }, { - "epoch": 0.83, - "learning_rate": 1.325601764052448e-05, - "loss": 1.3988, + "epoch": 0.13124018838304552, + "grad_norm": 2.3865070700069158, + "learning_rate": 1.9467574747291862e-05, + "loss": 0.9721, "step": 1254 }, { - "epoch": 0.83, - "learning_rate": 1.324593297419766e-05, - "loss": 1.3372, + "epoch": 0.13134484563055993, + "grad_norm": 2.723979611241111, + "learning_rate": 1.9466482890975975e-05, + "loss": 1.1416, "step": 1255 }, { - "epoch": 0.83, - "learning_rate": 1.323584461661823e-05, - "loss": 1.1898, + "epoch": 0.1314495028780743, + "grad_norm": 2.605292005226839, + "learning_rate": 1.9465389946942013e-05, + "loss": 1.1899, "step": 1256 }, { - "epoch": 0.83, - "learning_rate": 1.3225752579258599e-05, - "loss": 1.2429, + "epoch": 0.1315541601255887, + "grad_norm": 2.5605584808263675, + "learning_rate": 1.9464295915315555e-05, + "loss": 1.139, "step": 1257 }, { - "epoch": 0.83, - "learning_rate": 1.3215656873595366e-05, - "loss": 1.2746, + "epoch": 0.1316588173731031, + "grad_norm": 2.6247600589750046, + "learning_rate": 1.9463200796222318e-05, + "loss": 1.1418, "step": 1258 }, { - "epoch": 0.83, - "learning_rate": 1.3205557511109298e-05, - "loss": 1.1461, + "epoch": 0.13176347462061747, + "grad_norm": 2.0595672766120896, + "learning_rate": 1.946210458978812e-05, + "loss": 1.0889, "step": 1259 }, { - "epoch": 0.83, - "learning_rate": 1.319545450328532e-05, - "loss": 1.1718, + "epoch": 0.13186813186813187, + "grad_norm": 2.3474913936763313, + "learning_rate": 1.9461007296138925e-05, + "loss": 1.0599, "step": 1260 }, { - "epoch": 0.83, - "learning_rate": 1.3185347861612501e-05, - "loss": 1.2531, + "epoch": 0.13197278911564625, + "grad_norm": 2.9581647619226685, + "learning_rate": 1.945990891540081e-05, + "loss": 0.9018, "step": 1261 }, { - "epoch": 0.83, - "learning_rate": 1.3175237597584045e-05, - "loss": 1.2999, + "epoch": 0.13207744636316066, + "grad_norm": 2.984957639865946, + "learning_rate": 1.9458809447699985e-05, + "loss": 1.1799, "step": 1262 }, { - "epoch": 0.83, - "learning_rate": 1.3165123722697276e-05, - "loss": 1.2824, + "epoch": 0.13218210361067503, + "grad_norm": 2.772705342436505, + "learning_rate": 1.945770889316278e-05, + "loss": 1.0051, "step": 1263 }, { - "epoch": 0.83, - "learning_rate": 1.3155006248453615e-05, - "loss": 1.16, + "epoch": 0.13228676085818944, + "grad_norm": 2.6058433960187894, + "learning_rate": 1.945660725191565e-05, + "loss": 1.0885, "step": 1264 }, { - "epoch": 0.83, - "learning_rate": 1.3144885186358596e-05, - "loss": 1.3179, + "epoch": 0.13239141810570382, + "grad_norm": 2.7920782381998035, + "learning_rate": 1.9455504524085174e-05, + "loss": 0.9356, "step": 1265 }, { - "epoch": 0.83, - "learning_rate": 1.3134760547921814e-05, - "loss": 1.234, + "epoch": 0.13249607535321822, + "grad_norm": 2.6926709248661966, + "learning_rate": 1.9454400709798063e-05, + "loss": 1.0235, "step": 1266 }, { - "epoch": 0.83, - "learning_rate": 1.3124632344656943e-05, - "loss": 1.3048, + "epoch": 0.1326007326007326, + "grad_norm": 2.306933420813203, + "learning_rate": 1.9453295809181144e-05, + "loss": 1.0709, "step": 1267 }, { - "epoch": 0.83, - "learning_rate": 1.3114500588081698e-05, - "loss": 1.2577, + "epoch": 0.13270538984824698, + "grad_norm": 2.400935059767989, + "learning_rate": 1.9452189822361368e-05, + "loss": 1.0141, "step": 1268 }, { - "epoch": 0.84, - "learning_rate": 1.3104365289717849e-05, - "loss": 1.2236, + "epoch": 0.13281004709576139, + "grad_norm": 2.706220647073549, + "learning_rate": 1.9451082749465822e-05, + "loss": 0.9398, "step": 1269 }, { - "epoch": 0.84, - "learning_rate": 1.3094226461091186e-05, - "loss": 1.2877, + "epoch": 0.13291470434327576, + "grad_norm": 2.5158658128425015, + "learning_rate": 1.9449974590621713e-05, + "loss": 1.1766, "step": 1270 }, { - "epoch": 0.84, - "learning_rate": 1.3084084113731514e-05, - "loss": 1.3343, + "epoch": 0.13301936159079017, + "grad_norm": 2.506227273861366, + "learning_rate": 1.9448865345956364e-05, + "loss": 1.0084, "step": 1271 }, { - "epoch": 0.84, - "learning_rate": 1.3073938259172645e-05, - "loss": 1.242, + "epoch": 0.13312401883830455, + "grad_norm": 2.2919348376307402, + "learning_rate": 1.944775501559723e-05, + "loss": 0.9608, "step": 1272 }, { - "epoch": 0.84, - "learning_rate": 1.3063788908952368e-05, - "loss": 1.3352, + "epoch": 0.13322867608581895, + "grad_norm": 2.9193041380716545, + "learning_rate": 1.9446643599671895e-05, + "loss": 0.8943, "step": 1273 }, { - "epoch": 0.84, - "learning_rate": 1.3053636074612457e-05, - "loss": 1.3005, + "epoch": 0.13333333333333333, + "grad_norm": 2.615394555513494, + "learning_rate": 1.944553109830806e-05, + "loss": 1.2087, "step": 1274 }, { - "epoch": 0.84, - "learning_rate": 1.3043479767698646e-05, - "loss": 1.1926, + "epoch": 0.13343799058084774, + "grad_norm": 2.489254352916976, + "learning_rate": 1.944441751163355e-05, + "loss": 0.9713, "step": 1275 }, { - "epoch": 0.84, - "learning_rate": 1.3033319999760619e-05, - "loss": 1.2242, + "epoch": 0.13354264782836212, + "grad_norm": 2.841255305838923, + "learning_rate": 1.9443302839776327e-05, + "loss": 1.0022, "step": 1276 }, { - "epoch": 0.84, - "learning_rate": 1.3023156782351989e-05, - "loss": 1.2421, + "epoch": 0.1336473050758765, + "grad_norm": 2.5946261317323507, + "learning_rate": 1.944218708286446e-05, + "loss": 1.0554, "step": 1277 }, { - "epoch": 0.84, - "learning_rate": 1.30129901270303e-05, - "loss": 1.158, + "epoch": 0.1337519623233909, + "grad_norm": 2.284752575634108, + "learning_rate": 1.944107024102616e-05, + "loss": 1.0401, "step": 1278 }, { - "epoch": 0.84, - "learning_rate": 1.3002820045357005e-05, - "loss": 1.2962, + "epoch": 0.13385661957090528, + "grad_norm": 2.9886804389402735, + "learning_rate": 1.943995231438975e-05, + "loss": 1.0555, "step": 1279 }, { - "epoch": 0.84, - "learning_rate": 1.2992646548897442e-05, - "loss": 1.1178, + "epoch": 0.13396127681841968, + "grad_norm": 2.602163456273472, + "learning_rate": 1.9438833303083677e-05, + "loss": 1.0488, "step": 1280 }, { - "epoch": 0.84, - "learning_rate": 1.2982469649220854e-05, - "loss": 1.3417, + "epoch": 0.13406593406593406, + "grad_norm": 2.485141727539193, + "learning_rate": 1.9437713207236525e-05, + "loss": 1.04, "step": 1281 }, { - "epoch": 0.84, - "learning_rate": 1.2972289357900332e-05, - "loss": 1.2901, + "epoch": 0.13417059131344847, + "grad_norm": 2.727354509959165, + "learning_rate": 1.9436592026976994e-05, + "loss": 1.0357, "step": 1282 }, { - "epoch": 0.84, - "learning_rate": 1.2962105686512834e-05, - "loss": 1.2904, + "epoch": 0.13427524856096285, + "grad_norm": 2.3638486724054513, + "learning_rate": 1.943546976243391e-05, + "loss": 1.1289, "step": 1283 }, { - "epoch": 0.85, - "learning_rate": 1.2951918646639163e-05, - "loss": 1.3182, + "epoch": 0.13437990580847722, + "grad_norm": 2.717669536906904, + "learning_rate": 1.943434641373622e-05, + "loss": 1.0574, "step": 1284 }, { - "epoch": 0.85, - "learning_rate": 1.294172824986395e-05, - "loss": 1.226, + "epoch": 0.13448456305599163, + "grad_norm": 2.2734518432998914, + "learning_rate": 1.9433221981013007e-05, + "loss": 1.0728, "step": 1285 }, { - "epoch": 0.85, - "learning_rate": 1.293153450777564e-05, - "loss": 1.2812, + "epoch": 0.134589220303506, + "grad_norm": 2.6630965958749533, + "learning_rate": 1.9432096464393463e-05, + "loss": 1.061, "step": 1286 }, { - "epoch": 0.85, - "learning_rate": 1.2921337431966494e-05, - "loss": 1.4035, + "epoch": 0.1346938775510204, + "grad_norm": 3.1093231373052923, + "learning_rate": 1.943096986400691e-05, + "loss": 1.0387, "step": 1287 }, { - "epoch": 0.85, - "learning_rate": 1.2911137034032553e-05, - "loss": 1.3059, + "epoch": 0.1347985347985348, + "grad_norm": 2.6785202972118682, + "learning_rate": 1.942984217998281e-05, + "loss": 1.021, "step": 1288 }, { - "epoch": 0.85, - "learning_rate": 1.2900933325573636e-05, - "loss": 1.3659, + "epoch": 0.1349031920460492, + "grad_norm": 2.5540937962722543, + "learning_rate": 1.9428713412450718e-05, + "loss": 1.0165, "step": 1289 }, { - "epoch": 0.85, - "learning_rate": 1.2890726318193332e-05, - "loss": 1.2342, + "epoch": 0.13500784929356358, + "grad_norm": 2.3166372688832726, + "learning_rate": 1.9427583561540344e-05, + "loss": 1.0847, "step": 1290 }, { - "epoch": 0.85, - "learning_rate": 1.2880516023498981e-05, - "loss": 1.4161, + "epoch": 0.13511250654107798, + "grad_norm": 2.301232179651965, + "learning_rate": 1.942645262738151e-05, + "loss": 1.1781, "step": 1291 }, { - "epoch": 0.85, - "learning_rate": 1.2870302453101657e-05, - "loss": 1.2108, + "epoch": 0.13521716378859236, + "grad_norm": 2.3252095241169077, + "learning_rate": 1.9425320610104157e-05, + "loss": 1.0947, "step": 1292 }, { - "epoch": 0.85, - "learning_rate": 1.2860085618616169e-05, - "loss": 1.3012, + "epoch": 0.13532182103610674, + "grad_norm": 2.493316820257742, + "learning_rate": 1.942418750983836e-05, + "loss": 1.0104, "step": 1293 }, { - "epoch": 0.85, - "learning_rate": 1.284986553166102e-05, - "loss": 1.2797, + "epoch": 0.13542647828362114, + "grad_norm": 2.3545726405840153, + "learning_rate": 1.942305332671431e-05, + "loss": 0.9365, "step": 1294 }, { - "epoch": 0.85, - "learning_rate": 1.2839642203858433e-05, - "loss": 1.2384, + "epoch": 0.13553113553113552, + "grad_norm": 2.3550828896798324, + "learning_rate": 1.9421918060862333e-05, + "loss": 1.1441, "step": 1295 }, { - "epoch": 0.85, - "learning_rate": 1.2829415646834295e-05, - "loss": 1.3128, + "epoch": 0.13563579277864993, + "grad_norm": 2.6556094154616146, + "learning_rate": 1.942078171241287e-05, + "loss": 1.0687, "step": 1296 }, { - "epoch": 0.85, - "learning_rate": 1.2819185872218184e-05, - "loss": 1.2666, + "epoch": 0.1357404500261643, + "grad_norm": 2.5889079401056017, + "learning_rate": 1.9419644281496492e-05, + "loss": 1.036, "step": 1297 }, { - "epoch": 0.85, - "learning_rate": 1.2808952891643327e-05, - "loss": 1.3178, + "epoch": 0.1358451072736787, + "grad_norm": 2.4413416204425222, + "learning_rate": 1.9418505768243888e-05, + "loss": 1.1142, "step": 1298 }, { - "epoch": 0.86, - "learning_rate": 1.2798716716746602e-05, - "loss": 1.2278, + "epoch": 0.1359497645211931, + "grad_norm": 2.2226669500183753, + "learning_rate": 1.941736617278588e-05, + "loss": 1.0729, "step": 1299 }, { - "epoch": 0.86, - "learning_rate": 1.2788477359168512e-05, - "loss": 1.3281, + "epoch": 0.1360544217687075, + "grad_norm": 2.3250136278743234, + "learning_rate": 1.9416225495253406e-05, + "loss": 1.0012, "step": 1300 }, { - "epoch": 0.86, - "learning_rate": 1.2778234830553183e-05, - "loss": 1.1671, + "epoch": 0.13615907901622187, + "grad_norm": 2.6455616428189743, + "learning_rate": 1.9415083735777535e-05, + "loss": 0.9987, "step": 1301 }, { - "epoch": 0.86, - "learning_rate": 1.276798914254835e-05, - "loss": 1.3176, + "epoch": 0.13626373626373625, + "grad_norm": 2.515570541189039, + "learning_rate": 1.9413940894489456e-05, + "loss": 0.9546, "step": 1302 }, { - "epoch": 0.86, - "learning_rate": 1.2757740306805345e-05, - "loss": 1.2479, + "epoch": 0.13636839351125066, + "grad_norm": 2.3610892292812014, + "learning_rate": 1.9412796971520486e-05, + "loss": 1.002, "step": 1303 }, { - "epoch": 0.86, - "learning_rate": 1.2747488334979064e-05, - "loss": 1.176, + "epoch": 0.13647305075876504, + "grad_norm": 2.351486484199385, + "learning_rate": 1.9411651967002057e-05, + "loss": 0.952, "step": 1304 }, { - "epoch": 0.86, - "learning_rate": 1.2737233238727982e-05, - "loss": 1.3775, + "epoch": 0.13657770800627944, + "grad_norm": 2.2806059320082377, + "learning_rate": 1.9410505881065745e-05, + "loss": 1.0238, "step": 1305 }, { - "epoch": 0.86, - "learning_rate": 1.2726975029714124e-05, - "loss": 1.3324, + "epoch": 0.13668236525379382, + "grad_norm": 2.8037668411236947, + "learning_rate": 1.9409358713843226e-05, + "loss": 0.8635, "step": 1306 }, { - "epoch": 0.86, - "learning_rate": 1.2716713719603053e-05, - "loss": 1.23, + "epoch": 0.13678702250130823, + "grad_norm": 2.354476426007749, + "learning_rate": 1.9408210465466315e-05, + "loss": 1.0029, "step": 1307 }, { - "epoch": 0.86, - "learning_rate": 1.2706449320063866e-05, - "loss": 1.21, + "epoch": 0.1368916797488226, + "grad_norm": 2.58570996843689, + "learning_rate": 1.9407061136066952e-05, + "loss": 1.1598, "step": 1308 }, { - "epoch": 0.86, - "learning_rate": 1.2696181842769161e-05, - "loss": 1.3184, + "epoch": 0.136996336996337, + "grad_norm": 2.2297849493454174, + "learning_rate": 1.940591072577719e-05, + "loss": 1.0518, "step": 1309 }, { - "epoch": 0.86, - "learning_rate": 1.2685911299395046e-05, - "loss": 1.1431, + "epoch": 0.1371009942438514, + "grad_norm": 2.4744470445442968, + "learning_rate": 1.9404759234729224e-05, + "loss": 1.0939, "step": 1310 }, { - "epoch": 0.86, - "learning_rate": 1.2675637701621111e-05, - "loss": 1.2324, + "epoch": 0.13720565149136577, + "grad_norm": 2.597406306328376, + "learning_rate": 1.940360666305535e-05, + "loss": 1.122, "step": 1311 }, { - "epoch": 0.86, - "learning_rate": 1.2665361061130416e-05, - "loss": 1.4506, + "epoch": 0.13731030873888017, + "grad_norm": 2.24297119060127, + "learning_rate": 1.9402453010888013e-05, + "loss": 0.9809, "step": 1312 }, { - "epoch": 0.86, - "learning_rate": 1.2655081389609496e-05, - "loss": 1.2632, + "epoch": 0.13741496598639455, + "grad_norm": 2.3878513385258633, + "learning_rate": 1.940129827835976e-05, + "loss": 1.1173, "step": 1313 }, { - "epoch": 0.87, - "learning_rate": 1.2644798698748314e-05, - "loss": 1.2199, + "epoch": 0.13751962323390896, + "grad_norm": 2.195222848508457, + "learning_rate": 1.940014246560328e-05, + "loss": 1.0293, "step": 1314 }, { - "epoch": 0.87, - "learning_rate": 1.2634513000240277e-05, - "loss": 1.1961, + "epoch": 0.13762428048142333, + "grad_norm": 2.266044481844879, + "learning_rate": 1.939898557275137e-05, + "loss": 0.9389, "step": 1315 }, { - "epoch": 0.87, - "learning_rate": 1.2624224305782215e-05, - "loss": 1.2649, + "epoch": 0.13772893772893774, + "grad_norm": 2.227326516279883, + "learning_rate": 1.9397827599936967e-05, + "loss": 1.0587, "step": 1316 }, { - "epoch": 0.87, - "learning_rate": 1.2613932627074356e-05, - "loss": 1.3104, + "epoch": 0.13783359497645212, + "grad_norm": 2.3264930899908216, + "learning_rate": 1.9396668547293123e-05, + "loss": 1.0047, "step": 1317 }, { - "epoch": 0.87, - "learning_rate": 1.2603637975820324e-05, - "loss": 1.0901, + "epoch": 0.13793825222396652, + "grad_norm": 2.7272328130888543, + "learning_rate": 1.9395508414953014e-05, + "loss": 1.0649, "step": 1318 }, { - "epoch": 0.87, - "learning_rate": 1.2593340363727131e-05, - "loss": 1.3713, + "epoch": 0.1380429094714809, + "grad_norm": 2.317310994073378, + "learning_rate": 1.9394347203049935e-05, + "loss": 0.8952, "step": 1319 }, { - "epoch": 0.87, - "learning_rate": 1.2583039802505148e-05, - "loss": 1.2994, + "epoch": 0.13814756671899528, + "grad_norm": 2.7425995244638863, + "learning_rate": 1.9393184911717325e-05, + "loss": 1.0672, "step": 1320 }, { - "epoch": 0.87, - "learning_rate": 1.2572736303868102e-05, - "loss": 1.307, + "epoch": 0.13825222396650969, + "grad_norm": 3.3758460756291897, + "learning_rate": 1.9392021541088723e-05, + "loss": 1.133, "step": 1321 }, { - "epoch": 0.87, - "learning_rate": 1.256242987953306e-05, - "loss": 1.2154, + "epoch": 0.13835688121402406, + "grad_norm": 2.316620489795767, + "learning_rate": 1.939085709129781e-05, + "loss": 0.9964, "step": 1322 }, { - "epoch": 0.87, - "learning_rate": 1.2552120541220419e-05, - "loss": 1.1019, + "epoch": 0.13846153846153847, + "grad_norm": 2.3989333216450355, + "learning_rate": 1.9389691562478375e-05, + "loss": 1.111, "step": 1323 }, { - "epoch": 0.87, - "learning_rate": 1.2541808300653883e-05, - "loss": 1.2299, + "epoch": 0.13856619570905285, + "grad_norm": 2.562438640327413, + "learning_rate": 1.9388524954764345e-05, + "loss": 0.9959, "step": 1324 }, { - "epoch": 0.87, - "learning_rate": 1.253149316956047e-05, - "loss": 1.1303, + "epoch": 0.13867085295656725, + "grad_norm": 2.4165917099961782, + "learning_rate": 1.938735726828977e-05, + "loss": 1.1408, "step": 1325 }, { - "epoch": 0.87, - "learning_rate": 1.2521175159670464e-05, - "loss": 1.3876, + "epoch": 0.13877551020408163, + "grad_norm": 2.5972911347326297, + "learning_rate": 1.938618850318881e-05, + "loss": 1.0121, "step": 1326 }, { - "epoch": 0.87, - "learning_rate": 1.2510854282717444e-05, - "loss": 1.2807, + "epoch": 0.138880167451596, + "grad_norm": 2.178384853061503, + "learning_rate": 1.9385018659595768e-05, + "loss": 0.9381, "step": 1327 }, { - "epoch": 0.87, - "learning_rate": 1.2500530550438232e-05, - "loss": 1.1717, + "epoch": 0.13898482469911042, + "grad_norm": 2.341869108186974, + "learning_rate": 1.9383847737645053e-05, + "loss": 1.0985, "step": 1328 }, { - "epoch": 0.87, - "learning_rate": 1.2490203974572915e-05, - "loss": 1.3065, + "epoch": 0.1390894819466248, + "grad_norm": 2.211733194769639, + "learning_rate": 1.938267573747121e-05, + "loss": 0.9999, "step": 1329 }, { - "epoch": 0.88, - "learning_rate": 1.2479874566864799e-05, - "loss": 1.2305, + "epoch": 0.1391941391941392, + "grad_norm": 2.4888824976978463, + "learning_rate": 1.9381502659208903e-05, + "loss": 1.1316, "step": 1330 }, { - "epoch": 0.88, - "learning_rate": 1.2469542339060414e-05, - "loss": 1.0857, + "epoch": 0.13929879644165358, + "grad_norm": 2.5487250145244755, + "learning_rate": 1.9380328502992926e-05, + "loss": 1.1004, "step": 1331 }, { - "epoch": 0.88, - "learning_rate": 1.2459207302909499e-05, - "loss": 1.1372, + "epoch": 0.13940345368916798, + "grad_norm": 2.8605454756354027, + "learning_rate": 1.9379153268958183e-05, + "loss": 0.86, "step": 1332 }, { - "epoch": 0.88, - "learning_rate": 1.2448869470164988e-05, - "loss": 1.4061, + "epoch": 0.13950811093668236, + "grad_norm": 2.275799199333925, + "learning_rate": 1.937797695723972e-05, + "loss": 1.0387, "step": 1333 }, { - "epoch": 0.88, - "learning_rate": 1.243852885258299e-05, - "loss": 1.3289, + "epoch": 0.13961276818419677, + "grad_norm": 2.256873395213135, + "learning_rate": 1.9376799567972692e-05, + "loss": 0.9848, "step": 1334 }, { - "epoch": 0.88, - "learning_rate": 1.2428185461922783e-05, - "loss": 1.3146, + "epoch": 0.13971742543171115, + "grad_norm": 2.664507783387787, + "learning_rate": 1.9375621101292386e-05, + "loss": 1.0539, "step": 1335 }, { - "epoch": 0.88, - "learning_rate": 1.2417839309946806e-05, - "loss": 1.3308, + "epoch": 0.13982208267922552, + "grad_norm": 2.571479041795175, + "learning_rate": 1.9374441557334206e-05, + "loss": 1.0578, "step": 1336 }, { - "epoch": 0.88, - "learning_rate": 1.2407490408420629e-05, - "loss": 1.1638, + "epoch": 0.13992673992673993, + "grad_norm": 2.6718984503316623, + "learning_rate": 1.9373260936233692e-05, + "loss": 1.0356, "step": 1337 }, { - "epoch": 0.88, - "learning_rate": 1.2397138769112949e-05, - "loss": 1.1167, + "epoch": 0.1400313971742543, + "grad_norm": 3.01919139078155, + "learning_rate": 1.937207923812649e-05, + "loss": 1.0371, "step": 1338 }, { - "epoch": 0.88, - "learning_rate": 1.2386784403795577e-05, - "loss": 1.1957, + "epoch": 0.1401360544217687, + "grad_norm": 2.267242964618876, + "learning_rate": 1.9370896463148387e-05, + "loss": 0.9883, "step": 1339 }, { - "epoch": 0.88, - "learning_rate": 1.2376427324243432e-05, - "loss": 1.2913, + "epoch": 0.1402407116692831, + "grad_norm": 2.513036526104354, + "learning_rate": 1.9369712611435285e-05, + "loss": 1.073, "step": 1340 }, { - "epoch": 0.88, - "learning_rate": 1.2366067542234512e-05, - "loss": 1.3546, + "epoch": 0.1403453689167975, + "grad_norm": 2.3727052030885663, + "learning_rate": 1.9368527683123205e-05, + "loss": 0.8568, "step": 1341 }, { - "epoch": 0.88, - "learning_rate": 1.2355705069549885e-05, - "loss": 1.2994, + "epoch": 0.14045002616431188, + "grad_norm": 2.3202546866710585, + "learning_rate": 1.9367341678348307e-05, + "loss": 1.0071, "step": 1342 }, { - "epoch": 0.88, - "learning_rate": 1.2345339917973688e-05, - "loss": 1.4036, + "epoch": 0.14055468341182628, + "grad_norm": 2.689452076940995, + "learning_rate": 1.9366154597246856e-05, + "loss": 1.1804, "step": 1343 }, { - "epoch": 0.88, - "learning_rate": 1.2334972099293102e-05, - "loss": 1.2087, + "epoch": 0.14065934065934066, + "grad_norm": 2.742100535424126, + "learning_rate": 1.936496643995526e-05, + "loss": 0.9589, "step": 1344 }, { - "epoch": 0.89, - "learning_rate": 1.2324601625298331e-05, - "loss": 1.4056, + "epoch": 0.14076399790685504, + "grad_norm": 2.481796517485657, + "learning_rate": 1.936377720661003e-05, + "loss": 1.0471, "step": 1345 }, { - "epoch": 0.89, - "learning_rate": 1.2314228507782614e-05, - "loss": 1.2082, + "epoch": 0.14086865515436944, + "grad_norm": 2.5501058876317995, + "learning_rate": 1.936258689734782e-05, + "loss": 0.9166, "step": 1346 }, { - "epoch": 0.89, - "learning_rate": 1.2303852758542187e-05, - "loss": 1.4642, + "epoch": 0.14097331240188382, + "grad_norm": 2.336496862440958, + "learning_rate": 1.9361395512305395e-05, + "loss": 1.088, "step": 1347 }, { - "epoch": 0.89, - "learning_rate": 1.2293474389376281e-05, - "loss": 1.2091, + "epoch": 0.14107796964939823, + "grad_norm": 2.8026304750796487, + "learning_rate": 1.9360203051619648e-05, + "loss": 1.084, "step": 1348 }, { - "epoch": 0.89, - "learning_rate": 1.2283093412087099e-05, - "loss": 1.2325, + "epoch": 0.1411826268969126, + "grad_norm": 3.299010180332248, + "learning_rate": 1.9359009515427595e-05, + "loss": 1.1847, "step": 1349 }, { - "epoch": 0.89, - "learning_rate": 1.2272709838479824e-05, - "loss": 1.2466, + "epoch": 0.141287284144427, + "grad_norm": 2.7958593548788655, + "learning_rate": 1.9357814903866376e-05, + "loss": 1.0987, "step": 1350 }, { - "epoch": 0.89, - "learning_rate": 1.2262323680362587e-05, - "loss": 1.2338, + "epoch": 0.1413919413919414, + "grad_norm": 2.3205373831985274, + "learning_rate": 1.9356619217073252e-05, + "loss": 0.9035, "step": 1351 }, { - "epoch": 0.89, - "learning_rate": 1.2251934949546446e-05, - "loss": 1.2539, + "epoch": 0.1414965986394558, + "grad_norm": 2.761571592559715, + "learning_rate": 1.9355422455185614e-05, + "loss": 1.1353, "step": 1352 }, { - "epoch": 0.89, - "learning_rate": 1.2241543657845403e-05, - "loss": 1.1242, + "epoch": 0.14160125588697017, + "grad_norm": 2.7278975042129443, + "learning_rate": 1.9354224618340974e-05, + "loss": 1.0695, "step": 1353 }, { - "epoch": 0.89, - "learning_rate": 1.2231149817076358e-05, - "loss": 1.2605, + "epoch": 0.14170591313448455, + "grad_norm": 2.261842313891641, + "learning_rate": 1.9353025706676956e-05, + "loss": 1.0403, "step": 1354 }, { - "epoch": 0.89, - "learning_rate": 1.2220753439059116e-05, - "loss": 1.3588, + "epoch": 0.14181057038199896, + "grad_norm": 2.375471536045439, + "learning_rate": 1.9351825720331328e-05, + "loss": 1.0732, "step": 1355 }, { - "epoch": 0.89, - "learning_rate": 1.2210354535616366e-05, - "loss": 1.2776, + "epoch": 0.14191522762951334, + "grad_norm": 2.27300544182062, + "learning_rate": 1.9350624659441966e-05, + "loss": 1.0484, "step": 1356 }, { - "epoch": 0.89, - "learning_rate": 1.2199953118573674e-05, - "loss": 1.241, + "epoch": 0.14201988487702774, + "grad_norm": 2.316150199671341, + "learning_rate": 1.9349422524146876e-05, + "loss": 1.0243, "step": 1357 }, { - "epoch": 0.89, - "learning_rate": 1.2189549199759454e-05, - "loss": 1.1881, + "epoch": 0.14212454212454212, + "grad_norm": 2.1366142849380494, + "learning_rate": 1.9348219314584182e-05, + "loss": 0.9372, "step": 1358 }, { - "epoch": 0.89, - "learning_rate": 1.2179142791004974e-05, - "loss": 1.2892, + "epoch": 0.14222919937205653, + "grad_norm": 2.4024552323411545, + "learning_rate": 1.934701503089214e-05, + "loss": 0.9742, "step": 1359 }, { - "epoch": 0.9, - "learning_rate": 1.2168733904144328e-05, - "loss": 1.1443, + "epoch": 0.1423338566195709, + "grad_norm": 2.7893632727077557, + "learning_rate": 1.934580967320912e-05, + "loss": 0.9427, "step": 1360 }, { - "epoch": 0.9, - "learning_rate": 1.2158322551014431e-05, - "loss": 1.1217, + "epoch": 0.1424385138670853, + "grad_norm": 2.7934280403563143, + "learning_rate": 1.9344603241673624e-05, + "loss": 1.0359, "step": 1361 }, { - "epoch": 0.9, - "learning_rate": 1.2147908743455005e-05, - "loss": 1.3153, + "epoch": 0.1425431711145997, + "grad_norm": 2.4453534181554044, + "learning_rate": 1.9343395736424273e-05, + "loss": 0.915, "step": 1362 }, { - "epoch": 0.9, - "learning_rate": 1.2137492493308558e-05, - "loss": 1.0986, + "epoch": 0.14264782836211407, + "grad_norm": 2.643045406854803, + "learning_rate": 1.934218715759981e-05, + "loss": 1.0316, "step": 1363 }, { - "epoch": 0.9, - "learning_rate": 1.2127073812420376e-05, - "loss": 1.0793, + "epoch": 0.14275248560962847, + "grad_norm": 2.6295938809870023, + "learning_rate": 1.9340977505339105e-05, + "loss": 1.0912, "step": 1364 }, { - "epoch": 0.9, - "learning_rate": 1.2116652712638513e-05, - "loss": 1.326, + "epoch": 0.14285714285714285, + "grad_norm": 2.34279514669924, + "learning_rate": 1.9339766779781145e-05, + "loss": 0.9535, "step": 1365 }, { - "epoch": 0.9, - "learning_rate": 1.210622920581377e-05, - "loss": 1.3537, + "epoch": 0.14296180010465726, + "grad_norm": 2.9096664201728535, + "learning_rate": 1.9338554981065055e-05, + "loss": 1.0789, "step": 1366 }, { - "epoch": 0.9, - "learning_rate": 1.2095803303799693e-05, - "loss": 1.2164, + "epoch": 0.14306645735217163, + "grad_norm": 1.7847384845546328, + "learning_rate": 1.9337342109330063e-05, + "loss": 0.8184, "step": 1367 }, { - "epoch": 0.9, - "learning_rate": 1.2085375018452541e-05, - "loss": 1.0878, + "epoch": 0.14317111459968604, + "grad_norm": 4.650586633766713, + "learning_rate": 1.933612816471553e-05, + "loss": 0.9754, "step": 1368 }, { - "epoch": 0.9, - "learning_rate": 1.2074944361631288e-05, - "loss": 1.2595, + "epoch": 0.14327577184720042, + "grad_norm": 2.4222691647243484, + "learning_rate": 1.9334913147360947e-05, + "loss": 1.1472, "step": 1369 }, { - "epoch": 0.9, - "learning_rate": 1.2064511345197606e-05, - "loss": 1.2288, + "epoch": 0.1433804290947148, + "grad_norm": 2.8646659408254034, + "learning_rate": 1.9333697057405923e-05, + "loss": 1.0426, "step": 1370 }, { - "epoch": 0.9, - "learning_rate": 1.205407598101585e-05, - "loss": 1.2282, + "epoch": 0.1434850863422292, + "grad_norm": 2.755651171543178, + "learning_rate": 1.933247989499018e-05, + "loss": 0.9983, "step": 1371 }, { - "epoch": 0.9, - "learning_rate": 1.2043638280953043e-05, - "loss": 1.3438, + "epoch": 0.14358974358974358, + "grad_norm": 2.823997144325609, + "learning_rate": 1.9331261660253582e-05, + "loss": 1.0014, "step": 1372 }, { - "epoch": 0.9, - "learning_rate": 1.2033198256878867e-05, - "loss": 1.271, + "epoch": 0.14369440083725799, + "grad_norm": 1.7965865119756081, + "learning_rate": 1.9330042353336105e-05, + "loss": 0.8996, "step": 1373 }, { - "epoch": 0.9, - "learning_rate": 1.2022755920665648e-05, - "loss": 1.2748, + "epoch": 0.14379905808477236, + "grad_norm": 2.530518145697574, + "learning_rate": 1.9328821974377843e-05, + "loss": 1.0131, "step": 1374 }, { - "epoch": 0.91, - "learning_rate": 1.201231128418833e-05, - "loss": 1.2505, + "epoch": 0.14390371533228677, + "grad_norm": 2.5598798754427587, + "learning_rate": 1.9327600523519024e-05, + "loss": 0.8846, "step": 1375 }, { - "epoch": 0.91, - "learning_rate": 1.200186435932449e-05, - "loss": 1.2923, + "epoch": 0.14400837257980115, + "grad_norm": 2.404146251680647, + "learning_rate": 1.93263780009e-05, + "loss": 0.9166, "step": 1376 }, { - "epoch": 0.91, - "learning_rate": 1.1991415157954293e-05, - "loss": 1.1689, + "epoch": 0.14411302982731555, + "grad_norm": 2.5090164360878915, + "learning_rate": 1.9325154406661236e-05, + "loss": 1.0343, "step": 1377 }, { - "epoch": 0.91, - "learning_rate": 1.1980963691960508e-05, - "loss": 1.2749, + "epoch": 0.14421768707482993, + "grad_norm": 2.613140816630469, + "learning_rate": 1.9323929740943327e-05, + "loss": 1.0372, "step": 1378 }, { - "epoch": 0.91, - "learning_rate": 1.1970509973228459e-05, - "loss": 1.1473, + "epoch": 0.1443223443223443, + "grad_norm": 2.7666096244065805, + "learning_rate": 1.9322704003886988e-05, + "loss": 1.0378, "step": 1379 }, { - "epoch": 0.91, - "learning_rate": 1.1960054013646048e-05, - "loss": 1.2292, + "epoch": 0.14442700156985872, + "grad_norm": 3.1803876980173023, + "learning_rate": 1.932147719563306e-05, + "loss": 1.0994, "step": 1380 }, { - "epoch": 0.91, - "learning_rate": 1.1949595825103721e-05, - "loss": 1.2904, + "epoch": 0.1445316588173731, + "grad_norm": 2.0896284201518567, + "learning_rate": 1.9320249316322505e-05, + "loss": 1.0265, "step": 1381 }, { - "epoch": 0.91, - "learning_rate": 1.1939135419494456e-05, - "loss": 1.2193, + "epoch": 0.1446363160648875, + "grad_norm": 2.322244889283912, + "learning_rate": 1.931902036609641e-05, + "loss": 0.9989, "step": 1382 }, { - "epoch": 0.91, - "learning_rate": 1.192867280871376e-05, - "loss": 1.0881, + "epoch": 0.14474097331240188, + "grad_norm": 2.752401702892132, + "learning_rate": 1.9317790345095984e-05, + "loss": 1.0343, "step": 1383 }, { - "epoch": 0.91, - "learning_rate": 1.1918208004659632e-05, - "loss": 1.3105, + "epoch": 0.14484563055991628, + "grad_norm": 2.6752134593317454, + "learning_rate": 1.9316559253462556e-05, + "loss": 1.0981, "step": 1384 }, { - "epoch": 0.91, - "learning_rate": 1.1907741019232584e-05, - "loss": 1.2666, + "epoch": 0.14495028780743066, + "grad_norm": 2.234026157936921, + "learning_rate": 1.9315327091337587e-05, + "loss": 0.9987, "step": 1385 }, { - "epoch": 0.91, - "learning_rate": 1.1897271864335594e-05, - "loss": 1.2536, + "epoch": 0.14505494505494507, + "grad_norm": 2.825030948671621, + "learning_rate": 1.931409385886265e-05, + "loss": 1.1929, "step": 1386 }, { - "epoch": 0.91, - "learning_rate": 1.1886800551874111e-05, - "loss": 1.3282, + "epoch": 0.14515960230245945, + "grad_norm": 2.4650646815837676, + "learning_rate": 1.9312859556179445e-05, + "loss": 1.0682, "step": 1387 }, { - "epoch": 0.91, - "learning_rate": 1.1876327093756048e-05, - "loss": 1.1443, + "epoch": 0.14526425954997382, + "grad_norm": 2.7590671185555267, + "learning_rate": 1.9311624183429795e-05, + "loss": 1.0708, "step": 1388 }, { - "epoch": 0.91, - "learning_rate": 1.1865851501891741e-05, - "loss": 1.1805, + "epoch": 0.14536891679748823, + "grad_norm": 2.5815894772069954, + "learning_rate": 1.9310387740755657e-05, + "loss": 0.9889, "step": 1389 }, { - "epoch": 0.92, - "learning_rate": 1.1855373788193969e-05, - "loss": 1.1864, + "epoch": 0.1454735740450026, + "grad_norm": 2.6236133860227637, + "learning_rate": 1.930915022829909e-05, + "loss": 1.071, "step": 1390 }, { - "epoch": 0.92, - "learning_rate": 1.1844893964577906e-05, - "loss": 1.2669, + "epoch": 0.145578231292517, + "grad_norm": 2.7247337786787638, + "learning_rate": 1.9307911646202288e-05, + "loss": 1.1056, "step": 1391 }, { - "epoch": 0.92, - "learning_rate": 1.1834412042961143e-05, - "loss": 1.3911, + "epoch": 0.1456828885400314, + "grad_norm": 2.862234904110278, + "learning_rate": 1.930667199460757e-05, + "loss": 1.0049, "step": 1392 }, { - "epoch": 0.92, - "learning_rate": 1.1823928035263646e-05, - "loss": 1.3024, + "epoch": 0.1457875457875458, + "grad_norm": 2.4645303878971156, + "learning_rate": 1.9305431273657373e-05, + "loss": 1.005, "step": 1393 }, { - "epoch": 0.92, - "learning_rate": 1.1813441953407754e-05, - "loss": 1.1906, + "epoch": 0.14589220303506018, + "grad_norm": 2.213086769666513, + "learning_rate": 1.9304189483494264e-05, + "loss": 0.8771, "step": 1394 }, { - "epoch": 0.92, - "learning_rate": 1.1802953809318173e-05, - "loss": 1.2009, + "epoch": 0.14599686028257458, + "grad_norm": 2.743298220409371, + "learning_rate": 1.9302946624260914e-05, + "loss": 1.2425, "step": 1395 }, { - "epoch": 0.92, - "learning_rate": 1.1792463614921945e-05, - "loss": 1.1585, + "epoch": 0.14610151753008896, + "grad_norm": 2.6622529241671757, + "learning_rate": 1.930170269610014e-05, + "loss": 1.0216, "step": 1396 }, { - "epoch": 0.92, - "learning_rate": 1.1781971382148446e-05, - "loss": 1.3063, + "epoch": 0.14620617477760334, + "grad_norm": 2.7147812441457515, + "learning_rate": 1.9300457699154874e-05, + "loss": 0.9964, "step": 1397 }, { - "epoch": 0.92, - "learning_rate": 1.177147712292937e-05, - "loss": 1.3428, + "epoch": 0.14631083202511774, + "grad_norm": 2.393611225263065, + "learning_rate": 1.929921163356816e-05, + "loss": 1.0048, "step": 1398 }, { - "epoch": 0.92, - "learning_rate": 1.1760980849198717e-05, - "loss": 1.2162, + "epoch": 0.14641548927263212, + "grad_norm": 2.7084391545813573, + "learning_rate": 1.9297964499483178e-05, + "loss": 1.069, "step": 1399 }, { - "epoch": 0.92, - "learning_rate": 1.1750482572892781e-05, - "loss": 1.2148, + "epoch": 0.14652014652014653, + "grad_norm": 4.288573871222411, + "learning_rate": 1.929671629704323e-05, + "loss": 1.0899, "step": 1400 }, { - "epoch": 0.92, - "learning_rate": 1.1739982305950124e-05, - "loss": 1.292, + "epoch": 0.1466248037676609, + "grad_norm": 2.317166017560044, + "learning_rate": 1.929546702639173e-05, + "loss": 0.9723, "step": 1401 }, { - "epoch": 0.92, - "learning_rate": 1.172948006031158e-05, - "loss": 1.2038, + "epoch": 0.1467294610151753, + "grad_norm": 2.0977663597117373, + "learning_rate": 1.9294216687672222e-05, + "loss": 1.0926, "step": 1402 }, { - "epoch": 0.92, - "learning_rate": 1.1718975847920229e-05, - "loss": 1.2464, + "epoch": 0.1468341182626897, + "grad_norm": 2.494438503271147, + "learning_rate": 1.929296528102838e-05, + "loss": 0.8911, "step": 1403 }, { - "epoch": 0.92, - "learning_rate": 1.170846968072139e-05, - "loss": 1.1437, + "epoch": 0.1469387755102041, + "grad_norm": 2.718143112765956, + "learning_rate": 1.9291712806603987e-05, + "loss": 1.0866, "step": 1404 }, { - "epoch": 0.92, - "learning_rate": 1.1697961570662606e-05, - "loss": 1.2991, + "epoch": 0.14704343275771847, + "grad_norm": 2.2131296320523797, + "learning_rate": 1.9290459264542957e-05, + "loss": 1.0103, "step": 1405 }, { - "epoch": 0.93, - "learning_rate": 1.1687451529693626e-05, - "loss": 1.2902, + "epoch": 0.14714809000523285, + "grad_norm": 2.9091331858933347, + "learning_rate": 1.9289204654989324e-05, + "loss": 1.1178, "step": 1406 }, { - "epoch": 0.93, - "learning_rate": 1.167693956976639e-05, - "loss": 1.2128, + "epoch": 0.14725274725274726, + "grad_norm": 2.6753705952770708, + "learning_rate": 1.928794897808724e-05, + "loss": 1.0762, "step": 1407 }, { - "epoch": 0.93, - "learning_rate": 1.1666425702835033e-05, - "loss": 1.1326, + "epoch": 0.14735740450026164, + "grad_norm": 2.000163233169189, + "learning_rate": 1.9286692233980994e-05, + "loss": 1.0423, "step": 1408 }, { - "epoch": 0.93, - "learning_rate": 1.1655909940855852e-05, - "loss": 1.1265, + "epoch": 0.14746206174777604, + "grad_norm": 2.293295761305028, + "learning_rate": 1.9285434422814984e-05, + "loss": 1.0555, "step": 1409 }, { - "epoch": 0.93, - "learning_rate": 1.1645392295787299e-05, - "loss": 1.3404, + "epoch": 0.14756671899529042, + "grad_norm": 2.2949310514737262, + "learning_rate": 1.9284175544733735e-05, + "loss": 1.0947, "step": 1410 }, { - "epoch": 0.93, - "learning_rate": 1.1634872779589966e-05, - "loss": 1.2424, + "epoch": 0.14767137624280482, + "grad_norm": 2.5294931864353485, + "learning_rate": 1.9282915599881895e-05, + "loss": 1.1798, "step": 1411 }, { - "epoch": 0.93, - "learning_rate": 1.1624351404226573e-05, - "loss": 1.1055, + "epoch": 0.1477760334903192, + "grad_norm": 2.3551404864975822, + "learning_rate": 1.9281654588404233e-05, + "loss": 1.0175, "step": 1412 }, { - "epoch": 0.93, - "learning_rate": 1.1613828181661957e-05, - "loss": 1.3506, + "epoch": 0.1478806907378336, + "grad_norm": 1.9955894483022132, + "learning_rate": 1.9280392510445646e-05, + "loss": 1.0019, "step": 1413 }, { - "epoch": 0.93, - "learning_rate": 1.1603303123863054e-05, - "loss": 1.2615, + "epoch": 0.147985347985348, + "grad_norm": 2.1950167870687385, + "learning_rate": 1.9279129366151143e-05, + "loss": 0.8811, "step": 1414 }, { - "epoch": 0.93, - "learning_rate": 1.1592776242798887e-05, - "loss": 1.2522, + "epoch": 0.14809000523286237, + "grad_norm": 2.21385771131273, + "learning_rate": 1.9277865155665867e-05, + "loss": 0.9541, "step": 1415 }, { - "epoch": 0.93, - "learning_rate": 1.1582247550440555e-05, - "loss": 1.2939, + "epoch": 0.14819466248037677, + "grad_norm": 3.1210671096804017, + "learning_rate": 1.9276599879135074e-05, + "loss": 1.3079, "step": 1416 }, { - "epoch": 0.93, - "learning_rate": 1.1571717058761211e-05, - "loss": 1.2789, + "epoch": 0.14829931972789115, + "grad_norm": 2.629699340784082, + "learning_rate": 1.927533353670415e-05, + "loss": 0.9163, "step": 1417 }, { - "epoch": 0.93, - "learning_rate": 1.1561184779736061e-05, - "loss": 1.2443, + "epoch": 0.14840397697540555, + "grad_norm": 2.77960958263951, + "learning_rate": 1.92740661285186e-05, + "loss": 1.0696, "step": 1418 }, { - "epoch": 0.93, - "learning_rate": 1.1550650725342337e-05, - "loss": 1.3018, + "epoch": 0.14850863422291993, + "grad_norm": 2.5708441812829177, + "learning_rate": 1.9272797654724052e-05, + "loss": 1.1057, "step": 1419 }, { - "epoch": 0.93, - "learning_rate": 1.1540114907559294e-05, - "loss": 1.3083, + "epoch": 0.14861329147043434, + "grad_norm": 2.1036647647909303, + "learning_rate": 1.9271528115466257e-05, + "loss": 0.9253, "step": 1420 }, { - "epoch": 0.94, - "learning_rate": 1.1529577338368191e-05, - "loss": 1.0566, + "epoch": 0.14871794871794872, + "grad_norm": 2.2534281596027483, + "learning_rate": 1.9270257510891083e-05, + "loss": 0.9099, "step": 1421 }, { - "epoch": 0.94, - "learning_rate": 1.1519038029752281e-05, - "loss": 1.2826, + "epoch": 0.1488226059654631, + "grad_norm": 2.1743546862854366, + "learning_rate": 1.926898584114453e-05, + "loss": 1.047, "step": 1422 }, { - "epoch": 0.94, - "learning_rate": 1.1508496993696791e-05, - "loss": 1.1012, + "epoch": 0.1489272632129775, + "grad_norm": 2.810151030114405, + "learning_rate": 1.9267713106372716e-05, + "loss": 0.9191, "step": 1423 }, { - "epoch": 0.94, - "learning_rate": 1.1497954242188913e-05, - "loss": 1.1776, + "epoch": 0.14903192046049188, + "grad_norm": 2.2964956304821578, + "learning_rate": 1.9266439306721874e-05, + "loss": 1.0545, "step": 1424 }, { - "epoch": 0.94, - "learning_rate": 1.14874097872178e-05, - "loss": 1.462, + "epoch": 0.14913657770800628, + "grad_norm": 2.3401825418391407, + "learning_rate": 1.9265164442338374e-05, + "loss": 1.0503, "step": 1425 }, { - "epoch": 0.94, - "learning_rate": 1.1476863640774523e-05, - "loss": 1.2547, + "epoch": 0.14924123495552066, + "grad_norm": 2.2508646328782818, + "learning_rate": 1.9263888513368693e-05, + "loss": 1.0283, "step": 1426 }, { - "epoch": 0.94, - "learning_rate": 1.1466315814852094e-05, - "loss": 1.2628, + "epoch": 0.14934589220303507, + "grad_norm": 2.6552965793248364, + "learning_rate": 1.9262611519959446e-05, + "loss": 1.0631, "step": 1427 }, { - "epoch": 0.94, - "learning_rate": 1.145576632144542e-05, - "loss": 1.2201, + "epoch": 0.14945054945054945, + "grad_norm": 2.532689406015064, + "learning_rate": 1.9261333462257357e-05, + "loss": 1.1762, "step": 1428 }, { - "epoch": 0.94, - "learning_rate": 1.144521517255132e-05, - "loss": 1.2119, + "epoch": 0.14955520669806385, + "grad_norm": 2.5552203568036758, + "learning_rate": 1.9260054340409277e-05, + "loss": 1.0354, "step": 1429 }, { - "epoch": 0.94, - "learning_rate": 1.1434662380168487e-05, - "loss": 1.2235, + "epoch": 0.14965986394557823, + "grad_norm": 2.3797872928219026, + "learning_rate": 1.9258774154562183e-05, + "loss": 1.044, "step": 1430 }, { - "epoch": 0.94, - "learning_rate": 1.1424107956297479e-05, - "loss": 1.2132, + "epoch": 0.1497645211930926, + "grad_norm": 2.5188480453800097, + "learning_rate": 1.9257492904863168e-05, + "loss": 1.0178, "step": 1431 }, { - "epoch": 0.94, - "learning_rate": 1.1413551912940721e-05, - "loss": 1.2307, + "epoch": 0.14986917844060701, + "grad_norm": 1.9882868472731003, + "learning_rate": 1.9256210591459452e-05, + "loss": 0.7584, "step": 1432 }, { - "epoch": 0.94, - "learning_rate": 1.1402994262102468e-05, - "loss": 1.2471, + "epoch": 0.1499738356881214, + "grad_norm": 2.4171327636272837, + "learning_rate": 1.9254927214498376e-05, + "loss": 1.0275, "step": 1433 }, { - "epoch": 0.94, - "learning_rate": 1.1392435015788805e-05, - "loss": 1.1631, + "epoch": 0.1500784929356358, + "grad_norm": 2.6335229959993693, + "learning_rate": 1.92536427741274e-05, + "loss": 1.157, "step": 1434 }, { - "epoch": 0.94, - "learning_rate": 1.138187418600764e-05, - "loss": 1.3305, + "epoch": 0.15018315018315018, + "grad_norm": 2.418758003326527, + "learning_rate": 1.9252357270494108e-05, + "loss": 1.107, "step": 1435 }, { - "epoch": 0.95, - "learning_rate": 1.1371311784768672e-05, - "loss": 1.1862, + "epoch": 0.15028780743066458, + "grad_norm": 2.4515624975379073, + "learning_rate": 1.9251070703746212e-05, + "loss": 1.0629, "step": 1436 }, { - "epoch": 0.95, - "learning_rate": 1.1360747824083397e-05, - "loss": 1.1721, + "epoch": 0.15039246467817896, + "grad_norm": 2.5078647866604364, + "learning_rate": 1.9249783074031537e-05, + "loss": 0.9575, "step": 1437 }, { - "epoch": 0.95, - "learning_rate": 1.1350182315965071e-05, - "loss": 1.2731, + "epoch": 0.15049712192569337, + "grad_norm": 2.05009431610471, + "learning_rate": 1.9248494381498036e-05, + "loss": 0.9304, "step": 1438 }, { - "epoch": 0.95, - "learning_rate": 1.1339615272428722e-05, - "loss": 1.0712, + "epoch": 0.15060177917320774, + "grad_norm": 2.2150107036134288, + "learning_rate": 1.924720462629378e-05, + "loss": 1.0904, "step": 1439 }, { - "epoch": 0.95, - "learning_rate": 1.1329046705491111e-05, - "loss": 1.2964, + "epoch": 0.15070643642072212, + "grad_norm": 2.147637671766143, + "learning_rate": 1.9245913808566972e-05, + "loss": 0.8344, "step": 1440 }, { - "epoch": 0.95, - "learning_rate": 1.1318476627170751e-05, - "loss": 1.2767, + "epoch": 0.15081109366823653, + "grad_norm": 2.883011617888164, + "learning_rate": 1.9244621928465922e-05, + "loss": 1.1529, "step": 1441 }, { - "epoch": 0.95, - "learning_rate": 1.1307905049487855e-05, - "loss": 1.3459, + "epoch": 0.1509157509157509, + "grad_norm": 2.5500903724133543, + "learning_rate": 1.9243328986139067e-05, + "loss": 1.0428, "step": 1442 }, { - "epoch": 0.95, - "learning_rate": 1.129733198446435e-05, - "loss": 1.3206, + "epoch": 0.1510204081632653, + "grad_norm": 2.4159647382548393, + "learning_rate": 1.9242034981734977e-05, + "loss": 1.0373, "step": 1443 }, { - "epoch": 0.95, - "learning_rate": 1.1286757444123854e-05, - "loss": 1.2021, + "epoch": 0.1511250654107797, + "grad_norm": 2.454453579212534, + "learning_rate": 1.924073991540233e-05, + "loss": 1.0861, "step": 1444 }, { - "epoch": 0.95, - "learning_rate": 1.1276181440491658e-05, - "loss": 1.302, + "epoch": 0.1512297226582941, + "grad_norm": 1.8614495838101486, + "learning_rate": 1.9239443787289937e-05, + "loss": 0.821, "step": 1445 }, { - "epoch": 0.95, - "learning_rate": 1.1265603985594723e-05, - "loss": 1.3366, + "epoch": 0.15133437990580847, + "grad_norm": 2.450919159468918, + "learning_rate": 1.923814659754672e-05, + "loss": 0.9241, "step": 1446 }, { - "epoch": 0.95, - "learning_rate": 1.1255025091461657e-05, - "loss": 1.3548, + "epoch": 0.15143903715332288, + "grad_norm": 2.3140383964591784, + "learning_rate": 1.9236848346321733e-05, + "loss": 1.09, "step": 1447 }, { - "epoch": 0.95, - "learning_rate": 1.1244444770122707e-05, - "loss": 1.2291, + "epoch": 0.15154369440083726, + "grad_norm": 2.4237734839061624, + "learning_rate": 1.9235549033764146e-05, + "loss": 0.944, "step": 1448 }, { - "epoch": 0.95, - "learning_rate": 1.123386303360974e-05, - "loss": 1.459, + "epoch": 0.15164835164835164, + "grad_norm": 2.748321943938643, + "learning_rate": 1.923424866002325e-05, + "loss": 1.1062, "step": 1449 }, { - "epoch": 0.95, - "learning_rate": 1.1223279893956234e-05, - "loss": 1.2989, + "epoch": 0.15175300889586604, + "grad_norm": 3.0209245770619453, + "learning_rate": 1.9232947225248465e-05, + "loss": 1.0515, "step": 1450 }, { - "epoch": 0.96, - "learning_rate": 1.1212695363197262e-05, - "loss": 1.2016, + "epoch": 0.15185766614338042, + "grad_norm": 2.5606861904785476, + "learning_rate": 1.9231644729589326e-05, + "loss": 1.1291, "step": 1451 }, { - "epoch": 0.96, - "learning_rate": 1.120210945336948e-05, - "loss": 1.3596, + "epoch": 0.15196232339089483, + "grad_norm": 2.581511623994528, + "learning_rate": 1.9230341173195493e-05, + "loss": 1.0573, "step": 1452 }, { - "epoch": 0.96, - "learning_rate": 1.1191522176511112e-05, - "loss": 1.2772, + "epoch": 0.1520669806384092, + "grad_norm": 2.158586432143417, + "learning_rate": 1.9229036556216748e-05, + "loss": 0.9755, "step": 1453 }, { - "epoch": 0.96, - "learning_rate": 1.1180933544661936e-05, - "loss": 1.1584, + "epoch": 0.1521716378859236, + "grad_norm": 2.4855105857033926, + "learning_rate": 1.9227730878802996e-05, + "loss": 1.023, "step": 1454 }, { - "epoch": 0.96, - "learning_rate": 1.1170343569863274e-05, - "loss": 1.0758, + "epoch": 0.152276295133438, + "grad_norm": 2.5999432367590316, + "learning_rate": 1.9226424141104252e-05, + "loss": 1.1008, "step": 1455 }, { - "epoch": 0.96, - "learning_rate": 1.1159752264157963e-05, - "loss": 1.3042, + "epoch": 0.1523809523809524, + "grad_norm": 2.3917057103613075, + "learning_rate": 1.9225116343270677e-05, + "loss": 1.0145, "step": 1456 }, { - "epoch": 0.96, - "learning_rate": 1.1149159639590377e-05, - "loss": 1.1603, + "epoch": 0.15248560962846677, + "grad_norm": 2.2293516130088795, + "learning_rate": 1.9223807485452532e-05, + "loss": 1.019, "step": 1457 }, { - "epoch": 0.96, - "learning_rate": 1.1138565708206364e-05, - "loss": 1.3142, + "epoch": 0.15259026687598115, + "grad_norm": 2.42242544414817, + "learning_rate": 1.9222497567800208e-05, + "loss": 0.9934, "step": 1458 }, { - "epoch": 0.96, - "learning_rate": 1.1127970482053274e-05, - "loss": 1.0895, + "epoch": 0.15269492412349556, + "grad_norm": 2.209915153802859, + "learning_rate": 1.9221186590464214e-05, + "loss": 1.0644, "step": 1459 }, { - "epoch": 0.96, - "learning_rate": 1.1117373973179925e-05, - "loss": 1.1043, + "epoch": 0.15279958137100993, + "grad_norm": 2.2671395202419773, + "learning_rate": 1.9219874553595192e-05, + "loss": 0.98, "step": 1460 }, { - "epoch": 0.96, - "learning_rate": 1.1106776193636595e-05, - "loss": 1.3214, + "epoch": 0.15290423861852434, + "grad_norm": 2.6320601579269725, + "learning_rate": 1.9218561457343892e-05, + "loss": 0.9559, "step": 1461 }, { - "epoch": 0.96, - "learning_rate": 1.1096177155475006e-05, - "loss": 1.1407, + "epoch": 0.15300889586603872, + "grad_norm": 3.128326326853589, + "learning_rate": 1.9217247301861192e-05, + "loss": 1.057, "step": 1462 }, { - "epoch": 0.96, - "learning_rate": 1.1085576870748307e-05, - "loss": 1.2877, + "epoch": 0.15311355311355312, + "grad_norm": 2.6110874383167224, + "learning_rate": 1.9215932087298093e-05, + "loss": 1.1651, "step": 1463 }, { - "epoch": 0.96, - "learning_rate": 1.1074975351511075e-05, - "loss": 1.1184, + "epoch": 0.1532182103610675, + "grad_norm": 2.899961166471011, + "learning_rate": 1.9214615813805713e-05, + "loss": 1.0419, "step": 1464 }, { - "epoch": 0.96, - "learning_rate": 1.1064372609819279e-05, - "loss": 1.2395, + "epoch": 0.15332286760858188, + "grad_norm": 2.484808931850369, + "learning_rate": 1.9213298481535295e-05, + "loss": 0.9486, "step": 1465 }, { - "epoch": 0.97, - "learning_rate": 1.1053768657730284e-05, - "loss": 1.2231, + "epoch": 0.1534275248560963, + "grad_norm": 2.4399470897359095, + "learning_rate": 1.9211980090638204e-05, + "loss": 0.9143, "step": 1466 }, { - "epoch": 0.97, - "learning_rate": 1.1043163507302835e-05, - "loss": 1.2085, + "epoch": 0.15353218210361066, + "grad_norm": 2.6027448143774756, + "learning_rate": 1.9210660641265926e-05, + "loss": 1.0988, "step": 1467 }, { - "epoch": 0.97, - "learning_rate": 1.1032557170597034e-05, - "loss": 1.1212, + "epoch": 0.15363683935112507, + "grad_norm": 2.806968895627165, + "learning_rate": 1.920934013357007e-05, + "loss": 1.1467, "step": 1468 }, { - "epoch": 0.97, - "learning_rate": 1.1021949659674337e-05, - "loss": 1.0985, + "epoch": 0.15374149659863945, + "grad_norm": 2.5531294035484446, + "learning_rate": 1.9208018567702365e-05, + "loss": 1.1388, "step": 1469 }, { - "epoch": 0.97, - "learning_rate": 1.1011340986597528e-05, - "loss": 1.2408, + "epoch": 0.15384615384615385, + "grad_norm": 2.811782089564992, + "learning_rate": 1.9206695943814657e-05, + "loss": 0.9528, "step": 1470 }, { - "epoch": 0.97, - "learning_rate": 1.100073116343072e-05, - "loss": 1.2321, + "epoch": 0.15395081109366823, + "grad_norm": 2.3041792729089683, + "learning_rate": 1.9205372262058923e-05, + "loss": 0.9661, "step": 1471 }, { - "epoch": 0.97, - "learning_rate": 1.0990120202239324e-05, - "loss": 1.3038, + "epoch": 0.15405546834118264, + "grad_norm": 2.106029830838967, + "learning_rate": 1.9204047522587256e-05, + "loss": 0.8363, "step": 1472 }, { - "epoch": 0.97, - "learning_rate": 1.097950811509006e-05, - "loss": 1.2647, + "epoch": 0.15416012558869702, + "grad_norm": 2.417394330181495, + "learning_rate": 1.9202721725551864e-05, + "loss": 1.1144, "step": 1473 }, { - "epoch": 0.97, - "learning_rate": 1.096889491405092e-05, - "loss": 1.2462, + "epoch": 0.1542647828362114, + "grad_norm": 2.046523358752946, + "learning_rate": 1.9201394871105098e-05, + "loss": 0.9455, "step": 1474 }, { - "epoch": 0.97, - "learning_rate": 1.0958280611191162e-05, - "loss": 1.1796, + "epoch": 0.1543694400837258, + "grad_norm": 2.6640103245094946, + "learning_rate": 1.9200066959399403e-05, + "loss": 1.007, "step": 1475 }, { - "epoch": 0.97, - "learning_rate": 1.0947665218581293e-05, - "loss": 1.2872, + "epoch": 0.15447409733124018, + "grad_norm": 2.583568326692352, + "learning_rate": 1.9198737990587367e-05, + "loss": 1.0767, "step": 1476 }, { - "epoch": 0.97, - "learning_rate": 1.0937048748293068e-05, - "loss": 1.3001, + "epoch": 0.15457875457875458, + "grad_norm": 2.5142524512605644, + "learning_rate": 1.9197407964821684e-05, + "loss": 1.1079, "step": 1477 }, { - "epoch": 0.97, - "learning_rate": 1.0926431212399467e-05, - "loss": 1.2903, + "epoch": 0.15468341182626896, + "grad_norm": 2.608398316355045, + "learning_rate": 1.919607688225518e-05, + "loss": 1.0722, "step": 1478 }, { - "epoch": 0.97, - "learning_rate": 1.0915812622974673e-05, - "loss": 1.1658, + "epoch": 0.15478806907378337, + "grad_norm": 2.575108862444703, + "learning_rate": 1.9194744743040806e-05, + "loss": 1.1298, "step": 1479 }, { - "epoch": 0.97, - "learning_rate": 1.0905192992094075e-05, - "loss": 1.2011, + "epoch": 0.15489272632129775, + "grad_norm": 2.548511459425752, + "learning_rate": 1.9193411547331618e-05, + "loss": 1.0149, "step": 1480 }, { - "epoch": 0.97, - "learning_rate": 1.0894572331834248e-05, - "loss": 1.1497, + "epoch": 0.15499738356881215, + "grad_norm": 2.8655016663149366, + "learning_rate": 1.9192077295280804e-05, + "loss": 0.9912, "step": 1481 }, { - "epoch": 0.98, - "learning_rate": 1.0883950654272925e-05, - "loss": 1.3064, + "epoch": 0.15510204081632653, + "grad_norm": 2.302307429570692, + "learning_rate": 1.9190741987041677e-05, + "loss": 1.1227, "step": 1482 }, { - "epoch": 0.98, - "learning_rate": 1.0873327971489015e-05, - "loss": 1.1824, + "epoch": 0.1552066980638409, + "grad_norm": 2.709474213319208, + "learning_rate": 1.9189405622767666e-05, + "loss": 1.1448, "step": 1483 }, { - "epoch": 0.98, - "learning_rate": 1.086270429556255e-05, - "loss": 1.145, + "epoch": 0.15531135531135531, + "grad_norm": 1.9449774082886633, + "learning_rate": 1.9188068202612317e-05, + "loss": 0.8605, "step": 1484 }, { - "epoch": 0.98, - "learning_rate": 1.0852079638574708e-05, - "loss": 1.1142, + "epoch": 0.1554160125588697, + "grad_norm": 2.217490324767267, + "learning_rate": 1.9186729726729308e-05, + "loss": 1.0131, "step": 1485 }, { - "epoch": 0.98, - "learning_rate": 1.084145401260777e-05, - "loss": 1.4423, + "epoch": 0.1555206698063841, + "grad_norm": 2.3193422710271605, + "learning_rate": 1.9185390195272428e-05, + "loss": 1.0585, "step": 1486 }, { - "epoch": 0.98, - "learning_rate": 1.083082742974513e-05, - "loss": 1.3293, + "epoch": 0.15562532705389848, + "grad_norm": 2.4865611357772432, + "learning_rate": 1.9184049608395596e-05, + "loss": 1.0997, "step": 1487 }, { - "epoch": 0.98, - "learning_rate": 1.082019990207126e-05, - "loss": 1.2088, + "epoch": 0.15572998430141288, + "grad_norm": 2.783780276185395, + "learning_rate": 1.9182707966252842e-05, + "loss": 0.9589, "step": 1488 }, { - "epoch": 0.98, - "learning_rate": 1.0809571441671716e-05, - "loss": 1.3175, + "epoch": 0.15583464154892726, + "grad_norm": 1.8816997343630404, + "learning_rate": 1.9181365268998328e-05, + "loss": 0.8778, "step": 1489 }, { - "epoch": 0.98, - "learning_rate": 1.0798942060633108e-05, - "loss": 1.1579, + "epoch": 0.15593929879644167, + "grad_norm": 2.341369035073116, + "learning_rate": 1.9180021516786334e-05, + "loss": 1.0037, "step": 1490 }, { - "epoch": 0.98, - "learning_rate": 1.0788311771043092e-05, - "loss": 1.2273, + "epoch": 0.15604395604395604, + "grad_norm": 2.7516857057542397, + "learning_rate": 1.917867670977126e-05, + "loss": 1.1602, "step": 1491 }, { - "epoch": 0.98, - "learning_rate": 1.0777680584990366e-05, - "loss": 1.4225, + "epoch": 0.15614861329147042, + "grad_norm": 2.5272703478492127, + "learning_rate": 1.917733084810762e-05, + "loss": 1.0647, "step": 1492 }, { - "epoch": 0.98, - "learning_rate": 1.0767048514564633e-05, - "loss": 1.345, + "epoch": 0.15625327053898483, + "grad_norm": 2.4404363392464723, + "learning_rate": 1.9175983931950064e-05, + "loss": 0.9985, "step": 1493 }, { - "epoch": 0.98, - "learning_rate": 1.0756415571856617e-05, - "loss": 0.9828, + "epoch": 0.1563579277864992, + "grad_norm": 2.9891735698022908, + "learning_rate": 1.9174635961453353e-05, + "loss": 1.1115, "step": 1494 }, { - "epoch": 0.98, - "learning_rate": 1.074578176895803e-05, - "loss": 1.276, + "epoch": 0.1564625850340136, + "grad_norm": 2.67141152526144, + "learning_rate": 1.9173286936772368e-05, + "loss": 1.1347, "step": 1495 }, { - "epoch": 0.98, - "learning_rate": 1.073514711796155e-05, - "loss": 1.2913, + "epoch": 0.156567242281528, + "grad_norm": 2.4091308182799223, + "learning_rate": 1.917193685806212e-05, + "loss": 1.1213, "step": 1496 }, { - "epoch": 0.99, - "learning_rate": 1.0724511630960837e-05, - "loss": 1.2195, + "epoch": 0.1566718995290424, + "grad_norm": 2.126176198100282, + "learning_rate": 1.9170585725477734e-05, + "loss": 1.0569, "step": 1497 }, { - "epoch": 0.99, - "learning_rate": 1.0713875320050487e-05, - "loss": 1.2488, + "epoch": 0.15677655677655677, + "grad_norm": 2.729603104944734, + "learning_rate": 1.9169233539174458e-05, + "loss": 1.0782, "step": 1498 }, { - "epoch": 0.99, - "learning_rate": 1.0703238197326042e-05, - "loss": 1.1805, + "epoch": 0.15688121402407118, + "grad_norm": 2.513953681189516, + "learning_rate": 1.916788029930766e-05, + "loss": 1.0887, "step": 1499 }, { - "epoch": 0.99, - "learning_rate": 1.0692600274883969e-05, - "loss": 1.2677, + "epoch": 0.15698587127158556, + "grad_norm": 2.6538867773454715, + "learning_rate": 1.9166526006032828e-05, + "loss": 1.0529, "step": 1500 }, { - "epoch": 0.99, - "learning_rate": 1.0681961564821632e-05, - "loss": 1.2276, + "epoch": 0.15709052851909994, + "grad_norm": 2.7220788186480744, + "learning_rate": 1.9165170659505577e-05, + "loss": 1.108, "step": 1501 }, { - "epoch": 0.99, - "learning_rate": 1.0671322079237307e-05, - "loss": 1.1109, + "epoch": 0.15719518576661434, + "grad_norm": 2.4347603808435676, + "learning_rate": 1.916381425988164e-05, + "loss": 1.0695, "step": 1502 }, { - "epoch": 0.99, - "learning_rate": 1.0660681830230137e-05, - "loss": 1.2044, + "epoch": 0.15729984301412872, + "grad_norm": 2.8818520587577425, + "learning_rate": 1.916245680731686e-05, + "loss": 1.1366, "step": 1503 }, { - "epoch": 0.99, - "learning_rate": 1.0650040829900143e-05, - "loss": 1.2021, + "epoch": 0.15740450026164313, + "grad_norm": 2.3660768203543325, + "learning_rate": 1.916109830196723e-05, + "loss": 1.1054, "step": 1504 }, { - "epoch": 0.99, - "learning_rate": 1.06393990903482e-05, - "loss": 1.0864, + "epoch": 0.1575091575091575, + "grad_norm": 2.0199292578050114, + "learning_rate": 1.9159738743988824e-05, + "loss": 0.9812, "step": 1505 }, { - "epoch": 0.99, - "learning_rate": 1.0628756623676016e-05, - "loss": 1.2712, + "epoch": 0.1576138147566719, + "grad_norm": 2.846624791338478, + "learning_rate": 1.915837813353787e-05, + "loss": 1.0785, "step": 1506 }, { - "epoch": 0.99, - "learning_rate": 1.061811344198613e-05, - "loss": 1.3277, + "epoch": 0.1577184720041863, + "grad_norm": 2.84528056057021, + "learning_rate": 1.9157016470770704e-05, + "loss": 1.0292, "step": 1507 }, { - "epoch": 0.99, - "learning_rate": 1.06074695573819e-05, - "loss": 1.1496, + "epoch": 0.15782312925170067, + "grad_norm": 2.9007238219313765, + "learning_rate": 1.915565375584378e-05, + "loss": 1.0423, "step": 1508 }, { - "epoch": 0.99, - "learning_rate": 1.0596824981967475e-05, - "loss": 1.2536, + "epoch": 0.15792778649921507, + "grad_norm": 2.2980335605739857, + "learning_rate": 1.9154289988913684e-05, + "loss": 1.0505, "step": 1509 }, { - "epoch": 0.99, - "learning_rate": 1.0586179727847791e-05, - "loss": 1.3152, + "epoch": 0.15803244374672945, + "grad_norm": 2.5527713053393626, + "learning_rate": 1.9152925170137107e-05, + "loss": 0.9359, "step": 1510 }, { - "epoch": 0.99, - "learning_rate": 1.0575533807128567e-05, - "loss": 1.2341, + "epoch": 0.15813710099424386, + "grad_norm": 2.8078774178279953, + "learning_rate": 1.915155929967087e-05, + "loss": 0.941, "step": 1511 }, { - "epoch": 1.0, - "learning_rate": 1.0564887231916256e-05, - "loss": 1.2395, + "epoch": 0.15824175824175823, + "grad_norm": 2.5590185945658077, + "learning_rate": 1.9150192377671923e-05, + "loss": 1.0774, "step": 1512 }, { - "epoch": 1.0, - "learning_rate": 1.0554240014318085e-05, - "loss": 1.2996, + "epoch": 0.15834641548927264, + "grad_norm": 2.6027080227490953, + "learning_rate": 1.914882440429732e-05, + "loss": 0.9899, "step": 1513 }, { - "epoch": 1.0, - "learning_rate": 1.0543592166441983e-05, - "loss": 1.1862, + "epoch": 0.15845107273678702, + "grad_norm": 2.6337545570436927, + "learning_rate": 1.9147455379704246e-05, + "loss": 1.0319, "step": 1514 }, { - "epoch": 1.0, - "learning_rate": 1.0532943700396616e-05, - "loss": 1.3286, + "epoch": 0.15855572998430142, + "grad_norm": 2.502047847027918, + "learning_rate": 1.9146085304050007e-05, + "loss": 0.8999, "step": 1515 }, { - "epoch": 1.0, - "learning_rate": 1.0522294628291347e-05, - "loss": 1.2093, + "epoch": 0.1586603872318158, + "grad_norm": 2.327612116726649, + "learning_rate": 1.9144714177492022e-05, + "loss": 1.0622, "step": 1516 }, { - "epoch": 1.0, - "learning_rate": 1.0511644962236228e-05, - "loss": 1.2798, + "epoch": 0.15876504447933018, + "grad_norm": 2.5795756527864837, + "learning_rate": 1.9143342000187843e-05, + "loss": 1.0127, "step": 1517 }, { - "epoch": 1.0, - "learning_rate": 1.0500994714341984e-05, - "loss": 1.2026, + "epoch": 0.1588697017268446, + "grad_norm": 2.440086755209261, + "learning_rate": 1.914196877229513e-05, + "loss": 0.9672, "step": 1518 }, { - "epoch": 1.0, - "learning_rate": 1.049034389672e-05, - "loss": 1.1517, + "epoch": 0.15897435897435896, + "grad_norm": 2.6625915906187654, + "learning_rate": 1.9140594493971676e-05, + "loss": 1.0476, "step": 1519 }, { - "epoch": 1.0, - "learning_rate": 1.0479692521482316e-05, - "loss": 0.7474, + "epoch": 0.15907901622187337, + "grad_norm": 2.2315198666216443, + "learning_rate": 1.9139219165375383e-05, + "loss": 1.0311, "step": 1520 }, { - "epoch": 1.0, - "learning_rate": 1.0469040600741603e-05, - "loss": 0.8453, + "epoch": 0.15918367346938775, + "grad_norm": 2.4701713494311592, + "learning_rate": 1.913784278666428e-05, + "loss": 1.0938, "step": 1521 }, { - "epoch": 1.0, - "learning_rate": 1.0458388146611151e-05, - "loss": 0.7812, + "epoch": 0.15928833071690215, + "grad_norm": 2.377821429542283, + "learning_rate": 1.9136465357996517e-05, + "loss": 1.0208, "step": 1522 }, { - "epoch": 1.0, - "learning_rate": 1.044773517120486e-05, - "loss": 0.7486, + "epoch": 0.15939298796441653, + "grad_norm": 2.4194712197927974, + "learning_rate": 1.9135086879530363e-05, + "loss": 1.0568, "step": 1523 }, { - "epoch": 1.0, - "learning_rate": 1.0437081686637216e-05, - "loss": 0.8325, + "epoch": 0.15949764521193094, + "grad_norm": 2.3390951758786005, + "learning_rate": 1.9133707351424204e-05, + "loss": 1.0695, "step": 1524 }, { - "epoch": 1.0, - "learning_rate": 1.0426427705023289e-05, - "loss": 0.8364, + "epoch": 0.15960230245944532, + "grad_norm": 2.530783648332205, + "learning_rate": 1.913232677383656e-05, + "loss": 0.9741, "step": 1525 }, { - "epoch": 1.0, - "learning_rate": 1.0415773238478715e-05, - "loss": 0.8639, + "epoch": 0.1597069597069597, + "grad_norm": 2.5392759403405254, + "learning_rate": 1.9130945146926054e-05, + "loss": 1.0942, "step": 1526 }, { - "epoch": 1.01, - "learning_rate": 1.0405118299119682e-05, - "loss": 0.9195, + "epoch": 0.1598116169544741, + "grad_norm": 2.3718557105745073, + "learning_rate": 1.912956247085144e-05, + "loss": 0.9684, "step": 1527 }, { - "epoch": 1.01, - "learning_rate": 1.039446289906291e-05, - "loss": 0.8059, + "epoch": 0.15991627420198848, + "grad_norm": 2.260719250710198, + "learning_rate": 1.9128178745771592e-05, + "loss": 0.9989, "step": 1528 }, { - "epoch": 1.01, - "learning_rate": 1.0383807050425648e-05, - "loss": 0.7991, + "epoch": 0.16002093144950288, + "grad_norm": 2.739167566121222, + "learning_rate": 1.91267939718455e-05, + "loss": 1.0678, "step": 1529 }, { - "epoch": 1.01, - "learning_rate": 1.037315076532565e-05, - "loss": 0.8506, + "epoch": 0.16012558869701726, + "grad_norm": 2.6646863118785613, + "learning_rate": 1.912540814923228e-05, + "loss": 1.0639, "step": 1530 }, { - "epoch": 1.01, - "learning_rate": 1.0362494055881172e-05, - "loss": 0.798, + "epoch": 0.16023024594453167, + "grad_norm": 2.194868214474638, + "learning_rate": 1.912402127809116e-05, + "loss": 0.874, "step": 1531 }, { - "epoch": 1.01, - "learning_rate": 1.0351836934210957e-05, - "loss": 0.7271, + "epoch": 0.16033490319204605, + "grad_norm": 2.3676540984204992, + "learning_rate": 1.9122633358581507e-05, + "loss": 1.1603, "step": 1532 }, { - "epoch": 1.01, - "learning_rate": 1.03411794124342e-05, - "loss": 0.7752, + "epoch": 0.16043956043956045, + "grad_norm": 2.6218767772877634, + "learning_rate": 1.912124439086278e-05, + "loss": 1.0975, "step": 1533 }, { - "epoch": 1.01, - "learning_rate": 1.0330521502670574e-05, - "loss": 0.7374, + "epoch": 0.16054421768707483, + "grad_norm": 2.5083643616431206, + "learning_rate": 1.9119854375094586e-05, + "loss": 0.9346, "step": 1534 }, { - "epoch": 1.01, - "learning_rate": 1.0319863217040166e-05, - "loss": 0.8382, + "epoch": 0.1606488749345892, + "grad_norm": 2.721016148141621, + "learning_rate": 1.9118463311436636e-05, + "loss": 1.0989, "step": 1535 }, { - "epoch": 1.01, - "learning_rate": 1.0309204567663513e-05, - "loss": 0.8403, + "epoch": 0.16075353218210361, + "grad_norm": 2.473905042239637, + "learning_rate": 1.9117071200048766e-05, + "loss": 1.1074, "step": 1536 }, { - "epoch": 1.01, - "learning_rate": 1.0298545566661558e-05, - "loss": 0.7714, + "epoch": 0.160858189429618, + "grad_norm": 2.265192355726035, + "learning_rate": 1.9115678041090934e-05, + "loss": 0.9356, "step": 1537 }, { - "epoch": 1.01, - "learning_rate": 1.028788622615564e-05, - "loss": 0.7968, + "epoch": 0.1609628466771324, + "grad_norm": 2.8019320178423563, + "learning_rate": 1.911428383472321e-05, + "loss": 1.1416, "step": 1538 }, { - "epoch": 1.01, - "learning_rate": 1.0277226558267494e-05, - "loss": 0.7852, + "epoch": 0.16106750392464678, + "grad_norm": 2.339428603113728, + "learning_rate": 1.9112888581105803e-05, + "loss": 1.0687, "step": 1539 }, { - "epoch": 1.01, - "learning_rate": 1.0266566575119213e-05, - "loss": 0.7557, + "epoch": 0.16117216117216118, + "grad_norm": 2.832854657617948, + "learning_rate": 1.9111492280399022e-05, + "loss": 1.056, "step": 1540 }, { - "epoch": 1.01, - "learning_rate": 1.0255906288833259e-05, - "loss": 0.7614, + "epoch": 0.16127681841967556, + "grad_norm": 2.230222759725198, + "learning_rate": 1.911009493276331e-05, + "loss": 1.0508, "step": 1541 }, { - "epoch": 1.02, - "learning_rate": 1.0245245711532437e-05, - "loss": 0.8238, + "epoch": 0.16138147566718997, + "grad_norm": 2.3090795673914215, + "learning_rate": 1.9108696538359218e-05, + "loss": 1.1137, "step": 1542 }, { - "epoch": 1.02, - "learning_rate": 1.0234584855339884e-05, - "loss": 0.6947, + "epoch": 0.16148613291470434, + "grad_norm": 2.613787477378895, + "learning_rate": 1.910729709734743e-05, + "loss": 1.0693, "step": 1543 }, { - "epoch": 1.02, - "learning_rate": 1.0223923732379049e-05, - "loss": 0.7356, + "epoch": 0.16159079016221872, + "grad_norm": 2.7873744759292243, + "learning_rate": 1.9105896609888742e-05, + "loss": 1.0308, "step": 1544 }, { - "epoch": 1.02, - "learning_rate": 1.0213262354773689e-05, - "loss": 0.6816, + "epoch": 0.16169544740973313, + "grad_norm": 2.2750511261647026, + "learning_rate": 1.9104495076144077e-05, + "loss": 0.9986, "step": 1545 }, { - "epoch": 1.02, - "learning_rate": 1.020260073464785e-05, - "loss": 0.6796, + "epoch": 0.1618001046572475, + "grad_norm": 2.2610184840565863, + "learning_rate": 1.9103092496274467e-05, + "loss": 0.9975, "step": 1546 }, { - "epoch": 1.02, - "learning_rate": 1.0191938884125847e-05, - "loss": 0.7914, + "epoch": 0.1619047619047619, + "grad_norm": 2.6441259800557275, + "learning_rate": 1.910168887044108e-05, + "loss": 1.0092, "step": 1547 }, { - "epoch": 1.02, - "learning_rate": 1.0181276815332273e-05, - "loss": 0.7402, + "epoch": 0.1620094191522763, + "grad_norm": 2.4288506046325096, + "learning_rate": 1.910028419880519e-05, + "loss": 1.1243, "step": 1548 }, { - "epoch": 1.02, - "learning_rate": 1.0170614540391952e-05, - "loss": 0.6915, + "epoch": 0.1621140763997907, + "grad_norm": 2.4032519124937473, + "learning_rate": 1.9098878481528192e-05, + "loss": 0.9376, "step": 1549 }, { - "epoch": 1.02, - "learning_rate": 1.0159952071429953e-05, - "loss": 0.7947, + "epoch": 0.16221873364730507, + "grad_norm": 2.6930599410682805, + "learning_rate": 1.9097471718771615e-05, + "loss": 1.0011, "step": 1550 }, { - "epoch": 1.02, - "learning_rate": 1.0149289420571558e-05, - "loss": 0.7117, + "epoch": 0.16232339089481945, + "grad_norm": 2.944692543573583, + "learning_rate": 1.9096063910697096e-05, + "loss": 1.1354, "step": 1551 }, { - "epoch": 1.02, - "learning_rate": 1.0138626599942263e-05, - "loss": 0.7479, + "epoch": 0.16242804814233386, + "grad_norm": 2.1076650082395787, + "learning_rate": 1.9094655057466394e-05, + "loss": 0.8077, "step": 1552 }, { - "epoch": 1.02, - "learning_rate": 1.0127963621667754e-05, - "loss": 0.6167, + "epoch": 0.16253270538984824, + "grad_norm": 2.633417159588095, + "learning_rate": 1.9093245159241386e-05, + "loss": 0.9511, "step": 1553 }, { - "epoch": 1.02, - "learning_rate": 1.0117300497873897e-05, - "loss": 0.6633, + "epoch": 0.16263736263736264, + "grad_norm": 2.4640899726417986, + "learning_rate": 1.9091834216184078e-05, + "loss": 1.1126, "step": 1554 }, { - "epoch": 1.02, - "learning_rate": 1.0106637240686723e-05, - "loss": 0.7158, + "epoch": 0.16274201988487702, + "grad_norm": 2.309637404087591, + "learning_rate": 1.9090422228456586e-05, + "loss": 1.0576, "step": 1555 }, { - "epoch": 1.02, - "learning_rate": 1.009597386223241e-05, - "loss": 0.6957, + "epoch": 0.16284667713239143, + "grad_norm": 2.4383894345913655, + "learning_rate": 1.9089009196221153e-05, + "loss": 0.7495, "step": 1556 }, { - "epoch": 1.03, - "learning_rate": 1.0085310374637288e-05, - "loss": 0.8345, + "epoch": 0.1629513343799058, + "grad_norm": 2.445885248005966, + "learning_rate": 1.9087595119640137e-05, + "loss": 1.0271, "step": 1557 }, { - "epoch": 1.03, - "learning_rate": 1.0074646790027792e-05, - "loss": 0.8166, + "epoch": 0.1630559916274202, + "grad_norm": 2.5458426627911757, + "learning_rate": 1.908617999887602e-05, + "loss": 1.0411, "step": 1558 }, { - "epoch": 1.03, - "learning_rate": 1.006398312053048e-05, - "loss": 0.7793, + "epoch": 0.1631606488749346, + "grad_norm": 2.1937796229963213, + "learning_rate": 1.90847638340914e-05, + "loss": 1.0347, "step": 1559 }, { - "epoch": 1.03, - "learning_rate": 1.0053319378272008e-05, - "loss": 0.7357, + "epoch": 0.16326530612244897, + "grad_norm": 2.537434579639156, + "learning_rate": 1.9083346625449e-05, + "loss": 1.0541, "step": 1560 }, { - "epoch": 1.03, - "learning_rate": 1.0042655575379103e-05, - "loss": 0.7202, + "epoch": 0.16336996336996337, + "grad_norm": 2.144339908895929, + "learning_rate": 1.908192837311166e-05, + "loss": 0.9906, "step": 1561 }, { - "epoch": 1.03, - "learning_rate": 1.0031991723978574e-05, - "loss": 0.8471, + "epoch": 0.16347462061747775, + "grad_norm": 2.7950297429331843, + "learning_rate": 1.9080509077242336e-05, + "loss": 1.0121, "step": 1562 }, { - "epoch": 1.03, - "learning_rate": 1.0021327836197274e-05, - "loss": 0.7022, + "epoch": 0.16357927786499216, + "grad_norm": 2.146536389927275, + "learning_rate": 1.9079088738004115e-05, + "loss": 0.8715, "step": 1563 }, { - "epoch": 1.03, - "learning_rate": 1.001066392416211e-05, - "loss": 0.7363, + "epoch": 0.16368393511250653, + "grad_norm": 2.342813449285308, + "learning_rate": 1.907766735556019e-05, + "loss": 0.9457, "step": 1564 }, { - "epoch": 1.03, - "learning_rate": 1e-05, - "loss": 0.8191, + "epoch": 0.16378859236002094, + "grad_norm": 2.423858021588017, + "learning_rate": 1.9076244930073885e-05, + "loss": 0.9405, "step": 1565 }, { - "epoch": 1.03, - "learning_rate": 9.989336075837893e-06, - "loss": 0.831, + "epoch": 0.16389324960753532, + "grad_norm": 2.2113311521403514, + "learning_rate": 1.9074821461708633e-05, + "loss": 1.032, "step": 1566 }, { - "epoch": 1.03, - "learning_rate": 9.978672163802726e-06, - "loss": 0.846, + "epoch": 0.16399790685504972, + "grad_norm": 2.5538309174750347, + "learning_rate": 1.9073396950628006e-05, + "loss": 1.078, "step": 1567 }, { - "epoch": 1.03, - "learning_rate": 9.96800827602143e-06, - "loss": 0.8937, + "epoch": 0.1641025641025641, + "grad_norm": 2.116186550938168, + "learning_rate": 1.9071971396995673e-05, + "loss": 0.9762, "step": 1568 }, { - "epoch": 1.03, - "learning_rate": 9.9573444246209e-06, - "loss": 0.7693, + "epoch": 0.16420722135007848, + "grad_norm": 2.787785331892462, + "learning_rate": 1.9070544800975433e-05, + "loss": 1.1774, "step": 1569 }, { - "epoch": 1.03, - "learning_rate": 9.946680621727995e-06, - "loss": 0.5848, + "epoch": 0.1643118785975929, + "grad_norm": 2.4908693717866295, + "learning_rate": 1.906911716273121e-05, + "loss": 1.0658, "step": 1570 }, { - "epoch": 1.03, - "learning_rate": 9.936016879469522e-06, - "loss": 0.76, + "epoch": 0.16441653584510726, + "grad_norm": 2.267571098697772, + "learning_rate": 1.9067688482427037e-05, + "loss": 0.9325, "step": 1571 }, { - "epoch": 1.03, - "learning_rate": 9.925353209972213e-06, - "loss": 0.6796, + "epoch": 0.16452119309262167, + "grad_norm": 2.3281432184570665, + "learning_rate": 1.906625876022708e-05, + "loss": 1.0401, "step": 1572 }, { - "epoch": 1.04, - "learning_rate": 9.914689625362717e-06, - "loss": 0.7551, + "epoch": 0.16462585034013605, + "grad_norm": 2.2909902279482703, + "learning_rate": 1.906482799629561e-05, + "loss": 1.0412, "step": 1573 }, { - "epoch": 1.04, - "learning_rate": 9.904026137767591e-06, - "loss": 0.7341, + "epoch": 0.16473050758765045, + "grad_norm": 2.2388140959121916, + "learning_rate": 1.9063396190797024e-05, + "loss": 1.0802, "step": 1574 }, { - "epoch": 1.04, - "learning_rate": 9.89336275931328e-06, - "loss": 0.6881, + "epoch": 0.16483516483516483, + "grad_norm": 2.4751194312927973, + "learning_rate": 1.9061963343895848e-05, + "loss": 0.9623, "step": 1575 }, { - "epoch": 1.04, - "learning_rate": 9.882699502126103e-06, - "loss": 0.7877, + "epoch": 0.16493982208267924, + "grad_norm": 2.2766727680557963, + "learning_rate": 1.9060529455756707e-05, + "loss": 1.078, "step": 1576 }, { - "epoch": 1.04, - "learning_rate": 9.872036378332251e-06, - "loss": 0.6821, + "epoch": 0.16504447933019362, + "grad_norm": 2.2759067164759785, + "learning_rate": 1.905909452654437e-05, + "loss": 1.0286, "step": 1577 }, { - "epoch": 1.04, - "learning_rate": 9.86137340005774e-06, - "loss": 0.6926, + "epoch": 0.165149136577708, + "grad_norm": 2.366060725452454, + "learning_rate": 1.9057658556423702e-05, + "loss": 0.9825, "step": 1578 }, { - "epoch": 1.04, - "learning_rate": 9.850710579428445e-06, - "loss": 0.7044, + "epoch": 0.1652537938252224, + "grad_norm": 2.3640547803376646, + "learning_rate": 1.9056221545559704e-05, + "loss": 0.9975, "step": 1579 }, { - "epoch": 1.04, - "learning_rate": 9.84004792857005e-06, - "loss": 0.6913, + "epoch": 0.16535845107273678, + "grad_norm": 2.4952266534643512, + "learning_rate": 1.9054783494117496e-05, + "loss": 0.9696, "step": 1580 }, { - "epoch": 1.04, - "learning_rate": 9.82938545960805e-06, - "loss": 0.7573, + "epoch": 0.16546310832025118, + "grad_norm": 2.566903238353252, + "learning_rate": 1.9053344402262306e-05, + "loss": 1.1521, "step": 1581 }, { - "epoch": 1.04, - "learning_rate": 9.818723184667732e-06, - "loss": 0.7585, + "epoch": 0.16556776556776556, + "grad_norm": 2.39022571551939, + "learning_rate": 1.9051904270159492e-05, + "loss": 0.8817, "step": 1582 }, { - "epoch": 1.04, - "learning_rate": 9.808061115874156e-06, - "loss": 0.7767, + "epoch": 0.16567242281527997, + "grad_norm": 2.461629879445559, + "learning_rate": 1.9050463097974528e-05, + "loss": 0.9734, "step": 1583 }, { - "epoch": 1.04, - "learning_rate": 9.797399265352154e-06, - "loss": 0.721, + "epoch": 0.16577708006279435, + "grad_norm": 2.398269719344242, + "learning_rate": 1.9049020885873008e-05, + "loss": 1.0474, "step": 1584 }, { - "epoch": 1.04, - "learning_rate": 9.786737645226313e-06, - "loss": 0.7766, + "epoch": 0.16588173731030875, + "grad_norm": 1.9737021667497825, + "learning_rate": 1.904757763402064e-05, + "loss": 0.9153, "step": 1585 }, { - "epoch": 1.04, - "learning_rate": 9.776076267620955e-06, - "loss": 0.697, + "epoch": 0.16598639455782313, + "grad_norm": 2.4557990474197258, + "learning_rate": 1.904613334258326e-05, + "loss": 1.0526, "step": 1586 }, { - "epoch": 1.04, - "learning_rate": 9.76541514466012e-06, - "loss": 0.7165, + "epoch": 0.1660910518053375, + "grad_norm": 2.296988769284568, + "learning_rate": 1.9044688011726823e-05, + "loss": 0.9095, "step": 1587 }, { - "epoch": 1.05, - "learning_rate": 9.754754288467564e-06, - "loss": 0.821, + "epoch": 0.16619570905285191, + "grad_norm": 2.261789866599913, + "learning_rate": 1.9043241641617398e-05, + "loss": 1.0134, "step": 1588 }, { - "epoch": 1.05, - "learning_rate": 9.744093711166745e-06, - "loss": 0.732, + "epoch": 0.1663003663003663, + "grad_norm": 2.457617607787954, + "learning_rate": 1.9041794232421177e-05, + "loss": 1.0589, "step": 1589 }, { - "epoch": 1.05, - "learning_rate": 9.733433424880789e-06, - "loss": 0.705, + "epoch": 0.1664050235478807, + "grad_norm": 2.5050727103817696, + "learning_rate": 1.9040345784304467e-05, + "loss": 1.0968, "step": 1590 }, { - "epoch": 1.05, - "learning_rate": 9.72277344173251e-06, - "loss": 0.7957, + "epoch": 0.16650968079539508, + "grad_norm": 2.6614296904360106, + "learning_rate": 1.90388962974337e-05, + "loss": 0.9425, "step": 1591 }, { - "epoch": 1.05, - "learning_rate": 9.712113773844361e-06, - "loss": 0.7128, + "epoch": 0.16661433804290948, + "grad_norm": 2.388506412574016, + "learning_rate": 1.9037445771975425e-05, + "loss": 1.0221, "step": 1592 }, { - "epoch": 1.05, - "learning_rate": 9.701454433338445e-06, - "loss": 0.7031, + "epoch": 0.16671899529042386, + "grad_norm": 2.410859511215317, + "learning_rate": 1.903599420809631e-05, + "loss": 0.9942, "step": 1593 }, { - "epoch": 1.05, - "learning_rate": 9.690795432336489e-06, - "loss": 0.7189, + "epoch": 0.16682365253793824, + "grad_norm": 2.263106423709346, + "learning_rate": 1.9034541605963143e-05, + "loss": 1.0328, "step": 1594 }, { - "epoch": 1.05, - "learning_rate": 9.680136782959835e-06, - "loss": 0.7433, + "epoch": 0.16692830978545264, + "grad_norm": 2.322273159353024, + "learning_rate": 1.9033087965742826e-05, + "loss": 1.0696, "step": 1595 }, { - "epoch": 1.05, - "learning_rate": 9.669478497329431e-06, - "loss": 0.7002, + "epoch": 0.16703296703296702, + "grad_norm": 2.5405277930274197, + "learning_rate": 1.9031633287602396e-05, + "loss": 1.0605, "step": 1596 }, { - "epoch": 1.05, - "learning_rate": 9.658820587565801e-06, - "loss": 0.7529, + "epoch": 0.16713762428048143, + "grad_norm": 2.2074468419651474, + "learning_rate": 1.903017757170899e-05, + "loss": 1.0402, "step": 1597 }, { - "epoch": 1.05, - "learning_rate": 9.648163065789046e-06, - "loss": 0.6715, + "epoch": 0.1672422815279958, + "grad_norm": 2.754455496878057, + "learning_rate": 1.9028720818229874e-05, + "loss": 1.0416, "step": 1598 }, { - "epoch": 1.05, - "learning_rate": 9.637505944118828e-06, - "loss": 0.7375, + "epoch": 0.1673469387755102, + "grad_norm": 2.3580569683993198, + "learning_rate": 1.9027263027332434e-05, + "loss": 1.1088, "step": 1599 }, { - "epoch": 1.05, - "learning_rate": 9.626849234674356e-06, - "loss": 0.7689, + "epoch": 0.1674515960230246, + "grad_norm": 2.3717864565978757, + "learning_rate": 1.9025804199184173e-05, + "loss": 1.0257, "step": 1600 }, { - "epoch": 1.05, - "learning_rate": 9.616192949574357e-06, - "loss": 0.7926, + "epoch": 0.167556253270539, + "grad_norm": 2.314198126343136, + "learning_rate": 1.9024344333952708e-05, + "loss": 1.1035, "step": 1601 }, { - "epoch": 1.05, - "learning_rate": 9.605537100937094e-06, - "loss": 0.7911, + "epoch": 0.16766091051805337, + "grad_norm": 2.442992823002754, + "learning_rate": 1.902288343180579e-05, + "loss": 0.9067, "step": 1602 }, { - "epoch": 1.06, - "learning_rate": 9.59488170088032e-06, - "loss": 0.7955, + "epoch": 0.16776556776556775, + "grad_norm": 2.3303308497808963, + "learning_rate": 1.9021421492911272e-05, + "loss": 1.0525, "step": 1603 }, { - "epoch": 1.06, - "learning_rate": 9.584226761521285e-06, - "loss": 0.8466, + "epoch": 0.16787022501308216, + "grad_norm": 2.3600681521712255, + "learning_rate": 1.901995851743714e-05, + "loss": 1.0627, "step": 1604 }, { - "epoch": 1.06, - "learning_rate": 9.573572294976714e-06, - "loss": 0.6859, + "epoch": 0.16797488226059654, + "grad_norm": 2.5485749167168077, + "learning_rate": 1.9018494505551484e-05, + "loss": 1.0949, "step": 1605 }, { - "epoch": 1.06, - "learning_rate": 9.562918313362787e-06, - "loss": 0.6251, + "epoch": 0.16807953950811094, + "grad_norm": 2.403018203528686, + "learning_rate": 1.9017029457422525e-05, + "loss": 0.9727, "step": 1606 }, { - "epoch": 1.06, - "learning_rate": 9.552264828795142e-06, - "loss": 0.7441, + "epoch": 0.16818419675562532, + "grad_norm": 2.186348329353992, + "learning_rate": 1.9015563373218607e-05, + "loss": 1.0965, "step": 1607 }, { - "epoch": 1.06, - "learning_rate": 9.541611853388849e-06, - "loss": 0.7742, + "epoch": 0.16828885400313973, + "grad_norm": 2.5369378525468216, + "learning_rate": 1.901409625310818e-05, + "loss": 0.9336, "step": 1608 }, { - "epoch": 1.06, - "learning_rate": 9.530959399258397e-06, - "loss": 0.755, + "epoch": 0.1683935112506541, + "grad_norm": 2.406260744207432, + "learning_rate": 1.901262809725982e-05, + "loss": 0.9296, "step": 1609 }, { - "epoch": 1.06, - "learning_rate": 9.520307478517686e-06, - "loss": 0.6872, + "epoch": 0.1684981684981685, + "grad_norm": 2.229664380822881, + "learning_rate": 1.9011158905842218e-05, + "loss": 1.0721, "step": 1610 }, { - "epoch": 1.06, - "learning_rate": 9.509656103280004e-06, - "loss": 0.7239, + "epoch": 0.1686028257456829, + "grad_norm": 2.588043339747656, + "learning_rate": 1.900968867902419e-05, + "loss": 0.9903, "step": 1611 }, { - "epoch": 1.06, - "learning_rate": 9.499005285658021e-06, - "loss": 0.8755, + "epoch": 0.16870748299319727, + "grad_norm": 2.4100976190366903, + "learning_rate": 1.9008217416974674e-05, + "loss": 0.9882, "step": 1612 }, { - "epoch": 1.06, - "learning_rate": 9.488355037763774e-06, - "loss": 0.8583, + "epoch": 0.16881214024071167, + "grad_norm": 2.2572340824619013, + "learning_rate": 1.9006745119862713e-05, + "loss": 1.0155, "step": 1613 }, { - "epoch": 1.06, - "learning_rate": 9.477705371708657e-06, - "loss": 0.7734, + "epoch": 0.16891679748822605, + "grad_norm": 2.5087512390175117, + "learning_rate": 1.9005271787857477e-05, + "loss": 1.0246, "step": 1614 }, { - "epoch": 1.06, - "learning_rate": 9.467056299603388e-06, - "loss": 0.7673, + "epoch": 0.16902145473574046, + "grad_norm": 2.644963375872955, + "learning_rate": 1.900379742112826e-05, + "loss": 1.0464, "step": 1615 }, { - "epoch": 1.06, - "learning_rate": 9.456407833558019e-06, - "loss": 0.7102, + "epoch": 0.16912611198325483, + "grad_norm": 2.4254460348690987, + "learning_rate": 1.900232201984446e-05, + "loss": 1.0792, "step": 1616 }, { - "epoch": 1.06, - "learning_rate": 9.445759985681919e-06, - "loss": 0.7643, + "epoch": 0.16923076923076924, + "grad_norm": 2.3127114426824975, + "learning_rate": 1.9000845584175617e-05, + "loss": 0.9903, "step": 1617 }, { - "epoch": 1.07, - "learning_rate": 9.435112768083742e-06, - "loss": 0.7464, + "epoch": 0.16933542647828362, + "grad_norm": 2.412862064453713, + "learning_rate": 1.8999368114291367e-05, + "loss": 0.9951, "step": 1618 }, { - "epoch": 1.07, - "learning_rate": 9.42446619287144e-06, - "loss": 0.6291, + "epoch": 0.16944008372579802, + "grad_norm": 2.469305422395179, + "learning_rate": 1.899788961036148e-05, + "loss": 0.918, "step": 1619 }, { - "epoch": 1.07, - "learning_rate": 9.41382027215221e-06, - "loss": 0.8474, + "epoch": 0.1695447409733124, + "grad_norm": 2.1595304334372347, + "learning_rate": 1.8996410072555833e-05, + "loss": 1.0967, "step": 1620 }, { - "epoch": 1.07, - "learning_rate": 9.403175018032528e-06, - "loss": 0.824, + "epoch": 0.16964939822082678, + "grad_norm": 2.7358337011342106, + "learning_rate": 1.8994929501044433e-05, + "loss": 1.0458, "step": 1621 }, { - "epoch": 1.07, - "learning_rate": 9.3925304426181e-06, - "loss": 0.67, + "epoch": 0.1697540554683412, + "grad_norm": 2.5015188075028747, + "learning_rate": 1.8993447895997396e-05, + "loss": 1.0962, "step": 1622 }, { - "epoch": 1.07, - "learning_rate": 9.38188655801387e-06, - "loss": 0.78, + "epoch": 0.16985871271585556, + "grad_norm": 2.746712031614558, + "learning_rate": 1.8991965257584967e-05, + "loss": 0.9008, "step": 1623 }, { - "epoch": 1.07, - "learning_rate": 9.37124337632399e-06, - "loss": 0.6538, + "epoch": 0.16996336996336997, + "grad_norm": 2.308046183330425, + "learning_rate": 1.8990481585977503e-05, + "loss": 0.8214, "step": 1624 }, { - "epoch": 1.07, - "learning_rate": 9.360600909651804e-06, - "loss": 0.6635, + "epoch": 0.17006802721088435, + "grad_norm": 2.241008498260066, + "learning_rate": 1.8988996881345472e-05, + "loss": 0.9915, "step": 1625 }, { - "epoch": 1.07, - "learning_rate": 9.349959170099859e-06, - "loss": 0.7027, + "epoch": 0.17017268445839875, + "grad_norm": 2.336157225073833, + "learning_rate": 1.8987511143859484e-05, + "loss": 1.0093, "step": 1626 }, { - "epoch": 1.07, - "learning_rate": 9.339318169769865e-06, - "loss": 0.7704, + "epoch": 0.17027734170591313, + "grad_norm": 2.456597311443299, + "learning_rate": 1.898602437369024e-05, + "loss": 1.0753, "step": 1627 }, { - "epoch": 1.07, - "learning_rate": 9.328677920762698e-06, - "loss": 0.7284, + "epoch": 0.17038199895342754, + "grad_norm": 2.1653784471734583, + "learning_rate": 1.8984536571008585e-05, + "loss": 0.9081, "step": 1628 }, { - "epoch": 1.07, - "learning_rate": 9.318038435178371e-06, - "loss": 0.7295, + "epoch": 0.17048665620094192, + "grad_norm": 2.1955935295728515, + "learning_rate": 1.8983047735985464e-05, + "loss": 0.9734, "step": 1629 }, { - "epoch": 1.07, - "learning_rate": 9.307399725116035e-06, - "loss": 0.7458, + "epoch": 0.1705913134484563, + "grad_norm": 3.0667450842861457, + "learning_rate": 1.8981557868791943e-05, + "loss": 1.1295, "step": 1630 }, { - "epoch": 1.07, - "learning_rate": 9.29676180267396e-06, - "loss": 0.701, + "epoch": 0.1706959706959707, + "grad_norm": 2.457644941062628, + "learning_rate": 1.8980066969599216e-05, + "loss": 1.1109, "step": 1631 }, { - "epoch": 1.07, - "learning_rate": 9.286124679949515e-06, - "loss": 0.7679, + "epoch": 0.17080062794348508, + "grad_norm": 2.5249379238888627, + "learning_rate": 1.8978575038578593e-05, + "loss": 1.0207, "step": 1632 }, { - "epoch": 1.08, - "learning_rate": 9.275488369039169e-06, - "loss": 0.7512, + "epoch": 0.17090528519099948, + "grad_norm": 2.3825489629259966, + "learning_rate": 1.8977082075901493e-05, + "loss": 1.0369, "step": 1633 }, { - "epoch": 1.08, - "learning_rate": 9.264852882038453e-06, - "loss": 0.7408, + "epoch": 0.17100994243851386, + "grad_norm": 2.2994283024432063, + "learning_rate": 1.8975588081739468e-05, + "loss": 0.9943, "step": 1634 }, { - "epoch": 1.08, - "learning_rate": 9.254218231041973e-06, - "loss": 0.7379, + "epoch": 0.17111459968602827, + "grad_norm": 2.7924613134451888, + "learning_rate": 1.8974093056264173e-05, + "loss": 1.1996, "step": 1635 }, { - "epoch": 1.08, - "learning_rate": 9.243584428143383e-06, - "loss": 0.74, + "epoch": 0.17121925693354265, + "grad_norm": 2.44507996655918, + "learning_rate": 1.8972596999647398e-05, + "loss": 1.1193, "step": 1636 }, { - "epoch": 1.08, - "learning_rate": 9.232951485435367e-06, - "loss": 0.7516, + "epoch": 0.17132391418105705, + "grad_norm": 2.16180218326001, + "learning_rate": 1.897109991206103e-05, + "loss": 0.8592, "step": 1637 }, { - "epoch": 1.08, - "learning_rate": 9.222319415009639e-06, - "loss": 0.8017, + "epoch": 0.17142857142857143, + "grad_norm": 2.699376290036355, + "learning_rate": 1.8969601793677107e-05, + "loss": 1.1523, "step": 1638 }, { - "epoch": 1.08, - "learning_rate": 9.21168822895691e-06, - "loss": 0.7807, + "epoch": 0.1715332286760858, + "grad_norm": 2.735100934448089, + "learning_rate": 1.8968102644667748e-05, + "loss": 1.0564, "step": 1639 }, { - "epoch": 1.08, - "learning_rate": 9.201057939366896e-06, - "loss": 0.7438, + "epoch": 0.17163788592360021, + "grad_norm": 2.573807990283748, + "learning_rate": 1.8966602465205214e-05, + "loss": 1.0115, "step": 1640 }, { - "epoch": 1.08, - "learning_rate": 9.190428558328285e-06, - "loss": 0.6397, + "epoch": 0.1717425431711146, + "grad_norm": 2.7249740908747784, + "learning_rate": 1.8965101255461883e-05, + "loss": 0.9658, "step": 1641 }, { - "epoch": 1.08, - "learning_rate": 9.179800097928743e-06, - "loss": 0.7843, + "epoch": 0.171847200418629, + "grad_norm": 2.9345488789227785, + "learning_rate": 1.896359901561024e-05, + "loss": 1.0128, "step": 1642 }, { - "epoch": 1.08, - "learning_rate": 9.169172570254874e-06, - "loss": 0.7219, + "epoch": 0.17195185766614338, + "grad_norm": 2.889098265415476, + "learning_rate": 1.89620957458229e-05, + "loss": 1.0816, "step": 1643 }, { - "epoch": 1.08, - "learning_rate": 9.158545987392233e-06, - "loss": 1.1518, + "epoch": 0.17205651491365778, + "grad_norm": 2.627888084723054, + "learning_rate": 1.8960591446272595e-05, + "loss": 1.0394, "step": 1644 }, { - "epoch": 1.08, - "learning_rate": 9.147920361425295e-06, - "loss": 0.8327, + "epoch": 0.17216117216117216, + "grad_norm": 2.2397484008487663, + "learning_rate": 1.8959086117132162e-05, + "loss": 0.9653, "step": 1645 }, { - "epoch": 1.08, - "learning_rate": 9.13729570443745e-06, - "loss": 0.7701, + "epoch": 0.17226582940868654, + "grad_norm": 1.9966793466082513, + "learning_rate": 1.8957579758574575e-05, + "loss": 0.8132, "step": 1646 }, { - "epoch": 1.08, - "learning_rate": 9.12667202851099e-06, - "loss": 0.693, + "epoch": 0.17237048665620094, + "grad_norm": 3.0317885783655307, + "learning_rate": 1.8956072370772914e-05, + "loss": 1.0076, "step": 1647 }, { - "epoch": 1.08, - "learning_rate": 9.116049345727076e-06, - "loss": 0.6348, + "epoch": 0.17247514390371532, + "grad_norm": 2.6374048369635483, + "learning_rate": 1.8954563953900386e-05, + "loss": 1.0983, "step": 1648 }, { - "epoch": 1.09, - "learning_rate": 9.105427668165755e-06, - "loss": 0.7202, + "epoch": 0.17257980115122973, + "grad_norm": 2.96669464371244, + "learning_rate": 1.8953054508130302e-05, + "loss": 0.9889, "step": 1649 }, { - "epoch": 1.09, - "learning_rate": 9.094807007905927e-06, - "loss": 0.8613, + "epoch": 0.1726844583987441, + "grad_norm": 2.4723284751273322, + "learning_rate": 1.8951544033636105e-05, + "loss": 1.1264, "step": 1650 }, { - "epoch": 1.09, - "learning_rate": 9.084187377025334e-06, - "loss": 0.6858, + "epoch": 0.1727891156462585, + "grad_norm": 2.622144198286044, + "learning_rate": 1.8950032530591356e-05, + "loss": 1.0118, "step": 1651 }, { - "epoch": 1.09, - "learning_rate": 9.073568787600538e-06, - "loss": 0.7561, + "epoch": 0.1728937728937729, + "grad_norm": 2.891941422685323, + "learning_rate": 1.8948519999169723e-05, + "loss": 0.9545, "step": 1652 }, { - "epoch": 1.09, - "learning_rate": 9.062951251706933e-06, - "loss": 0.5943, + "epoch": 0.1729984301412873, + "grad_norm": 2.91494413227998, + "learning_rate": 1.8947006439545e-05, + "loss": 1.0054, "step": 1653 }, { - "epoch": 1.09, - "learning_rate": 9.052334781418708e-06, - "loss": 0.9754, + "epoch": 0.17310308738880167, + "grad_norm": 2.3618205093150344, + "learning_rate": 1.89454918518911e-05, + "loss": 1.0833, "step": 1654 }, { - "epoch": 1.09, - "learning_rate": 9.04171938880884e-06, - "loss": 0.8839, + "epoch": 0.17320774463631605, + "grad_norm": 2.381058104001439, + "learning_rate": 1.8943976236382055e-05, + "loss": 0.7995, "step": 1655 }, { - "epoch": 1.09, - "learning_rate": 9.031105085949083e-06, - "loss": 0.7718, + "epoch": 0.17331240188383046, + "grad_norm": 2.4725029468294357, + "learning_rate": 1.8942459593192008e-05, + "loss": 1.1094, "step": 1656 }, { - "epoch": 1.09, - "learning_rate": 9.020491884909941e-06, - "loss": 0.7039, + "epoch": 0.17341705913134484, + "grad_norm": 2.5977724802112045, + "learning_rate": 1.8940941922495223e-05, + "loss": 0.9014, "step": 1657 }, { - "epoch": 1.09, - "learning_rate": 9.009879797760679e-06, - "loss": 0.8086, + "epoch": 0.17352171637885924, + "grad_norm": 3.566756798991803, + "learning_rate": 1.8939423224466086e-05, + "loss": 0.9807, "step": 1658 }, { - "epoch": 1.09, - "learning_rate": 8.999268836569284e-06, - "loss": 0.8561, + "epoch": 0.17362637362637362, + "grad_norm": 2.5787026088981637, + "learning_rate": 1.8937903499279104e-05, + "loss": 1.0571, "step": 1659 }, { - "epoch": 1.09, - "learning_rate": 8.988659013402474e-06, - "loss": 0.7165, + "epoch": 0.17373103087388803, + "grad_norm": 2.5611787827337182, + "learning_rate": 1.8936382747108885e-05, + "loss": 1.0353, "step": 1660 }, { - "epoch": 1.09, - "learning_rate": 8.978050340325668e-06, - "loss": 0.6422, + "epoch": 0.1738356881214024, + "grad_norm": 2.5012911002756417, + "learning_rate": 1.8934860968130176e-05, + "loss": 1.1265, "step": 1661 }, { - "epoch": 1.09, - "learning_rate": 8.967442829402968e-06, - "loss": 0.8316, + "epoch": 0.1739403453689168, + "grad_norm": 2.2836015231005033, + "learning_rate": 1.8933338162517824e-05, + "loss": 1.026, "step": 1662 }, { - "epoch": 1.09, - "learning_rate": 8.956836492697167e-06, - "loss": 0.7368, + "epoch": 0.1740450026164312, + "grad_norm": 2.8317634648824486, + "learning_rate": 1.8931814330446813e-05, + "loss": 1.1476, "step": 1663 }, { - "epoch": 1.1, - "learning_rate": 8.94623134226972e-06, - "loss": 0.7332, + "epoch": 0.17414965986394557, + "grad_norm": 2.5276867878076907, + "learning_rate": 1.8930289472092225e-05, + "loss": 1.0794, "step": 1664 }, { - "epoch": 1.1, - "learning_rate": 8.935627390180728e-06, - "loss": 0.7195, + "epoch": 0.17425431711145997, + "grad_norm": 2.6172530432146957, + "learning_rate": 1.8928763587629273e-05, + "loss": 1.1507, "step": 1665 }, { - "epoch": 1.1, - "learning_rate": 8.925024648488932e-06, - "loss": 0.7806, + "epoch": 0.17435897435897435, + "grad_norm": 3.298918918566836, + "learning_rate": 1.8927236677233286e-05, + "loss": 1.0261, "step": 1666 }, { - "epoch": 1.1, - "learning_rate": 8.914423129251697e-06, - "loss": 0.6617, + "epoch": 0.17446363160648876, + "grad_norm": 2.0906785380095956, + "learning_rate": 1.8925708741079706e-05, + "loss": 0.9586, "step": 1667 }, { - "epoch": 1.1, - "learning_rate": 8.903822844524999e-06, - "loss": 0.6465, + "epoch": 0.17456828885400313, + "grad_norm": 2.3161353864865806, + "learning_rate": 1.89241797793441e-05, + "loss": 1.0169, "step": 1668 }, { - "epoch": 1.1, - "learning_rate": 8.893223806363407e-06, - "loss": 0.7953, + "epoch": 0.17467294610151754, + "grad_norm": 2.7515446593384065, + "learning_rate": 1.8922649792202145e-05, + "loss": 1.1237, "step": 1669 }, { - "epoch": 1.1, - "learning_rate": 8.882626026820078e-06, - "loss": 0.7214, + "epoch": 0.17477760334903192, + "grad_norm": 2.5581343380268833, + "learning_rate": 1.892111877982964e-05, + "loss": 0.9509, "step": 1670 }, { - "epoch": 1.1, - "learning_rate": 8.872029517946729e-06, - "loss": 0.754, + "epoch": 0.17488226059654632, + "grad_norm": 2.215250689710175, + "learning_rate": 1.8919586742402504e-05, + "loss": 1.0509, "step": 1671 }, { - "epoch": 1.1, - "learning_rate": 8.861434291793639e-06, - "loss": 0.7778, + "epoch": 0.1749869178440607, + "grad_norm": 2.2557081770607454, + "learning_rate": 1.891805368009677e-05, + "loss": 0.9262, "step": 1672 }, { - "epoch": 1.1, - "learning_rate": 8.850840360409627e-06, - "loss": 0.8362, + "epoch": 0.17509157509157508, + "grad_norm": 2.6566875549500626, + "learning_rate": 1.8916519593088586e-05, + "loss": 1.0298, "step": 1673 }, { - "epoch": 1.1, - "learning_rate": 8.840247735842037e-06, - "loss": 0.7572, + "epoch": 0.17519623233908949, + "grad_norm": 2.5841170671199993, + "learning_rate": 1.8914984481554226e-05, + "loss": 1.1961, "step": 1674 }, { - "epoch": 1.1, - "learning_rate": 8.829656430136732e-06, - "loss": 0.7194, + "epoch": 0.17530088958660386, + "grad_norm": 2.401181500742945, + "learning_rate": 1.8913448345670075e-05, + "loss": 0.8559, "step": 1675 }, { - "epoch": 1.1, - "learning_rate": 8.819066455338067e-06, - "loss": 0.6776, + "epoch": 0.17540554683411827, + "grad_norm": 2.7236964577084275, + "learning_rate": 1.891191118561264e-05, + "loss": 0.9921, "step": 1676 }, { - "epoch": 1.1, - "learning_rate": 8.80847782348889e-06, - "loss": 0.8073, + "epoch": 0.17551020408163265, + "grad_norm": 2.279719174576562, + "learning_rate": 1.8910373001558548e-05, + "loss": 1.1413, "step": 1677 }, { - "epoch": 1.1, - "learning_rate": 8.797890546630522e-06, - "loss": 0.6929, + "epoch": 0.17561486132914705, + "grad_norm": 2.4343550584438725, + "learning_rate": 1.890883379368453e-05, + "loss": 1.0764, "step": 1678 }, { - "epoch": 1.11, - "learning_rate": 8.787304636802743e-06, - "loss": 0.7384, + "epoch": 0.17571951857666143, + "grad_norm": 2.6430739908289986, + "learning_rate": 1.890729356216745e-05, + "loss": 0.9843, "step": 1679 }, { - "epoch": 1.11, - "learning_rate": 8.77672010604377e-06, - "loss": 0.7707, + "epoch": 0.17582417582417584, + "grad_norm": 2.2188927123894544, + "learning_rate": 1.8905752307184287e-05, + "loss": 1.0551, "step": 1680 }, { - "epoch": 1.11, - "learning_rate": 8.766136966390263e-06, - "loss": 0.6968, + "epoch": 0.17592883307169022, + "grad_norm": 2.674634884089362, + "learning_rate": 1.8904210028912126e-05, + "loss": 1.0451, "step": 1681 }, { - "epoch": 1.11, - "learning_rate": 8.755555229877295e-06, - "loss": 0.8593, + "epoch": 0.1760334903192046, + "grad_norm": 2.604704932941921, + "learning_rate": 1.8902666727528184e-05, + "loss": 1.0089, "step": 1682 }, { - "epoch": 1.11, - "learning_rate": 8.744974908538345e-06, - "loss": 0.7993, + "epoch": 0.176138147566719, + "grad_norm": 2.542591420241676, + "learning_rate": 1.8901122403209785e-05, + "loss": 1.0962, "step": 1683 }, { - "epoch": 1.11, - "learning_rate": 8.734396014405282e-06, - "loss": 0.7381, + "epoch": 0.17624280481423338, + "grad_norm": 2.6574054799605755, + "learning_rate": 1.889957705613438e-05, + "loss": 1.1428, "step": 1684 }, { - "epoch": 1.11, - "learning_rate": 8.723818559508345e-06, - "loss": 0.7781, + "epoch": 0.17634746206174778, + "grad_norm": 2.486565966246706, + "learning_rate": 1.8898030686479526e-05, + "loss": 1.0292, "step": 1685 }, { - "epoch": 1.11, - "learning_rate": 8.713242555876149e-06, - "loss": 0.7275, + "epoch": 0.17645211930926216, + "grad_norm": 2.4219588038934443, + "learning_rate": 1.889648329442291e-05, + "loss": 1.0308, "step": 1686 }, { - "epoch": 1.11, - "learning_rate": 8.702668015535651e-06, - "loss": 0.6601, + "epoch": 0.17655677655677657, + "grad_norm": 2.503380792899425, + "learning_rate": 1.889493488014233e-05, + "loss": 1.0214, "step": 1687 }, { - "epoch": 1.11, - "learning_rate": 8.692094950512145e-06, - "loss": 0.665, + "epoch": 0.17666143380429095, + "grad_norm": 2.346582662942175, + "learning_rate": 1.88933854438157e-05, + "loss": 1.0546, "step": 1688 }, { - "epoch": 1.11, - "learning_rate": 8.681523372829254e-06, - "loss": 0.8076, + "epoch": 0.17676609105180532, + "grad_norm": 2.460045482771163, + "learning_rate": 1.889183498562105e-05, + "loss": 1.1341, "step": 1689 }, { - "epoch": 1.11, - "learning_rate": 8.670953294508892e-06, - "loss": 0.7575, + "epoch": 0.17687074829931973, + "grad_norm": 2.5881691646400653, + "learning_rate": 1.889028350573654e-05, + "loss": 1.1007, "step": 1690 }, { - "epoch": 1.11, - "learning_rate": 8.660384727571282e-06, - "loss": 0.7832, + "epoch": 0.1769754055468341, + "grad_norm": 2.622909655326263, + "learning_rate": 1.888873100434043e-05, + "loss": 1.069, "step": 1691 }, { - "epoch": 1.11, - "learning_rate": 8.64981768403493e-06, - "loss": 0.7434, + "epoch": 0.17708006279434851, + "grad_norm": 2.5204132573206692, + "learning_rate": 1.8887177481611107e-05, + "loss": 0.885, "step": 1692 }, { - "epoch": 1.11, - "learning_rate": 8.639252175916608e-06, - "loss": 0.6912, + "epoch": 0.1771847200418629, + "grad_norm": 2.510954873616835, + "learning_rate": 1.888562293772708e-05, + "loss": 1.0541, "step": 1693 }, { - "epoch": 1.12, - "learning_rate": 8.62868821523133e-06, - "loss": 0.7807, + "epoch": 0.1772893772893773, + "grad_norm": 2.7873358491588167, + "learning_rate": 1.888406737286696e-05, + "loss": 0.9605, "step": 1694 }, { - "epoch": 1.12, - "learning_rate": 8.618125813992364e-06, - "loss": 0.6817, + "epoch": 0.17739403453689168, + "grad_norm": 2.326692971407233, + "learning_rate": 1.888251078720949e-05, + "loss": 0.967, "step": 1695 }, { - "epoch": 1.12, - "learning_rate": 8.607564984211199e-06, - "loss": 0.7088, + "epoch": 0.17749869178440608, + "grad_norm": 2.106615127470322, + "learning_rate": 1.8880953180933524e-05, + "loss": 0.8576, "step": 1696 }, { - "epoch": 1.12, - "learning_rate": 8.597005737897534e-06, - "loss": 0.7503, + "epoch": 0.17760334903192046, + "grad_norm": 2.4795735591289487, + "learning_rate": 1.8879394554218033e-05, + "loss": 1.1236, "step": 1697 }, { - "epoch": 1.12, - "learning_rate": 8.586448087059284e-06, - "loss": 0.701, + "epoch": 0.17770800627943484, + "grad_norm": 3.0042166107472825, + "learning_rate": 1.887783490724211e-05, + "loss": 1.0146, "step": 1698 }, { - "epoch": 1.12, - "learning_rate": 8.575892043702523e-06, - "loss": 0.83, + "epoch": 0.17781266352694924, + "grad_norm": 2.492060495608788, + "learning_rate": 1.8876274240184954e-05, + "loss": 0.9161, "step": 1699 }, { - "epoch": 1.12, - "learning_rate": 8.565337619831516e-06, - "loss": 0.6545, + "epoch": 0.17791732077446362, + "grad_norm": 2.777606937433633, + "learning_rate": 1.88747125532259e-05, + "loss": 1.0153, "step": 1700 }, { - "epoch": 1.12, - "learning_rate": 8.554784827448681e-06, - "loss": 0.6521, + "epoch": 0.17802197802197803, + "grad_norm": 2.460229108054716, + "learning_rate": 1.8873149846544377e-05, + "loss": 0.9356, "step": 1701 }, { - "epoch": 1.12, - "learning_rate": 8.544233678554581e-06, - "loss": 0.8336, + "epoch": 0.1781266352694924, + "grad_norm": 2.5144312422866197, + "learning_rate": 1.8871586120319952e-05, + "loss": 1.0638, "step": 1702 }, { - "epoch": 1.12, - "learning_rate": 8.533684185147913e-06, - "loss": 0.802, + "epoch": 0.1782312925170068, + "grad_norm": 2.1719985008734466, + "learning_rate": 1.8870021374732297e-05, + "loss": 1.009, "step": 1703 }, { - "epoch": 1.12, - "learning_rate": 8.52313635922548e-06, - "loss": 0.6752, + "epoch": 0.1783359497645212, + "grad_norm": 2.464059885032523, + "learning_rate": 1.8868455609961206e-05, + "loss": 1.1706, "step": 1704 }, { - "epoch": 1.12, - "learning_rate": 8.512590212782203e-06, - "loss": 0.6158, + "epoch": 0.1784406070120356, + "grad_norm": 2.7220400341035917, + "learning_rate": 1.8866888826186583e-05, + "loss": 1.0106, "step": 1705 }, { - "epoch": 1.12, - "learning_rate": 8.502045757811087e-06, - "loss": 0.8078, + "epoch": 0.17854526425954997, + "grad_norm": 2.3596603917693, + "learning_rate": 1.886532102358846e-05, + "loss": 1.0622, "step": 1706 }, { - "epoch": 1.12, - "learning_rate": 8.491503006303212e-06, - "loss": 0.7619, + "epoch": 0.17864992150706435, + "grad_norm": 2.4967392208163317, + "learning_rate": 1.8863752202346982e-05, + "loss": 1.0043, "step": 1707 }, { - "epoch": 1.12, - "learning_rate": 8.480961970247722e-06, - "loss": 0.7769, + "epoch": 0.17875457875457876, + "grad_norm": 1.937686380270267, + "learning_rate": 1.8862182362642406e-05, + "loss": 0.8468, "step": 1708 }, { - "epoch": 1.13, - "learning_rate": 8.47042266163181e-06, - "loss": 0.6736, + "epoch": 0.17885923600209314, + "grad_norm": 2.4387935220753048, + "learning_rate": 1.8860611504655112e-05, + "loss": 1.0241, "step": 1709 }, { - "epoch": 1.13, - "learning_rate": 8.459885092440708e-06, - "loss": 0.8792, + "epoch": 0.17896389324960754, + "grad_norm": 2.3758956185372755, + "learning_rate": 1.885903962856559e-05, + "loss": 1.1111, "step": 1710 }, { - "epoch": 1.13, - "learning_rate": 8.449349274657665e-06, - "loss": 0.7375, + "epoch": 0.17906855049712192, + "grad_norm": 2.21312437965851, + "learning_rate": 1.885746673455446e-05, + "loss": 1.0037, "step": 1711 }, { - "epoch": 1.13, - "learning_rate": 8.438815220263942e-06, - "loss": 0.6718, + "epoch": 0.17917320774463633, + "grad_norm": 2.5602801001146673, + "learning_rate": 1.8855892822802443e-05, + "loss": 1.1503, "step": 1712 }, { - "epoch": 1.13, - "learning_rate": 8.42828294123879e-06, - "loss": 0.7983, + "epoch": 0.1792778649921507, + "grad_norm": 2.558534262130665, + "learning_rate": 1.885431789349039e-05, + "loss": 1.1362, "step": 1713 }, { - "epoch": 1.13, - "learning_rate": 8.417752449559447e-06, - "loss": 0.8696, + "epoch": 0.1793825222396651, + "grad_norm": 2.074103983805813, + "learning_rate": 1.885274194679926e-05, + "loss": 1.017, "step": 1714 }, { - "epoch": 1.13, - "learning_rate": 8.407223757201113e-06, - "loss": 0.6301, + "epoch": 0.1794871794871795, + "grad_norm": 3.144604908715539, + "learning_rate": 1.8851164982910135e-05, + "loss": 1.1212, "step": 1715 }, { - "epoch": 1.13, - "learning_rate": 8.396696876136948e-06, - "loss": 0.6936, + "epoch": 0.17959183673469387, + "grad_norm": 2.142185971803945, + "learning_rate": 1.884958700200421e-05, + "loss": 1.1587, "step": 1716 }, { - "epoch": 1.13, - "learning_rate": 8.386171818338048e-06, - "loss": 0.768, + "epoch": 0.17969649398220827, + "grad_norm": 2.0371324766241394, + "learning_rate": 1.88480080042628e-05, + "loss": 1.0488, "step": 1717 }, { - "epoch": 1.13, - "learning_rate": 8.37564859577343e-06, - "loss": 0.82, + "epoch": 0.17980115122972265, + "grad_norm": 2.5493299229194855, + "learning_rate": 1.8846427989867334e-05, + "loss": 1.1183, "step": 1718 }, { - "epoch": 1.13, - "learning_rate": 8.365127220410037e-06, - "loss": 0.7359, + "epoch": 0.17990580847723706, + "grad_norm": 2.1735484655699477, + "learning_rate": 1.8844846958999355e-05, + "loss": 1.0369, "step": 1719 }, { - "epoch": 1.13, - "learning_rate": 8.354607704212703e-06, - "loss": 0.7153, + "epoch": 0.18001046572475143, + "grad_norm": 2.084854109587084, + "learning_rate": 1.884326491184053e-05, + "loss": 0.9337, "step": 1720 }, { - "epoch": 1.13, - "learning_rate": 8.344090059144151e-06, - "loss": 0.8518, + "epoch": 0.18011512297226584, + "grad_norm": 2.6326212200262202, + "learning_rate": 1.884168184857264e-05, + "loss": 1.0429, "step": 1721 }, { - "epoch": 1.13, - "learning_rate": 8.333574297164969e-06, - "loss": 0.6675, + "epoch": 0.18021978021978022, + "grad_norm": 2.4562822207892756, + "learning_rate": 1.8840097769377585e-05, + "loss": 1.0598, "step": 1722 }, { - "epoch": 1.13, - "learning_rate": 8.323060430233613e-06, - "loss": 0.8015, + "epoch": 0.18032443746729462, + "grad_norm": 2.4277508413270272, + "learning_rate": 1.8838512674437372e-05, + "loss": 0.9412, "step": 1723 }, { - "epoch": 1.13, - "learning_rate": 8.312548470306379e-06, - "loss": 0.7307, + "epoch": 0.180429094714809, + "grad_norm": 2.8821956418407804, + "learning_rate": 1.8836926563934137e-05, + "loss": 0.8627, "step": 1724 }, { - "epoch": 1.14, - "learning_rate": 8.302038429337396e-06, - "loss": 0.722, + "epoch": 0.18053375196232338, + "grad_norm": 2.709669048226828, + "learning_rate": 1.8835339438050125e-05, + "loss": 0.9982, "step": 1725 }, { - "epoch": 1.14, - "learning_rate": 8.291530319278614e-06, - "loss": 0.8462, + "epoch": 0.18063840920983779, + "grad_norm": 2.111424162303935, + "learning_rate": 1.88337512969677e-05, + "loss": 1.0113, "step": 1726 }, { - "epoch": 1.14, - "learning_rate": 8.281024152079775e-06, - "loss": 0.8232, + "epoch": 0.18074306645735216, + "grad_norm": 2.234604237746023, + "learning_rate": 1.8832162140869343e-05, + "loss": 1.0126, "step": 1727 }, { - "epoch": 1.14, - "learning_rate": 8.270519939688421e-06, - "loss": 0.7586, + "epoch": 0.18084772370486657, + "grad_norm": 2.524183949936875, + "learning_rate": 1.883057196993765e-05, + "loss": 1.2232, "step": 1728 }, { - "epoch": 1.14, - "learning_rate": 8.260017694049878e-06, - "loss": 0.8015, + "epoch": 0.18095238095238095, + "grad_norm": 2.130924634431564, + "learning_rate": 1.882898078435534e-05, + "loss": 0.9923, "step": 1729 }, { - "epoch": 1.14, - "learning_rate": 8.249517427107225e-06, - "loss": 0.7307, + "epoch": 0.18105703819989535, + "grad_norm": 2.4549286849941385, + "learning_rate": 1.8827388584305237e-05, + "loss": 1.1306, "step": 1730 }, { - "epoch": 1.14, - "learning_rate": 8.239019150801286e-06, - "loss": 0.6664, + "epoch": 0.18116169544740973, + "grad_norm": 2.698247562016706, + "learning_rate": 1.882579536997029e-05, + "loss": 1.0832, "step": 1731 }, { - "epoch": 1.14, - "learning_rate": 8.228522877070636e-06, - "loss": 0.7064, + "epoch": 0.1812663526949241, + "grad_norm": 2.4883956970188463, + "learning_rate": 1.8824201141533565e-05, + "loss": 1.0421, "step": 1732 }, { - "epoch": 1.14, - "learning_rate": 8.21802861785156e-06, - "loss": 0.758, + "epoch": 0.18137100994243852, + "grad_norm": 2.205789516439095, + "learning_rate": 1.8822605899178242e-05, + "loss": 0.8382, "step": 1733 }, { - "epoch": 1.14, - "learning_rate": 8.207536385078057e-06, - "loss": 0.8387, + "epoch": 0.1814756671899529, + "grad_norm": 2.043592458587824, + "learning_rate": 1.8821009643087613e-05, + "loss": 1.0106, "step": 1734 }, { - "epoch": 1.14, - "learning_rate": 8.197046190681832e-06, - "loss": 0.6874, + "epoch": 0.1815803244374673, + "grad_norm": 2.678564167470479, + "learning_rate": 1.8819412373445094e-05, + "loss": 1.0847, "step": 1735 }, { - "epoch": 1.14, - "learning_rate": 8.186558046592247e-06, - "loss": 0.8579, + "epoch": 0.18168498168498168, + "grad_norm": 2.29945001233077, + "learning_rate": 1.8817814090434218e-05, + "loss": 0.9098, "step": 1736 }, { - "epoch": 1.14, - "learning_rate": 8.176071964736358e-06, - "loss": 0.6297, + "epoch": 0.18178963893249608, + "grad_norm": 2.294178577646669, + "learning_rate": 1.8816214794238623e-05, + "loss": 1.0782, "step": 1737 }, { - "epoch": 1.14, - "learning_rate": 8.16558795703886e-06, - "loss": 0.6555, + "epoch": 0.18189429618001046, + "grad_norm": 2.420407699481665, + "learning_rate": 1.8814614485042077e-05, + "loss": 1.1683, "step": 1738 }, { - "epoch": 1.14, - "learning_rate": 8.155106035422095e-06, - "loss": 0.6886, + "epoch": 0.18199895342752487, + "grad_norm": 2.389361847046125, + "learning_rate": 1.881301316302846e-05, + "loss": 0.9132, "step": 1739 }, { - "epoch": 1.15, - "learning_rate": 8.144626211806038e-06, - "loss": 0.7178, + "epoch": 0.18210361067503925, + "grad_norm": 2.3666157461593653, + "learning_rate": 1.881141082838176e-05, + "loss": 1.0545, "step": 1740 }, { - "epoch": 1.15, - "learning_rate": 8.13414849810826e-06, - "loss": 0.7144, + "epoch": 0.18220826792255362, + "grad_norm": 2.3334015197345024, + "learning_rate": 1.8809807481286096e-05, + "loss": 1.0485, "step": 1741 }, { - "epoch": 1.15, - "learning_rate": 8.123672906243955e-06, - "loss": 0.7969, + "epoch": 0.18231292517006803, + "grad_norm": 2.0751756735643223, + "learning_rate": 1.8808203121925692e-05, + "loss": 0.8661, "step": 1742 }, { - "epoch": 1.15, - "learning_rate": 8.11319944812589e-06, - "loss": 0.7901, + "epoch": 0.1824175824175824, + "grad_norm": 2.138459888794792, + "learning_rate": 1.8806597750484895e-05, + "loss": 0.9413, "step": 1743 }, { - "epoch": 1.15, - "learning_rate": 8.102728135664411e-06, - "loss": 0.7181, + "epoch": 0.1825222396650968, + "grad_norm": 2.3884065014316707, + "learning_rate": 1.8804991367148165e-05, + "loss": 1.0748, "step": 1744 }, { - "epoch": 1.15, - "learning_rate": 8.09225898076742e-06, - "loss": 0.8622, + "epoch": 0.1826268969126112, + "grad_norm": 2.3903789475716826, + "learning_rate": 1.8803383972100073e-05, + "loss": 1.2001, "step": 1745 }, { - "epoch": 1.15, - "learning_rate": 8.08179199534037e-06, - "loss": 0.8609, + "epoch": 0.1827315541601256, + "grad_norm": 2.431230215643124, + "learning_rate": 1.880177556552532e-05, + "loss": 1.097, "step": 1746 }, { - "epoch": 1.15, - "learning_rate": 8.071327191286244e-06, - "loss": 0.7806, + "epoch": 0.18283621140763998, + "grad_norm": 2.6229790482474016, + "learning_rate": 1.880016614760871e-05, + "loss": 1.1281, "step": 1747 }, { - "epoch": 1.15, - "learning_rate": 8.060864580505542e-06, - "loss": 0.6724, + "epoch": 0.18294086865515438, + "grad_norm": 2.3850699291301, + "learning_rate": 1.879855571853517e-05, + "loss": 1.0819, "step": 1748 }, { - "epoch": 1.15, - "learning_rate": 8.050404174896282e-06, - "loss": 0.7419, + "epoch": 0.18304552590266876, + "grad_norm": 2.876444507419557, + "learning_rate": 1.879694427848974e-05, + "loss": 1.0471, "step": 1749 }, { - "epoch": 1.15, - "learning_rate": 8.039945986353954e-06, - "loss": 0.7907, + "epoch": 0.18315018315018314, + "grad_norm": 2.316504236046675, + "learning_rate": 1.879533182765758e-05, + "loss": 1.0519, "step": 1750 }, { - "epoch": 1.15, - "learning_rate": 8.029490026771545e-06, - "loss": 0.6673, + "epoch": 0.18325484039769754, + "grad_norm": 2.2030334955552413, + "learning_rate": 1.8793718366223963e-05, + "loss": 0.928, "step": 1751 }, { - "epoch": 1.15, - "learning_rate": 8.019036308039497e-06, - "loss": 0.8518, + "epoch": 0.18335949764521192, + "grad_norm": 2.3511232102886495, + "learning_rate": 1.879210389437428e-05, + "loss": 0.9923, "step": 1752 }, { - "epoch": 1.15, - "learning_rate": 8.008584842045706e-06, - "loss": 0.7744, + "epoch": 0.18346415489272633, + "grad_norm": 2.4416831019547907, + "learning_rate": 1.8790488412294035e-05, + "loss": 1.0361, "step": 1753 }, { - "epoch": 1.15, - "learning_rate": 7.998135640675514e-06, - "loss": 0.8429, + "epoch": 0.1835688121402407, + "grad_norm": 2.4333832677997034, + "learning_rate": 1.8788871920168855e-05, + "loss": 1.0897, "step": 1754 }, { - "epoch": 1.16, - "learning_rate": 7.987688715811672e-06, - "loss": 0.7407, + "epoch": 0.1836734693877551, + "grad_norm": 2.2799904335051915, + "learning_rate": 1.878725441818447e-05, + "loss": 1.0096, "step": 1755 }, { - "epoch": 1.16, - "learning_rate": 7.977244079334357e-06, - "loss": 0.7023, + "epoch": 0.1837781266352695, + "grad_norm": 2.462858451122516, + "learning_rate": 1.8785635906526737e-05, + "loss": 1.0372, "step": 1756 }, { - "epoch": 1.16, - "learning_rate": 7.966801743121133e-06, - "loss": 0.7511, + "epoch": 0.1838827838827839, + "grad_norm": 2.3533243746357146, + "learning_rate": 1.8784016385381633e-05, + "loss": 1.0798, "step": 1757 }, { - "epoch": 1.16, - "learning_rate": 7.95636171904696e-06, - "loss": 0.6814, + "epoch": 0.18398744113029827, + "grad_norm": 2.718496586207958, + "learning_rate": 1.8782395854935237e-05, + "loss": 1.0633, "step": 1758 }, { - "epoch": 1.16, - "learning_rate": 7.945924018984155e-06, - "loss": 0.8168, + "epoch": 0.18409209837781265, + "grad_norm": 2.652705423511989, + "learning_rate": 1.878077431537375e-05, + "loss": 1.0664, "step": 1759 }, { - "epoch": 1.16, - "learning_rate": 7.935488654802395e-06, - "loss": 0.6846, + "epoch": 0.18419675562532706, + "grad_norm": 2.586313483256547, + "learning_rate": 1.8779151766883502e-05, + "loss": 1.0038, "step": 1760 }, { - "epoch": 1.16, - "learning_rate": 7.925055638368713e-06, - "loss": 0.6312, + "epoch": 0.18430141287284144, + "grad_norm": 2.5041990988530514, + "learning_rate": 1.877752820965091e-05, + "loss": 1.0798, "step": 1761 }, { - "epoch": 1.16, - "learning_rate": 7.914624981547459e-06, - "loss": 0.712, + "epoch": 0.18440607012035584, + "grad_norm": 2.53966882445818, + "learning_rate": 1.877590364386254e-05, + "loss": 1.1485, "step": 1762 }, { - "epoch": 1.16, - "learning_rate": 7.90419669620031e-06, - "loss": 0.8883, + "epoch": 0.18451072736787022, + "grad_norm": 2.5828686624066695, + "learning_rate": 1.877427806970505e-05, + "loss": 1.0969, "step": 1763 }, { - "epoch": 1.16, - "learning_rate": 7.893770794186231e-06, - "loss": 0.7941, + "epoch": 0.18461538461538463, + "grad_norm": 2.3393212197615343, + "learning_rate": 1.8772651487365217e-05, + "loss": 1.1038, "step": 1764 }, { - "epoch": 1.16, - "learning_rate": 7.88334728736149e-06, - "loss": 0.7611, + "epoch": 0.184720041862899, + "grad_norm": 2.7443600527578393, + "learning_rate": 1.8771023897029948e-05, + "loss": 0.8974, "step": 1765 }, { - "epoch": 1.16, - "learning_rate": 7.872926187579625e-06, - "loss": 0.729, + "epoch": 0.1848246991104134, + "grad_norm": 2.5532153964529893, + "learning_rate": 1.876939529888625e-05, + "loss": 0.9866, "step": 1766 }, { - "epoch": 1.16, - "learning_rate": 7.862507506691444e-06, - "loss": 0.7939, + "epoch": 0.1849293563579278, + "grad_norm": 2.4029344349509194, + "learning_rate": 1.8767765693121258e-05, + "loss": 1.0403, "step": 1767 }, { - "epoch": 1.16, - "learning_rate": 7.852091256545e-06, - "loss": 0.713, + "epoch": 0.18503401360544217, + "grad_norm": 2.176455654469618, + "learning_rate": 1.8766135079922213e-05, + "loss": 0.9654, "step": 1768 }, { - "epoch": 1.16, - "learning_rate": 7.841677448985572e-06, - "loss": 0.8089, + "epoch": 0.18513867085295657, + "grad_norm": 2.396055407588553, + "learning_rate": 1.8764503459476476e-05, + "loss": 0.8856, "step": 1769 }, { - "epoch": 1.17, - "learning_rate": 7.831266095855675e-06, - "loss": 0.6924, + "epoch": 0.18524332810047095, + "grad_norm": 2.8406525852271165, + "learning_rate": 1.8762870831971523e-05, + "loss": 1.1081, "step": 1770 }, { - "epoch": 1.17, - "learning_rate": 7.82085720899503e-06, - "loss": 0.6898, + "epoch": 0.18534798534798536, + "grad_norm": 2.9851011238261362, + "learning_rate": 1.8761237197594945e-05, + "loss": 1.0719, "step": 1771 }, { - "epoch": 1.17, - "learning_rate": 7.810450800240549e-06, - "loss": 0.7316, + "epoch": 0.18545264259549973, + "grad_norm": 1.982828765335705, + "learning_rate": 1.8759602556534458e-05, + "loss": 0.9932, "step": 1772 }, { - "epoch": 1.17, - "learning_rate": 7.800046881426331e-06, - "loss": 0.7067, + "epoch": 0.18555729984301414, + "grad_norm": 2.7241455065505287, + "learning_rate": 1.8757966908977875e-05, + "loss": 1.2454, "step": 1773 }, { - "epoch": 1.17, - "learning_rate": 7.789645464383635e-06, - "loss": 0.6733, + "epoch": 0.18566195709052852, + "grad_norm": 2.252662593453829, + "learning_rate": 1.875633025511314e-05, + "loss": 1.0011, "step": 1774 }, { - "epoch": 1.17, - "learning_rate": 7.779246560940887e-06, - "loss": 0.8138, + "epoch": 0.1857666143380429, + "grad_norm": 2.5528950049004204, + "learning_rate": 1.8754692595128313e-05, + "loss": 0.9809, "step": 1775 }, { - "epoch": 1.17, - "learning_rate": 7.768850182923646e-06, - "loss": 0.7343, + "epoch": 0.1858712715855573, + "grad_norm": 2.0122663818965134, + "learning_rate": 1.8753053929211555e-05, + "loss": 0.9965, "step": 1776 }, { - "epoch": 1.17, - "learning_rate": 7.758456342154602e-06, - "loss": 0.8757, + "epoch": 0.18597592883307168, + "grad_norm": 2.1764615737427997, + "learning_rate": 1.875141425755116e-05, + "loss": 1.0969, "step": 1777 }, { - "epoch": 1.17, - "learning_rate": 7.748065050453557e-06, - "loss": 0.7713, + "epoch": 0.18608058608058609, + "grad_norm": 1.8948836535370301, + "learning_rate": 1.874977358033552e-05, + "loss": 0.8658, "step": 1778 }, { - "epoch": 1.17, - "learning_rate": 7.737676319637417e-06, - "loss": 0.7135, + "epoch": 0.18618524332810046, + "grad_norm": 2.361535893832336, + "learning_rate": 1.874813189775316e-05, + "loss": 0.964, "step": 1779 }, { - "epoch": 1.17, - "learning_rate": 7.727290161520176e-06, - "loss": 0.8717, + "epoch": 0.18628990057561487, + "grad_norm": 2.3456335496105707, + "learning_rate": 1.8746489209992713e-05, + "loss": 0.924, "step": 1780 }, { - "epoch": 1.17, - "learning_rate": 7.716906587912903e-06, - "loss": 0.6306, + "epoch": 0.18639455782312925, + "grad_norm": 2.423670459664142, + "learning_rate": 1.874484551724292e-05, + "loss": 0.9615, "step": 1781 }, { - "epoch": 1.17, - "learning_rate": 7.706525610623726e-06, - "loss": 0.7388, + "epoch": 0.18649921507064365, + "grad_norm": 2.2890242877296125, + "learning_rate": 1.8743200819692652e-05, + "loss": 1.0306, "step": 1782 }, { - "epoch": 1.17, - "learning_rate": 7.696147241457817e-06, - "loss": 0.6993, + "epoch": 0.18660387231815803, + "grad_norm": 2.5596300069718505, + "learning_rate": 1.8741555117530887e-05, + "loss": 0.9138, "step": 1783 }, { - "epoch": 1.17, - "learning_rate": 7.685771492217387e-06, - "loss": 0.6875, + "epoch": 0.1867085295656724, + "grad_norm": 2.129042464863284, + "learning_rate": 1.873990841094672e-05, + "loss": 1.1452, "step": 1784 }, { - "epoch": 1.18, - "learning_rate": 7.675398374701669e-06, - "loss": 0.821, + "epoch": 0.18681318681318682, + "grad_norm": 3.064823960289263, + "learning_rate": 1.8738260700129354e-05, + "loss": 0.9969, "step": 1785 }, { - "epoch": 1.18, - "learning_rate": 7.665027900706902e-06, - "loss": 0.7293, + "epoch": 0.1869178440607012, + "grad_norm": 2.1130779511781688, + "learning_rate": 1.8736611985268124e-05, + "loss": 0.9711, "step": 1786 }, { - "epoch": 1.18, - "learning_rate": 7.654660082026313e-06, - "loss": 0.7566, + "epoch": 0.1870225013082156, + "grad_norm": 2.0389255541139994, + "learning_rate": 1.873496226655246e-05, + "loss": 0.8438, "step": 1787 }, { - "epoch": 1.18, - "learning_rate": 7.644294930450117e-06, - "loss": 0.8534, + "epoch": 0.18712715855572998, + "grad_norm": 2.2033567685248947, + "learning_rate": 1.873331154417193e-05, + "loss": 1.0108, "step": 1788 }, { - "epoch": 1.18, - "learning_rate": 7.633932457765491e-06, - "loss": 0.7217, + "epoch": 0.18723181580324438, + "grad_norm": 2.15080379319512, + "learning_rate": 1.8731659818316196e-05, + "loss": 0.8633, "step": 1789 }, { - "epoch": 1.18, - "learning_rate": 7.623572675756568e-06, - "loss": 0.7808, + "epoch": 0.18733647305075876, + "grad_norm": 2.2191227916720004, + "learning_rate": 1.8730007089175047e-05, + "loss": 0.9503, "step": 1790 }, { - "epoch": 1.18, - "learning_rate": 7.613215596204425e-06, - "loss": 0.7292, + "epoch": 0.18744113029827317, + "grad_norm": 2.3128407690400814, + "learning_rate": 1.8728353356938386e-05, + "loss": 0.9191, "step": 1791 }, { - "epoch": 1.18, - "learning_rate": 7.602861230887055e-06, - "loss": 0.73, + "epoch": 0.18754578754578755, + "grad_norm": 2.3372349993067782, + "learning_rate": 1.872669862179623e-05, + "loss": 1.0454, "step": 1792 }, { - "epoch": 1.18, - "learning_rate": 7.592509591579373e-06, - "loss": 0.8007, + "epoch": 0.18765044479330192, + "grad_norm": 2.4025198105450607, + "learning_rate": 1.8725042883938714e-05, + "loss": 1.0697, "step": 1793 }, { - "epoch": 1.18, - "learning_rate": 7.582160690053194e-06, - "loss": 0.9258, + "epoch": 0.18775510204081633, + "grad_norm": 2.2588303051011196, + "learning_rate": 1.8723386143556085e-05, + "loss": 1.21, "step": 1794 }, { - "epoch": 1.18, - "learning_rate": 7.571814538077216e-06, - "loss": 0.7619, + "epoch": 0.1878597592883307, + "grad_norm": 2.1758809138656834, + "learning_rate": 1.87217284008387e-05, + "loss": 1.017, "step": 1795 }, { - "epoch": 1.18, - "learning_rate": 7.5614711474170155e-06, - "loss": 0.7193, + "epoch": 0.1879644165358451, + "grad_norm": 2.505013897983491, + "learning_rate": 1.872006965597704e-05, + "loss": 0.96, "step": 1796 }, { - "epoch": 1.18, - "learning_rate": 7.551130529835016e-06, - "loss": 0.7072, + "epoch": 0.1880690737833595, + "grad_norm": 2.5795773446705303, + "learning_rate": 1.8718409909161704e-05, + "loss": 1.0457, "step": 1797 }, { - "epoch": 1.18, - "learning_rate": 7.540792697090502e-06, - "loss": 0.6891, + "epoch": 0.1881737310308739, + "grad_norm": 2.5053247104374945, + "learning_rate": 1.8716749160583393e-05, + "loss": 1.0533, "step": 1798 }, { - "epoch": 1.18, - "learning_rate": 7.530457660939588e-06, - "loss": 0.7172, + "epoch": 0.18827838827838828, + "grad_norm": 2.5122626396928633, + "learning_rate": 1.871508741043293e-05, + "loss": 0.9278, "step": 1799 }, { - "epoch": 1.18, - "learning_rate": 7.5201254331352056e-06, - "loss": 0.7301, + "epoch": 0.18838304552590268, + "grad_norm": 2.443610775616308, + "learning_rate": 1.871342465890126e-05, + "loss": 0.85, "step": 1800 }, { - "epoch": 1.19, - "learning_rate": 7.509796025427088e-06, - "loss": 0.7413, + "epoch": 0.18848770277341706, + "grad_norm": 2.515901355227164, + "learning_rate": 1.8711760906179428e-05, + "loss": 1.0415, "step": 1801 }, { - "epoch": 1.19, - "learning_rate": 7.49946944956177e-06, - "loss": 0.7004, + "epoch": 0.18859236002093144, + "grad_norm": 2.6696223601207714, + "learning_rate": 1.871009615245861e-05, + "loss": 1.0236, "step": 1802 }, { - "epoch": 1.19, - "learning_rate": 7.489145717282559e-06, - "loss": 0.5519, + "epoch": 0.18869701726844584, + "grad_norm": 2.4784833366393553, + "learning_rate": 1.870843039793009e-05, + "loss": 0.9525, "step": 1803 }, { - "epoch": 1.19, - "learning_rate": 7.478824840329537e-06, - "loss": 0.7054, + "epoch": 0.18880167451596022, + "grad_norm": 2.4542775584185264, + "learning_rate": 1.870676364278526e-05, + "loss": 1.1093, "step": 1804 }, { - "epoch": 1.19, - "learning_rate": 7.4685068304395366e-06, - "loss": 0.8361, + "epoch": 0.18890633176347463, + "grad_norm": 2.1766923572797774, + "learning_rate": 1.8705095887215636e-05, + "loss": 0.9177, "step": 1805 }, { - "epoch": 1.19, - "learning_rate": 7.45819169934612e-06, - "loss": 0.7112, + "epoch": 0.189010989010989, + "grad_norm": 2.486937565003481, + "learning_rate": 1.870342713141285e-05, + "loss": 1.0531, "step": 1806 }, { - "epoch": 1.19, - "learning_rate": 7.4478794587795855e-06, - "loss": 0.813, + "epoch": 0.1891156462585034, + "grad_norm": 2.5489906996516605, + "learning_rate": 1.8701757375568642e-05, + "loss": 1.1089, "step": 1807 }, { - "epoch": 1.19, - "learning_rate": 7.437570120466943e-06, - "loss": 0.7203, + "epoch": 0.1892203035060178, + "grad_norm": 1.940507425147104, + "learning_rate": 1.870008661987487e-05, + "loss": 0.8917, "step": 1808 }, { - "epoch": 1.19, - "learning_rate": 7.427263696131903e-06, - "loss": 0.7893, + "epoch": 0.1893249607535322, + "grad_norm": 2.0972924062895464, + "learning_rate": 1.8698414864523512e-05, + "loss": 1.0538, "step": 1809 }, { - "epoch": 1.19, - "learning_rate": 7.416960197494856e-06, - "loss": 0.7818, + "epoch": 0.18942961800104657, + "grad_norm": 2.2496046336500166, + "learning_rate": 1.869674210970665e-05, + "loss": 1.0835, "step": 1810 }, { - "epoch": 1.19, - "learning_rate": 7.406659636272872e-06, - "loss": 0.6633, + "epoch": 0.18953427524856095, + "grad_norm": 2.5621615108753035, + "learning_rate": 1.869506835561649e-05, + "loss": 1.0739, "step": 1811 }, { - "epoch": 1.19, - "learning_rate": 7.396362024179677e-06, - "loss": 0.7338, + "epoch": 0.18963893249607536, + "grad_norm": 2.440505472169808, + "learning_rate": 1.869339360244535e-05, + "loss": 1.0871, "step": 1812 }, { - "epoch": 1.19, - "learning_rate": 7.386067372925647e-06, - "loss": 0.735, + "epoch": 0.18974358974358974, + "grad_norm": 2.403241452428988, + "learning_rate": 1.869171785038566e-05, + "loss": 0.9661, "step": 1813 }, { - "epoch": 1.19, - "learning_rate": 7.375775694217788e-06, - "loss": 0.81, + "epoch": 0.18984824699110414, + "grad_norm": 2.469688713891638, + "learning_rate": 1.869004109962997e-05, + "loss": 1.0122, "step": 1814 }, { - "epoch": 1.19, - "learning_rate": 7.365486999759723e-06, - "loss": 0.7522, + "epoch": 0.18995290423861852, + "grad_norm": 2.333884875941877, + "learning_rate": 1.868836335037094e-05, + "loss": 1.1133, "step": 1815 }, { - "epoch": 1.2, - "learning_rate": 7.355201301251689e-06, - "loss": 0.6648, + "epoch": 0.19005756148613293, + "grad_norm": 2.3621677518785376, + "learning_rate": 1.868668460280135e-05, + "loss": 0.8132, "step": 1816 }, { - "epoch": 1.2, - "learning_rate": 7.344918610390506e-06, - "loss": 0.6333, + "epoch": 0.1901622187336473, + "grad_norm": 2.7072055929293235, + "learning_rate": 1.8685004857114085e-05, + "loss": 1.0276, "step": 1817 }, { - "epoch": 1.2, - "learning_rate": 7.334638938869582e-06, - "loss": 0.68, + "epoch": 0.19026687598116168, + "grad_norm": 2.645013242609602, + "learning_rate": 1.8683324113502158e-05, + "loss": 1.0181, "step": 1818 }, { - "epoch": 1.2, - "learning_rate": 7.324362298378893e-06, - "loss": 0.7931, + "epoch": 0.1903715332286761, + "grad_norm": 2.5368104479142692, + "learning_rate": 1.868164237215869e-05, + "loss": 0.919, "step": 1819 }, { - "epoch": 1.2, - "learning_rate": 7.314088700604958e-06, - "loss": 0.7953, + "epoch": 0.19047619047619047, + "grad_norm": 1.9021982130849477, + "learning_rate": 1.867995963327691e-05, + "loss": 0.8938, "step": 1820 }, { - "epoch": 1.2, - "learning_rate": 7.303818157230841e-06, - "loss": 0.6957, + "epoch": 0.19058084772370487, + "grad_norm": 2.33319178257821, + "learning_rate": 1.8678275897050177e-05, + "loss": 1.0466, "step": 1821 }, { - "epoch": 1.2, - "learning_rate": 7.293550679936135e-06, - "loss": 0.7854, + "epoch": 0.19068550497121925, + "grad_norm": 2.7008377094227676, + "learning_rate": 1.8676591163671948e-05, + "loss": 1.1263, "step": 1822 }, { - "epoch": 1.2, - "learning_rate": 7.283286280396948e-06, - "loss": 0.6751, + "epoch": 0.19079016221873366, + "grad_norm": 2.1650716424421073, + "learning_rate": 1.8674905433335805e-05, + "loss": 0.9968, "step": 1823 }, { - "epoch": 1.2, - "learning_rate": 7.27302497028588e-06, - "loss": 0.7078, + "epoch": 0.19089481946624803, + "grad_norm": 2.398261119939507, + "learning_rate": 1.867321870623544e-05, + "loss": 1.1226, "step": 1824 }, { - "epoch": 1.2, - "learning_rate": 7.262766761272022e-06, - "loss": 0.7975, + "epoch": 0.19099947671376244, + "grad_norm": 2.312460276704168, + "learning_rate": 1.8671530982564664e-05, + "loss": 0.9971, "step": 1825 }, { - "epoch": 1.2, - "learning_rate": 7.252511665020939e-06, - "loss": 0.6619, + "epoch": 0.19110413396127682, + "grad_norm": 2.647523183773923, + "learning_rate": 1.86698422625174e-05, + "loss": 1.1726, "step": 1826 }, { - "epoch": 1.2, - "learning_rate": 7.242259693194658e-06, - "loss": 0.7909, + "epoch": 0.1912087912087912, + "grad_norm": 2.6076487342169603, + "learning_rate": 1.8668152546287686e-05, + "loss": 1.0306, "step": 1827 }, { - "epoch": 1.2, - "learning_rate": 7.232010857451653e-06, - "loss": 0.6512, + "epoch": 0.1913134484563056, + "grad_norm": 2.3115074361083883, + "learning_rate": 1.8666461834069672e-05, + "loss": 0.8432, "step": 1828 }, { - "epoch": 1.2, - "learning_rate": 7.221765169446819e-06, - "loss": 0.748, + "epoch": 0.19141810570381998, + "grad_norm": 2.1836521898238197, + "learning_rate": 1.866477012605762e-05, + "loss": 1.0452, "step": 1829 }, { - "epoch": 1.2, - "learning_rate": 7.211522640831492e-06, - "loss": 0.6781, + "epoch": 0.19152276295133439, + "grad_norm": 2.330317681639118, + "learning_rate": 1.866307742244592e-05, + "loss": 1.0984, "step": 1830 }, { - "epoch": 1.21, - "learning_rate": 7.2012832832534e-06, - "loss": 0.7517, + "epoch": 0.19162742019884876, + "grad_norm": 2.457940283244116, + "learning_rate": 1.8661383723429062e-05, + "loss": 1.108, "step": 1831 }, { - "epoch": 1.21, - "learning_rate": 7.1910471083566725e-06, - "loss": 0.7194, + "epoch": 0.19173207744636317, + "grad_norm": 2.4031558334493757, + "learning_rate": 1.8659689029201654e-05, + "loss": 1.0354, "step": 1832 }, { - "epoch": 1.21, - "learning_rate": 7.180814127781819e-06, - "loss": 0.658, + "epoch": 0.19183673469387755, + "grad_norm": 1.9518351072841722, + "learning_rate": 1.8657993339958422e-05, + "loss": 0.8381, "step": 1833 }, { - "epoch": 1.21, - "learning_rate": 7.170584353165708e-06, - "loss": 0.7403, + "epoch": 0.19194139194139195, + "grad_norm": 2.140948339516903, + "learning_rate": 1.8656296655894205e-05, + "loss": 0.9547, "step": 1834 }, { - "epoch": 1.21, - "learning_rate": 7.1603577961415704e-06, - "loss": 0.7236, + "epoch": 0.19204604918890633, + "grad_norm": 2.248185211385205, + "learning_rate": 1.865459897720395e-05, + "loss": 0.8359, "step": 1835 }, { - "epoch": 1.21, - "learning_rate": 7.15013446833898e-06, - "loss": 0.8157, + "epoch": 0.1921507064364207, + "grad_norm": 1.9412091916767238, + "learning_rate": 1.865290030408273e-05, + "loss": 0.7927, "step": 1836 }, { - "epoch": 1.21, - "learning_rate": 7.139914381383837e-06, - "loss": 0.7308, + "epoch": 0.19225536368393512, + "grad_norm": 2.7448554100886717, + "learning_rate": 1.8651200636725727e-05, + "loss": 1.1132, "step": 1837 }, { - "epoch": 1.21, - "learning_rate": 7.129697546898344e-06, - "loss": 0.7194, + "epoch": 0.1923600209314495, + "grad_norm": 2.2079122058545533, + "learning_rate": 1.864949997532823e-05, + "loss": 0.9166, "step": 1838 }, { - "epoch": 1.21, - "learning_rate": 7.119483976501021e-06, - "loss": 0.8648, + "epoch": 0.1924646781789639, + "grad_norm": 2.512086470901001, + "learning_rate": 1.864779832008565e-05, + "loss": 1.0589, "step": 1839 }, { - "epoch": 1.21, - "learning_rate": 7.10927368180667e-06, - "loss": 0.6839, + "epoch": 0.19256933542647828, + "grad_norm": 2.232551449072093, + "learning_rate": 1.8646095671193512e-05, + "loss": 1.0556, "step": 1840 }, { - "epoch": 1.21, - "learning_rate": 7.0990666744263646e-06, - "loss": 0.7048, + "epoch": 0.19267399267399268, + "grad_norm": 2.5120233437446577, + "learning_rate": 1.8644392028847457e-05, + "loss": 1.0832, "step": 1841 }, { - "epoch": 1.21, - "learning_rate": 7.0888629659674525e-06, - "loss": 0.695, + "epoch": 0.19277864992150706, + "grad_norm": 2.4799212636866454, + "learning_rate": 1.8642687393243233e-05, + "loss": 0.9932, "step": 1842 }, { - "epoch": 1.21, - "learning_rate": 7.07866256803351e-06, - "loss": 0.7182, + "epoch": 0.19288330716902147, + "grad_norm": 2.0756667968361073, + "learning_rate": 1.8640981764576707e-05, + "loss": 1.0258, "step": 1843 }, { - "epoch": 1.21, - "learning_rate": 7.068465492224361e-06, - "loss": 0.6605, + "epoch": 0.19298796441653585, + "grad_norm": 2.570142389257925, + "learning_rate": 1.863927514304386e-05, + "loss": 1.179, "step": 1844 }, { - "epoch": 1.21, - "learning_rate": 7.058271750136054e-06, - "loss": 0.725, + "epoch": 0.19309262166405022, + "grad_norm": 2.3453819079115314, + "learning_rate": 1.8637567528840784e-05, + "loss": 0.9876, "step": 1845 }, { - "epoch": 1.22, - "learning_rate": 7.04808135336084e-06, - "loss": 0.7677, + "epoch": 0.19319727891156463, + "grad_norm": 2.1997745972557015, + "learning_rate": 1.863585892216369e-05, + "loss": 0.9367, "step": 1846 }, { - "epoch": 1.22, - "learning_rate": 7.03789431348717e-06, - "loss": 0.6982, + "epoch": 0.193301936159079, + "grad_norm": 2.2836486587979543, + "learning_rate": 1.8634149323208896e-05, + "loss": 1.0386, "step": 1847 }, { - "epoch": 1.22, - "learning_rate": 7.027710642099672e-06, - "loss": 0.6249, + "epoch": 0.1934065934065934, + "grad_norm": 2.21044201206823, + "learning_rate": 1.8632438732172846e-05, + "loss": 1.0342, "step": 1848 }, { - "epoch": 1.22, - "learning_rate": 7.017530350779148e-06, - "loss": 0.8421, + "epoch": 0.1935112506541078, + "grad_norm": 2.9097569831352352, + "learning_rate": 1.8630727149252082e-05, + "loss": 0.8409, "step": 1849 }, { - "epoch": 1.22, - "learning_rate": 7.007353451102557e-06, - "loss": 0.8239, + "epoch": 0.1936159079016222, + "grad_norm": 2.498032621838584, + "learning_rate": 1.8629014574643275e-05, + "loss": 1.032, "step": 1850 }, { - "epoch": 1.22, - "learning_rate": 6.997179954642998e-06, - "loss": 0.6682, + "epoch": 0.19372056514913658, + "grad_norm": 2.3432769230009196, + "learning_rate": 1.8627301008543198e-05, + "loss": 1.0969, "step": 1851 }, { - "epoch": 1.22, - "learning_rate": 6.9870098729697015e-06, - "loss": 0.7499, + "epoch": 0.19382522239665098, + "grad_norm": 2.241916204274371, + "learning_rate": 1.8625586451148747e-05, + "loss": 0.8316, "step": 1852 }, { - "epoch": 1.22, - "learning_rate": 6.976843217648013e-06, - "loss": 0.6502, + "epoch": 0.19392987964416536, + "grad_norm": 2.5512406560898055, + "learning_rate": 1.8623870902656928e-05, + "loss": 1.0418, "step": 1853 }, { - "epoch": 1.22, - "learning_rate": 6.966680000239384e-06, - "loss": 0.6084, + "epoch": 0.19403453689167974, + "grad_norm": 2.3680570078020233, + "learning_rate": 1.862215436326486e-05, + "loss": 0.8752, "step": 1854 }, { - "epoch": 1.22, - "learning_rate": 6.956520232301355e-06, - "loss": 0.762, + "epoch": 0.19413919413919414, + "grad_norm": 2.416117696667423, + "learning_rate": 1.8620436833169773e-05, + "loss": 0.972, "step": 1855 }, { - "epoch": 1.22, - "learning_rate": 6.946363925387546e-06, - "loss": 0.7446, + "epoch": 0.19424385138670852, + "grad_norm": 2.2006979425801783, + "learning_rate": 1.861871831256902e-05, + "loss": 1.0598, "step": 1856 }, { - "epoch": 1.22, - "learning_rate": 6.936211091047636e-06, - "loss": 0.8434, + "epoch": 0.19434850863422293, + "grad_norm": 2.1767343281841884, + "learning_rate": 1.861699880166006e-05, + "loss": 0.9034, "step": 1857 }, { - "epoch": 1.22, - "learning_rate": 6.926061740827359e-06, - "loss": 0.8147, + "epoch": 0.1944531658817373, + "grad_norm": 2.4534072009756973, + "learning_rate": 1.8615278300640475e-05, + "loss": 0.9428, "step": 1858 }, { - "epoch": 1.22, - "learning_rate": 6.915915886268486e-06, - "loss": 0.7941, + "epoch": 0.1945578231292517, + "grad_norm": 2.1602024149446355, + "learning_rate": 1.8613556809707943e-05, + "loss": 1.0906, "step": 1859 }, { - "epoch": 1.22, - "learning_rate": 6.905773538908814e-06, - "loss": 0.7455, + "epoch": 0.1946624803767661, + "grad_norm": 2.169341746326727, + "learning_rate": 1.8611834329060272e-05, + "loss": 0.9131, "step": 1860 }, { - "epoch": 1.23, - "learning_rate": 6.895634710282153e-06, - "loss": 0.8107, + "epoch": 0.1947671376242805, + "grad_norm": 2.9643059788895476, + "learning_rate": 1.8610110858895383e-05, + "loss": 0.9651, "step": 1861 }, { - "epoch": 1.23, - "learning_rate": 6.885499411918304e-06, - "loss": 0.7882, + "epoch": 0.19487179487179487, + "grad_norm": 2.213393165596908, + "learning_rate": 1.8608386399411296e-05, + "loss": 1.1012, "step": 1862 }, { - "epoch": 1.23, - "learning_rate": 6.8753676553430606e-06, - "loss": 0.8008, + "epoch": 0.19497645211930925, + "grad_norm": 2.530920660576334, + "learning_rate": 1.8606660950806165e-05, + "loss": 1.0807, "step": 1863 }, { - "epoch": 1.23, - "learning_rate": 6.8652394520781865e-06, - "loss": 0.7703, + "epoch": 0.19508110936682366, + "grad_norm": 2.5971872408801744, + "learning_rate": 1.860493451327824e-05, + "loss": 1.0592, "step": 1864 }, { - "epoch": 1.23, - "learning_rate": 6.855114813641408e-06, - "loss": 0.8034, + "epoch": 0.19518576661433804, + "grad_norm": 2.9737379735677725, + "learning_rate": 1.8603207087025897e-05, + "loss": 1.0376, "step": 1865 }, { - "epoch": 1.23, - "learning_rate": 6.844993751546388e-06, - "loss": 0.7733, + "epoch": 0.19529042386185244, + "grad_norm": 2.3943885247243006, + "learning_rate": 1.8601478672247622e-05, + "loss": 0.8863, "step": 1866 }, { - "epoch": 1.23, - "learning_rate": 6.8348762773027285e-06, - "loss": 0.6668, + "epoch": 0.19539508110936682, + "grad_norm": 2.4125949201995787, + "learning_rate": 1.8599749269142007e-05, + "loss": 1.0162, "step": 1867 }, { - "epoch": 1.23, - "learning_rate": 6.824762402415958e-06, - "loss": 0.8248, + "epoch": 0.19549973835688123, + "grad_norm": 2.3700879805758883, + "learning_rate": 1.859801887790777e-05, + "loss": 0.9566, "step": 1868 }, { - "epoch": 1.23, - "learning_rate": 6.814652138387501e-06, - "loss": 0.7283, + "epoch": 0.1956043956043956, + "grad_norm": 2.17186126800691, + "learning_rate": 1.859628749874373e-05, + "loss": 1.0475, "step": 1869 }, { - "epoch": 1.23, - "learning_rate": 6.8045454967146854e-06, - "loss": 0.7483, + "epoch": 0.19570905285190998, + "grad_norm": 2.0282801776801715, + "learning_rate": 1.8594555131848834e-05, + "loss": 0.9321, "step": 1870 }, { - "epoch": 1.23, - "learning_rate": 6.794442488890705e-06, - "loss": 0.7233, + "epoch": 0.1958137100994244, + "grad_norm": 2.4375974168550423, + "learning_rate": 1.8592821777422126e-05, + "loss": 0.9614, "step": 1871 }, { - "epoch": 1.23, - "learning_rate": 6.784343126404636e-06, - "loss": 0.8548, + "epoch": 0.19591836734693877, + "grad_norm": 2.2740457885998624, + "learning_rate": 1.859108743566278e-05, + "loss": 1.0135, "step": 1872 }, { - "epoch": 1.23, - "learning_rate": 6.774247420741401e-06, - "loss": 0.7449, + "epoch": 0.19602302459445317, + "grad_norm": 2.417294338783456, + "learning_rate": 1.8589352106770072e-05, + "loss": 0.9173, "step": 1873 }, { - "epoch": 1.23, - "learning_rate": 6.764155383381772e-06, - "loss": 0.8191, + "epoch": 0.19612768184196755, + "grad_norm": 2.2618143493988363, + "learning_rate": 1.858761579094339e-05, + "loss": 0.8704, "step": 1874 }, { - "epoch": 1.23, - "learning_rate": 6.754067025802346e-06, - "loss": 0.7457, + "epoch": 0.19623233908948196, + "grad_norm": 2.3808284708358363, + "learning_rate": 1.858587848838225e-05, + "loss": 1.0471, "step": 1875 }, { - "epoch": 1.24, - "learning_rate": 6.743982359475525e-06, - "loss": 0.647, + "epoch": 0.19633699633699633, + "grad_norm": 2.210022086562613, + "learning_rate": 1.8584140199286263e-05, + "loss": 0.9631, "step": 1876 }, { - "epoch": 1.24, - "learning_rate": 6.733901395869527e-06, - "loss": 0.8944, + "epoch": 0.19644165358451074, + "grad_norm": 2.3840130902341916, + "learning_rate": 1.8582400923855166e-05, + "loss": 1.045, "step": 1877 }, { - "epoch": 1.24, - "learning_rate": 6.723824146448352e-06, - "loss": 0.695, + "epoch": 0.19654631083202512, + "grad_norm": 2.606315391047431, + "learning_rate": 1.8580660662288807e-05, + "loss": 1.0785, "step": 1878 }, { - "epoch": 1.24, - "learning_rate": 6.713750622671784e-06, - "loss": 0.596, + "epoch": 0.1966509680795395, + "grad_norm": 2.6364699756222194, + "learning_rate": 1.857891941478714e-05, + "loss": 0.9784, "step": 1879 }, { - "epoch": 1.24, - "learning_rate": 6.7036808359953585e-06, - "loss": 0.5971, + "epoch": 0.1967556253270539, + "grad_norm": 2.072482657011719, + "learning_rate": 1.8577177181550243e-05, + "loss": 0.8879, "step": 1880 }, { - "epoch": 1.24, - "learning_rate": 6.693614797870371e-06, - "loss": 0.6719, + "epoch": 0.19686028257456828, + "grad_norm": 2.2034184725027237, + "learning_rate": 1.8575433962778297e-05, + "loss": 1.0744, "step": 1881 }, { - "epoch": 1.24, - "learning_rate": 6.68355251974385e-06, - "loss": 0.7233, + "epoch": 0.19696493982208269, + "grad_norm": 2.5086871489753526, + "learning_rate": 1.8573689758671607e-05, + "loss": 1.0613, "step": 1882 }, { - "epoch": 1.24, - "learning_rate": 6.673494013058548e-06, - "loss": 0.7284, + "epoch": 0.19706959706959706, + "grad_norm": 2.1761373108687416, + "learning_rate": 1.8571944569430582e-05, + "loss": 1.0279, "step": 1883 }, { - "epoch": 1.24, - "learning_rate": 6.663439289252932e-06, - "loss": 0.8362, + "epoch": 0.19717425431711147, + "grad_norm": 2.3377334894353488, + "learning_rate": 1.857019839525575e-05, + "loss": 1.0278, "step": 1884 }, { - "epoch": 1.24, - "learning_rate": 6.653388359761161e-06, - "loss": 0.8038, + "epoch": 0.19727891156462585, + "grad_norm": 2.4580887107257685, + "learning_rate": 1.856845123634775e-05, + "loss": 1.019, "step": 1885 }, { - "epoch": 1.24, - "learning_rate": 6.6433412360130855e-06, - "loss": 0.7687, + "epoch": 0.19738356881214025, + "grad_norm": 2.5860343029123913, + "learning_rate": 1.856670309290733e-05, + "loss": 1.0096, "step": 1886 }, { - "epoch": 1.24, - "learning_rate": 6.633297929434224e-06, - "loss": 0.6707, + "epoch": 0.19748822605965463, + "grad_norm": 2.960738910419292, + "learning_rate": 1.8564953965135358e-05, + "loss": 0.9591, "step": 1887 }, { - "epoch": 1.24, - "learning_rate": 6.623258451445758e-06, - "loss": 0.7155, + "epoch": 0.197592883307169, + "grad_norm": 2.3795107266092264, + "learning_rate": 1.8563203853232812e-05, + "loss": 0.8907, "step": 1888 }, { - "epoch": 1.24, - "learning_rate": 6.613222813464508e-06, - "loss": 0.6336, + "epoch": 0.19769754055468342, + "grad_norm": 2.2435840544300336, + "learning_rate": 1.8561452757400785e-05, + "loss": 0.9413, "step": 1889 }, { - "epoch": 1.24, - "learning_rate": 6.603191026902936e-06, - "loss": 0.7285, + "epoch": 0.1978021978021978, + "grad_norm": 2.068348693567875, + "learning_rate": 1.8559700677840482e-05, + "loss": 1.1047, "step": 1890 }, { - "epoch": 1.24, - "learning_rate": 6.59316310316912e-06, - "loss": 0.6758, + "epoch": 0.1979068550497122, + "grad_norm": 2.114604247450884, + "learning_rate": 1.855794761475322e-05, + "loss": 1.0478, "step": 1891 }, { - "epoch": 1.25, - "learning_rate": 6.583139053666745e-06, - "loss": 0.7682, + "epoch": 0.19801151229722658, + "grad_norm": 2.3715850812930466, + "learning_rate": 1.8556193568340423e-05, + "loss": 1.0699, "step": 1892 }, { - "epoch": 1.25, - "learning_rate": 6.573118889795093e-06, - "loss": 0.6451, + "epoch": 0.19811616954474098, + "grad_norm": 2.037660362224576, + "learning_rate": 1.855443853880364e-05, + "loss": 0.9552, "step": 1893 }, { - "epoch": 1.25, - "learning_rate": 6.563102622949021e-06, - "loss": 0.733, + "epoch": 0.19822082679225536, + "grad_norm": 2.472478303294875, + "learning_rate": 1.8552682526344532e-05, + "loss": 1.0812, "step": 1894 }, { - "epoch": 1.25, - "learning_rate": 6.553090264518962e-06, - "loss": 0.7325, + "epoch": 0.19832548403976977, + "grad_norm": 2.174274765095585, + "learning_rate": 1.855092553116486e-05, + "loss": 1.0514, "step": 1895 }, { - "epoch": 1.25, - "learning_rate": 6.543081825890901e-06, - "loss": 0.7829, + "epoch": 0.19843014128728415, + "grad_norm": 2.418758882311152, + "learning_rate": 1.8549167553466515e-05, + "loss": 1.1054, "step": 1896 }, { - "epoch": 1.25, - "learning_rate": 6.533077318446365e-06, - "loss": 0.7278, + "epoch": 0.19853479853479852, + "grad_norm": 2.6391811792480926, + "learning_rate": 1.8547408593451483e-05, + "loss": 0.8834, "step": 1897 }, { - "epoch": 1.25, - "learning_rate": 6.523076753562411e-06, - "loss": 0.7916, + "epoch": 0.19863945578231293, + "grad_norm": 2.362589627641238, + "learning_rate": 1.8545648651321874e-05, + "loss": 1.049, "step": 1898 }, { - "epoch": 1.25, - "learning_rate": 6.513080142611612e-06, - "loss": 0.7904, + "epoch": 0.1987441130298273, + "grad_norm": 3.6366633137355113, + "learning_rate": 1.8543887727279915e-05, + "loss": 0.8768, "step": 1899 }, { - "epoch": 1.25, - "learning_rate": 6.503087496962044e-06, - "loss": 0.6089, + "epoch": 0.1988487702773417, + "grad_norm": 2.5316763474698423, + "learning_rate": 1.8542125821527933e-05, + "loss": 0.95, "step": 1900 }, { - "epoch": 1.25, - "learning_rate": 6.493098827977278e-06, - "loss": 0.7025, + "epoch": 0.1989534275248561, + "grad_norm": 2.4002232910837322, + "learning_rate": 1.8540362934268378e-05, + "loss": 1.1223, "step": 1901 }, { - "epoch": 1.25, - "learning_rate": 6.483114147016358e-06, - "loss": 0.7289, + "epoch": 0.1990580847723705, + "grad_norm": 2.4136377650468477, + "learning_rate": 1.853859906570381e-05, + "loss": 1.0736, "step": 1902 }, { - "epoch": 1.25, - "learning_rate": 6.473133465433793e-06, - "loss": 0.754, + "epoch": 0.19916274201988488, + "grad_norm": 2.5540511046453362, + "learning_rate": 1.85368342160369e-05, + "loss": 1.1074, "step": 1903 }, { - "epoch": 1.25, - "learning_rate": 6.463156794579543e-06, - "loss": 0.7923, + "epoch": 0.19926739926739928, + "grad_norm": 2.2183989268155706, + "learning_rate": 1.8535068385470434e-05, + "loss": 0.9569, "step": 1904 }, { - "epoch": 1.25, - "learning_rate": 6.453184145799015e-06, - "loss": 0.7206, + "epoch": 0.19937205651491366, + "grad_norm": 2.2251618210625463, + "learning_rate": 1.8533301574207305e-05, + "loss": 1.024, "step": 1905 }, { - "epoch": 1.25, - "learning_rate": 6.4432155304330335e-06, - "loss": 0.6077, + "epoch": 0.19947671376242804, + "grad_norm": 2.4725959487205005, + "learning_rate": 1.853153378245053e-05, + "loss": 0.9864, "step": 1906 }, { - "epoch": 1.26, - "learning_rate": 6.433250959817845e-06, - "loss": 0.6976, + "epoch": 0.19958137100994244, + "grad_norm": 2.426105671626237, + "learning_rate": 1.8529765010403226e-05, + "loss": 1.1186, "step": 1907 }, { - "epoch": 1.26, - "learning_rate": 6.42329044528508e-06, - "loss": 0.7523, + "epoch": 0.19968602825745682, + "grad_norm": 2.2610224754205457, + "learning_rate": 1.852799525826863e-05, + "loss": 1.0378, "step": 1908 }, { - "epoch": 1.26, - "learning_rate": 6.413333998161772e-06, - "loss": 0.6929, + "epoch": 0.19979068550497123, + "grad_norm": 2.1525597269244683, + "learning_rate": 1.852622452625009e-05, + "loss": 0.9537, "step": 1909 }, { - "epoch": 1.26, - "learning_rate": 6.4033816297703245e-06, - "loss": 0.8903, + "epoch": 0.1998953427524856, + "grad_norm": 2.251421695737976, + "learning_rate": 1.8524452814551067e-05, + "loss": 1.0182, "step": 1910 }, { - "epoch": 1.26, - "learning_rate": 6.393433351428502e-06, - "loss": 0.6549, + "epoch": 0.2, + "grad_norm": 2.4742262214471458, + "learning_rate": 1.852268012337514e-05, + "loss": 0.9118, "step": 1911 }, { - "epoch": 1.26, - "learning_rate": 6.383489174449422e-06, - "loss": 0.6684, + "epoch": 0.2001046572475144, + "grad_norm": 2.351956722228832, + "learning_rate": 1.8520906452925983e-05, + "loss": 0.9906, "step": 1912 }, { - "epoch": 1.26, - "learning_rate": 6.373549110141528e-06, - "loss": 0.6565, + "epoch": 0.20020931449502877, + "grad_norm": 2.5457326056054304, + "learning_rate": 1.8519131803407405e-05, + "loss": 0.9223, "step": 1913 }, { - "epoch": 1.26, - "learning_rate": 6.363613169808593e-06, - "loss": 0.6944, + "epoch": 0.20031397174254317, + "grad_norm": 2.3125295104481087, + "learning_rate": 1.8517356175023312e-05, + "loss": 1.0705, "step": 1914 }, { - "epoch": 1.26, - "learning_rate": 6.353681364749703e-06, - "loss": 0.7965, + "epoch": 0.20041862899005755, + "grad_norm": 2.4672166374713655, + "learning_rate": 1.8515579567977728e-05, + "loss": 1.003, "step": 1915 }, { - "epoch": 1.26, - "learning_rate": 6.343753706259239e-06, - "loss": 0.7501, + "epoch": 0.20052328623757196, + "grad_norm": 2.1521918308626193, + "learning_rate": 1.8513801982474788e-05, + "loss": 0.9784, "step": 1916 }, { - "epoch": 1.26, - "learning_rate": 6.333830205626861e-06, - "loss": 0.7404, + "epoch": 0.20062794348508634, + "grad_norm": 2.4414899299046295, + "learning_rate": 1.8512023418718745e-05, + "loss": 1.0942, "step": 1917 }, { - "epoch": 1.26, - "learning_rate": 6.323910874137509e-06, - "loss": 0.7629, + "epoch": 0.20073260073260074, + "grad_norm": 2.3650808987908105, + "learning_rate": 1.851024387691395e-05, + "loss": 1.047, "step": 1918 }, { - "epoch": 1.26, - "learning_rate": 6.313995723071377e-06, - "loss": 0.8053, + "epoch": 0.20083725798011512, + "grad_norm": 2.5314876863007703, + "learning_rate": 1.8508463357264883e-05, + "loss": 1.0066, "step": 1919 }, { - "epoch": 1.26, - "learning_rate": 6.3040847637039075e-06, - "loss": 0.6441, + "epoch": 0.20094191522762953, + "grad_norm": 2.0756056456744156, + "learning_rate": 1.8506681859976127e-05, + "loss": 0.8131, "step": 1920 }, { - "epoch": 1.26, - "learning_rate": 6.294178007305776e-06, - "loss": 0.718, + "epoch": 0.2010465724751439, + "grad_norm": 2.4985838715578894, + "learning_rate": 1.850489938525238e-05, + "loss": 0.9835, "step": 1921 }, { - "epoch": 1.27, - "learning_rate": 6.284275465142875e-06, - "loss": 0.7312, + "epoch": 0.20115122972265828, + "grad_norm": 2.272801695052019, + "learning_rate": 1.8503115933298455e-05, + "loss": 1.0649, "step": 1922 }, { - "epoch": 1.27, - "learning_rate": 6.27437714847631e-06, - "loss": 0.7502, + "epoch": 0.2012558869701727, + "grad_norm": 2.0961137864615265, + "learning_rate": 1.850133150431927e-05, + "loss": 1.0169, "step": 1923 }, { - "epoch": 1.27, - "learning_rate": 6.2644830685623756e-06, - "loss": 0.8266, + "epoch": 0.20136054421768707, + "grad_norm": 2.2133469820080234, + "learning_rate": 1.8499546098519863e-05, + "loss": 0.7855, "step": 1924 }, { - "epoch": 1.27, - "learning_rate": 6.254593236652554e-06, - "loss": 0.7382, + "epoch": 0.20146520146520147, + "grad_norm": 2.272863601676403, + "learning_rate": 1.8497759716105376e-05, + "loss": 1.1192, "step": 1925 }, { - "epoch": 1.27, - "learning_rate": 6.244707663993494e-06, - "loss": 0.7314, + "epoch": 0.20156985871271585, + "grad_norm": 2.3222789724527915, + "learning_rate": 1.8495972357281073e-05, + "loss": 1.1352, "step": 1926 }, { - "epoch": 1.27, - "learning_rate": 6.2348263618269975e-06, - "loss": 0.7311, + "epoch": 0.20167451596023026, + "grad_norm": 2.3537538388981933, + "learning_rate": 1.8494184022252324e-05, + "loss": 1.0501, "step": 1927 }, { - "epoch": 1.27, - "learning_rate": 6.224949341390017e-06, - "loss": 0.7672, + "epoch": 0.20177917320774463, + "grad_norm": 2.7018467868355485, + "learning_rate": 1.849239471122461e-05, + "loss": 1.1058, "step": 1928 }, { - "epoch": 1.27, - "learning_rate": 6.215076613914628e-06, - "loss": 0.7134, + "epoch": 0.20188383045525904, + "grad_norm": 2.004307635291416, + "learning_rate": 1.8490604424403527e-05, + "loss": 0.874, "step": 1929 }, { - "epoch": 1.27, - "learning_rate": 6.205208190628034e-06, - "loss": 0.7885, + "epoch": 0.20198848770277342, + "grad_norm": 2.096808325868463, + "learning_rate": 1.8488813161994784e-05, + "loss": 1.0896, "step": 1930 }, { - "epoch": 1.27, - "learning_rate": 6.1953440827525306e-06, - "loss": 0.7421, + "epoch": 0.2020931449502878, + "grad_norm": 2.0352763262829705, + "learning_rate": 1.84870209242042e-05, + "loss": 0.8975, "step": 1931 }, { - "epoch": 1.27, - "learning_rate": 6.185484301505518e-06, - "loss": 0.7605, + "epoch": 0.2021978021978022, + "grad_norm": 2.6788994218915705, + "learning_rate": 1.8485227711237707e-05, + "loss": 1.0147, "step": 1932 }, { - "epoch": 1.27, - "learning_rate": 6.175628858099471e-06, - "loss": 0.7185, + "epoch": 0.20230245944531658, + "grad_norm": 2.462010742069801, + "learning_rate": 1.848343352330135e-05, + "loss": 1.1053, "step": 1933 }, { - "epoch": 1.27, - "learning_rate": 6.165777763741932e-06, - "loss": 0.7015, + "epoch": 0.20240711669283099, + "grad_norm": 2.50095298478946, + "learning_rate": 1.848163836060128e-05, + "loss": 1.0792, "step": 1934 }, { - "epoch": 1.27, - "learning_rate": 6.1559310296355005e-06, - "loss": 0.6552, + "epoch": 0.20251177394034536, + "grad_norm": 2.49507265553553, + "learning_rate": 1.847984222334377e-05, + "loss": 0.9941, "step": 1935 }, { - "epoch": 1.27, - "learning_rate": 6.146088666977804e-06, - "loss": 0.7147, + "epoch": 0.20261643118785977, + "grad_norm": 2.669346708093879, + "learning_rate": 1.84780451117352e-05, + "loss": 1.1946, "step": 1936 }, { - "epoch": 1.28, - "learning_rate": 6.1362506869615205e-06, - "loss": 0.6034, + "epoch": 0.20272108843537415, + "grad_norm": 3.0351707992452344, + "learning_rate": 1.8476247025982058e-05, + "loss": 1.0224, "step": 1937 }, { - "epoch": 1.28, - "learning_rate": 6.126417100774329e-06, - "loss": 0.6316, + "epoch": 0.20282574568288855, + "grad_norm": 2.1301304657947058, + "learning_rate": 1.847444796629095e-05, + "loss": 0.9222, "step": 1938 }, { - "epoch": 1.28, - "learning_rate": 6.116587919598915e-06, - "loss": 0.6678, + "epoch": 0.20293040293040293, + "grad_norm": 2.1876447064810467, + "learning_rate": 1.847264793286859e-05, + "loss": 0.9753, "step": 1939 }, { - "epoch": 1.28, - "learning_rate": 6.106763154612963e-06, - "loss": 0.6918, + "epoch": 0.2030350601779173, + "grad_norm": 2.2563332049617006, + "learning_rate": 1.8470846925921807e-05, + "loss": 1.0181, "step": 1940 }, { - "epoch": 1.28, - "learning_rate": 6.096942816989116e-06, - "loss": 0.7406, + "epoch": 0.20313971742543172, + "grad_norm": 2.3267222100748826, + "learning_rate": 1.8469044945657543e-05, + "loss": 1.015, "step": 1941 }, { - "epoch": 1.28, - "learning_rate": 6.0871269178949986e-06, - "loss": 0.7056, + "epoch": 0.2032443746729461, + "grad_norm": 2.441159690044786, + "learning_rate": 1.8467241992282842e-05, + "loss": 1.1735, "step": 1942 }, { - "epoch": 1.28, - "learning_rate": 6.0773154684931875e-06, - "loss": 0.6214, + "epoch": 0.2033490319204605, + "grad_norm": 2.1163326570343806, + "learning_rate": 1.8465438066004875e-05, + "loss": 1.0618, "step": 1943 }, { - "epoch": 1.28, - "learning_rate": 6.0675084799411975e-06, - "loss": 0.648, + "epoch": 0.20345368916797488, + "grad_norm": 2.117403160909713, + "learning_rate": 1.846363316703091e-05, + "loss": 1.0112, "step": 1944 }, { - "epoch": 1.28, - "learning_rate": 6.057705963391461e-06, - "loss": 0.6987, + "epoch": 0.20355834641548928, + "grad_norm": 2.597051152568622, + "learning_rate": 1.846182729556834e-05, + "loss": 0.9412, "step": 1945 }, { - "epoch": 1.28, - "learning_rate": 6.047907929991333e-06, - "loss": 0.7254, + "epoch": 0.20366300366300366, + "grad_norm": 2.3342553808450024, + "learning_rate": 1.8460020451824658e-05, + "loss": 0.8846, "step": 1946 }, { - "epoch": 1.28, - "learning_rate": 6.038114390883073e-06, - "loss": 0.7628, + "epoch": 0.20376766091051807, + "grad_norm": 2.493805789843552, + "learning_rate": 1.8458212636007474e-05, + "loss": 1.0156, "step": 1947 }, { - "epoch": 1.28, - "learning_rate": 6.028325357203822e-06, - "loss": 0.6533, + "epoch": 0.20387231815803245, + "grad_norm": 2.597283250350886, + "learning_rate": 1.8456403848324513e-05, + "loss": 1.0745, "step": 1948 }, { - "epoch": 1.28, - "learning_rate": 6.018540840085611e-06, - "loss": 0.9035, + "epoch": 0.20397697540554682, + "grad_norm": 2.178100429523685, + "learning_rate": 1.8454594088983608e-05, + "loss": 1.0338, "step": 1949 }, { - "epoch": 1.28, - "learning_rate": 6.008760850655316e-06, - "loss": 0.7052, + "epoch": 0.20408163265306123, + "grad_norm": 2.627673207192234, + "learning_rate": 1.8452783358192697e-05, + "loss": 1.1234, "step": 1950 }, { - "epoch": 1.28, - "learning_rate": 5.998985400034676e-06, - "loss": 0.6983, + "epoch": 0.2041862899005756, + "grad_norm": 2.6660377019985315, + "learning_rate": 1.845097165615985e-05, + "loss": 1.0812, "step": 1951 }, { - "epoch": 1.29, - "learning_rate": 5.989214499340267e-06, - "loss": 0.6291, + "epoch": 0.20429094714809, + "grad_norm": 2.4399501701622492, + "learning_rate": 1.8449158983093226e-05, + "loss": 1.1253, "step": 1952 }, { - "epoch": 1.29, - "learning_rate": 5.979448159683491e-06, - "loss": 0.7766, + "epoch": 0.2043956043956044, + "grad_norm": 2.078040777525875, + "learning_rate": 1.8447345339201105e-05, + "loss": 0.9986, "step": 1953 }, { - "epoch": 1.29, - "learning_rate": 5.96968639217056e-06, - "loss": 0.6956, + "epoch": 0.2045002616431188, + "grad_norm": 2.344622782021923, + "learning_rate": 1.844553072469188e-05, + "loss": 1.0179, "step": 1954 }, { - "epoch": 1.29, - "learning_rate": 5.959929207902488e-06, - "loss": 0.742, + "epoch": 0.20460491889063318, + "grad_norm": 2.3007348240880807, + "learning_rate": 1.8443715139774055e-05, + "loss": 1.0646, "step": 1955 }, { - "epoch": 1.29, - "learning_rate": 5.950176617975079e-06, - "loss": 0.7034, + "epoch": 0.20470957613814755, + "grad_norm": 2.281518769125784, + "learning_rate": 1.844189858465624e-05, + "loss": 1.0593, "step": 1956 }, { - "epoch": 1.29, - "learning_rate": 5.9404286334789115e-06, - "loss": 0.6267, + "epoch": 0.20481423338566196, + "grad_norm": 2.4164773491310765, + "learning_rate": 1.844008105954717e-05, + "loss": 1.078, "step": 1957 }, { - "epoch": 1.29, - "learning_rate": 5.930685265499329e-06, - "loss": 0.589, + "epoch": 0.20491889063317634, + "grad_norm": 2.3330463609538232, + "learning_rate": 1.843826256465567e-05, + "loss": 1.0492, "step": 1958 }, { - "epoch": 1.29, - "learning_rate": 5.9209465251164175e-06, - "loss": 0.8011, + "epoch": 0.20502354788069074, + "grad_norm": 2.2742931414383167, + "learning_rate": 1.8436443100190702e-05, + "loss": 0.9409, "step": 1959 }, { - "epoch": 1.29, - "learning_rate": 5.911212423405008e-06, - "loss": 0.6309, + "epoch": 0.20512820512820512, + "grad_norm": 4.341203039506652, + "learning_rate": 1.8434622666361316e-05, + "loss": 1.0529, "step": 1960 }, { - "epoch": 1.29, - "learning_rate": 5.901482971434652e-06, - "loss": 0.6323, + "epoch": 0.20523286237571953, + "grad_norm": 2.293281560740709, + "learning_rate": 1.8432801263376686e-05, + "loss": 1.1591, "step": 1961 }, { - "epoch": 1.29, - "learning_rate": 5.891758180269618e-06, - "loss": 0.71, + "epoch": 0.2053375196232339, + "grad_norm": 2.0799628461136614, + "learning_rate": 1.8430978891446097e-05, + "loss": 1.009, "step": 1962 }, { - "epoch": 1.29, - "learning_rate": 5.882038060968871e-06, - "loss": 0.6835, + "epoch": 0.2054421768707483, + "grad_norm": 2.34597923911034, + "learning_rate": 1.8429155550778942e-05, + "loss": 1.0635, "step": 1963 }, { - "epoch": 1.29, - "learning_rate": 5.872322624586061e-06, - "loss": 0.7589, + "epoch": 0.2055468341182627, + "grad_norm": 2.525872889513285, + "learning_rate": 1.8427331241584724e-05, + "loss": 0.9838, "step": 1964 }, { - "epoch": 1.29, - "learning_rate": 5.862611882169518e-06, - "loss": 0.7705, + "epoch": 0.20565149136577707, + "grad_norm": 2.178144919085616, + "learning_rate": 1.8425505964073065e-05, + "loss": 1.0336, "step": 1965 }, { - "epoch": 1.29, - "learning_rate": 5.852905844762229e-06, - "loss": 0.7749, + "epoch": 0.20575614861329147, + "grad_norm": 2.8651481979144036, + "learning_rate": 1.8423679718453692e-05, + "loss": 1.1158, "step": 1966 }, { - "epoch": 1.29, - "learning_rate": 5.8432045234018365e-06, - "loss": 0.7817, + "epoch": 0.20586080586080585, + "grad_norm": 2.555904916520759, + "learning_rate": 1.842185250493644e-05, + "loss": 1.0651, "step": 1967 }, { - "epoch": 1.3, - "learning_rate": 5.833507929120612e-06, - "loss": 0.6737, + "epoch": 0.20596546310832026, + "grad_norm": 2.140596234465768, + "learning_rate": 1.842002432373126e-05, + "loss": 0.8645, "step": 1968 }, { - "epoch": 1.3, - "learning_rate": 5.823816072945458e-06, - "loss": 0.6447, + "epoch": 0.20607012035583464, + "grad_norm": 2.089196272187266, + "learning_rate": 1.8418195175048217e-05, + "loss": 1.0134, "step": 1969 }, { - "epoch": 1.3, - "learning_rate": 5.814128965897887e-06, - "loss": 0.7379, + "epoch": 0.20617477760334904, + "grad_norm": 2.6661677515756597, + "learning_rate": 1.841636505909749e-05, + "loss": 1.0963, "step": 1970 }, { - "epoch": 1.3, - "learning_rate": 5.804446618994009e-06, - "loss": 0.613, + "epoch": 0.20627943485086342, + "grad_norm": 2.023006671314738, + "learning_rate": 1.841453397608935e-05, + "loss": 0.9889, "step": 1971 }, { - "epoch": 1.3, - "learning_rate": 5.794769043244527e-06, - "loss": 0.5713, + "epoch": 0.20638409209837782, + "grad_norm": 2.4607506296207267, + "learning_rate": 1.8412701926234197e-05, + "loss": 0.9246, "step": 1972 }, { - "epoch": 1.3, - "learning_rate": 5.785096249654705e-06, - "loss": 0.8384, + "epoch": 0.2064887493458922, + "grad_norm": 2.0748117406848228, + "learning_rate": 1.841086890974254e-05, + "loss": 1.0495, "step": 1973 }, { - "epoch": 1.3, - "learning_rate": 5.775428249224382e-06, - "loss": 0.7849, + "epoch": 0.20659340659340658, + "grad_norm": 2.293516807467432, + "learning_rate": 1.8409034926824995e-05, + "loss": 1.0593, "step": 1974 }, { - "epoch": 1.3, - "learning_rate": 5.765765052947941e-06, - "loss": 0.9222, + "epoch": 0.206698063840921, + "grad_norm": 2.2095392100107003, + "learning_rate": 1.8407199977692292e-05, + "loss": 0.9036, "step": 1975 }, { - "epoch": 1.3, - "learning_rate": 5.756106671814301e-06, - "loss": 0.8544, + "epoch": 0.20680272108843537, + "grad_norm": 2.0148849689379587, + "learning_rate": 1.8405364062555263e-05, + "loss": 0.9078, "step": 1976 }, { - "epoch": 1.3, - "learning_rate": 5.7464531168069096e-06, - "loss": 0.7115, + "epoch": 0.20690737833594977, + "grad_norm": 2.3688402106868907, + "learning_rate": 1.840352718162487e-05, + "loss": 1.0614, "step": 1977 }, { - "epoch": 1.3, - "learning_rate": 5.7368043989037206e-06, - "loss": 0.7258, + "epoch": 0.20701203558346415, + "grad_norm": 2.316317970644639, + "learning_rate": 1.8401689335112164e-05, + "loss": 0.9362, "step": 1978 }, { - "epoch": 1.3, - "learning_rate": 5.7271605290771895e-06, - "loss": 0.7261, + "epoch": 0.20711669283097855, + "grad_norm": 2.16166387910284, + "learning_rate": 1.8399850523228325e-05, + "loss": 0.9942, "step": 1979 }, { - "epoch": 1.3, - "learning_rate": 5.717521518294259e-06, - "loss": 0.6235, + "epoch": 0.20722135007849293, + "grad_norm": 2.6063947890703902, + "learning_rate": 1.8398010746184628e-05, + "loss": 1.1124, "step": 1980 }, { - "epoch": 1.3, - "learning_rate": 5.70788737751635e-06, - "loss": 0.6413, + "epoch": 0.20732600732600734, + "grad_norm": 2.47106442178898, + "learning_rate": 1.8396170004192474e-05, + "loss": 1.1101, "step": 1981 }, { - "epoch": 1.3, - "learning_rate": 5.698258117699334e-06, - "loss": 0.7574, + "epoch": 0.20743066457352172, + "grad_norm": 2.2549560321044875, + "learning_rate": 1.8394328297463366e-05, + "loss": 1.0547, "step": 1982 }, { - "epoch": 1.31, - "learning_rate": 5.68863374979354e-06, - "loss": 0.7668, + "epoch": 0.2075353218210361, + "grad_norm": 2.0122298157744796, + "learning_rate": 1.839248562620892e-05, + "loss": 0.8095, "step": 1983 }, { - "epoch": 1.31, - "learning_rate": 5.6790142847437355e-06, - "loss": 0.7314, + "epoch": 0.2076399790685505, + "grad_norm": 2.1855154297200494, + "learning_rate": 1.839064199064086e-05, + "loss": 0.9771, "step": 1984 }, { - "epoch": 1.31, - "learning_rate": 5.669399733489108e-06, - "loss": 0.7755, + "epoch": 0.20774463631606488, + "grad_norm": 1.9322206736673364, + "learning_rate": 1.8388797390971026e-05, + "loss": 0.8952, "step": 1985 }, { - "epoch": 1.31, - "learning_rate": 5.659790106963261e-06, - "loss": 0.7053, + "epoch": 0.20784929356357928, + "grad_norm": 2.414532007703879, + "learning_rate": 1.838695182741137e-05, + "loss": 0.9121, "step": 1986 }, { - "epoch": 1.31, - "learning_rate": 5.650185416094194e-06, - "loss": 0.6335, + "epoch": 0.20795395081109366, + "grad_norm": 2.186308171958817, + "learning_rate": 1.8385105300173943e-05, + "loss": 1.0025, "step": 1987 }, { - "epoch": 1.31, - "learning_rate": 5.640585671804296e-06, - "loss": 0.6977, + "epoch": 0.20805860805860807, + "grad_norm": 2.2280850457274455, + "learning_rate": 1.838325780947092e-05, + "loss": 1.0759, "step": 1988 }, { - "epoch": 1.31, - "learning_rate": 5.63099088501033e-06, - "loss": 0.7254, + "epoch": 0.20816326530612245, + "grad_norm": 2.2909334704230195, + "learning_rate": 1.838140935551458e-05, + "loss": 0.9594, "step": 1989 }, { - "epoch": 1.31, - "learning_rate": 5.621401066623421e-06, - "loss": 0.75, + "epoch": 0.20826792255363685, + "grad_norm": 2.104015005202415, + "learning_rate": 1.8379559938517314e-05, + "loss": 0.8966, "step": 1990 }, { - "epoch": 1.31, - "learning_rate": 5.611816227549048e-06, - "loss": 0.7286, + "epoch": 0.20837257980115123, + "grad_norm": 3.017180350499111, + "learning_rate": 1.8377709558691622e-05, + "loss": 1.0479, "step": 1991 }, { - "epoch": 1.31, - "learning_rate": 5.602236378687018e-06, - "loss": 0.7389, + "epoch": 0.2084772370486656, + "grad_norm": 2.1719617305672356, + "learning_rate": 1.837585821625012e-05, + "loss": 1.131, "step": 1992 }, { - "epoch": 1.31, - "learning_rate": 5.59266153093147e-06, - "loss": 0.7002, + "epoch": 0.20858189429618001, + "grad_norm": 2.8996497541322994, + "learning_rate": 1.8374005911405528e-05, + "loss": 0.9922, "step": 1993 }, { - "epoch": 1.31, - "learning_rate": 5.583091695170857e-06, - "loss": 0.8015, + "epoch": 0.2086865515436944, + "grad_norm": 2.2079598409798527, + "learning_rate": 1.8372152644370684e-05, + "loss": 1.007, "step": 1994 }, { - "epoch": 1.31, - "learning_rate": 5.5735268822879344e-06, - "loss": 0.6768, + "epoch": 0.2087912087912088, + "grad_norm": 2.1831162058992684, + "learning_rate": 1.8370298415358527e-05, + "loss": 1.0665, "step": 1995 }, { - "epoch": 1.31, - "learning_rate": 5.563967103159733e-06, - "loss": 0.7159, + "epoch": 0.20889586603872318, + "grad_norm": 2.164569022998782, + "learning_rate": 1.8368443224582115e-05, + "loss": 1.1039, "step": 1996 }, { - "epoch": 1.31, - "learning_rate": 5.554412368657572e-06, - "loss": 0.6989, + "epoch": 0.20900052328623758, + "grad_norm": 3.8082638015537533, + "learning_rate": 1.836658707225461e-05, + "loss": 1.1108, "step": 1997 }, { - "epoch": 1.32, - "learning_rate": 5.544862689647024e-06, - "loss": 0.7128, + "epoch": 0.20910518053375196, + "grad_norm": 2.252635657133077, + "learning_rate": 1.836472995858929e-05, + "loss": 1.0929, "step": 1998 }, { - "epoch": 1.32, - "learning_rate": 5.535318076987928e-06, - "loss": 0.5933, + "epoch": 0.20920983778126634, + "grad_norm": 2.159315517759604, + "learning_rate": 1.836287188379954e-05, + "loss": 1.0949, "step": 1999 }, { - "epoch": 1.32, - "learning_rate": 5.52577854153435e-06, - "loss": 0.7736, + "epoch": 0.20931449502878074, + "grad_norm": 2.3732269117605327, + "learning_rate": 1.836101284809886e-05, + "loss": 1.2197, "step": 2000 }, { - "epoch": 1.32, - "learning_rate": 5.516244094134578e-06, - "loss": 0.6531, + "epoch": 0.20941915227629512, + "grad_norm": 2.062458615626669, + "learning_rate": 1.835915285170085e-05, + "loss": 0.8862, "step": 2001 }, { - "epoch": 1.32, - "learning_rate": 5.506714745631124e-06, - "loss": 0.8514, + "epoch": 0.20952380952380953, + "grad_norm": 2.3583708531763636, + "learning_rate": 1.8357291894819236e-05, + "loss": 1.0029, "step": 2002 }, { - "epoch": 1.32, - "learning_rate": 5.497190506860695e-06, - "loss": 0.8415, + "epoch": 0.2096284667713239, + "grad_norm": 2.284742186222721, + "learning_rate": 1.8355429977667837e-05, + "loss": 0.9637, "step": 2003 }, { - "epoch": 1.32, - "learning_rate": 5.487671388654192e-06, - "loss": 0.7159, + "epoch": 0.2097331240188383, + "grad_norm": 2.2878915237206865, + "learning_rate": 1.83535671004606e-05, + "loss": 1.0314, "step": 2004 }, { - "epoch": 1.32, - "learning_rate": 5.478157401836694e-06, - "loss": 0.7302, + "epoch": 0.2098377812663527, + "grad_norm": 2.380158906701526, + "learning_rate": 1.8351703263411567e-05, + "loss": 1.0208, "step": 2005 }, { - "epoch": 1.32, - "learning_rate": 5.468648557227434e-06, - "loss": 0.6675, + "epoch": 0.2099424385138671, + "grad_norm": 2.3836744458989756, + "learning_rate": 1.83498384667349e-05, + "loss": 1.0668, "step": 2006 }, { - "epoch": 1.32, - "learning_rate": 5.459144865639806e-06, - "loss": 0.7683, + "epoch": 0.21004709576138147, + "grad_norm": 2.1286045732612147, + "learning_rate": 1.8347972710644862e-05, + "loss": 0.9946, "step": 2007 }, { - "epoch": 1.32, - "learning_rate": 5.449646337881345e-06, - "loss": 0.7269, + "epoch": 0.21015175300889585, + "grad_norm": 2.400087509008151, + "learning_rate": 1.8346105995355837e-05, + "loss": 0.9493, "step": 2008 }, { - "epoch": 1.32, - "learning_rate": 5.440152984753709e-06, - "loss": 0.7207, + "epoch": 0.21025641025641026, + "grad_norm": 2.128726329762142, + "learning_rate": 1.8344238321082316e-05, + "loss": 0.9336, "step": 2009 }, { - "epoch": 1.32, - "learning_rate": 5.430664817052678e-06, - "loss": 0.7395, + "epoch": 0.21036106750392464, + "grad_norm": 2.1830248786492956, + "learning_rate": 1.8342369688038894e-05, + "loss": 1.0607, "step": 2010 }, { - "epoch": 1.32, - "learning_rate": 5.421181845568125e-06, - "loss": 0.7995, + "epoch": 0.21046572475143904, + "grad_norm": 2.2689021885521576, + "learning_rate": 1.8340500096440286e-05, + "loss": 1.069, "step": 2011 }, { - "epoch": 1.32, - "learning_rate": 5.411704081084025e-06, - "loss": 0.666, + "epoch": 0.21057038199895342, + "grad_norm": 2.5029531826193057, + "learning_rate": 1.833862954650131e-05, + "loss": 1.1992, "step": 2012 }, { - "epoch": 1.33, - "learning_rate": 5.402231534378424e-06, - "loss": 0.6999, + "epoch": 0.21067503924646783, + "grad_norm": 2.69872427128542, + "learning_rate": 1.833675803843689e-05, + "loss": 1.048, "step": 2013 }, { - "epoch": 1.33, - "learning_rate": 5.392764216223441e-06, - "loss": 0.7382, + "epoch": 0.2107796964939822, + "grad_norm": 2.6551070968845094, + "learning_rate": 1.8334885572462076e-05, + "loss": 1.0161, "step": 2014 }, { - "epoch": 1.33, - "learning_rate": 5.383302137385238e-06, - "loss": 0.8523, + "epoch": 0.2108843537414966, + "grad_norm": 2.5294187266588293, + "learning_rate": 1.8333012148792008e-05, + "loss": 1.0735, "step": 2015 }, { - "epoch": 1.33, - "learning_rate": 5.373845308624028e-06, - "loss": 0.6393, + "epoch": 0.210989010989011, + "grad_norm": 2.1783913112001754, + "learning_rate": 1.8331137767641958e-05, + "loss": 1.0417, "step": 2016 }, { - "epoch": 1.33, - "learning_rate": 5.364393740694054e-06, - "loss": 0.6108, + "epoch": 0.21109366823652537, + "grad_norm": 2.369045148316055, + "learning_rate": 1.8329262429227285e-05, + "loss": 1.1235, "step": 2017 }, { - "epoch": 1.33, - "learning_rate": 5.3549474443435726e-06, - "loss": 0.6351, + "epoch": 0.21119832548403977, + "grad_norm": 2.117286187563206, + "learning_rate": 1.8327386133763475e-05, + "loss": 0.8765, "step": 2018 }, { - "epoch": 1.33, - "learning_rate": 5.345506430314845e-06, - "loss": 0.6475, + "epoch": 0.21130298273155415, + "grad_norm": 2.481083285401919, + "learning_rate": 1.8325508881466115e-05, + "loss": 1.0116, "step": 2019 }, { - "epoch": 1.33, - "learning_rate": 5.33607070934413e-06, - "loss": 0.6502, + "epoch": 0.21140763997906856, + "grad_norm": 2.2535978341456753, + "learning_rate": 1.832363067255091e-05, + "loss": 1.0478, "step": 2020 }, { - "epoch": 1.33, - "learning_rate": 5.326640292161665e-06, - "loss": 0.6687, + "epoch": 0.21151229722658293, + "grad_norm": 2.1891174065268824, + "learning_rate": 1.8321751507233663e-05, + "loss": 0.9707, "step": 2021 }, { - "epoch": 1.33, - "learning_rate": 5.317215189491651e-06, - "loss": 0.5928, + "epoch": 0.21161695447409734, + "grad_norm": 2.29735895919819, + "learning_rate": 1.8319871385730298e-05, + "loss": 1.0669, "step": 2022 }, { - "epoch": 1.33, - "learning_rate": 5.307795412052259e-06, - "loss": 0.6536, + "epoch": 0.21172161172161172, + "grad_norm": 2.147861663463576, + "learning_rate": 1.831799030825685e-05, + "loss": 1.0486, "step": 2023 }, { - "epoch": 1.33, - "learning_rate": 5.298380970555584e-06, - "loss": 0.6886, + "epoch": 0.21182626896912612, + "grad_norm": 2.0405385220270973, + "learning_rate": 1.831610827502945e-05, + "loss": 0.9038, "step": 2024 }, { - "epoch": 1.33, - "learning_rate": 5.288971875707669e-06, - "loss": 0.6748, + "epoch": 0.2119309262166405, + "grad_norm": 2.5478447806636657, + "learning_rate": 1.8314225286264353e-05, + "loss": 1.0453, "step": 2025 }, { - "epoch": 1.33, - "learning_rate": 5.279568138208473e-06, - "loss": 0.7961, + "epoch": 0.21203558346415488, + "grad_norm": 2.3690660543652475, + "learning_rate": 1.8312341342177912e-05, + "loss": 1.0493, "step": 2026 }, { - "epoch": 1.33, - "learning_rate": 5.270169768751858e-06, - "loss": 0.756, + "epoch": 0.2121402407116693, + "grad_norm": 2.2032721351975995, + "learning_rate": 1.83104564429866e-05, + "loss": 0.9498, "step": 2027 }, { - "epoch": 1.34, - "learning_rate": 5.260776778025591e-06, - "loss": 0.7973, + "epoch": 0.21224489795918366, + "grad_norm": 2.609109823726276, + "learning_rate": 1.8308570588906996e-05, + "loss": 1.1299, "step": 2028 }, { - "epoch": 1.34, - "learning_rate": 5.251389176711307e-06, - "loss": 0.7368, + "epoch": 0.21234955520669807, + "grad_norm": 1.8610292340166064, + "learning_rate": 1.8306683780155792e-05, + "loss": 0.9029, "step": 2029 }, { - "epoch": 1.34, - "learning_rate": 5.242006975484528e-06, - "loss": 0.706, + "epoch": 0.21245421245421245, + "grad_norm": 2.4304964979220025, + "learning_rate": 1.830479601694978e-05, + "loss": 1.087, "step": 2030 }, { - "epoch": 1.34, - "learning_rate": 5.2326301850146285e-06, - "loss": 0.7257, + "epoch": 0.21255886970172685, + "grad_norm": 2.381417687058148, + "learning_rate": 1.830290729950587e-05, + "loss": 0.9628, "step": 2031 }, { - "epoch": 1.34, - "learning_rate": 5.223258815964833e-06, - "loss": 0.7576, + "epoch": 0.21266352694924123, + "grad_norm": 2.017480163965436, + "learning_rate": 1.8301017628041076e-05, + "loss": 0.799, "step": 2032 }, { - "epoch": 1.34, - "learning_rate": 5.213892878992191e-06, - "loss": 0.7511, + "epoch": 0.21276818419675564, + "grad_norm": 2.935623238435619, + "learning_rate": 1.829912700277253e-05, + "loss": 0.9128, "step": 2033 }, { - "epoch": 1.34, - "learning_rate": 5.204532384747584e-06, - "loss": 0.7854, + "epoch": 0.21287284144427002, + "grad_norm": 2.2265140475206384, + "learning_rate": 1.8297235423917473e-05, + "loss": 0.9985, "step": 2034 }, { - "epoch": 1.34, - "learning_rate": 5.195177343875701e-06, - "loss": 0.6626, + "epoch": 0.2129774986917844, + "grad_norm": 2.188420784579257, + "learning_rate": 1.829534289169324e-05, + "loss": 1.0615, "step": 2035 }, { - "epoch": 1.34, - "learning_rate": 5.185827767015031e-06, - "loss": 0.6775, + "epoch": 0.2130821559392988, + "grad_norm": 1.9589933092767846, + "learning_rate": 1.8293449406317294e-05, + "loss": 0.9486, "step": 2036 }, { - "epoch": 1.34, - "learning_rate": 5.176483664797852e-06, - "loss": 0.721, + "epoch": 0.21318681318681318, + "grad_norm": 2.7569189446285796, + "learning_rate": 1.82915549680072e-05, + "loss": 1.0995, "step": 2037 }, { - "epoch": 1.34, - "learning_rate": 5.1671450478502064e-06, - "loss": 0.7386, + "epoch": 0.21329147043432758, + "grad_norm": 1.9836569561149964, + "learning_rate": 1.828965957698063e-05, + "loss": 1.0582, "step": 2038 }, { - "epoch": 1.34, - "learning_rate": 5.157811926791905e-06, - "loss": 0.738, + "epoch": 0.21339612768184196, + "grad_norm": 2.6141067722134266, + "learning_rate": 1.8287763233455365e-05, + "loss": 0.9942, "step": 2039 }, { - "epoch": 1.34, - "learning_rate": 5.1484843122365125e-06, - "loss": 0.8146, + "epoch": 0.21350078492935637, + "grad_norm": 2.6804666482513726, + "learning_rate": 1.8285865937649313e-05, + "loss": 1.0738, "step": 2040 }, { - "epoch": 1.34, - "learning_rate": 5.139162214791322e-06, - "loss": 0.5877, + "epoch": 0.21360544217687075, + "grad_norm": 2.129743160129942, + "learning_rate": 1.8283967689780464e-05, + "loss": 0.81, "step": 2041 }, { - "epoch": 1.34, - "learning_rate": 5.129845645057372e-06, - "loss": 0.7367, + "epoch": 0.21371009942438512, + "grad_norm": 2.363273343477441, + "learning_rate": 1.8282068490066932e-05, + "loss": 0.8765, "step": 2042 }, { - "epoch": 1.34, - "learning_rate": 5.120534613629389e-06, - "loss": 0.7743, + "epoch": 0.21381475667189953, + "grad_norm": 2.476072286035289, + "learning_rate": 1.828016833872694e-05, + "loss": 1.0747, "step": 2043 }, { - "epoch": 1.35, - "learning_rate": 5.1112291310958166e-06, - "loss": 0.8649, + "epoch": 0.2139194139194139, + "grad_norm": 2.2832920524718343, + "learning_rate": 1.8278267235978823e-05, + "loss": 1.0481, "step": 2044 }, { - "epoch": 1.35, - "learning_rate": 5.101929208038786e-06, - "loss": 0.7196, + "epoch": 0.21402407116692831, + "grad_norm": 2.0256536513257473, + "learning_rate": 1.8276365182041015e-05, + "loss": 0.9437, "step": 2045 }, { - "epoch": 1.35, - "learning_rate": 5.09263485503411e-06, - "loss": 0.5693, + "epoch": 0.2141287284144427, + "grad_norm": 2.0349450996429237, + "learning_rate": 1.8274462177132074e-05, + "loss": 0.9745, "step": 2046 }, { - "epoch": 1.35, - "learning_rate": 5.083346082651255e-06, - "loss": 0.7333, + "epoch": 0.2142333856619571, + "grad_norm": 2.3558277924424154, + "learning_rate": 1.8272558221470652e-05, + "loss": 0.9551, "step": 2047 }, { - "epoch": 1.35, - "learning_rate": 5.074062901453352e-06, - "loss": 0.7562, + "epoch": 0.21433804290947148, + "grad_norm": 2.2290513676067962, + "learning_rate": 1.8270653315275525e-05, + "loss": 1.0465, "step": 2048 }, { - "epoch": 1.35, - "learning_rate": 5.06478532199717e-06, - "loss": 0.7743, + "epoch": 0.21444270015698588, + "grad_norm": 2.258045963609993, + "learning_rate": 1.826874745876556e-05, + "loss": 1.0306, "step": 2049 }, { - "epoch": 1.35, - "learning_rate": 5.05551335483311e-06, - "loss": 0.7186, + "epoch": 0.21454735740450026, + "grad_norm": 3.527078541148376, + "learning_rate": 1.8266840652159752e-05, + "loss": 0.8868, "step": 2050 }, { - "epoch": 1.35, - "learning_rate": 5.04624701050519e-06, - "loss": 0.6196, + "epoch": 0.21465201465201464, + "grad_norm": 2.2609749036853226, + "learning_rate": 1.8264932895677195e-05, + "loss": 0.9106, "step": 2051 }, { - "epoch": 1.35, - "learning_rate": 5.03698629955103e-06, - "loss": 0.6444, + "epoch": 0.21475667189952904, + "grad_norm": 2.5062029257028215, + "learning_rate": 1.826302418953709e-05, + "loss": 1.1274, "step": 2052 }, { - "epoch": 1.35, - "learning_rate": 5.027731232501849e-06, - "loss": 0.8377, + "epoch": 0.21486132914704342, + "grad_norm": 2.353992741301644, + "learning_rate": 1.8261114533958757e-05, + "loss": 1.0853, "step": 2053 }, { - "epoch": 1.35, - "learning_rate": 5.018481819882446e-06, - "loss": 0.807, + "epoch": 0.21496598639455783, + "grad_norm": 2.2428620293641144, + "learning_rate": 1.825920392916162e-05, + "loss": 0.9662, "step": 2054 }, { - "epoch": 1.35, - "learning_rate": 5.009238072211187e-06, - "loss": 0.7461, + "epoch": 0.2150706436420722, + "grad_norm": 2.305344851187627, + "learning_rate": 1.8257292375365202e-05, + "loss": 1.1007, "step": 2055 }, { - "epoch": 1.35, - "learning_rate": 5.000000000000003e-06, - "loss": 0.6753, + "epoch": 0.2151753008895866, + "grad_norm": 2.2656413224426033, + "learning_rate": 1.825537987278916e-05, + "loss": 0.9697, "step": 2056 }, { - "epoch": 1.35, - "learning_rate": 4.990767613754358e-06, - "loss": 0.8889, + "epoch": 0.215279958137101, + "grad_norm": 2.148064446805488, + "learning_rate": 1.8253466421653226e-05, + "loss": 0.9534, "step": 2057 }, { - "epoch": 1.35, - "learning_rate": 4.981540923973263e-06, - "loss": 0.7549, + "epoch": 0.2153846153846154, + "grad_norm": 2.1646722395898714, + "learning_rate": 1.8251552022177273e-05, + "loss": 0.9849, "step": 2058 }, { - "epoch": 1.36, - "learning_rate": 4.972319941149245e-06, - "loss": 0.745, + "epoch": 0.21548927263212977, + "grad_norm": 2.2230725886492007, + "learning_rate": 1.8249636674581265e-05, + "loss": 1.0095, "step": 2059 }, { - "epoch": 1.36, - "learning_rate": 4.963104675768345e-06, - "loss": 0.6283, + "epoch": 0.21559392987964415, + "grad_norm": 2.284428221986189, + "learning_rate": 1.824772037908528e-05, + "loss": 0.987, "step": 2060 }, { - "epoch": 1.36, - "learning_rate": 4.953895138310091e-06, - "loss": 0.6988, + "epoch": 0.21569858712715856, + "grad_norm": 2.5819764848015256, + "learning_rate": 1.8245803135909498e-05, + "loss": 1.0183, "step": 2061 }, { - "epoch": 1.36, - "learning_rate": 4.944691339247515e-06, - "loss": 0.7217, + "epoch": 0.21580324437467294, + "grad_norm": 2.3210577007465423, + "learning_rate": 1.8243884945274227e-05, + "loss": 0.9843, "step": 2062 }, { - "epoch": 1.36, - "learning_rate": 4.9354932890471096e-06, - "loss": 0.654, + "epoch": 0.21590790162218734, + "grad_norm": 2.5362607817744625, + "learning_rate": 1.824196580739986e-05, + "loss": 1.1811, "step": 2063 }, { - "epoch": 1.36, - "learning_rate": 4.926300998168835e-06, - "loss": 0.7342, + "epoch": 0.21601255886970172, + "grad_norm": 2.3962630982553397, + "learning_rate": 1.8240045722506915e-05, + "loss": 1.0299, "step": 2064 }, { - "epoch": 1.36, - "learning_rate": 4.917114477066104e-06, - "loss": 0.6831, + "epoch": 0.21611721611721613, + "grad_norm": 1.988482806508578, + "learning_rate": 1.823812469081601e-05, + "loss": 1.0407, "step": 2065 }, { - "epoch": 1.36, - "learning_rate": 4.907933736185757e-06, - "loss": 0.5879, + "epoch": 0.2162218733647305, + "grad_norm": 2.727138846671659, + "learning_rate": 1.823620271254788e-05, + "loss": 0.8944, "step": 2066 }, { - "epoch": 1.36, - "learning_rate": 4.898758785968075e-06, - "loss": 0.8035, + "epoch": 0.2163265306122449, + "grad_norm": 2.2182850283211475, + "learning_rate": 1.8234279787923358e-05, + "loss": 1.0132, "step": 2067 }, { - "epoch": 1.36, - "learning_rate": 4.889589636846747e-06, - "loss": 0.7861, + "epoch": 0.2164311878597593, + "grad_norm": 2.4303463162856365, + "learning_rate": 1.82323559171634e-05, + "loss": 1.0625, "step": 2068 }, { - "epoch": 1.36, - "learning_rate": 4.880426299248867e-06, - "loss": 0.7847, + "epoch": 0.21653584510727367, + "grad_norm": 2.419684174616418, + "learning_rate": 1.8230431100489057e-05, + "loss": 0.89, "step": 2069 }, { - "epoch": 1.36, - "learning_rate": 4.871268783594922e-06, - "loss": 0.6659, + "epoch": 0.21664050235478807, + "grad_norm": 2.4048668926385885, + "learning_rate": 1.8228505338121496e-05, + "loss": 1.0583, "step": 2070 }, { - "epoch": 1.36, - "learning_rate": 4.86211710029877e-06, - "loss": 0.7981, + "epoch": 0.21674515960230245, + "grad_norm": 2.342058187371194, + "learning_rate": 1.822657863028199e-05, + "loss": 0.9905, "step": 2071 }, { - "epoch": 1.36, - "learning_rate": 4.852971259767642e-06, - "loss": 0.6935, + "epoch": 0.21684981684981686, + "grad_norm": 2.4230482675249716, + "learning_rate": 1.822465097719192e-05, + "loss": 1.0081, "step": 2072 }, { - "epoch": 1.36, - "learning_rate": 4.843831272402126e-06, - "loss": 0.7619, + "epoch": 0.21695447409733123, + "grad_norm": 2.72537241732261, + "learning_rate": 1.8222722379072785e-05, + "loss": 1.0245, "step": 2073 }, { - "epoch": 1.37, - "learning_rate": 4.83469714859616e-06, - "loss": 0.6715, + "epoch": 0.21705913134484564, + "grad_norm": 2.4296676623937556, + "learning_rate": 1.8220792836146177e-05, + "loss": 1.0386, "step": 2074 }, { - "epoch": 1.37, - "learning_rate": 4.825568898736999e-06, - "loss": 0.7134, + "epoch": 0.21716378859236002, + "grad_norm": 2.5375888498111343, + "learning_rate": 1.8218862348633806e-05, + "loss": 1.0651, "step": 2075 }, { - "epoch": 1.37, - "learning_rate": 4.816446533205225e-06, - "loss": 0.741, + "epoch": 0.21726844583987442, + "grad_norm": 2.3320537485576804, + "learning_rate": 1.821693091675749e-05, + "loss": 1.0419, "step": 2076 }, { - "epoch": 1.37, - "learning_rate": 4.807330062374733e-06, - "loss": 0.7401, + "epoch": 0.2173731030873888, + "grad_norm": 2.2920845320825793, + "learning_rate": 1.8214998540739154e-05, + "loss": 1.0797, "step": 2077 }, { - "epoch": 1.37, - "learning_rate": 4.79821949661271e-06, - "loss": 0.7026, + "epoch": 0.21747776033490318, + "grad_norm": 2.156245726685389, + "learning_rate": 1.8213065220800833e-05, + "loss": 1.0072, "step": 2078 }, { - "epoch": 1.37, - "learning_rate": 4.789114846279631e-06, - "loss": 0.6982, + "epoch": 0.2175824175824176, + "grad_norm": 2.4898649858067117, + "learning_rate": 1.821113095716467e-05, + "loss": 1.1149, "step": 2079 }, { - "epoch": 1.37, - "learning_rate": 4.780016121729236e-06, - "loss": 0.7182, + "epoch": 0.21768707482993196, + "grad_norm": 2.0771189974183586, + "learning_rate": 1.820919575005291e-05, + "loss": 1.0435, "step": 2080 }, { - "epoch": 1.37, - "learning_rate": 4.7709233333085345e-06, - "loss": 0.7178, + "epoch": 0.21779173207744637, + "grad_norm": 2.277727814797236, + "learning_rate": 1.820725959968792e-05, + "loss": 1.0899, "step": 2081 }, { - "epoch": 1.37, - "learning_rate": 4.761836491357782e-06, - "loss": 0.7042, + "epoch": 0.21789638932496075, + "grad_norm": 2.373876155395553, + "learning_rate": 1.8205322506292162e-05, + "loss": 1.0877, "step": 2082 }, { - "epoch": 1.37, - "learning_rate": 4.7527556062104756e-06, - "loss": 0.6837, + "epoch": 0.21800104657247515, + "grad_norm": 2.1588302308233525, + "learning_rate": 1.8203384470088217e-05, + "loss": 0.8801, "step": 2083 }, { - "epoch": 1.37, - "learning_rate": 4.7436806881933304e-06, - "loss": 0.7406, + "epoch": 0.21810570381998953, + "grad_norm": 2.28595739064826, + "learning_rate": 1.8201445491298765e-05, + "loss": 1.051, "step": 2084 }, { - "epoch": 1.37, - "learning_rate": 4.734611747626285e-06, - "loss": 0.7689, + "epoch": 0.21821036106750394, + "grad_norm": 2.1087266644570923, + "learning_rate": 1.81995055701466e-05, + "loss": 1.1129, "step": 2085 }, { - "epoch": 1.37, - "learning_rate": 4.7255487948224754e-06, - "loss": 0.7852, + "epoch": 0.21831501831501832, + "grad_norm": 2.425503184353989, + "learning_rate": 1.8197564706854624e-05, + "loss": 0.855, "step": 2086 }, { - "epoch": 1.37, - "learning_rate": 4.716491840088227e-06, - "loss": 0.7435, + "epoch": 0.2184196755625327, + "grad_norm": 2.2595365972197294, + "learning_rate": 1.8195622901645843e-05, + "loss": 0.8961, "step": 2087 }, { - "epoch": 1.37, - "learning_rate": 4.707440893723052e-06, - "loss": 0.6964, + "epoch": 0.2185243328100471, + "grad_norm": 2.514090138405137, + "learning_rate": 1.8193680154743375e-05, + "loss": 1.0657, "step": 2088 }, { - "epoch": 1.38, - "learning_rate": 4.698395966019615e-06, - "loss": 0.7188, + "epoch": 0.21862899005756148, + "grad_norm": 2.4118987132654266, + "learning_rate": 1.819173646637045e-05, + "loss": 1.0297, "step": 2089 }, { - "epoch": 1.38, - "learning_rate": 4.689357067263751e-06, - "loss": 0.7081, + "epoch": 0.21873364730507588, + "grad_norm": 2.1575019695204127, + "learning_rate": 1.8189791836750396e-05, + "loss": 0.9463, "step": 2090 }, { - "epoch": 1.38, - "learning_rate": 4.680324207734434e-06, - "loss": 0.813, + "epoch": 0.21883830455259026, + "grad_norm": 2.408277886570118, + "learning_rate": 1.818784626610666e-05, + "loss": 1.1036, "step": 2091 }, { - "epoch": 1.38, - "learning_rate": 4.671297397703766e-06, - "loss": 0.7185, + "epoch": 0.21894296180010467, + "grad_norm": 1.795388244350893, + "learning_rate": 1.8185899754662787e-05, + "loss": 0.8641, "step": 2092 }, { - "epoch": 1.38, - "learning_rate": 4.662276647436977e-06, - "loss": 0.5856, + "epoch": 0.21904761904761905, + "grad_norm": 2.581772835215814, + "learning_rate": 1.818395230264244e-05, + "loss": 1.061, "step": 2093 }, { - "epoch": 1.38, - "learning_rate": 4.653261967192398e-06, - "loss": 0.6918, + "epoch": 0.21915227629513342, + "grad_norm": 2.259983033421856, + "learning_rate": 1.8182003910269382e-05, + "loss": 1.0749, "step": 2094 }, { - "epoch": 1.38, - "learning_rate": 4.644253367221467e-06, - "loss": 0.7439, + "epoch": 0.21925693354264783, + "grad_norm": 2.3449988057297206, + "learning_rate": 1.818005457776749e-05, + "loss": 1.1159, "step": 2095 }, { - "epoch": 1.38, - "learning_rate": 4.635250857768696e-06, - "loss": 0.7696, + "epoch": 0.2193615907901622, + "grad_norm": 2.2960373512554746, + "learning_rate": 1.8178104305360743e-05, + "loss": 1.155, "step": 2096 }, { - "epoch": 1.38, - "learning_rate": 4.62625444907168e-06, - "loss": 0.7711, + "epoch": 0.21946624803767661, + "grad_norm": 2.2977900952956687, + "learning_rate": 1.817615309327323e-05, + "loss": 1.0825, "step": 2097 }, { - "epoch": 1.38, - "learning_rate": 4.6172641513610765e-06, - "loss": 0.771, + "epoch": 0.219570905285191, + "grad_norm": 1.9351630635888202, + "learning_rate": 1.8174200941729153e-05, + "loss": 0.818, "step": 2098 }, { - "epoch": 1.38, - "learning_rate": 4.608279974860581e-06, - "loss": 0.8323, + "epoch": 0.2196755625327054, + "grad_norm": 1.9567019706927198, + "learning_rate": 1.8172247850952816e-05, + "loss": 0.9481, "step": 2099 }, { - "epoch": 1.38, - "learning_rate": 4.599301929786943e-06, - "loss": 0.6977, + "epoch": 0.21978021978021978, + "grad_norm": 2.431540987817807, + "learning_rate": 1.817029382116864e-05, + "loss": 1.1475, "step": 2100 }, { - "epoch": 1.38, - "learning_rate": 4.5903300263499315e-06, - "loss": 0.7317, + "epoch": 0.21988487702773418, + "grad_norm": 2.6689599398800925, + "learning_rate": 1.8168338852601136e-05, + "loss": 0.9702, "step": 2101 }, { - "epoch": 1.38, - "learning_rate": 4.581364274752338e-06, - "loss": 0.7067, + "epoch": 0.21998953427524856, + "grad_norm": 2.1586431953787004, + "learning_rate": 1.8166382945474937e-05, + "loss": 0.9888, "step": 2102 }, { - "epoch": 1.38, - "learning_rate": 4.572404685189946e-06, - "loss": 0.6092, + "epoch": 0.22009419152276294, + "grad_norm": 2.0960254129246865, + "learning_rate": 1.8164426100014787e-05, + "loss": 0.8704, "step": 2103 }, { - "epoch": 1.39, - "learning_rate": 4.563451267851542e-06, - "loss": 0.6164, + "epoch": 0.22019884877027734, + "grad_norm": 2.717246868399569, + "learning_rate": 1.8162468316445526e-05, + "loss": 0.7871, "step": 2104 }, { - "epoch": 1.39, - "learning_rate": 4.554504032918888e-06, - "loss": 0.7474, + "epoch": 0.22030350601779172, + "grad_norm": 2.2870173127883255, + "learning_rate": 1.8160509594992105e-05, + "loss": 1.0826, "step": 2105 }, { - "epoch": 1.39, - "learning_rate": 4.545562990566725e-06, - "loss": 0.6677, + "epoch": 0.22040816326530613, + "grad_norm": 2.383105180002082, + "learning_rate": 1.815854993587959e-05, + "loss": 1.017, "step": 2106 }, { - "epoch": 1.39, - "learning_rate": 4.536628150962744e-06, - "loss": 0.6281, + "epoch": 0.2205128205128205, + "grad_norm": 2.1939394100255596, + "learning_rate": 1.8156589339333154e-05, + "loss": 0.992, "step": 2107 }, { - "epoch": 1.39, - "learning_rate": 4.527699524267576e-06, - "loss": 0.6745, + "epoch": 0.2206174777603349, + "grad_norm": 1.995289142653823, + "learning_rate": 1.815462780557806e-05, + "loss": 0.833, "step": 2108 }, { - "epoch": 1.39, - "learning_rate": 4.518777120634798e-06, - "loss": 0.7419, + "epoch": 0.2207221350078493, + "grad_norm": 2.528094256454689, + "learning_rate": 1.8152665334839704e-05, + "loss": 1.1359, "step": 2109 }, { - "epoch": 1.39, - "learning_rate": 4.509860950210905e-06, - "loss": 0.6872, + "epoch": 0.2208267922553637, + "grad_norm": 2.8889436520004237, + "learning_rate": 1.815070192734357e-05, + "loss": 0.9671, "step": 2110 }, { - "epoch": 1.39, - "learning_rate": 4.500951023135308e-06, - "loss": 0.6998, + "epoch": 0.22093144950287807, + "grad_norm": 2.566640937673796, + "learning_rate": 1.8148737583315266e-05, + "loss": 1.0503, "step": 2111 }, { - "epoch": 1.39, - "learning_rate": 4.492047349540308e-06, - "loss": 0.6863, + "epoch": 0.22103610675039245, + "grad_norm": 2.1263834504381656, + "learning_rate": 1.8146772302980494e-05, + "loss": 0.7854, "step": 2112 }, { - "epoch": 1.39, - "learning_rate": 4.483149939551105e-06, - "loss": 0.6969, + "epoch": 0.22114076399790686, + "grad_norm": 2.121646140657449, + "learning_rate": 1.814480608656507e-05, + "loss": 0.8763, "step": 2113 }, { - "epoch": 1.39, - "learning_rate": 4.474258803285774e-06, - "loss": 0.7998, + "epoch": 0.22124542124542124, + "grad_norm": 2.4127305165775956, + "learning_rate": 1.814283893429491e-05, + "loss": 0.906, "step": 2114 }, { - "epoch": 1.39, - "learning_rate": 4.465373950855249e-06, - "loss": 0.6863, + "epoch": 0.22135007849293564, + "grad_norm": 2.4552352187699493, + "learning_rate": 1.8140870846396055e-05, + "loss": 1.1067, "step": 2115 }, { - "epoch": 1.39, - "learning_rate": 4.4564953923633295e-06, - "loss": 0.6583, + "epoch": 0.22145473574045002, + "grad_norm": 2.116477172570462, + "learning_rate": 1.8138901823094634e-05, + "loss": 0.9615, "step": 2116 }, { - "epoch": 1.39, - "learning_rate": 4.4476231379066445e-06, - "loss": 0.6266, + "epoch": 0.22155939298796443, + "grad_norm": 2.1641002061702035, + "learning_rate": 1.8136931864616894e-05, + "loss": 1.0059, "step": 2117 }, { - "epoch": 1.39, - "learning_rate": 4.4387571975746655e-06, - "loss": 0.7109, + "epoch": 0.2216640502354788, + "grad_norm": 2.418244872883079, + "learning_rate": 1.8134960971189186e-05, + "loss": 1.1253, "step": 2118 }, { - "epoch": 1.39, - "learning_rate": 4.429897581449677e-06, - "loss": 0.7029, + "epoch": 0.2217687074829932, + "grad_norm": 2.550192195359462, + "learning_rate": 1.8132989143037973e-05, + "loss": 0.9129, "step": 2119 }, { - "epoch": 1.4, - "learning_rate": 4.4210442996067724e-06, - "loss": 0.7064, + "epoch": 0.2218733647305076, + "grad_norm": 2.1808748350442664, + "learning_rate": 1.8131016380389822e-05, + "loss": 0.9744, "step": 2120 }, { - "epoch": 1.4, - "learning_rate": 4.412197362113848e-06, - "loss": 0.7868, + "epoch": 0.22197802197802197, + "grad_norm": 1.8551489114253599, + "learning_rate": 1.8129042683471404e-05, + "loss": 0.8682, "step": 2121 }, { - "epoch": 1.4, - "learning_rate": 4.403356779031572e-06, - "loss": 0.7483, + "epoch": 0.22208267922553637, + "grad_norm": 2.08940663789057, + "learning_rate": 1.81270680525095e-05, + "loss": 0.9402, "step": 2122 }, { - "epoch": 1.4, - "learning_rate": 4.394522560413399e-06, - "loss": 0.7122, + "epoch": 0.22218733647305075, + "grad_norm": 2.2964529328748506, + "learning_rate": 1.8125092487731005e-05, + "loss": 0.9279, "step": 2123 }, { - "epoch": 1.4, - "learning_rate": 4.385694716305541e-06, - "loss": 0.6946, + "epoch": 0.22229199372056516, + "grad_norm": 2.215371699637496, + "learning_rate": 1.812311598936291e-05, + "loss": 1.0981, "step": 2124 }, { - "epoch": 1.4, - "learning_rate": 4.376873256746962e-06, - "loss": 0.7781, + "epoch": 0.22239665096807953, + "grad_norm": 2.652740527835587, + "learning_rate": 1.8121138557632322e-05, + "loss": 1.083, "step": 2125 }, { - "epoch": 1.4, - "learning_rate": 4.368058191769363e-06, - "loss": 0.7361, + "epoch": 0.22250130821559394, + "grad_norm": 2.5149433842670503, + "learning_rate": 1.811916019276645e-05, + "loss": 0.8659, "step": 2126 }, { - "epoch": 1.4, - "learning_rate": 4.359249531397177e-06, - "loss": 0.6632, + "epoch": 0.22260596546310832, + "grad_norm": 1.9949643722746215, + "learning_rate": 1.811718089499261e-05, + "loss": 1.0477, "step": 2127 }, { - "epoch": 1.4, - "learning_rate": 4.350447285647552e-06, - "loss": 0.6804, + "epoch": 0.22271062271062272, + "grad_norm": 2.6367101170753173, + "learning_rate": 1.8115200664538234e-05, + "loss": 0.9465, "step": 2128 }, { - "epoch": 1.4, - "learning_rate": 4.341651464530341e-06, - "loss": 0.7049, + "epoch": 0.2228152799581371, + "grad_norm": 2.1662145017836645, + "learning_rate": 1.8113219501630848e-05, + "loss": 0.9172, "step": 2129 }, { - "epoch": 1.4, - "learning_rate": 4.332862078048095e-06, - "loss": 0.7433, + "epoch": 0.22291993720565148, + "grad_norm": 2.139400049602698, + "learning_rate": 1.8111237406498096e-05, + "loss": 0.9777, "step": 2130 }, { - "epoch": 1.4, - "learning_rate": 4.324079136196034e-06, - "loss": 0.7752, + "epoch": 0.2230245944531659, + "grad_norm": 2.0637795213950327, + "learning_rate": 1.810925437936772e-05, + "loss": 0.8885, "step": 2131 }, { - "epoch": 1.4, - "learning_rate": 4.315302648962066e-06, - "loss": 0.7045, + "epoch": 0.22312925170068026, + "grad_norm": 2.4433179860559733, + "learning_rate": 1.810727042046758e-05, + "loss": 1.1147, "step": 2132 }, { - "epoch": 1.4, - "learning_rate": 4.30653262632675e-06, - "loss": 0.7195, + "epoch": 0.22323390894819467, + "grad_norm": 2.507296458731943, + "learning_rate": 1.810528553002563e-05, + "loss": 1.0424, "step": 2133 }, { - "epoch": 1.4, - "learning_rate": 4.297769078263296e-06, - "loss": 0.8205, + "epoch": 0.22333856619570905, + "grad_norm": 2.476296110697989, + "learning_rate": 1.810329970826994e-05, + "loss": 1.0969, "step": 2134 }, { - "epoch": 1.41, - "learning_rate": 4.2890120147375545e-06, - "loss": 0.7518, + "epoch": 0.22344322344322345, + "grad_norm": 2.2719919935385966, + "learning_rate": 1.810131295542869e-05, + "loss": 1.086, "step": 2135 }, { - "epoch": 1.41, - "learning_rate": 4.280261445707985e-06, - "loss": 0.7425, + "epoch": 0.22354788069073783, + "grad_norm": 2.1254287766207867, + "learning_rate": 1.8099325271730158e-05, + "loss": 1.0048, "step": 2136 }, { - "epoch": 1.41, - "learning_rate": 4.271517381125687e-06, - "loss": 0.73, + "epoch": 0.2236525379382522, + "grad_norm": 2.2899571405767913, + "learning_rate": 1.8097336657402733e-05, + "loss": 1.025, "step": 2137 }, { - "epoch": 1.41, - "learning_rate": 4.262779830934346e-06, - "loss": 0.7006, + "epoch": 0.22375719518576662, + "grad_norm": 2.493384145123412, + "learning_rate": 1.809534711267491e-05, + "loss": 0.9832, "step": 2138 }, { - "epoch": 1.41, - "learning_rate": 4.254048805070248e-06, - "loss": 0.6843, + "epoch": 0.223861852433281, + "grad_norm": 2.3792722351135867, + "learning_rate": 1.8093356637775296e-05, + "loss": 1.1609, "step": 2139 }, { - "epoch": 1.41, - "learning_rate": 4.2453243134622465e-06, - "loss": 0.6841, + "epoch": 0.2239665096807954, + "grad_norm": 2.007926261369221, + "learning_rate": 1.8091365232932595e-05, + "loss": 0.9712, "step": 2140 }, { - "epoch": 1.41, - "learning_rate": 4.2366063660317805e-06, - "loss": 0.6899, + "epoch": 0.22407116692830978, + "grad_norm": 2.483067842936785, + "learning_rate": 1.8089372898375622e-05, + "loss": 1.0444, "step": 2141 }, { - "epoch": 1.41, - "learning_rate": 4.2278949726928354e-06, - "loss": 0.7306, + "epoch": 0.22417582417582418, + "grad_norm": 2.3009244591558815, + "learning_rate": 1.808737963433331e-05, + "loss": 1.13, "step": 2142 }, { - "epoch": 1.41, - "learning_rate": 4.219190143351951e-06, - "loss": 0.6526, + "epoch": 0.22428048142333856, + "grad_norm": 2.3317608975873028, + "learning_rate": 1.808538544103468e-05, + "loss": 1.0947, "step": 2143 }, { - "epoch": 1.41, - "learning_rate": 4.210491887908201e-06, - "loss": 0.7064, + "epoch": 0.22438513867085297, + "grad_norm": 2.058037185074867, + "learning_rate": 1.8083390318708875e-05, + "loss": 1.0688, "step": 2144 }, { - "epoch": 1.41, - "learning_rate": 4.201800216253176e-06, - "loss": 0.7287, + "epoch": 0.22448979591836735, + "grad_norm": 2.1821994697350813, + "learning_rate": 1.8081394267585134e-05, + "loss": 1.0108, "step": 2145 }, { - "epoch": 1.41, - "learning_rate": 4.193115138270986e-06, - "loss": 0.7231, + "epoch": 0.22459445316588172, + "grad_norm": 2.093805533273431, + "learning_rate": 1.8079397287892808e-05, + "loss": 1.0334, "step": 2146 }, { - "epoch": 1.41, - "learning_rate": 4.184436663838245e-06, - "loss": 0.6422, + "epoch": 0.22469911041339613, + "grad_norm": 2.230362524509398, + "learning_rate": 1.807739937986136e-05, + "loss": 0.983, "step": 2147 }, { - "epoch": 1.41, - "learning_rate": 4.175764802824049e-06, - "loss": 0.7402, + "epoch": 0.2248037676609105, + "grad_norm": 2.2669738777674517, + "learning_rate": 1.8075400543720342e-05, + "loss": 1.0644, "step": 2148 }, { - "epoch": 1.41, - "learning_rate": 4.167099565089989e-06, - "loss": 0.7965, + "epoch": 0.22490842490842491, + "grad_norm": 2.4539503469583237, + "learning_rate": 1.8073400779699434e-05, + "loss": 1.0708, "step": 2149 }, { - "epoch": 1.42, - "learning_rate": 4.158440960490104e-06, - "loss": 0.6677, + "epoch": 0.2250130821559393, + "grad_norm": 2.366911153866586, + "learning_rate": 1.807140008802841e-05, + "loss": 1.0431, "step": 2150 }, { - "epoch": 1.42, - "learning_rate": 4.149788998870902e-06, - "loss": 0.7098, + "epoch": 0.2251177394034537, + "grad_norm": 2.144584808902021, + "learning_rate": 1.8069398468937153e-05, + "loss": 0.9618, "step": 2151 }, { - "epoch": 1.42, - "learning_rate": 4.141143690071335e-06, - "loss": 0.8215, + "epoch": 0.22522239665096808, + "grad_norm": 2.540674460417446, + "learning_rate": 1.806739592265565e-05, + "loss": 1.138, "step": 2152 }, { - "epoch": 1.42, - "learning_rate": 4.132505043922792e-06, - "loss": 0.7118, + "epoch": 0.22532705389848248, + "grad_norm": 2.206689167590175, + "learning_rate": 1.8065392449414004e-05, + "loss": 1.0123, "step": 2153 }, { - "epoch": 1.42, - "learning_rate": 4.123873070249074e-06, - "loss": 0.6459, + "epoch": 0.22543171114599686, + "grad_norm": 2.2813596687579474, + "learning_rate": 1.806338804944242e-05, + "loss": 1.07, "step": 2154 }, { - "epoch": 1.42, - "learning_rate": 4.115247778866407e-06, - "loss": 0.7414, + "epoch": 0.22553636839351124, + "grad_norm": 2.4718690247538873, + "learning_rate": 1.8061382722971196e-05, + "loss": 0.9068, "step": 2155 }, { - "epoch": 1.42, - "learning_rate": 4.106629179583412e-06, - "loss": 0.7304, + "epoch": 0.22564102564102564, + "grad_norm": 2.3583234330786507, + "learning_rate": 1.8059376470230757e-05, + "loss": 1.055, "step": 2156 }, { - "epoch": 1.42, - "learning_rate": 4.0980172822010985e-06, - "loss": 0.6996, + "epoch": 0.22574568288854002, + "grad_norm": 2.4215574962079205, + "learning_rate": 1.8057369291451626e-05, + "loss": 1.0428, "step": 2157 }, { - "epoch": 1.42, - "learning_rate": 4.0894120965128615e-06, - "loss": 0.7429, + "epoch": 0.22585034013605443, + "grad_norm": 1.9930502014891032, + "learning_rate": 1.805536118686443e-05, + "loss": 0.9008, "step": 2158 }, { - "epoch": 1.42, - "learning_rate": 4.080813632304455e-06, - "loss": 0.6918, + "epoch": 0.2259549973835688, + "grad_norm": 2.3283037616599027, + "learning_rate": 1.80533521566999e-05, + "loss": 1.0235, "step": 2159 }, { - "epoch": 1.42, - "learning_rate": 4.072221899353995e-06, - "loss": 0.7265, + "epoch": 0.2260596546310832, + "grad_norm": 2.3801695391927753, + "learning_rate": 1.805134220118888e-05, + "loss": 1.022, "step": 2160 }, { - "epoch": 1.42, - "learning_rate": 4.063636907431942e-06, - "loss": 0.6741, + "epoch": 0.2261643118785976, + "grad_norm": 2.397543403584425, + "learning_rate": 1.8049331320562324e-05, + "loss": 0.915, "step": 2161 }, { - "epoch": 1.42, - "learning_rate": 4.055058666301087e-06, - "loss": 0.7794, + "epoch": 0.226268969126112, + "grad_norm": 2.626509392011337, + "learning_rate": 1.804731951505128e-05, + "loss": 1.0158, "step": 2162 }, { - "epoch": 1.42, - "learning_rate": 4.046487185716552e-06, - "loss": 0.7572, + "epoch": 0.22637362637362637, + "grad_norm": 2.6074284746843395, + "learning_rate": 1.804530678488691e-05, + "loss": 0.9578, "step": 2163 }, { - "epoch": 1.42, - "learning_rate": 4.037922475425759e-06, - "loss": 0.6487, + "epoch": 0.22647828362114075, + "grad_norm": 2.2563607196090665, + "learning_rate": 1.8043293130300482e-05, + "loss": 0.9736, "step": 2164 }, { - "epoch": 1.43, - "learning_rate": 4.029364545168438e-06, - "loss": 0.6464, + "epoch": 0.22658294086865516, + "grad_norm": 2.5275115770936285, + "learning_rate": 1.8041278551523365e-05, + "loss": 1.1927, "step": 2165 }, { - "epoch": 1.43, - "learning_rate": 4.020813404676612e-06, - "loss": 0.7904, + "epoch": 0.22668759811616954, + "grad_norm": 3.2122832502725074, + "learning_rate": 1.8039263048787042e-05, + "loss": 1.0338, "step": 2166 }, { - "epoch": 1.43, - "learning_rate": 4.012269063674581e-06, - "loss": 0.663, + "epoch": 0.22679225536368394, + "grad_norm": 2.1422445536418078, + "learning_rate": 1.80372466223231e-05, + "loss": 0.8647, "step": 2167 }, { - "epoch": 1.43, - "learning_rate": 4.0037315318789e-06, - "loss": 0.7922, + "epoch": 0.22689691261119832, + "grad_norm": 2.30508825920427, + "learning_rate": 1.8035229272363226e-05, + "loss": 0.9192, "step": 2168 }, { - "epoch": 1.43, - "learning_rate": 3.9952008189984025e-06, - "loss": 0.6516, + "epoch": 0.22700156985871273, + "grad_norm": 2.3600467400757346, + "learning_rate": 1.803321099913922e-05, + "loss": 1.0098, "step": 2169 }, { - "epoch": 1.43, - "learning_rate": 3.986676934734151e-06, - "loss": 0.7492, + "epoch": 0.2271062271062271, + "grad_norm": 2.239202924612382, + "learning_rate": 1.8031191802882984e-05, + "loss": 1.0374, "step": 2170 }, { - "epoch": 1.43, - "learning_rate": 3.978159888779451e-06, - "loss": 0.6673, + "epoch": 0.2272108843537415, + "grad_norm": 2.007640041272985, + "learning_rate": 1.8029171683826528e-05, + "loss": 0.9717, "step": 2171 }, { - "epoch": 1.43, - "learning_rate": 3.96964969081983e-06, - "loss": 0.6747, + "epoch": 0.2273155416012559, + "grad_norm": 2.0849186079241453, + "learning_rate": 1.802715064220197e-05, + "loss": 0.8889, "step": 2172 }, { - "epoch": 1.43, - "learning_rate": 3.961146350533021e-06, - "loss": 0.5325, + "epoch": 0.22742019884877027, + "grad_norm": 2.441637760590174, + "learning_rate": 1.8025128678241532e-05, + "loss": 1.0504, "step": 2173 }, { - "epoch": 1.43, - "learning_rate": 3.9526498775889645e-06, - "loss": 0.7031, + "epoch": 0.22752485609628467, + "grad_norm": 1.9626865084302276, + "learning_rate": 1.802310579217754e-05, + "loss": 0.9714, "step": 2174 }, { - "epoch": 1.43, - "learning_rate": 3.944160281649793e-06, - "loss": 0.6815, + "epoch": 0.22762951334379905, + "grad_norm": 2.434914606469338, + "learning_rate": 1.8021081984242426e-05, + "loss": 1.0148, "step": 2175 }, { - "epoch": 1.43, - "learning_rate": 3.935677572369817e-06, - "loss": 0.6953, + "epoch": 0.22773417059131346, + "grad_norm": 2.464927468743602, + "learning_rate": 1.801905725466873e-05, + "loss": 0.9263, "step": 2176 }, { - "epoch": 1.43, - "learning_rate": 3.927201759395518e-06, - "loss": 0.7131, + "epoch": 0.22783882783882783, + "grad_norm": 2.1365896230444017, + "learning_rate": 1.8017031603689105e-05, + "loss": 1.0849, "step": 2177 }, { - "epoch": 1.43, - "learning_rate": 3.918732852365524e-06, - "loss": 0.6739, + "epoch": 0.22794348508634224, + "grad_norm": 2.0108040763164894, + "learning_rate": 1.801500503153629e-05, + "loss": 0.8429, "step": 2178 }, { - "epoch": 1.43, - "learning_rate": 3.910270860910623e-06, - "loss": 0.6494, + "epoch": 0.22804814233385662, + "grad_norm": 2.5168079546538507, + "learning_rate": 1.8012977538443156e-05, + "loss": 1.0145, "step": 2179 }, { - "epoch": 1.44, - "learning_rate": 3.901815794653729e-06, - "loss": 0.6589, + "epoch": 0.228152799581371, + "grad_norm": 2.2024177758411088, + "learning_rate": 1.801094912464265e-05, + "loss": 1.1185, "step": 2180 }, { - "epoch": 1.44, - "learning_rate": 3.893367663209896e-06, - "loss": 0.6446, + "epoch": 0.2282574568288854, + "grad_norm": 1.8831830056093484, + "learning_rate": 1.8008919790367854e-05, + "loss": 0.9436, "step": 2181 }, { - "epoch": 1.44, - "learning_rate": 3.884926476186269e-06, - "loss": 0.6824, + "epoch": 0.22836211407639978, + "grad_norm": 2.5755945919781063, + "learning_rate": 1.800688953585194e-05, + "loss": 1.0213, "step": 2182 }, { - "epoch": 1.44, - "learning_rate": 3.876492243182113e-06, - "loss": 0.7825, + "epoch": 0.2284667713239142, + "grad_norm": 2.3639675977701535, + "learning_rate": 1.8004858361328185e-05, + "loss": 1.009, "step": 2183 }, { - "epoch": 1.44, - "learning_rate": 3.8680649737887784e-06, - "loss": 0.6295, + "epoch": 0.22857142857142856, + "grad_norm": 2.4516774443504716, + "learning_rate": 1.8002826267029977e-05, + "loss": 1.0469, "step": 2184 }, { - "epoch": 1.44, - "learning_rate": 3.859644677589699e-06, - "loss": 0.5602, + "epoch": 0.22867608581894297, + "grad_norm": 2.2277531735938414, + "learning_rate": 1.800079325319081e-05, + "loss": 1.1314, "step": 2185 }, { - "epoch": 1.44, - "learning_rate": 3.851231364160379e-06, - "loss": 0.6399, + "epoch": 0.22878074306645735, + "grad_norm": 2.1946636768725116, + "learning_rate": 1.7998759320044276e-05, + "loss": 1.1728, "step": 2186 }, { - "epoch": 1.44, - "learning_rate": 3.842825043068377e-06, - "loss": 0.7782, + "epoch": 0.22888540031397175, + "grad_norm": 1.8628296808775904, + "learning_rate": 1.799672446782408e-05, + "loss": 0.7964, "step": 2187 }, { - "epoch": 1.44, - "learning_rate": 3.834425723873303e-06, - "loss": 0.6725, + "epoch": 0.22899005756148613, + "grad_norm": 2.2414870196187984, + "learning_rate": 1.7994688696764037e-05, + "loss": 1.0465, "step": 2188 }, { - "epoch": 1.44, - "learning_rate": 3.826033416126807e-06, - "loss": 0.6962, + "epoch": 0.2290947148090005, + "grad_norm": 2.1280884247041056, + "learning_rate": 1.799265200709805e-05, + "loss": 0.934, "step": 2189 }, { - "epoch": 1.44, - "learning_rate": 3.817648129372563e-06, - "loss": 0.7005, + "epoch": 0.22919937205651492, + "grad_norm": 2.4695395417950308, + "learning_rate": 1.7990614399060144e-05, + "loss": 1.078, "step": 2190 }, { - "epoch": 1.44, - "learning_rate": 3.809269873146263e-06, - "loss": 0.7011, + "epoch": 0.2293040293040293, + "grad_norm": 2.240850930676284, + "learning_rate": 1.798857587288445e-05, + "loss": 1.0257, "step": 2191 }, { - "epoch": 1.44, - "learning_rate": 3.800898656975599e-06, - "loss": 0.658, + "epoch": 0.2294086865515437, + "grad_norm": 2.4333038713844988, + "learning_rate": 1.798653642880519e-05, + "loss": 0.9771, "step": 2192 }, { - "epoch": 1.44, - "learning_rate": 3.792534490380263e-06, - "loss": 0.7342, + "epoch": 0.22951334379905808, + "grad_norm": 2.3919820342346827, + "learning_rate": 1.7984496067056704e-05, + "loss": 1.0281, "step": 2193 }, { - "epoch": 1.44, - "learning_rate": 3.7841773828719277e-06, - "loss": 0.7488, + "epoch": 0.22961800104657248, + "grad_norm": 2.9237043226574633, + "learning_rate": 1.7982454787873436e-05, + "loss": 1.0971, "step": 2194 }, { - "epoch": 1.45, - "learning_rate": 3.775827343954239e-06, - "loss": 0.6383, + "epoch": 0.22972265829408686, + "grad_norm": 2.499578413555501, + "learning_rate": 1.7980412591489932e-05, + "loss": 1.0503, "step": 2195 }, { - "epoch": 1.45, - "learning_rate": 3.767484383122799e-06, - "loss": 0.6609, + "epoch": 0.22982731554160127, + "grad_norm": 2.1481213076928793, + "learning_rate": 1.7978369478140838e-05, + "loss": 1.0634, "step": 2196 }, { - "epoch": 1.45, - "learning_rate": 3.759148509865168e-06, - "loss": 0.6105, + "epoch": 0.22993197278911565, + "grad_norm": 2.0306762217088568, + "learning_rate": 1.797632544806092e-05, + "loss": 0.9728, "step": 2197 }, { - "epoch": 1.45, - "learning_rate": 3.750819733660844e-06, - "loss": 0.7654, + "epoch": 0.23003663003663002, + "grad_norm": 2.483176372385859, + "learning_rate": 1.7974280501485037e-05, + "loss": 0.9275, "step": 2198 }, { - "epoch": 1.45, - "learning_rate": 3.7424980639812534e-06, - "loss": 0.756, + "epoch": 0.23014128728414443, + "grad_norm": 2.319447402532949, + "learning_rate": 1.797223463864816e-05, + "loss": 1.146, "step": 2199 }, { - "epoch": 1.45, - "learning_rate": 3.734183510289742e-06, - "loss": 0.5883, + "epoch": 0.2302459445316588, + "grad_norm": 2.277766050423453, + "learning_rate": 1.797018785978536e-05, + "loss": 0.8616, "step": 2200 }, { - "epoch": 1.45, - "learning_rate": 3.725876082041563e-06, - "loss": 0.8041, + "epoch": 0.23035060177917321, + "grad_norm": 2.3369318419186786, + "learning_rate": 1.796814016513182e-05, + "loss": 1.0268, "step": 2201 }, { - "epoch": 1.45, - "learning_rate": 3.717575788683866e-06, - "loss": 0.5943, + "epoch": 0.2304552590266876, + "grad_norm": 2.597419543719222, + "learning_rate": 1.7966091554922823e-05, + "loss": 0.9834, "step": 2202 }, { - "epoch": 1.45, - "learning_rate": 3.7092826396556882e-06, - "loss": 0.7253, + "epoch": 0.230559916274202, + "grad_norm": 2.011269693810685, + "learning_rate": 1.7964042029393755e-05, + "loss": 0.9039, "step": 2203 }, { - "epoch": 1.45, - "learning_rate": 3.700996644387944e-06, - "loss": 0.7387, + "epoch": 0.23066457352171638, + "grad_norm": 2.2717414735854837, + "learning_rate": 1.7961991588780114e-05, + "loss": 1.0691, "step": 2204 }, { - "epoch": 1.45, - "learning_rate": 3.6927178123034034e-06, - "loss": 0.7315, + "epoch": 0.23076923076923078, + "grad_norm": 2.2621982011763326, + "learning_rate": 1.79599402333175e-05, + "loss": 1.0414, "step": 2205 }, { - "epoch": 1.45, - "learning_rate": 3.684446152816702e-06, - "loss": 0.801, + "epoch": 0.23087388801674516, + "grad_norm": 2.1871162127683323, + "learning_rate": 1.7957887963241613e-05, + "loss": 1.0806, "step": 2206 }, { - "epoch": 1.45, - "learning_rate": 3.6761816753343114e-06, - "loss": 0.7474, + "epoch": 0.23097854526425954, + "grad_norm": 2.2467526764365604, + "learning_rate": 1.7955834778788266e-05, + "loss": 1.0368, "step": 2207 }, { - "epoch": 1.45, - "learning_rate": 3.6679243892545414e-06, - "loss": 0.6163, + "epoch": 0.23108320251177394, + "grad_norm": 2.49703081611621, + "learning_rate": 1.795378068019338e-05, + "loss": 1.0251, "step": 2208 }, { - "epoch": 1.45, - "learning_rate": 3.659674303967522e-06, - "loss": 0.6888, + "epoch": 0.23118785975928832, + "grad_norm": 2.2621831608178393, + "learning_rate": 1.7951725667692967e-05, + "loss": 0.9758, "step": 2209 }, { - "epoch": 1.45, - "learning_rate": 3.6514314288551877e-06, - "loss": 0.7455, + "epoch": 0.23129251700680273, + "grad_norm": 2.8459265279372192, + "learning_rate": 1.7949669741523156e-05, + "loss": 1.0373, "step": 2210 }, { - "epoch": 1.46, - "learning_rate": 3.6431957732912793e-06, - "loss": 0.7855, + "epoch": 0.2313971742543171, + "grad_norm": 2.6273563201754957, + "learning_rate": 1.7947612901920174e-05, + "loss": 0.8752, "step": 2211 }, { - "epoch": 1.46, - "learning_rate": 3.634967346641334e-06, - "loss": 0.7015, + "epoch": 0.2315018315018315, + "grad_norm": 2.5197273005659264, + "learning_rate": 1.794555514912036e-05, + "loss": 0.9882, "step": 2212 }, { - "epoch": 1.46, - "learning_rate": 3.6267461582626584e-06, - "loss": 0.7168, + "epoch": 0.2316064887493459, + "grad_norm": 2.274914592268484, + "learning_rate": 1.7943496483360152e-05, + "loss": 0.9677, "step": 2213 }, { - "epoch": 1.46, - "learning_rate": 3.6185322175043336e-06, - "loss": 0.7709, + "epoch": 0.2317111459968603, + "grad_norm": 2.428626518972456, + "learning_rate": 1.7941436904876095e-05, + "loss": 1.1441, "step": 2214 }, { - "epoch": 1.46, - "learning_rate": 3.6103255337071918e-06, - "loss": 0.7098, + "epoch": 0.23181580324437467, + "grad_norm": 2.394836969811865, + "learning_rate": 1.7939376413904836e-05, + "loss": 0.9864, "step": 2215 }, { - "epoch": 1.46, - "learning_rate": 3.602126116203819e-06, - "loss": 0.717, + "epoch": 0.23192046049188905, + "grad_norm": 2.6595728488415182, + "learning_rate": 1.7937315010683135e-05, + "loss": 1.0768, "step": 2216 }, { - "epoch": 1.46, - "learning_rate": 3.5939339743185376e-06, - "loss": 0.6801, + "epoch": 0.23202511773940346, + "grad_norm": 2.1906514810889846, + "learning_rate": 1.7935252695447846e-05, + "loss": 0.9923, "step": 2217 }, { - "epoch": 1.46, - "learning_rate": 3.5857491173673976e-06, - "loss": 0.6834, + "epoch": 0.23212977498691784, + "grad_norm": 2.3158045248201065, + "learning_rate": 1.793318946843594e-05, + "loss": 1.0749, "step": 2218 }, { - "epoch": 1.46, - "learning_rate": 3.5775715546581546e-06, - "loss": 0.7986, + "epoch": 0.23223443223443224, + "grad_norm": 2.2698926380173563, + "learning_rate": 1.793112532988448e-05, + "loss": 1.0514, "step": 2219 }, { - "epoch": 1.46, - "learning_rate": 3.5694012954902803e-06, - "loss": 0.8078, + "epoch": 0.23233908948194662, + "grad_norm": 2.3318940874120564, + "learning_rate": 1.7929060280030642e-05, + "loss": 1.0105, "step": 2220 }, { - "epoch": 1.46, - "learning_rate": 3.5612383491549384e-06, - "loss": 0.7479, + "epoch": 0.23244374672946103, + "grad_norm": 2.3711702488900857, + "learning_rate": 1.7926994319111704e-05, + "loss": 1.0331, "step": 2221 }, { - "epoch": 1.46, - "learning_rate": 3.553082724934973e-06, - "loss": 0.6753, + "epoch": 0.2325484039769754, + "grad_norm": 2.150975144233381, + "learning_rate": 1.7924927447365048e-05, + "loss": 1.0613, "step": 2222 }, { - "epoch": 1.46, - "learning_rate": 3.544934432104905e-06, - "loss": 0.7859, + "epoch": 0.23265306122448978, + "grad_norm": 2.536648515717417, + "learning_rate": 1.7922859665028164e-05, + "loss": 1.0065, "step": 2223 }, { - "epoch": 1.46, - "learning_rate": 3.5367934799309167e-06, - "loss": 0.7151, + "epoch": 0.2327577184720042, + "grad_norm": 1.9397689780705203, + "learning_rate": 1.792079097233864e-05, + "loss": 0.9451, "step": 2224 }, { - "epoch": 1.46, - "learning_rate": 3.5286598776708436e-06, - "loss": 0.7203, + "epoch": 0.23286237571951857, + "grad_norm": 2.2704190613976114, + "learning_rate": 1.791872136953418e-05, + "loss": 0.9858, "step": 2225 }, { - "epoch": 1.47, - "learning_rate": 3.5205336345741603e-06, - "loss": 0.7241, + "epoch": 0.23296703296703297, + "grad_norm": 2.8347645497669656, + "learning_rate": 1.7916650856852577e-05, + "loss": 1.1249, "step": 2226 }, { - "epoch": 1.47, - "learning_rate": 3.512414759881976e-06, - "loss": 0.719, + "epoch": 0.23307169021454735, + "grad_norm": 2.408645013203526, + "learning_rate": 1.7914579434531746e-05, + "loss": 0.9415, "step": 2227 }, { - "epoch": 1.47, - "learning_rate": 3.5043032628270225e-06, - "loss": 0.6583, + "epoch": 0.23317634746206176, + "grad_norm": 2.5151198920475557, + "learning_rate": 1.7912507102809692e-05, + "loss": 1.0078, "step": 2228 }, { - "epoch": 1.47, - "learning_rate": 3.49619915263363e-06, - "loss": 0.8169, + "epoch": 0.23328100470957613, + "grad_norm": 2.049430149571946, + "learning_rate": 1.7910433861924533e-05, + "loss": 0.9601, "step": 2229 }, { - "epoch": 1.47, - "learning_rate": 3.488102438517742e-06, - "loss": 0.6847, + "epoch": 0.23338566195709054, + "grad_norm": 2.406663254302629, + "learning_rate": 1.7908359712114484e-05, + "loss": 0.9539, "step": 2230 }, { - "epoch": 1.47, - "learning_rate": 3.4800131296868842e-06, - "loss": 0.5998, + "epoch": 0.23349031920460492, + "grad_norm": 2.32058310508671, + "learning_rate": 1.7906284653617874e-05, + "loss": 1.0463, "step": 2231 }, { - "epoch": 1.47, - "learning_rate": 3.4719312353401647e-06, - "loss": 0.7476, + "epoch": 0.2335949764521193, + "grad_norm": 2.1243497033328076, + "learning_rate": 1.790420868667313e-05, + "loss": 0.8934, "step": 2232 }, { - "epoch": 1.47, - "learning_rate": 3.4638567646682576e-06, - "loss": 0.753, + "epoch": 0.2336996336996337, + "grad_norm": 2.172322288650889, + "learning_rate": 1.7902131811518784e-05, + "loss": 0.8929, "step": 2233 }, { - "epoch": 1.47, - "learning_rate": 3.4557897268533935e-06, - "loss": 0.6882, + "epoch": 0.23380429094714808, + "grad_norm": 2.626463122324986, + "learning_rate": 1.7900054028393475e-05, + "loss": 0.9942, "step": 2234 }, { - "epoch": 1.47, - "learning_rate": 3.4477301310693544e-06, - "loss": 0.756, + "epoch": 0.23390894819466249, + "grad_norm": 2.277059459157924, + "learning_rate": 1.789797533753594e-05, + "loss": 1.0414, "step": 2235 }, { - "epoch": 1.47, - "learning_rate": 3.4396779864814568e-06, - "loss": 0.5769, + "epoch": 0.23401360544217686, + "grad_norm": 2.676163205276392, + "learning_rate": 1.789589573918503e-05, + "loss": 0.8041, "step": 2236 }, { - "epoch": 1.47, - "learning_rate": 3.4316333022465465e-06, - "loss": 0.7097, + "epoch": 0.23411826268969127, + "grad_norm": 2.1358940445833188, + "learning_rate": 1.7893815233579692e-05, + "loss": 1.0874, "step": 2237 }, { - "epoch": 1.47, - "learning_rate": 3.423596087512976e-06, - "loss": 0.5898, + "epoch": 0.23422291993720565, + "grad_norm": 2.499667365100757, + "learning_rate": 1.7891733820958983e-05, + "loss": 0.9692, "step": 2238 }, { - "epoch": 1.47, - "learning_rate": 3.415566351420615e-06, - "loss": 0.6184, + "epoch": 0.23432757718472005, + "grad_norm": 2.8983591306679104, + "learning_rate": 1.788965150156206e-05, + "loss": 0.8887, "step": 2239 }, { - "epoch": 1.47, - "learning_rate": 3.407544103100824e-06, - "loss": 0.6735, + "epoch": 0.23443223443223443, + "grad_norm": 2.4745131578348856, + "learning_rate": 1.788756827562818e-05, + "loss": 0.975, "step": 2240 }, { - "epoch": 1.48, - "learning_rate": 3.399529351676447e-06, - "loss": 0.7342, + "epoch": 0.2345368916797488, + "grad_norm": 2.3648247665584434, + "learning_rate": 1.788548414339672e-05, + "loss": 0.924, "step": 2241 }, { - "epoch": 1.48, - "learning_rate": 3.39152210626181e-06, - "loss": 0.5299, + "epoch": 0.23464154892726322, + "grad_norm": 2.3253167404589643, + "learning_rate": 1.7883399105107146e-05, + "loss": 1.0481, "step": 2242 }, { - "epoch": 1.48, - "learning_rate": 3.3835223759626845e-06, - "loss": 0.7169, + "epoch": 0.2347462061747776, + "grad_norm": 2.2078932669131306, + "learning_rate": 1.7881313160999035e-05, + "loss": 1.0526, "step": 2243 }, { - "epoch": 1.48, - "learning_rate": 3.3755301698763197e-06, - "loss": 0.7386, + "epoch": 0.234850863422292, + "grad_norm": 2.260715971363778, + "learning_rate": 1.7879226311312063e-05, + "loss": 0.9958, "step": 2244 }, { - "epoch": 1.48, - "learning_rate": 3.367545497091396e-06, - "loss": 0.6406, + "epoch": 0.23495552066980638, + "grad_norm": 2.103577147400585, + "learning_rate": 1.7877138556286014e-05, + "loss": 1.0592, "step": 2245 }, { - "epoch": 1.48, - "learning_rate": 3.3595683666880286e-06, - "loss": 0.7105, + "epoch": 0.23506017791732078, + "grad_norm": 2.7075505673940774, + "learning_rate": 1.787504989616078e-05, + "loss": 1.0385, "step": 2246 }, { - "epoch": 1.48, - "learning_rate": 3.3515987877377498e-06, - "loss": 0.7014, + "epoch": 0.23516483516483516, + "grad_norm": 2.6763027546096807, + "learning_rate": 1.7872960331176347e-05, + "loss": 0.972, "step": 2247 }, { - "epoch": 1.48, - "learning_rate": 3.343636769303512e-06, - "loss": 0.7209, + "epoch": 0.23526949241234957, + "grad_norm": 2.3328246332033227, + "learning_rate": 1.7870869861572807e-05, + "loss": 0.8997, "step": 2248 }, { - "epoch": 1.48, - "learning_rate": 3.3356823204396706e-06, - "loss": 0.6988, + "epoch": 0.23537414965986395, + "grad_norm": 2.4266627069351108, + "learning_rate": 1.786877848759037e-05, + "loss": 0.9698, "step": 2249 }, { - "epoch": 1.48, - "learning_rate": 3.3277354501919677e-06, - "loss": 0.7941, + "epoch": 0.23547880690737832, + "grad_norm": 2.093701504040857, + "learning_rate": 1.7866686209469328e-05, + "loss": 0.871, "step": 2250 }, { - "epoch": 1.48, - "learning_rate": 3.319796167597533e-06, - "loss": 0.7455, + "epoch": 0.23558346415489273, + "grad_norm": 2.4270676083889087, + "learning_rate": 1.7864593027450097e-05, + "loss": 1.156, "step": 2251 }, { - "epoch": 1.48, - "learning_rate": 3.3118644816848577e-06, - "loss": 0.7725, + "epoch": 0.2356881214024071, + "grad_norm": 2.6995988050304716, + "learning_rate": 1.7862498941773184e-05, + "loss": 1.0907, "step": 2252 }, { - "epoch": 1.48, - "learning_rate": 3.303940401473804e-06, - "loss": 0.6013, + "epoch": 0.23579277864992151, + "grad_norm": 2.0633497910278256, + "learning_rate": 1.78604039526792e-05, + "loss": 0.9509, "step": 2253 }, { - "epoch": 1.48, - "learning_rate": 3.2960239359755808e-06, - "loss": 0.759, + "epoch": 0.2358974358974359, + "grad_norm": 2.337404008068736, + "learning_rate": 1.785830806040887e-05, + "loss": 1.0717, "step": 2254 }, { - "epoch": 1.48, - "learning_rate": 3.288115094192734e-06, - "loss": 0.6683, + "epoch": 0.2360020931449503, + "grad_norm": 2.2055920312977295, + "learning_rate": 1.785621126520301e-05, + "loss": 0.8269, "step": 2255 }, { - "epoch": 1.49, - "learning_rate": 3.2802138851191558e-06, - "loss": 0.8061, + "epoch": 0.23610675039246468, + "grad_norm": 2.5981676845560764, + "learning_rate": 1.7854113567302557e-05, + "loss": 0.9251, "step": 2256 }, { - "epoch": 1.49, - "learning_rate": 3.2723203177400352e-06, - "loss": 0.7195, + "epoch": 0.23621140763997908, + "grad_norm": 2.183621120398911, + "learning_rate": 1.785201496694853e-05, + "loss": 1.0284, "step": 2257 }, { - "epoch": 1.49, - "learning_rate": 3.264434401031887e-06, - "loss": 0.7414, + "epoch": 0.23631606488749346, + "grad_norm": 2.2329023327526, + "learning_rate": 1.784991546438206e-05, + "loss": 1.0342, "step": 2258 }, { - "epoch": 1.49, - "learning_rate": 3.2565561439625205e-06, - "loss": 0.6787, + "epoch": 0.23642072213500784, + "grad_norm": 2.871738043676333, + "learning_rate": 1.7847815059844395e-05, + "loss": 1.1025, "step": 2259 }, { - "epoch": 1.49, - "learning_rate": 3.248685555491039e-06, - "loss": 0.8007, + "epoch": 0.23652537938252224, + "grad_norm": 2.4479413257311378, + "learning_rate": 1.784571375357687e-05, + "loss": 1.0315, "step": 2260 }, { - "epoch": 1.49, - "learning_rate": 3.240822644567815e-06, - "loss": 0.7207, + "epoch": 0.23663003663003662, + "grad_norm": 2.315149134540888, + "learning_rate": 1.7843611545820926e-05, + "loss": 1.1481, "step": 2261 }, { - "epoch": 1.49, - "learning_rate": 3.2329674201344997e-06, - "loss": 0.6228, + "epoch": 0.23673469387755103, + "grad_norm": 2.4131417962503177, + "learning_rate": 1.784150843681812e-05, + "loss": 0.9819, "step": 2262 }, { - "epoch": 1.49, - "learning_rate": 3.2251198911240002e-06, - "loss": 0.6543, + "epoch": 0.2368393511250654, + "grad_norm": 2.009874656026206, + "learning_rate": 1.7839404426810095e-05, + "loss": 0.8828, "step": 2263 }, { - "epoch": 1.49, - "learning_rate": 3.217280066460472e-06, - "loss": 0.8337, + "epoch": 0.2369440083725798, + "grad_norm": 2.30932078574492, + "learning_rate": 1.7837299516038608e-05, + "loss": 1.0608, "step": 2264 }, { - "epoch": 1.49, - "learning_rate": 3.2094479550593107e-06, - "loss": 0.6155, + "epoch": 0.2370486656200942, + "grad_norm": 2.056529909679889, + "learning_rate": 1.7835193704745523e-05, + "loss": 0.931, "step": 2265 }, { - "epoch": 1.49, - "learning_rate": 3.201623565827139e-06, - "loss": 0.7811, + "epoch": 0.23715332286760857, + "grad_norm": 1.9976085370847383, + "learning_rate": 1.7833086993172797e-05, + "loss": 0.8794, "step": 2266 }, { - "epoch": 1.49, - "learning_rate": 3.1938069076617983e-06, - "loss": 0.6261, + "epoch": 0.23725798011512297, + "grad_norm": 2.3451892003689854, + "learning_rate": 1.7830979381562493e-05, + "loss": 1.0935, "step": 2267 }, { - "epoch": 1.49, - "learning_rate": 3.1859979894523375e-06, - "loss": 0.8209, + "epoch": 0.23736263736263735, + "grad_norm": 2.3332512825866756, + "learning_rate": 1.7828870870156783e-05, + "loss": 1.0952, "step": 2268 }, { - "epoch": 1.49, - "learning_rate": 3.1781968200790094e-06, - "loss": 0.686, + "epoch": 0.23746729461015176, + "grad_norm": 2.4752809555669257, + "learning_rate": 1.7826761459197943e-05, + "loss": 0.9602, "step": 2269 }, { - "epoch": 1.49, - "learning_rate": 3.170403408413243e-06, - "loss": 0.788, + "epoch": 0.23757195185766614, + "grad_norm": 2.2358190924088017, + "learning_rate": 1.7824651148928343e-05, + "loss": 1.1246, "step": 2270 }, { - "epoch": 1.5, - "learning_rate": 3.1626177633176557e-06, - "loss": 0.7175, + "epoch": 0.23767660910518054, + "grad_norm": 2.6059633471148915, + "learning_rate": 1.782253993959046e-05, + "loss": 1.0941, "step": 2271 }, { - "epoch": 1.5, - "learning_rate": 3.154839893646028e-06, - "loss": 0.7192, + "epoch": 0.23778126635269492, + "grad_norm": 2.084559757590681, + "learning_rate": 1.7820427831426887e-05, + "loss": 0.9926, "step": 2272 }, { - "epoch": 1.5, - "learning_rate": 3.1470698082433014e-06, - "loss": 0.7721, + "epoch": 0.23788592360020933, + "grad_norm": 2.2523944777498968, + "learning_rate": 1.78183148246803e-05, + "loss": 0.9949, "step": 2273 }, { - "epoch": 1.5, - "learning_rate": 3.139307515945563e-06, - "loss": 0.7285, + "epoch": 0.2379905808477237, + "grad_norm": 2.8181662408595187, + "learning_rate": 1.781620091959349e-05, + "loss": 1.012, "step": 2274 }, { - "epoch": 1.5, - "learning_rate": 3.1315530255800384e-06, - "loss": 0.6929, + "epoch": 0.23809523809523808, + "grad_norm": 2.328558412920668, + "learning_rate": 1.781408611640935e-05, + "loss": 1.0071, "step": 2275 }, { - "epoch": 1.5, - "learning_rate": 3.1238063459650802e-06, - "loss": 0.6265, + "epoch": 0.2381998953427525, + "grad_norm": 2.183389318821081, + "learning_rate": 1.781197041537087e-05, + "loss": 1.0634, "step": 2276 }, { - "epoch": 1.5, - "learning_rate": 3.116067485910159e-06, - "loss": 0.6691, + "epoch": 0.23830455259026687, + "grad_norm": 2.2754131017205417, + "learning_rate": 1.7809853816721157e-05, + "loss": 0.9493, "step": 2277 }, { - "epoch": 1.5, - "learning_rate": 3.1083364542158535e-06, - "loss": 0.7067, + "epoch": 0.23840920983778127, + "grad_norm": 2.180076226224215, + "learning_rate": 1.7807736320703405e-05, + "loss": 0.98, "step": 2278 }, { - "epoch": 1.5, - "learning_rate": 3.1006132596738426e-06, - "loss": 0.7177, + "epoch": 0.23851386708529565, + "grad_norm": 2.1011853330474772, + "learning_rate": 1.7805617927560926e-05, + "loss": 1.0353, "step": 2279 }, { - "epoch": 1.5, - "learning_rate": 3.092897911066883e-06, - "loss": 0.661, + "epoch": 0.23861852433281006, + "grad_norm": 2.685310139913174, + "learning_rate": 1.780349863753712e-05, + "loss": 0.8339, "step": 2280 }, { - "epoch": 1.5, - "learning_rate": 3.0851904171688196e-06, - "loss": 0.659, + "epoch": 0.23872318158032443, + "grad_norm": 2.2643234179394183, + "learning_rate": 1.7801378450875504e-05, + "loss": 0.9221, "step": 2281 }, { - "epoch": 1.5, - "learning_rate": 3.0774907867445623e-06, - "loss": 0.6698, + "epoch": 0.23882783882783884, + "grad_norm": 2.0845511977759914, + "learning_rate": 1.7799257367819687e-05, + "loss": 1.0691, "step": 2282 }, { - "epoch": 1.5, - "learning_rate": 3.0697990285500802e-06, - "loss": 0.698, + "epoch": 0.23893249607535322, + "grad_norm": 2.1996457975610286, + "learning_rate": 1.779713538861339e-05, + "loss": 1.0552, "step": 2283 }, { - "epoch": 1.5, - "learning_rate": 3.06211515133238e-06, - "loss": 0.5568, + "epoch": 0.2390371533228676, + "grad_norm": 2.563382788930006, + "learning_rate": 1.7795012513500427e-05, + "loss": 1.0386, "step": 2284 }, { - "epoch": 1.5, - "learning_rate": 3.054439163829519e-06, - "loss": 0.6433, + "epoch": 0.239141810570382, + "grad_norm": 2.482883361316951, + "learning_rate": 1.7792888742724727e-05, + "loss": 1.116, "step": 2285 }, { - "epoch": 1.5, - "learning_rate": 3.0467710747705735e-06, - "loss": 0.678, + "epoch": 0.23924646781789638, + "grad_norm": 2.173594595079943, + "learning_rate": 1.7790764076530312e-05, + "loss": 1.0827, "step": 2286 }, { - "epoch": 1.51, - "learning_rate": 3.0391108928756486e-06, - "loss": 0.6623, + "epoch": 0.23935112506541079, + "grad_norm": 1.973340347588087, + "learning_rate": 1.778863851516131e-05, + "loss": 1.068, "step": 2287 }, { - "epoch": 1.51, - "learning_rate": 3.0314586268558486e-06, - "loss": 0.6125, + "epoch": 0.23945578231292516, + "grad_norm": 2.266375557620404, + "learning_rate": 1.7786512058861952e-05, + "loss": 0.8809, "step": 2288 }, { - "epoch": 1.51, - "learning_rate": 3.0238142854132722e-06, - "loss": 0.7927, + "epoch": 0.23956043956043957, + "grad_norm": 1.952846674363708, + "learning_rate": 1.7784384707876576e-05, + "loss": 1.0294, "step": 2289 }, { - "epoch": 1.51, - "learning_rate": 3.0161778772410153e-06, - "loss": 0.73, + "epoch": 0.23966509680795395, + "grad_norm": 2.2539440156526975, + "learning_rate": 1.7782256462449615e-05, + "loss": 1.0162, "step": 2290 }, { - "epoch": 1.51, - "learning_rate": 3.0085494110231493e-06, - "loss": 0.7054, + "epoch": 0.23976975405546835, + "grad_norm": 2.278316834840628, + "learning_rate": 1.7780127322825615e-05, + "loss": 1.0605, "step": 2291 }, { - "epoch": 1.51, - "learning_rate": 3.0009288954347115e-06, - "loss": 0.6187, + "epoch": 0.23987441130298273, + "grad_norm": 2.1166188092910674, + "learning_rate": 1.777799728924921e-05, + "loss": 0.9187, "step": 2292 }, { - "epoch": 1.51, - "learning_rate": 2.9933163391417042e-06, - "loss": 0.7471, + "epoch": 0.2399790685504971, + "grad_norm": 2.2188937843904473, + "learning_rate": 1.7775866361965145e-05, + "loss": 0.9865, "step": 2293 }, { - "epoch": 1.51, - "learning_rate": 2.985711750801068e-06, - "loss": 0.6812, + "epoch": 0.24008372579801152, + "grad_norm": 2.2834201572097936, + "learning_rate": 1.7773734541218277e-05, + "loss": 1.027, "step": 2294 }, { - "epoch": 1.51, - "learning_rate": 2.97811513906069e-06, - "loss": 0.81, + "epoch": 0.2401883830455259, + "grad_norm": 2.3639841943193223, + "learning_rate": 1.777160182725355e-05, + "loss": 0.998, "step": 2295 }, { - "epoch": 1.51, - "learning_rate": 2.9705265125593863e-06, - "loss": 0.7013, + "epoch": 0.2402930402930403, + "grad_norm": 2.6545219995276783, + "learning_rate": 1.7769468220316016e-05, + "loss": 1.0317, "step": 2296 }, { - "epoch": 1.51, - "learning_rate": 2.9629458799268897e-06, - "loss": 0.6364, + "epoch": 0.24039769754055468, + "grad_norm": 2.806578222987241, + "learning_rate": 1.7767333720650836e-05, + "loss": 1.0528, "step": 2297 }, { - "epoch": 1.51, - "learning_rate": 2.955373249783844e-06, - "loss": 0.7189, + "epoch": 0.24050235478806908, + "grad_norm": 2.450717164421872, + "learning_rate": 1.7765198328503262e-05, + "loss": 1.0429, "step": 2298 }, { - "epoch": 1.51, - "learning_rate": 2.947808630741791e-06, - "loss": 0.6564, + "epoch": 0.24060701203558346, + "grad_norm": 2.3826191374301326, + "learning_rate": 1.7763062044118662e-05, + "loss": 0.9769, "step": 2299 }, { - "epoch": 1.51, - "learning_rate": 2.9402520314031646e-06, - "loss": 0.6106, + "epoch": 0.24071166928309787, + "grad_norm": 2.5379042446021782, + "learning_rate": 1.7760924867742493e-05, + "loss": 1.0339, "step": 2300 }, { - "epoch": 1.51, - "learning_rate": 2.932703460361277e-06, - "loss": 0.7337, + "epoch": 0.24081632653061225, + "grad_norm": 2.3669619873635352, + "learning_rate": 1.7758786799620327e-05, + "loss": 1.0572, "step": 2301 }, { - "epoch": 1.52, - "learning_rate": 2.9251629262003135e-06, - "loss": 0.5722, + "epoch": 0.24092098377812662, + "grad_norm": 2.2289468079215142, + "learning_rate": 1.7756647839997824e-05, + "loss": 1.0289, "step": 2302 }, { - "epoch": 1.52, - "learning_rate": 2.9176304374953123e-06, - "loss": 0.6796, + "epoch": 0.24102564102564103, + "grad_norm": 2.2529359568142517, + "learning_rate": 1.7754507989120762e-05, + "loss": 0.9694, "step": 2303 }, { - "epoch": 1.52, - "learning_rate": 2.910106002812172e-06, - "loss": 0.7378, + "epoch": 0.2411302982731554, + "grad_norm": 2.222490094675877, + "learning_rate": 1.7752367247235015e-05, + "loss": 0.9996, "step": 2304 }, { - "epoch": 1.52, - "learning_rate": 2.902589630707624e-06, - "loss": 0.5941, + "epoch": 0.2412349555206698, + "grad_norm": 2.3032261423529254, + "learning_rate": 1.7750225614586557e-05, + "loss": 1.0049, "step": 2305 }, { - "epoch": 1.52, - "learning_rate": 2.895081329729239e-06, - "loss": 0.6849, + "epoch": 0.2413396127681842, + "grad_norm": 2.158558091358129, + "learning_rate": 1.774808309142146e-05, + "loss": 1.013, "step": 2306 }, { - "epoch": 1.52, - "learning_rate": 2.8875811084154025e-06, - "loss": 0.7581, + "epoch": 0.2414442700156986, + "grad_norm": 2.0156168291391134, + "learning_rate": 1.7745939677985912e-05, + "loss": 0.9208, "step": 2307 }, { - "epoch": 1.52, - "learning_rate": 2.8800889752953156e-06, - "loss": 0.6943, + "epoch": 0.24154892726321298, + "grad_norm": 2.2427438503410353, + "learning_rate": 1.7743795374526186e-05, + "loss": 1.1672, "step": 2308 }, { - "epoch": 1.52, - "learning_rate": 2.872604938888981e-06, - "loss": 0.655, + "epoch": 0.24165358451072738, + "grad_norm": 2.416172594533138, + "learning_rate": 1.774165018128868e-05, + "loss": 1.0452, "step": 2309 }, { - "epoch": 1.52, - "learning_rate": 2.8651290077071914e-06, - "loss": 0.6559, + "epoch": 0.24175824175824176, + "grad_norm": 2.5723958354089254, + "learning_rate": 1.7739504098519872e-05, + "loss": 0.973, "step": 2310 }, { - "epoch": 1.52, - "learning_rate": 2.8576611902515283e-06, - "loss": 0.6779, + "epoch": 0.24186289900575614, + "grad_norm": 2.9877649086017666, + "learning_rate": 1.773735712646635e-05, + "loss": 0.9828, "step": 2311 }, { - "epoch": 1.52, - "learning_rate": 2.8502014950143376e-06, - "loss": 0.6959, + "epoch": 0.24196755625327054, + "grad_norm": 2.3078412731916367, + "learning_rate": 1.7735209265374816e-05, + "loss": 1.0852, "step": 2312 }, { - "epoch": 1.52, - "learning_rate": 2.842749930478734e-06, - "loss": 0.6175, + "epoch": 0.24207221350078492, + "grad_norm": 2.1969741515349233, + "learning_rate": 1.773306051549205e-05, + "loss": 1.0456, "step": 2313 }, { - "epoch": 1.52, - "learning_rate": 2.8353065051185867e-06, - "loss": 0.7628, + "epoch": 0.24217687074829933, + "grad_norm": 2.5003537240671885, + "learning_rate": 1.7730910877064955e-05, + "loss": 0.9049, "step": 2314 }, { - "epoch": 1.52, - "learning_rate": 2.827871227398508e-06, - "loss": 0.7496, + "epoch": 0.2422815279958137, + "grad_norm": 2.519922133498177, + "learning_rate": 1.772876035034053e-05, + "loss": 1.0232, "step": 2315 }, { - "epoch": 1.52, - "learning_rate": 2.820444105773845e-06, - "loss": 0.6525, + "epoch": 0.2423861852433281, + "grad_norm": 2.477645819821275, + "learning_rate": 1.7726608935565874e-05, + "loss": 0.9229, "step": 2316 }, { - "epoch": 1.53, - "learning_rate": 2.8130251486906666e-06, - "loss": 0.7134, + "epoch": 0.2424908424908425, + "grad_norm": 2.6081751548110685, + "learning_rate": 1.7724456632988188e-05, + "loss": 1.0518, "step": 2317 }, { - "epoch": 1.53, - "learning_rate": 2.805614364585758e-06, - "loss": 0.7566, + "epoch": 0.24259549973835687, + "grad_norm": 2.5904924581138005, + "learning_rate": 1.7722303442854774e-05, + "loss": 0.8876, "step": 2318 }, { - "epoch": 1.53, - "learning_rate": 2.7982117618866177e-06, - "loss": 0.7241, + "epoch": 0.24270015698587127, + "grad_norm": 2.2369477600349845, + "learning_rate": 1.7720149365413036e-05, + "loss": 0.9971, "step": 2319 }, { - "epoch": 1.53, - "learning_rate": 2.790817349011432e-06, - "loss": 0.6949, + "epoch": 0.24280481423338565, + "grad_norm": 2.4995571839955786, + "learning_rate": 1.771799440091049e-05, + "loss": 0.8509, "step": 2320 }, { - "epoch": 1.53, - "learning_rate": 2.783431134369079e-06, - "loss": 0.6533, + "epoch": 0.24290947148090006, + "grad_norm": 2.2952550447378774, + "learning_rate": 1.7715838549594744e-05, + "loss": 1.0066, "step": 2321 }, { - "epoch": 1.53, - "learning_rate": 2.776053126359105e-06, - "loss": 0.6817, + "epoch": 0.24301412872841444, + "grad_norm": 2.190996661293464, + "learning_rate": 1.7713681811713504e-05, + "loss": 1.0152, "step": 2322 }, { - "epoch": 1.53, - "learning_rate": 2.7686833333717334e-06, - "loss": 0.6128, + "epoch": 0.24311878597592884, + "grad_norm": 2.259684687855643, + "learning_rate": 1.7711524187514586e-05, + "loss": 0.8633, "step": 2323 }, { - "epoch": 1.53, - "learning_rate": 2.761321763787841e-06, - "loss": 0.6683, + "epoch": 0.24322344322344322, + "grad_norm": 2.121917767970034, + "learning_rate": 1.7709365677245906e-05, + "loss": 0.9001, "step": 2324 }, { - "epoch": 1.53, - "learning_rate": 2.753968425978958e-06, - "loss": 0.6115, + "epoch": 0.24332810047095763, + "grad_norm": 2.2233654020848475, + "learning_rate": 1.7707206281155482e-05, + "loss": 1.1153, "step": 2325 }, { - "epoch": 1.53, - "learning_rate": 2.746623328307243e-06, - "loss": 0.6595, + "epoch": 0.243432757718472, + "grad_norm": 2.444459263315588, + "learning_rate": 1.770504599949143e-05, + "loss": 1.0226, "step": 2326 }, { - "epoch": 1.53, - "learning_rate": 2.739286479125494e-06, - "loss": 0.7621, + "epoch": 0.24353741496598638, + "grad_norm": 2.349907556465352, + "learning_rate": 1.7702884832501978e-05, + "loss": 1.0579, "step": 2327 }, { - "epoch": 1.53, - "learning_rate": 2.7319578867771237e-06, - "loss": 0.6654, + "epoch": 0.2436420722135008, + "grad_norm": 2.484238355922039, + "learning_rate": 1.7700722780435437e-05, + "loss": 0.9627, "step": 2328 }, { - "epoch": 1.53, - "learning_rate": 2.7246375595961584e-06, - "loss": 0.6504, + "epoch": 0.24374672946101517, + "grad_norm": 2.4085127927382732, + "learning_rate": 1.7698559843540242e-05, + "loss": 1.0758, "step": 2329 }, { - "epoch": 1.53, - "learning_rate": 2.7173255059072234e-06, - "loss": 0.6322, + "epoch": 0.24385138670852957, + "grad_norm": 2.4015208510522745, + "learning_rate": 1.769639602206491e-05, + "loss": 1.0412, "step": 2330 }, { - "epoch": 1.53, - "learning_rate": 2.7100217340255352e-06, - "loss": 0.6879, + "epoch": 0.24395604395604395, + "grad_norm": 2.2260639525963675, + "learning_rate": 1.769423131625808e-05, + "loss": 1.0326, "step": 2331 }, { - "epoch": 1.54, - "learning_rate": 2.7027262522568944e-06, - "loss": 0.736, + "epoch": 0.24406070120355836, + "grad_norm": 2.2914496037327288, + "learning_rate": 1.769206572636847e-05, + "loss": 1.1783, "step": 2332 }, { - "epoch": 1.54, - "learning_rate": 2.6954390688976706e-06, - "loss": 0.6613, + "epoch": 0.24416535845107273, + "grad_norm": 2.226935286535785, + "learning_rate": 1.7689899252644912e-05, + "loss": 0.9281, "step": 2333 }, { - "epoch": 1.54, - "learning_rate": 2.6881601922348e-06, - "loss": 0.729, + "epoch": 0.24427001569858714, + "grad_norm": 2.835666597627508, + "learning_rate": 1.768773189533634e-05, + "loss": 1.016, "step": 2334 }, { - "epoch": 1.54, - "learning_rate": 2.6808896305457733e-06, - "loss": 0.5926, + "epoch": 0.24437467294610152, + "grad_norm": 2.7851995018242417, + "learning_rate": 1.7685563654691797e-05, + "loss": 1.0492, "step": 2335 }, { - "epoch": 1.54, - "learning_rate": 2.6736273920986166e-06, - "loss": 0.6028, + "epoch": 0.2444793301936159, + "grad_norm": 2.1483770409252445, + "learning_rate": 1.7683394530960402e-05, + "loss": 1.0236, "step": 2336 }, { - "epoch": 1.54, - "learning_rate": 2.6663734851519017e-06, - "loss": 0.66, + "epoch": 0.2445839874411303, + "grad_norm": 2.170226481420682, + "learning_rate": 1.76812245243914e-05, + "loss": 1.0287, "step": 2337 }, { - "epoch": 1.54, - "learning_rate": 2.65912791795472e-06, - "loss": 0.6421, + "epoch": 0.24468864468864468, + "grad_norm": 2.5889214185058895, + "learning_rate": 1.7679053635234134e-05, + "loss": 0.9133, "step": 2338 }, { - "epoch": 1.54, - "learning_rate": 2.651890698746682e-06, - "loss": 0.7681, + "epoch": 0.24479330193615909, + "grad_norm": 2.526081416072903, + "learning_rate": 1.7676881863738034e-05, + "loss": 1.1081, "step": 2339 }, { - "epoch": 1.54, - "learning_rate": 2.6446618357579035e-06, - "loss": 0.5866, + "epoch": 0.24489795918367346, + "grad_norm": 1.928523666124805, + "learning_rate": 1.767470921015265e-05, + "loss": 0.7997, "step": 2340 }, { - "epoch": 1.54, - "learning_rate": 2.6374413372089968e-06, - "loss": 0.637, + "epoch": 0.24500261643118787, + "grad_norm": 2.20948369983928, + "learning_rate": 1.767253567472761e-05, + "loss": 0.9804, "step": 2341 }, { - "epoch": 1.54, - "learning_rate": 2.630229211311064e-06, - "loss": 0.6654, + "epoch": 0.24510727367870225, + "grad_norm": 2.0194710314361632, + "learning_rate": 1.7670361257712674e-05, + "loss": 0.987, "step": 2342 }, { - "epoch": 1.54, - "learning_rate": 2.623025466265684e-06, - "loss": 0.5687, + "epoch": 0.24521193092621665, + "grad_norm": 2.5584737884538935, + "learning_rate": 1.766818595935768e-05, + "loss": 1.0407, "step": 2343 }, { - "epoch": 1.54, - "learning_rate": 2.6158301102649085e-06, - "loss": 0.6714, + "epoch": 0.24531658817373103, + "grad_norm": 2.236875174210803, + "learning_rate": 1.7666009779912574e-05, + "loss": 1.0802, "step": 2344 }, { - "epoch": 1.54, - "learning_rate": 2.6086431514912438e-06, - "loss": 0.7063, + "epoch": 0.2454212454212454, + "grad_norm": 2.0282577109477686, + "learning_rate": 1.7663832719627404e-05, + "loss": 0.953, "step": 2345 }, { - "epoch": 1.54, - "learning_rate": 2.6014645981176488e-06, - "loss": 0.7165, + "epoch": 0.24552590266875982, + "grad_norm": 2.073084277889144, + "learning_rate": 1.7661654778752315e-05, + "loss": 1.0081, "step": 2346 }, { - "epoch": 1.55, - "learning_rate": 2.594294458307528e-06, - "loss": 0.6265, + "epoch": 0.2456305599162742, + "grad_norm": 2.149914350909731, + "learning_rate": 1.765947595753756e-05, + "loss": 1.0456, "step": 2347 }, { - "epoch": 1.55, - "learning_rate": 2.5871327402147173e-06, - "loss": 0.6936, + "epoch": 0.2457352171637886, + "grad_norm": 2.2340276298746535, + "learning_rate": 1.7657296256233494e-05, + "loss": 1.0553, "step": 2348 }, { - "epoch": 1.55, - "learning_rate": 2.5799794519834632e-06, - "loss": 0.5665, + "epoch": 0.24583987441130298, + "grad_norm": 2.0128834946003784, + "learning_rate": 1.7655115675090566e-05, + "loss": 1.0138, "step": 2349 }, { - "epoch": 1.55, - "learning_rate": 2.5728346017484453e-06, - "loss": 0.7499, + "epoch": 0.24594453165881738, + "grad_norm": 2.032422061077809, + "learning_rate": 1.7652934214359324e-05, + "loss": 1.005, "step": 2350 }, { - "epoch": 1.55, - "learning_rate": 2.565698197634735e-06, - "loss": 0.6924, + "epoch": 0.24604918890633176, + "grad_norm": 2.2528606162122515, + "learning_rate": 1.7650751874290427e-05, + "loss": 1.1266, "step": 2351 }, { - "epoch": 1.55, - "learning_rate": 2.5585702477578024e-06, - "loss": 0.7012, + "epoch": 0.24615384615384617, + "grad_norm": 2.1025231064248566, + "learning_rate": 1.7648568655134633e-05, + "loss": 1.1058, "step": 2352 }, { - "epoch": 1.55, - "learning_rate": 2.5514507602235063e-06, - "loss": 0.7002, + "epoch": 0.24625850340136055, + "grad_norm": 2.1702704933156896, + "learning_rate": 1.7646384557142796e-05, + "loss": 1.0684, "step": 2353 }, { - "epoch": 1.55, - "learning_rate": 2.5443397431280703e-06, - "loss": 0.6393, + "epoch": 0.24636316064887492, + "grad_norm": 2.343302825772316, + "learning_rate": 1.764419958056587e-05, + "loss": 1.0974, "step": 2354 }, { - "epoch": 1.55, - "learning_rate": 2.5372372045581006e-06, - "loss": 0.6277, + "epoch": 0.24646781789638933, + "grad_norm": 2.3639946811692045, + "learning_rate": 1.7642013725654915e-05, + "loss": 0.9979, "step": 2355 }, { - "epoch": 1.55, - "learning_rate": 2.530143152590553e-06, - "loss": 0.7114, + "epoch": 0.2465724751439037, + "grad_norm": 2.336282852242526, + "learning_rate": 1.763982699266109e-05, + "loss": 1.0527, "step": 2356 }, { - "epoch": 1.55, - "learning_rate": 2.523057595292734e-06, - "loss": 0.6846, + "epoch": 0.2466771323914181, + "grad_norm": 2.2842090040435368, + "learning_rate": 1.7637639381835658e-05, + "loss": 0.959, "step": 2357 }, { - "epoch": 1.55, - "learning_rate": 2.515980540722295e-06, - "loss": 0.6611, + "epoch": 0.2467817896389325, + "grad_norm": 1.9972596299075889, + "learning_rate": 1.7635450893429977e-05, + "loss": 0.9649, "step": 2358 }, { - "epoch": 1.55, - "learning_rate": 2.508911996927207e-06, - "loss": 0.5545, + "epoch": 0.2468864468864469, + "grad_norm": 1.9988043374236892, + "learning_rate": 1.763326152769551e-05, + "loss": 1.1658, "step": 2359 }, { - "epoch": 1.55, - "learning_rate": 2.5018519719457725e-06, - "loss": 0.6492, + "epoch": 0.24699110413396128, + "grad_norm": 2.0588021433267896, + "learning_rate": 1.763107128488382e-05, + "loss": 1.0258, "step": 2360 }, { - "epoch": 1.55, - "learning_rate": 2.4948004738066013e-06, - "loss": 0.582, + "epoch": 0.24709576138147565, + "grad_norm": 1.8656858774757512, + "learning_rate": 1.7628880165246572e-05, + "loss": 0.9793, "step": 2361 }, { - "epoch": 1.55, - "learning_rate": 2.4877575105286146e-06, - "loss": 0.6536, + "epoch": 0.24720041862899006, + "grad_norm": 2.218035457125112, + "learning_rate": 1.7626688169035524e-05, + "loss": 0.9543, "step": 2362 }, { - "epoch": 1.56, - "learning_rate": 2.4807230901210155e-06, - "loss": 0.5477, + "epoch": 0.24730507587650444, + "grad_norm": 1.9653508672118056, + "learning_rate": 1.7624495296502545e-05, + "loss": 0.8786, "step": 2363 }, { - "epoch": 1.56, - "learning_rate": 2.4736972205832997e-06, - "loss": 0.7073, + "epoch": 0.24740973312401884, + "grad_norm": 1.9480601105860273, + "learning_rate": 1.7622301547899605e-05, + "loss": 0.8262, "step": 2364 }, { - "epoch": 1.56, - "learning_rate": 2.4666799099052384e-06, - "loss": 0.6569, + "epoch": 0.24751439037153322, + "grad_norm": 2.333099449535231, + "learning_rate": 1.7620106923478756e-05, + "loss": 1.0377, "step": 2365 }, { - "epoch": 1.56, - "learning_rate": 2.459671166066869e-06, - "loss": 0.6961, + "epoch": 0.24761904761904763, + "grad_norm": 2.0818968205046455, + "learning_rate": 1.761791142349218e-05, + "loss": 0.8524, "step": 2366 }, { - "epoch": 1.56, - "learning_rate": 2.45267099703849e-06, - "loss": 0.7883, + "epoch": 0.247723704866562, + "grad_norm": 2.3374344327141303, + "learning_rate": 1.761571504819214e-05, + "loss": 1.1216, "step": 2367 }, { - "epoch": 1.56, - "learning_rate": 2.445679410780638e-06, - "loss": 0.7122, + "epoch": 0.2478283621140764, + "grad_norm": 2.4268357445524424, + "learning_rate": 1.7613517797830995e-05, + "loss": 1.0762, "step": 2368 }, { - "epoch": 1.56, - "learning_rate": 2.4386964152441016e-06, - "loss": 0.8225, + "epoch": 0.2479330193615908, + "grad_norm": 2.590696844481067, + "learning_rate": 1.7611319672661227e-05, + "loss": 0.9002, "step": 2369 }, { - "epoch": 1.56, - "learning_rate": 2.431722018369894e-06, - "loss": 0.6341, + "epoch": 0.24803767660910517, + "grad_norm": 2.909652369590741, + "learning_rate": 1.7609120672935396e-05, + "loss": 1.0978, "step": 2370 }, { - "epoch": 1.56, - "learning_rate": 2.424756228089251e-06, - "loss": 0.5397, + "epoch": 0.24814233385661957, + "grad_norm": 2.576806230412862, + "learning_rate": 1.7606920798906176e-05, + "loss": 1.0588, "step": 2371 }, { - "epoch": 1.56, - "learning_rate": 2.4177990523236216e-06, - "loss": 0.6188, + "epoch": 0.24824699110413395, + "grad_norm": 2.0096476869887, + "learning_rate": 1.7604720050826333e-05, + "loss": 0.9611, "step": 2372 }, { - "epoch": 1.56, - "learning_rate": 2.410850498984659e-06, - "loss": 0.7613, + "epoch": 0.24835164835164836, + "grad_norm": 2.2977012476580208, + "learning_rate": 1.7602518428948742e-05, + "loss": 1.0389, "step": 2373 }, { - "epoch": 1.56, - "learning_rate": 2.4039105759742098e-06, - "loss": 0.6933, + "epoch": 0.24845630559916274, + "grad_norm": 2.945269543421569, + "learning_rate": 1.760031593352637e-05, + "loss": 1.0266, "step": 2374 }, { - "epoch": 1.56, - "learning_rate": 2.396979291184306e-06, - "loss": 0.6117, + "epoch": 0.24856096284667714, + "grad_norm": 2.1640744849684155, + "learning_rate": 1.7598112564812292e-05, + "loss": 1.1166, "step": 2375 }, { - "epoch": 1.56, - "learning_rate": 2.3900566524971593e-06, - "loss": 0.6098, + "epoch": 0.24866562009419152, + "grad_norm": 2.1568155494244285, + "learning_rate": 1.7595908323059675e-05, + "loss": 1.1188, "step": 2376 }, { - "epoch": 1.56, - "learning_rate": 2.3831426677851423e-06, - "loss": 0.7346, + "epoch": 0.24877027734170593, + "grad_norm": 2.584518671074471, + "learning_rate": 1.7593703208521794e-05, + "loss": 1.0749, "step": 2377 }, { - "epoch": 1.57, - "learning_rate": 2.3762373449107935e-06, - "loss": 0.6459, + "epoch": 0.2488749345892203, + "grad_norm": 2.6817350218871767, + "learning_rate": 1.7591497221452022e-05, + "loss": 1.0488, "step": 2378 }, { - "epoch": 1.57, - "learning_rate": 2.369340691726798e-06, - "loss": 0.7263, + "epoch": 0.24897959183673468, + "grad_norm": 2.0586150260625797, + "learning_rate": 1.7589290362103827e-05, + "loss": 1.0097, "step": 2379 }, { - "epoch": 1.57, - "learning_rate": 2.3624527160759812e-06, - "loss": 0.6671, + "epoch": 0.2490842490842491, + "grad_norm": 2.459396989353071, + "learning_rate": 1.7587082630730786e-05, + "loss": 1.0839, "step": 2380 }, { - "epoch": 1.57, - "learning_rate": 2.3555734257913045e-06, - "loss": 0.6921, + "epoch": 0.24918890633176347, + "grad_norm": 2.4145810020805314, + "learning_rate": 1.758487402758657e-05, + "loss": 1.0401, "step": 2381 }, { - "epoch": 1.57, - "learning_rate": 2.3487028286958458e-06, - "loss": 0.603, + "epoch": 0.24929356357927787, + "grad_norm": 2.3872298010666952, + "learning_rate": 1.7582664552924953e-05, + "loss": 1.1245, "step": 2382 }, { - "epoch": 1.57, - "learning_rate": 2.3418409326028037e-06, - "loss": 0.6765, + "epoch": 0.24939822082679225, + "grad_norm": 2.466040223475429, + "learning_rate": 1.7580454206999805e-05, + "loss": 0.8704, "step": 2383 }, { - "epoch": 1.57, - "learning_rate": 2.334987745315478e-06, - "loss": 0.7422, + "epoch": 0.24950287807430666, + "grad_norm": 2.327011051347902, + "learning_rate": 1.75782429900651e-05, + "loss": 1.0102, "step": 2384 }, { - "epoch": 1.57, - "learning_rate": 2.328143274627266e-06, - "loss": 0.825, + "epoch": 0.24960753532182103, + "grad_norm": 2.222950496140978, + "learning_rate": 1.7576030902374917e-05, + "loss": 0.8952, "step": 2385 }, { - "epoch": 1.57, - "learning_rate": 2.3213075283216547e-06, - "loss": 0.646, + "epoch": 0.24971219256933544, + "grad_norm": 2.0002826498689266, + "learning_rate": 1.7573817944183426e-05, + "loss": 0.977, "step": 2386 }, { - "epoch": 1.57, - "learning_rate": 2.3144805141722038e-06, - "loss": 0.7573, + "epoch": 0.24981684981684982, + "grad_norm": 2.261276443674775, + "learning_rate": 1.7571604115744895e-05, + "loss": 1.0078, "step": 2387 }, { - "epoch": 1.57, - "learning_rate": 2.3076622399425485e-06, - "loss": 0.7257, + "epoch": 0.2499215070643642, + "grad_norm": 2.31943025867348, + "learning_rate": 1.7569389417313704e-05, + "loss": 1.0752, "step": 2388 }, { - "epoch": 1.57, - "learning_rate": 2.3008527133863836e-06, - "loss": 0.7305, + "epoch": 0.2500261643118786, + "grad_norm": 2.2233824615996007, + "learning_rate": 1.7567173849144322e-05, + "loss": 1.0591, "step": 2389 }, { - "epoch": 1.57, - "learning_rate": 2.294051942247457e-06, - "loss": 0.7919, + "epoch": 0.250130821559393, + "grad_norm": 2.335773242353471, + "learning_rate": 1.7564957411491324e-05, + "loss": 1.0174, "step": 2390 }, { - "epoch": 1.57, - "learning_rate": 2.2872599342595557e-06, - "loss": 0.6848, + "epoch": 0.2502354788069074, + "grad_norm": 2.274951953356186, + "learning_rate": 1.756274010460939e-05, + "loss": 1.088, "step": 2391 }, { - "epoch": 1.57, - "learning_rate": 2.2804766971465054e-06, - "loss": 0.571, + "epoch": 0.2503401360544218, + "grad_norm": 1.928296571150385, + "learning_rate": 1.756052192875328e-05, + "loss": 0.8277, "step": 2392 }, { - "epoch": 1.58, - "learning_rate": 2.2737022386221552e-06, - "loss": 0.6643, + "epoch": 0.25044479330193614, + "grad_norm": 2.2156454730583555, + "learning_rate": 1.7558302884177876e-05, + "loss": 1.0403, "step": 2393 }, { - "epoch": 1.58, - "learning_rate": 2.2669365663903764e-06, - "loss": 0.6848, + "epoch": 0.25054945054945055, + "grad_norm": 2.1520512914884153, + "learning_rate": 1.755608297113815e-05, + "loss": 1.0643, "step": 2394 }, { - "epoch": 1.58, - "learning_rate": 2.2601796881450468e-06, - "loss": 0.6377, + "epoch": 0.25065410779696495, + "grad_norm": 2.2740218868907927, + "learning_rate": 1.7553862189889172e-05, + "loss": 1.0674, "step": 2395 }, { - "epoch": 1.58, - "learning_rate": 2.253431611570035e-06, - "loss": 0.6544, + "epoch": 0.2507587650444793, + "grad_norm": 2.1433388658904318, + "learning_rate": 1.7551640540686114e-05, + "loss": 0.9212, "step": 2396 }, { - "epoch": 1.58, - "learning_rate": 2.24669234433921e-06, - "loss": 0.8275, + "epoch": 0.2508634222919937, + "grad_norm": 2.5749819609252764, + "learning_rate": 1.754941802378425e-05, + "loss": 0.9503, "step": 2397 }, { - "epoch": 1.58, - "learning_rate": 2.2399618941164202e-06, - "loss": 0.7129, + "epoch": 0.2509680795395081, + "grad_norm": 2.4391582025577843, + "learning_rate": 1.754719463943895e-05, + "loss": 1.0504, "step": 2398 }, { - "epoch": 1.58, - "learning_rate": 2.2332402685554876e-06, - "loss": 0.753, + "epoch": 0.2510727367870225, + "grad_norm": 2.517643489899688, + "learning_rate": 1.7544970387905687e-05, + "loss": 0.97, "step": 2399 }, { - "epoch": 1.58, - "learning_rate": 2.2265274753002008e-06, - "loss": 0.5883, + "epoch": 0.25117739403453687, + "grad_norm": 2.202807983897776, + "learning_rate": 1.754274526944003e-05, + "loss": 0.8349, "step": 2400 }, { - "epoch": 1.58, - "learning_rate": 2.2198235219842967e-06, - "loss": 0.6828, + "epoch": 0.2512820512820513, + "grad_norm": 2.9251756781669127, + "learning_rate": 1.7540519284297652e-05, + "loss": 1.1699, "step": 2401 }, { - "epoch": 1.58, - "learning_rate": 2.213128416231468e-06, - "loss": 0.6688, + "epoch": 0.2513867085295657, + "grad_norm": 2.356785224256036, + "learning_rate": 1.753829243273432e-05, + "loss": 0.9605, "step": 2402 }, { - "epoch": 1.58, - "learning_rate": 2.2064421656553415e-06, - "loss": 0.6618, + "epoch": 0.2514913657770801, + "grad_norm": 2.423973980202741, + "learning_rate": 1.7536064715005906e-05, + "loss": 0.8849, "step": 2403 }, { - "epoch": 1.58, - "learning_rate": 2.199764777859477e-06, - "loss": 0.7577, + "epoch": 0.25159602302459444, + "grad_norm": 2.446059031088621, + "learning_rate": 1.7533836131368383e-05, + "loss": 1.093, "step": 2404 }, { - "epoch": 1.58, - "learning_rate": 2.1930962604373517e-06, - "loss": 0.8434, + "epoch": 0.25170068027210885, + "grad_norm": 2.399076855736286, + "learning_rate": 1.7531606682077814e-05, + "loss": 1.0542, "step": 2405 }, { - "epoch": 1.58, - "learning_rate": 2.1864366209723597e-06, - "loss": 0.6372, + "epoch": 0.25180533751962325, + "grad_norm": 2.2136011597833267, + "learning_rate": 1.7529376367390366e-05, + "loss": 0.915, "step": 2406 }, { - "epoch": 1.58, - "learning_rate": 2.179785867037797e-06, - "loss": 0.7029, + "epoch": 0.2519099947671376, + "grad_norm": 2.226461148880702, + "learning_rate": 1.7527145187562314e-05, + "loss": 0.9114, "step": 2407 }, { - "epoch": 1.59, - "learning_rate": 2.1731440061968536e-06, - "loss": 0.6978, + "epoch": 0.252014652014652, + "grad_norm": 2.41593091900151, + "learning_rate": 1.7524913142850017e-05, + "loss": 1.0315, "step": 2408 }, { - "epoch": 1.59, - "learning_rate": 2.166511046002612e-06, - "loss": 0.6433, + "epoch": 0.2521193092621664, + "grad_norm": 2.2319401902707146, + "learning_rate": 1.7522680233509947e-05, + "loss": 0.987, "step": 2409 }, { - "epoch": 1.59, - "learning_rate": 2.1598869939980225e-06, - "loss": 0.6724, + "epoch": 0.2522239665096808, + "grad_norm": 2.7690125251281366, + "learning_rate": 1.752044645979867e-05, + "loss": 1.1371, "step": 2410 }, { - "epoch": 1.59, - "learning_rate": 2.153271857715915e-06, - "loss": 0.6688, + "epoch": 0.25232862375719517, + "grad_norm": 2.286815126513628, + "learning_rate": 1.7518211821972845e-05, + "loss": 1.1027, "step": 2411 }, { - "epoch": 1.59, - "learning_rate": 2.1466656446789757e-06, - "loss": 0.532, + "epoch": 0.2524332810047096, + "grad_norm": 2.0820445058724695, + "learning_rate": 1.7515976320289247e-05, + "loss": 1.0881, "step": 2412 }, { - "epoch": 1.59, - "learning_rate": 2.140068362399745e-06, - "loss": 0.775, + "epoch": 0.252537938252224, + "grad_norm": 2.987553814644758, + "learning_rate": 1.751373995500473e-05, + "loss": 1.107, "step": 2413 }, { - "epoch": 1.59, - "learning_rate": 2.133480018380608e-06, - "loss": 0.7806, + "epoch": 0.25264259549973833, + "grad_norm": 1.9581027443263326, + "learning_rate": 1.7511502726376257e-05, + "loss": 0.8612, "step": 2414 }, { - "epoch": 1.59, - "learning_rate": 2.1269006201137833e-06, - "loss": 0.6316, + "epoch": 0.25274725274725274, + "grad_norm": 2.376542007505258, + "learning_rate": 1.7509264634660896e-05, + "loss": 1.006, "step": 2415 }, { - "epoch": 1.59, - "learning_rate": 2.1203301750813166e-06, - "loss": 0.7329, + "epoch": 0.25285190999476714, + "grad_norm": 2.80707883970879, + "learning_rate": 1.7507025680115807e-05, + "loss": 1.1002, "step": 2416 }, { - "epoch": 1.59, - "learning_rate": 2.1137686907550737e-06, - "loss": 0.6746, + "epoch": 0.25295656724228155, + "grad_norm": 1.6942186283185672, + "learning_rate": 1.7504785862998246e-05, + "loss": 0.8788, "step": 2417 }, { - "epoch": 1.59, - "learning_rate": 2.1072161745967313e-06, - "loss": 0.6451, + "epoch": 0.2530612244897959, + "grad_norm": 2.5777349929769433, + "learning_rate": 1.750254518356558e-05, + "loss": 1.103, "step": 2418 }, { - "epoch": 1.59, - "learning_rate": 2.1006726340577597e-06, - "loss": 0.6995, + "epoch": 0.2531658817373103, + "grad_norm": 2.4495279259341776, + "learning_rate": 1.7500303642075257e-05, + "loss": 1.0692, "step": 2419 }, { - "epoch": 1.59, - "learning_rate": 2.094138076579433e-06, - "loss": 0.695, + "epoch": 0.2532705389848247, + "grad_norm": 1.8822814244979826, + "learning_rate": 1.7498061238784843e-05, + "loss": 0.8956, "step": 2420 }, { - "epoch": 1.59, - "learning_rate": 2.0876125095928033e-06, - "loss": 0.7856, + "epoch": 0.2533751962323391, + "grad_norm": 2.223853110843607, + "learning_rate": 1.7495817973951997e-05, + "loss": 0.969, "step": 2421 }, { - "epoch": 1.59, - "learning_rate": 2.0810959405187005e-06, - "loss": 0.6353, + "epoch": 0.25347985347985347, + "grad_norm": 2.238360132707453, + "learning_rate": 1.7493573847834465e-05, + "loss": 1.0264, "step": 2422 }, { - "epoch": 1.6, - "learning_rate": 2.0745883767677246e-06, - "loss": 0.7336, + "epoch": 0.2535845107273679, + "grad_norm": 2.0197198632761606, + "learning_rate": 1.749132886069011e-05, + "loss": 0.9527, "step": 2423 }, { - "epoch": 1.6, - "learning_rate": 2.068089825740225e-06, - "loss": 0.7266, + "epoch": 0.2536891679748823, + "grad_norm": 2.22631274546597, + "learning_rate": 1.748908301277688e-05, + "loss": 0.9435, "step": 2424 }, { - "epoch": 1.6, - "learning_rate": 2.0616002948263157e-06, - "loss": 0.7532, + "epoch": 0.25379382522239663, + "grad_norm": 2.299475782910513, + "learning_rate": 1.748683630435283e-05, + "loss": 1.1404, "step": 2425 }, { - "epoch": 1.6, - "learning_rate": 2.0551197914058464e-06, - "loss": 0.6806, + "epoch": 0.25389848246991104, + "grad_norm": 2.5843227105301034, + "learning_rate": 1.7484588735676117e-05, + "loss": 0.8494, "step": 2426 }, { - "epoch": 1.6, - "learning_rate": 2.0486483228484012e-06, - "loss": 0.7049, + "epoch": 0.25400313971742544, + "grad_norm": 1.9572204780029898, + "learning_rate": 1.748234030700498e-05, + "loss": 0.9776, "step": 2427 }, { - "epoch": 1.6, - "learning_rate": 2.0421858965132856e-06, - "loss": 0.6314, + "epoch": 0.25410779696493985, + "grad_norm": 3.069066964069534, + "learning_rate": 1.7480091018597775e-05, + "loss": 0.968, "step": 2428 }, { - "epoch": 1.6, - "learning_rate": 2.035732519749528e-06, - "loss": 0.6876, + "epoch": 0.2542124542124542, + "grad_norm": 2.5784423660832405, + "learning_rate": 1.7477840870712946e-05, + "loss": 1.0712, "step": 2429 }, { - "epoch": 1.6, - "learning_rate": 2.0292881998958634e-06, - "loss": 0.6627, + "epoch": 0.2543171114599686, + "grad_norm": 2.400723658989671, + "learning_rate": 1.7475589863609042e-05, + "loss": 0.9463, "step": 2430 }, { - "epoch": 1.6, - "learning_rate": 2.022852944280731e-06, - "loss": 0.7102, + "epoch": 0.254421768707483, + "grad_norm": 2.332512218507682, + "learning_rate": 1.747333799754471e-05, + "loss": 1.0427, "step": 2431 }, { - "epoch": 1.6, - "learning_rate": 2.016426760222259e-06, - "loss": 0.6204, + "epoch": 0.25452642595499736, + "grad_norm": 2.069397809708061, + "learning_rate": 1.747108527277869e-05, + "loss": 0.9479, "step": 2432 }, { - "epoch": 1.6, - "learning_rate": 2.0100096550282543e-06, - "loss": 0.5164, + "epoch": 0.25463108320251177, + "grad_norm": 2.284939347387363, + "learning_rate": 1.7468831689569827e-05, + "loss": 1.0377, "step": 2433 }, { - "epoch": 1.6, - "learning_rate": 2.0036016359962084e-06, - "loss": 0.7561, + "epoch": 0.25473574045002617, + "grad_norm": 2.013249004544656, + "learning_rate": 1.7466577248177062e-05, + "loss": 1.0894, "step": 2434 }, { - "epoch": 1.6, - "learning_rate": 1.9972027104132763e-06, - "loss": 0.7087, + "epoch": 0.2548403976975406, + "grad_norm": 2.095695638030201, + "learning_rate": 1.746432194885944e-05, + "loss": 1.0162, "step": 2435 }, { - "epoch": 1.6, - "learning_rate": 1.990812885556269e-06, - "loss": 0.7155, + "epoch": 0.2549450549450549, + "grad_norm": 2.308895697628336, + "learning_rate": 1.7462065791876087e-05, + "loss": 0.9949, "step": 2436 }, { - "epoch": 1.6, - "learning_rate": 1.9844321686916602e-06, - "loss": 0.7704, + "epoch": 0.25504971219256933, + "grad_norm": 2.1787458199138183, + "learning_rate": 1.745980877748625e-05, + "loss": 1.0669, "step": 2437 }, { - "epoch": 1.61, - "learning_rate": 1.9780605670755473e-06, - "loss": 0.6917, + "epoch": 0.25515436944008374, + "grad_norm": 2.27381391199627, + "learning_rate": 1.7457550905949263e-05, + "loss": 0.9188, "step": 2438 }, { - "epoch": 1.61, - "learning_rate": 1.971698087953676e-06, - "loss": 0.6903, + "epoch": 0.2552590266875981, + "grad_norm": 2.0547178362816423, + "learning_rate": 1.745529217752456e-05, + "loss": 1.0003, "step": 2439 }, { - "epoch": 1.61, - "learning_rate": 1.965344738561414e-06, - "loss": 0.6161, + "epoch": 0.2553636839351125, + "grad_norm": 2.003826247372063, + "learning_rate": 1.7453032592471667e-05, + "loss": 0.8219, "step": 2440 }, { - "epoch": 1.61, - "learning_rate": 1.9590005261237475e-06, - "loss": 0.6375, + "epoch": 0.2554683411826269, + "grad_norm": 2.2165668382792676, + "learning_rate": 1.7450772151050226e-05, + "loss": 1.078, "step": 2441 }, { - "epoch": 1.61, - "learning_rate": 1.9526654578552673e-06, - "loss": 0.7197, + "epoch": 0.2555729984301413, + "grad_norm": 2.406127059643738, + "learning_rate": 1.744851085351996e-05, + "loss": 1.0154, "step": 2442 }, { - "epoch": 1.61, - "learning_rate": 1.9463395409601713e-06, - "loss": 0.6918, + "epoch": 0.25567765567765566, + "grad_norm": 1.9609166344880296, + "learning_rate": 1.7446248700140694e-05, + "loss": 1.0885, "step": 2443 }, { - "epoch": 1.61, - "learning_rate": 1.940022782632248e-06, - "loss": 0.6887, + "epoch": 0.25578231292517006, + "grad_norm": 2.0711606782794947, + "learning_rate": 1.744398569117236e-05, + "loss": 1.0171, "step": 2444 }, { - "epoch": 1.61, - "learning_rate": 1.9337151900548734e-06, - "loss": 0.7296, + "epoch": 0.25588697017268447, + "grad_norm": 2.0871258754382636, + "learning_rate": 1.7441721826874976e-05, + "loss": 0.9873, "step": 2445 }, { - "epoch": 1.61, - "learning_rate": 1.9274167704009973e-06, - "loss": 0.6461, + "epoch": 0.2559916274201989, + "grad_norm": 2.112116868972381, + "learning_rate": 1.743945710750867e-05, + "loss": 0.9903, "step": 2446 }, { - "epoch": 1.61, - "learning_rate": 1.921127530833138e-06, - "loss": 0.7237, + "epoch": 0.2560962846677132, + "grad_norm": 1.8482931898345583, + "learning_rate": 1.7437191533333663e-05, + "loss": 0.8978, "step": 2447 }, { - "epoch": 1.61, - "learning_rate": 1.9148474785033768e-06, - "loss": 0.6848, + "epoch": 0.25620094191522763, + "grad_norm": 2.361301370582731, + "learning_rate": 1.7434925104610273e-05, + "loss": 1.0131, "step": 2448 }, { - "epoch": 1.61, - "learning_rate": 1.908576620553345e-06, - "loss": 0.6442, + "epoch": 0.25630559916274204, + "grad_norm": 2.1470314899465275, + "learning_rate": 1.7432657821598912e-05, + "loss": 1.1222, "step": 2449 }, { - "epoch": 1.61, - "learning_rate": 1.902314964114219e-06, - "loss": 0.7891, + "epoch": 0.2564102564102564, + "grad_norm": 2.243154931598059, + "learning_rate": 1.74303896845601e-05, + "loss": 0.9956, "step": 2450 }, { - "epoch": 1.61, - "learning_rate": 1.8960625163067158e-06, - "loss": 0.6919, + "epoch": 0.2565149136577708, + "grad_norm": 2.090796158960878, + "learning_rate": 1.7428120693754453e-05, + "loss": 0.9429, "step": 2451 }, { - "epoch": 1.61, - "learning_rate": 1.889819284241069e-06, - "loss": 0.7969, + "epoch": 0.2566195709052852, + "grad_norm": 2.129012668639158, + "learning_rate": 1.7425850849442677e-05, + "loss": 0.9842, "step": 2452 }, { - "epoch": 1.61, - "learning_rate": 1.8835852750170414e-06, - "loss": 0.7172, + "epoch": 0.2567242281527996, + "grad_norm": 2.587169226816751, + "learning_rate": 1.742358015188559e-05, + "loss": 0.9024, "step": 2453 }, { - "epoch": 1.62, - "learning_rate": 1.8773604957239067e-06, - "loss": 0.6268, + "epoch": 0.25682888540031396, + "grad_norm": 2.186311266652209, + "learning_rate": 1.742130860134409e-05, + "loss": 1.0747, "step": 2454 }, { - "epoch": 1.62, - "learning_rate": 1.8711449534404424e-06, - "loss": 0.6722, + "epoch": 0.25693354264782836, + "grad_norm": 2.2324519069996223, + "learning_rate": 1.7419036198079186e-05, + "loss": 1.0882, "step": 2455 }, { - "epoch": 1.62, - "learning_rate": 1.8649386552349136e-06, - "loss": 0.7591, + "epoch": 0.25703819989534277, + "grad_norm": 2.044693604835276, + "learning_rate": 1.7416762942351986e-05, + "loss": 1.0221, "step": 2456 }, { - "epoch": 1.62, - "learning_rate": 1.8587416081650866e-06, - "loss": 0.684, + "epoch": 0.2571428571428571, + "grad_norm": 2.4466601197617344, + "learning_rate": 1.7414488834423687e-05, + "loss": 1.1308, "step": 2457 }, { - "epoch": 1.62, - "learning_rate": 1.8525538192782e-06, - "loss": 0.554, + "epoch": 0.2572475143903715, + "grad_norm": 2.473780111943485, + "learning_rate": 1.7412213874555594e-05, + "loss": 1.1312, "step": 2458 }, { - "epoch": 1.62, - "learning_rate": 1.8463752956109616e-06, - "loss": 0.6304, + "epoch": 0.25735217163788593, + "grad_norm": 2.4776047017601948, + "learning_rate": 1.7409938063009092e-05, + "loss": 0.9445, "step": 2459 }, { - "epoch": 1.62, - "learning_rate": 1.8402060441895508e-06, - "loss": 0.675, + "epoch": 0.25745682888540034, + "grad_norm": 2.4309185606448636, + "learning_rate": 1.740766140004569e-05, + "loss": 1.1362, "step": 2460 }, { - "epoch": 1.62, - "learning_rate": 1.8340460720295917e-06, - "loss": 0.6858, + "epoch": 0.2575614861329147, + "grad_norm": 2.3084232636893853, + "learning_rate": 1.740538388592698e-05, + "loss": 0.9599, "step": 2461 }, { - "epoch": 1.62, - "learning_rate": 1.827895386136166e-06, - "loss": 0.6739, + "epoch": 0.2576661433804291, + "grad_norm": 2.2338587055904693, + "learning_rate": 1.7403105520914644e-05, + "loss": 1.0872, "step": 2462 }, { - "epoch": 1.62, - "learning_rate": 1.8217539935037897e-06, - "loss": 0.7592, + "epoch": 0.2577708006279435, + "grad_norm": 2.32586858159479, + "learning_rate": 1.7400826305270477e-05, + "loss": 1.1381, "step": 2463 }, { - "epoch": 1.62, - "learning_rate": 1.8156219011164121e-06, - "loss": 0.6757, + "epoch": 0.2578754578754579, + "grad_norm": 1.9459202093623904, + "learning_rate": 1.7398546239256367e-05, + "loss": 0.888, "step": 2464 }, { - "epoch": 1.62, - "learning_rate": 1.8094991159474107e-06, - "loss": 0.6849, + "epoch": 0.25798011512297225, + "grad_norm": 2.1322352083234266, + "learning_rate": 1.7396265323134293e-05, + "loss": 1.0406, "step": 2465 }, { - "epoch": 1.62, - "learning_rate": 1.8033856449595689e-06, - "loss": 0.608, + "epoch": 0.25808477237048666, + "grad_norm": 2.1704661597271118, + "learning_rate": 1.739398355716634e-05, + "loss": 0.9406, "step": 2466 }, { - "epoch": 1.62, - "learning_rate": 1.7972814951050855e-06, - "loss": 0.8048, + "epoch": 0.25818942961800107, + "grad_norm": 2.5070809156506564, + "learning_rate": 1.7391700941614687e-05, + "loss": 0.9449, "step": 2467 }, { - "epoch": 1.62, - "learning_rate": 1.7911866733255556e-06, - "loss": 0.5433, + "epoch": 0.2582940868655154, + "grad_norm": 2.2868072880319286, + "learning_rate": 1.7389417476741615e-05, + "loss": 1.1516, "step": 2468 }, { - "epoch": 1.63, - "learning_rate": 1.785101186551975e-06, - "loss": 0.698, + "epoch": 0.2583987441130298, + "grad_norm": 2.2741590192268526, + "learning_rate": 1.7387133162809492e-05, + "loss": 1.0228, "step": 2469 }, { - "epoch": 1.63, - "learning_rate": 1.7790250417047116e-06, - "loss": 0.6862, + "epoch": 0.2585034013605442, + "grad_norm": 2.361486277697732, + "learning_rate": 1.738484800008079e-05, + "loss": 0.9587, "step": 2470 }, { - "epoch": 1.63, - "learning_rate": 1.772958245693518e-06, - "loss": 0.668, + "epoch": 0.25860805860805863, + "grad_norm": 2.1605751628195358, + "learning_rate": 1.738256198881809e-05, + "loss": 1.087, "step": 2471 }, { - "epoch": 1.63, - "learning_rate": 1.7669008054175108e-06, - "loss": 0.6286, + "epoch": 0.258712715855573, + "grad_norm": 2.0828368567159368, + "learning_rate": 1.7380275129284047e-05, + "loss": 1.0085, "step": 2472 }, { - "epoch": 1.63, - "learning_rate": 1.7608527277651711e-06, - "loss": 0.6147, + "epoch": 0.2588173731030874, + "grad_norm": 2.180627297007591, + "learning_rate": 1.7377987421741428e-05, + "loss": 1.0592, "step": 2473 }, { - "epoch": 1.63, - "learning_rate": 1.7548140196143338e-06, - "loss": 0.6577, + "epoch": 0.2589220303506018, + "grad_norm": 2.2690156377111035, + "learning_rate": 1.73756988664531e-05, + "loss": 1.1374, "step": 2474 }, { - "epoch": 1.63, - "learning_rate": 1.7487846878321691e-06, - "loss": 0.7557, + "epoch": 0.25902668759811615, + "grad_norm": 2.16338779374175, + "learning_rate": 1.7373409463682017e-05, + "loss": 0.999, "step": 2475 }, { - "epoch": 1.63, - "learning_rate": 1.7427647392751967e-06, - "loss": 0.5973, + "epoch": 0.25913134484563055, + "grad_norm": 2.1101239537203216, + "learning_rate": 1.7371119213691243e-05, + "loss": 0.9949, "step": 2476 }, { - "epoch": 1.63, - "learning_rate": 1.7367541807892586e-06, - "loss": 0.5939, + "epoch": 0.25923600209314496, + "grad_norm": 2.0134589337565085, + "learning_rate": 1.7368828116743927e-05, + "loss": 0.9535, "step": 2477 }, { - "epoch": 1.63, - "learning_rate": 1.7307530192095212e-06, - "loss": 0.6397, + "epoch": 0.25934065934065936, + "grad_norm": 2.0592210690399995, + "learning_rate": 1.7366536173103325e-05, + "loss": 1.0145, "step": 2478 }, { - "epoch": 1.63, - "learning_rate": 1.7247612613604637e-06, - "loss": 0.7555, + "epoch": 0.2594453165881737, + "grad_norm": 2.418699264093393, + "learning_rate": 1.736424338303278e-05, + "loss": 0.9113, "step": 2479 }, { - "epoch": 1.63, - "learning_rate": 1.718778914055873e-06, - "loss": 0.714, + "epoch": 0.2595499738356881, + "grad_norm": 2.6190682853440084, + "learning_rate": 1.7361949746795744e-05, + "loss": 1.1143, "step": 2480 }, { - "epoch": 1.63, - "learning_rate": 1.712805984098832e-06, - "loss": 0.6922, + "epoch": 0.2596546310832025, + "grad_norm": 2.2236033474143975, + "learning_rate": 1.7359655264655755e-05, + "loss": 0.9108, "step": 2481 }, { - "epoch": 1.63, - "learning_rate": 1.7068424782817162e-06, - "loss": 0.8039, + "epoch": 0.2597592883307169, + "grad_norm": 2.1342142709306207, + "learning_rate": 1.7357359936876457e-05, + "loss": 1.077, "step": 2482 }, { - "epoch": 1.63, - "learning_rate": 1.700888403386186e-06, - "loss": 0.6169, + "epoch": 0.2598639455782313, + "grad_norm": 2.0438697294062544, + "learning_rate": 1.735506376372159e-05, + "loss": 1.0832, "step": 2483 }, { - "epoch": 1.64, - "learning_rate": 1.694943766183168e-06, - "loss": 0.6418, + "epoch": 0.2599686028257457, + "grad_norm": 2.2685453564325933, + "learning_rate": 1.7352766745454982e-05, + "loss": 1.0577, "step": 2484 }, { - "epoch": 1.64, - "learning_rate": 1.6890085734328666e-06, - "loss": 0.7807, + "epoch": 0.2600732600732601, + "grad_norm": 2.277948996427162, + "learning_rate": 1.7350468882340572e-05, + "loss": 1.1536, "step": 2485 }, { - "epoch": 1.64, - "learning_rate": 1.6830828318847415e-06, - "loss": 0.6801, + "epoch": 0.26017791732077444, + "grad_norm": 2.2839736714043117, + "learning_rate": 1.7348170174642387e-05, + "loss": 1.0408, "step": 2486 }, { - "epoch": 1.64, - "learning_rate": 1.6771665482775058e-06, - "loss": 0.7606, + "epoch": 0.26028257456828885, + "grad_norm": 2.1783014726910532, + "learning_rate": 1.7345870622624552e-05, + "loss": 1.0058, "step": 2487 }, { - "epoch": 1.64, - "learning_rate": 1.6712597293391153e-06, - "loss": 0.6118, + "epoch": 0.26038723181580326, + "grad_norm": 2.316943295044069, + "learning_rate": 1.734357022655129e-05, + "loss": 1.1339, "step": 2488 }, { - "epoch": 1.64, - "learning_rate": 1.6653623817867638e-06, - "loss": 0.6275, + "epoch": 0.26049188906331766, + "grad_norm": 2.2969718645888144, + "learning_rate": 1.7341268986686924e-05, + "loss": 1.0786, "step": 2489 }, { - "epoch": 1.64, - "learning_rate": 1.6594745123268751e-06, - "loss": 0.6102, + "epoch": 0.260596546310832, + "grad_norm": 2.1322028884382185, + "learning_rate": 1.7338966903295865e-05, + "loss": 0.8794, "step": 2490 }, { - "epoch": 1.64, - "learning_rate": 1.6535961276550948e-06, - "loss": 0.6771, + "epoch": 0.2607012035583464, + "grad_norm": 2.3892801496526994, + "learning_rate": 1.7336663976642634e-05, + "loss": 1.0003, "step": 2491 }, { - "epoch": 1.64, - "learning_rate": 1.6477272344562789e-06, - "loss": 0.5991, + "epoch": 0.2608058608058608, + "grad_norm": 2.3446023789762047, + "learning_rate": 1.7334360206991842e-05, + "loss": 0.9991, "step": 2492 }, { - "epoch": 1.64, - "learning_rate": 1.6418678394044962e-06, - "loss": 0.6772, + "epoch": 0.2609105180533752, + "grad_norm": 2.222641398320293, + "learning_rate": 1.7332055594608188e-05, + "loss": 1.0998, "step": 2493 }, { - "epoch": 1.64, - "learning_rate": 1.6360179491630056e-06, - "loss": 0.6505, + "epoch": 0.2610151753008896, + "grad_norm": 2.0463322929767784, + "learning_rate": 1.7329750139756484e-05, + "loss": 1.0081, "step": 2494 }, { - "epoch": 1.64, - "learning_rate": 1.6301775703842637e-06, - "loss": 0.7815, + "epoch": 0.261119832548404, + "grad_norm": 2.156653372785402, + "learning_rate": 1.732744384270163e-05, + "loss": 1.0108, "step": 2495 }, { - "epoch": 1.64, - "learning_rate": 1.6243467097099097e-06, - "loss": 0.614, + "epoch": 0.2612244897959184, + "grad_norm": 2.2798443834114432, + "learning_rate": 1.732513670370863e-05, + "loss": 0.9697, "step": 2496 }, { - "epoch": 1.64, - "learning_rate": 1.6185253737707607e-06, - "loss": 0.6995, + "epoch": 0.26132914704343274, + "grad_norm": 2.5215817238707827, + "learning_rate": 1.7322828723042566e-05, + "loss": 1.0863, "step": 2497 }, { - "epoch": 1.64, - "learning_rate": 1.6127135691867945e-06, - "loss": 0.5755, + "epoch": 0.26143380429094715, + "grad_norm": 2.1107145371174347, + "learning_rate": 1.732051990096864e-05, + "loss": 1.0245, "step": 2498 }, { - "epoch": 1.65, - "learning_rate": 1.6069113025671546e-06, - "loss": 0.6053, + "epoch": 0.26153846153846155, + "grad_norm": 2.3976283399692537, + "learning_rate": 1.7318210237752137e-05, + "loss": 0.9136, "step": 2499 }, { - "epoch": 1.65, - "learning_rate": 1.6011185805101447e-06, - "loss": 0.6471, + "epoch": 0.2616431187859759, + "grad_norm": 2.182358026003498, + "learning_rate": 1.7315899733658443e-05, + "loss": 0.9992, "step": 2500 }, { - "epoch": 1.65, - "learning_rate": 1.5953354096032048e-06, - "loss": 0.5996, + "epoch": 0.2617477760334903, + "grad_norm": 2.2560760531967947, + "learning_rate": 1.7313588388953037e-05, + "loss": 0.9428, "step": 2501 }, { - "epoch": 1.65, - "learning_rate": 1.5895617964229215e-06, - "loss": 0.6437, + "epoch": 0.2618524332810047, + "grad_norm": 2.409534450316176, + "learning_rate": 1.73112762039015e-05, + "loss": 1.1515, "step": 2502 }, { - "epoch": 1.65, - "learning_rate": 1.5837977475350008e-06, - "loss": 0.6752, + "epoch": 0.2619570905285191, + "grad_norm": 2.0440248637735365, + "learning_rate": 1.7308963178769507e-05, + "loss": 0.9662, "step": 2503 }, { - "epoch": 1.65, - "learning_rate": 1.5780432694942815e-06, - "loss": 0.8052, + "epoch": 0.26206174777603347, + "grad_norm": 2.2837513962147105, + "learning_rate": 1.7306649313822826e-05, + "loss": 1.0215, "step": 2504 }, { - "epoch": 1.65, - "learning_rate": 1.572298368844719e-06, - "loss": 0.6576, + "epoch": 0.2621664050235479, + "grad_norm": 2.0651869724475986, + "learning_rate": 1.7304334609327326e-05, + "loss": 1.0749, "step": 2505 }, { - "epoch": 1.65, - "learning_rate": 1.5665630521193753e-06, - "loss": 0.7189, + "epoch": 0.2622710622710623, + "grad_norm": 2.2950964938180034, + "learning_rate": 1.7302019065548973e-05, + "loss": 1.0932, "step": 2506 }, { - "epoch": 1.65, - "learning_rate": 1.5608373258404087e-06, - "loss": 0.747, + "epoch": 0.2623757195185767, + "grad_norm": 2.6958405572322697, + "learning_rate": 1.7299702682753826e-05, + "loss": 0.8741, "step": 2507 }, { - "epoch": 1.65, - "learning_rate": 1.5551211965190783e-06, - "loss": 0.7151, + "epoch": 0.26248037676609104, + "grad_norm": 2.3931112849601615, + "learning_rate": 1.7297385461208044e-05, + "loss": 0.9674, "step": 2508 }, { - "epoch": 1.65, - "learning_rate": 1.5494146706557257e-06, - "loss": 0.6727, + "epoch": 0.26258503401360545, + "grad_norm": 2.0885169426285652, + "learning_rate": 1.7295067401177877e-05, + "loss": 1.0842, "step": 2509 }, { - "epoch": 1.65, - "learning_rate": 1.5437177547397742e-06, - "loss": 0.6923, + "epoch": 0.26268969126111985, + "grad_norm": 2.3250017262626503, + "learning_rate": 1.7292748502929678e-05, + "loss": 1.012, "step": 2510 }, { - "epoch": 1.65, - "learning_rate": 1.538030455249716e-06, - "loss": 0.655, + "epoch": 0.2627943485086342, + "grad_norm": 2.379541018104846, + "learning_rate": 1.7290428766729893e-05, + "loss": 1.1306, "step": 2511 }, { - "epoch": 1.65, - "learning_rate": 1.5323527786531113e-06, - "loss": 0.5931, + "epoch": 0.2628990057561486, + "grad_norm": 2.1706941809048153, + "learning_rate": 1.728810819284506e-05, + "loss": 0.9991, "step": 2512 }, { - "epoch": 1.65, - "learning_rate": 1.5266847314065726e-06, - "loss": 0.6519, + "epoch": 0.263003663003663, + "grad_norm": 2.075305628554083, + "learning_rate": 1.7285786781541825e-05, + "loss": 1.0348, "step": 2513 }, { - "epoch": 1.66, - "learning_rate": 1.521026319955764e-06, - "loss": 0.741, + "epoch": 0.2631083202511774, + "grad_norm": 2.0890810561710933, + "learning_rate": 1.7283464533086917e-05, + "loss": 0.8281, "step": 2514 }, { - "epoch": 1.66, - "learning_rate": 1.515377550735394e-06, - "loss": 0.6867, + "epoch": 0.26321297749869177, + "grad_norm": 2.3435013116193035, + "learning_rate": 1.728114144774717e-05, + "loss": 1.0713, "step": 2515 }, { - "epoch": 1.66, - "learning_rate": 1.5097384301692041e-06, - "loss": 0.6973, + "epoch": 0.2633176347462062, + "grad_norm": 2.1669632646702155, + "learning_rate": 1.7278817525789513e-05, + "loss": 1.1107, "step": 2516 }, { - "epoch": 1.66, - "learning_rate": 1.5041089646699603e-06, - "loss": 0.5613, + "epoch": 0.2634222919937206, + "grad_norm": 2.171445878531362, + "learning_rate": 1.7276492767480962e-05, + "loss": 0.9642, "step": 2517 }, { - "epoch": 1.66, - "learning_rate": 1.4984891606394535e-06, - "loss": 0.6384, + "epoch": 0.26352694924123493, + "grad_norm": 2.417950887212835, + "learning_rate": 1.7274167173088643e-05, + "loss": 1.0002, "step": 2518 }, { - "epoch": 1.66, - "learning_rate": 1.4928790244684866e-06, - "loss": 0.7297, + "epoch": 0.26363160648874934, + "grad_norm": 2.0843255174378092, + "learning_rate": 1.727184074287977e-05, + "loss": 1.0267, "step": 2519 }, { - "epoch": 1.66, - "learning_rate": 1.487278562536868e-06, - "loss": 0.663, + "epoch": 0.26373626373626374, + "grad_norm": 2.0538455916227134, + "learning_rate": 1.7269513477121652e-05, + "loss": 0.8743, "step": 2520 }, { - "epoch": 1.66, - "learning_rate": 1.4816877812134034e-06, - "loss": 0.7282, + "epoch": 0.26384092098377815, + "grad_norm": 2.1773574455703892, + "learning_rate": 1.7267185376081702e-05, + "loss": 0.9744, "step": 2521 }, { - "epoch": 1.66, - "learning_rate": 1.4761066868558915e-06, - "loss": 0.7148, + "epoch": 0.2639455782312925, + "grad_norm": 2.5975951297910167, + "learning_rate": 1.7264856440027417e-05, + "loss": 0.9104, "step": 2522 }, { - "epoch": 1.66, - "learning_rate": 1.4705352858111144e-06, - "loss": 0.7311, + "epoch": 0.2640502354788069, + "grad_norm": 2.03025817791244, + "learning_rate": 1.72625266692264e-05, + "loss": 0.987, "step": 2523 }, { - "epoch": 1.66, - "learning_rate": 1.4649735844148304e-06, - "loss": 0.7404, + "epoch": 0.2641548927263213, + "grad_norm": 2.549525974397687, + "learning_rate": 1.726019606394635e-05, + "loss": 1.0157, "step": 2524 }, { - "epoch": 1.66, - "learning_rate": 1.4594215889917707e-06, - "loss": 0.6626, + "epoch": 0.26425954997383566, + "grad_norm": 3.248084272655294, + "learning_rate": 1.7257864624455048e-05, + "loss": 1.0282, "step": 2525 }, { - "epoch": 1.66, - "learning_rate": 1.4538793058556222e-06, - "loss": 0.6253, + "epoch": 0.26436420722135007, + "grad_norm": 2.6468935665860807, + "learning_rate": 1.7255532351020393e-05, + "loss": 0.9803, "step": 2526 }, { - "epoch": 1.66, - "learning_rate": 1.4483467413090324e-06, - "loss": 0.6016, + "epoch": 0.2644688644688645, + "grad_norm": 2.088664398222349, + "learning_rate": 1.7253199243910357e-05, + "loss": 1.1068, "step": 2527 }, { - "epoch": 1.66, - "learning_rate": 1.4428239016435953e-06, - "loss": 0.6774, + "epoch": 0.2645735217163789, + "grad_norm": 2.181269418325987, + "learning_rate": 1.725086530339303e-05, + "loss": 0.9588, "step": 2528 }, { - "epoch": 1.66, - "learning_rate": 1.4373107931398477e-06, - "loss": 0.6316, + "epoch": 0.26467817896389323, + "grad_norm": 1.9358065140447658, + "learning_rate": 1.7248530529736575e-05, + "loss": 0.9188, "step": 2529 }, { - "epoch": 1.67, - "learning_rate": 1.4318074220672595e-06, - "loss": 0.6527, + "epoch": 0.26478283621140764, + "grad_norm": 2.182930759674411, + "learning_rate": 1.724619492320927e-05, + "loss": 1.1289, "step": 2530 }, { - "epoch": 1.67, - "learning_rate": 1.426313794684221e-06, - "loss": 0.659, + "epoch": 0.26488749345892204, + "grad_norm": 2.1868917273765227, + "learning_rate": 1.724385848407948e-05, + "loss": 0.9698, "step": 2531 }, { - "epoch": 1.67, - "learning_rate": 1.4208299172380512e-06, - "loss": 0.6483, + "epoch": 0.26499215070643645, + "grad_norm": 2.0248128570915918, + "learning_rate": 1.7241521212615663e-05, + "loss": 0.9791, "step": 2532 }, { - "epoch": 1.67, - "learning_rate": 1.4153557959649788e-06, - "loss": 0.6334, + "epoch": 0.2650968079539508, + "grad_norm": 2.4684093554379167, + "learning_rate": 1.7239183109086384e-05, + "loss": 1.0178, "step": 2533 }, { - "epoch": 1.67, - "learning_rate": 1.4098914370901385e-06, - "loss": 0.6306, + "epoch": 0.2652014652014652, + "grad_norm": 2.3248588536485535, + "learning_rate": 1.7236844173760286e-05, + "loss": 0.9927, "step": 2534 }, { - "epoch": 1.67, - "learning_rate": 1.4044368468275549e-06, - "loss": 0.7679, + "epoch": 0.2653061224489796, + "grad_norm": 2.3426825101423945, + "learning_rate": 1.7234504406906124e-05, + "loss": 1.0613, "step": 2535 }, { - "epoch": 1.67, - "learning_rate": 1.3989920313801552e-06, - "loss": 0.6251, + "epoch": 0.26541077969649396, + "grad_norm": 2.1367495439409643, + "learning_rate": 1.7232163808792733e-05, + "loss": 1.0705, "step": 2536 }, { - "epoch": 1.67, - "learning_rate": 1.3935569969397434e-06, - "loss": 0.5506, + "epoch": 0.26551543694400837, + "grad_norm": 2.3408082854159264, + "learning_rate": 1.7229822379689067e-05, + "loss": 1.1775, "step": 2537 }, { - "epoch": 1.67, - "learning_rate": 1.3881317496870062e-06, - "loss": 0.672, + "epoch": 0.26562009419152277, + "grad_norm": 2.2416930097749646, + "learning_rate": 1.722748011986415e-05, + "loss": 1.0386, "step": 2538 }, { - "epoch": 1.67, - "learning_rate": 1.3827162957914975e-06, - "loss": 0.5093, + "epoch": 0.2657247514390372, + "grad_norm": 2.1645588725679112, + "learning_rate": 1.7225137029587115e-05, + "loss": 0.9904, "step": 2539 }, { - "epoch": 1.67, - "learning_rate": 1.37731064141163e-06, - "loss": 0.5369, + "epoch": 0.2658294086865515, + "grad_norm": 2.1423029600647925, + "learning_rate": 1.7222793109127193e-05, + "loss": 0.987, "step": 2540 }, { - "epoch": 1.67, - "learning_rate": 1.3719147926946797e-06, - "loss": 0.6313, + "epoch": 0.26593406593406593, + "grad_norm": 2.360699966256053, + "learning_rate": 1.7220448358753693e-05, + "loss": 1.0804, "step": 2541 }, { - "epoch": 1.67, - "learning_rate": 1.3665287557767682e-06, - "loss": 0.7615, + "epoch": 0.26603872318158034, + "grad_norm": 2.484820983087851, + "learning_rate": 1.7218102778736046e-05, + "loss": 1.1052, "step": 2542 }, { - "epoch": 1.67, - "learning_rate": 1.361152536782856e-06, - "loss": 0.7162, + "epoch": 0.2661433804290947, + "grad_norm": 2.5559902418339298, + "learning_rate": 1.7215756369343755e-05, + "loss": 1.166, "step": 2543 }, { - "epoch": 1.67, - "learning_rate": 1.3557861418267514e-06, - "loss": 0.5893, + "epoch": 0.2662480376766091, + "grad_norm": 2.273062565811467, + "learning_rate": 1.721340913084643e-05, + "loss": 1.0314, "step": 2544 }, { - "epoch": 1.68, - "learning_rate": 1.3504295770110731e-06, - "loss": 0.6642, + "epoch": 0.2663526949241235, + "grad_norm": 2.390606522743288, + "learning_rate": 1.721106106351377e-05, + "loss": 1.0458, "step": 2545 }, { - "epoch": 1.68, - "learning_rate": 1.3450828484272727e-06, - "loss": 0.6711, + "epoch": 0.2664573521716379, + "grad_norm": 2.3461424836775313, + "learning_rate": 1.720871216761558e-05, + "loss": 0.9914, "step": 2546 }, { - "epoch": 1.68, - "learning_rate": 1.339745962155613e-06, - "loss": 0.6091, + "epoch": 0.26656200941915226, + "grad_norm": 2.1093537321214333, + "learning_rate": 1.7206362443421742e-05, + "loss": 0.9635, "step": 2547 }, { - "epoch": 1.68, - "learning_rate": 1.3344189242651672e-06, - "loss": 0.7028, + "epoch": 0.26666666666666666, + "grad_norm": 2.2076603089511333, + "learning_rate": 1.7204011891202256e-05, + "loss": 0.8798, "step": 2548 }, { - "epoch": 1.68, - "learning_rate": 1.3291017408138007e-06, - "loss": 0.7104, + "epoch": 0.26677132391418107, + "grad_norm": 2.1198208472221376, + "learning_rate": 1.7201660511227195e-05, + "loss": 1.0799, "step": 2549 }, { - "epoch": 1.68, - "learning_rate": 1.32379441784818e-06, - "loss": 0.6543, + "epoch": 0.2668759811616955, + "grad_norm": 2.257648787705932, + "learning_rate": 1.7199308303766745e-05, + "loss": 1.1216, "step": 2550 }, { - "epoch": 1.68, - "learning_rate": 1.3184969614037567e-06, - "loss": 0.6472, + "epoch": 0.2669806384092098, + "grad_norm": 2.167568660205902, + "learning_rate": 1.7196955269091175e-05, + "loss": 0.9121, "step": 2551 }, { - "epoch": 1.68, - "learning_rate": 1.3132093775047616e-06, - "loss": 0.8348, + "epoch": 0.26708529565672423, + "grad_norm": 2.070104233631567, + "learning_rate": 1.7194601407470857e-05, + "loss": 1.1141, "step": 2552 }, { - "epoch": 1.68, - "learning_rate": 1.3079316721641987e-06, - "loss": 0.7482, + "epoch": 0.26718995290423864, + "grad_norm": 2.121285217113809, + "learning_rate": 1.719224671917625e-05, + "loss": 1.0316, "step": 2553 }, { - "epoch": 1.68, - "learning_rate": 1.3026638513838386e-06, - "loss": 0.588, + "epoch": 0.267294610151753, + "grad_norm": 2.1816334225398997, + "learning_rate": 1.718989120447792e-05, + "loss": 0.8018, "step": 2554 }, { - "epoch": 1.68, - "learning_rate": 1.29740592115421e-06, - "loss": 0.5869, + "epoch": 0.2673992673992674, + "grad_norm": 2.2372260823335557, + "learning_rate": 1.718753486364651e-05, + "loss": 0.936, "step": 2555 }, { - "epoch": 1.68, - "learning_rate": 1.292157887454596e-06, - "loss": 0.6731, + "epoch": 0.2675039246467818, + "grad_norm": 2.1411788143946895, + "learning_rate": 1.7185177696952773e-05, + "loss": 1.0857, "step": 2556 }, { - "epoch": 1.68, - "learning_rate": 1.2869197562530223e-06, - "loss": 0.7313, + "epoch": 0.2676085818942962, + "grad_norm": 2.06941219144131, + "learning_rate": 1.7182819704667552e-05, + "loss": 0.9053, "step": 2557 }, { - "epoch": 1.68, - "learning_rate": 1.2816915335062596e-06, - "loss": 0.72, + "epoch": 0.26771323914181056, + "grad_norm": 2.152176539140967, + "learning_rate": 1.718046088706179e-05, + "loss": 0.9493, "step": 2558 }, { - "epoch": 1.68, - "learning_rate": 1.2764732251598022e-06, - "loss": 0.5977, + "epoch": 0.26781789638932496, + "grad_norm": 2.09104261234392, + "learning_rate": 1.7178101244406512e-05, + "loss": 1.0374, "step": 2559 }, { - "epoch": 1.69, - "learning_rate": 1.2712648371478763e-06, - "loss": 0.7444, + "epoch": 0.26792255363683937, + "grad_norm": 2.346305270831672, + "learning_rate": 1.7175740776972855e-05, + "loss": 0.9768, "step": 2560 }, { - "epoch": 1.69, - "learning_rate": 1.2660663753934256e-06, - "loss": 0.6558, + "epoch": 0.2680272108843537, + "grad_norm": 2.2927895598403847, + "learning_rate": 1.7173379485032028e-05, + "loss": 0.939, "step": 2561 }, { - "epoch": 1.69, - "learning_rate": 1.260877845808105e-06, - "loss": 0.6358, + "epoch": 0.2681318681318681, + "grad_norm": 2.3628912312336148, + "learning_rate": 1.7171017368855363e-05, + "loss": 1.0344, "step": 2562 }, { - "epoch": 1.69, - "learning_rate": 1.2556992542922708e-06, - "loss": 0.7264, + "epoch": 0.26823652537938253, + "grad_norm": 2.210950954427031, + "learning_rate": 1.7168654428714262e-05, + "loss": 1.0587, "step": 2563 }, { - "epoch": 1.69, - "learning_rate": 1.2505306067349853e-06, - "loss": 0.7023, + "epoch": 0.26834118262689693, + "grad_norm": 2.4183417784541517, + "learning_rate": 1.7166290664880234e-05, + "loss": 0.9456, "step": 2564 }, { - "epoch": 1.69, - "learning_rate": 1.2453719090139994e-06, - "loss": 0.7608, + "epoch": 0.2684458398744113, + "grad_norm": 2.123364599494718, + "learning_rate": 1.7163926077624886e-05, + "loss": 0.9227, "step": 2565 }, { - "epoch": 1.69, - "learning_rate": 1.2402231669957465e-06, - "loss": 0.6659, + "epoch": 0.2685504971219257, + "grad_norm": 2.1478356534162586, + "learning_rate": 1.7161560667219907e-05, + "loss": 0.9954, "step": 2566 }, { - "epoch": 1.69, - "learning_rate": 1.2350843865353447e-06, - "loss": 0.5094, + "epoch": 0.2686551543694401, + "grad_norm": 2.2147876235241797, + "learning_rate": 1.7159194433937087e-05, + "loss": 0.9726, "step": 2567 }, { - "epoch": 1.69, - "learning_rate": 1.2299555734765733e-06, - "loss": 0.7642, + "epoch": 0.26875981161695445, + "grad_norm": 2.1406444243414526, + "learning_rate": 1.7156827378048313e-05, + "loss": 0.9457, "step": 2568 }, { - "epoch": 1.69, - "learning_rate": 1.2248367336518874e-06, - "loss": 0.6149, + "epoch": 0.26886446886446885, + "grad_norm": 1.979769022328967, + "learning_rate": 1.7154459499825564e-05, + "loss": 0.9121, "step": 2569 }, { - "epoch": 1.69, - "learning_rate": 1.2197278728823947e-06, - "loss": 0.7276, + "epoch": 0.26896912611198326, + "grad_norm": 2.0747397671929626, + "learning_rate": 1.715209079954092e-05, + "loss": 1.0564, "step": 2570 }, { - "epoch": 1.69, - "learning_rate": 1.214628996977857e-06, - "loss": 0.6943, + "epoch": 0.26907378335949766, + "grad_norm": 2.2405940574201164, + "learning_rate": 1.7149721277466537e-05, + "loss": 1.0748, "step": 2571 }, { - "epoch": 1.69, - "learning_rate": 1.2095401117366823e-06, - "loss": 0.7002, + "epoch": 0.269178440607012, + "grad_norm": 2.38814683134122, + "learning_rate": 1.7147350933874693e-05, + "loss": 0.9308, "step": 2572 }, { - "epoch": 1.69, - "learning_rate": 1.204461222945913e-06, - "loss": 0.6412, + "epoch": 0.2692830978545264, + "grad_norm": 2.1247668633842394, + "learning_rate": 1.7144979769037732e-05, + "loss": 1.0777, "step": 2573 }, { - "epoch": 1.69, - "learning_rate": 1.1993923363812288e-06, - "loss": 0.6831, + "epoch": 0.2693877551020408, + "grad_norm": 2.262069730010127, + "learning_rate": 1.714260778322811e-05, + "loss": 1.0581, "step": 2574 }, { - "epoch": 1.7, - "learning_rate": 1.1943334578069277e-06, - "loss": 0.8331, + "epoch": 0.26949241234955523, + "grad_norm": 2.184376285021379, + "learning_rate": 1.7140234976718376e-05, + "loss": 1.0185, "step": 2575 }, { - "epoch": 1.7, - "learning_rate": 1.1892845929759412e-06, - "loss": 0.712, + "epoch": 0.2695970695970696, + "grad_norm": 2.267123524429682, + "learning_rate": 1.7137861349781172e-05, + "loss": 0.9554, "step": 2576 }, { - "epoch": 1.7, - "learning_rate": 1.1842457476297953e-06, - "loss": 0.7024, + "epoch": 0.269701726844584, + "grad_norm": 2.36727063416499, + "learning_rate": 1.7135486902689226e-05, + "loss": 1.0104, "step": 2577 }, { - "epoch": 1.7, - "learning_rate": 1.179216927498632e-06, - "loss": 0.6374, + "epoch": 0.2698063840920984, + "grad_norm": 2.0589416272264223, + "learning_rate": 1.713311163571537e-05, + "loss": 1.0443, "step": 2578 }, { - "epoch": 1.7, - "learning_rate": 1.1741981383011924e-06, - "loss": 0.6785, + "epoch": 0.26991104133961275, + "grad_norm": 1.902080690208934, + "learning_rate": 1.7130735549132528e-05, + "loss": 1.0204, "step": 2579 }, { - "epoch": 1.7, - "learning_rate": 1.1691893857448089e-06, - "loss": 0.7947, + "epoch": 0.27001569858712715, + "grad_norm": 2.4326326349397838, + "learning_rate": 1.7128358643213715e-05, + "loss": 1.0687, "step": 2580 }, { - "epoch": 1.7, - "learning_rate": 1.1641906755254007e-06, - "loss": 0.8104, + "epoch": 0.27012035583464156, + "grad_norm": 2.068315109987545, + "learning_rate": 1.7125980918232043e-05, + "loss": 1.0579, "step": 2581 }, { - "epoch": 1.7, - "learning_rate": 1.159202013327464e-06, - "loss": 0.671, + "epoch": 0.27022501308215596, + "grad_norm": 2.2810820908724567, + "learning_rate": 1.712360237446072e-05, + "loss": 1.0034, "step": 2582 }, { - "epoch": 1.7, - "learning_rate": 1.154223404824073e-06, - "loss": 0.7137, + "epoch": 0.2703296703296703, + "grad_norm": 2.16614232204801, + "learning_rate": 1.7121223012173037e-05, + "loss": 1.0208, "step": 2583 }, { - "epoch": 1.7, - "learning_rate": 1.1492548556768668e-06, - "loss": 0.6039, + "epoch": 0.2704343275771847, + "grad_norm": 2.3587680319750266, + "learning_rate": 1.71188428316424e-05, + "loss": 0.9352, "step": 2584 }, { - "epoch": 1.7, - "learning_rate": 1.1442963715360444e-06, - "loss": 0.721, + "epoch": 0.2705389848246991, + "grad_norm": 2.1472264377221766, + "learning_rate": 1.711646183314229e-05, + "loss": 1.0184, "step": 2585 }, { - "epoch": 1.7, - "learning_rate": 1.139347958040361e-06, - "loss": 0.6585, + "epoch": 0.2706436420722135, + "grad_norm": 2.114763044771736, + "learning_rate": 1.711408001694628e-05, + "loss": 0.9823, "step": 2586 }, { - "epoch": 1.7, - "learning_rate": 1.1344096208171186e-06, - "loss": 0.6241, + "epoch": 0.2707482993197279, + "grad_norm": 2.241676810783072, + "learning_rate": 1.7111697383328066e-05, + "loss": 1.0956, "step": 2587 }, { - "epoch": 1.7, - "learning_rate": 1.1294813654821591e-06, - "loss": 0.877, + "epoch": 0.2708529565672423, + "grad_norm": 2.1020077851868635, + "learning_rate": 1.71093139325614e-05, + "loss": 1.0291, "step": 2588 }, { - "epoch": 1.7, - "learning_rate": 1.1245631976398608e-06, - "loss": 0.6719, + "epoch": 0.2709576138147567, + "grad_norm": 2.022324455238823, + "learning_rate": 1.710692966492015e-05, + "loss": 0.8429, "step": 2589 }, { - "epoch": 1.71, - "learning_rate": 1.1196551228831321e-06, - "loss": 0.723, + "epoch": 0.27106227106227104, + "grad_norm": 2.112019727111849, + "learning_rate": 1.7104544580678276e-05, + "loss": 1.0672, "step": 2590 }, { - "epoch": 1.71, - "learning_rate": 1.1147571467933983e-06, - "loss": 0.6761, + "epoch": 0.27116692830978545, + "grad_norm": 2.2294841470996736, + "learning_rate": 1.7102158680109827e-05, + "loss": 1.0076, "step": 2591 }, { - "epoch": 1.71, - "learning_rate": 1.1098692749406058e-06, - "loss": 0.7836, + "epoch": 0.27127158555729985, + "grad_norm": 2.0630709446164976, + "learning_rate": 1.7099771963488948e-05, + "loss": 0.9595, "step": 2592 }, { - "epoch": 1.71, - "learning_rate": 1.10499151288321e-06, - "loss": 0.6073, + "epoch": 0.27137624280481426, + "grad_norm": 2.17182249171102, + "learning_rate": 1.709738443108988e-05, + "loss": 0.9664, "step": 2593 }, { - "epoch": 1.71, - "learning_rate": 1.1001238661681657e-06, - "loss": 0.5906, + "epoch": 0.2714809000523286, + "grad_norm": 1.6976481509832229, + "learning_rate": 1.7094996083186947e-05, + "loss": 0.8269, "step": 2594 }, { - "epoch": 1.71, - "learning_rate": 1.0952663403309294e-06, - "loss": 0.8204, + "epoch": 0.271585557299843, + "grad_norm": 2.297472828435568, + "learning_rate": 1.709260692005459e-05, + "loss": 1.0357, "step": 2595 }, { - "epoch": 1.71, - "learning_rate": 1.0904189408954445e-06, - "loss": 0.7257, + "epoch": 0.2716902145473574, + "grad_norm": 2.215787382610391, + "learning_rate": 1.7090216941967314e-05, + "loss": 1.0665, "step": 2596 }, { - "epoch": 1.71, - "learning_rate": 1.085581673374142e-06, - "loss": 0.6876, + "epoch": 0.2717948717948718, + "grad_norm": 1.9350048535413022, + "learning_rate": 1.7087826149199735e-05, + "loss": 0.9186, "step": 2597 }, { - "epoch": 1.71, - "learning_rate": 1.0807545432679278e-06, - "loss": 0.6238, + "epoch": 0.2718995290423862, + "grad_norm": 2.449759441348814, + "learning_rate": 1.7085434542026568e-05, + "loss": 1.0463, "step": 2598 }, { - "epoch": 1.71, - "learning_rate": 1.0759375560661834e-06, - "loss": 0.7294, + "epoch": 0.2720041862899006, + "grad_norm": 2.045258414864514, + "learning_rate": 1.7083042120722606e-05, + "loss": 0.999, "step": 2599 }, { - "epoch": 1.71, - "learning_rate": 1.07113071724675e-06, - "loss": 0.6923, + "epoch": 0.272108843537415, + "grad_norm": 2.1280959060956333, + "learning_rate": 1.7080648885562746e-05, + "loss": 1.0704, "step": 2600 }, { - "epoch": 1.71, - "learning_rate": 1.0663340322759318e-06, - "loss": 0.5949, + "epoch": 0.27221350078492934, + "grad_norm": 2.058863219285702, + "learning_rate": 1.7078254836821978e-05, + "loss": 0.9272, "step": 2601 }, { - "epoch": 1.71, - "learning_rate": 1.061547506608488e-06, - "loss": 0.673, + "epoch": 0.27231815803244375, + "grad_norm": 2.139899360821639, + "learning_rate": 1.7075859974775377e-05, + "loss": 0.9393, "step": 2602 }, { - "epoch": 1.71, - "learning_rate": 1.0567711456876218e-06, - "loss": 0.6742, + "epoch": 0.27242281527995815, + "grad_norm": 2.4744357953899474, + "learning_rate": 1.7073464299698122e-05, + "loss": 1.1305, "step": 2603 }, { - "epoch": 1.71, - "learning_rate": 1.0520049549449796e-06, - "loss": 0.7142, + "epoch": 0.2725274725274725, + "grad_norm": 2.070308973029564, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.9733, "step": 2604 }, { - "epoch": 1.71, - "learning_rate": 1.047248939800637e-06, - "loss": 0.7914, + "epoch": 0.2726321297749869, + "grad_norm": 2.2526454703490364, + "learning_rate": 1.706867051155281e-05, + "loss": 0.9585, "step": 2605 }, { - "epoch": 1.72, - "learning_rate": 1.0425031056631007e-06, - "loss": 0.6933, + "epoch": 0.2727367870225013, + "grad_norm": 2.212572397139927, + "learning_rate": 1.7066272399035568e-05, + "loss": 0.9461, "step": 2606 }, { - "epoch": 1.72, - "learning_rate": 1.0377674579293062e-06, - "loss": 0.649, + "epoch": 0.2728414442700157, + "grad_norm": 2.263865126136771, + "learning_rate": 1.70638734745893e-05, + "loss": 1.091, "step": 2607 }, { - "epoch": 1.72, - "learning_rate": 1.0330420019845965e-06, - "loss": 0.6101, + "epoch": 0.27294610151753007, + "grad_norm": 2.1151782209613854, + "learning_rate": 1.7061473738489655e-05, + "loss": 0.8887, "step": 2608 }, { - "epoch": 1.72, - "learning_rate": 1.02832674320273e-06, - "loss": 0.7183, + "epoch": 0.2730507587650445, + "grad_norm": 2.246083764897805, + "learning_rate": 1.705907319101236e-05, + "loss": 1.1668, "step": 2609 }, { - "epoch": 1.72, - "learning_rate": 1.0236216869458626e-06, - "loss": 0.7793, + "epoch": 0.2731554160125589, + "grad_norm": 2.0910504342650262, + "learning_rate": 1.7056671832433246e-05, + "loss": 0.9767, "step": 2610 }, { - "epoch": 1.72, - "learning_rate": 1.0189268385645524e-06, - "loss": 0.6259, + "epoch": 0.27326007326007323, + "grad_norm": 1.844847039254374, + "learning_rate": 1.7054269663028232e-05, + "loss": 0.9653, "step": 2611 }, { - "epoch": 1.72, - "learning_rate": 1.0142422033977506e-06, - "loss": 0.6424, + "epoch": 0.27336473050758764, + "grad_norm": 2.324658565103168, + "learning_rate": 1.7051866683073337e-05, + "loss": 1.0032, "step": 2612 }, { - "epoch": 1.72, - "learning_rate": 1.0095677867727927e-06, - "loss": 0.6323, + "epoch": 0.27346938775510204, + "grad_norm": 2.3338728511235636, + "learning_rate": 1.7049462892844663e-05, + "loss": 1.0501, "step": 2613 }, { - "epoch": 1.72, - "learning_rate": 1.0049035940053897e-06, - "loss": 0.6899, + "epoch": 0.27357404500261645, + "grad_norm": 2.3092244039265895, + "learning_rate": 1.704705829261841e-05, + "loss": 1.0663, "step": 2614 }, { - "epoch": 1.72, - "learning_rate": 1.0002496303996312e-06, - "loss": 0.6545, + "epoch": 0.2736787022501308, + "grad_norm": 2.158121468687123, + "learning_rate": 1.7044652882670874e-05, + "loss": 0.9789, "step": 2615 }, { - "epoch": 1.72, - "learning_rate": 9.956059012479725e-07, - "loss": 0.6734, + "epoch": 0.2737833594976452, + "grad_norm": 2.4459807471934174, + "learning_rate": 1.704224666327844e-05, + "loss": 0.9862, "step": 2616 }, { - "epoch": 1.72, - "learning_rate": 9.909724118312304e-07, - "loss": 0.589, + "epoch": 0.2738880167451596, + "grad_norm": 1.97502784186903, + "learning_rate": 1.703983963471759e-05, + "loss": 0.9387, "step": 2617 }, { - "epoch": 1.72, - "learning_rate": 9.863491674185776e-07, - "loss": 0.7629, + "epoch": 0.273992673992674, + "grad_norm": 2.2367807182106834, + "learning_rate": 1.703743179726489e-05, + "loss": 0.9994, "step": 2618 }, { - "epoch": 1.72, - "learning_rate": 9.817361732675346e-07, - "loss": 0.6731, + "epoch": 0.27409733124018837, + "grad_norm": 1.8582838838815607, + "learning_rate": 1.703502315119702e-05, + "loss": 0.969, "step": 2619 }, { - "epoch": 1.72, - "learning_rate": 9.77133434623968e-07, - "loss": 0.7219, + "epoch": 0.2742019884877028, + "grad_norm": 2.021019921533219, + "learning_rate": 1.7032613696790718e-05, + "loss": 1.0694, "step": 2620 }, { - "epoch": 1.73, - "learning_rate": 9.7254095672208e-07, - "loss": 0.6475, + "epoch": 0.2743066457352172, + "grad_norm": 2.1420561465239993, + "learning_rate": 1.7030203434322852e-05, + "loss": 1.1157, "step": 2621 }, { - "epoch": 1.73, - "learning_rate": 9.679587447844052e-07, - "loss": 0.7196, + "epoch": 0.27441130298273153, + "grad_norm": 2.192997753489374, + "learning_rate": 1.7027792364070358e-05, + "loss": 1.1059, "step": 2622 }, { - "epoch": 1.73, - "learning_rate": 9.633868040218043e-07, - "loss": 0.634, + "epoch": 0.27451596023024594, + "grad_norm": 2.363367853081556, + "learning_rate": 1.7025380486310275e-05, + "loss": 1.024, "step": 2623 }, { - "epoch": 1.73, - "learning_rate": 9.588251396334524e-07, - "loss": 0.717, + "epoch": 0.27462061747776034, + "grad_norm": 2.0560880376217803, + "learning_rate": 1.702296780131973e-05, + "loss": 0.9622, "step": 2624 }, { - "epoch": 1.73, - "learning_rate": 9.542737568068472e-07, - "loss": 0.715, + "epoch": 0.27472527472527475, + "grad_norm": 2.0836175068267004, + "learning_rate": 1.7020554309375947e-05, + "loss": 0.8943, "step": 2625 }, { - "epoch": 1.73, - "learning_rate": 9.497326607177881e-07, - "loss": 0.7918, + "epoch": 0.2748299319727891, + "grad_norm": 2.1909340338926797, + "learning_rate": 1.7018140010756246e-05, + "loss": 1.0327, "step": 2626 }, { - "epoch": 1.73, - "learning_rate": 9.452018565303789e-07, - "loss": 0.524, + "epoch": 0.2749345892203035, + "grad_norm": 2.5561649979411385, + "learning_rate": 1.7015724905738025e-05, + "loss": 0.9729, "step": 2627 }, { - "epoch": 1.73, - "learning_rate": 9.40681349397019e-07, - "loss": 0.6232, + "epoch": 0.2750392464678179, + "grad_norm": 2.0793636772474717, + "learning_rate": 1.7013308994598796e-05, + "loss": 0.8657, "step": 2628 }, { - "epoch": 1.73, - "learning_rate": 9.361711444583965e-07, - "loss": 0.6308, + "epoch": 0.27514390371533226, + "grad_norm": 2.092569812266147, + "learning_rate": 1.701089227761614e-05, + "loss": 0.9508, "step": 2629 }, { - "epoch": 1.73, - "learning_rate": 9.316712468434874e-07, - "loss": 0.743, + "epoch": 0.27524856096284667, + "grad_norm": 2.0132533158168804, + "learning_rate": 1.7008474755067755e-05, + "loss": 0.938, "step": 2630 }, { - "epoch": 1.73, - "learning_rate": 9.271816616695439e-07, - "loss": 0.7237, + "epoch": 0.2753532182103611, + "grad_norm": 1.926555041783822, + "learning_rate": 1.700605642723141e-05, + "loss": 1.0228, "step": 2631 }, { - "epoch": 1.73, - "learning_rate": 9.22702394042092e-07, - "loss": 0.6994, + "epoch": 0.2754578754578755, + "grad_norm": 2.234682199384651, + "learning_rate": 1.700363729438498e-05, + "loss": 1.091, "step": 2632 }, { - "epoch": 1.73, - "learning_rate": 9.182334490549205e-07, - "loss": 0.7045, + "epoch": 0.27556253270538983, + "grad_norm": 2.424151503612507, + "learning_rate": 1.7001217356806424e-05, + "loss": 1.1339, "step": 2633 }, { - "epoch": 1.73, - "learning_rate": 9.137748317900852e-07, - "loss": 0.5457, + "epoch": 0.27566718995290423, + "grad_norm": 2.3376824460058794, + "learning_rate": 1.6998796614773802e-05, + "loss": 0.9105, "step": 2634 }, { - "epoch": 1.73, - "learning_rate": 9.093265473178958e-07, - "loss": 0.732, + "epoch": 0.27577184720041864, + "grad_norm": 2.057373304612056, + "learning_rate": 1.6996375068565264e-05, + "loss": 0.9211, "step": 2635 }, { - "epoch": 1.74, - "learning_rate": 9.048886006969093e-07, - "loss": 0.6142, + "epoch": 0.27587650444793305, + "grad_norm": 2.187999947763851, + "learning_rate": 1.6993952718459044e-05, + "loss": 1.0002, "step": 2636 }, { - "epoch": 1.74, - "learning_rate": 9.004609969739308e-07, - "loss": 0.6379, + "epoch": 0.2759811616954474, + "grad_norm": 3.049341917756041, + "learning_rate": 1.699152956473348e-05, + "loss": 1.0756, "step": 2637 }, { - "epoch": 1.74, - "learning_rate": 8.960437411839939e-07, - "loss": 0.6108, + "epoch": 0.2760858189429618, + "grad_norm": 2.014925562032507, + "learning_rate": 1.6989105607666993e-05, + "loss": 0.9493, "step": 2638 }, { - "epoch": 1.74, - "learning_rate": 8.916368383503782e-07, - "loss": 0.6138, + "epoch": 0.2761904761904762, + "grad_norm": 2.2153507871464293, + "learning_rate": 1.6986680847538107e-05, + "loss": 0.9276, "step": 2639 }, { - "epoch": 1.74, - "learning_rate": 8.872402934845814e-07, - "loss": 0.714, + "epoch": 0.27629513343799056, + "grad_norm": 2.042229591409745, + "learning_rate": 1.6984255284625425e-05, + "loss": 0.9397, "step": 2640 }, { - "epoch": 1.74, - "learning_rate": 8.828541115863253e-07, - "loss": 0.7185, + "epoch": 0.27639979068550496, + "grad_norm": 2.317973974796922, + "learning_rate": 1.6981828919207656e-05, + "loss": 0.9842, "step": 2641 }, { - "epoch": 1.74, - "learning_rate": 8.784782976435425e-07, - "loss": 0.7271, + "epoch": 0.27650444793301937, + "grad_norm": 2.0908721172521103, + "learning_rate": 1.6979401751563584e-05, + "loss": 1.0226, "step": 2642 }, { - "epoch": 1.74, - "learning_rate": 8.741128566323797e-07, - "loss": 0.7367, + "epoch": 0.2766091051805338, + "grad_norm": 2.3306273872258294, + "learning_rate": 1.697697378197211e-05, + "loss": 0.9604, "step": 2643 }, { - "epoch": 1.74, - "learning_rate": 8.697577935171864e-07, - "loss": 0.6225, + "epoch": 0.2767137624280481, + "grad_norm": 2.037113746353718, + "learning_rate": 1.69745450107122e-05, + "loss": 1.0226, "step": 2644 }, { - "epoch": 1.74, - "learning_rate": 8.654131132505106e-07, - "loss": 0.6469, + "epoch": 0.27681841967556253, + "grad_norm": 2.3954710797112524, + "learning_rate": 1.697211543806293e-05, + "loss": 0.9924, "step": 2645 }, { - "epoch": 1.74, - "learning_rate": 8.610788207730958e-07, - "loss": 0.7463, + "epoch": 0.27692307692307694, + "grad_norm": 2.2168247323986083, + "learning_rate": 1.6969685064303462e-05, + "loss": 1.0382, "step": 2646 }, { - "epoch": 1.74, - "learning_rate": 8.567549210138637e-07, - "loss": 0.5993, + "epoch": 0.2770277341705913, + "grad_norm": 2.497079920708726, + "learning_rate": 1.696725388971305e-05, + "loss": 0.9435, "step": 2647 }, { - "epoch": 1.74, - "learning_rate": 8.524414188899266e-07, - "loss": 0.6791, + "epoch": 0.2771323914181057, + "grad_norm": 2.526133193397156, + "learning_rate": 1.6964821914571046e-05, + "loss": 1.1236, "step": 2648 }, { - "epoch": 1.74, - "learning_rate": 8.481383193065684e-07, - "loss": 0.7428, + "epoch": 0.2772370486656201, + "grad_norm": 2.626567114858534, + "learning_rate": 1.6962389139156883e-05, + "loss": 1.1044, "step": 2649 }, { - "epoch": 1.74, - "learning_rate": 8.438456271572437e-07, - "loss": 0.7658, + "epoch": 0.2773417059131345, + "grad_norm": 2.2181232399143402, + "learning_rate": 1.6959955563750094e-05, + "loss": 1.0042, "step": 2650 }, { - "epoch": 1.75, - "learning_rate": 8.395633473235764e-07, - "loss": 0.7767, + "epoch": 0.27744636316064886, + "grad_norm": 2.2945097994513053, + "learning_rate": 1.69575211886303e-05, + "loss": 0.8661, "step": 2651 }, { - "epoch": 1.75, - "learning_rate": 8.352914846753413e-07, - "loss": 0.6969, + "epoch": 0.27755102040816326, + "grad_norm": 2.310307859900674, + "learning_rate": 1.6955086014077215e-05, + "loss": 1.0534, "step": 2652 }, { - "epoch": 1.75, - "learning_rate": 8.310300440704732e-07, - "loss": 0.5453, + "epoch": 0.27765567765567767, + "grad_norm": 2.069211757222885, + "learning_rate": 1.6952650040370652e-05, + "loss": 1.034, "step": 2653 }, { - "epoch": 1.75, - "learning_rate": 8.267790303550527e-07, - "loss": 0.8743, + "epoch": 0.277760334903192, + "grad_norm": 1.9076079670724744, + "learning_rate": 1.6950213267790504e-05, + "loss": 0.8245, "step": 2654 }, { - "epoch": 1.75, - "learning_rate": 8.225384483633036e-07, - "loss": 0.6619, + "epoch": 0.2778649921507064, + "grad_norm": 3.6932632423547944, + "learning_rate": 1.694777569661676e-05, + "loss": 0.8772, "step": 2655 }, { - "epoch": 1.75, - "learning_rate": 8.183083029175853e-07, - "loss": 0.541, + "epoch": 0.27796964939822083, + "grad_norm": 1.9051520913310178, + "learning_rate": 1.6945337327129504e-05, + "loss": 0.8986, "step": 2656 }, { - "epoch": 1.75, - "learning_rate": 8.140885988283898e-07, - "loss": 0.8292, + "epoch": 0.27807430664573524, + "grad_norm": 2.305637947733258, + "learning_rate": 1.694289815960891e-05, + "loss": 1.0223, "step": 2657 }, { - "epoch": 1.75, - "learning_rate": 8.098793408943361e-07, - "loss": 0.7288, + "epoch": 0.2781789638932496, + "grad_norm": 2.7028733357321615, + "learning_rate": 1.6940458194335243e-05, + "loss": 0.9702, "step": 2658 }, { - "epoch": 1.75, - "learning_rate": 8.056805339021645e-07, - "loss": 0.6667, + "epoch": 0.278283621140764, + "grad_norm": 2.390388801259551, + "learning_rate": 1.693801743158886e-05, + "loss": 0.9889, "step": 2659 }, { - "epoch": 1.75, - "learning_rate": 8.014921826267286e-07, - "loss": 0.7894, + "epoch": 0.2783882783882784, + "grad_norm": 2.2381841916149066, + "learning_rate": 1.693557587165021e-05, + "loss": 1.0082, "step": 2660 }, { - "epoch": 1.75, - "learning_rate": 7.973142918309928e-07, - "loss": 0.7047, + "epoch": 0.2784929356357928, + "grad_norm": 2.071420896938347, + "learning_rate": 1.693313351479983e-05, + "loss": 1.0009, "step": 2661 }, { - "epoch": 1.75, - "learning_rate": 7.931468662660246e-07, - "loss": 0.6641, + "epoch": 0.27859759288330715, + "grad_norm": 2.446957221763722, + "learning_rate": 1.693069036131836e-05, + "loss": 1.0082, "step": 2662 }, { - "epoch": 1.75, - "learning_rate": 7.88989910670993e-07, - "loss": 0.6732, + "epoch": 0.27870225013082156, + "grad_norm": 2.105944137294003, + "learning_rate": 1.692824641148651e-05, + "loss": 0.8971, "step": 2663 }, { - "epoch": 1.75, - "learning_rate": 7.8484342977316e-07, - "loss": 0.6306, + "epoch": 0.27880690737833597, + "grad_norm": 2.3614811300833884, + "learning_rate": 1.6925801665585103e-05, + "loss": 0.9576, "step": 2664 }, { - "epoch": 1.75, - "learning_rate": 7.807074282878723e-07, - "loss": 0.6054, + "epoch": 0.2789115646258503, + "grad_norm": 2.058948239976716, + "learning_rate": 1.692335612389505e-05, + "loss": 0.9072, "step": 2665 }, { - "epoch": 1.76, - "learning_rate": 7.765819109185635e-07, - "loss": 0.6509, + "epoch": 0.2790162218733647, + "grad_norm": 2.2051843696736153, + "learning_rate": 1.692090978669734e-05, + "loss": 0.9603, "step": 2666 }, { - "epoch": 1.76, - "learning_rate": 7.724668823567439e-07, - "loss": 0.6439, + "epoch": 0.27912087912087913, + "grad_norm": 2.2044668414956456, + "learning_rate": 1.6918462654273063e-05, + "loss": 1.0132, "step": 2667 }, { - "epoch": 1.76, - "learning_rate": 7.683623472819956e-07, - "loss": 0.767, + "epoch": 0.27922553636839353, + "grad_norm": 2.388456279940295, + "learning_rate": 1.6916014726903408e-05, + "loss": 0.9757, "step": 2668 }, { - "epoch": 1.76, - "learning_rate": 7.642683103619686e-07, - "loss": 0.7687, + "epoch": 0.2793301936159079, + "grad_norm": 2.1759554884870957, + "learning_rate": 1.6913566004869637e-05, + "loss": 1.0602, "step": 2669 }, { - "epoch": 1.76, - "learning_rate": 7.601847762523718e-07, - "loss": 0.6927, + "epoch": 0.2794348508634223, + "grad_norm": 2.4236744464592825, + "learning_rate": 1.6911116488453118e-05, + "loss": 1.096, "step": 2670 }, { - "epoch": 1.76, - "learning_rate": 7.561117495969717e-07, - "loss": 0.671, + "epoch": 0.2795395081109367, + "grad_norm": 2.354246032793071, + "learning_rate": 1.69086661779353e-05, + "loss": 0.8912, "step": 2671 }, { - "epoch": 1.76, - "learning_rate": 7.520492350275876e-07, - "loss": 0.6598, + "epoch": 0.27964416535845105, + "grad_norm": 1.9463786670522631, + "learning_rate": 1.6906215073597736e-05, + "loss": 1.0506, "step": 2672 }, { - "epoch": 1.76, - "learning_rate": 7.479972371640831e-07, - "loss": 0.6614, + "epoch": 0.27974882260596545, + "grad_norm": 2.3854108573613213, + "learning_rate": 1.690376317572206e-05, + "loss": 1.0731, "step": 2673 }, { - "epoch": 1.76, - "learning_rate": 7.439557606143633e-07, - "loss": 0.716, + "epoch": 0.27985347985347986, + "grad_norm": 2.287680313241345, + "learning_rate": 1.690131048459e-05, + "loss": 1.0453, "step": 2674 }, { - "epoch": 1.76, - "learning_rate": 7.39924809974365e-07, - "loss": 0.6041, + "epoch": 0.27995813710099426, + "grad_norm": 2.23259788225129, + "learning_rate": 1.6898857000483375e-05, + "loss": 1.039, "step": 2675 }, { - "epoch": 1.76, - "learning_rate": 7.359043898280571e-07, - "loss": 0.6687, + "epoch": 0.2800627943485086, + "grad_norm": 2.4640669787362626, + "learning_rate": 1.6896402723684095e-05, + "loss": 1.035, "step": 2676 }, { - "epoch": 1.76, - "learning_rate": 7.318945047474357e-07, - "loss": 0.7462, + "epoch": 0.280167451596023, + "grad_norm": 2.226817030735215, + "learning_rate": 1.689394765447416e-05, + "loss": 1.0474, "step": 2677 }, { - "epoch": 1.76, - "learning_rate": 7.278951592925154e-07, - "loss": 0.6986, + "epoch": 0.2802721088435374, + "grad_norm": 1.9193868173180237, + "learning_rate": 1.6891491793135663e-05, + "loss": 1.0085, "step": 2678 }, { - "epoch": 1.76, - "learning_rate": 7.239063580113203e-07, - "loss": 0.5393, + "epoch": 0.28037676609105183, + "grad_norm": 2.1659873373450838, + "learning_rate": 1.6889035139950795e-05, + "loss": 0.9146, "step": 2679 }, { - "epoch": 1.76, - "learning_rate": 7.19928105439891e-07, - "loss": 0.6693, + "epoch": 0.2804814233385662, + "grad_norm": 2.57303138488225, + "learning_rate": 1.6886577695201816e-05, + "loss": 1.1251, "step": 2680 }, { - "epoch": 1.76, - "learning_rate": 7.159604061022663e-07, - "loss": 0.6553, + "epoch": 0.2805860805860806, + "grad_norm": 2.042240644425295, + "learning_rate": 1.6884119459171104e-05, + "loss": 0.9829, "step": 2681 }, { - "epoch": 1.77, - "learning_rate": 7.120032645104891e-07, - "loss": 0.6441, + "epoch": 0.280690737833595, + "grad_norm": 1.931077914764023, + "learning_rate": 1.688166043214111e-05, + "loss": 1.0262, "step": 2682 }, { - "epoch": 1.77, - "learning_rate": 7.080566851645954e-07, - "loss": 0.6279, + "epoch": 0.28079539508110934, + "grad_norm": 2.6511446782137083, + "learning_rate": 1.687920061439438e-05, + "loss": 1.0604, "step": 2683 }, { - "epoch": 1.77, - "learning_rate": 7.041206725526029e-07, - "loss": 0.6176, + "epoch": 0.28090005232862375, + "grad_norm": 2.3009554948835698, + "learning_rate": 1.6876740006213556e-05, + "loss": 0.8862, "step": 2684 }, { - "epoch": 1.77, - "learning_rate": 7.0019523115052e-07, - "loss": 0.7056, + "epoch": 0.28100470957613816, + "grad_norm": 2.4972612313324274, + "learning_rate": 1.6874278607881362e-05, + "loss": 0.9708, "step": 2685 }, { - "epoch": 1.77, - "learning_rate": 6.962803654223327e-07, - "loss": 0.7062, + "epoch": 0.28110936682365256, + "grad_norm": 2.156106891119093, + "learning_rate": 1.687181641968062e-05, + "loss": 0.9762, "step": 2686 }, { - "epoch": 1.77, - "learning_rate": 6.923760798199986e-07, - "loss": 0.6981, + "epoch": 0.2812140240711669, + "grad_norm": 2.211635752927475, + "learning_rate": 1.6869353441894245e-05, + "loss": 1.0014, "step": 2687 }, { - "epoch": 1.77, - "learning_rate": 6.884823787834471e-07, - "loss": 0.7004, + "epoch": 0.2813186813186813, + "grad_norm": 2.4317542466014284, + "learning_rate": 1.6866889674805233e-05, + "loss": 0.9357, "step": 2688 }, { - "epoch": 1.77, - "learning_rate": 6.845992667405621e-07, - "loss": 0.5681, + "epoch": 0.2814233385661957, + "grad_norm": 2.4931495962019987, + "learning_rate": 1.686442511869667e-05, + "loss": 0.9232, "step": 2689 }, { - "epoch": 1.77, - "learning_rate": 6.807267481071966e-07, - "loss": 0.724, + "epoch": 0.2815279958137101, + "grad_norm": 2.1696324074929394, + "learning_rate": 1.6861959773851754e-05, + "loss": 0.8919, "step": 2690 }, { - "epoch": 1.77, - "learning_rate": 6.768648272871503e-07, - "loss": 0.6296, + "epoch": 0.2816326530612245, + "grad_norm": 2.168218378578173, + "learning_rate": 1.685949364055375e-05, + "loss": 1.1053, "step": 2691 }, { - "epoch": 1.77, - "learning_rate": 6.730135086721746e-07, - "loss": 0.7343, + "epoch": 0.2817373103087389, + "grad_norm": 2.334913438897909, + "learning_rate": 1.6857026719086014e-05, + "loss": 1.0497, "step": 2692 }, { - "epoch": 1.77, - "learning_rate": 6.691727966419603e-07, - "loss": 0.564, + "epoch": 0.2818419675562533, + "grad_norm": 2.307342605366626, + "learning_rate": 1.6854559009732006e-05, + "loss": 1.0982, "step": 2693 }, { - "epoch": 1.77, - "learning_rate": 6.653426955641396e-07, - "loss": 0.6441, + "epoch": 0.28194662480376764, + "grad_norm": 2.11751636176187, + "learning_rate": 1.685209051277528e-05, + "loss": 0.9288, "step": 2694 }, { - "epoch": 1.77, - "learning_rate": 6.615232097942769e-07, - "loss": 0.6008, + "epoch": 0.28205128205128205, + "grad_norm": 1.702869333805771, + "learning_rate": 1.684962122849946e-05, + "loss": 0.8285, "step": 2695 }, { - "epoch": 1.77, - "learning_rate": 6.577143436758659e-07, - "loss": 0.5842, + "epoch": 0.28215593929879645, + "grad_norm": 2.5478588197388983, + "learning_rate": 1.6847151157188274e-05, + "loss": 1.1483, "step": 2696 }, { - "epoch": 1.78, - "learning_rate": 6.53916101540324e-07, - "loss": 0.7402, + "epoch": 0.2822605965463108, + "grad_norm": 1.6423189951148351, + "learning_rate": 1.6844680299125542e-05, + "loss": 0.8226, "step": 2697 }, { - "epoch": 1.78, - "learning_rate": 6.501284877069824e-07, - "loss": 0.6289, + "epoch": 0.2823652537938252, + "grad_norm": 2.7051398620041796, + "learning_rate": 1.6842208654595164e-05, + "loss": 1.1181, "step": 2698 }, { - "epoch": 1.78, - "learning_rate": 6.463515064830916e-07, - "loss": 0.6598, + "epoch": 0.2824699110413396, + "grad_norm": 2.328813950042279, + "learning_rate": 1.6839736223881144e-05, + "loss": 1.0891, "step": 2699 }, { - "epoch": 1.78, - "learning_rate": 6.42585162163808e-07, - "loss": 0.5905, + "epoch": 0.282574568288854, + "grad_norm": 2.2355207375652766, + "learning_rate": 1.6837263007267567e-05, + "loss": 0.8753, "step": 2700 }, { - "epoch": 1.78, - "learning_rate": 6.38829459032193e-07, - "loss": 0.7595, + "epoch": 0.2826792255363684, + "grad_norm": 2.395946624547171, + "learning_rate": 1.683478900503861e-05, + "loss": 1.0681, "step": 2701 }, { - "epoch": 1.78, - "learning_rate": 6.350844013592062e-07, - "loss": 0.715, + "epoch": 0.2827838827838828, + "grad_norm": 1.8895809027956487, + "learning_rate": 1.6832314217478538e-05, + "loss": 0.966, "step": 2702 }, { - "epoch": 1.78, - "learning_rate": 6.313499934037015e-07, - "loss": 0.7258, + "epoch": 0.2828885400313972, + "grad_norm": 2.108842553403623, + "learning_rate": 1.6829838644871716e-05, + "loss": 0.9917, "step": 2703 }, { - "epoch": 1.78, - "learning_rate": 6.276262394124222e-07, - "loss": 0.5514, + "epoch": 0.2829931972789116, + "grad_norm": 1.9546264051078577, + "learning_rate": 1.6827362287502583e-05, + "loss": 0.9654, "step": 2704 }, { - "epoch": 1.78, - "learning_rate": 6.239131436199941e-07, - "loss": 0.6828, + "epoch": 0.28309785452642594, + "grad_norm": 2.1750684526478095, + "learning_rate": 1.6824885145655685e-05, + "loss": 0.9892, "step": 2705 }, { - "epoch": 1.78, - "learning_rate": 6.202107102489274e-07, - "loss": 0.6106, + "epoch": 0.28320251177394035, + "grad_norm": 2.2396694828655472, + "learning_rate": 1.6822407219615646e-05, + "loss": 0.8008, "step": 2706 }, { - "epoch": 1.78, - "learning_rate": 6.165189435095997e-07, - "loss": 0.6825, + "epoch": 0.28330716902145475, + "grad_norm": 2.0811467507416777, + "learning_rate": 1.6819928509667193e-05, + "loss": 1.0043, "step": 2707 }, { - "epoch": 1.78, - "learning_rate": 6.12837847600265e-07, - "loss": 0.7687, + "epoch": 0.2834118262689691, + "grad_norm": 2.156062381671543, + "learning_rate": 1.6817449016095124e-05, + "loss": 0.8708, "step": 2708 }, { - "epoch": 1.78, - "learning_rate": 6.091674267070391e-07, - "loss": 0.6662, + "epoch": 0.2835164835164835, + "grad_norm": 2.2971033102764524, + "learning_rate": 1.681496873918434e-05, + "loss": 0.9787, "step": 2709 }, { - "epoch": 1.78, - "learning_rate": 6.05507685003901e-07, - "loss": 0.8031, + "epoch": 0.2836211407639979, + "grad_norm": 1.8028079138511994, + "learning_rate": 1.681248767921984e-05, + "loss": 0.7918, "step": 2710 }, { - "epoch": 1.78, - "learning_rate": 6.018586266526838e-07, - "loss": 0.6603, + "epoch": 0.2837257980115123, + "grad_norm": 1.9797359746272927, + "learning_rate": 1.6810005836486693e-05, + "loss": 0.9433, "step": 2711 }, { - "epoch": 1.79, - "learning_rate": 5.982202558030692e-07, - "loss": 0.7243, + "epoch": 0.28383045525902667, + "grad_norm": 2.1233134417416246, + "learning_rate": 1.6807523211270065e-05, + "loss": 1.1024, "step": 2712 }, { - "epoch": 1.79, - "learning_rate": 5.945925765925875e-07, - "loss": 0.5889, + "epoch": 0.2839351125065411, + "grad_norm": 2.0119377968605217, + "learning_rate": 1.6805039803855225e-05, + "loss": 1.0896, "step": 2713 }, { - "epoch": 1.79, - "learning_rate": 5.909755931466121e-07, - "loss": 0.5792, + "epoch": 0.2840397697540555, + "grad_norm": 2.1762408296082083, + "learning_rate": 1.6802555614527513e-05, + "loss": 1.1555, "step": 2714 }, { - "epoch": 1.79, - "learning_rate": 5.873693095783529e-07, - "loss": 0.706, + "epoch": 0.28414442700156983, + "grad_norm": 2.0014014481336195, + "learning_rate": 1.6800070643572374e-05, + "loss": 1.0324, "step": 2715 }, { - "epoch": 1.79, - "learning_rate": 5.837737299888502e-07, - "loss": 0.6878, + "epoch": 0.28424908424908424, + "grad_norm": 2.2844238601393676, + "learning_rate": 1.679758489127533e-05, + "loss": 1.0019, "step": 2716 }, { - "epoch": 1.79, - "learning_rate": 5.80188858466969e-07, - "loss": 0.761, + "epoch": 0.28435374149659864, + "grad_norm": 2.107100861748374, + "learning_rate": 1.6795098357922004e-05, + "loss": 0.8909, "step": 2717 }, { - "epoch": 1.79, - "learning_rate": 5.766146990894039e-07, - "loss": 0.6688, + "epoch": 0.28445839874411305, + "grad_norm": 2.309820312590304, + "learning_rate": 1.67926110437981e-05, + "loss": 1.0038, "step": 2718 }, { - "epoch": 1.79, - "learning_rate": 5.730512559206625e-07, - "loss": 0.6699, + "epoch": 0.2845630559916274, + "grad_norm": 2.9601375842932005, + "learning_rate": 1.679012294918942e-05, + "loss": 1.0204, "step": 2719 }, { - "epoch": 1.79, - "learning_rate": 5.694985330130698e-07, - "loss": 0.6538, + "epoch": 0.2846677132391418, + "grad_norm": 2.301335233810261, + "learning_rate": 1.6787634074381844e-05, + "loss": 1.1004, "step": 2720 }, { - "epoch": 1.79, - "learning_rate": 5.659565344067552e-07, - "loss": 0.7084, + "epoch": 0.2847723704866562, + "grad_norm": 2.2912609106687367, + "learning_rate": 1.6785144419661356e-05, + "loss": 0.8342, "step": 2721 }, { - "epoch": 1.79, - "learning_rate": 5.624252641296568e-07, - "loss": 0.6618, + "epoch": 0.2848770277341706, + "grad_norm": 2.2352473794305903, + "learning_rate": 1.678265398531402e-05, + "loss": 0.9728, "step": 2722 }, { - "epoch": 1.79, - "learning_rate": 5.5890472619751e-07, - "loss": 0.6645, + "epoch": 0.28498168498168497, + "grad_norm": 2.378174318133523, + "learning_rate": 1.6780162771625987e-05, + "loss": 1.0774, "step": 2723 }, { - "epoch": 1.79, - "learning_rate": 5.553949246138468e-07, - "loss": 0.6506, + "epoch": 0.2850863422291994, + "grad_norm": 2.18094544554846, + "learning_rate": 1.6777670778883507e-05, + "loss": 1.049, "step": 2724 }, { - "epoch": 1.79, - "learning_rate": 5.5189586336999e-07, - "loss": 0.6687, + "epoch": 0.2851909994767138, + "grad_norm": 2.4504749993683723, + "learning_rate": 1.6775178007372915e-05, + "loss": 0.8661, "step": 2725 }, { - "epoch": 1.79, - "learning_rate": 5.484075464450456e-07, - "loss": 0.6963, + "epoch": 0.28529565672422813, + "grad_norm": 2.282848626968931, + "learning_rate": 1.677268445738064e-05, + "loss": 1.0587, "step": 2726 }, { - "epoch": 1.8, - "learning_rate": 5.449299778059069e-07, - "loss": 0.6859, + "epoch": 0.28540031397174254, + "grad_norm": 2.2046921458707653, + "learning_rate": 1.677019012919319e-05, + "loss": 0.8776, "step": 2727 }, { - "epoch": 1.8, - "learning_rate": 5.414631614072385e-07, - "loss": 0.6964, + "epoch": 0.28550497121925694, + "grad_norm": 1.9336248225640682, + "learning_rate": 1.6767695023097164e-05, + "loss": 0.9612, "step": 2728 }, { - "epoch": 1.8, - "learning_rate": 5.380071011914823e-07, - "loss": 0.6917, + "epoch": 0.28560962846677135, + "grad_norm": 2.1614112374322523, + "learning_rate": 1.6765199139379265e-05, + "loss": 1.0527, "step": 2729 }, { - "epoch": 1.8, - "learning_rate": 5.345618010888431e-07, - "loss": 0.6154, + "epoch": 0.2857142857142857, + "grad_norm": 2.026874347225188, + "learning_rate": 1.676270247832627e-05, + "loss": 0.9901, "step": 2730 }, { - "epoch": 1.8, - "learning_rate": 5.311272650172949e-07, - "loss": 0.7428, + "epoch": 0.2858189429618001, + "grad_norm": 2.5279801213743056, + "learning_rate": 1.6760205040225052e-05, + "loss": 1.0751, "step": 2731 }, { - "epoch": 1.8, - "learning_rate": 5.277034968825667e-07, - "loss": 0.6999, + "epoch": 0.2859236002093145, + "grad_norm": 2.408066662111667, + "learning_rate": 1.6757706825362565e-05, + "loss": 1.0041, "step": 2732 }, { - "epoch": 1.8, - "learning_rate": 5.242905005781451e-07, - "loss": 0.6909, + "epoch": 0.28602825745682886, + "grad_norm": 2.2561275528825693, + "learning_rate": 1.675520783402587e-05, + "loss": 1.0136, "step": 2733 }, { - "epoch": 1.8, - "learning_rate": 5.208882799852666e-07, - "loss": 0.7756, + "epoch": 0.28613291470434327, + "grad_norm": 2.079863026171039, + "learning_rate": 1.6752708066502097e-05, + "loss": 1.0614, "step": 2734 }, { - "epoch": 1.8, - "learning_rate": 5.174968389729118e-07, - "loss": 0.6534, + "epoch": 0.28623757195185767, + "grad_norm": 2.081290751507961, + "learning_rate": 1.675020752307848e-05, + "loss": 1.0573, "step": 2735 }, { - "epoch": 1.8, - "learning_rate": 5.141161813978046e-07, - "loss": 0.654, + "epoch": 0.2863422291993721, + "grad_norm": 2.030844176643617, + "learning_rate": 1.6747706204042335e-05, + "loss": 1.1131, "step": 2736 }, { - "epoch": 1.8, - "learning_rate": 5.107463111044053e-07, - "loss": 0.6402, + "epoch": 0.28644688644688643, + "grad_norm": 2.1050959828240936, + "learning_rate": 1.6745204109681064e-05, + "loss": 0.9127, "step": 2737 }, { - "epoch": 1.8, - "learning_rate": 5.073872319249073e-07, - "loss": 0.6297, + "epoch": 0.28655154369440083, + "grad_norm": 2.055509983409726, + "learning_rate": 1.6742701240282174e-05, + "loss": 0.9553, "step": 2738 }, { - "epoch": 1.8, - "learning_rate": 5.04038947679234e-07, - "loss": 0.7351, + "epoch": 0.28665620094191524, + "grad_norm": 2.137470369141178, + "learning_rate": 1.6740197596133238e-05, + "loss": 1.0597, "step": 2739 }, { - "epoch": 1.8, - "learning_rate": 5.007014621750272e-07, - "loss": 0.6511, + "epoch": 0.2867608581894296, + "grad_norm": 2.048361552464316, + "learning_rate": 1.6737693177521936e-05, + "loss": 0.8446, "step": 2740 }, { - "epoch": 1.8, - "learning_rate": 4.973747792076555e-07, - "loss": 0.579, + "epoch": 0.286865515436944, + "grad_norm": 2.12112325524762, + "learning_rate": 1.673518798473603e-05, + "loss": 1.0618, "step": 2741 }, { - "epoch": 1.81, - "learning_rate": 4.940589025601983e-07, - "loss": 0.7071, + "epoch": 0.2869701726844584, + "grad_norm": 2.6242506865312363, + "learning_rate": 1.6732682018063368e-05, + "loss": 0.8252, "step": 2742 }, { - "epoch": 1.81, - "learning_rate": 4.907538360034502e-07, - "loss": 0.6111, + "epoch": 0.2870748299319728, + "grad_norm": 3.0996650803222674, + "learning_rate": 1.6730175277791895e-05, + "loss": 1.0003, "step": 2743 }, { - "epoch": 1.81, - "learning_rate": 4.874595832959061e-07, - "loss": 0.6774, + "epoch": 0.28717948717948716, + "grad_norm": 2.350027250748929, + "learning_rate": 1.6727667764209638e-05, + "loss": 0.9309, "step": 2744 }, { - "epoch": 1.81, - "learning_rate": 4.841761481837703e-07, - "loss": 0.7553, + "epoch": 0.28728414442700156, + "grad_norm": 2.0077156617023855, + "learning_rate": 1.6725159477604716e-05, + "loss": 1.0252, "step": 2745 }, { - "epoch": 1.81, - "learning_rate": 4.809035344009416e-07, - "loss": 0.6495, + "epoch": 0.28738880167451597, + "grad_norm": 2.635741023964438, + "learning_rate": 1.672265041826534e-05, + "loss": 0.8513, "step": 2746 }, { - "epoch": 1.81, - "learning_rate": 4.776417456690152e-07, - "loss": 0.7881, + "epoch": 0.2874934589220304, + "grad_norm": 2.2433336022783172, + "learning_rate": 1.67201405864798e-05, + "loss": 1.0492, "step": 2747 }, { - "epoch": 1.81, - "learning_rate": 4.74390785697274e-07, - "loss": 0.6295, + "epoch": 0.2875981161695447, + "grad_norm": 2.0807771153561547, + "learning_rate": 1.6717629982536484e-05, + "loss": 0.9764, "step": 2748 }, { - "epoch": 1.81, - "learning_rate": 4.7115065818268636e-07, - "loss": 0.7305, + "epoch": 0.28770277341705913, + "grad_norm": 2.3123250176131807, + "learning_rate": 1.6715118606723867e-05, + "loss": 0.9599, "step": 2749 }, { - "epoch": 1.81, - "learning_rate": 4.6792136680990363e-07, - "loss": 0.7171, + "epoch": 0.28780743066457354, + "grad_norm": 2.0101181547077998, + "learning_rate": 1.6712606459330503e-05, + "loss": 0.8779, "step": 2750 }, { - "epoch": 1.81, - "learning_rate": 4.6470291525125386e-07, - "loss": 0.6919, + "epoch": 0.2879120879120879, + "grad_norm": 2.0975900598381467, + "learning_rate": 1.6710093540645056e-05, + "loss": 0.9257, "step": 2751 }, { - "epoch": 1.81, - "learning_rate": 4.614953071667383e-07, - "loss": 0.7825, + "epoch": 0.2880167451596023, + "grad_norm": 1.958047612574768, + "learning_rate": 1.6707579850956256e-05, + "loss": 0.8114, "step": 2752 }, { - "epoch": 1.81, - "learning_rate": 4.582985462040268e-07, - "loss": 0.7472, + "epoch": 0.2881214024071167, + "grad_norm": 2.1295560514183425, + "learning_rate": 1.6705065390552934e-05, + "loss": 1.0102, "step": 2753 }, { - "epoch": 1.81, - "learning_rate": 4.5511263599845145e-07, - "loss": 0.6995, + "epoch": 0.2882260596546311, + "grad_norm": 2.3322838337100795, + "learning_rate": 1.6702550159724005e-05, + "loss": 1.0251, "step": 2754 }, { - "epoch": 1.81, - "learning_rate": 4.5193758017301084e-07, - "loss": 0.6419, + "epoch": 0.28833071690214546, + "grad_norm": 2.0269061447136276, + "learning_rate": 1.6700034158758476e-05, + "loss": 0.9773, "step": 2755 }, { - "epoch": 1.81, - "learning_rate": 4.4877338233835223e-07, - "loss": 0.6327, + "epoch": 0.28843537414965986, + "grad_norm": 2.2135086571849927, + "learning_rate": 1.6697517387945437e-05, + "loss": 1.0397, "step": 2756 }, { - "epoch": 1.82, - "learning_rate": 4.456200460927862e-07, - "loss": 0.6755, + "epoch": 0.28854003139717427, + "grad_norm": 3.020623424201838, + "learning_rate": 1.669499984757408e-05, + "loss": 1.1779, "step": 2757 }, { - "epoch": 1.82, - "learning_rate": 4.4247757502225983e-07, - "loss": 0.6664, + "epoch": 0.2886446886446886, + "grad_norm": 2.3913345388059675, + "learning_rate": 1.669248153793366e-05, + "loss": 0.9306, "step": 2758 }, { - "epoch": 1.82, - "learning_rate": 4.3934597270037017e-07, - "loss": 0.6987, + "epoch": 0.288749345892203, + "grad_norm": 2.0642701710655196, + "learning_rate": 1.6689962459313547e-05, + "loss": 1.0335, "step": 2759 }, { - "epoch": 1.82, - "learning_rate": 4.3622524268835507e-07, - "loss": 0.6946, + "epoch": 0.28885400313971743, + "grad_norm": 2.084776755214591, + "learning_rate": 1.6687442612003185e-05, + "loss": 0.8789, "step": 2760 }, { - "epoch": 1.82, - "learning_rate": 4.3311538853508697e-07, - "loss": 0.7739, + "epoch": 0.28895866038723184, + "grad_norm": 2.1559657822579616, + "learning_rate": 1.668492199629211e-05, + "loss": 0.873, "step": 2761 }, { - "epoch": 1.82, - "learning_rate": 4.3001641377707125e-07, - "loss": 0.6093, + "epoch": 0.2890633176347462, + "grad_norm": 2.1968258921987043, + "learning_rate": 1.668240061246995e-05, + "loss": 1.0585, "step": 2762 }, { - "epoch": 1.82, - "learning_rate": 4.2692832193843794e-07, - "loss": 0.6907, + "epoch": 0.2891679748822606, + "grad_norm": 2.037768963343927, + "learning_rate": 1.6679878460826407e-05, + "loss": 1.0065, "step": 2763 }, { - "epoch": 1.82, - "learning_rate": 4.2385111653094667e-07, - "loss": 0.7846, + "epoch": 0.289272632129775, + "grad_norm": 2.311144324625343, + "learning_rate": 1.6677355541651292e-05, + "loss": 1.0024, "step": 2764 }, { - "epoch": 1.82, - "learning_rate": 4.207848010539717e-07, - "loss": 0.8014, + "epoch": 0.2893772893772894, + "grad_norm": 2.5667946832040536, + "learning_rate": 1.6674831855234486e-05, + "loss": 0.9484, "step": 2765 }, { - "epoch": 1.82, - "learning_rate": 4.1772937899450804e-07, - "loss": 0.6396, + "epoch": 0.28948194662480375, + "grad_norm": 2.434219899597603, + "learning_rate": 1.667230740186597e-05, + "loss": 1.0179, "step": 2766 }, { - "epoch": 1.82, - "learning_rate": 4.1468485382716085e-07, - "loss": 0.6481, + "epoch": 0.28958660387231816, + "grad_norm": 2.7601269464079676, + "learning_rate": 1.6669782181835807e-05, + "loss": 1.0228, "step": 2767 }, { - "epoch": 1.82, - "learning_rate": 4.1165122901414055e-07, - "loss": 0.742, + "epoch": 0.28969126111983257, + "grad_norm": 2.5403438469183697, + "learning_rate": 1.666725619543415e-05, + "loss": 1.047, "step": 2768 }, { - "epoch": 1.82, - "learning_rate": 4.0862850800526655e-07, - "loss": 0.7864, + "epoch": 0.2897959183673469, + "grad_norm": 2.051984141112977, + "learning_rate": 1.666472944295124e-05, + "loss": 0.9447, "step": 2769 }, { - "epoch": 1.82, - "learning_rate": 4.056166942379558e-07, - "loss": 0.7122, + "epoch": 0.2899005756148613, + "grad_norm": 2.512139628224453, + "learning_rate": 1.666220192467741e-05, + "loss": 1.1367, "step": 2770 }, { - "epoch": 1.82, - "learning_rate": 4.026157911372242e-07, - "loss": 0.7419, + "epoch": 0.29000523286237573, + "grad_norm": 2.718503682041811, + "learning_rate": 1.6659673640903067e-05, + "loss": 1.0922, "step": 2771 }, { - "epoch": 1.82, - "learning_rate": 3.9962580211567405e-07, - "loss": 0.7055, + "epoch": 0.29010989010989013, + "grad_norm": 2.5150069168743885, + "learning_rate": 1.6657144591918726e-05, + "loss": 1.1432, "step": 2772 }, { - "epoch": 1.83, - "learning_rate": 3.9664673057350223e-07, - "loss": 0.7459, + "epoch": 0.2902145473574045, + "grad_norm": 2.3080940442950304, + "learning_rate": 1.665461477801497e-05, + "loss": 1.1565, "step": 2773 }, { - "epoch": 1.83, - "learning_rate": 3.9367857989848765e-07, - "loss": 0.6831, + "epoch": 0.2903192046049189, + "grad_norm": 3.162428243155795, + "learning_rate": 1.6652084199482496e-05, + "loss": 1.1392, "step": 2774 }, { - "epoch": 1.83, - "learning_rate": 3.907213534659926e-07, - "loss": 0.6474, + "epoch": 0.2904238618524333, + "grad_norm": 2.460964843259728, + "learning_rate": 1.6649552856612056e-05, + "loss": 1.0668, "step": 2775 }, { - "epoch": 1.83, - "learning_rate": 3.877750546389536e-07, - "loss": 0.7284, + "epoch": 0.29052851909994765, + "grad_norm": 2.2837818852824747, + "learning_rate": 1.6647020749694513e-05, + "loss": 1.0405, "step": 2776 }, { - "epoch": 1.83, - "learning_rate": 3.8483968676788055e-07, - "loss": 0.6127, + "epoch": 0.29063317634746205, + "grad_norm": 2.3757375317221348, + "learning_rate": 1.6644487879020812e-05, + "loss": 0.988, "step": 2777 }, { - "epoch": 1.83, - "learning_rate": 3.819152531908543e-07, - "loss": 0.726, + "epoch": 0.29073783359497646, + "grad_norm": 2.0992714934923065, + "learning_rate": 1.6641954244881984e-05, + "loss": 1.0678, "step": 2778 }, { - "epoch": 1.83, - "learning_rate": 3.790017572335214e-07, - "loss": 0.6844, + "epoch": 0.29084249084249086, + "grad_norm": 2.4400916221420963, + "learning_rate": 1.6639419847569147e-05, + "loss": 1.0578, "step": 2779 }, { - "epoch": 1.83, - "learning_rate": 3.7609920220908813e-07, - "loss": 0.5946, + "epoch": 0.2909471480900052, + "grad_norm": 2.1469546303983043, + "learning_rate": 1.6636884687373508e-05, + "loss": 0.9257, "step": 2780 }, { - "epoch": 1.83, - "learning_rate": 3.73207591418322e-07, - "loss": 0.6261, + "epoch": 0.2910518053375196, + "grad_norm": 2.334369619059665, + "learning_rate": 1.663434876458637e-05, + "loss": 0.9772, "step": 2781 }, { - "epoch": 1.83, - "learning_rate": 3.703269281495392e-07, - "loss": 0.8017, + "epoch": 0.291156462585034, + "grad_norm": 2.3345181976583653, + "learning_rate": 1.6631812079499106e-05, + "loss": 1.0597, "step": 2782 }, { - "epoch": 1.83, - "learning_rate": 3.674572156786127e-07, - "loss": 0.6984, + "epoch": 0.2912611198325484, + "grad_norm": 2.0700463369958815, + "learning_rate": 1.6629274632403193e-05, + "loss": 1.0176, "step": 2783 }, { - "epoch": 1.83, - "learning_rate": 3.6459845726895984e-07, - "loss": 0.6563, + "epoch": 0.2913657770800628, + "grad_norm": 2.202087466914214, + "learning_rate": 1.6626736423590186e-05, + "loss": 1.0216, "step": 2784 }, { - "epoch": 1.83, - "learning_rate": 3.6175065617154113e-07, - "loss": 0.6191, + "epoch": 0.2914704343275772, + "grad_norm": 2.0357122851821883, + "learning_rate": 1.6624197453351723e-05, + "loss": 1.0036, "step": 2785 }, { - "epoch": 1.83, - "learning_rate": 3.5891381562485504e-07, - "loss": 0.7208, + "epoch": 0.2915750915750916, + "grad_norm": 2.6405418320254457, + "learning_rate": 1.662165772197955e-05, + "loss": 1.0112, "step": 2786 }, { - "epoch": 1.83, - "learning_rate": 3.5608793885493766e-07, - "loss": 0.6748, + "epoch": 0.29167974882260594, + "grad_norm": 2.1803315886051453, + "learning_rate": 1.661911722976548e-05, + "loss": 0.8626, "step": 2787 }, { - "epoch": 1.84, - "learning_rate": 3.53273029075355e-07, - "loss": 0.5896, + "epoch": 0.29178440607012035, + "grad_norm": 1.9949267891582323, + "learning_rate": 1.6616575977001423e-05, + "loss": 0.8258, "step": 2788 }, { - "epoch": 1.84, - "learning_rate": 3.504690894872065e-07, - "loss": 0.6115, + "epoch": 0.29188906331763476, + "grad_norm": 2.1296048269861965, + "learning_rate": 1.661403396397937e-05, + "loss": 0.9722, "step": 2789 }, { - "epoch": 1.84, - "learning_rate": 3.476761232791115e-07, - "loss": 0.84, + "epoch": 0.29199372056514916, + "grad_norm": 1.9633811576783402, + "learning_rate": 1.661149119099141e-05, + "loss": 0.9531, "step": 2790 }, { - "epoch": 1.84, - "learning_rate": 3.4489413362721156e-07, - "loss": 0.6417, + "epoch": 0.2920983778126635, + "grad_norm": 2.4559017455704146, + "learning_rate": 1.6608947658329705e-05, + "loss": 0.9185, "step": 2791 }, { - "epoch": 1.84, - "learning_rate": 3.4212312369516496e-07, - "loss": 0.6939, + "epoch": 0.2922030350601779, + "grad_norm": 2.6490541995498593, + "learning_rate": 1.6606403366286522e-05, + "loss": 1.0208, "step": 2792 }, { - "epoch": 1.84, - "learning_rate": 3.393630966341477e-07, - "loss": 0.6228, + "epoch": 0.2923076923076923, + "grad_norm": 2.230218351877687, + "learning_rate": 1.6603858315154194e-05, + "loss": 0.9397, "step": 2793 }, { - "epoch": 1.84, - "learning_rate": 3.3661405558284124e-07, - "loss": 0.6105, + "epoch": 0.2924123495552067, + "grad_norm": 2.386055723778366, + "learning_rate": 1.660131250522516e-05, + "loss": 0.8659, "step": 2794 }, { - "epoch": 1.84, - "learning_rate": 3.338760036674382e-07, - "loss": 0.6057, + "epoch": 0.2925170068027211, + "grad_norm": 2.223350214103197, + "learning_rate": 1.659876593679194e-05, + "loss": 1.0085, "step": 2795 }, { - "epoch": 1.84, - "learning_rate": 3.311489440016302e-07, - "loss": 0.541, + "epoch": 0.2926216640502355, + "grad_norm": 2.362704726315233, + "learning_rate": 1.6596218610147134e-05, + "loss": 1.0483, "step": 2796 }, { - "epoch": 1.84, - "learning_rate": 3.284328796866121e-07, - "loss": 0.7407, + "epoch": 0.2927263212977499, + "grad_norm": 2.554013585062049, + "learning_rate": 1.6593670525583437e-05, + "loss": 0.9678, "step": 2797 }, { - "epoch": 1.84, - "learning_rate": 3.25727813811072e-07, - "loss": 0.6918, + "epoch": 0.29283097854526424, + "grad_norm": 2.9600036988498477, + "learning_rate": 1.659112168339363e-05, + "loss": 0.855, "step": 2798 }, { - "epoch": 1.84, - "learning_rate": 3.230337494511937e-07, - "loss": 0.6731, + "epoch": 0.29293563579277865, + "grad_norm": 2.1825672871247135, + "learning_rate": 1.6588572083870583e-05, + "loss": 1.0531, "step": 2799 }, { - "epoch": 1.84, - "learning_rate": 3.2035068967064876e-07, - "loss": 0.6961, + "epoch": 0.29304029304029305, + "grad_norm": 2.0933057115237603, + "learning_rate": 1.6586021727307247e-05, + "loss": 1.0515, "step": 2800 }, { - "epoch": 1.84, - "learning_rate": 3.176786375205931e-07, - "loss": 0.7355, + "epoch": 0.2931449502878074, + "grad_norm": 2.400496066375719, + "learning_rate": 1.658347061399666e-05, + "loss": 1.0419, "step": 2801 }, { - "epoch": 1.84, - "learning_rate": 3.150175960396673e-07, - "loss": 0.6905, + "epoch": 0.2932496075353218, + "grad_norm": 2.016597627986569, + "learning_rate": 1.6580918744231955e-05, + "loss": 0.9104, "step": 2802 }, { - "epoch": 1.85, - "learning_rate": 3.123675682539884e-07, - "loss": 0.6961, + "epoch": 0.2933542647828362, + "grad_norm": 2.3757951069404326, + "learning_rate": 1.6578366118306343e-05, + "loss": 1.1151, "step": 2803 }, { - "epoch": 1.85, - "learning_rate": 3.0972855717715134e-07, - "loss": 0.5558, + "epoch": 0.2934589220303506, + "grad_norm": 2.0100169684030114, + "learning_rate": 1.6575812736513133e-05, + "loss": 0.9798, "step": 2804 }, { - "epoch": 1.85, - "learning_rate": 3.0710056581021886e-07, - "loss": 0.5556, + "epoch": 0.29356357927786497, + "grad_norm": 2.1122429080682945, + "learning_rate": 1.6573258599145704e-05, + "loss": 0.9442, "step": 2805 }, { - "epoch": 1.85, - "learning_rate": 3.0448359714172593e-07, - "loss": 0.6529, + "epoch": 0.2936682365253794, + "grad_norm": 2.154489488864629, + "learning_rate": 1.657070370649754e-05, + "loss": 1.0333, "step": 2806 }, { - "epoch": 1.85, - "learning_rate": 3.018776541476709e-07, - "loss": 0.6608, + "epoch": 0.2937728937728938, + "grad_norm": 2.275200742495966, + "learning_rate": 1.65681480588622e-05, + "loss": 1.083, "step": 2807 }, { - "epoch": 1.85, - "learning_rate": 2.9928273979151433e-07, - "loss": 0.779, + "epoch": 0.2938775510204082, + "grad_norm": 2.1710490075798514, + "learning_rate": 1.656559165653333e-05, + "loss": 1.0605, "step": 2808 }, { - "epoch": 1.85, - "learning_rate": 2.966988570241747e-07, - "loss": 0.6715, + "epoch": 0.29398220826792254, + "grad_norm": 2.598428034733466, + "learning_rate": 1.6563034499804672e-05, + "loss": 1.0464, "step": 2809 }, { - "epoch": 1.85, - "learning_rate": 2.9412600878402696e-07, - "loss": 0.56, + "epoch": 0.29408686551543695, + "grad_norm": 2.151540858385915, + "learning_rate": 1.6560476588970044e-05, + "loss": 1.0399, "step": 2810 }, { - "epoch": 1.85, - "learning_rate": 2.915641979968953e-07, - "loss": 0.8199, + "epoch": 0.29419152276295135, + "grad_norm": 2.1278728592287868, + "learning_rate": 1.6557917924323358e-05, + "loss": 1.0988, "step": 2811 }, { - "epoch": 1.85, - "learning_rate": 2.8901342757605256e-07, - "loss": 0.7911, + "epoch": 0.2942961800104657, + "grad_norm": 2.1705417888870944, + "learning_rate": 1.6555358506158604e-05, + "loss": 0.9765, "step": 2812 }, { - "epoch": 1.85, - "learning_rate": 2.8647370042222064e-07, - "loss": 0.6145, + "epoch": 0.2944008372579801, + "grad_norm": 2.206832470090881, + "learning_rate": 1.6552798334769874e-05, + "loss": 0.9632, "step": 2813 }, { - "epoch": 1.85, - "learning_rate": 2.839450194235549e-07, - "loss": 0.745, + "epoch": 0.2945054945054945, + "grad_norm": 2.1115011840786586, + "learning_rate": 1.655023741045133e-05, + "loss": 1.1335, "step": 2814 }, { - "epoch": 1.85, - "learning_rate": 2.8142738745565724e-07, - "loss": 0.7071, + "epoch": 0.2946101517530089, + "grad_norm": 1.90252480707429, + "learning_rate": 1.6547675733497226e-05, + "loss": 1.0063, "step": 2815 }, { - "epoch": 1.85, - "learning_rate": 2.789208073815608e-07, - "loss": 0.7781, + "epoch": 0.29471480900052327, + "grad_norm": 2.349864203175167, + "learning_rate": 1.6545113304201906e-05, + "loss": 1.018, "step": 2816 }, { - "epoch": 1.85, - "learning_rate": 2.764252820517299e-07, - "loss": 0.6849, + "epoch": 0.2948194662480377, + "grad_norm": 1.9420251810407316, + "learning_rate": 1.6542550122859804e-05, + "loss": 0.88, "step": 2817 }, { - "epoch": 1.86, - "learning_rate": 2.739408143040623e-07, - "loss": 0.7383, + "epoch": 0.2949241234955521, + "grad_norm": 2.2700830883100767, + "learning_rate": 1.6539986189765425e-05, + "loss": 1.0005, "step": 2818 }, { - "epoch": 1.86, - "learning_rate": 2.714674069638712e-07, - "loss": 0.6071, + "epoch": 0.29502878074306643, + "grad_norm": 2.214309833369823, + "learning_rate": 1.6537421505213377e-05, + "loss": 0.9971, "step": 2819 }, { - "epoch": 1.86, - "learning_rate": 2.6900506284390446e-07, - "loss": 0.725, + "epoch": 0.29513343799058084, + "grad_norm": 2.0402371350258415, + "learning_rate": 1.6534856069498345e-05, + "loss": 1.0959, "step": 2820 }, { - "epoch": 1.86, - "learning_rate": 2.6655378474432e-07, - "loss": 0.6599, + "epoch": 0.29523809523809524, + "grad_norm": 2.0958531308968196, + "learning_rate": 1.6532289882915104e-05, + "loss": 1.0072, "step": 2821 }, { - "epoch": 1.86, - "learning_rate": 2.641135754526958e-07, - "loss": 0.6231, + "epoch": 0.29534275248560965, + "grad_norm": 1.858230244811686, + "learning_rate": 1.6529722945758512e-05, + "loss": 0.8212, "step": 2822 }, { - "epoch": 1.86, - "learning_rate": 2.6168443774401976e-07, - "loss": 0.6586, + "epoch": 0.295447409733124, + "grad_norm": 1.817144737007705, + "learning_rate": 1.6527155258323517e-05, + "loss": 1.0134, "step": 2823 }, { - "epoch": 1.86, - "learning_rate": 2.592663743806911e-07, - "loss": 0.8006, + "epoch": 0.2955520669806384, + "grad_norm": 2.4276651505529454, + "learning_rate": 1.652458682090515e-05, + "loss": 1.0547, "step": 2824 }, { - "epoch": 1.86, - "learning_rate": 2.5685938811251476e-07, - "loss": 0.6761, + "epoch": 0.2956567242281528, + "grad_norm": 2.963051911787662, + "learning_rate": 1.6522017633798534e-05, + "loss": 1.0579, "step": 2825 }, { - "epoch": 1.86, - "learning_rate": 2.544634816766989e-07, - "loss": 0.6951, + "epoch": 0.2957613814756672, + "grad_norm": 2.2165478606094355, + "learning_rate": 1.6519447697298866e-05, + "loss": 0.9951, "step": 2826 }, { - "epoch": 1.86, - "learning_rate": 2.520786577978518e-07, - "loss": 0.6571, + "epoch": 0.29586603872318157, + "grad_norm": 1.9820489456783845, + "learning_rate": 1.6516877011701443e-05, + "loss": 0.7579, "step": 2827 }, { - "epoch": 1.86, - "learning_rate": 2.4970491918797855e-07, - "loss": 0.6903, + "epoch": 0.295970695970696, + "grad_norm": 2.211338495085747, + "learning_rate": 1.651430557730164e-05, + "loss": 1.0503, "step": 2828 }, { - "epoch": 1.86, - "learning_rate": 2.4734226854647883e-07, - "loss": 0.7043, + "epoch": 0.2960753532182104, + "grad_norm": 2.0412669316228964, + "learning_rate": 1.6511733394394922e-05, + "loss": 0.8973, "step": 2829 }, { - "epoch": 1.86, - "learning_rate": 2.449907085601422e-07, - "loss": 0.6137, + "epoch": 0.29618001046572473, + "grad_norm": 1.930076486135168, + "learning_rate": 1.6509160463276835e-05, + "loss": 0.945, "step": 2830 }, { - "epoch": 1.86, - "learning_rate": 2.426502419031451e-07, - "loss": 0.8411, + "epoch": 0.29628466771323914, + "grad_norm": 2.2334950597171677, + "learning_rate": 1.6506586784243015e-05, + "loss": 0.9924, "step": 2831 }, { - "epoch": 1.86, - "learning_rate": 2.4032087123705414e-07, - "loss": 0.7115, + "epoch": 0.29638932496075354, + "grad_norm": 2.3824449949926727, + "learning_rate": 1.650401235758918e-05, + "loss": 0.8262, "step": 2832 }, { - "epoch": 1.87, - "learning_rate": 2.3800259921081037e-07, - "loss": 0.5807, + "epoch": 0.29649398220826795, + "grad_norm": 2.2784589120562795, + "learning_rate": 1.6501437183611148e-05, + "loss": 1.1117, "step": 2833 }, { - "epoch": 1.87, - "learning_rate": 2.35695428460736e-07, - "loss": 0.7759, + "epoch": 0.2965986394557823, + "grad_norm": 2.250028215509174, + "learning_rate": 1.64988612626048e-05, + "loss": 1.0606, "step": 2834 }, { - "epoch": 1.87, - "learning_rate": 2.3339936161053235e-07, - "loss": 0.6828, + "epoch": 0.2967032967032967, + "grad_norm": 2.3794072494274827, + "learning_rate": 1.6496284594866115e-05, + "loss": 0.8615, "step": 2835 }, { - "epoch": 1.87, - "learning_rate": 2.311144012712707e-07, - "loss": 0.6282, + "epoch": 0.2968079539508111, + "grad_norm": 2.470955073403848, + "learning_rate": 1.6493707180691166e-05, + "loss": 0.9446, "step": 2836 }, { - "epoch": 1.87, - "learning_rate": 2.2884055004139038e-07, - "loss": 0.7145, + "epoch": 0.29691261119832546, + "grad_norm": 2.1185633078780004, + "learning_rate": 1.6491129020376094e-05, + "loss": 1.1212, "step": 2837 }, { - "epoch": 1.87, - "learning_rate": 2.265778105066996e-07, - "loss": 0.6468, + "epoch": 0.29701726844583987, + "grad_norm": 2.40021678899163, + "learning_rate": 1.6488550114217138e-05, + "loss": 1.1674, "step": 2838 }, { - "epoch": 1.87, - "learning_rate": 2.243261852403711e-07, - "loss": 0.6459, + "epoch": 0.29712192569335427, + "grad_norm": 2.8426932715804276, + "learning_rate": 1.6485970462510624e-05, + "loss": 0.9707, "step": 2839 }, { - "epoch": 1.87, - "learning_rate": 2.220856768029367e-07, - "loss": 0.6344, + "epoch": 0.2972265829408687, + "grad_norm": 1.9555538468974023, + "learning_rate": 1.6483390065552952e-05, + "loss": 0.9935, "step": 2840 }, { - "epoch": 1.87, - "learning_rate": 2.1985628774228717e-07, - "loss": 0.5841, + "epoch": 0.29733124018838303, + "grad_norm": 2.001436241338665, + "learning_rate": 1.648080892364062e-05, + "loss": 0.8675, "step": 2841 }, { - "epoch": 1.87, - "learning_rate": 2.1763802059367012e-07, - "loss": 0.6143, + "epoch": 0.29743589743589743, + "grad_norm": 2.3809732845318456, + "learning_rate": 1.6478227037070205e-05, + "loss": 1.0029, "step": 2842 }, { - "epoch": 1.87, - "learning_rate": 2.154308778796821e-07, - "loss": 0.5321, + "epoch": 0.29754055468341184, + "grad_norm": 2.052147061815293, + "learning_rate": 1.6475644406138372e-05, + "loss": 0.9261, "step": 2843 }, { - "epoch": 1.87, - "learning_rate": 2.1323486211026979e-07, - "loss": 0.7149, + "epoch": 0.2976452119309262, + "grad_norm": 1.8579343591461566, + "learning_rate": 1.6473061031141868e-05, + "loss": 0.9575, "step": 2844 }, { - "epoch": 1.87, - "learning_rate": 2.1104997578273e-07, - "loss": 0.683, + "epoch": 0.2977498691784406, + "grad_norm": 2.0536350820054743, + "learning_rate": 1.6470476912377526e-05, + "loss": 0.8525, "step": 2845 }, { - "epoch": 1.87, - "learning_rate": 2.088762213816986e-07, - "loss": 0.7292, + "epoch": 0.297854526425955, + "grad_norm": 2.7957230117068432, + "learning_rate": 1.646789205014227e-05, + "loss": 1.0007, "step": 2846 }, { - "epoch": 1.87, - "learning_rate": 2.0671360137915375e-07, - "loss": 0.7957, + "epoch": 0.2979591836734694, + "grad_norm": 1.8368027105734126, + "learning_rate": 1.646530644473311e-05, + "loss": 0.8189, "step": 2847 }, { - "epoch": 1.87, - "learning_rate": 2.0456211823441153e-07, - "loss": 0.6577, + "epoch": 0.29806384092098376, + "grad_norm": 1.9674169831122257, + "learning_rate": 1.6462720096447134e-05, + "loss": 0.8723, "step": 2848 }, { - "epoch": 1.88, - "learning_rate": 2.0242177439412592e-07, - "loss": 0.684, + "epoch": 0.29816849816849816, + "grad_norm": 2.1875273033291136, + "learning_rate": 1.6460133005581512e-05, + "loss": 0.953, "step": 2849 }, { - "epoch": 1.88, - "learning_rate": 2.0029257229227993e-07, - "loss": 0.7029, + "epoch": 0.29827315541601257, + "grad_norm": 1.9643878581413023, + "learning_rate": 1.6457545172433515e-05, + "loss": 0.8681, "step": 2850 }, { - "epoch": 1.88, - "learning_rate": 1.9817451435018676e-07, - "loss": 0.7119, + "epoch": 0.298377812663527, + "grad_norm": 2.071057314759131, + "learning_rate": 1.6454956597300486e-05, + "loss": 1.0712, "step": 2851 }, { - "epoch": 1.88, - "learning_rate": 1.9606760297648742e-07, - "loss": 0.6767, + "epoch": 0.2984824699110413, + "grad_norm": 3.0502621658394604, + "learning_rate": 1.645236728047986e-05, + "loss": 0.9732, "step": 2852 }, { - "epoch": 1.88, - "learning_rate": 1.9397184056714868e-07, - "loss": 0.6691, + "epoch": 0.29858712715855573, + "grad_norm": 2.208086371273379, + "learning_rate": 1.6449777222269153e-05, + "loss": 1.1401, "step": 2853 }, { - "epoch": 1.88, - "learning_rate": 1.9188722950545636e-07, - "loss": 0.6608, + "epoch": 0.29869178440607014, + "grad_norm": 2.374468906716569, + "learning_rate": 1.6447186422965962e-05, + "loss": 0.9075, "step": 2854 }, { - "epoch": 1.88, - "learning_rate": 1.8981377216201636e-07, - "loss": 0.6504, + "epoch": 0.2987964416535845, + "grad_norm": 2.075865806365837, + "learning_rate": 1.644459488286799e-05, + "loss": 0.9824, "step": 2855 }, { - "epoch": 1.88, - "learning_rate": 1.877514708947481e-07, - "loss": 0.6051, + "epoch": 0.2989010989010989, + "grad_norm": 2.1721807151694814, + "learning_rate": 1.6442002602272994e-05, + "loss": 1.0686, "step": 2856 }, { - "epoch": 1.88, - "learning_rate": 1.8570032804888783e-07, - "loss": 0.6595, + "epoch": 0.2990057561486133, + "grad_norm": 2.1427069344984546, + "learning_rate": 1.6439409581478843e-05, + "loss": 1.0767, "step": 2857 }, { - "epoch": 1.88, - "learning_rate": 1.836603459569808e-07, - "loss": 0.7658, + "epoch": 0.2991104133961277, + "grad_norm": 2.3195474794616504, + "learning_rate": 1.6436815820783477e-05, + "loss": 1.0688, "step": 2858 }, { - "epoch": 1.88, - "learning_rate": 1.816315269388802e-07, - "loss": 0.7564, + "epoch": 0.29921507064364206, + "grad_norm": 2.078009990693235, + "learning_rate": 1.6434221320484928e-05, + "loss": 1.0676, "step": 2859 }, { - "epoch": 1.88, - "learning_rate": 1.7961387330174606e-07, - "loss": 0.5965, + "epoch": 0.29931972789115646, + "grad_norm": 2.652882959489745, + "learning_rate": 1.64316260808813e-05, + "loss": 1.1277, "step": 2860 }, { - "epoch": 1.88, - "learning_rate": 1.7760738734003858e-07, - "loss": 0.631, + "epoch": 0.29942438513867087, + "grad_norm": 1.9324618218876926, + "learning_rate": 1.6429030102270802e-05, + "loss": 0.9057, "step": 2861 }, { - "epoch": 1.88, - "learning_rate": 1.756120713355214e-07, - "loss": 0.6346, + "epoch": 0.2995290423861852, + "grad_norm": 2.1542787245014763, + "learning_rate": 1.6426433384951707e-05, + "loss": 1.0036, "step": 2862 }, { - "epoch": 1.88, - "learning_rate": 1.7362792755725278e-07, - "loss": 0.6879, + "epoch": 0.2996336996336996, + "grad_norm": 2.316215732282425, + "learning_rate": 1.6423835929222393e-05, + "loss": 1.0189, "step": 2863 }, { - "epoch": 1.89, - "learning_rate": 1.7165495826158896e-07, - "loss": 0.6362, + "epoch": 0.29973835688121403, + "grad_norm": 2.0832717770915785, + "learning_rate": 1.6421237735381305e-05, + "loss": 0.9312, "step": 2864 }, { - "epoch": 1.89, - "learning_rate": 1.696931656921752e-07, - "loss": 0.5911, + "epoch": 0.29984301412872844, + "grad_norm": 2.0990095053120794, + "learning_rate": 1.6418638803726988e-05, + "loss": 1.0452, "step": 2865 }, { - "epoch": 1.89, - "learning_rate": 1.6774255207994915e-07, - "loss": 0.7731, + "epoch": 0.2999476713762428, + "grad_norm": 2.3858579539152416, + "learning_rate": 1.6416039134558058e-05, + "loss": 0.8431, "step": 2866 }, { - "epoch": 1.89, - "learning_rate": 1.6580311964313645e-07, - "loss": 0.6895, + "epoch": 0.3000523286237572, + "grad_norm": 2.0445888077165084, + "learning_rate": 1.641343872817322e-05, + "loss": 0.921, "step": 2867 }, { - "epoch": 1.89, - "learning_rate": 1.6387487058724504e-07, - "loss": 0.7009, + "epoch": 0.3001569858712716, + "grad_norm": 2.3463310835524225, + "learning_rate": 1.6410837584871276e-05, + "loss": 1.0862, "step": 2868 }, { - "epoch": 1.89, - "learning_rate": 1.6195780710506648e-07, - "loss": 0.7321, + "epoch": 0.300261643118786, + "grad_norm": 2.5259232781216316, + "learning_rate": 1.6408235704951098e-05, + "loss": 0.9665, "step": 2869 }, { - "epoch": 1.89, - "learning_rate": 1.600519313766724e-07, - "loss": 0.6184, + "epoch": 0.30036630036630035, + "grad_norm": 2.4845918292536244, + "learning_rate": 1.6405633088711646e-05, + "loss": 1.0941, "step": 2870 }, { - "epoch": 1.89, - "learning_rate": 1.5815724556941135e-07, - "loss": 0.6653, + "epoch": 0.30047095761381476, + "grad_norm": 2.250546185914107, + "learning_rate": 1.6403029736451964e-05, + "loss": 1.0009, "step": 2871 }, { - "epoch": 1.89, - "learning_rate": 1.562737518379065e-07, - "loss": 0.6153, + "epoch": 0.30057561486132917, + "grad_norm": 2.2255990588605004, + "learning_rate": 1.6400425648471187e-05, + "loss": 0.9602, "step": 2872 }, { - "epoch": 1.89, - "learning_rate": 1.5440145232405557e-07, - "loss": 0.7641, + "epoch": 0.3006802721088435, + "grad_norm": 2.17264095954443, + "learning_rate": 1.6397820825068528e-05, + "loss": 0.9746, "step": 2873 }, { - "epoch": 1.89, - "learning_rate": 1.525403491570221e-07, - "loss": 0.662, + "epoch": 0.3007849293563579, + "grad_norm": 2.425953579485797, + "learning_rate": 1.6395215266543284e-05, + "loss": 0.8, "step": 2874 }, { - "epoch": 1.89, - "learning_rate": 1.506904444532431e-07, - "loss": 0.7431, + "epoch": 0.3008895866038723, + "grad_norm": 2.4373639606890998, + "learning_rate": 1.6392608973194847e-05, + "loss": 0.9525, "step": 2875 }, { - "epoch": 1.89, - "learning_rate": 1.488517403164147e-07, - "loss": 0.7146, + "epoch": 0.30099424385138673, + "grad_norm": 2.6105194549403894, + "learning_rate": 1.6390001945322674e-05, + "loss": 1.0145, "step": 2876 }, { - "epoch": 1.89, - "learning_rate": 1.4702423883749983e-07, - "loss": 0.6645, + "epoch": 0.3010989010989011, + "grad_norm": 2.2916573263775644, + "learning_rate": 1.6387394183226327e-05, + "loss": 0.9484, "step": 2877 }, { - "epoch": 1.89, - "learning_rate": 1.4520794209472056e-07, - "loss": 0.672, + "epoch": 0.3012035583464155, + "grad_norm": 2.385274366893819, + "learning_rate": 1.6384785687205438e-05, + "loss": 1.1407, "step": 2878 }, { - "epoch": 1.9, - "learning_rate": 1.4340285215355577e-07, - "loss": 0.6396, + "epoch": 0.3013082155939299, + "grad_norm": 2.3566579851084395, + "learning_rate": 1.6382176457559732e-05, + "loss": 1.0001, "step": 2879 }, { - "epoch": 1.9, - "learning_rate": 1.4160897106674343e-07, - "loss": 0.6892, + "epoch": 0.30141287284144425, + "grad_norm": 2.395710605917217, + "learning_rate": 1.637956649458901e-05, + "loss": 1.0079, "step": 2880 }, { - "epoch": 1.9, - "learning_rate": 1.3982630087427062e-07, - "loss": 0.651, + "epoch": 0.30151753008895865, + "grad_norm": 2.0298439688834753, + "learning_rate": 1.637695579859317e-05, + "loss": 1.0266, "step": 2881 }, { - "epoch": 1.9, - "learning_rate": 1.3805484360337906e-07, - "loss": 0.6363, + "epoch": 0.30162218733647306, + "grad_norm": 2.3803391461053116, + "learning_rate": 1.637434436987218e-05, + "loss": 0.953, "step": 2882 }, { - "epoch": 1.9, - "learning_rate": 1.362946012685573e-07, - "loss": 0.5743, + "epoch": 0.30172684458398746, + "grad_norm": 2.339978618501876, + "learning_rate": 1.6371732208726098e-05, + "loss": 1.0875, "step": 2883 }, { - "epoch": 1.9, - "learning_rate": 1.3454557587154083e-07, - "loss": 0.6203, + "epoch": 0.3018315018315018, + "grad_norm": 2.0575949988075752, + "learning_rate": 1.6369119315455067e-05, + "loss": 1.1127, "step": 2884 }, { - "epoch": 1.9, - "learning_rate": 1.3280776940130858e-07, - "loss": 0.7716, + "epoch": 0.3019361590790162, + "grad_norm": 2.553619190127974, + "learning_rate": 1.6366505690359316e-05, + "loss": 0.8558, "step": 2885 }, { - "epoch": 1.9, - "learning_rate": 1.3108118383408418e-07, - "loss": 0.7167, + "epoch": 0.3020408163265306, + "grad_norm": 2.2250332246678606, + "learning_rate": 1.6363891333739153e-05, + "loss": 0.9865, "step": 2886 }, { - "epoch": 1.9, - "learning_rate": 1.2936582113332707e-07, - "loss": 0.6412, + "epoch": 0.302145473574045, + "grad_norm": 2.263025646780511, + "learning_rate": 1.6361276245894982e-05, + "loss": 0.94, "step": 2887 }, { - "epoch": 1.9, - "learning_rate": 1.276616832497346e-07, - "loss": 0.6639, + "epoch": 0.3022501308215594, + "grad_norm": 3.2202185282441733, + "learning_rate": 1.6358660427127267e-05, + "loss": 0.9522, "step": 2888 }, { - "epoch": 1.9, - "learning_rate": 1.2596877212124327e-07, - "loss": 0.7092, + "epoch": 0.3023547880690738, + "grad_norm": 2.6014220655520686, + "learning_rate": 1.6356043877736582e-05, + "loss": 1.0032, "step": 2889 }, { - "epoch": 1.9, - "learning_rate": 1.2428708967301973e-07, - "loss": 0.6886, + "epoch": 0.3024594453165882, + "grad_norm": 2.5626669505776145, + "learning_rate": 1.635342659802357e-05, + "loss": 1.0856, "step": 2890 }, { - "epoch": 1.9, - "learning_rate": 1.226166378174598e-07, - "loss": 0.7492, + "epoch": 0.30256410256410254, + "grad_norm": 2.343833892601788, + "learning_rate": 1.6350808588288964e-05, + "loss": 1.0161, "step": 2891 }, { - "epoch": 1.9, - "learning_rate": 1.2095741845419174e-07, - "loss": 0.5752, + "epoch": 0.30266875981161695, + "grad_norm": 2.586511158457927, + "learning_rate": 1.6348189848833574e-05, + "loss": 1.0082, "step": 2892 }, { - "epoch": 1.9, - "learning_rate": 1.1930943347006618e-07, - "loss": 0.6927, + "epoch": 0.30277341705913136, + "grad_norm": 1.7944614432349806, + "learning_rate": 1.6345570379958302e-05, + "loss": 0.8952, "step": 2893 }, { - "epoch": 1.91, - "learning_rate": 1.1767268473916182e-07, - "loss": 0.7242, + "epoch": 0.30287807430664576, + "grad_norm": 2.2117394132793047, + "learning_rate": 1.634295018196413e-05, + "loss": 0.9989, "step": 2894 }, { - "epoch": 1.91, - "learning_rate": 1.1604717412277866e-07, - "loss": 0.6736, + "epoch": 0.3029827315541601, + "grad_norm": 2.12160379753247, + "learning_rate": 1.6340329255152124e-05, + "loss": 1.0493, "step": 2895 }, { - "epoch": 1.91, - "learning_rate": 1.144329034694347e-07, - "loss": 0.727, + "epoch": 0.3030873888016745, + "grad_norm": 2.5588546522643707, + "learning_rate": 1.633770759982343e-05, + "loss": 0.9269, "step": 2896 }, { - "epoch": 1.91, - "learning_rate": 1.1282987461486927e-07, - "loss": 0.5983, + "epoch": 0.3031920460491889, + "grad_norm": 2.4173578432974008, + "learning_rate": 1.633508521627929e-05, + "loss": 0.915, "step": 2897 }, { - "epoch": 1.91, - "learning_rate": 1.1123808938203306e-07, - "loss": 0.7715, + "epoch": 0.3032967032967033, + "grad_norm": 1.9462185216594179, + "learning_rate": 1.633246210482101e-05, + "loss": 1.0375, "step": 2898 }, { - "epoch": 1.91, - "learning_rate": 1.0965754958109587e-07, - "loss": 0.6424, + "epoch": 0.3034013605442177, + "grad_norm": 3.1222882590726995, + "learning_rate": 1.6329838265750005e-05, + "loss": 1.0446, "step": 2899 }, { - "epoch": 1.91, - "learning_rate": 1.0808825700943437e-07, - "loss": 0.7114, + "epoch": 0.3035060177917321, + "grad_norm": 2.2579929742271796, + "learning_rate": 1.6327213699367746e-05, + "loss": 1.0918, "step": 2900 }, { - "epoch": 1.91, - "learning_rate": 1.0653021345163995e-07, - "loss": 0.668, + "epoch": 0.3036106750392465, + "grad_norm": 2.130384120094451, + "learning_rate": 1.6324588405975807e-05, + "loss": 0.8925, "step": 2901 }, { - "epoch": 1.91, - "learning_rate": 1.049834206795064e-07, - "loss": 0.6584, + "epoch": 0.30371533228676084, + "grad_norm": 2.2025752462669486, + "learning_rate": 1.6321962385875837e-05, + "loss": 0.9963, "step": 2902 }, { - "epoch": 1.91, - "learning_rate": 1.0344788045203668e-07, - "loss": 0.5823, + "epoch": 0.30381998953427525, + "grad_norm": 2.385277002826419, + "learning_rate": 1.6319335639369577e-05, + "loss": 1.0002, "step": 2903 }, { - "epoch": 1.91, - "learning_rate": 1.0192359451543731e-07, - "loss": 0.6654, + "epoch": 0.30392464678178965, + "grad_norm": 1.9086149634590928, + "learning_rate": 1.631670816675884e-05, + "loss": 0.8627, "step": 2904 }, { - "epoch": 1.91, - "learning_rate": 1.0041056460311393e-07, - "loss": 0.5543, + "epoch": 0.304029304029304, + "grad_norm": 2.172309116560836, + "learning_rate": 1.6314079968345527e-05, + "loss": 0.9886, "step": 2905 }, { - "epoch": 1.91, - "learning_rate": 9.890879243567686e-08, - "loss": 0.6764, + "epoch": 0.3041339612768184, + "grad_norm": 2.286003474124238, + "learning_rate": 1.631145104443163e-05, + "loss": 0.8937, "step": 2906 }, { - "epoch": 1.91, - "learning_rate": 9.741827972092777e-08, - "loss": 0.6297, + "epoch": 0.3042386185243328, + "grad_norm": 2.1054654506329804, + "learning_rate": 1.630882139531921e-05, + "loss": 0.9913, "step": 2907 }, { - "epoch": 1.91, - "learning_rate": 9.593902815386968e-08, - "loss": 0.6081, + "epoch": 0.3043432757718472, + "grad_norm": 2.188221004194734, + "learning_rate": 1.6306191021310423e-05, + "loss": 1.0191, "step": 2908 }, { - "epoch": 1.92, - "learning_rate": 9.447103941669588e-08, - "loss": 0.6845, + "epoch": 0.30444793301936157, + "grad_norm": 2.1904821257327374, + "learning_rate": 1.6303559922707503e-05, + "loss": 0.8218, "step": 2909 }, { - "epoch": 1.92, - "learning_rate": 9.301431517879323e-08, - "loss": 0.6775, + "epoch": 0.304552590266876, + "grad_norm": 2.5842095448369844, + "learning_rate": 1.6300928099812772e-05, + "loss": 0.9329, "step": 2910 }, { - "epoch": 1.92, - "learning_rate": 9.156885709674101e-08, - "loss": 0.6371, + "epoch": 0.3046572475143904, + "grad_norm": 2.132246343055798, + "learning_rate": 1.6298295552928626e-05, + "loss": 0.8609, "step": 2911 }, { - "epoch": 1.92, - "learning_rate": 9.013466681429994e-08, - "loss": 0.6572, + "epoch": 0.3047619047619048, + "grad_norm": 2.2252491798180514, + "learning_rate": 1.6295662282357555e-05, + "loss": 1.118, "step": 2912 }, { - "epoch": 1.92, - "learning_rate": 8.871174596242427e-08, - "loss": 0.6672, + "epoch": 0.30486656200941914, + "grad_norm": 2.098601354897258, + "learning_rate": 1.6293028288402123e-05, + "loss": 0.9224, "step": 2913 }, { - "epoch": 1.92, - "learning_rate": 8.73000961592485e-08, - "loss": 0.6575, + "epoch": 0.30497121925693355, + "grad_norm": 2.012587732978616, + "learning_rate": 1.6290393571364987e-05, + "loss": 1.0842, "step": 2914 }, { - "epoch": 1.92, - "learning_rate": 8.589971901009186e-08, - "loss": 0.6667, + "epoch": 0.30507587650444795, + "grad_norm": 2.3614383102118555, + "learning_rate": 1.628775813154887e-05, + "loss": 1.0265, "step": 2915 }, { - "epoch": 1.92, - "learning_rate": 8.451061610745381e-08, - "loss": 0.6693, + "epoch": 0.3051805337519623, + "grad_norm": 2.7878156230807134, + "learning_rate": 1.6285121969256605e-05, + "loss": 1.1703, "step": 2916 }, { - "epoch": 1.92, - "learning_rate": 8.313278903101075e-08, - "loss": 0.6127, + "epoch": 0.3052851909994767, + "grad_norm": 2.1636226722521856, + "learning_rate": 1.6282485084791086e-05, + "loss": 1.072, "step": 2917 }, { - "epoch": 1.92, - "learning_rate": 8.17662393476204e-08, - "loss": 0.6235, + "epoch": 0.3053898482469911, + "grad_norm": 1.914547100044751, + "learning_rate": 1.627984747845529e-05, + "loss": 0.8617, "step": 2918 }, { - "epoch": 1.92, - "learning_rate": 8.041096861131303e-08, - "loss": 0.5349, + "epoch": 0.3054945054945055, + "grad_norm": 2.4366729944284753, + "learning_rate": 1.6277209150552285e-05, + "loss": 1.042, "step": 2919 }, { - "epoch": 1.92, - "learning_rate": 7.906697836329247e-08, - "loss": 0.8139, + "epoch": 0.30559916274201987, + "grad_norm": 1.8211650521764042, + "learning_rate": 1.627457010138523e-05, + "loss": 1.0024, "step": 2920 }, { - "epoch": 1.92, - "learning_rate": 7.773427013193502e-08, - "loss": 0.7365, + "epoch": 0.3057038199895343, + "grad_norm": 2.4531522351641417, + "learning_rate": 1.6271930331257345e-05, + "loss": 0.9587, "step": 2921 }, { - "epoch": 1.92, - "learning_rate": 7.641284543278838e-08, - "loss": 0.7617, + "epoch": 0.3058084772370487, + "grad_norm": 2.5130079196870208, + "learning_rate": 1.6269289840471952e-05, + "loss": 1.0224, "step": 2922 }, { - "epoch": 1.92, - "learning_rate": 7.510270576856605e-08, - "loss": 0.6038, + "epoch": 0.30591313448456303, + "grad_norm": 1.9747172743888348, + "learning_rate": 1.6266648629332448e-05, + "loss": 0.9663, "step": 2923 }, { - "epoch": 1.92, - "learning_rate": 7.38038526291518e-08, - "loss": 0.7505, + "epoch": 0.30601779173207744, + "grad_norm": 2.3944082981680705, + "learning_rate": 1.6264006698142318e-05, + "loss": 0.9934, "step": 2924 }, { - "epoch": 1.93, - "learning_rate": 7.25162874915919e-08, - "loss": 0.7365, + "epoch": 0.30612244897959184, + "grad_norm": 2.508078569407357, + "learning_rate": 1.6261364047205114e-05, + "loss": 1.017, "step": 2925 }, { - "epoch": 1.93, - "learning_rate": 7.12400118200951e-08, - "loss": 0.6976, + "epoch": 0.30622710622710625, + "grad_norm": 2.33208975658998, + "learning_rate": 1.6258720676824487e-05, + "loss": 0.8922, "step": 2926 }, { - "epoch": 1.93, - "learning_rate": 6.997502706603487e-08, - "loss": 0.6077, + "epoch": 0.3063317634746206, + "grad_norm": 1.9714230710953324, + "learning_rate": 1.625607658730417e-05, + "loss": 0.9413, "step": 2927 }, { - "epoch": 1.93, - "learning_rate": 6.872133466794163e-08, - "loss": 0.6821, + "epoch": 0.306436420722135, + "grad_norm": 2.006875902861234, + "learning_rate": 1.625343177894797e-05, + "loss": 0.9562, "step": 2928 }, { - "epoch": 1.93, - "learning_rate": 6.747893605150824e-08, - "loss": 0.6679, + "epoch": 0.3065410779696494, + "grad_norm": 1.9744248707683965, + "learning_rate": 1.625078625205978e-05, + "loss": 1.0386, "step": 2929 }, { - "epoch": 1.93, - "learning_rate": 6.624783262958012e-08, - "loss": 0.6501, + "epoch": 0.30664573521716376, + "grad_norm": 2.3082227558275377, + "learning_rate": 1.6248140006943577e-05, + "loss": 1.1613, "step": 2930 }, { - "epoch": 1.93, - "learning_rate": 6.50280258021596e-08, - "loss": 0.7814, + "epoch": 0.30675039246467817, + "grad_norm": 2.1702890355942337, + "learning_rate": 1.624549304390342e-05, + "loss": 0.9356, "step": 2931 }, { - "epoch": 1.93, - "learning_rate": 6.381951695640154e-08, - "loss": 0.6804, + "epoch": 0.3068550497121926, + "grad_norm": 2.424739431310441, + "learning_rate": 1.6242845363243455e-05, + "loss": 1.1796, "step": 2932 }, { - "epoch": 1.93, - "learning_rate": 6.262230746661657e-08, - "loss": 0.5597, + "epoch": 0.306959706959707, + "grad_norm": 2.171158515959578, + "learning_rate": 1.62401969652679e-05, + "loss": 1.109, "step": 2933 }, { - "epoch": 1.93, - "learning_rate": 6.143639869426232e-08, - "loss": 0.6584, + "epoch": 0.30706436420722133, + "grad_norm": 2.186889110464127, + "learning_rate": 1.6237547850281065e-05, + "loss": 0.9602, "step": 2934 }, { - "epoch": 1.93, - "learning_rate": 6.026179198794335e-08, - "loss": 0.838, + "epoch": 0.30716902145473574, + "grad_norm": 2.404453450737933, + "learning_rate": 1.6234898018587336e-05, + "loss": 1.0674, "step": 2935 }, { - "epoch": 1.93, - "learning_rate": 5.9098488683417834e-08, - "loss": 0.5844, + "epoch": 0.30727367870225014, + "grad_norm": 2.2426489372917224, + "learning_rate": 1.6232247470491188e-05, + "loss": 1.0203, "step": 2936 }, { - "epoch": 1.93, - "learning_rate": 5.794649010358422e-08, - "loss": 0.836, + "epoch": 0.30737833594976455, + "grad_norm": 2.1305208739832135, + "learning_rate": 1.622959620629717e-05, + "loss": 0.8933, "step": 2937 }, { - "epoch": 1.93, - "learning_rate": 5.680579755848681e-08, - "loss": 0.7431, + "epoch": 0.3074829931972789, + "grad_norm": 2.475213166949153, + "learning_rate": 1.6226944226309916e-05, + "loss": 0.9715, "step": 2938 }, { - "epoch": 1.93, - "learning_rate": 5.5676412345316844e-08, - "loss": 0.647, + "epoch": 0.3075876504447933, + "grad_norm": 2.229771800563122, + "learning_rate": 1.6224291530834147e-05, + "loss": 1.0384, "step": 2939 }, { - "epoch": 1.94, - "learning_rate": 5.455833574839919e-08, - "loss": 0.5488, + "epoch": 0.3076923076923077, + "grad_norm": 1.9303541858289048, + "learning_rate": 1.6221638120174668e-05, + "loss": 1.0791, "step": 2940 }, { - "epoch": 1.94, - "learning_rate": 5.3451569039205674e-08, - "loss": 0.6635, + "epoch": 0.30779696493982206, + "grad_norm": 2.3662003014744437, + "learning_rate": 1.6218983994636355e-05, + "loss": 1.0525, "step": 2941 }, { - "epoch": 1.94, - "learning_rate": 5.235611347634173e-08, - "loss": 0.6223, + "epoch": 0.30790162218733647, + "grad_norm": 2.1079944888721127, + "learning_rate": 1.621632915452417e-05, + "loss": 0.9803, "step": 2942 }, { - "epoch": 1.94, - "learning_rate": 5.12719703055542e-08, - "loss": 0.6356, + "epoch": 0.30800627943485087, + "grad_norm": 2.092870856439816, + "learning_rate": 1.6213673600143168e-05, + "loss": 0.9583, "step": 2943 }, { - "epoch": 1.94, - "learning_rate": 5.0199140759721324e-08, - "loss": 0.7753, + "epoch": 0.3081109366823653, + "grad_norm": 2.243256700536703, + "learning_rate": 1.6211017331798473e-05, + "loss": 0.9972, "step": 2944 }, { - "epoch": 1.94, - "learning_rate": 4.913762605885719e-08, - "loss": 0.6892, + "epoch": 0.3082155939298796, + "grad_norm": 2.205695168816815, + "learning_rate": 1.6208360349795293e-05, + "loss": 1.0897, "step": 2945 }, { - "epoch": 1.94, - "learning_rate": 4.8087427410110635e-08, - "loss": 0.6796, + "epoch": 0.30832025117739403, + "grad_norm": 2.4776470468527103, + "learning_rate": 1.620570265443892e-05, + "loss": 1.0388, "step": 2946 }, { - "epoch": 1.94, - "learning_rate": 4.7048546007759656e-08, - "loss": 0.7213, + "epoch": 0.30842490842490844, + "grad_norm": 2.344647528820799, + "learning_rate": 1.620304424603474e-05, + "loss": 1.0009, "step": 2947 }, { - "epoch": 1.94, - "learning_rate": 4.602098303321256e-08, - "loss": 0.6885, + "epoch": 0.3085295656724228, + "grad_norm": 2.108670142100681, + "learning_rate": 1.6200385124888195e-05, + "loss": 0.9776, "step": 2948 }, { - "epoch": 1.94, - "learning_rate": 4.5004739655006844e-08, - "loss": 0.6352, + "epoch": 0.3086342229199372, + "grad_norm": 2.3408563934308897, + "learning_rate": 1.6197725291304833e-05, + "loss": 0.9592, "step": 2949 }, { - "epoch": 1.94, - "learning_rate": 4.399981702880918e-08, - "loss": 0.6829, + "epoch": 0.3087388801674516, + "grad_norm": 2.18058930736239, + "learning_rate": 1.6195064745590267e-05, + "loss": 1.049, "step": 2950 }, { - "epoch": 1.94, - "learning_rate": 4.3006216297407645e-08, - "loss": 0.666, + "epoch": 0.308843537414966, + "grad_norm": 2.1381863651278854, + "learning_rate": 1.619240348805021e-05, + "loss": 0.962, "step": 2951 }, { - "epoch": 1.94, - "learning_rate": 4.2023938590720625e-08, - "loss": 0.7365, + "epoch": 0.30894819466248036, + "grad_norm": 1.8703885232830255, + "learning_rate": 1.6189741518990433e-05, + "loss": 1.0408, "step": 2952 }, { - "epoch": 1.94, - "learning_rate": 4.1052985025787914e-08, - "loss": 0.6313, + "epoch": 0.30905285190999476, + "grad_norm": 2.170717558554135, + "learning_rate": 1.618707883871681e-05, + "loss": 1.136, "step": 2953 }, { - "epoch": 1.94, - "learning_rate": 4.009335670676851e-08, - "loss": 0.7312, + "epoch": 0.30915750915750917, + "grad_norm": 2.365760154975676, + "learning_rate": 1.6184415447535284e-05, + "loss": 0.9957, "step": 2954 }, { - "epoch": 1.95, - "learning_rate": 3.914505472494723e-08, - "loss": 0.7549, + "epoch": 0.3092621664050236, + "grad_norm": 2.2575554436817606, + "learning_rate": 1.6181751345751887e-05, + "loss": 1.0823, "step": 2955 }, { - "epoch": 1.95, - "learning_rate": 3.820808015872701e-08, - "loss": 0.6622, + "epoch": 0.3093668236525379, + "grad_norm": 2.00512183891559, + "learning_rate": 1.6179086533672726e-05, + "loss": 1.064, "step": 2956 }, { - "epoch": 1.95, - "learning_rate": 3.728243407362664e-08, - "loss": 0.6694, + "epoch": 0.30947148090005233, + "grad_norm": 2.0900482163912932, + "learning_rate": 1.6176421011604e-05, + "loss": 1.0277, "step": 2957 }, { - "epoch": 1.95, - "learning_rate": 3.636811752228519e-08, - "loss": 0.6484, + "epoch": 0.30957613814756674, + "grad_norm": 2.028018177800534, + "learning_rate": 1.6173754779851978e-05, + "loss": 0.9109, "step": 2958 }, { - "epoch": 1.95, - "learning_rate": 3.546513154445874e-08, - "loss": 0.7049, + "epoch": 0.3096807953950811, + "grad_norm": 1.911257078717531, + "learning_rate": 1.6171087838723015e-05, + "loss": 1.0144, "step": 2959 }, { - "epoch": 1.95, - "learning_rate": 3.457347716701587e-08, - "loss": 0.6954, + "epoch": 0.3097854526425955, + "grad_norm": 2.2217963538032284, + "learning_rate": 1.616842018852355e-05, + "loss": 0.9425, "step": 2960 }, { - "epoch": 1.95, - "learning_rate": 3.3693155403938805e-08, - "loss": 0.7506, + "epoch": 0.3098901098901099, + "grad_norm": 2.59813827847896, + "learning_rate": 1.61657518295601e-05, + "loss": 1.0, "step": 2961 }, { - "epoch": 1.95, - "learning_rate": 3.282416725632454e-08, - "loss": 0.7205, + "epoch": 0.3099947671376243, + "grad_norm": 2.16386391537696, + "learning_rate": 1.6163082762139265e-05, + "loss": 0.836, "step": 2962 }, { - "epoch": 1.95, - "learning_rate": 3.1966513712380356e-08, - "loss": 0.6839, + "epoch": 0.31009942438513866, + "grad_norm": 2.8465111690795064, + "learning_rate": 1.616041298656773e-05, + "loss": 1.008, "step": 2963 }, { - "epoch": 1.95, - "learning_rate": 3.1120195747422755e-08, - "loss": 0.7295, + "epoch": 0.31020408163265306, + "grad_norm": 2.58047061290731, + "learning_rate": 1.6157742503152253e-05, + "loss": 0.9143, "step": 2964 }, { - "epoch": 1.95, - "learning_rate": 3.0285214323879655e-08, - "loss": 0.6969, + "epoch": 0.31030873888016747, + "grad_norm": 1.9097639693984674, + "learning_rate": 1.6155071312199676e-05, + "loss": 0.8861, "step": 2965 }, { - "epoch": 1.95, - "learning_rate": 2.9461570391287053e-08, - "loss": 0.5346, + "epoch": 0.3104133961276818, + "grad_norm": 2.124026315060564, + "learning_rate": 1.615239941401693e-05, + "loss": 0.965, "step": 2966 }, { - "epoch": 1.95, - "learning_rate": 2.864926488628572e-08, - "loss": 0.6258, + "epoch": 0.3105180533751962, + "grad_norm": 2.39401063448098, + "learning_rate": 1.614972680891102e-05, + "loss": 1.0503, "step": 2967 }, { - "epoch": 1.95, - "learning_rate": 2.78482987326234e-08, - "loss": 0.6871, + "epoch": 0.31062271062271063, + "grad_norm": 1.8182089689462924, + "learning_rate": 1.6147053497189032e-05, + "loss": 0.8512, "step": 2968 }, { - "epoch": 1.95, - "learning_rate": 2.7058672841154822e-08, - "loss": 0.7879, + "epoch": 0.31072736787022504, + "grad_norm": 2.0677360921080115, + "learning_rate": 1.6144379479158136e-05, + "loss": 0.9903, "step": 2969 }, { - "epoch": 1.96, - "learning_rate": 2.628038810983613e-08, - "loss": 0.6921, + "epoch": 0.3108320251177394, + "grad_norm": 2.428228198980805, + "learning_rate": 1.614170475512558e-05, + "loss": 0.8599, "step": 2970 }, { - "epoch": 1.96, - "learning_rate": 2.5513445423728246e-08, - "loss": 0.6992, + "epoch": 0.3109366823652538, + "grad_norm": 2.0105272211059027, + "learning_rate": 1.61390293253987e-05, + "loss": 0.9592, "step": 2971 }, { - "epoch": 1.96, - "learning_rate": 2.4757845654992396e-08, - "loss": 0.6271, + "epoch": 0.3110413396127682, + "grad_norm": 2.323659569472944, + "learning_rate": 1.6136353190284903e-05, + "loss": 0.9728, "step": 2972 }, { - "epoch": 1.96, - "learning_rate": 2.4013589662892356e-08, - "loss": 0.5533, + "epoch": 0.31114599686028255, + "grad_norm": 1.9635717769932388, + "learning_rate": 1.6133676350091686e-05, + "loss": 0.9582, "step": 2973 }, { - "epoch": 1.96, - "learning_rate": 2.3280678293788883e-08, - "loss": 0.6439, + "epoch": 0.31125065410779695, + "grad_norm": 1.9068151782035103, + "learning_rate": 1.6130998805126622e-05, + "loss": 0.8325, "step": 2974 }, { - "epoch": 1.96, - "learning_rate": 2.2559112381146386e-08, - "loss": 0.6682, + "epoch": 0.31135531135531136, + "grad_norm": 2.0598995448173314, + "learning_rate": 1.6128320555697365e-05, + "loss": 1.0608, "step": 2975 }, { - "epoch": 1.96, - "learning_rate": 2.1848892745522932e-08, - "loss": 0.6897, + "epoch": 0.31145996860282577, + "grad_norm": 2.2373806491125463, + "learning_rate": 1.6125641602111655e-05, + "loss": 0.9931, "step": 2976 }, { - "epoch": 1.96, - "learning_rate": 2.1150020194574682e-08, - "loss": 0.555, + "epoch": 0.3115646258503401, + "grad_norm": 2.0920136465295918, + "learning_rate": 1.6122961944677308e-05, + "loss": 0.9315, "step": 2977 }, { - "epoch": 1.96, - "learning_rate": 2.0462495523057013e-08, - "loss": 0.6069, + "epoch": 0.3116692830978545, + "grad_norm": 2.082366198297534, + "learning_rate": 1.612028158370222e-05, + "loss": 0.9886, "step": 2978 }, { - "epoch": 1.96, - "learning_rate": 1.978631951281673e-08, - "loss": 0.6374, + "epoch": 0.3117739403453689, + "grad_norm": 2.2060623386259635, + "learning_rate": 1.6117600519494373e-05, + "loss": 0.9608, "step": 2979 }, { - "epoch": 1.96, - "learning_rate": 1.9121492932796526e-08, - "loss": 0.6606, + "epoch": 0.31187859759288333, + "grad_norm": 2.552233556749464, + "learning_rate": 1.611491875236182e-05, + "loss": 1.018, "step": 2980 }, { - "epoch": 1.96, - "learning_rate": 1.846801653903274e-08, - "loss": 0.7836, + "epoch": 0.3119832548403977, + "grad_norm": 2.2999204774617157, + "learning_rate": 1.6112236282612705e-05, + "loss": 0.9881, "step": 2981 }, { - "epoch": 1.96, - "learning_rate": 1.7825891074654267e-08, - "loss": 0.5954, + "epoch": 0.3120879120879121, + "grad_norm": 2.084664368943567, + "learning_rate": 1.6109553110555256e-05, + "loss": 0.9935, "step": 2982 }, { - "epoch": 1.96, - "learning_rate": 1.7195117269881433e-08, - "loss": 0.7067, + "epoch": 0.3121925693354265, + "grad_norm": 2.077448020127175, + "learning_rate": 1.6106869236497767e-05, + "loss": 0.9272, "step": 2983 }, { - "epoch": 1.96, - "learning_rate": 1.6575695842027116e-08, - "loss": 0.6521, + "epoch": 0.31229722658294085, + "grad_norm": 1.9328287512609774, + "learning_rate": 1.610418466074862e-05, + "loss": 1.037, "step": 2984 }, { - "epoch": 1.97, - "learning_rate": 1.59676274954923e-08, - "loss": 0.6676, + "epoch": 0.31240188383045525, + "grad_norm": 2.3308588787662674, + "learning_rate": 1.6101499383616284e-05, + "loss": 1.0365, "step": 2985 }, { - "epoch": 1.97, - "learning_rate": 1.537091292176829e-08, - "loss": 0.6783, + "epoch": 0.31250654107796966, + "grad_norm": 2.1473404366037023, + "learning_rate": 1.60988134054093e-05, + "loss": 0.9637, "step": 2986 }, { - "epoch": 1.97, - "learning_rate": 1.4785552799433388e-08, - "loss": 0.6663, + "epoch": 0.31261119832548406, + "grad_norm": 2.0439604173751627, + "learning_rate": 1.609612672643629e-05, + "loss": 0.8799, "step": 2987 }, { - "epoch": 1.97, - "learning_rate": 1.4211547794157344e-08, - "loss": 0.6096, + "epoch": 0.3127158555729984, + "grad_norm": 2.285023732772858, + "learning_rate": 1.609343934700596e-05, + "loss": 1.0245, "step": 2988 }, { - "epoch": 1.97, - "learning_rate": 1.3648898558693558e-08, - "loss": 0.6837, + "epoch": 0.3128205128205128, + "grad_norm": 2.47915241747156, + "learning_rate": 1.60907512674271e-05, + "loss": 1.05, "step": 2989 }, { - "epoch": 1.97, - "learning_rate": 1.3097605732882435e-08, - "loss": 0.5759, + "epoch": 0.3129251700680272, + "grad_norm": 2.0809584106263057, + "learning_rate": 1.6088062488008575e-05, + "loss": 1.0695, "step": 2990 }, { - "epoch": 1.97, - "learning_rate": 1.2557669943650264e-08, - "loss": 0.6818, + "epoch": 0.3130298273155416, + "grad_norm": 2.1591951982467927, + "learning_rate": 1.6085373009059322e-05, + "loss": 0.9742, "step": 2991 }, { - "epoch": 1.97, - "learning_rate": 1.2029091805010328e-08, - "loss": 0.7021, + "epoch": 0.313134484563056, + "grad_norm": 1.8386586833176444, + "learning_rate": 1.6082682830888376e-05, + "loss": 0.8092, "step": 2992 }, { - "epoch": 1.97, - "learning_rate": 1.151187191805625e-08, - "loss": 0.6455, + "epoch": 0.3132391418105704, + "grad_norm": 2.598989002728532, + "learning_rate": 1.607999195380484e-05, + "loss": 0.9899, "step": 2993 }, { - "epoch": 1.97, - "learning_rate": 1.1006010870966422e-08, - "loss": 0.587, + "epoch": 0.3133437990580848, + "grad_norm": 2.7275001106943297, + "learning_rate": 1.6077300378117906e-05, + "loss": 0.9487, "step": 2994 }, { - "epoch": 1.97, - "learning_rate": 1.0511509239004013e-08, - "loss": 0.7055, + "epoch": 0.31344845630559914, + "grad_norm": 2.2062421090046342, + "learning_rate": 1.6074608104136838e-05, + "loss": 0.9532, "step": 2995 }, { - "epoch": 1.97, - "learning_rate": 1.0028367584512532e-08, - "loss": 0.7165, + "epoch": 0.31355311355311355, + "grad_norm": 2.0978845722449995, + "learning_rate": 1.6071915132170986e-05, + "loss": 0.8883, "step": 2996 }, { - "epoch": 1.97, - "learning_rate": 9.556586456916928e-09, - "loss": 0.7311, + "epoch": 0.31365777080062796, + "grad_norm": 2.1704288073650653, + "learning_rate": 1.606922146252977e-05, + "loss": 1.0379, "step": 2997 }, { - "epoch": 1.97, - "learning_rate": 9.096166392723593e-09, - "loss": 0.572, + "epoch": 0.31376242804814236, + "grad_norm": 2.0169657476089755, + "learning_rate": 1.6066527095522707e-05, + "loss": 1.0009, "step": 2998 }, { - "epoch": 1.97, - "learning_rate": 8.647107915519259e-09, - "loss": 0.6315, + "epoch": 0.3138670852956567, + "grad_norm": 2.256441777616629, + "learning_rate": 1.6063832031459384e-05, + "loss": 0.972, "step": 2999 }, { - "epoch": 1.97, - "learning_rate": 8.209411535969881e-09, - "loss": 0.5715, + "epoch": 0.3139717425431711, + "grad_norm": 1.8938052813988708, + "learning_rate": 1.6061136270649473e-05, + "loss": 0.9554, "step": 3000 }, { - "epoch": 1.98, - "learning_rate": 7.783077751820633e-09, - "loss": 0.5621, + "epoch": 0.3140763997906855, + "grad_norm": 2.5447202789705554, + "learning_rate": 1.6058439813402712e-05, + "loss": 1.0768, "step": 3001 }, { - "epoch": 1.98, - "learning_rate": 7.3681070478948125e-09, - "loss": 0.6666, + "epoch": 0.3141810570381999, + "grad_norm": 2.0871529984817965, + "learning_rate": 1.605574266002893e-05, + "loss": 0.9473, "step": 3002 }, { - "epoch": 1.98, - "learning_rate": 6.964499896096044e-09, - "loss": 0.672, + "epoch": 0.3142857142857143, + "grad_norm": 2.1668427542881097, + "learning_rate": 1.6053044810838048e-05, + "loss": 0.9402, "step": 3003 }, { - "epoch": 1.98, - "learning_rate": 6.572256755401629e-09, - "loss": 0.7563, + "epoch": 0.3143903715332287, + "grad_norm": 2.3159990985111936, + "learning_rate": 1.6050346266140046e-05, + "loss": 0.9726, "step": 3004 }, { - "epoch": 1.98, - "learning_rate": 6.191378071869203e-09, - "loss": 0.7738, + "epoch": 0.3144950287807431, + "grad_norm": 1.9154842831401457, + "learning_rate": 1.6047647026244988e-05, + "loss": 0.8467, "step": 3005 }, { - "epoch": 1.98, - "learning_rate": 5.821864278631184e-09, - "loss": 0.7616, + "epoch": 0.31459968602825744, + "grad_norm": 2.25244360448727, + "learning_rate": 1.604494709146303e-05, + "loss": 1.1072, "step": 3006 }, { - "epoch": 1.98, - "learning_rate": 5.463715795895885e-09, - "loss": 0.74, + "epoch": 0.31470434327577185, + "grad_norm": 2.207816514269716, + "learning_rate": 1.6042246462104394e-05, + "loss": 1.011, "step": 3007 }, { - "epoch": 1.98, - "learning_rate": 5.116933030946403e-09, - "loss": 0.6748, + "epoch": 0.31480900052328625, + "grad_norm": 2.5547164500222044, + "learning_rate": 1.6039545138479395e-05, + "loss": 0.968, "step": 3008 }, { - "epoch": 1.98, - "learning_rate": 4.781516378142837e-09, - "loss": 0.7536, + "epoch": 0.3149136577708006, + "grad_norm": 2.677040611649757, + "learning_rate": 1.603684312089841e-05, + "loss": 1.1204, "step": 3009 }, { - "epoch": 1.98, - "learning_rate": 4.45746621891785e-09, - "loss": 0.6959, + "epoch": 0.315018315018315, + "grad_norm": 1.9755495487026258, + "learning_rate": 1.6034140409671916e-05, + "loss": 0.97, "step": 3010 }, { - "epoch": 1.98, - "learning_rate": 4.144782921779999e-09, - "loss": 0.6405, + "epoch": 0.3151229722658294, + "grad_norm": 2.1587533164996566, + "learning_rate": 1.6031437005110456e-05, + "loss": 1.0208, "step": 3011 }, { - "epoch": 1.98, - "learning_rate": 3.843466842310406e-09, - "loss": 0.6195, + "epoch": 0.3152276295133438, + "grad_norm": 1.9806892809879053, + "learning_rate": 1.602873290752466e-05, + "loss": 1.1013, "step": 3012 }, { - "epoch": 1.98, - "learning_rate": 3.5535183231627523e-09, - "loss": 0.5632, + "epoch": 0.31533228676085817, + "grad_norm": 2.4483979214827745, + "learning_rate": 1.602602811722523e-05, + "loss": 1.1004, "step": 3013 }, { - "epoch": 1.98, - "learning_rate": 3.274937694065505e-09, - "loss": 0.7038, + "epoch": 0.3154369440083726, + "grad_norm": 2.5101769689868436, + "learning_rate": 1.602332263452295e-05, + "loss": 0.9379, "step": 3014 }, { - "epoch": 1.98, - "learning_rate": 3.0077252718185844e-09, - "loss": 0.5748, + "epoch": 0.315541601255887, + "grad_norm": 2.538507265258334, + "learning_rate": 1.602061645972869e-05, + "loss": 1.0772, "step": 3015 }, { - "epoch": 1.99, - "learning_rate": 2.751881360292252e-09, - "loss": 0.6874, + "epoch": 0.31564625850340133, + "grad_norm": 2.1327347713309415, + "learning_rate": 1.6017909593153394e-05, + "loss": 0.9862, "step": 3016 }, { - "epoch": 1.99, - "learning_rate": 2.507406250433775e-09, - "loss": 0.584, + "epoch": 0.31575091575091574, + "grad_norm": 1.8079317995730482, + "learning_rate": 1.601520203510809e-05, + "loss": 0.9613, "step": 3017 }, { - "epoch": 1.99, - "learning_rate": 2.27430022025521e-09, - "loss": 0.6519, + "epoch": 0.31585557299843015, + "grad_norm": 2.3477974076051424, + "learning_rate": 1.6012493785903874e-05, + "loss": 1.0938, "step": 3018 }, { - "epoch": 1.99, - "learning_rate": 2.0525635348456195e-09, - "loss": 0.6832, + "epoch": 0.31596023024594455, + "grad_norm": 2.3995917756166953, + "learning_rate": 1.6009784845851936e-05, + "loss": 0.9591, "step": 3019 }, { - "epoch": 1.99, - "learning_rate": 1.8421964463610776e-09, - "loss": 0.7397, + "epoch": 0.3160648874934589, + "grad_norm": 2.171025021213507, + "learning_rate": 1.6007075215263533e-05, + "loss": 1.0401, "step": 3020 }, { - "epoch": 1.99, - "learning_rate": 1.6431991940302206e-09, - "loss": 0.728, + "epoch": 0.3161695447409733, + "grad_norm": 2.453631613652852, + "learning_rate": 1.6004364894450015e-05, + "loss": 1.0872, "step": 3021 }, { - "epoch": 1.99, - "learning_rate": 1.455572004150918e-09, - "loss": 0.7637, + "epoch": 0.3162742019884877, + "grad_norm": 2.0649905270363003, + "learning_rate": 1.6001653883722794e-05, + "loss": 0.947, "step": 3022 }, { - "epoch": 1.99, - "learning_rate": 1.279315090091382e-09, - "loss": 0.6522, + "epoch": 0.3163788592360021, + "grad_norm": 1.9845232246558884, + "learning_rate": 1.5998942183393372e-05, + "loss": 1.0143, "step": 3023 }, { - "epoch": 1.99, - "learning_rate": 1.114428652290167e-09, - "loss": 0.7245, + "epoch": 0.31648351648351647, + "grad_norm": 2.027999737482337, + "learning_rate": 1.599622979377334e-05, + "loss": 1.058, "step": 3024 }, { - "epoch": 1.99, - "learning_rate": 9.609128782550604e-10, - "loss": 0.7767, + "epoch": 0.3165881737310309, + "grad_norm": 2.386479104325292, + "learning_rate": 1.5993516715174345e-05, + "loss": 1.0363, "step": 3025 }, { - "epoch": 1.99, - "learning_rate": 8.187679425630813e-10, - "loss": 0.6937, + "epoch": 0.3166928309785453, + "grad_norm": 2.026243723400283, + "learning_rate": 1.5990802947908132e-05, + "loss": 0.8436, "step": 3026 }, { - "epoch": 1.99, - "learning_rate": 6.879940068604818e-10, - "loss": 0.6176, + "epoch": 0.31679748822605963, + "grad_norm": 2.4461769884069486, + "learning_rate": 1.598808849228651e-05, + "loss": 1.0391, "step": 3027 }, { - "epoch": 1.99, - "learning_rate": 5.685912198627464e-10, - "loss": 0.7871, + "epoch": 0.31690214547357404, + "grad_norm": 2.735072516013726, + "learning_rate": 1.5985373348621384e-05, + "loss": 1.1361, "step": 3028 }, { - "epoch": 1.99, - "learning_rate": 4.605597173534815e-10, - "loss": 0.6383, + "epoch": 0.31700680272108844, + "grad_norm": 2.0367817571925677, + "learning_rate": 1.598265751722473e-05, + "loss": 1.1262, "step": 3029 }, { - "epoch": 1.99, - "learning_rate": 3.63899622184416e-10, - "loss": 0.6543, + "epoch": 0.31711145996860285, + "grad_norm": 2.389592094981779, + "learning_rate": 1.5979940998408593e-05, + "loss": 1.0193, "step": 3030 }, { - "epoch": 2.0, - "learning_rate": 2.786110442787315e-10, - "loss": 0.6971, + "epoch": 0.3172161172161172, + "grad_norm": 2.051597409307426, + "learning_rate": 1.597722379248512e-05, + "loss": 0.8997, "step": 3031 }, { - "epoch": 2.0, - "learning_rate": 2.0469408062440131e-10, - "loss": 0.7434, + "epoch": 0.3173207744636316, + "grad_norm": 1.8589251291767148, + "learning_rate": 1.597450589976651e-05, + "loss": 0.957, "step": 3032 }, { - "epoch": 2.0, - "learning_rate": 1.4214881527974123e-10, - "loss": 0.7192, + "epoch": 0.317425431711146, + "grad_norm": 2.2975676929699937, + "learning_rate": 1.5971787320565063e-05, + "loss": 0.9536, "step": 3033 }, { - "epoch": 2.0, - "learning_rate": 9.097531937118931e-11, - "loss": 0.6138, + "epoch": 0.31753008895866036, + "grad_norm": 2.2772719111714297, + "learning_rate": 1.5969068055193146e-05, + "loss": 1.008, "step": 3034 }, { - "epoch": 2.0, - "learning_rate": 5.1173651093305854e-11, - "loss": 0.6704, + "epoch": 0.31763474620617477, + "grad_norm": 2.348844355294729, + "learning_rate": 1.596634810396321e-05, + "loss": 0.9407, "step": 3035 }, { - "epoch": 2.0, - "learning_rate": 2.274385570655291e-11, - "loss": 0.7346, + "epoch": 0.3177394034536892, + "grad_norm": 2.1144033212390494, + "learning_rate": 1.5963627467187783e-05, + "loss": 0.913, "step": 3036 }, { - "epoch": 2.0, - "learning_rate": 5.685965542845395e-12, - "loss": 0.6479, + "epoch": 0.3178440607012036, + "grad_norm": 2.078038440024001, + "learning_rate": 1.5960906145179466e-05, + "loss": 0.9828, "step": 3037 }, { - "epoch": 2.0, - "learning_rate": 0.0, - "loss": 0.5033, + "epoch": 0.31794871794871793, + "grad_norm": 2.1282457724139667, + "learning_rate": 1.595818413825095e-05, + "loss": 0.9836, "step": 3038 }, { - "epoch": 2.0, - "step": 3038, - "total_flos": 148653022003200.0, - "train_loss": 1.016740475225166, - "train_runtime": 19333.4248, - "train_samples_per_second": 5.027, - "train_steps_per_second": 0.157 - } - ], - "max_steps": 3038, - "num_train_epochs": 2, - "total_flos": 148653022003200.0, + "epoch": 0.31805337519623234, + "grad_norm": 1.9799340852762877, + "learning_rate": 1.5955461446715002e-05, + "loss": 1.044, + "step": 3039 + }, + { + "epoch": 0.31815803244374674, + "grad_norm": 2.1983306415646693, + "learning_rate": 1.5952738070884458e-05, + "loss": 0.9124, + "step": 3040 + }, + { + "epoch": 0.31826268969126115, + "grad_norm": 1.974260011569216, + "learning_rate": 1.5950014011072238e-05, + "loss": 0.9891, + "step": 3041 + }, + { + "epoch": 0.3183673469387755, + "grad_norm": 2.310426349353939, + "learning_rate": 1.5947289267591354e-05, + "loss": 0.9872, + "step": 3042 + }, + { + "epoch": 0.3184720041862899, + "grad_norm": 1.9450362823631278, + "learning_rate": 1.5944563840754873e-05, + "loss": 1.0055, + "step": 3043 + }, + { + "epoch": 0.3185766614338043, + "grad_norm": 2.036067128143145, + "learning_rate": 1.5941837730875956e-05, + "loss": 0.9706, + "step": 3044 + }, + { + "epoch": 0.31868131868131866, + "grad_norm": 2.05039801626101, + "learning_rate": 1.593911093826784e-05, + "loss": 0.9528, + "step": 3045 + }, + { + "epoch": 0.31878597592883307, + "grad_norm": 2.2876059678714857, + "learning_rate": 1.5936383463243836e-05, + "loss": 0.9563, + "step": 3046 + }, + { + "epoch": 0.31889063317634747, + "grad_norm": 2.697037738761109, + "learning_rate": 1.5933655306117338e-05, + "loss": 1.0216, + "step": 3047 + }, + { + "epoch": 0.3189952904238619, + "grad_norm": 2.5022369955164385, + "learning_rate": 1.5930926467201816e-05, + "loss": 1.033, + "step": 3048 + }, + { + "epoch": 0.3190999476713762, + "grad_norm": 2.113886043265922, + "learning_rate": 1.592819694681082e-05, + "loss": 0.9792, + "step": 3049 + }, + { + "epoch": 0.31920460491889063, + "grad_norm": 2.237260875974751, + "learning_rate": 1.5925466745257977e-05, + "loss": 1.0271, + "step": 3050 + }, + { + "epoch": 0.31930926216640504, + "grad_norm": 2.1237623188008037, + "learning_rate": 1.592273586285699e-05, + "loss": 0.9983, + "step": 3051 + }, + { + "epoch": 0.3194139194139194, + "grad_norm": 2.2392068397320344, + "learning_rate": 1.5920004299921652e-05, + "loss": 1.0828, + "step": 3052 + }, + { + "epoch": 0.3195185766614338, + "grad_norm": 3.338921058059234, + "learning_rate": 1.5917272056765815e-05, + "loss": 1.0108, + "step": 3053 + }, + { + "epoch": 0.3196232339089482, + "grad_norm": 2.1076301049943647, + "learning_rate": 1.591453913370342e-05, + "loss": 1.0223, + "step": 3054 + }, + { + "epoch": 0.3197278911564626, + "grad_norm": 2.0503145112616825, + "learning_rate": 1.5911805531048495e-05, + "loss": 1.0396, + "step": 3055 + }, + { + "epoch": 0.31983254840397696, + "grad_norm": 2.4375226451961827, + "learning_rate": 1.5909071249115128e-05, + "loss": 1.07, + "step": 3056 + }, + { + "epoch": 0.31993720565149136, + "grad_norm": 1.9288776837663408, + "learning_rate": 1.5906336288217494e-05, + "loss": 0.9863, + "step": 3057 + }, + { + "epoch": 0.32004186289900577, + "grad_norm": 2.0943331121912605, + "learning_rate": 1.5903600648669847e-05, + "loss": 0.8811, + "step": 3058 + }, + { + "epoch": 0.3201465201465201, + "grad_norm": 2.0975483698938917, + "learning_rate": 1.590086433078652e-05, + "loss": 1.1083, + "step": 3059 + }, + { + "epoch": 0.3202511773940345, + "grad_norm": 1.991398727953297, + "learning_rate": 1.5898127334881916e-05, + "loss": 0.8839, + "step": 3060 + }, + { + "epoch": 0.32035583464154893, + "grad_norm": 2.2279462939438655, + "learning_rate": 1.5895389661270528e-05, + "loss": 0.9735, + "step": 3061 + }, + { + "epoch": 0.32046049188906334, + "grad_norm": 1.8857564494547183, + "learning_rate": 1.5892651310266912e-05, + "loss": 1.0815, + "step": 3062 + }, + { + "epoch": 0.3205651491365777, + "grad_norm": 2.1252881559677492, + "learning_rate": 1.5889912282185717e-05, + "loss": 0.956, + "step": 3063 + }, + { + "epoch": 0.3206698063840921, + "grad_norm": 2.2558822549332236, + "learning_rate": 1.588717257734166e-05, + "loss": 1.0757, + "step": 3064 + }, + { + "epoch": 0.3207744636316065, + "grad_norm": 2.3702017920147567, + "learning_rate": 1.588443219604954e-05, + "loss": 0.9591, + "step": 3065 + }, + { + "epoch": 0.3208791208791209, + "grad_norm": 1.9235192765898275, + "learning_rate": 1.5881691138624236e-05, + "loss": 1.0212, + "step": 3066 + }, + { + "epoch": 0.32098377812663526, + "grad_norm": 2.664761728680135, + "learning_rate": 1.5878949405380694e-05, + "loss": 1.1368, + "step": 3067 + }, + { + "epoch": 0.32108843537414966, + "grad_norm": 2.3349986003807683, + "learning_rate": 1.5876206996633943e-05, + "loss": 1.0861, + "step": 3068 + }, + { + "epoch": 0.32119309262166407, + "grad_norm": 2.0350639615647435, + "learning_rate": 1.58734639126991e-05, + "loss": 1.0298, + "step": 3069 + }, + { + "epoch": 0.3212977498691784, + "grad_norm": 2.1379630016372473, + "learning_rate": 1.5870720153891346e-05, + "loss": 1.0008, + "step": 3070 + }, + { + "epoch": 0.3214024071166928, + "grad_norm": 2.423558596633754, + "learning_rate": 1.5867975720525947e-05, + "loss": 0.9308, + "step": 3071 + }, + { + "epoch": 0.32150706436420723, + "grad_norm": 2.0816034578512728, + "learning_rate": 1.586523061291824e-05, + "loss": 0.9646, + "step": 3072 + }, + { + "epoch": 0.32161172161172163, + "grad_norm": 2.560250524049416, + "learning_rate": 1.5862484831383643e-05, + "loss": 1.0258, + "step": 3073 + }, + { + "epoch": 0.321716378859236, + "grad_norm": 2.239124234673666, + "learning_rate": 1.5859738376237657e-05, + "loss": 1.0865, + "step": 3074 + }, + { + "epoch": 0.3218210361067504, + "grad_norm": 2.3699409843476666, + "learning_rate": 1.5856991247795853e-05, + "loss": 0.9975, + "step": 3075 + }, + { + "epoch": 0.3219256933542648, + "grad_norm": 1.9916759495681915, + "learning_rate": 1.5854243446373877e-05, + "loss": 0.9162, + "step": 3076 + }, + { + "epoch": 0.32203035060177915, + "grad_norm": 1.9898290475403624, + "learning_rate": 1.5851494972287466e-05, + "loss": 1.0024, + "step": 3077 + }, + { + "epoch": 0.32213500784929355, + "grad_norm": 2.0511214319935456, + "learning_rate": 1.584874582585242e-05, + "loss": 1.0025, + "step": 3078 + }, + { + "epoch": 0.32223966509680796, + "grad_norm": 2.180082289189908, + "learning_rate": 1.5845996007384626e-05, + "loss": 0.7952, + "step": 3079 + }, + { + "epoch": 0.32234432234432236, + "grad_norm": 2.0373190957074314, + "learning_rate": 1.5843245517200033e-05, + "loss": 0.9029, + "step": 3080 + }, + { + "epoch": 0.3224489795918367, + "grad_norm": 2.2294041825628117, + "learning_rate": 1.5840494355614686e-05, + "loss": 1.1581, + "step": 3081 + }, + { + "epoch": 0.3225536368393511, + "grad_norm": 2.3966321362004512, + "learning_rate": 1.5837742522944698e-05, + "loss": 0.9631, + "step": 3082 + }, + { + "epoch": 0.3226582940868655, + "grad_norm": 1.924447534905052, + "learning_rate": 1.5834990019506263e-05, + "loss": 1.0266, + "step": 3083 + }, + { + "epoch": 0.32276295133437993, + "grad_norm": 2.3585671923673908, + "learning_rate": 1.583223684561564e-05, + "loss": 1.0525, + "step": 3084 + }, + { + "epoch": 0.3228676085818943, + "grad_norm": 2.3735070616891334, + "learning_rate": 1.5829483001589185e-05, + "loss": 1.0239, + "step": 3085 + }, + { + "epoch": 0.3229722658294087, + "grad_norm": 2.107789442162236, + "learning_rate": 1.582672848774331e-05, + "loss": 0.9487, + "step": 3086 + }, + { + "epoch": 0.3230769230769231, + "grad_norm": 1.9939952977167719, + "learning_rate": 1.5823973304394526e-05, + "loss": 0.9027, + "step": 3087 + }, + { + "epoch": 0.32318158032443745, + "grad_norm": 1.9853892321221516, + "learning_rate": 1.5821217451859402e-05, + "loss": 0.9402, + "step": 3088 + }, + { + "epoch": 0.32328623757195185, + "grad_norm": 2.294801256285806, + "learning_rate": 1.5818460930454588e-05, + "loss": 1.082, + "step": 3089 + }, + { + "epoch": 0.32339089481946626, + "grad_norm": 2.0584151362636267, + "learning_rate": 1.5815703740496823e-05, + "loss": 0.9276, + "step": 3090 + }, + { + "epoch": 0.32349555206698066, + "grad_norm": 1.9598606086672974, + "learning_rate": 1.5812945882302907e-05, + "loss": 0.9895, + "step": 3091 + }, + { + "epoch": 0.323600209314495, + "grad_norm": 2.0072911970323135, + "learning_rate": 1.5810187356189722e-05, + "loss": 1.0237, + "step": 3092 + }, + { + "epoch": 0.3237048665620094, + "grad_norm": 1.8808583035296527, + "learning_rate": 1.580742816247423e-05, + "loss": 0.9557, + "step": 3093 + }, + { + "epoch": 0.3238095238095238, + "grad_norm": 2.241856615594079, + "learning_rate": 1.5804668301473473e-05, + "loss": 1.0337, + "step": 3094 + }, + { + "epoch": 0.3239141810570382, + "grad_norm": 2.238604082081988, + "learning_rate": 1.5801907773504563e-05, + "loss": 1.0139, + "step": 3095 + }, + { + "epoch": 0.3240188383045526, + "grad_norm": 2.086041350119793, + "learning_rate": 1.5799146578884686e-05, + "loss": 1.0195, + "step": 3096 + }, + { + "epoch": 0.324123495552067, + "grad_norm": 2.177520822966867, + "learning_rate": 1.579638471793111e-05, + "loss": 0.9967, + "step": 3097 + }, + { + "epoch": 0.3242281527995814, + "grad_norm": 2.590324178614816, + "learning_rate": 1.579362219096118e-05, + "loss": 0.9962, + "step": 3098 + }, + { + "epoch": 0.32433281004709574, + "grad_norm": 2.0699609992470407, + "learning_rate": 1.5790858998292313e-05, + "loss": 1.0322, + "step": 3099 + }, + { + "epoch": 0.32443746729461015, + "grad_norm": 2.206119437570607, + "learning_rate": 1.5788095140242014e-05, + "loss": 1.0231, + "step": 3100 + }, + { + "epoch": 0.32454212454212455, + "grad_norm": 1.8212021688643432, + "learning_rate": 1.5785330617127844e-05, + "loss": 0.8309, + "step": 3101 + }, + { + "epoch": 0.3246467817896389, + "grad_norm": 2.2727201676845583, + "learning_rate": 1.578256542926746e-05, + "loss": 0.986, + "step": 3102 + }, + { + "epoch": 0.3247514390371533, + "grad_norm": 2.1449608431036684, + "learning_rate": 1.5779799576978584e-05, + "loss": 1.099, + "step": 3103 + }, + { + "epoch": 0.3248560962846677, + "grad_norm": 2.480440020807763, + "learning_rate": 1.5777033060579022e-05, + "loss": 0.9017, + "step": 3104 + }, + { + "epoch": 0.3249607535321821, + "grad_norm": 2.0367956477342166, + "learning_rate": 1.577426588038665e-05, + "loss": 0.8987, + "step": 3105 + }, + { + "epoch": 0.3250654107796965, + "grad_norm": 1.9815075457418991, + "learning_rate": 1.5771498036719418e-05, + "loss": 0.8948, + "step": 3106 + }, + { + "epoch": 0.3251700680272109, + "grad_norm": 1.9726747058162075, + "learning_rate": 1.5768729529895365e-05, + "loss": 0.9563, + "step": 3107 + }, + { + "epoch": 0.3252747252747253, + "grad_norm": 2.1307459003781295, + "learning_rate": 1.57659603602326e-05, + "loss": 0.9732, + "step": 3108 + }, + { + "epoch": 0.3253793825222397, + "grad_norm": 4.287887282310838, + "learning_rate": 1.5763190528049295e-05, + "loss": 1.1042, + "step": 3109 + }, + { + "epoch": 0.32548403976975404, + "grad_norm": 2.1285826245261483, + "learning_rate": 1.5760420033663716e-05, + "loss": 1.0633, + "step": 3110 + }, + { + "epoch": 0.32558869701726845, + "grad_norm": 1.8011470361015922, + "learning_rate": 1.5757648877394197e-05, + "loss": 0.8628, + "step": 3111 + }, + { + "epoch": 0.32569335426478285, + "grad_norm": 2.151338719221326, + "learning_rate": 1.575487705955915e-05, + "loss": 1.0653, + "step": 3112 + }, + { + "epoch": 0.3257980115122972, + "grad_norm": 2.1328861874541563, + "learning_rate": 1.5752104580477065e-05, + "loss": 1.11, + "step": 3113 + }, + { + "epoch": 0.3259026687598116, + "grad_norm": 2.088911019800362, + "learning_rate": 1.5749331440466497e-05, + "loss": 1.0583, + "step": 3114 + }, + { + "epoch": 0.326007326007326, + "grad_norm": 1.7497905672760596, + "learning_rate": 1.5746557639846095e-05, + "loss": 0.8168, + "step": 3115 + }, + { + "epoch": 0.3261119832548404, + "grad_norm": 2.0703455005124765, + "learning_rate": 1.5743783178934572e-05, + "loss": 0.9295, + "step": 3116 + }, + { + "epoch": 0.32621664050235477, + "grad_norm": 2.051050023116992, + "learning_rate": 1.5741008058050718e-05, + "loss": 0.9884, + "step": 3117 + }, + { + "epoch": 0.3263212977498692, + "grad_norm": 1.8426576087061717, + "learning_rate": 1.57382322775134e-05, + "loss": 0.9219, + "step": 3118 + }, + { + "epoch": 0.3264259549973836, + "grad_norm": 2.1973030476954123, + "learning_rate": 1.5735455837641556e-05, + "loss": 0.9935, + "step": 3119 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 2.390653054556527, + "learning_rate": 1.5732678738754214e-05, + "loss": 1.1316, + "step": 3120 + }, + { + "epoch": 0.32663526949241234, + "grad_norm": 2.099033446066326, + "learning_rate": 1.5729900981170464e-05, + "loss": 1.036, + "step": 3121 + }, + { + "epoch": 0.32673992673992674, + "grad_norm": 1.7457495629091635, + "learning_rate": 1.5727122565209474e-05, + "loss": 0.8337, + "step": 3122 + }, + { + "epoch": 0.32684458398744115, + "grad_norm": 2.5405638057373365, + "learning_rate": 1.5724343491190488e-05, + "loss": 1.019, + "step": 3123 + }, + { + "epoch": 0.3269492412349555, + "grad_norm": 2.2469815540311826, + "learning_rate": 1.5721563759432835e-05, + "loss": 1.0026, + "step": 3124 + }, + { + "epoch": 0.3270538984824699, + "grad_norm": 2.397277989964288, + "learning_rate": 1.5718783370255905e-05, + "loss": 0.9063, + "step": 3125 + }, + { + "epoch": 0.3271585557299843, + "grad_norm": 2.353318814929306, + "learning_rate": 1.5716002323979174e-05, + "loss": 1.194, + "step": 3126 + }, + { + "epoch": 0.3272632129774987, + "grad_norm": 2.028797944646975, + "learning_rate": 1.571322062092219e-05, + "loss": 0.7781, + "step": 3127 + }, + { + "epoch": 0.32736787022501307, + "grad_norm": 2.1763538986167346, + "learning_rate": 1.571043826140457e-05, + "loss": 1.0173, + "step": 3128 + }, + { + "epoch": 0.3274725274725275, + "grad_norm": 2.2559481805621866, + "learning_rate": 1.5707655245746022e-05, + "loss": 1.0209, + "step": 3129 + }, + { + "epoch": 0.3275771847200419, + "grad_norm": 2.266294866644528, + "learning_rate": 1.5704871574266315e-05, + "loss": 1.1683, + "step": 3130 + }, + { + "epoch": 0.32768184196755623, + "grad_norm": 2.1728502972455774, + "learning_rate": 1.5702087247285297e-05, + "loss": 1.0714, + "step": 3131 + }, + { + "epoch": 0.32778649921507064, + "grad_norm": 2.412631577860173, + "learning_rate": 1.5699302265122894e-05, + "loss": 1.0777, + "step": 3132 + }, + { + "epoch": 0.32789115646258504, + "grad_norm": 2.4781137882095403, + "learning_rate": 1.569651662809911e-05, + "loss": 1.086, + "step": 3133 + }, + { + "epoch": 0.32799581371009945, + "grad_norm": 2.2300712669478036, + "learning_rate": 1.5693730336534017e-05, + "loss": 1.0021, + "step": 3134 + }, + { + "epoch": 0.3281004709576138, + "grad_norm": 2.2052539997942655, + "learning_rate": 1.5690943390747764e-05, + "loss": 1.0284, + "step": 3135 + }, + { + "epoch": 0.3282051282051282, + "grad_norm": 2.4413624909490497, + "learning_rate": 1.5688155791060577e-05, + "loss": 1.0287, + "step": 3136 + }, + { + "epoch": 0.3283097854526426, + "grad_norm": 2.556415622005147, + "learning_rate": 1.5685367537792765e-05, + "loss": 1.075, + "step": 3137 + }, + { + "epoch": 0.32841444270015696, + "grad_norm": 2.1283762756864677, + "learning_rate": 1.568257863126469e-05, + "loss": 0.8205, + "step": 3138 + }, + { + "epoch": 0.32851909994767137, + "grad_norm": 2.232790862732562, + "learning_rate": 1.567978907179682e-05, + "loss": 1.0162, + "step": 3139 + }, + { + "epoch": 0.3286237571951858, + "grad_norm": 2.3136474249151537, + "learning_rate": 1.5676998859709663e-05, + "loss": 1.0353, + "step": 3140 + }, + { + "epoch": 0.3287284144427002, + "grad_norm": 2.638925386745023, + "learning_rate": 1.5674207995323828e-05, + "loss": 0.9493, + "step": 3141 + }, + { + "epoch": 0.32883307169021453, + "grad_norm": 2.506618787870084, + "learning_rate": 1.5671416478959995e-05, + "loss": 1.0422, + "step": 3142 + }, + { + "epoch": 0.32893772893772893, + "grad_norm": 2.0975015700370125, + "learning_rate": 1.5668624310938914e-05, + "loss": 0.9647, + "step": 3143 + }, + { + "epoch": 0.32904238618524334, + "grad_norm": 2.1397256829041456, + "learning_rate": 1.5665831491581405e-05, + "loss": 0.8599, + "step": 3144 + }, + { + "epoch": 0.3291470434327577, + "grad_norm": 2.63504903803654, + "learning_rate": 1.566303802120837e-05, + "loss": 1.0202, + "step": 3145 + }, + { + "epoch": 0.3292517006802721, + "grad_norm": 2.026871373608857, + "learning_rate": 1.566024390014079e-05, + "loss": 1.0174, + "step": 3146 + }, + { + "epoch": 0.3293563579277865, + "grad_norm": 2.222517598112396, + "learning_rate": 1.5657449128699708e-05, + "loss": 0.8657, + "step": 3147 + }, + { + "epoch": 0.3294610151753009, + "grad_norm": 1.8704721739634205, + "learning_rate": 1.5654653707206255e-05, + "loss": 1.0354, + "step": 3148 + }, + { + "epoch": 0.32956567242281526, + "grad_norm": 2.215919635887023, + "learning_rate": 1.5651857635981628e-05, + "loss": 1.0849, + "step": 3149 + }, + { + "epoch": 0.32967032967032966, + "grad_norm": 2.564246658121425, + "learning_rate": 1.56490609153471e-05, + "loss": 0.9606, + "step": 3150 + }, + { + "epoch": 0.32977498691784407, + "grad_norm": 2.1280666629677696, + "learning_rate": 1.564626354562402e-05, + "loss": 1.0398, + "step": 3151 + }, + { + "epoch": 0.3298796441653585, + "grad_norm": 2.007176609705347, + "learning_rate": 1.5643465527133815e-05, + "loss": 0.9364, + "step": 3152 + }, + { + "epoch": 0.3299843014128728, + "grad_norm": 2.374920213374514, + "learning_rate": 1.5640666860197975e-05, + "loss": 1.0571, + "step": 3153 + }, + { + "epoch": 0.33008895866038723, + "grad_norm": 1.9142213744766876, + "learning_rate": 1.5637867545138083e-05, + "loss": 0.96, + "step": 3154 + }, + { + "epoch": 0.33019361590790164, + "grad_norm": 2.2782769847496223, + "learning_rate": 1.563506758227578e-05, + "loss": 1.0653, + "step": 3155 + }, + { + "epoch": 0.330298273155416, + "grad_norm": 1.884020880571543, + "learning_rate": 1.563226697193279e-05, + "loss": 1.0129, + "step": 3156 + }, + { + "epoch": 0.3304029304029304, + "grad_norm": 2.2062503790267862, + "learning_rate": 1.5629465714430906e-05, + "loss": 0.9958, + "step": 3157 + }, + { + "epoch": 0.3305075876504448, + "grad_norm": 2.3997612816521086, + "learning_rate": 1.5626663810091997e-05, + "loss": 1.0093, + "step": 3158 + }, + { + "epoch": 0.3306122448979592, + "grad_norm": 2.103715083415718, + "learning_rate": 1.562386125923801e-05, + "loss": 0.9297, + "step": 3159 + }, + { + "epoch": 0.33071690214547356, + "grad_norm": 2.215995087737775, + "learning_rate": 1.5621058062190965e-05, + "loss": 0.9648, + "step": 3160 + }, + { + "epoch": 0.33082155939298796, + "grad_norm": 1.8692320023257354, + "learning_rate": 1.5618254219272954e-05, + "loss": 1.0103, + "step": 3161 + }, + { + "epoch": 0.33092621664050237, + "grad_norm": 2.0395101583342305, + "learning_rate": 1.561544973080614e-05, + "loss": 0.936, + "step": 3162 + }, + { + "epoch": 0.3310308738880167, + "grad_norm": 1.9092428105317705, + "learning_rate": 1.5612644597112773e-05, + "loss": 0.8791, + "step": 3163 + }, + { + "epoch": 0.3311355311355311, + "grad_norm": 2.340830105583403, + "learning_rate": 1.5609838818515162e-05, + "loss": 1.0334, + "step": 3164 + }, + { + "epoch": 0.33124018838304553, + "grad_norm": 2.500046118391191, + "learning_rate": 1.56070323953357e-05, + "loss": 0.8887, + "step": 3165 + }, + { + "epoch": 0.33134484563055994, + "grad_norm": 2.3218591928110794, + "learning_rate": 1.5604225327896846e-05, + "loss": 1.0837, + "step": 3166 + }, + { + "epoch": 0.3314495028780743, + "grad_norm": 2.119253835435931, + "learning_rate": 1.5601417616521143e-05, + "loss": 0.9716, + "step": 3167 + }, + { + "epoch": 0.3315541601255887, + "grad_norm": 2.038070808087544, + "learning_rate": 1.5598609261531202e-05, + "loss": 1.0377, + "step": 3168 + }, + { + "epoch": 0.3316588173731031, + "grad_norm": 2.4134269571581215, + "learning_rate": 1.5595800263249702e-05, + "loss": 0.8931, + "step": 3169 + }, + { + "epoch": 0.3317634746206175, + "grad_norm": 1.7317120142862439, + "learning_rate": 1.5592990621999414e-05, + "loss": 0.8002, + "step": 3170 + }, + { + "epoch": 0.33186813186813185, + "grad_norm": 2.150415859427035, + "learning_rate": 1.559018033810316e-05, + "loss": 0.8975, + "step": 3171 + }, + { + "epoch": 0.33197278911564626, + "grad_norm": 2.90864630369287, + "learning_rate": 1.5587369411883856e-05, + "loss": 0.998, + "step": 3172 + }, + { + "epoch": 0.33207744636316067, + "grad_norm": 2.271286673783965, + "learning_rate": 1.558455784366448e-05, + "loss": 1.021, + "step": 3173 + }, + { + "epoch": 0.332182103610675, + "grad_norm": 2.3150664350547885, + "learning_rate": 1.5581745633768086e-05, + "loss": 1.0001, + "step": 3174 + }, + { + "epoch": 0.3322867608581894, + "grad_norm": 3.363470350927157, + "learning_rate": 1.5578932782517804e-05, + "loss": 0.9165, + "step": 3175 + }, + { + "epoch": 0.33239141810570383, + "grad_norm": 2.08188408813949, + "learning_rate": 1.5576119290236836e-05, + "loss": 0.9887, + "step": 3176 + }, + { + "epoch": 0.33249607535321823, + "grad_norm": 1.930989000386139, + "learning_rate": 1.5573305157248454e-05, + "loss": 0.9224, + "step": 3177 + }, + { + "epoch": 0.3326007326007326, + "grad_norm": 1.9621642580428715, + "learning_rate": 1.5570490383876015e-05, + "loss": 0.9062, + "step": 3178 + }, + { + "epoch": 0.332705389848247, + "grad_norm": 1.9661202771421844, + "learning_rate": 1.5567674970442936e-05, + "loss": 0.8901, + "step": 3179 + }, + { + "epoch": 0.3328100470957614, + "grad_norm": 2.1572459606930625, + "learning_rate": 1.5564858917272716e-05, + "loss": 0.9869, + "step": 3180 + }, + { + "epoch": 0.33291470434327575, + "grad_norm": 2.474124091599375, + "learning_rate": 1.5562042224688926e-05, + "loss": 1.1371, + "step": 3181 + }, + { + "epoch": 0.33301936159079015, + "grad_norm": 1.8653639765364336, + "learning_rate": 1.555922489301521e-05, + "loss": 0.8485, + "step": 3182 + }, + { + "epoch": 0.33312401883830456, + "grad_norm": 2.1078524268043783, + "learning_rate": 1.555640692257528e-05, + "loss": 0.9803, + "step": 3183 + }, + { + "epoch": 0.33322867608581896, + "grad_norm": 2.139476652742253, + "learning_rate": 1.5553588313692936e-05, + "loss": 0.9596, + "step": 3184 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.141936726484649, + "learning_rate": 1.5550769066692036e-05, + "loss": 1.0432, + "step": 3185 + }, + { + "epoch": 0.3334379905808477, + "grad_norm": 2.12152468260362, + "learning_rate": 1.5547949181896515e-05, + "loss": 1.01, + "step": 3186 + }, + { + "epoch": 0.3335426478283621, + "grad_norm": 2.264842415629573, + "learning_rate": 1.5545128659630385e-05, + "loss": 1.0728, + "step": 3187 + }, + { + "epoch": 0.3336473050758765, + "grad_norm": 2.1304146804426924, + "learning_rate": 1.5542307500217734e-05, + "loss": 0.967, + "step": 3188 + }, + { + "epoch": 0.3337519623233909, + "grad_norm": 1.9276499028871452, + "learning_rate": 1.5539485703982714e-05, + "loss": 0.9121, + "step": 3189 + }, + { + "epoch": 0.3338566195709053, + "grad_norm": 1.9765479283756435, + "learning_rate": 1.553666327124956e-05, + "loss": 1.0808, + "step": 3190 + }, + { + "epoch": 0.3339612768184197, + "grad_norm": 1.7361909209283715, + "learning_rate": 1.553384020234257e-05, + "loss": 0.8643, + "step": 3191 + }, + { + "epoch": 0.33406593406593404, + "grad_norm": 2.1761684193729174, + "learning_rate": 1.553101649758612e-05, + "loss": 1.0361, + "step": 3192 + }, + { + "epoch": 0.33417059131344845, + "grad_norm": 2.523913526660812, + "learning_rate": 1.5528192157304663e-05, + "loss": 1.0898, + "step": 3193 + }, + { + "epoch": 0.33427524856096286, + "grad_norm": 2.118661622161171, + "learning_rate": 1.5525367181822722e-05, + "loss": 0.956, + "step": 3194 + }, + { + "epoch": 0.33437990580847726, + "grad_norm": 2.1545428502001123, + "learning_rate": 1.5522541571464894e-05, + "loss": 0.9939, + "step": 3195 + }, + { + "epoch": 0.3344845630559916, + "grad_norm": 2.2676647398119107, + "learning_rate": 1.5519715326555837e-05, + "loss": 1.0386, + "step": 3196 + }, + { + "epoch": 0.334589220303506, + "grad_norm": 2.06105181377489, + "learning_rate": 1.5516888447420303e-05, + "loss": 1.0757, + "step": 3197 + }, + { + "epoch": 0.3346938775510204, + "grad_norm": 2.3715768654167775, + "learning_rate": 1.5514060934383103e-05, + "loss": 0.997, + "step": 3198 + }, + { + "epoch": 0.3347985347985348, + "grad_norm": 2.027803037575869, + "learning_rate": 1.5511232787769124e-05, + "loss": 0.8679, + "step": 3199 + }, + { + "epoch": 0.3349031920460492, + "grad_norm": 2.1181232730913964, + "learning_rate": 1.5508404007903322e-05, + "loss": 0.9015, + "step": 3200 + }, + { + "epoch": 0.3350078492935636, + "grad_norm": 2.3226842222737365, + "learning_rate": 1.5505574595110734e-05, + "loss": 0.9824, + "step": 3201 + }, + { + "epoch": 0.335112506541078, + "grad_norm": 2.47655647815724, + "learning_rate": 1.5502744549716465e-05, + "loss": 0.9697, + "step": 3202 + }, + { + "epoch": 0.33521716378859234, + "grad_norm": 2.0566095932796986, + "learning_rate": 1.5499913872045694e-05, + "loss": 0.8571, + "step": 3203 + }, + { + "epoch": 0.33532182103610675, + "grad_norm": 2.28076401752358, + "learning_rate": 1.5497082562423665e-05, + "loss": 1.0462, + "step": 3204 + }, + { + "epoch": 0.33542647828362115, + "grad_norm": 2.3423053756381784, + "learning_rate": 1.5494250621175706e-05, + "loss": 1.01, + "step": 3205 + }, + { + "epoch": 0.3355311355311355, + "grad_norm": 2.5072831733976653, + "learning_rate": 1.5491418048627214e-05, + "loss": 1.0932, + "step": 3206 + }, + { + "epoch": 0.3356357927786499, + "grad_norm": 2.1432608031534723, + "learning_rate": 1.548858484510366e-05, + "loss": 0.8312, + "step": 3207 + }, + { + "epoch": 0.3357404500261643, + "grad_norm": 2.437167122394929, + "learning_rate": 1.5485751010930577e-05, + "loss": 0.927, + "step": 3208 + }, + { + "epoch": 0.3358451072736787, + "grad_norm": 2.0576201295605925, + "learning_rate": 1.5482916546433583e-05, + "loss": 1.0367, + "step": 3209 + }, + { + "epoch": 0.3359497645211931, + "grad_norm": 2.1847007438604362, + "learning_rate": 1.5480081451938362e-05, + "loss": 1.0442, + "step": 3210 + }, + { + "epoch": 0.3360544217687075, + "grad_norm": 2.068195651770208, + "learning_rate": 1.5477245727770672e-05, + "loss": 0.9245, + "step": 3211 + }, + { + "epoch": 0.3361590790162219, + "grad_norm": 2.1072128887111967, + "learning_rate": 1.5474409374256344e-05, + "loss": 0.9496, + "step": 3212 + }, + { + "epoch": 0.3362637362637363, + "grad_norm": 2.3804536046173914, + "learning_rate": 1.5471572391721282e-05, + "loss": 0.977, + "step": 3213 + }, + { + "epoch": 0.33636839351125064, + "grad_norm": 2.0895509232276406, + "learning_rate": 1.5468734780491465e-05, + "loss": 1.0336, + "step": 3214 + }, + { + "epoch": 0.33647305075876505, + "grad_norm": 2.168752460124443, + "learning_rate": 1.5465896540892932e-05, + "loss": 0.9361, + "step": 3215 + }, + { + "epoch": 0.33657770800627945, + "grad_norm": 2.1004970170203197, + "learning_rate": 1.5463057673251802e-05, + "loss": 0.9727, + "step": 3216 + }, + { + "epoch": 0.3366823652537938, + "grad_norm": 2.050190223227202, + "learning_rate": 1.5460218177894274e-05, + "loss": 0.9999, + "step": 3217 + }, + { + "epoch": 0.3367870225013082, + "grad_norm": 2.038917201160083, + "learning_rate": 1.5457378055146607e-05, + "loss": 0.962, + "step": 3218 + }, + { + "epoch": 0.3368916797488226, + "grad_norm": 2.0493286691810395, + "learning_rate": 1.545453730533514e-05, + "loss": 0.918, + "step": 3219 + }, + { + "epoch": 0.336996336996337, + "grad_norm": 2.009804376163448, + "learning_rate": 1.545169592878628e-05, + "loss": 0.7884, + "step": 3220 + }, + { + "epoch": 0.33710099424385137, + "grad_norm": 1.9418895146815451, + "learning_rate": 1.5448853925826505e-05, + "loss": 0.9334, + "step": 3221 + }, + { + "epoch": 0.3372056514913658, + "grad_norm": 2.0920719859447363, + "learning_rate": 1.5446011296782368e-05, + "loss": 0.9992, + "step": 3222 + }, + { + "epoch": 0.3373103087388802, + "grad_norm": 2.7317450031003876, + "learning_rate": 1.5443168041980488e-05, + "loss": 0.9853, + "step": 3223 + }, + { + "epoch": 0.33741496598639453, + "grad_norm": 2.5941883234023564, + "learning_rate": 1.544032416174757e-05, + "loss": 1.0523, + "step": 3224 + }, + { + "epoch": 0.33751962323390894, + "grad_norm": 2.2629384674782442, + "learning_rate": 1.5437479656410376e-05, + "loss": 1.1233, + "step": 3225 + }, + { + "epoch": 0.33762428048142334, + "grad_norm": 2.2335346692765126, + "learning_rate": 1.543463452629575e-05, + "loss": 0.9641, + "step": 3226 + }, + { + "epoch": 0.33772893772893775, + "grad_norm": 2.1221463470407427, + "learning_rate": 1.5431788771730597e-05, + "loss": 1.0066, + "step": 3227 + }, + { + "epoch": 0.3378335949764521, + "grad_norm": 2.2293540541218504, + "learning_rate": 1.5428942393041904e-05, + "loss": 1.0154, + "step": 3228 + }, + { + "epoch": 0.3379382522239665, + "grad_norm": 2.3900893182152845, + "learning_rate": 1.5426095390556725e-05, + "loss": 1.053, + "step": 3229 + }, + { + "epoch": 0.3380429094714809, + "grad_norm": 2.048307102090152, + "learning_rate": 1.542324776460218e-05, + "loss": 1.0966, + "step": 3230 + }, + { + "epoch": 0.33814756671899526, + "grad_norm": 2.141298333153114, + "learning_rate": 1.5420399515505478e-05, + "loss": 0.9943, + "step": 3231 + }, + { + "epoch": 0.33825222396650967, + "grad_norm": 2.1539784614556936, + "learning_rate": 1.5417550643593885e-05, + "loss": 1.0074, + "step": 3232 + }, + { + "epoch": 0.3383568812140241, + "grad_norm": 2.244060531894042, + "learning_rate": 1.5414701149194736e-05, + "loss": 1.0491, + "step": 3233 + }, + { + "epoch": 0.3384615384615385, + "grad_norm": 1.995913218973165, + "learning_rate": 1.541185103263545e-05, + "loss": 0.9373, + "step": 3234 + }, + { + "epoch": 0.33856619570905283, + "grad_norm": 2.172151133964098, + "learning_rate": 1.540900029424351e-05, + "loss": 0.9444, + "step": 3235 + }, + { + "epoch": 0.33867085295656724, + "grad_norm": 1.89676124574755, + "learning_rate": 1.5406148934346468e-05, + "loss": 0.8628, + "step": 3236 + }, + { + "epoch": 0.33877551020408164, + "grad_norm": 1.8828681953222985, + "learning_rate": 1.5403296953271958e-05, + "loss": 0.9545, + "step": 3237 + }, + { + "epoch": 0.33888016745159605, + "grad_norm": 1.8741424977674062, + "learning_rate": 1.5400444351347674e-05, + "loss": 0.966, + "step": 3238 + }, + { + "epoch": 0.3389848246991104, + "grad_norm": 2.0333171496050304, + "learning_rate": 1.5397591128901384e-05, + "loss": 0.9455, + "step": 3239 + }, + { + "epoch": 0.3390894819466248, + "grad_norm": 1.9816357166577598, + "learning_rate": 1.539473728626093e-05, + "loss": 1.0213, + "step": 3240 + }, + { + "epoch": 0.3391941391941392, + "grad_norm": 2.363598537983322, + "learning_rate": 1.539188282375423e-05, + "loss": 1.0232, + "step": 3241 + }, + { + "epoch": 0.33929879644165356, + "grad_norm": 2.0397959543670314, + "learning_rate": 1.538902774170926e-05, + "loss": 1.0472, + "step": 3242 + }, + { + "epoch": 0.33940345368916797, + "grad_norm": 2.070254879911, + "learning_rate": 1.538617204045408e-05, + "loss": 1.0323, + "step": 3243 + }, + { + "epoch": 0.3395081109366824, + "grad_norm": 2.0695439666681814, + "learning_rate": 1.538331572031681e-05, + "loss": 1.063, + "step": 3244 + }, + { + "epoch": 0.3396127681841968, + "grad_norm": 2.1305043728996567, + "learning_rate": 1.5380458781625654e-05, + "loss": 1.0096, + "step": 3245 + }, + { + "epoch": 0.33971742543171113, + "grad_norm": 1.967788896520279, + "learning_rate": 1.5377601224708875e-05, + "loss": 1.002, + "step": 3246 + }, + { + "epoch": 0.33982208267922553, + "grad_norm": 2.1322066005520592, + "learning_rate": 1.5374743049894813e-05, + "loss": 1.0679, + "step": 3247 + }, + { + "epoch": 0.33992673992673994, + "grad_norm": 1.8579885877502837, + "learning_rate": 1.537188425751188e-05, + "loss": 0.8455, + "step": 3248 + }, + { + "epoch": 0.3400313971742543, + "grad_norm": 2.15408723090576, + "learning_rate": 1.536902484788856e-05, + "loss": 1.0029, + "step": 3249 + }, + { + "epoch": 0.3401360544217687, + "grad_norm": 2.4599439352791537, + "learning_rate": 1.5366164821353392e-05, + "loss": 1.0669, + "step": 3250 + }, + { + "epoch": 0.3402407116692831, + "grad_norm": 2.2542203659980142, + "learning_rate": 1.5363304178235017e-05, + "loss": 1.0087, + "step": 3251 + }, + { + "epoch": 0.3403453689167975, + "grad_norm": 2.0949948404828187, + "learning_rate": 1.536044291886211e-05, + "loss": 0.9923, + "step": 3252 + }, + { + "epoch": 0.34045002616431186, + "grad_norm": 1.9315131178475988, + "learning_rate": 1.535758104356345e-05, + "loss": 1.0347, + "step": 3253 + }, + { + "epoch": 0.34055468341182626, + "grad_norm": 2.172260985606811, + "learning_rate": 1.535471855266787e-05, + "loss": 1.0161, + "step": 3254 + }, + { + "epoch": 0.34065934065934067, + "grad_norm": 2.03347702907374, + "learning_rate": 1.5351855446504268e-05, + "loss": 0.9756, + "step": 3255 + }, + { + "epoch": 0.3407639979068551, + "grad_norm": 2.1511761068591344, + "learning_rate": 1.534899172540163e-05, + "loss": 0.9812, + "step": 3256 + }, + { + "epoch": 0.3408686551543694, + "grad_norm": 2.014625870073615, + "learning_rate": 1.5346127389688996e-05, + "loss": 1.0583, + "step": 3257 + }, + { + "epoch": 0.34097331240188383, + "grad_norm": 2.424338507190857, + "learning_rate": 1.534326243969549e-05, + "loss": 1.1823, + "step": 3258 + }, + { + "epoch": 0.34107796964939824, + "grad_norm": 2.381135018727616, + "learning_rate": 1.5340396875750294e-05, + "loss": 0.9861, + "step": 3259 + }, + { + "epoch": 0.3411826268969126, + "grad_norm": 2.4725877348005585, + "learning_rate": 1.5337530698182674e-05, + "loss": 1.0594, + "step": 3260 + }, + { + "epoch": 0.341287284144427, + "grad_norm": 1.9709311757797843, + "learning_rate": 1.5334663907321957e-05, + "loss": 1.0043, + "step": 3261 + }, + { + "epoch": 0.3413919413919414, + "grad_norm": 2.0576277902991693, + "learning_rate": 1.533179650349754e-05, + "loss": 0.9401, + "step": 3262 + }, + { + "epoch": 0.3414965986394558, + "grad_norm": 1.9172132204679706, + "learning_rate": 1.5328928487038898e-05, + "loss": 1.0187, + "step": 3263 + }, + { + "epoch": 0.34160125588697016, + "grad_norm": 2.2358901429303026, + "learning_rate": 1.5326059858275568e-05, + "loss": 1.0068, + "step": 3264 + }, + { + "epoch": 0.34170591313448456, + "grad_norm": 2.130154051130339, + "learning_rate": 1.5323190617537165e-05, + "loss": 0.9645, + "step": 3265 + }, + { + "epoch": 0.34181057038199897, + "grad_norm": 1.9736640857121206, + "learning_rate": 1.5320320765153367e-05, + "loss": 0.9449, + "step": 3266 + }, + { + "epoch": 0.3419152276295133, + "grad_norm": 2.2610379556501226, + "learning_rate": 1.531745030145393e-05, + "loss": 0.9335, + "step": 3267 + }, + { + "epoch": 0.3420198848770277, + "grad_norm": 1.9304145684884284, + "learning_rate": 1.531457922676867e-05, + "loss": 0.9527, + "step": 3268 + }, + { + "epoch": 0.34212454212454213, + "grad_norm": 2.23937829886418, + "learning_rate": 1.531170754142749e-05, + "loss": 1.0275, + "step": 3269 + }, + { + "epoch": 0.34222919937205654, + "grad_norm": 1.952354365893699, + "learning_rate": 1.5308835245760342e-05, + "loss": 0.9735, + "step": 3270 + }, + { + "epoch": 0.3423338566195709, + "grad_norm": 1.7116086172760445, + "learning_rate": 1.530596234009726e-05, + "loss": 0.9834, + "step": 3271 + }, + { + "epoch": 0.3424385138670853, + "grad_norm": 2.209172716848948, + "learning_rate": 1.5303088824768347e-05, + "loss": 0.9872, + "step": 3272 + }, + { + "epoch": 0.3425431711145997, + "grad_norm": 2.2644113537174504, + "learning_rate": 1.5300214700103784e-05, + "loss": 1.0005, + "step": 3273 + }, + { + "epoch": 0.3426478283621141, + "grad_norm": 2.2147137441366063, + "learning_rate": 1.52973399664338e-05, + "loss": 1.0471, + "step": 3274 + }, + { + "epoch": 0.34275248560962845, + "grad_norm": 2.202945184782102, + "learning_rate": 1.5294464624088718e-05, + "loss": 1.0856, + "step": 3275 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 2.036998214551154, + "learning_rate": 1.529158867339892e-05, + "loss": 1.0183, + "step": 3276 + }, + { + "epoch": 0.34296180010465727, + "grad_norm": 2.3880333648717778, + "learning_rate": 1.5288712114694855e-05, + "loss": 0.9732, + "step": 3277 + }, + { + "epoch": 0.3430664573521716, + "grad_norm": 2.293141355535519, + "learning_rate": 1.5285834948307047e-05, + "loss": 1.0911, + "step": 3278 + }, + { + "epoch": 0.343171114599686, + "grad_norm": 2.3667237873101103, + "learning_rate": 1.5282957174566086e-05, + "loss": 1.0271, + "step": 3279 + }, + { + "epoch": 0.34327577184720043, + "grad_norm": 2.5802091802539118, + "learning_rate": 1.528007879380264e-05, + "loss": 1.0383, + "step": 3280 + }, + { + "epoch": 0.34338042909471483, + "grad_norm": 2.2984418484959037, + "learning_rate": 1.527719980634743e-05, + "loss": 1.0983, + "step": 3281 + }, + { + "epoch": 0.3434850863422292, + "grad_norm": 1.9988451780658998, + "learning_rate": 1.527432021253127e-05, + "loss": 0.9391, + "step": 3282 + }, + { + "epoch": 0.3435897435897436, + "grad_norm": 1.9979482563308306, + "learning_rate": 1.5271440012685027e-05, + "loss": 0.9803, + "step": 3283 + }, + { + "epoch": 0.343694400837258, + "grad_norm": 2.0462333311095144, + "learning_rate": 1.5268559207139637e-05, + "loss": 0.953, + "step": 3284 + }, + { + "epoch": 0.34379905808477235, + "grad_norm": 2.018679553660942, + "learning_rate": 1.5265677796226114e-05, + "loss": 1.0009, + "step": 3285 + }, + { + "epoch": 0.34390371533228675, + "grad_norm": 1.8547261935286408, + "learning_rate": 1.5262795780275538e-05, + "loss": 1.0313, + "step": 3286 + }, + { + "epoch": 0.34400837257980116, + "grad_norm": 2.0846595576576843, + "learning_rate": 1.5259913159619054e-05, + "loss": 0.9638, + "step": 3287 + }, + { + "epoch": 0.34411302982731556, + "grad_norm": 2.2092617207892458, + "learning_rate": 1.5257029934587888e-05, + "loss": 1.048, + "step": 3288 + }, + { + "epoch": 0.3442176870748299, + "grad_norm": 2.115010279212093, + "learning_rate": 1.5254146105513328e-05, + "loss": 1.139, + "step": 3289 + }, + { + "epoch": 0.3443223443223443, + "grad_norm": 2.0072225659373903, + "learning_rate": 1.5251261672726727e-05, + "loss": 1.0207, + "step": 3290 + }, + { + "epoch": 0.3444270015698587, + "grad_norm": 1.9343608998324062, + "learning_rate": 1.5248376636559514e-05, + "loss": 0.8169, + "step": 3291 + }, + { + "epoch": 0.3445316588173731, + "grad_norm": 2.2114081845502795, + "learning_rate": 1.5245490997343187e-05, + "loss": 1.0064, + "step": 3292 + }, + { + "epoch": 0.3446363160648875, + "grad_norm": 2.19450830303441, + "learning_rate": 1.5242604755409307e-05, + "loss": 1.0446, + "step": 3293 + }, + { + "epoch": 0.3447409733124019, + "grad_norm": 2.9828919291241727, + "learning_rate": 1.5239717911089518e-05, + "loss": 0.9943, + "step": 3294 + }, + { + "epoch": 0.3448456305599163, + "grad_norm": 1.8836484748965538, + "learning_rate": 1.5236830464715517e-05, + "loss": 0.8834, + "step": 3295 + }, + { + "epoch": 0.34495028780743064, + "grad_norm": 2.484034519489334, + "learning_rate": 1.523394241661908e-05, + "loss": 0.8373, + "step": 3296 + }, + { + "epoch": 0.34505494505494505, + "grad_norm": 2.3959300576508262, + "learning_rate": 1.5231053767132047e-05, + "loss": 0.9524, + "step": 3297 + }, + { + "epoch": 0.34515960230245946, + "grad_norm": 2.0058874784680447, + "learning_rate": 1.5228164516586332e-05, + "loss": 0.9646, + "step": 3298 + }, + { + "epoch": 0.34526425954997386, + "grad_norm": 2.1806341190742944, + "learning_rate": 1.5225274665313916e-05, + "loss": 0.9534, + "step": 3299 + }, + { + "epoch": 0.3453689167974882, + "grad_norm": 2.3809715607129274, + "learning_rate": 1.5222384213646847e-05, + "loss": 1.0433, + "step": 3300 + }, + { + "epoch": 0.3454735740450026, + "grad_norm": 2.243507765788451, + "learning_rate": 1.5219493161917248e-05, + "loss": 0.9947, + "step": 3301 + }, + { + "epoch": 0.345578231292517, + "grad_norm": 1.9486940004775435, + "learning_rate": 1.5216601510457305e-05, + "loss": 1.0537, + "step": 3302 + }, + { + "epoch": 0.3456828885400314, + "grad_norm": 2.1151878794379058, + "learning_rate": 1.5213709259599267e-05, + "loss": 0.9945, + "step": 3303 + }, + { + "epoch": 0.3457875457875458, + "grad_norm": 2.4085393603562073, + "learning_rate": 1.5210816409675473e-05, + "loss": 1.0041, + "step": 3304 + }, + { + "epoch": 0.3458922030350602, + "grad_norm": 2.0934987255320405, + "learning_rate": 1.5207922961018304e-05, + "loss": 1.0212, + "step": 3305 + }, + { + "epoch": 0.3459968602825746, + "grad_norm": 2.040087227874492, + "learning_rate": 1.5205028913960232e-05, + "loss": 1.045, + "step": 3306 + }, + { + "epoch": 0.34610151753008894, + "grad_norm": 2.0986542779184183, + "learning_rate": 1.5202134268833786e-05, + "loss": 0.9866, + "step": 3307 + }, + { + "epoch": 0.34620617477760335, + "grad_norm": 2.2898782266773425, + "learning_rate": 1.5199239025971567e-05, + "loss": 0.8739, + "step": 3308 + }, + { + "epoch": 0.34631083202511775, + "grad_norm": 2.7507453481175292, + "learning_rate": 1.5196343185706244e-05, + "loss": 0.9127, + "step": 3309 + }, + { + "epoch": 0.3464154892726321, + "grad_norm": 2.1750979583796366, + "learning_rate": 1.5193446748370551e-05, + "loss": 1.0279, + "step": 3310 + }, + { + "epoch": 0.3465201465201465, + "grad_norm": 2.0867343208501365, + "learning_rate": 1.5190549714297303e-05, + "loss": 1.0373, + "step": 3311 + }, + { + "epoch": 0.3466248037676609, + "grad_norm": 2.5327508046734626, + "learning_rate": 1.5187652083819369e-05, + "loss": 1.0135, + "step": 3312 + }, + { + "epoch": 0.3467294610151753, + "grad_norm": 2.1859927718890555, + "learning_rate": 1.5184753857269697e-05, + "loss": 0.9699, + "step": 3313 + }, + { + "epoch": 0.3468341182626897, + "grad_norm": 2.009327763025113, + "learning_rate": 1.518185503498129e-05, + "loss": 0.907, + "step": 3314 + }, + { + "epoch": 0.3469387755102041, + "grad_norm": 2.2224539969275643, + "learning_rate": 1.5178955617287239e-05, + "loss": 1.083, + "step": 3315 + }, + { + "epoch": 0.3470434327577185, + "grad_norm": 2.4096293598821177, + "learning_rate": 1.517605560452069e-05, + "loss": 0.8995, + "step": 3316 + }, + { + "epoch": 0.3471480900052329, + "grad_norm": 2.146816047792693, + "learning_rate": 1.5173154997014857e-05, + "loss": 0.8222, + "step": 3317 + }, + { + "epoch": 0.34725274725274724, + "grad_norm": 2.3069897827667623, + "learning_rate": 1.5170253795103025e-05, + "loss": 0.9993, + "step": 3318 + }, + { + "epoch": 0.34735740450026165, + "grad_norm": 1.74531938947694, + "learning_rate": 1.5167351999118554e-05, + "loss": 0.9175, + "step": 3319 + }, + { + "epoch": 0.34746206174777605, + "grad_norm": 2.1288546754174993, + "learning_rate": 1.5164449609394863e-05, + "loss": 1.0347, + "step": 3320 + }, + { + "epoch": 0.3475667189952904, + "grad_norm": 1.8501462739239363, + "learning_rate": 1.5161546626265442e-05, + "loss": 0.9374, + "step": 3321 + }, + { + "epoch": 0.3476713762428048, + "grad_norm": 2.416666658774087, + "learning_rate": 1.515864305006385e-05, + "loss": 1.1008, + "step": 3322 + }, + { + "epoch": 0.3477760334903192, + "grad_norm": 2.1683860693468913, + "learning_rate": 1.5155738881123711e-05, + "loss": 0.8261, + "step": 3323 + }, + { + "epoch": 0.3478806907378336, + "grad_norm": 1.9981929383306056, + "learning_rate": 1.5152834119778727e-05, + "loss": 0.8493, + "step": 3324 + }, + { + "epoch": 0.34798534798534797, + "grad_norm": 2.149137266048455, + "learning_rate": 1.5149928766362658e-05, + "loss": 1.0977, + "step": 3325 + }, + { + "epoch": 0.3480900052328624, + "grad_norm": 2.078026164203414, + "learning_rate": 1.5147022821209331e-05, + "loss": 1.107, + "step": 3326 + }, + { + "epoch": 0.3481946624803768, + "grad_norm": 2.3425230247962987, + "learning_rate": 1.5144116284652645e-05, + "loss": 0.9754, + "step": 3327 + }, + { + "epoch": 0.34829931972789113, + "grad_norm": 1.9091246891467695, + "learning_rate": 1.5141209157026576e-05, + "loss": 0.8426, + "step": 3328 + }, + { + "epoch": 0.34840397697540554, + "grad_norm": 1.9479723011505157, + "learning_rate": 1.5138301438665148e-05, + "loss": 1.0919, + "step": 3329 + }, + { + "epoch": 0.34850863422291994, + "grad_norm": 2.2310776562980412, + "learning_rate": 1.5135393129902469e-05, + "loss": 1.1552, + "step": 3330 + }, + { + "epoch": 0.34861329147043435, + "grad_norm": 2.0947428083855786, + "learning_rate": 1.513248423107271e-05, + "loss": 1.0404, + "step": 3331 + }, + { + "epoch": 0.3487179487179487, + "grad_norm": 2.066377865083883, + "learning_rate": 1.5129574742510107e-05, + "loss": 1.0278, + "step": 3332 + }, + { + "epoch": 0.3488226059654631, + "grad_norm": 2.099307559790499, + "learning_rate": 1.5126664664548969e-05, + "loss": 1.1413, + "step": 3333 + }, + { + "epoch": 0.3489272632129775, + "grad_norm": 2.154455920018568, + "learning_rate": 1.5123753997523665e-05, + "loss": 1.0198, + "step": 3334 + }, + { + "epoch": 0.34903192046049186, + "grad_norm": 2.2352534977710152, + "learning_rate": 1.5120842741768644e-05, + "loss": 1.0821, + "step": 3335 + }, + { + "epoch": 0.34913657770800627, + "grad_norm": 1.883735141874073, + "learning_rate": 1.5117930897618407e-05, + "loss": 1.0526, + "step": 3336 + }, + { + "epoch": 0.3492412349555207, + "grad_norm": 2.085232249057728, + "learning_rate": 1.5115018465407536e-05, + "loss": 0.9961, + "step": 3337 + }, + { + "epoch": 0.3493458922030351, + "grad_norm": 2.2405865672055567, + "learning_rate": 1.5112105445470677e-05, + "loss": 0.9606, + "step": 3338 + }, + { + "epoch": 0.34945054945054943, + "grad_norm": 2.210569750343716, + "learning_rate": 1.5109191838142537e-05, + "loss": 0.9826, + "step": 3339 + }, + { + "epoch": 0.34955520669806384, + "grad_norm": 1.8900569961244176, + "learning_rate": 1.5106277643757895e-05, + "loss": 0.9309, + "step": 3340 + }, + { + "epoch": 0.34965986394557824, + "grad_norm": 2.119499479604053, + "learning_rate": 1.5103362862651602e-05, + "loss": 0.8946, + "step": 3341 + }, + { + "epoch": 0.34976452119309265, + "grad_norm": 2.14866784367062, + "learning_rate": 1.5100447495158572e-05, + "loss": 1.0256, + "step": 3342 + }, + { + "epoch": 0.349869178440607, + "grad_norm": 2.21964636178506, + "learning_rate": 1.5097531541613784e-05, + "loss": 0.8199, + "step": 3343 + }, + { + "epoch": 0.3499738356881214, + "grad_norm": 2.0824336161678474, + "learning_rate": 1.5094615002352284e-05, + "loss": 1.0465, + "step": 3344 + }, + { + "epoch": 0.3500784929356358, + "grad_norm": 2.2906700002330354, + "learning_rate": 1.50916978777092e-05, + "loss": 1.0289, + "step": 3345 + }, + { + "epoch": 0.35018315018315016, + "grad_norm": 1.7870706054978838, + "learning_rate": 1.5088780168019704e-05, + "loss": 0.8907, + "step": 3346 + }, + { + "epoch": 0.35028780743066457, + "grad_norm": 2.350201821325944, + "learning_rate": 1.508586187361905e-05, + "loss": 0.9207, + "step": 3347 + }, + { + "epoch": 0.35039246467817897, + "grad_norm": 1.9237037149190483, + "learning_rate": 1.5082942994842557e-05, + "loss": 0.9421, + "step": 3348 + }, + { + "epoch": 0.3504971219256934, + "grad_norm": 2.6474938275921, + "learning_rate": 1.508002353202561e-05, + "loss": 1.0688, + "step": 3349 + }, + { + "epoch": 0.35060177917320773, + "grad_norm": 2.266954434535588, + "learning_rate": 1.5077103485503664e-05, + "loss": 0.938, + "step": 3350 + }, + { + "epoch": 0.35070643642072213, + "grad_norm": 1.838992898597147, + "learning_rate": 1.507418285561223e-05, + "loss": 1.0003, + "step": 3351 + }, + { + "epoch": 0.35081109366823654, + "grad_norm": 2.1043607397062702, + "learning_rate": 1.5071261642686903e-05, + "loss": 0.8642, + "step": 3352 + }, + { + "epoch": 0.3509157509157509, + "grad_norm": 2.092736361302439, + "learning_rate": 1.506833984706333e-05, + "loss": 1.0074, + "step": 3353 + }, + { + "epoch": 0.3510204081632653, + "grad_norm": 2.540716324430332, + "learning_rate": 1.5065417469077235e-05, + "loss": 1.112, + "step": 3354 + }, + { + "epoch": 0.3511250654107797, + "grad_norm": 2.2462673291428548, + "learning_rate": 1.5062494509064406e-05, + "loss": 1.053, + "step": 3355 + }, + { + "epoch": 0.3512297226582941, + "grad_norm": 1.9698347906672438, + "learning_rate": 1.5059570967360693e-05, + "loss": 0.9501, + "step": 3356 + }, + { + "epoch": 0.35133437990580846, + "grad_norm": 2.8732667059564707, + "learning_rate": 1.5056646844302018e-05, + "loss": 0.9699, + "step": 3357 + }, + { + "epoch": 0.35143903715332286, + "grad_norm": 2.300905830511158, + "learning_rate": 1.5053722140224371e-05, + "loss": 1.0558, + "step": 3358 + }, + { + "epoch": 0.35154369440083727, + "grad_norm": 2.1008549188317573, + "learning_rate": 1.5050796855463807e-05, + "loss": 1.0461, + "step": 3359 + }, + { + "epoch": 0.3516483516483517, + "grad_norm": 2.193971701711426, + "learning_rate": 1.5047870990356443e-05, + "loss": 0.8999, + "step": 3360 + }, + { + "epoch": 0.351753008895866, + "grad_norm": 1.9237206891325724, + "learning_rate": 1.5044944545238472e-05, + "loss": 0.919, + "step": 3361 + }, + { + "epoch": 0.35185766614338043, + "grad_norm": 2.3398493395367828, + "learning_rate": 1.5042017520446144e-05, + "loss": 0.9966, + "step": 3362 + }, + { + "epoch": 0.35196232339089484, + "grad_norm": 2.0245486796099414, + "learning_rate": 1.5039089916315778e-05, + "loss": 0.9552, + "step": 3363 + }, + { + "epoch": 0.3520669806384092, + "grad_norm": 2.146653040509672, + "learning_rate": 1.5036161733183772e-05, + "loss": 0.9682, + "step": 3364 + }, + { + "epoch": 0.3521716378859236, + "grad_norm": 2.3003979859770167, + "learning_rate": 1.5033232971386569e-05, + "loss": 0.9486, + "step": 3365 + }, + { + "epoch": 0.352276295133438, + "grad_norm": 2.080353917018897, + "learning_rate": 1.5030303631260691e-05, + "loss": 1.008, + "step": 3366 + }, + { + "epoch": 0.3523809523809524, + "grad_norm": 2.145495285206271, + "learning_rate": 1.5027373713142735e-05, + "loss": 0.8679, + "step": 3367 + }, + { + "epoch": 0.35248560962846676, + "grad_norm": 2.219416644891993, + "learning_rate": 1.5024443217369346e-05, + "loss": 0.8524, + "step": 3368 + }, + { + "epoch": 0.35259026687598116, + "grad_norm": 2.5932343785171432, + "learning_rate": 1.5021512144277246e-05, + "loss": 0.8825, + "step": 3369 + }, + { + "epoch": 0.35269492412349557, + "grad_norm": 2.2140426590236575, + "learning_rate": 1.5018580494203217e-05, + "loss": 1.0054, + "step": 3370 + }, + { + "epoch": 0.3527995813710099, + "grad_norm": 2.640222446891625, + "learning_rate": 1.501564826748412e-05, + "loss": 1.0553, + "step": 3371 + }, + { + "epoch": 0.3529042386185243, + "grad_norm": 2.6088669036618453, + "learning_rate": 1.5012715464456866e-05, + "loss": 0.9632, + "step": 3372 + }, + { + "epoch": 0.35300889586603873, + "grad_norm": 2.3393856800568513, + "learning_rate": 1.5009782085458447e-05, + "loss": 0.9854, + "step": 3373 + }, + { + "epoch": 0.35311355311355314, + "grad_norm": 2.13358684905417, + "learning_rate": 1.5006848130825909e-05, + "loss": 0.861, + "step": 3374 + }, + { + "epoch": 0.3532182103610675, + "grad_norm": 1.948963946557993, + "learning_rate": 1.500391360089637e-05, + "loss": 0.93, + "step": 3375 + }, + { + "epoch": 0.3533228676085819, + "grad_norm": 2.2621209126353374, + "learning_rate": 1.5000978496007013e-05, + "loss": 1.0111, + "step": 3376 + }, + { + "epoch": 0.3534275248560963, + "grad_norm": 2.175682145812267, + "learning_rate": 1.4998042816495091e-05, + "loss": 1.0821, + "step": 3377 + }, + { + "epoch": 0.35353218210361065, + "grad_norm": 2.118572813572351, + "learning_rate": 1.4995106562697917e-05, + "loss": 1.0052, + "step": 3378 + }, + { + "epoch": 0.35363683935112505, + "grad_norm": 2.2925720756150514, + "learning_rate": 1.4992169734952872e-05, + "loss": 0.8211, + "step": 3379 + }, + { + "epoch": 0.35374149659863946, + "grad_norm": 1.9505334994916146, + "learning_rate": 1.4989232333597404e-05, + "loss": 0.9554, + "step": 3380 + }, + { + "epoch": 0.35384615384615387, + "grad_norm": 2.0479847216050135, + "learning_rate": 1.4986294358969029e-05, + "loss": 1.0622, + "step": 3381 + }, + { + "epoch": 0.3539508110936682, + "grad_norm": 1.940644043048965, + "learning_rate": 1.498335581140532e-05, + "loss": 0.9721, + "step": 3382 + }, + { + "epoch": 0.3540554683411826, + "grad_norm": 2.357644955495196, + "learning_rate": 1.4980416691243929e-05, + "loss": 0.9845, + "step": 3383 + }, + { + "epoch": 0.35416012558869703, + "grad_norm": 1.9728875562735373, + "learning_rate": 1.4977476998822564e-05, + "loss": 0.9456, + "step": 3384 + }, + { + "epoch": 0.35426478283621143, + "grad_norm": 2.5759061498541773, + "learning_rate": 1.4974536734479e-05, + "loss": 1.0961, + "step": 3385 + }, + { + "epoch": 0.3543694400837258, + "grad_norm": 2.0650501070386436, + "learning_rate": 1.497159589855108e-05, + "loss": 0.9497, + "step": 3386 + }, + { + "epoch": 0.3544740973312402, + "grad_norm": 2.049511205390927, + "learning_rate": 1.4968654491376715e-05, + "loss": 0.9196, + "step": 3387 + }, + { + "epoch": 0.3545787545787546, + "grad_norm": 2.119804481767199, + "learning_rate": 1.4965712513293874e-05, + "loss": 1.1051, + "step": 3388 + }, + { + "epoch": 0.35468341182626895, + "grad_norm": 2.0198789773840153, + "learning_rate": 1.49627699646406e-05, + "loss": 0.9479, + "step": 3389 + }, + { + "epoch": 0.35478806907378335, + "grad_norm": 2.2514166159676376, + "learning_rate": 1.4959826845754996e-05, + "loss": 1.1349, + "step": 3390 + }, + { + "epoch": 0.35489272632129776, + "grad_norm": 1.9991992740675673, + "learning_rate": 1.4956883156975235e-05, + "loss": 0.9907, + "step": 3391 + }, + { + "epoch": 0.35499738356881216, + "grad_norm": 2.165858304353837, + "learning_rate": 1.495393889863955e-05, + "loss": 1.0128, + "step": 3392 + }, + { + "epoch": 0.3551020408163265, + "grad_norm": 2.046874393542093, + "learning_rate": 1.495099407108624e-05, + "loss": 0.9728, + "step": 3393 + }, + { + "epoch": 0.3552066980638409, + "grad_norm": 1.841861405528776, + "learning_rate": 1.4948048674653677e-05, + "loss": 0.9024, + "step": 3394 + }, + { + "epoch": 0.3553113553113553, + "grad_norm": 1.963788808953176, + "learning_rate": 1.4945102709680291e-05, + "loss": 1.047, + "step": 3395 + }, + { + "epoch": 0.3554160125588697, + "grad_norm": 1.8854311633911396, + "learning_rate": 1.4942156176504577e-05, + "loss": 0.9661, + "step": 3396 + }, + { + "epoch": 0.3555206698063841, + "grad_norm": 2.0001437303254312, + "learning_rate": 1.49392090754651e-05, + "loss": 1.0731, + "step": 3397 + }, + { + "epoch": 0.3556253270538985, + "grad_norm": 2.2429282145699645, + "learning_rate": 1.493626140690049e-05, + "loss": 1.0166, + "step": 3398 + }, + { + "epoch": 0.3557299843014129, + "grad_norm": 2.230032061722695, + "learning_rate": 1.4933313171149433e-05, + "loss": 0.9602, + "step": 3399 + }, + { + "epoch": 0.35583464154892724, + "grad_norm": 2.3611671076976517, + "learning_rate": 1.4930364368550697e-05, + "loss": 1.0514, + "step": 3400 + }, + { + "epoch": 0.35593929879644165, + "grad_norm": 1.9997981736421566, + "learning_rate": 1.4927414999443093e-05, + "loss": 0.8934, + "step": 3401 + }, + { + "epoch": 0.35604395604395606, + "grad_norm": 2.2260425699224378, + "learning_rate": 1.492446506416552e-05, + "loss": 1.105, + "step": 3402 + }, + { + "epoch": 0.35614861329147046, + "grad_norm": 2.6007886380070064, + "learning_rate": 1.4921514563056931e-05, + "loss": 1.0282, + "step": 3403 + }, + { + "epoch": 0.3562532705389848, + "grad_norm": 2.2911564149754664, + "learning_rate": 1.491856349645634e-05, + "loss": 0.9593, + "step": 3404 + }, + { + "epoch": 0.3563579277864992, + "grad_norm": 2.144042496081372, + "learning_rate": 1.4915611864702832e-05, + "loss": 1.0542, + "step": 3405 + }, + { + "epoch": 0.3564625850340136, + "grad_norm": 2.314520779479154, + "learning_rate": 1.4912659668135553e-05, + "loss": 1.0739, + "step": 3406 + }, + { + "epoch": 0.356567242281528, + "grad_norm": 2.0119651946546755, + "learning_rate": 1.4909706907093725e-05, + "loss": 1.0338, + "step": 3407 + }, + { + "epoch": 0.3566718995290424, + "grad_norm": 2.0594144199775517, + "learning_rate": 1.4906753581916618e-05, + "loss": 1.0535, + "step": 3408 + }, + { + "epoch": 0.3567765567765568, + "grad_norm": 2.628859441235232, + "learning_rate": 1.4903799692943575e-05, + "loss": 1.1152, + "step": 3409 + }, + { + "epoch": 0.3568812140240712, + "grad_norm": 2.3483961540859224, + "learning_rate": 1.4900845240514009e-05, + "loss": 1.0825, + "step": 3410 + }, + { + "epoch": 0.35698587127158554, + "grad_norm": 2.199102694351282, + "learning_rate": 1.4897890224967388e-05, + "loss": 1.0981, + "step": 3411 + }, + { + "epoch": 0.35709052851909995, + "grad_norm": 2.437840230574091, + "learning_rate": 1.4894934646643253e-05, + "loss": 1.0649, + "step": 3412 + }, + { + "epoch": 0.35719518576661435, + "grad_norm": 2.35758514512899, + "learning_rate": 1.4891978505881202e-05, + "loss": 1.0239, + "step": 3413 + }, + { + "epoch": 0.3572998430141287, + "grad_norm": 2.2981006350213304, + "learning_rate": 1.4889021803020904e-05, + "loss": 1.031, + "step": 3414 + }, + { + "epoch": 0.3574045002616431, + "grad_norm": 2.0708809030772484, + "learning_rate": 1.4886064538402091e-05, + "loss": 0.9587, + "step": 3415 + }, + { + "epoch": 0.3575091575091575, + "grad_norm": 2.454255588394301, + "learning_rate": 1.4883106712364558e-05, + "loss": 0.9636, + "step": 3416 + }, + { + "epoch": 0.3576138147566719, + "grad_norm": 1.870007612357461, + "learning_rate": 1.4880148325248163e-05, + "loss": 1.0022, + "step": 3417 + }, + { + "epoch": 0.35771847200418627, + "grad_norm": 2.1025070655550904, + "learning_rate": 1.4877189377392831e-05, + "loss": 1.0294, + "step": 3418 + }, + { + "epoch": 0.3578231292517007, + "grad_norm": 2.192703285083379, + "learning_rate": 1.4874229869138554e-05, + "loss": 1.1935, + "step": 3419 + }, + { + "epoch": 0.3579277864992151, + "grad_norm": 2.1921332205785555, + "learning_rate": 1.4871269800825384e-05, + "loss": 0.9556, + "step": 3420 + }, + { + "epoch": 0.35803244374672943, + "grad_norm": 2.3389520267003547, + "learning_rate": 1.486830917279344e-05, + "loss": 0.959, + "step": 3421 + }, + { + "epoch": 0.35813710099424384, + "grad_norm": 1.887005791860528, + "learning_rate": 1.4865347985382901e-05, + "loss": 0.9283, + "step": 3422 + }, + { + "epoch": 0.35824175824175825, + "grad_norm": 3.738025461337002, + "learning_rate": 1.4862386238934017e-05, + "loss": 1.0093, + "step": 3423 + }, + { + "epoch": 0.35834641548927265, + "grad_norm": 2.0931366465972268, + "learning_rate": 1.4859423933787097e-05, + "loss": 0.9058, + "step": 3424 + }, + { + "epoch": 0.358451072736787, + "grad_norm": 2.068043177767639, + "learning_rate": 1.4856461070282512e-05, + "loss": 1.044, + "step": 3425 + }, + { + "epoch": 0.3585557299843014, + "grad_norm": 2.2134904451583206, + "learning_rate": 1.4853497648760711e-05, + "loss": 0.8614, + "step": 3426 + }, + { + "epoch": 0.3586603872318158, + "grad_norm": 2.1848502762270368, + "learning_rate": 1.4850533669562189e-05, + "loss": 1.0403, + "step": 3427 + }, + { + "epoch": 0.3587650444793302, + "grad_norm": 2.190313212576901, + "learning_rate": 1.4847569133027512e-05, + "loss": 1.0629, + "step": 3428 + }, + { + "epoch": 0.35886970172684457, + "grad_norm": 2.2018800818289765, + "learning_rate": 1.4844604039497319e-05, + "loss": 1.0115, + "step": 3429 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 1.8297165257801538, + "learning_rate": 1.4841638389312298e-05, + "loss": 0.9768, + "step": 3430 + }, + { + "epoch": 0.3590790162218734, + "grad_norm": 2.0502371253182177, + "learning_rate": 1.4838672182813213e-05, + "loss": 0.8625, + "step": 3431 + }, + { + "epoch": 0.35918367346938773, + "grad_norm": 2.0678658262449994, + "learning_rate": 1.4835705420340884e-05, + "loss": 0.9285, + "step": 3432 + }, + { + "epoch": 0.35928833071690214, + "grad_norm": 2.097477714048872, + "learning_rate": 1.48327381022362e-05, + "loss": 0.9737, + "step": 3433 + }, + { + "epoch": 0.35939298796441654, + "grad_norm": 2.216587571402107, + "learning_rate": 1.4829770228840111e-05, + "loss": 0.9205, + "step": 3434 + }, + { + "epoch": 0.35949764521193095, + "grad_norm": 2.242990585095391, + "learning_rate": 1.482680180049363e-05, + "loss": 1.0677, + "step": 3435 + }, + { + "epoch": 0.3596023024594453, + "grad_norm": 2.1956250689462595, + "learning_rate": 1.4823832817537838e-05, + "loss": 0.9873, + "step": 3436 + }, + { + "epoch": 0.3597069597069597, + "grad_norm": 2.250186078331522, + "learning_rate": 1.4820863280313874e-05, + "loss": 0.9965, + "step": 3437 + }, + { + "epoch": 0.3598116169544741, + "grad_norm": 2.080456079650869, + "learning_rate": 1.4817893189162954e-05, + "loss": 1.0024, + "step": 3438 + }, + { + "epoch": 0.35991627420198846, + "grad_norm": 2.0093127130290576, + "learning_rate": 1.4814922544426332e-05, + "loss": 1.0008, + "step": 3439 + }, + { + "epoch": 0.36002093144950287, + "grad_norm": 2.2251021302294083, + "learning_rate": 1.481195134644535e-05, + "loss": 1.0177, + "step": 3440 + }, + { + "epoch": 0.3601255886970173, + "grad_norm": 1.9221853447634059, + "learning_rate": 1.4808979595561406e-05, + "loss": 0.8362, + "step": 3441 + }, + { + "epoch": 0.3602302459445317, + "grad_norm": 1.9001182786985462, + "learning_rate": 1.4806007292115955e-05, + "loss": 1.077, + "step": 3442 + }, + { + "epoch": 0.36033490319204603, + "grad_norm": 2.24243203183523, + "learning_rate": 1.4803034436450528e-05, + "loss": 0.9866, + "step": 3443 + }, + { + "epoch": 0.36043956043956044, + "grad_norm": 2.1248905323421217, + "learning_rate": 1.4800061028906703e-05, + "loss": 0.9718, + "step": 3444 + }, + { + "epoch": 0.36054421768707484, + "grad_norm": 2.1426511270144966, + "learning_rate": 1.4797087069826136e-05, + "loss": 1.0464, + "step": 3445 + }, + { + "epoch": 0.36064887493458925, + "grad_norm": 1.7983654364896233, + "learning_rate": 1.4794112559550542e-05, + "loss": 0.8639, + "step": 3446 + }, + { + "epoch": 0.3607535321821036, + "grad_norm": 2.053627748221558, + "learning_rate": 1.4791137498421692e-05, + "loss": 0.903, + "step": 3447 + }, + { + "epoch": 0.360858189429618, + "grad_norm": 1.8683934406849398, + "learning_rate": 1.4788161886781434e-05, + "loss": 0.867, + "step": 3448 + }, + { + "epoch": 0.3609628466771324, + "grad_norm": 2.101452076878475, + "learning_rate": 1.4785185724971667e-05, + "loss": 0.9869, + "step": 3449 + }, + { + "epoch": 0.36106750392464676, + "grad_norm": 2.496970829177601, + "learning_rate": 1.4782209013334364e-05, + "loss": 0.8874, + "step": 3450 + }, + { + "epoch": 0.36117216117216117, + "grad_norm": 1.9906643960387973, + "learning_rate": 1.4779231752211546e-05, + "loss": 1.0344, + "step": 3451 + }, + { + "epoch": 0.36127681841967557, + "grad_norm": 2.134773764897517, + "learning_rate": 1.477625394194531e-05, + "loss": 1.0026, + "step": 3452 + }, + { + "epoch": 0.36138147566719, + "grad_norm": 1.9360626015222449, + "learning_rate": 1.4773275582877818e-05, + "loss": 1.0467, + "step": 3453 + }, + { + "epoch": 0.36148613291470433, + "grad_norm": 2.3545030642754585, + "learning_rate": 1.477029667535128e-05, + "loss": 1.0638, + "step": 3454 + }, + { + "epoch": 0.36159079016221873, + "grad_norm": 2.178802837894888, + "learning_rate": 1.476731721970799e-05, + "loss": 0.9444, + "step": 3455 + }, + { + "epoch": 0.36169544740973314, + "grad_norm": 2.4728900427384155, + "learning_rate": 1.4764337216290283e-05, + "loss": 1.0585, + "step": 3456 + }, + { + "epoch": 0.3618001046572475, + "grad_norm": 2.203666741614066, + "learning_rate": 1.4761356665440571e-05, + "loss": 1.0357, + "step": 3457 + }, + { + "epoch": 0.3619047619047619, + "grad_norm": 2.1800575119431986, + "learning_rate": 1.475837556750133e-05, + "loss": 0.9192, + "step": 3458 + }, + { + "epoch": 0.3620094191522763, + "grad_norm": 2.237067611748822, + "learning_rate": 1.4755393922815086e-05, + "loss": 0.9418, + "step": 3459 + }, + { + "epoch": 0.3621140763997907, + "grad_norm": 2.0662472125409352, + "learning_rate": 1.4752411731724443e-05, + "loss": 1.0651, + "step": 3460 + }, + { + "epoch": 0.36221873364730506, + "grad_norm": 1.9800019356904621, + "learning_rate": 1.4749428994572058e-05, + "loss": 1.0229, + "step": 3461 + }, + { + "epoch": 0.36232339089481946, + "grad_norm": 2.193928271349309, + "learning_rate": 1.4746445711700648e-05, + "loss": 0.9627, + "step": 3462 + }, + { + "epoch": 0.36242804814233387, + "grad_norm": 2.1284831634544017, + "learning_rate": 1.474346188345301e-05, + "loss": 1.0074, + "step": 3463 + }, + { + "epoch": 0.3625327053898482, + "grad_norm": 1.9794414505430156, + "learning_rate": 1.4740477510171982e-05, + "loss": 0.9674, + "step": 3464 + }, + { + "epoch": 0.3626373626373626, + "grad_norm": 1.9781425949045437, + "learning_rate": 1.473749259220048e-05, + "loss": 0.9827, + "step": 3465 + }, + { + "epoch": 0.36274201988487703, + "grad_norm": 2.1680974375346174, + "learning_rate": 1.4734507129881473e-05, + "loss": 0.9404, + "step": 3466 + }, + { + "epoch": 0.36284667713239144, + "grad_norm": 2.104960303718753, + "learning_rate": 1.4731521123558e-05, + "loss": 0.9687, + "step": 3467 + }, + { + "epoch": 0.3629513343799058, + "grad_norm": 1.950557453989959, + "learning_rate": 1.4728534573573159e-05, + "loss": 0.766, + "step": 3468 + }, + { + "epoch": 0.3630559916274202, + "grad_norm": 2.2931497243072374, + "learning_rate": 1.472554748027011e-05, + "loss": 0.9558, + "step": 3469 + }, + { + "epoch": 0.3631606488749346, + "grad_norm": 2.114730292398457, + "learning_rate": 1.4722559843992075e-05, + "loss": 1.0599, + "step": 3470 + }, + { + "epoch": 0.363265306122449, + "grad_norm": 1.9387223713176414, + "learning_rate": 1.471957166508234e-05, + "loss": 0.9173, + "step": 3471 + }, + { + "epoch": 0.36336996336996336, + "grad_norm": 2.0196303025643934, + "learning_rate": 1.4716582943884254e-05, + "loss": 0.8972, + "step": 3472 + }, + { + "epoch": 0.36347462061747776, + "grad_norm": 2.3779507104223176, + "learning_rate": 1.4713593680741225e-05, + "loss": 1.0191, + "step": 3473 + }, + { + "epoch": 0.36357927786499217, + "grad_norm": 2.171151514960678, + "learning_rate": 1.4710603875996725e-05, + "loss": 0.9061, + "step": 3474 + }, + { + "epoch": 0.3636839351125065, + "grad_norm": 2.3376877060855974, + "learning_rate": 1.4707613529994294e-05, + "loss": 1.0131, + "step": 3475 + }, + { + "epoch": 0.3637885923600209, + "grad_norm": 2.240363414874548, + "learning_rate": 1.4704622643077524e-05, + "loss": 1.0021, + "step": 3476 + }, + { + "epoch": 0.36389324960753533, + "grad_norm": 2.2172322785021126, + "learning_rate": 1.4701631215590075e-05, + "loss": 1.0355, + "step": 3477 + }, + { + "epoch": 0.36399790685504974, + "grad_norm": 2.072175096832046, + "learning_rate": 1.4698639247875668e-05, + "loss": 1.0331, + "step": 3478 + }, + { + "epoch": 0.3641025641025641, + "grad_norm": 2.0743508713728844, + "learning_rate": 1.4695646740278084e-05, + "loss": 0.9647, + "step": 3479 + }, + { + "epoch": 0.3642072213500785, + "grad_norm": 1.8532767415518483, + "learning_rate": 1.4692653693141175e-05, + "loss": 0.8776, + "step": 3480 + }, + { + "epoch": 0.3643118785975929, + "grad_norm": 2.302001815845674, + "learning_rate": 1.468966010680884e-05, + "loss": 0.9075, + "step": 3481 + }, + { + "epoch": 0.36441653584510725, + "grad_norm": 2.394789643374174, + "learning_rate": 1.4686665981625055e-05, + "loss": 1.0686, + "step": 3482 + }, + { + "epoch": 0.36452119309262165, + "grad_norm": 2.2837151238361697, + "learning_rate": 1.4683671317933843e-05, + "loss": 1.0853, + "step": 3483 + }, + { + "epoch": 0.36462585034013606, + "grad_norm": 2.025193817081867, + "learning_rate": 1.4680676116079307e-05, + "loss": 0.9073, + "step": 3484 + }, + { + "epoch": 0.36473050758765047, + "grad_norm": 2.4029403299228274, + "learning_rate": 1.4677680376405592e-05, + "loss": 0.9786, + "step": 3485 + }, + { + "epoch": 0.3648351648351648, + "grad_norm": 2.275681739422216, + "learning_rate": 1.4674684099256923e-05, + "loss": 1.1143, + "step": 3486 + }, + { + "epoch": 0.3649398220826792, + "grad_norm": 2.2365915095598576, + "learning_rate": 1.4671687284977572e-05, + "loss": 0.9557, + "step": 3487 + }, + { + "epoch": 0.3650444793301936, + "grad_norm": 2.2324428792489495, + "learning_rate": 1.4668689933911882e-05, + "loss": 0.961, + "step": 3488 + }, + { + "epoch": 0.36514913657770803, + "grad_norm": 2.187984861710933, + "learning_rate": 1.4665692046404253e-05, + "loss": 1.063, + "step": 3489 + }, + { + "epoch": 0.3652537938252224, + "grad_norm": 2.2899556426594243, + "learning_rate": 1.4662693622799152e-05, + "loss": 1.0662, + "step": 3490 + }, + { + "epoch": 0.3653584510727368, + "grad_norm": 2.4136159700742716, + "learning_rate": 1.4659694663441102e-05, + "loss": 0.9803, + "step": 3491 + }, + { + "epoch": 0.3654631083202512, + "grad_norm": 1.7662751611483918, + "learning_rate": 1.4656695168674686e-05, + "loss": 0.8149, + "step": 3492 + }, + { + "epoch": 0.36556776556776555, + "grad_norm": 2.2806309857676226, + "learning_rate": 1.4653695138844556e-05, + "loss": 1.0818, + "step": 3493 + }, + { + "epoch": 0.36567242281527995, + "grad_norm": 1.7491999471330408, + "learning_rate": 1.4650694574295423e-05, + "loss": 0.8499, + "step": 3494 + }, + { + "epoch": 0.36577708006279436, + "grad_norm": 2.1237260413546526, + "learning_rate": 1.4647693475372054e-05, + "loss": 0.9252, + "step": 3495 + }, + { + "epoch": 0.36588173731030876, + "grad_norm": 2.0122406066004506, + "learning_rate": 1.4644691842419278e-05, + "loss": 0.9047, + "step": 3496 + }, + { + "epoch": 0.3659863945578231, + "grad_norm": 1.7997357542002177, + "learning_rate": 1.4641689675782e-05, + "loss": 0.9075, + "step": 3497 + }, + { + "epoch": 0.3660910518053375, + "grad_norm": 2.0095662635854126, + "learning_rate": 1.4638686975805168e-05, + "loss": 0.99, + "step": 3498 + }, + { + "epoch": 0.3661957090528519, + "grad_norm": 2.305964189521301, + "learning_rate": 1.4635683742833798e-05, + "loss": 0.9384, + "step": 3499 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 1.9289338620569885, + "learning_rate": 1.4632679977212965e-05, + "loss": 0.9879, + "step": 3500 + }, + { + "epoch": 0.3664050235478807, + "grad_norm": 2.134260629081426, + "learning_rate": 1.4629675679287814e-05, + "loss": 0.9906, + "step": 3501 + }, + { + "epoch": 0.3665096807953951, + "grad_norm": 2.125024277582387, + "learning_rate": 1.4626670849403541e-05, + "loss": 0.8551, + "step": 3502 + }, + { + "epoch": 0.3666143380429095, + "grad_norm": 2.014078167441046, + "learning_rate": 1.4623665487905409e-05, + "loss": 0.9331, + "step": 3503 + }, + { + "epoch": 0.36671899529042384, + "grad_norm": 2.3595548560106674, + "learning_rate": 1.462065959513874e-05, + "loss": 1.0379, + "step": 3504 + }, + { + "epoch": 0.36682365253793825, + "grad_norm": 2.9757237588936847, + "learning_rate": 1.4617653171448915e-05, + "loss": 1.1066, + "step": 3505 + }, + { + "epoch": 0.36692830978545266, + "grad_norm": 2.246060097728677, + "learning_rate": 1.4614646217181384e-05, + "loss": 1.0206, + "step": 3506 + }, + { + "epoch": 0.367032967032967, + "grad_norm": 2.1279065491723577, + "learning_rate": 1.461163873268164e-05, + "loss": 0.9343, + "step": 3507 + }, + { + "epoch": 0.3671376242804814, + "grad_norm": 2.0617313415147582, + "learning_rate": 1.4608630718295262e-05, + "loss": 1.0097, + "step": 3508 + }, + { + "epoch": 0.3672422815279958, + "grad_norm": 1.8166947694326565, + "learning_rate": 1.4605622174367873e-05, + "loss": 1.0153, + "step": 3509 + }, + { + "epoch": 0.3673469387755102, + "grad_norm": 2.1056303565410115, + "learning_rate": 1.4602613101245156e-05, + "loss": 1.0769, + "step": 3510 + }, + { + "epoch": 0.3674515960230246, + "grad_norm": 2.229739943150304, + "learning_rate": 1.4599603499272866e-05, + "loss": 1.0299, + "step": 3511 + }, + { + "epoch": 0.367556253270539, + "grad_norm": 2.1505713190979674, + "learning_rate": 1.4596593368796808e-05, + "loss": 1.0079, + "step": 3512 + }, + { + "epoch": 0.3676609105180534, + "grad_norm": 2.0137320566247445, + "learning_rate": 1.4593582710162854e-05, + "loss": 0.9401, + "step": 3513 + }, + { + "epoch": 0.3677655677655678, + "grad_norm": 1.97738685057115, + "learning_rate": 1.4590571523716932e-05, + "loss": 0.9479, + "step": 3514 + }, + { + "epoch": 0.36787022501308214, + "grad_norm": 2.1775568508800305, + "learning_rate": 1.4587559809805041e-05, + "loss": 0.8715, + "step": 3515 + }, + { + "epoch": 0.36797488226059655, + "grad_norm": 2.4652646105763663, + "learning_rate": 1.4584547568773225e-05, + "loss": 1.0586, + "step": 3516 + }, + { + "epoch": 0.36807953950811095, + "grad_norm": 2.2880185364374355, + "learning_rate": 1.4581534800967598e-05, + "loss": 0.9568, + "step": 3517 + }, + { + "epoch": 0.3681841967556253, + "grad_norm": 2.0626244211775986, + "learning_rate": 1.4578521506734337e-05, + "loss": 0.8189, + "step": 3518 + }, + { + "epoch": 0.3682888540031397, + "grad_norm": 2.0228867528318966, + "learning_rate": 1.4575507686419672e-05, + "loss": 0.9727, + "step": 3519 + }, + { + "epoch": 0.3683935112506541, + "grad_norm": 1.9220254744007452, + "learning_rate": 1.4572493340369899e-05, + "loss": 1.0084, + "step": 3520 + }, + { + "epoch": 0.3684981684981685, + "grad_norm": 2.2566198672292392, + "learning_rate": 1.4569478468931371e-05, + "loss": 1.0256, + "step": 3521 + }, + { + "epoch": 0.36860282574568287, + "grad_norm": 2.180361824104086, + "learning_rate": 1.4566463072450503e-05, + "loss": 1.044, + "step": 3522 + }, + { + "epoch": 0.3687074829931973, + "grad_norm": 2.623792129539169, + "learning_rate": 1.4563447151273774e-05, + "loss": 0.9307, + "step": 3523 + }, + { + "epoch": 0.3688121402407117, + "grad_norm": 2.4668276543972047, + "learning_rate": 1.4560430705747712e-05, + "loss": 0.9877, + "step": 3524 + }, + { + "epoch": 0.36891679748822603, + "grad_norm": 2.454886424734209, + "learning_rate": 1.4557413736218921e-05, + "loss": 0.8058, + "step": 3525 + }, + { + "epoch": 0.36902145473574044, + "grad_norm": 2.1383002692744557, + "learning_rate": 1.4554396243034052e-05, + "loss": 0.9637, + "step": 3526 + }, + { + "epoch": 0.36912611198325485, + "grad_norm": 2.187789113145796, + "learning_rate": 1.4551378226539821e-05, + "loss": 1.011, + "step": 3527 + }, + { + "epoch": 0.36923076923076925, + "grad_norm": 1.7592035312864471, + "learning_rate": 1.4548359687083005e-05, + "loss": 0.8919, + "step": 3528 + }, + { + "epoch": 0.3693354264782836, + "grad_norm": 2.1794650803355373, + "learning_rate": 1.454534062501044e-05, + "loss": 0.9867, + "step": 3529 + }, + { + "epoch": 0.369440083725798, + "grad_norm": 1.9320597793347067, + "learning_rate": 1.4542321040669025e-05, + "loss": 0.9989, + "step": 3530 + }, + { + "epoch": 0.3695447409733124, + "grad_norm": 1.9447012593805288, + "learning_rate": 1.4539300934405712e-05, + "loss": 0.9174, + "step": 3531 + }, + { + "epoch": 0.3696493982208268, + "grad_norm": 2.161309649226642, + "learning_rate": 1.4536280306567521e-05, + "loss": 1.0729, + "step": 3532 + }, + { + "epoch": 0.36975405546834117, + "grad_norm": 2.199055254223592, + "learning_rate": 1.4533259157501523e-05, + "loss": 0.9474, + "step": 3533 + }, + { + "epoch": 0.3698587127158556, + "grad_norm": 1.9653608066864128, + "learning_rate": 1.4530237487554863e-05, + "loss": 0.8146, + "step": 3534 + }, + { + "epoch": 0.36996336996337, + "grad_norm": 2.0938517524245803, + "learning_rate": 1.4527215297074729e-05, + "loss": 1.0321, + "step": 3535 + }, + { + "epoch": 0.37006802721088433, + "grad_norm": 2.2264094228758307, + "learning_rate": 1.452419258640838e-05, + "loss": 1.068, + "step": 3536 + }, + { + "epoch": 0.37017268445839874, + "grad_norm": 2.003519287512863, + "learning_rate": 1.4521169355903134e-05, + "loss": 1.0125, + "step": 3537 + }, + { + "epoch": 0.37027734170591314, + "grad_norm": 2.06288913711494, + "learning_rate": 1.451814560590636e-05, + "loss": 1.0217, + "step": 3538 + }, + { + "epoch": 0.37038199895342755, + "grad_norm": 2.32869949729458, + "learning_rate": 1.4515121336765499e-05, + "loss": 0.9788, + "step": 3539 + }, + { + "epoch": 0.3704866562009419, + "grad_norm": 2.2629636033796503, + "learning_rate": 1.4512096548828041e-05, + "loss": 1.0602, + "step": 3540 + }, + { + "epoch": 0.3705913134484563, + "grad_norm": 1.9185846266270503, + "learning_rate": 1.4509071242441543e-05, + "loss": 0.9762, + "step": 3541 + }, + { + "epoch": 0.3706959706959707, + "grad_norm": 2.2625877749763172, + "learning_rate": 1.4506045417953623e-05, + "loss": 1.0291, + "step": 3542 + }, + { + "epoch": 0.37080062794348506, + "grad_norm": 1.809617016801832, + "learning_rate": 1.4503019075711944e-05, + "loss": 0.9528, + "step": 3543 + }, + { + "epoch": 0.37090528519099947, + "grad_norm": 2.002346966236511, + "learning_rate": 1.4499992216064247e-05, + "loss": 0.8509, + "step": 3544 + }, + { + "epoch": 0.3710099424385139, + "grad_norm": 2.0335315203538697, + "learning_rate": 1.4496964839358322e-05, + "loss": 0.976, + "step": 3545 + }, + { + "epoch": 0.3711145996860283, + "grad_norm": 2.438768221681832, + "learning_rate": 1.449393694594202e-05, + "loss": 0.9592, + "step": 3546 + }, + { + "epoch": 0.37121925693354263, + "grad_norm": 2.188091743315195, + "learning_rate": 1.4490908536163256e-05, + "loss": 1.0567, + "step": 3547 + }, + { + "epoch": 0.37132391418105704, + "grad_norm": 2.0144776071078168, + "learning_rate": 1.448787961036999e-05, + "loss": 1.0472, + "step": 3548 + }, + { + "epoch": 0.37142857142857144, + "grad_norm": 1.9614612256190855, + "learning_rate": 1.4484850168910264e-05, + "loss": 0.8886, + "step": 3549 + }, + { + "epoch": 0.3715332286760858, + "grad_norm": 1.8138847376496081, + "learning_rate": 1.4481820212132163e-05, + "loss": 0.9632, + "step": 3550 + }, + { + "epoch": 0.3716378859236002, + "grad_norm": 2.2730125431428863, + "learning_rate": 1.4478789740383831e-05, + "loss": 0.9063, + "step": 3551 + }, + { + "epoch": 0.3717425431711146, + "grad_norm": 2.4175931682424947, + "learning_rate": 1.4475758754013481e-05, + "loss": 1.0576, + "step": 3552 + }, + { + "epoch": 0.371847200418629, + "grad_norm": 1.9765613042579417, + "learning_rate": 1.4472727253369374e-05, + "loss": 0.9173, + "step": 3553 + }, + { + "epoch": 0.37195185766614336, + "grad_norm": 2.105898251066037, + "learning_rate": 1.4469695238799841e-05, + "loss": 1.0151, + "step": 3554 + }, + { + "epoch": 0.37205651491365777, + "grad_norm": 2.063365637058352, + "learning_rate": 1.4466662710653261e-05, + "loss": 0.9934, + "step": 3555 + }, + { + "epoch": 0.37216117216117217, + "grad_norm": 2.16238034321692, + "learning_rate": 1.4463629669278083e-05, + "loss": 1.095, + "step": 3556 + }, + { + "epoch": 0.3722658294086866, + "grad_norm": 2.064593186035993, + "learning_rate": 1.4460596115022808e-05, + "loss": 0.93, + "step": 3557 + }, + { + "epoch": 0.3723704866562009, + "grad_norm": 2.1031716067377406, + "learning_rate": 1.4457562048235993e-05, + "loss": 1.1012, + "step": 3558 + }, + { + "epoch": 0.37247514390371533, + "grad_norm": 2.040768861352751, + "learning_rate": 1.4454527469266269e-05, + "loss": 1.0625, + "step": 3559 + }, + { + "epoch": 0.37257980115122974, + "grad_norm": 2.1485703086705508, + "learning_rate": 1.4451492378462302e-05, + "loss": 1.119, + "step": 3560 + }, + { + "epoch": 0.3726844583987441, + "grad_norm": 1.8403984678229408, + "learning_rate": 1.444845677617284e-05, + "loss": 0.9352, + "step": 3561 + }, + { + "epoch": 0.3727891156462585, + "grad_norm": 1.922757309168295, + "learning_rate": 1.4445420662746674e-05, + "loss": 1.0356, + "step": 3562 + }, + { + "epoch": 0.3728937728937729, + "grad_norm": 2.1213514343363564, + "learning_rate": 1.4442384038532664e-05, + "loss": 1.0686, + "step": 3563 + }, + { + "epoch": 0.3729984301412873, + "grad_norm": 2.112512593103979, + "learning_rate": 1.4439346903879724e-05, + "loss": 0.9844, + "step": 3564 + }, + { + "epoch": 0.37310308738880166, + "grad_norm": 2.1935515618029577, + "learning_rate": 1.443630925913682e-05, + "loss": 0.9552, + "step": 3565 + }, + { + "epoch": 0.37320774463631606, + "grad_norm": 1.963564429493465, + "learning_rate": 1.4433271104652993e-05, + "loss": 1.0212, + "step": 3566 + }, + { + "epoch": 0.37331240188383047, + "grad_norm": 2.1958400025455855, + "learning_rate": 1.4430232440777325e-05, + "loss": 1.0752, + "step": 3567 + }, + { + "epoch": 0.3734170591313448, + "grad_norm": 2.321242717364229, + "learning_rate": 1.4427193267858974e-05, + "loss": 0.9584, + "step": 3568 + }, + { + "epoch": 0.3735217163788592, + "grad_norm": 2.4134787950833174, + "learning_rate": 1.4424153586247139e-05, + "loss": 1.075, + "step": 3569 + }, + { + "epoch": 0.37362637362637363, + "grad_norm": 2.204301556211592, + "learning_rate": 1.4421113396291087e-05, + "loss": 1.0884, + "step": 3570 + }, + { + "epoch": 0.37373103087388804, + "grad_norm": 2.4179458922764874, + "learning_rate": 1.4418072698340147e-05, + "loss": 0.9831, + "step": 3571 + }, + { + "epoch": 0.3738356881214024, + "grad_norm": 2.07945845423551, + "learning_rate": 1.4415031492743692e-05, + "loss": 1.1162, + "step": 3572 + }, + { + "epoch": 0.3739403453689168, + "grad_norm": 2.383328131104694, + "learning_rate": 1.4411989779851175e-05, + "loss": 0.9214, + "step": 3573 + }, + { + "epoch": 0.3740450026164312, + "grad_norm": 2.811893175065896, + "learning_rate": 1.4408947560012081e-05, + "loss": 0.9125, + "step": 3574 + }, + { + "epoch": 0.3741496598639456, + "grad_norm": 2.177298784002885, + "learning_rate": 1.4405904833575979e-05, + "loss": 0.99, + "step": 3575 + }, + { + "epoch": 0.37425431711145996, + "grad_norm": 2.1489316661311437, + "learning_rate": 1.440286160089248e-05, + "loss": 1.0198, + "step": 3576 + }, + { + "epoch": 0.37435897435897436, + "grad_norm": 2.2742667087048436, + "learning_rate": 1.4399817862311255e-05, + "loss": 1.0415, + "step": 3577 + }, + { + "epoch": 0.37446363160648877, + "grad_norm": 2.2904104266630103, + "learning_rate": 1.439677361818204e-05, + "loss": 0.9554, + "step": 3578 + }, + { + "epoch": 0.3745682888540031, + "grad_norm": 2.0120333801365846, + "learning_rate": 1.4393728868854622e-05, + "loss": 0.9557, + "step": 3579 + }, + { + "epoch": 0.3746729461015175, + "grad_norm": 1.97838166233458, + "learning_rate": 1.4390683614678851e-05, + "loss": 0.9937, + "step": 3580 + }, + { + "epoch": 0.37477760334903193, + "grad_norm": 2.1059022035617803, + "learning_rate": 1.4387637856004633e-05, + "loss": 1.0074, + "step": 3581 + }, + { + "epoch": 0.37488226059654634, + "grad_norm": 2.0354065733173403, + "learning_rate": 1.438459159318193e-05, + "loss": 0.9762, + "step": 3582 + }, + { + "epoch": 0.3749869178440607, + "grad_norm": 2.2000318232269924, + "learning_rate": 1.4381544826560762e-05, + "loss": 0.9475, + "step": 3583 + }, + { + "epoch": 0.3750915750915751, + "grad_norm": 1.928430649502382, + "learning_rate": 1.4378497556491212e-05, + "loss": 0.8959, + "step": 3584 + }, + { + "epoch": 0.3751962323390895, + "grad_norm": 2.0181021559315897, + "learning_rate": 1.4375449783323419e-05, + "loss": 0.8834, + "step": 3585 + }, + { + "epoch": 0.37530088958660385, + "grad_norm": 2.179626276000838, + "learning_rate": 1.4372401507407575e-05, + "loss": 0.8578, + "step": 3586 + }, + { + "epoch": 0.37540554683411825, + "grad_norm": 1.9333891804031262, + "learning_rate": 1.436935272909393e-05, + "loss": 1.0699, + "step": 3587 + }, + { + "epoch": 0.37551020408163266, + "grad_norm": 1.9551604770253188, + "learning_rate": 1.4366303448732804e-05, + "loss": 0.9671, + "step": 3588 + }, + { + "epoch": 0.37561486132914707, + "grad_norm": 1.9683480948127803, + "learning_rate": 1.4363253666674559e-05, + "loss": 0.8561, + "step": 3589 + }, + { + "epoch": 0.3757195185766614, + "grad_norm": 2.2367501994279526, + "learning_rate": 1.4360203383269622e-05, + "loss": 0.8811, + "step": 3590 + }, + { + "epoch": 0.3758241758241758, + "grad_norm": 2.108577595208127, + "learning_rate": 1.4357152598868478e-05, + "loss": 1.0762, + "step": 3591 + }, + { + "epoch": 0.3759288330716902, + "grad_norm": 2.0801774600295357, + "learning_rate": 1.4354101313821666e-05, + "loss": 1.0351, + "step": 3592 + }, + { + "epoch": 0.3760334903192046, + "grad_norm": 1.9951115073973749, + "learning_rate": 1.4351049528479788e-05, + "loss": 0.9729, + "step": 3593 + }, + { + "epoch": 0.376138147566719, + "grad_norm": 2.254132484905743, + "learning_rate": 1.4347997243193497e-05, + "loss": 0.9553, + "step": 3594 + }, + { + "epoch": 0.3762428048142334, + "grad_norm": 2.2575494533039846, + "learning_rate": 1.434494445831351e-05, + "loss": 0.9477, + "step": 3595 + }, + { + "epoch": 0.3763474620617478, + "grad_norm": 2.411830867087975, + "learning_rate": 1.4341891174190594e-05, + "loss": 0.8515, + "step": 3596 + }, + { + "epoch": 0.37645211930926215, + "grad_norm": 2.0691126406810563, + "learning_rate": 1.4338837391175582e-05, + "loss": 0.9937, + "step": 3597 + }, + { + "epoch": 0.37655677655677655, + "grad_norm": 1.8560655060436033, + "learning_rate": 1.4335783109619356e-05, + "loss": 0.9589, + "step": 3598 + }, + { + "epoch": 0.37666143380429096, + "grad_norm": 1.9064645496752881, + "learning_rate": 1.4332728329872867e-05, + "loss": 1.0487, + "step": 3599 + }, + { + "epoch": 0.37676609105180536, + "grad_norm": 2.4222992304653923, + "learning_rate": 1.4329673052287105e-05, + "loss": 0.9782, + "step": 3600 + }, + { + "epoch": 0.3768707482993197, + "grad_norm": 2.0131408426702126, + "learning_rate": 1.432661727721313e-05, + "loss": 1.0806, + "step": 3601 + }, + { + "epoch": 0.3769754055468341, + "grad_norm": 2.0235190034405948, + "learning_rate": 1.4323561005002064e-05, + "loss": 0.8947, + "step": 3602 + }, + { + "epoch": 0.3770800627943485, + "grad_norm": 2.1458509322659545, + "learning_rate": 1.432050423600507e-05, + "loss": 1.0682, + "step": 3603 + }, + { + "epoch": 0.3771847200418629, + "grad_norm": 2.266595492288038, + "learning_rate": 1.4317446970573386e-05, + "loss": 0.9339, + "step": 3604 + }, + { + "epoch": 0.3772893772893773, + "grad_norm": 1.7601596273055828, + "learning_rate": 1.4314389209058287e-05, + "loss": 0.7858, + "step": 3605 + }, + { + "epoch": 0.3773940345368917, + "grad_norm": 1.9387994291788024, + "learning_rate": 1.4311330951811125e-05, + "loss": 0.9226, + "step": 3606 + }, + { + "epoch": 0.3774986917844061, + "grad_norm": 1.8920940023031685, + "learning_rate": 1.4308272199183296e-05, + "loss": 0.9885, + "step": 3607 + }, + { + "epoch": 0.37760334903192044, + "grad_norm": 2.058831017637809, + "learning_rate": 1.4305212951526255e-05, + "loss": 0.8887, + "step": 3608 + }, + { + "epoch": 0.37770800627943485, + "grad_norm": 2.025314142101822, + "learning_rate": 1.4302153209191518e-05, + "loss": 0.9362, + "step": 3609 + }, + { + "epoch": 0.37781266352694926, + "grad_norm": 2.202887178692797, + "learning_rate": 1.4299092972530656e-05, + "loss": 0.9994, + "step": 3610 + }, + { + "epoch": 0.3779173207744636, + "grad_norm": 2.1307137717760694, + "learning_rate": 1.42960322418953e-05, + "loss": 1.0086, + "step": 3611 + }, + { + "epoch": 0.378021978021978, + "grad_norm": 2.23112028651418, + "learning_rate": 1.4292971017637131e-05, + "loss": 1.0339, + "step": 3612 + }, + { + "epoch": 0.3781266352694924, + "grad_norm": 1.826253404688636, + "learning_rate": 1.4289909300107885e-05, + "loss": 0.8473, + "step": 3613 + }, + { + "epoch": 0.3782312925170068, + "grad_norm": 2.428885841856068, + "learning_rate": 1.4286847089659368e-05, + "loss": 1.001, + "step": 3614 + }, + { + "epoch": 0.3783359497645212, + "grad_norm": 1.8607496884224413, + "learning_rate": 1.4283784386643427e-05, + "loss": 1.0102, + "step": 3615 + }, + { + "epoch": 0.3784406070120356, + "grad_norm": 2.101434244434794, + "learning_rate": 1.4280721191411976e-05, + "loss": 1.0508, + "step": 3616 + }, + { + "epoch": 0.37854526425955, + "grad_norm": 1.9702696695086308, + "learning_rate": 1.4277657504316984e-05, + "loss": 0.9761, + "step": 3617 + }, + { + "epoch": 0.3786499215070644, + "grad_norm": 2.189847545110545, + "learning_rate": 1.427459332571047e-05, + "loss": 0.9961, + "step": 3618 + }, + { + "epoch": 0.37875457875457874, + "grad_norm": 2.30294474388609, + "learning_rate": 1.4271528655944522e-05, + "loss": 0.9599, + "step": 3619 + }, + { + "epoch": 0.37885923600209315, + "grad_norm": 1.8696799148064482, + "learning_rate": 1.4268463495371267e-05, + "loss": 1.0181, + "step": 3620 + }, + { + "epoch": 0.37896389324960755, + "grad_norm": 2.09128819824349, + "learning_rate": 1.426539784434291e-05, + "loss": 1.0629, + "step": 3621 + }, + { + "epoch": 0.3790685504971219, + "grad_norm": 2.0599948614085815, + "learning_rate": 1.4262331703211686e-05, + "loss": 0.9998, + "step": 3622 + }, + { + "epoch": 0.3791732077446363, + "grad_norm": 2.05462296115126, + "learning_rate": 1.4259265072329914e-05, + "loss": 0.8353, + "step": 3623 + }, + { + "epoch": 0.3792778649921507, + "grad_norm": 2.4818761789564814, + "learning_rate": 1.425619795204995e-05, + "loss": 0.885, + "step": 3624 + }, + { + "epoch": 0.3793825222396651, + "grad_norm": 1.9669887610252532, + "learning_rate": 1.425313034272421e-05, + "loss": 1.0076, + "step": 3625 + }, + { + "epoch": 0.37948717948717947, + "grad_norm": 2.4144006548804295, + "learning_rate": 1.4250062244705174e-05, + "loss": 1.0021, + "step": 3626 + }, + { + "epoch": 0.3795918367346939, + "grad_norm": 2.298079044624755, + "learning_rate": 1.4246993658345369e-05, + "loss": 0.8731, + "step": 3627 + }, + { + "epoch": 0.3796964939822083, + "grad_norm": 1.8240722916104797, + "learning_rate": 1.4243924583997386e-05, + "loss": 0.9051, + "step": 3628 + }, + { + "epoch": 0.37980115122972263, + "grad_norm": 2.5476300949512387, + "learning_rate": 1.4240855022013863e-05, + "loss": 1.1031, + "step": 3629 + }, + { + "epoch": 0.37990580847723704, + "grad_norm": 1.7527472576385803, + "learning_rate": 1.4237784972747501e-05, + "loss": 0.8347, + "step": 3630 + }, + { + "epoch": 0.38001046572475145, + "grad_norm": 2.0657909815224618, + "learning_rate": 1.4234714436551053e-05, + "loss": 0.853, + "step": 3631 + }, + { + "epoch": 0.38011512297226585, + "grad_norm": 2.0203292973651283, + "learning_rate": 1.4231643413777333e-05, + "loss": 0.9679, + "step": 3632 + }, + { + "epoch": 0.3802197802197802, + "grad_norm": 2.0882649368291553, + "learning_rate": 1.4228571904779209e-05, + "loss": 1.0813, + "step": 3633 + }, + { + "epoch": 0.3803244374672946, + "grad_norm": 2.1962421392589837, + "learning_rate": 1.4225499909909597e-05, + "loss": 1.0255, + "step": 3634 + }, + { + "epoch": 0.380429094714809, + "grad_norm": 2.0096595971812405, + "learning_rate": 1.4222427429521482e-05, + "loss": 0.9536, + "step": 3635 + }, + { + "epoch": 0.38053375196232336, + "grad_norm": 2.349899372814253, + "learning_rate": 1.4219354463967893e-05, + "loss": 0.9828, + "step": 3636 + }, + { + "epoch": 0.38063840920983777, + "grad_norm": 2.15612266191841, + "learning_rate": 1.4216281013601926e-05, + "loss": 0.8592, + "step": 3637 + }, + { + "epoch": 0.3807430664573522, + "grad_norm": 2.1616922957234164, + "learning_rate": 1.4213207078776723e-05, + "loss": 1.0161, + "step": 3638 + }, + { + "epoch": 0.3808477237048666, + "grad_norm": 2.0271980981355813, + "learning_rate": 1.4210132659845482e-05, + "loss": 0.8895, + "step": 3639 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 1.8793729122503329, + "learning_rate": 1.4207057757161465e-05, + "loss": 0.9433, + "step": 3640 + }, + { + "epoch": 0.38105703819989534, + "grad_norm": 2.2584716002700422, + "learning_rate": 1.4203982371077984e-05, + "loss": 0.9715, + "step": 3641 + }, + { + "epoch": 0.38116169544740974, + "grad_norm": 1.9923565504390046, + "learning_rate": 1.4200906501948405e-05, + "loss": 0.8118, + "step": 3642 + }, + { + "epoch": 0.38126635269492415, + "grad_norm": 2.3266543948061065, + "learning_rate": 1.4197830150126155e-05, + "loss": 1.0415, + "step": 3643 + }, + { + "epoch": 0.3813710099424385, + "grad_norm": 1.9354684638961517, + "learning_rate": 1.4194753315964707e-05, + "loss": 0.9977, + "step": 3644 + }, + { + "epoch": 0.3814756671899529, + "grad_norm": 2.0521405009628513, + "learning_rate": 1.4191675999817603e-05, + "loss": 0.9696, + "step": 3645 + }, + { + "epoch": 0.3815803244374673, + "grad_norm": 2.2085561140812775, + "learning_rate": 1.4188598202038428e-05, + "loss": 1.0361, + "step": 3646 + }, + { + "epoch": 0.38168498168498166, + "grad_norm": 2.447907112557791, + "learning_rate": 1.418551992298083e-05, + "loss": 1.0262, + "step": 3647 + }, + { + "epoch": 0.38178963893249607, + "grad_norm": 2.083006037799446, + "learning_rate": 1.4182441162998506e-05, + "loss": 0.9471, + "step": 3648 + }, + { + "epoch": 0.3818942961800105, + "grad_norm": 2.2012013601209466, + "learning_rate": 1.4179361922445214e-05, + "loss": 1.0669, + "step": 3649 + }, + { + "epoch": 0.3819989534275249, + "grad_norm": 2.356165760065059, + "learning_rate": 1.4176282201674764e-05, + "loss": 1.0236, + "step": 3650 + }, + { + "epoch": 0.38210361067503923, + "grad_norm": 2.1910213266631553, + "learning_rate": 1.4173202001041024e-05, + "loss": 0.9729, + "step": 3651 + }, + { + "epoch": 0.38220826792255364, + "grad_norm": 1.9619249365232363, + "learning_rate": 1.4170121320897911e-05, + "loss": 0.9726, + "step": 3652 + }, + { + "epoch": 0.38231292517006804, + "grad_norm": 2.7128459699808483, + "learning_rate": 1.416704016159941e-05, + "loss": 1.0244, + "step": 3653 + }, + { + "epoch": 0.3824175824175824, + "grad_norm": 1.9243530614458784, + "learning_rate": 1.4163958523499543e-05, + "loss": 0.8521, + "step": 3654 + }, + { + "epoch": 0.3825222396650968, + "grad_norm": 2.139320327183821, + "learning_rate": 1.4160876406952402e-05, + "loss": 0.9358, + "step": 3655 + }, + { + "epoch": 0.3826268969126112, + "grad_norm": 1.8395056038544089, + "learning_rate": 1.4157793812312126e-05, + "loss": 0.9061, + "step": 3656 + }, + { + "epoch": 0.3827315541601256, + "grad_norm": 2.094018606545468, + "learning_rate": 1.4154710739932913e-05, + "loss": 0.9147, + "step": 3657 + }, + { + "epoch": 0.38283621140763996, + "grad_norm": 2.0930445989438553, + "learning_rate": 1.4151627190169013e-05, + "loss": 0.9932, + "step": 3658 + }, + { + "epoch": 0.38294086865515437, + "grad_norm": 2.143172865705711, + "learning_rate": 1.4148543163374733e-05, + "loss": 0.9458, + "step": 3659 + }, + { + "epoch": 0.38304552590266877, + "grad_norm": 2.2091982738545033, + "learning_rate": 1.4145458659904433e-05, + "loss": 0.9759, + "step": 3660 + }, + { + "epoch": 0.3831501831501832, + "grad_norm": 1.9567813439206125, + "learning_rate": 1.4142373680112528e-05, + "loss": 0.9427, + "step": 3661 + }, + { + "epoch": 0.3832548403976975, + "grad_norm": 2.041638652930433, + "learning_rate": 1.4139288224353494e-05, + "loss": 0.9669, + "step": 3662 + }, + { + "epoch": 0.38335949764521193, + "grad_norm": 2.194450474916531, + "learning_rate": 1.4136202292981848e-05, + "loss": 0.8689, + "step": 3663 + }, + { + "epoch": 0.38346415489272634, + "grad_norm": 2.048372715974301, + "learning_rate": 1.4133115886352176e-05, + "loss": 0.8711, + "step": 3664 + }, + { + "epoch": 0.3835688121402407, + "grad_norm": 2.081252667466177, + "learning_rate": 1.4130029004819109e-05, + "loss": 0.9163, + "step": 3665 + }, + { + "epoch": 0.3836734693877551, + "grad_norm": 2.063327599869014, + "learning_rate": 1.4126941648737334e-05, + "loss": 0.9385, + "step": 3666 + }, + { + "epoch": 0.3837781266352695, + "grad_norm": 2.2359367310037133, + "learning_rate": 1.4123853818461601e-05, + "loss": 1.1035, + "step": 3667 + }, + { + "epoch": 0.3838827838827839, + "grad_norm": 1.7634923657138273, + "learning_rate": 1.4120765514346703e-05, + "loss": 0.8968, + "step": 3668 + }, + { + "epoch": 0.38398744113029826, + "grad_norm": 2.141693015056442, + "learning_rate": 1.4117676736747494e-05, + "loss": 0.921, + "step": 3669 + }, + { + "epoch": 0.38409209837781266, + "grad_norm": 2.1476056016940035, + "learning_rate": 1.411458748601888e-05, + "loss": 1.0029, + "step": 3670 + }, + { + "epoch": 0.38419675562532707, + "grad_norm": 1.9647392959113228, + "learning_rate": 1.4111497762515822e-05, + "loss": 0.9642, + "step": 3671 + }, + { + "epoch": 0.3843014128728414, + "grad_norm": 2.5121210210965006, + "learning_rate": 1.410840756659334e-05, + "loss": 1.1309, + "step": 3672 + }, + { + "epoch": 0.3844060701203558, + "grad_norm": 1.9954842133931197, + "learning_rate": 1.4105316898606499e-05, + "loss": 0.9308, + "step": 3673 + }, + { + "epoch": 0.38451072736787023, + "grad_norm": 2.1044462151613135, + "learning_rate": 1.410222575891042e-05, + "loss": 0.8752, + "step": 3674 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 2.0953131458808962, + "learning_rate": 1.4099134147860287e-05, + "loss": 0.6864, + "step": 3675 + }, + { + "epoch": 0.384720041862899, + "grad_norm": 2.0201216369124335, + "learning_rate": 1.4096042065811333e-05, + "loss": 1.0058, + "step": 3676 + }, + { + "epoch": 0.3848246991104134, + "grad_norm": 2.1104902948779034, + "learning_rate": 1.409294951311884e-05, + "loss": 0.9358, + "step": 3677 + }, + { + "epoch": 0.3849293563579278, + "grad_norm": 2.0782997552273783, + "learning_rate": 1.408985649013815e-05, + "loss": 1.0495, + "step": 3678 + }, + { + "epoch": 0.38503401360544215, + "grad_norm": 2.2354543394222226, + "learning_rate": 1.4086762997224658e-05, + "loss": 1.028, + "step": 3679 + }, + { + "epoch": 0.38513867085295656, + "grad_norm": 2.344608781601967, + "learning_rate": 1.4083669034733816e-05, + "loss": 1.1165, + "step": 3680 + }, + { + "epoch": 0.38524332810047096, + "grad_norm": 2.093693999681073, + "learning_rate": 1.4080574603021121e-05, + "loss": 0.9505, + "step": 3681 + }, + { + "epoch": 0.38534798534798537, + "grad_norm": 1.9846470070251416, + "learning_rate": 1.4077479702442132e-05, + "loss": 0.9975, + "step": 3682 + }, + { + "epoch": 0.3854526425954997, + "grad_norm": 1.9333444382455947, + "learning_rate": 1.407438433335246e-05, + "loss": 0.8526, + "step": 3683 + }, + { + "epoch": 0.3855572998430141, + "grad_norm": 2.020798413551147, + "learning_rate": 1.4071288496107769e-05, + "loss": 0.8515, + "step": 3684 + }, + { + "epoch": 0.38566195709052853, + "grad_norm": 2.008663061169858, + "learning_rate": 1.4068192191063777e-05, + "loss": 0.9006, + "step": 3685 + }, + { + "epoch": 0.38576661433804293, + "grad_norm": 2.117370458308192, + "learning_rate": 1.4065095418576254e-05, + "loss": 1.0621, + "step": 3686 + }, + { + "epoch": 0.3858712715855573, + "grad_norm": 2.4013394801880836, + "learning_rate": 1.4061998179001025e-05, + "loss": 1.0932, + "step": 3687 + }, + { + "epoch": 0.3859759288330717, + "grad_norm": 2.1143080149970603, + "learning_rate": 1.4058900472693973e-05, + "loss": 0.8957, + "step": 3688 + }, + { + "epoch": 0.3860805860805861, + "grad_norm": 2.075156804250162, + "learning_rate": 1.4055802300011027e-05, + "loss": 1.0239, + "step": 3689 + }, + { + "epoch": 0.38618524332810045, + "grad_norm": 2.8275247517173216, + "learning_rate": 1.4052703661308175e-05, + "loss": 1.0158, + "step": 3690 + }, + { + "epoch": 0.38628990057561485, + "grad_norm": 1.7962929732068007, + "learning_rate": 1.4049604556941457e-05, + "loss": 0.924, + "step": 3691 + }, + { + "epoch": 0.38639455782312926, + "grad_norm": 2.01783824353475, + "learning_rate": 1.4046504987266964e-05, + "loss": 0.9401, + "step": 3692 + }, + { + "epoch": 0.38649921507064366, + "grad_norm": 2.2187966441047435, + "learning_rate": 1.4043404952640848e-05, + "loss": 1.0663, + "step": 3693 + }, + { + "epoch": 0.386603872318158, + "grad_norm": 1.999279925714374, + "learning_rate": 1.4040304453419306e-05, + "loss": 0.9407, + "step": 3694 + }, + { + "epoch": 0.3867085295656724, + "grad_norm": 2.130348742018532, + "learning_rate": 1.4037203489958593e-05, + "loss": 0.9873, + "step": 3695 + }, + { + "epoch": 0.3868131868131868, + "grad_norm": 2.325652550191666, + "learning_rate": 1.4034102062615009e-05, + "loss": 0.8994, + "step": 3696 + }, + { + "epoch": 0.3869178440607012, + "grad_norm": 2.3197905613136345, + "learning_rate": 1.4031000171744924e-05, + "loss": 0.9815, + "step": 3697 + }, + { + "epoch": 0.3870225013082156, + "grad_norm": 2.2641945333562385, + "learning_rate": 1.402789781770475e-05, + "loss": 1.0339, + "step": 3698 + }, + { + "epoch": 0.38712715855573, + "grad_norm": 2.3574842229562614, + "learning_rate": 1.4024795000850946e-05, + "loss": 0.9534, + "step": 3699 + }, + { + "epoch": 0.3872318158032444, + "grad_norm": 2.0918637615880566, + "learning_rate": 1.4021691721540038e-05, + "loss": 0.9808, + "step": 3700 + }, + { + "epoch": 0.38733647305075875, + "grad_norm": 2.075992424556234, + "learning_rate": 1.4018587980128602e-05, + "loss": 0.8702, + "step": 3701 + }, + { + "epoch": 0.38744113029827315, + "grad_norm": 1.8782927333898265, + "learning_rate": 1.401548377697326e-05, + "loss": 1.0075, + "step": 3702 + }, + { + "epoch": 0.38754578754578756, + "grad_norm": 2.083887164652835, + "learning_rate": 1.4012379112430692e-05, + "loss": 1.0557, + "step": 3703 + }, + { + "epoch": 0.38765044479330196, + "grad_norm": 2.389475853756546, + "learning_rate": 1.4009273986857625e-05, + "loss": 0.9441, + "step": 3704 + }, + { + "epoch": 0.3877551020408163, + "grad_norm": 2.1864008731163795, + "learning_rate": 1.4006168400610853e-05, + "loss": 0.9095, + "step": 3705 + }, + { + "epoch": 0.3878597592883307, + "grad_norm": 2.1399945231613624, + "learning_rate": 1.4003062354047211e-05, + "loss": 1.0264, + "step": 3706 + }, + { + "epoch": 0.3879644165358451, + "grad_norm": 2.039186746148691, + "learning_rate": 1.3999955847523588e-05, + "loss": 1.0078, + "step": 3707 + }, + { + "epoch": 0.3880690737833595, + "grad_norm": 1.7528376320399341, + "learning_rate": 1.3996848881396932e-05, + "loss": 0.7449, + "step": 3708 + }, + { + "epoch": 0.3881737310308739, + "grad_norm": 2.4937767704582265, + "learning_rate": 1.3993741456024233e-05, + "loss": 0.9109, + "step": 3709 + }, + { + "epoch": 0.3882783882783883, + "grad_norm": 2.129798206261498, + "learning_rate": 1.3990633571762547e-05, + "loss": 1.1029, + "step": 3710 + }, + { + "epoch": 0.3883830455259027, + "grad_norm": 1.9957488140578195, + "learning_rate": 1.3987525228968972e-05, + "loss": 1.018, + "step": 3711 + }, + { + "epoch": 0.38848770277341704, + "grad_norm": 2.1354640527218947, + "learning_rate": 1.398441642800067e-05, + "loss": 0.8997, + "step": 3712 + }, + { + "epoch": 0.38859236002093145, + "grad_norm": 1.964424076142884, + "learning_rate": 1.3981307169214837e-05, + "loss": 0.9455, + "step": 3713 + }, + { + "epoch": 0.38869701726844585, + "grad_norm": 2.221101115356642, + "learning_rate": 1.397819745296874e-05, + "loss": 1.0268, + "step": 3714 + }, + { + "epoch": 0.3888016745159602, + "grad_norm": 2.3833418030736553, + "learning_rate": 1.3975087279619695e-05, + "loss": 1.04, + "step": 3715 + }, + { + "epoch": 0.3889063317634746, + "grad_norm": 1.9463675227198336, + "learning_rate": 1.397197664952506e-05, + "loss": 0.9108, + "step": 3716 + }, + { + "epoch": 0.389010989010989, + "grad_norm": 2.104522166089339, + "learning_rate": 1.3968865563042255e-05, + "loss": 1.0029, + "step": 3717 + }, + { + "epoch": 0.3891156462585034, + "grad_norm": 2.2293325863509623, + "learning_rate": 1.3965754020528755e-05, + "loss": 1.0006, + "step": 3718 + }, + { + "epoch": 0.3892203035060178, + "grad_norm": 2.225977481709003, + "learning_rate": 1.3962642022342075e-05, + "loss": 0.9976, + "step": 3719 + }, + { + "epoch": 0.3893249607535322, + "grad_norm": 2.1371606373643983, + "learning_rate": 1.3959529568839796e-05, + "loss": 0.9584, + "step": 3720 + }, + { + "epoch": 0.3894296180010466, + "grad_norm": 2.1984571676095217, + "learning_rate": 1.3956416660379542e-05, + "loss": 0.9966, + "step": 3721 + }, + { + "epoch": 0.389534275248561, + "grad_norm": 2.3612095316369994, + "learning_rate": 1.3953303297318992e-05, + "loss": 1.0549, + "step": 3722 + }, + { + "epoch": 0.38963893249607534, + "grad_norm": 2.002770628808304, + "learning_rate": 1.3950189480015879e-05, + "loss": 0.9726, + "step": 3723 + }, + { + "epoch": 0.38974358974358975, + "grad_norm": 2.2409687498155817, + "learning_rate": 1.3947075208827986e-05, + "loss": 1.051, + "step": 3724 + }, + { + "epoch": 0.38984824699110415, + "grad_norm": 2.4369275610874417, + "learning_rate": 1.3943960484113154e-05, + "loss": 1.0222, + "step": 3725 + }, + { + "epoch": 0.3899529042386185, + "grad_norm": 2.001381883429874, + "learning_rate": 1.3940845306229263e-05, + "loss": 0.8523, + "step": 3726 + }, + { + "epoch": 0.3900575614861329, + "grad_norm": 2.379959624189031, + "learning_rate": 1.3937729675534259e-05, + "loss": 1.0068, + "step": 3727 + }, + { + "epoch": 0.3901622187336473, + "grad_norm": 2.065993568806048, + "learning_rate": 1.3934613592386133e-05, + "loss": 0.9928, + "step": 3728 + }, + { + "epoch": 0.3902668759811617, + "grad_norm": 2.14687057918789, + "learning_rate": 1.393149705714293e-05, + "loss": 0.9085, + "step": 3729 + }, + { + "epoch": 0.39037153322867607, + "grad_norm": 2.056457087459079, + "learning_rate": 1.3928380070162743e-05, + "loss": 1.031, + "step": 3730 + }, + { + "epoch": 0.3904761904761905, + "grad_norm": 2.3376645905376447, + "learning_rate": 1.3925262631803722e-05, + "loss": 1.0316, + "step": 3731 + }, + { + "epoch": 0.3905808477237049, + "grad_norm": 2.345112786800701, + "learning_rate": 1.392214474242407e-05, + "loss": 1.0802, + "step": 3732 + }, + { + "epoch": 0.39068550497121923, + "grad_norm": 2.0352455265227443, + "learning_rate": 1.3919026402382034e-05, + "loss": 0.9909, + "step": 3733 + }, + { + "epoch": 0.39079016221873364, + "grad_norm": 2.0459097619442863, + "learning_rate": 1.3915907612035922e-05, + "loss": 0.9397, + "step": 3734 + }, + { + "epoch": 0.39089481946624804, + "grad_norm": 1.954972542902261, + "learning_rate": 1.3912788371744084e-05, + "loss": 0.825, + "step": 3735 + }, + { + "epoch": 0.39099947671376245, + "grad_norm": 2.1145543797342983, + "learning_rate": 1.3909668681864932e-05, + "loss": 1.0745, + "step": 3736 + }, + { + "epoch": 0.3911041339612768, + "grad_norm": 1.9636359750174326, + "learning_rate": 1.3906548542756921e-05, + "loss": 0.9003, + "step": 3737 + }, + { + "epoch": 0.3912087912087912, + "grad_norm": 2.405810011459009, + "learning_rate": 1.3903427954778566e-05, + "loss": 1.0238, + "step": 3738 + }, + { + "epoch": 0.3913134484563056, + "grad_norm": 2.0934967739240813, + "learning_rate": 1.3900306918288423e-05, + "loss": 1.004, + "step": 3739 + }, + { + "epoch": 0.39141810570381996, + "grad_norm": 2.017817533341966, + "learning_rate": 1.3897185433645106e-05, + "loss": 0.9562, + "step": 3740 + }, + { + "epoch": 0.39152276295133437, + "grad_norm": 2.0048073269684865, + "learning_rate": 1.389406350120729e-05, + "loss": 0.9614, + "step": 3741 + }, + { + "epoch": 0.3916274201988488, + "grad_norm": 2.1093352540533985, + "learning_rate": 1.3890941121333677e-05, + "loss": 0.9876, + "step": 3742 + }, + { + "epoch": 0.3917320774463632, + "grad_norm": 2.221682557807781, + "learning_rate": 1.3887818294383042e-05, + "loss": 0.9904, + "step": 3743 + }, + { + "epoch": 0.39183673469387753, + "grad_norm": 1.9422817754452815, + "learning_rate": 1.3884695020714207e-05, + "loss": 0.8561, + "step": 3744 + }, + { + "epoch": 0.39194139194139194, + "grad_norm": 2.080159971701426, + "learning_rate": 1.3881571300686037e-05, + "loss": 0.9193, + "step": 3745 + }, + { + "epoch": 0.39204604918890634, + "grad_norm": 2.1895432862916078, + "learning_rate": 1.3878447134657455e-05, + "loss": 0.9276, + "step": 3746 + }, + { + "epoch": 0.39215070643642075, + "grad_norm": 2.154912897139779, + "learning_rate": 1.3875322522987435e-05, + "loss": 0.9417, + "step": 3747 + }, + { + "epoch": 0.3922553636839351, + "grad_norm": 2.375072450587214, + "learning_rate": 1.3872197466035003e-05, + "loss": 1.0635, + "step": 3748 + }, + { + "epoch": 0.3923600209314495, + "grad_norm": 2.029749571414169, + "learning_rate": 1.3869071964159231e-05, + "loss": 0.9101, + "step": 3749 + }, + { + "epoch": 0.3924646781789639, + "grad_norm": 2.298693925354746, + "learning_rate": 1.3865946017719248e-05, + "loss": 1.0388, + "step": 3750 + }, + { + "epoch": 0.39256933542647826, + "grad_norm": 2.32567539400059, + "learning_rate": 1.3862819627074231e-05, + "loss": 0.9207, + "step": 3751 + }, + { + "epoch": 0.39267399267399267, + "grad_norm": 1.8738585362137903, + "learning_rate": 1.3859692792583403e-05, + "loss": 0.9748, + "step": 3752 + }, + { + "epoch": 0.3927786499215071, + "grad_norm": 1.903263299759873, + "learning_rate": 1.3856565514606053e-05, + "loss": 1.0709, + "step": 3753 + }, + { + "epoch": 0.3928833071690215, + "grad_norm": 2.1106708790964377, + "learning_rate": 1.3853437793501507e-05, + "loss": 1.0031, + "step": 3754 + }, + { + "epoch": 0.39298796441653583, + "grad_norm": 2.0628055794677493, + "learning_rate": 1.3850309629629146e-05, + "loss": 0.998, + "step": 3755 + }, + { + "epoch": 0.39309262166405023, + "grad_norm": 1.8840425280715498, + "learning_rate": 1.3847181023348404e-05, + "loss": 0.9089, + "step": 3756 + }, + { + "epoch": 0.39319727891156464, + "grad_norm": 2.2547660697868332, + "learning_rate": 1.3844051975018761e-05, + "loss": 0.9776, + "step": 3757 + }, + { + "epoch": 0.393301936159079, + "grad_norm": 1.9372101963139208, + "learning_rate": 1.3840922484999758e-05, + "loss": 0.9781, + "step": 3758 + }, + { + "epoch": 0.3934065934065934, + "grad_norm": 2.3442712849451657, + "learning_rate": 1.383779255365097e-05, + "loss": 0.9706, + "step": 3759 + }, + { + "epoch": 0.3935112506541078, + "grad_norm": 1.9728314083799356, + "learning_rate": 1.3834662181332043e-05, + "loss": 0.8493, + "step": 3760 + }, + { + "epoch": 0.3936159079016222, + "grad_norm": 1.8671773883426177, + "learning_rate": 1.3831531368402653e-05, + "loss": 0.8621, + "step": 3761 + }, + { + "epoch": 0.39372056514913656, + "grad_norm": 2.0851610975093, + "learning_rate": 1.3828400115222542e-05, + "loss": 0.9945, + "step": 3762 + }, + { + "epoch": 0.39382522239665096, + "grad_norm": 2.0950765767067296, + "learning_rate": 1.38252684221515e-05, + "loss": 0.9923, + "step": 3763 + }, + { + "epoch": 0.39392987964416537, + "grad_norm": 2.0426883514777616, + "learning_rate": 1.382213628954936e-05, + "loss": 1.0127, + "step": 3764 + }, + { + "epoch": 0.3940345368916798, + "grad_norm": 2.1796168252887713, + "learning_rate": 1.381900371777601e-05, + "loss": 1.0253, + "step": 3765 + }, + { + "epoch": 0.3941391941391941, + "grad_norm": 2.056669859459506, + "learning_rate": 1.3815870707191393e-05, + "loss": 1.0314, + "step": 3766 + }, + { + "epoch": 0.39424385138670853, + "grad_norm": 2.058977648719983, + "learning_rate": 1.3812737258155495e-05, + "loss": 0.8932, + "step": 3767 + }, + { + "epoch": 0.39434850863422294, + "grad_norm": 2.3166421400138177, + "learning_rate": 1.380960337102836e-05, + "loss": 0.964, + "step": 3768 + }, + { + "epoch": 0.3944531658817373, + "grad_norm": 1.8919186197649047, + "learning_rate": 1.380646904617007e-05, + "loss": 0.7797, + "step": 3769 + }, + { + "epoch": 0.3945578231292517, + "grad_norm": 2.0859503146939256, + "learning_rate": 1.3803334283940772e-05, + "loss": 0.9242, + "step": 3770 + }, + { + "epoch": 0.3946624803767661, + "grad_norm": 1.9003991213190858, + "learning_rate": 1.3800199084700655e-05, + "loss": 0.9279, + "step": 3771 + }, + { + "epoch": 0.3947671376242805, + "grad_norm": 1.9199813086679065, + "learning_rate": 1.3797063448809959e-05, + "loss": 0.8576, + "step": 3772 + }, + { + "epoch": 0.39487179487179486, + "grad_norm": 1.984278812039716, + "learning_rate": 1.3793927376628977e-05, + "loss": 0.9722, + "step": 3773 + }, + { + "epoch": 0.39497645211930926, + "grad_norm": 2.1654990774470426, + "learning_rate": 1.3790790868518044e-05, + "loss": 0.8915, + "step": 3774 + }, + { + "epoch": 0.39508110936682367, + "grad_norm": 2.0693477910040152, + "learning_rate": 1.378765392483756e-05, + "loss": 1.0609, + "step": 3775 + }, + { + "epoch": 0.395185766614338, + "grad_norm": 2.5214519932464263, + "learning_rate": 1.378451654594796e-05, + "loss": 0.8346, + "step": 3776 + }, + { + "epoch": 0.3952904238618524, + "grad_norm": 2.273499618027654, + "learning_rate": 1.3781378732209738e-05, + "loss": 0.971, + "step": 3777 + }, + { + "epoch": 0.39539508110936683, + "grad_norm": 2.0346748171851594, + "learning_rate": 1.3778240483983432e-05, + "loss": 0.921, + "step": 3778 + }, + { + "epoch": 0.39549973835688124, + "grad_norm": 2.4181856266842705, + "learning_rate": 1.3775101801629636e-05, + "loss": 0.9929, + "step": 3779 + }, + { + "epoch": 0.3956043956043956, + "grad_norm": 1.8889123693724994, + "learning_rate": 1.3771962685508991e-05, + "loss": 0.9226, + "step": 3780 + }, + { + "epoch": 0.39570905285191, + "grad_norm": 1.8892857747949212, + "learning_rate": 1.3768823135982185e-05, + "loss": 0.9675, + "step": 3781 + }, + { + "epoch": 0.3958137100994244, + "grad_norm": 2.421740842709229, + "learning_rate": 1.3765683153409963e-05, + "loss": 0.9961, + "step": 3782 + }, + { + "epoch": 0.39591836734693875, + "grad_norm": 2.19282905248247, + "learning_rate": 1.376254273815311e-05, + "loss": 1.045, + "step": 3783 + }, + { + "epoch": 0.39602302459445315, + "grad_norm": 2.6404415016056695, + "learning_rate": 1.375940189057247e-05, + "loss": 1.0346, + "step": 3784 + }, + { + "epoch": 0.39612768184196756, + "grad_norm": 2.087172492105292, + "learning_rate": 1.3756260611028932e-05, + "loss": 1.0005, + "step": 3785 + }, + { + "epoch": 0.39623233908948197, + "grad_norm": 1.8576562979717508, + "learning_rate": 1.3753118899883436e-05, + "loss": 0.8769, + "step": 3786 + }, + { + "epoch": 0.3963369963369963, + "grad_norm": 1.7251243282419155, + "learning_rate": 1.3749976757496969e-05, + "loss": 0.9917, + "step": 3787 + }, + { + "epoch": 0.3964416535845107, + "grad_norm": 2.3431283462971244, + "learning_rate": 1.3746834184230569e-05, + "loss": 0.9063, + "step": 3788 + }, + { + "epoch": 0.39654631083202513, + "grad_norm": 2.100375176753719, + "learning_rate": 1.3743691180445328e-05, + "loss": 1.056, + "step": 3789 + }, + { + "epoch": 0.39665096807953953, + "grad_norm": 1.883781822130877, + "learning_rate": 1.374054774650238e-05, + "loss": 0.9573, + "step": 3790 + }, + { + "epoch": 0.3967556253270539, + "grad_norm": 1.9710349136351437, + "learning_rate": 1.3737403882762914e-05, + "loss": 0.9161, + "step": 3791 + }, + { + "epoch": 0.3968602825745683, + "grad_norm": 2.0619488055452067, + "learning_rate": 1.3734259589588165e-05, + "loss": 0.8067, + "step": 3792 + }, + { + "epoch": 0.3969649398220827, + "grad_norm": 2.1265304547515607, + "learning_rate": 1.3731114867339418e-05, + "loss": 1.1066, + "step": 3793 + }, + { + "epoch": 0.39706959706959705, + "grad_norm": 2.1845388692035135, + "learning_rate": 1.3727969716378012e-05, + "loss": 1.0085, + "step": 3794 + }, + { + "epoch": 0.39717425431711145, + "grad_norm": 2.025263339801749, + "learning_rate": 1.3724824137065325e-05, + "loss": 0.9922, + "step": 3795 + }, + { + "epoch": 0.39727891156462586, + "grad_norm": 1.9103665780889987, + "learning_rate": 1.3721678129762792e-05, + "loss": 1.0307, + "step": 3796 + }, + { + "epoch": 0.39738356881214026, + "grad_norm": 2.420546254970621, + "learning_rate": 1.3718531694831903e-05, + "loss": 0.9924, + "step": 3797 + }, + { + "epoch": 0.3974882260596546, + "grad_norm": 1.9578789743491487, + "learning_rate": 1.371538483263418e-05, + "loss": 0.9418, + "step": 3798 + }, + { + "epoch": 0.397592883307169, + "grad_norm": 2.2999551242197107, + "learning_rate": 1.3712237543531208e-05, + "loss": 1.0449, + "step": 3799 + }, + { + "epoch": 0.3976975405546834, + "grad_norm": 1.9021993595587816, + "learning_rate": 1.3709089827884616e-05, + "loss": 0.8744, + "step": 3800 + }, + { + "epoch": 0.3978021978021978, + "grad_norm": 2.0695970711361444, + "learning_rate": 1.3705941686056086e-05, + "loss": 1.0607, + "step": 3801 + }, + { + "epoch": 0.3979068550497122, + "grad_norm": 1.8591241528827835, + "learning_rate": 1.3702793118407345e-05, + "loss": 0.9284, + "step": 3802 + }, + { + "epoch": 0.3980115122972266, + "grad_norm": 1.87633082654073, + "learning_rate": 1.3699644125300165e-05, + "loss": 0.9277, + "step": 3803 + }, + { + "epoch": 0.398116169544741, + "grad_norm": 2.1413866117330675, + "learning_rate": 1.3696494707096374e-05, + "loss": 0.9777, + "step": 3804 + }, + { + "epoch": 0.39822082679225534, + "grad_norm": 2.029453889878876, + "learning_rate": 1.369334486415785e-05, + "loss": 0.9102, + "step": 3805 + }, + { + "epoch": 0.39832548403976975, + "grad_norm": 1.914840282215036, + "learning_rate": 1.3690194596846516e-05, + "loss": 0.8982, + "step": 3806 + }, + { + "epoch": 0.39843014128728416, + "grad_norm": 2.2156107459198267, + "learning_rate": 1.3687043905524337e-05, + "loss": 0.9374, + "step": 3807 + }, + { + "epoch": 0.39853479853479856, + "grad_norm": 2.353831603694141, + "learning_rate": 1.3683892790553344e-05, + "loss": 1.0299, + "step": 3808 + }, + { + "epoch": 0.3986394557823129, + "grad_norm": 2.2201207812404333, + "learning_rate": 1.3680741252295597e-05, + "loss": 0.9126, + "step": 3809 + }, + { + "epoch": 0.3987441130298273, + "grad_norm": 2.0962471812380006, + "learning_rate": 1.367758929111322e-05, + "loss": 0.9077, + "step": 3810 + }, + { + "epoch": 0.3988487702773417, + "grad_norm": 2.21964726775872, + "learning_rate": 1.3674436907368377e-05, + "loss": 1.1087, + "step": 3811 + }, + { + "epoch": 0.3989534275248561, + "grad_norm": 2.5819871421432286, + "learning_rate": 1.3671284101423288e-05, + "loss": 0.9415, + "step": 3812 + }, + { + "epoch": 0.3990580847723705, + "grad_norm": 1.7198628116479615, + "learning_rate": 1.3668130873640211e-05, + "loss": 0.8898, + "step": 3813 + }, + { + "epoch": 0.3991627420198849, + "grad_norm": 1.8597498193101791, + "learning_rate": 1.3664977224381461e-05, + "loss": 0.8606, + "step": 3814 + }, + { + "epoch": 0.3992673992673993, + "grad_norm": 2.6740292842251234, + "learning_rate": 1.3661823154009397e-05, + "loss": 1.0387, + "step": 3815 + }, + { + "epoch": 0.39937205651491364, + "grad_norm": 2.3003617051512735, + "learning_rate": 1.3658668662886432e-05, + "loss": 1.1021, + "step": 3816 + }, + { + "epoch": 0.39947671376242805, + "grad_norm": 2.3043271725583963, + "learning_rate": 1.3655513751375017e-05, + "loss": 0.9864, + "step": 3817 + }, + { + "epoch": 0.39958137100994245, + "grad_norm": 1.741403290306905, + "learning_rate": 1.3652358419837664e-05, + "loss": 0.9179, + "step": 3818 + }, + { + "epoch": 0.3996860282574568, + "grad_norm": 2.143710219932102, + "learning_rate": 1.3649202668636923e-05, + "loss": 0.8958, + "step": 3819 + }, + { + "epoch": 0.3997906855049712, + "grad_norm": 2.029828508661703, + "learning_rate": 1.36460464981354e-05, + "loss": 0.9657, + "step": 3820 + }, + { + "epoch": 0.3998953427524856, + "grad_norm": 1.9833644803425685, + "learning_rate": 1.3642889908695742e-05, + "loss": 0.9026, + "step": 3821 + }, + { + "epoch": 0.4, + "grad_norm": 2.210053513135352, + "learning_rate": 1.3639732900680646e-05, + "loss": 1.0516, + "step": 3822 + }, + { + "epoch": 0.4001046572475144, + "grad_norm": 2.3239837565552985, + "learning_rate": 1.3636575474452865e-05, + "loss": 0.9058, + "step": 3823 + }, + { + "epoch": 0.4002093144950288, + "grad_norm": 2.0848417519232454, + "learning_rate": 1.3633417630375188e-05, + "loss": 0.9769, + "step": 3824 + }, + { + "epoch": 0.4003139717425432, + "grad_norm": 1.953543514114223, + "learning_rate": 1.3630259368810461e-05, + "loss": 0.9432, + "step": 3825 + }, + { + "epoch": 0.40041862899005753, + "grad_norm": 2.1198900948291928, + "learning_rate": 1.3627100690121571e-05, + "loss": 0.9108, + "step": 3826 + }, + { + "epoch": 0.40052328623757194, + "grad_norm": 2.1614969325672377, + "learning_rate": 1.362394159467146e-05, + "loss": 0.9297, + "step": 3827 + }, + { + "epoch": 0.40062794348508635, + "grad_norm": 2.2955139319444418, + "learning_rate": 1.3620782082823115e-05, + "loss": 0.9899, + "step": 3828 + }, + { + "epoch": 0.40073260073260075, + "grad_norm": 2.2734494900620947, + "learning_rate": 1.3617622154939565e-05, + "loss": 0.8636, + "step": 3829 + }, + { + "epoch": 0.4008372579801151, + "grad_norm": 2.3826089612311607, + "learning_rate": 1.3614461811383897e-05, + "loss": 0.9787, + "step": 3830 + }, + { + "epoch": 0.4009419152276295, + "grad_norm": 2.233808465500224, + "learning_rate": 1.3611301052519242e-05, + "loss": 1.1001, + "step": 3831 + }, + { + "epoch": 0.4010465724751439, + "grad_norm": 2.4947957206362976, + "learning_rate": 1.360813987870877e-05, + "loss": 1.0562, + "step": 3832 + }, + { + "epoch": 0.4011512297226583, + "grad_norm": 2.1160921112573807, + "learning_rate": 1.3604978290315717e-05, + "loss": 0.9986, + "step": 3833 + }, + { + "epoch": 0.40125588697017267, + "grad_norm": 2.310532165642057, + "learning_rate": 1.360181628770335e-05, + "loss": 0.9502, + "step": 3834 + }, + { + "epoch": 0.4013605442176871, + "grad_norm": 2.3190655097380106, + "learning_rate": 1.3598653871234986e-05, + "loss": 1.0915, + "step": 3835 + }, + { + "epoch": 0.4014652014652015, + "grad_norm": 2.108855272952508, + "learning_rate": 1.3595491041273999e-05, + "loss": 0.8137, + "step": 3836 + }, + { + "epoch": 0.40156985871271583, + "grad_norm": 2.0659350062129263, + "learning_rate": 1.3592327798183802e-05, + "loss": 1.0533, + "step": 3837 + }, + { + "epoch": 0.40167451596023024, + "grad_norm": 1.9843597273847278, + "learning_rate": 1.3589164142327863e-05, + "loss": 0.9426, + "step": 3838 + }, + { + "epoch": 0.40177917320774464, + "grad_norm": 2.074638091286358, + "learning_rate": 1.3586000074069679e-05, + "loss": 1.0032, + "step": 3839 + }, + { + "epoch": 0.40188383045525905, + "grad_norm": 1.8783659326755497, + "learning_rate": 1.3582835593772822e-05, + "loss": 0.9629, + "step": 3840 + }, + { + "epoch": 0.4019884877027734, + "grad_norm": 2.1806512008747188, + "learning_rate": 1.3579670701800893e-05, + "loss": 1.0359, + "step": 3841 + }, + { + "epoch": 0.4020931449502878, + "grad_norm": 1.8780201218056713, + "learning_rate": 1.3576505398517546e-05, + "loss": 0.8616, + "step": 3842 + }, + { + "epoch": 0.4021978021978022, + "grad_norm": 2.16324024690094, + "learning_rate": 1.3573339684286472e-05, + "loss": 0.9952, + "step": 3843 + }, + { + "epoch": 0.40230245944531656, + "grad_norm": 2.07630913117498, + "learning_rate": 1.3570173559471427e-05, + "loss": 1.0358, + "step": 3844 + }, + { + "epoch": 0.40240711669283097, + "grad_norm": 2.062316520805037, + "learning_rate": 1.3567007024436206e-05, + "loss": 0.9196, + "step": 3845 + }, + { + "epoch": 0.4025117739403454, + "grad_norm": 2.252816836018333, + "learning_rate": 1.3563840079544642e-05, + "loss": 0.9085, + "step": 3846 + }, + { + "epoch": 0.4026164311878598, + "grad_norm": 2.0407067151488087, + "learning_rate": 1.3560672725160631e-05, + "loss": 1.0633, + "step": 3847 + }, + { + "epoch": 0.40272108843537413, + "grad_norm": 2.1656996102091126, + "learning_rate": 1.3557504961648102e-05, + "loss": 1.046, + "step": 3848 + }, + { + "epoch": 0.40282574568288854, + "grad_norm": 1.904183686696131, + "learning_rate": 1.3554336789371046e-05, + "loss": 0.9816, + "step": 3849 + }, + { + "epoch": 0.40293040293040294, + "grad_norm": 2.0311197336923623, + "learning_rate": 1.3551168208693486e-05, + "loss": 0.9372, + "step": 3850 + }, + { + "epoch": 0.40303506017791735, + "grad_norm": 2.1048762647281403, + "learning_rate": 1.35479992199795e-05, + "loss": 1.0747, + "step": 3851 + }, + { + "epoch": 0.4031397174254317, + "grad_norm": 2.219151744165179, + "learning_rate": 1.3544829823593208e-05, + "loss": 1.0558, + "step": 3852 + }, + { + "epoch": 0.4032443746729461, + "grad_norm": 1.8701877669813198, + "learning_rate": 1.3541660019898784e-05, + "loss": 1.0087, + "step": 3853 + }, + { + "epoch": 0.4033490319204605, + "grad_norm": 1.8767505975265026, + "learning_rate": 1.3538489809260447e-05, + "loss": 0.8751, + "step": 3854 + }, + { + "epoch": 0.40345368916797486, + "grad_norm": 1.949016676761772, + "learning_rate": 1.3535319192042455e-05, + "loss": 1.0158, + "step": 3855 + }, + { + "epoch": 0.40355834641548927, + "grad_norm": 1.9762439941084444, + "learning_rate": 1.3532148168609123e-05, + "loss": 1.0097, + "step": 3856 + }, + { + "epoch": 0.40366300366300367, + "grad_norm": 2.490558941314484, + "learning_rate": 1.3528976739324807e-05, + "loss": 0.9381, + "step": 3857 + }, + { + "epoch": 0.4037676609105181, + "grad_norm": 2.3815368861074737, + "learning_rate": 1.3525804904553906e-05, + "loss": 0.994, + "step": 3858 + }, + { + "epoch": 0.40387231815803243, + "grad_norm": 2.224663411985367, + "learning_rate": 1.3522632664660878e-05, + "loss": 1.0024, + "step": 3859 + }, + { + "epoch": 0.40397697540554683, + "grad_norm": 2.070837677342153, + "learning_rate": 1.351946002001021e-05, + "loss": 0.9638, + "step": 3860 + }, + { + "epoch": 0.40408163265306124, + "grad_norm": 2.0394436878689084, + "learning_rate": 1.3516286970966454e-05, + "loss": 1.0819, + "step": 3861 + }, + { + "epoch": 0.4041862899005756, + "grad_norm": 2.104205603022299, + "learning_rate": 1.35131135178942e-05, + "loss": 0.9877, + "step": 3862 + }, + { + "epoch": 0.40429094714809, + "grad_norm": 2.2820283958841325, + "learning_rate": 1.3509939661158078e-05, + "loss": 1.0813, + "step": 3863 + }, + { + "epoch": 0.4043956043956044, + "grad_norm": 1.8165693573504802, + "learning_rate": 1.3506765401122774e-05, + "loss": 0.9544, + "step": 3864 + }, + { + "epoch": 0.4045002616431188, + "grad_norm": 1.8593322923606608, + "learning_rate": 1.3503590738153015e-05, + "loss": 0.8905, + "step": 3865 + }, + { + "epoch": 0.40460491889063316, + "grad_norm": 1.9003454201874181, + "learning_rate": 1.3500415672613578e-05, + "loss": 0.9831, + "step": 3866 + }, + { + "epoch": 0.40470957613814756, + "grad_norm": 2.0723713810408553, + "learning_rate": 1.3497240204869287e-05, + "loss": 0.9065, + "step": 3867 + }, + { + "epoch": 0.40481423338566197, + "grad_norm": 2.0143809176887264, + "learning_rate": 1.3494064335285004e-05, + "loss": 0.9393, + "step": 3868 + }, + { + "epoch": 0.4049188906331763, + "grad_norm": 2.297759622527477, + "learning_rate": 1.3490888064225645e-05, + "loss": 0.9157, + "step": 3869 + }, + { + "epoch": 0.4050235478806907, + "grad_norm": 2.047595647603498, + "learning_rate": 1.348771139205617e-05, + "loss": 1.0448, + "step": 3870 + }, + { + "epoch": 0.40512820512820513, + "grad_norm": 2.304679069365117, + "learning_rate": 1.3484534319141592e-05, + "loss": 1.0223, + "step": 3871 + }, + { + "epoch": 0.40523286237571954, + "grad_norm": 2.2989950063633264, + "learning_rate": 1.3481356845846951e-05, + "loss": 0.9272, + "step": 3872 + }, + { + "epoch": 0.4053375196232339, + "grad_norm": 2.202029868599979, + "learning_rate": 1.3478178972537354e-05, + "loss": 0.9782, + "step": 3873 + }, + { + "epoch": 0.4054421768707483, + "grad_norm": 2.271682115831657, + "learning_rate": 1.347500069957794e-05, + "loss": 0.9876, + "step": 3874 + }, + { + "epoch": 0.4055468341182627, + "grad_norm": 2.1087888782860222, + "learning_rate": 1.3471822027333901e-05, + "loss": 1.1044, + "step": 3875 + }, + { + "epoch": 0.4056514913657771, + "grad_norm": 1.9199653211400192, + "learning_rate": 1.3468642956170474e-05, + "loss": 0.9796, + "step": 3876 + }, + { + "epoch": 0.40575614861329146, + "grad_norm": 2.2590968517959737, + "learning_rate": 1.3465463486452942e-05, + "loss": 0.8456, + "step": 3877 + }, + { + "epoch": 0.40586080586080586, + "grad_norm": 2.4486677027922283, + "learning_rate": 1.3462283618546623e-05, + "loss": 1.0471, + "step": 3878 + }, + { + "epoch": 0.40596546310832027, + "grad_norm": 2.1883091875894745, + "learning_rate": 1.3459103352816905e-05, + "loss": 1.0535, + "step": 3879 + }, + { + "epoch": 0.4060701203558346, + "grad_norm": 1.9403740977737154, + "learning_rate": 1.3455922689629195e-05, + "loss": 0.9947, + "step": 3880 + }, + { + "epoch": 0.406174777603349, + "grad_norm": 2.1633648926775164, + "learning_rate": 1.3452741629348966e-05, + "loss": 0.9966, + "step": 3881 + }, + { + "epoch": 0.40627943485086343, + "grad_norm": 2.1785103292518944, + "learning_rate": 1.3449560172341718e-05, + "loss": 0.9293, + "step": 3882 + }, + { + "epoch": 0.40638409209837784, + "grad_norm": 2.0241202507761953, + "learning_rate": 1.344637831897302e-05, + "loss": 0.9471, + "step": 3883 + }, + { + "epoch": 0.4064887493458922, + "grad_norm": 2.0299069179358016, + "learning_rate": 1.3443196069608462e-05, + "loss": 0.9633, + "step": 3884 + }, + { + "epoch": 0.4065934065934066, + "grad_norm": 1.8230670234656603, + "learning_rate": 1.3440013424613699e-05, + "loss": 0.8566, + "step": 3885 + }, + { + "epoch": 0.406698063840921, + "grad_norm": 2.265586535508352, + "learning_rate": 1.3436830384354417e-05, + "loss": 1.1007, + "step": 3886 + }, + { + "epoch": 0.40680272108843535, + "grad_norm": 2.0941195839844484, + "learning_rate": 1.3433646949196354e-05, + "loss": 0.9932, + "step": 3887 + }, + { + "epoch": 0.40690737833594975, + "grad_norm": 2.0164409933200034, + "learning_rate": 1.3430463119505304e-05, + "loss": 0.9493, + "step": 3888 + }, + { + "epoch": 0.40701203558346416, + "grad_norm": 2.1172725827353522, + "learning_rate": 1.3427278895647083e-05, + "loss": 1.0876, + "step": 3889 + }, + { + "epoch": 0.40711669283097857, + "grad_norm": 2.1818693573193984, + "learning_rate": 1.342409427798757e-05, + "loss": 0.9947, + "step": 3890 + }, + { + "epoch": 0.4072213500784929, + "grad_norm": 1.8963779475096132, + "learning_rate": 1.3420909266892679e-05, + "loss": 1.092, + "step": 3891 + }, + { + "epoch": 0.4073260073260073, + "grad_norm": 2.388464219116994, + "learning_rate": 1.341772386272838e-05, + "loss": 1.0153, + "step": 3892 + }, + { + "epoch": 0.40743066457352173, + "grad_norm": 2.0886208182575445, + "learning_rate": 1.3414538065860685e-05, + "loss": 1.0086, + "step": 3893 + }, + { + "epoch": 0.40753532182103613, + "grad_norm": 1.8683253041052692, + "learning_rate": 1.341135187665564e-05, + "loss": 1.0031, + "step": 3894 + }, + { + "epoch": 0.4076399790685505, + "grad_norm": 1.9302119953502512, + "learning_rate": 1.340816529547935e-05, + "loss": 0.9738, + "step": 3895 + }, + { + "epoch": 0.4077446363160649, + "grad_norm": 2.076097227173372, + "learning_rate": 1.340497832269796e-05, + "loss": 0.9809, + "step": 3896 + }, + { + "epoch": 0.4078492935635793, + "grad_norm": 1.9832695165206116, + "learning_rate": 1.3401790958677653e-05, + "loss": 0.875, + "step": 3897 + }, + { + "epoch": 0.40795395081109365, + "grad_norm": 1.9007778657336216, + "learning_rate": 1.3398603203784678e-05, + "loss": 0.9117, + "step": 3898 + }, + { + "epoch": 0.40805860805860805, + "grad_norm": 2.1000112600332077, + "learning_rate": 1.3395415058385297e-05, + "loss": 0.9539, + "step": 3899 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 2.128028337883832, + "learning_rate": 1.3392226522845843e-05, + "loss": 0.9829, + "step": 3900 + }, + { + "epoch": 0.40826792255363686, + "grad_norm": 2.3062569400722808, + "learning_rate": 1.3389037597532689e-05, + "loss": 0.9517, + "step": 3901 + }, + { + "epoch": 0.4083725798011512, + "grad_norm": 2.2026909714583254, + "learning_rate": 1.3385848282812242e-05, + "loss": 0.9477, + "step": 3902 + }, + { + "epoch": 0.4084772370486656, + "grad_norm": 1.8784837438489583, + "learning_rate": 1.3382658579050964e-05, + "loss": 0.8672, + "step": 3903 + }, + { + "epoch": 0.40858189429618, + "grad_norm": 1.9667452879169667, + "learning_rate": 1.3379468486615357e-05, + "loss": 0.9565, + "step": 3904 + }, + { + "epoch": 0.4086865515436944, + "grad_norm": 1.9420479807212934, + "learning_rate": 1.3376278005871972e-05, + "loss": 0.8869, + "step": 3905 + }, + { + "epoch": 0.4087912087912088, + "grad_norm": 2.1557813800708057, + "learning_rate": 1.33730871371874e-05, + "loss": 0.9541, + "step": 3906 + }, + { + "epoch": 0.4088958660387232, + "grad_norm": 2.0260654319412574, + "learning_rate": 1.3369895880928277e-05, + "loss": 1.0403, + "step": 3907 + }, + { + "epoch": 0.4090005232862376, + "grad_norm": 2.1520685254880885, + "learning_rate": 1.3366704237461283e-05, + "loss": 0.9977, + "step": 3908 + }, + { + "epoch": 0.40910518053375194, + "grad_norm": 2.0017224417903794, + "learning_rate": 1.336351220715315e-05, + "loss": 1.0277, + "step": 3909 + }, + { + "epoch": 0.40920983778126635, + "grad_norm": 2.085668966136735, + "learning_rate": 1.3360319790370646e-05, + "loss": 0.9904, + "step": 3910 + }, + { + "epoch": 0.40931449502878076, + "grad_norm": 2.233810172732676, + "learning_rate": 1.3357126987480587e-05, + "loss": 1.0078, + "step": 3911 + }, + { + "epoch": 0.4094191522762951, + "grad_norm": 2.456133446509698, + "learning_rate": 1.335393379884983e-05, + "loss": 1.1043, + "step": 3912 + }, + { + "epoch": 0.4095238095238095, + "grad_norm": 1.9446442501126147, + "learning_rate": 1.3350740224845277e-05, + "loss": 0.9661, + "step": 3913 + }, + { + "epoch": 0.4096284667713239, + "grad_norm": 2.139239033329667, + "learning_rate": 1.3347546265833885e-05, + "loss": 1.0164, + "step": 3914 + }, + { + "epoch": 0.4097331240188383, + "grad_norm": 2.414435331126368, + "learning_rate": 1.3344351922182642e-05, + "loss": 0.9962, + "step": 3915 + }, + { + "epoch": 0.4098377812663527, + "grad_norm": 2.2612369566120236, + "learning_rate": 1.3341157194258578e-05, + "loss": 0.8626, + "step": 3916 + }, + { + "epoch": 0.4099424385138671, + "grad_norm": 2.007696031752954, + "learning_rate": 1.3337962082428782e-05, + "loss": 0.9339, + "step": 3917 + }, + { + "epoch": 0.4100470957613815, + "grad_norm": 2.1776335671383453, + "learning_rate": 1.3334766587060372e-05, + "loss": 0.9919, + "step": 3918 + }, + { + "epoch": 0.4101517530088959, + "grad_norm": 2.4159392572041267, + "learning_rate": 1.3331570708520527e-05, + "loss": 0.8825, + "step": 3919 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 2.2382392402884173, + "learning_rate": 1.3328374447176448e-05, + "loss": 0.9198, + "step": 3920 + }, + { + "epoch": 0.41036106750392465, + "grad_norm": 2.011649128934293, + "learning_rate": 1.3325177803395402e-05, + "loss": 0.958, + "step": 3921 + }, + { + "epoch": 0.41046572475143905, + "grad_norm": 2.1286275554802456, + "learning_rate": 1.3321980777544686e-05, + "loss": 0.9464, + "step": 3922 + }, + { + "epoch": 0.4105703819989534, + "grad_norm": 1.892321406734822, + "learning_rate": 1.331878336999164e-05, + "loss": 0.9145, + "step": 3923 + }, + { + "epoch": 0.4106750392464678, + "grad_norm": 1.8424101188930837, + "learning_rate": 1.3315585581103663e-05, + "loss": 0.8959, + "step": 3924 + }, + { + "epoch": 0.4107796964939822, + "grad_norm": 2.126780664206234, + "learning_rate": 1.3312387411248175e-05, + "loss": 1.0817, + "step": 3925 + }, + { + "epoch": 0.4108843537414966, + "grad_norm": 2.0459834656964384, + "learning_rate": 1.330918886079266e-05, + "loss": 1.0016, + "step": 3926 + }, + { + "epoch": 0.41098901098901097, + "grad_norm": 1.9966709058022425, + "learning_rate": 1.3305989930104639e-05, + "loss": 0.9498, + "step": 3927 + }, + { + "epoch": 0.4110936682365254, + "grad_norm": 1.9943061429132058, + "learning_rate": 1.3302790619551673e-05, + "loss": 1.0189, + "step": 3928 + }, + { + "epoch": 0.4111983254840398, + "grad_norm": 2.2183362311043466, + "learning_rate": 1.3299590929501369e-05, + "loss": 0.9745, + "step": 3929 + }, + { + "epoch": 0.41130298273155413, + "grad_norm": 2.1746772650891826, + "learning_rate": 1.3296390860321376e-05, + "loss": 0.9183, + "step": 3930 + }, + { + "epoch": 0.41140763997906854, + "grad_norm": 1.9839787897777303, + "learning_rate": 1.3293190412379392e-05, + "loss": 0.9328, + "step": 3931 + }, + { + "epoch": 0.41151229722658295, + "grad_norm": 2.106457836929408, + "learning_rate": 1.3289989586043153e-05, + "loss": 0.9307, + "step": 3932 + }, + { + "epoch": 0.41161695447409735, + "grad_norm": 2.00060665012128, + "learning_rate": 1.3286788381680444e-05, + "loss": 1.0349, + "step": 3933 + }, + { + "epoch": 0.4117216117216117, + "grad_norm": 2.1356259208594497, + "learning_rate": 1.3283586799659083e-05, + "loss": 1.0561, + "step": 3934 + }, + { + "epoch": 0.4118262689691261, + "grad_norm": 2.0728612956807138, + "learning_rate": 1.3280384840346942e-05, + "loss": 0.9573, + "step": 3935 + }, + { + "epoch": 0.4119309262166405, + "grad_norm": 2.114394535148644, + "learning_rate": 1.3277182504111936e-05, + "loss": 0.9407, + "step": 3936 + }, + { + "epoch": 0.4120355834641549, + "grad_norm": 2.3060951675811867, + "learning_rate": 1.3273979791322016e-05, + "loss": 1.0736, + "step": 3937 + }, + { + "epoch": 0.41214024071166927, + "grad_norm": 2.2146993223832774, + "learning_rate": 1.3270776702345182e-05, + "loss": 0.8368, + "step": 3938 + }, + { + "epoch": 0.4122448979591837, + "grad_norm": 1.9659217726233473, + "learning_rate": 1.3267573237549471e-05, + "loss": 0.9114, + "step": 3939 + }, + { + "epoch": 0.4123495552066981, + "grad_norm": 2.1795203888166546, + "learning_rate": 1.3264369397302974e-05, + "loss": 1.0584, + "step": 3940 + }, + { + "epoch": 0.41245421245421243, + "grad_norm": 2.2274974384458184, + "learning_rate": 1.3261165181973814e-05, + "loss": 0.941, + "step": 3941 + }, + { + "epoch": 0.41255886970172684, + "grad_norm": 2.0938075682129784, + "learning_rate": 1.3257960591930164e-05, + "loss": 0.9865, + "step": 3942 + }, + { + "epoch": 0.41266352694924124, + "grad_norm": 2.252476991607518, + "learning_rate": 1.3254755627540237e-05, + "loss": 0.8637, + "step": 3943 + }, + { + "epoch": 0.41276818419675565, + "grad_norm": 1.9902821855928159, + "learning_rate": 1.3251550289172293e-05, + "loss": 0.8776, + "step": 3944 + }, + { + "epoch": 0.41287284144427, + "grad_norm": 2.4524909184883628, + "learning_rate": 1.3248344577194625e-05, + "loss": 0.9399, + "step": 3945 + }, + { + "epoch": 0.4129774986917844, + "grad_norm": 2.113793753822339, + "learning_rate": 1.3245138491975582e-05, + "loss": 0.9082, + "step": 3946 + }, + { + "epoch": 0.4130821559392988, + "grad_norm": 2.122883266073302, + "learning_rate": 1.3241932033883544e-05, + "loss": 0.9503, + "step": 3947 + }, + { + "epoch": 0.41318681318681316, + "grad_norm": 1.750988968337933, + "learning_rate": 1.3238725203286951e-05, + "loss": 0.8322, + "step": 3948 + }, + { + "epoch": 0.41329147043432757, + "grad_norm": 2.1864026140069694, + "learning_rate": 1.323551800055426e-05, + "loss": 1.0516, + "step": 3949 + }, + { + "epoch": 0.413396127681842, + "grad_norm": 2.1920452942734316, + "learning_rate": 1.3232310426053996e-05, + "loss": 0.974, + "step": 3950 + }, + { + "epoch": 0.4135007849293564, + "grad_norm": 2.531072565644497, + "learning_rate": 1.3229102480154708e-05, + "loss": 0.9761, + "step": 3951 + }, + { + "epoch": 0.41360544217687073, + "grad_norm": 2.1000618795832526, + "learning_rate": 1.3225894163224999e-05, + "loss": 1.0461, + "step": 3952 + }, + { + "epoch": 0.41371009942438514, + "grad_norm": 2.4031213533226694, + "learning_rate": 1.3222685475633515e-05, + "loss": 0.945, + "step": 3953 + }, + { + "epoch": 0.41381475667189954, + "grad_norm": 2.1941396313682606, + "learning_rate": 1.3219476417748934e-05, + "loss": 1.0891, + "step": 3954 + }, + { + "epoch": 0.4139194139194139, + "grad_norm": 1.8725259557278697, + "learning_rate": 1.3216266989939987e-05, + "loss": 0.9485, + "step": 3955 + }, + { + "epoch": 0.4140240711669283, + "grad_norm": 2.0302624935221814, + "learning_rate": 1.321305719257544e-05, + "loss": 0.9908, + "step": 3956 + }, + { + "epoch": 0.4141287284144427, + "grad_norm": 2.1705816692660274, + "learning_rate": 1.3209847026024112e-05, + "loss": 0.9744, + "step": 3957 + }, + { + "epoch": 0.4142333856619571, + "grad_norm": 1.958836116018982, + "learning_rate": 1.3206636490654851e-05, + "loss": 0.8859, + "step": 3958 + }, + { + "epoch": 0.41433804290947146, + "grad_norm": 1.9407740225892292, + "learning_rate": 1.3203425586836558e-05, + "loss": 0.9871, + "step": 3959 + }, + { + "epoch": 0.41444270015698587, + "grad_norm": 2.356610680228509, + "learning_rate": 1.3200214314938169e-05, + "loss": 0.8781, + "step": 3960 + }, + { + "epoch": 0.41454735740450027, + "grad_norm": 1.9211681674761067, + "learning_rate": 1.3197002675328669e-05, + "loss": 1.0168, + "step": 3961 + }, + { + "epoch": 0.4146520146520147, + "grad_norm": 1.9962879034585508, + "learning_rate": 1.3193790668377082e-05, + "loss": 0.9542, + "step": 3962 + }, + { + "epoch": 0.41475667189952903, + "grad_norm": 2.236000041036604, + "learning_rate": 1.3190578294452471e-05, + "loss": 1.0742, + "step": 3963 + }, + { + "epoch": 0.41486132914704343, + "grad_norm": 1.9426088190833695, + "learning_rate": 1.3187365553923948e-05, + "loss": 1.0262, + "step": 3964 + }, + { + "epoch": 0.41496598639455784, + "grad_norm": 2.35541736370535, + "learning_rate": 1.3184152447160658e-05, + "loss": 1.0172, + "step": 3965 + }, + { + "epoch": 0.4150706436420722, + "grad_norm": 2.088936709244117, + "learning_rate": 1.3180938974531799e-05, + "loss": 1.0208, + "step": 3966 + }, + { + "epoch": 0.4151753008895866, + "grad_norm": 1.8738282005240101, + "learning_rate": 1.3177725136406605e-05, + "loss": 0.9879, + "step": 3967 + }, + { + "epoch": 0.415279958137101, + "grad_norm": 1.9962816803985017, + "learning_rate": 1.3174510933154348e-05, + "loss": 1.0465, + "step": 3968 + }, + { + "epoch": 0.4153846153846154, + "grad_norm": 2.0688274484112705, + "learning_rate": 1.3171296365144351e-05, + "loss": 0.9194, + "step": 3969 + }, + { + "epoch": 0.41548927263212976, + "grad_norm": 1.778517909912603, + "learning_rate": 1.3168081432745971e-05, + "loss": 0.9127, + "step": 3970 + }, + { + "epoch": 0.41559392987964416, + "grad_norm": 2.0828264061542083, + "learning_rate": 1.3164866136328613e-05, + "loss": 0.8817, + "step": 3971 + }, + { + "epoch": 0.41569858712715857, + "grad_norm": 2.229721638477741, + "learning_rate": 1.3161650476261723e-05, + "loss": 1.0506, + "step": 3972 + }, + { + "epoch": 0.4158032443746729, + "grad_norm": 1.9680337726426218, + "learning_rate": 1.315843445291478e-05, + "loss": 0.9459, + "step": 3973 + }, + { + "epoch": 0.4159079016221873, + "grad_norm": 2.14107133654801, + "learning_rate": 1.3155218066657315e-05, + "loss": 1.0548, + "step": 3974 + }, + { + "epoch": 0.41601255886970173, + "grad_norm": 2.013522258681747, + "learning_rate": 1.3152001317858902e-05, + "loss": 1.0813, + "step": 3975 + }, + { + "epoch": 0.41611721611721614, + "grad_norm": 2.1608038286842595, + "learning_rate": 1.3148784206889144e-05, + "loss": 1.0775, + "step": 3976 + }, + { + "epoch": 0.4162218733647305, + "grad_norm": 2.4659678787298094, + "learning_rate": 1.3145566734117701e-05, + "loss": 1.079, + "step": 3977 + }, + { + "epoch": 0.4163265306122449, + "grad_norm": 2.2519431817057, + "learning_rate": 1.3142348899914259e-05, + "loss": 1.0753, + "step": 3978 + }, + { + "epoch": 0.4164311878597593, + "grad_norm": 1.9039732617022176, + "learning_rate": 1.3139130704648562e-05, + "loss": 0.9651, + "step": 3979 + }, + { + "epoch": 0.4165358451072737, + "grad_norm": 2.0825234022954486, + "learning_rate": 1.313591214869038e-05, + "loss": 0.9306, + "step": 3980 + }, + { + "epoch": 0.41664050235478806, + "grad_norm": 1.8388913462765786, + "learning_rate": 1.3132693232409538e-05, + "loss": 0.7844, + "step": 3981 + }, + { + "epoch": 0.41674515960230246, + "grad_norm": 1.9839223196264366, + "learning_rate": 1.3129473956175893e-05, + "loss": 1.0239, + "step": 3982 + }, + { + "epoch": 0.41684981684981687, + "grad_norm": 2.1149890042363855, + "learning_rate": 1.3126254320359344e-05, + "loss": 1.1038, + "step": 3983 + }, + { + "epoch": 0.4169544740973312, + "grad_norm": 2.3388045331623553, + "learning_rate": 1.3123034325329838e-05, + "loss": 1.0914, + "step": 3984 + }, + { + "epoch": 0.4170591313448456, + "grad_norm": 2.3672832146022524, + "learning_rate": 1.3119813971457356e-05, + "loss": 1.0583, + "step": 3985 + }, + { + "epoch": 0.41716378859236003, + "grad_norm": 2.198594788955325, + "learning_rate": 1.3116593259111922e-05, + "loss": 1.0107, + "step": 3986 + }, + { + "epoch": 0.41726844583987444, + "grad_norm": 2.066530857997824, + "learning_rate": 1.311337218866361e-05, + "loss": 1.0408, + "step": 3987 + }, + { + "epoch": 0.4173731030873888, + "grad_norm": 2.1047196181740557, + "learning_rate": 1.3110150760482518e-05, + "loss": 1.0169, + "step": 3988 + }, + { + "epoch": 0.4174777603349032, + "grad_norm": 1.9723714808567003, + "learning_rate": 1.3106928974938801e-05, + "loss": 0.9556, + "step": 3989 + }, + { + "epoch": 0.4175824175824176, + "grad_norm": 2.264669592855649, + "learning_rate": 1.3103706832402643e-05, + "loss": 0.9994, + "step": 3990 + }, + { + "epoch": 0.41768707482993195, + "grad_norm": 1.9073420765404958, + "learning_rate": 1.3100484333244282e-05, + "loss": 0.8359, + "step": 3991 + }, + { + "epoch": 0.41779173207744635, + "grad_norm": 1.8596817557441248, + "learning_rate": 1.3097261477833985e-05, + "loss": 0.9506, + "step": 3992 + }, + { + "epoch": 0.41789638932496076, + "grad_norm": 2.363734651762851, + "learning_rate": 1.3094038266542062e-05, + "loss": 0.8798, + "step": 3993 + }, + { + "epoch": 0.41800104657247517, + "grad_norm": 2.0676118196497733, + "learning_rate": 1.3090814699738875e-05, + "loss": 0.8891, + "step": 3994 + }, + { + "epoch": 0.4181057038199895, + "grad_norm": 2.515973859229479, + "learning_rate": 1.3087590777794805e-05, + "loss": 0.9715, + "step": 3995 + }, + { + "epoch": 0.4182103610675039, + "grad_norm": 2.5102079979788647, + "learning_rate": 1.3084366501080304e-05, + "loss": 0.9184, + "step": 3996 + }, + { + "epoch": 0.4183150183150183, + "grad_norm": 2.0704341062792397, + "learning_rate": 1.3081141869965834e-05, + "loss": 0.9982, + "step": 3997 + }, + { + "epoch": 0.4184196755625327, + "grad_norm": 2.255147279683263, + "learning_rate": 1.3077916884821922e-05, + "loss": 0.9624, + "step": 3998 + }, + { + "epoch": 0.4185243328100471, + "grad_norm": 2.1492963447091644, + "learning_rate": 1.3074691546019116e-05, + "loss": 0.8961, + "step": 3999 + }, + { + "epoch": 0.4186289900575615, + "grad_norm": 2.0388505483770194, + "learning_rate": 1.3071465853928018e-05, + "loss": 0.9276, + "step": 4000 + }, + { + "epoch": 0.4187336473050759, + "grad_norm": 1.866837078619738, + "learning_rate": 1.3068239808919271e-05, + "loss": 0.8709, + "step": 4001 + }, + { + "epoch": 0.41883830455259025, + "grad_norm": 1.986139574512811, + "learning_rate": 1.3065013411363548e-05, + "loss": 0.9092, + "step": 4002 + }, + { + "epoch": 0.41894296180010465, + "grad_norm": 1.8886956223922813, + "learning_rate": 1.306178666163157e-05, + "loss": 0.9102, + "step": 4003 + }, + { + "epoch": 0.41904761904761906, + "grad_norm": 1.9906538968606682, + "learning_rate": 1.3058559560094097e-05, + "loss": 0.9435, + "step": 4004 + }, + { + "epoch": 0.41915227629513346, + "grad_norm": 2.1441939858155425, + "learning_rate": 1.3055332107121932e-05, + "loss": 1.0081, + "step": 4005 + }, + { + "epoch": 0.4192569335426478, + "grad_norm": 2.3353197187128836, + "learning_rate": 1.3052104303085915e-05, + "loss": 1.064, + "step": 4006 + }, + { + "epoch": 0.4193615907901622, + "grad_norm": 1.83565573131337, + "learning_rate": 1.3048876148356924e-05, + "loss": 0.8715, + "step": 4007 + }, + { + "epoch": 0.4194662480376766, + "grad_norm": 2.3218482775446474, + "learning_rate": 1.3045647643305883e-05, + "loss": 1.0566, + "step": 4008 + }, + { + "epoch": 0.419570905285191, + "grad_norm": 1.9284607667631406, + "learning_rate": 1.3042418788303752e-05, + "loss": 0.9192, + "step": 4009 + }, + { + "epoch": 0.4196755625327054, + "grad_norm": 2.061609946734348, + "learning_rate": 1.303918958372154e-05, + "loss": 0.9721, + "step": 4010 + }, + { + "epoch": 0.4197802197802198, + "grad_norm": 1.8955350228878054, + "learning_rate": 1.303596002993028e-05, + "loss": 0.9905, + "step": 4011 + }, + { + "epoch": 0.4198848770277342, + "grad_norm": 2.3133480934371526, + "learning_rate": 1.3032730127301056e-05, + "loss": 0.8862, + "step": 4012 + }, + { + "epoch": 0.41998953427524854, + "grad_norm": 2.0623630507668596, + "learning_rate": 1.3029499876204996e-05, + "loss": 0.9311, + "step": 4013 + }, + { + "epoch": 0.42009419152276295, + "grad_norm": 1.9829665019642981, + "learning_rate": 1.3026269277013256e-05, + "loss": 1.0163, + "step": 4014 + }, + { + "epoch": 0.42019884877027736, + "grad_norm": 2.626803548305666, + "learning_rate": 1.3023038330097047e-05, + "loss": 1.0789, + "step": 4015 + }, + { + "epoch": 0.4203035060177917, + "grad_norm": 2.229456363139348, + "learning_rate": 1.3019807035827599e-05, + "loss": 1.0455, + "step": 4016 + }, + { + "epoch": 0.4204081632653061, + "grad_norm": 1.945363260039406, + "learning_rate": 1.3016575394576204e-05, + "loss": 1.0051, + "step": 4017 + }, + { + "epoch": 0.4205128205128205, + "grad_norm": 1.8657699677861808, + "learning_rate": 1.3013343406714181e-05, + "loss": 0.9354, + "step": 4018 + }, + { + "epoch": 0.4206174777603349, + "grad_norm": 2.13301131218169, + "learning_rate": 1.3010111072612888e-05, + "loss": 0.9128, + "step": 4019 + }, + { + "epoch": 0.4207221350078493, + "grad_norm": 2.0809458736583637, + "learning_rate": 1.3006878392643736e-05, + "loss": 0.9963, + "step": 4020 + }, + { + "epoch": 0.4208267922553637, + "grad_norm": 2.2025198937923784, + "learning_rate": 1.3003645367178158e-05, + "loss": 1.0341, + "step": 4021 + }, + { + "epoch": 0.4209314495028781, + "grad_norm": 1.8303178475154966, + "learning_rate": 1.300041199658764e-05, + "loss": 0.9209, + "step": 4022 + }, + { + "epoch": 0.4210361067503925, + "grad_norm": 2.0995406687477423, + "learning_rate": 1.2997178281243698e-05, + "loss": 1.0328, + "step": 4023 + }, + { + "epoch": 0.42114076399790684, + "grad_norm": 2.133177530342129, + "learning_rate": 1.2993944221517898e-05, + "loss": 0.918, + "step": 4024 + }, + { + "epoch": 0.42124542124542125, + "grad_norm": 1.913400092526822, + "learning_rate": 1.2990709817781839e-05, + "loss": 0.8177, + "step": 4025 + }, + { + "epoch": 0.42135007849293565, + "grad_norm": 1.8746443011882772, + "learning_rate": 1.2987475070407154e-05, + "loss": 0.9147, + "step": 4026 + }, + { + "epoch": 0.42145473574045, + "grad_norm": 2.01816221130607, + "learning_rate": 1.2984239979765531e-05, + "loss": 1.0686, + "step": 4027 + }, + { + "epoch": 0.4215593929879644, + "grad_norm": 2.1127015875668365, + "learning_rate": 1.2981004546228685e-05, + "loss": 1.0448, + "step": 4028 + }, + { + "epoch": 0.4216640502354788, + "grad_norm": 2.1501657966221526, + "learning_rate": 1.2977768770168374e-05, + "loss": 1.0125, + "step": 4029 + }, + { + "epoch": 0.4217687074829932, + "grad_norm": 1.8600363609360118, + "learning_rate": 1.2974532651956393e-05, + "loss": 1.0479, + "step": 4030 + }, + { + "epoch": 0.42187336473050757, + "grad_norm": 2.4569790644754996, + "learning_rate": 1.297129619196458e-05, + "loss": 0.9219, + "step": 4031 + }, + { + "epoch": 0.421978021978022, + "grad_norm": 2.47767730012033, + "learning_rate": 1.2968059390564813e-05, + "loss": 0.9192, + "step": 4032 + }, + { + "epoch": 0.4220826792255364, + "grad_norm": 2.0774703638862415, + "learning_rate": 1.2964822248129003e-05, + "loss": 0.9324, + "step": 4033 + }, + { + "epoch": 0.42218733647305073, + "grad_norm": 2.5616604388013524, + "learning_rate": 1.2961584765029107e-05, + "loss": 0.9997, + "step": 4034 + }, + { + "epoch": 0.42229199372056514, + "grad_norm": 2.3036391018677227, + "learning_rate": 1.2958346941637119e-05, + "loss": 0.9766, + "step": 4035 + }, + { + "epoch": 0.42239665096807955, + "grad_norm": 2.4294354631420885, + "learning_rate": 1.295510877832507e-05, + "loss": 0.8532, + "step": 4036 + }, + { + "epoch": 0.42250130821559395, + "grad_norm": 2.262444310074482, + "learning_rate": 1.2951870275465033e-05, + "loss": 1.0549, + "step": 4037 + }, + { + "epoch": 0.4226059654631083, + "grad_norm": 2.1283035221559654, + "learning_rate": 1.2948631433429114e-05, + "loss": 1.0502, + "step": 4038 + }, + { + "epoch": 0.4227106227106227, + "grad_norm": 2.214881325613008, + "learning_rate": 1.2945392252589466e-05, + "loss": 0.9077, + "step": 4039 + }, + { + "epoch": 0.4228152799581371, + "grad_norm": 2.369040409827918, + "learning_rate": 1.2942152733318278e-05, + "loss": 0.9257, + "step": 4040 + }, + { + "epoch": 0.42291993720565146, + "grad_norm": 1.6747394664320463, + "learning_rate": 1.2938912875987776e-05, + "loss": 0.9738, + "step": 4041 + }, + { + "epoch": 0.42302459445316587, + "grad_norm": 1.8358883748542993, + "learning_rate": 1.2935672680970231e-05, + "loss": 0.8974, + "step": 4042 + }, + { + "epoch": 0.4231292517006803, + "grad_norm": 1.9700721618750736, + "learning_rate": 1.2932432148637937e-05, + "loss": 0.8539, + "step": 4043 + }, + { + "epoch": 0.4232339089481947, + "grad_norm": 1.7081738341481085, + "learning_rate": 1.2929191279363248e-05, + "loss": 0.9157, + "step": 4044 + }, + { + "epoch": 0.42333856619570903, + "grad_norm": 1.9888702561572258, + "learning_rate": 1.2925950073518543e-05, + "loss": 0.8719, + "step": 4045 + }, + { + "epoch": 0.42344322344322344, + "grad_norm": 2.1023936971584494, + "learning_rate": 1.2922708531476245e-05, + "loss": 0.9698, + "step": 4046 + }, + { + "epoch": 0.42354788069073784, + "grad_norm": 2.065987444351679, + "learning_rate": 1.2919466653608807e-05, + "loss": 1.0014, + "step": 4047 + }, + { + "epoch": 0.42365253793825225, + "grad_norm": 2.1081730464926682, + "learning_rate": 1.2916224440288736e-05, + "loss": 0.8841, + "step": 4048 + }, + { + "epoch": 0.4237571951857666, + "grad_norm": 1.9254261511446567, + "learning_rate": 1.2912981891888566e-05, + "loss": 1.0805, + "step": 4049 + }, + { + "epoch": 0.423861852433281, + "grad_norm": 1.8610671376702372, + "learning_rate": 1.2909739008780866e-05, + "loss": 1.0011, + "step": 4050 + }, + { + "epoch": 0.4239665096807954, + "grad_norm": 1.8387402872473773, + "learning_rate": 1.290649579133826e-05, + "loss": 1.0046, + "step": 4051 + }, + { + "epoch": 0.42407116692830976, + "grad_norm": 1.9929812428438674, + "learning_rate": 1.2903252239933393e-05, + "loss": 0.8668, + "step": 4052 + }, + { + "epoch": 0.42417582417582417, + "grad_norm": 2.140516044935987, + "learning_rate": 1.290000835493896e-05, + "loss": 1.0137, + "step": 4053 + }, + { + "epoch": 0.4242804814233386, + "grad_norm": 2.271991022792856, + "learning_rate": 1.2896764136727687e-05, + "loss": 0.9522, + "step": 4054 + }, + { + "epoch": 0.424385138670853, + "grad_norm": 1.8813686606836715, + "learning_rate": 1.2893519585672342e-05, + "loss": 0.986, + "step": 4055 + }, + { + "epoch": 0.42448979591836733, + "grad_norm": 1.974109588184302, + "learning_rate": 1.289027470214573e-05, + "loss": 1.0201, + "step": 4056 + }, + { + "epoch": 0.42459445316588174, + "grad_norm": 2.0120661694359607, + "learning_rate": 1.2887029486520697e-05, + "loss": 0.9441, + "step": 4057 + }, + { + "epoch": 0.42469911041339614, + "grad_norm": 2.141950924459728, + "learning_rate": 1.2883783939170125e-05, + "loss": 0.9926, + "step": 4058 + }, + { + "epoch": 0.4248037676609105, + "grad_norm": 2.5623343563978915, + "learning_rate": 1.2880538060466931e-05, + "loss": 0.9227, + "step": 4059 + }, + { + "epoch": 0.4249084249084249, + "grad_norm": 1.9750731791255807, + "learning_rate": 1.287729185078407e-05, + "loss": 1.0129, + "step": 4060 + }, + { + "epoch": 0.4250130821559393, + "grad_norm": 2.0054191639646692, + "learning_rate": 1.2874045310494549e-05, + "loss": 0.9068, + "step": 4061 + }, + { + "epoch": 0.4251177394034537, + "grad_norm": 1.9724083424778205, + "learning_rate": 1.2870798439971393e-05, + "loss": 0.862, + "step": 4062 + }, + { + "epoch": 0.42522239665096806, + "grad_norm": 2.319494091259867, + "learning_rate": 1.2867551239587677e-05, + "loss": 1.0978, + "step": 4063 + }, + { + "epoch": 0.42532705389848247, + "grad_norm": 2.319487442343795, + "learning_rate": 1.286430370971651e-05, + "loss": 1.0296, + "step": 4064 + }, + { + "epoch": 0.42543171114599687, + "grad_norm": 2.0836979512763336, + "learning_rate": 1.2861055850731039e-05, + "loss": 0.972, + "step": 4065 + }, + { + "epoch": 0.4255363683935113, + "grad_norm": 2.552512993363399, + "learning_rate": 1.2857807663004453e-05, + "loss": 0.8689, + "step": 4066 + }, + { + "epoch": 0.4256410256410256, + "grad_norm": 2.1467202430472763, + "learning_rate": 1.285455914690997e-05, + "loss": 0.9987, + "step": 4067 + }, + { + "epoch": 0.42574568288854003, + "grad_norm": 2.1204522151177474, + "learning_rate": 1.285131030282086e-05, + "loss": 1.0174, + "step": 4068 + }, + { + "epoch": 0.42585034013605444, + "grad_norm": 2.3582725345374027, + "learning_rate": 1.284806113111041e-05, + "loss": 0.9143, + "step": 4069 + }, + { + "epoch": 0.4259549973835688, + "grad_norm": 2.0482186802071287, + "learning_rate": 1.2844811632151964e-05, + "loss": 0.9063, + "step": 4070 + }, + { + "epoch": 0.4260596546310832, + "grad_norm": 2.0895838757594696, + "learning_rate": 1.2841561806318897e-05, + "loss": 1.052, + "step": 4071 + }, + { + "epoch": 0.4261643118785976, + "grad_norm": 2.0282129968162246, + "learning_rate": 1.2838311653984615e-05, + "loss": 0.9849, + "step": 4072 + }, + { + "epoch": 0.426268969126112, + "grad_norm": 2.5256239159226745, + "learning_rate": 1.2835061175522569e-05, + "loss": 1.102, + "step": 4073 + }, + { + "epoch": 0.42637362637362636, + "grad_norm": 2.0879581133791687, + "learning_rate": 1.2831810371306247e-05, + "loss": 0.9754, + "step": 4074 + }, + { + "epoch": 0.42647828362114076, + "grad_norm": 1.8366234669283463, + "learning_rate": 1.2828559241709177e-05, + "loss": 0.7956, + "step": 4075 + }, + { + "epoch": 0.42658294086865517, + "grad_norm": 1.8797736027924894, + "learning_rate": 1.2825307787104917e-05, + "loss": 0.9943, + "step": 4076 + }, + { + "epoch": 0.4266875981161695, + "grad_norm": 2.092067441756589, + "learning_rate": 1.282205600786706e-05, + "loss": 1.0322, + "step": 4077 + }, + { + "epoch": 0.4267922553636839, + "grad_norm": 2.080732968369759, + "learning_rate": 1.2818803904369248e-05, + "loss": 0.9993, + "step": 4078 + }, + { + "epoch": 0.42689691261119833, + "grad_norm": 1.8121577795467074, + "learning_rate": 1.2815551476985151e-05, + "loss": 0.8637, + "step": 4079 + }, + { + "epoch": 0.42700156985871274, + "grad_norm": 1.8717669220769215, + "learning_rate": 1.2812298726088486e-05, + "loss": 0.8787, + "step": 4080 + }, + { + "epoch": 0.4271062271062271, + "grad_norm": 2.179750288411442, + "learning_rate": 1.2809045652052992e-05, + "loss": 0.8961, + "step": 4081 + }, + { + "epoch": 0.4272108843537415, + "grad_norm": 2.0838225686688263, + "learning_rate": 1.280579225525246e-05, + "loss": 0.9929, + "step": 4082 + }, + { + "epoch": 0.4273155416012559, + "grad_norm": 2.275118316941627, + "learning_rate": 1.2802538536060712e-05, + "loss": 1.0092, + "step": 4083 + }, + { + "epoch": 0.42742019884877025, + "grad_norm": 2.130895970762807, + "learning_rate": 1.2799284494851602e-05, + "loss": 0.8968, + "step": 4084 + }, + { + "epoch": 0.42752485609628466, + "grad_norm": 1.8836199307900716, + "learning_rate": 1.279603013199903e-05, + "loss": 0.9125, + "step": 4085 + }, + { + "epoch": 0.42762951334379906, + "grad_norm": 2.458988223380957, + "learning_rate": 1.2792775447876928e-05, + "loss": 1.0487, + "step": 4086 + }, + { + "epoch": 0.42773417059131347, + "grad_norm": 1.9971378709402376, + "learning_rate": 1.2789520442859265e-05, + "loss": 0.863, + "step": 4087 + }, + { + "epoch": 0.4278388278388278, + "grad_norm": 2.007011122353183, + "learning_rate": 1.2786265117320047e-05, + "loss": 1.0189, + "step": 4088 + }, + { + "epoch": 0.4279434850863422, + "grad_norm": 2.1617166345362673, + "learning_rate": 1.2783009471633321e-05, + "loss": 0.9055, + "step": 4089 + }, + { + "epoch": 0.42804814233385663, + "grad_norm": 2.5367597991139115, + "learning_rate": 1.2779753506173163e-05, + "loss": 1.0275, + "step": 4090 + }, + { + "epoch": 0.42815279958137104, + "grad_norm": 1.9826469266228839, + "learning_rate": 1.277649722131369e-05, + "loss": 0.893, + "step": 4091 + }, + { + "epoch": 0.4282574568288854, + "grad_norm": 2.038739638755624, + "learning_rate": 1.277324061742906e-05, + "loss": 1.0419, + "step": 4092 + }, + { + "epoch": 0.4283621140763998, + "grad_norm": 2.5868448257014705, + "learning_rate": 1.2769983694893461e-05, + "loss": 1.0708, + "step": 4093 + }, + { + "epoch": 0.4284667713239142, + "grad_norm": 1.952363024664158, + "learning_rate": 1.2766726454081124e-05, + "loss": 0.8693, + "step": 4094 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 2.537669019028795, + "learning_rate": 1.2763468895366304e-05, + "loss": 0.8952, + "step": 4095 + }, + { + "epoch": 0.42867608581894295, + "grad_norm": 2.1143917570175166, + "learning_rate": 1.2760211019123307e-05, + "loss": 0.9822, + "step": 4096 + }, + { + "epoch": 0.42878074306645736, + "grad_norm": 2.0948895182122915, + "learning_rate": 1.2756952825726469e-05, + "loss": 0.9116, + "step": 4097 + }, + { + "epoch": 0.42888540031397177, + "grad_norm": 2.222927984303674, + "learning_rate": 1.2753694315550163e-05, + "loss": 0.9202, + "step": 4098 + }, + { + "epoch": 0.4289900575614861, + "grad_norm": 2.127865487750365, + "learning_rate": 1.2750435488968793e-05, + "loss": 1.0449, + "step": 4099 + }, + { + "epoch": 0.4290947148090005, + "grad_norm": 1.9788232612921248, + "learning_rate": 1.2747176346356815e-05, + "loss": 0.9387, + "step": 4100 + }, + { + "epoch": 0.4291993720565149, + "grad_norm": 1.9016287167665056, + "learning_rate": 1.2743916888088706e-05, + "loss": 0.8767, + "step": 4101 + }, + { + "epoch": 0.4293040293040293, + "grad_norm": 2.1605389550086196, + "learning_rate": 1.274065711453898e-05, + "loss": 1.0258, + "step": 4102 + }, + { + "epoch": 0.4294086865515437, + "grad_norm": 2.5323063383357463, + "learning_rate": 1.2737397026082199e-05, + "loss": 0.9743, + "step": 4103 + }, + { + "epoch": 0.4295133437990581, + "grad_norm": 1.9921710514030042, + "learning_rate": 1.2734136623092948e-05, + "loss": 1.0633, + "step": 4104 + }, + { + "epoch": 0.4296180010465725, + "grad_norm": 2.054756726068761, + "learning_rate": 1.273087590594586e-05, + "loss": 1.0129, + "step": 4105 + }, + { + "epoch": 0.42972265829408685, + "grad_norm": 2.1517769857523517, + "learning_rate": 1.272761487501559e-05, + "loss": 0.9996, + "step": 4106 + }, + { + "epoch": 0.42982731554160125, + "grad_norm": 2.0631947885960598, + "learning_rate": 1.2724353530676842e-05, + "loss": 1.0288, + "step": 4107 + }, + { + "epoch": 0.42993197278911566, + "grad_norm": 2.0629095347948625, + "learning_rate": 1.2721091873304351e-05, + "loss": 0.9055, + "step": 4108 + }, + { + "epoch": 0.43003663003663006, + "grad_norm": 2.1035215723837544, + "learning_rate": 1.2717829903272889e-05, + "loss": 0.9523, + "step": 4109 + }, + { + "epoch": 0.4301412872841444, + "grad_norm": 2.1532839005297766, + "learning_rate": 1.2714567620957262e-05, + "loss": 0.9819, + "step": 4110 + }, + { + "epoch": 0.4302459445316588, + "grad_norm": 2.4892633893694778, + "learning_rate": 1.271130502673231e-05, + "loss": 0.9969, + "step": 4111 + }, + { + "epoch": 0.4303506017791732, + "grad_norm": 2.0071464961288243, + "learning_rate": 1.2708042120972914e-05, + "loss": 1.0227, + "step": 4112 + }, + { + "epoch": 0.4304552590266876, + "grad_norm": 2.155107165578016, + "learning_rate": 1.270477890405399e-05, + "loss": 0.9929, + "step": 4113 + }, + { + "epoch": 0.430559916274202, + "grad_norm": 2.169518740126109, + "learning_rate": 1.2701515376350489e-05, + "loss": 0.9272, + "step": 4114 + }, + { + "epoch": 0.4306645735217164, + "grad_norm": 2.739527866572885, + "learning_rate": 1.2698251538237389e-05, + "loss": 0.9999, + "step": 4115 + }, + { + "epoch": 0.4307692307692308, + "grad_norm": 2.513640769034525, + "learning_rate": 1.2694987390089723e-05, + "loss": 1.0112, + "step": 4116 + }, + { + "epoch": 0.43087388801674514, + "grad_norm": 2.0925444858975326, + "learning_rate": 1.269172293228254e-05, + "loss": 0.9724, + "step": 4117 + }, + { + "epoch": 0.43097854526425955, + "grad_norm": 2.1067772462429883, + "learning_rate": 1.2688458165190933e-05, + "loss": 0.9119, + "step": 4118 + }, + { + "epoch": 0.43108320251177396, + "grad_norm": 2.0030783316902165, + "learning_rate": 1.2685193089190038e-05, + "loss": 1.0959, + "step": 4119 + }, + { + "epoch": 0.4311878597592883, + "grad_norm": 2.1073270714006034, + "learning_rate": 1.2681927704655008e-05, + "loss": 1.0327, + "step": 4120 + }, + { + "epoch": 0.4312925170068027, + "grad_norm": 2.137450289870023, + "learning_rate": 1.2678662011961051e-05, + "loss": 0.9802, + "step": 4121 + }, + { + "epoch": 0.4313971742543171, + "grad_norm": 2.0752705351892673, + "learning_rate": 1.2675396011483398e-05, + "loss": 0.9755, + "step": 4122 + }, + { + "epoch": 0.4315018315018315, + "grad_norm": 2.2558252138712582, + "learning_rate": 1.2672129703597321e-05, + "loss": 1.0148, + "step": 4123 + }, + { + "epoch": 0.4316064887493459, + "grad_norm": 1.9748773751441262, + "learning_rate": 1.2668863088678124e-05, + "loss": 0.9922, + "step": 4124 + }, + { + "epoch": 0.4317111459968603, + "grad_norm": 1.8299642601513824, + "learning_rate": 1.2665596167101144e-05, + "loss": 0.8775, + "step": 4125 + }, + { + "epoch": 0.4318158032443747, + "grad_norm": 2.2374692170514527, + "learning_rate": 1.2662328939241766e-05, + "loss": 1.0831, + "step": 4126 + }, + { + "epoch": 0.4319204604918891, + "grad_norm": 2.1794273635322376, + "learning_rate": 1.2659061405475392e-05, + "loss": 0.9673, + "step": 4127 + }, + { + "epoch": 0.43202511773940344, + "grad_norm": 1.939053173523464, + "learning_rate": 1.2655793566177475e-05, + "loss": 0.8898, + "step": 4128 + }, + { + "epoch": 0.43212977498691785, + "grad_norm": 2.0573506268728328, + "learning_rate": 1.2652525421723489e-05, + "loss": 0.8148, + "step": 4129 + }, + { + "epoch": 0.43223443223443225, + "grad_norm": 1.8819328528426567, + "learning_rate": 1.2649256972488956e-05, + "loss": 1.0517, + "step": 4130 + }, + { + "epoch": 0.4323390894819466, + "grad_norm": 1.9984163786543963, + "learning_rate": 1.2645988218849432e-05, + "loss": 0.908, + "step": 4131 + }, + { + "epoch": 0.432443746729461, + "grad_norm": 1.8773422683017276, + "learning_rate": 1.264271916118049e-05, + "loss": 0.8387, + "step": 4132 + }, + { + "epoch": 0.4325484039769754, + "grad_norm": 2.0149662866765863, + "learning_rate": 1.2639449799857766e-05, + "loss": 0.9982, + "step": 4133 + }, + { + "epoch": 0.4326530612244898, + "grad_norm": 2.174292902817768, + "learning_rate": 1.2636180135256904e-05, + "loss": 1.1082, + "step": 4134 + }, + { + "epoch": 0.43275771847200417, + "grad_norm": 2.2208332106910897, + "learning_rate": 1.2632910167753601e-05, + "loss": 0.8762, + "step": 4135 + }, + { + "epoch": 0.4328623757195186, + "grad_norm": 2.181100971590527, + "learning_rate": 1.2629639897723585e-05, + "loss": 0.975, + "step": 4136 + }, + { + "epoch": 0.432967032967033, + "grad_norm": 1.931613675855517, + "learning_rate": 1.262636932554261e-05, + "loss": 0.8959, + "step": 4137 + }, + { + "epoch": 0.43307169021454733, + "grad_norm": 2.2708040082763943, + "learning_rate": 1.2623098451586477e-05, + "loss": 1.0665, + "step": 4138 + }, + { + "epoch": 0.43317634746206174, + "grad_norm": 1.9224472683374367, + "learning_rate": 1.2619827276231017e-05, + "loss": 0.9472, + "step": 4139 + }, + { + "epoch": 0.43328100470957615, + "grad_norm": 2.2865237719592275, + "learning_rate": 1.261655579985209e-05, + "loss": 0.8564, + "step": 4140 + }, + { + "epoch": 0.43338566195709055, + "grad_norm": 1.937006380048429, + "learning_rate": 1.2613284022825596e-05, + "loss": 1.0623, + "step": 4141 + }, + { + "epoch": 0.4334903192046049, + "grad_norm": 2.5686460907237096, + "learning_rate": 1.2610011945527471e-05, + "loss": 1.0288, + "step": 4142 + }, + { + "epoch": 0.4335949764521193, + "grad_norm": 2.4290445999367405, + "learning_rate": 1.2606739568333684e-05, + "loss": 1.0339, + "step": 4143 + }, + { + "epoch": 0.4336996336996337, + "grad_norm": 2.009556122935142, + "learning_rate": 1.2603466891620232e-05, + "loss": 0.8586, + "step": 4144 + }, + { + "epoch": 0.43380429094714806, + "grad_norm": 1.7741581344189656, + "learning_rate": 1.2600193915763162e-05, + "loss": 0.8876, + "step": 4145 + }, + { + "epoch": 0.43390894819466247, + "grad_norm": 2.4372967065545317, + "learning_rate": 1.2596920641138538e-05, + "loss": 1.0327, + "step": 4146 + }, + { + "epoch": 0.4340136054421769, + "grad_norm": 2.0894117465917215, + "learning_rate": 1.2593647068122465e-05, + "loss": 1.0019, + "step": 4147 + }, + { + "epoch": 0.4341182626896913, + "grad_norm": 2.123368845322283, + "learning_rate": 1.259037319709109e-05, + "loss": 0.9752, + "step": 4148 + }, + { + "epoch": 0.43422291993720563, + "grad_norm": 2.041973404473876, + "learning_rate": 1.258709902842058e-05, + "loss": 1.0645, + "step": 4149 + }, + { + "epoch": 0.43432757718472004, + "grad_norm": 2.2360927293499544, + "learning_rate": 1.258382456248715e-05, + "loss": 0.9802, + "step": 4150 + }, + { + "epoch": 0.43443223443223444, + "grad_norm": 1.9616686428786763, + "learning_rate": 1.2580549799667034e-05, + "loss": 0.8553, + "step": 4151 + }, + { + "epoch": 0.43453689167974885, + "grad_norm": 2.2952336749644804, + "learning_rate": 1.2577274740336523e-05, + "loss": 1.0347, + "step": 4152 + }, + { + "epoch": 0.4346415489272632, + "grad_norm": 2.370451934856473, + "learning_rate": 1.2573999384871916e-05, + "loss": 0.9924, + "step": 4153 + }, + { + "epoch": 0.4347462061747776, + "grad_norm": 2.0939601713417577, + "learning_rate": 1.2570723733649558e-05, + "loss": 0.9574, + "step": 4154 + }, + { + "epoch": 0.434850863422292, + "grad_norm": 2.388950829479333, + "learning_rate": 1.256744778704584e-05, + "loss": 0.9239, + "step": 4155 + }, + { + "epoch": 0.43495552066980636, + "grad_norm": 1.978177781520638, + "learning_rate": 1.2564171545437156e-05, + "loss": 0.9031, + "step": 4156 + }, + { + "epoch": 0.43506017791732077, + "grad_norm": 2.112737659944072, + "learning_rate": 1.2560895009199972e-05, + "loss": 0.9537, + "step": 4157 + }, + { + "epoch": 0.4351648351648352, + "grad_norm": 2.033176365951217, + "learning_rate": 1.2557618178710756e-05, + "loss": 1.0335, + "step": 4158 + }, + { + "epoch": 0.4352694924123496, + "grad_norm": 2.1148782403285784, + "learning_rate": 1.2554341054346027e-05, + "loss": 0.7705, + "step": 4159 + }, + { + "epoch": 0.43537414965986393, + "grad_norm": 1.6826596407900571, + "learning_rate": 1.2551063636482332e-05, + "loss": 0.7461, + "step": 4160 + }, + { + "epoch": 0.43547880690737834, + "grad_norm": 2.0821373935182623, + "learning_rate": 1.2547785925496255e-05, + "loss": 0.8977, + "step": 4161 + }, + { + "epoch": 0.43558346415489274, + "grad_norm": 2.7138514710502126, + "learning_rate": 1.254450792176441e-05, + "loss": 1.0286, + "step": 4162 + }, + { + "epoch": 0.4356881214024071, + "grad_norm": 1.928568939003756, + "learning_rate": 1.2541229625663445e-05, + "loss": 0.8945, + "step": 4163 + }, + { + "epoch": 0.4357927786499215, + "grad_norm": 2.5708479006499085, + "learning_rate": 1.2537951037570047e-05, + "loss": 0.9281, + "step": 4164 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 2.7402121626792773, + "learning_rate": 1.2534672157860928e-05, + "loss": 0.9111, + "step": 4165 + }, + { + "epoch": 0.4360020931449503, + "grad_norm": 2.1020926928873744, + "learning_rate": 1.2531392986912839e-05, + "loss": 1.0641, + "step": 4166 + }, + { + "epoch": 0.43610675039246466, + "grad_norm": 2.0147104900016153, + "learning_rate": 1.2528113525102568e-05, + "loss": 0.9251, + "step": 4167 + }, + { + "epoch": 0.43621140763997907, + "grad_norm": 2.1361681568328463, + "learning_rate": 1.2524833772806924e-05, + "loss": 1.0695, + "step": 4168 + }, + { + "epoch": 0.43631606488749347, + "grad_norm": 1.9844534323381833, + "learning_rate": 1.2521553730402762e-05, + "loss": 1.022, + "step": 4169 + }, + { + "epoch": 0.4364207221350079, + "grad_norm": 2.1536611659943854, + "learning_rate": 1.2518273398266963e-05, + "loss": 1.0211, + "step": 4170 + }, + { + "epoch": 0.4365253793825222, + "grad_norm": 1.9066376292841682, + "learning_rate": 1.251499277677645e-05, + "loss": 0.9381, + "step": 4171 + }, + { + "epoch": 0.43663003663003663, + "grad_norm": 1.979655024358628, + "learning_rate": 1.2511711866308167e-05, + "loss": 0.9354, + "step": 4172 + }, + { + "epoch": 0.43673469387755104, + "grad_norm": 1.930714983152759, + "learning_rate": 1.2508430667239094e-05, + "loss": 0.8527, + "step": 4173 + }, + { + "epoch": 0.4368393511250654, + "grad_norm": 2.1508753326787144, + "learning_rate": 1.2505149179946257e-05, + "loss": 1.0292, + "step": 4174 + }, + { + "epoch": 0.4369440083725798, + "grad_norm": 1.799153917872675, + "learning_rate": 1.25018674048067e-05, + "loss": 0.9594, + "step": 4175 + }, + { + "epoch": 0.4370486656200942, + "grad_norm": 2.1158310072558493, + "learning_rate": 1.2498585342197507e-05, + "loss": 1.0612, + "step": 4176 + }, + { + "epoch": 0.4371533228676086, + "grad_norm": 1.784434212580124, + "learning_rate": 1.2495302992495793e-05, + "loss": 1.0151, + "step": 4177 + }, + { + "epoch": 0.43725798011512296, + "grad_norm": 1.9237924946084939, + "learning_rate": 1.2492020356078705e-05, + "loss": 0.8651, + "step": 4178 + }, + { + "epoch": 0.43736263736263736, + "grad_norm": 2.5836868280522576, + "learning_rate": 1.2488737433323429e-05, + "loss": 0.9226, + "step": 4179 + }, + { + "epoch": 0.43746729461015177, + "grad_norm": 2.019400496982174, + "learning_rate": 1.2485454224607171e-05, + "loss": 0.8531, + "step": 4180 + }, + { + "epoch": 0.4375719518576661, + "grad_norm": 1.8357957654319403, + "learning_rate": 1.2482170730307191e-05, + "loss": 0.9594, + "step": 4181 + }, + { + "epoch": 0.4376766091051805, + "grad_norm": 2.2211596017891084, + "learning_rate": 1.2478886950800757e-05, + "loss": 0.9483, + "step": 4182 + }, + { + "epoch": 0.43778126635269493, + "grad_norm": 2.0328108335435093, + "learning_rate": 1.2475602886465187e-05, + "loss": 1.0499, + "step": 4183 + }, + { + "epoch": 0.43788592360020934, + "grad_norm": 1.9848307097983084, + "learning_rate": 1.2472318537677833e-05, + "loss": 0.868, + "step": 4184 + }, + { + "epoch": 0.4379905808477237, + "grad_norm": 1.9830682161242188, + "learning_rate": 1.2469033904816062e-05, + "loss": 0.952, + "step": 4185 + }, + { + "epoch": 0.4380952380952381, + "grad_norm": 1.8538521533923138, + "learning_rate": 1.246574898825729e-05, + "loss": 0.8598, + "step": 4186 + }, + { + "epoch": 0.4381998953427525, + "grad_norm": 2.3076134727295843, + "learning_rate": 1.246246378837896e-05, + "loss": 0.8155, + "step": 4187 + }, + { + "epoch": 0.43830455259026685, + "grad_norm": 2.3923552507037185, + "learning_rate": 1.2459178305558553e-05, + "loss": 1.0912, + "step": 4188 + }, + { + "epoch": 0.43840920983778126, + "grad_norm": 1.8990741957670583, + "learning_rate": 1.2455892540173572e-05, + "loss": 0.8969, + "step": 4189 + }, + { + "epoch": 0.43851386708529566, + "grad_norm": 1.8395674522822645, + "learning_rate": 1.2452606492601556e-05, + "loss": 0.905, + "step": 4190 + }, + { + "epoch": 0.43861852433281007, + "grad_norm": 1.859136871339698, + "learning_rate": 1.2449320163220087e-05, + "loss": 0.9013, + "step": 4191 + }, + { + "epoch": 0.4387231815803244, + "grad_norm": 2.407371693080437, + "learning_rate": 1.2446033552406763e-05, + "loss": 0.8304, + "step": 4192 + }, + { + "epoch": 0.4388278388278388, + "grad_norm": 2.388853746571948, + "learning_rate": 1.2442746660539226e-05, + "loss": 1.089, + "step": 4193 + }, + { + "epoch": 0.43893249607535323, + "grad_norm": 1.9365985500657477, + "learning_rate": 1.2439459487995145e-05, + "loss": 0.9722, + "step": 4194 + }, + { + "epoch": 0.43903715332286763, + "grad_norm": 2.963598627913234, + "learning_rate": 1.2436172035152223e-05, + "loss": 0.8233, + "step": 4195 + }, + { + "epoch": 0.439141810570382, + "grad_norm": 2.2703125367436363, + "learning_rate": 1.2432884302388198e-05, + "loss": 1.0116, + "step": 4196 + }, + { + "epoch": 0.4392464678178964, + "grad_norm": 2.129118913151718, + "learning_rate": 1.2429596290080832e-05, + "loss": 1.0584, + "step": 4197 + }, + { + "epoch": 0.4393511250654108, + "grad_norm": 2.0256017953605383, + "learning_rate": 1.2426307998607929e-05, + "loss": 1.008, + "step": 4198 + }, + { + "epoch": 0.43945578231292515, + "grad_norm": 2.028462889521559, + "learning_rate": 1.2423019428347316e-05, + "loss": 0.9788, + "step": 4199 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 2.1206707709023003, + "learning_rate": 1.2419730579676861e-05, + "loss": 0.8672, + "step": 4200 + }, + { + "epoch": 0.43966509680795396, + "grad_norm": 2.0885573335422705, + "learning_rate": 1.2416441452974457e-05, + "loss": 0.934, + "step": 4201 + }, + { + "epoch": 0.43976975405546836, + "grad_norm": 1.9950363704570153, + "learning_rate": 1.241315204861803e-05, + "loss": 0.9081, + "step": 4202 + }, + { + "epoch": 0.4398744113029827, + "grad_norm": 1.8813239970215452, + "learning_rate": 1.2409862366985541e-05, + "loss": 0.8674, + "step": 4203 + }, + { + "epoch": 0.4399790685504971, + "grad_norm": 2.1510971349415757, + "learning_rate": 1.240657240845498e-05, + "loss": 0.9775, + "step": 4204 + }, + { + "epoch": 0.4400837257980115, + "grad_norm": 2.455900196950652, + "learning_rate": 1.2403282173404373e-05, + "loss": 0.8979, + "step": 4205 + }, + { + "epoch": 0.4401883830455259, + "grad_norm": 2.003615035493078, + "learning_rate": 1.239999166221177e-05, + "loss": 0.949, + "step": 4206 + }, + { + "epoch": 0.4402930402930403, + "grad_norm": 2.122829687717139, + "learning_rate": 1.2396700875255263e-05, + "loss": 1.0648, + "step": 4207 + }, + { + "epoch": 0.4403976975405547, + "grad_norm": 1.7612998154135466, + "learning_rate": 1.2393409812912963e-05, + "loss": 0.8243, + "step": 4208 + }, + { + "epoch": 0.4405023547880691, + "grad_norm": 2.151390451235141, + "learning_rate": 1.2390118475563024e-05, + "loss": 1.1014, + "step": 4209 + }, + { + "epoch": 0.44060701203558345, + "grad_norm": 1.769570106535323, + "learning_rate": 1.238682686358363e-05, + "loss": 0.9075, + "step": 4210 + }, + { + "epoch": 0.44071166928309785, + "grad_norm": 1.8741126127698302, + "learning_rate": 1.2383534977352989e-05, + "loss": 0.9628, + "step": 4211 + }, + { + "epoch": 0.44081632653061226, + "grad_norm": 2.11490519415388, + "learning_rate": 1.2380242817249343e-05, + "loss": 1.0346, + "step": 4212 + }, + { + "epoch": 0.44092098377812666, + "grad_norm": 1.9472990388460194, + "learning_rate": 1.2376950383650978e-05, + "loss": 0.9551, + "step": 4213 + }, + { + "epoch": 0.441025641025641, + "grad_norm": 2.1912277529516357, + "learning_rate": 1.2373657676936194e-05, + "loss": 0.9105, + "step": 4214 + }, + { + "epoch": 0.4411302982731554, + "grad_norm": 2.216215119841429, + "learning_rate": 1.237036469748333e-05, + "loss": 1.0117, + "step": 4215 + }, + { + "epoch": 0.4412349555206698, + "grad_norm": 1.9395445115395555, + "learning_rate": 1.2367071445670755e-05, + "loss": 0.9748, + "step": 4216 + }, + { + "epoch": 0.4413396127681842, + "grad_norm": 2.4154728984909717, + "learning_rate": 1.2363777921876876e-05, + "loss": 1.048, + "step": 4217 + }, + { + "epoch": 0.4414442700156986, + "grad_norm": 2.2295606753839006, + "learning_rate": 1.2360484126480121e-05, + "loss": 1.1085, + "step": 4218 + }, + { + "epoch": 0.441548927263213, + "grad_norm": 21.310321961686725, + "learning_rate": 1.2357190059858955e-05, + "loss": 0.9542, + "step": 4219 + }, + { + "epoch": 0.4416535845107274, + "grad_norm": 2.275313654143178, + "learning_rate": 1.2353895722391873e-05, + "loss": 1.0865, + "step": 4220 + }, + { + "epoch": 0.44175824175824174, + "grad_norm": 2.008270022618239, + "learning_rate": 1.2350601114457397e-05, + "loss": 0.8425, + "step": 4221 + }, + { + "epoch": 0.44186289900575615, + "grad_norm": 30.912506956421982, + "learning_rate": 1.2347306236434091e-05, + "loss": 1.8411, + "step": 4222 + }, + { + "epoch": 0.44196755625327055, + "grad_norm": 14.423513861175078, + "learning_rate": 1.234401108870054e-05, + "loss": 1.2736, + "step": 4223 + }, + { + "epoch": 0.4420722135007849, + "grad_norm": 2.5430152347578816, + "learning_rate": 1.2340715671635365e-05, + "loss": 0.9757, + "step": 4224 + }, + { + "epoch": 0.4421768707482993, + "grad_norm": 18.390708578682016, + "learning_rate": 1.2337419985617209e-05, + "loss": 1.0966, + "step": 4225 + }, + { + "epoch": 0.4422815279958137, + "grad_norm": 2.1563225849286964, + "learning_rate": 1.2334124031024759e-05, + "loss": 0.9753, + "step": 4226 + }, + { + "epoch": 0.4423861852433281, + "grad_norm": 2.209454133083187, + "learning_rate": 1.2330827808236729e-05, + "loss": 1.0031, + "step": 4227 + }, + { + "epoch": 0.4424908424908425, + "grad_norm": 2.0533065013534006, + "learning_rate": 1.2327531317631858e-05, + "loss": 1.0234, + "step": 4228 + }, + { + "epoch": 0.4425954997383569, + "grad_norm": 2.267657972321477, + "learning_rate": 1.232423455958892e-05, + "loss": 0.9795, + "step": 4229 + }, + { + "epoch": 0.4427001569858713, + "grad_norm": 2.3724169669212163, + "learning_rate": 1.232093753448672e-05, + "loss": 1.0609, + "step": 4230 + }, + { + "epoch": 0.44280481423338564, + "grad_norm": 2.2647618736763864, + "learning_rate": 1.2317640242704089e-05, + "loss": 0.9277, + "step": 4231 + }, + { + "epoch": 0.44290947148090004, + "grad_norm": 2.426127300716815, + "learning_rate": 1.23143426846199e-05, + "loss": 0.9978, + "step": 4232 + }, + { + "epoch": 0.44301412872841445, + "grad_norm": 12.748064877007662, + "learning_rate": 1.2311044860613044e-05, + "loss": 1.059, + "step": 4233 + }, + { + "epoch": 0.44311878597592885, + "grad_norm": 2.2159616926614953, + "learning_rate": 1.2307746771062445e-05, + "loss": 0.9901, + "step": 4234 + }, + { + "epoch": 0.4432234432234432, + "grad_norm": 1.9980360395518977, + "learning_rate": 1.2304448416347066e-05, + "loss": 0.8674, + "step": 4235 + }, + { + "epoch": 0.4433281004709576, + "grad_norm": 2.2959534557033416, + "learning_rate": 1.2301149796845895e-05, + "loss": 1.0436, + "step": 4236 + }, + { + "epoch": 0.443432757718472, + "grad_norm": 2.449657687751539, + "learning_rate": 1.2297850912937946e-05, + "loss": 1.1151, + "step": 4237 + }, + { + "epoch": 0.4435374149659864, + "grad_norm": 1.852361721600591, + "learning_rate": 1.2294551765002265e-05, + "loss": 0.8302, + "step": 4238 + }, + { + "epoch": 0.44364207221350077, + "grad_norm": 1.7397853007381971, + "learning_rate": 1.229125235341794e-05, + "loss": 0.839, + "step": 4239 + }, + { + "epoch": 0.4437467294610152, + "grad_norm": 2.309355901645421, + "learning_rate": 1.228795267856407e-05, + "loss": 1.0563, + "step": 4240 + }, + { + "epoch": 0.4438513867085296, + "grad_norm": 2.1118423440247627, + "learning_rate": 1.2284652740819803e-05, + "loss": 1.0022, + "step": 4241 + }, + { + "epoch": 0.44395604395604393, + "grad_norm": 2.214872624726237, + "learning_rate": 1.2281352540564302e-05, + "loss": 0.9913, + "step": 4242 + }, + { + "epoch": 0.44406070120355834, + "grad_norm": 2.239625811455294, + "learning_rate": 1.227805207817677e-05, + "loss": 0.9527, + "step": 4243 + }, + { + "epoch": 0.44416535845107274, + "grad_norm": 31.368906412723593, + "learning_rate": 1.2274751354036439e-05, + "loss": 1.574, + "step": 4244 + }, + { + "epoch": 0.44427001569858715, + "grad_norm": 1.9338765858884037, + "learning_rate": 1.2271450368522562e-05, + "loss": 0.9677, + "step": 4245 + }, + { + "epoch": 0.4443746729461015, + "grad_norm": 2.531637449689326, + "learning_rate": 1.2268149122014436e-05, + "loss": 1.0082, + "step": 4246 + }, + { + "epoch": 0.4444793301936159, + "grad_norm": 12.934239661888748, + "learning_rate": 1.2264847614891373e-05, + "loss": 1.0943, + "step": 4247 + }, + { + "epoch": 0.4445839874411303, + "grad_norm": 2.519052355502292, + "learning_rate": 1.2261545847532731e-05, + "loss": 0.958, + "step": 4248 + }, + { + "epoch": 0.44468864468864466, + "grad_norm": 109.81451808318943, + "learning_rate": 1.225824382031789e-05, + "loss": 2.7126, + "step": 4249 + }, + { + "epoch": 0.44479330193615907, + "grad_norm": 2.461447085373303, + "learning_rate": 1.2254941533626253e-05, + "loss": 0.9693, + "step": 4250 + }, + { + "epoch": 0.4448979591836735, + "grad_norm": 2.9594119092355458, + "learning_rate": 1.225163898783726e-05, + "loss": 0.8497, + "step": 4251 + }, + { + "epoch": 0.4450026164311879, + "grad_norm": 5.254203001278294, + "learning_rate": 1.2248336183330384e-05, + "loss": 1.0249, + "step": 4252 + }, + { + "epoch": 0.44510727367870223, + "grad_norm": 4.105904824801587, + "learning_rate": 1.2245033120485127e-05, + "loss": 0.9186, + "step": 4253 + }, + { + "epoch": 0.44521193092621664, + "grad_norm": 2.945698769993366, + "learning_rate": 1.2241729799681008e-05, + "loss": 0.8103, + "step": 4254 + }, + { + "epoch": 0.44531658817373104, + "grad_norm": 5.8775538590842995, + "learning_rate": 1.223842622129759e-05, + "loss": 1.147, + "step": 4255 + }, + { + "epoch": 0.44542124542124545, + "grad_norm": 30.161697236880144, + "learning_rate": 1.2235122385714467e-05, + "loss": 4.1431, + "step": 4256 + }, + { + "epoch": 0.4455259026687598, + "grad_norm": 8.326858087895113, + "learning_rate": 1.2231818293311246e-05, + "loss": 1.1568, + "step": 4257 + }, + { + "epoch": 0.4456305599162742, + "grad_norm": 9.194801521266452, + "learning_rate": 1.2228513944467581e-05, + "loss": 1.2268, + "step": 4258 + }, + { + "epoch": 0.4457352171637886, + "grad_norm": 7.57812928636459, + "learning_rate": 1.2225209339563144e-05, + "loss": 1.1154, + "step": 4259 + }, + { + "epoch": 0.44583987441130296, + "grad_norm": 6.130800488712524, + "learning_rate": 1.2221904478977646e-05, + "loss": 0.9692, + "step": 4260 + }, + { + "epoch": 0.44594453165881737, + "grad_norm": 4.833189877025352, + "learning_rate": 1.2218599363090818e-05, + "loss": 0.9329, + "step": 4261 + }, + { + "epoch": 0.4460491889063318, + "grad_norm": 3.840930122922004, + "learning_rate": 1.2215293992282424e-05, + "loss": 1.103, + "step": 4262 + }, + { + "epoch": 0.4461538461538462, + "grad_norm": 3.8094608876035267, + "learning_rate": 1.2211988366932262e-05, + "loss": 1.0614, + "step": 4263 + }, + { + "epoch": 0.44625850340136053, + "grad_norm": 4.16696149043726, + "learning_rate": 1.2208682487420149e-05, + "loss": 0.9907, + "step": 4264 + }, + { + "epoch": 0.44636316064887493, + "grad_norm": 4.495255491410338, + "learning_rate": 1.2205376354125943e-05, + "loss": 1.1127, + "step": 4265 + }, + { + "epoch": 0.44646781789638934, + "grad_norm": 3.289846433479056, + "learning_rate": 1.2202069967429524e-05, + "loss": 1.0025, + "step": 4266 + }, + { + "epoch": 0.4465724751439037, + "grad_norm": 2.922002497701106, + "learning_rate": 1.2198763327710801e-05, + "loss": 1.0079, + "step": 4267 + }, + { + "epoch": 0.4466771323914181, + "grad_norm": 2.33163105345098, + "learning_rate": 1.2195456435349714e-05, + "loss": 1.0025, + "step": 4268 + }, + { + "epoch": 0.4467817896389325, + "grad_norm": 2.993882810202546, + "learning_rate": 1.219214929072623e-05, + "loss": 1.0342, + "step": 4269 + }, + { + "epoch": 0.4468864468864469, + "grad_norm": 3.138840354515787, + "learning_rate": 1.2188841894220351e-05, + "loss": 1.0363, + "step": 4270 + }, + { + "epoch": 0.44699110413396126, + "grad_norm": 2.8709505565686144, + "learning_rate": 1.21855342462121e-05, + "loss": 0.9547, + "step": 4271 + }, + { + "epoch": 0.44709576138147566, + "grad_norm": 3.5060252310763858, + "learning_rate": 1.2182226347081536e-05, + "loss": 1.0352, + "step": 4272 + }, + { + "epoch": 0.44720041862899007, + "grad_norm": 3.067631908770296, + "learning_rate": 1.2178918197208737e-05, + "loss": 0.936, + "step": 4273 + }, + { + "epoch": 0.4473050758765044, + "grad_norm": 3.033451592129497, + "learning_rate": 1.2175609796973822e-05, + "loss": 0.9364, + "step": 4274 + }, + { + "epoch": 0.4474097331240188, + "grad_norm": 2.0393393340393087, + "learning_rate": 1.2172301146756933e-05, + "loss": 0.9712, + "step": 4275 + }, + { + "epoch": 0.44751439037153323, + "grad_norm": 38.81812474011344, + "learning_rate": 1.2168992246938235e-05, + "loss": 2.2501, + "step": 4276 + }, + { + "epoch": 0.44761904761904764, + "grad_norm": 2.253226221283191, + "learning_rate": 1.2165683097897931e-05, + "loss": 1.1345, + "step": 4277 + }, + { + "epoch": 0.447723704866562, + "grad_norm": 2.3205323315504813, + "learning_rate": 1.2162373700016253e-05, + "loss": 1.0651, + "step": 4278 + }, + { + "epoch": 0.4478283621140764, + "grad_norm": 4.7092329513884295, + "learning_rate": 1.2159064053673449e-05, + "loss": 0.9147, + "step": 4279 + }, + { + "epoch": 0.4479330193615908, + "grad_norm": 29.970922275864194, + "learning_rate": 1.2155754159249811e-05, + "loss": 1.2273, + "step": 4280 + }, + { + "epoch": 0.4480376766091052, + "grad_norm": 2.187035261760947, + "learning_rate": 1.2152444017125647e-05, + "loss": 0.9912, + "step": 4281 + }, + { + "epoch": 0.44814233385661956, + "grad_norm": 3.2809808282344446, + "learning_rate": 1.2149133627681303e-05, + "loss": 1.1204, + "step": 4282 + }, + { + "epoch": 0.44824699110413396, + "grad_norm": 2.045684946775204, + "learning_rate": 1.2145822991297148e-05, + "loss": 1.0637, + "step": 4283 + }, + { + "epoch": 0.44835164835164837, + "grad_norm": 2.000665560599059, + "learning_rate": 1.2142512108353583e-05, + "loss": 1.0627, + "step": 4284 + }, + { + "epoch": 0.4484563055991627, + "grad_norm": 1.879007116893966, + "learning_rate": 1.2139200979231033e-05, + "loss": 0.9889, + "step": 4285 + }, + { + "epoch": 0.4485609628466771, + "grad_norm": 2.191861241077447, + "learning_rate": 1.2135889604309952e-05, + "loss": 0.9967, + "step": 4286 + }, + { + "epoch": 0.44866562009419153, + "grad_norm": 2.090962524194402, + "learning_rate": 1.2132577983970828e-05, + "loss": 0.9794, + "step": 4287 + }, + { + "epoch": 0.44877027734170594, + "grad_norm": 1.88873946046564, + "learning_rate": 1.212926611859417e-05, + "loss": 0.9027, + "step": 4288 + }, + { + "epoch": 0.4488749345892203, + "grad_norm": 2.2151145974628763, + "learning_rate": 1.212595400856052e-05, + "loss": 0.9032, + "step": 4289 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 2.130497424667586, + "learning_rate": 1.212264165425044e-05, + "loss": 0.9292, + "step": 4290 + }, + { + "epoch": 0.4490842490842491, + "grad_norm": 2.0469637222247554, + "learning_rate": 1.2119329056044533e-05, + "loss": 1.0018, + "step": 4291 + }, + { + "epoch": 0.44918890633176345, + "grad_norm": 2.5073666390831466, + "learning_rate": 1.2116016214323423e-05, + "loss": 1.0831, + "step": 4292 + }, + { + "epoch": 0.44929356357927785, + "grad_norm": 2.1859064919513522, + "learning_rate": 1.2112703129467758e-05, + "loss": 1.0436, + "step": 4293 + }, + { + "epoch": 0.44939822082679226, + "grad_norm": 2.0934320510193682, + "learning_rate": 1.2109389801858225e-05, + "loss": 0.9668, + "step": 4294 + }, + { + "epoch": 0.44950287807430667, + "grad_norm": 2.1756826044133004, + "learning_rate": 1.2106076231875524e-05, + "loss": 0.9583, + "step": 4295 + }, + { + "epoch": 0.449607535321821, + "grad_norm": 2.353709737315531, + "learning_rate": 1.2102762419900391e-05, + "loss": 1.0629, + "step": 4296 + }, + { + "epoch": 0.4497121925693354, + "grad_norm": 1.952613682916308, + "learning_rate": 1.20994483663136e-05, + "loss": 0.9243, + "step": 4297 + }, + { + "epoch": 0.44981684981684983, + "grad_norm": 2.2467039184701445, + "learning_rate": 1.209613407149593e-05, + "loss": 1.0368, + "step": 4298 + }, + { + "epoch": 0.44992150706436423, + "grad_norm": 2.0181287771348284, + "learning_rate": 1.2092819535828208e-05, + "loss": 0.8519, + "step": 4299 + }, + { + "epoch": 0.4500261643118786, + "grad_norm": 1.9314596098382928, + "learning_rate": 1.208950475969128e-05, + "loss": 0.8568, + "step": 4300 + }, + { + "epoch": 0.450130821559393, + "grad_norm": 1.832701400236754, + "learning_rate": 1.208618974346602e-05, + "loss": 0.927, + "step": 4301 + }, + { + "epoch": 0.4502354788069074, + "grad_norm": 1.9459296613855224, + "learning_rate": 1.2082874487533324e-05, + "loss": 1.0155, + "step": 4302 + }, + { + "epoch": 0.45034013605442175, + "grad_norm": 2.3792299560839667, + "learning_rate": 1.2079558992274131e-05, + "loss": 1.0069, + "step": 4303 + }, + { + "epoch": 0.45044479330193615, + "grad_norm": 2.170868389797402, + "learning_rate": 1.2076243258069396e-05, + "loss": 1.0151, + "step": 4304 + }, + { + "epoch": 0.45054945054945056, + "grad_norm": 2.2013372328892458, + "learning_rate": 1.2072927285300099e-05, + "loss": 0.9911, + "step": 4305 + }, + { + "epoch": 0.45065410779696496, + "grad_norm": 2.2705803608142863, + "learning_rate": 1.2069611074347255e-05, + "loss": 1.0045, + "step": 4306 + }, + { + "epoch": 0.4507587650444793, + "grad_norm": 1.9277985272803437, + "learning_rate": 1.2066294625591903e-05, + "loss": 0.8673, + "step": 4307 + }, + { + "epoch": 0.4508634222919937, + "grad_norm": 2.0440176202060156, + "learning_rate": 1.206297793941511e-05, + "loss": 1.0108, + "step": 4308 + }, + { + "epoch": 0.4509680795395081, + "grad_norm": 2.5462646813427097, + "learning_rate": 1.205966101619797e-05, + "loss": 0.859, + "step": 4309 + }, + { + "epoch": 0.4510727367870225, + "grad_norm": 2.248355643793843, + "learning_rate": 1.2056343856321605e-05, + "loss": 1.1021, + "step": 4310 + }, + { + "epoch": 0.4511773940345369, + "grad_norm": 4.908250414144032, + "learning_rate": 1.2053026460167158e-05, + "loss": 1.0568, + "step": 4311 + }, + { + "epoch": 0.4512820512820513, + "grad_norm": 2.248952995961383, + "learning_rate": 1.2049708828115811e-05, + "loss": 0.9196, + "step": 4312 + }, + { + "epoch": 0.4513867085295657, + "grad_norm": 1.9290283733393867, + "learning_rate": 1.2046390960548765e-05, + "loss": 1.0563, + "step": 4313 + }, + { + "epoch": 0.45149136577708004, + "grad_norm": 2.0660674453272336, + "learning_rate": 1.204307285784725e-05, + "loss": 0.9519, + "step": 4314 + }, + { + "epoch": 0.45159602302459445, + "grad_norm": 1.996879072685833, + "learning_rate": 1.2039754520392522e-05, + "loss": 1.0722, + "step": 4315 + }, + { + "epoch": 0.45170068027210886, + "grad_norm": 2.5022938436613624, + "learning_rate": 1.2036435948565863e-05, + "loss": 1.0778, + "step": 4316 + }, + { + "epoch": 0.4518053375196232, + "grad_norm": 2.2109024944276885, + "learning_rate": 1.2033117142748585e-05, + "loss": 1.0058, + "step": 4317 + }, + { + "epoch": 0.4519099947671376, + "grad_norm": 2.2693762543652674, + "learning_rate": 1.2029798103322028e-05, + "loss": 0.9833, + "step": 4318 + }, + { + "epoch": 0.452014652014652, + "grad_norm": 2.116453756308576, + "learning_rate": 1.2026478830667551e-05, + "loss": 0.9363, + "step": 4319 + }, + { + "epoch": 0.4521193092621664, + "grad_norm": 3.342959773212483, + "learning_rate": 1.2023159325166551e-05, + "loss": 0.8688, + "step": 4320 + }, + { + "epoch": 0.4522239665096808, + "grad_norm": 2.3163640304872812, + "learning_rate": 1.2019839587200442e-05, + "loss": 0.8287, + "step": 4321 + }, + { + "epoch": 0.4523286237571952, + "grad_norm": 1.9681851990856516, + "learning_rate": 1.2016519617150668e-05, + "loss": 0.9896, + "step": 4322 + }, + { + "epoch": 0.4524332810047096, + "grad_norm": 2.143371304104883, + "learning_rate": 1.2013199415398706e-05, + "loss": 0.8437, + "step": 4323 + }, + { + "epoch": 0.452537938252224, + "grad_norm": 2.4121771940664587, + "learning_rate": 1.2009878982326047e-05, + "loss": 0.898, + "step": 4324 + }, + { + "epoch": 0.45264259549973834, + "grad_norm": 2.2077705572979847, + "learning_rate": 1.2006558318314219e-05, + "loss": 0.981, + "step": 4325 + }, + { + "epoch": 0.45274725274725275, + "grad_norm": 1.8137959654643108, + "learning_rate": 1.2003237423744773e-05, + "loss": 0.9494, + "step": 4326 + }, + { + "epoch": 0.45285190999476715, + "grad_norm": 2.043816728834952, + "learning_rate": 1.1999916298999285e-05, + "loss": 1.0507, + "step": 4327 + }, + { + "epoch": 0.4529565672422815, + "grad_norm": 1.8399500676215097, + "learning_rate": 1.1996594944459363e-05, + "loss": 0.9921, + "step": 4328 + }, + { + "epoch": 0.4530612244897959, + "grad_norm": 1.8888260838202213, + "learning_rate": 1.199327336050663e-05, + "loss": 1.0088, + "step": 4329 + }, + { + "epoch": 0.4531658817373103, + "grad_norm": 2.2320849845692474, + "learning_rate": 1.198995154752275e-05, + "loss": 1.0454, + "step": 4330 + }, + { + "epoch": 0.4532705389848247, + "grad_norm": 2.315466026238162, + "learning_rate": 1.1986629505889402e-05, + "loss": 0.8162, + "step": 4331 + }, + { + "epoch": 0.4533751962323391, + "grad_norm": 2.0448363790200745, + "learning_rate": 1.1983307235988297e-05, + "loss": 0.92, + "step": 4332 + }, + { + "epoch": 0.4534798534798535, + "grad_norm": 1.9783944834680685, + "learning_rate": 1.1979984738201172e-05, + "loss": 0.9508, + "step": 4333 + }, + { + "epoch": 0.4535845107273679, + "grad_norm": 2.1566268663323167, + "learning_rate": 1.1976662012909781e-05, + "loss": 0.9213, + "step": 4334 + }, + { + "epoch": 0.45368916797488223, + "grad_norm": 2.088854843791031, + "learning_rate": 1.1973339060495923e-05, + "loss": 0.9617, + "step": 4335 + }, + { + "epoch": 0.45379382522239664, + "grad_norm": 2.0416167816146977, + "learning_rate": 1.1970015881341405e-05, + "loss": 0.9292, + "step": 4336 + }, + { + "epoch": 0.45389848246991105, + "grad_norm": 2.0230522167032206, + "learning_rate": 1.196669247582807e-05, + "loss": 0.8476, + "step": 4337 + }, + { + "epoch": 0.45400313971742545, + "grad_norm": 2.1730001331296704, + "learning_rate": 1.1963368844337779e-05, + "loss": 0.9025, + "step": 4338 + }, + { + "epoch": 0.4541077969649398, + "grad_norm": 1.8508787301092124, + "learning_rate": 1.196004498725243e-05, + "loss": 0.8897, + "step": 4339 + }, + { + "epoch": 0.4542124542124542, + "grad_norm": 2.2654361716293336, + "learning_rate": 1.195672090495394e-05, + "loss": 1.0252, + "step": 4340 + }, + { + "epoch": 0.4543171114599686, + "grad_norm": 1.8928691469715988, + "learning_rate": 1.1953396597824249e-05, + "loss": 0.9031, + "step": 4341 + }, + { + "epoch": 0.454421768707483, + "grad_norm": 2.3645611004316702, + "learning_rate": 1.1950072066245328e-05, + "loss": 1.0823, + "step": 4342 + }, + { + "epoch": 0.45452642595499737, + "grad_norm": 1.7075263528138125, + "learning_rate": 1.1946747310599176e-05, + "loss": 0.9554, + "step": 4343 + }, + { + "epoch": 0.4546310832025118, + "grad_norm": 2.3091455726092884, + "learning_rate": 1.1943422331267811e-05, + "loss": 1.0067, + "step": 4344 + }, + { + "epoch": 0.4547357404500262, + "grad_norm": 2.098765765313036, + "learning_rate": 1.194009712863328e-05, + "loss": 0.9867, + "step": 4345 + }, + { + "epoch": 0.45484039769754053, + "grad_norm": 1.7727533702277667, + "learning_rate": 1.1936771703077656e-05, + "loss": 0.9671, + "step": 4346 + }, + { + "epoch": 0.45494505494505494, + "grad_norm": 2.074696485047073, + "learning_rate": 1.1933446054983035e-05, + "loss": 0.9257, + "step": 4347 + }, + { + "epoch": 0.45504971219256934, + "grad_norm": 2.039328732869746, + "learning_rate": 1.1930120184731544e-05, + "loss": 0.9301, + "step": 4348 + }, + { + "epoch": 0.45515436944008375, + "grad_norm": 2.1160277945116586, + "learning_rate": 1.1926794092705335e-05, + "loss": 0.9874, + "step": 4349 + }, + { + "epoch": 0.4552590266875981, + "grad_norm": 1.8772607321330006, + "learning_rate": 1.1923467779286573e-05, + "loss": 0.941, + "step": 4350 + }, + { + "epoch": 0.4553636839351125, + "grad_norm": 1.9022941147099999, + "learning_rate": 1.1920141244857466e-05, + "loss": 0.908, + "step": 4351 + }, + { + "epoch": 0.4554683411826269, + "grad_norm": 2.3523711222096404, + "learning_rate": 1.191681448980024e-05, + "loss": 0.8785, + "step": 4352 + }, + { + "epoch": 0.45557299843014126, + "grad_norm": 2.002200825932997, + "learning_rate": 1.191348751449714e-05, + "loss": 1.0048, + "step": 4353 + }, + { + "epoch": 0.45567765567765567, + "grad_norm": 2.0270694451502282, + "learning_rate": 1.1910160319330446e-05, + "loss": 1.0233, + "step": 4354 + }, + { + "epoch": 0.4557823129251701, + "grad_norm": 2.0541524762488814, + "learning_rate": 1.1906832904682459e-05, + "loss": 0.7993, + "step": 4355 + }, + { + "epoch": 0.4558869701726845, + "grad_norm": 1.8638621351295752, + "learning_rate": 1.1903505270935507e-05, + "loss": 0.8905, + "step": 4356 + }, + { + "epoch": 0.45599162742019883, + "grad_norm": 1.9736021287043854, + "learning_rate": 1.190017741847194e-05, + "loss": 1.0373, + "step": 4357 + }, + { + "epoch": 0.45609628466771324, + "grad_norm": 1.8797795224789213, + "learning_rate": 1.1896849347674135e-05, + "loss": 0.9821, + "step": 4358 + }, + { + "epoch": 0.45620094191522764, + "grad_norm": 1.9368205053092564, + "learning_rate": 1.1893521058924497e-05, + "loss": 1.0668, + "step": 4359 + }, + { + "epoch": 0.456305599162742, + "grad_norm": 2.132381992165538, + "learning_rate": 1.1890192552605447e-05, + "loss": 0.9218, + "step": 4360 + }, + { + "epoch": 0.4564102564102564, + "grad_norm": 2.0456266510770056, + "learning_rate": 1.1886863829099442e-05, + "loss": 0.8965, + "step": 4361 + }, + { + "epoch": 0.4565149136577708, + "grad_norm": 2.2461716024926823, + "learning_rate": 1.1883534888788961e-05, + "loss": 1.072, + "step": 4362 + }, + { + "epoch": 0.4566195709052852, + "grad_norm": 1.8114815853829138, + "learning_rate": 1.18802057320565e-05, + "loss": 0.9784, + "step": 4363 + }, + { + "epoch": 0.45672422815279956, + "grad_norm": 2.074839338100702, + "learning_rate": 1.1876876359284589e-05, + "loss": 0.9458, + "step": 4364 + }, + { + "epoch": 0.45682888540031397, + "grad_norm": 3.272621021673472, + "learning_rate": 1.187354677085578e-05, + "loss": 1.0344, + "step": 4365 + }, + { + "epoch": 0.4569335426478284, + "grad_norm": 1.8423824960811983, + "learning_rate": 1.1870216967152649e-05, + "loss": 0.9991, + "step": 4366 + }, + { + "epoch": 0.4570381998953428, + "grad_norm": 2.0436626042627735, + "learning_rate": 1.1866886948557796e-05, + "loss": 0.9367, + "step": 4367 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 2.243427552622139, + "learning_rate": 1.1863556715453847e-05, + "loss": 1.081, + "step": 4368 + }, + { + "epoch": 0.45724751439037153, + "grad_norm": 2.1396337485328965, + "learning_rate": 1.1860226268223459e-05, + "loss": 1.0269, + "step": 4369 + }, + { + "epoch": 0.45735217163788594, + "grad_norm": 2.0912618560301715, + "learning_rate": 1.1856895607249297e-05, + "loss": 0.9635, + "step": 4370 + }, + { + "epoch": 0.4574568288854003, + "grad_norm": 1.8134320104293313, + "learning_rate": 1.1853564732914066e-05, + "loss": 0.8776, + "step": 4371 + }, + { + "epoch": 0.4575614861329147, + "grad_norm": 1.873748145270781, + "learning_rate": 1.185023364560049e-05, + "loss": 0.8853, + "step": 4372 + }, + { + "epoch": 0.4576661433804291, + "grad_norm": 1.8136741058042303, + "learning_rate": 1.1846902345691317e-05, + "loss": 0.98, + "step": 4373 + }, + { + "epoch": 0.4577708006279435, + "grad_norm": 1.7292941149458805, + "learning_rate": 1.1843570833569324e-05, + "loss": 0.9699, + "step": 4374 + }, + { + "epoch": 0.45787545787545786, + "grad_norm": 2.6314289006744587, + "learning_rate": 1.1840239109617302e-05, + "loss": 1.1551, + "step": 4375 + }, + { + "epoch": 0.45798011512297226, + "grad_norm": 22.007184596120965, + "learning_rate": 1.1836907174218078e-05, + "loss": 1.8465, + "step": 4376 + }, + { + "epoch": 0.45808477237048667, + "grad_norm": 2.295041122834174, + "learning_rate": 1.183357502775449e-05, + "loss": 1.0055, + "step": 4377 + }, + { + "epoch": 0.458189429618001, + "grad_norm": 31.559871912501077, + "learning_rate": 1.1830242670609417e-05, + "loss": 1.7263, + "step": 4378 + }, + { + "epoch": 0.4582940868655154, + "grad_norm": 2.0871333437628055, + "learning_rate": 1.1826910103165753e-05, + "loss": 0.8551, + "step": 4379 + }, + { + "epoch": 0.45839874411302983, + "grad_norm": 2.726282817146809, + "learning_rate": 1.1823577325806414e-05, + "loss": 1.0138, + "step": 4380 + }, + { + "epoch": 0.45850340136054424, + "grad_norm": 2.1086177426453374, + "learning_rate": 1.1820244338914341e-05, + "loss": 0.9771, + "step": 4381 + }, + { + "epoch": 0.4586080586080586, + "grad_norm": 2.3764884198932, + "learning_rate": 1.1816911142872507e-05, + "loss": 1.0197, + "step": 4382 + }, + { + "epoch": 0.458712715855573, + "grad_norm": 2.045556899175085, + "learning_rate": 1.1813577738063897e-05, + "loss": 0.9103, + "step": 4383 + }, + { + "epoch": 0.4588173731030874, + "grad_norm": 2.5972265367678564, + "learning_rate": 1.1810244124871528e-05, + "loss": 1.0018, + "step": 4384 + }, + { + "epoch": 0.4589220303506018, + "grad_norm": 2.551709456154248, + "learning_rate": 1.1806910303678443e-05, + "loss": 1.0322, + "step": 4385 + }, + { + "epoch": 0.45902668759811616, + "grad_norm": 2.1748517522412407, + "learning_rate": 1.1803576274867698e-05, + "loss": 0.9927, + "step": 4386 + }, + { + "epoch": 0.45913134484563056, + "grad_norm": 3.812812110949521, + "learning_rate": 1.1800242038822382e-05, + "loss": 0.9246, + "step": 4387 + }, + { + "epoch": 0.45923600209314497, + "grad_norm": 2.376394222425811, + "learning_rate": 1.1796907595925608e-05, + "loss": 0.9445, + "step": 4388 + }, + { + "epoch": 0.4593406593406593, + "grad_norm": 2.5161863830002735, + "learning_rate": 1.179357294656051e-05, + "loss": 1.0452, + "step": 4389 + }, + { + "epoch": 0.4594453165881737, + "grad_norm": 2.6069106789560488, + "learning_rate": 1.179023809111024e-05, + "loss": 1.1245, + "step": 4390 + }, + { + "epoch": 0.45954997383568813, + "grad_norm": 2.4849062318250397, + "learning_rate": 1.1786903029957987e-05, + "loss": 1.0237, + "step": 4391 + }, + { + "epoch": 0.45965463108320254, + "grad_norm": 2.1071131463822357, + "learning_rate": 1.1783567763486954e-05, + "loss": 0.9831, + "step": 4392 + }, + { + "epoch": 0.4597592883307169, + "grad_norm": 2.7912058793141115, + "learning_rate": 1.1780232292080371e-05, + "loss": 0.9761, + "step": 4393 + }, + { + "epoch": 0.4598639455782313, + "grad_norm": 2.033218484100733, + "learning_rate": 1.1776896616121483e-05, + "loss": 0.8881, + "step": 4394 + }, + { + "epoch": 0.4599686028257457, + "grad_norm": 2.044503825084946, + "learning_rate": 1.1773560735993576e-05, + "loss": 1.0179, + "step": 4395 + }, + { + "epoch": 0.46007326007326005, + "grad_norm": 2.2543377501620308, + "learning_rate": 1.1770224652079943e-05, + "loss": 0.823, + "step": 4396 + }, + { + "epoch": 0.46017791732077445, + "grad_norm": 2.3807439687952217, + "learning_rate": 1.1766888364763912e-05, + "loss": 0.8314, + "step": 4397 + }, + { + "epoch": 0.46028257456828886, + "grad_norm": 1.970046805954839, + "learning_rate": 1.1763551874428823e-05, + "loss": 0.879, + "step": 4398 + }, + { + "epoch": 0.46038723181580327, + "grad_norm": 2.1330010409516116, + "learning_rate": 1.1760215181458048e-05, + "loss": 1.1079, + "step": 4399 + }, + { + "epoch": 0.4604918890633176, + "grad_norm": 1.9239002886217489, + "learning_rate": 1.1756878286234983e-05, + "loss": 0.9412, + "step": 4400 + }, + { + "epoch": 0.460596546310832, + "grad_norm": 2.1793334815489693, + "learning_rate": 1.1753541189143039e-05, + "loss": 0.8828, + "step": 4401 + }, + { + "epoch": 0.46070120355834643, + "grad_norm": 2.254420506227355, + "learning_rate": 1.1750203890565661e-05, + "loss": 0.8499, + "step": 4402 + }, + { + "epoch": 0.4608058608058608, + "grad_norm": 2.1754811115427843, + "learning_rate": 1.1746866390886304e-05, + "loss": 0.924, + "step": 4403 + }, + { + "epoch": 0.4609105180533752, + "grad_norm": 2.396703022879703, + "learning_rate": 1.1743528690488457e-05, + "loss": 0.936, + "step": 4404 + }, + { + "epoch": 0.4610151753008896, + "grad_norm": 1.8475247870093567, + "learning_rate": 1.1740190789755635e-05, + "loss": 0.9359, + "step": 4405 + }, + { + "epoch": 0.461119832548404, + "grad_norm": 1.9948668165368353, + "learning_rate": 1.1736852689071357e-05, + "loss": 0.9159, + "step": 4406 + }, + { + "epoch": 0.46122448979591835, + "grad_norm": 2.3311014731095474, + "learning_rate": 1.1733514388819186e-05, + "loss": 1.0174, + "step": 4407 + }, + { + "epoch": 0.46132914704343275, + "grad_norm": 2.657571922556327, + "learning_rate": 1.1730175889382697e-05, + "loss": 0.9448, + "step": 4408 + }, + { + "epoch": 0.46143380429094716, + "grad_norm": 1.9668833389759839, + "learning_rate": 1.1726837191145492e-05, + "loss": 0.9506, + "step": 4409 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 161.64348480668815, + "learning_rate": 1.1723498294491196e-05, + "loss": 3.7708, + "step": 4410 + }, + { + "epoch": 0.4616431187859759, + "grad_norm": 2.1254582340869836, + "learning_rate": 1.1720159199803447e-05, + "loss": 1.0292, + "step": 4411 + }, + { + "epoch": 0.4617477760334903, + "grad_norm": 3.149115475190535, + "learning_rate": 1.1716819907465922e-05, + "loss": 1.0485, + "step": 4412 + }, + { + "epoch": 0.4618524332810047, + "grad_norm": 2.7285678675780627, + "learning_rate": 1.1713480417862306e-05, + "loss": 1.0172, + "step": 4413 + }, + { + "epoch": 0.4619570905285191, + "grad_norm": 2.2641817986994957, + "learning_rate": 1.171014073137632e-05, + "loss": 0.829, + "step": 4414 + }, + { + "epoch": 0.4620617477760335, + "grad_norm": 21.38524678867409, + "learning_rate": 1.1706800848391694e-05, + "loss": 2.726, + "step": 4415 + }, + { + "epoch": 0.4621664050235479, + "grad_norm": 2.442948735347796, + "learning_rate": 1.1703460769292192e-05, + "loss": 0.9794, + "step": 4416 + }, + { + "epoch": 0.4622710622710623, + "grad_norm": 2.762838250862807, + "learning_rate": 1.1700120494461595e-05, + "loss": 1.0578, + "step": 4417 + }, + { + "epoch": 0.46237571951857664, + "grad_norm": 3.0564755310218885, + "learning_rate": 1.1696780024283708e-05, + "loss": 0.912, + "step": 4418 + }, + { + "epoch": 0.46248037676609105, + "grad_norm": 4.2059114992513935, + "learning_rate": 1.1693439359142356e-05, + "loss": 1.211, + "step": 4419 + }, + { + "epoch": 0.46258503401360546, + "grad_norm": 3.764016376227401, + "learning_rate": 1.1690098499421384e-05, + "loss": 1.0091, + "step": 4420 + }, + { + "epoch": 0.4626896912611198, + "grad_norm": 3.1796219459492705, + "learning_rate": 1.1686757445504673e-05, + "loss": 1.0657, + "step": 4421 + }, + { + "epoch": 0.4627943485086342, + "grad_norm": 2.579766465159248, + "learning_rate": 1.1683416197776113e-05, + "loss": 0.9986, + "step": 4422 + }, + { + "epoch": 0.4628990057561486, + "grad_norm": 2.122008428264213, + "learning_rate": 1.1680074756619614e-05, + "loss": 1.0272, + "step": 4423 + }, + { + "epoch": 0.463003663003663, + "grad_norm": 2.26997985317698, + "learning_rate": 1.1676733122419124e-05, + "loss": 0.9752, + "step": 4424 + }, + { + "epoch": 0.4631083202511774, + "grad_norm": 2.2969054913130313, + "learning_rate": 1.1673391295558596e-05, + "loss": 0.9876, + "step": 4425 + }, + { + "epoch": 0.4632129774986918, + "grad_norm": 2.7300799446270467, + "learning_rate": 1.1670049276422017e-05, + "loss": 1.006, + "step": 4426 + }, + { + "epoch": 0.4633176347462062, + "grad_norm": 2.843913255345786, + "learning_rate": 1.166670706539339e-05, + "loss": 1.0773, + "step": 4427 + }, + { + "epoch": 0.4634222919937206, + "grad_norm": 2.661110599574868, + "learning_rate": 1.1663364662856747e-05, + "loss": 0.9482, + "step": 4428 + }, + { + "epoch": 0.46352694924123494, + "grad_norm": 2.2380660123868115, + "learning_rate": 1.1660022069196126e-05, + "loss": 0.8998, + "step": 4429 + }, + { + "epoch": 0.46363160648874935, + "grad_norm": 2.178013787617105, + "learning_rate": 1.1656679284795607e-05, + "loss": 0.9949, + "step": 4430 + }, + { + "epoch": 0.46373626373626375, + "grad_norm": 2.1088259317784295, + "learning_rate": 1.165333631003928e-05, + "loss": 0.9476, + "step": 4431 + }, + { + "epoch": 0.4638409209837781, + "grad_norm": 2.185800456163915, + "learning_rate": 1.1649993145311258e-05, + "loss": 1.0237, + "step": 4432 + }, + { + "epoch": 0.4639455782312925, + "grad_norm": 2.148724627286947, + "learning_rate": 1.1646649790995679e-05, + "loss": 0.9447, + "step": 4433 + }, + { + "epoch": 0.4640502354788069, + "grad_norm": 2.4213799049050566, + "learning_rate": 1.1643306247476702e-05, + "loss": 0.9744, + "step": 4434 + }, + { + "epoch": 0.4641548927263213, + "grad_norm": 2.088289187776693, + "learning_rate": 1.1639962515138506e-05, + "loss": 0.9488, + "step": 4435 + }, + { + "epoch": 0.4642595499738357, + "grad_norm": 2.1310001246837653, + "learning_rate": 1.1636618594365291e-05, + "loss": 0.9596, + "step": 4436 + }, + { + "epoch": 0.4643642072213501, + "grad_norm": 2.4081476306831053, + "learning_rate": 1.1633274485541284e-05, + "loss": 0.9266, + "step": 4437 + }, + { + "epoch": 0.4644688644688645, + "grad_norm": 2.247732950837275, + "learning_rate": 1.1629930189050724e-05, + "loss": 1.141, + "step": 4438 + }, + { + "epoch": 0.46457352171637883, + "grad_norm": 2.7553884410693534, + "learning_rate": 1.1626585705277886e-05, + "loss": 0.977, + "step": 4439 + }, + { + "epoch": 0.46467817896389324, + "grad_norm": 2.093553580818768, + "learning_rate": 1.1623241034607049e-05, + "loss": 0.92, + "step": 4440 + }, + { + "epoch": 0.46478283621140765, + "grad_norm": 2.102413811719979, + "learning_rate": 1.161989617742253e-05, + "loss": 0.9748, + "step": 4441 + }, + { + "epoch": 0.46488749345892205, + "grad_norm": 2.1474470371663523, + "learning_rate": 1.1616551134108649e-05, + "loss": 0.8978, + "step": 4442 + }, + { + "epoch": 0.4649921507064364, + "grad_norm": 1.8755006879257996, + "learning_rate": 1.1613205905049774e-05, + "loss": 1.0037, + "step": 4443 + }, + { + "epoch": 0.4650968079539508, + "grad_norm": 12.367373504928828, + "learning_rate": 1.1609860490630265e-05, + "loss": 1.7202, + "step": 4444 + }, + { + "epoch": 0.4652014652014652, + "grad_norm": 2.434919186952808, + "learning_rate": 1.1606514891234526e-05, + "loss": 1.0571, + "step": 4445 + }, + { + "epoch": 0.46530612244897956, + "grad_norm": 8.610048201919636, + "learning_rate": 1.1603169107246966e-05, + "loss": 1.6151, + "step": 4446 + }, + { + "epoch": 0.46541077969649397, + "grad_norm": 1.9525960112801914, + "learning_rate": 1.1599823139052027e-05, + "loss": 0.875, + "step": 4447 + }, + { + "epoch": 0.4655154369440084, + "grad_norm": 2.2632342098228775, + "learning_rate": 1.1596476987034167e-05, + "loss": 1.0215, + "step": 4448 + }, + { + "epoch": 0.4656200941915228, + "grad_norm": 2.2366838939003335, + "learning_rate": 1.1593130651577861e-05, + "loss": 1.0237, + "step": 4449 + }, + { + "epoch": 0.46572475143903713, + "grad_norm": 38.16329198450561, + "learning_rate": 1.1589784133067619e-05, + "loss": 1.9354, + "step": 4450 + }, + { + "epoch": 0.46582940868655154, + "grad_norm": 2.3716405493770485, + "learning_rate": 1.1586437431887954e-05, + "loss": 0.8736, + "step": 4451 + }, + { + "epoch": 0.46593406593406594, + "grad_norm": 2.125681534199429, + "learning_rate": 1.158309054842341e-05, + "loss": 1.0289, + "step": 4452 + }, + { + "epoch": 0.46603872318158035, + "grad_norm": 2.3919411314017536, + "learning_rate": 1.1579743483058556e-05, + "loss": 0.9878, + "step": 4453 + }, + { + "epoch": 0.4661433804290947, + "grad_norm": 2.39638097963942, + "learning_rate": 1.1576396236177974e-05, + "loss": 1.0409, + "step": 4454 + }, + { + "epoch": 0.4662480376766091, + "grad_norm": 2.5103806573334215, + "learning_rate": 1.1573048808166265e-05, + "loss": 0.9669, + "step": 4455 + }, + { + "epoch": 0.4663526949241235, + "grad_norm": 19.54312815228122, + "learning_rate": 1.1569701199408063e-05, + "loss": 1.6532, + "step": 4456 + }, + { + "epoch": 0.46645735217163786, + "grad_norm": 2.118501244763855, + "learning_rate": 1.1566353410288009e-05, + "loss": 1.0849, + "step": 4457 + }, + { + "epoch": 0.46656200941915227, + "grad_norm": 1.7858040697407882, + "learning_rate": 1.1563005441190776e-05, + "loss": 0.8408, + "step": 4458 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.2969793541608166, + "learning_rate": 1.1559657292501044e-05, + "loss": 0.9529, + "step": 4459 + }, + { + "epoch": 0.4667713239141811, + "grad_norm": 2.5581364794755355, + "learning_rate": 1.1556308964603531e-05, + "loss": 0.8612, + "step": 4460 + }, + { + "epoch": 0.46687598116169543, + "grad_norm": 2.300009884961941, + "learning_rate": 1.155296045788296e-05, + "loss": 1.1076, + "step": 4461 + }, + { + "epoch": 0.46698063840920984, + "grad_norm": 1.9569146697798587, + "learning_rate": 1.1549611772724087e-05, + "loss": 0.9219, + "step": 4462 + }, + { + "epoch": 0.46708529565672424, + "grad_norm": 2.0103709930682876, + "learning_rate": 1.1546262909511679e-05, + "loss": 1.0447, + "step": 4463 + }, + { + "epoch": 0.4671899529042386, + "grad_norm": 2.2197999718443238, + "learning_rate": 1.1542913868630527e-05, + "loss": 1.0264, + "step": 4464 + }, + { + "epoch": 0.467294610151753, + "grad_norm": 1.8177368735546657, + "learning_rate": 1.1539564650465447e-05, + "loss": 0.9716, + "step": 4465 + }, + { + "epoch": 0.4673992673992674, + "grad_norm": 2.1059310144937617, + "learning_rate": 1.1536215255401265e-05, + "loss": 1.0372, + "step": 4466 + }, + { + "epoch": 0.4675039246467818, + "grad_norm": 2.197191188748699, + "learning_rate": 1.1532865683822836e-05, + "loss": 1.0703, + "step": 4467 + }, + { + "epoch": 0.46760858189429616, + "grad_norm": 2.6122799139573423, + "learning_rate": 1.1529515936115033e-05, + "loss": 1.0208, + "step": 4468 + }, + { + "epoch": 0.46771323914181057, + "grad_norm": 2.093417244729284, + "learning_rate": 1.1526166012662748e-05, + "loss": 0.8367, + "step": 4469 + }, + { + "epoch": 0.46781789638932497, + "grad_norm": 2.135644048890782, + "learning_rate": 1.1522815913850896e-05, + "loss": 0.905, + "step": 4470 + }, + { + "epoch": 0.4679225536368394, + "grad_norm": 2.045207253531856, + "learning_rate": 1.1519465640064405e-05, + "loss": 0.8331, + "step": 4471 + }, + { + "epoch": 0.46802721088435373, + "grad_norm": 7.896758500820517, + "learning_rate": 1.1516115191688232e-05, + "loss": 1.3637, + "step": 4472 + }, + { + "epoch": 0.46813186813186813, + "grad_norm": 2.143393184203902, + "learning_rate": 1.151276456910735e-05, + "loss": 1.0432, + "step": 4473 + }, + { + "epoch": 0.46823652537938254, + "grad_norm": 2.075941729013746, + "learning_rate": 1.1509413772706757e-05, + "loss": 1.1271, + "step": 4474 + }, + { + "epoch": 0.4683411826268969, + "grad_norm": 1.863301564352949, + "learning_rate": 1.150606280287146e-05, + "loss": 0.9209, + "step": 4475 + }, + { + "epoch": 0.4684458398744113, + "grad_norm": 2.298175682660464, + "learning_rate": 1.1502711659986491e-05, + "loss": 0.9713, + "step": 4476 + }, + { + "epoch": 0.4685504971219257, + "grad_norm": 2.4343036503123994, + "learning_rate": 1.1499360344436908e-05, + "loss": 0.9463, + "step": 4477 + }, + { + "epoch": 0.4686551543694401, + "grad_norm": 2.1765146376199946, + "learning_rate": 1.1496008856607779e-05, + "loss": 0.9651, + "step": 4478 + }, + { + "epoch": 0.46875981161695446, + "grad_norm": 1.8298464336339146, + "learning_rate": 1.1492657196884203e-05, + "loss": 0.9422, + "step": 4479 + }, + { + "epoch": 0.46886446886446886, + "grad_norm": 1.945013493191885, + "learning_rate": 1.1489305365651286e-05, + "loss": 1.0195, + "step": 4480 + }, + { + "epoch": 0.46896912611198327, + "grad_norm": 1.9336766437798703, + "learning_rate": 1.1485953363294167e-05, + "loss": 1.0002, + "step": 4481 + }, + { + "epoch": 0.4690737833594976, + "grad_norm": 1.9030733453182003, + "learning_rate": 1.1482601190197993e-05, + "loss": 0.8946, + "step": 4482 + }, + { + "epoch": 0.469178440607012, + "grad_norm": 4.129348859610016, + "learning_rate": 1.1479248846747934e-05, + "loss": 1.1738, + "step": 4483 + }, + { + "epoch": 0.46928309785452643, + "grad_norm": 2.041585816161266, + "learning_rate": 1.1475896333329188e-05, + "loss": 0.9353, + "step": 4484 + }, + { + "epoch": 0.46938775510204084, + "grad_norm": 1.8593945186994896, + "learning_rate": 1.1472543650326959e-05, + "loss": 0.9923, + "step": 4485 + }, + { + "epoch": 0.4694924123495552, + "grad_norm": 5.031841167564904, + "learning_rate": 1.1469190798126479e-05, + "loss": 1.179, + "step": 4486 + }, + { + "epoch": 0.4695970695970696, + "grad_norm": 2.1889564063975135, + "learning_rate": 1.1465837777113e-05, + "loss": 0.9345, + "step": 4487 + }, + { + "epoch": 0.469701726844584, + "grad_norm": 2.0960505576447868, + "learning_rate": 1.1462484587671788e-05, + "loss": 1.057, + "step": 4488 + }, + { + "epoch": 0.46980638409209835, + "grad_norm": 2.122413193941446, + "learning_rate": 1.1459131230188132e-05, + "loss": 0.9431, + "step": 4489 + }, + { + "epoch": 0.46991104133961276, + "grad_norm": 1.8665692849116569, + "learning_rate": 1.1455777705047337e-05, + "loss": 0.8859, + "step": 4490 + }, + { + "epoch": 0.47001569858712716, + "grad_norm": 2.2256281211232967, + "learning_rate": 1.1452424012634734e-05, + "loss": 0.9491, + "step": 4491 + }, + { + "epoch": 0.47012035583464157, + "grad_norm": 2.3097076298000068, + "learning_rate": 1.1449070153335669e-05, + "loss": 1.0756, + "step": 4492 + }, + { + "epoch": 0.4702250130821559, + "grad_norm": 5.195580495117896, + "learning_rate": 1.1445716127535506e-05, + "loss": 1.1469, + "step": 4493 + }, + { + "epoch": 0.4703296703296703, + "grad_norm": 1.8105322884131707, + "learning_rate": 1.1442361935619628e-05, + "loss": 0.9045, + "step": 4494 + }, + { + "epoch": 0.47043432757718473, + "grad_norm": 2.01942155404332, + "learning_rate": 1.143900757797344e-05, + "loss": 0.9083, + "step": 4495 + }, + { + "epoch": 0.47053898482469914, + "grad_norm": 1.932387279006279, + "learning_rate": 1.1435653054982365e-05, + "loss": 0.9502, + "step": 4496 + }, + { + "epoch": 0.4706436420722135, + "grad_norm": 4.1256458431646355, + "learning_rate": 1.1432298367031843e-05, + "loss": 1.0059, + "step": 4497 + }, + { + "epoch": 0.4707482993197279, + "grad_norm": 1.942103396468479, + "learning_rate": 1.1428943514507334e-05, + "loss": 1.0407, + "step": 4498 + }, + { + "epoch": 0.4708529565672423, + "grad_norm": 2.273840379567588, + "learning_rate": 1.1425588497794319e-05, + "loss": 1.0614, + "step": 4499 + }, + { + "epoch": 0.47095761381475665, + "grad_norm": 2.1200667200778907, + "learning_rate": 1.1422233317278295e-05, + "loss": 0.9771, + "step": 4500 + }, + { + "epoch": 0.47106227106227105, + "grad_norm": 2.083148734600215, + "learning_rate": 1.1418877973344781e-05, + "loss": 0.8671, + "step": 4501 + }, + { + "epoch": 0.47116692830978546, + "grad_norm": 2.2953006328435674, + "learning_rate": 1.1415522466379312e-05, + "loss": 0.9418, + "step": 4502 + }, + { + "epoch": 0.47127158555729987, + "grad_norm": 2.189812095653051, + "learning_rate": 1.1412166796767442e-05, + "loss": 0.8781, + "step": 4503 + }, + { + "epoch": 0.4713762428048142, + "grad_norm": 2.2314434200482407, + "learning_rate": 1.1408810964894743e-05, + "loss": 0.9772, + "step": 4504 + }, + { + "epoch": 0.4714809000523286, + "grad_norm": 2.0090730558505205, + "learning_rate": 1.1405454971146808e-05, + "loss": 0.9608, + "step": 4505 + }, + { + "epoch": 0.47158555729984303, + "grad_norm": 2.2175393347464314, + "learning_rate": 1.140209881590925e-05, + "loss": 1.0648, + "step": 4506 + }, + { + "epoch": 0.4716902145473574, + "grad_norm": 1.8968894676133286, + "learning_rate": 1.1398742499567689e-05, + "loss": 0.9704, + "step": 4507 + }, + { + "epoch": 0.4717948717948718, + "grad_norm": 2.2753935373797916, + "learning_rate": 1.1395386022507786e-05, + "loss": 0.9457, + "step": 4508 + }, + { + "epoch": 0.4718995290423862, + "grad_norm": 2.2158138128200533, + "learning_rate": 1.13920293851152e-05, + "loss": 0.939, + "step": 4509 + }, + { + "epoch": 0.4720041862899006, + "grad_norm": 2.149361055179521, + "learning_rate": 1.1388672587775615e-05, + "loss": 0.8684, + "step": 4510 + }, + { + "epoch": 0.47210884353741495, + "grad_norm": 2.242585827443349, + "learning_rate": 1.1385315630874732e-05, + "loss": 0.9222, + "step": 4511 + }, + { + "epoch": 0.47221350078492935, + "grad_norm": 2.0858170359584878, + "learning_rate": 1.1381958514798277e-05, + "loss": 1.0521, + "step": 4512 + }, + { + "epoch": 0.47231815803244376, + "grad_norm": 2.3578219833534635, + "learning_rate": 1.1378601239931988e-05, + "loss": 0.9778, + "step": 4513 + }, + { + "epoch": 0.47242281527995816, + "grad_norm": 2.318116192208368, + "learning_rate": 1.1375243806661617e-05, + "loss": 0.9584, + "step": 4514 + }, + { + "epoch": 0.4725274725274725, + "grad_norm": 1.8752032576547455, + "learning_rate": 1.1371886215372952e-05, + "loss": 1.0631, + "step": 4515 + }, + { + "epoch": 0.4726321297749869, + "grad_norm": 2.4461026659114067, + "learning_rate": 1.1368528466451774e-05, + "loss": 1.0267, + "step": 4516 + }, + { + "epoch": 0.4727367870225013, + "grad_norm": 2.0213212076244638, + "learning_rate": 1.1365170560283903e-05, + "loss": 0.9101, + "step": 4517 + }, + { + "epoch": 0.4728414442700157, + "grad_norm": 1.8825133489006614, + "learning_rate": 1.1361812497255167e-05, + "loss": 1.0026, + "step": 4518 + }, + { + "epoch": 0.4729461015175301, + "grad_norm": 2.2245743255363286, + "learning_rate": 1.1358454277751415e-05, + "loss": 0.9904, + "step": 4519 + }, + { + "epoch": 0.4730507587650445, + "grad_norm": 1.9671246859720815, + "learning_rate": 1.1355095902158507e-05, + "loss": 0.8788, + "step": 4520 + }, + { + "epoch": 0.4731554160125589, + "grad_norm": 2.045528524105347, + "learning_rate": 1.1351737370862339e-05, + "loss": 1.0552, + "step": 4521 + }, + { + "epoch": 0.47326007326007324, + "grad_norm": 2.076578499573767, + "learning_rate": 1.1348378684248806e-05, + "loss": 0.9716, + "step": 4522 + }, + { + "epoch": 0.47336473050758765, + "grad_norm": 2.1579134355425764, + "learning_rate": 1.134501984270383e-05, + "loss": 1.1023, + "step": 4523 + }, + { + "epoch": 0.47346938775510206, + "grad_norm": 2.340911759330156, + "learning_rate": 1.1341660846613343e-05, + "loss": 0.9794, + "step": 4524 + }, + { + "epoch": 0.4735740450026164, + "grad_norm": 1.9924334227820981, + "learning_rate": 1.133830169636331e-05, + "loss": 1.0848, + "step": 4525 + }, + { + "epoch": 0.4736787022501308, + "grad_norm": 1.962966847625367, + "learning_rate": 1.1334942392339693e-05, + "loss": 1.0938, + "step": 4526 + }, + { + "epoch": 0.4737833594976452, + "grad_norm": 2.178454110615139, + "learning_rate": 1.1331582934928495e-05, + "loss": 1.0561, + "step": 4527 + }, + { + "epoch": 0.4738880167451596, + "grad_norm": 2.368913775696146, + "learning_rate": 1.1328223324515714e-05, + "loss": 0.8537, + "step": 4528 + }, + { + "epoch": 0.473992673992674, + "grad_norm": 1.9280380818904281, + "learning_rate": 1.1324863561487383e-05, + "loss": 0.8553, + "step": 4529 + }, + { + "epoch": 0.4740973312401884, + "grad_norm": 1.962816395052731, + "learning_rate": 1.1321503646229543e-05, + "loss": 0.8091, + "step": 4530 + }, + { + "epoch": 0.4742019884877028, + "grad_norm": 2.068969926585598, + "learning_rate": 1.1318143579128254e-05, + "loss": 0.9076, + "step": 4531 + }, + { + "epoch": 0.47430664573521714, + "grad_norm": 1.9731106713105366, + "learning_rate": 1.1314783360569597e-05, + "loss": 0.9354, + "step": 4532 + }, + { + "epoch": 0.47441130298273154, + "grad_norm": 1.9249243484641931, + "learning_rate": 1.1311422990939664e-05, + "loss": 0.8459, + "step": 4533 + }, + { + "epoch": 0.47451596023024595, + "grad_norm": 2.014143494128217, + "learning_rate": 1.1308062470624572e-05, + "loss": 1.0709, + "step": 4534 + }, + { + "epoch": 0.47462061747776035, + "grad_norm": 1.790897621437224, + "learning_rate": 1.1304701800010453e-05, + "loss": 0.9237, + "step": 4535 + }, + { + "epoch": 0.4747252747252747, + "grad_norm": 1.8946726238557325, + "learning_rate": 1.1301340979483448e-05, + "loss": 0.9411, + "step": 4536 + }, + { + "epoch": 0.4748299319727891, + "grad_norm": 1.8356376485135328, + "learning_rate": 1.129798000942973e-05, + "loss": 0.958, + "step": 4537 + }, + { + "epoch": 0.4749345892203035, + "grad_norm": 2.0838881862402765, + "learning_rate": 1.1294618890235474e-05, + "loss": 1.0522, + "step": 4538 + }, + { + "epoch": 0.4750392464678179, + "grad_norm": 2.1995593212183264, + "learning_rate": 1.1291257622286887e-05, + "loss": 0.9551, + "step": 4539 + }, + { + "epoch": 0.47514390371533227, + "grad_norm": 1.8441762381994096, + "learning_rate": 1.1287896205970178e-05, + "loss": 0.8266, + "step": 4540 + }, + { + "epoch": 0.4752485609628467, + "grad_norm": 4.422227979112271, + "learning_rate": 1.1284534641671588e-05, + "loss": 1.1611, + "step": 4541 + }, + { + "epoch": 0.4753532182103611, + "grad_norm": 2.3144636336989945, + "learning_rate": 1.128117292977736e-05, + "loss": 0.9875, + "step": 4542 + }, + { + "epoch": 0.47545787545787543, + "grad_norm": 2.10412297009563, + "learning_rate": 1.1277811070673765e-05, + "loss": 0.9113, + "step": 4543 + }, + { + "epoch": 0.47556253270538984, + "grad_norm": 2.215518349186847, + "learning_rate": 1.127444906474709e-05, + "loss": 0.8954, + "step": 4544 + }, + { + "epoch": 0.47566718995290425, + "grad_norm": 2.30718848055899, + "learning_rate": 1.1271086912383633e-05, + "loss": 0.9245, + "step": 4545 + }, + { + "epoch": 0.47577184720041865, + "grad_norm": 2.028770055513572, + "learning_rate": 1.1267724613969712e-05, + "loss": 0.9973, + "step": 4546 + }, + { + "epoch": 0.475876504447933, + "grad_norm": 2.2846525131478437, + "learning_rate": 1.1264362169891664e-05, + "loss": 1.0271, + "step": 4547 + }, + { + "epoch": 0.4759811616954474, + "grad_norm": 2.0978658851996257, + "learning_rate": 1.1260999580535842e-05, + "loss": 0.9312, + "step": 4548 + }, + { + "epoch": 0.4760858189429618, + "grad_norm": 3.541394648972959, + "learning_rate": 1.125763684628861e-05, + "loss": 1.0477, + "step": 4549 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 2.317340528280266, + "learning_rate": 1.1254273967536352e-05, + "loss": 0.9305, + "step": 4550 + }, + { + "epoch": 0.47629513343799057, + "grad_norm": 2.5592216831346093, + "learning_rate": 1.1250910944665475e-05, + "loss": 0.9933, + "step": 4551 + }, + { + "epoch": 0.476399790685505, + "grad_norm": 2.3033433240600942, + "learning_rate": 1.1247547778062394e-05, + "loss": 1.0646, + "step": 4552 + }, + { + "epoch": 0.4765044479330194, + "grad_norm": 1.782189325078589, + "learning_rate": 1.1244184468113544e-05, + "loss": 0.8961, + "step": 4553 + }, + { + "epoch": 0.47660910518053373, + "grad_norm": 2.3437218275063914, + "learning_rate": 1.1240821015205378e-05, + "loss": 0.9471, + "step": 4554 + }, + { + "epoch": 0.47671376242804814, + "grad_norm": 2.0231024397759563, + "learning_rate": 1.1237457419724357e-05, + "loss": 0.9574, + "step": 4555 + }, + { + "epoch": 0.47681841967556254, + "grad_norm": 1.9992919395657165, + "learning_rate": 1.1234093682056976e-05, + "loss": 0.8381, + "step": 4556 + }, + { + "epoch": 0.47692307692307695, + "grad_norm": 2.2448074144884433, + "learning_rate": 1.1230729802589727e-05, + "loss": 1.005, + "step": 4557 + }, + { + "epoch": 0.4770277341705913, + "grad_norm": 2.094998090831663, + "learning_rate": 1.1227365781709129e-05, + "loss": 0.8991, + "step": 4558 + }, + { + "epoch": 0.4771323914181057, + "grad_norm": 2.1848516156132227, + "learning_rate": 1.1224001619801711e-05, + "loss": 0.9532, + "step": 4559 + }, + { + "epoch": 0.4772370486656201, + "grad_norm": 1.9856619963316688, + "learning_rate": 1.122063731725403e-05, + "loss": 0.9081, + "step": 4560 + }, + { + "epoch": 0.47734170591313446, + "grad_norm": 2.5402648997557367, + "learning_rate": 1.1217272874452647e-05, + "loss": 1.0494, + "step": 4561 + }, + { + "epoch": 0.47744636316064887, + "grad_norm": 2.129703098208944, + "learning_rate": 1.121390829178414e-05, + "loss": 0.8839, + "step": 4562 + }, + { + "epoch": 0.4775510204081633, + "grad_norm": 2.207630890065239, + "learning_rate": 1.1210543569635113e-05, + "loss": 1.0222, + "step": 4563 + }, + { + "epoch": 0.4776556776556777, + "grad_norm": 2.3323434569261425, + "learning_rate": 1.1207178708392173e-05, + "loss": 0.8753, + "step": 4564 + }, + { + "epoch": 0.47776033490319203, + "grad_norm": 2.0722087950689616, + "learning_rate": 1.1203813708441953e-05, + "loss": 1.0534, + "step": 4565 + }, + { + "epoch": 0.47786499215070644, + "grad_norm": 1.9543831145256765, + "learning_rate": 1.1200448570171099e-05, + "loss": 1.0708, + "step": 4566 + }, + { + "epoch": 0.47796964939822084, + "grad_norm": 2.2669487242998487, + "learning_rate": 1.1197083293966267e-05, + "loss": 0.901, + "step": 4567 + }, + { + "epoch": 0.4780743066457352, + "grad_norm": 1.950939820948088, + "learning_rate": 1.1193717880214139e-05, + "loss": 0.8952, + "step": 4568 + }, + { + "epoch": 0.4781789638932496, + "grad_norm": 2.1867834960215737, + "learning_rate": 1.1190352329301408e-05, + "loss": 0.9256, + "step": 4569 + }, + { + "epoch": 0.478283621140764, + "grad_norm": 1.9100913001648718, + "learning_rate": 1.118698664161478e-05, + "loss": 0.9184, + "step": 4570 + }, + { + "epoch": 0.4783882783882784, + "grad_norm": 2.1641906169210765, + "learning_rate": 1.1183620817540985e-05, + "loss": 0.7917, + "step": 4571 + }, + { + "epoch": 0.47849293563579276, + "grad_norm": 1.964578596649523, + "learning_rate": 1.1180254857466753e-05, + "loss": 0.9407, + "step": 4572 + }, + { + "epoch": 0.47859759288330717, + "grad_norm": 1.9010529569383956, + "learning_rate": 1.1176888761778848e-05, + "loss": 0.9953, + "step": 4573 + }, + { + "epoch": 0.47870225013082157, + "grad_norm": 1.933991820551362, + "learning_rate": 1.1173522530864036e-05, + "loss": 1.007, + "step": 4574 + }, + { + "epoch": 0.478806907378336, + "grad_norm": 1.972812990270169, + "learning_rate": 1.117015616510911e-05, + "loss": 0.9306, + "step": 4575 + }, + { + "epoch": 0.47891156462585033, + "grad_norm": 2.0473114243949815, + "learning_rate": 1.1166789664900866e-05, + "loss": 1.005, + "step": 4576 + }, + { + "epoch": 0.47901622187336473, + "grad_norm": 2.1372399776159168, + "learning_rate": 1.1163423030626123e-05, + "loss": 0.9524, + "step": 4577 + }, + { + "epoch": 0.47912087912087914, + "grad_norm": 2.1890057903200826, + "learning_rate": 1.1160056262671718e-05, + "loss": 0.8805, + "step": 4578 + }, + { + "epoch": 0.4792255363683935, + "grad_norm": 2.2821866808196196, + "learning_rate": 1.1156689361424496e-05, + "loss": 0.931, + "step": 4579 + }, + { + "epoch": 0.4793301936159079, + "grad_norm": 1.92030262663086, + "learning_rate": 1.1153322327271324e-05, + "loss": 1.0229, + "step": 4580 + }, + { + "epoch": 0.4794348508634223, + "grad_norm": 2.264619342723187, + "learning_rate": 1.1149955160599073e-05, + "loss": 1.0943, + "step": 4581 + }, + { + "epoch": 0.4795395081109367, + "grad_norm": 2.4491546035308986, + "learning_rate": 1.1146587861794645e-05, + "loss": 0.8622, + "step": 4582 + }, + { + "epoch": 0.47964416535845106, + "grad_norm": 2.700636952916195, + "learning_rate": 1.114322043124495e-05, + "loss": 0.9035, + "step": 4583 + }, + { + "epoch": 0.47974882260596546, + "grad_norm": 5.883040265797574, + "learning_rate": 1.1139852869336907e-05, + "loss": 1.0753, + "step": 4584 + }, + { + "epoch": 0.47985347985347987, + "grad_norm": 2.2890986655042234, + "learning_rate": 1.113648517645746e-05, + "loss": 1.0844, + "step": 4585 + }, + { + "epoch": 0.4799581371009942, + "grad_norm": 1.8524051771038759, + "learning_rate": 1.1133117352993559e-05, + "loss": 0.9628, + "step": 4586 + }, + { + "epoch": 0.4800627943485086, + "grad_norm": 2.2350258071349263, + "learning_rate": 1.1129749399332182e-05, + "loss": 0.9315, + "step": 4587 + }, + { + "epoch": 0.48016745159602303, + "grad_norm": 3.5795609315488726, + "learning_rate": 1.1126381315860306e-05, + "loss": 1.1486, + "step": 4588 + }, + { + "epoch": 0.48027210884353744, + "grad_norm": 2.0026623409331696, + "learning_rate": 1.1123013102964934e-05, + "loss": 1.0134, + "step": 4589 + }, + { + "epoch": 0.4803767660910518, + "grad_norm": 2.44461585559246, + "learning_rate": 1.1119644761033079e-05, + "loss": 1.0314, + "step": 4590 + }, + { + "epoch": 0.4804814233385662, + "grad_norm": 1.9386712926730827, + "learning_rate": 1.1116276290451773e-05, + "loss": 1.0203, + "step": 4591 + }, + { + "epoch": 0.4805860805860806, + "grad_norm": 1.9273580062176585, + "learning_rate": 1.1112907691608059e-05, + "loss": 0.9667, + "step": 4592 + }, + { + "epoch": 0.48069073783359495, + "grad_norm": 2.2393335832841483, + "learning_rate": 1.1109538964888993e-05, + "loss": 0.9332, + "step": 4593 + }, + { + "epoch": 0.48079539508110936, + "grad_norm": 2.2101404361932397, + "learning_rate": 1.110617011068165e-05, + "loss": 0.8757, + "step": 4594 + }, + { + "epoch": 0.48090005232862376, + "grad_norm": 2.07916151536985, + "learning_rate": 1.110280112937312e-05, + "loss": 0.8394, + "step": 4595 + }, + { + "epoch": 0.48100470957613817, + "grad_norm": 2.4008188696017077, + "learning_rate": 1.1099432021350504e-05, + "loss": 0.9937, + "step": 4596 + }, + { + "epoch": 0.4811093668236525, + "grad_norm": 1.98281407430736, + "learning_rate": 1.1096062787000923e-05, + "loss": 0.9293, + "step": 4597 + }, + { + "epoch": 0.4812140240711669, + "grad_norm": 1.6750650744945574, + "learning_rate": 1.1092693426711501e-05, + "loss": 0.8145, + "step": 4598 + }, + { + "epoch": 0.48131868131868133, + "grad_norm": 2.1258343895313137, + "learning_rate": 1.1089323940869392e-05, + "loss": 1.0777, + "step": 4599 + }, + { + "epoch": 0.48142333856619574, + "grad_norm": 2.3850630786161227, + "learning_rate": 1.1085954329861754e-05, + "loss": 1.1615, + "step": 4600 + }, + { + "epoch": 0.4815279958137101, + "grad_norm": 2.041493035918398, + "learning_rate": 1.1082584594075762e-05, + "loss": 0.9125, + "step": 4601 + }, + { + "epoch": 0.4816326530612245, + "grad_norm": 2.102783748957608, + "learning_rate": 1.1079214733898606e-05, + "loss": 0.9909, + "step": 4602 + }, + { + "epoch": 0.4817373103087389, + "grad_norm": 1.6810209540981638, + "learning_rate": 1.1075844749717486e-05, + "loss": 0.9937, + "step": 4603 + }, + { + "epoch": 0.48184196755625325, + "grad_norm": 1.9499292480253156, + "learning_rate": 1.1072474641919626e-05, + "loss": 1.0339, + "step": 4604 + }, + { + "epoch": 0.48194662480376765, + "grad_norm": 1.9692957698439062, + "learning_rate": 1.1069104410892255e-05, + "loss": 0.9911, + "step": 4605 + }, + { + "epoch": 0.48205128205128206, + "grad_norm": 1.9832587584087509, + "learning_rate": 1.1065734057022622e-05, + "loss": 0.9872, + "step": 4606 + }, + { + "epoch": 0.48215593929879647, + "grad_norm": 1.9753953053703226, + "learning_rate": 1.1062363580697982e-05, + "loss": 0.8596, + "step": 4607 + }, + { + "epoch": 0.4822605965463108, + "grad_norm": 2.2053778552706387, + "learning_rate": 1.1058992982305613e-05, + "loss": 0.9707, + "step": 4608 + }, + { + "epoch": 0.4823652537938252, + "grad_norm": 4.451868601106817, + "learning_rate": 1.1055622262232808e-05, + "loss": 1.0707, + "step": 4609 + }, + { + "epoch": 0.4824699110413396, + "grad_norm": 2.427513648919258, + "learning_rate": 1.105225142086686e-05, + "loss": 1.093, + "step": 4610 + }, + { + "epoch": 0.482574568288854, + "grad_norm": 1.8975018847651661, + "learning_rate": 1.1048880458595093e-05, + "loss": 0.9038, + "step": 4611 + }, + { + "epoch": 0.4826792255363684, + "grad_norm": 2.101137234743737, + "learning_rate": 1.1045509375804835e-05, + "loss": 0.9936, + "step": 4612 + }, + { + "epoch": 0.4827838827838828, + "grad_norm": 2.116146425294967, + "learning_rate": 1.104213817288343e-05, + "loss": 0.9991, + "step": 4613 + }, + { + "epoch": 0.4828885400313972, + "grad_norm": 7.379023528902324, + "learning_rate": 1.1038766850218237e-05, + "loss": 1.0964, + "step": 4614 + }, + { + "epoch": 0.48299319727891155, + "grad_norm": 2.303483496597832, + "learning_rate": 1.1035395408196625e-05, + "loss": 1.0693, + "step": 4615 + }, + { + "epoch": 0.48309785452642595, + "grad_norm": 1.8368414922882212, + "learning_rate": 1.103202384720598e-05, + "loss": 0.9938, + "step": 4616 + }, + { + "epoch": 0.48320251177394036, + "grad_norm": 1.939124879567201, + "learning_rate": 1.1028652167633706e-05, + "loss": 0.9361, + "step": 4617 + }, + { + "epoch": 0.48330716902145476, + "grad_norm": 1.931462825025141, + "learning_rate": 1.1025280369867216e-05, + "loss": 0.996, + "step": 4618 + }, + { + "epoch": 0.4834118262689691, + "grad_norm": 1.8660312624366957, + "learning_rate": 1.102190845429393e-05, + "loss": 0.8925, + "step": 4619 + }, + { + "epoch": 0.4835164835164835, + "grad_norm": 1.9759725551334741, + "learning_rate": 1.1018536421301287e-05, + "loss": 0.8974, + "step": 4620 + }, + { + "epoch": 0.4836211407639979, + "grad_norm": 2.042722969414731, + "learning_rate": 1.1015164271276747e-05, + "loss": 1.0188, + "step": 4621 + }, + { + "epoch": 0.4837257980115123, + "grad_norm": 2.038746548550091, + "learning_rate": 1.1011792004607778e-05, + "loss": 0.875, + "step": 4622 + }, + { + "epoch": 0.4838304552590267, + "grad_norm": 2.104636214257139, + "learning_rate": 1.1008419621681854e-05, + "loss": 0.9584, + "step": 4623 + }, + { + "epoch": 0.4839351125065411, + "grad_norm": 1.8747597757848282, + "learning_rate": 1.100504712288647e-05, + "loss": 0.9251, + "step": 4624 + }, + { + "epoch": 0.4840397697540555, + "grad_norm": 2.2211808317732418, + "learning_rate": 1.1001674508609135e-05, + "loss": 1.0446, + "step": 4625 + }, + { + "epoch": 0.48414442700156984, + "grad_norm": 2.3946698315504005, + "learning_rate": 1.0998301779237367e-05, + "loss": 1.0903, + "step": 4626 + }, + { + "epoch": 0.48424908424908425, + "grad_norm": 2.2432670399188472, + "learning_rate": 1.0994928935158703e-05, + "loss": 0.9233, + "step": 4627 + }, + { + "epoch": 0.48435374149659866, + "grad_norm": 1.8197431587694193, + "learning_rate": 1.0991555976760688e-05, + "loss": 0.9245, + "step": 4628 + }, + { + "epoch": 0.484458398744113, + "grad_norm": 2.025709348707062, + "learning_rate": 1.098818290443088e-05, + "loss": 0.9994, + "step": 4629 + }, + { + "epoch": 0.4845630559916274, + "grad_norm": 2.065981035535976, + "learning_rate": 1.098480971855685e-05, + "loss": 1.0224, + "step": 4630 + }, + { + "epoch": 0.4846677132391418, + "grad_norm": 5.77757048663166, + "learning_rate": 1.0981436419526193e-05, + "loss": 1.1061, + "step": 4631 + }, + { + "epoch": 0.4847723704866562, + "grad_norm": 1.870408334687293, + "learning_rate": 1.0978063007726495e-05, + "loss": 0.7735, + "step": 4632 + }, + { + "epoch": 0.4848770277341706, + "grad_norm": 3.3267436752556065, + "learning_rate": 1.0974689483545378e-05, + "loss": 1.0417, + "step": 4633 + }, + { + "epoch": 0.484981684981685, + "grad_norm": 1.9245609462036217, + "learning_rate": 1.0971315847370463e-05, + "loss": 0.8995, + "step": 4634 + }, + { + "epoch": 0.4850863422291994, + "grad_norm": 1.9011101446154672, + "learning_rate": 1.0967942099589391e-05, + "loss": 0.8339, + "step": 4635 + }, + { + "epoch": 0.48519099947671374, + "grad_norm": 1.9115078692573162, + "learning_rate": 1.0964568240589808e-05, + "loss": 0.7742, + "step": 4636 + }, + { + "epoch": 0.48529565672422814, + "grad_norm": 1.9022548969400057, + "learning_rate": 1.0961194270759375e-05, + "loss": 0.9253, + "step": 4637 + }, + { + "epoch": 0.48540031397174255, + "grad_norm": 2.0318224541882657, + "learning_rate": 1.0957820190485775e-05, + "loss": 0.9972, + "step": 4638 + }, + { + "epoch": 0.48550497121925695, + "grad_norm": 2.0727823257764544, + "learning_rate": 1.0954446000156693e-05, + "loss": 1.058, + "step": 4639 + }, + { + "epoch": 0.4856096284667713, + "grad_norm": 1.9956018186674882, + "learning_rate": 1.0951071700159833e-05, + "loss": 0.8057, + "step": 4640 + }, + { + "epoch": 0.4857142857142857, + "grad_norm": 2.20621162649101, + "learning_rate": 1.0947697290882903e-05, + "loss": 0.834, + "step": 4641 + }, + { + "epoch": 0.4858189429618001, + "grad_norm": 2.051794455421428, + "learning_rate": 1.0944322772713634e-05, + "loss": 0.9757, + "step": 4642 + }, + { + "epoch": 0.4859236002093145, + "grad_norm": 2.1409550137945974, + "learning_rate": 1.0940948146039768e-05, + "loss": 0.974, + "step": 4643 + }, + { + "epoch": 0.48602825745682887, + "grad_norm": 1.948251250014736, + "learning_rate": 1.0937573411249047e-05, + "loss": 0.9849, + "step": 4644 + }, + { + "epoch": 0.4861329147043433, + "grad_norm": 1.9818999450949544, + "learning_rate": 1.0934198568729245e-05, + "loss": 0.9915, + "step": 4645 + }, + { + "epoch": 0.4862375719518577, + "grad_norm": 2.056814682650398, + "learning_rate": 1.0930823618868128e-05, + "loss": 0.9572, + "step": 4646 + }, + { + "epoch": 0.48634222919937203, + "grad_norm": 1.93186679850152, + "learning_rate": 1.0927448562053494e-05, + "loss": 0.9447, + "step": 4647 + }, + { + "epoch": 0.48644688644688644, + "grad_norm": 1.9807004018502983, + "learning_rate": 1.092407339867314e-05, + "loss": 0.9509, + "step": 4648 + }, + { + "epoch": 0.48655154369440085, + "grad_norm": 2.2561421939508324, + "learning_rate": 1.0920698129114877e-05, + "loss": 0.9115, + "step": 4649 + }, + { + "epoch": 0.48665620094191525, + "grad_norm": 1.7925812958550895, + "learning_rate": 1.0917322753766536e-05, + "loss": 0.8921, + "step": 4650 + }, + { + "epoch": 0.4867608581894296, + "grad_norm": 1.980186761047103, + "learning_rate": 1.0913947273015948e-05, + "loss": 0.9832, + "step": 4651 + }, + { + "epoch": 0.486865515436944, + "grad_norm": 2.3562237026686343, + "learning_rate": 1.0910571687250965e-05, + "loss": 0.9792, + "step": 4652 + }, + { + "epoch": 0.4869701726844584, + "grad_norm": 5.0353868336406915, + "learning_rate": 1.0907195996859453e-05, + "loss": 0.9708, + "step": 4653 + }, + { + "epoch": 0.48707482993197276, + "grad_norm": 1.9131908425142234, + "learning_rate": 1.090382020222928e-05, + "loss": 0.9326, + "step": 4654 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 1.790537959582867, + "learning_rate": 1.0900444303748333e-05, + "loss": 0.9168, + "step": 4655 + }, + { + "epoch": 0.4872841444270016, + "grad_norm": 2.140607384118724, + "learning_rate": 1.089706830180451e-05, + "loss": 1.1137, + "step": 4656 + }, + { + "epoch": 0.487388801674516, + "grad_norm": 1.891634591818433, + "learning_rate": 1.0893692196785722e-05, + "loss": 0.8324, + "step": 4657 + }, + { + "epoch": 0.48749345892203033, + "grad_norm": 2.09720854094086, + "learning_rate": 1.0890315989079887e-05, + "loss": 0.9571, + "step": 4658 + }, + { + "epoch": 0.48759811616954474, + "grad_norm": 2.332233200389157, + "learning_rate": 1.0886939679074939e-05, + "loss": 1.0647, + "step": 4659 + }, + { + "epoch": 0.48770277341705914, + "grad_norm": 2.3675130191516014, + "learning_rate": 1.0883563267158827e-05, + "loss": 0.8783, + "step": 4660 + }, + { + "epoch": 0.48780743066457355, + "grad_norm": 2.067847253311974, + "learning_rate": 1.0880186753719505e-05, + "loss": 0.9496, + "step": 4661 + }, + { + "epoch": 0.4879120879120879, + "grad_norm": 2.059957357047224, + "learning_rate": 1.087681013914494e-05, + "loss": 1.0097, + "step": 4662 + }, + { + "epoch": 0.4880167451596023, + "grad_norm": 1.7780085625194908, + "learning_rate": 1.0873433423823108e-05, + "loss": 0.9731, + "step": 4663 + }, + { + "epoch": 0.4881214024071167, + "grad_norm": 2.2693994526383183, + "learning_rate": 1.087005660814201e-05, + "loss": 0.9633, + "step": 4664 + }, + { + "epoch": 0.48822605965463106, + "grad_norm": 2.2058172273341343, + "learning_rate": 1.0866679692489643e-05, + "loss": 0.8353, + "step": 4665 + }, + { + "epoch": 0.48833071690214547, + "grad_norm": 2.1061269956596846, + "learning_rate": 1.0863302677254021e-05, + "loss": 0.9219, + "step": 4666 + }, + { + "epoch": 0.4884353741496599, + "grad_norm": 2.000729147710908, + "learning_rate": 1.0859925562823172e-05, + "loss": 0.9055, + "step": 4667 + }, + { + "epoch": 0.4885400313971743, + "grad_norm": 1.9868769740264263, + "learning_rate": 1.0856548349585132e-05, + "loss": 0.9625, + "step": 4668 + }, + { + "epoch": 0.48864468864468863, + "grad_norm": 1.9715388535082388, + "learning_rate": 1.0853171037927952e-05, + "loss": 0.9459, + "step": 4669 + }, + { + "epoch": 0.48874934589220304, + "grad_norm": 2.1247424417156147, + "learning_rate": 1.0849793628239687e-05, + "loss": 1.0457, + "step": 4670 + }, + { + "epoch": 0.48885400313971744, + "grad_norm": 2.0181718772211426, + "learning_rate": 1.0846416120908416e-05, + "loss": 0.8496, + "step": 4671 + }, + { + "epoch": 0.4889586603872318, + "grad_norm": 1.858935680154186, + "learning_rate": 1.0843038516322213e-05, + "loss": 1.0369, + "step": 4672 + }, + { + "epoch": 0.4890633176347462, + "grad_norm": 1.9353831132266497, + "learning_rate": 1.0839660814869174e-05, + "loss": 1.0288, + "step": 4673 + }, + { + "epoch": 0.4891679748822606, + "grad_norm": 1.92877995038633, + "learning_rate": 1.083628301693741e-05, + "loss": 0.9495, + "step": 4674 + }, + { + "epoch": 0.489272632129775, + "grad_norm": 1.8743301066850448, + "learning_rate": 1.083290512291503e-05, + "loss": 0.9853, + "step": 4675 + }, + { + "epoch": 0.48937728937728936, + "grad_norm": 2.166368633461282, + "learning_rate": 1.0829527133190163e-05, + "loss": 0.9273, + "step": 4676 + }, + { + "epoch": 0.48948194662480377, + "grad_norm": 2.626516377134673, + "learning_rate": 1.0826149048150947e-05, + "loss": 0.9658, + "step": 4677 + }, + { + "epoch": 0.48958660387231817, + "grad_norm": 2.007119331685366, + "learning_rate": 1.082277086818553e-05, + "loss": 0.9525, + "step": 4678 + }, + { + "epoch": 0.4896912611198325, + "grad_norm": 2.3214333736204043, + "learning_rate": 1.0819392593682074e-05, + "loss": 0.9228, + "step": 4679 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 2.1792774915549464, + "learning_rate": 1.0816014225028745e-05, + "loss": 1.001, + "step": 4680 + }, + { + "epoch": 0.48990057561486133, + "grad_norm": 1.871180713970224, + "learning_rate": 1.0812635762613728e-05, + "loss": 0.9649, + "step": 4681 + }, + { + "epoch": 0.49000523286237574, + "grad_norm": 2.415124070494655, + "learning_rate": 1.0809257206825214e-05, + "loss": 0.8755, + "step": 4682 + }, + { + "epoch": 0.4901098901098901, + "grad_norm": 1.972768747408386, + "learning_rate": 1.080587855805141e-05, + "loss": 0.9227, + "step": 4683 + }, + { + "epoch": 0.4902145473574045, + "grad_norm": 2.0814935675576196, + "learning_rate": 1.0802499816680525e-05, + "loss": 0.9524, + "step": 4684 + }, + { + "epoch": 0.4903192046049189, + "grad_norm": 2.1273011146323424, + "learning_rate": 1.079912098310078e-05, + "loss": 0.9118, + "step": 4685 + }, + { + "epoch": 0.4904238618524333, + "grad_norm": 2.3374604970516932, + "learning_rate": 1.0795742057700419e-05, + "loss": 1.0986, + "step": 4686 + }, + { + "epoch": 0.49052851909994766, + "grad_norm": 2.362107494802518, + "learning_rate": 1.0792363040867677e-05, + "loss": 0.941, + "step": 4687 + }, + { + "epoch": 0.49063317634746206, + "grad_norm": 1.7863431015815339, + "learning_rate": 1.0788983932990821e-05, + "loss": 0.8811, + "step": 4688 + }, + { + "epoch": 0.49073783359497647, + "grad_norm": 1.9476171669722804, + "learning_rate": 1.0785604734458108e-05, + "loss": 0.9976, + "step": 4689 + }, + { + "epoch": 0.4908424908424908, + "grad_norm": 1.9535812880238408, + "learning_rate": 1.0782225445657818e-05, + "loss": 0.9657, + "step": 4690 + }, + { + "epoch": 0.4909471480900052, + "grad_norm": 2.09502949679378, + "learning_rate": 1.0778846066978244e-05, + "loss": 1.0246, + "step": 4691 + }, + { + "epoch": 0.49105180533751963, + "grad_norm": 1.9911290871631744, + "learning_rate": 1.0775466598807673e-05, + "loss": 0.9342, + "step": 4692 + }, + { + "epoch": 0.49115646258503404, + "grad_norm": 2.0405251799273074, + "learning_rate": 1.077208704153442e-05, + "loss": 0.8705, + "step": 4693 + }, + { + "epoch": 0.4912611198325484, + "grad_norm": 2.230283723602765, + "learning_rate": 1.07687073955468e-05, + "loss": 0.9413, + "step": 4694 + }, + { + "epoch": 0.4913657770800628, + "grad_norm": 2.0963860345437997, + "learning_rate": 1.076532766123314e-05, + "loss": 1.0367, + "step": 4695 + }, + { + "epoch": 0.4914704343275772, + "grad_norm": 2.0670944200322845, + "learning_rate": 1.0761947838981784e-05, + "loss": 0.9836, + "step": 4696 + }, + { + "epoch": 0.49157509157509155, + "grad_norm": 1.9165456806981649, + "learning_rate": 1.0758567929181074e-05, + "loss": 0.959, + "step": 4697 + }, + { + "epoch": 0.49167974882260596, + "grad_norm": 2.228963424636731, + "learning_rate": 1.075518793221937e-05, + "loss": 0.9847, + "step": 4698 + }, + { + "epoch": 0.49178440607012036, + "grad_norm": 2.3859878281796747, + "learning_rate": 1.0751807848485043e-05, + "loss": 0.9669, + "step": 4699 + }, + { + "epoch": 0.49188906331763477, + "grad_norm": 1.8576336024035527, + "learning_rate": 1.0748427678366474e-05, + "loss": 0.9819, + "step": 4700 + }, + { + "epoch": 0.4919937205651491, + "grad_norm": 1.9665979871967703, + "learning_rate": 1.0745047422252041e-05, + "loss": 0.9886, + "step": 4701 + }, + { + "epoch": 0.4920983778126635, + "grad_norm": 1.9561562986500205, + "learning_rate": 1.0741667080530152e-05, + "loss": 0.9116, + "step": 4702 + }, + { + "epoch": 0.49220303506017793, + "grad_norm": 1.84642427868714, + "learning_rate": 1.0738286653589215e-05, + "loss": 0.9101, + "step": 4703 + }, + { + "epoch": 0.49230769230769234, + "grad_norm": 2.2932509082000823, + "learning_rate": 1.0734906141817638e-05, + "loss": 1.0601, + "step": 4704 + }, + { + "epoch": 0.4924123495552067, + "grad_norm": 2.1010715133795403, + "learning_rate": 1.0731525545603863e-05, + "loss": 0.9556, + "step": 4705 + }, + { + "epoch": 0.4925170068027211, + "grad_norm": 2.0849794034429117, + "learning_rate": 1.0728144865336315e-05, + "loss": 0.8108, + "step": 4706 + }, + { + "epoch": 0.4926216640502355, + "grad_norm": 6.181013708885361, + "learning_rate": 1.0724764101403445e-05, + "loss": 0.9935, + "step": 4707 + }, + { + "epoch": 0.49272632129774985, + "grad_norm": 2.045937670470183, + "learning_rate": 1.0721383254193714e-05, + "loss": 0.9471, + "step": 4708 + }, + { + "epoch": 0.49283097854526425, + "grad_norm": 2.7772247895749613, + "learning_rate": 1.0718002324095582e-05, + "loss": 0.9565, + "step": 4709 + }, + { + "epoch": 0.49293563579277866, + "grad_norm": 1.890604125310268, + "learning_rate": 1.071462131149753e-05, + "loss": 0.9257, + "step": 4710 + }, + { + "epoch": 0.49304029304029307, + "grad_norm": 2.489027702426938, + "learning_rate": 1.0711240216788036e-05, + "loss": 0.9638, + "step": 4711 + }, + { + "epoch": 0.4931449502878074, + "grad_norm": 2.455683040276122, + "learning_rate": 1.0707859040355603e-05, + "loss": 1.0213, + "step": 4712 + }, + { + "epoch": 0.4932496075353218, + "grad_norm": 2.143398110677024, + "learning_rate": 1.0704477782588733e-05, + "loss": 0.9781, + "step": 4713 + }, + { + "epoch": 0.4933542647828362, + "grad_norm": 1.9663410678077706, + "learning_rate": 1.0701096443875936e-05, + "loss": 0.9224, + "step": 4714 + }, + { + "epoch": 0.4934589220303506, + "grad_norm": 2.1480345691790705, + "learning_rate": 1.0697715024605735e-05, + "loss": 0.9175, + "step": 4715 + }, + { + "epoch": 0.493563579277865, + "grad_norm": 9.477655375017251, + "learning_rate": 1.0694333525166662e-05, + "loss": 0.978, + "step": 4716 + }, + { + "epoch": 0.4936682365253794, + "grad_norm": 1.80863597665584, + "learning_rate": 1.0690951945947263e-05, + "loss": 0.8808, + "step": 4717 + }, + { + "epoch": 0.4937728937728938, + "grad_norm": 1.886985809641338, + "learning_rate": 1.0687570287336082e-05, + "loss": 0.8756, + "step": 4718 + }, + { + "epoch": 0.49387755102040815, + "grad_norm": 2.0556937103470143, + "learning_rate": 1.0684188549721683e-05, + "loss": 0.9244, + "step": 4719 + }, + { + "epoch": 0.49398220826792255, + "grad_norm": 2.0110374519014154, + "learning_rate": 1.0680806733492632e-05, + "loss": 1.0597, + "step": 4720 + }, + { + "epoch": 0.49408686551543696, + "grad_norm": 1.7973197164685586, + "learning_rate": 1.0677424839037506e-05, + "loss": 0.9803, + "step": 4721 + }, + { + "epoch": 0.4941915227629513, + "grad_norm": 2.186835840861067, + "learning_rate": 1.0674042866744896e-05, + "loss": 0.9843, + "step": 4722 + }, + { + "epoch": 0.4942961800104657, + "grad_norm": 1.9713736175942005, + "learning_rate": 1.0670660817003391e-05, + "loss": 1.0191, + "step": 4723 + }, + { + "epoch": 0.4944008372579801, + "grad_norm": 2.1417694138266845, + "learning_rate": 1.0667278690201597e-05, + "loss": 0.9995, + "step": 4724 + }, + { + "epoch": 0.4945054945054945, + "grad_norm": 2.8859513823150724, + "learning_rate": 1.0663896486728134e-05, + "loss": 1.1489, + "step": 4725 + }, + { + "epoch": 0.4946101517530089, + "grad_norm": 1.9628740590998055, + "learning_rate": 1.0660514206971615e-05, + "loss": 0.9062, + "step": 4726 + }, + { + "epoch": 0.4947148090005233, + "grad_norm": 1.8512096693230278, + "learning_rate": 1.0657131851320677e-05, + "loss": 0.8803, + "step": 4727 + }, + { + "epoch": 0.4948194662480377, + "grad_norm": 2.433646998663187, + "learning_rate": 1.0653749420163953e-05, + "loss": 0.9568, + "step": 4728 + }, + { + "epoch": 0.4949241234955521, + "grad_norm": 2.0345077036147656, + "learning_rate": 1.06503669138901e-05, + "loss": 0.7353, + "step": 4729 + }, + { + "epoch": 0.49502878074306644, + "grad_norm": 1.9424603161161924, + "learning_rate": 1.0646984332887766e-05, + "loss": 0.968, + "step": 4730 + }, + { + "epoch": 0.49513343799058085, + "grad_norm": 2.3399922101360553, + "learning_rate": 1.0643601677545626e-05, + "loss": 0.9581, + "step": 4731 + }, + { + "epoch": 0.49523809523809526, + "grad_norm": 1.8649126698331577, + "learning_rate": 1.0640218948252345e-05, + "loss": 0.7876, + "step": 4732 + }, + { + "epoch": 0.4953427524856096, + "grad_norm": 2.331216885557431, + "learning_rate": 1.0636836145396607e-05, + "loss": 0.9769, + "step": 4733 + }, + { + "epoch": 0.495447409733124, + "grad_norm": 2.163582657761894, + "learning_rate": 1.063345326936711e-05, + "loss": 0.9926, + "step": 4734 + }, + { + "epoch": 0.4955520669806384, + "grad_norm": 2.081349677166352, + "learning_rate": 1.0630070320552544e-05, + "loss": 1.0198, + "step": 4735 + }, + { + "epoch": 0.4956567242281528, + "grad_norm": 1.7921126006767771, + "learning_rate": 1.0626687299341622e-05, + "loss": 0.8941, + "step": 4736 + }, + { + "epoch": 0.4957613814756672, + "grad_norm": 1.7566094714296194, + "learning_rate": 1.062330420612306e-05, + "loss": 0.9175, + "step": 4737 + }, + { + "epoch": 0.4958660387231816, + "grad_norm": 2.3106445373652496, + "learning_rate": 1.0619921041285579e-05, + "loss": 0.9188, + "step": 4738 + }, + { + "epoch": 0.495970695970696, + "grad_norm": 2.8264044010883413, + "learning_rate": 1.0616537805217918e-05, + "loss": 0.8804, + "step": 4739 + }, + { + "epoch": 0.49607535321821034, + "grad_norm": 1.9520944530357538, + "learning_rate": 1.0613154498308808e-05, + "loss": 0.957, + "step": 4740 + }, + { + "epoch": 0.49618001046572474, + "grad_norm": 2.3861280899553905, + "learning_rate": 1.0609771120947005e-05, + "loss": 0.9467, + "step": 4741 + }, + { + "epoch": 0.49628466771323915, + "grad_norm": 2.0740843031163045, + "learning_rate": 1.0606387673521265e-05, + "loss": 0.9223, + "step": 4742 + }, + { + "epoch": 0.49638932496075355, + "grad_norm": 9.399003361754902, + "learning_rate": 1.0603004156420354e-05, + "loss": 1.2298, + "step": 4743 + }, + { + "epoch": 0.4964939822082679, + "grad_norm": 2.0945292190249747, + "learning_rate": 1.059962057003304e-05, + "loss": 1.0156, + "step": 4744 + }, + { + "epoch": 0.4965986394557823, + "grad_norm": 2.1545476746682453, + "learning_rate": 1.0596236914748107e-05, + "loss": 1.0291, + "step": 4745 + }, + { + "epoch": 0.4967032967032967, + "grad_norm": 1.8857323776138402, + "learning_rate": 1.0592853190954345e-05, + "loss": 0.9664, + "step": 4746 + }, + { + "epoch": 0.4968079539508111, + "grad_norm": 2.095800214096105, + "learning_rate": 1.0589469399040546e-05, + "loss": 0.9522, + "step": 4747 + }, + { + "epoch": 0.49691261119832547, + "grad_norm": 2.0247693141512055, + "learning_rate": 1.0586085539395523e-05, + "loss": 1.0192, + "step": 4748 + }, + { + "epoch": 0.4970172684458399, + "grad_norm": 1.8584866253076624, + "learning_rate": 1.0582701612408083e-05, + "loss": 0.9195, + "step": 4749 + }, + { + "epoch": 0.4971219256933543, + "grad_norm": 1.931335358563096, + "learning_rate": 1.0579317618467043e-05, + "loss": 0.9267, + "step": 4750 + }, + { + "epoch": 0.49722658294086863, + "grad_norm": 2.097579996549894, + "learning_rate": 1.0575933557961238e-05, + "loss": 0.9581, + "step": 4751 + }, + { + "epoch": 0.49733124018838304, + "grad_norm": 2.033270771923552, + "learning_rate": 1.0572549431279495e-05, + "loss": 0.9175, + "step": 4752 + }, + { + "epoch": 0.49743589743589745, + "grad_norm": 2.2607341333747772, + "learning_rate": 1.0569165238810666e-05, + "loss": 1.0193, + "step": 4753 + }, + { + "epoch": 0.49754055468341185, + "grad_norm": 1.8647489455827506, + "learning_rate": 1.0565780980943596e-05, + "loss": 0.9772, + "step": 4754 + }, + { + "epoch": 0.4976452119309262, + "grad_norm": 1.9057129930055388, + "learning_rate": 1.0562396658067142e-05, + "loss": 0.9925, + "step": 4755 + }, + { + "epoch": 0.4977498691784406, + "grad_norm": 2.8885566484907823, + "learning_rate": 1.0559012270570177e-05, + "loss": 0.9656, + "step": 4756 + }, + { + "epoch": 0.497854526425955, + "grad_norm": 1.895968677495429, + "learning_rate": 1.0555627818841563e-05, + "loss": 0.8475, + "step": 4757 + }, + { + "epoch": 0.49795918367346936, + "grad_norm": 2.019680946799338, + "learning_rate": 1.055224330327019e-05, + "loss": 0.983, + "step": 4758 + }, + { + "epoch": 0.49806384092098377, + "grad_norm": 1.9749773215931818, + "learning_rate": 1.0548858724244939e-05, + "loss": 0.948, + "step": 4759 + }, + { + "epoch": 0.4981684981684982, + "grad_norm": 1.9535266279189702, + "learning_rate": 1.0545474082154711e-05, + "loss": 0.9621, + "step": 4760 + }, + { + "epoch": 0.4982731554160126, + "grad_norm": 3.9985738676646103, + "learning_rate": 1.0542089377388406e-05, + "loss": 0.9936, + "step": 4761 + }, + { + "epoch": 0.49837781266352693, + "grad_norm": 2.2584714009921356, + "learning_rate": 1.0538704610334929e-05, + "loss": 0.871, + "step": 4762 + }, + { + "epoch": 0.49848246991104134, + "grad_norm": 2.2312381629656404, + "learning_rate": 1.0535319781383202e-05, + "loss": 1.001, + "step": 4763 + }, + { + "epoch": 0.49858712715855574, + "grad_norm": 2.0424072639034816, + "learning_rate": 1.0531934890922147e-05, + "loss": 0.9762, + "step": 4764 + }, + { + "epoch": 0.4986917844060701, + "grad_norm": 2.1580470907610887, + "learning_rate": 1.0528549939340695e-05, + "loss": 0.9045, + "step": 4765 + }, + { + "epoch": 0.4987964416535845, + "grad_norm": 2.2467255664294528, + "learning_rate": 1.0525164927027783e-05, + "loss": 1.0969, + "step": 4766 + }, + { + "epoch": 0.4989010989010989, + "grad_norm": 1.982846113442221, + "learning_rate": 1.0521779854372353e-05, + "loss": 1.038, + "step": 4767 + }, + { + "epoch": 0.4990057561486133, + "grad_norm": 2.3854040119508233, + "learning_rate": 1.0518394721763365e-05, + "loss": 1.0746, + "step": 4768 + }, + { + "epoch": 0.49911041339612766, + "grad_norm": 1.9707658343258043, + "learning_rate": 1.051500952958977e-05, + "loss": 0.8012, + "step": 4769 + }, + { + "epoch": 0.49921507064364207, + "grad_norm": 2.1046625201814746, + "learning_rate": 1.051162427824054e-05, + "loss": 0.9312, + "step": 4770 + }, + { + "epoch": 0.4993197278911565, + "grad_norm": 1.6661194083882027, + "learning_rate": 1.0508238968104638e-05, + "loss": 0.7945, + "step": 4771 + }, + { + "epoch": 0.4994243851386709, + "grad_norm": 2.083849894430902, + "learning_rate": 1.050485359957105e-05, + "loss": 0.8743, + "step": 4772 + }, + { + "epoch": 0.49952904238618523, + "grad_norm": 2.2924292467485974, + "learning_rate": 1.0501468173028762e-05, + "loss": 1.0575, + "step": 4773 + }, + { + "epoch": 0.49963369963369964, + "grad_norm": 1.9733099739758075, + "learning_rate": 1.0498082688866762e-05, + "loss": 0.9393, + "step": 4774 + }, + { + "epoch": 0.49973835688121404, + "grad_norm": 1.9074190492709282, + "learning_rate": 1.0494697147474053e-05, + "loss": 0.9925, + "step": 4775 + }, + { + "epoch": 0.4998430141287284, + "grad_norm": 1.859529806411289, + "learning_rate": 1.0491311549239635e-05, + "loss": 1.0015, + "step": 4776 + }, + { + "epoch": 0.4999476713762428, + "grad_norm": 1.8791033216965147, + "learning_rate": 1.0487925894552529e-05, + "loss": 1.0242, + "step": 4777 + }, + { + "epoch": 0.5000523286237571, + "grad_norm": 1.9342110638713974, + "learning_rate": 1.0484540183801744e-05, + "loss": 0.9966, + "step": 4778 + }, + { + "epoch": 0.5001569858712716, + "grad_norm": 1.7724120947349102, + "learning_rate": 1.0481154417376313e-05, + "loss": 0.9939, + "step": 4779 + }, + { + "epoch": 0.500261643118786, + "grad_norm": 2.154486064088719, + "learning_rate": 1.0477768595665263e-05, + "loss": 1.0931, + "step": 4780 + }, + { + "epoch": 0.5003663003663004, + "grad_norm": 2.1518890500826506, + "learning_rate": 1.0474382719057633e-05, + "loss": 0.9757, + "step": 4781 + }, + { + "epoch": 0.5004709576138148, + "grad_norm": 1.8763970416206928, + "learning_rate": 1.0470996787942466e-05, + "loss": 0.9491, + "step": 4782 + }, + { + "epoch": 0.5005756148613292, + "grad_norm": 2.3823417687483732, + "learning_rate": 1.0467610802708814e-05, + "loss": 1.045, + "step": 4783 + }, + { + "epoch": 0.5006802721088436, + "grad_norm": 2.0877142215744087, + "learning_rate": 1.0464224763745737e-05, + "loss": 0.8905, + "step": 4784 + }, + { + "epoch": 0.5007849293563579, + "grad_norm": 1.8525479505640599, + "learning_rate": 1.0460838671442286e-05, + "loss": 0.9826, + "step": 4785 + }, + { + "epoch": 0.5008895866038723, + "grad_norm": 2.029476451987434, + "learning_rate": 1.0457452526187541e-05, + "loss": 1.035, + "step": 4786 + }, + { + "epoch": 0.5009942438513867, + "grad_norm": 2.0510409778147842, + "learning_rate": 1.0454066328370573e-05, + "loss": 1.0014, + "step": 4787 + }, + { + "epoch": 0.5010989010989011, + "grad_norm": 2.050804457353104, + "learning_rate": 1.0450680078380465e-05, + "loss": 1.0106, + "step": 4788 + }, + { + "epoch": 0.5012035583464155, + "grad_norm": 1.9228482758753471, + "learning_rate": 1.04472937766063e-05, + "loss": 1.0036, + "step": 4789 + }, + { + "epoch": 0.5013082155939299, + "grad_norm": 1.8304036460035518, + "learning_rate": 1.0443907423437177e-05, + "loss": 0.9524, + "step": 4790 + }, + { + "epoch": 0.5014128728414443, + "grad_norm": 2.2850976774419918, + "learning_rate": 1.0440521019262187e-05, + "loss": 0.8331, + "step": 4791 + }, + { + "epoch": 0.5015175300889586, + "grad_norm": 2.038030446976533, + "learning_rate": 1.0437134564470442e-05, + "loss": 0.9628, + "step": 4792 + }, + { + "epoch": 0.501622187336473, + "grad_norm": 2.113415421458547, + "learning_rate": 1.0433748059451045e-05, + "loss": 1.0095, + "step": 4793 + }, + { + "epoch": 0.5017268445839874, + "grad_norm": 2.1782622269854675, + "learning_rate": 1.0430361504593118e-05, + "loss": 0.9503, + "step": 4794 + }, + { + "epoch": 0.5018315018315018, + "grad_norm": 2.238496790294893, + "learning_rate": 1.0426974900285784e-05, + "loss": 1.0506, + "step": 4795 + }, + { + "epoch": 0.5019361590790162, + "grad_norm": 2.1891901037629746, + "learning_rate": 1.0423588246918168e-05, + "loss": 1.0063, + "step": 4796 + }, + { + "epoch": 0.5020408163265306, + "grad_norm": 2.092133045742852, + "learning_rate": 1.0420201544879405e-05, + "loss": 1.0078, + "step": 4797 + }, + { + "epoch": 0.502145473574045, + "grad_norm": 1.9013198519529835, + "learning_rate": 1.0416814794558628e-05, + "loss": 0.9786, + "step": 4798 + }, + { + "epoch": 0.5022501308215594, + "grad_norm": 2.1099077143715914, + "learning_rate": 1.0413427996344991e-05, + "loss": 0.9702, + "step": 4799 + }, + { + "epoch": 0.5023547880690737, + "grad_norm": 1.7239541182847857, + "learning_rate": 1.0410041150627636e-05, + "loss": 0.9089, + "step": 4800 + }, + { + "epoch": 0.5024594453165881, + "grad_norm": 2.3381648619636137, + "learning_rate": 1.0406654257795723e-05, + "loss": 0.7951, + "step": 4801 + }, + { + "epoch": 0.5025641025641026, + "grad_norm": 2.282729233569848, + "learning_rate": 1.040326731823841e-05, + "loss": 1.0216, + "step": 4802 + }, + { + "epoch": 0.502668759811617, + "grad_norm": 2.2977040163903264, + "learning_rate": 1.0399880332344865e-05, + "loss": 1.0084, + "step": 4803 + }, + { + "epoch": 0.5027734170591314, + "grad_norm": 2.006735467260644, + "learning_rate": 1.0396493300504263e-05, + "loss": 0.9048, + "step": 4804 + }, + { + "epoch": 0.5028780743066458, + "grad_norm": 1.8040980766553913, + "learning_rate": 1.0393106223105771e-05, + "loss": 0.9794, + "step": 4805 + }, + { + "epoch": 0.5029827315541602, + "grad_norm": 2.131327716426053, + "learning_rate": 1.0389719100538583e-05, + "loss": 0.9558, + "step": 4806 + }, + { + "epoch": 0.5030873888016745, + "grad_norm": 2.0367496240204246, + "learning_rate": 1.0386331933191874e-05, + "loss": 0.8524, + "step": 4807 + }, + { + "epoch": 0.5031920460491889, + "grad_norm": 1.927303561475201, + "learning_rate": 1.0382944721454847e-05, + "loss": 0.8965, + "step": 4808 + }, + { + "epoch": 0.5032967032967033, + "grad_norm": 2.4684209425092485, + "learning_rate": 1.0379557465716696e-05, + "loss": 0.8173, + "step": 4809 + }, + { + "epoch": 0.5034013605442177, + "grad_norm": 1.9985491017896084, + "learning_rate": 1.0376170166366619e-05, + "loss": 1.0035, + "step": 4810 + }, + { + "epoch": 0.5035060177917321, + "grad_norm": 1.8490510380262115, + "learning_rate": 1.037278282379383e-05, + "loss": 0.889, + "step": 4811 + }, + { + "epoch": 0.5036106750392465, + "grad_norm": 1.9348394119418788, + "learning_rate": 1.0369395438387535e-05, + "loss": 0.8902, + "step": 4812 + }, + { + "epoch": 0.5037153322867609, + "grad_norm": 2.08472258783819, + "learning_rate": 1.0366008010536962e-05, + "loss": 0.8893, + "step": 4813 + }, + { + "epoch": 0.5038199895342752, + "grad_norm": 1.8325956711014155, + "learning_rate": 1.036262054063132e-05, + "loss": 0.9288, + "step": 4814 + }, + { + "epoch": 0.5039246467817896, + "grad_norm": 1.9933367795637793, + "learning_rate": 1.0359233029059845e-05, + "loss": 0.886, + "step": 4815 + }, + { + "epoch": 0.504029304029304, + "grad_norm": 2.1844459941163157, + "learning_rate": 1.035584547621177e-05, + "loss": 0.8664, + "step": 4816 + }, + { + "epoch": 0.5041339612768184, + "grad_norm": 2.124796059309372, + "learning_rate": 1.0352457882476326e-05, + "loss": 0.901, + "step": 4817 + }, + { + "epoch": 0.5042386185243328, + "grad_norm": 2.0620543862987395, + "learning_rate": 1.0349070248242756e-05, + "loss": 1.0371, + "step": 4818 + }, + { + "epoch": 0.5043432757718472, + "grad_norm": 1.9219187469619123, + "learning_rate": 1.0345682573900306e-05, + "loss": 0.8801, + "step": 4819 + }, + { + "epoch": 0.5044479330193616, + "grad_norm": 1.9754947289145173, + "learning_rate": 1.0342294859838228e-05, + "loss": 0.9137, + "step": 4820 + }, + { + "epoch": 0.5045525902668759, + "grad_norm": 1.9746909929500338, + "learning_rate": 1.0338907106445781e-05, + "loss": 1.008, + "step": 4821 + }, + { + "epoch": 0.5046572475143903, + "grad_norm": 1.827473906585448, + "learning_rate": 1.0335519314112217e-05, + "loss": 0.8131, + "step": 4822 + }, + { + "epoch": 0.5047619047619047, + "grad_norm": 2.196935612027101, + "learning_rate": 1.0332131483226805e-05, + "loss": 1.0269, + "step": 4823 + }, + { + "epoch": 0.5048665620094192, + "grad_norm": 2.0147704818493, + "learning_rate": 1.0328743614178807e-05, + "loss": 0.8966, + "step": 4824 + }, + { + "epoch": 0.5049712192569336, + "grad_norm": 1.7839156040063775, + "learning_rate": 1.0325355707357507e-05, + "loss": 0.9519, + "step": 4825 + }, + { + "epoch": 0.505075876504448, + "grad_norm": 2.299507673691946, + "learning_rate": 1.0321967763152176e-05, + "loss": 0.959, + "step": 4826 + }, + { + "epoch": 0.5051805337519624, + "grad_norm": 2.2267555529088825, + "learning_rate": 1.0318579781952095e-05, + "loss": 0.8851, + "step": 4827 + }, + { + "epoch": 0.5052851909994767, + "grad_norm": 2.102441977176161, + "learning_rate": 1.0315191764146551e-05, + "loss": 0.9512, + "step": 4828 + }, + { + "epoch": 0.5053898482469911, + "grad_norm": 1.9711747773484447, + "learning_rate": 1.0311803710124832e-05, + "loss": 1.0464, + "step": 4829 + }, + { + "epoch": 0.5054945054945055, + "grad_norm": 2.1197570532175285, + "learning_rate": 1.030841562027624e-05, + "loss": 0.914, + "step": 4830 + }, + { + "epoch": 0.5055991627420199, + "grad_norm": 2.2152984722921327, + "learning_rate": 1.0305027494990065e-05, + "loss": 0.9595, + "step": 4831 + }, + { + "epoch": 0.5057038199895343, + "grad_norm": 2.006104338040643, + "learning_rate": 1.0301639334655612e-05, + "loss": 0.9543, + "step": 4832 + }, + { + "epoch": 0.5058084772370487, + "grad_norm": 2.233956630960079, + "learning_rate": 1.029825113966219e-05, + "loss": 1.0661, + "step": 4833 + }, + { + "epoch": 0.5059131344845631, + "grad_norm": 1.8056443704209098, + "learning_rate": 1.0294862910399106e-05, + "loss": 0.778, + "step": 4834 + }, + { + "epoch": 0.5060177917320774, + "grad_norm": 1.8210897127610195, + "learning_rate": 1.0291474647255677e-05, + "loss": 1.0009, + "step": 4835 + }, + { + "epoch": 0.5061224489795918, + "grad_norm": 2.265489705370913, + "learning_rate": 1.0288086350621219e-05, + "loss": 0.8936, + "step": 4836 + }, + { + "epoch": 0.5062271062271062, + "grad_norm": 2.2329819145492724, + "learning_rate": 1.0284698020885054e-05, + "loss": 0.8475, + "step": 4837 + }, + { + "epoch": 0.5063317634746206, + "grad_norm": 1.9140187907534734, + "learning_rate": 1.0281309658436514e-05, + "loss": 0.9804, + "step": 4838 + }, + { + "epoch": 0.506436420722135, + "grad_norm": 2.1495837609117077, + "learning_rate": 1.0277921263664918e-05, + "loss": 0.9649, + "step": 4839 + }, + { + "epoch": 0.5065410779696494, + "grad_norm": 2.147434202179756, + "learning_rate": 1.027453283695961e-05, + "loss": 0.9257, + "step": 4840 + }, + { + "epoch": 0.5066457352171638, + "grad_norm": 2.1508850173428504, + "learning_rate": 1.0271144378709919e-05, + "loss": 1.0815, + "step": 4841 + }, + { + "epoch": 0.5067503924646782, + "grad_norm": 1.890843092683489, + "learning_rate": 1.026775588930519e-05, + "loss": 0.9078, + "step": 4842 + }, + { + "epoch": 0.5068550497121925, + "grad_norm": 1.718743135555168, + "learning_rate": 1.0264367369134767e-05, + "loss": 0.8615, + "step": 4843 + }, + { + "epoch": 0.5069597069597069, + "grad_norm": 2.1338027250972718, + "learning_rate": 1.0260978818588e-05, + "loss": 0.9485, + "step": 4844 + }, + { + "epoch": 0.5070643642072213, + "grad_norm": 2.3030713079734197, + "learning_rate": 1.0257590238054238e-05, + "loss": 1.0152, + "step": 4845 + }, + { + "epoch": 0.5071690214547357, + "grad_norm": 2.108066963927156, + "learning_rate": 1.0254201627922829e-05, + "loss": 0.8987, + "step": 4846 + }, + { + "epoch": 0.5072736787022502, + "grad_norm": 1.9348248604669798, + "learning_rate": 1.0250812988583144e-05, + "loss": 0.8945, + "step": 4847 + }, + { + "epoch": 0.5073783359497646, + "grad_norm": 2.0214829385791426, + "learning_rate": 1.0247424320424534e-05, + "loss": 0.9324, + "step": 4848 + }, + { + "epoch": 0.507482993197279, + "grad_norm": 2.103062514985118, + "learning_rate": 1.024403562383637e-05, + "loss": 0.9869, + "step": 4849 + }, + { + "epoch": 0.5075876504447933, + "grad_norm": 2.0388689399057633, + "learning_rate": 1.0240646899208018e-05, + "loss": 1.0085, + "step": 4850 + }, + { + "epoch": 0.5076923076923077, + "grad_norm": 2.2019460294281243, + "learning_rate": 1.0237258146928849e-05, + "loss": 0.9457, + "step": 4851 + }, + { + "epoch": 0.5077969649398221, + "grad_norm": 1.8404221416654807, + "learning_rate": 1.023386936738824e-05, + "loss": 0.9133, + "step": 4852 + }, + { + "epoch": 0.5079016221873365, + "grad_norm": 1.9361603963782874, + "learning_rate": 1.0230480560975564e-05, + "loss": 0.9412, + "step": 4853 + }, + { + "epoch": 0.5080062794348509, + "grad_norm": 2.0201875001702807, + "learning_rate": 1.0227091728080203e-05, + "loss": 0.9903, + "step": 4854 + }, + { + "epoch": 0.5081109366823653, + "grad_norm": 2.174119911436257, + "learning_rate": 1.0223702869091548e-05, + "loss": 0.9529, + "step": 4855 + }, + { + "epoch": 0.5082155939298797, + "grad_norm": 2.1378859332077047, + "learning_rate": 1.0220313984398972e-05, + "loss": 1.0162, + "step": 4856 + }, + { + "epoch": 0.508320251177394, + "grad_norm": 1.9197826619859129, + "learning_rate": 1.0216925074391879e-05, + "loss": 0.9307, + "step": 4857 + }, + { + "epoch": 0.5084249084249084, + "grad_norm": 2.2932973290164096, + "learning_rate": 1.0213536139459651e-05, + "loss": 1.0056, + "step": 4858 + }, + { + "epoch": 0.5085295656724228, + "grad_norm": 2.2317819242913695, + "learning_rate": 1.021014717999169e-05, + "loss": 0.9703, + "step": 4859 + }, + { + "epoch": 0.5086342229199372, + "grad_norm": 2.1281106553255675, + "learning_rate": 1.0206758196377391e-05, + "loss": 1.0403, + "step": 4860 + }, + { + "epoch": 0.5087388801674516, + "grad_norm": 2.171215516994493, + "learning_rate": 1.0203369189006157e-05, + "loss": 0.8991, + "step": 4861 + }, + { + "epoch": 0.508843537414966, + "grad_norm": 2.255784354708711, + "learning_rate": 1.019998015826739e-05, + "loss": 0.8639, + "step": 4862 + }, + { + "epoch": 0.5089481946624804, + "grad_norm": 2.2311385876757166, + "learning_rate": 1.0196591104550497e-05, + "loss": 0.9527, + "step": 4863 + }, + { + "epoch": 0.5090528519099947, + "grad_norm": 2.1216367946605614, + "learning_rate": 1.0193202028244891e-05, + "loss": 0.7992, + "step": 4864 + }, + { + "epoch": 0.5091575091575091, + "grad_norm": 2.2266924800513928, + "learning_rate": 1.0189812929739976e-05, + "loss": 0.9312, + "step": 4865 + }, + { + "epoch": 0.5092621664050235, + "grad_norm": 2.184861742273941, + "learning_rate": 1.0186423809425175e-05, + "loss": 0.9421, + "step": 4866 + }, + { + "epoch": 0.5093668236525379, + "grad_norm": 2.2359774420575045, + "learning_rate": 1.0183034667689898e-05, + "loss": 0.8726, + "step": 4867 + }, + { + "epoch": 0.5094714809000523, + "grad_norm": 2.251077097785011, + "learning_rate": 1.0179645504923565e-05, + "loss": 0.9215, + "step": 4868 + }, + { + "epoch": 0.5095761381475667, + "grad_norm": 1.962671210761371, + "learning_rate": 1.0176256321515601e-05, + "loss": 0.9715, + "step": 4869 + }, + { + "epoch": 0.5096807953950812, + "grad_norm": 2.1786034572062687, + "learning_rate": 1.0172867117855428e-05, + "loss": 0.9004, + "step": 4870 + }, + { + "epoch": 0.5097854526425954, + "grad_norm": 1.984923185389256, + "learning_rate": 1.0169477894332473e-05, + "loss": 1.0999, + "step": 4871 + }, + { + "epoch": 0.5098901098901099, + "grad_norm": 2.3249280492593107, + "learning_rate": 1.016608865133616e-05, + "loss": 1.0306, + "step": 4872 + }, + { + "epoch": 0.5099947671376243, + "grad_norm": 2.631735537137025, + "learning_rate": 1.0162699389255933e-05, + "loss": 0.7941, + "step": 4873 + }, + { + "epoch": 0.5100994243851387, + "grad_norm": 2.074101642604738, + "learning_rate": 1.0159310108481212e-05, + "loss": 0.8317, + "step": 4874 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 2.0707425216929094, + "learning_rate": 1.0155920809401437e-05, + "loss": 0.9869, + "step": 4875 + }, + { + "epoch": 0.5103087388801675, + "grad_norm": 1.9855437335816901, + "learning_rate": 1.015253149240604e-05, + "loss": 0.8376, + "step": 4876 + }, + { + "epoch": 0.5104133961276819, + "grad_norm": 2.125627681296742, + "learning_rate": 1.014914215788447e-05, + "loss": 0.8265, + "step": 4877 + }, + { + "epoch": 0.5105180533751962, + "grad_norm": 2.0097025281038183, + "learning_rate": 1.0145752806226165e-05, + "loss": 1.0651, + "step": 4878 + }, + { + "epoch": 0.5106227106227106, + "grad_norm": 1.9381824884213676, + "learning_rate": 1.0142363437820566e-05, + "loss": 0.9377, + "step": 4879 + }, + { + "epoch": 0.510727367870225, + "grad_norm": 1.8057048082927534, + "learning_rate": 1.0138974053057118e-05, + "loss": 1.0258, + "step": 4880 + }, + { + "epoch": 0.5108320251177394, + "grad_norm": 1.9284372353034598, + "learning_rate": 1.013558465232527e-05, + "loss": 0.9767, + "step": 4881 + }, + { + "epoch": 0.5109366823652538, + "grad_norm": 2.0912145996565807, + "learning_rate": 1.0132195236014471e-05, + "loss": 0.9621, + "step": 4882 + }, + { + "epoch": 0.5110413396127682, + "grad_norm": 2.4074410237986203, + "learning_rate": 1.0128805804514173e-05, + "loss": 0.9953, + "step": 4883 + }, + { + "epoch": 0.5111459968602826, + "grad_norm": 2.1350122261578948, + "learning_rate": 1.0125416358213825e-05, + "loss": 0.9114, + "step": 4884 + }, + { + "epoch": 0.511250654107797, + "grad_norm": 2.1766627738568824, + "learning_rate": 1.0122026897502882e-05, + "loss": 0.9303, + "step": 4885 + }, + { + "epoch": 0.5113553113553113, + "grad_norm": 1.659780196796726, + "learning_rate": 1.0118637422770803e-05, + "loss": 0.8643, + "step": 4886 + }, + { + "epoch": 0.5114599686028257, + "grad_norm": 1.8759283670973985, + "learning_rate": 1.0115247934407045e-05, + "loss": 0.9288, + "step": 4887 + }, + { + "epoch": 0.5115646258503401, + "grad_norm": 1.981277049724218, + "learning_rate": 1.0111858432801063e-05, + "loss": 0.807, + "step": 4888 + }, + { + "epoch": 0.5116692830978545, + "grad_norm": 2.104064781470603, + "learning_rate": 1.0108468918342317e-05, + "loss": 0.9398, + "step": 4889 + }, + { + "epoch": 0.5117739403453689, + "grad_norm": 1.862863320953064, + "learning_rate": 1.0105079391420281e-05, + "loss": 0.8971, + "step": 4890 + }, + { + "epoch": 0.5118785975928833, + "grad_norm": 2.0632811238069872, + "learning_rate": 1.0101689852424404e-05, + "loss": 0.9229, + "step": 4891 + }, + { + "epoch": 0.5119832548403978, + "grad_norm": 2.0328293416753787, + "learning_rate": 1.0098300301744159e-05, + "loss": 0.8541, + "step": 4892 + }, + { + "epoch": 0.512087912087912, + "grad_norm": 2.2183959790280094, + "learning_rate": 1.0094910739769007e-05, + "loss": 0.9588, + "step": 4893 + }, + { + "epoch": 0.5121925693354265, + "grad_norm": 2.0086969358639597, + "learning_rate": 1.0091521166888422e-05, + "loss": 0.9431, + "step": 4894 + }, + { + "epoch": 0.5122972265829409, + "grad_norm": 1.8566480087720016, + "learning_rate": 1.0088131583491869e-05, + "loss": 0.9237, + "step": 4895 + }, + { + "epoch": 0.5124018838304553, + "grad_norm": 1.978008384805466, + "learning_rate": 1.0084741989968818e-05, + "loss": 0.9383, + "step": 4896 + }, + { + "epoch": 0.5125065410779697, + "grad_norm": 2.2899059061417146, + "learning_rate": 1.008135238670874e-05, + "loss": 1.0155, + "step": 4897 + }, + { + "epoch": 0.5126111983254841, + "grad_norm": 2.1435767412931215, + "learning_rate": 1.0077962774101108e-05, + "loss": 0.7721, + "step": 4898 + }, + { + "epoch": 0.5127158555729985, + "grad_norm": 1.9651414340794402, + "learning_rate": 1.0074573152535397e-05, + "loss": 0.8895, + "step": 4899 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 1.9700996694269328, + "learning_rate": 1.0071183522401078e-05, + "loss": 0.8386, + "step": 4900 + }, + { + "epoch": 0.5129251700680272, + "grad_norm": 2.043250711393673, + "learning_rate": 1.0067793884087626e-05, + "loss": 1.0259, + "step": 4901 + }, + { + "epoch": 0.5130298273155416, + "grad_norm": 2.2215963106479064, + "learning_rate": 1.0064404237984522e-05, + "loss": 1.0239, + "step": 4902 + }, + { + "epoch": 0.513134484563056, + "grad_norm": 2.043312538764727, + "learning_rate": 1.0061014584481242e-05, + "loss": 0.9675, + "step": 4903 + }, + { + "epoch": 0.5132391418105704, + "grad_norm": 2.017720307338605, + "learning_rate": 1.0057624923967258e-05, + "loss": 1.0402, + "step": 4904 + }, + { + "epoch": 0.5133437990580848, + "grad_norm": 1.9522219810870778, + "learning_rate": 1.0054235256832057e-05, + "loss": 0.9143, + "step": 4905 + }, + { + "epoch": 0.5134484563055992, + "grad_norm": 2.3060614281471423, + "learning_rate": 1.0050845583465112e-05, + "loss": 0.8615, + "step": 4906 + }, + { + "epoch": 0.5135531135531135, + "grad_norm": 2.070929318747068, + "learning_rate": 1.004745590425591e-05, + "loss": 0.8553, + "step": 4907 + }, + { + "epoch": 0.5136577708006279, + "grad_norm": 2.198912283882256, + "learning_rate": 1.0044066219593925e-05, + "loss": 0.9677, + "step": 4908 + }, + { + "epoch": 0.5137624280481423, + "grad_norm": 2.2590338340709204, + "learning_rate": 1.0040676529868644e-05, + "loss": 1.0453, + "step": 4909 + }, + { + "epoch": 0.5138670852956567, + "grad_norm": 2.3124159355511766, + "learning_rate": 1.0037286835469546e-05, + "loss": 1.1255, + "step": 4910 + }, + { + "epoch": 0.5139717425431711, + "grad_norm": 1.8769919169356388, + "learning_rate": 1.0033897136786115e-05, + "loss": 0.8565, + "step": 4911 + }, + { + "epoch": 0.5140763997906855, + "grad_norm": 2.032937691495361, + "learning_rate": 1.0030507434207836e-05, + "loss": 0.9021, + "step": 4912 + }, + { + "epoch": 0.5141810570381999, + "grad_norm": 1.948805286567476, + "learning_rate": 1.002711772812419e-05, + "loss": 0.9255, + "step": 4913 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 1.8811051091573516, + "learning_rate": 1.0023728018924663e-05, + "loss": 0.9634, + "step": 4914 + }, + { + "epoch": 0.5143903715332286, + "grad_norm": 1.895037622840537, + "learning_rate": 1.0020338306998739e-05, + "loss": 0.7764, + "step": 4915 + }, + { + "epoch": 0.514495028780743, + "grad_norm": 2.3622734576508417, + "learning_rate": 1.0016948592735898e-05, + "loss": 0.9616, + "step": 4916 + }, + { + "epoch": 0.5145996860282575, + "grad_norm": 2.1068542919039004, + "learning_rate": 1.0013558876525635e-05, + "loss": 0.9764, + "step": 4917 + }, + { + "epoch": 0.5147043432757719, + "grad_norm": 2.0518452033894428, + "learning_rate": 1.0010169158757425e-05, + "loss": 0.9017, + "step": 4918 + }, + { + "epoch": 0.5148090005232863, + "grad_norm": 2.268883115799465, + "learning_rate": 1.000677943982076e-05, + "loss": 0.8253, + "step": 4919 + }, + { + "epoch": 0.5149136577708007, + "grad_norm": 1.9101165871841854, + "learning_rate": 1.000338972010512e-05, + "loss": 1.0396, + "step": 4920 + }, + { + "epoch": 0.515018315018315, + "grad_norm": 1.9018696065166185, + "learning_rate": 1e-05, + "loss": 0.8675, + "step": 4921 + }, + { + "epoch": 0.5151229722658294, + "grad_norm": 1.9135895846787352, + "learning_rate": 9.99661027989488e-06, + "loss": 0.953, + "step": 4922 + }, + { + "epoch": 0.5152276295133438, + "grad_norm": 2.2944292672819744, + "learning_rate": 9.993220560179244e-06, + "loss": 0.9654, + "step": 4923 + }, + { + "epoch": 0.5153322867608582, + "grad_norm": 2.3373997913546187, + "learning_rate": 9.98983084124258e-06, + "loss": 1.0137, + "step": 4924 + }, + { + "epoch": 0.5154369440083726, + "grad_norm": 2.1760314698321697, + "learning_rate": 9.986441123474371e-06, + "loss": 1.0149, + "step": 4925 + }, + { + "epoch": 0.515541601255887, + "grad_norm": 2.0995522000742635, + "learning_rate": 9.983051407264102e-06, + "loss": 0.994, + "step": 4926 + }, + { + "epoch": 0.5156462585034014, + "grad_norm": 2.135301385827847, + "learning_rate": 9.979661693001267e-06, + "loss": 0.9958, + "step": 4927 + }, + { + "epoch": 0.5157509157509158, + "grad_norm": 2.016177869195349, + "learning_rate": 9.976271981075339e-06, + "loss": 1.0032, + "step": 4928 + }, + { + "epoch": 0.5158555729984301, + "grad_norm": 2.198242514789201, + "learning_rate": 9.972882271875814e-06, + "loss": 0.9751, + "step": 4929 + }, + { + "epoch": 0.5159602302459445, + "grad_norm": 2.568304071771772, + "learning_rate": 9.969492565792166e-06, + "loss": 0.8212, + "step": 4930 + }, + { + "epoch": 0.5160648874934589, + "grad_norm": 2.150285720935691, + "learning_rate": 9.966102863213889e-06, + "loss": 0.8911, + "step": 4931 + }, + { + "epoch": 0.5161695447409733, + "grad_norm": 1.8427410847871126, + "learning_rate": 9.962713164530457e-06, + "loss": 0.9333, + "step": 4932 + }, + { + "epoch": 0.5162742019884877, + "grad_norm": 1.9777153731928134, + "learning_rate": 9.95932347013136e-06, + "loss": 0.9572, + "step": 4933 + }, + { + "epoch": 0.5163788592360021, + "grad_norm": 2.023147402384463, + "learning_rate": 9.955933780406079e-06, + "loss": 0.9403, + "step": 4934 + }, + { + "epoch": 0.5164835164835165, + "grad_norm": 2.339289650649997, + "learning_rate": 9.952544095744092e-06, + "loss": 1.012, + "step": 4935 + }, + { + "epoch": 0.5165881737310308, + "grad_norm": 2.0703824315481447, + "learning_rate": 9.94915441653489e-06, + "loss": 0.8705, + "step": 4936 + }, + { + "epoch": 0.5166928309785452, + "grad_norm": 2.1054511171333252, + "learning_rate": 9.945764743167947e-06, + "loss": 0.8875, + "step": 4937 + }, + { + "epoch": 0.5167974882260596, + "grad_norm": 2.0257304303475108, + "learning_rate": 9.942375076032745e-06, + "loss": 0.792, + "step": 4938 + }, + { + "epoch": 0.516902145473574, + "grad_norm": 2.1389835239632298, + "learning_rate": 9.938985415518763e-06, + "loss": 0.8896, + "step": 4939 + }, + { + "epoch": 0.5170068027210885, + "grad_norm": 2.2239270796318156, + "learning_rate": 9.935595762015481e-06, + "loss": 0.9462, + "step": 4940 + }, + { + "epoch": 0.5171114599686029, + "grad_norm": 1.922334466980784, + "learning_rate": 9.932206115912379e-06, + "loss": 0.9672, + "step": 4941 + }, + { + "epoch": 0.5172161172161173, + "grad_norm": 2.0497231083957583, + "learning_rate": 9.928816477598927e-06, + "loss": 0.9285, + "step": 4942 + }, + { + "epoch": 0.5173207744636316, + "grad_norm": 2.036361821419443, + "learning_rate": 9.925426847464605e-06, + "loss": 0.9388, + "step": 4943 + }, + { + "epoch": 0.517425431711146, + "grad_norm": 2.0485533284742767, + "learning_rate": 9.922037225898893e-06, + "loss": 1.0012, + "step": 4944 + }, + { + "epoch": 0.5175300889586604, + "grad_norm": 1.781792397254675, + "learning_rate": 9.918647613291262e-06, + "loss": 0.9396, + "step": 4945 + }, + { + "epoch": 0.5176347462061748, + "grad_norm": 2.6325182895039045, + "learning_rate": 9.915258010031184e-06, + "loss": 1.0054, + "step": 4946 + }, + { + "epoch": 0.5177394034536892, + "grad_norm": 2.24395320097852, + "learning_rate": 9.911868416508133e-06, + "loss": 0.8993, + "step": 4947 + }, + { + "epoch": 0.5178440607012036, + "grad_norm": 1.950860927917329, + "learning_rate": 9.908478833111581e-06, + "loss": 0.7653, + "step": 4948 + }, + { + "epoch": 0.517948717948718, + "grad_norm": 2.3503369439679136, + "learning_rate": 9.905089260230994e-06, + "loss": 0.8748, + "step": 4949 + }, + { + "epoch": 0.5180533751962323, + "grad_norm": 1.8774277642314408, + "learning_rate": 9.901699698255846e-06, + "loss": 0.989, + "step": 4950 + }, + { + "epoch": 0.5181580324437467, + "grad_norm": 1.8303975022931225, + "learning_rate": 9.8983101475756e-06, + "loss": 1.0733, + "step": 4951 + }, + { + "epoch": 0.5182626896912611, + "grad_norm": 2.317212810335493, + "learning_rate": 9.89492060857972e-06, + "loss": 1.1076, + "step": 4952 + }, + { + "epoch": 0.5183673469387755, + "grad_norm": 2.1808943237436993, + "learning_rate": 9.891531081657681e-06, + "loss": 0.9556, + "step": 4953 + }, + { + "epoch": 0.5184720041862899, + "grad_norm": 2.029241896808696, + "learning_rate": 9.888141567198938e-06, + "loss": 1.0964, + "step": 4954 + }, + { + "epoch": 0.5185766614338043, + "grad_norm": 1.9949608216514931, + "learning_rate": 9.88475206559296e-06, + "loss": 0.9498, + "step": 4955 + }, + { + "epoch": 0.5186813186813187, + "grad_norm": 2.2830478335421174, + "learning_rate": 9.881362577229199e-06, + "loss": 1.0157, + "step": 4956 + }, + { + "epoch": 0.518785975928833, + "grad_norm": 1.9690660007947534, + "learning_rate": 9.87797310249712e-06, + "loss": 1.0411, + "step": 4957 + }, + { + "epoch": 0.5188906331763474, + "grad_norm": 1.977273523937895, + "learning_rate": 9.874583641786178e-06, + "loss": 0.8597, + "step": 4958 + }, + { + "epoch": 0.5189952904238618, + "grad_norm": 1.9243362489372648, + "learning_rate": 9.871194195485833e-06, + "loss": 0.9745, + "step": 4959 + }, + { + "epoch": 0.5190999476713762, + "grad_norm": 2.1432993449566737, + "learning_rate": 9.867804763985534e-06, + "loss": 1.0604, + "step": 4960 + }, + { + "epoch": 0.5192046049188906, + "grad_norm": 2.0876636574915595, + "learning_rate": 9.864415347674732e-06, + "loss": 0.9654, + "step": 4961 + }, + { + "epoch": 0.519309262166405, + "grad_norm": 1.9248359573202058, + "learning_rate": 9.861025946942883e-06, + "loss": 0.8849, + "step": 4962 + }, + { + "epoch": 0.5194139194139195, + "grad_norm": 2.151905236783193, + "learning_rate": 9.857636562179439e-06, + "loss": 0.917, + "step": 4963 + }, + { + "epoch": 0.5195185766614338, + "grad_norm": 2.40363372620985, + "learning_rate": 9.854247193773837e-06, + "loss": 0.8764, + "step": 4964 + }, + { + "epoch": 0.5196232339089482, + "grad_norm": 2.2427852684647425, + "learning_rate": 9.850857842115533e-06, + "loss": 1.0175, + "step": 4965 + }, + { + "epoch": 0.5197278911564626, + "grad_norm": 1.9138156790964573, + "learning_rate": 9.847468507593961e-06, + "loss": 0.8898, + "step": 4966 + }, + { + "epoch": 0.519832548403977, + "grad_norm": 2.0160695177069603, + "learning_rate": 9.844079190598568e-06, + "loss": 0.9839, + "step": 4967 + }, + { + "epoch": 0.5199372056514914, + "grad_norm": 1.8516868195463738, + "learning_rate": 9.840689891518793e-06, + "loss": 1.0036, + "step": 4968 + }, + { + "epoch": 0.5200418628990058, + "grad_norm": 2.1447596627697254, + "learning_rate": 9.837300610744069e-06, + "loss": 0.8792, + "step": 4969 + }, + { + "epoch": 0.5201465201465202, + "grad_norm": 2.4054927998015447, + "learning_rate": 9.833911348663838e-06, + "loss": 1.0158, + "step": 4970 + }, + { + "epoch": 0.5202511773940346, + "grad_norm": 2.228363478525115, + "learning_rate": 9.83052210566753e-06, + "loss": 0.9684, + "step": 4971 + }, + { + "epoch": 0.5203558346415489, + "grad_norm": 1.8149258250247282, + "learning_rate": 9.827132882144576e-06, + "loss": 0.9464, + "step": 4972 + }, + { + "epoch": 0.5204604918890633, + "grad_norm": 2.380722994225285, + "learning_rate": 9.8237436784844e-06, + "loss": 1.027, + "step": 4973 + }, + { + "epoch": 0.5205651491365777, + "grad_norm": 1.7873907854560183, + "learning_rate": 9.820354495076439e-06, + "loss": 0.769, + "step": 4974 + }, + { + "epoch": 0.5206698063840921, + "grad_norm": 2.2802115646024133, + "learning_rate": 9.816965332310106e-06, + "loss": 0.9555, + "step": 4975 + }, + { + "epoch": 0.5207744636316065, + "grad_norm": 2.038426429660569, + "learning_rate": 9.81357619057483e-06, + "loss": 0.9761, + "step": 4976 + }, + { + "epoch": 0.5208791208791209, + "grad_norm": 2.264959189290348, + "learning_rate": 9.810187070260029e-06, + "loss": 0.8613, + "step": 4977 + }, + { + "epoch": 0.5209837781266353, + "grad_norm": 2.3352359731890537, + "learning_rate": 9.80679797175511e-06, + "loss": 0.9207, + "step": 4978 + }, + { + "epoch": 0.5210884353741496, + "grad_norm": 1.813637189730288, + "learning_rate": 9.803408895449502e-06, + "loss": 0.8949, + "step": 4979 + }, + { + "epoch": 0.521193092621664, + "grad_norm": 2.0367522181120985, + "learning_rate": 9.800019841732613e-06, + "loss": 0.9757, + "step": 4980 + }, + { + "epoch": 0.5212977498691784, + "grad_norm": 2.098582096085878, + "learning_rate": 9.796630810993844e-06, + "loss": 1.0026, + "step": 4981 + }, + { + "epoch": 0.5214024071166928, + "grad_norm": 1.9944200217385741, + "learning_rate": 9.793241803622612e-06, + "loss": 0.8153, + "step": 4982 + }, + { + "epoch": 0.5215070643642072, + "grad_norm": 2.0471451034556147, + "learning_rate": 9.789852820008312e-06, + "loss": 0.782, + "step": 4983 + }, + { + "epoch": 0.5216117216117216, + "grad_norm": 2.199390774507097, + "learning_rate": 9.786463860540352e-06, + "loss": 0.8614, + "step": 4984 + }, + { + "epoch": 0.521716378859236, + "grad_norm": 2.071453495859607, + "learning_rate": 9.783074925608126e-06, + "loss": 0.8793, + "step": 4985 + }, + { + "epoch": 0.5218210361067503, + "grad_norm": 2.3490583249057893, + "learning_rate": 9.779686015601031e-06, + "loss": 1.0274, + "step": 4986 + }, + { + "epoch": 0.5219256933542648, + "grad_norm": 1.959370734056799, + "learning_rate": 9.776297130908456e-06, + "loss": 0.9316, + "step": 4987 + }, + { + "epoch": 0.5220303506017792, + "grad_norm": 2.2160442511485825, + "learning_rate": 9.772908271919797e-06, + "loss": 1.0251, + "step": 4988 + }, + { + "epoch": 0.5221350078492936, + "grad_norm": 1.968606658582411, + "learning_rate": 9.76951943902444e-06, + "loss": 0.944, + "step": 4989 + }, + { + "epoch": 0.522239665096808, + "grad_norm": 2.4107803674809736, + "learning_rate": 9.766130632611762e-06, + "loss": 0.9429, + "step": 4990 + }, + { + "epoch": 0.5223443223443224, + "grad_norm": 2.0652044268509693, + "learning_rate": 9.762741853071153e-06, + "loss": 0.8697, + "step": 4991 + }, + { + "epoch": 0.5224489795918368, + "grad_norm": 2.062990165066534, + "learning_rate": 9.759353100791986e-06, + "loss": 1.0389, + "step": 4992 + }, + { + "epoch": 0.5225536368393511, + "grad_norm": 1.9364323150880431, + "learning_rate": 9.755964376163633e-06, + "loss": 0.9711, + "step": 4993 + }, + { + "epoch": 0.5226582940868655, + "grad_norm": 2.4876581630290273, + "learning_rate": 9.752575679575469e-06, + "loss": 0.8624, + "step": 4994 + }, + { + "epoch": 0.5227629513343799, + "grad_norm": 1.8708610631837064, + "learning_rate": 9.749187011416858e-06, + "loss": 0.8952, + "step": 4995 + }, + { + "epoch": 0.5228676085818943, + "grad_norm": 2.0610166226921467, + "learning_rate": 9.745798372077173e-06, + "loss": 1.0246, + "step": 4996 + }, + { + "epoch": 0.5229722658294087, + "grad_norm": 2.091437740906605, + "learning_rate": 9.742409761945765e-06, + "loss": 0.9257, + "step": 4997 + }, + { + "epoch": 0.5230769230769231, + "grad_norm": 3.1322596751647236, + "learning_rate": 9.739021181412003e-06, + "loss": 0.9185, + "step": 4998 + }, + { + "epoch": 0.5231815803244375, + "grad_norm": 2.983387965543901, + "learning_rate": 9.735632630865234e-06, + "loss": 0.8107, + "step": 4999 + }, + { + "epoch": 0.5232862375719518, + "grad_norm": 2.3964467020952425, + "learning_rate": 9.732244110694813e-06, + "loss": 0.9936, + "step": 5000 + }, + { + "epoch": 0.5233908948194662, + "grad_norm": 2.1655991778042893, + "learning_rate": 9.728855621290086e-06, + "loss": 0.881, + "step": 5001 + }, + { + "epoch": 0.5234955520669806, + "grad_norm": 2.323301195527726, + "learning_rate": 9.725467163040395e-06, + "loss": 1.0034, + "step": 5002 + }, + { + "epoch": 0.523600209314495, + "grad_norm": 2.1683566774088194, + "learning_rate": 9.722078736335087e-06, + "loss": 0.9435, + "step": 5003 + }, + { + "epoch": 0.5237048665620094, + "grad_norm": 1.9021798201348261, + "learning_rate": 9.71869034156349e-06, + "loss": 0.9163, + "step": 5004 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 2.1575573537521278, + "learning_rate": 9.715301979114946e-06, + "loss": 0.9994, + "step": 5005 + }, + { + "epoch": 0.5239141810570382, + "grad_norm": 2.1460735218265174, + "learning_rate": 9.711913649378785e-06, + "loss": 0.9674, + "step": 5006 + }, + { + "epoch": 0.5240188383045525, + "grad_norm": 2.0470323099465486, + "learning_rate": 9.708525352744325e-06, + "loss": 0.9025, + "step": 5007 + }, + { + "epoch": 0.5241234955520669, + "grad_norm": 2.0043888800821907, + "learning_rate": 9.705137089600898e-06, + "loss": 0.943, + "step": 5008 + }, + { + "epoch": 0.5242281527995813, + "grad_norm": 2.011463626160743, + "learning_rate": 9.701748860337812e-06, + "loss": 0.935, + "step": 5009 + }, + { + "epoch": 0.5243328100470958, + "grad_norm": 2.050505275508048, + "learning_rate": 9.698360665344391e-06, + "loss": 0.9326, + "step": 5010 + }, + { + "epoch": 0.5244374672946102, + "grad_norm": 2.144129516792977, + "learning_rate": 9.694972505009938e-06, + "loss": 1.0178, + "step": 5011 + }, + { + "epoch": 0.5245421245421246, + "grad_norm": 1.9003656125321036, + "learning_rate": 9.691584379723766e-06, + "loss": 0.9608, + "step": 5012 + }, + { + "epoch": 0.524646781789639, + "grad_norm": 2.2612768774796654, + "learning_rate": 9.688196289875168e-06, + "loss": 0.9817, + "step": 5013 + }, + { + "epoch": 0.5247514390371534, + "grad_norm": 2.290640926354991, + "learning_rate": 9.68480823585345e-06, + "loss": 1.1034, + "step": 5014 + }, + { + "epoch": 0.5248560962846677, + "grad_norm": 2.4607191178201475, + "learning_rate": 9.681420218047909e-06, + "loss": 0.9906, + "step": 5015 + }, + { + "epoch": 0.5249607535321821, + "grad_norm": 1.9026816954278603, + "learning_rate": 9.678032236847827e-06, + "loss": 0.8915, + "step": 5016 + }, + { + "epoch": 0.5250654107796965, + "grad_norm": 1.9421672245762756, + "learning_rate": 9.674644292642496e-06, + "loss": 0.9105, + "step": 5017 + }, + { + "epoch": 0.5251700680272109, + "grad_norm": 2.4702863347349084, + "learning_rate": 9.671256385821196e-06, + "loss": 1.0937, + "step": 5018 + }, + { + "epoch": 0.5252747252747253, + "grad_norm": 1.876747432470777, + "learning_rate": 9.6678685167732e-06, + "loss": 0.8908, + "step": 5019 + }, + { + "epoch": 0.5253793825222397, + "grad_norm": 2.9876022744753934, + "learning_rate": 9.66448068588779e-06, + "loss": 0.9807, + "step": 5020 + }, + { + "epoch": 0.5254840397697541, + "grad_norm": 1.9265704361668035, + "learning_rate": 9.661092893554222e-06, + "loss": 0.8965, + "step": 5021 + }, + { + "epoch": 0.5255886970172684, + "grad_norm": 2.092597658236264, + "learning_rate": 9.657705140161772e-06, + "loss": 0.9318, + "step": 5022 + }, + { + "epoch": 0.5256933542647828, + "grad_norm": 1.9546071290696667, + "learning_rate": 9.654317426099695e-06, + "loss": 0.9737, + "step": 5023 + }, + { + "epoch": 0.5257980115122972, + "grad_norm": 1.9365552977725047, + "learning_rate": 9.650929751757247e-06, + "loss": 1.0049, + "step": 5024 + }, + { + "epoch": 0.5259026687598116, + "grad_norm": 1.967006406063824, + "learning_rate": 9.647542117523679e-06, + "loss": 0.9506, + "step": 5025 + }, + { + "epoch": 0.526007326007326, + "grad_norm": 2.3155282840857847, + "learning_rate": 9.644154523788233e-06, + "loss": 0.7686, + "step": 5026 + }, + { + "epoch": 0.5261119832548404, + "grad_norm": 2.157265802659998, + "learning_rate": 9.640766970940157e-06, + "loss": 0.9982, + "step": 5027 + }, + { + "epoch": 0.5262166405023548, + "grad_norm": 2.1389647147739823, + "learning_rate": 9.637379459368682e-06, + "loss": 0.8971, + "step": 5028 + }, + { + "epoch": 0.5263212977498691, + "grad_norm": 2.0363015164707607, + "learning_rate": 9.633991989463043e-06, + "loss": 1.0196, + "step": 5029 + }, + { + "epoch": 0.5264259549973835, + "grad_norm": 1.9294652609584093, + "learning_rate": 9.630604561612465e-06, + "loss": 0.7722, + "step": 5030 + }, + { + "epoch": 0.5265306122448979, + "grad_norm": 2.342838953335117, + "learning_rate": 9.627217176206172e-06, + "loss": 0.9184, + "step": 5031 + }, + { + "epoch": 0.5266352694924124, + "grad_norm": 1.9756381699879217, + "learning_rate": 9.623829833633384e-06, + "loss": 1.0176, + "step": 5032 + }, + { + "epoch": 0.5267399267399268, + "grad_norm": 2.03546759419274, + "learning_rate": 9.620442534283308e-06, + "loss": 0.9261, + "step": 5033 + }, + { + "epoch": 0.5268445839874412, + "grad_norm": 2.288452566068461, + "learning_rate": 9.617055278545155e-06, + "loss": 0.9991, + "step": 5034 + }, + { + "epoch": 0.5269492412349556, + "grad_norm": 2.057195359204059, + "learning_rate": 9.61366806680813e-06, + "loss": 0.9043, + "step": 5035 + }, + { + "epoch": 0.5270538984824699, + "grad_norm": 2.0034791907235134, + "learning_rate": 9.610280899461422e-06, + "loss": 0.8265, + "step": 5036 + }, + { + "epoch": 0.5271585557299843, + "grad_norm": 2.300980115407969, + "learning_rate": 9.606893776894232e-06, + "loss": 0.972, + "step": 5037 + }, + { + "epoch": 0.5272632129774987, + "grad_norm": 2.1374768924728715, + "learning_rate": 9.603506699495742e-06, + "loss": 0.9861, + "step": 5038 + }, + { + "epoch": 0.5273678702250131, + "grad_norm": 1.901539200611645, + "learning_rate": 9.600119667655135e-06, + "loss": 0.9153, + "step": 5039 + }, + { + "epoch": 0.5274725274725275, + "grad_norm": 1.898313837762518, + "learning_rate": 9.596732681761591e-06, + "loss": 0.9927, + "step": 5040 + }, + { + "epoch": 0.5275771847200419, + "grad_norm": 2.203116494419673, + "learning_rate": 9.593345742204278e-06, + "loss": 0.9173, + "step": 5041 + }, + { + "epoch": 0.5276818419675563, + "grad_norm": 1.6960490365340934, + "learning_rate": 9.589958849372367e-06, + "loss": 0.9086, + "step": 5042 + }, + { + "epoch": 0.5277864992150706, + "grad_norm": 2.2070200124057133, + "learning_rate": 9.586572003655012e-06, + "loss": 0.8878, + "step": 5043 + }, + { + "epoch": 0.527891156462585, + "grad_norm": 1.954022666257893, + "learning_rate": 9.583185205441376e-06, + "loss": 0.9223, + "step": 5044 + }, + { + "epoch": 0.5279958137100994, + "grad_norm": 2.0745859713137866, + "learning_rate": 9.5797984551206e-06, + "loss": 1.0438, + "step": 5045 + }, + { + "epoch": 0.5281004709576138, + "grad_norm": 2.265741127424256, + "learning_rate": 9.576411753081837e-06, + "loss": 0.9013, + "step": 5046 + }, + { + "epoch": 0.5282051282051282, + "grad_norm": 1.8371450812640804, + "learning_rate": 9.573025099714218e-06, + "loss": 1.0232, + "step": 5047 + }, + { + "epoch": 0.5283097854526426, + "grad_norm": 2.033374031917156, + "learning_rate": 9.569638495406882e-06, + "loss": 0.8909, + "step": 5048 + }, + { + "epoch": 0.528414442700157, + "grad_norm": 2.1076096190375697, + "learning_rate": 9.566251940548957e-06, + "loss": 0.9968, + "step": 5049 + }, + { + "epoch": 0.5285190999476713, + "grad_norm": 1.941537663106642, + "learning_rate": 9.562865435529563e-06, + "loss": 0.884, + "step": 5050 + }, + { + "epoch": 0.5286237571951857, + "grad_norm": 1.993306824385622, + "learning_rate": 9.559478980737817e-06, + "loss": 0.9804, + "step": 5051 + }, + { + "epoch": 0.5287284144427001, + "grad_norm": 1.8310381477052629, + "learning_rate": 9.556092576562828e-06, + "loss": 0.8713, + "step": 5052 + }, + { + "epoch": 0.5288330716902145, + "grad_norm": 2.1706814859478643, + "learning_rate": 9.552706223393704e-06, + "loss": 0.953, + "step": 5053 + }, + { + "epoch": 0.528937728937729, + "grad_norm": 2.0181734603318917, + "learning_rate": 9.54931992161954e-06, + "loss": 1.0406, + "step": 5054 + }, + { + "epoch": 0.5290423861852434, + "grad_norm": 2.1729682113864537, + "learning_rate": 9.54593367162943e-06, + "loss": 0.9522, + "step": 5055 + }, + { + "epoch": 0.5291470434327578, + "grad_norm": 1.9793023318843381, + "learning_rate": 9.54254747381246e-06, + "loss": 0.8517, + "step": 5056 + }, + { + "epoch": 0.5292517006802722, + "grad_norm": 1.956291432464463, + "learning_rate": 9.539161328557716e-06, + "loss": 0.9517, + "step": 5057 + }, + { + "epoch": 0.5293563579277865, + "grad_norm": 2.033082190907283, + "learning_rate": 9.535775236254268e-06, + "loss": 0.8648, + "step": 5058 + }, + { + "epoch": 0.5294610151753009, + "grad_norm": 2.13270173634752, + "learning_rate": 9.532389197291187e-06, + "loss": 0.9357, + "step": 5059 + }, + { + "epoch": 0.5295656724228153, + "grad_norm": 1.91157758973134, + "learning_rate": 9.529003212057536e-06, + "loss": 0.9013, + "step": 5060 + }, + { + "epoch": 0.5296703296703297, + "grad_norm": 1.832312512024985, + "learning_rate": 9.52561728094237e-06, + "loss": 0.9018, + "step": 5061 + }, + { + "epoch": 0.5297749869178441, + "grad_norm": 2.316257080957267, + "learning_rate": 9.52223140433474e-06, + "loss": 0.7679, + "step": 5062 + }, + { + "epoch": 0.5298796441653585, + "grad_norm": 1.8496246062520718, + "learning_rate": 9.518845582623692e-06, + "loss": 0.9617, + "step": 5063 + }, + { + "epoch": 0.5299843014128729, + "grad_norm": 2.0807591771920197, + "learning_rate": 9.515459816198258e-06, + "loss": 0.9035, + "step": 5064 + }, + { + "epoch": 0.5300889586603872, + "grad_norm": 1.8998070623413483, + "learning_rate": 9.512074105447473e-06, + "loss": 0.9626, + "step": 5065 + }, + { + "epoch": 0.5301936159079016, + "grad_norm": 1.9812096820937861, + "learning_rate": 9.508688450760367e-06, + "loss": 0.9601, + "step": 5066 + }, + { + "epoch": 0.530298273155416, + "grad_norm": 2.147757889449252, + "learning_rate": 9.50530285252595e-06, + "loss": 1.0095, + "step": 5067 + }, + { + "epoch": 0.5304029304029304, + "grad_norm": 1.8908877406835036, + "learning_rate": 9.501917311133241e-06, + "loss": 0.9554, + "step": 5068 + }, + { + "epoch": 0.5305075876504448, + "grad_norm": 2.006086304429676, + "learning_rate": 9.498531826971242e-06, + "loss": 0.9882, + "step": 5069 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 1.7842507152981133, + "learning_rate": 9.495146400428953e-06, + "loss": 0.9366, + "step": 5070 + }, + { + "epoch": 0.5307169021454736, + "grad_norm": 2.046078498853944, + "learning_rate": 9.491761031895364e-06, + "loss": 0.876, + "step": 5071 + }, + { + "epoch": 0.5308215593929879, + "grad_norm": 1.9166303496266188, + "learning_rate": 9.488375721759467e-06, + "loss": 0.8727, + "step": 5072 + }, + { + "epoch": 0.5309262166405023, + "grad_norm": 1.8738691308012003, + "learning_rate": 9.48499047041023e-06, + "loss": 0.9591, + "step": 5073 + }, + { + "epoch": 0.5310308738880167, + "grad_norm": 2.107065364309017, + "learning_rate": 9.481605278236636e-06, + "loss": 0.8822, + "step": 5074 + }, + { + "epoch": 0.5311355311355311, + "grad_norm": 2.327347288791375, + "learning_rate": 9.478220145627645e-06, + "loss": 1.0899, + "step": 5075 + }, + { + "epoch": 0.5312401883830455, + "grad_norm": 2.2223497049637713, + "learning_rate": 9.47483507297222e-06, + "loss": 0.8937, + "step": 5076 + }, + { + "epoch": 0.53134484563056, + "grad_norm": 2.256580775897136, + "learning_rate": 9.471450060659308e-06, + "loss": 1.0627, + "step": 5077 + }, + { + "epoch": 0.5314495028780744, + "grad_norm": 2.1603161263939628, + "learning_rate": 9.468065109077858e-06, + "loss": 0.8642, + "step": 5078 + }, + { + "epoch": 0.5315541601255886, + "grad_norm": 2.021675968414649, + "learning_rate": 9.464680218616801e-06, + "loss": 0.9218, + "step": 5079 + }, + { + "epoch": 0.531658817373103, + "grad_norm": 2.027162622909609, + "learning_rate": 9.461295389665075e-06, + "loss": 0.8784, + "step": 5080 + }, + { + "epoch": 0.5317634746206175, + "grad_norm": 2.1737803034324896, + "learning_rate": 9.457910622611599e-06, + "loss": 0.9499, + "step": 5081 + }, + { + "epoch": 0.5318681318681319, + "grad_norm": 2.1361694404225573, + "learning_rate": 9.454525917845289e-06, + "loss": 0.9557, + "step": 5082 + }, + { + "epoch": 0.5319727891156463, + "grad_norm": 1.9555190602414432, + "learning_rate": 9.451141275755061e-06, + "loss": 1.0161, + "step": 5083 + }, + { + "epoch": 0.5320774463631607, + "grad_norm": 2.0343065372865823, + "learning_rate": 9.447756696729811e-06, + "loss": 0.9702, + "step": 5084 + }, + { + "epoch": 0.5321821036106751, + "grad_norm": 1.990208162802974, + "learning_rate": 9.444372181158439e-06, + "loss": 0.881, + "step": 5085 + }, + { + "epoch": 0.5322867608581894, + "grad_norm": 1.9953926441133305, + "learning_rate": 9.440987729429827e-06, + "loss": 0.8863, + "step": 5086 + }, + { + "epoch": 0.5323914181057038, + "grad_norm": 2.3741453667707257, + "learning_rate": 9.437603341932861e-06, + "loss": 0.8405, + "step": 5087 + }, + { + "epoch": 0.5324960753532182, + "grad_norm": 2.0080306933110252, + "learning_rate": 9.434219019056407e-06, + "loss": 0.8811, + "step": 5088 + }, + { + "epoch": 0.5326007326007326, + "grad_norm": 1.9052829376729115, + "learning_rate": 9.430834761189339e-06, + "loss": 0.9062, + "step": 5089 + }, + { + "epoch": 0.532705389848247, + "grad_norm": 2.232762167686221, + "learning_rate": 9.427450568720507e-06, + "loss": 1.066, + "step": 5090 + }, + { + "epoch": 0.5328100470957614, + "grad_norm": 1.7077102794519774, + "learning_rate": 9.424066442038764e-06, + "loss": 0.998, + "step": 5091 + }, + { + "epoch": 0.5329147043432758, + "grad_norm": 2.0986156305726316, + "learning_rate": 9.420682381532959e-06, + "loss": 1.0063, + "step": 5092 + }, + { + "epoch": 0.5330193615907901, + "grad_norm": 1.985756966883505, + "learning_rate": 9.417298387591923e-06, + "loss": 0.9973, + "step": 5093 + }, + { + "epoch": 0.5331240188383045, + "grad_norm": 2.3041795560347555, + "learning_rate": 9.41391446060448e-06, + "loss": 1.0417, + "step": 5094 + }, + { + "epoch": 0.5332286760858189, + "grad_norm": 2.3998949708050135, + "learning_rate": 9.410530600959455e-06, + "loss": 1.0162, + "step": 5095 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.9102226487477791, + "learning_rate": 9.407146809045658e-06, + "loss": 0.8784, + "step": 5096 + }, + { + "epoch": 0.5334379905808477, + "grad_norm": 1.9875471868758454, + "learning_rate": 9.403763085251898e-06, + "loss": 0.8156, + "step": 5097 + }, + { + "epoch": 0.5335426478283621, + "grad_norm": 1.953172303607464, + "learning_rate": 9.400379429966965e-06, + "loss": 0.8733, + "step": 5098 + }, + { + "epoch": 0.5336473050758765, + "grad_norm": 2.2056345622053537, + "learning_rate": 9.396995843579648e-06, + "loss": 0.9868, + "step": 5099 + }, + { + "epoch": 0.533751962323391, + "grad_norm": 2.053690818273544, + "learning_rate": 9.393612326478736e-06, + "loss": 0.8402, + "step": 5100 + }, + { + "epoch": 0.5338566195709052, + "grad_norm": 2.3399486682168775, + "learning_rate": 9.390228879052995e-06, + "loss": 1.043, + "step": 5101 + }, + { + "epoch": 0.5339612768184197, + "grad_norm": 2.075427595574749, + "learning_rate": 9.386845501691194e-06, + "loss": 0.9583, + "step": 5102 + }, + { + "epoch": 0.5340659340659341, + "grad_norm": 2.21323312612293, + "learning_rate": 9.383462194782086e-06, + "loss": 0.8203, + "step": 5103 + }, + { + "epoch": 0.5341705913134485, + "grad_norm": 1.7638697451900374, + "learning_rate": 9.380078958714424e-06, + "loss": 0.8476, + "step": 5104 + }, + { + "epoch": 0.5342752485609629, + "grad_norm": 2.1757947162837636, + "learning_rate": 9.376695793876943e-06, + "loss": 1.0289, + "step": 5105 + }, + { + "epoch": 0.5343799058084773, + "grad_norm": 2.0498873657140675, + "learning_rate": 9.373312700658381e-06, + "loss": 0.9722, + "step": 5106 + }, + { + "epoch": 0.5344845630559917, + "grad_norm": 2.146781521567094, + "learning_rate": 9.36992967944746e-06, + "loss": 0.9346, + "step": 5107 + }, + { + "epoch": 0.534589220303506, + "grad_norm": 2.4924758465124643, + "learning_rate": 9.366546730632892e-06, + "loss": 0.9294, + "step": 5108 + }, + { + "epoch": 0.5346938775510204, + "grad_norm": 2.1871982606239224, + "learning_rate": 9.363163854603393e-06, + "loss": 0.9268, + "step": 5109 + }, + { + "epoch": 0.5347985347985348, + "grad_norm": 2.6130844536579145, + "learning_rate": 9.359781051747658e-06, + "loss": 0.8488, + "step": 5110 + }, + { + "epoch": 0.5349031920460492, + "grad_norm": 2.3176010643788856, + "learning_rate": 9.356398322454378e-06, + "loss": 1.0246, + "step": 5111 + }, + { + "epoch": 0.5350078492935636, + "grad_norm": 1.9976965271715834, + "learning_rate": 9.353015667112236e-06, + "loss": 0.9324, + "step": 5112 + }, + { + "epoch": 0.535112506541078, + "grad_norm": 2.1264457121845073, + "learning_rate": 9.349633086109904e-06, + "loss": 0.957, + "step": 5113 + }, + { + "epoch": 0.5352171637885924, + "grad_norm": 2.1082662476439897, + "learning_rate": 9.34625057983605e-06, + "loss": 0.8603, + "step": 5114 + }, + { + "epoch": 0.5353218210361067, + "grad_norm": 2.081793729672057, + "learning_rate": 9.342868148679328e-06, + "loss": 0.9384, + "step": 5115 + }, + { + "epoch": 0.5354264782836211, + "grad_norm": 1.8602117077983622, + "learning_rate": 9.33948579302839e-06, + "loss": 0.9818, + "step": 5116 + }, + { + "epoch": 0.5355311355311355, + "grad_norm": 2.068748019729851, + "learning_rate": 9.33610351327187e-06, + "loss": 0.8313, + "step": 5117 + }, + { + "epoch": 0.5356357927786499, + "grad_norm": 2.250399416328852, + "learning_rate": 9.332721309798403e-06, + "loss": 0.9082, + "step": 5118 + }, + { + "epoch": 0.5357404500261643, + "grad_norm": 2.0089516460747068, + "learning_rate": 9.329339182996614e-06, + "loss": 1.0031, + "step": 5119 + }, + { + "epoch": 0.5358451072736787, + "grad_norm": 2.1921142529491062, + "learning_rate": 9.325957133255107e-06, + "loss": 0.9773, + "step": 5120 + }, + { + "epoch": 0.5359497645211931, + "grad_norm": 1.7765848948209197, + "learning_rate": 9.322575160962497e-06, + "loss": 0.8713, + "step": 5121 + }, + { + "epoch": 0.5360544217687074, + "grad_norm": 2.2822852338608133, + "learning_rate": 9.319193266507371e-06, + "loss": 1.0108, + "step": 5122 + }, + { + "epoch": 0.5361590790162218, + "grad_norm": 1.9080359520771442, + "learning_rate": 9.315811450278322e-06, + "loss": 1.0417, + "step": 5123 + }, + { + "epoch": 0.5362637362637362, + "grad_norm": 2.126848470721643, + "learning_rate": 9.312429712663922e-06, + "loss": 1.0819, + "step": 5124 + }, + { + "epoch": 0.5363683935112507, + "grad_norm": 1.9448417891971947, + "learning_rate": 9.309048054052738e-06, + "loss": 1.0022, + "step": 5125 + }, + { + "epoch": 0.5364730507587651, + "grad_norm": 2.0110601683746716, + "learning_rate": 9.30566647483334e-06, + "loss": 0.9406, + "step": 5126 + }, + { + "epoch": 0.5365777080062795, + "grad_norm": 2.059272200316169, + "learning_rate": 9.302284975394267e-06, + "loss": 0.9979, + "step": 5127 + }, + { + "epoch": 0.5366823652537939, + "grad_norm": 1.9516944950557442, + "learning_rate": 9.298903556124069e-06, + "loss": 0.964, + "step": 5128 + }, + { + "epoch": 0.5367870225013082, + "grad_norm": 1.9463999538178405, + "learning_rate": 9.29552221741127e-06, + "loss": 0.9758, + "step": 5129 + }, + { + "epoch": 0.5368916797488226, + "grad_norm": 1.98796486302683, + "learning_rate": 9.292140959644398e-06, + "loss": 0.9545, + "step": 5130 + }, + { + "epoch": 0.536996336996337, + "grad_norm": 1.9929189015148039, + "learning_rate": 9.288759783211967e-06, + "loss": 0.9298, + "step": 5131 + }, + { + "epoch": 0.5371009942438514, + "grad_norm": 2.1502952424043196, + "learning_rate": 9.285378688502475e-06, + "loss": 0.9023, + "step": 5132 + }, + { + "epoch": 0.5372056514913658, + "grad_norm": 1.8849112778207193, + "learning_rate": 9.281997675904423e-06, + "loss": 0.9625, + "step": 5133 + }, + { + "epoch": 0.5373103087388802, + "grad_norm": 2.2034978552750926, + "learning_rate": 9.27861674580629e-06, + "loss": 0.8996, + "step": 5134 + }, + { + "epoch": 0.5374149659863946, + "grad_norm": 2.1228025728157514, + "learning_rate": 9.275235898596555e-06, + "loss": 0.8977, + "step": 5135 + }, + { + "epoch": 0.5375196232339089, + "grad_norm": 2.155478741058007, + "learning_rate": 9.271855134663689e-06, + "loss": 1.0623, + "step": 5136 + }, + { + "epoch": 0.5376242804814233, + "grad_norm": 2.017078129664075, + "learning_rate": 9.26847445439614e-06, + "loss": 0.9586, + "step": 5137 + }, + { + "epoch": 0.5377289377289377, + "grad_norm": 2.2424961468183295, + "learning_rate": 9.265093858182364e-06, + "loss": 1.0009, + "step": 5138 + }, + { + "epoch": 0.5378335949764521, + "grad_norm": 1.9391397613594792, + "learning_rate": 9.26171334641079e-06, + "loss": 0.831, + "step": 5139 + }, + { + "epoch": 0.5379382522239665, + "grad_norm": 2.041390251270642, + "learning_rate": 9.258332919469851e-06, + "loss": 0.9798, + "step": 5140 + }, + { + "epoch": 0.5380429094714809, + "grad_norm": 2.086322474319101, + "learning_rate": 9.25495257774796e-06, + "loss": 1.0148, + "step": 5141 + }, + { + "epoch": 0.5381475667189953, + "grad_norm": 1.9006136542065337, + "learning_rate": 9.251572321633533e-06, + "loss": 0.9227, + "step": 5142 + }, + { + "epoch": 0.5382522239665097, + "grad_norm": 1.9414787885775822, + "learning_rate": 9.248192151514957e-06, + "loss": 0.8972, + "step": 5143 + }, + { + "epoch": 0.538356881214024, + "grad_norm": 1.8618340965139868, + "learning_rate": 9.24481206778063e-06, + "loss": 1.0364, + "step": 5144 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 1.7750231543939474, + "learning_rate": 9.241432070818929e-06, + "loss": 0.8416, + "step": 5145 + }, + { + "epoch": 0.5385661957090528, + "grad_norm": 1.7192970430261034, + "learning_rate": 9.23805216101822e-06, + "loss": 0.8291, + "step": 5146 + }, + { + "epoch": 0.5386708529565672, + "grad_norm": 2.267675160306062, + "learning_rate": 9.234672338766863e-06, + "loss": 0.8009, + "step": 5147 + }, + { + "epoch": 0.5387755102040817, + "grad_norm": 1.9151967169817872, + "learning_rate": 9.231292604453203e-06, + "loss": 0.983, + "step": 5148 + }, + { + "epoch": 0.5388801674515961, + "grad_norm": 1.8860340166769067, + "learning_rate": 9.227912958465584e-06, + "loss": 0.9827, + "step": 5149 + }, + { + "epoch": 0.5389848246991105, + "grad_norm": 2.377402452195343, + "learning_rate": 9.224533401192332e-06, + "loss": 0.7757, + "step": 5150 + }, + { + "epoch": 0.5390894819466248, + "grad_norm": 2.1940029519811444, + "learning_rate": 9.221153933021759e-06, + "loss": 0.9636, + "step": 5151 + }, + { + "epoch": 0.5391941391941392, + "grad_norm": 2.0338541104057266, + "learning_rate": 9.21777455434218e-06, + "loss": 1.0547, + "step": 5152 + }, + { + "epoch": 0.5392987964416536, + "grad_norm": 2.9722612954759255, + "learning_rate": 9.214395265541894e-06, + "loss": 0.9978, + "step": 5153 + }, + { + "epoch": 0.539403453689168, + "grad_norm": 2.1825836270355654, + "learning_rate": 9.211016067009182e-06, + "loss": 0.8928, + "step": 5154 + }, + { + "epoch": 0.5395081109366824, + "grad_norm": 2.0138240313959543, + "learning_rate": 9.207636959132325e-06, + "loss": 0.8572, + "step": 5155 + }, + { + "epoch": 0.5396127681841968, + "grad_norm": 2.3175892149524966, + "learning_rate": 9.204257942299585e-06, + "loss": 1.0819, + "step": 5156 + }, + { + "epoch": 0.5397174254317112, + "grad_norm": 2.235711126597796, + "learning_rate": 9.200879016899223e-06, + "loss": 0.8762, + "step": 5157 + }, + { + "epoch": 0.5398220826792255, + "grad_norm": 2.450747499531445, + "learning_rate": 9.197500183319481e-06, + "loss": 1.0596, + "step": 5158 + }, + { + "epoch": 0.5399267399267399, + "grad_norm": 2.3432608161807513, + "learning_rate": 9.194121441948596e-06, + "loss": 0.9195, + "step": 5159 + }, + { + "epoch": 0.5400313971742543, + "grad_norm": 2.2491282476173775, + "learning_rate": 9.190742793174786e-06, + "loss": 0.8444, + "step": 5160 + }, + { + "epoch": 0.5401360544217687, + "grad_norm": 2.3639647455866895, + "learning_rate": 9.187364237386273e-06, + "loss": 0.9589, + "step": 5161 + }, + { + "epoch": 0.5402407116692831, + "grad_norm": 2.359812971612293, + "learning_rate": 9.183985774971258e-06, + "loss": 0.8739, + "step": 5162 + }, + { + "epoch": 0.5403453689167975, + "grad_norm": 1.7777656507828674, + "learning_rate": 9.18060740631793e-06, + "loss": 0.8449, + "step": 5163 + }, + { + "epoch": 0.5404500261643119, + "grad_norm": 2.173125028700542, + "learning_rate": 9.177229131814473e-06, + "loss": 0.9589, + "step": 5164 + }, + { + "epoch": 0.5405546834118262, + "grad_norm": 2.1583802338493427, + "learning_rate": 9.173850951849057e-06, + "loss": 1.0773, + "step": 5165 + }, + { + "epoch": 0.5406593406593406, + "grad_norm": 1.897622605941461, + "learning_rate": 9.170472866809842e-06, + "loss": 0.9341, + "step": 5166 + }, + { + "epoch": 0.540763997906855, + "grad_norm": 1.884134801109637, + "learning_rate": 9.167094877084974e-06, + "loss": 0.9404, + "step": 5167 + }, + { + "epoch": 0.5408686551543694, + "grad_norm": 2.1870995357335525, + "learning_rate": 9.163716983062594e-06, + "loss": 0.8965, + "step": 5168 + }, + { + "epoch": 0.5409733124018838, + "grad_norm": 2.141040011803557, + "learning_rate": 9.160339185130825e-06, + "loss": 0.9386, + "step": 5169 + }, + { + "epoch": 0.5410779696493982, + "grad_norm": 2.042779608504371, + "learning_rate": 9.15696148367779e-06, + "loss": 0.8375, + "step": 5170 + }, + { + "epoch": 0.5411826268969127, + "grad_norm": 1.8593064018228953, + "learning_rate": 9.153583879091588e-06, + "loss": 0.8555, + "step": 5171 + }, + { + "epoch": 0.541287284144427, + "grad_norm": 2.3293817595274056, + "learning_rate": 9.150206371760315e-06, + "loss": 0.9834, + "step": 5172 + }, + { + "epoch": 0.5413919413919414, + "grad_norm": 2.0079430212166627, + "learning_rate": 9.146828962072051e-06, + "loss": 0.9392, + "step": 5173 + }, + { + "epoch": 0.5414965986394558, + "grad_norm": 2.1119483487510355, + "learning_rate": 9.143451650414871e-06, + "loss": 1.0077, + "step": 5174 + }, + { + "epoch": 0.5416012558869702, + "grad_norm": 1.9355398541105262, + "learning_rate": 9.140074437176831e-06, + "loss": 0.9627, + "step": 5175 + }, + { + "epoch": 0.5417059131344846, + "grad_norm": 2.066119618087384, + "learning_rate": 9.136697322745984e-06, + "loss": 0.9772, + "step": 5176 + }, + { + "epoch": 0.541810570381999, + "grad_norm": 1.8564381011561766, + "learning_rate": 9.133320307510359e-06, + "loss": 0.8735, + "step": 5177 + }, + { + "epoch": 0.5419152276295134, + "grad_norm": 1.9999578854370876, + "learning_rate": 9.129943391857992e-06, + "loss": 1.0295, + "step": 5178 + }, + { + "epoch": 0.5420198848770277, + "grad_norm": 2.264031570128424, + "learning_rate": 9.126566576176893e-06, + "loss": 0.7778, + "step": 5179 + }, + { + "epoch": 0.5421245421245421, + "grad_norm": 1.8699876284645591, + "learning_rate": 9.123189860855064e-06, + "loss": 0.9129, + "step": 5180 + }, + { + "epoch": 0.5422291993720565, + "grad_norm": 1.9231443138469622, + "learning_rate": 9.119813246280499e-06, + "loss": 0.9216, + "step": 5181 + }, + { + "epoch": 0.5423338566195709, + "grad_norm": 1.7836314115264373, + "learning_rate": 9.116436732841174e-06, + "loss": 0.9005, + "step": 5182 + }, + { + "epoch": 0.5424385138670853, + "grad_norm": 2.2350357880443448, + "learning_rate": 9.113060320925063e-06, + "loss": 0.9321, + "step": 5183 + }, + { + "epoch": 0.5425431711145997, + "grad_norm": 2.079863133130194, + "learning_rate": 9.109684010920116e-06, + "loss": 0.8967, + "step": 5184 + }, + { + "epoch": 0.5426478283621141, + "grad_norm": 2.2139801096197553, + "learning_rate": 9.106307803214283e-06, + "loss": 1.0516, + "step": 5185 + }, + { + "epoch": 0.5427524856096285, + "grad_norm": 2.179734467120083, + "learning_rate": 9.102931698195491e-06, + "loss": 0.7468, + "step": 5186 + }, + { + "epoch": 0.5428571428571428, + "grad_norm": 2.157564716376065, + "learning_rate": 9.099555696251667e-06, + "loss": 0.9357, + "step": 5187 + }, + { + "epoch": 0.5429618001046572, + "grad_norm": 2.041100938324909, + "learning_rate": 9.096179797770723e-06, + "loss": 0.9291, + "step": 5188 + }, + { + "epoch": 0.5430664573521716, + "grad_norm": 1.8303406459070106, + "learning_rate": 9.09280400314055e-06, + "loss": 0.8139, + "step": 5189 + }, + { + "epoch": 0.543171114599686, + "grad_norm": 1.8598137235962091, + "learning_rate": 9.089428312749036e-06, + "loss": 0.8703, + "step": 5190 + }, + { + "epoch": 0.5432757718472004, + "grad_norm": 2.0757977617675945, + "learning_rate": 9.086052726984057e-06, + "loss": 0.9464, + "step": 5191 + }, + { + "epoch": 0.5433804290947148, + "grad_norm": 1.891684900539831, + "learning_rate": 9.08267724623347e-06, + "loss": 0.9067, + "step": 5192 + }, + { + "epoch": 0.5434850863422293, + "grad_norm": 1.9766032826136062, + "learning_rate": 9.079301870885126e-06, + "loss": 0.9417, + "step": 5193 + }, + { + "epoch": 0.5435897435897435, + "grad_norm": 2.1848428050243824, + "learning_rate": 9.075926601326865e-06, + "loss": 0.8109, + "step": 5194 + }, + { + "epoch": 0.543694400837258, + "grad_norm": 1.928000464883851, + "learning_rate": 9.072551437946508e-06, + "loss": 0.9048, + "step": 5195 + }, + { + "epoch": 0.5437990580847724, + "grad_norm": 2.2851706994938477, + "learning_rate": 9.069176381131874e-06, + "loss": 1.0057, + "step": 5196 + }, + { + "epoch": 0.5439037153322868, + "grad_norm": 2.2316974873884634, + "learning_rate": 9.06580143127076e-06, + "loss": 0.9539, + "step": 5197 + }, + { + "epoch": 0.5440083725798012, + "grad_norm": 2.431415213869758, + "learning_rate": 9.062426588750956e-06, + "loss": 0.9679, + "step": 5198 + }, + { + "epoch": 0.5441130298273156, + "grad_norm": 1.8303873361505676, + "learning_rate": 9.059051853960237e-06, + "loss": 1.0003, + "step": 5199 + }, + { + "epoch": 0.54421768707483, + "grad_norm": 1.9180411674794864, + "learning_rate": 9.05567722728637e-06, + "loss": 0.941, + "step": 5200 + }, + { + "epoch": 0.5443223443223443, + "grad_norm": 2.1309566319139055, + "learning_rate": 9.0523027091171e-06, + "loss": 0.9292, + "step": 5201 + }, + { + "epoch": 0.5444270015698587, + "grad_norm": 1.9920223205280956, + "learning_rate": 9.048928299840172e-06, + "loss": 0.9156, + "step": 5202 + }, + { + "epoch": 0.5445316588173731, + "grad_norm": 1.9755399701908867, + "learning_rate": 9.04555399984331e-06, + "loss": 1.0254, + "step": 5203 + }, + { + "epoch": 0.5446363160648875, + "grad_norm": 1.867791911625654, + "learning_rate": 9.042179809514225e-06, + "loss": 0.9065, + "step": 5204 + }, + { + "epoch": 0.5447409733124019, + "grad_norm": 2.142420704324338, + "learning_rate": 9.038805729240626e-06, + "loss": 0.9879, + "step": 5205 + }, + { + "epoch": 0.5448456305599163, + "grad_norm": 1.8804509672023069, + "learning_rate": 9.035431759410194e-06, + "loss": 0.8701, + "step": 5206 + }, + { + "epoch": 0.5449502878074307, + "grad_norm": 1.8907052172885805, + "learning_rate": 9.032057900410612e-06, + "loss": 0.9644, + "step": 5207 + }, + { + "epoch": 0.545054945054945, + "grad_norm": 2.3910996543132965, + "learning_rate": 9.02868415262954e-06, + "loss": 0.9196, + "step": 5208 + }, + { + "epoch": 0.5451596023024594, + "grad_norm": 3.681066797684883, + "learning_rate": 9.025310516454623e-06, + "loss": 1.0363, + "step": 5209 + }, + { + "epoch": 0.5452642595499738, + "grad_norm": 1.6994987522917748, + "learning_rate": 9.021936992273508e-06, + "loss": 0.8546, + "step": 5210 + }, + { + "epoch": 0.5453689167974882, + "grad_norm": 2.5337992006092405, + "learning_rate": 9.018563580473814e-06, + "loss": 0.9382, + "step": 5211 + }, + { + "epoch": 0.5454735740450026, + "grad_norm": 1.8493509787452933, + "learning_rate": 9.01519028144315e-06, + "loss": 0.8746, + "step": 5212 + }, + { + "epoch": 0.545578231292517, + "grad_norm": 1.9579976590736436, + "learning_rate": 9.011817095569124e-06, + "loss": 0.9685, + "step": 5213 + }, + { + "epoch": 0.5456828885400314, + "grad_norm": 2.1333444289770087, + "learning_rate": 9.008444023239315e-06, + "loss": 0.7952, + "step": 5214 + }, + { + "epoch": 0.5457875457875457, + "grad_norm": 2.3503433202907478, + "learning_rate": 9.005071064841299e-06, + "loss": 0.88, + "step": 5215 + }, + { + "epoch": 0.5458922030350601, + "grad_norm": 2.2546724085226844, + "learning_rate": 9.001698220762634e-06, + "loss": 0.9255, + "step": 5216 + }, + { + "epoch": 0.5459968602825745, + "grad_norm": 2.356440368535242, + "learning_rate": 8.998325491390869e-06, + "loss": 1.0099, + "step": 5217 + }, + { + "epoch": 0.546101517530089, + "grad_norm": 2.2034666717622033, + "learning_rate": 8.994952877113533e-06, + "loss": 1.0675, + "step": 5218 + }, + { + "epoch": 0.5462061747776034, + "grad_norm": 1.9688211472663686, + "learning_rate": 8.991580378318151e-06, + "loss": 0.845, + "step": 5219 + }, + { + "epoch": 0.5463108320251178, + "grad_norm": 2.1233709050192213, + "learning_rate": 8.988207995392227e-06, + "loss": 1.1076, + "step": 5220 + }, + { + "epoch": 0.5464154892726322, + "grad_norm": 2.2100117848067886, + "learning_rate": 8.984835728723251e-06, + "loss": 0.9292, + "step": 5221 + }, + { + "epoch": 0.5465201465201465, + "grad_norm": 2.00762864700889, + "learning_rate": 8.981463578698714e-06, + "loss": 0.9889, + "step": 5222 + }, + { + "epoch": 0.5466248037676609, + "grad_norm": 2.000506555020569, + "learning_rate": 8.978091545706074e-06, + "loss": 0.9671, + "step": 5223 + }, + { + "epoch": 0.5467294610151753, + "grad_norm": 1.9318729631393265, + "learning_rate": 8.974719630132788e-06, + "loss": 0.9534, + "step": 5224 + }, + { + "epoch": 0.5468341182626897, + "grad_norm": 2.704458307866667, + "learning_rate": 8.971347832366296e-06, + "loss": 0.9215, + "step": 5225 + }, + { + "epoch": 0.5469387755102041, + "grad_norm": 1.8785357601729604, + "learning_rate": 8.96797615279402e-06, + "loss": 0.8026, + "step": 5226 + }, + { + "epoch": 0.5470434327577185, + "grad_norm": 1.982632154167765, + "learning_rate": 8.96460459180338e-06, + "loss": 0.9695, + "step": 5227 + }, + { + "epoch": 0.5471480900052329, + "grad_norm": 2.143564500630684, + "learning_rate": 8.961233149781768e-06, + "loss": 0.939, + "step": 5228 + }, + { + "epoch": 0.5472527472527473, + "grad_norm": 2.203497996122728, + "learning_rate": 8.957861827116576e-06, + "loss": 0.9991, + "step": 5229 + }, + { + "epoch": 0.5473574045002616, + "grad_norm": 1.9379036312231492, + "learning_rate": 8.954490624195169e-06, + "loss": 0.8894, + "step": 5230 + }, + { + "epoch": 0.547462061747776, + "grad_norm": 2.0508974667099245, + "learning_rate": 8.951119541404908e-06, + "loss": 1.0449, + "step": 5231 + }, + { + "epoch": 0.5475667189952904, + "grad_norm": 2.588002315713727, + "learning_rate": 8.947748579133143e-06, + "loss": 0.9373, + "step": 5232 + }, + { + "epoch": 0.5476713762428048, + "grad_norm": 2.38449359167579, + "learning_rate": 8.944377737767195e-06, + "loss": 1.041, + "step": 5233 + }, + { + "epoch": 0.5477760334903192, + "grad_norm": 1.9288662968847525, + "learning_rate": 8.94100701769439e-06, + "loss": 0.9473, + "step": 5234 + }, + { + "epoch": 0.5478806907378336, + "grad_norm": 2.1753526717070155, + "learning_rate": 8.93763641930202e-06, + "loss": 0.8533, + "step": 5235 + }, + { + "epoch": 0.547985347985348, + "grad_norm": 2.214682121752073, + "learning_rate": 8.934265942977383e-06, + "loss": 1.0522, + "step": 5236 + }, + { + "epoch": 0.5480900052328623, + "grad_norm": 1.9884467197972144, + "learning_rate": 8.930895589107748e-06, + "loss": 1.0617, + "step": 5237 + }, + { + "epoch": 0.5481946624803767, + "grad_norm": 1.8273085911630584, + "learning_rate": 8.927525358080374e-06, + "loss": 0.8846, + "step": 5238 + }, + { + "epoch": 0.5482993197278911, + "grad_norm": 2.132364155223627, + "learning_rate": 8.924155250282515e-06, + "loss": 0.8767, + "step": 5239 + }, + { + "epoch": 0.5484039769754055, + "grad_norm": 2.1653790608930557, + "learning_rate": 8.920785266101396e-06, + "loss": 1.0162, + "step": 5240 + }, + { + "epoch": 0.54850863422292, + "grad_norm": 1.9836556420176448, + "learning_rate": 8.917415405924241e-06, + "loss": 0.9864, + "step": 5241 + }, + { + "epoch": 0.5486132914704344, + "grad_norm": 2.497343830248175, + "learning_rate": 8.914045670138247e-06, + "loss": 1.0194, + "step": 5242 + }, + { + "epoch": 0.5487179487179488, + "grad_norm": 2.2357270888989262, + "learning_rate": 8.910676059130611e-06, + "loss": 1.0598, + "step": 5243 + }, + { + "epoch": 0.5488226059654631, + "grad_norm": 2.076847097610531, + "learning_rate": 8.907306573288502e-06, + "loss": 0.916, + "step": 5244 + }, + { + "epoch": 0.5489272632129775, + "grad_norm": 1.9802029893372697, + "learning_rate": 8.903937212999082e-06, + "loss": 0.843, + "step": 5245 + }, + { + "epoch": 0.5490319204604919, + "grad_norm": 2.0192766915384155, + "learning_rate": 8.900567978649501e-06, + "loss": 0.9069, + "step": 5246 + }, + { + "epoch": 0.5491365777080063, + "grad_norm": 2.432583695671836, + "learning_rate": 8.897198870626882e-06, + "loss": 0.8992, + "step": 5247 + }, + { + "epoch": 0.5492412349555207, + "grad_norm": 2.213741486304002, + "learning_rate": 8.893829889318352e-06, + "loss": 0.8917, + "step": 5248 + }, + { + "epoch": 0.5493458922030351, + "grad_norm": 2.135434999613721, + "learning_rate": 8.89046103511101e-06, + "loss": 0.9377, + "step": 5249 + }, + { + "epoch": 0.5494505494505495, + "grad_norm": 2.122303160131647, + "learning_rate": 8.887092308391945e-06, + "loss": 0.9298, + "step": 5250 + }, + { + "epoch": 0.5495552066980638, + "grad_norm": 2.2880405163464532, + "learning_rate": 8.883723709548232e-06, + "loss": 1.0074, + "step": 5251 + }, + { + "epoch": 0.5496598639455782, + "grad_norm": 1.9274854009826472, + "learning_rate": 8.880355238966923e-06, + "loss": 0.8281, + "step": 5252 + }, + { + "epoch": 0.5497645211930926, + "grad_norm": 1.9114757164675305, + "learning_rate": 8.87698689703507e-06, + "loss": 0.9118, + "step": 5253 + }, + { + "epoch": 0.549869178440607, + "grad_norm": 2.1358804324224088, + "learning_rate": 8.873618684139697e-06, + "loss": 0.9843, + "step": 5254 + }, + { + "epoch": 0.5499738356881214, + "grad_norm": 2.1558357260108205, + "learning_rate": 8.870250600667823e-06, + "loss": 1.0874, + "step": 5255 + }, + { + "epoch": 0.5500784929356358, + "grad_norm": 2.4247493566253926, + "learning_rate": 8.866882647006441e-06, + "loss": 0.9887, + "step": 5256 + }, + { + "epoch": 0.5501831501831502, + "grad_norm": 2.028235978688342, + "learning_rate": 8.863514823542542e-06, + "loss": 0.9167, + "step": 5257 + }, + { + "epoch": 0.5502878074306645, + "grad_norm": 1.9043186883483203, + "learning_rate": 8.860147130663096e-06, + "loss": 0.8844, + "step": 5258 + }, + { + "epoch": 0.5503924646781789, + "grad_norm": 2.8270360697499566, + "learning_rate": 8.856779568755052e-06, + "loss": 0.9991, + "step": 5259 + }, + { + "epoch": 0.5504971219256933, + "grad_norm": 1.942578281067632, + "learning_rate": 8.853412138205357e-06, + "loss": 0.9976, + "step": 5260 + }, + { + "epoch": 0.5506017791732077, + "grad_norm": 2.2072497818987467, + "learning_rate": 8.85004483940093e-06, + "loss": 0.9079, + "step": 5261 + }, + { + "epoch": 0.5507064364207221, + "grad_norm": 1.958197617874853, + "learning_rate": 8.846677672728683e-06, + "loss": 1.0171, + "step": 5262 + }, + { + "epoch": 0.5508110936682366, + "grad_norm": 1.946933390805341, + "learning_rate": 8.84331063857551e-06, + "loss": 0.9166, + "step": 5263 + }, + { + "epoch": 0.550915750915751, + "grad_norm": 2.099808823892071, + "learning_rate": 8.839943737328282e-06, + "loss": 0.9827, + "step": 5264 + }, + { + "epoch": 0.5510204081632653, + "grad_norm": 2.018139469461617, + "learning_rate": 8.836576969373877e-06, + "loss": 0.9721, + "step": 5265 + }, + { + "epoch": 0.5511250654107797, + "grad_norm": 2.297998216881771, + "learning_rate": 8.833210335099137e-06, + "loss": 0.9523, + "step": 5266 + }, + { + "epoch": 0.5512297226582941, + "grad_norm": 2.0134991695196005, + "learning_rate": 8.829843834890893e-06, + "loss": 0.975, + "step": 5267 + }, + { + "epoch": 0.5513343799058085, + "grad_norm": 2.150846758924609, + "learning_rate": 8.826477469135967e-06, + "loss": 0.8611, + "step": 5268 + }, + { + "epoch": 0.5514390371533229, + "grad_norm": 1.82333106005884, + "learning_rate": 8.823111238221156e-06, + "loss": 0.9238, + "step": 5269 + }, + { + "epoch": 0.5515436944008373, + "grad_norm": 2.4050851629812655, + "learning_rate": 8.819745142533252e-06, + "loss": 1.0834, + "step": 5270 + }, + { + "epoch": 0.5516483516483517, + "grad_norm": 2.28131876935219, + "learning_rate": 8.81637918245902e-06, + "loss": 0.9526, + "step": 5271 + }, + { + "epoch": 0.5517530088958661, + "grad_norm": 2.1174671106347236, + "learning_rate": 8.813013358385223e-06, + "loss": 0.9436, + "step": 5272 + }, + { + "epoch": 0.5518576661433804, + "grad_norm": 1.9671009087310183, + "learning_rate": 8.809647670698594e-06, + "loss": 1.0025, + "step": 5273 + }, + { + "epoch": 0.5519623233908948, + "grad_norm": 2.052327149623945, + "learning_rate": 8.806282119785861e-06, + "loss": 0.8248, + "step": 5274 + }, + { + "epoch": 0.5520669806384092, + "grad_norm": 2.384339181879582, + "learning_rate": 8.802916706033734e-06, + "loss": 1.008, + "step": 5275 + }, + { + "epoch": 0.5521716378859236, + "grad_norm": 1.8901455647678653, + "learning_rate": 8.799551429828906e-06, + "loss": 0.8009, + "step": 5276 + }, + { + "epoch": 0.552276295133438, + "grad_norm": 1.9504635654515794, + "learning_rate": 8.796186291558052e-06, + "loss": 0.9597, + "step": 5277 + }, + { + "epoch": 0.5523809523809524, + "grad_norm": 2.3275789921544017, + "learning_rate": 8.79282129160783e-06, + "loss": 0.8494, + "step": 5278 + }, + { + "epoch": 0.5524856096284668, + "grad_norm": 1.835713336563086, + "learning_rate": 8.789456430364892e-06, + "loss": 0.9284, + "step": 5279 + }, + { + "epoch": 0.5525902668759811, + "grad_norm": 2.017436152195592, + "learning_rate": 8.786091708215863e-06, + "loss": 0.8889, + "step": 5280 + }, + { + "epoch": 0.5526949241234955, + "grad_norm": 1.959115114298543, + "learning_rate": 8.782727125547358e-06, + "loss": 0.8949, + "step": 5281 + }, + { + "epoch": 0.5527995813710099, + "grad_norm": 2.0528665958582093, + "learning_rate": 8.779362682745971e-06, + "loss": 0.9803, + "step": 5282 + }, + { + "epoch": 0.5529042386185243, + "grad_norm": 1.8604560176356941, + "learning_rate": 8.77599838019829e-06, + "loss": 0.923, + "step": 5283 + }, + { + "epoch": 0.5530088958660387, + "grad_norm": 1.800262625922265, + "learning_rate": 8.772634218290874e-06, + "loss": 0.9148, + "step": 5284 + }, + { + "epoch": 0.5531135531135531, + "grad_norm": 2.4993048236879525, + "learning_rate": 8.769270197410277e-06, + "loss": 0.9607, + "step": 5285 + }, + { + "epoch": 0.5532182103610676, + "grad_norm": 1.8252037891658632, + "learning_rate": 8.765906317943027e-06, + "loss": 0.8508, + "step": 5286 + }, + { + "epoch": 0.5533228676085818, + "grad_norm": 2.238304823724985, + "learning_rate": 8.762542580275644e-06, + "loss": 0.9228, + "step": 5287 + }, + { + "epoch": 0.5534275248560963, + "grad_norm": 2.1072228350806896, + "learning_rate": 8.759178984794627e-06, + "loss": 0.8588, + "step": 5288 + }, + { + "epoch": 0.5535321821036107, + "grad_norm": 1.9622546540770844, + "learning_rate": 8.755815531886461e-06, + "loss": 0.9111, + "step": 5289 + }, + { + "epoch": 0.5536368393511251, + "grad_norm": 2.2818234572001144, + "learning_rate": 8.752452221937608e-06, + "loss": 1.0615, + "step": 5290 + }, + { + "epoch": 0.5537414965986395, + "grad_norm": 2.1750822391853344, + "learning_rate": 8.749089055334527e-06, + "loss": 0.8601, + "step": 5291 + }, + { + "epoch": 0.5538461538461539, + "grad_norm": 2.0616636889051896, + "learning_rate": 8.74572603246365e-06, + "loss": 1.0359, + "step": 5292 + }, + { + "epoch": 0.5539508110936683, + "grad_norm": 2.710571314021102, + "learning_rate": 8.742363153711396e-06, + "loss": 1.0601, + "step": 5293 + }, + { + "epoch": 0.5540554683411826, + "grad_norm": 2.6145632006844886, + "learning_rate": 8.739000419464164e-06, + "loss": 1.0233, + "step": 5294 + }, + { + "epoch": 0.554160125588697, + "grad_norm": 1.9109618531413386, + "learning_rate": 8.735637830108337e-06, + "loss": 0.9872, + "step": 5295 + }, + { + "epoch": 0.5542647828362114, + "grad_norm": 2.1467379854230995, + "learning_rate": 8.732275386030291e-06, + "loss": 0.9363, + "step": 5296 + }, + { + "epoch": 0.5543694400837258, + "grad_norm": 2.0056169978726786, + "learning_rate": 8.72891308761637e-06, + "loss": 0.989, + "step": 5297 + }, + { + "epoch": 0.5544740973312402, + "grad_norm": 2.1780493741262608, + "learning_rate": 8.725550935252914e-06, + "loss": 1.0403, + "step": 5298 + }, + { + "epoch": 0.5545787545787546, + "grad_norm": 2.21960346530747, + "learning_rate": 8.722188929326237e-06, + "loss": 0.9504, + "step": 5299 + }, + { + "epoch": 0.554683411826269, + "grad_norm": 2.1450710730738116, + "learning_rate": 8.718827070222642e-06, + "loss": 1.1257, + "step": 5300 + }, + { + "epoch": 0.5547880690737833, + "grad_norm": 2.228582965255394, + "learning_rate": 8.715465358328417e-06, + "loss": 0.9697, + "step": 5301 + }, + { + "epoch": 0.5548927263212977, + "grad_norm": 2.4830808941857554, + "learning_rate": 8.712103794029824e-06, + "loss": 0.8945, + "step": 5302 + }, + { + "epoch": 0.5549973835688121, + "grad_norm": 2.2847911242856656, + "learning_rate": 8.708742377713118e-06, + "loss": 0.9817, + "step": 5303 + }, + { + "epoch": 0.5551020408163265, + "grad_norm": 2.1604390174895416, + "learning_rate": 8.705381109764529e-06, + "loss": 0.9853, + "step": 5304 + }, + { + "epoch": 0.5552066980638409, + "grad_norm": 2.2396160300426082, + "learning_rate": 8.702019990570274e-06, + "loss": 0.8813, + "step": 5305 + }, + { + "epoch": 0.5553113553113553, + "grad_norm": 2.130623108079099, + "learning_rate": 8.698659020516556e-06, + "loss": 0.8336, + "step": 5306 + }, + { + "epoch": 0.5554160125588697, + "grad_norm": 1.9908826133921063, + "learning_rate": 8.695298199989552e-06, + "loss": 0.9344, + "step": 5307 + }, + { + "epoch": 0.555520669806384, + "grad_norm": 1.9711870901565465, + "learning_rate": 8.691937529375428e-06, + "loss": 0.9166, + "step": 5308 + }, + { + "epoch": 0.5556253270538984, + "grad_norm": 1.9835386623262095, + "learning_rate": 8.688577009060338e-06, + "loss": 0.9319, + "step": 5309 + }, + { + "epoch": 0.5557299843014128, + "grad_norm": 2.3085235533894126, + "learning_rate": 8.685216639430405e-06, + "loss": 0.8664, + "step": 5310 + }, + { + "epoch": 0.5558346415489273, + "grad_norm": 3.3332655720477895, + "learning_rate": 8.681856420871747e-06, + "loss": 0.8843, + "step": 5311 + }, + { + "epoch": 0.5559392987964417, + "grad_norm": 1.8582951572007596, + "learning_rate": 8.678496353770458e-06, + "loss": 0.8867, + "step": 5312 + }, + { + "epoch": 0.5560439560439561, + "grad_norm": 2.173807723457474, + "learning_rate": 8.67513643851262e-06, + "loss": 0.9826, + "step": 5313 + }, + { + "epoch": 0.5561486132914705, + "grad_norm": 1.7581129151706683, + "learning_rate": 8.671776675484288e-06, + "loss": 0.9056, + "step": 5314 + }, + { + "epoch": 0.5562532705389849, + "grad_norm": 2.1180334877984017, + "learning_rate": 8.66841706507151e-06, + "loss": 0.9616, + "step": 5315 + }, + { + "epoch": 0.5563579277864992, + "grad_norm": 2.508246391771904, + "learning_rate": 8.665057607660305e-06, + "loss": 0.9179, + "step": 5316 + }, + { + "epoch": 0.5564625850340136, + "grad_norm": 1.9132610573755788, + "learning_rate": 8.661698303636692e-06, + "loss": 0.8745, + "step": 5317 + }, + { + "epoch": 0.556567242281528, + "grad_norm": 2.4958112187692767, + "learning_rate": 8.658339153386658e-06, + "loss": 0.9097, + "step": 5318 + }, + { + "epoch": 0.5566718995290424, + "grad_norm": 2.249652224152348, + "learning_rate": 8.654980157296173e-06, + "loss": 1.0163, + "step": 5319 + }, + { + "epoch": 0.5567765567765568, + "grad_norm": 2.1636612203919787, + "learning_rate": 8.651621315751197e-06, + "loss": 1.0146, + "step": 5320 + }, + { + "epoch": 0.5568812140240712, + "grad_norm": 2.4873635948113484, + "learning_rate": 8.648262629137663e-06, + "loss": 1.0301, + "step": 5321 + }, + { + "epoch": 0.5569858712715856, + "grad_norm": 2.1298989438781044, + "learning_rate": 8.644904097841494e-06, + "loss": 0.979, + "step": 5322 + }, + { + "epoch": 0.5570905285190999, + "grad_norm": 2.296389158373194, + "learning_rate": 8.641545722248592e-06, + "loss": 0.9278, + "step": 5323 + }, + { + "epoch": 0.5571951857666143, + "grad_norm": 2.2433733862004055, + "learning_rate": 8.638187502744838e-06, + "loss": 0.8756, + "step": 5324 + }, + { + "epoch": 0.5572998430141287, + "grad_norm": 1.9895145461612076, + "learning_rate": 8.634829439716099e-06, + "loss": 0.9287, + "step": 5325 + }, + { + "epoch": 0.5574045002616431, + "grad_norm": 1.8763204041932915, + "learning_rate": 8.63147153354823e-06, + "loss": 0.875, + "step": 5326 + }, + { + "epoch": 0.5575091575091575, + "grad_norm": 2.2272253472444103, + "learning_rate": 8.628113784627053e-06, + "loss": 0.9695, + "step": 5327 + }, + { + "epoch": 0.5576138147566719, + "grad_norm": 1.972305846861178, + "learning_rate": 8.624756193338384e-06, + "loss": 0.9767, + "step": 5328 + }, + { + "epoch": 0.5577184720041863, + "grad_norm": 2.3386342244281124, + "learning_rate": 8.621398760068017e-06, + "loss": 0.9315, + "step": 5329 + }, + { + "epoch": 0.5578231292517006, + "grad_norm": 2.3263287856510773, + "learning_rate": 8.618041485201728e-06, + "loss": 0.8489, + "step": 5330 + }, + { + "epoch": 0.557927786499215, + "grad_norm": 2.1728955209423315, + "learning_rate": 8.61468436912527e-06, + "loss": 0.8711, + "step": 5331 + }, + { + "epoch": 0.5580324437467294, + "grad_norm": 1.9615782696853798, + "learning_rate": 8.61132741222439e-06, + "loss": 0.9714, + "step": 5332 + }, + { + "epoch": 0.5581371009942439, + "grad_norm": 1.997852261589968, + "learning_rate": 8.607970614884804e-06, + "loss": 0.7903, + "step": 5333 + }, + { + "epoch": 0.5582417582417583, + "grad_norm": 2.0318840672120375, + "learning_rate": 8.604613977492212e-06, + "loss": 0.9649, + "step": 5334 + }, + { + "epoch": 0.5583464154892727, + "grad_norm": 2.0231096418522756, + "learning_rate": 8.60125750043231e-06, + "loss": 0.894, + "step": 5335 + }, + { + "epoch": 0.5584510727367871, + "grad_norm": 2.0941739900946663, + "learning_rate": 8.597901184090753e-06, + "loss": 1.0217, + "step": 5336 + }, + { + "epoch": 0.5585557299843014, + "grad_norm": 2.0371217772719383, + "learning_rate": 8.594545028853195e-06, + "loss": 0.9738, + "step": 5337 + }, + { + "epoch": 0.5586603872318158, + "grad_norm": 2.4539134994923724, + "learning_rate": 8.59118903510526e-06, + "loss": 0.8423, + "step": 5338 + }, + { + "epoch": 0.5587650444793302, + "grad_norm": 2.0675918762574557, + "learning_rate": 8.587833203232564e-06, + "loss": 0.9802, + "step": 5339 + }, + { + "epoch": 0.5588697017268446, + "grad_norm": 2.419698934398359, + "learning_rate": 8.584477533620693e-06, + "loss": 1.0828, + "step": 5340 + }, + { + "epoch": 0.558974358974359, + "grad_norm": 1.7962022864475375, + "learning_rate": 8.581122026655222e-06, + "loss": 0.8116, + "step": 5341 + }, + { + "epoch": 0.5590790162218734, + "grad_norm": 2.2433085787237044, + "learning_rate": 8.577766682721705e-06, + "loss": 0.913, + "step": 5342 + }, + { + "epoch": 0.5591836734693878, + "grad_norm": 2.1428509222947656, + "learning_rate": 8.574411502205682e-06, + "loss": 1.0034, + "step": 5343 + }, + { + "epoch": 0.5592883307169021, + "grad_norm": 2.0066030593881035, + "learning_rate": 8.571056485492668e-06, + "loss": 0.9946, + "step": 5344 + }, + { + "epoch": 0.5593929879644165, + "grad_norm": 2.1542448139874164, + "learning_rate": 8.56770163296816e-06, + "loss": 0.9432, + "step": 5345 + }, + { + "epoch": 0.5594976452119309, + "grad_norm": 2.133404649889683, + "learning_rate": 8.564346945017637e-06, + "loss": 0.8289, + "step": 5346 + }, + { + "epoch": 0.5596023024594453, + "grad_norm": 1.9683719436497848, + "learning_rate": 8.560992422026563e-06, + "loss": 0.9729, + "step": 5347 + }, + { + "epoch": 0.5597069597069597, + "grad_norm": 2.0108641044421374, + "learning_rate": 8.557638064380375e-06, + "loss": 0.9478, + "step": 5348 + }, + { + "epoch": 0.5598116169544741, + "grad_norm": 1.9417606569777943, + "learning_rate": 8.554283872464499e-06, + "loss": 1.1138, + "step": 5349 + }, + { + "epoch": 0.5599162742019885, + "grad_norm": 2.0004963619150646, + "learning_rate": 8.550929846664333e-06, + "loss": 0.832, + "step": 5350 + }, + { + "epoch": 0.5600209314495028, + "grad_norm": 1.9750719959886254, + "learning_rate": 8.547575987365266e-06, + "loss": 0.8858, + "step": 5351 + }, + { + "epoch": 0.5601255886970172, + "grad_norm": 1.962371605202089, + "learning_rate": 8.544222294952665e-06, + "loss": 0.8864, + "step": 5352 + }, + { + "epoch": 0.5602302459445316, + "grad_norm": 2.2675771268273506, + "learning_rate": 8.540868769811871e-06, + "loss": 1.0038, + "step": 5353 + }, + { + "epoch": 0.560334903192046, + "grad_norm": 2.0704526254917055, + "learning_rate": 8.537515412328216e-06, + "loss": 0.9498, + "step": 5354 + }, + { + "epoch": 0.5604395604395604, + "grad_norm": 2.073132608363666, + "learning_rate": 8.534162222887003e-06, + "loss": 0.9982, + "step": 5355 + }, + { + "epoch": 0.5605442176870749, + "grad_norm": 2.171878885625719, + "learning_rate": 8.530809201873523e-06, + "loss": 0.8136, + "step": 5356 + }, + { + "epoch": 0.5606488749345893, + "grad_norm": 2.1045724839104367, + "learning_rate": 8.527456349673043e-06, + "loss": 1.0294, + "step": 5357 + }, + { + "epoch": 0.5607535321821037, + "grad_norm": 2.1863543307087987, + "learning_rate": 8.524103666670817e-06, + "loss": 0.9556, + "step": 5358 + }, + { + "epoch": 0.560858189429618, + "grad_norm": 2.0478052085618557, + "learning_rate": 8.52075115325207e-06, + "loss": 0.9486, + "step": 5359 + }, + { + "epoch": 0.5609628466771324, + "grad_norm": 2.186343411269352, + "learning_rate": 8.517398809802009e-06, + "loss": 1.1094, + "step": 5360 + }, + { + "epoch": 0.5610675039246468, + "grad_norm": 2.1314430854613007, + "learning_rate": 8.514046636705835e-06, + "loss": 0.9564, + "step": 5361 + }, + { + "epoch": 0.5611721611721612, + "grad_norm": 2.118979548912484, + "learning_rate": 8.510694634348715e-06, + "loss": 0.9487, + "step": 5362 + }, + { + "epoch": 0.5612768184196756, + "grad_norm": 1.9296804650577581, + "learning_rate": 8.507342803115799e-06, + "loss": 1.041, + "step": 5363 + }, + { + "epoch": 0.56138147566719, + "grad_norm": 1.7942107435770736, + "learning_rate": 8.503991143392225e-06, + "loss": 0.9425, + "step": 5364 + }, + { + "epoch": 0.5614861329147044, + "grad_norm": 2.465347277668139, + "learning_rate": 8.500639655563097e-06, + "loss": 0.9761, + "step": 5365 + }, + { + "epoch": 0.5615907901622187, + "grad_norm": 2.1891772863766774, + "learning_rate": 8.497288340013514e-06, + "loss": 0.9151, + "step": 5366 + }, + { + "epoch": 0.5616954474097331, + "grad_norm": 1.726643594528601, + "learning_rate": 8.493937197128546e-06, + "loss": 0.959, + "step": 5367 + }, + { + "epoch": 0.5618001046572475, + "grad_norm": 1.7289012170427154, + "learning_rate": 8.490586227293244e-06, + "loss": 0.8114, + "step": 5368 + }, + { + "epoch": 0.5619047619047619, + "grad_norm": 1.8444026585255053, + "learning_rate": 8.487235430892648e-06, + "loss": 0.8111, + "step": 5369 + }, + { + "epoch": 0.5620094191522763, + "grad_norm": 2.1272015623519778, + "learning_rate": 8.483884808311768e-06, + "loss": 0.8135, + "step": 5370 + }, + { + "epoch": 0.5621140763997907, + "grad_norm": 2.064621150380741, + "learning_rate": 8.480534359935596e-06, + "loss": 0.7135, + "step": 5371 + }, + { + "epoch": 0.5622187336473051, + "grad_norm": 2.201762241819266, + "learning_rate": 8.477184086149109e-06, + "loss": 1.0045, + "step": 5372 + }, + { + "epoch": 0.5623233908948194, + "grad_norm": 2.331143016583063, + "learning_rate": 8.473833987337255e-06, + "loss": 0.8975, + "step": 5373 + }, + { + "epoch": 0.5624280481423338, + "grad_norm": 2.956408422523764, + "learning_rate": 8.470484063884969e-06, + "loss": 0.9348, + "step": 5374 + }, + { + "epoch": 0.5625327053898482, + "grad_norm": 2.929476780401184, + "learning_rate": 8.467134316177167e-06, + "loss": 0.9468, + "step": 5375 + }, + { + "epoch": 0.5626373626373626, + "grad_norm": 2.022910118977885, + "learning_rate": 8.46378474459874e-06, + "loss": 0.8546, + "step": 5376 + }, + { + "epoch": 0.562742019884877, + "grad_norm": 2.2903787311759123, + "learning_rate": 8.460435349534555e-06, + "loss": 0.9358, + "step": 5377 + }, + { + "epoch": 0.5628466771323914, + "grad_norm": 2.899288834804206, + "learning_rate": 8.457086131369475e-06, + "loss": 0.938, + "step": 5378 + }, + { + "epoch": 0.5629513343799059, + "grad_norm": 1.8808885484596338, + "learning_rate": 8.453737090488324e-06, + "loss": 0.8926, + "step": 5379 + }, + { + "epoch": 0.5630559916274201, + "grad_norm": 2.015376498610437, + "learning_rate": 8.450388227275914e-06, + "loss": 0.9181, + "step": 5380 + }, + { + "epoch": 0.5631606488749346, + "grad_norm": 2.4062132896620576, + "learning_rate": 8.447039542117043e-06, + "loss": 0.9622, + "step": 5381 + }, + { + "epoch": 0.563265306122449, + "grad_norm": 2.163527877551462, + "learning_rate": 8.443691035396472e-06, + "loss": 0.9314, + "step": 5382 + }, + { + "epoch": 0.5633699633699634, + "grad_norm": 1.7474043964097528, + "learning_rate": 8.440342707498961e-06, + "loss": 0.8849, + "step": 5383 + }, + { + "epoch": 0.5634746206174778, + "grad_norm": 2.0844952386004256, + "learning_rate": 8.43699455880923e-06, + "loss": 0.9463, + "step": 5384 + }, + { + "epoch": 0.5635792778649922, + "grad_norm": 2.1603071390316018, + "learning_rate": 8.433646589711996e-06, + "loss": 0.9383, + "step": 5385 + }, + { + "epoch": 0.5636839351125066, + "grad_norm": 2.0181491721198626, + "learning_rate": 8.430298800591939e-06, + "loss": 0.9197, + "step": 5386 + }, + { + "epoch": 0.5637885923600209, + "grad_norm": 1.827058275655275, + "learning_rate": 8.426951191833735e-06, + "loss": 0.7931, + "step": 5387 + }, + { + "epoch": 0.5638932496075353, + "grad_norm": 1.901008131381671, + "learning_rate": 8.42360376382203e-06, + "loss": 0.9066, + "step": 5388 + }, + { + "epoch": 0.5639979068550497, + "grad_norm": 2.061629426104861, + "learning_rate": 8.420256516941446e-06, + "loss": 0.9644, + "step": 5389 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 2.0842792186189225, + "learning_rate": 8.416909451576592e-06, + "loss": 0.8135, + "step": 5390 + }, + { + "epoch": 0.5642072213500785, + "grad_norm": 2.3449029492925906, + "learning_rate": 8.41356256811205e-06, + "loss": 0.9087, + "step": 5391 + }, + { + "epoch": 0.5643118785975929, + "grad_norm": 2.382812471439128, + "learning_rate": 8.410215866932386e-06, + "loss": 0.9713, + "step": 5392 + }, + { + "epoch": 0.5644165358451073, + "grad_norm": 2.1076018814211466, + "learning_rate": 8.40686934842214e-06, + "loss": 1.0884, + "step": 5393 + }, + { + "epoch": 0.5645211930926216, + "grad_norm": 2.220481614569876, + "learning_rate": 8.403523012965835e-06, + "loss": 0.9772, + "step": 5394 + }, + { + "epoch": 0.564625850340136, + "grad_norm": 2.0659611056947687, + "learning_rate": 8.400176860947975e-06, + "loss": 1.0665, + "step": 5395 + }, + { + "epoch": 0.5647305075876504, + "grad_norm": 1.8672547133425201, + "learning_rate": 8.396830892753036e-06, + "loss": 0.9571, + "step": 5396 + }, + { + "epoch": 0.5648351648351648, + "grad_norm": 2.1394363159336955, + "learning_rate": 8.393485108765477e-06, + "loss": 0.7481, + "step": 5397 + }, + { + "epoch": 0.5649398220826792, + "grad_norm": 2.007437446941716, + "learning_rate": 8.390139509369739e-06, + "loss": 0.8622, + "step": 5398 + }, + { + "epoch": 0.5650444793301936, + "grad_norm": 2.1993539972983154, + "learning_rate": 8.386794094950231e-06, + "loss": 1.032, + "step": 5399 + }, + { + "epoch": 0.565149136577708, + "grad_norm": 1.9010216623009064, + "learning_rate": 8.383448865891353e-06, + "loss": 0.8278, + "step": 5400 + }, + { + "epoch": 0.5652537938252225, + "grad_norm": 1.8728602015988864, + "learning_rate": 8.380103822577476e-06, + "loss": 0.7388, + "step": 5401 + }, + { + "epoch": 0.5653584510727367, + "grad_norm": 1.9149296409387317, + "learning_rate": 8.376758965392956e-06, + "loss": 0.9607, + "step": 5402 + }, + { + "epoch": 0.5654631083202512, + "grad_norm": 1.8429299678951467, + "learning_rate": 8.373414294722117e-06, + "loss": 0.9577, + "step": 5403 + }, + { + "epoch": 0.5655677655677656, + "grad_norm": 2.0351716866666587, + "learning_rate": 8.370069810949276e-06, + "loss": 0.9344, + "step": 5404 + }, + { + "epoch": 0.56567242281528, + "grad_norm": 2.029063031151813, + "learning_rate": 8.366725514458719e-06, + "loss": 0.8565, + "step": 5405 + }, + { + "epoch": 0.5657770800627944, + "grad_norm": 2.25021340401501, + "learning_rate": 8.36338140563471e-06, + "loss": 0.9405, + "step": 5406 + }, + { + "epoch": 0.5658817373103088, + "grad_norm": 1.9493532133820115, + "learning_rate": 8.360037484861497e-06, + "loss": 1.0342, + "step": 5407 + }, + { + "epoch": 0.5659863945578232, + "grad_norm": 1.9189066069765968, + "learning_rate": 8.3566937525233e-06, + "loss": 0.9019, + "step": 5408 + }, + { + "epoch": 0.5660910518053375, + "grad_norm": 2.2068316926644633, + "learning_rate": 8.353350209004323e-06, + "loss": 1.089, + "step": 5409 + }, + { + "epoch": 0.5661957090528519, + "grad_norm": 2.0845230459581163, + "learning_rate": 8.350006854688744e-06, + "loss": 0.8706, + "step": 5410 + }, + { + "epoch": 0.5663003663003663, + "grad_norm": 1.9447916121641364, + "learning_rate": 8.346663689960724e-06, + "loss": 0.8611, + "step": 5411 + }, + { + "epoch": 0.5664050235478807, + "grad_norm": 2.1579606427265885, + "learning_rate": 8.343320715204393e-06, + "loss": 0.894, + "step": 5412 + }, + { + "epoch": 0.5665096807953951, + "grad_norm": 2.0662389913637087, + "learning_rate": 8.339977930803872e-06, + "loss": 1.0039, + "step": 5413 + }, + { + "epoch": 0.5666143380429095, + "grad_norm": 1.9074458194803643, + "learning_rate": 8.336635337143257e-06, + "loss": 0.9126, + "step": 5414 + }, + { + "epoch": 0.5667189952904239, + "grad_norm": 2.260555516177037, + "learning_rate": 8.333292934606611e-06, + "loss": 0.9502, + "step": 5415 + }, + { + "epoch": 0.5668236525379382, + "grad_norm": 2.5104319898551886, + "learning_rate": 8.329950723577985e-06, + "loss": 0.8688, + "step": 5416 + }, + { + "epoch": 0.5669283097854526, + "grad_norm": 1.938544875865869, + "learning_rate": 8.326608704441408e-06, + "loss": 0.8114, + "step": 5417 + }, + { + "epoch": 0.567032967032967, + "grad_norm": 1.873974736150882, + "learning_rate": 8.323266877580881e-06, + "loss": 0.9325, + "step": 5418 + }, + { + "epoch": 0.5671376242804814, + "grad_norm": 2.316623632030778, + "learning_rate": 8.31992524338039e-06, + "loss": 0.9622, + "step": 5419 + }, + { + "epoch": 0.5672422815279958, + "grad_norm": 1.762820617049393, + "learning_rate": 8.316583802223892e-06, + "loss": 0.9175, + "step": 5420 + }, + { + "epoch": 0.5673469387755102, + "grad_norm": 2.0087856448492736, + "learning_rate": 8.313242554495328e-06, + "loss": 1.0089, + "step": 5421 + }, + { + "epoch": 0.5674515960230246, + "grad_norm": 2.0981791703798978, + "learning_rate": 8.309901500578617e-06, + "loss": 0.9378, + "step": 5422 + }, + { + "epoch": 0.5675562532705389, + "grad_norm": 2.0273142602067216, + "learning_rate": 8.306560640857649e-06, + "loss": 0.896, + "step": 5423 + }, + { + "epoch": 0.5676609105180533, + "grad_norm": 2.2220399006402647, + "learning_rate": 8.303219975716296e-06, + "loss": 0.8499, + "step": 5424 + }, + { + "epoch": 0.5677655677655677, + "grad_norm": 1.9703508493150457, + "learning_rate": 8.299879505538407e-06, + "loss": 0.9592, + "step": 5425 + }, + { + "epoch": 0.5678702250130822, + "grad_norm": 2.105250742608055, + "learning_rate": 8.29653923070781e-06, + "loss": 0.8972, + "step": 5426 + }, + { + "epoch": 0.5679748822605966, + "grad_norm": 2.030286562075985, + "learning_rate": 8.293199151608307e-06, + "loss": 0.9613, + "step": 5427 + }, + { + "epoch": 0.568079539508111, + "grad_norm": 2.3379239596475685, + "learning_rate": 8.289859268623685e-06, + "loss": 0.9236, + "step": 5428 + }, + { + "epoch": 0.5681841967556254, + "grad_norm": 1.9841324284776523, + "learning_rate": 8.286519582137694e-06, + "loss": 1.055, + "step": 5429 + }, + { + "epoch": 0.5682888540031397, + "grad_norm": 2.4795485146638647, + "learning_rate": 8.28318009253408e-06, + "loss": 0.9924, + "step": 5430 + }, + { + "epoch": 0.5683935112506541, + "grad_norm": 2.320539277472763, + "learning_rate": 8.279840800196555e-06, + "loss": 1.0494, + "step": 5431 + }, + { + "epoch": 0.5684981684981685, + "grad_norm": 1.8100197495806924, + "learning_rate": 8.276501705508808e-06, + "loss": 0.8236, + "step": 5432 + }, + { + "epoch": 0.5686028257456829, + "grad_norm": 1.9211836188740459, + "learning_rate": 8.27316280885451e-06, + "loss": 0.9775, + "step": 5433 + }, + { + "epoch": 0.5687074829931973, + "grad_norm": 1.7835418034139316, + "learning_rate": 8.269824110617306e-06, + "loss": 0.9137, + "step": 5434 + }, + { + "epoch": 0.5688121402407117, + "grad_norm": 2.4840168814546533, + "learning_rate": 8.266485611180816e-06, + "loss": 1.0129, + "step": 5435 + }, + { + "epoch": 0.5689167974882261, + "grad_norm": 2.124908482971047, + "learning_rate": 8.263147310928647e-06, + "loss": 1.0432, + "step": 5436 + }, + { + "epoch": 0.5690214547357404, + "grad_norm": 1.8840247947452105, + "learning_rate": 8.259809210244372e-06, + "loss": 0.9159, + "step": 5437 + }, + { + "epoch": 0.5691261119832548, + "grad_norm": 2.0456780135127426, + "learning_rate": 8.256471309511541e-06, + "loss": 0.9619, + "step": 5438 + }, + { + "epoch": 0.5692307692307692, + "grad_norm": 1.9569879555280116, + "learning_rate": 8.253133609113699e-06, + "loss": 0.8922, + "step": 5439 + }, + { + "epoch": 0.5693354264782836, + "grad_norm": 2.2202855654973868, + "learning_rate": 8.249796109434342e-06, + "loss": 0.8476, + "step": 5440 + }, + { + "epoch": 0.569440083725798, + "grad_norm": 1.8957910452182647, + "learning_rate": 8.246458810856963e-06, + "loss": 0.974, + "step": 5441 + }, + { + "epoch": 0.5695447409733124, + "grad_norm": 2.0379277438037575, + "learning_rate": 8.24312171376502e-06, + "loss": 0.9315, + "step": 5442 + }, + { + "epoch": 0.5696493982208268, + "grad_norm": 2.3174337089710435, + "learning_rate": 8.239784818541955e-06, + "loss": 0.9172, + "step": 5443 + }, + { + "epoch": 0.5697540554683412, + "grad_norm": 1.8964131960152406, + "learning_rate": 8.23644812557118e-06, + "loss": 0.9638, + "step": 5444 + }, + { + "epoch": 0.5698587127158555, + "grad_norm": 1.8962610994511044, + "learning_rate": 8.233111635236093e-06, + "loss": 0.8465, + "step": 5445 + }, + { + "epoch": 0.5699633699633699, + "grad_norm": 2.0617322216840632, + "learning_rate": 8.22977534792006e-06, + "loss": 0.9347, + "step": 5446 + }, + { + "epoch": 0.5700680272108843, + "grad_norm": 2.010417513155231, + "learning_rate": 8.226439264006425e-06, + "loss": 0.9013, + "step": 5447 + }, + { + "epoch": 0.5701726844583987, + "grad_norm": 1.905616574561522, + "learning_rate": 8.223103383878517e-06, + "loss": 0.9417, + "step": 5448 + }, + { + "epoch": 0.5702773417059132, + "grad_norm": 2.0075024867237548, + "learning_rate": 8.219767707919632e-06, + "loss": 1.0392, + "step": 5449 + }, + { + "epoch": 0.5703819989534276, + "grad_norm": 1.8721709785411973, + "learning_rate": 8.216432236513048e-06, + "loss": 0.9228, + "step": 5450 + }, + { + "epoch": 0.570486656200942, + "grad_norm": 1.9218512333873883, + "learning_rate": 8.213096970042015e-06, + "loss": 1.0, + "step": 5451 + }, + { + "epoch": 0.5705913134484563, + "grad_norm": 2.329819893077006, + "learning_rate": 8.209761908889762e-06, + "loss": 0.9031, + "step": 5452 + }, + { + "epoch": 0.5706959706959707, + "grad_norm": 1.8170918140020158, + "learning_rate": 8.206427053439497e-06, + "loss": 0.8399, + "step": 5453 + }, + { + "epoch": 0.5708006279434851, + "grad_norm": 2.195702490756807, + "learning_rate": 8.203092404074395e-06, + "loss": 0.902, + "step": 5454 + }, + { + "epoch": 0.5709052851909995, + "grad_norm": 1.8273594046031365, + "learning_rate": 8.199757961177618e-06, + "loss": 0.8918, + "step": 5455 + }, + { + "epoch": 0.5710099424385139, + "grad_norm": 2.146875048935083, + "learning_rate": 8.196423725132305e-06, + "loss": 0.9573, + "step": 5456 + }, + { + "epoch": 0.5711145996860283, + "grad_norm": 2.1126148763204022, + "learning_rate": 8.19308969632156e-06, + "loss": 0.8839, + "step": 5457 + }, + { + "epoch": 0.5712192569335427, + "grad_norm": 2.390207657529769, + "learning_rate": 8.189755875128474e-06, + "loss": 1.0406, + "step": 5458 + }, + { + "epoch": 0.571323914181057, + "grad_norm": 2.1517068860045128, + "learning_rate": 8.186422261936105e-06, + "loss": 0.8052, + "step": 5459 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 2.119466613691594, + "learning_rate": 8.183088857127496e-06, + "loss": 0.9079, + "step": 5460 + }, + { + "epoch": 0.5715332286760858, + "grad_norm": 2.231374940946366, + "learning_rate": 8.17975566108566e-06, + "loss": 0.9486, + "step": 5461 + }, + { + "epoch": 0.5716378859236002, + "grad_norm": 2.073842713633685, + "learning_rate": 8.176422674193591e-06, + "loss": 0.9796, + "step": 5462 + }, + { + "epoch": 0.5717425431711146, + "grad_norm": 1.8850162422585321, + "learning_rate": 8.173089896834249e-06, + "loss": 0.9233, + "step": 5463 + }, + { + "epoch": 0.571847200418629, + "grad_norm": 2.2918049824032605, + "learning_rate": 8.169757329390581e-06, + "loss": 0.9653, + "step": 5464 + }, + { + "epoch": 0.5719518576661434, + "grad_norm": 1.987836099412543, + "learning_rate": 8.16642497224551e-06, + "loss": 0.8883, + "step": 5465 + }, + { + "epoch": 0.5720565149136577, + "grad_norm": 1.9580693885487788, + "learning_rate": 8.163092825781927e-06, + "loss": 1.0531, + "step": 5466 + }, + { + "epoch": 0.5721611721611721, + "grad_norm": 2.5063692651334195, + "learning_rate": 8.159760890382703e-06, + "loss": 0.9574, + "step": 5467 + }, + { + "epoch": 0.5722658294086865, + "grad_norm": 2.2807498841062275, + "learning_rate": 8.15642916643068e-06, + "loss": 1.09, + "step": 5468 + }, + { + "epoch": 0.5723704866562009, + "grad_norm": 1.9133632160896512, + "learning_rate": 8.153097654308684e-06, + "loss": 0.8269, + "step": 5469 + }, + { + "epoch": 0.5724751439037153, + "grad_norm": 1.8332089750958485, + "learning_rate": 8.149766354399512e-06, + "loss": 0.8831, + "step": 5470 + }, + { + "epoch": 0.5725798011512298, + "grad_norm": 1.9413274135809737, + "learning_rate": 8.146435267085937e-06, + "loss": 0.9679, + "step": 5471 + }, + { + "epoch": 0.5726844583987442, + "grad_norm": 1.8297455313311937, + "learning_rate": 8.143104392750708e-06, + "loss": 0.949, + "step": 5472 + }, + { + "epoch": 0.5727891156462585, + "grad_norm": 2.0994208439284052, + "learning_rate": 8.139773731776545e-06, + "loss": 0.9182, + "step": 5473 + }, + { + "epoch": 0.5728937728937729, + "grad_norm": 2.2224164168293887, + "learning_rate": 8.136443284546153e-06, + "loss": 0.8501, + "step": 5474 + }, + { + "epoch": 0.5729984301412873, + "grad_norm": 1.9784406564547241, + "learning_rate": 8.133113051442207e-06, + "loss": 0.9075, + "step": 5475 + }, + { + "epoch": 0.5731030873888017, + "grad_norm": 2.1561299853644162, + "learning_rate": 8.129783032847353e-06, + "loss": 0.8528, + "step": 5476 + }, + { + "epoch": 0.5732077446363161, + "grad_norm": 1.8007982019446203, + "learning_rate": 8.126453229144224e-06, + "loss": 0.9403, + "step": 5477 + }, + { + "epoch": 0.5733124018838305, + "grad_norm": 1.944118541502138, + "learning_rate": 8.123123640715415e-06, + "loss": 0.9122, + "step": 5478 + }, + { + "epoch": 0.5734170591313449, + "grad_norm": 2.0045254601484443, + "learning_rate": 8.119794267943506e-06, + "loss": 0.8893, + "step": 5479 + }, + { + "epoch": 0.5735217163788592, + "grad_norm": 1.908937792505844, + "learning_rate": 8.116465111211045e-06, + "loss": 0.8759, + "step": 5480 + }, + { + "epoch": 0.5736263736263736, + "grad_norm": 1.9982276022368928, + "learning_rate": 8.113136170900558e-06, + "loss": 0.9882, + "step": 5481 + }, + { + "epoch": 0.573731030873888, + "grad_norm": 2.0425269282462803, + "learning_rate": 8.109807447394555e-06, + "loss": 0.9599, + "step": 5482 + }, + { + "epoch": 0.5738356881214024, + "grad_norm": 1.8900946053549617, + "learning_rate": 8.106478941075506e-06, + "loss": 0.8938, + "step": 5483 + }, + { + "epoch": 0.5739403453689168, + "grad_norm": 2.0201948859873022, + "learning_rate": 8.103150652325867e-06, + "loss": 0.9496, + "step": 5484 + }, + { + "epoch": 0.5740450026164312, + "grad_norm": 1.914651039584181, + "learning_rate": 8.099822581528061e-06, + "loss": 0.8073, + "step": 5485 + }, + { + "epoch": 0.5741496598639456, + "grad_norm": 1.9592779998675791, + "learning_rate": 8.096494729064497e-06, + "loss": 0.9203, + "step": 5486 + }, + { + "epoch": 0.57425431711146, + "grad_norm": 1.9404520688455333, + "learning_rate": 8.093167095317543e-06, + "loss": 0.9277, + "step": 5487 + }, + { + "epoch": 0.5743589743589743, + "grad_norm": 2.102772910368145, + "learning_rate": 8.089839680669557e-06, + "loss": 1.0397, + "step": 5488 + }, + { + "epoch": 0.5744636316064887, + "grad_norm": 2.0883574055987197, + "learning_rate": 8.086512485502864e-06, + "loss": 0.8756, + "step": 5489 + }, + { + "epoch": 0.5745682888540031, + "grad_norm": 1.885771295192384, + "learning_rate": 8.083185510199762e-06, + "loss": 0.9681, + "step": 5490 + }, + { + "epoch": 0.5746729461015175, + "grad_norm": 2.025131247967418, + "learning_rate": 8.079858755142536e-06, + "loss": 0.9212, + "step": 5491 + }, + { + "epoch": 0.5747776033490319, + "grad_norm": 2.055060910903538, + "learning_rate": 8.076532220713429e-06, + "loss": 0.9848, + "step": 5492 + }, + { + "epoch": 0.5748822605965463, + "grad_norm": 1.6221987425177355, + "learning_rate": 8.073205907294669e-06, + "loss": 0.8392, + "step": 5493 + }, + { + "epoch": 0.5749869178440608, + "grad_norm": 2.112224534296191, + "learning_rate": 8.069879815268458e-06, + "loss": 0.9113, + "step": 5494 + }, + { + "epoch": 0.575091575091575, + "grad_norm": 1.9240692006326605, + "learning_rate": 8.066553945016968e-06, + "loss": 0.9778, + "step": 5495 + }, + { + "epoch": 0.5751962323390895, + "grad_norm": 1.9511742285030678, + "learning_rate": 8.063228296922349e-06, + "loss": 0.9533, + "step": 5496 + }, + { + "epoch": 0.5753008895866039, + "grad_norm": 2.2922538518406, + "learning_rate": 8.059902871366724e-06, + "loss": 0.8646, + "step": 5497 + }, + { + "epoch": 0.5754055468341183, + "grad_norm": 2.126390921205575, + "learning_rate": 8.056577668732196e-06, + "loss": 0.7988, + "step": 5498 + }, + { + "epoch": 0.5755102040816327, + "grad_norm": 2.0074966511095758, + "learning_rate": 8.053252689400826e-06, + "loss": 0.9166, + "step": 5499 + }, + { + "epoch": 0.5756148613291471, + "grad_norm": 2.0738861300544134, + "learning_rate": 8.049927933754672e-06, + "loss": 1.006, + "step": 5500 + }, + { + "epoch": 0.5757195185766615, + "grad_norm": 2.415212004526124, + "learning_rate": 8.046603402175755e-06, + "loss": 0.9133, + "step": 5501 + }, + { + "epoch": 0.5758241758241758, + "grad_norm": 1.9234109153078869, + "learning_rate": 8.043279095046064e-06, + "loss": 0.8681, + "step": 5502 + }, + { + "epoch": 0.5759288330716902, + "grad_norm": 1.889529105841766, + "learning_rate": 8.039955012747573e-06, + "loss": 0.9011, + "step": 5503 + }, + { + "epoch": 0.5760334903192046, + "grad_norm": 2.034277892033179, + "learning_rate": 8.036631155662223e-06, + "loss": 1.0261, + "step": 5504 + }, + { + "epoch": 0.576138147566719, + "grad_norm": 1.8787654537409386, + "learning_rate": 8.033307524171935e-06, + "loss": 0.9117, + "step": 5505 + }, + { + "epoch": 0.5762428048142334, + "grad_norm": 1.9010114330459553, + "learning_rate": 8.029984118658598e-06, + "loss": 0.9688, + "step": 5506 + }, + { + "epoch": 0.5763474620617478, + "grad_norm": 1.8350448320881683, + "learning_rate": 8.026660939504076e-06, + "loss": 0.9382, + "step": 5507 + }, + { + "epoch": 0.5764521193092622, + "grad_norm": 2.3648628400669183, + "learning_rate": 8.023337987090219e-06, + "loss": 0.7732, + "step": 5508 + }, + { + "epoch": 0.5765567765567765, + "grad_norm": 1.9441530508631961, + "learning_rate": 8.02001526179883e-06, + "loss": 0.9569, + "step": 5509 + }, + { + "epoch": 0.5766614338042909, + "grad_norm": 2.148620913574854, + "learning_rate": 8.016692764011705e-06, + "loss": 1.017, + "step": 5510 + }, + { + "epoch": 0.5767660910518053, + "grad_norm": 2.054332121041, + "learning_rate": 8.013370494110601e-06, + "loss": 0.9866, + "step": 5511 + }, + { + "epoch": 0.5768707482993197, + "grad_norm": 1.7926126111942025, + "learning_rate": 8.010048452477253e-06, + "loss": 0.9684, + "step": 5512 + }, + { + "epoch": 0.5769754055468341, + "grad_norm": 1.9449655810781312, + "learning_rate": 8.006726639493374e-06, + "loss": 0.8362, + "step": 5513 + }, + { + "epoch": 0.5770800627943485, + "grad_norm": 2.1002111681115965, + "learning_rate": 8.003405055540642e-06, + "loss": 0.8563, + "step": 5514 + }, + { + "epoch": 0.5771847200418629, + "grad_norm": 1.8369318308457716, + "learning_rate": 8.00008370100072e-06, + "loss": 0.9018, + "step": 5515 + }, + { + "epoch": 0.5772893772893772, + "grad_norm": 2.2436953873311256, + "learning_rate": 7.996762576255229e-06, + "loss": 0.8905, + "step": 5516 + }, + { + "epoch": 0.5773940345368916, + "grad_norm": 2.1273954997385434, + "learning_rate": 7.993441681685783e-06, + "loss": 0.9944, + "step": 5517 + }, + { + "epoch": 0.577498691784406, + "grad_norm": 2.0031275428527837, + "learning_rate": 7.990121017673956e-06, + "loss": 0.8795, + "step": 5518 + }, + { + "epoch": 0.5776033490319205, + "grad_norm": 2.2855770714536683, + "learning_rate": 7.986800584601298e-06, + "loss": 1.0042, + "step": 5519 + }, + { + "epoch": 0.5777080062794349, + "grad_norm": 1.7868487916899245, + "learning_rate": 7.983480382849335e-06, + "loss": 0.8817, + "step": 5520 + }, + { + "epoch": 0.5778126635269493, + "grad_norm": 2.0217427991401986, + "learning_rate": 7.980160412799561e-06, + "loss": 0.8515, + "step": 5521 + }, + { + "epoch": 0.5779173207744637, + "grad_norm": 2.0627313405323697, + "learning_rate": 7.976840674833454e-06, + "loss": 1.0603, + "step": 5522 + }, + { + "epoch": 0.578021978021978, + "grad_norm": 1.8428954962859756, + "learning_rate": 7.973521169332452e-06, + "loss": 0.857, + "step": 5523 + }, + { + "epoch": 0.5781266352694924, + "grad_norm": 2.075289447886697, + "learning_rate": 7.970201896677978e-06, + "loss": 0.9109, + "step": 5524 + }, + { + "epoch": 0.5782312925170068, + "grad_norm": 1.885372734744151, + "learning_rate": 7.966882857251416e-06, + "loss": 0.9159, + "step": 5525 + }, + { + "epoch": 0.5783359497645212, + "grad_norm": 1.6845318805206528, + "learning_rate": 7.963564051434139e-06, + "loss": 0.7578, + "step": 5526 + }, + { + "epoch": 0.5784406070120356, + "grad_norm": 2.0450170748383014, + "learning_rate": 7.960245479607482e-06, + "loss": 0.9455, + "step": 5527 + }, + { + "epoch": 0.57854526425955, + "grad_norm": 2.2160296759535885, + "learning_rate": 7.956927142152751e-06, + "loss": 1.004, + "step": 5528 + }, + { + "epoch": 0.5786499215070644, + "grad_norm": 1.71231827285406, + "learning_rate": 7.953609039451238e-06, + "loss": 0.7896, + "step": 5529 + }, + { + "epoch": 0.5787545787545788, + "grad_norm": 2.0196918037710567, + "learning_rate": 7.950291171884192e-06, + "loss": 0.8954, + "step": 5530 + }, + { + "epoch": 0.5788592360020931, + "grad_norm": 1.9380102025151027, + "learning_rate": 7.946973539832845e-06, + "loss": 0.8539, + "step": 5531 + }, + { + "epoch": 0.5789638932496075, + "grad_norm": 2.079187841186523, + "learning_rate": 7.943656143678402e-06, + "loss": 0.9011, + "step": 5532 + }, + { + "epoch": 0.5790685504971219, + "grad_norm": 2.0873564237094318, + "learning_rate": 7.940338983802032e-06, + "loss": 0.859, + "step": 5533 + }, + { + "epoch": 0.5791732077446363, + "grad_norm": 3.551683446968433, + "learning_rate": 7.937022060584891e-06, + "loss": 0.8587, + "step": 5534 + }, + { + "epoch": 0.5792778649921507, + "grad_norm": 1.786949141630154, + "learning_rate": 7.933705374408099e-06, + "loss": 0.8939, + "step": 5535 + }, + { + "epoch": 0.5793825222396651, + "grad_norm": 2.307443887128846, + "learning_rate": 7.930388925652746e-06, + "loss": 0.9096, + "step": 5536 + }, + { + "epoch": 0.5794871794871795, + "grad_norm": 2.429912604129563, + "learning_rate": 7.927072714699903e-06, + "loss": 0.9904, + "step": 5537 + }, + { + "epoch": 0.5795918367346938, + "grad_norm": 1.929972910007676, + "learning_rate": 7.923756741930607e-06, + "loss": 0.9509, + "step": 5538 + }, + { + "epoch": 0.5796964939822082, + "grad_norm": 2.0723179929400652, + "learning_rate": 7.92044100772587e-06, + "loss": 0.9514, + "step": 5539 + }, + { + "epoch": 0.5798011512297226, + "grad_norm": 1.9377894076263404, + "learning_rate": 7.917125512466677e-06, + "loss": 0.9522, + "step": 5540 + }, + { + "epoch": 0.579905808477237, + "grad_norm": 1.9466085430494366, + "learning_rate": 7.913810256533987e-06, + "loss": 1.0248, + "step": 5541 + }, + { + "epoch": 0.5800104657247515, + "grad_norm": 1.9946195154612312, + "learning_rate": 7.910495240308722e-06, + "loss": 0.9148, + "step": 5542 + }, + { + "epoch": 0.5801151229722659, + "grad_norm": 2.066175895381359, + "learning_rate": 7.907180464171792e-06, + "loss": 0.992, + "step": 5543 + }, + { + "epoch": 0.5802197802197803, + "grad_norm": 2.091752799122247, + "learning_rate": 7.903865928504073e-06, + "loss": 0.9818, + "step": 5544 + }, + { + "epoch": 0.5803244374672946, + "grad_norm": 2.0815270541413438, + "learning_rate": 7.900551633686405e-06, + "loss": 1.053, + "step": 5545 + }, + { + "epoch": 0.580429094714809, + "grad_norm": 1.8552562530126415, + "learning_rate": 7.89723758009961e-06, + "loss": 0.8185, + "step": 5546 + }, + { + "epoch": 0.5805337519623234, + "grad_norm": 2.3294466334249555, + "learning_rate": 7.893923768124481e-06, + "loss": 0.9063, + "step": 5547 + }, + { + "epoch": 0.5806384092098378, + "grad_norm": 2.883079539310358, + "learning_rate": 7.890610198141782e-06, + "loss": 0.9315, + "step": 5548 + }, + { + "epoch": 0.5807430664573522, + "grad_norm": 1.7119553078829457, + "learning_rate": 7.887296870532246e-06, + "loss": 0.9367, + "step": 5549 + }, + { + "epoch": 0.5808477237048666, + "grad_norm": 2.2200861287766114, + "learning_rate": 7.88398378567658e-06, + "loss": 0.7905, + "step": 5550 + }, + { + "epoch": 0.580952380952381, + "grad_norm": 2.039585468725626, + "learning_rate": 7.880670943955467e-06, + "loss": 1.0008, + "step": 5551 + }, + { + "epoch": 0.5810570381998953, + "grad_norm": 1.9366320130825088, + "learning_rate": 7.877358345749562e-06, + "loss": 0.8768, + "step": 5552 + }, + { + "epoch": 0.5811616954474097, + "grad_norm": 2.056342630692964, + "learning_rate": 7.874045991439484e-06, + "loss": 0.9288, + "step": 5553 + }, + { + "epoch": 0.5812663526949241, + "grad_norm": 2.113800312709259, + "learning_rate": 7.870733881405835e-06, + "loss": 0.9684, + "step": 5554 + }, + { + "epoch": 0.5813710099424385, + "grad_norm": 2.0422803075735403, + "learning_rate": 7.867422016029174e-06, + "loss": 0.8649, + "step": 5555 + }, + { + "epoch": 0.5814756671899529, + "grad_norm": 2.208609946344471, + "learning_rate": 7.864110395690051e-06, + "loss": 1.0076, + "step": 5556 + }, + { + "epoch": 0.5815803244374673, + "grad_norm": 2.2154587482971935, + "learning_rate": 7.86079902076897e-06, + "loss": 0.7496, + "step": 5557 + }, + { + "epoch": 0.5816849816849817, + "grad_norm": 2.0643605224522616, + "learning_rate": 7.857487891646422e-06, + "loss": 0.8465, + "step": 5558 + }, + { + "epoch": 0.581789638932496, + "grad_norm": 1.900572692373446, + "learning_rate": 7.854177008702852e-06, + "loss": 0.9254, + "step": 5559 + }, + { + "epoch": 0.5818942961800104, + "grad_norm": 2.070474926104496, + "learning_rate": 7.850866372318697e-06, + "loss": 1.0204, + "step": 5560 + }, + { + "epoch": 0.5819989534275248, + "grad_norm": 1.9724412431279685, + "learning_rate": 7.847555982874355e-06, + "loss": 1.0067, + "step": 5561 + }, + { + "epoch": 0.5821036106750392, + "grad_norm": 2.0205088862472285, + "learning_rate": 7.844245840750194e-06, + "loss": 0.9708, + "step": 5562 + }, + { + "epoch": 0.5822082679225536, + "grad_norm": 2.1780476787506076, + "learning_rate": 7.840935946326554e-06, + "loss": 0.8583, + "step": 5563 + }, + { + "epoch": 0.582312925170068, + "grad_norm": 2.113023798763882, + "learning_rate": 7.83762629998375e-06, + "loss": 0.9245, + "step": 5564 + }, + { + "epoch": 0.5824175824175825, + "grad_norm": 2.197691507927687, + "learning_rate": 7.834316902102072e-06, + "loss": 0.8333, + "step": 5565 + }, + { + "epoch": 0.5825222396650968, + "grad_norm": 2.1409592615846664, + "learning_rate": 7.83100775306177e-06, + "loss": 0.9258, + "step": 5566 + }, + { + "epoch": 0.5826268969126112, + "grad_norm": 2.045119732160066, + "learning_rate": 7.827698853243073e-06, + "loss": 0.957, + "step": 5567 + }, + { + "epoch": 0.5827315541601256, + "grad_norm": 2.256212640620829, + "learning_rate": 7.82439020302618e-06, + "loss": 0.9719, + "step": 5568 + }, + { + "epoch": 0.58283621140764, + "grad_norm": 1.9793551575101223, + "learning_rate": 7.821081802791265e-06, + "loss": 1.0033, + "step": 5569 + }, + { + "epoch": 0.5829408686551544, + "grad_norm": 1.9336354212815183, + "learning_rate": 7.817773652918467e-06, + "loss": 0.9265, + "step": 5570 + }, + { + "epoch": 0.5830455259026688, + "grad_norm": 2.132258291924315, + "learning_rate": 7.814465753787902e-06, + "loss": 0.9252, + "step": 5571 + }, + { + "epoch": 0.5831501831501832, + "grad_norm": 2.2295413408637854, + "learning_rate": 7.81115810577965e-06, + "loss": 0.9697, + "step": 5572 + }, + { + "epoch": 0.5832548403976976, + "grad_norm": 1.906463312716307, + "learning_rate": 7.807850709273772e-06, + "loss": 0.8958, + "step": 5573 + }, + { + "epoch": 0.5833594976452119, + "grad_norm": 2.1146161586182495, + "learning_rate": 7.80454356465029e-06, + "loss": 0.9138, + "step": 5574 + }, + { + "epoch": 0.5834641548927263, + "grad_norm": 2.3811043954927906, + "learning_rate": 7.801236672289204e-06, + "loss": 1.0349, + "step": 5575 + }, + { + "epoch": 0.5835688121402407, + "grad_norm": 2.108644748299407, + "learning_rate": 7.79793003257048e-06, + "loss": 0.839, + "step": 5576 + }, + { + "epoch": 0.5836734693877551, + "grad_norm": 2.1813751914717234, + "learning_rate": 7.794623645874057e-06, + "loss": 0.99, + "step": 5577 + }, + { + "epoch": 0.5837781266352695, + "grad_norm": 1.8978749173293086, + "learning_rate": 7.791317512579853e-06, + "loss": 0.9008, + "step": 5578 + }, + { + "epoch": 0.5838827838827839, + "grad_norm": 2.0693475512491664, + "learning_rate": 7.78801163306774e-06, + "loss": 0.9453, + "step": 5579 + }, + { + "epoch": 0.5839874411302983, + "grad_norm": 2.3044788296228194, + "learning_rate": 7.784706007717578e-06, + "loss": 0.948, + "step": 5580 + }, + { + "epoch": 0.5840920983778126, + "grad_norm": 1.8045170295820876, + "learning_rate": 7.781400636909184e-06, + "loss": 0.9382, + "step": 5581 + }, + { + "epoch": 0.584196755625327, + "grad_norm": 2.483078766173217, + "learning_rate": 7.778095521022357e-06, + "loss": 0.8534, + "step": 5582 + }, + { + "epoch": 0.5843014128728414, + "grad_norm": 2.1631414208383806, + "learning_rate": 7.774790660436857e-06, + "loss": 0.8908, + "step": 5583 + }, + { + "epoch": 0.5844060701203558, + "grad_norm": 2.168390803098506, + "learning_rate": 7.771486055532424e-06, + "loss": 0.9576, + "step": 5584 + }, + { + "epoch": 0.5845107273678702, + "grad_norm": 2.123503361696419, + "learning_rate": 7.768181706688754e-06, + "loss": 0.9218, + "step": 5585 + }, + { + "epoch": 0.5846153846153846, + "grad_norm": 2.1013306321139487, + "learning_rate": 7.764877614285533e-06, + "loss": 0.8682, + "step": 5586 + }, + { + "epoch": 0.584720041862899, + "grad_norm": 2.255480731355252, + "learning_rate": 7.76157377870241e-06, + "loss": 1.0501, + "step": 5587 + }, + { + "epoch": 0.5848246991104133, + "grad_norm": 1.945562700084701, + "learning_rate": 7.758270200318995e-06, + "loss": 0.8875, + "step": 5588 + }, + { + "epoch": 0.5849293563579278, + "grad_norm": 2.1195133177923235, + "learning_rate": 7.754966879514878e-06, + "loss": 0.8932, + "step": 5589 + }, + { + "epoch": 0.5850340136054422, + "grad_norm": 2.1209423994658114, + "learning_rate": 7.75166381666962e-06, + "loss": 1.0249, + "step": 5590 + }, + { + "epoch": 0.5851386708529566, + "grad_norm": 2.009199929745769, + "learning_rate": 7.748361012162743e-06, + "loss": 0.9536, + "step": 5591 + }, + { + "epoch": 0.585243328100471, + "grad_norm": 1.8131883322530284, + "learning_rate": 7.745058466373754e-06, + "loss": 0.7916, + "step": 5592 + }, + { + "epoch": 0.5853479853479854, + "grad_norm": 2.318244040850203, + "learning_rate": 7.741756179682116e-06, + "loss": 0.8724, + "step": 5593 + }, + { + "epoch": 0.5854526425954998, + "grad_norm": 2.1235713244627523, + "learning_rate": 7.738454152467267e-06, + "loss": 0.9192, + "step": 5594 + }, + { + "epoch": 0.5855572998430141, + "grad_norm": 2.2040042407951987, + "learning_rate": 7.735152385108627e-06, + "loss": 1.007, + "step": 5595 + }, + { + "epoch": 0.5856619570905285, + "grad_norm": 2.292748328386475, + "learning_rate": 7.731850877985567e-06, + "loss": 0.9257, + "step": 5596 + }, + { + "epoch": 0.5857666143380429, + "grad_norm": 2.077810847731391, + "learning_rate": 7.728549631477441e-06, + "loss": 0.9772, + "step": 5597 + }, + { + "epoch": 0.5858712715855573, + "grad_norm": 2.0349358401554727, + "learning_rate": 7.725248645963565e-06, + "loss": 0.9287, + "step": 5598 + }, + { + "epoch": 0.5859759288330717, + "grad_norm": 2.4781346688637935, + "learning_rate": 7.721947921823232e-06, + "loss": 0.9193, + "step": 5599 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 1.9931375215068494, + "learning_rate": 7.7186474594357e-06, + "loss": 0.9548, + "step": 5600 + }, + { + "epoch": 0.5861852433281005, + "grad_norm": 1.7997207751821085, + "learning_rate": 7.715347259180202e-06, + "loss": 0.9453, + "step": 5601 + }, + { + "epoch": 0.5862899005756148, + "grad_norm": 2.010052931739926, + "learning_rate": 7.712047321435932e-06, + "loss": 1.0162, + "step": 5602 + }, + { + "epoch": 0.5863945578231292, + "grad_norm": 2.333239467555819, + "learning_rate": 7.708747646582061e-06, + "loss": 0.9328, + "step": 5603 + }, + { + "epoch": 0.5864992150706436, + "grad_norm": 2.10594950324619, + "learning_rate": 7.705448234997735e-06, + "loss": 0.9705, + "step": 5604 + }, + { + "epoch": 0.586603872318158, + "grad_norm": 2.166799778137154, + "learning_rate": 7.702149087062056e-06, + "loss": 0.9954, + "step": 5605 + }, + { + "epoch": 0.5867085295656724, + "grad_norm": 2.0918172155366177, + "learning_rate": 7.698850203154108e-06, + "loss": 0.839, + "step": 5606 + }, + { + "epoch": 0.5868131868131868, + "grad_norm": 1.9671646846087965, + "learning_rate": 7.695551583652935e-06, + "loss": 0.8696, + "step": 5607 + }, + { + "epoch": 0.5869178440607012, + "grad_norm": 2.2227576188186267, + "learning_rate": 7.692253228937558e-06, + "loss": 1.0203, + "step": 5608 + }, + { + "epoch": 0.5870225013082155, + "grad_norm": 2.2068097944504164, + "learning_rate": 7.688955139386961e-06, + "loss": 0.9753, + "step": 5609 + }, + { + "epoch": 0.5871271585557299, + "grad_norm": 2.439406039090037, + "learning_rate": 7.685657315380105e-06, + "loss": 0.9711, + "step": 5610 + }, + { + "epoch": 0.5872318158032444, + "grad_norm": 2.1525531876859767, + "learning_rate": 7.682359757295911e-06, + "loss": 1.005, + "step": 5611 + }, + { + "epoch": 0.5873364730507588, + "grad_norm": 2.036134319104039, + "learning_rate": 7.679062465513282e-06, + "loss": 0.9106, + "step": 5612 + }, + { + "epoch": 0.5874411302982732, + "grad_norm": 1.980663244945154, + "learning_rate": 7.675765440411082e-06, + "loss": 1.0143, + "step": 5613 + }, + { + "epoch": 0.5875457875457876, + "grad_norm": 1.9155213798006931, + "learning_rate": 7.672468682368145e-06, + "loss": 0.8642, + "step": 5614 + }, + { + "epoch": 0.587650444793302, + "grad_norm": 2.0336061775870693, + "learning_rate": 7.669172191763273e-06, + "loss": 0.9126, + "step": 5615 + }, + { + "epoch": 0.5877551020408164, + "grad_norm": 1.8849398925253606, + "learning_rate": 7.665875968975243e-06, + "loss": 0.7622, + "step": 5616 + }, + { + "epoch": 0.5878597592883307, + "grad_norm": 2.325465135874042, + "learning_rate": 7.662580014382794e-06, + "loss": 0.9221, + "step": 5617 + }, + { + "epoch": 0.5879644165358451, + "grad_norm": 2.315387773578512, + "learning_rate": 7.659284328364642e-06, + "loss": 0.854, + "step": 5618 + }, + { + "epoch": 0.5880690737833595, + "grad_norm": 2.520796475508092, + "learning_rate": 7.655988911299463e-06, + "loss": 0.9864, + "step": 5619 + }, + { + "epoch": 0.5881737310308739, + "grad_norm": 1.9719906781148255, + "learning_rate": 7.652693763565909e-06, + "loss": 1.0244, + "step": 5620 + }, + { + "epoch": 0.5882783882783883, + "grad_norm": 2.1962848838041085, + "learning_rate": 7.649398885542603e-06, + "loss": 0.9411, + "step": 5621 + }, + { + "epoch": 0.5883830455259027, + "grad_norm": 2.0497329566693945, + "learning_rate": 7.64610427760813e-06, + "loss": 0.9327, + "step": 5622 + }, + { + "epoch": 0.5884877027734171, + "grad_norm": 2.3462319809252232, + "learning_rate": 7.642809940141047e-06, + "loss": 1.0223, + "step": 5623 + }, + { + "epoch": 0.5885923600209314, + "grad_norm": 1.7385446984243167, + "learning_rate": 7.639515873519882e-06, + "loss": 0.8747, + "step": 5624 + }, + { + "epoch": 0.5886970172684458, + "grad_norm": 2.0670189309137323, + "learning_rate": 7.636222078123126e-06, + "loss": 0.9075, + "step": 5625 + }, + { + "epoch": 0.5888016745159602, + "grad_norm": 1.9507533341417447, + "learning_rate": 7.632928554329246e-06, + "loss": 0.9021, + "step": 5626 + }, + { + "epoch": 0.5889063317634746, + "grad_norm": 2.3873787735902074, + "learning_rate": 7.629635302516673e-06, + "loss": 1.0425, + "step": 5627 + }, + { + "epoch": 0.589010989010989, + "grad_norm": 2.252557905765437, + "learning_rate": 7.626342323063811e-06, + "loss": 0.9155, + "step": 5628 + }, + { + "epoch": 0.5891156462585034, + "grad_norm": 1.9671306821641084, + "learning_rate": 7.623049616349023e-06, + "loss": 0.8436, + "step": 5629 + }, + { + "epoch": 0.5892203035060178, + "grad_norm": 2.228355294848782, + "learning_rate": 7.619757182750655e-06, + "loss": 0.9464, + "step": 5630 + }, + { + "epoch": 0.5893249607535321, + "grad_norm": 1.944122763671574, + "learning_rate": 7.616465022647015e-06, + "loss": 0.917, + "step": 5631 + }, + { + "epoch": 0.5894296180010465, + "grad_norm": 2.0093474365195956, + "learning_rate": 7.613173136416373e-06, + "loss": 0.9609, + "step": 5632 + }, + { + "epoch": 0.589534275248561, + "grad_norm": 2.3171664214346746, + "learning_rate": 7.6098815244369785e-06, + "loss": 0.9894, + "step": 5633 + }, + { + "epoch": 0.5896389324960754, + "grad_norm": 2.0398463213152893, + "learning_rate": 7.60659018708704e-06, + "loss": 0.9755, + "step": 5634 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 1.974866296695783, + "learning_rate": 7.603299124744743e-06, + "loss": 0.9829, + "step": 5635 + }, + { + "epoch": 0.5898482469911042, + "grad_norm": 1.8436780153736634, + "learning_rate": 7.600008337788233e-06, + "loss": 0.8847, + "step": 5636 + }, + { + "epoch": 0.5899529042386186, + "grad_norm": 1.7596168571097486, + "learning_rate": 7.596717826595627e-06, + "loss": 0.8302, + "step": 5637 + }, + { + "epoch": 0.5900575614861329, + "grad_norm": 2.286365759016598, + "learning_rate": 7.593427591545021e-06, + "loss": 0.9302, + "step": 5638 + }, + { + "epoch": 0.5901622187336473, + "grad_norm": 2.069205329083003, + "learning_rate": 7.5901376330144606e-06, + "loss": 0.8283, + "step": 5639 + }, + { + "epoch": 0.5902668759811617, + "grad_norm": 2.241407089031377, + "learning_rate": 7.586847951381973e-06, + "loss": 0.9715, + "step": 5640 + }, + { + "epoch": 0.5903715332286761, + "grad_norm": 2.0834114666515267, + "learning_rate": 7.583558547025546e-06, + "loss": 0.9345, + "step": 5641 + }, + { + "epoch": 0.5904761904761905, + "grad_norm": 2.1953788167615262, + "learning_rate": 7.580269420323143e-06, + "loss": 1.0446, + "step": 5642 + }, + { + "epoch": 0.5905808477237049, + "grad_norm": 2.2480917530067375, + "learning_rate": 7.5769805716526876e-06, + "loss": 0.8577, + "step": 5643 + }, + { + "epoch": 0.5906855049712193, + "grad_norm": 2.4677953070932155, + "learning_rate": 7.5736920013920756e-06, + "loss": 0.9092, + "step": 5644 + }, + { + "epoch": 0.5907901622187336, + "grad_norm": 2.0662054886088934, + "learning_rate": 7.570403709919173e-06, + "loss": 0.7921, + "step": 5645 + }, + { + "epoch": 0.590894819466248, + "grad_norm": 2.1423059288297246, + "learning_rate": 7.5671156976118056e-06, + "loss": 0.9767, + "step": 5646 + }, + { + "epoch": 0.5909994767137624, + "grad_norm": 2.7938569718008646, + "learning_rate": 7.563827964847779e-06, + "loss": 0.9416, + "step": 5647 + }, + { + "epoch": 0.5911041339612768, + "grad_norm": 2.012361091935804, + "learning_rate": 7.560540512004859e-06, + "loss": 0.9601, + "step": 5648 + }, + { + "epoch": 0.5912087912087912, + "grad_norm": 2.025112150721346, + "learning_rate": 7.557253339460777e-06, + "loss": 1.0345, + "step": 5649 + }, + { + "epoch": 0.5913134484563056, + "grad_norm": 2.098963973383834, + "learning_rate": 7.5539664475932415e-06, + "loss": 0.953, + "step": 5650 + }, + { + "epoch": 0.59141810570382, + "grad_norm": 2.155862643447173, + "learning_rate": 7.550679836779917e-06, + "loss": 0.9, + "step": 5651 + }, + { + "epoch": 0.5915227629513344, + "grad_norm": 2.0776024089586214, + "learning_rate": 7.547393507398447e-06, + "loss": 0.8768, + "step": 5652 + }, + { + "epoch": 0.5916274201988487, + "grad_norm": 2.040547845106933, + "learning_rate": 7.544107459826433e-06, + "loss": 0.885, + "step": 5653 + }, + { + "epoch": 0.5917320774463631, + "grad_norm": 1.877629870992842, + "learning_rate": 7.5408216944414515e-06, + "loss": 0.7986, + "step": 5654 + }, + { + "epoch": 0.5918367346938775, + "grad_norm": 2.108404306286894, + "learning_rate": 7.537536211621039e-06, + "loss": 0.9868, + "step": 5655 + }, + { + "epoch": 0.591941391941392, + "grad_norm": 2.3003723833156635, + "learning_rate": 7.53425101174271e-06, + "loss": 0.854, + "step": 5656 + }, + { + "epoch": 0.5920460491889064, + "grad_norm": 2.14897580363839, + "learning_rate": 7.530966095183942e-06, + "loss": 0.8716, + "step": 5657 + }, + { + "epoch": 0.5921507064364208, + "grad_norm": 1.856710380053524, + "learning_rate": 7.5276814623221705e-06, + "loss": 0.9414, + "step": 5658 + }, + { + "epoch": 0.5922553636839352, + "grad_norm": 2.125965075529774, + "learning_rate": 7.5243971135348136e-06, + "loss": 0.8968, + "step": 5659 + }, + { + "epoch": 0.5923600209314495, + "grad_norm": 2.076781557575531, + "learning_rate": 7.521113049199244e-06, + "loss": 0.9287, + "step": 5660 + }, + { + "epoch": 0.5924646781789639, + "grad_norm": 1.9002232671975265, + "learning_rate": 7.517829269692815e-06, + "loss": 1.0265, + "step": 5661 + }, + { + "epoch": 0.5925693354264783, + "grad_norm": 2.222162234459858, + "learning_rate": 7.514545775392833e-06, + "loss": 0.9957, + "step": 5662 + }, + { + "epoch": 0.5926739926739927, + "grad_norm": 2.124845865430392, + "learning_rate": 7.511262566676574e-06, + "loss": 1.0127, + "step": 5663 + }, + { + "epoch": 0.5927786499215071, + "grad_norm": 1.9891156119781586, + "learning_rate": 7.507979643921297e-06, + "loss": 0.8318, + "step": 5664 + }, + { + "epoch": 0.5928833071690215, + "grad_norm": 2.0909557900542133, + "learning_rate": 7.504697007504212e-06, + "loss": 0.8629, + "step": 5665 + }, + { + "epoch": 0.5929879644165359, + "grad_norm": 1.9403378716887565, + "learning_rate": 7.501414657802495e-06, + "loss": 0.9464, + "step": 5666 + }, + { + "epoch": 0.5930926216640502, + "grad_norm": 2.043310271462764, + "learning_rate": 7.498132595193304e-06, + "loss": 0.8681, + "step": 5667 + }, + { + "epoch": 0.5931972789115646, + "grad_norm": 1.9687560488818703, + "learning_rate": 7.494850820053745e-06, + "loss": 1.0069, + "step": 5668 + }, + { + "epoch": 0.593301936159079, + "grad_norm": 2.053403263724564, + "learning_rate": 7.491569332760909e-06, + "loss": 0.7221, + "step": 5669 + }, + { + "epoch": 0.5934065934065934, + "grad_norm": 2.046467258975427, + "learning_rate": 7.488288133691839e-06, + "loss": 0.9088, + "step": 5670 + }, + { + "epoch": 0.5935112506541078, + "grad_norm": 1.9238393365949373, + "learning_rate": 7.485007223223556e-06, + "loss": 0.8819, + "step": 5671 + }, + { + "epoch": 0.5936159079016222, + "grad_norm": 1.6696389669150944, + "learning_rate": 7.481726601733037e-06, + "loss": 0.7529, + "step": 5672 + }, + { + "epoch": 0.5937205651491366, + "grad_norm": 2.59751100553255, + "learning_rate": 7.478446269597239e-06, + "loss": 0.7676, + "step": 5673 + }, + { + "epoch": 0.5938252223966509, + "grad_norm": 1.9990709348617675, + "learning_rate": 7.475166227193079e-06, + "loss": 0.889, + "step": 5674 + }, + { + "epoch": 0.5939298796441653, + "grad_norm": 1.910729649122657, + "learning_rate": 7.471886474897436e-06, + "loss": 0.9689, + "step": 5675 + }, + { + "epoch": 0.5940345368916797, + "grad_norm": 1.9841915683745623, + "learning_rate": 7.468607013087164e-06, + "loss": 0.9288, + "step": 5676 + }, + { + "epoch": 0.5941391941391941, + "grad_norm": 2.0782450749530095, + "learning_rate": 7.465327842139074e-06, + "loss": 0.8997, + "step": 5677 + }, + { + "epoch": 0.5942438513867085, + "grad_norm": 2.1382950383338013, + "learning_rate": 7.4620489624299576e-06, + "loss": 0.9772, + "step": 5678 + }, + { + "epoch": 0.594348508634223, + "grad_norm": 1.9347499610481047, + "learning_rate": 7.458770374336557e-06, + "loss": 0.8774, + "step": 5679 + }, + { + "epoch": 0.5944531658817374, + "grad_norm": 1.943431111602753, + "learning_rate": 7.455492078235596e-06, + "loss": 0.9336, + "step": 5680 + }, + { + "epoch": 0.5945578231292517, + "grad_norm": 1.8838788238338073, + "learning_rate": 7.452214074503748e-06, + "loss": 0.8452, + "step": 5681 + }, + { + "epoch": 0.5946624803767661, + "grad_norm": 1.9609711265415661, + "learning_rate": 7.44893636351767e-06, + "loss": 0.91, + "step": 5682 + }, + { + "epoch": 0.5947671376242805, + "grad_norm": 2.2357068851381894, + "learning_rate": 7.445658945653975e-06, + "loss": 0.8955, + "step": 5683 + }, + { + "epoch": 0.5948717948717949, + "grad_norm": 2.129172955119575, + "learning_rate": 7.442381821289248e-06, + "loss": 0.8215, + "step": 5684 + }, + { + "epoch": 0.5949764521193093, + "grad_norm": 2.0015511967472412, + "learning_rate": 7.439104990800032e-06, + "loss": 0.9191, + "step": 5685 + }, + { + "epoch": 0.5950811093668237, + "grad_norm": 2.0354524088249515, + "learning_rate": 7.435828454562846e-06, + "loss": 0.8986, + "step": 5686 + }, + { + "epoch": 0.5951857666143381, + "grad_norm": 2.539924639282485, + "learning_rate": 7.432552212954167e-06, + "loss": 0.9566, + "step": 5687 + }, + { + "epoch": 0.5952904238618524, + "grad_norm": 2.6018174099109874, + "learning_rate": 7.429276266350445e-06, + "loss": 0.8285, + "step": 5688 + }, + { + "epoch": 0.5953950811093668, + "grad_norm": 2.0533166547957884, + "learning_rate": 7.426000615128087e-06, + "loss": 0.9661, + "step": 5689 + }, + { + "epoch": 0.5954997383568812, + "grad_norm": 1.7827839006990294, + "learning_rate": 7.422725259663479e-06, + "loss": 0.8243, + "step": 5690 + }, + { + "epoch": 0.5956043956043956, + "grad_norm": 2.2481511288263603, + "learning_rate": 7.419450200332965e-06, + "loss": 0.9338, + "step": 5691 + }, + { + "epoch": 0.59570905285191, + "grad_norm": 1.9634491736223318, + "learning_rate": 7.416175437512854e-06, + "loss": 1.0209, + "step": 5692 + }, + { + "epoch": 0.5958137100994244, + "grad_norm": 1.9664186395780534, + "learning_rate": 7.412900971579422e-06, + "loss": 0.8375, + "step": 5693 + }, + { + "epoch": 0.5959183673469388, + "grad_norm": 2.002056184772738, + "learning_rate": 7.4096268029089136e-06, + "loss": 0.9765, + "step": 5694 + }, + { + "epoch": 0.5960230245944532, + "grad_norm": 2.0319582433132224, + "learning_rate": 7.406352931877538e-06, + "loss": 0.885, + "step": 5695 + }, + { + "epoch": 0.5961276818419675, + "grad_norm": 1.9584262931491643, + "learning_rate": 7.403079358861466e-06, + "loss": 1.0275, + "step": 5696 + }, + { + "epoch": 0.5962323390894819, + "grad_norm": 1.9530535654924333, + "learning_rate": 7.399806084236843e-06, + "loss": 0.7674, + "step": 5697 + }, + { + "epoch": 0.5963369963369963, + "grad_norm": 2.022147305775228, + "learning_rate": 7.396533108379767e-06, + "loss": 0.9294, + "step": 5698 + }, + { + "epoch": 0.5964416535845107, + "grad_norm": 2.0733270021944303, + "learning_rate": 7.393260431666317e-06, + "loss": 0.9985, + "step": 5699 + }, + { + "epoch": 0.5965463108320251, + "grad_norm": 2.057304250473014, + "learning_rate": 7.38998805447253e-06, + "loss": 1.0389, + "step": 5700 + }, + { + "epoch": 0.5966509680795395, + "grad_norm": 2.465161068029499, + "learning_rate": 7.386715977174406e-06, + "loss": 0.9919, + "step": 5701 + }, + { + "epoch": 0.596755625327054, + "grad_norm": 2.3904950100409454, + "learning_rate": 7.383444200147914e-06, + "loss": 0.9943, + "step": 5702 + }, + { + "epoch": 0.5968602825745682, + "grad_norm": 2.083028232439757, + "learning_rate": 7.380172723768988e-06, + "loss": 0.9033, + "step": 5703 + }, + { + "epoch": 0.5969649398220827, + "grad_norm": 2.3016492387048535, + "learning_rate": 7.376901548413525e-06, + "loss": 1.0375, + "step": 5704 + }, + { + "epoch": 0.5970695970695971, + "grad_norm": 1.8832286691796583, + "learning_rate": 7.373630674457393e-06, + "loss": 0.8139, + "step": 5705 + }, + { + "epoch": 0.5971742543171115, + "grad_norm": 2.183575586002926, + "learning_rate": 7.370360102276419e-06, + "loss": 0.9699, + "step": 5706 + }, + { + "epoch": 0.5972789115646259, + "grad_norm": 2.067091529022829, + "learning_rate": 7.367089832246399e-06, + "loss": 0.8456, + "step": 5707 + }, + { + "epoch": 0.5973835688121403, + "grad_norm": 1.9426102438871136, + "learning_rate": 7.363819864743098e-06, + "loss": 0.9009, + "step": 5708 + }, + { + "epoch": 0.5974882260596547, + "grad_norm": 1.990935623039761, + "learning_rate": 7.360550200142238e-06, + "loss": 0.889, + "step": 5709 + }, + { + "epoch": 0.597592883307169, + "grad_norm": 1.7709079037998374, + "learning_rate": 7.357280838819512e-06, + "loss": 0.9062, + "step": 5710 + }, + { + "epoch": 0.5976975405546834, + "grad_norm": 2.2877813592159004, + "learning_rate": 7.354011781150572e-06, + "loss": 1.0542, + "step": 5711 + }, + { + "epoch": 0.5978021978021978, + "grad_norm": 2.020894122061592, + "learning_rate": 7.350743027511045e-06, + "loss": 0.9956, + "step": 5712 + }, + { + "epoch": 0.5979068550497122, + "grad_norm": 1.9546500888309954, + "learning_rate": 7.3474745782765125e-06, + "loss": 1.0172, + "step": 5713 + }, + { + "epoch": 0.5980115122972266, + "grad_norm": 1.9265892498802972, + "learning_rate": 7.3442064338225315e-06, + "loss": 0.9464, + "step": 5714 + }, + { + "epoch": 0.598116169544741, + "grad_norm": 2.2740116881308605, + "learning_rate": 7.340938594524611e-06, + "loss": 0.935, + "step": 5715 + }, + { + "epoch": 0.5982208267922554, + "grad_norm": 2.3159963017309275, + "learning_rate": 7.337671060758236e-06, + "loss": 0.8863, + "step": 5716 + }, + { + "epoch": 0.5983254840397697, + "grad_norm": 2.1779592238981573, + "learning_rate": 7.334403832898856e-06, + "loss": 0.92, + "step": 5717 + }, + { + "epoch": 0.5984301412872841, + "grad_norm": 2.3970607095582284, + "learning_rate": 7.331136911321879e-06, + "loss": 0.9618, + "step": 5718 + }, + { + "epoch": 0.5985347985347985, + "grad_norm": 1.7681357135220832, + "learning_rate": 7.327870296402682e-06, + "loss": 0.8879, + "step": 5719 + }, + { + "epoch": 0.5986394557823129, + "grad_norm": 1.8481004018316487, + "learning_rate": 7.324603988516605e-06, + "loss": 0.9292, + "step": 5720 + }, + { + "epoch": 0.5987441130298273, + "grad_norm": 2.0081982515082624, + "learning_rate": 7.321337988038952e-06, + "loss": 0.831, + "step": 5721 + }, + { + "epoch": 0.5988487702773417, + "grad_norm": 2.053737396236648, + "learning_rate": 7.318072295344996e-06, + "loss": 0.9926, + "step": 5722 + }, + { + "epoch": 0.5989534275248561, + "grad_norm": 1.932122093359507, + "learning_rate": 7.314806910809968e-06, + "loss": 0.8565, + "step": 5723 + }, + { + "epoch": 0.5990580847723704, + "grad_norm": 2.112245623049056, + "learning_rate": 7.311541834809066e-06, + "loss": 0.9997, + "step": 5724 + }, + { + "epoch": 0.5991627420198848, + "grad_norm": 2.2611995197979664, + "learning_rate": 7.308277067717464e-06, + "loss": 0.9893, + "step": 5725 + }, + { + "epoch": 0.5992673992673992, + "grad_norm": 2.1706750569520508, + "learning_rate": 7.30501260991028e-06, + "loss": 0.902, + "step": 5726 + }, + { + "epoch": 0.5993720565149137, + "grad_norm": 2.0515848466057873, + "learning_rate": 7.301748461762612e-06, + "loss": 0.918, + "step": 5727 + }, + { + "epoch": 0.5994767137624281, + "grad_norm": 2.0173947607217504, + "learning_rate": 7.298484623649515e-06, + "loss": 0.8884, + "step": 5728 + }, + { + "epoch": 0.5995813710099425, + "grad_norm": 2.538071049144709, + "learning_rate": 7.295221095946012e-06, + "loss": 0.9454, + "step": 5729 + }, + { + "epoch": 0.5996860282574569, + "grad_norm": 1.9533801934375317, + "learning_rate": 7.291957879027087e-06, + "loss": 0.9301, + "step": 5730 + }, + { + "epoch": 0.5997906855049712, + "grad_norm": 2.25854110503167, + "learning_rate": 7.288694973267693e-06, + "loss": 1.059, + "step": 5731 + }, + { + "epoch": 0.5998953427524856, + "grad_norm": 1.9262169394873385, + "learning_rate": 7.2854323790427425e-06, + "loss": 0.8526, + "step": 5732 + }, + { + "epoch": 0.6, + "grad_norm": 2.0501402686145234, + "learning_rate": 7.28217009672711e-06, + "loss": 0.8932, + "step": 5733 + }, + { + "epoch": 0.6001046572475144, + "grad_norm": 1.990109963355697, + "learning_rate": 7.2789081266956495e-06, + "loss": 0.9841, + "step": 5734 + }, + { + "epoch": 0.6002093144950288, + "grad_norm": 1.9142264065765036, + "learning_rate": 7.2756464693231585e-06, + "loss": 0.9663, + "step": 5735 + }, + { + "epoch": 0.6003139717425432, + "grad_norm": 2.1083186271489995, + "learning_rate": 7.272385124984414e-06, + "loss": 0.9541, + "step": 5736 + }, + { + "epoch": 0.6004186289900576, + "grad_norm": 2.438641624027379, + "learning_rate": 7.269124094054144e-06, + "loss": 0.9692, + "step": 5737 + }, + { + "epoch": 0.600523286237572, + "grad_norm": 1.7085030156404912, + "learning_rate": 7.265863376907054e-06, + "loss": 0.8373, + "step": 5738 + }, + { + "epoch": 0.6006279434850863, + "grad_norm": 2.175525308596229, + "learning_rate": 7.262602973917807e-06, + "loss": 0.8631, + "step": 5739 + }, + { + "epoch": 0.6007326007326007, + "grad_norm": 2.4383259594069577, + "learning_rate": 7.259342885461023e-06, + "loss": 0.9384, + "step": 5740 + }, + { + "epoch": 0.6008372579801151, + "grad_norm": 2.3558493908846363, + "learning_rate": 7.256083111911301e-06, + "loss": 0.8584, + "step": 5741 + }, + { + "epoch": 0.6009419152276295, + "grad_norm": 1.8065835519696472, + "learning_rate": 7.252823653643186e-06, + "loss": 0.938, + "step": 5742 + }, + { + "epoch": 0.6010465724751439, + "grad_norm": 2.2545538260720392, + "learning_rate": 7.249564511031206e-06, + "loss": 0.9175, + "step": 5743 + }, + { + "epoch": 0.6011512297226583, + "grad_norm": 1.8867293237713263, + "learning_rate": 7.246305684449842e-06, + "loss": 0.8941, + "step": 5744 + }, + { + "epoch": 0.6012558869701727, + "grad_norm": 2.091930423181412, + "learning_rate": 7.243047174273534e-06, + "loss": 1.0297, + "step": 5745 + }, + { + "epoch": 0.601360544217687, + "grad_norm": 1.9561529165386784, + "learning_rate": 7.239788980876696e-06, + "loss": 0.8983, + "step": 5746 + }, + { + "epoch": 0.6014652014652014, + "grad_norm": 2.088099544372334, + "learning_rate": 7.236531104633699e-06, + "loss": 0.9198, + "step": 5747 + }, + { + "epoch": 0.6015698587127158, + "grad_norm": 2.1188876407664066, + "learning_rate": 7.233273545918882e-06, + "loss": 0.8943, + "step": 5748 + }, + { + "epoch": 0.6016745159602302, + "grad_norm": 2.402914355301177, + "learning_rate": 7.230016305106539e-06, + "loss": 0.9966, + "step": 5749 + }, + { + "epoch": 0.6017791732077447, + "grad_norm": 2.157789260501364, + "learning_rate": 7.226759382570938e-06, + "loss": 1.0866, + "step": 5750 + }, + { + "epoch": 0.6018838304552591, + "grad_norm": 2.4322710098817746, + "learning_rate": 7.223502778686311e-06, + "loss": 0.9455, + "step": 5751 + }, + { + "epoch": 0.6019884877027735, + "grad_norm": 2.104737383056769, + "learning_rate": 7.220246493826839e-06, + "loss": 0.8708, + "step": 5752 + }, + { + "epoch": 0.6020931449502878, + "grad_norm": 1.9327478109090237, + "learning_rate": 7.216990528366683e-06, + "loss": 0.8164, + "step": 5753 + }, + { + "epoch": 0.6021978021978022, + "grad_norm": 1.9784215425645908, + "learning_rate": 7.213734882679955e-06, + "loss": 0.8658, + "step": 5754 + }, + { + "epoch": 0.6023024594453166, + "grad_norm": 2.0937291019563755, + "learning_rate": 7.21047955714074e-06, + "loss": 0.9194, + "step": 5755 + }, + { + "epoch": 0.602407116692831, + "grad_norm": 2.2283678360274823, + "learning_rate": 7.207224552123075e-06, + "loss": 1.0105, + "step": 5756 + }, + { + "epoch": 0.6025117739403454, + "grad_norm": 2.086487895813268, + "learning_rate": 7.203969868000975e-06, + "loss": 0.9044, + "step": 5757 + }, + { + "epoch": 0.6026164311878598, + "grad_norm": 2.0822701283652316, + "learning_rate": 7.200715505148404e-06, + "loss": 0.9292, + "step": 5758 + }, + { + "epoch": 0.6027210884353742, + "grad_norm": 2.51397980592967, + "learning_rate": 7.1974614639392915e-06, + "loss": 1.006, + "step": 5759 + }, + { + "epoch": 0.6028257456828885, + "grad_norm": 2.3703132970824274, + "learning_rate": 7.194207744747541e-06, + "loss": 0.907, + "step": 5760 + }, + { + "epoch": 0.6029304029304029, + "grad_norm": 2.063722563737003, + "learning_rate": 7.19095434794701e-06, + "loss": 0.8732, + "step": 5761 + }, + { + "epoch": 0.6030350601779173, + "grad_norm": 2.262710366424598, + "learning_rate": 7.187701273911518e-06, + "loss": 0.9704, + "step": 5762 + }, + { + "epoch": 0.6031397174254317, + "grad_norm": 2.2761471236819615, + "learning_rate": 7.184448523014851e-06, + "loss": 0.9213, + "step": 5763 + }, + { + "epoch": 0.6032443746729461, + "grad_norm": 2.252681155905622, + "learning_rate": 7.181196095630756e-06, + "loss": 1.0017, + "step": 5764 + }, + { + "epoch": 0.6033490319204605, + "grad_norm": 2.218841967236834, + "learning_rate": 7.177943992132945e-06, + "loss": 1.0078, + "step": 5765 + }, + { + "epoch": 0.6034536891679749, + "grad_norm": 2.371167922078118, + "learning_rate": 7.174692212895089e-06, + "loss": 1.0293, + "step": 5766 + }, + { + "epoch": 0.6035583464154892, + "grad_norm": 2.0568882242914177, + "learning_rate": 7.1714407582908265e-06, + "loss": 0.8454, + "step": 5767 + }, + { + "epoch": 0.6036630036630036, + "grad_norm": 1.691696703496195, + "learning_rate": 7.168189628693752e-06, + "loss": 0.8477, + "step": 5768 + }, + { + "epoch": 0.603767660910518, + "grad_norm": 1.8979982946907608, + "learning_rate": 7.16493882447743e-06, + "loss": 0.9209, + "step": 5769 + }, + { + "epoch": 0.6038723181580324, + "grad_norm": 2.5519832703990266, + "learning_rate": 7.1616883460153875e-06, + "loss": 0.8273, + "step": 5770 + }, + { + "epoch": 0.6039769754055468, + "grad_norm": 2.0415668920164434, + "learning_rate": 7.158438193681107e-06, + "loss": 0.861, + "step": 5771 + }, + { + "epoch": 0.6040816326530613, + "grad_norm": 2.0190515557119846, + "learning_rate": 7.155188367848038e-06, + "loss": 0.8852, + "step": 5772 + }, + { + "epoch": 0.6041862899005757, + "grad_norm": 2.018878210032007, + "learning_rate": 7.151938868889593e-06, + "loss": 0.9771, + "step": 5773 + }, + { + "epoch": 0.60429094714809, + "grad_norm": 2.3049004074694337, + "learning_rate": 7.148689697179147e-06, + "loss": 0.9558, + "step": 5774 + }, + { + "epoch": 0.6043956043956044, + "grad_norm": 1.9496529744525466, + "learning_rate": 7.145440853090033e-06, + "loss": 0.947, + "step": 5775 + }, + { + "epoch": 0.6045002616431188, + "grad_norm": 1.7724900507895518, + "learning_rate": 7.1421923369955485e-06, + "loss": 0.8816, + "step": 5776 + }, + { + "epoch": 0.6046049188906332, + "grad_norm": 2.3142724481991936, + "learning_rate": 7.138944149268963e-06, + "loss": 0.9659, + "step": 5777 + }, + { + "epoch": 0.6047095761381476, + "grad_norm": 2.1236464851755743, + "learning_rate": 7.1356962902834935e-06, + "loss": 0.9831, + "step": 5778 + }, + { + "epoch": 0.604814233385662, + "grad_norm": 2.2271103729867083, + "learning_rate": 7.132448760412326e-06, + "loss": 1.0104, + "step": 5779 + }, + { + "epoch": 0.6049188906331764, + "grad_norm": 2.0033029015172774, + "learning_rate": 7.129201560028611e-06, + "loss": 0.9995, + "step": 5780 + }, + { + "epoch": 0.6050235478806908, + "grad_norm": 2.5095302090496037, + "learning_rate": 7.1259546895054545e-06, + "loss": 0.979, + "step": 5781 + }, + { + "epoch": 0.6051282051282051, + "grad_norm": 1.8438079575156054, + "learning_rate": 7.122708149215931e-06, + "loss": 0.989, + "step": 5782 + }, + { + "epoch": 0.6052328623757195, + "grad_norm": 1.7506092086261658, + "learning_rate": 7.1194619395330746e-06, + "loss": 0.9699, + "step": 5783 + }, + { + "epoch": 0.6053375196232339, + "grad_norm": 1.8710351375548784, + "learning_rate": 7.116216060829881e-06, + "loss": 0.7888, + "step": 5784 + }, + { + "epoch": 0.6054421768707483, + "grad_norm": 2.253637141321553, + "learning_rate": 7.112970513479303e-06, + "loss": 1.0412, + "step": 5785 + }, + { + "epoch": 0.6055468341182627, + "grad_norm": 2.1113622507792926, + "learning_rate": 7.10972529785427e-06, + "loss": 0.9001, + "step": 5786 + }, + { + "epoch": 0.6056514913657771, + "grad_norm": 2.116957635415604, + "learning_rate": 7.106480414327661e-06, + "loss": 0.9627, + "step": 5787 + }, + { + "epoch": 0.6057561486132915, + "grad_norm": 1.9655834383321291, + "learning_rate": 7.103235863272315e-06, + "loss": 0.9217, + "step": 5788 + }, + { + "epoch": 0.6058608058608058, + "grad_norm": 1.9253182713618326, + "learning_rate": 7.099991645061044e-06, + "loss": 0.9115, + "step": 5789 + }, + { + "epoch": 0.6059654631083202, + "grad_norm": 1.9112762675976478, + "learning_rate": 7.096747760066609e-06, + "loss": 0.9361, + "step": 5790 + }, + { + "epoch": 0.6060701203558346, + "grad_norm": 1.9544948764185148, + "learning_rate": 7.0935042086617444e-06, + "loss": 0.8774, + "step": 5791 + }, + { + "epoch": 0.606174777603349, + "grad_norm": 2.1298758076771387, + "learning_rate": 7.090260991219135e-06, + "loss": 0.9291, + "step": 5792 + }, + { + "epoch": 0.6062794348508634, + "grad_norm": 1.8407570735350673, + "learning_rate": 7.087018108111441e-06, + "loss": 0.7909, + "step": 5793 + }, + { + "epoch": 0.6063840920983778, + "grad_norm": 2.0086101467681123, + "learning_rate": 7.083775559711265e-06, + "loss": 0.8417, + "step": 5794 + }, + { + "epoch": 0.6064887493458923, + "grad_norm": 2.1799980186333197, + "learning_rate": 7.080533346391192e-06, + "loss": 0.9765, + "step": 5795 + }, + { + "epoch": 0.6065934065934065, + "grad_norm": 2.045424197962744, + "learning_rate": 7.0772914685237585e-06, + "loss": 0.9536, + "step": 5796 + }, + { + "epoch": 0.606698063840921, + "grad_norm": 1.6592809893413245, + "learning_rate": 7.074049926481459e-06, + "loss": 0.846, + "step": 5797 + }, + { + "epoch": 0.6068027210884354, + "grad_norm": 2.6554474976351625, + "learning_rate": 7.070808720636754e-06, + "loss": 0.9715, + "step": 5798 + }, + { + "epoch": 0.6069073783359498, + "grad_norm": 2.0940703129068563, + "learning_rate": 7.067567851362066e-06, + "loss": 0.9867, + "step": 5799 + }, + { + "epoch": 0.6070120355834642, + "grad_norm": 2.1226712056254584, + "learning_rate": 7.064327319029775e-06, + "loss": 0.9931, + "step": 5800 + }, + { + "epoch": 0.6071166928309786, + "grad_norm": 1.7565753791576322, + "learning_rate": 7.061087124012227e-06, + "loss": 0.8574, + "step": 5801 + }, + { + "epoch": 0.607221350078493, + "grad_norm": 2.5046485756656036, + "learning_rate": 7.057847266681723e-06, + "loss": 0.9254, + "step": 5802 + }, + { + "epoch": 0.6073260073260073, + "grad_norm": 2.099170045438125, + "learning_rate": 7.054607747410535e-06, + "loss": 0.9769, + "step": 5803 + }, + { + "epoch": 0.6074306645735217, + "grad_norm": 2.422351961670763, + "learning_rate": 7.05136856657089e-06, + "loss": 0.9626, + "step": 5804 + }, + { + "epoch": 0.6075353218210361, + "grad_norm": 1.934073799795149, + "learning_rate": 7.048129724534971e-06, + "loss": 0.9296, + "step": 5805 + }, + { + "epoch": 0.6076399790685505, + "grad_norm": 1.9490333500220711, + "learning_rate": 7.044891221674934e-06, + "loss": 0.8619, + "step": 5806 + }, + { + "epoch": 0.6077446363160649, + "grad_norm": 2.0467303551021256, + "learning_rate": 7.041653058362883e-06, + "loss": 0.8767, + "step": 5807 + }, + { + "epoch": 0.6078492935635793, + "grad_norm": 2.1579085714167805, + "learning_rate": 7.038415234970896e-06, + "loss": 0.8894, + "step": 5808 + }, + { + "epoch": 0.6079539508110937, + "grad_norm": 1.94349634206974, + "learning_rate": 7.035177751870999e-06, + "loss": 0.9374, + "step": 5809 + }, + { + "epoch": 0.608058608058608, + "grad_norm": 2.0657703345239278, + "learning_rate": 7.031940609435192e-06, + "loss": 0.8923, + "step": 5810 + }, + { + "epoch": 0.6081632653061224, + "grad_norm": 2.070527663098741, + "learning_rate": 7.028703808035421e-06, + "loss": 0.8972, + "step": 5811 + }, + { + "epoch": 0.6082679225536368, + "grad_norm": 1.9175580806828474, + "learning_rate": 7.025467348043608e-06, + "loss": 0.947, + "step": 5812 + }, + { + "epoch": 0.6083725798011512, + "grad_norm": 2.106923878326725, + "learning_rate": 7.022231229831629e-06, + "loss": 0.9325, + "step": 5813 + }, + { + "epoch": 0.6084772370486656, + "grad_norm": 1.9730559307739923, + "learning_rate": 7.018995453771318e-06, + "loss": 0.8576, + "step": 5814 + }, + { + "epoch": 0.60858189429618, + "grad_norm": 2.0508153945924, + "learning_rate": 7.01576002023447e-06, + "loss": 0.8826, + "step": 5815 + }, + { + "epoch": 0.6086865515436944, + "grad_norm": 2.001780253253112, + "learning_rate": 7.012524929592848e-06, + "loss": 0.8283, + "step": 5816 + }, + { + "epoch": 0.6087912087912087, + "grad_norm": 1.8973036803507934, + "learning_rate": 7.009290182218166e-06, + "loss": 0.9091, + "step": 5817 + }, + { + "epoch": 0.6088958660387231, + "grad_norm": 1.9406056643325182, + "learning_rate": 7.006055778482106e-06, + "loss": 0.8443, + "step": 5818 + }, + { + "epoch": 0.6090005232862375, + "grad_norm": 2.111807902749534, + "learning_rate": 7.002821718756305e-06, + "loss": 0.8516, + "step": 5819 + }, + { + "epoch": 0.609105180533752, + "grad_norm": 2.0277118642001994, + "learning_rate": 6.999588003412363e-06, + "loss": 0.8915, + "step": 5820 + }, + { + "epoch": 0.6092098377812664, + "grad_norm": 1.8707711600366936, + "learning_rate": 6.996354632821845e-06, + "loss": 0.9782, + "step": 5821 + }, + { + "epoch": 0.6093144950287808, + "grad_norm": 1.826421423539411, + "learning_rate": 6.993121607356267e-06, + "loss": 0.8156, + "step": 5822 + }, + { + "epoch": 0.6094191522762952, + "grad_norm": 1.8648721842207836, + "learning_rate": 6.989888927387114e-06, + "loss": 0.8111, + "step": 5823 + }, + { + "epoch": 0.6095238095238096, + "grad_norm": 1.9578433557880752, + "learning_rate": 6.986656593285824e-06, + "loss": 0.7663, + "step": 5824 + }, + { + "epoch": 0.6096284667713239, + "grad_norm": 2.3788854431050046, + "learning_rate": 6.983424605423801e-06, + "loss": 0.8084, + "step": 5825 + }, + { + "epoch": 0.6097331240188383, + "grad_norm": 1.9702807385169419, + "learning_rate": 6.980192964172404e-06, + "loss": 0.9583, + "step": 5826 + }, + { + "epoch": 0.6098377812663527, + "grad_norm": 2.1748718305550687, + "learning_rate": 6.97696166990296e-06, + "loss": 0.9703, + "step": 5827 + }, + { + "epoch": 0.6099424385138671, + "grad_norm": 2.2580551673376763, + "learning_rate": 6.9737307229867435e-06, + "loss": 0.804, + "step": 5828 + }, + { + "epoch": 0.6100470957613815, + "grad_norm": 2.2895933354446316, + "learning_rate": 6.970500123795004e-06, + "loss": 0.9405, + "step": 5829 + }, + { + "epoch": 0.6101517530088959, + "grad_norm": 2.0595158690924906, + "learning_rate": 6.9672698726989455e-06, + "loss": 0.8, + "step": 5830 + }, + { + "epoch": 0.6102564102564103, + "grad_norm": 2.3271064177398637, + "learning_rate": 6.964039970069722e-06, + "loss": 0.9542, + "step": 5831 + }, + { + "epoch": 0.6103610675039246, + "grad_norm": 1.7796600145148627, + "learning_rate": 6.9608104162784655e-06, + "loss": 0.9259, + "step": 5832 + }, + { + "epoch": 0.610465724751439, + "grad_norm": 1.973121849768075, + "learning_rate": 6.957581211696251e-06, + "loss": 0.878, + "step": 5833 + }, + { + "epoch": 0.6105703819989534, + "grad_norm": 1.9358767168769708, + "learning_rate": 6.954352356694121e-06, + "loss": 0.9568, + "step": 5834 + }, + { + "epoch": 0.6106750392464678, + "grad_norm": 2.2637017462136377, + "learning_rate": 6.9511238516430814e-06, + "loss": 1.0218, + "step": 5835 + }, + { + "epoch": 0.6107796964939822, + "grad_norm": 1.679462688094085, + "learning_rate": 6.947895696914091e-06, + "loss": 0.7966, + "step": 5836 + }, + { + "epoch": 0.6108843537414966, + "grad_norm": 2.1125768150832873, + "learning_rate": 6.944667892878069e-06, + "loss": 0.7789, + "step": 5837 + }, + { + "epoch": 0.610989010989011, + "grad_norm": 2.1110418210605633, + "learning_rate": 6.941440439905904e-06, + "loss": 0.9139, + "step": 5838 + }, + { + "epoch": 0.6110936682365253, + "grad_norm": 2.4312531473703736, + "learning_rate": 6.938213338368431e-06, + "loss": 0.9509, + "step": 5839 + }, + { + "epoch": 0.6111983254840397, + "grad_norm": 2.215490332569269, + "learning_rate": 6.934986588636455e-06, + "loss": 0.8285, + "step": 5840 + }, + { + "epoch": 0.6113029827315541, + "grad_norm": 2.028628090171237, + "learning_rate": 6.931760191080732e-06, + "loss": 0.8616, + "step": 5841 + }, + { + "epoch": 0.6114076399790686, + "grad_norm": 2.1117389186178945, + "learning_rate": 6.928534146071983e-06, + "loss": 0.9494, + "step": 5842 + }, + { + "epoch": 0.611512297226583, + "grad_norm": 1.8477902899995793, + "learning_rate": 6.925308453980887e-06, + "loss": 0.8444, + "step": 5843 + }, + { + "epoch": 0.6116169544740974, + "grad_norm": 2.2634245815200615, + "learning_rate": 6.9220831151780845e-06, + "loss": 0.8982, + "step": 5844 + }, + { + "epoch": 0.6117216117216118, + "grad_norm": 1.8300503492297004, + "learning_rate": 6.918858130034167e-06, + "loss": 0.9627, + "step": 5845 + }, + { + "epoch": 0.6118262689691261, + "grad_norm": 2.0217643390425013, + "learning_rate": 6.915633498919698e-06, + "loss": 0.8742, + "step": 5846 + }, + { + "epoch": 0.6119309262166405, + "grad_norm": 2.1967587863707707, + "learning_rate": 6.9124092222051945e-06, + "loss": 0.9867, + "step": 5847 + }, + { + "epoch": 0.6120355834641549, + "grad_norm": 1.8375475239699013, + "learning_rate": 6.9091853002611296e-06, + "loss": 0.897, + "step": 5848 + }, + { + "epoch": 0.6121402407116693, + "grad_norm": 2.1183156451380487, + "learning_rate": 6.905961733457941e-06, + "loss": 0.8377, + "step": 5849 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 1.9783448138880826, + "learning_rate": 6.902738522166019e-06, + "loss": 0.9428, + "step": 5850 + }, + { + "epoch": 0.6123495552066981, + "grad_norm": 2.21998303732816, + "learning_rate": 6.899515666755722e-06, + "loss": 0.909, + "step": 5851 + }, + { + "epoch": 0.6124542124542125, + "grad_norm": 2.3831003494991063, + "learning_rate": 6.896293167597361e-06, + "loss": 0.9709, + "step": 5852 + }, + { + "epoch": 0.6125588697017268, + "grad_norm": 2.0207894472977608, + "learning_rate": 6.8930710250612044e-06, + "loss": 0.9789, + "step": 5853 + }, + { + "epoch": 0.6126635269492412, + "grad_norm": 1.9009358397826535, + "learning_rate": 6.889849239517483e-06, + "loss": 0.8241, + "step": 5854 + }, + { + "epoch": 0.6127681841967556, + "grad_norm": 1.8698070433483918, + "learning_rate": 6.886627811336393e-06, + "loss": 0.8008, + "step": 5855 + }, + { + "epoch": 0.61287284144427, + "grad_norm": 2.005435434602962, + "learning_rate": 6.883406740888077e-06, + "loss": 0.9513, + "step": 5856 + }, + { + "epoch": 0.6129774986917844, + "grad_norm": 1.8480823682591345, + "learning_rate": 6.880186028542647e-06, + "loss": 0.8963, + "step": 5857 + }, + { + "epoch": 0.6130821559392988, + "grad_norm": 1.8958475524506964, + "learning_rate": 6.876965674670164e-06, + "loss": 0.9899, + "step": 5858 + }, + { + "epoch": 0.6131868131868132, + "grad_norm": 2.2557462857033186, + "learning_rate": 6.8737456796406594e-06, + "loss": 0.9383, + "step": 5859 + }, + { + "epoch": 0.6132914704343275, + "grad_norm": 1.8928132503094905, + "learning_rate": 6.870526043824111e-06, + "loss": 0.8379, + "step": 5860 + }, + { + "epoch": 0.6133961276818419, + "grad_norm": 1.938497040014973, + "learning_rate": 6.867306767590466e-06, + "loss": 1.0145, + "step": 5861 + }, + { + "epoch": 0.6135007849293563, + "grad_norm": 1.8779981359486946, + "learning_rate": 6.864087851309622e-06, + "loss": 0.9063, + "step": 5862 + }, + { + "epoch": 0.6136054421768707, + "grad_norm": 1.834130321006479, + "learning_rate": 6.860869295351439e-06, + "loss": 0.8479, + "step": 5863 + }, + { + "epoch": 0.6137100994243851, + "grad_norm": 2.225164974831015, + "learning_rate": 6.857651100085742e-06, + "loss": 1.0329, + "step": 5864 + }, + { + "epoch": 0.6138147566718996, + "grad_norm": 2.2962326966980067, + "learning_rate": 6.854433265882301e-06, + "loss": 0.9488, + "step": 5865 + }, + { + "epoch": 0.613919413919414, + "grad_norm": 1.9342617679397862, + "learning_rate": 6.851215793110857e-06, + "loss": 0.9494, + "step": 5866 + }, + { + "epoch": 0.6140240711669284, + "grad_norm": 2.052176188364496, + "learning_rate": 6.8479986821411e-06, + "loss": 0.9748, + "step": 5867 + }, + { + "epoch": 0.6141287284144427, + "grad_norm": 2.1836996140380402, + "learning_rate": 6.844781933342687e-06, + "loss": 0.9203, + "step": 5868 + }, + { + "epoch": 0.6142333856619571, + "grad_norm": 2.1846424471266843, + "learning_rate": 6.841565547085223e-06, + "loss": 1.0104, + "step": 5869 + }, + { + "epoch": 0.6143380429094715, + "grad_norm": 2.063761945658502, + "learning_rate": 6.838349523738283e-06, + "loss": 1.0052, + "step": 5870 + }, + { + "epoch": 0.6144427001569859, + "grad_norm": 2.345754619202839, + "learning_rate": 6.83513386367139e-06, + "loss": 0.9294, + "step": 5871 + }, + { + "epoch": 0.6145473574045003, + "grad_norm": 1.8945517389181037, + "learning_rate": 6.83191856725403e-06, + "loss": 0.8727, + "step": 5872 + }, + { + "epoch": 0.6146520146520147, + "grad_norm": 2.042059497052054, + "learning_rate": 6.828703634855651e-06, + "loss": 0.8322, + "step": 5873 + }, + { + "epoch": 0.6147566718995291, + "grad_norm": 2.1770922500505354, + "learning_rate": 6.825489066845654e-06, + "loss": 0.8193, + "step": 5874 + }, + { + "epoch": 0.6148613291470434, + "grad_norm": 2.3497422750449126, + "learning_rate": 6.822274863593399e-06, + "loss": 0.9328, + "step": 5875 + }, + { + "epoch": 0.6149659863945578, + "grad_norm": 2.176112444967684, + "learning_rate": 6.8190610254682034e-06, + "loss": 0.9731, + "step": 5876 + }, + { + "epoch": 0.6150706436420722, + "grad_norm": 1.918343615201989, + "learning_rate": 6.815847552839343e-06, + "loss": 0.8911, + "step": 5877 + }, + { + "epoch": 0.6151753008895866, + "grad_norm": 2.01864048793102, + "learning_rate": 6.812634446076056e-06, + "loss": 1.0051, + "step": 5878 + }, + { + "epoch": 0.615279958137101, + "grad_norm": 2.1005849771125953, + "learning_rate": 6.809421705547532e-06, + "loss": 1.0042, + "step": 5879 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 2.1391206561805016, + "learning_rate": 6.806209331622919e-06, + "loss": 0.8773, + "step": 5880 + }, + { + "epoch": 0.6154892726321298, + "grad_norm": 2.1522010281571227, + "learning_rate": 6.802997324671332e-06, + "loss": 0.7671, + "step": 5881 + }, + { + "epoch": 0.6155939298796441, + "grad_norm": 1.888842162645125, + "learning_rate": 6.799785685061832e-06, + "loss": 0.8971, + "step": 5882 + }, + { + "epoch": 0.6156985871271585, + "grad_norm": 2.2616843959962356, + "learning_rate": 6.796574413163446e-06, + "loss": 0.7999, + "step": 5883 + }, + { + "epoch": 0.6158032443746729, + "grad_norm": 2.2496513510327247, + "learning_rate": 6.793363509345151e-06, + "loss": 1.0051, + "step": 5884 + }, + { + "epoch": 0.6159079016221873, + "grad_norm": 2.0525978260362687, + "learning_rate": 6.790152973975892e-06, + "loss": 0.9044, + "step": 5885 + }, + { + "epoch": 0.6160125588697017, + "grad_norm": 2.01635783084318, + "learning_rate": 6.786942807424561e-06, + "loss": 1.0252, + "step": 5886 + }, + { + "epoch": 0.6161172161172161, + "grad_norm": 2.153628252992972, + "learning_rate": 6.783733010060018e-06, + "loss": 0.8818, + "step": 5887 + }, + { + "epoch": 0.6162218733647306, + "grad_norm": 1.9635936104933305, + "learning_rate": 6.78052358225107e-06, + "loss": 0.9493, + "step": 5888 + }, + { + "epoch": 0.6163265306122448, + "grad_norm": 2.0898861954151733, + "learning_rate": 6.777314524366486e-06, + "loss": 0.89, + "step": 5889 + }, + { + "epoch": 0.6164311878597593, + "grad_norm": 1.9717149097514748, + "learning_rate": 6.774105836775e-06, + "loss": 0.8447, + "step": 5890 + }, + { + "epoch": 0.6165358451072737, + "grad_norm": 1.956228723574464, + "learning_rate": 6.770897519845294e-06, + "loss": 0.8423, + "step": 5891 + }, + { + "epoch": 0.6166405023547881, + "grad_norm": 1.9423670450685322, + "learning_rate": 6.767689573946008e-06, + "loss": 0.9756, + "step": 5892 + }, + { + "epoch": 0.6167451596023025, + "grad_norm": 2.0612156081235313, + "learning_rate": 6.7644819994457424e-06, + "loss": 0.8537, + "step": 5893 + }, + { + "epoch": 0.6168498168498169, + "grad_norm": 2.134024125429241, + "learning_rate": 6.761274796713053e-06, + "loss": 0.8747, + "step": 5894 + }, + { + "epoch": 0.6169544740973313, + "grad_norm": 2.1094468946194276, + "learning_rate": 6.7580679661164575e-06, + "loss": 0.9272, + "step": 5895 + }, + { + "epoch": 0.6170591313448456, + "grad_norm": 1.96616440967844, + "learning_rate": 6.754861508024422e-06, + "loss": 1.0121, + "step": 5896 + }, + { + "epoch": 0.61716378859236, + "grad_norm": 2.2360078591987667, + "learning_rate": 6.751655422805381e-06, + "loss": 0.9208, + "step": 5897 + }, + { + "epoch": 0.6172684458398744, + "grad_norm": 1.9125102631579798, + "learning_rate": 6.748449710827711e-06, + "loss": 0.9544, + "step": 5898 + }, + { + "epoch": 0.6173731030873888, + "grad_norm": 1.9824080317071273, + "learning_rate": 6.745244372459765e-06, + "loss": 0.9728, + "step": 5899 + }, + { + "epoch": 0.6174777603349032, + "grad_norm": 2.128988283347369, + "learning_rate": 6.7420394080698385e-06, + "loss": 0.8864, + "step": 5900 + }, + { + "epoch": 0.6175824175824176, + "grad_norm": 1.8967381718005982, + "learning_rate": 6.738834818026187e-06, + "loss": 0.8736, + "step": 5901 + }, + { + "epoch": 0.617687074829932, + "grad_norm": 2.013465087057385, + "learning_rate": 6.7356306026970295e-06, + "loss": 1.0341, + "step": 5902 + }, + { + "epoch": 0.6177917320774463, + "grad_norm": 2.129706926698933, + "learning_rate": 6.7324267624505305e-06, + "loss": 0.8628, + "step": 5903 + }, + { + "epoch": 0.6178963893249607, + "grad_norm": 1.8225296515950085, + "learning_rate": 6.729223297654823e-06, + "loss": 0.8744, + "step": 5904 + }, + { + "epoch": 0.6180010465724751, + "grad_norm": 2.171135955598889, + "learning_rate": 6.726020208677986e-06, + "loss": 0.9598, + "step": 5905 + }, + { + "epoch": 0.6181057038199895, + "grad_norm": 1.9615288782179878, + "learning_rate": 6.722817495888063e-06, + "loss": 0.8919, + "step": 5906 + }, + { + "epoch": 0.6182103610675039, + "grad_norm": 2.3348478756088964, + "learning_rate": 6.719615159653057e-06, + "loss": 1.0335, + "step": 5907 + }, + { + "epoch": 0.6183150183150183, + "grad_norm": 1.8529500238451388, + "learning_rate": 6.716413200340917e-06, + "loss": 0.9656, + "step": 5908 + }, + { + "epoch": 0.6184196755625327, + "grad_norm": 2.101263167248366, + "learning_rate": 6.713211618319559e-06, + "loss": 0.8609, + "step": 5909 + }, + { + "epoch": 0.6185243328100472, + "grad_norm": 2.082342140819496, + "learning_rate": 6.710010413956849e-06, + "loss": 0.9781, + "step": 5910 + }, + { + "epoch": 0.6186289900575614, + "grad_norm": 2.0995332207635595, + "learning_rate": 6.706809587620611e-06, + "loss": 0.9303, + "step": 5911 + }, + { + "epoch": 0.6187336473050759, + "grad_norm": 1.8736139083894665, + "learning_rate": 6.703609139678628e-06, + "loss": 0.7238, + "step": 5912 + }, + { + "epoch": 0.6188383045525903, + "grad_norm": 2.02936203552506, + "learning_rate": 6.700409070498636e-06, + "loss": 0.8549, + "step": 5913 + }, + { + "epoch": 0.6189429618001047, + "grad_norm": 2.1028471786177434, + "learning_rate": 6.697209380448333e-06, + "loss": 0.835, + "step": 5914 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 2.220094548261755, + "learning_rate": 6.694010069895363e-06, + "loss": 0.8769, + "step": 5915 + }, + { + "epoch": 0.6191522762951335, + "grad_norm": 1.7219940474883257, + "learning_rate": 6.6908111392073395e-06, + "loss": 0.7758, + "step": 5916 + }, + { + "epoch": 0.6192569335426479, + "grad_norm": 2.0998250638549525, + "learning_rate": 6.687612588751827e-06, + "loss": 0.9183, + "step": 5917 + }, + { + "epoch": 0.6193615907901622, + "grad_norm": 2.0505701485922643, + "learning_rate": 6.684414418896341e-06, + "loss": 0.8306, + "step": 5918 + }, + { + "epoch": 0.6194662480376766, + "grad_norm": 2.2514903555360717, + "learning_rate": 6.681216630008363e-06, + "loss": 1.044, + "step": 5919 + }, + { + "epoch": 0.619570905285191, + "grad_norm": 1.7740449558139142, + "learning_rate": 6.678019222455318e-06, + "loss": 0.9587, + "step": 5920 + }, + { + "epoch": 0.6196755625327054, + "grad_norm": 1.9435874998522282, + "learning_rate": 6.674822196604601e-06, + "loss": 0.8681, + "step": 5921 + }, + { + "epoch": 0.6197802197802198, + "grad_norm": 2.136025694684406, + "learning_rate": 6.671625552823553e-06, + "loss": 0.9398, + "step": 5922 + }, + { + "epoch": 0.6198848770277342, + "grad_norm": 1.923864776761002, + "learning_rate": 6.668429291479479e-06, + "loss": 0.9224, + "step": 5923 + }, + { + "epoch": 0.6199895342752486, + "grad_norm": 2.163924861536042, + "learning_rate": 6.665233412939628e-06, + "loss": 0.8994, + "step": 5924 + }, + { + "epoch": 0.6200941915227629, + "grad_norm": 2.4204765320256647, + "learning_rate": 6.6620379175712205e-06, + "loss": 1.0021, + "step": 5925 + }, + { + "epoch": 0.6201988487702773, + "grad_norm": 2.06754965058645, + "learning_rate": 6.6588428057414256e-06, + "loss": 0.9148, + "step": 5926 + }, + { + "epoch": 0.6203035060177917, + "grad_norm": 2.337630297175959, + "learning_rate": 6.655648077817362e-06, + "loss": 0.9678, + "step": 5927 + }, + { + "epoch": 0.6204081632653061, + "grad_norm": 2.040465726503583, + "learning_rate": 6.652453734166118e-06, + "loss": 0.9536, + "step": 5928 + }, + { + "epoch": 0.6205128205128205, + "grad_norm": 2.092903427609399, + "learning_rate": 6.649259775154725e-06, + "loss": 0.9262, + "step": 5929 + }, + { + "epoch": 0.6206174777603349, + "grad_norm": 1.9715070313368872, + "learning_rate": 6.646066201150174e-06, + "loss": 1.048, + "step": 5930 + }, + { + "epoch": 0.6207221350078493, + "grad_norm": 1.8718505714062148, + "learning_rate": 6.6428730125194196e-06, + "loss": 0.9048, + "step": 5931 + }, + { + "epoch": 0.6208267922553636, + "grad_norm": 1.8924153254665501, + "learning_rate": 6.6396802096293555e-06, + "loss": 0.951, + "step": 5932 + }, + { + "epoch": 0.620931449502878, + "grad_norm": 2.242893217335704, + "learning_rate": 6.6364877928468505e-06, + "loss": 0.9777, + "step": 5933 + }, + { + "epoch": 0.6210361067503924, + "grad_norm": 2.0854393025410247, + "learning_rate": 6.633295762538718e-06, + "loss": 0.9969, + "step": 5934 + }, + { + "epoch": 0.6211407639979069, + "grad_norm": 1.8436423025912454, + "learning_rate": 6.6301041190717255e-06, + "loss": 0.964, + "step": 5935 + }, + { + "epoch": 0.6212454212454213, + "grad_norm": 1.8863362444224359, + "learning_rate": 6.626912862812605e-06, + "loss": 0.8836, + "step": 5936 + }, + { + "epoch": 0.6213500784929357, + "grad_norm": 2.262054897692573, + "learning_rate": 6.62372199412803e-06, + "loss": 0.8957, + "step": 5937 + }, + { + "epoch": 0.6214547357404501, + "grad_norm": 2.201402572775819, + "learning_rate": 6.620531513384645e-06, + "loss": 0.9112, + "step": 5938 + }, + { + "epoch": 0.6215593929879644, + "grad_norm": 2.2199717929386664, + "learning_rate": 6.6173414209490395e-06, + "loss": 1.1, + "step": 5939 + }, + { + "epoch": 0.6216640502354788, + "grad_norm": 2.3970536946249505, + "learning_rate": 6.614151717187762e-06, + "loss": 0.935, + "step": 5940 + }, + { + "epoch": 0.6217687074829932, + "grad_norm": 1.9767721037140733, + "learning_rate": 6.610962402467314e-06, + "loss": 0.9553, + "step": 5941 + }, + { + "epoch": 0.6218733647305076, + "grad_norm": 2.205710848436116, + "learning_rate": 6.607773477154156e-06, + "loss": 0.8938, + "step": 5942 + }, + { + "epoch": 0.621978021978022, + "grad_norm": 1.9654480260758713, + "learning_rate": 6.604584941614706e-06, + "loss": 0.9242, + "step": 5943 + }, + { + "epoch": 0.6220826792255364, + "grad_norm": 1.750180216403602, + "learning_rate": 6.601396796215327e-06, + "loss": 0.8576, + "step": 5944 + }, + { + "epoch": 0.6221873364730508, + "grad_norm": 2.236278725054879, + "learning_rate": 6.5982090413223476e-06, + "loss": 1.0135, + "step": 5945 + }, + { + "epoch": 0.6222919937205651, + "grad_norm": 2.027612274915688, + "learning_rate": 6.595021677302044e-06, + "loss": 0.939, + "step": 5946 + }, + { + "epoch": 0.6223966509680795, + "grad_norm": 2.244750079445713, + "learning_rate": 6.591834704520653e-06, + "loss": 0.8214, + "step": 5947 + }, + { + "epoch": 0.6225013082155939, + "grad_norm": 2.4590294267920476, + "learning_rate": 6.5886481233443656e-06, + "loss": 0.9864, + "step": 5948 + }, + { + "epoch": 0.6226059654631083, + "grad_norm": 2.030725863909143, + "learning_rate": 6.585461934139321e-06, + "loss": 0.9592, + "step": 5949 + }, + { + "epoch": 0.6227106227106227, + "grad_norm": 1.988399104391876, + "learning_rate": 6.5822761372716195e-06, + "loss": 0.9493, + "step": 5950 + }, + { + "epoch": 0.6228152799581371, + "grad_norm": 1.954025794060076, + "learning_rate": 6.579090733107323e-06, + "loss": 0.9747, + "step": 5951 + }, + { + "epoch": 0.6229199372056515, + "grad_norm": 2.1922622101716223, + "learning_rate": 6.575905722012435e-06, + "loss": 0.9192, + "step": 5952 + }, + { + "epoch": 0.6230245944531659, + "grad_norm": 2.1750494542346788, + "learning_rate": 6.5727211043529214e-06, + "loss": 0.9707, + "step": 5953 + }, + { + "epoch": 0.6231292517006802, + "grad_norm": 2.2279016358653125, + "learning_rate": 6.5695368804947e-06, + "loss": 0.8988, + "step": 5954 + }, + { + "epoch": 0.6232339089481946, + "grad_norm": 2.3809272214435104, + "learning_rate": 6.566353050803646e-06, + "loss": 1.0132, + "step": 5955 + }, + { + "epoch": 0.623338566195709, + "grad_norm": 2.3706566423677065, + "learning_rate": 6.563169615645586e-06, + "loss": 1.0054, + "step": 5956 + }, + { + "epoch": 0.6234432234432234, + "grad_norm": 1.7752367380253748, + "learning_rate": 6.559986575386307e-06, + "loss": 0.862, + "step": 5957 + }, + { + "epoch": 0.6235478806907379, + "grad_norm": 1.8311258992017134, + "learning_rate": 6.556803930391542e-06, + "loss": 0.7931, + "step": 5958 + }, + { + "epoch": 0.6236525379382523, + "grad_norm": 2.005969903941888, + "learning_rate": 6.553621681026983e-06, + "loss": 0.8692, + "step": 5959 + }, + { + "epoch": 0.6237571951857667, + "grad_norm": 2.4536539179443575, + "learning_rate": 6.550439827658282e-06, + "loss": 1.0995, + "step": 5960 + }, + { + "epoch": 0.623861852433281, + "grad_norm": 1.829663168954961, + "learning_rate": 6.5472583706510375e-06, + "loss": 0.9042, + "step": 5961 + }, + { + "epoch": 0.6239665096807954, + "grad_norm": 1.8713774810103647, + "learning_rate": 6.544077310370806e-06, + "loss": 0.8337, + "step": 5962 + }, + { + "epoch": 0.6240711669283098, + "grad_norm": 1.7943958864399616, + "learning_rate": 6.540896647183098e-06, + "loss": 0.8306, + "step": 5963 + }, + { + "epoch": 0.6241758241758242, + "grad_norm": 2.142987115852985, + "learning_rate": 6.537716381453379e-06, + "loss": 0.9367, + "step": 5964 + }, + { + "epoch": 0.6242804814233386, + "grad_norm": 2.0012001401459862, + "learning_rate": 6.534536513547065e-06, + "loss": 0.8729, + "step": 5965 + }, + { + "epoch": 0.624385138670853, + "grad_norm": 1.976227019935359, + "learning_rate": 6.531357043829529e-06, + "loss": 0.8953, + "step": 5966 + }, + { + "epoch": 0.6244897959183674, + "grad_norm": 2.044153940630976, + "learning_rate": 6.5281779726661e-06, + "loss": 0.9151, + "step": 5967 + }, + { + "epoch": 0.6245944531658817, + "grad_norm": 2.5746730325720395, + "learning_rate": 6.524999300422064e-06, + "loss": 0.8603, + "step": 5968 + }, + { + "epoch": 0.6246991104133961, + "grad_norm": 1.8639820031464824, + "learning_rate": 6.5218210274626495e-06, + "loss": 0.868, + "step": 5969 + }, + { + "epoch": 0.6248037676609105, + "grad_norm": 2.294972502505636, + "learning_rate": 6.5186431541530514e-06, + "loss": 0.9509, + "step": 5970 + }, + { + "epoch": 0.6249084249084249, + "grad_norm": 2.251140816785971, + "learning_rate": 6.515465680858412e-06, + "loss": 0.8821, + "step": 5971 + }, + { + "epoch": 0.6250130821559393, + "grad_norm": 1.8794029352605075, + "learning_rate": 6.512288607943831e-06, + "loss": 0.7838, + "step": 5972 + }, + { + "epoch": 0.6251177394034537, + "grad_norm": 1.9880704609132205, + "learning_rate": 6.509111935774357e-06, + "loss": 0.9442, + "step": 5973 + }, + { + "epoch": 0.6252223966509681, + "grad_norm": 2.1245881624662153, + "learning_rate": 6.5059356647150004e-06, + "loss": 0.9828, + "step": 5974 + }, + { + "epoch": 0.6253270538984824, + "grad_norm": 2.241601235510337, + "learning_rate": 6.502759795130717e-06, + "loss": 1.0231, + "step": 5975 + }, + { + "epoch": 0.6254317111459968, + "grad_norm": 2.075450945502386, + "learning_rate": 6.499584327386422e-06, + "loss": 0.9298, + "step": 5976 + }, + { + "epoch": 0.6255363683935112, + "grad_norm": 1.9389873315576378, + "learning_rate": 6.496409261846988e-06, + "loss": 0.7791, + "step": 5977 + }, + { + "epoch": 0.6256410256410256, + "grad_norm": 2.1343252505964387, + "learning_rate": 6.493234598877229e-06, + "loss": 1.1021, + "step": 5978 + }, + { + "epoch": 0.62574568288854, + "grad_norm": 1.7296284100239168, + "learning_rate": 6.490060338841926e-06, + "loss": 0.8877, + "step": 5979 + }, + { + "epoch": 0.6258503401360545, + "grad_norm": 2.6208758089241075, + "learning_rate": 6.486886482105803e-06, + "loss": 1.0258, + "step": 5980 + }, + { + "epoch": 0.6259549973835689, + "grad_norm": 1.6310737999103246, + "learning_rate": 6.483713029033547e-06, + "loss": 0.8419, + "step": 5981 + }, + { + "epoch": 0.6260596546310832, + "grad_norm": 1.9090171828142979, + "learning_rate": 6.4805399799897905e-06, + "loss": 0.9513, + "step": 5982 + }, + { + "epoch": 0.6261643118785976, + "grad_norm": 2.0317937367699104, + "learning_rate": 6.477367335339129e-06, + "loss": 0.8979, + "step": 5983 + }, + { + "epoch": 0.626268969126112, + "grad_norm": 1.9744350469086347, + "learning_rate": 6.474195095446099e-06, + "loss": 0.9687, + "step": 5984 + }, + { + "epoch": 0.6263736263736264, + "grad_norm": 2.0167761714344996, + "learning_rate": 6.471023260675196e-06, + "loss": 0.9428, + "step": 5985 + }, + { + "epoch": 0.6264782836211408, + "grad_norm": 1.944143991441491, + "learning_rate": 6.467851831390879e-06, + "loss": 1.0335, + "step": 5986 + }, + { + "epoch": 0.6265829408686552, + "grad_norm": 2.037332698815376, + "learning_rate": 6.464680807957547e-06, + "loss": 0.9471, + "step": 5987 + }, + { + "epoch": 0.6266875981161696, + "grad_norm": 2.1505226752128794, + "learning_rate": 6.461510190739555e-06, + "loss": 0.9963, + "step": 5988 + }, + { + "epoch": 0.6267922553636839, + "grad_norm": 1.970070926677773, + "learning_rate": 6.458339980101218e-06, + "loss": 0.9675, + "step": 5989 + }, + { + "epoch": 0.6268969126111983, + "grad_norm": 1.8959906001986315, + "learning_rate": 6.455170176406795e-06, + "loss": 0.8347, + "step": 5990 + }, + { + "epoch": 0.6270015698587127, + "grad_norm": 1.8273443555041868, + "learning_rate": 6.452000780020507e-06, + "loss": 0.9721, + "step": 5991 + }, + { + "epoch": 0.6271062271062271, + "grad_norm": 2.1115215989855187, + "learning_rate": 6.448831791306518e-06, + "loss": 0.9706, + "step": 5992 + }, + { + "epoch": 0.6272108843537415, + "grad_norm": 1.9586483418053429, + "learning_rate": 6.445663210628955e-06, + "loss": 0.8123, + "step": 5993 + }, + { + "epoch": 0.6273155416012559, + "grad_norm": 2.2343184089780768, + "learning_rate": 6.442495038351898e-06, + "loss": 0.9436, + "step": 5994 + }, + { + "epoch": 0.6274201988487703, + "grad_norm": 2.0306223291943546, + "learning_rate": 6.439327274839371e-06, + "loss": 0.8866, + "step": 5995 + }, + { + "epoch": 0.6275248560962847, + "grad_norm": 2.2375207468319482, + "learning_rate": 6.4361599204553604e-06, + "loss": 0.9457, + "step": 5996 + }, + { + "epoch": 0.627629513343799, + "grad_norm": 1.8709896809018647, + "learning_rate": 6.4329929755637975e-06, + "loss": 0.9759, + "step": 5997 + }, + { + "epoch": 0.6277341705913134, + "grad_norm": 2.0216688575621453, + "learning_rate": 6.429826440528575e-06, + "loss": 0.888, + "step": 5998 + }, + { + "epoch": 0.6278388278388278, + "grad_norm": 1.8399467700878034, + "learning_rate": 6.4266603157135295e-06, + "loss": 0.8394, + "step": 5999 + }, + { + "epoch": 0.6279434850863422, + "grad_norm": 2.1376344086932373, + "learning_rate": 6.4234946014824604e-06, + "loss": 0.9686, + "step": 6000 + }, + { + "epoch": 0.6280481423338566, + "grad_norm": 1.9022841655578102, + "learning_rate": 6.42032929819911e-06, + "loss": 0.864, + "step": 6001 + }, + { + "epoch": 0.628152799581371, + "grad_norm": 2.083437132779924, + "learning_rate": 6.417164406227177e-06, + "loss": 0.9067, + "step": 6002 + }, + { + "epoch": 0.6282574568288855, + "grad_norm": 2.2018711885514612, + "learning_rate": 6.413999925930321e-06, + "loss": 0.9072, + "step": 6003 + }, + { + "epoch": 0.6283621140763997, + "grad_norm": 1.9021036398571836, + "learning_rate": 6.410835857672142e-06, + "loss": 0.9087, + "step": 6004 + }, + { + "epoch": 0.6284667713239142, + "grad_norm": 1.935354772806226, + "learning_rate": 6.4076722018162e-06, + "loss": 0.9119, + "step": 6005 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 1.9033643377647045, + "learning_rate": 6.4045089587260054e-06, + "loss": 0.8665, + "step": 6006 + }, + { + "epoch": 0.628676085818943, + "grad_norm": 2.3015121040796434, + "learning_rate": 6.401346128765018e-06, + "loss": 0.9669, + "step": 6007 + }, + { + "epoch": 0.6287807430664574, + "grad_norm": 2.203066783924028, + "learning_rate": 6.398183712296657e-06, + "loss": 0.8435, + "step": 6008 + }, + { + "epoch": 0.6288854003139718, + "grad_norm": 1.833007785001033, + "learning_rate": 6.395021709684288e-06, + "loss": 0.9143, + "step": 6009 + }, + { + "epoch": 0.6289900575614862, + "grad_norm": 2.3499962012268814, + "learning_rate": 6.391860121291233e-06, + "loss": 0.8905, + "step": 6010 + }, + { + "epoch": 0.6290947148090005, + "grad_norm": 2.1025471869532764, + "learning_rate": 6.388698947480762e-06, + "loss": 0.8267, + "step": 6011 + }, + { + "epoch": 0.6291993720565149, + "grad_norm": 1.8540577312675692, + "learning_rate": 6.385538188616105e-06, + "loss": 0.8396, + "step": 6012 + }, + { + "epoch": 0.6293040293040293, + "grad_norm": 1.8832622109992163, + "learning_rate": 6.3823778450604376e-06, + "loss": 0.8341, + "step": 6013 + }, + { + "epoch": 0.6294086865515437, + "grad_norm": 2.404866254697502, + "learning_rate": 6.37921791717689e-06, + "loss": 0.9017, + "step": 6014 + }, + { + "epoch": 0.6295133437990581, + "grad_norm": 1.940080093100944, + "learning_rate": 6.376058405328543e-06, + "loss": 0.964, + "step": 6015 + }, + { + "epoch": 0.6296180010465725, + "grad_norm": 2.045127466425197, + "learning_rate": 6.372899309878432e-06, + "loss": 0.9815, + "step": 6016 + }, + { + "epoch": 0.6297226582940869, + "grad_norm": 2.1577986035955505, + "learning_rate": 6.369740631189545e-06, + "loss": 1.0398, + "step": 6017 + }, + { + "epoch": 0.6298273155416012, + "grad_norm": 1.749672091610911, + "learning_rate": 6.366582369624816e-06, + "loss": 0.7923, + "step": 6018 + }, + { + "epoch": 0.6299319727891156, + "grad_norm": 2.109385169681741, + "learning_rate": 6.363424525547136e-06, + "loss": 1.0015, + "step": 6019 + }, + { + "epoch": 0.63003663003663, + "grad_norm": 2.4515134604996156, + "learning_rate": 6.360267099319355e-06, + "loss": 1.0339, + "step": 6020 + }, + { + "epoch": 0.6301412872841444, + "grad_norm": 1.8551476559676408, + "learning_rate": 6.357110091304259e-06, + "loss": 0.8783, + "step": 6021 + }, + { + "epoch": 0.6302459445316588, + "grad_norm": 2.0924036270338786, + "learning_rate": 6.353953501864602e-06, + "loss": 0.9481, + "step": 6022 + }, + { + "epoch": 0.6303506017791732, + "grad_norm": 1.895267228752824, + "learning_rate": 6.35079733136308e-06, + "loss": 0.9792, + "step": 6023 + }, + { + "epoch": 0.6304552590266876, + "grad_norm": 1.8118380058338375, + "learning_rate": 6.3476415801623394e-06, + "loss": 0.8829, + "step": 6024 + }, + { + "epoch": 0.6305599162742019, + "grad_norm": 1.8405134542145882, + "learning_rate": 6.344486248624987e-06, + "loss": 0.7713, + "step": 6025 + }, + { + "epoch": 0.6306645735217163, + "grad_norm": 2.218547626688002, + "learning_rate": 6.341331337113573e-06, + "loss": 0.9786, + "step": 6026 + }, + { + "epoch": 0.6307692307692307, + "grad_norm": 1.882913448926194, + "learning_rate": 6.338176845990608e-06, + "loss": 0.9793, + "step": 6027 + }, + { + "epoch": 0.6308738880167452, + "grad_norm": 1.9751456320726135, + "learning_rate": 6.335022775618542e-06, + "loss": 0.8445, + "step": 6028 + }, + { + "epoch": 0.6309785452642596, + "grad_norm": 2.1125744499378496, + "learning_rate": 6.331869126359791e-06, + "loss": 0.9355, + "step": 6029 + }, + { + "epoch": 0.631083202511774, + "grad_norm": 1.6815396479096147, + "learning_rate": 6.328715898576716e-06, + "loss": 0.7911, + "step": 6030 + }, + { + "epoch": 0.6311878597592884, + "grad_norm": 1.7816434864441257, + "learning_rate": 6.3255630926316236e-06, + "loss": 0.8058, + "step": 6031 + }, + { + "epoch": 0.6312925170068027, + "grad_norm": 2.075892498540421, + "learning_rate": 6.3224107088867835e-06, + "loss": 0.9163, + "step": 6032 + }, + { + "epoch": 0.6313971742543171, + "grad_norm": 1.7175221205045825, + "learning_rate": 6.319258747704406e-06, + "loss": 0.7745, + "step": 6033 + }, + { + "epoch": 0.6315018315018315, + "grad_norm": 1.90865898477721, + "learning_rate": 6.316107209446662e-06, + "loss": 0.8428, + "step": 6034 + }, + { + "epoch": 0.6316064887493459, + "grad_norm": 2.011691584165178, + "learning_rate": 6.312956094475666e-06, + "loss": 0.9125, + "step": 6035 + }, + { + "epoch": 0.6317111459968603, + "grad_norm": 2.117883184326949, + "learning_rate": 6.30980540315349e-06, + "loss": 1.0146, + "step": 6036 + }, + { + "epoch": 0.6318158032443747, + "grad_norm": 1.9763590631964858, + "learning_rate": 6.306655135842151e-06, + "loss": 0.8664, + "step": 6037 + }, + { + "epoch": 0.6319204604918891, + "grad_norm": 1.9200563612501522, + "learning_rate": 6.303505292903627e-06, + "loss": 0.865, + "step": 6038 + }, + { + "epoch": 0.6320251177394035, + "grad_norm": 1.95702742334413, + "learning_rate": 6.300355874699839e-06, + "loss": 0.984, + "step": 6039 + }, + { + "epoch": 0.6321297749869178, + "grad_norm": 1.9835884003747482, + "learning_rate": 6.297206881592659e-06, + "loss": 0.8374, + "step": 6040 + }, + { + "epoch": 0.6322344322344322, + "grad_norm": 1.9891775252939559, + "learning_rate": 6.294058313943916e-06, + "loss": 0.8428, + "step": 6041 + }, + { + "epoch": 0.6323390894819466, + "grad_norm": 2.2448267920259655, + "learning_rate": 6.290910172115387e-06, + "loss": 0.8765, + "step": 6042 + }, + { + "epoch": 0.632443746729461, + "grad_norm": 2.0880819996965405, + "learning_rate": 6.287762456468796e-06, + "loss": 1.0132, + "step": 6043 + }, + { + "epoch": 0.6325484039769754, + "grad_norm": 2.321191163987611, + "learning_rate": 6.284615167365826e-06, + "loss": 0.9126, + "step": 6044 + }, + { + "epoch": 0.6326530612244898, + "grad_norm": 2.157686318063533, + "learning_rate": 6.281468305168102e-06, + "loss": 0.9658, + "step": 6045 + }, + { + "epoch": 0.6327577184720042, + "grad_norm": 1.8521424353333522, + "learning_rate": 6.278321870237207e-06, + "loss": 0.9874, + "step": 6046 + }, + { + "epoch": 0.6328623757195185, + "grad_norm": 1.985600041977417, + "learning_rate": 6.275175862934679e-06, + "loss": 0.8763, + "step": 6047 + }, + { + "epoch": 0.6329670329670329, + "grad_norm": 2.167359949334803, + "learning_rate": 6.2720302836219925e-06, + "loss": 0.8538, + "step": 6048 + }, + { + "epoch": 0.6330716902145473, + "grad_norm": 1.892338682422864, + "learning_rate": 6.268885132660585e-06, + "loss": 0.857, + "step": 6049 + }, + { + "epoch": 0.6331763474620618, + "grad_norm": 2.074732815371282, + "learning_rate": 6.265740410411838e-06, + "loss": 0.9257, + "step": 6050 + }, + { + "epoch": 0.6332810047095762, + "grad_norm": 1.9538306094870916, + "learning_rate": 6.26259611723709e-06, + "loss": 0.9022, + "step": 6051 + }, + { + "epoch": 0.6333856619570906, + "grad_norm": 2.052293995501398, + "learning_rate": 6.259452253497622e-06, + "loss": 0.9688, + "step": 6052 + }, + { + "epoch": 0.633490319204605, + "grad_norm": 1.9040390605918567, + "learning_rate": 6.256308819554676e-06, + "loss": 0.9216, + "step": 6053 + }, + { + "epoch": 0.6335949764521193, + "grad_norm": 1.887491395994407, + "learning_rate": 6.253165815769431e-06, + "loss": 0.9326, + "step": 6054 + }, + { + "epoch": 0.6336996336996337, + "grad_norm": 2.1059814291825054, + "learning_rate": 6.250023242503031e-06, + "loss": 0.9491, + "step": 6055 + }, + { + "epoch": 0.6338042909471481, + "grad_norm": 1.949072153051419, + "learning_rate": 6.246881100116566e-06, + "loss": 0.7935, + "step": 6056 + }, + { + "epoch": 0.6339089481946625, + "grad_norm": 2.0739299147239607, + "learning_rate": 6.243739388971068e-06, + "loss": 0.959, + "step": 6057 + }, + { + "epoch": 0.6340136054421769, + "grad_norm": 2.001653115479742, + "learning_rate": 6.240598109427531e-06, + "loss": 0.9694, + "step": 6058 + }, + { + "epoch": 0.6341182626896913, + "grad_norm": 2.2062451919821355, + "learning_rate": 6.237457261846891e-06, + "loss": 0.9713, + "step": 6059 + }, + { + "epoch": 0.6342229199372057, + "grad_norm": 2.0518558414861845, + "learning_rate": 6.234316846590041e-06, + "loss": 0.9721, + "step": 6060 + }, + { + "epoch": 0.63432757718472, + "grad_norm": 2.0231758494324352, + "learning_rate": 6.231176864017819e-06, + "loss": 1.0087, + "step": 6061 + }, + { + "epoch": 0.6344322344322344, + "grad_norm": 2.315599702222333, + "learning_rate": 6.228037314491013e-06, + "loss": 0.9439, + "step": 6062 + }, + { + "epoch": 0.6345368916797488, + "grad_norm": 2.0500400828302956, + "learning_rate": 6.224898198370363e-06, + "loss": 0.9502, + "step": 6063 + }, + { + "epoch": 0.6346415489272632, + "grad_norm": 1.9582030992702546, + "learning_rate": 6.22175951601657e-06, + "loss": 0.9388, + "step": 6064 + }, + { + "epoch": 0.6347462061747776, + "grad_norm": 1.8671881803341661, + "learning_rate": 6.218621267790263e-06, + "loss": 0.9292, + "step": 6065 + }, + { + "epoch": 0.634850863422292, + "grad_norm": 1.9018283992660436, + "learning_rate": 6.215483454052043e-06, + "loss": 0.8481, + "step": 6066 + }, + { + "epoch": 0.6349555206698064, + "grad_norm": 2.1228197696527915, + "learning_rate": 6.2123460751624425e-06, + "loss": 0.9088, + "step": 6067 + }, + { + "epoch": 0.6350601779173207, + "grad_norm": 2.0541407952110817, + "learning_rate": 6.209209131481957e-06, + "loss": 0.8942, + "step": 6068 + }, + { + "epoch": 0.6351648351648351, + "grad_norm": 2.4812046684098146, + "learning_rate": 6.206072623371027e-06, + "loss": 0.9851, + "step": 6069 + }, + { + "epoch": 0.6352694924123495, + "grad_norm": 2.466479670275315, + "learning_rate": 6.202936551190044e-06, + "loss": 0.9819, + "step": 6070 + }, + { + "epoch": 0.6353741496598639, + "grad_norm": 1.928191042129914, + "learning_rate": 6.199800915299345e-06, + "loss": 0.8792, + "step": 6071 + }, + { + "epoch": 0.6354788069073783, + "grad_norm": 1.638080208345466, + "learning_rate": 6.196665716059229e-06, + "loss": 0.7502, + "step": 6072 + }, + { + "epoch": 0.6355834641548928, + "grad_norm": 1.935024991236978, + "learning_rate": 6.193530953829931e-06, + "loss": 0.9067, + "step": 6073 + }, + { + "epoch": 0.6356881214024072, + "grad_norm": 1.9042984524406725, + "learning_rate": 6.190396628971644e-06, + "loss": 1.0139, + "step": 6074 + }, + { + "epoch": 0.6357927786499215, + "grad_norm": 2.096217219665164, + "learning_rate": 6.187262741844507e-06, + "loss": 0.9256, + "step": 6075 + }, + { + "epoch": 0.6358974358974359, + "grad_norm": 2.1629109423290602, + "learning_rate": 6.18412929280861e-06, + "loss": 1.1161, + "step": 6076 + }, + { + "epoch": 0.6360020931449503, + "grad_norm": 2.4819953770244543, + "learning_rate": 6.180996282223993e-06, + "loss": 1.015, + "step": 6077 + }, + { + "epoch": 0.6361067503924647, + "grad_norm": 1.973489568225318, + "learning_rate": 6.177863710450645e-06, + "loss": 0.8687, + "step": 6078 + }, + { + "epoch": 0.6362114076399791, + "grad_norm": 2.0537245713359455, + "learning_rate": 6.174731577848505e-06, + "loss": 1.0115, + "step": 6079 + }, + { + "epoch": 0.6363160648874935, + "grad_norm": 2.1026542763075056, + "learning_rate": 6.171599884777458e-06, + "loss": 0.8719, + "step": 6080 + }, + { + "epoch": 0.6364207221350079, + "grad_norm": 1.9810644360085412, + "learning_rate": 6.168468631597349e-06, + "loss": 0.9621, + "step": 6081 + }, + { + "epoch": 0.6365253793825223, + "grad_norm": 1.8498313936174973, + "learning_rate": 6.16533781866796e-06, + "loss": 0.8684, + "step": 6082 + }, + { + "epoch": 0.6366300366300366, + "grad_norm": 1.882513401890856, + "learning_rate": 6.162207446349031e-06, + "loss": 0.8804, + "step": 6083 + }, + { + "epoch": 0.636734693877551, + "grad_norm": 2.1454199707720045, + "learning_rate": 6.159077515000245e-06, + "loss": 0.8786, + "step": 6084 + }, + { + "epoch": 0.6368393511250654, + "grad_norm": 2.0348014676633848, + "learning_rate": 6.155948024981241e-06, + "loss": 0.9031, + "step": 6085 + }, + { + "epoch": 0.6369440083725798, + "grad_norm": 2.0035495835149786, + "learning_rate": 6.1528189766515996e-06, + "loss": 0.965, + "step": 6086 + }, + { + "epoch": 0.6370486656200942, + "grad_norm": 2.180282601428462, + "learning_rate": 6.1496903703708575e-06, + "loss": 1.0432, + "step": 6087 + }, + { + "epoch": 0.6371533228676086, + "grad_norm": 2.1368862326905043, + "learning_rate": 6.146562206498497e-06, + "loss": 0.9117, + "step": 6088 + }, + { + "epoch": 0.637257980115123, + "grad_norm": 2.043077967035869, + "learning_rate": 6.143434485393947e-06, + "loss": 0.9375, + "step": 6089 + }, + { + "epoch": 0.6373626373626373, + "grad_norm": 2.133413981464569, + "learning_rate": 6.140307207416598e-06, + "loss": 0.9432, + "step": 6090 + }, + { + "epoch": 0.6374672946101517, + "grad_norm": 1.8874105602733608, + "learning_rate": 6.137180372925773e-06, + "loss": 0.9215, + "step": 6091 + }, + { + "epoch": 0.6375719518576661, + "grad_norm": 2.1540406495422624, + "learning_rate": 6.134053982280756e-06, + "loss": 0.8921, + "step": 6092 + }, + { + "epoch": 0.6376766091051805, + "grad_norm": 2.267611660094676, + "learning_rate": 6.130928035840771e-06, + "loss": 0.9158, + "step": 6093 + }, + { + "epoch": 0.6377812663526949, + "grad_norm": 2.132196533283258, + "learning_rate": 6.127802533965001e-06, + "loss": 0.9576, + "step": 6094 + }, + { + "epoch": 0.6378859236002093, + "grad_norm": 1.7952285637216865, + "learning_rate": 6.124677477012566e-06, + "loss": 0.8652, + "step": 6095 + }, + { + "epoch": 0.6379905808477238, + "grad_norm": 2.268355822133597, + "learning_rate": 6.121552865342548e-06, + "loss": 0.9496, + "step": 6096 + }, + { + "epoch": 0.638095238095238, + "grad_norm": 2.480468965787619, + "learning_rate": 6.118428699313965e-06, + "loss": 0.9992, + "step": 6097 + }, + { + "epoch": 0.6381998953427525, + "grad_norm": 2.284899377257853, + "learning_rate": 6.115304979285794e-06, + "loss": 1.0189, + "step": 6098 + }, + { + "epoch": 0.6383045525902669, + "grad_norm": 2.2705419138837386, + "learning_rate": 6.112181705616958e-06, + "loss": 0.7452, + "step": 6099 + }, + { + "epoch": 0.6384092098377813, + "grad_norm": 1.9550159382736858, + "learning_rate": 6.109058878666326e-06, + "loss": 0.934, + "step": 6100 + }, + { + "epoch": 0.6385138670852957, + "grad_norm": 2.380989502303415, + "learning_rate": 6.105936498792715e-06, + "loss": 0.8759, + "step": 6101 + }, + { + "epoch": 0.6386185243328101, + "grad_norm": 2.3521973407044827, + "learning_rate": 6.102814566354896e-06, + "loss": 0.9418, + "step": 6102 + }, + { + "epoch": 0.6387231815803245, + "grad_norm": 2.1620835017709092, + "learning_rate": 6.099693081711582e-06, + "loss": 0.9934, + "step": 6103 + }, + { + "epoch": 0.6388278388278388, + "grad_norm": 2.304478296902805, + "learning_rate": 6.09657204522144e-06, + "loss": 0.9824, + "step": 6104 + }, + { + "epoch": 0.6389324960753532, + "grad_norm": 2.124733642865628, + "learning_rate": 6.093451457243082e-06, + "loss": 0.8826, + "step": 6105 + }, + { + "epoch": 0.6390371533228676, + "grad_norm": 2.2586347480935216, + "learning_rate": 6.09033131813507e-06, + "loss": 1.0353, + "step": 6106 + }, + { + "epoch": 0.639141810570382, + "grad_norm": 2.183731475063431, + "learning_rate": 6.087211628255918e-06, + "loss": 0.9315, + "step": 6107 + }, + { + "epoch": 0.6392464678178964, + "grad_norm": 2.1727879672656063, + "learning_rate": 6.08409238796408e-06, + "loss": 1.017, + "step": 6108 + }, + { + "epoch": 0.6393511250654108, + "grad_norm": 1.8520184743679367, + "learning_rate": 6.080973597617969e-06, + "loss": 0.7484, + "step": 6109 + }, + { + "epoch": 0.6394557823129252, + "grad_norm": 2.1309335686211384, + "learning_rate": 6.077855257575932e-06, + "loss": 0.9459, + "step": 6110 + }, + { + "epoch": 0.6395604395604395, + "grad_norm": 2.112146194809444, + "learning_rate": 6.074737368196279e-06, + "loss": 0.9341, + "step": 6111 + }, + { + "epoch": 0.6396650968079539, + "grad_norm": 2.2818082480682373, + "learning_rate": 6.071619929837259e-06, + "loss": 0.9016, + "step": 6112 + }, + { + "epoch": 0.6397697540554683, + "grad_norm": 1.9751366903614145, + "learning_rate": 6.068502942857075e-06, + "loss": 0.7711, + "step": 6113 + }, + { + "epoch": 0.6398744113029827, + "grad_norm": 2.0611570942571813, + "learning_rate": 6.065386407613869e-06, + "loss": 0.9724, + "step": 6114 + }, + { + "epoch": 0.6399790685504971, + "grad_norm": 2.0283236031744316, + "learning_rate": 6.06227032446574e-06, + "loss": 0.8681, + "step": 6115 + }, + { + "epoch": 0.6400837257980115, + "grad_norm": 1.840467172717708, + "learning_rate": 6.0591546937707375e-06, + "loss": 0.8617, + "step": 6116 + }, + { + "epoch": 0.6401883830455259, + "grad_norm": 2.216732313893134, + "learning_rate": 6.056039515886848e-06, + "loss": 0.9226, + "step": 6117 + }, + { + "epoch": 0.6402930402930402, + "grad_norm": 2.0576439170006062, + "learning_rate": 6.0529247911720145e-06, + "loss": 0.9738, + "step": 6118 + }, + { + "epoch": 0.6403976975405546, + "grad_norm": 2.004555679033776, + "learning_rate": 6.049810519984125e-06, + "loss": 0.9752, + "step": 6119 + }, + { + "epoch": 0.640502354788069, + "grad_norm": 2.121801609117083, + "learning_rate": 6.046696702681012e-06, + "loss": 0.9376, + "step": 6120 + }, + { + "epoch": 0.6406070120355835, + "grad_norm": 2.0557921754943638, + "learning_rate": 6.043583339620465e-06, + "loss": 0.8175, + "step": 6121 + }, + { + "epoch": 0.6407116692830979, + "grad_norm": 2.117196486653035, + "learning_rate": 6.0404704311602095e-06, + "loss": 0.914, + "step": 6122 + }, + { + "epoch": 0.6408163265306123, + "grad_norm": 2.454371990920559, + "learning_rate": 6.037357977657926e-06, + "loss": 0.8974, + "step": 6123 + }, + { + "epoch": 0.6409209837781267, + "grad_norm": 1.779633370179486, + "learning_rate": 6.034245979471249e-06, + "loss": 0.8059, + "step": 6124 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 2.224227729668064, + "learning_rate": 6.031134436957747e-06, + "loss": 0.9438, + "step": 6125 + }, + { + "epoch": 0.6411302982731554, + "grad_norm": 1.8793850100294136, + "learning_rate": 6.028023350474943e-06, + "loss": 0.8463, + "step": 6126 + }, + { + "epoch": 0.6412349555206698, + "grad_norm": 2.0192489810899823, + "learning_rate": 6.024912720380309e-06, + "loss": 0.8065, + "step": 6127 + }, + { + "epoch": 0.6413396127681842, + "grad_norm": 2.086275873749281, + "learning_rate": 6.021802547031263e-06, + "loss": 0.9134, + "step": 6128 + }, + { + "epoch": 0.6414442700156986, + "grad_norm": 2.239582460843957, + "learning_rate": 6.018692830785167e-06, + "loss": 0.8925, + "step": 6129 + }, + { + "epoch": 0.641548927263213, + "grad_norm": 1.9036697749177023, + "learning_rate": 6.015583571999337e-06, + "loss": 0.9513, + "step": 6130 + }, + { + "epoch": 0.6416535845107274, + "grad_norm": 1.967430877780038, + "learning_rate": 6.012474771031029e-06, + "loss": 0.9026, + "step": 6131 + }, + { + "epoch": 0.6417582417582418, + "grad_norm": 1.9634313043742981, + "learning_rate": 6.009366428237453e-06, + "loss": 0.9432, + "step": 6132 + }, + { + "epoch": 0.6418628990057561, + "grad_norm": 1.907415430341769, + "learning_rate": 6.006258543975769e-06, + "loss": 1.0205, + "step": 6133 + }, + { + "epoch": 0.6419675562532705, + "grad_norm": 1.9958836925353802, + "learning_rate": 6.003151118603071e-06, + "loss": 0.9038, + "step": 6134 + }, + { + "epoch": 0.6420722135007849, + "grad_norm": 1.974573234104124, + "learning_rate": 6.000044152476414e-06, + "loss": 0.8984, + "step": 6135 + }, + { + "epoch": 0.6421768707482993, + "grad_norm": 1.9662322439234587, + "learning_rate": 5.996937645952792e-06, + "loss": 0.8528, + "step": 6136 + }, + { + "epoch": 0.6422815279958137, + "grad_norm": 2.049330183711433, + "learning_rate": 5.993831599389149e-06, + "loss": 0.8704, + "step": 6137 + }, + { + "epoch": 0.6423861852433281, + "grad_norm": 1.8386217231992765, + "learning_rate": 5.990726013142378e-06, + "loss": 0.8267, + "step": 6138 + }, + { + "epoch": 0.6424908424908425, + "grad_norm": 1.9591896043802715, + "learning_rate": 5.987620887569314e-06, + "loss": 0.8269, + "step": 6139 + }, + { + "epoch": 0.6425954997383568, + "grad_norm": 2.0298076454052065, + "learning_rate": 5.984516223026746e-06, + "loss": 0.9347, + "step": 6140 + }, + { + "epoch": 0.6427001569858712, + "grad_norm": 2.0195419480669683, + "learning_rate": 5.9814120198714e-06, + "loss": 0.9478, + "step": 6141 + }, + { + "epoch": 0.6428048142333856, + "grad_norm": 2.0883419789864335, + "learning_rate": 5.9783082784599615e-06, + "loss": 0.8287, + "step": 6142 + }, + { + "epoch": 0.6429094714809, + "grad_norm": 2.1269657774275004, + "learning_rate": 5.975204999149056e-06, + "loss": 0.9302, + "step": 6143 + }, + { + "epoch": 0.6430141287284145, + "grad_norm": 2.016828126075074, + "learning_rate": 5.972102182295254e-06, + "loss": 1.0265, + "step": 6144 + }, + { + "epoch": 0.6431187859759289, + "grad_norm": 1.6917299157653185, + "learning_rate": 5.968999828255079e-06, + "loss": 0.7979, + "step": 6145 + }, + { + "epoch": 0.6432234432234433, + "grad_norm": 1.8616864521421854, + "learning_rate": 5.965897937384992e-06, + "loss": 0.7963, + "step": 6146 + }, + { + "epoch": 0.6433281004709576, + "grad_norm": 2.341785220873819, + "learning_rate": 5.962796510041413e-06, + "loss": 0.9599, + "step": 6147 + }, + { + "epoch": 0.643432757718472, + "grad_norm": 1.9957063834445605, + "learning_rate": 5.9596955465806974e-06, + "loss": 1.0314, + "step": 6148 + }, + { + "epoch": 0.6435374149659864, + "grad_norm": 2.08882364485049, + "learning_rate": 5.956595047359151e-06, + "loss": 0.8416, + "step": 6149 + }, + { + "epoch": 0.6436420722135008, + "grad_norm": 1.8725034362012762, + "learning_rate": 5.953495012733035e-06, + "loss": 0.8405, + "step": 6150 + }, + { + "epoch": 0.6437467294610152, + "grad_norm": 1.9625445407363562, + "learning_rate": 5.950395443058543e-06, + "loss": 0.9756, + "step": 6151 + }, + { + "epoch": 0.6438513867085296, + "grad_norm": 2.0630976658247766, + "learning_rate": 5.947296338691827e-06, + "loss": 0.9711, + "step": 6152 + }, + { + "epoch": 0.643956043956044, + "grad_norm": 1.9216746094429082, + "learning_rate": 5.944197699988975e-06, + "loss": 0.9195, + "step": 6153 + }, + { + "epoch": 0.6440607012035583, + "grad_norm": 2.2677133214478324, + "learning_rate": 5.9410995273060316e-06, + "loss": 0.9668, + "step": 6154 + }, + { + "epoch": 0.6441653584510727, + "grad_norm": 2.026665277815765, + "learning_rate": 5.93800182099898e-06, + "loss": 1.006, + "step": 6155 + }, + { + "epoch": 0.6442700156985871, + "grad_norm": 2.309012027737432, + "learning_rate": 5.93490458142375e-06, + "loss": 0.8949, + "step": 6156 + }, + { + "epoch": 0.6443746729461015, + "grad_norm": 1.8376028280551246, + "learning_rate": 5.93180780893623e-06, + "loss": 0.9266, + "step": 6157 + }, + { + "epoch": 0.6444793301936159, + "grad_norm": 1.8687096935531817, + "learning_rate": 5.928711503892233e-06, + "loss": 0.7169, + "step": 6158 + }, + { + "epoch": 0.6445839874411303, + "grad_norm": 1.7565771976435207, + "learning_rate": 5.925615666647541e-06, + "loss": 0.8147, + "step": 6159 + }, + { + "epoch": 0.6446886446886447, + "grad_norm": 2.2216686274696182, + "learning_rate": 5.92252029755787e-06, + "loss": 0.9832, + "step": 6160 + }, + { + "epoch": 0.644793301936159, + "grad_norm": 1.9664584115913126, + "learning_rate": 5.91942539697888e-06, + "loss": 0.9132, + "step": 6161 + }, + { + "epoch": 0.6448979591836734, + "grad_norm": 2.2259482681346325, + "learning_rate": 5.9163309652661886e-06, + "loss": 1.0205, + "step": 6162 + }, + { + "epoch": 0.6450026164311878, + "grad_norm": 1.974310097833552, + "learning_rate": 5.913237002775342e-06, + "loss": 0.8725, + "step": 6163 + }, + { + "epoch": 0.6451072736787022, + "grad_norm": 1.9361555583674028, + "learning_rate": 5.910143509861854e-06, + "loss": 0.8765, + "step": 6164 + }, + { + "epoch": 0.6452119309262166, + "grad_norm": 2.3259766646067805, + "learning_rate": 5.907050486881164e-06, + "loss": 0.8672, + "step": 6165 + }, + { + "epoch": 0.645316588173731, + "grad_norm": 2.0520065137688976, + "learning_rate": 5.903957934188673e-06, + "loss": 0.942, + "step": 6166 + }, + { + "epoch": 0.6454212454212455, + "grad_norm": 2.308496686545806, + "learning_rate": 5.900865852139715e-06, + "loss": 1.0235, + "step": 6167 + }, + { + "epoch": 0.6455259026687599, + "grad_norm": 1.9275377714588016, + "learning_rate": 5.897774241089581e-06, + "loss": 0.9101, + "step": 6168 + }, + { + "epoch": 0.6456305599162742, + "grad_norm": 1.7246090565713368, + "learning_rate": 5.894683101393505e-06, + "loss": 0.813, + "step": 6169 + }, + { + "epoch": 0.6457352171637886, + "grad_norm": 2.090211706193853, + "learning_rate": 5.8915924334066635e-06, + "loss": 0.9071, + "step": 6170 + }, + { + "epoch": 0.645839874411303, + "grad_norm": 2.225562586570551, + "learning_rate": 5.888502237484179e-06, + "loss": 0.89, + "step": 6171 + }, + { + "epoch": 0.6459445316588174, + "grad_norm": 2.1959182655755507, + "learning_rate": 5.885412513981121e-06, + "loss": 0.9924, + "step": 6172 + }, + { + "epoch": 0.6460491889063318, + "grad_norm": 2.0400315285376918, + "learning_rate": 5.88232326325251e-06, + "loss": 0.9976, + "step": 6173 + }, + { + "epoch": 0.6461538461538462, + "grad_norm": 2.2524957844292084, + "learning_rate": 5.879234485653302e-06, + "loss": 0.9471, + "step": 6174 + }, + { + "epoch": 0.6462585034013606, + "grad_norm": 1.9865407512804576, + "learning_rate": 5.8761461815384005e-06, + "loss": 1.007, + "step": 6175 + }, + { + "epoch": 0.6463631606488749, + "grad_norm": 1.8960367283948207, + "learning_rate": 5.873058351262666e-06, + "loss": 0.8504, + "step": 6176 + }, + { + "epoch": 0.6464678178963893, + "grad_norm": 1.7594307610018052, + "learning_rate": 5.869970995180896e-06, + "loss": 0.9622, + "step": 6177 + }, + { + "epoch": 0.6465724751439037, + "grad_norm": 2.262308058632982, + "learning_rate": 5.866884113647827e-06, + "loss": 1.0252, + "step": 6178 + }, + { + "epoch": 0.6466771323914181, + "grad_norm": 1.9209190231553315, + "learning_rate": 5.863797707018155e-06, + "loss": 0.8007, + "step": 6179 + }, + { + "epoch": 0.6467817896389325, + "grad_norm": 1.5447458009603428, + "learning_rate": 5.86071177564651e-06, + "loss": 0.778, + "step": 6180 + }, + { + "epoch": 0.6468864468864469, + "grad_norm": 1.8092557053926288, + "learning_rate": 5.857626319887475e-06, + "loss": 0.9498, + "step": 6181 + }, + { + "epoch": 0.6469911041339613, + "grad_norm": 2.2187668350942693, + "learning_rate": 5.85454134009557e-06, + "loss": 0.8878, + "step": 6182 + }, + { + "epoch": 0.6470957613814756, + "grad_norm": 2.0160589952562384, + "learning_rate": 5.851456836625271e-06, + "loss": 0.8794, + "step": 6183 + }, + { + "epoch": 0.64720041862899, + "grad_norm": 1.8692365583128616, + "learning_rate": 5.848372809830989e-06, + "loss": 0.9669, + "step": 6184 + }, + { + "epoch": 0.6473050758765044, + "grad_norm": 2.127493895012381, + "learning_rate": 5.845289260067089e-06, + "loss": 0.92, + "step": 6185 + }, + { + "epoch": 0.6474097331240188, + "grad_norm": 2.2617699972750125, + "learning_rate": 5.842206187687876e-06, + "loss": 0.889, + "step": 6186 + }, + { + "epoch": 0.6475143903715332, + "grad_norm": 2.134441719505655, + "learning_rate": 5.8391235930476e-06, + "loss": 0.8696, + "step": 6187 + }, + { + "epoch": 0.6476190476190476, + "grad_norm": 2.6083578482447947, + "learning_rate": 5.836041476500458e-06, + "loss": 0.9542, + "step": 6188 + }, + { + "epoch": 0.6477237048665621, + "grad_norm": 2.3257712734015388, + "learning_rate": 5.832959838400593e-06, + "loss": 0.8224, + "step": 6189 + }, + { + "epoch": 0.6478283621140764, + "grad_norm": 2.3044088351111256, + "learning_rate": 5.82987867910209e-06, + "loss": 0.8298, + "step": 6190 + }, + { + "epoch": 0.6479330193615908, + "grad_norm": 2.026561166745952, + "learning_rate": 5.8267979989589815e-06, + "loss": 0.82, + "step": 6191 + }, + { + "epoch": 0.6480376766091052, + "grad_norm": 1.965006660005929, + "learning_rate": 5.823717798325238e-06, + "loss": 0.8973, + "step": 6192 + }, + { + "epoch": 0.6481423338566196, + "grad_norm": 1.9771041937813247, + "learning_rate": 5.820638077554785e-06, + "loss": 0.9496, + "step": 6193 + }, + { + "epoch": 0.648246991104134, + "grad_norm": 2.1258967665098134, + "learning_rate": 5.8175588370014955e-06, + "loss": 0.9364, + "step": 6194 + }, + { + "epoch": 0.6483516483516484, + "grad_norm": 2.1439985227923644, + "learning_rate": 5.814480077019173e-06, + "loss": 0.988, + "step": 6195 + }, + { + "epoch": 0.6484563055991628, + "grad_norm": 2.133683906939611, + "learning_rate": 5.811401797961576e-06, + "loss": 0.7218, + "step": 6196 + }, + { + "epoch": 0.6485609628466771, + "grad_norm": 2.1684367284425567, + "learning_rate": 5.8083240001824015e-06, + "loss": 0.9307, + "step": 6197 + }, + { + "epoch": 0.6486656200941915, + "grad_norm": 2.1509821140259078, + "learning_rate": 5.805246684035293e-06, + "loss": 0.9419, + "step": 6198 + }, + { + "epoch": 0.6487702773417059, + "grad_norm": 2.40745342036477, + "learning_rate": 5.802169849873849e-06, + "loss": 0.971, + "step": 6199 + }, + { + "epoch": 0.6488749345892203, + "grad_norm": 1.8193600056832606, + "learning_rate": 5.799093498051599e-06, + "loss": 0.948, + "step": 6200 + }, + { + "epoch": 0.6489795918367347, + "grad_norm": 1.7020389159483353, + "learning_rate": 5.796017628922018e-06, + "loss": 0.7398, + "step": 6201 + }, + { + "epoch": 0.6490842490842491, + "grad_norm": 2.04512342282411, + "learning_rate": 5.792942242838537e-06, + "loss": 0.8347, + "step": 6202 + }, + { + "epoch": 0.6491889063317635, + "grad_norm": 2.5262087123815844, + "learning_rate": 5.789867340154518e-06, + "loss": 0.8673, + "step": 6203 + }, + { + "epoch": 0.6492935635792778, + "grad_norm": 1.7089620050014493, + "learning_rate": 5.786792921223281e-06, + "loss": 0.7972, + "step": 6204 + }, + { + "epoch": 0.6493982208267922, + "grad_norm": 2.1423319228812825, + "learning_rate": 5.783718986398077e-06, + "loss": 0.84, + "step": 6205 + }, + { + "epoch": 0.6495028780743066, + "grad_norm": 2.095804353523492, + "learning_rate": 5.78064553603211e-06, + "loss": 0.8465, + "step": 6206 + }, + { + "epoch": 0.649607535321821, + "grad_norm": 1.6986750357778488, + "learning_rate": 5.77757257047852e-06, + "loss": 0.8179, + "step": 6207 + }, + { + "epoch": 0.6497121925693354, + "grad_norm": 1.9991041419560274, + "learning_rate": 5.774500090090404e-06, + "loss": 0.8437, + "step": 6208 + }, + { + "epoch": 0.6498168498168498, + "grad_norm": 2.5476008805446435, + "learning_rate": 5.7714280952207955e-06, + "loss": 0.894, + "step": 6209 + }, + { + "epoch": 0.6499215070643642, + "grad_norm": 1.840271331808465, + "learning_rate": 5.7683565862226676e-06, + "loss": 0.9403, + "step": 6210 + }, + { + "epoch": 0.6500261643118787, + "grad_norm": 2.5171778195678955, + "learning_rate": 5.765285563448948e-06, + "loss": 0.8461, + "step": 6211 + }, + { + "epoch": 0.650130821559393, + "grad_norm": 1.7716194503918323, + "learning_rate": 5.7622150272525e-06, + "loss": 0.9805, + "step": 6212 + }, + { + "epoch": 0.6502354788069074, + "grad_norm": 1.8377563447111556, + "learning_rate": 5.759144977986138e-06, + "loss": 0.8581, + "step": 6213 + }, + { + "epoch": 0.6503401360544218, + "grad_norm": 1.8723011718069902, + "learning_rate": 5.756075416002618e-06, + "loss": 0.9373, + "step": 6214 + }, + { + "epoch": 0.6504447933019362, + "grad_norm": 2.069818431579889, + "learning_rate": 5.753006341654634e-06, + "loss": 0.9817, + "step": 6215 + }, + { + "epoch": 0.6505494505494506, + "grad_norm": 2.1304030948159105, + "learning_rate": 5.749937755294831e-06, + "loss": 0.8727, + "step": 6216 + }, + { + "epoch": 0.650654107796965, + "grad_norm": 1.8379536286843263, + "learning_rate": 5.746869657275792e-06, + "loss": 0.8884, + "step": 6217 + }, + { + "epoch": 0.6507587650444794, + "grad_norm": 2.193473187783517, + "learning_rate": 5.743802047950055e-06, + "loss": 0.9205, + "step": 6218 + }, + { + "epoch": 0.6508634222919937, + "grad_norm": 2.122540449060093, + "learning_rate": 5.740734927670089e-06, + "loss": 0.8567, + "step": 6219 + }, + { + "epoch": 0.6509680795395081, + "grad_norm": 1.9639756156046886, + "learning_rate": 5.737668296788314e-06, + "loss": 0.9146, + "step": 6220 + }, + { + "epoch": 0.6510727367870225, + "grad_norm": 1.952699246508381, + "learning_rate": 5.734602155657096e-06, + "loss": 0.8395, + "step": 6221 + }, + { + "epoch": 0.6511773940345369, + "grad_norm": 1.8418042050457508, + "learning_rate": 5.731536504628732e-06, + "loss": 0.8573, + "step": 6222 + }, + { + "epoch": 0.6512820512820513, + "grad_norm": 1.8770950809548828, + "learning_rate": 5.728471344055482e-06, + "loss": 0.9337, + "step": 6223 + }, + { + "epoch": 0.6513867085295657, + "grad_norm": 1.8578509595752808, + "learning_rate": 5.725406674289532e-06, + "loss": 0.8205, + "step": 6224 + }, + { + "epoch": 0.6514913657770801, + "grad_norm": 2.250049618236253, + "learning_rate": 5.722342495683021e-06, + "loss": 0.9339, + "step": 6225 + }, + { + "epoch": 0.6515960230245944, + "grad_norm": 2.075268254460256, + "learning_rate": 5.7192788085880245e-06, + "loss": 0.9066, + "step": 6226 + }, + { + "epoch": 0.6517006802721088, + "grad_norm": 2.0103126248212058, + "learning_rate": 5.716215613356577e-06, + "loss": 0.8782, + "step": 6227 + }, + { + "epoch": 0.6518053375196232, + "grad_norm": 2.170160364779897, + "learning_rate": 5.713152910340634e-06, + "loss": 0.9634, + "step": 6228 + }, + { + "epoch": 0.6519099947671376, + "grad_norm": 1.8429017442207223, + "learning_rate": 5.7100906998921154e-06, + "loss": 0.8229, + "step": 6229 + }, + { + "epoch": 0.652014652014652, + "grad_norm": 1.9662643778222724, + "learning_rate": 5.707028982362873e-06, + "loss": 0.9536, + "step": 6230 + }, + { + "epoch": 0.6521193092621664, + "grad_norm": 2.3974867995173166, + "learning_rate": 5.7039677581046984e-06, + "loss": 0.9297, + "step": 6231 + }, + { + "epoch": 0.6522239665096808, + "grad_norm": 2.0737121782229746, + "learning_rate": 5.700907027469342e-06, + "loss": 0.9138, + "step": 6232 + }, + { + "epoch": 0.6523286237571951, + "grad_norm": 1.9226135804608533, + "learning_rate": 5.697846790808483e-06, + "loss": 0.7444, + "step": 6233 + }, + { + "epoch": 0.6524332810047095, + "grad_norm": 2.0066517801777994, + "learning_rate": 5.69478704847375e-06, + "loss": 0.82, + "step": 6234 + }, + { + "epoch": 0.652537938252224, + "grad_norm": 1.891347237478382, + "learning_rate": 5.691727800816712e-06, + "loss": 0.9539, + "step": 6235 + }, + { + "epoch": 0.6526425954997384, + "grad_norm": 1.9297427031420433, + "learning_rate": 5.68866904818888e-06, + "loss": 0.8912, + "step": 6236 + }, + { + "epoch": 0.6527472527472528, + "grad_norm": 1.9332020266170267, + "learning_rate": 5.685610790941713e-06, + "loss": 0.8453, + "step": 6237 + }, + { + "epoch": 0.6528519099947672, + "grad_norm": 2.0549224122607925, + "learning_rate": 5.6825530294266185e-06, + "loss": 0.9198, + "step": 6238 + }, + { + "epoch": 0.6529565672422816, + "grad_norm": 1.7906027105209696, + "learning_rate": 5.679495763994931e-06, + "loss": 0.9266, + "step": 6239 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 1.6523611298603982, + "learning_rate": 5.67643899499794e-06, + "loss": 0.8616, + "step": 6240 + }, + { + "epoch": 0.6531658817373103, + "grad_norm": 2.1388297304679718, + "learning_rate": 5.673382722786869e-06, + "loss": 0.8582, + "step": 6241 + }, + { + "epoch": 0.6532705389848247, + "grad_norm": 1.918250281902123, + "learning_rate": 5.670326947712899e-06, + "loss": 0.9843, + "step": 6242 + }, + { + "epoch": 0.6533751962323391, + "grad_norm": 1.990471992424265, + "learning_rate": 5.667271670127138e-06, + "loss": 0.8863, + "step": 6243 + }, + { + "epoch": 0.6534798534798535, + "grad_norm": 2.053395723942595, + "learning_rate": 5.664216890380647e-06, + "loss": 1.0306, + "step": 6244 + }, + { + "epoch": 0.6535845107273679, + "grad_norm": 1.9787757140031736, + "learning_rate": 5.66116260882442e-06, + "loss": 0.8954, + "step": 6245 + }, + { + "epoch": 0.6536891679748823, + "grad_norm": 2.1654076339913457, + "learning_rate": 5.6581088258094054e-06, + "loss": 0.9597, + "step": 6246 + }, + { + "epoch": 0.6537938252223966, + "grad_norm": 2.18172363279797, + "learning_rate": 5.655055541686491e-06, + "loss": 1.008, + "step": 6247 + }, + { + "epoch": 0.653898482469911, + "grad_norm": 1.9685000331248872, + "learning_rate": 5.652002756806506e-06, + "loss": 0.8069, + "step": 6248 + }, + { + "epoch": 0.6540031397174254, + "grad_norm": 2.024974383790366, + "learning_rate": 5.648950471520217e-06, + "loss": 0.888, + "step": 6249 + }, + { + "epoch": 0.6541077969649398, + "grad_norm": 1.9484380553012453, + "learning_rate": 5.645898686178335e-06, + "loss": 0.944, + "step": 6250 + }, + { + "epoch": 0.6542124542124542, + "grad_norm": 2.231538349975824, + "learning_rate": 5.642847401131526e-06, + "loss": 0.8687, + "step": 6251 + }, + { + "epoch": 0.6543171114599686, + "grad_norm": 1.9993737936439624, + "learning_rate": 5.639796616730382e-06, + "loss": 0.9617, + "step": 6252 + }, + { + "epoch": 0.654421768707483, + "grad_norm": 2.1435053716762376, + "learning_rate": 5.636746333325447e-06, + "loss": 1.0731, + "step": 6253 + }, + { + "epoch": 0.6545264259549974, + "grad_norm": 1.9548403870891942, + "learning_rate": 5.633696551267198e-06, + "loss": 0.9179, + "step": 6254 + }, + { + "epoch": 0.6546310832025117, + "grad_norm": 1.910022534905838, + "learning_rate": 5.630647270906071e-06, + "loss": 0.9102, + "step": 6255 + }, + { + "epoch": 0.6547357404500261, + "grad_norm": 1.8907963804653292, + "learning_rate": 5.627598492592428e-06, + "loss": 0.922, + "step": 6256 + }, + { + "epoch": 0.6548403976975405, + "grad_norm": 1.954572343884513, + "learning_rate": 5.624550216676584e-06, + "loss": 0.9166, + "step": 6257 + }, + { + "epoch": 0.654945054945055, + "grad_norm": 1.8353655197423198, + "learning_rate": 5.621502443508791e-06, + "loss": 0.9632, + "step": 6258 + }, + { + "epoch": 0.6550497121925694, + "grad_norm": 2.18435647821634, + "learning_rate": 5.618455173439244e-06, + "loss": 0.8807, + "step": 6259 + }, + { + "epoch": 0.6551543694400838, + "grad_norm": 2.527165737623044, + "learning_rate": 5.615408406818074e-06, + "loss": 1.04, + "step": 6260 + }, + { + "epoch": 0.6552590266875982, + "grad_norm": 2.1620951304463962, + "learning_rate": 5.6123621439953715e-06, + "loss": 0.8673, + "step": 6261 + }, + { + "epoch": 0.6553636839351125, + "grad_norm": 1.9064074958893684, + "learning_rate": 5.609316385321149e-06, + "loss": 0.8773, + "step": 6262 + }, + { + "epoch": 0.6554683411826269, + "grad_norm": 2.240290365862607, + "learning_rate": 5.60627113114538e-06, + "loss": 0.8222, + "step": 6263 + }, + { + "epoch": 0.6555729984301413, + "grad_norm": 2.0718457091136147, + "learning_rate": 5.6032263818179635e-06, + "loss": 0.8648, + "step": 6264 + }, + { + "epoch": 0.6556776556776557, + "grad_norm": 2.0971279832921965, + "learning_rate": 5.6001821376887454e-06, + "loss": 0.9237, + "step": 6265 + }, + { + "epoch": 0.6557823129251701, + "grad_norm": 1.9785023614732484, + "learning_rate": 5.5971383991075234e-06, + "loss": 0.7651, + "step": 6266 + }, + { + "epoch": 0.6558869701726845, + "grad_norm": 1.9909685665336019, + "learning_rate": 5.594095166424025e-06, + "loss": 0.8833, + "step": 6267 + }, + { + "epoch": 0.6559916274201989, + "grad_norm": 2.105567126725272, + "learning_rate": 5.591052439987923e-06, + "loss": 0.8752, + "step": 6268 + }, + { + "epoch": 0.6560962846677132, + "grad_norm": 1.8400676598198988, + "learning_rate": 5.588010220148834e-06, + "loss": 0.8795, + "step": 6269 + }, + { + "epoch": 0.6562009419152276, + "grad_norm": 1.8357939138695751, + "learning_rate": 5.58496850725631e-06, + "loss": 0.7996, + "step": 6270 + }, + { + "epoch": 0.656305599162742, + "grad_norm": 2.0900322388751653, + "learning_rate": 5.581927301659855e-06, + "loss": 1.0018, + "step": 6271 + }, + { + "epoch": 0.6564102564102564, + "grad_norm": 1.9794563181186657, + "learning_rate": 5.578886603708914e-06, + "loss": 0.9069, + "step": 6272 + }, + { + "epoch": 0.6565149136577708, + "grad_norm": 1.9823398345613306, + "learning_rate": 5.575846413752865e-06, + "loss": 0.8751, + "step": 6273 + }, + { + "epoch": 0.6566195709052852, + "grad_norm": 2.443483126872738, + "learning_rate": 5.572806732141031e-06, + "loss": 0.891, + "step": 6274 + }, + { + "epoch": 0.6567242281527996, + "grad_norm": 1.952400398515589, + "learning_rate": 5.569767559222674e-06, + "loss": 0.8864, + "step": 6275 + }, + { + "epoch": 0.6568288854003139, + "grad_norm": 1.8830733377298436, + "learning_rate": 5.5667288953470115e-06, + "loss": 0.9083, + "step": 6276 + }, + { + "epoch": 0.6569335426478283, + "grad_norm": 1.9839437867503655, + "learning_rate": 5.563690740863184e-06, + "loss": 0.8745, + "step": 6277 + }, + { + "epoch": 0.6570381998953427, + "grad_norm": 1.8975928814696994, + "learning_rate": 5.560653096120283e-06, + "loss": 0.9024, + "step": 6278 + }, + { + "epoch": 0.6571428571428571, + "grad_norm": 2.147697717951089, + "learning_rate": 5.557615961467338e-06, + "loss": 0.8365, + "step": 6279 + }, + { + "epoch": 0.6572475143903715, + "grad_norm": 1.7757224649611865, + "learning_rate": 5.554579337253326e-06, + "loss": 0.827, + "step": 6280 + }, + { + "epoch": 0.657352171637886, + "grad_norm": 2.306267658791178, + "learning_rate": 5.551543223827162e-06, + "loss": 0.8783, + "step": 6281 + }, + { + "epoch": 0.6574568288854004, + "grad_norm": 1.938353410885278, + "learning_rate": 5.5485076215377e-06, + "loss": 0.8631, + "step": 6282 + }, + { + "epoch": 0.6575614861329147, + "grad_norm": 3.588827654123888, + "learning_rate": 5.545472530733738e-06, + "loss": 0.922, + "step": 6283 + }, + { + "epoch": 0.6576661433804291, + "grad_norm": 2.3146863860542197, + "learning_rate": 5.542437951764006e-06, + "loss": 0.9648, + "step": 6284 + }, + { + "epoch": 0.6577708006279435, + "grad_norm": 1.8737153115128953, + "learning_rate": 5.539403884977195e-06, + "loss": 0.8414, + "step": 6285 + }, + { + "epoch": 0.6578754578754579, + "grad_norm": 1.92712911224643, + "learning_rate": 5.536370330721921e-06, + "loss": 0.9653, + "step": 6286 + }, + { + "epoch": 0.6579801151229723, + "grad_norm": 1.7867220203896153, + "learning_rate": 5.533337289346743e-06, + "loss": 0.9625, + "step": 6287 + }, + { + "epoch": 0.6580847723704867, + "grad_norm": 2.618706543055954, + "learning_rate": 5.530304761200162e-06, + "loss": 1.0739, + "step": 6288 + }, + { + "epoch": 0.6581894296180011, + "grad_norm": 1.9789921876753085, + "learning_rate": 5.527272746630625e-06, + "loss": 0.8933, + "step": 6289 + }, + { + "epoch": 0.6582940868655154, + "grad_norm": 1.9939197083721731, + "learning_rate": 5.5242412459865215e-06, + "loss": 1.0279, + "step": 6290 + }, + { + "epoch": 0.6583987441130298, + "grad_norm": 1.911672548685458, + "learning_rate": 5.521210259616171e-06, + "loss": 0.9188, + "step": 6291 + }, + { + "epoch": 0.6585034013605442, + "grad_norm": 2.081526787157685, + "learning_rate": 5.518179787867841e-06, + "loss": 0.8945, + "step": 6292 + }, + { + "epoch": 0.6586080586080586, + "grad_norm": 2.102166057104072, + "learning_rate": 5.515149831089739e-06, + "loss": 0.901, + "step": 6293 + }, + { + "epoch": 0.658712715855573, + "grad_norm": 1.7756557962954296, + "learning_rate": 5.51212038963001e-06, + "loss": 0.933, + "step": 6294 + }, + { + "epoch": 0.6588173731030874, + "grad_norm": 1.8389100423069065, + "learning_rate": 5.5090914638367495e-06, + "loss": 0.9373, + "step": 6295 + }, + { + "epoch": 0.6589220303506018, + "grad_norm": 2.112443963445209, + "learning_rate": 5.506063054057984e-06, + "loss": 0.9985, + "step": 6296 + }, + { + "epoch": 0.6590266875981162, + "grad_norm": 1.9306029603381285, + "learning_rate": 5.50303516064168e-06, + "loss": 0.7859, + "step": 6297 + }, + { + "epoch": 0.6591313448456305, + "grad_norm": 1.9275367700602777, + "learning_rate": 5.500007783935757e-06, + "loss": 0.9392, + "step": 6298 + }, + { + "epoch": 0.6592360020931449, + "grad_norm": 1.9953473847028906, + "learning_rate": 5.4969809242880555e-06, + "loss": 1.0136, + "step": 6299 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 2.0576053965813808, + "learning_rate": 5.493954582046381e-06, + "loss": 1.006, + "step": 6300 + }, + { + "epoch": 0.6594453165881737, + "grad_norm": 1.9656517014244708, + "learning_rate": 5.490928757558458e-06, + "loss": 0.8898, + "step": 6301 + }, + { + "epoch": 0.6595499738356881, + "grad_norm": 2.328622538340134, + "learning_rate": 5.487903451171963e-06, + "loss": 0.9392, + "step": 6302 + }, + { + "epoch": 0.6596546310832025, + "grad_norm": 2.024875566385732, + "learning_rate": 5.484878663234503e-06, + "loss": 0.9248, + "step": 6303 + }, + { + "epoch": 0.659759288330717, + "grad_norm": 2.151355182140815, + "learning_rate": 5.481854394093643e-06, + "loss": 0.879, + "step": 6304 + }, + { + "epoch": 0.6598639455782312, + "grad_norm": 2.190977467557319, + "learning_rate": 5.478830644096872e-06, + "loss": 1.0399, + "step": 6305 + }, + { + "epoch": 0.6599686028257457, + "grad_norm": 1.7250845041049725, + "learning_rate": 5.475807413591621e-06, + "loss": 0.8283, + "step": 6306 + }, + { + "epoch": 0.6600732600732601, + "grad_norm": 2.0039747052514776, + "learning_rate": 5.4727847029252735e-06, + "loss": 0.8717, + "step": 6307 + }, + { + "epoch": 0.6601779173207745, + "grad_norm": 2.2311307267829874, + "learning_rate": 5.469762512445142e-06, + "loss": 0.8769, + "step": 6308 + }, + { + "epoch": 0.6602825745682889, + "grad_norm": 1.992690818121849, + "learning_rate": 5.466740842498477e-06, + "loss": 0.8671, + "step": 6309 + }, + { + "epoch": 0.6603872318158033, + "grad_norm": 2.2879738880594176, + "learning_rate": 5.463719693432483e-06, + "loss": 0.9096, + "step": 6310 + }, + { + "epoch": 0.6604918890633177, + "grad_norm": 2.0374811856000465, + "learning_rate": 5.460699065594292e-06, + "loss": 0.9504, + "step": 6311 + }, + { + "epoch": 0.660596546310832, + "grad_norm": 1.8719692965200658, + "learning_rate": 5.4576789593309805e-06, + "loss": 0.9048, + "step": 6312 + }, + { + "epoch": 0.6607012035583464, + "grad_norm": 2.2231971812121794, + "learning_rate": 5.454659374989563e-06, + "loss": 0.9228, + "step": 6313 + }, + { + "epoch": 0.6608058608058608, + "grad_norm": 1.965484690948648, + "learning_rate": 5.451640312916995e-06, + "loss": 0.9135, + "step": 6314 + }, + { + "epoch": 0.6609105180533752, + "grad_norm": 2.1030519351417505, + "learning_rate": 5.448621773460181e-06, + "loss": 0.9864, + "step": 6315 + }, + { + "epoch": 0.6610151753008896, + "grad_norm": 1.9317968866735937, + "learning_rate": 5.445603756965952e-06, + "loss": 0.9095, + "step": 6316 + }, + { + "epoch": 0.661119832548404, + "grad_norm": 2.0771595808477943, + "learning_rate": 5.442586263781082e-06, + "loss": 0.979, + "step": 6317 + }, + { + "epoch": 0.6612244897959184, + "grad_norm": 2.108827558618817, + "learning_rate": 5.439569294252287e-06, + "loss": 0.9105, + "step": 6318 + }, + { + "epoch": 0.6613291470434327, + "grad_norm": 2.0983819392455314, + "learning_rate": 5.436552848726229e-06, + "loss": 0.9923, + "step": 6319 + }, + { + "epoch": 0.6614338042909471, + "grad_norm": 2.473425853943777, + "learning_rate": 5.433536927549499e-06, + "loss": 1.0007, + "step": 6320 + }, + { + "epoch": 0.6615384615384615, + "grad_norm": 1.9633903506211725, + "learning_rate": 5.430521531068634e-06, + "loss": 0.8951, + "step": 6321 + }, + { + "epoch": 0.6616431187859759, + "grad_norm": 2.100683110676726, + "learning_rate": 5.427506659630104e-06, + "loss": 0.9602, + "step": 6322 + }, + { + "epoch": 0.6617477760334903, + "grad_norm": 2.266215087701523, + "learning_rate": 5.4244923135803275e-06, + "loss": 0.9114, + "step": 6323 + }, + { + "epoch": 0.6618524332810047, + "grad_norm": 1.8670218973616595, + "learning_rate": 5.421478493265664e-06, + "loss": 0.9076, + "step": 6324 + }, + { + "epoch": 0.6619570905285191, + "grad_norm": 2.0326379729684545, + "learning_rate": 5.418465199032404e-06, + "loss": 0.849, + "step": 6325 + }, + { + "epoch": 0.6620617477760334, + "grad_norm": 1.9942841480440203, + "learning_rate": 5.4154524312267795e-06, + "loss": 0.886, + "step": 6326 + }, + { + "epoch": 0.6621664050235478, + "grad_norm": 2.0675772411489386, + "learning_rate": 5.412440190194965e-06, + "loss": 0.8767, + "step": 6327 + }, + { + "epoch": 0.6622710622710622, + "grad_norm": 2.216673868421644, + "learning_rate": 5.409428476283068e-06, + "loss": 0.8531, + "step": 6328 + }, + { + "epoch": 0.6623757195185767, + "grad_norm": 2.114144277955494, + "learning_rate": 5.4064172898371506e-06, + "loss": 0.8506, + "step": 6329 + }, + { + "epoch": 0.6624803767660911, + "grad_norm": 2.11736267037912, + "learning_rate": 5.403406631203197e-06, + "loss": 0.8308, + "step": 6330 + }, + { + "epoch": 0.6625850340136055, + "grad_norm": 1.8344929417267373, + "learning_rate": 5.400396500727141e-06, + "loss": 0.885, + "step": 6331 + }, + { + "epoch": 0.6626896912611199, + "grad_norm": 2.005302325582287, + "learning_rate": 5.397386898754847e-06, + "loss": 0.7028, + "step": 6332 + }, + { + "epoch": 0.6627943485086342, + "grad_norm": 2.007310766798443, + "learning_rate": 5.394377825632129e-06, + "loss": 1.0009, + "step": 6333 + }, + { + "epoch": 0.6628990057561486, + "grad_norm": 2.298430147972449, + "learning_rate": 5.3913692817047395e-06, + "loss": 0.7607, + "step": 6334 + }, + { + "epoch": 0.663003663003663, + "grad_norm": 2.0426983676977746, + "learning_rate": 5.388361267318362e-06, + "loss": 1.0074, + "step": 6335 + }, + { + "epoch": 0.6631083202511774, + "grad_norm": 2.36635698834763, + "learning_rate": 5.385353782818623e-06, + "loss": 0.8771, + "step": 6336 + }, + { + "epoch": 0.6632129774986918, + "grad_norm": 1.89057291548456, + "learning_rate": 5.382346828551086e-06, + "loss": 0.9702, + "step": 6337 + }, + { + "epoch": 0.6633176347462062, + "grad_norm": 2.1549128599478347, + "learning_rate": 5.379340404861263e-06, + "loss": 0.9578, + "step": 6338 + }, + { + "epoch": 0.6634222919937206, + "grad_norm": 2.264597570417053, + "learning_rate": 5.3763345120945944e-06, + "loss": 0.9258, + "step": 6339 + }, + { + "epoch": 0.663526949241235, + "grad_norm": 1.9983468153168773, + "learning_rate": 5.37332915059646e-06, + "loss": 0.8108, + "step": 6340 + }, + { + "epoch": 0.6636316064887493, + "grad_norm": 2.199629599477845, + "learning_rate": 5.370324320712189e-06, + "loss": 0.7242, + "step": 6341 + }, + { + "epoch": 0.6637362637362637, + "grad_norm": 2.037900821563259, + "learning_rate": 5.3673200227870345e-06, + "loss": 0.856, + "step": 6342 + }, + { + "epoch": 0.6638409209837781, + "grad_norm": 1.9505192747155962, + "learning_rate": 5.364316257166207e-06, + "loss": 0.8252, + "step": 6343 + }, + { + "epoch": 0.6639455782312925, + "grad_norm": 2.092352950973099, + "learning_rate": 5.361313024194837e-06, + "loss": 0.9537, + "step": 6344 + }, + { + "epoch": 0.6640502354788069, + "grad_norm": 2.284549338180081, + "learning_rate": 5.358310324218003e-06, + "loss": 0.7024, + "step": 6345 + }, + { + "epoch": 0.6641548927263213, + "grad_norm": 2.113760328686243, + "learning_rate": 5.355308157580725e-06, + "loss": 0.8261, + "step": 6346 + }, + { + "epoch": 0.6642595499738357, + "grad_norm": 2.256190025263873, + "learning_rate": 5.35230652462795e-06, + "loss": 0.9598, + "step": 6347 + }, + { + "epoch": 0.66436420722135, + "grad_norm": 1.9207259907187089, + "learning_rate": 5.349305425704583e-06, + "loss": 0.9572, + "step": 6348 + }, + { + "epoch": 0.6644688644688644, + "grad_norm": 2.3515307832470334, + "learning_rate": 5.346304861155445e-06, + "loss": 0.9013, + "step": 6349 + }, + { + "epoch": 0.6645735217163788, + "grad_norm": 1.988489105543637, + "learning_rate": 5.343304831325315e-06, + "loss": 0.8901, + "step": 6350 + }, + { + "epoch": 0.6646781789638933, + "grad_norm": 2.1734081104477965, + "learning_rate": 5.340305336558902e-06, + "loss": 1.0388, + "step": 6351 + }, + { + "epoch": 0.6647828362114077, + "grad_norm": 2.0143701834494263, + "learning_rate": 5.337306377200848e-06, + "loss": 0.8924, + "step": 6352 + }, + { + "epoch": 0.6648874934589221, + "grad_norm": 1.9233691798259414, + "learning_rate": 5.334307953595747e-06, + "loss": 0.9476, + "step": 6353 + }, + { + "epoch": 0.6649921507064365, + "grad_norm": 2.130187551657956, + "learning_rate": 5.331310066088121e-06, + "loss": 0.8532, + "step": 6354 + }, + { + "epoch": 0.6650968079539508, + "grad_norm": 1.9538394918636528, + "learning_rate": 5.328312715022432e-06, + "loss": 0.9532, + "step": 6355 + }, + { + "epoch": 0.6652014652014652, + "grad_norm": 2.3831371289530074, + "learning_rate": 5.32531590074308e-06, + "loss": 0.9532, + "step": 6356 + }, + { + "epoch": 0.6653061224489796, + "grad_norm": 2.4969974074294585, + "learning_rate": 5.32231962359441e-06, + "loss": 0.9987, + "step": 6357 + }, + { + "epoch": 0.665410779696494, + "grad_norm": 2.176103875459444, + "learning_rate": 5.319323883920695e-06, + "loss": 0.988, + "step": 6358 + }, + { + "epoch": 0.6655154369440084, + "grad_norm": 1.6925971627256475, + "learning_rate": 5.316328682066157e-06, + "loss": 0.9428, + "step": 6359 + }, + { + "epoch": 0.6656200941915228, + "grad_norm": 2.0471516717225784, + "learning_rate": 5.313334018374949e-06, + "loss": 0.921, + "step": 6360 + }, + { + "epoch": 0.6657247514390372, + "grad_norm": 1.9398078548541235, + "learning_rate": 5.310339893191161e-06, + "loss": 0.8818, + "step": 6361 + }, + { + "epoch": 0.6658294086865515, + "grad_norm": 1.9679163945822484, + "learning_rate": 5.307346306858827e-06, + "loss": 0.9378, + "step": 6362 + }, + { + "epoch": 0.6659340659340659, + "grad_norm": 1.9372496636716319, + "learning_rate": 5.304353259721917e-06, + "loss": 0.8058, + "step": 6363 + }, + { + "epoch": 0.6660387231815803, + "grad_norm": 1.943843808421226, + "learning_rate": 5.301360752124337e-06, + "loss": 0.8519, + "step": 6364 + }, + { + "epoch": 0.6661433804290947, + "grad_norm": 2.0411186947239552, + "learning_rate": 5.298368784409931e-06, + "loss": 0.8791, + "step": 6365 + }, + { + "epoch": 0.6662480376766091, + "grad_norm": 2.2814426931295793, + "learning_rate": 5.29537735692248e-06, + "loss": 1.0396, + "step": 6366 + }, + { + "epoch": 0.6663526949241235, + "grad_norm": 2.0786777164691888, + "learning_rate": 5.292386470005706e-06, + "loss": 0.844, + "step": 6367 + }, + { + "epoch": 0.6664573521716379, + "grad_norm": 2.04748550942362, + "learning_rate": 5.289396124003274e-06, + "loss": 1.0169, + "step": 6368 + }, + { + "epoch": 0.6665620094191522, + "grad_norm": 2.123957153054228, + "learning_rate": 5.286406319258779e-06, + "loss": 0.9936, + "step": 6369 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.0189091508075583, + "learning_rate": 5.2834170561157514e-06, + "loss": 0.8023, + "step": 6370 + }, + { + "epoch": 0.666771323914181, + "grad_norm": 1.9067663948673013, + "learning_rate": 5.280428334917662e-06, + "loss": 0.8841, + "step": 6371 + }, + { + "epoch": 0.6668759811616954, + "grad_norm": 2.0191663801552426, + "learning_rate": 5.277440156007929e-06, + "loss": 0.8464, + "step": 6372 + }, + { + "epoch": 0.6669806384092098, + "grad_norm": 1.9211457524771827, + "learning_rate": 5.274452519729895e-06, + "loss": 0.8674, + "step": 6373 + }, + { + "epoch": 0.6670852956567243, + "grad_norm": 1.9382829851230479, + "learning_rate": 5.271465426426847e-06, + "loss": 1.0491, + "step": 6374 + }, + { + "epoch": 0.6671899529042387, + "grad_norm": 1.8888165263402947, + "learning_rate": 5.268478876442003e-06, + "loss": 0.8538, + "step": 6375 + }, + { + "epoch": 0.667294610151753, + "grad_norm": 2.091423517591985, + "learning_rate": 5.2654928701185274e-06, + "loss": 0.8253, + "step": 6376 + }, + { + "epoch": 0.6673992673992674, + "grad_norm": 2.2768215094704822, + "learning_rate": 5.262507407799522e-06, + "loss": 0.9538, + "step": 6377 + }, + { + "epoch": 0.6675039246467818, + "grad_norm": 2.1494565343878533, + "learning_rate": 5.259522489828022e-06, + "loss": 0.9563, + "step": 6378 + }, + { + "epoch": 0.6676085818942962, + "grad_norm": 2.189850705311329, + "learning_rate": 5.2565381165469954e-06, + "loss": 0.8845, + "step": 6379 + }, + { + "epoch": 0.6677132391418106, + "grad_norm": 2.3476427995274882, + "learning_rate": 5.253554288299352e-06, + "loss": 0.893, + "step": 6380 + }, + { + "epoch": 0.667817896389325, + "grad_norm": 1.92825991613659, + "learning_rate": 5.250571005427947e-06, + "loss": 0.8203, + "step": 6381 + }, + { + "epoch": 0.6679225536368394, + "grad_norm": 2.031488675623292, + "learning_rate": 5.247588268275561e-06, + "loss": 0.8845, + "step": 6382 + }, + { + "epoch": 0.6680272108843538, + "grad_norm": 1.7586373295933992, + "learning_rate": 5.244606077184918e-06, + "loss": 0.7734, + "step": 6383 + }, + { + "epoch": 0.6681318681318681, + "grad_norm": 2.394679003738517, + "learning_rate": 5.241624432498673e-06, + "loss": 0.9904, + "step": 6384 + }, + { + "epoch": 0.6682365253793825, + "grad_norm": 2.2617940928572047, + "learning_rate": 5.23864333455943e-06, + "loss": 0.9123, + "step": 6385 + }, + { + "epoch": 0.6683411826268969, + "grad_norm": 6.141677769409521, + "learning_rate": 5.235662783709717e-06, + "loss": 0.8455, + "step": 6386 + }, + { + "epoch": 0.6684458398744113, + "grad_norm": 1.8536950099015304, + "learning_rate": 5.232682780292012e-06, + "loss": 0.872, + "step": 6387 + }, + { + "epoch": 0.6685504971219257, + "grad_norm": 1.9152603791182172, + "learning_rate": 5.22970332464872e-06, + "loss": 0.9247, + "step": 6388 + }, + { + "epoch": 0.6686551543694401, + "grad_norm": 2.1906359883969166, + "learning_rate": 5.2267244171221864e-06, + "loss": 0.9432, + "step": 6389 + }, + { + "epoch": 0.6687598116169545, + "grad_norm": 1.8408546196814852, + "learning_rate": 5.223746058054691e-06, + "loss": 0.9843, + "step": 6390 + }, + { + "epoch": 0.6688644688644688, + "grad_norm": 1.977899230941687, + "learning_rate": 5.220768247788458e-06, + "loss": 0.8972, + "step": 6391 + }, + { + "epoch": 0.6689691261119832, + "grad_norm": 2.0400496145208225, + "learning_rate": 5.217790986665639e-06, + "loss": 0.8671, + "step": 6392 + }, + { + "epoch": 0.6690737833594976, + "grad_norm": 2.7116537319504066, + "learning_rate": 5.214814275028334e-06, + "loss": 1.0178, + "step": 6393 + }, + { + "epoch": 0.669178440607012, + "grad_norm": 2.073375860762836, + "learning_rate": 5.211838113218568e-06, + "loss": 0.9932, + "step": 6394 + }, + { + "epoch": 0.6692830978545264, + "grad_norm": 2.096782968232844, + "learning_rate": 5.208862501578307e-06, + "loss": 0.7974, + "step": 6395 + }, + { + "epoch": 0.6693877551020408, + "grad_norm": 1.8056629099927985, + "learning_rate": 5.205887440449462e-06, + "loss": 0.8922, + "step": 6396 + }, + { + "epoch": 0.6694924123495553, + "grad_norm": 2.1562515150078734, + "learning_rate": 5.202912930173867e-06, + "loss": 0.9763, + "step": 6397 + }, + { + "epoch": 0.6695970695970695, + "grad_norm": 1.964969717746662, + "learning_rate": 5.1999389710933015e-06, + "loss": 0.7849, + "step": 6398 + }, + { + "epoch": 0.669701726844584, + "grad_norm": 2.1575433456324498, + "learning_rate": 5.196965563549475e-06, + "loss": 0.8537, + "step": 6399 + }, + { + "epoch": 0.6698063840920984, + "grad_norm": 2.2390395785938106, + "learning_rate": 5.193992707884045e-06, + "loss": 0.9165, + "step": 6400 + }, + { + "epoch": 0.6699110413396128, + "grad_norm": 2.0512664839575727, + "learning_rate": 5.1910204044385935e-06, + "loss": 0.8743, + "step": 6401 + }, + { + "epoch": 0.6700156985871272, + "grad_norm": 1.832927563252606, + "learning_rate": 5.188048653554649e-06, + "loss": 0.9634, + "step": 6402 + }, + { + "epoch": 0.6701203558346416, + "grad_norm": 2.163369405212209, + "learning_rate": 5.185077455573671e-06, + "loss": 0.8162, + "step": 6403 + }, + { + "epoch": 0.670225013082156, + "grad_norm": 2.096116535555188, + "learning_rate": 5.182106810837053e-06, + "loss": 0.8113, + "step": 6404 + }, + { + "epoch": 0.6703296703296703, + "grad_norm": 2.2187096652736225, + "learning_rate": 5.179136719686124e-06, + "loss": 0.7653, + "step": 6405 + }, + { + "epoch": 0.6704343275771847, + "grad_norm": 1.7426313844832375, + "learning_rate": 5.176167182462164e-06, + "loss": 0.7568, + "step": 6406 + }, + { + "epoch": 0.6705389848246991, + "grad_norm": 2.610383984694903, + "learning_rate": 5.173198199506375e-06, + "loss": 1.0115, + "step": 6407 + }, + { + "epoch": 0.6706436420722135, + "grad_norm": 1.7869109945081258, + "learning_rate": 5.170229771159896e-06, + "loss": 0.8499, + "step": 6408 + }, + { + "epoch": 0.6707482993197279, + "grad_norm": 1.8327398454588113, + "learning_rate": 5.167261897763804e-06, + "loss": 0.9006, + "step": 6409 + }, + { + "epoch": 0.6708529565672423, + "grad_norm": 1.7464546014687978, + "learning_rate": 5.164294579659117e-06, + "loss": 0.8632, + "step": 6410 + }, + { + "epoch": 0.6709576138147567, + "grad_norm": 2.0024420499608797, + "learning_rate": 5.161327817186789e-06, + "loss": 0.8386, + "step": 6411 + }, + { + "epoch": 0.671062271062271, + "grad_norm": 3.683976549889003, + "learning_rate": 5.158361610687704e-06, + "loss": 0.9202, + "step": 6412 + }, + { + "epoch": 0.6711669283097854, + "grad_norm": 2.418966089950013, + "learning_rate": 5.155395960502685e-06, + "loss": 0.8548, + "step": 6413 + }, + { + "epoch": 0.6712715855572998, + "grad_norm": 1.8796425565869193, + "learning_rate": 5.152430866972489e-06, + "loss": 0.8549, + "step": 6414 + }, + { + "epoch": 0.6713762428048142, + "grad_norm": 2.208142074213644, + "learning_rate": 5.1494663304378144e-06, + "loss": 0.9177, + "step": 6415 + }, + { + "epoch": 0.6714809000523286, + "grad_norm": 2.2446841553324903, + "learning_rate": 5.146502351239293e-06, + "loss": 0.8943, + "step": 6416 + }, + { + "epoch": 0.671585557299843, + "grad_norm": 1.8751432705355553, + "learning_rate": 5.143538929717491e-06, + "loss": 0.7813, + "step": 6417 + }, + { + "epoch": 0.6716902145473574, + "grad_norm": 1.9930880359584375, + "learning_rate": 5.140576066212906e-06, + "loss": 0.8551, + "step": 6418 + }, + { + "epoch": 0.6717948717948717, + "grad_norm": 2.4278638629645815, + "learning_rate": 5.137613761065983e-06, + "loss": 0.9474, + "step": 6419 + }, + { + "epoch": 0.6718995290423861, + "grad_norm": 2.14429691352793, + "learning_rate": 5.134652014617099e-06, + "loss": 0.9384, + "step": 6420 + }, + { + "epoch": 0.6720041862899006, + "grad_norm": 1.8951332364379743, + "learning_rate": 5.131690827206563e-06, + "loss": 0.9869, + "step": 6421 + }, + { + "epoch": 0.672108843537415, + "grad_norm": 1.9516044400568116, + "learning_rate": 5.128730199174619e-06, + "loss": 0.885, + "step": 6422 + }, + { + "epoch": 0.6722135007849294, + "grad_norm": 2.1976500792646414, + "learning_rate": 5.125770130861449e-06, + "loss": 1.0194, + "step": 6423 + }, + { + "epoch": 0.6723181580324438, + "grad_norm": 2.0018038250643317, + "learning_rate": 5.12281062260717e-06, + "loss": 0.9788, + "step": 6424 + }, + { + "epoch": 0.6724228152799582, + "grad_norm": 2.1270551713262744, + "learning_rate": 5.119851674751841e-06, + "loss": 0.9429, + "step": 6425 + }, + { + "epoch": 0.6725274725274726, + "grad_norm": 2.025799543591311, + "learning_rate": 5.116893287635448e-06, + "loss": 0.7762, + "step": 6426 + }, + { + "epoch": 0.6726321297749869, + "grad_norm": 1.8699078179658104, + "learning_rate": 5.1139354615979105e-06, + "loss": 0.8726, + "step": 6427 + }, + { + "epoch": 0.6727367870225013, + "grad_norm": 1.9041060855902985, + "learning_rate": 5.110978196979098e-06, + "loss": 0.84, + "step": 6428 + }, + { + "epoch": 0.6728414442700157, + "grad_norm": 2.1433565757666115, + "learning_rate": 5.1080214941187975e-06, + "loss": 0.954, + "step": 6429 + }, + { + "epoch": 0.6729461015175301, + "grad_norm": 2.046061335552731, + "learning_rate": 5.1050653533567496e-06, + "loss": 0.8755, + "step": 6430 + }, + { + "epoch": 0.6730507587650445, + "grad_norm": 1.8884753453862546, + "learning_rate": 5.102109775032615e-06, + "loss": 0.7611, + "step": 6431 + }, + { + "epoch": 0.6731554160125589, + "grad_norm": 2.033544305804191, + "learning_rate": 5.0991547594859955e-06, + "loss": 0.908, + "step": 6432 + }, + { + "epoch": 0.6732600732600733, + "grad_norm": 1.8338267562653663, + "learning_rate": 5.096200307056426e-06, + "loss": 1.005, + "step": 6433 + }, + { + "epoch": 0.6733647305075876, + "grad_norm": 2.108484710554183, + "learning_rate": 5.093246418083386e-06, + "loss": 0.9521, + "step": 6434 + }, + { + "epoch": 0.673469387755102, + "grad_norm": 2.085184894766022, + "learning_rate": 5.09029309290628e-06, + "loss": 0.8918, + "step": 6435 + }, + { + "epoch": 0.6735740450026164, + "grad_norm": 2.1541212873096636, + "learning_rate": 5.087340331864446e-06, + "loss": 0.7858, + "step": 6436 + }, + { + "epoch": 0.6736787022501308, + "grad_norm": 2.1748331560744893, + "learning_rate": 5.084388135297171e-06, + "loss": 0.9436, + "step": 6437 + }, + { + "epoch": 0.6737833594976452, + "grad_norm": 2.213867089819224, + "learning_rate": 5.08143650354366e-06, + "loss": 0.9051, + "step": 6438 + }, + { + "epoch": 0.6738880167451596, + "grad_norm": 2.276279627419914, + "learning_rate": 5.07848543694307e-06, + "loss": 0.8651, + "step": 6439 + }, + { + "epoch": 0.673992673992674, + "grad_norm": 1.7847556728333231, + "learning_rate": 5.075534935834481e-06, + "loss": 0.7922, + "step": 6440 + }, + { + "epoch": 0.6740973312401883, + "grad_norm": 1.9203154340553588, + "learning_rate": 5.07258500055691e-06, + "loss": 0.8485, + "step": 6441 + }, + { + "epoch": 0.6742019884877027, + "grad_norm": 2.033181016154668, + "learning_rate": 5.069635631449311e-06, + "loss": 0.9363, + "step": 6442 + }, + { + "epoch": 0.6743066457352171, + "grad_norm": 1.953116403946782, + "learning_rate": 5.066686828850569e-06, + "loss": 0.9573, + "step": 6443 + }, + { + "epoch": 0.6744113029827316, + "grad_norm": 2.146061199630762, + "learning_rate": 5.063738593099512e-06, + "loss": 0.883, + "step": 6444 + }, + { + "epoch": 0.674515960230246, + "grad_norm": 2.1400402168097457, + "learning_rate": 5.060790924534902e-06, + "loss": 0.8179, + "step": 6445 + }, + { + "epoch": 0.6746206174777604, + "grad_norm": 1.7980728816836642, + "learning_rate": 5.057843823495425e-06, + "loss": 0.8783, + "step": 6446 + }, + { + "epoch": 0.6747252747252748, + "grad_norm": 2.283729906739794, + "learning_rate": 5.054897290319713e-06, + "loss": 0.8026, + "step": 6447 + }, + { + "epoch": 0.6748299319727891, + "grad_norm": 2.037589904474863, + "learning_rate": 5.051951325346325e-06, + "loss": 0.9345, + "step": 6448 + }, + { + "epoch": 0.6749345892203035, + "grad_norm": 2.1144335870638185, + "learning_rate": 5.049005928913762e-06, + "loss": 0.9946, + "step": 6449 + }, + { + "epoch": 0.6750392464678179, + "grad_norm": 1.936401258521119, + "learning_rate": 5.046061101360456e-06, + "loss": 0.8188, + "step": 6450 + }, + { + "epoch": 0.6751439037153323, + "grad_norm": 2.2507533396021926, + "learning_rate": 5.04311684302477e-06, + "loss": 0.7269, + "step": 6451 + }, + { + "epoch": 0.6752485609628467, + "grad_norm": 1.963538133706413, + "learning_rate": 5.0401731542450046e-06, + "loss": 0.8515, + "step": 6452 + }, + { + "epoch": 0.6753532182103611, + "grad_norm": 1.9890165032492324, + "learning_rate": 5.037230035359398e-06, + "loss": 0.8833, + "step": 6453 + }, + { + "epoch": 0.6754578754578755, + "grad_norm": 1.9566124199728854, + "learning_rate": 5.034287486706126e-06, + "loss": 0.8651, + "step": 6454 + }, + { + "epoch": 0.6755625327053898, + "grad_norm": 1.9567451091759998, + "learning_rate": 5.031345508623287e-06, + "loss": 0.8554, + "step": 6455 + }, + { + "epoch": 0.6756671899529042, + "grad_norm": 2.0495146610167954, + "learning_rate": 5.028404101448923e-06, + "loss": 0.9391, + "step": 6456 + }, + { + "epoch": 0.6757718472004186, + "grad_norm": 1.9180912820360527, + "learning_rate": 5.025463265521001e-06, + "loss": 0.8958, + "step": 6457 + }, + { + "epoch": 0.675876504447933, + "grad_norm": 2.108925051546432, + "learning_rate": 5.0225230011774395e-06, + "loss": 0.9158, + "step": 6458 + }, + { + "epoch": 0.6759811616954474, + "grad_norm": 2.0210637600732655, + "learning_rate": 5.0195833087560745e-06, + "loss": 0.8969, + "step": 6459 + }, + { + "epoch": 0.6760858189429618, + "grad_norm": 1.978491474863179, + "learning_rate": 5.016644188594683e-06, + "loss": 0.9462, + "step": 6460 + }, + { + "epoch": 0.6761904761904762, + "grad_norm": 1.7918602559844456, + "learning_rate": 5.013705641030978e-06, + "loss": 0.916, + "step": 6461 + }, + { + "epoch": 0.6762951334379905, + "grad_norm": 2.1981945869138233, + "learning_rate": 5.010767666402599e-06, + "loss": 0.9247, + "step": 6462 + }, + { + "epoch": 0.6763997906855049, + "grad_norm": 2.1851426012383866, + "learning_rate": 5.007830265047129e-06, + "loss": 0.9502, + "step": 6463 + }, + { + "epoch": 0.6765044479330193, + "grad_norm": 2.2216190305636596, + "learning_rate": 5.004893437302085e-06, + "loss": 1.0375, + "step": 6464 + }, + { + "epoch": 0.6766091051805337, + "grad_norm": 1.815266465146527, + "learning_rate": 5.001957183504913e-06, + "loss": 0.9195, + "step": 6465 + }, + { + "epoch": 0.6767137624280481, + "grad_norm": 2.0023594949351593, + "learning_rate": 4.999021503992992e-06, + "loss": 0.7293, + "step": 6466 + }, + { + "epoch": 0.6768184196755626, + "grad_norm": 2.5183477674119086, + "learning_rate": 4.996086399103633e-06, + "loss": 0.9346, + "step": 6467 + }, + { + "epoch": 0.676923076923077, + "grad_norm": 2.98805111119296, + "learning_rate": 4.9931518691740954e-06, + "loss": 0.8498, + "step": 6468 + }, + { + "epoch": 0.6770277341705914, + "grad_norm": 1.9793193853571587, + "learning_rate": 4.990217914541559e-06, + "loss": 0.8217, + "step": 6469 + }, + { + "epoch": 0.6771323914181057, + "grad_norm": 2.2564143430387302, + "learning_rate": 4.987284535543139e-06, + "loss": 0.8779, + "step": 6470 + }, + { + "epoch": 0.6772370486656201, + "grad_norm": 1.671524269897445, + "learning_rate": 4.984351732515883e-06, + "loss": 0.8147, + "step": 6471 + }, + { + "epoch": 0.6773417059131345, + "grad_norm": 2.255727997330643, + "learning_rate": 4.981419505796782e-06, + "loss": 0.9483, + "step": 6472 + }, + { + "epoch": 0.6774463631606489, + "grad_norm": 2.0870637943921917, + "learning_rate": 4.978487855722757e-06, + "loss": 0.9186, + "step": 6473 + }, + { + "epoch": 0.6775510204081633, + "grad_norm": 2.061420193131777, + "learning_rate": 4.975556782630657e-06, + "loss": 0.8609, + "step": 6474 + }, + { + "epoch": 0.6776556776556777, + "grad_norm": 2.1025694958037966, + "learning_rate": 4.972626286857268e-06, + "loss": 0.9063, + "step": 6475 + }, + { + "epoch": 0.6777603349031921, + "grad_norm": 2.0926436744834533, + "learning_rate": 4.969696368739308e-06, + "loss": 0.9524, + "step": 6476 + }, + { + "epoch": 0.6778649921507064, + "grad_norm": 2.4314254518234386, + "learning_rate": 4.966767028613435e-06, + "loss": 0.965, + "step": 6477 + }, + { + "epoch": 0.6779696493982208, + "grad_norm": 2.473343295432155, + "learning_rate": 4.963838266816234e-06, + "loss": 0.7574, + "step": 6478 + }, + { + "epoch": 0.6780743066457352, + "grad_norm": 2.163717925184423, + "learning_rate": 4.960910083684222e-06, + "loss": 0.9157, + "step": 6479 + }, + { + "epoch": 0.6781789638932496, + "grad_norm": 1.9819090705808335, + "learning_rate": 4.95798247955386e-06, + "loss": 0.7951, + "step": 6480 + }, + { + "epoch": 0.678283621140764, + "grad_norm": 2.306299973255293, + "learning_rate": 4.955055454761532e-06, + "loss": 0.8959, + "step": 6481 + }, + { + "epoch": 0.6783882783882784, + "grad_norm": 1.8508763399022372, + "learning_rate": 4.952129009643557e-06, + "loss": 0.8809, + "step": 6482 + }, + { + "epoch": 0.6784929356357928, + "grad_norm": 2.1808393788781903, + "learning_rate": 4.949203144536195e-06, + "loss": 0.8524, + "step": 6483 + }, + { + "epoch": 0.6785975928833071, + "grad_norm": 2.6263905356474657, + "learning_rate": 4.946277859775631e-06, + "loss": 0.9204, + "step": 6484 + }, + { + "epoch": 0.6787022501308215, + "grad_norm": 1.9970126875729537, + "learning_rate": 4.943353155697985e-06, + "loss": 0.8717, + "step": 6485 + }, + { + "epoch": 0.6788069073783359, + "grad_norm": 2.0984787953717556, + "learning_rate": 4.94042903263931e-06, + "loss": 0.9325, + "step": 6486 + }, + { + "epoch": 0.6789115646258503, + "grad_norm": 2.00008151496788, + "learning_rate": 4.937505490935599e-06, + "loss": 0.9307, + "step": 6487 + }, + { + "epoch": 0.6790162218733647, + "grad_norm": 1.9504354659273084, + "learning_rate": 4.934582530922765e-06, + "loss": 0.889, + "step": 6488 + }, + { + "epoch": 0.6791208791208792, + "grad_norm": 1.9738078584785779, + "learning_rate": 4.931660152936673e-06, + "loss": 0.829, + "step": 6489 + }, + { + "epoch": 0.6792255363683936, + "grad_norm": 2.258801843221124, + "learning_rate": 4.928738357313102e-06, + "loss": 0.865, + "step": 6490 + }, + { + "epoch": 0.6793301936159079, + "grad_norm": 1.7536804465963125, + "learning_rate": 4.9258171443877715e-06, + "loss": 0.9223, + "step": 6491 + }, + { + "epoch": 0.6794348508634223, + "grad_norm": 1.8798393029654916, + "learning_rate": 4.922896514496341e-06, + "loss": 0.8812, + "step": 6492 + }, + { + "epoch": 0.6795395081109367, + "grad_norm": 1.9041463209466114, + "learning_rate": 4.919976467974393e-06, + "loss": 0.8996, + "step": 6493 + }, + { + "epoch": 0.6796441653584511, + "grad_norm": 2.2063895357599925, + "learning_rate": 4.917057005157447e-06, + "loss": 0.8618, + "step": 6494 + }, + { + "epoch": 0.6797488226059655, + "grad_norm": 2.068532864168024, + "learning_rate": 4.914138126380952e-06, + "loss": 0.9522, + "step": 6495 + }, + { + "epoch": 0.6798534798534799, + "grad_norm": 1.870324584594993, + "learning_rate": 4.911219831980299e-06, + "loss": 0.9341, + "step": 6496 + }, + { + "epoch": 0.6799581371009943, + "grad_norm": 2.014330793845203, + "learning_rate": 4.908302122290801e-06, + "loss": 0.8728, + "step": 6497 + }, + { + "epoch": 0.6800627943485086, + "grad_norm": 2.038100868952104, + "learning_rate": 4.905384997647715e-06, + "loss": 0.8701, + "step": 6498 + }, + { + "epoch": 0.680167451596023, + "grad_norm": 2.1455491337861843, + "learning_rate": 4.9024684583862206e-06, + "loss": 0.8911, + "step": 6499 + }, + { + "epoch": 0.6802721088435374, + "grad_norm": 1.903568909831072, + "learning_rate": 4.8995525048414325e-06, + "loss": 0.7694, + "step": 6500 + }, + { + "epoch": 0.6803767660910518, + "grad_norm": 1.6985342718111633, + "learning_rate": 4.896637137348399e-06, + "loss": 0.8418, + "step": 6501 + }, + { + "epoch": 0.6804814233385662, + "grad_norm": 2.2017283202638773, + "learning_rate": 4.893722356242108e-06, + "loss": 0.9125, + "step": 6502 + }, + { + "epoch": 0.6805860805860806, + "grad_norm": 2.2531458138902702, + "learning_rate": 4.8908081618574685e-06, + "loss": 0.8669, + "step": 6503 + }, + { + "epoch": 0.680690737833595, + "grad_norm": 1.853444123308966, + "learning_rate": 4.88789455452933e-06, + "loss": 0.8773, + "step": 6504 + }, + { + "epoch": 0.6807953950811093, + "grad_norm": 2.3305108211749928, + "learning_rate": 4.884981534592466e-06, + "loss": 0.9728, + "step": 6505 + }, + { + "epoch": 0.6809000523286237, + "grad_norm": 2.424972902046859, + "learning_rate": 4.882069102381593e-06, + "loss": 0.9775, + "step": 6506 + }, + { + "epoch": 0.6810047095761381, + "grad_norm": 2.5274554940365643, + "learning_rate": 4.879157258231358e-06, + "loss": 0.8468, + "step": 6507 + }, + { + "epoch": 0.6811093668236525, + "grad_norm": 2.0276231050948446, + "learning_rate": 4.876246002476337e-06, + "loss": 0.933, + "step": 6508 + }, + { + "epoch": 0.6812140240711669, + "grad_norm": 1.9937132552470647, + "learning_rate": 4.873335335451036e-06, + "loss": 1.0518, + "step": 6509 + }, + { + "epoch": 0.6813186813186813, + "grad_norm": 1.920197673037133, + "learning_rate": 4.870425257489895e-06, + "loss": 0.8334, + "step": 6510 + }, + { + "epoch": 0.6814233385661957, + "grad_norm": 1.7724221748359235, + "learning_rate": 4.8675157689272936e-06, + "loss": 0.8897, + "step": 6511 + }, + { + "epoch": 0.6815279958137102, + "grad_norm": 2.30963352647495, + "learning_rate": 4.864606870097535e-06, + "loss": 0.867, + "step": 6512 + }, + { + "epoch": 0.6816326530612244, + "grad_norm": 1.9385477533205635, + "learning_rate": 4.861698561334858e-06, + "loss": 0.842, + "step": 6513 + }, + { + "epoch": 0.6817373103087389, + "grad_norm": 2.362323418390193, + "learning_rate": 4.858790842973428e-06, + "loss": 1.0088, + "step": 6514 + }, + { + "epoch": 0.6818419675562533, + "grad_norm": 1.9262422529795675, + "learning_rate": 4.855883715347353e-06, + "loss": 0.8423, + "step": 6515 + }, + { + "epoch": 0.6819466248037677, + "grad_norm": 1.9990004709172928, + "learning_rate": 4.852977178790671e-06, + "loss": 0.9374, + "step": 6516 + }, + { + "epoch": 0.6820512820512821, + "grad_norm": 2.3761732334999084, + "learning_rate": 4.850071233637345e-06, + "loss": 0.9739, + "step": 6517 + }, + { + "epoch": 0.6821559392987965, + "grad_norm": 1.9130503375804242, + "learning_rate": 4.847165880221275e-06, + "loss": 0.9095, + "step": 6518 + }, + { + "epoch": 0.6822605965463109, + "grad_norm": 1.8654272704894526, + "learning_rate": 4.844261118876291e-06, + "loss": 0.8022, + "step": 6519 + }, + { + "epoch": 0.6823652537938252, + "grad_norm": 2.2423240527964166, + "learning_rate": 4.841356949936152e-06, + "loss": 1.0298, + "step": 6520 + }, + { + "epoch": 0.6824699110413396, + "grad_norm": 2.0407588030898745, + "learning_rate": 4.838453373734562e-06, + "loss": 0.947, + "step": 6521 + }, + { + "epoch": 0.682574568288854, + "grad_norm": 2.273219656507149, + "learning_rate": 4.835550390605141e-06, + "loss": 0.8439, + "step": 6522 + }, + { + "epoch": 0.6826792255363684, + "grad_norm": 2.3321959685240374, + "learning_rate": 4.832648000881448e-06, + "loss": 0.9268, + "step": 6523 + }, + { + "epoch": 0.6827838827838828, + "grad_norm": 1.922263615943614, + "learning_rate": 4.829746204896978e-06, + "loss": 0.953, + "step": 6524 + }, + { + "epoch": 0.6828885400313972, + "grad_norm": 1.9490059628377225, + "learning_rate": 4.8268450029851456e-06, + "loss": 0.8965, + "step": 6525 + }, + { + "epoch": 0.6829931972789116, + "grad_norm": 2.0239108880099144, + "learning_rate": 4.823944395479314e-06, + "loss": 0.9316, + "step": 6526 + }, + { + "epoch": 0.6830978545264259, + "grad_norm": 2.019795192924344, + "learning_rate": 4.821044382712764e-06, + "loss": 0.9969, + "step": 6527 + }, + { + "epoch": 0.6832025117739403, + "grad_norm": 2.066141054067395, + "learning_rate": 4.818144965018714e-06, + "loss": 0.911, + "step": 6528 + }, + { + "epoch": 0.6833071690214547, + "grad_norm": 2.085608618994838, + "learning_rate": 4.8152461427303075e-06, + "loss": 0.8309, + "step": 6529 + }, + { + "epoch": 0.6834118262689691, + "grad_norm": 2.546699608829966, + "learning_rate": 4.812347916180634e-06, + "loss": 0.9144, + "step": 6530 + }, + { + "epoch": 0.6835164835164835, + "grad_norm": 2.214180533685667, + "learning_rate": 4.809450285702697e-06, + "loss": 0.8954, + "step": 6531 + }, + { + "epoch": 0.6836211407639979, + "grad_norm": 2.308704951945097, + "learning_rate": 4.806553251629449e-06, + "loss": 0.7669, + "step": 6532 + }, + { + "epoch": 0.6837257980115123, + "grad_norm": 1.9800008001673324, + "learning_rate": 4.803656814293761e-06, + "loss": 0.8632, + "step": 6533 + }, + { + "epoch": 0.6838304552590266, + "grad_norm": 1.6800482696957828, + "learning_rate": 4.800760974028435e-06, + "loss": 0.816, + "step": 6534 + }, + { + "epoch": 0.683935112506541, + "grad_norm": 2.1903832073879825, + "learning_rate": 4.7978657311662155e-06, + "loss": 0.9341, + "step": 6535 + }, + { + "epoch": 0.6840397697540554, + "grad_norm": 2.1866955766796607, + "learning_rate": 4.794971086039771e-06, + "loss": 0.9187, + "step": 6536 + }, + { + "epoch": 0.6841444270015699, + "grad_norm": 2.171235833925488, + "learning_rate": 4.7920770389816995e-06, + "loss": 0.9389, + "step": 6537 + }, + { + "epoch": 0.6842490842490843, + "grad_norm": 2.133592554033059, + "learning_rate": 4.7891835903245345e-06, + "loss": 0.8337, + "step": 6538 + }, + { + "epoch": 0.6843537414965987, + "grad_norm": 2.164792966032931, + "learning_rate": 4.786290740400734e-06, + "loss": 0.8745, + "step": 6539 + }, + { + "epoch": 0.6844583987441131, + "grad_norm": 2.566602076396997, + "learning_rate": 4.783398489542696e-06, + "loss": 1.0129, + "step": 6540 + }, + { + "epoch": 0.6845630559916274, + "grad_norm": 2.1672816313961385, + "learning_rate": 4.780506838082752e-06, + "loss": 0.91, + "step": 6541 + }, + { + "epoch": 0.6846677132391418, + "grad_norm": 2.5299316051254173, + "learning_rate": 4.7776157863531535e-06, + "loss": 0.8804, + "step": 6542 + }, + { + "epoch": 0.6847723704866562, + "grad_norm": 2.050095970695568, + "learning_rate": 4.7747253346860865e-06, + "loss": 0.9931, + "step": 6543 + }, + { + "epoch": 0.6848770277341706, + "grad_norm": 1.950038717236096, + "learning_rate": 4.771835483413668e-06, + "loss": 0.8983, + "step": 6544 + }, + { + "epoch": 0.684981684981685, + "grad_norm": 1.7294746393168354, + "learning_rate": 4.768946232867956e-06, + "loss": 0.7711, + "step": 6545 + }, + { + "epoch": 0.6850863422291994, + "grad_norm": 2.1177999258178346, + "learning_rate": 4.766057583380925e-06, + "loss": 0.9021, + "step": 6546 + }, + { + "epoch": 0.6851909994767138, + "grad_norm": 2.1071905114151597, + "learning_rate": 4.763169535284488e-06, + "loss": 0.923, + "step": 6547 + }, + { + "epoch": 0.6852956567242282, + "grad_norm": 2.186034990894795, + "learning_rate": 4.760282088910485e-06, + "loss": 0.9689, + "step": 6548 + }, + { + "epoch": 0.6854003139717425, + "grad_norm": 2.0749256838860552, + "learning_rate": 4.757395244590692e-06, + "loss": 0.9672, + "step": 6549 + }, + { + "epoch": 0.6855049712192569, + "grad_norm": 1.9694250708561203, + "learning_rate": 4.754509002656815e-06, + "loss": 0.936, + "step": 6550 + }, + { + "epoch": 0.6856096284667713, + "grad_norm": 1.698078542210978, + "learning_rate": 4.751623363440488e-06, + "loss": 0.8718, + "step": 6551 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 2.0573301693561583, + "learning_rate": 4.748738327273277e-06, + "loss": 0.93, + "step": 6552 + }, + { + "epoch": 0.6858189429618001, + "grad_norm": 2.2826375431191774, + "learning_rate": 4.745853894486674e-06, + "loss": 1.0443, + "step": 6553 + }, + { + "epoch": 0.6859236002093145, + "grad_norm": 1.8209812613407657, + "learning_rate": 4.742970065412112e-06, + "loss": 0.8636, + "step": 6554 + }, + { + "epoch": 0.6860282574568289, + "grad_norm": 2.050736728583209, + "learning_rate": 4.740086840380948e-06, + "loss": 0.9672, + "step": 6555 + }, + { + "epoch": 0.6861329147043432, + "grad_norm": 1.9397854484778958, + "learning_rate": 4.7372042197244684e-06, + "loss": 0.9258, + "step": 6556 + }, + { + "epoch": 0.6862375719518576, + "grad_norm": 2.086617574002967, + "learning_rate": 4.734322203773889e-06, + "loss": 0.9393, + "step": 6557 + }, + { + "epoch": 0.686342229199372, + "grad_norm": 2.012325233074569, + "learning_rate": 4.731440792860368e-06, + "loss": 0.8641, + "step": 6558 + }, + { + "epoch": 0.6864468864468865, + "grad_norm": 2.1204083885533227, + "learning_rate": 4.728559987314975e-06, + "loss": 0.9989, + "step": 6559 + }, + { + "epoch": 0.6865515436944009, + "grad_norm": 2.148258411085086, + "learning_rate": 4.72567978746873e-06, + "loss": 0.8573, + "step": 6560 + }, + { + "epoch": 0.6866562009419153, + "grad_norm": 2.079331381933887, + "learning_rate": 4.72280019365257e-06, + "loss": 0.8727, + "step": 6561 + }, + { + "epoch": 0.6867608581894297, + "grad_norm": 1.8462912023639682, + "learning_rate": 4.719921206197365e-06, + "loss": 0.8153, + "step": 6562 + }, + { + "epoch": 0.686865515436944, + "grad_norm": 2.1589001836310433, + "learning_rate": 4.717042825433914e-06, + "loss": 0.9984, + "step": 6563 + }, + { + "epoch": 0.6869701726844584, + "grad_norm": 2.0590111114309293, + "learning_rate": 4.714165051692956e-06, + "loss": 0.9904, + "step": 6564 + }, + { + "epoch": 0.6870748299319728, + "grad_norm": 2.0472445373680217, + "learning_rate": 4.711287885305149e-06, + "loss": 0.8908, + "step": 6565 + }, + { + "epoch": 0.6871794871794872, + "grad_norm": 2.1696081831633967, + "learning_rate": 4.708411326601081e-06, + "loss": 0.9935, + "step": 6566 + }, + { + "epoch": 0.6872841444270016, + "grad_norm": 2.7641764279513628, + "learning_rate": 4.705535375911283e-06, + "loss": 0.9877, + "step": 6567 + }, + { + "epoch": 0.687388801674516, + "grad_norm": 2.131915916399177, + "learning_rate": 4.7026600335662e-06, + "loss": 0.9278, + "step": 6568 + }, + { + "epoch": 0.6874934589220304, + "grad_norm": 1.8259729970596064, + "learning_rate": 4.69978529989622e-06, + "loss": 0.7988, + "step": 6569 + }, + { + "epoch": 0.6875981161695447, + "grad_norm": 2.1246313055880135, + "learning_rate": 4.696911175231655e-06, + "loss": 0.9864, + "step": 6570 + }, + { + "epoch": 0.6877027734170591, + "grad_norm": 2.028298179481332, + "learning_rate": 4.694037659902747e-06, + "loss": 0.9062, + "step": 6571 + }, + { + "epoch": 0.6878074306645735, + "grad_norm": 1.6089562184719646, + "learning_rate": 4.691164754239663e-06, + "loss": 0.7789, + "step": 6572 + }, + { + "epoch": 0.6879120879120879, + "grad_norm": 1.8248486409728757, + "learning_rate": 4.6882924585725155e-06, + "loss": 0.8198, + "step": 6573 + }, + { + "epoch": 0.6880167451596023, + "grad_norm": 2.095291077164802, + "learning_rate": 4.685420773231333e-06, + "loss": 0.9641, + "step": 6574 + }, + { + "epoch": 0.6881214024071167, + "grad_norm": 2.2450995973799013, + "learning_rate": 4.682549698546073e-06, + "loss": 0.9317, + "step": 6575 + }, + { + "epoch": 0.6882260596546311, + "grad_norm": 2.0526149027810283, + "learning_rate": 4.679679234846636e-06, + "loss": 0.9398, + "step": 6576 + }, + { + "epoch": 0.6883307169021454, + "grad_norm": 1.7534041188486345, + "learning_rate": 4.67680938246284e-06, + "loss": 0.8108, + "step": 6577 + }, + { + "epoch": 0.6884353741496598, + "grad_norm": 2.0637109461898553, + "learning_rate": 4.6739401417244335e-06, + "loss": 0.9386, + "step": 6578 + }, + { + "epoch": 0.6885400313971742, + "grad_norm": 1.954071200949578, + "learning_rate": 4.671071512961106e-06, + "loss": 0.8725, + "step": 6579 + }, + { + "epoch": 0.6886446886446886, + "grad_norm": 2.128303103825469, + "learning_rate": 4.668203496502464e-06, + "loss": 0.9578, + "step": 6580 + }, + { + "epoch": 0.688749345892203, + "grad_norm": 1.8201644126560876, + "learning_rate": 4.665336092678049e-06, + "loss": 0.903, + "step": 6581 + }, + { + "epoch": 0.6888540031397175, + "grad_norm": 2.1543435935120763, + "learning_rate": 4.6624693018173285e-06, + "loss": 0.9748, + "step": 6582 + }, + { + "epoch": 0.6889586603872319, + "grad_norm": 2.1608061059358765, + "learning_rate": 4.659603124249704e-06, + "loss": 0.9338, + "step": 6583 + }, + { + "epoch": 0.6890633176347462, + "grad_norm": 2.260571417991753, + "learning_rate": 4.6567375603045114e-06, + "loss": 0.8783, + "step": 6584 + }, + { + "epoch": 0.6891679748822606, + "grad_norm": 2.0583659884788923, + "learning_rate": 4.653872610311005e-06, + "loss": 0.922, + "step": 6585 + }, + { + "epoch": 0.689272632129775, + "grad_norm": 1.7393344357365446, + "learning_rate": 4.651008274598373e-06, + "loss": 0.8636, + "step": 6586 + }, + { + "epoch": 0.6893772893772894, + "grad_norm": 2.1302121690545084, + "learning_rate": 4.648144553495732e-06, + "loss": 0.895, + "step": 6587 + }, + { + "epoch": 0.6894819466248038, + "grad_norm": 1.870330710588847, + "learning_rate": 4.645281447332133e-06, + "loss": 0.9744, + "step": 6588 + }, + { + "epoch": 0.6895866038723182, + "grad_norm": 2.1942089129705358, + "learning_rate": 4.642418956436551e-06, + "loss": 0.8749, + "step": 6589 + }, + { + "epoch": 0.6896912611198326, + "grad_norm": 1.7509480014685161, + "learning_rate": 4.639557081137891e-06, + "loss": 0.8509, + "step": 6590 + }, + { + "epoch": 0.689795918367347, + "grad_norm": 2.0185777788794015, + "learning_rate": 4.636695821764987e-06, + "loss": 0.9448, + "step": 6591 + }, + { + "epoch": 0.6899005756148613, + "grad_norm": 2.0964031784253674, + "learning_rate": 4.633835178646605e-06, + "loss": 0.9755, + "step": 6592 + }, + { + "epoch": 0.6900052328623757, + "grad_norm": 2.1491220196897958, + "learning_rate": 4.630975152111443e-06, + "loss": 0.9519, + "step": 6593 + }, + { + "epoch": 0.6901098901098901, + "grad_norm": 1.9800787335964507, + "learning_rate": 4.628115742488119e-06, + "loss": 0.8376, + "step": 6594 + }, + { + "epoch": 0.6902145473574045, + "grad_norm": 2.2297898584366918, + "learning_rate": 4.625256950105188e-06, + "loss": 0.9368, + "step": 6595 + }, + { + "epoch": 0.6903192046049189, + "grad_norm": 1.9376950455128639, + "learning_rate": 4.622398775291129e-06, + "loss": 0.7967, + "step": 6596 + }, + { + "epoch": 0.6904238618524333, + "grad_norm": 2.1640506849550776, + "learning_rate": 4.6195412183743485e-06, + "loss": 0.965, + "step": 6597 + }, + { + "epoch": 0.6905285190999477, + "grad_norm": 2.1629346113510945, + "learning_rate": 4.616684279683193e-06, + "loss": 0.7332, + "step": 6598 + }, + { + "epoch": 0.690633176347462, + "grad_norm": 2.4687954434374264, + "learning_rate": 4.6138279595459255e-06, + "loss": 0.9789, + "step": 6599 + }, + { + "epoch": 0.6907378335949764, + "grad_norm": 1.9293003022663573, + "learning_rate": 4.610972258290745e-06, + "loss": 0.8768, + "step": 6600 + }, + { + "epoch": 0.6908424908424908, + "grad_norm": 2.000231072203521, + "learning_rate": 4.608117176245773e-06, + "loss": 0.9758, + "step": 6601 + }, + { + "epoch": 0.6909471480900052, + "grad_norm": 2.223037956191113, + "learning_rate": 4.60526271373907e-06, + "loss": 0.9322, + "step": 6602 + }, + { + "epoch": 0.6910518053375196, + "grad_norm": 1.9628517839572384, + "learning_rate": 4.602408871098618e-06, + "loss": 0.8642, + "step": 6603 + }, + { + "epoch": 0.691156462585034, + "grad_norm": 1.985045530723375, + "learning_rate": 4.599555648652331e-06, + "loss": 0.9152, + "step": 6604 + }, + { + "epoch": 0.6912611198325485, + "grad_norm": 1.9758634460748625, + "learning_rate": 4.5967030467280475e-06, + "loss": 0.8055, + "step": 6605 + }, + { + "epoch": 0.6913657770800627, + "grad_norm": 3.103005744229717, + "learning_rate": 4.5938510656535325e-06, + "loss": 0.8094, + "step": 6606 + }, + { + "epoch": 0.6914704343275772, + "grad_norm": 2.1448807382230024, + "learning_rate": 4.590999705756495e-06, + "loss": 0.9655, + "step": 6607 + }, + { + "epoch": 0.6915750915750916, + "grad_norm": 1.8041089841225364, + "learning_rate": 4.588148967364555e-06, + "loss": 0.8029, + "step": 6608 + }, + { + "epoch": 0.691679748822606, + "grad_norm": 2.0056877275259106, + "learning_rate": 4.585298850805266e-06, + "loss": 0.8963, + "step": 6609 + }, + { + "epoch": 0.6917844060701204, + "grad_norm": 1.993991355774141, + "learning_rate": 4.58244935640612e-06, + "loss": 0.8601, + "step": 6610 + }, + { + "epoch": 0.6918890633176348, + "grad_norm": 1.8982333609414117, + "learning_rate": 4.579600484494522e-06, + "loss": 0.9538, + "step": 6611 + }, + { + "epoch": 0.6919937205651492, + "grad_norm": 2.2109365168453623, + "learning_rate": 4.57675223539782e-06, + "loss": 0.95, + "step": 6612 + }, + { + "epoch": 0.6920983778126635, + "grad_norm": 2.077498771030345, + "learning_rate": 4.573904609443281e-06, + "loss": 0.9701, + "step": 6613 + }, + { + "epoch": 0.6922030350601779, + "grad_norm": 2.706301406949021, + "learning_rate": 4.571057606958102e-06, + "loss": 0.9979, + "step": 6614 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 1.9123792412988372, + "learning_rate": 4.568211228269408e-06, + "loss": 0.8161, + "step": 6615 + }, + { + "epoch": 0.6924123495552067, + "grad_norm": 2.0744022076341393, + "learning_rate": 4.565365473704253e-06, + "loss": 0.8749, + "step": 6616 + }, + { + "epoch": 0.6925170068027211, + "grad_norm": 1.9700768269508973, + "learning_rate": 4.5625203435896244e-06, + "loss": 0.7862, + "step": 6617 + }, + { + "epoch": 0.6926216640502355, + "grad_norm": 2.1381491046390257, + "learning_rate": 4.55967583825243e-06, + "loss": 0.8554, + "step": 6618 + }, + { + "epoch": 0.6927263212977499, + "grad_norm": 1.912057699434843, + "learning_rate": 4.556831958019512e-06, + "loss": 0.884, + "step": 6619 + }, + { + "epoch": 0.6928309785452642, + "grad_norm": 1.8093614568345422, + "learning_rate": 4.553988703217638e-06, + "loss": 0.9215, + "step": 6620 + }, + { + "epoch": 0.6929356357927786, + "grad_norm": 1.9690519624929577, + "learning_rate": 4.551146074173497e-06, + "loss": 0.8911, + "step": 6621 + }, + { + "epoch": 0.693040293040293, + "grad_norm": 2.0811151718511525, + "learning_rate": 4.548304071213723e-06, + "loss": 1.0378, + "step": 6622 + }, + { + "epoch": 0.6931449502878074, + "grad_norm": 1.9485593151139378, + "learning_rate": 4.545462694664863e-06, + "loss": 0.8596, + "step": 6623 + }, + { + "epoch": 0.6932496075353218, + "grad_norm": 2.1609504606964784, + "learning_rate": 4.542621944853396e-06, + "loss": 0.8655, + "step": 6624 + }, + { + "epoch": 0.6933542647828362, + "grad_norm": 1.897884223321221, + "learning_rate": 4.539781822105728e-06, + "loss": 0.8386, + "step": 6625 + }, + { + "epoch": 0.6934589220303506, + "grad_norm": 1.72032226811971, + "learning_rate": 4.5369423267482004e-06, + "loss": 0.8291, + "step": 6626 + }, + { + "epoch": 0.6935635792778649, + "grad_norm": 2.2020630417755997, + "learning_rate": 4.534103459107071e-06, + "loss": 0.8895, + "step": 6627 + }, + { + "epoch": 0.6936682365253793, + "grad_norm": 2.0309417972154664, + "learning_rate": 4.5312652195085385e-06, + "loss": 0.9303, + "step": 6628 + }, + { + "epoch": 0.6937728937728938, + "grad_norm": 2.0639435237790953, + "learning_rate": 4.528427608278718e-06, + "loss": 0.9642, + "step": 6629 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 2.1146137247817527, + "learning_rate": 4.525590625743654e-06, + "loss": 0.8332, + "step": 6630 + }, + { + "epoch": 0.6939822082679226, + "grad_norm": 2.218674423062428, + "learning_rate": 4.522754272229329e-06, + "loss": 0.99, + "step": 6631 + }, + { + "epoch": 0.694086865515437, + "grad_norm": 2.1718805339372045, + "learning_rate": 4.519918548061642e-06, + "loss": 0.974, + "step": 6632 + }, + { + "epoch": 0.6941915227629514, + "grad_norm": 1.925204404610261, + "learning_rate": 4.517083453566422e-06, + "loss": 0.8138, + "step": 6633 + }, + { + "epoch": 0.6942961800104658, + "grad_norm": 2.5317634962266538, + "learning_rate": 4.514248989069428e-06, + "loss": 1.0308, + "step": 6634 + }, + { + "epoch": 0.6944008372579801, + "grad_norm": 1.8774753895683007, + "learning_rate": 4.511415154896343e-06, + "loss": 0.8771, + "step": 6635 + }, + { + "epoch": 0.6945054945054945, + "grad_norm": 2.006100990453835, + "learning_rate": 4.508581951372783e-06, + "loss": 0.9325, + "step": 6636 + }, + { + "epoch": 0.6946101517530089, + "grad_norm": 1.9301716377433031, + "learning_rate": 4.505749378824294e-06, + "loss": 0.89, + "step": 6637 + }, + { + "epoch": 0.6947148090005233, + "grad_norm": 1.851932126585198, + "learning_rate": 4.502917437576338e-06, + "loss": 0.9756, + "step": 6638 + }, + { + "epoch": 0.6948194662480377, + "grad_norm": 1.8626235138561205, + "learning_rate": 4.500086127954313e-06, + "loss": 0.9258, + "step": 6639 + }, + { + "epoch": 0.6949241234955521, + "grad_norm": 1.8143489151160312, + "learning_rate": 4.497255450283537e-06, + "loss": 0.8582, + "step": 6640 + }, + { + "epoch": 0.6950287807430665, + "grad_norm": 1.682072962368349, + "learning_rate": 4.4944254048892685e-06, + "loss": 0.8266, + "step": 6641 + }, + { + "epoch": 0.6951334379905808, + "grad_norm": 2.2156198148472828, + "learning_rate": 4.491595992096682e-06, + "loss": 0.9815, + "step": 6642 + }, + { + "epoch": 0.6952380952380952, + "grad_norm": 1.9277959774869025, + "learning_rate": 4.488767212230884e-06, + "loss": 0.9182, + "step": 6643 + }, + { + "epoch": 0.6953427524856096, + "grad_norm": 2.2062231983360925, + "learning_rate": 4.4859390656169e-06, + "loss": 0.9683, + "step": 6644 + }, + { + "epoch": 0.695447409733124, + "grad_norm": 1.9811122926788303, + "learning_rate": 4.4831115525796965e-06, + "loss": 0.8219, + "step": 6645 + }, + { + "epoch": 0.6955520669806384, + "grad_norm": 2.0834949431347014, + "learning_rate": 4.480284673444163e-06, + "loss": 1.0044, + "step": 6646 + }, + { + "epoch": 0.6956567242281528, + "grad_norm": 1.9488184187877493, + "learning_rate": 4.477458428535111e-06, + "loss": 0.9673, + "step": 6647 + }, + { + "epoch": 0.6957613814756672, + "grad_norm": 2.297182952318726, + "learning_rate": 4.47463281817728e-06, + "loss": 0.9403, + "step": 6648 + }, + { + "epoch": 0.6958660387231815, + "grad_norm": 1.7266007102300813, + "learning_rate": 4.471807842695339e-06, + "loss": 0.7784, + "step": 6649 + }, + { + "epoch": 0.6959706959706959, + "grad_norm": 2.228867198331409, + "learning_rate": 4.468983502413882e-06, + "loss": 0.9004, + "step": 6650 + }, + { + "epoch": 0.6960753532182103, + "grad_norm": 2.141327124826885, + "learning_rate": 4.466159797657434e-06, + "loss": 0.9162, + "step": 6651 + }, + { + "epoch": 0.6961800104657248, + "grad_norm": 2.0821483089510937, + "learning_rate": 4.463336728750446e-06, + "loss": 0.8988, + "step": 6652 + }, + { + "epoch": 0.6962846677132392, + "grad_norm": 2.0393438945172804, + "learning_rate": 4.460514296017286e-06, + "loss": 0.9121, + "step": 6653 + }, + { + "epoch": 0.6963893249607536, + "grad_norm": 2.407022437793897, + "learning_rate": 4.457692499782269e-06, + "loss": 0.9244, + "step": 6654 + }, + { + "epoch": 0.696493982208268, + "grad_norm": 2.222925129117027, + "learning_rate": 4.454871340369614e-06, + "loss": 0.9437, + "step": 6655 + }, + { + "epoch": 0.6965986394557823, + "grad_norm": 2.113703320766968, + "learning_rate": 4.452050818103487e-06, + "loss": 0.8637, + "step": 6656 + }, + { + "epoch": 0.6967032967032967, + "grad_norm": 1.79736876167149, + "learning_rate": 4.4492309333079685e-06, + "loss": 0.7871, + "step": 6657 + }, + { + "epoch": 0.6968079539508111, + "grad_norm": 2.035698628522382, + "learning_rate": 4.446411686307067e-06, + "loss": 1.034, + "step": 6658 + }, + { + "epoch": 0.6969126111983255, + "grad_norm": 1.9079323228404186, + "learning_rate": 4.443593077424718e-06, + "loss": 0.9502, + "step": 6659 + }, + { + "epoch": 0.6970172684458399, + "grad_norm": 2.018868747120195, + "learning_rate": 4.440775106984793e-06, + "loss": 0.8453, + "step": 6660 + }, + { + "epoch": 0.6971219256933543, + "grad_norm": 1.9769821063615076, + "learning_rate": 4.437957775311073e-06, + "loss": 1.0526, + "step": 6661 + }, + { + "epoch": 0.6972265829408687, + "grad_norm": 1.8727187713600468, + "learning_rate": 4.435141082727285e-06, + "loss": 0.8593, + "step": 6662 + }, + { + "epoch": 0.697331240188383, + "grad_norm": 2.087270497186403, + "learning_rate": 4.432325029557067e-06, + "loss": 0.8752, + "step": 6663 + }, + { + "epoch": 0.6974358974358974, + "grad_norm": 2.1401689080867365, + "learning_rate": 4.429509616123987e-06, + "loss": 0.9799, + "step": 6664 + }, + { + "epoch": 0.6975405546834118, + "grad_norm": 1.7455887505421854, + "learning_rate": 4.426694842751548e-06, + "loss": 0.9056, + "step": 6665 + }, + { + "epoch": 0.6976452119309262, + "grad_norm": 2.0761648451950054, + "learning_rate": 4.4238807097631685e-06, + "loss": 0.6883, + "step": 6666 + }, + { + "epoch": 0.6977498691784406, + "grad_norm": 2.4630286085688673, + "learning_rate": 4.421067217482201e-06, + "loss": 0.8402, + "step": 6667 + }, + { + "epoch": 0.697854526425955, + "grad_norm": 2.2700430686821496, + "learning_rate": 4.418254366231919e-06, + "loss": 0.8151, + "step": 6668 + }, + { + "epoch": 0.6979591836734694, + "grad_norm": 2.106078902374301, + "learning_rate": 4.415442156335522e-06, + "loss": 0.974, + "step": 6669 + }, + { + "epoch": 0.6980638409209837, + "grad_norm": 2.1498422358932547, + "learning_rate": 4.412630588116144e-06, + "loss": 0.9164, + "step": 6670 + }, + { + "epoch": 0.6981684981684981, + "grad_norm": 2.1170538037613413, + "learning_rate": 4.409819661896839e-06, + "loss": 0.9384, + "step": 6671 + }, + { + "epoch": 0.6982731554160125, + "grad_norm": 2.0390019015189864, + "learning_rate": 4.40700937800059e-06, + "loss": 0.9529, + "step": 6672 + }, + { + "epoch": 0.6983778126635269, + "grad_norm": 2.1816342624312215, + "learning_rate": 4.4041997367503e-06, + "loss": 0.7565, + "step": 6673 + }, + { + "epoch": 0.6984824699110413, + "grad_norm": 2.0528691851303353, + "learning_rate": 4.401390738468801e-06, + "loss": 0.9013, + "step": 6674 + }, + { + "epoch": 0.6985871271585558, + "grad_norm": 2.2025257423746725, + "learning_rate": 4.39858238347886e-06, + "loss": 0.8969, + "step": 6675 + }, + { + "epoch": 0.6986917844060702, + "grad_norm": 2.1117193529663916, + "learning_rate": 4.395774672103157e-06, + "loss": 0.9545, + "step": 6676 + }, + { + "epoch": 0.6987964416535846, + "grad_norm": 1.8309553111911094, + "learning_rate": 4.392967604664306e-06, + "loss": 0.8649, + "step": 6677 + }, + { + "epoch": 0.6989010989010989, + "grad_norm": 1.8989290566978339, + "learning_rate": 4.3901611814848396e-06, + "loss": 0.9028, + "step": 6678 + }, + { + "epoch": 0.6990057561486133, + "grad_norm": 2.326356063146579, + "learning_rate": 4.387355402887227e-06, + "loss": 0.9389, + "step": 6679 + }, + { + "epoch": 0.6991104133961277, + "grad_norm": 1.9814214365310432, + "learning_rate": 4.384550269193859e-06, + "loss": 0.947, + "step": 6680 + }, + { + "epoch": 0.6992150706436421, + "grad_norm": 1.936796902489133, + "learning_rate": 4.381745780727049e-06, + "loss": 0.7648, + "step": 6681 + }, + { + "epoch": 0.6993197278911565, + "grad_norm": 2.3068454046130187, + "learning_rate": 4.378941937809039e-06, + "loss": 0.9382, + "step": 6682 + }, + { + "epoch": 0.6994243851386709, + "grad_norm": 2.0579861242161295, + "learning_rate": 4.3761387407619915e-06, + "loss": 0.8562, + "step": 6683 + }, + { + "epoch": 0.6995290423861853, + "grad_norm": 2.022770051321196, + "learning_rate": 4.373336189908007e-06, + "loss": 0.8727, + "step": 6684 + }, + { + "epoch": 0.6996336996336996, + "grad_norm": 1.8295973872025644, + "learning_rate": 4.3705342855691e-06, + "loss": 0.8223, + "step": 6685 + }, + { + "epoch": 0.699738356881214, + "grad_norm": 2.415516211385723, + "learning_rate": 4.367733028067217e-06, + "loss": 0.8492, + "step": 6686 + }, + { + "epoch": 0.6998430141287284, + "grad_norm": 2.5417469338554994, + "learning_rate": 4.364932417724222e-06, + "loss": 0.8159, + "step": 6687 + }, + { + "epoch": 0.6999476713762428, + "grad_norm": 1.7713596531603963, + "learning_rate": 4.362132454861916e-06, + "loss": 0.8118, + "step": 6688 + }, + { + "epoch": 0.7000523286237572, + "grad_norm": 1.9958944703716315, + "learning_rate": 4.359333139802024e-06, + "loss": 0.7404, + "step": 6689 + }, + { + "epoch": 0.7001569858712716, + "grad_norm": 2.0988662837685887, + "learning_rate": 4.356534472866189e-06, + "loss": 0.9781, + "step": 6690 + }, + { + "epoch": 0.700261643118786, + "grad_norm": 2.213783510735197, + "learning_rate": 4.353736454375983e-06, + "loss": 0.9575, + "step": 6691 + }, + { + "epoch": 0.7003663003663003, + "grad_norm": 2.17188348926095, + "learning_rate": 4.350939084652906e-06, + "loss": 0.9206, + "step": 6692 + }, + { + "epoch": 0.7004709576138147, + "grad_norm": 2.115726513977887, + "learning_rate": 4.348142364018375e-06, + "loss": 0.9514, + "step": 6693 + }, + { + "epoch": 0.7005756148613291, + "grad_norm": 1.9850295855218871, + "learning_rate": 4.345346292793748e-06, + "loss": 0.8236, + "step": 6694 + }, + { + "epoch": 0.7006802721088435, + "grad_norm": 1.94482693927181, + "learning_rate": 4.342550871300295e-06, + "loss": 0.8051, + "step": 6695 + }, + { + "epoch": 0.7007849293563579, + "grad_norm": 2.043191145044091, + "learning_rate": 4.339756099859213e-06, + "loss": 0.8906, + "step": 6696 + }, + { + "epoch": 0.7008895866038723, + "grad_norm": 2.128017700100571, + "learning_rate": 4.336961978791632e-06, + "loss": 0.9098, + "step": 6697 + }, + { + "epoch": 0.7009942438513868, + "grad_norm": 1.9203516360713857, + "learning_rate": 4.3341685084185965e-06, + "loss": 0.833, + "step": 6698 + }, + { + "epoch": 0.701098901098901, + "grad_norm": 1.9846571799966501, + "learning_rate": 4.331375689061089e-06, + "loss": 0.8093, + "step": 6699 + }, + { + "epoch": 0.7012035583464155, + "grad_norm": 2.6300245916251925, + "learning_rate": 4.328583521040006e-06, + "loss": 0.8283, + "step": 6700 + }, + { + "epoch": 0.7013082155939299, + "grad_norm": 2.227609828599236, + "learning_rate": 4.325792004676175e-06, + "loss": 0.9819, + "step": 6701 + }, + { + "epoch": 0.7014128728414443, + "grad_norm": 2.438764625145741, + "learning_rate": 4.32300114029034e-06, + "loss": 0.8809, + "step": 6702 + }, + { + "epoch": 0.7015175300889587, + "grad_norm": 2.2905189173309046, + "learning_rate": 4.320210928203187e-06, + "loss": 0.9054, + "step": 6703 + }, + { + "epoch": 0.7016221873364731, + "grad_norm": 2.089414522385608, + "learning_rate": 4.317421368735313e-06, + "loss": 0.9524, + "step": 6704 + }, + { + "epoch": 0.7017268445839875, + "grad_norm": 1.774274758188664, + "learning_rate": 4.314632462207239e-06, + "loss": 0.8178, + "step": 6705 + }, + { + "epoch": 0.7018315018315018, + "grad_norm": 2.147402104777586, + "learning_rate": 4.311844208939424e-06, + "loss": 0.7808, + "step": 6706 + }, + { + "epoch": 0.7019361590790162, + "grad_norm": 1.9194606735701536, + "learning_rate": 4.309056609252241e-06, + "loss": 0.9445, + "step": 6707 + }, + { + "epoch": 0.7020408163265306, + "grad_norm": 2.464691497275074, + "learning_rate": 4.306269663465986e-06, + "loss": 0.9314, + "step": 6708 + }, + { + "epoch": 0.702145473574045, + "grad_norm": 2.258686029857579, + "learning_rate": 4.3034833719008935e-06, + "loss": 0.9055, + "step": 6709 + }, + { + "epoch": 0.7022501308215594, + "grad_norm": 1.9254564663234373, + "learning_rate": 4.300697734877109e-06, + "loss": 0.9699, + "step": 6710 + }, + { + "epoch": 0.7023547880690738, + "grad_norm": 2.266755065720574, + "learning_rate": 4.297912752714709e-06, + "loss": 0.8583, + "step": 6711 + }, + { + "epoch": 0.7024594453165882, + "grad_norm": 1.9181752019643432, + "learning_rate": 4.29512842573369e-06, + "loss": 0.9748, + "step": 6712 + }, + { + "epoch": 0.7025641025641025, + "grad_norm": 1.8549903822927598, + "learning_rate": 4.2923447542539785e-06, + "loss": 0.7526, + "step": 6713 + }, + { + "epoch": 0.7026687598116169, + "grad_norm": 1.9363352003242609, + "learning_rate": 4.289561738595431e-06, + "loss": 0.9082, + "step": 6714 + }, + { + "epoch": 0.7027734170591313, + "grad_norm": 2.2343004147251095, + "learning_rate": 4.2867793790778136e-06, + "loss": 0.8977, + "step": 6715 + }, + { + "epoch": 0.7028780743066457, + "grad_norm": 2.0029331901560696, + "learning_rate": 4.283997676020829e-06, + "loss": 0.8673, + "step": 6716 + }, + { + "epoch": 0.7029827315541601, + "grad_norm": 1.93825276090313, + "learning_rate": 4.2812166297440946e-06, + "loss": 0.8511, + "step": 6717 + }, + { + "epoch": 0.7030873888016745, + "grad_norm": 2.1492481371612215, + "learning_rate": 4.278436240567166e-06, + "loss": 0.8906, + "step": 6718 + }, + { + "epoch": 0.703192046049189, + "grad_norm": 1.9356220967112918, + "learning_rate": 4.2756565088095125e-06, + "loss": 0.8833, + "step": 6719 + }, + { + "epoch": 0.7032967032967034, + "grad_norm": 1.9085420160045585, + "learning_rate": 4.272877434790531e-06, + "loss": 0.7844, + "step": 6720 + }, + { + "epoch": 0.7034013605442176, + "grad_norm": 2.1196564007597307, + "learning_rate": 4.270099018829539e-06, + "loss": 0.9646, + "step": 6721 + }, + { + "epoch": 0.703506017791732, + "grad_norm": 2.2766384603785346, + "learning_rate": 4.267321261245785e-06, + "loss": 0.8636, + "step": 6722 + }, + { + "epoch": 0.7036106750392465, + "grad_norm": 2.224471343348745, + "learning_rate": 4.264544162358443e-06, + "loss": 0.9361, + "step": 6723 + }, + { + "epoch": 0.7037153322867609, + "grad_norm": 1.8561557729274152, + "learning_rate": 4.2617677224866035e-06, + "loss": 0.7476, + "step": 6724 + }, + { + "epoch": 0.7038199895342753, + "grad_norm": 2.0486894102396254, + "learning_rate": 4.258991941949286e-06, + "loss": 0.7964, + "step": 6725 + }, + { + "epoch": 0.7039246467817897, + "grad_norm": 2.0441810946350936, + "learning_rate": 4.256216821065431e-06, + "loss": 0.9484, + "step": 6726 + }, + { + "epoch": 0.7040293040293041, + "grad_norm": 2.1128385722097507, + "learning_rate": 4.253442360153905e-06, + "loss": 0.9235, + "step": 6727 + }, + { + "epoch": 0.7041339612768184, + "grad_norm": 2.313931237729976, + "learning_rate": 4.250668559533504e-06, + "loss": 0.9055, + "step": 6728 + }, + { + "epoch": 0.7042386185243328, + "grad_norm": 1.8402369003536179, + "learning_rate": 4.247895419522941e-06, + "loss": 0.7616, + "step": 6729 + }, + { + "epoch": 0.7043432757718472, + "grad_norm": 1.8143536226822832, + "learning_rate": 4.245122940440855e-06, + "loss": 0.935, + "step": 6730 + }, + { + "epoch": 0.7044479330193616, + "grad_norm": 1.9312652171108742, + "learning_rate": 4.242351122605807e-06, + "loss": 0.8543, + "step": 6731 + }, + { + "epoch": 0.704552590266876, + "grad_norm": 1.9610927104433202, + "learning_rate": 4.239579966336286e-06, + "loss": 0.824, + "step": 6732 + }, + { + "epoch": 0.7046572475143904, + "grad_norm": 2.003342197333868, + "learning_rate": 4.236809471950708e-06, + "loss": 0.859, + "step": 6733 + }, + { + "epoch": 0.7047619047619048, + "grad_norm": 1.9433809349811346, + "learning_rate": 4.234039639767406e-06, + "loss": 0.902, + "step": 6734 + }, + { + "epoch": 0.7048665620094191, + "grad_norm": 2.178288945579384, + "learning_rate": 4.231270470104636e-06, + "loss": 0.8413, + "step": 6735 + }, + { + "epoch": 0.7049712192569335, + "grad_norm": 2.3256221301858866, + "learning_rate": 4.228501963280581e-06, + "loss": 0.9749, + "step": 6736 + }, + { + "epoch": 0.7050758765044479, + "grad_norm": 2.1566296507304763, + "learning_rate": 4.225734119613354e-06, + "loss": 0.9976, + "step": 6737 + }, + { + "epoch": 0.7051805337519623, + "grad_norm": 2.2955385696891453, + "learning_rate": 4.222966939420983e-06, + "loss": 0.9847, + "step": 6738 + }, + { + "epoch": 0.7052851909994767, + "grad_norm": 2.219932302560416, + "learning_rate": 4.220200423021421e-06, + "loss": 0.9591, + "step": 6739 + }, + { + "epoch": 0.7053898482469911, + "grad_norm": 2.171556004505835, + "learning_rate": 4.217434570732544e-06, + "loss": 0.9488, + "step": 6740 + }, + { + "epoch": 0.7054945054945055, + "grad_norm": 2.003742070991652, + "learning_rate": 4.214669382872157e-06, + "loss": 1.0053, + "step": 6741 + }, + { + "epoch": 0.7055991627420198, + "grad_norm": 2.2735271709277938, + "learning_rate": 4.2119048597579905e-06, + "loss": 0.7012, + "step": 6742 + }, + { + "epoch": 0.7057038199895342, + "grad_norm": 1.9509244256018325, + "learning_rate": 4.2091410017076884e-06, + "loss": 0.8871, + "step": 6743 + }, + { + "epoch": 0.7058084772370486, + "grad_norm": 1.803214751829208, + "learning_rate": 4.206377809038825e-06, + "loss": 0.8344, + "step": 6744 + }, + { + "epoch": 0.705913134484563, + "grad_norm": 2.1681624054889075, + "learning_rate": 4.203615282068896e-06, + "loss": 0.9701, + "step": 6745 + }, + { + "epoch": 0.7060177917320775, + "grad_norm": 2.003295420905956, + "learning_rate": 4.200853421115317e-06, + "loss": 0.9148, + "step": 6746 + }, + { + "epoch": 0.7061224489795919, + "grad_norm": 1.8542405762144707, + "learning_rate": 4.1980922264954415e-06, + "loss": 0.7981, + "step": 6747 + }, + { + "epoch": 0.7062271062271063, + "grad_norm": 2.2185412641129605, + "learning_rate": 4.195331698526526e-06, + "loss": 0.9382, + "step": 6748 + }, + { + "epoch": 0.7063317634746206, + "grad_norm": 1.9421670362200358, + "learning_rate": 4.19257183752577e-06, + "loss": 0.9211, + "step": 6749 + }, + { + "epoch": 0.706436420722135, + "grad_norm": 2.350163108156549, + "learning_rate": 4.189812643810282e-06, + "loss": 0.9539, + "step": 6750 + }, + { + "epoch": 0.7065410779696494, + "grad_norm": 2.1933671368986256, + "learning_rate": 4.187054117697097e-06, + "loss": 0.9205, + "step": 6751 + }, + { + "epoch": 0.7066457352171638, + "grad_norm": 1.8016529905030467, + "learning_rate": 4.184296259503181e-06, + "loss": 0.863, + "step": 6752 + }, + { + "epoch": 0.7067503924646782, + "grad_norm": 1.638442385400924, + "learning_rate": 4.181539069545414e-06, + "loss": 0.8414, + "step": 6753 + }, + { + "epoch": 0.7068550497121926, + "grad_norm": 2.2815059203943218, + "learning_rate": 4.1787825481406044e-06, + "loss": 0.9858, + "step": 6754 + }, + { + "epoch": 0.706959706959707, + "grad_norm": 2.0723833971887684, + "learning_rate": 4.176026695605476e-06, + "loss": 0.8515, + "step": 6755 + }, + { + "epoch": 0.7070643642072213, + "grad_norm": 2.0981663306838585, + "learning_rate": 4.17327151225669e-06, + "loss": 0.9072, + "step": 6756 + }, + { + "epoch": 0.7071690214547357, + "grad_norm": 1.885601962808024, + "learning_rate": 4.170516998410817e-06, + "loss": 0.9031, + "step": 6757 + }, + { + "epoch": 0.7072736787022501, + "grad_norm": 2.253291731810213, + "learning_rate": 4.1677631543843625e-06, + "loss": 0.9391, + "step": 6758 + }, + { + "epoch": 0.7073783359497645, + "grad_norm": 1.956313519509557, + "learning_rate": 4.165009980493742e-06, + "loss": 0.8985, + "step": 6759 + }, + { + "epoch": 0.7074829931972789, + "grad_norm": 2.2664708007932743, + "learning_rate": 4.1622574770553025e-06, + "loss": 0.8731, + "step": 6760 + }, + { + "epoch": 0.7075876504447933, + "grad_norm": 2.195150032088823, + "learning_rate": 4.159505644385316e-06, + "loss": 0.8524, + "step": 6761 + }, + { + "epoch": 0.7076923076923077, + "grad_norm": 1.9346641546170762, + "learning_rate": 4.1567544827999705e-06, + "loss": 0.8993, + "step": 6762 + }, + { + "epoch": 0.7077969649398221, + "grad_norm": 2.2504254337448453, + "learning_rate": 4.154003992615381e-06, + "loss": 0.9644, + "step": 6763 + }, + { + "epoch": 0.7079016221873364, + "grad_norm": 2.188401815842773, + "learning_rate": 4.151254174147584e-06, + "loss": 0.8787, + "step": 6764 + }, + { + "epoch": 0.7080062794348508, + "grad_norm": 2.037348430435397, + "learning_rate": 4.148505027712535e-06, + "loss": 0.8898, + "step": 6765 + }, + { + "epoch": 0.7081109366823652, + "grad_norm": 2.121364396425157, + "learning_rate": 4.1457565536261204e-06, + "loss": 0.9164, + "step": 6766 + }, + { + "epoch": 0.7082155939298796, + "grad_norm": 2.1227061314556104, + "learning_rate": 4.14300875220415e-06, + "loss": 0.8375, + "step": 6767 + }, + { + "epoch": 0.7083202511773941, + "grad_norm": 2.2345470752377112, + "learning_rate": 4.140261623762346e-06, + "loss": 1.0657, + "step": 6768 + }, + { + "epoch": 0.7084249084249085, + "grad_norm": 2.115378021426516, + "learning_rate": 4.13751516861636e-06, + "loss": 0.8428, + "step": 6769 + }, + { + "epoch": 0.7085295656724229, + "grad_norm": 1.8781409525877706, + "learning_rate": 4.134769387081764e-06, + "loss": 0.8158, + "step": 6770 + }, + { + "epoch": 0.7086342229199372, + "grad_norm": 2.870662361008902, + "learning_rate": 4.132024279474058e-06, + "loss": 0.8034, + "step": 6771 + }, + { + "epoch": 0.7087388801674516, + "grad_norm": 1.9910392284133023, + "learning_rate": 4.129279846108658e-06, + "loss": 0.8839, + "step": 6772 + }, + { + "epoch": 0.708843537414966, + "grad_norm": 2.7345053141599274, + "learning_rate": 4.1265360873009054e-06, + "loss": 0.8741, + "step": 6773 + }, + { + "epoch": 0.7089481946624804, + "grad_norm": 1.817693344021189, + "learning_rate": 4.123793003366059e-06, + "loss": 0.8432, + "step": 6774 + }, + { + "epoch": 0.7090528519099948, + "grad_norm": 1.9857696095073964, + "learning_rate": 4.121050594619308e-06, + "loss": 0.9486, + "step": 6775 + }, + { + "epoch": 0.7091575091575092, + "grad_norm": 2.138607157252604, + "learning_rate": 4.118308861375766e-06, + "loss": 0.9636, + "step": 6776 + }, + { + "epoch": 0.7092621664050236, + "grad_norm": 1.801323948809371, + "learning_rate": 4.11556780395046e-06, + "loss": 0.8476, + "step": 6777 + }, + { + "epoch": 0.7093668236525379, + "grad_norm": 2.120684116550481, + "learning_rate": 4.112827422658341e-06, + "loss": 0.9265, + "step": 6778 + }, + { + "epoch": 0.7094714809000523, + "grad_norm": 1.9405112301994334, + "learning_rate": 4.1100877178142825e-06, + "loss": 0.8766, + "step": 6779 + }, + { + "epoch": 0.7095761381475667, + "grad_norm": 2.33308294996142, + "learning_rate": 4.10734868973309e-06, + "loss": 0.8442, + "step": 6780 + }, + { + "epoch": 0.7096807953950811, + "grad_norm": 2.0092401960246016, + "learning_rate": 4.104610338729478e-06, + "loss": 0.9067, + "step": 6781 + }, + { + "epoch": 0.7097854526425955, + "grad_norm": 2.4295543764879763, + "learning_rate": 4.101872665118088e-06, + "loss": 0.9433, + "step": 6782 + }, + { + "epoch": 0.7098901098901099, + "grad_norm": 2.1008017438183924, + "learning_rate": 4.099135669213483e-06, + "loss": 0.8513, + "step": 6783 + }, + { + "epoch": 0.7099947671376243, + "grad_norm": 2.445983785008863, + "learning_rate": 4.096399351330156e-06, + "loss": 0.9137, + "step": 6784 + }, + { + "epoch": 0.7100994243851386, + "grad_norm": 2.1839137429933606, + "learning_rate": 4.093663711782507e-06, + "loss": 0.8465, + "step": 6785 + }, + { + "epoch": 0.710204081632653, + "grad_norm": 1.9874222387615066, + "learning_rate": 4.0909287508848745e-06, + "loss": 0.8232, + "step": 6786 + }, + { + "epoch": 0.7103087388801674, + "grad_norm": 1.8844372558093263, + "learning_rate": 4.0881944689515085e-06, + "loss": 0.8779, + "step": 6787 + }, + { + "epoch": 0.7104133961276818, + "grad_norm": 1.9799678528512243, + "learning_rate": 4.085460866296581e-06, + "loss": 0.9447, + "step": 6788 + }, + { + "epoch": 0.7105180533751962, + "grad_norm": 2.124231521122012, + "learning_rate": 4.082727943234187e-06, + "loss": 0.8971, + "step": 6789 + }, + { + "epoch": 0.7106227106227107, + "grad_norm": 1.8530431577227409, + "learning_rate": 4.079995700078352e-06, + "loss": 0.8753, + "step": 6790 + }, + { + "epoch": 0.7107273678702251, + "grad_norm": 2.0466031803847162, + "learning_rate": 4.077264137143012e-06, + "loss": 0.8169, + "step": 6791 + }, + { + "epoch": 0.7108320251177394, + "grad_norm": 2.183687382955889, + "learning_rate": 4.074533254742026e-06, + "loss": 0.937, + "step": 6792 + }, + { + "epoch": 0.7109366823652538, + "grad_norm": 1.8299109318292106, + "learning_rate": 4.071803053189184e-06, + "loss": 0.8489, + "step": 6793 + }, + { + "epoch": 0.7110413396127682, + "grad_norm": 2.48666879446886, + "learning_rate": 4.069073532798185e-06, + "loss": 0.9826, + "step": 6794 + }, + { + "epoch": 0.7111459968602826, + "grad_norm": 1.9966103200441245, + "learning_rate": 4.066344693882665e-06, + "loss": 0.9272, + "step": 6795 + }, + { + "epoch": 0.711250654107797, + "grad_norm": 2.0372668354924715, + "learning_rate": 4.063616536756168e-06, + "loss": 0.8891, + "step": 6796 + }, + { + "epoch": 0.7113553113553114, + "grad_norm": 2.0421407323614487, + "learning_rate": 4.060889061732165e-06, + "loss": 0.8493, + "step": 6797 + }, + { + "epoch": 0.7114599686028258, + "grad_norm": 2.051281230221165, + "learning_rate": 4.058162269124045e-06, + "loss": 0.8805, + "step": 6798 + }, + { + "epoch": 0.7115646258503401, + "grad_norm": 2.0417527238947617, + "learning_rate": 4.055436159245129e-06, + "loss": 0.8527, + "step": 6799 + }, + { + "epoch": 0.7116692830978545, + "grad_norm": 2.01692569770081, + "learning_rate": 4.0527107324086465e-06, + "loss": 0.8964, + "step": 6800 + }, + { + "epoch": 0.7117739403453689, + "grad_norm": 2.498291537658827, + "learning_rate": 4.04998598892776e-06, + "loss": 0.8871, + "step": 6801 + }, + { + "epoch": 0.7118785975928833, + "grad_norm": 1.7272578518417077, + "learning_rate": 4.047261929115546e-06, + "loss": 0.7592, + "step": 6802 + }, + { + "epoch": 0.7119832548403977, + "grad_norm": 2.067238416917133, + "learning_rate": 4.044538553285003e-06, + "loss": 0.8075, + "step": 6803 + }, + { + "epoch": 0.7120879120879121, + "grad_norm": 2.0306398676192043, + "learning_rate": 4.04181586174905e-06, + "loss": 0.9844, + "step": 6804 + }, + { + "epoch": 0.7121925693354265, + "grad_norm": 1.7327165325882197, + "learning_rate": 4.039093854820537e-06, + "loss": 0.8486, + "step": 6805 + }, + { + "epoch": 0.7122972265829409, + "grad_norm": 1.954070649531059, + "learning_rate": 4.036372532812224e-06, + "loss": 0.9154, + "step": 6806 + }, + { + "epoch": 0.7124018838304552, + "grad_norm": 2.113577347797847, + "learning_rate": 4.033651896036797e-06, + "loss": 0.8904, + "step": 6807 + }, + { + "epoch": 0.7125065410779696, + "grad_norm": 2.103592454954578, + "learning_rate": 4.030931944806857e-06, + "loss": 0.8943, + "step": 6808 + }, + { + "epoch": 0.712611198325484, + "grad_norm": 2.1336538127277405, + "learning_rate": 4.0282126794349385e-06, + "loss": 0.8553, + "step": 6809 + }, + { + "epoch": 0.7127158555729984, + "grad_norm": 1.919816279896638, + "learning_rate": 4.0254941002334914e-06, + "loss": 0.8675, + "step": 6810 + }, + { + "epoch": 0.7128205128205128, + "grad_norm": 1.8626947261175397, + "learning_rate": 4.022776207514885e-06, + "loss": 0.8163, + "step": 6811 + }, + { + "epoch": 0.7129251700680272, + "grad_norm": 2.2003625119304875, + "learning_rate": 4.020059001591409e-06, + "loss": 0.9314, + "step": 6812 + }, + { + "epoch": 0.7130298273155417, + "grad_norm": 2.1580488886290325, + "learning_rate": 4.0173424827752725e-06, + "loss": 0.9063, + "step": 6813 + }, + { + "epoch": 0.713134484563056, + "grad_norm": 1.813499024046145, + "learning_rate": 4.014626651378617e-06, + "loss": 0.8229, + "step": 6814 + }, + { + "epoch": 0.7132391418105704, + "grad_norm": 2.13054755191047, + "learning_rate": 4.011911507713493e-06, + "loss": 0.9653, + "step": 6815 + }, + { + "epoch": 0.7133437990580848, + "grad_norm": 2.3481830820459466, + "learning_rate": 4.0091970520918755e-06, + "loss": 0.9658, + "step": 6816 + }, + { + "epoch": 0.7134484563055992, + "grad_norm": 3.110141424492723, + "learning_rate": 4.006483284825658e-06, + "loss": 0.9323, + "step": 6817 + }, + { + "epoch": 0.7135531135531136, + "grad_norm": 1.9336469464972577, + "learning_rate": 4.00377020622666e-06, + "loss": 0.9056, + "step": 6818 + }, + { + "epoch": 0.713657770800628, + "grad_norm": 1.7890408343625537, + "learning_rate": 4.001057816606626e-06, + "loss": 0.8723, + "step": 6819 + }, + { + "epoch": 0.7137624280481424, + "grad_norm": 2.0458672431955427, + "learning_rate": 3.99834611627721e-06, + "loss": 0.9242, + "step": 6820 + }, + { + "epoch": 0.7138670852956567, + "grad_norm": 2.169252164372577, + "learning_rate": 3.995635105549991e-06, + "loss": 0.8732, + "step": 6821 + }, + { + "epoch": 0.7139717425431711, + "grad_norm": 2.1207025732586677, + "learning_rate": 3.99292478473647e-06, + "loss": 0.9312, + "step": 6822 + }, + { + "epoch": 0.7140763997906855, + "grad_norm": 1.8860336974481258, + "learning_rate": 3.990215154148067e-06, + "loss": 0.8711, + "step": 6823 + }, + { + "epoch": 0.7141810570381999, + "grad_norm": 1.7535400399995449, + "learning_rate": 3.987506214096128e-06, + "loss": 0.8432, + "step": 6824 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.9294483445338717, + "learning_rate": 3.984797964891914e-06, + "loss": 0.8547, + "step": 6825 + }, + { + "epoch": 0.7143903715332287, + "grad_norm": 2.1241763749956415, + "learning_rate": 3.982090406846605e-06, + "loss": 0.9769, + "step": 6826 + }, + { + "epoch": 0.7144950287807431, + "grad_norm": 1.617410025245596, + "learning_rate": 3.979383540271311e-06, + "loss": 0.8158, + "step": 6827 + }, + { + "epoch": 0.7145996860282574, + "grad_norm": 1.9851405262752755, + "learning_rate": 3.976677365477049e-06, + "loss": 0.8617, + "step": 6828 + }, + { + "epoch": 0.7147043432757718, + "grad_norm": 2.3520800347490125, + "learning_rate": 3.973971882774773e-06, + "loss": 0.8526, + "step": 6829 + }, + { + "epoch": 0.7148090005232862, + "grad_norm": 1.9514505903548016, + "learning_rate": 3.971267092475343e-06, + "loss": 0.9752, + "step": 6830 + }, + { + "epoch": 0.7149136577708006, + "grad_norm": 2.5826157191432997, + "learning_rate": 3.968562994889545e-06, + "loss": 0.9407, + "step": 6831 + }, + { + "epoch": 0.715018315018315, + "grad_norm": 2.300126908869992, + "learning_rate": 3.9658595903280836e-06, + "loss": 0.8193, + "step": 6832 + }, + { + "epoch": 0.7151229722658294, + "grad_norm": 1.8165911897167484, + "learning_rate": 3.963156879101591e-06, + "loss": 0.8707, + "step": 6833 + }, + { + "epoch": 0.7152276295133438, + "grad_norm": 2.059647993743263, + "learning_rate": 3.96045486152061e-06, + "loss": 0.9069, + "step": 6834 + }, + { + "epoch": 0.7153322867608581, + "grad_norm": 3.670243670467124, + "learning_rate": 3.957753537895607e-06, + "loss": 0.9077, + "step": 6835 + }, + { + "epoch": 0.7154369440083725, + "grad_norm": 2.4214679282114204, + "learning_rate": 3.9550529085369735e-06, + "loss": 0.9734, + "step": 6836 + }, + { + "epoch": 0.715541601255887, + "grad_norm": 1.908703225014119, + "learning_rate": 3.952352973755012e-06, + "loss": 0.9761, + "step": 6837 + }, + { + "epoch": 0.7156462585034014, + "grad_norm": 1.9494300614897873, + "learning_rate": 3.949653733859958e-06, + "loss": 0.9582, + "step": 6838 + }, + { + "epoch": 0.7157509157509158, + "grad_norm": 2.3349105601456652, + "learning_rate": 3.946955189161954e-06, + "loss": 0.9848, + "step": 6839 + }, + { + "epoch": 0.7158555729984302, + "grad_norm": 1.98806617370881, + "learning_rate": 3.944257339971071e-06, + "loss": 1.0294, + "step": 6840 + }, + { + "epoch": 0.7159602302459446, + "grad_norm": 2.032978994521527, + "learning_rate": 3.941560186597295e-06, + "loss": 0.7697, + "step": 6841 + }, + { + "epoch": 0.7160648874934589, + "grad_norm": 2.2859142638467196, + "learning_rate": 3.938863729350532e-06, + "loss": 0.9599, + "step": 6842 + }, + { + "epoch": 0.7161695447409733, + "grad_norm": 1.95473032037277, + "learning_rate": 3.936167968540617e-06, + "loss": 0.8378, + "step": 6843 + }, + { + "epoch": 0.7162742019884877, + "grad_norm": 2.2383483044471206, + "learning_rate": 3.933472904477291e-06, + "loss": 0.8798, + "step": 6844 + }, + { + "epoch": 0.7163788592360021, + "grad_norm": 1.6286590487546038, + "learning_rate": 3.930778537470231e-06, + "loss": 0.9243, + "step": 6845 + }, + { + "epoch": 0.7164835164835165, + "grad_norm": 1.8458670553744083, + "learning_rate": 3.928084867829021e-06, + "loss": 0.7944, + "step": 6846 + }, + { + "epoch": 0.7165881737310309, + "grad_norm": 2.270745347465646, + "learning_rate": 3.9253918958631635e-06, + "loss": 1.0004, + "step": 6847 + }, + { + "epoch": 0.7166928309785453, + "grad_norm": 1.8785468739801112, + "learning_rate": 3.922699621882097e-06, + "loss": 0.8825, + "step": 6848 + }, + { + "epoch": 0.7167974882260597, + "grad_norm": 2.05640939822826, + "learning_rate": 3.920008046195163e-06, + "loss": 0.8316, + "step": 6849 + }, + { + "epoch": 0.716902145473574, + "grad_norm": 2.2722674004261476, + "learning_rate": 3.917317169111629e-06, + "loss": 1.0579, + "step": 6850 + }, + { + "epoch": 0.7170068027210884, + "grad_norm": 2.4202451915007512, + "learning_rate": 3.9146269909406805e-06, + "loss": 0.9921, + "step": 6851 + }, + { + "epoch": 0.7171114599686028, + "grad_norm": 2.2412583484266295, + "learning_rate": 3.911937511991428e-06, + "loss": 0.8362, + "step": 6852 + }, + { + "epoch": 0.7172161172161172, + "grad_norm": 2.0935113321976733, + "learning_rate": 3.9092487325729e-06, + "loss": 0.7682, + "step": 6853 + }, + { + "epoch": 0.7173207744636316, + "grad_norm": 2.0222550806048267, + "learning_rate": 3.906560652994039e-06, + "loss": 0.8426, + "step": 6854 + }, + { + "epoch": 0.717425431711146, + "grad_norm": 1.9696243728220855, + "learning_rate": 3.903873273563713e-06, + "loss": 0.8273, + "step": 6855 + }, + { + "epoch": 0.7175300889586604, + "grad_norm": 1.8907957710602337, + "learning_rate": 3.901186594590702e-06, + "loss": 0.8415, + "step": 6856 + }, + { + "epoch": 0.7176347462061747, + "grad_norm": 1.8888147086385425, + "learning_rate": 3.898500616383718e-06, + "loss": 0.9082, + "step": 6857 + }, + { + "epoch": 0.7177394034536891, + "grad_norm": 1.9341244750710724, + "learning_rate": 3.895815339251382e-06, + "loss": 0.8821, + "step": 6858 + }, + { + "epoch": 0.7178440607012035, + "grad_norm": 2.2280568062526522, + "learning_rate": 3.893130763502239e-06, + "loss": 0.8867, + "step": 6859 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 1.9985998533707623, + "learning_rate": 3.890446889444751e-06, + "loss": 0.8671, + "step": 6860 + }, + { + "epoch": 0.7180533751962324, + "grad_norm": 1.9789150948560013, + "learning_rate": 3.887763717387296e-06, + "loss": 0.9274, + "step": 6861 + }, + { + "epoch": 0.7181580324437468, + "grad_norm": 2.008681323462506, + "learning_rate": 3.8850812476381814e-06, + "loss": 0.918, + "step": 6862 + }, + { + "epoch": 0.7182626896912612, + "grad_norm": 1.6970462178438779, + "learning_rate": 3.882399480505632e-06, + "loss": 0.7673, + "step": 6863 + }, + { + "epoch": 0.7183673469387755, + "grad_norm": 1.9737100576957622, + "learning_rate": 3.879718416297783e-06, + "loss": 0.8705, + "step": 6864 + }, + { + "epoch": 0.7184720041862899, + "grad_norm": 2.116672494256888, + "learning_rate": 3.877038055322696e-06, + "loss": 0.9756, + "step": 6865 + }, + { + "epoch": 0.7185766614338043, + "grad_norm": 1.6446313933488226, + "learning_rate": 3.874358397888345e-06, + "loss": 0.7854, + "step": 6866 + }, + { + "epoch": 0.7186813186813187, + "grad_norm": 2.2698290566150323, + "learning_rate": 3.871679444302635e-06, + "loss": 0.8557, + "step": 6867 + }, + { + "epoch": 0.7187859759288331, + "grad_norm": 2.1256730677441578, + "learning_rate": 3.86900119487338e-06, + "loss": 0.8339, + "step": 6868 + }, + { + "epoch": 0.7188906331763475, + "grad_norm": 1.9881658784209284, + "learning_rate": 3.866323649908318e-06, + "loss": 0.9258, + "step": 6869 + }, + { + "epoch": 0.7189952904238619, + "grad_norm": 2.124198993385882, + "learning_rate": 3.8636468097150995e-06, + "loss": 0.883, + "step": 6870 + }, + { + "epoch": 0.7190999476713762, + "grad_norm": 1.9123797963557438, + "learning_rate": 3.860970674601301e-06, + "loss": 0.8941, + "step": 6871 + }, + { + "epoch": 0.7192046049188906, + "grad_norm": 2.0104065734716636, + "learning_rate": 3.85829524487442e-06, + "loss": 0.8806, + "step": 6872 + }, + { + "epoch": 0.719309262166405, + "grad_norm": 2.1166454011026707, + "learning_rate": 3.855620520841868e-06, + "loss": 0.923, + "step": 6873 + }, + { + "epoch": 0.7194139194139194, + "grad_norm": 1.9204632832965207, + "learning_rate": 3.8529465028109725e-06, + "loss": 0.8695, + "step": 6874 + }, + { + "epoch": 0.7195185766614338, + "grad_norm": 1.937070827448725, + "learning_rate": 3.8502731910889826e-06, + "loss": 0.835, + "step": 6875 + }, + { + "epoch": 0.7196232339089482, + "grad_norm": 2.423772623864867, + "learning_rate": 3.8476005859830725e-06, + "loss": 1.0035, + "step": 6876 + }, + { + "epoch": 0.7197278911564626, + "grad_norm": 2.2319802937639213, + "learning_rate": 3.844928687800328e-06, + "loss": 0.9172, + "step": 6877 + }, + { + "epoch": 0.7198325484039769, + "grad_norm": 1.8816969569554647, + "learning_rate": 3.842257496847751e-06, + "loss": 0.8183, + "step": 6878 + }, + { + "epoch": 0.7199372056514913, + "grad_norm": 2.328593060017702, + "learning_rate": 3.839587013432273e-06, + "loss": 0.9581, + "step": 6879 + }, + { + "epoch": 0.7200418628990057, + "grad_norm": 2.357599652370192, + "learning_rate": 3.836917237860738e-06, + "loss": 0.8882, + "step": 6880 + }, + { + "epoch": 0.7201465201465201, + "grad_norm": 2.112443593657366, + "learning_rate": 3.834248170439901e-06, + "loss": 0.8824, + "step": 6881 + }, + { + "epoch": 0.7202511773940345, + "grad_norm": 2.5342213753725265, + "learning_rate": 3.831579811476452e-06, + "loss": 0.8558, + "step": 6882 + }, + { + "epoch": 0.720355834641549, + "grad_norm": 2.187453453613242, + "learning_rate": 3.8289121612769885e-06, + "loss": 1.0146, + "step": 6883 + }, + { + "epoch": 0.7204604918890634, + "grad_norm": 2.0491956184185764, + "learning_rate": 3.8262452201480275e-06, + "loss": 0.9876, + "step": 6884 + }, + { + "epoch": 0.7205651491365777, + "grad_norm": 1.9619918810973644, + "learning_rate": 3.823578988396003e-06, + "loss": 0.9359, + "step": 6885 + }, + { + "epoch": 0.7206698063840921, + "grad_norm": 2.435486034108263, + "learning_rate": 3.820913466327276e-06, + "loss": 0.9142, + "step": 6886 + }, + { + "epoch": 0.7207744636316065, + "grad_norm": 2.128985121841604, + "learning_rate": 3.818248654248115e-06, + "loss": 0.8178, + "step": 6887 + }, + { + "epoch": 0.7208791208791209, + "grad_norm": 2.1846007264436063, + "learning_rate": 3.81558455246472e-06, + "loss": 0.8921, + "step": 6888 + }, + { + "epoch": 0.7209837781266353, + "grad_norm": 2.134017674581687, + "learning_rate": 3.8129211612831953e-06, + "loss": 0.9682, + "step": 6889 + }, + { + "epoch": 0.7210884353741497, + "grad_norm": 2.4046779312825386, + "learning_rate": 3.8102584810095687e-06, + "loss": 0.801, + "step": 6890 + }, + { + "epoch": 0.7211930926216641, + "grad_norm": 2.2865554223892066, + "learning_rate": 3.8075965119497936e-06, + "loss": 0.9072, + "step": 6891 + }, + { + "epoch": 0.7212977498691785, + "grad_norm": 1.8509159996167859, + "learning_rate": 3.804935254409734e-06, + "loss": 0.88, + "step": 6892 + }, + { + "epoch": 0.7214024071166928, + "grad_norm": 2.2394314733138647, + "learning_rate": 3.8022747086951715e-06, + "loss": 0.904, + "step": 6893 + }, + { + "epoch": 0.7215070643642072, + "grad_norm": 1.9861651335567048, + "learning_rate": 3.7996148751118057e-06, + "loss": 0.8216, + "step": 6894 + }, + { + "epoch": 0.7216117216117216, + "grad_norm": 1.9218843095758162, + "learning_rate": 3.7969557539652636e-06, + "loss": 0.918, + "step": 6895 + }, + { + "epoch": 0.721716378859236, + "grad_norm": 1.9157472248957887, + "learning_rate": 3.7942973455610766e-06, + "loss": 0.8062, + "step": 6896 + }, + { + "epoch": 0.7218210361067504, + "grad_norm": 1.9189826257514782, + "learning_rate": 3.791639650204709e-06, + "loss": 0.8006, + "step": 6897 + }, + { + "epoch": 0.7219256933542648, + "grad_norm": 1.8660285862083692, + "learning_rate": 3.7889826682015306e-06, + "loss": 0.7262, + "step": 6898 + }, + { + "epoch": 0.7220303506017792, + "grad_norm": 2.0909598391703046, + "learning_rate": 3.7863263998568346e-06, + "loss": 0.9164, + "step": 6899 + }, + { + "epoch": 0.7221350078492935, + "grad_norm": 2.5637117016782507, + "learning_rate": 3.7836708454758287e-06, + "loss": 0.8562, + "step": 6900 + }, + { + "epoch": 0.7222396650968079, + "grad_norm": 2.09685317781106, + "learning_rate": 3.781016005363648e-06, + "loss": 0.9648, + "step": 6901 + }, + { + "epoch": 0.7223443223443223, + "grad_norm": 1.832463976356295, + "learning_rate": 3.7783618798253354e-06, + "loss": 0.8759, + "step": 6902 + }, + { + "epoch": 0.7224489795918367, + "grad_norm": 1.9229291271734896, + "learning_rate": 3.7757084691658552e-06, + "loss": 0.8913, + "step": 6903 + }, + { + "epoch": 0.7225536368393511, + "grad_norm": 2.519016468836667, + "learning_rate": 3.7730557736900865e-06, + "loss": 0.9351, + "step": 6904 + }, + { + "epoch": 0.7226582940868655, + "grad_norm": 1.9402129919604572, + "learning_rate": 3.7704037937028324e-06, + "loss": 0.9447, + "step": 6905 + }, + { + "epoch": 0.72276295133438, + "grad_norm": 1.9677225873579622, + "learning_rate": 3.7677525295088146e-06, + "loss": 0.8379, + "step": 6906 + }, + { + "epoch": 0.7228676085818942, + "grad_norm": 2.3777845625897482, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.8652, + "step": 6907 + }, + { + "epoch": 0.7229722658294087, + "grad_norm": 1.9315748698397706, + "learning_rate": 3.762452149718938e-06, + "loss": 0.9468, + "step": 6908 + }, + { + "epoch": 0.7230769230769231, + "grad_norm": 2.205149403013181, + "learning_rate": 3.7598030347321e-06, + "loss": 0.8733, + "step": 6909 + }, + { + "epoch": 0.7231815803244375, + "grad_norm": 1.8693736948006856, + "learning_rate": 3.757154636756547e-06, + "loss": 0.8561, + "step": 6910 + }, + { + "epoch": 0.7232862375719519, + "grad_norm": 1.873969507521071, + "learning_rate": 3.7545069560965817e-06, + "loss": 0.8318, + "step": 6911 + }, + { + "epoch": 0.7233908948194663, + "grad_norm": 2.42283057627858, + "learning_rate": 3.751859993056428e-06, + "loss": 0.8737, + "step": 6912 + }, + { + "epoch": 0.7234955520669807, + "grad_norm": 1.865784635381646, + "learning_rate": 3.749213747940225e-06, + "loss": 1.0405, + "step": 6913 + }, + { + "epoch": 0.723600209314495, + "grad_norm": 2.6319699814934623, + "learning_rate": 3.7465682210520325e-06, + "loss": 0.934, + "step": 6914 + }, + { + "epoch": 0.7237048665620094, + "grad_norm": 1.9195838969682297, + "learning_rate": 3.7439234126958326e-06, + "loss": 0.8494, + "step": 6915 + }, + { + "epoch": 0.7238095238095238, + "grad_norm": 2.1904292611117726, + "learning_rate": 3.741279323175515e-06, + "loss": 0.9698, + "step": 6916 + }, + { + "epoch": 0.7239141810570382, + "grad_norm": 2.2002972862059833, + "learning_rate": 3.7386359527948914e-06, + "loss": 0.8893, + "step": 6917 + }, + { + "epoch": 0.7240188383045526, + "grad_norm": 2.3913339759973917, + "learning_rate": 3.7359933018576887e-06, + "loss": 0.8367, + "step": 6918 + }, + { + "epoch": 0.724123495552067, + "grad_norm": 2.1969314364288484, + "learning_rate": 3.733351370667552e-06, + "loss": 0.9061, + "step": 6919 + }, + { + "epoch": 0.7242281527995814, + "grad_norm": 2.024923979347576, + "learning_rate": 3.7307101595280495e-06, + "loss": 0.9443, + "step": 6920 + }, + { + "epoch": 0.7243328100470957, + "grad_norm": 2.113986243612249, + "learning_rate": 3.7280696687426578e-06, + "loss": 0.9265, + "step": 6921 + }, + { + "epoch": 0.7244374672946101, + "grad_norm": 1.950614982154138, + "learning_rate": 3.725429898614773e-06, + "loss": 0.9245, + "step": 6922 + }, + { + "epoch": 0.7245421245421245, + "grad_norm": 2.1324732521905205, + "learning_rate": 3.7227908494477174e-06, + "loss": 0.9454, + "step": 6923 + }, + { + "epoch": 0.7246467817896389, + "grad_norm": 2.2386944352919906, + "learning_rate": 3.7201525215447133e-06, + "loss": 0.9405, + "step": 6924 + }, + { + "epoch": 0.7247514390371533, + "grad_norm": 2.0332904488764236, + "learning_rate": 3.7175149152089185e-06, + "loss": 0.982, + "step": 6925 + }, + { + "epoch": 0.7248560962846677, + "grad_norm": 1.8576178852621916, + "learning_rate": 3.7148780307433975e-06, + "loss": 0.7685, + "step": 6926 + }, + { + "epoch": 0.7249607535321821, + "grad_norm": 2.1484299818831945, + "learning_rate": 3.7122418684511306e-06, + "loss": 0.8236, + "step": 6927 + }, + { + "epoch": 0.7250654107796964, + "grad_norm": 2.516712152720112, + "learning_rate": 3.7096064286350164e-06, + "loss": 0.8525, + "step": 6928 + }, + { + "epoch": 0.7251700680272108, + "grad_norm": 2.1566899339097936, + "learning_rate": 3.706971711597879e-06, + "loss": 0.926, + "step": 6929 + }, + { + "epoch": 0.7252747252747253, + "grad_norm": 1.7869536827919366, + "learning_rate": 3.7043377176424467e-06, + "loss": 0.9757, + "step": 6930 + }, + { + "epoch": 0.7253793825222397, + "grad_norm": 1.70469612797693, + "learning_rate": 3.701704447071376e-06, + "loss": 0.8288, + "step": 6931 + }, + { + "epoch": 0.7254840397697541, + "grad_norm": 1.8514012748832875, + "learning_rate": 3.699071900187232e-06, + "loss": 0.8338, + "step": 6932 + }, + { + "epoch": 0.7255886970172685, + "grad_norm": 2.3795752920291773, + "learning_rate": 3.696440077292497e-06, + "loss": 0.862, + "step": 6933 + }, + { + "epoch": 0.7256933542647829, + "grad_norm": 2.2429703352322004, + "learning_rate": 3.6938089786895783e-06, + "loss": 1.0258, + "step": 6934 + }, + { + "epoch": 0.7257980115122973, + "grad_norm": 1.986089459097724, + "learning_rate": 3.691178604680793e-06, + "loss": 0.8968, + "step": 6935 + }, + { + "epoch": 0.7259026687598116, + "grad_norm": 1.8760611340077848, + "learning_rate": 3.688548955568375e-06, + "loss": 0.8637, + "step": 6936 + }, + { + "epoch": 0.726007326007326, + "grad_norm": 1.8062260800693908, + "learning_rate": 3.685920031654476e-06, + "loss": 0.8974, + "step": 6937 + }, + { + "epoch": 0.7261119832548404, + "grad_norm": 2.0910032713345155, + "learning_rate": 3.683291833241163e-06, + "loss": 0.9255, + "step": 6938 + }, + { + "epoch": 0.7262166405023548, + "grad_norm": 1.952084219425161, + "learning_rate": 3.6806643606304226e-06, + "loss": 0.954, + "step": 6939 + }, + { + "epoch": 0.7263212977498692, + "grad_norm": 2.2475768165777756, + "learning_rate": 3.6780376141241624e-06, + "loss": 0.7996, + "step": 6940 + }, + { + "epoch": 0.7264259549973836, + "grad_norm": 2.039613944161897, + "learning_rate": 3.6754115940241954e-06, + "loss": 0.8029, + "step": 6941 + }, + { + "epoch": 0.726530612244898, + "grad_norm": 2.4372491132455054, + "learning_rate": 3.6727863006322585e-06, + "loss": 0.8573, + "step": 6942 + }, + { + "epoch": 0.7266352694924123, + "grad_norm": 2.135064158968042, + "learning_rate": 3.6701617342499975e-06, + "loss": 0.9144, + "step": 6943 + }, + { + "epoch": 0.7267399267399267, + "grad_norm": 2.141195883147221, + "learning_rate": 3.66753789517899e-06, + "loss": 0.9424, + "step": 6944 + }, + { + "epoch": 0.7268445839874411, + "grad_norm": 2.3453010874226496, + "learning_rate": 3.6649147837207143e-06, + "loss": 0.8654, + "step": 6945 + }, + { + "epoch": 0.7269492412349555, + "grad_norm": 1.7642203173816415, + "learning_rate": 3.6622924001765725e-06, + "loss": 0.8269, + "step": 6946 + }, + { + "epoch": 0.7270538984824699, + "grad_norm": 2.2323698391066835, + "learning_rate": 3.6596707448478796e-06, + "loss": 0.9424, + "step": 6947 + }, + { + "epoch": 0.7271585557299843, + "grad_norm": 2.631836450156995, + "learning_rate": 3.6570498180358705e-06, + "loss": 0.9094, + "step": 6948 + }, + { + "epoch": 0.7272632129774987, + "grad_norm": 1.9050273254003347, + "learning_rate": 3.6544296200416994e-06, + "loss": 0.9081, + "step": 6949 + }, + { + "epoch": 0.727367870225013, + "grad_norm": 2.1745905818103606, + "learning_rate": 3.6518101511664284e-06, + "loss": 0.9148, + "step": 6950 + }, + { + "epoch": 0.7274725274725274, + "grad_norm": 2.1487595088513487, + "learning_rate": 3.6491914117110405e-06, + "loss": 0.9708, + "step": 6951 + }, + { + "epoch": 0.7275771847200418, + "grad_norm": 2.171265284624992, + "learning_rate": 3.646573401976431e-06, + "loss": 0.9458, + "step": 6952 + }, + { + "epoch": 0.7276818419675563, + "grad_norm": 2.762939860481466, + "learning_rate": 3.64395612226342e-06, + "loss": 0.9506, + "step": 6953 + }, + { + "epoch": 0.7277864992150707, + "grad_norm": 2.076483034212252, + "learning_rate": 3.641339572872735e-06, + "loss": 0.8965, + "step": 6954 + }, + { + "epoch": 0.7278911564625851, + "grad_norm": 1.8982571709599116, + "learning_rate": 3.638723754105025e-06, + "loss": 0.8545, + "step": 6955 + }, + { + "epoch": 0.7279958137100995, + "grad_norm": 1.6455039950849177, + "learning_rate": 3.636108666260847e-06, + "loss": 0.7816, + "step": 6956 + }, + { + "epoch": 0.7281004709576138, + "grad_norm": 2.1303615491352286, + "learning_rate": 3.6334943096406873e-06, + "loss": 0.967, + "step": 6957 + }, + { + "epoch": 0.7282051282051282, + "grad_norm": 2.3205676424264197, + "learning_rate": 3.630880684544934e-06, + "loss": 0.9537, + "step": 6958 + }, + { + "epoch": 0.7283097854526426, + "grad_norm": 1.9729345731294923, + "learning_rate": 3.628267791273906e-06, + "loss": 0.8789, + "step": 6959 + }, + { + "epoch": 0.728414442700157, + "grad_norm": 2.0365158246102704, + "learning_rate": 3.625655630127826e-06, + "loss": 0.8889, + "step": 6960 + }, + { + "epoch": 0.7285190999476714, + "grad_norm": 2.0572596307136086, + "learning_rate": 3.6230442014068346e-06, + "loss": 0.8673, + "step": 6961 + }, + { + "epoch": 0.7286237571951858, + "grad_norm": 2.265474450323085, + "learning_rate": 3.6204335054109897e-06, + "loss": 0.8654, + "step": 6962 + }, + { + "epoch": 0.7287284144427002, + "grad_norm": 2.036780676342137, + "learning_rate": 3.6178235424402707e-06, + "loss": 0.9066, + "step": 6963 + }, + { + "epoch": 0.7288330716902145, + "grad_norm": 2.3072772153174985, + "learning_rate": 3.6152143127945647e-06, + "loss": 0.8402, + "step": 6964 + }, + { + "epoch": 0.7289377289377289, + "grad_norm": 1.5267072530528316, + "learning_rate": 3.6126058167736742e-06, + "loss": 0.7113, + "step": 6965 + }, + { + "epoch": 0.7290423861852433, + "grad_norm": 2.312199363621423, + "learning_rate": 3.609998054677327e-06, + "loss": 0.894, + "step": 6966 + }, + { + "epoch": 0.7291470434327577, + "grad_norm": 2.1365598908923964, + "learning_rate": 3.607391026805155e-06, + "loss": 0.8484, + "step": 6967 + }, + { + "epoch": 0.7292517006802721, + "grad_norm": 1.814578926920421, + "learning_rate": 3.6047847334567153e-06, + "loss": 0.8571, + "step": 6968 + }, + { + "epoch": 0.7293563579277865, + "grad_norm": 2.5103976495037257, + "learning_rate": 3.602179174931475e-06, + "loss": 0.874, + "step": 6969 + }, + { + "epoch": 0.7294610151753009, + "grad_norm": 1.9816767917289961, + "learning_rate": 3.5995743515288163e-06, + "loss": 0.9487, + "step": 6970 + }, + { + "epoch": 0.7295656724228152, + "grad_norm": 1.6802672691339464, + "learning_rate": 3.5969702635480374e-06, + "loss": 0.9187, + "step": 6971 + }, + { + "epoch": 0.7296703296703296, + "grad_norm": 1.9087881371348294, + "learning_rate": 3.594366911288358e-06, + "loss": 0.8732, + "step": 6972 + }, + { + "epoch": 0.729774986917844, + "grad_norm": 2.097973491871284, + "learning_rate": 3.5917642950489062e-06, + "loss": 1.0095, + "step": 6973 + }, + { + "epoch": 0.7298796441653584, + "grad_norm": 2.0996759377189043, + "learning_rate": 3.5891624151287253e-06, + "loss": 0.9249, + "step": 6974 + }, + { + "epoch": 0.7299843014128728, + "grad_norm": 2.0231942048602405, + "learning_rate": 3.5865612718267807e-06, + "loss": 0.8537, + "step": 6975 + }, + { + "epoch": 0.7300889586603873, + "grad_norm": 2.2582186780087894, + "learning_rate": 3.583960865441949e-06, + "loss": 0.8968, + "step": 6976 + }, + { + "epoch": 0.7301936159079017, + "grad_norm": 2.1127814779861462, + "learning_rate": 3.5813611962730165e-06, + "loss": 0.8105, + "step": 6977 + }, + { + "epoch": 0.7302982731554161, + "grad_norm": 2.1735847999153677, + "learning_rate": 3.578762264618697e-06, + "loss": 0.9232, + "step": 6978 + }, + { + "epoch": 0.7304029304029304, + "grad_norm": 2.086077873057493, + "learning_rate": 3.5761640707776115e-06, + "loss": 0.9935, + "step": 6979 + }, + { + "epoch": 0.7305075876504448, + "grad_norm": 1.926989359894305, + "learning_rate": 3.573566615048297e-06, + "loss": 0.9609, + "step": 6980 + }, + { + "epoch": 0.7306122448979592, + "grad_norm": 1.7955588355729455, + "learning_rate": 3.570969897729202e-06, + "loss": 0.9249, + "step": 6981 + }, + { + "epoch": 0.7307169021454736, + "grad_norm": 2.0822537293283134, + "learning_rate": 3.5683739191187027e-06, + "loss": 0.92, + "step": 6982 + }, + { + "epoch": 0.730821559392988, + "grad_norm": 1.988495753359855, + "learning_rate": 3.565778679515075e-06, + "loss": 0.7726, + "step": 6983 + }, + { + "epoch": 0.7309262166405024, + "grad_norm": 2.1187421281363172, + "learning_rate": 3.5631841792165236e-06, + "loss": 0.8689, + "step": 6984 + }, + { + "epoch": 0.7310308738880168, + "grad_norm": 1.8611241031497499, + "learning_rate": 3.5605904185211582e-06, + "loss": 0.931, + "step": 6985 + }, + { + "epoch": 0.7311355311355311, + "grad_norm": 1.8366170447158119, + "learning_rate": 3.5579973977270057e-06, + "loss": 0.864, + "step": 6986 + }, + { + "epoch": 0.7312401883830455, + "grad_norm": 2.204065983976754, + "learning_rate": 3.5554051171320136e-06, + "loss": 0.9756, + "step": 6987 + }, + { + "epoch": 0.7313448456305599, + "grad_norm": 2.133144515554238, + "learning_rate": 3.552813577034039e-06, + "loss": 0.9533, + "step": 6988 + }, + { + "epoch": 0.7314495028780743, + "grad_norm": 1.9662548833887095, + "learning_rate": 3.550222777730854e-06, + "loss": 0.8933, + "step": 6989 + }, + { + "epoch": 0.7315541601255887, + "grad_norm": 1.8080125380457004, + "learning_rate": 3.5476327195201463e-06, + "loss": 0.8176, + "step": 6990 + }, + { + "epoch": 0.7316588173731031, + "grad_norm": 2.097754039027627, + "learning_rate": 3.545043402699514e-06, + "loss": 0.9289, + "step": 6991 + }, + { + "epoch": 0.7317634746206175, + "grad_norm": 1.9647814759624955, + "learning_rate": 3.542454827566486e-06, + "loss": 0.9348, + "step": 6992 + }, + { + "epoch": 0.7318681318681318, + "grad_norm": 2.183568175809098, + "learning_rate": 3.5398669944184894e-06, + "loss": 0.8125, + "step": 6993 + }, + { + "epoch": 0.7319727891156462, + "grad_norm": 2.2546413328737906, + "learning_rate": 3.5372799035528716e-06, + "loss": 0.8066, + "step": 6994 + }, + { + "epoch": 0.7320774463631606, + "grad_norm": 2.018409429501914, + "learning_rate": 3.5346935552668936e-06, + "loss": 0.876, + "step": 6995 + }, + { + "epoch": 0.732182103610675, + "grad_norm": 1.8056317532440103, + "learning_rate": 3.5321079498577292e-06, + "loss": 0.7721, + "step": 6996 + }, + { + "epoch": 0.7322867608581894, + "grad_norm": 2.3671016869226147, + "learning_rate": 3.5295230876224763e-06, + "loss": 0.9053, + "step": 6997 + }, + { + "epoch": 0.7323914181057039, + "grad_norm": 2.26537165470211, + "learning_rate": 3.526938968858139e-06, + "loss": 0.8924, + "step": 6998 + }, + { + "epoch": 0.7324960753532183, + "grad_norm": 2.261177073983615, + "learning_rate": 3.5243555938616348e-06, + "loss": 0.7816, + "step": 6999 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 2.0780543997404592, + "learning_rate": 3.521772962929798e-06, + "loss": 0.9149, + "step": 7000 + }, + { + "epoch": 0.732705389848247, + "grad_norm": 2.0900590029616897, + "learning_rate": 3.5191910763593808e-06, + "loss": 0.8636, + "step": 7001 + }, + { + "epoch": 0.7328100470957614, + "grad_norm": 2.0840433266339184, + "learning_rate": 3.5166099344470493e-06, + "loss": 0.8882, + "step": 7002 + }, + { + "epoch": 0.7329147043432758, + "grad_norm": 2.1351284570724713, + "learning_rate": 3.5140295374893797e-06, + "loss": 0.8069, + "step": 7003 + }, + { + "epoch": 0.7330193615907902, + "grad_norm": 2.104817813524145, + "learning_rate": 3.511449885782865e-06, + "loss": 0.9998, + "step": 7004 + }, + { + "epoch": 0.7331240188383046, + "grad_norm": 1.977905622536921, + "learning_rate": 3.5088709796239086e-06, + "loss": 0.8087, + "step": 7005 + }, + { + "epoch": 0.733228676085819, + "grad_norm": 1.803351442799279, + "learning_rate": 3.5062928193088385e-06, + "loss": 0.8974, + "step": 7006 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 2.1137702304154575, + "learning_rate": 3.5037154051338883e-06, + "loss": 0.9726, + "step": 7007 + }, + { + "epoch": 0.7334379905808477, + "grad_norm": 2.2621279545247024, + "learning_rate": 3.501138737395208e-06, + "loss": 0.9225, + "step": 7008 + }, + { + "epoch": 0.7335426478283621, + "grad_norm": 2.042349787240519, + "learning_rate": 3.4985628163888564e-06, + "loss": 1.043, + "step": 7009 + }, + { + "epoch": 0.7336473050758765, + "grad_norm": 2.1738234952277793, + "learning_rate": 3.4959876424108173e-06, + "loss": 0.9549, + "step": 7010 + }, + { + "epoch": 0.7337519623233909, + "grad_norm": 1.8561044157153646, + "learning_rate": 3.4934132157569866e-06, + "loss": 0.9149, + "step": 7011 + }, + { + "epoch": 0.7338566195709053, + "grad_norm": 1.8449197732704996, + "learning_rate": 3.4908395367231683e-06, + "loss": 0.7587, + "step": 7012 + }, + { + "epoch": 0.7339612768184197, + "grad_norm": 2.0696609987749945, + "learning_rate": 3.488266605605082e-06, + "loss": 0.9359, + "step": 7013 + }, + { + "epoch": 0.734065934065934, + "grad_norm": 2.138497203840195, + "learning_rate": 3.4856944226983637e-06, + "loss": 0.8327, + "step": 7014 + }, + { + "epoch": 0.7341705913134484, + "grad_norm": 1.6468331397988603, + "learning_rate": 3.4831229882985585e-06, + "loss": 0.8576, + "step": 7015 + }, + { + "epoch": 0.7342752485609628, + "grad_norm": 1.977918872042594, + "learning_rate": 3.4805523027011368e-06, + "loss": 0.9462, + "step": 7016 + }, + { + "epoch": 0.7343799058084772, + "grad_norm": 1.8654816094421645, + "learning_rate": 3.477982366201468e-06, + "loss": 0.8792, + "step": 7017 + }, + { + "epoch": 0.7344845630559916, + "grad_norm": 2.2589654681645936, + "learning_rate": 3.475413179094851e-06, + "loss": 0.9523, + "step": 7018 + }, + { + "epoch": 0.734589220303506, + "grad_norm": 1.9481397990624987, + "learning_rate": 3.4728447416764854e-06, + "loss": 0.9307, + "step": 7019 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 1.8809532054531273, + "learning_rate": 3.470277054241488e-06, + "loss": 0.8929, + "step": 7020 + }, + { + "epoch": 0.7347985347985349, + "grad_norm": 1.7678658679826986, + "learning_rate": 3.467710117084897e-06, + "loss": 0.8018, + "step": 7021 + }, + { + "epoch": 0.7349031920460491, + "grad_norm": 2.14212227669439, + "learning_rate": 3.4651439305016565e-06, + "loss": 0.8358, + "step": 7022 + }, + { + "epoch": 0.7350078492935636, + "grad_norm": 2.0802743972041218, + "learning_rate": 3.4625784947866258e-06, + "loss": 0.8696, + "step": 7023 + }, + { + "epoch": 0.735112506541078, + "grad_norm": 1.945003157814486, + "learning_rate": 3.460013810234576e-06, + "loss": 0.9213, + "step": 7024 + }, + { + "epoch": 0.7352171637885924, + "grad_norm": 2.0651692861153945, + "learning_rate": 3.4574498771401997e-06, + "loss": 0.8322, + "step": 7025 + }, + { + "epoch": 0.7353218210361068, + "grad_norm": 1.8951231983789885, + "learning_rate": 3.454886695798093e-06, + "loss": 0.9921, + "step": 7026 + }, + { + "epoch": 0.7354264782836212, + "grad_norm": 2.034145921283138, + "learning_rate": 3.452324266502777e-06, + "loss": 0.9316, + "step": 7027 + }, + { + "epoch": 0.7355311355311356, + "grad_norm": 2.632863001346807, + "learning_rate": 3.4497625895486755e-06, + "loss": 0.9863, + "step": 7028 + }, + { + "epoch": 0.7356357927786499, + "grad_norm": 1.8786479307461825, + "learning_rate": 3.4472016652301276e-06, + "loss": 0.8824, + "step": 7029 + }, + { + "epoch": 0.7357404500261643, + "grad_norm": 2.0135193900407584, + "learning_rate": 3.4446414938413964e-06, + "loss": 0.9466, + "step": 7030 + }, + { + "epoch": 0.7358451072736787, + "grad_norm": 2.2920626537087316, + "learning_rate": 3.442082075676646e-06, + "loss": 0.864, + "step": 7031 + }, + { + "epoch": 0.7359497645211931, + "grad_norm": 1.9835068207886113, + "learning_rate": 3.4395234110299593e-06, + "loss": 1.0386, + "step": 7032 + }, + { + "epoch": 0.7360544217687075, + "grad_norm": 1.8964138450221253, + "learning_rate": 3.4369655001953327e-06, + "loss": 0.8778, + "step": 7033 + }, + { + "epoch": 0.7361590790162219, + "grad_norm": 1.9000636515942269, + "learning_rate": 3.4344083434666707e-06, + "loss": 0.9043, + "step": 7034 + }, + { + "epoch": 0.7362637362637363, + "grad_norm": 1.885698255986817, + "learning_rate": 3.4318519411378006e-06, + "loss": 0.8209, + "step": 7035 + }, + { + "epoch": 0.7363683935112506, + "grad_norm": 1.9675198172784103, + "learning_rate": 3.429296293502461e-06, + "loss": 0.9245, + "step": 7036 + }, + { + "epoch": 0.736473050758765, + "grad_norm": 2.3490174091108518, + "learning_rate": 3.4267414008542967e-06, + "loss": 0.8804, + "step": 7037 + }, + { + "epoch": 0.7365777080062794, + "grad_norm": 1.981563826198973, + "learning_rate": 3.424187263486871e-06, + "loss": 0.9156, + "step": 7038 + }, + { + "epoch": 0.7366823652537938, + "grad_norm": 2.0151298287952373, + "learning_rate": 3.421633881693657e-06, + "loss": 0.8659, + "step": 7039 + }, + { + "epoch": 0.7367870225013082, + "grad_norm": 1.7892471082853696, + "learning_rate": 3.4190812557680487e-06, + "loss": 0.8883, + "step": 7040 + }, + { + "epoch": 0.7368916797488226, + "grad_norm": 2.6350578287776822, + "learning_rate": 3.4165293860033444e-06, + "loss": 0.9169, + "step": 7041 + }, + { + "epoch": 0.736996336996337, + "grad_norm": 1.9233102606504318, + "learning_rate": 3.4139782726927597e-06, + "loss": 0.8025, + "step": 7042 + }, + { + "epoch": 0.7371009942438513, + "grad_norm": 1.8506753750571818, + "learning_rate": 3.4114279161294195e-06, + "loss": 0.9393, + "step": 7043 + }, + { + "epoch": 0.7372056514913657, + "grad_norm": 1.850056642327913, + "learning_rate": 3.4088783166063686e-06, + "loss": 0.8943, + "step": 7044 + }, + { + "epoch": 0.7373103087388801, + "grad_norm": 2.153444187423901, + "learning_rate": 3.406329474416563e-06, + "loss": 1.0154, + "step": 7045 + }, + { + "epoch": 0.7374149659863946, + "grad_norm": 2.0167001869694383, + "learning_rate": 3.4037813898528682e-06, + "loss": 0.8576, + "step": 7046 + }, + { + "epoch": 0.737519623233909, + "grad_norm": 2.065120817248703, + "learning_rate": 3.401234063208064e-06, + "loss": 0.839, + "step": 7047 + }, + { + "epoch": 0.7376242804814234, + "grad_norm": 2.066946242767024, + "learning_rate": 3.3986874947748428e-06, + "loss": 0.9666, + "step": 7048 + }, + { + "epoch": 0.7377289377289378, + "grad_norm": 1.8889031726518197, + "learning_rate": 3.3961416848458073e-06, + "loss": 0.8222, + "step": 7049 + }, + { + "epoch": 0.7378335949764521, + "grad_norm": 1.9632185729176141, + "learning_rate": 3.393596633713483e-06, + "loss": 0.8447, + "step": 7050 + }, + { + "epoch": 0.7379382522239665, + "grad_norm": 2.0510910347919413, + "learning_rate": 3.3910523416702977e-06, + "loss": 1.0462, + "step": 7051 + }, + { + "epoch": 0.7380429094714809, + "grad_norm": 2.12388351552657, + "learning_rate": 3.388508809008594e-06, + "loss": 0.8637, + "step": 7052 + }, + { + "epoch": 0.7381475667189953, + "grad_norm": 1.8703899844326586, + "learning_rate": 3.3859660360206336e-06, + "loss": 0.8867, + "step": 7053 + }, + { + "epoch": 0.7382522239665097, + "grad_norm": 2.022816664688723, + "learning_rate": 3.383424022998579e-06, + "loss": 0.8605, + "step": 7054 + }, + { + "epoch": 0.7383568812140241, + "grad_norm": 2.1134498670646216, + "learning_rate": 3.3808827702345227e-06, + "loss": 0.8655, + "step": 7055 + }, + { + "epoch": 0.7384615384615385, + "grad_norm": 1.9780213401794018, + "learning_rate": 3.3783422780204535e-06, + "loss": 0.9235, + "step": 7056 + }, + { + "epoch": 0.7385661957090528, + "grad_norm": 1.9464458644225906, + "learning_rate": 3.37580254664828e-06, + "loss": 0.9951, + "step": 7057 + }, + { + "epoch": 0.7386708529565672, + "grad_norm": 1.8745039327238657, + "learning_rate": 3.37326357640982e-06, + "loss": 0.7171, + "step": 7058 + }, + { + "epoch": 0.7387755102040816, + "grad_norm": 2.037899312393001, + "learning_rate": 3.370725367596811e-06, + "loss": 0.9593, + "step": 7059 + }, + { + "epoch": 0.738880167451596, + "grad_norm": 2.1099882075886844, + "learning_rate": 3.368187920500897e-06, + "loss": 0.9113, + "step": 7060 + }, + { + "epoch": 0.7389848246991104, + "grad_norm": 2.6349773976618667, + "learning_rate": 3.3656512354136317e-06, + "loss": 0.7943, + "step": 7061 + }, + { + "epoch": 0.7390894819466248, + "grad_norm": 2.262938094716051, + "learning_rate": 3.3631153126264915e-06, + "loss": 0.8173, + "step": 7062 + }, + { + "epoch": 0.7391941391941392, + "grad_norm": 1.9162832453637948, + "learning_rate": 3.3605801524308535e-06, + "loss": 0.7447, + "step": 7063 + }, + { + "epoch": 0.7392987964416536, + "grad_norm": 2.4566628443132372, + "learning_rate": 3.358045755118019e-06, + "loss": 0.7898, + "step": 7064 + }, + { + "epoch": 0.7394034536891679, + "grad_norm": 2.1132700561524316, + "learning_rate": 3.3555121209791906e-06, + "loss": 0.8397, + "step": 7065 + }, + { + "epoch": 0.7395081109366823, + "grad_norm": 2.6164728972304645, + "learning_rate": 3.3529792503054903e-06, + "loss": 0.9608, + "step": 7066 + }, + { + "epoch": 0.7396127681841967, + "grad_norm": 2.3074332053564137, + "learning_rate": 3.3504471433879493e-06, + "loss": 0.9128, + "step": 7067 + }, + { + "epoch": 0.7397174254317112, + "grad_norm": 2.0330030779609234, + "learning_rate": 3.3479158005175082e-06, + "loss": 0.9589, + "step": 7068 + }, + { + "epoch": 0.7398220826792256, + "grad_norm": 2.120921500126816, + "learning_rate": 3.345385221985026e-06, + "loss": 0.9727, + "step": 7069 + }, + { + "epoch": 0.73992673992674, + "grad_norm": 2.6676041899583107, + "learning_rate": 3.342855408081276e-06, + "loss": 0.912, + "step": 7070 + }, + { + "epoch": 0.7400313971742544, + "grad_norm": 1.9000272361188917, + "learning_rate": 3.340326359096935e-06, + "loss": 0.7653, + "step": 7071 + }, + { + "epoch": 0.7401360544217687, + "grad_norm": 2.1277226935445817, + "learning_rate": 3.337798075322596e-06, + "loss": 0.835, + "step": 7072 + }, + { + "epoch": 0.7402407116692831, + "grad_norm": 2.1217486595239565, + "learning_rate": 3.3352705570487598e-06, + "loss": 0.9353, + "step": 7073 + }, + { + "epoch": 0.7403453689167975, + "grad_norm": 2.099696714940113, + "learning_rate": 3.332743804565851e-06, + "loss": 0.9437, + "step": 7074 + }, + { + "epoch": 0.7404500261643119, + "grad_norm": 2.310679915225124, + "learning_rate": 3.330217818164195e-06, + "loss": 0.8943, + "step": 7075 + }, + { + "epoch": 0.7405546834118263, + "grad_norm": 1.9472970296459848, + "learning_rate": 3.3276925981340336e-06, + "loss": 0.8915, + "step": 7076 + }, + { + "epoch": 0.7406593406593407, + "grad_norm": 2.3805641441348975, + "learning_rate": 3.325168144765515e-06, + "loss": 0.9253, + "step": 7077 + }, + { + "epoch": 0.7407639979068551, + "grad_norm": 1.9193799868580086, + "learning_rate": 3.3226444583487085e-06, + "loss": 1.0081, + "step": 7078 + }, + { + "epoch": 0.7408686551543694, + "grad_norm": 2.2795516135400287, + "learning_rate": 3.320121539173592e-06, + "loss": 0.8681, + "step": 7079 + }, + { + "epoch": 0.7409733124018838, + "grad_norm": 1.9038999366774318, + "learning_rate": 3.3175993875300527e-06, + "loss": 0.8152, + "step": 7080 + }, + { + "epoch": 0.7410779696493982, + "grad_norm": 2.209622572681599, + "learning_rate": 3.315078003707891e-06, + "loss": 1.0209, + "step": 7081 + }, + { + "epoch": 0.7411826268969126, + "grad_norm": 2.3636374036749004, + "learning_rate": 3.3125573879968154e-06, + "loss": 0.8659, + "step": 7082 + }, + { + "epoch": 0.741287284144427, + "grad_norm": 2.125287520788546, + "learning_rate": 3.310037540686455e-06, + "loss": 1.0528, + "step": 7083 + }, + { + "epoch": 0.7413919413919414, + "grad_norm": 2.015778138651614, + "learning_rate": 3.307518462066344e-06, + "loss": 0.8942, + "step": 7084 + }, + { + "epoch": 0.7414965986394558, + "grad_norm": 2.4686495141026987, + "learning_rate": 3.305000152425928e-06, + "loss": 0.8532, + "step": 7085 + }, + { + "epoch": 0.7416012558869701, + "grad_norm": 1.9474568598951196, + "learning_rate": 3.3024826120545673e-06, + "loss": 0.9053, + "step": 7086 + }, + { + "epoch": 0.7417059131344845, + "grad_norm": 1.6828735085636821, + "learning_rate": 3.299965841241525e-06, + "loss": 0.8393, + "step": 7087 + }, + { + "epoch": 0.7418105703819989, + "grad_norm": 2.081597291218235, + "learning_rate": 3.297449840275996e-06, + "loss": 0.9671, + "step": 7088 + }, + { + "epoch": 0.7419152276295133, + "grad_norm": 1.6821107578784833, + "learning_rate": 3.294934609447068e-06, + "loss": 0.7459, + "step": 7089 + }, + { + "epoch": 0.7420198848770277, + "grad_norm": 2.2858216269881684, + "learning_rate": 3.292420149043747e-06, + "loss": 0.9515, + "step": 7090 + }, + { + "epoch": 0.7421245421245422, + "grad_norm": 2.1396541624839793, + "learning_rate": 3.2899064593549477e-06, + "loss": 0.9349, + "step": 7091 + }, + { + "epoch": 0.7422291993720566, + "grad_norm": 2.172610516131483, + "learning_rate": 3.2873935406694956e-06, + "loss": 0.9582, + "step": 7092 + }, + { + "epoch": 0.7423338566195709, + "grad_norm": 2.1072743883866374, + "learning_rate": 3.284881393276137e-06, + "loss": 0.9638, + "step": 7093 + }, + { + "epoch": 0.7424385138670853, + "grad_norm": 2.30997826595737, + "learning_rate": 3.2823700174635185e-06, + "loss": 0.9079, + "step": 7094 + }, + { + "epoch": 0.7425431711145997, + "grad_norm": 2.079680297310184, + "learning_rate": 3.2798594135202012e-06, + "loss": 0.9564, + "step": 7095 + }, + { + "epoch": 0.7426478283621141, + "grad_norm": 2.2079477477185763, + "learning_rate": 3.2773495817346636e-06, + "loss": 0.8085, + "step": 7096 + }, + { + "epoch": 0.7427524856096285, + "grad_norm": 1.9783748322647048, + "learning_rate": 3.274840522395283e-06, + "loss": 0.9652, + "step": 7097 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 2.013884564094195, + "learning_rate": 3.272332235790363e-06, + "loss": 0.8247, + "step": 7098 + }, + { + "epoch": 0.7429618001046573, + "grad_norm": 1.8380568856422557, + "learning_rate": 3.269824722208108e-06, + "loss": 0.7911, + "step": 7099 + }, + { + "epoch": 0.7430664573521716, + "grad_norm": 1.9699143563198767, + "learning_rate": 3.2673179819366363e-06, + "loss": 0.917, + "step": 7100 + }, + { + "epoch": 0.743171114599686, + "grad_norm": 1.8324074187595984, + "learning_rate": 3.264812015263973e-06, + "loss": 0.8143, + "step": 7101 + }, + { + "epoch": 0.7432757718472004, + "grad_norm": 2.0807751148157196, + "learning_rate": 3.2623068224780663e-06, + "loss": 0.9625, + "step": 7102 + }, + { + "epoch": 0.7433804290947148, + "grad_norm": 2.0925637950807556, + "learning_rate": 3.2598024038667655e-06, + "loss": 0.8431, + "step": 7103 + }, + { + "epoch": 0.7434850863422292, + "grad_norm": 2.3755126784483402, + "learning_rate": 3.2572987597178274e-06, + "loss": 0.8246, + "step": 7104 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 1.9385820118174737, + "learning_rate": 3.254795890318935e-06, + "loss": 0.8357, + "step": 7105 + }, + { + "epoch": 0.743694400837258, + "grad_norm": 1.9631332561496653, + "learning_rate": 3.2522937959576684e-06, + "loss": 0.9545, + "step": 7106 + }, + { + "epoch": 0.7437990580847724, + "grad_norm": 1.9072432029408546, + "learning_rate": 3.2497924769215206e-06, + "loss": 0.9965, + "step": 7107 + }, + { + "epoch": 0.7439037153322867, + "grad_norm": 2.3417750032717493, + "learning_rate": 3.2472919334979034e-06, + "loss": 1.0051, + "step": 7108 + }, + { + "epoch": 0.7440083725798011, + "grad_norm": 2.00729117210432, + "learning_rate": 3.2447921659741333e-06, + "loss": 0.9051, + "step": 7109 + }, + { + "epoch": 0.7441130298273155, + "grad_norm": 1.8562956071816943, + "learning_rate": 3.2422931746374375e-06, + "loss": 0.8059, + "step": 7110 + }, + { + "epoch": 0.7442176870748299, + "grad_norm": 2.5424170068230647, + "learning_rate": 3.2397949597749525e-06, + "loss": 0.767, + "step": 7111 + }, + { + "epoch": 0.7443223443223443, + "grad_norm": 2.2671400535027204, + "learning_rate": 3.2372975216737335e-06, + "loss": 0.9178, + "step": 7112 + }, + { + "epoch": 0.7444270015698587, + "grad_norm": 2.3361550774786988, + "learning_rate": 3.234800860620736e-06, + "loss": 0.7264, + "step": 7113 + }, + { + "epoch": 0.7445316588173732, + "grad_norm": 1.9859089354809334, + "learning_rate": 3.232304976902837e-06, + "loss": 0.9158, + "step": 7114 + }, + { + "epoch": 0.7446363160648874, + "grad_norm": 1.7913360410024908, + "learning_rate": 3.229809870806815e-06, + "loss": 0.8756, + "step": 7115 + }, + { + "epoch": 0.7447409733124019, + "grad_norm": 2.122253887975913, + "learning_rate": 3.2273155426193613e-06, + "loss": 0.6974, + "step": 7116 + }, + { + "epoch": 0.7448456305599163, + "grad_norm": 1.790135145026046, + "learning_rate": 3.224821992627084e-06, + "loss": 0.8797, + "step": 7117 + }, + { + "epoch": 0.7449502878074307, + "grad_norm": 2.0103996789911998, + "learning_rate": 3.2223292211164946e-06, + "loss": 0.7686, + "step": 7118 + }, + { + "epoch": 0.7450549450549451, + "grad_norm": 2.3239725196592413, + "learning_rate": 3.2198372283740176e-06, + "loss": 0.8821, + "step": 7119 + }, + { + "epoch": 0.7451596023024595, + "grad_norm": 2.4439097088179667, + "learning_rate": 3.217346014685985e-06, + "loss": 0.8314, + "step": 7120 + }, + { + "epoch": 0.7452642595499739, + "grad_norm": 1.8200085048128307, + "learning_rate": 3.214855580338644e-06, + "loss": 0.8329, + "step": 7121 + }, + { + "epoch": 0.7453689167974882, + "grad_norm": 1.8651829102105335, + "learning_rate": 3.212365925618156e-06, + "loss": 0.8808, + "step": 7122 + }, + { + "epoch": 0.7454735740450026, + "grad_norm": 1.954259108086037, + "learning_rate": 3.2098770508105825e-06, + "loss": 0.9463, + "step": 7123 + }, + { + "epoch": 0.745578231292517, + "grad_norm": 2.026272455843576, + "learning_rate": 3.207388956201901e-06, + "loss": 0.8874, + "step": 7124 + }, + { + "epoch": 0.7456828885400314, + "grad_norm": 1.9861545009241273, + "learning_rate": 3.204901642077999e-06, + "loss": 0.836, + "step": 7125 + }, + { + "epoch": 0.7457875457875458, + "grad_norm": 2.1476725693471126, + "learning_rate": 3.2024151087246704e-06, + "loss": 0.9005, + "step": 7126 + }, + { + "epoch": 0.7458922030350602, + "grad_norm": 2.3373997568981313, + "learning_rate": 3.199929356427628e-06, + "loss": 0.8782, + "step": 7127 + }, + { + "epoch": 0.7459968602825746, + "grad_norm": 2.0228477518559522, + "learning_rate": 3.197444385472489e-06, + "loss": 0.8036, + "step": 7128 + }, + { + "epoch": 0.7461015175300889, + "grad_norm": 1.728215649932167, + "learning_rate": 3.1949601961447795e-06, + "loss": 0.7443, + "step": 7129 + }, + { + "epoch": 0.7462061747776033, + "grad_norm": 2.1796559194931606, + "learning_rate": 3.1924767887299357e-06, + "loss": 0.8856, + "step": 7130 + }, + { + "epoch": 0.7463108320251177, + "grad_norm": 2.171160116743099, + "learning_rate": 3.1899941635133092e-06, + "loss": 0.8125, + "step": 7131 + }, + { + "epoch": 0.7464154892726321, + "grad_norm": 1.688400395699494, + "learning_rate": 3.1875123207801616e-06, + "loss": 0.7771, + "step": 7132 + }, + { + "epoch": 0.7465201465201465, + "grad_norm": 2.6231117402138593, + "learning_rate": 3.1850312608156596e-06, + "loss": 0.9425, + "step": 7133 + }, + { + "epoch": 0.7466248037676609, + "grad_norm": 1.8438824122826507, + "learning_rate": 3.1825509839048806e-06, + "loss": 0.788, + "step": 7134 + }, + { + "epoch": 0.7467294610151753, + "grad_norm": 2.2251217003891828, + "learning_rate": 3.1800714903328102e-06, + "loss": 0.9457, + "step": 7135 + }, + { + "epoch": 0.7468341182626896, + "grad_norm": 2.1306082735607808, + "learning_rate": 3.1775927803843543e-06, + "loss": 0.9576, + "step": 7136 + }, + { + "epoch": 0.746938775510204, + "grad_norm": 2.1790105073105757, + "learning_rate": 3.1751148543443187e-06, + "loss": 0.8739, + "step": 7137 + }, + { + "epoch": 0.7470434327577185, + "grad_norm": 2.239558304766855, + "learning_rate": 3.1726377124974217e-06, + "loss": 0.8207, + "step": 7138 + }, + { + "epoch": 0.7471480900052329, + "grad_norm": 2.008095181346031, + "learning_rate": 3.1701613551282893e-06, + "loss": 0.867, + "step": 7139 + }, + { + "epoch": 0.7472527472527473, + "grad_norm": 2.3260833446259777, + "learning_rate": 3.1676857825214623e-06, + "loss": 0.8897, + "step": 7140 + }, + { + "epoch": 0.7473574045002617, + "grad_norm": 2.070498373715419, + "learning_rate": 3.165210994961393e-06, + "loss": 0.8824, + "step": 7141 + }, + { + "epoch": 0.7474620617477761, + "grad_norm": 2.0573703279887807, + "learning_rate": 3.162736992732436e-06, + "loss": 0.8889, + "step": 7142 + }, + { + "epoch": 0.7475667189952904, + "grad_norm": 2.3464698580212304, + "learning_rate": 3.160263776118858e-06, + "loss": 0.9127, + "step": 7143 + }, + { + "epoch": 0.7476713762428048, + "grad_norm": 2.314229707278681, + "learning_rate": 3.1577913454048393e-06, + "loss": 0.9432, + "step": 7144 + }, + { + "epoch": 0.7477760334903192, + "grad_norm": 2.2903886734227856, + "learning_rate": 3.1553197008744607e-06, + "loss": 0.9033, + "step": 7145 + }, + { + "epoch": 0.7478806907378336, + "grad_norm": 2.26707032536701, + "learning_rate": 3.1528488428117287e-06, + "loss": 0.8826, + "step": 7146 + }, + { + "epoch": 0.747985347985348, + "grad_norm": 2.201793259018066, + "learning_rate": 3.150378771500542e-06, + "loss": 0.8848, + "step": 7147 + }, + { + "epoch": 0.7480900052328624, + "grad_norm": 2.153815935771742, + "learning_rate": 3.147909487224723e-06, + "loss": 1.0046, + "step": 7148 + }, + { + "epoch": 0.7481946624803768, + "grad_norm": 2.23656638946847, + "learning_rate": 3.145440990267994e-06, + "loss": 0.8385, + "step": 7149 + }, + { + "epoch": 0.7482993197278912, + "grad_norm": 1.9518763135706487, + "learning_rate": 3.142973280913988e-06, + "loss": 0.8302, + "step": 7150 + }, + { + "epoch": 0.7484039769754055, + "grad_norm": 1.9560563182373518, + "learning_rate": 3.140506359446256e-06, + "loss": 0.8629, + "step": 7151 + }, + { + "epoch": 0.7485086342229199, + "grad_norm": 1.827294647023681, + "learning_rate": 3.138040226148249e-06, + "loss": 0.8265, + "step": 7152 + }, + { + "epoch": 0.7486132914704343, + "grad_norm": 2.1222980400949245, + "learning_rate": 3.1355748813033305e-06, + "loss": 0.8273, + "step": 7153 + }, + { + "epoch": 0.7487179487179487, + "grad_norm": 2.1538492848488855, + "learning_rate": 3.1331103251947703e-06, + "loss": 0.8809, + "step": 7154 + }, + { + "epoch": 0.7488226059654631, + "grad_norm": 1.9153204561792283, + "learning_rate": 3.130646558105758e-06, + "loss": 0.8623, + "step": 7155 + }, + { + "epoch": 0.7489272632129775, + "grad_norm": 2.146577046698544, + "learning_rate": 3.128183580319378e-06, + "loss": 0.9102, + "step": 7156 + }, + { + "epoch": 0.7490319204604919, + "grad_norm": 2.2253981483892598, + "learning_rate": 3.125721392118639e-06, + "loss": 0.8395, + "step": 7157 + }, + { + "epoch": 0.7491365777080062, + "grad_norm": 2.252424457310398, + "learning_rate": 3.1232599937864483e-06, + "loss": 1.0134, + "step": 7158 + }, + { + "epoch": 0.7492412349555206, + "grad_norm": 2.160152684893339, + "learning_rate": 3.1207993856056205e-06, + "loss": 0.9069, + "step": 7159 + }, + { + "epoch": 0.749345892203035, + "grad_norm": 2.163254492760139, + "learning_rate": 3.118339567858892e-06, + "loss": 0.8737, + "step": 7160 + }, + { + "epoch": 0.7494505494505495, + "grad_norm": 1.6978836150995644, + "learning_rate": 3.1158805408288995e-06, + "loss": 0.8209, + "step": 7161 + }, + { + "epoch": 0.7495552066980639, + "grad_norm": 1.9948469938453408, + "learning_rate": 3.1134223047981872e-06, + "loss": 0.947, + "step": 7162 + }, + { + "epoch": 0.7496598639455783, + "grad_norm": 2.261922765416398, + "learning_rate": 3.1109648600492126e-06, + "loss": 0.8666, + "step": 7163 + }, + { + "epoch": 0.7497645211930927, + "grad_norm": 2.004553104391324, + "learning_rate": 3.108508206864338e-06, + "loss": 0.8072, + "step": 7164 + }, + { + "epoch": 0.749869178440607, + "grad_norm": 1.9717258623857916, + "learning_rate": 3.1060523455258406e-06, + "loss": 0.8721, + "step": 7165 + }, + { + "epoch": 0.7499738356881214, + "grad_norm": 2.1108494565639857, + "learning_rate": 3.1035972763159074e-06, + "loss": 0.8608, + "step": 7166 + }, + { + "epoch": 0.7500784929356358, + "grad_norm": 2.0578079658481423, + "learning_rate": 3.1011429995166288e-06, + "loss": 0.8963, + "step": 7167 + }, + { + "epoch": 0.7501831501831502, + "grad_norm": 1.983828357087619, + "learning_rate": 3.098689515410004e-06, + "loss": 0.899, + "step": 7168 + }, + { + "epoch": 0.7502878074306646, + "grad_norm": 2.2998223358846333, + "learning_rate": 3.0962368242779406e-06, + "loss": 0.8537, + "step": 7169 + }, + { + "epoch": 0.750392464678179, + "grad_norm": 1.8820805197314077, + "learning_rate": 3.0937849264022658e-06, + "loss": 0.8407, + "step": 7170 + }, + { + "epoch": 0.7504971219256934, + "grad_norm": 2.177356874448031, + "learning_rate": 3.0913338220647028e-06, + "loss": 0.9178, + "step": 7171 + }, + { + "epoch": 0.7506017791732077, + "grad_norm": 2.198171422534131, + "learning_rate": 3.0888835115468883e-06, + "loss": 0.9125, + "step": 7172 + }, + { + "epoch": 0.7507064364207221, + "grad_norm": 2.031436579163277, + "learning_rate": 3.086433995130367e-06, + "loss": 0.8067, + "step": 7173 + }, + { + "epoch": 0.7508110936682365, + "grad_norm": 1.8475737903440372, + "learning_rate": 3.0839852730965934e-06, + "loss": 0.8454, + "step": 7174 + }, + { + "epoch": 0.7509157509157509, + "grad_norm": 2.25678437649617, + "learning_rate": 3.081537345726936e-06, + "loss": 0.839, + "step": 7175 + }, + { + "epoch": 0.7510204081632653, + "grad_norm": 1.9401602491295404, + "learning_rate": 3.0790902133026625e-06, + "loss": 0.8353, + "step": 7176 + }, + { + "epoch": 0.7511250654107797, + "grad_norm": 2.0825066629501694, + "learning_rate": 3.0766438761049544e-06, + "loss": 0.8053, + "step": 7177 + }, + { + "epoch": 0.7512297226582941, + "grad_norm": 2.451954510004089, + "learning_rate": 3.074198334414896e-06, + "loss": 0.851, + "step": 7178 + }, + { + "epoch": 0.7513343799058084, + "grad_norm": 2.048655055859803, + "learning_rate": 3.071753588513493e-06, + "loss": 0.8167, + "step": 7179 + }, + { + "epoch": 0.7514390371533228, + "grad_norm": 2.177508435886103, + "learning_rate": 3.069309638681647e-06, + "loss": 0.9355, + "step": 7180 + }, + { + "epoch": 0.7515436944008372, + "grad_norm": 2.144395053127334, + "learning_rate": 3.066866485200174e-06, + "loss": 1.0297, + "step": 7181 + }, + { + "epoch": 0.7516483516483516, + "grad_norm": 1.8941507496621728, + "learning_rate": 3.0644241283497934e-06, + "loss": 0.7487, + "step": 7182 + }, + { + "epoch": 0.751753008895866, + "grad_norm": 2.03238588774476, + "learning_rate": 3.0619825684111425e-06, + "loss": 0.9322, + "step": 7183 + }, + { + "epoch": 0.7518576661433805, + "grad_norm": 2.232382445814665, + "learning_rate": 3.0595418056647574e-06, + "loss": 1.0348, + "step": 7184 + }, + { + "epoch": 0.7519623233908949, + "grad_norm": 2.027939873344001, + "learning_rate": 3.0571018403910914e-06, + "loss": 0.8486, + "step": 7185 + }, + { + "epoch": 0.7520669806384092, + "grad_norm": 1.7087518006551394, + "learning_rate": 3.0546626728704986e-06, + "loss": 0.7637, + "step": 7186 + }, + { + "epoch": 0.7521716378859236, + "grad_norm": 2.090097485492809, + "learning_rate": 3.0522243033832455e-06, + "loss": 0.8497, + "step": 7187 + }, + { + "epoch": 0.752276295133438, + "grad_norm": 1.9895769202770441, + "learning_rate": 3.0497867322094998e-06, + "loss": 0.8822, + "step": 7188 + }, + { + "epoch": 0.7523809523809524, + "grad_norm": 2.40584150601798, + "learning_rate": 3.047349959629352e-06, + "loss": 0.9995, + "step": 7189 + }, + { + "epoch": 0.7524856096284668, + "grad_norm": 2.0368993311848977, + "learning_rate": 3.0449139859227883e-06, + "loss": 0.8247, + "step": 7190 + }, + { + "epoch": 0.7525902668759812, + "grad_norm": 2.056257702126359, + "learning_rate": 3.0424788113697036e-06, + "loss": 0.9778, + "step": 7191 + }, + { + "epoch": 0.7526949241234956, + "grad_norm": 1.9693791719687486, + "learning_rate": 3.0400444362499115e-06, + "loss": 0.9017, + "step": 7192 + }, + { + "epoch": 0.75279958137101, + "grad_norm": 2.0591702890964654, + "learning_rate": 3.0376108608431188e-06, + "loss": 0.9149, + "step": 7193 + }, + { + "epoch": 0.7529042386185243, + "grad_norm": 2.0700766255920944, + "learning_rate": 3.035178085428957e-06, + "loss": 0.8404, + "step": 7194 + }, + { + "epoch": 0.7530088958660387, + "grad_norm": 1.8951621768222429, + "learning_rate": 3.0327461102869514e-06, + "loss": 0.8727, + "step": 7195 + }, + { + "epoch": 0.7531135531135531, + "grad_norm": 1.760732012119775, + "learning_rate": 3.0303149356965424e-06, + "loss": 0.7707, + "step": 7196 + }, + { + "epoch": 0.7532182103610675, + "grad_norm": 2.0503813015447903, + "learning_rate": 3.027884561937072e-06, + "loss": 0.9688, + "step": 7197 + }, + { + "epoch": 0.7533228676085819, + "grad_norm": 2.0021170411447766, + "learning_rate": 3.0254549892878038e-06, + "loss": 0.922, + "step": 7198 + }, + { + "epoch": 0.7534275248560963, + "grad_norm": 1.8860849552256713, + "learning_rate": 3.0230262180278925e-06, + "loss": 0.9059, + "step": 7199 + }, + { + "epoch": 0.7535321821036107, + "grad_norm": 2.1818774407352404, + "learning_rate": 3.020598248436415e-06, + "loss": 0.9216, + "step": 7200 + }, + { + "epoch": 0.753636839351125, + "grad_norm": 1.8584792324138004, + "learning_rate": 3.0181710807923492e-06, + "loss": 0.7917, + "step": 7201 + }, + { + "epoch": 0.7537414965986394, + "grad_norm": 1.819519299711091, + "learning_rate": 3.015744715374579e-06, + "loss": 0.8911, + "step": 7202 + }, + { + "epoch": 0.7538461538461538, + "grad_norm": 2.164007510379012, + "learning_rate": 3.0133191524618956e-06, + "loss": 0.9111, + "step": 7203 + }, + { + "epoch": 0.7539508110936682, + "grad_norm": 1.8081808504852512, + "learning_rate": 3.010894392333009e-06, + "loss": 1.0013, + "step": 7204 + }, + { + "epoch": 0.7540554683411826, + "grad_norm": 1.9661369597242528, + "learning_rate": 3.008470435266525e-06, + "loss": 0.9094, + "step": 7205 + }, + { + "epoch": 0.754160125588697, + "grad_norm": 2.1709315190330494, + "learning_rate": 3.0060472815409614e-06, + "loss": 0.856, + "step": 7206 + }, + { + "epoch": 0.7542647828362115, + "grad_norm": 2.077801523128462, + "learning_rate": 3.00362493143474e-06, + "loss": 0.8178, + "step": 7207 + }, + { + "epoch": 0.7543694400837258, + "grad_norm": 2.0343924494049737, + "learning_rate": 3.001203385226198e-06, + "loss": 0.9672, + "step": 7208 + }, + { + "epoch": 0.7544740973312402, + "grad_norm": 2.612125199657769, + "learning_rate": 2.9987826431935773e-06, + "loss": 0.7652, + "step": 7209 + }, + { + "epoch": 0.7545787545787546, + "grad_norm": 1.9881607065905555, + "learning_rate": 2.9963627056150234e-06, + "loss": 0.8869, + "step": 7210 + }, + { + "epoch": 0.754683411826269, + "grad_norm": 2.2653350892421775, + "learning_rate": 2.993943572768594e-06, + "loss": 1.0044, + "step": 7211 + }, + { + "epoch": 0.7547880690737834, + "grad_norm": 1.9442254836980624, + "learning_rate": 2.9915252449322463e-06, + "loss": 0.8595, + "step": 7212 + }, + { + "epoch": 0.7548927263212978, + "grad_norm": 1.7828004495021605, + "learning_rate": 2.9891077223838594e-06, + "loss": 0.7383, + "step": 7213 + }, + { + "epoch": 0.7549973835688122, + "grad_norm": 2.1375811309701143, + "learning_rate": 2.9866910054012078e-06, + "loss": 0.8917, + "step": 7214 + }, + { + "epoch": 0.7551020408163265, + "grad_norm": 1.8434137581998618, + "learning_rate": 2.9842750942619767e-06, + "loss": 0.8375, + "step": 7215 + }, + { + "epoch": 0.7552066980638409, + "grad_norm": 1.8544288019845543, + "learning_rate": 2.9818599892437572e-06, + "loss": 0.8359, + "step": 7216 + }, + { + "epoch": 0.7553113553113553, + "grad_norm": 2.190544453353631, + "learning_rate": 2.979445690624051e-06, + "loss": 0.9268, + "step": 7217 + }, + { + "epoch": 0.7554160125588697, + "grad_norm": 1.6724087178652072, + "learning_rate": 2.977032198680272e-06, + "loss": 0.788, + "step": 7218 + }, + { + "epoch": 0.7555206698063841, + "grad_norm": 2.444110327040675, + "learning_rate": 2.9746195136897294e-06, + "loss": 1.0125, + "step": 7219 + }, + { + "epoch": 0.7556253270538985, + "grad_norm": 2.395682775915913, + "learning_rate": 2.9722076359296457e-06, + "loss": 0.9869, + "step": 7220 + }, + { + "epoch": 0.7557299843014129, + "grad_norm": 2.1788752080998837, + "learning_rate": 2.9697965656771534e-06, + "loss": 0.8759, + "step": 7221 + }, + { + "epoch": 0.7558346415489272, + "grad_norm": 2.3785762750866484, + "learning_rate": 2.967386303209283e-06, + "loss": 0.9297, + "step": 7222 + }, + { + "epoch": 0.7559392987964416, + "grad_norm": 1.9232465597929709, + "learning_rate": 2.9649768488029862e-06, + "loss": 0.867, + "step": 7223 + }, + { + "epoch": 0.756043956043956, + "grad_norm": 1.9326752300709829, + "learning_rate": 2.962568202735111e-06, + "loss": 0.8029, + "step": 7224 + }, + { + "epoch": 0.7561486132914704, + "grad_norm": 2.127211853559956, + "learning_rate": 2.960160365282413e-06, + "loss": 0.855, + "step": 7225 + }, + { + "epoch": 0.7562532705389848, + "grad_norm": 2.268097028045051, + "learning_rate": 2.957753336721563e-06, + "loss": 0.9204, + "step": 7226 + }, + { + "epoch": 0.7563579277864992, + "grad_norm": 2.0703952112572157, + "learning_rate": 2.955347117329127e-06, + "loss": 0.9523, + "step": 7227 + }, + { + "epoch": 0.7564625850340136, + "grad_norm": 2.6447867664491347, + "learning_rate": 2.9529417073815925e-06, + "loss": 0.8857, + "step": 7228 + }, + { + "epoch": 0.7565672422815279, + "grad_norm": 2.5816773745873927, + "learning_rate": 2.950537107155341e-06, + "loss": 0.9542, + "step": 7229 + }, + { + "epoch": 0.7566718995290423, + "grad_norm": 2.16016794368315, + "learning_rate": 2.9481333169266667e-06, + "loss": 0.9248, + "step": 7230 + }, + { + "epoch": 0.7567765567765568, + "grad_norm": 2.0925825053657405, + "learning_rate": 2.945730336971767e-06, + "loss": 0.8558, + "step": 7231 + }, + { + "epoch": 0.7568812140240712, + "grad_norm": 2.0853760218196413, + "learning_rate": 2.9433281675667545e-06, + "loss": 0.9312, + "step": 7232 + }, + { + "epoch": 0.7569858712715856, + "grad_norm": 2.2418754596740755, + "learning_rate": 2.940926808987642e-06, + "loss": 0.7494, + "step": 7233 + }, + { + "epoch": 0.7570905285191, + "grad_norm": 1.958769862807278, + "learning_rate": 2.938526261510346e-06, + "loss": 0.9602, + "step": 7234 + }, + { + "epoch": 0.7571951857666144, + "grad_norm": 1.9315836818382839, + "learning_rate": 2.9361265254106997e-06, + "loss": 0.9757, + "step": 7235 + }, + { + "epoch": 0.7572998430141288, + "grad_norm": 2.2173927927463297, + "learning_rate": 2.933727600964433e-06, + "loss": 0.8727, + "step": 7236 + }, + { + "epoch": 0.7574045002616431, + "grad_norm": 1.7683552818088055, + "learning_rate": 2.9313294884471945e-06, + "loss": 0.7764, + "step": 7237 + }, + { + "epoch": 0.7575091575091575, + "grad_norm": 2.042604009056469, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.8552, + "step": 7238 + }, + { + "epoch": 0.7576138147566719, + "grad_norm": 1.963283487176052, + "learning_rate": 2.9265357003018836e-06, + "loss": 0.8042, + "step": 7239 + }, + { + "epoch": 0.7577184720041863, + "grad_norm": 2.181411025825987, + "learning_rate": 2.924140025224629e-06, + "loss": 0.9476, + "step": 7240 + }, + { + "epoch": 0.7578231292517007, + "grad_norm": 1.7246146896003263, + "learning_rate": 2.921745163178026e-06, + "loss": 0.7424, + "step": 7241 + }, + { + "epoch": 0.7579277864992151, + "grad_norm": 1.679941753555536, + "learning_rate": 2.9193511144372565e-06, + "loss": 0.8579, + "step": 7242 + }, + { + "epoch": 0.7580324437467295, + "grad_norm": 2.166472002968409, + "learning_rate": 2.9169578792773944e-06, + "loss": 0.9046, + "step": 7243 + }, + { + "epoch": 0.7581371009942438, + "grad_norm": 1.6957884419729579, + "learning_rate": 2.9145654579734352e-06, + "loss": 0.7482, + "step": 7244 + }, + { + "epoch": 0.7582417582417582, + "grad_norm": 2.2702186393221044, + "learning_rate": 2.9121738508002675e-06, + "loss": 0.8589, + "step": 7245 + }, + { + "epoch": 0.7583464154892726, + "grad_norm": 2.1812945897899727, + "learning_rate": 2.90978305803269e-06, + "loss": 0.7682, + "step": 7246 + }, + { + "epoch": 0.758451072736787, + "grad_norm": 1.7927929123690813, + "learning_rate": 2.9073930799454153e-06, + "loss": 0.7735, + "step": 7247 + }, + { + "epoch": 0.7585557299843014, + "grad_norm": 2.353877525391729, + "learning_rate": 2.9050039168130537e-06, + "loss": 0.9638, + "step": 7248 + }, + { + "epoch": 0.7586603872318158, + "grad_norm": 2.653249641565394, + "learning_rate": 2.9026155689101256e-06, + "loss": 0.8886, + "step": 7249 + }, + { + "epoch": 0.7587650444793302, + "grad_norm": 2.343195015332429, + "learning_rate": 2.9002280365110534e-06, + "loss": 0.8646, + "step": 7250 + }, + { + "epoch": 0.7588697017268445, + "grad_norm": 7.944090802217359, + "learning_rate": 2.8978413198901754e-06, + "loss": 0.7561, + "step": 7251 + }, + { + "epoch": 0.7589743589743589, + "grad_norm": 1.7220024064319135, + "learning_rate": 2.8954554193217254e-06, + "loss": 0.8006, + "step": 7252 + }, + { + "epoch": 0.7590790162218733, + "grad_norm": 2.230587821004055, + "learning_rate": 2.893070335079852e-06, + "loss": 0.8161, + "step": 7253 + }, + { + "epoch": 0.7591836734693878, + "grad_norm": 2.278411767642871, + "learning_rate": 2.890686067438605e-06, + "loss": 0.8948, + "step": 7254 + }, + { + "epoch": 0.7592883307169022, + "grad_norm": 2.3002079292626387, + "learning_rate": 2.8883026166719374e-06, + "loss": 1.0273, + "step": 7255 + }, + { + "epoch": 0.7593929879644166, + "grad_norm": 2.3979608311717864, + "learning_rate": 2.8859199830537188e-06, + "loss": 0.8825, + "step": 7256 + }, + { + "epoch": 0.759497645211931, + "grad_norm": 1.9704760095316236, + "learning_rate": 2.8835381668577167e-06, + "loss": 0.9408, + "step": 7257 + }, + { + "epoch": 0.7596023024594453, + "grad_norm": 1.9634553592169977, + "learning_rate": 2.8811571683576047e-06, + "loss": 0.8909, + "step": 7258 + }, + { + "epoch": 0.7597069597069597, + "grad_norm": 2.0676146544534997, + "learning_rate": 2.8787769878269667e-06, + "loss": 0.8999, + "step": 7259 + }, + { + "epoch": 0.7598116169544741, + "grad_norm": 1.902506916202938, + "learning_rate": 2.8763976255392854e-06, + "loss": 0.7752, + "step": 7260 + }, + { + "epoch": 0.7599162742019885, + "grad_norm": 1.784936461204701, + "learning_rate": 2.874019081767958e-06, + "loss": 0.9145, + "step": 7261 + }, + { + "epoch": 0.7600209314495029, + "grad_norm": 1.9199931698650219, + "learning_rate": 2.8716413567862865e-06, + "loss": 0.8145, + "step": 7262 + }, + { + "epoch": 0.7601255886970173, + "grad_norm": 2.053832292499548, + "learning_rate": 2.869264450867475e-06, + "loss": 0.8412, + "step": 7263 + }, + { + "epoch": 0.7602302459445317, + "grad_norm": 1.8965072652709092, + "learning_rate": 2.8668883642846325e-06, + "loss": 0.8407, + "step": 7264 + }, + { + "epoch": 0.760334903192046, + "grad_norm": 2.121023584744659, + "learning_rate": 2.864513097310775e-06, + "loss": 0.9228, + "step": 7265 + }, + { + "epoch": 0.7604395604395604, + "grad_norm": 2.2573579326522, + "learning_rate": 2.8621386502188296e-06, + "loss": 0.8228, + "step": 7266 + }, + { + "epoch": 0.7605442176870748, + "grad_norm": 2.0668406704888103, + "learning_rate": 2.8597650232816245e-06, + "loss": 0.8929, + "step": 7267 + }, + { + "epoch": 0.7606488749345892, + "grad_norm": 2.249728883019577, + "learning_rate": 2.8573922167718927e-06, + "loss": 0.8714, + "step": 7268 + }, + { + "epoch": 0.7607535321821036, + "grad_norm": 2.140716219541326, + "learning_rate": 2.855020230962271e-06, + "loss": 0.9412, + "step": 7269 + }, + { + "epoch": 0.760858189429618, + "grad_norm": 2.313215833515219, + "learning_rate": 2.8526490661253104e-06, + "loss": 0.9789, + "step": 7270 + }, + { + "epoch": 0.7609628466771324, + "grad_norm": 2.1190694336129585, + "learning_rate": 2.8502787225334636e-06, + "loss": 0.8946, + "step": 7271 + }, + { + "epoch": 0.7610675039246467, + "grad_norm": 2.4208134953494795, + "learning_rate": 2.8479092004590857e-06, + "loss": 0.7714, + "step": 7272 + }, + { + "epoch": 0.7611721611721611, + "grad_norm": 2.3515481614362646, + "learning_rate": 2.8455405001744397e-06, + "loss": 0.7356, + "step": 7273 + }, + { + "epoch": 0.7612768184196755, + "grad_norm": 1.8723314249073932, + "learning_rate": 2.8431726219516896e-06, + "loss": 0.7776, + "step": 7274 + }, + { + "epoch": 0.7613814756671899, + "grad_norm": 1.7706806040241194, + "learning_rate": 2.8408055660629185e-06, + "loss": 0.8914, + "step": 7275 + }, + { + "epoch": 0.7614861329147044, + "grad_norm": 1.923104823220645, + "learning_rate": 2.8384393327800997e-06, + "loss": 0.8769, + "step": 7276 + }, + { + "epoch": 0.7615907901622188, + "grad_norm": 2.094266643745062, + "learning_rate": 2.83607392237512e-06, + "loss": 0.9438, + "step": 7277 + }, + { + "epoch": 0.7616954474097332, + "grad_norm": 1.7758737658741865, + "learning_rate": 2.8337093351197664e-06, + "loss": 0.8055, + "step": 7278 + }, + { + "epoch": 0.7618001046572476, + "grad_norm": 2.2135817220161376, + "learning_rate": 2.8313455712857408e-06, + "loss": 0.8821, + "step": 7279 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 1.933667595189823, + "learning_rate": 2.828982631144639e-06, + "loss": 0.8911, + "step": 7280 + }, + { + "epoch": 0.7620094191522763, + "grad_norm": 2.5068411421201553, + "learning_rate": 2.8266205149679717e-06, + "loss": 1.022, + "step": 7281 + }, + { + "epoch": 0.7621140763997907, + "grad_norm": 2.109661522511011, + "learning_rate": 2.8242592230271506e-06, + "loss": 0.9163, + "step": 7282 + }, + { + "epoch": 0.7622187336473051, + "grad_norm": 2.2212961677842493, + "learning_rate": 2.821898755593491e-06, + "loss": 0.9055, + "step": 7283 + }, + { + "epoch": 0.7623233908948195, + "grad_norm": 2.4633904814759053, + "learning_rate": 2.819539112938212e-06, + "loss": 0.7539, + "step": 7284 + }, + { + "epoch": 0.7624280481423339, + "grad_norm": 1.8332947615386888, + "learning_rate": 2.817180295332449e-06, + "loss": 0.8608, + "step": 7285 + }, + { + "epoch": 0.7625327053898483, + "grad_norm": 2.252313473361386, + "learning_rate": 2.8148223030472287e-06, + "loss": 0.8452, + "step": 7286 + }, + { + "epoch": 0.7626373626373626, + "grad_norm": 2.430798136578984, + "learning_rate": 2.812465136353494e-06, + "loss": 0.9796, + "step": 7287 + }, + { + "epoch": 0.762742019884877, + "grad_norm": 2.145484648346231, + "learning_rate": 2.8101087955220864e-06, + "loss": 0.9313, + "step": 7288 + }, + { + "epoch": 0.7628466771323914, + "grad_norm": 2.30890945546967, + "learning_rate": 2.8077532808237497e-06, + "loss": 0.9836, + "step": 7289 + }, + { + "epoch": 0.7629513343799058, + "grad_norm": 2.4395138433728456, + "learning_rate": 2.805398592529145e-06, + "loss": 0.7632, + "step": 7290 + }, + { + "epoch": 0.7630559916274202, + "grad_norm": 2.1560151415099833, + "learning_rate": 2.803044730908826e-06, + "loss": 0.938, + "step": 7291 + }, + { + "epoch": 0.7631606488749346, + "grad_norm": 1.799807577137677, + "learning_rate": 2.8006916962332586e-06, + "loss": 0.749, + "step": 7292 + }, + { + "epoch": 0.763265306122449, + "grad_norm": 2.6437825820162133, + "learning_rate": 2.7983394887728054e-06, + "loss": 0.9678, + "step": 7293 + }, + { + "epoch": 0.7633699633699633, + "grad_norm": 2.0001658220701786, + "learning_rate": 2.795988108797748e-06, + "loss": 0.879, + "step": 7294 + }, + { + "epoch": 0.7634746206174777, + "grad_norm": 2.4220922582417654, + "learning_rate": 2.793637556578258e-06, + "loss": 0.8539, + "step": 7295 + }, + { + "epoch": 0.7635792778649921, + "grad_norm": 2.326513581121549, + "learning_rate": 2.791287832384424e-06, + "loss": 0.9231, + "step": 7296 + }, + { + "epoch": 0.7636839351125065, + "grad_norm": 2.0582333962241037, + "learning_rate": 2.788938936486232e-06, + "loss": 0.817, + "step": 7297 + }, + { + "epoch": 0.763788592360021, + "grad_norm": 1.964193500525283, + "learning_rate": 2.7865908691535746e-06, + "loss": 0.7875, + "step": 7298 + }, + { + "epoch": 0.7638932496075354, + "grad_norm": 1.9120684932559597, + "learning_rate": 2.784243630656247e-06, + "loss": 0.9067, + "step": 7299 + }, + { + "epoch": 0.7639979068550498, + "grad_norm": 2.203369880119307, + "learning_rate": 2.781897221263956e-06, + "loss": 0.9302, + "step": 7300 + }, + { + "epoch": 0.764102564102564, + "grad_norm": 1.8664131554131014, + "learning_rate": 2.7795516412463077e-06, + "loss": 0.8078, + "step": 7301 + }, + { + "epoch": 0.7642072213500785, + "grad_norm": 2.152251063216064, + "learning_rate": 2.7772068908728133e-06, + "loss": 0.8731, + "step": 7302 + }, + { + "epoch": 0.7643118785975929, + "grad_norm": 1.9704503123853456, + "learning_rate": 2.774862970412886e-06, + "loss": 0.9309, + "step": 7303 + }, + { + "epoch": 0.7644165358451073, + "grad_norm": 2.217027087214899, + "learning_rate": 2.77251988013585e-06, + "loss": 0.9636, + "step": 7304 + }, + { + "epoch": 0.7645211930926217, + "grad_norm": 2.4945137079215836, + "learning_rate": 2.7701776203109342e-06, + "loss": 0.9788, + "step": 7305 + }, + { + "epoch": 0.7646258503401361, + "grad_norm": 1.979000751747689, + "learning_rate": 2.767836191207267e-06, + "loss": 0.9692, + "step": 7306 + }, + { + "epoch": 0.7647305075876505, + "grad_norm": 1.9320377725750433, + "learning_rate": 2.765495593093882e-06, + "loss": 0.8792, + "step": 7307 + }, + { + "epoch": 0.7648351648351648, + "grad_norm": 1.661537117127484, + "learning_rate": 2.7631558262397164e-06, + "loss": 0.7901, + "step": 7308 + }, + { + "epoch": 0.7649398220826792, + "grad_norm": 2.066054899073813, + "learning_rate": 2.7608168909136203e-06, + "loss": 0.789, + "step": 7309 + }, + { + "epoch": 0.7650444793301936, + "grad_norm": 2.309887605904426, + "learning_rate": 2.7584787873843376e-06, + "loss": 0.9919, + "step": 7310 + }, + { + "epoch": 0.765149136577708, + "grad_norm": 2.219400510624547, + "learning_rate": 2.756141515920524e-06, + "loss": 0.8606, + "step": 7311 + }, + { + "epoch": 0.7652537938252224, + "grad_norm": 2.1735797860738977, + "learning_rate": 2.753805076790731e-06, + "loss": 0.854, + "step": 7312 + }, + { + "epoch": 0.7653584510727368, + "grad_norm": 2.132548975039601, + "learning_rate": 2.7514694702634237e-06, + "loss": 0.8229, + "step": 7313 + }, + { + "epoch": 0.7654631083202512, + "grad_norm": 2.2324206156113777, + "learning_rate": 2.7491346966069734e-06, + "loss": 0.8859, + "step": 7314 + }, + { + "epoch": 0.7655677655677655, + "grad_norm": 2.0237282392424114, + "learning_rate": 2.7468007560896435e-06, + "loss": 0.817, + "step": 7315 + }, + { + "epoch": 0.7656724228152799, + "grad_norm": 1.8891116758472233, + "learning_rate": 2.744467648979612e-06, + "loss": 0.8634, + "step": 7316 + }, + { + "epoch": 0.7657770800627943, + "grad_norm": 1.9422661279068079, + "learning_rate": 2.7421353755449552e-06, + "loss": 0.9092, + "step": 7317 + }, + { + "epoch": 0.7658817373103087, + "grad_norm": 2.2850690583866644, + "learning_rate": 2.7398039360536543e-06, + "loss": 0.8976, + "step": 7318 + }, + { + "epoch": 0.7659863945578231, + "grad_norm": 2.0211359219333467, + "learning_rate": 2.7374733307736025e-06, + "loss": 0.8664, + "step": 7319 + }, + { + "epoch": 0.7660910518053375, + "grad_norm": 1.8583554118941399, + "learning_rate": 2.7351435599725874e-06, + "loss": 0.8905, + "step": 7320 + }, + { + "epoch": 0.766195709052852, + "grad_norm": 1.9734674152504228, + "learning_rate": 2.7328146239183007e-06, + "loss": 0.8991, + "step": 7321 + }, + { + "epoch": 0.7663003663003664, + "grad_norm": 1.7218591246248722, + "learning_rate": 2.7304865228783507e-06, + "loss": 0.8628, + "step": 7322 + }, + { + "epoch": 0.7664050235478806, + "grad_norm": 1.7601071260258068, + "learning_rate": 2.7281592571202307e-06, + "loss": 0.8435, + "step": 7323 + }, + { + "epoch": 0.766509680795395, + "grad_norm": 2.5384380504587583, + "learning_rate": 2.725832826911359e-06, + "loss": 0.9552, + "step": 7324 + }, + { + "epoch": 0.7666143380429095, + "grad_norm": 1.9897641468927232, + "learning_rate": 2.7235072325190404e-06, + "loss": 0.8246, + "step": 7325 + }, + { + "epoch": 0.7667189952904239, + "grad_norm": 2.18530144040617, + "learning_rate": 2.721182474210492e-06, + "loss": 1.0168, + "step": 7326 + }, + { + "epoch": 0.7668236525379383, + "grad_norm": 1.9476014865880453, + "learning_rate": 2.71885855225283e-06, + "loss": 0.8297, + "step": 7327 + }, + { + "epoch": 0.7669283097854527, + "grad_norm": 2.0184306914039083, + "learning_rate": 2.716535466913084e-06, + "loss": 0.8521, + "step": 7328 + }, + { + "epoch": 0.7670329670329671, + "grad_norm": 2.1006322169829144, + "learning_rate": 2.714213218458178e-06, + "loss": 0.7977, + "step": 7329 + }, + { + "epoch": 0.7671376242804814, + "grad_norm": 2.0262143165867337, + "learning_rate": 2.7118918071549395e-06, + "loss": 0.8349, + "step": 7330 + }, + { + "epoch": 0.7672422815279958, + "grad_norm": 2.269865382008233, + "learning_rate": 2.7095712332701108e-06, + "loss": 0.9142, + "step": 7331 + }, + { + "epoch": 0.7673469387755102, + "grad_norm": 2.3038111471769858, + "learning_rate": 2.7072514970703224e-06, + "loss": 0.9053, + "step": 7332 + }, + { + "epoch": 0.7674515960230246, + "grad_norm": 1.936577382811956, + "learning_rate": 2.7049325988221255e-06, + "loss": 0.8464, + "step": 7333 + }, + { + "epoch": 0.767556253270539, + "grad_norm": 2.5043378818478494, + "learning_rate": 2.70261453879196e-06, + "loss": 0.7622, + "step": 7334 + }, + { + "epoch": 0.7676609105180534, + "grad_norm": 2.0270141293438635, + "learning_rate": 2.7002973172461775e-06, + "loss": 0.8564, + "step": 7335 + }, + { + "epoch": 0.7677655677655678, + "grad_norm": 2.238914386289856, + "learning_rate": 2.6979809344510323e-06, + "loss": 0.949, + "step": 7336 + }, + { + "epoch": 0.7678702250130821, + "grad_norm": 2.112977680901526, + "learning_rate": 2.695665390672677e-06, + "loss": 0.8794, + "step": 7337 + }, + { + "epoch": 0.7679748822605965, + "grad_norm": 1.7578779088455374, + "learning_rate": 2.6933506861771764e-06, + "loss": 0.9656, + "step": 7338 + }, + { + "epoch": 0.7680795395081109, + "grad_norm": 2.025671558545498, + "learning_rate": 2.691036821230496e-06, + "loss": 0.8891, + "step": 7339 + }, + { + "epoch": 0.7681841967556253, + "grad_norm": 2.2330212723105913, + "learning_rate": 2.6887237960985024e-06, + "loss": 0.9527, + "step": 7340 + }, + { + "epoch": 0.7682888540031397, + "grad_norm": 2.312102639631419, + "learning_rate": 2.6864116110469664e-06, + "loss": 0.9365, + "step": 7341 + }, + { + "epoch": 0.7683935112506541, + "grad_norm": 2.2905403470328847, + "learning_rate": 2.684100266341558e-06, + "loss": 0.9652, + "step": 7342 + }, + { + "epoch": 0.7684981684981685, + "grad_norm": 1.7444191920341985, + "learning_rate": 2.681789762247864e-06, + "loss": 0.7956, + "step": 7343 + }, + { + "epoch": 0.7686028257456828, + "grad_norm": 2.125810203110687, + "learning_rate": 2.6794800990313617e-06, + "loss": 0.8981, + "step": 7344 + }, + { + "epoch": 0.7687074829931972, + "grad_norm": 2.0640436363199584, + "learning_rate": 2.6771712769574364e-06, + "loss": 0.9515, + "step": 7345 + }, + { + "epoch": 0.7688121402407117, + "grad_norm": 2.2540317780667283, + "learning_rate": 2.6748632962913732e-06, + "loss": 0.9397, + "step": 7346 + }, + { + "epoch": 0.7689167974882261, + "grad_norm": 2.2939707078082434, + "learning_rate": 2.672556157298367e-06, + "loss": 0.873, + "step": 7347 + }, + { + "epoch": 0.7690214547357405, + "grad_norm": 2.0271839169120343, + "learning_rate": 2.6702498602435155e-06, + "loss": 0.9098, + "step": 7348 + }, + { + "epoch": 0.7691261119832549, + "grad_norm": 2.194903694850483, + "learning_rate": 2.6679444053918137e-06, + "loss": 0.7982, + "step": 7349 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 1.8945213031067587, + "learning_rate": 2.6656397930081635e-06, + "loss": 0.9307, + "step": 7350 + }, + { + "epoch": 0.7693354264782836, + "grad_norm": 1.9063057362434497, + "learning_rate": 2.6633360233573656e-06, + "loss": 0.9288, + "step": 7351 + }, + { + "epoch": 0.769440083725798, + "grad_norm": 2.318201908671572, + "learning_rate": 2.6610330967041366e-06, + "loss": 0.9832, + "step": 7352 + }, + { + "epoch": 0.7695447409733124, + "grad_norm": 2.087584884442611, + "learning_rate": 2.6587310133130805e-06, + "loss": 0.8781, + "step": 7353 + }, + { + "epoch": 0.7696493982208268, + "grad_norm": 1.8841301045621537, + "learning_rate": 2.6564297734487144e-06, + "loss": 0.7486, + "step": 7354 + }, + { + "epoch": 0.7697540554683412, + "grad_norm": 2.048554104308666, + "learning_rate": 2.654129377375454e-06, + "loss": 0.7509, + "step": 7355 + }, + { + "epoch": 0.7698587127158556, + "grad_norm": 1.9448133038229751, + "learning_rate": 2.6518298253576167e-06, + "loss": 0.8034, + "step": 7356 + }, + { + "epoch": 0.76996336996337, + "grad_norm": 2.3334382875645465, + "learning_rate": 2.6495311176594286e-06, + "loss": 1.0057, + "step": 7357 + }, + { + "epoch": 0.7700680272108843, + "grad_norm": 2.0609258709754203, + "learning_rate": 2.6472332545450195e-06, + "loss": 0.9352, + "step": 7358 + }, + { + "epoch": 0.7701726844583987, + "grad_norm": 2.043836151919123, + "learning_rate": 2.6449362362784147e-06, + "loss": 0.8086, + "step": 7359 + }, + { + "epoch": 0.7702773417059131, + "grad_norm": 1.9222038165954007, + "learning_rate": 2.642640063123546e-06, + "loss": 0.8762, + "step": 7360 + }, + { + "epoch": 0.7703819989534275, + "grad_norm": 2.272474659914367, + "learning_rate": 2.640344735344247e-06, + "loss": 0.887, + "step": 7361 + }, + { + "epoch": 0.7704866562009419, + "grad_norm": 2.0559426832557066, + "learning_rate": 2.6380502532042608e-06, + "loss": 0.8748, + "step": 7362 + }, + { + "epoch": 0.7705913134484563, + "grad_norm": 1.9381978766078287, + "learning_rate": 2.635756616967223e-06, + "loss": 0.8543, + "step": 7363 + }, + { + "epoch": 0.7706959706959707, + "grad_norm": 1.885340263704655, + "learning_rate": 2.6334638268966773e-06, + "loss": 0.8073, + "step": 7364 + }, + { + "epoch": 0.7708006279434851, + "grad_norm": 1.939561768160846, + "learning_rate": 2.631171883256074e-06, + "loss": 0.8754, + "step": 7365 + }, + { + "epoch": 0.7709052851909994, + "grad_norm": 1.7916271930244994, + "learning_rate": 2.628880786308756e-06, + "loss": 0.9365, + "step": 7366 + }, + { + "epoch": 0.7710099424385138, + "grad_norm": 1.9214226616582117, + "learning_rate": 2.626590536317982e-06, + "loss": 0.9126, + "step": 7367 + }, + { + "epoch": 0.7711145996860282, + "grad_norm": 1.6201683681441432, + "learning_rate": 2.6243011335469027e-06, + "loss": 0.7946, + "step": 7368 + }, + { + "epoch": 0.7712192569335427, + "grad_norm": 2.0135703962696954, + "learning_rate": 2.622012578258576e-06, + "loss": 0.9228, + "step": 7369 + }, + { + "epoch": 0.7713239141810571, + "grad_norm": 2.1109014783062747, + "learning_rate": 2.6197248707159575e-06, + "loss": 0.8835, + "step": 7370 + }, + { + "epoch": 0.7714285714285715, + "grad_norm": 2.004035053535731, + "learning_rate": 2.6174380111819144e-06, + "loss": 0.8671, + "step": 7371 + }, + { + "epoch": 0.7715332286760859, + "grad_norm": 2.027444223995198, + "learning_rate": 2.615151999919211e-06, + "loss": 0.9357, + "step": 7372 + }, + { + "epoch": 0.7716378859236002, + "grad_norm": 1.9959569059145792, + "learning_rate": 2.61286683719051e-06, + "loss": 0.9009, + "step": 7373 + }, + { + "epoch": 0.7717425431711146, + "grad_norm": 1.9792281441894708, + "learning_rate": 2.610582523258388e-06, + "loss": 0.8614, + "step": 7374 + }, + { + "epoch": 0.771847200418629, + "grad_norm": 1.983946749775523, + "learning_rate": 2.608299058385314e-06, + "loss": 0.8806, + "step": 7375 + }, + { + "epoch": 0.7719518576661434, + "grad_norm": 1.9428206558603256, + "learning_rate": 2.60601644283366e-06, + "loss": 0.8755, + "step": 7376 + }, + { + "epoch": 0.7720565149136578, + "grad_norm": 2.0749303955604703, + "learning_rate": 2.603734676865708e-06, + "loss": 0.8801, + "step": 7377 + }, + { + "epoch": 0.7721611721611722, + "grad_norm": 1.9352525690019882, + "learning_rate": 2.6014537607436365e-06, + "loss": 0.9563, + "step": 7378 + }, + { + "epoch": 0.7722658294086866, + "grad_norm": 2.0793047689529844, + "learning_rate": 2.5991736947295254e-06, + "loss": 0.8347, + "step": 7379 + }, + { + "epoch": 0.7723704866562009, + "grad_norm": 2.293524394942549, + "learning_rate": 2.596894479085357e-06, + "loss": 0.8757, + "step": 7380 + }, + { + "epoch": 0.7724751439037153, + "grad_norm": 1.751731557615987, + "learning_rate": 2.5946161140730243e-06, + "loss": 0.8043, + "step": 7381 + }, + { + "epoch": 0.7725798011512297, + "grad_norm": 2.25419896408031, + "learning_rate": 2.592338599954308e-06, + "loss": 0.7497, + "step": 7382 + }, + { + "epoch": 0.7726844583987441, + "grad_norm": 2.0024558583214342, + "learning_rate": 2.5900619369909074e-06, + "loss": 0.8527, + "step": 7383 + }, + { + "epoch": 0.7727891156462585, + "grad_norm": 2.0969280155721552, + "learning_rate": 2.5877861254444115e-06, + "loss": 0.7273, + "step": 7384 + }, + { + "epoch": 0.7728937728937729, + "grad_norm": 2.1227745715824717, + "learning_rate": 2.5855111655763134e-06, + "loss": 0.86, + "step": 7385 + }, + { + "epoch": 0.7729984301412873, + "grad_norm": 1.8610864043360293, + "learning_rate": 2.583237057648016e-06, + "loss": 0.8691, + "step": 7386 + }, + { + "epoch": 0.7731030873888016, + "grad_norm": 2.1425965300073884, + "learning_rate": 2.580963801920816e-06, + "loss": 0.8999, + "step": 7387 + }, + { + "epoch": 0.773207744636316, + "grad_norm": 2.042623189976076, + "learning_rate": 2.5786913986559147e-06, + "loss": 0.8993, + "step": 7388 + }, + { + "epoch": 0.7733124018838304, + "grad_norm": 1.9775430060536299, + "learning_rate": 2.5764198481144164e-06, + "loss": 0.9127, + "step": 7389 + }, + { + "epoch": 0.7734170591313448, + "grad_norm": 2.000333442979551, + "learning_rate": 2.5741491505573213e-06, + "loss": 0.9102, + "step": 7390 + }, + { + "epoch": 0.7735217163788592, + "grad_norm": 1.9614405240639192, + "learning_rate": 2.5718793062455472e-06, + "loss": 0.8835, + "step": 7391 + }, + { + "epoch": 0.7736263736263737, + "grad_norm": 2.4448514916772806, + "learning_rate": 2.5696103154399e-06, + "loss": 0.9974, + "step": 7392 + }, + { + "epoch": 0.7737310308738881, + "grad_norm": 1.9778536256057575, + "learning_rate": 2.567342178401091e-06, + "loss": 0.8884, + "step": 7393 + }, + { + "epoch": 0.7738356881214024, + "grad_norm": 1.965676707722803, + "learning_rate": 2.565074895389733e-06, + "loss": 0.8692, + "step": 7394 + }, + { + "epoch": 0.7739403453689168, + "grad_norm": 2.1903706642179195, + "learning_rate": 2.562808466666338e-06, + "loss": 0.8845, + "step": 7395 + }, + { + "epoch": 0.7740450026164312, + "grad_norm": 1.9841184192672507, + "learning_rate": 2.5605428924913312e-06, + "loss": 0.8959, + "step": 7396 + }, + { + "epoch": 0.7741496598639456, + "grad_norm": 2.169747156193136, + "learning_rate": 2.558278173125026e-06, + "loss": 0.8318, + "step": 7397 + }, + { + "epoch": 0.77425431711146, + "grad_norm": 2.742205301494219, + "learning_rate": 2.5560143088276456e-06, + "loss": 0.9708, + "step": 7398 + }, + { + "epoch": 0.7743589743589744, + "grad_norm": 2.1736256054455145, + "learning_rate": 2.553751299859308e-06, + "loss": 0.8634, + "step": 7399 + }, + { + "epoch": 0.7744636316064888, + "grad_norm": 2.0298243788952792, + "learning_rate": 2.551489146480042e-06, + "loss": 0.98, + "step": 7400 + }, + { + "epoch": 0.7745682888540032, + "grad_norm": 1.8171963930524047, + "learning_rate": 2.5492278489497745e-06, + "loss": 0.8868, + "step": 7401 + }, + { + "epoch": 0.7746729461015175, + "grad_norm": 1.8750562250635816, + "learning_rate": 2.5469674075283325e-06, + "loss": 0.8463, + "step": 7402 + }, + { + "epoch": 0.7747776033490319, + "grad_norm": 2.0170705567329934, + "learning_rate": 2.544707822475444e-06, + "loss": 0.8602, + "step": 7403 + }, + { + "epoch": 0.7748822605965463, + "grad_norm": 2.0889797065511657, + "learning_rate": 2.5424490940507373e-06, + "loss": 0.8067, + "step": 7404 + }, + { + "epoch": 0.7749869178440607, + "grad_norm": 2.084921965591833, + "learning_rate": 2.5401912225137504e-06, + "loss": 0.8538, + "step": 7405 + }, + { + "epoch": 0.7750915750915751, + "grad_norm": 1.999119792459058, + "learning_rate": 2.5379342081239157e-06, + "loss": 0.954, + "step": 7406 + }, + { + "epoch": 0.7751962323390895, + "grad_norm": 1.9723776486252687, + "learning_rate": 2.5356780511405666e-06, + "loss": 0.8641, + "step": 7407 + }, + { + "epoch": 0.7753008895866039, + "grad_norm": 2.07024245752328, + "learning_rate": 2.533422751822938e-06, + "loss": 0.8942, + "step": 7408 + }, + { + "epoch": 0.7754055468341182, + "grad_norm": 1.8764528707975676, + "learning_rate": 2.531168310430172e-06, + "loss": 0.9478, + "step": 7409 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 1.8645837015695488, + "learning_rate": 2.528914727221311e-06, + "loss": 0.8407, + "step": 7410 + }, + { + "epoch": 0.775614861329147, + "grad_norm": 2.172900648266963, + "learning_rate": 2.5266620024552937e-06, + "loss": 0.856, + "step": 7411 + }, + { + "epoch": 0.7757195185766614, + "grad_norm": 2.0753861210504425, + "learning_rate": 2.524410136390961e-06, + "loss": 0.9372, + "step": 7412 + }, + { + "epoch": 0.7758241758241758, + "grad_norm": 2.0279346463791588, + "learning_rate": 2.5221591292870595e-06, + "loss": 0.8748, + "step": 7413 + }, + { + "epoch": 0.7759288330716902, + "grad_norm": 1.9053518322080332, + "learning_rate": 2.5199089814022293e-06, + "loss": 0.846, + "step": 7414 + }, + { + "epoch": 0.7760334903192047, + "grad_norm": 1.951751195516781, + "learning_rate": 2.517659692995025e-06, + "loss": 0.7616, + "step": 7415 + }, + { + "epoch": 0.776138147566719, + "grad_norm": 2.258508279860391, + "learning_rate": 2.515411264323887e-06, + "loss": 0.9151, + "step": 7416 + }, + { + "epoch": 0.7762428048142334, + "grad_norm": 2.318867938802657, + "learning_rate": 2.5131636956471696e-06, + "loss": 0.8769, + "step": 7417 + }, + { + "epoch": 0.7763474620617478, + "grad_norm": 1.867705727278113, + "learning_rate": 2.510916987223122e-06, + "loss": 0.9135, + "step": 7418 + }, + { + "epoch": 0.7764521193092622, + "grad_norm": 2.100130971915618, + "learning_rate": 2.5086711393098906e-06, + "loss": 0.85, + "step": 7419 + }, + { + "epoch": 0.7765567765567766, + "grad_norm": 2.0579255004196852, + "learning_rate": 2.5064261521655355e-06, + "loss": 0.8838, + "step": 7420 + }, + { + "epoch": 0.776661433804291, + "grad_norm": 2.0359736419827033, + "learning_rate": 2.5041820260480063e-06, + "loss": 0.891, + "step": 7421 + }, + { + "epoch": 0.7767660910518054, + "grad_norm": 2.007394652518043, + "learning_rate": 2.501938761215158e-06, + "loss": 0.8018, + "step": 7422 + }, + { + "epoch": 0.7768707482993197, + "grad_norm": 2.309114884247499, + "learning_rate": 2.4996963579247433e-06, + "loss": 0.8789, + "step": 7423 + }, + { + "epoch": 0.7769754055468341, + "grad_norm": 2.1276331305118017, + "learning_rate": 2.497454816434425e-06, + "loss": 0.9842, + "step": 7424 + }, + { + "epoch": 0.7770800627943485, + "grad_norm": 2.0491657489397985, + "learning_rate": 2.4952141370017536e-06, + "loss": 0.8562, + "step": 7425 + }, + { + "epoch": 0.7771847200418629, + "grad_norm": 1.780804392760166, + "learning_rate": 2.492974319884196e-06, + "loss": 0.8353, + "step": 7426 + }, + { + "epoch": 0.7772893772893773, + "grad_norm": 2.829423926147118, + "learning_rate": 2.4907353653391062e-06, + "loss": 0.8086, + "step": 7427 + }, + { + "epoch": 0.7773940345368917, + "grad_norm": 2.1527688872034516, + "learning_rate": 2.4884972736237433e-06, + "loss": 0.8685, + "step": 7428 + }, + { + "epoch": 0.7774986917844061, + "grad_norm": 1.7756364808096, + "learning_rate": 2.4862600449952744e-06, + "loss": 0.8753, + "step": 7429 + }, + { + "epoch": 0.7776033490319204, + "grad_norm": 2.193742164532645, + "learning_rate": 2.4840236797107575e-06, + "loss": 0.7831, + "step": 7430 + }, + { + "epoch": 0.7777080062794348, + "grad_norm": 2.415166469626955, + "learning_rate": 2.481788178027157e-06, + "loss": 0.8495, + "step": 7431 + }, + { + "epoch": 0.7778126635269492, + "grad_norm": 2.2290675901663652, + "learning_rate": 2.479553540201335e-06, + "loss": 0.9439, + "step": 7432 + }, + { + "epoch": 0.7779173207744636, + "grad_norm": 2.2829886391415255, + "learning_rate": 2.4773197664900538e-06, + "loss": 1.0281, + "step": 7433 + }, + { + "epoch": 0.778021978021978, + "grad_norm": 2.142967693368868, + "learning_rate": 2.475086857149982e-06, + "loss": 0.9277, + "step": 7434 + }, + { + "epoch": 0.7781266352694924, + "grad_norm": 2.1352013664869403, + "learning_rate": 2.4728548124376882e-06, + "loss": 1.0059, + "step": 7435 + }, + { + "epoch": 0.7782312925170068, + "grad_norm": 2.1208486875780537, + "learning_rate": 2.4706236326096357e-06, + "loss": 0.9999, + "step": 7436 + }, + { + "epoch": 0.7783359497645211, + "grad_norm": 2.351231832444182, + "learning_rate": 2.4683933179221908e-06, + "loss": 0.9088, + "step": 7437 + }, + { + "epoch": 0.7784406070120355, + "grad_norm": 2.612668770820266, + "learning_rate": 2.4661638686316193e-06, + "loss": 0.8534, + "step": 7438 + }, + { + "epoch": 0.77854526425955, + "grad_norm": 1.9406158005630711, + "learning_rate": 2.4639352849940947e-06, + "loss": 0.8145, + "step": 7439 + }, + { + "epoch": 0.7786499215070644, + "grad_norm": 2.000613163110507, + "learning_rate": 2.461707567265682e-06, + "loss": 0.8868, + "step": 7440 + }, + { + "epoch": 0.7787545787545788, + "grad_norm": 2.316740372445306, + "learning_rate": 2.4594807157023525e-06, + "loss": 0.8466, + "step": 7441 + }, + { + "epoch": 0.7788592360020932, + "grad_norm": 1.9850899017077899, + "learning_rate": 2.4572547305599727e-06, + "loss": 0.9479, + "step": 7442 + }, + { + "epoch": 0.7789638932496076, + "grad_norm": 2.2991940584000825, + "learning_rate": 2.455029612094314e-06, + "loss": 0.8785, + "step": 7443 + }, + { + "epoch": 0.779068550497122, + "grad_norm": 2.0602886300262164, + "learning_rate": 2.4528053605610513e-06, + "loss": 0.9742, + "step": 7444 + }, + { + "epoch": 0.7791732077446363, + "grad_norm": 1.7206711855095842, + "learning_rate": 2.4505819762157534e-06, + "loss": 0.7868, + "step": 7445 + }, + { + "epoch": 0.7792778649921507, + "grad_norm": 2.1321280190827805, + "learning_rate": 2.4483594593138894e-06, + "loss": 0.9418, + "step": 7446 + }, + { + "epoch": 0.7793825222396651, + "grad_norm": 2.2378800662787888, + "learning_rate": 2.4461378101108333e-06, + "loss": 1.0102, + "step": 7447 + }, + { + "epoch": 0.7794871794871795, + "grad_norm": 1.891205894491629, + "learning_rate": 2.443917028861853e-06, + "loss": 0.8933, + "step": 7448 + }, + { + "epoch": 0.7795918367346939, + "grad_norm": 2.21558145359656, + "learning_rate": 2.441697115822126e-06, + "loss": 0.878, + "step": 7449 + }, + { + "epoch": 0.7796964939822083, + "grad_norm": 2.072454113701976, + "learning_rate": 2.4394780712467236e-06, + "loss": 0.7732, + "step": 7450 + }, + { + "epoch": 0.7798011512297227, + "grad_norm": 2.10593743081749, + "learning_rate": 2.4372598953906133e-06, + "loss": 0.768, + "step": 7451 + }, + { + "epoch": 0.779905808477237, + "grad_norm": 1.9053195631944175, + "learning_rate": 2.435042588508677e-06, + "loss": 0.8531, + "step": 7452 + }, + { + "epoch": 0.7800104657247514, + "grad_norm": 1.7533274864047492, + "learning_rate": 2.4328261508556795e-06, + "loss": 0.9297, + "step": 7453 + }, + { + "epoch": 0.7801151229722658, + "grad_norm": 2.074809239844302, + "learning_rate": 2.4306105826862993e-06, + "loss": 0.8273, + "step": 7454 + }, + { + "epoch": 0.7802197802197802, + "grad_norm": 2.125111767929294, + "learning_rate": 2.428395884255109e-06, + "loss": 0.7983, + "step": 7455 + }, + { + "epoch": 0.7803244374672946, + "grad_norm": 2.1801053470480323, + "learning_rate": 2.4261820558165804e-06, + "loss": 0.858, + "step": 7456 + }, + { + "epoch": 0.780429094714809, + "grad_norm": 1.8010107947795544, + "learning_rate": 2.4239690976250854e-06, + "loss": 0.7268, + "step": 7457 + }, + { + "epoch": 0.7805337519623234, + "grad_norm": 2.26916467402209, + "learning_rate": 2.421757009934901e-06, + "loss": 0.8247, + "step": 7458 + }, + { + "epoch": 0.7806384092098377, + "grad_norm": 1.9033771203519017, + "learning_rate": 2.4195457930001998e-06, + "loss": 0.7597, + "step": 7459 + }, + { + "epoch": 0.7807430664573521, + "grad_norm": 1.9507378053947775, + "learning_rate": 2.417335447075051e-06, + "loss": 0.8733, + "step": 7460 + }, + { + "epoch": 0.7808477237048665, + "grad_norm": 2.099170592286283, + "learning_rate": 2.4151259724134336e-06, + "loss": 0.824, + "step": 7461 + }, + { + "epoch": 0.780952380952381, + "grad_norm": 2.227444898261774, + "learning_rate": 2.4129173692692155e-06, + "loss": 0.8599, + "step": 7462 + }, + { + "epoch": 0.7810570381998954, + "grad_norm": 1.9903419632586217, + "learning_rate": 2.4107096378961746e-06, + "loss": 0.9733, + "step": 7463 + }, + { + "epoch": 0.7811616954474098, + "grad_norm": 2.037233493155889, + "learning_rate": 2.408502778547982e-06, + "loss": 0.7757, + "step": 7464 + }, + { + "epoch": 0.7812663526949242, + "grad_norm": 2.0708333939156134, + "learning_rate": 2.4062967914782086e-06, + "loss": 0.8146, + "step": 7465 + }, + { + "epoch": 0.7813710099424385, + "grad_norm": 2.0344131802341976, + "learning_rate": 2.404091676940329e-06, + "loss": 0.9128, + "step": 7466 + }, + { + "epoch": 0.7814756671899529, + "grad_norm": 2.032647250442873, + "learning_rate": 2.401887435187712e-06, + "loss": 0.9985, + "step": 7467 + }, + { + "epoch": 0.7815803244374673, + "grad_norm": 2.431120027142936, + "learning_rate": 2.39968406647363e-06, + "loss": 0.8236, + "step": 7468 + }, + { + "epoch": 0.7816849816849817, + "grad_norm": 2.1068090556259738, + "learning_rate": 2.39748157105126e-06, + "loss": 0.9063, + "step": 7469 + }, + { + "epoch": 0.7817896389324961, + "grad_norm": 2.234104108726407, + "learning_rate": 2.39527994917367e-06, + "loss": 0.9853, + "step": 7470 + }, + { + "epoch": 0.7818942961800105, + "grad_norm": 2.115693775219522, + "learning_rate": 2.3930792010938286e-06, + "loss": 0.9584, + "step": 7471 + }, + { + "epoch": 0.7819989534275249, + "grad_norm": 2.173122868655898, + "learning_rate": 2.3908793270646057e-06, + "loss": 0.8623, + "step": 7472 + }, + { + "epoch": 0.7821036106750392, + "grad_norm": 2.158065559668168, + "learning_rate": 2.388680327338777e-06, + "loss": 0.9185, + "step": 7473 + }, + { + "epoch": 0.7822082679225536, + "grad_norm": 2.4956821194494454, + "learning_rate": 2.386482202169008e-06, + "loss": 1.0272, + "step": 7474 + }, + { + "epoch": 0.782312925170068, + "grad_norm": 2.0104495991381888, + "learning_rate": 2.384284951807868e-06, + "loss": 0.8998, + "step": 7475 + }, + { + "epoch": 0.7824175824175824, + "grad_norm": 1.8416688486042285, + "learning_rate": 2.3820885765078226e-06, + "loss": 0.8212, + "step": 7476 + }, + { + "epoch": 0.7825222396650968, + "grad_norm": 2.302585517865405, + "learning_rate": 2.3798930765212435e-06, + "loss": 0.9272, + "step": 7477 + }, + { + "epoch": 0.7826268969126112, + "grad_norm": 2.13934576419671, + "learning_rate": 2.3776984521003997e-06, + "loss": 0.7576, + "step": 7478 + }, + { + "epoch": 0.7827315541601256, + "grad_norm": 2.136647066906963, + "learning_rate": 2.3755047034974567e-06, + "loss": 0.9061, + "step": 7479 + }, + { + "epoch": 0.7828362114076399, + "grad_norm": 1.8057322334613368, + "learning_rate": 2.373311830964479e-06, + "loss": 0.8824, + "step": 7480 + }, + { + "epoch": 0.7829408686551543, + "grad_norm": 1.9635915461888118, + "learning_rate": 2.3711198347534305e-06, + "loss": 0.8677, + "step": 7481 + }, + { + "epoch": 0.7830455259026687, + "grad_norm": 2.1726928757916673, + "learning_rate": 2.36892871511618e-06, + "loss": 0.8336, + "step": 7482 + }, + { + "epoch": 0.7831501831501831, + "grad_norm": 2.002270471261225, + "learning_rate": 2.3667384723044918e-06, + "loss": 0.923, + "step": 7483 + }, + { + "epoch": 0.7832548403976975, + "grad_norm": 2.0664098657994883, + "learning_rate": 2.3645491065700257e-06, + "loss": 0.8348, + "step": 7484 + }, + { + "epoch": 0.783359497645212, + "grad_norm": 2.2593815059893734, + "learning_rate": 2.3623606181643465e-06, + "loss": 0.9542, + "step": 7485 + }, + { + "epoch": 0.7834641548927264, + "grad_norm": 2.025749143560053, + "learning_rate": 2.3601730073389096e-06, + "loss": 0.9224, + "step": 7486 + }, + { + "epoch": 0.7835688121402408, + "grad_norm": 2.0959469108319047, + "learning_rate": 2.3579862743450877e-06, + "loss": 0.8847, + "step": 7487 + }, + { + "epoch": 0.7836734693877551, + "grad_norm": 2.2588339335795173, + "learning_rate": 2.3558004194341343e-06, + "loss": 0.8129, + "step": 7488 + }, + { + "epoch": 0.7837781266352695, + "grad_norm": 1.9031430821916164, + "learning_rate": 2.35361544285721e-06, + "loss": 0.8896, + "step": 7489 + }, + { + "epoch": 0.7838827838827839, + "grad_norm": 2.2458748833211506, + "learning_rate": 2.3514313448653715e-06, + "loss": 0.965, + "step": 7490 + }, + { + "epoch": 0.7839874411302983, + "grad_norm": 1.9843366337068091, + "learning_rate": 2.3492481257095746e-06, + "loss": 0.9045, + "step": 7491 + }, + { + "epoch": 0.7840920983778127, + "grad_norm": 2.3071137715563186, + "learning_rate": 2.34706578564068e-06, + "loss": 0.8596, + "step": 7492 + }, + { + "epoch": 0.7841967556253271, + "grad_norm": 2.1993251510987437, + "learning_rate": 2.34488432490944e-06, + "loss": 1.0281, + "step": 7493 + }, + { + "epoch": 0.7843014128728415, + "grad_norm": 1.767074435073521, + "learning_rate": 2.342703743766508e-06, + "loss": 0.9149, + "step": 7494 + }, + { + "epoch": 0.7844060701203558, + "grad_norm": 1.9658161545730846, + "learning_rate": 2.340524042462441e-06, + "loss": 0.8962, + "step": 7495 + }, + { + "epoch": 0.7845107273678702, + "grad_norm": 2.1705849889367888, + "learning_rate": 2.3383452212476866e-06, + "loss": 0.9249, + "step": 7496 + }, + { + "epoch": 0.7846153846153846, + "grad_norm": 2.0783706544712217, + "learning_rate": 2.3361672803725997e-06, + "loss": 0.972, + "step": 7497 + }, + { + "epoch": 0.784720041862899, + "grad_norm": 1.8899752350082029, + "learning_rate": 2.33399022008743e-06, + "loss": 0.8817, + "step": 7498 + }, + { + "epoch": 0.7848246991104134, + "grad_norm": 2.5517986810451783, + "learning_rate": 2.331814040642324e-06, + "loss": 0.8759, + "step": 7499 + }, + { + "epoch": 0.7849293563579278, + "grad_norm": 1.8563647336761964, + "learning_rate": 2.329638742287327e-06, + "loss": 0.8498, + "step": 7500 + }, + { + "epoch": 0.7850340136054422, + "grad_norm": 1.7691375881118099, + "learning_rate": 2.3274643252723904e-06, + "loss": 0.7296, + "step": 7501 + }, + { + "epoch": 0.7851386708529565, + "grad_norm": 2.1783402785198667, + "learning_rate": 2.3252907898473564e-06, + "loss": 0.8703, + "step": 7502 + }, + { + "epoch": 0.7852433281004709, + "grad_norm": 2.205810214469163, + "learning_rate": 2.323118136261967e-06, + "loss": 0.9018, + "step": 7503 + }, + { + "epoch": 0.7853479853479853, + "grad_norm": 1.8545931677630307, + "learning_rate": 2.320946364765869e-06, + "loss": 0.8532, + "step": 7504 + }, + { + "epoch": 0.7854526425954997, + "grad_norm": 2.0691817245860333, + "learning_rate": 2.3187754756086e-06, + "loss": 0.9241, + "step": 7505 + }, + { + "epoch": 0.7855572998430141, + "grad_norm": 1.830258047422379, + "learning_rate": 2.316605469039599e-06, + "loss": 0.777, + "step": 7506 + }, + { + "epoch": 0.7856619570905286, + "grad_norm": 2.1983345444045077, + "learning_rate": 2.3144363453082076e-06, + "loss": 0.9011, + "step": 7507 + }, + { + "epoch": 0.785766614338043, + "grad_norm": 2.1342406030341867, + "learning_rate": 2.31226810466366e-06, + "loss": 0.9166, + "step": 7508 + }, + { + "epoch": 0.7858712715855573, + "grad_norm": 2.1953438695436276, + "learning_rate": 2.310100747355092e-06, + "loss": 0.9392, + "step": 7509 + }, + { + "epoch": 0.7859759288330717, + "grad_norm": 2.0745384468715766, + "learning_rate": 2.3079342736315335e-06, + "loss": 0.85, + "step": 7510 + }, + { + "epoch": 0.7860805860805861, + "grad_norm": 2.0531247089902953, + "learning_rate": 2.3057686837419246e-06, + "loss": 0.9608, + "step": 7511 + }, + { + "epoch": 0.7861852433281005, + "grad_norm": 2.0646468979200194, + "learning_rate": 2.303603977935088e-06, + "loss": 0.8047, + "step": 7512 + }, + { + "epoch": 0.7862899005756149, + "grad_norm": 2.232605413909429, + "learning_rate": 2.30144015645976e-06, + "loss": 0.8882, + "step": 7513 + }, + { + "epoch": 0.7863945578231293, + "grad_norm": 1.9565581752784238, + "learning_rate": 2.299277219564565e-06, + "loss": 0.8375, + "step": 7514 + }, + { + "epoch": 0.7864992150706437, + "grad_norm": 2.0031871819667497, + "learning_rate": 2.2971151674980254e-06, + "loss": 0.8445, + "step": 7515 + }, + { + "epoch": 0.786603872318158, + "grad_norm": 2.222141146594989, + "learning_rate": 2.29495400050857e-06, + "loss": 0.9172, + "step": 7516 + }, + { + "epoch": 0.7867085295656724, + "grad_norm": 1.9729975164234717, + "learning_rate": 2.2927937188445217e-06, + "loss": 0.9044, + "step": 7517 + }, + { + "epoch": 0.7868131868131868, + "grad_norm": 1.8762093594300924, + "learning_rate": 2.2906343227540973e-06, + "loss": 0.8125, + "step": 7518 + }, + { + "epoch": 0.7869178440607012, + "grad_norm": 2.134221883980738, + "learning_rate": 2.2884758124854156e-06, + "loss": 0.8042, + "step": 7519 + }, + { + "epoch": 0.7870225013082156, + "grad_norm": 1.9860025023180254, + "learning_rate": 2.2863181882864995e-06, + "loss": 0.8895, + "step": 7520 + }, + { + "epoch": 0.78712715855573, + "grad_norm": 1.879511345324259, + "learning_rate": 2.2841614504052576e-06, + "loss": 0.8006, + "step": 7521 + }, + { + "epoch": 0.7872318158032444, + "grad_norm": 1.9777761746411802, + "learning_rate": 2.28200559908951e-06, + "loss": 0.9588, + "step": 7522 + }, + { + "epoch": 0.7873364730507587, + "grad_norm": 2.0015943184841034, + "learning_rate": 2.279850634586963e-06, + "loss": 0.8274, + "step": 7523 + }, + { + "epoch": 0.7874411302982731, + "grad_norm": 2.126665132820641, + "learning_rate": 2.2776965571452305e-06, + "loss": 0.8101, + "step": 7524 + }, + { + "epoch": 0.7875457875457875, + "grad_norm": 1.9386922457161486, + "learning_rate": 2.2755433670118156e-06, + "loss": 0.9006, + "step": 7525 + }, + { + "epoch": 0.7876504447933019, + "grad_norm": 2.1069264857135974, + "learning_rate": 2.2733910644341284e-06, + "loss": 0.8787, + "step": 7526 + }, + { + "epoch": 0.7877551020408163, + "grad_norm": 2.312845843018434, + "learning_rate": 2.2712396496594724e-06, + "loss": 0.8824, + "step": 7527 + }, + { + "epoch": 0.7878597592883307, + "grad_norm": 2.410379692069536, + "learning_rate": 2.2690891229350474e-06, + "loss": 0.8534, + "step": 7528 + }, + { + "epoch": 0.7879644165358451, + "grad_norm": 1.6004259225754003, + "learning_rate": 2.266939484507953e-06, + "loss": 0.7655, + "step": 7529 + }, + { + "epoch": 0.7880690737833596, + "grad_norm": 1.9401251846318475, + "learning_rate": 2.2647907346251874e-06, + "loss": 0.9055, + "step": 7530 + }, + { + "epoch": 0.7881737310308738, + "grad_norm": 1.8259854422426756, + "learning_rate": 2.2626428735336504e-06, + "loss": 0.8849, + "step": 7531 + }, + { + "epoch": 0.7882783882783883, + "grad_norm": 2.086066746215832, + "learning_rate": 2.260495901480132e-06, + "loss": 0.9008, + "step": 7532 + }, + { + "epoch": 0.7883830455259027, + "grad_norm": 1.920464809197468, + "learning_rate": 2.2583498187113253e-06, + "loss": 0.9415, + "step": 7533 + }, + { + "epoch": 0.7884877027734171, + "grad_norm": 2.266413825635565, + "learning_rate": 2.2562046254738144e-06, + "loss": 0.8273, + "step": 7534 + }, + { + "epoch": 0.7885923600209315, + "grad_norm": 1.737401277999669, + "learning_rate": 2.2540603220140934e-06, + "loss": 0.9201, + "step": 7535 + }, + { + "epoch": 0.7886970172684459, + "grad_norm": 2.1096363498094854, + "learning_rate": 2.251916908578544e-06, + "loss": 0.9345, + "step": 7536 + }, + { + "epoch": 0.7888016745159603, + "grad_norm": 2.150831886317924, + "learning_rate": 2.2497743854134493e-06, + "loss": 0.844, + "step": 7537 + }, + { + "epoch": 0.7889063317634746, + "grad_norm": 2.0401061997847547, + "learning_rate": 2.2476327527649865e-06, + "loss": 0.7954, + "step": 7538 + }, + { + "epoch": 0.789010989010989, + "grad_norm": 2.337995915788079, + "learning_rate": 2.2454920108792354e-06, + "loss": 0.9412, + "step": 7539 + }, + { + "epoch": 0.7891156462585034, + "grad_norm": 1.9830974874569107, + "learning_rate": 2.2433521600021744e-06, + "loss": 0.8127, + "step": 7540 + }, + { + "epoch": 0.7892203035060178, + "grad_norm": 2.167769265794437, + "learning_rate": 2.241213200379676e-06, + "loss": 0.7914, + "step": 7541 + }, + { + "epoch": 0.7893249607535322, + "grad_norm": 2.33273978878609, + "learning_rate": 2.239075132257509e-06, + "loss": 0.8763, + "step": 7542 + }, + { + "epoch": 0.7894296180010466, + "grad_norm": 2.030942504421985, + "learning_rate": 2.236937955881343e-06, + "loss": 0.9236, + "step": 7543 + }, + { + "epoch": 0.789534275248561, + "grad_norm": 1.7829580923400006, + "learning_rate": 2.234801671496739e-06, + "loss": 0.7905, + "step": 7544 + }, + { + "epoch": 0.7896389324960753, + "grad_norm": 1.9662829211533972, + "learning_rate": 2.232666279349168e-06, + "loss": 0.8887, + "step": 7545 + }, + { + "epoch": 0.7897435897435897, + "grad_norm": 2.0352979592881444, + "learning_rate": 2.2305317796839887e-06, + "loss": 0.7428, + "step": 7546 + }, + { + "epoch": 0.7898482469911041, + "grad_norm": 1.6539156555371572, + "learning_rate": 2.2283981727464544e-06, + "loss": 0.7944, + "step": 7547 + }, + { + "epoch": 0.7899529042386185, + "grad_norm": 2.1614558820180783, + "learning_rate": 2.226265458781728e-06, + "loss": 0.8095, + "step": 7548 + }, + { + "epoch": 0.7900575614861329, + "grad_norm": 1.908631638174456, + "learning_rate": 2.2241336380348557e-06, + "loss": 0.8981, + "step": 7549 + }, + { + "epoch": 0.7901622187336473, + "grad_norm": 2.073463123923285, + "learning_rate": 2.222002710750796e-06, + "loss": 0.8807, + "step": 7550 + }, + { + "epoch": 0.7902668759811617, + "grad_norm": 2.0673778416845185, + "learning_rate": 2.2198726771743905e-06, + "loss": 0.8022, + "step": 7551 + }, + { + "epoch": 0.790371533228676, + "grad_norm": 1.6946057542306046, + "learning_rate": 2.2177435375503874e-06, + "loss": 0.7675, + "step": 7552 + }, + { + "epoch": 0.7904761904761904, + "grad_norm": 2.04567393679294, + "learning_rate": 2.2156152921234254e-06, + "loss": 0.9319, + "step": 7553 + }, + { + "epoch": 0.7905808477237048, + "grad_norm": 2.3078311593974514, + "learning_rate": 2.2134879411380505e-06, + "loss": 0.9253, + "step": 7554 + }, + { + "epoch": 0.7906855049712193, + "grad_norm": 2.115100849729748, + "learning_rate": 2.2113614848386924e-06, + "loss": 0.9134, + "step": 7555 + }, + { + "epoch": 0.7907901622187337, + "grad_norm": 2.20323705537602, + "learning_rate": 2.2092359234696916e-06, + "loss": 0.977, + "step": 7556 + }, + { + "epoch": 0.7908948194662481, + "grad_norm": 1.9616275621121293, + "learning_rate": 2.207111257275276e-06, + "loss": 0.9302, + "step": 7557 + }, + { + "epoch": 0.7909994767137625, + "grad_norm": 2.059271815772192, + "learning_rate": 2.2049874864995735e-06, + "loss": 0.9602, + "step": 7558 + }, + { + "epoch": 0.7911041339612768, + "grad_norm": 1.9637722265176025, + "learning_rate": 2.2028646113866127e-06, + "loss": 0.8741, + "step": 7559 + }, + { + "epoch": 0.7912087912087912, + "grad_norm": 2.2232201652151837, + "learning_rate": 2.200742632180315e-06, + "loss": 0.8846, + "step": 7560 + }, + { + "epoch": 0.7913134484563056, + "grad_norm": 2.30436743903849, + "learning_rate": 2.1986215491245e-06, + "loss": 0.9266, + "step": 7561 + }, + { + "epoch": 0.79141810570382, + "grad_norm": 2.364762077617069, + "learning_rate": 2.196501362462883e-06, + "loss": 0.9375, + "step": 7562 + }, + { + "epoch": 0.7915227629513344, + "grad_norm": 2.353159650570667, + "learning_rate": 2.1943820724390763e-06, + "loss": 0.8549, + "step": 7563 + }, + { + "epoch": 0.7916274201988488, + "grad_norm": 2.032814738216188, + "learning_rate": 2.1922636792965935e-06, + "loss": 0.7728, + "step": 7564 + }, + { + "epoch": 0.7917320774463632, + "grad_norm": 1.8205772867039858, + "learning_rate": 2.1901461832788453e-06, + "loss": 0.9063, + "step": 7565 + }, + { + "epoch": 0.7918367346938775, + "grad_norm": 2.1878001602801254, + "learning_rate": 2.188029584629131e-06, + "loss": 0.8742, + "step": 7566 + }, + { + "epoch": 0.7919413919413919, + "grad_norm": 2.2177495247640664, + "learning_rate": 2.1859138835906557e-06, + "loss": 0.8931, + "step": 7567 + }, + { + "epoch": 0.7920460491889063, + "grad_norm": 2.5843532600798915, + "learning_rate": 2.1837990804065125e-06, + "loss": 0.93, + "step": 7568 + }, + { + "epoch": 0.7921507064364207, + "grad_norm": 1.7462838713565032, + "learning_rate": 2.1816851753197023e-06, + "loss": 0.7721, + "step": 7569 + }, + { + "epoch": 0.7922553636839351, + "grad_norm": 2.7345603215828964, + "learning_rate": 2.1795721685731153e-06, + "loss": 0.7922, + "step": 7570 + }, + { + "epoch": 0.7923600209314495, + "grad_norm": 2.458606643347218, + "learning_rate": 2.1774600604095407e-06, + "loss": 0.893, + "step": 7571 + }, + { + "epoch": 0.7924646781789639, + "grad_norm": 2.0422870744164956, + "learning_rate": 2.175348851071659e-06, + "loss": 0.8463, + "step": 7572 + }, + { + "epoch": 0.7925693354264783, + "grad_norm": 2.220719234475042, + "learning_rate": 2.1732385408020575e-06, + "loss": 0.941, + "step": 7573 + }, + { + "epoch": 0.7926739926739926, + "grad_norm": 1.975429690998908, + "learning_rate": 2.1711291298432157e-06, + "loss": 0.9173, + "step": 7574 + }, + { + "epoch": 0.792778649921507, + "grad_norm": 2.036016848312796, + "learning_rate": 2.1690206184375086e-06, + "loss": 0.8766, + "step": 7575 + }, + { + "epoch": 0.7928833071690214, + "grad_norm": 1.8743908375351486, + "learning_rate": 2.166913006827207e-06, + "loss": 0.8244, + "step": 7576 + }, + { + "epoch": 0.7929879644165359, + "grad_norm": 1.933946634791488, + "learning_rate": 2.164806295254478e-06, + "loss": 0.8598, + "step": 7577 + }, + { + "epoch": 0.7930926216640503, + "grad_norm": 1.7813235104033394, + "learning_rate": 2.162700483961392e-06, + "loss": 0.8794, + "step": 7578 + }, + { + "epoch": 0.7931972789115647, + "grad_norm": 2.2090460695960026, + "learning_rate": 2.1605955731899086e-06, + "loss": 0.9021, + "step": 7579 + }, + { + "epoch": 0.7933019361590791, + "grad_norm": 2.0974899535355616, + "learning_rate": 2.158491563181885e-06, + "loss": 0.8928, + "step": 7580 + }, + { + "epoch": 0.7934065934065934, + "grad_norm": 2.0169609622705154, + "learning_rate": 2.1563884541790747e-06, + "loss": 0.7913, + "step": 7581 + }, + { + "epoch": 0.7935112506541078, + "grad_norm": 2.199278005040713, + "learning_rate": 2.154286246423135e-06, + "loss": 0.9365, + "step": 7582 + }, + { + "epoch": 0.7936159079016222, + "grad_norm": 1.9357028205928937, + "learning_rate": 2.152184940155607e-06, + "loss": 0.8404, + "step": 7583 + }, + { + "epoch": 0.7937205651491366, + "grad_norm": 2.299234559003062, + "learning_rate": 2.150084535617942e-06, + "loss": 0.8855, + "step": 7584 + }, + { + "epoch": 0.793825222396651, + "grad_norm": 2.1127780268188454, + "learning_rate": 2.147985033051476e-06, + "loss": 0.8773, + "step": 7585 + }, + { + "epoch": 0.7939298796441654, + "grad_norm": 1.9778297585940738, + "learning_rate": 2.1458864326974495e-06, + "loss": 0.7991, + "step": 7586 + }, + { + "epoch": 0.7940345368916798, + "grad_norm": 2.815180081463802, + "learning_rate": 2.14378873479699e-06, + "loss": 1.1133, + "step": 7587 + }, + { + "epoch": 0.7941391941391941, + "grad_norm": 2.2536327406341474, + "learning_rate": 2.1416919395911327e-06, + "loss": 0.9107, + "step": 7588 + }, + { + "epoch": 0.7942438513867085, + "grad_norm": 2.0517416713013, + "learning_rate": 2.139596047320803e-06, + "loss": 0.9105, + "step": 7589 + }, + { + "epoch": 0.7943485086342229, + "grad_norm": 1.9716793290444405, + "learning_rate": 2.1375010582268194e-06, + "loss": 0.9514, + "step": 7590 + }, + { + "epoch": 0.7944531658817373, + "grad_norm": 1.7217851253182228, + "learning_rate": 2.135406972549906e-06, + "loss": 0.776, + "step": 7591 + }, + { + "epoch": 0.7945578231292517, + "grad_norm": 1.9071443124007157, + "learning_rate": 2.1333137905306723e-06, + "loss": 0.953, + "step": 7592 + }, + { + "epoch": 0.7946624803767661, + "grad_norm": 1.789970511220544, + "learning_rate": 2.1312215124096337e-06, + "loss": 0.7764, + "step": 7593 + }, + { + "epoch": 0.7947671376242805, + "grad_norm": 2.728962359247706, + "learning_rate": 2.1291301384271955e-06, + "loss": 0.8014, + "step": 7594 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 1.9322455444724893, + "learning_rate": 2.1270396688236595e-06, + "loss": 0.7246, + "step": 7595 + }, + { + "epoch": 0.7949764521193092, + "grad_norm": 2.0774223052480107, + "learning_rate": 2.124950103839224e-06, + "loss": 1.0114, + "step": 7596 + }, + { + "epoch": 0.7950811093668236, + "grad_norm": 2.275100035949746, + "learning_rate": 2.122861443713987e-06, + "loss": 0.8527, + "step": 7597 + }, + { + "epoch": 0.795185766614338, + "grad_norm": 1.9482436075243985, + "learning_rate": 2.1207736886879406e-06, + "loss": 0.8765, + "step": 7598 + }, + { + "epoch": 0.7952904238618524, + "grad_norm": 1.9226926868108565, + "learning_rate": 2.1186868390009664e-06, + "loss": 0.8391, + "step": 7599 + }, + { + "epoch": 0.7953950811093669, + "grad_norm": 1.968122748788881, + "learning_rate": 2.116600894892855e-06, + "loss": 0.8938, + "step": 7600 + }, + { + "epoch": 0.7954997383568813, + "grad_norm": 2.0362358868182655, + "learning_rate": 2.1145158566032808e-06, + "loss": 0.92, + "step": 7601 + }, + { + "epoch": 0.7956043956043956, + "grad_norm": 1.8167616269052242, + "learning_rate": 2.1124317243718184e-06, + "loss": 0.874, + "step": 7602 + }, + { + "epoch": 0.79570905285191, + "grad_norm": 2.081440590488677, + "learning_rate": 2.1103484984379443e-06, + "loss": 0.9316, + "step": 7603 + }, + { + "epoch": 0.7958137100994244, + "grad_norm": 2.0790409856006864, + "learning_rate": 2.1082661790410197e-06, + "loss": 0.8893, + "step": 7604 + }, + { + "epoch": 0.7959183673469388, + "grad_norm": 2.0996779895786135, + "learning_rate": 2.1061847664203107e-06, + "loss": 0.9361, + "step": 7605 + }, + { + "epoch": 0.7960230245944532, + "grad_norm": 2.1580716719163386, + "learning_rate": 2.1041042608149707e-06, + "loss": 0.9383, + "step": 7606 + }, + { + "epoch": 0.7961276818419676, + "grad_norm": 1.8816414196677191, + "learning_rate": 2.1020246624640595e-06, + "loss": 0.7567, + "step": 7607 + }, + { + "epoch": 0.796232339089482, + "grad_norm": 2.052842909348786, + "learning_rate": 2.0999459716065273e-06, + "loss": 0.8624, + "step": 7608 + }, + { + "epoch": 0.7963369963369963, + "grad_norm": 2.0109322496007733, + "learning_rate": 2.097868188481217e-06, + "loss": 0.9112, + "step": 7609 + }, + { + "epoch": 0.7964416535845107, + "grad_norm": 1.9850219383542753, + "learning_rate": 2.0957913133268726e-06, + "loss": 0.7805, + "step": 7610 + }, + { + "epoch": 0.7965463108320251, + "grad_norm": 2.257844754431785, + "learning_rate": 2.0937153463821257e-06, + "loss": 0.8675, + "step": 7611 + }, + { + "epoch": 0.7966509680795395, + "grad_norm": 2.002649601742002, + "learning_rate": 2.0916402878855168e-06, + "loss": 0.8938, + "step": 7612 + }, + { + "epoch": 0.7967556253270539, + "grad_norm": 1.921356147737922, + "learning_rate": 2.0895661380754715e-06, + "loss": 0.8056, + "step": 7613 + }, + { + "epoch": 0.7968602825745683, + "grad_norm": 2.2073850950321185, + "learning_rate": 2.087492897190312e-06, + "loss": 0.9354, + "step": 7614 + }, + { + "epoch": 0.7969649398220827, + "grad_norm": 2.3982569594670218, + "learning_rate": 2.085420565468256e-06, + "loss": 0.9788, + "step": 7615 + }, + { + "epoch": 0.7970695970695971, + "grad_norm": 2.3499158850699007, + "learning_rate": 2.083349143147422e-06, + "loss": 0.9463, + "step": 7616 + }, + { + "epoch": 0.7971742543171114, + "grad_norm": 1.784998934231335, + "learning_rate": 2.081278630465823e-06, + "loss": 0.8517, + "step": 7617 + }, + { + "epoch": 0.7972789115646258, + "grad_norm": 2.3333292170699687, + "learning_rate": 2.079209027661362e-06, + "loss": 0.8067, + "step": 7618 + }, + { + "epoch": 0.7973835688121402, + "grad_norm": 2.4308857648334627, + "learning_rate": 2.077140334971841e-06, + "loss": 0.8981, + "step": 7619 + }, + { + "epoch": 0.7974882260596546, + "grad_norm": 2.2155185275133524, + "learning_rate": 2.075072552634957e-06, + "loss": 0.8724, + "step": 7620 + }, + { + "epoch": 0.797592883307169, + "grad_norm": 1.8788752477824127, + "learning_rate": 2.0730056808882993e-06, + "loss": 0.7483, + "step": 7621 + }, + { + "epoch": 0.7976975405546834, + "grad_norm": 2.2866944996543146, + "learning_rate": 2.0709397199693616e-06, + "loss": 1.0201, + "step": 7622 + }, + { + "epoch": 0.7978021978021979, + "grad_norm": 2.2884876205119853, + "learning_rate": 2.068874670115524e-06, + "loss": 0.8809, + "step": 7623 + }, + { + "epoch": 0.7979068550497121, + "grad_norm": 2.2069390747276367, + "learning_rate": 2.0668105315640643e-06, + "loss": 0.9331, + "step": 7624 + }, + { + "epoch": 0.7980115122972266, + "grad_norm": 2.2300750543609316, + "learning_rate": 2.0647473045521536e-06, + "loss": 0.9861, + "step": 7625 + }, + { + "epoch": 0.798116169544741, + "grad_norm": 1.9473189124396104, + "learning_rate": 2.0626849893168655e-06, + "loss": 0.8974, + "step": 7626 + }, + { + "epoch": 0.7982208267922554, + "grad_norm": 2.1241126439462175, + "learning_rate": 2.060623586095164e-06, + "loss": 0.8733, + "step": 7627 + }, + { + "epoch": 0.7983254840397698, + "grad_norm": 2.596050491489067, + "learning_rate": 2.058563095123908e-06, + "loss": 0.8455, + "step": 7628 + }, + { + "epoch": 0.7984301412872842, + "grad_norm": 1.6496184313046733, + "learning_rate": 2.056503516639852e-06, + "loss": 0.7479, + "step": 7629 + }, + { + "epoch": 0.7985347985347986, + "grad_norm": 1.9392866056291445, + "learning_rate": 2.054444850879641e-06, + "loss": 0.8385, + "step": 7630 + }, + { + "epoch": 0.7986394557823129, + "grad_norm": 2.2720725714699754, + "learning_rate": 2.052387098079828e-06, + "loss": 0.9297, + "step": 7631 + }, + { + "epoch": 0.7987441130298273, + "grad_norm": 2.0705720785860238, + "learning_rate": 2.050330258476848e-06, + "loss": 0.8512, + "step": 7632 + }, + { + "epoch": 0.7988487702773417, + "grad_norm": 1.9969354338670204, + "learning_rate": 2.048274332307034e-06, + "loss": 0.8813, + "step": 7633 + }, + { + "epoch": 0.7989534275248561, + "grad_norm": 2.6051722309501764, + "learning_rate": 2.0462193198066226e-06, + "loss": 0.9048, + "step": 7634 + }, + { + "epoch": 0.7990580847723705, + "grad_norm": 2.164629627302996, + "learning_rate": 2.0441652212117326e-06, + "loss": 0.9597, + "step": 7635 + }, + { + "epoch": 0.7991627420198849, + "grad_norm": 1.9912390762348158, + "learning_rate": 2.042112036758389e-06, + "loss": 0.9058, + "step": 7636 + }, + { + "epoch": 0.7992673992673993, + "grad_norm": 1.8720345668097196, + "learning_rate": 2.040059766682504e-06, + "loss": 0.8318, + "step": 7637 + }, + { + "epoch": 0.7993720565149136, + "grad_norm": 2.0710286245998946, + "learning_rate": 2.0380084112198893e-06, + "loss": 0.849, + "step": 7638 + }, + { + "epoch": 0.799476713762428, + "grad_norm": 1.8830717363876028, + "learning_rate": 2.0359579706062493e-06, + "loss": 0.9041, + "step": 7639 + }, + { + "epoch": 0.7995813710099424, + "grad_norm": 1.895209812890259, + "learning_rate": 2.0339084450771797e-06, + "loss": 0.7978, + "step": 7640 + }, + { + "epoch": 0.7996860282574568, + "grad_norm": 1.8023260782135615, + "learning_rate": 2.031859834868182e-06, + "loss": 0.8613, + "step": 7641 + }, + { + "epoch": 0.7997906855049712, + "grad_norm": 2.1045087518839125, + "learning_rate": 2.0298121402146386e-06, + "loss": 0.8604, + "step": 7642 + }, + { + "epoch": 0.7998953427524856, + "grad_norm": 1.7435782505197222, + "learning_rate": 2.0277653613518413e-06, + "loss": 0.7904, + "step": 7643 + }, + { + "epoch": 0.8, + "grad_norm": 2.393742131042189, + "learning_rate": 2.0257194985149653e-06, + "loss": 0.8332, + "step": 7644 + }, + { + "epoch": 0.8001046572475143, + "grad_norm": 2.0074026574381003, + "learning_rate": 2.023674551939081e-06, + "loss": 1.0339, + "step": 7645 + }, + { + "epoch": 0.8002093144950287, + "grad_norm": 2.1072035338876045, + "learning_rate": 2.0216305218591636e-06, + "loss": 0.9326, + "step": 7646 + }, + { + "epoch": 0.8003139717425432, + "grad_norm": 2.2169694179321766, + "learning_rate": 2.019587408510074e-06, + "loss": 1.0086, + "step": 7647 + }, + { + "epoch": 0.8004186289900576, + "grad_norm": 1.745468429945779, + "learning_rate": 2.017545212126568e-06, + "loss": 0.7837, + "step": 7648 + }, + { + "epoch": 0.800523286237572, + "grad_norm": 2.307042540661392, + "learning_rate": 2.015503932943297e-06, + "loss": 0.9432, + "step": 7649 + }, + { + "epoch": 0.8006279434850864, + "grad_norm": 1.8426257142814524, + "learning_rate": 2.0134635711948125e-06, + "loss": 0.9432, + "step": 7650 + }, + { + "epoch": 0.8007326007326008, + "grad_norm": 1.9616990567522987, + "learning_rate": 2.011424127115552e-06, + "loss": 0.8726, + "step": 7651 + }, + { + "epoch": 0.8008372579801151, + "grad_norm": 2.0819208074157536, + "learning_rate": 2.009385600939856e-06, + "loss": 0.8719, + "step": 7652 + }, + { + "epoch": 0.8009419152276295, + "grad_norm": 2.357873291710539, + "learning_rate": 2.0073479929019536e-06, + "loss": 1.0302, + "step": 7653 + }, + { + "epoch": 0.8010465724751439, + "grad_norm": 2.22508676135223, + "learning_rate": 2.005311303235966e-06, + "loss": 0.9423, + "step": 7654 + }, + { + "epoch": 0.8011512297226583, + "grad_norm": 2.4557652270731616, + "learning_rate": 2.00327553217592e-06, + "loss": 0.8045, + "step": 7655 + }, + { + "epoch": 0.8012558869701727, + "grad_norm": 2.258447659718386, + "learning_rate": 2.001240679955727e-06, + "loss": 0.8484, + "step": 7656 + }, + { + "epoch": 0.8013605442176871, + "grad_norm": 2.0218151709850316, + "learning_rate": 1.9992067468091947e-06, + "loss": 0.9074, + "step": 7657 + }, + { + "epoch": 0.8014652014652015, + "grad_norm": 2.31088112748935, + "learning_rate": 1.9971737329700256e-06, + "loss": 0.7952, + "step": 7658 + }, + { + "epoch": 0.8015698587127159, + "grad_norm": 2.132378499236807, + "learning_rate": 1.995141638671816e-06, + "loss": 0.8665, + "step": 7659 + }, + { + "epoch": 0.8016745159602302, + "grad_norm": 2.1426183845478053, + "learning_rate": 1.993110464148059e-06, + "loss": 0.9102, + "step": 7660 + }, + { + "epoch": 0.8017791732077446, + "grad_norm": 2.1293042574252867, + "learning_rate": 1.9910802096321457e-06, + "loss": 0.8794, + "step": 7661 + }, + { + "epoch": 0.801883830455259, + "grad_norm": 2.1089889655402168, + "learning_rate": 1.98905087535735e-06, + "loss": 0.9595, + "step": 7662 + }, + { + "epoch": 0.8019884877027734, + "grad_norm": 2.0969209074252615, + "learning_rate": 1.98702246155685e-06, + "loss": 0.9647, + "step": 7663 + }, + { + "epoch": 0.8020931449502878, + "grad_norm": 1.992617104506457, + "learning_rate": 1.9849949684637103e-06, + "loss": 0.8987, + "step": 7664 + }, + { + "epoch": 0.8021978021978022, + "grad_norm": 2.633618499233918, + "learning_rate": 1.9829683963108992e-06, + "loss": 0.9034, + "step": 7665 + }, + { + "epoch": 0.8023024594453166, + "grad_norm": 2.2056134263285907, + "learning_rate": 1.980942745331271e-06, + "loss": 0.8807, + "step": 7666 + }, + { + "epoch": 0.8024071166928309, + "grad_norm": 2.068273680645422, + "learning_rate": 1.9789180157575795e-06, + "loss": 0.8689, + "step": 7667 + }, + { + "epoch": 0.8025117739403453, + "grad_norm": 2.019594583927551, + "learning_rate": 1.976894207822464e-06, + "loss": 0.9953, + "step": 7668 + }, + { + "epoch": 0.8026164311878597, + "grad_norm": 2.316889621162141, + "learning_rate": 1.9748713217584693e-06, + "loss": 0.8515, + "step": 7669 + }, + { + "epoch": 0.8027210884353742, + "grad_norm": 2.111794166943864, + "learning_rate": 1.972849357798031e-06, + "loss": 0.9745, + "step": 7670 + }, + { + "epoch": 0.8028257456828886, + "grad_norm": 2.03962760228518, + "learning_rate": 1.970828316173474e-06, + "loss": 0.8139, + "step": 7671 + }, + { + "epoch": 0.802930402930403, + "grad_norm": 2.069040546093496, + "learning_rate": 1.9688081971170202e-06, + "loss": 0.8368, + "step": 7672 + }, + { + "epoch": 0.8030350601779174, + "grad_norm": 2.1304131521617613, + "learning_rate": 1.9667890008607827e-06, + "loss": 0.8431, + "step": 7673 + }, + { + "epoch": 0.8031397174254317, + "grad_norm": 1.95426739187706, + "learning_rate": 1.964770727636778e-06, + "loss": 0.9034, + "step": 7674 + }, + { + "epoch": 0.8032443746729461, + "grad_norm": 2.1064759648577773, + "learning_rate": 1.962753377676905e-06, + "loss": 0.8432, + "step": 7675 + }, + { + "epoch": 0.8033490319204605, + "grad_norm": 1.8850818133792997, + "learning_rate": 1.9607369512129615e-06, + "loss": 0.8973, + "step": 7676 + }, + { + "epoch": 0.8034536891679749, + "grad_norm": 1.5859786709753045, + "learning_rate": 1.958721448476637e-06, + "loss": 0.7476, + "step": 7677 + }, + { + "epoch": 0.8035583464154893, + "grad_norm": 2.220270218255743, + "learning_rate": 1.9567068696995227e-06, + "loss": 0.9196, + "step": 7678 + }, + { + "epoch": 0.8036630036630037, + "grad_norm": 1.6376859017657255, + "learning_rate": 1.9546932151130913e-06, + "loss": 0.771, + "step": 7679 + }, + { + "epoch": 0.8037676609105181, + "grad_norm": 2.1672382842265185, + "learning_rate": 1.9526804849487225e-06, + "loss": 0.9884, + "step": 7680 + }, + { + "epoch": 0.8038723181580324, + "grad_norm": 2.16714258180435, + "learning_rate": 1.9506686794376782e-06, + "loss": 0.9114, + "step": 7681 + }, + { + "epoch": 0.8039769754055468, + "grad_norm": 2.130041323517797, + "learning_rate": 1.948657798811121e-06, + "loss": 0.9454, + "step": 7682 + }, + { + "epoch": 0.8040816326530612, + "grad_norm": 2.0922079549466512, + "learning_rate": 1.9466478433001022e-06, + "loss": 0.8827, + "step": 7683 + }, + { + "epoch": 0.8041862899005756, + "grad_norm": 2.0945418746948197, + "learning_rate": 1.9446388131355753e-06, + "loss": 0.8897, + "step": 7684 + }, + { + "epoch": 0.80429094714809, + "grad_norm": 2.236535687893619, + "learning_rate": 1.9426307085483753e-06, + "loss": 0.9248, + "step": 7685 + }, + { + "epoch": 0.8043956043956044, + "grad_norm": 2.0020235889260647, + "learning_rate": 1.9406235297692434e-06, + "loss": 0.8902, + "step": 7686 + }, + { + "epoch": 0.8045002616431188, + "grad_norm": 2.161723959121442, + "learning_rate": 1.9386172770288057e-06, + "loss": 0.9059, + "step": 7687 + }, + { + "epoch": 0.8046049188906331, + "grad_norm": 2.1277819801638373, + "learning_rate": 1.936611950557583e-06, + "loss": 0.7741, + "step": 7688 + }, + { + "epoch": 0.8047095761381475, + "grad_norm": 1.8553064489672486, + "learning_rate": 1.9346075505859953e-06, + "loss": 0.8993, + "step": 7689 + }, + { + "epoch": 0.8048142333856619, + "grad_norm": 1.7911656465689691, + "learning_rate": 1.932604077344351e-06, + "loss": 0.7607, + "step": 7690 + }, + { + "epoch": 0.8049188906331763, + "grad_norm": 1.6049725585488301, + "learning_rate": 1.9306015310628523e-06, + "loss": 0.7695, + "step": 7691 + }, + { + "epoch": 0.8050235478806907, + "grad_norm": 1.9936774666260564, + "learning_rate": 1.9285999119715936e-06, + "loss": 0.755, + "step": 7692 + }, + { + "epoch": 0.8051282051282052, + "grad_norm": 2.164086612979867, + "learning_rate": 1.926599220300569e-06, + "loss": 0.9036, + "step": 7693 + }, + { + "epoch": 0.8052328623757196, + "grad_norm": 2.1877719623928247, + "learning_rate": 1.924599456279659e-06, + "loss": 0.9081, + "step": 7694 + }, + { + "epoch": 0.8053375196232339, + "grad_norm": 1.9764063147501494, + "learning_rate": 1.922600620138645e-06, + "loss": 0.7899, + "step": 7695 + }, + { + "epoch": 0.8054421768707483, + "grad_norm": 2.096661411005218, + "learning_rate": 1.9206027121071937e-06, + "loss": 0.8407, + "step": 7696 + }, + { + "epoch": 0.8055468341182627, + "grad_norm": 2.2156877709091742, + "learning_rate": 1.918605732414869e-06, + "loss": 0.924, + "step": 7697 + }, + { + "epoch": 0.8056514913657771, + "grad_norm": 1.6524591320676993, + "learning_rate": 1.916609681291126e-06, + "loss": 0.8249, + "step": 7698 + }, + { + "epoch": 0.8057561486132915, + "grad_norm": 2.3280205948736694, + "learning_rate": 1.914614558965321e-06, + "loss": 0.864, + "step": 7699 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 1.8899662650771822, + "learning_rate": 1.9126203656666918e-06, + "loss": 0.7827, + "step": 7700 + }, + { + "epoch": 0.8059654631083203, + "grad_norm": 1.8294831259412443, + "learning_rate": 1.9106271016243792e-06, + "loss": 0.7265, + "step": 7701 + }, + { + "epoch": 0.8060701203558347, + "grad_norm": 1.8127721118364941, + "learning_rate": 1.908634767067409e-06, + "loss": 0.7789, + "step": 7702 + }, + { + "epoch": 0.806174777603349, + "grad_norm": 2.0601656865089004, + "learning_rate": 1.906643362224706e-06, + "loss": 0.7856, + "step": 7703 + }, + { + "epoch": 0.8062794348508634, + "grad_norm": 2.0340414414029984, + "learning_rate": 1.9046528873250902e-06, + "loss": 0.8515, + "step": 7704 + }, + { + "epoch": 0.8063840920983778, + "grad_norm": 1.9432516065364067, + "learning_rate": 1.9026633425972696e-06, + "loss": 0.8491, + "step": 7705 + }, + { + "epoch": 0.8064887493458922, + "grad_norm": 2.4510874165764838, + "learning_rate": 1.9006747282698445e-06, + "loss": 0.7151, + "step": 7706 + }, + { + "epoch": 0.8065934065934066, + "grad_norm": 2.3967160624954786, + "learning_rate": 1.8986870445713112e-06, + "loss": 0.8809, + "step": 7707 + }, + { + "epoch": 0.806698063840921, + "grad_norm": 2.1255469660957766, + "learning_rate": 1.8967002917300614e-06, + "loss": 0.918, + "step": 7708 + }, + { + "epoch": 0.8068027210884354, + "grad_norm": 2.178668681213783, + "learning_rate": 1.8947144699743747e-06, + "loss": 0.9926, + "step": 7709 + }, + { + "epoch": 0.8069073783359497, + "grad_norm": 2.2070908456686933, + "learning_rate": 1.8927295795324275e-06, + "loss": 0.8722, + "step": 7710 + }, + { + "epoch": 0.8070120355834641, + "grad_norm": 1.817792067239064, + "learning_rate": 1.890745620632284e-06, + "loss": 0.8888, + "step": 7711 + }, + { + "epoch": 0.8071166928309785, + "grad_norm": 2.5554707262103293, + "learning_rate": 1.8887625935019072e-06, + "loss": 0.9806, + "step": 7712 + }, + { + "epoch": 0.8072213500784929, + "grad_norm": 2.304574255962299, + "learning_rate": 1.886780498369154e-06, + "loss": 0.8317, + "step": 7713 + }, + { + "epoch": 0.8073260073260073, + "grad_norm": 2.0339279457346375, + "learning_rate": 1.8847993354617689e-06, + "loss": 0.9634, + "step": 7714 + }, + { + "epoch": 0.8074306645735218, + "grad_norm": 2.0844568689691867, + "learning_rate": 1.8828191050073918e-06, + "loss": 0.9645, + "step": 7715 + }, + { + "epoch": 0.8075353218210362, + "grad_norm": 1.9109270715202566, + "learning_rate": 1.8808398072335555e-06, + "loss": 0.8214, + "step": 7716 + }, + { + "epoch": 0.8076399790685505, + "grad_norm": 1.9339620974128462, + "learning_rate": 1.878861442367681e-06, + "loss": 0.8987, + "step": 7717 + }, + { + "epoch": 0.8077446363160649, + "grad_norm": 1.9397724352401922, + "learning_rate": 1.8768840106370922e-06, + "loss": 0.8367, + "step": 7718 + }, + { + "epoch": 0.8078492935635793, + "grad_norm": 2.1190867449260917, + "learning_rate": 1.8749075122689986e-06, + "loss": 0.8944, + "step": 7719 + }, + { + "epoch": 0.8079539508110937, + "grad_norm": 2.670252322303464, + "learning_rate": 1.8729319474905005e-06, + "loss": 0.8799, + "step": 7720 + }, + { + "epoch": 0.8080586080586081, + "grad_norm": 2.141457200125952, + "learning_rate": 1.8709573165286e-06, + "loss": 0.8191, + "step": 7721 + }, + { + "epoch": 0.8081632653061225, + "grad_norm": 1.7558169293557664, + "learning_rate": 1.8689836196101797e-06, + "loss": 0.8741, + "step": 7722 + }, + { + "epoch": 0.8082679225536369, + "grad_norm": 2.06145311599018, + "learning_rate": 1.867010856962027e-06, + "loss": 0.9091, + "step": 7723 + }, + { + "epoch": 0.8083725798011512, + "grad_norm": 2.1553715349745937, + "learning_rate": 1.8650390288108156e-06, + "loss": 0.9571, + "step": 7724 + }, + { + "epoch": 0.8084772370486656, + "grad_norm": 2.6232290933594307, + "learning_rate": 1.8630681353831104e-06, + "loss": 0.7693, + "step": 7725 + }, + { + "epoch": 0.80858189429618, + "grad_norm": 2.3021150057545627, + "learning_rate": 1.8610981769053681e-06, + "loss": 0.9316, + "step": 7726 + }, + { + "epoch": 0.8086865515436944, + "grad_norm": 1.9431158516385172, + "learning_rate": 1.859129153603949e-06, + "loss": 0.9173, + "step": 7727 + }, + { + "epoch": 0.8087912087912088, + "grad_norm": 2.040241180020073, + "learning_rate": 1.8571610657050921e-06, + "loss": 0.9225, + "step": 7728 + }, + { + "epoch": 0.8088958660387232, + "grad_norm": 2.049568434268225, + "learning_rate": 1.855193913434934e-06, + "loss": 0.8637, + "step": 7729 + }, + { + "epoch": 0.8090005232862376, + "grad_norm": 2.2567330112829866, + "learning_rate": 1.8532276970195085e-06, + "loss": 0.8094, + "step": 7730 + }, + { + "epoch": 0.8091051805337519, + "grad_norm": 2.4898641315793144, + "learning_rate": 1.8512624166847338e-06, + "loss": 0.9544, + "step": 7731 + }, + { + "epoch": 0.8092098377812663, + "grad_norm": 2.087408291721611, + "learning_rate": 1.8492980726564292e-06, + "loss": 0.8928, + "step": 7732 + }, + { + "epoch": 0.8093144950287807, + "grad_norm": 1.9848372835802772, + "learning_rate": 1.8473346651602986e-06, + "loss": 0.9422, + "step": 7733 + }, + { + "epoch": 0.8094191522762951, + "grad_norm": 1.9749206184830705, + "learning_rate": 1.8453721944219428e-06, + "loss": 0.835, + "step": 7734 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 2.298149888799161, + "learning_rate": 1.8434106606668522e-06, + "loss": 0.9568, + "step": 7735 + }, + { + "epoch": 0.8096284667713239, + "grad_norm": 2.081259887949481, + "learning_rate": 1.8414500641204104e-06, + "loss": 0.8567, + "step": 7736 + }, + { + "epoch": 0.8097331240188383, + "grad_norm": 2.1873810909435294, + "learning_rate": 1.8394904050078943e-06, + "loss": 0.8909, + "step": 7737 + }, + { + "epoch": 0.8098377812663526, + "grad_norm": 2.064239458078798, + "learning_rate": 1.8375316835544775e-06, + "loss": 0.7535, + "step": 7738 + }, + { + "epoch": 0.809942438513867, + "grad_norm": 1.9681755929693125, + "learning_rate": 1.8355738999852157e-06, + "loss": 0.8752, + "step": 7739 + }, + { + "epoch": 0.8100470957613815, + "grad_norm": 1.9419726368546049, + "learning_rate": 1.8336170545250653e-06, + "loss": 0.9044, + "step": 7740 + }, + { + "epoch": 0.8101517530088959, + "grad_norm": 2.1087815522447033, + "learning_rate": 1.8316611473988678e-06, + "loss": 0.8237, + "step": 7741 + }, + { + "epoch": 0.8102564102564103, + "grad_norm": 2.078852602883871, + "learning_rate": 1.8297061788313652e-06, + "loss": 0.843, + "step": 7742 + }, + { + "epoch": 0.8103610675039247, + "grad_norm": 1.951856666967445, + "learning_rate": 1.827752149047185e-06, + "loss": 0.9578, + "step": 7743 + }, + { + "epoch": 0.8104657247514391, + "grad_norm": 1.7953444494690902, + "learning_rate": 1.8257990582708508e-06, + "loss": 0.7784, + "step": 7744 + }, + { + "epoch": 0.8105703819989535, + "grad_norm": 1.9250641329953944, + "learning_rate": 1.8238469067267728e-06, + "loss": 0.9653, + "step": 7745 + }, + { + "epoch": 0.8106750392464678, + "grad_norm": 1.8425811745515865, + "learning_rate": 1.8218956946392597e-06, + "loss": 0.8461, + "step": 7746 + }, + { + "epoch": 0.8107796964939822, + "grad_norm": 1.6927128485168828, + "learning_rate": 1.819945422232513e-06, + "loss": 0.8375, + "step": 7747 + }, + { + "epoch": 0.8108843537414966, + "grad_norm": 1.96120471071044, + "learning_rate": 1.8179960897306192e-06, + "loss": 0.6927, + "step": 7748 + }, + { + "epoch": 0.810989010989011, + "grad_norm": 1.8908478940115092, + "learning_rate": 1.8160476973575624e-06, + "loss": 0.9127, + "step": 7749 + }, + { + "epoch": 0.8110936682365254, + "grad_norm": 2.3740896100862194, + "learning_rate": 1.8141002453372126e-06, + "loss": 0.8577, + "step": 7750 + }, + { + "epoch": 0.8111983254840398, + "grad_norm": 1.9405102048474503, + "learning_rate": 1.8121537338933425e-06, + "loss": 0.8621, + "step": 7751 + }, + { + "epoch": 0.8113029827315542, + "grad_norm": 2.101087698006679, + "learning_rate": 1.8102081632496061e-06, + "loss": 0.9008, + "step": 7752 + }, + { + "epoch": 0.8114076399790685, + "grad_norm": 2.109664241764611, + "learning_rate": 1.808263533629554e-06, + "loss": 0.8728, + "step": 7753 + }, + { + "epoch": 0.8115122972265829, + "grad_norm": 1.957536899983253, + "learning_rate": 1.8063198452566288e-06, + "loss": 0.9817, + "step": 7754 + }, + { + "epoch": 0.8116169544740973, + "grad_norm": 2.29053423643524, + "learning_rate": 1.804377098354161e-06, + "loss": 0.8997, + "step": 7755 + }, + { + "epoch": 0.8117216117216117, + "grad_norm": 1.9515520676387297, + "learning_rate": 1.8024352931453791e-06, + "loss": 0.9398, + "step": 7756 + }, + { + "epoch": 0.8118262689691261, + "grad_norm": 2.2052154737151164, + "learning_rate": 1.8004944298534022e-06, + "loss": 0.9776, + "step": 7757 + }, + { + "epoch": 0.8119309262166405, + "grad_norm": 1.8460361778888326, + "learning_rate": 1.798554508701238e-06, + "loss": 0.7619, + "step": 7758 + }, + { + "epoch": 0.8120355834641549, + "grad_norm": 2.0564089124118703, + "learning_rate": 1.796615529911786e-06, + "loss": 0.9962, + "step": 7759 + }, + { + "epoch": 0.8121402407116692, + "grad_norm": 1.5242392257544974, + "learning_rate": 1.7946774937078383e-06, + "loss": 0.7455, + "step": 7760 + }, + { + "epoch": 0.8122448979591836, + "grad_norm": 1.9373147411960117, + "learning_rate": 1.792740400312082e-06, + "loss": 0.9294, + "step": 7761 + }, + { + "epoch": 0.812349555206698, + "grad_norm": 2.253211453976033, + "learning_rate": 1.7908042499470924e-06, + "loss": 1.0895, + "step": 7762 + }, + { + "epoch": 0.8124542124542125, + "grad_norm": 2.010874372372902, + "learning_rate": 1.788869042835335e-06, + "loss": 0.7848, + "step": 7763 + }, + { + "epoch": 0.8125588697017269, + "grad_norm": 2.1110812558999026, + "learning_rate": 1.7869347791991686e-06, + "loss": 0.8749, + "step": 7764 + }, + { + "epoch": 0.8126635269492413, + "grad_norm": 2.0790033933698506, + "learning_rate": 1.7850014592608457e-06, + "loss": 0.8357, + "step": 7765 + }, + { + "epoch": 0.8127681841967557, + "grad_norm": 2.2764948530690794, + "learning_rate": 1.7830690832425114e-06, + "loss": 0.85, + "step": 7766 + }, + { + "epoch": 0.81287284144427, + "grad_norm": 2.0246844811622617, + "learning_rate": 1.781137651366196e-06, + "loss": 0.9454, + "step": 7767 + }, + { + "epoch": 0.8129774986917844, + "grad_norm": 2.062713254460875, + "learning_rate": 1.7792071638538267e-06, + "loss": 0.914, + "step": 7768 + }, + { + "epoch": 0.8130821559392988, + "grad_norm": 2.060831788514829, + "learning_rate": 1.777277620927217e-06, + "loss": 0.8909, + "step": 7769 + }, + { + "epoch": 0.8131868131868132, + "grad_norm": 2.1960138141055037, + "learning_rate": 1.7753490228080795e-06, + "loss": 0.8942, + "step": 7770 + }, + { + "epoch": 0.8132914704343276, + "grad_norm": 2.019766608559704, + "learning_rate": 1.7734213697180137e-06, + "loss": 0.7633, + "step": 7771 + }, + { + "epoch": 0.813396127681842, + "grad_norm": 1.9474343159482823, + "learning_rate": 1.7714946618785056e-06, + "loss": 0.8488, + "step": 7772 + }, + { + "epoch": 0.8135007849293564, + "grad_norm": 2.1858861212932004, + "learning_rate": 1.769568899510945e-06, + "loss": 0.8966, + "step": 7773 + }, + { + "epoch": 0.8136054421768707, + "grad_norm": 1.8619826191371032, + "learning_rate": 1.7676440828366026e-06, + "loss": 0.849, + "step": 7774 + }, + { + "epoch": 0.8137100994243851, + "grad_norm": 1.882083312231594, + "learning_rate": 1.7657202120766414e-06, + "loss": 0.851, + "step": 7775 + }, + { + "epoch": 0.8138147566718995, + "grad_norm": 2.0412076045520138, + "learning_rate": 1.7637972874521237e-06, + "loss": 0.711, + "step": 7776 + }, + { + "epoch": 0.8139194139194139, + "grad_norm": 2.0941706761569145, + "learning_rate": 1.7618753091839924e-06, + "loss": 0.904, + "step": 7777 + }, + { + "epoch": 0.8140240711669283, + "grad_norm": 2.070231862068319, + "learning_rate": 1.75995427749309e-06, + "loss": 0.9615, + "step": 7778 + }, + { + "epoch": 0.8141287284144427, + "grad_norm": 1.9533497638106823, + "learning_rate": 1.758034192600142e-06, + "loss": 0.7906, + "step": 7779 + }, + { + "epoch": 0.8142333856619571, + "grad_norm": 1.8061715691464177, + "learning_rate": 1.7561150547257766e-06, + "loss": 0.9816, + "step": 7780 + }, + { + "epoch": 0.8143380429094714, + "grad_norm": 1.9499562486471225, + "learning_rate": 1.7541968640905005e-06, + "loss": 0.8275, + "step": 7781 + }, + { + "epoch": 0.8144427001569858, + "grad_norm": 1.9941853271497223, + "learning_rate": 1.7522796209147241e-06, + "loss": 1.0171, + "step": 7782 + }, + { + "epoch": 0.8145473574045002, + "grad_norm": 2.016835021467956, + "learning_rate": 1.750363325418738e-06, + "loss": 0.9194, + "step": 7783 + }, + { + "epoch": 0.8146520146520146, + "grad_norm": 2.349001003678539, + "learning_rate": 1.7484479778227281e-06, + "loss": 0.8795, + "step": 7784 + }, + { + "epoch": 0.814756671899529, + "grad_norm": 2.815561540260484, + "learning_rate": 1.7465335783467753e-06, + "loss": 0.8893, + "step": 7785 + }, + { + "epoch": 0.8148613291470435, + "grad_norm": 1.856879993868537, + "learning_rate": 1.7446201272108454e-06, + "loss": 0.8485, + "step": 7786 + }, + { + "epoch": 0.8149659863945579, + "grad_norm": 1.6972040689665617, + "learning_rate": 1.7427076246347984e-06, + "loss": 0.8006, + "step": 7787 + }, + { + "epoch": 0.8150706436420723, + "grad_norm": 2.4050270886241174, + "learning_rate": 1.7407960708383852e-06, + "loss": 0.8119, + "step": 7788 + }, + { + "epoch": 0.8151753008895866, + "grad_norm": 2.0596208441064747, + "learning_rate": 1.7388854660412435e-06, + "loss": 0.8787, + "step": 7789 + }, + { + "epoch": 0.815279958137101, + "grad_norm": 2.6144498345375675, + "learning_rate": 1.7369758104629098e-06, + "loss": 1.0528, + "step": 7790 + }, + { + "epoch": 0.8153846153846154, + "grad_norm": 1.8301486855272067, + "learning_rate": 1.7350671043228072e-06, + "loss": 0.8948, + "step": 7791 + }, + { + "epoch": 0.8154892726321298, + "grad_norm": 2.2092777762924656, + "learning_rate": 1.7331593478402508e-06, + "loss": 0.9019, + "step": 7792 + }, + { + "epoch": 0.8155939298796442, + "grad_norm": 1.931054386839299, + "learning_rate": 1.7312525412344428e-06, + "loss": 0.8937, + "step": 7793 + }, + { + "epoch": 0.8156985871271586, + "grad_norm": 2.0088421049144327, + "learning_rate": 1.7293466847244788e-06, + "loss": 0.7165, + "step": 7794 + }, + { + "epoch": 0.815803244374673, + "grad_norm": 2.064343587455126, + "learning_rate": 1.7274417785293485e-06, + "loss": 0.7951, + "step": 7795 + }, + { + "epoch": 0.8159079016221873, + "grad_norm": 1.9979352317040366, + "learning_rate": 1.7255378228679286e-06, + "loss": 0.8228, + "step": 7796 + }, + { + "epoch": 0.8160125588697017, + "grad_norm": 2.257708751650851, + "learning_rate": 1.7236348179589869e-06, + "loss": 0.8863, + "step": 7797 + }, + { + "epoch": 0.8161172161172161, + "grad_norm": 2.518173481553307, + "learning_rate": 1.72173276402118e-06, + "loss": 0.8981, + "step": 7798 + }, + { + "epoch": 0.8162218733647305, + "grad_norm": 1.9462944845738346, + "learning_rate": 1.7198316612730603e-06, + "loss": 0.9347, + "step": 7799 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 2.595705625274348, + "learning_rate": 1.7179315099330707e-06, + "loss": 0.966, + "step": 7800 + }, + { + "epoch": 0.8164311878597593, + "grad_norm": 1.977122273036907, + "learning_rate": 1.7160323102195408e-06, + "loss": 0.8375, + "step": 7801 + }, + { + "epoch": 0.8165358451072737, + "grad_norm": 2.1978666809470386, + "learning_rate": 1.7141340623506908e-06, + "loss": 0.8184, + "step": 7802 + }, + { + "epoch": 0.816640502354788, + "grad_norm": 2.1815414177103523, + "learning_rate": 1.7122367665446327e-06, + "loss": 0.8825, + "step": 7803 + }, + { + "epoch": 0.8167451596023024, + "grad_norm": 1.547128969295642, + "learning_rate": 1.7103404230193733e-06, + "loss": 0.6754, + "step": 7804 + }, + { + "epoch": 0.8168498168498168, + "grad_norm": 1.6815037025349835, + "learning_rate": 1.7084450319928037e-06, + "loss": 0.6787, + "step": 7805 + }, + { + "epoch": 0.8169544740973312, + "grad_norm": 1.959387083414091, + "learning_rate": 1.7065505936827097e-06, + "loss": 0.972, + "step": 7806 + }, + { + "epoch": 0.8170591313448456, + "grad_norm": 2.3459581609655227, + "learning_rate": 1.704657108306762e-06, + "loss": 0.7988, + "step": 7807 + }, + { + "epoch": 0.81716378859236, + "grad_norm": 2.8102944684349676, + "learning_rate": 1.7027645760825284e-06, + "loss": 0.926, + "step": 7808 + }, + { + "epoch": 0.8172684458398745, + "grad_norm": 1.7141239024853219, + "learning_rate": 1.7008729972274685e-06, + "loss": 0.8889, + "step": 7809 + }, + { + "epoch": 0.8173731030873888, + "grad_norm": 1.9586989744589782, + "learning_rate": 1.698982371958925e-06, + "loss": 0.9195, + "step": 7810 + }, + { + "epoch": 0.8174777603349032, + "grad_norm": 2.13663131703756, + "learning_rate": 1.697092700494135e-06, + "loss": 1.009, + "step": 7811 + }, + { + "epoch": 0.8175824175824176, + "grad_norm": 1.9020925116780771, + "learning_rate": 1.6952039830502253e-06, + "loss": 0.8457, + "step": 7812 + }, + { + "epoch": 0.817687074829932, + "grad_norm": 1.9607266851857876, + "learning_rate": 1.693316219844211e-06, + "loss": 0.8744, + "step": 7813 + }, + { + "epoch": 0.8177917320774464, + "grad_norm": 2.1863791159608077, + "learning_rate": 1.6914294110930052e-06, + "loss": 0.8709, + "step": 7814 + }, + { + "epoch": 0.8178963893249608, + "grad_norm": 1.918146066711303, + "learning_rate": 1.6895435570134032e-06, + "loss": 0.8755, + "step": 7815 + }, + { + "epoch": 0.8180010465724752, + "grad_norm": 2.3464466253166925, + "learning_rate": 1.687658657822091e-06, + "loss": 0.8249, + "step": 7816 + }, + { + "epoch": 0.8181057038199895, + "grad_norm": 2.094038559858304, + "learning_rate": 1.6857747137356529e-06, + "loss": 0.8999, + "step": 7817 + }, + { + "epoch": 0.8182103610675039, + "grad_norm": 1.969446534885216, + "learning_rate": 1.683891724970551e-06, + "loss": 0.7884, + "step": 7818 + }, + { + "epoch": 0.8183150183150183, + "grad_norm": 1.9254682173238997, + "learning_rate": 1.6820096917431527e-06, + "loss": 0.8815, + "step": 7819 + }, + { + "epoch": 0.8184196755625327, + "grad_norm": 1.705412210976725, + "learning_rate": 1.6801286142697015e-06, + "loss": 0.7476, + "step": 7820 + }, + { + "epoch": 0.8185243328100471, + "grad_norm": 1.8741055231594557, + "learning_rate": 1.67824849276634e-06, + "loss": 0.8466, + "step": 7821 + }, + { + "epoch": 0.8186289900575615, + "grad_norm": 1.9978999965045052, + "learning_rate": 1.6763693274490932e-06, + "loss": 0.8856, + "step": 7822 + }, + { + "epoch": 0.8187336473050759, + "grad_norm": 2.1962770410951977, + "learning_rate": 1.6744911185338874e-06, + "loss": 0.8014, + "step": 7823 + }, + { + "epoch": 0.8188383045525902, + "grad_norm": 2.5133884425384214, + "learning_rate": 1.6726138662365276e-06, + "loss": 0.9689, + "step": 7824 + }, + { + "epoch": 0.8189429618001046, + "grad_norm": 2.0611491066129197, + "learning_rate": 1.6707375707727169e-06, + "loss": 0.7664, + "step": 7825 + }, + { + "epoch": 0.819047619047619, + "grad_norm": 1.9924960787409718, + "learning_rate": 1.6688622323580461e-06, + "loss": 0.7497, + "step": 7826 + }, + { + "epoch": 0.8191522762951334, + "grad_norm": 2.112698383860386, + "learning_rate": 1.6669878512079906e-06, + "loss": 0.8948, + "step": 7827 + }, + { + "epoch": 0.8192569335426478, + "grad_norm": 2.4115852205279804, + "learning_rate": 1.6651144275379261e-06, + "loss": 0.9912, + "step": 7828 + }, + { + "epoch": 0.8193615907901622, + "grad_norm": 2.141183062004032, + "learning_rate": 1.6632419615631112e-06, + "loss": 0.8449, + "step": 7829 + }, + { + "epoch": 0.8194662480376766, + "grad_norm": 2.3622806603126696, + "learning_rate": 1.661370453498694e-06, + "loss": 0.9455, + "step": 7830 + }, + { + "epoch": 0.819570905285191, + "grad_norm": 2.161674209789048, + "learning_rate": 1.659499903559717e-06, + "loss": 0.8994, + "step": 7831 + }, + { + "epoch": 0.8196755625327053, + "grad_norm": 2.2133213841412993, + "learning_rate": 1.6576303119611058e-06, + "loss": 0.9306, + "step": 7832 + }, + { + "epoch": 0.8197802197802198, + "grad_norm": 2.2348885981045226, + "learning_rate": 1.6557616789176844e-06, + "loss": 0.7268, + "step": 7833 + }, + { + "epoch": 0.8198848770277342, + "grad_norm": 1.9426173933907054, + "learning_rate": 1.653894004644163e-06, + "loss": 0.8147, + "step": 7834 + }, + { + "epoch": 0.8199895342752486, + "grad_norm": 2.4120961582899416, + "learning_rate": 1.6520272893551404e-06, + "loss": 0.8665, + "step": 7835 + }, + { + "epoch": 0.820094191522763, + "grad_norm": 1.7715546995709663, + "learning_rate": 1.6501615332651054e-06, + "loss": 0.8394, + "step": 7836 + }, + { + "epoch": 0.8201988487702774, + "grad_norm": 2.1557330830514867, + "learning_rate": 1.648296736588435e-06, + "loss": 0.82, + "step": 7837 + }, + { + "epoch": 0.8203035060177918, + "grad_norm": 1.9987706186923369, + "learning_rate": 1.6464328995394018e-06, + "loss": 0.8204, + "step": 7838 + }, + { + "epoch": 0.8204081632653061, + "grad_norm": 2.057663150677782, + "learning_rate": 1.6445700223321637e-06, + "loss": 0.8971, + "step": 7839 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 1.7641921997749053, + "learning_rate": 1.642708105180768e-06, + "loss": 0.8375, + "step": 7840 + }, + { + "epoch": 0.8206174777603349, + "grad_norm": 2.0728485301068957, + "learning_rate": 1.6408471482991506e-06, + "loss": 0.8569, + "step": 7841 + }, + { + "epoch": 0.8207221350078493, + "grad_norm": 2.036559853741599, + "learning_rate": 1.6389871519011414e-06, + "loss": 0.8177, + "step": 7842 + }, + { + "epoch": 0.8208267922553637, + "grad_norm": 2.1297430764924368, + "learning_rate": 1.6371281162004604e-06, + "loss": 0.8486, + "step": 7843 + }, + { + "epoch": 0.8209314495028781, + "grad_norm": 1.920858753141409, + "learning_rate": 1.6352700414107113e-06, + "loss": 0.9678, + "step": 7844 + }, + { + "epoch": 0.8210361067503925, + "grad_norm": 1.977146348689036, + "learning_rate": 1.6334129277453935e-06, + "loss": 0.8985, + "step": 7845 + }, + { + "epoch": 0.8211407639979068, + "grad_norm": 2.220183521723425, + "learning_rate": 1.6315567754178896e-06, + "loss": 0.7621, + "step": 7846 + }, + { + "epoch": 0.8212454212454212, + "grad_norm": 2.1123357019169626, + "learning_rate": 1.6297015846414755e-06, + "loss": 0.7536, + "step": 7847 + }, + { + "epoch": 0.8213500784929356, + "grad_norm": 2.353601092727366, + "learning_rate": 1.6278473556293195e-06, + "loss": 0.8775, + "step": 7848 + }, + { + "epoch": 0.82145473574045, + "grad_norm": 2.14734632695945, + "learning_rate": 1.6259940885944737e-06, + "loss": 0.8968, + "step": 7849 + }, + { + "epoch": 0.8215593929879644, + "grad_norm": 2.0771918291151557, + "learning_rate": 1.6241417837498818e-06, + "loss": 0.9364, + "step": 7850 + }, + { + "epoch": 0.8216640502354788, + "grad_norm": 2.0500743901250504, + "learning_rate": 1.6222904413083806e-06, + "loss": 0.8666, + "step": 7851 + }, + { + "epoch": 0.8217687074829932, + "grad_norm": 2.0916815220393183, + "learning_rate": 1.6204400614826886e-06, + "loss": 0.7629, + "step": 7852 + }, + { + "epoch": 0.8218733647305075, + "grad_norm": 2.0764823972980597, + "learning_rate": 1.6185906444854227e-06, + "loss": 1.0247, + "step": 7853 + }, + { + "epoch": 0.8219780219780219, + "grad_norm": 1.832597003879775, + "learning_rate": 1.6167421905290837e-06, + "loss": 0.9094, + "step": 7854 + }, + { + "epoch": 0.8220826792255364, + "grad_norm": 2.0982201560289813, + "learning_rate": 1.6148946998260605e-06, + "loss": 1.0225, + "step": 7855 + }, + { + "epoch": 0.8221873364730508, + "grad_norm": 1.8679791707883626, + "learning_rate": 1.6130481725886327e-06, + "loss": 0.854, + "step": 7856 + }, + { + "epoch": 0.8222919937205652, + "grad_norm": 2.4377106859096997, + "learning_rate": 1.6112026090289744e-06, + "loss": 0.9012, + "step": 7857 + }, + { + "epoch": 0.8223966509680796, + "grad_norm": 1.9716656960681878, + "learning_rate": 1.609358009359142e-06, + "loss": 0.8272, + "step": 7858 + }, + { + "epoch": 0.822501308215594, + "grad_norm": 2.039719279072701, + "learning_rate": 1.6075143737910826e-06, + "loss": 0.7955, + "step": 7859 + }, + { + "epoch": 0.8226059654631083, + "grad_norm": 2.1817572238574945, + "learning_rate": 1.6056717025366364e-06, + "loss": 0.8247, + "step": 7860 + }, + { + "epoch": 0.8227106227106227, + "grad_norm": 1.9613810519827801, + "learning_rate": 1.6038299958075266e-06, + "loss": 0.922, + "step": 7861 + }, + { + "epoch": 0.8228152799581371, + "grad_norm": 1.964948398942913, + "learning_rate": 1.6019892538153725e-06, + "loss": 0.7781, + "step": 7862 + }, + { + "epoch": 0.8229199372056515, + "grad_norm": 2.13598752292871, + "learning_rate": 1.6001494767716786e-06, + "loss": 0.7924, + "step": 7863 + }, + { + "epoch": 0.8230245944531659, + "grad_norm": 2.204916092833003, + "learning_rate": 1.598310664887839e-06, + "loss": 1.0455, + "step": 7864 + }, + { + "epoch": 0.8231292517006803, + "grad_norm": 2.2186437702248085, + "learning_rate": 1.5964728183751343e-06, + "loss": 0.8428, + "step": 7865 + }, + { + "epoch": 0.8232339089481947, + "grad_norm": 2.478462994063437, + "learning_rate": 1.5946359374447374e-06, + "loss": 0.8116, + "step": 7866 + }, + { + "epoch": 0.823338566195709, + "grad_norm": 1.8826304139905394, + "learning_rate": 1.5928000223077123e-06, + "loss": 0.9264, + "step": 7867 + }, + { + "epoch": 0.8234432234432234, + "grad_norm": 2.166154757400412, + "learning_rate": 1.5909650731750048e-06, + "loss": 0.8777, + "step": 7868 + }, + { + "epoch": 0.8235478806907378, + "grad_norm": 2.6409362779319325, + "learning_rate": 1.5891310902574608e-06, + "loss": 0.9525, + "step": 7869 + }, + { + "epoch": 0.8236525379382522, + "grad_norm": 2.0823720652767244, + "learning_rate": 1.587298073765805e-06, + "loss": 0.9399, + "step": 7870 + }, + { + "epoch": 0.8237571951857666, + "grad_norm": 2.147673418143053, + "learning_rate": 1.5854660239106523e-06, + "loss": 0.9762, + "step": 7871 + }, + { + "epoch": 0.823861852433281, + "grad_norm": 2.000527006452208, + "learning_rate": 1.5836349409025143e-06, + "loss": 0.7641, + "step": 7872 + }, + { + "epoch": 0.8239665096807954, + "grad_norm": 2.4818231228477012, + "learning_rate": 1.5818048249517825e-06, + "loss": 1.0183, + "step": 7873 + }, + { + "epoch": 0.8240711669283098, + "grad_norm": 2.0501868554721265, + "learning_rate": 1.5799756762687424e-06, + "loss": 0.8242, + "step": 7874 + }, + { + "epoch": 0.8241758241758241, + "grad_norm": 2.3433303375111545, + "learning_rate": 1.5781474950635633e-06, + "loss": 0.8638, + "step": 7875 + }, + { + "epoch": 0.8242804814233385, + "grad_norm": 2.0395663531419577, + "learning_rate": 1.5763202815463109e-06, + "loss": 0.8088, + "step": 7876 + }, + { + "epoch": 0.824385138670853, + "grad_norm": 2.037603786377761, + "learning_rate": 1.5744940359269357e-06, + "loss": 0.9229, + "step": 7877 + }, + { + "epoch": 0.8244897959183674, + "grad_norm": 1.8506973012344865, + "learning_rate": 1.5726687584152778e-06, + "loss": 0.8673, + "step": 7878 + }, + { + "epoch": 0.8245944531658818, + "grad_norm": 2.288058667834201, + "learning_rate": 1.5708444492210617e-06, + "loss": 0.7638, + "step": 7879 + }, + { + "epoch": 0.8246991104133962, + "grad_norm": 2.1522314144552337, + "learning_rate": 1.5690211085539054e-06, + "loss": 0.8846, + "step": 7880 + }, + { + "epoch": 0.8248037676609106, + "grad_norm": 1.6233838085258567, + "learning_rate": 1.5671987366233166e-06, + "loss": 0.7405, + "step": 7881 + }, + { + "epoch": 0.8249084249084249, + "grad_norm": 2.0740723600378566, + "learning_rate": 1.565377333638688e-06, + "loss": 0.9315, + "step": 7882 + }, + { + "epoch": 0.8250130821559393, + "grad_norm": 2.2960235649753047, + "learning_rate": 1.563556899809302e-06, + "loss": 0.9879, + "step": 7883 + }, + { + "epoch": 0.8251177394034537, + "grad_norm": 1.7103999518415944, + "learning_rate": 1.5617374353443316e-06, + "loss": 0.7416, + "step": 7884 + }, + { + "epoch": 0.8252223966509681, + "grad_norm": 2.5050104670986597, + "learning_rate": 1.5599189404528337e-06, + "loss": 0.8611, + "step": 7885 + }, + { + "epoch": 0.8253270538984825, + "grad_norm": 2.612738780164187, + "learning_rate": 1.5581014153437578e-06, + "loss": 0.9615, + "step": 7886 + }, + { + "epoch": 0.8254317111459969, + "grad_norm": 1.8339034152233065, + "learning_rate": 1.5562848602259473e-06, + "loss": 0.9589, + "step": 7887 + }, + { + "epoch": 0.8255363683935113, + "grad_norm": 1.9861260916511432, + "learning_rate": 1.5544692753081226e-06, + "loss": 0.8367, + "step": 7888 + }, + { + "epoch": 0.8256410256410256, + "grad_norm": 2.019172682002892, + "learning_rate": 1.552654660798899e-06, + "loss": 0.8907, + "step": 7889 + }, + { + "epoch": 0.82574568288854, + "grad_norm": 2.266069990170753, + "learning_rate": 1.5508410169067767e-06, + "loss": 0.7255, + "step": 7890 + }, + { + "epoch": 0.8258503401360544, + "grad_norm": 1.7448652237490383, + "learning_rate": 1.5490283438401533e-06, + "loss": 0.8038, + "step": 7891 + }, + { + "epoch": 0.8259549973835688, + "grad_norm": 2.236043533397473, + "learning_rate": 1.5472166418073042e-06, + "loss": 0.8716, + "step": 7892 + }, + { + "epoch": 0.8260596546310832, + "grad_norm": 1.7295344167594429, + "learning_rate": 1.5454059110163977e-06, + "loss": 0.7992, + "step": 7893 + }, + { + "epoch": 0.8261643118785976, + "grad_norm": 2.114455364406809, + "learning_rate": 1.5435961516754904e-06, + "loss": 0.8744, + "step": 7894 + }, + { + "epoch": 0.826268969126112, + "grad_norm": 1.7900260474592373, + "learning_rate": 1.5417873639925274e-06, + "loss": 0.8994, + "step": 7895 + }, + { + "epoch": 0.8263736263736263, + "grad_norm": 2.0911021977650135, + "learning_rate": 1.5399795481753454e-06, + "loss": 0.8086, + "step": 7896 + }, + { + "epoch": 0.8264782836211407, + "grad_norm": 2.1517967909735547, + "learning_rate": 1.5381727044316641e-06, + "loss": 0.9165, + "step": 7897 + }, + { + "epoch": 0.8265829408686551, + "grad_norm": 1.864956322145815, + "learning_rate": 1.536366832969093e-06, + "loss": 0.9554, + "step": 7898 + }, + { + "epoch": 0.8266875981161695, + "grad_norm": 2.0999023389103915, + "learning_rate": 1.5345619339951267e-06, + "loss": 0.884, + "step": 7899 + }, + { + "epoch": 0.826792255363684, + "grad_norm": 2.3031128045450577, + "learning_rate": 1.5327580077171589e-06, + "loss": 0.7399, + "step": 7900 + }, + { + "epoch": 0.8268969126111984, + "grad_norm": 1.8424502804616814, + "learning_rate": 1.5309550543424612e-06, + "loss": 0.7417, + "step": 7901 + }, + { + "epoch": 0.8270015698587128, + "grad_norm": 1.9163642835084853, + "learning_rate": 1.5291530740781923e-06, + "loss": 0.751, + "step": 7902 + }, + { + "epoch": 0.827106227106227, + "grad_norm": 1.87018617560281, + "learning_rate": 1.5273520671314113e-06, + "loss": 0.8395, + "step": 7903 + }, + { + "epoch": 0.8272108843537415, + "grad_norm": 1.9411831340122883, + "learning_rate": 1.5255520337090534e-06, + "loss": 0.8028, + "step": 7904 + }, + { + "epoch": 0.8273155416012559, + "grad_norm": 2.4058082707488113, + "learning_rate": 1.5237529740179436e-06, + "loss": 0.9005, + "step": 7905 + }, + { + "epoch": 0.8274201988487703, + "grad_norm": 2.1409745262541287, + "learning_rate": 1.5219548882648027e-06, + "loss": 0.9353, + "step": 7906 + }, + { + "epoch": 0.8275248560962847, + "grad_norm": 1.9577882525040946, + "learning_rate": 1.5201577766562314e-06, + "loss": 0.8448, + "step": 7907 + }, + { + "epoch": 0.8276295133437991, + "grad_norm": 2.0318542534635275, + "learning_rate": 1.5183616393987232e-06, + "loss": 0.8689, + "step": 7908 + }, + { + "epoch": 0.8277341705913135, + "grad_norm": 2.301362720034534, + "learning_rate": 1.5165664766986533e-06, + "loss": 0.9667, + "step": 7909 + }, + { + "epoch": 0.8278388278388278, + "grad_norm": 1.886895169603913, + "learning_rate": 1.5147722887622961e-06, + "loss": 0.8658, + "step": 7910 + }, + { + "epoch": 0.8279434850863422, + "grad_norm": 2.46318780324625, + "learning_rate": 1.5129790757958008e-06, + "loss": 0.8375, + "step": 7911 + }, + { + "epoch": 0.8280481423338566, + "grad_norm": 1.8771844183637372, + "learning_rate": 1.5111868380052185e-06, + "loss": 0.7701, + "step": 7912 + }, + { + "epoch": 0.828152799581371, + "grad_norm": 2.070606556582261, + "learning_rate": 1.5093955755964762e-06, + "loss": 1.0035, + "step": 7913 + }, + { + "epoch": 0.8282574568288854, + "grad_norm": 2.201869626898473, + "learning_rate": 1.5076052887753923e-06, + "loss": 0.904, + "step": 7914 + }, + { + "epoch": 0.8283621140763998, + "grad_norm": 1.996397707124482, + "learning_rate": 1.5058159777476788e-06, + "loss": 0.7816, + "step": 7915 + }, + { + "epoch": 0.8284667713239142, + "grad_norm": 2.0077562325381133, + "learning_rate": 1.5040276427189294e-06, + "loss": 0.9121, + "step": 7916 + }, + { + "epoch": 0.8285714285714286, + "grad_norm": 1.9941521295888824, + "learning_rate": 1.502240283894626e-06, + "loss": 0.8295, + "step": 7917 + }, + { + "epoch": 0.8286760858189429, + "grad_norm": 1.9915709085478066, + "learning_rate": 1.500453901480139e-06, + "loss": 0.8647, + "step": 7918 + }, + { + "epoch": 0.8287807430664573, + "grad_norm": 2.0000071542196145, + "learning_rate": 1.498668495680732e-06, + "loss": 0.9201, + "step": 7919 + }, + { + "epoch": 0.8288854003139717, + "grad_norm": 1.9756711232215005, + "learning_rate": 1.4968840667015462e-06, + "loss": 0.8459, + "step": 7920 + }, + { + "epoch": 0.8289900575614861, + "grad_norm": 1.8528329572690898, + "learning_rate": 1.4951006147476211e-06, + "loss": 0.7497, + "step": 7921 + }, + { + "epoch": 0.8290947148090005, + "grad_norm": 1.782034124098557, + "learning_rate": 1.493318140023876e-06, + "loss": 0.8792, + "step": 7922 + }, + { + "epoch": 0.829199372056515, + "grad_norm": 2.106613209839586, + "learning_rate": 1.4915366427351219e-06, + "loss": 0.7935, + "step": 7923 + }, + { + "epoch": 0.8293040293040294, + "grad_norm": 2.1408753179094857, + "learning_rate": 1.4897561230860536e-06, + "loss": 1.0341, + "step": 7924 + }, + { + "epoch": 0.8294086865515437, + "grad_norm": 2.1278800514184324, + "learning_rate": 1.4879765812812607e-06, + "loss": 0.8901, + "step": 7925 + }, + { + "epoch": 0.8295133437990581, + "grad_norm": 2.1440190302189714, + "learning_rate": 1.4861980175252154e-06, + "loss": 0.898, + "step": 7926 + }, + { + "epoch": 0.8296180010465725, + "grad_norm": 2.0503918898516384, + "learning_rate": 1.4844204320222767e-06, + "loss": 0.8825, + "step": 7927 + }, + { + "epoch": 0.8297226582940869, + "grad_norm": 2.353937197043025, + "learning_rate": 1.4826438249766906e-06, + "loss": 0.8895, + "step": 7928 + }, + { + "epoch": 0.8298273155416013, + "grad_norm": 2.2495600621190586, + "learning_rate": 1.4808681965925954e-06, + "loss": 0.9815, + "step": 7929 + }, + { + "epoch": 0.8299319727891157, + "grad_norm": 2.1503657595480843, + "learning_rate": 1.4790935470740175e-06, + "loss": 0.9053, + "step": 7930 + }, + { + "epoch": 0.8300366300366301, + "grad_norm": 1.8861867201869633, + "learning_rate": 1.4773198766248642e-06, + "loss": 0.8636, + "step": 7931 + }, + { + "epoch": 0.8301412872841444, + "grad_norm": 2.1460191083705906, + "learning_rate": 1.4755471854489345e-06, + "loss": 1.0363, + "step": 7932 + }, + { + "epoch": 0.8302459445316588, + "grad_norm": 2.493276713397057, + "learning_rate": 1.4737754737499111e-06, + "loss": 0.8972, + "step": 7933 + }, + { + "epoch": 0.8303506017791732, + "grad_norm": 1.6858625797981937, + "learning_rate": 1.4720047417313731e-06, + "loss": 0.7886, + "step": 7934 + }, + { + "epoch": 0.8304552590266876, + "grad_norm": 1.8962906371419326, + "learning_rate": 1.4702349895967782e-06, + "loss": 0.9196, + "step": 7935 + }, + { + "epoch": 0.830559916274202, + "grad_norm": 1.9184229473202252, + "learning_rate": 1.4684662175494747e-06, + "loss": 0.8052, + "step": 7936 + }, + { + "epoch": 0.8306645735217164, + "grad_norm": 2.137568048466092, + "learning_rate": 1.4666984257926964e-06, + "loss": 0.7906, + "step": 7937 + }, + { + "epoch": 0.8307692307692308, + "grad_norm": 1.5889031763544768, + "learning_rate": 1.4649316145295668e-06, + "loss": 0.7409, + "step": 7938 + }, + { + "epoch": 0.8308738880167451, + "grad_norm": 1.8404031630051605, + "learning_rate": 1.4631657839630996e-06, + "loss": 0.7866, + "step": 7939 + }, + { + "epoch": 0.8309785452642595, + "grad_norm": 2.286617894950235, + "learning_rate": 1.46140093429619e-06, + "loss": 0.8095, + "step": 7940 + }, + { + "epoch": 0.8310832025117739, + "grad_norm": 2.05496477949122, + "learning_rate": 1.4596370657316229e-06, + "loss": 0.9575, + "step": 7941 + }, + { + "epoch": 0.8311878597592883, + "grad_norm": 2.427977961236888, + "learning_rate": 1.4578741784720697e-06, + "loss": 0.876, + "step": 7942 + }, + { + "epoch": 0.8312925170068027, + "grad_norm": 2.227236132950701, + "learning_rate": 1.4561122727200872e-06, + "loss": 0.7827, + "step": 7943 + }, + { + "epoch": 0.8313971742543171, + "grad_norm": 1.8480415224847573, + "learning_rate": 1.454351348678128e-06, + "loss": 0.9043, + "step": 7944 + }, + { + "epoch": 0.8315018315018315, + "grad_norm": 2.152502773769006, + "learning_rate": 1.4525914065485225e-06, + "loss": 0.824, + "step": 7945 + }, + { + "epoch": 0.8316064887493458, + "grad_norm": 1.9448080461974455, + "learning_rate": 1.4508324465334888e-06, + "loss": 0.8931, + "step": 7946 + }, + { + "epoch": 0.8317111459968602, + "grad_norm": 2.0868231871819733, + "learning_rate": 1.449074468835141e-06, + "loss": 0.8808, + "step": 7947 + }, + { + "epoch": 0.8318158032443747, + "grad_norm": 2.0813903646254786, + "learning_rate": 1.4473174736554685e-06, + "loss": 0.7902, + "step": 7948 + }, + { + "epoch": 0.8319204604918891, + "grad_norm": 2.123099741849126, + "learning_rate": 1.4455614611963599e-06, + "loss": 0.8902, + "step": 7949 + }, + { + "epoch": 0.8320251177394035, + "grad_norm": 1.9932955603081572, + "learning_rate": 1.44380643165958e-06, + "loss": 0.7715, + "step": 7950 + }, + { + "epoch": 0.8321297749869179, + "grad_norm": 2.068162191946537, + "learning_rate": 1.4420523852467861e-06, + "loss": 0.8567, + "step": 7951 + }, + { + "epoch": 0.8322344322344323, + "grad_norm": 1.9814333811554212, + "learning_rate": 1.4402993221595197e-06, + "loss": 0.8771, + "step": 7952 + }, + { + "epoch": 0.8323390894819466, + "grad_norm": 2.2197393374158936, + "learning_rate": 1.4385472425992164e-06, + "loss": 0.8788, + "step": 7953 + }, + { + "epoch": 0.832443746729461, + "grad_norm": 2.375333340064998, + "learning_rate": 1.4367961467671875e-06, + "loss": 0.9018, + "step": 7954 + }, + { + "epoch": 0.8325484039769754, + "grad_norm": 1.9216141945810121, + "learning_rate": 1.4350460348646444e-06, + "loss": 0.8605, + "step": 7955 + }, + { + "epoch": 0.8326530612244898, + "grad_norm": 2.398938782927487, + "learning_rate": 1.4332969070926739e-06, + "loss": 0.8632, + "step": 7956 + }, + { + "epoch": 0.8327577184720042, + "grad_norm": 2.0555051893772056, + "learning_rate": 1.431548763652253e-06, + "loss": 0.9676, + "step": 7957 + }, + { + "epoch": 0.8328623757195186, + "grad_norm": 2.1581306202378276, + "learning_rate": 1.429801604744252e-06, + "loss": 0.9032, + "step": 7958 + }, + { + "epoch": 0.832967032967033, + "grad_norm": 1.897752103573272, + "learning_rate": 1.4280554305694205e-06, + "loss": 0.8139, + "step": 7959 + }, + { + "epoch": 0.8330716902145474, + "grad_norm": 1.9164586231571057, + "learning_rate": 1.426310241328397e-06, + "loss": 0.794, + "step": 7960 + }, + { + "epoch": 0.8331763474620617, + "grad_norm": 2.1626145846140323, + "learning_rate": 1.4245660372217062e-06, + "loss": 0.9293, + "step": 7961 + }, + { + "epoch": 0.8332810047095761, + "grad_norm": 2.1783332161398836, + "learning_rate": 1.4228228184497607e-06, + "loss": 0.8771, + "step": 7962 + }, + { + "epoch": 0.8333856619570905, + "grad_norm": 2.0795885143306276, + "learning_rate": 1.4210805852128617e-06, + "loss": 0.8529, + "step": 7963 + }, + { + "epoch": 0.8334903192046049, + "grad_norm": 2.5330157272730602, + "learning_rate": 1.4193393377111964e-06, + "loss": 0.8425, + "step": 7964 + }, + { + "epoch": 0.8335949764521193, + "grad_norm": 1.8622801123769794, + "learning_rate": 1.4175990761448355e-06, + "loss": 0.8042, + "step": 7965 + }, + { + "epoch": 0.8336996336996337, + "grad_norm": 1.8796275243042533, + "learning_rate": 1.4158598007137391e-06, + "loss": 0.7952, + "step": 7966 + }, + { + "epoch": 0.8338042909471481, + "grad_norm": 2.0693042166079056, + "learning_rate": 1.414121511617752e-06, + "loss": 0.8121, + "step": 7967 + }, + { + "epoch": 0.8339089481946624, + "grad_norm": 2.1022638373991014, + "learning_rate": 1.4123842090566108e-06, + "loss": 0.915, + "step": 7968 + }, + { + "epoch": 0.8340136054421768, + "grad_norm": 1.993990099910452, + "learning_rate": 1.4106478932299316e-06, + "loss": 0.8766, + "step": 7969 + }, + { + "epoch": 0.8341182626896912, + "grad_norm": 1.7756065930189895, + "learning_rate": 1.4089125643372237e-06, + "loss": 0.777, + "step": 7970 + }, + { + "epoch": 0.8342229199372057, + "grad_norm": 2.5496490825925906, + "learning_rate": 1.4071782225778751e-06, + "loss": 0.8664, + "step": 7971 + }, + { + "epoch": 0.8343275771847201, + "grad_norm": 2.2433472242170396, + "learning_rate": 1.405444868151168e-06, + "loss": 0.9331, + "step": 7972 + }, + { + "epoch": 0.8344322344322345, + "grad_norm": 1.7433595951078031, + "learning_rate": 1.4037125012562702e-06, + "loss": 0.7727, + "step": 7973 + }, + { + "epoch": 0.8345368916797489, + "grad_norm": 1.8722606546409677, + "learning_rate": 1.401981122092233e-06, + "loss": 0.9949, + "step": 7974 + }, + { + "epoch": 0.8346415489272632, + "grad_norm": 2.019791859336298, + "learning_rate": 1.400250730857996e-06, + "loss": 0.8875, + "step": 7975 + }, + { + "epoch": 0.8347462061747776, + "grad_norm": 1.984713578640695, + "learning_rate": 1.3985213277523802e-06, + "loss": 0.8334, + "step": 7976 + }, + { + "epoch": 0.834850863422292, + "grad_norm": 1.8860559224768174, + "learning_rate": 1.3967929129741032e-06, + "loss": 0.8732, + "step": 7977 + }, + { + "epoch": 0.8349555206698064, + "grad_norm": 2.4921343619909906, + "learning_rate": 1.3950654867217605e-06, + "loss": 0.8214, + "step": 7978 + }, + { + "epoch": 0.8350601779173208, + "grad_norm": 2.6451975469839133, + "learning_rate": 1.3933390491938381e-06, + "loss": 1.0114, + "step": 7979 + }, + { + "epoch": 0.8351648351648352, + "grad_norm": 2.345016840340204, + "learning_rate": 1.3916136005887048e-06, + "loss": 0.8941, + "step": 7980 + }, + { + "epoch": 0.8352694924123496, + "grad_norm": 2.414100792833315, + "learning_rate": 1.3898891411046212e-06, + "loss": 0.8546, + "step": 7981 + }, + { + "epoch": 0.8353741496598639, + "grad_norm": 1.9845989400524062, + "learning_rate": 1.3881656709397272e-06, + "loss": 0.8789, + "step": 7982 + }, + { + "epoch": 0.8354788069073783, + "grad_norm": 1.7934306576299728, + "learning_rate": 1.3864431902920584e-06, + "loss": 0.8559, + "step": 7983 + }, + { + "epoch": 0.8355834641548927, + "grad_norm": 1.9810087701516985, + "learning_rate": 1.3847216993595292e-06, + "loss": 0.7373, + "step": 7984 + }, + { + "epoch": 0.8356881214024071, + "grad_norm": 1.84000027892072, + "learning_rate": 1.3830011983399404e-06, + "loss": 0.8423, + "step": 7985 + }, + { + "epoch": 0.8357927786499215, + "grad_norm": 1.983947105295788, + "learning_rate": 1.381281687430981e-06, + "loss": 0.9642, + "step": 7986 + }, + { + "epoch": 0.8358974358974359, + "grad_norm": 2.240241092713476, + "learning_rate": 1.37956316683023e-06, + "loss": 0.8095, + "step": 7987 + }, + { + "epoch": 0.8360020931449503, + "grad_norm": 2.328360525402851, + "learning_rate": 1.3778456367351455e-06, + "loss": 0.9569, + "step": 7988 + }, + { + "epoch": 0.8361067503924646, + "grad_norm": 2.290792673993712, + "learning_rate": 1.3761290973430753e-06, + "loss": 0.8242, + "step": 7989 + }, + { + "epoch": 0.836211407639979, + "grad_norm": 1.9941656847351477, + "learning_rate": 1.3744135488512556e-06, + "loss": 0.9385, + "step": 7990 + }, + { + "epoch": 0.8363160648874934, + "grad_norm": 1.9735468832454361, + "learning_rate": 1.3726989914568034e-06, + "loss": 0.8731, + "step": 7991 + }, + { + "epoch": 0.8364207221350078, + "grad_norm": 1.9837606343238872, + "learning_rate": 1.3709854253567278e-06, + "loss": 0.8264, + "step": 7992 + }, + { + "epoch": 0.8365253793825222, + "grad_norm": 1.9088882775380245, + "learning_rate": 1.3692728507479214e-06, + "loss": 0.8842, + "step": 7993 + }, + { + "epoch": 0.8366300366300367, + "grad_norm": 2.119436328203735, + "learning_rate": 1.3675612678271588e-06, + "loss": 0.906, + "step": 7994 + }, + { + "epoch": 0.8367346938775511, + "grad_norm": 2.407719091887164, + "learning_rate": 1.365850676791105e-06, + "loss": 0.8475, + "step": 7995 + }, + { + "epoch": 0.8368393511250654, + "grad_norm": 2.103481136150311, + "learning_rate": 1.364141077836314e-06, + "loss": 0.8192, + "step": 7996 + }, + { + "epoch": 0.8369440083725798, + "grad_norm": 2.1854323710686114, + "learning_rate": 1.3624324711592196e-06, + "loss": 0.8484, + "step": 7997 + }, + { + "epoch": 0.8370486656200942, + "grad_norm": 2.1386964506824895, + "learning_rate": 1.3607248569561426e-06, + "loss": 0.8292, + "step": 7998 + }, + { + "epoch": 0.8371533228676086, + "grad_norm": 2.115583413702666, + "learning_rate": 1.359018235423295e-06, + "loss": 0.8606, + "step": 7999 + }, + { + "epoch": 0.837257980115123, + "grad_norm": 2.238976220893437, + "learning_rate": 1.3573126067567688e-06, + "loss": 0.8858, + "step": 8000 + }, + { + "epoch": 0.8373626373626374, + "grad_norm": 2.049600921572707, + "learning_rate": 1.3556079711525439e-06, + "loss": 0.8915, + "step": 8001 + }, + { + "epoch": 0.8374672946101518, + "grad_norm": 1.8171596729987456, + "learning_rate": 1.353904328806488e-06, + "loss": 0.8115, + "step": 8002 + }, + { + "epoch": 0.8375719518576662, + "grad_norm": 2.399384125655146, + "learning_rate": 1.3522016799143522e-06, + "loss": 0.9062, + "step": 8003 + }, + { + "epoch": 0.8376766091051805, + "grad_norm": 1.7533273781629497, + "learning_rate": 1.3505000246717747e-06, + "loss": 0.8903, + "step": 8004 + }, + { + "epoch": 0.8377812663526949, + "grad_norm": 1.9579932469356014, + "learning_rate": 1.348799363274277e-06, + "loss": 0.7385, + "step": 8005 + }, + { + "epoch": 0.8378859236002093, + "grad_norm": 1.981901250159665, + "learning_rate": 1.34709969591727e-06, + "loss": 0.8783, + "step": 8006 + }, + { + "epoch": 0.8379905808477237, + "grad_norm": 2.167993399995397, + "learning_rate": 1.3454010227960513e-06, + "loss": 0.9084, + "step": 8007 + }, + { + "epoch": 0.8380952380952381, + "grad_norm": 1.9178702073749012, + "learning_rate": 1.3437033441057989e-06, + "loss": 0.8963, + "step": 8008 + }, + { + "epoch": 0.8381998953427525, + "grad_norm": 1.951517186745851, + "learning_rate": 1.3420066600415815e-06, + "loss": 0.8355, + "step": 8009 + }, + { + "epoch": 0.8383045525902669, + "grad_norm": 1.909486949227223, + "learning_rate": 1.3403109707983475e-06, + "loss": 0.8665, + "step": 8010 + }, + { + "epoch": 0.8384092098377812, + "grad_norm": 1.9369839312160841, + "learning_rate": 1.338616276570941e-06, + "loss": 0.8397, + "step": 8011 + }, + { + "epoch": 0.8385138670852956, + "grad_norm": 1.8773973941438988, + "learning_rate": 1.336922577554083e-06, + "loss": 0.8907, + "step": 8012 + }, + { + "epoch": 0.83861852433281, + "grad_norm": 2.116860394094237, + "learning_rate": 1.335229873942382e-06, + "loss": 0.8874, + "step": 8013 + }, + { + "epoch": 0.8387231815803244, + "grad_norm": 2.170506935277406, + "learning_rate": 1.3335381659303315e-06, + "loss": 0.9886, + "step": 8014 + }, + { + "epoch": 0.8388278388278388, + "grad_norm": 1.8806562248296783, + "learning_rate": 1.3318474537123138e-06, + "loss": 0.7677, + "step": 8015 + }, + { + "epoch": 0.8389324960753533, + "grad_norm": 2.407172290628666, + "learning_rate": 1.3301577374825992e-06, + "loss": 0.7599, + "step": 8016 + }, + { + "epoch": 0.8390371533228677, + "grad_norm": 3.184412100368414, + "learning_rate": 1.3284690174353364e-06, + "loss": 0.8256, + "step": 8017 + }, + { + "epoch": 0.839141810570382, + "grad_norm": 1.7731206492806486, + "learning_rate": 1.326781293764562e-06, + "loss": 0.7793, + "step": 8018 + }, + { + "epoch": 0.8392464678178964, + "grad_norm": 1.9962901080318212, + "learning_rate": 1.325094566664199e-06, + "loss": 0.8381, + "step": 8019 + }, + { + "epoch": 0.8393511250654108, + "grad_norm": 2.0858347365200736, + "learning_rate": 1.323408836328054e-06, + "loss": 0.8617, + "step": 8020 + }, + { + "epoch": 0.8394557823129252, + "grad_norm": 1.941127370534017, + "learning_rate": 1.3217241029498263e-06, + "loss": 0.8268, + "step": 8021 + }, + { + "epoch": 0.8395604395604396, + "grad_norm": 1.9438795971577278, + "learning_rate": 1.320040366723091e-06, + "loss": 0.7854, + "step": 8022 + }, + { + "epoch": 0.839665096807954, + "grad_norm": 2.0873160477402175, + "learning_rate": 1.3183576278413134e-06, + "loss": 0.8417, + "step": 8023 + }, + { + "epoch": 0.8397697540554684, + "grad_norm": 1.7491945690304727, + "learning_rate": 1.3166758864978424e-06, + "loss": 0.8151, + "step": 8024 + }, + { + "epoch": 0.8398744113029827, + "grad_norm": 1.9769300150442357, + "learning_rate": 1.3149951428859142e-06, + "loss": 0.8788, + "step": 8025 + }, + { + "epoch": 0.8399790685504971, + "grad_norm": 2.140498082088263, + "learning_rate": 1.3133153971986534e-06, + "loss": 0.7433, + "step": 8026 + }, + { + "epoch": 0.8400837257980115, + "grad_norm": 2.4165371301116045, + "learning_rate": 1.3116366496290633e-06, + "loss": 0.8785, + "step": 8027 + }, + { + "epoch": 0.8401883830455259, + "grad_norm": 1.9222172060976765, + "learning_rate": 1.3099589003700341e-06, + "loss": 0.9783, + "step": 8028 + }, + { + "epoch": 0.8402930402930403, + "grad_norm": 2.3781849035951943, + "learning_rate": 1.3082821496143428e-06, + "loss": 0.7902, + "step": 8029 + }, + { + "epoch": 0.8403976975405547, + "grad_norm": 2.0993772636604304, + "learning_rate": 1.3066063975546538e-06, + "loss": 0.9355, + "step": 8030 + }, + { + "epoch": 0.8405023547880691, + "grad_norm": 2.1721127058441674, + "learning_rate": 1.3049316443835137e-06, + "loss": 0.7599, + "step": 8031 + }, + { + "epoch": 0.8406070120355834, + "grad_norm": 2.531892795002363, + "learning_rate": 1.3032578902933546e-06, + "loss": 0.8561, + "step": 8032 + }, + { + "epoch": 0.8407116692830978, + "grad_norm": 2.2300704327515004, + "learning_rate": 1.301585135476492e-06, + "loss": 0.8676, + "step": 8033 + }, + { + "epoch": 0.8408163265306122, + "grad_norm": 2.1993303961757826, + "learning_rate": 1.2999133801251295e-06, + "loss": 0.8671, + "step": 8034 + }, + { + "epoch": 0.8409209837781266, + "grad_norm": 2.154741893990914, + "learning_rate": 1.2982426244313595e-06, + "loss": 0.8462, + "step": 8035 + }, + { + "epoch": 0.841025641025641, + "grad_norm": 2.129202801530897, + "learning_rate": 1.2965728685871525e-06, + "loss": 1.0065, + "step": 8036 + }, + { + "epoch": 0.8411302982731554, + "grad_norm": 1.8293468950695384, + "learning_rate": 1.2949041127843665e-06, + "loss": 0.8104, + "step": 8037 + }, + { + "epoch": 0.8412349555206698, + "grad_norm": 1.907250964658945, + "learning_rate": 1.2932363572147445e-06, + "loss": 0.8117, + "step": 8038 + }, + { + "epoch": 0.8413396127681841, + "grad_norm": 2.2639632750436793, + "learning_rate": 1.2915696020699142e-06, + "loss": 0.8617, + "step": 8039 + }, + { + "epoch": 0.8414442700156985, + "grad_norm": 2.308249827742698, + "learning_rate": 1.2899038475413917e-06, + "loss": 0.8622, + "step": 8040 + }, + { + "epoch": 0.841548927263213, + "grad_norm": 2.227723639811261, + "learning_rate": 1.2882390938205725e-06, + "loss": 0.9042, + "step": 8041 + }, + { + "epoch": 0.8416535845107274, + "grad_norm": 2.1043886188970413, + "learning_rate": 1.2865753410987447e-06, + "loss": 0.9361, + "step": 8042 + }, + { + "epoch": 0.8417582417582418, + "grad_norm": 1.845500363111109, + "learning_rate": 1.2849125895670733e-06, + "loss": 0.8998, + "step": 8043 + }, + { + "epoch": 0.8418628990057562, + "grad_norm": 2.042113662517905, + "learning_rate": 1.283250839416611e-06, + "loss": 0.8967, + "step": 8044 + }, + { + "epoch": 0.8419675562532706, + "grad_norm": 2.5260072411582986, + "learning_rate": 1.2815900908383004e-06, + "loss": 0.8395, + "step": 8045 + }, + { + "epoch": 0.842072213500785, + "grad_norm": 1.569850831580542, + "learning_rate": 1.2799303440229616e-06, + "loss": 0.764, + "step": 8046 + }, + { + "epoch": 0.8421768707482993, + "grad_norm": 2.164867452931564, + "learning_rate": 1.2782715991613048e-06, + "loss": 0.8424, + "step": 8047 + }, + { + "epoch": 0.8422815279958137, + "grad_norm": 1.713908988310335, + "learning_rate": 1.276613856443919e-06, + "loss": 0.8094, + "step": 8048 + }, + { + "epoch": 0.8423861852433281, + "grad_norm": 1.9683438158526272, + "learning_rate": 1.2749571160612872e-06, + "loss": 0.8499, + "step": 8049 + }, + { + "epoch": 0.8424908424908425, + "grad_norm": 2.9203629814106002, + "learning_rate": 1.2733013782037695e-06, + "loss": 0.8889, + "step": 8050 + }, + { + "epoch": 0.8425954997383569, + "grad_norm": 1.8540479952789417, + "learning_rate": 1.2716466430616148e-06, + "loss": 0.8076, + "step": 8051 + }, + { + "epoch": 0.8427001569858713, + "grad_norm": 1.9814166025936952, + "learning_rate": 1.2699929108249565e-06, + "loss": 0.8634, + "step": 8052 + }, + { + "epoch": 0.8428048142333857, + "grad_norm": 1.9793401850173653, + "learning_rate": 1.2683401816838071e-06, + "loss": 0.8196, + "step": 8053 + }, + { + "epoch": 0.8429094714809, + "grad_norm": 2.267300532522284, + "learning_rate": 1.2666884558280745e-06, + "loss": 0.7867, + "step": 8054 + }, + { + "epoch": 0.8430141287284144, + "grad_norm": 1.789887206030474, + "learning_rate": 1.2650377334475416e-06, + "loss": 0.7908, + "step": 8055 + }, + { + "epoch": 0.8431187859759288, + "grad_norm": 1.881345825048694, + "learning_rate": 1.2633880147318822e-06, + "loss": 0.8574, + "step": 8056 + }, + { + "epoch": 0.8432234432234432, + "grad_norm": 1.9203037592773267, + "learning_rate": 1.2617392998706502e-06, + "loss": 0.7534, + "step": 8057 + }, + { + "epoch": 0.8433281004709576, + "grad_norm": 2.2154089371004573, + "learning_rate": 1.260091589053284e-06, + "loss": 0.8416, + "step": 8058 + }, + { + "epoch": 0.843432757718472, + "grad_norm": 2.1555356685174454, + "learning_rate": 1.2584448824691132e-06, + "loss": 0.8896, + "step": 8059 + }, + { + "epoch": 0.8435374149659864, + "grad_norm": 2.081590502046218, + "learning_rate": 1.2567991803073476e-06, + "loss": 0.8501, + "step": 8060 + }, + { + "epoch": 0.8436420722135007, + "grad_norm": 2.1329378594631203, + "learning_rate": 1.2551544827570805e-06, + "loss": 0.8398, + "step": 8061 + }, + { + "epoch": 0.8437467294610151, + "grad_norm": 2.037326064879042, + "learning_rate": 1.2535107900072918e-06, + "loss": 0.9452, + "step": 8062 + }, + { + "epoch": 0.8438513867085295, + "grad_norm": 1.8536690182768223, + "learning_rate": 1.2518681022468414e-06, + "loss": 0.841, + "step": 8063 + }, + { + "epoch": 0.843956043956044, + "grad_norm": 2.418709398888962, + "learning_rate": 1.250226419664483e-06, + "loss": 0.8493, + "step": 8064 + }, + { + "epoch": 0.8440607012035584, + "grad_norm": 1.9982124764464644, + "learning_rate": 1.2485857424488457e-06, + "loss": 0.9066, + "step": 8065 + }, + { + "epoch": 0.8441653584510728, + "grad_norm": 1.826129673099164, + "learning_rate": 1.2469460707884485e-06, + "loss": 0.7961, + "step": 8066 + }, + { + "epoch": 0.8442700156985872, + "grad_norm": 2.0052685371560126, + "learning_rate": 1.2453074048716896e-06, + "loss": 0.9495, + "step": 8067 + }, + { + "epoch": 0.8443746729461015, + "grad_norm": 2.4774578028328977, + "learning_rate": 1.2436697448868572e-06, + "loss": 0.8768, + "step": 8068 + }, + { + "epoch": 0.8444793301936159, + "grad_norm": 1.9032740646595938, + "learning_rate": 1.2420330910221245e-06, + "loss": 0.9143, + "step": 8069 + }, + { + "epoch": 0.8445839874411303, + "grad_norm": 2.1131379226433524, + "learning_rate": 1.240397443465544e-06, + "loss": 0.8795, + "step": 8070 + }, + { + "epoch": 0.8446886446886447, + "grad_norm": 2.037973685230345, + "learning_rate": 1.2387628024050557e-06, + "loss": 0.8836, + "step": 8071 + }, + { + "epoch": 0.8447933019361591, + "grad_norm": 1.8857034384869027, + "learning_rate": 1.237129168028479e-06, + "loss": 0.8511, + "step": 8072 + }, + { + "epoch": 0.8448979591836735, + "grad_norm": 2.0457364245538807, + "learning_rate": 1.2354965405235276e-06, + "loss": 0.9149, + "step": 8073 + }, + { + "epoch": 0.8450026164311879, + "grad_norm": 2.1833956563172103, + "learning_rate": 1.2338649200777908e-06, + "loss": 0.7978, + "step": 8074 + }, + { + "epoch": 0.8451072736787022, + "grad_norm": 1.95060154540977, + "learning_rate": 1.2322343068787456e-06, + "loss": 0.8084, + "step": 8075 + }, + { + "epoch": 0.8452119309262166, + "grad_norm": 2.051054237471809, + "learning_rate": 1.23060470111375e-06, + "loss": 0.8513, + "step": 8076 + }, + { + "epoch": 0.845316588173731, + "grad_norm": 1.9270136243745513, + "learning_rate": 1.2289761029700553e-06, + "loss": 0.8409, + "step": 8077 + }, + { + "epoch": 0.8454212454212454, + "grad_norm": 2.2645480935646702, + "learning_rate": 1.227348512634784e-06, + "loss": 0.9181, + "step": 8078 + }, + { + "epoch": 0.8455259026687598, + "grad_norm": 2.2076948615617185, + "learning_rate": 1.2257219302949541e-06, + "loss": 0.9305, + "step": 8079 + }, + { + "epoch": 0.8456305599162742, + "grad_norm": 2.511590266125373, + "learning_rate": 1.224096356137463e-06, + "loss": 0.911, + "step": 8080 + }, + { + "epoch": 0.8457352171637886, + "grad_norm": 1.9894212656748564, + "learning_rate": 1.2224717903490901e-06, + "loss": 0.8961, + "step": 8081 + }, + { + "epoch": 0.8458398744113029, + "grad_norm": 2.2954720374673028, + "learning_rate": 1.2208482331165007e-06, + "loss": 0.8985, + "step": 8082 + }, + { + "epoch": 0.8459445316588173, + "grad_norm": 1.9884523540654926, + "learning_rate": 1.2192256846262484e-06, + "loss": 0.8576, + "step": 8083 + }, + { + "epoch": 0.8460491889063317, + "grad_norm": 1.6017827918804268, + "learning_rate": 1.2176041450647657e-06, + "loss": 0.7318, + "step": 8084 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 2.191831478479857, + "learning_rate": 1.215983614618369e-06, + "loss": 0.8629, + "step": 8085 + }, + { + "epoch": 0.8462585034013606, + "grad_norm": 1.9686856066340168, + "learning_rate": 1.2143640934732636e-06, + "loss": 0.7544, + "step": 8086 + }, + { + "epoch": 0.846363160648875, + "grad_norm": 2.15328965743791, + "learning_rate": 1.2127455818155321e-06, + "loss": 0.9058, + "step": 8087 + }, + { + "epoch": 0.8464678178963894, + "grad_norm": 2.0301113009573046, + "learning_rate": 1.2111280798311486e-06, + "loss": 0.8852, + "step": 8088 + }, + { + "epoch": 0.8465724751439038, + "grad_norm": 2.124293788106227, + "learning_rate": 1.209511587705966e-06, + "loss": 0.825, + "step": 8089 + }, + { + "epoch": 0.8466771323914181, + "grad_norm": 1.8508848742570292, + "learning_rate": 1.2078961056257222e-06, + "loss": 0.8428, + "step": 8090 + }, + { + "epoch": 0.8467817896389325, + "grad_norm": 2.1311794047736337, + "learning_rate": 1.2062816337760374e-06, + "loss": 0.9301, + "step": 8091 + }, + { + "epoch": 0.8468864468864469, + "grad_norm": 2.104243985600498, + "learning_rate": 1.2046681723424214e-06, + "loss": 0.872, + "step": 8092 + }, + { + "epoch": 0.8469911041339613, + "grad_norm": 2.130616894657293, + "learning_rate": 1.20305572151026e-06, + "loss": 0.7725, + "step": 8093 + }, + { + "epoch": 0.8470957613814757, + "grad_norm": 1.9408014352741592, + "learning_rate": 1.2014442814648318e-06, + "loss": 0.9378, + "step": 8094 + }, + { + "epoch": 0.8472004186289901, + "grad_norm": 2.430277353641727, + "learning_rate": 1.1998338523912923e-06, + "loss": 0.8954, + "step": 8095 + }, + { + "epoch": 0.8473050758765045, + "grad_norm": 2.147447871976377, + "learning_rate": 1.198224434474683e-06, + "loss": 0.9456, + "step": 8096 + }, + { + "epoch": 0.8474097331240188, + "grad_norm": 2.150148583677661, + "learning_rate": 1.1966160278999273e-06, + "loss": 0.845, + "step": 8097 + }, + { + "epoch": 0.8475143903715332, + "grad_norm": 1.8999522656749204, + "learning_rate": 1.1950086328518383e-06, + "loss": 0.9598, + "step": 8098 + }, + { + "epoch": 0.8476190476190476, + "grad_norm": 2.22762433278561, + "learning_rate": 1.1934022495151064e-06, + "loss": 0.9627, + "step": 8099 + }, + { + "epoch": 0.847723704866562, + "grad_norm": 1.9018640415303447, + "learning_rate": 1.1917968780743094e-06, + "loss": 0.9066, + "step": 8100 + }, + { + "epoch": 0.8478283621140764, + "grad_norm": 2.2146699983704914, + "learning_rate": 1.1901925187139052e-06, + "loss": 0.8584, + "step": 8101 + }, + { + "epoch": 0.8479330193615908, + "grad_norm": 2.0599972177345305, + "learning_rate": 1.1885891716182395e-06, + "loss": 0.8889, + "step": 8102 + }, + { + "epoch": 0.8480376766091052, + "grad_norm": 2.21986430951051, + "learning_rate": 1.1869868369715431e-06, + "loss": 0.9083, + "step": 8103 + }, + { + "epoch": 0.8481423338566195, + "grad_norm": 2.1417810174460854, + "learning_rate": 1.1853855149579251e-06, + "loss": 0.8543, + "step": 8104 + }, + { + "epoch": 0.8482469911041339, + "grad_norm": 1.7461793353367485, + "learning_rate": 1.1837852057613808e-06, + "loss": 0.7139, + "step": 8105 + }, + { + "epoch": 0.8483516483516483, + "grad_norm": 1.80563643788023, + "learning_rate": 1.182185909565785e-06, + "loss": 0.8036, + "step": 8106 + }, + { + "epoch": 0.8484563055991627, + "grad_norm": 2.3503816379233844, + "learning_rate": 1.1805876265549076e-06, + "loss": 0.9276, + "step": 8107 + }, + { + "epoch": 0.8485609628466771, + "grad_norm": 2.4489921440588605, + "learning_rate": 1.1789903569123896e-06, + "loss": 0.9568, + "step": 8108 + }, + { + "epoch": 0.8486656200941916, + "grad_norm": 1.9034330562968986, + "learning_rate": 1.1773941008217627e-06, + "loss": 0.8873, + "step": 8109 + }, + { + "epoch": 0.848770277341706, + "grad_norm": 2.2375424098728205, + "learning_rate": 1.175798858466436e-06, + "loss": 0.8962, + "step": 8110 + }, + { + "epoch": 0.8488749345892203, + "grad_norm": 2.0427220355559004, + "learning_rate": 1.1742046300297084e-06, + "loss": 0.8506, + "step": 8111 + }, + { + "epoch": 0.8489795918367347, + "grad_norm": 1.9535985727245424, + "learning_rate": 1.1726114156947644e-06, + "loss": 0.7011, + "step": 8112 + }, + { + "epoch": 0.8490842490842491, + "grad_norm": 1.9013330814423128, + "learning_rate": 1.171019215644662e-06, + "loss": 0.8496, + "step": 8113 + }, + { + "epoch": 0.8491889063317635, + "grad_norm": 1.8885673787395196, + "learning_rate": 1.1694280300623505e-06, + "loss": 0.9879, + "step": 8114 + }, + { + "epoch": 0.8492935635792779, + "grad_norm": 1.7156423861186427, + "learning_rate": 1.1678378591306604e-06, + "loss": 0.812, + "step": 8115 + }, + { + "epoch": 0.8493982208267923, + "grad_norm": 2.1750963180245293, + "learning_rate": 1.1662487030323021e-06, + "loss": 0.9194, + "step": 8116 + }, + { + "epoch": 0.8495028780743067, + "grad_norm": 2.0315617464728604, + "learning_rate": 1.1646605619498784e-06, + "loss": 0.8036, + "step": 8117 + }, + { + "epoch": 0.849607535321821, + "grad_norm": 2.1841952937990325, + "learning_rate": 1.1630734360658669e-06, + "loss": 0.8595, + "step": 8118 + }, + { + "epoch": 0.8497121925693354, + "grad_norm": 2.1088635653194627, + "learning_rate": 1.161487325562629e-06, + "loss": 0.7815, + "step": 8119 + }, + { + "epoch": 0.8498168498168498, + "grad_norm": 2.058500389607447, + "learning_rate": 1.1599022306224184e-06, + "loss": 0.8606, + "step": 8120 + }, + { + "epoch": 0.8499215070643642, + "grad_norm": 2.140038653165263, + "learning_rate": 1.15831815142736e-06, + "loss": 0.9239, + "step": 8121 + }, + { + "epoch": 0.8500261643118786, + "grad_norm": 2.0778034483126575, + "learning_rate": 1.1567350881594708e-06, + "loss": 0.955, + "step": 8122 + }, + { + "epoch": 0.850130821559393, + "grad_norm": 1.998286679619626, + "learning_rate": 1.1551530410006484e-06, + "loss": 0.9129, + "step": 8123 + }, + { + "epoch": 0.8502354788069074, + "grad_norm": 1.5817030186016467, + "learning_rate": 1.153572010132672e-06, + "loss": 0.7925, + "step": 8124 + }, + { + "epoch": 0.8503401360544217, + "grad_norm": 1.9794772310958975, + "learning_rate": 1.1519919957372029e-06, + "loss": 0.9053, + "step": 8125 + }, + { + "epoch": 0.8504447933019361, + "grad_norm": 1.8803338170594943, + "learning_rate": 1.1504129979957913e-06, + "loss": 0.8949, + "step": 8126 + }, + { + "epoch": 0.8505494505494505, + "grad_norm": 2.145083879121954, + "learning_rate": 1.1488350170898676e-06, + "loss": 0.9441, + "step": 8127 + }, + { + "epoch": 0.8506541077969649, + "grad_norm": 1.864355430498479, + "learning_rate": 1.1472580532007405e-06, + "loss": 0.9112, + "step": 8128 + }, + { + "epoch": 0.8507587650444793, + "grad_norm": 1.5284392423180004, + "learning_rate": 1.1456821065096125e-06, + "loss": 0.7273, + "step": 8129 + }, + { + "epoch": 0.8508634222919937, + "grad_norm": 1.7736534908958845, + "learning_rate": 1.1441071771975576e-06, + "loss": 0.746, + "step": 8130 + }, + { + "epoch": 0.8509680795395081, + "grad_norm": 2.0415742507935613, + "learning_rate": 1.1425332654455422e-06, + "loss": 0.7637, + "step": 8131 + }, + { + "epoch": 0.8510727367870226, + "grad_norm": 1.9995682557341876, + "learning_rate": 1.140960371434411e-06, + "loss": 0.8595, + "step": 8132 + }, + { + "epoch": 0.8511773940345368, + "grad_norm": 2.134115968088377, + "learning_rate": 1.1393884953448931e-06, + "loss": 0.9112, + "step": 8133 + }, + { + "epoch": 0.8512820512820513, + "grad_norm": 2.0560620276562083, + "learning_rate": 1.1378176373575977e-06, + "loss": 0.8082, + "step": 8134 + }, + { + "epoch": 0.8513867085295657, + "grad_norm": 1.951430060069111, + "learning_rate": 1.1362477976530206e-06, + "loss": 0.8975, + "step": 8135 + }, + { + "epoch": 0.8514913657770801, + "grad_norm": 1.6765488705122318, + "learning_rate": 1.1346789764115418e-06, + "loss": 0.7827, + "step": 8136 + }, + { + "epoch": 0.8515960230245945, + "grad_norm": 2.1566144334273014, + "learning_rate": 1.1331111738134193e-06, + "loss": 0.8666, + "step": 8137 + }, + { + "epoch": 0.8517006802721089, + "grad_norm": 2.324273210903453, + "learning_rate": 1.131544390038799e-06, + "loss": 0.8484, + "step": 8138 + }, + { + "epoch": 0.8518053375196233, + "grad_norm": 2.1363318732510352, + "learning_rate": 1.129978625267707e-06, + "loss": 0.8032, + "step": 8139 + }, + { + "epoch": 0.8519099947671376, + "grad_norm": 1.873627689153686, + "learning_rate": 1.1284138796800492e-06, + "loss": 0.8898, + "step": 8140 + }, + { + "epoch": 0.852014652014652, + "grad_norm": 2.2677845731395205, + "learning_rate": 1.1268501534556242e-06, + "loss": 0.8408, + "step": 8141 + }, + { + "epoch": 0.8521193092621664, + "grad_norm": 2.0835248846493557, + "learning_rate": 1.125287446774105e-06, + "loss": 0.9684, + "step": 8142 + }, + { + "epoch": 0.8522239665096808, + "grad_norm": 2.1770740624500657, + "learning_rate": 1.1237257598150487e-06, + "loss": 0.8597, + "step": 8143 + }, + { + "epoch": 0.8523286237571952, + "grad_norm": 2.1071307255681284, + "learning_rate": 1.1221650927578942e-06, + "loss": 0.9521, + "step": 8144 + }, + { + "epoch": 0.8524332810047096, + "grad_norm": 2.3515348488854944, + "learning_rate": 1.1206054457819671e-06, + "loss": 0.8748, + "step": 8145 + }, + { + "epoch": 0.852537938252224, + "grad_norm": 2.228241072663553, + "learning_rate": 1.1190468190664782e-06, + "loss": 0.8862, + "step": 8146 + }, + { + "epoch": 0.8526425954997383, + "grad_norm": 2.0454265798917985, + "learning_rate": 1.1174892127905123e-06, + "loss": 0.7851, + "step": 8147 + }, + { + "epoch": 0.8527472527472527, + "grad_norm": 2.9906239949974824, + "learning_rate": 1.115932627133043e-06, + "loss": 0.9877, + "step": 8148 + }, + { + "epoch": 0.8528519099947671, + "grad_norm": 1.7183578316341401, + "learning_rate": 1.1143770622729232e-06, + "loss": 0.8169, + "step": 8149 + }, + { + "epoch": 0.8529565672422815, + "grad_norm": 1.9631303418317718, + "learning_rate": 1.112822518388893e-06, + "loss": 0.9437, + "step": 8150 + }, + { + "epoch": 0.8530612244897959, + "grad_norm": 2.2478081957198883, + "learning_rate": 1.1112689956595724e-06, + "loss": 0.9049, + "step": 8151 + }, + { + "epoch": 0.8531658817373103, + "grad_norm": 1.9907598676903082, + "learning_rate": 1.109716494263463e-06, + "loss": 0.7481, + "step": 8152 + }, + { + "epoch": 0.8532705389848247, + "grad_norm": 2.092413746291313, + "learning_rate": 1.1081650143789513e-06, + "loss": 0.888, + "step": 8153 + }, + { + "epoch": 0.853375196232339, + "grad_norm": 2.150281264392465, + "learning_rate": 1.1066145561843033e-06, + "loss": 0.9098, + "step": 8154 + }, + { + "epoch": 0.8534798534798534, + "grad_norm": 1.7025890287640117, + "learning_rate": 1.1050651198576713e-06, + "loss": 0.7393, + "step": 8155 + }, + { + "epoch": 0.8535845107273679, + "grad_norm": 2.165113850117995, + "learning_rate": 1.1035167055770901e-06, + "loss": 0.8349, + "step": 8156 + }, + { + "epoch": 0.8536891679748823, + "grad_norm": 2.5762419911687826, + "learning_rate": 1.1019693135204757e-06, + "loss": 0.969, + "step": 8157 + }, + { + "epoch": 0.8537938252223967, + "grad_norm": 1.9241185785842263, + "learning_rate": 1.1004229438656245e-06, + "loss": 0.7895, + "step": 8158 + }, + { + "epoch": 0.8538984824699111, + "grad_norm": 2.028496874452248, + "learning_rate": 1.0988775967902165e-06, + "loss": 0.8903, + "step": 8159 + }, + { + "epoch": 0.8540031397174255, + "grad_norm": 2.388468483206559, + "learning_rate": 1.09733327247182e-06, + "loss": 0.773, + "step": 8160 + }, + { + "epoch": 0.8541077969649398, + "grad_norm": 1.941083291471243, + "learning_rate": 1.0957899710878772e-06, + "loss": 0.8528, + "step": 8161 + }, + { + "epoch": 0.8542124542124542, + "grad_norm": 2.2068646875053877, + "learning_rate": 1.0942476928157175e-06, + "loss": 0.8462, + "step": 8162 + }, + { + "epoch": 0.8543171114599686, + "grad_norm": 2.071496324715311, + "learning_rate": 1.0927064378325503e-06, + "loss": 0.9587, + "step": 8163 + }, + { + "epoch": 0.854421768707483, + "grad_norm": 2.150573325542622, + "learning_rate": 1.0911662063154694e-06, + "loss": 0.9465, + "step": 8164 + }, + { + "epoch": 0.8545264259549974, + "grad_norm": 1.7662897309607424, + "learning_rate": 1.0896269984414543e-06, + "loss": 0.8114, + "step": 8165 + }, + { + "epoch": 0.8546310832025118, + "grad_norm": 1.7296411837745311, + "learning_rate": 1.088088814387359e-06, + "loss": 0.7644, + "step": 8166 + }, + { + "epoch": 0.8547357404500262, + "grad_norm": 2.7077350662708257, + "learning_rate": 1.086551654329927e-06, + "loss": 0.6619, + "step": 8167 + }, + { + "epoch": 0.8548403976975405, + "grad_norm": 1.7773722774962573, + "learning_rate": 1.0850155184457767e-06, + "loss": 0.768, + "step": 8168 + }, + { + "epoch": 0.8549450549450549, + "grad_norm": 1.913533199941065, + "learning_rate": 1.083480406911418e-06, + "loss": 0.9215, + "step": 8169 + }, + { + "epoch": 0.8550497121925693, + "grad_norm": 2.2851663792939894, + "learning_rate": 1.0819463199032354e-06, + "loss": 0.9138, + "step": 8170 + }, + { + "epoch": 0.8551543694400837, + "grad_norm": 2.434539639739813, + "learning_rate": 1.0804132575974979e-06, + "loss": 0.7474, + "step": 8171 + }, + { + "epoch": 0.8552590266875981, + "grad_norm": 2.220301148902414, + "learning_rate": 1.0788812201703614e-06, + "loss": 0.9146, + "step": 8172 + }, + { + "epoch": 0.8553636839351125, + "grad_norm": 2.412245947837503, + "learning_rate": 1.0773502077978571e-06, + "loss": 0.9518, + "step": 8173 + }, + { + "epoch": 0.8554683411826269, + "grad_norm": 2.0701141368028915, + "learning_rate": 1.075820220655901e-06, + "loss": 0.7442, + "step": 8174 + }, + { + "epoch": 0.8555729984301413, + "grad_norm": 2.0366797846248668, + "learning_rate": 1.0742912589202935e-06, + "loss": 0.8856, + "step": 8175 + }, + { + "epoch": 0.8556776556776556, + "grad_norm": 2.2237305192244743, + "learning_rate": 1.0727633227667157e-06, + "loss": 0.9214, + "step": 8176 + }, + { + "epoch": 0.85578231292517, + "grad_norm": 1.9347821344648546, + "learning_rate": 1.0712364123707287e-06, + "loss": 0.9748, + "step": 8177 + }, + { + "epoch": 0.8558869701726844, + "grad_norm": 2.0653013623994054, + "learning_rate": 1.069710527907777e-06, + "loss": 0.8951, + "step": 8178 + }, + { + "epoch": 0.8559916274201989, + "grad_norm": 2.3214860069724947, + "learning_rate": 1.068185669553191e-06, + "loss": 0.9846, + "step": 8179 + }, + { + "epoch": 0.8560962846677133, + "grad_norm": 2.1905071628526986, + "learning_rate": 1.0666618374821759e-06, + "loss": 0.7894, + "step": 8180 + }, + { + "epoch": 0.8562009419152277, + "grad_norm": 2.1369361625201875, + "learning_rate": 1.0651390318698285e-06, + "loss": 0.9375, + "step": 8181 + }, + { + "epoch": 0.8563055991627421, + "grad_norm": 2.1626356161328935, + "learning_rate": 1.0636172528911182e-06, + "loss": 0.9148, + "step": 8182 + }, + { + "epoch": 0.8564102564102564, + "grad_norm": 2.085183578524157, + "learning_rate": 1.0620965007208993e-06, + "loss": 0.928, + "step": 8183 + }, + { + "epoch": 0.8565149136577708, + "grad_norm": 1.7680046788843726, + "learning_rate": 1.0605767755339147e-06, + "loss": 0.7463, + "step": 8184 + }, + { + "epoch": 0.8566195709052852, + "grad_norm": 1.9582799315088195, + "learning_rate": 1.0590580775047798e-06, + "loss": 0.9068, + "step": 8185 + }, + { + "epoch": 0.8567242281527996, + "grad_norm": 2.0988281526131845, + "learning_rate": 1.0575404068079965e-06, + "loss": 0.8693, + "step": 8186 + }, + { + "epoch": 0.856828885400314, + "grad_norm": 2.192703067041728, + "learning_rate": 1.0560237636179493e-06, + "loss": 0.8442, + "step": 8187 + }, + { + "epoch": 0.8569335426478284, + "grad_norm": 1.8377811369460275, + "learning_rate": 1.0545081481089015e-06, + "loss": 0.853, + "step": 8188 + }, + { + "epoch": 0.8570381998953428, + "grad_norm": 1.9277235332711045, + "learning_rate": 1.0529935604550012e-06, + "loss": 0.8852, + "step": 8189 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 1.8127579290454636, + "learning_rate": 1.0514800008302806e-06, + "loss": 0.7326, + "step": 8190 + }, + { + "epoch": 0.8572475143903715, + "grad_norm": 2.4923220960574306, + "learning_rate": 1.049967469408648e-06, + "loss": 1.0099, + "step": 8191 + }, + { + "epoch": 0.8573521716378859, + "grad_norm": 2.291069167052195, + "learning_rate": 1.0484559663638971e-06, + "loss": 0.8967, + "step": 8192 + }, + { + "epoch": 0.8574568288854003, + "grad_norm": 2.1378655909099367, + "learning_rate": 1.0469454918696998e-06, + "loss": 0.9139, + "step": 8193 + }, + { + "epoch": 0.8575614861329147, + "grad_norm": 2.0113703844240303, + "learning_rate": 1.0454360460996181e-06, + "loss": 0.8301, + "step": 8194 + }, + { + "epoch": 0.8576661433804291, + "grad_norm": 2.1271770785333066, + "learning_rate": 1.0439276292270872e-06, + "loss": 0.812, + "step": 8195 + }, + { + "epoch": 0.8577708006279435, + "grad_norm": 2.3638152006231903, + "learning_rate": 1.042420241425427e-06, + "loss": 0.777, + "step": 8196 + }, + { + "epoch": 0.8578754578754578, + "grad_norm": 1.9732167242741625, + "learning_rate": 1.0409138828678389e-06, + "loss": 0.6977, + "step": 8197 + }, + { + "epoch": 0.8579801151229722, + "grad_norm": 2.1089360027170083, + "learning_rate": 1.0394085537274069e-06, + "loss": 0.7946, + "step": 8198 + }, + { + "epoch": 0.8580847723704866, + "grad_norm": 1.7844456116204388, + "learning_rate": 1.0379042541771e-06, + "loss": 0.796, + "step": 8199 + }, + { + "epoch": 0.858189429618001, + "grad_norm": 2.136236846911897, + "learning_rate": 1.0364009843897615e-06, + "loss": 0.8329, + "step": 8200 + }, + { + "epoch": 0.8582940868655154, + "grad_norm": 1.86851966011732, + "learning_rate": 1.0348987445381208e-06, + "loss": 0.774, + "step": 8201 + }, + { + "epoch": 0.8583987441130299, + "grad_norm": 2.1299132196721042, + "learning_rate": 1.0333975347947866e-06, + "loss": 0.9495, + "step": 8202 + }, + { + "epoch": 0.8585034013605443, + "grad_norm": 1.785497778491649, + "learning_rate": 1.0318973553322553e-06, + "loss": 0.8073, + "step": 8203 + }, + { + "epoch": 0.8586080586080586, + "grad_norm": 2.259096733636607, + "learning_rate": 1.0303982063228978e-06, + "loss": 0.8527, + "step": 8204 + }, + { + "epoch": 0.858712715855573, + "grad_norm": 2.0636141494036515, + "learning_rate": 1.02890008793897e-06, + "loss": 0.898, + "step": 8205 + }, + { + "epoch": 0.8588173731030874, + "grad_norm": 2.349058809970281, + "learning_rate": 1.0274030003526069e-06, + "loss": 0.8855, + "step": 8206 + }, + { + "epoch": 0.8589220303506018, + "grad_norm": 2.152390907657251, + "learning_rate": 1.0259069437358271e-06, + "loss": 0.8474, + "step": 8207 + }, + { + "epoch": 0.8590266875981162, + "grad_norm": 2.104781303316511, + "learning_rate": 1.024411918260535e-06, + "loss": 0.8537, + "step": 8208 + }, + { + "epoch": 0.8591313448456306, + "grad_norm": 2.002176163454363, + "learning_rate": 1.0229179240985089e-06, + "loss": 0.8734, + "step": 8209 + }, + { + "epoch": 0.859236002093145, + "grad_norm": 2.148962156095097, + "learning_rate": 1.021424961421411e-06, + "loss": 0.8443, + "step": 8210 + }, + { + "epoch": 0.8593406593406593, + "grad_norm": 1.7193849211791452, + "learning_rate": 1.0199330304007858e-06, + "loss": 0.7636, + "step": 8211 + }, + { + "epoch": 0.8594453165881737, + "grad_norm": 2.107603846212653, + "learning_rate": 1.0184421312080594e-06, + "loss": 0.9227, + "step": 8212 + }, + { + "epoch": 0.8595499738356881, + "grad_norm": 1.9252852291664944, + "learning_rate": 1.0169522640145412e-06, + "loss": 0.9483, + "step": 8213 + }, + { + "epoch": 0.8596546310832025, + "grad_norm": 2.226183549251737, + "learning_rate": 1.015463428991419e-06, + "loss": 1.0013, + "step": 8214 + }, + { + "epoch": 0.8597592883307169, + "grad_norm": 2.323773979390847, + "learning_rate": 1.013975626309759e-06, + "loss": 0.8943, + "step": 8215 + }, + { + "epoch": 0.8598639455782313, + "grad_norm": 2.019532286245795, + "learning_rate": 1.0124888561405188e-06, + "loss": 0.8684, + "step": 8216 + }, + { + "epoch": 0.8599686028257457, + "grad_norm": 2.270980325536687, + "learning_rate": 1.0110031186545265e-06, + "loss": 0.9197, + "step": 8217 + }, + { + "epoch": 0.8600732600732601, + "grad_norm": 2.00584994006669, + "learning_rate": 1.0095184140225011e-06, + "loss": 0.804, + "step": 8218 + }, + { + "epoch": 0.8601779173207744, + "grad_norm": 2.059032541958812, + "learning_rate": 1.0080347424150349e-06, + "loss": 0.9117, + "step": 8219 + }, + { + "epoch": 0.8602825745682888, + "grad_norm": 1.8583774944181204, + "learning_rate": 1.006552104002605e-06, + "loss": 0.8409, + "step": 8220 + }, + { + "epoch": 0.8603872318158032, + "grad_norm": 1.7469649900300717, + "learning_rate": 1.0050704989555693e-06, + "loss": 0.7859, + "step": 8221 + }, + { + "epoch": 0.8604918890633176, + "grad_norm": 1.9209381565368364, + "learning_rate": 1.0035899274441684e-06, + "loss": 0.8993, + "step": 8222 + }, + { + "epoch": 0.860596546310832, + "grad_norm": 2.034671367413749, + "learning_rate": 1.002110389638521e-06, + "loss": 0.8571, + "step": 8223 + }, + { + "epoch": 0.8607012035583465, + "grad_norm": 1.9206079541192072, + "learning_rate": 1.0006318857086328e-06, + "loss": 0.8058, + "step": 8224 + }, + { + "epoch": 0.8608058608058609, + "grad_norm": 1.7267084357758915, + "learning_rate": 9.991544158243848e-07, + "loss": 0.7573, + "step": 8225 + }, + { + "epoch": 0.8609105180533752, + "grad_norm": 2.271042408187385, + "learning_rate": 9.976779801555404e-07, + "loss": 0.9341, + "step": 8226 + }, + { + "epoch": 0.8610151753008896, + "grad_norm": 2.194966202864827, + "learning_rate": 9.962025788717434e-07, + "loss": 0.8549, + "step": 8227 + }, + { + "epoch": 0.861119832548404, + "grad_norm": 2.1563970927227016, + "learning_rate": 9.94728212142525e-07, + "loss": 0.9266, + "step": 8228 + }, + { + "epoch": 0.8612244897959184, + "grad_norm": 1.98123025878276, + "learning_rate": 9.93254880137291e-07, + "loss": 0.8194, + "step": 8229 + }, + { + "epoch": 0.8613291470434328, + "grad_norm": 2.027077949956647, + "learning_rate": 9.917825830253303e-07, + "loss": 0.962, + "step": 8230 + }, + { + "epoch": 0.8614338042909472, + "grad_norm": 2.3467824454655966, + "learning_rate": 9.903113209758098e-07, + "loss": 0.9625, + "step": 8231 + }, + { + "epoch": 0.8615384615384616, + "grad_norm": 1.7368296277382502, + "learning_rate": 9.888410941577819e-07, + "loss": 0.7428, + "step": 8232 + }, + { + "epoch": 0.8616431187859759, + "grad_norm": 1.9207232431358994, + "learning_rate": 9.87371902740183e-07, + "loss": 0.9178, + "step": 8233 + }, + { + "epoch": 0.8617477760334903, + "grad_norm": 1.7639576896606057, + "learning_rate": 9.859037468918232e-07, + "loss": 0.8275, + "step": 8234 + }, + { + "epoch": 0.8618524332810047, + "grad_norm": 2.096385523082317, + "learning_rate": 9.844366267813965e-07, + "loss": 0.8697, + "step": 8235 + }, + { + "epoch": 0.8619570905285191, + "grad_norm": 2.1971747126218157, + "learning_rate": 9.82970542577475e-07, + "loss": 0.86, + "step": 8236 + }, + { + "epoch": 0.8620617477760335, + "grad_norm": 2.0670900252205056, + "learning_rate": 9.815054944485203e-07, + "loss": 0.85, + "step": 8237 + }, + { + "epoch": 0.8621664050235479, + "grad_norm": 2.0255958527875766, + "learning_rate": 9.800414825628657e-07, + "loss": 0.7782, + "step": 8238 + }, + { + "epoch": 0.8622710622710623, + "grad_norm": 2.5228783731175466, + "learning_rate": 9.78578507088731e-07, + "loss": 0.8302, + "step": 8239 + }, + { + "epoch": 0.8623757195185766, + "grad_norm": 2.0907415401597547, + "learning_rate": 9.771165681942118e-07, + "loss": 0.9448, + "step": 8240 + }, + { + "epoch": 0.862480376766091, + "grad_norm": 1.795604413521706, + "learning_rate": 9.756556660472904e-07, + "loss": 0.8402, + "step": 8241 + }, + { + "epoch": 0.8625850340136054, + "grad_norm": 2.224964285791813, + "learning_rate": 9.741958008158292e-07, + "loss": 0.8443, + "step": 8242 + }, + { + "epoch": 0.8626896912611198, + "grad_norm": 2.1230855516357847, + "learning_rate": 9.727369726675673e-07, + "loss": 0.85, + "step": 8243 + }, + { + "epoch": 0.8627943485086342, + "grad_norm": 2.047579087438642, + "learning_rate": 9.712791817701273e-07, + "loss": 0.8912, + "step": 8244 + }, + { + "epoch": 0.8628990057561486, + "grad_norm": 1.8823706152629818, + "learning_rate": 9.698224282910128e-07, + "loss": 0.8377, + "step": 8245 + }, + { + "epoch": 0.863003663003663, + "grad_norm": 2.0560768069448714, + "learning_rate": 9.68366712397606e-07, + "loss": 0.8321, + "step": 8246 + }, + { + "epoch": 0.8631083202511773, + "grad_norm": 2.010540058978142, + "learning_rate": 9.669120342571747e-07, + "loss": 0.8789, + "step": 8247 + }, + { + "epoch": 0.8632129774986917, + "grad_norm": 1.78965265727034, + "learning_rate": 9.654583940368622e-07, + "loss": 0.6758, + "step": 8248 + }, + { + "epoch": 0.8633176347462062, + "grad_norm": 2.0314846628821766, + "learning_rate": 9.640057919036926e-07, + "loss": 0.8291, + "step": 8249 + }, + { + "epoch": 0.8634222919937206, + "grad_norm": 2.0004853915166354, + "learning_rate": 9.62554228024578e-07, + "loss": 0.8815, + "step": 8250 + }, + { + "epoch": 0.863526949241235, + "grad_norm": 2.1679537430544205, + "learning_rate": 9.611037025663017e-07, + "loss": 0.955, + "step": 8251 + }, + { + "epoch": 0.8636316064887494, + "grad_norm": 1.8433670228793322, + "learning_rate": 9.596542156955357e-07, + "loss": 0.8938, + "step": 8252 + }, + { + "epoch": 0.8637362637362638, + "grad_norm": 2.1490342455394162, + "learning_rate": 9.58205767578827e-07, + "loss": 0.8708, + "step": 8253 + }, + { + "epoch": 0.8638409209837782, + "grad_norm": 2.258196552431531, + "learning_rate": 9.567583583826046e-07, + "loss": 0.8351, + "step": 8254 + }, + { + "epoch": 0.8639455782312925, + "grad_norm": 2.076287178014026, + "learning_rate": 9.553119882731777e-07, + "loss": 0.949, + "step": 8255 + }, + { + "epoch": 0.8640502354788069, + "grad_norm": 2.247501553759808, + "learning_rate": 9.53866657416741e-07, + "loss": 0.9345, + "step": 8256 + }, + { + "epoch": 0.8641548927263213, + "grad_norm": 2.281215175212065, + "learning_rate": 9.524223659793641e-07, + "loss": 0.8919, + "step": 8257 + }, + { + "epoch": 0.8642595499738357, + "grad_norm": 1.9878173669322587, + "learning_rate": 9.509791141269964e-07, + "loss": 0.9017, + "step": 8258 + }, + { + "epoch": 0.8643642072213501, + "grad_norm": 1.9534688327422565, + "learning_rate": 9.495369020254753e-07, + "loss": 0.8423, + "step": 8259 + }, + { + "epoch": 0.8644688644688645, + "grad_norm": 1.9918692395385151, + "learning_rate": 9.48095729840508e-07, + "loss": 0.9165, + "step": 8260 + }, + { + "epoch": 0.8645735217163789, + "grad_norm": 1.8489436949868145, + "learning_rate": 9.466555977376946e-07, + "loss": 0.8444, + "step": 8261 + }, + { + "epoch": 0.8646781789638932, + "grad_norm": 2.1193645421178093, + "learning_rate": 9.452165058825058e-07, + "loss": 0.8266, + "step": 8262 + }, + { + "epoch": 0.8647828362114076, + "grad_norm": 2.0097760670300975, + "learning_rate": 9.437784544402961e-07, + "loss": 0.8798, + "step": 8263 + }, + { + "epoch": 0.864887493458922, + "grad_norm": 2.107855735727707, + "learning_rate": 9.423414435763012e-07, + "loss": 0.8519, + "step": 8264 + }, + { + "epoch": 0.8649921507064364, + "grad_norm": 1.9743559910054354, + "learning_rate": 9.409054734556344e-07, + "loss": 0.8584, + "step": 8265 + }, + { + "epoch": 0.8650968079539508, + "grad_norm": 2.565003258614491, + "learning_rate": 9.39470544243295e-07, + "loss": 0.9632, + "step": 8266 + }, + { + "epoch": 0.8652014652014652, + "grad_norm": 2.0088418728464643, + "learning_rate": 9.380366561041553e-07, + "loss": 0.897, + "step": 8267 + }, + { + "epoch": 0.8653061224489796, + "grad_norm": 2.102961311337027, + "learning_rate": 9.366038092029773e-07, + "loss": 0.9862, + "step": 8268 + }, + { + "epoch": 0.8654107796964939, + "grad_norm": 2.0365497413831424, + "learning_rate": 9.351720037043943e-07, + "loss": 0.7902, + "step": 8269 + }, + { + "epoch": 0.8655154369440083, + "grad_norm": 2.2180607315989436, + "learning_rate": 9.337412397729229e-07, + "loss": 0.8182, + "step": 8270 + }, + { + "epoch": 0.8656200941915227, + "grad_norm": 2.488823116296402, + "learning_rate": 9.323115175729636e-07, + "loss": 0.7917, + "step": 8271 + }, + { + "epoch": 0.8657247514390372, + "grad_norm": 2.2666525932415844, + "learning_rate": 9.308828372687939e-07, + "loss": 0.8258, + "step": 8272 + }, + { + "epoch": 0.8658294086865516, + "grad_norm": 1.7807084928441752, + "learning_rate": 9.294551990245704e-07, + "loss": 0.8042, + "step": 8273 + }, + { + "epoch": 0.865934065934066, + "grad_norm": 1.886320152024955, + "learning_rate": 9.280286030043306e-07, + "loss": 0.9426, + "step": 8274 + }, + { + "epoch": 0.8660387231815804, + "grad_norm": 2.3063773024197705, + "learning_rate": 9.266030493719979e-07, + "loss": 0.9073, + "step": 8275 + }, + { + "epoch": 0.8661433804290947, + "grad_norm": 2.2479580097764535, + "learning_rate": 9.251785382913658e-07, + "loss": 0.9288, + "step": 8276 + }, + { + "epoch": 0.8662480376766091, + "grad_norm": 2.1411449789130566, + "learning_rate": 9.237550699261188e-07, + "loss": 0.8874, + "step": 8277 + }, + { + "epoch": 0.8663526949241235, + "grad_norm": 1.824544022620309, + "learning_rate": 9.223326444398128e-07, + "loss": 0.8135, + "step": 8278 + }, + { + "epoch": 0.8664573521716379, + "grad_norm": 1.6429551373780529, + "learning_rate": 9.20911261995887e-07, + "loss": 0.7566, + "step": 8279 + }, + { + "epoch": 0.8665620094191523, + "grad_norm": 2.176909472080666, + "learning_rate": 9.194909227576642e-07, + "loss": 0.9291, + "step": 8280 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 2.093261943588003, + "learning_rate": 9.180716268883427e-07, + "loss": 0.8321, + "step": 8281 + }, + { + "epoch": 0.8667713239141811, + "grad_norm": 2.1603700977335256, + "learning_rate": 9.16653374551002e-07, + "loss": 0.8882, + "step": 8282 + }, + { + "epoch": 0.8668759811616954, + "grad_norm": 1.8687484939968493, + "learning_rate": 9.152361659086029e-07, + "loss": 0.9273, + "step": 8283 + }, + { + "epoch": 0.8669806384092098, + "grad_norm": 2.03676255437104, + "learning_rate": 9.138200011239828e-07, + "loss": 0.9724, + "step": 8284 + }, + { + "epoch": 0.8670852956567242, + "grad_norm": 2.116389446692822, + "learning_rate": 9.124048803598639e-07, + "loss": 0.9146, + "step": 8285 + }, + { + "epoch": 0.8671899529042386, + "grad_norm": 1.919555592655635, + "learning_rate": 9.109908037788484e-07, + "loss": 0.9482, + "step": 8286 + }, + { + "epoch": 0.867294610151753, + "grad_norm": 2.1601273225442625, + "learning_rate": 9.095777715434162e-07, + "loss": 0.9593, + "step": 8287 + }, + { + "epoch": 0.8673992673992674, + "grad_norm": 1.927506321708376, + "learning_rate": 9.081657838159253e-07, + "loss": 0.8351, + "step": 8288 + }, + { + "epoch": 0.8675039246467818, + "grad_norm": 2.0177113831137357, + "learning_rate": 9.067548407586146e-07, + "loss": 0.889, + "step": 8289 + }, + { + "epoch": 0.8676085818942961, + "grad_norm": 2.1523136474810163, + "learning_rate": 9.053449425336092e-07, + "loss": 0.9081, + "step": 8290 + }, + { + "epoch": 0.8677132391418105, + "grad_norm": 1.9842678328945351, + "learning_rate": 9.03936089302907e-07, + "loss": 0.9047, + "step": 8291 + }, + { + "epoch": 0.8678178963893249, + "grad_norm": 1.946904140627659, + "learning_rate": 9.025282812283875e-07, + "loss": 0.7917, + "step": 8292 + }, + { + "epoch": 0.8679225536368393, + "grad_norm": 1.7343881101313423, + "learning_rate": 9.011215184718081e-07, + "loss": 0.7012, + "step": 8293 + }, + { + "epoch": 0.8680272108843538, + "grad_norm": 1.6121923338112008, + "learning_rate": 8.997158011948126e-07, + "loss": 0.739, + "step": 8294 + }, + { + "epoch": 0.8681318681318682, + "grad_norm": 1.7420719829107674, + "learning_rate": 8.98311129558922e-07, + "loss": 0.6873, + "step": 8295 + }, + { + "epoch": 0.8682365253793826, + "grad_norm": 2.1273765607618667, + "learning_rate": 8.969075037255326e-07, + "loss": 0.9182, + "step": 8296 + }, + { + "epoch": 0.868341182626897, + "grad_norm": 1.9827264389330077, + "learning_rate": 8.955049238559254e-07, + "loss": 0.9137, + "step": 8297 + }, + { + "epoch": 0.8684458398744113, + "grad_norm": 1.9931692934159142, + "learning_rate": 8.941033901112572e-07, + "loss": 0.9219, + "step": 8298 + }, + { + "epoch": 0.8685504971219257, + "grad_norm": 2.0187325482134026, + "learning_rate": 8.927029026525713e-07, + "loss": 0.8923, + "step": 8299 + }, + { + "epoch": 0.8686551543694401, + "grad_norm": 2.121244415757878, + "learning_rate": 8.913034616407846e-07, + "loss": 0.8702, + "step": 8300 + }, + { + "epoch": 0.8687598116169545, + "grad_norm": 2.0917927570195145, + "learning_rate": 8.899050672366949e-07, + "loss": 0.7015, + "step": 8301 + }, + { + "epoch": 0.8688644688644689, + "grad_norm": 2.3545018308026564, + "learning_rate": 8.885077196009783e-07, + "loss": 0.8551, + "step": 8302 + }, + { + "epoch": 0.8689691261119833, + "grad_norm": 2.367894373396556, + "learning_rate": 8.871114188941987e-07, + "loss": 0.8081, + "step": 8303 + }, + { + "epoch": 0.8690737833594977, + "grad_norm": 1.9788955523872949, + "learning_rate": 8.857161652767887e-07, + "loss": 0.83, + "step": 8304 + }, + { + "epoch": 0.869178440607012, + "grad_norm": 2.27059813164129, + "learning_rate": 8.843219589090701e-07, + "loss": 0.8601, + "step": 8305 + }, + { + "epoch": 0.8692830978545264, + "grad_norm": 2.3545604185359497, + "learning_rate": 8.829287999512371e-07, + "loss": 0.8818, + "step": 8306 + }, + { + "epoch": 0.8693877551020408, + "grad_norm": 2.122637516854195, + "learning_rate": 8.81536688563367e-07, + "loss": 0.9421, + "step": 8307 + }, + { + "epoch": 0.8694924123495552, + "grad_norm": 2.041713532250219, + "learning_rate": 8.801456249054152e-07, + "loss": 0.8007, + "step": 8308 + }, + { + "epoch": 0.8695970695970696, + "grad_norm": 2.141284640895024, + "learning_rate": 8.787556091372207e-07, + "loss": 0.7057, + "step": 8309 + }, + { + "epoch": 0.869701726844584, + "grad_norm": 2.1933709636305596, + "learning_rate": 8.773666414184955e-07, + "loss": 0.8779, + "step": 8310 + }, + { + "epoch": 0.8698063840920984, + "grad_norm": 2.2346155866799533, + "learning_rate": 8.759787219088389e-07, + "loss": 0.7949, + "step": 8311 + }, + { + "epoch": 0.8699110413396127, + "grad_norm": 2.3392273884858836, + "learning_rate": 8.745918507677232e-07, + "loss": 0.9422, + "step": 8312 + }, + { + "epoch": 0.8700156985871271, + "grad_norm": 1.8380709262710446, + "learning_rate": 8.732060281545007e-07, + "loss": 0.769, + "step": 8313 + }, + { + "epoch": 0.8701203558346415, + "grad_norm": 1.7448127426806292, + "learning_rate": 8.718212542284099e-07, + "loss": 0.8186, + "step": 8314 + }, + { + "epoch": 0.8702250130821559, + "grad_norm": 2.1138633029838867, + "learning_rate": 8.704375291485623e-07, + "loss": 0.7661, + "step": 8315 + }, + { + "epoch": 0.8703296703296703, + "grad_norm": 2.0932322539738784, + "learning_rate": 8.690548530739496e-07, + "loss": 0.9311, + "step": 8316 + }, + { + "epoch": 0.8704343275771848, + "grad_norm": 1.7979421071378918, + "learning_rate": 8.676732261634424e-07, + "loss": 0.784, + "step": 8317 + }, + { + "epoch": 0.8705389848246992, + "grad_norm": 1.9703330311133151, + "learning_rate": 8.662926485757961e-07, + "loss": 0.8364, + "step": 8318 + }, + { + "epoch": 0.8706436420722135, + "grad_norm": 2.3370476834492413, + "learning_rate": 8.649131204696392e-07, + "loss": 0.9065, + "step": 8319 + }, + { + "epoch": 0.8707482993197279, + "grad_norm": 2.404568265608144, + "learning_rate": 8.635346420034862e-07, + "loss": 0.8264, + "step": 8320 + }, + { + "epoch": 0.8708529565672423, + "grad_norm": 1.8537120114233208, + "learning_rate": 8.621572133357236e-07, + "loss": 0.8729, + "step": 8321 + }, + { + "epoch": 0.8709576138147567, + "grad_norm": 2.2930255513118705, + "learning_rate": 8.607808346246216e-07, + "loss": 0.9316, + "step": 8322 + }, + { + "epoch": 0.8710622710622711, + "grad_norm": 2.029189543208393, + "learning_rate": 8.594055060283268e-07, + "loss": 0.7619, + "step": 8323 + }, + { + "epoch": 0.8711669283097855, + "grad_norm": 2.570973752644621, + "learning_rate": 8.580312277048708e-07, + "loss": 0.9147, + "step": 8324 + }, + { + "epoch": 0.8712715855572999, + "grad_norm": 2.0422859206952424, + "learning_rate": 8.566579998121593e-07, + "loss": 0.9099, + "step": 8325 + }, + { + "epoch": 0.8713762428048142, + "grad_norm": 2.76800341990982, + "learning_rate": 8.552858225079807e-07, + "loss": 0.8457, + "step": 8326 + }, + { + "epoch": 0.8714809000523286, + "grad_norm": 2.1054140471943392, + "learning_rate": 8.539146959499956e-07, + "loss": 0.7575, + "step": 8327 + }, + { + "epoch": 0.871585557299843, + "grad_norm": 2.0253201711623565, + "learning_rate": 8.525446202957543e-07, + "loss": 0.8476, + "step": 8328 + }, + { + "epoch": 0.8716902145473574, + "grad_norm": 1.644414039598465, + "learning_rate": 8.511755957026813e-07, + "loss": 0.6856, + "step": 8329 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 2.278761036199369, + "learning_rate": 8.498076223280794e-07, + "loss": 0.8249, + "step": 8330 + }, + { + "epoch": 0.8718995290423862, + "grad_norm": 2.0636072314286698, + "learning_rate": 8.484407003291306e-07, + "loss": 0.8947, + "step": 8331 + }, + { + "epoch": 0.8720041862899006, + "grad_norm": 2.211711428835568, + "learning_rate": 8.47074829862895e-07, + "loss": 0.9184, + "step": 8332 + }, + { + "epoch": 0.8721088435374149, + "grad_norm": 2.016664180022378, + "learning_rate": 8.457100110863192e-07, + "loss": 0.8512, + "step": 8333 + }, + { + "epoch": 0.8722135007849293, + "grad_norm": 2.0273235658439654, + "learning_rate": 8.443462441562211e-07, + "loss": 0.9258, + "step": 8334 + }, + { + "epoch": 0.8723181580324437, + "grad_norm": 2.1797348714707145, + "learning_rate": 8.429835292292987e-07, + "loss": 0.9337, + "step": 8335 + }, + { + "epoch": 0.8724228152799581, + "grad_norm": 2.128984970826725, + "learning_rate": 8.416218664621312e-07, + "loss": 0.903, + "step": 8336 + }, + { + "epoch": 0.8725274725274725, + "grad_norm": 1.9652208208969064, + "learning_rate": 8.402612560111767e-07, + "loss": 0.908, + "step": 8337 + }, + { + "epoch": 0.8726321297749869, + "grad_norm": 2.0961986353412168, + "learning_rate": 8.389016980327746e-07, + "loss": 0.9465, + "step": 8338 + }, + { + "epoch": 0.8727367870225013, + "grad_norm": 2.5872968306860313, + "learning_rate": 8.37543192683139e-07, + "loss": 0.9034, + "step": 8339 + }, + { + "epoch": 0.8728414442700158, + "grad_norm": 2.443057693441533, + "learning_rate": 8.361857401183649e-07, + "loss": 0.8729, + "step": 8340 + }, + { + "epoch": 0.87294610151753, + "grad_norm": 2.1845773081813635, + "learning_rate": 8.348293404944263e-07, + "loss": 0.8835, + "step": 8341 + }, + { + "epoch": 0.8730507587650445, + "grad_norm": 1.9310878809345682, + "learning_rate": 8.334739939671743e-07, + "loss": 0.8102, + "step": 8342 + }, + { + "epoch": 0.8731554160125589, + "grad_norm": 1.9452518156409584, + "learning_rate": 8.321197006923442e-07, + "loss": 0.8457, + "step": 8343 + }, + { + "epoch": 0.8732600732600733, + "grad_norm": 1.8616667743331106, + "learning_rate": 8.307664608255461e-07, + "loss": 0.9066, + "step": 8344 + }, + { + "epoch": 0.8733647305075877, + "grad_norm": 1.9175403993591587, + "learning_rate": 8.294142745222678e-07, + "loss": 0.8309, + "step": 8345 + }, + { + "epoch": 0.8734693877551021, + "grad_norm": 2.4850953298749996, + "learning_rate": 8.280631419378815e-07, + "loss": 0.9685, + "step": 8346 + }, + { + "epoch": 0.8735740450026165, + "grad_norm": 2.044753493702447, + "learning_rate": 8.267130632276321e-07, + "loss": 0.9937, + "step": 8347 + }, + { + "epoch": 0.8736787022501308, + "grad_norm": 2.3810577763966876, + "learning_rate": 8.253640385466499e-07, + "loss": 1.0281, + "step": 8348 + }, + { + "epoch": 0.8737833594976452, + "grad_norm": 1.963834819571071, + "learning_rate": 8.240160680499388e-07, + "loss": 0.9345, + "step": 8349 + }, + { + "epoch": 0.8738880167451596, + "grad_norm": 1.8751548428811395, + "learning_rate": 8.226691518923835e-07, + "loss": 0.8952, + "step": 8350 + }, + { + "epoch": 0.873992673992674, + "grad_norm": 2.330321107761336, + "learning_rate": 8.213232902287438e-07, + "loss": 0.899, + "step": 8351 + }, + { + "epoch": 0.8740973312401884, + "grad_norm": 2.2633398328522847, + "learning_rate": 8.199784832136682e-07, + "loss": 0.9012, + "step": 8352 + }, + { + "epoch": 0.8742019884877028, + "grad_norm": 1.916777768307915, + "learning_rate": 8.18634731001674e-07, + "loss": 0.8853, + "step": 8353 + }, + { + "epoch": 0.8743066457352172, + "grad_norm": 2.1050944988737412, + "learning_rate": 8.172920337471601e-07, + "loss": 0.9568, + "step": 8354 + }, + { + "epoch": 0.8744113029827315, + "grad_norm": 1.8481452188298315, + "learning_rate": 8.159503916044087e-07, + "loss": 0.9897, + "step": 8355 + }, + { + "epoch": 0.8745159602302459, + "grad_norm": 2.250311788077037, + "learning_rate": 8.146098047275741e-07, + "loss": 0.8498, + "step": 8356 + }, + { + "epoch": 0.8746206174777603, + "grad_norm": 2.3260125318486335, + "learning_rate": 8.132702732706954e-07, + "loss": 0.9126, + "step": 8357 + }, + { + "epoch": 0.8747252747252747, + "grad_norm": 2.2314067384407994, + "learning_rate": 8.11931797387685e-07, + "loss": 0.8325, + "step": 8358 + }, + { + "epoch": 0.8748299319727891, + "grad_norm": 1.9457108277580055, + "learning_rate": 8.105943772323377e-07, + "loss": 0.9133, + "step": 8359 + }, + { + "epoch": 0.8749345892203035, + "grad_norm": 2.0107629993432607, + "learning_rate": 8.09258012958325e-07, + "loss": 0.8153, + "step": 8360 + }, + { + "epoch": 0.8750392464678179, + "grad_norm": 2.2793780471848253, + "learning_rate": 8.079227047191962e-07, + "loss": 0.9221, + "step": 8361 + }, + { + "epoch": 0.8751439037153322, + "grad_norm": 2.123456861211417, + "learning_rate": 8.065884526683832e-07, + "loss": 0.7958, + "step": 8362 + }, + { + "epoch": 0.8752485609628466, + "grad_norm": 1.926502112741626, + "learning_rate": 8.052552569591965e-07, + "loss": 0.7358, + "step": 8363 + }, + { + "epoch": 0.875353218210361, + "grad_norm": 2.0489738249039506, + "learning_rate": 8.039231177448192e-07, + "loss": 0.8492, + "step": 8364 + }, + { + "epoch": 0.8754578754578755, + "grad_norm": 1.8975269132659613, + "learning_rate": 8.025920351783189e-07, + "loss": 0.8582, + "step": 8365 + }, + { + "epoch": 0.8755625327053899, + "grad_norm": 2.256895879823261, + "learning_rate": 8.012620094126367e-07, + "loss": 0.8729, + "step": 8366 + }, + { + "epoch": 0.8756671899529043, + "grad_norm": 2.025845185272999, + "learning_rate": 7.999330406005989e-07, + "loss": 0.8187, + "step": 8367 + }, + { + "epoch": 0.8757718472004187, + "grad_norm": 2.0895848407187922, + "learning_rate": 7.986051288949059e-07, + "loss": 0.9252, + "step": 8368 + }, + { + "epoch": 0.875876504447933, + "grad_norm": 2.139667630918293, + "learning_rate": 7.972782744481367e-07, + "loss": 0.8566, + "step": 8369 + }, + { + "epoch": 0.8759811616954474, + "grad_norm": 2.3232743712934294, + "learning_rate": 7.959524774127481e-07, + "loss": 0.7616, + "step": 8370 + }, + { + "epoch": 0.8760858189429618, + "grad_norm": 2.128576816385248, + "learning_rate": 7.946277379410771e-07, + "loss": 0.7066, + "step": 8371 + }, + { + "epoch": 0.8761904761904762, + "grad_norm": 1.9889905519313846, + "learning_rate": 7.933040561853433e-07, + "loss": 0.8717, + "step": 8372 + }, + { + "epoch": 0.8762951334379906, + "grad_norm": 1.7790368676618327, + "learning_rate": 7.919814322976371e-07, + "loss": 0.7829, + "step": 8373 + }, + { + "epoch": 0.876399790685505, + "grad_norm": 2.191730959271324, + "learning_rate": 7.906598664299303e-07, + "loss": 0.889, + "step": 8374 + }, + { + "epoch": 0.8765044479330194, + "grad_norm": 2.018457235121592, + "learning_rate": 7.893393587340725e-07, + "loss": 0.9284, + "step": 8375 + }, + { + "epoch": 0.8766091051805337, + "grad_norm": 1.844476762441355, + "learning_rate": 7.880199093617968e-07, + "loss": 0.771, + "step": 8376 + }, + { + "epoch": 0.8767137624280481, + "grad_norm": 1.9252327144567856, + "learning_rate": 7.867015184647075e-07, + "loss": 0.8846, + "step": 8377 + }, + { + "epoch": 0.8768184196755625, + "grad_norm": 2.428489369199683, + "learning_rate": 7.853841861942912e-07, + "loss": 0.8245, + "step": 8378 + }, + { + "epoch": 0.8769230769230769, + "grad_norm": 2.1246442895723927, + "learning_rate": 7.840679127019124e-07, + "loss": 0.9374, + "step": 8379 + }, + { + "epoch": 0.8770277341705913, + "grad_norm": 2.0282518178011637, + "learning_rate": 7.827526981388112e-07, + "loss": 0.7992, + "step": 8380 + }, + { + "epoch": 0.8771323914181057, + "grad_norm": 2.2457001610010634, + "learning_rate": 7.814385426561099e-07, + "loss": 0.8949, + "step": 8381 + }, + { + "epoch": 0.8772370486656201, + "grad_norm": 2.358939783286938, + "learning_rate": 7.801254464048092e-07, + "loss": 0.8137, + "step": 8382 + }, + { + "epoch": 0.8773417059131345, + "grad_norm": 1.9086185068244006, + "learning_rate": 7.788134095357869e-07, + "loss": 0.9158, + "step": 8383 + }, + { + "epoch": 0.8774463631606488, + "grad_norm": 1.400948820647384, + "learning_rate": 7.775024321997959e-07, + "loss": 0.6796, + "step": 8384 + }, + { + "epoch": 0.8775510204081632, + "grad_norm": 2.1402975917994134, + "learning_rate": 7.761925145474702e-07, + "loss": 0.9254, + "step": 8385 + }, + { + "epoch": 0.8776556776556776, + "grad_norm": 2.28391791015574, + "learning_rate": 7.748836567293238e-07, + "loss": 0.8438, + "step": 8386 + }, + { + "epoch": 0.877760334903192, + "grad_norm": 2.0399330609262143, + "learning_rate": 7.735758588957476e-07, + "loss": 0.8036, + "step": 8387 + }, + { + "epoch": 0.8778649921507065, + "grad_norm": 1.6399077111346803, + "learning_rate": 7.722691211970079e-07, + "loss": 0.7449, + "step": 8388 + }, + { + "epoch": 0.8779696493982209, + "grad_norm": 2.060963302205363, + "learning_rate": 7.709634437832537e-07, + "loss": 0.8799, + "step": 8389 + }, + { + "epoch": 0.8780743066457353, + "grad_norm": 2.0689600387801446, + "learning_rate": 7.696588268045069e-07, + "loss": 0.892, + "step": 8390 + }, + { + "epoch": 0.8781789638932496, + "grad_norm": 1.8775333765697284, + "learning_rate": 7.683552704106756e-07, + "loss": 0.7523, + "step": 8391 + }, + { + "epoch": 0.878283621140764, + "grad_norm": 2.347297278777117, + "learning_rate": 7.670527747515366e-07, + "loss": 0.8529, + "step": 8392 + }, + { + "epoch": 0.8783882783882784, + "grad_norm": 1.8280093832995228, + "learning_rate": 7.657513399767525e-07, + "loss": 0.8632, + "step": 8393 + }, + { + "epoch": 0.8784929356357928, + "grad_norm": 2.4132747406203694, + "learning_rate": 7.644509662358568e-07, + "loss": 0.7902, + "step": 8394 + }, + { + "epoch": 0.8785975928833072, + "grad_norm": 1.7734274400061258, + "learning_rate": 7.631516536782691e-07, + "loss": 0.8234, + "step": 8395 + }, + { + "epoch": 0.8787022501308216, + "grad_norm": 1.9302295122926418, + "learning_rate": 7.61853402453282e-07, + "loss": 0.9466, + "step": 8396 + }, + { + "epoch": 0.878806907378336, + "grad_norm": 2.017176887079925, + "learning_rate": 7.605562127100641e-07, + "loss": 0.9422, + "step": 8397 + }, + { + "epoch": 0.8789115646258503, + "grad_norm": 2.204693966210214, + "learning_rate": 7.592600845976694e-07, + "loss": 0.9219, + "step": 8398 + }, + { + "epoch": 0.8790162218733647, + "grad_norm": 1.9260542762683226, + "learning_rate": 7.579650182650245e-07, + "loss": 0.8415, + "step": 8399 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 2.3183132581995025, + "learning_rate": 7.566710138609323e-07, + "loss": 0.8479, + "step": 8400 + }, + { + "epoch": 0.8792255363683935, + "grad_norm": 1.9805826657109464, + "learning_rate": 7.55378071534082e-07, + "loss": 0.8046, + "step": 8401 + }, + { + "epoch": 0.8793301936159079, + "grad_norm": 2.177056885011439, + "learning_rate": 7.540861914330299e-07, + "loss": 0.9871, + "step": 8402 + }, + { + "epoch": 0.8794348508634223, + "grad_norm": 2.287633781357303, + "learning_rate": 7.527953737062188e-07, + "loss": 0.8662, + "step": 8403 + }, + { + "epoch": 0.8795395081109367, + "grad_norm": 2.043158350778613, + "learning_rate": 7.515056185019642e-07, + "loss": 0.8425, + "step": 8404 + }, + { + "epoch": 0.879644165358451, + "grad_norm": 2.4480997698564857, + "learning_rate": 7.502169259684633e-07, + "loss": 0.9648, + "step": 8405 + }, + { + "epoch": 0.8797488226059654, + "grad_norm": 2.4174555351369555, + "learning_rate": 7.489292962537887e-07, + "loss": 0.8642, + "step": 8406 + }, + { + "epoch": 0.8798534798534798, + "grad_norm": 2.360136663855378, + "learning_rate": 7.476427295058918e-07, + "loss": 0.822, + "step": 8407 + }, + { + "epoch": 0.8799581371009942, + "grad_norm": 2.1342092374115054, + "learning_rate": 7.463572258726027e-07, + "loss": 0.9561, + "step": 8408 + }, + { + "epoch": 0.8800627943485086, + "grad_norm": 2.2233381286212315, + "learning_rate": 7.45072785501626e-07, + "loss": 0.9782, + "step": 8409 + }, + { + "epoch": 0.880167451596023, + "grad_norm": 2.0969014014371865, + "learning_rate": 7.437894085405484e-07, + "loss": 0.9458, + "step": 8410 + }, + { + "epoch": 0.8802721088435375, + "grad_norm": 2.1757758424267464, + "learning_rate": 7.42507095136833e-07, + "loss": 0.8587, + "step": 8411 + }, + { + "epoch": 0.8803767660910518, + "grad_norm": 2.306166415311752, + "learning_rate": 7.412258454378196e-07, + "loss": 0.8882, + "step": 8412 + }, + { + "epoch": 0.8804814233385662, + "grad_norm": 2.269362231176697, + "learning_rate": 7.39945659590724e-07, + "loss": 0.8249, + "step": 8413 + }, + { + "epoch": 0.8805860805860806, + "grad_norm": 1.938643465951603, + "learning_rate": 7.386665377426438e-07, + "loss": 0.8922, + "step": 8414 + }, + { + "epoch": 0.880690737833595, + "grad_norm": 2.005191893832761, + "learning_rate": 7.37388480040555e-07, + "loss": 0.8959, + "step": 8415 + }, + { + "epoch": 0.8807953950811094, + "grad_norm": 2.1034084182074713, + "learning_rate": 7.361114866313069e-07, + "loss": 0.8488, + "step": 8416 + }, + { + "epoch": 0.8809000523286238, + "grad_norm": 2.0833541849870536, + "learning_rate": 7.348355576616295e-07, + "loss": 0.8547, + "step": 8417 + }, + { + "epoch": 0.8810047095761382, + "grad_norm": 1.9436409663601768, + "learning_rate": 7.33560693278128e-07, + "loss": 0.9296, + "step": 8418 + }, + { + "epoch": 0.8811093668236525, + "grad_norm": 2.6142542944966416, + "learning_rate": 7.322868936272876e-07, + "loss": 0.9786, + "step": 8419 + }, + { + "epoch": 0.8812140240711669, + "grad_norm": 1.975543273238409, + "learning_rate": 7.310141588554709e-07, + "loss": 0.8701, + "step": 8420 + }, + { + "epoch": 0.8813186813186813, + "grad_norm": 2.175997926219932, + "learning_rate": 7.297424891089189e-07, + "loss": 0.833, + "step": 8421 + }, + { + "epoch": 0.8814233385661957, + "grad_norm": 1.779910295096858, + "learning_rate": 7.284718845337469e-07, + "loss": 0.8065, + "step": 8422 + }, + { + "epoch": 0.8815279958137101, + "grad_norm": 2.323280192485811, + "learning_rate": 7.272023452759491e-07, + "loss": 0.9152, + "step": 8423 + }, + { + "epoch": 0.8816326530612245, + "grad_norm": 1.9712072593281076, + "learning_rate": 7.259338714813991e-07, + "loss": 0.9558, + "step": 8424 + }, + { + "epoch": 0.8817373103087389, + "grad_norm": 2.0403158599975058, + "learning_rate": 7.2466646329585e-07, + "loss": 0.9015, + "step": 8425 + }, + { + "epoch": 0.8818419675562533, + "grad_norm": 2.1242870940274057, + "learning_rate": 7.234001208649277e-07, + "loss": 0.9321, + "step": 8426 + }, + { + "epoch": 0.8819466248037676, + "grad_norm": 2.268858095340938, + "learning_rate": 7.221348443341369e-07, + "loss": 0.9714, + "step": 8427 + }, + { + "epoch": 0.882051282051282, + "grad_norm": 1.995883555744217, + "learning_rate": 7.208706338488591e-07, + "loss": 0.9395, + "step": 8428 + }, + { + "epoch": 0.8821559392987964, + "grad_norm": 2.1098847144251054, + "learning_rate": 7.196074895543581e-07, + "loss": 0.8725, + "step": 8429 + }, + { + "epoch": 0.8822605965463108, + "grad_norm": 2.409353406645249, + "learning_rate": 7.183454115957688e-07, + "loss": 1.0137, + "step": 8430 + }, + { + "epoch": 0.8823652537938252, + "grad_norm": 2.1180455101201408, + "learning_rate": 7.170844001181087e-07, + "loss": 0.9623, + "step": 8431 + }, + { + "epoch": 0.8824699110413396, + "grad_norm": 2.151280329173097, + "learning_rate": 7.158244552662674e-07, + "loss": 0.9246, + "step": 8432 + }, + { + "epoch": 0.8825745682888541, + "grad_norm": 2.0323499167485104, + "learning_rate": 7.145655771850179e-07, + "loss": 0.9656, + "step": 8433 + }, + { + "epoch": 0.8826792255363684, + "grad_norm": 2.282886441190445, + "learning_rate": 7.13307766019008e-07, + "loss": 0.9472, + "step": 8434 + }, + { + "epoch": 0.8827838827838828, + "grad_norm": 2.06209446070617, + "learning_rate": 7.120510219127619e-07, + "loss": 0.9185, + "step": 8435 + }, + { + "epoch": 0.8828885400313972, + "grad_norm": 2.5731722325288855, + "learning_rate": 7.107953450106819e-07, + "loss": 0.8998, + "step": 8436 + }, + { + "epoch": 0.8829931972789116, + "grad_norm": 1.8408399665438568, + "learning_rate": 7.095407354570483e-07, + "loss": 0.8116, + "step": 8437 + }, + { + "epoch": 0.883097854526426, + "grad_norm": 2.0268812460354453, + "learning_rate": 7.082871933960156e-07, + "loss": 0.8808, + "step": 8438 + }, + { + "epoch": 0.8832025117739404, + "grad_norm": 1.6490758747411414, + "learning_rate": 7.070347189716231e-07, + "loss": 0.7978, + "step": 8439 + }, + { + "epoch": 0.8833071690214548, + "grad_norm": 1.8039746990089798, + "learning_rate": 7.057833123277779e-07, + "loss": 0.8668, + "step": 8440 + }, + { + "epoch": 0.8834118262689691, + "grad_norm": 2.0025313820891246, + "learning_rate": 7.04532973608274e-07, + "loss": 0.8699, + "step": 8441 + }, + { + "epoch": 0.8835164835164835, + "grad_norm": 2.013068044107957, + "learning_rate": 7.032837029567741e-07, + "loss": 0.8394, + "step": 8442 + }, + { + "epoch": 0.8836211407639979, + "grad_norm": 1.9791349791116444, + "learning_rate": 7.020355005168223e-07, + "loss": 0.8135, + "step": 8443 + }, + { + "epoch": 0.8837257980115123, + "grad_norm": 1.9761998674343688, + "learning_rate": 7.007883664318416e-07, + "loss": 0.9571, + "step": 8444 + }, + { + "epoch": 0.8838304552590267, + "grad_norm": 2.217869103536607, + "learning_rate": 6.995423008451296e-07, + "loss": 0.8986, + "step": 8445 + }, + { + "epoch": 0.8839351125065411, + "grad_norm": 1.7193627330671324, + "learning_rate": 6.982973038998608e-07, + "loss": 0.7972, + "step": 8446 + }, + { + "epoch": 0.8840397697540555, + "grad_norm": 1.9318398791675795, + "learning_rate": 6.970533757390874e-07, + "loss": 0.9375, + "step": 8447 + }, + { + "epoch": 0.8841444270015698, + "grad_norm": 2.040044194610826, + "learning_rate": 6.958105165057405e-07, + "loss": 0.9477, + "step": 8448 + }, + { + "epoch": 0.8842490842490842, + "grad_norm": 2.3099661723992795, + "learning_rate": 6.945687263426259e-07, + "loss": 0.996, + "step": 8449 + }, + { + "epoch": 0.8843537414965986, + "grad_norm": 2.3098910608338126, + "learning_rate": 6.933280053924307e-07, + "loss": 0.8932, + "step": 8450 + }, + { + "epoch": 0.884458398744113, + "grad_norm": 2.0083350641115847, + "learning_rate": 6.920883537977141e-07, + "loss": 0.8619, + "step": 8451 + }, + { + "epoch": 0.8845630559916274, + "grad_norm": 2.17092705026124, + "learning_rate": 6.908497717009132e-07, + "loss": 0.8771, + "step": 8452 + }, + { + "epoch": 0.8846677132391418, + "grad_norm": 1.9588841874329892, + "learning_rate": 6.896122592443466e-07, + "loss": 0.843, + "step": 8453 + }, + { + "epoch": 0.8847723704866562, + "grad_norm": 2.06887109912915, + "learning_rate": 6.883758165702048e-07, + "loss": 0.9597, + "step": 8454 + }, + { + "epoch": 0.8848770277341705, + "grad_norm": 2.0239969794410007, + "learning_rate": 6.871404438205598e-07, + "loss": 0.8352, + "step": 8455 + }, + { + "epoch": 0.884981684981685, + "grad_norm": 2.2891549492476795, + "learning_rate": 6.859061411373557e-07, + "loss": 0.8596, + "step": 8456 + }, + { + "epoch": 0.8850863422291994, + "grad_norm": 2.134710339429134, + "learning_rate": 6.846729086624159e-07, + "loss": 0.7921, + "step": 8457 + }, + { + "epoch": 0.8851909994767138, + "grad_norm": 2.021668403658534, + "learning_rate": 6.834407465374438e-07, + "loss": 0.9, + "step": 8458 + }, + { + "epoch": 0.8852956567242282, + "grad_norm": 1.6910079970551906, + "learning_rate": 6.82209654904018e-07, + "loss": 0.7481, + "step": 8459 + }, + { + "epoch": 0.8854003139717426, + "grad_norm": 2.2018914810401933, + "learning_rate": 6.809796339035923e-07, + "loss": 0.915, + "step": 8460 + }, + { + "epoch": 0.885504971219257, + "grad_norm": 1.6986887147965564, + "learning_rate": 6.797506836774981e-07, + "loss": 0.7935, + "step": 8461 + }, + { + "epoch": 0.8856096284667713, + "grad_norm": 1.9522509649243391, + "learning_rate": 6.785228043669423e-07, + "loss": 0.8457, + "step": 8462 + }, + { + "epoch": 0.8857142857142857, + "grad_norm": 2.5679630478892643, + "learning_rate": 6.772959961130154e-07, + "loss": 0.9314, + "step": 8463 + }, + { + "epoch": 0.8858189429618001, + "grad_norm": 1.8880060673610959, + "learning_rate": 6.76070259056677e-07, + "loss": 0.8698, + "step": 8464 + }, + { + "epoch": 0.8859236002093145, + "grad_norm": 2.0499745907035742, + "learning_rate": 6.748455933387666e-07, + "loss": 0.8072, + "step": 8465 + }, + { + "epoch": 0.8860282574568289, + "grad_norm": 2.0849975990310283, + "learning_rate": 6.736219991000015e-07, + "loss": 0.8479, + "step": 8466 + }, + { + "epoch": 0.8861329147043433, + "grad_norm": 2.285861980076164, + "learning_rate": 6.72399476480975e-07, + "loss": 0.8301, + "step": 8467 + }, + { + "epoch": 0.8862375719518577, + "grad_norm": 2.0055314886862146, + "learning_rate": 6.711780256221578e-07, + "loss": 0.9137, + "step": 8468 + }, + { + "epoch": 0.8863422291993721, + "grad_norm": 2.3060763965788507, + "learning_rate": 6.699576466638991e-07, + "loss": 0.9079, + "step": 8469 + }, + { + "epoch": 0.8864468864468864, + "grad_norm": 2.007322337516734, + "learning_rate": 6.687383397464187e-07, + "loss": 0.8655, + "step": 8470 + }, + { + "epoch": 0.8865515436944008, + "grad_norm": 2.0833706920548356, + "learning_rate": 6.67520105009819e-07, + "loss": 0.783, + "step": 8471 + }, + { + "epoch": 0.8866562009419152, + "grad_norm": 2.0582965508334468, + "learning_rate": 6.663029425940804e-07, + "loss": 0.9511, + "step": 8472 + }, + { + "epoch": 0.8867608581894296, + "grad_norm": 2.2808925451077933, + "learning_rate": 6.650868526390541e-07, + "loss": 0.8643, + "step": 8473 + }, + { + "epoch": 0.886865515436944, + "grad_norm": 1.946899059032306, + "learning_rate": 6.63871835284472e-07, + "loss": 0.9066, + "step": 8474 + }, + { + "epoch": 0.8869701726844584, + "grad_norm": 2.068214617975085, + "learning_rate": 6.626578906699421e-07, + "loss": 0.8209, + "step": 8475 + }, + { + "epoch": 0.8870748299319728, + "grad_norm": 2.093189904582817, + "learning_rate": 6.614450189349497e-07, + "loss": 0.9761, + "step": 8476 + }, + { + "epoch": 0.8871794871794871, + "grad_norm": 2.1709216272284424, + "learning_rate": 6.602332202188544e-07, + "loss": 0.9507, + "step": 8477 + }, + { + "epoch": 0.8872841444270015, + "grad_norm": 1.9156403618097797, + "learning_rate": 6.590224946608981e-07, + "loss": 0.7868, + "step": 8478 + }, + { + "epoch": 0.887388801674516, + "grad_norm": 1.8763348533122814, + "learning_rate": 6.578128424001929e-07, + "loss": 0.7929, + "step": 8479 + }, + { + "epoch": 0.8874934589220304, + "grad_norm": 2.1596639833953897, + "learning_rate": 6.566042635757297e-07, + "loss": 0.9342, + "step": 8480 + }, + { + "epoch": 0.8875981161695448, + "grad_norm": 2.25994957436867, + "learning_rate": 6.553967583263776e-07, + "loss": 0.8573, + "step": 8481 + }, + { + "epoch": 0.8877027734170592, + "grad_norm": 2.171303606502574, + "learning_rate": 6.541903267908823e-07, + "loss": 0.8721, + "step": 8482 + }, + { + "epoch": 0.8878074306645736, + "grad_norm": 2.1186511007584454, + "learning_rate": 6.529849691078638e-07, + "loss": 0.8509, + "step": 8483 + }, + { + "epoch": 0.8879120879120879, + "grad_norm": 1.9924052169713273, + "learning_rate": 6.517806854158204e-07, + "loss": 0.8332, + "step": 8484 + }, + { + "epoch": 0.8880167451596023, + "grad_norm": 1.8798641457276724, + "learning_rate": 6.50577475853128e-07, + "loss": 0.9086, + "step": 8485 + }, + { + "epoch": 0.8881214024071167, + "grad_norm": 2.030261354667674, + "learning_rate": 6.493753405580349e-07, + "loss": 0.8013, + "step": 8486 + }, + { + "epoch": 0.8882260596546311, + "grad_norm": 1.9482170795471423, + "learning_rate": 6.48174279668673e-07, + "loss": 0.8105, + "step": 8487 + }, + { + "epoch": 0.8883307169021455, + "grad_norm": 2.1213839881968064, + "learning_rate": 6.46974293323045e-07, + "loss": 0.8659, + "step": 8488 + }, + { + "epoch": 0.8884353741496599, + "grad_norm": 1.7682685942041678, + "learning_rate": 6.457753816590307e-07, + "loss": 0.8687, + "step": 8489 + }, + { + "epoch": 0.8885400313971743, + "grad_norm": 2.0809007657182397, + "learning_rate": 6.445775448143854e-07, + "loss": 0.8524, + "step": 8490 + }, + { + "epoch": 0.8886446886446886, + "grad_norm": 2.2455986826973278, + "learning_rate": 6.433807829267491e-07, + "loss": 0.9529, + "step": 8491 + }, + { + "epoch": 0.888749345892203, + "grad_norm": 1.9497228881367459, + "learning_rate": 6.421850961336262e-07, + "loss": 0.9028, + "step": 8492 + }, + { + "epoch": 0.8888540031397174, + "grad_norm": 1.7807386421204503, + "learning_rate": 6.409904845724068e-07, + "loss": 0.8272, + "step": 8493 + }, + { + "epoch": 0.8889586603872318, + "grad_norm": 1.9539951561107944, + "learning_rate": 6.397969483803546e-07, + "loss": 0.8359, + "step": 8494 + }, + { + "epoch": 0.8890633176347462, + "grad_norm": 1.8823048125200246, + "learning_rate": 6.386044876946073e-07, + "loss": 0.9503, + "step": 8495 + }, + { + "epoch": 0.8891679748822606, + "grad_norm": 2.013354508723251, + "learning_rate": 6.374131026521813e-07, + "loss": 0.8966, + "step": 8496 + }, + { + "epoch": 0.889272632129775, + "grad_norm": 2.187845858922677, + "learning_rate": 6.362227933899701e-07, + "loss": 0.8339, + "step": 8497 + }, + { + "epoch": 0.8893772893772893, + "grad_norm": 2.1954224044761563, + "learning_rate": 6.350335600447433e-07, + "loss": 0.9413, + "step": 8498 + }, + { + "epoch": 0.8894819466248037, + "grad_norm": 2.1320897727273165, + "learning_rate": 6.338454027531449e-07, + "loss": 0.952, + "step": 8499 + }, + { + "epoch": 0.8895866038723181, + "grad_norm": 2.336668103416007, + "learning_rate": 6.326583216516957e-07, + "loss": 0.8382, + "step": 8500 + }, + { + "epoch": 0.8896912611198325, + "grad_norm": 1.8795899973747725, + "learning_rate": 6.314723168767945e-07, + "loss": 0.8066, + "step": 8501 + }, + { + "epoch": 0.889795918367347, + "grad_norm": 1.9308760249674062, + "learning_rate": 6.302873885647176e-07, + "loss": 0.8645, + "step": 8502 + }, + { + "epoch": 0.8899005756148614, + "grad_norm": 1.9444834126204222, + "learning_rate": 6.291035368516141e-07, + "loss": 0.7657, + "step": 8503 + }, + { + "epoch": 0.8900052328623758, + "grad_norm": 2.1736958135133593, + "learning_rate": 6.279207618735117e-07, + "loss": 1.0026, + "step": 8504 + }, + { + "epoch": 0.8901098901098901, + "grad_norm": 2.1650434307122093, + "learning_rate": 6.267390637663107e-07, + "loss": 0.8737, + "step": 8505 + }, + { + "epoch": 0.8902145473574045, + "grad_norm": 2.5479226003532167, + "learning_rate": 6.255584426657946e-07, + "loss": 0.8008, + "step": 8506 + }, + { + "epoch": 0.8903192046049189, + "grad_norm": 1.8553494681317815, + "learning_rate": 6.243788987076172e-07, + "loss": 0.8209, + "step": 8507 + }, + { + "epoch": 0.8904238618524333, + "grad_norm": 1.9483177698300278, + "learning_rate": 6.232004320273111e-07, + "loss": 0.8308, + "step": 8508 + }, + { + "epoch": 0.8905285190999477, + "grad_norm": 1.6634174156291366, + "learning_rate": 6.220230427602814e-07, + "loss": 0.827, + "step": 8509 + }, + { + "epoch": 0.8906331763474621, + "grad_norm": 2.1685853413646323, + "learning_rate": 6.208467310418165e-07, + "loss": 0.9, + "step": 8510 + }, + { + "epoch": 0.8907378335949765, + "grad_norm": 1.9637945475462335, + "learning_rate": 6.196714970070772e-07, + "loss": 0.8718, + "step": 8511 + }, + { + "epoch": 0.8908424908424909, + "grad_norm": 1.96490255215975, + "learning_rate": 6.184973407910977e-07, + "loss": 0.8939, + "step": 8512 + }, + { + "epoch": 0.8909471480900052, + "grad_norm": 2.101746073662847, + "learning_rate": 6.173242625287934e-07, + "loss": 0.8335, + "step": 8513 + }, + { + "epoch": 0.8910518053375196, + "grad_norm": 2.106827762594044, + "learning_rate": 6.161522623549509e-07, + "loss": 0.8706, + "step": 8514 + }, + { + "epoch": 0.891156462585034, + "grad_norm": 2.1039813494995157, + "learning_rate": 6.14981340404236e-07, + "loss": 0.8625, + "step": 8515 + }, + { + "epoch": 0.8912611198325484, + "grad_norm": 2.1108841193616983, + "learning_rate": 6.138114968111918e-07, + "loss": 0.9328, + "step": 8516 + }, + { + "epoch": 0.8913657770800628, + "grad_norm": 2.3087652071285567, + "learning_rate": 6.126427317102335e-07, + "loss": 0.9118, + "step": 8517 + }, + { + "epoch": 0.8914704343275772, + "grad_norm": 2.2400146974772515, + "learning_rate": 6.114750452356555e-07, + "loss": 0.9425, + "step": 8518 + }, + { + "epoch": 0.8915750915750916, + "grad_norm": 2.311746241511402, + "learning_rate": 6.103084375216273e-07, + "loss": 0.9304, + "step": 8519 + }, + { + "epoch": 0.8916797488226059, + "grad_norm": 2.1638223427184586, + "learning_rate": 6.091429087021938e-07, + "loss": 0.9901, + "step": 8520 + }, + { + "epoch": 0.8917844060701203, + "grad_norm": 1.9528700522695477, + "learning_rate": 6.079784589112792e-07, + "loss": 0.7986, + "step": 8521 + }, + { + "epoch": 0.8918890633176347, + "grad_norm": 1.6876998724103345, + "learning_rate": 6.068150882826785e-07, + "loss": 0.7391, + "step": 8522 + }, + { + "epoch": 0.8919937205651491, + "grad_norm": 1.8784697035517413, + "learning_rate": 6.056527969500659e-07, + "loss": 0.7807, + "step": 8523 + }, + { + "epoch": 0.8920983778126635, + "grad_norm": 1.7297614397650942, + "learning_rate": 6.044915850469912e-07, + "loss": 0.7549, + "step": 8524 + }, + { + "epoch": 0.892203035060178, + "grad_norm": 2.148635473847736, + "learning_rate": 6.033314527068801e-07, + "loss": 0.901, + "step": 8525 + }, + { + "epoch": 0.8923076923076924, + "grad_norm": 2.409030290338374, + "learning_rate": 6.021724000630347e-07, + "loss": 0.8012, + "step": 8526 + }, + { + "epoch": 0.8924123495552067, + "grad_norm": 2.035224893111058, + "learning_rate": 6.010144272486307e-07, + "loss": 0.8819, + "step": 8527 + }, + { + "epoch": 0.8925170068027211, + "grad_norm": 1.9861692794496122, + "learning_rate": 5.998575343967238e-07, + "loss": 0.8553, + "step": 8528 + }, + { + "epoch": 0.8926216640502355, + "grad_norm": 2.139461496658353, + "learning_rate": 5.987017216402402e-07, + "loss": 0.8632, + "step": 8529 + }, + { + "epoch": 0.8927263212977499, + "grad_norm": 2.3325515767132985, + "learning_rate": 5.9754698911199e-07, + "loss": 0.8565, + "step": 8530 + }, + { + "epoch": 0.8928309785452643, + "grad_norm": 2.1213813372529686, + "learning_rate": 5.963933369446506e-07, + "loss": 0.9007, + "step": 8531 + }, + { + "epoch": 0.8929356357927787, + "grad_norm": 1.9716547476429467, + "learning_rate": 5.952407652707803e-07, + "loss": 0.801, + "step": 8532 + }, + { + "epoch": 0.8930402930402931, + "grad_norm": 2.0956419653603167, + "learning_rate": 5.940892742228111e-07, + "loss": 0.9066, + "step": 8533 + }, + { + "epoch": 0.8931449502878074, + "grad_norm": 2.1015997554130053, + "learning_rate": 5.929388639330514e-07, + "loss": 0.8079, + "step": 8534 + }, + { + "epoch": 0.8932496075353218, + "grad_norm": 2.338930939945522, + "learning_rate": 5.917895345336866e-07, + "loss": 0.9304, + "step": 8535 + }, + { + "epoch": 0.8933542647828362, + "grad_norm": 2.370725798752193, + "learning_rate": 5.906412861567768e-07, + "loss": 0.9179, + "step": 8536 + }, + { + "epoch": 0.8934589220303506, + "grad_norm": 2.212434822841478, + "learning_rate": 5.894941189342585e-07, + "loss": 0.9368, + "step": 8537 + }, + { + "epoch": 0.893563579277865, + "grad_norm": 1.7221383035811781, + "learning_rate": 5.88348032997943e-07, + "loss": 0.818, + "step": 8538 + }, + { + "epoch": 0.8936682365253794, + "grad_norm": 2.160032789570768, + "learning_rate": 5.872030284795161e-07, + "loss": 0.8613, + "step": 8539 + }, + { + "epoch": 0.8937728937728938, + "grad_norm": 2.0833805352848525, + "learning_rate": 5.860591055105446e-07, + "loss": 0.8991, + "step": 8540 + }, + { + "epoch": 0.8938775510204081, + "grad_norm": 2.386495900806836, + "learning_rate": 5.849162642224659e-07, + "loss": 0.973, + "step": 8541 + }, + { + "epoch": 0.8939822082679225, + "grad_norm": 2.127775483833274, + "learning_rate": 5.837745047465959e-07, + "loss": 0.8681, + "step": 8542 + }, + { + "epoch": 0.8940868655154369, + "grad_norm": 2.1017239869056916, + "learning_rate": 5.826338272141219e-07, + "loss": 0.8879, + "step": 8543 + }, + { + "epoch": 0.8941915227629513, + "grad_norm": 2.470116877242277, + "learning_rate": 5.814942317561134e-07, + "loss": 0.6756, + "step": 8544 + }, + { + "epoch": 0.8942961800104657, + "grad_norm": 2.4032567630871706, + "learning_rate": 5.803557185035092e-07, + "loss": 0.9041, + "step": 8545 + }, + { + "epoch": 0.8944008372579801, + "grad_norm": 1.847332668697746, + "learning_rate": 5.792182875871299e-07, + "loss": 0.8516, + "step": 8546 + }, + { + "epoch": 0.8945054945054945, + "grad_norm": 2.2507894430237214, + "learning_rate": 5.78081939137668e-07, + "loss": 0.862, + "step": 8547 + }, + { + "epoch": 0.8946101517530088, + "grad_norm": 2.233081933735458, + "learning_rate": 5.769466732856899e-07, + "loss": 0.955, + "step": 8548 + }, + { + "epoch": 0.8947148090005232, + "grad_norm": 2.158698856453009, + "learning_rate": 5.758124901616424e-07, + "loss": 0.8542, + "step": 8549 + }, + { + "epoch": 0.8948194662480377, + "grad_norm": 2.3430196102926213, + "learning_rate": 5.746793898958458e-07, + "loss": 0.9469, + "step": 8550 + }, + { + "epoch": 0.8949241234955521, + "grad_norm": 2.313609302497972, + "learning_rate": 5.735473726184937e-07, + "loss": 0.859, + "step": 8551 + }, + { + "epoch": 0.8950287807430665, + "grad_norm": 2.2653809195446235, + "learning_rate": 5.724164384596576e-07, + "loss": 0.8801, + "step": 8552 + }, + { + "epoch": 0.8951334379905809, + "grad_norm": 1.7697838567450326, + "learning_rate": 5.712865875492835e-07, + "loss": 0.7973, + "step": 8553 + }, + { + "epoch": 0.8952380952380953, + "grad_norm": 2.25827405042216, + "learning_rate": 5.701578200171942e-07, + "loss": 0.845, + "step": 8554 + }, + { + "epoch": 0.8953427524856097, + "grad_norm": 2.3169595444377893, + "learning_rate": 5.690301359930883e-07, + "loss": 0.8851, + "step": 8555 + }, + { + "epoch": 0.895447409733124, + "grad_norm": 2.1775535449606003, + "learning_rate": 5.679035356065399e-07, + "loss": 0.7539, + "step": 8556 + }, + { + "epoch": 0.8955520669806384, + "grad_norm": 1.827925306636399, + "learning_rate": 5.667780189869954e-07, + "loss": 0.8239, + "step": 8557 + }, + { + "epoch": 0.8956567242281528, + "grad_norm": 2.042695270687885, + "learning_rate": 5.656535862637779e-07, + "loss": 0.8298, + "step": 8558 + }, + { + "epoch": 0.8957613814756672, + "grad_norm": 1.8663790088557182, + "learning_rate": 5.64530237566091e-07, + "loss": 0.8654, + "step": 8559 + }, + { + "epoch": 0.8958660387231816, + "grad_norm": 2.3551080593344285, + "learning_rate": 5.634079730230069e-07, + "loss": 0.8198, + "step": 8560 + }, + { + "epoch": 0.895970695970696, + "grad_norm": 2.085971922826618, + "learning_rate": 5.622867927634768e-07, + "loss": 0.9079, + "step": 8561 + }, + { + "epoch": 0.8960753532182104, + "grad_norm": 2.1241341765259976, + "learning_rate": 5.611666969163243e-07, + "loss": 0.8598, + "step": 8562 + }, + { + "epoch": 0.8961800104657247, + "grad_norm": 2.0606089660513596, + "learning_rate": 5.600476856102543e-07, + "loss": 0.9451, + "step": 8563 + }, + { + "epoch": 0.8962846677132391, + "grad_norm": 2.0689539964279797, + "learning_rate": 5.589297589738418e-07, + "loss": 0.9374, + "step": 8564 + }, + { + "epoch": 0.8963893249607535, + "grad_norm": 1.7677701894726472, + "learning_rate": 5.578129171355407e-07, + "loss": 0.6829, + "step": 8565 + }, + { + "epoch": 0.8964939822082679, + "grad_norm": 2.0461327983112727, + "learning_rate": 5.56697160223676e-07, + "loss": 0.8308, + "step": 8566 + }, + { + "epoch": 0.8965986394557823, + "grad_norm": 2.2030981092811928, + "learning_rate": 5.555824883664496e-07, + "loss": 0.8447, + "step": 8567 + }, + { + "epoch": 0.8967032967032967, + "grad_norm": 2.165264235397933, + "learning_rate": 5.544689016919425e-07, + "loss": 0.8813, + "step": 8568 + }, + { + "epoch": 0.8968079539508111, + "grad_norm": 2.040145311512098, + "learning_rate": 5.533564003281067e-07, + "loss": 0.941, + "step": 8569 + }, + { + "epoch": 0.8969126111983254, + "grad_norm": 2.139615944092077, + "learning_rate": 5.52244984402771e-07, + "loss": 0.8565, + "step": 8570 + }, + { + "epoch": 0.8970172684458398, + "grad_norm": 2.000369105386169, + "learning_rate": 5.51134654043638e-07, + "loss": 0.9503, + "step": 8571 + }, + { + "epoch": 0.8971219256933542, + "grad_norm": 2.3047482566850266, + "learning_rate": 5.500254093782886e-07, + "loss": 0.9249, + "step": 8572 + }, + { + "epoch": 0.8972265829408687, + "grad_norm": 2.357051052649448, + "learning_rate": 5.489172505341756e-07, + "loss": 0.8812, + "step": 8573 + }, + { + "epoch": 0.8973312401883831, + "grad_norm": 1.7124549432994773, + "learning_rate": 5.478101776386324e-07, + "loss": 0.8168, + "step": 8574 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 2.081260844225879, + "learning_rate": 5.467041908188608e-07, + "loss": 0.9044, + "step": 8575 + }, + { + "epoch": 0.8975405546834119, + "grad_norm": 2.3845487096865634, + "learning_rate": 5.455992902019413e-07, + "loss": 0.9173, + "step": 8576 + }, + { + "epoch": 0.8976452119309262, + "grad_norm": 2.236894173393846, + "learning_rate": 5.44495475914828e-07, + "loss": 0.902, + "step": 8577 + }, + { + "epoch": 0.8977498691784406, + "grad_norm": 1.8686621081953942, + "learning_rate": 5.433927480843537e-07, + "loss": 0.8863, + "step": 8578 + }, + { + "epoch": 0.897854526425955, + "grad_norm": 2.0220966066928128, + "learning_rate": 5.422911068372227e-07, + "loss": 0.8258, + "step": 8579 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 2.407389734858554, + "learning_rate": 5.411905523000172e-07, + "loss": 0.8185, + "step": 8580 + }, + { + "epoch": 0.8980638409209838, + "grad_norm": 2.180556601813161, + "learning_rate": 5.400910845991925e-07, + "loss": 0.7839, + "step": 8581 + }, + { + "epoch": 0.8981684981684982, + "grad_norm": 2.2606617621305434, + "learning_rate": 5.389927038610776e-07, + "loss": 0.8516, + "step": 8582 + }, + { + "epoch": 0.8982731554160126, + "grad_norm": 1.780676054406007, + "learning_rate": 5.378954102118827e-07, + "loss": 0.7737, + "step": 8583 + }, + { + "epoch": 0.8983778126635269, + "grad_norm": 2.0946716817061386, + "learning_rate": 5.367992037776859e-07, + "loss": 0.9084, + "step": 8584 + }, + { + "epoch": 0.8984824699110413, + "grad_norm": 1.91970079060269, + "learning_rate": 5.357040846844452e-07, + "loss": 0.7988, + "step": 8585 + }, + { + "epoch": 0.8985871271585557, + "grad_norm": 1.9948211908466287, + "learning_rate": 5.346100530579911e-07, + "loss": 0.8677, + "step": 8586 + }, + { + "epoch": 0.8986917844060701, + "grad_norm": 2.1808113443571493, + "learning_rate": 5.335171090240287e-07, + "loss": 0.9124, + "step": 8587 + }, + { + "epoch": 0.8987964416535845, + "grad_norm": 1.9339613369467432, + "learning_rate": 5.324252527081408e-07, + "loss": 0.8807, + "step": 8588 + }, + { + "epoch": 0.8989010989010989, + "grad_norm": 1.8243315084924947, + "learning_rate": 5.31334484235786e-07, + "loss": 0.7639, + "step": 8589 + }, + { + "epoch": 0.8990057561486133, + "grad_norm": 1.8725702372061233, + "learning_rate": 5.302448037322938e-07, + "loss": 0.7613, + "step": 8590 + }, + { + "epoch": 0.8991104133961276, + "grad_norm": 2.267799187028563, + "learning_rate": 5.29156211322871e-07, + "loss": 0.8779, + "step": 8591 + }, + { + "epoch": 0.899215070643642, + "grad_norm": 2.147263551468828, + "learning_rate": 5.280687071325974e-07, + "loss": 0.7696, + "step": 8592 + }, + { + "epoch": 0.8993197278911564, + "grad_norm": 2.2925736831888557, + "learning_rate": 5.269822912864319e-07, + "loss": 0.8893, + "step": 8593 + }, + { + "epoch": 0.8994243851386708, + "grad_norm": 2.1055062293520415, + "learning_rate": 5.258969639092049e-07, + "loss": 0.8334, + "step": 8594 + }, + { + "epoch": 0.8995290423861853, + "grad_norm": 2.361570188646754, + "learning_rate": 5.248127251256241e-07, + "loss": 0.8185, + "step": 8595 + }, + { + "epoch": 0.8996336996336997, + "grad_norm": 1.937369636411695, + "learning_rate": 5.237295750602667e-07, + "loss": 0.8652, + "step": 8596 + }, + { + "epoch": 0.8997383568812141, + "grad_norm": 2.090179273646795, + "learning_rate": 5.22647513837592e-07, + "loss": 0.8299, + "step": 8597 + }, + { + "epoch": 0.8998430141287285, + "grad_norm": 1.984663581897822, + "learning_rate": 5.215665415819315e-07, + "loss": 0.9113, + "step": 8598 + }, + { + "epoch": 0.8999476713762428, + "grad_norm": 2.3235181239813163, + "learning_rate": 5.204866584174906e-07, + "loss": 0.9133, + "step": 8599 + }, + { + "epoch": 0.9000523286237572, + "grad_norm": 2.0441409129727797, + "learning_rate": 5.194078644683498e-07, + "loss": 0.943, + "step": 8600 + }, + { + "epoch": 0.9001569858712716, + "grad_norm": 1.8067996809796987, + "learning_rate": 5.183301598584633e-07, + "loss": 0.8274, + "step": 8601 + }, + { + "epoch": 0.900261643118786, + "grad_norm": 2.1626365258620486, + "learning_rate": 5.172535447116634e-07, + "loss": 0.8934, + "step": 8602 + }, + { + "epoch": 0.9003663003663004, + "grad_norm": 2.2909402751050814, + "learning_rate": 5.161780191516552e-07, + "loss": 0.8364, + "step": 8603 + }, + { + "epoch": 0.9004709576138148, + "grad_norm": 1.9983348573884885, + "learning_rate": 5.151035833020179e-07, + "loss": 0.862, + "step": 8604 + }, + { + "epoch": 0.9005756148613292, + "grad_norm": 2.202331715706061, + "learning_rate": 5.14030237286206e-07, + "loss": 0.8951, + "step": 8605 + }, + { + "epoch": 0.9006802721088435, + "grad_norm": 1.838553460141115, + "learning_rate": 5.129579812275498e-07, + "loss": 0.7951, + "step": 8606 + }, + { + "epoch": 0.9007849293563579, + "grad_norm": 2.2012009750979735, + "learning_rate": 5.118868152492551e-07, + "loss": 0.6918, + "step": 8607 + }, + { + "epoch": 0.9008895866038723, + "grad_norm": 1.8524490674209988, + "learning_rate": 5.10816739474399e-07, + "loss": 0.8473, + "step": 8608 + }, + { + "epoch": 0.9009942438513867, + "grad_norm": 2.3959923668906518, + "learning_rate": 5.097477540259365e-07, + "loss": 0.8172, + "step": 8609 + }, + { + "epoch": 0.9010989010989011, + "grad_norm": 1.9678387832442228, + "learning_rate": 5.086798590266961e-07, + "loss": 0.9161, + "step": 8610 + }, + { + "epoch": 0.9012035583464155, + "grad_norm": 1.9199312869676968, + "learning_rate": 5.076130545993796e-07, + "loss": 0.7752, + "step": 8611 + }, + { + "epoch": 0.9013082155939299, + "grad_norm": 2.7222834602307215, + "learning_rate": 5.065473408665678e-07, + "loss": 0.9413, + "step": 8612 + }, + { + "epoch": 0.9014128728414442, + "grad_norm": 1.9979574681439125, + "learning_rate": 5.054827179507115e-07, + "loss": 0.7943, + "step": 8613 + }, + { + "epoch": 0.9015175300889586, + "grad_norm": 2.206392186358271, + "learning_rate": 5.044191859741365e-07, + "loss": 0.8969, + "step": 8614 + }, + { + "epoch": 0.901622187336473, + "grad_norm": 2.1430840266190376, + "learning_rate": 5.033567450590482e-07, + "loss": 0.8642, + "step": 8615 + }, + { + "epoch": 0.9017268445839874, + "grad_norm": 2.017404088868683, + "learning_rate": 5.022953953275201e-07, + "loss": 0.9434, + "step": 8616 + }, + { + "epoch": 0.9018315018315018, + "grad_norm": 2.0133723850020346, + "learning_rate": 5.012351369015067e-07, + "loss": 0.7414, + "step": 8617 + }, + { + "epoch": 0.9019361590790163, + "grad_norm": 2.046862624034036, + "learning_rate": 5.001759699028319e-07, + "loss": 0.9387, + "step": 8618 + }, + { + "epoch": 0.9020408163265307, + "grad_norm": 2.0907637713335987, + "learning_rate": 4.99117894453196e-07, + "loss": 0.7797, + "step": 8619 + }, + { + "epoch": 0.902145473574045, + "grad_norm": 2.013207011952914, + "learning_rate": 4.980609106741729e-07, + "loss": 0.9043, + "step": 8620 + }, + { + "epoch": 0.9022501308215594, + "grad_norm": 1.8844202484567696, + "learning_rate": 4.970050186872155e-07, + "loss": 0.879, + "step": 8621 + }, + { + "epoch": 0.9023547880690738, + "grad_norm": 2.3362225206127696, + "learning_rate": 4.959502186136456e-07, + "loss": 0.8738, + "step": 8622 + }, + { + "epoch": 0.9024594453165882, + "grad_norm": 2.0248898990471367, + "learning_rate": 4.948965105746617e-07, + "loss": 0.8554, + "step": 8623 + }, + { + "epoch": 0.9025641025641026, + "grad_norm": 1.8717907789505432, + "learning_rate": 4.938438946913382e-07, + "loss": 0.8623, + "step": 8624 + }, + { + "epoch": 0.902668759811617, + "grad_norm": 2.1406021198538188, + "learning_rate": 4.927923710846228e-07, + "loss": 0.8994, + "step": 8625 + }, + { + "epoch": 0.9027734170591314, + "grad_norm": 2.105701972491203, + "learning_rate": 4.917419398753342e-07, + "loss": 0.8342, + "step": 8626 + }, + { + "epoch": 0.9028780743066457, + "grad_norm": 2.0643422231373707, + "learning_rate": 4.906926011841751e-07, + "loss": 0.9147, + "step": 8627 + }, + { + "epoch": 0.9029827315541601, + "grad_norm": 1.9814741563458926, + "learning_rate": 4.89644355131712e-07, + "loss": 0.8054, + "step": 8628 + }, + { + "epoch": 0.9030873888016745, + "grad_norm": 2.1051194899963317, + "learning_rate": 4.885972018383933e-07, + "loss": 0.9514, + "step": 8629 + }, + { + "epoch": 0.9031920460491889, + "grad_norm": 2.459613012959799, + "learning_rate": 4.87551141424536e-07, + "loss": 0.9572, + "step": 8630 + }, + { + "epoch": 0.9032967032967033, + "grad_norm": 2.629282062410733, + "learning_rate": 4.865061740103361e-07, + "loss": 0.8158, + "step": 8631 + }, + { + "epoch": 0.9034013605442177, + "grad_norm": 2.0103071109487853, + "learning_rate": 4.854622997158643e-07, + "loss": 0.8943, + "step": 8632 + }, + { + "epoch": 0.9035060177917321, + "grad_norm": 1.9239594017345965, + "learning_rate": 4.844195186610622e-07, + "loss": 0.8074, + "step": 8633 + }, + { + "epoch": 0.9036106750392464, + "grad_norm": 2.1402627344081875, + "learning_rate": 4.833778309657467e-07, + "loss": 0.8907, + "step": 8634 + }, + { + "epoch": 0.9037153322867608, + "grad_norm": 2.137809328262573, + "learning_rate": 4.823372367496104e-07, + "loss": 0.8235, + "step": 8635 + }, + { + "epoch": 0.9038199895342752, + "grad_norm": 2.0550886701913593, + "learning_rate": 4.812977361322201e-07, + "loss": 0.8927, + "step": 8636 + }, + { + "epoch": 0.9039246467817896, + "grad_norm": 1.9128522746883567, + "learning_rate": 4.802593292330171e-07, + "loss": 0.7968, + "step": 8637 + }, + { + "epoch": 0.904029304029304, + "grad_norm": 1.9358701087954804, + "learning_rate": 4.792220161713157e-07, + "loss": 0.8978, + "step": 8638 + }, + { + "epoch": 0.9041339612768184, + "grad_norm": 2.291398568641111, + "learning_rate": 4.78185797066304e-07, + "loss": 0.8892, + "step": 8639 + }, + { + "epoch": 0.9042386185243328, + "grad_norm": 2.4919196805707418, + "learning_rate": 4.771506720370466e-07, + "loss": 0.9424, + "step": 8640 + }, + { + "epoch": 0.9043432757718473, + "grad_norm": 1.6894319599246974, + "learning_rate": 4.761166412024831e-07, + "loss": 0.7277, + "step": 8641 + }, + { + "epoch": 0.9044479330193615, + "grad_norm": 1.9107521958807865, + "learning_rate": 4.7508370468142496e-07, + "loss": 0.885, + "step": 8642 + }, + { + "epoch": 0.904552590266876, + "grad_norm": 2.0614995865460863, + "learning_rate": 4.7405186259255833e-07, + "loss": 0.7822, + "step": 8643 + }, + { + "epoch": 0.9046572475143904, + "grad_norm": 1.7924039712471196, + "learning_rate": 4.7302111505444284e-07, + "loss": 0.9039, + "step": 8644 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 2.249557355888428, + "learning_rate": 4.719914621855137e-07, + "loss": 0.8715, + "step": 8645 + }, + { + "epoch": 0.9048665620094192, + "grad_norm": 2.1203352917619345, + "learning_rate": 4.7096290410408196e-07, + "loss": 0.8193, + "step": 8646 + }, + { + "epoch": 0.9049712192569336, + "grad_norm": 2.115446838243104, + "learning_rate": 4.6993544092832965e-07, + "loss": 0.7967, + "step": 8647 + }, + { + "epoch": 0.905075876504448, + "grad_norm": 1.6513015444716435, + "learning_rate": 4.689090727763157e-07, + "loss": 0.6944, + "step": 8648 + }, + { + "epoch": 0.9051805337519623, + "grad_norm": 2.1211276289299272, + "learning_rate": 4.6788379976596797e-07, + "loss": 0.8097, + "step": 8649 + }, + { + "epoch": 0.9052851909994767, + "grad_norm": 1.7872427675425049, + "learning_rate": 4.668596220150967e-07, + "loss": 0.742, + "step": 8650 + }, + { + "epoch": 0.9053898482469911, + "grad_norm": 1.898405645689105, + "learning_rate": 4.658365396413811e-07, + "loss": 0.6804, + "step": 8651 + }, + { + "epoch": 0.9054945054945055, + "grad_norm": 2.2484345299825756, + "learning_rate": 4.6481455276237485e-07, + "loss": 0.9168, + "step": 8652 + }, + { + "epoch": 0.9055991627420199, + "grad_norm": 1.8890310215611323, + "learning_rate": 4.637936614955063e-07, + "loss": 0.8853, + "step": 8653 + }, + { + "epoch": 0.9057038199895343, + "grad_norm": 2.116084072039392, + "learning_rate": 4.6277386595807717e-07, + "loss": 0.9191, + "step": 8654 + }, + { + "epoch": 0.9058084772370487, + "grad_norm": 1.8431218097882023, + "learning_rate": 4.617551662672659e-07, + "loss": 0.8677, + "step": 8655 + }, + { + "epoch": 0.905913134484563, + "grad_norm": 2.3481517816362905, + "learning_rate": 4.6073756254012224e-07, + "loss": 0.8765, + "step": 8656 + }, + { + "epoch": 0.9060177917320774, + "grad_norm": 2.0817117165028898, + "learning_rate": 4.597210548935693e-07, + "loss": 0.8923, + "step": 8657 + }, + { + "epoch": 0.9061224489795918, + "grad_norm": 2.2883970597196366, + "learning_rate": 4.5870564344440815e-07, + "loss": 0.914, + "step": 8658 + }, + { + "epoch": 0.9062271062271062, + "grad_norm": 1.83618675401989, + "learning_rate": 4.576913283093098e-07, + "loss": 0.765, + "step": 8659 + }, + { + "epoch": 0.9063317634746206, + "grad_norm": 1.989383983995067, + "learning_rate": 4.566781096048234e-07, + "loss": 0.7973, + "step": 8660 + }, + { + "epoch": 0.906436420722135, + "grad_norm": 2.3861984180984344, + "learning_rate": 4.5566598744736903e-07, + "loss": 0.9352, + "step": 8661 + }, + { + "epoch": 0.9065410779696494, + "grad_norm": 2.0378146649226068, + "learning_rate": 4.5465496195324035e-07, + "loss": 0.8419, + "step": 8662 + }, + { + "epoch": 0.9066457352171637, + "grad_norm": 2.392679580789577, + "learning_rate": 4.5364503323860666e-07, + "loss": 0.849, + "step": 8663 + }, + { + "epoch": 0.9067503924646781, + "grad_norm": 1.9701786010249667, + "learning_rate": 4.526362014195107e-07, + "loss": 0.9152, + "step": 8664 + }, + { + "epoch": 0.9068550497121926, + "grad_norm": 2.2953866617085827, + "learning_rate": 4.516284666118709e-07, + "loss": 0.9003, + "step": 8665 + }, + { + "epoch": 0.906959706959707, + "grad_norm": 1.9748169384630196, + "learning_rate": 4.506218289314757e-07, + "loss": 0.8263, + "step": 8666 + }, + { + "epoch": 0.9070643642072214, + "grad_norm": 1.9629754135097301, + "learning_rate": 4.496162884939914e-07, + "loss": 0.934, + "step": 8667 + }, + { + "epoch": 0.9071690214547358, + "grad_norm": 2.008476459907381, + "learning_rate": 4.4861184541495685e-07, + "loss": 0.8925, + "step": 8668 + }, + { + "epoch": 0.9072736787022502, + "grad_norm": 1.7441402259407277, + "learning_rate": 4.4760849980978184e-07, + "loss": 0.8045, + "step": 8669 + }, + { + "epoch": 0.9073783359497645, + "grad_norm": 2.085568340205773, + "learning_rate": 4.466062517937575e-07, + "loss": 0.8026, + "step": 8670 + }, + { + "epoch": 0.9074829931972789, + "grad_norm": 2.1982232449226977, + "learning_rate": 4.456051014820406e-07, + "loss": 0.8355, + "step": 8671 + }, + { + "epoch": 0.9075876504447933, + "grad_norm": 2.1354303926988765, + "learning_rate": 4.4460504898966693e-07, + "loss": 0.8294, + "step": 8672 + }, + { + "epoch": 0.9076923076923077, + "grad_norm": 2.034301795378783, + "learning_rate": 4.4360609443154233e-07, + "loss": 0.9312, + "step": 8673 + }, + { + "epoch": 0.9077969649398221, + "grad_norm": 2.355315349345066, + "learning_rate": 4.426082379224517e-07, + "loss": 0.9095, + "step": 8674 + }, + { + "epoch": 0.9079016221873365, + "grad_norm": 1.9806257600484884, + "learning_rate": 4.4161147957704767e-07, + "loss": 0.7933, + "step": 8675 + }, + { + "epoch": 0.9080062794348509, + "grad_norm": 2.2251811085208804, + "learning_rate": 4.406158195098642e-07, + "loss": 0.9325, + "step": 8676 + }, + { + "epoch": 0.9081109366823652, + "grad_norm": 2.2630252661426575, + "learning_rate": 4.39621257835301e-07, + "loss": 0.8204, + "step": 8677 + }, + { + "epoch": 0.9082155939298796, + "grad_norm": 2.013858317073263, + "learning_rate": 4.386277946676343e-07, + "loss": 0.8767, + "step": 8678 + }, + { + "epoch": 0.908320251177394, + "grad_norm": 2.051873546111938, + "learning_rate": 4.376354301210184e-07, + "loss": 0.9409, + "step": 8679 + }, + { + "epoch": 0.9084249084249084, + "grad_norm": 2.13004369241599, + "learning_rate": 4.3664416430947766e-07, + "loss": 0.8222, + "step": 8680 + }, + { + "epoch": 0.9085295656724228, + "grad_norm": 2.0442840076206705, + "learning_rate": 4.356539973469087e-07, + "loss": 0.9183, + "step": 8681 + }, + { + "epoch": 0.9086342229199372, + "grad_norm": 1.7421295502754213, + "learning_rate": 4.346649293470839e-07, + "loss": 0.8429, + "step": 8682 + }, + { + "epoch": 0.9087388801674516, + "grad_norm": 2.3968867260487117, + "learning_rate": 4.33676960423649e-07, + "loss": 0.9008, + "step": 8683 + }, + { + "epoch": 0.908843537414966, + "grad_norm": 1.9127124116543281, + "learning_rate": 4.326900906901232e-07, + "loss": 0.7683, + "step": 8684 + }, + { + "epoch": 0.9089481946624803, + "grad_norm": 1.95961808035582, + "learning_rate": 4.317043202599036e-07, + "loss": 0.8165, + "step": 8685 + }, + { + "epoch": 0.9090528519099947, + "grad_norm": 1.9844531465124018, + "learning_rate": 4.30719649246254e-07, + "loss": 0.8512, + "step": 8686 + }, + { + "epoch": 0.9091575091575091, + "grad_norm": 1.9622621837590426, + "learning_rate": 4.297360777623161e-07, + "loss": 0.7713, + "step": 8687 + }, + { + "epoch": 0.9092621664050236, + "grad_norm": 1.8041201019218724, + "learning_rate": 4.287536059211017e-07, + "loss": 0.8295, + "step": 8688 + }, + { + "epoch": 0.909366823652538, + "grad_norm": 2.337575130743714, + "learning_rate": 4.277722338355028e-07, + "loss": 0.8251, + "step": 8689 + }, + { + "epoch": 0.9094714809000524, + "grad_norm": 2.0458103819689866, + "learning_rate": 4.267919616182792e-07, + "loss": 0.8311, + "step": 8690 + }, + { + "epoch": 0.9095761381475668, + "grad_norm": 1.8361080971407697, + "learning_rate": 4.2581278938206626e-07, + "loss": 0.7123, + "step": 8691 + }, + { + "epoch": 0.9096807953950811, + "grad_norm": 2.1827141207863607, + "learning_rate": 4.2483471723937075e-07, + "loss": 0.9557, + "step": 8692 + }, + { + "epoch": 0.9097854526425955, + "grad_norm": 1.7316495003042163, + "learning_rate": 4.2385774530257717e-07, + "loss": 0.7874, + "step": 8693 + }, + { + "epoch": 0.9098901098901099, + "grad_norm": 2.1643266776521997, + "learning_rate": 4.2288187368394353e-07, + "loss": 0.7739, + "step": 8694 + }, + { + "epoch": 0.9099947671376243, + "grad_norm": 1.6984430830527935, + "learning_rate": 4.219071024955967e-07, + "loss": 0.781, + "step": 8695 + }, + { + "epoch": 0.9100994243851387, + "grad_norm": 1.9916952789284774, + "learning_rate": 4.209334318495406e-07, + "loss": 0.8571, + "step": 8696 + }, + { + "epoch": 0.9102040816326531, + "grad_norm": 2.02650344774727, + "learning_rate": 4.1996086185765115e-07, + "loss": 0.8146, + "step": 8697 + }, + { + "epoch": 0.9103087388801675, + "grad_norm": 2.0774078712351667, + "learning_rate": 4.1898939263168127e-07, + "loss": 0.8396, + "step": 8698 + }, + { + "epoch": 0.9104133961276818, + "grad_norm": 2.171351968437315, + "learning_rate": 4.1801902428325156e-07, + "loss": 0.9199, + "step": 8699 + }, + { + "epoch": 0.9105180533751962, + "grad_norm": 2.0626628160103855, + "learning_rate": 4.1704975692386186e-07, + "loss": 0.8262, + "step": 8700 + }, + { + "epoch": 0.9106227106227106, + "grad_norm": 2.2474029662554256, + "learning_rate": 4.160815906648796e-07, + "loss": 0.8491, + "step": 8701 + }, + { + "epoch": 0.910727367870225, + "grad_norm": 1.8343930692503205, + "learning_rate": 4.1511452561755263e-07, + "loss": 0.9191, + "step": 8702 + }, + { + "epoch": 0.9108320251177394, + "grad_norm": 1.919914539345838, + "learning_rate": 4.1414856189299635e-07, + "loss": 0.8064, + "step": 8703 + }, + { + "epoch": 0.9109366823652538, + "grad_norm": 2.2697378486199185, + "learning_rate": 4.131836996022043e-07, + "loss": 0.8934, + "step": 8704 + }, + { + "epoch": 0.9110413396127682, + "grad_norm": 2.1565994352606648, + "learning_rate": 4.122199388560388e-07, + "loss": 0.8283, + "step": 8705 + }, + { + "epoch": 0.9111459968602825, + "grad_norm": 2.0593897209824243, + "learning_rate": 4.1125727976523923e-07, + "loss": 0.8751, + "step": 8706 + }, + { + "epoch": 0.9112506541077969, + "grad_norm": 2.633616179719094, + "learning_rate": 4.1029572244041585e-07, + "loss": 0.8806, + "step": 8707 + }, + { + "epoch": 0.9113553113553113, + "grad_norm": 2.217152686708234, + "learning_rate": 4.0933526699205475e-07, + "loss": 0.911, + "step": 8708 + }, + { + "epoch": 0.9114599686028257, + "grad_norm": 2.166895554817742, + "learning_rate": 4.083759135305132e-07, + "loss": 0.7358, + "step": 8709 + }, + { + "epoch": 0.9115646258503401, + "grad_norm": 2.0236710752805713, + "learning_rate": 4.074176621660242e-07, + "loss": 0.8996, + "step": 8710 + }, + { + "epoch": 0.9116692830978546, + "grad_norm": 2.2962325093594607, + "learning_rate": 4.0646051300869295e-07, + "loss": 0.9279, + "step": 8711 + }, + { + "epoch": 0.911773940345369, + "grad_norm": 2.042939432062761, + "learning_rate": 4.0550446616849484e-07, + "loss": 0.8324, + "step": 8712 + }, + { + "epoch": 0.9118785975928833, + "grad_norm": 2.428545782174381, + "learning_rate": 4.0454952175528527e-07, + "loss": 0.8856, + "step": 8713 + }, + { + "epoch": 0.9119832548403977, + "grad_norm": 2.2988669806643784, + "learning_rate": 4.0359567987878655e-07, + "loss": 0.8653, + "step": 8714 + }, + { + "epoch": 0.9120879120879121, + "grad_norm": 1.9821777641406835, + "learning_rate": 4.026429406485988e-07, + "loss": 0.9642, + "step": 8715 + }, + { + "epoch": 0.9121925693354265, + "grad_norm": 2.301086960995479, + "learning_rate": 4.016913041741921e-07, + "loss": 0.8863, + "step": 8716 + }, + { + "epoch": 0.9122972265829409, + "grad_norm": 2.5204244955883515, + "learning_rate": 4.0074077056491355e-07, + "loss": 0.8635, + "step": 8717 + }, + { + "epoch": 0.9124018838304553, + "grad_norm": 1.8160765406116912, + "learning_rate": 3.997913399299791e-07, + "loss": 0.7991, + "step": 8718 + }, + { + "epoch": 0.9125065410779697, + "grad_norm": 1.934088855695629, + "learning_rate": 3.9884301237848255e-07, + "loss": 0.9356, + "step": 8719 + }, + { + "epoch": 0.912611198325484, + "grad_norm": 2.2496984791324177, + "learning_rate": 3.978957880193868e-07, + "loss": 0.8782, + "step": 8720 + }, + { + "epoch": 0.9127158555729984, + "grad_norm": 2.3632313366350703, + "learning_rate": 3.969496669615314e-07, + "loss": 0.8665, + "step": 8721 + }, + { + "epoch": 0.9128205128205128, + "grad_norm": 2.272870516424909, + "learning_rate": 3.9600464931362494e-07, + "loss": 0.973, + "step": 8722 + }, + { + "epoch": 0.9129251700680272, + "grad_norm": 2.146882820843765, + "learning_rate": 3.9506073518425504e-07, + "loss": 0.9286, + "step": 8723 + }, + { + "epoch": 0.9130298273155416, + "grad_norm": 2.0647435155269576, + "learning_rate": 3.9411792468187716e-07, + "loss": 0.8374, + "step": 8724 + }, + { + "epoch": 0.913134484563056, + "grad_norm": 2.059170164055368, + "learning_rate": 3.931762179148235e-07, + "loss": 0.8575, + "step": 8725 + }, + { + "epoch": 0.9132391418105704, + "grad_norm": 1.9573837147170599, + "learning_rate": 3.9223561499129647e-07, + "loss": 0.8199, + "step": 8726 + }, + { + "epoch": 0.9133437990580848, + "grad_norm": 2.1437953288885017, + "learning_rate": 3.91296116019374e-07, + "loss": 0.7705, + "step": 8727 + }, + { + "epoch": 0.9134484563055991, + "grad_norm": 1.858952500811545, + "learning_rate": 3.903577211070075e-07, + "loss": 0.825, + "step": 8728 + }, + { + "epoch": 0.9135531135531135, + "grad_norm": 2.160108943394208, + "learning_rate": 3.894204303620197e-07, + "loss": 0.8654, + "step": 8729 + }, + { + "epoch": 0.9136577708006279, + "grad_norm": 2.1084513039063117, + "learning_rate": 3.884842438921077e-07, + "loss": 1.033, + "step": 8730 + }, + { + "epoch": 0.9137624280481423, + "grad_norm": 2.7878592333589607, + "learning_rate": 3.875491618048388e-07, + "loss": 0.9281, + "step": 8731 + }, + { + "epoch": 0.9138670852956567, + "grad_norm": 1.991323149670702, + "learning_rate": 3.866151842076593e-07, + "loss": 0.9283, + "step": 8732 + }, + { + "epoch": 0.9139717425431712, + "grad_norm": 2.2833156721267507, + "learning_rate": 3.8568231120788334e-07, + "loss": 0.9989, + "step": 8733 + }, + { + "epoch": 0.9140763997906856, + "grad_norm": 1.9572296751214142, + "learning_rate": 3.8475054291270077e-07, + "loss": 0.7219, + "step": 8734 + }, + { + "epoch": 0.9141810570381999, + "grad_norm": 2.2701248411948085, + "learning_rate": 3.838198794291714e-07, + "loss": 0.911, + "step": 8735 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 1.924604288549798, + "learning_rate": 3.8289032086423095e-07, + "loss": 0.9511, + "step": 8736 + }, + { + "epoch": 0.9143903715332287, + "grad_norm": 2.141850749754829, + "learning_rate": 3.8196186732469056e-07, + "loss": 0.8538, + "step": 8737 + }, + { + "epoch": 0.9144950287807431, + "grad_norm": 1.8871726093707848, + "learning_rate": 3.810345189172293e-07, + "loss": 0.8743, + "step": 8738 + }, + { + "epoch": 0.9145996860282575, + "grad_norm": 2.345605190515875, + "learning_rate": 3.80108275748402e-07, + "loss": 0.9693, + "step": 8739 + }, + { + "epoch": 0.9147043432757719, + "grad_norm": 1.8872898192305327, + "learning_rate": 3.791831379246347e-07, + "loss": 0.8189, + "step": 8740 + }, + { + "epoch": 0.9148090005232863, + "grad_norm": 2.146002787179079, + "learning_rate": 3.7825910555222666e-07, + "loss": 0.8324, + "step": 8741 + }, + { + "epoch": 0.9149136577708006, + "grad_norm": 2.085783356155515, + "learning_rate": 3.773361787373553e-07, + "loss": 0.8937, + "step": 8742 + }, + { + "epoch": 0.915018315018315, + "grad_norm": 2.383660628094254, + "learning_rate": 3.7641435758606347e-07, + "loss": 0.9274, + "step": 8743 + }, + { + "epoch": 0.9151229722658294, + "grad_norm": 1.887353522221794, + "learning_rate": 3.754936422042699e-07, + "loss": 0.8397, + "step": 8744 + }, + { + "epoch": 0.9152276295133438, + "grad_norm": 2.1935449423146656, + "learning_rate": 3.745740326977687e-07, + "loss": 0.8892, + "step": 8745 + }, + { + "epoch": 0.9153322867608582, + "grad_norm": 1.9262362500423253, + "learning_rate": 3.7365552917222213e-07, + "loss": 0.8224, + "step": 8746 + }, + { + "epoch": 0.9154369440083726, + "grad_norm": 2.0505509798208417, + "learning_rate": 3.7273813173317243e-07, + "loss": 0.8285, + "step": 8747 + }, + { + "epoch": 0.915541601255887, + "grad_norm": 2.3324333915301216, + "learning_rate": 3.718218404860263e-07, + "loss": 0.8936, + "step": 8748 + }, + { + "epoch": 0.9156462585034013, + "grad_norm": 2.031276939093479, + "learning_rate": 3.7090665553607076e-07, + "loss": 0.7884, + "step": 8749 + }, + { + "epoch": 0.9157509157509157, + "grad_norm": 2.2510972837867875, + "learning_rate": 3.6999257698845825e-07, + "loss": 0.9864, + "step": 8750 + }, + { + "epoch": 0.9158555729984301, + "grad_norm": 2.3120088138033257, + "learning_rate": 3.690796049482226e-07, + "loss": 0.8674, + "step": 8751 + }, + { + "epoch": 0.9159602302459445, + "grad_norm": 2.5331940368689367, + "learning_rate": 3.681677395202632e-07, + "loss": 0.8305, + "step": 8752 + }, + { + "epoch": 0.9160648874934589, + "grad_norm": 2.134818511890312, + "learning_rate": 3.6725698080935626e-07, + "loss": 0.8018, + "step": 8753 + }, + { + "epoch": 0.9161695447409733, + "grad_norm": 2.2432740685775148, + "learning_rate": 3.6634732892015025e-07, + "loss": 0.9043, + "step": 8754 + }, + { + "epoch": 0.9162742019884877, + "grad_norm": 1.9549596052281204, + "learning_rate": 3.654387839571649e-07, + "loss": 0.9349, + "step": 8755 + }, + { + "epoch": 0.916378859236002, + "grad_norm": 2.0607140029707605, + "learning_rate": 3.645313460247957e-07, + "loss": 0.8228, + "step": 8756 + }, + { + "epoch": 0.9164835164835164, + "grad_norm": 1.9888703343791325, + "learning_rate": 3.6362501522730797e-07, + "loss": 0.7932, + "step": 8757 + }, + { + "epoch": 0.9165881737310309, + "grad_norm": 2.1547284667729314, + "learning_rate": 3.6271979166884076e-07, + "loss": 0.8284, + "step": 8758 + }, + { + "epoch": 0.9166928309785453, + "grad_norm": 2.3830343538126315, + "learning_rate": 3.6181567545340634e-07, + "loss": 0.8717, + "step": 8759 + }, + { + "epoch": 0.9167974882260597, + "grad_norm": 2.7710627812990074, + "learning_rate": 3.6091266668488835e-07, + "loss": 0.7445, + "step": 8760 + }, + { + "epoch": 0.9169021454735741, + "grad_norm": 1.8300014783449674, + "learning_rate": 3.6001076546704595e-07, + "loss": 0.92, + "step": 8761 + }, + { + "epoch": 0.9170068027210885, + "grad_norm": 2.035387613216914, + "learning_rate": 3.591099719035096e-07, + "loss": 0.7801, + "step": 8762 + }, + { + "epoch": 0.9171114599686028, + "grad_norm": 2.104498222120233, + "learning_rate": 3.582102860977821e-07, + "loss": 0.8691, + "step": 8763 + }, + { + "epoch": 0.9172161172161172, + "grad_norm": 2.083491104776923, + "learning_rate": 3.5731170815323733e-07, + "loss": 0.918, + "step": 8764 + }, + { + "epoch": 0.9173207744636316, + "grad_norm": 2.0988470418475984, + "learning_rate": 3.5641423817312503e-07, + "loss": 0.8197, + "step": 8765 + }, + { + "epoch": 0.917425431711146, + "grad_norm": 2.3323620083693677, + "learning_rate": 3.555178762605671e-07, + "loss": 0.9047, + "step": 8766 + }, + { + "epoch": 0.9175300889586604, + "grad_norm": 2.235958503717904, + "learning_rate": 3.546226225185567e-07, + "loss": 0.8173, + "step": 8767 + }, + { + "epoch": 0.9176347462061748, + "grad_norm": 2.3113114359456826, + "learning_rate": 3.537284770499605e-07, + "loss": 0.9114, + "step": 8768 + }, + { + "epoch": 0.9177394034536892, + "grad_norm": 2.095066016594399, + "learning_rate": 3.528354399575151e-07, + "loss": 1.0252, + "step": 8769 + }, + { + "epoch": 0.9178440607012036, + "grad_norm": 1.9933094078470959, + "learning_rate": 3.519435113438352e-07, + "loss": 0.8784, + "step": 8770 + }, + { + "epoch": 0.9179487179487179, + "grad_norm": 2.0786146110365746, + "learning_rate": 3.510526913114065e-07, + "loss": 0.8294, + "step": 8771 + }, + { + "epoch": 0.9180533751962323, + "grad_norm": 2.195402968074223, + "learning_rate": 3.5016297996258276e-07, + "loss": 0.9349, + "step": 8772 + }, + { + "epoch": 0.9181580324437467, + "grad_norm": 2.05314297727512, + "learning_rate": 3.4927437739959547e-07, + "loss": 0.7679, + "step": 8773 + }, + { + "epoch": 0.9182626896912611, + "grad_norm": 2.244479903766985, + "learning_rate": 3.483868837245452e-07, + "loss": 0.929, + "step": 8774 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 2.015140547965409, + "learning_rate": 3.4750049903940817e-07, + "loss": 0.8448, + "step": 8775 + }, + { + "epoch": 0.9184720041862899, + "grad_norm": 1.8382679008968412, + "learning_rate": 3.466152234460318e-07, + "loss": 0.751, + "step": 8776 + }, + { + "epoch": 0.9185766614338043, + "grad_norm": 2.205165672351134, + "learning_rate": 3.4573105704613586e-07, + "loss": 0.9405, + "step": 8777 + }, + { + "epoch": 0.9186813186813186, + "grad_norm": 1.821503436915656, + "learning_rate": 3.4484799994131345e-07, + "loss": 0.808, + "step": 8778 + }, + { + "epoch": 0.918785975928833, + "grad_norm": 2.259936480939421, + "learning_rate": 3.439660522330268e-07, + "loss": 0.8702, + "step": 8779 + }, + { + "epoch": 0.9188906331763474, + "grad_norm": 2.0808618852041847, + "learning_rate": 3.43085214022616e-07, + "loss": 0.7654, + "step": 8780 + }, + { + "epoch": 0.9189952904238619, + "grad_norm": 1.970573498767891, + "learning_rate": 3.4220548541129217e-07, + "loss": 0.8929, + "step": 8781 + }, + { + "epoch": 0.9190999476713763, + "grad_norm": 1.9944791324781226, + "learning_rate": 3.413268665001357e-07, + "loss": 0.8615, + "step": 8782 + }, + { + "epoch": 0.9192046049188907, + "grad_norm": 1.9279258269514712, + "learning_rate": 3.404493573901024e-07, + "loss": 0.7665, + "step": 8783 + }, + { + "epoch": 0.9193092621664051, + "grad_norm": 2.180216746439435, + "learning_rate": 3.3957295818201954e-07, + "loss": 0.9315, + "step": 8784 + }, + { + "epoch": 0.9194139194139194, + "grad_norm": 1.9619360610575096, + "learning_rate": 3.3869766897658753e-07, + "loss": 0.878, + "step": 8785 + }, + { + "epoch": 0.9195185766614338, + "grad_norm": 2.030802148720462, + "learning_rate": 3.378234898743804e-07, + "loss": 0.73, + "step": 8786 + }, + { + "epoch": 0.9196232339089482, + "grad_norm": 2.1100710529585625, + "learning_rate": 3.369504209758401e-07, + "loss": 0.8821, + "step": 8787 + }, + { + "epoch": 0.9197278911564626, + "grad_norm": 2.2642436944668822, + "learning_rate": 3.3607846238128517e-07, + "loss": 0.8657, + "step": 8788 + }, + { + "epoch": 0.919832548403977, + "grad_norm": 1.8980867961682124, + "learning_rate": 3.352076141909055e-07, + "loss": 0.7698, + "step": 8789 + }, + { + "epoch": 0.9199372056514914, + "grad_norm": 2.116331603985098, + "learning_rate": 3.3433787650476444e-07, + "loss": 0.7825, + "step": 8790 + }, + { + "epoch": 0.9200418628990058, + "grad_norm": 2.016895335801583, + "learning_rate": 3.3346924942279645e-07, + "loss": 0.8306, + "step": 8791 + }, + { + "epoch": 0.9201465201465201, + "grad_norm": 2.1969878425950435, + "learning_rate": 3.3260173304480724e-07, + "loss": 0.9345, + "step": 8792 + }, + { + "epoch": 0.9202511773940345, + "grad_norm": 2.4238866705980904, + "learning_rate": 3.317353274704749e-07, + "loss": 0.8095, + "step": 8793 + }, + { + "epoch": 0.9203558346415489, + "grad_norm": 2.061511830200133, + "learning_rate": 3.3087003279935527e-07, + "loss": 0.9398, + "step": 8794 + }, + { + "epoch": 0.9204604918890633, + "grad_norm": 1.9785156370811716, + "learning_rate": 3.3000584913087e-07, + "loss": 0.8761, + "step": 8795 + }, + { + "epoch": 0.9205651491365777, + "grad_norm": 2.6239846193899687, + "learning_rate": 3.2914277656431405e-07, + "loss": 0.9125, + "step": 8796 + }, + { + "epoch": 0.9206698063840921, + "grad_norm": 2.098352277907113, + "learning_rate": 3.282808151988603e-07, + "loss": 0.7692, + "step": 8797 + }, + { + "epoch": 0.9207744636316065, + "grad_norm": 1.9428905633278999, + "learning_rate": 3.2741996513354637e-07, + "loss": 0.8322, + "step": 8798 + }, + { + "epoch": 0.9208791208791208, + "grad_norm": 1.783743284332841, + "learning_rate": 3.265602264672862e-07, + "loss": 0.7968, + "step": 8799 + }, + { + "epoch": 0.9209837781266352, + "grad_norm": 1.9603968225564672, + "learning_rate": 3.257015992988677e-07, + "loss": 0.9033, + "step": 8800 + }, + { + "epoch": 0.9210884353741496, + "grad_norm": 1.9634203307317444, + "learning_rate": 3.248440837269462e-07, + "loss": 0.9142, + "step": 8801 + }, + { + "epoch": 0.921193092621664, + "grad_norm": 1.8398499107034954, + "learning_rate": 3.239876798500541e-07, + "loss": 0.8375, + "step": 8802 + }, + { + "epoch": 0.9212977498691785, + "grad_norm": 2.113545071381351, + "learning_rate": 3.2313238776659037e-07, + "loss": 0.842, + "step": 8803 + }, + { + "epoch": 0.9214024071166929, + "grad_norm": 2.0202953316393164, + "learning_rate": 3.222782075748354e-07, + "loss": 0.7569, + "step": 8804 + }, + { + "epoch": 0.9215070643642073, + "grad_norm": 2.1413122563788227, + "learning_rate": 3.2142513937293064e-07, + "loss": 0.8861, + "step": 8805 + }, + { + "epoch": 0.9216117216117216, + "grad_norm": 2.235823853768752, + "learning_rate": 3.2057318325889877e-07, + "loss": 0.9496, + "step": 8806 + }, + { + "epoch": 0.921716378859236, + "grad_norm": 2.16956827769149, + "learning_rate": 3.1972233933063144e-07, + "loss": 0.8859, + "step": 8807 + }, + { + "epoch": 0.9218210361067504, + "grad_norm": 2.1026176075046874, + "learning_rate": 3.1887260768588946e-07, + "loss": 0.8599, + "step": 8808 + }, + { + "epoch": 0.9219256933542648, + "grad_norm": 1.8287944523325435, + "learning_rate": 3.180239884223124e-07, + "loss": 0.8797, + "step": 8809 + }, + { + "epoch": 0.9220303506017792, + "grad_norm": 2.0117907483069764, + "learning_rate": 3.1717648163740567e-07, + "loss": 0.9159, + "step": 8810 + }, + { + "epoch": 0.9221350078492936, + "grad_norm": 2.1179998673763234, + "learning_rate": 3.163300874285513e-07, + "loss": 0.7744, + "step": 8811 + }, + { + "epoch": 0.922239665096808, + "grad_norm": 2.2470761723344452, + "learning_rate": 3.154848058929982e-07, + "loss": 0.9273, + "step": 8812 + }, + { + "epoch": 0.9223443223443224, + "grad_norm": 1.881792495195148, + "learning_rate": 3.146406371278754e-07, + "loss": 0.8834, + "step": 8813 + }, + { + "epoch": 0.9224489795918367, + "grad_norm": 1.979370856312482, + "learning_rate": 3.1379758123017636e-07, + "loss": 0.8525, + "step": 8814 + }, + { + "epoch": 0.9225536368393511, + "grad_norm": 2.139478294351095, + "learning_rate": 3.1295563829677247e-07, + "loss": 0.8343, + "step": 8815 + }, + { + "epoch": 0.9226582940868655, + "grad_norm": 2.1238442720838013, + "learning_rate": 3.121148084244041e-07, + "loss": 1.0092, + "step": 8816 + }, + { + "epoch": 0.9227629513343799, + "grad_norm": 1.7314699596986751, + "learning_rate": 3.1127509170968295e-07, + "loss": 0.8082, + "step": 8817 + }, + { + "epoch": 0.9228676085818943, + "grad_norm": 2.012711651464119, + "learning_rate": 3.10436488249094e-07, + "loss": 0.8154, + "step": 8818 + }, + { + "epoch": 0.9229722658294087, + "grad_norm": 2.611942421259733, + "learning_rate": 3.0959899813899576e-07, + "loss": 0.6697, + "step": 8819 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 2.2358990969751193, + "learning_rate": 3.0876262147561784e-07, + "loss": 0.9779, + "step": 8820 + }, + { + "epoch": 0.9231815803244374, + "grad_norm": 2.303016822984439, + "learning_rate": 3.0792735835506015e-07, + "loss": 0.7672, + "step": 8821 + }, + { + "epoch": 0.9232862375719518, + "grad_norm": 2.125149271133452, + "learning_rate": 3.0709320887329697e-07, + "loss": 0.8014, + "step": 8822 + }, + { + "epoch": 0.9233908948194662, + "grad_norm": 1.825121847185592, + "learning_rate": 3.0626017312617274e-07, + "loss": 0.841, + "step": 8823 + }, + { + "epoch": 0.9234955520669806, + "grad_norm": 1.8166108344674616, + "learning_rate": 3.0542825120940757e-07, + "loss": 0.9354, + "step": 8824 + }, + { + "epoch": 0.923600209314495, + "grad_norm": 2.1961545359673376, + "learning_rate": 3.045974432185883e-07, + "loss": 0.9738, + "step": 8825 + }, + { + "epoch": 0.9237048665620095, + "grad_norm": 2.3971264132457257, + "learning_rate": 3.0376774924917863e-07, + "loss": 1.0354, + "step": 8826 + }, + { + "epoch": 0.9238095238095239, + "grad_norm": 2.021091464574574, + "learning_rate": 3.029391693965089e-07, + "loss": 0.7179, + "step": 8827 + }, + { + "epoch": 0.9239141810570382, + "grad_norm": 2.02476604473955, + "learning_rate": 3.0211170375578747e-07, + "loss": 0.9331, + "step": 8828 + }, + { + "epoch": 0.9240188383045526, + "grad_norm": 2.2717440474645407, + "learning_rate": 3.0128535242209155e-07, + "loss": 0.8373, + "step": 8829 + }, + { + "epoch": 0.924123495552067, + "grad_norm": 2.419214715389792, + "learning_rate": 3.004601154903697e-07, + "loss": 0.9407, + "step": 8830 + }, + { + "epoch": 0.9242281527995814, + "grad_norm": 2.0851387695815573, + "learning_rate": 2.9963599305544267e-07, + "loss": 0.9082, + "step": 8831 + }, + { + "epoch": 0.9243328100470958, + "grad_norm": 2.2246862727059207, + "learning_rate": 2.9881298521200477e-07, + "loss": 0.8642, + "step": 8832 + }, + { + "epoch": 0.9244374672946102, + "grad_norm": 2.3566781906040704, + "learning_rate": 2.9799109205462254e-07, + "loss": 0.9774, + "step": 8833 + }, + { + "epoch": 0.9245421245421246, + "grad_norm": 2.2258715574619967, + "learning_rate": 2.971703136777315e-07, + "loss": 0.9839, + "step": 8834 + }, + { + "epoch": 0.9246467817896389, + "grad_norm": 2.37543036241895, + "learning_rate": 2.963506501756419e-07, + "loss": 0.8264, + "step": 8835 + }, + { + "epoch": 0.9247514390371533, + "grad_norm": 2.1881079550525584, + "learning_rate": 2.955321016425328e-07, + "loss": 0.8516, + "step": 8836 + }, + { + "epoch": 0.9248560962846677, + "grad_norm": 2.0859708402910884, + "learning_rate": 2.947146681724578e-07, + "loss": 0.8664, + "step": 8837 + }, + { + "epoch": 0.9249607535321821, + "grad_norm": 2.1999782584393506, + "learning_rate": 2.938983498593428e-07, + "loss": 0.8514, + "step": 8838 + }, + { + "epoch": 0.9250654107796965, + "grad_norm": 2.1661333998629333, + "learning_rate": 2.930831467969841e-07, + "loss": 0.8986, + "step": 8839 + }, + { + "epoch": 0.9251700680272109, + "grad_norm": 2.4097416377658254, + "learning_rate": 2.922690590790478e-07, + "loss": 0.9982, + "step": 8840 + }, + { + "epoch": 0.9252747252747253, + "grad_norm": 1.7688846225655979, + "learning_rate": 2.91456086799079e-07, + "loss": 0.7996, + "step": 8841 + }, + { + "epoch": 0.9253793825222396, + "grad_norm": 2.3499275256087273, + "learning_rate": 2.906442300504841e-07, + "loss": 0.9031, + "step": 8842 + }, + { + "epoch": 0.925484039769754, + "grad_norm": 2.0100308955005257, + "learning_rate": 2.8983348892655303e-07, + "loss": 0.9804, + "step": 8843 + }, + { + "epoch": 0.9255886970172684, + "grad_norm": 2.6234232462180898, + "learning_rate": 2.8902386352043675e-07, + "loss": 0.8138, + "step": 8844 + }, + { + "epoch": 0.9256933542647828, + "grad_norm": 2.3067667180043174, + "learning_rate": 2.8821535392516644e-07, + "loss": 0.8897, + "step": 8845 + }, + { + "epoch": 0.9257980115122972, + "grad_norm": 2.275440873262541, + "learning_rate": 2.874079602336377e-07, + "loss": 0.8794, + "step": 8846 + }, + { + "epoch": 0.9259026687598116, + "grad_norm": 2.5876415121542107, + "learning_rate": 2.866016825386253e-07, + "loss": 0.8667, + "step": 8847 + }, + { + "epoch": 0.926007326007326, + "grad_norm": 1.8078289776282361, + "learning_rate": 2.857965209327695e-07, + "loss": 0.8499, + "step": 8848 + }, + { + "epoch": 0.9261119832548403, + "grad_norm": 1.8051645575388477, + "learning_rate": 2.8499247550858753e-07, + "loss": 0.8481, + "step": 8849 + }, + { + "epoch": 0.9262166405023547, + "grad_norm": 2.1114047838811127, + "learning_rate": 2.841895463584643e-07, + "loss": 0.8574, + "step": 8850 + }, + { + "epoch": 0.9263212977498692, + "grad_norm": 2.221883898704795, + "learning_rate": 2.833877335746571e-07, + "loss": 0.9177, + "step": 8851 + }, + { + "epoch": 0.9264259549973836, + "grad_norm": 2.049867364566891, + "learning_rate": 2.8258703724929783e-07, + "loss": 0.8104, + "step": 8852 + }, + { + "epoch": 0.926530612244898, + "grad_norm": 2.0227819344438682, + "learning_rate": 2.817874574743873e-07, + "loss": 0.9001, + "step": 8853 + }, + { + "epoch": 0.9266352694924124, + "grad_norm": 2.154423868042412, + "learning_rate": 2.8098899434179874e-07, + "loss": 0.8108, + "step": 8854 + }, + { + "epoch": 0.9267399267399268, + "grad_norm": 2.2633954827114424, + "learning_rate": 2.8019164794327756e-07, + "loss": 0.8926, + "step": 8855 + }, + { + "epoch": 0.9268445839874412, + "grad_norm": 2.122598963752719, + "learning_rate": 2.793954183704384e-07, + "loss": 0.8493, + "step": 8856 + }, + { + "epoch": 0.9269492412349555, + "grad_norm": 2.2463946823059646, + "learning_rate": 2.7860030571477237e-07, + "loss": 0.9276, + "step": 8857 + }, + { + "epoch": 0.9270538984824699, + "grad_norm": 1.7833875668973516, + "learning_rate": 2.778063100676387e-07, + "loss": 0.7611, + "step": 8858 + }, + { + "epoch": 0.9271585557299843, + "grad_norm": 2.1337740367922007, + "learning_rate": 2.7701343152026883e-07, + "loss": 0.9961, + "step": 8859 + }, + { + "epoch": 0.9272632129774987, + "grad_norm": 1.6322981589903927, + "learning_rate": 2.7622167016376767e-07, + "loss": 0.7977, + "step": 8860 + }, + { + "epoch": 0.9273678702250131, + "grad_norm": 1.9024737473287638, + "learning_rate": 2.754310260891058e-07, + "loss": 0.8432, + "step": 8861 + }, + { + "epoch": 0.9274725274725275, + "grad_norm": 1.8617079620794417, + "learning_rate": 2.746414993871349e-07, + "loss": 0.7266, + "step": 8862 + }, + { + "epoch": 0.9275771847200419, + "grad_norm": 2.230787630710201, + "learning_rate": 2.738530901485714e-07, + "loss": 0.8972, + "step": 8863 + }, + { + "epoch": 0.9276818419675562, + "grad_norm": 1.9515071420546217, + "learning_rate": 2.7306579846400396e-07, + "loss": 0.7411, + "step": 8864 + }, + { + "epoch": 0.9277864992150706, + "grad_norm": 2.0010638403952616, + "learning_rate": 2.7227962442389453e-07, + "loss": 0.7764, + "step": 8865 + }, + { + "epoch": 0.927891156462585, + "grad_norm": 2.269353148166856, + "learning_rate": 2.7149456811857546e-07, + "loss": 0.8705, + "step": 8866 + }, + { + "epoch": 0.9279958137100994, + "grad_norm": 2.1031934409220963, + "learning_rate": 2.7071062963825444e-07, + "loss": 0.8417, + "step": 8867 + }, + { + "epoch": 0.9281004709576138, + "grad_norm": 2.015166756657374, + "learning_rate": 2.699278090730051e-07, + "loss": 0.7953, + "step": 8868 + }, + { + "epoch": 0.9282051282051282, + "grad_norm": 2.3657122687519143, + "learning_rate": 2.6914610651277427e-07, + "loss": 0.8704, + "step": 8869 + }, + { + "epoch": 0.9283097854526426, + "grad_norm": 1.7952034748061156, + "learning_rate": 2.683655220473824e-07, + "loss": 0.8351, + "step": 8870 + }, + { + "epoch": 0.9284144427001569, + "grad_norm": 2.261515967889273, + "learning_rate": 2.675860557665211e-07, + "loss": 0.8985, + "step": 8871 + }, + { + "epoch": 0.9285190999476713, + "grad_norm": 1.8641043302959848, + "learning_rate": 2.6680770775975196e-07, + "loss": 0.8101, + "step": 8872 + }, + { + "epoch": 0.9286237571951858, + "grad_norm": 1.9479323389296166, + "learning_rate": 2.6603047811650795e-07, + "loss": 0.8856, + "step": 8873 + }, + { + "epoch": 0.9287284144427002, + "grad_norm": 2.3595217516180487, + "learning_rate": 2.652543669260932e-07, + "loss": 0.8375, + "step": 8874 + }, + { + "epoch": 0.9288330716902146, + "grad_norm": 2.2492688188141723, + "learning_rate": 2.644793742776874e-07, + "loss": 0.7974, + "step": 8875 + }, + { + "epoch": 0.928937728937729, + "grad_norm": 1.5664487634193003, + "learning_rate": 2.637055002603373e-07, + "loss": 0.7147, + "step": 8876 + }, + { + "epoch": 0.9290423861852434, + "grad_norm": 1.6411930077631471, + "learning_rate": 2.6293274496296263e-07, + "loss": 0.7573, + "step": 8877 + }, + { + "epoch": 0.9291470434327577, + "grad_norm": 2.2388975244958655, + "learning_rate": 2.621611084743558e-07, + "loss": 0.8901, + "step": 8878 + }, + { + "epoch": 0.9292517006802721, + "grad_norm": 1.8373181670762098, + "learning_rate": 2.6139059088317707e-07, + "loss": 0.7095, + "step": 8879 + }, + { + "epoch": 0.9293563579277865, + "grad_norm": 2.0228887964257214, + "learning_rate": 2.606211922779611e-07, + "loss": 0.7826, + "step": 8880 + }, + { + "epoch": 0.9294610151753009, + "grad_norm": 2.582925357687833, + "learning_rate": 2.59852912747115e-07, + "loss": 0.9785, + "step": 8881 + }, + { + "epoch": 0.9295656724228153, + "grad_norm": 2.186974123773659, + "learning_rate": 2.5908575237891476e-07, + "loss": 0.8608, + "step": 8882 + }, + { + "epoch": 0.9296703296703297, + "grad_norm": 1.5660897431908913, + "learning_rate": 2.5831971126150767e-07, + "loss": 0.6913, + "step": 8883 + }, + { + "epoch": 0.9297749869178441, + "grad_norm": 2.4094802073679755, + "learning_rate": 2.575547894829145e-07, + "loss": 0.9072, + "step": 8884 + }, + { + "epoch": 0.9298796441653584, + "grad_norm": 2.2742005617782755, + "learning_rate": 2.567909871310259e-07, + "loss": 0.9929, + "step": 8885 + }, + { + "epoch": 0.9299843014128728, + "grad_norm": 2.4008864387506677, + "learning_rate": 2.5602830429360516e-07, + "loss": 0.8501, + "step": 8886 + }, + { + "epoch": 0.9300889586603872, + "grad_norm": 1.9539433575898302, + "learning_rate": 2.552667410582843e-07, + "loss": 0.8582, + "step": 8887 + }, + { + "epoch": 0.9301936159079016, + "grad_norm": 2.0416965429628635, + "learning_rate": 2.5450629751257117e-07, + "loss": 0.8709, + "step": 8888 + }, + { + "epoch": 0.930298273155416, + "grad_norm": 2.506235715702831, + "learning_rate": 2.5374697374383806e-07, + "loss": 1.0232, + "step": 8889 + }, + { + "epoch": 0.9304029304029304, + "grad_norm": 1.99806566122585, + "learning_rate": 2.529887698393374e-07, + "loss": 0.8371, + "step": 8890 + }, + { + "epoch": 0.9305075876504448, + "grad_norm": 1.9042092264811785, + "learning_rate": 2.522316858861862e-07, + "loss": 0.855, + "step": 8891 + }, + { + "epoch": 0.9306122448979591, + "grad_norm": 2.038381656157126, + "learning_rate": 2.514757219713737e-07, + "loss": 0.8411, + "step": 8892 + }, + { + "epoch": 0.9307169021454735, + "grad_norm": 2.340813532948446, + "learning_rate": 2.507208781817638e-07, + "loss": 0.923, + "step": 8893 + }, + { + "epoch": 0.9308215593929879, + "grad_norm": 2.0448666923378265, + "learning_rate": 2.499671546040894e-07, + "loss": 0.8978, + "step": 8894 + }, + { + "epoch": 0.9309262166405023, + "grad_norm": 2.199976902362381, + "learning_rate": 2.492145513249533e-07, + "loss": 0.961, + "step": 8895 + }, + { + "epoch": 0.9310308738880168, + "grad_norm": 2.0776211466361274, + "learning_rate": 2.4846306843083315e-07, + "loss": 0.7628, + "step": 8896 + }, + { + "epoch": 0.9311355311355312, + "grad_norm": 1.957550937998734, + "learning_rate": 2.477127060080753e-07, + "loss": 0.8212, + "step": 8897 + }, + { + "epoch": 0.9312401883830456, + "grad_norm": 1.925782850036767, + "learning_rate": 2.469634641428964e-07, + "loss": 0.8907, + "step": 8898 + }, + { + "epoch": 0.93134484563056, + "grad_norm": 2.1162098484306533, + "learning_rate": 2.462153429213865e-07, + "loss": 0.8915, + "step": 8899 + }, + { + "epoch": 0.9314495028780743, + "grad_norm": 1.9771946100363844, + "learning_rate": 2.454683424295068e-07, + "loss": 0.742, + "step": 8900 + }, + { + "epoch": 0.9315541601255887, + "grad_norm": 1.9810077999238465, + "learning_rate": 2.4472246275308976e-07, + "loss": 0.9926, + "step": 8901 + }, + { + "epoch": 0.9316588173731031, + "grad_norm": 1.9014151842209916, + "learning_rate": 2.4397770397783794e-07, + "loss": 0.8555, + "step": 8902 + }, + { + "epoch": 0.9317634746206175, + "grad_norm": 1.9053437153678374, + "learning_rate": 2.4323406618932623e-07, + "loss": 0.8252, + "step": 8903 + }, + { + "epoch": 0.9318681318681319, + "grad_norm": 2.1866635447476193, + "learning_rate": 2.4249154947299734e-07, + "loss": 0.8793, + "step": 8904 + }, + { + "epoch": 0.9319727891156463, + "grad_norm": 2.162217503848748, + "learning_rate": 2.4175015391417087e-07, + "loss": 0.8766, + "step": 8905 + }, + { + "epoch": 0.9320774463631607, + "grad_norm": 2.3164940465618886, + "learning_rate": 2.410098795980342e-07, + "loss": 0.9261, + "step": 8906 + }, + { + "epoch": 0.932182103610675, + "grad_norm": 1.968520269068906, + "learning_rate": 2.40270726609646e-07, + "loss": 0.879, + "step": 8907 + }, + { + "epoch": 0.9322867608581894, + "grad_norm": 2.3833618329222865, + "learning_rate": 2.3953269503393496e-07, + "loss": 0.9269, + "step": 8908 + }, + { + "epoch": 0.9323914181057038, + "grad_norm": 2.403516866391237, + "learning_rate": 2.387957849557032e-07, + "loss": 0.8012, + "step": 8909 + }, + { + "epoch": 0.9324960753532182, + "grad_norm": 2.0016102230175528, + "learning_rate": 2.380599964596264e-07, + "loss": 0.8501, + "step": 8910 + }, + { + "epoch": 0.9326007326007326, + "grad_norm": 2.0012188366238703, + "learning_rate": 2.3732532963024468e-07, + "loss": 0.9091, + "step": 8911 + }, + { + "epoch": 0.932705389848247, + "grad_norm": 2.332974869714162, + "learning_rate": 2.3659178455197274e-07, + "loss": 0.8408, + "step": 8912 + }, + { + "epoch": 0.9328100470957614, + "grad_norm": 2.2463160819031587, + "learning_rate": 2.358593613090987e-07, + "loss": 0.8569, + "step": 8913 + }, + { + "epoch": 0.9329147043432757, + "grad_norm": 1.8053639500260543, + "learning_rate": 2.3512805998577638e-07, + "loss": 0.7705, + "step": 8914 + }, + { + "epoch": 0.9330193615907901, + "grad_norm": 2.2853468515179345, + "learning_rate": 2.343978806660363e-07, + "loss": 0.8963, + "step": 8915 + }, + { + "epoch": 0.9331240188383045, + "grad_norm": 2.0353226627808616, + "learning_rate": 2.3366882343377695e-07, + "loss": 0.8253, + "step": 8916 + }, + { + "epoch": 0.9332286760858189, + "grad_norm": 2.312638823803725, + "learning_rate": 2.3294088837276796e-07, + "loss": 0.8579, + "step": 8917 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.9526099594306408, + "learning_rate": 2.322140755666491e-07, + "loss": 0.812, + "step": 8918 + }, + { + "epoch": 0.9334379905808478, + "grad_norm": 2.0469902618255724, + "learning_rate": 2.3148838509893579e-07, + "loss": 0.8683, + "step": 8919 + }, + { + "epoch": 0.9335426478283622, + "grad_norm": 1.880159180549677, + "learning_rate": 2.3076381705301022e-07, + "loss": 0.9037, + "step": 8920 + }, + { + "epoch": 0.9336473050758765, + "grad_norm": 1.5969199873906468, + "learning_rate": 2.3004037151212576e-07, + "loss": 0.7873, + "step": 8921 + }, + { + "epoch": 0.9337519623233909, + "grad_norm": 1.9933441261897502, + "learning_rate": 2.293180485594093e-07, + "loss": 0.9562, + "step": 8922 + }, + { + "epoch": 0.9338566195709053, + "grad_norm": 1.9912633475021717, + "learning_rate": 2.2859684827785444e-07, + "loss": 0.8938, + "step": 8923 + }, + { + "epoch": 0.9339612768184197, + "grad_norm": 1.7915205291719956, + "learning_rate": 2.2787677075033266e-07, + "loss": 0.797, + "step": 8924 + }, + { + "epoch": 0.9340659340659341, + "grad_norm": 1.9232604600710044, + "learning_rate": 2.2715781605957886e-07, + "loss": 0.8286, + "step": 8925 + }, + { + "epoch": 0.9341705913134485, + "grad_norm": 2.106021331701821, + "learning_rate": 2.2643998428820368e-07, + "loss": 0.7703, + "step": 8926 + }, + { + "epoch": 0.9342752485609629, + "grad_norm": 1.880171483367633, + "learning_rate": 2.2572327551868778e-07, + "loss": 0.8119, + "step": 8927 + }, + { + "epoch": 0.9343799058084772, + "grad_norm": 2.318935123745428, + "learning_rate": 2.2500768983338085e-07, + "loss": 0.9226, + "step": 8928 + }, + { + "epoch": 0.9344845630559916, + "grad_norm": 1.6832835230122096, + "learning_rate": 2.2429322731450708e-07, + "loss": 0.7947, + "step": 8929 + }, + { + "epoch": 0.934589220303506, + "grad_norm": 2.1120628902234615, + "learning_rate": 2.2357988804415976e-07, + "loss": 0.9599, + "step": 8930 + }, + { + "epoch": 0.9346938775510204, + "grad_norm": 2.0233896081392504, + "learning_rate": 2.2286767210430105e-07, + "loss": 0.8827, + "step": 8931 + }, + { + "epoch": 0.9347985347985348, + "grad_norm": 2.040817721869765, + "learning_rate": 2.2215657957676774e-07, + "loss": 0.9144, + "step": 8932 + }, + { + "epoch": 0.9349031920460492, + "grad_norm": 1.922624595035762, + "learning_rate": 2.2144661054326443e-07, + "loss": 0.8159, + "step": 8933 + }, + { + "epoch": 0.9350078492935636, + "grad_norm": 1.6714847110688713, + "learning_rate": 2.2073776508537038e-07, + "loss": 0.7365, + "step": 8934 + }, + { + "epoch": 0.9351125065410779, + "grad_norm": 2.1817225705896393, + "learning_rate": 2.2003004328452926e-07, + "loss": 0.7756, + "step": 8935 + }, + { + "epoch": 0.9352171637885923, + "grad_norm": 2.173883426438636, + "learning_rate": 2.1932344522206494e-07, + "loss": 0.8909, + "step": 8936 + }, + { + "epoch": 0.9353218210361067, + "grad_norm": 2.029134530081249, + "learning_rate": 2.186179709791625e-07, + "loss": 0.9164, + "step": 8937 + }, + { + "epoch": 0.9354264782836211, + "grad_norm": 2.020431951662609, + "learning_rate": 2.1791362063688481e-07, + "loss": 0.8419, + "step": 8938 + }, + { + "epoch": 0.9355311355311355, + "grad_norm": 2.1073518757776664, + "learning_rate": 2.1721039427616164e-07, + "loss": 0.8759, + "step": 8939 + }, + { + "epoch": 0.9356357927786499, + "grad_norm": 1.976379251020562, + "learning_rate": 2.1650829197779722e-07, + "loss": 0.9092, + "step": 8940 + }, + { + "epoch": 0.9357404500261643, + "grad_norm": 2.2148390374272737, + "learning_rate": 2.1580731382246255e-07, + "loss": 0.8463, + "step": 8941 + }, + { + "epoch": 0.9358451072736788, + "grad_norm": 2.3658272163282734, + "learning_rate": 2.1510745989070215e-07, + "loss": 0.8825, + "step": 8942 + }, + { + "epoch": 0.935949764521193, + "grad_norm": 2.1121430732937596, + "learning_rate": 2.144087302629305e-07, + "loss": 0.7826, + "step": 8943 + }, + { + "epoch": 0.9360544217687075, + "grad_norm": 1.725035272587334, + "learning_rate": 2.1371112501943348e-07, + "loss": 0.7282, + "step": 8944 + }, + { + "epoch": 0.9361590790162219, + "grad_norm": 1.8745605871155686, + "learning_rate": 2.130146442403669e-07, + "loss": 0.864, + "step": 8945 + }, + { + "epoch": 0.9362637362637363, + "grad_norm": 2.4987700519915994, + "learning_rate": 2.1231928800575897e-07, + "loss": 0.9905, + "step": 8946 + }, + { + "epoch": 0.9363683935112507, + "grad_norm": 1.7532505918821066, + "learning_rate": 2.1162505639550468e-07, + "loss": 0.7843, + "step": 8947 + }, + { + "epoch": 0.9364730507587651, + "grad_norm": 2.1811207939897783, + "learning_rate": 2.109319494893758e-07, + "loss": 0.9079, + "step": 8948 + }, + { + "epoch": 0.9365777080062795, + "grad_norm": 2.0571665531615944, + "learning_rate": 2.1023996736700968e-07, + "loss": 0.7592, + "step": 8949 + }, + { + "epoch": 0.9366823652537938, + "grad_norm": 1.9908145189339748, + "learning_rate": 2.0954911010791834e-07, + "loss": 0.8919, + "step": 8950 + }, + { + "epoch": 0.9367870225013082, + "grad_norm": 1.879635801498437, + "learning_rate": 2.0885937779148158e-07, + "loss": 0.756, + "step": 8951 + }, + { + "epoch": 0.9368916797488226, + "grad_norm": 2.0278954913132843, + "learning_rate": 2.0817077049694934e-07, + "loss": 0.8411, + "step": 8952 + }, + { + "epoch": 0.936996336996337, + "grad_norm": 2.1053845951642396, + "learning_rate": 2.074832883034461e-07, + "loss": 0.8745, + "step": 8953 + }, + { + "epoch": 0.9371009942438514, + "grad_norm": 1.9499721033329678, + "learning_rate": 2.0679693128996535e-07, + "loss": 0.8599, + "step": 8954 + }, + { + "epoch": 0.9372056514913658, + "grad_norm": 2.2937105767755996, + "learning_rate": 2.061116995353707e-07, + "loss": 0.9106, + "step": 8955 + }, + { + "epoch": 0.9373103087388802, + "grad_norm": 2.0231281748039085, + "learning_rate": 2.0542759311839468e-07, + "loss": 0.8773, + "step": 8956 + }, + { + "epoch": 0.9374149659863945, + "grad_norm": 2.0532732484680594, + "learning_rate": 2.047446121176444e-07, + "loss": 0.939, + "step": 8957 + }, + { + "epoch": 0.9375196232339089, + "grad_norm": 1.8426241069013005, + "learning_rate": 2.040627566115949e-07, + "loss": 0.8707, + "step": 8958 + }, + { + "epoch": 0.9376242804814233, + "grad_norm": 1.9657462776989114, + "learning_rate": 2.0338202667859462e-07, + "loss": 0.9519, + "step": 8959 + }, + { + "epoch": 0.9377289377289377, + "grad_norm": 2.2777404864137556, + "learning_rate": 2.0270242239685768e-07, + "loss": 0.8969, + "step": 8960 + }, + { + "epoch": 0.9378335949764521, + "grad_norm": 2.080680228339346, + "learning_rate": 2.0202394384447376e-07, + "loss": 0.9103, + "step": 8961 + }, + { + "epoch": 0.9379382522239665, + "grad_norm": 2.301306982738778, + "learning_rate": 2.0134659109940057e-07, + "loss": 0.9517, + "step": 8962 + }, + { + "epoch": 0.938042909471481, + "grad_norm": 2.1571228068534767, + "learning_rate": 2.0067036423946916e-07, + "loss": 0.9849, + "step": 8963 + }, + { + "epoch": 0.9381475667189952, + "grad_norm": 1.9252681696412213, + "learning_rate": 1.999952633423785e-07, + "loss": 0.7812, + "step": 8964 + }, + { + "epoch": 0.9382522239665096, + "grad_norm": 1.9760309458462875, + "learning_rate": 1.9932128848569875e-07, + "loss": 0.8616, + "step": 8965 + }, + { + "epoch": 0.938356881214024, + "grad_norm": 2.4624548411289324, + "learning_rate": 1.9864843974687133e-07, + "loss": 0.9182, + "step": 8966 + }, + { + "epoch": 0.9384615384615385, + "grad_norm": 2.0682795924841653, + "learning_rate": 1.9797671720320543e-07, + "loss": 0.9754, + "step": 8967 + }, + { + "epoch": 0.9385661957090529, + "grad_norm": 2.014284349621675, + "learning_rate": 1.9730612093188716e-07, + "loss": 0.8685, + "step": 8968 + }, + { + "epoch": 0.9386708529565673, + "grad_norm": 2.4120062967864513, + "learning_rate": 1.966366510099682e-07, + "loss": 0.9244, + "step": 8969 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 2.3280174180689035, + "learning_rate": 1.9596830751437034e-07, + "loss": 0.9299, + "step": 8970 + }, + { + "epoch": 0.938880167451596, + "grad_norm": 2.074408047932079, + "learning_rate": 1.9530109052188883e-07, + "loss": 0.9401, + "step": 8971 + }, + { + "epoch": 0.9389848246991104, + "grad_norm": 1.7420567087126304, + "learning_rate": 1.946350001091879e-07, + "loss": 0.669, + "step": 8972 + }, + { + "epoch": 0.9390894819466248, + "grad_norm": 2.2410970937411743, + "learning_rate": 1.9397003635280408e-07, + "loss": 0.8553, + "step": 8973 + }, + { + "epoch": 0.9391941391941392, + "grad_norm": 1.9769963769064203, + "learning_rate": 1.9330619932914184e-07, + "loss": 0.7936, + "step": 8974 + }, + { + "epoch": 0.9392987964416536, + "grad_norm": 2.37273596450195, + "learning_rate": 1.9264348911447795e-07, + "loss": 0.7941, + "step": 8975 + }, + { + "epoch": 0.939403453689168, + "grad_norm": 2.0813158115777854, + "learning_rate": 1.91981905784957e-07, + "loss": 0.9177, + "step": 8976 + }, + { + "epoch": 0.9395081109366824, + "grad_norm": 2.290897372588027, + "learning_rate": 1.9132144941659935e-07, + "loss": 0.8598, + "step": 8977 + }, + { + "epoch": 0.9396127681841967, + "grad_norm": 2.2230141215809534, + "learning_rate": 1.9066212008529095e-07, + "loss": 0.9614, + "step": 8978 + }, + { + "epoch": 0.9397174254317111, + "grad_norm": 1.9411269671666274, + "learning_rate": 1.9000391786679118e-07, + "loss": 0.8767, + "step": 8979 + }, + { + "epoch": 0.9398220826792255, + "grad_norm": 1.9684623940624517, + "learning_rate": 1.8934684283672844e-07, + "loss": 0.7992, + "step": 8980 + }, + { + "epoch": 0.9399267399267399, + "grad_norm": 2.2097045916457687, + "learning_rate": 1.886908950706001e-07, + "loss": 0.7911, + "step": 8981 + }, + { + "epoch": 0.9400313971742543, + "grad_norm": 2.2051242891982823, + "learning_rate": 1.8803607464377816e-07, + "loss": 0.8268, + "step": 8982 + }, + { + "epoch": 0.9401360544217687, + "grad_norm": 2.0731015805325947, + "learning_rate": 1.8738238163150346e-07, + "loss": 0.9092, + "step": 8983 + }, + { + "epoch": 0.9402407116692831, + "grad_norm": 2.0982860002338235, + "learning_rate": 1.8672981610888484e-07, + "loss": 0.9503, + "step": 8984 + }, + { + "epoch": 0.9403453689167975, + "grad_norm": 2.0377487369557055, + "learning_rate": 1.8607837815090345e-07, + "loss": 0.9304, + "step": 8985 + }, + { + "epoch": 0.9404500261643118, + "grad_norm": 2.2351989370813543, + "learning_rate": 1.8542806783241053e-07, + "loss": 0.8992, + "step": 8986 + }, + { + "epoch": 0.9405546834118262, + "grad_norm": 2.0623835431044477, + "learning_rate": 1.8477888522812847e-07, + "loss": 0.9759, + "step": 8987 + }, + { + "epoch": 0.9406593406593406, + "grad_norm": 2.2769860510032776, + "learning_rate": 1.8413083041265213e-07, + "loss": 0.8573, + "step": 8988 + }, + { + "epoch": 0.940763997906855, + "grad_norm": 1.9523228068902658, + "learning_rate": 1.8348390346044187e-07, + "loss": 0.7817, + "step": 8989 + }, + { + "epoch": 0.9408686551543695, + "grad_norm": 2.18666471676111, + "learning_rate": 1.828381044458305e-07, + "loss": 0.9009, + "step": 8990 + }, + { + "epoch": 0.9409733124018839, + "grad_norm": 1.9241205019311667, + "learning_rate": 1.8219343344302198e-07, + "loss": 0.8631, + "step": 8991 + }, + { + "epoch": 0.9410779696493983, + "grad_norm": 2.04039821239051, + "learning_rate": 1.815498905260915e-07, + "loss": 0.8371, + "step": 8992 + }, + { + "epoch": 0.9411826268969126, + "grad_norm": 2.3419236200201516, + "learning_rate": 1.8090747576898217e-07, + "loss": 0.8494, + "step": 8993 + }, + { + "epoch": 0.941287284144427, + "grad_norm": 2.1610439047997145, + "learning_rate": 1.8026618924551041e-07, + "loss": 0.8086, + "step": 8994 + }, + { + "epoch": 0.9413919413919414, + "grad_norm": 1.9220224743991272, + "learning_rate": 1.7962603102935848e-07, + "loss": 0.8518, + "step": 8995 + }, + { + "epoch": 0.9414965986394558, + "grad_norm": 2.0058482503008763, + "learning_rate": 1.789870011940842e-07, + "loss": 0.7359, + "step": 8996 + }, + { + "epoch": 0.9416012558869702, + "grad_norm": 2.1650612466125145, + "learning_rate": 1.7834909981311321e-07, + "loss": 0.9187, + "step": 8997 + }, + { + "epoch": 0.9417059131344846, + "grad_norm": 2.1222322501737008, + "learning_rate": 1.7771232695974028e-07, + "loss": 0.9376, + "step": 8998 + }, + { + "epoch": 0.941810570381999, + "grad_norm": 1.8756795331701073, + "learning_rate": 1.7707668270713463e-07, + "loss": 0.8314, + "step": 8999 + }, + { + "epoch": 0.9419152276295133, + "grad_norm": 2.0250266930245373, + "learning_rate": 1.7644216712832897e-07, + "loss": 0.8112, + "step": 9000 + }, + { + "epoch": 0.9420198848770277, + "grad_norm": 1.8965489062179972, + "learning_rate": 1.7580878029623382e-07, + "loss": 0.8542, + "step": 9001 + }, + { + "epoch": 0.9421245421245421, + "grad_norm": 2.0182139408122657, + "learning_rate": 1.751765222836266e-07, + "loss": 0.9074, + "step": 9002 + }, + { + "epoch": 0.9422291993720565, + "grad_norm": 1.915756504936258, + "learning_rate": 1.7454539316315356e-07, + "loss": 0.9071, + "step": 9003 + }, + { + "epoch": 0.9423338566195709, + "grad_norm": 1.8890927536983408, + "learning_rate": 1.7391539300733118e-07, + "loss": 0.7825, + "step": 9004 + }, + { + "epoch": 0.9424385138670853, + "grad_norm": 2.0792217263743016, + "learning_rate": 1.7328652188855044e-07, + "loss": 0.8445, + "step": 9005 + }, + { + "epoch": 0.9425431711145997, + "grad_norm": 2.2281920177680767, + "learning_rate": 1.726587798790702e-07, + "loss": 0.7547, + "step": 9006 + }, + { + "epoch": 0.942647828362114, + "grad_norm": 2.3393383954988853, + "learning_rate": 1.7203216705101834e-07, + "loss": 0.8758, + "step": 9007 + }, + { + "epoch": 0.9427524856096284, + "grad_norm": 2.025231383373048, + "learning_rate": 1.714066834763939e-07, + "loss": 0.8952, + "step": 9008 + }, + { + "epoch": 0.9428571428571428, + "grad_norm": 1.9231761232116151, + "learning_rate": 1.7078232922706495e-07, + "loss": 0.8794, + "step": 9009 + }, + { + "epoch": 0.9429618001046572, + "grad_norm": 2.55540070865077, + "learning_rate": 1.70159104374773e-07, + "loss": 0.8213, + "step": 9010 + }, + { + "epoch": 0.9430664573521716, + "grad_norm": 2.2864462638812113, + "learning_rate": 1.6953700899112746e-07, + "loss": 0.9661, + "step": 9011 + }, + { + "epoch": 0.9431711145996861, + "grad_norm": 2.022644471342003, + "learning_rate": 1.689160431476078e-07, + "loss": 0.7653, + "step": 9012 + }, + { + "epoch": 0.9432757718472005, + "grad_norm": 1.9668619711190107, + "learning_rate": 1.682962069155636e-07, + "loss": 0.8767, + "step": 9013 + }, + { + "epoch": 0.9433804290947148, + "grad_norm": 2.125692202668173, + "learning_rate": 1.6767750036621677e-07, + "loss": 0.8827, + "step": 9014 + }, + { + "epoch": 0.9434850863422292, + "grad_norm": 1.9970347338306567, + "learning_rate": 1.6705992357065713e-07, + "loss": 0.8181, + "step": 9015 + }, + { + "epoch": 0.9435897435897436, + "grad_norm": 2.1417706192896504, + "learning_rate": 1.6644347659984571e-07, + "loss": 0.8472, + "step": 9016 + }, + { + "epoch": 0.943694400837258, + "grad_norm": 2.4107829527631517, + "learning_rate": 1.6582815952461474e-07, + "loss": 0.9227, + "step": 9017 + }, + { + "epoch": 0.9437990580847724, + "grad_norm": 2.272587316891678, + "learning_rate": 1.6521397241566317e-07, + "loss": 0.7621, + "step": 9018 + }, + { + "epoch": 0.9439037153322868, + "grad_norm": 1.9616247365575001, + "learning_rate": 1.6460091534356238e-07, + "loss": 0.8642, + "step": 9019 + }, + { + "epoch": 0.9440083725798012, + "grad_norm": 2.213024695754038, + "learning_rate": 1.6398898837875598e-07, + "loss": 0.8659, + "step": 9020 + }, + { + "epoch": 0.9441130298273155, + "grad_norm": 2.18849504081884, + "learning_rate": 1.6337819159155556e-07, + "loss": 0.8914, + "step": 9021 + }, + { + "epoch": 0.9442176870748299, + "grad_norm": 1.9962396515971108, + "learning_rate": 1.6276852505213937e-07, + "loss": 0.8487, + "step": 9022 + }, + { + "epoch": 0.9443223443223443, + "grad_norm": 1.60529112595556, + "learning_rate": 1.621599888305636e-07, + "loss": 0.8027, + "step": 9023 + }, + { + "epoch": 0.9444270015698587, + "grad_norm": 2.3647898369898535, + "learning_rate": 1.615525829967479e-07, + "loss": 0.8755, + "step": 9024 + }, + { + "epoch": 0.9445316588173731, + "grad_norm": 2.199820408495756, + "learning_rate": 1.6094630762048535e-07, + "loss": 0.9419, + "step": 9025 + }, + { + "epoch": 0.9446363160648875, + "grad_norm": 2.1623987139154295, + "learning_rate": 1.6034116277143795e-07, + "loss": 0.9243, + "step": 9026 + }, + { + "epoch": 0.9447409733124019, + "grad_norm": 2.223255861105165, + "learning_rate": 1.5973714851913792e-07, + "loss": 0.9347, + "step": 9027 + }, + { + "epoch": 0.9448456305599163, + "grad_norm": 2.227214544210135, + "learning_rate": 1.5913426493298745e-07, + "loss": 0.8761, + "step": 9028 + }, + { + "epoch": 0.9449502878074306, + "grad_norm": 2.0771281382507465, + "learning_rate": 1.5853251208225895e-07, + "loss": 0.8977, + "step": 9029 + }, + { + "epoch": 0.945054945054945, + "grad_norm": 2.1250598064265716, + "learning_rate": 1.57931890036096e-07, + "loss": 0.8215, + "step": 9030 + }, + { + "epoch": 0.9451596023024594, + "grad_norm": 1.906429846009798, + "learning_rate": 1.5733239886351114e-07, + "loss": 0.8799, + "step": 9031 + }, + { + "epoch": 0.9452642595499738, + "grad_norm": 2.0717540052678385, + "learning_rate": 1.567340386333871e-07, + "loss": 0.8241, + "step": 9032 + }, + { + "epoch": 0.9453689167974882, + "grad_norm": 2.2358749882717532, + "learning_rate": 1.5613680941447663e-07, + "loss": 0.878, + "step": 9033 + }, + { + "epoch": 0.9454735740450027, + "grad_norm": 2.3438037978057498, + "learning_rate": 1.555407112754015e-07, + "loss": 0.9401, + "step": 9034 + }, + { + "epoch": 0.9455782312925171, + "grad_norm": 1.8342854640981783, + "learning_rate": 1.549457442846558e-07, + "loss": 0.9056, + "step": 9035 + }, + { + "epoch": 0.9456828885400314, + "grad_norm": 1.9116944068383945, + "learning_rate": 1.5435190851060156e-07, + "loss": 0.7609, + "step": 9036 + }, + { + "epoch": 0.9457875457875458, + "grad_norm": 1.7292150071570478, + "learning_rate": 1.5375920402147305e-07, + "loss": 0.7398, + "step": 9037 + }, + { + "epoch": 0.9458922030350602, + "grad_norm": 2.181589596999383, + "learning_rate": 1.5316763088537135e-07, + "loss": 0.8476, + "step": 9038 + }, + { + "epoch": 0.9459968602825746, + "grad_norm": 2.1307840848771447, + "learning_rate": 1.52577189170271e-07, + "loss": 0.8192, + "step": 9039 + }, + { + "epoch": 0.946101517530089, + "grad_norm": 2.360122477075766, + "learning_rate": 1.5198787894401433e-07, + "loss": 0.9385, + "step": 9040 + }, + { + "epoch": 0.9462061747776034, + "grad_norm": 2.456600162959172, + "learning_rate": 1.5139970027431505e-07, + "loss": 0.87, + "step": 9041 + }, + { + "epoch": 0.9463108320251178, + "grad_norm": 1.9169412412415379, + "learning_rate": 1.508126532287557e-07, + "loss": 0.8699, + "step": 9042 + }, + { + "epoch": 0.9464154892726321, + "grad_norm": 2.2877674182580297, + "learning_rate": 1.5022673787478793e-07, + "loss": 0.8462, + "step": 9043 + }, + { + "epoch": 0.9465201465201465, + "grad_norm": 2.039640660135877, + "learning_rate": 1.496419542797356e-07, + "loss": 0.7836, + "step": 9044 + }, + { + "epoch": 0.9466248037676609, + "grad_norm": 1.9739212006187399, + "learning_rate": 1.4905830251079167e-07, + "loss": 0.9003, + "step": 9045 + }, + { + "epoch": 0.9467294610151753, + "grad_norm": 1.9770702491808176, + "learning_rate": 1.4847578263501916e-07, + "loss": 0.8087, + "step": 9046 + }, + { + "epoch": 0.9468341182626897, + "grad_norm": 2.25033314641664, + "learning_rate": 1.4789439471935008e-07, + "loss": 0.9281, + "step": 9047 + }, + { + "epoch": 0.9469387755102041, + "grad_norm": 2.2873109923661126, + "learning_rate": 1.4731413883058655e-07, + "loss": 0.9335, + "step": 9048 + }, + { + "epoch": 0.9470434327577185, + "grad_norm": 2.151440014464125, + "learning_rate": 1.4673501503540187e-07, + "loss": 0.8403, + "step": 9049 + }, + { + "epoch": 0.9471480900052328, + "grad_norm": 1.8386378234588971, + "learning_rate": 1.4615702340033954e-07, + "loss": 0.8304, + "step": 9050 + }, + { + "epoch": 0.9472527472527472, + "grad_norm": 2.11039769161247, + "learning_rate": 1.4558016399181086e-07, + "loss": 0.8667, + "step": 9051 + }, + { + "epoch": 0.9473574045002616, + "grad_norm": 2.2668021084417527, + "learning_rate": 1.4500443687609945e-07, + "loss": 0.9028, + "step": 9052 + }, + { + "epoch": 0.947462061747776, + "grad_norm": 2.5925480089352844, + "learning_rate": 1.4442984211935463e-07, + "loss": 0.9285, + "step": 9053 + }, + { + "epoch": 0.9475667189952904, + "grad_norm": 1.9257446949630599, + "learning_rate": 1.4385637978760026e-07, + "loss": 0.8561, + "step": 9054 + }, + { + "epoch": 0.9476713762428048, + "grad_norm": 2.2283800756599264, + "learning_rate": 1.4328404994672917e-07, + "loss": 0.8976, + "step": 9055 + }, + { + "epoch": 0.9477760334903192, + "grad_norm": 1.8811361955772714, + "learning_rate": 1.4271285266250323e-07, + "loss": 0.8489, + "step": 9056 + }, + { + "epoch": 0.9478806907378335, + "grad_norm": 2.2540466745301493, + "learning_rate": 1.4214278800055102e-07, + "loss": 0.7986, + "step": 9057 + }, + { + "epoch": 0.947985347985348, + "grad_norm": 1.7605020970876353, + "learning_rate": 1.4157385602637685e-07, + "loss": 0.7758, + "step": 9058 + }, + { + "epoch": 0.9480900052328624, + "grad_norm": 2.0483519247325197, + "learning_rate": 1.4100605680535284e-07, + "loss": 0.8848, + "step": 9059 + }, + { + "epoch": 0.9481946624803768, + "grad_norm": 1.814643560597211, + "learning_rate": 1.4043939040272014e-07, + "loss": 0.8009, + "step": 9060 + }, + { + "epoch": 0.9482993197278912, + "grad_norm": 2.1844789581596333, + "learning_rate": 1.398738568835878e-07, + "loss": 0.848, + "step": 9061 + }, + { + "epoch": 0.9484039769754056, + "grad_norm": 2.08957785838643, + "learning_rate": 1.3930945631293712e-07, + "loss": 0.9124, + "step": 9062 + }, + { + "epoch": 0.94850863422292, + "grad_norm": 2.1549858224284826, + "learning_rate": 1.3874618875561963e-07, + "loss": 0.9237, + "step": 9063 + }, + { + "epoch": 0.9486132914704343, + "grad_norm": 2.029197205102344, + "learning_rate": 1.3818405427635683e-07, + "loss": 0.8428, + "step": 9064 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 2.162465083314952, + "learning_rate": 1.376230529397371e-07, + "loss": 0.953, + "step": 9065 + }, + { + "epoch": 0.9488226059654631, + "grad_norm": 2.2719870379626967, + "learning_rate": 1.370631848102233e-07, + "loss": 0.9192, + "step": 9066 + }, + { + "epoch": 0.9489272632129775, + "grad_norm": 2.3224845795701117, + "learning_rate": 1.3650444995214286e-07, + "loss": 0.8813, + "step": 9067 + }, + { + "epoch": 0.9490319204604919, + "grad_norm": 2.1462838164243014, + "learning_rate": 1.3594684842969664e-07, + "loss": 0.8676, + "step": 9068 + }, + { + "epoch": 0.9491365777080063, + "grad_norm": 1.815945744671672, + "learning_rate": 1.3539038030695338e-07, + "loss": 0.8986, + "step": 9069 + }, + { + "epoch": 0.9492412349555207, + "grad_norm": 1.906488906851326, + "learning_rate": 1.3483504564785422e-07, + "loss": 0.8174, + "step": 9070 + }, + { + "epoch": 0.9493458922030351, + "grad_norm": 2.236795411684707, + "learning_rate": 1.3428084451620694e-07, + "loss": 0.8814, + "step": 9071 + }, + { + "epoch": 0.9494505494505494, + "grad_norm": 2.0557267418605902, + "learning_rate": 1.337277769756895e-07, + "loss": 0.8568, + "step": 9072 + }, + { + "epoch": 0.9495552066980638, + "grad_norm": 2.6957377041769504, + "learning_rate": 1.331758430898522e-07, + "loss": 0.9719, + "step": 9073 + }, + { + "epoch": 0.9496598639455782, + "grad_norm": 1.8970879310242872, + "learning_rate": 1.326250429221121e-07, + "loss": 0.8987, + "step": 9074 + }, + { + "epoch": 0.9497645211930926, + "grad_norm": 1.934705112665449, + "learning_rate": 1.3207537653575851e-07, + "loss": 0.8891, + "step": 9075 + }, + { + "epoch": 0.949869178440607, + "grad_norm": 2.2063646405916124, + "learning_rate": 1.3152684399394877e-07, + "loss": 0.8031, + "step": 9076 + }, + { + "epoch": 0.9499738356881214, + "grad_norm": 1.979196270712241, + "learning_rate": 1.3097944535970908e-07, + "loss": 0.8487, + "step": 9077 + }, + { + "epoch": 0.9500784929356358, + "grad_norm": 2.3747448704287173, + "learning_rate": 1.3043318069593914e-07, + "loss": 0.9484, + "step": 9078 + }, + { + "epoch": 0.9501831501831501, + "grad_norm": 1.9720183796039061, + "learning_rate": 1.2988805006540317e-07, + "loss": 0.7593, + "step": 9079 + }, + { + "epoch": 0.9502878074306645, + "grad_norm": 1.66623523158947, + "learning_rate": 1.2934405353073997e-07, + "loss": 0.7237, + "step": 9080 + }, + { + "epoch": 0.950392464678179, + "grad_norm": 2.010121946011193, + "learning_rate": 1.288011911544551e-07, + "loss": 0.781, + "step": 9081 + }, + { + "epoch": 0.9504971219256934, + "grad_norm": 2.338519331926691, + "learning_rate": 1.282594629989231e-07, + "loss": 0.8435, + "step": 9082 + }, + { + "epoch": 0.9506017791732078, + "grad_norm": 2.1718237588653144, + "learning_rate": 1.2771886912639088e-07, + "loss": 0.8509, + "step": 9083 + }, + { + "epoch": 0.9507064364207222, + "grad_norm": 2.2698280951956, + "learning_rate": 1.2717940959897424e-07, + "loss": 0.857, + "step": 9084 + }, + { + "epoch": 0.9508110936682366, + "grad_norm": 2.1107172447457065, + "learning_rate": 1.2664108447865918e-07, + "loss": 0.7931, + "step": 9085 + }, + { + "epoch": 0.9509157509157509, + "grad_norm": 1.9797524667575515, + "learning_rate": 1.2610389382729738e-07, + "loss": 0.7899, + "step": 9086 + }, + { + "epoch": 0.9510204081632653, + "grad_norm": 2.057673027423047, + "learning_rate": 1.2556783770661497e-07, + "loss": 0.9024, + "step": 9087 + }, + { + "epoch": 0.9511250654107797, + "grad_norm": 2.1671668650036535, + "learning_rate": 1.250329161782049e-07, + "loss": 0.8494, + "step": 9088 + }, + { + "epoch": 0.9512297226582941, + "grad_norm": 2.0877723433934086, + "learning_rate": 1.2449912930353248e-07, + "loss": 0.885, + "step": 9089 + }, + { + "epoch": 0.9513343799058085, + "grad_norm": 2.177290420704625, + "learning_rate": 1.2396647714392974e-07, + "loss": 0.8511, + "step": 9090 + }, + { + "epoch": 0.9514390371533229, + "grad_norm": 2.051463049714876, + "learning_rate": 1.2343495976059882e-07, + "loss": 0.9175, + "step": 9091 + }, + { + "epoch": 0.9515436944008373, + "grad_norm": 1.9910691736016368, + "learning_rate": 1.2290457721461314e-07, + "loss": 0.8602, + "step": 9092 + }, + { + "epoch": 0.9516483516483516, + "grad_norm": 1.9342141459380122, + "learning_rate": 1.22375329566915e-07, + "loss": 0.8365, + "step": 9093 + }, + { + "epoch": 0.951753008895866, + "grad_norm": 1.800982384325224, + "learning_rate": 1.2184721687831468e-07, + "loss": 0.847, + "step": 9094 + }, + { + "epoch": 0.9518576661433804, + "grad_norm": 2.176163511782998, + "learning_rate": 1.2132023920949586e-07, + "loss": 0.8672, + "step": 9095 + }, + { + "epoch": 0.9519623233908948, + "grad_norm": 2.0098442785162343, + "learning_rate": 1.2079439662100567e-07, + "loss": 0.7643, + "step": 9096 + }, + { + "epoch": 0.9520669806384092, + "grad_norm": 2.1686254394715867, + "learning_rate": 1.20269689173268e-07, + "loss": 0.8923, + "step": 9097 + }, + { + "epoch": 0.9521716378859236, + "grad_norm": 2.4479350828010182, + "learning_rate": 1.1974611692657124e-07, + "loss": 0.9458, + "step": 9098 + }, + { + "epoch": 0.952276295133438, + "grad_norm": 1.7758983963507677, + "learning_rate": 1.1922367994107397e-07, + "loss": 0.798, + "step": 9099 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 1.9312505481701443, + "learning_rate": 1.1870237827680708e-07, + "loss": 0.8766, + "step": 9100 + }, + { + "epoch": 0.9524856096284667, + "grad_norm": 2.1489020046415384, + "learning_rate": 1.181822119936682e-07, + "loss": 0.8693, + "step": 9101 + }, + { + "epoch": 0.9525902668759811, + "grad_norm": 1.9604487123502172, + "learning_rate": 1.1766318115142505e-07, + "loss": 0.9279, + "step": 9102 + }, + { + "epoch": 0.9526949241234955, + "grad_norm": 1.979028252970426, + "learning_rate": 1.1714528580971774e-07, + "loss": 0.9074, + "step": 9103 + }, + { + "epoch": 0.95279958137101, + "grad_norm": 2.0147250746415244, + "learning_rate": 1.1662852602805087e-07, + "loss": 0.9098, + "step": 9104 + }, + { + "epoch": 0.9529042386185244, + "grad_norm": 1.941672354841768, + "learning_rate": 1.1611290186580137e-07, + "loss": 0.8705, + "step": 9105 + }, + { + "epoch": 0.9530088958660388, + "grad_norm": 2.081202528742428, + "learning_rate": 1.1559841338221633e-07, + "loss": 0.8618, + "step": 9106 + }, + { + "epoch": 0.9531135531135531, + "grad_norm": 1.900573593944078, + "learning_rate": 1.1508506063641178e-07, + "loss": 0.7274, + "step": 9107 + }, + { + "epoch": 0.9532182103610675, + "grad_norm": 2.2879689467033897, + "learning_rate": 1.1457284368737276e-07, + "loss": 0.8774, + "step": 9108 + }, + { + "epoch": 0.9533228676085819, + "grad_norm": 2.0021870264860553, + "learning_rate": 1.140617625939533e-07, + "loss": 0.8242, + "step": 9109 + }, + { + "epoch": 0.9534275248560963, + "grad_norm": 2.189380584746929, + "learning_rate": 1.1355181741487864e-07, + "loss": 0.9973, + "step": 9110 + }, + { + "epoch": 0.9535321821036107, + "grad_norm": 2.046145341571205, + "learning_rate": 1.1304300820874192e-07, + "loss": 0.9143, + "step": 9111 + }, + { + "epoch": 0.9536368393511251, + "grad_norm": 1.9336419044364952, + "learning_rate": 1.1253533503400638e-07, + "loss": 0.8793, + "step": 9112 + }, + { + "epoch": 0.9537414965986395, + "grad_norm": 2.1511547350474616, + "learning_rate": 1.1202879794900535e-07, + "loss": 0.8096, + "step": 9113 + }, + { + "epoch": 0.9538461538461539, + "grad_norm": 1.9734945556556622, + "learning_rate": 1.1152339701194004e-07, + "loss": 0.808, + "step": 9114 + }, + { + "epoch": 0.9539508110936682, + "grad_norm": 2.0081235379107203, + "learning_rate": 1.1101913228088179e-07, + "loss": 0.8715, + "step": 9115 + }, + { + "epoch": 0.9540554683411826, + "grad_norm": 2.117755926561859, + "learning_rate": 1.1051600381377315e-07, + "loss": 0.8589, + "step": 9116 + }, + { + "epoch": 0.954160125588697, + "grad_norm": 2.0526210136326695, + "learning_rate": 1.100140116684234e-07, + "loss": 0.8793, + "step": 9117 + }, + { + "epoch": 0.9542647828362114, + "grad_norm": 2.112405339433251, + "learning_rate": 1.095131559025131e-07, + "loss": 0.8667, + "step": 9118 + }, + { + "epoch": 0.9543694400837258, + "grad_norm": 2.533576036279591, + "learning_rate": 1.0901343657359175e-07, + "loss": 0.7516, + "step": 9119 + }, + { + "epoch": 0.9544740973312402, + "grad_norm": 2.154561014085007, + "learning_rate": 1.0851485373907677e-07, + "loss": 0.9504, + "step": 9120 + }, + { + "epoch": 0.9545787545787546, + "grad_norm": 2.4408447371630464, + "learning_rate": 1.0801740745625677e-07, + "loss": 0.9668, + "step": 9121 + }, + { + "epoch": 0.9546834118262689, + "grad_norm": 1.9109247782495005, + "learning_rate": 1.0752109778229158e-07, + "loss": 0.8883, + "step": 9122 + }, + { + "epoch": 0.9547880690737833, + "grad_norm": 2.013558136876636, + "learning_rate": 1.070259247742056e-07, + "loss": 0.885, + "step": 9123 + }, + { + "epoch": 0.9548927263212977, + "grad_norm": 2.136904562312622, + "learning_rate": 1.0653188848889551e-07, + "loss": 0.8507, + "step": 9124 + }, + { + "epoch": 0.9549973835688121, + "grad_norm": 1.978016541353706, + "learning_rate": 1.0603898898312704e-07, + "loss": 0.8988, + "step": 9125 + }, + { + "epoch": 0.9551020408163265, + "grad_norm": 2.292835501413161, + "learning_rate": 1.0554722631353709e-07, + "loss": 0.9214, + "step": 9126 + }, + { + "epoch": 0.955206698063841, + "grad_norm": 1.9578723859363283, + "learning_rate": 1.0505660053662825e-07, + "loss": 0.807, + "step": 9127 + }, + { + "epoch": 0.9553113553113554, + "grad_norm": 2.074313232154308, + "learning_rate": 1.0456711170877543e-07, + "loss": 0.865, + "step": 9128 + }, + { + "epoch": 0.9554160125588697, + "grad_norm": 2.4188323428193166, + "learning_rate": 1.0407875988622252e-07, + "loss": 0.8573, + "step": 9129 + }, + { + "epoch": 0.9555206698063841, + "grad_norm": 1.9965673098227608, + "learning_rate": 1.0359154512508019e-07, + "loss": 0.8046, + "step": 9130 + }, + { + "epoch": 0.9556253270538985, + "grad_norm": 2.424445738477294, + "learning_rate": 1.0310546748133143e-07, + "loss": 1.0085, + "step": 9131 + }, + { + "epoch": 0.9557299843014129, + "grad_norm": 1.9689504274155718, + "learning_rate": 1.0262052701082825e-07, + "loss": 0.8351, + "step": 9132 + }, + { + "epoch": 0.9558346415489273, + "grad_norm": 2.108713019030409, + "learning_rate": 1.021367237692894e-07, + "loss": 0.9932, + "step": 9133 + }, + { + "epoch": 0.9559392987964417, + "grad_norm": 1.7814011137810855, + "learning_rate": 1.0165405781230708e-07, + "loss": 0.8543, + "step": 9134 + }, + { + "epoch": 0.9560439560439561, + "grad_norm": 2.458888218864724, + "learning_rate": 1.0117252919533804e-07, + "loss": 0.9106, + "step": 9135 + }, + { + "epoch": 0.9561486132914704, + "grad_norm": 2.454880605782708, + "learning_rate": 1.0069213797371357e-07, + "loss": 0.8216, + "step": 9136 + }, + { + "epoch": 0.9562532705389848, + "grad_norm": 2.3292738121246934, + "learning_rate": 1.0021288420263065e-07, + "loss": 0.8631, + "step": 9137 + }, + { + "epoch": 0.9563579277864992, + "grad_norm": 1.9940769879892457, + "learning_rate": 9.97347679371563e-08, + "loss": 0.9346, + "step": 9138 + }, + { + "epoch": 0.9564625850340136, + "grad_norm": 2.168224739156848, + "learning_rate": 9.925778923222773e-08, + "loss": 0.9215, + "step": 9139 + }, + { + "epoch": 0.956567242281528, + "grad_norm": 2.0395622501364605, + "learning_rate": 9.878194814264886e-08, + "loss": 0.9338, + "step": 9140 + }, + { + "epoch": 0.9566718995290424, + "grad_norm": 2.2995306333544696, + "learning_rate": 9.830724472309594e-08, + "loss": 0.9621, + "step": 9141 + }, + { + "epoch": 0.9567765567765568, + "grad_norm": 1.9195393542003314, + "learning_rate": 9.783367902811424e-08, + "loss": 0.9511, + "step": 9142 + }, + { + "epoch": 0.9568812140240711, + "grad_norm": 2.001176501644658, + "learning_rate": 9.73612511121158e-08, + "loss": 0.9361, + "step": 9143 + }, + { + "epoch": 0.9569858712715855, + "grad_norm": 2.1602312695732224, + "learning_rate": 9.688996102938497e-08, + "loss": 0.7655, + "step": 9144 + }, + { + "epoch": 0.9570905285190999, + "grad_norm": 2.3661293111777515, + "learning_rate": 9.641980883407287e-08, + "loss": 0.9697, + "step": 9145 + }, + { + "epoch": 0.9571951857666143, + "grad_norm": 1.9875384007327113, + "learning_rate": 9.595079458020185e-08, + "loss": 0.7232, + "step": 9146 + }, + { + "epoch": 0.9572998430141287, + "grad_norm": 1.5923915548495342, + "learning_rate": 9.548291832166213e-08, + "loss": 0.8366, + "step": 9147 + }, + { + "epoch": 0.9574045002616431, + "grad_norm": 2.2856279399112225, + "learning_rate": 9.501618011221291e-08, + "loss": 0.9121, + "step": 9148 + }, + { + "epoch": 0.9575091575091575, + "grad_norm": 2.1702328491811045, + "learning_rate": 9.455058000548467e-08, + "loss": 0.8221, + "step": 9149 + }, + { + "epoch": 0.957613814756672, + "grad_norm": 2.169354118160728, + "learning_rate": 9.408611805497459e-08, + "loss": 0.8867, + "step": 9150 + }, + { + "epoch": 0.9577184720041862, + "grad_norm": 1.9949827280347652, + "learning_rate": 9.36227943140522e-08, + "loss": 0.8586, + "step": 9151 + }, + { + "epoch": 0.9578231292517007, + "grad_norm": 2.6143300269638288, + "learning_rate": 9.316060883595158e-08, + "loss": 0.8846, + "step": 9152 + }, + { + "epoch": 0.9579277864992151, + "grad_norm": 2.0238769325890096, + "learning_rate": 9.269956167378136e-08, + "loss": 0.863, + "step": 9153 + }, + { + "epoch": 0.9580324437467295, + "grad_norm": 1.7312936279188003, + "learning_rate": 9.223965288051474e-08, + "loss": 0.775, + "step": 9154 + }, + { + "epoch": 0.9581371009942439, + "grad_norm": 1.679219951168678, + "learning_rate": 9.17808825089983e-08, + "loss": 0.7436, + "step": 9155 + }, + { + "epoch": 0.9582417582417583, + "grad_norm": 2.0696046017408114, + "learning_rate": 9.132325061194325e-08, + "loss": 0.8891, + "step": 9156 + }, + { + "epoch": 0.9583464154892727, + "grad_norm": 1.9650852136731827, + "learning_rate": 9.086675724193528e-08, + "loss": 0.8477, + "step": 9157 + }, + { + "epoch": 0.958451072736787, + "grad_norm": 1.9933766746277393, + "learning_rate": 9.041140245142355e-08, + "loss": 0.9044, + "step": 9158 + }, + { + "epoch": 0.9585557299843014, + "grad_norm": 2.43537039286737, + "learning_rate": 8.995718629273065e-08, + "loss": 0.9137, + "step": 9159 + }, + { + "epoch": 0.9586603872318158, + "grad_norm": 2.0921310588419493, + "learning_rate": 8.950410881804706e-08, + "loss": 0.9069, + "step": 9160 + }, + { + "epoch": 0.9587650444793302, + "grad_norm": 2.1188665014531147, + "learning_rate": 8.905217007943223e-08, + "loss": 0.8711, + "step": 9161 + }, + { + "epoch": 0.9588697017268446, + "grad_norm": 1.8922155563080787, + "learning_rate": 8.860137012881465e-08, + "loss": 0.9361, + "step": 9162 + }, + { + "epoch": 0.958974358974359, + "grad_norm": 1.711472138483851, + "learning_rate": 8.815170901799174e-08, + "loss": 0.7832, + "step": 9163 + }, + { + "epoch": 0.9590790162218734, + "grad_norm": 2.05901764490939, + "learning_rate": 8.770318679862999e-08, + "loss": 0.791, + "step": 9164 + }, + { + "epoch": 0.9591836734693877, + "grad_norm": 2.0760777814710285, + "learning_rate": 8.725580352226815e-08, + "loss": 0.8347, + "step": 9165 + }, + { + "epoch": 0.9592883307169021, + "grad_norm": 1.904314569738717, + "learning_rate": 8.680955924030954e-08, + "loss": 0.8394, + "step": 9166 + }, + { + "epoch": 0.9593929879644165, + "grad_norm": 1.5764956424130905, + "learning_rate": 8.63644540040276e-08, + "loss": 0.6921, + "step": 9167 + }, + { + "epoch": 0.9594976452119309, + "grad_norm": 1.9033111511642289, + "learning_rate": 8.592048786456698e-08, + "loss": 0.7482, + "step": 9168 + }, + { + "epoch": 0.9596023024594453, + "grad_norm": 2.07972888411746, + "learning_rate": 8.547766087294018e-08, + "loss": 0.8112, + "step": 9169 + }, + { + "epoch": 0.9597069597069597, + "grad_norm": 2.1173563174976393, + "learning_rate": 8.503597308002986e-08, + "loss": 0.7684, + "step": 9170 + }, + { + "epoch": 0.9598116169544741, + "grad_norm": 2.3978904207022067, + "learning_rate": 8.459542453658653e-08, + "loss": 0.9662, + "step": 9171 + }, + { + "epoch": 0.9599162742019884, + "grad_norm": 2.015486093729567, + "learning_rate": 8.415601529322858e-08, + "loss": 0.9097, + "step": 9172 + }, + { + "epoch": 0.9600209314495028, + "grad_norm": 2.100667884440473, + "learning_rate": 8.371774540044564e-08, + "loss": 0.8485, + "step": 9173 + }, + { + "epoch": 0.9601255886970173, + "grad_norm": 2.2677052547515455, + "learning_rate": 8.328061490859629e-08, + "loss": 0.805, + "step": 9174 + }, + { + "epoch": 0.9602302459445317, + "grad_norm": 2.502950900699366, + "learning_rate": 8.284462386790815e-08, + "loss": 0.8131, + "step": 9175 + }, + { + "epoch": 0.9603349031920461, + "grad_norm": 1.845406579180182, + "learning_rate": 8.240977232847669e-08, + "loss": 0.8898, + "step": 9176 + }, + { + "epoch": 0.9604395604395605, + "grad_norm": 2.644639245831181, + "learning_rate": 8.19760603402675e-08, + "loss": 0.9884, + "step": 9177 + }, + { + "epoch": 0.9605442176870749, + "grad_norm": 2.0136239922533767, + "learning_rate": 8.154348795311518e-08, + "loss": 0.9439, + "step": 9178 + }, + { + "epoch": 0.9606488749345892, + "grad_norm": 2.508687484109057, + "learning_rate": 8.111205521672105e-08, + "loss": 0.842, + "step": 9179 + }, + { + "epoch": 0.9607535321821036, + "grad_norm": 2.247230140286789, + "learning_rate": 8.068176218066104e-08, + "loss": 0.7672, + "step": 9180 + }, + { + "epoch": 0.960858189429618, + "grad_norm": 2.1501454719365465, + "learning_rate": 8.025260889437558e-08, + "loss": 0.906, + "step": 9181 + }, + { + "epoch": 0.9609628466771324, + "grad_norm": 2.2857097799522395, + "learning_rate": 7.982459540717413e-08, + "loss": 0.8782, + "step": 9182 + }, + { + "epoch": 0.9610675039246468, + "grad_norm": 2.418986186789819, + "learning_rate": 7.939772176823735e-08, + "loss": 0.8045, + "step": 9183 + }, + { + "epoch": 0.9611721611721612, + "grad_norm": 2.1459549533496594, + "learning_rate": 7.897198802661266e-08, + "loss": 0.9515, + "step": 9184 + }, + { + "epoch": 0.9612768184196756, + "grad_norm": 2.109672695130541, + "learning_rate": 7.854739423121982e-08, + "loss": 0.8162, + "step": 9185 + }, + { + "epoch": 0.9613814756671899, + "grad_norm": 2.0736403186020205, + "learning_rate": 7.812394043084315e-08, + "loss": 0.9333, + "step": 9186 + }, + { + "epoch": 0.9614861329147043, + "grad_norm": 1.9672228441543578, + "learning_rate": 7.770162667413928e-08, + "loss": 0.878, + "step": 9187 + }, + { + "epoch": 0.9615907901622187, + "grad_norm": 2.152750014034463, + "learning_rate": 7.728045300963383e-08, + "loss": 0.8695, + "step": 9188 + }, + { + "epoch": 0.9616954474097331, + "grad_norm": 2.1946243167100854, + "learning_rate": 7.686041948571921e-08, + "loss": 0.8581, + "step": 9189 + }, + { + "epoch": 0.9618001046572475, + "grad_norm": 1.7958639695003695, + "learning_rate": 7.644152615065903e-08, + "loss": 0.9317, + "step": 9190 + }, + { + "epoch": 0.9619047619047619, + "grad_norm": 2.103440015443088, + "learning_rate": 7.602377305258479e-08, + "loss": 0.8852, + "step": 9191 + }, + { + "epoch": 0.9620094191522763, + "grad_norm": 1.9632826033869528, + "learning_rate": 7.560716023949699e-08, + "loss": 0.9123, + "step": 9192 + }, + { + "epoch": 0.9621140763997907, + "grad_norm": 2.582600940053974, + "learning_rate": 7.51916877592651e-08, + "loss": 0.7138, + "step": 9193 + }, + { + "epoch": 0.962218733647305, + "grad_norm": 2.426593772598077, + "learning_rate": 7.477735565962873e-08, + "loss": 0.9125, + "step": 9194 + }, + { + "epoch": 0.9623233908948194, + "grad_norm": 2.0534735800973913, + "learning_rate": 7.436416398819313e-08, + "loss": 0.9127, + "step": 9195 + }, + { + "epoch": 0.9624280481423338, + "grad_norm": 1.8146955476652327, + "learning_rate": 7.395211279243808e-08, + "loss": 0.7993, + "step": 9196 + }, + { + "epoch": 0.9625327053898483, + "grad_norm": 2.2323319981904874, + "learning_rate": 7.354120211970684e-08, + "loss": 0.9055, + "step": 9197 + }, + { + "epoch": 0.9626373626373627, + "grad_norm": 1.846938976318557, + "learning_rate": 7.313143201721384e-08, + "loss": 0.7602, + "step": 9198 + }, + { + "epoch": 0.9627420198848771, + "grad_norm": 1.9136272699219579, + "learning_rate": 7.272280253204367e-08, + "loss": 0.7391, + "step": 9199 + }, + { + "epoch": 0.9628466771323915, + "grad_norm": 2.430133835674404, + "learning_rate": 7.231531371114875e-08, + "loss": 0.8995, + "step": 9200 + }, + { + "epoch": 0.9629513343799058, + "grad_norm": 2.131458703559915, + "learning_rate": 7.190896560134942e-08, + "loss": 0.8002, + "step": 9201 + }, + { + "epoch": 0.9630559916274202, + "grad_norm": 1.9827611709558692, + "learning_rate": 7.150375824933608e-08, + "loss": 0.8915, + "step": 9202 + }, + { + "epoch": 0.9631606488749346, + "grad_norm": 2.1954279263735317, + "learning_rate": 7.10996917016682e-08, + "loss": 0.9135, + "step": 9203 + }, + { + "epoch": 0.963265306122449, + "grad_norm": 2.150004140942097, + "learning_rate": 7.069676600477304e-08, + "loss": 0.8268, + "step": 9204 + }, + { + "epoch": 0.9633699633699634, + "grad_norm": 2.2008699779019256, + "learning_rate": 7.029498120494916e-08, + "loss": 0.8064, + "step": 9205 + }, + { + "epoch": 0.9634746206174778, + "grad_norm": 1.9973249721182413, + "learning_rate": 6.989433734836182e-08, + "loss": 0.8953, + "step": 9206 + }, + { + "epoch": 0.9635792778649922, + "grad_norm": 2.1012015027082023, + "learning_rate": 6.949483448104422e-08, + "loss": 0.8212, + "step": 9207 + }, + { + "epoch": 0.9636839351125065, + "grad_norm": 2.252680211089396, + "learning_rate": 6.909647264890296e-08, + "loss": 0.9004, + "step": 9208 + }, + { + "epoch": 0.9637885923600209, + "grad_norm": 2.0623106938542444, + "learning_rate": 6.869925189770809e-08, + "loss": 0.8154, + "step": 9209 + }, + { + "epoch": 0.9638932496075353, + "grad_norm": 2.4261216634526113, + "learning_rate": 6.830317227310202e-08, + "loss": 0.9477, + "step": 9210 + }, + { + "epoch": 0.9639979068550497, + "grad_norm": 2.2540675121561056, + "learning_rate": 6.790823382059497e-08, + "loss": 0.947, + "step": 9211 + }, + { + "epoch": 0.9641025641025641, + "grad_norm": 2.1048027694750844, + "learning_rate": 6.751443658556733e-08, + "loss": 0.9247, + "step": 9212 + }, + { + "epoch": 0.9642072213500785, + "grad_norm": 2.0158913138500045, + "learning_rate": 6.712178061326513e-08, + "loss": 0.9507, + "step": 9213 + }, + { + "epoch": 0.9643118785975929, + "grad_norm": 2.003911985362856, + "learning_rate": 6.673026594880672e-08, + "loss": 0.8659, + "step": 9214 + }, + { + "epoch": 0.9644165358451072, + "grad_norm": 1.9811498476133749, + "learning_rate": 6.633989263717722e-08, + "loss": 0.8903, + "step": 9215 + }, + { + "epoch": 0.9645211930926216, + "grad_norm": 2.232106088676546, + "learning_rate": 6.595066072323297e-08, + "loss": 0.8571, + "step": 9216 + }, + { + "epoch": 0.964625850340136, + "grad_norm": 2.5558019802236105, + "learning_rate": 6.556257025169488e-08, + "loss": 0.787, + "step": 9217 + }, + { + "epoch": 0.9647305075876504, + "grad_norm": 1.8740286415803826, + "learning_rate": 6.517562126715726e-08, + "loss": 0.7737, + "step": 9218 + }, + { + "epoch": 0.9648351648351648, + "grad_norm": 2.2998119940125403, + "learning_rate": 6.478981381408012e-08, + "loss": 0.8484, + "step": 9219 + }, + { + "epoch": 0.9649398220826793, + "grad_norm": 2.1309929398704592, + "learning_rate": 6.440514793679465e-08, + "loss": 0.9934, + "step": 9220 + }, + { + "epoch": 0.9650444793301937, + "grad_norm": 1.6771005831362835, + "learning_rate": 6.402162367949883e-08, + "loss": 0.7815, + "step": 9221 + }, + { + "epoch": 0.965149136577708, + "grad_norm": 2.1014499143649603, + "learning_rate": 6.363924108626074e-08, + "loss": 0.8083, + "step": 9222 + }, + { + "epoch": 0.9652537938252224, + "grad_norm": 1.896185187952413, + "learning_rate": 6.325800020101747e-08, + "loss": 0.7202, + "step": 9223 + }, + { + "epoch": 0.9653584510727368, + "grad_norm": 1.9377624373854216, + "learning_rate": 6.287790106757396e-08, + "loss": 0.8109, + "step": 9224 + }, + { + "epoch": 0.9654631083202512, + "grad_norm": 2.079814185022681, + "learning_rate": 6.249894372960419e-08, + "loss": 0.8617, + "step": 9225 + }, + { + "epoch": 0.9655677655677656, + "grad_norm": 1.9888426701736415, + "learning_rate": 6.212112823065109e-08, + "loss": 0.7941, + "step": 9226 + }, + { + "epoch": 0.96567242281528, + "grad_norm": 2.014484823659193, + "learning_rate": 6.174445461412659e-08, + "loss": 0.9727, + "step": 9227 + }, + { + "epoch": 0.9657770800627944, + "grad_norm": 2.114946519857328, + "learning_rate": 6.136892292331165e-08, + "loss": 0.9009, + "step": 9228 + }, + { + "epoch": 0.9658817373103087, + "grad_norm": 2.464678652778644, + "learning_rate": 6.099453320135396e-08, + "loss": 0.8523, + "step": 9229 + }, + { + "epoch": 0.9659863945578231, + "grad_norm": 2.064257270218177, + "learning_rate": 6.062128549127355e-08, + "loss": 0.7611, + "step": 9230 + }, + { + "epoch": 0.9660910518053375, + "grad_norm": 2.434856201533525, + "learning_rate": 6.024917983595613e-08, + "loss": 0.8406, + "step": 9231 + }, + { + "epoch": 0.9661957090528519, + "grad_norm": 2.3887496596256113, + "learning_rate": 5.98782162781586e-08, + "loss": 0.8096, + "step": 9232 + }, + { + "epoch": 0.9663003663003663, + "grad_norm": 2.365481278236734, + "learning_rate": 5.9508394860504635e-08, + "loss": 0.8159, + "step": 9233 + }, + { + "epoch": 0.9664050235478807, + "grad_norm": 2.3102168301713832, + "learning_rate": 5.9139715625488036e-08, + "loss": 0.9929, + "step": 9234 + }, + { + "epoch": 0.9665096807953951, + "grad_norm": 2.0516025566151264, + "learning_rate": 5.8772178615469355e-08, + "loss": 0.8985, + "step": 9235 + }, + { + "epoch": 0.9666143380429095, + "grad_norm": 1.9481670124245183, + "learning_rate": 5.8405783872680364e-08, + "loss": 0.9224, + "step": 9236 + }, + { + "epoch": 0.9667189952904238, + "grad_norm": 1.936915127599676, + "learning_rate": 5.804053143922184e-08, + "loss": 0.9193, + "step": 9237 + }, + { + "epoch": 0.9668236525379382, + "grad_norm": 2.159472821731071, + "learning_rate": 5.76764213570602e-08, + "loss": 0.933, + "step": 9238 + }, + { + "epoch": 0.9669283097854526, + "grad_norm": 2.1432057819082044, + "learning_rate": 5.731345366803198e-08, + "loss": 0.7927, + "step": 9239 + }, + { + "epoch": 0.967032967032967, + "grad_norm": 2.1392093036071356, + "learning_rate": 5.695162841384383e-08, + "loss": 0.8698, + "step": 9240 + }, + { + "epoch": 0.9671376242804814, + "grad_norm": 1.6287695667451592, + "learning_rate": 5.659094563607137e-08, + "loss": 0.6806, + "step": 9241 + }, + { + "epoch": 0.9672422815279959, + "grad_norm": 1.9649345198964663, + "learning_rate": 5.62314053761559e-08, + "loss": 1.017, + "step": 9242 + }, + { + "epoch": 0.9673469387755103, + "grad_norm": 1.9674398387069323, + "learning_rate": 5.587300767540993e-08, + "loss": 0.8725, + "step": 9243 + }, + { + "epoch": 0.9674515960230246, + "grad_norm": 2.1414946410927294, + "learning_rate": 5.551575257501496e-08, + "loss": 1.0452, + "step": 9244 + }, + { + "epoch": 0.967556253270539, + "grad_norm": 1.9468821502685336, + "learning_rate": 5.5159640116019264e-08, + "loss": 0.8087, + "step": 9245 + }, + { + "epoch": 0.9676609105180534, + "grad_norm": 1.6540577154311573, + "learning_rate": 5.4804670339341225e-08, + "loss": 0.7482, + "step": 9246 + }, + { + "epoch": 0.9677655677655678, + "grad_norm": 1.7009899889159676, + "learning_rate": 5.445084328576711e-08, + "loss": 0.7608, + "step": 9247 + }, + { + "epoch": 0.9678702250130822, + "grad_norm": 2.2598483825041096, + "learning_rate": 5.4098158995953274e-08, + "loss": 0.9915, + "step": 9248 + }, + { + "epoch": 0.9679748822605966, + "grad_norm": 2.3568793617507335, + "learning_rate": 5.374661751042287e-08, + "loss": 0.8288, + "step": 9249 + }, + { + "epoch": 0.968079539508111, + "grad_norm": 2.1736820945639828, + "learning_rate": 5.339621886957025e-08, + "loss": 0.9018, + "step": 9250 + }, + { + "epoch": 0.9681841967556253, + "grad_norm": 2.101174708101391, + "learning_rate": 5.304696311365543e-08, + "loss": 0.8313, + "step": 9251 + }, + { + "epoch": 0.9682888540031397, + "grad_norm": 2.3344750804334122, + "learning_rate": 5.269885028280963e-08, + "loss": 0.8387, + "step": 9252 + }, + { + "epoch": 0.9683935112506541, + "grad_norm": 2.2550389618502407, + "learning_rate": 5.2351880417030874e-08, + "loss": 0.8327, + "step": 9253 + }, + { + "epoch": 0.9684981684981685, + "grad_norm": 1.8950935631200951, + "learning_rate": 5.2006053556186155e-08, + "loss": 0.9351, + "step": 9254 + }, + { + "epoch": 0.9686028257456829, + "grad_norm": 2.3653393297563, + "learning_rate": 5.166136974001368e-08, + "loss": 0.8625, + "step": 9255 + }, + { + "epoch": 0.9687074829931973, + "grad_norm": 2.098650984633713, + "learning_rate": 5.1317829008116215e-08, + "loss": 0.9453, + "step": 9256 + }, + { + "epoch": 0.9688121402407117, + "grad_norm": 2.0350999152809006, + "learning_rate": 5.097543139996886e-08, + "loss": 0.8874, + "step": 9257 + }, + { + "epoch": 0.968916797488226, + "grad_norm": 1.7287186179945104, + "learning_rate": 5.0634176954913464e-08, + "loss": 0.8007, + "step": 9258 + }, + { + "epoch": 0.9690214547357404, + "grad_norm": 2.243175615621909, + "learning_rate": 5.029406571216089e-08, + "loss": 0.8416, + "step": 9259 + }, + { + "epoch": 0.9691261119832548, + "grad_norm": 2.089301015163357, + "learning_rate": 4.995509771078877e-08, + "loss": 0.8799, + "step": 9260 + }, + { + "epoch": 0.9692307692307692, + "grad_norm": 1.8265334250247782, + "learning_rate": 4.9617272989748166e-08, + "loss": 0.7664, + "step": 9261 + }, + { + "epoch": 0.9693354264782836, + "grad_norm": 2.1422139711001673, + "learning_rate": 4.92805915878547e-08, + "loss": 0.8051, + "step": 9262 + }, + { + "epoch": 0.969440083725798, + "grad_norm": 1.7236231384803589, + "learning_rate": 4.8945053543792975e-08, + "loss": 0.7868, + "step": 9263 + }, + { + "epoch": 0.9695447409733124, + "grad_norm": 1.9977730114261136, + "learning_rate": 4.86106588961166e-08, + "loss": 0.8113, + "step": 9264 + }, + { + "epoch": 0.9696493982208267, + "grad_norm": 2.196435687000056, + "learning_rate": 4.827740768324929e-08, + "loss": 0.88, + "step": 9265 + }, + { + "epoch": 0.9697540554683411, + "grad_norm": 2.154480773213278, + "learning_rate": 4.794529994348374e-08, + "loss": 0.8232, + "step": 9266 + }, + { + "epoch": 0.9698587127158556, + "grad_norm": 2.054282523683849, + "learning_rate": 4.76143357149772e-08, + "loss": 0.9648, + "step": 9267 + }, + { + "epoch": 0.96996336996337, + "grad_norm": 2.1057006673654297, + "learning_rate": 4.728451503575815e-08, + "loss": 0.8939, + "step": 9268 + }, + { + "epoch": 0.9700680272108844, + "grad_norm": 2.1885036662303445, + "learning_rate": 4.695583794372516e-08, + "loss": 0.9587, + "step": 9269 + }, + { + "epoch": 0.9701726844583988, + "grad_norm": 2.15267308775205, + "learning_rate": 4.662830447664357e-08, + "loss": 0.9066, + "step": 9270 + }, + { + "epoch": 0.9702773417059132, + "grad_norm": 2.4891197615275202, + "learning_rate": 4.630191467214773e-08, + "loss": 0.9047, + "step": 9271 + }, + { + "epoch": 0.9703819989534275, + "grad_norm": 2.152695134921288, + "learning_rate": 4.5976668567739856e-08, + "loss": 0.8575, + "step": 9272 + }, + { + "epoch": 0.9704866562009419, + "grad_norm": 2.086583458860564, + "learning_rate": 4.565256620079117e-08, + "loss": 0.9318, + "step": 9273 + }, + { + "epoch": 0.9705913134484563, + "grad_norm": 1.9990374714123995, + "learning_rate": 4.532960760854299e-08, + "loss": 0.8295, + "step": 9274 + }, + { + "epoch": 0.9706959706959707, + "grad_norm": 2.045660681387157, + "learning_rate": 4.5007792828102305e-08, + "loss": 0.7581, + "step": 9275 + }, + { + "epoch": 0.9708006279434851, + "grad_norm": 2.1535992375534088, + "learning_rate": 4.4687121896447305e-08, + "loss": 0.8037, + "step": 9276 + }, + { + "epoch": 0.9709052851909995, + "grad_norm": 1.933861715496548, + "learning_rate": 4.436759485042408e-08, + "loss": 0.9047, + "step": 9277 + }, + { + "epoch": 0.9710099424385139, + "grad_norm": 2.460250636187858, + "learning_rate": 4.404921172674659e-08, + "loss": 0.8763, + "step": 9278 + }, + { + "epoch": 0.9711145996860283, + "grad_norm": 2.05066277066522, + "learning_rate": 4.373197256199668e-08, + "loss": 0.8936, + "step": 9279 + }, + { + "epoch": 0.9712192569335426, + "grad_norm": 2.0608607605365914, + "learning_rate": 4.3415877392626314e-08, + "loss": 0.8599, + "step": 9280 + }, + { + "epoch": 0.971323914181057, + "grad_norm": 1.9005786956775899, + "learning_rate": 4.3100926254956434e-08, + "loss": 0.9429, + "step": 9281 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 2.3320969105885028, + "learning_rate": 4.2787119185174756e-08, + "loss": 0.9442, + "step": 9282 + }, + { + "epoch": 0.9715332286760858, + "grad_norm": 1.9467790462210524, + "learning_rate": 4.247445621933799e-08, + "loss": 0.9022, + "step": 9283 + }, + { + "epoch": 0.9716378859236002, + "grad_norm": 2.1353572789569237, + "learning_rate": 4.216293739337296e-08, + "loss": 0.8758, + "step": 9284 + }, + { + "epoch": 0.9717425431711146, + "grad_norm": 2.0519015338547826, + "learning_rate": 4.1852562743072144e-08, + "loss": 0.8413, + "step": 9285 + }, + { + "epoch": 0.971847200418629, + "grad_norm": 2.07726966403895, + "learning_rate": 4.154333230410035e-08, + "loss": 0.9582, + "step": 9286 + }, + { + "epoch": 0.9719518576661433, + "grad_norm": 1.734959100833163, + "learning_rate": 4.1235246111986925e-08, + "loss": 0.715, + "step": 9287 + }, + { + "epoch": 0.9720565149136577, + "grad_norm": 2.3376164950199807, + "learning_rate": 4.092830420213134e-08, + "loss": 0.7634, + "step": 9288 + }, + { + "epoch": 0.9721611721611721, + "grad_norm": 2.0354642090531447, + "learning_rate": 4.062250660980427e-08, + "loss": 0.9849, + "step": 9289 + }, + { + "epoch": 0.9722658294086866, + "grad_norm": 2.4203466382212264, + "learning_rate": 4.0317853370139826e-08, + "loss": 0.9165, + "step": 9290 + }, + { + "epoch": 0.972370486656201, + "grad_norm": 2.2074429056634024, + "learning_rate": 4.0014344518145566e-08, + "loss": 0.8997, + "step": 9291 + }, + { + "epoch": 0.9724751439037154, + "grad_norm": 1.9811031986210714, + "learning_rate": 3.971198008869359e-08, + "loss": 0.8448, + "step": 9292 + }, + { + "epoch": 0.9725798011512298, + "grad_norm": 2.140986208627522, + "learning_rate": 3.9410760116526115e-08, + "loss": 0.9095, + "step": 9293 + }, + { + "epoch": 0.9726844583987441, + "grad_norm": 2.1998429755193047, + "learning_rate": 3.9110684636254334e-08, + "loss": 0.8572, + "step": 9294 + }, + { + "epoch": 0.9727891156462585, + "grad_norm": 1.791680128166834, + "learning_rate": 3.8811753682358454e-08, + "loss": 0.8926, + "step": 9295 + }, + { + "epoch": 0.9728937728937729, + "grad_norm": 2.0214903816242873, + "learning_rate": 3.851396728918544e-08, + "loss": 0.9469, + "step": 9296 + }, + { + "epoch": 0.9729984301412873, + "grad_norm": 2.086981122937348, + "learning_rate": 3.8217325490952365e-08, + "loss": 0.8404, + "step": 9297 + }, + { + "epoch": 0.9731030873888017, + "grad_norm": 2.175421733396647, + "learning_rate": 3.792182832174196e-08, + "loss": 0.9427, + "step": 9298 + }, + { + "epoch": 0.9732077446363161, + "grad_norm": 1.9291300718410933, + "learning_rate": 3.76274758155093e-08, + "loss": 0.8368, + "step": 9299 + }, + { + "epoch": 0.9733124018838305, + "grad_norm": 2.117352303798171, + "learning_rate": 3.7334268006075094e-08, + "loss": 0.9744, + "step": 9300 + }, + { + "epoch": 0.9734170591313448, + "grad_norm": 1.675142403551649, + "learning_rate": 3.7042204927130177e-08, + "loss": 0.7633, + "step": 9301 + }, + { + "epoch": 0.9735217163788592, + "grad_norm": 2.036570056386732, + "learning_rate": 3.675128661223326e-08, + "loss": 0.6977, + "step": 9302 + }, + { + "epoch": 0.9736263736263736, + "grad_norm": 1.8889553909422538, + "learning_rate": 3.646151309481094e-08, + "loss": 0.792, + "step": 9303 + }, + { + "epoch": 0.973731030873888, + "grad_norm": 2.209590888127619, + "learning_rate": 3.6172884408158805e-08, + "loss": 0.874, + "step": 9304 + }, + { + "epoch": 0.9738356881214024, + "grad_norm": 1.9542284512171744, + "learning_rate": 3.5885400585441435e-08, + "loss": 0.8408, + "step": 9305 + }, + { + "epoch": 0.9739403453689168, + "grad_norm": 2.079751071735509, + "learning_rate": 3.559906165969018e-08, + "loss": 0.8197, + "step": 9306 + }, + { + "epoch": 0.9740450026164312, + "grad_norm": 1.7986235225721088, + "learning_rate": 3.531386766380762e-08, + "loss": 0.8663, + "step": 9307 + }, + { + "epoch": 0.9741496598639455, + "grad_norm": 2.120431133086565, + "learning_rate": 3.502981863056088e-08, + "loss": 1.016, + "step": 9308 + }, + { + "epoch": 0.9742543171114599, + "grad_norm": 2.27161995008741, + "learning_rate": 3.4746914592590496e-08, + "loss": 0.8601, + "step": 9309 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 1.8602129108469572, + "learning_rate": 3.446515558240049e-08, + "loss": 0.8764, + "step": 9310 + }, + { + "epoch": 0.9744636316064887, + "grad_norm": 2.5728016050016245, + "learning_rate": 3.4184541632366066e-08, + "loss": 1.0023, + "step": 9311 + }, + { + "epoch": 0.9745682888540032, + "grad_norm": 2.3187035734822334, + "learning_rate": 3.390507277473143e-08, + "loss": 0.9637, + "step": 9312 + }, + { + "epoch": 0.9746729461015176, + "grad_norm": 1.8523908673682685, + "learning_rate": 3.3626749041606454e-08, + "loss": 0.9465, + "step": 9313 + }, + { + "epoch": 0.974777603349032, + "grad_norm": 1.9384599924677752, + "learning_rate": 3.3349570464973336e-08, + "loss": 0.7849, + "step": 9314 + }, + { + "epoch": 0.9748822605965463, + "grad_norm": 1.7474977395994695, + "learning_rate": 3.307353707667771e-08, + "loss": 0.7028, + "step": 9315 + }, + { + "epoch": 0.9749869178440607, + "grad_norm": 1.7458067005238376, + "learning_rate": 3.279864890843865e-08, + "loss": 0.9057, + "step": 9316 + }, + { + "epoch": 0.9750915750915751, + "grad_norm": 1.93802753381986, + "learning_rate": 3.2524905991839775e-08, + "loss": 0.7735, + "step": 9317 + }, + { + "epoch": 0.9751962323390895, + "grad_norm": 2.0070696055706616, + "learning_rate": 3.225230835833593e-08, + "loss": 0.8985, + "step": 9318 + }, + { + "epoch": 0.9753008895866039, + "grad_norm": 2.290676066932009, + "learning_rate": 3.198085603924761e-08, + "loss": 0.8573, + "step": 9319 + }, + { + "epoch": 0.9754055468341183, + "grad_norm": 1.9242114892590325, + "learning_rate": 3.1710549065767645e-08, + "loss": 0.8236, + "step": 9320 + }, + { + "epoch": 0.9755102040816327, + "grad_norm": 2.0179022005643312, + "learning_rate": 3.144138746895231e-08, + "loss": 0.9594, + "step": 9321 + }, + { + "epoch": 0.9756148613291471, + "grad_norm": 1.688029558610551, + "learning_rate": 3.117337127973019e-08, + "loss": 0.6891, + "step": 9322 + }, + { + "epoch": 0.9757195185766614, + "grad_norm": 2.123029102476217, + "learning_rate": 3.0906500528896656e-08, + "loss": 0.9667, + "step": 9323 + }, + { + "epoch": 0.9758241758241758, + "grad_norm": 2.114035431835684, + "learning_rate": 3.064077524711606e-08, + "loss": 0.8246, + "step": 9324 + }, + { + "epoch": 0.9759288330716902, + "grad_norm": 1.9867461075805437, + "learning_rate": 3.0376195464920655e-08, + "loss": 0.9916, + "step": 9325 + }, + { + "epoch": 0.9760334903192046, + "grad_norm": 2.2706805982649962, + "learning_rate": 3.0112761212709454e-08, + "loss": 0.716, + "step": 9326 + }, + { + "epoch": 0.976138147566719, + "grad_norm": 2.2268234648526413, + "learning_rate": 2.9850472520753795e-08, + "loss": 0.84, + "step": 9327 + }, + { + "epoch": 0.9762428048142334, + "grad_norm": 2.217677339428372, + "learning_rate": 2.9589329419190683e-08, + "loss": 0.8897, + "step": 9328 + }, + { + "epoch": 0.9763474620617478, + "grad_norm": 2.093519971186103, + "learning_rate": 2.9329331938026117e-08, + "loss": 0.8808, + "step": 9329 + }, + { + "epoch": 0.9764521193092621, + "grad_norm": 1.9406821046190856, + "learning_rate": 2.907048010713398e-08, + "loss": 0.8893, + "step": 9330 + }, + { + "epoch": 0.9765567765567765, + "grad_norm": 1.9257719073652975, + "learning_rate": 2.8812773956256034e-08, + "loss": 0.8714, + "step": 9331 + }, + { + "epoch": 0.9766614338042909, + "grad_norm": 2.330309710581296, + "learning_rate": 2.855621351500415e-08, + "loss": 0.9156, + "step": 9332 + }, + { + "epoch": 0.9767660910518053, + "grad_norm": 1.742461766113821, + "learning_rate": 2.8300798812858078e-08, + "loss": 0.7883, + "step": 9333 + }, + { + "epoch": 0.9768707482993197, + "grad_norm": 2.150066157396379, + "learning_rate": 2.8046529879164343e-08, + "loss": 0.7517, + "step": 9334 + }, + { + "epoch": 0.9769754055468342, + "grad_norm": 2.162913910208753, + "learning_rate": 2.7793406743139572e-08, + "loss": 0.8699, + "step": 9335 + }, + { + "epoch": 0.9770800627943486, + "grad_norm": 3.0655887965284343, + "learning_rate": 2.7541429433869393e-08, + "loss": 0.8514, + "step": 9336 + }, + { + "epoch": 0.9771847200418629, + "grad_norm": 1.9873130806621349, + "learning_rate": 2.729059798030398e-08, + "loss": 0.7885, + "step": 9337 + }, + { + "epoch": 0.9772893772893773, + "grad_norm": 2.2769678998356295, + "learning_rate": 2.7040912411265828e-08, + "loss": 0.9203, + "step": 9338 + }, + { + "epoch": 0.9773940345368917, + "grad_norm": 1.9774891685632308, + "learning_rate": 2.679237275544422e-08, + "loss": 0.8983, + "step": 9339 + }, + { + "epoch": 0.9774986917844061, + "grad_norm": 2.2420458226870985, + "learning_rate": 2.6544979041397412e-08, + "loss": 0.7687, + "step": 9340 + }, + { + "epoch": 0.9776033490319205, + "grad_norm": 2.119669776522172, + "learning_rate": 2.6298731297550452e-08, + "loss": 0.8316, + "step": 9341 + }, + { + "epoch": 0.9777080062794349, + "grad_norm": 2.4118682693637385, + "learning_rate": 2.605362955219737e-08, + "loss": 0.7018, + "step": 9342 + }, + { + "epoch": 0.9778126635269493, + "grad_norm": 2.0233107477477787, + "learning_rate": 2.5809673833502303e-08, + "loss": 0.8836, + "step": 9343 + }, + { + "epoch": 0.9779173207744636, + "grad_norm": 1.9871161511437192, + "learning_rate": 2.5566864169495054e-08, + "loss": 0.735, + "step": 9344 + }, + { + "epoch": 0.978021978021978, + "grad_norm": 2.292335147286737, + "learning_rate": 2.5325200588076636e-08, + "loss": 0.839, + "step": 9345 + }, + { + "epoch": 0.9781266352694924, + "grad_norm": 2.2868212579297644, + "learning_rate": 2.508468311701151e-08, + "loss": 0.8197, + "step": 9346 + }, + { + "epoch": 0.9782312925170068, + "grad_norm": 2.1010720301980004, + "learning_rate": 2.4845311783938676e-08, + "loss": 0.9124, + "step": 9347 + }, + { + "epoch": 0.9783359497645212, + "grad_norm": 1.8598221177836944, + "learning_rate": 2.4607086616361687e-08, + "loss": 0.8497, + "step": 9348 + }, + { + "epoch": 0.9784406070120356, + "grad_norm": 2.237038039840748, + "learning_rate": 2.4370007641651983e-08, + "loss": 0.8217, + "step": 9349 + }, + { + "epoch": 0.97854526425955, + "grad_norm": 2.085673606140328, + "learning_rate": 2.4134074887051108e-08, + "loss": 0.8642, + "step": 9350 + }, + { + "epoch": 0.9786499215070643, + "grad_norm": 1.9367343428587138, + "learning_rate": 2.3899288379668483e-08, + "loss": 0.9598, + "step": 9351 + }, + { + "epoch": 0.9787545787545787, + "grad_norm": 1.968532478877287, + "learning_rate": 2.3665648146480315e-08, + "loss": 0.9346, + "step": 9352 + }, + { + "epoch": 0.9788592360020931, + "grad_norm": 2.2698739958097547, + "learning_rate": 2.343315421433401e-08, + "loss": 0.9245, + "step": 9353 + }, + { + "epoch": 0.9789638932496075, + "grad_norm": 2.1111746202775517, + "learning_rate": 2.3201806609943756e-08, + "loss": 0.8771, + "step": 9354 + }, + { + "epoch": 0.9790685504971219, + "grad_norm": 2.395697271895756, + "learning_rate": 2.2971605359890514e-08, + "loss": 0.7715, + "step": 9355 + }, + { + "epoch": 0.9791732077446363, + "grad_norm": 2.048806092369536, + "learning_rate": 2.2742550490624237e-08, + "loss": 0.8791, + "step": 9356 + }, + { + "epoch": 0.9792778649921507, + "grad_norm": 2.0576345151800197, + "learning_rate": 2.2514642028466092e-08, + "loss": 0.9454, + "step": 9357 + }, + { + "epoch": 0.979382522239665, + "grad_norm": 2.1060942188116294, + "learning_rate": 2.2287879999601804e-08, + "loss": 0.8494, + "step": 9358 + }, + { + "epoch": 0.9794871794871794, + "grad_norm": 2.0030711225076745, + "learning_rate": 2.2062264430087187e-08, + "loss": 0.7593, + "step": 9359 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 2.18284415979648, + "learning_rate": 2.1837795345844847e-08, + "loss": 0.8958, + "step": 9360 + }, + { + "epoch": 0.9796964939822083, + "grad_norm": 1.9127644215367114, + "learning_rate": 2.1614472772668592e-08, + "loss": 0.9716, + "step": 9361 + }, + { + "epoch": 0.9798011512297227, + "grad_norm": 1.8495943717092973, + "learning_rate": 2.13922967362179e-08, + "loss": 0.8342, + "step": 9362 + }, + { + "epoch": 0.9799058084772371, + "grad_norm": 2.1175372567636703, + "learning_rate": 2.1171267262020124e-08, + "loss": 0.9731, + "step": 9363 + }, + { + "epoch": 0.9800104657247515, + "grad_norm": 1.961813987025682, + "learning_rate": 2.0951384375473837e-08, + "loss": 0.8674, + "step": 9364 + }, + { + "epoch": 0.9801151229722659, + "grad_norm": 2.2096628727490555, + "learning_rate": 2.073264810184328e-08, + "loss": 0.8222, + "step": 9365 + }, + { + "epoch": 0.9802197802197802, + "grad_norm": 3.0174110994465653, + "learning_rate": 2.0515058466260563e-08, + "loss": 0.7058, + "step": 9366 + }, + { + "epoch": 0.9803244374672946, + "grad_norm": 1.9638967173681348, + "learning_rate": 2.0298615493729025e-08, + "loss": 0.8708, + "step": 9367 + }, + { + "epoch": 0.980429094714809, + "grad_norm": 1.8984784000934047, + "learning_rate": 2.0083319209116547e-08, + "loss": 0.8403, + "step": 9368 + }, + { + "epoch": 0.9805337519623234, + "grad_norm": 1.9998734455651994, + "learning_rate": 1.9869169637162232e-08, + "loss": 0.8549, + "step": 9369 + }, + { + "epoch": 0.9806384092098378, + "grad_norm": 2.7056047403555916, + "learning_rate": 1.9656166802473066e-08, + "loss": 0.7828, + "step": 9370 + }, + { + "epoch": 0.9807430664573522, + "grad_norm": 1.885856551772748, + "learning_rate": 1.9444310729521686e-08, + "loss": 0.8384, + "step": 9371 + }, + { + "epoch": 0.9808477237048666, + "grad_norm": 2.3439297429643835, + "learning_rate": 1.9233601442653073e-08, + "loss": 0.8945, + "step": 9372 + }, + { + "epoch": 0.9809523809523809, + "grad_norm": 2.273645673597301, + "learning_rate": 1.902403896607563e-08, + "loss": 0.8892, + "step": 9373 + }, + { + "epoch": 0.9810570381998953, + "grad_norm": 2.101477849817953, + "learning_rate": 1.8815623323870102e-08, + "loss": 0.7195, + "step": 9374 + }, + { + "epoch": 0.9811616954474097, + "grad_norm": 2.067407200022368, + "learning_rate": 1.8608354539982886e-08, + "loss": 0.931, + "step": 9375 + }, + { + "epoch": 0.9812663526949241, + "grad_norm": 2.0656438344157837, + "learning_rate": 1.8402232638230488e-08, + "loss": 0.9159, + "step": 9376 + }, + { + "epoch": 0.9813710099424385, + "grad_norm": 1.989366318840071, + "learning_rate": 1.8197257642296183e-08, + "loss": 0.9061, + "step": 9377 + }, + { + "epoch": 0.9814756671899529, + "grad_norm": 2.1104290095055007, + "learning_rate": 1.7993429575732247e-08, + "loss": 0.7356, + "step": 9378 + }, + { + "epoch": 0.9815803244374673, + "grad_norm": 2.3574441233228804, + "learning_rate": 1.779074846195883e-08, + "loss": 0.7541, + "step": 9379 + }, + { + "epoch": 0.9816849816849816, + "grad_norm": 2.053364465624434, + "learning_rate": 1.758921432426397e-08, + "loss": 0.929, + "step": 9380 + }, + { + "epoch": 0.981789638932496, + "grad_norm": 1.9733850210433772, + "learning_rate": 1.738882718580581e-08, + "loss": 0.8051, + "step": 9381 + }, + { + "epoch": 0.9818942961800105, + "grad_norm": 2.010472060114112, + "learning_rate": 1.718958706960816e-08, + "loss": 0.8714, + "step": 9382 + }, + { + "epoch": 0.9819989534275249, + "grad_norm": 1.9418541630139494, + "learning_rate": 1.6991493998563814e-08, + "loss": 0.9111, + "step": 9383 + }, + { + "epoch": 0.9821036106750393, + "grad_norm": 2.0017027127013303, + "learning_rate": 1.6794547995434563e-08, + "loss": 0.8167, + "step": 9384 + }, + { + "epoch": 0.9822082679225537, + "grad_norm": 2.0504485758602895, + "learning_rate": 1.6598749082850084e-08, + "loss": 0.8904, + "step": 9385 + }, + { + "epoch": 0.9823129251700681, + "grad_norm": 2.360758373402244, + "learning_rate": 1.640409728330794e-08, + "loss": 0.9911, + "step": 9386 + }, + { + "epoch": 0.9824175824175824, + "grad_norm": 1.99459443178208, + "learning_rate": 1.621059261917357e-08, + "loss": 0.844, + "step": 9387 + }, + { + "epoch": 0.9825222396650968, + "grad_norm": 1.8418136023305347, + "learning_rate": 1.6018235112681412e-08, + "loss": 0.8789, + "step": 9388 + }, + { + "epoch": 0.9826268969126112, + "grad_norm": 1.6553513965603035, + "learning_rate": 1.5827024785933785e-08, + "loss": 0.7062, + "step": 9389 + }, + { + "epoch": 0.9827315541601256, + "grad_norm": 1.9968661967922576, + "learning_rate": 1.5636961660899787e-08, + "loss": 0.7489, + "step": 9390 + }, + { + "epoch": 0.98283621140764, + "grad_norm": 2.3529232712311807, + "learning_rate": 1.5448045759420827e-08, + "loss": 0.9545, + "step": 9391 + }, + { + "epoch": 0.9829408686551544, + "grad_norm": 1.8819806519766973, + "learning_rate": 1.526027710320177e-08, + "loss": 0.8448, + "step": 9392 + }, + { + "epoch": 0.9830455259026688, + "grad_norm": 2.138462511740512, + "learning_rate": 1.5073655713818692e-08, + "loss": 0.9485, + "step": 9393 + }, + { + "epoch": 0.9831501831501831, + "grad_norm": 2.3151261223417703, + "learning_rate": 1.4888181612713327e-08, + "loss": 0.9372, + "step": 9394 + }, + { + "epoch": 0.9832548403976975, + "grad_norm": 2.1342293657223292, + "learning_rate": 1.4703854821197516e-08, + "loss": 0.7339, + "step": 9395 + }, + { + "epoch": 0.9833594976452119, + "grad_norm": 1.9715185392028405, + "learning_rate": 1.4520675360450987e-08, + "loss": 0.9157, + "step": 9396 + }, + { + "epoch": 0.9834641548927263, + "grad_norm": 2.6450142775406964, + "learning_rate": 1.4338643251522456e-08, + "loss": 0.9199, + "step": 9397 + }, + { + "epoch": 0.9835688121402407, + "grad_norm": 1.8046838157595604, + "learning_rate": 1.4157758515326303e-08, + "loss": 0.749, + "step": 9398 + }, + { + "epoch": 0.9836734693877551, + "grad_norm": 1.8899814023719925, + "learning_rate": 1.3978021172648126e-08, + "loss": 0.93, + "step": 9399 + }, + { + "epoch": 0.9837781266352695, + "grad_norm": 1.6809007606749307, + "learning_rate": 1.3799431244138073e-08, + "loss": 0.7007, + "step": 9400 + }, + { + "epoch": 0.9838827838827838, + "grad_norm": 1.6089297962761584, + "learning_rate": 1.3621988750317505e-08, + "loss": 0.7509, + "step": 9401 + }, + { + "epoch": 0.9839874411302982, + "grad_norm": 2.032535731745511, + "learning_rate": 1.344569371157567e-08, + "loss": 0.8048, + "step": 9402 + }, + { + "epoch": 0.9840920983778126, + "grad_norm": 1.9666083217252117, + "learning_rate": 1.3270546148167474e-08, + "loss": 0.8324, + "step": 9403 + }, + { + "epoch": 0.984196755625327, + "grad_norm": 2.0185643963937054, + "learning_rate": 1.3096546080219042e-08, + "loss": 0.8746, + "step": 9404 + }, + { + "epoch": 0.9843014128728415, + "grad_norm": 1.9656375458562914, + "learning_rate": 1.292369352772327e-08, + "loss": 0.8422, + "step": 9405 + }, + { + "epoch": 0.9844060701203559, + "grad_norm": 2.331480394760753, + "learning_rate": 1.2751988510540937e-08, + "loss": 0.9247, + "step": 9406 + }, + { + "epoch": 0.9845107273678703, + "grad_norm": 1.9029882738516415, + "learning_rate": 1.2581431048401815e-08, + "loss": 0.8051, + "step": 9407 + }, + { + "epoch": 0.9846153846153847, + "grad_norm": 1.9192845614133922, + "learning_rate": 1.2412021160902454e-08, + "loss": 0.851, + "step": 9408 + }, + { + "epoch": 0.984720041862899, + "grad_norm": 1.8685299347220463, + "learning_rate": 1.2243758867508393e-08, + "loss": 0.798, + "step": 9409 + }, + { + "epoch": 0.9848246991104134, + "grad_norm": 2.171112776891488, + "learning_rate": 1.2076644187554165e-08, + "loss": 0.9017, + "step": 9410 + }, + { + "epoch": 0.9849293563579278, + "grad_norm": 2.3532297197914427, + "learning_rate": 1.1910677140241077e-08, + "loss": 0.902, + "step": 9411 + }, + { + "epoch": 0.9850340136054422, + "grad_norm": 2.1352803267811944, + "learning_rate": 1.1745857744639433e-08, + "loss": 0.9434, + "step": 9412 + }, + { + "epoch": 0.9851386708529566, + "grad_norm": 2.0466470004257444, + "learning_rate": 1.1582186019686303e-08, + "loss": 0.854, + "step": 9413 + }, + { + "epoch": 0.985243328100471, + "grad_norm": 1.8753970699615763, + "learning_rate": 1.1419661984189978e-08, + "loss": 0.885, + "step": 9414 + }, + { + "epoch": 0.9853479853479854, + "grad_norm": 1.9062133085606772, + "learning_rate": 1.1258285656822187e-08, + "loss": 0.8093, + "step": 9415 + }, + { + "epoch": 0.9854526425954997, + "grad_norm": 2.2126700579173644, + "learning_rate": 1.1098057056126987e-08, + "loss": 0.9876, + "step": 9416 + }, + { + "epoch": 0.9855572998430141, + "grad_norm": 2.0255896960925255, + "learning_rate": 1.0938976200515206e-08, + "loss": 0.742, + "step": 9417 + }, + { + "epoch": 0.9856619570905285, + "grad_norm": 2.1729727012223328, + "learning_rate": 1.0781043108264444e-08, + "loss": 0.8281, + "step": 9418 + }, + { + "epoch": 0.9857666143380429, + "grad_norm": 2.5225886899280794, + "learning_rate": 1.0624257797522408e-08, + "loss": 0.8728, + "step": 9419 + }, + { + "epoch": 0.9858712715855573, + "grad_norm": 2.0350945295517993, + "learning_rate": 1.0468620286303577e-08, + "loss": 0.8491, + "step": 9420 + }, + { + "epoch": 0.9859759288330717, + "grad_norm": 2.2700948009138253, + "learning_rate": 1.0314130592490313e-08, + "loss": 0.9252, + "step": 9421 + }, + { + "epoch": 0.9860805860805861, + "grad_norm": 1.9846202839221025, + "learning_rate": 1.0160788733836191e-08, + "loss": 0.8137, + "step": 9422 + }, + { + "epoch": 0.9861852433281004, + "grad_norm": 2.3145335457286462, + "learning_rate": 1.0008594727958232e-08, + "loss": 0.6612, + "step": 9423 + }, + { + "epoch": 0.9862899005756148, + "grad_norm": 1.6193909422722288, + "learning_rate": 9.857548592343558e-09, + "loss": 0.7132, + "step": 9424 + }, + { + "epoch": 0.9863945578231292, + "grad_norm": 2.3517435794568353, + "learning_rate": 9.707650344350505e-09, + "loss": 0.8793, + "step": 9425 + }, + { + "epoch": 0.9864992150706436, + "grad_norm": 1.9920534058323922, + "learning_rate": 9.558900001199745e-09, + "loss": 0.8218, + "step": 9426 + }, + { + "epoch": 0.986603872318158, + "grad_norm": 2.427971815968549, + "learning_rate": 9.411297579984269e-09, + "loss": 0.9036, + "step": 9427 + }, + { + "epoch": 0.9867085295656725, + "grad_norm": 1.8261977196224324, + "learning_rate": 9.264843097663845e-09, + "loss": 0.776, + "step": 9428 + }, + { + "epoch": 0.9868131868131869, + "grad_norm": 2.197882541506156, + "learning_rate": 9.119536571066123e-09, + "loss": 0.8071, + "step": 9429 + }, + { + "epoch": 0.9869178440607012, + "grad_norm": 1.9625634256614233, + "learning_rate": 8.975378016887748e-09, + "loss": 0.7967, + "step": 9430 + }, + { + "epoch": 0.9870225013082156, + "grad_norm": 1.6165607498949757, + "learning_rate": 8.832367451692137e-09, + "loss": 0.7942, + "step": 9431 + }, + { + "epoch": 0.98712715855573, + "grad_norm": 2.0636190975616793, + "learning_rate": 8.6905048919117e-09, + "loss": 0.9357, + "step": 9432 + }, + { + "epoch": 0.9872318158032444, + "grad_norm": 2.0496132764727255, + "learning_rate": 8.549790353846731e-09, + "loss": 0.9125, + "step": 9433 + }, + { + "epoch": 0.9873364730507588, + "grad_norm": 1.98179465824096, + "learning_rate": 8.410223853665411e-09, + "loss": 0.7822, + "step": 9434 + }, + { + "epoch": 0.9874411302982732, + "grad_norm": 2.321799912705264, + "learning_rate": 8.27180540740491e-09, + "loss": 0.8323, + "step": 9435 + }, + { + "epoch": 0.9875457875457876, + "grad_norm": 1.8047308460509242, + "learning_rate": 8.134535030968061e-09, + "loss": 0.839, + "step": 9436 + }, + { + "epoch": 0.9876504447933019, + "grad_norm": 2.7150759739338293, + "learning_rate": 7.998412740130023e-09, + "loss": 0.9566, + "step": 9437 + }, + { + "epoch": 0.9877551020408163, + "grad_norm": 1.8975694108774446, + "learning_rate": 7.863438550529401e-09, + "loss": 0.8174, + "step": 9438 + }, + { + "epoch": 0.9878597592883307, + "grad_norm": 2.239706974304119, + "learning_rate": 7.729612477677118e-09, + "loss": 0.9733, + "step": 9439 + }, + { + "epoch": 0.9879644165358451, + "grad_norm": 1.7630296843202333, + "learning_rate": 7.596934536947542e-09, + "loss": 0.8704, + "step": 9440 + }, + { + "epoch": 0.9880690737833595, + "grad_norm": 2.1363895002879407, + "learning_rate": 7.465404743587368e-09, + "loss": 0.7886, + "step": 9441 + }, + { + "epoch": 0.9881737310308739, + "grad_norm": 2.2348274430335837, + "learning_rate": 7.33502311271006e-09, + "loss": 0.9167, + "step": 9442 + }, + { + "epoch": 0.9882783882783883, + "grad_norm": 2.9112946474193615, + "learning_rate": 7.205789659294748e-09, + "loss": 0.9517, + "step": 9443 + }, + { + "epoch": 0.9883830455259026, + "grad_norm": 1.9351507967777781, + "learning_rate": 7.0777043981917756e-09, + "loss": 0.9959, + "step": 9444 + }, + { + "epoch": 0.988487702773417, + "grad_norm": 2.205838160688878, + "learning_rate": 6.950767344118259e-09, + "loss": 1.0637, + "step": 9445 + }, + { + "epoch": 0.9885923600209314, + "grad_norm": 1.7747166910206333, + "learning_rate": 6.824978511660307e-09, + "loss": 0.7722, + "step": 9446 + }, + { + "epoch": 0.9886970172684458, + "grad_norm": 1.7660762190615527, + "learning_rate": 6.700337915269694e-09, + "loss": 0.8423, + "step": 9447 + }, + { + "epoch": 0.9888016745159602, + "grad_norm": 2.1500153429277287, + "learning_rate": 6.576845569269408e-09, + "loss": 0.9119, + "step": 9448 + }, + { + "epoch": 0.9889063317634746, + "grad_norm": 2.1119803885084334, + "learning_rate": 6.454501487848097e-09, + "loss": 0.8946, + "step": 9449 + }, + { + "epoch": 0.989010989010989, + "grad_norm": 2.045363078081982, + "learning_rate": 6.333305685064517e-09, + "loss": 0.814, + "step": 9450 + }, + { + "epoch": 0.9891156462585035, + "grad_norm": 2.143018279644762, + "learning_rate": 6.213258174841974e-09, + "loss": 0.8763, + "step": 9451 + }, + { + "epoch": 0.9892203035060178, + "grad_norm": 1.8414553993367668, + "learning_rate": 6.094358970976099e-09, + "loss": 0.8197, + "step": 9452 + }, + { + "epoch": 0.9893249607535322, + "grad_norm": 2.2429501404967374, + "learning_rate": 5.976608087128188e-09, + "loss": 0.9149, + "step": 9453 + }, + { + "epoch": 0.9894296180010466, + "grad_norm": 1.7802095129658109, + "learning_rate": 5.860005536828528e-09, + "loss": 0.829, + "step": 9454 + }, + { + "epoch": 0.989534275248561, + "grad_norm": 1.958887466082894, + "learning_rate": 5.74455133347418e-09, + "loss": 0.9254, + "step": 9455 + }, + { + "epoch": 0.9896389324960754, + "grad_norm": 2.0406723361448926, + "learning_rate": 5.630245490331199e-09, + "loss": 0.8935, + "step": 9456 + }, + { + "epoch": 0.9897435897435898, + "grad_norm": 1.985359908345336, + "learning_rate": 5.517088020533523e-09, + "loss": 0.7691, + "step": 9457 + }, + { + "epoch": 0.9898482469911042, + "grad_norm": 1.9928529617111481, + "learning_rate": 5.405078937082975e-09, + "loss": 0.8635, + "step": 9458 + }, + { + "epoch": 0.9899529042386185, + "grad_norm": 2.0172391384978616, + "learning_rate": 5.2942182528503695e-09, + "loss": 0.8513, + "step": 9459 + }, + { + "epoch": 0.9900575614861329, + "grad_norm": 2.3808186332110246, + "learning_rate": 5.184505980574406e-09, + "loss": 0.9489, + "step": 9460 + }, + { + "epoch": 0.9901622187336473, + "grad_norm": 2.318056374421155, + "learning_rate": 5.075942132859446e-09, + "loss": 0.9651, + "step": 9461 + }, + { + "epoch": 0.9902668759811617, + "grad_norm": 2.342430733838004, + "learning_rate": 4.968526722179956e-09, + "loss": 0.9706, + "step": 9462 + }, + { + "epoch": 0.9903715332286761, + "grad_norm": 2.210156685115261, + "learning_rate": 4.8622597608793954e-09, + "loss": 0.8883, + "step": 9463 + }, + { + "epoch": 0.9904761904761905, + "grad_norm": 2.100418573269556, + "learning_rate": 4.757141261167997e-09, + "loss": 0.9061, + "step": 9464 + }, + { + "epoch": 0.9905808477237049, + "grad_norm": 2.092025490922726, + "learning_rate": 4.653171235122767e-09, + "loss": 0.9194, + "step": 9465 + }, + { + "epoch": 0.9906855049712192, + "grad_norm": 2.2321294289010294, + "learning_rate": 4.550349694690814e-09, + "loss": 0.8867, + "step": 9466 + }, + { + "epoch": 0.9907901622187336, + "grad_norm": 2.032473032864792, + "learning_rate": 4.4486766516871335e-09, + "loss": 0.8543, + "step": 9467 + }, + { + "epoch": 0.990894819466248, + "grad_norm": 2.117448656836186, + "learning_rate": 4.348152117793491e-09, + "loss": 0.8434, + "step": 9468 + }, + { + "epoch": 0.9909994767137624, + "grad_norm": 2.2695523367962305, + "learning_rate": 4.248776104560648e-09, + "loss": 0.9899, + "step": 9469 + }, + { + "epoch": 0.9911041339612768, + "grad_norm": 2.053174387036949, + "learning_rate": 4.150548623406136e-09, + "loss": 0.8451, + "step": 9470 + }, + { + "epoch": 0.9912087912087912, + "grad_norm": 2.1987039378958233, + "learning_rate": 4.053469685617595e-09, + "loss": 0.7486, + "step": 9471 + }, + { + "epoch": 0.9913134484563056, + "grad_norm": 2.0752962011666423, + "learning_rate": 3.957539302349434e-09, + "loss": 0.9223, + "step": 9472 + }, + { + "epoch": 0.9914181057038199, + "grad_norm": 2.318495473294372, + "learning_rate": 3.862757484623947e-09, + "loss": 0.8358, + "step": 9473 + }, + { + "epoch": 0.9915227629513343, + "grad_norm": 2.0490582959624812, + "learning_rate": 3.769124243332423e-09, + "loss": 0.9201, + "step": 9474 + }, + { + "epoch": 0.9916274201988488, + "grad_norm": 2.1895429135365436, + "learning_rate": 3.676639589232922e-09, + "loss": 0.8669, + "step": 9475 + }, + { + "epoch": 0.9917320774463632, + "grad_norm": 2.081764463908597, + "learning_rate": 3.5853035329513898e-09, + "loss": 0.7138, + "step": 9476 + }, + { + "epoch": 0.9918367346938776, + "grad_norm": 1.9405197361065825, + "learning_rate": 3.4951160849838738e-09, + "loss": 0.9535, + "step": 9477 + }, + { + "epoch": 0.991941391941392, + "grad_norm": 1.962384250829884, + "learning_rate": 3.4060772556931965e-09, + "loss": 0.8824, + "step": 9478 + }, + { + "epoch": 0.9920460491889064, + "grad_norm": 1.9074856383254064, + "learning_rate": 3.3181870553089523e-09, + "loss": 0.889, + "step": 9479 + }, + { + "epoch": 0.9921507064364207, + "grad_norm": 2.044982555109191, + "learning_rate": 3.2314454939297304e-09, + "loss": 0.8188, + "step": 9480 + }, + { + "epoch": 0.9922553636839351, + "grad_norm": 1.9272346116373464, + "learning_rate": 3.1458525815231122e-09, + "loss": 0.7799, + "step": 9481 + }, + { + "epoch": 0.9923600209314495, + "grad_norm": 1.8056929489369389, + "learning_rate": 3.0614083279245644e-09, + "loss": 0.7488, + "step": 9482 + }, + { + "epoch": 0.9924646781789639, + "grad_norm": 2.111478786571531, + "learning_rate": 2.9781127428352153e-09, + "loss": 0.9358, + "step": 9483 + }, + { + "epoch": 0.9925693354264783, + "grad_norm": 2.2187962376027532, + "learning_rate": 2.8959658358274077e-09, + "loss": 0.8856, + "step": 9484 + }, + { + "epoch": 0.9926739926739927, + "grad_norm": 2.250021426198808, + "learning_rate": 2.814967616339148e-09, + "loss": 0.9267, + "step": 9485 + }, + { + "epoch": 0.9927786499215071, + "grad_norm": 1.9471155417501587, + "learning_rate": 2.735118093678546e-09, + "loss": 0.8243, + "step": 9486 + }, + { + "epoch": 0.9928833071690214, + "grad_norm": 2.2292413033155447, + "learning_rate": 2.656417277018264e-09, + "loss": 0.9037, + "step": 9487 + }, + { + "epoch": 0.9929879644165358, + "grad_norm": 2.1438692135997863, + "learning_rate": 2.578865175402179e-09, + "loss": 0.92, + "step": 9488 + }, + { + "epoch": 0.9930926216640502, + "grad_norm": 2.0987549084951564, + "learning_rate": 2.5024617977420507e-09, + "loss": 0.866, + "step": 9489 + }, + { + "epoch": 0.9931972789115646, + "grad_norm": 2.1710980962999833, + "learning_rate": 2.427207152815303e-09, + "loss": 0.8711, + "step": 9490 + }, + { + "epoch": 0.993301936159079, + "grad_norm": 1.9976864946771278, + "learning_rate": 2.3531012492705728e-09, + "loss": 0.8194, + "step": 9491 + }, + { + "epoch": 0.9934065934065934, + "grad_norm": 2.0151148991749315, + "learning_rate": 2.2801440956210506e-09, + "loss": 0.8259, + "step": 9492 + }, + { + "epoch": 0.9935112506541078, + "grad_norm": 1.7788086796827887, + "learning_rate": 2.208335700251141e-09, + "loss": 0.6933, + "step": 9493 + }, + { + "epoch": 0.9936159079016222, + "grad_norm": 2.243955438948143, + "learning_rate": 2.13767607141091e-09, + "loss": 1.0067, + "step": 9494 + }, + { + "epoch": 0.9937205651491365, + "grad_norm": 2.138700100591113, + "learning_rate": 2.06816521721942e-09, + "loss": 0.9177, + "step": 9495 + }, + { + "epoch": 0.9938252223966509, + "grad_norm": 2.0913847556873537, + "learning_rate": 1.9998031456636147e-09, + "loss": 0.9114, + "step": 9496 + }, + { + "epoch": 0.9939298796441653, + "grad_norm": 1.658736570996112, + "learning_rate": 1.9325898645983223e-09, + "loss": 0.8015, + "step": 9497 + }, + { + "epoch": 0.9940345368916798, + "grad_norm": 2.1338090873686615, + "learning_rate": 1.866525381747364e-09, + "loss": 0.9249, + "step": 9498 + }, + { + "epoch": 0.9941391941391942, + "grad_norm": 2.469627726192965, + "learning_rate": 1.8016097047002246e-09, + "loss": 0.8704, + "step": 9499 + }, + { + "epoch": 0.9942438513867086, + "grad_norm": 2.1414411950046026, + "learning_rate": 1.7378428409164927e-09, + "loss": 0.8624, + "step": 9500 + }, + { + "epoch": 0.994348508634223, + "grad_norm": 2.042150186442989, + "learning_rate": 1.6752247977236401e-09, + "loss": 0.8937, + "step": 9501 + }, + { + "epoch": 0.9944531658817373, + "grad_norm": 1.9900829634578683, + "learning_rate": 1.6137555823159123e-09, + "loss": 0.9209, + "step": 9502 + }, + { + "epoch": 0.9945578231292517, + "grad_norm": 2.410591588659901, + "learning_rate": 1.5534352017565479e-09, + "loss": 0.8172, + "step": 9503 + }, + { + "epoch": 0.9946624803767661, + "grad_norm": 2.236434312692957, + "learning_rate": 1.4942636629766693e-09, + "loss": 0.8395, + "step": 9504 + }, + { + "epoch": 0.9947671376242805, + "grad_norm": 2.1401814250288265, + "learning_rate": 1.4362409727752823e-09, + "loss": 0.8297, + "step": 9505 + }, + { + "epoch": 0.9948717948717949, + "grad_norm": 1.648097635917843, + "learning_rate": 1.3793671378181662e-09, + "loss": 0.7285, + "step": 9506 + }, + { + "epoch": 0.9949764521193093, + "grad_norm": 2.3796168135285676, + "learning_rate": 1.3236421646412034e-09, + "loss": 0.8813, + "step": 9507 + }, + { + "epoch": 0.9950811093668237, + "grad_norm": 1.9707127190807336, + "learning_rate": 1.2690660596481609e-09, + "loss": 0.7346, + "step": 9508 + }, + { + "epoch": 0.995185766614338, + "grad_norm": 2.161094149096642, + "learning_rate": 1.2156388291084675e-09, + "loss": 0.8541, + "step": 9509 + }, + { + "epoch": 0.9952904238618524, + "grad_norm": 2.010479082385231, + "learning_rate": 1.1633604791605468e-09, + "loss": 0.8944, + "step": 9510 + }, + { + "epoch": 0.9953950811093668, + "grad_norm": 2.4630434927929925, + "learning_rate": 1.1122310158129257e-09, + "loss": 1.0294, + "step": 9511 + }, + { + "epoch": 0.9954997383568812, + "grad_norm": 2.254975083731318, + "learning_rate": 1.0622504449409043e-09, + "loss": 0.9957, + "step": 9512 + }, + { + "epoch": 0.9956043956043956, + "grad_norm": 2.4241701416533905, + "learning_rate": 1.013418772285446e-09, + "loss": 0.8659, + "step": 9513 + }, + { + "epoch": 0.99570905285191, + "grad_norm": 2.3939669658872362, + "learning_rate": 9.65736003457618e-10, + "loss": 0.7979, + "step": 9514 + }, + { + "epoch": 0.9958137100994244, + "grad_norm": 2.2441183666873323, + "learning_rate": 9.19202143937481e-10, + "loss": 0.7062, + "step": 9515 + }, + { + "epoch": 0.9959183673469387, + "grad_norm": 1.9769218010919176, + "learning_rate": 8.738171990707589e-10, + "loss": 0.8455, + "step": 9516 + }, + { + "epoch": 0.9960230245944531, + "grad_norm": 1.9141948632707264, + "learning_rate": 8.295811740732796e-10, + "loss": 0.7915, + "step": 9517 + }, + { + "epoch": 0.9961276818419675, + "grad_norm": 2.3415485768220994, + "learning_rate": 7.864940740276439e-10, + "loss": 0.9133, + "step": 9518 + }, + { + "epoch": 0.9962323390894819, + "grad_norm": 2.1401706902733837, + "learning_rate": 7.445559038832262e-10, + "loss": 0.8676, + "step": 9519 + }, + { + "epoch": 0.9963369963369964, + "grad_norm": 1.8282468400440854, + "learning_rate": 7.037666684606148e-10, + "loss": 0.9874, + "step": 9520 + }, + { + "epoch": 0.9964416535845108, + "grad_norm": 2.2683960849665543, + "learning_rate": 6.641263724460612e-10, + "loss": 0.7478, + "step": 9521 + }, + { + "epoch": 0.9965463108320252, + "grad_norm": 2.1668339291877157, + "learning_rate": 6.256350203948103e-10, + "loss": 0.8291, + "step": 9522 + }, + { + "epoch": 0.9966509680795395, + "grad_norm": 2.4253965330798546, + "learning_rate": 5.882926167277703e-10, + "loss": 0.8643, + "step": 9523 + }, + { + "epoch": 0.9967556253270539, + "grad_norm": 2.1583972769715265, + "learning_rate": 5.520991657370633e-10, + "loss": 0.8868, + "step": 9524 + }, + { + "epoch": 0.9968602825745683, + "grad_norm": 2.3240808973920046, + "learning_rate": 5.17054671581585e-10, + "loss": 0.8017, + "step": 9525 + }, + { + "epoch": 0.9969649398220827, + "grad_norm": 2.194269062168786, + "learning_rate": 4.831591382870038e-10, + "loss": 0.9831, + "step": 9526 + }, + { + "epoch": 0.9970695970695971, + "grad_norm": 2.0354807231276872, + "learning_rate": 4.504125697490924e-10, + "loss": 0.9727, + "step": 9527 + }, + { + "epoch": 0.9971742543171115, + "grad_norm": 2.063899720456543, + "learning_rate": 4.188149697303967e-10, + "loss": 0.9148, + "step": 9528 + }, + { + "epoch": 0.9972789115646259, + "grad_norm": 2.2768683287427196, + "learning_rate": 3.883663418602357e-10, + "loss": 0.8469, + "step": 9529 + }, + { + "epoch": 0.9973835688121402, + "grad_norm": 2.0468757802028033, + "learning_rate": 3.5906668963803235e-10, + "loss": 0.8325, + "step": 9530 + }, + { + "epoch": 0.9974882260596546, + "grad_norm": 2.051294044317142, + "learning_rate": 3.3091601643109314e-10, + "loss": 0.8025, + "step": 9531 + }, + { + "epoch": 0.997592883307169, + "grad_norm": 2.105439857515838, + "learning_rate": 3.039143254723875e-10, + "loss": 0.9892, + "step": 9532 + }, + { + "epoch": 0.9976975405546834, + "grad_norm": 2.220984101222805, + "learning_rate": 2.7806161986609905e-10, + "loss": 0.7738, + "step": 9533 + }, + { + "epoch": 0.9978021978021978, + "grad_norm": 2.115650631099441, + "learning_rate": 2.5335790258207426e-10, + "loss": 0.8181, + "step": 9534 + }, + { + "epoch": 0.9979068550497122, + "grad_norm": 2.1913243479989855, + "learning_rate": 2.298031764591535e-10, + "loss": 0.8192, + "step": 9535 + }, + { + "epoch": 0.9980115122972266, + "grad_norm": 1.9652396497864315, + "learning_rate": 2.0739744420295026e-10, + "loss": 0.8074, + "step": 9536 + }, + { + "epoch": 0.998116169544741, + "grad_norm": 2.4031821017278814, + "learning_rate": 1.8614070838918198e-10, + "loss": 0.9575, + "step": 9537 + }, + { + "epoch": 0.9982208267922553, + "grad_norm": 2.037609844489397, + "learning_rate": 1.6603297145922902e-10, + "loss": 0.8618, + "step": 9538 + }, + { + "epoch": 0.9983254840397697, + "grad_norm": 1.6984142714399961, + "learning_rate": 1.4707423572346556e-10, + "loss": 0.754, + "step": 9539 + }, + { + "epoch": 0.9984301412872841, + "grad_norm": 1.627184498513617, + "learning_rate": 1.292645033612594e-10, + "loss": 0.7935, + "step": 9540 + }, + { + "epoch": 0.9985347985347985, + "grad_norm": 1.6578736068172026, + "learning_rate": 1.1260377641764131e-10, + "loss": 0.7769, + "step": 9541 + }, + { + "epoch": 0.998639455782313, + "grad_norm": 2.0902509066149784, + "learning_rate": 9.709205680885624e-11, + "loss": 0.7798, + "step": 9542 + }, + { + "epoch": 0.9987441130298274, + "grad_norm": 2.1775821639631023, + "learning_rate": 8.272934631459173e-11, + "loss": 0.8777, + "step": 9543 + }, + { + "epoch": 0.9988487702773418, + "grad_norm": 2.2443525221661678, + "learning_rate": 6.951564658796983e-11, + "loss": 0.85, + "step": 9544 + }, + { + "epoch": 0.998953427524856, + "grad_norm": 1.8463156292838934, + "learning_rate": 5.745095914555521e-11, + "loss": 0.8024, + "step": 9545 + }, + { + "epoch": 0.9990580847723705, + "grad_norm": 2.2214610429811676, + "learning_rate": 4.6535285374016416e-11, + "loss": 0.843, + "step": 9546 + }, + { + "epoch": 0.9991627420198849, + "grad_norm": 2.1287397785084075, + "learning_rate": 3.676862652790547e-11, + "loss": 0.8161, + "step": 9547 + }, + { + "epoch": 0.9992673992673993, + "grad_norm": 2.1670560388230014, + "learning_rate": 2.8150983728547632e-11, + "loss": 0.9613, + "step": 9548 + }, + { + "epoch": 0.9993720565149137, + "grad_norm": 1.8271825583030723, + "learning_rate": 2.0682357966261834e-11, + "loss": 0.8962, + "step": 9549 + }, + { + "epoch": 0.9994767137624281, + "grad_norm": 1.847917867704652, + "learning_rate": 1.4362750100360701e-11, + "loss": 0.8912, + "step": 9550 + }, + { + "epoch": 0.9995813710099425, + "grad_norm": 1.540095392303899, + "learning_rate": 9.192160856930088e-12, + "loss": 0.6913, + "step": 9551 + }, + { + "epoch": 0.9996860282574568, + "grad_norm": 1.8041355621927853, + "learning_rate": 5.170590828829092e-12, + "loss": 0.7122, + "step": 9552 + }, + { + "epoch": 0.9997906855049712, + "grad_norm": 2.0773412293224243, + "learning_rate": 2.2980404779104904e-12, + "loss": 0.8123, + "step": 9553 + }, + { + "epoch": 0.9998953427524856, + "grad_norm": 2.2586840266436203, + "learning_rate": 5.745101361309679e-13, + "loss": 0.8988, + "step": 9554 + }, + { + "epoch": 1.0, + "grad_norm": 1.738276172232474, + "learning_rate": 0.0, + "loss": 0.7992, + "step": 9555 + }, + { + "epoch": 1.0, + "step": 9555, + "total_flos": 1276904626028544.0, + "train_loss": 0.9531744299042231, + "train_runtime": 26831.9909, + "train_samples_per_second": 22.79, + "train_steps_per_second": 0.356 + } + ], + "logging_steps": 1.0, + "max_steps": 9555, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1276904626028544.0, + "train_batch_size": 8, "trial_name": null, "trial_params": null }