{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 708, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002824858757062147, "grad_norm": 0.6147084683934281, "learning_rate": 9.999950776495983e-06, "loss": 0.1718, "step": 1 }, { "epoch": 0.005649717514124294, "grad_norm": 0.6665089926445491, "learning_rate": 9.99980310695311e-06, "loss": 0.1906, "step": 2 }, { "epoch": 0.00847457627118644, "grad_norm": 0.5879608524504671, "learning_rate": 9.99955699427891e-06, "loss": 0.1905, "step": 3 }, { "epoch": 0.011299435028248588, "grad_norm": 0.5728624077474995, "learning_rate": 9.999212443319191e-06, "loss": 0.1806, "step": 4 }, { "epoch": 0.014124293785310734, "grad_norm": 0.45407189292974287, "learning_rate": 9.998769460857955e-06, "loss": 0.1587, "step": 5 }, { "epoch": 0.01694915254237288, "grad_norm": 0.695598470337452, "learning_rate": 9.998228055617264e-06, "loss": 0.3073, "step": 6 }, { "epoch": 0.01977401129943503, "grad_norm": 0.4388502111835372, "learning_rate": 9.99758823825706e-06, "loss": 0.1658, "step": 7 }, { "epoch": 0.022598870056497175, "grad_norm": 0.506295761059801, "learning_rate": 9.996850021374969e-06, "loss": 0.1822, "step": 8 }, { "epoch": 0.025423728813559324, "grad_norm": 0.5054469922635372, "learning_rate": 9.996013419506035e-06, "loss": 0.1878, "step": 9 }, { "epoch": 0.02824858757062147, "grad_norm": 0.5236855115593358, "learning_rate": 9.99507844912245e-06, "loss": 0.164, "step": 10 }, { "epoch": 0.031073446327683617, "grad_norm": 0.6005034865395634, "learning_rate": 9.994045128633221e-06, "loss": 0.2148, "step": 11 }, { "epoch": 0.03389830508474576, "grad_norm": 0.39196864541665416, "learning_rate": 9.99291347838381e-06, "loss": 0.1439, "step": 12 }, { "epoch": 0.03672316384180791, "grad_norm": 0.3520974148515624, "learning_rate": 9.991683520655735e-06, "loss": 0.138, "step": 13 }, { "epoch": 0.03954802259887006, "grad_norm": 0.39891278345339304, "learning_rate": 9.990355279666124e-06, "loss": 0.1778, "step": 14 }, { "epoch": 0.0423728813559322, "grad_norm": 0.38823463524310226, "learning_rate": 9.988928781567251e-06, "loss": 0.1802, "step": 15 }, { "epoch": 0.04519774011299435, "grad_norm": 0.3456396477919156, "learning_rate": 9.987404054446009e-06, "loss": 0.1537, "step": 16 }, { "epoch": 0.0480225988700565, "grad_norm": 0.4433642134268957, "learning_rate": 9.98578112832336e-06, "loss": 0.2148, "step": 17 }, { "epoch": 0.05084745762711865, "grad_norm": 0.5978046488506871, "learning_rate": 9.984060035153752e-06, "loss": 0.1769, "step": 18 }, { "epoch": 0.05367231638418079, "grad_norm": 0.3537901451183181, "learning_rate": 9.982240808824477e-06, "loss": 0.1323, "step": 19 }, { "epoch": 0.05649717514124294, "grad_norm": 0.5417108653864326, "learning_rate": 9.980323485155013e-06, "loss": 0.1666, "step": 20 }, { "epoch": 0.059322033898305086, "grad_norm": 0.35289882003600165, "learning_rate": 9.978308101896318e-06, "loss": 0.1205, "step": 21 }, { "epoch": 0.062146892655367235, "grad_norm": 0.36004933523243315, "learning_rate": 9.97619469873008e-06, "loss": 0.1371, "step": 22 }, { "epoch": 0.06497175141242938, "grad_norm": 0.3854571301167785, "learning_rate": 9.973983317267944e-06, "loss": 0.1568, "step": 23 }, { "epoch": 0.06779661016949153, "grad_norm": 0.6487420228246645, "learning_rate": 9.971674001050687e-06, "loss": 0.1647, "step": 24 }, { "epoch": 0.07062146892655367, "grad_norm": 0.3854893336014017, "learning_rate": 9.969266795547364e-06, "loss": 0.1854, "step": 25 }, { "epoch": 0.07344632768361582, "grad_norm": 0.3832699272168604, "learning_rate": 9.96676174815441e-06, "loss": 0.1955, "step": 26 }, { "epoch": 0.07627118644067797, "grad_norm": 0.31760668668906533, "learning_rate": 9.964158908194708e-06, "loss": 0.1195, "step": 27 }, { "epoch": 0.07909604519774012, "grad_norm": 0.41114801745145746, "learning_rate": 9.961458326916624e-06, "loss": 0.1414, "step": 28 }, { "epoch": 0.08192090395480225, "grad_norm": 0.3240630544735708, "learning_rate": 9.958660057492982e-06, "loss": 0.1562, "step": 29 }, { "epoch": 0.0847457627118644, "grad_norm": 0.3317790911132787, "learning_rate": 9.955764155020037e-06, "loss": 0.1263, "step": 30 }, { "epoch": 0.08757062146892655, "grad_norm": 0.3730810477019189, "learning_rate": 9.952770676516372e-06, "loss": 0.1549, "step": 31 }, { "epoch": 0.0903954802259887, "grad_norm": 0.3594977608871381, "learning_rate": 9.94967968092179e-06, "loss": 0.1362, "step": 32 }, { "epoch": 0.09322033898305085, "grad_norm": 0.5395386567672124, "learning_rate": 9.946491229096143e-06, "loss": 0.1447, "step": 33 }, { "epoch": 0.096045197740113, "grad_norm": 0.34069006187102624, "learning_rate": 9.943205383818142e-06, "loss": 0.1602, "step": 34 }, { "epoch": 0.09887005649717515, "grad_norm": 0.40507986945355756, "learning_rate": 9.93982220978411e-06, "loss": 0.1727, "step": 35 }, { "epoch": 0.1016949152542373, "grad_norm": 0.3101237427572441, "learning_rate": 9.936341773606723e-06, "loss": 0.1328, "step": 36 }, { "epoch": 0.10451977401129943, "grad_norm": 0.32711902171143986, "learning_rate": 9.932764143813686e-06, "loss": 0.1377, "step": 37 }, { "epoch": 0.10734463276836158, "grad_norm": 0.3698266114267764, "learning_rate": 9.929089390846389e-06, "loss": 0.1593, "step": 38 }, { "epoch": 0.11016949152542373, "grad_norm": 0.36372051261110894, "learning_rate": 9.925317587058516e-06, "loss": 0.1238, "step": 39 }, { "epoch": 0.11299435028248588, "grad_norm": 0.36933238520645995, "learning_rate": 9.92144880671463e-06, "loss": 0.1678, "step": 40 }, { "epoch": 0.11581920903954802, "grad_norm": 0.3460461190525915, "learning_rate": 9.9174831259887e-06, "loss": 0.1578, "step": 41 }, { "epoch": 0.11864406779661017, "grad_norm": 0.5499266672914876, "learning_rate": 9.913420622962606e-06, "loss": 0.1437, "step": 42 }, { "epoch": 0.12146892655367232, "grad_norm": 0.4049849988270768, "learning_rate": 9.909261377624601e-06, "loss": 0.187, "step": 43 }, { "epoch": 0.12429378531073447, "grad_norm": 0.47235530071518683, "learning_rate": 9.90500547186774e-06, "loss": 0.1449, "step": 44 }, { "epoch": 0.1271186440677966, "grad_norm": 0.36137299735697387, "learning_rate": 9.900652989488255e-06, "loss": 0.1505, "step": 45 }, { "epoch": 0.12994350282485875, "grad_norm": 0.372928772406179, "learning_rate": 9.896204016183924e-06, "loss": 0.1984, "step": 46 }, { "epoch": 0.1327683615819209, "grad_norm": 0.31608553050802646, "learning_rate": 9.891658639552368e-06, "loss": 0.1233, "step": 47 }, { "epoch": 0.13559322033898305, "grad_norm": 0.34009496825621116, "learning_rate": 9.887016949089334e-06, "loss": 0.1533, "step": 48 }, { "epoch": 0.1384180790960452, "grad_norm": 0.3475244874058269, "learning_rate": 9.882279036186927e-06, "loss": 0.1348, "step": 49 }, { "epoch": 0.14124293785310735, "grad_norm": 0.3465923771954804, "learning_rate": 9.87744499413182e-06, "loss": 0.1211, "step": 50 }, { "epoch": 0.1440677966101695, "grad_norm": 0.4625874588181501, "learning_rate": 9.872514918103407e-06, "loss": 0.1316, "step": 51 }, { "epoch": 0.14689265536723164, "grad_norm": 0.4219123981937324, "learning_rate": 9.867488905171934e-06, "loss": 0.1499, "step": 52 }, { "epoch": 0.1497175141242938, "grad_norm": 0.28327838865809335, "learning_rate": 9.86236705429659e-06, "loss": 0.118, "step": 53 }, { "epoch": 0.15254237288135594, "grad_norm": 0.42996698854118004, "learning_rate": 9.85714946632355e-06, "loss": 0.1445, "step": 54 }, { "epoch": 0.1553672316384181, "grad_norm": 0.3158297884352036, "learning_rate": 9.851836243984005e-06, "loss": 0.1026, "step": 55 }, { "epoch": 0.15819209039548024, "grad_norm": 0.34425358123925115, "learning_rate": 9.846427491892117e-06, "loss": 0.1296, "step": 56 }, { "epoch": 0.16101694915254236, "grad_norm": 0.26454441870053136, "learning_rate": 9.840923316542984e-06, "loss": 0.0945, "step": 57 }, { "epoch": 0.1638418079096045, "grad_norm": 0.47330273293079517, "learning_rate": 9.835323826310522e-06, "loss": 0.1645, "step": 58 }, { "epoch": 0.16666666666666666, "grad_norm": 0.30609709369908644, "learning_rate": 9.829629131445342e-06, "loss": 0.122, "step": 59 }, { "epoch": 0.1694915254237288, "grad_norm": 0.4330091411356326, "learning_rate": 9.823839344072582e-06, "loss": 0.1494, "step": 60 }, { "epoch": 0.17231638418079095, "grad_norm": 0.42693180952210663, "learning_rate": 9.817954578189686e-06, "loss": 0.1235, "step": 61 }, { "epoch": 0.1751412429378531, "grad_norm": 0.3324864445850298, "learning_rate": 9.811974949664176e-06, "loss": 0.1327, "step": 62 }, { "epoch": 0.17796610169491525, "grad_norm": 0.41054779677427783, "learning_rate": 9.805900576231358e-06, "loss": 0.1741, "step": 63 }, { "epoch": 0.1807909604519774, "grad_norm": 0.33830483714924636, "learning_rate": 9.79973157749201e-06, "loss": 0.1265, "step": 64 }, { "epoch": 0.18361581920903955, "grad_norm": 0.3152187519500513, "learning_rate": 9.793468074910028e-06, "loss": 0.1202, "step": 65 }, { "epoch": 0.1864406779661017, "grad_norm": 0.368592840409881, "learning_rate": 9.787110191810027e-06, "loss": 0.1292, "step": 66 }, { "epoch": 0.18926553672316385, "grad_norm": 0.3471829002234472, "learning_rate": 9.780658053374923e-06, "loss": 0.1532, "step": 67 }, { "epoch": 0.192090395480226, "grad_norm": 0.3461593726912688, "learning_rate": 9.77411178664346e-06, "loss": 0.1352, "step": 68 }, { "epoch": 0.19491525423728814, "grad_norm": 0.647720230993773, "learning_rate": 9.767471520507713e-06, "loss": 0.1291, "step": 69 }, { "epoch": 0.1977401129943503, "grad_norm": 0.31371308918917606, "learning_rate": 9.760737385710546e-06, "loss": 0.1363, "step": 70 }, { "epoch": 0.20056497175141244, "grad_norm": 0.36616052339629535, "learning_rate": 9.753909514843047e-06, "loss": 0.163, "step": 71 }, { "epoch": 0.2033898305084746, "grad_norm": 0.3366775524198522, "learning_rate": 9.746988042341907e-06, "loss": 0.1211, "step": 72 }, { "epoch": 0.2062146892655367, "grad_norm": 0.32238872392227863, "learning_rate": 9.739973104486777e-06, "loss": 0.1244, "step": 73 }, { "epoch": 0.20903954802259886, "grad_norm": 0.309699834626684, "learning_rate": 9.732864839397585e-06, "loss": 0.1123, "step": 74 }, { "epoch": 0.211864406779661, "grad_norm": 0.40415497402853257, "learning_rate": 9.725663387031818e-06, "loss": 0.1185, "step": 75 }, { "epoch": 0.21468926553672316, "grad_norm": 0.43571692379160615, "learning_rate": 9.718368889181763e-06, "loss": 0.1205, "step": 76 }, { "epoch": 0.2175141242937853, "grad_norm": 0.3220023585216405, "learning_rate": 9.710981489471721e-06, "loss": 0.1513, "step": 77 }, { "epoch": 0.22033898305084745, "grad_norm": 0.41630953777961766, "learning_rate": 9.703501333355167e-06, "loss": 0.1249, "step": 78 }, { "epoch": 0.2231638418079096, "grad_norm": 0.5019640716602216, "learning_rate": 9.6959285681119e-06, "loss": 0.1914, "step": 79 }, { "epoch": 0.22598870056497175, "grad_norm": 0.4023458378359013, "learning_rate": 9.68826334284514e-06, "loss": 0.1346, "step": 80 }, { "epoch": 0.2288135593220339, "grad_norm": 0.34337841648791245, "learning_rate": 9.680505808478583e-06, "loss": 0.1272, "step": 81 }, { "epoch": 0.23163841807909605, "grad_norm": 0.32382210914063797, "learning_rate": 9.672656117753435e-06, "loss": 0.1155, "step": 82 }, { "epoch": 0.2344632768361582, "grad_norm": 0.4098493186152155, "learning_rate": 9.664714425225414e-06, "loss": 0.155, "step": 83 }, { "epoch": 0.23728813559322035, "grad_norm": 0.30680580576204597, "learning_rate": 9.656680887261693e-06, "loss": 0.1268, "step": 84 }, { "epoch": 0.2401129943502825, "grad_norm": 0.34530888636266843, "learning_rate": 9.648555662037826e-06, "loss": 0.1255, "step": 85 }, { "epoch": 0.24293785310734464, "grad_norm": 0.31982112281364816, "learning_rate": 9.640338909534636e-06, "loss": 0.1187, "step": 86 }, { "epoch": 0.2457627118644068, "grad_norm": 0.3249369835609272, "learning_rate": 9.632030791535063e-06, "loss": 0.1094, "step": 87 }, { "epoch": 0.24858757062146894, "grad_norm": 0.4202492399280893, "learning_rate": 9.62363147162098e-06, "loss": 0.1473, "step": 88 }, { "epoch": 0.2514124293785311, "grad_norm": 0.4206786168286477, "learning_rate": 9.615141115169968e-06, "loss": 0.1321, "step": 89 }, { "epoch": 0.2542372881355932, "grad_norm": 0.35455170569024247, "learning_rate": 9.606559889352065e-06, "loss": 0.1333, "step": 90 }, { "epoch": 0.2570621468926554, "grad_norm": 0.30654274314933255, "learning_rate": 9.597887963126476e-06, "loss": 0.1324, "step": 91 }, { "epoch": 0.2598870056497175, "grad_norm": 0.31426903973238157, "learning_rate": 9.589125507238234e-06, "loss": 0.0965, "step": 92 }, { "epoch": 0.2627118644067797, "grad_norm": 0.39888445390797206, "learning_rate": 9.580272694214855e-06, "loss": 0.1795, "step": 93 }, { "epoch": 0.2655367231638418, "grad_norm": 0.32688334776107986, "learning_rate": 9.571329698362931e-06, "loss": 0.0993, "step": 94 }, { "epoch": 0.268361581920904, "grad_norm": 0.37334590333180434, "learning_rate": 9.562296695764695e-06, "loss": 0.1201, "step": 95 }, { "epoch": 0.2711864406779661, "grad_norm": 0.3189896682514011, "learning_rate": 9.553173864274567e-06, "loss": 0.1261, "step": 96 }, { "epoch": 0.2740112994350282, "grad_norm": 0.34974753780587814, "learning_rate": 9.543961383515638e-06, "loss": 0.1476, "step": 97 }, { "epoch": 0.2768361581920904, "grad_norm": 0.3593944813062625, "learning_rate": 9.53465943487614e-06, "loss": 0.123, "step": 98 }, { "epoch": 0.2796610169491525, "grad_norm": 0.35812916144207474, "learning_rate": 9.52526820150588e-06, "loss": 0.1257, "step": 99 }, { "epoch": 0.2824858757062147, "grad_norm": 0.32235620707593265, "learning_rate": 9.51578786831262e-06, "loss": 0.1493, "step": 100 }, { "epoch": 0.2853107344632768, "grad_norm": 0.37142164119731536, "learning_rate": 9.506218621958448e-06, "loss": 0.1278, "step": 101 }, { "epoch": 0.288135593220339, "grad_norm": 0.44435482682594635, "learning_rate": 9.496560650856097e-06, "loss": 0.1443, "step": 102 }, { "epoch": 0.2909604519774011, "grad_norm": 0.3566707987782221, "learning_rate": 9.486814145165242e-06, "loss": 0.1122, "step": 103 }, { "epoch": 0.2937853107344633, "grad_norm": 0.33349931081864886, "learning_rate": 9.476979296788746e-06, "loss": 0.1185, "step": 104 }, { "epoch": 0.2966101694915254, "grad_norm": 0.3196942221194822, "learning_rate": 9.467056299368888e-06, "loss": 0.1405, "step": 105 }, { "epoch": 0.2994350282485876, "grad_norm": 0.300108694500991, "learning_rate": 9.457045348283552e-06, "loss": 0.1032, "step": 106 }, { "epoch": 0.3022598870056497, "grad_norm": 0.2628183218575623, "learning_rate": 9.446946640642372e-06, "loss": 0.0891, "step": 107 }, { "epoch": 0.3050847457627119, "grad_norm": 0.3173943921412448, "learning_rate": 9.436760375282858e-06, "loss": 0.1389, "step": 108 }, { "epoch": 0.307909604519774, "grad_norm": 0.3374023308082592, "learning_rate": 9.426486752766481e-06, "loss": 0.1433, "step": 109 }, { "epoch": 0.3107344632768362, "grad_norm": 0.32296890768155634, "learning_rate": 9.416125975374722e-06, "loss": 0.1489, "step": 110 }, { "epoch": 0.3135593220338983, "grad_norm": 0.809125754824113, "learning_rate": 9.405678247105083e-06, "loss": 0.1197, "step": 111 }, { "epoch": 0.3163841807909605, "grad_norm": 0.2899410860114942, "learning_rate": 9.395143773667089e-06, "loss": 0.1101, "step": 112 }, { "epoch": 0.3192090395480226, "grad_norm": 0.35261776609510237, "learning_rate": 9.38452276247821e-06, "loss": 0.15, "step": 113 }, { "epoch": 0.3220338983050847, "grad_norm": 0.40136031754220597, "learning_rate": 9.373815422659806e-06, "loss": 0.1822, "step": 114 }, { "epoch": 0.3248587570621469, "grad_norm": 0.313017574757418, "learning_rate": 9.363021965032993e-06, "loss": 0.1188, "step": 115 }, { "epoch": 0.327683615819209, "grad_norm": 0.373452387713479, "learning_rate": 9.352142602114487e-06, "loss": 0.137, "step": 116 }, { "epoch": 0.3305084745762712, "grad_norm": 0.3420346614883704, "learning_rate": 9.341177548112437e-06, "loss": 0.1344, "step": 117 }, { "epoch": 0.3333333333333333, "grad_norm": 0.4036780338239374, "learning_rate": 9.330127018922195e-06, "loss": 0.133, "step": 118 }, { "epoch": 0.3361581920903955, "grad_norm": 0.32756640437501205, "learning_rate": 9.318991232122065e-06, "loss": 0.1321, "step": 119 }, { "epoch": 0.3389830508474576, "grad_norm": 0.345171287549607, "learning_rate": 9.307770406969032e-06, "loss": 0.1202, "step": 120 }, { "epoch": 0.3418079096045198, "grad_norm": 0.437595297190657, "learning_rate": 9.296464764394422e-06, "loss": 0.1824, "step": 121 }, { "epoch": 0.3446327683615819, "grad_norm": 0.2838090630299304, "learning_rate": 9.285074526999577e-06, "loss": 0.114, "step": 122 }, { "epoch": 0.3474576271186441, "grad_norm": 0.33725871702683274, "learning_rate": 9.273599919051452e-06, "loss": 0.1254, "step": 123 }, { "epoch": 0.3502824858757062, "grad_norm": 0.2980485886022378, "learning_rate": 9.262041166478215e-06, "loss": 0.1139, "step": 124 }, { "epoch": 0.3531073446327684, "grad_norm": 0.35961892985059185, "learning_rate": 9.250398496864782e-06, "loss": 0.114, "step": 125 }, { "epoch": 0.3559322033898305, "grad_norm": 0.44215669470603974, "learning_rate": 9.238672139448354e-06, "loss": 0.1133, "step": 126 }, { "epoch": 0.3587570621468927, "grad_norm": 0.4377017180308674, "learning_rate": 9.226862325113894e-06, "loss": 0.1438, "step": 127 }, { "epoch": 0.3615819209039548, "grad_norm": 0.3248130129102471, "learning_rate": 9.214969286389577e-06, "loss": 0.1079, "step": 128 }, { "epoch": 0.3644067796610169, "grad_norm": 0.3687271690510821, "learning_rate": 9.202993257442216e-06, "loss": 0.1576, "step": 129 }, { "epoch": 0.3672316384180791, "grad_norm": 0.31854432765060964, "learning_rate": 9.190934474072658e-06, "loss": 0.1128, "step": 130 }, { "epoch": 0.3700564971751412, "grad_norm": 0.43632199490503926, "learning_rate": 9.178793173711133e-06, "loss": 0.16, "step": 131 }, { "epoch": 0.3728813559322034, "grad_norm": 0.3184995213048756, "learning_rate": 9.166569595412576e-06, "loss": 0.1087, "step": 132 }, { "epoch": 0.3757062146892655, "grad_norm": 0.2852410374038083, "learning_rate": 9.154263979851932e-06, "loss": 0.1017, "step": 133 }, { "epoch": 0.3785310734463277, "grad_norm": 0.4894599992720483, "learning_rate": 9.141876569319405e-06, "loss": 0.1338, "step": 134 }, { "epoch": 0.3813559322033898, "grad_norm": 0.3025442389978783, "learning_rate": 9.129407607715697e-06, "loss": 0.1029, "step": 135 }, { "epoch": 0.384180790960452, "grad_norm": 0.3362331028333517, "learning_rate": 9.116857340547203e-06, "loss": 0.1171, "step": 136 }, { "epoch": 0.3870056497175141, "grad_norm": 0.3058727798799934, "learning_rate": 9.104226014921171e-06, "loss": 0.1242, "step": 137 }, { "epoch": 0.3898305084745763, "grad_norm": 0.32225074355114075, "learning_rate": 9.091513879540845e-06, "loss": 0.1199, "step": 138 }, { "epoch": 0.3926553672316384, "grad_norm": 0.4160727630117183, "learning_rate": 9.078721184700565e-06, "loss": 0.1703, "step": 139 }, { "epoch": 0.3954802259887006, "grad_norm": 0.3825137129544069, "learning_rate": 9.065848182280835e-06, "loss": 0.1417, "step": 140 }, { "epoch": 0.3983050847457627, "grad_norm": 0.3109793025220252, "learning_rate": 9.05289512574337e-06, "loss": 0.1079, "step": 141 }, { "epoch": 0.4011299435028249, "grad_norm": 0.32403338158384704, "learning_rate": 9.039862270126102e-06, "loss": 0.1304, "step": 142 }, { "epoch": 0.403954802259887, "grad_norm": 0.333516403508159, "learning_rate": 9.026749872038161e-06, "loss": 0.1215, "step": 143 }, { "epoch": 0.4067796610169492, "grad_norm": 0.5512785845644815, "learning_rate": 9.013558189654819e-06, "loss": 0.119, "step": 144 }, { "epoch": 0.4096045197740113, "grad_norm": 0.3342692670079559, "learning_rate": 9.000287482712407e-06, "loss": 0.1327, "step": 145 }, { "epoch": 0.4124293785310734, "grad_norm": 0.3844315934040279, "learning_rate": 8.986938012503203e-06, "loss": 0.1354, "step": 146 }, { "epoch": 0.4152542372881356, "grad_norm": 0.3704715200009876, "learning_rate": 8.973510041870287e-06, "loss": 0.116, "step": 147 }, { "epoch": 0.4180790960451977, "grad_norm": 0.3731805906895666, "learning_rate": 8.960003835202369e-06, "loss": 0.1447, "step": 148 }, { "epoch": 0.4209039548022599, "grad_norm": 0.3074985687330806, "learning_rate": 8.946419658428573e-06, "loss": 0.0944, "step": 149 }, { "epoch": 0.423728813559322, "grad_norm": 0.43341288159333946, "learning_rate": 8.932757779013214e-06, "loss": 0.1266, "step": 150 }, { "epoch": 0.4265536723163842, "grad_norm": 0.336284725389875, "learning_rate": 8.919018465950517e-06, "loss": 0.1398, "step": 151 }, { "epoch": 0.4293785310734463, "grad_norm": 0.35240140744565995, "learning_rate": 8.90520198975934e-06, "loss": 0.161, "step": 152 }, { "epoch": 0.4322033898305085, "grad_norm": 0.33406617309219283, "learning_rate": 8.89130862247783e-06, "loss": 0.1452, "step": 153 }, { "epoch": 0.4350282485875706, "grad_norm": 0.4037611933501478, "learning_rate": 8.877338637658074e-06, "loss": 0.1355, "step": 154 }, { "epoch": 0.4378531073446328, "grad_norm": 0.3475789016676025, "learning_rate": 8.863292310360716e-06, "loss": 0.1298, "step": 155 }, { "epoch": 0.4406779661016949, "grad_norm": 0.4841263150458153, "learning_rate": 8.849169917149532e-06, "loss": 0.1207, "step": 156 }, { "epoch": 0.4435028248587571, "grad_norm": 0.30177409440633907, "learning_rate": 8.834971736085995e-06, "loss": 0.1092, "step": 157 }, { "epoch": 0.4463276836158192, "grad_norm": 0.3405364099196524, "learning_rate": 8.820698046723796e-06, "loss": 0.1196, "step": 158 }, { "epoch": 0.4491525423728814, "grad_norm": 0.3399165082210464, "learning_rate": 8.806349130103334e-06, "loss": 0.1215, "step": 159 }, { "epoch": 0.4519774011299435, "grad_norm": 0.30309252729845493, "learning_rate": 8.791925268746193e-06, "loss": 0.133, "step": 160 }, { "epoch": 0.4548022598870056, "grad_norm": 0.4127573259391668, "learning_rate": 8.777426746649571e-06, "loss": 0.1131, "step": 161 }, { "epoch": 0.4576271186440678, "grad_norm": 0.3210525339974507, "learning_rate": 8.762853849280692e-06, "loss": 0.1097, "step": 162 }, { "epoch": 0.4604519774011299, "grad_norm": 0.38978996569997443, "learning_rate": 8.748206863571188e-06, "loss": 0.1259, "step": 163 }, { "epoch": 0.4632768361581921, "grad_norm": 0.3200510621145131, "learning_rate": 8.73348607791144e-06, "loss": 0.1028, "step": 164 }, { "epoch": 0.4661016949152542, "grad_norm": 0.5781086641593365, "learning_rate": 8.718691782144908e-06, "loss": 0.1504, "step": 165 }, { "epoch": 0.4689265536723164, "grad_norm": 0.3161434045596249, "learning_rate": 8.703824267562424e-06, "loss": 0.1341, "step": 166 }, { "epoch": 0.4717514124293785, "grad_norm": 0.2920424513956196, "learning_rate": 8.688883826896458e-06, "loss": 0.0985, "step": 167 }, { "epoch": 0.4745762711864407, "grad_norm": 0.3862783048815386, "learning_rate": 8.673870754315336e-06, "loss": 0.1352, "step": 168 }, { "epoch": 0.4774011299435028, "grad_norm": 0.4426996657474187, "learning_rate": 8.658785345417484e-06, "loss": 0.1414, "step": 169 }, { "epoch": 0.480225988700565, "grad_norm": 0.461223196307408, "learning_rate": 8.64362789722557e-06, "loss": 0.1397, "step": 170 }, { "epoch": 0.4830508474576271, "grad_norm": 0.3236362093117533, "learning_rate": 8.62839870818068e-06, "loss": 0.1185, "step": 171 }, { "epoch": 0.4858757062146893, "grad_norm": 0.5017393956700863, "learning_rate": 8.613098078136436e-06, "loss": 0.1301, "step": 172 }, { "epoch": 0.4887005649717514, "grad_norm": 0.3476632540894821, "learning_rate": 8.597726308353085e-06, "loss": 0.1265, "step": 173 }, { "epoch": 0.4915254237288136, "grad_norm": 0.38990017498501656, "learning_rate": 8.582283701491576e-06, "loss": 0.152, "step": 174 }, { "epoch": 0.4943502824858757, "grad_norm": 0.3190826675735708, "learning_rate": 8.566770561607598e-06, "loss": 0.1281, "step": 175 }, { "epoch": 0.4971751412429379, "grad_norm": 0.3967141406957403, "learning_rate": 8.551187194145591e-06, "loss": 0.1546, "step": 176 }, { "epoch": 0.5, "grad_norm": 0.3328836982483565, "learning_rate": 8.535533905932739e-06, "loss": 0.1314, "step": 177 }, { "epoch": 0.5028248587570622, "grad_norm": 0.374457685382645, "learning_rate": 8.519811005172916e-06, "loss": 0.1165, "step": 178 }, { "epoch": 0.5056497175141242, "grad_norm": 0.363385614117873, "learning_rate": 8.50401880144063e-06, "loss": 0.1226, "step": 179 }, { "epoch": 0.5084745762711864, "grad_norm": 0.32620237927339973, "learning_rate": 8.488157605674924e-06, "loss": 0.1203, "step": 180 }, { "epoch": 0.5112994350282486, "grad_norm": 0.2961670016082417, "learning_rate": 8.472227730173252e-06, "loss": 0.104, "step": 181 }, { "epoch": 0.5141242937853108, "grad_norm": 0.3350355060825277, "learning_rate": 8.456229488585328e-06, "loss": 0.1136, "step": 182 }, { "epoch": 0.5169491525423728, "grad_norm": 0.3540976396926151, "learning_rate": 8.440163195906959e-06, "loss": 0.0946, "step": 183 }, { "epoch": 0.519774011299435, "grad_norm": 0.33467249667837795, "learning_rate": 8.424029168473829e-06, "loss": 0.1341, "step": 184 }, { "epoch": 0.5225988700564972, "grad_norm": 0.3582483372686221, "learning_rate": 8.407827723955287e-06, "loss": 0.101, "step": 185 }, { "epoch": 0.5254237288135594, "grad_norm": 0.3871399637156461, "learning_rate": 8.391559181348081e-06, "loss": 0.1368, "step": 186 }, { "epoch": 0.5282485875706214, "grad_norm": 0.44494561948427913, "learning_rate": 8.375223860970078e-06, "loss": 0.1542, "step": 187 }, { "epoch": 0.5310734463276836, "grad_norm": 0.342778980721638, "learning_rate": 8.358822084453964e-06, "loss": 0.1343, "step": 188 }, { "epoch": 0.5338983050847458, "grad_norm": 0.3190954383119015, "learning_rate": 8.342354174740904e-06, "loss": 0.1217, "step": 189 }, { "epoch": 0.536723163841808, "grad_norm": 0.32918038764737034, "learning_rate": 8.325820456074181e-06, "loss": 0.1158, "step": 190 }, { "epoch": 0.53954802259887, "grad_norm": 0.3879838615458632, "learning_rate": 8.309221253992825e-06, "loss": 0.1256, "step": 191 }, { "epoch": 0.5423728813559322, "grad_norm": 0.35618247459755487, "learning_rate": 8.292556895325195e-06, "loss": 0.1099, "step": 192 }, { "epoch": 0.5451977401129944, "grad_norm": 0.3234827304096444, "learning_rate": 8.275827708182536e-06, "loss": 0.1284, "step": 193 }, { "epoch": 0.5480225988700564, "grad_norm": 0.34766152172703946, "learning_rate": 8.259034021952537e-06, "loss": 0.1356, "step": 194 }, { "epoch": 0.5508474576271186, "grad_norm": 0.36732776065701966, "learning_rate": 8.242176167292827e-06, "loss": 0.1151, "step": 195 }, { "epoch": 0.5536723163841808, "grad_norm": 0.33889281731020826, "learning_rate": 8.225254476124479e-06, "loss": 0.1063, "step": 196 }, { "epoch": 0.556497175141243, "grad_norm": 0.3193880715488108, "learning_rate": 8.208269281625466e-06, "loss": 0.1276, "step": 197 }, { "epoch": 0.559322033898305, "grad_norm": 0.3204272104884364, "learning_rate": 8.191220918224102e-06, "loss": 0.1223, "step": 198 }, { "epoch": 0.5621468926553672, "grad_norm": 0.2837424255636162, "learning_rate": 8.174109721592463e-06, "loss": 0.1004, "step": 199 }, { "epoch": 0.5649717514124294, "grad_norm": 0.37977501376826955, "learning_rate": 8.156936028639768e-06, "loss": 0.1343, "step": 200 }, { "epoch": 0.5677966101694916, "grad_norm": 0.3802933186191023, "learning_rate": 8.13970017750576e-06, "loss": 0.1229, "step": 201 }, { "epoch": 0.5706214689265536, "grad_norm": 0.3336231226315229, "learning_rate": 8.12240250755403e-06, "loss": 0.1071, "step": 202 }, { "epoch": 0.5734463276836158, "grad_norm": 0.31611617409721354, "learning_rate": 8.10504335936535e-06, "loss": 0.1315, "step": 203 }, { "epoch": 0.576271186440678, "grad_norm": 0.37986304879280974, "learning_rate": 8.08762307473096e-06, "loss": 0.1488, "step": 204 }, { "epoch": 0.5790960451977402, "grad_norm": 0.47382310798593613, "learning_rate": 8.07014199664584e-06, "loss": 0.1199, "step": 205 }, { "epoch": 0.5819209039548022, "grad_norm": 0.328423980785905, "learning_rate": 8.052600469301958e-06, "loss": 0.1094, "step": 206 }, { "epoch": 0.5847457627118644, "grad_norm": 0.36866153232988097, "learning_rate": 8.03499883808149e-06, "loss": 0.1215, "step": 207 }, { "epoch": 0.5875706214689266, "grad_norm": 0.4258404561855114, "learning_rate": 8.01733744955002e-06, "loss": 0.1728, "step": 208 }, { "epoch": 0.5903954802259888, "grad_norm": 0.4379066754162319, "learning_rate": 7.999616651449722e-06, "loss": 0.1334, "step": 209 }, { "epoch": 0.5932203389830508, "grad_norm": 0.3298246566080818, "learning_rate": 7.981836792692508e-06, "loss": 0.1321, "step": 210 }, { "epoch": 0.596045197740113, "grad_norm": 0.3213552929669426, "learning_rate": 7.963998223353154e-06, "loss": 0.1475, "step": 211 }, { "epoch": 0.5988700564971752, "grad_norm": 0.6800071694421395, "learning_rate": 7.946101294662418e-06, "loss": 0.1521, "step": 212 }, { "epoch": 0.6016949152542372, "grad_norm": 0.31394438576238753, "learning_rate": 7.928146359000117e-06, "loss": 0.1269, "step": 213 }, { "epoch": 0.6045197740112994, "grad_norm": 0.34115929397266076, "learning_rate": 7.91013376988819e-06, "loss": 0.1079, "step": 214 }, { "epoch": 0.6073446327683616, "grad_norm": 0.3070888783690557, "learning_rate": 7.892063881983736e-06, "loss": 0.1037, "step": 215 }, { "epoch": 0.6101694915254238, "grad_norm": 0.35233334420808005, "learning_rate": 7.873937051072037e-06, "loss": 0.1291, "step": 216 }, { "epoch": 0.6129943502824858, "grad_norm": 0.37926219175637094, "learning_rate": 7.855753634059543e-06, "loss": 0.1997, "step": 217 }, { "epoch": 0.615819209039548, "grad_norm": 0.281725523224709, "learning_rate": 7.83751398896686e-06, "loss": 0.0946, "step": 218 }, { "epoch": 0.6186440677966102, "grad_norm": 0.33366851056170144, "learning_rate": 7.81921847492168e-06, "loss": 0.1355, "step": 219 }, { "epoch": 0.6214689265536724, "grad_norm": 0.4693528685955929, "learning_rate": 7.80086745215173e-06, "loss": 0.1235, "step": 220 }, { "epoch": 0.6242937853107344, "grad_norm": 0.348566291026797, "learning_rate": 7.782461281977668e-06, "loss": 0.1463, "step": 221 }, { "epoch": 0.6271186440677966, "grad_norm": 0.3432846241066091, "learning_rate": 7.764000326805967e-06, "loss": 0.1253, "step": 222 }, { "epoch": 0.6299435028248588, "grad_norm": 0.3907373096790576, "learning_rate": 7.74548495012179e-06, "loss": 0.1391, "step": 223 }, { "epoch": 0.632768361581921, "grad_norm": 0.3728412944790509, "learning_rate": 7.726915516481824e-06, "loss": 0.122, "step": 224 }, { "epoch": 0.635593220338983, "grad_norm": 0.5220133391248876, "learning_rate": 7.708292391507105e-06, "loss": 0.154, "step": 225 }, { "epoch": 0.6384180790960452, "grad_norm": 0.4146492597116334, "learning_rate": 7.68961594187582e-06, "loss": 0.1359, "step": 226 }, { "epoch": 0.6412429378531074, "grad_norm": 0.31397589485063837, "learning_rate": 7.670886535316086e-06, "loss": 0.1219, "step": 227 }, { "epoch": 0.6440677966101694, "grad_norm": 0.3176441428020906, "learning_rate": 7.652104540598712e-06, "loss": 0.1178, "step": 228 }, { "epoch": 0.6468926553672316, "grad_norm": 0.4044788848890745, "learning_rate": 7.633270327529936e-06, "loss": 0.0976, "step": 229 }, { "epoch": 0.6497175141242938, "grad_norm": 0.4182319443410884, "learning_rate": 7.614384266944139e-06, "loss": 0.1645, "step": 230 }, { "epoch": 0.652542372881356, "grad_norm": 0.456563352937022, "learning_rate": 7.595446730696554e-06, "loss": 0.1382, "step": 231 }, { "epoch": 0.655367231638418, "grad_norm": 0.32903524423468056, "learning_rate": 7.5764580916559405e-06, "loss": 0.1326, "step": 232 }, { "epoch": 0.6581920903954802, "grad_norm": 0.28365921307195663, "learning_rate": 7.5574187236972344e-06, "loss": 0.0935, "step": 233 }, { "epoch": 0.6610169491525424, "grad_norm": 0.33982089630336487, "learning_rate": 7.5383290016942e-06, "loss": 0.1454, "step": 234 }, { "epoch": 0.6638418079096046, "grad_norm": 0.3772064426329603, "learning_rate": 7.519189301512042e-06, "loss": 0.1138, "step": 235 }, { "epoch": 0.6666666666666666, "grad_norm": 0.32618022313934375, "learning_rate": 7.500000000000001e-06, "loss": 0.1334, "step": 236 }, { "epoch": 0.6694915254237288, "grad_norm": 0.45333599138089803, "learning_rate": 7.480761474983943e-06, "loss": 0.1124, "step": 237 }, { "epoch": 0.672316384180791, "grad_norm": 0.2925984327006227, "learning_rate": 7.461474105258911e-06, "loss": 0.1186, "step": 238 }, { "epoch": 0.6751412429378532, "grad_norm": 0.29274180541284806, "learning_rate": 7.442138270581676e-06, "loss": 0.1152, "step": 239 }, { "epoch": 0.6779661016949152, "grad_norm": 0.31809376762962216, "learning_rate": 7.422754351663252e-06, "loss": 0.1305, "step": 240 }, { "epoch": 0.6807909604519774, "grad_norm": 0.38676571872596516, "learning_rate": 7.403322730161402e-06, "loss": 0.1282, "step": 241 }, { "epoch": 0.6836158192090396, "grad_norm": 0.4018879090622138, "learning_rate": 7.3838437886731264e-06, "loss": 0.1183, "step": 242 }, { "epoch": 0.6864406779661016, "grad_norm": 0.3776316331740439, "learning_rate": 7.364317910727128e-06, "loss": 0.1222, "step": 243 }, { "epoch": 0.6892655367231638, "grad_norm": 0.3824776982755315, "learning_rate": 7.3447454807762565e-06, "loss": 0.1428, "step": 244 }, { "epoch": 0.692090395480226, "grad_norm": 0.3047645047687849, "learning_rate": 7.325126884189948e-06, "loss": 0.1385, "step": 245 }, { "epoch": 0.6949152542372882, "grad_norm": 0.3314073663562561, "learning_rate": 7.30546250724663e-06, "loss": 0.1206, "step": 246 }, { "epoch": 0.6977401129943502, "grad_norm": 0.4400078921981207, "learning_rate": 7.285752737126117e-06, "loss": 0.1327, "step": 247 }, { "epoch": 0.7005649717514124, "grad_norm": 0.4934623443358996, "learning_rate": 7.265997961901987e-06, "loss": 0.1564, "step": 248 }, { "epoch": 0.7033898305084746, "grad_norm": 0.30484309715193525, "learning_rate": 7.246198570533944e-06, "loss": 0.1242, "step": 249 }, { "epoch": 0.7062146892655368, "grad_norm": 0.29354477145041785, "learning_rate": 7.226354952860157e-06, "loss": 0.1149, "step": 250 }, { "epoch": 0.7090395480225988, "grad_norm": 0.3505364073788135, "learning_rate": 7.206467499589584e-06, "loss": 0.1087, "step": 251 }, { "epoch": 0.711864406779661, "grad_norm": 0.29464691426886963, "learning_rate": 7.186536602294278e-06, "loss": 0.1142, "step": 252 }, { "epoch": 0.7146892655367232, "grad_norm": 0.39810478648654263, "learning_rate": 7.166562653401681e-06, "loss": 0.1723, "step": 253 }, { "epoch": 0.7175141242937854, "grad_norm": 0.6996654555301565, "learning_rate": 7.146546046186893e-06, "loss": 0.1509, "step": 254 }, { "epoch": 0.7203389830508474, "grad_norm": 0.43941978897146655, "learning_rate": 7.126487174764936e-06, "loss": 0.1214, "step": 255 }, { "epoch": 0.7231638418079096, "grad_norm": 0.8711552264444874, "learning_rate": 7.106386434082979e-06, "loss": 0.1814, "step": 256 }, { "epoch": 0.7259887005649718, "grad_norm": 0.44400959950712626, "learning_rate": 7.0862442199125836e-06, "loss": 0.1358, "step": 257 }, { "epoch": 0.7288135593220338, "grad_norm": 0.35970291529551507, "learning_rate": 7.066060928841891e-06, "loss": 0.168, "step": 258 }, { "epoch": 0.731638418079096, "grad_norm": 0.3562466168617285, "learning_rate": 7.0458369582678276e-06, "loss": 0.1436, "step": 259 }, { "epoch": 0.7344632768361582, "grad_norm": 0.3120380022750767, "learning_rate": 7.025572706388268e-06, "loss": 0.1146, "step": 260 }, { "epoch": 0.7372881355932204, "grad_norm": 0.3774265959308726, "learning_rate": 7.005268572194208e-06, "loss": 0.1034, "step": 261 }, { "epoch": 0.7401129943502824, "grad_norm": 0.2979654585888925, "learning_rate": 6.984924955461901e-06, "loss": 0.1314, "step": 262 }, { "epoch": 0.7429378531073446, "grad_norm": 0.581337456503481, "learning_rate": 6.964542256744986e-06, "loss": 0.1417, "step": 263 }, { "epoch": 0.7457627118644068, "grad_norm": 0.38453399260208176, "learning_rate": 6.944120877366605e-06, "loss": 0.1564, "step": 264 }, { "epoch": 0.748587570621469, "grad_norm": 0.4514847039771076, "learning_rate": 6.923661219411494e-06, "loss": 0.1104, "step": 265 }, { "epoch": 0.751412429378531, "grad_norm": 0.32941464728785497, "learning_rate": 6.9031636857180795e-06, "loss": 0.1232, "step": 266 }, { "epoch": 0.7542372881355932, "grad_norm": 0.32342200359362855, "learning_rate": 6.8826286798705325e-06, "loss": 0.1298, "step": 267 }, { "epoch": 0.7570621468926554, "grad_norm": 0.31600309169763335, "learning_rate": 6.86205660619083e-06, "loss": 0.1052, "step": 268 }, { "epoch": 0.7598870056497176, "grad_norm": 0.3385392307218001, "learning_rate": 6.841447869730794e-06, "loss": 0.1078, "step": 269 }, { "epoch": 0.7627118644067796, "grad_norm": 0.3999058208022931, "learning_rate": 6.820802876264112e-06, "loss": 0.1002, "step": 270 }, { "epoch": 0.7655367231638418, "grad_norm": 0.3086907716307657, "learning_rate": 6.800122032278351e-06, "loss": 0.1057, "step": 271 }, { "epoch": 0.768361581920904, "grad_norm": 0.3019429125850021, "learning_rate": 6.7794057449669545e-06, "loss": 0.1224, "step": 272 }, { "epoch": 0.7711864406779662, "grad_norm": 0.3649142358181941, "learning_rate": 6.758654422221225e-06, "loss": 0.1229, "step": 273 }, { "epoch": 0.7740112994350282, "grad_norm": 0.3886884309856128, "learning_rate": 6.7378684726222875e-06, "loss": 0.1347, "step": 274 }, { "epoch": 0.7768361581920904, "grad_norm": 0.39178721337354694, "learning_rate": 6.717048305433053e-06, "loss": 0.1395, "step": 275 }, { "epoch": 0.7796610169491526, "grad_norm": 0.3560643954364327, "learning_rate": 6.6961943305901515e-06, "loss": 0.0996, "step": 276 }, { "epoch": 0.7824858757062146, "grad_norm": 0.2987347215445713, "learning_rate": 6.675306958695874e-06, "loss": 0.0932, "step": 277 }, { "epoch": 0.7853107344632768, "grad_norm": 0.391815080682088, "learning_rate": 6.65438660101007e-06, "loss": 0.1008, "step": 278 }, { "epoch": 0.788135593220339, "grad_norm": 0.36385558646484645, "learning_rate": 6.633433669442066e-06, "loss": 0.1477, "step": 279 }, { "epoch": 0.7909604519774012, "grad_norm": 0.3950859179251757, "learning_rate": 6.612448576542545e-06, "loss": 0.1546, "step": 280 }, { "epoch": 0.7937853107344632, "grad_norm": 0.3508433770997797, "learning_rate": 6.59143173549543e-06, "loss": 0.1226, "step": 281 }, { "epoch": 0.7966101694915254, "grad_norm": 0.31172563185935787, "learning_rate": 6.570383560109745e-06, "loss": 0.1159, "step": 282 }, { "epoch": 0.7994350282485876, "grad_norm": 0.3722434789366503, "learning_rate": 6.549304464811467e-06, "loss": 0.1718, "step": 283 }, { "epoch": 0.8022598870056498, "grad_norm": 0.5468457727438586, "learning_rate": 6.52819486463537e-06, "loss": 0.129, "step": 284 }, { "epoch": 0.8050847457627118, "grad_norm": 0.37527355148100994, "learning_rate": 6.50705517521685e-06, "loss": 0.1245, "step": 285 }, { "epoch": 0.807909604519774, "grad_norm": 0.3327457659403298, "learning_rate": 6.48588581278374e-06, "loss": 0.1058, "step": 286 }, { "epoch": 0.8107344632768362, "grad_norm": 0.32768437622701796, "learning_rate": 6.464687194148121e-06, "loss": 0.1215, "step": 287 }, { "epoch": 0.8135593220338984, "grad_norm": 0.2981605604482326, "learning_rate": 6.443459736698106e-06, "loss": 0.107, "step": 288 }, { "epoch": 0.8163841807909604, "grad_norm": 0.39440686793408264, "learning_rate": 6.422203858389633e-06, "loss": 0.099, "step": 289 }, { "epoch": 0.8192090395480226, "grad_norm": 0.3079389916506814, "learning_rate": 6.400919977738222e-06, "loss": 0.1261, "step": 290 }, { "epoch": 0.8220338983050848, "grad_norm": 0.39230281512992937, "learning_rate": 6.379608513810753e-06, "loss": 0.1388, "step": 291 }, { "epoch": 0.8248587570621468, "grad_norm": 0.34403030238315363, "learning_rate": 6.3582698862171945e-06, "loss": 0.1144, "step": 292 }, { "epoch": 0.827683615819209, "grad_norm": 0.3554434392149389, "learning_rate": 6.336904515102355e-06, "loss": 0.1401, "step": 293 }, { "epoch": 0.8305084745762712, "grad_norm": 0.3903419801304912, "learning_rate": 6.315512821137606e-06, "loss": 0.1166, "step": 294 }, { "epoch": 0.8333333333333334, "grad_norm": 0.29926598642859675, "learning_rate": 6.294095225512604e-06, "loss": 0.1172, "step": 295 }, { "epoch": 0.8361581920903954, "grad_norm": 0.3247406039107872, "learning_rate": 6.272652149926989e-06, "loss": 0.1206, "step": 296 }, { "epoch": 0.8389830508474576, "grad_norm": 0.4155920338425029, "learning_rate": 6.251184016582088e-06, "loss": 0.1569, "step": 297 }, { "epoch": 0.8418079096045198, "grad_norm": 0.49517100510425244, "learning_rate": 6.229691248172599e-06, "loss": 0.1269, "step": 298 }, { "epoch": 0.844632768361582, "grad_norm": 0.3069243773291746, "learning_rate": 6.208174267878272e-06, "loss": 0.1039, "step": 299 }, { "epoch": 0.847457627118644, "grad_norm": 0.30008373909078834, "learning_rate": 6.186633499355576e-06, "loss": 0.1011, "step": 300 }, { "epoch": 0.8502824858757062, "grad_norm": 0.39498835620426576, "learning_rate": 6.165069366729347e-06, "loss": 0.1262, "step": 301 }, { "epoch": 0.8531073446327684, "grad_norm": 0.45259238504149724, "learning_rate": 6.143482294584459e-06, "loss": 0.1555, "step": 302 }, { "epoch": 0.8559322033898306, "grad_norm": 0.3111562216471757, "learning_rate": 6.121872707957441e-06, "loss": 0.1037, "step": 303 }, { "epoch": 0.8587570621468926, "grad_norm": 0.38394089498308964, "learning_rate": 6.100241032328125e-06, "loss": 0.1381, "step": 304 }, { "epoch": 0.8615819209039548, "grad_norm": 0.4084090521249512, "learning_rate": 6.078587693611258e-06, "loss": 0.132, "step": 305 }, { "epoch": 0.864406779661017, "grad_norm": 0.32206751376923765, "learning_rate": 6.056913118148122e-06, "loss": 0.115, "step": 306 }, { "epoch": 0.867231638418079, "grad_norm": 0.39859711839150824, "learning_rate": 6.035217732698141e-06, "loss": 0.0989, "step": 307 }, { "epoch": 0.8700564971751412, "grad_norm": 0.3223901842186738, "learning_rate": 6.013501964430468e-06, "loss": 0.1129, "step": 308 }, { "epoch": 0.8728813559322034, "grad_norm": 0.37501991260972894, "learning_rate": 5.9917662409155896e-06, "loss": 0.1158, "step": 309 }, { "epoch": 0.8757062146892656, "grad_norm": 0.3680155603064568, "learning_rate": 5.970010990116892e-06, "loss": 0.135, "step": 310 }, { "epoch": 0.8785310734463276, "grad_norm": 0.4225399124226897, "learning_rate": 5.948236640382249e-06, "loss": 0.1597, "step": 311 }, { "epoch": 0.8813559322033898, "grad_norm": 0.3062593607691136, "learning_rate": 5.926443620435572e-06, "loss": 0.1136, "step": 312 }, { "epoch": 0.884180790960452, "grad_norm": 0.39959497875600314, "learning_rate": 5.904632359368388e-06, "loss": 0.1177, "step": 313 }, { "epoch": 0.8870056497175142, "grad_norm": 1.0041774505409626, "learning_rate": 5.8828032866313725e-06, "loss": 0.1129, "step": 314 }, { "epoch": 0.8898305084745762, "grad_norm": 0.33893621066441904, "learning_rate": 5.860956832025907e-06, "loss": 0.1375, "step": 315 }, { "epoch": 0.8926553672316384, "grad_norm": 0.39383272399940145, "learning_rate": 5.839093425695609e-06, "loss": 0.1422, "step": 316 }, { "epoch": 0.8954802259887006, "grad_norm": 0.35738567030204327, "learning_rate": 5.817213498117866e-06, "loss": 0.1529, "step": 317 }, { "epoch": 0.8983050847457628, "grad_norm": 0.39608069155077835, "learning_rate": 5.795317480095361e-06, "loss": 0.1716, "step": 318 }, { "epoch": 0.9011299435028248, "grad_norm": 0.3631332118845927, "learning_rate": 5.773405802747585e-06, "loss": 0.1555, "step": 319 }, { "epoch": 0.903954802259887, "grad_norm": 0.3841861480992692, "learning_rate": 5.751478897502353e-06, "loss": 0.1894, "step": 320 }, { "epoch": 0.9067796610169492, "grad_norm": 0.29759516462695884, "learning_rate": 5.729537196087309e-06, "loss": 0.112, "step": 321 }, { "epoch": 0.9096045197740112, "grad_norm": 0.28809510654440856, "learning_rate": 5.707581130521424e-06, "loss": 0.1134, "step": 322 }, { "epoch": 0.9124293785310734, "grad_norm": 0.397848892460871, "learning_rate": 5.685611133106491e-06, "loss": 0.1297, "step": 323 }, { "epoch": 0.9152542372881356, "grad_norm": 0.3160261105495359, "learning_rate": 5.663627636418611e-06, "loss": 0.1023, "step": 324 }, { "epoch": 0.9180790960451978, "grad_norm": 0.31177416648537104, "learning_rate": 5.64163107329968e-06, "loss": 0.1143, "step": 325 }, { "epoch": 0.9209039548022598, "grad_norm": 0.32018560545646063, "learning_rate": 5.619621876848864e-06, "loss": 0.0991, "step": 326 }, { "epoch": 0.923728813559322, "grad_norm": 0.2963637468584666, "learning_rate": 5.597600480414069e-06, "loss": 0.1292, "step": 327 }, { "epoch": 0.9265536723163842, "grad_norm": 0.44573857637587677, "learning_rate": 5.575567317583415e-06, "loss": 0.1217, "step": 328 }, { "epoch": 0.9293785310734464, "grad_norm": 0.39445386690118284, "learning_rate": 5.553522822176694e-06, "loss": 0.1684, "step": 329 }, { "epoch": 0.9322033898305084, "grad_norm": 0.5570629744547292, "learning_rate": 5.531467428236827e-06, "loss": 0.121, "step": 330 }, { "epoch": 0.9350282485875706, "grad_norm": 0.2927441446558258, "learning_rate": 5.5094015700213254e-06, "loss": 0.1199, "step": 331 }, { "epoch": 0.9378531073446328, "grad_norm": 0.31083045505241014, "learning_rate": 5.4873256819937325e-06, "loss": 0.1299, "step": 332 }, { "epoch": 0.940677966101695, "grad_norm": 0.3685953460491956, "learning_rate": 5.465240198815073e-06, "loss": 0.1432, "step": 333 }, { "epoch": 0.943502824858757, "grad_norm": 0.358140929608912, "learning_rate": 5.443145555335296e-06, "loss": 0.1148, "step": 334 }, { "epoch": 0.9463276836158192, "grad_norm": 0.4047657464869157, "learning_rate": 5.421042186584708e-06, "loss": 0.1339, "step": 335 }, { "epoch": 0.9491525423728814, "grad_norm": 0.30030457933241933, "learning_rate": 5.398930527765416e-06, "loss": 0.1301, "step": 336 }, { "epoch": 0.9519774011299436, "grad_norm": 0.31945028324973423, "learning_rate": 5.376811014242749e-06, "loss": 0.1147, "step": 337 }, { "epoch": 0.9548022598870056, "grad_norm": 0.48059842793389146, "learning_rate": 5.354684081536693e-06, "loss": 0.1251, "step": 338 }, { "epoch": 0.9576271186440678, "grad_norm": 0.3547595891598906, "learning_rate": 5.332550165313312e-06, "loss": 0.1256, "step": 339 }, { "epoch": 0.96045197740113, "grad_norm": 0.4010586374877903, "learning_rate": 5.31040970137617e-06, "loss": 0.1521, "step": 340 }, { "epoch": 0.963276836158192, "grad_norm": 0.30749195608479307, "learning_rate": 5.288263125657757e-06, "loss": 0.0898, "step": 341 }, { "epoch": 0.9661016949152542, "grad_norm": 0.38150734834003086, "learning_rate": 5.266110874210893e-06, "loss": 0.1153, "step": 342 }, { "epoch": 0.9689265536723164, "grad_norm": 0.4214873565398611, "learning_rate": 5.2439533832001565e-06, "loss": 0.1148, "step": 343 }, { "epoch": 0.9717514124293786, "grad_norm": 0.6122493487962786, "learning_rate": 5.221791088893282e-06, "loss": 0.1104, "step": 344 }, { "epoch": 0.9745762711864406, "grad_norm": 0.3363807115202252, "learning_rate": 5.199624427652589e-06, "loss": 0.1223, "step": 345 }, { "epoch": 0.9774011299435028, "grad_norm": 0.4560391650148292, "learning_rate": 5.177453835926366e-06, "loss": 0.1279, "step": 346 }, { "epoch": 0.980225988700565, "grad_norm": 0.34518676510036406, "learning_rate": 5.155279750240302e-06, "loss": 0.111, "step": 347 }, { "epoch": 0.9830508474576272, "grad_norm": 0.37562743319165254, "learning_rate": 5.133102607188875e-06, "loss": 0.1223, "step": 348 }, { "epoch": 0.9858757062146892, "grad_norm": 0.37060789770525393, "learning_rate": 5.1109228434267585e-06, "loss": 0.1205, "step": 349 }, { "epoch": 0.9887005649717514, "grad_norm": 0.30512645778837, "learning_rate": 5.0887408956602316e-06, "loss": 0.1123, "step": 350 }, { "epoch": 0.9915254237288136, "grad_norm": 0.31734580505159626, "learning_rate": 5.06655720063857e-06, "loss": 0.1393, "step": 351 }, { "epoch": 0.9943502824858758, "grad_norm": 0.4252275291418614, "learning_rate": 5.044372195145455e-06, "loss": 0.1804, "step": 352 }, { "epoch": 0.9971751412429378, "grad_norm": 0.445405112876313, "learning_rate": 5.022186315990371e-06, "loss": 0.1466, "step": 353 }, { "epoch": 1.0, "grad_norm": 0.33905894838175626, "learning_rate": 5e-06, "loss": 0.123, "step": 354 }, { "epoch": 1.002824858757062, "grad_norm": 0.2724848502115793, "learning_rate": 4.97781368400963e-06, "loss": 0.0967, "step": 355 }, { "epoch": 1.0056497175141244, "grad_norm": 0.25595208945562964, "learning_rate": 4.9556278048545445e-06, "loss": 0.0704, "step": 356 }, { "epoch": 1.0084745762711864, "grad_norm": 0.2425492096389186, "learning_rate": 4.933442799361432e-06, "loss": 0.0799, "step": 357 }, { "epoch": 1.0112994350282485, "grad_norm": 0.2761823631954267, "learning_rate": 4.911259104339771e-06, "loss": 0.0936, "step": 358 }, { "epoch": 1.0141242937853108, "grad_norm": 0.27454279772458723, "learning_rate": 4.889077156573242e-06, "loss": 0.1175, "step": 359 }, { "epoch": 1.0169491525423728, "grad_norm": 0.31992512429885583, "learning_rate": 4.866897392811127e-06, "loss": 0.0968, "step": 360 }, { "epoch": 1.0197740112994351, "grad_norm": 0.2894306047919742, "learning_rate": 4.8447202497596975e-06, "loss": 0.1236, "step": 361 }, { "epoch": 1.0225988700564972, "grad_norm": 0.24920669865954365, "learning_rate": 4.822546164073635e-06, "loss": 0.0852, "step": 362 }, { "epoch": 1.0254237288135593, "grad_norm": 0.2952175126202192, "learning_rate": 4.800375572347414e-06, "loss": 0.0991, "step": 363 }, { "epoch": 1.0282485875706215, "grad_norm": 0.23148883860994288, "learning_rate": 4.778208911106718e-06, "loss": 0.066, "step": 364 }, { "epoch": 1.0310734463276836, "grad_norm": 0.2692480902893908, "learning_rate": 4.756046616799845e-06, "loss": 0.0973, "step": 365 }, { "epoch": 1.0338983050847457, "grad_norm": 0.2760736585863661, "learning_rate": 4.7338891257891085e-06, "loss": 0.0912, "step": 366 }, { "epoch": 1.036723163841808, "grad_norm": 0.24876270637682962, "learning_rate": 4.7117368743422435e-06, "loss": 0.0837, "step": 367 }, { "epoch": 1.03954802259887, "grad_norm": 0.2494244392514283, "learning_rate": 4.689590298623831e-06, "loss": 0.0811, "step": 368 }, { "epoch": 1.042372881355932, "grad_norm": 0.24104486268426012, "learning_rate": 4.667449834686689e-06, "loss": 0.076, "step": 369 }, { "epoch": 1.0451977401129944, "grad_norm": 0.2789423674939665, "learning_rate": 4.645315918463308e-06, "loss": 0.086, "step": 370 }, { "epoch": 1.0480225988700564, "grad_norm": 0.2770971973884279, "learning_rate": 4.623188985757252e-06, "loss": 0.089, "step": 371 }, { "epoch": 1.0508474576271187, "grad_norm": 0.28375117683833695, "learning_rate": 4.601069472234584e-06, "loss": 0.1046, "step": 372 }, { "epoch": 1.0536723163841808, "grad_norm": 0.23874038173398437, "learning_rate": 4.578957813415293e-06, "loss": 0.0657, "step": 373 }, { "epoch": 1.0564971751412429, "grad_norm": 0.27927992062438906, "learning_rate": 4.556854444664706e-06, "loss": 0.0823, "step": 374 }, { "epoch": 1.0593220338983051, "grad_norm": 0.25006740348604295, "learning_rate": 4.534759801184928e-06, "loss": 0.084, "step": 375 }, { "epoch": 1.0621468926553672, "grad_norm": 0.26172192084359713, "learning_rate": 4.512674318006268e-06, "loss": 0.0688, "step": 376 }, { "epoch": 1.0649717514124293, "grad_norm": 0.27984219569057084, "learning_rate": 4.490598429978676e-06, "loss": 0.1003, "step": 377 }, { "epoch": 1.0677966101694916, "grad_norm": 0.31355314424830794, "learning_rate": 4.468532571763174e-06, "loss": 0.093, "step": 378 }, { "epoch": 1.0706214689265536, "grad_norm": 0.2789313738912747, "learning_rate": 4.446477177823308e-06, "loss": 0.0891, "step": 379 }, { "epoch": 1.073446327683616, "grad_norm": 0.2647947856217315, "learning_rate": 4.424432682416585e-06, "loss": 0.0657, "step": 380 }, { "epoch": 1.076271186440678, "grad_norm": 0.309205913837631, "learning_rate": 4.402399519585932e-06, "loss": 0.091, "step": 381 }, { "epoch": 1.07909604519774, "grad_norm": 0.27983746723387665, "learning_rate": 4.380378123151139e-06, "loss": 0.0758, "step": 382 }, { "epoch": 1.0819209039548023, "grad_norm": 0.265317674784406, "learning_rate": 4.358368926700321e-06, "loss": 0.0744, "step": 383 }, { "epoch": 1.0847457627118644, "grad_norm": 0.3844212564195145, "learning_rate": 4.336372363581391e-06, "loss": 0.1193, "step": 384 }, { "epoch": 1.0875706214689265, "grad_norm": 0.301325518174874, "learning_rate": 4.314388866893512e-06, "loss": 0.0954, "step": 385 }, { "epoch": 1.0903954802259888, "grad_norm": 0.2778786659311699, "learning_rate": 4.292418869478577e-06, "loss": 0.0791, "step": 386 }, { "epoch": 1.0932203389830508, "grad_norm": 0.2662950349405817, "learning_rate": 4.270462803912692e-06, "loss": 0.076, "step": 387 }, { "epoch": 1.0960451977401129, "grad_norm": 0.2796794355932831, "learning_rate": 4.248521102497649e-06, "loss": 0.0804, "step": 388 }, { "epoch": 1.0988700564971752, "grad_norm": 0.2768998598394203, "learning_rate": 4.226594197252417e-06, "loss": 0.0834, "step": 389 }, { "epoch": 1.1016949152542372, "grad_norm": 0.27374836470708835, "learning_rate": 4.204682519904641e-06, "loss": 0.0718, "step": 390 }, { "epoch": 1.1045197740112995, "grad_norm": 0.3445589105129125, "learning_rate": 4.182786501882135e-06, "loss": 0.1162, "step": 391 }, { "epoch": 1.1073446327683616, "grad_norm": 0.34154494685980724, "learning_rate": 4.160906574304392e-06, "loss": 0.0821, "step": 392 }, { "epoch": 1.1101694915254237, "grad_norm": 0.28827252450009566, "learning_rate": 4.139043167974096e-06, "loss": 0.0789, "step": 393 }, { "epoch": 1.112994350282486, "grad_norm": 0.29707210161706604, "learning_rate": 4.117196713368629e-06, "loss": 0.0826, "step": 394 }, { "epoch": 1.115819209039548, "grad_norm": 0.27591994189237984, "learning_rate": 4.095367640631614e-06, "loss": 0.0703, "step": 395 }, { "epoch": 1.11864406779661, "grad_norm": 0.31721715937650236, "learning_rate": 4.073556379564429e-06, "loss": 0.0741, "step": 396 }, { "epoch": 1.1214689265536724, "grad_norm": 0.2809918490407584, "learning_rate": 4.051763359617753e-06, "loss": 0.0768, "step": 397 }, { "epoch": 1.1242937853107344, "grad_norm": 0.3012659075431546, "learning_rate": 4.0299890098831096e-06, "loss": 0.0899, "step": 398 }, { "epoch": 1.1271186440677967, "grad_norm": 0.41884111681760783, "learning_rate": 4.00823375908441e-06, "loss": 0.1056, "step": 399 }, { "epoch": 1.1299435028248588, "grad_norm": 0.3072172569806948, "learning_rate": 3.986498035569533e-06, "loss": 0.0946, "step": 400 }, { "epoch": 1.1327683615819208, "grad_norm": 0.30733421162176133, "learning_rate": 3.964782267301861e-06, "loss": 0.1148, "step": 401 }, { "epoch": 1.1355932203389831, "grad_norm": 0.3062196925146934, "learning_rate": 3.9430868818518786e-06, "loss": 0.0939, "step": 402 }, { "epoch": 1.1384180790960452, "grad_norm": 0.31120677637400174, "learning_rate": 3.921412306388744e-06, "loss": 0.0907, "step": 403 }, { "epoch": 1.1412429378531073, "grad_norm": 0.29992317179141076, "learning_rate": 3.899758967671879e-06, "loss": 0.0936, "step": 404 }, { "epoch": 1.1440677966101696, "grad_norm": 0.3192209928402148, "learning_rate": 3.8781272920425605e-06, "loss": 0.0926, "step": 405 }, { "epoch": 1.1468926553672316, "grad_norm": 0.2758355027322586, "learning_rate": 3.856517705415543e-06, "loss": 0.0716, "step": 406 }, { "epoch": 1.1497175141242937, "grad_norm": 0.30041138229139885, "learning_rate": 3.834930633270654e-06, "loss": 0.0915, "step": 407 }, { "epoch": 1.152542372881356, "grad_norm": 0.2793345364877422, "learning_rate": 3.813366500644426e-06, "loss": 0.084, "step": 408 }, { "epoch": 1.155367231638418, "grad_norm": 0.31892388449193476, "learning_rate": 3.791825732121729e-06, "loss": 0.0874, "step": 409 }, { "epoch": 1.1581920903954803, "grad_norm": 0.31584149456312116, "learning_rate": 3.770308751827402e-06, "loss": 0.0973, "step": 410 }, { "epoch": 1.1610169491525424, "grad_norm": 0.34820980091040554, "learning_rate": 3.748815983417914e-06, "loss": 0.1253, "step": 411 }, { "epoch": 1.1638418079096045, "grad_norm": 0.28853430489950965, "learning_rate": 3.727347850073012e-06, "loss": 0.0759, "step": 412 }, { "epoch": 1.1666666666666667, "grad_norm": 0.3062765715619059, "learning_rate": 3.705904774487396e-06, "loss": 0.0933, "step": 413 }, { "epoch": 1.1694915254237288, "grad_norm": 0.32486365661487987, "learning_rate": 3.6844871788623946e-06, "loss": 0.0911, "step": 414 }, { "epoch": 1.1723163841807909, "grad_norm": 0.3026538149404601, "learning_rate": 3.6630954848976472e-06, "loss": 0.0942, "step": 415 }, { "epoch": 1.1751412429378532, "grad_norm": 0.3953970274281564, "learning_rate": 3.641730113782807e-06, "loss": 0.0779, "step": 416 }, { "epoch": 1.1779661016949152, "grad_norm": 0.2978638146695922, "learning_rate": 3.6203914861892483e-06, "loss": 0.0907, "step": 417 }, { "epoch": 1.1807909604519775, "grad_norm": 0.29056550406150716, "learning_rate": 3.5990800222617774e-06, "loss": 0.0754, "step": 418 }, { "epoch": 1.1836158192090396, "grad_norm": 0.2519599426657091, "learning_rate": 3.577796141610369e-06, "loss": 0.0632, "step": 419 }, { "epoch": 1.1864406779661016, "grad_norm": 0.2867955638349113, "learning_rate": 3.5565402633018963e-06, "loss": 0.0854, "step": 420 }, { "epoch": 1.189265536723164, "grad_norm": 0.27040139234217375, "learning_rate": 3.535312805851881e-06, "loss": 0.0676, "step": 421 }, { "epoch": 1.192090395480226, "grad_norm": 0.3332989661333647, "learning_rate": 3.5141141872162613e-06, "loss": 0.1127, "step": 422 }, { "epoch": 1.194915254237288, "grad_norm": 0.309745904183909, "learning_rate": 3.4929448247831523e-06, "loss": 0.0917, "step": 423 }, { "epoch": 1.1977401129943503, "grad_norm": 0.45096384475777856, "learning_rate": 3.4718051353646304e-06, "loss": 0.1173, "step": 424 }, { "epoch": 1.2005649717514124, "grad_norm": 0.30878271329326906, "learning_rate": 3.4506955351885346e-06, "loss": 0.0919, "step": 425 }, { "epoch": 1.2033898305084745, "grad_norm": 0.320682805049876, "learning_rate": 3.4296164398902576e-06, "loss": 0.0922, "step": 426 }, { "epoch": 1.2062146892655368, "grad_norm": 0.30581016394054344, "learning_rate": 3.408568264504571e-06, "loss": 0.0809, "step": 427 }, { "epoch": 1.2090395480225988, "grad_norm": 0.2747339191019564, "learning_rate": 3.387551423457456e-06, "loss": 0.0802, "step": 428 }, { "epoch": 1.211864406779661, "grad_norm": 0.552467904262321, "learning_rate": 3.366566330557935e-06, "loss": 0.1036, "step": 429 }, { "epoch": 1.2146892655367232, "grad_norm": 0.32041597951648537, "learning_rate": 3.345613398989932e-06, "loss": 0.0849, "step": 430 }, { "epoch": 1.2175141242937852, "grad_norm": 0.32164554989021144, "learning_rate": 3.324693041304128e-06, "loss": 0.0901, "step": 431 }, { "epoch": 1.2203389830508475, "grad_norm": 0.297941242679515, "learning_rate": 3.3038056694098485e-06, "loss": 0.0857, "step": 432 }, { "epoch": 1.2231638418079096, "grad_norm": 0.29988786294219155, "learning_rate": 3.2829516945669493e-06, "loss": 0.0658, "step": 433 }, { "epoch": 1.2259887005649717, "grad_norm": 0.28671491672159266, "learning_rate": 3.262131527377715e-06, "loss": 0.0825, "step": 434 }, { "epoch": 1.228813559322034, "grad_norm": 0.29202073423769753, "learning_rate": 3.241345577778775e-06, "loss": 0.0793, "step": 435 }, { "epoch": 1.231638418079096, "grad_norm": 0.26686163546586056, "learning_rate": 3.220594255033046e-06, "loss": 0.0621, "step": 436 }, { "epoch": 1.2344632768361583, "grad_norm": 0.2713137149878859, "learning_rate": 3.1998779677216508e-06, "loss": 0.0731, "step": 437 }, { "epoch": 1.2372881355932204, "grad_norm": 0.2775500944738776, "learning_rate": 3.1791971237358893e-06, "loss": 0.0734, "step": 438 }, { "epoch": 1.2401129943502824, "grad_norm": 0.28779045523216457, "learning_rate": 3.1585521302692073e-06, "loss": 0.0924, "step": 439 }, { "epoch": 1.2429378531073447, "grad_norm": 0.3012961352670417, "learning_rate": 3.1379433938091695e-06, "loss": 0.0977, "step": 440 }, { "epoch": 1.2457627118644068, "grad_norm": 0.2790213382568145, "learning_rate": 3.117371320129469e-06, "loss": 0.0638, "step": 441 }, { "epoch": 1.2485875706214689, "grad_norm": 0.289333568849024, "learning_rate": 3.0968363142819226e-06, "loss": 0.0835, "step": 442 }, { "epoch": 1.2514124293785311, "grad_norm": 0.27842001426208673, "learning_rate": 3.076338780588507e-06, "loss": 0.0744, "step": 443 }, { "epoch": 1.2542372881355932, "grad_norm": 0.29218223586498643, "learning_rate": 3.0558791226333974e-06, "loss": 0.084, "step": 444 }, { "epoch": 1.2570621468926553, "grad_norm": 0.27223825874055874, "learning_rate": 3.035457743255016e-06, "loss": 0.0836, "step": 445 }, { "epoch": 1.2598870056497176, "grad_norm": 0.27957551489646953, "learning_rate": 3.0150750445380995e-06, "loss": 0.0782, "step": 446 }, { "epoch": 1.2627118644067796, "grad_norm": 0.3122940814886727, "learning_rate": 2.9947314278057927e-06, "loss": 0.1053, "step": 447 }, { "epoch": 1.2655367231638417, "grad_norm": 0.3191436186366097, "learning_rate": 2.9744272936117323e-06, "loss": 0.1014, "step": 448 }, { "epoch": 1.268361581920904, "grad_norm": 0.2842992416668331, "learning_rate": 2.954163041732174e-06, "loss": 0.0749, "step": 449 }, { "epoch": 1.271186440677966, "grad_norm": 0.30400106395182974, "learning_rate": 2.9339390711581105e-06, "loss": 0.0887, "step": 450 }, { "epoch": 1.274011299435028, "grad_norm": 0.29506183884480336, "learning_rate": 2.9137557800874177e-06, "loss": 0.091, "step": 451 }, { "epoch": 1.2768361581920904, "grad_norm": 0.3144746461494332, "learning_rate": 2.8936135659170217e-06, "loss": 0.1059, "step": 452 }, { "epoch": 1.2796610169491525, "grad_norm": 0.2996152337603787, "learning_rate": 2.8735128252350677e-06, "loss": 0.0786, "step": 453 }, { "epoch": 1.2824858757062148, "grad_norm": 0.34649915198509984, "learning_rate": 2.853453953813108e-06, "loss": 0.0802, "step": 454 }, { "epoch": 1.2853107344632768, "grad_norm": 0.3012239481267599, "learning_rate": 2.8334373465983216e-06, "loss": 0.0895, "step": 455 }, { "epoch": 1.288135593220339, "grad_norm": 0.28862860268152546, "learning_rate": 2.8134633977057236e-06, "loss": 0.0839, "step": 456 }, { "epoch": 1.2909604519774012, "grad_norm": 0.3084982778477155, "learning_rate": 2.7935325004104164e-06, "loss": 0.1009, "step": 457 }, { "epoch": 1.2937853107344632, "grad_norm": 0.3282395905268862, "learning_rate": 2.7736450471398435e-06, "loss": 0.0652, "step": 458 }, { "epoch": 1.2966101694915255, "grad_norm": 0.2793458687882274, "learning_rate": 2.7538014294660564e-06, "loss": 0.06, "step": 459 }, { "epoch": 1.2994350282485876, "grad_norm": 0.29032428847483527, "learning_rate": 2.734002038098015e-06, "loss": 0.0674, "step": 460 }, { "epoch": 1.3022598870056497, "grad_norm": 0.26031067828121474, "learning_rate": 2.7142472628738846e-06, "loss": 0.0628, "step": 461 }, { "epoch": 1.305084745762712, "grad_norm": 0.3479546918961412, "learning_rate": 2.69453749275337e-06, "loss": 0.105, "step": 462 }, { "epoch": 1.307909604519774, "grad_norm": 0.2777871642249738, "learning_rate": 2.6748731158100528e-06, "loss": 0.0733, "step": 463 }, { "epoch": 1.310734463276836, "grad_norm": 0.29860877000280583, "learning_rate": 2.655254519223746e-06, "loss": 0.0956, "step": 464 }, { "epoch": 1.3135593220338984, "grad_norm": 0.39509978738969753, "learning_rate": 2.6356820892728752e-06, "loss": 0.098, "step": 465 }, { "epoch": 1.3163841807909604, "grad_norm": 0.26842617785571943, "learning_rate": 2.616156211326875e-06, "loss": 0.0683, "step": 466 }, { "epoch": 1.3192090395480225, "grad_norm": 0.2638729229718093, "learning_rate": 2.5966772698386e-06, "loss": 0.0697, "step": 467 }, { "epoch": 1.3220338983050848, "grad_norm": 0.2885132528975007, "learning_rate": 2.57724564833675e-06, "loss": 0.0853, "step": 468 }, { "epoch": 1.3248587570621468, "grad_norm": 0.2909952803458812, "learning_rate": 2.557861729418326e-06, "loss": 0.0702, "step": 469 }, { "epoch": 1.327683615819209, "grad_norm": 0.3508969831429909, "learning_rate": 2.5385258947410908e-06, "loss": 0.1163, "step": 470 }, { "epoch": 1.3305084745762712, "grad_norm": 0.3137041512371342, "learning_rate": 2.5192385250160587e-06, "loss": 0.0791, "step": 471 }, { "epoch": 1.3333333333333333, "grad_norm": 0.34011082522993374, "learning_rate": 2.5000000000000015e-06, "loss": 0.0863, "step": 472 }, { "epoch": 1.3361581920903955, "grad_norm": 0.31378850435891975, "learning_rate": 2.4808106984879597e-06, "loss": 0.1031, "step": 473 }, { "epoch": 1.3389830508474576, "grad_norm": 0.32916522472842985, "learning_rate": 2.461670998305802e-06, "loss": 0.1104, "step": 474 }, { "epoch": 1.34180790960452, "grad_norm": 0.2858816116663427, "learning_rate": 2.4425812763027672e-06, "loss": 0.082, "step": 475 }, { "epoch": 1.344632768361582, "grad_norm": 0.3311097135552006, "learning_rate": 2.4235419083440615e-06, "loss": 0.1001, "step": 476 }, { "epoch": 1.347457627118644, "grad_norm": 0.3212815875251671, "learning_rate": 2.404553269303448e-06, "loss": 0.0706, "step": 477 }, { "epoch": 1.3502824858757063, "grad_norm": 0.3059752435224393, "learning_rate": 2.3856157330558625e-06, "loss": 0.0858, "step": 478 }, { "epoch": 1.3531073446327684, "grad_norm": 0.3389049114062409, "learning_rate": 2.366729672470065e-06, "loss": 0.0853, "step": 479 }, { "epoch": 1.3559322033898304, "grad_norm": 0.32200853396437623, "learning_rate": 2.3478954594012884e-06, "loss": 0.11, "step": 480 }, { "epoch": 1.3587570621468927, "grad_norm": 0.30985593467605343, "learning_rate": 2.329113464683913e-06, "loss": 0.0925, "step": 481 }, { "epoch": 1.3615819209039548, "grad_norm": 0.29010419081013045, "learning_rate": 2.310384058124181e-06, "loss": 0.079, "step": 482 }, { "epoch": 1.3644067796610169, "grad_norm": 0.2868431818032766, "learning_rate": 2.2917076084928953e-06, "loss": 0.0691, "step": 483 }, { "epoch": 1.3672316384180792, "grad_norm": 0.32921780136851597, "learning_rate": 2.273084483518176e-06, "loss": 0.1029, "step": 484 }, { "epoch": 1.3700564971751412, "grad_norm": 0.3443609157960376, "learning_rate": 2.25451504987821e-06, "loss": 0.1094, "step": 485 }, { "epoch": 1.3728813559322033, "grad_norm": 0.3442323075700346, "learning_rate": 2.2359996731940348e-06, "loss": 0.1148, "step": 486 }, { "epoch": 1.3757062146892656, "grad_norm": 0.31019876061048274, "learning_rate": 2.2175387180223333e-06, "loss": 0.0846, "step": 487 }, { "epoch": 1.3785310734463276, "grad_norm": 0.27898269961579986, "learning_rate": 2.1991325478482695e-06, "loss": 0.0858, "step": 488 }, { "epoch": 1.3813559322033897, "grad_norm": 0.30200643340593814, "learning_rate": 2.1807815250783194e-06, "loss": 0.0901, "step": 489 }, { "epoch": 1.384180790960452, "grad_norm": 0.28412971697416345, "learning_rate": 2.162486011033142e-06, "loss": 0.0649, "step": 490 }, { "epoch": 1.387005649717514, "grad_norm": 0.28849690690010993, "learning_rate": 2.1442463659404587e-06, "loss": 0.0734, "step": 491 }, { "epoch": 1.3898305084745763, "grad_norm": 0.2872214027286925, "learning_rate": 2.1260629489279662e-06, "loss": 0.0744, "step": 492 }, { "epoch": 1.3926553672316384, "grad_norm": 0.2856113105572892, "learning_rate": 2.1079361180162657e-06, "loss": 0.0772, "step": 493 }, { "epoch": 1.3954802259887007, "grad_norm": 0.2917591710852941, "learning_rate": 2.089866230111813e-06, "loss": 0.0872, "step": 494 }, { "epoch": 1.3983050847457628, "grad_norm": 0.3156299163799424, "learning_rate": 2.0718536409998834e-06, "loss": 0.0755, "step": 495 }, { "epoch": 1.4011299435028248, "grad_norm": 0.3420052483929326, "learning_rate": 2.053898705337583e-06, "loss": 0.0833, "step": 496 }, { "epoch": 1.4039548022598871, "grad_norm": 0.3007655048264405, "learning_rate": 2.0360017766468466e-06, "loss": 0.0755, "step": 497 }, { "epoch": 1.4067796610169492, "grad_norm": 0.3285257384928867, "learning_rate": 2.0181632073074925e-06, "loss": 0.0882, "step": 498 }, { "epoch": 1.4096045197740112, "grad_norm": 0.27048721301742223, "learning_rate": 2.000383348550279e-06, "loss": 0.0739, "step": 499 }, { "epoch": 1.4124293785310735, "grad_norm": 0.2984410410038522, "learning_rate": 1.9826625504499807e-06, "loss": 0.0954, "step": 500 }, { "epoch": 1.4124293785310735, "eval_loss": 0.130197674036026, "eval_runtime": 1.5872, "eval_samples_per_second": 18.271, "eval_steps_per_second": 5.04, "step": 500 }, { "epoch": 1.4152542372881356, "grad_norm": 0.2898926398978837, "learning_rate": 1.965001161918513e-06, "loss": 0.0789, "step": 501 }, { "epoch": 1.4180790960451977, "grad_norm": 0.3465927210704245, "learning_rate": 1.947399530698043e-06, "loss": 0.0979, "step": 502 }, { "epoch": 1.42090395480226, "grad_norm": 0.3475682598971018, "learning_rate": 1.92985800335416e-06, "loss": 0.0843, "step": 503 }, { "epoch": 1.423728813559322, "grad_norm": 0.3294028190875718, "learning_rate": 1.912376925269041e-06, "loss": 0.1121, "step": 504 }, { "epoch": 1.426553672316384, "grad_norm": 0.3498640725406821, "learning_rate": 1.894956640634652e-06, "loss": 0.0828, "step": 505 }, { "epoch": 1.4293785310734464, "grad_norm": 0.30225327794900386, "learning_rate": 1.8775974924459716e-06, "loss": 0.085, "step": 506 }, { "epoch": 1.4322033898305084, "grad_norm": 0.3931329736136659, "learning_rate": 1.860299822494241e-06, "loss": 0.0724, "step": 507 }, { "epoch": 1.4350282485875705, "grad_norm": 0.28157973731776237, "learning_rate": 1.8430639713602317e-06, "loss": 0.0658, "step": 508 }, { "epoch": 1.4378531073446328, "grad_norm": 0.28515233111654203, "learning_rate": 1.8258902784075394e-06, "loss": 0.0847, "step": 509 }, { "epoch": 1.4406779661016949, "grad_norm": 0.2760261233151599, "learning_rate": 1.808779081775901e-06, "loss": 0.066, "step": 510 }, { "epoch": 1.4435028248587571, "grad_norm": 0.2941924396596464, "learning_rate": 1.7917307183745353e-06, "loss": 0.0884, "step": 511 }, { "epoch": 1.4463276836158192, "grad_norm": 0.2686743706277037, "learning_rate": 1.7747455238755223e-06, "loss": 0.0743, "step": 512 }, { "epoch": 1.4491525423728815, "grad_norm": 0.3151180489919417, "learning_rate": 1.757823832707175e-06, "loss": 0.1007, "step": 513 }, { "epoch": 1.4519774011299436, "grad_norm": 0.34222558948663734, "learning_rate": 1.7409659780474652e-06, "loss": 0.103, "step": 514 }, { "epoch": 1.4548022598870056, "grad_norm": 0.3402549223734959, "learning_rate": 1.7241722918174642e-06, "loss": 0.1213, "step": 515 }, { "epoch": 1.457627118644068, "grad_norm": 0.3145312403253552, "learning_rate": 1.7074431046748075e-06, "loss": 0.0969, "step": 516 }, { "epoch": 1.46045197740113, "grad_norm": 0.2896418247469403, "learning_rate": 1.6907787460071756e-06, "loss": 0.0862, "step": 517 }, { "epoch": 1.463276836158192, "grad_norm": 0.31049200696233425, "learning_rate": 1.6741795439258218e-06, "loss": 0.098, "step": 518 }, { "epoch": 1.4661016949152543, "grad_norm": 0.36636752026678193, "learning_rate": 1.6576458252590988e-06, "loss": 0.1391, "step": 519 }, { "epoch": 1.4689265536723164, "grad_norm": 0.27909992931423844, "learning_rate": 1.641177915546036e-06, "loss": 0.0744, "step": 520 }, { "epoch": 1.4717514124293785, "grad_norm": 0.2864859483455, "learning_rate": 1.6247761390299221e-06, "loss": 0.0898, "step": 521 }, { "epoch": 1.4745762711864407, "grad_norm": 0.4362808575365348, "learning_rate": 1.6084408186519195e-06, "loss": 0.0734, "step": 522 }, { "epoch": 1.4774011299435028, "grad_norm": 0.28051391012639115, "learning_rate": 1.5921722760447144e-06, "loss": 0.0678, "step": 523 }, { "epoch": 1.4802259887005649, "grad_norm": 0.30037283427959355, "learning_rate": 1.5759708315261724e-06, "loss": 0.0932, "step": 524 }, { "epoch": 1.4830508474576272, "grad_norm": 0.2982894826882516, "learning_rate": 1.5598368040930427e-06, "loss": 0.0735, "step": 525 }, { "epoch": 1.4858757062146892, "grad_norm": 0.3166739516240939, "learning_rate": 1.5437705114146735e-06, "loss": 0.1003, "step": 526 }, { "epoch": 1.4887005649717513, "grad_norm": 0.3288809776102775, "learning_rate": 1.527772269826749e-06, "loss": 0.0984, "step": 527 }, { "epoch": 1.4915254237288136, "grad_norm": 0.29168718949906514, "learning_rate": 1.511842394325077e-06, "loss": 0.0907, "step": 528 }, { "epoch": 1.4943502824858756, "grad_norm": 0.2993454545746122, "learning_rate": 1.4959811985593707e-06, "loss": 0.0648, "step": 529 }, { "epoch": 1.497175141242938, "grad_norm": 0.2901988801448637, "learning_rate": 1.4801889948270852e-06, "loss": 0.0843, "step": 530 }, { "epoch": 1.5, "grad_norm": 0.276078183792322, "learning_rate": 1.4644660940672628e-06, "loss": 0.0646, "step": 531 }, { "epoch": 1.5028248587570623, "grad_norm": 0.2957096527763229, "learning_rate": 1.44881280585441e-06, "loss": 0.0838, "step": 532 }, { "epoch": 1.5056497175141241, "grad_norm": 0.3074950453422565, "learning_rate": 1.4332294383924034e-06, "loss": 0.0976, "step": 533 }, { "epoch": 1.5084745762711864, "grad_norm": 0.28111249969205165, "learning_rate": 1.4177162985084242e-06, "loss": 0.07, "step": 534 }, { "epoch": 1.5112994350282487, "grad_norm": 0.3040746365690992, "learning_rate": 1.4022736916469166e-06, "loss": 0.0675, "step": 535 }, { "epoch": 1.5141242937853108, "grad_norm": 0.2970452295573905, "learning_rate": 1.3869019218635644e-06, "loss": 0.0937, "step": 536 }, { "epoch": 1.5169491525423728, "grad_norm": 0.3091154520174239, "learning_rate": 1.3716012918193206e-06, "loss": 0.0761, "step": 537 }, { "epoch": 1.5197740112994351, "grad_norm": 0.33856268143824403, "learning_rate": 1.3563721027744309e-06, "loss": 0.0941, "step": 538 }, { "epoch": 1.5225988700564972, "grad_norm": 0.2960053858988549, "learning_rate": 1.3412146545825166e-06, "loss": 0.0731, "step": 539 }, { "epoch": 1.5254237288135593, "grad_norm": 0.3000001098205527, "learning_rate": 1.3261292456846648e-06, "loss": 0.0777, "step": 540 }, { "epoch": 1.5282485875706215, "grad_norm": 0.32275915585442194, "learning_rate": 1.3111161731035448e-06, "loss": 0.1028, "step": 541 }, { "epoch": 1.5310734463276836, "grad_norm": 0.28047948839083625, "learning_rate": 1.2961757324375768e-06, "loss": 0.0773, "step": 542 }, { "epoch": 1.5338983050847457, "grad_norm": 0.2897511789785588, "learning_rate": 1.2813082178550929e-06, "loss": 0.0761, "step": 543 }, { "epoch": 1.536723163841808, "grad_norm": 0.3604669071306025, "learning_rate": 1.2665139220885615e-06, "loss": 0.0966, "step": 544 }, { "epoch": 1.53954802259887, "grad_norm": 0.35066125768752815, "learning_rate": 1.2517931364288133e-06, "loss": 0.1189, "step": 545 }, { "epoch": 1.542372881355932, "grad_norm": 0.36481440937249643, "learning_rate": 1.2371461507193077e-06, "loss": 0.0854, "step": 546 }, { "epoch": 1.5451977401129944, "grad_norm": 0.2705446394892136, "learning_rate": 1.2225732533504309e-06, "loss": 0.0681, "step": 547 }, { "epoch": 1.5480225988700564, "grad_norm": 0.2709042435292725, "learning_rate": 1.2080747312538082e-06, "loss": 0.0605, "step": 548 }, { "epoch": 1.5508474576271185, "grad_norm": 0.2941674674541573, "learning_rate": 1.1936508698966664e-06, "loss": 0.0759, "step": 549 }, { "epoch": 1.5536723163841808, "grad_norm": 0.3301045484204278, "learning_rate": 1.1793019532762057e-06, "loss": 0.09, "step": 550 }, { "epoch": 1.556497175141243, "grad_norm": 0.5929461456725253, "learning_rate": 1.1650282639140066e-06, "loss": 0.115, "step": 551 }, { "epoch": 1.559322033898305, "grad_norm": 0.29282230586457825, "learning_rate": 1.1508300828504682e-06, "loss": 0.068, "step": 552 }, { "epoch": 1.5621468926553672, "grad_norm": 0.290916040961216, "learning_rate": 1.1367076896392853e-06, "loss": 0.0759, "step": 553 }, { "epoch": 1.5649717514124295, "grad_norm": 0.3308510796313253, "learning_rate": 1.122661362341927e-06, "loss": 0.107, "step": 554 }, { "epoch": 1.5677966101694916, "grad_norm": 0.2902882953470368, "learning_rate": 1.1086913775221709e-06, "loss": 0.0817, "step": 555 }, { "epoch": 1.5706214689265536, "grad_norm": 0.25808920224703275, "learning_rate": 1.0947980102406597e-06, "loss": 0.063, "step": 556 }, { "epoch": 1.573446327683616, "grad_norm": 0.3042205877096776, "learning_rate": 1.0809815340494822e-06, "loss": 0.0755, "step": 557 }, { "epoch": 1.576271186440678, "grad_norm": 0.3051764415373886, "learning_rate": 1.0672422209867879e-06, "loss": 0.0652, "step": 558 }, { "epoch": 1.57909604519774, "grad_norm": 0.3146319364999993, "learning_rate": 1.053580341571428e-06, "loss": 0.1059, "step": 559 }, { "epoch": 1.5819209039548023, "grad_norm": 0.3027084741124625, "learning_rate": 1.0399961647976315e-06, "loss": 0.0812, "step": 560 }, { "epoch": 1.5847457627118644, "grad_norm": 0.3278175149471125, "learning_rate": 1.0264899581297121e-06, "loss": 0.1192, "step": 561 }, { "epoch": 1.5875706214689265, "grad_norm": 0.28592160027084834, "learning_rate": 1.0130619874967983e-06, "loss": 0.0752, "step": 562 }, { "epoch": 1.5903954802259888, "grad_norm": 0.2930417798367745, "learning_rate": 9.997125172875943e-07, "loss": 0.0879, "step": 563 }, { "epoch": 1.5932203389830508, "grad_norm": 0.2704337798545791, "learning_rate": 9.86441810345183e-07, "loss": 0.0624, "step": 564 }, { "epoch": 1.5960451977401129, "grad_norm": 0.3166751927355104, "learning_rate": 9.732501279618388e-07, "loss": 0.0848, "step": 565 }, { "epoch": 1.5988700564971752, "grad_norm": 0.29461324641929826, "learning_rate": 9.60137729873898e-07, "loss": 0.0789, "step": 566 }, { "epoch": 1.6016949152542372, "grad_norm": 0.31189484709881815, "learning_rate": 9.471048742566313e-07, "loss": 0.0822, "step": 567 }, { "epoch": 1.6045197740112993, "grad_norm": 0.3466231703998608, "learning_rate": 9.34151817719166e-07, "loss": 0.0767, "step": 568 }, { "epoch": 1.6073446327683616, "grad_norm": 0.30675238542761885, "learning_rate": 9.212788152994367e-07, "loss": 0.1034, "step": 569 }, { "epoch": 1.6101694915254239, "grad_norm": 0.522761335835565, "learning_rate": 9.08486120459155e-07, "loss": 0.1273, "step": 570 }, { "epoch": 1.6129943502824857, "grad_norm": 0.2810136760249764, "learning_rate": 8.957739850788288e-07, "loss": 0.073, "step": 571 }, { "epoch": 1.615819209039548, "grad_norm": 0.31293166014889473, "learning_rate": 8.831426594527976e-07, "loss": 0.0956, "step": 572 }, { "epoch": 1.6186440677966103, "grad_norm": 0.3408526367512878, "learning_rate": 8.705923922843041e-07, "loss": 0.0891, "step": 573 }, { "epoch": 1.6214689265536724, "grad_norm": 0.30421762095488025, "learning_rate": 8.581234306805969e-07, "loss": 0.0946, "step": 574 }, { "epoch": 1.6242937853107344, "grad_norm": 0.2940599119195987, "learning_rate": 8.457360201480702e-07, "loss": 0.0692, "step": 575 }, { "epoch": 1.6271186440677967, "grad_norm": 0.3002501406760772, "learning_rate": 8.334304045874248e-07, "loss": 0.0815, "step": 576 }, { "epoch": 1.6299435028248588, "grad_norm": 0.268425275016888, "learning_rate": 8.212068262888684e-07, "loss": 0.0751, "step": 577 }, { "epoch": 1.6327683615819208, "grad_norm": 0.2928986782866679, "learning_rate": 8.090655259273428e-07, "loss": 0.0918, "step": 578 }, { "epoch": 1.6355932203389831, "grad_norm": 0.32485035024590025, "learning_rate": 7.970067425577849e-07, "loss": 0.0933, "step": 579 }, { "epoch": 1.6384180790960452, "grad_norm": 0.3234267210299417, "learning_rate": 7.850307136104246e-07, "loss": 0.0904, "step": 580 }, { "epoch": 1.6412429378531073, "grad_norm": 0.30188930742886005, "learning_rate": 7.731376748861069e-07, "loss": 0.0889, "step": 581 }, { "epoch": 1.6440677966101696, "grad_norm": 0.335467078244967, "learning_rate": 7.613278605516455e-07, "loss": 0.1325, "step": 582 }, { "epoch": 1.6468926553672316, "grad_norm": 0.3072516075801986, "learning_rate": 7.4960150313522e-07, "loss": 0.0783, "step": 583 }, { "epoch": 1.6497175141242937, "grad_norm": 0.3131137120089587, "learning_rate": 7.379588335217875e-07, "loss": 0.0995, "step": 584 }, { "epoch": 1.652542372881356, "grad_norm": 0.2914572623071712, "learning_rate": 7.264000809485483e-07, "loss": 0.0863, "step": 585 }, { "epoch": 1.655367231638418, "grad_norm": 0.32502736910136926, "learning_rate": 7.149254730004246e-07, "loss": 0.1124, "step": 586 }, { "epoch": 1.65819209039548, "grad_norm": 0.326075318059859, "learning_rate": 7.035352356055786e-07, "loss": 0.1201, "step": 587 }, { "epoch": 1.6610169491525424, "grad_norm": 0.37300992352749307, "learning_rate": 6.922295930309691e-07, "loss": 0.1073, "step": 588 }, { "epoch": 1.6638418079096047, "grad_norm": 0.28876296213713953, "learning_rate": 6.810087678779353e-07, "loss": 0.0743, "step": 589 }, { "epoch": 1.6666666666666665, "grad_norm": 0.30368795978426233, "learning_rate": 6.698729810778065e-07, "loss": 0.0798, "step": 590 }, { "epoch": 1.6694915254237288, "grad_norm": 0.3072791432365542, "learning_rate": 6.588224518875647e-07, "loss": 0.0812, "step": 591 }, { "epoch": 1.672316384180791, "grad_norm": 0.3056293727238639, "learning_rate": 6.478573978855146e-07, "loss": 0.0684, "step": 592 }, { "epoch": 1.6751412429378532, "grad_norm": 0.30153905844016693, "learning_rate": 6.369780349670085e-07, "loss": 0.0779, "step": 593 }, { "epoch": 1.6779661016949152, "grad_norm": 0.2858390899342033, "learning_rate": 6.261845773401936e-07, "loss": 0.0713, "step": 594 }, { "epoch": 1.6807909604519775, "grad_norm": 0.30891966645412655, "learning_rate": 6.154772375217905e-07, "loss": 0.0837, "step": 595 }, { "epoch": 1.6836158192090396, "grad_norm": 0.29088564996940475, "learning_rate": 6.048562263329139e-07, "loss": 0.0825, "step": 596 }, { "epoch": 1.6864406779661016, "grad_norm": 0.29812823046797693, "learning_rate": 5.943217528949169e-07, "loss": 0.0927, "step": 597 }, { "epoch": 1.689265536723164, "grad_norm": 0.30652461054045976, "learning_rate": 5.838740246252794e-07, "loss": 0.0766, "step": 598 }, { "epoch": 1.692090395480226, "grad_norm": 0.3012630776892009, "learning_rate": 5.735132472335192e-07, "loss": 0.0893, "step": 599 }, { "epoch": 1.694915254237288, "grad_norm": 0.30023304223451514, "learning_rate": 5.632396247171429e-07, "loss": 0.1049, "step": 600 }, { "epoch": 1.6977401129943503, "grad_norm": 0.3280835461780043, "learning_rate": 5.530533593576292e-07, "loss": 0.116, "step": 601 }, { "epoch": 1.7005649717514124, "grad_norm": 0.2762418019865388, "learning_rate": 5.429546517164486e-07, "loss": 0.067, "step": 602 }, { "epoch": 1.7033898305084745, "grad_norm": 0.31675842207409677, "learning_rate": 5.329437006311122e-07, "loss": 0.0872, "step": 603 }, { "epoch": 1.7062146892655368, "grad_norm": 0.3041486434567392, "learning_rate": 5.230207032112549e-07, "loss": 0.0752, "step": 604 }, { "epoch": 1.7090395480225988, "grad_norm": 0.29993780041723445, "learning_rate": 5.131858548347596e-07, "loss": 0.0717, "step": 605 }, { "epoch": 1.711864406779661, "grad_norm": 0.30787432406626875, "learning_rate": 5.034393491439044e-07, "loss": 0.0802, "step": 606 }, { "epoch": 1.7146892655367232, "grad_norm": 0.28990243338730465, "learning_rate": 4.93781378041554e-07, "loss": 0.0871, "step": 607 }, { "epoch": 1.7175141242937855, "grad_norm": 0.3024236161639379, "learning_rate": 4.842121316873821e-07, "loss": 0.0855, "step": 608 }, { "epoch": 1.7203389830508473, "grad_norm": 0.3186961475088875, "learning_rate": 4.747317984941213e-07, "loss": 0.0875, "step": 609 }, { "epoch": 1.7231638418079096, "grad_norm": 0.2907838927015749, "learning_rate": 4.653405651238607e-07, "loss": 0.0908, "step": 610 }, { "epoch": 1.725988700564972, "grad_norm": 0.30763579782876077, "learning_rate": 4.560386164843639e-07, "loss": 0.0964, "step": 611 }, { "epoch": 1.7288135593220337, "grad_norm": 0.30987458360594455, "learning_rate": 4.468261357254339e-07, "loss": 0.0947, "step": 612 }, { "epoch": 1.731638418079096, "grad_norm": 0.29429416026094735, "learning_rate": 4.3770330423530626e-07, "loss": 0.0834, "step": 613 }, { "epoch": 1.7344632768361583, "grad_norm": 0.3063918655948546, "learning_rate": 4.286703016370719e-07, "loss": 0.0925, "step": 614 }, { "epoch": 1.7372881355932204, "grad_norm": 0.33329444172148553, "learning_rate": 4.197273057851464e-07, "loss": 0.0983, "step": 615 }, { "epoch": 1.7401129943502824, "grad_norm": 0.29709804495802744, "learning_rate": 4.108744927617669e-07, "loss": 0.079, "step": 616 }, { "epoch": 1.7429378531073447, "grad_norm": 0.2937681249333296, "learning_rate": 4.021120368735254e-07, "loss": 0.088, "step": 617 }, { "epoch": 1.7457627118644068, "grad_norm": 0.33235581919645196, "learning_rate": 3.934401106479352e-07, "loss": 0.093, "step": 618 }, { "epoch": 1.7485875706214689, "grad_norm": 0.3052937062176937, "learning_rate": 3.8485888483003384e-07, "loss": 0.0987, "step": 619 }, { "epoch": 1.7514124293785311, "grad_norm": 0.2953169881958288, "learning_rate": 3.763685283790208e-07, "loss": 0.0861, "step": 620 }, { "epoch": 1.7542372881355932, "grad_norm": 0.39089257775250585, "learning_rate": 3.679692084649372e-07, "loss": 0.1092, "step": 621 }, { "epoch": 1.7570621468926553, "grad_norm": 0.298962166079189, "learning_rate": 3.596610904653652e-07, "loss": 0.0877, "step": 622 }, { "epoch": 1.7598870056497176, "grad_norm": 0.33068479547784435, "learning_rate": 3.5144433796217515e-07, "loss": 0.0868, "step": 623 }, { "epoch": 1.7627118644067796, "grad_norm": 0.29230226311663704, "learning_rate": 3.433191127383079e-07, "loss": 0.0786, "step": 624 }, { "epoch": 1.7655367231638417, "grad_norm": 0.342045177865139, "learning_rate": 3.352855747745859e-07, "loss": 0.1034, "step": 625 }, { "epoch": 1.768361581920904, "grad_norm": 0.3538547033051671, "learning_rate": 3.2734388224656575e-07, "loss": 0.0913, "step": 626 }, { "epoch": 1.7711864406779663, "grad_norm": 0.3022791955228267, "learning_rate": 3.1949419152142e-07, "loss": 0.0912, "step": 627 }, { "epoch": 1.774011299435028, "grad_norm": 0.3286478841800299, "learning_rate": 3.1173665715486076e-07, "loss": 0.1005, "step": 628 }, { "epoch": 1.7768361581920904, "grad_norm": 0.2914491072414654, "learning_rate": 3.0407143188809885e-07, "loss": 0.087, "step": 629 }, { "epoch": 1.7796610169491527, "grad_norm": 0.26788446393967724, "learning_rate": 2.9649866664483387e-07, "loss": 0.06, "step": 630 }, { "epoch": 1.7824858757062145, "grad_norm": 0.29391848362521833, "learning_rate": 2.8901851052828e-07, "loss": 0.0789, "step": 631 }, { "epoch": 1.7853107344632768, "grad_norm": 0.2778847008048511, "learning_rate": 2.816311108182368e-07, "loss": 0.0626, "step": 632 }, { "epoch": 1.788135593220339, "grad_norm": 0.3154659365103027, "learning_rate": 2.743366129681824e-07, "loss": 0.101, "step": 633 }, { "epoch": 1.7909604519774012, "grad_norm": 0.31293085126205966, "learning_rate": 2.671351606024153e-07, "loss": 0.0762, "step": 634 }, { "epoch": 1.7937853107344632, "grad_norm": 0.47243232664184365, "learning_rate": 2.6002689551322403e-07, "loss": 0.1006, "step": 635 }, { "epoch": 1.7966101694915255, "grad_norm": 0.2781939275244853, "learning_rate": 2.530119576580936e-07, "loss": 0.0638, "step": 636 }, { "epoch": 1.7994350282485876, "grad_norm": 0.2796358570748197, "learning_rate": 2.460904851569534e-07, "loss": 0.0636, "step": 637 }, { "epoch": 1.8022598870056497, "grad_norm": 0.31697163166683606, "learning_rate": 2.3926261428945386e-07, "loss": 0.0707, "step": 638 }, { "epoch": 1.805084745762712, "grad_norm": 0.2764809135900223, "learning_rate": 2.325284794922883e-07, "loss": 0.0674, "step": 639 }, { "epoch": 1.807909604519774, "grad_norm": 0.3238518566445213, "learning_rate": 2.2588821335654044e-07, "loss": 0.0824, "step": 640 }, { "epoch": 1.810734463276836, "grad_norm": 0.34129160389189483, "learning_rate": 2.1934194662507736e-07, "loss": 0.0851, "step": 641 }, { "epoch": 1.8135593220338984, "grad_norm": 0.2825014150604838, "learning_rate": 2.1288980818997272e-07, "loss": 0.077, "step": 642 }, { "epoch": 1.8163841807909604, "grad_norm": 0.29713126389115424, "learning_rate": 2.0653192508997222e-07, "loss": 0.0762, "step": 643 }, { "epoch": 1.8192090395480225, "grad_norm": 0.300338264829181, "learning_rate": 2.0026842250799038e-07, "loss": 0.0878, "step": 644 }, { "epoch": 1.8220338983050848, "grad_norm": 1.4166868383475169, "learning_rate": 1.9409942376864333e-07, "loss": 0.0867, "step": 645 }, { "epoch": 1.8248587570621468, "grad_norm": 0.25772375662960606, "learning_rate": 1.8802505033582608e-07, "loss": 0.0604, "step": 646 }, { "epoch": 1.827683615819209, "grad_norm": 0.31296870168050195, "learning_rate": 1.8204542181031572e-07, "loss": 0.0909, "step": 647 }, { "epoch": 1.8305084745762712, "grad_norm": 0.3064057159229424, "learning_rate": 1.7616065592742038e-07, "loss": 0.0881, "step": 648 }, { "epoch": 1.8333333333333335, "grad_norm": 0.2998465592381362, "learning_rate": 1.7037086855465902e-07, "loss": 0.0629, "step": 649 }, { "epoch": 1.8361581920903953, "grad_norm": 0.29682999878586, "learning_rate": 1.6467617368947918e-07, "loss": 0.0786, "step": 650 }, { "epoch": 1.8389830508474576, "grad_norm": 0.2781558427742286, "learning_rate": 1.5907668345701732e-07, "loss": 0.0818, "step": 651 }, { "epoch": 1.84180790960452, "grad_norm": 0.27914990667274464, "learning_rate": 1.5357250810788316e-07, "loss": 0.0739, "step": 652 }, { "epoch": 1.844632768361582, "grad_norm": 0.3438724850417393, "learning_rate": 1.4816375601599653e-07, "loss": 0.0723, "step": 653 }, { "epoch": 1.847457627118644, "grad_norm": 0.31828540808245065, "learning_rate": 1.4285053367645074e-07, "loss": 0.077, "step": 654 }, { "epoch": 1.8502824858757063, "grad_norm": 0.27972916277751064, "learning_rate": 1.37632945703412e-07, "loss": 0.0705, "step": 655 }, { "epoch": 1.8531073446327684, "grad_norm": 0.24716233730443476, "learning_rate": 1.3251109482806667e-07, "loss": 0.0489, "step": 656 }, { "epoch": 1.8559322033898304, "grad_norm": 0.308468550353476, "learning_rate": 1.2748508189659447e-07, "loss": 0.0866, "step": 657 }, { "epoch": 1.8587570621468927, "grad_norm": 0.3092098443706496, "learning_rate": 1.2255500586818015e-07, "loss": 0.1055, "step": 658 }, { "epoch": 1.8615819209039548, "grad_norm": 0.28162343440142174, "learning_rate": 1.177209638130733e-07, "loss": 0.072, "step": 659 }, { "epoch": 1.8644067796610169, "grad_norm": 0.3007912961091226, "learning_rate": 1.1298305091066664e-07, "loss": 0.0908, "step": 660 }, { "epoch": 1.8672316384180792, "grad_norm": 0.3136447204914414, "learning_rate": 1.0834136044763188e-07, "loss": 0.0836, "step": 661 }, { "epoch": 1.8700564971751412, "grad_norm": 0.31357010636279004, "learning_rate": 1.0379598381607681e-07, "loss": 0.0807, "step": 662 }, { "epoch": 1.8728813559322033, "grad_norm": 0.3090649577202879, "learning_rate": 9.93470105117461e-08, "loss": 0.0949, "step": 663 }, { "epoch": 1.8757062146892656, "grad_norm": 0.30061286344301524, "learning_rate": 9.499452813226284e-08, "loss": 0.0832, "step": 664 }, { "epoch": 1.8785310734463276, "grad_norm": 0.3207780749688742, "learning_rate": 9.073862237539977e-08, "loss": 0.0922, "step": 665 }, { "epoch": 1.8813559322033897, "grad_norm": 0.35510559900468425, "learning_rate": 8.657937703739516e-08, "loss": 0.0989, "step": 666 }, { "epoch": 1.884180790960452, "grad_norm": 0.29122472340879935, "learning_rate": 8.251687401130137e-08, "loss": 0.0806, "step": 667 }, { "epoch": 1.8870056497175143, "grad_norm": 0.3387325114735546, "learning_rate": 7.855119328537109e-08, "loss": 0.1179, "step": 668 }, { "epoch": 1.8898305084745761, "grad_norm": 0.3838105969146293, "learning_rate": 7.468241294148471e-08, "loss": 0.1056, "step": 669 }, { "epoch": 1.8926553672316384, "grad_norm": 0.34521981231042703, "learning_rate": 7.09106091536127e-08, "loss": 0.0708, "step": 670 }, { "epoch": 1.8954802259887007, "grad_norm": 0.2860135017170368, "learning_rate": 6.723585618631456e-08, "loss": 0.0807, "step": 671 }, { "epoch": 1.8983050847457628, "grad_norm": 0.25713094736882913, "learning_rate": 6.365822639327724e-08, "loss": 0.0596, "step": 672 }, { "epoch": 1.9011299435028248, "grad_norm": 0.32493231522498606, "learning_rate": 6.017779021589065e-08, "loss": 0.0783, "step": 673 }, { "epoch": 1.9039548022598871, "grad_norm": 0.3195456319023967, "learning_rate": 5.679461618185944e-08, "loss": 0.0989, "step": 674 }, { "epoch": 1.9067796610169492, "grad_norm": 0.28832414052354244, "learning_rate": 5.350877090385731e-08, "loss": 0.0842, "step": 675 }, { "epoch": 1.9096045197740112, "grad_norm": 0.29862804180466584, "learning_rate": 5.032031907821089e-08, "loss": 0.0799, "step": 676 }, { "epoch": 1.9124293785310735, "grad_norm": 0.39261040413818743, "learning_rate": 4.722932348362852e-08, "loss": 0.0763, "step": 677 }, { "epoch": 1.9152542372881356, "grad_norm": 0.28345109997512263, "learning_rate": 4.423584497996458e-08, "loss": 0.0623, "step": 678 }, { "epoch": 1.9180790960451977, "grad_norm": 0.3730843254054893, "learning_rate": 4.1339942507018225e-08, "loss": 0.1051, "step": 679 }, { "epoch": 1.92090395480226, "grad_norm": 0.408781043992154, "learning_rate": 3.8541673083377086e-08, "loss": 0.0972, "step": 680 }, { "epoch": 1.923728813559322, "grad_norm": 0.27235202225360894, "learning_rate": 3.584109180529205e-08, "loss": 0.078, "step": 681 }, { "epoch": 1.926553672316384, "grad_norm": 0.3249573151531456, "learning_rate": 3.323825184559204e-08, "loss": 0.0665, "step": 682 }, { "epoch": 1.9293785310734464, "grad_norm": 0.3917754153996301, "learning_rate": 3.073320445263817e-08, "loss": 0.0948, "step": 683 }, { "epoch": 1.9322033898305084, "grad_norm": 0.3436451083397938, "learning_rate": 2.8325998949314536e-08, "loss": 0.1001, "step": 684 }, { "epoch": 1.9350282485875705, "grad_norm": 0.2962089436476239, "learning_rate": 2.6016682732057375e-08, "loss": 0.092, "step": 685 }, { "epoch": 1.9378531073446328, "grad_norm": 0.31332687230877443, "learning_rate": 2.3805301269920754e-08, "loss": 0.0719, "step": 686 }, { "epoch": 1.940677966101695, "grad_norm": 0.29206089331438057, "learning_rate": 2.1691898103682885e-08, "loss": 0.0803, "step": 687 }, { "epoch": 1.943502824858757, "grad_norm": 0.29588225232531956, "learning_rate": 1.9676514844987338e-08, "loss": 0.0746, "step": 688 }, { "epoch": 1.9463276836158192, "grad_norm": 0.3052178637723924, "learning_rate": 1.775919117552427e-08, "loss": 0.0683, "step": 689 }, { "epoch": 1.9491525423728815, "grad_norm": 0.33628210770097194, "learning_rate": 1.593996484624938e-08, "loss": 0.0876, "step": 690 }, { "epoch": 1.9519774011299436, "grad_norm": 0.41716535023044066, "learning_rate": 1.42188716766406e-08, "loss": 0.0797, "step": 691 }, { "epoch": 1.9548022598870056, "grad_norm": 0.2721830233815887, "learning_rate": 1.2595945553992572e-08, "loss": 0.0746, "step": 692 }, { "epoch": 1.957627118644068, "grad_norm": 0.3093867803026701, "learning_rate": 1.1071218432749942e-08, "loss": 0.0927, "step": 693 }, { "epoch": 1.96045197740113, "grad_norm": 0.28442945395766916, "learning_rate": 9.6447203338762e-09, "loss": 0.0787, "step": 694 }, { "epoch": 1.963276836158192, "grad_norm": 0.30806352104616475, "learning_rate": 8.316479344266382e-09, "loss": 0.0888, "step": 695 }, { "epoch": 1.9661016949152543, "grad_norm": 0.366990265292286, "learning_rate": 7.0865216161902785e-09, "loss": 0.1067, "step": 696 }, { "epoch": 1.9689265536723164, "grad_norm": 0.37187519570368377, "learning_rate": 5.954871366779525e-09, "loss": 0.0814, "step": 697 }, { "epoch": 1.9717514124293785, "grad_norm": 0.3167633384396195, "learning_rate": 4.921550877550752e-09, "loss": 0.0687, "step": 698 }, { "epoch": 1.9745762711864407, "grad_norm": 0.301910108787057, "learning_rate": 3.9865804939659414e-09, "loss": 0.0748, "step": 699 }, { "epoch": 1.9774011299435028, "grad_norm": 0.32697329312245565, "learning_rate": 3.1499786250321904e-09, "loss": 0.1215, "step": 700 }, { "epoch": 1.9802259887005649, "grad_norm": 0.31440661932949104, "learning_rate": 2.411761742939778e-09, "loss": 0.0798, "step": 701 }, { "epoch": 1.9830508474576272, "grad_norm": 0.32934297243481625, "learning_rate": 1.7719443827368677e-09, "loss": 0.0759, "step": 702 }, { "epoch": 1.9858757062146892, "grad_norm": 0.3018134458199549, "learning_rate": 1.2305391420458502e-09, "loss": 0.0935, "step": 703 }, { "epoch": 1.9887005649717513, "grad_norm": 0.2717967961520749, "learning_rate": 7.875566808107638e-10, "loss": 0.0621, "step": 704 }, { "epoch": 1.9915254237288136, "grad_norm": 0.30491685297395554, "learning_rate": 4.4300572109134965e-10, "loss": 0.0913, "step": 705 }, { "epoch": 1.9943502824858759, "grad_norm": 0.2812227615626389, "learning_rate": 1.9689304688985667e-10, "loss": 0.0699, "step": 706 }, { "epoch": 1.9971751412429377, "grad_norm": 0.27084127166875765, "learning_rate": 4.922350401781461e-11, "loss": 0.0745, "step": 707 }, { "epoch": 2.0, "grad_norm": 0.29812406633307104, "learning_rate": 0.0, "loss": 0.0894, "step": 708 }, { "epoch": 2.0, "step": 708, "total_flos": 20169300639744.0, "train_loss": 0.10943256686362675, "train_runtime": 738.5688, "train_samples_per_second": 7.666, "train_steps_per_second": 0.959 } ], "logging_steps": 1, "max_steps": 708, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 20169300639744.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }