{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5823358136525396, "eval_steps": 386, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012940795858945326, "grad_norm": 1.7405146360397339, "learning_rate": 2.0000000000000003e-06, "loss": 2.4269, "step": 1 }, { "epoch": 0.0012940795858945326, "eval_loss": 2.247628688812256, "eval_runtime": 189.8853, "eval_samples_per_second": 3.428, "eval_steps_per_second": 0.858, "step": 1 }, { "epoch": 0.002588159171789065, "grad_norm": 1.6643542051315308, "learning_rate": 4.000000000000001e-06, "loss": 2.2583, "step": 2 }, { "epoch": 0.0038822387576835974, "grad_norm": 1.8690767288208008, "learning_rate": 6e-06, "loss": 2.2696, "step": 3 }, { "epoch": 0.00517631834357813, "grad_norm": 1.828118085861206, "learning_rate": 8.000000000000001e-06, "loss": 2.3646, "step": 4 }, { "epoch": 0.006470397929472662, "grad_norm": 1.9319926500320435, "learning_rate": 1e-05, "loss": 2.4196, "step": 5 }, { "epoch": 0.007764477515367195, "grad_norm": 1.7723782062530518, "learning_rate": 1.2e-05, "loss": 2.4177, "step": 6 }, { "epoch": 0.009058557101261728, "grad_norm": 1.9500815868377686, "learning_rate": 1.4000000000000001e-05, "loss": 2.3497, "step": 7 }, { "epoch": 0.01035263668715626, "grad_norm": 2.3909075260162354, "learning_rate": 1.6000000000000003e-05, "loss": 2.405, "step": 8 }, { "epoch": 0.011646716273050793, "grad_norm": 2.0620856285095215, "learning_rate": 1.8e-05, "loss": 2.4098, "step": 9 }, { "epoch": 0.012940795858945324, "grad_norm": 1.8054910898208618, "learning_rate": 2e-05, "loss": 2.1233, "step": 10 }, { "epoch": 0.014234875444839857, "grad_norm": 2.190964937210083, "learning_rate": 2.2000000000000003e-05, "loss": 2.3985, "step": 11 }, { "epoch": 0.01552895503073439, "grad_norm": 1.9412921667099, "learning_rate": 2.4e-05, "loss": 2.462, "step": 12 }, { "epoch": 0.016823034616628922, "grad_norm": 1.9161555767059326, "learning_rate": 2.6000000000000002e-05, "loss": 2.2118, "step": 13 }, { "epoch": 0.018117114202523456, "grad_norm": 1.7161599397659302, "learning_rate": 2.8000000000000003e-05, "loss": 2.2175, "step": 14 }, { "epoch": 0.019411193788417987, "grad_norm": 2.173877000808716, "learning_rate": 3e-05, "loss": 2.2521, "step": 15 }, { "epoch": 0.02070527337431252, "grad_norm": 2.0000555515289307, "learning_rate": 3.2000000000000005e-05, "loss": 2.1615, "step": 16 }, { "epoch": 0.021999352960207053, "grad_norm": 1.5915080308914185, "learning_rate": 3.4000000000000007e-05, "loss": 1.9522, "step": 17 }, { "epoch": 0.023293432546101587, "grad_norm": 1.6972448825836182, "learning_rate": 3.6e-05, "loss": 1.7224, "step": 18 }, { "epoch": 0.024587512131996118, "grad_norm": 1.7509772777557373, "learning_rate": 3.8e-05, "loss": 2.0414, "step": 19 }, { "epoch": 0.02588159171789065, "grad_norm": 1.697340488433838, "learning_rate": 4e-05, "loss": 2.0427, "step": 20 }, { "epoch": 0.027175671303785183, "grad_norm": 1.8733758926391602, "learning_rate": 4.2e-05, "loss": 1.6772, "step": 21 }, { "epoch": 0.028469750889679714, "grad_norm": 1.6085255146026611, "learning_rate": 4.4000000000000006e-05, "loss": 1.6527, "step": 22 }, { "epoch": 0.029763830475574248, "grad_norm": 1.5792337656021118, "learning_rate": 4.600000000000001e-05, "loss": 1.6567, "step": 23 }, { "epoch": 0.03105791006146878, "grad_norm": 1.4392567873001099, "learning_rate": 4.8e-05, "loss": 1.508, "step": 24 }, { "epoch": 0.03235198964736331, "grad_norm": 1.5222433805465698, "learning_rate": 5e-05, "loss": 1.4606, "step": 25 }, { "epoch": 0.033646069233257844, "grad_norm": 1.5480064153671265, "learning_rate": 5.2000000000000004e-05, "loss": 1.5027, "step": 26 }, { "epoch": 0.034940148819152375, "grad_norm": 1.6736445426940918, "learning_rate": 5.4000000000000005e-05, "loss": 1.2426, "step": 27 }, { "epoch": 0.03623422840504691, "grad_norm": 1.7392551898956299, "learning_rate": 5.6000000000000006e-05, "loss": 1.4703, "step": 28 }, { "epoch": 0.037528307990941444, "grad_norm": 1.6173359155654907, "learning_rate": 5.8e-05, "loss": 1.4546, "step": 29 }, { "epoch": 0.038822387576835975, "grad_norm": 1.3955802917480469, "learning_rate": 6e-05, "loss": 1.3808, "step": 30 }, { "epoch": 0.040116467162730506, "grad_norm": 1.353873372077942, "learning_rate": 6.2e-05, "loss": 1.229, "step": 31 }, { "epoch": 0.04141054674862504, "grad_norm": 1.2547746896743774, "learning_rate": 6.400000000000001e-05, "loss": 1.1668, "step": 32 }, { "epoch": 0.042704626334519574, "grad_norm": 1.3806778192520142, "learning_rate": 6.6e-05, "loss": 1.0691, "step": 33 }, { "epoch": 0.043998705920414105, "grad_norm": 1.2815773487091064, "learning_rate": 6.800000000000001e-05, "loss": 1.2409, "step": 34 }, { "epoch": 0.045292785506308636, "grad_norm": 1.3677266836166382, "learning_rate": 7e-05, "loss": 0.9668, "step": 35 }, { "epoch": 0.046586865092203174, "grad_norm": 1.5457032918930054, "learning_rate": 7.2e-05, "loss": 1.1385, "step": 36 }, { "epoch": 0.047880944678097705, "grad_norm": 1.5587060451507568, "learning_rate": 7.4e-05, "loss": 1.1707, "step": 37 }, { "epoch": 0.049175024263992236, "grad_norm": 1.079053282737732, "learning_rate": 7.6e-05, "loss": 1.0655, "step": 38 }, { "epoch": 0.050469103849886766, "grad_norm": 1.1773897409439087, "learning_rate": 7.800000000000001e-05, "loss": 1.0465, "step": 39 }, { "epoch": 0.0517631834357813, "grad_norm": 1.2437673807144165, "learning_rate": 8e-05, "loss": 1.2779, "step": 40 }, { "epoch": 0.053057263021675835, "grad_norm": 1.254847526550293, "learning_rate": 8.2e-05, "loss": 1.0898, "step": 41 }, { "epoch": 0.054351342607570366, "grad_norm": 1.1771515607833862, "learning_rate": 8.4e-05, "loss": 1.1827, "step": 42 }, { "epoch": 0.0556454221934649, "grad_norm": 1.1400648355484009, "learning_rate": 8.6e-05, "loss": 1.1066, "step": 43 }, { "epoch": 0.05693950177935943, "grad_norm": 1.2047138214111328, "learning_rate": 8.800000000000001e-05, "loss": 0.8974, "step": 44 }, { "epoch": 0.058233581365253966, "grad_norm": 1.1269346475601196, "learning_rate": 9e-05, "loss": 1.0146, "step": 45 }, { "epoch": 0.059527660951148496, "grad_norm": 1.169231653213501, "learning_rate": 9.200000000000001e-05, "loss": 1.1266, "step": 46 }, { "epoch": 0.06082174053704303, "grad_norm": 0.9771779179573059, "learning_rate": 9.4e-05, "loss": 0.8351, "step": 47 }, { "epoch": 0.06211582012293756, "grad_norm": 1.2849314212799072, "learning_rate": 9.6e-05, "loss": 1.1822, "step": 48 }, { "epoch": 0.0634098997088321, "grad_norm": 1.023181676864624, "learning_rate": 9.8e-05, "loss": 0.9082, "step": 49 }, { "epoch": 0.06470397929472663, "grad_norm": 1.135751724243164, "learning_rate": 0.0001, "loss": 0.9407, "step": 50 }, { "epoch": 0.06599805888062116, "grad_norm": 0.9701154828071594, "learning_rate": 9.999998300231494e-05, "loss": 0.9423, "step": 51 }, { "epoch": 0.06729213846651569, "grad_norm": 1.2891143560409546, "learning_rate": 9.999993200927133e-05, "loss": 0.9757, "step": 52 }, { "epoch": 0.06858621805241022, "grad_norm": 1.3360975980758667, "learning_rate": 9.999984702090383e-05, "loss": 1.0158, "step": 53 }, { "epoch": 0.06988029763830475, "grad_norm": 0.977446436882019, "learning_rate": 9.999972803727024e-05, "loss": 0.8175, "step": 54 }, { "epoch": 0.0711743772241993, "grad_norm": 0.9943827390670776, "learning_rate": 9.999957505845144e-05, "loss": 0.8627, "step": 55 }, { "epoch": 0.07246845681009383, "grad_norm": 1.1531224250793457, "learning_rate": 9.999938808455145e-05, "loss": 1.143, "step": 56 }, { "epoch": 0.07376253639598836, "grad_norm": 1.287972092628479, "learning_rate": 9.99991671156974e-05, "loss": 1.2342, "step": 57 }, { "epoch": 0.07505661598188289, "grad_norm": 1.1554590463638306, "learning_rate": 9.999891215203949e-05, "loss": 0.9692, "step": 58 }, { "epoch": 0.07635069556777742, "grad_norm": 1.0786008834838867, "learning_rate": 9.999862319375113e-05, "loss": 1.1254, "step": 59 }, { "epoch": 0.07764477515367195, "grad_norm": 1.0764508247375488, "learning_rate": 9.999830024102874e-05, "loss": 0.9312, "step": 60 }, { "epoch": 0.07893885473956648, "grad_norm": 1.1909526586532593, "learning_rate": 9.999794329409194e-05, "loss": 0.9959, "step": 61 }, { "epoch": 0.08023293432546101, "grad_norm": 0.9989166259765625, "learning_rate": 9.999755235318337e-05, "loss": 0.934, "step": 62 }, { "epoch": 0.08152701391135554, "grad_norm": 1.0302046537399292, "learning_rate": 9.999712741856889e-05, "loss": 1.1017, "step": 63 }, { "epoch": 0.08282109349725009, "grad_norm": 0.9583478569984436, "learning_rate": 9.999666849053738e-05, "loss": 1.1384, "step": 64 }, { "epoch": 0.08411517308314462, "grad_norm": 1.001126766204834, "learning_rate": 9.999617556940085e-05, "loss": 0.9279, "step": 65 }, { "epoch": 0.08540925266903915, "grad_norm": 1.0130903720855713, "learning_rate": 9.999564865549449e-05, "loss": 0.9381, "step": 66 }, { "epoch": 0.08670333225493368, "grad_norm": 1.1210829019546509, "learning_rate": 9.999508774917652e-05, "loss": 0.9607, "step": 67 }, { "epoch": 0.08799741184082821, "grad_norm": 1.045749545097351, "learning_rate": 9.999449285082831e-05, "loss": 1.0037, "step": 68 }, { "epoch": 0.08929149142672274, "grad_norm": 1.1308139562606812, "learning_rate": 9.999386396085434e-05, "loss": 0.9086, "step": 69 }, { "epoch": 0.09058557101261727, "grad_norm": 1.1013413667678833, "learning_rate": 9.999320107968219e-05, "loss": 1.0712, "step": 70 }, { "epoch": 0.0918796505985118, "grad_norm": 1.0830566883087158, "learning_rate": 9.999250420776258e-05, "loss": 1.0326, "step": 71 }, { "epoch": 0.09317373018440635, "grad_norm": 1.0673171281814575, "learning_rate": 9.999177334556929e-05, "loss": 1.0034, "step": 72 }, { "epoch": 0.09446780977030088, "grad_norm": 1.1546461582183838, "learning_rate": 9.999100849359926e-05, "loss": 1.059, "step": 73 }, { "epoch": 0.09576188935619541, "grad_norm": 0.9139528870582581, "learning_rate": 9.999020965237249e-05, "loss": 0.8596, "step": 74 }, { "epoch": 0.09705596894208994, "grad_norm": 1.1570812463760376, "learning_rate": 9.998937682243215e-05, "loss": 1.0456, "step": 75 }, { "epoch": 0.09835004852798447, "grad_norm": 1.3232612609863281, "learning_rate": 9.998851000434448e-05, "loss": 0.9994, "step": 76 }, { "epoch": 0.099644128113879, "grad_norm": 1.2017115354537964, "learning_rate": 9.998760919869883e-05, "loss": 1.2664, "step": 77 }, { "epoch": 0.10093820769977353, "grad_norm": 1.0694175958633423, "learning_rate": 9.998667440610765e-05, "loss": 0.9483, "step": 78 }, { "epoch": 0.10223228728566806, "grad_norm": 0.9963059425354004, "learning_rate": 9.998570562720654e-05, "loss": 0.9577, "step": 79 }, { "epoch": 0.1035263668715626, "grad_norm": 0.8873535394668579, "learning_rate": 9.998470286265416e-05, "loss": 0.8498, "step": 80 }, { "epoch": 0.10482044645745714, "grad_norm": 1.1350760459899902, "learning_rate": 9.99836661131323e-05, "loss": 1.0024, "step": 81 }, { "epoch": 0.10611452604335167, "grad_norm": 0.8355389833450317, "learning_rate": 9.998259537934586e-05, "loss": 0.7399, "step": 82 }, { "epoch": 0.1074086056292462, "grad_norm": 0.9935446381568909, "learning_rate": 9.998149066202284e-05, "loss": 0.9809, "step": 83 }, { "epoch": 0.10870268521514073, "grad_norm": 1.0571558475494385, "learning_rate": 9.998035196191435e-05, "loss": 1.0144, "step": 84 }, { "epoch": 0.10999676480103526, "grad_norm": 0.9860286116600037, "learning_rate": 9.99791792797946e-05, "loss": 1.0467, "step": 85 }, { "epoch": 0.1112908443869298, "grad_norm": 1.1422507762908936, "learning_rate": 9.997797261646089e-05, "loss": 0.9535, "step": 86 }, { "epoch": 0.11258492397282432, "grad_norm": 0.8561545014381409, "learning_rate": 9.997673197273365e-05, "loss": 1.007, "step": 87 }, { "epoch": 0.11387900355871886, "grad_norm": 1.0027543306350708, "learning_rate": 9.997545734945639e-05, "loss": 0.9861, "step": 88 }, { "epoch": 0.1151730831446134, "grad_norm": 0.8489773273468018, "learning_rate": 9.997414874749575e-05, "loss": 0.9672, "step": 89 }, { "epoch": 0.11646716273050793, "grad_norm": 1.0517115592956543, "learning_rate": 9.997280616774147e-05, "loss": 1.1672, "step": 90 }, { "epoch": 0.11776124231640246, "grad_norm": 1.0035395622253418, "learning_rate": 9.997142961110634e-05, "loss": 0.9294, "step": 91 }, { "epoch": 0.11905532190229699, "grad_norm": 1.1194915771484375, "learning_rate": 9.997001907852635e-05, "loss": 1.0857, "step": 92 }, { "epoch": 0.12034940148819152, "grad_norm": 1.5234825611114502, "learning_rate": 9.996857457096047e-05, "loss": 1.027, "step": 93 }, { "epoch": 0.12164348107408605, "grad_norm": 0.949878454208374, "learning_rate": 9.996709608939088e-05, "loss": 0.8173, "step": 94 }, { "epoch": 0.12293756065998059, "grad_norm": 0.8736472129821777, "learning_rate": 9.996558363482277e-05, "loss": 0.855, "step": 95 }, { "epoch": 0.12423164024587512, "grad_norm": 0.8604567050933838, "learning_rate": 9.996403720828449e-05, "loss": 0.9485, "step": 96 }, { "epoch": 0.12552571983176966, "grad_norm": 1.020851492881775, "learning_rate": 9.996245681082748e-05, "loss": 1.0024, "step": 97 }, { "epoch": 0.1268197994176642, "grad_norm": 1.0704892873764038, "learning_rate": 9.996084244352623e-05, "loss": 0.9246, "step": 98 }, { "epoch": 0.12811387900355872, "grad_norm": 0.8441987037658691, "learning_rate": 9.99591941074784e-05, "loss": 1.0343, "step": 99 }, { "epoch": 0.12940795858945325, "grad_norm": 1.0280612707138062, "learning_rate": 9.995751180380466e-05, "loss": 0.9644, "step": 100 }, { "epoch": 0.13070203817534778, "grad_norm": 0.9827906489372253, "learning_rate": 9.995579553364887e-05, "loss": 0.9583, "step": 101 }, { "epoch": 0.13199611776124232, "grad_norm": 1.035618543624878, "learning_rate": 9.995404529817791e-05, "loss": 1.0366, "step": 102 }, { "epoch": 0.13329019734713685, "grad_norm": 1.2775524854660034, "learning_rate": 9.995226109858178e-05, "loss": 0.9353, "step": 103 }, { "epoch": 0.13458427693303138, "grad_norm": 1.0101919174194336, "learning_rate": 9.995044293607355e-05, "loss": 0.9045, "step": 104 }, { "epoch": 0.1358783565189259, "grad_norm": 0.8396942019462585, "learning_rate": 9.994859081188943e-05, "loss": 0.867, "step": 105 }, { "epoch": 0.13717243610482044, "grad_norm": 1.04515540599823, "learning_rate": 9.99467047272887e-05, "loss": 0.9693, "step": 106 }, { "epoch": 0.13846651569071497, "grad_norm": 1.099042534828186, "learning_rate": 9.994478468355369e-05, "loss": 0.8879, "step": 107 }, { "epoch": 0.1397605952766095, "grad_norm": 0.8710360527038574, "learning_rate": 9.994283068198988e-05, "loss": 0.9018, "step": 108 }, { "epoch": 0.14105467486250403, "grad_norm": 0.961025059223175, "learning_rate": 9.99408427239258e-05, "loss": 0.8806, "step": 109 }, { "epoch": 0.1423487544483986, "grad_norm": 0.915665328502655, "learning_rate": 9.993882081071306e-05, "loss": 0.8628, "step": 110 }, { "epoch": 0.14364283403429312, "grad_norm": 1.2776648998260498, "learning_rate": 9.993676494372642e-05, "loss": 0.9742, "step": 111 }, { "epoch": 0.14493691362018765, "grad_norm": 1.1270071268081665, "learning_rate": 9.993467512436364e-05, "loss": 0.9729, "step": 112 }, { "epoch": 0.14623099320608218, "grad_norm": 0.8188664317131042, "learning_rate": 9.99325513540456e-05, "loss": 0.9428, "step": 113 }, { "epoch": 0.1475250727919767, "grad_norm": 1.0760393142700195, "learning_rate": 9.993039363421627e-05, "loss": 0.9482, "step": 114 }, { "epoch": 0.14881915237787124, "grad_norm": 1.019920825958252, "learning_rate": 9.992820196634273e-05, "loss": 0.9785, "step": 115 }, { "epoch": 0.15011323196376578, "grad_norm": 0.8342046737670898, "learning_rate": 9.992597635191509e-05, "loss": 0.9291, "step": 116 }, { "epoch": 0.1514073115496603, "grad_norm": 0.8460632562637329, "learning_rate": 9.992371679244658e-05, "loss": 0.8797, "step": 117 }, { "epoch": 0.15270139113555484, "grad_norm": 0.933060348033905, "learning_rate": 9.992142328947345e-05, "loss": 0.9657, "step": 118 }, { "epoch": 0.15399547072144937, "grad_norm": 0.8822593688964844, "learning_rate": 9.991909584455511e-05, "loss": 0.8872, "step": 119 }, { "epoch": 0.1552895503073439, "grad_norm": 0.9599350094795227, "learning_rate": 9.991673445927398e-05, "loss": 0.9064, "step": 120 }, { "epoch": 0.15658362989323843, "grad_norm": 0.8505874872207642, "learning_rate": 9.99143391352356e-05, "loss": 0.9966, "step": 121 }, { "epoch": 0.15787770947913296, "grad_norm": 1.3977786302566528, "learning_rate": 9.991190987406857e-05, "loss": 0.9145, "step": 122 }, { "epoch": 0.1591717890650275, "grad_norm": 0.8947294354438782, "learning_rate": 9.990944667742455e-05, "loss": 0.9569, "step": 123 }, { "epoch": 0.16046586865092202, "grad_norm": 0.7973839044570923, "learning_rate": 9.990694954697828e-05, "loss": 0.8853, "step": 124 }, { "epoch": 0.16175994823681655, "grad_norm": 0.9481159448623657, "learning_rate": 9.99044184844276e-05, "loss": 1.04, "step": 125 }, { "epoch": 0.16305402782271108, "grad_norm": 1.3568611145019531, "learning_rate": 9.990185349149339e-05, "loss": 1.1104, "step": 126 }, { "epoch": 0.16434810740860564, "grad_norm": 0.900867223739624, "learning_rate": 9.98992545699196e-05, "loss": 0.8427, "step": 127 }, { "epoch": 0.16564218699450017, "grad_norm": 0.9025059938430786, "learning_rate": 9.989662172147326e-05, "loss": 0.9671, "step": 128 }, { "epoch": 0.1669362665803947, "grad_norm": 0.944692850112915, "learning_rate": 9.989395494794446e-05, "loss": 1.0966, "step": 129 }, { "epoch": 0.16823034616628924, "grad_norm": 1.196311116218567, "learning_rate": 9.989125425114638e-05, "loss": 1.0888, "step": 130 }, { "epoch": 0.16952442575218377, "grad_norm": 0.9069584608078003, "learning_rate": 9.988851963291522e-05, "loss": 0.8579, "step": 131 }, { "epoch": 0.1708185053380783, "grad_norm": 0.8150789141654968, "learning_rate": 9.988575109511026e-05, "loss": 0.7622, "step": 132 }, { "epoch": 0.17211258492397283, "grad_norm": 1.0844395160675049, "learning_rate": 9.988294863961387e-05, "loss": 0.9284, "step": 133 }, { "epoch": 0.17340666450986736, "grad_norm": 1.0463049411773682, "learning_rate": 9.988011226833146e-05, "loss": 0.9185, "step": 134 }, { "epoch": 0.1747007440957619, "grad_norm": 0.9481234550476074, "learning_rate": 9.987724198319148e-05, "loss": 0.8631, "step": 135 }, { "epoch": 0.17599482368165642, "grad_norm": 0.882074773311615, "learning_rate": 9.987433778614549e-05, "loss": 0.8997, "step": 136 }, { "epoch": 0.17728890326755095, "grad_norm": 0.9853332042694092, "learning_rate": 9.987139967916805e-05, "loss": 0.9226, "step": 137 }, { "epoch": 0.17858298285344548, "grad_norm": 1.151941180229187, "learning_rate": 9.98684276642568e-05, "loss": 1.0486, "step": 138 }, { "epoch": 0.17987706243934, "grad_norm": 1.0128459930419922, "learning_rate": 9.986542174343245e-05, "loss": 1.0797, "step": 139 }, { "epoch": 0.18117114202523454, "grad_norm": 0.9798718094825745, "learning_rate": 9.986238191873874e-05, "loss": 0.875, "step": 140 }, { "epoch": 0.18246522161112907, "grad_norm": 0.8143295645713806, "learning_rate": 9.985930819224247e-05, "loss": 0.8454, "step": 141 }, { "epoch": 0.1837593011970236, "grad_norm": 0.8755755424499512, "learning_rate": 9.985620056603348e-05, "loss": 0.8029, "step": 142 }, { "epoch": 0.18505338078291814, "grad_norm": 0.899174690246582, "learning_rate": 9.985305904222469e-05, "loss": 0.9608, "step": 143 }, { "epoch": 0.1863474603688127, "grad_norm": 0.920137882232666, "learning_rate": 9.984988362295203e-05, "loss": 0.9022, "step": 144 }, { "epoch": 0.18764153995470723, "grad_norm": 1.1012908220291138, "learning_rate": 9.984667431037447e-05, "loss": 0.9621, "step": 145 }, { "epoch": 0.18893561954060176, "grad_norm": 0.8609358668327332, "learning_rate": 9.98434311066741e-05, "loss": 0.917, "step": 146 }, { "epoch": 0.1902296991264963, "grad_norm": 0.8248727321624756, "learning_rate": 9.984015401405594e-05, "loss": 0.7864, "step": 147 }, { "epoch": 0.19152377871239082, "grad_norm": 0.8680225610733032, "learning_rate": 9.983684303474815e-05, "loss": 0.9288, "step": 148 }, { "epoch": 0.19281785829828535, "grad_norm": 1.0807067155838013, "learning_rate": 9.983349817100188e-05, "loss": 0.9842, "step": 149 }, { "epoch": 0.19411193788417988, "grad_norm": 0.9310898780822754, "learning_rate": 9.983011942509131e-05, "loss": 1.0568, "step": 150 }, { "epoch": 0.1954060174700744, "grad_norm": 0.8052242398262024, "learning_rate": 9.98267067993137e-05, "loss": 0.8235, "step": 151 }, { "epoch": 0.19670009705596894, "grad_norm": 0.9700384140014648, "learning_rate": 9.982326029598931e-05, "loss": 0.8611, "step": 152 }, { "epoch": 0.19799417664186347, "grad_norm": 0.8437764048576355, "learning_rate": 9.981977991746142e-05, "loss": 0.83, "step": 153 }, { "epoch": 0.199288256227758, "grad_norm": 0.930636465549469, "learning_rate": 9.98162656660964e-05, "loss": 1.0892, "step": 154 }, { "epoch": 0.20058233581365253, "grad_norm": 0.9111954569816589, "learning_rate": 9.98127175442836e-05, "loss": 0.9962, "step": 155 }, { "epoch": 0.20187641539954707, "grad_norm": 0.9521974921226501, "learning_rate": 9.980913555443541e-05, "loss": 0.911, "step": 156 }, { "epoch": 0.2031704949854416, "grad_norm": 0.8516745567321777, "learning_rate": 9.980551969898727e-05, "loss": 0.9009, "step": 157 }, { "epoch": 0.20446457457133613, "grad_norm": 0.8302998542785645, "learning_rate": 9.98018699803976e-05, "loss": 0.8854, "step": 158 }, { "epoch": 0.20575865415723066, "grad_norm": 0.814391016960144, "learning_rate": 9.979818640114789e-05, "loss": 0.9601, "step": 159 }, { "epoch": 0.2070527337431252, "grad_norm": 0.8938564658164978, "learning_rate": 9.979446896374262e-05, "loss": 0.8834, "step": 160 }, { "epoch": 0.20834681332901975, "grad_norm": 0.9066985249519348, "learning_rate": 9.979071767070932e-05, "loss": 0.7427, "step": 161 }, { "epoch": 0.20964089291491428, "grad_norm": 0.7866595983505249, "learning_rate": 9.978693252459851e-05, "loss": 0.8556, "step": 162 }, { "epoch": 0.2109349725008088, "grad_norm": 0.9159708023071289, "learning_rate": 9.978311352798374e-05, "loss": 0.8101, "step": 163 }, { "epoch": 0.21222905208670334, "grad_norm": 1.1350793838500977, "learning_rate": 9.977926068346157e-05, "loss": 0.9374, "step": 164 }, { "epoch": 0.21352313167259787, "grad_norm": 1.0535932779312134, "learning_rate": 9.977537399365159e-05, "loss": 1.0238, "step": 165 }, { "epoch": 0.2148172112584924, "grad_norm": 0.8717033267021179, "learning_rate": 9.977145346119637e-05, "loss": 1.0265, "step": 166 }, { "epoch": 0.21611129084438693, "grad_norm": 0.8357003927230835, "learning_rate": 9.976749908876152e-05, "loss": 0.9016, "step": 167 }, { "epoch": 0.21740537043028146, "grad_norm": 0.8369495868682861, "learning_rate": 9.976351087903568e-05, "loss": 0.8764, "step": 168 }, { "epoch": 0.218699450016176, "grad_norm": 0.912352979183197, "learning_rate": 9.97594888347304e-05, "loss": 0.9078, "step": 169 }, { "epoch": 0.21999352960207053, "grad_norm": 0.8475804328918457, "learning_rate": 9.975543295858035e-05, "loss": 0.8836, "step": 170 }, { "epoch": 0.22128760918796506, "grad_norm": 0.8391397595405579, "learning_rate": 9.97513432533431e-05, "loss": 0.9003, "step": 171 }, { "epoch": 0.2225816887738596, "grad_norm": 0.9666821360588074, "learning_rate": 9.974721972179931e-05, "loss": 0.9528, "step": 172 }, { "epoch": 0.22387576835975412, "grad_norm": 0.9321691393852234, "learning_rate": 9.974306236675259e-05, "loss": 0.9575, "step": 173 }, { "epoch": 0.22516984794564865, "grad_norm": 0.8022271990776062, "learning_rate": 9.973887119102957e-05, "loss": 0.8731, "step": 174 }, { "epoch": 0.22646392753154318, "grad_norm": 1.1056872606277466, "learning_rate": 9.973464619747983e-05, "loss": 0.9925, "step": 175 }, { "epoch": 0.2277580071174377, "grad_norm": 0.810420036315918, "learning_rate": 9.9730387388976e-05, "loss": 1.0073, "step": 176 }, { "epoch": 0.22905208670333224, "grad_norm": 0.9536454677581787, "learning_rate": 9.972609476841367e-05, "loss": 0.9595, "step": 177 }, { "epoch": 0.2303461662892268, "grad_norm": 0.8205066919326782, "learning_rate": 9.972176833871142e-05, "loss": 0.8146, "step": 178 }, { "epoch": 0.23164024587512133, "grad_norm": 0.9716495275497437, "learning_rate": 9.971740810281083e-05, "loss": 1.0377, "step": 179 }, { "epoch": 0.23293432546101586, "grad_norm": 0.828642725944519, "learning_rate": 9.971301406367644e-05, "loss": 0.8619, "step": 180 }, { "epoch": 0.2342284050469104, "grad_norm": 0.6980477571487427, "learning_rate": 9.970858622429579e-05, "loss": 0.8271, "step": 181 }, { "epoch": 0.23552248463280492, "grad_norm": 0.954387903213501, "learning_rate": 9.970412458767943e-05, "loss": 0.8465, "step": 182 }, { "epoch": 0.23681656421869945, "grad_norm": 0.8425692915916443, "learning_rate": 9.969962915686083e-05, "loss": 0.8893, "step": 183 }, { "epoch": 0.23811064380459399, "grad_norm": 0.8565071225166321, "learning_rate": 9.969509993489647e-05, "loss": 0.939, "step": 184 }, { "epoch": 0.23940472339048852, "grad_norm": 0.8831691145896912, "learning_rate": 9.969053692486583e-05, "loss": 0.8907, "step": 185 }, { "epoch": 0.24069880297638305, "grad_norm": 0.9661678075790405, "learning_rate": 9.96859401298713e-05, "loss": 0.9191, "step": 186 }, { "epoch": 0.24199288256227758, "grad_norm": 0.8784729838371277, "learning_rate": 9.968130955303828e-05, "loss": 1.0393, "step": 187 }, { "epoch": 0.2432869621481721, "grad_norm": 0.8830071091651917, "learning_rate": 9.967664519751515e-05, "loss": 0.9837, "step": 188 }, { "epoch": 0.24458104173406664, "grad_norm": 0.862108588218689, "learning_rate": 9.967194706647322e-05, "loss": 0.7871, "step": 189 }, { "epoch": 0.24587512131996117, "grad_norm": 1.0068063735961914, "learning_rate": 9.966721516310682e-05, "loss": 0.9526, "step": 190 }, { "epoch": 0.2471692009058557, "grad_norm": 0.9828710556030273, "learning_rate": 9.966244949063316e-05, "loss": 0.8923, "step": 191 }, { "epoch": 0.24846328049175023, "grad_norm": 1.0729883909225464, "learning_rate": 9.965765005229248e-05, "loss": 1.0115, "step": 192 }, { "epoch": 0.24975736007764476, "grad_norm": 0.9844326972961426, "learning_rate": 9.965281685134796e-05, "loss": 0.9855, "step": 193 }, { "epoch": 0.2510514396635393, "grad_norm": 1.1593172550201416, "learning_rate": 9.96479498910857e-05, "loss": 1.0912, "step": 194 }, { "epoch": 0.2523455192494338, "grad_norm": 0.8835370540618896, "learning_rate": 9.964304917481482e-05, "loss": 0.9951, "step": 195 }, { "epoch": 0.2536395988353284, "grad_norm": 0.9553850889205933, "learning_rate": 9.963811470586733e-05, "loss": 0.9335, "step": 196 }, { "epoch": 0.2549336784212229, "grad_norm": 0.863814115524292, "learning_rate": 9.963314648759823e-05, "loss": 1.0203, "step": 197 }, { "epoch": 0.25622775800711745, "grad_norm": 0.9639378786087036, "learning_rate": 9.962814452338542e-05, "loss": 1.0357, "step": 198 }, { "epoch": 0.25752183759301195, "grad_norm": 0.880519688129425, "learning_rate": 9.96231088166298e-05, "loss": 0.9964, "step": 199 }, { "epoch": 0.2588159171789065, "grad_norm": 0.8445360064506531, "learning_rate": 9.961803937075516e-05, "loss": 0.9766, "step": 200 }, { "epoch": 0.260109996764801, "grad_norm": 0.8204835057258606, "learning_rate": 9.961293618920826e-05, "loss": 0.8864, "step": 201 }, { "epoch": 0.26140407635069557, "grad_norm": 0.9315406084060669, "learning_rate": 9.960779927545883e-05, "loss": 1.0388, "step": 202 }, { "epoch": 0.2626981559365901, "grad_norm": 0.9286762475967407, "learning_rate": 9.960262863299943e-05, "loss": 0.9653, "step": 203 }, { "epoch": 0.26399223552248463, "grad_norm": 0.8037816882133484, "learning_rate": 9.959742426534566e-05, "loss": 0.765, "step": 204 }, { "epoch": 0.2652863151083792, "grad_norm": 0.9435904622077942, "learning_rate": 9.9592186176036e-05, "loss": 0.8889, "step": 205 }, { "epoch": 0.2665803946942737, "grad_norm": 1.0072762966156006, "learning_rate": 9.958691436863188e-05, "loss": 0.8358, "step": 206 }, { "epoch": 0.26787447428016825, "grad_norm": 0.9463568329811096, "learning_rate": 9.958160884671761e-05, "loss": 0.8815, "step": 207 }, { "epoch": 0.26916855386606275, "grad_norm": 0.9203188419342041, "learning_rate": 9.957626961390047e-05, "loss": 0.9312, "step": 208 }, { "epoch": 0.2704626334519573, "grad_norm": 1.0614677667617798, "learning_rate": 9.957089667381064e-05, "loss": 0.9822, "step": 209 }, { "epoch": 0.2717567130378518, "grad_norm": 0.8971818089485168, "learning_rate": 9.956549003010123e-05, "loss": 0.9421, "step": 210 }, { "epoch": 0.2730507926237464, "grad_norm": 0.9978768825531006, "learning_rate": 9.956004968644825e-05, "loss": 0.9539, "step": 211 }, { "epoch": 0.2743448722096409, "grad_norm": 0.7017274498939514, "learning_rate": 9.955457564655064e-05, "loss": 0.665, "step": 212 }, { "epoch": 0.27563895179553544, "grad_norm": 0.8292055726051331, "learning_rate": 9.954906791413023e-05, "loss": 0.922, "step": 213 }, { "epoch": 0.27693303138142994, "grad_norm": 0.978084146976471, "learning_rate": 9.954352649293178e-05, "loss": 0.9465, "step": 214 }, { "epoch": 0.2782271109673245, "grad_norm": 1.0260313749313354, "learning_rate": 9.953795138672291e-05, "loss": 0.9093, "step": 215 }, { "epoch": 0.279521190553219, "grad_norm": 1.162850022315979, "learning_rate": 9.95323425992942e-05, "loss": 1.0372, "step": 216 }, { "epoch": 0.28081527013911356, "grad_norm": 0.9785279631614685, "learning_rate": 9.952670013445913e-05, "loss": 0.8818, "step": 217 }, { "epoch": 0.28210934972500806, "grad_norm": 0.9386499524116516, "learning_rate": 9.9521023996054e-05, "loss": 0.8711, "step": 218 }, { "epoch": 0.2834034293109026, "grad_norm": 0.8620506525039673, "learning_rate": 9.951531418793812e-05, "loss": 1.011, "step": 219 }, { "epoch": 0.2846975088967972, "grad_norm": 0.9523435831069946, "learning_rate": 9.950957071399357e-05, "loss": 0.8541, "step": 220 }, { "epoch": 0.2859915884826917, "grad_norm": 0.8993477821350098, "learning_rate": 9.950379357812543e-05, "loss": 1.0253, "step": 221 }, { "epoch": 0.28728566806858624, "grad_norm": 1.073880910873413, "learning_rate": 9.949798278426158e-05, "loss": 1.115, "step": 222 }, { "epoch": 0.28857974765448074, "grad_norm": 0.7941976189613342, "learning_rate": 9.949213833635285e-05, "loss": 0.9398, "step": 223 }, { "epoch": 0.2898738272403753, "grad_norm": 0.798089325428009, "learning_rate": 9.948626023837291e-05, "loss": 0.8523, "step": 224 }, { "epoch": 0.2911679068262698, "grad_norm": 1.0251280069351196, "learning_rate": 9.948034849431831e-05, "loss": 0.939, "step": 225 }, { "epoch": 0.29246198641216437, "grad_norm": 0.9793195724487305, "learning_rate": 9.947440310820852e-05, "loss": 1.0998, "step": 226 }, { "epoch": 0.29375606599805887, "grad_norm": 0.8190125823020935, "learning_rate": 9.946842408408583e-05, "loss": 0.9606, "step": 227 }, { "epoch": 0.2950501455839534, "grad_norm": 0.8229602575302124, "learning_rate": 9.946241142601543e-05, "loss": 0.7944, "step": 228 }, { "epoch": 0.29634422516984793, "grad_norm": 0.8640865683555603, "learning_rate": 9.945636513808537e-05, "loss": 1.112, "step": 229 }, { "epoch": 0.2976383047557425, "grad_norm": 0.774501621723175, "learning_rate": 9.945028522440653e-05, "loss": 0.8986, "step": 230 }, { "epoch": 0.298932384341637, "grad_norm": 0.9039688110351562, "learning_rate": 9.944417168911275e-05, "loss": 1.0461, "step": 231 }, { "epoch": 0.30022646392753155, "grad_norm": 0.8048250675201416, "learning_rate": 9.943802453636065e-05, "loss": 0.89, "step": 232 }, { "epoch": 0.30152054351342605, "grad_norm": 0.8166521787643433, "learning_rate": 9.94318437703297e-05, "loss": 0.9149, "step": 233 }, { "epoch": 0.3028146230993206, "grad_norm": 0.7571333646774292, "learning_rate": 9.942562939522228e-05, "loss": 0.9655, "step": 234 }, { "epoch": 0.3041087026852151, "grad_norm": 0.6913223266601562, "learning_rate": 9.941938141526354e-05, "loss": 0.869, "step": 235 }, { "epoch": 0.3054027822711097, "grad_norm": 0.8476676344871521, "learning_rate": 9.94130998347016e-05, "loss": 0.8849, "step": 236 }, { "epoch": 0.30669686185700423, "grad_norm": 0.8454031944274902, "learning_rate": 9.940678465780728e-05, "loss": 0.9102, "step": 237 }, { "epoch": 0.30799094144289874, "grad_norm": 0.8514583706855774, "learning_rate": 9.940043588887438e-05, "loss": 0.9723, "step": 238 }, { "epoch": 0.3092850210287933, "grad_norm": 0.7330415844917297, "learning_rate": 9.939405353221942e-05, "loss": 0.9537, "step": 239 }, { "epoch": 0.3105791006146878, "grad_norm": 0.9652897715568542, "learning_rate": 9.938763759218185e-05, "loss": 0.9736, "step": 240 }, { "epoch": 0.31187318020058236, "grad_norm": 0.7517886161804199, "learning_rate": 9.93811880731239e-05, "loss": 0.8436, "step": 241 }, { "epoch": 0.31316725978647686, "grad_norm": 0.8159210681915283, "learning_rate": 9.937470497943064e-05, "loss": 0.7521, "step": 242 }, { "epoch": 0.3144613393723714, "grad_norm": 0.9554911851882935, "learning_rate": 9.936818831550998e-05, "loss": 1.1076, "step": 243 }, { "epoch": 0.3157554189582659, "grad_norm": 0.8745877742767334, "learning_rate": 9.936163808579266e-05, "loss": 0.8908, "step": 244 }, { "epoch": 0.3170494985441605, "grad_norm": 0.8050674200057983, "learning_rate": 9.93550542947322e-05, "loss": 1.0134, "step": 245 }, { "epoch": 0.318343578130055, "grad_norm": 1.009790062904358, "learning_rate": 9.9348436946805e-05, "loss": 1.0264, "step": 246 }, { "epoch": 0.31963765771594954, "grad_norm": 0.8702448606491089, "learning_rate": 9.934178604651023e-05, "loss": 1.0067, "step": 247 }, { "epoch": 0.32093173730184404, "grad_norm": 0.8105303049087524, "learning_rate": 9.933510159836989e-05, "loss": 0.8121, "step": 248 }, { "epoch": 0.3222258168877386, "grad_norm": 0.7680085897445679, "learning_rate": 9.932838360692878e-05, "loss": 0.8951, "step": 249 }, { "epoch": 0.3235198964736331, "grad_norm": 0.8338052034378052, "learning_rate": 9.93216320767545e-05, "loss": 0.8878, "step": 250 }, { "epoch": 0.32481397605952766, "grad_norm": 0.8709661960601807, "learning_rate": 9.93148470124375e-05, "loss": 0.8786, "step": 251 }, { "epoch": 0.32610805564542217, "grad_norm": 0.9593453407287598, "learning_rate": 9.930802841859095e-05, "loss": 1.0659, "step": 252 }, { "epoch": 0.3274021352313167, "grad_norm": 1.0130974054336548, "learning_rate": 9.93011762998509e-05, "loss": 0.9626, "step": 253 }, { "epoch": 0.3286962148172113, "grad_norm": 0.9949910640716553, "learning_rate": 9.929429066087616e-05, "loss": 1.0499, "step": 254 }, { "epoch": 0.3299902944031058, "grad_norm": 0.927542507648468, "learning_rate": 9.92873715063483e-05, "loss": 0.9571, "step": 255 }, { "epoch": 0.33128437398900035, "grad_norm": 1.0638468265533447, "learning_rate": 9.92804188409717e-05, "loss": 1.0293, "step": 256 }, { "epoch": 0.33257845357489485, "grad_norm": 0.7083877325057983, "learning_rate": 9.927343266947356e-05, "loss": 0.875, "step": 257 }, { "epoch": 0.3338725331607894, "grad_norm": 0.7915517091751099, "learning_rate": 9.92664129966038e-05, "loss": 0.8848, "step": 258 }, { "epoch": 0.3351666127466839, "grad_norm": 0.8054295182228088, "learning_rate": 9.925935982713518e-05, "loss": 0.8981, "step": 259 }, { "epoch": 0.33646069233257847, "grad_norm": 1.012574553489685, "learning_rate": 9.925227316586316e-05, "loss": 0.8119, "step": 260 }, { "epoch": 0.337754771918473, "grad_norm": 0.8329979181289673, "learning_rate": 9.924515301760606e-05, "loss": 0.8467, "step": 261 }, { "epoch": 0.33904885150436753, "grad_norm": 0.801017701625824, "learning_rate": 9.923799938720488e-05, "loss": 0.8333, "step": 262 }, { "epoch": 0.34034293109026204, "grad_norm": 0.9083892703056335, "learning_rate": 9.923081227952347e-05, "loss": 0.8727, "step": 263 }, { "epoch": 0.3416370106761566, "grad_norm": 0.7917154431343079, "learning_rate": 9.922359169944834e-05, "loss": 1.0341, "step": 264 }, { "epoch": 0.3429310902620511, "grad_norm": 0.6865798234939575, "learning_rate": 9.921633765188886e-05, "loss": 0.9117, "step": 265 }, { "epoch": 0.34422516984794566, "grad_norm": 0.9448872208595276, "learning_rate": 9.92090501417771e-05, "loss": 1.1399, "step": 266 }, { "epoch": 0.34551924943384016, "grad_norm": 0.8711137175559998, "learning_rate": 9.920172917406789e-05, "loss": 0.8158, "step": 267 }, { "epoch": 0.3468133290197347, "grad_norm": 0.7883780598640442, "learning_rate": 9.919437475373882e-05, "loss": 1.0259, "step": 268 }, { "epoch": 0.3481074086056292, "grad_norm": 0.7776859402656555, "learning_rate": 9.91869868857902e-05, "loss": 0.9145, "step": 269 }, { "epoch": 0.3494014881915238, "grad_norm": 0.8383511900901794, "learning_rate": 9.91795655752451e-05, "loss": 0.8769, "step": 270 }, { "epoch": 0.35069556777741834, "grad_norm": 0.8562968373298645, "learning_rate": 9.917211082714933e-05, "loss": 1.0078, "step": 271 }, { "epoch": 0.35198964736331284, "grad_norm": 0.810517430305481, "learning_rate": 9.916462264657142e-05, "loss": 0.8847, "step": 272 }, { "epoch": 0.3532837269492074, "grad_norm": 0.8934103846549988, "learning_rate": 9.915710103860263e-05, "loss": 0.8633, "step": 273 }, { "epoch": 0.3545778065351019, "grad_norm": 0.8963167667388916, "learning_rate": 9.914954600835699e-05, "loss": 0.8411, "step": 274 }, { "epoch": 0.35587188612099646, "grad_norm": 0.774557888507843, "learning_rate": 9.91419575609712e-05, "loss": 0.958, "step": 275 }, { "epoch": 0.35716596570689096, "grad_norm": 0.8483523726463318, "learning_rate": 9.913433570160469e-05, "loss": 0.905, "step": 276 }, { "epoch": 0.3584600452927855, "grad_norm": 0.8082010746002197, "learning_rate": 9.912668043543964e-05, "loss": 0.8633, "step": 277 }, { "epoch": 0.35975412487868, "grad_norm": 0.8654133677482605, "learning_rate": 9.911899176768091e-05, "loss": 0.8465, "step": 278 }, { "epoch": 0.3610482044645746, "grad_norm": 0.8709694743156433, "learning_rate": 9.911126970355609e-05, "loss": 0.9286, "step": 279 }, { "epoch": 0.3623422840504691, "grad_norm": 0.8783992528915405, "learning_rate": 9.910351424831546e-05, "loss": 0.9349, "step": 280 }, { "epoch": 0.36363636363636365, "grad_norm": 0.8846459984779358, "learning_rate": 9.909572540723202e-05, "loss": 0.7986, "step": 281 }, { "epoch": 0.36493044322225815, "grad_norm": 0.8271780610084534, "learning_rate": 9.908790318560146e-05, "loss": 0.8179, "step": 282 }, { "epoch": 0.3662245228081527, "grad_norm": 0.8694506883621216, "learning_rate": 9.908004758874216e-05, "loss": 0.8453, "step": 283 }, { "epoch": 0.3675186023940472, "grad_norm": 0.8625207543373108, "learning_rate": 9.90721586219952e-05, "loss": 0.7825, "step": 284 }, { "epoch": 0.36881268197994177, "grad_norm": 0.8270084261894226, "learning_rate": 9.906423629072434e-05, "loss": 1.0889, "step": 285 }, { "epoch": 0.3701067615658363, "grad_norm": 0.8891452550888062, "learning_rate": 9.905628060031605e-05, "loss": 0.8847, "step": 286 }, { "epoch": 0.37140084115173083, "grad_norm": 0.8682214021682739, "learning_rate": 9.904829155617945e-05, "loss": 0.9311, "step": 287 }, { "epoch": 0.3726949207376254, "grad_norm": 1.2218021154403687, "learning_rate": 9.904026916374636e-05, "loss": 0.92, "step": 288 }, { "epoch": 0.3739890003235199, "grad_norm": 1.0069034099578857, "learning_rate": 9.903221342847125e-05, "loss": 1.0061, "step": 289 }, { "epoch": 0.37528307990941445, "grad_norm": 0.7723405361175537, "learning_rate": 9.902412435583128e-05, "loss": 0.8627, "step": 290 }, { "epoch": 0.37657715949530896, "grad_norm": 0.8336161971092224, "learning_rate": 9.901600195132627e-05, "loss": 0.9815, "step": 291 }, { "epoch": 0.3778712390812035, "grad_norm": 0.8170490264892578, "learning_rate": 9.90078462204787e-05, "loss": 0.8305, "step": 292 }, { "epoch": 0.379165318667098, "grad_norm": 0.7367318868637085, "learning_rate": 9.899965716883372e-05, "loss": 0.8314, "step": 293 }, { "epoch": 0.3804593982529926, "grad_norm": 0.8122355937957764, "learning_rate": 9.899143480195913e-05, "loss": 0.8369, "step": 294 }, { "epoch": 0.3817534778388871, "grad_norm": 0.7415926456451416, "learning_rate": 9.898317912544536e-05, "loss": 0.8985, "step": 295 }, { "epoch": 0.38304755742478164, "grad_norm": 0.8654418587684631, "learning_rate": 9.897489014490553e-05, "loss": 0.8752, "step": 296 }, { "epoch": 0.38434163701067614, "grad_norm": 0.7863161563873291, "learning_rate": 9.896656786597535e-05, "loss": 1.0138, "step": 297 }, { "epoch": 0.3856357165965707, "grad_norm": 0.8346667885780334, "learning_rate": 9.895821229431323e-05, "loss": 0.94, "step": 298 }, { "epoch": 0.3869297961824652, "grad_norm": 0.9577547907829285, "learning_rate": 9.894982343560016e-05, "loss": 0.9967, "step": 299 }, { "epoch": 0.38822387576835976, "grad_norm": 0.7633039951324463, "learning_rate": 9.894140129553981e-05, "loss": 0.8469, "step": 300 }, { "epoch": 0.38951795535425426, "grad_norm": 0.8275448083877563, "learning_rate": 9.893294587985843e-05, "loss": 0.8295, "step": 301 }, { "epoch": 0.3908120349401488, "grad_norm": 0.9197372198104858, "learning_rate": 9.892445719430493e-05, "loss": 0.9363, "step": 302 }, { "epoch": 0.3921061145260433, "grad_norm": 0.7137726545333862, "learning_rate": 9.891593524465083e-05, "loss": 0.8207, "step": 303 }, { "epoch": 0.3934001941119379, "grad_norm": 0.9908462762832642, "learning_rate": 9.890738003669029e-05, "loss": 0.9287, "step": 304 }, { "epoch": 0.39469427369783244, "grad_norm": 1.0070343017578125, "learning_rate": 9.889879157624002e-05, "loss": 1.2092, "step": 305 }, { "epoch": 0.39598835328372695, "grad_norm": 0.7812051177024841, "learning_rate": 9.889016986913941e-05, "loss": 0.8149, "step": 306 }, { "epoch": 0.3972824328696215, "grad_norm": 0.8907694816589355, "learning_rate": 9.888151492125039e-05, "loss": 0.9008, "step": 307 }, { "epoch": 0.398576512455516, "grad_norm": 0.960164487361908, "learning_rate": 9.887282673845754e-05, "loss": 0.9437, "step": 308 }, { "epoch": 0.39987059204141057, "grad_norm": 0.8578314185142517, "learning_rate": 9.886410532666805e-05, "loss": 0.8422, "step": 309 }, { "epoch": 0.40116467162730507, "grad_norm": 0.8671669960021973, "learning_rate": 9.885535069181162e-05, "loss": 0.924, "step": 310 }, { "epoch": 0.40245875121319963, "grad_norm": 1.037150263786316, "learning_rate": 9.884656283984062e-05, "loss": 1.0074, "step": 311 }, { "epoch": 0.40375283079909413, "grad_norm": 0.7935649156570435, "learning_rate": 9.883774177672998e-05, "loss": 0.9804, "step": 312 }, { "epoch": 0.4050469103849887, "grad_norm": 0.8249583840370178, "learning_rate": 9.882888750847717e-05, "loss": 0.9627, "step": 313 }, { "epoch": 0.4063409899708832, "grad_norm": 0.7363491058349609, "learning_rate": 9.882000004110233e-05, "loss": 0.8309, "step": 314 }, { "epoch": 0.40763506955677775, "grad_norm": 0.8029589653015137, "learning_rate": 9.881107938064806e-05, "loss": 0.8762, "step": 315 }, { "epoch": 0.40892914914267225, "grad_norm": 0.7131720781326294, "learning_rate": 9.880212553317963e-05, "loss": 0.882, "step": 316 }, { "epoch": 0.4102232287285668, "grad_norm": 0.7951234579086304, "learning_rate": 9.879313850478478e-05, "loss": 1.0095, "step": 317 }, { "epoch": 0.4115173083144613, "grad_norm": 0.7714753150939941, "learning_rate": 9.87841183015739e-05, "loss": 0.8688, "step": 318 }, { "epoch": 0.4128113879003559, "grad_norm": 0.7753300666809082, "learning_rate": 9.877506492967987e-05, "loss": 0.9669, "step": 319 }, { "epoch": 0.4141054674862504, "grad_norm": 0.9767395853996277, "learning_rate": 9.876597839525814e-05, "loss": 1.1169, "step": 320 }, { "epoch": 0.41539954707214494, "grad_norm": 0.7923420667648315, "learning_rate": 9.875685870448672e-05, "loss": 0.9942, "step": 321 }, { "epoch": 0.4166936266580395, "grad_norm": 0.7265552282333374, "learning_rate": 9.874770586356616e-05, "loss": 1.0377, "step": 322 }, { "epoch": 0.417987706243934, "grad_norm": 0.7586270570755005, "learning_rate": 9.873851987871954e-05, "loss": 0.9172, "step": 323 }, { "epoch": 0.41928178582982856, "grad_norm": 0.782192587852478, "learning_rate": 9.872930075619249e-05, "loss": 0.9219, "step": 324 }, { "epoch": 0.42057586541572306, "grad_norm": 0.8508116602897644, "learning_rate": 9.872004850225313e-05, "loss": 0.939, "step": 325 }, { "epoch": 0.4218699450016176, "grad_norm": 0.8639410138130188, "learning_rate": 9.871076312319218e-05, "loss": 1.0854, "step": 326 }, { "epoch": 0.4231640245875121, "grad_norm": 0.809862494468689, "learning_rate": 9.870144462532281e-05, "loss": 0.8283, "step": 327 }, { "epoch": 0.4244581041734067, "grad_norm": 0.7755741477012634, "learning_rate": 9.869209301498072e-05, "loss": 0.7645, "step": 328 }, { "epoch": 0.4257521837593012, "grad_norm": 0.8238282203674316, "learning_rate": 9.868270829852416e-05, "loss": 0.8858, "step": 329 }, { "epoch": 0.42704626334519574, "grad_norm": 0.810691773891449, "learning_rate": 9.867329048233387e-05, "loss": 0.7913, "step": 330 }, { "epoch": 0.42834034293109025, "grad_norm": 0.8913648128509521, "learning_rate": 9.866383957281309e-05, "loss": 0.7885, "step": 331 }, { "epoch": 0.4296344225169848, "grad_norm": 0.9160618782043457, "learning_rate": 9.865435557638757e-05, "loss": 0.7296, "step": 332 }, { "epoch": 0.4309285021028793, "grad_norm": 0.7498170137405396, "learning_rate": 9.864483849950553e-05, "loss": 0.9655, "step": 333 }, { "epoch": 0.43222258168877387, "grad_norm": 0.7315449714660645, "learning_rate": 9.863528834863773e-05, "loss": 0.6886, "step": 334 }, { "epoch": 0.43351666127466837, "grad_norm": 0.8278869390487671, "learning_rate": 9.862570513027735e-05, "loss": 0.9637, "step": 335 }, { "epoch": 0.4348107408605629, "grad_norm": 0.8372804522514343, "learning_rate": 9.861608885094012e-05, "loss": 0.8609, "step": 336 }, { "epoch": 0.43610482044645743, "grad_norm": 0.8712325096130371, "learning_rate": 9.860643951716421e-05, "loss": 0.9718, "step": 337 }, { "epoch": 0.437398900032352, "grad_norm": 0.9869045615196228, "learning_rate": 9.859675713551028e-05, "loss": 0.887, "step": 338 }, { "epoch": 0.43869297961824655, "grad_norm": 0.9166460037231445, "learning_rate": 9.858704171256145e-05, "loss": 1.0751, "step": 339 }, { "epoch": 0.43998705920414105, "grad_norm": 1.1965091228485107, "learning_rate": 9.857729325492329e-05, "loss": 1.0093, "step": 340 }, { "epoch": 0.4412811387900356, "grad_norm": 0.8646867275238037, "learning_rate": 9.856751176922388e-05, "loss": 0.9235, "step": 341 }, { "epoch": 0.4425752183759301, "grad_norm": 0.7576479315757751, "learning_rate": 9.85576972621137e-05, "loss": 0.8323, "step": 342 }, { "epoch": 0.44386929796182467, "grad_norm": 0.8257366418838501, "learning_rate": 9.854784974026572e-05, "loss": 0.8478, "step": 343 }, { "epoch": 0.4451633775477192, "grad_norm": 0.8963577747344971, "learning_rate": 9.853796921037534e-05, "loss": 0.8943, "step": 344 }, { "epoch": 0.44645745713361373, "grad_norm": 0.8696343898773193, "learning_rate": 9.85280556791604e-05, "loss": 0.991, "step": 345 }, { "epoch": 0.44775153671950824, "grad_norm": 0.8321841359138489, "learning_rate": 9.851810915336119e-05, "loss": 1.0462, "step": 346 }, { "epoch": 0.4490456163054028, "grad_norm": 0.8194176554679871, "learning_rate": 9.850812963974042e-05, "loss": 0.9484, "step": 347 }, { "epoch": 0.4503396958912973, "grad_norm": 0.9131197333335876, "learning_rate": 9.849811714508323e-05, "loss": 0.9687, "step": 348 }, { "epoch": 0.45163377547719186, "grad_norm": 0.891231894493103, "learning_rate": 9.848807167619721e-05, "loss": 0.9605, "step": 349 }, { "epoch": 0.45292785506308636, "grad_norm": 0.9574115872383118, "learning_rate": 9.847799323991234e-05, "loss": 1.077, "step": 350 }, { "epoch": 0.4542219346489809, "grad_norm": 0.9179444909095764, "learning_rate": 9.8467881843081e-05, "loss": 0.8353, "step": 351 }, { "epoch": 0.4555160142348754, "grad_norm": 0.855802595615387, "learning_rate": 9.845773749257804e-05, "loss": 0.9764, "step": 352 }, { "epoch": 0.45681009382077, "grad_norm": 0.9086332321166992, "learning_rate": 9.844756019530066e-05, "loss": 1.0526, "step": 353 }, { "epoch": 0.4581041734066645, "grad_norm": 0.890271782875061, "learning_rate": 9.843734995816848e-05, "loss": 0.8905, "step": 354 }, { "epoch": 0.45939825299255904, "grad_norm": 0.7878096699714661, "learning_rate": 9.842710678812351e-05, "loss": 0.8706, "step": 355 }, { "epoch": 0.4606923325784536, "grad_norm": 0.9886014461517334, "learning_rate": 9.841683069213017e-05, "loss": 0.9579, "step": 356 }, { "epoch": 0.4619864121643481, "grad_norm": 0.8265432119369507, "learning_rate": 9.840652167717526e-05, "loss": 0.8528, "step": 357 }, { "epoch": 0.46328049175024266, "grad_norm": 0.7354372143745422, "learning_rate": 9.839617975026793e-05, "loss": 0.775, "step": 358 }, { "epoch": 0.46457457133613717, "grad_norm": 0.8311409950256348, "learning_rate": 9.838580491843976e-05, "loss": 1.0374, "step": 359 }, { "epoch": 0.4658686509220317, "grad_norm": 0.8180521130561829, "learning_rate": 9.837539718874464e-05, "loss": 0.818, "step": 360 }, { "epoch": 0.4671627305079262, "grad_norm": 0.7231691479682922, "learning_rate": 9.83649565682589e-05, "loss": 0.9141, "step": 361 }, { "epoch": 0.4684568100938208, "grad_norm": 0.7644848227500916, "learning_rate": 9.835448306408118e-05, "loss": 0.9565, "step": 362 }, { "epoch": 0.4697508896797153, "grad_norm": 0.7103495001792908, "learning_rate": 9.83439766833325e-05, "loss": 0.7706, "step": 363 }, { "epoch": 0.47104496926560985, "grad_norm": 0.8206077218055725, "learning_rate": 9.83334374331562e-05, "loss": 1.0923, "step": 364 }, { "epoch": 0.47233904885150435, "grad_norm": 0.7409754991531372, "learning_rate": 9.832286532071802e-05, "loss": 0.8534, "step": 365 }, { "epoch": 0.4736331284373989, "grad_norm": 0.8880231380462646, "learning_rate": 9.831226035320602e-05, "loss": 0.9025, "step": 366 }, { "epoch": 0.4749272080232934, "grad_norm": 0.9805209636688232, "learning_rate": 9.830162253783058e-05, "loss": 0.9939, "step": 367 }, { "epoch": 0.47622128760918797, "grad_norm": 0.9529579281806946, "learning_rate": 9.829095188182442e-05, "loss": 0.9645, "step": 368 }, { "epoch": 0.4775153671950825, "grad_norm": 0.8122926354408264, "learning_rate": 9.828024839244263e-05, "loss": 0.9269, "step": 369 }, { "epoch": 0.47880944678097703, "grad_norm": 0.8557335734367371, "learning_rate": 9.826951207696258e-05, "loss": 0.8781, "step": 370 }, { "epoch": 0.48010352636687154, "grad_norm": 0.959077775478363, "learning_rate": 9.825874294268396e-05, "loss": 0.9448, "step": 371 }, { "epoch": 0.4813976059527661, "grad_norm": 0.7479959726333618, "learning_rate": 9.824794099692878e-05, "loss": 0.8608, "step": 372 }, { "epoch": 0.48269168553866065, "grad_norm": 0.8064723014831543, "learning_rate": 9.823710624704137e-05, "loss": 1.1222, "step": 373 }, { "epoch": 0.48398576512455516, "grad_norm": 0.8007084131240845, "learning_rate": 9.822623870038838e-05, "loss": 0.8967, "step": 374 }, { "epoch": 0.4852798447104497, "grad_norm": 0.988571286201477, "learning_rate": 9.82153383643587e-05, "loss": 0.9574, "step": 375 }, { "epoch": 0.4865739242963442, "grad_norm": 0.9317258596420288, "learning_rate": 9.820440524636356e-05, "loss": 0.929, "step": 376 }, { "epoch": 0.4878680038822388, "grad_norm": 0.8651334643363953, "learning_rate": 9.819343935383649e-05, "loss": 0.9483, "step": 377 }, { "epoch": 0.4891620834681333, "grad_norm": 0.791306734085083, "learning_rate": 9.818244069423325e-05, "loss": 0.9277, "step": 378 }, { "epoch": 0.49045616305402784, "grad_norm": 0.6706424355506897, "learning_rate": 9.817140927503192e-05, "loss": 0.9013, "step": 379 }, { "epoch": 0.49175024263992234, "grad_norm": 0.8574838042259216, "learning_rate": 9.816034510373286e-05, "loss": 0.9889, "step": 380 }, { "epoch": 0.4930443222258169, "grad_norm": 0.829309344291687, "learning_rate": 9.814924818785865e-05, "loss": 0.8139, "step": 381 }, { "epoch": 0.4943384018117114, "grad_norm": 0.83943110704422, "learning_rate": 9.81381185349542e-05, "loss": 0.9202, "step": 382 }, { "epoch": 0.49563248139760596, "grad_norm": 0.7981933355331421, "learning_rate": 9.812695615258662e-05, "loss": 0.9131, "step": 383 }, { "epoch": 0.49692656098350046, "grad_norm": 0.7930905818939209, "learning_rate": 9.81157610483453e-05, "loss": 0.769, "step": 384 }, { "epoch": 0.498220640569395, "grad_norm": 0.8699679970741272, "learning_rate": 9.81045332298419e-05, "loss": 0.9468, "step": 385 }, { "epoch": 0.4995147201552895, "grad_norm": 0.7733138799667358, "learning_rate": 9.809327270471025e-05, "loss": 0.8982, "step": 386 }, { "epoch": 0.4995147201552895, "eval_loss": 0.8872498869895935, "eval_runtime": 189.8035, "eval_samples_per_second": 3.43, "eval_steps_per_second": 0.859, "step": 386 }, { "epoch": 0.5008087997411841, "grad_norm": 0.9202268123626709, "learning_rate": 9.808197948060651e-05, "loss": 0.9664, "step": 387 }, { "epoch": 0.5021028793270786, "grad_norm": 0.7772640585899353, "learning_rate": 9.807065356520899e-05, "loss": 0.8825, "step": 388 }, { "epoch": 0.5033969589129732, "grad_norm": 0.8117741346359253, "learning_rate": 9.805929496621828e-05, "loss": 0.8166, "step": 389 }, { "epoch": 0.5046910384988677, "grad_norm": 0.7308422327041626, "learning_rate": 9.804790369135718e-05, "loss": 0.8657, "step": 390 }, { "epoch": 0.5059851180847622, "grad_norm": 0.8373358845710754, "learning_rate": 9.80364797483707e-05, "loss": 0.9175, "step": 391 }, { "epoch": 0.5072791976706568, "grad_norm": 0.8288902640342712, "learning_rate": 9.802502314502607e-05, "loss": 0.7463, "step": 392 }, { "epoch": 0.5085732772565513, "grad_norm": 0.6780114769935608, "learning_rate": 9.801353388911269e-05, "loss": 0.7973, "step": 393 }, { "epoch": 0.5098673568424458, "grad_norm": 0.9328367710113525, "learning_rate": 9.800201198844221e-05, "loss": 1.0405, "step": 394 }, { "epoch": 0.5111614364283403, "grad_norm": 0.9010327458381653, "learning_rate": 9.799045745084847e-05, "loss": 1.1194, "step": 395 }, { "epoch": 0.5124555160142349, "grad_norm": 0.7900609374046326, "learning_rate": 9.797887028418746e-05, "loss": 0.9747, "step": 396 }, { "epoch": 0.5137495956001294, "grad_norm": 0.8245125412940979, "learning_rate": 9.796725049633741e-05, "loss": 0.8594, "step": 397 }, { "epoch": 0.5150436751860239, "grad_norm": 0.7617928385734558, "learning_rate": 9.795559809519866e-05, "loss": 0.8176, "step": 398 }, { "epoch": 0.5163377547719185, "grad_norm": 0.9844444990158081, "learning_rate": 9.79439130886938e-05, "loss": 1.0116, "step": 399 }, { "epoch": 0.517631834357813, "grad_norm": 0.8849684000015259, "learning_rate": 9.793219548476753e-05, "loss": 0.8706, "step": 400 }, { "epoch": 0.5189259139437076, "grad_norm": 0.893902063369751, "learning_rate": 9.792044529138674e-05, "loss": 0.8217, "step": 401 }, { "epoch": 0.520219993529602, "grad_norm": 0.7412934899330139, "learning_rate": 9.79086625165405e-05, "loss": 0.868, "step": 402 }, { "epoch": 0.5215140731154966, "grad_norm": 0.796435534954071, "learning_rate": 9.789684716823995e-05, "loss": 0.8691, "step": 403 }, { "epoch": 0.5228081527013911, "grad_norm": 1.063193440437317, "learning_rate": 9.788499925451849e-05, "loss": 1.0085, "step": 404 }, { "epoch": 0.5241022322872857, "grad_norm": 0.952882707118988, "learning_rate": 9.787311878343157e-05, "loss": 0.8378, "step": 405 }, { "epoch": 0.5253963118731803, "grad_norm": 0.7899916768074036, "learning_rate": 9.786120576305682e-05, "loss": 0.8917, "step": 406 }, { "epoch": 0.5266903914590747, "grad_norm": 0.8242781758308411, "learning_rate": 9.784926020149398e-05, "loss": 0.9778, "step": 407 }, { "epoch": 0.5279844710449693, "grad_norm": 0.9736928343772888, "learning_rate": 9.783728210686496e-05, "loss": 1.0145, "step": 408 }, { "epoch": 0.5292785506308638, "grad_norm": 0.8070263862609863, "learning_rate": 9.782527148731372e-05, "loss": 0.9923, "step": 409 }, { "epoch": 0.5305726302167584, "grad_norm": 0.861262857913971, "learning_rate": 9.781322835100638e-05, "loss": 1.0261, "step": 410 }, { "epoch": 0.5318667098026528, "grad_norm": 0.8204033970832825, "learning_rate": 9.780115270613115e-05, "loss": 0.8865, "step": 411 }, { "epoch": 0.5331607893885474, "grad_norm": 0.8053429126739502, "learning_rate": 9.778904456089838e-05, "loss": 0.8606, "step": 412 }, { "epoch": 0.5344548689744419, "grad_norm": 0.7944838404655457, "learning_rate": 9.777690392354045e-05, "loss": 0.8925, "step": 413 }, { "epoch": 0.5357489485603365, "grad_norm": 0.9882616400718689, "learning_rate": 9.77647308023119e-05, "loss": 0.8577, "step": 414 }, { "epoch": 0.537043028146231, "grad_norm": 0.6937118768692017, "learning_rate": 9.77525252054893e-05, "loss": 0.6972, "step": 415 }, { "epoch": 0.5383371077321255, "grad_norm": 1.0071479082107544, "learning_rate": 9.774028714137133e-05, "loss": 0.9233, "step": 416 }, { "epoch": 0.5396311873180201, "grad_norm": 0.916771411895752, "learning_rate": 9.772801661827874e-05, "loss": 1.0072, "step": 417 }, { "epoch": 0.5409252669039146, "grad_norm": 0.8663278818130493, "learning_rate": 9.771571364455439e-05, "loss": 1.1011, "step": 418 }, { "epoch": 0.5422193464898091, "grad_norm": 0.7842355370521545, "learning_rate": 9.77033782285631e-05, "loss": 1.0382, "step": 419 }, { "epoch": 0.5435134260757036, "grad_norm": 0.8407487273216248, "learning_rate": 9.769101037869187e-05, "loss": 0.9612, "step": 420 }, { "epoch": 0.5448075056615982, "grad_norm": 0.7557950615882874, "learning_rate": 9.767861010334962e-05, "loss": 0.9965, "step": 421 }, { "epoch": 0.5461015852474927, "grad_norm": 0.812099814414978, "learning_rate": 9.766617741096746e-05, "loss": 0.8824, "step": 422 }, { "epoch": 0.5473956648333873, "grad_norm": 0.8438544273376465, "learning_rate": 9.765371230999843e-05, "loss": 0.8852, "step": 423 }, { "epoch": 0.5486897444192818, "grad_norm": 0.7006101608276367, "learning_rate": 9.764121480891765e-05, "loss": 0.808, "step": 424 }, { "epoch": 0.5499838240051763, "grad_norm": 0.7312132716178894, "learning_rate": 9.76286849162223e-05, "loss": 0.7964, "step": 425 }, { "epoch": 0.5512779035910709, "grad_norm": 0.939939558506012, "learning_rate": 9.76161226404315e-05, "loss": 0.9434, "step": 426 }, { "epoch": 0.5525719831769654, "grad_norm": 0.7815152406692505, "learning_rate": 9.760352799008643e-05, "loss": 1.0487, "step": 427 }, { "epoch": 0.5538660627628599, "grad_norm": 0.8934217691421509, "learning_rate": 9.759090097375032e-05, "loss": 1.0036, "step": 428 }, { "epoch": 0.5551601423487544, "grad_norm": 0.7264319658279419, "learning_rate": 9.757824160000837e-05, "loss": 0.7866, "step": 429 }, { "epoch": 0.556454221934649, "grad_norm": 0.8536062836647034, "learning_rate": 9.756554987746776e-05, "loss": 0.9949, "step": 430 }, { "epoch": 0.5577483015205436, "grad_norm": 0.8253523111343384, "learning_rate": 9.755282581475769e-05, "loss": 0.9801, "step": 431 }, { "epoch": 0.559042381106438, "grad_norm": 0.6686350703239441, "learning_rate": 9.754006942052936e-05, "loss": 0.8388, "step": 432 }, { "epoch": 0.5603364606923326, "grad_norm": 0.8560882210731506, "learning_rate": 9.752728070345591e-05, "loss": 1.0167, "step": 433 }, { "epoch": 0.5616305402782271, "grad_norm": 0.765852153301239, "learning_rate": 9.751445967223252e-05, "loss": 0.7865, "step": 434 }, { "epoch": 0.5629246198641217, "grad_norm": 0.8869863152503967, "learning_rate": 9.750160633557627e-05, "loss": 0.9424, "step": 435 }, { "epoch": 0.5642186994500161, "grad_norm": 0.8522469401359558, "learning_rate": 9.748872070222625e-05, "loss": 0.8151, "step": 436 }, { "epoch": 0.5655127790359107, "grad_norm": 0.862091064453125, "learning_rate": 9.747580278094352e-05, "loss": 1.0131, "step": 437 }, { "epoch": 0.5668068586218052, "grad_norm": 0.8809049725532532, "learning_rate": 9.746285258051104e-05, "loss": 0.9092, "step": 438 }, { "epoch": 0.5681009382076998, "grad_norm": 0.8506961464881897, "learning_rate": 9.744987010973377e-05, "loss": 0.8756, "step": 439 }, { "epoch": 0.5693950177935944, "grad_norm": 0.7440844774246216, "learning_rate": 9.743685537743856e-05, "loss": 0.7948, "step": 440 }, { "epoch": 0.5706890973794888, "grad_norm": 0.8117014169692993, "learning_rate": 9.742380839247425e-05, "loss": 0.9382, "step": 441 }, { "epoch": 0.5719831769653834, "grad_norm": 0.8515580296516418, "learning_rate": 9.741072916371157e-05, "loss": 0.9202, "step": 442 }, { "epoch": 0.5732772565512779, "grad_norm": 0.8774899244308472, "learning_rate": 9.739761770004318e-05, "loss": 0.9129, "step": 443 }, { "epoch": 0.5745713361371725, "grad_norm": 0.7189667820930481, "learning_rate": 9.738447401038367e-05, "loss": 0.7392, "step": 444 }, { "epoch": 0.5758654157230669, "grad_norm": 0.7225956916809082, "learning_rate": 9.737129810366952e-05, "loss": 0.8703, "step": 445 }, { "epoch": 0.5771594953089615, "grad_norm": 0.8351484537124634, "learning_rate": 9.735808998885915e-05, "loss": 0.9626, "step": 446 }, { "epoch": 0.578453574894856, "grad_norm": 0.860819399356842, "learning_rate": 9.734484967493282e-05, "loss": 0.9061, "step": 447 }, { "epoch": 0.5797476544807506, "grad_norm": 0.7928848266601562, "learning_rate": 9.733157717089277e-05, "loss": 0.9141, "step": 448 }, { "epoch": 0.581041734066645, "grad_norm": 0.9132540822029114, "learning_rate": 9.7318272485763e-05, "loss": 0.7613, "step": 449 }, { "epoch": 0.5823358136525396, "grad_norm": 0.7894258499145508, "learning_rate": 9.730493562858953e-05, "loss": 0.8234, "step": 450 } ], "logging_steps": 1, "max_steps": 3860, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.030717272621056e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }