{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 179, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00558659217877095, "grad_norm": 3.744102716445923, "learning_rate": 1.0000000000000002e-06, "loss": 44.5125, "step": 1 }, { "epoch": 0.0111731843575419, "grad_norm": 4.090838432312012, "learning_rate": 2.0000000000000003e-06, "loss": 44.5456, "step": 2 }, { "epoch": 0.01675977653631285, "grad_norm": 4.943282127380371, "learning_rate": 3e-06, "loss": 44.5655, "step": 3 }, { "epoch": 0.0223463687150838, "grad_norm": 5.5685224533081055, "learning_rate": 4.000000000000001e-06, "loss": 44.5459, "step": 4 }, { "epoch": 0.027932960893854747, "grad_norm": 6.192835807800293, "learning_rate": 5e-06, "loss": 44.5324, "step": 5 }, { "epoch": 0.0335195530726257, "grad_norm": 6.504510879516602, "learning_rate": 6e-06, "loss": 44.5719, "step": 6 }, { "epoch": 0.03910614525139665, "grad_norm": 7.036836624145508, "learning_rate": 7.000000000000001e-06, "loss": 44.5859, "step": 7 }, { "epoch": 0.0446927374301676, "grad_norm": 6.947369575500488, "learning_rate": 8.000000000000001e-06, "loss": 44.547, "step": 8 }, { "epoch": 0.05027932960893855, "grad_norm": 6.922020435333252, "learning_rate": 9e-06, "loss": 44.509, "step": 9 }, { "epoch": 0.055865921787709494, "grad_norm": 7.10311222076416, "learning_rate": 1e-05, "loss": 44.4877, "step": 10 }, { "epoch": 0.061452513966480445, "grad_norm": 7.27517032623291, "learning_rate": 1.1000000000000001e-05, "loss": 44.5195, "step": 11 }, { "epoch": 0.0670391061452514, "grad_norm": 7.023418426513672, "learning_rate": 1.2e-05, "loss": 44.4954, "step": 12 }, { "epoch": 0.07262569832402235, "grad_norm": 7.948561191558838, "learning_rate": 1.3000000000000001e-05, "loss": 44.4922, "step": 13 }, { "epoch": 0.0782122905027933, "grad_norm": 7.6504926681518555, "learning_rate": 1.4000000000000001e-05, "loss": 44.459, "step": 14 }, { "epoch": 0.08379888268156424, "grad_norm": 8.089448928833008, "learning_rate": 1.5e-05, "loss": 44.4743, "step": 15 }, { "epoch": 0.0893854748603352, "grad_norm": 7.560296058654785, "learning_rate": 1.6000000000000003e-05, "loss": 44.5191, "step": 16 }, { "epoch": 0.09497206703910614, "grad_norm": 7.827347755432129, "learning_rate": 1.7000000000000003e-05, "loss": 44.4517, "step": 17 }, { "epoch": 0.1005586592178771, "grad_norm": 8.726825714111328, "learning_rate": 1.8e-05, "loss": 44.484, "step": 18 }, { "epoch": 0.10614525139664804, "grad_norm": 7.8799943923950195, "learning_rate": 1.9e-05, "loss": 44.4365, "step": 19 }, { "epoch": 0.11173184357541899, "grad_norm": 7.783633232116699, "learning_rate": 2e-05, "loss": 44.432, "step": 20 }, { "epoch": 0.11731843575418995, "grad_norm": 8.265217781066895, "learning_rate": 2.1e-05, "loss": 44.3541, "step": 21 }, { "epoch": 0.12290502793296089, "grad_norm": 8.179393768310547, "learning_rate": 2.2000000000000003e-05, "loss": 44.4657, "step": 22 }, { "epoch": 0.12849162011173185, "grad_norm": 8.401519775390625, "learning_rate": 2.3000000000000003e-05, "loss": 44.3664, "step": 23 }, { "epoch": 0.1340782122905028, "grad_norm": 8.343657493591309, "learning_rate": 2.4e-05, "loss": 44.356, "step": 24 }, { "epoch": 0.13966480446927373, "grad_norm": 8.581681251525879, "learning_rate": 2.5e-05, "loss": 44.3288, "step": 25 }, { "epoch": 0.1452513966480447, "grad_norm": 8.727663040161133, "learning_rate": 2.6000000000000002e-05, "loss": 44.3571, "step": 26 }, { "epoch": 0.15083798882681565, "grad_norm": 8.828510284423828, "learning_rate": 2.7000000000000002e-05, "loss": 44.2998, "step": 27 }, { "epoch": 0.1564245810055866, "grad_norm": 8.784181594848633, "learning_rate": 2.8000000000000003e-05, "loss": 44.2794, "step": 28 }, { "epoch": 0.16201117318435754, "grad_norm": 8.831645965576172, "learning_rate": 2.9e-05, "loss": 44.2463, "step": 29 }, { "epoch": 0.16759776536312848, "grad_norm": 8.893630981445312, "learning_rate": 3e-05, "loss": 44.2231, "step": 30 }, { "epoch": 0.17318435754189945, "grad_norm": 9.3904390335083, "learning_rate": 3.1e-05, "loss": 44.1382, "step": 31 }, { "epoch": 0.1787709497206704, "grad_norm": 8.971839904785156, "learning_rate": 3.2000000000000005e-05, "loss": 44.12, "step": 32 }, { "epoch": 0.18435754189944134, "grad_norm": 9.59831428527832, "learning_rate": 3.3e-05, "loss": 44.114, "step": 33 }, { "epoch": 0.18994413407821228, "grad_norm": 9.502549171447754, "learning_rate": 3.4000000000000007e-05, "loss": 44.0284, "step": 34 }, { "epoch": 0.19553072625698323, "grad_norm": 9.720560073852539, "learning_rate": 3.5e-05, "loss": 43.9652, "step": 35 }, { "epoch": 0.2011173184357542, "grad_norm": 10.408364295959473, "learning_rate": 3.6e-05, "loss": 43.9892, "step": 36 }, { "epoch": 0.20670391061452514, "grad_norm": 9.865303993225098, "learning_rate": 3.7e-05, "loss": 43.9092, "step": 37 }, { "epoch": 0.2122905027932961, "grad_norm": 9.902585983276367, "learning_rate": 3.8e-05, "loss": 43.8354, "step": 38 }, { "epoch": 0.21787709497206703, "grad_norm": 9.79003620147705, "learning_rate": 3.9000000000000006e-05, "loss": 43.7437, "step": 39 }, { "epoch": 0.22346368715083798, "grad_norm": 10.79495906829834, "learning_rate": 4e-05, "loss": 43.7073, "step": 40 }, { "epoch": 0.22905027932960895, "grad_norm": 10.677367210388184, "learning_rate": 4.1e-05, "loss": 43.6095, "step": 41 }, { "epoch": 0.2346368715083799, "grad_norm": 11.262158393859863, "learning_rate": 4.2e-05, "loss": 43.5185, "step": 42 }, { "epoch": 0.24022346368715083, "grad_norm": 11.903176307678223, "learning_rate": 4.3e-05, "loss": 43.3865, "step": 43 }, { "epoch": 0.24581005586592178, "grad_norm": 11.95860481262207, "learning_rate": 4.4000000000000006e-05, "loss": 43.3, "step": 44 }, { "epoch": 0.25139664804469275, "grad_norm": 5.72437047958374, "learning_rate": 4.5e-05, "loss": 44.2749, "step": 45 }, { "epoch": 0.2569832402234637, "grad_norm": 4.2836833000183105, "learning_rate": 4.600000000000001e-05, "loss": 44.1268, "step": 46 }, { "epoch": 0.26256983240223464, "grad_norm": 5.577621936798096, "learning_rate": 4.7e-05, "loss": 43.9228, "step": 47 }, { "epoch": 0.2681564245810056, "grad_norm": 5.966395854949951, "learning_rate": 4.8e-05, "loss": 43.7729, "step": 48 }, { "epoch": 0.2737430167597765, "grad_norm": 6.50309419631958, "learning_rate": 4.9e-05, "loss": 43.671, "step": 49 }, { "epoch": 0.27932960893854747, "grad_norm": 6.988935947418213, "learning_rate": 5e-05, "loss": 43.5641, "step": 50 }, { "epoch": 0.2849162011173184, "grad_norm": 7.089043140411377, "learning_rate": 5.1000000000000006e-05, "loss": 43.5187, "step": 51 }, { "epoch": 0.2905027932960894, "grad_norm": 7.402230739593506, "learning_rate": 5.2000000000000004e-05, "loss": 43.4116, "step": 52 }, { "epoch": 0.29608938547486036, "grad_norm": 7.716123580932617, "learning_rate": 5.300000000000001e-05, "loss": 43.2982, "step": 53 }, { "epoch": 0.3016759776536313, "grad_norm": 7.589552879333496, "learning_rate": 5.4000000000000005e-05, "loss": 43.2493, "step": 54 }, { "epoch": 0.30726256983240224, "grad_norm": 8.715299606323242, "learning_rate": 5.500000000000001e-05, "loss": 43.0231, "step": 55 }, { "epoch": 0.3128491620111732, "grad_norm": 8.259060859680176, "learning_rate": 5.6000000000000006e-05, "loss": 42.9955, "step": 56 }, { "epoch": 0.31843575418994413, "grad_norm": 7.728129863739014, "learning_rate": 5.6999999999999996e-05, "loss": 42.9847, "step": 57 }, { "epoch": 0.3240223463687151, "grad_norm": 8.82850170135498, "learning_rate": 5.8e-05, "loss": 42.7967, "step": 58 }, { "epoch": 0.329608938547486, "grad_norm": 7.893709659576416, "learning_rate": 5.9e-05, "loss": 42.8577, "step": 59 }, { "epoch": 0.33519553072625696, "grad_norm": 8.985579490661621, "learning_rate": 6e-05, "loss": 42.5203, "step": 60 }, { "epoch": 0.3407821229050279, "grad_norm": 8.47461223602295, "learning_rate": 6.1e-05, "loss": 42.5511, "step": 61 }, { "epoch": 0.3463687150837989, "grad_norm": 8.799412727355957, "learning_rate": 6.2e-05, "loss": 42.4044, "step": 62 }, { "epoch": 0.35195530726256985, "grad_norm": 8.567609786987305, "learning_rate": 6.3e-05, "loss": 42.4246, "step": 63 }, { "epoch": 0.3575418994413408, "grad_norm": 8.90191650390625, "learning_rate": 6.400000000000001e-05, "loss": 42.2267, "step": 64 }, { "epoch": 0.36312849162011174, "grad_norm": 8.651405334472656, "learning_rate": 6.500000000000001e-05, "loss": 42.2001, "step": 65 }, { "epoch": 0.3687150837988827, "grad_norm": 8.63864517211914, "learning_rate": 6.6e-05, "loss": 42.1931, "step": 66 }, { "epoch": 0.3743016759776536, "grad_norm": 9.15833854675293, "learning_rate": 6.7e-05, "loss": 41.9268, "step": 67 }, { "epoch": 0.37988826815642457, "grad_norm": 8.82404613494873, "learning_rate": 6.800000000000001e-05, "loss": 41.9039, "step": 68 }, { "epoch": 0.3854748603351955, "grad_norm": 9.213386535644531, "learning_rate": 6.9e-05, "loss": 41.7373, "step": 69 }, { "epoch": 0.39106145251396646, "grad_norm": 9.157903671264648, "learning_rate": 7e-05, "loss": 41.6397, "step": 70 }, { "epoch": 0.39664804469273746, "grad_norm": 9.073963165283203, "learning_rate": 7.1e-05, "loss": 41.5592, "step": 71 }, { "epoch": 0.4022346368715084, "grad_norm": 9.316044807434082, "learning_rate": 7.2e-05, "loss": 41.4418, "step": 72 }, { "epoch": 0.40782122905027934, "grad_norm": 9.458603858947754, "learning_rate": 7.3e-05, "loss": 41.2307, "step": 73 }, { "epoch": 0.4134078212290503, "grad_norm": 9.475081443786621, "learning_rate": 7.4e-05, "loss": 41.1623, "step": 74 }, { "epoch": 0.41899441340782123, "grad_norm": 9.649319648742676, "learning_rate": 7.500000000000001e-05, "loss": 41.0343, "step": 75 }, { "epoch": 0.4245810055865922, "grad_norm": 9.324378967285156, "learning_rate": 7.6e-05, "loss": 41.0046, "step": 76 }, { "epoch": 0.4301675977653631, "grad_norm": 9.610836029052734, "learning_rate": 7.7e-05, "loss": 40.7329, "step": 77 }, { "epoch": 0.43575418994413406, "grad_norm": 9.787153244018555, "learning_rate": 7.800000000000001e-05, "loss": 40.5896, "step": 78 }, { "epoch": 0.441340782122905, "grad_norm": 9.9530668258667, "learning_rate": 7.900000000000001e-05, "loss": 40.448, "step": 79 }, { "epoch": 0.44692737430167595, "grad_norm": 9.689455032348633, "learning_rate": 8e-05, "loss": 40.5078, "step": 80 }, { "epoch": 0.45251396648044695, "grad_norm": 9.322986602783203, "learning_rate": 8.1e-05, "loss": 40.4844, "step": 81 }, { "epoch": 0.4581005586592179, "grad_norm": 9.782176971435547, "learning_rate": 8.2e-05, "loss": 40.2081, "step": 82 }, { "epoch": 0.46368715083798884, "grad_norm": 9.894826889038086, "learning_rate": 8.3e-05, "loss": 40.0162, "step": 83 }, { "epoch": 0.4692737430167598, "grad_norm": 10.18328857421875, "learning_rate": 8.4e-05, "loss": 39.9261, "step": 84 }, { "epoch": 0.4748603351955307, "grad_norm": 10.649673461914062, "learning_rate": 8.5e-05, "loss": 39.692, "step": 85 }, { "epoch": 0.48044692737430167, "grad_norm": 10.699885368347168, "learning_rate": 8.6e-05, "loss": 39.5022, "step": 86 }, { "epoch": 0.4860335195530726, "grad_norm": 11.469558715820312, "learning_rate": 8.7e-05, "loss": 39.1245, "step": 87 }, { "epoch": 0.49162011173184356, "grad_norm": 10.971457481384277, "learning_rate": 8.800000000000001e-05, "loss": 39.1588, "step": 88 }, { "epoch": 0.4972067039106145, "grad_norm": 7.072690010070801, "learning_rate": 8.900000000000001e-05, "loss": 42.9127, "step": 89 }, { "epoch": 0.5027932960893855, "grad_norm": 5.59442663192749, "learning_rate": 9e-05, "loss": 42.0852, "step": 90 }, { "epoch": 0.5083798882681564, "grad_norm": 6.485333442687988, "learning_rate": 9.1e-05, "loss": 41.2355, "step": 91 }, { "epoch": 0.5139664804469274, "grad_norm": 7.4060797691345215, "learning_rate": 9.200000000000001e-05, "loss": 40.6051, "step": 92 }, { "epoch": 0.5195530726256983, "grad_norm": 7.867504596710205, "learning_rate": 9.300000000000001e-05, "loss": 40.2738, "step": 93 }, { "epoch": 0.5251396648044693, "grad_norm": 7.020578384399414, "learning_rate": 9.4e-05, "loss": 40.4952, "step": 94 }, { "epoch": 0.5307262569832403, "grad_norm": 7.6585516929626465, "learning_rate": 9.5e-05, "loss": 40.097, "step": 95 }, { "epoch": 0.5363128491620112, "grad_norm": 8.454778671264648, "learning_rate": 9.6e-05, "loss": 39.7017, "step": 96 }, { "epoch": 0.5418994413407822, "grad_norm": 8.670785903930664, "learning_rate": 9.7e-05, "loss": 39.4651, "step": 97 }, { "epoch": 0.547486033519553, "grad_norm": 8.416338920593262, "learning_rate": 9.8e-05, "loss": 39.519, "step": 98 }, { "epoch": 0.553072625698324, "grad_norm": 8.337173461914062, "learning_rate": 9.900000000000001e-05, "loss": 39.4007, "step": 99 }, { "epoch": 0.5586592178770949, "grad_norm": 8.42642879486084, "learning_rate": 0.0001, "loss": 39.1839, "step": 100 }, { "epoch": 0.5642458100558659, "grad_norm": 8.729036331176758, "learning_rate": 9.996046986136509e-05, "loss": 38.9884, "step": 101 }, { "epoch": 0.5698324022346368, "grad_norm": 8.876291275024414, "learning_rate": 9.98419419507348e-05, "loss": 38.7318, "step": 102 }, { "epoch": 0.5754189944134078, "grad_norm": 8.565130233764648, "learning_rate": 9.964460368509867e-05, "loss": 38.7833, "step": 103 }, { "epoch": 0.5810055865921788, "grad_norm": 8.649067878723145, "learning_rate": 9.936876709681668e-05, "loss": 38.6301, "step": 104 }, { "epoch": 0.5865921787709497, "grad_norm": 8.931990623474121, "learning_rate": 9.901486834023182e-05, "loss": 38.4478, "step": 105 }, { "epoch": 0.5921787709497207, "grad_norm": 9.383662223815918, "learning_rate": 9.85834670020205e-05, "loss": 38.209, "step": 106 }, { "epoch": 0.5977653631284916, "grad_norm": 8.624167442321777, "learning_rate": 9.807524521637102e-05, "loss": 38.3029, "step": 107 }, { "epoch": 0.6033519553072626, "grad_norm": 8.637563705444336, "learning_rate": 9.749100658638914e-05, "loss": 38.1992, "step": 108 }, { "epoch": 0.6089385474860335, "grad_norm": 9.110807418823242, "learning_rate": 9.68316749134364e-05, "loss": 37.9357, "step": 109 }, { "epoch": 0.6145251396648045, "grad_norm": 8.739825248718262, "learning_rate": 9.609829273641034e-05, "loss": 37.929, "step": 110 }, { "epoch": 0.6201117318435754, "grad_norm": 9.419713020324707, "learning_rate": 9.529201968327616e-05, "loss": 37.5423, "step": 111 }, { "epoch": 0.6256983240223464, "grad_norm": 9.36577320098877, "learning_rate": 9.44141306374566e-05, "loss": 37.4528, "step": 112 }, { "epoch": 0.6312849162011173, "grad_norm": 9.40024185180664, "learning_rate": 9.346601372197914e-05, "loss": 37.3334, "step": 113 }, { "epoch": 0.6368715083798883, "grad_norm": 9.359634399414062, "learning_rate": 9.244916810456821e-05, "loss": 37.3102, "step": 114 }, { "epoch": 0.6424581005586593, "grad_norm": 9.717304229736328, "learning_rate": 9.136520162715287e-05, "loss": 37.1064, "step": 115 }, { "epoch": 0.6480446927374302, "grad_norm": 9.580801963806152, "learning_rate": 9.021582826353824e-05, "loss": 37.0496, "step": 116 }, { "epoch": 0.6536312849162011, "grad_norm": 9.399163246154785, "learning_rate": 8.900286540926061e-05, "loss": 36.9901, "step": 117 }, { "epoch": 0.659217877094972, "grad_norm": 9.52145004272461, "learning_rate": 8.772823100791151e-05, "loss": 36.79, "step": 118 }, { "epoch": 0.664804469273743, "grad_norm": 9.533082962036133, "learning_rate": 8.639394051847472e-05, "loss": 36.6575, "step": 119 }, { "epoch": 0.6703910614525139, "grad_norm": 9.642265319824219, "learning_rate": 8.500210372847127e-05, "loss": 36.4436, "step": 120 }, { "epoch": 0.6759776536312849, "grad_norm": 10.193774223327637, "learning_rate": 8.355492141795185e-05, "loss": 36.1696, "step": 121 }, { "epoch": 0.6815642458100558, "grad_norm": 9.560378074645996, "learning_rate": 8.2054681879611e-05, "loss": 36.3306, "step": 122 }, { "epoch": 0.6871508379888268, "grad_norm": 9.826172828674316, "learning_rate": 8.050375730052621e-05, "loss": 36.1994, "step": 123 }, { "epoch": 0.6927374301675978, "grad_norm": 10.785990715026855, "learning_rate": 7.890460001124242e-05, "loss": 36.3292, "step": 124 }, { "epoch": 0.6983240223463687, "grad_norm": 10.091948509216309, "learning_rate": 7.725973860813338e-05, "loss": 35.8338, "step": 125 }, { "epoch": 0.7039106145251397, "grad_norm": 9.727133750915527, "learning_rate": 7.557177395517112e-05, "loss": 36.0643, "step": 126 }, { "epoch": 0.7094972067039106, "grad_norm": 10.128454208374023, "learning_rate": 7.384337507142531e-05, "loss": 35.8633, "step": 127 }, { "epoch": 0.7150837988826816, "grad_norm": 10.032463073730469, "learning_rate": 7.20772749107956e-05, "loss": 35.75, "step": 128 }, { "epoch": 0.7206703910614525, "grad_norm": 10.427178382873535, "learning_rate": 7.027626604064969e-05, "loss": 35.4646, "step": 129 }, { "epoch": 0.7262569832402235, "grad_norm": 10.725203514099121, "learning_rate": 6.844319622620039e-05, "loss": 35.4019, "step": 130 }, { "epoch": 0.7318435754189944, "grad_norm": 11.060110092163086, "learning_rate": 6.65809639276034e-05, "loss": 35.1042, "step": 131 }, { "epoch": 0.7374301675977654, "grad_norm": 11.478692054748535, "learning_rate": 6.469251371689606e-05, "loss": 34.8198, "step": 132 }, { "epoch": 0.7430167597765364, "grad_norm": 6.56369161605835, "learning_rate": 6.278083162202375e-05, "loss": 39.9774, "step": 133 }, { "epoch": 0.7486033519553073, "grad_norm": 6.60885763168335, "learning_rate": 6.08489404053159e-05, "loss": 38.8113, "step": 134 }, { "epoch": 0.7541899441340782, "grad_norm": 6.839470863342285, "learning_rate": 5.889989478387753e-05, "loss": 38.2958, "step": 135 }, { "epoch": 0.7597765363128491, "grad_norm": 7.189307689666748, "learning_rate": 5.6936776599453424e-05, "loss": 37.661, "step": 136 }, { "epoch": 0.7653631284916201, "grad_norm": 7.412992000579834, "learning_rate": 5.496268994540309e-05, "loss": 37.3239, "step": 137 }, { "epoch": 0.770949720670391, "grad_norm": 7.726637363433838, "learning_rate": 5.2980756258490995e-05, "loss": 36.8866, "step": 138 }, { "epoch": 0.776536312849162, "grad_norm": 8.184843063354492, "learning_rate": 5.0994109383253506e-05, "loss": 36.6651, "step": 139 }, { "epoch": 0.7821229050279329, "grad_norm": 7.629161834716797, "learning_rate": 4.900589061674649e-05, "loss": 36.7795, "step": 140 }, { "epoch": 0.7877094972067039, "grad_norm": 8.492084503173828, "learning_rate": 4.701924374150901e-05, "loss": 36.3768, "step": 141 }, { "epoch": 0.7932960893854749, "grad_norm": 8.768118858337402, "learning_rate": 4.503731005459693e-05, "loss": 36.2209, "step": 142 }, { "epoch": 0.7988826815642458, "grad_norm": 9.186103820800781, "learning_rate": 4.3063223400546594e-05, "loss": 35.9594, "step": 143 }, { "epoch": 0.8044692737430168, "grad_norm": 7.938751220703125, "learning_rate": 4.11001052161225e-05, "loss": 36.3444, "step": 144 }, { "epoch": 0.8100558659217877, "grad_norm": 8.192068099975586, "learning_rate": 3.91510595946841e-05, "loss": 36.202, "step": 145 }, { "epoch": 0.8156424581005587, "grad_norm": 8.316092491149902, "learning_rate": 3.721916837797627e-05, "loss": 36.128, "step": 146 }, { "epoch": 0.8212290502793296, "grad_norm": 7.804288387298584, "learning_rate": 3.5307486283103966e-05, "loss": 36.2991, "step": 147 }, { "epoch": 0.8268156424581006, "grad_norm": 9.470741271972656, "learning_rate": 3.3419036072396616e-05, "loss": 35.5116, "step": 148 }, { "epoch": 0.8324022346368715, "grad_norm": 8.43538761138916, "learning_rate": 3.1556803773799614e-05, "loss": 35.9411, "step": 149 }, { "epoch": 0.8379888268156425, "grad_norm": 9.065338134765625, "learning_rate": 2.9723733959350307e-05, "loss": 35.6712, "step": 150 }, { "epoch": 0.8435754189944135, "grad_norm": 8.624382972717285, "learning_rate": 2.7922725089204426e-05, "loss": 35.8245, "step": 151 }, { "epoch": 0.8491620111731844, "grad_norm": 8.553651809692383, "learning_rate": 2.6156624928574707e-05, "loss": 35.8094, "step": 152 }, { "epoch": 0.8547486033519553, "grad_norm": 9.5851411819458, "learning_rate": 2.4428226044828896e-05, "loss": 35.4061, "step": 153 }, { "epoch": 0.8603351955307262, "grad_norm": 9.327908515930176, "learning_rate": 2.2740261391866637e-05, "loss": 35.4764, "step": 154 }, { "epoch": 0.8659217877094972, "grad_norm": 9.505815505981445, "learning_rate": 2.1095399988757574e-05, "loss": 35.3205, "step": 155 }, { "epoch": 0.8715083798882681, "grad_norm": 9.7329740524292, "learning_rate": 1.9496242699473783e-05, "loss": 35.1241, "step": 156 }, { "epoch": 0.8770949720670391, "grad_norm": 9.277899742126465, "learning_rate": 1.794531812038901e-05, "loss": 35.3066, "step": 157 }, { "epoch": 0.88268156424581, "grad_norm": 9.590785026550293, "learning_rate": 1.6445078582048155e-05, "loss": 35.1618, "step": 158 }, { "epoch": 0.888268156424581, "grad_norm": 9.559935569763184, "learning_rate": 1.4997896271528739e-05, "loss": 35.1343, "step": 159 }, { "epoch": 0.8938547486033519, "grad_norm": 9.499947547912598, "learning_rate": 1.3606059481525296e-05, "loss": 35.1343, "step": 160 }, { "epoch": 0.8994413407821229, "grad_norm": 9.608393669128418, "learning_rate": 1.2271768992088489e-05, "loss": 35.2073, "step": 161 }, { "epoch": 0.9050279329608939, "grad_norm": 9.510586738586426, "learning_rate": 1.09971345907394e-05, "loss": 35.1469, "step": 162 }, { "epoch": 0.9106145251396648, "grad_norm": 9.812493324279785, "learning_rate": 9.784171736461762e-06, "loss": 34.9144, "step": 163 }, { "epoch": 0.9162011173184358, "grad_norm": 10.067626953125, "learning_rate": 8.634798372847148e-06, "loss": 34.9231, "step": 164 }, { "epoch": 0.9217877094972067, "grad_norm": 9.783865928649902, "learning_rate": 7.550831895431798e-06, "loss": 34.8879, "step": 165 }, { "epoch": 0.9273743016759777, "grad_norm": 10.413803100585938, "learning_rate": 6.533986278020876e-06, "loss": 34.6783, "step": 166 }, { "epoch": 0.9329608938547486, "grad_norm": 9.892301559448242, "learning_rate": 5.585869362543416e-06, "loss": 34.9329, "step": 167 }, { "epoch": 0.9385474860335196, "grad_norm": 10.211995124816895, "learning_rate": 4.707980316723837e-06, "loss": 34.9967, "step": 168 }, { "epoch": 0.9441340782122905, "grad_norm": 10.002083778381348, "learning_rate": 3.901707263589671e-06, "loss": 34.7943, "step": 169 }, { "epoch": 0.9497206703910615, "grad_norm": 9.93572998046875, "learning_rate": 3.1683250865636114e-06, "loss": 34.842, "step": 170 }, { "epoch": 0.9553072625698324, "grad_norm": 9.871206283569336, "learning_rate": 2.5089934136108664e-06, "loss": 34.8242, "step": 171 }, { "epoch": 0.9608938547486033, "grad_norm": 10.331404685974121, "learning_rate": 1.9247547836289793e-06, "loss": 34.9185, "step": 172 }, { "epoch": 0.9664804469273743, "grad_norm": 10.266081809997559, "learning_rate": 1.4165329979794973e-06, "loss": 34.6464, "step": 173 }, { "epoch": 0.9720670391061452, "grad_norm": 11.090232849121094, "learning_rate": 9.851316597681958e-07, "loss": 34.4733, "step": 174 }, { "epoch": 0.9776536312849162, "grad_norm": 11.902629852294922, "learning_rate": 6.312329031833319e-07, "loss": 34.1088, "step": 175 }, { "epoch": 0.9832402234636871, "grad_norm": 10.5414400100708, "learning_rate": 3.553963149013295e-07, "loss": 34.5576, "step": 176 }, { "epoch": 0.9888268156424581, "grad_norm": 6.398901462554932, "learning_rate": 1.580580492652084e-07, "loss": 37.1557, "step": 177 }, { "epoch": 0.994413407821229, "grad_norm": 9.017801284790039, "learning_rate": 3.953013863490784e-08, "loss": 35.3366, "step": 178 }, { "epoch": 1.0, "grad_norm": 10.074228286743164, "learning_rate": 0.0, "loss": 34.6733, "step": 179 } ], "logging_steps": 1, "max_steps": 179, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 239, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 154875372503040.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }