|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.966777408637874, |
|
"eval_steps": 500, |
|
"global_step": 1500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006644518272425249, |
|
"grad_norm": 324.0, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 34.1539, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03322259136212625, |
|
"grad_norm": 328.0, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 34.4732, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0664451827242525, |
|
"grad_norm": 132.0, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 30.9731, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09966777408637874, |
|
"grad_norm": 57.75, |
|
"learning_rate": 2e-05, |
|
"loss": 24.1357, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.132890365448505, |
|
"grad_norm": 19.125, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 19.6743, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.16611295681063123, |
|
"grad_norm": 14.1875, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 17.9465, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.19933554817275748, |
|
"grad_norm": 7.25, |
|
"learning_rate": 4e-05, |
|
"loss": 15.9561, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23255813953488372, |
|
"grad_norm": 4.125, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 14.7788, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.26578073089701, |
|
"grad_norm": 3.484375, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 14.139, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.29900332225913623, |
|
"grad_norm": 4.75, |
|
"learning_rate": 6e-05, |
|
"loss": 13.5886, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.33222591362126247, |
|
"grad_norm": 6.125, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 13.0275, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3654485049833887, |
|
"grad_norm": 11.5625, |
|
"learning_rate": 7.333333333333333e-05, |
|
"loss": 11.9071, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.39867109634551495, |
|
"grad_norm": 18.375, |
|
"learning_rate": 8e-05, |
|
"loss": 9.4575, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4318936877076412, |
|
"grad_norm": 21.25, |
|
"learning_rate": 8.666666666666667e-05, |
|
"loss": 5.8479, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 2.6937, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4983388704318937, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 0.0001, |
|
"loss": 2.0051, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.53156146179402, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 0.00010666666666666667, |
|
"loss": 1.7309, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5647840531561462, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00011333333333333334, |
|
"loss": 1.5823, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5980066445182725, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.00012, |
|
"loss": 1.4702, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6312292358803987, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00012666666666666666, |
|
"loss": 1.3996, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6644518272425249, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 1.3389, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6976744186046512, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.00014, |
|
"loss": 1.293, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7308970099667774, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.00014666666666666666, |
|
"loss": 1.2656, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7641196013289037, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00015333333333333334, |
|
"loss": 1.2254, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7973421926910299, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00016, |
|
"loss": 1.2072, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8305647840531561, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 1.1856, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8637873754152824, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.00017333333333333334, |
|
"loss": 1.169, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8970099667774086, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00018, |
|
"loss": 1.1497, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 0.0001866666666666667, |
|
"loss": 1.131, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9634551495016611, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.00019333333333333333, |
|
"loss": 1.1275, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9966777408637874, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1216, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9966777408637874, |
|
"eval_loss": 2.580465793609619, |
|
"eval_runtime": 0.2799, |
|
"eval_samples_per_second": 35.728, |
|
"eval_steps_per_second": 3.573, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0299003322259137, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00019999323080037624, |
|
"loss": 1.1202, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.06312292358804, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 0.00019997292411794618, |
|
"loss": 1.0982, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0963455149501662, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0001999390827019096, |
|
"loss": 1.1059, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.1295681063122924, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0001998917111338525, |
|
"loss": 1.079, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.1627906976744187, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00019983081582712685, |
|
"loss": 1.0626, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.196013289036545, |
|
"grad_norm": 3.75, |
|
"learning_rate": 0.00019975640502598244, |
|
"loss": 1.0644, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.2292358803986712, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00019966848880445062, |
|
"loss": 1.064, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.2624584717607974, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.00019956707906498044, |
|
"loss": 1.0638, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.2956810631229236, |
|
"grad_norm": 1.75, |
|
"learning_rate": 0.00019945218953682734, |
|
"loss": 1.0598, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.3289036544850499, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00019932383577419432, |
|
"loss": 1.0433, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.3621262458471761, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00019918203515412617, |
|
"loss": 1.0375, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.3953488372093024, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00019902680687415705, |
|
"loss": 1.0293, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00019885817194971117, |
|
"loss": 1.0196, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.4617940199335548, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.00019867615321125795, |
|
"loss": 1.0227, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.495016611295681, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 0.00019848077530122083, |
|
"loss": 1.0192, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.5282392026578073, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 0.00019827206467064133, |
|
"loss": 1.0254, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.5614617940199336, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 0.00019805004957559793, |
|
"loss": 1.0076, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.5946843853820598, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.00019781476007338058, |
|
"loss": 0.9979, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.627906976744186, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 0.00019756622801842143, |
|
"loss": 0.9963, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.6611295681063123, |
|
"grad_norm": 2.625, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 1.0017, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6943521594684385, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 0.00019702957262759965, |
|
"loss": 1.0055, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.7275747508305648, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 0.00019674152194628638, |
|
"loss": 0.993, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.760797342192691, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.0001964403740114939, |
|
"loss": 0.9875, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.7940199335548173, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.0001961261695938319, |
|
"loss": 1.0015, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.8272425249169435, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.0001957989512315489, |
|
"loss": 0.9879, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.8604651162790697, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 0.0001954587632247732, |
|
"loss": 0.9846, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.893687707641196, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 0.9816, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.9269102990033222, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.00019473966425143292, |
|
"loss": 0.9832, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.9601328903654485, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00019436085063935835, |
|
"loss": 0.9838, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.9933554817275747, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 0.9828, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.516935110092163, |
|
"eval_runtime": 0.2355, |
|
"eval_samples_per_second": 42.456, |
|
"eval_steps_per_second": 4.246, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.026578073089701, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 0.00019356495158395315, |
|
"loss": 0.9602, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.0598006644518274, |
|
"grad_norm": 3.375, |
|
"learning_rate": 0.00019314797389261424, |
|
"loss": 0.9484, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.0930232558139537, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019271838545667876, |
|
"loss": 0.9496, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.12624584717608, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00019227624443554425, |
|
"loss": 0.9405, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.159468438538206, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.00019182161068802741, |
|
"loss": 0.9509, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.1926910299003324, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.0001913545457642601, |
|
"loss": 0.9532, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.2259136212624586, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00019087511289735644, |
|
"loss": 0.9421, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.259136212624585, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 0.00019038337699485208, |
|
"loss": 0.9347, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.292358803986711, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.0001898794046299167, |
|
"loss": 0.9451, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.3255813953488373, |
|
"grad_norm": 5.25, |
|
"learning_rate": 0.00018936326403234125, |
|
"loss": 0.9503, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.3588039867109636, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.00018883502507930042, |
|
"loss": 0.9515, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.39202657807309, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.00018829475928589271, |
|
"loss": 0.9382, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.425249169435216, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.0001877425397954582, |
|
"loss": 0.9309, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.4584717607973423, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 0.00018717844136967624, |
|
"loss": 0.9487, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.4916943521594686, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 0.9414, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.524916943521595, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00018601491478953657, |
|
"loss": 0.9575, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.558139534883721, |
|
"grad_norm": 1.90625, |
|
"learning_rate": 0.00018541564415805258, |
|
"loss": 0.9469, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.5913621262458473, |
|
"grad_norm": 8.25, |
|
"learning_rate": 0.0001848048096156426, |
|
"loss": 0.9246, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.6245847176079735, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00018418249385952575, |
|
"loss": 0.9357, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.6578073089700998, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00018354878114129367, |
|
"loss": 0.9264, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.691029900332226, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.00018290375725550417, |
|
"loss": 0.934, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.7242524916943522, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 0.00018224750952806624, |
|
"loss": 0.9378, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.7574750830564785, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00018158012680441723, |
|
"loss": 0.9325, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.7906976744186047, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.9343, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.823920265780731, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0001802123192755044, |
|
"loss": 0.9322, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.0001795120796494848, |
|
"loss": 0.9203, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.8903654485049834, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00017880107536067218, |
|
"loss": 0.9181, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.9235880398671097, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00017807940266766593, |
|
"loss": 0.9152, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.956810631229236, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0001773471592733964, |
|
"loss": 0.9193, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.990033222591362, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 0.9157, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.9966777408637872, |
|
"eval_loss": 2.4835643768310547, |
|
"eval_runtime": 0.2608, |
|
"eval_samples_per_second": 38.338, |
|
"eval_steps_per_second": 3.834, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 3.0232558139534884, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.00017585135833488692, |
|
"loss": 0.9023, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.0564784053156147, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.00017508800329814995, |
|
"loss": 0.8957, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.089700996677741, |
|
"grad_norm": 1.75, |
|
"learning_rate": 0.00017431448254773944, |
|
"loss": 0.8963, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.122923588039867, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.0001735309008059829, |
|
"loss": 0.8938, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.1561461794019934, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.00017273736415730488, |
|
"loss": 0.8832, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.1893687707641196, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0001719339800338651, |
|
"loss": 0.8824, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.222591362126246, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00017112085720101373, |
|
"loss": 0.8985, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.255813953488372, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0001702981057425662, |
|
"loss": 0.8915, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.2890365448504983, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00016946583704589973, |
|
"loss": 0.8959, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.3222591362126246, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 0.8932, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.355481727574751, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00016777319991457325, |
|
"loss": 0.9034, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.388704318936877, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00016691306063588583, |
|
"loss": 0.8914, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.4219269102990033, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00016604386239990078, |
|
"loss": 0.8968, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.4551495016611296, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00016516572288214552, |
|
"loss": 0.8899, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.488372093023256, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00016427876096865394, |
|
"loss": 0.888, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.521594684385382, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.00016338309673987101, |
|
"loss": 0.8966, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.5548172757475083, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.000162478851454396, |
|
"loss": 0.8802, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.5880398671096345, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.0001615661475325658, |
|
"loss": 0.8864, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.6212624584717608, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00016064510853988138, |
|
"loss": 0.8816, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.654485049833887, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.00015971585917027862, |
|
"loss": 0.8906, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.6877076411960132, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 0.8896, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.7209302325581395, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00015783323361679864, |
|
"loss": 0.8806, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.7541528239202657, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.00015688011231028518, |
|
"loss": 0.8758, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.787375415282392, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.0001559192903470747, |
|
"loss": 0.871, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.820598006644518, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.0001549508978070806, |
|
"loss": 0.8882, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.8538205980066444, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0001539750657951513, |
|
"loss": 0.8719, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.8870431893687707, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0001529919264233205, |
|
"loss": 0.8794, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.920265780730897, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00015200161279292155, |
|
"loss": 0.8787, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.953488372093023, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00015100425897656753, |
|
"loss": 0.873, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.9867109634551494, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.8753, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.5010673999786377, |
|
"eval_runtime": 0.239, |
|
"eval_samples_per_second": 41.842, |
|
"eval_steps_per_second": 4.184, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 4.019933554817276, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.0001489889718238087, |
|
"loss": 0.8697, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 4.053156146179402, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.00014797131132502465, |
|
"loss": 0.8496, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.086378737541528, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 0.00014694715627858908, |
|
"loss": 0.8601, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 4.119601328903655, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.00014591664533870118, |
|
"loss": 0.8647, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.152823920265781, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.00014487991802004623, |
|
"loss": 0.8541, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 4.186046511627907, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.00014383711467890774, |
|
"loss": 0.8481, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.219269102990033, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00014278837649416544, |
|
"loss": 0.8514, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 4.25249169435216, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.0001417338454481818, |
|
"loss": 0.8498, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00014067366430758004, |
|
"loss": 0.8368, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 4.318936877076412, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.0001396079766039157, |
|
"loss": 0.8439, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.352159468438538, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00013853692661424484, |
|
"loss": 0.8565, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 4.385382059800665, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00013746065934159123, |
|
"loss": 0.8426, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.4186046511627906, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00013637932049531516, |
|
"loss": 0.8471, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 4.451827242524917, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00013529305647138687, |
|
"loss": 0.8417, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.485049833887043, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00013420201433256689, |
|
"loss": 0.8493, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 4.51827242524917, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.0001331063417884958, |
|
"loss": 0.8506, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.5514950166112955, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00013200618717569714, |
|
"loss": 0.841, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 4.584717607973422, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.8415, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.617940199335548, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0001297930281038482, |
|
"loss": 0.8506, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 4.651162790697675, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00012868032327110904, |
|
"loss": 0.8425, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.6843853820598005, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.0001275637355816999, |
|
"loss": 0.8466, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 4.717607973421927, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00012644341620372023, |
|
"loss": 0.841, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.750830564784053, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 0.0001253195168104802, |
|
"loss": 0.8396, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 4.78405315614618, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 0.00012419218955996676, |
|
"loss": 0.8423, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.8172757475083055, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00012306158707424403, |
|
"loss": 0.839, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 4.850498338870432, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00012192786241879033, |
|
"loss": 0.8342, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.883720930232558, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.00012079116908177593, |
|
"loss": 0.8358, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 4.916943521594685, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00011965166095328301, |
|
"loss": 0.8432, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.95016611295681, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00011850949230447145, |
|
"loss": 0.8368, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 4.983388704318937, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.8334, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.996677740863787, |
|
"eval_loss": 2.4944658279418945, |
|
"eval_runtime": 0.2592, |
|
"eval_samples_per_second": 38.58, |
|
"eval_steps_per_second": 3.858, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 5.016611295681063, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00011621779231055676, |
|
"loss": 0.8264, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 5.04983388704319, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.00011506857122494831, |
|
"loss": 0.8175, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 5.083056478405315, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00011391731009600654, |
|
"loss": 0.8207, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 5.116279069767442, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00011276416478605949, |
|
"loss": 0.8134, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 5.149501661129568, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00011160929141252303, |
|
"loss": 0.8146, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 5.1827242524916945, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.00011045284632676536, |
|
"loss": 0.8118, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.21594684385382, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 0.00010929498609293924, |
|
"loss": 0.8142, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 5.249169435215947, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00010813586746678583, |
|
"loss": 0.8156, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 5.282392026578073, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 0.00010697564737441252, |
|
"loss": 0.8097, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 5.3156146179401995, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00010581448289104758, |
|
"loss": 0.8213, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.348837209302325, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.0001046525312197747, |
|
"loss": 0.8087, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 5.382059800664452, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.00010348994967025012, |
|
"loss": 0.8046, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 5.415282392026578, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 0.00010232689563740563, |
|
"loss": 0.8086, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 5.4485049833887045, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.00010116352658013973, |
|
"loss": 0.809, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 5.48172757475083, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.0001, |
|
"loss": 0.8155, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 5.514950166112957, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 9.883647341986032e-05, |
|
"loss": 0.8016, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 5.548172757475083, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 9.767310436259438e-05, |
|
"loss": 0.8013, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 5.5813953488372094, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 9.651005032974994e-05, |
|
"loss": 0.8123, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 5.614617940199335, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 9.534746878022534e-05, |
|
"loss": 0.8163, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 5.647840531561462, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 9.418551710895243e-05, |
|
"loss": 0.8164, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 5.681063122923588, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 9.302435262558747e-05, |
|
"loss": 0.7974, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 9.186413253321418e-05, |
|
"loss": 0.8142, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 5.74750830564784, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 9.070501390706079e-05, |
|
"loss": 0.8026, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 5.780730897009967, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 8.954715367323468e-05, |
|
"loss": 0.8005, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 5.813953488372093, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 8.839070858747697e-05, |
|
"loss": 0.8015, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 5.847176079734219, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 8.723583521394054e-05, |
|
"loss": 0.7924, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 5.880398671096345, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 8.608268990399349e-05, |
|
"loss": 0.812, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 5.913621262458472, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 8.49314287750517e-05, |
|
"loss": 0.7969, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 5.946843853820598, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 8.378220768944327e-05, |
|
"loss": 0.7965, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 5.980066445182724, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 0.796, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.531708240509033, |
|
"eval_runtime": 0.239, |
|
"eval_samples_per_second": 41.85, |
|
"eval_steps_per_second": 4.185, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 6.01328903654485, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 8.149050769552856e-05, |
|
"loss": 0.7892, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 6.046511627906977, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 8.034833904671698e-05, |
|
"loss": 0.7792, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 6.079734219269103, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 7.920883091822408e-05, |
|
"loss": 0.7814, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 6.112956810631229, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 7.807213758120966e-05, |
|
"loss": 0.7822, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 6.146179401993355, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 7.693841292575598e-05, |
|
"loss": 0.7749, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 6.179401993355482, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 7.580781044003324e-05, |
|
"loss": 0.7821, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 6.212624584717608, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 7.468048318951983e-05, |
|
"loss": 0.7872, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 6.245847176079734, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 7.35565837962798e-05, |
|
"loss": 0.7855, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 6.27906976744186, |
|
"grad_norm": 3.28125, |
|
"learning_rate": 7.243626441830009e-05, |
|
"loss": 0.7763, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 6.312292358803987, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 7.131967672889101e-05, |
|
"loss": 0.7901, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 6.3455149501661126, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 7.02069718961518e-05, |
|
"loss": 0.7814, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 6.378737541528239, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.7752, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 6.411960132890365, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 6.799381282430284e-05, |
|
"loss": 0.7782, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 6.445182724252492, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 6.68936582115042e-05, |
|
"loss": 0.7748, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 6.4784053156146175, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 6.579798566743314e-05, |
|
"loss": 0.7815, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 6.511627906976744, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 6.470694352861312e-05, |
|
"loss": 0.7747, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 6.544850498338871, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 6.362067950468489e-05, |
|
"loss": 0.785, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 6.578073089700997, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 6.25393406584088e-05, |
|
"loss": 0.7716, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 6.6112956810631225, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 6.146307338575519e-05, |
|
"loss": 0.7723, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 6.644518272425249, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.7745, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 6.677740863787376, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 5.9326335692419995e-05, |
|
"loss": 0.7848, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 6.710963455149502, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 5.8266154551818216e-05, |
|
"loss": 0.7797, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 6.7441860465116275, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 5.72116235058346e-05, |
|
"loss": 0.7714, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 6.777408637873754, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 5.616288532109225e-05, |
|
"loss": 0.7716, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 6.810631229235881, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 5.5120081979953785e-05, |
|
"loss": 0.7807, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 6.843853820598007, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 5.4083354661298814e-05, |
|
"loss": 0.7647, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 6.877076411960132, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 5.305284372141095e-05, |
|
"loss": 0.7755, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 6.910299003322259, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 5.2028688674975415e-05, |
|
"loss": 0.7738, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 6.943521594684386, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 5.101102817619131e-05, |
|
"loss": 0.7765, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 6.976744186046512, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.7745, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 6.996677740863787, |
|
"eval_loss": 2.5435612201690674, |
|
"eval_runtime": 0.2585, |
|
"eval_samples_per_second": 38.679, |
|
"eval_steps_per_second": 3.868, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 7.009966777408638, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 4.899574102343247e-05, |
|
"loss": 0.771, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 7.043189368770764, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 4.799838720707846e-05, |
|
"loss": 0.7653, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 7.076411960132891, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 4.700807357667952e-05, |
|
"loss": 0.7644, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 7.1096345514950166, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 4.6024934204848745e-05, |
|
"loss": 0.7632, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 7.142857142857143, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 4.50491021929194e-05, |
|
"loss": 0.7686, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 7.176079734219269, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 4.4080709652925336e-05, |
|
"loss": 0.7549, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 7.209302325581396, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 4.3119887689714844e-05, |
|
"loss": 0.7626, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 7.2425249169435215, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 4.216676638320135e-05, |
|
"loss": 0.7588, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 7.275747508305648, |
|
"grad_norm": 0.5, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 0.7583, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 7.308970099667774, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 4.028414082972141e-05, |
|
"loss": 0.7529, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 7.342192691029901, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.935489146011869e-05, |
|
"loss": 0.766, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 7.3754152823920265, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 3.843385246743417e-05, |
|
"loss": 0.7592, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 7.408637873754153, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 3.7521148545604e-05, |
|
"loss": 0.7645, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 7.441860465116279, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 3.661690326012897e-05, |
|
"loss": 0.7629, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 7.475083056478406, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 0.7591, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 7.5083056478405314, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 3.483427711785449e-05, |
|
"loss": 0.7558, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 7.541528239202658, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 3.395613760009925e-05, |
|
"loss": 0.7611, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 7.574750830564784, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 3.308693936411421e-05, |
|
"loss": 0.7619, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 7.607973421926911, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 3.222680008542678e-05, |
|
"loss": 0.7585, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 7.641196013289036, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 3.137583621312665e-05, |
|
"loss": 0.7551, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 7.674418604651163, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 3.053416295410026e-05, |
|
"loss": 0.7626, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 7.707641196013289, |
|
"grad_norm": 0.5, |
|
"learning_rate": 2.9701894257433826e-05, |
|
"loss": 0.764, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 7.740863787375416, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 2.8879142798986292e-05, |
|
"loss": 0.755, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 7.774086378737541, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 2.8066019966134904e-05, |
|
"loss": 0.7563, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 7.807308970099668, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 2.7262635842695127e-05, |
|
"loss": 0.7688, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 7.840531561461794, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 2.6469099194017143e-05, |
|
"loss": 0.7665, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 7.8737541528239205, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 2.5685517452260567e-05, |
|
"loss": 0.7664, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 7.906976744186046, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 2.491199670185008e-05, |
|
"loss": 0.753, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 7.940199335548173, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 2.4148641665113113e-05, |
|
"loss": 0.7614, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 7.973421926910299, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 2.339555568810221e-05, |
|
"loss": 0.7582, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.5521774291992188, |
|
"eval_runtime": 0.24, |
|
"eval_samples_per_second": 41.669, |
|
"eval_steps_per_second": 4.167, |
|
"step": 1204 |
|
}, |
|
{ |
|
"epoch": 8.006644518272426, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 2.265284072660362e-05, |
|
"loss": 0.7646, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 8.039867109634551, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 2.192059733233408e-05, |
|
"loss": 0.758, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 8.073089700996677, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 2.119892463932781e-05, |
|
"loss": 0.7566, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 8.106312292358805, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 2.0487920350515212e-05, |
|
"loss": 0.7551, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 8.13953488372093, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 1.9787680724495617e-05, |
|
"loss": 0.7421, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 8.172757475083056, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.7513, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 8.205980066445182, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.8419873195582814e-05, |
|
"loss": 0.7578, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 8.23920265780731, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 1.775249047193377e-05, |
|
"loss": 0.7518, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 8.272425249169435, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 1.7096242744495837e-05, |
|
"loss": 0.7519, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 8.305647840531561, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.7514, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 8.338870431893687, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 1.5817506140474247e-05, |
|
"loss": 0.7553, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 8.372093023255815, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 1.5195190384357404e-05, |
|
"loss": 0.7487, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 8.40531561461794, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 1.458435584194745e-05, |
|
"loss": 0.7518, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 8.438538205980066, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 1.3985085210463477e-05, |
|
"loss": 0.7487, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 8.471760797342192, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 0.7467, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 8.50498338870432, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 1.2821558630323772e-05, |
|
"loss": 0.7478, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 8.538205980066445, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 1.2257460204541794e-05, |
|
"loss": 0.7558, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 8.571428571428571, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.1705240714107302e-05, |
|
"loss": 0.7426, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 8.604651162790697, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 1.116497492069961e-05, |
|
"loss": 0.7411, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 8.637873754152825, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 1.0636735967658784e-05, |
|
"loss": 0.7524, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 8.67109634551495, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 1.0120595370083318e-05, |
|
"loss": 0.7499, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 8.704318936877076, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 9.616623005147951e-06, |
|
"loss": 0.7603, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 8.737541528239202, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 9.124887102643575e-06, |
|
"loss": 0.7563, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 8.77076411960133, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 8.645454235739903e-06, |
|
"loss": 0.7594, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 8.803986710963455, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 8.178389311972612e-06, |
|
"loss": 0.7648, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 8.837209302325581, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 7.72375556445577e-06, |
|
"loss": 0.7555, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 8.870431893687707, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 7.281614543321269e-06, |
|
"loss": 0.7461, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 8.903654485049834, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 6.852026107385756e-06, |
|
"loss": 0.7606, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 8.93687707641196, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 6.435048416046863e-06, |
|
"loss": 0.7598, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 8.970099667774086, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 6.030737921409169e-06, |
|
"loss": 0.754, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 8.996677740863788, |
|
"eval_loss": 2.5503978729248047, |
|
"eval_runtime": 0.2601, |
|
"eval_samples_per_second": 38.445, |
|
"eval_steps_per_second": 3.845, |
|
"step": 1354 |
|
}, |
|
{ |
|
"epoch": 9.003322259136212, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 5.639149360641649e-06, |
|
"loss": 0.7546, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 9.03654485049834, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 5.26033574856708e-06, |
|
"loss": 0.7562, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 9.069767441860465, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 0.7522, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 9.102990033222591, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 4.541236775226809e-06, |
|
"loss": 0.7522, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 9.136212624584717, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 4.20104876845111e-06, |
|
"loss": 0.7509, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 9.169435215946844, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 3.873830406168111e-06, |
|
"loss": 0.7444, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 9.20265780730897, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 3.5596259885061102e-06, |
|
"loss": 0.7561, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 9.235880398671096, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 3.2584780537136207e-06, |
|
"loss": 0.7502, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 9.269102990033222, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 2.970427372400353e-06, |
|
"loss": 0.7546, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 9.30232558139535, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 2.6955129420176196e-06, |
|
"loss": 0.7506, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 9.335548172757475, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 2.433771981578581e-06, |
|
"loss": 0.7531, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 9.368770764119601, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 2.1852399266194314e-06, |
|
"loss": 0.75, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 9.401993355481727, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 1.9499504244020693e-06, |
|
"loss": 0.7449, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 9.435215946843854, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 1.7279353293586765e-06, |
|
"loss": 0.765, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 9.46843853820598, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.5192246987791981e-06, |
|
"loss": 0.7472, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 9.501661129568106, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 1.323846788742078e-06, |
|
"loss": 0.7461, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 9.534883720930232, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 1.14182805028884e-06, |
|
"loss": 0.7501, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 9.56810631229236, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 9.731931258429638e-07, |
|
"loss": 0.7501, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 9.601328903654485, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 8.17964845873831e-07, |
|
"loss": 0.7511, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 9.634551495016611, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 6.761642258056978e-07, |
|
"loss": 0.7556, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 9.667774086378738, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 5.478104631726711e-07, |
|
"loss": 0.751, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 9.700996677740864, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 4.329209350195651e-07, |
|
"loss": 0.7598, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 9.73421926910299, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 3.315111955493944e-07, |
|
"loss": 0.7572, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 9.767441860465116, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 2.4359497401758024e-07, |
|
"loss": 0.7478, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 9.800664451827242, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 1.6918417287318245e-07, |
|
"loss": 0.749, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 9.83388704318937, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.0828886614754341e-07, |
|
"loss": 0.7488, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 9.867109634551495, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 6.09172980904238e-08, |
|
"loss": 0.7407, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 9.90033222591362, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 2.7075882053828605e-08, |
|
"loss": 0.7491, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 9.933554817275748, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 6.769199623779532e-09, |
|
"loss": 0.7417, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 9.966777408637874, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.0, |
|
"loss": 0.7572, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 9.966777408637874, |
|
"eval_loss": 2.5546562671661377, |
|
"eval_runtime": 0.2333, |
|
"eval_samples_per_second": 42.867, |
|
"eval_steps_per_second": 4.287, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 9.966777408637874, |
|
"step": 1500, |
|
"total_flos": 4.5794490708666614e+18, |
|
"train_loss": 1.5882705609003702, |
|
"train_runtime": 3659.0045, |
|
"train_samples_per_second": 26.291, |
|
"train_steps_per_second": 0.41 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 4.5794490708666614e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|