{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9945750452079567,
  "eval_steps": 500,
  "global_step": 1242,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.024110910186859555,
      "grad_norm": 6.533456606849755,
      "learning_rate": 5e-06,
      "loss": 0.8842,
      "step": 10
    },
    {
      "epoch": 0.04822182037371911,
      "grad_norm": 1.7431134609583179,
      "learning_rate": 5e-06,
      "loss": 0.7716,
      "step": 20
    },
    {
      "epoch": 0.07233273056057866,
      "grad_norm": 1.3259257434449123,
      "learning_rate": 5e-06,
      "loss": 0.7528,
      "step": 30
    },
    {
      "epoch": 0.09644364074743822,
      "grad_norm": 0.7933729448787181,
      "learning_rate": 5e-06,
      "loss": 0.7272,
      "step": 40
    },
    {
      "epoch": 0.12055455093429777,
      "grad_norm": 0.9489596846030721,
      "learning_rate": 5e-06,
      "loss": 0.7249,
      "step": 50
    },
    {
      "epoch": 0.14466546112115733,
      "grad_norm": 0.9021467855462361,
      "learning_rate": 5e-06,
      "loss": 0.7048,
      "step": 60
    },
    {
      "epoch": 0.16877637130801687,
      "grad_norm": 0.9161730770097936,
      "learning_rate": 5e-06,
      "loss": 0.7005,
      "step": 70
    },
    {
      "epoch": 0.19288728149487644,
      "grad_norm": 0.7938637800361754,
      "learning_rate": 5e-06,
      "loss": 0.6932,
      "step": 80
    },
    {
      "epoch": 0.21699819168173598,
      "grad_norm": 0.5753057373815283,
      "learning_rate": 5e-06,
      "loss": 0.6974,
      "step": 90
    },
    {
      "epoch": 0.24110910186859555,
      "grad_norm": 0.9667031835337068,
      "learning_rate": 5e-06,
      "loss": 0.6757,
      "step": 100
    },
    {
      "epoch": 0.2652200120554551,
      "grad_norm": 0.5028322572120897,
      "learning_rate": 5e-06,
      "loss": 0.6834,
      "step": 110
    },
    {
      "epoch": 0.28933092224231466,
      "grad_norm": 0.6797693061745307,
      "learning_rate": 5e-06,
      "loss": 0.674,
      "step": 120
    },
    {
      "epoch": 0.3134418324291742,
      "grad_norm": 1.1680360190006298,
      "learning_rate": 5e-06,
      "loss": 0.6722,
      "step": 130
    },
    {
      "epoch": 0.33755274261603374,
      "grad_norm": 0.544648561957048,
      "learning_rate": 5e-06,
      "loss": 0.677,
      "step": 140
    },
    {
      "epoch": 0.3616636528028933,
      "grad_norm": 0.7257586557706087,
      "learning_rate": 5e-06,
      "loss": 0.6747,
      "step": 150
    },
    {
      "epoch": 0.3857745629897529,
      "grad_norm": 0.4119617826643094,
      "learning_rate": 5e-06,
      "loss": 0.6693,
      "step": 160
    },
    {
      "epoch": 0.4098854731766124,
      "grad_norm": 0.5605900141967505,
      "learning_rate": 5e-06,
      "loss": 0.6688,
      "step": 170
    },
    {
      "epoch": 0.43399638336347196,
      "grad_norm": 0.6385688604944322,
      "learning_rate": 5e-06,
      "loss": 0.6643,
      "step": 180
    },
    {
      "epoch": 0.45810729355033153,
      "grad_norm": 1.0125290764918353,
      "learning_rate": 5e-06,
      "loss": 0.6641,
      "step": 190
    },
    {
      "epoch": 0.4822182037371911,
      "grad_norm": 0.6186627046406172,
      "learning_rate": 5e-06,
      "loss": 0.658,
      "step": 200
    },
    {
      "epoch": 0.5063291139240507,
      "grad_norm": 0.8414739576765752,
      "learning_rate": 5e-06,
      "loss": 0.6671,
      "step": 210
    },
    {
      "epoch": 0.5304400241109102,
      "grad_norm": 0.6025383396507406,
      "learning_rate": 5e-06,
      "loss": 0.6651,
      "step": 220
    },
    {
      "epoch": 0.5545509342977697,
      "grad_norm": 0.5410823595468066,
      "learning_rate": 5e-06,
      "loss": 0.6554,
      "step": 230
    },
    {
      "epoch": 0.5786618444846293,
      "grad_norm": 0.6181513401688427,
      "learning_rate": 5e-06,
      "loss": 0.6641,
      "step": 240
    },
    {
      "epoch": 0.6027727546714888,
      "grad_norm": 0.4896208841371711,
      "learning_rate": 5e-06,
      "loss": 0.657,
      "step": 250
    },
    {
      "epoch": 0.6268836648583485,
      "grad_norm": 0.544546111477725,
      "learning_rate": 5e-06,
      "loss": 0.6638,
      "step": 260
    },
    {
      "epoch": 0.650994575045208,
      "grad_norm": 0.5356265326461168,
      "learning_rate": 5e-06,
      "loss": 0.6602,
      "step": 270
    },
    {
      "epoch": 0.6751054852320675,
      "grad_norm": 0.5475932069244179,
      "learning_rate": 5e-06,
      "loss": 0.6538,
      "step": 280
    },
    {
      "epoch": 0.6992163954189271,
      "grad_norm": 0.8345098978281534,
      "learning_rate": 5e-06,
      "loss": 0.6523,
      "step": 290
    },
    {
      "epoch": 0.7233273056057866,
      "grad_norm": 0.8160477568039888,
      "learning_rate": 5e-06,
      "loss": 0.6604,
      "step": 300
    },
    {
      "epoch": 0.7474382157926461,
      "grad_norm": 0.5563594159462366,
      "learning_rate": 5e-06,
      "loss": 0.6581,
      "step": 310
    },
    {
      "epoch": 0.7715491259795058,
      "grad_norm": 0.6104670026137493,
      "learning_rate": 5e-06,
      "loss": 0.6543,
      "step": 320
    },
    {
      "epoch": 0.7956600361663653,
      "grad_norm": 0.5818143425968119,
      "learning_rate": 5e-06,
      "loss": 0.6508,
      "step": 330
    },
    {
      "epoch": 0.8197709463532248,
      "grad_norm": 0.39317653113678785,
      "learning_rate": 5e-06,
      "loss": 0.6459,
      "step": 340
    },
    {
      "epoch": 0.8438818565400844,
      "grad_norm": 0.4869964807571895,
      "learning_rate": 5e-06,
      "loss": 0.6476,
      "step": 350
    },
    {
      "epoch": 0.8679927667269439,
      "grad_norm": 0.9839633535279524,
      "learning_rate": 5e-06,
      "loss": 0.6517,
      "step": 360
    },
    {
      "epoch": 0.8921036769138035,
      "grad_norm": 0.4947132075136725,
      "learning_rate": 5e-06,
      "loss": 0.6554,
      "step": 370
    },
    {
      "epoch": 0.9162145871006631,
      "grad_norm": 0.42196728270115014,
      "learning_rate": 5e-06,
      "loss": 0.647,
      "step": 380
    },
    {
      "epoch": 0.9403254972875226,
      "grad_norm": 0.7036293961206416,
      "learning_rate": 5e-06,
      "loss": 0.6437,
      "step": 390
    },
    {
      "epoch": 0.9644364074743822,
      "grad_norm": 0.4303638291795801,
      "learning_rate": 5e-06,
      "loss": 0.647,
      "step": 400
    },
    {
      "epoch": 0.9885473176612417,
      "grad_norm": 0.4993150805880552,
      "learning_rate": 5e-06,
      "loss": 0.6408,
      "step": 410
    },
    {
      "epoch": 0.9981916817359855,
      "eval_loss": 0.6481794714927673,
      "eval_runtime": 221.8793,
      "eval_samples_per_second": 50.37,
      "eval_steps_per_second": 0.397,
      "step": 414
    },
    {
      "epoch": 1.0126582278481013,
      "grad_norm": 0.5861902646980864,
      "learning_rate": 5e-06,
      "loss": 0.6219,
      "step": 420
    },
    {
      "epoch": 1.0367691380349608,
      "grad_norm": 0.4608104690581376,
      "learning_rate": 5e-06,
      "loss": 0.6109,
      "step": 430
    },
    {
      "epoch": 1.0608800482218204,
      "grad_norm": 0.7019806195266277,
      "learning_rate": 5e-06,
      "loss": 0.5999,
      "step": 440
    },
    {
      "epoch": 1.0849909584086799,
      "grad_norm": 0.4666118598227287,
      "learning_rate": 5e-06,
      "loss": 0.6071,
      "step": 450
    },
    {
      "epoch": 1.1091018685955394,
      "grad_norm": 0.49273088471001014,
      "learning_rate": 5e-06,
      "loss": 0.6079,
      "step": 460
    },
    {
      "epoch": 1.1332127787823991,
      "grad_norm": 0.5608412041594104,
      "learning_rate": 5e-06,
      "loss": 0.6093,
      "step": 470
    },
    {
      "epoch": 1.1573236889692586,
      "grad_norm": 0.5133766270512516,
      "learning_rate": 5e-06,
      "loss": 0.6023,
      "step": 480
    },
    {
      "epoch": 1.1814345991561181,
      "grad_norm": 0.4639503656965253,
      "learning_rate": 5e-06,
      "loss": 0.6067,
      "step": 490
    },
    {
      "epoch": 1.2055455093429777,
      "grad_norm": 0.4941484591532595,
      "learning_rate": 5e-06,
      "loss": 0.6034,
      "step": 500
    },
    {
      "epoch": 1.2296564195298372,
      "grad_norm": 0.532046568060987,
      "learning_rate": 5e-06,
      "loss": 0.6021,
      "step": 510
    },
    {
      "epoch": 1.253767329716697,
      "grad_norm": 0.6313451414543506,
      "learning_rate": 5e-06,
      "loss": 0.6072,
      "step": 520
    },
    {
      "epoch": 1.2778782399035564,
      "grad_norm": 0.48840150221258893,
      "learning_rate": 5e-06,
      "loss": 0.6008,
      "step": 530
    },
    {
      "epoch": 1.301989150090416,
      "grad_norm": 0.4346073819877919,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 540
    },
    {
      "epoch": 1.3261000602772754,
      "grad_norm": 0.5696969325867375,
      "learning_rate": 5e-06,
      "loss": 0.6073,
      "step": 550
    },
    {
      "epoch": 1.350210970464135,
      "grad_norm": 0.6029521082479712,
      "learning_rate": 5e-06,
      "loss": 0.6045,
      "step": 560
    },
    {
      "epoch": 1.3743218806509945,
      "grad_norm": 0.5359000000861764,
      "learning_rate": 5e-06,
      "loss": 0.601,
      "step": 570
    },
    {
      "epoch": 1.3984327908378542,
      "grad_norm": 0.4280776424781654,
      "learning_rate": 5e-06,
      "loss": 0.6068,
      "step": 580
    },
    {
      "epoch": 1.4225437010247137,
      "grad_norm": 0.42975173635641,
      "learning_rate": 5e-06,
      "loss": 0.6058,
      "step": 590
    },
    {
      "epoch": 1.4466546112115732,
      "grad_norm": 0.4148935722421534,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 600
    },
    {
      "epoch": 1.4707655213984328,
      "grad_norm": 0.4346895040838288,
      "learning_rate": 5e-06,
      "loss": 0.6083,
      "step": 610
    },
    {
      "epoch": 1.4948764315852923,
      "grad_norm": 0.456872099031643,
      "learning_rate": 5e-06,
      "loss": 0.6101,
      "step": 620
    },
    {
      "epoch": 1.518987341772152,
      "grad_norm": 0.518636393965265,
      "learning_rate": 5e-06,
      "loss": 0.6107,
      "step": 630
    },
    {
      "epoch": 1.5430982519590115,
      "grad_norm": 0.4976317739138397,
      "learning_rate": 5e-06,
      "loss": 0.5995,
      "step": 640
    },
    {
      "epoch": 1.567209162145871,
      "grad_norm": 0.5121056663367101,
      "learning_rate": 5e-06,
      "loss": 0.6147,
      "step": 650
    },
    {
      "epoch": 1.5913200723327305,
      "grad_norm": 0.49181051844188867,
      "learning_rate": 5e-06,
      "loss": 0.6046,
      "step": 660
    },
    {
      "epoch": 1.61543098251959,
      "grad_norm": 0.4913489094366748,
      "learning_rate": 5e-06,
      "loss": 0.605,
      "step": 670
    },
    {
      "epoch": 1.6395418927064496,
      "grad_norm": 0.4360413141924259,
      "learning_rate": 5e-06,
      "loss": 0.603,
      "step": 680
    },
    {
      "epoch": 1.663652802893309,
      "grad_norm": 0.5553873036504335,
      "learning_rate": 5e-06,
      "loss": 0.6037,
      "step": 690
    },
    {
      "epoch": 1.6877637130801688,
      "grad_norm": 0.439159626571011,
      "learning_rate": 5e-06,
      "loss": 0.6037,
      "step": 700
    },
    {
      "epoch": 1.7118746232670283,
      "grad_norm": 0.5009323338564864,
      "learning_rate": 5e-06,
      "loss": 0.6046,
      "step": 710
    },
    {
      "epoch": 1.7359855334538878,
      "grad_norm": 0.49820787215486934,
      "learning_rate": 5e-06,
      "loss": 0.6057,
      "step": 720
    },
    {
      "epoch": 1.7600964436407476,
      "grad_norm": 0.553637472752945,
      "learning_rate": 5e-06,
      "loss": 0.6066,
      "step": 730
    },
    {
      "epoch": 1.784207353827607,
      "grad_norm": 0.44541140483577896,
      "learning_rate": 5e-06,
      "loss": 0.6,
      "step": 740
    },
    {
      "epoch": 1.8083182640144666,
      "grad_norm": 0.5310706794248644,
      "learning_rate": 5e-06,
      "loss": 0.6098,
      "step": 750
    },
    {
      "epoch": 1.8324291742013261,
      "grad_norm": 0.6630764624549126,
      "learning_rate": 5e-06,
      "loss": 0.6054,
      "step": 760
    },
    {
      "epoch": 1.8565400843881856,
      "grad_norm": 0.553711920694149,
      "learning_rate": 5e-06,
      "loss": 0.6037,
      "step": 770
    },
    {
      "epoch": 1.8806509945750451,
      "grad_norm": 0.566305473833487,
      "learning_rate": 5e-06,
      "loss": 0.6019,
      "step": 780
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.582333160680419,
      "learning_rate": 5e-06,
      "loss": 0.6051,
      "step": 790
    },
    {
      "epoch": 1.9288728149487642,
      "grad_norm": 0.509141986707748,
      "learning_rate": 5e-06,
      "loss": 0.6052,
      "step": 800
    },
    {
      "epoch": 1.952983725135624,
      "grad_norm": 0.4543923308424651,
      "learning_rate": 5e-06,
      "loss": 0.598,
      "step": 810
    },
    {
      "epoch": 1.9770946353224834,
      "grad_norm": 0.45958164108182104,
      "learning_rate": 5e-06,
      "loss": 0.6026,
      "step": 820
    },
    {
      "epoch": 1.998794454490657,
      "eval_loss": 0.6378007531166077,
      "eval_runtime": 222.8588,
      "eval_samples_per_second": 50.148,
      "eval_steps_per_second": 0.395,
      "step": 829
    },
    {
      "epoch": 2.001205545509343,
      "grad_norm": 0.7867156441363433,
      "learning_rate": 5e-06,
      "loss": 0.6043,
      "step": 830
    },
    {
      "epoch": 2.0253164556962027,
      "grad_norm": 0.5284619907031279,
      "learning_rate": 5e-06,
      "loss": 0.5707,
      "step": 840
    },
    {
      "epoch": 2.049427365883062,
      "grad_norm": 0.5795012320295118,
      "learning_rate": 5e-06,
      "loss": 0.5606,
      "step": 850
    },
    {
      "epoch": 2.0735382760699217,
      "grad_norm": 0.5627294692682645,
      "learning_rate": 5e-06,
      "loss": 0.563,
      "step": 860
    },
    {
      "epoch": 2.097649186256781,
      "grad_norm": 0.5052179539566712,
      "learning_rate": 5e-06,
      "loss": 0.5631,
      "step": 870
    },
    {
      "epoch": 2.1217600964436407,
      "grad_norm": 0.4428407773542258,
      "learning_rate": 5e-06,
      "loss": 0.5595,
      "step": 880
    },
    {
      "epoch": 2.1458710066305002,
      "grad_norm": 0.5267499633401915,
      "learning_rate": 5e-06,
      "loss": 0.5601,
      "step": 890
    },
    {
      "epoch": 2.1699819168173597,
      "grad_norm": 0.4655374512529405,
      "learning_rate": 5e-06,
      "loss": 0.5607,
      "step": 900
    },
    {
      "epoch": 2.1940928270042193,
      "grad_norm": 0.48398838396056276,
      "learning_rate": 5e-06,
      "loss": 0.5595,
      "step": 910
    },
    {
      "epoch": 2.2182037371910788,
      "grad_norm": 0.48096941817619093,
      "learning_rate": 5e-06,
      "loss": 0.5677,
      "step": 920
    },
    {
      "epoch": 2.2423146473779383,
      "grad_norm": 0.5154141010470734,
      "learning_rate": 5e-06,
      "loss": 0.5594,
      "step": 930
    },
    {
      "epoch": 2.2664255575647982,
      "grad_norm": 0.4799488446079912,
      "learning_rate": 5e-06,
      "loss": 0.5621,
      "step": 940
    },
    {
      "epoch": 2.2905364677516578,
      "grad_norm": 0.5540016498502853,
      "learning_rate": 5e-06,
      "loss": 0.5638,
      "step": 950
    },
    {
      "epoch": 2.3146473779385173,
      "grad_norm": 0.6082357481189948,
      "learning_rate": 5e-06,
      "loss": 0.564,
      "step": 960
    },
    {
      "epoch": 2.338758288125377,
      "grad_norm": 0.5420853183530063,
      "learning_rate": 5e-06,
      "loss": 0.5643,
      "step": 970
    },
    {
      "epoch": 2.3628691983122363,
      "grad_norm": 0.42570128293415416,
      "learning_rate": 5e-06,
      "loss": 0.5642,
      "step": 980
    },
    {
      "epoch": 2.386980108499096,
      "grad_norm": 0.5255517048498499,
      "learning_rate": 5e-06,
      "loss": 0.5672,
      "step": 990
    },
    {
      "epoch": 2.4110910186859553,
      "grad_norm": 0.5353694927594205,
      "learning_rate": 5e-06,
      "loss": 0.5565,
      "step": 1000
    },
    {
      "epoch": 2.435201928872815,
      "grad_norm": 0.4617633168683323,
      "learning_rate": 5e-06,
      "loss": 0.5657,
      "step": 1010
    },
    {
      "epoch": 2.4593128390596743,
      "grad_norm": 0.449869806649973,
      "learning_rate": 5e-06,
      "loss": 0.5586,
      "step": 1020
    },
    {
      "epoch": 2.483423749246534,
      "grad_norm": 0.5115337318725849,
      "learning_rate": 5e-06,
      "loss": 0.5576,
      "step": 1030
    },
    {
      "epoch": 2.507534659433394,
      "grad_norm": 0.6907411145245406,
      "learning_rate": 5e-06,
      "loss": 0.5681,
      "step": 1040
    },
    {
      "epoch": 2.5316455696202533,
      "grad_norm": 0.5238948140915647,
      "learning_rate": 5e-06,
      "loss": 0.5659,
      "step": 1050
    },
    {
      "epoch": 2.555756479807113,
      "grad_norm": 0.6589003211840228,
      "learning_rate": 5e-06,
      "loss": 0.5667,
      "step": 1060
    },
    {
      "epoch": 2.5798673899939724,
      "grad_norm": 0.4764556136945032,
      "learning_rate": 5e-06,
      "loss": 0.5629,
      "step": 1070
    },
    {
      "epoch": 2.603978300180832,
      "grad_norm": 0.44468254080490577,
      "learning_rate": 5e-06,
      "loss": 0.5687,
      "step": 1080
    },
    {
      "epoch": 2.6280892103676914,
      "grad_norm": 0.5124860341949249,
      "learning_rate": 5e-06,
      "loss": 0.5692,
      "step": 1090
    },
    {
      "epoch": 2.652200120554551,
      "grad_norm": 0.5228826110878407,
      "learning_rate": 5e-06,
      "loss": 0.5667,
      "step": 1100
    },
    {
      "epoch": 2.6763110307414104,
      "grad_norm": 0.5458373344595544,
      "learning_rate": 5e-06,
      "loss": 0.5617,
      "step": 1110
    },
    {
      "epoch": 2.70042194092827,
      "grad_norm": 0.43248189186264496,
      "learning_rate": 5e-06,
      "loss": 0.5589,
      "step": 1120
    },
    {
      "epoch": 2.7245328511151294,
      "grad_norm": 0.44951413853647815,
      "learning_rate": 5e-06,
      "loss": 0.5696,
      "step": 1130
    },
    {
      "epoch": 2.748643761301989,
      "grad_norm": 0.5059427152996532,
      "learning_rate": 5e-06,
      "loss": 0.5645,
      "step": 1140
    },
    {
      "epoch": 2.7727546714888485,
      "grad_norm": 0.4713166756254001,
      "learning_rate": 5e-06,
      "loss": 0.5659,
      "step": 1150
    },
    {
      "epoch": 2.7968655816757084,
      "grad_norm": 0.4662277376061737,
      "learning_rate": 5e-06,
      "loss": 0.5638,
      "step": 1160
    },
    {
      "epoch": 2.820976491862568,
      "grad_norm": 0.5055943494520574,
      "learning_rate": 5e-06,
      "loss": 0.5651,
      "step": 1170
    },
    {
      "epoch": 2.8450874020494274,
      "grad_norm": 0.49826856850045714,
      "learning_rate": 5e-06,
      "loss": 0.5664,
      "step": 1180
    },
    {
      "epoch": 2.869198312236287,
      "grad_norm": 0.46906591997365343,
      "learning_rate": 5e-06,
      "loss": 0.5708,
      "step": 1190
    },
    {
      "epoch": 2.8933092224231465,
      "grad_norm": 0.5743140790459712,
      "learning_rate": 5e-06,
      "loss": 0.5713,
      "step": 1200
    },
    {
      "epoch": 2.917420132610006,
      "grad_norm": 0.5413293244789124,
      "learning_rate": 5e-06,
      "loss": 0.566,
      "step": 1210
    },
    {
      "epoch": 2.9415310427968655,
      "grad_norm": 0.4769984493754597,
      "learning_rate": 5e-06,
      "loss": 0.5653,
      "step": 1220
    },
    {
      "epoch": 2.965641952983725,
      "grad_norm": 0.4784113431133355,
      "learning_rate": 5e-06,
      "loss": 0.5663,
      "step": 1230
    },
    {
      "epoch": 2.9897528631705845,
      "grad_norm": 0.46857130335535624,
      "learning_rate": 5e-06,
      "loss": 0.569,
      "step": 1240
    },
    {
      "epoch": 2.9945750452079567,
      "eval_loss": 0.6389562487602234,
      "eval_runtime": 223.5725,
      "eval_samples_per_second": 49.988,
      "eval_steps_per_second": 0.394,
      "step": 1242
    },
    {
      "epoch": 2.9945750452079567,
      "step": 1242,
      "total_flos": 2079977499525120.0,
      "train_loss": 0.615725677754376,
      "train_runtime": 37262.9121,
      "train_samples_per_second": 17.094,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 10,
  "max_steps": 1242,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2079977499525120.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}