|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 550, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01818181818181818, |
|
"grad_norm": 251.0, |
|
"learning_rate": 3.636363636363636e-06, |
|
"loss": 46.8778, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.09090909090909091, |
|
"grad_norm": 239.0, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 46.8224, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": 106.5, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 39.5877, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.2727272727272727, |
|
"grad_norm": 19.75, |
|
"learning_rate": 5.4545454545454546e-05, |
|
"loss": 28.2487, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 15.8125, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 23.0407, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 5.5, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 20.515, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 19.5202, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.6363636363636364, |
|
"grad_norm": 5.75, |
|
"learning_rate": 0.00012727272727272728, |
|
"loss": 18.2599, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 10.8125, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 16.923, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8181818181818182, |
|
"grad_norm": 21.25, |
|
"learning_rate": 0.00016363636363636366, |
|
"loss": 14.3261, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 29.625, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 9.0683, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 9.625, |
|
"learning_rate": 0.0002, |
|
"loss": 3.5798, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 4.059033393859863, |
|
"eval_runtime": 0.2418, |
|
"eval_samples_per_second": 41.356, |
|
"eval_steps_per_second": 4.136, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.0909090909090908, |
|
"grad_norm": 10.125, |
|
"learning_rate": 0.00019994965423831854, |
|
"loss": 2.5621, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.1818181818181819, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 0.00019979866764718843, |
|
"loss": 2.0701, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 1.2727272727272727, |
|
"grad_norm": 20.125, |
|
"learning_rate": 0.00019954719225730847, |
|
"loss": 1.8568, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00019919548128307954, |
|
"loss": 1.6836, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 1.4545454545454546, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00019874388886763944, |
|
"loss": 1.5757, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.5454545454545454, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00019819286972627066, |
|
"loss": 1.52, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 1.6363636363636362, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00019754297868854073, |
|
"loss": 1.4757, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.7272727272727273, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 0.00019679487013963564, |
|
"loss": 1.4221, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 0.00019594929736144976, |
|
"loss": 1.4012, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.9090909090909092, |
|
"grad_norm": 1.671875, |
|
"learning_rate": 0.00019500711177409454, |
|
"loss": 1.3672, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00019396926207859084, |
|
"loss": 1.346, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.6943764686584473, |
|
"eval_runtime": 0.2333, |
|
"eval_samples_per_second": 42.871, |
|
"eval_steps_per_second": 4.287, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.090909090909091, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.00019283679330160726, |
|
"loss": 1.3092, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 2.1818181818181817, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 0.00019161084574320696, |
|
"loss": 1.3052, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00019029265382866214, |
|
"loss": 1.2724, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 2.3636363636363638, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00018888354486549237, |
|
"loss": 1.2612, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.4545454545454546, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.00018738493770697852, |
|
"loss": 1.2532, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 2.5454545454545454, |
|
"grad_norm": 4.25, |
|
"learning_rate": 0.00018579834132349772, |
|
"loss": 1.2422, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.6363636363636362, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00018412535328311814, |
|
"loss": 1.2256, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 0.0001823676581429833, |
|
"loss": 1.2271, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.8181818181818183, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00018052702575310588, |
|
"loss": 1.2114, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 2.909090909090909, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00017860530947427875, |
|
"loss": 1.2068, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 10.6875, |
|
"learning_rate": 0.0001766044443118978, |
|
"loss": 1.1944, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 2.6026840209960938, |
|
"eval_runtime": 0.2422, |
|
"eval_samples_per_second": 41.281, |
|
"eval_steps_per_second": 4.128, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.090909090909091, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 0.0001745264449675755, |
|
"loss": 1.194, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.1818181818181817, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 0.00017237340381050703, |
|
"loss": 1.1668, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 3.2727272727272725, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00017014748877063214, |
|
"loss": 1.1693, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.3636363636363638, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 0.00016785094115571322, |
|
"loss": 1.167, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 3.4545454545454546, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 0.00016548607339452853, |
|
"loss": 1.1601, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.5454545454545454, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00016305526670845226, |
|
"loss": 1.1343, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00016056096871376667, |
|
"loss": 1.1379, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.7272727272727275, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00015800569095711982, |
|
"loss": 1.1339, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 3.8181818181818183, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.00015539200638661104, |
|
"loss": 1.1393, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.909090909090909, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00015272254676105025, |
|
"loss": 1.1015, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.1119, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.5778679847717285, |
|
"eval_runtime": 0.2336, |
|
"eval_samples_per_second": 42.816, |
|
"eval_steps_per_second": 4.282, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.090909090909091, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 0.0001472271074772683, |
|
"loss": 1.113, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 4.181818181818182, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00014440666126057744, |
|
"loss": 1.0963, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.2727272727272725, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 1.104, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 4.363636363636363, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00013863451256931287, |
|
"loss": 1.0957, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.454545454545454, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 0.00013568862215918717, |
|
"loss": 1.0835, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00013270679633174218, |
|
"loss": 1.0802, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.636363636363637, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 0.0001296920375328275, |
|
"loss": 1.0762, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 4.7272727272727275, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00012664738136900348, |
|
"loss": 1.0773, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.818181818181818, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00012357589355094275, |
|
"loss": 1.082, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 4.909090909090909, |
|
"grad_norm": 1.7578125, |
|
"learning_rate": 0.00012048066680651908, |
|
"loss": 1.0687, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 1.0741, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.5495071411132812, |
|
"eval_runtime": 0.2341, |
|
"eval_samples_per_second": 42.713, |
|
"eval_steps_per_second": 4.271, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 5.090909090909091, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 1.0546, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.181818181818182, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00011108381999010111, |
|
"loss": 1.0566, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 5.2727272727272725, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00010792499568567884, |
|
"loss": 1.0477, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 5.363636363636363, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00010475819158237425, |
|
"loss": 1.0483, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 5.454545454545454, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00010158659638348081, |
|
"loss": 1.0542, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.545454545454545, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 9.84134036165192e-05, |
|
"loss": 1.0437, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 5.636363636363637, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 9.524180841762577e-05, |
|
"loss": 1.0542, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 5.7272727272727275, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 9.207500431432115e-05, |
|
"loss": 1.0424, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 5.818181818181818, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 8.891618000989891e-05, |
|
"loss": 1.0356, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.909090909090909, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.57685161726715e-05, |
|
"loss": 1.0432, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 8.263518223330697e-05, |
|
"loss": 1.0435, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.5500569343566895, |
|
"eval_runtime": 0.2352, |
|
"eval_samples_per_second": 42.524, |
|
"eval_steps_per_second": 4.252, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.090909090909091, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 7.951933319348095e-05, |
|
"loss": 1.0185, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 6.181818181818182, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 7.642410644905726e-05, |
|
"loss": 1.0249, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.2727272727272725, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 7.335261863099651e-05, |
|
"loss": 1.0206, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 6.363636363636363, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 7.030796246717255e-05, |
|
"loss": 1.0209, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 6.454545454545454, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 6.729320366825784e-05, |
|
"loss": 1.0285, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 6.545454545454545, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.431137784081282e-05, |
|
"loss": 1.0206, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 6.636363636363637, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 6.136548743068713e-05, |
|
"loss": 1.0245, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 6.7272727272727275, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 5.845849869981137e-05, |
|
"loss": 1.0174, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 6.818181818181818, |
|
"grad_norm": 0.75, |
|
"learning_rate": 5.559333873942259e-05, |
|
"loss": 1.0206, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 6.909090909090909, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 5.277289252273174e-05, |
|
"loss": 1.0244, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 1.0191, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 2.5536184310913086, |
|
"eval_runtime": 0.2341, |
|
"eval_samples_per_second": 42.722, |
|
"eval_steps_per_second": 4.272, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 7.090909090909091, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.727745323894976e-05, |
|
"loss": 1.011, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 7.181818181818182, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 4.4607993613388976e-05, |
|
"loss": 1.0087, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 7.2727272727272725, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 4.19943090428802e-05, |
|
"loss": 1.0019, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.363636363636363, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.943903128623335e-05, |
|
"loss": 1.0112, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 7.454545454545454, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 3.694473329154778e-05, |
|
"loss": 1.0075, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 7.545454545454545, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 3.45139266054715e-05, |
|
"loss": 1.0023, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 7.636363636363637, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.21490588442868e-05, |
|
"loss": 1.0112, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 7.7272727272727275, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 2.9852511229367865e-05, |
|
"loss": 1.0069, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 7.818181818181818, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 2.7626596189492983e-05, |
|
"loss": 1.002, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 7.909090909090909, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 2.5473555032424533e-05, |
|
"loss": 1.0076, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 2.339555568810221e-05, |
|
"loss": 0.9965, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.5604465007781982, |
|
"eval_runtime": 0.24, |
|
"eval_samples_per_second": 41.665, |
|
"eval_steps_per_second": 4.167, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.090909090909092, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 2.139469052572127e-05, |
|
"loss": 1.0043, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 8.181818181818182, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 1.947297424689414e-05, |
|
"loss": 0.9997, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 8.272727272727273, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.763234185701673e-05, |
|
"loss": 0.9942, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 8.363636363636363, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.587464671688187e-05, |
|
"loss": 0.9952, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 8.454545454545455, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 1.4201658676502294e-05, |
|
"loss": 1.0037, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 8.545454545454545, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 1.2615062293021507e-05, |
|
"loss": 1.0051, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 8.636363636363637, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 1.1116455134507664e-05, |
|
"loss": 0.9905, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 8.727272727272727, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 9.707346171337894e-06, |
|
"loss": 0.9977, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 8.818181818181818, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 8.38915425679304e-06, |
|
"loss": 0.9894, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 8.909090909090908, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 7.163206698392744e-06, |
|
"loss": 1.005, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 6.030737921409169e-06, |
|
"loss": 0.9986, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 2.5596535205841064, |
|
"eval_runtime": 0.2334, |
|
"eval_samples_per_second": 42.846, |
|
"eval_steps_per_second": 4.285, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 9.090909090909092, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 4.992888225905468e-06, |
|
"loss": 0.9957, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.181818181818182, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 4.050702638550275e-06, |
|
"loss": 1.0036, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 9.272727272727273, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 3.2051298603643753e-06, |
|
"loss": 0.9985, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 9.363636363636363, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 2.4570213114592954e-06, |
|
"loss": 0.9961, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 9.454545454545455, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 1.8071302737293295e-06, |
|
"loss": 1.0066, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 9.545454545454545, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 1.2561111323605712e-06, |
|
"loss": 0.9996, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 9.636363636363637, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 8.04518716920466e-07, |
|
"loss": 0.9941, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 9.727272727272727, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 4.5280774269154115e-07, |
|
"loss": 0.9898, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 9.818181818181818, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 2.0133235281156736e-07, |
|
"loss": 0.9888, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 9.909090909090908, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 5.0345761681491746e-08, |
|
"loss": 0.9953, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.0, |
|
"loss": 0.9948, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.5602283477783203, |
|
"eval_runtime": 0.2428, |
|
"eval_samples_per_second": 41.182, |
|
"eval_steps_per_second": 4.118, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 550, |
|
"total_flos": 1.6777423328808796e+18, |
|
"train_loss": 3.202145513187755, |
|
"train_runtime": 1331.6624, |
|
"train_samples_per_second": 26.313, |
|
"train_steps_per_second": 0.413 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 550, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 1.6777423328808796e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|