|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 344, |
|
"global_step": 5510, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0014519056261343012, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.6156, |
|
"num_input_tokens_seen": 499226, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0029038112522686023, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.0994, |
|
"num_input_tokens_seen": 1014244, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.004355716878402904, |
|
"grad_norm": 2.125, |
|
"learning_rate": 4e-05, |
|
"loss": 0.0849, |
|
"num_input_tokens_seen": 1528464, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.005807622504537205, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 3.999979012178918e-05, |
|
"loss": 0.12, |
|
"num_input_tokens_seen": 2041011, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.007259528130671506, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 3.9999160491561583e-05, |
|
"loss": 0.1437, |
|
"num_input_tokens_seen": 2530185, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.008711433756805808, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 3.9998111122531796e-05, |
|
"loss": 0.0898, |
|
"num_input_tokens_seen": 3017273, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.010163339382940109, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 3.999664203672378e-05, |
|
"loss": 0.1247, |
|
"num_input_tokens_seen": 3507672, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.01161524500907441, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.999475326497044e-05, |
|
"loss": 0.0819, |
|
"num_input_tokens_seen": 4018539, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.013067150635208712, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 3.999244484691299e-05, |
|
"loss": 0.1078, |
|
"num_input_tokens_seen": 4525857, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.014519056261343012, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 3.998971683100009e-05, |
|
"loss": 0.099, |
|
"num_input_tokens_seen": 5023032, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.015970961887477313, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.9986569274486843e-05, |
|
"loss": 0.0855, |
|
"num_input_tokens_seen": 5524113, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.017422867513611617, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.9983002243433615e-05, |
|
"loss": 0.1026, |
|
"num_input_tokens_seen": 5999882, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.018874773139745917, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 3.9979015812704605e-05, |
|
"loss": 0.0843, |
|
"num_input_tokens_seen": 6471878, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.020326678765880218, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 3.997461006596631e-05, |
|
"loss": 0.0841, |
|
"num_input_tokens_seen": 6944973, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.021778584392014518, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 3.9969785095685765e-05, |
|
"loss": 0.0982, |
|
"num_input_tokens_seen": 7460215, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.02323049001814882, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.996454100312857e-05, |
|
"loss": 0.0971, |
|
"num_input_tokens_seen": 7942417, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.024682395644283123, |
|
"grad_norm": 82.0, |
|
"learning_rate": 3.9958877898356806e-05, |
|
"loss": 0.2563, |
|
"num_input_tokens_seen": 8454243, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.026134301270417423, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.99527959002267e-05, |
|
"loss": 0.1566, |
|
"num_input_tokens_seen": 8973734, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.027586206896551724, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 3.994629513638614e-05, |
|
"loss": 0.1109, |
|
"num_input_tokens_seen": 9497439, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.029038112522686024, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 3.993937574327201e-05, |
|
"loss": 0.1353, |
|
"num_input_tokens_seen": 9988636, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.030490018148820328, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 3.993203786610727e-05, |
|
"loss": 0.1002, |
|
"num_input_tokens_seen": 10460548, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.031941923774954625, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.992428165889799e-05, |
|
"loss": 0.0952, |
|
"num_input_tokens_seen": 10983644, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.033393829401088926, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.991610728443006e-05, |
|
"loss": 0.1082, |
|
"num_input_tokens_seen": 11485663, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.03484573502722323, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 3.9907514914265776e-05, |
|
"loss": 0.0907, |
|
"num_input_tokens_seen": 11981340, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.036297640653357534, |
|
"grad_norm": 12.0625, |
|
"learning_rate": 3.989850472874027e-05, |
|
"loss": 0.0704, |
|
"num_input_tokens_seen": 12482463, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.037749546279491834, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.988907691695771e-05, |
|
"loss": 0.0847, |
|
"num_input_tokens_seen": 12968571, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.039201451905626135, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 3.987923167678732e-05, |
|
"loss": 0.0968, |
|
"num_input_tokens_seen": 13451536, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.040653357531760435, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 3.986896921485924e-05, |
|
"loss": 0.1026, |
|
"num_input_tokens_seen": 13949131, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.042105263157894736, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.9858289746560183e-05, |
|
"loss": 0.1126, |
|
"num_input_tokens_seen": 14447251, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.043557168784029036, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.984719349602892e-05, |
|
"loss": 0.0934, |
|
"num_input_tokens_seen": 14937783, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.04500907441016334, |
|
"grad_norm": 1.75, |
|
"learning_rate": 3.983568069615157e-05, |
|
"loss": 0.0936, |
|
"num_input_tokens_seen": 15429323, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.04646098003629764, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.982375158855672e-05, |
|
"loss": 0.0749, |
|
"num_input_tokens_seen": 15920688, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.047912885662431945, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 3.981140642361034e-05, |
|
"loss": 0.0868, |
|
"num_input_tokens_seen": 16393398, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.049364791288566245, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 3.9798645460410544e-05, |
|
"loss": 0.0997, |
|
"num_input_tokens_seen": 16894283, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.050816696914700546, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 3.9785468966782155e-05, |
|
"loss": 0.0849, |
|
"num_input_tokens_seen": 17371830, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.052268602540834846, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 3.9771877219271055e-05, |
|
"loss": 0.0925, |
|
"num_input_tokens_seen": 17893827, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.05372050816696915, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 3.975787050313841e-05, |
|
"loss": 0.0822, |
|
"num_input_tokens_seen": 18380621, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.05517241379310345, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 3.9743449112354676e-05, |
|
"loss": 0.1172, |
|
"num_input_tokens_seen": 18905348, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.05662431941923775, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.9728613349593415e-05, |
|
"loss": 0.1075, |
|
"num_input_tokens_seen": 19399905, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.05807622504537205, |
|
"grad_norm": 18.25, |
|
"learning_rate": 3.971336352622496e-05, |
|
"loss": 0.1882, |
|
"num_input_tokens_seen": 19921923, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.05952813067150635, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 3.969769996230989e-05, |
|
"loss": 0.1074, |
|
"num_input_tokens_seen": 20436822, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.060980036297640657, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 3.968162298659227e-05, |
|
"loss": 0.1112, |
|
"num_input_tokens_seen": 20943888, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.06243194192377496, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 3.9665132936492794e-05, |
|
"loss": 0.1519, |
|
"num_input_tokens_seen": 21418243, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.06243194192377496, |
|
"eval_loss": 0.11010845005512238, |
|
"eval_runtime": 2622.9951, |
|
"eval_samples_per_second": 1.188, |
|
"eval_steps_per_second": 0.149, |
|
"num_input_tokens_seen": 21418243, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.06388384754990925, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 3.9648230158101674e-05, |
|
"loss": 0.123, |
|
"num_input_tokens_seen": 21924518, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.06533575317604355, |
|
"grad_norm": 1.5625, |
|
"learning_rate": 3.9630915006171416e-05, |
|
"loss": 0.1086, |
|
"num_input_tokens_seen": 22403227, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.06678765880217785, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 3.961318784410932e-05, |
|
"loss": 0.1068, |
|
"num_input_tokens_seen": 22901361, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.06823956442831217, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 3.95950490439699e-05, |
|
"loss": 0.0931, |
|
"num_input_tokens_seen": 23408098, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.06969147005444647, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 3.9576498986447026e-05, |
|
"loss": 0.0817, |
|
"num_input_tokens_seen": 23890867, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.07114337568058077, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.9557538060866005e-05, |
|
"loss": 0.0917, |
|
"num_input_tokens_seen": 24393313, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.07259528130671507, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.9538166665175354e-05, |
|
"loss": 0.0865, |
|
"num_input_tokens_seen": 24894282, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07404718693284937, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 3.9518385205938446e-05, |
|
"loss": 0.1222, |
|
"num_input_tokens_seen": 25397169, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.07549909255898367, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 3.949819409832502e-05, |
|
"loss": 0.0899, |
|
"num_input_tokens_seen": 25894407, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.07695099818511797, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.947759376610242e-05, |
|
"loss": 0.0716, |
|
"num_input_tokens_seen": 26375741, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.07840290381125227, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.945658464162674e-05, |
|
"loss": 0.1094, |
|
"num_input_tokens_seen": 26881148, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.07985480943738657, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 3.9435167165833724e-05, |
|
"loss": 0.1517, |
|
"num_input_tokens_seen": 27373108, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.08130671506352087, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 3.9413341788229524e-05, |
|
"loss": 0.0959, |
|
"num_input_tokens_seen": 27852888, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.08275862068965517, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 3.939110896688126e-05, |
|
"loss": 0.0824, |
|
"num_input_tokens_seen": 28338065, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.08421052631578947, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 3.93684691684074e-05, |
|
"loss": 0.1234, |
|
"num_input_tokens_seen": 28842856, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.08566243194192377, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 3.9345422867967995e-05, |
|
"loss": 0.1118, |
|
"num_input_tokens_seen": 29349096, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.08711433756805807, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 3.9321970549254664e-05, |
|
"loss": 0.1055, |
|
"num_input_tokens_seen": 29826034, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.08856624319419237, |
|
"grad_norm": 18.75, |
|
"learning_rate": 3.929811270448049e-05, |
|
"loss": 0.1166, |
|
"num_input_tokens_seen": 30321718, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.09001814882032667, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 3.927384983436964e-05, |
|
"loss": 0.1134, |
|
"num_input_tokens_seen": 30812607, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.09147005444646097, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 3.924918244814689e-05, |
|
"loss": 0.0805, |
|
"num_input_tokens_seen": 31304931, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.09292196007259527, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.922411106352694e-05, |
|
"loss": 0.0849, |
|
"num_input_tokens_seen": 31792831, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.09437386569872959, |
|
"grad_norm": 1.375, |
|
"learning_rate": 3.9198636206703516e-05, |
|
"loss": 0.0919, |
|
"num_input_tokens_seen": 32286282, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.09582577132486389, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 3.9172758412338346e-05, |
|
"loss": 0.0896, |
|
"num_input_tokens_seen": 32770941, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.09727767695099819, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 3.9146478223549974e-05, |
|
"loss": 0.0925, |
|
"num_input_tokens_seen": 33253136, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.09872958257713249, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 3.9119796191902274e-05, |
|
"loss": 0.0656, |
|
"num_input_tokens_seen": 33760146, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.10018148820326679, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 3.9092712877392965e-05, |
|
"loss": 0.1162, |
|
"num_input_tokens_seen": 34251987, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.10163339382940109, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.906522884844181e-05, |
|
"loss": 0.1153, |
|
"num_input_tokens_seen": 34730598, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.10308529945553539, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 3.903734468187868e-05, |
|
"loss": 0.0731, |
|
"num_input_tokens_seen": 35215481, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.10453720508166969, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.900906096293148e-05, |
|
"loss": 0.0992, |
|
"num_input_tokens_seen": 35691971, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.105989110707804, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 3.8980378285213846e-05, |
|
"loss": 0.1058, |
|
"num_input_tokens_seen": 36191442, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.1074410163339383, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.895129725071268e-05, |
|
"loss": 0.0841, |
|
"num_input_tokens_seen": 36677760, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.1088929219600726, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.892181846977553e-05, |
|
"loss": 0.096, |
|
"num_input_tokens_seen": 37169594, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1103448275862069, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.8891942561097787e-05, |
|
"loss": 0.0865, |
|
"num_input_tokens_seen": 37658243, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.1117967332123412, |
|
"grad_norm": 3.40625, |
|
"learning_rate": 3.8861670151709664e-05, |
|
"loss": 0.0926, |
|
"num_input_tokens_seen": 38172841, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.1132486388384755, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 3.883100187696308e-05, |
|
"loss": 0.0844, |
|
"num_input_tokens_seen": 38680418, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.1147005444646098, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 3.87999383805183e-05, |
|
"loss": 0.0889, |
|
"num_input_tokens_seen": 39168241, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.1161524500907441, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 3.876848031433042e-05, |
|
"loss": 0.0931, |
|
"num_input_tokens_seen": 39636702, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1176043557168784, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 3.8736628338635716e-05, |
|
"loss": 0.0638, |
|
"num_input_tokens_seen": 40118232, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.1190562613430127, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 3.870438312193774e-05, |
|
"loss": 0.0775, |
|
"num_input_tokens_seen": 40614511, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.120508166969147, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.8671745340993354e-05, |
|
"loss": 0.0902, |
|
"num_input_tokens_seen": 41136221, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.12196007259528131, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.863871568079845e-05, |
|
"loss": 0.1083, |
|
"num_input_tokens_seen": 41626515, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.12341197822141561, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 3.860529483457362e-05, |
|
"loss": 0.0914, |
|
"num_input_tokens_seen": 42128107, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.12486388384754991, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 3.8571483503749625e-05, |
|
"loss": 0.1172, |
|
"num_input_tokens_seen": 42626752, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.12486388384754991, |
|
"eval_loss": 0.08887020498514175, |
|
"eval_runtime": 2566.1938, |
|
"eval_samples_per_second": 1.215, |
|
"eval_steps_per_second": 0.152, |
|
"num_input_tokens_seen": 42626752, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.12631578947368421, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.8537282397952604e-05, |
|
"loss": 0.0873, |
|
"num_input_tokens_seen": 43128274, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.1277676950998185, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 3.8502692234989265e-05, |
|
"loss": 0.0807, |
|
"num_input_tokens_seen": 43630580, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.12921960072595282, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 3.846771374083175e-05, |
|
"loss": 0.0792, |
|
"num_input_tokens_seen": 44143904, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.1306715063520871, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 3.843234764960244e-05, |
|
"loss": 0.0808, |
|
"num_input_tokens_seen": 44635682, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.13212341197822142, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 3.839659470355853e-05, |
|
"loss": 0.0902, |
|
"num_input_tokens_seen": 45110870, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.1335753176043557, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 3.8360455653076446e-05, |
|
"loss": 0.0872, |
|
"num_input_tokens_seen": 45620246, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.13502722323049002, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 3.832393125663613e-05, |
|
"loss": 0.1095, |
|
"num_input_tokens_seen": 46106634, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.13647912885662433, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.8287022280805064e-05, |
|
"loss": 0.1008, |
|
"num_input_tokens_seen": 46599497, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.824972950022224e-05, |
|
"loss": 0.0761, |
|
"num_input_tokens_seen": 47098121, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.13938294010889293, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.8212053697581855e-05, |
|
"loss": 0.0864, |
|
"num_input_tokens_seen": 47599433, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.14083484573502722, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 3.817399566361692e-05, |
|
"loss": 0.0756, |
|
"num_input_tokens_seen": 48099996, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.14228675136116153, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.8135556197082647e-05, |
|
"loss": 0.0991, |
|
"num_input_tokens_seen": 48591151, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.14373865698729582, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.809673610473967e-05, |
|
"loss": 0.0859, |
|
"num_input_tokens_seen": 49119581, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.14519056261343014, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 3.805753620133715e-05, |
|
"loss": 0.0938, |
|
"num_input_tokens_seen": 49589057, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.14664246823956442, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 3.801795730959565e-05, |
|
"loss": 0.0657, |
|
"num_input_tokens_seen": 50091363, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.14809437386569874, |
|
"grad_norm": 1.5, |
|
"learning_rate": 3.7978000260189854e-05, |
|
"loss": 0.1124, |
|
"num_input_tokens_seen": 50595440, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.14954627949183302, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 3.793766589173117e-05, |
|
"loss": 0.0969, |
|
"num_input_tokens_seen": 51097536, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.15099818511796734, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 3.789695505075013e-05, |
|
"loss": 0.0815, |
|
"num_input_tokens_seen": 51592933, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.15245009074410162, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 3.785586859167855e-05, |
|
"loss": 0.0806, |
|
"num_input_tokens_seen": 52089163, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.15390199637023594, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.78144073768317e-05, |
|
"loss": 0.0628, |
|
"num_input_tokens_seen": 52591035, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.15535390199637023, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.7772572276390125e-05, |
|
"loss": 0.1, |
|
"num_input_tokens_seen": 53108139, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.15680580762250454, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.7730364168381444e-05, |
|
"loss": 0.1083, |
|
"num_input_tokens_seen": 53612734, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.15825771324863883, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 3.768778393866186e-05, |
|
"loss": 0.0782, |
|
"num_input_tokens_seen": 54104981, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.15970961887477314, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 3.764483248089763e-05, |
|
"loss": 0.1166, |
|
"num_input_tokens_seen": 54591628, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.16116152450090745, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 3.760151069654626e-05, |
|
"loss": 0.0958, |
|
"num_input_tokens_seen": 55092240, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.16261343012704174, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.75578194948376e-05, |
|
"loss": 0.0904, |
|
"num_input_tokens_seen": 55596058, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.16406533575317606, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.751375979275479e-05, |
|
"loss": 0.0816, |
|
"num_input_tokens_seen": 56065485, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.16551724137931034, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.746933251501497e-05, |
|
"loss": 0.0729, |
|
"num_input_tokens_seen": 56559741, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.16696914700544466, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.7424538594049886e-05, |
|
"loss": 0.0626, |
|
"num_input_tokens_seen": 57042468, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.16842105263157894, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.737937896998634e-05, |
|
"loss": 0.0872, |
|
"num_input_tokens_seen": 57530081, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.16987295825771326, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 3.733385459062645e-05, |
|
"loss": 0.0863, |
|
"num_input_tokens_seen": 58052036, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.17132486388384754, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.728796641142775e-05, |
|
"loss": 0.0747, |
|
"num_input_tokens_seen": 58558654, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.17277676950998186, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 3.724171539548311e-05, |
|
"loss": 0.0946, |
|
"num_input_tokens_seen": 59069780, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.17422867513611615, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 3.71951025135006e-05, |
|
"loss": 0.0707, |
|
"num_input_tokens_seen": 59546270, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.17568058076225046, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.714812874378305e-05, |
|
"loss": 0.0796, |
|
"num_input_tokens_seen": 60050879, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.17713248638838475, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 3.710079507220751e-05, |
|
"loss": 0.0908, |
|
"num_input_tokens_seen": 60542881, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.17858439201451906, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 3.705310249220463e-05, |
|
"loss": 0.0799, |
|
"num_input_tokens_seen": 61009270, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.18003629764065335, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.700505200473774e-05, |
|
"loss": 0.0937, |
|
"num_input_tokens_seen": 61499242, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.18148820326678766, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 3.695664461828187e-05, |
|
"loss": 0.0913, |
|
"num_input_tokens_seen": 61987954, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.18294010889292195, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.69078813488026e-05, |
|
"loss": 0.0546, |
|
"num_input_tokens_seen": 62482644, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.18439201451905626, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 3.68587632197347e-05, |
|
"loss": 0.0788, |
|
"num_input_tokens_seen": 62950426, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.18584392014519055, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 3.6809291261960655e-05, |
|
"loss": 0.0865, |
|
"num_input_tokens_seen": 63454867, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.18729582577132486, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 3.675946651378909e-05, |
|
"loss": 0.0832, |
|
"num_input_tokens_seen": 63980224, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.18729582577132486, |
|
"eval_loss": 0.07875645905733109, |
|
"eval_runtime": 2702.6122, |
|
"eval_samples_per_second": 1.153, |
|
"eval_steps_per_second": 0.144, |
|
"num_input_tokens_seen": 63980224, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.18874773139745918, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 3.67092900209329e-05, |
|
"loss": 0.0831, |
|
"num_input_tokens_seen": 64445080, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.19019963702359347, |
|
"grad_norm": 1.25, |
|
"learning_rate": 3.665876283648732e-05, |
|
"loss": 0.0697, |
|
"num_input_tokens_seen": 64941877, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.19165154264972778, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 3.660788602090788e-05, |
|
"loss": 0.0845, |
|
"num_input_tokens_seen": 65451057, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.19310344827586207, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.655666064198807e-05, |
|
"loss": 0.0822, |
|
"num_input_tokens_seen": 65944830, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.19455535390199638, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.6505087774836977e-05, |
|
"loss": 0.0974, |
|
"num_input_tokens_seen": 66458462, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 0.19600725952813067, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.645316850185672e-05, |
|
"loss": 0.0907, |
|
"num_input_tokens_seen": 66955532, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.19745916515426498, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 3.6400903912719696e-05, |
|
"loss": 0.0791, |
|
"num_input_tokens_seen": 67453162, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 0.19891107078039927, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.6348295104345764e-05, |
|
"loss": 0.0593, |
|
"num_input_tokens_seen": 67939256, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 0.20036297640653358, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.629534318087918e-05, |
|
"loss": 0.1024, |
|
"num_input_tokens_seen": 68457767, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.20181488203266787, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 3.624204925366543e-05, |
|
"loss": 0.0621, |
|
"num_input_tokens_seen": 68964063, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 0.20326678765880218, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 3.618841444122794e-05, |
|
"loss": 0.0685, |
|
"num_input_tokens_seen": 69443542, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.20471869328493647, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 3.613443986924455e-05, |
|
"loss": 0.0866, |
|
"num_input_tokens_seen": 69941074, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 0.20617059891107078, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.60801266705239e-05, |
|
"loss": 0.0873, |
|
"num_input_tokens_seen": 70410725, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 0.20762250453720507, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 3.6025475984981716e-05, |
|
"loss": 0.0767, |
|
"num_input_tokens_seen": 70885703, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 0.20907441016333939, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.59704889596168e-05, |
|
"loss": 0.08, |
|
"num_input_tokens_seen": 71379385, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 0.21052631578947367, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 3.5915166748486984e-05, |
|
"loss": 0.0974, |
|
"num_input_tokens_seen": 71863351, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.211978221415608, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.585951051268496e-05, |
|
"loss": 0.0799, |
|
"num_input_tokens_seen": 72351447, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 0.21343012704174227, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 3.5803521420313836e-05, |
|
"loss": 0.0598, |
|
"num_input_tokens_seen": 72853284, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 0.2148820326678766, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 3.574720064646267e-05, |
|
"loss": 0.1021, |
|
"num_input_tokens_seen": 73354953, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 0.2163339382940109, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 3.5690549373181785e-05, |
|
"loss": 0.0749, |
|
"num_input_tokens_seen": 73851645, |
|
"step": 1192 |
|
}, |
|
{ |
|
"epoch": 0.2177858439201452, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 3.563356878945797e-05, |
|
"loss": 0.0677, |
|
"num_input_tokens_seen": 74351802, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2192377495462795, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.557626009118951e-05, |
|
"loss": 0.0632, |
|
"num_input_tokens_seen": 74849173, |
|
"step": 1208 |
|
}, |
|
{ |
|
"epoch": 0.2206896551724138, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 3.551862448116113e-05, |
|
"loss": 0.1037, |
|
"num_input_tokens_seen": 75333244, |
|
"step": 1216 |
|
}, |
|
{ |
|
"epoch": 0.2221415607985481, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 3.546066316901869e-05, |
|
"loss": 0.0675, |
|
"num_input_tokens_seen": 75799822, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 0.2235934664246824, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 3.540237737124384e-05, |
|
"loss": 0.0684, |
|
"num_input_tokens_seen": 76300896, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 0.2250453720508167, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.534376831112848e-05, |
|
"loss": 0.0757, |
|
"num_input_tokens_seen": 76787655, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.226497277676951, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.528483721874907e-05, |
|
"loss": 0.0651, |
|
"num_input_tokens_seen": 77298718, |
|
"step": 1248 |
|
}, |
|
{ |
|
"epoch": 0.2279491833030853, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 3.522558533094084e-05, |
|
"loss": 0.0863, |
|
"num_input_tokens_seen": 77797727, |
|
"step": 1256 |
|
}, |
|
{ |
|
"epoch": 0.2294010889292196, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 3.51660138912718e-05, |
|
"loss": 0.0885, |
|
"num_input_tokens_seen": 78292669, |
|
"step": 1264 |
|
}, |
|
{ |
|
"epoch": 0.2308529945553539, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 3.510612415001668e-05, |
|
"loss": 0.0892, |
|
"num_input_tokens_seen": 78800617, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 0.2323049001814882, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.5045917364130644e-05, |
|
"loss": 0.0527, |
|
"num_input_tokens_seen": 79317483, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.2337568058076225, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 3.4985394797222954e-05, |
|
"loss": 0.0587, |
|
"num_input_tokens_seen": 79807917, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 0.2352087114337568, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 3.49245577195304e-05, |
|
"loss": 0.0546, |
|
"num_input_tokens_seen": 80289419, |
|
"step": 1296 |
|
}, |
|
{ |
|
"epoch": 0.2366606170598911, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.4863407407890696e-05, |
|
"loss": 0.0982, |
|
"num_input_tokens_seen": 80784249, |
|
"step": 1304 |
|
}, |
|
{ |
|
"epoch": 0.2381125226860254, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 3.480194514571564e-05, |
|
"loss": 0.0965, |
|
"num_input_tokens_seen": 81278666, |
|
"step": 1312 |
|
}, |
|
{ |
|
"epoch": 0.2395644283121597, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 3.474017222296419e-05, |
|
"loss": 0.0984, |
|
"num_input_tokens_seen": 81786558, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.241016333938294, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 3.4678089936115395e-05, |
|
"loss": 0.1122, |
|
"num_input_tokens_seen": 82281843, |
|
"step": 1328 |
|
}, |
|
{ |
|
"epoch": 0.2424682395644283, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 3.461569958814119e-05, |
|
"loss": 0.0745, |
|
"num_input_tokens_seen": 82776869, |
|
"step": 1336 |
|
}, |
|
{ |
|
"epoch": 0.24392014519056263, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 3.455300248847903e-05, |
|
"loss": 0.1094, |
|
"num_input_tokens_seen": 83275171, |
|
"step": 1344 |
|
}, |
|
{ |
|
"epoch": 0.2453720508166969, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 3.448999995300443e-05, |
|
"loss": 0.0663, |
|
"num_input_tokens_seen": 83755833, |
|
"step": 1352 |
|
}, |
|
{ |
|
"epoch": 0.24682395644283123, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 3.4426693304003324e-05, |
|
"loss": 0.0879, |
|
"num_input_tokens_seen": 84237888, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.2482758620689655, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.4363083870144346e-05, |
|
"loss": 0.0661, |
|
"num_input_tokens_seen": 84739837, |
|
"step": 1368 |
|
}, |
|
{ |
|
"epoch": 0.24972776769509983, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.4299172986450906e-05, |
|
"loss": 0.0764, |
|
"num_input_tokens_seen": 85221444, |
|
"step": 1376 |
|
}, |
|
{ |
|
"epoch": 0.24972776769509983, |
|
"eval_loss": 0.08076217025518417, |
|
"eval_runtime": 2579.1691, |
|
"eval_samples_per_second": 1.209, |
|
"eval_steps_per_second": 0.151, |
|
"num_input_tokens_seen": 85221444, |
|
"step": 1376 |
|
}, |
|
{ |
|
"epoch": 0.25117967332123414, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.4234961994273206e-05, |
|
"loss": 0.0714, |
|
"num_input_tokens_seen": 85711647, |
|
"step": 1384 |
|
}, |
|
{ |
|
"epoch": 0.25263157894736843, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 3.417045224126004e-05, |
|
"loss": 0.0774, |
|
"num_input_tokens_seen": 86223550, |
|
"step": 1392 |
|
}, |
|
{ |
|
"epoch": 0.2540834845735027, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 3.410564508133058e-05, |
|
"loss": 0.0872, |
|
"num_input_tokens_seen": 86721404, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.255535390199637, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 3.40405418746459e-05, |
|
"loss": 0.0729, |
|
"num_input_tokens_seen": 87180793, |
|
"step": 1408 |
|
}, |
|
{ |
|
"epoch": 0.25698729582577134, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.397514398758046e-05, |
|
"loss": 0.0732, |
|
"num_input_tokens_seen": 87680677, |
|
"step": 1416 |
|
}, |
|
{ |
|
"epoch": 0.25843920145190563, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 3.39094527926934e-05, |
|
"loss": 0.0765, |
|
"num_input_tokens_seen": 88187512, |
|
"step": 1424 |
|
}, |
|
{ |
|
"epoch": 0.2598911070780399, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 3.384346966869976e-05, |
|
"loss": 0.0684, |
|
"num_input_tokens_seen": 88692751, |
|
"step": 1432 |
|
}, |
|
{ |
|
"epoch": 0.2613430127041742, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.377719600044156e-05, |
|
"loss": 0.0878, |
|
"num_input_tokens_seen": 89183444, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.26279491833030855, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 3.371063317885868e-05, |
|
"loss": 0.0738, |
|
"num_input_tokens_seen": 89681459, |
|
"step": 1448 |
|
}, |
|
{ |
|
"epoch": 0.26424682395644283, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 3.364378260095972e-05, |
|
"loss": 0.075, |
|
"num_input_tokens_seen": 90168008, |
|
"step": 1456 |
|
}, |
|
{ |
|
"epoch": 0.2656987295825771, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 3.3576645669792634e-05, |
|
"loss": 0.0606, |
|
"num_input_tokens_seen": 90654438, |
|
"step": 1464 |
|
}, |
|
{ |
|
"epoch": 0.2671506352087114, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 3.350922379441534e-05, |
|
"loss": 0.0853, |
|
"num_input_tokens_seen": 91167951, |
|
"step": 1472 |
|
}, |
|
{ |
|
"epoch": 0.26860254083484575, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 3.3441518389866075e-05, |
|
"loss": 0.0518, |
|
"num_input_tokens_seen": 91650643, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.27005444646098004, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.3373530877133764e-05, |
|
"loss": 0.0749, |
|
"num_input_tokens_seen": 92155336, |
|
"step": 1488 |
|
}, |
|
{ |
|
"epoch": 0.2715063520871143, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.330526268312817e-05, |
|
"loss": 0.0583, |
|
"num_input_tokens_seen": 92628298, |
|
"step": 1496 |
|
}, |
|
{ |
|
"epoch": 0.27295825771324866, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.323671524064992e-05, |
|
"loss": 0.0885, |
|
"num_input_tokens_seen": 93154901, |
|
"step": 1504 |
|
}, |
|
{ |
|
"epoch": 0.27441016333938295, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 3.316788998836048e-05, |
|
"loss": 0.0583, |
|
"num_input_tokens_seen": 93650095, |
|
"step": 1512 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 4.5625, |
|
"learning_rate": 3.309878837075193e-05, |
|
"loss": 0.0764, |
|
"num_input_tokens_seen": 94136210, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.2773139745916515, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.3029411838116654e-05, |
|
"loss": 0.0638, |
|
"num_input_tokens_seen": 94624523, |
|
"step": 1528 |
|
}, |
|
{ |
|
"epoch": 0.27876588021778587, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 3.295976184651691e-05, |
|
"loss": 0.0685, |
|
"num_input_tokens_seen": 95110498, |
|
"step": 1536 |
|
}, |
|
{ |
|
"epoch": 0.28021778584392015, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.288983985775426e-05, |
|
"loss": 0.0853, |
|
"num_input_tokens_seen": 95620511, |
|
"step": 1544 |
|
}, |
|
{ |
|
"epoch": 0.28166969147005444, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 3.281964733933889e-05, |
|
"loss": 0.0779, |
|
"num_input_tokens_seen": 96130692, |
|
"step": 1552 |
|
}, |
|
{ |
|
"epoch": 0.2831215970961887, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.274918576445882e-05, |
|
"loss": 0.0713, |
|
"num_input_tokens_seen": 96638367, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.28457350272232307, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.267845661194898e-05, |
|
"loss": 0.0653, |
|
"num_input_tokens_seen": 97154890, |
|
"step": 1568 |
|
}, |
|
{ |
|
"epoch": 0.28602540834845736, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 3.260746136626016e-05, |
|
"loss": 0.0522, |
|
"num_input_tokens_seen": 97650182, |
|
"step": 1576 |
|
}, |
|
{ |
|
"epoch": 0.28747731397459164, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 3.253620151742788e-05, |
|
"loss": 0.0868, |
|
"num_input_tokens_seen": 98121695, |
|
"step": 1584 |
|
}, |
|
{ |
|
"epoch": 0.28892921960072593, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 3.24646785610411e-05, |
|
"loss": 0.0844, |
|
"num_input_tokens_seen": 98595616, |
|
"step": 1592 |
|
}, |
|
{ |
|
"epoch": 0.29038112522686027, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 3.239289399821083e-05, |
|
"loss": 0.0668, |
|
"num_input_tokens_seen": 99105755, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.29183303085299456, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 3.2320849335538636e-05, |
|
"loss": 0.0699, |
|
"num_input_tokens_seen": 99595258, |
|
"step": 1608 |
|
}, |
|
{ |
|
"epoch": 0.29328493647912884, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 3.2248546085084995e-05, |
|
"loss": 0.0903, |
|
"num_input_tokens_seen": 100106643, |
|
"step": 1616 |
|
}, |
|
{ |
|
"epoch": 0.29473684210526313, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 3.21759857643376e-05, |
|
"loss": 0.0826, |
|
"num_input_tokens_seen": 100593045, |
|
"step": 1624 |
|
}, |
|
{ |
|
"epoch": 0.2961887477313975, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 3.2103169896179476e-05, |
|
"loss": 0.084, |
|
"num_input_tokens_seen": 101094273, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 0.29764065335753176, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 3.203010000885704e-05, |
|
"loss": 0.0742, |
|
"num_input_tokens_seen": 101593296, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.29909255898366605, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 3.1956777635948016e-05, |
|
"loss": 0.064, |
|
"num_input_tokens_seen": 102074203, |
|
"step": 1648 |
|
}, |
|
{ |
|
"epoch": 0.3005444646098004, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 3.188320431632924e-05, |
|
"loss": 0.0569, |
|
"num_input_tokens_seen": 102576481, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 0.3019963702359347, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 3.180938159414439e-05, |
|
"loss": 0.0932, |
|
"num_input_tokens_seen": 103070807, |
|
"step": 1664 |
|
}, |
|
{ |
|
"epoch": 0.30344827586206896, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 3.173531101877155e-05, |
|
"loss": 0.0621, |
|
"num_input_tokens_seen": 103568290, |
|
"step": 1672 |
|
}, |
|
{ |
|
"epoch": 0.30490018148820325, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.166099414479069e-05, |
|
"loss": 0.0579, |
|
"num_input_tokens_seen": 104059494, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.3063520871143376, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 3.158643253195108e-05, |
|
"loss": 0.0695, |
|
"num_input_tokens_seen": 104556886, |
|
"step": 1688 |
|
}, |
|
{ |
|
"epoch": 0.3078039927404719, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.15116277451385e-05, |
|
"loss": 0.0723, |
|
"num_input_tokens_seen": 105058562, |
|
"step": 1696 |
|
}, |
|
{ |
|
"epoch": 0.30925589836660616, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.143658135434244e-05, |
|
"loss": 0.0652, |
|
"num_input_tokens_seen": 105536081, |
|
"step": 1704 |
|
}, |
|
{ |
|
"epoch": 0.31070780399274045, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.136129493462312e-05, |
|
"loss": 0.0748, |
|
"num_input_tokens_seen": 106037792, |
|
"step": 1712 |
|
}, |
|
{ |
|
"epoch": 0.3121597096188748, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 3.1285770066078445e-05, |
|
"loss": 0.072, |
|
"num_input_tokens_seen": 106546503, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.3121597096188748, |
|
"eval_loss": 0.06825637072324753, |
|
"eval_runtime": 2711.2246, |
|
"eval_samples_per_second": 1.15, |
|
"eval_steps_per_second": 0.144, |
|
"num_input_tokens_seen": 106546503, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.3136116152450091, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 3.121000833381084e-05, |
|
"loss": 0.0737, |
|
"num_input_tokens_seen": 107037952, |
|
"step": 1728 |
|
}, |
|
{ |
|
"epoch": 0.31506352087114337, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.113401132789399e-05, |
|
"loss": 0.0712, |
|
"num_input_tokens_seen": 107540349, |
|
"step": 1736 |
|
}, |
|
{ |
|
"epoch": 0.31651542649727765, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.1057780643339465e-05, |
|
"loss": 0.0685, |
|
"num_input_tokens_seen": 108034983, |
|
"step": 1744 |
|
}, |
|
{ |
|
"epoch": 0.317967332123412, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 3.098131788006322e-05, |
|
"loss": 0.0718, |
|
"num_input_tokens_seen": 108503192, |
|
"step": 1752 |
|
}, |
|
{ |
|
"epoch": 0.3194192377495463, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 3.0904624642852065e-05, |
|
"loss": 0.076, |
|
"num_input_tokens_seen": 109019554, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.32087114337568057, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 3.082770254132993e-05, |
|
"loss": 0.0549, |
|
"num_input_tokens_seen": 109504850, |
|
"step": 1768 |
|
}, |
|
{ |
|
"epoch": 0.3223230490018149, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 3.075055318992412e-05, |
|
"loss": 0.068, |
|
"num_input_tokens_seen": 110008850, |
|
"step": 1776 |
|
}, |
|
{ |
|
"epoch": 0.3237749546279492, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 3.067317820783143e-05, |
|
"loss": 0.0676, |
|
"num_input_tokens_seen": 110528376, |
|
"step": 1784 |
|
}, |
|
{ |
|
"epoch": 0.3252268602540835, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 3.0595579218984124e-05, |
|
"loss": 0.0862, |
|
"num_input_tokens_seen": 111026349, |
|
"step": 1792 |
|
}, |
|
{ |
|
"epoch": 0.32667876588021777, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 3.05177578520159e-05, |
|
"loss": 0.0561, |
|
"num_input_tokens_seen": 111515922, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3281306715063521, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.04397157402277e-05, |
|
"loss": 0.0599, |
|
"num_input_tokens_seen": 112007455, |
|
"step": 1808 |
|
}, |
|
{ |
|
"epoch": 0.3295825771324864, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 3.0361454521553383e-05, |
|
"loss": 0.0856, |
|
"num_input_tokens_seen": 112491694, |
|
"step": 1816 |
|
}, |
|
{ |
|
"epoch": 0.3310344827586207, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 3.028297583852541e-05, |
|
"loss": 0.055, |
|
"num_input_tokens_seen": 112968009, |
|
"step": 1824 |
|
}, |
|
{ |
|
"epoch": 0.33248638838475497, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 3.020428133824035e-05, |
|
"loss": 0.0495, |
|
"num_input_tokens_seen": 113462356, |
|
"step": 1832 |
|
}, |
|
{ |
|
"epoch": 0.3339382940108893, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 3.0125372672324285e-05, |
|
"loss": 0.0765, |
|
"num_input_tokens_seen": 113976443, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.3353901996370236, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 3.0046251496898177e-05, |
|
"loss": 0.0521, |
|
"num_input_tokens_seen": 114445408, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 0.3368421052631579, |
|
"grad_norm": 1.0, |
|
"learning_rate": 2.9966919472543098e-05, |
|
"loss": 0.0659, |
|
"num_input_tokens_seen": 114933077, |
|
"step": 1856 |
|
}, |
|
{ |
|
"epoch": 0.3382940108892922, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.9887378264265387e-05, |
|
"loss": 0.0853, |
|
"num_input_tokens_seen": 115416098, |
|
"step": 1864 |
|
}, |
|
{ |
|
"epoch": 0.3397459165154265, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 2.9807629541461693e-05, |
|
"loss": 0.0611, |
|
"num_input_tokens_seen": 115937997, |
|
"step": 1872 |
|
}, |
|
{ |
|
"epoch": 0.3411978221415608, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 2.972767497788393e-05, |
|
"loss": 0.048, |
|
"num_input_tokens_seen": 116441850, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.3426497277676951, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.9647516251604192e-05, |
|
"loss": 0.0777, |
|
"num_input_tokens_seen": 116937086, |
|
"step": 1888 |
|
}, |
|
{ |
|
"epoch": 0.3441016333938294, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 2.9567155044979466e-05, |
|
"loss": 0.0598, |
|
"num_input_tokens_seen": 117443956, |
|
"step": 1896 |
|
}, |
|
{ |
|
"epoch": 0.3455535390199637, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.9486593044616394e-05, |
|
"loss": 0.0686, |
|
"num_input_tokens_seen": 117937379, |
|
"step": 1904 |
|
}, |
|
{ |
|
"epoch": 0.347005444646098, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.9405831941335816e-05, |
|
"loss": 0.053, |
|
"num_input_tokens_seen": 118423431, |
|
"step": 1912 |
|
}, |
|
{ |
|
"epoch": 0.3484573502722323, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 2.932487343013732e-05, |
|
"loss": 0.0485, |
|
"num_input_tokens_seen": 118938547, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.34990925589836663, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.9243719210163654e-05, |
|
"loss": 0.076, |
|
"num_input_tokens_seen": 119414827, |
|
"step": 1928 |
|
}, |
|
{ |
|
"epoch": 0.3513611615245009, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 2.916237098466507e-05, |
|
"loss": 0.037, |
|
"num_input_tokens_seen": 119906010, |
|
"step": 1936 |
|
}, |
|
{ |
|
"epoch": 0.3528130671506352, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 2.9080830460963563e-05, |
|
"loss": 0.0561, |
|
"num_input_tokens_seen": 120390508, |
|
"step": 1944 |
|
}, |
|
{ |
|
"epoch": 0.3542649727767695, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 2.8999099350417065e-05, |
|
"loss": 0.0846, |
|
"num_input_tokens_seen": 120863309, |
|
"step": 1952 |
|
}, |
|
{ |
|
"epoch": 0.35571687840290384, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 2.8917179368383493e-05, |
|
"loss": 0.0403, |
|
"num_input_tokens_seen": 121339176, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.3571687840290381, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 2.883507223418478e-05, |
|
"loss": 0.0645, |
|
"num_input_tokens_seen": 121867501, |
|
"step": 1968 |
|
}, |
|
{ |
|
"epoch": 0.3586206896551724, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 2.875277967107076e-05, |
|
"loss": 0.0911, |
|
"num_input_tokens_seen": 122375421, |
|
"step": 1976 |
|
}, |
|
{ |
|
"epoch": 0.3600725952813067, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 2.867030340618303e-05, |
|
"loss": 0.0454, |
|
"num_input_tokens_seen": 122856601, |
|
"step": 1984 |
|
}, |
|
{ |
|
"epoch": 0.36152450090744104, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 2.858764517051868e-05, |
|
"loss": 0.0615, |
|
"num_input_tokens_seen": 123347371, |
|
"step": 1992 |
|
}, |
|
{ |
|
"epoch": 0.3629764065335753, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 2.850480669889397e-05, |
|
"loss": 0.0536, |
|
"num_input_tokens_seen": 123846779, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3644283121597096, |
|
"grad_norm": 1.875, |
|
"learning_rate": 2.8421789729907928e-05, |
|
"loss": 0.0499, |
|
"num_input_tokens_seen": 124332390, |
|
"step": 2008 |
|
}, |
|
{ |
|
"epoch": 0.3658802177858439, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 2.833859600590583e-05, |
|
"loss": 0.076, |
|
"num_input_tokens_seen": 124806640, |
|
"step": 2016 |
|
}, |
|
{ |
|
"epoch": 0.36733212341197824, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.825522727294268e-05, |
|
"loss": 0.0347, |
|
"num_input_tokens_seen": 125289556, |
|
"step": 2024 |
|
}, |
|
{ |
|
"epoch": 0.3687840290381125, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 2.817168528074654e-05, |
|
"loss": 0.0854, |
|
"num_input_tokens_seen": 125783042, |
|
"step": 2032 |
|
}, |
|
{ |
|
"epoch": 0.3702359346642468, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 2.8087971782681774e-05, |
|
"loss": 0.0731, |
|
"num_input_tokens_seen": 126277662, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.3716878402903811, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.8004088535712315e-05, |
|
"loss": 0.0833, |
|
"num_input_tokens_seen": 126770182, |
|
"step": 2048 |
|
}, |
|
{ |
|
"epoch": 0.37313974591651544, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.7920037300364746e-05, |
|
"loss": 0.0752, |
|
"num_input_tokens_seen": 127265873, |
|
"step": 2056 |
|
}, |
|
{ |
|
"epoch": 0.37459165154264973, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.783581984069134e-05, |
|
"loss": 0.0652, |
|
"num_input_tokens_seen": 127767598, |
|
"step": 2064 |
|
}, |
|
{ |
|
"epoch": 0.37459165154264973, |
|
"eval_loss": 0.06295192986726761, |
|
"eval_runtime": 2754.9055, |
|
"eval_samples_per_second": 1.131, |
|
"eval_steps_per_second": 0.142, |
|
"num_input_tokens_seen": 127767598, |
|
"step": 2064 |
|
}, |
|
{ |
|
"epoch": 0.376043557168784, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 2.7751437924233093e-05, |
|
"loss": 0.06, |
|
"num_input_tokens_seen": 128256289, |
|
"step": 2072 |
|
}, |
|
{ |
|
"epoch": 0.37749546279491836, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 2.7666893321982548e-05, |
|
"loss": 0.0714, |
|
"num_input_tokens_seen": 128789423, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.37894736842105264, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.758218780834671e-05, |
|
"loss": 0.0608, |
|
"num_input_tokens_seen": 129283910, |
|
"step": 2088 |
|
}, |
|
{ |
|
"epoch": 0.38039927404718693, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 2.7497323161109734e-05, |
|
"loss": 0.0567, |
|
"num_input_tokens_seen": 129762227, |
|
"step": 2096 |
|
}, |
|
{ |
|
"epoch": 0.3818511796733212, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.741230116139565e-05, |
|
"loss": 0.0822, |
|
"num_input_tokens_seen": 130260949, |
|
"step": 2104 |
|
}, |
|
{ |
|
"epoch": 0.38330308529945556, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 2.7327123593630984e-05, |
|
"loss": 0.0744, |
|
"num_input_tokens_seen": 130738461, |
|
"step": 2112 |
|
}, |
|
{ |
|
"epoch": 0.38475499092558985, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 2.7241792245507284e-05, |
|
"loss": 0.0428, |
|
"num_input_tokens_seen": 131250070, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.38620689655172413, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 2.715630890794362e-05, |
|
"loss": 0.0764, |
|
"num_input_tokens_seen": 131731607, |
|
"step": 2128 |
|
}, |
|
{ |
|
"epoch": 0.3876588021778584, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 2.7070675375048984e-05, |
|
"loss": 0.0464, |
|
"num_input_tokens_seen": 132241144, |
|
"step": 2136 |
|
}, |
|
{ |
|
"epoch": 0.38911070780399276, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 2.698489344408464e-05, |
|
"loss": 0.0598, |
|
"num_input_tokens_seen": 132728134, |
|
"step": 2144 |
|
}, |
|
{ |
|
"epoch": 0.39056261343012705, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 2.689896491542642e-05, |
|
"loss": 0.0897, |
|
"num_input_tokens_seen": 133209860, |
|
"step": 2152 |
|
}, |
|
{ |
|
"epoch": 0.39201451905626133, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.681289159252689e-05, |
|
"loss": 0.0525, |
|
"num_input_tokens_seen": 133711627, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.3934664246823956, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 2.6726675281877567e-05, |
|
"loss": 0.0602, |
|
"num_input_tokens_seen": 134198176, |
|
"step": 2168 |
|
}, |
|
{ |
|
"epoch": 0.39491833030852996, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 2.6640317792970947e-05, |
|
"loss": 0.0562, |
|
"num_input_tokens_seen": 134689114, |
|
"step": 2176 |
|
}, |
|
{ |
|
"epoch": 0.39637023593466425, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.6553820938262557e-05, |
|
"loss": 0.0341, |
|
"num_input_tokens_seen": 135179499, |
|
"step": 2184 |
|
}, |
|
{ |
|
"epoch": 0.39782214156079854, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 2.6467186533132906e-05, |
|
"loss": 0.0783, |
|
"num_input_tokens_seen": 135700208, |
|
"step": 2192 |
|
}, |
|
{ |
|
"epoch": 0.3992740471869328, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 2.638041639584939e-05, |
|
"loss": 0.0604, |
|
"num_input_tokens_seen": 136212202, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.40072595281306717, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 2.6293512347528122e-05, |
|
"loss": 0.0591, |
|
"num_input_tokens_seen": 136698380, |
|
"step": 2208 |
|
}, |
|
{ |
|
"epoch": 0.40217785843920145, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 2.6206476212095734e-05, |
|
"loss": 0.0743, |
|
"num_input_tokens_seen": 137191271, |
|
"step": 2216 |
|
}, |
|
{ |
|
"epoch": 0.40362976406533574, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 2.6119309816251042e-05, |
|
"loss": 0.0437, |
|
"num_input_tokens_seen": 137660173, |
|
"step": 2224 |
|
}, |
|
{ |
|
"epoch": 0.4050816696914701, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 2.6032014989426784e-05, |
|
"loss": 0.0597, |
|
"num_input_tokens_seen": 138165909, |
|
"step": 2232 |
|
}, |
|
{ |
|
"epoch": 0.40653357531760437, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 2.594459356375116e-05, |
|
"loss": 0.0504, |
|
"num_input_tokens_seen": 138631528, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.40798548094373865, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.585704737400941e-05, |
|
"loss": 0.0611, |
|
"num_input_tokens_seen": 139130348, |
|
"step": 2248 |
|
}, |
|
{ |
|
"epoch": 0.40943738656987294, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 2.57693782576053e-05, |
|
"loss": 0.0461, |
|
"num_input_tokens_seen": 139617268, |
|
"step": 2256 |
|
}, |
|
{ |
|
"epoch": 0.4108892921960073, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 2.568158805452256e-05, |
|
"loss": 0.062, |
|
"num_input_tokens_seen": 140121646, |
|
"step": 2264 |
|
}, |
|
{ |
|
"epoch": 0.41234119782214157, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.559367860728627e-05, |
|
"loss": 0.0506, |
|
"num_input_tokens_seen": 140625443, |
|
"step": 2272 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 2.5505651760924182e-05, |
|
"loss": 0.0757, |
|
"num_input_tokens_seen": 141135512, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.41524500907441014, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 2.5417509362927986e-05, |
|
"loss": 0.078, |
|
"num_input_tokens_seen": 141614186, |
|
"step": 2288 |
|
}, |
|
{ |
|
"epoch": 0.4166969147005445, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.5329253263214573e-05, |
|
"loss": 0.0549, |
|
"num_input_tokens_seen": 142126285, |
|
"step": 2296 |
|
}, |
|
{ |
|
"epoch": 0.41814882032667877, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 2.5240885314087162e-05, |
|
"loss": 0.0592, |
|
"num_input_tokens_seen": 142609607, |
|
"step": 2304 |
|
}, |
|
{ |
|
"epoch": 0.41960072595281306, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 2.5152407370196467e-05, |
|
"loss": 0.0477, |
|
"num_input_tokens_seen": 143090080, |
|
"step": 2312 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 2.5063821288501746e-05, |
|
"loss": 0.0576, |
|
"num_input_tokens_seen": 143576776, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.4225045372050817, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 2.4975128928231823e-05, |
|
"loss": 0.0671, |
|
"num_input_tokens_seen": 144070311, |
|
"step": 2328 |
|
}, |
|
{ |
|
"epoch": 0.423956442831216, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 2.4886332150846092e-05, |
|
"loss": 0.0637, |
|
"num_input_tokens_seen": 144581612, |
|
"step": 2336 |
|
}, |
|
{ |
|
"epoch": 0.42540834845735026, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 2.4797432819995427e-05, |
|
"loss": 0.0496, |
|
"num_input_tokens_seen": 145085129, |
|
"step": 2344 |
|
}, |
|
{ |
|
"epoch": 0.42686025408348455, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 2.4708432801483086e-05, |
|
"loss": 0.0662, |
|
"num_input_tokens_seen": 145568633, |
|
"step": 2352 |
|
}, |
|
{ |
|
"epoch": 0.4283121597096189, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 2.4619333963225525e-05, |
|
"loss": 0.059, |
|
"num_input_tokens_seen": 146076350, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.4297640653357532, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.4530138175213222e-05, |
|
"loss": 0.1076, |
|
"num_input_tokens_seen": 146577893, |
|
"step": 2368 |
|
}, |
|
{ |
|
"epoch": 0.43121597096188746, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 2.4440847309471422e-05, |
|
"loss": 0.0794, |
|
"num_input_tokens_seen": 147074725, |
|
"step": 2376 |
|
}, |
|
{ |
|
"epoch": 0.4326678765880218, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 2.435146324002083e-05, |
|
"loss": 0.0537, |
|
"num_input_tokens_seen": 147559139, |
|
"step": 2384 |
|
}, |
|
{ |
|
"epoch": 0.4341197822141561, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 2.426198784283831e-05, |
|
"loss": 0.0429, |
|
"num_input_tokens_seen": 148055859, |
|
"step": 2392 |
|
}, |
|
{ |
|
"epoch": 0.4355716878402904, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 2.4172422995817496e-05, |
|
"loss": 0.0583, |
|
"num_input_tokens_seen": 148559803, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.43702359346642466, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 2.408277057872936e-05, |
|
"loss": 0.0693, |
|
"num_input_tokens_seen": 149047633, |
|
"step": 2408 |
|
}, |
|
{ |
|
"epoch": 0.43702359346642466, |
|
"eval_loss": 0.05809076130390167, |
|
"eval_runtime": 2813.328, |
|
"eval_samples_per_second": 1.108, |
|
"eval_steps_per_second": 0.139, |
|
"num_input_tokens_seen": 149047633, |
|
"step": 2408 |
|
}, |
|
{ |
|
"epoch": 0.438475499092559, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 2.3993032473182796e-05, |
|
"loss": 0.0627, |
|
"num_input_tokens_seen": 149553600, |
|
"step": 2416 |
|
}, |
|
{ |
|
"epoch": 0.4399274047186933, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 2.390321056258511e-05, |
|
"loss": 0.0518, |
|
"num_input_tokens_seen": 150031007, |
|
"step": 2424 |
|
}, |
|
{ |
|
"epoch": 0.4413793103448276, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 2.3813306732102483e-05, |
|
"loss": 0.0564, |
|
"num_input_tokens_seen": 150506503, |
|
"step": 2432 |
|
}, |
|
{ |
|
"epoch": 0.44283121597096187, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 2.3723322868620436e-05, |
|
"loss": 0.0728, |
|
"num_input_tokens_seen": 151018070, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.4442831215970962, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 2.3633260860704188e-05, |
|
"loss": 0.0428, |
|
"num_input_tokens_seen": 151507916, |
|
"step": 2448 |
|
}, |
|
{ |
|
"epoch": 0.4457350272232305, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 2.3543122598559053e-05, |
|
"loss": 0.0458, |
|
"num_input_tokens_seen": 151999967, |
|
"step": 2456 |
|
}, |
|
{ |
|
"epoch": 0.4471869328493648, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 2.345290997399074e-05, |
|
"loss": 0.051, |
|
"num_input_tokens_seen": 152499025, |
|
"step": 2464 |
|
}, |
|
{ |
|
"epoch": 0.44863883847549907, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 2.3362624880365677e-05, |
|
"loss": 0.0713, |
|
"num_input_tokens_seen": 152984867, |
|
"step": 2472 |
|
}, |
|
{ |
|
"epoch": 0.4500907441016334, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 2.3272269212571262e-05, |
|
"loss": 0.0627, |
|
"num_input_tokens_seen": 153473082, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.4515426497277677, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 2.3181844866976076e-05, |
|
"loss": 0.048, |
|
"num_input_tokens_seen": 153951602, |
|
"step": 2488 |
|
}, |
|
{ |
|
"epoch": 0.452994555353902, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 2.3091353741390116e-05, |
|
"loss": 0.0476, |
|
"num_input_tokens_seen": 154432971, |
|
"step": 2496 |
|
}, |
|
{ |
|
"epoch": 0.45444646098003627, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 2.3000797735024922e-05, |
|
"loss": 0.049, |
|
"num_input_tokens_seen": 154912331, |
|
"step": 2504 |
|
}, |
|
{ |
|
"epoch": 0.4558983666061706, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 2.2910178748453765e-05, |
|
"loss": 0.0544, |
|
"num_input_tokens_seen": 155385055, |
|
"step": 2512 |
|
}, |
|
{ |
|
"epoch": 0.4573502722323049, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 2.2819498683571718e-05, |
|
"loss": 0.0494, |
|
"num_input_tokens_seen": 155892191, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.4588021778584392, |
|
"grad_norm": 0.625, |
|
"learning_rate": 2.272875944355575e-05, |
|
"loss": 0.066, |
|
"num_input_tokens_seen": 156405102, |
|
"step": 2528 |
|
}, |
|
{ |
|
"epoch": 0.46025408348457353, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 2.2637962932824803e-05, |
|
"loss": 0.0605, |
|
"num_input_tokens_seen": 156909466, |
|
"step": 2536 |
|
}, |
|
{ |
|
"epoch": 0.4617059891107078, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 2.2547111056999808e-05, |
|
"loss": 0.0394, |
|
"num_input_tokens_seen": 157391122, |
|
"step": 2544 |
|
}, |
|
{ |
|
"epoch": 0.4631578947368421, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 2.245620572286366e-05, |
|
"loss": 0.0525, |
|
"num_input_tokens_seen": 157880121, |
|
"step": 2552 |
|
}, |
|
{ |
|
"epoch": 0.4646098003629764, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 2.2365248838321273e-05, |
|
"loss": 0.0491, |
|
"num_input_tokens_seen": 158360167, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.46606170598911073, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 2.2274242312359445e-05, |
|
"loss": 0.0528, |
|
"num_input_tokens_seen": 158867422, |
|
"step": 2568 |
|
}, |
|
{ |
|
"epoch": 0.467513611615245, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 2.2183188055006867e-05, |
|
"loss": 0.0679, |
|
"num_input_tokens_seen": 159364296, |
|
"step": 2576 |
|
}, |
|
{ |
|
"epoch": 0.4689655172413793, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 2.2092087977294e-05, |
|
"loss": 0.0744, |
|
"num_input_tokens_seen": 159890619, |
|
"step": 2584 |
|
}, |
|
{ |
|
"epoch": 0.4704174228675136, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 2.2000943991212977e-05, |
|
"loss": 0.0419, |
|
"num_input_tokens_seen": 160398651, |
|
"step": 2592 |
|
}, |
|
{ |
|
"epoch": 0.47186932849364793, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 2.190975800967747e-05, |
|
"loss": 0.0616, |
|
"num_input_tokens_seen": 160922909, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.4733212341197822, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 2.1818531946482543e-05, |
|
"loss": 0.0442, |
|
"num_input_tokens_seen": 161419902, |
|
"step": 2608 |
|
}, |
|
{ |
|
"epoch": 0.4747731397459165, |
|
"grad_norm": 0.625, |
|
"learning_rate": 2.172726771626449e-05, |
|
"loss": 0.0469, |
|
"num_input_tokens_seen": 161929180, |
|
"step": 2616 |
|
}, |
|
{ |
|
"epoch": 0.4762250453720508, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 2.163596723446065e-05, |
|
"loss": 0.0573, |
|
"num_input_tokens_seen": 162437709, |
|
"step": 2624 |
|
}, |
|
{ |
|
"epoch": 0.47767695099818513, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 2.1544632417269194e-05, |
|
"loss": 0.052, |
|
"num_input_tokens_seen": 162950151, |
|
"step": 2632 |
|
}, |
|
{ |
|
"epoch": 0.4791288566243194, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 2.145326518160893e-05, |
|
"loss": 0.0576, |
|
"num_input_tokens_seen": 163429462, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.4805807622504537, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 2.136186744507904e-05, |
|
"loss": 0.0577, |
|
"num_input_tokens_seen": 163939160, |
|
"step": 2648 |
|
}, |
|
{ |
|
"epoch": 0.482032667876588, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 2.1270441125918882e-05, |
|
"loss": 0.051, |
|
"num_input_tokens_seen": 164446079, |
|
"step": 2656 |
|
}, |
|
{ |
|
"epoch": 0.48348457350272234, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 2.1178988142967678e-05, |
|
"loss": 0.0489, |
|
"num_input_tokens_seen": 164936233, |
|
"step": 2664 |
|
}, |
|
{ |
|
"epoch": 0.4849364791288566, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 2.108751041562427e-05, |
|
"loss": 0.0622, |
|
"num_input_tokens_seen": 165409965, |
|
"step": 2672 |
|
}, |
|
{ |
|
"epoch": 0.4863883847549909, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 2.0996009863806834e-05, |
|
"loss": 0.0578, |
|
"num_input_tokens_seen": 165901841, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.48784029038112525, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 2.0904488407912575e-05, |
|
"loss": 0.0389, |
|
"num_input_tokens_seen": 166384603, |
|
"step": 2688 |
|
}, |
|
{ |
|
"epoch": 0.48929219600725954, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 2.0812947968777437e-05, |
|
"loss": 0.0432, |
|
"num_input_tokens_seen": 166889709, |
|
"step": 2696 |
|
}, |
|
{ |
|
"epoch": 0.4907441016333938, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 2.0721390467635788e-05, |
|
"loss": 0.0453, |
|
"num_input_tokens_seen": 167372121, |
|
"step": 2704 |
|
}, |
|
{ |
|
"epoch": 0.4921960072595281, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 2.0629817826080073e-05, |
|
"loss": 0.0447, |
|
"num_input_tokens_seen": 167871991, |
|
"step": 2712 |
|
}, |
|
{ |
|
"epoch": 0.49364791288566245, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 2.053823196602051e-05, |
|
"loss": 0.0543, |
|
"num_input_tokens_seen": 168369985, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.49509981851179674, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 2.044663480964474e-05, |
|
"loss": 0.0416, |
|
"num_input_tokens_seen": 168846412, |
|
"step": 2728 |
|
}, |
|
{ |
|
"epoch": 0.496551724137931, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 2.0355028279377498e-05, |
|
"loss": 0.0467, |
|
"num_input_tokens_seen": 169335334, |
|
"step": 2736 |
|
}, |
|
{ |
|
"epoch": 0.4980036297640653, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 2.026341429784025e-05, |
|
"loss": 0.0724, |
|
"num_input_tokens_seen": 169830612, |
|
"step": 2744 |
|
}, |
|
{ |
|
"epoch": 0.49945553539019966, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 2.0171794787810842e-05, |
|
"loss": 0.0723, |
|
"num_input_tokens_seen": 170349739, |
|
"step": 2752 |
|
}, |
|
{ |
|
"epoch": 0.49945553539019966, |
|
"eval_loss": 0.054387591779232025, |
|
"eval_runtime": 2838.6975, |
|
"eval_samples_per_second": 1.098, |
|
"eval_steps_per_second": 0.137, |
|
"num_input_tokens_seen": 170349739, |
|
"step": 2752 |
|
}, |
|
{ |
|
"epoch": 0.5009074410163339, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 2.008017167218317e-05, |
|
"loss": 0.0365, |
|
"num_input_tokens_seen": 170843316, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.5023593466424683, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 1.9988546873926788e-05, |
|
"loss": 0.0456, |
|
"num_input_tokens_seen": 171324496, |
|
"step": 2768 |
|
}, |
|
{ |
|
"epoch": 0.5038112522686026, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 1.9896922316046562e-05, |
|
"loss": 0.0416, |
|
"num_input_tokens_seen": 171829665, |
|
"step": 2776 |
|
}, |
|
{ |
|
"epoch": 0.5052631578947369, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.980529992154233e-05, |
|
"loss": 0.0395, |
|
"num_input_tokens_seen": 172325874, |
|
"step": 2784 |
|
}, |
|
{ |
|
"epoch": 0.5067150635208711, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 1.9713681613368506e-05, |
|
"loss": 0.0536, |
|
"num_input_tokens_seen": 172832464, |
|
"step": 2792 |
|
}, |
|
{ |
|
"epoch": 0.5081669691470054, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.9622069314393753e-05, |
|
"loss": 0.0505, |
|
"num_input_tokens_seen": 173320567, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5096188747731397, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 1.9530464947360615e-05, |
|
"loss": 0.0528, |
|
"num_input_tokens_seen": 173816293, |
|
"step": 2808 |
|
}, |
|
{ |
|
"epoch": 0.511070780399274, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 1.943887043484515e-05, |
|
"loss": 0.0766, |
|
"num_input_tokens_seen": 174302982, |
|
"step": 2816 |
|
}, |
|
{ |
|
"epoch": 0.5125226860254084, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.9347287699216602e-05, |
|
"loss": 0.0574, |
|
"num_input_tokens_seen": 174807598, |
|
"step": 2824 |
|
}, |
|
{ |
|
"epoch": 0.5139745916515427, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.9255718662597044e-05, |
|
"loss": 0.0667, |
|
"num_input_tokens_seen": 175302323, |
|
"step": 2832 |
|
}, |
|
{ |
|
"epoch": 0.515426497277677, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 1.9164165246821026e-05, |
|
"loss": 0.0434, |
|
"num_input_tokens_seen": 175782712, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.5168784029038113, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 1.9072629373395268e-05, |
|
"loss": 0.0573, |
|
"num_input_tokens_seen": 176252965, |
|
"step": 2848 |
|
}, |
|
{ |
|
"epoch": 0.5183303085299455, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.8981112963458293e-05, |
|
"loss": 0.0541, |
|
"num_input_tokens_seen": 176746353, |
|
"step": 2856 |
|
}, |
|
{ |
|
"epoch": 0.5197822141560798, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 1.8889617937740146e-05, |
|
"loss": 0.0457, |
|
"num_input_tokens_seen": 177252614, |
|
"step": 2864 |
|
}, |
|
{ |
|
"epoch": 0.5212341197822141, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 1.879814621652206e-05, |
|
"loss": 0.0588, |
|
"num_input_tokens_seen": 177752505, |
|
"step": 2872 |
|
}, |
|
{ |
|
"epoch": 0.5226860254083484, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 1.8706699719596138e-05, |
|
"loss": 0.0717, |
|
"num_input_tokens_seen": 178248588, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.5241379310344828, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 1.8615280366225113e-05, |
|
"loss": 0.0634, |
|
"num_input_tokens_seen": 178746624, |
|
"step": 2888 |
|
}, |
|
{ |
|
"epoch": 0.5255898366606171, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.852389007510201e-05, |
|
"loss": 0.0573, |
|
"num_input_tokens_seen": 179239200, |
|
"step": 2896 |
|
}, |
|
{ |
|
"epoch": 0.5270417422867514, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 1.8432530764309916e-05, |
|
"loss": 0.0574, |
|
"num_input_tokens_seen": 179731398, |
|
"step": 2904 |
|
}, |
|
{ |
|
"epoch": 0.5284936479128857, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 1.8341204351281684e-05, |
|
"loss": 0.0786, |
|
"num_input_tokens_seen": 180216141, |
|
"step": 2912 |
|
}, |
|
{ |
|
"epoch": 0.52994555353902, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 1.8249912752759748e-05, |
|
"loss": 0.0481, |
|
"num_input_tokens_seen": 180719896, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.5313974591651542, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 1.8158657884755832e-05, |
|
"loss": 0.0595, |
|
"num_input_tokens_seen": 181215874, |
|
"step": 2928 |
|
}, |
|
{ |
|
"epoch": 0.5328493647912885, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.8067441662510782e-05, |
|
"loss": 0.0495, |
|
"num_input_tokens_seen": 181715660, |
|
"step": 2936 |
|
}, |
|
{ |
|
"epoch": 0.5343012704174228, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 1.797626600045435e-05, |
|
"loss": 0.0507, |
|
"num_input_tokens_seen": 182189644, |
|
"step": 2944 |
|
}, |
|
{ |
|
"epoch": 0.5357531760435572, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 1.7885132812165022e-05, |
|
"loss": 0.0457, |
|
"num_input_tokens_seen": 182692258, |
|
"step": 2952 |
|
}, |
|
{ |
|
"epoch": 0.5372050816696915, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 1.7794044010329844e-05, |
|
"loss": 0.0454, |
|
"num_input_tokens_seen": 183173683, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.5386569872958258, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 1.7703001506704297e-05, |
|
"loss": 0.0612, |
|
"num_input_tokens_seen": 183670207, |
|
"step": 2968 |
|
}, |
|
{ |
|
"epoch": 0.5401088929219601, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 1.761200721207215e-05, |
|
"loss": 0.0559, |
|
"num_input_tokens_seen": 184191448, |
|
"step": 2976 |
|
}, |
|
{ |
|
"epoch": 0.5415607985480944, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 1.7521063036205383e-05, |
|
"loss": 0.032, |
|
"num_input_tokens_seen": 184672691, |
|
"step": 2984 |
|
}, |
|
{ |
|
"epoch": 0.5430127041742286, |
|
"grad_norm": 0.625, |
|
"learning_rate": 1.7430170887824088e-05, |
|
"loss": 0.0597, |
|
"num_input_tokens_seen": 185179876, |
|
"step": 2992 |
|
}, |
|
{ |
|
"epoch": 0.5444646098003629, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 1.7339332674556408e-05, |
|
"loss": 0.0566, |
|
"num_input_tokens_seen": 185659670, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5459165154264973, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 1.724855030289852e-05, |
|
"loss": 0.028, |
|
"num_input_tokens_seen": 186148613, |
|
"step": 3008 |
|
}, |
|
{ |
|
"epoch": 0.5473684210526316, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 1.715782567817459e-05, |
|
"loss": 0.0567, |
|
"num_input_tokens_seen": 186651171, |
|
"step": 3016 |
|
}, |
|
{ |
|
"epoch": 0.5488203266787659, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 1.7067160704496817e-05, |
|
"loss": 0.0584, |
|
"num_input_tokens_seen": 187155654, |
|
"step": 3024 |
|
}, |
|
{ |
|
"epoch": 0.5502722323049002, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 1.6976557284725434e-05, |
|
"loss": 0.0554, |
|
"num_input_tokens_seen": 187631290, |
|
"step": 3032 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.6886017320428817e-05, |
|
"loss": 0.0654, |
|
"num_input_tokens_seen": 188114682, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.5531760435571688, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 1.6795542711843535e-05, |
|
"loss": 0.0489, |
|
"num_input_tokens_seen": 188586657, |
|
"step": 3048 |
|
}, |
|
{ |
|
"epoch": 0.554627949183303, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 1.670513535783448e-05, |
|
"loss": 0.0432, |
|
"num_input_tokens_seen": 189073577, |
|
"step": 3056 |
|
}, |
|
{ |
|
"epoch": 0.5560798548094373, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 1.661479715585503e-05, |
|
"loss": 0.0559, |
|
"num_input_tokens_seen": 189536844, |
|
"step": 3064 |
|
}, |
|
{ |
|
"epoch": 0.5575317604355717, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 1.6524530001907196e-05, |
|
"loss": 0.0552, |
|
"num_input_tokens_seen": 190005564, |
|
"step": 3072 |
|
}, |
|
{ |
|
"epoch": 0.558983666061706, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 1.643433579050186e-05, |
|
"loss": 0.0479, |
|
"num_input_tokens_seen": 190494115, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.5604355716878403, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 1.6344216414618998e-05, |
|
"loss": 0.0558, |
|
"num_input_tokens_seen": 190997100, |
|
"step": 3088 |
|
}, |
|
{ |
|
"epoch": 0.5618874773139746, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.625417376566794e-05, |
|
"loss": 0.0854, |
|
"num_input_tokens_seen": 191513399, |
|
"step": 3096 |
|
}, |
|
{ |
|
"epoch": 0.5618874773139746, |
|
"eval_loss": 0.0525849312543869, |
|
"eval_runtime": 2614.8433, |
|
"eval_samples_per_second": 1.192, |
|
"eval_steps_per_second": 0.149, |
|
"num_input_tokens_seen": 191513399, |
|
"step": 3096 |
|
}, |
|
{ |
|
"epoch": 0.5633393829401089, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 1.616420973344769e-05, |
|
"loss": 0.0467, |
|
"num_input_tokens_seen": 191995923, |
|
"step": 3104 |
|
}, |
|
{ |
|
"epoch": 0.5647912885662432, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 1.607432620610727e-05, |
|
"loss": 0.0564, |
|
"num_input_tokens_seen": 192465595, |
|
"step": 3112 |
|
}, |
|
{ |
|
"epoch": 0.5662431941923775, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 1.5984525070106065e-05, |
|
"loss": 0.0507, |
|
"num_input_tokens_seen": 192958871, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.5676950998185119, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 1.5894808210174252e-05, |
|
"loss": 0.0574, |
|
"num_input_tokens_seen": 193430762, |
|
"step": 3128 |
|
}, |
|
{ |
|
"epoch": 0.5691470054446461, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 1.5805177509273226e-05, |
|
"loss": 0.0545, |
|
"num_input_tokens_seen": 193908960, |
|
"step": 3136 |
|
}, |
|
{ |
|
"epoch": 0.5705989110707804, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.571563484855611e-05, |
|
"loss": 0.0532, |
|
"num_input_tokens_seen": 194435990, |
|
"step": 3144 |
|
}, |
|
{ |
|
"epoch": 0.5720508166969147, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 1.5626182107328253e-05, |
|
"loss": 0.0402, |
|
"num_input_tokens_seen": 194945870, |
|
"step": 3152 |
|
}, |
|
{ |
|
"epoch": 0.573502722323049, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 1.5536821163007768e-05, |
|
"loss": 0.0728, |
|
"num_input_tokens_seen": 195449492, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.5749546279491833, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.5447553891086178e-05, |
|
"loss": 0.0457, |
|
"num_input_tokens_seen": 195943237, |
|
"step": 3168 |
|
}, |
|
{ |
|
"epoch": 0.5764065335753176, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 1.5358382165089008e-05, |
|
"loss": 0.0612, |
|
"num_input_tokens_seen": 196442834, |
|
"step": 3176 |
|
}, |
|
{ |
|
"epoch": 0.5778584392014519, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 1.5269307856536486e-05, |
|
"loss": 0.0533, |
|
"num_input_tokens_seen": 196964754, |
|
"step": 3184 |
|
}, |
|
{ |
|
"epoch": 0.5793103448275863, |
|
"grad_norm": 0.625, |
|
"learning_rate": 1.5180332834904276e-05, |
|
"loss": 0.0331, |
|
"num_input_tokens_seen": 197500093, |
|
"step": 3192 |
|
}, |
|
{ |
|
"epoch": 0.5807622504537205, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 1.5091458967584199e-05, |
|
"loss": 0.0689, |
|
"num_input_tokens_seen": 197994930, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5822141560798548, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.5002688119845086e-05, |
|
"loss": 0.0541, |
|
"num_input_tokens_seen": 198501247, |
|
"step": 3208 |
|
}, |
|
{ |
|
"epoch": 0.5836660617059891, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 1.4914022154793613e-05, |
|
"loss": 0.0435, |
|
"num_input_tokens_seen": 199000501, |
|
"step": 3216 |
|
}, |
|
{ |
|
"epoch": 0.5851179673321234, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 1.482546293333518e-05, |
|
"loss": 0.0557, |
|
"num_input_tokens_seen": 199479084, |
|
"step": 3224 |
|
}, |
|
{ |
|
"epoch": 0.5865698729582577, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 1.473701231413489e-05, |
|
"loss": 0.0382, |
|
"num_input_tokens_seen": 200003062, |
|
"step": 3232 |
|
}, |
|
{ |
|
"epoch": 0.588021778584392, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 1.464867215357851e-05, |
|
"loss": 0.0529, |
|
"num_input_tokens_seen": 200510961, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.5894736842105263, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 1.4560444305733521e-05, |
|
"loss": 0.0628, |
|
"num_input_tokens_seen": 201013169, |
|
"step": 3248 |
|
}, |
|
{ |
|
"epoch": 0.5909255898366607, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 1.447233062231022e-05, |
|
"loss": 0.0322, |
|
"num_input_tokens_seen": 201480209, |
|
"step": 3256 |
|
}, |
|
{ |
|
"epoch": 0.592377495462795, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.4384332952622815e-05, |
|
"loss": 0.0567, |
|
"num_input_tokens_seen": 201973667, |
|
"step": 3264 |
|
}, |
|
{ |
|
"epoch": 0.5938294010889292, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.4296453143550664e-05, |
|
"loss": 0.0463, |
|
"num_input_tokens_seen": 202453986, |
|
"step": 3272 |
|
}, |
|
{ |
|
"epoch": 0.5952813067150635, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 1.4208693039499468e-05, |
|
"loss": 0.0425, |
|
"num_input_tokens_seen": 202952414, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.5967332123411978, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.4121054482362592e-05, |
|
"loss": 0.048, |
|
"num_input_tokens_seen": 203470869, |
|
"step": 3288 |
|
}, |
|
{ |
|
"epoch": 0.5981851179673321, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 1.4033539311482403e-05, |
|
"loss": 0.0449, |
|
"num_input_tokens_seen": 203946575, |
|
"step": 3296 |
|
}, |
|
{ |
|
"epoch": 0.5996370235934664, |
|
"grad_norm": 1.0, |
|
"learning_rate": 1.3946149363611631e-05, |
|
"loss": 0.0579, |
|
"num_input_tokens_seen": 204443918, |
|
"step": 3304 |
|
}, |
|
{ |
|
"epoch": 0.6010889292196008, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.3858886472874881e-05, |
|
"loss": 0.1074, |
|
"num_input_tokens_seen": 204950872, |
|
"step": 3312 |
|
}, |
|
{ |
|
"epoch": 0.6025408348457351, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.3771752470730078e-05, |
|
"loss": 0.0591, |
|
"num_input_tokens_seen": 205454235, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.6039927404718693, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 1.3684749185930088e-05, |
|
"loss": 0.055, |
|
"num_input_tokens_seen": 205939041, |
|
"step": 3328 |
|
}, |
|
{ |
|
"epoch": 0.6054446460980036, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 1.3597878444484272e-05, |
|
"loss": 0.0483, |
|
"num_input_tokens_seen": 206431197, |
|
"step": 3336 |
|
}, |
|
{ |
|
"epoch": 0.6068965517241379, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 1.351114206962021e-05, |
|
"loss": 0.0568, |
|
"num_input_tokens_seen": 206925320, |
|
"step": 3344 |
|
}, |
|
{ |
|
"epoch": 0.6083484573502722, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 1.3424541881745425e-05, |
|
"loss": 0.0553, |
|
"num_input_tokens_seen": 207406668, |
|
"step": 3352 |
|
}, |
|
{ |
|
"epoch": 0.6098003629764065, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 1.333807969840916e-05, |
|
"loss": 0.0517, |
|
"num_input_tokens_seen": 207877782, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.6112522686025408, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 1.3251757334264253e-05, |
|
"loss": 0.04, |
|
"num_input_tokens_seen": 208344318, |
|
"step": 3368 |
|
}, |
|
{ |
|
"epoch": 0.6127041742286752, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 1.316557660102903e-05, |
|
"loss": 0.0488, |
|
"num_input_tokens_seen": 208814858, |
|
"step": 3376 |
|
}, |
|
{ |
|
"epoch": 0.6141560798548095, |
|
"grad_norm": 0.5, |
|
"learning_rate": 1.3079539307449311e-05, |
|
"loss": 0.044, |
|
"num_input_tokens_seen": 209297102, |
|
"step": 3384 |
|
}, |
|
{ |
|
"epoch": 0.6156079854809438, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.2993647259260418e-05, |
|
"loss": 0.0469, |
|
"num_input_tokens_seen": 209774677, |
|
"step": 3392 |
|
}, |
|
{ |
|
"epoch": 0.617059891107078, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 1.2907902259149287e-05, |
|
"loss": 0.0694, |
|
"num_input_tokens_seen": 210275870, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6185117967332123, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.2822306106716645e-05, |
|
"loss": 0.0595, |
|
"num_input_tokens_seen": 210797636, |
|
"step": 3408 |
|
}, |
|
{ |
|
"epoch": 0.6199637023593466, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 1.2736860598439215e-05, |
|
"loss": 0.0665, |
|
"num_input_tokens_seen": 211287706, |
|
"step": 3416 |
|
}, |
|
{ |
|
"epoch": 0.6214156079854809, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 1.2651567527632045e-05, |
|
"loss": 0.0698, |
|
"num_input_tokens_seen": 211773156, |
|
"step": 3424 |
|
}, |
|
{ |
|
"epoch": 0.6228675136116153, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.2566428684410843e-05, |
|
"loss": 0.0348, |
|
"num_input_tokens_seen": 212277142, |
|
"step": 3432 |
|
}, |
|
{ |
|
"epoch": 0.6243194192377496, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.2481445855654415e-05, |
|
"loss": 0.0474, |
|
"num_input_tokens_seen": 212767513, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.6243194192377496, |
|
"eval_loss": 0.05037084221839905, |
|
"eval_runtime": 2739.6179, |
|
"eval_samples_per_second": 1.138, |
|
"eval_steps_per_second": 0.142, |
|
"num_input_tokens_seen": 212767513, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.6257713248638839, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 1.2396620824967169e-05, |
|
"loss": 0.1043, |
|
"num_input_tokens_seen": 213273298, |
|
"step": 3448 |
|
}, |
|
{ |
|
"epoch": 0.6272232304900182, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 1.2311955372641674e-05, |
|
"loss": 0.0779, |
|
"num_input_tokens_seen": 213743600, |
|
"step": 3456 |
|
}, |
|
{ |
|
"epoch": 0.6286751361161524, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 1.222745127562129e-05, |
|
"loss": 0.0474, |
|
"num_input_tokens_seen": 214249105, |
|
"step": 3464 |
|
}, |
|
{ |
|
"epoch": 0.6301270417422867, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 1.2143110307462892e-05, |
|
"loss": 0.0914, |
|
"num_input_tokens_seen": 214743732, |
|
"step": 3472 |
|
}, |
|
{ |
|
"epoch": 0.631578947368421, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 1.2058934238299625e-05, |
|
"loss": 0.0333, |
|
"num_input_tokens_seen": 215240214, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.6330308529945553, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 1.1974924834803765e-05, |
|
"loss": 0.0477, |
|
"num_input_tokens_seen": 215752215, |
|
"step": 3488 |
|
}, |
|
{ |
|
"epoch": 0.6344827586206897, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 1.1891083860149653e-05, |
|
"loss": 0.0456, |
|
"num_input_tokens_seen": 216218681, |
|
"step": 3496 |
|
}, |
|
{ |
|
"epoch": 0.635934664246824, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 1.1807413073976655e-05, |
|
"loss": 0.0537, |
|
"num_input_tokens_seen": 216717186, |
|
"step": 3504 |
|
}, |
|
{ |
|
"epoch": 0.6373865698729583, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 1.1723914232352265e-05, |
|
"loss": 0.0543, |
|
"num_input_tokens_seen": 217224763, |
|
"step": 3512 |
|
}, |
|
{ |
|
"epoch": 0.6388384754990926, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 1.1640589087735222e-05, |
|
"loss": 0.053, |
|
"num_input_tokens_seen": 217712978, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.6402903811252268, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 1.1557439388938772e-05, |
|
"loss": 0.0464, |
|
"num_input_tokens_seen": 218177197, |
|
"step": 3528 |
|
}, |
|
{ |
|
"epoch": 0.6417422867513611, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 1.1474466881093904e-05, |
|
"loss": 0.0679, |
|
"num_input_tokens_seen": 218664950, |
|
"step": 3536 |
|
}, |
|
{ |
|
"epoch": 0.6431941923774954, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 1.139167330561277e-05, |
|
"loss": 0.0551, |
|
"num_input_tokens_seen": 219190307, |
|
"step": 3544 |
|
}, |
|
{ |
|
"epoch": 0.6446460980036298, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 1.130906040015211e-05, |
|
"loss": 0.045, |
|
"num_input_tokens_seen": 219656276, |
|
"step": 3552 |
|
}, |
|
{ |
|
"epoch": 0.6460980036297641, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 1.1226629898576818e-05, |
|
"loss": 0.0516, |
|
"num_input_tokens_seen": 220153311, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.6475499092558984, |
|
"grad_norm": 1.125, |
|
"learning_rate": 1.1144383530923505e-05, |
|
"loss": 0.04, |
|
"num_input_tokens_seen": 220641855, |
|
"step": 3568 |
|
}, |
|
{ |
|
"epoch": 0.6490018148820327, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1.1062323023364217e-05, |
|
"loss": 0.0566, |
|
"num_input_tokens_seen": 221165742, |
|
"step": 3576 |
|
}, |
|
{ |
|
"epoch": 0.650453720508167, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 1.0980450098170211e-05, |
|
"loss": 0.0598, |
|
"num_input_tokens_seen": 221645634, |
|
"step": 3584 |
|
}, |
|
{ |
|
"epoch": 0.6519056261343013, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.0898766473675795e-05, |
|
"loss": 0.0582, |
|
"num_input_tokens_seen": 222128368, |
|
"step": 3592 |
|
}, |
|
{ |
|
"epoch": 0.6533575317604355, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 1.081727386424225e-05, |
|
"loss": 0.0637, |
|
"num_input_tokens_seen": 222630366, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6548094373865698, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 1.0735973980221898e-05, |
|
"loss": 0.0319, |
|
"num_input_tokens_seen": 223132889, |
|
"step": 3608 |
|
}, |
|
{ |
|
"epoch": 0.6562613430127042, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 1.0654868527922157e-05, |
|
"loss": 0.0605, |
|
"num_input_tokens_seen": 223620866, |
|
"step": 3616 |
|
}, |
|
{ |
|
"epoch": 0.6577132486388385, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 1.0573959209569736e-05, |
|
"loss": 0.0563, |
|
"num_input_tokens_seen": 224112161, |
|
"step": 3624 |
|
}, |
|
{ |
|
"epoch": 0.6591651542649728, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.0493247723274949e-05, |
|
"loss": 0.0637, |
|
"num_input_tokens_seen": 224615692, |
|
"step": 3632 |
|
}, |
|
{ |
|
"epoch": 0.6606170598911071, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 1.0412735762996022e-05, |
|
"loss": 0.0525, |
|
"num_input_tokens_seen": 225123661, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.6620689655172414, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 1.0332425018503573e-05, |
|
"loss": 0.0448, |
|
"num_input_tokens_seen": 225606843, |
|
"step": 3648 |
|
}, |
|
{ |
|
"epoch": 0.6635208711433757, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.025231717534513e-05, |
|
"loss": 0.0511, |
|
"num_input_tokens_seen": 226083858, |
|
"step": 3656 |
|
}, |
|
{ |
|
"epoch": 0.6649727767695099, |
|
"grad_norm": 0.625, |
|
"learning_rate": 1.0172413914809791e-05, |
|
"loss": 0.0297, |
|
"num_input_tokens_seen": 226586157, |
|
"step": 3664 |
|
}, |
|
{ |
|
"epoch": 0.6664246823956442, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 1.0092716913892878e-05, |
|
"loss": 0.0542, |
|
"num_input_tokens_seen": 227090262, |
|
"step": 3672 |
|
}, |
|
{ |
|
"epoch": 0.6678765880217786, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 1.0013227845260785e-05, |
|
"loss": 0.0496, |
|
"num_input_tokens_seen": 227568348, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.6693284936479129, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 9.933948377215873e-06, |
|
"loss": 0.0474, |
|
"num_input_tokens_seen": 228069156, |
|
"step": 3688 |
|
}, |
|
{ |
|
"epoch": 0.6707803992740472, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 9.85488017366143e-06, |
|
"loss": 0.0276, |
|
"num_input_tokens_seen": 228546696, |
|
"step": 3696 |
|
}, |
|
{ |
|
"epoch": 0.6722323049001815, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.776024894066755e-06, |
|
"loss": 0.0413, |
|
"num_input_tokens_seen": 229039860, |
|
"step": 3704 |
|
}, |
|
{ |
|
"epoch": 0.6736842105263158, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 9.697384193432365e-06, |
|
"loss": 0.0398, |
|
"num_input_tokens_seen": 229524911, |
|
"step": 3712 |
|
}, |
|
{ |
|
"epoch": 0.6751361161524501, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 9.618959722255204e-06, |
|
"loss": 0.0448, |
|
"num_input_tokens_seen": 230032334, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.6765880217785843, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 9.540753126494035e-06, |
|
"loss": 0.0746, |
|
"num_input_tokens_seen": 230518610, |
|
"step": 3728 |
|
}, |
|
{ |
|
"epoch": 0.6780399274047187, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 9.462766047534915e-06, |
|
"loss": 0.0463, |
|
"num_input_tokens_seen": 231010962, |
|
"step": 3736 |
|
}, |
|
{ |
|
"epoch": 0.679491833030853, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 9.385000122156695e-06, |
|
"loss": 0.0675, |
|
"num_input_tokens_seen": 231515592, |
|
"step": 3744 |
|
}, |
|
{ |
|
"epoch": 0.6809437386569873, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 9.3074569824967e-06, |
|
"loss": 0.0627, |
|
"num_input_tokens_seen": 232031254, |
|
"step": 3752 |
|
}, |
|
{ |
|
"epoch": 0.6823956442831216, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 9.230138256016461e-06, |
|
"loss": 0.0601, |
|
"num_input_tokens_seen": 232525195, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.6838475499092559, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.153045565467605e-06, |
|
"loss": 0.0587, |
|
"num_input_tokens_seen": 232999291, |
|
"step": 3768 |
|
}, |
|
{ |
|
"epoch": 0.6852994555353902, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 9.076180528857709e-06, |
|
"loss": 0.0536, |
|
"num_input_tokens_seen": 233490579, |
|
"step": 3776 |
|
}, |
|
{ |
|
"epoch": 0.6867513611615245, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 8.999544759416413e-06, |
|
"loss": 0.0346, |
|
"num_input_tokens_seen": 234000641, |
|
"step": 3784 |
|
}, |
|
{ |
|
"epoch": 0.6867513611615245, |
|
"eval_loss": 0.04955988749861717, |
|
"eval_runtime": 2842.036, |
|
"eval_samples_per_second": 1.097, |
|
"eval_steps_per_second": 0.137, |
|
"num_input_tokens_seen": 234000641, |
|
"step": 3784 |
|
}, |
|
{ |
|
"epoch": 0.6882032667876588, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 8.923139865561525e-06, |
|
"loss": 0.0568, |
|
"num_input_tokens_seen": 234523989, |
|
"step": 3792 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 8.846967450865302e-06, |
|
"loss": 0.0471, |
|
"num_input_tokens_seen": 234995824, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6911070780399274, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.77102911402075e-06, |
|
"loss": 0.0396, |
|
"num_input_tokens_seen": 235480070, |
|
"step": 3808 |
|
}, |
|
{ |
|
"epoch": 0.6925589836660617, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 8.695326448808089e-06, |
|
"loss": 0.0427, |
|
"num_input_tokens_seen": 235969468, |
|
"step": 3816 |
|
}, |
|
{ |
|
"epoch": 0.694010889292196, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 8.61986104406132e-06, |
|
"loss": 0.0468, |
|
"num_input_tokens_seen": 236457438, |
|
"step": 3824 |
|
}, |
|
{ |
|
"epoch": 0.6954627949183303, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 8.544634483634855e-06, |
|
"loss": 0.07, |
|
"num_input_tokens_seen": 236964483, |
|
"step": 3832 |
|
}, |
|
{ |
|
"epoch": 0.6969147005444646, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 8.469648346370275e-06, |
|
"loss": 0.0681, |
|
"num_input_tokens_seen": 237478465, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.6983666061705989, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 8.39490420606323e-06, |
|
"loss": 0.0486, |
|
"num_input_tokens_seen": 237972518, |
|
"step": 3848 |
|
}, |
|
{ |
|
"epoch": 0.6998185117967333, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 8.320403631430352e-06, |
|
"loss": 0.0398, |
|
"num_input_tokens_seen": 238453985, |
|
"step": 3856 |
|
}, |
|
{ |
|
"epoch": 0.7012704174228676, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 8.246148186076367e-06, |
|
"loss": 0.0565, |
|
"num_input_tokens_seen": 238956557, |
|
"step": 3864 |
|
}, |
|
{ |
|
"epoch": 0.7027223230490018, |
|
"grad_norm": 1.125, |
|
"learning_rate": 8.172139428461292e-06, |
|
"loss": 0.0699, |
|
"num_input_tokens_seen": 239428560, |
|
"step": 3872 |
|
}, |
|
{ |
|
"epoch": 0.7041742286751361, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 8.098378911867682e-06, |
|
"loss": 0.0595, |
|
"num_input_tokens_seen": 239904462, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.7056261343012704, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 8.02486818436806e-06, |
|
"loss": 0.0696, |
|
"num_input_tokens_seen": 240404479, |
|
"step": 3888 |
|
}, |
|
{ |
|
"epoch": 0.7070780399274047, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 7.95160878879242e-06, |
|
"loss": 0.0534, |
|
"num_input_tokens_seen": 240926945, |
|
"step": 3896 |
|
}, |
|
{ |
|
"epoch": 0.708529945553539, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 7.87860226269586e-06, |
|
"loss": 0.0596, |
|
"num_input_tokens_seen": 241440836, |
|
"step": 3904 |
|
}, |
|
{ |
|
"epoch": 0.7099818511796733, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 7.805850138326282e-06, |
|
"loss": 0.035, |
|
"num_input_tokens_seen": 241942169, |
|
"step": 3912 |
|
}, |
|
{ |
|
"epoch": 0.7114337568058077, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 7.733353942592246e-06, |
|
"loss": 0.0501, |
|
"num_input_tokens_seen": 242419037, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.712885662431942, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 7.661115197030954e-06, |
|
"loss": 0.0576, |
|
"num_input_tokens_seen": 242917759, |
|
"step": 3928 |
|
}, |
|
{ |
|
"epoch": 0.7143375680580762, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 7.589135417776266e-06, |
|
"loss": 0.0394, |
|
"num_input_tokens_seen": 243411063, |
|
"step": 3936 |
|
}, |
|
{ |
|
"epoch": 0.7157894736842105, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 7.517416115526901e-06, |
|
"loss": 0.0485, |
|
"num_input_tokens_seen": 243885516, |
|
"step": 3944 |
|
}, |
|
{ |
|
"epoch": 0.7172413793103448, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 7.445958795514761e-06, |
|
"loss": 0.0642, |
|
"num_input_tokens_seen": 244397104, |
|
"step": 3952 |
|
}, |
|
{ |
|
"epoch": 0.7186932849364791, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 7.374764957473281e-06, |
|
"loss": 0.0486, |
|
"num_input_tokens_seen": 244892690, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.7201451905626134, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 7.303836095605994e-06, |
|
"loss": 0.0532, |
|
"num_input_tokens_seen": 245418852, |
|
"step": 3968 |
|
}, |
|
{ |
|
"epoch": 0.7215970961887477, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 7.233173698555174e-06, |
|
"loss": 0.0389, |
|
"num_input_tokens_seen": 245925757, |
|
"step": 3976 |
|
}, |
|
{ |
|
"epoch": 0.7230490018148821, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 7.16277924937056e-06, |
|
"loss": 0.0514, |
|
"num_input_tokens_seen": 246421511, |
|
"step": 3984 |
|
}, |
|
{ |
|
"epoch": 0.7245009074410164, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 7.092654225478257e-06, |
|
"loss": 0.041, |
|
"num_input_tokens_seen": 246952363, |
|
"step": 3992 |
|
}, |
|
{ |
|
"epoch": 0.7259528130671506, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 7.022800098649716e-06, |
|
"loss": 0.0446, |
|
"num_input_tokens_seen": 247450049, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7274047186932849, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 6.953218334970861e-06, |
|
"loss": 0.0379, |
|
"num_input_tokens_seen": 247943269, |
|
"step": 4008 |
|
}, |
|
{ |
|
"epoch": 0.7288566243194192, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 6.8839103948113e-06, |
|
"loss": 0.0394, |
|
"num_input_tokens_seen": 248447780, |
|
"step": 4016 |
|
}, |
|
{ |
|
"epoch": 0.7303085299455535, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 6.814877732793663e-06, |
|
"loss": 0.0401, |
|
"num_input_tokens_seen": 248921260, |
|
"step": 4024 |
|
}, |
|
{ |
|
"epoch": 0.7317604355716878, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 6.7461217977631325e-06, |
|
"loss": 0.0447, |
|
"num_input_tokens_seen": 249435130, |
|
"step": 4032 |
|
}, |
|
{ |
|
"epoch": 0.7332123411978222, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 6.67764403275696e-06, |
|
"loss": 0.0457, |
|
"num_input_tokens_seen": 249913307, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.7346642468239565, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 6.609445874974218e-06, |
|
"loss": 0.066, |
|
"num_input_tokens_seen": 250435878, |
|
"step": 4048 |
|
}, |
|
{ |
|
"epoch": 0.7361161524500908, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 6.5415287557456585e-06, |
|
"loss": 0.0509, |
|
"num_input_tokens_seen": 250946234, |
|
"step": 4056 |
|
}, |
|
{ |
|
"epoch": 0.737568058076225, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 6.473894100503615e-06, |
|
"loss": 0.0553, |
|
"num_input_tokens_seen": 251435205, |
|
"step": 4064 |
|
}, |
|
{ |
|
"epoch": 0.7390199637023593, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 6.4065433287521306e-06, |
|
"loss": 0.0445, |
|
"num_input_tokens_seen": 251949775, |
|
"step": 4072 |
|
}, |
|
{ |
|
"epoch": 0.7404718693284936, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 6.33947785403716e-06, |
|
"loss": 0.0626, |
|
"num_input_tokens_seen": 252447111, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.7419237749546279, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 6.272699083916885e-06, |
|
"loss": 0.0685, |
|
"num_input_tokens_seen": 252958790, |
|
"step": 4088 |
|
}, |
|
{ |
|
"epoch": 0.7433756805807622, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 6.20620841993218e-06, |
|
"loss": 0.0705, |
|
"num_input_tokens_seen": 253436330, |
|
"step": 4096 |
|
}, |
|
{ |
|
"epoch": 0.7448275862068966, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 6.1400072575772056e-06, |
|
"loss": 0.0599, |
|
"num_input_tokens_seen": 253927128, |
|
"step": 4104 |
|
}, |
|
{ |
|
"epoch": 0.7462794918330309, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 6.0740969862701195e-06, |
|
"loss": 0.0407, |
|
"num_input_tokens_seen": 254426830, |
|
"step": 4112 |
|
}, |
|
{ |
|
"epoch": 0.7477313974591652, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 6.008478989323898e-06, |
|
"loss": 0.0566, |
|
"num_input_tokens_seen": 254922990, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.7491833030852995, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 5.943154643917315e-06, |
|
"loss": 0.0498, |
|
"num_input_tokens_seen": 255423630, |
|
"step": 4128 |
|
}, |
|
{ |
|
"epoch": 0.7491833030852995, |
|
"eval_loss": 0.049039360135793686, |
|
"eval_runtime": 2629.7216, |
|
"eval_samples_per_second": 1.185, |
|
"eval_steps_per_second": 0.148, |
|
"num_input_tokens_seen": 255423630, |
|
"step": 4128 |
|
}, |
|
{ |
|
"epoch": 0.7506352087114337, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 5.87812532106606e-06, |
|
"loss": 0.0614, |
|
"num_input_tokens_seen": 255929632, |
|
"step": 4136 |
|
}, |
|
{ |
|
"epoch": 0.752087114337568, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 5.813392385593915e-06, |
|
"loss": 0.0651, |
|
"num_input_tokens_seen": 256430965, |
|
"step": 4144 |
|
}, |
|
{ |
|
"epoch": 0.7535390199637023, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 5.7489571961041415e-06, |
|
"loss": 0.0618, |
|
"num_input_tokens_seen": 256934909, |
|
"step": 4152 |
|
}, |
|
{ |
|
"epoch": 0.7549909255898367, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 5.684821104950984e-06, |
|
"loss": 0.0604, |
|
"num_input_tokens_seen": 257421654, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.756442831215971, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 5.620985458211241e-06, |
|
"loss": 0.0516, |
|
"num_input_tokens_seen": 257913684, |
|
"step": 4168 |
|
}, |
|
{ |
|
"epoch": 0.7578947368421053, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 5.55745159565604e-06, |
|
"loss": 0.0418, |
|
"num_input_tokens_seen": 258400849, |
|
"step": 4176 |
|
}, |
|
{ |
|
"epoch": 0.7593466424682396, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 5.494220850722729e-06, |
|
"loss": 0.062, |
|
"num_input_tokens_seen": 258878333, |
|
"step": 4184 |
|
}, |
|
{ |
|
"epoch": 0.7607985480943739, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 5.431294550486869e-06, |
|
"loss": 0.0615, |
|
"num_input_tokens_seen": 259369068, |
|
"step": 4192 |
|
}, |
|
{ |
|
"epoch": 0.7622504537205081, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 5.3686740156343805e-06, |
|
"loss": 0.0584, |
|
"num_input_tokens_seen": 259870513, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7637023593466424, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 5.306360560433854e-06, |
|
"loss": 0.0419, |
|
"num_input_tokens_seen": 260370376, |
|
"step": 4208 |
|
}, |
|
{ |
|
"epoch": 0.7651542649727767, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 5.244355492708941e-06, |
|
"loss": 0.0582, |
|
"num_input_tokens_seen": 260881761, |
|
"step": 4216 |
|
}, |
|
{ |
|
"epoch": 0.7666061705989111, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 5.182660113810907e-06, |
|
"loss": 0.0468, |
|
"num_input_tokens_seen": 261402673, |
|
"step": 4224 |
|
}, |
|
{ |
|
"epoch": 0.7680580762250454, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 5.121275718591321e-06, |
|
"loss": 0.0686, |
|
"num_input_tokens_seen": 261898525, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 0.7695099818511797, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 5.0602035953748865e-06, |
|
"loss": 0.0624, |
|
"num_input_tokens_seen": 262392396, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.770961887477314, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 4.999445025932408e-06, |
|
"loss": 0.0429, |
|
"num_input_tokens_seen": 262882816, |
|
"step": 4248 |
|
}, |
|
{ |
|
"epoch": 0.7724137931034483, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 4.939001285453864e-06, |
|
"loss": 0.0372, |
|
"num_input_tokens_seen": 263383267, |
|
"step": 4256 |
|
}, |
|
{ |
|
"epoch": 0.7738656987295826, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 4.8788736425216595e-06, |
|
"loss": 0.0343, |
|
"num_input_tokens_seen": 263858756, |
|
"step": 4264 |
|
}, |
|
{ |
|
"epoch": 0.7753176043557168, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 4.81906335908402e-06, |
|
"loss": 0.048, |
|
"num_input_tokens_seen": 264345998, |
|
"step": 4272 |
|
}, |
|
{ |
|
"epoch": 0.7767695099818511, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 4.759571690428464e-06, |
|
"loss": 0.0595, |
|
"num_input_tokens_seen": 264834486, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.7782214156079855, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 4.700399885155487e-06, |
|
"loss": 0.0456, |
|
"num_input_tokens_seen": 265331269, |
|
"step": 4288 |
|
}, |
|
{ |
|
"epoch": 0.7796733212341198, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.641549185152359e-06, |
|
"loss": 0.0374, |
|
"num_input_tokens_seen": 265836347, |
|
"step": 4296 |
|
}, |
|
{ |
|
"epoch": 0.7811252268602541, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 4.583020825567039e-06, |
|
"loss": 0.0359, |
|
"num_input_tokens_seen": 266324737, |
|
"step": 4304 |
|
}, |
|
{ |
|
"epoch": 0.7825771324863884, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 4.524816034782263e-06, |
|
"loss": 0.0575, |
|
"num_input_tokens_seen": 266808164, |
|
"step": 4312 |
|
}, |
|
{ |
|
"epoch": 0.7840290381125227, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 4.46693603438977e-06, |
|
"loss": 0.0502, |
|
"num_input_tokens_seen": 267324813, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.785480943738657, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 4.409382039164653e-06, |
|
"loss": 0.063, |
|
"num_input_tokens_seen": 267822646, |
|
"step": 4328 |
|
}, |
|
{ |
|
"epoch": 0.7869328493647912, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 4.352155257039865e-06, |
|
"loss": 0.0736, |
|
"num_input_tokens_seen": 268320339, |
|
"step": 4336 |
|
}, |
|
{ |
|
"epoch": 0.7883847549909256, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 4.295256889080865e-06, |
|
"loss": 0.0568, |
|
"num_input_tokens_seen": 268805229, |
|
"step": 4344 |
|
}, |
|
{ |
|
"epoch": 0.7898366606170599, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.238688129460431e-06, |
|
"loss": 0.0398, |
|
"num_input_tokens_seen": 269290686, |
|
"step": 4352 |
|
}, |
|
{ |
|
"epoch": 0.7912885662431942, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 4.18245016543356e-06, |
|
"loss": 0.0468, |
|
"num_input_tokens_seen": 269771817, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.7927404718693285, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 4.126544177312577e-06, |
|
"loss": 0.0497, |
|
"num_input_tokens_seen": 270261530, |
|
"step": 4368 |
|
}, |
|
{ |
|
"epoch": 0.7941923774954628, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 4.0709713384423685e-06, |
|
"loss": 0.0356, |
|
"num_input_tokens_seen": 270769688, |
|
"step": 4376 |
|
}, |
|
{ |
|
"epoch": 0.7956442831215971, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.015732815175728e-06, |
|
"loss": 0.0573, |
|
"num_input_tokens_seen": 271284923, |
|
"step": 4384 |
|
}, |
|
{ |
|
"epoch": 0.7970961887477314, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 3.960829766848893e-06, |
|
"loss": 0.056, |
|
"num_input_tokens_seen": 271756884, |
|
"step": 4392 |
|
}, |
|
{ |
|
"epoch": 0.7985480943738656, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 3.906263345757231e-06, |
|
"loss": 0.0309, |
|
"num_input_tokens_seen": 272248473, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.852034697131015e-06, |
|
"loss": 0.0447, |
|
"num_input_tokens_seen": 272755455, |
|
"step": 4408 |
|
}, |
|
{ |
|
"epoch": 0.8014519056261343, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 3.7981449591114207e-06, |
|
"loss": 0.0459, |
|
"num_input_tokens_seen": 273244979, |
|
"step": 4416 |
|
}, |
|
{ |
|
"epoch": 0.8029038112522686, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 3.7445952627266336e-06, |
|
"loss": 0.0642, |
|
"num_input_tokens_seen": 273749266, |
|
"step": 4424 |
|
}, |
|
{ |
|
"epoch": 0.8043557168784029, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.6913867318680984e-06, |
|
"loss": 0.0455, |
|
"num_input_tokens_seen": 274271081, |
|
"step": 4432 |
|
}, |
|
{ |
|
"epoch": 0.8058076225045372, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 3.6385204832669385e-06, |
|
"loss": 0.0414, |
|
"num_input_tokens_seen": 274770517, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.8072595281306715, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 3.585997626470519e-06, |
|
"loss": 0.0426, |
|
"num_input_tokens_seen": 275248505, |
|
"step": 4448 |
|
}, |
|
{ |
|
"epoch": 0.8087114337568058, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 3.533819263819167e-06, |
|
"loss": 0.0498, |
|
"num_input_tokens_seen": 275748095, |
|
"step": 4456 |
|
}, |
|
{ |
|
"epoch": 0.8101633393829402, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 3.4819864904230195e-06, |
|
"loss": 0.0508, |
|
"num_input_tokens_seen": 276242421, |
|
"step": 4464 |
|
}, |
|
{ |
|
"epoch": 0.8116152450090744, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 3.4305003941390468e-06, |
|
"loss": 0.0605, |
|
"num_input_tokens_seen": 276731693, |
|
"step": 4472 |
|
}, |
|
{ |
|
"epoch": 0.8116152450090744, |
|
"eval_loss": 0.04871319234371185, |
|
"eval_runtime": 2768.9798, |
|
"eval_samples_per_second": 1.126, |
|
"eval_steps_per_second": 0.141, |
|
"num_input_tokens_seen": 276731693, |
|
"step": 4472 |
|
}, |
|
{ |
|
"epoch": 0.8130671506352087, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 3.3793620555482322e-06, |
|
"loss": 0.053, |
|
"num_input_tokens_seen": 277218277, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.814519056261343, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 3.3285725479328757e-06, |
|
"loss": 0.0582, |
|
"num_input_tokens_seen": 277705169, |
|
"step": 4488 |
|
}, |
|
{ |
|
"epoch": 0.8159709618874773, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 3.2781329372540683e-06, |
|
"loss": 0.0618, |
|
"num_input_tokens_seen": 278213285, |
|
"step": 4496 |
|
}, |
|
{ |
|
"epoch": 0.8174228675136116, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 3.2280442821293455e-06, |
|
"loss": 0.0556, |
|
"num_input_tokens_seen": 278697097, |
|
"step": 4504 |
|
}, |
|
{ |
|
"epoch": 0.8188747731397459, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 3.178307633810436e-06, |
|
"loss": 0.0526, |
|
"num_input_tokens_seen": 279193929, |
|
"step": 4512 |
|
}, |
|
{ |
|
"epoch": 0.8203266787658802, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 3.128924036161207e-06, |
|
"loss": 0.0411, |
|
"num_input_tokens_seen": 279698041, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.8217785843920146, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 3.079894525635783e-06, |
|
"loss": 0.0505, |
|
"num_input_tokens_seen": 280182805, |
|
"step": 4528 |
|
}, |
|
{ |
|
"epoch": 0.8232304900181489, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 3.0312201312567536e-06, |
|
"loss": 0.04, |
|
"num_input_tokens_seen": 280651028, |
|
"step": 4536 |
|
}, |
|
{ |
|
"epoch": 0.8246823956442831, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 2.982901874593598e-06, |
|
"loss": 0.0696, |
|
"num_input_tokens_seen": 281162798, |
|
"step": 4544 |
|
}, |
|
{ |
|
"epoch": 0.8261343012704174, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 2.934940769741239e-06, |
|
"loss": 0.0356, |
|
"num_input_tokens_seen": 281658265, |
|
"step": 4552 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 2.8873378232987726e-06, |
|
"loss": 0.0503, |
|
"num_input_tokens_seen": 282170245, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.829038112522686, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 2.840094034348315e-06, |
|
"loss": 0.0471, |
|
"num_input_tokens_seen": 282655198, |
|
"step": 4568 |
|
}, |
|
{ |
|
"epoch": 0.8304900181488203, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 2.793210394434056e-06, |
|
"loss": 0.0615, |
|
"num_input_tokens_seen": 283132416, |
|
"step": 4576 |
|
}, |
|
{ |
|
"epoch": 0.8319419237749546, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 2.746687887541448e-06, |
|
"loss": 0.0537, |
|
"num_input_tokens_seen": 283628667, |
|
"step": 4584 |
|
}, |
|
{ |
|
"epoch": 0.833393829401089, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 2.700527490076539e-06, |
|
"loss": 0.0375, |
|
"num_input_tokens_seen": 284146751, |
|
"step": 4592 |
|
}, |
|
{ |
|
"epoch": 0.8348457350272233, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.6547301708454877e-06, |
|
"loss": 0.041, |
|
"num_input_tokens_seen": 284643128, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8362976406533575, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 2.609296891034241e-06, |
|
"loss": 0.0473, |
|
"num_input_tokens_seen": 285145371, |
|
"step": 4608 |
|
}, |
|
{ |
|
"epoch": 0.8377495462794918, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 2.5642286041883458e-06, |
|
"loss": 0.0472, |
|
"num_input_tokens_seen": 285639963, |
|
"step": 4616 |
|
}, |
|
{ |
|
"epoch": 0.8392014519056261, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 2.519526256192939e-06, |
|
"loss": 0.0493, |
|
"num_input_tokens_seen": 286128983, |
|
"step": 4624 |
|
}, |
|
{ |
|
"epoch": 0.8406533575317604, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 2.47519078525291e-06, |
|
"loss": 0.0726, |
|
"num_input_tokens_seen": 286625920, |
|
"step": 4632 |
|
}, |
|
{ |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 2.431223121873183e-06, |
|
"loss": 0.0465, |
|
"num_input_tokens_seen": 287119525, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.8435571687840291, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 2.3876241888392173e-06, |
|
"loss": 0.0553, |
|
"num_input_tokens_seen": 287610722, |
|
"step": 4648 |
|
}, |
|
{ |
|
"epoch": 0.8450090744101634, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 2.3443949011976107e-06, |
|
"loss": 0.0428, |
|
"num_input_tokens_seen": 288097243, |
|
"step": 4656 |
|
}, |
|
{ |
|
"epoch": 0.8464609800362977, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 2.301536166236926e-06, |
|
"loss": 0.048, |
|
"num_input_tokens_seen": 288598177, |
|
"step": 4664 |
|
}, |
|
{ |
|
"epoch": 0.847912885662432, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 2.259048883468622e-06, |
|
"loss": 0.0436, |
|
"num_input_tokens_seen": 289095940, |
|
"step": 4672 |
|
}, |
|
{ |
|
"epoch": 0.8493647912885662, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 2.216933944608184e-06, |
|
"loss": 0.0525, |
|
"num_input_tokens_seen": 289579822, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.8508166969147005, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 2.1751922335564134e-06, |
|
"loss": 0.0752, |
|
"num_input_tokens_seen": 290090500, |
|
"step": 4688 |
|
}, |
|
{ |
|
"epoch": 0.8522686025408348, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 2.13382462638088e-06, |
|
"loss": 0.0348, |
|
"num_input_tokens_seen": 290583181, |
|
"step": 4696 |
|
}, |
|
{ |
|
"epoch": 0.8537205081669691, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 2.0928319912975193e-06, |
|
"loss": 0.063, |
|
"num_input_tokens_seen": 291086649, |
|
"step": 4704 |
|
}, |
|
{ |
|
"epoch": 0.8551724137931035, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 2.0522151886524153e-06, |
|
"loss": 0.0492, |
|
"num_input_tokens_seen": 291577384, |
|
"step": 4712 |
|
}, |
|
{ |
|
"epoch": 0.8566243194192378, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 2.0119750709037646e-06, |
|
"loss": 0.0428, |
|
"num_input_tokens_seen": 292058725, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.8580762250453721, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 1.972112482603954e-06, |
|
"loss": 0.074, |
|
"num_input_tokens_seen": 292542677, |
|
"step": 4728 |
|
}, |
|
{ |
|
"epoch": 0.8595281306715064, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 1.9326282603818526e-06, |
|
"loss": 0.0493, |
|
"num_input_tokens_seen": 293025201, |
|
"step": 4736 |
|
}, |
|
{ |
|
"epoch": 0.8609800362976406, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 1.8935232329252585e-06, |
|
"loss": 0.0431, |
|
"num_input_tokens_seen": 293508845, |
|
"step": 4744 |
|
}, |
|
{ |
|
"epoch": 0.8624319419237749, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 1.854798220963485e-06, |
|
"loss": 0.0356, |
|
"num_input_tokens_seen": 293995884, |
|
"step": 4752 |
|
}, |
|
{ |
|
"epoch": 0.8638838475499092, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 1.816454037250155e-06, |
|
"loss": 0.0548, |
|
"num_input_tokens_seen": 294512519, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.8653357531760436, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 1.778491486546141e-06, |
|
"loss": 0.0409, |
|
"num_input_tokens_seen": 295012760, |
|
"step": 4768 |
|
}, |
|
{ |
|
"epoch": 0.8667876588021779, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 1.7409113656026643e-06, |
|
"loss": 0.0336, |
|
"num_input_tokens_seen": 295509942, |
|
"step": 4776 |
|
}, |
|
{ |
|
"epoch": 0.8682395644283122, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 1.7037144631445745e-06, |
|
"loss": 0.0413, |
|
"num_input_tokens_seen": 296013081, |
|
"step": 4784 |
|
}, |
|
{ |
|
"epoch": 0.8696914700544465, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.666901559853804e-06, |
|
"loss": 0.0387, |
|
"num_input_tokens_seen": 296492427, |
|
"step": 4792 |
|
}, |
|
{ |
|
"epoch": 0.8711433756805808, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 1.63047342835299e-06, |
|
"loss": 0.0468, |
|
"num_input_tokens_seen": 297011120, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.872595281306715, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.594430833189231e-06, |
|
"loss": 0.0518, |
|
"num_input_tokens_seen": 297502338, |
|
"step": 4808 |
|
}, |
|
{ |
|
"epoch": 0.8740471869328493, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 1.5587745308180656e-06, |
|
"loss": 0.055, |
|
"num_input_tokens_seen": 298011343, |
|
"step": 4816 |
|
}, |
|
{ |
|
"epoch": 0.8740471869328493, |
|
"eval_loss": 0.04861417040228844, |
|
"eval_runtime": 2715.815, |
|
"eval_samples_per_second": 1.148, |
|
"eval_steps_per_second": 0.144, |
|
"num_input_tokens_seen": 298011343, |
|
"step": 4816 |
|
}, |
|
{ |
|
"epoch": 0.8754990925589836, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 1.523505269587595e-06, |
|
"loss": 0.0366, |
|
"num_input_tokens_seen": 298524933, |
|
"step": 4824 |
|
}, |
|
{ |
|
"epoch": 0.876950998185118, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 1.4886237897227584e-06, |
|
"loss": 0.0466, |
|
"num_input_tokens_seen": 299031985, |
|
"step": 4832 |
|
}, |
|
{ |
|
"epoch": 0.8784029038112523, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 1.4541308233098117e-06, |
|
"loss": 0.0472, |
|
"num_input_tokens_seen": 299512381, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.8798548094373866, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 1.420027094280969e-06, |
|
"loss": 0.0585, |
|
"num_input_tokens_seen": 300023962, |
|
"step": 4848 |
|
}, |
|
{ |
|
"epoch": 0.8813067150635209, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 1.3863133183991905e-06, |
|
"loss": 0.0455, |
|
"num_input_tokens_seen": 300499402, |
|
"step": 4856 |
|
}, |
|
{ |
|
"epoch": 0.8827586206896552, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 1.3529902032431698e-06, |
|
"loss": 0.0572, |
|
"num_input_tokens_seen": 301015365, |
|
"step": 4864 |
|
}, |
|
{ |
|
"epoch": 0.8842105263157894, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 1.3200584481924915e-06, |
|
"loss": 0.054, |
|
"num_input_tokens_seen": 301509565, |
|
"step": 4872 |
|
}, |
|
{ |
|
"epoch": 0.8856624319419237, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 1.2875187444129366e-06, |
|
"loss": 0.0505, |
|
"num_input_tokens_seen": 302023484, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.8871143375680581, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 1.2553717748419846e-06, |
|
"loss": 0.0426, |
|
"num_input_tokens_seen": 302520603, |
|
"step": 4888 |
|
}, |
|
{ |
|
"epoch": 0.8885662431941924, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.2236182141744757e-06, |
|
"loss": 0.0495, |
|
"num_input_tokens_seen": 303012766, |
|
"step": 4896 |
|
}, |
|
{ |
|
"epoch": 0.8900181488203267, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 1.192258728848472e-06, |
|
"loss": 0.0561, |
|
"num_input_tokens_seen": 303502416, |
|
"step": 4904 |
|
}, |
|
{ |
|
"epoch": 0.891470054446461, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 1.1612939770312325e-06, |
|
"loss": 0.0365, |
|
"num_input_tokens_seen": 304003546, |
|
"step": 4912 |
|
}, |
|
{ |
|
"epoch": 0.8929219600725953, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 1.130724608605427e-06, |
|
"loss": 0.05, |
|
"num_input_tokens_seen": 304494827, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.8943738656987296, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 1.1005512651554983e-06, |
|
"loss": 0.0365, |
|
"num_input_tokens_seen": 304962434, |
|
"step": 4928 |
|
}, |
|
{ |
|
"epoch": 0.8958257713248639, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 1.0707745799541748e-06, |
|
"loss": 0.0505, |
|
"num_input_tokens_seen": 305453792, |
|
"step": 4936 |
|
}, |
|
{ |
|
"epoch": 0.8972776769509981, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.041395177949196e-06, |
|
"loss": 0.0371, |
|
"num_input_tokens_seen": 305940285, |
|
"step": 4944 |
|
}, |
|
{ |
|
"epoch": 0.8987295825771325, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.0124136757502012e-06, |
|
"loss": 0.0523, |
|
"num_input_tokens_seen": 306438405, |
|
"step": 4952 |
|
}, |
|
{ |
|
"epoch": 0.9001814882032668, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.838306816157695e-07, |
|
"loss": 0.0405, |
|
"num_input_tokens_seen": 306937715, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.9016333938294011, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 9.556467954406634e-07, |
|
"loss": 0.0742, |
|
"num_input_tokens_seen": 307458431, |
|
"step": 4968 |
|
}, |
|
{ |
|
"epoch": 0.9030852994555354, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 9.278626087432529e-07, |
|
"loss": 0.049, |
|
"num_input_tokens_seen": 307956789, |
|
"step": 4976 |
|
}, |
|
{ |
|
"epoch": 0.9045372050816697, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 9.004787046530694e-07, |
|
"loss": 0.0432, |
|
"num_input_tokens_seen": 308463995, |
|
"step": 4984 |
|
}, |
|
{ |
|
"epoch": 0.905989110707804, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 8.734956578985976e-07, |
|
"loss": 0.057, |
|
"num_input_tokens_seen": 308971509, |
|
"step": 4992 |
|
}, |
|
{ |
|
"epoch": 0.9074410163339383, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 8.469140347951898e-07, |
|
"loss": 0.0461, |
|
"num_input_tokens_seen": 309453074, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9088929219600725, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 8.207343932332023e-07, |
|
"loss": 0.042, |
|
"num_input_tokens_seen": 309930257, |
|
"step": 5008 |
|
}, |
|
{ |
|
"epoch": 0.9103448275862069, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 7.949572826662622e-07, |
|
"loss": 0.077, |
|
"num_input_tokens_seen": 310432591, |
|
"step": 5016 |
|
}, |
|
{ |
|
"epoch": 0.9117967332123412, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 7.695832440997563e-07, |
|
"loss": 0.0504, |
|
"num_input_tokens_seen": 310899484, |
|
"step": 5024 |
|
}, |
|
{ |
|
"epoch": 0.9132486388384755, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 7.44612810079468e-07, |
|
"loss": 0.0577, |
|
"num_input_tokens_seen": 311385620, |
|
"step": 5032 |
|
}, |
|
{ |
|
"epoch": 0.9147005444646098, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 7.200465046803984e-07, |
|
"loss": 0.065, |
|
"num_input_tokens_seen": 311886953, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.9161524500907441, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 6.958848434957643e-07, |
|
"loss": 0.0473, |
|
"num_input_tokens_seen": 312387145, |
|
"step": 5048 |
|
}, |
|
{ |
|
"epoch": 0.9176043557168784, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 6.721283336261964e-07, |
|
"loss": 0.0464, |
|
"num_input_tokens_seen": 312865084, |
|
"step": 5056 |
|
}, |
|
{ |
|
"epoch": 0.9190562613430127, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 6.487774736690688e-07, |
|
"loss": 0.0462, |
|
"num_input_tokens_seen": 313342169, |
|
"step": 5064 |
|
}, |
|
{ |
|
"epoch": 0.9205081669691471, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 6.258327537080488e-07, |
|
"loss": 0.0407, |
|
"num_input_tokens_seen": 313820850, |
|
"step": 5072 |
|
}, |
|
{ |
|
"epoch": 0.9219600725952813, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 6.032946553028196e-07, |
|
"loss": 0.048, |
|
"num_input_tokens_seen": 314294169, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.9234119782214156, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 5.811636514789598e-07, |
|
"loss": 0.0393, |
|
"num_input_tokens_seen": 314789090, |
|
"step": 5088 |
|
}, |
|
{ |
|
"epoch": 0.9248638838475499, |
|
"grad_norm": 0.5, |
|
"learning_rate": 5.594402067180116e-07, |
|
"loss": 0.0466, |
|
"num_input_tokens_seen": 315317576, |
|
"step": 5096 |
|
}, |
|
{ |
|
"epoch": 0.9263157894736842, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 5.381247769477504e-07, |
|
"loss": 0.0336, |
|
"num_input_tokens_seen": 315804951, |
|
"step": 5104 |
|
}, |
|
{ |
|
"epoch": 0.9277676950998185, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 5.172178095326019e-07, |
|
"loss": 0.0515, |
|
"num_input_tokens_seen": 316286642, |
|
"step": 5112 |
|
}, |
|
{ |
|
"epoch": 0.9292196007259528, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.967197432642579e-07, |
|
"loss": 0.079, |
|
"num_input_tokens_seen": 316792651, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.9306715063520871, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.7663100835246614e-07, |
|
"loss": 0.0423, |
|
"num_input_tokens_seen": 317277912, |
|
"step": 5128 |
|
}, |
|
{ |
|
"epoch": 0.9321234119782215, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 4.569520264159977e-07, |
|
"loss": 0.0307, |
|
"num_input_tokens_seen": 317761276, |
|
"step": 5136 |
|
}, |
|
{ |
|
"epoch": 0.9335753176043557, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 4.3768321047380936e-07, |
|
"loss": 0.0443, |
|
"num_input_tokens_seen": 318275629, |
|
"step": 5144 |
|
}, |
|
{ |
|
"epoch": 0.93502722323049, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.188249649363596e-07, |
|
"loss": 0.037, |
|
"num_input_tokens_seen": 318764138, |
|
"step": 5152 |
|
}, |
|
{ |
|
"epoch": 0.9364791288566243, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 4.0037768559712864e-07, |
|
"loss": 0.0398, |
|
"num_input_tokens_seen": 319237492, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.9364791288566243, |
|
"eval_loss": 0.04859951138496399, |
|
"eval_runtime": 2495.2416, |
|
"eval_samples_per_second": 1.249, |
|
"eval_steps_per_second": 0.156, |
|
"num_input_tokens_seen": 319237492, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.9379310344827586, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 3.8234175962432284e-07, |
|
"loss": 0.0643, |
|
"num_input_tokens_seen": 319726771, |
|
"step": 5168 |
|
}, |
|
{ |
|
"epoch": 0.9393829401088929, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 3.647175655527235e-07, |
|
"loss": 0.0545, |
|
"num_input_tokens_seen": 320207370, |
|
"step": 5176 |
|
}, |
|
{ |
|
"epoch": 0.9408348457350272, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.4750547327576434e-07, |
|
"loss": 0.0645, |
|
"num_input_tokens_seen": 320689649, |
|
"step": 5184 |
|
}, |
|
{ |
|
"epoch": 0.9422867513611616, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 3.3070584403775754e-07, |
|
"loss": 0.0368, |
|
"num_input_tokens_seen": 321189372, |
|
"step": 5192 |
|
}, |
|
{ |
|
"epoch": 0.9437386569872959, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 3.143190304263177e-07, |
|
"loss": 0.0461, |
|
"num_input_tokens_seen": 321681717, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9451905626134302, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 2.9834537636495466e-07, |
|
"loss": 0.0348, |
|
"num_input_tokens_seen": 322172599, |
|
"step": 5208 |
|
}, |
|
{ |
|
"epoch": 0.9466424682395644, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 2.8278521710586315e-07, |
|
"loss": 0.0484, |
|
"num_input_tokens_seen": 322668094, |
|
"step": 5216 |
|
}, |
|
{ |
|
"epoch": 0.9480943738656987, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.6763887922288236e-07, |
|
"loss": 0.0589, |
|
"num_input_tokens_seen": 323137080, |
|
"step": 5224 |
|
}, |
|
{ |
|
"epoch": 0.949546279491833, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 2.5290668060464095e-07, |
|
"loss": 0.0323, |
|
"num_input_tokens_seen": 323645462, |
|
"step": 5232 |
|
}, |
|
{ |
|
"epoch": 0.9509981851179673, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 2.385889304478872e-07, |
|
"loss": 0.05, |
|
"num_input_tokens_seen": 324137149, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.9524500907441016, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 2.2468592925100062e-07, |
|
"loss": 0.0392, |
|
"num_input_tokens_seen": 324621626, |
|
"step": 5248 |
|
}, |
|
{ |
|
"epoch": 0.953901996370236, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 2.1119796880768374e-07, |
|
"loss": 0.0468, |
|
"num_input_tokens_seen": 325115784, |
|
"step": 5256 |
|
}, |
|
{ |
|
"epoch": 0.9553539019963703, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 1.9812533220083362e-07, |
|
"loss": 0.0679, |
|
"num_input_tokens_seen": 325614737, |
|
"step": 5264 |
|
}, |
|
{ |
|
"epoch": 0.9568058076225046, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 1.8546829379661125e-07, |
|
"loss": 0.07, |
|
"num_input_tokens_seen": 326095021, |
|
"step": 5272 |
|
}, |
|
{ |
|
"epoch": 0.9582577132486388, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 1.7322711923867475e-07, |
|
"loss": 0.0609, |
|
"num_input_tokens_seen": 326613882, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.9597096188747731, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 1.6140206544260407e-07, |
|
"loss": 0.0323, |
|
"num_input_tokens_seen": 327087152, |
|
"step": 5288 |
|
}, |
|
{ |
|
"epoch": 0.9611615245009074, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 1.4999338059051184e-07, |
|
"loss": 0.0431, |
|
"num_input_tokens_seen": 327601813, |
|
"step": 5296 |
|
}, |
|
{ |
|
"epoch": 0.9626134301270417, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 1.3900130412583646e-07, |
|
"loss": 0.0378, |
|
"num_input_tokens_seen": 328093647, |
|
"step": 5304 |
|
}, |
|
{ |
|
"epoch": 0.964065335753176, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.2842606674831058e-07, |
|
"loss": 0.0777, |
|
"num_input_tokens_seen": 328588015, |
|
"step": 5312 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 1.1826789040912723e-07, |
|
"loss": 0.0603, |
|
"num_input_tokens_seen": 329080878, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.9669691470054447, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 1.0852698830627007e-07, |
|
"loss": 0.0433, |
|
"num_input_tokens_seen": 329543543, |
|
"step": 5328 |
|
}, |
|
{ |
|
"epoch": 0.968421052631579, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 9.920356488005045e-08, |
|
"loss": 0.0625, |
|
"num_input_tokens_seen": 330031499, |
|
"step": 5336 |
|
}, |
|
{ |
|
"epoch": 0.9698729582577132, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 9.029781580881081e-08, |
|
"loss": 0.0408, |
|
"num_input_tokens_seen": 330508472, |
|
"step": 5344 |
|
}, |
|
{ |
|
"epoch": 0.9713248638838475, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 8.180992800482124e-08, |
|
"loss": 0.0362, |
|
"num_input_tokens_seen": 330999368, |
|
"step": 5352 |
|
}, |
|
{ |
|
"epoch": 0.9727767695099818, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 7.374007961035157e-08, |
|
"loss": 0.0372, |
|
"num_input_tokens_seen": 331494527, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.9742286751361161, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 6.608843999393655e-08, |
|
"loss": 0.0544, |
|
"num_input_tokens_seen": 331992801, |
|
"step": 5368 |
|
}, |
|
{ |
|
"epoch": 0.9756805807622505, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 5.885516974681871e-08, |
|
"loss": 0.0434, |
|
"num_input_tokens_seen": 332484019, |
|
"step": 5376 |
|
}, |
|
{ |
|
"epoch": 0.9771324863883848, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 5.2040420679577706e-08, |
|
"loss": 0.0463, |
|
"num_input_tokens_seen": 332971275, |
|
"step": 5384 |
|
}, |
|
{ |
|
"epoch": 0.9785843920145191, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 4.564433581895067e-08, |
|
"loss": 0.0291, |
|
"num_input_tokens_seen": 333465979, |
|
"step": 5392 |
|
}, |
|
{ |
|
"epoch": 0.9800362976406534, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 3.966704940482347e-08, |
|
"loss": 0.0428, |
|
"num_input_tokens_seen": 333965786, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.9814882032667877, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 3.4108686887408537e-08, |
|
"loss": 0.0382, |
|
"num_input_tokens_seen": 334462422, |
|
"step": 5408 |
|
}, |
|
{ |
|
"epoch": 0.9829401088929219, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 2.8969364924629205e-08, |
|
"loss": 0.0335, |
|
"num_input_tokens_seen": 334957763, |
|
"step": 5416 |
|
}, |
|
{ |
|
"epoch": 0.9843920145190562, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 2.424919137965276e-08, |
|
"loss": 0.0386, |
|
"num_input_tokens_seen": 335453503, |
|
"step": 5424 |
|
}, |
|
{ |
|
"epoch": 0.9858439201451905, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 1.9948265318638915e-08, |
|
"loss": 0.0471, |
|
"num_input_tokens_seen": 335956152, |
|
"step": 5432 |
|
}, |
|
{ |
|
"epoch": 0.9872958257713249, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 1.606667700865261e-08, |
|
"loss": 0.0428, |
|
"num_input_tokens_seen": 336428666, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.9887477313974592, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 1.2604507915774389e-08, |
|
"loss": 0.0409, |
|
"num_input_tokens_seen": 336955164, |
|
"step": 5448 |
|
}, |
|
{ |
|
"epoch": 0.9901996370235935, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 9.561830703390673e-09, |
|
"loss": 0.0468, |
|
"num_input_tokens_seen": 337481648, |
|
"step": 5456 |
|
}, |
|
{ |
|
"epoch": 0.9916515426497278, |
|
"grad_norm": 1.6953125, |
|
"learning_rate": 6.938709230666085e-09, |
|
"loss": 0.0517, |
|
"num_input_tokens_seen": 337980342, |
|
"step": 5464 |
|
}, |
|
{ |
|
"epoch": 0.993103448275862, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 4.7351985512067435e-09, |
|
"loss": 0.0586, |
|
"num_input_tokens_seen": 338476887, |
|
"step": 5472 |
|
}, |
|
{ |
|
"epoch": 0.9945553539019963, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 2.9513449118967475e-09, |
|
"loss": 0.0758, |
|
"num_input_tokens_seen": 338954735, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.9960072595281306, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 1.5871857519411671e-09, |
|
"loss": 0.0453, |
|
"num_input_tokens_seen": 339472532, |
|
"step": 5488 |
|
}, |
|
{ |
|
"epoch": 0.997459165154265, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 6.427497020644602e-10, |
|
"loss": 0.0365, |
|
"num_input_tokens_seen": 339948028, |
|
"step": 5496 |
|
}, |
|
{ |
|
"epoch": 0.9989110707803993, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 1.1805658392427533e-10, |
|
"loss": 0.0511, |
|
"num_input_tokens_seen": 340437678, |
|
"step": 5504 |
|
}, |
|
{ |
|
"epoch": 0.9989110707803993, |
|
"eval_loss": 0.04862402379512787, |
|
"eval_runtime": 2527.5451, |
|
"eval_samples_per_second": 1.233, |
|
"eval_steps_per_second": 0.154, |
|
"num_input_tokens_seen": 340437678, |
|
"step": 5504 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"num_input_tokens_seen": 340779614, |
|
"step": 5510, |
|
"total_flos": 1.7763887818171482e+19, |
|
"train_loss": 0.06540190598601221, |
|
"train_runtime": 392745.8674, |
|
"train_samples_per_second": 0.786, |
|
"train_steps_per_second": 0.014, |
|
"train_tokens_per_second": 108.825 |
|
} |
  ],
  "logging_steps": 8,
  "max_steps": 5510,
  "num_input_tokens_seen": 340779614,
  "num_train_epochs": 1,
  "save_steps": 688,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.7763887818171482e+19,
  "train_batch_size": 7,
  "trial_name": null,
  "trial_params": null
}