{
  "best_metric": 0.7249829173088074,
  "best_model_checkpoint": "saves/starcoder2-7b/lora/sft/checkpoint-5000",
  "epoch": 0.7980845969672785,
  "eval_steps": 100,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0, "grad_norm": 0.628385603427887, "learning_rate": 4.999999126897802e-05, "loss": 1.2582, "step": 5},
    {"epoch": 0.0, "grad_norm": 1.0855119228363037, "learning_rate": 4.999996507591817e-05, "loss": 0.801, "step": 10},
    {"epoch": 0.0, "grad_norm": 1.5689586400985718, "learning_rate": 4.9999921420838745e-05, "loss": 1.067, "step": 15},
    {"epoch": 0.0, "grad_norm": 2.0851330757141113, "learning_rate": 4.999986030377024e-05, "loss": 1.2953, "step": 20},
    {"epoch": 0.0, "grad_norm": 1.397479772567749, "learning_rate": 4.999978172475535e-05, "loss": 0.9826, "step": 25},
    {"epoch": 0.0, "grad_norm": 1.344118595123291, "learning_rate": 4.9999685683848954e-05, "loss": 0.9485, "step": 30},
    {"epoch": 0.01, "grad_norm": 1.158163070678711, "learning_rate": 4.9999596278606616e-05, "loss": 0.8103, "step": 35},
    {"epoch": 0.01, "grad_norm": 1.602233648300171, "learning_rate": 4.999946880647276e-05, "loss": 0.8648, "step": 40},
    {"epoch": 0.01, "grad_norm": 1.557242751121521, "learning_rate": 4.999932387266596e-05, "loss": 1.0198, "step": 45},
    {"epoch": 0.01, "grad_norm": 1.36068856716156, "learning_rate": 4.999916147728746e-05, "loss": 0.9367, "step": 50},
    {"epoch": 0.01, "grad_norm": 1.3263639211654663, "learning_rate": 4.999898162045068e-05, "loss": 0.9695, "step": 55},
    {"epoch": 0.01, "grad_norm": 1.333601474761963, "learning_rate": 4.999878430228126e-05, "loss": 1.1509, "step": 60},
    {"epoch": 0.01, "grad_norm": 1.4753800630569458, "learning_rate": 4.999856952291702e-05, "loss": 1.1461, "step": 65},
    {"epoch": 0.01, "grad_norm": 1.5096240043640137, "learning_rate": 4.9998337282507965e-05, "loss": 1.1722, "step": 70},
    {"epoch": 0.01, "grad_norm": 1.189892053604126, "learning_rate": 4.999808758121633e-05, "loss": 1.1834, "step": 75},
    {"epoch": 0.01, "grad_norm": 0.9292634725570679, "learning_rate": 4.999782041921651e-05, "loss": 0.9498, "step": 80},
    {"epoch": 0.01, "grad_norm": 2.1775777339935303, "learning_rate": 4.9997535796695134e-05, "loss": 0.9346, "step": 85},
    {"epoch": 0.01, "grad_norm": 1.6854296922683716, "learning_rate": 4.999723371385099e-05, "loss": 1.119, "step": 90},
    {"epoch": 0.02, "grad_norm": 1.4571490287780762, "learning_rate": 4.999691417089507e-05, "loss": 0.8671, "step": 95},
    {"epoch": 0.02, "grad_norm": 1.277044653892517, "learning_rate": 4.999657716805059e-05, "loss": 1.2469, "step": 100},
    {"epoch": 0.02, "eval_loss": 0.8478816747665405, "eval_runtime": 96.2736, "eval_samples_per_second": 7.24, "eval_steps_per_second": 7.24, "step": 100},
    {"epoch": 0.02, "grad_norm": 0.6687743067741394, "learning_rate": 4.9996222705552933e-05, "loss": 0.735, "step": 105},
    {"epoch": 0.02, "grad_norm": 1.3488354682922363, "learning_rate": 4.9995850783649665e-05, "loss": 0.8344, "step": 110},
    {"epoch": 0.02, "grad_norm": 1.1043323278427124, "learning_rate": 4.9995461402600593e-05, "loss": 0.8254, "step": 115},
    {"epoch": 0.02, "grad_norm": 0.9382895827293396, "learning_rate": 4.9995054562677684e-05, "loss": 0.9179, "step": 120},
    {"epoch": 0.02, "grad_norm": 1.2824612855911255, "learning_rate": 4.9994630264165107e-05, "loss": 0.8663, "step": 125},
    {"epoch": 0.02, "grad_norm": 1.0491925477981567, "learning_rate": 4.999418850735923e-05, "loss": 0.9247, "step": 130},
    {"epoch": 0.02, "grad_norm": 1.3642233610153198, "learning_rate": 4.99937292925686e-05, "loss": 0.8253, "step": 135},
    {"epoch": 0.02, "grad_norm": 3.747757911682129, "learning_rate": 4.9993252620113976e-05, "loss": 1.0245, "step": 140},
    {"epoch": 0.02, "grad_norm": 1.299494981765747, "learning_rate": 4.999275849032832e-05, "loss": 0.8723, "step": 145},
    {"epoch": 0.02, "grad_norm": 1.7195830345153809, "learning_rate": 4.999224690355675e-05, "loss": 1.0524, "step": 150},
    {"epoch": 0.02, "grad_norm": 0.9922987222671509, "learning_rate": 4.9991717860156616e-05, "loss": 0.9502, "step": 155},
    {"epoch": 0.03, "grad_norm": 1.0577458143234253, "learning_rate": 4.9991171360497437e-05, "loss": 1.0115, "step": 160},
    {"epoch": 0.03, "grad_norm": 1.0001195669174194, "learning_rate": 4.999060740496093e-05, "loss": 1.1999, "step": 165},
    {"epoch": 0.03, "grad_norm": 1.2456804513931274, "learning_rate": 4.999002599394102e-05, "loss": 0.8882, "step": 170},
    {"epoch": 0.03, "grad_norm": 1.0445325374603271, "learning_rate": 4.9989427127843814e-05, "loss": 1.0615, "step": 175},
    {"epoch": 0.03, "grad_norm": 1.2410887479782104, "learning_rate": 4.9988810807087584e-05, "loss": 1.1068, "step": 180},
    {"epoch": 0.03, "grad_norm": 0.8935971260070801, "learning_rate": 4.998817703210285e-05, "loss": 0.6683, "step": 185},
    {"epoch": 0.03, "grad_norm": 1.1614488363265991, "learning_rate": 4.9987525803332265e-05, "loss": 0.7446, "step": 190},
    {"epoch": 0.03, "grad_norm": 0.9392004013061523, "learning_rate": 4.998685712123072e-05, "loss": 0.7397, "step": 195},
    {"epoch": 0.03, "grad_norm": 1.0314444303512573, "learning_rate": 4.9986170986265266e-05, "loss": 1.3584, "step": 200},
    {"epoch": 0.03, "eval_loss": 0.8368077278137207, "eval_runtime": 96.5262, "eval_samples_per_second": 7.221, "eval_steps_per_second": 7.221, "step": 200},
    {"epoch": 0.03, "grad_norm": 0.8964811563491821, "learning_rate": 4.998546739891516e-05, "loss": 0.9546, "step": 205},
    {"epoch": 0.03, "grad_norm": 1.0679796934127808, "learning_rate": 4.998474635967185e-05, "loss": 0.864, "step": 210},
    {"epoch": 0.03, "grad_norm": 1.2340985536575317, "learning_rate": 4.998400786903896e-05, "loss": 0.885, "step": 215},
    {"epoch": 0.04, "grad_norm": 1.7219617366790771, "learning_rate": 4.9983251927532315e-05, "loss": 1.1069, "step": 220},
    {"epoch": 0.04, "grad_norm": 1.1480705738067627, "learning_rate": 4.9982478535679924e-05, "loss": 1.0416, "step": 225},
    {"epoch": 0.04, "grad_norm": 1.515589714050293, "learning_rate": 4.9981687694021996e-05, "loss": 1.1844, "step": 230},
    {"epoch": 0.04, "grad_norm": 1.6687963008880615, "learning_rate": 4.998087940311091e-05, "loss": 0.8664, "step": 235},
    {"epoch": 0.04, "grad_norm": 1.9256645441055298, "learning_rate": 4.998005366351125e-05, "loss": 1.0125, "step": 240},
    {"epoch": 0.04, "grad_norm": 1.2500052452087402, "learning_rate": 4.997921047579978e-05, "loss": 1.1374, "step": 245},
    {"epoch": 0.04, "grad_norm": 1.0543216466903687, "learning_rate": 4.9978349840565434e-05, "loss": 0.8502, "step": 250},
    {"epoch": 0.04, "grad_norm": 1.3009012937545776, "learning_rate": 4.997747175840937e-05, "loss": 1.0357, "step": 255},
    {"epoch": 0.04, "grad_norm": 0.8456661105155945, "learning_rate": 4.997657622994491e-05, "loss": 0.6883, "step": 260},
    {"epoch": 0.04, "grad_norm": 0.5856515765190125, "learning_rate": 4.9975663255797555e-05, "loss": 0.7656, "step": 265},
    {"epoch": 0.04, "grad_norm": 0.973818302154541, "learning_rate": 4.997473283660501e-05, "loss": 0.823, "step": 270},
    {"epoch": 0.04, "grad_norm": 0.9960187673568726, "learning_rate": 4.997378497301715e-05, "loss": 0.8726, "step": 275},
    {"epoch": 0.04, "grad_norm": 1.2900679111480713, "learning_rate": 4.997281966569604e-05, "loss": 0.9781, "step": 280},
    {"epoch": 0.05, "grad_norm": 1.828894853591919, "learning_rate": 4.9971836915315926e-05, "loss": 0.8932, "step": 285},
    {"epoch": 0.05, "grad_norm": 1.239621877670288, "learning_rate": 4.9970836722563256e-05, "loss": 1.2022, "step": 290},
    {"epoch": 0.05, "grad_norm": 1.0117149353027344, "learning_rate": 4.996981908813664e-05, "loss": 0.8032, "step": 295},
    {"epoch": 0.05, "grad_norm": 0.8861119747161865, "learning_rate": 4.996878401274687e-05, "loss": 1.0651, "step": 300},
    {"epoch": 0.05, "eval_loss": 0.8281473517417908, "eval_runtime": 96.5283, "eval_samples_per_second": 7.221, "eval_steps_per_second": 7.221, "step": 300},
    {"epoch": 0.05, "grad_norm": 0.8583046197891235, "learning_rate": 4.996773149711693e-05, "loss": 0.8784, "step": 305},
    {"epoch": 0.05, "grad_norm": 2.5717499256134033, "learning_rate": 4.9966661541981984e-05, "loss": 0.8395, "step": 310},
    {"epoch": 0.05, "grad_norm": 0.982342004776001, "learning_rate": 4.9965574148089376e-05, "loss": 0.9869, "step": 315},
    {"epoch": 0.05, "grad_norm": 0.9000777006149292, "learning_rate": 4.9964469316198633e-05, "loss": 0.8435, "step": 320},
    {"epoch": 0.05, "grad_norm": 0.8733209371566772, "learning_rate": 4.9963347047081464e-05, "loss": 0.7281, "step": 325},
    {"epoch": 0.05, "grad_norm": 3.323739767074585, "learning_rate": 4.9962207341521746e-05, "loss": 1.1013, "step": 330},
    {"epoch": 0.05, "grad_norm": 1.7102876901626587, "learning_rate": 4.996105020031554e-05, "loss": 0.8276, "step": 335},
    {"epoch": 0.05, "grad_norm": 0.9196123480796814, "learning_rate": 4.995987562427109e-05, "loss": 0.8274, "step": 340},
    {"epoch": 0.06, "grad_norm": 1.210099458694458, "learning_rate": 4.995868361420883e-05, "loss": 1.3257, "step": 345},
    {"epoch": 0.06, "grad_norm": 0.8923581838607788, "learning_rate": 4.9957474170961335e-05, "loss": 0.6815, "step": 350},
    {"epoch": 0.06, "grad_norm": 0.9576735496520996, "learning_rate": 4.9956247295373396e-05, "loss": 1.23, "step": 355},
    {"epoch": 0.06, "grad_norm": 1.3774089813232422, "learning_rate": 4.995500298830196e-05, "loss": 1.0556, "step": 360},
    {"epoch": 0.06, "grad_norm": 1.1523677110671997, "learning_rate": 4.995374125061614e-05, "loss": 1.1787, "step": 365},
    {"epoch": 0.06, "grad_norm": 0.8310608863830566, "learning_rate": 4.9952462083197246e-05, "loss": 0.8525, "step": 370},
    {"epoch": 0.06, "grad_norm": 0.9814196825027466, "learning_rate": 4.9951165486938765e-05, "loss": 0.8522, "step": 375},
    {"epoch": 0.06, "grad_norm": 0.9878122210502625, "learning_rate": 4.994985146274633e-05, "loss": 0.6618, "step": 380},
    {"epoch": 0.06, "grad_norm": 1.2652586698532104, "learning_rate": 4.994852001153777e-05, "loss": 1.0489, "step": 385},
    {"epoch": 0.06, "grad_norm": 1.2940975427627563, "learning_rate": 4.994717113424307e-05, "loss": 1.104, "step": 390},
    {"epoch": 0.06, "grad_norm": 0.9636249542236328, "learning_rate": 4.99458048318044e-05, "loss": 0.9228, "step": 395},
    {"epoch": 0.06, "grad_norm": 0.8122813105583191, "learning_rate": 4.994442110517611e-05, "loss": 0.9209, "step": 400},
    {"epoch": 0.06, "eval_loss": 0.8184689879417419, "eval_runtime": 96.4572, "eval_samples_per_second": 7.226, "eval_steps_per_second": 7.226, "step": 400},
    {"epoch": 0.06, "grad_norm": 0.8742052912712097, "learning_rate": 4.99430199553247e-05, "loss": 0.9608, "step": 405},
    {"epoch": 0.07, "grad_norm": 0.5679522752761841, "learning_rate": 4.9941601383228835e-05, "loss": 0.5963, "step": 410},
    {"epoch": 0.07, "grad_norm": 1.0234627723693848, "learning_rate": 4.994016538987938e-05, "loss": 0.8642, "step": 415},
    {"epoch": 0.07, "grad_norm": 0.8581897616386414, "learning_rate": 4.993871197627934e-05, "loss": 0.8993, "step": 420},
    {"epoch": 0.07, "grad_norm": 1.4666485786437988, "learning_rate": 4.9937241143443904e-05, "loss": 0.8565, "step": 425},
    {"epoch": 0.07, "grad_norm": 1.1166578531265259, "learning_rate": 4.993575289240041e-05, "loss": 0.881, "step": 430},
    {"epoch": 0.07, "grad_norm": 1.303992748260498, "learning_rate": 4.9934247224188393e-05, "loss": 0.9962, "step": 435},
    {"epoch": 0.07, "grad_norm": 0.9011989235877991, "learning_rate": 4.993272413985952e-05, "loss": 0.9316, "step": 440},
    {"epoch": 0.07, "grad_norm": 0.8321458101272583, "learning_rate": 4.993118364047764e-05, "loss": 0.7889, "step": 445},
    {"epoch": 0.07, "grad_norm": 0.7780352234840393, "learning_rate": 4.992962572711877e-05, "loss": 0.8287, "step": 450},
    {"epoch": 0.07, "grad_norm": 0.9090210199356079, "learning_rate": 4.992805040087108e-05, "loss": 0.7018, "step": 455},
    {"epoch": 0.07, "grad_norm": 0.8694137334823608, "learning_rate": 4.9926457662834906e-05, "loss": 0.8484, "step": 460},
    {"epoch": 0.07, "grad_norm": 0.6327371001243591, "learning_rate": 4.992484751412274e-05, "loss": 0.716, "step": 465},
    {"epoch": 0.08, "grad_norm": 1.200668215751648, "learning_rate": 4.9923219955859254e-05, "loss": 0.9525, "step": 470},
    {"epoch": 0.08, "grad_norm": 0.8530198931694031, "learning_rate": 4.9921574989181266e-05, "loss": 0.744, "step": 475},
    {"epoch": 0.08, "grad_norm": 1.168479323387146, "learning_rate": 4.991991261523775e-05, "loss": 0.729, "step": 480},
    {"epoch": 0.08, "grad_norm": 0.9499714970588684, "learning_rate": 4.9918232835189834e-05, "loss": 0.7725, "step": 485},
    {"epoch": 0.08, "grad_norm": 0.8434467911720276, "learning_rate": 4.991653565021084e-05, "loss": 1.1558, "step": 490},
    {"epoch": 0.08, "grad_norm": 0.7665804624557495, "learning_rate": 4.99148210614862e-05, "loss": 1.0208, "step": 495},
    {"epoch": 0.08, "grad_norm": 0.5782546401023865, "learning_rate": 4.991308907021353e-05, "loss": 0.8306, "step": 500},
    {"epoch": 0.08, "eval_loss": 0.8132078051567078, "eval_runtime": 96.433, "eval_samples_per_second": 7.228, "eval_steps_per_second": 7.228, "step": 500},
    {"epoch": 0.08, "grad_norm": 1.0821778774261475, "learning_rate": 4.9911339677602584e-05, "loss": 0.9503, "step": 505},
    {"epoch": 0.08, "grad_norm": 0.5409029126167297, "learning_rate": 4.99095728848753e-05, "loss": 0.8586, "step": 510},
    {"epoch": 0.08, "grad_norm": 0.9011789560317993, "learning_rate": 4.990778869326575e-05, "loss": 0.7981, "step": 515},
    {"epoch": 0.08, "grad_norm": 1.0092263221740723, "learning_rate": 4.990598710402013e-05, "loss": 1.0174, "step": 520},
    {"epoch": 0.08, "grad_norm": 1.4362307786941528, "learning_rate": 4.9904168118396844e-05, "loss": 0.8373, "step": 525},
    {"epoch": 0.08, "grad_norm": 2.1772639751434326, "learning_rate": 4.9902331737666414e-05, "loss": 0.9599, "step": 530},
    {"epoch": 0.09, "grad_norm": 0.9610542058944702, "learning_rate": 4.990047796311151e-05, "loss": 0.6895, "step": 535},
    {"epoch": 0.09, "grad_norm": 0.9922348260879517, "learning_rate": 4.989860679602698e-05, "loss": 0.7315, "step": 540},
    {"epoch": 0.09, "grad_norm": 1.2409151792526245, "learning_rate": 4.9896718237719785e-05, "loss": 0.8574, "step": 545},
    {"epoch": 0.09, "grad_norm": 1.016333818435669, "learning_rate": 4.9894812289509046e-05, "loss": 1.1248, "step": 550},
    {"epoch": 0.09, "grad_norm": 0.9131489396095276, "learning_rate": 4.989288895272604e-05, "loss": 0.9847, "step": 555},
    {"epoch": 0.09, "grad_norm": 1.215469479560852, "learning_rate": 4.989094822871419e-05, "loss": 0.912, "step": 560},
    {"epoch": 0.09, "grad_norm": 1.0536105632781982, "learning_rate": 4.988899011882903e-05, "loss": 0.8425, "step": 565},
    {"epoch": 0.09, "grad_norm": 1.9705311059951782, "learning_rate": 4.988701462443829e-05, "loss": 0.9385, "step": 570},
    {"epoch": 0.09, "grad_norm": 1.2488442659378052, "learning_rate": 4.98850217469218e-05, "loss": 0.7865, "step": 575},
    {"epoch": 0.09, "grad_norm": 1.7318600416183472, "learning_rate": 4.988301148767157e-05, "loss": 0.8231, "step": 580},
    {"epoch": 0.09, "grad_norm": 0.8247858881950378, "learning_rate": 4.9880983848091704e-05, "loss": 0.8553, "step": 585},
    {"epoch": 0.09, "grad_norm": 0.858172595500946, "learning_rate": 4.987893882959849e-05, "loss": 1.3952, "step": 590},
    {"epoch": 0.09, "grad_norm": 1.2286418676376343, "learning_rate": 4.987687643362033e-05, "loss": 0.837, "step": 595},
    {"epoch": 0.1, "grad_norm": 1.034350872039795, "learning_rate": 4.9874796661597765e-05, "loss": 0.9175, "step": 600},
    {"epoch": 0.1, "eval_loss": 0.8063747882843018, "eval_runtime": 96.4224, "eval_samples_per_second": 7.229, "eval_steps_per_second": 7.229, "step": 600},
    {"epoch": 0.1, "grad_norm": 0.7192366123199463, "learning_rate": 4.987269951498348e-05, "loss": 0.8563, "step": 605},
    {"epoch": 0.1, "grad_norm": 1.2645854949951172, "learning_rate": 4.98705849952423e-05, "loss": 0.6663, "step": 610},
    {"epoch": 0.1, "grad_norm": 1.0610381364822388, "learning_rate": 4.9868453103851176e-05, "loss": 0.8452, "step": 615},
    {"epoch": 0.1, "grad_norm": 0.8550002574920654, "learning_rate": 4.986630384229919e-05, "loss": 0.8894, "step": 620},
    {"epoch": 0.1, "grad_norm": 0.7490519285202026, "learning_rate": 4.986413721208757e-05, "loss": 0.9106, "step": 625},
    {"epoch": 0.1, "grad_norm": 0.557860255241394, "learning_rate": 4.986195321472965e-05, "loss": 0.685, "step": 630},
    {"epoch": 0.1, "grad_norm": 0.7450752258300781, "learning_rate": 4.9859751851750934e-05, "loss": 0.8472, "step": 635},
    {"epoch": 0.1, "grad_norm": 1.176376461982727, "learning_rate": 4.985753312468903e-05, "loss": 1.0197, "step": 640},
    {"epoch": 0.1, "grad_norm": 1.0625300407409668, "learning_rate": 4.985529703509367e-05, "loss": 0.9685, "step": 645},
    {"epoch": 0.1, "grad_norm": 0.8808372616767883, "learning_rate": 4.985304358452672e-05, "loss": 0.8612, "step": 650},
    {"epoch": 0.1, "grad_norm": 0.8110201954841614, "learning_rate": 4.985077277456218e-05, "loss": 0.8401, "step": 655},
    {"epoch": 0.11, "grad_norm": 0.9364888072013855, "learning_rate": 4.984848460678618e-05, "loss": 0.6197, "step": 660},
    {"epoch": 0.11, "grad_norm": 1.0113518238067627, "learning_rate": 4.984617908279694e-05, "loss": 0.9889, "step": 665},
    {"epoch": 0.11, "grad_norm": 1.1148868799209595, "learning_rate": 4.984385620420485e-05, "loss": 0.9558, "step": 670},
    {"epoch": 0.11, "grad_norm": 0.9506175518035889, "learning_rate": 4.984151597263238e-05, "loss": 0.7323, "step": 675},
    {"epoch": 0.11, "grad_norm": 1.0044193267822266, "learning_rate": 4.983915838971415e-05, "loss": 0.7504, "step": 680},
    {"epoch": 0.11, "grad_norm": 2.2674214839935303, "learning_rate": 4.9836783457096875e-05, "loss": 1.032, "step": 685},
    {"epoch": 0.11, "grad_norm": 1.4945333003997803, "learning_rate": 4.983439117643942e-05, "loss": 1.0359, "step": 690},
    {"epoch": 0.11, "grad_norm": 0.9860715866088867, "learning_rate": 4.9831981549412744e-05, "loss": 1.1152, "step": 695},
    {"epoch": 0.11, "grad_norm": 0.8287227153778076, "learning_rate": 4.982955457769992e-05, "loss": 0.8157, "step": 700},
    {"epoch": 0.11, "eval_loss": 0.8022791743278503, "eval_runtime": 96.5324, "eval_samples_per_second": 7.22, "eval_steps_per_second": 7.22, "step": 700},
    {"epoch": 0.11, "grad_norm": 0.9216273427009583, "learning_rate": 4.9827110262996144e-05, "loss": 0.8395, "step": 705},
    {"epoch": 0.11, "grad_norm": 0.7642357349395752, "learning_rate": 4.982464860700874e-05, "loss": 0.8817, "step": 710},
    {"epoch": 0.11, "grad_norm": 0.8851175308227539, "learning_rate": 4.982216961145711e-05, "loss": 0.8558, "step": 715},
    {"epoch": 0.11, "grad_norm": 0.44226109981536865, "learning_rate": 4.98196732780728e-05, "loss": 0.882, "step": 720},
    {"epoch": 0.12, "grad_norm": 0.8005027174949646, "learning_rate": 4.981715960859945e-05, "loss": 0.8835, "step": 725},
    {"epoch": 0.12, "grad_norm": 0.7451304793357849, "learning_rate": 4.981462860479281e-05, "loss": 0.8551, "step": 730},
    {"epoch": 0.12, "grad_norm": 1.1069347858428955, "learning_rate": 4.9812080268420745e-05, "loss": 0.999, "step": 735},
    {"epoch": 0.12, "grad_norm": 0.8892244100570679, "learning_rate": 4.980951460126322e-05, "loss": 1.012, "step": 740},
    {"epoch": 0.12, "grad_norm": 0.8935977816581726, "learning_rate": 4.9806931605112305e-05, "loss": 0.9911, "step": 745},
    {"epoch": 0.12, "grad_norm": 0.8456961512565613, "learning_rate": 4.9804331281772176e-05, "loss": 0.7595, "step": 750},
    {"epoch": 0.12, "grad_norm": 0.78443443775177, "learning_rate": 4.980171363305911e-05, "loss": 0.8308, "step": 755},
    {"epoch": 0.12, "grad_norm": 1.0028038024902344, "learning_rate": 4.979907866080149e-05, "loss": 0.9637, "step": 760},
    {"epoch": 0.12, "grad_norm": 1.1801577806472778, "learning_rate": 4.9796426366839786e-05, "loss": 0.6159, "step": 765},
    {"epoch": 0.12, "grad_norm": 0.8370681405067444, "learning_rate": 4.979375675302659e-05, "loss": 0.9276, "step": 770},
    {"epoch": 0.12, "grad_norm": 0.8605382442474365, "learning_rate": 4.979106982122658e-05, "loss": 1.1077, "step": 775},
    {"epoch": 0.12, "grad_norm": 0.7788259387016296, "learning_rate": 4.978836557331652e-05, "loss": 0.8172, "step": 780},
    {"epoch": 0.13, "grad_norm": 1.4312686920166016, "learning_rate": 4.978564401118528e-05, "loss": 0.8759, "step": 785},
    {"epoch": 0.13, "grad_norm": 0.9109662175178528, "learning_rate": 4.978290513673381e-05, "loss": 0.947, "step": 790},
    {"epoch": 0.13, "grad_norm": 1.1819065809249878, "learning_rate": 4.9780148951875195e-05, "loss": 0.7364, "step": 795},
    {"epoch": 0.13, "grad_norm": 0.9400575160980225, "learning_rate": 4.977737545853455e-05, "loss": 0.9469, "step": 800},
    {"epoch": 0.13, "eval_loss": 0.7995806932449341, "eval_runtime": 96.5877, "eval_samples_per_second": 7.216, "eval_steps_per_second": 7.216, "step": 800},
    {"epoch": 0.13, "grad_norm": 1.693812370300293, "learning_rate": 4.9774584658649126e-05, "loss": 0.9433, "step": 805},
    {"epoch": 0.13, "grad_norm": 1.0892895460128784, "learning_rate": 4.9771776554168234e-05, "loss": 0.7027, "step": 810},
    {"epoch": 0.13, "grad_norm": 0.9118362665176392, "learning_rate": 4.976895114705329e-05, "loss": 0.9468, "step": 815},
    {"epoch": 0.13, "grad_norm": 0.8032681345939636, "learning_rate": 4.976610843927779e-05, "loss": 0.7927, "step": 820},
    {"epoch": 0.13, "grad_norm": 1.168225646018982, "learning_rate": 4.976324843282732e-05, "loss": 0.9673, "step": 825},
    {"epoch": 0.13, "grad_norm": 1.077602744102478, "learning_rate": 4.976037112969953e-05, "loss": 0.9156, "step": 830},
    {"epoch": 0.13, "grad_norm": 0.8643108606338501, "learning_rate": 4.9757476531904165e-05, "loss": 0.6999, "step": 835},
    {"epoch": 0.13, "grad_norm": 0.933397650718689, "learning_rate": 4.975456464146306e-05, "loss": 0.8828, "step": 840},
    {"epoch": 0.13, "grad_norm": 0.7036295533180237, "learning_rate": 4.975163546041011e-05, "loss": 0.8709, "step": 845},
    {"epoch": 0.14, "grad_norm": 0.5974694490432739, "learning_rate": 4.974868899079128e-05, "loss": 0.7594, "step": 850},
    {"epoch": 0.14, "grad_norm": 0.7244943380355835, "learning_rate": 4.974572523466465e-05, "loss": 0.8714, "step": 855},
    {"epoch": 0.14, "grad_norm": 0.5783522725105286, "learning_rate": 4.9742744194100345e-05, "loss": 0.8941, "step": 860},
    {"epoch": 0.14, "grad_norm": 0.7480617761611938, "learning_rate": 4.973974587118055e-05, "loss": 0.9798, "step": 865},
    {"epoch": 0.14, "grad_norm": 0.7548874020576477, "learning_rate": 4.973673026799956e-05, "loss": 0.7767, "step": 870},
    {"epoch": 0.14, "grad_norm": 0.7075071930885315, "learning_rate": 4.97336973866637e-05, "loss": 0.7779, "step": 875},
    {"epoch": 0.14, "grad_norm": 0.7042987942695618, "learning_rate": 4.97306472292914e-05, "loss": 0.8249, "step": 880},
    {"epoch": 0.14, "grad_norm": 1.0242459774017334, "learning_rate": 4.972757979801313e-05, "loss": 0.9223, "step": 885},
    {"epoch": 0.14, "grad_norm": 0.6138095259666443, "learning_rate": 4.9724495094971436e-05, "loss": 0.9842, "step": 890},
    {"epoch": 0.14, "grad_norm": 0.7905042767524719, "learning_rate": 4.9721393122320925e-05, "loss": 0.8738, "step": 895},
    {"epoch": 0.14, "grad_norm": 0.9658048748970032, "learning_rate": 4.9718273882228265e-05, "loss": 0.8872, "step": 900},
    {"epoch": 0.14, "eval_loss": 0.7954564690589905, "eval_runtime": 96.643, "eval_samples_per_second": 7.212, "eval_steps_per_second": 7.212, "step": 900},
    {"epoch": 0.14, "grad_norm": 0.8425014019012451, "learning_rate": 4.97151373768722e-05, "loss": 0.778, "step": 905},
    {"epoch": 0.15, "grad_norm": 0.5527231693267822, "learning_rate": 4.971198360844351e-05, "loss": 0.8332, "step": 910},
    {"epoch": 0.15, "grad_norm": 0.7870334386825562, "learning_rate": 4.9708812579145056e-05, "loss": 0.9265, "step": 915},
    {"epoch": 0.15, "grad_norm": 0.9935321807861328, "learning_rate": 4.970562429119173e-05, "loss": 0.7243, "step": 920},
    {"epoch": 0.15, "grad_norm": 0.9546892046928406, "learning_rate": 4.970241874681051e-05, "loss": 0.9908, "step": 925},
    {"epoch": 0.15, "grad_norm": 0.7340118885040283, "learning_rate": 4.969919594824039e-05, "loss": 0.7932, "step": 930},
    {"epoch": 0.15, "grad_norm": 5.1686015129089355, "learning_rate": 4.9695955897732453e-05, "loss": 0.9842, "step": 935},
    {"epoch": 0.15, "grad_norm": 0.9721456170082092, "learning_rate": 4.9692698597549815e-05, "loss": 0.9271, "step": 940},
    {"epoch": 0.15, "grad_norm": 0.6477334499359131, "learning_rate": 4.9689424049967623e-05, "loss": 0.934, "step": 945},
    {"epoch": 0.15, "grad_norm": 1.0759055614471436, "learning_rate": 4.968613225727311e-05, "loss": 1.0465, "step": 950},
    {"epoch": 0.15, "grad_norm": 0.7222158908843994, "learning_rate": 4.968282322176552e-05, "loss": 0.7732, "step": 955},
    {"epoch": 0.15, "grad_norm": 0.8591343760490417, "learning_rate": 4.9679496945756155e-05, "loss": 0.9062, "step": 960},
    {"epoch": 0.15, "grad_norm": 1.8495111465454102, "learning_rate": 4.967615343156837e-05, "loss": 0.8861, "step": 965},
    {"epoch": 0.15, "grad_norm": 0.6847331523895264, "learning_rate": 4.967279268153753e-05, "loss": 0.8001, "step": 970},
    {"epoch": 0.16, "grad_norm": 0.690113365650177, "learning_rate": 4.9669414698011074e-05, "loss": 0.7378, "step": 975},
    {"epoch": 0.16, "grad_norm": 0.8349626064300537, "learning_rate": 4.9666019483348456e-05, "loss": 0.7193, "step": 980},
    {"epoch": 0.16, "grad_norm": 0.6444108486175537, "learning_rate": 4.966260703992116e-05, "loss": 0.8729, "step": 985},
    {"epoch": 0.16, "grad_norm": 0.9515655040740967, "learning_rate": 4.965917737011274e-05, "loss": 0.7532, "step": 990},
    {"epoch": 0.16, "grad_norm": 0.8138986229896545, "learning_rate": 4.965573047631873e-05, "loss": 1.0124, "step": 995},
    {"epoch": 0.16, "grad_norm": 1.0182080268859863, "learning_rate": 4.9652266360946745e-05, "loss": 0.8842, "step": 1000},
    {"epoch": 0.16, "eval_loss": 0.7912728190422058, "eval_runtime": 96.5004, "eval_samples_per_second": 7.223, "eval_steps_per_second": 7.223, "step": 1000},
    {"epoch": 0.16, "grad_norm": 0.9665297269821167, "learning_rate": 4.96487850264164e-05, "loss": 1.0155, "step": 1005},
    {"epoch": 0.16, "grad_norm": 1.1356585025787354, "learning_rate": 4.964528647515933e-05, "loss": 0.8705, "step": 1010},
    {"epoch": 0.16, "grad_norm": 0.5548833608627319, "learning_rate": 4.9641770709619234e-05, "loss": 0.9634, "step": 1015},
    {"epoch": 0.16, "grad_norm": 0.8028444647789001, "learning_rate": 4.9638237732251794e-05, "loss": 0.8722, "step": 1020},
    {"epoch": 0.16, "grad_norm": 0.934234082698822, "learning_rate": 4.9634687545524724e-05, "loss": 0.9731, "step": 1025},
    {"epoch": 0.16, "grad_norm": 0.7293463349342346, "learning_rate": 4.963112015191778e-05, "loss": 1.0237, "step": 1030},
    {"epoch": 0.17, "grad_norm": 0.6442769169807434, "learning_rate": 4.962753555392271e-05, "loss": 1.1331, "step": 1035},
    {"epoch": 0.17, "grad_norm": 0.7877534031867981, "learning_rate": 4.962393375404331e-05, "loss": 1.0737, "step": 1040},
    {"epoch": 0.17, "grad_norm": 0.5739997625350952, "learning_rate": 4.9620314754795343e-05, "loss": 0.8836, "step": 1045},
    {"epoch": 0.17, "grad_norm": 0.7318402528762817, "learning_rate": 4.9616678558706634e-05, "loss": 0.9981, "step": 1050},
    {"epoch": 0.17, "grad_norm": 0.5463365316390991, "learning_rate": 4.961302516831699e-05, "loss": 0.7336, "step": 1055},
    {"epoch": 0.17, "grad_norm": 0.7839176654815674, "learning_rate": 4.960935458617824e-05, "loss": 1.025, "step": 1060},
    {"epoch": 0.17, "grad_norm": 0.7076404690742493, "learning_rate": 4.9605666814854225e-05, "loss": 0.833, "step": 1065},
    {"epoch": 0.17, "grad_norm": 0.732940673828125, "learning_rate": 4.960196185692077e-05, "loss": 0.5103, "step": 1070},
    {"epoch": 0.17, "grad_norm": 0.7256388068199158, "learning_rate": 4.959823971496574e-05, "loss": 0.8617, "step": 1075},
    {"epoch": 0.17, "grad_norm": 1.1714242696762085, "learning_rate": 4.959450039158898e-05, "loss": 1.0345, "step": 1080},
    {"epoch": 0.17, "grad_norm": 0.5849193930625916, "learning_rate": 4.9590743889402325e-05, "loss": 0.729, "step": 1085},
    {"epoch": 0.17, "grad_norm": 0.6283109784126282, "learning_rate": 4.958697021102963e-05, "loss": 0.8527, "step": 1090},
    {"epoch": 0.17, "grad_norm": 0.6387770175933838, "learning_rate": 4.9583179359106746e-05, "loss": 0.7411, "step": 1095},
    {"epoch": 0.18, "grad_norm": 0.5853758454322815, "learning_rate": 4.957937133628151e-05, "loss": 0.7909, "step": 1100},
    {"epoch": 0.18, "eval_loss": 0.7863278985023499, "eval_runtime": 96.3784, "eval_samples_per_second": 7.232, "eval_steps_per_second": 7.232, "step": 1100},
    {"epoch": 0.18, "grad_norm": 0.9301708936691284, "learning_rate": 4.9575546145213755e-05, "loss": 0.7149, "step": 1105},
    {"epoch": 0.18, "grad_norm": 1.125088095664978, "learning_rate": 4.9571703788575314e-05, "loss": 0.8034, "step": 1110},
    {"epoch": 0.18, "grad_norm": 1.0697988271713257, "learning_rate": 4.956784426905e-05, "loss": 0.8874, "step": 1115},
    {"epoch": 0.18, "grad_norm": 0.7094873189926147, "learning_rate": 4.956396758933361e-05, "loss": 0.6612, "step": 1120},
    {"epoch": 0.18, "grad_norm": 0.8048680424690247, "learning_rate": 4.956007375213393e-05, "loss": 0.9558, "step": 1125},
    {"epoch": 0.18, "grad_norm": 0.8820949196815491, "learning_rate": 4.9556162760170756e-05, "loss": 0.9442, "step": 1130},
    {"epoch": 0.18, "grad_norm": 0.7214958071708679, "learning_rate": 4.955223461617583e-05, "loss": 0.8392, "step": 1135},
    {"epoch": 0.18, "grad_norm": 0.8364250063896179, "learning_rate": 4.954828932289288e-05, "loss": 0.9834, "step": 1140},
    {"epoch": 0.18, "grad_norm": 0.8735854625701904, "learning_rate": 4.954432688307764e-05, "loss": 0.8817, "step": 1145},
    {"epoch": 0.18, "grad_norm": 0.810013473033905, "learning_rate": 4.9540347299497805e-05, "loss": 0.7723, "step": 1150},
    {"epoch": 0.18, "grad_norm": 0.8791002035140991, "learning_rate": 4.953635057493302e-05, "loss": 0.706, "step": 1155},
    {"epoch": 0.19, "grad_norm": 0.7556783556938171, "learning_rate": 4.953233671217493e-05, "loss": 0.8145, "step": 1160},
    {"epoch": 0.19, "grad_norm": 1.3251086473464966, "learning_rate": 4.952830571402716e-05, "loss": 0.8413, "step": 1165},
    {"epoch": 0.19, "grad_norm": 0.8531173467636108, "learning_rate": 4.952425758330527e-05, "loss": 0.8236, "step": 1170},
    {"epoch": 0.19, "grad_norm": 1.0738744735717773, "learning_rate": 4.952019232283681e-05, "loss": 0.8357, "step": 1175},
    {"epoch": 0.19, "grad_norm": 0.7908213138580322, "learning_rate": 4.9516109935461306e-05, "loss": 0.6165, "step": 1180},
    {"epoch": 0.19, "grad_norm": 0.9802565574645996, "learning_rate": 4.951201042403021e-05, "loss": 0.7203, "step": 1185},
    {"epoch": 0.19, "grad_norm": 0.7866708636283875, "learning_rate": 4.9507893791406974e-05, "loss": 0.8479, "step": 1190},
    {"epoch": 0.19, "grad_norm": 0.6721138954162598, "learning_rate": 4.950376004046698e-05, "loss": 0.8871, "step": 1195},
    {"epoch": 0.19, "grad_norm": 1.1981366872787476, "learning_rate": 4.9499609174097574e-05, "loss": 0.8196, "step": 1200},
    {"epoch": 0.19, "eval_loss": 0.7843652367591858, "eval_runtime": 96.5411, "eval_samples_per_second": 7.22, "eval_steps_per_second": 7.22, "step": 1200},
    {"epoch": 0.19, "grad_norm": 0.7013841867446899, "learning_rate": 4.9495441195198064e-05, "loss": 1.0009, "step": 1205},
    {"epoch": 0.19, "grad_norm": 0.8476290702819824, "learning_rate": 4.949125610667972e-05, "loss": 0.5127, "step": 1210},
    {"epoch": 0.19, "grad_norm": 0.7680797576904297, "learning_rate": 4.9487053911465735e-05, "loss": 0.7003, "step": 1215},
    {"epoch": 0.19, "grad_norm": 0.9771925806999207, "learning_rate": 4.948283461249127e-05, "loss": 1.1135, "step": 1220},
    {"epoch": 0.2, "grad_norm": 1.4247405529022217, "learning_rate": 4.947859821270342e-05, "loss": 0.8253, "step": 1225},
    {"epoch": 0.2, "grad_norm": 1.184887409210205, "learning_rate": 4.947434471506125e-05, "loss": 1.1208, "step": 1230},
    {"epoch": 0.2, "grad_norm": 0.7579745054244995, "learning_rate": 4.9470074122535745e-05, "loss": 1.1363, "step": 1235},
    {"epoch": 0.2, "grad_norm": 0.8529625535011292, "learning_rate": 4.9465786438109826e-05, "loss": 0.8699, "step": 1240},
    {"epoch": 0.2, "grad_norm": 1.810576319694519, "learning_rate": 4.9461481664778374e-05, "loss": 1.0166, "step": 1245},
    {"epoch": 0.2, "grad_norm": 0.8605110049247742, "learning_rate": 4.9457159805548187e-05, "loss": 0.9427, "step": 1250},
    {"epoch": 0.2, "grad_norm": 0.59971684217453, "learning_rate": 4.945282086343801e-05, "loss": 0.6536, "step": 1255},
    {"epoch": 0.2, "grad_norm": 1.0233818292617798, "learning_rate": 4.9448464841478506e-05, "loss": 0.9505, "step": 1260},
    {"epoch": 0.2, "grad_norm": 0.8945149779319763, "learning_rate": 4.9444091742712293e-05, "loss": 0.8416, "step": 1265},
    {"epoch": 0.2, "grad_norm": 0.702805757522583, "learning_rate": 4.9439701570193886e-05, "loss": 0.9419, "step": 1270},
    {"epoch": 0.2, "grad_norm": 0.7464181184768677, "learning_rate": 4.9435294326989745e-05, "loss": 0.7972, "step": 1275},
    {"epoch": 0.2, "grad_norm": 1.1765002012252808, "learning_rate": 4.943175624360097e-05, "loss": 0.9914, "step": 1280},
    {"epoch": 0.21, "grad_norm": 0.6549853682518005, "learning_rate": 4.9427318280928034e-05, "loss": 0.8924, "step": 1285},
    {"epoch": 0.21, "grad_norm": 0.5978650450706482, "learning_rate": 4.942286325621888e-05, "loss": 0.6224, "step": 1290},
    {"epoch": 0.21, "grad_norm": 0.7752617597579956, "learning_rate": 4.941839117258523e-05, "loss": 0.8666, "step": 1295},
    {"epoch": 0.21, "grad_norm": 0.6919072866439819, "learning_rate": 4.941390203315078e-05, "loss": 0.9341, "step": 1300},
    {"epoch": 0.21, "eval_loss": 0.7824844717979431, "eval_runtime": 96.8874, "eval_samples_per_second": 7.194, "eval_steps_per_second": 7.194, "step": 1300},
    {"epoch": 0.21, "grad_norm": 0.7222729325294495, "learning_rate": 4.94093958410511e-05, "loss": 0.9925, "step": 1305},
    {"epoch": 0.21, "grad_norm": 0.9575716853141785, "learning_rate": 4.9404872599433686e-05, "loss": 0.8623, "step": 1310},
    {"epoch": 0.21, "grad_norm": 0.7721400260925293, "learning_rate": 4.940033231145793e-05, "loss": 1.0061, "step": 1315},
    {"epoch": 0.21, "grad_norm": 0.7019990682601929, "learning_rate": 4.9395774980295165e-05, "loss": 0.8697, "step": 1320},
    {"epoch": 0.21, "grad_norm": 0.7828916907310486, "learning_rate": 4.939120060912858e-05, "loss": 1.0066, "step": 1325},
    {"epoch": 0.21, "grad_norm": 1.0238871574401855, "learning_rate": 4.93866092011533e-05, "loss": 1.0285, "step": 1330},
    {"epoch": 0.21, "grad_norm": 0.48669734597206116, "learning_rate": 4.938200075957634e-05, "loss": 0.7454, "step": 1335},
    {"epoch": 0.21, "grad_norm": 0.8834619522094727, "learning_rate": 4.93773752876166e-05, "loss": 0.9998, "step": 1340},
    {"epoch": 0.21, "grad_norm": 0.6462609767913818, "learning_rate": 4.9372732788504905e-05, "loss": 0.7278, "step": 1345},
    {"epoch": 0.22, "grad_norm": 0.7309257388114929, "learning_rate": 4.936807326548395e-05, "loss": 0.7301, "step": 1350},
    {"epoch": 0.22, "grad_norm": 0.8515027165412903, "learning_rate": 4.936339672180833e-05, "loss": 0.8307, "step": 1355},
    {"epoch": 0.22, "grad_norm": 0.913206934928894, "learning_rate": 4.935870316074451e-05, "loss": 0.9467, "step": 1360},
    {"epoch": 0.22, "grad_norm": 0.6705841422080994, "learning_rate": 4.935399258557088e-05, "loss": 0.7124, "step": 1365},
    {"epoch": 0.22, "grad_norm": 0.676695704460144, "learning_rate": 4.934926499957767e-05, "loss": 0.9318, "step": 1370},
    {"epoch": 0.22, "grad_norm": 1.0529104471206665, "learning_rate": 4.934452040606703e-05, "loss": 1.0307, "step": 1375},
    {"epoch": 0.22, "grad_norm": 0.7150225639343262, "learning_rate": 4.933975880835296e-05, "loss": 0.8718, "step": 1380},
    {"epoch": 0.22, "grad_norm": 0.7180047035217285, "learning_rate": 4.933498020976135e-05, "loss": 0.7515, "step": 1385},
    {"epoch": 0.22, "grad_norm": 1.0961759090423584, "learning_rate": 4.933018461362997e-05, "loss": 0.8797, "step": 1390},
    {"epoch": 0.22, "grad_norm": 0.830609142780304, "learning_rate": 4.9325372023308446e-05, "loss": 0.6927, "step": 1395},
    {"epoch": 0.22, "grad_norm": 0.5277318358421326, "learning_rate": 4.9320542442158305e-05, "loss": 0.8801, "step": 1400},
    {"epoch": 0.22, "eval_loss": 0.7787255644798279, "eval_runtime": 96.8812, "eval_samples_per_second": 7.194, "eval_steps_per_second": 7.194, "step": 1400},
    {"epoch": 0.22, "grad_norm": 1.3845161199569702, "learning_rate": 4.931569587355289e-05, "loss": 0.8782, "step": 1405},
    {"epoch": 0.23, "grad_norm": 0.8579941987991333, "learning_rate": 4.9310832320877476e-05, "loss": 0.713, "step": 1410},
    {"epoch": 0.23, "grad_norm": 0.2643532454967499, "learning_rate": 4.930595178752914e-05, "loss": 0.9781, "step": 1415},
    {"epoch": 0.23, "grad_norm": 0.4968445897102356, "learning_rate": 4.930105427691685e-05, "loss": 0.93, "step": 1420},
    {"epoch": 0.23, "grad_norm": 0.9254417419433594, "learning_rate": 4.929613979246144e-05, "loss": 0.6353, "step": 1425},
    {"epoch": 0.23, "grad_norm": 0.9814417958259583, "learning_rate": 4.9291208337595574e-05, "loss": 0.9672, "step": 1430},
    {"epoch": 0.23, "grad_norm": 0.7159338593482971, "learning_rate": 4.928625991576379e-05, "loss": 0.9482, "step": 1435},
    {"epoch": 0.23, "grad_norm": 0.623866617679596, "learning_rate": 4.9281294530422476e-05, "loss": 0.623, "step": 1440},
    {"epoch": 0.23, "grad_norm": 0.8750379681587219, "learning_rate": 4.927631218503985e-05, "loss": 0.772, "step": 1445},
    {"epoch": 0.23, "grad_norm": 0.5593128800392151, "learning_rate": 4.9271312883096e-05, "loss": 0.6579, "step": 1450},
    {"epoch": 0.23, "grad_norm": 0.6411569714546204, "learning_rate": 4.9266296628082834e-05, "loss": 0.9239, "step": 1455},
    {"epoch": 0.23, "grad_norm": 0.9317705631256104, "learning_rate": 4.9261263423504135e-05, "loss": 0.9315, "step": 1460},
    {"epoch": 0.23, "grad_norm": 0.8312699198722839, "learning_rate": 4.9256213272875486e-05, "loss": 0.7334, "step": 1465},
    {"epoch": 0.23, "grad_norm": 0.6170663833618164, "learning_rate": 4.925114617972433e-05, "loss": 0.8603, "step": 1470},
    {"epoch": 0.24, "grad_norm": 0.7176920771598816, "learning_rate": 4.924606214758995e-05, "loss": 0.8738, "step": 1475},
    {"epoch": 0.24, "grad_norm": 0.8957033157348633, "learning_rate": 4.924096118002343e-05, "loss": 0.8861, "step": 1480},
    {"epoch": 0.24, "grad_norm": 0.5490685701370239, "learning_rate": 4.923584328058772e-05, "loss": 0.712, "step": 1485},
    {"epoch": 0.24, "grad_norm": 0.7401763796806335, "learning_rate": 4.923070845285757e-05, "loss": 0.8118, "step": 1490},
    {"epoch": 0.24, "grad_norm": 0.7380841374397278, "learning_rate": 4.922555670041957e-05, "loss": 0.8476, "step": 1495},
    {"epoch": 0.24, "grad_norm": 1.0009427070617676, "learning_rate": 4.922038802687212e-05, "loss": 0.9109, "step": 1500},
    {"epoch": 0.24, "eval_loss": 0.777683675289154, "eval_runtime": 96.9147, "eval_samples_per_second": 7.192, "eval_steps_per_second": 7.192, "step": 1500},
    {"epoch": 0.24, "grad_norm": 0.7970065474510193, "learning_rate": 4.921520243582545e-05, "loss": 0.616, "step": 1505},
    {"epoch": 0.24, "grad_norm": 0.6530303955078125, "learning_rate": 4.92099999309016e-05, "loss": 0.9223, "step": 1510},
    {"epoch": 0.24, "grad_norm": 0.48044708371162415, "learning_rate": 4.9204780515734406e-05, "loss": 0.6762, "step": 1515},
    {"epoch": 0.24, "grad_norm": 0.7560244798660278, "learning_rate": 4.919954419396956e-05, "loss": 0.8726, "step": 1520},
    {"epoch": 0.24, "grad_norm": 0.8580659031867981, "learning_rate": 4.919429096926453e-05, "loss": 0.7654, "step": 1525},
    {"epoch": 0.24, "grad_norm": 1.1246473789215088, "learning_rate": 4.918902084528859e-05, "loss": 0.9123, "step": 1530},
    {"epoch": 0.25, "grad_norm": 1.0745307207107544, "learning_rate": 4.918373382572283e-05, "loss": 0.79, "step": 1535},
    {"epoch": 0.25, "grad_norm": 0.9591856598854065, "learning_rate": 4.917842991426014e-05, "loss": 1.1778, "step": 1540},
    {"epoch": 0.25, "grad_norm": 1.0233389139175415, "learning_rate": 4.91731091146052e-05, "loss": 0.8827, "step": 1545},
    {"epoch": 0.25, "grad_norm": 0.648965060710907, "learning_rate": 4.91677714304745e-05, "loss": 0.8634, "step": 1550},
    {"epoch": 0.25, "grad_norm": 0.6523327231407166, "learning_rate": 4.91624168655963e-05, "loss": 0.9916, "step": 1555},
    {"epoch": 0.25, "grad_norm": 0.8029198050498962, "learning_rate": 4.915704542371068e-05, "loss": 0.7867, "step": 1560},
    {"epoch": 0.25, "grad_norm": 0.6397082805633545, "learning_rate": 4.915165710856948e-05, "loss": 0.7738, "step": 1565},
    {"epoch": 0.25, "grad_norm": 0.5862845778465271, "learning_rate": 4.914625192393636e-05, "loss": 0.7026, "step": 1570},
    {"epoch": 0.25, "grad_norm": 0.5333505868911743, "learning_rate": 4.914082987358673e-05, "loss": 0.8623, "step": 1575},
    {"epoch": 0.25, "grad_norm": 0.5689602494239807, "learning_rate": 4.913539096130779e-05, "loss": 0.7619, "step": 1580},
    {"epoch": 0.25, "grad_norm": 0.7333836555480957, "learning_rate": 4.912993519089853e-05, "loss": 0.8116, "step": 1585},
    {"epoch": 0.25, "grad_norm": 0.7610496282577515, "learning_rate": 4.91244625661697e-05, "loss": 0.74, "step": 1590},
    {"epoch": 0.25, "grad_norm": 0.6331669092178345, "learning_rate": 4.9118973090943835e-05, "loss": 1.0445, "step": 1595},
    {"epoch": 0.26, "grad_norm": 0.7263479828834534, "learning_rate": 4.911346676905521e-05, "loss": 0.8964, "step": 1600},
    {"epoch": 0.26, "eval_loss": 0.7759388089179993, "eval_runtime": 96.8818, "eval_samples_per_second": 7.194, "eval_steps_per_second": 7.194, "step": 1600},
    {"epoch": 0.26, "grad_norm": 0.6523721814155579, "learning_rate": 4.910794360434993e-05, "loss": 1.0127, "step": 1605},
    {"epoch": 0.26, "grad_norm": 1.055384874343872, "learning_rate": 4.9102403600685796e-05, "loss": 0.9855, "step": 1610},
    {"epoch": 0.26, "grad_norm": 0.7640814185142517, "learning_rate": 4.9096846761932414e-05, "loss": 0.7963, "step": 1615},
    {"epoch": 0.26, "grad_norm": 0.5843799710273743, "learning_rate": 4.9091273091971124e-05, "loss": 0.8854, "step": 1620},
    {"epoch": 0.26, "grad_norm": 0.9825207591056824, "learning_rate": 4.9085682594695036e-05, "loss": 0.8086, "step": 1625},
    {"epoch": 0.26, "grad_norm": 0.9490563869476318, "learning_rate": 4.908007527400901e-05, "loss": 0.6838, "step": 1630},
    {"epoch": 0.26, "grad_norm": 0.9472922682762146, "learning_rate": 4.907445113382966e-05, "loss": 0.8732, "step": 1635},
    {"epoch": 0.26, "grad_norm": 0.6690593957901001, "learning_rate": 4.9068810178085344e-05, "loss": 0.8551, "step": 1640},
    {"epoch": 0.26, "grad_norm": 0.7245538830757141, "learning_rate": 4.906315241071616e-05, "loss": 0.7639, "step": 1645},
    {"epoch": 0.26, "grad_norm": 0.8342815041542053, "learning_rate": 4.905747783567397e-05, "loss": 0.9417, "step": 1650},
    {"epoch": 0.26, "grad_norm": 0.6241989135742188, "learning_rate": 4.9051786456922354e-05, "loss": 0.9394, "step": 1655},
    {"epoch": 0.26, "grad_norm": 0.5671687126159668, "learning_rate": 4.904607827843663e-05, "loss": 0.6381, "step": 1660},
    {"epoch": 0.27, "grad_norm": 0.795868456363678, "learning_rate": 4.9040353304203864e-05, "loss": 0.7676, "step": 1665},
    {"epoch": 0.27, "grad_norm": 0.9995182156562805, "learning_rate": 4.9034611538222844e-05, "loss": 1.0327, "step": 1670},
    {"epoch": 0.27, "grad_norm": 0.7473803758621216, "learning_rate": 4.902885298450409e-05, "loss": 0.8835, "step": 1675},
    {"epoch": 0.27, "grad_norm": 0.5757468938827515, "learning_rate": 4.902307764706984e-05, "loss": 0.7548, "step": 1680},
    {"epoch": 0.27, "grad_norm": 0.8357987403869629, "learning_rate": 4.901728552995407e-05, "loss": 0.9184, "step": 1685},
    {"epoch": 0.27, "grad_norm": 0.6664137244224548, "learning_rate": 4.901147663720247e-05, "loss": 0.9872, "step": 1690},
    {"epoch": 0.27, "grad_norm": 0.861997663974762, "learning_rate": 4.900565097287243e-05, "loss": 0.8541, "step": 1695},
    {"epoch": 0.27, "grad_norm": 0.7566475868225098, "learning_rate": 4.8999808541033086e-05, "loss": 0.9265, "step": 1700},
    {"epoch": 0.27, "eval_loss": 0.7741928696632385, "eval_runtime": 96.9038, "eval_samples_per_second": 7.193, "eval_steps_per_second": 7.193, "step": 1700},
    {"epoch": 0.27, "grad_norm": 0.45475611090660095, "learning_rate": 4.8993949345765266e-05, "loss": 0.7186, "step": 1705},
    {"epoch": 0.27, "grad_norm": 0.8672823905944824, "learning_rate": 4.8988073391161515e-05, "loss": 0.919, "step": 1710},
    {"epoch": 0.27, "grad_norm": 0.7782495617866516, "learning_rate": 4.8982180681326074e-05, "loss": 0.6618, "step": 1715},
    {"epoch": 0.27, "grad_norm": 0.6640329957008362, "learning_rate": 4.897627122037489e-05, "loss": 0.6662, "step": 1720},
    {"epoch": 0.28, "grad_norm": 0.8019454479217529, "learning_rate": 4.897034501243561e-05, "loss": 0.9459, "step": 1725},
    {"epoch": 0.28, "grad_norm": 0.8336368799209595, "learning_rate": 4.896440206164761e-05, "loss": 0.8058, "step": 1730},
    {"epoch": 0.28, "grad_norm": 0.6316781044006348, "learning_rate": 4.8958442372161906e-05, "loss": 0.9132, "step": 1735},
    {"epoch": 0.28, "grad_norm": 0.7768308520317078, "learning_rate": 4.895246594814124e-05, "loss": 0.7512, "step": 1740},
    {"epoch": 0.28, "grad_norm": 0.9891632795333862, "learning_rate": 4.894647279376002e-05, "loss": 0.843, "step": 1745},
    {"epoch": 0.28, "grad_norm": 0.6162430047988892, "learning_rate": 4.894046291320439e-05, "loss": 0.8233, "step": 1750},
    {"epoch": 0.28, "grad_norm": 0.6184887290000916, "learning_rate": 4.893443631067211e-05, "loss": 0.7428, "step": 1755},
    {"epoch": 0.28, "grad_norm": 0.7117312550544739, "learning_rate": 4.892839299037267e-05, "loss": 0.8707, "step": 1760},
    {"epoch": 0.28, "grad_norm": 0.7165163159370422, "learning_rate": 4.892233295652721e-05, "loss": 1.0485, "step": 1765},
    {"epoch": 0.28, "grad_norm": 0.8377657532691956, "learning_rate": 4.891625621336855e-05, "loss": 0.7368, "step": 1770},
    {"epoch": 0.28, "grad_norm": 0.6349939703941345, "learning_rate": 4.89101627651412e-05, "loss": 0.7357, "step": 1775},
    {"epoch": 0.28, "grad_norm": 4.969137191772461, "learning_rate": 4.890405261610131e-05, "loss": 0.7605, "step": 1780},
    {"epoch": 0.28, "grad_norm": 1.5980018377304077, "learning_rate": 4.889792577051671e-05, "loss": 0.9253, "step": 1785},
    {"epoch": 0.29, "grad_norm": 0.681398332118988, "learning_rate": 4.889178223266688e-05, "loss": 0.7235, "step": 1790},
    {"epoch": 0.29, "grad_norm": 0.6999421715736389, "learning_rate": 4.888562200684299e-05, "loss": 0.8521, "step": 1795},
    {"epoch": 0.29, "grad_norm": 0.7693730592727661, "learning_rate": 4.887944509734783e-05, "loss": 0.8632, "step": 1800},
    {"epoch": 0.29, "eval_loss": 0.76987224817276, "eval_runtime": 96.9052, "eval_samples_per_second": 7.193, "eval_steps_per_second": 7.193, "step": 1800},
    {"epoch": 0.29, "grad_norm": 0.7641138434410095, "learning_rate": 4.8873251508495865e-05, "loss": 0.7074, "step": 1805},
    {"epoch": 0.29, "grad_norm": 0.732545018196106, "learning_rate": 4.886704124461321e-05, "loss": 0.6901, "step": 1810},
    {"epoch": 0.29, "grad_norm": 1.0327179431915283, "learning_rate": 4.88608143100376e-05, "loss": 0.8256, "step": 1815},
    {"epoch": 0.29, "grad_norm": 0.7066757082939148, "learning_rate": 4.885457070911845e-05, "loss": 0.6635, "step": 1820},
    {"epoch": 0.29, "grad_norm": 0.809877336025238, "learning_rate": 4.8848310446216806e-05, "loss": 0.795, "step": 1825},
    {"epoch": 0.29, "grad_norm": 0.738153338432312, "learning_rate": 4.8842033525705335e-05, "loss": 0.9089, "step": 1830},
    {"epoch": 0.29, "grad_norm": 0.754896879196167, "learning_rate": 4.883573995196836e-05, "loss": 0.7103, "step": 1835},
    {"epoch": 0.29, "grad_norm": 1.0111182928085327, "learning_rate": 4.8829429729401826e-05, "loss": 1.046, "step": 1840},
    {"epoch": 0.29, "grad_norm": 0.6233395934104919, "learning_rate": 4.8823102862413306e-05, "loss": 0.761, "step": 1845},
    {"epoch": 0.3, "grad_norm": 1.3443419933319092, "learning_rate": 4.8816759355422e-05, "loss": 0.8436, "step": 1850},
    {"epoch": 0.3, "grad_norm": 0.6685923337936401, "learning_rate": 4.8810399212858736e-05, "loss": 0.8956, "step": 1855},
    {"epoch": 0.3, "grad_norm": 1.0405924320220947, "learning_rate": 4.880402243916596e-05, "loss": 1.1458, "step": 1860},
    {"epoch": 0.3, "grad_norm": 0.8413107991218567, "learning_rate": 4.879762903879772e-05, "loss": 0.8133, "step": 1865},
    {"epoch": 0.3, "grad_norm": 0.7151504158973694, "learning_rate": 4.8791219016219705e-05, "loss": 0.9207, "step": 1870},
    {"epoch": 0.3, "grad_norm": 0.6887856125831604, "learning_rate": 4.878479237590918e-05, "loss": 0.8185, "step": 1875},
    {"epoch": 0.3, "grad_norm": 0.5687748193740845, "learning_rate": 4.877834912235506e-05, "loss": 0.9035, "step": 1880},
    {"epoch": 0.3, "grad_norm": 0.9966350793838501, "learning_rate": 4.877188926005782e-05, "loss": 0.7764, "step": 1885},
    {"epoch": 0.3, "grad_norm": 1.0459462404251099, "learning_rate": 4.8765412793529574e-05, "loss": 0.6658, "step": 1890},
    {"epoch": 0.3, "grad_norm": 0.8338847160339355, "learning_rate": 4.8758919727293995e-05, "loss": 0.7363, "step": 1895},
    {
      "epoch": 0.3,
      "grad_norm": 0.7602768540382385,
      "learning_rate": 4.875241006588638e-05,
      "loss": 1.0081,
      "step": 1900
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 0.7692809700965881, |
|
"eval_runtime": 96.4899, |
|
"eval_samples_per_second": 7.224, |
|
"eval_steps_per_second": 7.224, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5455746054649353, |
|
"learning_rate": 4.874588381385362e-05, |
|
"loss": 0.7855, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.8574795126914978, |
|
"learning_rate": 4.8739340975754165e-05, |
|
"loss": 1.068, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.0321904420852661, |
|
"learning_rate": 4.873278155615808e-05, |
|
"loss": 0.8239, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.2484744787216187, |
|
"learning_rate": 4.8726205559646996e-05, |
|
"loss": 0.9307, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7140147686004639, |
|
"learning_rate": 4.871961299081412e-05, |
|
"loss": 0.9876, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.8003590106964111, |
|
"learning_rate": 4.871300385426426e-05, |
|
"loss": 0.8615, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7282931208610535, |
|
"learning_rate": 4.870637815461376e-05, |
|
"loss": 0.8734, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6800629496574402, |
|
"learning_rate": 4.869973589649055e-05, |
|
"loss": 0.7718, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.8813210129737854, |
|
"learning_rate": 4.869307708453413e-05, |
|
"loss": 0.7943, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6612805724143982, |
|
"learning_rate": 4.868640172339557e-05, |
|
"loss": 0.6807, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.653191328048706, |
|
"learning_rate": 4.867970981773748e-05, |
|
"loss": 0.8948, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.7479822635650635, |
|
"learning_rate": 4.8673001372234025e-05, |
|
"loss": 0.8583, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": NaN, |
|
"learning_rate": 4.8667622710291026e-05, |
|
"loss": 0.7443, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5788535475730896, |
|
"learning_rate": 4.866088450488172e-05, |
|
"loss": 0.7249, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7408040165901184, |
|
"learning_rate": 4.86541297727762e-05, |
|
"loss": 0.7115, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6549968719482422, |
|
"learning_rate": 4.864735851869251e-05, |
|
"loss": 0.9095, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.4595119059085846, |
|
"learning_rate": 4.864057074736026e-05, |
|
"loss": 1.2808, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.5746715068817139, |
|
"learning_rate": 4.863376646352058e-05, |
|
"loss": 0.8139, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6972643136978149, |
|
"learning_rate": 4.862694567192614e-05, |
|
"loss": 0.9797, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6935243010520935, |
|
"learning_rate": 4.8620108377341124e-05, |
|
"loss": 0.7651, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 0.766412615776062, |
|
"eval_runtime": 96.4555, |
|
"eval_samples_per_second": 7.226, |
|
"eval_steps_per_second": 7.226, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9983006715774536, |
|
"learning_rate": 4.861325458454128e-05, |
|
"loss": 0.8256, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6732650995254517, |
|
"learning_rate": 4.860638429831384e-05, |
|
"loss": 0.8136, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6780042052268982, |
|
"learning_rate": 4.859949752345758e-05, |
|
"loss": 0.8911, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9892123937606812, |
|
"learning_rate": 4.8592594264782794e-05, |
|
"loss": 0.7907, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9327254295349121, |
|
"learning_rate": 4.8585674527111266e-05, |
|
"loss": 0.8712, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.0295612812042236, |
|
"learning_rate": 4.857873831527632e-05, |
|
"loss": 0.9188, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.3071186542510986, |
|
"learning_rate": 4.8571785634122766e-05, |
|
"loss": 0.8801, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.9625150561332703, |
|
"learning_rate": 4.856481648850694e-05, |
|
"loss": 0.8333, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.6674854159355164, |
|
"learning_rate": 4.855783088329664e-05, |
|
"loss": 1.0388, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5447000861167908, |
|
"learning_rate": 4.8550828823371196e-05, |
|
"loss": 0.7893, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.9970148801803589, |
|
"learning_rate": 4.854381031362142e-05, |
|
"loss": 0.8198, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7657136917114258, |
|
"learning_rate": 4.853677535894961e-05, |
|
"loss": 0.5977, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.4694065451622009, |
|
"learning_rate": 4.852972396426956e-05, |
|
"loss": 0.5965, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.8955700993537903, |
|
"learning_rate": 4.852265613450653e-05, |
|
"loss": 0.6938, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.9884099960327148, |
|
"learning_rate": 4.851557187459727e-05, |
|
"loss": 0.8946, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.6793637871742249, |
|
"learning_rate": 4.850847118949002e-05, |
|
"loss": 0.841, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7438017725944519, |
|
"learning_rate": 4.850135408414447e-05, |
|
"loss": 0.8843, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7632609009742737, |
|
"learning_rate": 4.849422056353178e-05, |
|
"loss": 0.8263, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7281492352485657, |
|
"learning_rate": 4.84870706326346e-05, |
|
"loss": 0.8989, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6480591893196106, |
|
"learning_rate": 4.847990429644702e-05, |
|
"loss": 1.0037, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 0.7653521299362183, |
|
"eval_runtime": 96.4452, |
|
"eval_samples_per_second": 7.227, |
|
"eval_steps_per_second": 7.227, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5578673481941223, |
|
"learning_rate": 4.8472721559974584e-05, |
|
"loss": 0.911, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.5615595579147339, |
|
"learning_rate": 4.846552242823433e-05, |
|
"loss": 0.6938, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.588246762752533, |
|
"learning_rate": 4.845830690625469e-05, |
|
"loss": 0.7898, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8140611052513123, |
|
"learning_rate": 4.8451074999075595e-05, |
|
"loss": 0.7702, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.9400056600570679, |
|
"learning_rate": 4.8443826711748385e-05, |
|
"loss": 0.7959, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7187873721122742, |
|
"learning_rate": 4.8436562049335874e-05, |
|
"loss": 0.7223, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7627830505371094, |
|
"learning_rate": 4.8429281016912275e-05, |
|
"loss": 0.793, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6755004525184631, |
|
"learning_rate": 4.842198361956328e-05, |
|
"loss": 0.7665, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6032254695892334, |
|
"learning_rate": 4.8414669862385966e-05, |
|
"loss": 0.7952, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8377916216850281, |
|
"learning_rate": 4.840733975048887e-05, |
|
"loss": 1.0016, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7361429929733276, |
|
"learning_rate": 4.839999328899194e-05, |
|
"loss": 0.8773, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8006517887115479, |
|
"learning_rate": 4.8392630483026546e-05, |
|
"loss": 0.9334, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.9716467261314392, |
|
"learning_rate": 4.8385251337735473e-05, |
|
"loss": 1.0359, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6826418042182922, |
|
"learning_rate": 4.8377855858272925e-05, |
|
"loss": 0.6841, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.4519975781440735, |
|
"learning_rate": 4.8370444049804494e-05, |
|
"loss": 0.8326, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.677891731262207, |
|
"learning_rate": 4.836301591750721e-05, |
|
"loss": 1.0841, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.5161852836608887, |
|
"learning_rate": 4.835557146656948e-05, |
|
"loss": 0.8701, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6586780548095703, |
|
"learning_rate": 4.834811070219112e-05, |
|
"loss": 0.8261, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.48046165704727173, |
|
"learning_rate": 4.834063362958333e-05, |
|
"loss": 0.6375, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.0315968990325928, |
|
"learning_rate": 4.833314025396872e-05, |
|
"loss": 0.8768, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 0.7641988396644592, |
|
"eval_runtime": 96.3923, |
|
"eval_samples_per_second": 7.231, |
|
"eval_steps_per_second": 7.231, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7704123258590698, |
|
"learning_rate": 4.8325630580581263e-05, |
|
"loss": 0.8849, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 1.087425708770752, |
|
"learning_rate": 4.831810461466634e-05, |
|
"loss": 0.9828, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.4766077995300293, |
|
"learning_rate": 4.83105623614807e-05, |
|
"loss": 0.7103, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6079148054122925, |
|
"learning_rate": 4.830300382629247e-05, |
|
"loss": 0.7253, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6767585873603821, |
|
"learning_rate": 4.829542901438115e-05, |
|
"loss": 0.7852, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7065784335136414, |
|
"learning_rate": 4.8287837931037585e-05, |
|
"loss": 0.8047, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.8305274248123169, |
|
"learning_rate": 4.828023058156404e-05, |
|
"loss": 0.7912, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.8435990810394287, |
|
"learning_rate": 4.827260697127409e-05, |
|
"loss": 0.826, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.8484389185905457, |
|
"learning_rate": 4.8264967105492705e-05, |
|
"loss": 0.706, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7461299300193787, |
|
"learning_rate": 4.825731098955617e-05, |
|
"loss": 0.763, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7928741574287415, |
|
"learning_rate": 4.824963862881216e-05, |
|
"loss": 0.8125, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7152695059776306, |
|
"learning_rate": 4.824195002861968e-05, |
|
"loss": 1.129, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.8594226241111755, |
|
"learning_rate": 4.8234245194349056e-05, |
|
"loss": 0.8873, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.9760085940361023, |
|
"learning_rate": 4.822652413138199e-05, |
|
"loss": 0.9713, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7297483682632446, |
|
"learning_rate": 4.8218786845111505e-05, |
|
"loss": 0.6953, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.8251492381095886, |
|
"learning_rate": 4.8211033340941956e-05, |
|
"loss": 0.7649, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.742917537689209, |
|
"learning_rate": 4.820326362428901e-05, |
|
"loss": 0.9756, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7784115076065063, |
|
"learning_rate": 4.819547770057969e-05, |
|
"loss": 0.6937, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.782772183418274, |
|
"learning_rate": 4.8187675575252314e-05, |
|
"loss": 0.9062, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7802585363388062, |
|
"learning_rate": 4.8179857253756514e-05, |
|
"loss": 0.8052, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_loss": 0.7618402242660522, |
|
"eval_runtime": 96.4079, |
|
"eval_samples_per_second": 7.23, |
|
"eval_steps_per_second": 7.23, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.799985945224762, |
|
"learning_rate": 4.8172022741553255e-05, |
|
"loss": 0.9046, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.026978850364685, |
|
"learning_rate": 4.816417204411481e-05, |
|
"loss": 0.7195, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.8067365884780884, |
|
"learning_rate": 4.8156305166924734e-05, |
|
"loss": 0.8193, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.247164249420166, |
|
"learning_rate": 4.81484221154779e-05, |
|
"loss": 0.6138, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.8662647604942322, |
|
"learning_rate": 4.814052289528047e-05, |
|
"loss": 0.7763, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.9020537734031677, |
|
"learning_rate": 4.813260751184992e-05, |
|
"loss": 0.9236, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6113781929016113, |
|
"learning_rate": 4.812467597071499e-05, |
|
"loss": 0.8753, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6988622546195984, |
|
"learning_rate": 4.811672827741572e-05, |
|
"loss": 0.6747, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.9095928072929382, |
|
"learning_rate": 4.810876443750344e-05, |
|
"loss": 1.0578, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.643699049949646, |
|
"learning_rate": 4.8100784456540724e-05, |
|
"loss": 0.8177, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7084022760391235, |
|
"learning_rate": 4.809278834010146e-05, |
|
"loss": 0.9345, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5328305959701538, |
|
"learning_rate": 4.808477609377078e-05, |
|
"loss": 0.6781, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.8238436579704285, |
|
"learning_rate": 4.80767477231451e-05, |
|
"loss": 0.7306, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.0184216499328613, |
|
"learning_rate": 4.806870323383208e-05, |
|
"loss": 1.0288, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.8620426654815674, |
|
"learning_rate": 4.806064263145066e-05, |
|
"loss": 0.7925, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6541377305984497, |
|
"learning_rate": 4.805256592163102e-05, |
|
"loss": 0.8629, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.8664489984512329, |
|
"learning_rate": 4.8044473110014594e-05, |
|
"loss": 0.8184, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7283564209938049, |
|
"learning_rate": 4.803636420225406e-05, |
|
"loss": 0.9444, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7168800234794617, |
|
"learning_rate": 4.802823920401335e-05, |
|
"loss": 0.8118, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.8198531866073608, |
|
"learning_rate": 4.802009812096762e-05, |
|
"loss": 0.7271, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 0.7595117688179016, |
|
"eval_runtime": 96.4847, |
|
"eval_samples_per_second": 7.224, |
|
"eval_steps_per_second": 7.224, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5693966150283813, |
|
"learning_rate": 4.801194095880327e-05, |
|
"loss": 0.7801, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7175332307815552, |
|
"learning_rate": 4.800376772321793e-05, |
|
"loss": 0.7873, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7779633402824402, |
|
"learning_rate": 4.799557841992046e-05, |
|
"loss": 0.894, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7832231521606445, |
|
"learning_rate": 4.798737305463092e-05, |
|
"loss": 0.8035, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5115272998809814, |
|
"learning_rate": 4.797915163308064e-05, |
|
"loss": 0.8885, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.9534878730773926, |
|
"learning_rate": 4.79709141610121e-05, |
|
"loss": 0.8175, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7053850889205933, |
|
"learning_rate": 4.796266064417905e-05, |
|
"loss": 0.6971, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.236257791519165, |
|
"learning_rate": 4.795439108834641e-05, |
|
"loss": 1.0832, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6936543583869934, |
|
"learning_rate": 4.794610549929031e-05, |
|
"loss": 0.858, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8064691424369812, |
|
"learning_rate": 4.793780388279809e-05, |
|
"loss": 0.6951, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7180449962615967, |
|
"learning_rate": 4.792948624466827e-05, |
|
"loss": 0.6779, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6903377175331116, |
|
"learning_rate": 4.792115259071058e-05, |
|
"loss": 0.8281, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.9112733006477356, |
|
"learning_rate": 4.791280292674591e-05, |
|
"loss": 0.938, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8657469153404236, |
|
"learning_rate": 4.790443725860636e-05, |
|
"loss": 0.8063, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.9260883927345276, |
|
"learning_rate": 4.7896055592135194e-05, |
|
"loss": 1.0093, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7651245594024658, |
|
"learning_rate": 4.788765793318685e-05, |
|
"loss": 0.6686, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6063816547393799, |
|
"learning_rate": 4.7879244287626945e-05, |
|
"loss": 0.8516, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.9127621650695801, |
|
"learning_rate": 4.787081466133225e-05, |
|
"loss": 0.7992, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.061246633529663, |
|
"learning_rate": 4.7862369060190716e-05, |
|
"loss": 0.8232, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7100695967674255, |
|
"learning_rate": 4.785390749010143e-05, |
|
"loss": 0.9615, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.7581596970558167, |
|
"eval_runtime": 96.5797, |
|
"eval_samples_per_second": 7.217, |
|
"eval_steps_per_second": 7.217, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 16.361513137817383, |
|
"learning_rate": 4.784542995697464e-05, |
|
"loss": 0.7725, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7746205925941467, |
|
"learning_rate": 4.7836936466731764e-05, |
|
"loss": 0.8464, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7703484892845154, |
|
"learning_rate": 4.7828427025305345e-05, |
|
"loss": 0.8596, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7838412523269653, |
|
"learning_rate": 4.7819901638639066e-05, |
|
"loss": 0.666, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.5832842588424683, |
|
"learning_rate": 4.781136031268776e-05, |
|
"loss": 0.4995, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.798271894454956, |
|
"learning_rate": 4.780280305341739e-05, |
|
"loss": 1.0017, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.463828444480896, |
|
"learning_rate": 4.779422986680503e-05, |
|
"loss": 0.5894, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.761908233165741, |
|
"learning_rate": 4.7785640758838916e-05, |
|
"loss": 0.9198, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8427887558937073, |
|
"learning_rate": 4.777703573551837e-05, |
|
"loss": 0.8572, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6188894510269165, |
|
"learning_rate": 4.776841480285384e-05, |
|
"loss": 0.9102, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.7198623418807983, |
|
"learning_rate": 4.775977796686691e-05, |
|
"loss": 0.8472, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.0144587755203247, |
|
"learning_rate": 4.775112523359023e-05, |
|
"loss": 0.7059, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.9784219861030579, |
|
"learning_rate": 4.77424566090676e-05, |
|
"loss": 0.7417, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5349156856536865, |
|
"learning_rate": 4.773377209935387e-05, |
|
"loss": 0.7287, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.7715370655059814, |
|
"learning_rate": 4.772507171051502e-05, |
|
"loss": 0.8393, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8483054637908936, |
|
"learning_rate": 4.771635544862813e-05, |
|
"loss": 0.8938, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8196272253990173, |
|
"learning_rate": 4.770762331978132e-05, |
|
"loss": 0.8321, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6155353784561157, |
|
"learning_rate": 4.769887533007384e-05, |
|
"loss": 0.9291, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8897277116775513, |
|
"learning_rate": 4.769011148561601e-05, |
|
"loss": 0.7098, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.2256160974502563, |
|
"learning_rate": 4.768133179252921e-05, |
|
"loss": 0.8284, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 0.7554901838302612, |
|
"eval_runtime": 96.5279, |
|
"eval_samples_per_second": 7.221, |
|
"eval_steps_per_second": 7.221, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6943432688713074, |
|
"learning_rate": 4.767253625694588e-05, |
|
"loss": 0.8785, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6707726120948792, |
|
"learning_rate": 4.7663724885009556e-05, |
|
"loss": 0.7949, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.5595915913581848, |
|
"learning_rate": 4.765489768287481e-05, |
|
"loss": 0.8796, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.9889727234840393, |
|
"learning_rate": 4.7646054656707306e-05, |
|
"loss": 1.0676, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.8624396324157715, |
|
"learning_rate": 4.763719581268371e-05, |
|
"loss": 0.709, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7466241121292114, |
|
"learning_rate": 4.7628321156991767e-05, |
|
"loss": 0.8084, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6439360976219177, |
|
"learning_rate": 4.761943069583027e-05, |
|
"loss": 0.8831, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.9999917149543762, |
|
"learning_rate": 4.761052443540904e-05, |
|
"loss": 0.6372, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.688369870185852, |
|
"learning_rate": 4.760160238194894e-05, |
|
"loss": 0.7938, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6920734643936157, |
|
"learning_rate": 4.759266454168186e-05, |
|
"loss": 0.7378, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7592100501060486, |
|
"learning_rate": 4.758371092085073e-05, |
|
"loss": 1.097, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.9243403077125549, |
|
"learning_rate": 4.757474152570946e-05, |
|
"loss": 1.0404, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.8212980031967163, |
|
"learning_rate": 4.756575636252304e-05, |
|
"loss": 0.6179, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6905696392059326, |
|
"learning_rate": 4.755675543756744e-05, |
|
"loss": 0.8398, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.8420882821083069, |
|
"learning_rate": 4.754773875712961e-05, |
|
"loss": 0.7552, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6216087341308594, |
|
"learning_rate": 4.7538706327507575e-05, |
|
"loss": 0.8345, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7430551648139954, |
|
"learning_rate": 4.75296581550103e-05, |
|
"loss": 0.8277, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7866222262382507, |
|
"learning_rate": 4.752059424595778e-05, |
|
"loss": 0.9178, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6548468470573425, |
|
"learning_rate": 4.7511514606680985e-05, |
|
"loss": 0.745, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6956586837768555, |
|
"learning_rate": 4.750241924352187e-05, |
|
"loss": 0.8631, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_loss": 0.7539612650871277, |
|
"eval_runtime": 96.4433, |
|
"eval_samples_per_second": 7.227, |
|
"eval_steps_per_second": 7.227, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6508235335350037, |
|
"learning_rate": 4.7493308162833394e-05, |
|
"loss": 0.9936, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.8658422827720642, |
|
"learning_rate": 4.7484181370979475e-05, |
|
"loss": 0.8, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.9571516513824463, |
|
"learning_rate": 4.747503887433501e-05, |
|
"loss": 0.7028, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7693742513656616, |
|
"learning_rate": 4.7465880679285866e-05, |
|
"loss": 0.7194, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.34340238571167, |
|
"learning_rate": 4.745670679222888e-05, |
|
"loss": 1.0445, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.71327805519104, |
|
"learning_rate": 4.7447517219571834e-05, |
|
"loss": 0.8088, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.9449920058250427, |
|
"learning_rate": 4.743831196773349e-05, |
|
"loss": 0.7939, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.8091790676116943, |
|
"learning_rate": 4.742909104314353e-05, |
|
"loss": 0.7816, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5790795087814331, |
|
"learning_rate": 4.741985445224263e-05, |
|
"loss": 0.8778, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.1936956644058228, |
|
"learning_rate": 4.741060220148236e-05, |
|
"loss": 1.0242, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5158389806747437, |
|
"learning_rate": 4.7401334297325244e-05, |
|
"loss": 0.7954, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.8950900435447693, |
|
"learning_rate": 4.7392050746244754e-05, |
|
"loss": 0.7603, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7289401888847351, |
|
"learning_rate": 4.738275155472528e-05, |
|
"loss": 0.879, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.8410510420799255, |
|
"learning_rate": 4.7373436729262145e-05, |
|
"loss": 0.7399, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7992503643035889, |
|
"learning_rate": 4.736410627636156e-05, |
|
"loss": 0.6779, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6706194281578064, |
|
"learning_rate": 4.73547602025407e-05, |
|
"loss": 0.7878, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7177903652191162, |
|
"learning_rate": 4.734539851432763e-05, |
|
"loss": 0.6958, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6557692885398865, |
|
"learning_rate": 4.73360212182613e-05, |
|
"loss": 0.6695, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6754157543182373, |
|
"learning_rate": 4.7326628320891586e-05, |
|
"loss": 0.9057, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.1403777599334717, |
|
"learning_rate": 4.731721982877926e-05, |
|
"loss": 1.0507, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 0.7518497705459595, |
|
"eval_runtime": 96.4525, |
|
"eval_samples_per_second": 7.226, |
|
"eval_steps_per_second": 7.226, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.8268899321556091, |
|
"learning_rate": 4.730779574849598e-05, |
|
"loss": 0.7375, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.5358712673187256, |
|
"learning_rate": 4.72983560866243e-05, |
|
"loss": 0.7839, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.0761948823928833, |
|
"learning_rate": 4.7288900849757636e-05, |
|
"loss": 0.7936, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7037429213523865, |
|
"learning_rate": 4.7279430044500315e-05, |
|
"loss": 0.6875, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6378889679908752, |
|
"learning_rate": 4.726994367746751e-05, |
|
"loss": 0.9209, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.5508277416229248, |
|
"learning_rate": 4.7260441755285284e-05, |
|
"loss": 0.9402, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.9046247005462646, |
|
"learning_rate": 4.725092428459055e-05, |
|
"loss": 0.6336, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.8689594864845276, |
|
"learning_rate": 4.7241391272031096e-05, |
|
"loss": 1.1281, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.8785949945449829, |
|
"learning_rate": 4.723184272426555e-05, |
|
"loss": 0.711, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.9959015250205994, |
|
"learning_rate": 4.722227864796339e-05, |
|
"loss": 0.7432, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6438590884208679, |
|
"learning_rate": 4.721269904980497e-05, |
|
"loss": 0.883, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6714455485343933, |
|
"learning_rate": 4.720310393648145e-05, |
|
"loss": 1.065, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7378780245780945, |
|
"learning_rate": 4.7193493314694846e-05, |
|
"loss": 0.5352, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7698020935058594, |
|
"learning_rate": 4.7183867191158006e-05, |
|
"loss": 0.7016, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.952795684337616, |
|
"learning_rate": 4.7174225572594586e-05, |
|
"loss": 1.0659, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6401458978652954, |
|
"learning_rate": 4.71645684657391e-05, |
|
"loss": 0.7335, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.8375076055526733, |
|
"learning_rate": 4.715489587733685e-05, |
|
"loss": 0.9264, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.693505048751831, |
|
"learning_rate": 4.714520781414397e-05, |
|
"loss": 1.0286, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.0239859819412231, |
|
"learning_rate": 4.7135504282927375e-05, |
|
"loss": 0.6875, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.602035403251648, |
|
"learning_rate": 4.712578529046483e-05, |
|
"loss": 0.8247, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 0.7512397766113281, |
|
"eval_runtime": 96.4745, |
|
"eval_samples_per_second": 7.225, |
|
"eval_steps_per_second": 7.225, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6859713196754456, |
|
"learning_rate": 4.711605084354487e-05, |
|
"loss": 0.7521, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7126486301422119, |
|
"learning_rate": 4.7106300948966817e-05, |
|
"loss": 0.7656, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.4363511800765991, |
|
"learning_rate": 4.70965356135408e-05, |
|
"loss": 1.1595, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6381859183311462, |
|
"learning_rate": 4.7086754844087724e-05, |
|
"loss": 0.6949, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.7931796312332153, |
|
"learning_rate": 4.7076958647439284e-05, |
|
"loss": 1.0821, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.9333865642547607, |
|
"learning_rate": 4.706714703043795e-05, |
|
"loss": 0.7753, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.8860915899276733, |
|
"learning_rate": 4.705731999993694e-05, |
|
"loss": 0.7257, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6868377327919006, |
|
"learning_rate": 4.704747756280027e-05, |
|
"loss": 0.8148, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.5337914228439331, |
|
"learning_rate": 4.7037619725902706e-05, |
|
"loss": 0.7379, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.4664730429649353, |
|
"learning_rate": 4.7027746496129745e-05, |
|
"loss": 0.6226, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.7305762767791748, |
|
"learning_rate": 4.701785788037768e-05, |
|
"loss": 0.9018, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6576158404350281, |
|
"learning_rate": 4.7007953885553525e-05, |
|
"loss": 0.7777, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.9728206396102905, |
|
"learning_rate": 4.699803451857503e-05, |
|
"loss": 0.8004, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6211077570915222, |
|
"learning_rate": 4.69880997863707e-05, |
|
"loss": 0.7407, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.2564159631729126, |
|
"learning_rate": 4.697814969587976e-05, |
|
"loss": 0.7993, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.927930474281311, |
|
"learning_rate": 4.696818425405217e-05, |
|
"loss": 0.8803, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.9062425494194031, |
|
"learning_rate": 4.695820346784861e-05, |
|
"loss": 0.8835, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6738875508308411, |
|
"learning_rate": 4.694820734424047e-05, |
|
"loss": 0.7817, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.326353669166565, |
|
"learning_rate": 4.6938195890209866e-05, |
|
"loss": 0.9213, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.4853856563568115, |
|
"learning_rate": 4.692816911274962e-05, |
|
"loss": 0.9835, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 0.7496011257171631, |
|
"eval_runtime": 96.515, |
|
"eval_samples_per_second": 7.222, |
|
"eval_steps_per_second": 7.222, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.5411309003829956, |
|
"learning_rate": 4.691812701886324e-05, |
|
"loss": 0.7556, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7545793652534485, |
|
"learning_rate": 4.6908069615564966e-05, |
|
"loss": 0.8295, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.850104808807373, |
|
"learning_rate": 4.6897996909879695e-05, |
|
"loss": 1.0194, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.69708651304245, |
|
"learning_rate": 4.6887908908843026e-05, |
|
"loss": 0.7918, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.1333253383636475, |
|
"learning_rate": 4.687780561950126e-05, |
|
"loss": 0.7287, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.9223487973213196, |
|
"learning_rate": 4.686768704891134e-05, |
|
"loss": 0.9592, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7700949311256409, |
|
"learning_rate": 4.685755320414091e-05, |
|
"loss": 0.8572, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5573208332061768, |
|
"learning_rate": 4.684740409226829e-05, |
|
"loss": 0.9441, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.6346720457077026, |
|
"learning_rate": 4.6837239720382426e-05, |
|
"loss": 0.8398, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.8065741062164307, |
|
"learning_rate": 4.682706009558297e-05, |
|
"loss": 0.9325, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.3001660406589508, |
|
"learning_rate": 4.681686522498018e-05, |
|
"loss": 0.8997, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.860211968421936, |
|
"learning_rate": 4.680665511569501e-05, |
|
"loss": 0.6883, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.722518265247345, |
|
"learning_rate": 4.6796429774859015e-05, |
|
"loss": 0.8607, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.6525880694389343, |
|
"learning_rate": 4.678618920961442e-05, |
|
"loss": 0.9256, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7581719756126404, |
|
"learning_rate": 4.6775933427114084e-05, |
|
"loss": 0.662, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.6604760885238647, |
|
"learning_rate": 4.676566243452146e-05, |
|
"loss": 0.734, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7573785781860352, |
|
"learning_rate": 4.6755376239010665e-05, |
|
"loss": 0.7113, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.8933848738670349, |
|
"learning_rate": 4.674507484776641e-05, |
|
"loss": 0.8523, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5941946506500244, |
|
"learning_rate": 4.6734758267984044e-05, |
|
"loss": 0.7907, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7756261825561523, |
|
"learning_rate": 4.672442650686949e-05, |
|
"loss": 0.8407, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"eval_loss": 0.7495761513710022, |
|
"eval_runtime": 96.4482, |
|
"eval_samples_per_second": 7.227, |
|
"eval_steps_per_second": 7.227, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6407367587089539, |
|
"learning_rate": 4.671407957163931e-05, |
|
"loss": 0.6413, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.069754719734192, |
|
"learning_rate": 4.670371746952063e-05, |
|
"loss": 0.8934, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.9698624610900879, |
|
"learning_rate": 4.669334020775122e-05, |
|
"loss": 0.7261, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6487118005752563, |
|
"learning_rate": 4.668294779357938e-05, |
|
"loss": 0.8951, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.0640240907669067, |
|
"learning_rate": 4.667254023426404e-05, |
|
"loss": 0.8568, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5772892236709595, |
|
"learning_rate": 4.666211753707468e-05, |
|
"loss": 0.9798, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6915898323059082, |
|
"learning_rate": 4.665167970929137e-05, |
|
"loss": 0.8694, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5959879159927368, |
|
"learning_rate": 4.664122675820474e-05, |
|
"loss": 0.6521, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.833991289138794, |
|
"learning_rate": 4.663075869111597e-05, |
|
"loss": 0.9194, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.9575549960136414, |
|
"learning_rate": 4.662027551533685e-05, |
|
"loss": 1.0088, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5501818656921387, |
|
"learning_rate": 4.660977723818965e-05, |
|
"loss": 0.5997, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6001989245414734, |
|
"learning_rate": 4.659926386700725e-05, |
|
"loss": 0.7643, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6806654930114746, |
|
"learning_rate": 4.658873540913303e-05, |
|
"loss": 0.899, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7098959684371948, |
|
"learning_rate": 4.657819187192094e-05, |
|
"loss": 1.0281, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.9234817028045654, |
|
"learning_rate": 4.6567633262735446e-05, |
|
"loss": 0.9495, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.596527636051178, |
|
"learning_rate": 4.655705958895153e-05, |
|
"loss": 0.6352, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.154539704322815, |
|
"learning_rate": 4.6546470857954736e-05, |
|
"loss": 0.8939, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7502239942550659, |
|
"learning_rate": 4.653586707714108e-05, |
|
"loss": 0.692, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7868794202804565, |
|
"learning_rate": 4.652524825391711e-05, |
|
"loss": 0.908, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6436206102371216, |
|
"learning_rate": 4.6514614395699886e-05, |
|
"loss": 0.7417, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 0.7466740012168884, |
|
"eval_runtime": 96.4309, |
|
"eval_samples_per_second": 7.228, |
|
"eval_steps_per_second": 7.228, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.8566870093345642, |
|
"learning_rate": 4.6503965509916956e-05, |
|
"loss": 0.8041, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5982272028923035, |
|
"learning_rate": 4.649330160400639e-05, |
|
"loss": 0.4528, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6511960029602051, |
|
"learning_rate": 4.648262268541671e-05, |
|
"loss": 0.877, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.8476071357727051, |
|
"learning_rate": 4.6471928761606965e-05, |
|
"loss": 0.7145, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.0408881902694702, |
|
"learning_rate": 4.6461219840046654e-05, |
|
"loss": 0.5539, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7445903420448303, |
|
"learning_rate": 4.645049592821577e-05, |
|
"loss": 0.8306, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.9672279357910156, |
|
"learning_rate": 4.6439757033604756e-05, |
|
"loss": 0.8645, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7082134485244751, |
|
"learning_rate": 4.6429003163714556e-05, |
|
"loss": 0.8188, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8803107142448425, |
|
"learning_rate": 4.641823432605654e-05, |
|
"loss": 0.7956, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7926101088523865, |
|
"learning_rate": 4.640745052815254e-05, |
|
"loss": 0.715, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.890519380569458, |
|
"learning_rate": 4.639665177753485e-05, |
|
"loss": 0.8825, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.9909971952438354, |
|
"learning_rate": 4.638583808174619e-05, |
|
"loss": 0.7843, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7450726628303528, |
|
"learning_rate": 4.6375009448339743e-05, |
|
"loss": 0.9714, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8220781683921814, |
|
"learning_rate": 4.636416588487911e-05, |
|
"loss": 0.8467, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.025499701499939, |
|
"learning_rate": 4.63533073989383e-05, |
|
"loss": 0.9301, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8067827820777893, |
|
"learning_rate": 4.634243399810181e-05, |
|
"loss": 0.7078, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8833619952201843, |
|
"learning_rate": 4.6331545689964475e-05, |
|
"loss": 0.699, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.0600448846817017, |
|
"learning_rate": 4.632064248213159e-05, |
|
"loss": 0.7849, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.0503095388412476, |
|
"learning_rate": 4.630972438221885e-05, |
|
"loss": 0.6215, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.5159885287284851, |
|
"learning_rate": 4.629879139785235e-05, |
|
"loss": 0.7449, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 0.7472941279411316, |
|
"eval_runtime": 96.4994, |
|
"eval_samples_per_second": 7.223, |
|
"eval_steps_per_second": 7.223, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.072464108467102, |
|
"learning_rate": 4.6287843536668575e-05, |
|
"loss": 0.8511, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.9016098976135254, |
|
"learning_rate": 4.62768808063144e-05, |
|
"loss": 0.7373, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.0161947011947632, |
|
"learning_rate": 4.626590321444712e-05, |
|
"loss": 0.9035, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7459146976470947, |
|
"learning_rate": 4.625491076873435e-05, |
|
"loss": 0.6468, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.950080394744873, |
|
"learning_rate": 4.624390347685413e-05, |
|
"loss": 0.7211, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7308927774429321, |
|
"learning_rate": 4.623288134649485e-05, |
|
"loss": 0.9238, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7227129340171814, |
|
"learning_rate": 4.622184438535527e-05, |
|
"loss": 0.9773, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7054020166397095, |
|
"learning_rate": 4.62107926011445e-05, |
|
"loss": 0.7783, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6535981297492981, |
|
"learning_rate": 4.619972600158201e-05, |
|
"loss": 0.6559, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7245693206787109, |
|
"learning_rate": 4.618864459439762e-05, |
|
"loss": 0.8352, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.9683626890182495, |
|
"learning_rate": 4.6177548387331485e-05, |
|
"loss": 0.9397, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.1847660541534424, |
|
"learning_rate": 4.616643738813411e-05, |
|
"loss": 0.7383, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.8566804528236389, |
|
"learning_rate": 4.615531160456633e-05, |
|
"loss": 0.8066, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7312522530555725, |
|
"learning_rate": 4.61441710443993e-05, |
|
"loss": 0.7974, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6620572209358215, |
|
"learning_rate": 4.6133015715414484e-05, |
|
"loss": 0.9136, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.5405072569847107, |
|
"learning_rate": 4.612184562540369e-05, |
|
"loss": 0.6921, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7474086284637451, |
|
"learning_rate": 4.611066078216901e-05, |
|
"loss": 0.8463, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.9125152230262756, |
|
"learning_rate": 4.609946119352287e-05, |
|
"loss": 0.8508, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.9998400211334229, |
|
"learning_rate": 4.608824686728797e-05, |
|
"loss": 0.8735, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.5990025401115417, |
|
"learning_rate": 4.6077017811297304e-05, |
|
"loss": 0.8562, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 0.743736743927002, |
|
"eval_runtime": 96.3748, |
|
"eval_samples_per_second": 7.232, |
|
"eval_steps_per_second": 7.232, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.35676899552345276, |
|
"learning_rate": 4.606577403339418e-05, |
|
"loss": 0.8914, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.772233784198761, |
|
"learning_rate": 4.605451554143216e-05, |
|
"loss": 0.779, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7336989641189575, |
|
"learning_rate": 4.604324234327509e-05, |
|
"loss": 0.7678, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7039794325828552, |
|
"learning_rate": 4.603195444679711e-05, |
|
"loss": 0.8783, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6955629587173462, |
|
"learning_rate": 4.602065185988259e-05, |
|
"loss": 0.818, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7369412779808044, |
|
"learning_rate": 4.60093345904262e-05, |
|
"loss": 0.6942, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.6824669241905212, |
|
"learning_rate": 4.5998002646332835e-05, |
|
"loss": 0.9274, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.355720043182373, |
|
"learning_rate": 4.598665603551765e-05, |
|
"loss": 0.7219, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8629677295684814, |
|
"learning_rate": 4.597529476590605e-05, |
|
"loss": 0.8023, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.2956135272979736, |
|
"learning_rate": 4.596391884543368e-05, |
|
"loss": 0.9574, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.2683722972869873, |
|
"learning_rate": 4.59525282820464e-05, |
|
"loss": 0.6996, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7317371368408203, |
|
"learning_rate": 4.594112308370032e-05, |
|
"loss": 1.03, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.0310641527175903, |
|
"learning_rate": 4.5929703258361756e-05, |
|
"loss": 0.6917, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.9479489326477051, |
|
"learning_rate": 4.591826881400726e-05, |
|
"loss": 0.9939, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.9485552310943604, |
|
"learning_rate": 4.5906819758623576e-05, |
|
"loss": 1.0317, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.724987268447876, |
|
"learning_rate": 4.589535610020765e-05, |
|
"loss": 0.6915, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7091718316078186, |
|
"learning_rate": 4.5883877846766654e-05, |
|
"loss": 0.8673, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.8297457098960876, |
|
"learning_rate": 4.587238500631793e-05, |
|
"loss": 0.8114, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7213541269302368, |
|
"learning_rate": 4.586087758688903e-05, |
|
"loss": 0.863, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.1096009016036987, |
|
"learning_rate": 4.584935559651765e-05, |
|
"loss": 0.9222, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.7428527474403381, |
|
"eval_runtime": 96.3993, |
|
"eval_samples_per_second": 7.23, |
|
"eval_steps_per_second": 7.23, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5834380984306335, |
|
"learning_rate": 4.583781904325172e-05, |
|
"loss": 0.6609, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5797068476676941, |
|
"learning_rate": 4.5826267935149285e-05, |
|
"loss": 0.7933, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6027450561523438, |
|
"learning_rate": 4.581470228027861e-05, |
|
"loss": 0.7841, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5674509406089783, |
|
"learning_rate": 4.5803122086718077e-05, |
|
"loss": 0.7721, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7398461103439331, |
|
"learning_rate": 4.5791527362556235e-05, |
|
"loss": 0.7651, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6617181301116943, |
|
"learning_rate": 4.577991811589181e-05, |
|
"loss": 0.9359, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.49279505014419556, |
|
"learning_rate": 4.576829435483362e-05, |
|
"loss": 0.6278, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5201964378356934, |
|
"learning_rate": 4.575665608750067e-05, |
|
"loss": 0.853, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7188725471496582, |
|
"learning_rate": 4.5745003322022084e-05, |
|
"loss": 0.8338, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.0798031091690063, |
|
"learning_rate": 4.573333606653708e-05, |
|
"loss": 0.9776, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.6439509987831116, |
|
"learning_rate": 4.5721654329195046e-05, |
|
"loss": 0.9331, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7663920521736145, |
|
"learning_rate": 4.570995811815545e-05, |
|
"loss": 1.0533, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7230969071388245, |
|
"learning_rate": 4.569824744158789e-05, |
|
"loss": 0.6966, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.016112208366394, |
|
"learning_rate": 4.568652230767205e-05, |
|
"loss": 0.8393, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.0165222883224487, |
|
"learning_rate": 4.567478272459773e-05, |
|
"loss": 1.0218, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.709685742855072, |
|
"learning_rate": 4.5663028700564826e-05, |
|
"loss": 0.7273, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5664321780204773, |
|
"learning_rate": 4.565126024378328e-05, |
|
"loss": 0.9079, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7938306927680969, |
|
"learning_rate": 4.5639477362473173e-05, |
|
"loss": 0.976, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.6710417866706848, |
|
"learning_rate": 4.5627680064864606e-05, |
|
"loss": 1.1969, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.9886580109596252, |
|
"learning_rate": 4.5615868359197796e-05, |
|
"loss": 0.9242, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_loss": 0.7412505149841309, |
|
"eval_runtime": 96.4, |
|
"eval_samples_per_second": 7.23, |
|
"eval_steps_per_second": 7.23, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.8157562613487244, |
|
"learning_rate": 4.5604042253723014e-05, |
|
"loss": 0.8398, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.2368131875991821, |
|
"learning_rate": 4.559220175670054e-05, |
|
"loss": 0.8742, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6060155034065247, |
|
"learning_rate": 4.558034687640078e-05, |
|
"loss": 0.6993, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.1366558074951172, |
|
"learning_rate": 4.556847762110415e-05, |
|
"loss": 0.9328, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.7205525636672974, |
|
"learning_rate": 4.555659399910108e-05, |
|
"loss": 0.827, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6944175958633423, |
|
"learning_rate": 4.554469601869209e-05, |
|
"loss": 0.7805, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6939406394958496, |
|
"learning_rate": 4.55327836881877e-05, |
|
"loss": 0.7996, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.592650830745697, |
|
"learning_rate": 4.552085701590844e-05, |
|
"loss": 0.6599, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.5287877321243286, |
|
"learning_rate": 4.5508916010184884e-05, |
|
"loss": 0.6856, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6414081454277039, |
|
"learning_rate": 4.549696067935762e-05, |
|
"loss": 0.7622, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.2272289991378784, |
|
"learning_rate": 4.548499103177719e-05, |
|
"loss": 1.0834, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.5912505388259888, |
|
"learning_rate": 4.547300707580422e-05, |
|
"loss": 0.8738, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6686813235282898, |
|
"learning_rate": 4.5461008819809246e-05, |
|
"loss": 0.6221, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.891153872013092, |
|
"learning_rate": 4.544899627217286e-05, |
|
"loss": 0.9009, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.1651557683944702, |
|
"learning_rate": 4.543696944128559e-05, |
|
"loss": 0.8448, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.7525443434715271, |
|
"learning_rate": 4.5424928335547964e-05, |
|
"loss": 0.6654, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6798614859580994, |
|
"learning_rate": 4.541287296337048e-05, |
|
"loss": 0.9244, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.498735249042511, |
|
"learning_rate": 4.540080333317358e-05, |
|
"loss": 0.6815, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6097673773765564, |
|
"learning_rate": 4.5388719453387694e-05, |
|
"loss": 0.8536, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6685522198677063, |
|
"learning_rate": 4.537662133245319e-05, |
|
"loss": 0.8092, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 0.7402560114860535, |
|
"eval_runtime": 96.4998, |
|
"eval_samples_per_second": 7.223, |
|
"eval_steps_per_second": 7.223, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.162788987159729, |
|
"learning_rate": 4.5364508978820375e-05, |
|
"loss": 0.6143, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.8281823992729187, |
|
"learning_rate": 4.5352382400949524e-05, |
|
"loss": 0.8143, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6465135812759399, |
|
"learning_rate": 4.534024160731082e-05, |
|
"loss": 0.9152, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5903899669647217, |
|
"learning_rate": 4.532808660638438e-05, |
|
"loss": 0.7229, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6988681554794312, |
|
"learning_rate": 4.5315917406660265e-05, |
|
"loss": 0.6863, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.7910459637641907, |
|
"learning_rate": 4.530373401663843e-05, |
|
"loss": 0.8762, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.7580087184906006, |
|
"learning_rate": 4.529153644482875e-05, |
|
"loss": 0.9896, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6871665716171265, |
|
"learning_rate": 4.5279324699751005e-05, |
|
"loss": 0.8831, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.0093677043914795, |
|
"learning_rate": 4.526709878993488e-05, |
|
"loss": 0.742, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.9898921847343445, |
|
"learning_rate": 4.525485872391996e-05, |
|
"loss": 0.766, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.8706837296485901, |
|
"learning_rate": 4.524260451025569e-05, |
|
"loss": 0.7545, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.1715607643127441, |
|
"learning_rate": 4.523033615750142e-05, |
|
"loss": 0.84, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.017062783241272, |
|
"learning_rate": 4.521805367422638e-05, |
|
"loss": 0.7477, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6071624159812927, |
|
"learning_rate": 4.520575706900965e-05, |
|
"loss": 0.793, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.5821404457092285, |
|
"learning_rate": 4.519344635044018e-05, |
|
"loss": 0.7514, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6849238872528076, |
|
"learning_rate": 4.51811215271168e-05, |
|
"loss": 0.862, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.8808868527412415, |
|
"learning_rate": 4.5168782607648166e-05, |
|
"loss": 0.7189, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.7080340385437012, |
|
"learning_rate": 4.5156429600652774e-05, |
|
"loss": 0.6987, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.705869734287262, |
|
"learning_rate": 4.5144062514759e-05, |
|
"loss": 0.6482, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6345694065093994, |
|
"learning_rate": 4.5131681358605007e-05, |
|
"loss": 0.7279, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_loss": 0.7394095063209534, |
|
"eval_runtime": 96.4977, |
|
"eval_samples_per_second": 7.223, |
|
"eval_steps_per_second": 7.223, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.748913586139679, |
|
"learning_rate": 4.511928614083881e-05, |
|
"loss": 0.7474, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6260043382644653, |
|
"learning_rate": 4.5106876870118255e-05, |
|
"loss": 0.7469, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.596367597579956, |
|
"learning_rate": 4.509445355511098e-05, |
|
"loss": 0.8437, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.3925014734268188, |
|
"learning_rate": 4.5082016204494445e-05, |
|
"loss": 1.0928, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.1370338201522827, |
|
"learning_rate": 4.506956482695592e-05, |
|
"loss": 0.8908, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6746950149536133, |
|
"learning_rate": 4.505709943119246e-05, |
|
"loss": 0.7121, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6608826518058777, |
|
"learning_rate": 4.504462002591091e-05, |
|
"loss": 0.9397, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6542508006095886, |
|
"learning_rate": 4.5032126619827916e-05, |
|
"loss": 0.6942, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.5825070738792419, |
|
"learning_rate": 4.5019619221669895e-05, |
|
"loss": 0.7083, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.8596588373184204, |
|
"learning_rate": 4.500709784017303e-05, |
|
"loss": 0.839, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.641009509563446, |
|
"learning_rate": 4.499456248408328e-05, |
|
"loss": 0.72, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.213782548904419, |
|
"learning_rate": 4.498201316215635e-05, |
|
"loss": 0.7116, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.1411411762237549, |
|
"learning_rate": 4.496944988315775e-05, |
|
"loss": 1.0208, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.8265553712844849, |
|
"learning_rate": 4.495687265586266e-05, |
|
"loss": 0.7664, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.9309681057929993, |
|
"learning_rate": 4.4944281489056065e-05, |
|
"loss": 0.9126, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.49171608686447144, |
|
"learning_rate": 4.493167639153266e-05, |
|
"loss": 0.6271, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.743669867515564, |
|
"learning_rate": 4.491905737209688e-05, |
|
"loss": 0.7965, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6191633939743042, |
|
"learning_rate": 4.490642443956287e-05, |
|
"loss": 0.5884, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.5481441020965576, |
|
"learning_rate": 4.489377760275452e-05, |
|
"loss": 0.6281, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7155417203903198, |
|
"learning_rate": 4.488111687050539e-05, |
|
"loss": 0.7774, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_loss": 0.738506019115448, |
|
"eval_runtime": 96.7667, |
|
"eval_samples_per_second": 7.203, |
|
"eval_steps_per_second": 7.203, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.032523274421692, |
|
"learning_rate": 4.4868442251658795e-05, |
|
"loss": 0.7621, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.584082841873169, |
|
"learning_rate": 4.4855753755067703e-05, |
|
"loss": 0.6617, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7214722037315369, |
|
"learning_rate": 4.4843051389594814e-05, |
|
"loss": 0.8669, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6019904613494873, |
|
"learning_rate": 4.4830335164112504e-05, |
|
"loss": 0.736, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8038384318351746, |
|
"learning_rate": 4.48176050875028e-05, |
|
"loss": 0.637, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.9631878733634949, |
|
"learning_rate": 4.4804861168657455e-05, |
|
"loss": 0.9722, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.5342935919761658, |
|
"learning_rate": 4.4792103416477836e-05, |
|
"loss": 0.8081, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.5893488526344299, |
|
"learning_rate": 4.477933183987503e-05, |
|
"loss": 0.61, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.388850212097168, |
|
"learning_rate": 4.476654644776973e-05, |
|
"loss": 0.8454, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6928623914718628, |
|
"learning_rate": 4.4753747249092305e-05, |
|
"loss": 0.7209, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.2383430004119873, |
|
"learning_rate": 4.4740934252782757e-05, |
|
"loss": 0.8205, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6005001664161682, |
|
"learning_rate": 4.472810746779074e-05, |
|
"loss": 0.6083, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7928474545478821, |
|
"learning_rate": 4.471526690307552e-05, |
|
"loss": 0.9735, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8710891008377075, |
|
"learning_rate": 4.4702412567606014e-05, |
|
"loss": 0.7573, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6327987313270569, |
|
"learning_rate": 4.468954447036071e-05, |
|
"loss": 0.8563, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7048762440681458, |
|
"learning_rate": 4.467666262032777e-05, |
|
"loss": 0.9176, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6058861017227173, |
|
"learning_rate": 4.466376702650492e-05, |
|
"loss": 0.5525, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.637993574142456, |
|
"learning_rate": 4.465085769789949e-05, |
|
"loss": 0.7256, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6992897987365723, |
|
"learning_rate": 4.463793464352842e-05, |
|
"loss": 0.8824, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7812734246253967, |
|
"learning_rate": 4.462499787241822e-05, |
|
"loss": 0.8942, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 0.7363680601119995, |
|
"eval_runtime": 96.9231, |
|
"eval_samples_per_second": 7.191, |
|
"eval_steps_per_second": 7.191, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.907598078250885, |
|
"learning_rate": 4.4612047393605e-05, |
|
"loss": 0.867, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.9081722497940063, |
|
"learning_rate": 4.459908321613442e-05, |
|
"loss": 0.8757, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5538048148155212, |
|
"learning_rate": 4.4586105349061726e-05, |
|
"loss": 0.6709, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6632833480834961, |
|
"learning_rate": 4.457311380145173e-05, |
|
"loss": 0.8362, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.8646539449691772, |
|
"learning_rate": 4.4560108582378766e-05, |
|
"loss": 0.8527, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6309005618095398, |
|
"learning_rate": 4.454708970092678e-05, |
|
"loss": 0.595, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5711541175842285, |
|
"learning_rate": 4.45340571661892e-05, |
|
"loss": 0.8069, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.1379880905151367, |
|
"learning_rate": 4.4521010987269006e-05, |
|
"loss": 0.8464, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6005469560623169, |
|
"learning_rate": 4.450795117327874e-05, |
|
"loss": 0.5801, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.7842866778373718, |
|
"learning_rate": 4.449487773334042e-05, |
|
"loss": 0.6238, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.7519890069961548, |
|
"learning_rate": 4.448179067658563e-05, |
|
"loss": 1.1255, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5955212712287903, |
|
"learning_rate": 4.446869001215542e-05, |
|
"loss": 0.7738, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5085921287536621, |
|
"learning_rate": 4.4455575749200364e-05, |
|
"loss": 0.6239, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.779778003692627, |
|
"learning_rate": 4.444244789688056e-05, |
|
"loss": 0.9719, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.7279208898544312, |
|
"learning_rate": 4.442930646436554e-05, |
|
"loss": 0.9854, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.9218065738677979, |
|
"learning_rate": 4.4416151460834376e-05, |
|
"loss": 0.8096, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.7595914006233215, |
|
"learning_rate": 4.44029828954756e-05, |
|
"loss": 0.7955, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.785493016242981, |
|
"learning_rate": 4.43898007774872e-05, |
|
"loss": 0.8598, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.5540453195571899, |
|
"learning_rate": 4.437660511607666e-05, |
|
"loss": 0.8485, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.7215760350227356, |
|
"learning_rate": 4.43633959204609e-05, |
|
"loss": 0.9286, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 0.7347923517227173, |
|
"eval_runtime": 96.8658, |
|
"eval_samples_per_second": 7.196, |
|
"eval_steps_per_second": 7.196, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7934743762016296, |
|
"learning_rate": 4.435017319986631e-05, |
|
"loss": 0.7829, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.503614068031311, |
|
"learning_rate": 4.43369369635287e-05, |
|
"loss": 0.7203, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6292420625686646, |
|
"learning_rate": 4.4323687220693365e-05, |
|
"loss": 0.7556, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6981114149093628, |
|
"learning_rate": 4.431042398061499e-05, |
|
"loss": 0.6953, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.8554514050483704, |
|
"learning_rate": 4.4297147252557715e-05, |
|
"loss": 0.7731, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.1464003324508667, |
|
"learning_rate": 4.428385704579509e-05, |
|
"loss": 0.7761, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6772524118423462, |
|
"learning_rate": 4.427055336961008e-05, |
|
"loss": 0.7529, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.5949820280075073, |
|
"learning_rate": 4.425723623329507e-05, |
|
"loss": 0.9164, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.848900318145752, |
|
"learning_rate": 4.4243905646151825e-05, |
|
"loss": 0.8385, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7119936943054199, |
|
"learning_rate": 4.4230561617491514e-05, |
|
"loss": 0.6342, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.4240078628063202, |
|
"learning_rate": 4.421720415663472e-05, |
|
"loss": 0.9921, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.166399359703064, |
|
"learning_rate": 4.4203833272911355e-05, |
|
"loss": 0.6751, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7882303595542908, |
|
"learning_rate": 4.4190448975660756e-05, |
|
"loss": 0.8711, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.7739405632019043, |
|
"learning_rate": 4.417705127423162e-05, |
|
"loss": 0.7635, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.6729245781898499, |
|
"learning_rate": 4.416364017798197e-05, |
|
"loss": 1.0083, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.7291648983955383, |
|
"learning_rate": 4.4150215696279233e-05, |
|
"loss": 0.9355, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.569436252117157, |
|
"learning_rate": 4.413677783850015e-05, |
|
"loss": 0.5718, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.7857233285903931, |
|
"learning_rate": 4.412332661403085e-05, |
|
"loss": 0.6356, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.124894618988037, |
|
"learning_rate": 4.410986203226672e-05, |
|
"loss": 0.9911, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.7088748216629028, |
|
"learning_rate": 4.409638410261256e-05, |
|
"loss": 0.7703, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"eval_loss": 0.7353793978691101, |
|
"eval_runtime": 96.9146, |
|
"eval_samples_per_second": 7.192, |
|
"eval_steps_per_second": 7.192, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8883334398269653, |
|
"learning_rate": 4.4082892834482456e-05, |
|
"loss": 0.7829, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5809643864631653, |
|
"learning_rate": 4.406938823729979e-05, |
|
"loss": 0.79, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 2.2371888160705566, |
|
"learning_rate": 4.405587032049731e-05, |
|
"loss": 0.9394, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.6468964219093323, |
|
"learning_rate": 4.4042339093517e-05, |
|
"loss": 0.7621, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8613569736480713, |
|
"learning_rate": 4.4028794565810194e-05, |
|
"loss": 0.9303, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8210548162460327, |
|
"learning_rate": 4.4015236746837505e-05, |
|
"loss": 1.04, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8066801428794861, |
|
"learning_rate": 4.4001665646068804e-05, |
|
"loss": 0.9942, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6841477751731873, |
|
"learning_rate": 4.3988081272983263e-05, |
|
"loss": 0.6893, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7812705636024475, |
|
"learning_rate": 4.3974483637069333e-05, |
|
"loss": 0.9125, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7913382649421692, |
|
"learning_rate": 4.3960872747824686e-05, |
|
"loss": 0.9298, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6053805947303772, |
|
"learning_rate": 4.394724861475631e-05, |
|
"loss": 0.7055, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6879487633705139, |
|
"learning_rate": 4.393361124738039e-05, |
|
"loss": 0.605, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7929925918579102, |
|
"learning_rate": 4.3919960655222394e-05, |
|
"loss": 0.8569, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5888631939888, |
|
"learning_rate": 4.390629684781701e-05, |
|
"loss": 0.6246, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.9546008706092834, |
|
"learning_rate": 4.389261983470815e-05, |
|
"loss": 0.7964, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.620267391204834, |
|
"learning_rate": 4.387892962544896e-05, |
|
"loss": 0.7127, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7655039429664612, |
|
"learning_rate": 4.3865226229601805e-05, |
|
"loss": 0.6936, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.5404471158981323, |
|
"learning_rate": 4.3851509656738264e-05, |
|
"loss": 0.6141, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.9140282273292542, |
|
"learning_rate": 4.38377799164391e-05, |
|
"loss": 1.152, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.4845621585845947, |
|
"learning_rate": 4.382403701829429e-05, |
|
"loss": 0.8322, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 0.733027458190918, |
|
"eval_runtime": 96.886, |
|
"eval_samples_per_second": 7.194, |
|
"eval_steps_per_second": 7.194, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.598147988319397, |
|
"learning_rate": 4.381028097190299e-05, |
|
"loss": 0.772, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.5572992563247681, |
|
"learning_rate": 4.3796511786873574e-05, |
|
"loss": 0.7232, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7913936376571655, |
|
"learning_rate": 4.378272947282354e-05, |
|
"loss": 0.6972, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.4532865583896637, |
|
"learning_rate": 4.376893403937959e-05, |
|
"loss": 0.7454, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.8871356844902039, |
|
"learning_rate": 4.375512549617759e-05, |
|
"loss": 0.6946, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7564520835876465, |
|
"learning_rate": 4.374130385286255e-05, |
|
"loss": 0.9257, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7280387282371521, |
|
"learning_rate": 4.3727469119088624e-05, |
|
"loss": 0.756, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6494055986404419, |
|
"learning_rate": 4.3713621304519144e-05, |
|
"loss": 0.6358, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6048948764801025, |
|
"learning_rate": 4.369976041882654e-05, |
|
"loss": 0.6705, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6458585858345032, |
|
"learning_rate": 4.36858864716924e-05, |
|
"loss": 0.7999, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.837872326374054, |
|
"learning_rate": 4.36719994728074e-05, |
|
"loss": 0.7671, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6451572775840759, |
|
"learning_rate": 4.365809943187138e-05, |
|
"loss": 0.8672, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6438645124435425, |
|
"learning_rate": 4.364418635859326e-05, |
|
"loss": 0.78, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7427099347114563, |
|
"learning_rate": 4.363026026269106e-05, |
|
"loss": 0.8977, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7844499945640564, |
|
"learning_rate": 4.36163211538919e-05, |
|
"loss": 0.7586, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.8544999361038208, |
|
"learning_rate": 4.360236904193201e-05, |
|
"loss": 0.7085, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.431629180908203, |
|
"learning_rate": 4.358840393655668e-05, |
|
"loss": 0.8572, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6864097118377686, |
|
"learning_rate": 4.357442584752027e-05, |
|
"loss": 0.6848, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7158388495445251, |
|
"learning_rate": 4.356043478458623e-05, |
|
"loss": 1.0071, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7883514165878296, |
|
"learning_rate": 4.3546430757527066e-05, |
|
"loss": 0.9851, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 0.732368528842926, |
|
"eval_runtime": 96.9109, |
|
"eval_samples_per_second": 7.192, |
|
"eval_steps_per_second": 7.192, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.673925518989563, |
|
"learning_rate": 4.353241377612433e-05, |
|
"loss": 0.7076, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.9540270566940308, |
|
"learning_rate": 4.351838385016862e-05, |
|
"loss": 0.8989, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.8137551546096802, |
|
"learning_rate": 4.35043409894596e-05, |
|
"loss": 0.7633, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.779330313205719, |
|
"learning_rate": 4.349028520380594e-05, |
|
"loss": 0.7013, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.7883580327033997, |
|
"learning_rate": 4.347621650302535e-05, |
|
"loss": 0.9788, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.7106336951255798, |
|
"learning_rate": 4.3462134896944565e-05, |
|
"loss": 0.8399, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6303668022155762, |
|
"learning_rate": 4.344804039539933e-05, |
|
"loss": 0.5943, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.2975471019744873, |
|
"learning_rate": 4.3433933008234395e-05, |
|
"loss": 0.8917, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.68232661485672, |
|
"learning_rate": 4.341981274530351e-05, |
|
"loss": 0.7756, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6689594984054565, |
|
"learning_rate": 4.340567961646943e-05, |
|
"loss": 0.772, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.102365493774414, |
|
"learning_rate": 4.339153363160388e-05, |
|
"loss": 0.738, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6535090804100037, |
|
"learning_rate": 4.337737480058758e-05, |
|
"loss": 0.9096, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.676058292388916, |
|
"learning_rate": 4.3363203133310206e-05, |
|
"loss": 0.9634, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.9258711934089661, |
|
"learning_rate": 4.3349018639670415e-05, |
|
"loss": 0.8025, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.5786353349685669, |
|
"learning_rate": 4.333482132957581e-05, |
|
"loss": 0.7638, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7258582711219788, |
|
"learning_rate": 4.332061121294296e-05, |
|
"loss": 1.3538, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.9384926557540894, |
|
"learning_rate": 4.330638829969738e-05, |
|
"loss": 0.8485, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5252525806427002, |
|
"learning_rate": 4.3292152599773494e-05, |
|
"loss": 0.8547, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7551200985908508, |
|
"learning_rate": 4.32779041231147e-05, |
|
"loss": 0.7435, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7492663264274597, |
|
"learning_rate": 4.3263642879673286e-05, |
|
"loss": 0.8712, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 0.7316818237304688, |
|
"eval_runtime": 96.9418, |
|
"eval_samples_per_second": 7.19, |
|
"eval_steps_per_second": 7.19, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7490917444229126, |
|
"learning_rate": 4.3249368879410475e-05, |
|
"loss": 0.7598, |
|
"step": 4505 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7305790781974792, |
|
"learning_rate": 4.323508213229639e-05, |
|
"loss": 0.8315, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7009093165397644, |
|
"learning_rate": 4.3220782648310075e-05, |
|
"loss": 0.7482, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7155885100364685, |
|
"learning_rate": 4.320647043743945e-05, |
|
"loss": 0.8385, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6159176826477051, |
|
"learning_rate": 4.319214550968133e-05, |
|
"loss": 0.6507, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7776069045066833, |
|
"learning_rate": 4.3177807875041424e-05, |
|
"loss": 0.855, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6204195618629456, |
|
"learning_rate": 4.316345754353432e-05, |
|
"loss": 0.7169, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7233458757400513, |
|
"learning_rate": 4.3149094525183426e-05, |
|
"loss": 0.5399, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7921779155731201, |
|
"learning_rate": 4.313471883002108e-05, |
|
"loss": 0.9124, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.9145547747612, |
|
"learning_rate": 4.3120330468088435e-05, |
|
"loss": 1.2346, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.8733106255531311, |
|
"learning_rate": 4.310592944943549e-05, |
|
"loss": 0.6737, |
|
"step": 4555 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.6620619297027588, |
|
"learning_rate": 4.3091515784121107e-05, |
|
"loss": 0.8041, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7026892900466919, |
|
"learning_rate": 4.307708948221296e-05, |
|
"loss": 0.9422, |
|
"step": 4565 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7953292727470398, |
|
"learning_rate": 4.3062650553787566e-05, |
|
"loss": 0.7398, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.6465870141983032, |
|
"learning_rate": 4.304819900893024e-05, |
|
"loss": 0.8175, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.3427163362503052, |
|
"learning_rate": 4.303373485773513e-05, |
|
"loss": 0.7331, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.6665405035018921, |
|
"learning_rate": 4.3019258110305186e-05, |
|
"loss": 0.7529, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.796320915222168, |
|
"learning_rate": 4.300476877675215e-05, |
|
"loss": 0.915, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.551832377910614, |
|
"learning_rate": 4.299026686719655e-05, |
|
"loss": 0.7693, |
|
"step": 4595 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.75690096616745, |
|
"learning_rate": 4.297575239176771e-05, |
|
"loss": 0.7871, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_loss": 0.730965256690979, |
|
"eval_runtime": 96.8803, |
|
"eval_samples_per_second": 7.194, |
|
"eval_steps_per_second": 7.194, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7093445062637329, |
|
"learning_rate": 4.296122536060373e-05, |
|
"loss": 0.6279, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6522731781005859, |
|
"learning_rate": 4.294668578385147e-05, |
|
"loss": 0.5442, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7964634299278259, |
|
"learning_rate": 4.2932133671666565e-05, |
|
"loss": 1.0221, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7733820676803589, |
|
"learning_rate": 4.2917569034213395e-05, |
|
"loss": 0.7152, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.8039364218711853, |
|
"learning_rate": 4.2902991881665097e-05, |
|
"loss": 1.0939, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.756020724773407, |
|
"learning_rate": 4.2888402224203536e-05, |
|
"loss": 0.7539, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5059025287628174, |
|
"learning_rate": 4.2873800072019345e-05, |
|
"loss": 0.8716, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.8273636102676392, |
|
"learning_rate": 4.285918543531183e-05, |
|
"loss": 0.687, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6505921483039856, |
|
"learning_rate": 4.2844558324289076e-05, |
|
"loss": 1.0697, |
|
"step": 4645 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6481053829193115, |
|
"learning_rate": 4.282991874916784e-05, |
|
"loss": 0.884, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.8193663954734802, |
|
"learning_rate": 4.28152667201736e-05, |
|
"loss": 0.962, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7153398990631104, |
|
"learning_rate": 4.280060224754053e-05, |
|
"loss": 0.7705, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7197556495666504, |
|
"learning_rate": 4.278592534151149e-05, |
|
"loss": 0.8521, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.009098768234253, |
|
"learning_rate": 4.2771236012338044e-05, |
|
"loss": 0.8425, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.598564088344574, |
|
"learning_rate": 4.275653427028041e-05, |
|
"loss": 0.7072, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8035867810249329, |
|
"learning_rate": 4.2741820125607504e-05, |
|
"loss": 0.6689, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6109891533851624, |
|
"learning_rate": 4.2727093588596866e-05, |
|
"loss": 0.7776, |
|
"step": 4685 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.7504151463508606, |
|
"learning_rate": 4.271235466953473e-05, |
|
"loss": 0.7481, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.9492344260215759, |
|
"learning_rate": 4.269760337871594e-05, |
|
"loss": 0.7261, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.5792133212089539, |
|
"learning_rate": 4.2682839726444035e-05, |
|
"loss": 0.7156, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 0.7283556461334229, |
|
"eval_runtime": 96.8998, |
|
"eval_samples_per_second": 7.193, |
|
"eval_steps_per_second": 7.193, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8895491361618042, |
|
"learning_rate": 4.266806372303113e-05, |
|
"loss": 0.8466, |
|
"step": 4705 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.7995960712432861, |
|
"learning_rate": 4.2653275378798005e-05, |
|
"loss": 0.7823, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6673771739006042, |
|
"learning_rate": 4.263847470407405e-05, |
|
"loss": 0.5461, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6228974461555481, |
|
"learning_rate": 4.262366170919726e-05, |
|
"loss": 0.7611, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8050612807273865, |
|
"learning_rate": 4.2608836404514255e-05, |
|
"loss": 0.6524, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8815121650695801, |
|
"learning_rate": 4.2593998800380216e-05, |
|
"loss": 0.9997, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.0408731698989868, |
|
"learning_rate": 4.257914890715897e-05, |
|
"loss": 0.7031, |
|
"step": 4735 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6744192838668823, |
|
"learning_rate": 4.256428673522287e-05, |
|
"loss": 0.6587, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.014369249343872, |
|
"learning_rate": 4.254941229495289e-05, |
|
"loss": 0.7726, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7497864365577698, |
|
"learning_rate": 4.2534525596738526e-05, |
|
"loss": 0.7327, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6479122042655945, |
|
"learning_rate": 4.2519626650977905e-05, |
|
"loss": 0.7071, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6300268173217773, |
|
"learning_rate": 4.250471546807765e-05, |
|
"loss": 0.9479, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.8272077441215515, |
|
"learning_rate": 4.248979205845294e-05, |
|
"loss": 0.9013, |
|
"step": 4765 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7070410847663879, |
|
"learning_rate": 4.2474856432527524e-05, |
|
"loss": 0.713, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7199767231941223, |
|
"learning_rate": 4.2459908600733654e-05, |
|
"loss": 0.9308, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6886048316955566, |
|
"learning_rate": 4.244494857351212e-05, |
|
"loss": 0.8008, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6097077131271362, |
|
"learning_rate": 4.242997636131222e-05, |
|
"loss": 0.9639, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.0947343111038208, |
|
"learning_rate": 4.241499197459178e-05, |
|
"loss": 0.9012, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6965738534927368, |
|
"learning_rate": 4.239999542381712e-05, |
|
"loss": 0.6745, |
|
"step": 4795 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.8290371894836426, |
|
"learning_rate": 4.238498671946306e-05, |
|
"loss": 0.7856, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"eval_loss": 0.7277354598045349, |
|
"eval_runtime": 96.9165, |
|
"eval_samples_per_second": 7.192, |
|
"eval_steps_per_second": 7.192, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.8061904907226562, |
|
"learning_rate": 4.2369965872012904e-05, |
|
"loss": 0.7034, |
|
"step": 4805 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6652625799179077, |
|
"learning_rate": 4.2354932891958434e-05, |
|
"loss": 0.5825, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6529026627540588, |
|
"learning_rate": 4.2339887789799916e-05, |
|
"loss": 0.7407, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.7802493572235107, |
|
"learning_rate": 4.232483057604607e-05, |
|
"loss": 0.8906, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7709060907363892, |
|
"learning_rate": 4.230976126121411e-05, |
|
"loss": 0.863, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.2582249641418457, |
|
"learning_rate": 4.229467985582966e-05, |
|
"loss": 1.065, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.5523508191108704, |
|
"learning_rate": 4.22795863704268e-05, |
|
"loss": 0.5925, |
|
"step": 4835 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.3535953760147095, |
|
"learning_rate": 4.2264480815548076e-05, |
|
"loss": 0.7993, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.067133903503418, |
|
"learning_rate": 4.2249363201744425e-05, |
|
"loss": 0.7921, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6478603482246399, |
|
"learning_rate": 4.223423353957523e-05, |
|
"loss": 0.6769, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6439855694770813, |
|
"learning_rate": 4.2219091839608276e-05, |
|
"loss": 0.9018, |
|
"step": 4855 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5302556753158569, |
|
"learning_rate": 4.2203938112419786e-05, |
|
"loss": 0.837, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.8129810690879822, |
|
"learning_rate": 4.218877236859433e-05, |
|
"loss": 0.9195, |
|
"step": 4865 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.6531801819801331, |
|
"learning_rate": 4.217359461872493e-05, |
|
"loss": 0.6829, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.7695423364639282, |
|
"learning_rate": 4.215840487341296e-05, |
|
"loss": 0.7739, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.3163946866989136, |
|
"learning_rate": 4.2143203143268184e-05, |
|
"loss": 0.9678, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.1124577522277832, |
|
"learning_rate": 4.212798943890871e-05, |
|
"loss": 0.9327, |
|
"step": 4885 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.8979106545448303, |
|
"learning_rate": 4.2112763770961074e-05, |
|
"loss": 0.7043, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.9879763126373291, |
|
"learning_rate": 4.2097526150060085e-05, |
|
"loss": 0.8129, |
|
"step": 4895 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.7016007304191589, |
|
"learning_rate": 4.208227658684898e-05, |
|
"loss": 0.7906, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 0.7254941463470459, |
|
"eval_runtime": 96.9328, |
|
"eval_samples_per_second": 7.191, |
|
"eval_steps_per_second": 7.191, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5404706597328186, |
|
"learning_rate": 4.206701509197927e-05, |
|
"loss": 0.7769, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.7096789479255676, |
|
"learning_rate": 4.205174167611085e-05, |
|
"loss": 0.5985, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.8139373660087585, |
|
"learning_rate": 4.20364563499119e-05, |
|
"loss": 0.75, |
|
"step": 4915 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.2196255922317505, |
|
"learning_rate": 4.202115912405897e-05, |
|
"loss": 0.8441, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6912347674369812, |
|
"learning_rate": 4.200585000923689e-05, |
|
"loss": 0.8885, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7879334688186646, |
|
"learning_rate": 4.199052901613878e-05, |
|
"loss": 0.6353, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.0302627086639404, |
|
"learning_rate": 4.197519615546608e-05, |
|
"loss": 0.7704, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5351320505142212, |
|
"learning_rate": 4.195985143792851e-05, |
|
"loss": 0.8094, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7632457613945007, |
|
"learning_rate": 4.194449487424409e-05, |
|
"loss": 0.9625, |
|
"step": 4945 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7706131935119629, |
|
"learning_rate": 4.1929126475139096e-05, |
|
"loss": 0.7007, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6262048482894897, |
|
"learning_rate": 4.191374625134806e-05, |
|
"loss": 0.7768, |
|
"step": 4955 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.8302519917488098, |
|
"learning_rate": 4.189835421361381e-05, |
|
"loss": 0.8281, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.5914260149002075, |
|
"learning_rate": 4.188295037268738e-05, |
|
"loss": 0.8554, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7599936127662659, |
|
"learning_rate": 4.1867534739328085e-05, |
|
"loss": 0.9547, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.4832470417022705, |
|
"learning_rate": 4.1852107324303455e-05, |
|
"loss": 0.5212, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.8040557503700256, |
|
"learning_rate": 4.183666813838927e-05, |
|
"loss": 0.8939, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7503822445869446, |
|
"learning_rate": 4.182121719236952e-05, |
|
"loss": 0.9279, |
|
"step": 4985 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7608035206794739, |
|
"learning_rate": 4.180575449703639e-05, |
|
"loss": 0.7965, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5479308366775513, |
|
"learning_rate": 4.1790280063190315e-05, |
|
"loss": 0.7478, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7714606523513794, |
|
"learning_rate": 4.177479390163989e-05, |
|
"loss": 0.7917, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.7249829173088074, |
|
"eval_runtime": 96.9791, |
|
"eval_samples_per_second": 7.187, |
|
"eval_steps_per_second": 7.187, |
|
"step": 5000 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 18795, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 2.1577658793984e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |