|
{ |
|
"best_metric": 0.7402730584144592, |
|
"best_model_checkpoint": "runs/deepseek_lora_20240423-223943/checkpoint-10000", |
|
"epoch": 0.25, |
|
"eval_steps": 500, |
|
"global_step": 10000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.086414337158203, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 0.7892, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 8.478134155273438, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 0.7746, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 5.574502468109131, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.8222, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.6497371196746826, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.7423, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.116753339767456, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.7622, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.179832696914673, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.8183, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.9869463443756104, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 0.822, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 5.093494415283203, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.7966, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 5.230633735656738, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 0.8113, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 9.374403953552246, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.7582, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 6.465492248535156, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.7662, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 6.279934883117676, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.8376, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 5.799221992492676, |
|
"learning_rate": 5.2e-06, |
|
"loss": 0.7965, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.222240686416626, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 0.8855, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 9.009174346923828, |
|
"learning_rate": 6e-06, |
|
"loss": 0.8394, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 8.040350914001465, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.8426, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 4.131030559539795, |
|
"learning_rate": 6.800000000000001e-06, |
|
"loss": 0.7747, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 3.31986927986145, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 0.7125, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 5.7623395919799805, |
|
"learning_rate": 7.600000000000001e-06, |
|
"loss": 0.7854, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 10.848206520080566, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.7756, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 13.455166816711426, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.7894, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 12.759767532348633, |
|
"learning_rate": 8.8e-06, |
|
"loss": 0.7454, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.262899875640869, |
|
"learning_rate": 9.200000000000002e-06, |
|
"loss": 0.8555, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.28985071182251, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.6845, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.174241542816162, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7983, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 12.931599617004395, |
|
"learning_rate": 1.04e-05, |
|
"loss": 0.9041, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 7.004627227783203, |
|
"learning_rate": 1.0800000000000002e-05, |
|
"loss": 0.817, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.6102757453918457, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.7292, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.764902353286743, |
|
"learning_rate": 1.16e-05, |
|
"loss": 0.9042, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.958317995071411, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.7539, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 7.098923683166504, |
|
"learning_rate": 1.2400000000000002e-05, |
|
"loss": 0.7955, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 12.129098892211914, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 0.849, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.054119825363159, |
|
"learning_rate": 1.3200000000000002e-05, |
|
"loss": 0.8645, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.205028057098389, |
|
"learning_rate": 1.3600000000000002e-05, |
|
"loss": 0.8175, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.614790439605713, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.8998, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.9891204833984375, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 0.8108, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.152099609375, |
|
"learning_rate": 1.48e-05, |
|
"loss": 0.7855, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 9.833850860595703, |
|
"learning_rate": 1.5200000000000002e-05, |
|
"loss": 0.7736, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.849621295928955, |
|
"learning_rate": 1.5600000000000003e-05, |
|
"loss": 0.7668, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.4542975425720215, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.7781, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.197661876678467, |
|
"learning_rate": 1.64e-05, |
|
"loss": 0.8654, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.2606770992279053, |
|
"learning_rate": 1.6800000000000002e-05, |
|
"loss": 0.7565, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.9680209159851074, |
|
"learning_rate": 1.72e-05, |
|
"loss": 0.7886, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 18.749984741210938, |
|
"learning_rate": 1.76e-05, |
|
"loss": 0.7305, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.822000503540039, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.7833, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 12.999715805053711, |
|
"learning_rate": 1.8400000000000003e-05, |
|
"loss": 0.8483, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 7.193736553192139, |
|
"learning_rate": 1.88e-05, |
|
"loss": 0.84, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 12.573124885559082, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.8437, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 4.4221601486206055, |
|
"learning_rate": 1.9600000000000002e-05, |
|
"loss": 0.6836, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.0399410724639893, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8264, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 0.8175864219665527, |
|
"eval_runtime": 67.7802, |
|
"eval_samples_per_second": 14.754, |
|
"eval_steps_per_second": 14.754, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.971303701400757, |
|
"learning_rate": 1.9978947368421054e-05, |
|
"loss": 0.7385, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.8043839931488037, |
|
"learning_rate": 1.9957894736842107e-05, |
|
"loss": 0.7826, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 11.702253341674805, |
|
"learning_rate": 1.993684210526316e-05, |
|
"loss": 0.7971, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.176826000213623, |
|
"learning_rate": 1.9915789473684212e-05, |
|
"loss": 0.748, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 7.120133876800537, |
|
"learning_rate": 1.9894736842105265e-05, |
|
"loss": 0.8461, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 12.286151885986328, |
|
"learning_rate": 1.9873684210526318e-05, |
|
"loss": 0.8335, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 7.857172966003418, |
|
"learning_rate": 1.985263157894737e-05, |
|
"loss": 0.7231, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 5.327859401702881, |
|
"learning_rate": 1.9831578947368423e-05, |
|
"loss": 0.877, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.9340362548828125, |
|
"learning_rate": 1.9810526315789476e-05, |
|
"loss": 0.8984, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.1034326553344727, |
|
"learning_rate": 1.9789473684210528e-05, |
|
"loss": 0.7045, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.853721857070923, |
|
"learning_rate": 1.976842105263158e-05, |
|
"loss": 0.761, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.6926398277282715, |
|
"learning_rate": 1.9747368421052633e-05, |
|
"loss": 0.9493, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.261799335479736, |
|
"learning_rate": 1.9726315789473686e-05, |
|
"loss": 0.7719, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.864114284515381, |
|
"learning_rate": 1.970526315789474e-05, |
|
"loss": 0.9406, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 7.093533515930176, |
|
"learning_rate": 1.968421052631579e-05, |
|
"loss": 0.7951, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.3724496364593506, |
|
"learning_rate": 1.9663157894736844e-05, |
|
"loss": 0.8648, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 10.12341022491455, |
|
"learning_rate": 1.9642105263157897e-05, |
|
"loss": 0.7823, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.80940842628479, |
|
"learning_rate": 1.962105263157895e-05, |
|
"loss": 0.706, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 8.243487358093262, |
|
"learning_rate": 1.9600000000000002e-05, |
|
"loss": 0.8244, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 11.420123100280762, |
|
"learning_rate": 1.9578947368421055e-05, |
|
"loss": 0.6753, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 63.8618278503418, |
|
"learning_rate": 1.9557894736842107e-05, |
|
"loss": 0.8309, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.521258354187012, |
|
"learning_rate": 1.953684210526316e-05, |
|
"loss": 0.8101, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.9532318115234375, |
|
"learning_rate": 1.9515789473684213e-05, |
|
"loss": 0.8533, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.792180061340332, |
|
"learning_rate": 1.9494736842105265e-05, |
|
"loss": 0.7573, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.155513286590576, |
|
"learning_rate": 1.9473684210526318e-05, |
|
"loss": 0.8961, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 9.195950508117676, |
|
"learning_rate": 1.945263157894737e-05, |
|
"loss": 0.8398, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.699478626251221, |
|
"learning_rate": 1.9431578947368423e-05, |
|
"loss": 0.8018, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.254507541656494, |
|
"learning_rate": 1.9410526315789476e-05, |
|
"loss": 0.8408, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.351966857910156, |
|
"learning_rate": 1.9389473684210525e-05, |
|
"loss": 0.7323, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.361276626586914, |
|
"learning_rate": 1.936842105263158e-05, |
|
"loss": 0.8401, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.449990272521973, |
|
"learning_rate": 1.9347368421052634e-05, |
|
"loss": 0.726, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.375738143920898, |
|
"learning_rate": 1.9326315789473687e-05, |
|
"loss": 0.8305, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.601025342941284, |
|
"learning_rate": 1.930526315789474e-05, |
|
"loss": 0.9152, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 12.153268814086914, |
|
"learning_rate": 1.9284210526315792e-05, |
|
"loss": 0.8423, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.785663604736328, |
|
"learning_rate": 1.9263157894736845e-05, |
|
"loss": 0.7733, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 10.162787437438965, |
|
"learning_rate": 1.9242105263157894e-05, |
|
"loss": 0.893, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.871621608734131, |
|
"learning_rate": 1.922105263157895e-05, |
|
"loss": 0.798, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.9919800758361816, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.8484, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.40109920501709, |
|
"learning_rate": 1.9178947368421055e-05, |
|
"loss": 0.9129, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.794926643371582, |
|
"learning_rate": 1.9157894736842108e-05, |
|
"loss": 0.8687, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.942440986633301, |
|
"learning_rate": 1.913684210526316e-05, |
|
"loss": 0.8564, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.968307018280029, |
|
"learning_rate": 1.9115789473684213e-05, |
|
"loss": 0.8495, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 8.425616264343262, |
|
"learning_rate": 1.9094736842105262e-05, |
|
"loss": 0.7242, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.819301128387451, |
|
"learning_rate": 1.907368421052632e-05, |
|
"loss": 0.8381, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 6.81688117980957, |
|
"learning_rate": 1.9052631578947368e-05, |
|
"loss": 0.8817, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.102423191070557, |
|
"learning_rate": 1.9031578947368424e-05, |
|
"loss": 0.8274, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.12994909286499, |
|
"learning_rate": 1.9010526315789476e-05, |
|
"loss": 0.7052, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.15468692779541, |
|
"learning_rate": 1.898947368421053e-05, |
|
"loss": 0.772, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.62323796749115, |
|
"learning_rate": 1.8968421052631582e-05, |
|
"loss": 0.7764, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.546677589416504, |
|
"learning_rate": 1.894736842105263e-05, |
|
"loss": 0.8365, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 0.7952949404716492, |
|
"eval_runtime": 67.7544, |
|
"eval_samples_per_second": 14.759, |
|
"eval_steps_per_second": 14.759, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 9.28386402130127, |
|
"learning_rate": 1.8926315789473687e-05, |
|
"loss": 0.8765, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.3430304527282715, |
|
"learning_rate": 1.8905263157894736e-05, |
|
"loss": 0.8763, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.0531206130981445, |
|
"learning_rate": 1.8884210526315792e-05, |
|
"loss": 0.7943, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.028320074081421, |
|
"learning_rate": 1.886315789473684e-05, |
|
"loss": 0.836, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.3861188888549805, |
|
"learning_rate": 1.8842105263157898e-05, |
|
"loss": 0.7336, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.7832908630371094, |
|
"learning_rate": 1.882105263157895e-05, |
|
"loss": 0.9283, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.8170342445373535, |
|
"learning_rate": 1.88e-05, |
|
"loss": 0.7655, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 6.15322732925415, |
|
"learning_rate": 1.8778947368421056e-05, |
|
"loss": 0.9341, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 7.066686153411865, |
|
"learning_rate": 1.8757894736842105e-05, |
|
"loss": 0.85, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.986961603164673, |
|
"learning_rate": 1.873684210526316e-05, |
|
"loss": 0.8943, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.8456902503967285, |
|
"learning_rate": 1.871578947368421e-05, |
|
"loss": 0.8279, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.6177377700805664, |
|
"learning_rate": 1.8694736842105266e-05, |
|
"loss": 0.8192, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 14.768010139465332, |
|
"learning_rate": 1.8673684210526316e-05, |
|
"loss": 0.8005, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 11.347342491149902, |
|
"learning_rate": 1.8652631578947368e-05, |
|
"loss": 0.8081, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.0560150146484375, |
|
"learning_rate": 1.8631578947368424e-05, |
|
"loss": 0.9389, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.3164710998535156, |
|
"learning_rate": 1.8610526315789473e-05, |
|
"loss": 0.8501, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 11.112225532531738, |
|
"learning_rate": 1.858947368421053e-05, |
|
"loss": 0.7162, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 6.200588703155518, |
|
"learning_rate": 1.856842105263158e-05, |
|
"loss": 0.7448, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 6.573482513427734, |
|
"learning_rate": 1.8547368421052635e-05, |
|
"loss": 0.8071, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.153548717498779, |
|
"learning_rate": 1.8526315789473684e-05, |
|
"loss": 0.7957, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.3308305740356445, |
|
"learning_rate": 1.8505263157894737e-05, |
|
"loss": 0.7301, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.269808769226074, |
|
"learning_rate": 1.8484210526315793e-05, |
|
"loss": 0.8072, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.588324546813965, |
|
"learning_rate": 1.8463157894736842e-05, |
|
"loss": 0.8587, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.593557357788086, |
|
"learning_rate": 1.8442105263157898e-05, |
|
"loss": 0.856, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 5.2591094970703125, |
|
"learning_rate": 1.8421052631578947e-05, |
|
"loss": 0.7717, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.052567958831787, |
|
"learning_rate": 1.8400000000000003e-05, |
|
"loss": 0.7823, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.447838306427002, |
|
"learning_rate": 1.8378947368421053e-05, |
|
"loss": 0.83, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.029257774353027, |
|
"learning_rate": 1.8357894736842105e-05, |
|
"loss": 0.7504, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 9.053960800170898, |
|
"learning_rate": 1.8336842105263158e-05, |
|
"loss": 0.9074, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.2877705097198486, |
|
"learning_rate": 1.831578947368421e-05, |
|
"loss": 0.772, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.4482290744781494, |
|
"learning_rate": 1.8294736842105267e-05, |
|
"loss": 0.8658, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 6.684794902801514, |
|
"learning_rate": 1.8273684210526316e-05, |
|
"loss": 0.7848, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.553828716278076, |
|
"learning_rate": 1.8252631578947372e-05, |
|
"loss": 0.8219, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.5203397274017334, |
|
"learning_rate": 1.823157894736842e-05, |
|
"loss": 0.9071, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.961795806884766, |
|
"learning_rate": 1.8210526315789477e-05, |
|
"loss": 0.6542, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.663081645965576, |
|
"learning_rate": 1.8189473684210527e-05, |
|
"loss": 0.7402, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 8.785040855407715, |
|
"learning_rate": 1.816842105263158e-05, |
|
"loss": 0.7462, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.659074783325195, |
|
"learning_rate": 1.8147368421052632e-05, |
|
"loss": 0.6951, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.5885703563690186, |
|
"learning_rate": 1.8126315789473685e-05, |
|
"loss": 0.7008, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.1295347213745117, |
|
"learning_rate": 1.810526315789474e-05, |
|
"loss": 0.9103, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.4699888229370117, |
|
"learning_rate": 1.808421052631579e-05, |
|
"loss": 0.841, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.3273444175720215, |
|
"learning_rate": 1.8063157894736846e-05, |
|
"loss": 0.9041, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.149638652801514, |
|
"learning_rate": 1.8042105263157895e-05, |
|
"loss": 0.7784, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.4124910831451416, |
|
"learning_rate": 1.8021052631578948e-05, |
|
"loss": 0.8208, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.9231085777282715, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.7173, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.008113384246826, |
|
"learning_rate": 1.7978947368421053e-05, |
|
"loss": 0.7383, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.1748046875, |
|
"learning_rate": 1.795789473684211e-05, |
|
"loss": 0.8399, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.4990293979644775, |
|
"learning_rate": 1.793684210526316e-05, |
|
"loss": 0.6721, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.1186299324035645, |
|
"learning_rate": 1.7915789473684214e-05, |
|
"loss": 0.782, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.12732458114624, |
|
"learning_rate": 1.7894736842105264e-05, |
|
"loss": 0.7211, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 0.811568021774292, |
|
"eval_runtime": 67.7961, |
|
"eval_samples_per_second": 14.75, |
|
"eval_steps_per_second": 14.75, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.631096124649048, |
|
"learning_rate": 1.7873684210526316e-05, |
|
"loss": 0.7557, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 8.850045204162598, |
|
"learning_rate": 1.785263157894737e-05, |
|
"loss": 0.8757, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.1114978790283203, |
|
"learning_rate": 1.7831578947368422e-05, |
|
"loss": 0.7613, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.5038743019104, |
|
"learning_rate": 1.7810526315789474e-05, |
|
"loss": 0.8049, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.2331156730651855, |
|
"learning_rate": 1.7789473684210527e-05, |
|
"loss": 0.8277, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.05696964263916, |
|
"learning_rate": 1.7768421052631583e-05, |
|
"loss": 0.7973, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.1331920623779297, |
|
"learning_rate": 1.7747368421052632e-05, |
|
"loss": 0.7688, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.984541416168213, |
|
"learning_rate": 1.7726315789473685e-05, |
|
"loss": 0.7865, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.149406433105469, |
|
"learning_rate": 1.7705263157894738e-05, |
|
"loss": 0.7728, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 8.092243194580078, |
|
"learning_rate": 1.768421052631579e-05, |
|
"loss": 0.935, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 13.16551399230957, |
|
"learning_rate": 1.7663157894736843e-05, |
|
"loss": 0.8286, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.131350517272949, |
|
"learning_rate": 1.7642105263157896e-05, |
|
"loss": 0.7864, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 7.870023727416992, |
|
"learning_rate": 1.7621052631578948e-05, |
|
"loss": 0.8645, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 10.631692886352539, |
|
"learning_rate": 1.76e-05, |
|
"loss": 0.8473, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.421032905578613, |
|
"learning_rate": 1.7578947368421054e-05, |
|
"loss": 0.7868, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.57529878616333, |
|
"learning_rate": 1.7557894736842106e-05, |
|
"loss": 0.7882, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.8785624504089355, |
|
"learning_rate": 1.753684210526316e-05, |
|
"loss": 0.7543, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.722006320953369, |
|
"learning_rate": 1.751578947368421e-05, |
|
"loss": 0.9626, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.466771364212036, |
|
"learning_rate": 1.7494736842105264e-05, |
|
"loss": 0.783, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.072049856185913, |
|
"learning_rate": 1.7473684210526317e-05, |
|
"loss": 0.7503, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 5.768575668334961, |
|
"learning_rate": 1.745263157894737e-05, |
|
"loss": 0.8193, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.585022211074829, |
|
"learning_rate": 1.7431578947368422e-05, |
|
"loss": 0.8808, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.0711567401885986, |
|
"learning_rate": 1.7410526315789475e-05, |
|
"loss": 0.8098, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.3020272254943848, |
|
"learning_rate": 1.7389473684210527e-05, |
|
"loss": 0.7196, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.645238161087036, |
|
"learning_rate": 1.736842105263158e-05, |
|
"loss": 0.8904, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 6.018638610839844, |
|
"learning_rate": 1.7347368421052633e-05, |
|
"loss": 0.7937, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.629096746444702, |
|
"learning_rate": 1.7326315789473685e-05, |
|
"loss": 0.9171, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.5619189739227295, |
|
"learning_rate": 1.7305263157894738e-05, |
|
"loss": 0.9488, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 9.464752197265625, |
|
"learning_rate": 1.728421052631579e-05, |
|
"loss": 0.8459, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.9856364727020264, |
|
"learning_rate": 1.7263157894736843e-05, |
|
"loss": 0.8378, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.753553867340088, |
|
"learning_rate": 1.7242105263157896e-05, |
|
"loss": 0.8093, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.4593358039855957, |
|
"learning_rate": 1.722105263157895e-05, |
|
"loss": 0.7896, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.7163546085357666, |
|
"learning_rate": 1.72e-05, |
|
"loss": 0.7188, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.105628728866577, |
|
"learning_rate": 1.7178947368421054e-05, |
|
"loss": 0.7643, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.387368679046631, |
|
"learning_rate": 1.7157894736842107e-05, |
|
"loss": 0.8465, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.020385265350342, |
|
"learning_rate": 1.713684210526316e-05, |
|
"loss": 0.7798, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.560520172119141, |
|
"learning_rate": 1.7115789473684212e-05, |
|
"loss": 0.7704, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 15.739727973937988, |
|
"learning_rate": 1.7094736842105265e-05, |
|
"loss": 0.7148, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.79690408706665, |
|
"learning_rate": 1.7073684210526317e-05, |
|
"loss": 0.798, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.6939146518707275, |
|
"learning_rate": 1.705263157894737e-05, |
|
"loss": 0.7641, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.193384170532227, |
|
"learning_rate": 1.7031578947368423e-05, |
|
"loss": 0.7866, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.940731525421143, |
|
"learning_rate": 1.7010526315789475e-05, |
|
"loss": 0.8261, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.1812446117401123, |
|
"learning_rate": 1.6989473684210528e-05, |
|
"loss": 0.7973, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.7413289546966553, |
|
"learning_rate": 1.696842105263158e-05, |
|
"loss": 0.7818, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.024014472961426, |
|
"learning_rate": 1.6947368421052633e-05, |
|
"loss": 0.7237, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.0871291160583496, |
|
"learning_rate": 1.6926315789473686e-05, |
|
"loss": 0.772, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.28814435005188, |
|
"learning_rate": 1.690526315789474e-05, |
|
"loss": 0.7067, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.8241286277770996, |
|
"learning_rate": 1.688421052631579e-05, |
|
"loss": 0.8175, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.5942068099975586, |
|
"learning_rate": 1.6863157894736844e-05, |
|
"loss": 0.9265, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.6822662353515625, |
|
"learning_rate": 1.6842105263157896e-05, |
|
"loss": 0.8593, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 0.8064771890640259, |
|
"eval_runtime": 67.7887, |
|
"eval_samples_per_second": 14.752, |
|
"eval_steps_per_second": 14.752, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 7.032164573669434, |
|
"learning_rate": 1.682105263157895e-05, |
|
"loss": 0.8819, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.874982833862305, |
|
"learning_rate": 1.6800000000000002e-05, |
|
"loss": 0.8021, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.6172547340393066, |
|
"learning_rate": 1.6778947368421054e-05, |
|
"loss": 0.8017, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 10.659741401672363, |
|
"learning_rate": 1.6757894736842107e-05, |
|
"loss": 0.8896, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 6.189141750335693, |
|
"learning_rate": 1.673684210526316e-05, |
|
"loss": 0.7997, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.523468971252441, |
|
"learning_rate": 1.6715789473684212e-05, |
|
"loss": 0.8498, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 8.533658981323242, |
|
"learning_rate": 1.6694736842105265e-05, |
|
"loss": 0.8857, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.0041606426239014, |
|
"learning_rate": 1.6673684210526318e-05, |
|
"loss": 0.8112, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.055651664733887, |
|
"learning_rate": 1.665263157894737e-05, |
|
"loss": 0.7872, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 5.761922836303711, |
|
"learning_rate": 1.6631578947368423e-05, |
|
"loss": 0.7727, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.518223524093628, |
|
"learning_rate": 1.6610526315789476e-05, |
|
"loss": 0.7997, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.975761890411377, |
|
"learning_rate": 1.658947368421053e-05, |
|
"loss": 0.7457, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.2227561473846436, |
|
"learning_rate": 1.656842105263158e-05, |
|
"loss": 0.816, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.705923080444336, |
|
"learning_rate": 1.6547368421052634e-05, |
|
"loss": 0.8113, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.655057430267334, |
|
"learning_rate": 1.6526315789473686e-05, |
|
"loss": 0.7912, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.0186755657196045, |
|
"learning_rate": 1.650526315789474e-05, |
|
"loss": 0.8608, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.232386827468872, |
|
"learning_rate": 1.648421052631579e-05, |
|
"loss": 0.8549, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 11.968620300292969, |
|
"learning_rate": 1.6463157894736844e-05, |
|
"loss": 0.868, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.5853216648101807, |
|
"learning_rate": 1.6442105263157897e-05, |
|
"loss": 0.8388, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.375610589981079, |
|
"learning_rate": 1.642105263157895e-05, |
|
"loss": 0.9111, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.9734487533569336, |
|
"learning_rate": 1.64e-05, |
|
"loss": 0.7288, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 10.517192840576172, |
|
"learning_rate": 1.6378947368421055e-05, |
|
"loss": 0.698, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.183718204498291, |
|
"learning_rate": 1.6357894736842108e-05, |
|
"loss": 0.7759, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.9075675010681152, |
|
"learning_rate": 1.633684210526316e-05, |
|
"loss": 0.7829, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.287744998931885, |
|
"learning_rate": 1.6315789473684213e-05, |
|
"loss": 0.7057, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.977657318115234, |
|
"learning_rate": 1.6294736842105265e-05, |
|
"loss": 0.8346, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.196689128875732, |
|
"learning_rate": 1.6273684210526318e-05, |
|
"loss": 0.8508, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.467477798461914, |
|
"learning_rate": 1.6252631578947367e-05, |
|
"loss": 0.7179, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.059762954711914, |
|
"learning_rate": 1.6231578947368423e-05, |
|
"loss": 0.7549, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.980865955352783, |
|
"learning_rate": 1.6210526315789473e-05, |
|
"loss": 0.814, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.675939559936523, |
|
"learning_rate": 1.618947368421053e-05, |
|
"loss": 0.8227, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.530073642730713, |
|
"learning_rate": 1.616842105263158e-05, |
|
"loss": 0.8517, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.6851344108581543, |
|
"learning_rate": 1.6147368421052634e-05, |
|
"loss": 0.7684, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.206923961639404, |
|
"learning_rate": 1.6126315789473687e-05, |
|
"loss": 0.8199, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.220828056335449, |
|
"learning_rate": 1.6105263157894736e-05, |
|
"loss": 0.8871, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.5062482357025146, |
|
"learning_rate": 1.6084210526315792e-05, |
|
"loss": 0.8281, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.9830796718597412, |
|
"learning_rate": 1.606315789473684e-05, |
|
"loss": 0.8678, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.3255491256713867, |
|
"learning_rate": 1.6042105263157897e-05, |
|
"loss": 0.8337, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.259572505950928, |
|
"learning_rate": 1.6021052631578947e-05, |
|
"loss": 0.7954, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.6201376914978027, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.818, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.3598544597625732, |
|
"learning_rate": 1.5978947368421055e-05, |
|
"loss": 0.7697, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.34808349609375, |
|
"learning_rate": 1.5957894736842105e-05, |
|
"loss": 0.6347, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.967682361602783, |
|
"learning_rate": 1.593684210526316e-05, |
|
"loss": 0.7178, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 10.222978591918945, |
|
"learning_rate": 1.591578947368421e-05, |
|
"loss": 0.7642, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.9339826107025146, |
|
"learning_rate": 1.5894736842105266e-05, |
|
"loss": 0.8197, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.3337771892547607, |
|
"learning_rate": 1.5873684210526315e-05, |
|
"loss": 0.9375, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.8479838371276855, |
|
"learning_rate": 1.585263157894737e-05, |
|
"loss": 0.9196, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 9.294541358947754, |
|
"learning_rate": 1.5831578947368424e-05, |
|
"loss": 0.7144, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.325323104858398, |
|
"learning_rate": 1.5810526315789473e-05, |
|
"loss": 0.7897, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.377369403839111, |
|
"learning_rate": 1.578947368421053e-05, |
|
"loss": 0.9008, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 0.8163847923278809, |
|
"eval_runtime": 67.7994, |
|
"eval_samples_per_second": 14.749, |
|
"eval_steps_per_second": 14.749, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.1105055809021, |
|
"learning_rate": 1.576842105263158e-05, |
|
"loss": 0.7897, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.321247100830078, |
|
"learning_rate": 1.5747368421052635e-05, |
|
"loss": 0.7394, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.1728689670562744, |
|
"learning_rate": 1.5726315789473684e-05, |
|
"loss": 0.7395, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.246960163116455, |
|
"learning_rate": 1.570526315789474e-05, |
|
"loss": 0.7825, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.518326282501221, |
|
"learning_rate": 1.568421052631579e-05, |
|
"loss": 0.8168, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 4.336541652679443, |
|
"learning_rate": 1.5663157894736842e-05, |
|
"loss": 0.8887, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.3204426765441895, |
|
"learning_rate": 1.5642105263157898e-05, |
|
"loss": 0.8257, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 9.327149391174316, |
|
"learning_rate": 1.5621052631578947e-05, |
|
"loss": 0.7896, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.575266718864441, |
|
"learning_rate": 1.5600000000000003e-05, |
|
"loss": 0.9021, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.955282688140869, |
|
"learning_rate": 1.5578947368421052e-05, |
|
"loss": 0.8115, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.098946571350098, |
|
"learning_rate": 1.555789473684211e-05, |
|
"loss": 0.647, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.354290723800659, |
|
"learning_rate": 1.5536842105263158e-05, |
|
"loss": 0.8033, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.7387518882751465, |
|
"learning_rate": 1.551578947368421e-05, |
|
"loss": 0.6904, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 7.594583034515381, |
|
"learning_rate": 1.5494736842105263e-05, |
|
"loss": 0.7914, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.994126081466675, |
|
"learning_rate": 1.5473684210526316e-05, |
|
"loss": 0.8019, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.478656768798828, |
|
"learning_rate": 1.545263157894737e-05, |
|
"loss": 0.7575, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.4734623432159424, |
|
"learning_rate": 1.543157894736842e-05, |
|
"loss": 0.7662, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.2532217502593994, |
|
"learning_rate": 1.5410526315789477e-05, |
|
"loss": 0.6782, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.5201520919799805, |
|
"learning_rate": 1.5389473684210526e-05, |
|
"loss": 0.7102, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.8668696880340576, |
|
"learning_rate": 1.536842105263158e-05, |
|
"loss": 0.8358, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.816726207733154, |
|
"learning_rate": 1.534736842105263e-05, |
|
"loss": 0.8439, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.014636516571045, |
|
"learning_rate": 1.5326315789473684e-05, |
|
"loss": 0.8699, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.465543270111084, |
|
"learning_rate": 1.530526315789474e-05, |
|
"loss": 0.7515, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.904135227203369, |
|
"learning_rate": 1.528421052631579e-05, |
|
"loss": 0.7578, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.1614532470703125, |
|
"learning_rate": 1.5263157894736846e-05, |
|
"loss": 0.6915, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.4877758026123047, |
|
"learning_rate": 1.5242105263157897e-05, |
|
"loss": 0.7607, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.346368312835693, |
|
"learning_rate": 1.5221052631578948e-05, |
|
"loss": 0.8558, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.319607734680176, |
|
"learning_rate": 1.5200000000000002e-05, |
|
"loss": 0.8349, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.130995750427246, |
|
"learning_rate": 1.5178947368421053e-05, |
|
"loss": 0.7747, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.8080275058746338, |
|
"learning_rate": 1.5157894736842107e-05, |
|
"loss": 0.696, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.184603214263916, |
|
"learning_rate": 1.5136842105263158e-05, |
|
"loss": 0.7874, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.7330257892608643, |
|
"learning_rate": 1.5115789473684212e-05, |
|
"loss": 0.6844, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.2653279304504395, |
|
"learning_rate": 1.5094736842105263e-05, |
|
"loss": 0.6429, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.826791524887085, |
|
"learning_rate": 1.5073684210526316e-05, |
|
"loss": 0.7868, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.656713008880615, |
|
"learning_rate": 1.505263157894737e-05, |
|
"loss": 0.81, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.6106789112091064, |
|
"learning_rate": 1.5031578947368421e-05, |
|
"loss": 0.7693, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.2952094078063965, |
|
"learning_rate": 1.5010526315789476e-05, |
|
"loss": 0.7727, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.8655829429626465, |
|
"learning_rate": 1.4989473684210527e-05, |
|
"loss": 0.7702, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.170299768447876, |
|
"learning_rate": 1.4968421052631581e-05, |
|
"loss": 0.884, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 10.48736572265625, |
|
"learning_rate": 1.4947368421052632e-05, |
|
"loss": 0.8281, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.244320392608643, |
|
"learning_rate": 1.4926315789473686e-05, |
|
"loss": 0.8549, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 9.334859848022461, |
|
"learning_rate": 1.4905263157894739e-05, |
|
"loss": 0.7232, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.034826755523682, |
|
"learning_rate": 1.488421052631579e-05, |
|
"loss": 0.7666, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.024431228637695, |
|
"learning_rate": 1.4863157894736844e-05, |
|
"loss": 0.7803, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.2071685791015625, |
|
"learning_rate": 1.4842105263157895e-05, |
|
"loss": 0.8844, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.319312810897827, |
|
"learning_rate": 1.482105263157895e-05, |
|
"loss": 0.7345, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.384433746337891, |
|
"learning_rate": 1.48e-05, |
|
"loss": 0.6718, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.6182382106781006, |
|
"learning_rate": 1.4778947368421055e-05, |
|
"loss": 0.7898, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 8.145679473876953, |
|
"learning_rate": 1.4757894736842106e-05, |
|
"loss": 0.7754, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 4.761884689331055, |
|
"learning_rate": 1.4736842105263159e-05, |
|
"loss": 0.7607, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 0.8005050420761108, |
|
"eval_runtime": 67.8219, |
|
"eval_samples_per_second": 14.744, |
|
"eval_steps_per_second": 14.744, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 9.505555152893066, |
|
"learning_rate": 1.4715789473684213e-05, |
|
"loss": 0.6883, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.4507155418396, |
|
"learning_rate": 1.4694736842105264e-05, |
|
"loss": 0.7203, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 7.255837440490723, |
|
"learning_rate": 1.4673684210526318e-05, |
|
"loss": 0.8193, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.857261657714844, |
|
"learning_rate": 1.465263157894737e-05, |
|
"loss": 0.6581, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 8.000073432922363, |
|
"learning_rate": 1.4631578947368424e-05, |
|
"loss": 0.7318, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.8200011253356934, |
|
"learning_rate": 1.4610526315789474e-05, |
|
"loss": 0.8176, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.028472423553467, |
|
"learning_rate": 1.4589473684210527e-05, |
|
"loss": 0.7712, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.2594337463378906, |
|
"learning_rate": 1.456842105263158e-05, |
|
"loss": 0.8032, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.751202344894409, |
|
"learning_rate": 1.4547368421052632e-05, |
|
"loss": 0.7905, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.235973834991455, |
|
"learning_rate": 1.4526315789473687e-05, |
|
"loss": 0.8402, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.327855348587036, |
|
"learning_rate": 1.4505263157894738e-05, |
|
"loss": 0.9027, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.911487102508545, |
|
"learning_rate": 1.4484210526315792e-05, |
|
"loss": 0.8241, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.3612990379333496, |
|
"learning_rate": 1.4463157894736843e-05, |
|
"loss": 0.7884, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.106682300567627, |
|
"learning_rate": 1.4442105263157896e-05, |
|
"loss": 0.7414, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.354177951812744, |
|
"learning_rate": 1.4421052631578948e-05, |
|
"loss": 0.7626, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.0019009113311768, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 0.6853, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.787949562072754, |
|
"learning_rate": 1.4378947368421054e-05, |
|
"loss": 0.8105, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.0848469734191895, |
|
"learning_rate": 1.4357894736842106e-05, |
|
"loss": 0.7787, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.892744541168213, |
|
"learning_rate": 1.433684210526316e-05, |
|
"loss": 0.8883, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.717949390411377, |
|
"learning_rate": 1.4315789473684212e-05, |
|
"loss": 0.7458, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.4417831897735596, |
|
"learning_rate": 1.4294736842105263e-05, |
|
"loss": 0.7868, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 9.308151245117188, |
|
"learning_rate": 1.4273684210526317e-05, |
|
"loss": 0.8725, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.7036919593811035, |
|
"learning_rate": 1.425263157894737e-05, |
|
"loss": 0.8706, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 7.338090419769287, |
|
"learning_rate": 1.4231578947368422e-05, |
|
"loss": 0.7695, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.9678733348846436, |
|
"learning_rate": 1.4210526315789475e-05, |
|
"loss": 0.7259, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.3436050415039062, |
|
"learning_rate": 1.418947368421053e-05, |
|
"loss": 0.7193, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.23856520652771, |
|
"learning_rate": 1.416842105263158e-05, |
|
"loss": 0.6766, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.021206855773926, |
|
"learning_rate": 1.4147368421052631e-05, |
|
"loss": 0.888, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 7.063048839569092, |
|
"learning_rate": 1.4126315789473686e-05, |
|
"loss": 0.7543, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.777950763702393, |
|
"learning_rate": 1.4105263157894738e-05, |
|
"loss": 0.6239, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.2984225749969482, |
|
"learning_rate": 1.4084210526315791e-05, |
|
"loss": 0.8349, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.48808479309082, |
|
"learning_rate": 1.4063157894736844e-05, |
|
"loss": 0.8765, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 4.557926177978516, |
|
"learning_rate": 1.4042105263157896e-05, |
|
"loss": 0.8508, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.661513090133667, |
|
"learning_rate": 1.4021052631578949e-05, |
|
"loss": 0.7547, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.827274799346924, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.8556, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 3.0803449153900146, |
|
"learning_rate": 1.3978947368421054e-05, |
|
"loss": 0.8663, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 9.996018409729004, |
|
"learning_rate": 1.3957894736842105e-05, |
|
"loss": 0.68, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 5.443753242492676, |
|
"learning_rate": 1.393684210526316e-05, |
|
"loss": 0.7227, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 9.685049057006836, |
|
"learning_rate": 1.3915789473684212e-05, |
|
"loss": 0.7336, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.89941930770874, |
|
"learning_rate": 1.3894736842105265e-05, |
|
"loss": 0.7016, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 9.616964340209961, |
|
"learning_rate": 1.3873684210526317e-05, |
|
"loss": 0.7678, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.262804985046387, |
|
"learning_rate": 1.3852631578947368e-05, |
|
"loss": 0.8227, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.251176357269287, |
|
"learning_rate": 1.3831578947368423e-05, |
|
"loss": 0.71, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.626483917236328, |
|
"learning_rate": 1.3810526315789474e-05, |
|
"loss": 0.8204, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.990488052368164, |
|
"learning_rate": 1.3789473684210528e-05, |
|
"loss": 0.7745, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 7.510478496551514, |
|
"learning_rate": 1.3768421052631579e-05, |
|
"loss": 0.6286, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.006512641906738, |
|
"learning_rate": 1.3747368421052633e-05, |
|
"loss": 0.8146, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.388507843017578, |
|
"learning_rate": 1.3726315789473686e-05, |
|
"loss": 0.7914, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.217645168304443, |
|
"learning_rate": 1.3705263157894737e-05, |
|
"loss": 0.7219, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.257259368896484, |
|
"learning_rate": 1.3684210526315791e-05, |
|
"loss": 0.8404, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 0.8086790442466736, |
|
"eval_runtime": 67.9356, |
|
"eval_samples_per_second": 14.72, |
|
"eval_steps_per_second": 14.72, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.6733217239379883, |
|
"learning_rate": 1.3663157894736842e-05, |
|
"loss": 0.7936, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.0083932876586914, |
|
"learning_rate": 1.3642105263157897e-05, |
|
"loss": 0.7122, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 17.048171997070312, |
|
"learning_rate": 1.3621052631578948e-05, |
|
"loss": 0.7568, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.1675314903259277, |
|
"learning_rate": 1.3600000000000002e-05, |
|
"loss": 0.639, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.6769821643829346, |
|
"learning_rate": 1.3578947368421055e-05, |
|
"loss": 0.8691, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.516094207763672, |
|
"learning_rate": 1.3557894736842106e-05, |
|
"loss": 0.7681, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.8059396743774414, |
|
"learning_rate": 1.353684210526316e-05, |
|
"loss": 0.7387, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.531425952911377, |
|
"learning_rate": 1.3515789473684211e-05, |
|
"loss": 0.7559, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 9.721296310424805, |
|
"learning_rate": 1.3494736842105265e-05, |
|
"loss": 0.7816, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 6.60942268371582, |
|
"learning_rate": 1.3473684210526316e-05, |
|
"loss": 0.8203, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.4415578842163086, |
|
"learning_rate": 1.345263157894737e-05, |
|
"loss": 0.6731, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.322394847869873, |
|
"learning_rate": 1.3431578947368421e-05, |
|
"loss": 0.8578, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.429548978805542, |
|
"learning_rate": 1.3410526315789474e-05, |
|
"loss": 0.7576, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.9194421768188477, |
|
"learning_rate": 1.3389473684210528e-05, |
|
"loss": 0.8131, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.6766152381896973, |
|
"learning_rate": 1.336842105263158e-05, |
|
"loss": 0.9391, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.4084839820861816, |
|
"learning_rate": 1.3347368421052634e-05, |
|
"loss": 0.8242, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.2697949409484863, |
|
"learning_rate": 1.3326315789473685e-05, |
|
"loss": 0.7534, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.7821884155273438, |
|
"learning_rate": 1.3305263157894739e-05, |
|
"loss": 0.7237, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.925840854644775, |
|
"learning_rate": 1.328421052631579e-05, |
|
"loss": 0.8194, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.694246768951416, |
|
"learning_rate": 1.3263157894736843e-05, |
|
"loss": 0.7628, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 7.358584403991699, |
|
"learning_rate": 1.3242105263157895e-05, |
|
"loss": 0.9161, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.007431983947754, |
|
"learning_rate": 1.3221052631578948e-05, |
|
"loss": 0.6624, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.7626278400421143, |
|
"learning_rate": 1.3200000000000002e-05, |
|
"loss": 0.7662, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.4226157665252686, |
|
"learning_rate": 1.3178947368421053e-05, |
|
"loss": 0.809, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.5735135078430176, |
|
"learning_rate": 1.3157894736842108e-05, |
|
"loss": 0.6769, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.4251084327697754, |
|
"learning_rate": 1.3136842105263159e-05, |
|
"loss": 0.7785, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.311429977416992, |
|
"learning_rate": 1.3115789473684211e-05, |
|
"loss": 0.7536, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.2348074913024902, |
|
"learning_rate": 1.3094736842105264e-05, |
|
"loss": 0.8138, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 7.259544372558594, |
|
"learning_rate": 1.3073684210526317e-05, |
|
"loss": 0.8, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.99937105178833, |
|
"learning_rate": 1.305263157894737e-05, |
|
"loss": 0.7894, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 10.336478233337402, |
|
"learning_rate": 1.3031578947368422e-05, |
|
"loss": 0.7928, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.500198841094971, |
|
"learning_rate": 1.3010526315789476e-05, |
|
"loss": 0.7916, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.115521192550659, |
|
"learning_rate": 1.2989473684210527e-05, |
|
"loss": 0.7585, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 8.954665184020996, |
|
"learning_rate": 1.2968421052631578e-05, |
|
"loss": 0.7727, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.874253273010254, |
|
"learning_rate": 1.2947368421052633e-05, |
|
"loss": 0.7903, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.151484966278076, |
|
"learning_rate": 1.2926315789473685e-05, |
|
"loss": 0.7199, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 10.117889404296875, |
|
"learning_rate": 1.2905263157894738e-05, |
|
"loss": 0.7562, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.595205307006836, |
|
"learning_rate": 1.288421052631579e-05, |
|
"loss": 0.8167, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.4744372367858887, |
|
"learning_rate": 1.2863157894736845e-05, |
|
"loss": 0.7343, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.1740803718566895, |
|
"learning_rate": 1.2842105263157896e-05, |
|
"loss": 0.8754, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 7.299022197723389, |
|
"learning_rate": 1.2821052631578947e-05, |
|
"loss": 0.7379, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.2339208126068115, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 0.771, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.5612077713012695, |
|
"learning_rate": 1.2778947368421054e-05, |
|
"loss": 0.7959, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.87350606918335, |
|
"learning_rate": 1.2757894736842106e-05, |
|
"loss": 0.7871, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.8318493366241455, |
|
"learning_rate": 1.2736842105263159e-05, |
|
"loss": 0.7502, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.797230243682861, |
|
"learning_rate": 1.2715789473684212e-05, |
|
"loss": 0.7241, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.037790775299072, |
|
"learning_rate": 1.2694736842105264e-05, |
|
"loss": 0.8642, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.736443042755127, |
|
"learning_rate": 1.2673684210526315e-05, |
|
"loss": 0.7672, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.449172258377075, |
|
"learning_rate": 1.265263157894737e-05, |
|
"loss": 0.7685, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.1893362998962402, |
|
"learning_rate": 1.263157894736842e-05, |
|
"loss": 0.6876, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 0.8031703233718872, |
|
"eval_runtime": 67.9677, |
|
"eval_samples_per_second": 14.713, |
|
"eval_steps_per_second": 14.713, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.5685079097747803, |
|
"learning_rate": 1.2610526315789475e-05, |
|
"loss": 0.7317, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 6.440120220184326, |
|
"learning_rate": 1.2589473684210528e-05, |
|
"loss": 0.7919, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 5.1870341300964355, |
|
"learning_rate": 1.256842105263158e-05, |
|
"loss": 0.6921, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.161406517028809, |
|
"learning_rate": 1.2547368421052633e-05, |
|
"loss": 0.7822, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 6.242280006408691, |
|
"learning_rate": 1.2526315789473684e-05, |
|
"loss": 0.8151, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.825861692428589, |
|
"learning_rate": 1.2505263157894738e-05, |
|
"loss": 0.7709, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 22.97239112854004, |
|
"learning_rate": 1.248421052631579e-05, |
|
"loss": 0.8456, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.718015193939209, |
|
"learning_rate": 1.2463157894736844e-05, |
|
"loss": 0.8354, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.266710042953491, |
|
"learning_rate": 1.2442105263157895e-05, |
|
"loss": 0.6444, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.711140155792236, |
|
"learning_rate": 1.2421052631578949e-05, |
|
"loss": 0.8418, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.8773484230041504, |
|
"learning_rate": 1.2400000000000002e-05, |
|
"loss": 0.6673, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.492387771606445, |
|
"learning_rate": 1.2378947368421053e-05, |
|
"loss": 0.7801, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.106402397155762, |
|
"learning_rate": 1.2357894736842107e-05, |
|
"loss": 0.718, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.770216941833496, |
|
"learning_rate": 1.2336842105263158e-05, |
|
"loss": 0.7546, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.7071616649627686, |
|
"learning_rate": 1.2315789473684212e-05, |
|
"loss": 0.8232, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 11.786856651306152, |
|
"learning_rate": 1.2294736842105263e-05, |
|
"loss": 0.845, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.190443515777588, |
|
"learning_rate": 1.2273684210526317e-05, |
|
"loss": 0.7656, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 6.3326239585876465, |
|
"learning_rate": 1.225263157894737e-05, |
|
"loss": 0.8145, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.735156297683716, |
|
"learning_rate": 1.2231578947368421e-05, |
|
"loss": 0.7637, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 44.92083740234375, |
|
"learning_rate": 1.2210526315789475e-05, |
|
"loss": 0.8358, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.335235595703125, |
|
"learning_rate": 1.2189473684210526e-05, |
|
"loss": 0.8564, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 6.2740349769592285, |
|
"learning_rate": 1.216842105263158e-05, |
|
"loss": 0.8443, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.290927410125732, |
|
"learning_rate": 1.2147368421052632e-05, |
|
"loss": 0.8041, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 9.460419654846191, |
|
"learning_rate": 1.2126315789473686e-05, |
|
"loss": 0.8054, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.963223934173584, |
|
"learning_rate": 1.2105263157894737e-05, |
|
"loss": 0.8104, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.091956615447998, |
|
"learning_rate": 1.208421052631579e-05, |
|
"loss": 0.7156, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.055617570877075, |
|
"learning_rate": 1.2063157894736844e-05, |
|
"loss": 0.835, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.4951014518737793, |
|
"learning_rate": 1.2042105263157895e-05, |
|
"loss": 0.8004, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.876716136932373, |
|
"learning_rate": 1.202105263157895e-05, |
|
"loss": 0.7324, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.0460751056671143, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.7856, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.417022943496704, |
|
"learning_rate": 1.1978947368421055e-05, |
|
"loss": 0.7649, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.435426235198975, |
|
"learning_rate": 1.1957894736842106e-05, |
|
"loss": 0.8354, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.5707461833953857, |
|
"learning_rate": 1.1936842105263158e-05, |
|
"loss": 0.7264, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.077741622924805, |
|
"learning_rate": 1.1915789473684211e-05, |
|
"loss": 0.6934, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.498834133148193, |
|
"learning_rate": 1.1894736842105264e-05, |
|
"loss": 0.7687, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.684948205947876, |
|
"learning_rate": 1.1873684210526318e-05, |
|
"loss": 0.9371, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.4451920986175537, |
|
"learning_rate": 1.1852631578947369e-05, |
|
"loss": 0.7904, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.1316657066345215, |
|
"learning_rate": 1.1831578947368423e-05, |
|
"loss": 0.6835, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.741140127182007, |
|
"learning_rate": 1.1810526315789474e-05, |
|
"loss": 0.8932, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.40395975112915, |
|
"learning_rate": 1.1789473684210527e-05, |
|
"loss": 0.9044, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.406174898147583, |
|
"learning_rate": 1.176842105263158e-05, |
|
"loss": 0.7924, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 6.892871379852295, |
|
"learning_rate": 1.1747368421052632e-05, |
|
"loss": 0.7356, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.855538368225098, |
|
"learning_rate": 1.1726315789473685e-05, |
|
"loss": 0.7543, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.951474666595459, |
|
"learning_rate": 1.1705263157894737e-05, |
|
"loss": 0.7416, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.314338207244873, |
|
"learning_rate": 1.1684210526315792e-05, |
|
"loss": 0.7764, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.176599502563477, |
|
"learning_rate": 1.1663157894736843e-05, |
|
"loss": 0.7658, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.275913238525391, |
|
"learning_rate": 1.1642105263157897e-05, |
|
"loss": 0.6621, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.1267452239990234, |
|
"learning_rate": 1.1621052631578948e-05, |
|
"loss": 0.7699, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 8.313840866088867, |
|
"learning_rate": 1.16e-05, |
|
"loss": 0.7262, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 8.310973167419434, |
|
"learning_rate": 1.1578947368421053e-05, |
|
"loss": 0.8871, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 0.7730265855789185, |
|
"eval_runtime": 67.9868, |
|
"eval_samples_per_second": 14.709, |
|
"eval_steps_per_second": 14.709, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.4506046772003174, |
|
"learning_rate": 1.1557894736842106e-05, |
|
"loss": 0.7579, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.864931106567383, |
|
"learning_rate": 1.153684210526316e-05, |
|
"loss": 0.899, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.998289108276367, |
|
"learning_rate": 1.1515789473684211e-05, |
|
"loss": 0.7924, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 4.859367370605469, |
|
"learning_rate": 1.1494736842105266e-05, |
|
"loss": 0.773, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.9851796627044678, |
|
"learning_rate": 1.1473684210526317e-05, |
|
"loss": 0.8665, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 5.357670783996582, |
|
"learning_rate": 1.145263157894737e-05, |
|
"loss": 0.7522, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.778637409210205, |
|
"learning_rate": 1.1431578947368422e-05, |
|
"loss": 0.6188, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 8.546213150024414, |
|
"learning_rate": 1.1410526315789475e-05, |
|
"loss": 0.7296, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.5466620922088623, |
|
"learning_rate": 1.1389473684210527e-05, |
|
"loss": 0.7774, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.85372257232666, |
|
"learning_rate": 1.136842105263158e-05, |
|
"loss": 0.7884, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.1757266521453857, |
|
"learning_rate": 1.1347368421052634e-05, |
|
"loss": 0.8008, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.7544124126434326, |
|
"learning_rate": 1.1326315789473685e-05, |
|
"loss": 0.7322, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 7.456575870513916, |
|
"learning_rate": 1.1305263157894736e-05, |
|
"loss": 0.7978, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.9789164066314697, |
|
"learning_rate": 1.128421052631579e-05, |
|
"loss": 0.617, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.361474514007568, |
|
"learning_rate": 1.1263157894736843e-05, |
|
"loss": 0.8214, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 14.45222282409668, |
|
"learning_rate": 1.1242105263157896e-05, |
|
"loss": 0.7183, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.7697906494140625, |
|
"learning_rate": 1.1221052631578949e-05, |
|
"loss": 0.729, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.225655555725098, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.7627, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.067778587341309, |
|
"learning_rate": 1.1178947368421054e-05, |
|
"loss": 0.78, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.5654473304748535, |
|
"learning_rate": 1.1157894736842105e-05, |
|
"loss": 0.7178, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.7385423183441162, |
|
"learning_rate": 1.1136842105263159e-05, |
|
"loss": 0.9387, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.847338676452637, |
|
"learning_rate": 1.111578947368421e-05, |
|
"loss": 0.7951, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.739323377609253, |
|
"learning_rate": 1.1094736842105264e-05, |
|
"loss": 0.8198, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.23370885848999, |
|
"learning_rate": 1.1073684210526317e-05, |
|
"loss": 0.7462, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.970132350921631, |
|
"learning_rate": 1.105263157894737e-05, |
|
"loss": 0.6983, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.8072540760040283, |
|
"learning_rate": 1.1031578947368422e-05, |
|
"loss": 0.852, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.734208345413208, |
|
"learning_rate": 1.1010526315789473e-05, |
|
"loss": 0.8621, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.022127151489258, |
|
"learning_rate": 1.0989473684210528e-05, |
|
"loss": 0.7652, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 7.284844875335693, |
|
"learning_rate": 1.0968421052631579e-05, |
|
"loss": 0.7901, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.52205753326416, |
|
"learning_rate": 1.0947368421052633e-05, |
|
"loss": 0.8347, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.1662251949310303, |
|
"learning_rate": 1.0926315789473686e-05, |
|
"loss": 0.6105, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.027661323547363, |
|
"learning_rate": 1.0905263157894738e-05, |
|
"loss": 0.7447, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 9.989821434020996, |
|
"learning_rate": 1.0884210526315791e-05, |
|
"loss": 0.8144, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.886387825012207, |
|
"learning_rate": 1.0863157894736842e-05, |
|
"loss": 0.7702, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 8.8762845993042, |
|
"learning_rate": 1.0842105263157896e-05, |
|
"loss": 0.7305, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.934712886810303, |
|
"learning_rate": 1.0821052631578947e-05, |
|
"loss": 0.7849, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.3684771060943604, |
|
"learning_rate": 1.0800000000000002e-05, |
|
"loss": 0.8292, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 10.528717041015625, |
|
"learning_rate": 1.0778947368421053e-05, |
|
"loss": 0.8637, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 10.721526145935059, |
|
"learning_rate": 1.0757894736842107e-05, |
|
"loss": 0.7756, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.99760103225708, |
|
"learning_rate": 1.073684210526316e-05, |
|
"loss": 0.7928, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 8.126914978027344, |
|
"learning_rate": 1.071578947368421e-05, |
|
"loss": 0.7833, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.2220332622528076, |
|
"learning_rate": 1.0694736842105265e-05, |
|
"loss": 0.7819, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 7.0100321769714355, |
|
"learning_rate": 1.0673684210526316e-05, |
|
"loss": 0.8533, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.5343334674835205, |
|
"learning_rate": 1.065263157894737e-05, |
|
"loss": 0.7053, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.862158298492432, |
|
"learning_rate": 1.0631578947368421e-05, |
|
"loss": 0.7556, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 10.008291244506836, |
|
"learning_rate": 1.0610526315789476e-05, |
|
"loss": 0.8381, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.455188035964966, |
|
"learning_rate": 1.0589473684210526e-05, |
|
"loss": 0.749, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.882299423217773, |
|
"learning_rate": 1.0568421052631579e-05, |
|
"loss": 0.7797, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 5.7382001876831055, |
|
"learning_rate": 1.0547368421052633e-05, |
|
"loss": 0.8191, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.9528167247772217, |
|
"learning_rate": 1.0526315789473684e-05, |
|
"loss": 0.6382, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 0.7771185040473938, |
|
"eval_runtime": 67.9924, |
|
"eval_samples_per_second": 14.708, |
|
"eval_steps_per_second": 14.708, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.4830055236816406, |
|
"learning_rate": 1.0505263157894739e-05, |
|
"loss": 0.9099, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 5.7392096519470215, |
|
"learning_rate": 1.048421052631579e-05, |
|
"loss": 0.6423, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.720612049102783, |
|
"learning_rate": 1.0463157894736844e-05, |
|
"loss": 0.7826, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.0437145233154297, |
|
"learning_rate": 1.0442105263157895e-05, |
|
"loss": 0.7865, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 8.835311889648438, |
|
"learning_rate": 1.0421052631578948e-05, |
|
"loss": 0.7778, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.596973419189453, |
|
"learning_rate": 1.04e-05, |
|
"loss": 0.7381, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.108314037322998, |
|
"learning_rate": 1.0378947368421053e-05, |
|
"loss": 0.7689, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.865196704864502, |
|
"learning_rate": 1.0357894736842107e-05, |
|
"loss": 0.7785, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.4403493404388428, |
|
"learning_rate": 1.0336842105263158e-05, |
|
"loss": 0.8322, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.243029832839966, |
|
"learning_rate": 1.0315789473684213e-05, |
|
"loss": 0.6658, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.806818962097168, |
|
"learning_rate": 1.0294736842105264e-05, |
|
"loss": 0.781, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.820622205734253, |
|
"learning_rate": 1.0273684210526316e-05, |
|
"loss": 0.7499, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.203964710235596, |
|
"learning_rate": 1.0252631578947369e-05, |
|
"loss": 0.7702, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.803215503692627, |
|
"learning_rate": 1.0231578947368422e-05, |
|
"loss": 0.6291, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 5.486114978790283, |
|
"learning_rate": 1.0210526315789476e-05, |
|
"loss": 0.8124, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.74938440322876, |
|
"learning_rate": 1.0189473684210527e-05, |
|
"loss": 0.7735, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.10128116607666, |
|
"learning_rate": 1.0168421052631581e-05, |
|
"loss": 0.6809, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 6.844088554382324, |
|
"learning_rate": 1.0147368421052632e-05, |
|
"loss": 0.8294, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.329681873321533, |
|
"learning_rate": 1.0126315789473685e-05, |
|
"loss": 0.861, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 12.482446670532227, |
|
"learning_rate": 1.0105263157894738e-05, |
|
"loss": 0.7346, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.8471055030822754, |
|
"learning_rate": 1.008421052631579e-05, |
|
"loss": 0.7714, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.1509273052215576, |
|
"learning_rate": 1.0063157894736843e-05, |
|
"loss": 0.697, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 4.524876117706299, |
|
"learning_rate": 1.0042105263157896e-05, |
|
"loss": 0.8373, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.7305006980895996, |
|
"learning_rate": 1.002105263157895e-05, |
|
"loss": 0.7182, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.5194203853607178, |
|
"learning_rate": 1e-05, |
|
"loss": 0.794, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 14.967845916748047, |
|
"learning_rate": 9.978947368421053e-06, |
|
"loss": 0.7564, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.8730751276016235, |
|
"learning_rate": 9.957894736842106e-06, |
|
"loss": 0.726, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.1793789863586426, |
|
"learning_rate": 9.936842105263159e-06, |
|
"loss": 0.7019, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 5.0785651206970215, |
|
"learning_rate": 9.915789473684211e-06, |
|
"loss": 0.7771, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 9.810837745666504, |
|
"learning_rate": 9.894736842105264e-06, |
|
"loss": 0.7542, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 24.654855728149414, |
|
"learning_rate": 9.873684210526317e-06, |
|
"loss": 0.7928, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.083669424057007, |
|
"learning_rate": 9.85263157894737e-06, |
|
"loss": 0.8091, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.9507665634155273, |
|
"learning_rate": 9.831578947368422e-06, |
|
"loss": 0.7548, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.55362606048584, |
|
"learning_rate": 9.810526315789475e-06, |
|
"loss": 0.7804, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.572410821914673, |
|
"learning_rate": 9.789473684210527e-06, |
|
"loss": 0.748, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.70060658454895, |
|
"learning_rate": 9.76842105263158e-06, |
|
"loss": 0.7303, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.397512674331665, |
|
"learning_rate": 9.747368421052633e-06, |
|
"loss": 0.7209, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.797943592071533, |
|
"learning_rate": 9.726315789473685e-06, |
|
"loss": 0.9082, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 9.164168357849121, |
|
"learning_rate": 9.705263157894738e-06, |
|
"loss": 0.7995, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 6.297326564788818, |
|
"learning_rate": 9.68421052631579e-06, |
|
"loss": 0.7484, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 12.500905990600586, |
|
"learning_rate": 9.663157894736843e-06, |
|
"loss": 0.7291, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.1083016395568848, |
|
"learning_rate": 9.642105263157896e-06, |
|
"loss": 0.8064, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.058903694152832, |
|
"learning_rate": 9.621052631578947e-06, |
|
"loss": 0.7087, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.303778648376465, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.6257, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.8508620262145996, |
|
"learning_rate": 9.578947368421054e-06, |
|
"loss": 0.7423, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.9560956954956055, |
|
"learning_rate": 9.557894736842107e-06, |
|
"loss": 0.7304, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.8841540813446045, |
|
"learning_rate": 9.53684210526316e-06, |
|
"loss": 0.7768, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.6742358207702637, |
|
"learning_rate": 9.515789473684212e-06, |
|
"loss": 0.7618, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.105114936828613, |
|
"learning_rate": 9.494736842105265e-06, |
|
"loss": 0.7086, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.728137493133545, |
|
"learning_rate": 9.473684210526315e-06, |
|
"loss": 0.8313, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 0.7711445689201355, |
|
"eval_runtime": 67.9047, |
|
"eval_samples_per_second": 14.727, |
|
"eval_steps_per_second": 14.727, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.539173603057861, |
|
"learning_rate": 9.452631578947368e-06, |
|
"loss": 0.7231, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.742118835449219, |
|
"learning_rate": 9.43157894736842e-06, |
|
"loss": 0.8199, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.9068603515625, |
|
"learning_rate": 9.410526315789475e-06, |
|
"loss": 0.7615, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 7.106772422790527, |
|
"learning_rate": 9.389473684210528e-06, |
|
"loss": 0.7139, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.272012710571289, |
|
"learning_rate": 9.36842105263158e-06, |
|
"loss": 0.6264, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 14.025699615478516, |
|
"learning_rate": 9.347368421052633e-06, |
|
"loss": 0.7416, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 12.747345924377441, |
|
"learning_rate": 9.326315789473684e-06, |
|
"loss": 0.781, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 7.966195106506348, |
|
"learning_rate": 9.305263157894737e-06, |
|
"loss": 0.7503, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.3705811500549316, |
|
"learning_rate": 9.28421052631579e-06, |
|
"loss": 0.7704, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.239542007446289, |
|
"learning_rate": 9.263157894736842e-06, |
|
"loss": 0.6806, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 6.395047187805176, |
|
"learning_rate": 9.242105263157896e-06, |
|
"loss": 0.6961, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.807992458343506, |
|
"learning_rate": 9.221052631578949e-06, |
|
"loss": 0.769, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.8179049491882324, |
|
"learning_rate": 9.200000000000002e-06, |
|
"loss": 0.7515, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.826687812805176, |
|
"learning_rate": 9.178947368421053e-06, |
|
"loss": 0.7337, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.776168346405029, |
|
"learning_rate": 9.157894736842105e-06, |
|
"loss": 0.7173, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.10529088973999, |
|
"learning_rate": 9.136842105263158e-06, |
|
"loss": 0.7255, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.4715189933776855, |
|
"learning_rate": 9.11578947368421e-06, |
|
"loss": 0.8092, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.8921728134155273, |
|
"learning_rate": 9.094736842105263e-06, |
|
"loss": 0.6684, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.904684066772461, |
|
"learning_rate": 9.073684210526316e-06, |
|
"loss": 0.7804, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 9.521209716796875, |
|
"learning_rate": 9.05263157894737e-06, |
|
"loss": 0.793, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 11.125286102294922, |
|
"learning_rate": 9.031578947368423e-06, |
|
"loss": 0.8254, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 8.136049270629883, |
|
"learning_rate": 9.010526315789474e-06, |
|
"loss": 0.7475, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.4722092151641846, |
|
"learning_rate": 8.989473684210527e-06, |
|
"loss": 0.7268, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.330580711364746, |
|
"learning_rate": 8.96842105263158e-06, |
|
"loss": 0.7995, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 25.711868286132812, |
|
"learning_rate": 8.947368421052632e-06, |
|
"loss": 0.801, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.3957395553588867, |
|
"learning_rate": 8.926315789473685e-06, |
|
"loss": 0.6988, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.033153533935547, |
|
"learning_rate": 8.905263157894737e-06, |
|
"loss": 0.7378, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.359398365020752, |
|
"learning_rate": 8.884210526315792e-06, |
|
"loss": 0.7214, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.08485746383667, |
|
"learning_rate": 8.863157894736842e-06, |
|
"loss": 0.7034, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.156674385070801, |
|
"learning_rate": 8.842105263157895e-06, |
|
"loss": 0.7833, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.031563758850098, |
|
"learning_rate": 8.821052631578948e-06, |
|
"loss": 0.7385, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 9.957317352294922, |
|
"learning_rate": 8.8e-06, |
|
"loss": 0.8572, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.951910972595215, |
|
"learning_rate": 8.778947368421053e-06, |
|
"loss": 0.7374, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.296828746795654, |
|
"learning_rate": 8.757894736842106e-06, |
|
"loss": 0.7619, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.079039096832275, |
|
"learning_rate": 8.736842105263158e-06, |
|
"loss": 0.7842, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.972481727600098, |
|
"learning_rate": 8.715789473684211e-06, |
|
"loss": 0.7039, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 11.936322212219238, |
|
"learning_rate": 8.694736842105264e-06, |
|
"loss": 0.6701, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.164266586303711, |
|
"learning_rate": 8.673684210526316e-06, |
|
"loss": 0.7481, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.0412397384643555, |
|
"learning_rate": 8.652631578947369e-06, |
|
"loss": 0.8783, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 13.239718437194824, |
|
"learning_rate": 8.631578947368422e-06, |
|
"loss": 0.8639, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.553131103515625, |
|
"learning_rate": 8.610526315789474e-06, |
|
"loss": 0.7861, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.507501602172852, |
|
"learning_rate": 8.589473684210527e-06, |
|
"loss": 0.7526, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.70124888420105, |
|
"learning_rate": 8.56842105263158e-06, |
|
"loss": 0.8391, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.307315349578857, |
|
"learning_rate": 8.547368421052632e-06, |
|
"loss": 0.7253, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 12.232582092285156, |
|
"learning_rate": 8.526315789473685e-06, |
|
"loss": 0.8559, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.0924105644226074, |
|
"learning_rate": 8.505263157894738e-06, |
|
"loss": 0.6245, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.90191912651062, |
|
"learning_rate": 8.48421052631579e-06, |
|
"loss": 0.6643, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.4637041091918945, |
|
"learning_rate": 8.463157894736843e-06, |
|
"loss": 0.72, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.8273704051971436, |
|
"learning_rate": 8.442105263157896e-06, |
|
"loss": 0.7202, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.119280815124512, |
|
"learning_rate": 8.421052631578948e-06, |
|
"loss": 0.7047, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 0.7685219645500183, |
|
"eval_runtime": 67.892, |
|
"eval_samples_per_second": 14.729, |
|
"eval_steps_per_second": 14.729, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.9551520347595215, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.6911, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.9231200218200684, |
|
"learning_rate": 8.378947368421054e-06, |
|
"loss": 0.7942, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.254823684692383, |
|
"learning_rate": 8.357894736842106e-06, |
|
"loss": 0.7811, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.8563404083251953, |
|
"learning_rate": 8.336842105263159e-06, |
|
"loss": 0.7523, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.5061299800872803, |
|
"learning_rate": 8.315789473684212e-06, |
|
"loss": 0.6222, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.3213858604431152, |
|
"learning_rate": 8.294736842105264e-06, |
|
"loss": 0.7617, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.054555416107178, |
|
"learning_rate": 8.273684210526317e-06, |
|
"loss": 0.7333, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.5189318656921387, |
|
"learning_rate": 8.25263157894737e-06, |
|
"loss": 0.8676, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.989790439605713, |
|
"learning_rate": 8.231578947368422e-06, |
|
"loss": 0.6678, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.941010475158691, |
|
"learning_rate": 8.210526315789475e-06, |
|
"loss": 0.7317, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.6499247550964355, |
|
"learning_rate": 8.189473684210527e-06, |
|
"loss": 0.7484, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.512948513031006, |
|
"learning_rate": 8.16842105263158e-06, |
|
"loss": 0.8508, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.844045400619507, |
|
"learning_rate": 8.147368421052633e-06, |
|
"loss": 0.7468, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.620250701904297, |
|
"learning_rate": 8.126315789473684e-06, |
|
"loss": 0.6449, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 3.5233919620513916, |
|
"learning_rate": 8.105263157894736e-06, |
|
"loss": 0.7928, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.866186618804932, |
|
"learning_rate": 8.08421052631579e-06, |
|
"loss": 0.787, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.392407417297363, |
|
"learning_rate": 8.063157894736843e-06, |
|
"loss": 0.7746, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.6285176277160645, |
|
"learning_rate": 8.042105263157896e-06, |
|
"loss": 0.7304, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.571240186691284, |
|
"learning_rate": 8.021052631578949e-06, |
|
"loss": 0.7008, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.8306283950805664, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.834, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.5514955520629883, |
|
"learning_rate": 7.978947368421052e-06, |
|
"loss": 0.8136, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8.471675872802734, |
|
"learning_rate": 7.957894736842105e-06, |
|
"loss": 0.8439, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8.785553932189941, |
|
"learning_rate": 7.936842105263158e-06, |
|
"loss": 0.7763, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.334304332733154, |
|
"learning_rate": 7.915789473684212e-06, |
|
"loss": 0.7832, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 14.861701011657715, |
|
"learning_rate": 7.894736842105265e-06, |
|
"loss": 0.6889, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.040034770965576, |
|
"learning_rate": 7.873684210526317e-06, |
|
"loss": 0.7422, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 9.74354076385498, |
|
"learning_rate": 7.85263157894737e-06, |
|
"loss": 0.7765, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.4280757904052734, |
|
"learning_rate": 7.831578947368421e-06, |
|
"loss": 0.7465, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 6.530819416046143, |
|
"learning_rate": 7.810526315789474e-06, |
|
"loss": 0.8216, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 6.786412239074707, |
|
"learning_rate": 7.789473684210526e-06, |
|
"loss": 0.7694, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.896278381347656, |
|
"learning_rate": 7.768421052631579e-06, |
|
"loss": 0.8282, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.5938825607299805, |
|
"learning_rate": 7.747368421052631e-06, |
|
"loss": 0.6628, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.134136915206909, |
|
"learning_rate": 7.726315789473686e-06, |
|
"loss": 0.8061, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 7.497835159301758, |
|
"learning_rate": 7.705263157894738e-06, |
|
"loss": 0.8946, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.5185306072235107, |
|
"learning_rate": 7.68421052631579e-06, |
|
"loss": 0.6689, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.464015245437622, |
|
"learning_rate": 7.663157894736842e-06, |
|
"loss": 0.7758, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.803342580795288, |
|
"learning_rate": 7.642105263157895e-06, |
|
"loss": 0.7478, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 6.2652130126953125, |
|
"learning_rate": 7.621052631578948e-06, |
|
"loss": 0.7293, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 9.655146598815918, |
|
"learning_rate": 7.600000000000001e-06, |
|
"loss": 0.7454, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.041891574859619, |
|
"learning_rate": 7.578947368421054e-06, |
|
"loss": 0.8579, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.133237838745117, |
|
"learning_rate": 7.557894736842106e-06, |
|
"loss": 0.6662, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 7.207560062408447, |
|
"learning_rate": 7.536842105263158e-06, |
|
"loss": 0.8135, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.374864101409912, |
|
"learning_rate": 7.515789473684211e-06, |
|
"loss": 0.7514, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.067178249359131, |
|
"learning_rate": 7.494736842105263e-06, |
|
"loss": 0.7446, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.283421516418457, |
|
"learning_rate": 7.473684210526316e-06, |
|
"loss": 0.7955, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.092348098754883, |
|
"learning_rate": 7.4526315789473695e-06, |
|
"loss": 0.5471, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 9.400391578674316, |
|
"learning_rate": 7.431578947368422e-06, |
|
"loss": 0.7098, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.843224048614502, |
|
"learning_rate": 7.410526315789475e-06, |
|
"loss": 0.7943, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.5985705852508545, |
|
"learning_rate": 7.3894736842105275e-06, |
|
"loss": 0.8059, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.502979278564453, |
|
"learning_rate": 7.368421052631579e-06, |
|
"loss": 0.6236, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 0.7682243585586548, |
|
"eval_runtime": 67.9039, |
|
"eval_samples_per_second": 14.727, |
|
"eval_steps_per_second": 14.727, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 11.025419235229492, |
|
"learning_rate": 7.347368421052632e-06, |
|
"loss": 0.8343, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.4290804862976074, |
|
"learning_rate": 7.326315789473685e-06, |
|
"loss": 0.7572, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.0629210472106934, |
|
"learning_rate": 7.305263157894737e-06, |
|
"loss": 0.8245, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.065977573394775, |
|
"learning_rate": 7.28421052631579e-06, |
|
"loss": 0.6447, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.971541166305542, |
|
"learning_rate": 7.263157894736843e-06, |
|
"loss": 0.8688, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.4434573650360107, |
|
"learning_rate": 7.242105263157896e-06, |
|
"loss": 0.6749, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.323293685913086, |
|
"learning_rate": 7.221052631578948e-06, |
|
"loss": 0.7982, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 16.821266174316406, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 0.7898, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.008687734603882, |
|
"learning_rate": 7.178947368421053e-06, |
|
"loss": 0.7375, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.629837989807129, |
|
"learning_rate": 7.157894736842106e-06, |
|
"loss": 0.7909, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.807744026184082, |
|
"learning_rate": 7.1368421052631585e-06, |
|
"loss": 0.621, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.9960129261016846, |
|
"learning_rate": 7.115789473684211e-06, |
|
"loss": 0.851, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.7165372371673584, |
|
"learning_rate": 7.094736842105265e-06, |
|
"loss": 0.7872, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.922586917877197, |
|
"learning_rate": 7.073684210526316e-06, |
|
"loss": 0.8822, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 9.046282768249512, |
|
"learning_rate": 7.052631578947369e-06, |
|
"loss": 0.7454, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.76317024230957, |
|
"learning_rate": 7.031578947368422e-06, |
|
"loss": 0.7116, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.31531286239624, |
|
"learning_rate": 7.010526315789474e-06, |
|
"loss": 0.7892, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.0895297527313232, |
|
"learning_rate": 6.989473684210527e-06, |
|
"loss": 0.7095, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.174783706665039, |
|
"learning_rate": 6.96842105263158e-06, |
|
"loss": 0.8007, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.1555280685424805, |
|
"learning_rate": 6.947368421052632e-06, |
|
"loss": 0.8274, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.173882246017456, |
|
"learning_rate": 6.926315789473684e-06, |
|
"loss": 0.6447, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.1489410400390625, |
|
"learning_rate": 6.905263157894737e-06, |
|
"loss": 0.7428, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.523904323577881, |
|
"learning_rate": 6.8842105263157895e-06, |
|
"loss": 0.8159, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.7494622468948364, |
|
"learning_rate": 6.863157894736843e-06, |
|
"loss": 0.863, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.552121639251709, |
|
"learning_rate": 6.842105263157896e-06, |
|
"loss": 0.7448, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.1907453536987305, |
|
"learning_rate": 6.821052631578948e-06, |
|
"loss": 0.6813, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.284384727478027, |
|
"learning_rate": 6.800000000000001e-06, |
|
"loss": 0.699, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.010688781738281, |
|
"learning_rate": 6.778947368421053e-06, |
|
"loss": 0.7803, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.5098397731781006, |
|
"learning_rate": 6.7578947368421054e-06, |
|
"loss": 0.767, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.8980441093444824, |
|
"learning_rate": 6.736842105263158e-06, |
|
"loss": 0.8084, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.8058199882507324, |
|
"learning_rate": 6.715789473684211e-06, |
|
"loss": 0.7214, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.3668529987335205, |
|
"learning_rate": 6.694736842105264e-06, |
|
"loss": 0.6759, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.715735912322998, |
|
"learning_rate": 6.673684210526317e-06, |
|
"loss": 0.7747, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 8.902985572814941, |
|
"learning_rate": 6.6526315789473695e-06, |
|
"loss": 0.8256, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.802920818328857, |
|
"learning_rate": 6.631578947368421e-06, |
|
"loss": 0.7682, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 9.218498229980469, |
|
"learning_rate": 6.610526315789474e-06, |
|
"loss": 0.7855, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.406294822692871, |
|
"learning_rate": 6.589473684210527e-06, |
|
"loss": 0.736, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.765889644622803, |
|
"learning_rate": 6.568421052631579e-06, |
|
"loss": 0.7073, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.910264015197754, |
|
"learning_rate": 6.547368421052632e-06, |
|
"loss": 0.7328, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 9.011739730834961, |
|
"learning_rate": 6.526315789473685e-06, |
|
"loss": 0.6798, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 8.296028137207031, |
|
"learning_rate": 6.505263157894738e-06, |
|
"loss": 0.7469, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.347682952880859, |
|
"learning_rate": 6.484210526315789e-06, |
|
"loss": 0.7143, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 5.903685092926025, |
|
"learning_rate": 6.463157894736843e-06, |
|
"loss": 0.7413, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.017665386199951, |
|
"learning_rate": 6.442105263157895e-06, |
|
"loss": 0.7569, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.3947088718414307, |
|
"learning_rate": 6.421052631578948e-06, |
|
"loss": 0.75, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.019251823425293, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.7364, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.439628839492798, |
|
"learning_rate": 6.378947368421053e-06, |
|
"loss": 0.68, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.413942575454712, |
|
"learning_rate": 6.357894736842106e-06, |
|
"loss": 0.79, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 8.72237491607666, |
|
"learning_rate": 6.336842105263158e-06, |
|
"loss": 0.6678, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 3.9021055698394775, |
|
"learning_rate": 6.31578947368421e-06, |
|
"loss": 0.7169, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 0.7889605164527893, |
|
"eval_runtime": 67.8704, |
|
"eval_samples_per_second": 14.734, |
|
"eval_steps_per_second": 14.734, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 8.238909721374512, |
|
"learning_rate": 6.294736842105264e-06, |
|
"loss": 0.658, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.403461456298828, |
|
"learning_rate": 6.2736842105263165e-06, |
|
"loss": 0.8165, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.648688316345215, |
|
"learning_rate": 6.252631578947369e-06, |
|
"loss": 0.7506, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.380591630935669, |
|
"learning_rate": 6.231578947368422e-06, |
|
"loss": 0.8892, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.201750755310059, |
|
"learning_rate": 6.2105263157894745e-06, |
|
"loss": 0.7069, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.9994821548461914, |
|
"learning_rate": 6.189473684210526e-06, |
|
"loss": 0.6896, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.100094318389893, |
|
"learning_rate": 6.168421052631579e-06, |
|
"loss": 0.6241, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.88962721824646, |
|
"learning_rate": 6.1473684210526316e-06, |
|
"loss": 0.741, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.669283151626587, |
|
"learning_rate": 6.126315789473685e-06, |
|
"loss": 0.5153, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.010345458984375, |
|
"learning_rate": 6.105263157894738e-06, |
|
"loss": 0.7394, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.333982467651367, |
|
"learning_rate": 6.08421052631579e-06, |
|
"loss": 0.6423, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.0060064792633057, |
|
"learning_rate": 6.063157894736843e-06, |
|
"loss": 0.7073, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.618821144104004, |
|
"learning_rate": 6.042105263157895e-06, |
|
"loss": 0.7221, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.6231422424316406, |
|
"learning_rate": 6.0210526315789475e-06, |
|
"loss": 0.6748, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 7.207015514373779, |
|
"learning_rate": 6e-06, |
|
"loss": 0.7403, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.1877031326293945, |
|
"learning_rate": 5.978947368421053e-06, |
|
"loss": 0.6143, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.433973550796509, |
|
"learning_rate": 5.9578947368421055e-06, |
|
"loss": 0.6593, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.261890888214111, |
|
"learning_rate": 5.936842105263159e-06, |
|
"loss": 0.7119, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.4731180667877197, |
|
"learning_rate": 5.915789473684212e-06, |
|
"loss": 0.7764, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.540252923965454, |
|
"learning_rate": 5.8947368421052634e-06, |
|
"loss": 0.788, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 16.481884002685547, |
|
"learning_rate": 5.873684210526316e-06, |
|
"loss": 0.7411, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.3406548500061035, |
|
"learning_rate": 5.852631578947369e-06, |
|
"loss": 0.7703, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.786658763885498, |
|
"learning_rate": 5.831578947368421e-06, |
|
"loss": 0.7068, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.659720420837402, |
|
"learning_rate": 5.810526315789474e-06, |
|
"loss": 0.7287, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.9273788928985596, |
|
"learning_rate": 5.789473684210527e-06, |
|
"loss": 0.7059, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.475671768188477, |
|
"learning_rate": 5.76842105263158e-06, |
|
"loss": 0.7284, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.699868202209473, |
|
"learning_rate": 5.747368421052633e-06, |
|
"loss": 0.8036, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.518573045730591, |
|
"learning_rate": 5.726315789473685e-06, |
|
"loss": 0.7209, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 8.151999473571777, |
|
"learning_rate": 5.705263157894737e-06, |
|
"loss": 0.6903, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.088874340057373, |
|
"learning_rate": 5.68421052631579e-06, |
|
"loss": 0.7685, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 9.118200302124023, |
|
"learning_rate": 5.663157894736843e-06, |
|
"loss": 0.7256, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.765544414520264, |
|
"learning_rate": 5.642105263157895e-06, |
|
"loss": 0.8016, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 11.424837112426758, |
|
"learning_rate": 5.621052631578948e-06, |
|
"loss": 0.7721, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.862210750579834, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 0.6898, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.197153568267822, |
|
"learning_rate": 5.578947368421052e-06, |
|
"loss": 0.6907, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.712553977966309, |
|
"learning_rate": 5.557894736842105e-06, |
|
"loss": 0.773, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.968278408050537, |
|
"learning_rate": 5.5368421052631586e-06, |
|
"loss": 0.7892, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 3.8882153034210205, |
|
"learning_rate": 5.515789473684211e-06, |
|
"loss": 0.8365, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 6.6297197341918945, |
|
"learning_rate": 5.494736842105264e-06, |
|
"loss": 0.7374, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.2362327575683594, |
|
"learning_rate": 5.4736842105263165e-06, |
|
"loss": 0.8293, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.1008100509643555, |
|
"learning_rate": 5.452631578947369e-06, |
|
"loss": 0.7048, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.488921642303467, |
|
"learning_rate": 5.431578947368421e-06, |
|
"loss": 0.7902, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.7497622966766357, |
|
"learning_rate": 5.410526315789474e-06, |
|
"loss": 0.8359, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.192277193069458, |
|
"learning_rate": 5.389473684210526e-06, |
|
"loss": 0.7253, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.586243629455566, |
|
"learning_rate": 5.36842105263158e-06, |
|
"loss": 0.7588, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.424870729446411, |
|
"learning_rate": 5.3473684210526325e-06, |
|
"loss": 0.7268, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 28.807186126708984, |
|
"learning_rate": 5.326315789473685e-06, |
|
"loss": 0.7979, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.297823905944824, |
|
"learning_rate": 5.305263157894738e-06, |
|
"loss": 0.768, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.891976833343506, |
|
"learning_rate": 5.2842105263157896e-06, |
|
"loss": 0.7063, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.083632469177246, |
|
"learning_rate": 5.263157894736842e-06, |
|
"loss": 0.8102, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 0.7507393956184387, |
|
"eval_runtime": 67.8717, |
|
"eval_samples_per_second": 14.734, |
|
"eval_steps_per_second": 14.734, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 10.315424919128418, |
|
"learning_rate": 5.242105263157895e-06, |
|
"loss": 0.736, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.440877676010132, |
|
"learning_rate": 5.2210526315789475e-06, |
|
"loss": 0.799, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.361064910888672, |
|
"learning_rate": 5.2e-06, |
|
"loss": 0.832, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.1224961280822754, |
|
"learning_rate": 5.178947368421054e-06, |
|
"loss": 0.7118, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.9322614669799805, |
|
"learning_rate": 5.157894736842106e-06, |
|
"loss": 0.6614, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.812900066375732, |
|
"learning_rate": 5.136842105263158e-06, |
|
"loss": 0.8002, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.411820411682129, |
|
"learning_rate": 5.115789473684211e-06, |
|
"loss": 0.835, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.406981468200684, |
|
"learning_rate": 5.0947368421052635e-06, |
|
"loss": 0.8384, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.32007360458374, |
|
"learning_rate": 5.073684210526316e-06, |
|
"loss": 0.5798, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.640589714050293, |
|
"learning_rate": 5.052631578947369e-06, |
|
"loss": 0.7896, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.4717936515808105, |
|
"learning_rate": 5.0315789473684214e-06, |
|
"loss": 0.7829, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.995558261871338, |
|
"learning_rate": 5.010526315789475e-06, |
|
"loss": 0.7322, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.5911152362823486, |
|
"learning_rate": 4.989473684210527e-06, |
|
"loss": 0.7727, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.3829457759857178, |
|
"learning_rate": 4.968421052631579e-06, |
|
"loss": 0.7178, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.157157897949219, |
|
"learning_rate": 4.947368421052632e-06, |
|
"loss": 0.7241, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.205902099609375, |
|
"learning_rate": 4.926315789473685e-06, |
|
"loss": 0.7831, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.92594051361084, |
|
"learning_rate": 4.905263157894737e-06, |
|
"loss": 0.8057, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.578032493591309, |
|
"learning_rate": 4.88421052631579e-06, |
|
"loss": 0.8011, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 6.8539605140686035, |
|
"learning_rate": 4.863157894736843e-06, |
|
"loss": 0.7792, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 7.954685211181641, |
|
"learning_rate": 4.842105263157895e-06, |
|
"loss": 0.6691, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.0253312587738037, |
|
"learning_rate": 4.821052631578948e-06, |
|
"loss": 0.6483, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 8.230294227600098, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.8076, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.5444509983062744, |
|
"learning_rate": 4.778947368421053e-06, |
|
"loss": 0.7902, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.8759273290634155, |
|
"learning_rate": 4.757894736842106e-06, |
|
"loss": 0.7308, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 5.69119930267334, |
|
"learning_rate": 4.736842105263158e-06, |
|
"loss": 0.6605, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 7.020988941192627, |
|
"learning_rate": 4.71578947368421e-06, |
|
"loss": 0.7678, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.7685866355896, |
|
"learning_rate": 4.694736842105264e-06, |
|
"loss": 0.8022, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.516789436340332, |
|
"learning_rate": 4.6736842105263166e-06, |
|
"loss": 0.6176, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.267387866973877, |
|
"learning_rate": 4.652631578947368e-06, |
|
"loss": 0.6487, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.96762228012085, |
|
"learning_rate": 4.631578947368421e-06, |
|
"loss": 0.7066, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.345110893249512, |
|
"learning_rate": 4.6105263157894745e-06, |
|
"loss": 0.6072, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 10.33462142944336, |
|
"learning_rate": 4.589473684210526e-06, |
|
"loss": 0.8211, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.632289409637451, |
|
"learning_rate": 4.568421052631579e-06, |
|
"loss": 0.8335, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.453967094421387, |
|
"learning_rate": 4.547368421052632e-06, |
|
"loss": 0.8331, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.877091407775879, |
|
"learning_rate": 4.526315789473685e-06, |
|
"loss": 0.6793, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 16.41980743408203, |
|
"learning_rate": 4.505263157894737e-06, |
|
"loss": 0.819, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.1915693283081055, |
|
"learning_rate": 4.48421052631579e-06, |
|
"loss": 0.7217, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.805244445800781, |
|
"learning_rate": 4.463157894736842e-06, |
|
"loss": 0.7146, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.697472333908081, |
|
"learning_rate": 4.442105263157896e-06, |
|
"loss": 0.6748, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.6001346111297607, |
|
"learning_rate": 4.4210526315789476e-06, |
|
"loss": 0.6972, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.912445545196533, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.7157, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 6.9912309646606445, |
|
"learning_rate": 4.378947368421053e-06, |
|
"loss": 0.5927, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.380290985107422, |
|
"learning_rate": 4.3578947368421055e-06, |
|
"loss": 0.699, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.024576663970947, |
|
"learning_rate": 4.336842105263158e-06, |
|
"loss": 0.8156, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.523719310760498, |
|
"learning_rate": 4.315789473684211e-06, |
|
"loss": 0.7827, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 10.055171966552734, |
|
"learning_rate": 4.2947368421052635e-06, |
|
"loss": 0.7142, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 7.437203407287598, |
|
"learning_rate": 4.273684210526316e-06, |
|
"loss": 0.7184, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.6910207271575928, |
|
"learning_rate": 4.252631578947369e-06, |
|
"loss": 0.7311, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 12.729212760925293, |
|
"learning_rate": 4.2315789473684215e-06, |
|
"loss": 0.7629, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.817344903945923, |
|
"learning_rate": 4.210526315789474e-06, |
|
"loss": 0.8676, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.7396635413169861, |
|
"eval_runtime": 67.9126, |
|
"eval_samples_per_second": 14.725, |
|
"eval_steps_per_second": 14.725, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.193355083465576, |
|
"learning_rate": 4.189473684210527e-06, |
|
"loss": 0.7036, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.617652177810669, |
|
"learning_rate": 4.1684210526315794e-06, |
|
"loss": 0.6547, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.48286771774292, |
|
"learning_rate": 4.147368421052632e-06, |
|
"loss": 0.6756, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.939229965209961, |
|
"learning_rate": 4.126315789473685e-06, |
|
"loss": 0.7157, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 14.387231826782227, |
|
"learning_rate": 4.105263157894737e-06, |
|
"loss": 0.8052, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.042211055755615, |
|
"learning_rate": 4.08421052631579e-06, |
|
"loss": 0.6733, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 6.068091869354248, |
|
"learning_rate": 4.063157894736842e-06, |
|
"loss": 0.6172, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.004486083984375, |
|
"learning_rate": 4.042105263157895e-06, |
|
"loss": 0.7888, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.651116847991943, |
|
"learning_rate": 4.021052631578948e-06, |
|
"loss": 0.6979, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.581594944000244, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.7654, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.6030330657958984, |
|
"learning_rate": 3.9789473684210525e-06, |
|
"loss": 0.7946, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.385477542877197, |
|
"learning_rate": 3.957894736842106e-06, |
|
"loss": 0.7785, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.688074588775635, |
|
"learning_rate": 3.936842105263159e-06, |
|
"loss": 0.7762, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.7027924060821533, |
|
"learning_rate": 3.9157894736842104e-06, |
|
"loss": 0.6933, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.239694118499756, |
|
"learning_rate": 3.894736842105263e-06, |
|
"loss": 0.8061, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.3939032554626465, |
|
"learning_rate": 3.873684210526316e-06, |
|
"loss": 0.7537, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 5.115386962890625, |
|
"learning_rate": 3.852631578947369e-06, |
|
"loss": 0.7025, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.546750545501709, |
|
"learning_rate": 3.831578947368421e-06, |
|
"loss": 0.7108, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.043384552001953, |
|
"learning_rate": 3.810526315789474e-06, |
|
"loss": 0.7506, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.8117778301239014, |
|
"learning_rate": 3.789473684210527e-06, |
|
"loss": 0.773, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.000233173370361, |
|
"learning_rate": 3.768421052631579e-06, |
|
"loss": 0.6902, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.7739787101745605, |
|
"learning_rate": 3.7473684210526317e-06, |
|
"loss": 0.6397, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.948480129241943, |
|
"learning_rate": 3.7263157894736848e-06, |
|
"loss": 0.6185, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.269702434539795, |
|
"learning_rate": 3.7052631578947374e-06, |
|
"loss": 0.7487, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.8336634635925293, |
|
"learning_rate": 3.6842105263157896e-06, |
|
"loss": 0.7805, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.896543979644775, |
|
"learning_rate": 3.6631578947368423e-06, |
|
"loss": 0.645, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.051191806793213, |
|
"learning_rate": 3.642105263157895e-06, |
|
"loss": 0.7477, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 24.540451049804688, |
|
"learning_rate": 3.621052631578948e-06, |
|
"loss": 0.8168, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.061807155609131, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 0.727, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.3907368183135986, |
|
"learning_rate": 3.578947368421053e-06, |
|
"loss": 0.6614, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.554809093475342, |
|
"learning_rate": 3.5578947368421056e-06, |
|
"loss": 0.6947, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.7383534908294678, |
|
"learning_rate": 3.536842105263158e-06, |
|
"loss": 0.6171, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.406937122344971, |
|
"learning_rate": 3.515789473684211e-06, |
|
"loss": 0.6102, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.226219654083252, |
|
"learning_rate": 3.4947368421052635e-06, |
|
"loss": 0.7746, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.249040126800537, |
|
"learning_rate": 3.473684210526316e-06, |
|
"loss": 0.7158, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.806312084197998, |
|
"learning_rate": 3.4526315789473684e-06, |
|
"loss": 0.7249, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.993473529815674, |
|
"learning_rate": 3.4315789473684215e-06, |
|
"loss": 0.826, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.120741367340088, |
|
"learning_rate": 3.410526315789474e-06, |
|
"loss": 0.6238, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.020960807800293, |
|
"learning_rate": 3.3894736842105264e-06, |
|
"loss": 0.6749, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.000002384185791, |
|
"learning_rate": 3.368421052631579e-06, |
|
"loss": 0.7652, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 8.221445083618164, |
|
"learning_rate": 3.347368421052632e-06, |
|
"loss": 0.7781, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.850223541259766, |
|
"learning_rate": 3.3263157894736848e-06, |
|
"loss": 0.7555, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.249915838241577, |
|
"learning_rate": 3.305263157894737e-06, |
|
"loss": 0.7305, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.955141067504883, |
|
"learning_rate": 3.2842105263157897e-06, |
|
"loss": 0.6817, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.4711403846740723, |
|
"learning_rate": 3.2631578947368423e-06, |
|
"loss": 0.683, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.367486953735352, |
|
"learning_rate": 3.2421052631578945e-06, |
|
"loss": 0.6494, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.283465623855591, |
|
"learning_rate": 3.2210526315789476e-06, |
|
"loss": 0.6092, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.473137855529785, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.676, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.177180528640747, |
|
"learning_rate": 3.178947368421053e-06, |
|
"loss": 0.6685, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.735683441162109, |
|
"learning_rate": 3.157894736842105e-06, |
|
"loss": 0.7544, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 0.7582711577415466, |
|
"eval_runtime": 67.8631, |
|
"eval_samples_per_second": 14.736, |
|
"eval_steps_per_second": 14.736, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.465471267700195, |
|
"learning_rate": 3.1368421052631582e-06, |
|
"loss": 0.8191, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.8849751949310303, |
|
"learning_rate": 3.115789473684211e-06, |
|
"loss": 0.7078, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.555447101593018, |
|
"learning_rate": 3.094736842105263e-06, |
|
"loss": 0.7332, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.269344806671143, |
|
"learning_rate": 3.0736842105263158e-06, |
|
"loss": 0.7619, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.792567729949951, |
|
"learning_rate": 3.052631578947369e-06, |
|
"loss": 0.6858, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.095942974090576, |
|
"learning_rate": 3.0315789473684215e-06, |
|
"loss": 0.7793, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.316791296005249, |
|
"learning_rate": 3.0105263157894737e-06, |
|
"loss": 0.666, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.55336332321167, |
|
"learning_rate": 2.9894736842105264e-06, |
|
"loss": 0.7723, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 7.5306315422058105, |
|
"learning_rate": 2.9684210526315795e-06, |
|
"loss": 0.7283, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 3.935115337371826, |
|
"learning_rate": 2.9473684210526317e-06, |
|
"loss": 0.7843, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.173915863037109, |
|
"learning_rate": 2.9263157894736844e-06, |
|
"loss": 0.6662, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.5214264392852783, |
|
"learning_rate": 2.905263157894737e-06, |
|
"loss": 0.6887, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.139004707336426, |
|
"learning_rate": 2.88421052631579e-06, |
|
"loss": 0.6778, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.185042381286621, |
|
"learning_rate": 2.8631578947368423e-06, |
|
"loss": 0.9094, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.3607513904571533, |
|
"learning_rate": 2.842105263157895e-06, |
|
"loss": 0.7918, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.062870502471924, |
|
"learning_rate": 2.8210526315789476e-06, |
|
"loss": 0.7694, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.099003791809082, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 0.7301, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.512063026428223, |
|
"learning_rate": 2.7789473684210525e-06, |
|
"loss": 0.7887, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.625652551651001, |
|
"learning_rate": 2.7578947368421056e-06, |
|
"loss": 0.7781, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.8921008110046387, |
|
"learning_rate": 2.7368421052631583e-06, |
|
"loss": 0.7582, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 10.71945571899414, |
|
"learning_rate": 2.7157894736842105e-06, |
|
"loss": 0.7234, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 17.737136840820312, |
|
"learning_rate": 2.694736842105263e-06, |
|
"loss": 0.6298, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 9.8464994430542, |
|
"learning_rate": 2.6736842105263162e-06, |
|
"loss": 0.7856, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 7.925550937652588, |
|
"learning_rate": 2.652631578947369e-06, |
|
"loss": 0.8387, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.530381441116333, |
|
"learning_rate": 2.631578947368421e-06, |
|
"loss": 0.8223, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.403299808502197, |
|
"learning_rate": 2.6105263157894738e-06, |
|
"loss": 0.8079, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.1753740310668945, |
|
"learning_rate": 2.589473684210527e-06, |
|
"loss": 0.7888, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.760190725326538, |
|
"learning_rate": 2.568421052631579e-06, |
|
"loss": 0.7071, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.183119297027588, |
|
"learning_rate": 2.5473684210526317e-06, |
|
"loss": 0.619, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.66708517074585, |
|
"learning_rate": 2.5263157894736844e-06, |
|
"loss": 0.7888, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.3660988807678223, |
|
"learning_rate": 2.5052631578947375e-06, |
|
"loss": 0.7466, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.8384206295013428, |
|
"learning_rate": 2.4842105263157897e-06, |
|
"loss": 0.7371, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.593717336654663, |
|
"learning_rate": 2.4631578947368424e-06, |
|
"loss": 0.5967, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.778346538543701, |
|
"learning_rate": 2.442105263157895e-06, |
|
"loss": 0.6407, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 10.841148376464844, |
|
"learning_rate": 2.4210526315789477e-06, |
|
"loss": 0.8172, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.635694980621338, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.8135, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.5510995388031006, |
|
"learning_rate": 2.378947368421053e-06, |
|
"loss": 0.8328, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.770972967147827, |
|
"learning_rate": 2.357894736842105e-06, |
|
"loss": 0.6642, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.756451606750488, |
|
"learning_rate": 2.3368421052631583e-06, |
|
"loss": 0.7484, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.9202377796173096, |
|
"learning_rate": 2.3157894736842105e-06, |
|
"loss": 0.7381, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.43782377243042, |
|
"learning_rate": 2.294736842105263e-06, |
|
"loss": 0.7915, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 20.496152877807617, |
|
"learning_rate": 2.273684210526316e-06, |
|
"loss": 0.6872, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.2591583728790283, |
|
"learning_rate": 2.2526315789473685e-06, |
|
"loss": 0.668, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.23056960105896, |
|
"learning_rate": 2.231578947368421e-06, |
|
"loss": 0.6229, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 5.419168949127197, |
|
"learning_rate": 2.2105263157894738e-06, |
|
"loss": 0.9534, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 15.681089401245117, |
|
"learning_rate": 2.1894736842105264e-06, |
|
"loss": 0.782, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.7693331241607666, |
|
"learning_rate": 2.168421052631579e-06, |
|
"loss": 0.8047, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.4705393314361572, |
|
"learning_rate": 2.1473684210526317e-06, |
|
"loss": 0.7832, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 4.295872688293457, |
|
"learning_rate": 2.1263157894736844e-06, |
|
"loss": 0.7355, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.0480620861053467, |
|
"learning_rate": 2.105263157894737e-06, |
|
"loss": 0.6739, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 0.7442497611045837, |
|
"eval_runtime": 67.8767, |
|
"eval_samples_per_second": 14.733, |
|
"eval_steps_per_second": 14.733, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.9723927974700928, |
|
"learning_rate": 2.0842105263157897e-06, |
|
"loss": 0.7003, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.0932421684265137, |
|
"learning_rate": 2.0631578947368424e-06, |
|
"loss": 0.6897, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.70625114440918, |
|
"learning_rate": 2.042105263157895e-06, |
|
"loss": 0.8106, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.2763564586639404, |
|
"learning_rate": 2.0210526315789477e-06, |
|
"loss": 0.7387, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.553431034088135, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.7435, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.36479377746582, |
|
"learning_rate": 1.978947368421053e-06, |
|
"loss": 0.7713, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.923874855041504, |
|
"learning_rate": 1.9578947368421052e-06, |
|
"loss": 0.5508, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 8.63404655456543, |
|
"learning_rate": 1.936842105263158e-06, |
|
"loss": 0.7323, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.521135330200195, |
|
"learning_rate": 1.9157894736842105e-06, |
|
"loss": 0.699, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 9.009405136108398, |
|
"learning_rate": 1.8947368421052634e-06, |
|
"loss": 0.789, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 12.834007263183594, |
|
"learning_rate": 1.8736842105263158e-06, |
|
"loss": 0.7382, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.753262758255005, |
|
"learning_rate": 1.8526315789473687e-06, |
|
"loss": 0.7035, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.300708770751953, |
|
"learning_rate": 1.8315789473684211e-06, |
|
"loss": 0.7558, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.416452884674072, |
|
"learning_rate": 1.810526315789474e-06, |
|
"loss": 0.6854, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 7.664788722991943, |
|
"learning_rate": 1.7894736842105265e-06, |
|
"loss": 0.6951, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.646073818206787, |
|
"learning_rate": 1.768421052631579e-06, |
|
"loss": 0.7472, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.125991106033325, |
|
"learning_rate": 1.7473684210526318e-06, |
|
"loss": 0.6711, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.308753967285156, |
|
"learning_rate": 1.7263157894736842e-06, |
|
"loss": 0.6393, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 11.79830265045166, |
|
"learning_rate": 1.705263157894737e-06, |
|
"loss": 0.7358, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.862399101257324, |
|
"learning_rate": 1.6842105263157895e-06, |
|
"loss": 0.8422, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.3199968338012695, |
|
"learning_rate": 1.6631578947368424e-06, |
|
"loss": 0.6999, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.263275146484375, |
|
"learning_rate": 1.6421052631578948e-06, |
|
"loss": 0.7122, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.283051490783691, |
|
"learning_rate": 1.6210526315789473e-06, |
|
"loss": 0.7793, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.0055785179138184, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.732, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.184137344360352, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 0.7339, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.587636709213257, |
|
"learning_rate": 1.5578947368421054e-06, |
|
"loss": 0.8473, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 8.189043045043945, |
|
"learning_rate": 1.5368421052631579e-06, |
|
"loss": 0.6498, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.4272284507751465, |
|
"learning_rate": 1.5157894736842108e-06, |
|
"loss": 0.7676, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.280287027359009, |
|
"learning_rate": 1.4947368421052632e-06, |
|
"loss": 0.6283, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 8.722474098205566, |
|
"learning_rate": 1.4736842105263159e-06, |
|
"loss": 0.7555, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.574818134307861, |
|
"learning_rate": 1.4526315789473685e-06, |
|
"loss": 0.7481, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 3.0097527503967285, |
|
"learning_rate": 1.4315789473684212e-06, |
|
"loss": 0.6181, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.725505352020264, |
|
"learning_rate": 1.4105263157894738e-06, |
|
"loss": 0.677, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.934959888458252, |
|
"learning_rate": 1.3894736842105263e-06, |
|
"loss": 0.6932, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.7491650581359863, |
|
"learning_rate": 1.3684210526315791e-06, |
|
"loss": 0.7361, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.734315872192383, |
|
"learning_rate": 1.3473684210526316e-06, |
|
"loss": 0.6442, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.301790714263916, |
|
"learning_rate": 1.3263157894736844e-06, |
|
"loss": 0.7642, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.042958736419678, |
|
"learning_rate": 1.3052631578947369e-06, |
|
"loss": 0.7974, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.941096782684326, |
|
"learning_rate": 1.2842105263157895e-06, |
|
"loss": 0.8603, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.379117488861084, |
|
"learning_rate": 1.2631578947368422e-06, |
|
"loss": 0.8297, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.3129048347473145, |
|
"learning_rate": 1.2421052631578948e-06, |
|
"loss": 0.7783, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.5439133644104, |
|
"learning_rate": 1.2210526315789475e-06, |
|
"loss": 0.8122, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.480744361877441, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.7779, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.862485408782959, |
|
"learning_rate": 1.1789473684210526e-06, |
|
"loss": 0.6917, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.7247443199157715, |
|
"learning_rate": 1.1578947368421053e-06, |
|
"loss": 0.7017, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 8.194451332092285, |
|
"learning_rate": 1.136842105263158e-06, |
|
"loss": 0.7031, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 8.057929992675781, |
|
"learning_rate": 1.1157894736842106e-06, |
|
"loss": 0.7116, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.529337406158447, |
|
"learning_rate": 1.0947368421052632e-06, |
|
"loss": 0.8314, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 7.412846565246582, |
|
"learning_rate": 1.0736842105263159e-06, |
|
"loss": 0.6448, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.7076497077941895, |
|
"learning_rate": 1.0526315789473685e-06, |
|
"loss": 0.6291, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 0.7395394444465637, |
|
"eval_runtime": 67.8841, |
|
"eval_samples_per_second": 14.731, |
|
"eval_steps_per_second": 14.731, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.488115310668945, |
|
"learning_rate": 1.0315789473684212e-06, |
|
"loss": 0.8611, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.6314383745193481, |
|
"learning_rate": 1.0105263157894738e-06, |
|
"loss": 0.7694, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.290372848510742, |
|
"learning_rate": 9.894736842105265e-07, |
|
"loss": 0.7166, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.1572625637054443, |
|
"learning_rate": 9.68421052631579e-07, |
|
"loss": 0.7649, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.951930999755859, |
|
"learning_rate": 9.473684210526317e-07, |
|
"loss": 0.7057, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.696636199951172, |
|
"learning_rate": 9.263157894736844e-07, |
|
"loss": 0.7853, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.211262226104736, |
|
"learning_rate": 9.05263157894737e-07, |
|
"loss": 0.6612, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.584897041320801, |
|
"learning_rate": 8.842105263157895e-07, |
|
"loss": 0.6393, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.64282751083374, |
|
"learning_rate": 8.631578947368421e-07, |
|
"loss": 0.7915, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.691389799118042, |
|
"learning_rate": 8.421052631578948e-07, |
|
"loss": 0.659, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.740243911743164, |
|
"learning_rate": 8.210526315789474e-07, |
|
"loss": 0.7134, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.811493873596191, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 0.8592, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.2056334018707275, |
|
"learning_rate": 7.789473684210527e-07, |
|
"loss": 0.6753, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.347885608673096, |
|
"learning_rate": 7.578947368421054e-07, |
|
"loss": 0.7476, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.63771915435791, |
|
"learning_rate": 7.368421052631579e-07, |
|
"loss": 0.7649, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.062124013900757, |
|
"learning_rate": 7.157894736842106e-07, |
|
"loss": 0.6792, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 9.334321022033691, |
|
"learning_rate": 6.947368421052631e-07, |
|
"loss": 0.7626, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 7.429685115814209, |
|
"learning_rate": 6.736842105263158e-07, |
|
"loss": 0.6943, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.459277629852295, |
|
"learning_rate": 6.526315789473684e-07, |
|
"loss": 0.7838, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.821927070617676, |
|
"learning_rate": 6.315789473684211e-07, |
|
"loss": 0.7103, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 10.438909530639648, |
|
"learning_rate": 6.105263157894738e-07, |
|
"loss": 0.7509, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 11.55811882019043, |
|
"learning_rate": 5.894736842105263e-07, |
|
"loss": 0.7623, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.1809043884277344, |
|
"learning_rate": 5.68421052631579e-07, |
|
"loss": 0.6294, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.337337970733643, |
|
"learning_rate": 5.473684210526316e-07, |
|
"loss": 0.763, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 8.130523681640625, |
|
"learning_rate": 5.263157894736843e-07, |
|
"loss": 0.6404, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.213668346405029, |
|
"learning_rate": 5.052631578947369e-07, |
|
"loss": 0.7379, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 3.8605246543884277, |
|
"learning_rate": 4.842105263157895e-07, |
|
"loss": 0.7483, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.358519077301025, |
|
"learning_rate": 4.631578947368422e-07, |
|
"loss": 0.6823, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.9712955951690674, |
|
"learning_rate": 4.421052631578947e-07, |
|
"loss": 0.679, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.285613059997559, |
|
"learning_rate": 4.210526315789474e-07, |
|
"loss": 0.7763, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.434277296066284, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 0.8558, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 7.880703449249268, |
|
"learning_rate": 3.789473684210527e-07, |
|
"loss": 0.7494, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 11.698799133300781, |
|
"learning_rate": 3.578947368421053e-07, |
|
"loss": 0.6576, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.2752954959869385, |
|
"learning_rate": 3.368421052631579e-07, |
|
"loss": 0.6494, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.878567934036255, |
|
"learning_rate": 3.1578947368421055e-07, |
|
"loss": 0.6781, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.6086246967315674, |
|
"learning_rate": 2.9473684210526315e-07, |
|
"loss": 0.7339, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.403782844543457, |
|
"learning_rate": 2.736842105263158e-07, |
|
"loss": 0.7738, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.487565994262695, |
|
"learning_rate": 2.5263157894736846e-07, |
|
"loss": 0.8165, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.29118537902832, |
|
"learning_rate": 2.315789473684211e-07, |
|
"loss": 0.6272, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.634309768676758, |
|
"learning_rate": 2.105263157894737e-07, |
|
"loss": 0.6641, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.989073276519775, |
|
"learning_rate": 1.8947368421052634e-07, |
|
"loss": 0.7111, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.606556415557861, |
|
"learning_rate": 1.6842105263157895e-07, |
|
"loss": 0.6112, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 5.012443542480469, |
|
"learning_rate": 1.4736842105263158e-07, |
|
"loss": 0.6684, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 6.287766933441162, |
|
"learning_rate": 1.2631578947368423e-07, |
|
"loss": 0.6687, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.646402597427368, |
|
"learning_rate": 1.0526315789473685e-07, |
|
"loss": 0.6452, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 7.9046950340271, |
|
"learning_rate": 8.421052631578947e-08, |
|
"loss": 0.7636, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.733578681945801, |
|
"learning_rate": 6.315789473684211e-08, |
|
"loss": 0.6619, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.342442274093628, |
|
"learning_rate": 4.2105263157894737e-08, |
|
"loss": 0.74, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 4.0832839012146, |
|
"learning_rate": 2.1052631578947368e-08, |
|
"loss": 0.7314, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.517941951751709, |
|
"learning_rate": 0.0, |
|
"loss": 0.755, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.7402730584144592, |
|
"eval_runtime": 67.899, |
|
"eval_samples_per_second": 14.728, |
|
"eval_steps_per_second": 14.728, |
|
"step": 10000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 2500, |
|
"total_flos": 1.6102125993984e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|