{
  "best_metric": 0.6554008152173914,
  "best_model_checkpoint": "demo_LID_ntu-spml_distilhubert/checkpoint-6930",
  "epoch": 9.99891891891892,
  "eval_steps": 500,
  "global_step": 6930,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014414414414414415,
      "grad_norm": 2.169387102127075,
      "learning_rate": 4.329004329004329e-06,
      "loss": 15.2197,
      "step": 10
    },
    {
      "epoch": 0.02882882882882883,
      "grad_norm": 2.4440665245056152,
      "learning_rate": 8.658008658008657e-06,
      "loss": 15.2046,
      "step": 20
    },
    {
      "epoch": 0.043243243243243246,
      "grad_norm": 1.9768311977386475,
      "learning_rate": 1.2987012987012986e-05,
      "loss": 15.2027,
      "step": 30
    },
    {
      "epoch": 0.05765765765765766,
      "grad_norm": 2.598134994506836,
      "learning_rate": 1.7316017316017315e-05,
      "loss": 15.1842,
      "step": 40
    },
    {
      "epoch": 0.07207207207207207,
      "grad_norm": 2.2137622833251953,
      "learning_rate": 2.164502164502164e-05,
      "loss": 15.1876,
      "step": 50
    },
    {
      "epoch": 0.08648648648648649,
      "grad_norm": 2.252912759780884,
      "learning_rate": 2.5974025974025972e-05,
      "loss": 15.172,
      "step": 60
    },
    {
      "epoch": 0.1009009009009009,
      "grad_norm": 2.699625015258789,
      "learning_rate": 3.03030303030303e-05,
      "loss": 15.1004,
      "step": 70
    },
    {
      "epoch": 0.11531531531531532,
      "grad_norm": 2.774757146835327,
      "learning_rate": 3.463203463203463e-05,
      "loss": 15.0877,
      "step": 80
    },
    {
      "epoch": 0.12972972972972974,
      "grad_norm": 2.9454381465911865,
      "learning_rate": 3.896103896103895e-05,
      "loss": 15.0704,
      "step": 90
    },
    {
      "epoch": 0.14414414414414414,
      "grad_norm": 3.3984997272491455,
      "learning_rate": 4.329004329004328e-05,
      "loss": 15.0211,
      "step": 100
    },
    {
      "epoch": 0.15855855855855855,
      "grad_norm": 3.1876633167266846,
      "learning_rate": 4.7619047619047614e-05,
      "loss": 14.9973,
      "step": 110
    },
    {
      "epoch": 0.17297297297297298,
      "grad_norm": 3.87903094291687,
      "learning_rate": 5.1948051948051944e-05,
      "loss": 14.9534,
      "step": 120
    },
    {
      "epoch": 0.1873873873873874,
      "grad_norm": 4.1114983558654785,
      "learning_rate": 5.627705627705627e-05,
      "loss": 14.8464,
      "step": 130
    },
    {
      "epoch": 0.2018018018018018,
      "grad_norm": 3.837207078933716,
      "learning_rate": 6.06060606060606e-05,
      "loss": 14.8935,
      "step": 140
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 3.978295087814331,
      "learning_rate": 6.493506493506494e-05,
      "loss": 14.8274,
      "step": 150
    },
    {
      "epoch": 0.23063063063063063,
      "grad_norm": 5.2168145179748535,
      "learning_rate": 6.926406926406926e-05,
      "loss": 14.7065,
      "step": 160
    },
    {
      "epoch": 0.24504504504504504,
      "grad_norm": 5.752880096435547,
      "learning_rate": 7.359307359307358e-05,
      "loss": 14.6178,
      "step": 170
    },
    {
      "epoch": 0.2594594594594595,
      "grad_norm": 6.018016338348389,
      "learning_rate": 7.79220779220779e-05,
      "loss": 14.4008,
      "step": 180
    },
    {
      "epoch": 0.27387387387387385,
      "grad_norm": 5.537229537963867,
      "learning_rate": 8.225108225108224e-05,
      "loss": 14.3105,
      "step": 190
    },
    {
      "epoch": 0.2882882882882883,
      "grad_norm": 6.358255863189697,
      "learning_rate": 8.658008658008657e-05,
      "loss": 14.1688,
      "step": 200
    },
    {
      "epoch": 0.3027027027027027,
      "grad_norm": 6.9536356925964355,
      "learning_rate": 9.09090909090909e-05,
      "loss": 14.1205,
      "step": 210
    },
    {
      "epoch": 0.3171171171171171,
      "grad_norm": 8.093494415283203,
      "learning_rate": 9.523809523809523e-05,
      "loss": 14.1292,
      "step": 220
    },
    {
      "epoch": 0.33153153153153153,
      "grad_norm": 6.803300380706787,
      "learning_rate": 9.956709956709956e-05,
      "loss": 13.9276,
      "step": 230
    },
    {
      "epoch": 0.34594594594594597,
      "grad_norm": 6.665808200836182,
      "learning_rate": 0.00010389610389610389,
      "loss": 13.9136,
      "step": 240
    },
    {
      "epoch": 0.36036036036036034,
      "grad_norm": 10.191052436828613,
      "learning_rate": 0.00010822510822510823,
      "loss": 13.708,
      "step": 250
    },
    {
      "epoch": 0.3747747747747748,
      "grad_norm": 7.783840656280518,
      "learning_rate": 0.00011255411255411254,
      "loss": 13.6658,
      "step": 260
    },
    {
      "epoch": 0.3891891891891892,
      "grad_norm": 11.964157104492188,
      "learning_rate": 0.00011688311688311687,
      "loss": 13.6014,
      "step": 270
    },
    {
      "epoch": 0.4036036036036036,
      "grad_norm": 7.828129291534424,
      "learning_rate": 0.0001212121212121212,
      "loss": 13.3956,
      "step": 280
    },
    {
      "epoch": 0.418018018018018,
      "grad_norm": 8.642557144165039,
      "learning_rate": 0.00012554112554112555,
      "loss": 13.4701,
      "step": 290
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 8.499011993408203,
      "learning_rate": 0.00012987012987012987,
      "loss": 13.2608,
      "step": 300
    },
    {
      "epoch": 0.44684684684684683,
      "grad_norm": 9.103832244873047,
      "learning_rate": 0.0001341991341991342,
      "loss": 12.8141,
      "step": 310
    },
    {
      "epoch": 0.46126126126126127,
      "grad_norm": 8.243462562561035,
      "learning_rate": 0.00013852813852813852,
      "loss": 12.8678,
      "step": 320
    },
    {
      "epoch": 0.4756756756756757,
      "grad_norm": 12.445680618286133,
      "learning_rate": 0.00014285714285714284,
      "loss": 12.9204,
      "step": 330
    },
    {
      "epoch": 0.4900900900900901,
      "grad_norm": 10.037951469421387,
      "learning_rate": 0.00014718614718614716,
      "loss": 12.9456,
      "step": 340
    },
    {
      "epoch": 0.5045045045045045,
      "grad_norm": 14.364166259765625,
      "learning_rate": 0.00015151515151515152,
      "loss": 12.5126,
      "step": 350
    },
    {
      "epoch": 0.518918918918919,
      "grad_norm": 10.338336944580078,
      "learning_rate": 0.0001558441558441558,
      "loss": 13.2546,
      "step": 360
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 9.899740219116211,
      "learning_rate": 0.00016017316017316016,
      "loss": 12.3445,
      "step": 370
    },
    {
      "epoch": 0.5477477477477477,
      "grad_norm": 11.309089660644531,
      "learning_rate": 0.00016450216450216449,
      "loss": 12.2799,
      "step": 380
    },
    {
      "epoch": 0.5621621621621622,
      "grad_norm": 11.268434524536133,
      "learning_rate": 0.00016883116883116884,
      "loss": 12.1578,
      "step": 390
    },
    {
      "epoch": 0.5765765765765766,
      "grad_norm": 9.793964385986328,
      "learning_rate": 0.00017316017316017313,
      "loss": 11.9812,
      "step": 400
    },
    {
      "epoch": 0.590990990990991,
      "grad_norm": 11.267273902893066,
      "learning_rate": 0.00017748917748917746,
      "loss": 12.1401,
      "step": 410
    },
    {
      "epoch": 0.6054054054054054,
      "grad_norm": 10.383160591125488,
      "learning_rate": 0.0001818181818181818,
      "loss": 12.0603,
      "step": 420
    },
    {
      "epoch": 0.6198198198198198,
      "grad_norm": 14.343868255615234,
      "learning_rate": 0.00018614718614718616,
      "loss": 11.2182,
      "step": 430
    },
    {
      "epoch": 0.6342342342342342,
      "grad_norm": 13.931622505187988,
      "learning_rate": 0.00019047619047619045,
      "loss": 11.6929,
      "step": 440
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 12.756230354309082,
      "learning_rate": 0.00019480519480519478,
      "loss": 11.9651,
      "step": 450
    },
    {
      "epoch": 0.6630630630630631,
      "grad_norm": 13.018777847290039,
      "learning_rate": 0.00019913419913419913,
      "loss": 11.6416,
      "step": 460
    },
    {
      "epoch": 0.6774774774774774,
      "grad_norm": 13.232623100280762,
      "learning_rate": 0.00020346320346320345,
      "loss": 11.4997,
      "step": 470
    },
    {
      "epoch": 0.6918918918918919,
      "grad_norm": 12.543861389160156,
      "learning_rate": 0.00020779220779220778,
      "loss": 11.5597,
      "step": 480
    },
    {
      "epoch": 0.7063063063063063,
      "grad_norm": 12.517231941223145,
      "learning_rate": 0.0002121212121212121,
      "loss": 11.1162,
      "step": 490
    },
    {
      "epoch": 0.7207207207207207,
      "grad_norm": 14.61859130859375,
      "learning_rate": 0.00021645021645021645,
      "loss": 11.2086,
      "step": 500
    },
    {
      "epoch": 0.7351351351351352,
      "grad_norm": 14.246715545654297,
      "learning_rate": 0.00022077922077922075,
      "loss": 11.2519,
      "step": 510
    },
    {
      "epoch": 0.7495495495495496,
      "grad_norm": 13.88980484008789,
      "learning_rate": 0.00022510822510822507,
      "loss": 10.9391,
      "step": 520
    },
    {
      "epoch": 0.7639639639639639,
      "grad_norm": 14.310384750366211,
      "learning_rate": 0.00022943722943722942,
      "loss": 10.7129,
      "step": 530
    },
    {
      "epoch": 0.7783783783783784,
      "grad_norm": 13.765666007995605,
      "learning_rate": 0.00023376623376623374,
      "loss": 11.218,
      "step": 540
    },
    {
      "epoch": 0.7927927927927928,
      "grad_norm": 17.789613723754883,
      "learning_rate": 0.00023809523809523807,
      "loss": 10.2992,
      "step": 550
    },
    {
      "epoch": 0.8072072072072072,
      "grad_norm": 17.212533950805664,
      "learning_rate": 0.0002424242424242424,
      "loss": 11.1959,
      "step": 560
    },
    {
      "epoch": 0.8216216216216217,
      "grad_norm": 14.872720718383789,
      "learning_rate": 0.00024675324675324674,
      "loss": 9.933,
      "step": 570
    },
    {
      "epoch": 0.836036036036036,
      "grad_norm": 14.751778602600098,
      "learning_rate": 0.0002510822510822511,
      "loss": 10.2721,
      "step": 580
    },
    {
      "epoch": 0.8504504504504504,
      "grad_norm": 13.110413551330566,
      "learning_rate": 0.0002554112554112554,
      "loss": 10.0697,
      "step": 590
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 14.484004020690918,
      "learning_rate": 0.00025974025974025974,
      "loss": 10.6599,
      "step": 600
    },
    {
      "epoch": 0.8792792792792793,
      "grad_norm": 15.150849342346191,
      "learning_rate": 0.00026406926406926404,
      "loss": 10.3077,
      "step": 610
    },
    {
      "epoch": 0.8936936936936937,
      "grad_norm": 19.270540237426758,
      "learning_rate": 0.0002683982683982684,
      "loss": 10.2954,
      "step": 620
    },
    {
      "epoch": 0.9081081081081082,
      "grad_norm": 17.365564346313477,
      "learning_rate": 0.0002727272727272727,
      "loss": 10.2966,
      "step": 630
    },
    {
      "epoch": 0.9225225225225225,
      "grad_norm": 23.610044479370117,
      "learning_rate": 0.00027705627705627703,
      "loss": 9.4401,
      "step": 640
    },
    {
      "epoch": 0.9369369369369369,
      "grad_norm": 16.38220977783203,
      "learning_rate": 0.0002813852813852814,
      "loss": 9.8423,
      "step": 650
    },
    {
      "epoch": 0.9513513513513514,
      "grad_norm": 18.670101165771484,
      "learning_rate": 0.0002857142857142857,
      "loss": 10.2396,
      "step": 660
    },
    {
      "epoch": 0.9657657657657658,
      "grad_norm": 20.733997344970703,
      "learning_rate": 0.00029004329004329003,
      "loss": 9.3347,
      "step": 670
    },
    {
      "epoch": 0.9801801801801802,
      "grad_norm": 18.066375732421875,
      "learning_rate": 0.00029437229437229433,
      "loss": 10.4626,
      "step": 680
    },
    {
      "epoch": 0.9945945945945946,
      "grad_norm": 18.0963191986084,
      "learning_rate": 0.0002987012987012987,
      "loss": 9.6557,
      "step": 690
    },
    {
      "epoch": 0.9989189189189189,
      "eval_accuracy": 0.26137907608695654,
      "eval_loss": 2.65486216545105,
      "eval_runtime": 541.7254,
      "eval_samples_per_second": 10.869,
      "eval_steps_per_second": 10.869,
      "step": 693
    },
    {
      "epoch": 1.01009009009009,
      "grad_norm": 15.17456340789795,
      "learning_rate": 0.00029966329966329963,
      "loss": 10.1474,
      "step": 700
    },
    {
      "epoch": 1.0245045045045045,
      "grad_norm": 19.106407165527344,
      "learning_rate": 0.00029918229918229916,
      "loss": 8.6672,
      "step": 710
    },
    {
      "epoch": 1.038918918918919,
      "grad_norm": 16.296113967895508,
      "learning_rate": 0.0002987012987012987,
      "loss": 8.7251,
      "step": 720
    },
    {
      "epoch": 1.0533333333333332,
      "grad_norm": 22.187761306762695,
      "learning_rate": 0.00029826839826839827,
      "loss": 9.2252,
      "step": 730
    },
    {
      "epoch": 1.0677477477477477,
      "grad_norm": 17.774612426757812,
      "learning_rate": 0.00029778739778739773,
      "loss": 8.3988,
      "step": 740
    },
    {
      "epoch": 1.0821621621621622,
      "grad_norm": 22.759864807128906,
      "learning_rate": 0.0002973063973063973,
      "loss": 8.4637,
      "step": 750
    },
    {
      "epoch": 1.0965765765765765,
      "grad_norm": 22.068397521972656,
      "learning_rate": 0.0002968253968253968,
      "loss": 9.4532,
      "step": 760
    },
    {
      "epoch": 1.110990990990991,
      "grad_norm": 22.11869239807129,
      "learning_rate": 0.0002963443963443963,
      "loss": 8.5823,
      "step": 770
    },
    {
      "epoch": 1.1254054054054055,
      "grad_norm": 20.577394485473633,
      "learning_rate": 0.0002958633958633958,
      "loss": 8.8257,
      "step": 780
    },
    {
      "epoch": 1.1398198198198197,
      "grad_norm": 19.24051856994629,
      "learning_rate": 0.00029538239538239535,
      "loss": 8.4165,
      "step": 790
    },
    {
      "epoch": 1.1542342342342342,
      "grad_norm": 18.745025634765625,
      "learning_rate": 0.00029490139490139487,
      "loss": 8.4419,
      "step": 800
    },
    {
      "epoch": 1.1686486486486487,
      "grad_norm": 16.836870193481445,
      "learning_rate": 0.0002944203944203944,
      "loss": 8.2076,
      "step": 810
    },
    {
      "epoch": 1.183063063063063,
      "grad_norm": 23.824594497680664,
      "learning_rate": 0.0002939393939393939,
      "loss": 7.8032,
      "step": 820
    },
    {
      "epoch": 1.1974774774774775,
      "grad_norm": 17.577869415283203,
      "learning_rate": 0.00029345839345839344,
      "loss": 8.3441,
      "step": 830
    },
    {
      "epoch": 1.211891891891892,
      "grad_norm": 17.508779525756836,
      "learning_rate": 0.00029297739297739296,
      "loss": 8.1213,
      "step": 840
    },
    {
      "epoch": 1.2263063063063062,
      "grad_norm": 16.90478515625,
      "learning_rate": 0.0002924963924963925,
      "loss": 7.6077,
      "step": 850
    },
    {
      "epoch": 1.2407207207207207,
      "grad_norm": 20.760663986206055,
      "learning_rate": 0.000292015392015392,
      "loss": 7.8654,
      "step": 860
    },
    {
      "epoch": 1.2551351351351352,
      "grad_norm": 20.966073989868164,
      "learning_rate": 0.00029153439153439153,
      "loss": 7.7627,
      "step": 870
    },
    {
      "epoch": 1.2695495495495495,
      "grad_norm": 18.766395568847656,
      "learning_rate": 0.000291053391053391,
      "loss": 7.0404,
      "step": 880
    },
    {
      "epoch": 1.283963963963964,
      "grad_norm": 20.34043312072754,
      "learning_rate": 0.0002905723905723906,
      "loss": 8.2117,
      "step": 890
    },
    {
      "epoch": 1.2983783783783784,
      "grad_norm": 22.05991554260254,
      "learning_rate": 0.00029009139009139004,
      "loss": 7.5249,
      "step": 900
    },
    {
      "epoch": 1.3127927927927927,
      "grad_norm": 18.58563232421875,
      "learning_rate": 0.00028961038961038956,
      "loss": 7.9662,
      "step": 910
    },
    {
      "epoch": 1.3272072072072072,
      "grad_norm": 18.942352294921875,
      "learning_rate": 0.0002891293891293891,
      "loss": 7.7609,
      "step": 920
    },
    {
      "epoch": 1.3416216216216217,
      "grad_norm": 23.675949096679688,
      "learning_rate": 0.0002886483886483886,
      "loss": 7.4968,
      "step": 930
    },
    {
      "epoch": 1.356036036036036,
      "grad_norm": 22.53910255432129,
      "learning_rate": 0.00028816738816738813,
      "loss": 7.9113,
      "step": 940
    },
    {
      "epoch": 1.3704504504504504,
      "grad_norm": 21.479690551757812,
      "learning_rate": 0.00028768638768638766,
      "loss": 6.8956,
      "step": 950
    },
    {
      "epoch": 1.384864864864865,
      "grad_norm": 20.469209671020508,
      "learning_rate": 0.0002872053872053872,
      "loss": 7.2737,
      "step": 960
    },
    {
      "epoch": 1.3992792792792792,
      "grad_norm": 17.538774490356445,
      "learning_rate": 0.0002867243867243867,
      "loss": 7.2458,
      "step": 970
    },
    {
      "epoch": 1.4136936936936937,
      "grad_norm": 22.793577194213867,
      "learning_rate": 0.0002862433862433862,
      "loss": 7.2339,
      "step": 980
    },
    {
      "epoch": 1.4281081081081082,
      "grad_norm": 18.235897064208984,
      "learning_rate": 0.00028576238576238575,
      "loss": 7.6416,
      "step": 990
    },
    {
      "epoch": 1.4425225225225224,
      "grad_norm": 24.108549118041992,
      "learning_rate": 0.00028528138528138527,
      "loss": 7.5449,
      "step": 1000
    },
    {
      "epoch": 1.456936936936937,
      "grad_norm": 23.248693466186523,
      "learning_rate": 0.0002848003848003848,
      "loss": 7.0878,
      "step": 1010
    },
    {
      "epoch": 1.4713513513513514,
      "grad_norm": 20.034454345703125,
      "learning_rate": 0.00028431938431938426,
      "loss": 7.426,
      "step": 1020
    },
    {
      "epoch": 1.4857657657657657,
      "grad_norm": 22.129047393798828,
      "learning_rate": 0.00028383838383838384,
      "loss": 6.9635,
      "step": 1030
    },
    {
      "epoch": 1.5001801801801802,
      "grad_norm": 20.906335830688477,
      "learning_rate": 0.0002833573833573833,
      "loss": 7.1704,
      "step": 1040
    },
    {
      "epoch": 1.5145945945945947,
      "grad_norm": 22.88907814025879,
      "learning_rate": 0.0002828763828763829,
      "loss": 7.1875,
      "step": 1050
    },
    {
      "epoch": 1.529009009009009,
      "grad_norm": 23.162479400634766,
      "learning_rate": 0.00028239538239538235,
      "loss": 7.665,
      "step": 1060
    },
    {
      "epoch": 1.5434234234234234,
      "grad_norm": 22.069990158081055,
      "learning_rate": 0.00028191438191438187,
      "loss": 7.0347,
      "step": 1070
    },
    {
      "epoch": 1.557837837837838,
      "grad_norm": 21.646320343017578,
      "learning_rate": 0.0002814333814333814,
      "loss": 7.4735,
      "step": 1080
    },
    {
      "epoch": 1.5722522522522522,
      "grad_norm": 22.21576499938965,
      "learning_rate": 0.0002809523809523809,
      "loss": 7.3836,
      "step": 1090
    },
    {
      "epoch": 1.5866666666666667,
      "grad_norm": 17.76190757751465,
      "learning_rate": 0.00028047138047138044,
      "loss": 7.2981,
      "step": 1100
    },
    {
      "epoch": 1.6010810810810812,
      "grad_norm": 15.208210945129395,
      "learning_rate": 0.00027999037999037996,
      "loss": 6.1374,
      "step": 1110
    },
    {
      "epoch": 1.6154954954954954,
      "grad_norm": 24.096397399902344,
      "learning_rate": 0.0002795093795093795,
      "loss": 6.3449,
      "step": 1120
    },
    {
      "epoch": 1.62990990990991,
      "grad_norm": 23.264659881591797,
      "learning_rate": 0.000279028379028379,
      "loss": 6.9955,
      "step": 1130
    },
    {
      "epoch": 1.6443243243243244,
      "grad_norm": 23.365312576293945,
      "learning_rate": 0.00027854737854737853,
      "loss": 6.7135,
      "step": 1140
    },
    {
      "epoch": 1.6587387387387387,
      "grad_norm": 18.671892166137695,
      "learning_rate": 0.00027806637806637805,
      "loss": 6.3113,
      "step": 1150
    },
    {
      "epoch": 1.6731531531531532,
      "grad_norm": 22.89389991760254,
      "learning_rate": 0.0002775853775853776,
      "loss": 6.6979,
      "step": 1160
    },
    {
      "epoch": 1.6875675675675677,
      "grad_norm": 22.493839263916016,
      "learning_rate": 0.0002771043771043771,
      "loss": 5.7641,
      "step": 1170
    },
    {
      "epoch": 1.701981981981982,
      "grad_norm": 24.027435302734375,
      "learning_rate": 0.00027662337662337657,
      "loss": 7.2983,
      "step": 1180
    },
    {
      "epoch": 1.7163963963963964,
      "grad_norm": 19.027225494384766,
      "learning_rate": 0.00027614237614237614,
      "loss": 6.2111,
      "step": 1190
    },
    {
      "epoch": 1.730810810810811,
      "grad_norm": 27.56620979309082,
      "learning_rate": 0.0002756613756613756,
      "loss": 6.7366,
      "step": 1200
    },
    {
      "epoch": 1.7452252252252252,
      "grad_norm": 16.027616500854492,
      "learning_rate": 0.00027518037518037513,
      "loss": 6.1943,
      "step": 1210
    },
    {
      "epoch": 1.7596396396396397,
      "grad_norm": 20.16025161743164,
      "learning_rate": 0.0002746993746993747,
      "loss": 6.3816,
      "step": 1220
    },
    {
      "epoch": 1.7740540540540541,
      "grad_norm": 13.574505805969238,
      "learning_rate": 0.0002742183742183742,
      "loss": 5.9191,
      "step": 1230
    },
    {
      "epoch": 1.7884684684684684,
      "grad_norm": 19.855785369873047,
      "learning_rate": 0.0002737373737373737,
      "loss": 6.3663,
      "step": 1240
    },
    {
      "epoch": 1.802882882882883,
      "grad_norm": 20.211448669433594,
      "learning_rate": 0.0002732563732563732,
      "loss": 6.4382,
      "step": 1250
    },
    {
      "epoch": 1.8172972972972974,
      "grad_norm": 21.60570526123047,
      "learning_rate": 0.00027277537277537275,
      "loss": 6.7056,
      "step": 1260
    },
    {
      "epoch": 1.8317117117117117,
      "grad_norm": 20.5418758392334,
      "learning_rate": 0.00027229437229437227,
      "loss": 5.5842,
      "step": 1270
    },
    {
      "epoch": 1.8461261261261261,
      "grad_norm": 27.491355895996094,
      "learning_rate": 0.0002718133718133718,
      "loss": 5.9011,
      "step": 1280
    },
    {
      "epoch": 1.8605405405405406,
      "grad_norm": 23.979827880859375,
      "learning_rate": 0.0002713323713323713,
      "loss": 6.084,
      "step": 1290
    },
    {
      "epoch": 1.874954954954955,
      "grad_norm": 18.55582618713379,
      "learning_rate": 0.00027085137085137084,
      "loss": 6.0097,
      "step": 1300
    },
    {
      "epoch": 1.8893693693693694,
      "grad_norm": 19.917762756347656,
      "learning_rate": 0.00027037037037037036,
      "loss": 5.7525,
      "step": 1310
    },
    {
      "epoch": 1.9037837837837839,
      "grad_norm": 17.546810150146484,
      "learning_rate": 0.00026988936988936983,
      "loss": 6.3093,
      "step": 1320
    },
    {
      "epoch": 1.9181981981981981,
      "grad_norm": 26.043676376342773,
      "learning_rate": 0.0002694083694083694,
      "loss": 5.9062,
      "step": 1330
    },
    {
      "epoch": 1.9326126126126126,
      "grad_norm": 22.03000831604004,
      "learning_rate": 0.00026892736892736893,
      "loss": 6.4594,
      "step": 1340
    },
    {
      "epoch": 1.9470270270270271,
      "grad_norm": 23.965402603149414,
      "learning_rate": 0.0002684463684463684,
      "loss": 6.3053,
      "step": 1350
    },
    {
      "epoch": 1.9614414414414414,
      "grad_norm": 21.040790557861328,
      "learning_rate": 0.000267965367965368,
      "loss": 5.4142,
      "step": 1360
    },
    {
      "epoch": 1.9758558558558559,
      "grad_norm": 22.65288543701172,
      "learning_rate": 0.00026748436748436744,
      "loss": 6.5429,
      "step": 1370
    },
    {
      "epoch": 1.9902702702702704,
      "grad_norm": 19.748960494995117,
      "learning_rate": 0.00026700336700336696,
      "loss": 6.1707,
      "step": 1380
    },
    {
      "epoch": 1.998918918918919,
      "eval_accuracy": 0.468070652173913,
      "eval_loss": 1.8478443622589111,
      "eval_runtime": 536.954,
      "eval_samples_per_second": 10.966,
      "eval_steps_per_second": 10.966,
      "step": 1386
    },
    {
      "epoch": 2.0057657657657657,
      "grad_norm": 20.129833221435547,
      "learning_rate": 0.0002665223665223665,
      "loss": 5.5637,
      "step": 1390
    },
    {
      "epoch": 2.02018018018018,
      "grad_norm": 18.542203903198242,
      "learning_rate": 0.000266041366041366,
      "loss": 4.8547,
      "step": 1400
    },
    {
      "epoch": 2.0345945945945947,
      "grad_norm": 16.80269432067871,
      "learning_rate": 0.00026556036556036553,
      "loss": 4.9395,
      "step": 1410
    },
    {
      "epoch": 2.049009009009009,
      "grad_norm": 24.43153953552246,
      "learning_rate": 0.00026507936507936506,
      "loss": 4.8408,
      "step": 1420
    },
    {
      "epoch": 2.063423423423423,
      "grad_norm": 20.406522750854492,
      "learning_rate": 0.0002645983645983646,
      "loss": 4.3663,
      "step": 1430
    },
    {
      "epoch": 2.077837837837838,
      "grad_norm": 17.540870666503906,
      "learning_rate": 0.0002641173641173641,
      "loss": 3.6172,
      "step": 1440
    },
    {
      "epoch": 2.092252252252252,
      "grad_norm": 22.39369773864746,
      "learning_rate": 0.0002636363636363636,
      "loss": 4.5143,
      "step": 1450
    },
    {
      "epoch": 2.1066666666666665,
      "grad_norm": 24.582853317260742,
      "learning_rate": 0.00026315536315536315,
      "loss": 4.4835,
      "step": 1460
    },
    {
      "epoch": 2.121081081081081,
      "grad_norm": 22.656949996948242,
      "learning_rate": 0.00026267436267436267,
      "loss": 4.4713,
      "step": 1470
    },
    {
      "epoch": 2.1354954954954954,
      "grad_norm": 22.375396728515625,
      "learning_rate": 0.0002621933621933622,
      "loss": 4.4695,
      "step": 1480
    },
    {
      "epoch": 2.1499099099099097,
      "grad_norm": 17.02708625793457,
      "learning_rate": 0.00026171236171236166,
      "loss": 3.8927,
      "step": 1490
    },
    {
      "epoch": 2.1643243243243244,
      "grad_norm": 19.711584091186523,
      "learning_rate": 0.00026123136123136124,
      "loss": 3.9472,
      "step": 1500
    },
    {
      "epoch": 2.1787387387387387,
      "grad_norm": 18.87154197692871,
      "learning_rate": 0.0002607503607503607,
      "loss": 4.8518,
      "step": 1510
    },
    {
      "epoch": 2.193153153153153,
      "grad_norm": 25.693981170654297,
      "learning_rate": 0.0002602693602693603,
      "loss": 4.6599,
      "step": 1520
    },
    {
      "epoch": 2.2075675675675677,
      "grad_norm": 15.880191802978516,
      "learning_rate": 0.00025978835978835975,
      "loss": 4.1435,
      "step": 1530
    },
    {
      "epoch": 2.221981981981982,
      "grad_norm": 20.515146255493164,
      "learning_rate": 0.00025930735930735927,
      "loss": 4.1378,
      "step": 1540
    },
    {
      "epoch": 2.236396396396396,
      "grad_norm": 23.654556274414062,
      "learning_rate": 0.0002588263588263588,
      "loss": 4.4749,
      "step": 1550
    },
    {
      "epoch": 2.250810810810811,
      "grad_norm": 25.85966682434082,
      "learning_rate": 0.0002583453583453583,
      "loss": 4.2029,
      "step": 1560
    },
    {
      "epoch": 2.265225225225225,
      "grad_norm": 21.542530059814453,
      "learning_rate": 0.00025786435786435784,
      "loss": 4.6039,
      "step": 1570
    },
    {
      "epoch": 2.2796396396396394,
      "grad_norm": 19.57372283935547,
      "learning_rate": 0.00025738335738335736,
      "loss": 4.0779,
      "step": 1580
    },
    {
      "epoch": 2.294054054054054,
      "grad_norm": 20.794376373291016,
      "learning_rate": 0.0002569023569023569,
      "loss": 4.7794,
      "step": 1590
    },
    {
      "epoch": 2.3084684684684684,
      "grad_norm": 23.753938674926758,
      "learning_rate": 0.0002564213564213564,
      "loss": 4.8506,
      "step": 1600
    },
    {
      "epoch": 2.3228828828828827,
      "grad_norm": 19.38469123840332,
      "learning_rate": 0.00025594035594035593,
      "loss": 3.4325,
      "step": 1610
    },
    {
      "epoch": 2.3372972972972974,
      "grad_norm": 21.55483627319336,
      "learning_rate": 0.00025545935545935545,
      "loss": 4.151,
      "step": 1620
    },
    {
      "epoch": 2.3517117117117117,
      "grad_norm": 24.347623825073242,
      "learning_rate": 0.000254978354978355,
      "loss": 4.3691,
      "step": 1630
    },
    {
      "epoch": 2.366126126126126,
      "grad_norm": 22.3781795501709,
      "learning_rate": 0.0002544973544973545,
      "loss": 4.5897,
      "step": 1640
    },
    {
      "epoch": 2.3805405405405407,
      "grad_norm": 23.88686180114746,
      "learning_rate": 0.00025401635401635397,
      "loss": 4.1445,
      "step": 1650
    },
    {
      "epoch": 2.394954954954955,
      "grad_norm": 22.73502540588379,
      "learning_rate": 0.00025353535353535354,
      "loss": 4.7023,
      "step": 1660
    },
    {
      "epoch": 2.409369369369369,
      "grad_norm": 28.19312286376953,
      "learning_rate": 0.000253054353054353,
      "loss": 4.1145,
      "step": 1670
    },
    {
      "epoch": 2.423783783783784,
      "grad_norm": 18.269119262695312,
      "learning_rate": 0.00025257335257335253,
      "loss": 4.2782,
      "step": 1680
    },
    {
      "epoch": 2.438198198198198,
      "grad_norm": 23.031797409057617,
      "learning_rate": 0.00025209235209235206,
      "loss": 4.1351,
      "step": 1690
    },
    {
      "epoch": 2.4526126126126124,
      "grad_norm": 29.572736740112305,
      "learning_rate": 0.0002516113516113516,
      "loss": 3.9022,
      "step": 1700
    },
    {
      "epoch": 2.467027027027027,
      "grad_norm": 27.48060417175293,
      "learning_rate": 0.0002511303511303511,
      "loss": 4.2383,
      "step": 1710
    },
    {
      "epoch": 2.4814414414414414,
      "grad_norm": 20.07984733581543,
      "learning_rate": 0.0002506493506493506,
      "loss": 4.8254,
      "step": 1720
    },
    {
      "epoch": 2.4958558558558557,
      "grad_norm": 15.536605834960938,
      "learning_rate": 0.00025016835016835015,
      "loss": 4.4781,
      "step": 1730
    },
    {
      "epoch": 2.5102702702702704,
      "grad_norm": 24.318782806396484,
      "learning_rate": 0.00024968734968734967,
      "loss": 3.9879,
      "step": 1740
    },
    {
      "epoch": 2.5246846846846847,
      "grad_norm": 16.27837562561035,
      "learning_rate": 0.0002492063492063492,
      "loss": 3.9869,
      "step": 1750
    },
    {
      "epoch": 2.539099099099099,
      "grad_norm": 17.794788360595703,
      "learning_rate": 0.0002487253487253487,
      "loss": 3.9309,
      "step": 1760
    },
    {
      "epoch": 2.5535135135135136,
      "grad_norm": 21.39970588684082,
      "learning_rate": 0.00024824434824434824,
      "loss": 4.3936,
      "step": 1770
    },
    {
      "epoch": 2.567927927927928,
      "grad_norm": 22.3472957611084,
      "learning_rate": 0.00024776334776334776,
      "loss": 4.5431,
      "step": 1780
    },
    {
      "epoch": 2.5823423423423426,
      "grad_norm": 22.283802032470703,
      "learning_rate": 0.00024728234728234723,
      "loss": 3.7322,
      "step": 1790
    },
    {
      "epoch": 2.596756756756757,
      "grad_norm": 20.59347152709961,
      "learning_rate": 0.0002468013468013468,
      "loss": 4.7168,
      "step": 1800
    },
    {
      "epoch": 2.611171171171171,
      "grad_norm": 21.301950454711914,
      "learning_rate": 0.0002463203463203463,
      "loss": 4.2457,
      "step": 1810
    },
    {
      "epoch": 2.6255855855855854,
      "grad_norm": 24.100994110107422,
      "learning_rate": 0.0002458393458393458,
      "loss": 4.0849,
      "step": 1820
    },
    {
      "epoch": 2.64,
      "grad_norm": 20.029577255249023,
      "learning_rate": 0.0002453583453583453,
      "loss": 3.956,
      "step": 1830
    },
    {
      "epoch": 2.6544144144144144,
      "grad_norm": 18.682430267333984,
      "learning_rate": 0.00024487734487734484,
      "loss": 4.0165,
      "step": 1840
    },
    {
      "epoch": 2.668828828828829,
      "grad_norm": 24.04487419128418,
      "learning_rate": 0.00024439634439634437,
      "loss": 4.0105,
      "step": 1850
    },
    {
      "epoch": 2.6832432432432434,
      "grad_norm": 21.22220802307129,
      "learning_rate": 0.0002439153439153439,
      "loss": 3.997,
      "step": 1860
    },
    {
      "epoch": 2.6976576576576576,
      "grad_norm": 19.668106079101562,
      "learning_rate": 0.0002434343434343434,
      "loss": 4.0831,
      "step": 1870
    },
    {
      "epoch": 2.712072072072072,
      "grad_norm": 30.692045211791992,
      "learning_rate": 0.00024295334295334293,
      "loss": 4.0591,
      "step": 1880
    },
    {
      "epoch": 2.7264864864864866,
      "grad_norm": 22.906898498535156,
      "learning_rate": 0.00024247234247234246,
      "loss": 4.5457,
      "step": 1890
    },
    {
      "epoch": 2.740900900900901,
      "grad_norm": 22.690523147583008,
      "learning_rate": 0.00024199134199134195,
      "loss": 3.8756,
      "step": 1900
    },
    {
      "epoch": 2.755315315315315,
      "grad_norm": 21.029132843017578,
      "learning_rate": 0.0002415103415103415,
      "loss": 4.011,
      "step": 1910
    },
    {
      "epoch": 2.76972972972973,
      "grad_norm": 21.587825775146484,
      "learning_rate": 0.000241029341029341,
      "loss": 3.7924,
      "step": 1920
    },
    {
      "epoch": 2.784144144144144,
      "grad_norm": 22.353364944458008,
      "learning_rate": 0.00024054834054834052,
      "loss": 4.3143,
      "step": 1930
    },
    {
      "epoch": 2.7985585585585584,
      "grad_norm": 21.176376342773438,
      "learning_rate": 0.00024006734006734004,
      "loss": 4.6675,
      "step": 1940
    },
    {
      "epoch": 2.812972972972973,
      "grad_norm": 18.859739303588867,
      "learning_rate": 0.00023958633958633956,
      "loss": 4.0779,
      "step": 1950
    },
    {
      "epoch": 2.8273873873873874,
      "grad_norm": 18.34664535522461,
      "learning_rate": 0.0002391053391053391,
      "loss": 4.2849,
      "step": 1960
    },
    {
      "epoch": 2.8418018018018016,
      "grad_norm": 22.619640350341797,
      "learning_rate": 0.0002386243386243386,
      "loss": 3.9383,
      "step": 1970
    },
    {
      "epoch": 2.8562162162162164,
      "grad_norm": 22.183664321899414,
      "learning_rate": 0.0002381433381433381,
      "loss": 3.7888,
      "step": 1980
    },
    {
      "epoch": 2.8706306306306306,
      "grad_norm": 26.002941131591797,
      "learning_rate": 0.00023766233766233765,
      "loss": 3.912,
      "step": 1990
    },
    {
      "epoch": 2.885045045045045,
      "grad_norm": 27.130271911621094,
      "learning_rate": 0.00023718133718133715,
      "loss": 3.9044,
      "step": 2000
    },
    {
      "epoch": 2.8994594594594596,
      "grad_norm": 21.608003616333008,
      "learning_rate": 0.00023670033670033667,
      "loss": 4.2128,
      "step": 2010
    },
    {
      "epoch": 2.913873873873874,
      "grad_norm": 19.621829986572266,
      "learning_rate": 0.0002362193362193362,
      "loss": 3.8509,
      "step": 2020
    },
    {
      "epoch": 2.928288288288288,
      "grad_norm": 23.38471031188965,
      "learning_rate": 0.00023573833573833572,
      "loss": 4.067,
      "step": 2030
    },
    {
      "epoch": 2.942702702702703,
      "grad_norm": 13.28516674041748,
      "learning_rate": 0.0002352573352573352,
      "loss": 4.186,
      "step": 2040
    },
    {
      "epoch": 2.957117117117117,
      "grad_norm": 18.91407585144043,
      "learning_rate": 0.00023477633477633476,
      "loss": 3.7117,
      "step": 2050
    },
    {
      "epoch": 2.9715315315315314,
      "grad_norm": 18.93157196044922,
      "learning_rate": 0.00023429533429533426,
      "loss": 3.8855,
      "step": 2060
    },
    {
      "epoch": 2.985945945945946,
      "grad_norm": 20.980789184570312,
      "learning_rate": 0.0002338143338143338,
      "loss": 3.7871,
      "step": 2070
    },
    {
      "epoch": 2.998918918918919,
      "eval_accuracy": 0.5473845108695652,
      "eval_loss": 1.6941322088241577,
      "eval_runtime": 536.9387,
      "eval_samples_per_second": 10.966,
      "eval_steps_per_second": 10.966,
      "step": 2079
    },
    {
      "epoch": 3.0014414414414414,
      "grad_norm": 28.662826538085938,
      "learning_rate": 0.0002333333333333333,
      "loss": 4.0376,
      "step": 2080
    },
    {
      "epoch": 3.0158558558558557,
      "grad_norm": 13.298629760742188,
      "learning_rate": 0.00023285233285233283,
      "loss": 2.4392,
      "step": 2090
    },
    {
      "epoch": 3.0302702702702704,
      "grad_norm": 20.722625732421875,
      "learning_rate": 0.00023237133237133238,
      "loss": 2.5711,
      "step": 2100
    },
    {
      "epoch": 3.0446846846846847,
      "grad_norm": 18.076677322387695,
      "learning_rate": 0.00023189033189033187,
      "loss": 2.4815,
      "step": 2110
    },
    {
      "epoch": 3.059099099099099,
      "grad_norm": 23.47679328918457,
      "learning_rate": 0.00023140933140933137,
      "loss": 2.4175,
      "step": 2120
    },
    {
      "epoch": 3.0735135135135137,
      "grad_norm": 25.233163833618164,
      "learning_rate": 0.00023092833092833092,
      "loss": 2.6018,
      "step": 2130
    },
    {
      "epoch": 3.087927927927928,
      "grad_norm": 23.916234970092773,
      "learning_rate": 0.0002304473304473304,
      "loss": 2.9529,
      "step": 2140
    },
    {
      "epoch": 3.102342342342342,
      "grad_norm": 20.37197494506836,
      "learning_rate": 0.00022996632996632994,
      "loss": 2.2146,
      "step": 2150
    },
    {
      "epoch": 3.116756756756757,
      "grad_norm": 20.04782485961914,
      "learning_rate": 0.00022948532948532948,
      "loss": 2.1764,
      "step": 2160
    },
    {
      "epoch": 3.131171171171171,
      "grad_norm": 24.065858840942383,
      "learning_rate": 0.00022900432900432898,
      "loss": 2.7395,
      "step": 2170
    },
    {
      "epoch": 3.1455855855855854,
      "grad_norm": 20.15619468688965,
      "learning_rate": 0.00022852332852332853,
      "loss": 2.6955,
      "step": 2180
    },
    {
      "epoch": 3.16,
      "grad_norm": 15.333986282348633,
      "learning_rate": 0.00022804232804232803,
      "loss": 2.378,
      "step": 2190
    },
    {
      "epoch": 3.1744144144144144,
      "grad_norm": 17.780742645263672,
      "learning_rate": 0.00022756132756132752,
      "loss": 2.4017,
      "step": 2200
    },
    {
      "epoch": 3.1888288288288287,
      "grad_norm": 22.119949340820312,
      "learning_rate": 0.00022708032708032707,
      "loss": 2.3123,
      "step": 2210
    },
    {
      "epoch": 3.2032432432432434,
      "grad_norm": 22.979034423828125,
      "learning_rate": 0.0002265993265993266,
      "loss": 1.877,
      "step": 2220
    },
    {
      "epoch": 3.2176576576576577,
      "grad_norm": 21.25425910949707,
      "learning_rate": 0.0002261183261183261,
      "loss": 2.3021,
      "step": 2230
    },
    {
      "epoch": 3.232072072072072,
      "grad_norm": 20.077585220336914,
      "learning_rate": 0.00022563732563732564,
      "loss": 2.5026,
      "step": 2240
    },
    {
      "epoch": 3.2464864864864866,
      "grad_norm": 21.955101013183594,
      "learning_rate": 0.00022515632515632513,
      "loss": 2.4518,
      "step": 2250
    },
    {
      "epoch": 3.260900900900901,
      "grad_norm": 23.3514347076416,
      "learning_rate": 0.00022467532467532463,
      "loss": 2.4694,
      "step": 2260
    },
    {
      "epoch": 3.275315315315315,
      "grad_norm": 11.233248710632324,
      "learning_rate": 0.00022419432419432418,
      "loss": 2.2057,
      "step": 2270
    },
    {
      "epoch": 3.28972972972973,
      "grad_norm": 20.17824363708496,
      "learning_rate": 0.0002237133237133237,
      "loss": 2.3982,
      "step": 2280
    },
    {
      "epoch": 3.304144144144144,
      "grad_norm": 20.694353103637695,
      "learning_rate": 0.00022323232323232322,
      "loss": 3.0053,
      "step": 2290
    },
    {
      "epoch": 3.3185585585585584,
      "grad_norm": 24.36587142944336,
      "learning_rate": 0.00022275132275132275,
      "loss": 2.3132,
      "step": 2300
    },
    {
      "epoch": 3.332972972972973,
      "grad_norm": 18.3751277923584,
      "learning_rate": 0.00022227032227032224,
      "loss": 2.2867,
      "step": 2310
    },
    {
      "epoch": 3.3473873873873874,
      "grad_norm": 19.790868759155273,
      "learning_rate": 0.0002217893217893218,
      "loss": 2.7789,
      "step": 2320
    },
    {
      "epoch": 3.3618018018018017,
      "grad_norm": 24.86772346496582,
      "learning_rate": 0.0002213083213083213,
      "loss": 3.0161,
      "step": 2330
    },
    {
      "epoch": 3.3762162162162164,
      "grad_norm": 21.827804565429688,
      "learning_rate": 0.0002208273208273208,
      "loss": 2.546,
      "step": 2340
    },
    {
      "epoch": 3.3906306306306306,
      "grad_norm": 19.654054641723633,
      "learning_rate": 0.00022034632034632033,
      "loss": 2.6371,
      "step": 2350
    },
    {
      "epoch": 3.405045045045045,
      "grad_norm": 21.734804153442383,
      "learning_rate": 0.00021986531986531986,
      "loss": 2.4253,
      "step": 2360
    },
    {
      "epoch": 3.4194594594594596,
      "grad_norm": 27.88010597229004,
      "learning_rate": 0.00021938431938431935,
      "loss": 2.2937,
      "step": 2370
    },
    {
      "epoch": 3.433873873873874,
      "grad_norm": 22.679140090942383,
      "learning_rate": 0.0002189033189033189,
      "loss": 2.6596,
      "step": 2380
    },
    {
      "epoch": 3.448288288288288,
      "grad_norm": 21.52387809753418,
      "learning_rate": 0.0002184223184223184,
      "loss": 2.0818,
      "step": 2390
    },
    {
      "epoch": 3.462702702702703,
      "grad_norm": 20.006406784057617,
      "learning_rate": 0.00021794131794131792,
      "loss": 2.8108,
      "step": 2400
    },
    {
      "epoch": 3.477117117117117,
      "grad_norm": 19.29098892211914,
      "learning_rate": 0.00021746031746031744,
      "loss": 2.3845,
      "step": 2410
    },
    {
      "epoch": 3.4915315315315314,
      "grad_norm": 16.946989059448242,
      "learning_rate": 0.00021697931697931696,
      "loss": 2.5469,
      "step": 2420
    },
    {
      "epoch": 3.505945945945946,
      "grad_norm": 25.288267135620117,
      "learning_rate": 0.0002164983164983165,
      "loss": 2.5397,
      "step": 2430
    },
    {
      "epoch": 3.5203603603603604,
      "grad_norm": 25.8332462310791,
      "learning_rate": 0.000216017316017316,
      "loss": 2.1714,
      "step": 2440
    },
    {
      "epoch": 3.5347747747747746,
      "grad_norm": 19.762386322021484,
      "learning_rate": 0.0002155363155363155,
      "loss": 3.3805,
      "step": 2450
    },
    {
      "epoch": 3.5491891891891894,
      "grad_norm": 20.7349796295166,
      "learning_rate": 0.00021505531505531505,
      "loss": 2.7777,
      "step": 2460
    },
    {
      "epoch": 3.5636036036036036,
      "grad_norm": 22.35674285888672,
      "learning_rate": 0.00021457431457431455,
      "loss": 2.1907,
      "step": 2470
    },
    {
      "epoch": 3.578018018018018,
      "grad_norm": 21.76331901550293,
      "learning_rate": 0.00021409331409331407,
      "loss": 2.7713,
      "step": 2480
    },
    {
      "epoch": 3.5924324324324326,
      "grad_norm": 20.995986938476562,
      "learning_rate": 0.0002136123136123136,
      "loss": 2.6262,
      "step": 2490
    },
    {
      "epoch": 3.606846846846847,
      "grad_norm": 23.074106216430664,
      "learning_rate": 0.00021313131313131312,
      "loss": 2.0651,
      "step": 2500
    },
    {
      "epoch": 3.621261261261261,
      "grad_norm": 23.654848098754883,
      "learning_rate": 0.00021265031265031261,
      "loss": 2.718,
      "step": 2510
    },
    {
      "epoch": 3.6356756756756754,
      "grad_norm": 25.261152267456055,
      "learning_rate": 0.00021216931216931216,
      "loss": 2.6679,
      "step": 2520
    },
    {
      "epoch": 3.65009009009009,
      "grad_norm": 21.01721954345703,
      "learning_rate": 0.00021168831168831166,
      "loss": 2.8435,
      "step": 2530
    },
    {
      "epoch": 3.6645045045045044,
      "grad_norm": 22.361772537231445,
      "learning_rate": 0.0002112073112073112,
      "loss": 2.7907,
      "step": 2540
    },
    {
      "epoch": 3.678918918918919,
      "grad_norm": 25.23889923095703,
      "learning_rate": 0.0002107263107263107,
      "loss": 2.8608,
      "step": 2550
    },
    {
      "epoch": 3.6933333333333334,
      "grad_norm": 21.43499183654785,
      "learning_rate": 0.00021024531024531023,
      "loss": 2.3714,
      "step": 2560
    },
    {
      "epoch": 3.7077477477477476,
      "grad_norm": 20.24538230895996,
      "learning_rate": 0.00020976430976430975,
      "loss": 2.4759,
      "step": 2570
    },
    {
      "epoch": 3.722162162162162,
      "grad_norm": 22.164335250854492,
      "learning_rate": 0.00020928330928330927,
      "loss": 2.8105,
      "step": 2580
    },
    {
      "epoch": 3.7365765765765766,
      "grad_norm": 25.067033767700195,
      "learning_rate": 0.00020880230880230877,
      "loss": 2.3837,
      "step": 2590
    },
    {
      "epoch": 3.750990990990991,
      "grad_norm": 27.547651290893555,
      "learning_rate": 0.00020832130832130832,
      "loss": 2.4441,
      "step": 2600
    },
    {
      "epoch": 3.7654054054054056,
      "grad_norm": 19.971914291381836,
      "learning_rate": 0.0002078403078403078,
      "loss": 2.4194,
      "step": 2610
    },
    {
      "epoch": 3.77981981981982,
      "grad_norm": 17.411178588867188,
      "learning_rate": 0.00020735930735930734,
      "loss": 2.3971,
      "step": 2620
    },
    {
      "epoch": 3.794234234234234,
      "grad_norm": 31.035659790039062,
      "learning_rate": 0.00020687830687830686,
      "loss": 2.6306,
      "step": 2630
    },
    {
      "epoch": 3.8086486486486484,
      "grad_norm": 26.793031692504883,
      "learning_rate": 0.00020639730639730638,
      "loss": 3.0321,
      "step": 2640
    },
    {
      "epoch": 3.823063063063063,
      "grad_norm": 27.277006149291992,
      "learning_rate": 0.0002059163059163059,
      "loss": 2.1434,
      "step": 2650
    },
    {
      "epoch": 3.8374774774774774,
      "grad_norm": 29.178829193115234,
      "learning_rate": 0.00020543530543530543,
      "loss": 2.7848,
      "step": 2660
    },
    {
      "epoch": 3.851891891891892,
      "grad_norm": 17.34369659423828,
      "learning_rate": 0.00020495430495430492,
      "loss": 2.5354,
      "step": 2670
    },
    {
      "epoch": 3.8663063063063063,
      "grad_norm": 24.41458511352539,
      "learning_rate": 0.00020447330447330447,
      "loss": 2.4852,
      "step": 2680
    },
    {
      "epoch": 3.8807207207207206,
      "grad_norm": 27.604721069335938,
      "learning_rate": 0.00020399230399230397,
      "loss": 2.6835,
      "step": 2690
    },
    {
      "epoch": 3.895135135135135,
      "grad_norm": 19.998043060302734,
      "learning_rate": 0.0002035113035113035,
      "loss": 2.2523,
      "step": 2700
    },
    {
      "epoch": 3.9095495495495496,
      "grad_norm": 26.73026466369629,
      "learning_rate": 0.000203030303030303,
      "loss": 3.4174,
      "step": 2710
    },
    {
      "epoch": 3.923963963963964,
      "grad_norm": 27.696605682373047,
      "learning_rate": 0.00020254930254930253,
      "loss": 2.5488,
      "step": 2720
    },
    {
      "epoch": 3.9383783783783786,
      "grad_norm": 25.43397331237793,
      "learning_rate": 0.00020206830206830203,
      "loss": 2.1643,
      "step": 2730
    },
    {
      "epoch": 3.952792792792793,
      "grad_norm": 18.155502319335938,
      "learning_rate": 0.00020158730158730158,
      "loss": 2.2196,
      "step": 2740
    },
    {
      "epoch": 3.967207207207207,
      "grad_norm": 27.430566787719727,
      "learning_rate": 0.00020110630110630108,
      "loss": 2.2681,
      "step": 2750
    },
    {
      "epoch": 3.9816216216216214,
      "grad_norm": 17.62324333190918,
      "learning_rate": 0.00020062530062530062,
      "loss": 2.3872,
      "step": 2760
    },
    {
      "epoch": 3.996036036036036,
      "grad_norm": 22.322702407836914,
      "learning_rate": 0.00020014430014430012,
      "loss": 2.7966,
      "step": 2770
    },
    {
      "epoch": 3.998918918918919,
      "eval_accuracy": 0.5579144021739131,
      "eval_loss": 1.8579920530319214,
      "eval_runtime": 536.9866,
      "eval_samples_per_second": 10.965,
      "eval_steps_per_second": 10.965,
      "step": 2772
    },
    {
      "epoch": 4.011531531531531,
      "grad_norm": 17.038963317871094,
      "learning_rate": 0.00019971139971139968,
      "loss": 1.7853,
      "step": 2780
    },
    {
      "epoch": 4.025945945945946,
      "grad_norm": 21.912731170654297,
      "learning_rate": 0.00019923039923039923,
      "loss": 1.4446,
      "step": 2790
    },
    {
      "epoch": 4.04036036036036,
      "grad_norm": 8.3090238571167,
      "learning_rate": 0.00019874939874939873,
      "loss": 1.1382,
      "step": 2800
    },
    {
      "epoch": 4.054774774774775,
      "grad_norm": 10.985939979553223,
      "learning_rate": 0.00019826839826839825,
      "loss": 1.4296,
      "step": 2810
    },
    {
      "epoch": 4.069189189189189,
      "grad_norm": 14.48794174194336,
      "learning_rate": 0.00019778739778739777,
      "loss": 1.3267,
      "step": 2820
    },
    {
      "epoch": 4.083603603603604,
      "grad_norm": 7.6786789894104,
      "learning_rate": 0.0001973063973063973,
      "loss": 1.3823,
      "step": 2830
    },
    {
      "epoch": 4.098018018018018,
      "grad_norm": 21.3938045501709,
      "learning_rate": 0.0001968253968253968,
      "loss": 1.636,
      "step": 2840
    },
    {
      "epoch": 4.112432432432432,
      "grad_norm": 16.059181213378906,
      "learning_rate": 0.00019634439634439634,
      "loss": 1.4253,
      "step": 2850
    },
    {
      "epoch": 4.126846846846846,
      "grad_norm": 31.663381576538086,
      "learning_rate": 0.00019586339586339583,
      "loss": 1.6679,
      "step": 2860
    },
    {
      "epoch": 4.141261261261262,
      "grad_norm": 28.778202056884766,
      "learning_rate": 0.00019538239538239536,
      "loss": 1.7084,
      "step": 2870
    },
    {
      "epoch": 4.155675675675676,
      "grad_norm": 24.17688751220703,
      "learning_rate": 0.00019490139490139488,
      "loss": 1.503,
      "step": 2880
    },
    {
      "epoch": 4.17009009009009,
      "grad_norm": 18.74388313293457,
      "learning_rate": 0.0001944203944203944,
      "loss": 1.4459,
      "step": 2890
    },
    {
      "epoch": 4.184504504504504,
      "grad_norm": 25.333425521850586,
      "learning_rate": 0.00019393939393939395,
      "loss": 1.5935,
      "step": 2900
    },
    {
      "epoch": 4.198918918918919,
      "grad_norm": 19.402793884277344,
      "learning_rate": 0.00019345839345839345,
      "loss": 1.3032,
      "step": 2910
    },
    {
      "epoch": 4.213333333333333,
      "grad_norm": 11.908445358276367,
      "learning_rate": 0.00019297739297739294,
      "loss": 1.4052,
      "step": 2920
    },
    {
      "epoch": 4.227747747747748,
      "grad_norm": 10.511947631835938,
      "learning_rate": 0.0001924963924963925,
      "loss": 1.3532,
      "step": 2930
    },
    {
      "epoch": 4.242162162162162,
      "grad_norm": 18.962549209594727,
      "learning_rate": 0.000192015392015392,
      "loss": 1.4759,
      "step": 2940
    },
    {
      "epoch": 4.256576576576577,
      "grad_norm": 29.238679885864258,
      "learning_rate": 0.0001915343915343915,
      "loss": 1.6444,
      "step": 2950
    },
    {
      "epoch": 4.270990990990991,
      "grad_norm": 13.944114685058594,
      "learning_rate": 0.00019105339105339106,
      "loss": 1.5509,
      "step": 2960
    },
    {
      "epoch": 4.285405405405405,
      "grad_norm": 17.7829532623291,
      "learning_rate": 0.00019057239057239056,
      "loss": 1.4536,
      "step": 2970
    },
    {
      "epoch": 4.299819819819819,
      "grad_norm": 13.711050033569336,
      "learning_rate": 0.00019009139009139005,
      "loss": 1.299,
      "step": 2980
    },
    {
      "epoch": 4.314234234234235,
      "grad_norm": 24.686168670654297,
      "learning_rate": 0.0001896103896103896,
      "loss": 1.3826,
      "step": 2990
    },
    {
      "epoch": 4.328648648648649,
      "grad_norm": 21.13921546936035,
      "learning_rate": 0.0001891293891293891,
      "loss": 1.7036,
      "step": 3000
    },
    {
      "epoch": 4.343063063063063,
      "grad_norm": 14.596439361572266,
      "learning_rate": 0.00018864838864838862,
      "loss": 1.5839,
      "step": 3010
    },
    {
      "epoch": 4.357477477477477,
      "grad_norm": 22.715736389160156,
      "learning_rate": 0.00018816738816738817,
      "loss": 1.5686,
      "step": 3020
    },
    {
      "epoch": 4.371891891891892,
      "grad_norm": 17.39431381225586,
      "learning_rate": 0.00018768638768638766,
      "loss": 1.5422,
      "step": 3030
    },
    {
      "epoch": 4.386306306306306,
      "grad_norm": 24.868406295776367,
      "learning_rate": 0.0001872053872053872,
      "loss": 1.7397,
      "step": 3040
    },
    {
      "epoch": 4.400720720720721,
      "grad_norm": 26.22691535949707,
      "learning_rate": 0.0001867243867243867,
      "loss": 1.4283,
      "step": 3050
    },
    {
      "epoch": 4.415135135135135,
      "grad_norm": 15.568745613098145,
      "learning_rate": 0.0001862433862433862,
      "loss": 1.2897,
      "step": 3060
    },
    {
      "epoch": 4.42954954954955,
      "grad_norm": 19.749555587768555,
      "learning_rate": 0.00018576238576238575,
      "loss": 1.4769,
      "step": 3070
    },
    {
      "epoch": 4.443963963963964,
      "grad_norm": 29.223718643188477,
      "learning_rate": 0.00018528138528138528,
      "loss": 1.3324,
      "step": 3080
    },
    {
      "epoch": 4.458378378378378,
      "grad_norm": 19.438663482666016,
      "learning_rate": 0.00018480038480038477,
      "loss": 1.568,
      "step": 3090
    },
    {
      "epoch": 4.472792792792792,
      "grad_norm": 10.73144245147705,
      "learning_rate": 0.00018431938431938432,
      "loss": 1.1532,
      "step": 3100
    },
    {
      "epoch": 4.487207207207208,
      "grad_norm": 16.664306640625,
      "learning_rate": 0.00018383838383838382,
      "loss": 1.4775,
      "step": 3110
    },
    {
      "epoch": 4.501621621621622,
      "grad_norm": 25.43704605102539,
      "learning_rate": 0.0001833573833573833,
      "loss": 1.3084,
      "step": 3120
    },
    {
      "epoch": 4.516036036036036,
      "grad_norm": 22.560327529907227,
      "learning_rate": 0.00018287638287638286,
      "loss": 1.4541,
      "step": 3130
    },
    {
      "epoch": 4.53045045045045,
      "grad_norm": 22.581119537353516,
      "learning_rate": 0.00018239538239538239,
      "loss": 1.4581,
      "step": 3140
    },
    {
      "epoch": 4.544864864864865,
      "grad_norm": 19.075603485107422,
      "learning_rate": 0.0001819143819143819,
      "loss": 1.3255,
      "step": 3150
    },
    {
      "epoch": 4.559279279279279,
      "grad_norm": 15.375678062438965,
      "learning_rate": 0.00018143338143338143,
      "loss": 1.035,
      "step": 3160
    },
    {
      "epoch": 4.573693693693694,
      "grad_norm": 30.394746780395508,
      "learning_rate": 0.00018095238095238093,
      "loss": 1.7147,
      "step": 3170
    },
    {
      "epoch": 4.588108108108108,
      "grad_norm": 29.191686630249023,
      "learning_rate": 0.00018047138047138048,
      "loss": 1.3125,
      "step": 3180
    },
    {
      "epoch": 4.602522522522523,
      "grad_norm": 21.012161254882812,
      "learning_rate": 0.00017999037999037997,
      "loss": 1.5039,
      "step": 3190
    },
    {
      "epoch": 4.616936936936937,
      "grad_norm": 17.093364715576172,
      "learning_rate": 0.0001795093795093795,
      "loss": 1.4667,
      "step": 3200
    },
    {
      "epoch": 4.631351351351351,
      "grad_norm": 14.385228157043457,
      "learning_rate": 0.00017902837902837902,
      "loss": 1.2575,
      "step": 3210
    },
    {
      "epoch": 4.645765765765765,
      "grad_norm": 16.330244064331055,
      "learning_rate": 0.00017854737854737854,
      "loss": 1.2436,
      "step": 3220
    },
    {
      "epoch": 4.6601801801801805,
      "grad_norm": 17.112266540527344,
      "learning_rate": 0.00017806637806637803,
      "loss": 1.5148,
      "step": 3230
    },
    {
      "epoch": 4.674594594594595,
      "grad_norm": 25.027666091918945,
      "learning_rate": 0.00017758537758537758,
      "loss": 1.6239,
      "step": 3240
    },
    {
      "epoch": 4.689009009009009,
      "grad_norm": 11.63669490814209,
      "learning_rate": 0.00017710437710437708,
      "loss": 1.4982,
      "step": 3250
    },
    {
      "epoch": 4.703423423423423,
      "grad_norm": 18.43046760559082,
      "learning_rate": 0.00017662337662337663,
      "loss": 1.4225,
      "step": 3260
    },
    {
      "epoch": 4.717837837837838,
      "grad_norm": 17.656518936157227,
      "learning_rate": 0.00017614237614237613,
      "loss": 1.4843,
      "step": 3270
    },
    {
      "epoch": 4.732252252252252,
      "grad_norm": 17.17339324951172,
      "learning_rate": 0.00017566137566137565,
      "loss": 1.5321,
      "step": 3280
    },
    {
      "epoch": 4.746666666666667,
      "grad_norm": 18.681303024291992,
      "learning_rate": 0.00017518037518037517,
      "loss": 1.6286,
      "step": 3290
    },
    {
      "epoch": 4.761081081081081,
      "grad_norm": 22.697771072387695,
      "learning_rate": 0.0001746993746993747,
      "loss": 1.4057,
      "step": 3300
    },
    {
      "epoch": 4.775495495495496,
      "grad_norm": 16.85506248474121,
      "learning_rate": 0.0001742183742183742,
      "loss": 1.6464,
      "step": 3310
    },
    {
      "epoch": 4.78990990990991,
      "grad_norm": 23.760793685913086,
      "learning_rate": 0.00017373737373737374,
      "loss": 1.4451,
      "step": 3320
    },
    {
      "epoch": 4.804324324324324,
      "grad_norm": 19.93245506286621,
      "learning_rate": 0.00017325637325637323,
      "loss": 1.821,
      "step": 3330
    },
    {
      "epoch": 4.818738738738738,
      "grad_norm": 15.235669136047363,
      "learning_rate": 0.00017277537277537276,
      "loss": 1.3603,
      "step": 3340
    },
    {
      "epoch": 4.8331531531531535,
      "grad_norm": 18.125097274780273,
      "learning_rate": 0.00017229437229437228,
      "loss": 1.2805,
      "step": 3350
    },
    {
      "epoch": 4.847567567567568,
      "grad_norm": 19.607587814331055,
      "learning_rate": 0.0001718133718133718,
      "loss": 1.7882,
      "step": 3360
    },
    {
      "epoch": 4.861981981981982,
      "grad_norm": 30.157733917236328,
      "learning_rate": 0.00017133237133237132,
      "loss": 1.5676,
      "step": 3370
    },
    {
      "epoch": 4.876396396396396,
      "grad_norm": 14.961874961853027,
      "learning_rate": 0.00017085137085137085,
      "loss": 1.2282,
      "step": 3380
    },
    {
      "epoch": 4.890810810810811,
      "grad_norm": 29.467988967895508,
      "learning_rate": 0.00017037037037037034,
      "loss": 1.6735,
      "step": 3390
    },
    {
      "epoch": 4.905225225225225,
      "grad_norm": 22.682449340820312,
      "learning_rate": 0.0001698893698893699,
      "loss": 1.4523,
      "step": 3400
    },
    {
      "epoch": 4.91963963963964,
      "grad_norm": 17.40091323852539,
      "learning_rate": 0.0001694083694083694,
      "loss": 1.1466,
      "step": 3410
    },
    {
      "epoch": 4.934054054054054,
      "grad_norm": 24.69778823852539,
      "learning_rate": 0.0001689273689273689,
      "loss": 1.2446,
      "step": 3420
    },
    {
      "epoch": 4.9484684684684686,
      "grad_norm": 14.909017562866211,
      "learning_rate": 0.00016844636844636843,
      "loss": 1.5575,
      "step": 3430
    },
    {
      "epoch": 4.962882882882883,
      "grad_norm": 13.104373931884766,
      "learning_rate": 0.00016796536796536796,
      "loss": 1.5514,
      "step": 3440
    },
    {
      "epoch": 4.977297297297297,
      "grad_norm": 24.999370574951172,
      "learning_rate": 0.00016748436748436745,
      "loss": 1.4959,
      "step": 3450
    },
    {
      "epoch": 4.991711711711711,
      "grad_norm": 29.072294235229492,
      "learning_rate": 0.000167003367003367,
      "loss": 1.5871,
      "step": 3460
    },
    {
      "epoch": 4.998918918918919,
      "eval_accuracy": 0.6139605978260869,
      "eval_loss": 1.6662975549697876,
      "eval_runtime": 540.9629,
      "eval_samples_per_second": 10.884,
      "eval_steps_per_second": 10.884,
      "step": 3465
    },
    {
      "epoch": 5.007207207207207,
      "grad_norm": 12.2052640914917,
      "learning_rate": 0.0001665223665223665,
      "loss": 0.9848,
      "step": 3470
    },
    {
      "epoch": 5.021621621621621,
      "grad_norm": 11.040346145629883,
      "learning_rate": 0.00016604136604136605,
      "loss": 0.7229,
      "step": 3480
    },
    {
      "epoch": 5.036036036036036,
      "grad_norm": 11.913896560668945,
      "learning_rate": 0.00016556036556036554,
      "loss": 0.5134,
      "step": 3490
    },
    {
      "epoch": 5.050450450450451,
      "grad_norm": 16.600475311279297,
      "learning_rate": 0.00016507936507936506,
      "loss": 0.5581,
      "step": 3500
    },
    {
      "epoch": 5.064864864864865,
      "grad_norm": 9.584583282470703,
      "learning_rate": 0.0001645983645983646,
      "loss": 0.7335,
      "step": 3510
    },
    {
      "epoch": 5.079279279279279,
      "grad_norm": 15.97603702545166,
      "learning_rate": 0.0001641173641173641,
      "loss": 0.9761,
      "step": 3520
    },
    {
      "epoch": 5.093693693693694,
      "grad_norm": 21.01009178161621,
      "learning_rate": 0.0001636363636363636,
      "loss": 0.6637,
      "step": 3530
    },
    {
      "epoch": 5.108108108108108,
      "grad_norm": 18.944791793823242,
      "learning_rate": 0.00016315536315536315,
      "loss": 0.8514,
      "step": 3540
    },
    {
      "epoch": 5.122522522522522,
      "grad_norm": 15.107224464416504,
      "learning_rate": 0.00016267436267436265,
      "loss": 0.7069,
      "step": 3550
    },
    {
      "epoch": 5.136936936936937,
      "grad_norm": 20.789289474487305,
      "learning_rate": 0.00016219336219336217,
      "loss": 0.7369,
      "step": 3560
    },
    {
      "epoch": 5.151351351351352,
      "grad_norm": 25.02975845336914,
      "learning_rate": 0.0001617123617123617,
      "loss": 0.85,
      "step": 3570
    },
    {
      "epoch": 5.165765765765766,
      "grad_norm": 14.045705795288086,
      "learning_rate": 0.00016123136123136122,
      "loss": 1.0056,
      "step": 3580
    },
    {
      "epoch": 5.18018018018018,
      "grad_norm": 19.27486801147461,
      "learning_rate": 0.00016075036075036074,
      "loss": 0.8829,
      "step": 3590
    },
    {
      "epoch": 5.194594594594594,
      "grad_norm": 16.740869522094727,
      "learning_rate": 0.00016026936026936026,
      "loss": 0.7436,
      "step": 3600
    },
    {
      "epoch": 5.209009009009009,
      "grad_norm": 22.02817153930664,
|
"learning_rate": 0.00015978835978835976, |
|
"loss": 0.8404, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 5.223423423423424, |
|
"grad_norm": 18.062744140625, |
|
"learning_rate": 0.0001593073593073593, |
|
"loss": 0.9403, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 5.237837837837838, |
|
"grad_norm": 11.673712730407715, |
|
"learning_rate": 0.0001588263588263588, |
|
"loss": 0.8351, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 5.252252252252252, |
|
"grad_norm": 13.337545394897461, |
|
"learning_rate": 0.00015834535834535833, |
|
"loss": 0.6274, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 5.266666666666667, |
|
"grad_norm": 19.310646057128906, |
|
"learning_rate": 0.00015786435786435785, |
|
"loss": 0.969, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 5.281081081081081, |
|
"grad_norm": 19.875566482543945, |
|
"learning_rate": 0.00015738335738335737, |
|
"loss": 0.6036, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 5.295495495495495, |
|
"grad_norm": 15.952252388000488, |
|
"learning_rate": 0.00015690235690235687, |
|
"loss": 0.6879, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 5.30990990990991, |
|
"grad_norm": 17.611326217651367, |
|
"learning_rate": 0.00015642135642135642, |
|
"loss": 0.5589, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 5.324324324324325, |
|
"grad_norm": 19.946884155273438, |
|
"learning_rate": 0.0001559403559403559, |
|
"loss": 0.7953, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 5.338738738738739, |
|
"grad_norm": 11.897385597229004, |
|
"learning_rate": 0.00015545935545935546, |
|
"loss": 0.5896, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 5.353153153153153, |
|
"grad_norm": 15.592938423156738, |
|
"learning_rate": 0.00015497835497835496, |
|
"loss": 1.1955, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 5.367567567567567, |
|
"grad_norm": 15.585307121276855, |
|
"learning_rate": 0.00015449735449735448, |
|
"loss": 1.0289, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 5.381981981981982, |
|
"grad_norm": 14.25250244140625, |
|
"learning_rate": 0.000154016354016354, |
|
"loss": 0.5986, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 5.396396396396397, |
|
"grad_norm": 23.96398162841797, |
|
"learning_rate": 0.00015353535353535353, |
|
"loss": 0.7085, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 5.410810810810811, |
|
"grad_norm": 23.628772735595703, |
|
"learning_rate": 0.00015305435305435302, |
|
"loss": 0.826, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 5.425225225225225, |
|
"grad_norm": 17.359643936157227, |
|
"learning_rate": 0.00015257335257335257, |
|
"loss": 0.7858, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 5.43963963963964, |
|
"grad_norm": 22.010915756225586, |
|
"learning_rate": 0.00015209235209235207, |
|
"loss": 0.7688, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 5.454054054054054, |
|
"grad_norm": 28.990123748779297, |
|
"learning_rate": 0.0001516113516113516, |
|
"loss": 0.7106, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 5.468468468468468, |
|
"grad_norm": 11.545175552368164, |
|
"learning_rate": 0.0001511303511303511, |
|
"loss": 0.9866, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 5.482882882882883, |
|
"grad_norm": 25.446990966796875, |
|
"learning_rate": 0.00015064935064935063, |
|
"loss": 0.9894, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 5.4972972972972975, |
|
"grad_norm": 28.915557861328125, |
|
"learning_rate": 0.00015016835016835018, |
|
"loss": 0.8584, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 5.511711711711712, |
|
"grad_norm": 19.692970275878906, |
|
"learning_rate": 0.00014968734968734968, |
|
"loss": 0.6045, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 5.526126126126126, |
|
"grad_norm": 25.059045791625977, |
|
"learning_rate": 0.00014920634920634917, |
|
"loss": 1.1067, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 5.54054054054054, |
|
"grad_norm": 13.645286560058594, |
|
"learning_rate": 0.0001487253487253487, |
|
"loss": 0.7451, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 5.554954954954955, |
|
"grad_norm": 22.43482780456543, |
|
"learning_rate": 0.00014824434824434822, |
|
"loss": 0.8842, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 5.569369369369369, |
|
"grad_norm": 11.246109008789062, |
|
"learning_rate": 0.00014776334776334774, |
|
"loss": 0.629, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 5.583783783783784, |
|
"grad_norm": 21.903657913208008, |
|
"learning_rate": 0.00014728234728234727, |
|
"loss": 0.9014, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 5.598198198198198, |
|
"grad_norm": 9.34262752532959, |
|
"learning_rate": 0.0001468013468013468, |
|
"loss": 0.8017, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 5.612612612612613, |
|
"grad_norm": 28.314603805541992, |
|
"learning_rate": 0.0001463203463203463, |
|
"loss": 0.8316, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 5.627027027027027, |
|
"grad_norm": 23.812631607055664, |
|
"learning_rate": 0.00014583934583934583, |
|
"loss": 1.1573, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 5.641441441441441, |
|
"grad_norm": 19.350114822387695, |
|
"learning_rate": 0.00014535834535834533, |
|
"loss": 0.6841, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 5.655855855855856, |
|
"grad_norm": 36.78022766113281, |
|
"learning_rate": 0.00014487734487734485, |
|
"loss": 0.8235, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 5.6702702702702705, |
|
"grad_norm": 14.95051097869873, |
|
"learning_rate": 0.0001443963443963444, |
|
"loss": 0.6835, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 5.684684684684685, |
|
"grad_norm": 11.998274803161621, |
|
"learning_rate": 0.0001439153439153439, |
|
"loss": 0.9942, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 5.699099099099099, |
|
"grad_norm": 19.465404510498047, |
|
"learning_rate": 0.00014343434343434342, |
|
"loss": 0.9386, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 5.713513513513513, |
|
"grad_norm": 15.735244750976562, |
|
"learning_rate": 0.00014295334295334294, |
|
"loss": 0.8174, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 5.727927927927928, |
|
"grad_norm": 24.03779411315918, |
|
"learning_rate": 0.00014247234247234246, |
|
"loss": 1.0849, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 5.742342342342342, |
|
"grad_norm": 12.98159408569336, |
|
"learning_rate": 0.00014199134199134196, |
|
"loss": 0.6748, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 5.756756756756757, |
|
"grad_norm": 13.99123477935791, |
|
"learning_rate": 0.0001415103415103415, |
|
"loss": 0.6744, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 5.771171171171171, |
|
"grad_norm": 24.469266891479492, |
|
"learning_rate": 0.00014102934102934103, |
|
"loss": 0.6449, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 5.7855855855855856, |
|
"grad_norm": 28.23906898498535, |
|
"learning_rate": 0.00014054834054834055, |
|
"loss": 0.757, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 18.971261978149414, |
|
"learning_rate": 0.00014006734006734005, |
|
"loss": 0.7486, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 5.814414414414414, |
|
"grad_norm": 19.77442169189453, |
|
"learning_rate": 0.00013958633958633957, |
|
"loss": 0.8439, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 5.828828828828829, |
|
"grad_norm": 19.546371459960938, |
|
"learning_rate": 0.0001391053391053391, |
|
"loss": 0.8859, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 5.8432432432432435, |
|
"grad_norm": 12.447526931762695, |
|
"learning_rate": 0.0001386243386243386, |
|
"loss": 0.6841, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 5.857657657657658, |
|
"grad_norm": 18.02086639404297, |
|
"learning_rate": 0.00013814333814333814, |
|
"loss": 0.8155, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 5.872072072072072, |
|
"grad_norm": 23.19020652770996, |
|
"learning_rate": 0.00013766233766233766, |
|
"loss": 0.8727, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 5.886486486486486, |
|
"grad_norm": 9.812922477722168, |
|
"learning_rate": 0.00013718133718133719, |
|
"loss": 0.8107, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 5.900900900900901, |
|
"grad_norm": 18.993051528930664, |
|
"learning_rate": 0.00013670033670033668, |
|
"loss": 0.6686, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 5.915315315315315, |
|
"grad_norm": 24.841590881347656, |
|
"learning_rate": 0.0001362193362193362, |
|
"loss": 0.8777, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 5.92972972972973, |
|
"grad_norm": 12.165318489074707, |
|
"learning_rate": 0.00013573833573833573, |
|
"loss": 0.7149, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 5.944144144144144, |
|
"grad_norm": 25.776872634887695, |
|
"learning_rate": 0.00013525733525733525, |
|
"loss": 0.9527, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 5.9585585585585585, |
|
"grad_norm": 15.240096092224121, |
|
"learning_rate": 0.00013477633477633477, |
|
"loss": 0.7363, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 5.972972972972973, |
|
"grad_norm": 18.949817657470703, |
|
"learning_rate": 0.0001342953342953343, |
|
"loss": 0.8795, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 5.987387387387387, |
|
"grad_norm": 23.45053482055664, |
|
"learning_rate": 0.00013381433381433382, |
|
"loss": 0.7355, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 5.998918918918919, |
|
"eval_accuracy": 0.6154891304347826, |
|
"eval_loss": 1.9490801095962524, |
|
"eval_runtime": 540.4624, |
|
"eval_samples_per_second": 10.894, |
|
"eval_steps_per_second": 10.894, |
|
"step": 4158 |
|
}, |
|
{ |
|
"epoch": 6.002882882882883, |
|
"grad_norm": 19.96414566040039, |
|
"learning_rate": 0.0001333333333333333, |
|
"loss": 0.7705, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 6.017297297297297, |
|
"grad_norm": 12.935175895690918, |
|
"learning_rate": 0.00013285233285233284, |
|
"loss": 0.4507, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 6.031711711711711, |
|
"grad_norm": 18.57610511779785, |
|
"learning_rate": 0.00013237133237133236, |
|
"loss": 0.4772, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 6.0461261261261265, |
|
"grad_norm": 18.15093231201172, |
|
"learning_rate": 0.00013189033189033188, |
|
"loss": 0.4697, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 6.060540540540541, |
|
"grad_norm": 9.7061128616333, |
|
"learning_rate": 0.0001314093314093314, |
|
"loss": 0.3953, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 6.074954954954955, |
|
"grad_norm": 14.228235244750977, |
|
"learning_rate": 0.00013092833092833093, |
|
"loss": 0.4857, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 6.089369369369369, |
|
"grad_norm": 12.73335075378418, |
|
"learning_rate": 0.00013044733044733045, |
|
"loss": 0.2774, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 6.103783783783784, |
|
"grad_norm": 26.926279067993164, |
|
"learning_rate": 0.00012996632996632997, |
|
"loss": 0.4033, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 6.118198198198198, |
|
"grad_norm": 5.05507755279541, |
|
"learning_rate": 0.00012948532948532947, |
|
"loss": 0.379, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 6.132612612612613, |
|
"grad_norm": 13.0632905960083, |
|
"learning_rate": 0.000129004329004329, |
|
"loss": 0.5064, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 6.147027027027027, |
|
"grad_norm": 9.610346794128418, |
|
"learning_rate": 0.0001285233285233285, |
|
"loss": 0.5576, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 6.161441441441442, |
|
"grad_norm": 9.474533081054688, |
|
"learning_rate": 0.00012804232804232803, |
|
"loss": 0.4405, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 6.175855855855856, |
|
"grad_norm": 6.424566745758057, |
|
"learning_rate": 0.00012756132756132756, |
|
"loss": 0.4283, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 6.19027027027027, |
|
"grad_norm": 22.856693267822266, |
|
"learning_rate": 0.00012708032708032708, |
|
"loss": 0.5386, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 6.204684684684684, |
|
"grad_norm": 14.695728302001953, |
|
"learning_rate": 0.0001265993265993266, |
|
"loss": 0.4684, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 6.2190990990990995, |
|
"grad_norm": 12.434320449829102, |
|
"learning_rate": 0.0001261183261183261, |
|
"loss": 0.3499, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 6.233513513513514, |
|
"grad_norm": 3.9371864795684814, |
|
"learning_rate": 0.00012563732563732562, |
|
"loss": 0.4161, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 6.247927927927928, |
|
"grad_norm": 11.733071327209473, |
|
"learning_rate": 0.00012515632515632514, |
|
"loss": 0.4829, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 6.262342342342342, |
|
"grad_norm": 5.837855815887451, |
|
"learning_rate": 0.00012467532467532467, |
|
"loss": 0.5473, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 6.276756756756757, |
|
"grad_norm": 10.520476341247559, |
|
"learning_rate": 0.0001241943241943242, |
|
"loss": 0.432, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 6.291171171171171, |
|
"grad_norm": 14.354527473449707, |
|
"learning_rate": 0.0001237133237133237, |
|
"loss": 0.3837, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 6.305585585585586, |
|
"grad_norm": 24.440963745117188, |
|
"learning_rate": 0.00012323232323232323, |
|
"loss": 0.6812, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 21.688756942749023, |
|
"learning_rate": 0.00012275132275132273, |
|
"loss": 0.6889, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 6.3344144144144146, |
|
"grad_norm": 4.70493221282959, |
|
"learning_rate": 0.00012227032227032225, |
|
"loss": 0.4692, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 6.348828828828829, |
|
"grad_norm": 10.504195213317871, |
|
"learning_rate": 0.00012178932178932179, |
|
"loss": 0.3945, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 6.363243243243243, |
|
"grad_norm": 12.554998397827148, |
|
"learning_rate": 0.00012130832130832131, |
|
"loss": 0.4145, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 6.377657657657657, |
|
"grad_norm": 5.851123809814453, |
|
"learning_rate": 0.0001208273208273208, |
|
"loss": 0.3595, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 6.392072072072072, |
|
"grad_norm": 33.16427993774414, |
|
"learning_rate": 0.00012034632034632034, |
|
"loss": 0.5448, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 6.406486486486487, |
|
"grad_norm": 17.474634170532227, |
|
"learning_rate": 0.00011986531986531986, |
|
"loss": 0.4775, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 6.420900900900901, |
|
"grad_norm": 21.54201889038086, |
|
"learning_rate": 0.00011938431938431936, |
|
"loss": 0.4061, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 6.435315315315315, |
|
"grad_norm": 27.28333854675293, |
|
"learning_rate": 0.00011890331890331888, |
|
"loss": 0.41, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 6.44972972972973, |
|
"grad_norm": 31.519390106201172, |
|
"learning_rate": 0.00011842231842231842, |
|
"loss": 0.4323, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 6.464144144144144, |
|
"grad_norm": 18.609390258789062, |
|
"learning_rate": 0.00011794131794131794, |
|
"loss": 0.323, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 6.478558558558559, |
|
"grad_norm": 16.234210968017578, |
|
"learning_rate": 0.00011746031746031744, |
|
"loss": 0.3677, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 6.492972972972973, |
|
"grad_norm": 18.266056060791016, |
|
"learning_rate": 0.00011697931697931697, |
|
"loss": 0.4261, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 6.5073873873873875, |
|
"grad_norm": 13.765610694885254, |
|
"learning_rate": 0.0001164983164983165, |
|
"loss": 0.2749, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 6.521801801801802, |
|
"grad_norm": 19.466411590576172, |
|
"learning_rate": 0.00011601731601731602, |
|
"loss": 0.5191, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 6.536216216216216, |
|
"grad_norm": 5.606191635131836, |
|
"learning_rate": 0.00011553631553631553, |
|
"loss": 0.2674, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 6.55063063063063, |
|
"grad_norm": 21.999649047851562, |
|
"learning_rate": 0.00011505531505531505, |
|
"loss": 0.3778, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 6.565045045045045, |
|
"grad_norm": 5.735301494598389, |
|
"learning_rate": 0.00011457431457431457, |
|
"loss": 0.5567, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 6.57945945945946, |
|
"grad_norm": 10.661727905273438, |
|
"learning_rate": 0.00011409331409331408, |
|
"loss": 0.319, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 6.593873873873874, |
|
"grad_norm": 23.01692771911621, |
|
"learning_rate": 0.0001136123136123136, |
|
"loss": 0.4116, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 6.608288288288288, |
|
"grad_norm": 11.15292739868164, |
|
"learning_rate": 0.00011313131313131313, |
|
"loss": 0.395, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 6.622702702702703, |
|
"grad_norm": 15.197105407714844, |
|
"learning_rate": 0.00011265031265031265, |
|
"loss": 0.5435, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 6.637117117117117, |
|
"grad_norm": 23.04345703125, |
|
"learning_rate": 0.00011216931216931216, |
|
"loss": 0.4702, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 6.651531531531532, |
|
"grad_norm": 8.85188102722168, |
|
"learning_rate": 0.00011168831168831168, |
|
"loss": 0.3533, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 6.665945945945946, |
|
"grad_norm": 9.123584747314453, |
|
"learning_rate": 0.0001112073112073112, |
|
"loss": 0.4277, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 6.6803603603603605, |
|
"grad_norm": 8.331842422485352, |
|
"learning_rate": 0.00011072631072631073, |
|
"loss": 0.5292, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 6.694774774774775, |
|
"grad_norm": 12.688973426818848, |
|
"learning_rate": 0.00011024531024531024, |
|
"loss": 0.3495, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 6.709189189189189, |
|
"grad_norm": 22.717866897583008, |
|
"learning_rate": 0.00010976430976430976, |
|
"loss": 0.4317, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 6.723603603603603, |
|
"grad_norm": 22.28693962097168, |
|
"learning_rate": 0.00010928330928330928, |
|
"loss": 0.5334, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 6.738018018018018, |
|
"grad_norm": 18.496274948120117, |
|
"learning_rate": 0.00010880230880230879, |
|
"loss": 0.4481, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 6.752432432432433, |
|
"grad_norm": 22.91065216064453, |
|
"learning_rate": 0.00010832130832130831, |
|
"loss": 0.3546, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 6.766846846846847, |
|
"grad_norm": 24.638437271118164, |
|
"learning_rate": 0.00010784030784030783, |
|
"loss": 0.6028, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 6.781261261261261, |
|
"grad_norm": 12.158951759338379, |
|
"learning_rate": 0.00010735930735930736, |
|
"loss": 0.3595, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 6.7956756756756755, |
|
"grad_norm": 3.462782144546509, |
|
"learning_rate": 0.00010687830687830687, |
|
"loss": 0.3434, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 6.81009009009009, |
|
"grad_norm": 14.709941864013672, |
|
"learning_rate": 0.00010639730639730639, |
|
"loss": 0.3708, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 6.824504504504505, |
|
"grad_norm": 2.6258020401000977, |
|
"learning_rate": 0.00010591630591630591, |
|
"loss": 0.3561, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 6.838918918918919, |
|
"grad_norm": 4.584090709686279, |
|
"learning_rate": 0.00010543530543530543, |
|
"loss": 0.4685, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 6.8533333333333335, |
|
"grad_norm": 27.684444427490234, |
|
"learning_rate": 0.00010495430495430494, |
|
"loss": 0.2848, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 6.867747747747748, |
|
"grad_norm": 5.796729564666748, |
|
"learning_rate": 0.00010447330447330447, |
|
"loss": 0.3553, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 6.882162162162162, |
|
"grad_norm": 4.9681396484375, |
|
"learning_rate": 0.00010399230399230399, |
|
"loss": 0.3048, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 6.896576576576576, |
|
"grad_norm": 22.89188575744629, |
|
"learning_rate": 0.0001035113035113035, |
|
"loss": 0.6352, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 6.910990990990991, |
|
"grad_norm": 2.380059003829956, |
|
"learning_rate": 0.00010303030303030302, |
|
"loss": 0.4462, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 6.925405405405406, |
|
"grad_norm": 13.61782455444336, |
|
"learning_rate": 0.00010254930254930254, |
|
"loss": 0.4329, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 6.93981981981982, |
|
"grad_norm": 6.834221839904785, |
|
"learning_rate": 0.00010206830206830207, |
|
"loss": 0.2754, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 6.954234234234234, |
|
"grad_norm": 1.0478729009628296, |
|
"learning_rate": 0.00010158730158730157, |
|
"loss": 0.221, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 6.9686486486486485, |
|
"grad_norm": 8.622994422912598, |
|
"learning_rate": 0.0001011063011063011, |
|
"loss": 0.2593, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 6.983063063063063, |
|
"grad_norm": 22.14352035522461, |
|
"learning_rate": 0.00010062530062530062, |
|
"loss": 0.3164, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 6.997477477477478, |
|
"grad_norm": 8.023240089416504, |
|
"learning_rate": 0.00010014430014430014, |
|
"loss": 0.4492, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 6.998918918918919, |
|
"eval_accuracy": 0.6379076086956522, |
|
"eval_loss": 2.059363842010498, |
|
"eval_runtime": 537.7178, |
|
"eval_samples_per_second": 10.95, |
|
"eval_steps_per_second": 10.95, |
|
"step": 4851 |
|
}, |
|
{ |
|
"epoch": 7.012972972972973, |
|
"grad_norm": 14.681108474731445, |
|
"learning_rate": 9.966329966329965e-05, |
|
"loss": 0.2425, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 7.027387387387387, |
|
"grad_norm": 25.905927658081055, |
|
"learning_rate": 9.918229918229917e-05, |
|
"loss": 0.2949, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 7.041801801801801, |
|
"grad_norm": 2.836951971054077, |
|
"learning_rate": 9.87012987012987e-05, |
|
"loss": 0.1989, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 7.0562162162162165, |
|
"grad_norm": 1.04839026927948, |
|
"learning_rate": 9.82202982202982e-05, |
|
"loss": 0.1024, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 7.070630630630631, |
|
"grad_norm": 10.27518367767334, |
|
"learning_rate": 9.773929773929773e-05, |
|
"loss": 0.1522, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 7.085045045045045, |
|
"grad_norm": 15.933104515075684, |
|
"learning_rate": 9.725829725829725e-05, |
|
"loss": 0.145, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 7.099459459459459, |
|
"grad_norm": 18.11174201965332, |
|
"learning_rate": 9.677729677729677e-05, |
|
"loss": 0.1838, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 7.113873873873874, |
|
"grad_norm": 1.1443898677825928, |
|
"learning_rate": 9.629629629629628e-05, |
|
"loss": 0.1418, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 7.128288288288288, |
|
"grad_norm": 15.602287292480469, |
|
"learning_rate": 9.58152958152958e-05, |
|
"loss": 0.3214, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 7.142702702702703, |
|
"grad_norm": 16.450904846191406, |
|
"learning_rate": 9.533429533429533e-05, |
|
"loss": 0.1656, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 7.157117117117117, |
|
"grad_norm": 14.295945167541504, |
|
"learning_rate": 9.485329485329484e-05, |
|
"loss": 0.3092, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 7.1715315315315316, |
|
"grad_norm": 3.2762200832366943, |
|
"learning_rate": 9.437229437229436e-05, |
|
"loss": 0.0993, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 7.185945945945946, |
|
"grad_norm": 1.229925274848938, |
|
"learning_rate": 9.389129389129388e-05, |
|
"loss": 0.1636, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 7.20036036036036, |
|
"grad_norm": 8.866992950439453, |
|
"learning_rate": 9.34102934102934e-05, |
|
"loss": 0.1434, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 7.214774774774774, |
|
"grad_norm": 6.15886116027832, |
|
"learning_rate": 9.292929292929291e-05, |
|
"loss": 0.1759, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 7.2291891891891895, |
|
"grad_norm": 6.583317279815674, |
|
"learning_rate": 9.244829244829244e-05, |
|
"loss": 0.1752, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 7.243603603603604, |
|
"grad_norm": 13.805874824523926, |
|
"learning_rate": 9.196729196729196e-05, |
|
"loss": 0.1778, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 7.258018018018018, |
|
"grad_norm": 4.149932861328125, |
|
"learning_rate": 9.148629148629148e-05, |
|
"loss": 0.3115, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 7.272432432432432, |
|
"grad_norm": 13.87183666229248, |
|
"learning_rate": 9.100529100529099e-05, |
|
"loss": 0.1509, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 7.286846846846847, |
|
"grad_norm": 8.47652530670166, |
|
"learning_rate": 9.052429052429051e-05, |
|
"loss": 0.3549, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 7.301261261261261, |
|
"grad_norm": 9.171941757202148, |
|
"learning_rate": 9.004329004329004e-05, |
|
"loss": 0.1054, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 7.315675675675676, |
|
"grad_norm": 9.501484870910645, |
|
"learning_rate": 8.956228956228955e-05, |
|
"loss": 0.1728, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 7.33009009009009, |
|
"grad_norm": 0.5740847587585449, |
|
"learning_rate": 8.908128908128907e-05, |
|
"loss": 0.116, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 7.3445045045045045, |
|
"grad_norm": 2.0156924724578857, |
|
"learning_rate": 8.860028860028859e-05, |
|
"loss": 0.1889, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 7.358918918918919, |
|
"grad_norm": 4.784016132354736, |
|
"learning_rate": 8.811928811928811e-05, |
|
"loss": 0.2124, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 7.373333333333333, |
|
"grad_norm": 2.135333299636841, |
|
"learning_rate": 8.763828763828762e-05, |
|
"loss": 0.1885, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 7.387747747747747, |
|
"grad_norm": 13.758618354797363, |
|
"learning_rate": 8.715728715728714e-05, |
|
"loss": 0.2869, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 7.4021621621621625, |
|
"grad_norm": 10.508682250976562, |
|
"learning_rate": 8.667628667628667e-05, |
|
"loss": 0.09, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 7.416576576576577, |
|
"grad_norm": 8.677715301513672, |
|
"learning_rate": 8.619528619528619e-05, |
|
"loss": 0.1022, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 7.430990990990991, |
|
"grad_norm": 7.379012584686279, |
|
"learning_rate": 8.57142857142857e-05, |
|
"loss": 0.2095, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 7.445405405405405, |
|
"grad_norm": 16.449451446533203, |
|
"learning_rate": 8.523328523328522e-05, |
|
"loss": 0.1052, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 7.45981981981982, |
|
"grad_norm": 14.736000061035156, |
|
"learning_rate": 8.475228475228474e-05, |
|
"loss": 0.2009, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 7.474234234234234, |
|
"grad_norm": 3.677145004272461, |
|
"learning_rate": 8.427128427128425e-05, |
|
"loss": 0.1472, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 7.488648648648649, |
|
"grad_norm": 0.6532973051071167, |
|
"learning_rate": 8.379028379028378e-05, |
|
"loss": 0.1467, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 7.503063063063063, |
|
"grad_norm": 14.072589874267578, |
|
"learning_rate": 8.33092833092833e-05, |
|
"loss": 0.1727, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 7.5174774774774775, |
|
"grad_norm": 15.414175033569336, |
|
"learning_rate": 8.282828282828282e-05, |
|
"loss": 0.1885, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 7.531891891891892, |
|
"grad_norm": 2.108407735824585, |
|
"learning_rate": 8.234728234728233e-05, |
|
"loss": 0.1228, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 7.546306306306306, |
|
"grad_norm": 13.167756080627441, |
|
"learning_rate": 8.186628186628185e-05, |
|
"loss": 0.1511, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 7.56072072072072, |
|
"grad_norm": 12.300124168395996, |
|
"learning_rate": 8.138528138528138e-05, |
|
"loss": 0.1712, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 7.5751351351351355, |
|
"grad_norm": 4.797776222229004, |
|
"learning_rate": 8.09042809042809e-05, |
|
"loss": 0.1385, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 7.58954954954955, |
|
"grad_norm": 9.989211082458496, |
|
"learning_rate": 8.042328042328041e-05, |
|
"loss": 0.2256, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 7.603963963963964, |
|
"grad_norm": 21.55989646911621, |
|
"learning_rate": 7.994227994227993e-05, |
|
"loss": 0.2175, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 7.618378378378378, |
|
"grad_norm": 12.825868606567383, |
|
"learning_rate": 7.946127946127945e-05, |
|
"loss": 0.1561, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 7.6327927927927925, |
|
"grad_norm": 5.119826793670654, |
|
"learning_rate": 7.902837902837901e-05, |
|
"loss": 0.1237, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 7.647207207207208, |
|
"grad_norm": 8.325628280639648, |
|
"learning_rate": 7.854737854737855e-05, |
|
"loss": 0.3462, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 7.661621621621622, |
|
"grad_norm": 8.451800346374512, |
|
"learning_rate": 7.806637806637807e-05, |
|
"loss": 0.2437, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 7.676036036036036, |
|
"grad_norm": 9.6069974899292, |
|
"learning_rate": 7.758537758537757e-05, |
|
"loss": 0.1846, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 7.6904504504504505, |
|
"grad_norm": 14.663230895996094, |
|
"learning_rate": 7.71043771043771e-05, |
|
"loss": 0.2186, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 7.704864864864865, |
|
"grad_norm": 16.57319450378418, |
|
"learning_rate": 7.662337662337662e-05, |
|
"loss": 0.1133, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 7.719279279279279, |
|
"grad_norm": 10.028879165649414, |
|
"learning_rate": 7.614237614237615e-05, |
|
"loss": 0.1361, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 7.733693693693693, |
|
"grad_norm": 17.944252014160156, |
|
"learning_rate": 7.566137566137566e-05, |
|
"loss": 0.2533, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 7.7481081081081085, |
|
"grad_norm": 4.871366500854492, |
|
"learning_rate": 7.518037518037518e-05, |
|
"loss": 0.1396, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 7.762522522522523, |
|
"grad_norm": 5.787502765655518, |
|
"learning_rate": 7.469937469937469e-05, |
|
"loss": 0.3421, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 7.776936936936937, |
|
"grad_norm": 20.75065040588379, |
|
"learning_rate": 7.421837421837421e-05, |
|
"loss": 0.1679, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 7.791351351351351, |
|
"grad_norm": 16.226171493530273, |
|
"learning_rate": 7.373737373737373e-05, |
|
"loss": 0.2005, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 7.8057657657657655, |
|
"grad_norm": 1.3808518648147583, |
|
"learning_rate": 7.325637325637326e-05, |
|
"loss": 0.2236, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 7.82018018018018, |
|
"grad_norm": 5.49656343460083, |
|
"learning_rate": 7.277537277537277e-05, |
|
"loss": 0.2159, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 7.834594594594595, |
|
"grad_norm": 4.51519250869751, |
|
"learning_rate": 7.229437229437229e-05, |
|
"loss": 0.1601, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 7.849009009009009, |
|
"grad_norm": 3.9731264114379883, |
|
"learning_rate": 7.181337181337181e-05, |
|
"loss": 0.2402, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 7.8634234234234235, |
|
"grad_norm": 1.414002776145935, |
|
"learning_rate": 7.133237133237133e-05, |
|
"loss": 0.1709, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 7.877837837837838, |
|
"grad_norm": 3.847299575805664, |
|
"learning_rate": 7.085137085137084e-05, |
|
"loss": 0.2866, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 7.892252252252252, |
|
"grad_norm": 16.216571807861328, |
|
"learning_rate": 7.037037037037036e-05, |
|
"loss": 0.1026, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 7.906666666666666, |
|
"grad_norm": 1.87873113155365, |
|
"learning_rate": 6.988936988936989e-05, |
|
"loss": 0.1027, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 7.921081081081081, |
|
"grad_norm": 11.856677055358887, |
|
"learning_rate": 6.94083694083694e-05, |
|
"loss": 0.0807, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 7.935495495495496, |
|
"grad_norm": 1.2753289937973022, |
|
"learning_rate": 6.892736892736892e-05, |
|
"loss": 0.1885, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 7.94990990990991, |
|
"grad_norm": 5.382585048675537, |
|
"learning_rate": 6.844636844636844e-05, |
|
"loss": 0.1034, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 7.964324324324324, |
|
"grad_norm": 4.376471996307373, |
|
"learning_rate": 6.796536796536796e-05, |
|
"loss": 0.1051, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 7.9787387387387385, |
|
"grad_norm": 6.501208782196045, |
|
"learning_rate": 6.748436748436747e-05, |
|
"loss": 0.1589, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 7.993153153153153, |
|
"grad_norm": 7.671748161315918, |
|
"learning_rate": 6.7003367003367e-05, |
|
"loss": 0.1528, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 7.998918918918919, |
|
"eval_accuracy": 0.6402853260869565, |
|
"eval_loss": 2.1739323139190674, |
|
"eval_runtime": 537.1422, |
|
"eval_samples_per_second": 10.962, |
|
"eval_steps_per_second": 10.962, |
|
"step": 5544 |
|
}, |
|
{ |
|
"epoch": 8.008648648648649, |
|
"grad_norm": 0.7333820462226868, |
|
"learning_rate": 6.652236652236652e-05, |
|
"loss": 0.1737, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 8.023063063063063, |
|
"grad_norm": 1.1993273496627808, |
|
"learning_rate": 6.604136604136604e-05, |
|
"loss": 0.0923, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 8.037477477477477, |
|
"grad_norm": 18.680021286010742, |
|
"learning_rate": 6.556036556036555e-05, |
|
"loss": 0.1005, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 8.051891891891891, |
|
"grad_norm": 19.182872772216797, |
|
"learning_rate": 6.507936507936507e-05, |
|
"loss": 0.1297, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 8.066306306306306, |
|
"grad_norm": 2.575910806655884, |
|
"learning_rate": 6.45983645983646e-05, |
|
"loss": 0.049, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 8.08072072072072, |
|
"grad_norm": 1.0843993425369263, |
|
"learning_rate": 6.41173641173641e-05, |
|
"loss": 0.0646, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 8.095135135135136, |
|
"grad_norm": 0.35826346278190613, |
|
"learning_rate": 6.363636363636363e-05, |
|
"loss": 0.0356, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 8.10954954954955, |
|
"grad_norm": 1.4210469722747803, |
|
"learning_rate": 6.315536315536315e-05, |
|
"loss": 0.0329, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 8.123963963963964, |
|
"grad_norm": 8.666502952575684, |
|
"learning_rate": 6.267436267436267e-05, |
|
"loss": 0.0496, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 8.138378378378379, |
|
"grad_norm": 0.4810231328010559, |
|
"learning_rate": 6.219336219336218e-05, |
|
"loss": 0.0276, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 8.152792792792793, |
|
"grad_norm": 5.4928789138793945, |
|
"learning_rate": 6.17123617123617e-05, |
|
"loss": 0.0692, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 8.167207207207207, |
|
"grad_norm": 5.067449569702148, |
|
"learning_rate": 6.123136123136123e-05, |
|
"loss": 0.058, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 8.181621621621622, |
|
"grad_norm": 25.670732498168945, |
|
"learning_rate": 6.075036075036074e-05, |
|
"loss": 0.1061, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 8.196036036036036, |
|
"grad_norm": 6.106614589691162, |
|
"learning_rate": 6.0269360269360265e-05, |
|
"loss": 0.0554, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 8.21045045045045, |
|
"grad_norm": 7.492941379547119, |
|
"learning_rate": 5.978835978835978e-05, |
|
"loss": 0.0667, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 8.224864864864864, |
|
"grad_norm": 1.3118231296539307, |
|
"learning_rate": 5.9307359307359304e-05, |
|
"loss": 0.0388, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 8.239279279279279, |
|
"grad_norm": 4.273688316345215, |
|
"learning_rate": 5.882635882635882e-05, |
|
"loss": 0.047, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 8.253693693693693, |
|
"grad_norm": 2.6258041858673096, |
|
"learning_rate": 5.834535834535834e-05, |
|
"loss": 0.0652, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 8.268108108108109, |
|
"grad_norm": 5.456060886383057, |
|
"learning_rate": 5.786435786435786e-05, |
|
"loss": 0.1954, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 8.282522522522523, |
|
"grad_norm": 3.158957004547119, |
|
"learning_rate": 5.738335738335738e-05, |
|
"loss": 0.0662, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 8.296936936936937, |
|
"grad_norm": 3.201091766357422, |
|
"learning_rate": 5.6902356902356896e-05, |
|
"loss": 0.199, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 8.311351351351352, |
|
"grad_norm": 1.514101505279541, |
|
"learning_rate": 5.642135642135642e-05, |
|
"loss": 0.1082, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 8.325765765765766, |
|
"grad_norm": 0.24764111638069153, |
|
"learning_rate": 5.5940355940355935e-05, |
|
"loss": 0.0607, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 8.34018018018018, |
|
"grad_norm": 1.5579568147659302, |
|
"learning_rate": 5.545935545935545e-05, |
|
"loss": 0.0205, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 8.354594594594595, |
|
"grad_norm": 9.406379699707031, |
|
"learning_rate": 5.497835497835497e-05, |
|
"loss": 0.0614, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 8.369009009009009, |
|
"grad_norm": 3.4456870555877686, |
|
"learning_rate": 5.449735449735449e-05, |
|
"loss": 0.0169, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 8.383423423423423, |
|
"grad_norm": 0.3121024966239929, |
|
"learning_rate": 5.401635401635401e-05, |
|
"loss": 0.078, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 8.397837837837837, |
|
"grad_norm": 7.2323832511901855, |
|
"learning_rate": 5.353535353535353e-05, |
|
"loss": 0.0794, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 8.412252252252252, |
|
"grad_norm": 0.42312678694725037, |
|
"learning_rate": 5.305435305435305e-05, |
|
"loss": 0.0229, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 8.426666666666666, |
|
"grad_norm": 1.5303746461868286, |
|
"learning_rate": 5.2573352573352566e-05, |
|
"loss": 0.0555, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 8.441081081081082, |
|
"grad_norm": 0.5218743681907654, |
|
"learning_rate": 5.209235209235209e-05, |
|
"loss": 0.097, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 8.455495495495496, |
|
"grad_norm": 3.4224956035614014, |
|
"learning_rate": 5.1611351611351604e-05, |
|
"loss": 0.0415, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 8.46990990990991, |
|
"grad_norm": 0.56160569190979, |
|
"learning_rate": 5.113035113035113e-05, |
|
"loss": 0.0476, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 8.484324324324325, |
|
"grad_norm": 2.77597975730896, |
|
"learning_rate": 5.064935064935064e-05, |
|
"loss": 0.0231, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 8.498738738738739, |
|
"grad_norm": 2.240520477294922, |
|
"learning_rate": 5.016835016835016e-05, |
|
"loss": 0.051, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 8.513153153153153, |
|
"grad_norm": 1.585841178894043, |
|
"learning_rate": 4.968734968734968e-05, |
|
"loss": 0.0575, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 8.527567567567568, |
|
"grad_norm": 12.269892692565918, |
|
"learning_rate": 4.92063492063492e-05, |
|
"loss": 0.0419, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 8.541981981981982, |
|
"grad_norm": 4.764209747314453, |
|
"learning_rate": 4.872534872534872e-05, |
|
"loss": 0.1574, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 8.556396396396396, |
|
"grad_norm": 6.484140396118164, |
|
"learning_rate": 4.8244348244348236e-05, |
|
"loss": 0.0667, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 8.57081081081081, |
|
"grad_norm": 8.274352073669434, |
|
"learning_rate": 4.7763347763347765e-05, |
|
"loss": 0.1035, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 8.585225225225225, |
|
"grad_norm": 18.833515167236328, |
|
"learning_rate": 4.7282347282347274e-05, |
|
"loss": 0.0372, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 8.599639639639639, |
|
"grad_norm": 4.068152904510498, |
|
"learning_rate": 4.68013468013468e-05, |
|
"loss": 0.0689, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 8.614054054054055, |
|
"grad_norm": 4.497600078582764, |
|
"learning_rate": 4.632034632034632e-05, |
|
"loss": 0.0501, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 8.62846846846847, |
|
"grad_norm": 1.556960940361023, |
|
"learning_rate": 4.583934583934583e-05, |
|
"loss": 0.0988, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 8.642882882882883, |
|
"grad_norm": 14.646133422851562, |
|
"learning_rate": 4.535834535834536e-05, |
|
"loss": 0.055, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 8.657297297297298, |
|
"grad_norm": 0.7149348258972168, |
|
"learning_rate": 4.4877344877344874e-05, |
|
"loss": 0.0471, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 8.671711711711712, |
|
"grad_norm": 0.4112788438796997, |
|
"learning_rate": 4.4396344396344396e-05, |
|
"loss": 0.0755, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 8.686126126126126, |
|
"grad_norm": 0.7935078740119934, |
|
"learning_rate": 4.391534391534391e-05, |
|
"loss": 0.0194, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 8.70054054054054, |
|
"grad_norm": 2.739198684692383, |
|
"learning_rate": 4.3434343434343435e-05, |
|
"loss": 0.0313, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 8.714954954954955, |
|
"grad_norm": 1.197202444076538, |
|
"learning_rate": 4.295334295334295e-05, |
|
"loss": 0.0473, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 8.729369369369369, |
|
"grad_norm": 2.7497189044952393, |
|
"learning_rate": 4.247234247234247e-05, |
|
"loss": 0.0168, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 8.743783783783783, |
|
"grad_norm": 22.05868911743164, |
|
"learning_rate": 4.199134199134199e-05, |
|
"loss": 0.0741, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 8.758198198198198, |
|
"grad_norm": 2.2377078533172607, |
|
"learning_rate": 4.151034151034151e-05, |
|
"loss": 0.0413, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 8.772612612612612, |
|
"grad_norm": 1.0943878889083862, |
|
"learning_rate": 4.102934102934103e-05, |
|
"loss": 0.0475, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 8.787027027027026, |
|
"grad_norm": 1.7506133317947388, |
|
"learning_rate": 4.054834054834054e-05, |
|
"loss": 0.0188, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 8.801441441441442, |
|
"grad_norm": 2.1582717895507812, |
|
"learning_rate": 4.0067340067340066e-05, |
|
"loss": 0.0407, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 8.815855855855856, |
|
"grad_norm": 13.355046272277832, |
|
"learning_rate": 3.958633958633958e-05, |
|
"loss": 0.1049, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 8.83027027027027, |
|
"grad_norm": 3.4152133464813232, |
|
"learning_rate": 3.9105339105339104e-05, |
|
"loss": 0.0346, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 8.844684684684685, |
|
"grad_norm": 0.4933088421821594, |
|
"learning_rate": 3.862433862433862e-05, |
|
"loss": 0.1112, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 8.8590990990991, |
|
"grad_norm": 12.00542163848877, |
|
"learning_rate": 3.814333814333814e-05, |
|
"loss": 0.0318, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 8.873513513513513, |
|
"grad_norm": 9.061931610107422, |
|
"learning_rate": 3.766233766233766e-05, |
|
"loss": 0.0962, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 8.887927927927928, |
|
"grad_norm": 0.15183605253696442, |
|
"learning_rate": 3.7181337181337174e-05, |
|
"loss": 0.093, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 8.902342342342342, |
|
"grad_norm": 5.919425010681152, |
|
"learning_rate": 3.67003367003367e-05, |
|
"loss": 0.0287, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 8.916756756756756, |
|
"grad_norm": 6.494754791259766, |
|
"learning_rate": 3.621933621933621e-05, |
|
"loss": 0.0287, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 8.93117117117117, |
|
"grad_norm": 3.5904083251953125, |
|
"learning_rate": 3.5738335738335735e-05, |
|
"loss": 0.0247, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 8.945585585585585, |
|
"grad_norm": 5.52282190322876, |
|
"learning_rate": 3.525733525733526e-05, |
|
"loss": 0.0644, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"grad_norm": 3.505472183227539, |
|
"learning_rate": 3.4776334776334774e-05, |
|
"loss": 0.0133, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 8.974414414414415, |
|
"grad_norm": 0.13238631188869476, |
|
"learning_rate": 3.4295334295334296e-05, |
|
"loss": 0.0294, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 8.98882882882883, |
|
"grad_norm": 1.1236836910247803, |
|
"learning_rate": 3.381433381433381e-05, |
|
"loss": 0.0468, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 8.99891891891892, |
|
"eval_accuracy": 0.6504755434782609, |
|
"eval_loss": 2.3125061988830566, |
|
"eval_runtime": 539.1351, |
|
"eval_samples_per_second": 10.921, |
|
"eval_steps_per_second": 10.921, |
|
"step": 6237 |
|
}, |
|
{ |
|
"epoch": 9.004324324324324, |
|
"grad_norm": 1.5750885009765625, |
|
"learning_rate": 3.333333333333333e-05, |
|
"loss": 0.0234, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 9.018738738738739, |
|
"grad_norm": 0.3882788121700287, |
|
"learning_rate": 3.285233285233285e-05, |
|
"loss": 0.0151, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 9.033153153153153, |
|
"grad_norm": 0.2824605107307434, |
|
"learning_rate": 3.2371332371332367e-05, |
|
"loss": 0.0045, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 9.047567567567567, |
|
"grad_norm": 0.8951876759529114, |
|
"learning_rate": 3.189033189033189e-05, |
|
"loss": 0.0058, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 9.061981981981981, |
|
"grad_norm": 0.6100791096687317, |
|
"learning_rate": 3.1409331409331405e-05, |
|
"loss": 0.0148, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 9.076396396396396, |
|
"grad_norm": 8.918787002563477, |
|
"learning_rate": 3.092833092833093e-05, |
|
"loss": 0.0175, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 9.090810810810812, |
|
"grad_norm": 0.46548986434936523, |
|
"learning_rate": 3.0447330447330447e-05, |
|
"loss": 0.006, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 9.105225225225226, |
|
"grad_norm": 2.6482155323028564, |
|
"learning_rate": 2.9966329966329966e-05, |
|
"loss": 0.0089, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 9.11963963963964, |
|
"grad_norm": 0.44524553418159485, |
|
"learning_rate": 2.9485329485329485e-05, |
|
"loss": 0.0063, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 9.134054054054054, |
|
"grad_norm": 1.2146574258804321, |
|
"learning_rate": 2.9004329004329005e-05, |
|
"loss": 0.0065, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 9.148468468468469, |
|
"grad_norm": 5.5731201171875, |
|
"learning_rate": 2.852332852332852e-05, |
|
"loss": 0.017, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 9.162882882882883, |
|
"grad_norm": 1.0001026391983032, |
|
"learning_rate": 2.804232804232804e-05, |
|
"loss": 0.0095, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 9.177297297297297, |
|
"grad_norm": 0.22491152584552765, |
|
"learning_rate": 2.756132756132756e-05, |
|
"loss": 0.0301, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 9.191711711711712, |
|
"grad_norm": 0.5325976610183716, |
|
"learning_rate": 2.7080327080327078e-05, |
|
"loss": 0.0296, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 9.206126126126126, |
|
"grad_norm": 0.44546425342559814, |
|
"learning_rate": 2.6599326599326597e-05, |
|
"loss": 0.0056, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 9.22054054054054, |
|
"grad_norm": 3.602013349533081, |
|
"learning_rate": 2.6118326118326117e-05, |
|
"loss": 0.014, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 9.234954954954954, |
|
"grad_norm": 0.4638885259628296, |
|
"learning_rate": 2.5637325637325636e-05, |
|
"loss": 0.01, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 9.249369369369369, |
|
"grad_norm": 0.21774759888648987, |
|
"learning_rate": 2.5156325156325155e-05, |
|
"loss": 0.0543, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 9.263783783783785, |
|
"grad_norm": 0.2262602001428604, |
|
"learning_rate": 2.4675324675324674e-05, |
|
"loss": 0.0086, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 9.278198198198199, |
|
"grad_norm": 1.7811743021011353, |
|
"learning_rate": 2.4194324194324193e-05, |
|
"loss": 0.0109, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 9.292612612612613, |
|
"grad_norm": 1.6832902431488037, |
|
"learning_rate": 2.371332371332371e-05, |
|
"loss": 0.0076, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 9.307027027027027, |
|
"grad_norm": 0.11599577963352203, |
|
"learning_rate": 2.323232323232323e-05, |
|
"loss": 0.0065, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 9.321441441441442, |
|
"grad_norm": 0.049297433346509933, |
|
"learning_rate": 2.2751322751322748e-05, |
|
"loss": 0.0094, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 9.335855855855856, |
|
"grad_norm": 0.6120862364768982, |
|
"learning_rate": 2.2270322270322267e-05, |
|
"loss": 0.0065, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 9.35027027027027, |
|
"grad_norm": 0.24179236590862274, |
|
"learning_rate": 2.1789321789321786e-05, |
|
"loss": 0.0156, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 9.364684684684685, |
|
"grad_norm": 1.3065845966339111, |
|
"learning_rate": 2.1308321308321305e-05, |
|
"loss": 0.0114, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 9.379099099099099, |
|
"grad_norm": 1.4051166772842407, |
|
"learning_rate": 2.0827320827320825e-05, |
|
"loss": 0.005, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 9.393513513513513, |
|
"grad_norm": 1.3191016912460327, |
|
"learning_rate": 2.0346320346320344e-05, |
|
"loss": 0.0079, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 9.407927927927927, |
|
"grad_norm": 0.15781471133232117, |
|
"learning_rate": 1.9865319865319863e-05, |
|
"loss": 0.0144, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 9.422342342342342, |
|
"grad_norm": 0.2565706968307495, |
|
"learning_rate": 1.9384319384319386e-05, |
|
"loss": 0.0338, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 9.436756756756758, |
|
"grad_norm": 0.3341190814971924, |
|
"learning_rate": 1.8903318903318905e-05, |
|
"loss": 0.0105, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 9.451171171171172, |
|
"grad_norm": 0.5033118724822998, |
|
"learning_rate": 1.842231842231842e-05, |
|
"loss": 0.0568, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 9.465585585585586, |
|
"grad_norm": 1.653732419013977, |
|
"learning_rate": 1.794131794131794e-05, |
|
"loss": 0.0084, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 9.48, |
|
"grad_norm": 11.09926700592041, |
|
"learning_rate": 1.746031746031746e-05, |
|
"loss": 0.0144, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 9.494414414414415, |
|
"grad_norm": 0.14694152772426605, |
|
"learning_rate": 1.697931697931698e-05, |
|
"loss": 0.0047, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 9.508828828828829, |
|
"grad_norm": 0.05755695700645447, |
|
"learning_rate": 1.6498316498316498e-05, |
|
"loss": 0.0096, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 9.523243243243243, |
|
"grad_norm": 0.30771782994270325, |
|
"learning_rate": 1.6017316017316017e-05, |
|
"loss": 0.0143, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 9.537657657657657, |
|
"grad_norm": 0.2555331885814667, |
|
"learning_rate": 1.5536315536315536e-05, |
|
"loss": 0.0152, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 9.552072072072072, |
|
"grad_norm": 0.45528095960617065, |
|
"learning_rate": 1.5055315055315054e-05, |
|
"loss": 0.0055, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 9.566486486486486, |
|
"grad_norm": 1.118922472000122, |
|
"learning_rate": 1.4574314574314573e-05, |
|
"loss": 0.019, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 9.5809009009009, |
|
"grad_norm": 0.5122382044792175, |
|
"learning_rate": 1.4093314093314092e-05, |
|
"loss": 0.0534, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 9.595315315315315, |
|
"grad_norm": 0.18795226514339447, |
|
"learning_rate": 1.3612313612313611e-05, |
|
"loss": 0.0247, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 9.609729729729729, |
|
"grad_norm": 1.0938136577606201, |
|
"learning_rate": 1.313131313131313e-05, |
|
"loss": 0.0062, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 9.624144144144145, |
|
"grad_norm": 0.13021990656852722, |
|
"learning_rate": 1.265031265031265e-05, |
|
"loss": 0.0052, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 9.63855855855856, |
|
"grad_norm": 1.0237598419189453, |
|
"learning_rate": 1.2169312169312167e-05, |
|
"loss": 0.0106, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 9.652972972972973, |
|
"grad_norm": 0.8002647161483765, |
|
"learning_rate": 1.1688311688311687e-05, |
|
"loss": 0.0051, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 9.667387387387388, |
|
"grad_norm": 0.5976181030273438, |
|
"learning_rate": 1.1207311207311206e-05, |
|
"loss": 0.0026, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 9.681801801801802, |
|
"grad_norm": 0.4594089388847351, |
|
"learning_rate": 1.0726310726310727e-05, |
|
"loss": 0.0045, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 9.696216216216216, |
|
"grad_norm": 0.6820192933082581, |
|
"learning_rate": 1.0245310245310246e-05, |
|
"loss": 0.005, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 9.71063063063063, |
|
"grad_norm": 0.21790215373039246, |
|
"learning_rate": 9.764309764309763e-06, |
|
"loss": 0.0093, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 9.725045045045045, |
|
"grad_norm": 3.2225234508514404, |
|
"learning_rate": 9.283309283309283e-06, |
|
"loss": 0.008, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 9.739459459459459, |
|
"grad_norm": 2.9584898948669434, |
|
"learning_rate": 8.802308802308802e-06, |
|
"loss": 0.032, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 9.753873873873873, |
|
"grad_norm": 0.250264972448349, |
|
"learning_rate": 8.321308321308321e-06, |
|
"loss": 0.0075, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 9.768288288288288, |
|
"grad_norm": 14.774813652038574, |
|
"learning_rate": 7.840307840307839e-06, |
|
"loss": 0.0137, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 9.782702702702704, |
|
"grad_norm": 16.798877716064453, |
|
"learning_rate": 7.359307359307359e-06, |
|
"loss": 0.0144, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 9.797117117117118, |
|
"grad_norm": 0.39727962017059326, |
|
"learning_rate": 6.878306878306877e-06, |
|
"loss": 0.0068, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 9.811531531531532, |
|
"grad_norm": 0.6047233939170837, |
|
"learning_rate": 6.397306397306397e-06, |
|
"loss": 0.0046, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 9.825945945945946, |
|
"grad_norm": 0.6603574752807617, |
|
"learning_rate": 5.916305916305916e-06, |
|
"loss": 0.0058, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 9.84036036036036, |
|
"grad_norm": 0.07351452112197876, |
|
"learning_rate": 5.435305435305435e-06, |
|
"loss": 0.007, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 9.854774774774775, |
|
"grad_norm": 0.48447152972221375, |
|
"learning_rate": 4.954304954304954e-06, |
|
"loss": 0.0059, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 9.86918918918919, |
|
"grad_norm": 0.12311412394046783, |
|
"learning_rate": 4.473304473304473e-06, |
|
"loss": 0.0151, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 9.883603603603603, |
|
"grad_norm": 0.08983255177736282, |
|
"learning_rate": 3.992303992303992e-06, |
|
"loss": 0.0072, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 9.898018018018018, |
|
"grad_norm": 0.78732830286026, |
|
"learning_rate": 3.511303511303511e-06, |
|
"loss": 0.0613, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 9.912432432432432, |
|
"grad_norm": 0.09099213033914566, |
|
"learning_rate": 3.0303030303030305e-06, |
|
"loss": 0.0064, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 9.926846846846846, |
|
"grad_norm": 0.3043908476829529, |
|
"learning_rate": 2.5493025493025493e-06, |
|
"loss": 0.0062, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 9.94126126126126, |
|
"grad_norm": 0.16236887872219086, |
|
"learning_rate": 2.068302068302068e-06, |
|
"loss": 0.007, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 9.955675675675675, |
|
"grad_norm": 1.6547272205352783, |
|
"learning_rate": 1.587301587301587e-06, |
|
"loss": 0.0141, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 9.97009009009009, |
|
"grad_norm": 14.645796775817871, |
|
"learning_rate": 1.1063011063011063e-06, |
|
"loss": 0.0272, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 9.984504504504505, |
|
"grad_norm": 0.29277849197387695, |
|
"learning_rate": 6.253006253006252e-07, |
|
"loss": 0.0082, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 9.99891891891892, |
|
"grad_norm": 0.7076464891433716, |
|
"learning_rate": 1.4430014430014428e-07, |
|
"loss": 0.0045, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 9.99891891891892, |
|
"eval_accuracy": 0.6554008152173914, |
|
"eval_loss": 2.2544686794281006, |
|
"eval_runtime": 539.1275, |
|
"eval_samples_per_second": 10.921, |
|
"eval_steps_per_second": 10.921, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 9.99891891891892, |
|
"step": 6930, |
|
"total_flos": 3.884969846408101e+18, |
|
"train_loss": 2.9567832476562925, |
|
"train_runtime": 60026.293, |
|
"train_samples_per_second": 3.697, |
|
"train_steps_per_second": 0.115 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 6930, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.884969846408101e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|