|
{ |
|
"best_metric": 1.6068978309631348, |
|
"best_model_checkpoint": "/home/nlplab5/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/checkpoint-12420", |
|
"epoch": 9.049218813243746, |
|
"eval_steps": 90, |
|
"global_step": 12420, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.2670376300811768, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 7.2579, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.1587345600128174, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 7.2077, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.9462554454803467, |
|
"learning_rate": 1.25e-05, |
|
"loss": 7.1099, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.7092673778533936, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 6.9866, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.605360269546509, |
|
"learning_rate": 2.0833333333333333e-05, |
|
"loss": 6.87, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.5747287273406982, |
|
"learning_rate": 2.5e-05, |
|
"loss": 6.7736, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.551903247833252, |
|
"learning_rate": 2.9166666666666666e-05, |
|
"loss": 6.6903, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.5231289863586426, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 6.6151, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.503075361251831, |
|
"learning_rate": 3.75e-05, |
|
"loss": 6.5462, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_accuracy": 0.2242063046599867, |
|
"eval_loss": 6.500818729400635, |
|
"eval_runtime": 1083.9947, |
|
"eval_samples_per_second": 460.679, |
|
"eval_steps_per_second": 2.399, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.4953837394714355, |
|
"learning_rate": 4.1666666666666665e-05, |
|
"loss": 6.4802, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.4765946865081787, |
|
"learning_rate": 4.5833333333333334e-05, |
|
"loss": 6.4127, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.4805634021759033, |
|
"learning_rate": 5e-05, |
|
"loss": 6.3425, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.4720654487609863, |
|
"learning_rate": 5.416666666666667e-05, |
|
"loss": 6.2706, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.454899787902832, |
|
"learning_rate": 5.833333333333333e-05, |
|
"loss": 6.1941, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.4357142448425293, |
|
"learning_rate": 6.25e-05, |
|
"loss": 6.1169, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.4193003177642822, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 6.0351, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.396649122238159, |
|
"learning_rate": 7.083333333333334e-05, |
|
"loss": 5.9532, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.3566408157348633, |
|
"learning_rate": 7.5e-05, |
|
"loss": 5.8695, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_accuracy": 0.22404351306840548, |
|
"eval_loss": 5.819457530975342, |
|
"eval_runtime": 1077.6772, |
|
"eval_samples_per_second": 463.38, |
|
"eval_steps_per_second": 2.414, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.310816526412964, |
|
"learning_rate": 7.916666666666666e-05, |
|
"loss": 5.7876, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.2797272205352783, |
|
"learning_rate": 8.333333333333333e-05, |
|
"loss": 5.706, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.2188355922698975, |
|
"learning_rate": 8.75e-05, |
|
"loss": 5.6255, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.142122745513916, |
|
"learning_rate": 9.166666666666667e-05, |
|
"loss": 5.5471, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.069880485534668, |
|
"learning_rate": 9.583333333333334e-05, |
|
"loss": 5.4704, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.957664966583252, |
|
"learning_rate": 0.0001, |
|
"loss": 5.4003, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.8441264629364014, |
|
"learning_rate": 0.00010416666666666667, |
|
"loss": 5.3306, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.722961664199829, |
|
"learning_rate": 0.00010833333333333334, |
|
"loss": 5.2648, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.5622942447662354, |
|
"learning_rate": 0.00011250000000000001, |
|
"loss": 5.2004, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_accuracy": 0.22578753539560809, |
|
"eval_loss": 5.162991046905518, |
|
"eval_runtime": 1079.7321, |
|
"eval_samples_per_second": 462.498, |
|
"eval_steps_per_second": 2.409, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.4034879207611084, |
|
"learning_rate": 0.00011666666666666667, |
|
"loss": 5.144, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.2136635780334473, |
|
"learning_rate": 0.00012083333333333333, |
|
"loss": 5.087, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.9789605140686035, |
|
"learning_rate": 0.000125, |
|
"loss": 5.0347, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7709304094314575, |
|
"learning_rate": 0.00012916666666666667, |
|
"loss": 4.9873, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5693560838699341, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 4.95, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.42085811495780945, |
|
"learning_rate": 0.0001375, |
|
"loss": 4.9181, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.3126681447029114, |
|
"learning_rate": 0.00014166666666666668, |
|
"loss": 4.8959, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.24236658215522766, |
|
"learning_rate": 0.00014583333333333335, |
|
"loss": 4.876, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.23320983350276947, |
|
"learning_rate": 0.00015, |
|
"loss": 4.8607, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_accuracy": 0.2620490039433166, |
|
"eval_loss": 4.841182231903076, |
|
"eval_runtime": 1079.5667, |
|
"eval_samples_per_second": 462.569, |
|
"eval_steps_per_second": 2.409, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.23191139101982117, |
|
"learning_rate": 0.00015416666666666668, |
|
"loss": 4.846, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.2323000133037567, |
|
"learning_rate": 0.00015833333333333332, |
|
"loss": 4.8304, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.2213001251220703, |
|
"learning_rate": 0.00016250000000000002, |
|
"loss": 4.816, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.21700094640254974, |
|
"learning_rate": 0.00016666666666666666, |
|
"loss": 4.8044, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.27367648482322693, |
|
"learning_rate": 0.00017083333333333333, |
|
"loss": 4.787, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.3331514000892639, |
|
"learning_rate": 0.000175, |
|
"loss": 4.775, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.3531811833381653, |
|
"learning_rate": 0.00017916666666666667, |
|
"loss": 4.7599, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.17425844073295593, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 4.7471, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.16081774234771729, |
|
"learning_rate": 0.0001875, |
|
"loss": 4.732, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_accuracy": 0.2854636107287403, |
|
"eval_loss": 4.713276386260986, |
|
"eval_runtime": 1135.1993, |
|
"eval_samples_per_second": 439.9, |
|
"eval_steps_per_second": 2.291, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5644450783729553, |
|
"learning_rate": 0.00019166666666666667, |
|
"loss": 4.7196, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.4822804629802704, |
|
"learning_rate": 0.00019583333333333334, |
|
"loss": 4.7067, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.26140686869621277, |
|
"learning_rate": 0.0002, |
|
"loss": 4.6939, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.24341146647930145, |
|
"learning_rate": 0.00020416666666666668, |
|
"loss": 4.6797, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.1883888840675354, |
|
"learning_rate": 0.00020833333333333335, |
|
"loss": 4.667, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.32793405652046204, |
|
"learning_rate": 0.0002125, |
|
"loss": 4.6568, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6900771856307983, |
|
"learning_rate": 0.00021666666666666668, |
|
"loss": 4.6504, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.23155897855758667, |
|
"learning_rate": 0.00022083333333333333, |
|
"loss": 4.6371, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.15708310902118683, |
|
"learning_rate": 0.00022500000000000002, |
|
"loss": 4.6273, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_accuracy": 0.2875459051329938, |
|
"eval_loss": 4.6114654541015625, |
|
"eval_runtime": 1075.2001, |
|
"eval_samples_per_second": 464.447, |
|
"eval_steps_per_second": 2.419, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.14138343930244446, |
|
"learning_rate": 0.00022916666666666666, |
|
"loss": 4.6187, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.10016636550426483, |
|
"learning_rate": 0.00023333333333333333, |
|
"loss": 4.6101, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.09886801242828369, |
|
"learning_rate": 0.0002375, |
|
"loss": 4.6027, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.9051061272621155, |
|
"learning_rate": 0.00024166666666666667, |
|
"loss": 4.5993, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8797232508659363, |
|
"learning_rate": 0.0002458333333333333, |
|
"loss": 4.5982, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.4441538453102112, |
|
"learning_rate": 0.00025, |
|
"loss": 4.5868, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.24725468456745148, |
|
"learning_rate": 0.00025416666666666665, |
|
"loss": 4.5836, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.08581159263849258, |
|
"learning_rate": 0.00025833333333333334, |
|
"loss": 4.5773, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.10642833262681961, |
|
"learning_rate": 0.00026250000000000004, |
|
"loss": 4.572, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_accuracy": 0.28878817310094446, |
|
"eval_loss": 4.563485622406006, |
|
"eval_runtime": 1076.0022, |
|
"eval_samples_per_second": 464.101, |
|
"eval_steps_per_second": 2.417, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.17463397979736328, |
|
"learning_rate": 0.0002666666666666667, |
|
"loss": 4.5707, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.1806878000497818, |
|
"learning_rate": 0.0002708333333333333, |
|
"loss": 4.5681, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.18553458154201508, |
|
"learning_rate": 0.000275, |
|
"loss": 4.5612, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7338638305664062, |
|
"learning_rate": 0.00027916666666666666, |
|
"loss": 4.5601, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.3852124512195587, |
|
"learning_rate": 0.00028333333333333335, |
|
"loss": 4.5567, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.2558722198009491, |
|
"learning_rate": 0.0002875, |
|
"loss": 4.555, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.1996091902256012, |
|
"learning_rate": 0.0002916666666666667, |
|
"loss": 4.553, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.414126992225647, |
|
"learning_rate": 0.00029583333333333333, |
|
"loss": 4.551, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6828728914260864, |
|
"learning_rate": 0.0003, |
|
"loss": 4.5485, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_accuracy": 0.2897431184302517, |
|
"eval_loss": 4.544471263885498, |
|
"eval_runtime": 1075.524, |
|
"eval_samples_per_second": 464.308, |
|
"eval_steps_per_second": 2.418, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.31978073716163635, |
|
"learning_rate": 0.00030416666666666667, |
|
"loss": 4.5478, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.23724275827407837, |
|
"learning_rate": 0.00030833333333333337, |
|
"loss": 4.5451, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0507104396820068, |
|
"learning_rate": 0.0003125, |
|
"loss": 4.5441, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.38554638624191284, |
|
"learning_rate": 0.00031666666666666665, |
|
"loss": 4.5433, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.275704562664032, |
|
"learning_rate": 0.00032083333333333334, |
|
"loss": 4.5415, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.2305123656988144, |
|
"learning_rate": 0.00032500000000000004, |
|
"loss": 4.5398, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.4813285768032074, |
|
"learning_rate": 0.0003291666666666667, |
|
"loss": 4.5391, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6520434617996216, |
|
"learning_rate": 0.0003333333333333333, |
|
"loss": 4.5361, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.3875904381275177, |
|
"learning_rate": 0.0003375, |
|
"loss": 4.5351, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_accuracy": 0.29011102141892453, |
|
"eval_loss": 4.53138542175293, |
|
"eval_runtime": 1074.6899, |
|
"eval_samples_per_second": 464.668, |
|
"eval_steps_per_second": 2.42, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6874526143074036, |
|
"learning_rate": 0.00034166666666666666, |
|
"loss": 4.5338, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.4180966019630432, |
|
"learning_rate": 0.00034583333333333335, |
|
"loss": 4.5331, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.41761451959609985, |
|
"learning_rate": 0.00035, |
|
"loss": 4.5291, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6088799834251404, |
|
"learning_rate": 0.0003541666666666667, |
|
"loss": 4.5303, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.20438095927238464, |
|
"learning_rate": 0.00035833333333333333, |
|
"loss": 4.5298, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.4336546063423157, |
|
"learning_rate": 0.0003625, |
|
"loss": 4.5283, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.40454909205436707, |
|
"learning_rate": 0.00036666666666666667, |
|
"loss": 4.5301, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.4893989562988281, |
|
"learning_rate": 0.00037083333333333337, |
|
"loss": 4.5286, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.4546484053134918, |
|
"learning_rate": 0.000375, |
|
"loss": 4.5263, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_accuracy": 0.2903379261149848, |
|
"eval_loss": 4.5242390632629395, |
|
"eval_runtime": 1074.5238, |
|
"eval_samples_per_second": 464.74, |
|
"eval_steps_per_second": 2.421, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.22772909700870514, |
|
"learning_rate": 0.00037916666666666665, |
|
"loss": 4.5251, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.48488083481788635, |
|
"learning_rate": 0.00038333333333333334, |
|
"loss": 4.524, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.21915870904922485, |
|
"learning_rate": 0.00038750000000000004, |
|
"loss": 4.5276, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4013586938381195, |
|
"learning_rate": 0.0003916666666666667, |
|
"loss": 4.5263, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.6083785891532898, |
|
"learning_rate": 0.0003958333333333333, |
|
"loss": 4.5226, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5477403402328491, |
|
"learning_rate": 0.0004, |
|
"loss": 4.5223, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.25517192482948303, |
|
"learning_rate": 0.00040416666666666666, |
|
"loss": 4.5213, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5681092739105225, |
|
"learning_rate": 0.00040833333333333336, |
|
"loss": 4.5213, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.27979689836502075, |
|
"learning_rate": 0.0004125, |
|
"loss": 4.5223, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_accuracy": 0.2904179370099399, |
|
"eval_loss": 4.5171709060668945, |
|
"eval_runtime": 1074.3659, |
|
"eval_samples_per_second": 464.808, |
|
"eval_steps_per_second": 2.421, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.278210312128067, |
|
"learning_rate": 0.0004166666666666667, |
|
"loss": 4.5185, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.31496554613113403, |
|
"learning_rate": 0.00042083333333333333, |
|
"loss": 4.5184, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.28795501589775085, |
|
"learning_rate": 0.000425, |
|
"loss": 4.5202, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.1585451066493988, |
|
"learning_rate": 0.00042916666666666667, |
|
"loss": 4.5177, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.25573596358299255, |
|
"learning_rate": 0.00043333333333333337, |
|
"loss": 4.5157, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.9261253476142883, |
|
"learning_rate": 0.0004375, |
|
"loss": 4.5155, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.20488545298576355, |
|
"learning_rate": 0.00044166666666666665, |
|
"loss": 4.5171, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.19982470571994781, |
|
"learning_rate": 0.00044583333333333335, |
|
"loss": 4.5157, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.2195570170879364, |
|
"learning_rate": 0.00045000000000000004, |
|
"loss": 4.511, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_accuracy": 0.29042484814802466, |
|
"eval_loss": 4.511170387268066, |
|
"eval_runtime": 1074.2406, |
|
"eval_samples_per_second": 464.862, |
|
"eval_steps_per_second": 2.421, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5774135589599609, |
|
"learning_rate": 0.0004541666666666667, |
|
"loss": 4.5159, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.3652968108654022, |
|
"learning_rate": 0.0004583333333333333, |
|
"loss": 4.5152, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.4962700307369232, |
|
"learning_rate": 0.0004625, |
|
"loss": 4.5139, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.3226447105407715, |
|
"learning_rate": 0.00046666666666666666, |
|
"loss": 4.5117, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5751166939735413, |
|
"learning_rate": 0.00047083333333333336, |
|
"loss": 4.5131, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.181748166680336, |
|
"learning_rate": 0.000475, |
|
"loss": 4.5136, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.6175718903541565, |
|
"learning_rate": 0.0004791666666666667, |
|
"loss": 4.5107, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.261405348777771, |
|
"learning_rate": 0.00048333333333333334, |
|
"loss": 4.5124, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.18674370646476746, |
|
"learning_rate": 0.0004875, |
|
"loss": 4.5093, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"eval_accuracy": 0.2903912366894582, |
|
"eval_loss": 4.507014274597168, |
|
"eval_runtime": 1075.0378, |
|
"eval_samples_per_second": 464.518, |
|
"eval_steps_per_second": 2.419, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4493379294872284, |
|
"learning_rate": 0.0004916666666666666, |
|
"loss": 4.5107, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.40603315830230713, |
|
"learning_rate": 0.0004958333333333334, |
|
"loss": 4.5103, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.17475590109825134, |
|
"learning_rate": 0.0005, |
|
"loss": 4.507, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.6670963168144226, |
|
"learning_rate": 0.0005041666666666667, |
|
"loss": 4.5076, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.20023925602436066, |
|
"learning_rate": 0.0005083333333333333, |
|
"loss": 4.5092, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.38801464438438416, |
|
"learning_rate": 0.0005124999999999999, |
|
"loss": 4.5047, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.4637294113636017, |
|
"learning_rate": 0.0005166666666666667, |
|
"loss": 4.5088, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.17603175342082977, |
|
"learning_rate": 0.0005208333333333334, |
|
"loss": 4.5057, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.4553210735321045, |
|
"learning_rate": 0.0005250000000000001, |
|
"loss": 4.505, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_accuracy": 0.29054962247372956, |
|
"eval_loss": 4.502260208129883, |
|
"eval_runtime": 1075.7242, |
|
"eval_samples_per_second": 464.221, |
|
"eval_steps_per_second": 2.418, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.30805012583732605, |
|
"learning_rate": 0.0005291666666666667, |
|
"loss": 4.5032, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.5969117283821106, |
|
"learning_rate": 0.0005333333333333334, |
|
"loss": 4.5036, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.24606676399707794, |
|
"learning_rate": 0.0005375, |
|
"loss": 4.5042, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.2750067710876465, |
|
"learning_rate": 0.0005416666666666666, |
|
"loss": 4.5037, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.4421214163303375, |
|
"learning_rate": 0.0005458333333333333, |
|
"loss": 4.5014, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.2441830188035965, |
|
"learning_rate": 0.00055, |
|
"loss": 4.5005, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.25598272681236267, |
|
"learning_rate": 0.0005541666666666667, |
|
"loss": 4.5007, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.17499062418937683, |
|
"learning_rate": 0.0005583333333333333, |
|
"loss": 4.5031, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.6325914263725281, |
|
"learning_rate": 0.0005625000000000001, |
|
"loss": 4.5003, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"eval_accuracy": 0.2903214025778754, |
|
"eval_loss": 4.499546051025391, |
|
"eval_runtime": 1074.5865, |
|
"eval_samples_per_second": 464.713, |
|
"eval_steps_per_second": 2.42, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.29858532547950745, |
|
"learning_rate": 0.0005666666666666667, |
|
"loss": 4.5014, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.3625228703022003, |
|
"learning_rate": 0.0005708333333333333, |
|
"loss": 4.4991, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.271508127450943, |
|
"learning_rate": 0.000575, |
|
"loss": 4.499, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.7316662073135376, |
|
"learning_rate": 0.0005791666666666667, |
|
"loss": 4.498, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.3072379529476166, |
|
"learning_rate": 0.0005833333333333334, |
|
"loss": 4.5021, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.16316668689250946, |
|
"learning_rate": 0.0005875, |
|
"loss": 4.4963, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.6301301717758179, |
|
"learning_rate": 0.0005916666666666667, |
|
"loss": 4.4978, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.22797346115112305, |
|
"learning_rate": 0.0005958333333333333, |
|
"loss": 4.4964, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.3899094760417938, |
|
"learning_rate": 0.0006, |
|
"loss": 4.4939, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"eval_accuracy": 0.29044920062027546, |
|
"eval_loss": 4.489974021911621, |
|
"eval_runtime": 1075.4503, |
|
"eval_samples_per_second": 464.339, |
|
"eval_steps_per_second": 2.419, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.34080126881599426, |
|
"learning_rate": 0.0006041666666666666, |
|
"loss": 4.493, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.12690007686615, |
|
"learning_rate": 0.0006083333333333333, |
|
"loss": 4.4904, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.30394747853279114, |
|
"learning_rate": 0.0006125000000000001, |
|
"loss": 4.489, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.4542248845100403, |
|
"learning_rate": 0.0006166666666666667, |
|
"loss": 4.4841, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.3733484447002411, |
|
"learning_rate": 0.0006208333333333334, |
|
"loss": 4.4738, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.31465083360671997, |
|
"learning_rate": 0.000625, |
|
"loss": 4.4695, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.32257241010665894, |
|
"learning_rate": 0.0006291666666666667, |
|
"loss": 4.461, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.7750576734542847, |
|
"learning_rate": 0.0006333333333333333, |
|
"loss": 4.4636, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.35094720125198364, |
|
"learning_rate": 0.0006374999999999999, |
|
"loss": 4.4569, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"eval_accuracy": 0.2906712538997569, |
|
"eval_loss": 4.445650100708008, |
|
"eval_runtime": 1075.3046, |
|
"eval_samples_per_second": 464.402, |
|
"eval_steps_per_second": 2.419, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.5662222504615784, |
|
"learning_rate": 0.0006416666666666667, |
|
"loss": 4.4542, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.6326726675033569, |
|
"learning_rate": 0.0006458333333333334, |
|
"loss": 4.4509, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.4773523807525635, |
|
"learning_rate": 0.0006500000000000001, |
|
"loss": 4.4468, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.4514019191265106, |
|
"learning_rate": 0.0006541666666666667, |
|
"loss": 4.442, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.5631856918334961, |
|
"learning_rate": 0.0006583333333333334, |
|
"loss": 4.44, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.36760690808296204, |
|
"learning_rate": 0.0006625, |
|
"loss": 4.4317, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.669217586517334, |
|
"learning_rate": 0.0006666666666666666, |
|
"loss": 4.4365, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.4648076891899109, |
|
"learning_rate": 0.0006708333333333333, |
|
"loss": 4.4277, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.45093855261802673, |
|
"learning_rate": 0.000675, |
|
"loss": 4.4215, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"eval_accuracy": 0.29071258845839876, |
|
"eval_loss": 4.40159797668457, |
|
"eval_runtime": 1073.4339, |
|
"eval_samples_per_second": 465.212, |
|
"eval_steps_per_second": 2.423, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.6909874081611633, |
|
"learning_rate": 0.0006791666666666667, |
|
"loss": 4.4243, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.5092808604240417, |
|
"learning_rate": 0.0006833333333333333, |
|
"loss": 4.4162, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.7031335234642029, |
|
"learning_rate": 0.0006875, |
|
"loss": 4.409, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.778090238571167, |
|
"learning_rate": 0.0006916666666666667, |
|
"loss": 4.4091, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.935316801071167, |
|
"learning_rate": 0.0006958333333333334, |
|
"loss": 4.4059, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.9648371934890747, |
|
"learning_rate": 0.0007, |
|
"loss": 4.4033, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.524691641330719, |
|
"learning_rate": 0.0007041666666666667, |
|
"loss": 4.4012, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.5595187544822693, |
|
"learning_rate": 0.0007083333333333334, |
|
"loss": 4.3954, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.5809574723243713, |
|
"learning_rate": 0.0007125, |
|
"loss": 4.3943, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"eval_accuracy": 0.29116029691107925, |
|
"eval_loss": 4.3684492111206055, |
|
"eval_runtime": 1080.2368, |
|
"eval_samples_per_second": 462.282, |
|
"eval_steps_per_second": 2.408, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.4484635293483734, |
|
"learning_rate": 0.0007166666666666667, |
|
"loss": 4.3881, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.6823798418045044, |
|
"learning_rate": 0.0007208333333333333, |
|
"loss": 4.3829, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.7428690791130066, |
|
"learning_rate": 0.000725, |
|
"loss": 4.3843, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.6587589979171753, |
|
"learning_rate": 0.0007291666666666666, |
|
"loss": 4.3828, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.5860837697982788, |
|
"learning_rate": 0.0007333333333333333, |
|
"loss": 4.3764, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.5413070321083069, |
|
"learning_rate": 0.0007375000000000001, |
|
"loss": 4.3752, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.665489137172699, |
|
"learning_rate": 0.0007416666666666667, |
|
"loss": 4.3728, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.711599588394165, |
|
"learning_rate": 0.0007458333333333334, |
|
"loss": 4.373, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.6164100170135498, |
|
"learning_rate": 0.00075, |
|
"loss": 4.3677, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"eval_accuracy": 0.29116748362162354, |
|
"eval_loss": 4.338656902313232, |
|
"eval_runtime": 1078.2447, |
|
"eval_samples_per_second": 463.136, |
|
"eval_steps_per_second": 2.412, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.8174536228179932, |
|
"learning_rate": 0.0007541666666666667, |
|
"loss": 4.3622, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.4686708152294159, |
|
"learning_rate": 0.0007583333333333333, |
|
"loss": 4.3615, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.726311206817627, |
|
"learning_rate": 0.0007624999999999999, |
|
"loss": 4.3553, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.6094339489936829, |
|
"learning_rate": 0.0007666666666666667, |
|
"loss": 4.3584, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.43092453479766846, |
|
"learning_rate": 0.0007708333333333334, |
|
"loss": 4.3515, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.8314465284347534, |
|
"learning_rate": 0.0007750000000000001, |
|
"loss": 4.3503, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.9535554647445679, |
|
"learning_rate": 0.0007791666666666667, |
|
"loss": 4.3459, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.9487770795822144, |
|
"learning_rate": 0.0007833333333333334, |
|
"loss": 4.3438, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.65323406457901, |
|
"learning_rate": 0.0007875, |
|
"loss": 4.3382, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"eval_accuracy": 0.2915768463978657, |
|
"eval_loss": 4.299588203430176, |
|
"eval_runtime": 1078.9106, |
|
"eval_samples_per_second": 462.85, |
|
"eval_steps_per_second": 2.411, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.8234522342681885, |
|
"learning_rate": 0.0007916666666666666, |
|
"loss": 4.337, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.984524667263031, |
|
"learning_rate": 0.0007958333333333333, |
|
"loss": 4.3342, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 1.087571382522583, |
|
"learning_rate": 0.0008, |
|
"loss": 4.3327, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.8180701732635498, |
|
"learning_rate": 0.0008041666666666667, |
|
"loss": 4.3292, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 1.209524154663086, |
|
"learning_rate": 0.0008083333333333333, |
|
"loss": 4.3258, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 1.2440215349197388, |
|
"learning_rate": 0.0008125000000000001, |
|
"loss": 4.322, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 1.1065999269485474, |
|
"learning_rate": 0.0008166666666666667, |
|
"loss": 4.3207, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.2142093181610107, |
|
"learning_rate": 0.0008208333333333334, |
|
"loss": 4.3183, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.8045121431350708, |
|
"learning_rate": 0.000825, |
|
"loss": 4.3216, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"eval_accuracy": 0.2919771311011085, |
|
"eval_loss": 4.267116069793701, |
|
"eval_runtime": 1079.6166, |
|
"eval_samples_per_second": 462.548, |
|
"eval_steps_per_second": 2.409, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.8329472541809082, |
|
"learning_rate": 0.0008291666666666667, |
|
"loss": 4.3102, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.114058256149292, |
|
"learning_rate": 0.0008333333333333334, |
|
"loss": 4.3138, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.989930272102356, |
|
"learning_rate": 0.0008375, |
|
"loss": 4.3068, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.9290440082550049, |
|
"learning_rate": 0.0008416666666666667, |
|
"loss": 4.3016, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.6841094493865967, |
|
"learning_rate": 0.0008458333333333333, |
|
"loss": 4.3031, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.0473177433013916, |
|
"learning_rate": 0.00085, |
|
"loss": 4.3079, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.1396197080612183, |
|
"learning_rate": 0.0008541666666666666, |
|
"loss": 4.2986, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.2318438291549683, |
|
"learning_rate": 0.0008583333333333333, |
|
"loss": 4.3005, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 1.749923825263977, |
|
"learning_rate": 0.0008625000000000001, |
|
"loss": 4.2879, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"eval_accuracy": 0.2931822363307358, |
|
"eval_loss": 4.231507778167725, |
|
"eval_runtime": 1089.4507, |
|
"eval_samples_per_second": 458.378, |
|
"eval_steps_per_second": 2.047, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.9980252385139465, |
|
"learning_rate": 0.0008666666666666667, |
|
"loss": 4.2821, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.5635493993759155, |
|
"learning_rate": 0.0008708333333333334, |
|
"loss": 4.29, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 1.6463395357131958, |
|
"learning_rate": 0.000875, |
|
"loss": 4.2856, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 1.3602315187454224, |
|
"learning_rate": 0.0008791666666666667, |
|
"loss": 4.2802, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.8902882933616638, |
|
"learning_rate": 0.0008833333333333333, |
|
"loss": 4.2729, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.186219573020935, |
|
"learning_rate": 0.0008874999999999999, |
|
"loss": 4.2691, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.0059683322906494, |
|
"learning_rate": 0.0008916666666666667, |
|
"loss": 4.26, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 1.2011739015579224, |
|
"learning_rate": 0.0008958333333333334, |
|
"loss": 4.2577, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 1.4442743062973022, |
|
"learning_rate": 0.0009000000000000001, |
|
"loss": 4.263, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"eval_accuracy": 0.2934964665435206, |
|
"eval_loss": 4.21316385269165, |
|
"eval_runtime": 1096.5373, |
|
"eval_samples_per_second": 455.415, |
|
"eval_steps_per_second": 2.034, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 1.1617356538772583, |
|
"learning_rate": 0.0009041666666666667, |
|
"loss": 4.2661, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.3990079164505005, |
|
"learning_rate": 0.0009083333333333334, |
|
"loss": 4.2475, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.2959562540054321, |
|
"learning_rate": 0.0009125, |
|
"loss": 4.2415, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.0796222686767578, |
|
"learning_rate": 0.0009166666666666666, |
|
"loss": 4.2337, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 1.6943458318710327, |
|
"learning_rate": 0.0009208333333333333, |
|
"loss": 4.2281, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.7960783243179321, |
|
"learning_rate": 0.000925, |
|
"loss": 4.2241, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.086534023284912, |
|
"learning_rate": 0.0009291666666666667, |
|
"loss": 4.2269, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 1.137702226638794, |
|
"learning_rate": 0.0009333333333333333, |
|
"loss": 4.2158, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 1.1577701568603516, |
|
"learning_rate": 0.0009375, |
|
"loss": 4.2013, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"eval_accuracy": 0.2987269750298371, |
|
"eval_loss": 4.123126029968262, |
|
"eval_runtime": 1097.6256, |
|
"eval_samples_per_second": 454.964, |
|
"eval_steps_per_second": 2.032, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 1.6455570459365845, |
|
"learning_rate": 0.0009416666666666667, |
|
"loss": 4.1815, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 1.6025768518447876, |
|
"learning_rate": 0.0009458333333333334, |
|
"loss": 4.1534, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 1.2223172187805176, |
|
"learning_rate": 0.00095, |
|
"loss": 4.1298, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 1.766542673110962, |
|
"learning_rate": 0.0009541666666666667, |
|
"loss": 4.1187, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.156003952026367, |
|
"learning_rate": 0.0009583333333333334, |
|
"loss": 4.0858, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 1.9074057340621948, |
|
"learning_rate": 0.0009625, |
|
"loss": 4.0801, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 1.6140304803848267, |
|
"learning_rate": 0.0009666666666666667, |
|
"loss": 4.0383, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 1.5922300815582275, |
|
"learning_rate": 0.0009708333333333333, |
|
"loss": 4.0099, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 1.9714833498001099, |
|
"learning_rate": 0.000975, |
|
"loss": 3.9757, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"eval_accuracy": 0.3303083702251303, |
|
"eval_loss": 3.764934539794922, |
|
"eval_runtime": 1104.6754, |
|
"eval_samples_per_second": 452.06, |
|
"eval_steps_per_second": 2.019, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.1198415756225586, |
|
"learning_rate": 0.0009791666666666666, |
|
"loss": 3.9507, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.0731935501098633, |
|
"learning_rate": 0.0009833333333333332, |
|
"loss": 3.9258, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.1984808444976807, |
|
"learning_rate": 0.0009875, |
|
"loss": 3.9003, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.033250331878662, |
|
"learning_rate": 0.0009916666666666667, |
|
"loss": 3.8732, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 1.7183982133865356, |
|
"learning_rate": 0.0009958333333333334, |
|
"loss": 3.8557, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.216938018798828, |
|
"learning_rate": 0.001, |
|
"loss": 3.8376, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.109079599380493, |
|
"learning_rate": 0.000999009900990099, |
|
"loss": 3.8212, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.5402984619140625, |
|
"learning_rate": 0.0009980198019801981, |
|
"loss": 3.8, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.051513433456421, |
|
"learning_rate": 0.000997029702970297, |
|
"loss": 3.7913, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"eval_accuracy": 0.35789052045361985, |
|
"eval_loss": 3.5296359062194824, |
|
"eval_runtime": 1087.9351, |
|
"eval_samples_per_second": 459.016, |
|
"eval_steps_per_second": 2.05, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 1.8306666612625122, |
|
"learning_rate": 0.000996039603960396, |
|
"loss": 3.7567, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 1.9114989042282104, |
|
"learning_rate": 0.000995049504950495, |
|
"loss": 3.7491, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 1.9881885051727295, |
|
"learning_rate": 0.0009940594059405941, |
|
"loss": 3.7297, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.2852580547332764, |
|
"learning_rate": 0.0009930693069306932, |
|
"loss": 3.7073, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.090174913406372, |
|
"learning_rate": 0.000992079207920792, |
|
"loss": 3.6902, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.5586419105529785, |
|
"learning_rate": 0.000991089108910891, |
|
"loss": 3.6792, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.9420301914215088, |
|
"learning_rate": 0.0009900990099009901, |
|
"loss": 3.6728, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.319821834564209, |
|
"learning_rate": 0.0009891089108910892, |
|
"loss": 3.6627, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.134413480758667, |
|
"learning_rate": 0.0009881188118811882, |
|
"loss": 3.6435, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"eval_accuracy": 0.3799301143797497, |
|
"eval_loss": 3.3790884017944336, |
|
"eval_runtime": 1089.5448, |
|
"eval_samples_per_second": 458.338, |
|
"eval_steps_per_second": 2.047, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.8554224967956543, |
|
"learning_rate": 0.000987128712871287, |
|
"loss": 3.6265, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.16987681388855, |
|
"learning_rate": 0.000986138613861386, |
|
"loss": 3.6098, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 1.9863182306289673, |
|
"learning_rate": 0.0009851485148514852, |
|
"loss": 3.5982, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.0247480869293213, |
|
"learning_rate": 0.0009841584158415842, |
|
"loss": 3.5911, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.2719273567199707, |
|
"learning_rate": 0.0009831683168316833, |
|
"loss": 3.5804, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.0588369369506836, |
|
"learning_rate": 0.000982178217821782, |
|
"loss": 3.5654, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 1.9666892290115356, |
|
"learning_rate": 0.0009811881188118811, |
|
"loss": 3.558, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 1.752681016921997, |
|
"learning_rate": 0.0009801980198019802, |
|
"loss": 3.5389, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.821775197982788, |
|
"learning_rate": 0.0009792079207920793, |
|
"loss": 3.5327, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_accuracy": 0.39317171253107736, |
|
"eval_loss": 3.2741596698760986, |
|
"eval_runtime": 1086.2288, |
|
"eval_samples_per_second": 459.737, |
|
"eval_steps_per_second": 2.053, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.4307518005371094, |
|
"learning_rate": 0.0009782178217821783, |
|
"loss": 3.5311, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.8416870832443237, |
|
"learning_rate": 0.0009772277227722771, |
|
"loss": 3.5199, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.7294279336929321, |
|
"learning_rate": 0.0009762376237623762, |
|
"loss": 3.5067, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.0376105308532715, |
|
"learning_rate": 0.0009752475247524752, |
|
"loss": 3.4957, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 1.845569133758545, |
|
"learning_rate": 0.0009742574257425743, |
|
"loss": 3.4778, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.1370015144348145, |
|
"learning_rate": 0.0009732673267326732, |
|
"loss": 3.4766, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.0046229362487793, |
|
"learning_rate": 0.0009722772277227723, |
|
"loss": 3.4609, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.7367238998413086, |
|
"learning_rate": 0.0009712871287128712, |
|
"loss": 3.4574, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.245299816131592, |
|
"learning_rate": 0.0009702970297029703, |
|
"loss": 3.4402, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"eval_accuracy": 0.40886959953318786, |
|
"eval_loss": 3.1605701446533203, |
|
"eval_runtime": 1086.3963, |
|
"eval_samples_per_second": 459.666, |
|
"eval_steps_per_second": 2.053, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 1.6792678833007812, |
|
"learning_rate": 0.0009693069306930693, |
|
"loss": 3.4155, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.1290223598480225, |
|
"learning_rate": 0.0009683168316831683, |
|
"loss": 3.3953, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 1.9963873624801636, |
|
"learning_rate": 0.0009673267326732673, |
|
"loss": 3.3722, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.212454080581665, |
|
"learning_rate": 0.0009663366336633663, |
|
"loss": 3.3532, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.145552396774292, |
|
"learning_rate": 0.0009653465346534653, |
|
"loss": 3.336, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.423874616622925, |
|
"learning_rate": 0.0009643564356435644, |
|
"loss": 3.3111, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 2.0116701126098633, |
|
"learning_rate": 0.0009633663366336633, |
|
"loss": 3.305, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 2.243619203567505, |
|
"learning_rate": 0.0009623762376237624, |
|
"loss": 3.2854, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 1.5583114624023438, |
|
"learning_rate": 0.0009613861386138613, |
|
"loss": 3.2635, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"eval_accuracy": 0.43169227745021366, |
|
"eval_loss": 2.9848363399505615, |
|
"eval_runtime": 1087.8763, |
|
"eval_samples_per_second": 459.041, |
|
"eval_steps_per_second": 2.05, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.156170606613159, |
|
"learning_rate": 0.0009603960396039604, |
|
"loss": 3.2498, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 1.917297601699829, |
|
"learning_rate": 0.0009594059405940594, |
|
"loss": 3.2343, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 1.7647627592086792, |
|
"learning_rate": 0.0009584158415841584, |
|
"loss": 3.2206, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 1.7406831979751587, |
|
"learning_rate": 0.0009574257425742574, |
|
"loss": 3.2023, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 1.721940040588379, |
|
"learning_rate": 0.0009564356435643564, |
|
"loss": 3.1896, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.5204572677612305, |
|
"learning_rate": 0.0009554455445544554, |
|
"loss": 3.1769, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.20760440826416, |
|
"learning_rate": 0.0009544554455445545, |
|
"loss": 3.1706, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 1.6796480417251587, |
|
"learning_rate": 0.0009534653465346534, |
|
"loss": 3.1552, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.044858455657959, |
|
"learning_rate": 0.0009524752475247525, |
|
"loss": 3.1385, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"eval_accuracy": 0.4465053493029932, |
|
"eval_loss": 2.872570037841797, |
|
"eval_runtime": 1089.0874, |
|
"eval_samples_per_second": 458.531, |
|
"eval_steps_per_second": 2.048, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 1.3739113807678223, |
|
"learning_rate": 0.0009514851485148514, |
|
"loss": 3.1217, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 1.631298303604126, |
|
"learning_rate": 0.0009504950495049505, |
|
"loss": 3.1102, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.8213354349136353, |
|
"learning_rate": 0.0009495049504950495, |
|
"loss": 3.092, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 1.910646915435791, |
|
"learning_rate": 0.0009485148514851485, |
|
"loss": 3.0768, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 1.6591072082519531, |
|
"learning_rate": 0.0009475247524752475, |
|
"loss": 3.0721, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 1.921587347984314, |
|
"learning_rate": 0.0009465346534653465, |
|
"loss": 3.0493, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 1.576114296913147, |
|
"learning_rate": 0.0009455445544554455, |
|
"loss": 3.0392, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.41093909740448, |
|
"learning_rate": 0.0009445544554455446, |
|
"loss": 3.0204, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.41178297996521, |
|
"learning_rate": 0.0009435643564356435, |
|
"loss": 3.0046, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"eval_accuracy": 0.46085574907280247, |
|
"eval_loss": 2.7754335403442383, |
|
"eval_runtime": 1088.9809, |
|
"eval_samples_per_second": 458.576, |
|
"eval_steps_per_second": 2.048, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 1.5229026079177856, |
|
"learning_rate": 0.0009425742574257426, |
|
"loss": 2.989, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 1.3193325996398926, |
|
"learning_rate": 0.0009415841584158415, |
|
"loss": 2.9764, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 1.1938610076904297, |
|
"learning_rate": 0.0009405940594059406, |
|
"loss": 2.9636, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 1.1402697563171387, |
|
"learning_rate": 0.0009396039603960396, |
|
"loss": 2.9517, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.4980099201202393, |
|
"learning_rate": 0.0009386138613861386, |
|
"loss": 2.9445, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 1.4591041803359985, |
|
"learning_rate": 0.0009376237623762376, |
|
"loss": 2.9317, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 1.4302833080291748, |
|
"learning_rate": 0.0009366336633663367, |
|
"loss": 2.9167, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 1.2571301460266113, |
|
"learning_rate": 0.0009356435643564357, |
|
"loss": 2.9049, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.2039096355438232, |
|
"learning_rate": 0.0009346534653465348, |
|
"loss": 2.8885, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"eval_accuracy": 0.4740639726753192, |
|
"eval_loss": 2.6853535175323486, |
|
"eval_runtime": 1088.5789, |
|
"eval_samples_per_second": 458.745, |
|
"eval_steps_per_second": 2.049, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.1458439826965332, |
|
"learning_rate": 0.0009336633663366337, |
|
"loss": 2.8844, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.1883801221847534, |
|
"learning_rate": 0.0009326732673267328, |
|
"loss": 2.8777, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.0597162246704102, |
|
"learning_rate": 0.0009316831683168317, |
|
"loss": 2.8647, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.2362898588180542, |
|
"learning_rate": 0.0009306930693069308, |
|
"loss": 2.8565, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 1.588973879814148, |
|
"learning_rate": 0.0009297029702970298, |
|
"loss": 2.8443, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 1.7287636995315552, |
|
"learning_rate": 0.0009287128712871288, |
|
"loss": 2.8401, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 1.2376179695129395, |
|
"learning_rate": 0.0009277227722772278, |
|
"loss": 2.8338, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 1.1424429416656494, |
|
"learning_rate": 0.0009267326732673268, |
|
"loss": 2.8158, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.0561778545379639, |
|
"learning_rate": 0.0009257425742574258, |
|
"loss": 2.8086, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"eval_accuracy": 0.4839770905517238, |
|
"eval_loss": 2.6107919216156006, |
|
"eval_runtime": 1086.2364, |
|
"eval_samples_per_second": 459.734, |
|
"eval_steps_per_second": 2.053, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.1775144338607788, |
|
"learning_rate": 0.0009247524752475249, |
|
"loss": 2.8002, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 1.1752933263778687, |
|
"learning_rate": 0.0009237623762376238, |
|
"loss": 2.7913, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 1.35330331325531, |
|
"learning_rate": 0.0009227722772277229, |
|
"loss": 2.7821, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 1.163878321647644, |
|
"learning_rate": 0.0009217821782178218, |
|
"loss": 2.7755, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 1.0859084129333496, |
|
"learning_rate": 0.0009207920792079209, |
|
"loss": 2.7651, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 1.1351704597473145, |
|
"learning_rate": 0.0009198019801980199, |
|
"loss": 2.7583, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 1.0613981485366821, |
|
"learning_rate": 0.0009188118811881188, |
|
"loss": 2.7482, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 1.1925811767578125, |
|
"learning_rate": 0.0009178217821782179, |
|
"loss": 2.7411, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 1.00603187084198, |
|
"learning_rate": 0.0009168316831683168, |
|
"loss": 2.7357, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"eval_accuracy": 0.4945267646340819, |
|
"eval_loss": 2.5409770011901855, |
|
"eval_runtime": 1086.0357, |
|
"eval_samples_per_second": 459.819, |
|
"eval_steps_per_second": 2.053, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 1.0638866424560547, |
|
"learning_rate": 0.0009158415841584159, |
|
"loss": 2.7271, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 1.0579949617385864, |
|
"learning_rate": 0.000914851485148515, |
|
"loss": 2.7192, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.8697578310966492, |
|
"learning_rate": 0.0009138613861386139, |
|
"loss": 2.7114, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 1.1074854135513306, |
|
"learning_rate": 0.0009128712871287129, |
|
"loss": 2.7044, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.2240349054336548, |
|
"learning_rate": 0.0009118811881188119, |
|
"loss": 2.6981, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.0825715065002441, |
|
"learning_rate": 0.0009108910891089109, |
|
"loss": 2.688, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.9786016941070557, |
|
"learning_rate": 0.00090990099009901, |
|
"loss": 2.681, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.7355318665504456, |
|
"learning_rate": 0.0009089108910891089, |
|
"loss": 2.6771, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 1.0296987295150757, |
|
"learning_rate": 0.000907920792079208, |
|
"loss": 2.6714, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"eval_accuracy": 0.5031676098849697, |
|
"eval_loss": 2.4873406887054443, |
|
"eval_runtime": 1086.6349, |
|
"eval_samples_per_second": 459.566, |
|
"eval_steps_per_second": 2.052, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 1.0019482374191284, |
|
"learning_rate": 0.0009069306930693069, |
|
"loss": 2.659, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.95697021484375, |
|
"learning_rate": 0.000905940594059406, |
|
"loss": 2.6534, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.9206619262695312, |
|
"learning_rate": 0.000904950495049505, |
|
"loss": 2.6499, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.9173060059547424, |
|
"learning_rate": 0.000903960396039604, |
|
"loss": 2.6436, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.946976900100708, |
|
"learning_rate": 0.000902970297029703, |
|
"loss": 2.6386, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 1.1060245037078857, |
|
"learning_rate": 0.000901980198019802, |
|
"loss": 2.6295, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.9128373861312866, |
|
"learning_rate": 0.000900990099009901, |
|
"loss": 2.6207, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.7775394916534424, |
|
"learning_rate": 0.0009000000000000001, |
|
"loss": 2.6152, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 1.0009465217590332, |
|
"learning_rate": 0.000899009900990099, |
|
"loss": 2.6114, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"eval_accuracy": 0.5107387401188807, |
|
"eval_loss": 2.430750608444214, |
|
"eval_runtime": 1085.7185, |
|
"eval_samples_per_second": 459.953, |
|
"eval_steps_per_second": 2.054, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.9122986197471619, |
|
"learning_rate": 0.0008980198019801981, |
|
"loss": 2.6046, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.9263846278190613, |
|
"learning_rate": 0.000897029702970297, |
|
"loss": 2.6006, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.9440599083900452, |
|
"learning_rate": 0.0008960396039603961, |
|
"loss": 2.5954, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 1.0791646242141724, |
|
"learning_rate": 0.0008950495049504951, |
|
"loss": 2.5875, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.8801349401473999, |
|
"learning_rate": 0.0008940594059405941, |
|
"loss": 2.5805, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.8976075053215027, |
|
"learning_rate": 0.0008930693069306931, |
|
"loss": 2.5856, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.9874941110610962, |
|
"learning_rate": 0.0008920792079207921, |
|
"loss": 2.5741, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.8185487985610962, |
|
"learning_rate": 0.0008910891089108911, |
|
"loss": 2.5625, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.0372703075408936, |
|
"learning_rate": 0.0008900990099009902, |
|
"loss": 2.5622, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"eval_accuracy": 0.5173882190435195, |
|
"eval_loss": 2.3903918266296387, |
|
"eval_runtime": 1085.7537, |
|
"eval_samples_per_second": 459.939, |
|
"eval_steps_per_second": 2.054, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.7737933397293091, |
|
"learning_rate": 0.0008891089108910891, |
|
"loss": 2.5551, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.8418464660644531, |
|
"learning_rate": 0.0008881188118811882, |
|
"loss": 2.5505, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.9638449549674988, |
|
"learning_rate": 0.0008871287128712871, |
|
"loss": 2.5506, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.8606787919998169, |
|
"learning_rate": 0.0008861386138613862, |
|
"loss": 2.5446, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.9567099213600159, |
|
"learning_rate": 0.0008851485148514852, |
|
"loss": 2.5316, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.9098414182662964, |
|
"learning_rate": 0.0008841584158415842, |
|
"loss": 2.5299, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.9305897951126099, |
|
"learning_rate": 0.0008831683168316832, |
|
"loss": 2.53, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.6542484164237976, |
|
"learning_rate": 0.0008821782178217822, |
|
"loss": 2.5182, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.8789640069007874, |
|
"learning_rate": 0.0008811881188118812, |
|
"loss": 2.5145, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"eval_accuracy": 0.5233957340804408, |
|
"eval_loss": 2.3455302715301514, |
|
"eval_runtime": 1084.7897, |
|
"eval_samples_per_second": 460.347, |
|
"eval_steps_per_second": 2.056, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.7239986062049866, |
|
"learning_rate": 0.0008801980198019803, |
|
"loss": 2.5092, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.7931397557258606, |
|
"learning_rate": 0.0008792079207920792, |
|
"loss": 2.4992, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.9696986675262451, |
|
"learning_rate": 0.0008782178217821783, |
|
"loss": 2.5035, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.7151007056236267, |
|
"learning_rate": 0.0008772277227722772, |
|
"loss": 2.4927, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 1.054768443107605, |
|
"learning_rate": 0.0008762376237623763, |
|
"loss": 2.4831, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.7492119669914246, |
|
"learning_rate": 0.0008752475247524753, |
|
"loss": 2.4872, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.6677684187889099, |
|
"learning_rate": 0.0008742574257425743, |
|
"loss": 2.4797, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.8682121634483337, |
|
"learning_rate": 0.0008732673267326733, |
|
"loss": 2.4747, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.9361952543258667, |
|
"learning_rate": 0.0008722772277227722, |
|
"loss": 2.4741, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"eval_accuracy": 0.5298733491746213, |
|
"eval_loss": 2.301394462585449, |
|
"eval_runtime": 1087.2464, |
|
"eval_samples_per_second": 459.307, |
|
"eval_steps_per_second": 2.051, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.7424592971801758, |
|
"learning_rate": 0.0008712871287128713, |
|
"loss": 2.4641, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.7794514894485474, |
|
"learning_rate": 0.0008702970297029704, |
|
"loss": 2.4558, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.8285810351371765, |
|
"learning_rate": 0.0008693069306930693, |
|
"loss": 2.4541, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.8555008769035339, |
|
"learning_rate": 0.0008683168316831684, |
|
"loss": 2.456, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.7897722721099854, |
|
"learning_rate": 0.0008673267326732673, |
|
"loss": 2.4447, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.7043498754501343, |
|
"learning_rate": 0.0008663366336633663, |
|
"loss": 2.44, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.7770318984985352, |
|
"learning_rate": 0.0008653465346534654, |
|
"loss": 2.439, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.8721603155136108, |
|
"learning_rate": 0.0008643564356435643, |
|
"loss": 2.4363, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.653965413570404, |
|
"learning_rate": 0.0008633663366336634, |
|
"loss": 2.4298, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_accuracy": 0.5352807916398568, |
|
"eval_loss": 2.2673096656799316, |
|
"eval_runtime": 1086.4494, |
|
"eval_samples_per_second": 459.644, |
|
"eval_steps_per_second": 2.053, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.7826604247093201, |
|
"learning_rate": 0.0008623762376237623, |
|
"loss": 2.4204, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.6523510813713074, |
|
"learning_rate": 0.0008613861386138614, |
|
"loss": 2.4194, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.9408191442489624, |
|
"learning_rate": 0.0008603960396039604, |
|
"loss": 2.4135, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.8393464684486389, |
|
"learning_rate": 0.0008594059405940594, |
|
"loss": 2.4179, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.7277712821960449, |
|
"learning_rate": 0.0008584158415841584, |
|
"loss": 2.4087, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.6864319443702698, |
|
"learning_rate": 0.0008574257425742574, |
|
"loss": 2.405, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.6246985197067261, |
|
"learning_rate": 0.0008564356435643564, |
|
"loss": 2.3962, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.7276294231414795, |
|
"learning_rate": 0.0008554455445544555, |
|
"loss": 2.4043, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.7767272591590881, |
|
"learning_rate": 0.0008544554455445544, |
|
"loss": 2.3947, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"eval_accuracy": 0.5401726205914658, |
|
"eval_loss": 2.2340025901794434, |
|
"eval_runtime": 1089.0379, |
|
"eval_samples_per_second": 458.552, |
|
"eval_steps_per_second": 2.048, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.717089831829071, |
|
"learning_rate": 0.0008534653465346535, |
|
"loss": 2.3934, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.7381496429443359, |
|
"learning_rate": 0.0008524752475247524, |
|
"loss": 2.384, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.8456007838249207, |
|
"learning_rate": 0.0008514851485148515, |
|
"loss": 2.3842, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.6353156566619873, |
|
"learning_rate": 0.0008504950495049505, |
|
"loss": 2.3774, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.6743925213813782, |
|
"learning_rate": 0.0008495049504950495, |
|
"loss": 2.3775, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.6839917898178101, |
|
"learning_rate": 0.0008485148514851485, |
|
"loss": 2.3737, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.6635532379150391, |
|
"learning_rate": 0.0008475247524752475, |
|
"loss": 2.3689, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.7584016919136047, |
|
"learning_rate": 0.0008465346534653465, |
|
"loss": 2.3623, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.9728506803512573, |
|
"learning_rate": 0.0008455445544554456, |
|
"loss": 2.3653, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"eval_accuracy": 0.5444187372461048, |
|
"eval_loss": 2.210555076599121, |
|
"eval_runtime": 1087.1075, |
|
"eval_samples_per_second": 459.366, |
|
"eval_steps_per_second": 2.051, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.7243532538414001, |
|
"learning_rate": 0.0008445544554455445, |
|
"loss": 2.3605, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.5774228572845459, |
|
"learning_rate": 0.0008435643564356436, |
|
"loss": 2.3547, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.9098168611526489, |
|
"learning_rate": 0.0008425742574257425, |
|
"loss": 2.3507, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.8770572543144226, |
|
"learning_rate": 0.0008415841584158416, |
|
"loss": 2.3535, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.5555605292320251, |
|
"learning_rate": 0.0008405940594059406, |
|
"loss": 2.3463, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.8333105444908142, |
|
"learning_rate": 0.0008396039603960396, |
|
"loss": 2.3417, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.724617600440979, |
|
"learning_rate": 0.0008386138613861386, |
|
"loss": 2.3382, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.6441348791122437, |
|
"learning_rate": 0.0008376237623762376, |
|
"loss": 2.3322, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.7889347672462463, |
|
"learning_rate": 0.0008366336633663366, |
|
"loss": 2.332, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"eval_accuracy": 0.5490578933317315, |
|
"eval_loss": 2.1787993907928467, |
|
"eval_runtime": 1089.6908, |
|
"eval_samples_per_second": 458.277, |
|
"eval_steps_per_second": 2.046, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.7708555459976196, |
|
"learning_rate": 0.0008356435643564357, |
|
"loss": 2.3326, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.5825323462486267, |
|
"learning_rate": 0.0008346534653465346, |
|
"loss": 2.3269, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.6252484321594238, |
|
"learning_rate": 0.0008336633663366337, |
|
"loss": 2.3226, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.8860800266265869, |
|
"learning_rate": 0.0008326732673267326, |
|
"loss": 2.3213, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.6472296714782715, |
|
"learning_rate": 0.0008316831683168317, |
|
"loss": 2.3159, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.5913267731666565, |
|
"learning_rate": 0.0008306930693069307, |
|
"loss": 2.31, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.9591660499572754, |
|
"learning_rate": 0.0008297029702970297, |
|
"loss": 2.3105, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.6770280599594116, |
|
"learning_rate": 0.0008287128712871287, |
|
"loss": 2.3043, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.6088559031486511, |
|
"learning_rate": 0.0008277227722772277, |
|
"loss": 2.3006, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"eval_accuracy": 0.5543035745580942, |
|
"eval_loss": 2.146070718765259, |
|
"eval_runtime": 1088.9664, |
|
"eval_samples_per_second": 458.582, |
|
"eval_steps_per_second": 2.048, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.8546580076217651, |
|
"learning_rate": 0.0008267326732673267, |
|
"loss": 2.3008, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.5891646146774292, |
|
"learning_rate": 0.0008257425742574258, |
|
"loss": 2.2996, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.7681787610054016, |
|
"learning_rate": 0.0008247524752475247, |
|
"loss": 2.291, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.6504939198493958, |
|
"learning_rate": 0.0008237623762376238, |
|
"loss": 2.2908, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.7509806752204895, |
|
"learning_rate": 0.0008227722772277227, |
|
"loss": 2.2894, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.9019284844398499, |
|
"learning_rate": 0.0008217821782178218, |
|
"loss": 2.2851, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.5588305592536926, |
|
"learning_rate": 0.0008207920792079208, |
|
"loss": 2.284, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.659537672996521, |
|
"learning_rate": 0.0008198019801980197, |
|
"loss": 2.2758, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.6673635840415955, |
|
"learning_rate": 0.0008188118811881188, |
|
"loss": 2.2731, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"eval_accuracy": 0.5577760802425583, |
|
"eval_loss": 2.1262881755828857, |
|
"eval_runtime": 1086.5663, |
|
"eval_samples_per_second": 459.595, |
|
"eval_steps_per_second": 2.052, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.7020514607429504, |
|
"learning_rate": 0.0008178217821782177, |
|
"loss": 2.2744, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.6836090087890625, |
|
"learning_rate": 0.0008168316831683168, |
|
"loss": 2.2726, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.8297485709190369, |
|
"learning_rate": 0.0008158415841584159, |
|
"loss": 2.2696, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.6747464537620544, |
|
"learning_rate": 0.0008148514851485148, |
|
"loss": 2.2667, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.6610013246536255, |
|
"learning_rate": 0.0008138613861386138, |
|
"loss": 2.261, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.9729331731796265, |
|
"learning_rate": 0.0008128712871287128, |
|
"loss": 2.2634, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.580893874168396, |
|
"learning_rate": 0.000811881188118812, |
|
"loss": 2.2643, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.5239897966384888, |
|
"learning_rate": 0.000810891089108911, |
|
"loss": 2.2533, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.9247382283210754, |
|
"learning_rate": 0.00080990099009901, |
|
"loss": 2.2544, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"eval_accuracy": 0.5607038793304535, |
|
"eval_loss": 2.1080663204193115, |
|
"eval_runtime": 1087.2566, |
|
"eval_samples_per_second": 459.303, |
|
"eval_steps_per_second": 2.051, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.7254253029823303, |
|
"learning_rate": 0.000808910891089109, |
|
"loss": 2.2521, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.47916772961616516, |
|
"learning_rate": 0.0008079207920792079, |
|
"loss": 2.2471, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.8148419260978699, |
|
"learning_rate": 0.000806930693069307, |
|
"loss": 2.2479, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.6527644991874695, |
|
"learning_rate": 0.000805940594059406, |
|
"loss": 2.2446, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.7129182815551758, |
|
"learning_rate": 0.000804950495049505, |
|
"loss": 2.2382, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.6954285502433777, |
|
"learning_rate": 0.000803960396039604, |
|
"loss": 2.2399, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.6172522902488708, |
|
"learning_rate": 0.000802970297029703, |
|
"loss": 2.2395, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 0.8309088349342346, |
|
"learning_rate": 0.000801980198019802, |
|
"loss": 2.2379, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 0.6792633533477783, |
|
"learning_rate": 0.0008009900990099011, |
|
"loss": 2.2364, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"eval_accuracy": 0.5646760378560493, |
|
"eval_loss": 2.0840134620666504, |
|
"eval_runtime": 1086.4023, |
|
"eval_samples_per_second": 459.664, |
|
"eval_steps_per_second": 2.053, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.6008450388908386, |
|
"learning_rate": 0.0008, |
|
"loss": 2.2289, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.5826246738433838, |
|
"learning_rate": 0.0007990099009900991, |
|
"loss": 2.2259, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.6360362768173218, |
|
"learning_rate": 0.000798019801980198, |
|
"loss": 2.2262, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.7450495362281799, |
|
"learning_rate": 0.0007970297029702971, |
|
"loss": 2.2241, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.5571395754814148, |
|
"learning_rate": 0.0007960396039603961, |
|
"loss": 2.2245, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.622724175453186, |
|
"learning_rate": 0.0007950495049504951, |
|
"loss": 2.2183, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.621284544467926, |
|
"learning_rate": 0.0007940594059405941, |
|
"loss": 2.2153, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 0.9469096660614014, |
|
"learning_rate": 0.0007930693069306931, |
|
"loss": 2.2156, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.6318257451057434, |
|
"learning_rate": 0.0007920792079207921, |
|
"loss": 2.2143, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"eval_accuracy": 0.5673093799211185, |
|
"eval_loss": 2.0671584606170654, |
|
"eval_runtime": 1088.5283, |
|
"eval_samples_per_second": 458.766, |
|
"eval_steps_per_second": 2.049, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.7229343056678772, |
|
"learning_rate": 0.0007910891089108912, |
|
"loss": 2.2111, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.8711042404174805, |
|
"learning_rate": 0.0007900990099009901, |
|
"loss": 2.2144, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.5540309548377991, |
|
"learning_rate": 0.0007891089108910892, |
|
"loss": 2.2063, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.6157627105712891, |
|
"learning_rate": 0.0007881188118811881, |
|
"loss": 2.2026, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 1.0260213613510132, |
|
"learning_rate": 0.0007871287128712872, |
|
"loss": 2.2107, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.5229135155677795, |
|
"learning_rate": 0.0007861386138613862, |
|
"loss": 2.2031, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.460483580827713, |
|
"learning_rate": 0.0007851485148514852, |
|
"loss": 2.1956, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.9283266067504883, |
|
"learning_rate": 0.0007841584158415842, |
|
"loss": 2.1976, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.5621626973152161, |
|
"learning_rate": 0.0007831683168316832, |
|
"loss": 2.1972, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"eval_accuracy": 0.5702843964025183, |
|
"eval_loss": 2.049508810043335, |
|
"eval_runtime": 1086.5875, |
|
"eval_samples_per_second": 459.586, |
|
"eval_steps_per_second": 2.052, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.6998817324638367, |
|
"learning_rate": 0.0007821782178217822, |
|
"loss": 2.1905, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.6161481738090515, |
|
"learning_rate": 0.0007811881188118813, |
|
"loss": 2.1916, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.7690967321395874, |
|
"learning_rate": 0.0007801980198019802, |
|
"loss": 2.1904, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.6058160066604614, |
|
"learning_rate": 0.0007792079207920793, |
|
"loss": 2.1894, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.5859819054603577, |
|
"learning_rate": 0.0007782178217821782, |
|
"loss": 2.1823, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.7922290563583374, |
|
"learning_rate": 0.0007772277227722773, |
|
"loss": 2.1816, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.617785632610321, |
|
"learning_rate": 0.0007762376237623763, |
|
"loss": 2.1855, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.6082860827445984, |
|
"learning_rate": 0.0007752475247524753, |
|
"loss": 2.1811, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.44060420989990234, |
|
"learning_rate": 0.0007742574257425743, |
|
"loss": 2.1738, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"eval_accuracy": 0.5732795510180845, |
|
"eval_loss": 2.031883955001831, |
|
"eval_runtime": 1085.1554, |
|
"eval_samples_per_second": 460.192, |
|
"eval_steps_per_second": 2.055, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.6842211484909058, |
|
"learning_rate": 0.0007732673267326733, |
|
"loss": 2.1746, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.7363536357879639, |
|
"learning_rate": 0.0007722772277227723, |
|
"loss": 2.1756, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.657122015953064, |
|
"learning_rate": 0.0007712871287128714, |
|
"loss": 2.1767, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.525112509727478, |
|
"learning_rate": 0.0007702970297029703, |
|
"loss": 2.1705, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.594642162322998, |
|
"learning_rate": 0.0007693069306930694, |
|
"loss": 2.167, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.7353718280792236, |
|
"learning_rate": 0.0007683168316831683, |
|
"loss": 2.1678, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.7462971806526184, |
|
"learning_rate": 0.0007673267326732674, |
|
"loss": 2.1665, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 0.543685257434845, |
|
"learning_rate": 0.0007663366336633664, |
|
"loss": 2.1624, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.53340744972229, |
|
"learning_rate": 0.0007653465346534654, |
|
"loss": 2.1587, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"eval_accuracy": 0.5758243970327254, |
|
"eval_loss": 2.01580810546875, |
|
"eval_runtime": 1087.4822, |
|
"eval_samples_per_second": 459.208, |
|
"eval_steps_per_second": 2.051, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.819010853767395, |
|
"learning_rate": 0.0007643564356435644, |
|
"loss": 2.1623, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.6191548109054565, |
|
"learning_rate": 0.0007633663366336634, |
|
"loss": 2.1611, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.5724292993545532, |
|
"learning_rate": 0.0007623762376237624, |
|
"loss": 2.1583, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.6278745532035828, |
|
"learning_rate": 0.0007613861386138615, |
|
"loss": 2.1573, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.6978874802589417, |
|
"learning_rate": 0.0007603960396039604, |
|
"loss": 2.1523, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.9318163990974426, |
|
"learning_rate": 0.0007594059405940595, |
|
"loss": 2.1524, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 0.5397381782531738, |
|
"learning_rate": 0.0007584158415841584, |
|
"loss": 2.1505, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 0.6277997493743896, |
|
"learning_rate": 0.0007574257425742574, |
|
"loss": 2.15, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.6128600239753723, |
|
"learning_rate": 0.0007564356435643565, |
|
"loss": 2.1466, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"eval_accuracy": 0.5778634145294884, |
|
"eval_loss": 2.0040018558502197, |
|
"eval_runtime": 1086.3606, |
|
"eval_samples_per_second": 459.682, |
|
"eval_steps_per_second": 2.053, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.602790892124176, |
|
"learning_rate": 0.0007554455445544554, |
|
"loss": 2.1483, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.5856905579566956, |
|
"learning_rate": 0.0007544554455445545, |
|
"loss": 2.1471, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.6018987894058228, |
|
"learning_rate": 0.0007534653465346534, |
|
"loss": 2.141, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.6041855216026306, |
|
"learning_rate": 0.0007524752475247525, |
|
"loss": 2.1385, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.580766499042511, |
|
"learning_rate": 0.0007514851485148515, |
|
"loss": 2.1375, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.5637401342391968, |
|
"learning_rate": 0.0007504950495049505, |
|
"loss": 2.1374, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.6017095446586609, |
|
"learning_rate": 0.0007495049504950495, |
|
"loss": 2.1372, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.766730010509491, |
|
"learning_rate": 0.0007485148514851485, |
|
"loss": 2.1336, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.5679196119308472, |
|
"learning_rate": 0.0007475247524752475, |
|
"loss": 2.1339, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"eval_accuracy": 0.5801526152356328, |
|
"eval_loss": 1.9901340007781982, |
|
"eval_runtime": 1085.9898, |
|
"eval_samples_per_second": 459.839, |
|
"eval_steps_per_second": 2.053, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.5875471234321594, |
|
"learning_rate": 0.0007465346534653466, |
|
"loss": 2.1327, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.885311484336853, |
|
"learning_rate": 0.0007455445544554455, |
|
"loss": 2.1319, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.5480872392654419, |
|
"learning_rate": 0.0007445544554455446, |
|
"loss": 2.1283, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.6052006483078003, |
|
"learning_rate": 0.0007435643564356435, |
|
"loss": 2.1258, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.4672467112541199, |
|
"learning_rate": 0.0007425742574257426, |
|
"loss": 2.1246, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.7137532234191895, |
|
"learning_rate": 0.0007415841584158416, |
|
"loss": 2.1225, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.726308286190033, |
|
"learning_rate": 0.0007405940594059406, |
|
"loss": 2.1247, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.4779931902885437, |
|
"learning_rate": 0.0007396039603960396, |
|
"loss": 2.1215, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.5192296504974365, |
|
"learning_rate": 0.0007386138613861386, |
|
"loss": 2.1151, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"eval_accuracy": 0.5818349975818327, |
|
"eval_loss": 1.981979489326477, |
|
"eval_runtime": 1085.0979, |
|
"eval_samples_per_second": 460.217, |
|
"eval_steps_per_second": 2.055, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.824546754360199, |
|
"learning_rate": 0.0007376237623762376, |
|
"loss": 2.1176, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.5459938049316406, |
|
"learning_rate": 0.0007366336633663367, |
|
"loss": 2.1209, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.657993733882904, |
|
"learning_rate": 0.0007356435643564356, |
|
"loss": 2.1174, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.6625123023986816, |
|
"learning_rate": 0.0007346534653465347, |
|
"loss": 2.114, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.6521473526954651, |
|
"learning_rate": 0.0007336633663366336, |
|
"loss": 2.1126, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.540843665599823, |
|
"learning_rate": 0.0007326732673267327, |
|
"loss": 2.1139, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.5456762313842773, |
|
"learning_rate": 0.0007316831683168317, |
|
"loss": 2.1096, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.6277236938476562, |
|
"learning_rate": 0.0007306930693069307, |
|
"loss": 2.1085, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.5047609210014343, |
|
"learning_rate": 0.0007297029702970297, |
|
"loss": 2.1048, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"eval_accuracy": 0.5844656610858342, |
|
"eval_loss": 1.9673104286193848, |
|
"eval_runtime": 1086.0993, |
|
"eval_samples_per_second": 459.792, |
|
"eval_steps_per_second": 2.053, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.5852828025817871, |
|
"learning_rate": 0.0007287128712871287, |
|
"loss": 2.1091, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.49883949756622314, |
|
"learning_rate": 0.0007277227722772277, |
|
"loss": 2.1059, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.4874211251735687, |
|
"learning_rate": 0.0007267326732673268, |
|
"loss": 2.101, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.807388961315155, |
|
"learning_rate": 0.0007257425742574257, |
|
"loss": 2.1019, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.4876428246498108, |
|
"learning_rate": 0.0007247524752475248, |
|
"loss": 2.101, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.5534060597419739, |
|
"learning_rate": 0.0007237623762376237, |
|
"loss": 2.096, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.5464605093002319, |
|
"learning_rate": 0.0007227722772277228, |
|
"loss": 2.0999, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.6738607883453369, |
|
"learning_rate": 0.0007217821782178218, |
|
"loss": 2.0973, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.5829378366470337, |
|
"learning_rate": 0.0007207920792079208, |
|
"loss": 2.0943, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"eval_accuracy": 0.5862642603086486, |
|
"eval_loss": 1.9560039043426514, |
|
"eval_runtime": 1085.7866, |
|
"eval_samples_per_second": 459.925, |
|
"eval_steps_per_second": 2.054, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.6175411939620972, |
|
"learning_rate": 0.0007198019801980198, |
|
"loss": 2.097, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.4551532566547394, |
|
"learning_rate": 0.0007188118811881188, |
|
"loss": 2.0909, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.6612002849578857, |
|
"learning_rate": 0.0007178217821782178, |
|
"loss": 2.0885, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.5608059763908386, |
|
"learning_rate": 0.0007168316831683169, |
|
"loss": 2.0893, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 0.6450534462928772, |
|
"learning_rate": 0.0007158415841584158, |
|
"loss": 2.0881, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.5012123584747314, |
|
"learning_rate": 0.0007148514851485149, |
|
"loss": 2.0843, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.5907981395721436, |
|
"learning_rate": 0.0007138613861386138, |
|
"loss": 2.0848, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.6378484964370728, |
|
"learning_rate": 0.0007128712871287129, |
|
"loss": 2.0891, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.5008774399757385, |
|
"learning_rate": 0.0007118811881188119, |
|
"loss": 2.0839, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"eval_accuracy": 0.5886411137424359, |
|
"eval_loss": 1.9411782026290894, |
|
"eval_runtime": 1086.0682, |
|
"eval_samples_per_second": 459.805, |
|
"eval_steps_per_second": 2.053, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.5727280378341675, |
|
"learning_rate": 0.0007108910891089109, |
|
"loss": 2.0834, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.5417036414146423, |
|
"learning_rate": 0.0007099009900990099, |
|
"loss": 2.0806, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.5739684104919434, |
|
"learning_rate": 0.0007089108910891088, |
|
"loss": 2.0809, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.4842034876346588, |
|
"learning_rate": 0.0007079207920792079, |
|
"loss": 2.0787, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.7235381603240967, |
|
"learning_rate": 0.000706930693069307, |
|
"loss": 2.0761, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.5333278775215149, |
|
"learning_rate": 0.0007059405940594059, |
|
"loss": 2.082, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.5933953523635864, |
|
"learning_rate": 0.000704950495049505, |
|
"loss": 2.0711, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.3995033800601959, |
|
"learning_rate": 0.0007039603960396039, |
|
"loss": 2.072, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.5461521744728088, |
|
"learning_rate": 0.0007029702970297029, |
|
"loss": 2.0748, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"eval_accuracy": 0.5892188513716441, |
|
"eval_loss": 1.9380106925964355, |
|
"eval_runtime": 1086.2866, |
|
"eval_samples_per_second": 459.713, |
|
"eval_steps_per_second": 2.053, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.6759417057037354, |
|
"learning_rate": 0.000701980198019802, |
|
"loss": 2.0743, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 0.4820743203163147, |
|
"learning_rate": 0.0007009900990099009, |
|
"loss": 2.0708, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.5824475884437561, |
|
"learning_rate": 0.0007, |
|
"loss": 2.0676, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.5133311748504639, |
|
"learning_rate": 0.0006990099009900989, |
|
"loss": 2.0688, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 0.5744913220405579, |
|
"learning_rate": 0.000698019801980198, |
|
"loss": 2.0657, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.5103346705436707, |
|
"learning_rate": 0.000697029702970297, |
|
"loss": 2.0674, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.48300009965896606, |
|
"learning_rate": 0.000696039603960396, |
|
"loss": 2.0649, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.64620441198349, |
|
"learning_rate": 0.000695049504950495, |
|
"loss": 2.0658, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.5096336603164673, |
|
"learning_rate": 0.000694059405940594, |
|
"loss": 2.0671, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5915717983649539, |
|
"eval_loss": 1.9237810373306274, |
|
"eval_runtime": 1088.2291, |
|
"eval_samples_per_second": 458.892, |
|
"eval_steps_per_second": 2.049, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.45002949237823486, |
|
"learning_rate": 0.000693069306930693, |
|
"loss": 2.0612, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.48484668135643005, |
|
"learning_rate": 0.0006920792079207921, |
|
"loss": 2.0598, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 0.5888765454292297, |
|
"learning_rate": 0.000691089108910891, |
|
"loss": 2.0665, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.6311919689178467, |
|
"learning_rate": 0.0006900990099009901, |
|
"loss": 2.0604, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.4337356686592102, |
|
"learning_rate": 0.000689108910891089, |
|
"loss": 2.0562, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.5855375528335571, |
|
"learning_rate": 0.0006881188118811881, |
|
"loss": 2.0535, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 0.6715276837348938, |
|
"learning_rate": 0.0006871287128712872, |
|
"loss": 2.0569, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 0.5453487634658813, |
|
"learning_rate": 0.0006861386138613862, |
|
"loss": 2.0539, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.44611501693725586, |
|
"learning_rate": 0.0006851485148514852, |
|
"loss": 2.0557, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"eval_accuracy": 0.593527753135558, |
|
"eval_loss": 1.9137904644012451, |
|
"eval_runtime": 1092.9474, |
|
"eval_samples_per_second": 456.911, |
|
"eval_steps_per_second": 2.04, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.46945101022720337, |
|
"learning_rate": 0.0006841584158415842, |
|
"loss": 2.0494, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.6355441808700562, |
|
"learning_rate": 0.0006831683168316832, |
|
"loss": 2.0535, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.5079066157341003, |
|
"learning_rate": 0.0006821782178217823, |
|
"loss": 2.0535, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.48156508803367615, |
|
"learning_rate": 0.0006811881188118812, |
|
"loss": 2.0479, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.40759479999542236, |
|
"learning_rate": 0.0006801980198019803, |
|
"loss": 2.0465, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 0.6267878413200378, |
|
"learning_rate": 0.0006792079207920792, |
|
"loss": 2.047, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.4935464560985565, |
|
"learning_rate": 0.0006782178217821783, |
|
"loss": 2.0457, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.8215575814247131, |
|
"learning_rate": 0.0006772277227722773, |
|
"loss": 2.0459, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 0.5285871624946594, |
|
"learning_rate": 0.0006762376237623763, |
|
"loss": 2.046, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"eval_accuracy": 0.5947354040081998, |
|
"eval_loss": 1.9072514772415161, |
|
"eval_runtime": 1086.9971, |
|
"eval_samples_per_second": 459.412, |
|
"eval_steps_per_second": 2.052, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.4480656087398529, |
|
"learning_rate": 0.0006752475247524753, |
|
"loss": 2.0419, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.5856757760047913, |
|
"learning_rate": 0.0006742574257425743, |
|
"loss": 2.043, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.5994493365287781, |
|
"learning_rate": 0.0006732673267326733, |
|
"loss": 2.0423, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.5151802897453308, |
|
"learning_rate": 0.0006722772277227724, |
|
"loss": 2.0367, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.5299440622329712, |
|
"learning_rate": 0.0006712871287128713, |
|
"loss": 2.0404, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 0.5033411979675293, |
|
"learning_rate": 0.0006702970297029704, |
|
"loss": 2.0383, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 0.5515163540840149, |
|
"learning_rate": 0.0006693069306930693, |
|
"loss": 2.0377, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.6445341110229492, |
|
"learning_rate": 0.0006683168316831684, |
|
"loss": 2.0348, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.48192110657691956, |
|
"learning_rate": 0.0006673267326732674, |
|
"loss": 2.0376, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"eval_accuracy": 0.5963143559474492, |
|
"eval_loss": 1.8983112573623657, |
|
"eval_runtime": 1086.0935, |
|
"eval_samples_per_second": 459.795, |
|
"eval_steps_per_second": 2.053, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.43177658319473267, |
|
"learning_rate": 0.0006663366336633664, |
|
"loss": 2.0304, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 0.5673606395721436, |
|
"learning_rate": 0.0006653465346534654, |
|
"loss": 2.0342, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.6636048555374146, |
|
"learning_rate": 0.0006643564356435644, |
|
"loss": 2.035, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 0.5259466171264648, |
|
"learning_rate": 0.0006633663366336634, |
|
"loss": 2.0323, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 0.561341404914856, |
|
"learning_rate": 0.0006623762376237625, |
|
"loss": 2.0277, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.48961034417152405, |
|
"learning_rate": 0.0006613861386138614, |
|
"loss": 2.0316, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.46912887692451477, |
|
"learning_rate": 0.0006603960396039605, |
|
"loss": 2.0285, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.5009626746177673, |
|
"learning_rate": 0.0006594059405940594, |
|
"loss": 2.0287, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.5485634207725525, |
|
"learning_rate": 0.0006584158415841585, |
|
"loss": 2.0275, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"eval_accuracy": 0.5978907811550054, |
|
"eval_loss": 1.8894693851470947, |
|
"eval_runtime": 1085.9104, |
|
"eval_samples_per_second": 459.872, |
|
"eval_steps_per_second": 2.054, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 0.5094584822654724, |
|
"learning_rate": 0.0006574257425742575, |
|
"loss": 2.0285, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.5796740651130676, |
|
"learning_rate": 0.0006564356435643565, |
|
"loss": 2.0246, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.5473222136497498, |
|
"learning_rate": 0.0006554455445544555, |
|
"loss": 2.0262, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 0.514639675617218, |
|
"learning_rate": 0.0006544554455445545, |
|
"loss": 2.0196, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.7184515595436096, |
|
"learning_rate": 0.0006534653465346535, |
|
"loss": 2.0228, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.5729575157165527, |
|
"learning_rate": 0.0006524752475247526, |
|
"loss": 2.0216, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.543946385383606, |
|
"learning_rate": 0.0006514851485148515, |
|
"loss": 2.0221, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 0.5328618884086609, |
|
"learning_rate": 0.0006504950495049506, |
|
"loss": 2.0205, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.45202726125717163, |
|
"learning_rate": 0.0006495049504950495, |
|
"loss": 2.0171, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"eval_accuracy": 0.5995061137091588, |
|
"eval_loss": 1.880002498626709, |
|
"eval_runtime": 1087.0114, |
|
"eval_samples_per_second": 459.406, |
|
"eval_steps_per_second": 2.051, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 0.6767913103103638, |
|
"learning_rate": 0.0006485148514851485, |
|
"loss": 2.0196, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 0.6758072972297668, |
|
"learning_rate": 0.0006475247524752476, |
|
"loss": 2.0187, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.49338895082473755, |
|
"learning_rate": 0.0006465346534653465, |
|
"loss": 2.0203, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.6283939480781555, |
|
"learning_rate": 0.0006455445544554456, |
|
"loss": 2.0141, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.4420575499534607, |
|
"learning_rate": 0.0006445544554455445, |
|
"loss": 2.0169, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.5496084690093994, |
|
"learning_rate": 0.0006435643564356436, |
|
"loss": 2.0132, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.5455350279808044, |
|
"learning_rate": 0.0006425742574257426, |
|
"loss": 2.0135, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 0.5139738917350769, |
|
"learning_rate": 0.0006415841584158416, |
|
"loss": 2.0165, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 0.5023753046989441, |
|
"learning_rate": 0.0006405940594059406, |
|
"loss": 2.0107, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"eval_accuracy": 0.6004914418538849, |
|
"eval_loss": 1.8729732036590576, |
|
"eval_runtime": 1086.4568, |
|
"eval_samples_per_second": 459.641, |
|
"eval_steps_per_second": 2.053, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.50832599401474, |
|
"learning_rate": 0.0006396039603960396, |
|
"loss": 2.0125, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.6144891977310181, |
|
"learning_rate": 0.0006386138613861386, |
|
"loss": 2.0074, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.5852723121643066, |
|
"learning_rate": 0.0006376237623762377, |
|
"loss": 2.0116, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.6694257259368896, |
|
"learning_rate": 0.0006366336633663366, |
|
"loss": 2.0068, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 0.5024294257164001, |
|
"learning_rate": 0.0006356435643564357, |
|
"loss": 2.0097, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.4397622048854828, |
|
"learning_rate": 0.0006346534653465346, |
|
"loss": 2.0058, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.46592214703559875, |
|
"learning_rate": 0.0006336633663366337, |
|
"loss": 2.0036, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 0.6728220582008362, |
|
"learning_rate": 0.0006326732673267327, |
|
"loss": 2.0027, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.5555120706558228, |
|
"learning_rate": 0.0006316831683168317, |
|
"loss": 2.0037, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"eval_accuracy": 0.6017631778318724, |
|
"eval_loss": 1.8681055307388306, |
|
"eval_runtime": 1084.7583, |
|
"eval_samples_per_second": 460.361, |
|
"eval_steps_per_second": 2.056, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.5361539721488953, |
|
"learning_rate": 0.0006306930693069307, |
|
"loss": 2.0029, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.4616907238960266, |
|
"learning_rate": 0.0006297029702970297, |
|
"loss": 2.004, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 0.4987693727016449, |
|
"learning_rate": 0.0006287128712871287, |
|
"loss": 2.0023, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.5090926289558411, |
|
"learning_rate": 0.0006277227722772278, |
|
"loss": 1.9998, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.5008625388145447, |
|
"learning_rate": 0.0006267326732673267, |
|
"loss": 1.9983, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.5074314475059509, |
|
"learning_rate": 0.0006257425742574258, |
|
"loss": 1.9971, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.5803602933883667, |
|
"learning_rate": 0.0006247524752475247, |
|
"loss": 1.9974, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.534377932548523, |
|
"learning_rate": 0.0006237623762376238, |
|
"loss": 1.9962, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.5166971683502197, |
|
"learning_rate": 0.0006227722772277228, |
|
"loss": 1.9967, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"eval_accuracy": 0.6030721722960667, |
|
"eval_loss": 1.8595592975616455, |
|
"eval_runtime": 1083.857, |
|
"eval_samples_per_second": 460.743, |
|
"eval_steps_per_second": 2.057, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.610471785068512, |
|
"learning_rate": 0.0006217821782178218, |
|
"loss": 1.9972, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.4919542372226715, |
|
"learning_rate": 0.0006207920792079208, |
|
"loss": 1.9945, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 0.6607844829559326, |
|
"learning_rate": 0.0006198019801980198, |
|
"loss": 1.9962, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 0.4905446171760559, |
|
"learning_rate": 0.0006188118811881188, |
|
"loss": 1.9937, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 0.46790874004364014, |
|
"learning_rate": 0.0006178217821782179, |
|
"loss": 1.99, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.5997541546821594, |
|
"learning_rate": 0.0006168316831683168, |
|
"loss": 1.9928, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.5869884490966797, |
|
"learning_rate": 0.0006158415841584159, |
|
"loss": 1.9913, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.5359517335891724, |
|
"learning_rate": 0.0006148514851485148, |
|
"loss": 1.9892, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 0.5119579434394836, |
|
"learning_rate": 0.0006138613861386139, |
|
"loss": 1.9892, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"eval_accuracy": 0.6040549869719135, |
|
"eval_loss": 1.8538638353347778, |
|
"eval_runtime": 1084.8597, |
|
"eval_samples_per_second": 460.318, |
|
"eval_steps_per_second": 2.056, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.501132071018219, |
|
"learning_rate": 0.0006128712871287129, |
|
"loss": 1.9874, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.5507422089576721, |
|
"learning_rate": 0.0006118811881188119, |
|
"loss": 1.9904, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 0.5412635207176208, |
|
"learning_rate": 0.0006108910891089109, |
|
"loss": 1.9873, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.44309449195861816, |
|
"learning_rate": 0.0006099009900990099, |
|
"loss": 1.9869, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.5056418776512146, |
|
"learning_rate": 0.0006089108910891089, |
|
"loss": 1.9855, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.48882943391799927, |
|
"learning_rate": 0.000607920792079208, |
|
"loss": 1.984, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.4717276692390442, |
|
"learning_rate": 0.0006069306930693069, |
|
"loss": 1.9841, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.5123859643936157, |
|
"learning_rate": 0.000605940594059406, |
|
"loss": 1.9847, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 0.4882528483867645, |
|
"learning_rate": 0.0006049504950495049, |
|
"loss": 1.9824, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"eval_accuracy": 0.6054229902100914, |
|
"eval_loss": 1.8454294204711914, |
|
"eval_runtime": 1084.8895, |
|
"eval_samples_per_second": 460.305, |
|
"eval_steps_per_second": 2.056, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 0.4354398548603058, |
|
"learning_rate": 0.000603960396039604, |
|
"loss": 1.9806, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.5733498334884644, |
|
"learning_rate": 0.000602970297029703, |
|
"loss": 1.9828, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.5736912488937378, |
|
"learning_rate": 0.000601980198019802, |
|
"loss": 1.9816, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.44418302178382874, |
|
"learning_rate": 0.000600990099009901, |
|
"loss": 1.978, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.5334004759788513, |
|
"learning_rate": 0.0006, |
|
"loss": 1.9814, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.5706362128257751, |
|
"learning_rate": 0.000599009900990099, |
|
"loss": 1.9787, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 0.8315806984901428, |
|
"learning_rate": 0.000598019801980198, |
|
"loss": 1.9772, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 0.47482118010520935, |
|
"learning_rate": 0.000597029702970297, |
|
"loss": 1.98, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.43432384729385376, |
|
"learning_rate": 0.000596039603960396, |
|
"loss": 1.9766, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"eval_accuracy": 0.6066947766959975, |
|
"eval_loss": 1.8382798433303833, |
|
"eval_runtime": 1084.0208, |
|
"eval_samples_per_second": 460.674, |
|
"eval_steps_per_second": 2.057, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.4684685170650482, |
|
"learning_rate": 0.000595049504950495, |
|
"loss": 1.9747, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.5347057580947876, |
|
"learning_rate": 0.000594059405940594, |
|
"loss": 1.9773, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.5178421139717102, |
|
"learning_rate": 0.0005930693069306931, |
|
"loss": 1.9754, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.5036115050315857, |
|
"learning_rate": 0.000592079207920792, |
|
"loss": 1.9737, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.48273569345474243, |
|
"learning_rate": 0.0005910891089108911, |
|
"loss": 1.9723, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.4350590109825134, |
|
"learning_rate": 0.00059009900990099, |
|
"loss": 1.9715, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.4483092129230499, |
|
"learning_rate": 0.0005891089108910891, |
|
"loss": 1.9718, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.6620519757270813, |
|
"learning_rate": 0.0005881188118811881, |
|
"loss": 1.9726, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.41184690594673157, |
|
"learning_rate": 0.0005871287128712871, |
|
"loss": 1.9682, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"eval_accuracy": 0.6078188605164281, |
|
"eval_loss": 1.8335860967636108, |
|
"eval_runtime": 1085.8051, |
|
"eval_samples_per_second": 459.917, |
|
"eval_steps_per_second": 2.054, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.6032079458236694, |
|
"learning_rate": 0.0005861386138613861, |
|
"loss": 1.9701, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.5113199949264526, |
|
"learning_rate": 0.0005851485148514851, |
|
"loss": 1.9715, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.43198567628860474, |
|
"learning_rate": 0.0005841584158415841, |
|
"loss": 1.9677, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 0.511009931564331, |
|
"learning_rate": 0.0005831683168316832, |
|
"loss": 1.9692, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 0.48394373059272766, |
|
"learning_rate": 0.0005821782178217821, |
|
"loss": 1.9658, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.5180623531341553, |
|
"learning_rate": 0.0005811881188118812, |
|
"loss": 1.9656, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.5282729864120483, |
|
"learning_rate": 0.0005801980198019801, |
|
"loss": 1.9672, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.44400766491889954, |
|
"learning_rate": 0.0005792079207920792, |
|
"loss": 1.9628, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.47033068537712097, |
|
"learning_rate": 0.0005782178217821782, |
|
"loss": 1.9653, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"eval_accuracy": 0.6084373992178661, |
|
"eval_loss": 1.8281679153442383, |
|
"eval_runtime": 1084.5844, |
|
"eval_samples_per_second": 460.434, |
|
"eval_steps_per_second": 2.056, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 0.7458497881889343, |
|
"learning_rate": 0.0005772277227722772, |
|
"loss": 1.9662, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.4841892421245575, |
|
"learning_rate": 0.0005762376237623762, |
|
"loss": 1.9645, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.472526490688324, |
|
"learning_rate": 0.0005752475247524752, |
|
"loss": 1.9621, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.41672539710998535, |
|
"learning_rate": 0.0005742574257425742, |
|
"loss": 1.9595, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.5024113059043884, |
|
"learning_rate": 0.0005732673267326733, |
|
"loss": 1.9593, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.5023587942123413, |
|
"learning_rate": 0.0005722772277227722, |
|
"loss": 1.9599, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.45752909779548645, |
|
"learning_rate": 0.0005712871287128713, |
|
"loss": 1.9599, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 0.6170557737350464, |
|
"learning_rate": 0.0005702970297029702, |
|
"loss": 1.9616, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.4267810583114624, |
|
"learning_rate": 0.0005693069306930693, |
|
"loss": 1.9599, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"eval_accuracy": 0.609472718679915, |
|
"eval_loss": 1.8216131925582886, |
|
"eval_runtime": 1085.0974, |
|
"eval_samples_per_second": 460.217, |
|
"eval_steps_per_second": 2.055, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 0.5765691995620728, |
|
"learning_rate": 0.0005683168316831683, |
|
"loss": 1.9571, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 0.5143380761146545, |
|
"learning_rate": 0.0005673267326732673, |
|
"loss": 1.9594, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 0.47696933150291443, |
|
"learning_rate": 0.0005663366336633663, |
|
"loss": 1.9569, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.6473893523216248, |
|
"learning_rate": 0.0005653465346534653, |
|
"loss": 1.959, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.42045238614082336, |
|
"learning_rate": 0.0005643564356435643, |
|
"loss": 1.9537, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.47495463490486145, |
|
"learning_rate": 0.0005633663366336634, |
|
"loss": 1.9539, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.46555668115615845, |
|
"learning_rate": 0.0005623762376237624, |
|
"loss": 1.9532, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 0.5669355988502502, |
|
"learning_rate": 0.0005613861386138615, |
|
"loss": 1.9539, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 0.5012803077697754, |
|
"learning_rate": 0.0005603960396039604, |
|
"loss": 1.9516, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"eval_accuracy": 0.6106878911706901, |
|
"eval_loss": 1.8161377906799316, |
|
"eval_runtime": 1086.8974, |
|
"eval_samples_per_second": 459.455, |
|
"eval_steps_per_second": 2.052, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.6031624674797058, |
|
"learning_rate": 0.0005594059405940595, |
|
"loss": 1.951, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.48754164576530457, |
|
"learning_rate": 0.0005584158415841585, |
|
"loss": 1.9512, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"grad_norm": 0.5661942362785339, |
|
"learning_rate": 0.0005574257425742575, |
|
"loss": 1.953, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"grad_norm": 0.5200914144515991, |
|
"learning_rate": 0.0005564356435643565, |
|
"loss": 1.9528, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"grad_norm": 0.504625678062439, |
|
"learning_rate": 0.0005554455445544555, |
|
"loss": 1.9473, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 5.03, |
|
"grad_norm": 0.5114207863807678, |
|
"learning_rate": 0.0005544554455445545, |
|
"loss": 1.9503, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 5.03, |
|
"grad_norm": 0.4563724100589752, |
|
"learning_rate": 0.0005534653465346536, |
|
"loss": 1.9522, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 0.48981210589408875, |
|
"learning_rate": 0.0005524752475247525, |
|
"loss": 1.9463, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"grad_norm": 0.4641856849193573, |
|
"learning_rate": 0.0005514851485148516, |
|
"loss": 1.9473, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"eval_accuracy": 0.6114587633382502, |
|
"eval_loss": 1.8128423690795898, |
|
"eval_runtime": 1088.997, |
|
"eval_samples_per_second": 458.569, |
|
"eval_steps_per_second": 2.048, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 0.522405743598938, |
|
"learning_rate": 0.0005504950495049505, |
|
"loss": 1.9464, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 0.5010780692100525, |
|
"learning_rate": 0.0005495049504950496, |
|
"loss": 1.9462, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 5.07, |
|
"grad_norm": 0.4186078906059265, |
|
"learning_rate": 0.0005485148514851486, |
|
"loss": 1.9454, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 0.43226584792137146, |
|
"learning_rate": 0.0005475247524752476, |
|
"loss": 1.9429, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"grad_norm": 0.4429096579551697, |
|
"learning_rate": 0.0005465346534653466, |
|
"loss": 1.9439, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"grad_norm": 0.5576241612434387, |
|
"learning_rate": 0.0005455445544554456, |
|
"loss": 1.9465, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 0.5840058326721191, |
|
"learning_rate": 0.0005445544554455446, |
|
"loss": 1.9465, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 0.42570897936820984, |
|
"learning_rate": 0.0005435643564356437, |
|
"loss": 1.9423, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"grad_norm": 0.4703156650066376, |
|
"learning_rate": 0.0005425742574257426, |
|
"loss": 1.9445, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 5.11, |
|
"eval_accuracy": 0.6122824776592447, |
|
"eval_loss": 1.8064905405044556, |
|
"eval_runtime": 1089.2916, |
|
"eval_samples_per_second": 458.445, |
|
"eval_steps_per_second": 2.047, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 0.47701114416122437, |
|
"learning_rate": 0.0005415841584158417, |
|
"loss": 1.9414, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 5.13, |
|
"grad_norm": 0.4756263196468353, |
|
"learning_rate": 0.0005405940594059406, |
|
"loss": 1.9412, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 5.14, |
|
"grad_norm": 0.4438433051109314, |
|
"learning_rate": 0.0005396039603960396, |
|
"loss": 1.9396, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 5.14, |
|
"grad_norm": 0.7217634916305542, |
|
"learning_rate": 0.0005386138613861387, |
|
"loss": 1.9405, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"grad_norm": 0.5862283110618591, |
|
"learning_rate": 0.0005376237623762376, |
|
"loss": 1.9408, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 0.4042336642742157, |
|
"learning_rate": 0.0005366336633663367, |
|
"loss": 1.9389, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 5.17, |
|
"grad_norm": 0.48928365111351013, |
|
"learning_rate": 0.0005356435643564356, |
|
"loss": 1.9387, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 5.17, |
|
"grad_norm": 0.4354238212108612, |
|
"learning_rate": 0.0005346534653465347, |
|
"loss": 1.9383, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"grad_norm": 0.4175672233104706, |
|
"learning_rate": 0.0005336633663366337, |
|
"loss": 1.9352, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"eval_accuracy": 0.6133205485630702, |
|
"eval_loss": 1.8010112047195435, |
|
"eval_runtime": 1088.7719, |
|
"eval_samples_per_second": 458.664, |
|
"eval_steps_per_second": 2.048, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 0.49512535333633423, |
|
"learning_rate": 0.0005326732673267327, |
|
"loss": 1.936, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 0.5564088225364685, |
|
"learning_rate": 0.0005316831683168317, |
|
"loss": 1.9352, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 0.42951545119285583, |
|
"learning_rate": 0.0005306930693069307, |
|
"loss": 1.9382, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 5.21, |
|
"grad_norm": 0.4925052523612976, |
|
"learning_rate": 0.0005297029702970297, |
|
"loss": 1.9332, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 0.47808635234832764, |
|
"learning_rate": 0.0005287128712871288, |
|
"loss": 1.9347, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 0.49086272716522217, |
|
"learning_rate": 0.0005277227722772277, |
|
"loss": 1.9346, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 5.23, |
|
"grad_norm": 0.6410700678825378, |
|
"learning_rate": 0.0005267326732673268, |
|
"loss": 1.9355, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 0.5207043886184692, |
|
"learning_rate": 0.0005257425742574257, |
|
"loss": 1.9362, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.4774588346481323, |
|
"learning_rate": 0.0005247524752475248, |
|
"loss": 1.9275, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"eval_accuracy": 0.6140941851085354, |
|
"eval_loss": 1.795212745666504, |
|
"eval_runtime": 1086.7491, |
|
"eval_samples_per_second": 459.517, |
|
"eval_steps_per_second": 2.052, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.43084409832954407, |
|
"learning_rate": 0.0005237623762376238, |
|
"loss": 1.9323, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 5.26, |
|
"grad_norm": 0.47473257780075073, |
|
"learning_rate": 0.0005227722772277228, |
|
"loss": 1.9325, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"grad_norm": 0.5835340023040771, |
|
"learning_rate": 0.0005217821782178218, |
|
"loss": 1.9307, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 0.5061103701591492, |
|
"learning_rate": 0.0005207920792079208, |
|
"loss": 1.9312, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 0.41326335072517395, |
|
"learning_rate": 0.0005198019801980198, |
|
"loss": 1.9302, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"grad_norm": 0.4128727316856384, |
|
"learning_rate": 0.0005188118811881189, |
|
"loss": 1.9306, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 0.4514748752117157, |
|
"learning_rate": 0.0005178217821782178, |
|
"loss": 1.9306, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 5.3, |
|
"grad_norm": 0.4459412097930908, |
|
"learning_rate": 0.0005168316831683169, |
|
"loss": 1.9273, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 5.31, |
|
"grad_norm": 0.508747935295105, |
|
"learning_rate": 0.0005158415841584158, |
|
"loss": 1.9288, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 5.31, |
|
"eval_accuracy": 0.6147504784530803, |
|
"eval_loss": 1.7935823202133179, |
|
"eval_runtime": 1086.9403, |
|
"eval_samples_per_second": 459.436, |
|
"eval_steps_per_second": 2.052, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.47734731435775757, |
|
"learning_rate": 0.0005148514851485149, |
|
"loss": 1.9298, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"grad_norm": 0.5174552798271179, |
|
"learning_rate": 0.0005138613861386139, |
|
"loss": 1.9265, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 5.33, |
|
"grad_norm": 0.5077902674674988, |
|
"learning_rate": 0.0005128712871287129, |
|
"loss": 1.9261, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 5.34, |
|
"grad_norm": 0.5650092959403992, |
|
"learning_rate": 0.0005118811881188119, |
|
"loss": 1.9298, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"grad_norm": 0.4757165312767029, |
|
"learning_rate": 0.0005108910891089109, |
|
"loss": 1.924, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 5.36, |
|
"grad_norm": 0.39661648869514465, |
|
"learning_rate": 0.0005099009900990099, |
|
"loss": 1.9246, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 5.36, |
|
"grad_norm": 0.4992424547672272, |
|
"learning_rate": 0.000508910891089109, |
|
"loss": 1.9238, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 0.5065789222717285, |
|
"learning_rate": 0.0005079207920792079, |
|
"loss": 1.9227, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 0.4040853977203369, |
|
"learning_rate": 0.000506930693069307, |
|
"loss": 1.9216, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"eval_accuracy": 0.6157344716957146, |
|
"eval_loss": 1.7872822284698486, |
|
"eval_runtime": 1088.1718, |
|
"eval_samples_per_second": 458.917, |
|
"eval_steps_per_second": 2.049, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 0.5322463512420654, |
|
"learning_rate": 0.0005059405940594059, |
|
"loss": 1.9224, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 5.39, |
|
"grad_norm": 0.7337666153907776, |
|
"learning_rate": 0.000504950495049505, |
|
"loss": 1.9215, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 0.4274744987487793, |
|
"learning_rate": 0.000503960396039604, |
|
"loss": 1.9204, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"grad_norm": 0.5131354928016663, |
|
"learning_rate": 0.000502970297029703, |
|
"loss": 1.9232, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 5.41, |
|
"grad_norm": 0.46376702189445496, |
|
"learning_rate": 0.000501980198019802, |
|
"loss": 1.9206, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"grad_norm": 0.4123290181159973, |
|
"learning_rate": 0.000500990099009901, |
|
"loss": 1.9195, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"grad_norm": 0.5006473660469055, |
|
"learning_rate": 0.0005, |
|
"loss": 1.9193, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"grad_norm": 0.4529099464416504, |
|
"learning_rate": 0.0004990099009900991, |
|
"loss": 1.9194, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"grad_norm": 0.43744367361068726, |
|
"learning_rate": 0.000498019801980198, |
|
"loss": 1.9215, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"eval_accuracy": 0.6164003583042955, |
|
"eval_loss": 1.7831112146377563, |
|
"eval_runtime": 1087.85, |
|
"eval_samples_per_second": 459.052, |
|
"eval_steps_per_second": 2.05, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"grad_norm": 0.4329184591770172, |
|
"learning_rate": 0.0004970297029702971, |
|
"loss": 1.9184, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 0.4381932020187378, |
|
"learning_rate": 0.000496039603960396, |
|
"loss": 1.9194, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 0.4533543586730957, |
|
"learning_rate": 0.0004950495049504951, |
|
"loss": 1.9163, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 5.47, |
|
"grad_norm": 0.50531405210495, |
|
"learning_rate": 0.0004940594059405941, |
|
"loss": 1.9196, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 0.45237472653388977, |
|
"learning_rate": 0.000493069306930693, |
|
"loss": 1.9148, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 0.4136326313018799, |
|
"learning_rate": 0.0004920792079207921, |
|
"loss": 1.9152, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 5.49, |
|
"grad_norm": 0.5655802488327026, |
|
"learning_rate": 0.000491089108910891, |
|
"loss": 1.9162, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.4693652391433716, |
|
"learning_rate": 0.0004900990099009901, |
|
"loss": 1.9172, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 5.51, |
|
"grad_norm": 0.43454521894454956, |
|
"learning_rate": 0.0004891089108910892, |
|
"loss": 1.9143, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 5.51, |
|
"eval_accuracy": 0.6173679783746377, |
|
"eval_loss": 1.7775607109069824, |
|
"eval_runtime": 1086.5491, |
|
"eval_samples_per_second": 459.602, |
|
"eval_steps_per_second": 2.052, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 0.5520709156990051, |
|
"learning_rate": 0.0004881188118811881, |
|
"loss": 1.9129, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 0.5848320722579956, |
|
"learning_rate": 0.00048712871287128715, |
|
"loss": 1.9157, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 5.53, |
|
"grad_norm": 0.4443696141242981, |
|
"learning_rate": 0.00048613861386138615, |
|
"loss": 1.9129, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 0.4639554023742676, |
|
"learning_rate": 0.00048514851485148515, |
|
"loss": 1.9118, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 5.54, |
|
"grad_norm": 0.4965130090713501, |
|
"learning_rate": 0.00048415841584158414, |
|
"loss": 1.911, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 0.5710552334785461, |
|
"learning_rate": 0.00048316831683168314, |
|
"loss": 1.9133, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 5.56, |
|
"grad_norm": 0.5551273226737976, |
|
"learning_rate": 0.0004821782178217822, |
|
"loss": 1.9115, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"grad_norm": 0.4237355887889862, |
|
"learning_rate": 0.0004811881188118812, |
|
"loss": 1.9107, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"grad_norm": 0.5999632477760315, |
|
"learning_rate": 0.0004801980198019802, |
|
"loss": 1.9125, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"eval_accuracy": 0.6181112629292015, |
|
"eval_loss": 1.7737356424331665, |
|
"eval_runtime": 1089.6084, |
|
"eval_samples_per_second": 458.311, |
|
"eval_steps_per_second": 2.047, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 5.58, |
|
"grad_norm": 0.4806526303291321, |
|
"learning_rate": 0.0004792079207920792, |
|
"loss": 1.9102, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 5.59, |
|
"grad_norm": 0.4812794327735901, |
|
"learning_rate": 0.0004782178217821782, |
|
"loss": 1.9071, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 0.4029771089553833, |
|
"learning_rate": 0.00047722772277227724, |
|
"loss": 1.9081, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 0.40626466274261475, |
|
"learning_rate": 0.00047623762376237624, |
|
"loss": 1.9113, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 5.61, |
|
"grad_norm": 0.4513317942619324, |
|
"learning_rate": 0.00047524752475247524, |
|
"loss": 1.9088, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 0.4461658000946045, |
|
"learning_rate": 0.00047425742574257423, |
|
"loss": 1.9054, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 0.4517589807510376, |
|
"learning_rate": 0.00047326732673267323, |
|
"loss": 1.9082, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 5.63, |
|
"grad_norm": 0.45047953724861145, |
|
"learning_rate": 0.0004722772277227723, |
|
"loss": 1.9064, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 0.4856921434402466, |
|
"learning_rate": 0.0004712871287128713, |
|
"loss": 1.9075, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"eval_accuracy": 0.6187679819070717, |
|
"eval_loss": 1.7702995538711548, |
|
"eval_runtime": 1084.6666, |
|
"eval_samples_per_second": 460.4, |
|
"eval_steps_per_second": 2.056, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"grad_norm": 0.4753153920173645, |
|
"learning_rate": 0.0004702970297029703, |
|
"loss": 1.9063, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 5.65, |
|
"grad_norm": 0.45308151841163635, |
|
"learning_rate": 0.0004693069306930693, |
|
"loss": 1.9077, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 5.66, |
|
"grad_norm": 0.4464881718158722, |
|
"learning_rate": 0.00046831683168316833, |
|
"loss": 1.9038, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"grad_norm": 0.49109694361686707, |
|
"learning_rate": 0.0004673267326732674, |
|
"loss": 1.9041, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 0.45286795496940613, |
|
"learning_rate": 0.0004663366336633664, |
|
"loss": 1.9028, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 0.4543616771697998, |
|
"learning_rate": 0.0004653465346534654, |
|
"loss": 1.9055, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 5.69, |
|
"grad_norm": 0.45107975602149963, |
|
"learning_rate": 0.0004643564356435644, |
|
"loss": 1.905, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 0.4717351198196411, |
|
"learning_rate": 0.0004633663366336634, |
|
"loss": 1.9035, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"grad_norm": 0.47570666670799255, |
|
"learning_rate": 0.00046237623762376243, |
|
"loss": 1.9023, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"eval_accuracy": 0.619487519380987, |
|
"eval_loss": 1.764754295349121, |
|
"eval_runtime": 1085.4383, |
|
"eval_samples_per_second": 460.072, |
|
"eval_steps_per_second": 2.054, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 5.71, |
|
"grad_norm": 0.4219855070114136, |
|
"learning_rate": 0.00046138613861386143, |
|
"loss": 1.9023, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 0.43669965863227844, |
|
"learning_rate": 0.0004603960396039604, |
|
"loss": 1.9027, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 0.3779612183570862, |
|
"learning_rate": 0.0004594059405940594, |
|
"loss": 1.8983, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 0.42528143525123596, |
|
"learning_rate": 0.0004584158415841584, |
|
"loss": 1.9003, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"grad_norm": 0.5401535630226135, |
|
"learning_rate": 0.0004574257425742575, |
|
"loss": 1.902, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.5040842890739441, |
|
"learning_rate": 0.00045643564356435647, |
|
"loss": 1.898, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"grad_norm": 0.40838822722435, |
|
"learning_rate": 0.00045544554455445547, |
|
"loss": 1.9009, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"grad_norm": 0.4518304169178009, |
|
"learning_rate": 0.00045445544554455447, |
|
"loss": 1.8971, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 5.77, |
|
"grad_norm": 0.4375505745410919, |
|
"learning_rate": 0.00045346534653465347, |
|
"loss": 1.8968, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 5.77, |
|
"eval_accuracy": 0.6199796521907708, |
|
"eval_loss": 1.7632313966751099, |
|
"eval_runtime": 1087.3022, |
|
"eval_samples_per_second": 459.284, |
|
"eval_steps_per_second": 2.051, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"grad_norm": 0.4348089098930359, |
|
"learning_rate": 0.0004524752475247525, |
|
"loss": 1.897, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"grad_norm": 0.4852411150932312, |
|
"learning_rate": 0.0004514851485148515, |
|
"loss": 1.8979, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"grad_norm": 0.3691408932209015, |
|
"learning_rate": 0.0004504950495049505, |
|
"loss": 1.8952, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 0.40795382857322693, |
|
"learning_rate": 0.0004495049504950495, |
|
"loss": 1.8976, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 0.5768758058547974, |
|
"learning_rate": 0.0004485148514851485, |
|
"loss": 1.9002, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 5.81, |
|
"grad_norm": 0.47147759795188904, |
|
"learning_rate": 0.00044752475247524756, |
|
"loss": 1.8962, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 0.4033481478691101, |
|
"learning_rate": 0.00044653465346534656, |
|
"loss": 1.8902, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"grad_norm": 0.474514365196228, |
|
"learning_rate": 0.00044554455445544556, |
|
"loss": 1.8914, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 0.4343509376049042, |
|
"learning_rate": 0.00044455445544554456, |
|
"loss": 1.8909, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"eval_accuracy": 0.6212259728987508, |
|
"eval_loss": 1.755420207977295, |
|
"eval_runtime": 1084.9224, |
|
"eval_samples_per_second": 460.291, |
|
"eval_steps_per_second": 2.055, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 0.435395210981369, |
|
"learning_rate": 0.00044356435643564356, |
|
"loss": 1.8907, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 5.85, |
|
"grad_norm": 0.48715683817863464, |
|
"learning_rate": 0.0004425742574257426, |
|
"loss": 1.8894, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 5.86, |
|
"grad_norm": 0.4001710116863251, |
|
"learning_rate": 0.0004415841584158416, |
|
"loss": 1.89, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 5.87, |
|
"grad_norm": 0.38079318404197693, |
|
"learning_rate": 0.0004405940594059406, |
|
"loss": 1.8893, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 5.87, |
|
"grad_norm": 0.5229191780090332, |
|
"learning_rate": 0.0004396039603960396, |
|
"loss": 1.8916, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 0.4705289900302887, |
|
"learning_rate": 0.0004386138613861386, |
|
"loss": 1.891, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 5.89, |
|
"grad_norm": 0.4324556589126587, |
|
"learning_rate": 0.00043762376237623765, |
|
"loss": 1.8872, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 5.89, |
|
"grad_norm": 0.4106965959072113, |
|
"learning_rate": 0.00043663366336633665, |
|
"loss": 1.8861, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 5.9, |
|
"grad_norm": 0.461008220911026, |
|
"learning_rate": 0.00043564356435643565, |
|
"loss": 1.8903, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 5.9, |
|
"eval_accuracy": 0.6217799147539793, |
|
"eval_loss": 1.751676082611084, |
|
"eval_runtime": 1084.7892, |
|
"eval_samples_per_second": 460.348, |
|
"eval_steps_per_second": 2.056, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 0.4489499032497406, |
|
"learning_rate": 0.00043465346534653465, |
|
"loss": 1.8894, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 0.40970441699028015, |
|
"learning_rate": 0.00043366336633663365, |
|
"loss": 1.8858, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 0.397197425365448, |
|
"learning_rate": 0.0004326732673267327, |
|
"loss": 1.8866, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 5.93, |
|
"grad_norm": 0.4046621322631836, |
|
"learning_rate": 0.0004316831683168317, |
|
"loss": 1.8865, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 5.94, |
|
"grad_norm": 0.4488195478916168, |
|
"learning_rate": 0.0004306930693069307, |
|
"loss": 1.8898, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 5.95, |
|
"grad_norm": 0.37480929493904114, |
|
"learning_rate": 0.0004297029702970297, |
|
"loss": 1.883, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 5.95, |
|
"grad_norm": 0.39327338337898254, |
|
"learning_rate": 0.0004287128712871287, |
|
"loss": 1.8837, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 0.4443519115447998, |
|
"learning_rate": 0.00042772277227722774, |
|
"loss": 1.8825, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 0.4902293384075165, |
|
"learning_rate": 0.00042673267326732674, |
|
"loss": 1.8837, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"eval_accuracy": 0.622874142241673, |
|
"eval_loss": 1.7469381093978882, |
|
"eval_runtime": 1082.2695, |
|
"eval_samples_per_second": 461.419, |
|
"eval_steps_per_second": 2.06, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 0.43381059169769287, |
|
"learning_rate": 0.00042574257425742574, |
|
"loss": 1.8812, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"grad_norm": 0.4675629734992981, |
|
"learning_rate": 0.00042475247524752474, |
|
"loss": 1.8808, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 5.99, |
|
"grad_norm": 0.4100710451602936, |
|
"learning_rate": 0.00042376237623762374, |
|
"loss": 1.882, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.5555077791213989, |
|
"learning_rate": 0.0004227722772277228, |
|
"loss": 1.8824, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.5407902598381042, |
|
"learning_rate": 0.0004217821782178218, |
|
"loss": 1.8813, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 0.48739898204803467, |
|
"learning_rate": 0.0004207920792079208, |
|
"loss": 1.8782, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 0.4977239966392517, |
|
"learning_rate": 0.0004198019801980198, |
|
"loss": 1.8801, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 6.03, |
|
"grad_norm": 0.47402721643447876, |
|
"learning_rate": 0.0004188118811881188, |
|
"loss": 1.8811, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 6.03, |
|
"grad_norm": 0.4796026349067688, |
|
"learning_rate": 0.00041782178217821784, |
|
"loss": 1.8801, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 6.03, |
|
"eval_accuracy": 0.6238451571943063, |
|
"eval_loss": 1.741037130355835, |
|
"eval_runtime": 1087.5002, |
|
"eval_samples_per_second": 459.2, |
|
"eval_steps_per_second": 2.051, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"grad_norm": 0.419162392616272, |
|
"learning_rate": 0.00041683168316831683, |
|
"loss": 1.8796, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"grad_norm": 0.409493625164032, |
|
"learning_rate": 0.00041584158415841583, |
|
"loss": 1.8794, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"grad_norm": 0.5155593752861023, |
|
"learning_rate": 0.00041485148514851483, |
|
"loss": 1.8786, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 6.06, |
|
"grad_norm": 0.46643194556236267, |
|
"learning_rate": 0.00041386138613861383, |
|
"loss": 1.8786, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"grad_norm": 0.4528968930244446, |
|
"learning_rate": 0.0004128712871287129, |
|
"loss": 1.8765, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 0.4780101180076599, |
|
"learning_rate": 0.0004118811881188119, |
|
"loss": 1.8742, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 0.41187387704849243, |
|
"learning_rate": 0.0004108910891089109, |
|
"loss": 1.8754, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 6.09, |
|
"grad_norm": 0.48807311058044434, |
|
"learning_rate": 0.0004099009900990099, |
|
"loss": 1.877, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 0.4205549657344818, |
|
"learning_rate": 0.0004089108910891089, |
|
"loss": 1.8769, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"eval_accuracy": 0.6247575321496361, |
|
"eval_loss": 1.7364323139190674, |
|
"eval_runtime": 1085.7284, |
|
"eval_samples_per_second": 459.949, |
|
"eval_steps_per_second": 2.054, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 6.11, |
|
"grad_norm": 0.5422940850257874, |
|
"learning_rate": 0.0004079207920792079, |
|
"loss": 1.8743, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 6.11, |
|
"grad_norm": 0.4061022400856018, |
|
"learning_rate": 0.0004069306930693069, |
|
"loss": 1.8748, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 0.4819527268409729, |
|
"learning_rate": 0.000405940594059406, |
|
"loss": 1.8734, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 0.4938518702983856, |
|
"learning_rate": 0.000404950495049505, |
|
"loss": 1.8746, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 0.39021047949790955, |
|
"learning_rate": 0.00040396039603960397, |
|
"loss": 1.874, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 6.14, |
|
"grad_norm": 0.3815496265888214, |
|
"learning_rate": 0.000402970297029703, |
|
"loss": 1.8722, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 6.15, |
|
"grad_norm": 0.4226173758506775, |
|
"learning_rate": 0.000401980198019802, |
|
"loss": 1.8716, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"grad_norm": 0.4311840236186981, |
|
"learning_rate": 0.000400990099009901, |
|
"loss": 1.8729, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"grad_norm": 0.37644141912460327, |
|
"learning_rate": 0.0004, |
|
"loss": 1.8697, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 6.16, |
|
"eval_accuracy": 0.6258217191475034, |
|
"eval_loss": 1.7313834428787231, |
|
"eval_runtime": 1087.4157, |
|
"eval_samples_per_second": 459.236, |
|
"eval_steps_per_second": 2.051, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"grad_norm": 0.34123849868774414, |
|
"learning_rate": 0.000399009900990099, |
|
"loss": 1.8709, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"grad_norm": 0.5545117259025574, |
|
"learning_rate": 0.00039801980198019807, |
|
"loss": 1.8729, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 0.44565874338150024, |
|
"learning_rate": 0.00039702970297029707, |
|
"loss": 1.8716, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 0.4301845133304596, |
|
"learning_rate": 0.00039603960396039607, |
|
"loss": 1.866, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 0.46877652406692505, |
|
"learning_rate": 0.00039504950495049506, |
|
"loss": 1.8672, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 6.21, |
|
"grad_norm": 0.535370945930481, |
|
"learning_rate": 0.00039405940594059406, |
|
"loss": 1.8712, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"grad_norm": 0.39393341541290283, |
|
"learning_rate": 0.0003930693069306931, |
|
"loss": 1.8671, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"grad_norm": 0.36698782444000244, |
|
"learning_rate": 0.0003920792079207921, |
|
"loss": 1.8685, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 6.23, |
|
"grad_norm": 0.4744206666946411, |
|
"learning_rate": 0.0003910891089108911, |
|
"loss": 1.8673, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 6.23, |
|
"eval_accuracy": 0.6265433755799307, |
|
"eval_loss": 1.7275755405426025, |
|
"eval_runtime": 1085.596, |
|
"eval_samples_per_second": 460.005, |
|
"eval_steps_per_second": 2.054, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 0.4614261984825134, |
|
"learning_rate": 0.0003900990099009901, |
|
"loss": 1.8631, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 6.24, |
|
"grad_norm": 0.4780130386352539, |
|
"learning_rate": 0.0003891089108910891, |
|
"loss": 1.8658, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.4078359007835388, |
|
"learning_rate": 0.00038811881188118816, |
|
"loss": 1.8644, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"grad_norm": 0.4950817823410034, |
|
"learning_rate": 0.00038712871287128716, |
|
"loss": 1.8631, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 6.27, |
|
"grad_norm": 0.4473728835582733, |
|
"learning_rate": 0.00038613861386138616, |
|
"loss": 1.862, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 6.27, |
|
"grad_norm": 0.41996896266937256, |
|
"learning_rate": 0.00038514851485148515, |
|
"loss": 1.8609, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 0.4896424114704132, |
|
"learning_rate": 0.00038415841584158415, |
|
"loss": 1.8639, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 6.29, |
|
"grad_norm": 0.5288357138633728, |
|
"learning_rate": 0.0003831683168316832, |
|
"loss": 1.8629, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 0.414982408285141, |
|
"learning_rate": 0.0003821782178217822, |
|
"loss": 1.8611, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"eval_accuracy": 0.6280281595524545, |
|
"eval_loss": 1.7198432683944702, |
|
"eval_runtime": 1085.3535, |
|
"eval_samples_per_second": 460.108, |
|
"eval_steps_per_second": 2.055, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 0.45578229427337646, |
|
"learning_rate": 0.0003811881188118812, |
|
"loss": 1.8598, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 6.31, |
|
"grad_norm": 0.4705806374549866, |
|
"learning_rate": 0.0003801980198019802, |
|
"loss": 1.8606, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 0.40844404697418213, |
|
"learning_rate": 0.0003792079207920792, |
|
"loss": 1.8597, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 6.32, |
|
"grad_norm": 0.3853258490562439, |
|
"learning_rate": 0.00037821782178217825, |
|
"loss": 1.8582, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 6.33, |
|
"grad_norm": 0.4357406198978424, |
|
"learning_rate": 0.00037722772277227725, |
|
"loss": 1.8595, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 6.34, |
|
"grad_norm": 0.5262021422386169, |
|
"learning_rate": 0.00037623762376237625, |
|
"loss": 1.858, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 0.5264425873756409, |
|
"learning_rate": 0.00037524752475247524, |
|
"loss": 1.8587, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"grad_norm": 0.45019951462745667, |
|
"learning_rate": 0.00037425742574257424, |
|
"loss": 1.8566, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"grad_norm": 0.41189640760421753, |
|
"learning_rate": 0.0003732673267326733, |
|
"loss": 1.8572, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 6.36, |
|
"eval_accuracy": 0.6290112373666048, |
|
"eval_loss": 1.7162350416183472, |
|
"eval_runtime": 1085.09, |
|
"eval_samples_per_second": 460.22, |
|
"eval_steps_per_second": 2.055, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 0.4982648491859436, |
|
"learning_rate": 0.0003722772277227723, |
|
"loss": 1.8546, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"grad_norm": 0.3992471694946289, |
|
"learning_rate": 0.0003712871287128713, |
|
"loss": 1.8537, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"grad_norm": 0.46979019045829773, |
|
"learning_rate": 0.0003702970297029703, |
|
"loss": 1.8566, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 6.39, |
|
"grad_norm": 0.5312979817390442, |
|
"learning_rate": 0.0003693069306930693, |
|
"loss": 1.8528, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.4439733624458313, |
|
"learning_rate": 0.00036831683168316834, |
|
"loss": 1.855, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.39169180393218994, |
|
"learning_rate": 0.00036732673267326734, |
|
"loss": 1.8528, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 6.41, |
|
"grad_norm": 0.41544708609580994, |
|
"learning_rate": 0.00036633663366336634, |
|
"loss": 1.8508, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"grad_norm": 0.4738489091396332, |
|
"learning_rate": 0.00036534653465346533, |
|
"loss": 1.8515, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 6.43, |
|
"grad_norm": 0.4519752860069275, |
|
"learning_rate": 0.00036435643564356433, |
|
"loss": 1.8507, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 6.43, |
|
"eval_accuracy": 0.6307912179674288, |
|
"eval_loss": 1.707141399383545, |
|
"eval_runtime": 1087.4605, |
|
"eval_samples_per_second": 459.217, |
|
"eval_steps_per_second": 2.051, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 6.43, |
|
"grad_norm": 0.382656991481781, |
|
"learning_rate": 0.0003633663366336634, |
|
"loss": 1.8487, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"grad_norm": 0.4599255919456482, |
|
"learning_rate": 0.0003623762376237624, |
|
"loss": 1.8506, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 6.45, |
|
"grad_norm": 0.4658602476119995, |
|
"learning_rate": 0.0003613861386138614, |
|
"loss": 1.8496, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 0.43937841057777405, |
|
"learning_rate": 0.0003603960396039604, |
|
"loss": 1.8504, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 0.4001791477203369, |
|
"learning_rate": 0.0003594059405940594, |
|
"loss": 1.8479, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 6.47, |
|
"grad_norm": 0.47235003113746643, |
|
"learning_rate": 0.00035841584158415843, |
|
"loss": 1.8462, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 6.48, |
|
"grad_norm": 0.42399758100509644, |
|
"learning_rate": 0.00035742574257425743, |
|
"loss": 1.8483, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 6.48, |
|
"grad_norm": 0.3666219413280487, |
|
"learning_rate": 0.0003564356435643564, |
|
"loss": 1.8462, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 6.49, |
|
"grad_norm": 0.4140106737613678, |
|
"learning_rate": 0.0003554455445544554, |
|
"loss": 1.8447, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 6.49, |
|
"eval_accuracy": 0.6324917716108477, |
|
"eval_loss": 1.6985867023468018, |
|
"eval_runtime": 1087.0314, |
|
"eval_samples_per_second": 459.398, |
|
"eval_steps_per_second": 2.051, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.4904990792274475, |
|
"learning_rate": 0.0003544554455445544, |
|
"loss": 1.8451, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"grad_norm": 0.43653395771980286, |
|
"learning_rate": 0.0003534653465346535, |
|
"loss": 1.8434, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 6.51, |
|
"grad_norm": 0.4815143942832947, |
|
"learning_rate": 0.0003524752475247525, |
|
"loss": 1.8405, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 6.52, |
|
"grad_norm": 0.5665289759635925, |
|
"learning_rate": 0.00035148514851485147, |
|
"loss": 1.8426, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 6.53, |
|
"grad_norm": 0.45166435837745667, |
|
"learning_rate": 0.00035049504950495047, |
|
"loss": 1.8431, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"grad_norm": 0.45748040080070496, |
|
"learning_rate": 0.00034950495049504947, |
|
"loss": 1.8434, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"grad_norm": 0.42432862520217896, |
|
"learning_rate": 0.0003485148514851485, |
|
"loss": 1.8408, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"grad_norm": 0.3683374226093292, |
|
"learning_rate": 0.0003475247524752475, |
|
"loss": 1.8388, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 0.4106079041957855, |
|
"learning_rate": 0.0003465346534653465, |
|
"loss": 1.839, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"eval_accuracy": 0.6338676239837059, |
|
"eval_loss": 1.6945050954818726, |
|
"eval_runtime": 1084.8256, |
|
"eval_samples_per_second": 460.332, |
|
"eval_steps_per_second": 2.056, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 0.46325406432151794, |
|
"learning_rate": 0.0003455445544554455, |
|
"loss": 1.8383, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 6.57, |
|
"grad_norm": 0.4024347960948944, |
|
"learning_rate": 0.0003445544554455445, |
|
"loss": 1.8397, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 6.58, |
|
"grad_norm": 0.5055080652236938, |
|
"learning_rate": 0.0003435643564356436, |
|
"loss": 1.8386, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 6.59, |
|
"grad_norm": 0.38540130853652954, |
|
"learning_rate": 0.0003425742574257426, |
|
"loss": 1.8381, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 6.59, |
|
"grad_norm": 0.42219218611717224, |
|
"learning_rate": 0.0003415841584158416, |
|
"loss": 1.8379, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 0.37353622913360596, |
|
"learning_rate": 0.0003405940594059406, |
|
"loss": 1.8354, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 6.61, |
|
"grad_norm": 0.4756326973438263, |
|
"learning_rate": 0.0003396039603960396, |
|
"loss": 1.8345, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 0.44085556268692017, |
|
"learning_rate": 0.00033861386138613867, |
|
"loss": 1.8355, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 0.4287554919719696, |
|
"learning_rate": 0.00033762376237623766, |
|
"loss": 1.8345, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"eval_accuracy": 0.6351918568609488, |
|
"eval_loss": 1.6874170303344727, |
|
"eval_runtime": 1085.5989, |
|
"eval_samples_per_second": 460.004, |
|
"eval_steps_per_second": 2.054, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 6.63, |
|
"grad_norm": 0.4309207499027252, |
|
"learning_rate": 0.00033663366336633666, |
|
"loss": 1.8336, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 0.41971975564956665, |
|
"learning_rate": 0.00033564356435643566, |
|
"loss": 1.83, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 0.4418039619922638, |
|
"learning_rate": 0.00033465346534653466, |
|
"loss": 1.829, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 6.65, |
|
"grad_norm": 0.44311702251434326, |
|
"learning_rate": 0.0003336633663366337, |
|
"loss": 1.8337, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"grad_norm": 0.46464502811431885, |
|
"learning_rate": 0.0003326732673267327, |
|
"loss": 1.835, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 0.40370500087738037, |
|
"learning_rate": 0.0003316831683168317, |
|
"loss": 1.8304, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 6.67, |
|
"grad_norm": 0.37608906626701355, |
|
"learning_rate": 0.0003306930693069307, |
|
"loss": 1.8312, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"grad_norm": 0.3990706503391266, |
|
"learning_rate": 0.0003297029702970297, |
|
"loss": 1.8313, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 6.69, |
|
"grad_norm": 0.5069761872291565, |
|
"learning_rate": 0.00032871287128712876, |
|
"loss": 1.8292, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 6.69, |
|
"eval_accuracy": 0.6364002620555558, |
|
"eval_loss": 1.6848387718200684, |
|
"eval_runtime": 1085.7505, |
|
"eval_samples_per_second": 459.94, |
|
"eval_steps_per_second": 2.054, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 6.7, |
|
"grad_norm": 0.450114905834198, |
|
"learning_rate": 0.00032772277227722775, |
|
"loss": 1.8279, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 6.7, |
|
"grad_norm": 0.4129829704761505, |
|
"learning_rate": 0.00032673267326732675, |
|
"loss": 1.8282, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 6.71, |
|
"grad_norm": 0.4841521084308624, |
|
"learning_rate": 0.00032574257425742575, |
|
"loss": 1.8282, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 0.5160727500915527, |
|
"learning_rate": 0.00032475247524752475, |
|
"loss": 1.8269, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 0.5048640370368958, |
|
"learning_rate": 0.0003237623762376238, |
|
"loss": 1.8268, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 0.584237813949585, |
|
"learning_rate": 0.0003227722772277228, |
|
"loss": 1.8295, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 6.74, |
|
"grad_norm": 0.43098556995391846, |
|
"learning_rate": 0.0003217821782178218, |
|
"loss": 1.8249, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.3972054421901703, |
|
"learning_rate": 0.0003207920792079208, |
|
"loss": 1.8246, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.3777482211589813, |
|
"learning_rate": 0.0003198019801980198, |
|
"loss": 1.8243, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"eval_accuracy": 0.6379164581645916, |
|
"eval_loss": 1.6772228479385376, |
|
"eval_runtime": 1085.6894, |
|
"eval_samples_per_second": 459.966, |
|
"eval_steps_per_second": 2.054, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 6.76, |
|
"grad_norm": 0.42633891105651855, |
|
"learning_rate": 0.00031881188118811885, |
|
"loss": 1.8221, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 6.77, |
|
"grad_norm": 0.5893245339393616, |
|
"learning_rate": 0.00031782178217821784, |
|
"loss": 1.8243, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"grad_norm": 0.5304878354072571, |
|
"learning_rate": 0.00031683168316831684, |
|
"loss": 1.8242, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 6.78, |
|
"grad_norm": 0.4657045304775238, |
|
"learning_rate": 0.00031584158415841584, |
|
"loss": 1.8242, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 6.79, |
|
"grad_norm": 0.43276962637901306, |
|
"learning_rate": 0.00031485148514851484, |
|
"loss": 1.8225, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 0.40515634417533875, |
|
"learning_rate": 0.0003138613861386139, |
|
"loss": 1.8228, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 6.81, |
|
"grad_norm": 0.40448087453842163, |
|
"learning_rate": 0.0003128712871287129, |
|
"loss": 1.8198, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 6.81, |
|
"grad_norm": 0.4380359351634979, |
|
"learning_rate": 0.0003118811881188119, |
|
"loss": 1.8201, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 0.46539106965065, |
|
"learning_rate": 0.0003108910891089109, |
|
"loss": 1.8217, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"eval_accuracy": 0.6389200973153136, |
|
"eval_loss": 1.6709976196289062, |
|
"eval_runtime": 1085.3788, |
|
"eval_samples_per_second": 460.097, |
|
"eval_steps_per_second": 2.055, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.4772910177707672, |
|
"learning_rate": 0.0003099009900990099, |
|
"loss": 1.8202, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"grad_norm": 0.42799142003059387, |
|
"learning_rate": 0.00030891089108910894, |
|
"loss": 1.819, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 6.84, |
|
"grad_norm": 0.40562522411346436, |
|
"learning_rate": 0.00030792079207920793, |
|
"loss": 1.8208, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 6.85, |
|
"grad_norm": 0.6129370927810669, |
|
"learning_rate": 0.00030693069306930693, |
|
"loss": 1.8158, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 6.86, |
|
"grad_norm": 0.4654337465763092, |
|
"learning_rate": 0.00030594059405940593, |
|
"loss": 1.8175, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 6.86, |
|
"grad_norm": 0.4340899884700775, |
|
"learning_rate": 0.00030495049504950493, |
|
"loss": 1.8156, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 6.87, |
|
"grad_norm": 0.4216438829898834, |
|
"learning_rate": 0.000303960396039604, |
|
"loss": 1.8182, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 0.3995111584663391, |
|
"learning_rate": 0.000302970297029703, |
|
"loss": 1.8175, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 6.89, |
|
"grad_norm": 0.4166571795940399, |
|
"learning_rate": 0.000301980198019802, |
|
"loss": 1.8171, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 6.89, |
|
"eval_accuracy": 0.6397325747268708, |
|
"eval_loss": 1.6698857545852661, |
|
"eval_runtime": 1086.4276, |
|
"eval_samples_per_second": 459.653, |
|
"eval_steps_per_second": 2.053, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 6.89, |
|
"grad_norm": 0.4397519528865814, |
|
"learning_rate": 0.000300990099009901, |
|
"loss": 1.8171, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 6.9, |
|
"grad_norm": 0.4201519787311554, |
|
"learning_rate": 0.0003, |
|
"loss": 1.8166, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"grad_norm": 0.4194183945655823, |
|
"learning_rate": 0.000299009900990099, |
|
"loss": 1.8137, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"grad_norm": 0.4156358540058136, |
|
"learning_rate": 0.000298019801980198, |
|
"loss": 1.8149, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 0.43356573581695557, |
|
"learning_rate": 0.000297029702970297, |
|
"loss": 1.8104, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 6.93, |
|
"grad_norm": 0.4354686141014099, |
|
"learning_rate": 0.000296039603960396, |
|
"loss": 1.8131, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"grad_norm": 0.45513659715652466, |
|
"learning_rate": 0.000295049504950495, |
|
"loss": 1.8115, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 6.94, |
|
"grad_norm": 0.39433979988098145, |
|
"learning_rate": 0.00029405940594059407, |
|
"loss": 1.8106, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 6.95, |
|
"grad_norm": 0.5176064372062683, |
|
"learning_rate": 0.00029306930693069307, |
|
"loss": 1.8153, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 6.95, |
|
"eval_accuracy": 0.640691022778346, |
|
"eval_loss": 1.66335129737854, |
|
"eval_runtime": 1082.6564, |
|
"eval_samples_per_second": 461.254, |
|
"eval_steps_per_second": 2.06, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"grad_norm": 0.5153635740280151, |
|
"learning_rate": 0.00029207920792079207, |
|
"loss": 1.8142, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 6.97, |
|
"grad_norm": 0.37104055285453796, |
|
"learning_rate": 0.00029108910891089107, |
|
"loss": 1.812, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 6.97, |
|
"grad_norm": 0.40426042675971985, |
|
"learning_rate": 0.00029009900990099006, |
|
"loss": 1.8119, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 6.98, |
|
"grad_norm": 0.5108228325843811, |
|
"learning_rate": 0.0002891089108910891, |
|
"loss": 1.8131, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"grad_norm": 0.4702747166156769, |
|
"learning_rate": 0.0002881188118811881, |
|
"loss": 1.812, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"grad_norm": 0.3683488965034485, |
|
"learning_rate": 0.0002871287128712871, |
|
"loss": 1.8081, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.42281776666641235, |
|
"learning_rate": 0.0002861386138613861, |
|
"loss": 1.81, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 0.48128095269203186, |
|
"learning_rate": 0.0002851485148514851, |
|
"loss": 1.81, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"grad_norm": 0.4020933210849762, |
|
"learning_rate": 0.00028415841584158416, |
|
"loss": 1.81, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"eval_accuracy": 0.6416495776980271, |
|
"eval_loss": 1.6598803997039795, |
|
"eval_runtime": 1084.1267, |
|
"eval_samples_per_second": 460.629, |
|
"eval_steps_per_second": 2.057, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"grad_norm": 0.5061802864074707, |
|
"learning_rate": 0.00028316831683168316, |
|
"loss": 1.8091, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 7.03, |
|
"grad_norm": 0.5182695388793945, |
|
"learning_rate": 0.00028217821782178216, |
|
"loss": 1.8101, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 0.45669490098953247, |
|
"learning_rate": 0.0002811881188118812, |
|
"loss": 1.8059, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"grad_norm": 0.5214717984199524, |
|
"learning_rate": 0.0002801980198019802, |
|
"loss": 1.809, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"grad_norm": 0.46073251962661743, |
|
"learning_rate": 0.00027920792079207926, |
|
"loss": 1.8066, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 7.06, |
|
"grad_norm": 0.3924192190170288, |
|
"learning_rate": 0.00027821782178217826, |
|
"loss": 1.8058, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"grad_norm": 0.42634785175323486, |
|
"learning_rate": 0.00027722772277227726, |
|
"loss": 1.8082, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"grad_norm": 0.46675705909729004, |
|
"learning_rate": 0.00027623762376237626, |
|
"loss": 1.8057, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"grad_norm": 0.43609708547592163, |
|
"learning_rate": 0.00027524752475247525, |
|
"loss": 1.8051, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"eval_accuracy": 0.6424563152815425, |
|
"eval_loss": 1.6557390689849854, |
|
"eval_runtime": 1083.0526, |
|
"eval_samples_per_second": 461.086, |
|
"eval_steps_per_second": 2.059, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"grad_norm": 0.35754847526550293, |
|
"learning_rate": 0.0002742574257425743, |
|
"loss": 1.8053, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.38588428497314453, |
|
"learning_rate": 0.0002732673267326733, |
|
"loss": 1.8059, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.4815811514854431, |
|
"learning_rate": 0.0002722772277227723, |
|
"loss": 1.8053, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 7.11, |
|
"grad_norm": 0.4292014241218567, |
|
"learning_rate": 0.0002712871287128713, |
|
"loss": 1.8018, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"grad_norm": 0.4443877339363098, |
|
"learning_rate": 0.0002702970297029703, |
|
"loss": 1.8022, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 7.13, |
|
"grad_norm": 0.4887067675590515, |
|
"learning_rate": 0.00026930693069306935, |
|
"loss": 1.804, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 7.13, |
|
"grad_norm": 0.4577280282974243, |
|
"learning_rate": 0.00026831683168316835, |
|
"loss": 1.8043, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"grad_norm": 0.38725900650024414, |
|
"learning_rate": 0.00026732673267326735, |
|
"loss": 1.8014, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"grad_norm": 0.45041412115097046, |
|
"learning_rate": 0.00026633663366336635, |
|
"loss": 1.8046, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"eval_accuracy": 0.6431962908340371, |
|
"eval_loss": 1.6528569459915161, |
|
"eval_runtime": 1084.5222, |
|
"eval_samples_per_second": 460.461, |
|
"eval_steps_per_second": 2.056, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 7.15, |
|
"grad_norm": 0.532798707485199, |
|
"learning_rate": 0.00026534653465346534, |
|
"loss": 1.8015, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"grad_norm": 0.43234437704086304, |
|
"learning_rate": 0.0002643564356435644, |
|
"loss": 1.8013, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 7.17, |
|
"grad_norm": 0.4301891624927521, |
|
"learning_rate": 0.0002633663366336634, |
|
"loss": 1.8011, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 7.18, |
|
"grad_norm": 0.41938352584838867, |
|
"learning_rate": 0.0002623762376237624, |
|
"loss": 1.8032, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 7.18, |
|
"grad_norm": 0.4415999948978424, |
|
"learning_rate": 0.0002613861386138614, |
|
"loss": 1.8011, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"grad_norm": 0.46587106585502625, |
|
"learning_rate": 0.0002603960396039604, |
|
"loss": 1.8005, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 0.3755381107330322, |
|
"learning_rate": 0.00025940594059405944, |
|
"loss": 1.7996, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"grad_norm": 0.37551912665367126, |
|
"learning_rate": 0.00025841584158415844, |
|
"loss": 1.7981, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"grad_norm": 0.4268946349620819, |
|
"learning_rate": 0.00025742574257425744, |
|
"loss": 1.7997, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 7.21, |
|
"eval_accuracy": 0.6438810244549774, |
|
"eval_loss": 1.6489626169204712, |
|
"eval_runtime": 1085.6377, |
|
"eval_samples_per_second": 459.988, |
|
"eval_steps_per_second": 2.054, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 7.22, |
|
"grad_norm": 0.5793518424034119, |
|
"learning_rate": 0.00025643564356435644, |
|
"loss": 1.8, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 7.23, |
|
"grad_norm": 0.37436190247535706, |
|
"learning_rate": 0.00025544554455445543, |
|
"loss": 1.7974, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"grad_norm": 0.5522225499153137, |
|
"learning_rate": 0.0002544554455445545, |
|
"loss": 1.7968, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 7.24, |
|
"grad_norm": 0.4452868402004242, |
|
"learning_rate": 0.0002534653465346535, |
|
"loss": 1.7997, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.41211819648742676, |
|
"learning_rate": 0.0002524752475247525, |
|
"loss": 1.798, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 0.4052869379520416, |
|
"learning_rate": 0.0002514851485148515, |
|
"loss": 1.7948, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 7.26, |
|
"grad_norm": 0.4514144957065582, |
|
"learning_rate": 0.0002504950495049505, |
|
"loss": 1.7955, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"grad_norm": 0.49351832270622253, |
|
"learning_rate": 0.00024950495049504953, |
|
"loss": 1.7991, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 0.49827703833580017, |
|
"learning_rate": 0.00024851485148514853, |
|
"loss": 1.7994, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"eval_accuracy": 0.6442858204973027, |
|
"eval_loss": 1.6466220617294312, |
|
"eval_runtime": 1086.2584, |
|
"eval_samples_per_second": 459.725, |
|
"eval_steps_per_second": 2.053, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 7.29, |
|
"grad_norm": 0.4449995160102844, |
|
"learning_rate": 0.00024752475247524753, |
|
"loss": 1.7962, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 7.29, |
|
"grad_norm": 0.3683604300022125, |
|
"learning_rate": 0.0002465346534653465, |
|
"loss": 1.7984, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 7.3, |
|
"grad_norm": 0.48126864433288574, |
|
"learning_rate": 0.0002455445544554455, |
|
"loss": 1.7953, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 0.3565351068973541, |
|
"learning_rate": 0.0002445544554455446, |
|
"loss": 1.7914, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"grad_norm": 0.4369056820869446, |
|
"learning_rate": 0.00024356435643564357, |
|
"loss": 1.7949, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 7.32, |
|
"grad_norm": 0.5566734671592712, |
|
"learning_rate": 0.00024257425742574257, |
|
"loss": 1.7961, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 7.33, |
|
"grad_norm": 0.4066598117351532, |
|
"learning_rate": 0.00024158415841584157, |
|
"loss": 1.7962, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 7.34, |
|
"grad_norm": 0.4281260669231415, |
|
"learning_rate": 0.0002405940594059406, |
|
"loss": 1.7943, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 7.34, |
|
"grad_norm": 0.34586983919143677, |
|
"learning_rate": 0.0002396039603960396, |
|
"loss": 1.7934, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 7.34, |
|
"eval_accuracy": 0.6448193534487687, |
|
"eval_loss": 1.644710898399353, |
|
"eval_runtime": 1084.6339, |
|
"eval_samples_per_second": 460.413, |
|
"eval_steps_per_second": 2.056, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 7.35, |
|
"grad_norm": 0.3622014820575714, |
|
"learning_rate": 0.00023861386138613862, |
|
"loss": 1.7912, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"grad_norm": 0.456106960773468, |
|
"learning_rate": 0.00023762376237623762, |
|
"loss": 1.7922, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 0.4329501986503601, |
|
"learning_rate": 0.00023663366336633662, |
|
"loss": 1.7937, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 0.4640803039073944, |
|
"learning_rate": 0.00023564356435643564, |
|
"loss": 1.793, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 7.38, |
|
"grad_norm": 0.39238548278808594, |
|
"learning_rate": 0.00023465346534653464, |
|
"loss": 1.7916, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 7.39, |
|
"grad_norm": 0.43311530351638794, |
|
"learning_rate": 0.0002336633663366337, |
|
"loss": 1.7943, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 0.35872432589530945, |
|
"learning_rate": 0.0002326732673267327, |
|
"loss": 1.7918, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 0.36510738730430603, |
|
"learning_rate": 0.0002316831683168317, |
|
"loss": 1.7907, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 7.41, |
|
"grad_norm": 0.46106651425361633, |
|
"learning_rate": 0.00023069306930693071, |
|
"loss": 1.7917, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 7.41, |
|
"eval_accuracy": 0.6454906168005663, |
|
"eval_loss": 1.641427993774414, |
|
"eval_runtime": 1084.7154, |
|
"eval_samples_per_second": 460.379, |
|
"eval_steps_per_second": 2.056, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"grad_norm": 0.37493211030960083, |
|
"learning_rate": 0.0002297029702970297, |
|
"loss": 1.7921, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 7.42, |
|
"grad_norm": 0.3865686058998108, |
|
"learning_rate": 0.00022871287128712874, |
|
"loss": 1.791, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 7.43, |
|
"grad_norm": 0.3889116942882538, |
|
"learning_rate": 0.00022772277227722774, |
|
"loss": 1.7907, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"grad_norm": 0.6045088768005371, |
|
"learning_rate": 0.00022673267326732673, |
|
"loss": 1.7902, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 7.45, |
|
"grad_norm": 0.4185848832130432, |
|
"learning_rate": 0.00022574257425742576, |
|
"loss": 1.7889, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 7.45, |
|
"grad_norm": 0.3915616571903229, |
|
"learning_rate": 0.00022475247524752476, |
|
"loss": 1.7906, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 0.3687775433063507, |
|
"learning_rate": 0.00022376237623762378, |
|
"loss": 1.7897, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 7.47, |
|
"grad_norm": 0.39049315452575684, |
|
"learning_rate": 0.00022277227722772278, |
|
"loss": 1.789, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"grad_norm": 0.37289920449256897, |
|
"learning_rate": 0.00022178217821782178, |
|
"loss": 1.7887, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"eval_accuracy": 0.645869271741179, |
|
"eval_loss": 1.639427661895752, |
|
"eval_runtime": 1118.4989, |
|
"eval_samples_per_second": 446.473, |
|
"eval_steps_per_second": 1.994, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 7.48, |
|
"grad_norm": 0.45301392674446106, |
|
"learning_rate": 0.0002207920792079208, |
|
"loss": 1.7911, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 7.49, |
|
"grad_norm": 0.42282310128211975, |
|
"learning_rate": 0.0002198019801980198, |
|
"loss": 1.788, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.34825190901756287, |
|
"learning_rate": 0.00021881188118811883, |
|
"loss": 1.7888, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.4688248038291931, |
|
"learning_rate": 0.00021782178217821783, |
|
"loss": 1.7878, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 7.51, |
|
"grad_norm": 0.4295547902584076, |
|
"learning_rate": 0.00021683168316831682, |
|
"loss": 1.7877, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 7.52, |
|
"grad_norm": 0.3433161675930023, |
|
"learning_rate": 0.00021584158415841585, |
|
"loss": 1.7884, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"grad_norm": 0.34638333320617676, |
|
"learning_rate": 0.00021485148514851485, |
|
"loss": 1.7891, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"grad_norm": 0.40277931094169617, |
|
"learning_rate": 0.00021386138613861387, |
|
"loss": 1.7856, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 7.54, |
|
"grad_norm": 0.42518341541290283, |
|
"learning_rate": 0.00021287128712871287, |
|
"loss": 1.7861, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 7.54, |
|
"eval_accuracy": 0.6466240342094058, |
|
"eval_loss": 1.637886881828308, |
|
"eval_runtime": 1094.5797, |
|
"eval_samples_per_second": 456.23, |
|
"eval_steps_per_second": 2.037, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 0.3844136893749237, |
|
"learning_rate": 0.00021188118811881187, |
|
"loss": 1.7862, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"grad_norm": 0.34440210461616516, |
|
"learning_rate": 0.0002108910891089109, |
|
"loss": 1.7864, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 7.56, |
|
"grad_norm": 0.501716136932373, |
|
"learning_rate": 0.0002099009900990099, |
|
"loss": 1.7843, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 7.57, |
|
"grad_norm": 0.3695526421070099, |
|
"learning_rate": 0.00020891089108910892, |
|
"loss": 1.7855, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"grad_norm": 0.38437628746032715, |
|
"learning_rate": 0.00020792079207920792, |
|
"loss": 1.7847, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 7.58, |
|
"grad_norm": 0.4197578430175781, |
|
"learning_rate": 0.00020693069306930691, |
|
"loss": 1.7833, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 7.59, |
|
"grad_norm": 0.39093175530433655, |
|
"learning_rate": 0.00020594059405940594, |
|
"loss": 1.786, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 0.36000731587409973, |
|
"learning_rate": 0.00020495049504950494, |
|
"loss": 1.7872, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"grad_norm": 0.45473846793174744, |
|
"learning_rate": 0.00020396039603960396, |
|
"loss": 1.7853, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"eval_accuracy": 0.6470880404015521, |
|
"eval_loss": 1.6340434551239014, |
|
"eval_runtime": 1091.9524, |
|
"eval_samples_per_second": 457.328, |
|
"eval_steps_per_second": 2.042, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"grad_norm": 0.3739522099494934, |
|
"learning_rate": 0.000202970297029703, |
|
"loss": 1.7824, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 7.62, |
|
"grad_norm": 0.40237516164779663, |
|
"learning_rate": 0.00020198019801980199, |
|
"loss": 1.7837, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 7.63, |
|
"grad_norm": 0.44717445969581604, |
|
"learning_rate": 0.000200990099009901, |
|
"loss": 1.7828, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.4383144676685333, |
|
"learning_rate": 0.0002, |
|
"loss": 1.7841, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.368528813123703, |
|
"learning_rate": 0.00019900990099009903, |
|
"loss": 1.7818, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"grad_norm": 0.40288105607032776, |
|
"learning_rate": 0.00019801980198019803, |
|
"loss": 1.7832, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 7.66, |
|
"grad_norm": 0.3775827884674072, |
|
"learning_rate": 0.00019702970297029703, |
|
"loss": 1.7814, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 7.66, |
|
"grad_norm": 0.4067000448703766, |
|
"learning_rate": 0.00019603960396039606, |
|
"loss": 1.7815, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 7.67, |
|
"grad_norm": 0.45529502630233765, |
|
"learning_rate": 0.00019504950495049505, |
|
"loss": 1.7847, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 7.67, |
|
"eval_accuracy": 0.647348903103301, |
|
"eval_loss": 1.6344057321548462, |
|
"eval_runtime": 1092.9918, |
|
"eval_samples_per_second": 456.893, |
|
"eval_steps_per_second": 2.04, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 7.68, |
|
"grad_norm": 0.4383666217327118, |
|
"learning_rate": 0.00019405940594059408, |
|
"loss": 1.7799, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.41071295738220215, |
|
"learning_rate": 0.00019306930693069308, |
|
"loss": 1.7826, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 7.69, |
|
"grad_norm": 0.4367865324020386, |
|
"learning_rate": 0.00019207920792079208, |
|
"loss": 1.7819, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 7.7, |
|
"grad_norm": 0.3609165549278259, |
|
"learning_rate": 0.0001910891089108911, |
|
"loss": 1.7802, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 7.71, |
|
"grad_norm": 0.39005783200263977, |
|
"learning_rate": 0.0001900990099009901, |
|
"loss": 1.78, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"grad_norm": 0.3287705183029175, |
|
"learning_rate": 0.00018910891089108913, |
|
"loss": 1.7797, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 7.72, |
|
"grad_norm": 0.3266151547431946, |
|
"learning_rate": 0.00018811881188118812, |
|
"loss": 1.7795, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"grad_norm": 0.35796740651130676, |
|
"learning_rate": 0.00018712871287128712, |
|
"loss": 1.7782, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"grad_norm": 0.39394471049308777, |
|
"learning_rate": 0.00018613861386138615, |
|
"loss": 1.7799, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 7.74, |
|
"eval_accuracy": 0.647908722477522, |
|
"eval_loss": 1.6299968957901, |
|
"eval_runtime": 1086.7095, |
|
"eval_samples_per_second": 459.534, |
|
"eval_steps_per_second": 2.052, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.4640734791755676, |
|
"learning_rate": 0.00018514851485148514, |
|
"loss": 1.7794, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.3890862464904785, |
|
"learning_rate": 0.00018415841584158417, |
|
"loss": 1.7794, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 7.76, |
|
"grad_norm": 0.3503568768501282, |
|
"learning_rate": 0.00018316831683168317, |
|
"loss": 1.7776, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 7.77, |
|
"grad_norm": 0.39593997597694397, |
|
"learning_rate": 0.00018217821782178217, |
|
"loss": 1.7798, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 7.77, |
|
"grad_norm": 0.3677063286304474, |
|
"learning_rate": 0.0001811881188118812, |
|
"loss": 1.7769, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 7.78, |
|
"grad_norm": 0.3558836877346039, |
|
"learning_rate": 0.0001801980198019802, |
|
"loss": 1.7803, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"grad_norm": 0.4983728229999542, |
|
"learning_rate": 0.00017920792079207922, |
|
"loss": 1.7774, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 0.3735315203666687, |
|
"learning_rate": 0.0001782178217821782, |
|
"loss": 1.778, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 0.3440467417240143, |
|
"learning_rate": 0.0001772277227722772, |
|
"loss": 1.7773, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"eval_accuracy": 0.6480254558570787, |
|
"eval_loss": 1.6308239698410034, |
|
"eval_runtime": 1084.4491, |
|
"eval_samples_per_second": 460.492, |
|
"eval_steps_per_second": 2.056, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 7.81, |
|
"grad_norm": 0.3812803626060486, |
|
"learning_rate": 0.00017623762376237624, |
|
"loss": 1.7776, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 7.82, |
|
"grad_norm": 0.3772016763687134, |
|
"learning_rate": 0.00017524752475247524, |
|
"loss": 1.7751, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 0.3638882339000702, |
|
"learning_rate": 0.00017425742574257426, |
|
"loss": 1.7794, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 0.43426576256752014, |
|
"learning_rate": 0.00017326732673267326, |
|
"loss": 1.7787, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 7.84, |
|
"grad_norm": 0.41420796513557434, |
|
"learning_rate": 0.00017227722772277226, |
|
"loss": 1.7766, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 0.3958096206188202, |
|
"learning_rate": 0.0001712871287128713, |
|
"loss": 1.7765, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 7.85, |
|
"grad_norm": 0.3829286992549896, |
|
"learning_rate": 0.0001702970297029703, |
|
"loss": 1.7773, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 7.86, |
|
"grad_norm": 0.41278573870658875, |
|
"learning_rate": 0.00016930693069306933, |
|
"loss": 1.7769, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 7.87, |
|
"grad_norm": 0.337071031332016, |
|
"learning_rate": 0.00016831683168316833, |
|
"loss": 1.7766, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 7.87, |
|
"eval_accuracy": 0.6483471740815172, |
|
"eval_loss": 1.6272797584533691, |
|
"eval_runtime": 1083.9003, |
|
"eval_samples_per_second": 460.725, |
|
"eval_steps_per_second": 2.057, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"grad_norm": 0.3459130823612213, |
|
"learning_rate": 0.00016732673267326733, |
|
"loss": 1.7781, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"grad_norm": 0.3339349925518036, |
|
"learning_rate": 0.00016633663366336635, |
|
"loss": 1.7754, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 7.89, |
|
"grad_norm": 0.3156519830226898, |
|
"learning_rate": 0.00016534653465346535, |
|
"loss": 1.776, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 7.9, |
|
"grad_norm": 0.3628999590873718, |
|
"learning_rate": 0.00016435643564356438, |
|
"loss": 1.7744, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"grad_norm": 0.350087434053421, |
|
"learning_rate": 0.00016336633663366338, |
|
"loss": 1.7752, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"grad_norm": 0.40664511919021606, |
|
"learning_rate": 0.00016237623762376237, |
|
"loss": 1.7781, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 0.3364078104496002, |
|
"learning_rate": 0.0001613861386138614, |
|
"loss": 1.776, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 7.93, |
|
"grad_norm": 0.32285594940185547, |
|
"learning_rate": 0.0001603960396039604, |
|
"loss": 1.772, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 7.93, |
|
"grad_norm": 0.39036986231803894, |
|
"learning_rate": 0.00015940594059405942, |
|
"loss": 1.7755, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 7.93, |
|
"eval_accuracy": 0.6488458576263911, |
|
"eval_loss": 1.6268614530563354, |
|
"eval_runtime": 1085.4437, |
|
"eval_samples_per_second": 460.07, |
|
"eval_steps_per_second": 2.054, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 7.94, |
|
"grad_norm": 0.41111525893211365, |
|
"learning_rate": 0.00015841584158415842, |
|
"loss": 1.7747, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 7.95, |
|
"grad_norm": 0.4492338299751282, |
|
"learning_rate": 0.00015742574257425742, |
|
"loss": 1.7753, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 0.3836340606212616, |
|
"learning_rate": 0.00015643564356435644, |
|
"loss": 1.7727, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"grad_norm": 0.33723685145378113, |
|
"learning_rate": 0.00015544554455445544, |
|
"loss": 1.775, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 7.97, |
|
"grad_norm": 0.4088629484176636, |
|
"learning_rate": 0.00015445544554455447, |
|
"loss": 1.7741, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 7.98, |
|
"grad_norm": 0.3302168548107147, |
|
"learning_rate": 0.00015346534653465347, |
|
"loss": 1.7732, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 7.99, |
|
"grad_norm": 0.3605554401874542, |
|
"learning_rate": 0.00015247524752475246, |
|
"loss": 1.7722, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 7.99, |
|
"grad_norm": 0.355826735496521, |
|
"learning_rate": 0.0001514851485148515, |
|
"loss": 1.7715, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.3708418607711792, |
|
"learning_rate": 0.0001504950495049505, |
|
"loss": 1.7721, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.6491409467366662, |
|
"eval_loss": 1.6255041360855103, |
|
"eval_runtime": 1085.4323, |
|
"eval_samples_per_second": 460.075, |
|
"eval_steps_per_second": 2.054, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.3321845233440399, |
|
"learning_rate": 0.0001495049504950495, |
|
"loss": 1.7719, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.3251964747905731, |
|
"learning_rate": 0.0001485148514851485, |
|
"loss": 1.7731, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 8.02, |
|
"grad_norm": 0.4384077787399292, |
|
"learning_rate": 0.0001475247524752475, |
|
"loss": 1.7753, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 8.03, |
|
"grad_norm": 0.3885464370250702, |
|
"learning_rate": 0.00014653465346534653, |
|
"loss": 1.7707, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 8.04, |
|
"grad_norm": 0.42948633432388306, |
|
"learning_rate": 0.00014554455445544553, |
|
"loss": 1.7716, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 8.04, |
|
"grad_norm": 0.33700281381607056, |
|
"learning_rate": 0.00014455445544554456, |
|
"loss": 1.7711, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"grad_norm": 0.37887144088745117, |
|
"learning_rate": 0.00014356435643564356, |
|
"loss": 1.7707, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"grad_norm": 0.2920531630516052, |
|
"learning_rate": 0.00014257425742574255, |
|
"loss": 1.7717, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"grad_norm": 0.41969195008277893, |
|
"learning_rate": 0.00014158415841584158, |
|
"loss": 1.77, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"eval_accuracy": 0.6496409109474853, |
|
"eval_loss": 1.6225236654281616, |
|
"eval_runtime": 1085.9823, |
|
"eval_samples_per_second": 459.842, |
|
"eval_steps_per_second": 2.053, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"grad_norm": 0.3497646152973175, |
|
"learning_rate": 0.0001405940594059406, |
|
"loss": 1.7712, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 8.08, |
|
"grad_norm": 0.38377320766448975, |
|
"learning_rate": 0.00013960396039603963, |
|
"loss": 1.77, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 8.09, |
|
"grad_norm": 0.3549109101295471, |
|
"learning_rate": 0.00013861386138613863, |
|
"loss": 1.7736, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 8.09, |
|
"grad_norm": 0.3375445604324341, |
|
"learning_rate": 0.00013762376237623763, |
|
"loss": 1.7715, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 0.30400267243385315, |
|
"learning_rate": 0.00013663366336633665, |
|
"loss": 1.7708, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 8.11, |
|
"grad_norm": 0.36075320839881897, |
|
"learning_rate": 0.00013564356435643565, |
|
"loss": 1.7706, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.40548309683799744, |
|
"learning_rate": 0.00013465346534653468, |
|
"loss": 1.7677, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.45684516429901123, |
|
"learning_rate": 0.00013366336633663367, |
|
"loss": 1.7682, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 8.13, |
|
"grad_norm": 0.35352224111557007, |
|
"learning_rate": 0.00013267326732673267, |
|
"loss": 1.7708, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 8.13, |
|
"eval_accuracy": 0.6498058850737081, |
|
"eval_loss": 1.6215531826019287, |
|
"eval_runtime": 1090.6922, |
|
"eval_samples_per_second": 457.856, |
|
"eval_steps_per_second": 2.045, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 8.14, |
|
"grad_norm": 0.32690539956092834, |
|
"learning_rate": 0.0001316831683168317, |
|
"loss": 1.7701, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 8.15, |
|
"grad_norm": 0.3275192975997925, |
|
"learning_rate": 0.0001306930693069307, |
|
"loss": 1.7664, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 8.15, |
|
"grad_norm": 0.35621440410614014, |
|
"learning_rate": 0.00012970297029702972, |
|
"loss": 1.7708, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 8.16, |
|
"grad_norm": 0.3187929093837738, |
|
"learning_rate": 0.00012871287128712872, |
|
"loss": 1.7686, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"grad_norm": 0.30944034457206726, |
|
"learning_rate": 0.00012772277227722772, |
|
"loss": 1.7705, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 8.17, |
|
"grad_norm": 0.3147297501564026, |
|
"learning_rate": 0.00012673267326732674, |
|
"loss": 1.7668, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 8.18, |
|
"grad_norm": 0.31606265902519226, |
|
"learning_rate": 0.00012574257425742574, |
|
"loss": 1.7664, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"grad_norm": 0.3430984914302826, |
|
"learning_rate": 0.00012475247524752477, |
|
"loss": 1.7688, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 0.36714431643486023, |
|
"learning_rate": 0.00012376237623762376, |
|
"loss": 1.7686, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"eval_accuracy": 0.6500912899825928, |
|
"eval_loss": 1.6193368434906006, |
|
"eval_runtime": 1084.4098, |
|
"eval_samples_per_second": 460.509, |
|
"eval_steps_per_second": 2.056, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 0.33857467770576477, |
|
"learning_rate": 0.00012277227722772276, |
|
"loss": 1.7706, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"grad_norm": 0.33994871377944946, |
|
"learning_rate": 0.00012178217821782179, |
|
"loss": 1.7657, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 8.22, |
|
"grad_norm": 0.3522297143936157, |
|
"learning_rate": 0.00012079207920792079, |
|
"loss": 1.7671, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"grad_norm": 0.3409149646759033, |
|
"learning_rate": 0.0001198019801980198, |
|
"loss": 1.7683, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 8.23, |
|
"grad_norm": 0.34772610664367676, |
|
"learning_rate": 0.00011881188118811881, |
|
"loss": 1.7694, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 8.24, |
|
"grad_norm": 0.3905799388885498, |
|
"learning_rate": 0.00011782178217821782, |
|
"loss": 1.7676, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.3478334844112396, |
|
"learning_rate": 0.00011683168316831685, |
|
"loss": 1.7656, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 8.26, |
|
"grad_norm": 0.3372560143470764, |
|
"learning_rate": 0.00011584158415841584, |
|
"loss": 1.7662, |
|
"step": 11330 |
|
}, |
|
{ |
|
"epoch": 8.26, |
|
"grad_norm": 0.31676506996154785, |
|
"learning_rate": 0.00011485148514851486, |
|
"loss": 1.7673, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 8.26, |
|
"eval_accuracy": 0.6503277550921033, |
|
"eval_loss": 1.6178277730941772, |
|
"eval_runtime": 1085.1752, |
|
"eval_samples_per_second": 460.184, |
|
"eval_steps_per_second": 2.055, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 8.27, |
|
"grad_norm": 0.31384769082069397, |
|
"learning_rate": 0.00011386138613861387, |
|
"loss": 1.7657, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 0.3267647922039032, |
|
"learning_rate": 0.00011287128712871288, |
|
"loss": 1.766, |
|
"step": 11360 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 0.32966649532318115, |
|
"learning_rate": 0.00011188118811881189, |
|
"loss": 1.7636, |
|
"step": 11370 |
|
}, |
|
{ |
|
"epoch": 8.29, |
|
"grad_norm": 0.34511563181877136, |
|
"learning_rate": 0.00011089108910891089, |
|
"loss": 1.767, |
|
"step": 11380 |
|
}, |
|
{ |
|
"epoch": 8.3, |
|
"grad_norm": 0.3151010274887085, |
|
"learning_rate": 0.0001099009900990099, |
|
"loss": 1.7663, |
|
"step": 11390 |
|
}, |
|
{ |
|
"epoch": 8.31, |
|
"grad_norm": 0.33481037616729736, |
|
"learning_rate": 0.00010891089108910891, |
|
"loss": 1.7687, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 8.31, |
|
"grad_norm": 0.35512518882751465, |
|
"learning_rate": 0.00010792079207920792, |
|
"loss": 1.7681, |
|
"step": 11410 |
|
}, |
|
{ |
|
"epoch": 8.32, |
|
"grad_norm": 0.3472909927368164, |
|
"learning_rate": 0.00010693069306930694, |
|
"loss": 1.7653, |
|
"step": 11420 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"grad_norm": 0.3452986180782318, |
|
"learning_rate": 0.00010594059405940593, |
|
"loss": 1.7666, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 8.33, |
|
"eval_accuracy": 0.6505906465254327, |
|
"eval_loss": 1.6169500350952148, |
|
"eval_runtime": 1083.6278, |
|
"eval_samples_per_second": 460.841, |
|
"eval_steps_per_second": 2.058, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"grad_norm": 0.34459248185157776, |
|
"learning_rate": 0.00010495049504950495, |
|
"loss": 1.7661, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 8.34, |
|
"grad_norm": 0.3042079508304596, |
|
"learning_rate": 0.00010396039603960396, |
|
"loss": 1.761, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 8.35, |
|
"grad_norm": 0.32908689975738525, |
|
"learning_rate": 0.00010297029702970297, |
|
"loss": 1.7657, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"grad_norm": 0.34110862016677856, |
|
"learning_rate": 0.00010198019801980198, |
|
"loss": 1.7644, |
|
"step": 11470 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"grad_norm": 0.2914797365665436, |
|
"learning_rate": 0.00010099009900990099, |
|
"loss": 1.7659, |
|
"step": 11480 |
|
}, |
|
{ |
|
"epoch": 8.37, |
|
"grad_norm": 0.32843562960624695, |
|
"learning_rate": 0.0001, |
|
"loss": 1.7634, |
|
"step": 11490 |
|
}, |
|
{ |
|
"epoch": 8.38, |
|
"grad_norm": 0.3443434536457062, |
|
"learning_rate": 9.900990099009902e-05, |
|
"loss": 1.7647, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"grad_norm": 0.3086354732513428, |
|
"learning_rate": 9.801980198019803e-05, |
|
"loss": 1.7633, |
|
"step": 11510 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"grad_norm": 0.2858017086982727, |
|
"learning_rate": 9.702970297029704e-05, |
|
"loss": 1.7635, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 8.39, |
|
"eval_accuracy": 0.6506746559931541, |
|
"eval_loss": 1.61593496799469, |
|
"eval_runtime": 1085.1265, |
|
"eval_samples_per_second": 460.204, |
|
"eval_steps_per_second": 2.055, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 0.32728368043899536, |
|
"learning_rate": 9.603960396039604e-05, |
|
"loss": 1.7637, |
|
"step": 11530 |
|
}, |
|
{ |
|
"epoch": 8.41, |
|
"grad_norm": 0.3133088946342468, |
|
"learning_rate": 9.504950495049505e-05, |
|
"loss": 1.7629, |
|
"step": 11540 |
|
}, |
|
{ |
|
"epoch": 8.42, |
|
"grad_norm": 0.2798636853694916, |
|
"learning_rate": 9.405940594059406e-05, |
|
"loss": 1.766, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 8.42, |
|
"grad_norm": 0.3476144075393677, |
|
"learning_rate": 9.306930693069307e-05, |
|
"loss": 1.7658, |
|
"step": 11560 |
|
}, |
|
{ |
|
"epoch": 8.43, |
|
"grad_norm": 0.2828819751739502, |
|
"learning_rate": 9.207920792079209e-05, |
|
"loss": 1.7624, |
|
"step": 11570 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.27723389863967896, |
|
"learning_rate": 9.108910891089108e-05, |
|
"loss": 1.7635, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 8.44, |
|
"grad_norm": 0.32631412148475647, |
|
"learning_rate": 9.00990099009901e-05, |
|
"loss": 1.7621, |
|
"step": 11590 |
|
}, |
|
{ |
|
"epoch": 8.45, |
|
"grad_norm": 0.3203299641609192, |
|
"learning_rate": 8.91089108910891e-05, |
|
"loss": 1.7629, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 0.31430014967918396, |
|
"learning_rate": 8.811881188118812e-05, |
|
"loss": 1.7631, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"eval_accuracy": 0.6510713372187361, |
|
"eval_loss": 1.6139030456542969, |
|
"eval_runtime": 1084.8096, |
|
"eval_samples_per_second": 460.339, |
|
"eval_steps_per_second": 2.056, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 8.47, |
|
"grad_norm": 0.302937775850296, |
|
"learning_rate": 8.712871287128713e-05, |
|
"loss": 1.7628, |
|
"step": 11620 |
|
}, |
|
{ |
|
"epoch": 8.47, |
|
"grad_norm": 0.2944415807723999, |
|
"learning_rate": 8.613861386138613e-05, |
|
"loss": 1.7642, |
|
"step": 11630 |
|
}, |
|
{ |
|
"epoch": 8.48, |
|
"grad_norm": 0.3318140506744385, |
|
"learning_rate": 8.514851485148515e-05, |
|
"loss": 1.7626, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"grad_norm": 0.2729699909687042, |
|
"learning_rate": 8.415841584158417e-05, |
|
"loss": 1.7623, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.37606656551361084, |
|
"learning_rate": 8.316831683168318e-05, |
|
"loss": 1.7609, |
|
"step": 11660 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.30239492654800415, |
|
"learning_rate": 8.217821782178219e-05, |
|
"loss": 1.7621, |
|
"step": 11670 |
|
}, |
|
{ |
|
"epoch": 8.51, |
|
"grad_norm": 0.2841242253780365, |
|
"learning_rate": 8.118811881188119e-05, |
|
"loss": 1.7629, |
|
"step": 11680 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"grad_norm": 0.3027147054672241, |
|
"learning_rate": 8.01980198019802e-05, |
|
"loss": 1.7618, |
|
"step": 11690 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"grad_norm": 0.2852645218372345, |
|
"learning_rate": 7.920792079207921e-05, |
|
"loss": 1.7633, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 8.52, |
|
"eval_accuracy": 0.6512795497031492, |
|
"eval_loss": 1.6128230094909668, |
|
"eval_runtime": 1083.3765, |
|
"eval_samples_per_second": 460.948, |
|
"eval_steps_per_second": 2.058, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 8.53, |
|
"grad_norm": 0.30832818150520325, |
|
"learning_rate": 7.821782178217822e-05, |
|
"loss": 1.7606, |
|
"step": 11710 |
|
}, |
|
{ |
|
"epoch": 8.54, |
|
"grad_norm": 0.30394139885902405, |
|
"learning_rate": 7.722772277227723e-05, |
|
"loss": 1.7609, |
|
"step": 11720 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 0.2910577952861786, |
|
"learning_rate": 7.623762376237623e-05, |
|
"loss": 1.7607, |
|
"step": 11730 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 0.27040547132492065, |
|
"learning_rate": 7.524752475247524e-05, |
|
"loss": 1.7607, |
|
"step": 11740 |
|
}, |
|
{ |
|
"epoch": 8.56, |
|
"grad_norm": 0.2972952723503113, |
|
"learning_rate": 7.425742574257426e-05, |
|
"loss": 1.7615, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 8.57, |
|
"grad_norm": 0.26889026165008545, |
|
"learning_rate": 7.326732673267327e-05, |
|
"loss": 1.7631, |
|
"step": 11760 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"grad_norm": 0.3071919083595276, |
|
"learning_rate": 7.227722772277228e-05, |
|
"loss": 1.7572, |
|
"step": 11770 |
|
}, |
|
{ |
|
"epoch": 8.58, |
|
"grad_norm": 0.30390483140945435, |
|
"learning_rate": 7.128712871287128e-05, |
|
"loss": 1.7593, |
|
"step": 11780 |
|
}, |
|
{ |
|
"epoch": 8.59, |
|
"grad_norm": 0.2942393124103546, |
|
"learning_rate": 7.02970297029703e-05, |
|
"loss": 1.7616, |
|
"step": 11790 |
|
}, |
|
{ |
|
"epoch": 8.59, |
|
"eval_accuracy": 0.6516749418564994, |
|
"eval_loss": 1.6112834215164185, |
|
"eval_runtime": 1083.6459, |
|
"eval_samples_per_second": 460.833, |
|
"eval_steps_per_second": 2.058, |
|
"step": 11790 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 0.2823123037815094, |
|
"learning_rate": 6.930693069306931e-05, |
|
"loss": 1.7615, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 0.3058627247810364, |
|
"learning_rate": 6.831683168316833e-05, |
|
"loss": 1.7612, |
|
"step": 11810 |
|
}, |
|
{ |
|
"epoch": 8.61, |
|
"grad_norm": 0.2954027056694031, |
|
"learning_rate": 6.732673267326734e-05, |
|
"loss": 1.7623, |
|
"step": 11820 |
|
}, |
|
{ |
|
"epoch": 8.62, |
|
"grad_norm": 0.32210031151771545, |
|
"learning_rate": 6.633663366336634e-05, |
|
"loss": 1.7607, |
|
"step": 11830 |
|
}, |
|
{ |
|
"epoch": 8.63, |
|
"grad_norm": 0.2638227343559265, |
|
"learning_rate": 6.534653465346535e-05, |
|
"loss": 1.7591, |
|
"step": 11840 |
|
}, |
|
{ |
|
"epoch": 8.63, |
|
"grad_norm": 0.2716045379638672, |
|
"learning_rate": 6.435643564356436e-05, |
|
"loss": 1.7602, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 0.2823101282119751, |
|
"learning_rate": 6.336633663366337e-05, |
|
"loss": 1.7602, |
|
"step": 11860 |
|
}, |
|
{ |
|
"epoch": 8.65, |
|
"grad_norm": 0.26111429929733276, |
|
"learning_rate": 6.237623762376238e-05, |
|
"loss": 1.7608, |
|
"step": 11870 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"grad_norm": 0.28957730531692505, |
|
"learning_rate": 6.138613861386138e-05, |
|
"loss": 1.7602, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"eval_accuracy": 0.6518121780771471, |
|
"eval_loss": 1.610386610031128, |
|
"eval_runtime": 1085.801, |
|
"eval_samples_per_second": 459.919, |
|
"eval_steps_per_second": 2.054, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"grad_norm": 0.31074461340904236, |
|
"learning_rate": 6.039603960396039e-05, |
|
"loss": 1.7599, |
|
"step": 11890 |
|
}, |
|
{ |
|
"epoch": 8.67, |
|
"grad_norm": 0.2974682152271271, |
|
"learning_rate": 5.9405940594059404e-05, |
|
"loss": 1.7604, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 8.68, |
|
"grad_norm": 0.3034124970436096, |
|
"learning_rate": 5.841584158415842e-05, |
|
"loss": 1.7605, |
|
"step": 11910 |
|
}, |
|
{ |
|
"epoch": 8.68, |
|
"grad_norm": 0.28555795550346375, |
|
"learning_rate": 5.742574257425743e-05, |
|
"loss": 1.7568, |
|
"step": 11920 |
|
}, |
|
{ |
|
"epoch": 8.69, |
|
"grad_norm": 0.2668933868408203, |
|
"learning_rate": 5.643564356435644e-05, |
|
"loss": 1.7576, |
|
"step": 11930 |
|
}, |
|
{ |
|
"epoch": 8.7, |
|
"grad_norm": 0.2799495458602905, |
|
"learning_rate": 5.5445544554455445e-05, |
|
"loss": 1.7595, |
|
"step": 11940 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"grad_norm": 0.28266316652297974, |
|
"learning_rate": 5.4455445544554456e-05, |
|
"loss": 1.7602, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 8.71, |
|
"grad_norm": 0.27878817915916443, |
|
"learning_rate": 5.346534653465347e-05, |
|
"loss": 1.7607, |
|
"step": 11960 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"grad_norm": 0.2904963493347168, |
|
"learning_rate": 5.247524752475247e-05, |
|
"loss": 1.7578, |
|
"step": 11970 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"eval_accuracy": 0.6515499792766575, |
|
"eval_loss": 1.6111468076705933, |
|
"eval_runtime": 1090.6667, |
|
"eval_samples_per_second": 457.867, |
|
"eval_steps_per_second": 2.045, |
|
"step": 11970 |
|
}, |
|
{ |
|
"epoch": 8.73, |
|
"grad_norm": 0.289413720369339, |
|
"learning_rate": 5.1485148514851485e-05, |
|
"loss": 1.758, |
|
"step": 11980 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 0.2739205062389374, |
|
"learning_rate": 5.0495049504950497e-05, |
|
"loss": 1.7579, |
|
"step": 11990 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 0.26597511768341064, |
|
"learning_rate": 4.950495049504951e-05, |
|
"loss": 1.7568, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.24635004997253418, |
|
"learning_rate": 4.851485148514852e-05, |
|
"loss": 1.7584, |
|
"step": 12010 |
|
}, |
|
{ |
|
"epoch": 8.76, |
|
"grad_norm": 0.2534136474132538, |
|
"learning_rate": 4.7524752475247525e-05, |
|
"loss": 1.7602, |
|
"step": 12020 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"grad_norm": 0.26007363200187683, |
|
"learning_rate": 4.653465346534654e-05, |
|
"loss": 1.7567, |
|
"step": 12030 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"grad_norm": 0.2807808816432953, |
|
"learning_rate": 4.554455445544554e-05, |
|
"loss": 1.7566, |
|
"step": 12040 |
|
}, |
|
{ |
|
"epoch": 8.78, |
|
"grad_norm": 0.2677513360977173, |
|
"learning_rate": 4.455445544554455e-05, |
|
"loss": 1.7567, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 8.79, |
|
"grad_norm": 0.2691977620124817, |
|
"learning_rate": 4.3564356435643565e-05, |
|
"loss": 1.757, |
|
"step": 12060 |
|
}, |
|
{ |
|
"epoch": 8.79, |
|
"eval_accuracy": 0.6521601327172856, |
|
"eval_loss": 1.60829758644104, |
|
"eval_runtime": 1089.928, |
|
"eval_samples_per_second": 458.177, |
|
"eval_steps_per_second": 2.046, |
|
"step": 12060 |
|
}, |
|
{ |
|
"epoch": 8.79, |
|
"grad_norm": 0.2577356696128845, |
|
"learning_rate": 4.257425742574258e-05, |
|
"loss": 1.7584, |
|
"step": 12070 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 0.2654874324798584, |
|
"learning_rate": 4.158415841584159e-05, |
|
"loss": 1.7571, |
|
"step": 12080 |
|
}, |
|
{ |
|
"epoch": 8.81, |
|
"grad_norm": 0.25344353914260864, |
|
"learning_rate": 4.0594059405940594e-05, |
|
"loss": 1.7581, |
|
"step": 12090 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"grad_norm": 0.25865158438682556, |
|
"learning_rate": 3.9603960396039605e-05, |
|
"loss": 1.7552, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 8.82, |
|
"grad_norm": 0.28875982761383057, |
|
"learning_rate": 3.861386138613862e-05, |
|
"loss": 1.757, |
|
"step": 12110 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"grad_norm": 0.2697414755821228, |
|
"learning_rate": 3.762376237623762e-05, |
|
"loss": 1.7579, |
|
"step": 12120 |
|
}, |
|
{ |
|
"epoch": 8.84, |
|
"grad_norm": 0.2786589562892914, |
|
"learning_rate": 3.6633663366336634e-05, |
|
"loss": 1.7583, |
|
"step": 12130 |
|
}, |
|
{ |
|
"epoch": 8.85, |
|
"grad_norm": 0.258486270904541, |
|
"learning_rate": 3.564356435643564e-05, |
|
"loss": 1.7581, |
|
"step": 12140 |
|
}, |
|
{ |
|
"epoch": 8.85, |
|
"grad_norm": 0.2595365345478058, |
|
"learning_rate": 3.465346534653466e-05, |
|
"loss": 1.757, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 8.85, |
|
"eval_accuracy": 0.652040482066107, |
|
"eval_loss": 1.6086018085479736, |
|
"eval_runtime": 1089.5635, |
|
"eval_samples_per_second": 458.33, |
|
"eval_steps_per_second": 2.047, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 8.86, |
|
"grad_norm": 0.25674012303352356, |
|
"learning_rate": 3.366336633663367e-05, |
|
"loss": 1.7595, |
|
"step": 12160 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"grad_norm": 0.23194921016693115, |
|
"learning_rate": 3.2673267326732674e-05, |
|
"loss": 1.7574, |
|
"step": 12170 |
|
}, |
|
{ |
|
"epoch": 8.87, |
|
"grad_norm": 0.2626875936985016, |
|
"learning_rate": 3.1683168316831686e-05, |
|
"loss": 1.7571, |
|
"step": 12180 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 0.2361476868391037, |
|
"learning_rate": 3.069306930693069e-05, |
|
"loss": 1.7573, |
|
"step": 12190 |
|
}, |
|
{ |
|
"epoch": 8.89, |
|
"grad_norm": 0.2606755793094635, |
|
"learning_rate": 2.9702970297029702e-05, |
|
"loss": 1.7567, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"grad_norm": 0.27499887347221375, |
|
"learning_rate": 2.8712871287128714e-05, |
|
"loss": 1.7579, |
|
"step": 12210 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"grad_norm": 0.24832656979560852, |
|
"learning_rate": 2.7722772277227722e-05, |
|
"loss": 1.7566, |
|
"step": 12220 |
|
}, |
|
{ |
|
"epoch": 8.91, |
|
"grad_norm": 0.24898388981819153, |
|
"learning_rate": 2.6732673267326734e-05, |
|
"loss": 1.7544, |
|
"step": 12230 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 0.24266423285007477, |
|
"learning_rate": 2.5742574257425742e-05, |
|
"loss": 1.7559, |
|
"step": 12240 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"eval_accuracy": 0.6522573824099933, |
|
"eval_loss": 1.6079708337783813, |
|
"eval_runtime": 1089.9176, |
|
"eval_samples_per_second": 458.181, |
|
"eval_steps_per_second": 2.046, |
|
"step": 12240 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"grad_norm": 0.2438860386610031, |
|
"learning_rate": 2.4752475247524754e-05, |
|
"loss": 1.7554, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"grad_norm": 0.22911418974399567, |
|
"learning_rate": 2.3762376237623762e-05, |
|
"loss": 1.7547, |
|
"step": 12260 |
|
}, |
|
{ |
|
"epoch": 8.94, |
|
"grad_norm": 0.2550877034664154, |
|
"learning_rate": 2.277227722772277e-05, |
|
"loss": 1.7567, |
|
"step": 12270 |
|
}, |
|
{ |
|
"epoch": 8.95, |
|
"grad_norm": 0.2409505546092987, |
|
"learning_rate": 2.1782178217821783e-05, |
|
"loss": 1.7556, |
|
"step": 12280 |
|
}, |
|
{ |
|
"epoch": 8.95, |
|
"grad_norm": 0.23632997274398804, |
|
"learning_rate": 2.0792079207920794e-05, |
|
"loss": 1.7573, |
|
"step": 12290 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"grad_norm": 0.22292740643024445, |
|
"learning_rate": 1.9801980198019803e-05, |
|
"loss": 1.757, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 8.97, |
|
"grad_norm": 0.2350420504808426, |
|
"learning_rate": 1.881188118811881e-05, |
|
"loss": 1.756, |
|
"step": 12310 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 0.22938278317451477, |
|
"learning_rate": 1.782178217821782e-05, |
|
"loss": 1.7562, |
|
"step": 12320 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"grad_norm": 0.2246268391609192, |
|
"learning_rate": 1.6831683168316834e-05, |
|
"loss": 1.7556, |
|
"step": 12330 |
|
}, |
|
{ |
|
"epoch": 8.98, |
|
"eval_accuracy": 0.652376308176148, |
|
"eval_loss": 1.6073620319366455, |
|
"eval_runtime": 1088.9818, |
|
"eval_samples_per_second": 458.575, |
|
"eval_steps_per_second": 2.048, |
|
"step": 12330 |
|
}, |
|
{ |
|
"epoch": 8.99, |
|
"grad_norm": 0.22820483148097992, |
|
"learning_rate": 1.5841584158415843e-05, |
|
"loss": 1.7564, |
|
"step": 12340 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.2315167486667633, |
|
"learning_rate": 1.4851485148514851e-05, |
|
"loss": 1.7558, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"grad_norm": 0.21513350307941437, |
|
"learning_rate": 1.3861386138613861e-05, |
|
"loss": 1.757, |
|
"step": 12360 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"grad_norm": 0.21538245677947998, |
|
"learning_rate": 1.2871287128712871e-05, |
|
"loss": 1.7527, |
|
"step": 12370 |
|
}, |
|
{ |
|
"epoch": 9.02, |
|
"grad_norm": 0.22796376049518585, |
|
"learning_rate": 1.1881188118811881e-05, |
|
"loss": 1.7549, |
|
"step": 12380 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 0.21846508979797363, |
|
"learning_rate": 1.0891089108910891e-05, |
|
"loss": 1.7527, |
|
"step": 12390 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 0.2252340316772461, |
|
"learning_rate": 9.900990099009901e-06, |
|
"loss": 1.757, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"grad_norm": 0.22679966688156128, |
|
"learning_rate": 8.91089108910891e-06, |
|
"loss": 1.7547, |
|
"step": 12410 |
|
}, |
|
{ |
|
"epoch": 9.05, |
|
"grad_norm": 0.21749068796634674, |
|
"learning_rate": 7.920792079207921e-06, |
|
"loss": 1.755, |
|
"step": 12420 |
|
}, |
|
{ |
|
"epoch": 9.05, |
|
"eval_accuracy": 0.6525192559694988, |
|
"eval_loss": 1.6068978309631348, |
|
"eval_runtime": 1087.147, |
|
"eval_samples_per_second": 459.349, |
|
"eval_steps_per_second": 2.051, |
|
"step": 12420 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 12500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 90, |
|
"total_flos": 1.28938481325833e+18, |
|
"train_batch_size": 192, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|