|
{ |
|
"best_metric": 0.5749832987785339, |
|
"best_model_checkpoint": "./Melanoma-Classification/checkpoint-3800", |
|
"epoch": 4.0, |
|
"eval_steps": 100, |
|
"global_step": 5068, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.4924156665802002, |
|
"learning_rate": 0.00019960536700868193, |
|
"loss": 1.5595, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.687912106513977, |
|
"learning_rate": 0.00019921073401736386, |
|
"loss": 1.3491, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.1836044788360596, |
|
"learning_rate": 0.0001988161010260458, |
|
"loss": 1.1583, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.7617909908294678, |
|
"learning_rate": 0.00019842146803472773, |
|
"loss": 1.4836, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.7186870574951172, |
|
"learning_rate": 0.00019802683504340965, |
|
"loss": 1.1296, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.4756615161895752, |
|
"learning_rate": 0.00019763220205209155, |
|
"loss": 1.0864, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.723090648651123, |
|
"learning_rate": 0.00019723756906077347, |
|
"loss": 1.1339, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.304826021194458, |
|
"learning_rate": 0.0001968429360694554, |
|
"loss": 1.0937, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.5404345989227295, |
|
"learning_rate": 0.00019644830307813735, |
|
"loss": 1.0765, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.1473004817962646, |
|
"learning_rate": 0.00019605367008681927, |
|
"loss": 0.9779, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_accuracy": 0.6041049930925597, |
|
"eval_loss": 1.1158318519592285, |
|
"eval_runtime": 91.4624, |
|
"eval_samples_per_second": 55.4, |
|
"eval_steps_per_second": 6.932, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.886522054672241, |
|
"learning_rate": 0.0001956590370955012, |
|
"loss": 1.0835, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.163769006729126, |
|
"learning_rate": 0.00019526440410418312, |
|
"loss": 1.1449, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.7149088382720947, |
|
"learning_rate": 0.00019486977111286504, |
|
"loss": 1.115, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.6274876594543457, |
|
"learning_rate": 0.00019447513812154697, |
|
"loss": 1.1516, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.5108281373977661, |
|
"learning_rate": 0.00019408050513022892, |
|
"loss": 1.0328, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.575366497039795, |
|
"learning_rate": 0.00019368587213891084, |
|
"loss": 1.0288, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.9967373609542847, |
|
"learning_rate": 0.00019329123914759276, |
|
"loss": 0.89, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.47580885887146, |
|
"learning_rate": 0.00019289660615627466, |
|
"loss": 1.0238, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 2.0836246013641357, |
|
"learning_rate": 0.00019250197316495658, |
|
"loss": 1.034, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.9168826341629028, |
|
"learning_rate": 0.0001921073401736385, |
|
"loss": 0.9934, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_accuracy": 0.650088809946714, |
|
"eval_loss": 1.0227293968200684, |
|
"eval_runtime": 91.1657, |
|
"eval_samples_per_second": 55.58, |
|
"eval_steps_per_second": 6.954, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.0433690547943115, |
|
"learning_rate": 0.00019171270718232046, |
|
"loss": 0.9561, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.9006619453430176, |
|
"learning_rate": 0.00019131807419100238, |
|
"loss": 1.1087, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.693882942199707, |
|
"learning_rate": 0.0001909234411996843, |
|
"loss": 1.0212, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.44376802444458, |
|
"learning_rate": 0.00019052880820836623, |
|
"loss": 0.9192, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.168645977973938, |
|
"learning_rate": 0.00019013417521704815, |
|
"loss": 0.9394, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.2017979621887207, |
|
"learning_rate": 0.00018973954222573008, |
|
"loss": 1.0321, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 2.930981397628784, |
|
"learning_rate": 0.00018934490923441203, |
|
"loss": 1.019, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.8777360916137695, |
|
"learning_rate": 0.00018895027624309395, |
|
"loss": 1.1093, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.2270009517669678, |
|
"learning_rate": 0.00018855564325177585, |
|
"loss": 0.9979, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.8015016317367554, |
|
"learning_rate": 0.00018816101026045777, |
|
"loss": 0.9562, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_accuracy": 0.6747582395895007, |
|
"eval_loss": 0.9276195168495178, |
|
"eval_runtime": 91.6369, |
|
"eval_samples_per_second": 55.294, |
|
"eval_steps_per_second": 6.919, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.6960203647613525, |
|
"learning_rate": 0.0001877663772691397, |
|
"loss": 0.95, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.5449939966201782, |
|
"learning_rate": 0.00018737174427782162, |
|
"loss": 0.9869, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.7589311599731445, |
|
"learning_rate": 0.00018697711128650357, |
|
"loss": 0.8735, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.1408846378326416, |
|
"learning_rate": 0.0001865824782951855, |
|
"loss": 0.8768, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.8114129304885864, |
|
"learning_rate": 0.00018618784530386741, |
|
"loss": 0.8912, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.1425697803497314, |
|
"learning_rate": 0.00018579321231254934, |
|
"loss": 0.9495, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.0084307193756104, |
|
"learning_rate": 0.00018539857932123126, |
|
"loss": 1.0388, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.8898825645446777, |
|
"learning_rate": 0.00018500394632991319, |
|
"loss": 0.9202, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.13079833984375, |
|
"learning_rate": 0.00018460931333859514, |
|
"loss": 0.9567, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.111664056777954, |
|
"learning_rate": 0.00018421468034727706, |
|
"loss": 1.0995, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_accuracy": 0.6836392342609039, |
|
"eval_loss": 0.9088240265846252, |
|
"eval_runtime": 91.7358, |
|
"eval_samples_per_second": 55.235, |
|
"eval_steps_per_second": 6.911, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.5040955543518066, |
|
"learning_rate": 0.00018382004735595896, |
|
"loss": 0.7407, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.1850649118423462, |
|
"learning_rate": 0.00018342541436464088, |
|
"loss": 0.8685, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.618028163909912, |
|
"learning_rate": 0.0001830307813733228, |
|
"loss": 0.9087, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.015228509902954, |
|
"learning_rate": 0.00018263614838200473, |
|
"loss": 1.0173, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.2800592184066772, |
|
"learning_rate": 0.00018224151539068668, |
|
"loss": 1.1207, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.40296471118927, |
|
"learning_rate": 0.0001818468823993686, |
|
"loss": 0.7813, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.0328774452209473, |
|
"learning_rate": 0.00018145224940805052, |
|
"loss": 0.8776, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.16869854927063, |
|
"learning_rate": 0.00018105761641673245, |
|
"loss": 1.1089, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.867370367050171, |
|
"learning_rate": 0.00018066298342541437, |
|
"loss": 0.9493, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 2.4438893795013428, |
|
"learning_rate": 0.0001802683504340963, |
|
"loss": 0.8198, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_accuracy": 0.6948884941780146, |
|
"eval_loss": 0.8580543398857117, |
|
"eval_runtime": 92.2035, |
|
"eval_samples_per_second": 54.955, |
|
"eval_steps_per_second": 6.876, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.3336217403411865, |
|
"learning_rate": 0.00017987371744277825, |
|
"loss": 0.7372, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 2.482779026031494, |
|
"learning_rate": 0.00017947908445146017, |
|
"loss": 0.8516, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.5005723237991333, |
|
"learning_rate": 0.00017908445146014207, |
|
"loss": 0.7964, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.8133363723754883, |
|
"learning_rate": 0.000178689818468824, |
|
"loss": 0.8267, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.8624382019042969, |
|
"learning_rate": 0.0001782951854775059, |
|
"loss": 0.8205, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.319140911102295, |
|
"learning_rate": 0.00017790055248618784, |
|
"loss": 0.8772, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.922405958175659, |
|
"learning_rate": 0.0001775059194948698, |
|
"loss": 0.9475, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.8725391030311584, |
|
"learning_rate": 0.0001771112865035517, |
|
"loss": 0.8155, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.170085906982422, |
|
"learning_rate": 0.00017671665351223363, |
|
"loss": 1.0075, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.645709276199341, |
|
"learning_rate": 0.00017632202052091556, |
|
"loss": 0.8034, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_accuracy": 0.6966646931122953, |
|
"eval_loss": 0.8444026112556458, |
|
"eval_runtime": 91.3762, |
|
"eval_samples_per_second": 55.452, |
|
"eval_steps_per_second": 6.938, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.4042195081710815, |
|
"learning_rate": 0.00017592738752959748, |
|
"loss": 0.8112, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.9543468952178955, |
|
"learning_rate": 0.0001755327545382794, |
|
"loss": 0.8895, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.4769212007522583, |
|
"learning_rate": 0.00017513812154696135, |
|
"loss": 0.8816, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 1.843929648399353, |
|
"learning_rate": 0.00017474348855564325, |
|
"loss": 0.834, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.1005053520202637, |
|
"learning_rate": 0.00017434885556432517, |
|
"loss": 0.8459, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.8301466703414917, |
|
"learning_rate": 0.0001739542225730071, |
|
"loss": 0.745, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.7254234552383423, |
|
"learning_rate": 0.00017355958958168902, |
|
"loss": 0.7985, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.2055907249450684, |
|
"learning_rate": 0.00017316495659037095, |
|
"loss": 0.768, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.17266845703125, |
|
"learning_rate": 0.0001727703235990529, |
|
"loss": 0.8608, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.5750426054000854, |
|
"learning_rate": 0.00017237569060773482, |
|
"loss": 0.8319, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_accuracy": 0.7148213933293862, |
|
"eval_loss": 0.819585919380188, |
|
"eval_runtime": 91.97, |
|
"eval_samples_per_second": 55.094, |
|
"eval_steps_per_second": 6.894, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.9275529384613037, |
|
"learning_rate": 0.00017198105761641674, |
|
"loss": 0.8694, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.029745101928711, |
|
"learning_rate": 0.00017158642462509867, |
|
"loss": 0.814, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.5681614875793457, |
|
"learning_rate": 0.0001711917916337806, |
|
"loss": 0.8117, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.8536593914031982, |
|
"learning_rate": 0.00017079715864246251, |
|
"loss": 0.8844, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.9550455808639526, |
|
"learning_rate": 0.00017040252565114446, |
|
"loss": 0.8574, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.0112359523773193, |
|
"learning_rate": 0.00017000789265982636, |
|
"loss": 0.7197, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.2003233432769775, |
|
"learning_rate": 0.00016961325966850828, |
|
"loss": 0.8009, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 2.5492188930511475, |
|
"learning_rate": 0.0001692186266771902, |
|
"loss": 0.7504, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.8917009830474854, |
|
"learning_rate": 0.00016882399368587213, |
|
"loss": 0.7638, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.2614909410476685, |
|
"learning_rate": 0.00016842936069455408, |
|
"loss": 0.787, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"eval_accuracy": 0.6974541148608644, |
|
"eval_loss": 0.8360146880149841, |
|
"eval_runtime": 91.0831, |
|
"eval_samples_per_second": 55.63, |
|
"eval_steps_per_second": 6.961, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.555303692817688, |
|
"learning_rate": 0.000168034727703236, |
|
"loss": 0.9073, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.3937816619873047, |
|
"learning_rate": 0.00016764009471191793, |
|
"loss": 0.7977, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.7314428091049194, |
|
"learning_rate": 0.00016724546172059985, |
|
"loss": 0.8166, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.7047677040100098, |
|
"learning_rate": 0.00016685082872928178, |
|
"loss": 0.9118, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 2.1908257007598877, |
|
"learning_rate": 0.0001664561957379637, |
|
"loss": 0.7528, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.966177463531494, |
|
"learning_rate": 0.00016606156274664565, |
|
"loss": 0.8306, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.368105411529541, |
|
"learning_rate": 0.00016566692975532755, |
|
"loss": 0.8289, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.197537660598755, |
|
"learning_rate": 0.00016527229676400947, |
|
"loss": 0.9994, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.863797187805176, |
|
"learning_rate": 0.0001648776637726914, |
|
"loss": 0.852, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.704500198364258, |
|
"learning_rate": 0.00016448303078137332, |
|
"loss": 0.8642, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_accuracy": 0.7008091572922834, |
|
"eval_loss": 0.8250275254249573, |
|
"eval_runtime": 90.6591, |
|
"eval_samples_per_second": 55.891, |
|
"eval_steps_per_second": 6.993, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.438966751098633, |
|
"learning_rate": 0.00016408839779005524, |
|
"loss": 0.7703, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.4163198471069336, |
|
"learning_rate": 0.0001636937647987372, |
|
"loss": 0.7739, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.480591297149658, |
|
"learning_rate": 0.00016329913180741912, |
|
"loss": 0.8631, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.24957537651062, |
|
"learning_rate": 0.00016290449881610104, |
|
"loss": 0.8008, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.1198952198028564, |
|
"learning_rate": 0.00016250986582478296, |
|
"loss": 0.8948, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.7699896097183228, |
|
"learning_rate": 0.00016211523283346489, |
|
"loss": 0.8871, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.3637826442718506, |
|
"learning_rate": 0.0001617205998421468, |
|
"loss": 0.7904, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.0125231742858887, |
|
"learning_rate": 0.00016132596685082876, |
|
"loss": 0.6222, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.767691731452942, |
|
"learning_rate": 0.00016093133385951066, |
|
"loss": 0.8987, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.2865104675292969, |
|
"learning_rate": 0.00016053670086819258, |
|
"loss": 0.8329, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_accuracy": 0.7171896585750938, |
|
"eval_loss": 0.79388028383255, |
|
"eval_runtime": 90.2373, |
|
"eval_samples_per_second": 56.152, |
|
"eval_steps_per_second": 7.026, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.2386913299560547, |
|
"learning_rate": 0.0001601420678768745, |
|
"loss": 0.7724, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.5864025354385376, |
|
"learning_rate": 0.00015974743488555643, |
|
"loss": 0.7434, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.7106010913848877, |
|
"learning_rate": 0.00015935280189423835, |
|
"loss": 0.6527, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.2001113891601562, |
|
"learning_rate": 0.0001589581689029203, |
|
"loss": 0.7301, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 1.8353298902511597, |
|
"learning_rate": 0.00015856353591160222, |
|
"loss": 0.7831, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.228792190551758, |
|
"learning_rate": 0.00015816890292028415, |
|
"loss": 0.7633, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.594202756881714, |
|
"learning_rate": 0.00015777426992896607, |
|
"loss": 0.8804, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 2.274918794631958, |
|
"learning_rate": 0.000157379636937648, |
|
"loss": 0.9008, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 2.0492002964019775, |
|
"learning_rate": 0.00015698500394632992, |
|
"loss": 0.6768, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 2.415807008743286, |
|
"learning_rate": 0.00015659037095501187, |
|
"loss": 0.9678, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_accuracy": 0.7331754489836195, |
|
"eval_loss": 0.7660978436470032, |
|
"eval_runtime": 92.2043, |
|
"eval_samples_per_second": 54.954, |
|
"eval_steps_per_second": 6.876, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.9822345972061157, |
|
"learning_rate": 0.00015619573796369377, |
|
"loss": 0.7861, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.869279384613037, |
|
"learning_rate": 0.0001558011049723757, |
|
"loss": 0.7472, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.2551605701446533, |
|
"learning_rate": 0.0001554064719810576, |
|
"loss": 0.6913, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 2.4671690464019775, |
|
"learning_rate": 0.00015501183898973954, |
|
"loss": 0.6751, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.8815134763717651, |
|
"learning_rate": 0.00015461720599842146, |
|
"loss": 0.6414, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.746532917022705, |
|
"learning_rate": 0.0001542225730071034, |
|
"loss": 0.849, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.844710111618042, |
|
"learning_rate": 0.00015382794001578533, |
|
"loss": 0.8739, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 2.393015146255493, |
|
"learning_rate": 0.00015343330702446726, |
|
"loss": 0.7106, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.9429343938827515, |
|
"learning_rate": 0.00015303867403314918, |
|
"loss": 0.7832, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.263422727584839, |
|
"learning_rate": 0.0001526440410418311, |
|
"loss": 0.8226, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_accuracy": 0.7373199131636077, |
|
"eval_loss": 0.7284455895423889, |
|
"eval_runtime": 91.1355, |
|
"eval_samples_per_second": 55.599, |
|
"eval_steps_per_second": 6.957, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.1170480251312256, |
|
"learning_rate": 0.00015224940805051303, |
|
"loss": 0.8428, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 2.143359422683716, |
|
"learning_rate": 0.00015185477505919495, |
|
"loss": 0.8429, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.9562945365905762, |
|
"learning_rate": 0.00015146014206787688, |
|
"loss": 0.7666, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.7012232542037964, |
|
"learning_rate": 0.0001510655090765588, |
|
"loss": 0.7828, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.563348412513733, |
|
"learning_rate": 0.00015067087608524072, |
|
"loss": 0.8311, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.0692856311798096, |
|
"learning_rate": 0.00015027624309392265, |
|
"loss": 0.7133, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.52514910697937, |
|
"learning_rate": 0.00014988161010260457, |
|
"loss": 0.8335, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 1.8508520126342773, |
|
"learning_rate": 0.00014948697711128652, |
|
"loss": 0.5996, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 1.858475685119629, |
|
"learning_rate": 0.00014909234411996844, |
|
"loss": 0.6434, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 4.348726272583008, |
|
"learning_rate": 0.00014869771112865037, |
|
"loss": 0.7967, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"eval_accuracy": 0.7410696664693113, |
|
"eval_loss": 0.7354866862297058, |
|
"eval_runtime": 91.4806, |
|
"eval_samples_per_second": 55.389, |
|
"eval_steps_per_second": 6.93, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.194589614868164, |
|
"learning_rate": 0.0001483030781373323, |
|
"loss": 0.5958, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 1.5000673532485962, |
|
"learning_rate": 0.00014790844514601421, |
|
"loss": 0.7041, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 1.6893486976623535, |
|
"learning_rate": 0.00014751381215469614, |
|
"loss": 0.6521, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 2.065706968307495, |
|
"learning_rate": 0.00014711917916337806, |
|
"loss": 0.6359, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.6649951934814453, |
|
"learning_rate": 0.00014672454617205998, |
|
"loss": 0.7392, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 2.844940185546875, |
|
"learning_rate": 0.0001463299131807419, |
|
"loss": 0.6207, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 2.581749677658081, |
|
"learning_rate": 0.00014593528018942383, |
|
"loss": 0.551, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 1.7631639242172241, |
|
"learning_rate": 0.00014554064719810576, |
|
"loss": 0.6514, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 4.243679523468018, |
|
"learning_rate": 0.00014514601420678768, |
|
"loss": 0.5341, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 3.421487808227539, |
|
"learning_rate": 0.00014475138121546963, |
|
"loss": 0.6531, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"eval_accuracy": 0.7246891651865008, |
|
"eval_loss": 0.7560569047927856, |
|
"eval_runtime": 91.2074, |
|
"eval_samples_per_second": 55.555, |
|
"eval_steps_per_second": 6.951, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 4.670535564422607, |
|
"learning_rate": 0.00014435674822415155, |
|
"loss": 0.6982, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 3.2248194217681885, |
|
"learning_rate": 0.00014396211523283348, |
|
"loss": 0.5345, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 3.1267943382263184, |
|
"learning_rate": 0.0001435674822415154, |
|
"loss": 0.7239, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 3.158299684524536, |
|
"learning_rate": 0.00014317284925019732, |
|
"loss": 0.6854, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 2.1028189659118652, |
|
"learning_rate": 0.00014277821625887925, |
|
"loss": 0.6367, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 2.811735153198242, |
|
"learning_rate": 0.00014238358326756117, |
|
"loss": 0.6858, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.150059461593628, |
|
"learning_rate": 0.0001419889502762431, |
|
"loss": 0.7487, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 2.536576509475708, |
|
"learning_rate": 0.00014159431728492502, |
|
"loss": 0.5976, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.9393413066864014, |
|
"learning_rate": 0.00014119968429360694, |
|
"loss": 0.5959, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 1.6796705722808838, |
|
"learning_rate": 0.00014080505130228886, |
|
"loss": 0.5719, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"eval_accuracy": 0.7637655417406749, |
|
"eval_loss": 0.6839306354522705, |
|
"eval_runtime": 91.2672, |
|
"eval_samples_per_second": 55.518, |
|
"eval_steps_per_second": 6.947, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.542813301086426, |
|
"learning_rate": 0.00014041041831097082, |
|
"loss": 0.5247, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.9640020132064819, |
|
"learning_rate": 0.00014001578531965274, |
|
"loss": 0.4843, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 2.0161819458007812, |
|
"learning_rate": 0.00013962115232833466, |
|
"loss": 0.676, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 2.028648853302002, |
|
"learning_rate": 0.00013922651933701659, |
|
"loss": 0.5935, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 1.2919349670410156, |
|
"learning_rate": 0.0001388318863456985, |
|
"loss": 0.5941, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.8597910404205322, |
|
"learning_rate": 0.00013843725335438043, |
|
"loss": 0.6447, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.6460645198822021, |
|
"learning_rate": 0.00013804262036306236, |
|
"loss": 0.5576, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.1643693447113037, |
|
"learning_rate": 0.00013764798737174428, |
|
"loss": 0.6632, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.991893768310547, |
|
"learning_rate": 0.0001372533543804262, |
|
"loss": 0.6406, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 3.314627170562744, |
|
"learning_rate": 0.00013685872138910813, |
|
"loss": 0.6123, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"eval_accuracy": 0.7584369449378331, |
|
"eval_loss": 0.6857106685638428, |
|
"eval_runtime": 91.5334, |
|
"eval_samples_per_second": 55.357, |
|
"eval_steps_per_second": 6.926, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 1.986220121383667, |
|
"learning_rate": 0.00013646408839779005, |
|
"loss": 0.7161, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 4.052389144897461, |
|
"learning_rate": 0.00013606945540647197, |
|
"loss": 0.7073, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 2.59963321685791, |
|
"learning_rate": 0.00013567482241515392, |
|
"loss": 0.6871, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 3.5070436000823975, |
|
"learning_rate": 0.00013528018942383585, |
|
"loss": 0.6279, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 2.9049060344696045, |
|
"learning_rate": 0.00013488555643251777, |
|
"loss": 0.5972, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 1.6428894996643066, |
|
"learning_rate": 0.0001344909234411997, |
|
"loss": 0.5128, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.659148693084717, |
|
"learning_rate": 0.00013409629044988162, |
|
"loss": 0.7127, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 1.5462294816970825, |
|
"learning_rate": 0.00013370165745856354, |
|
"loss": 0.6444, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.8382019996643066, |
|
"learning_rate": 0.00013330702446724547, |
|
"loss": 0.5236, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 2.1057918071746826, |
|
"learning_rate": 0.0001329123914759274, |
|
"loss": 0.6504, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"eval_accuracy": 0.7531083481349912, |
|
"eval_loss": 0.6970384120941162, |
|
"eval_runtime": 92.1039, |
|
"eval_samples_per_second": 55.014, |
|
"eval_steps_per_second": 6.884, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 1.670453429222107, |
|
"learning_rate": 0.0001325177584846093, |
|
"loss": 0.5204, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 3.400521755218506, |
|
"learning_rate": 0.00013212312549329124, |
|
"loss": 0.5707, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.378593683242798, |
|
"learning_rate": 0.00013172849250197316, |
|
"loss": 0.6149, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 4.090423583984375, |
|
"learning_rate": 0.00013133385951065508, |
|
"loss": 0.7014, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 1.9423013925552368, |
|
"learning_rate": 0.00013093922651933703, |
|
"loss": 0.4657, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 2.23858380317688, |
|
"learning_rate": 0.00013054459352801896, |
|
"loss": 0.5976, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 3.8164286613464355, |
|
"learning_rate": 0.00013014996053670088, |
|
"loss": 0.5735, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 3.1099419593811035, |
|
"learning_rate": 0.0001297553275453828, |
|
"loss": 0.5255, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 3.818204879760742, |
|
"learning_rate": 0.00012936069455406473, |
|
"loss": 0.6007, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 3.2938928604125977, |
|
"learning_rate": 0.00012896606156274665, |
|
"loss": 0.6214, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"eval_accuracy": 0.7576475231892639, |
|
"eval_loss": 0.6841119527816772, |
|
"eval_runtime": 91.5596, |
|
"eval_samples_per_second": 55.341, |
|
"eval_steps_per_second": 6.924, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 3.2717528343200684, |
|
"learning_rate": 0.00012857142857142858, |
|
"loss": 0.6118, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 4.299696445465088, |
|
"learning_rate": 0.0001281767955801105, |
|
"loss": 0.5825, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 3.3908324241638184, |
|
"learning_rate": 0.00012778216258879242, |
|
"loss": 0.6013, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 1.867518424987793, |
|
"learning_rate": 0.00012738752959747435, |
|
"loss": 0.4792, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 4.130500793457031, |
|
"learning_rate": 0.00012699289660615627, |
|
"loss": 0.624, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 1.728039026260376, |
|
"learning_rate": 0.0001265982636148382, |
|
"loss": 0.534, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 3.357947826385498, |
|
"learning_rate": 0.00012620363062352014, |
|
"loss": 0.6773, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.5584005117416382, |
|
"learning_rate": 0.00012580899763220207, |
|
"loss": 0.634, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.4084281921386719, |
|
"learning_rate": 0.000125414364640884, |
|
"loss": 0.5695, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 3.2529728412628174, |
|
"learning_rate": 0.00012501973164956591, |
|
"loss": 0.4925, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_accuracy": 0.7641602526149596, |
|
"eval_loss": 0.6623676419258118, |
|
"eval_runtime": 91.2699, |
|
"eval_samples_per_second": 55.517, |
|
"eval_steps_per_second": 6.946, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.2992420196533203, |
|
"learning_rate": 0.00012462509865824784, |
|
"loss": 0.554, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.778937339782715, |
|
"learning_rate": 0.00012423046566692976, |
|
"loss": 0.6068, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.7722057104110718, |
|
"learning_rate": 0.00012383583267561169, |
|
"loss": 0.6889, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 1.0860401391983032, |
|
"learning_rate": 0.0001234411996842936, |
|
"loss": 0.5685, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 1.8181560039520264, |
|
"learning_rate": 0.00012304656669297553, |
|
"loss": 0.4731, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.5842840671539307, |
|
"learning_rate": 0.00012265193370165746, |
|
"loss": 0.6794, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.0809130668640137, |
|
"learning_rate": 0.00012225730071033938, |
|
"loss": 0.602, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 4.095306396484375, |
|
"learning_rate": 0.0001218626677190213, |
|
"loss": 0.5275, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.1503665447235107, |
|
"learning_rate": 0.00012146803472770325, |
|
"loss": 0.5502, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 1.8762742280960083, |
|
"learning_rate": 0.00012107340173638518, |
|
"loss": 0.5797, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"eval_accuracy": 0.7708703374777975, |
|
"eval_loss": 0.6286503076553345, |
|
"eval_runtime": 91.5869, |
|
"eval_samples_per_second": 55.325, |
|
"eval_steps_per_second": 6.922, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.8651785850524902, |
|
"learning_rate": 0.0001206787687450671, |
|
"loss": 0.4879, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 1.7065809965133667, |
|
"learning_rate": 0.00012028413575374901, |
|
"loss": 0.4858, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.897512435913086, |
|
"learning_rate": 0.00011988950276243093, |
|
"loss": 0.5983, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.6783411502838135, |
|
"learning_rate": 0.00011949486977111286, |
|
"loss": 0.475, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 3.3606057167053223, |
|
"learning_rate": 0.00011910023677979481, |
|
"loss": 0.6417, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.4237632751464844, |
|
"learning_rate": 0.00011870560378847673, |
|
"loss": 0.543, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.7784054279327393, |
|
"learning_rate": 0.00011831097079715866, |
|
"loss": 0.6355, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.962653398513794, |
|
"learning_rate": 0.00011791633780584057, |
|
"loss": 0.6289, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.9475271701812744, |
|
"learning_rate": 0.00011752170481452249, |
|
"loss": 0.4379, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.1250455379486084, |
|
"learning_rate": 0.00011712707182320441, |
|
"loss": 0.6018, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"eval_accuracy": 0.7621866982435366, |
|
"eval_loss": 0.6536552309989929, |
|
"eval_runtime": 91.6935, |
|
"eval_samples_per_second": 55.26, |
|
"eval_steps_per_second": 6.914, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 3.269709348678589, |
|
"learning_rate": 0.00011673243883188636, |
|
"loss": 0.6077, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.2141342163085938, |
|
"learning_rate": 0.00011633780584056829, |
|
"loss": 0.543, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.65494966506958, |
|
"learning_rate": 0.00011594317284925021, |
|
"loss": 0.5, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.4164280891418457, |
|
"learning_rate": 0.00011554853985793212, |
|
"loss": 0.5186, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 1.6607398986816406, |
|
"learning_rate": 0.00011515390686661404, |
|
"loss": 0.5087, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 1.4734584093093872, |
|
"learning_rate": 0.00011475927387529597, |
|
"loss": 0.5642, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.652514934539795, |
|
"learning_rate": 0.00011436464088397792, |
|
"loss": 0.5276, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 1.7451727390289307, |
|
"learning_rate": 0.00011397000789265984, |
|
"loss": 0.4694, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.4469006061553955, |
|
"learning_rate": 0.00011357537490134175, |
|
"loss": 0.611, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.6196885108947754, |
|
"learning_rate": 0.00011318074191002367, |
|
"loss": 0.6334, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"eval_accuracy": 0.7712650483520821, |
|
"eval_loss": 0.6413267254829407, |
|
"eval_runtime": 91.8747, |
|
"eval_samples_per_second": 55.151, |
|
"eval_steps_per_second": 6.901, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 3.294318437576294, |
|
"learning_rate": 0.0001127861089187056, |
|
"loss": 0.5822, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 3.1869616508483887, |
|
"learning_rate": 0.00011239147592738752, |
|
"loss": 0.5093, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.102219820022583, |
|
"learning_rate": 0.00011199684293606947, |
|
"loss": 0.6136, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.6941049098968506, |
|
"learning_rate": 0.0001116022099447514, |
|
"loss": 0.5639, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.5528244972229004, |
|
"learning_rate": 0.0001112075769534333, |
|
"loss": 0.5713, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.0887537002563477, |
|
"learning_rate": 0.00011081294396211523, |
|
"loss": 0.3819, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.9082605242729187, |
|
"learning_rate": 0.00011041831097079715, |
|
"loss": 0.6017, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 3.7689263820648193, |
|
"learning_rate": 0.0001100236779794791, |
|
"loss": 0.6816, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 1.2326387166976929, |
|
"learning_rate": 0.00010962904498816103, |
|
"loss": 0.4845, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.5216434001922607, |
|
"learning_rate": 0.00010923441199684295, |
|
"loss": 0.4111, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"eval_accuracy": 0.7785671995263469, |
|
"eval_loss": 0.6241939663887024, |
|
"eval_runtime": 92.1094, |
|
"eval_samples_per_second": 55.011, |
|
"eval_steps_per_second": 6.883, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 3.051318407058716, |
|
"learning_rate": 0.00010883977900552486, |
|
"loss": 0.4867, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.2874298095703125, |
|
"learning_rate": 0.00010844514601420678, |
|
"loss": 0.594, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.954172134399414, |
|
"learning_rate": 0.00010805051302288871, |
|
"loss": 0.4688, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.589738607406616, |
|
"learning_rate": 0.00010765588003157066, |
|
"loss": 0.5692, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.662523031234741, |
|
"learning_rate": 0.00010726124704025258, |
|
"loss": 0.6033, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 1.6577619314193726, |
|
"learning_rate": 0.0001068666140489345, |
|
"loss": 0.4772, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.9956750869750977, |
|
"learning_rate": 0.00010647198105761642, |
|
"loss": 0.4724, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.4675168991088867, |
|
"learning_rate": 0.00010607734806629834, |
|
"loss": 0.5699, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 3.3173062801361084, |
|
"learning_rate": 0.00010568271507498026, |
|
"loss": 0.7105, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.4665205478668213, |
|
"learning_rate": 0.00010528808208366221, |
|
"loss": 0.4779, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"eval_accuracy": 0.7789619104006316, |
|
"eval_loss": 0.6259533166885376, |
|
"eval_runtime": 92.0715, |
|
"eval_samples_per_second": 55.033, |
|
"eval_steps_per_second": 6.886, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.5439658164978027, |
|
"learning_rate": 0.00010489344909234414, |
|
"loss": 0.5352, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.340139627456665, |
|
"learning_rate": 0.00010449881610102606, |
|
"loss": 0.5646, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.9097132682800293, |
|
"learning_rate": 0.00010410418310970797, |
|
"loss": 0.6391, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 4.203121662139893, |
|
"learning_rate": 0.0001037095501183899, |
|
"loss": 0.4706, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 4.017784595489502, |
|
"learning_rate": 0.00010331491712707182, |
|
"loss": 0.514, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.4876112937927246, |
|
"learning_rate": 0.00010292028413575377, |
|
"loss": 0.4246, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 1.8711539506912231, |
|
"learning_rate": 0.00010252565114443569, |
|
"loss": 0.4967, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.180922508239746, |
|
"learning_rate": 0.0001021310181531176, |
|
"loss": 0.5203, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.5203304290771484, |
|
"learning_rate": 0.00010173638516179952, |
|
"loss": 0.5806, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.462113857269287, |
|
"learning_rate": 0.00010134175217048145, |
|
"loss": 0.5488, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"eval_accuracy": 0.7807381093349122, |
|
"eval_loss": 0.6145808696746826, |
|
"eval_runtime": 92.3214, |
|
"eval_samples_per_second": 54.884, |
|
"eval_steps_per_second": 6.867, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.9260380268096924, |
|
"learning_rate": 0.00010094711917916337, |
|
"loss": 0.5797, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 2.5453708171844482, |
|
"learning_rate": 0.00010055248618784532, |
|
"loss": 0.5223, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.0558793544769287, |
|
"learning_rate": 0.00010015785319652725, |
|
"loss": 0.3824, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.0377509593963623, |
|
"learning_rate": 9.976322020520916e-05, |
|
"loss": 0.4497, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.023796319961548, |
|
"learning_rate": 9.936858721389108e-05, |
|
"loss": 0.3679, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 1.737573504447937, |
|
"learning_rate": 9.8973954222573e-05, |
|
"loss": 0.2755, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 1.711060881614685, |
|
"learning_rate": 9.857932123125494e-05, |
|
"loss": 0.2844, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 1.0967369079589844, |
|
"learning_rate": 9.818468823993686e-05, |
|
"loss": 0.3734, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 4.402666091918945, |
|
"learning_rate": 9.779005524861879e-05, |
|
"loss": 0.3492, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.23064565658569336, |
|
"learning_rate": 9.739542225730071e-05, |
|
"loss": 0.3212, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"eval_accuracy": 0.7706729820406553, |
|
"eval_loss": 0.697548508644104, |
|
"eval_runtime": 93.0724, |
|
"eval_samples_per_second": 54.441, |
|
"eval_steps_per_second": 6.812, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.1419265270233154, |
|
"learning_rate": 9.700078926598263e-05, |
|
"loss": 0.3762, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 2.5868382453918457, |
|
"learning_rate": 9.660615627466457e-05, |
|
"loss": 0.3973, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.9295076131820679, |
|
"learning_rate": 9.62115232833465e-05, |
|
"loss": 0.3264, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.8431601524353027, |
|
"learning_rate": 9.581689029202842e-05, |
|
"loss": 0.2892, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 2.848076105117798, |
|
"learning_rate": 9.542225730071036e-05, |
|
"loss": 0.3393, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.3948113918304443, |
|
"learning_rate": 9.502762430939227e-05, |
|
"loss": 0.3074, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 7.114987850189209, |
|
"learning_rate": 9.463299131807419e-05, |
|
"loss": 0.3326, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 3.4275946617126465, |
|
"learning_rate": 9.423835832675613e-05, |
|
"loss": 0.3915, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.183617353439331, |
|
"learning_rate": 9.384372533543805e-05, |
|
"loss": 0.3841, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 2.8311221599578857, |
|
"learning_rate": 9.344909234411997e-05, |
|
"loss": 0.4282, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"eval_accuracy": 0.7789619104006316, |
|
"eval_loss": 0.634354293346405, |
|
"eval_runtime": 93.2533, |
|
"eval_samples_per_second": 54.336, |
|
"eval_steps_per_second": 6.799, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 2.839000701904297, |
|
"learning_rate": 9.305445935280191e-05, |
|
"loss": 0.2869, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 1.4117017984390259, |
|
"learning_rate": 9.265982636148382e-05, |
|
"loss": 0.35, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 6.646859645843506, |
|
"learning_rate": 9.226519337016574e-05, |
|
"loss": 0.1956, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.4397947788238525, |
|
"learning_rate": 9.187056037884768e-05, |
|
"loss": 0.3768, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 1.0888396501541138, |
|
"learning_rate": 9.14759273875296e-05, |
|
"loss": 0.2825, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 1.1045578718185425, |
|
"learning_rate": 9.108129439621153e-05, |
|
"loss": 0.1746, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 1.6195358037948608, |
|
"learning_rate": 9.068666140489345e-05, |
|
"loss": 0.2362, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 2.6458139419555664, |
|
"learning_rate": 9.029202841357538e-05, |
|
"loss": 0.2583, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 2.6736512184143066, |
|
"learning_rate": 8.98973954222573e-05, |
|
"loss": 0.4217, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 2.508667230606079, |
|
"learning_rate": 8.950276243093924e-05, |
|
"loss": 0.2822, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"eval_accuracy": 0.7844878626406158, |
|
"eval_loss": 0.6984525322914124, |
|
"eval_runtime": 93.5792, |
|
"eval_samples_per_second": 54.147, |
|
"eval_steps_per_second": 6.775, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 1.4989869594573975, |
|
"learning_rate": 8.910812943962116e-05, |
|
"loss": 0.3101, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.30821582674980164, |
|
"learning_rate": 8.871349644830308e-05, |
|
"loss": 0.2419, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 5.179461479187012, |
|
"learning_rate": 8.8318863456985e-05, |
|
"loss": 0.4989, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 3.597341775894165, |
|
"learning_rate": 8.792423046566693e-05, |
|
"loss": 0.3628, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.0267224311828613, |
|
"learning_rate": 8.752959747434885e-05, |
|
"loss": 0.2395, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 4.339228630065918, |
|
"learning_rate": 8.713496448303079e-05, |
|
"loss": 0.385, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 2.7694778442382812, |
|
"learning_rate": 8.674033149171271e-05, |
|
"loss": 0.3129, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 3.7710163593292236, |
|
"learning_rate": 8.634569850039464e-05, |
|
"loss": 0.3818, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 3.384129047393799, |
|
"learning_rate": 8.595106550907656e-05, |
|
"loss": 0.2772, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 1.9485032558441162, |
|
"learning_rate": 8.555643251775848e-05, |
|
"loss": 0.3003, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"eval_accuracy": 0.7992895204262878, |
|
"eval_loss": 0.5954017639160156, |
|
"eval_runtime": 90.8299, |
|
"eval_samples_per_second": 55.786, |
|
"eval_steps_per_second": 6.98, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.648350715637207, |
|
"learning_rate": 8.516179952644041e-05, |
|
"loss": 0.3255, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 2.1421117782592773, |
|
"learning_rate": 8.476716653512235e-05, |
|
"loss": 0.3721, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 3.3461079597473145, |
|
"learning_rate": 8.437253354380427e-05, |
|
"loss": 0.3533, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 2.0906074047088623, |
|
"learning_rate": 8.397790055248619e-05, |
|
"loss": 0.2805, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 2.1087684631347656, |
|
"learning_rate": 8.358326756116812e-05, |
|
"loss": 0.2588, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 4.184020042419434, |
|
"learning_rate": 8.318863456985004e-05, |
|
"loss": 0.3646, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.7979826927185059, |
|
"learning_rate": 8.279400157853196e-05, |
|
"loss": 0.2358, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 3.317765235900879, |
|
"learning_rate": 8.23993685872139e-05, |
|
"loss": 0.3134, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 4.421618938446045, |
|
"learning_rate": 8.200473559589582e-05, |
|
"loss": 0.3644, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 4.3369879722595215, |
|
"learning_rate": 8.161010260457775e-05, |
|
"loss": 0.2982, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"eval_accuracy": 0.7939609236234458, |
|
"eval_loss": 0.6156049966812134, |
|
"eval_runtime": 90.2416, |
|
"eval_samples_per_second": 56.149, |
|
"eval_steps_per_second": 7.026, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 3.2008187770843506, |
|
"learning_rate": 8.121546961325967e-05, |
|
"loss": 0.3314, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 2.550990343093872, |
|
"learning_rate": 8.08208366219416e-05, |
|
"loss": 0.2522, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.7245291471481323, |
|
"learning_rate": 8.042620363062352e-05, |
|
"loss": 0.325, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 3.1475908756256104, |
|
"learning_rate": 8.003157063930545e-05, |
|
"loss": 0.3212, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 1.6802873611450195, |
|
"learning_rate": 7.963693764798738e-05, |
|
"loss": 0.2484, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 3.3150057792663574, |
|
"learning_rate": 7.92423046566693e-05, |
|
"loss": 0.3014, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 1.040041208267212, |
|
"learning_rate": 7.884767166535123e-05, |
|
"loss": 0.324, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 1.5080398321151733, |
|
"learning_rate": 7.845303867403315e-05, |
|
"loss": 0.2253, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.1280837059020996, |
|
"learning_rate": 7.805840568271507e-05, |
|
"loss": 0.2373, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 4.508062839508057, |
|
"learning_rate": 7.766377269139701e-05, |
|
"loss": 0.2628, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"eval_accuracy": 0.7963291888691534, |
|
"eval_loss": 0.6317601203918457, |
|
"eval_runtime": 91.5531, |
|
"eval_samples_per_second": 55.345, |
|
"eval_steps_per_second": 6.925, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 1.481145977973938, |
|
"learning_rate": 7.726913970007893e-05, |
|
"loss": 0.3059, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 1.0478160381317139, |
|
"learning_rate": 7.687450670876086e-05, |
|
"loss": 0.2453, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 3.6211369037628174, |
|
"learning_rate": 7.647987371744278e-05, |
|
"loss": 0.293, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.617844343185425, |
|
"learning_rate": 7.60852407261247e-05, |
|
"loss": 0.2081, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 2.3399300575256348, |
|
"learning_rate": 7.569060773480663e-05, |
|
"loss": 0.239, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 5.697124481201172, |
|
"learning_rate": 7.529597474348856e-05, |
|
"loss": 0.2893, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 3.405291795730591, |
|
"learning_rate": 7.490134175217049e-05, |
|
"loss": 0.3245, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 3.2485592365264893, |
|
"learning_rate": 7.450670876085241e-05, |
|
"loss": 0.4137, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 1.0946143865585327, |
|
"learning_rate": 7.411207576953433e-05, |
|
"loss": 0.1969, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.3789944052696228, |
|
"learning_rate": 7.371744277821626e-05, |
|
"loss": 0.2987, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"eval_accuracy": 0.8030392737319914, |
|
"eval_loss": 0.6494620442390442, |
|
"eval_runtime": 92.4486, |
|
"eval_samples_per_second": 54.809, |
|
"eval_steps_per_second": 6.858, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 3.4358811378479004, |
|
"learning_rate": 7.332280978689818e-05, |
|
"loss": 0.2757, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 3.544767141342163, |
|
"learning_rate": 7.292817679558012e-05, |
|
"loss": 0.2126, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 2.8036537170410156, |
|
"learning_rate": 7.253354380426204e-05, |
|
"loss": 0.2455, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 2.1251418590545654, |
|
"learning_rate": 7.213891081294397e-05, |
|
"loss": 0.2136, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 4.44752311706543, |
|
"learning_rate": 7.174427782162589e-05, |
|
"loss": 0.3819, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.8341973423957825, |
|
"learning_rate": 7.134964483030781e-05, |
|
"loss": 0.2532, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 3.4972732067108154, |
|
"learning_rate": 7.095501183898974e-05, |
|
"loss": 0.2948, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 3.1307532787323, |
|
"learning_rate": 7.056037884767167e-05, |
|
"loss": 0.2987, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 2.040160894393921, |
|
"learning_rate": 7.01657458563536e-05, |
|
"loss": 0.2939, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.5896205902099609, |
|
"learning_rate": 6.977111286503552e-05, |
|
"loss": 0.2714, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"eval_accuracy": 0.8052101835405565, |
|
"eval_loss": 0.6017640829086304, |
|
"eval_runtime": 92.2458, |
|
"eval_samples_per_second": 54.929, |
|
"eval_steps_per_second": 6.873, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 4.136446475982666, |
|
"learning_rate": 6.937647987371744e-05, |
|
"loss": 0.2848, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.1288126707077026, |
|
"learning_rate": 6.898184688239937e-05, |
|
"loss": 0.3115, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 1.7702901363372803, |
|
"learning_rate": 6.858721389108129e-05, |
|
"loss": 0.2547, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 2.859607696533203, |
|
"learning_rate": 6.819258089976323e-05, |
|
"loss": 0.2793, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 5.98909854888916, |
|
"learning_rate": 6.779794790844515e-05, |
|
"loss": 0.2718, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 4.067326545715332, |
|
"learning_rate": 6.740331491712708e-05, |
|
"loss": 0.3174, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 5.376368522644043, |
|
"learning_rate": 6.7008681925809e-05, |
|
"loss": 0.3494, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.6737453937530518, |
|
"learning_rate": 6.661404893449092e-05, |
|
"loss": 0.293, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.012742042541504, |
|
"learning_rate": 6.621941594317286e-05, |
|
"loss": 0.2291, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 4.339968204498291, |
|
"learning_rate": 6.582478295185478e-05, |
|
"loss": 0.3059, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"eval_accuracy": 0.8077758042234063, |
|
"eval_loss": 0.5943508148193359, |
|
"eval_runtime": 92.7623, |
|
"eval_samples_per_second": 54.624, |
|
"eval_steps_per_second": 6.835, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.6194040775299072, |
|
"learning_rate": 6.543014996053671e-05, |
|
"loss": 0.3704, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 4.174539089202881, |
|
"learning_rate": 6.503551696921863e-05, |
|
"loss": 0.3716, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 4.487131118774414, |
|
"learning_rate": 6.464088397790055e-05, |
|
"loss": 0.3054, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 3.870863437652588, |
|
"learning_rate": 6.424625098658248e-05, |
|
"loss": 0.2811, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 4.740366458892822, |
|
"learning_rate": 6.385161799526441e-05, |
|
"loss": 0.2143, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 2.337819814682007, |
|
"learning_rate": 6.345698500394634e-05, |
|
"loss": 0.4391, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 6.155264854431152, |
|
"learning_rate": 6.306235201262826e-05, |
|
"loss": 0.3491, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.4382271766662598, |
|
"learning_rate": 6.266771902131019e-05, |
|
"loss": 0.2899, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.5207793712615967, |
|
"learning_rate": 6.227308602999211e-05, |
|
"loss": 0.2522, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 2.243215322494507, |
|
"learning_rate": 6.187845303867403e-05, |
|
"loss": 0.2762, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"eval_accuracy": 0.7935662127491613, |
|
"eval_loss": 0.6296067237854004, |
|
"eval_runtime": 92.1328, |
|
"eval_samples_per_second": 54.997, |
|
"eval_steps_per_second": 6.881, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.194077968597412, |
|
"learning_rate": 6.148382004735597e-05, |
|
"loss": 0.2279, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 2.3025989532470703, |
|
"learning_rate": 6.108918705603789e-05, |
|
"loss": 0.3366, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 3.9970479011535645, |
|
"learning_rate": 6.069455406471981e-05, |
|
"loss": 0.2698, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 3.0827438831329346, |
|
"learning_rate": 6.029992107340175e-05, |
|
"loss": 0.2195, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.7207252979278564, |
|
"learning_rate": 5.9905288082083663e-05, |
|
"loss": 0.2549, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.6517016887664795, |
|
"learning_rate": 5.951065509076559e-05, |
|
"loss": 0.2294, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 1.5923064947128296, |
|
"learning_rate": 5.9116022099447524e-05, |
|
"loss": 0.2465, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 4.049023151397705, |
|
"learning_rate": 5.872138910812944e-05, |
|
"loss": 0.2712, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.7295699119567871, |
|
"learning_rate": 5.8326756116811364e-05, |
|
"loss": 0.3309, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 3.531484365463257, |
|
"learning_rate": 5.79321231254933e-05, |
|
"loss": 0.3685, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"eval_accuracy": 0.8016577856719953, |
|
"eval_loss": 0.6276537179946899, |
|
"eval_runtime": 92.0744, |
|
"eval_samples_per_second": 55.032, |
|
"eval_steps_per_second": 6.886, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 2.7010326385498047, |
|
"learning_rate": 5.753749013417522e-05, |
|
"loss": 0.1898, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 2.8486061096191406, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.2965, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 5.980003833770752, |
|
"learning_rate": 5.674822415153908e-05, |
|
"loss": 0.297, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.3644309639930725, |
|
"learning_rate": 5.6353591160220996e-05, |
|
"loss": 0.2627, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 4.6136064529418945, |
|
"learning_rate": 5.595895816890292e-05, |
|
"loss": 0.3327, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 1.8576825857162476, |
|
"learning_rate": 5.556432517758485e-05, |
|
"loss": 0.2064, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 3.2825279235839844, |
|
"learning_rate": 5.516969218626677e-05, |
|
"loss": 0.2582, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 4.109126567840576, |
|
"learning_rate": 5.4775059194948696e-05, |
|
"loss": 0.2412, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.9957514405250549, |
|
"learning_rate": 5.438042620363063e-05, |
|
"loss": 0.1818, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.8618041276931763, |
|
"learning_rate": 5.398579321231255e-05, |
|
"loss": 0.2299, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"eval_accuracy": 0.8125123347148214, |
|
"eval_loss": 0.5834479928016663, |
|
"eval_runtime": 91.926, |
|
"eval_samples_per_second": 55.12, |
|
"eval_steps_per_second": 6.897, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 3.655374526977539, |
|
"learning_rate": 5.3591160220994474e-05, |
|
"loss": 0.3183, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 2.711798906326294, |
|
"learning_rate": 5.3196527229676404e-05, |
|
"loss": 0.3336, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 4.209230422973633, |
|
"learning_rate": 5.280189423835833e-05, |
|
"loss": 0.2944, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 1.0983171463012695, |
|
"learning_rate": 5.240726124704025e-05, |
|
"loss": 0.2846, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.4771692752838135, |
|
"learning_rate": 5.201262825572218e-05, |
|
"loss": 0.3192, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 2.7510647773742676, |
|
"learning_rate": 5.1617995264404105e-05, |
|
"loss": 0.1813, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 3.2778220176696777, |
|
"learning_rate": 5.122336227308603e-05, |
|
"loss": 0.1935, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.6012797355651855, |
|
"learning_rate": 5.082872928176796e-05, |
|
"loss": 0.2111, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 2.29512357711792, |
|
"learning_rate": 5.043409629044988e-05, |
|
"loss": 0.2987, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 5.790306568145752, |
|
"learning_rate": 5.0039463299131806e-05, |
|
"loss": 0.3414, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.8166567988948096, |
|
"eval_loss": 0.5749832987785339, |
|
"eval_runtime": 92.3841, |
|
"eval_samples_per_second": 54.847, |
|
"eval_steps_per_second": 6.863, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 4.339576721191406, |
|
"learning_rate": 4.9644830307813736e-05, |
|
"loss": 0.1144, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.7903636693954468, |
|
"learning_rate": 4.925019731649566e-05, |
|
"loss": 0.0767, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 2.8627429008483887, |
|
"learning_rate": 4.885556432517759e-05, |
|
"loss": 0.0859, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 4.783523082733154, |
|
"learning_rate": 4.8460931333859514e-05, |
|
"loss": 0.1071, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.7647588849067688, |
|
"learning_rate": 4.806629834254144e-05, |
|
"loss": 0.1531, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 2.228163719177246, |
|
"learning_rate": 4.767166535122337e-05, |
|
"loss": 0.1582, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.918712854385376, |
|
"learning_rate": 4.727703235990529e-05, |
|
"loss": 0.1197, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 4.140162467956543, |
|
"learning_rate": 4.6882399368587215e-05, |
|
"loss": 0.0733, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 2.8325231075286865, |
|
"learning_rate": 4.6487766377269145e-05, |
|
"loss": 0.1651, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.49401429295539856, |
|
"learning_rate": 4.609313338595106e-05, |
|
"loss": 0.1082, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"eval_accuracy": 0.819617130451944, |
|
"eval_loss": 0.6201203465461731, |
|
"eval_runtime": 92.3164, |
|
"eval_samples_per_second": 54.887, |
|
"eval_steps_per_second": 6.868, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.34822145104408264, |
|
"learning_rate": 4.569850039463299e-05, |
|
"loss": 0.0457, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 1.0006756782531738, |
|
"learning_rate": 4.530386740331492e-05, |
|
"loss": 0.1194, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.7993775010108948, |
|
"learning_rate": 4.4909234411996846e-05, |
|
"loss": 0.1325, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 6.907358646392822, |
|
"learning_rate": 4.451460142067877e-05, |
|
"loss": 0.0605, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.2168715000152588, |
|
"learning_rate": 4.41199684293607e-05, |
|
"loss": 0.0968, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.48231208324432373, |
|
"learning_rate": 4.372533543804262e-05, |
|
"loss": 0.0868, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.09531491994857788, |
|
"learning_rate": 4.333070244672455e-05, |
|
"loss": 0.1161, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 3.662717342376709, |
|
"learning_rate": 4.293606945540648e-05, |
|
"loss": 0.1188, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 8.440380096435547, |
|
"learning_rate": 4.25414364640884e-05, |
|
"loss": 0.1319, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 2.019827127456665, |
|
"learning_rate": 4.2146803472770324e-05, |
|
"loss": 0.049, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"eval_accuracy": 0.8160647325833826, |
|
"eval_loss": 0.6474657654762268, |
|
"eval_runtime": 92.4783, |
|
"eval_samples_per_second": 54.791, |
|
"eval_steps_per_second": 6.856, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 4.42202615737915, |
|
"learning_rate": 4.1752170481452254e-05, |
|
"loss": 0.0794, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 4.54102087020874, |
|
"learning_rate": 4.135753749013418e-05, |
|
"loss": 0.1152, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.31936466693878174, |
|
"learning_rate": 4.09629044988161e-05, |
|
"loss": 0.0945, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.12120810896158218, |
|
"learning_rate": 4.056827150749803e-05, |
|
"loss": 0.1196, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.3144413232803345, |
|
"learning_rate": 4.0173638516179955e-05, |
|
"loss": 0.1226, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 5.312731742858887, |
|
"learning_rate": 3.977900552486188e-05, |
|
"loss": 0.1807, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 0.05327579751610756, |
|
"learning_rate": 3.938437253354381e-05, |
|
"loss": 0.1521, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 1.8903406858444214, |
|
"learning_rate": 3.898973954222573e-05, |
|
"loss": 0.1712, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.07272280752658844, |
|
"learning_rate": 3.8595106550907656e-05, |
|
"loss": 0.0731, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.41295164823532104, |
|
"learning_rate": 3.8200473559589587e-05, |
|
"loss": 0.102, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"eval_accuracy": 0.8097493585948293, |
|
"eval_loss": 0.6791070103645325, |
|
"eval_runtime": 92.1285, |
|
"eval_samples_per_second": 54.999, |
|
"eval_steps_per_second": 6.882, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 5.161396026611328, |
|
"learning_rate": 3.780584056827151e-05, |
|
"loss": 0.0824, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.2593359053134918, |
|
"learning_rate": 3.7411207576953434e-05, |
|
"loss": 0.1567, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.5720149278640747, |
|
"learning_rate": 3.7016574585635364e-05, |
|
"loss": 0.1422, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 6.186444282531738, |
|
"learning_rate": 3.662194159431729e-05, |
|
"loss": 0.0811, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.9182838201522827, |
|
"learning_rate": 3.622730860299921e-05, |
|
"loss": 0.1454, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 3.1142055988311768, |
|
"learning_rate": 3.583267561168114e-05, |
|
"loss": 0.1304, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.17880991101264954, |
|
"learning_rate": 3.5438042620363065e-05, |
|
"loss": 0.0791, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 3.2896628379821777, |
|
"learning_rate": 3.504340962904499e-05, |
|
"loss": 0.0903, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 3.1752073764801025, |
|
"learning_rate": 3.464877663772691e-05, |
|
"loss": 0.0988, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.282556414604187, |
|
"learning_rate": 3.425414364640884e-05, |
|
"loss": 0.0483, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"eval_accuracy": 0.8215906848233668, |
|
"eval_loss": 0.6581845283508301, |
|
"eval_runtime": 92.2168, |
|
"eval_samples_per_second": 54.947, |
|
"eval_steps_per_second": 6.875, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 5.438603401184082, |
|
"learning_rate": 3.3859510655090766e-05, |
|
"loss": 0.084, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.6808891296386719, |
|
"learning_rate": 3.346487766377269e-05, |
|
"loss": 0.1193, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.4623468220233917, |
|
"learning_rate": 3.307024467245462e-05, |
|
"loss": 0.0575, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.5891864895820618, |
|
"learning_rate": 3.267561168113654e-05, |
|
"loss": 0.0353, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 3.8541829586029053, |
|
"learning_rate": 3.2280978689818467e-05, |
|
"loss": 0.1207, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.4876788854598999, |
|
"learning_rate": 3.18863456985004e-05, |
|
"loss": 0.0386, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.7822741270065308, |
|
"learning_rate": 3.149171270718232e-05, |
|
"loss": 0.0753, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.6429558396339417, |
|
"learning_rate": 3.1097079715864244e-05, |
|
"loss": 0.0799, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 6.067298889160156, |
|
"learning_rate": 3.0702446724546174e-05, |
|
"loss": 0.1465, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 3.5951452255249023, |
|
"learning_rate": 3.03078137332281e-05, |
|
"loss": 0.1204, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"eval_accuracy": 0.8221827511347938, |
|
"eval_loss": 0.6602674722671509, |
|
"eval_runtime": 91.9479, |
|
"eval_samples_per_second": 55.107, |
|
"eval_steps_per_second": 6.895, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.12355990707874298, |
|
"learning_rate": 2.9913180741910025e-05, |
|
"loss": 0.0873, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 1.412264347076416, |
|
"learning_rate": 2.951854775059195e-05, |
|
"loss": 0.0222, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.25506913661956787, |
|
"learning_rate": 2.912391475927388e-05, |
|
"loss": 0.049, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 1.5517828464508057, |
|
"learning_rate": 2.8729281767955802e-05, |
|
"loss": 0.0323, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 9.105844497680664, |
|
"learning_rate": 2.833464877663773e-05, |
|
"loss": 0.1014, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.06447356194257736, |
|
"learning_rate": 2.7940015785319656e-05, |
|
"loss": 0.049, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 6.344263076782227, |
|
"learning_rate": 2.754538279400158e-05, |
|
"loss": 0.0841, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.45002487301826477, |
|
"learning_rate": 2.7150749802683506e-05, |
|
"loss": 0.1012, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.10588052868843079, |
|
"learning_rate": 2.6756116811365433e-05, |
|
"loss": 0.0076, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 3.3592453002929688, |
|
"learning_rate": 2.6361483820047357e-05, |
|
"loss": 0.0611, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"eval_accuracy": 0.8190250641405171, |
|
"eval_loss": 0.717377781867981, |
|
"eval_runtime": 91.8985, |
|
"eval_samples_per_second": 55.137, |
|
"eval_steps_per_second": 6.899, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.4130847156047821, |
|
"learning_rate": 2.5966850828729284e-05, |
|
"loss": 0.1135, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 4.881414890289307, |
|
"learning_rate": 2.557221783741121e-05, |
|
"loss": 0.1324, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 2.830336332321167, |
|
"learning_rate": 2.5177584846093134e-05, |
|
"loss": 0.0686, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.41728538274765015, |
|
"learning_rate": 2.478295185477506e-05, |
|
"loss": 0.0649, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 2.339663505554199, |
|
"learning_rate": 2.4388318863456985e-05, |
|
"loss": 0.1187, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.06545936316251755, |
|
"learning_rate": 2.399368587213891e-05, |
|
"loss": 0.0995, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 0.9462890625, |
|
"learning_rate": 2.359905288082084e-05, |
|
"loss": 0.0796, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 3.4971237182617188, |
|
"learning_rate": 2.3204419889502762e-05, |
|
"loss": 0.062, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 3.370915412902832, |
|
"learning_rate": 2.280978689818469e-05, |
|
"loss": 0.1375, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.047411490231752396, |
|
"learning_rate": 2.2415153906866616e-05, |
|
"loss": 0.0555, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"eval_accuracy": 0.8235642391947898, |
|
"eval_loss": 0.6841058135032654, |
|
"eval_runtime": 92.265, |
|
"eval_samples_per_second": 54.918, |
|
"eval_steps_per_second": 6.872, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 9.102856636047363, |
|
"learning_rate": 2.202052091554854e-05, |
|
"loss": 0.0936, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 1.5080801248550415, |
|
"learning_rate": 2.1625887924230466e-05, |
|
"loss": 0.0577, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.3888694047927856, |
|
"learning_rate": 2.1231254932912393e-05, |
|
"loss": 0.1027, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 3.6194889545440674, |
|
"learning_rate": 2.0836621941594317e-05, |
|
"loss": 0.0755, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.7071271538734436, |
|
"learning_rate": 2.0441988950276244e-05, |
|
"loss": 0.0276, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.07247108966112137, |
|
"learning_rate": 2.004735595895817e-05, |
|
"loss": 0.1272, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 1.932975172996521, |
|
"learning_rate": 1.9652722967640098e-05, |
|
"loss": 0.1341, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.6965919137001038, |
|
"learning_rate": 1.925808997632202e-05, |
|
"loss": 0.0614, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 3.4213573932647705, |
|
"learning_rate": 1.8863456985003948e-05, |
|
"loss": 0.0764, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.1402810513973236, |
|
"learning_rate": 1.8468823993685875e-05, |
|
"loss": 0.0188, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"eval_accuracy": 0.8239589500690744, |
|
"eval_loss": 0.7009280920028687, |
|
"eval_runtime": 92.5146, |
|
"eval_samples_per_second": 54.77, |
|
"eval_steps_per_second": 6.853, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.07443057000637054, |
|
"learning_rate": 1.80741910023678e-05, |
|
"loss": 0.0846, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.20932039618492126, |
|
"learning_rate": 1.7679558011049725e-05, |
|
"loss": 0.0747, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.04756965488195419, |
|
"learning_rate": 1.7284925019731652e-05, |
|
"loss": 0.0628, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 13.026732444763184, |
|
"learning_rate": 1.6890292028413576e-05, |
|
"loss": 0.1348, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.08388219773769379, |
|
"learning_rate": 1.64956590370955e-05, |
|
"loss": 0.0186, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.08429060131311417, |
|
"learning_rate": 1.610102604577743e-05, |
|
"loss": 0.0342, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.22585107386112213, |
|
"learning_rate": 1.5706393054459353e-05, |
|
"loss": 0.0371, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 8.003403663635254, |
|
"learning_rate": 1.5311760063141277e-05, |
|
"loss": 0.1692, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 1.522824764251709, |
|
"learning_rate": 1.4917127071823205e-05, |
|
"loss": 0.0612, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 2.2803661823272705, |
|
"learning_rate": 1.452249408050513e-05, |
|
"loss": 0.1292, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"eval_accuracy": 0.8204065522005132, |
|
"eval_loss": 0.703969419002533, |
|
"eval_runtime": 92.794, |
|
"eval_samples_per_second": 54.605, |
|
"eval_steps_per_second": 6.832, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 6.225048065185547, |
|
"learning_rate": 1.4127861089187056e-05, |
|
"loss": 0.0557, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.4492289423942566, |
|
"learning_rate": 1.3733228097868983e-05, |
|
"loss": 0.1578, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 1.797982096672058, |
|
"learning_rate": 1.3338595106550908e-05, |
|
"loss": 0.019, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.4045000970363617, |
|
"learning_rate": 1.2943962115232833e-05, |
|
"loss": 0.0614, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.08240614086389542, |
|
"learning_rate": 1.254932912391476e-05, |
|
"loss": 0.1421, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 4.642998695373535, |
|
"learning_rate": 1.2154696132596685e-05, |
|
"loss": 0.0562, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.16091428697109222, |
|
"learning_rate": 1.176006314127861e-05, |
|
"loss": 0.0638, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 1.0345463752746582, |
|
"learning_rate": 1.1365430149960538e-05, |
|
"loss": 0.0634, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.042663924396038055, |
|
"learning_rate": 1.0970797158642463e-05, |
|
"loss": 0.1053, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 1.7890504598617554, |
|
"learning_rate": 1.0576164167324388e-05, |
|
"loss": 0.0661, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"eval_accuracy": 0.8237615946319321, |
|
"eval_loss": 0.7073862552642822, |
|
"eval_runtime": 91.8421, |
|
"eval_samples_per_second": 55.171, |
|
"eval_steps_per_second": 6.903, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.08139993995428085, |
|
"learning_rate": 1.0181531176006315e-05, |
|
"loss": 0.0623, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.02715960517525673, |
|
"learning_rate": 9.786898184688242e-06, |
|
"loss": 0.0532, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 5.507246494293213, |
|
"learning_rate": 9.392265193370165e-06, |
|
"loss": 0.0829, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.11962206661701202, |
|
"learning_rate": 8.997632202052092e-06, |
|
"loss": 0.0887, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.14435319602489471, |
|
"learning_rate": 8.602999210734018e-06, |
|
"loss": 0.0478, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 4.151905536651611, |
|
"learning_rate": 8.208366219415943e-06, |
|
"loss": 0.1795, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 0.10827145725488663, |
|
"learning_rate": 7.81373322809787e-06, |
|
"loss": 0.0566, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.5536698698997498, |
|
"learning_rate": 7.419100236779796e-06, |
|
"loss": 0.0602, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.1482585370540619, |
|
"learning_rate": 7.02446724546172e-06, |
|
"loss": 0.0995, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.6495879888534546, |
|
"learning_rate": 6.629834254143646e-06, |
|
"loss": 0.1061, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"eval_accuracy": 0.82099861851194, |
|
"eval_loss": 0.6983892321586609, |
|
"eval_runtime": 92.4464, |
|
"eval_samples_per_second": 54.81, |
|
"eval_steps_per_second": 6.858, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.4518879652023315, |
|
"learning_rate": 6.235201262825572e-06, |
|
"loss": 0.0968, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 2.563098669052124, |
|
"learning_rate": 5.840568271507498e-06, |
|
"loss": 0.0877, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 5.312118053436279, |
|
"learning_rate": 5.445935280189424e-06, |
|
"loss": 0.1662, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 2.835618257522583, |
|
"learning_rate": 5.05130228887135e-06, |
|
"loss": 0.0757, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 1.9459186792373657, |
|
"learning_rate": 4.656669297553276e-06, |
|
"loss": 0.0844, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 1.6886509656906128, |
|
"learning_rate": 4.262036306235202e-06, |
|
"loss": 0.1373, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 9.272232055664062, |
|
"learning_rate": 3.867403314917127e-06, |
|
"loss": 0.0566, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.17531900107860565, |
|
"learning_rate": 3.472770323599053e-06, |
|
"loss": 0.0691, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.08186766505241394, |
|
"learning_rate": 3.0781373322809787e-06, |
|
"loss": 0.0586, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 3.012854814529419, |
|
"learning_rate": 2.683504340962905e-06, |
|
"loss": 0.0861, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"eval_accuracy": 0.8229721728833629, |
|
"eval_loss": 0.6912640929222107, |
|
"eval_runtime": 91.3989, |
|
"eval_samples_per_second": 55.438, |
|
"eval_steps_per_second": 6.937, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 0.05941695719957352, |
|
"learning_rate": 2.2888713496448305e-06, |
|
"loss": 0.0622, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 7.365468502044678, |
|
"learning_rate": 1.8942383583267563e-06, |
|
"loss": 0.1702, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 2.009979724884033, |
|
"learning_rate": 1.499605367008682e-06, |
|
"loss": 0.0164, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 4.683029651641846, |
|
"learning_rate": 1.1049723756906078e-06, |
|
"loss": 0.0963, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 8.475516319274902, |
|
"learning_rate": 7.103393843725336e-07, |
|
"loss": 0.071, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.3904523253440857, |
|
"learning_rate": 3.1570639305445935e-07, |
|
"loss": 0.0701, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 5068, |
|
"total_flos": 6.281528488153842e+18, |
|
"train_loss": 0.4664627312864129, |
|
"train_runtime": 8576.6293, |
|
"train_samples_per_second": 9.451, |
|
"train_steps_per_second": 0.591 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5068, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"total_flos": 6.281528488153842e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|