|
{ |
|
"best_metric": 0.5304816365242004, |
|
"best_model_checkpoint": "./vit-base-beans/checkpoint-1600", |
|
"epoch": 4.0, |
|
"eval_steps": 100, |
|
"global_step": 1736, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02304147465437788, |
|
"grad_norm": 2.396202564239502, |
|
"learning_rate": 0.00019884792626728113, |
|
"loss": 1.8485, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04608294930875576, |
|
"grad_norm": 1.289166808128357, |
|
"learning_rate": 0.00019769585253456222, |
|
"loss": 1.5911, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06912442396313365, |
|
"grad_norm": 2.512033462524414, |
|
"learning_rate": 0.00019654377880184333, |
|
"loss": 1.4806, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09216589861751152, |
|
"grad_norm": 2.6234657764434814, |
|
"learning_rate": 0.00019539170506912442, |
|
"loss": 1.3684, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1152073732718894, |
|
"grad_norm": 2.335149049758911, |
|
"learning_rate": 0.00019423963133640554, |
|
"loss": 1.4012, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1382488479262673, |
|
"grad_norm": 3.386568546295166, |
|
"learning_rate": 0.00019308755760368663, |
|
"loss": 1.2248, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16129032258064516, |
|
"grad_norm": 1.9273797273635864, |
|
"learning_rate": 0.00019193548387096775, |
|
"loss": 1.144, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.18433179723502305, |
|
"grad_norm": 2.2117414474487305, |
|
"learning_rate": 0.00019078341013824886, |
|
"loss": 1.0101, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.2073732718894009, |
|
"grad_norm": 3.1132171154022217, |
|
"learning_rate": 0.00018963133640552998, |
|
"loss": 1.1411, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2304147465437788, |
|
"grad_norm": 3.0585570335388184, |
|
"learning_rate": 0.00018847926267281107, |
|
"loss": 1.0791, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2304147465437788, |
|
"eval_accuracy": 0.6335113484646195, |
|
"eval_loss": 1.0347875356674194, |
|
"eval_runtime": 11.9052, |
|
"eval_samples_per_second": 125.828, |
|
"eval_steps_per_second": 15.791, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2534562211981567, |
|
"grad_norm": 2.400747299194336, |
|
"learning_rate": 0.00018732718894009219, |
|
"loss": 1.04, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2764976958525346, |
|
"grad_norm": 2.432607412338257, |
|
"learning_rate": 0.00018617511520737328, |
|
"loss": 1.0396, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2995391705069124, |
|
"grad_norm": 2.5169568061828613, |
|
"learning_rate": 0.0001850230414746544, |
|
"loss": 0.9925, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 2.450554847717285, |
|
"learning_rate": 0.00018387096774193548, |
|
"loss": 1.0361, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3456221198156682, |
|
"grad_norm": 1.5931885242462158, |
|
"learning_rate": 0.0001827188940092166, |
|
"loss": 0.9851, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3686635944700461, |
|
"grad_norm": 1.8019052743911743, |
|
"learning_rate": 0.0001815668202764977, |
|
"loss": 0.8847, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.391705069124424, |
|
"grad_norm": 2.283034086227417, |
|
"learning_rate": 0.0001804147465437788, |
|
"loss": 0.8507, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4147465437788018, |
|
"grad_norm": 2.5878796577453613, |
|
"learning_rate": 0.0001792626728110599, |
|
"loss": 0.9579, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4377880184331797, |
|
"grad_norm": 3.469618558883667, |
|
"learning_rate": 0.000178110599078341, |
|
"loss": 0.9453, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4608294930875576, |
|
"grad_norm": 1.9743025302886963, |
|
"learning_rate": 0.00017695852534562213, |
|
"loss": 0.9415, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4608294930875576, |
|
"eval_accuracy": 0.6448598130841121, |
|
"eval_loss": 0.9576324820518494, |
|
"eval_runtime": 11.862, |
|
"eval_samples_per_second": 126.285, |
|
"eval_steps_per_second": 15.849, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4838709677419355, |
|
"grad_norm": 3.031723976135254, |
|
"learning_rate": 0.00017580645161290325, |
|
"loss": 0.7819, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5069124423963134, |
|
"grad_norm": 2.2470805644989014, |
|
"learning_rate": 0.00017465437788018436, |
|
"loss": 0.8163, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5299539170506913, |
|
"grad_norm": 1.723471760749817, |
|
"learning_rate": 0.00017350230414746545, |
|
"loss": 0.6728, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5529953917050692, |
|
"grad_norm": 3.93212628364563, |
|
"learning_rate": 0.00017235023041474657, |
|
"loss": 0.684, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.576036866359447, |
|
"grad_norm": 1.4867981672286987, |
|
"learning_rate": 0.00017119815668202766, |
|
"loss": 0.8527, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5990783410138248, |
|
"grad_norm": 2.4340641498565674, |
|
"learning_rate": 0.00017004608294930878, |
|
"loss": 1.0102, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6221198156682027, |
|
"grad_norm": 2.8441660404205322, |
|
"learning_rate": 0.00016889400921658987, |
|
"loss": 0.7739, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 1.6598294973373413, |
|
"learning_rate": 0.00016774193548387098, |
|
"loss": 0.7442, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6682027649769585, |
|
"grad_norm": 3.455202102661133, |
|
"learning_rate": 0.00016658986175115207, |
|
"loss": 0.7643, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6912442396313364, |
|
"grad_norm": 2.480116367340088, |
|
"learning_rate": 0.0001654377880184332, |
|
"loss": 0.7839, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6912442396313364, |
|
"eval_accuracy": 0.6662216288384513, |
|
"eval_loss": 0.89629727602005, |
|
"eval_runtime": 11.7103, |
|
"eval_samples_per_second": 127.921, |
|
"eval_steps_per_second": 16.054, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 3.3055620193481445, |
|
"learning_rate": 0.00016428571428571428, |
|
"loss": 0.639, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7373271889400922, |
|
"grad_norm": 1.8542070388793945, |
|
"learning_rate": 0.0001631336405529954, |
|
"loss": 0.8931, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7603686635944701, |
|
"grad_norm": 1.6089766025543213, |
|
"learning_rate": 0.00016198156682027649, |
|
"loss": 0.9023, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.783410138248848, |
|
"grad_norm": 1.5780836343765259, |
|
"learning_rate": 0.0001608294930875576, |
|
"loss": 0.7285, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"grad_norm": 3.153092384338379, |
|
"learning_rate": 0.00015967741935483872, |
|
"loss": 0.8702, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8294930875576036, |
|
"grad_norm": 2.3161656856536865, |
|
"learning_rate": 0.00015852534562211984, |
|
"loss": 0.7343, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8525345622119815, |
|
"grad_norm": 1.7923251390457153, |
|
"learning_rate": 0.00015737327188940093, |
|
"loss": 0.7986, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.8755760368663594, |
|
"grad_norm": 2.7093405723571777, |
|
"learning_rate": 0.00015622119815668204, |
|
"loss": 0.6377, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8986175115207373, |
|
"grad_norm": 4.7555251121521, |
|
"learning_rate": 0.00015506912442396313, |
|
"loss": 0.8223, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9216589861751152, |
|
"grad_norm": 2.78916072845459, |
|
"learning_rate": 0.00015391705069124425, |
|
"loss": 0.7181, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9216589861751152, |
|
"eval_accuracy": 0.6962616822429907, |
|
"eval_loss": 0.8479276299476624, |
|
"eval_runtime": 11.6609, |
|
"eval_samples_per_second": 128.464, |
|
"eval_steps_per_second": 16.122, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9447004608294931, |
|
"grad_norm": 2.4783871173858643, |
|
"learning_rate": 0.00015276497695852537, |
|
"loss": 0.7422, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.967741935483871, |
|
"grad_norm": 2.8775382041931152, |
|
"learning_rate": 0.00015161290322580646, |
|
"loss": 0.6255, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9907834101382489, |
|
"grad_norm": 2.3851194381713867, |
|
"learning_rate": 0.00015046082949308757, |
|
"loss": 0.7266, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.0138248847926268, |
|
"grad_norm": 5.285385608673096, |
|
"learning_rate": 0.00014930875576036866, |
|
"loss": 0.6283, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.0368663594470047, |
|
"grad_norm": 1.691789984703064, |
|
"learning_rate": 0.00014815668202764978, |
|
"loss": 0.4918, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0599078341013826, |
|
"grad_norm": 2.8921382427215576, |
|
"learning_rate": 0.00014700460829493087, |
|
"loss": 0.5787, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.0829493087557605, |
|
"grad_norm": 3.1509757041931152, |
|
"learning_rate": 0.00014585253456221199, |
|
"loss": 0.4906, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.1059907834101383, |
|
"grad_norm": 3.2979822158813477, |
|
"learning_rate": 0.0001447004608294931, |
|
"loss": 0.5715, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.129032258064516, |
|
"grad_norm": 3.3389899730682373, |
|
"learning_rate": 0.00014354838709677422, |
|
"loss": 0.5411, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.1520737327188941, |
|
"grad_norm": 0.9589664936065674, |
|
"learning_rate": 0.0001423963133640553, |
|
"loss": 0.3995, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1520737327188941, |
|
"eval_accuracy": 0.7169559412550067, |
|
"eval_loss": 0.7820530533790588, |
|
"eval_runtime": 11.5056, |
|
"eval_samples_per_second": 130.197, |
|
"eval_steps_per_second": 16.34, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1751152073732718, |
|
"grad_norm": 2.248042106628418, |
|
"learning_rate": 0.00014124423963133643, |
|
"loss": 0.5057, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.1981566820276497, |
|
"grad_norm": 3.944963216781616, |
|
"learning_rate": 0.00014009216589861752, |
|
"loss": 0.5005, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.2211981566820276, |
|
"grad_norm": 2.7981412410736084, |
|
"learning_rate": 0.00013894009216589863, |
|
"loss": 0.6703, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.2442396313364055, |
|
"grad_norm": 1.683069109916687, |
|
"learning_rate": 0.00013778801843317972, |
|
"loss": 0.5394, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.2672811059907834, |
|
"grad_norm": 1.2122957706451416, |
|
"learning_rate": 0.00013663594470046084, |
|
"loss": 0.4775, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.2903225806451613, |
|
"grad_norm": 1.4005225896835327, |
|
"learning_rate": 0.00013548387096774193, |
|
"loss": 0.4467, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.3133640552995391, |
|
"grad_norm": 2.5969114303588867, |
|
"learning_rate": 0.00013433179723502305, |
|
"loss": 0.4289, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.336405529953917, |
|
"grad_norm": 3.344553232192993, |
|
"learning_rate": 0.00013317972350230414, |
|
"loss": 0.4631, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.359447004608295, |
|
"grad_norm": 1.6798585653305054, |
|
"learning_rate": 0.00013202764976958525, |
|
"loss": 0.4329, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.3824884792626728, |
|
"grad_norm": 1.3849396705627441, |
|
"learning_rate": 0.00013087557603686637, |
|
"loss": 0.5025, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3824884792626728, |
|
"eval_accuracy": 0.7837116154873164, |
|
"eval_loss": 0.6299713253974915, |
|
"eval_runtime": 11.705, |
|
"eval_samples_per_second": 127.979, |
|
"eval_steps_per_second": 16.061, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.4055299539170507, |
|
"grad_norm": 2.550548791885376, |
|
"learning_rate": 0.00012972350230414746, |
|
"loss": 0.4463, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 3.063411235809326, |
|
"learning_rate": 0.00012857142857142858, |
|
"loss": 0.3624, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.4516129032258065, |
|
"grad_norm": 6.676961898803711, |
|
"learning_rate": 0.0001274193548387097, |
|
"loss": 0.4446, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.4746543778801844, |
|
"grad_norm": 0.8720624446868896, |
|
"learning_rate": 0.0001262672811059908, |
|
"loss": 0.5162, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.4976958525345623, |
|
"grad_norm": 2.214848041534424, |
|
"learning_rate": 0.0001251152073732719, |
|
"loss": 0.2978, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.52073732718894, |
|
"grad_norm": 5.083272457122803, |
|
"learning_rate": 0.00012396313364055302, |
|
"loss": 0.5157, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.543778801843318, |
|
"grad_norm": 4.042588710784912, |
|
"learning_rate": 0.0001228110599078341, |
|
"loss": 0.5338, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.5668202764976957, |
|
"grad_norm": 3.1029160022735596, |
|
"learning_rate": 0.00012165898617511522, |
|
"loss": 0.4767, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.5898617511520738, |
|
"grad_norm": 1.4430710077285767, |
|
"learning_rate": 0.00012050691244239631, |
|
"loss": 0.5531, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.6129032258064515, |
|
"grad_norm": 11.178030967712402, |
|
"learning_rate": 0.00011935483870967743, |
|
"loss": 0.4985, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.6129032258064515, |
|
"eval_accuracy": 0.7489986648865153, |
|
"eval_loss": 0.7058817744255066, |
|
"eval_runtime": 11.9139, |
|
"eval_samples_per_second": 125.736, |
|
"eval_steps_per_second": 15.78, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.6359447004608296, |
|
"grad_norm": 3.918297529220581, |
|
"learning_rate": 0.00011820276497695852, |
|
"loss": 0.5471, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.6589861751152073, |
|
"grad_norm": 2.7170467376708984, |
|
"learning_rate": 0.00011705069124423964, |
|
"loss": 0.4797, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.6820276497695854, |
|
"grad_norm": 1.0436949729919434, |
|
"learning_rate": 0.00011589861751152074, |
|
"loss": 0.427, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.705069124423963, |
|
"grad_norm": 3.6829638481140137, |
|
"learning_rate": 0.00011474654377880186, |
|
"loss": 0.5121, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.728110599078341, |
|
"grad_norm": 1.8748345375061035, |
|
"learning_rate": 0.00011359447004608295, |
|
"loss": 0.4227, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.7511520737327189, |
|
"grad_norm": 4.548758506774902, |
|
"learning_rate": 0.00011244239631336406, |
|
"loss": 0.3164, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.7741935483870968, |
|
"grad_norm": 3.4847280979156494, |
|
"learning_rate": 0.00011129032258064515, |
|
"loss": 0.5092, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.7972350230414746, |
|
"grad_norm": 1.8869714736938477, |
|
"learning_rate": 0.00011013824884792627, |
|
"loss": 0.4472, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.8202764976958525, |
|
"grad_norm": 3.899409770965576, |
|
"learning_rate": 0.00010898617511520739, |
|
"loss": 0.4708, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.8433179723502304, |
|
"grad_norm": 1.543060541152954, |
|
"learning_rate": 0.00010783410138248849, |
|
"loss": 0.4388, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.8433179723502304, |
|
"eval_accuracy": 0.7857142857142857, |
|
"eval_loss": 0.5893343091011047, |
|
"eval_runtime": 11.4174, |
|
"eval_samples_per_second": 131.203, |
|
"eval_steps_per_second": 16.466, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.8663594470046083, |
|
"grad_norm": 5.587724208831787, |
|
"learning_rate": 0.0001066820276497696, |
|
"loss": 0.4264, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.8894009216589862, |
|
"grad_norm": 7.794037342071533, |
|
"learning_rate": 0.0001055299539170507, |
|
"loss": 0.4513, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.912442396313364, |
|
"grad_norm": 3.597796678543091, |
|
"learning_rate": 0.00010437788018433181, |
|
"loss": 0.437, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.935483870967742, |
|
"grad_norm": 2.825336217880249, |
|
"learning_rate": 0.0001032258064516129, |
|
"loss": 0.5202, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.9585253456221197, |
|
"grad_norm": 1.8002281188964844, |
|
"learning_rate": 0.00010207373271889402, |
|
"loss": 0.3283, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.9815668202764978, |
|
"grad_norm": 6.496976375579834, |
|
"learning_rate": 0.00010092165898617512, |
|
"loss": 0.2887, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.0046082949308754, |
|
"grad_norm": 2.1674392223358154, |
|
"learning_rate": 9.976958525345623e-05, |
|
"loss": 0.3299, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.0276497695852536, |
|
"grad_norm": 0.475057989358902, |
|
"learning_rate": 9.861751152073733e-05, |
|
"loss": 0.2049, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.0506912442396312, |
|
"grad_norm": 2.232353687286377, |
|
"learning_rate": 9.746543778801845e-05, |
|
"loss": 0.2598, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.0737327188940093, |
|
"grad_norm": 3.595874309539795, |
|
"learning_rate": 9.631336405529955e-05, |
|
"loss": 0.2389, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.0737327188940093, |
|
"eval_accuracy": 0.807743658210948, |
|
"eval_loss": 0.5928804278373718, |
|
"eval_runtime": 11.7831, |
|
"eval_samples_per_second": 127.131, |
|
"eval_steps_per_second": 15.955, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.096774193548387, |
|
"grad_norm": 2.4027860164642334, |
|
"learning_rate": 9.516129032258065e-05, |
|
"loss": 0.2023, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.119815668202765, |
|
"grad_norm": 4.1582560539245605, |
|
"learning_rate": 9.400921658986176e-05, |
|
"loss": 0.2389, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 3.8105199337005615, |
|
"learning_rate": 9.285714285714286e-05, |
|
"loss": 0.2054, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.165898617511521, |
|
"grad_norm": 4.042884826660156, |
|
"learning_rate": 9.170506912442398e-05, |
|
"loss": 0.2445, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.1889400921658986, |
|
"grad_norm": 3.3385071754455566, |
|
"learning_rate": 9.055299539170508e-05, |
|
"loss": 0.2578, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.2119815668202767, |
|
"grad_norm": 2.232977867126465, |
|
"learning_rate": 8.940092165898618e-05, |
|
"loss": 0.2168, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.2350230414746544, |
|
"grad_norm": 4.8774847984313965, |
|
"learning_rate": 8.824884792626729e-05, |
|
"loss": 0.1978, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.258064516129032, |
|
"grad_norm": 2.6131808757781982, |
|
"learning_rate": 8.709677419354839e-05, |
|
"loss": 0.223, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.28110599078341, |
|
"grad_norm": 1.6126481294631958, |
|
"learning_rate": 8.594470046082949e-05, |
|
"loss": 0.3882, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.3041474654377883, |
|
"grad_norm": 1.6977124214172363, |
|
"learning_rate": 8.479262672811061e-05, |
|
"loss": 0.2767, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.3041474654377883, |
|
"eval_accuracy": 0.8090787716955942, |
|
"eval_loss": 0.5795237421989441, |
|
"eval_runtime": 11.3869, |
|
"eval_samples_per_second": 131.555, |
|
"eval_steps_per_second": 16.51, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.327188940092166, |
|
"grad_norm": 5.384529113769531, |
|
"learning_rate": 8.364055299539171e-05, |
|
"loss": 0.2478, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.3502304147465436, |
|
"grad_norm": 7.527071952819824, |
|
"learning_rate": 8.248847926267282e-05, |
|
"loss": 0.1614, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.3732718894009217, |
|
"grad_norm": 3.253967523574829, |
|
"learning_rate": 8.133640552995392e-05, |
|
"loss": 0.1988, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.3963133640552994, |
|
"grad_norm": 2.3061683177948, |
|
"learning_rate": 8.018433179723502e-05, |
|
"loss": 0.2267, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.4193548387096775, |
|
"grad_norm": 5.240030288696289, |
|
"learning_rate": 7.903225806451613e-05, |
|
"loss": 0.3522, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.442396313364055, |
|
"grad_norm": 5.367170810699463, |
|
"learning_rate": 7.788018433179723e-05, |
|
"loss": 0.21, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.4654377880184333, |
|
"grad_norm": 2.52602219581604, |
|
"learning_rate": 7.672811059907835e-05, |
|
"loss": 0.208, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.488479262672811, |
|
"grad_norm": 3.110276937484741, |
|
"learning_rate": 7.557603686635945e-05, |
|
"loss": 0.1624, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.511520737327189, |
|
"grad_norm": 3.7577178478240967, |
|
"learning_rate": 7.442396313364057e-05, |
|
"loss": 0.2187, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.5345622119815667, |
|
"grad_norm": 0.886064887046814, |
|
"learning_rate": 7.327188940092167e-05, |
|
"loss": 0.2387, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.5345622119815667, |
|
"eval_accuracy": 0.8090787716955942, |
|
"eval_loss": 0.6099982857704163, |
|
"eval_runtime": 11.7513, |
|
"eval_samples_per_second": 127.476, |
|
"eval_steps_per_second": 15.998, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.557603686635945, |
|
"grad_norm": 0.9772585034370422, |
|
"learning_rate": 7.211981566820277e-05, |
|
"loss": 0.2289, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.5806451612903225, |
|
"grad_norm": 5.879600524902344, |
|
"learning_rate": 7.096774193548388e-05, |
|
"loss": 0.2592, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.6036866359447006, |
|
"grad_norm": 5.125580310821533, |
|
"learning_rate": 6.981566820276498e-05, |
|
"loss": 0.1801, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.6267281105990783, |
|
"grad_norm": 4.4502692222595215, |
|
"learning_rate": 6.86635944700461e-05, |
|
"loss": 0.3577, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.6497695852534564, |
|
"grad_norm": 0.543267548084259, |
|
"learning_rate": 6.75115207373272e-05, |
|
"loss": 0.1313, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.672811059907834, |
|
"grad_norm": 1.4891630411148071, |
|
"learning_rate": 6.63594470046083e-05, |
|
"loss": 0.1858, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.6958525345622117, |
|
"grad_norm": 2.359645366668701, |
|
"learning_rate": 6.52073732718894e-05, |
|
"loss": 0.2059, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.71889400921659, |
|
"grad_norm": 2.5760185718536377, |
|
"learning_rate": 6.405529953917051e-05, |
|
"loss": 0.2378, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.741935483870968, |
|
"grad_norm": 0.24703356623649597, |
|
"learning_rate": 6.290322580645161e-05, |
|
"loss": 0.1487, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.7649769585253456, |
|
"grad_norm": 0.22307877242565155, |
|
"learning_rate": 6.175115207373272e-05, |
|
"loss": 0.1691, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.7649769585253456, |
|
"eval_accuracy": 0.8070761014686249, |
|
"eval_loss": 0.6174820065498352, |
|
"eval_runtime": 11.265, |
|
"eval_samples_per_second": 132.978, |
|
"eval_steps_per_second": 16.689, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.7880184331797233, |
|
"grad_norm": 2.50034761428833, |
|
"learning_rate": 6.0599078341013825e-05, |
|
"loss": 0.2148, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.8110599078341014, |
|
"grad_norm": 0.3251860439777374, |
|
"learning_rate": 5.944700460829493e-05, |
|
"loss": 0.1538, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.8341013824884795, |
|
"grad_norm": 3.687969446182251, |
|
"learning_rate": 5.829493087557604e-05, |
|
"loss": 0.2445, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 7.214417457580566, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.229, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.880184331797235, |
|
"grad_norm": 2.587062120437622, |
|
"learning_rate": 5.5990783410138245e-05, |
|
"loss": 0.1999, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.903225806451613, |
|
"grad_norm": 4.365920066833496, |
|
"learning_rate": 5.4838709677419355e-05, |
|
"loss": 0.1061, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.9262672811059907, |
|
"grad_norm": 3.7295572757720947, |
|
"learning_rate": 5.368663594470046e-05, |
|
"loss": 0.3093, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.9493087557603688, |
|
"grad_norm": 2.4992685317993164, |
|
"learning_rate": 5.253456221198156e-05, |
|
"loss": 0.1644, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.9723502304147464, |
|
"grad_norm": 5.495995998382568, |
|
"learning_rate": 5.138248847926268e-05, |
|
"loss": 0.2393, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.9953917050691246, |
|
"grad_norm": 2.1380579471588135, |
|
"learning_rate": 5.023041474654379e-05, |
|
"loss": 0.1738, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.9953917050691246, |
|
"eval_accuracy": 0.8197596795727636, |
|
"eval_loss": 0.5877332091331482, |
|
"eval_runtime": 11.4089, |
|
"eval_samples_per_second": 131.301, |
|
"eval_steps_per_second": 16.478, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.0184331797235022, |
|
"grad_norm": 6.119831085205078, |
|
"learning_rate": 4.9078341013824885e-05, |
|
"loss": 0.075, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 3.0414746543778803, |
|
"grad_norm": 0.25446683168411255, |
|
"learning_rate": 4.792626728110599e-05, |
|
"loss": 0.0528, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 3.064516129032258, |
|
"grad_norm": 0.32773900032043457, |
|
"learning_rate": 4.67741935483871e-05, |
|
"loss": 0.0551, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 3.087557603686636, |
|
"grad_norm": 0.8912816643714905, |
|
"learning_rate": 4.562211981566821e-05, |
|
"loss": 0.0799, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 3.110599078341014, |
|
"grad_norm": 0.6732431054115295, |
|
"learning_rate": 4.447004608294931e-05, |
|
"loss": 0.0327, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.133640552995392, |
|
"grad_norm": 5.909882545471191, |
|
"learning_rate": 4.3317972350230415e-05, |
|
"loss": 0.108, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 3.1566820276497696, |
|
"grad_norm": 1.3546661138534546, |
|
"learning_rate": 4.2165898617511525e-05, |
|
"loss": 0.1057, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 3.1797235023041477, |
|
"grad_norm": 0.09205944836139679, |
|
"learning_rate": 4.101382488479263e-05, |
|
"loss": 0.045, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.2027649769585254, |
|
"grad_norm": 0.12445586174726486, |
|
"learning_rate": 3.986175115207373e-05, |
|
"loss": 0.0391, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.225806451612903, |
|
"grad_norm": 0.49267128109931946, |
|
"learning_rate": 3.870967741935484e-05, |
|
"loss": 0.0397, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.225806451612903, |
|
"eval_accuracy": 0.835781041388518, |
|
"eval_loss": 0.576629102230072, |
|
"eval_runtime": 11.5658, |
|
"eval_samples_per_second": 129.52, |
|
"eval_steps_per_second": 16.255, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.248847926267281, |
|
"grad_norm": 0.24710910022258759, |
|
"learning_rate": 3.7557603686635945e-05, |
|
"loss": 0.0982, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.271889400921659, |
|
"grad_norm": 1.3541345596313477, |
|
"learning_rate": 3.640552995391705e-05, |
|
"loss": 0.1062, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.294930875576037, |
|
"grad_norm": 0.07805185765028, |
|
"learning_rate": 3.525345622119816e-05, |
|
"loss": 0.0367, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.3179723502304146, |
|
"grad_norm": 0.704824686050415, |
|
"learning_rate": 3.410138248847927e-05, |
|
"loss": 0.0576, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.3410138248847927, |
|
"grad_norm": 3.216744899749756, |
|
"learning_rate": 3.294930875576037e-05, |
|
"loss": 0.123, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.3640552995391704, |
|
"grad_norm": 3.2812583446502686, |
|
"learning_rate": 3.1797235023041475e-05, |
|
"loss": 0.0535, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.3870967741935485, |
|
"grad_norm": 0.09345371276140213, |
|
"learning_rate": 3.0645161290322585e-05, |
|
"loss": 0.0363, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 3.410138248847926, |
|
"grad_norm": 0.5610162019729614, |
|
"learning_rate": 2.9493087557603688e-05, |
|
"loss": 0.0903, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.4331797235023043, |
|
"grad_norm": 1.413180947303772, |
|
"learning_rate": 2.8341013824884795e-05, |
|
"loss": 0.0792, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 3.456221198156682, |
|
"grad_norm": 6.735473155975342, |
|
"learning_rate": 2.7188940092165898e-05, |
|
"loss": 0.03, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.456221198156682, |
|
"eval_accuracy": 0.8371161548731643, |
|
"eval_loss": 0.5680701732635498, |
|
"eval_runtime": 11.6369, |
|
"eval_samples_per_second": 128.728, |
|
"eval_steps_per_second": 16.155, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.47926267281106, |
|
"grad_norm": 1.4329415559768677, |
|
"learning_rate": 2.6036866359447005e-05, |
|
"loss": 0.0206, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 3.5023041474654377, |
|
"grad_norm": 0.0513407364487648, |
|
"learning_rate": 2.488479262672811e-05, |
|
"loss": 0.0637, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.525345622119816, |
|
"grad_norm": 0.09985367208719254, |
|
"learning_rate": 2.3732718894009218e-05, |
|
"loss": 0.0829, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 3.5483870967741935, |
|
"grad_norm": 0.0632900595664978, |
|
"learning_rate": 2.258064516129032e-05, |
|
"loss": 0.0329, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 0.23229588568210602, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 0.0709, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.5944700460829493, |
|
"grad_norm": 0.15025608241558075, |
|
"learning_rate": 2.0276497695852538e-05, |
|
"loss": 0.1135, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 3.6175115207373274, |
|
"grad_norm": 5.933778285980225, |
|
"learning_rate": 1.912442396313364e-05, |
|
"loss": 0.1093, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 3.640552995391705, |
|
"grad_norm": 0.06949874013662338, |
|
"learning_rate": 1.7972350230414748e-05, |
|
"loss": 0.0498, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 3.6635944700460827, |
|
"grad_norm": 0.09838402271270752, |
|
"learning_rate": 1.682027649769585e-05, |
|
"loss": 0.0598, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 3.686635944700461, |
|
"grad_norm": 0.9366612434387207, |
|
"learning_rate": 1.5668202764976958e-05, |
|
"loss": 0.092, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.686635944700461, |
|
"eval_accuracy": 0.8451268357810414, |
|
"eval_loss": 0.5304816365242004, |
|
"eval_runtime": 11.6024, |
|
"eval_samples_per_second": 129.111, |
|
"eval_steps_per_second": 16.203, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.709677419354839, |
|
"grad_norm": 0.04733530059456825, |
|
"learning_rate": 1.4516129032258066e-05, |
|
"loss": 0.0276, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 3.7327188940092166, |
|
"grad_norm": 0.08603022992610931, |
|
"learning_rate": 1.3364055299539171e-05, |
|
"loss": 0.0347, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.7557603686635943, |
|
"grad_norm": 0.041543856263160706, |
|
"learning_rate": 1.2211981566820276e-05, |
|
"loss": 0.026, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.7788018433179724, |
|
"grad_norm": 0.24026305973529816, |
|
"learning_rate": 1.1059907834101383e-05, |
|
"loss": 0.0496, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.80184331797235, |
|
"grad_norm": 0.03894612938165665, |
|
"learning_rate": 9.90783410138249e-06, |
|
"loss": 0.0365, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.824884792626728, |
|
"grad_norm": 4.442405700683594, |
|
"learning_rate": 8.755760368663595e-06, |
|
"loss": 0.0402, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.847926267281106, |
|
"grad_norm": 0.032657474279403687, |
|
"learning_rate": 7.603686635944701e-06, |
|
"loss": 0.0596, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.870967741935484, |
|
"grad_norm": 2.9635491371154785, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 0.0835, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.8940092165898617, |
|
"grad_norm": 0.06601913273334503, |
|
"learning_rate": 5.299539170506913e-06, |
|
"loss": 0.0277, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.9170506912442398, |
|
"grad_norm": 0.22990980744361877, |
|
"learning_rate": 4.147465437788019e-06, |
|
"loss": 0.0416, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.9170506912442398, |
|
"eval_accuracy": 0.8471295060080107, |
|
"eval_loss": 0.5442608594894409, |
|
"eval_runtime": 11.486, |
|
"eval_samples_per_second": 130.42, |
|
"eval_steps_per_second": 16.368, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.9400921658986174, |
|
"grad_norm": 0.06300857663154602, |
|
"learning_rate": 2.9953917050691243e-06, |
|
"loss": 0.0331, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.9631336405529956, |
|
"grad_norm": 1.9846687316894531, |
|
"learning_rate": 1.8433179723502305e-06, |
|
"loss": 0.04, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.986175115207373, |
|
"grad_norm": 0.4808693826198578, |
|
"learning_rate": 6.912442396313364e-07, |
|
"loss": 0.0494, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 1736, |
|
"total_flos": 2.1525139607212524e+18, |
|
"train_loss": 0.4232822818690181, |
|
"train_runtime": 559.5799, |
|
"train_samples_per_second": 49.637, |
|
"train_steps_per_second": 3.102 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1736, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.1525139607212524e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|