{ "best_metric": 0.9764913889070788, "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-5370", "epoch": 10.0, "eval_steps": 500, "global_step": 5370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 3.459811210632324, "learning_rate": 9.31098696461825e-07, "loss": 0.7199, "step": 10 }, { "epoch": 0.04, "grad_norm": 3.0020904541015625, "learning_rate": 1.86219739292365e-06, "loss": 0.7055, "step": 20 }, { "epoch": 0.06, "grad_norm": 2.790972948074341, "learning_rate": 2.7932960893854746e-06, "loss": 0.6772, "step": 30 }, { "epoch": 0.07, "grad_norm": 3.0193727016448975, "learning_rate": 3.7243947858473e-06, "loss": 0.6557, "step": 40 }, { "epoch": 0.09, "grad_norm": 2.7702300548553467, "learning_rate": 4.655493482309125e-06, "loss": 0.6343, "step": 50 }, { "epoch": 0.11, "grad_norm": 2.6783347129821777, "learning_rate": 5.586592178770949e-06, "loss": 0.5984, "step": 60 }, { "epoch": 0.13, "grad_norm": 2.6775383949279785, "learning_rate": 6.517690875232775e-06, "loss": 0.5643, "step": 70 }, { "epoch": 0.15, "grad_norm": 4.056990623474121, "learning_rate": 7.4487895716946e-06, "loss": 0.534, "step": 80 }, { "epoch": 0.17, "grad_norm": 3.630444288253784, "learning_rate": 8.379888268156424e-06, "loss": 0.4963, "step": 90 }, { "epoch": 0.19, "grad_norm": 5.517983913421631, "learning_rate": 9.31098696461825e-06, "loss": 0.4724, "step": 100 }, { "epoch": 0.2, "grad_norm": 6.528829097747803, "learning_rate": 1.0242085661080076e-05, "loss": 0.4412, "step": 110 }, { "epoch": 0.22, "grad_norm": 4.494168758392334, "learning_rate": 1.1173184357541899e-05, "loss": 0.426, "step": 120 }, { "epoch": 0.24, "grad_norm": 6.211711883544922, "learning_rate": 1.2104283054003724e-05, "loss": 0.4181, "step": 130 }, { "epoch": 0.26, "grad_norm": 8.021025657653809, "learning_rate": 1.303538175046555e-05, "loss": 0.4104, "step": 140 }, { "epoch": 0.28, "grad_norm": 5.851734161376953, "learning_rate": 1.3966480446927374e-05, "loss": 0.396, "step": 150 }, { "epoch": 0.3, "grad_norm": 7.224986553192139, "learning_rate": 1.48975791433892e-05, "loss": 0.3909, "step": 160 }, { "epoch": 0.32, "grad_norm": 8.457038879394531, "learning_rate": 1.5828677839851026e-05, "loss": 0.3542, "step": 170 }, { "epoch": 0.34, "grad_norm": 6.494715690612793, "learning_rate": 1.675977653631285e-05, "loss": 0.386, "step": 180 }, { "epoch": 0.35, "grad_norm": 6.016439914703369, "learning_rate": 1.7690875232774675e-05, "loss": 0.3678, "step": 190 }, { "epoch": 0.37, "grad_norm": 7.461198329925537, "learning_rate": 1.86219739292365e-05, "loss": 0.3559, "step": 200 }, { "epoch": 0.39, "grad_norm": 9.391804695129395, "learning_rate": 1.9553072625698323e-05, "loss": 0.3505, "step": 210 }, { "epoch": 0.41, "grad_norm": 7.332585334777832, "learning_rate": 2.0484171322160152e-05, "loss": 0.3562, "step": 220 }, { "epoch": 0.43, "grad_norm": 6.92149019241333, "learning_rate": 2.1415270018621975e-05, "loss": 0.3258, "step": 230 }, { "epoch": 0.45, "grad_norm": 7.061047554016113, "learning_rate": 2.2346368715083797e-05, "loss": 0.3243, "step": 240 }, { "epoch": 0.47, "grad_norm": 8.112573623657227, "learning_rate": 2.3277467411545626e-05, "loss": 0.3315, "step": 250 }, { "epoch": 0.48, "grad_norm": 5.698765277862549, "learning_rate": 2.420856610800745e-05, "loss": 0.3337, "step": 260 }, { "epoch": 0.5, "grad_norm": 11.131732940673828, "learning_rate": 2.5139664804469275e-05, "loss": 0.2954, "step": 270 }, { "epoch": 0.52, "grad_norm": 5.557519912719727, "learning_rate": 2.60707635009311e-05, "loss": 0.2806, "step": 280 }, { "epoch": 0.54, "grad_norm": 7.42105770111084, "learning_rate": 2.7001862197392923e-05, "loss": 0.3018, "step": 290 }, { "epoch": 0.56, "grad_norm": 5.84682035446167, "learning_rate": 2.793296089385475e-05, "loss": 0.2983, "step": 300 }, { "epoch": 0.58, "grad_norm": 12.523080825805664, "learning_rate": 2.886405959031657e-05, "loss": 0.2942, "step": 310 }, { "epoch": 0.6, "grad_norm": 10.046136856079102, "learning_rate": 2.97951582867784e-05, "loss": 0.3163, "step": 320 }, { "epoch": 0.61, "grad_norm": 8.785089492797852, "learning_rate": 3.0726256983240227e-05, "loss": 0.2803, "step": 330 }, { "epoch": 0.63, "grad_norm": 6.233649253845215, "learning_rate": 3.165735567970205e-05, "loss": 0.2968, "step": 340 }, { "epoch": 0.65, "grad_norm": 6.553578853607178, "learning_rate": 3.258845437616387e-05, "loss": 0.2786, "step": 350 }, { "epoch": 0.67, "grad_norm": 15.390639305114746, "learning_rate": 3.35195530726257e-05, "loss": 0.2665, "step": 360 }, { "epoch": 0.69, "grad_norm": 7.992424964904785, "learning_rate": 3.445065176908753e-05, "loss": 0.2693, "step": 370 }, { "epoch": 0.71, "grad_norm": 9.531819343566895, "learning_rate": 3.538175046554935e-05, "loss": 0.2914, "step": 380 }, { "epoch": 0.73, "grad_norm": 8.762212753295898, "learning_rate": 3.6312849162011175e-05, "loss": 0.2673, "step": 390 }, { "epoch": 0.74, "grad_norm": 11.937532424926758, "learning_rate": 3.7243947858473e-05, "loss": 0.2846, "step": 400 }, { "epoch": 0.76, "grad_norm": 7.173421382904053, "learning_rate": 3.817504655493483e-05, "loss": 0.3207, "step": 410 }, { "epoch": 0.78, "grad_norm": 10.6701021194458, "learning_rate": 3.9106145251396646e-05, "loss": 0.285, "step": 420 }, { "epoch": 0.8, "grad_norm": 9.541986465454102, "learning_rate": 4.003724394785848e-05, "loss": 0.2684, "step": 430 }, { "epoch": 0.82, "grad_norm": 7.715641975402832, "learning_rate": 4.0968342644320304e-05, "loss": 0.258, "step": 440 }, { "epoch": 0.84, "grad_norm": 5.0808491706848145, "learning_rate": 4.1899441340782123e-05, "loss": 0.2539, "step": 450 }, { "epoch": 0.86, "grad_norm": 7.925539016723633, "learning_rate": 4.283054003724395e-05, "loss": 0.2571, "step": 460 }, { "epoch": 0.88, "grad_norm": 5.518635272979736, "learning_rate": 4.3761638733705775e-05, "loss": 0.2775, "step": 470 }, { "epoch": 0.89, "grad_norm": 11.271161079406738, "learning_rate": 4.4692737430167594e-05, "loss": 0.2619, "step": 480 }, { "epoch": 0.91, "grad_norm": 9.803050994873047, "learning_rate": 4.562383612662943e-05, "loss": 0.2623, "step": 490 }, { "epoch": 0.93, "grad_norm": 6.506972312927246, "learning_rate": 4.655493482309125e-05, "loss": 0.278, "step": 500 }, { "epoch": 0.95, "grad_norm": 9.093548774719238, "learning_rate": 4.748603351955307e-05, "loss": 0.2694, "step": 510 }, { "epoch": 0.97, "grad_norm": 4.879490852355957, "learning_rate": 4.84171322160149e-05, "loss": 0.2487, "step": 520 }, { "epoch": 0.99, "grad_norm": 12.4769926071167, "learning_rate": 4.9348230912476724e-05, "loss": 0.2428, "step": 530 }, { "epoch": 1.0, "eval_accuracy": 0.9301944862811866, "eval_loss": 0.17676062881946564, "eval_runtime": 76.2927, "eval_samples_per_second": 200.163, "eval_steps_per_second": 3.133, "step": 537 }, { "epoch": 1.01, "grad_norm": 8.015608787536621, "learning_rate": 4.99689633767846e-05, "loss": 0.2571, "step": 540 }, { "epoch": 1.02, "grad_norm": 4.525390625, "learning_rate": 4.986550796606663e-05, "loss": 0.2506, "step": 550 }, { "epoch": 1.04, "grad_norm": 5.759488105773926, "learning_rate": 4.9762052555348645e-05, "loss": 0.2415, "step": 560 }, { "epoch": 1.06, "grad_norm": 7.032402992248535, "learning_rate": 4.965859714463066e-05, "loss": 0.2397, "step": 570 }, { "epoch": 1.08, "grad_norm": 7.004304885864258, "learning_rate": 4.955514173391269e-05, "loss": 0.237, "step": 580 }, { "epoch": 1.1, "grad_norm": 5.160691261291504, "learning_rate": 4.9451686323194706e-05, "loss": 0.2404, "step": 590 }, { "epoch": 1.12, "grad_norm": 3.625582218170166, "learning_rate": 4.9348230912476724e-05, "loss": 0.2526, "step": 600 }, { "epoch": 1.14, "grad_norm": 6.990879058837891, "learning_rate": 4.924477550175874e-05, "loss": 0.2412, "step": 610 }, { "epoch": 1.15, "grad_norm": 5.855559825897217, "learning_rate": 4.9141320091040766e-05, "loss": 0.2289, "step": 620 }, { "epoch": 1.17, "grad_norm": 3.9337453842163086, "learning_rate": 4.9037864680322784e-05, "loss": 0.2367, "step": 630 }, { "epoch": 1.19, "grad_norm": 5.882688999176025, "learning_rate": 4.89344092696048e-05, "loss": 0.2166, "step": 640 }, { "epoch": 1.21, "grad_norm": 5.7442779541015625, "learning_rate": 4.8830953858886826e-05, "loss": 0.2439, "step": 650 }, { "epoch": 1.23, "grad_norm": 6.9317755699157715, "learning_rate": 4.872749844816884e-05, "loss": 0.246, "step": 660 }, { "epoch": 1.25, "grad_norm": 5.778949737548828, "learning_rate": 4.862404303745086e-05, "loss": 0.2377, "step": 670 }, { "epoch": 1.27, "grad_norm": 3.596724271774292, "learning_rate": 4.852058762673288e-05, "loss": 0.2367, "step": 680 }, { "epoch": 1.28, "grad_norm": 5.4918670654296875, "learning_rate": 4.84171322160149e-05, "loss": 0.2199, "step": 690 }, { "epoch": 1.3, "grad_norm": 5.873249530792236, "learning_rate": 4.831367680529692e-05, "loss": 0.2051, "step": 700 }, { "epoch": 1.32, "grad_norm": 5.162601947784424, "learning_rate": 4.8210221394578933e-05, "loss": 0.2048, "step": 710 }, { "epoch": 1.34, "grad_norm": 7.136524200439453, "learning_rate": 4.810676598386096e-05, "loss": 0.2388, "step": 720 }, { "epoch": 1.36, "grad_norm": 5.694529056549072, "learning_rate": 4.8003310573142976e-05, "loss": 0.2468, "step": 730 }, { "epoch": 1.38, "grad_norm": 8.343286514282227, "learning_rate": 4.7899855162424994e-05, "loss": 0.2166, "step": 740 }, { "epoch": 1.4, "grad_norm": 5.000110626220703, "learning_rate": 4.779639975170702e-05, "loss": 0.1935, "step": 750 }, { "epoch": 1.42, "grad_norm": 4.729160785675049, "learning_rate": 4.7692944340989036e-05, "loss": 0.2, "step": 760 }, { "epoch": 1.43, "grad_norm": 3.7154366970062256, "learning_rate": 4.7589488930271054e-05, "loss": 0.1998, "step": 770 }, { "epoch": 1.45, "grad_norm": 7.739311695098877, "learning_rate": 4.748603351955307e-05, "loss": 0.218, "step": 780 }, { "epoch": 1.47, "grad_norm": 9.706314086914062, "learning_rate": 4.7382578108835096e-05, "loss": 0.1978, "step": 790 }, { "epoch": 1.49, "grad_norm": 4.060732841491699, "learning_rate": 4.7279122698117114e-05, "loss": 0.2043, "step": 800 }, { "epoch": 1.51, "grad_norm": 6.549215793609619, "learning_rate": 4.717566728739913e-05, "loss": 0.2051, "step": 810 }, { "epoch": 1.53, "grad_norm": 4.586370468139648, "learning_rate": 4.707221187668116e-05, "loss": 0.2025, "step": 820 }, { "epoch": 1.55, "grad_norm": 7.645646572113037, "learning_rate": 4.696875646596317e-05, "loss": 0.2121, "step": 830 }, { "epoch": 1.56, "grad_norm": 6.611913681030273, "learning_rate": 4.686530105524519e-05, "loss": 0.2095, "step": 840 }, { "epoch": 1.58, "grad_norm": 5.388148307800293, "learning_rate": 4.676184564452721e-05, "loss": 0.227, "step": 850 }, { "epoch": 1.6, "grad_norm": 6.008878231048584, "learning_rate": 4.665839023380923e-05, "loss": 0.1992, "step": 860 }, { "epoch": 1.62, "grad_norm": 4.902502536773682, "learning_rate": 4.655493482309125e-05, "loss": 0.2126, "step": 870 }, { "epoch": 1.64, "grad_norm": 4.861505508422852, "learning_rate": 4.6451479412373264e-05, "loss": 0.1905, "step": 880 }, { "epoch": 1.66, "grad_norm": 3.8657066822052, "learning_rate": 4.634802400165529e-05, "loss": 0.2017, "step": 890 }, { "epoch": 1.68, "grad_norm": 3.623135805130005, "learning_rate": 4.6244568590937306e-05, "loss": 0.184, "step": 900 }, { "epoch": 1.69, "grad_norm": 5.032660961151123, "learning_rate": 4.6141113180219324e-05, "loss": 0.2036, "step": 910 }, { "epoch": 1.71, "grad_norm": 5.572585105895996, "learning_rate": 4.603765776950135e-05, "loss": 0.2066, "step": 920 }, { "epoch": 1.73, "grad_norm": 5.036092281341553, "learning_rate": 4.5934202358783367e-05, "loss": 0.2003, "step": 930 }, { "epoch": 1.75, "grad_norm": 3.459541082382202, "learning_rate": 4.5830746948065384e-05, "loss": 0.2143, "step": 940 }, { "epoch": 1.77, "grad_norm": 3.976844072341919, "learning_rate": 4.57272915373474e-05, "loss": 0.18, "step": 950 }, { "epoch": 1.79, "grad_norm": 4.34367036819458, "learning_rate": 4.562383612662943e-05, "loss": 0.2025, "step": 960 }, { "epoch": 1.81, "grad_norm": 4.163506031036377, "learning_rate": 4.5520380715911445e-05, "loss": 0.1976, "step": 970 }, { "epoch": 1.82, "grad_norm": 5.492095947265625, "learning_rate": 4.541692530519346e-05, "loss": 0.2001, "step": 980 }, { "epoch": 1.84, "grad_norm": 5.207737445831299, "learning_rate": 4.531346989447549e-05, "loss": 0.2054, "step": 990 }, { "epoch": 1.86, "grad_norm": 6.426153659820557, "learning_rate": 4.52100144837575e-05, "loss": 0.1785, "step": 1000 }, { "epoch": 1.88, "grad_norm": 4.854528903961182, "learning_rate": 4.510655907303952e-05, "loss": 0.1873, "step": 1010 }, { "epoch": 1.9, "grad_norm": 5.651273727416992, "learning_rate": 4.500310366232154e-05, "loss": 0.2056, "step": 1020 }, { "epoch": 1.92, "grad_norm": 4.955221652984619, "learning_rate": 4.489964825160356e-05, "loss": 0.1934, "step": 1030 }, { "epoch": 1.94, "grad_norm": 4.257241725921631, "learning_rate": 4.479619284088558e-05, "loss": 0.1972, "step": 1040 }, { "epoch": 1.96, "grad_norm": 3.3766889572143555, "learning_rate": 4.4692737430167594e-05, "loss": 0.1789, "step": 1050 }, { "epoch": 1.97, "grad_norm": 4.304858684539795, "learning_rate": 4.458928201944962e-05, "loss": 0.1899, "step": 1060 }, { "epoch": 1.99, "grad_norm": 3.6677396297454834, "learning_rate": 4.448582660873164e-05, "loss": 0.1877, "step": 1070 }, { "epoch": 2.0, "eval_accuracy": 0.9550782528976491, "eval_loss": 0.11414149403572083, "eval_runtime": 76.7154, "eval_samples_per_second": 199.061, "eval_steps_per_second": 3.115, "step": 1074 }, { "epoch": 2.01, "grad_norm": 2.929933786392212, "learning_rate": 4.4382371198013655e-05, "loss": 0.1798, "step": 1080 }, { "epoch": 2.03, "grad_norm": 4.865507125854492, "learning_rate": 4.427891578729568e-05, "loss": 0.1895, "step": 1090 }, { "epoch": 2.05, "grad_norm": 5.74074125289917, "learning_rate": 4.41754603765777e-05, "loss": 0.1879, "step": 1100 }, { "epoch": 2.07, "grad_norm": 3.846959114074707, "learning_rate": 4.4072004965859715e-05, "loss": 0.1843, "step": 1110 }, { "epoch": 2.09, "grad_norm": 3.7884573936462402, "learning_rate": 4.396854955514173e-05, "loss": 0.1654, "step": 1120 }, { "epoch": 2.1, "grad_norm": 4.030633449554443, "learning_rate": 4.386509414442376e-05, "loss": 0.1754, "step": 1130 }, { "epoch": 2.12, "grad_norm": 4.759024620056152, "learning_rate": 4.3761638733705775e-05, "loss": 0.1804, "step": 1140 }, { "epoch": 2.14, "grad_norm": 5.641456604003906, "learning_rate": 4.365818332298779e-05, "loss": 0.1822, "step": 1150 }, { "epoch": 2.16, "grad_norm": 6.424627780914307, "learning_rate": 4.355472791226982e-05, "loss": 0.1904, "step": 1160 }, { "epoch": 2.18, "grad_norm": 3.8068325519561768, "learning_rate": 4.345127250155183e-05, "loss": 0.1623, "step": 1170 }, { "epoch": 2.2, "grad_norm": 8.320751190185547, "learning_rate": 4.334781709083385e-05, "loss": 0.171, "step": 1180 }, { "epoch": 2.22, "grad_norm": 3.5664634704589844, "learning_rate": 4.324436168011588e-05, "loss": 0.1674, "step": 1190 }, { "epoch": 2.23, "grad_norm": 5.247286319732666, "learning_rate": 4.314090626939789e-05, "loss": 0.1841, "step": 1200 }, { "epoch": 2.25, "grad_norm": 4.270170211791992, "learning_rate": 4.3037450858679914e-05, "loss": 0.1767, "step": 1210 }, { "epoch": 2.27, "grad_norm": 3.8757407665252686, "learning_rate": 4.2933995447961925e-05, "loss": 0.1763, "step": 1220 }, { "epoch": 2.29, "grad_norm": 3.8913145065307617, "learning_rate": 4.283054003724395e-05, "loss": 0.1938, "step": 1230 }, { "epoch": 2.31, "grad_norm": 5.493150234222412, "learning_rate": 4.2727084626525974e-05, "loss": 0.1815, "step": 1240 }, { "epoch": 2.33, "grad_norm": 3.1902143955230713, "learning_rate": 4.2623629215807985e-05, "loss": 0.1666, "step": 1250 }, { "epoch": 2.35, "grad_norm": 4.181407928466797, "learning_rate": 4.252017380509001e-05, "loss": 0.1856, "step": 1260 }, { "epoch": 2.36, "grad_norm": 3.092036724090576, "learning_rate": 4.241671839437203e-05, "loss": 0.1582, "step": 1270 }, { "epoch": 2.38, "grad_norm": 4.973979949951172, "learning_rate": 4.2313262983654045e-05, "loss": 0.2007, "step": 1280 }, { "epoch": 2.4, "grad_norm": 3.259059429168701, "learning_rate": 4.220980757293606e-05, "loss": 0.1729, "step": 1290 }, { "epoch": 2.42, "grad_norm": 3.7358877658843994, "learning_rate": 4.210635216221809e-05, "loss": 0.1996, "step": 1300 }, { "epoch": 2.44, "grad_norm": 4.496426582336426, "learning_rate": 4.2002896751500106e-05, "loss": 0.1711, "step": 1310 }, { "epoch": 2.46, "grad_norm": 4.290408611297607, "learning_rate": 4.1899441340782123e-05, "loss": 0.1773, "step": 1320 }, { "epoch": 2.48, "grad_norm": 5.771693229675293, "learning_rate": 4.179598593006415e-05, "loss": 0.1777, "step": 1330 }, { "epoch": 2.5, "grad_norm": 4.477756500244141, "learning_rate": 4.169253051934616e-05, "loss": 0.1731, "step": 1340 }, { "epoch": 2.51, "grad_norm": 4.2773895263671875, "learning_rate": 4.1589075108628184e-05, "loss": 0.185, "step": 1350 }, { "epoch": 2.53, "grad_norm": 5.028202533721924, "learning_rate": 4.148561969791021e-05, "loss": 0.1729, "step": 1360 }, { "epoch": 2.55, "grad_norm": 5.293006896972656, "learning_rate": 4.138216428719222e-05, "loss": 0.1714, "step": 1370 }, { "epoch": 2.57, "grad_norm": 5.110383987426758, "learning_rate": 4.1278708876474244e-05, "loss": 0.1697, "step": 1380 }, { "epoch": 2.59, "grad_norm": 4.646900653839111, "learning_rate": 4.1175253465756255e-05, "loss": 0.189, "step": 1390 }, { "epoch": 2.61, "grad_norm": 3.653062343597412, "learning_rate": 4.107179805503828e-05, "loss": 0.1619, "step": 1400 }, { "epoch": 2.63, "grad_norm": 4.054281711578369, "learning_rate": 4.0968342644320304e-05, "loss": 0.1767, "step": 1410 }, { "epoch": 2.64, "grad_norm": 2.8915677070617676, "learning_rate": 4.0864887233602315e-05, "loss": 0.1565, "step": 1420 }, { "epoch": 2.66, "grad_norm": 6.253181457519531, "learning_rate": 4.076143182288434e-05, "loss": 0.1844, "step": 1430 }, { "epoch": 2.68, "grad_norm": 4.847060680389404, "learning_rate": 4.065797641216636e-05, "loss": 0.1653, "step": 1440 }, { "epoch": 2.7, "grad_norm": 6.430269718170166, "learning_rate": 4.0554521001448376e-05, "loss": 0.1701, "step": 1450 }, { "epoch": 2.72, "grad_norm": 8.237833023071289, "learning_rate": 4.04510655907304e-05, "loss": 0.1612, "step": 1460 }, { "epoch": 2.74, "grad_norm": 4.337214946746826, "learning_rate": 4.034761018001242e-05, "loss": 0.1699, "step": 1470 }, { "epoch": 2.76, "grad_norm": 2.5487866401672363, "learning_rate": 4.0244154769294436e-05, "loss": 0.1658, "step": 1480 }, { "epoch": 2.77, "grad_norm": 5.610079765319824, "learning_rate": 4.0140699358576454e-05, "loss": 0.1591, "step": 1490 }, { "epoch": 2.79, "grad_norm": 3.776702642440796, "learning_rate": 4.003724394785848e-05, "loss": 0.1542, "step": 1500 }, { "epoch": 2.81, "grad_norm": 7.976995944976807, "learning_rate": 3.9933788537140496e-05, "loss": 0.1641, "step": 1510 }, { "epoch": 2.83, "grad_norm": 3.6274945735931396, "learning_rate": 3.9830333126422514e-05, "loss": 0.1654, "step": 1520 }, { "epoch": 2.85, "grad_norm": 4.607425212860107, "learning_rate": 3.972687771570454e-05, "loss": 0.1819, "step": 1530 }, { "epoch": 2.87, "grad_norm": 3.462461471557617, "learning_rate": 3.962342230498655e-05, "loss": 0.1622, "step": 1540 }, { "epoch": 2.89, "grad_norm": 5.016781330108643, "learning_rate": 3.9519966894268574e-05, "loss": 0.1422, "step": 1550 }, { "epoch": 2.91, "grad_norm": 5.226962566375732, "learning_rate": 3.9416511483550586e-05, "loss": 0.1747, "step": 1560 }, { "epoch": 2.92, "grad_norm": 4.826117038726807, "learning_rate": 3.931305607283261e-05, "loss": 0.1582, "step": 1570 }, { "epoch": 2.94, "grad_norm": 5.275283336639404, "learning_rate": 3.9209600662114635e-05, "loss": 0.1597, "step": 1580 }, { "epoch": 2.96, "grad_norm": 4.76600980758667, "learning_rate": 3.9106145251396646e-05, "loss": 0.1685, "step": 1590 }, { "epoch": 2.98, "grad_norm": 4.5946946144104, "learning_rate": 3.900268984067867e-05, "loss": 0.1431, "step": 1600 }, { "epoch": 3.0, "grad_norm": 3.286769151687622, "learning_rate": 3.889923442996069e-05, "loss": 0.1574, "step": 1610 }, { "epoch": 3.0, "eval_accuracy": 0.9461070001964508, "eval_loss": 0.13586224615573883, "eval_runtime": 75.9342, "eval_samples_per_second": 201.108, "eval_steps_per_second": 3.147, "step": 1611 }, { "epoch": 3.02, "grad_norm": 4.025867938995361, "learning_rate": 3.8795779019242706e-05, "loss": 0.1492, "step": 1620 }, { "epoch": 3.04, "grad_norm": 5.146902084350586, "learning_rate": 3.869232360852473e-05, "loss": 0.1639, "step": 1630 }, { "epoch": 3.05, "grad_norm": 4.2595906257629395, "learning_rate": 3.858886819780675e-05, "loss": 0.1614, "step": 1640 }, { "epoch": 3.07, "grad_norm": 3.0910167694091797, "learning_rate": 3.8485412787088766e-05, "loss": 0.1676, "step": 1650 }, { "epoch": 3.09, "grad_norm": 2.9316298961639404, "learning_rate": 3.8381957376370784e-05, "loss": 0.1585, "step": 1660 }, { "epoch": 3.11, "grad_norm": 6.207183361053467, "learning_rate": 3.827850196565281e-05, "loss": 0.1454, "step": 1670 }, { "epoch": 3.13, "grad_norm": 6.1695170402526855, "learning_rate": 3.817504655493483e-05, "loss": 0.156, "step": 1680 }, { "epoch": 3.15, "grad_norm": 4.578737258911133, "learning_rate": 3.8071591144216845e-05, "loss": 0.1499, "step": 1690 }, { "epoch": 3.17, "grad_norm": 3.303900957107544, "learning_rate": 3.796813573349887e-05, "loss": 0.163, "step": 1700 }, { "epoch": 3.18, "grad_norm": 4.704057216644287, "learning_rate": 3.786468032278088e-05, "loss": 0.1595, "step": 1710 }, { "epoch": 3.2, "grad_norm": 6.2445068359375, "learning_rate": 3.7761224912062905e-05, "loss": 0.1547, "step": 1720 }, { "epoch": 3.22, "grad_norm": 4.816792964935303, "learning_rate": 3.765776950134492e-05, "loss": 0.1509, "step": 1730 }, { "epoch": 3.24, "grad_norm": 3.5534775257110596, "learning_rate": 3.755431409062694e-05, "loss": 0.1504, "step": 1740 }, { "epoch": 3.26, "grad_norm": 3.065899133682251, "learning_rate": 3.7450858679908965e-05, "loss": 0.1491, "step": 1750 }, { "epoch": 3.28, "grad_norm": 5.379620552062988, "learning_rate": 3.7347403269190976e-05, "loss": 0.1605, "step": 1760 }, { "epoch": 3.3, "grad_norm": 3.993555784225464, "learning_rate": 3.7243947858473e-05, "loss": 0.1516, "step": 1770 }, { "epoch": 3.31, "grad_norm": 3.0113396644592285, "learning_rate": 3.714049244775502e-05, "loss": 0.1487, "step": 1780 }, { "epoch": 3.33, "grad_norm": 8.80622673034668, "learning_rate": 3.7037037037037037e-05, "loss": 0.1587, "step": 1790 }, { "epoch": 3.35, "grad_norm": 3.0601558685302734, "learning_rate": 3.693358162631906e-05, "loss": 0.1523, "step": 1800 }, { "epoch": 3.37, "grad_norm": 3.8811893463134766, "learning_rate": 3.683012621560108e-05, "loss": 0.1665, "step": 1810 }, { "epoch": 3.39, "grad_norm": 5.455690860748291, "learning_rate": 3.67266708048831e-05, "loss": 0.1458, "step": 1820 }, { "epoch": 3.41, "grad_norm": 2.9093761444091797, "learning_rate": 3.6623215394165115e-05, "loss": 0.1524, "step": 1830 }, { "epoch": 3.43, "grad_norm": 5.275539398193359, "learning_rate": 3.651975998344714e-05, "loss": 0.1551, "step": 1840 }, { "epoch": 3.45, "grad_norm": 3.3126418590545654, "learning_rate": 3.641630457272916e-05, "loss": 0.1532, "step": 1850 }, { "epoch": 3.46, "grad_norm": 4.406727313995361, "learning_rate": 3.6312849162011175e-05, "loss": 0.1535, "step": 1860 }, { "epoch": 3.48, "grad_norm": 2.409686803817749, "learning_rate": 3.62093937512932e-05, "loss": 0.1617, "step": 1870 }, { "epoch": 3.5, "grad_norm": 3.572361946105957, "learning_rate": 3.610593834057521e-05, "loss": 0.1466, "step": 1880 }, { "epoch": 3.52, "grad_norm": 4.148622035980225, "learning_rate": 3.6002482929857235e-05, "loss": 0.1473, "step": 1890 }, { "epoch": 3.54, "grad_norm": 2.6833081245422363, "learning_rate": 3.589902751913925e-05, "loss": 0.1533, "step": 1900 }, { "epoch": 3.56, "grad_norm": 4.358566761016846, "learning_rate": 3.579557210842127e-05, "loss": 0.159, "step": 1910 }, { "epoch": 3.58, "grad_norm": 2.758660078048706, "learning_rate": 3.5692116697703296e-05, "loss": 0.152, "step": 1920 }, { "epoch": 3.59, "grad_norm": 6.929190158843994, "learning_rate": 3.558866128698531e-05, "loss": 0.1512, "step": 1930 }, { "epoch": 3.61, "grad_norm": 3.7686049938201904, "learning_rate": 3.548520587626733e-05, "loss": 0.1507, "step": 1940 }, { "epoch": 3.63, "grad_norm": 3.968449592590332, "learning_rate": 3.538175046554935e-05, "loss": 0.1463, "step": 1950 }, { "epoch": 3.65, "grad_norm": 3.7159385681152344, "learning_rate": 3.527829505483137e-05, "loss": 0.1545, "step": 1960 }, { "epoch": 3.67, "grad_norm": 7.977210521697998, "learning_rate": 3.517483964411339e-05, "loss": 0.1478, "step": 1970 }, { "epoch": 3.69, "grad_norm": 4.873678207397461, "learning_rate": 3.507138423339541e-05, "loss": 0.145, "step": 1980 }, { "epoch": 3.71, "grad_norm": 5.451579570770264, "learning_rate": 3.496792882267743e-05, "loss": 0.1484, "step": 1990 }, { "epoch": 3.72, "grad_norm": 3.9063429832458496, "learning_rate": 3.4864473411959445e-05, "loss": 0.1453, "step": 2000 }, { "epoch": 3.74, "grad_norm": 3.2299928665161133, "learning_rate": 3.476101800124147e-05, "loss": 0.1546, "step": 2010 }, { "epoch": 3.76, "grad_norm": 4.653662204742432, "learning_rate": 3.465756259052349e-05, "loss": 0.1419, "step": 2020 }, { "epoch": 3.78, "grad_norm": 3.974182367324829, "learning_rate": 3.4554107179805505e-05, "loss": 0.1388, "step": 2030 }, { "epoch": 3.8, "grad_norm": 5.028197765350342, "learning_rate": 3.445065176908753e-05, "loss": 0.1664, "step": 2040 }, { "epoch": 3.82, "grad_norm": 4.503687381744385, "learning_rate": 3.434719635836954e-05, "loss": 0.1506, "step": 2050 }, { "epoch": 3.84, "grad_norm": 3.6866469383239746, "learning_rate": 3.4243740947651566e-05, "loss": 0.1351, "step": 2060 }, { "epoch": 3.85, "grad_norm": 2.9063117504119873, "learning_rate": 3.4140285536933584e-05, "loss": 0.1427, "step": 2070 }, { "epoch": 3.87, "grad_norm": 4.720154762268066, "learning_rate": 3.40368301262156e-05, "loss": 0.1422, "step": 2080 }, { "epoch": 3.89, "grad_norm": 4.307085990905762, "learning_rate": 3.3933374715497626e-05, "loss": 0.1442, "step": 2090 }, { "epoch": 3.91, "grad_norm": 3.0267748832702637, "learning_rate": 3.382991930477964e-05, "loss": 0.1479, "step": 2100 }, { "epoch": 3.93, "grad_norm": 4.804783344268799, "learning_rate": 3.372646389406166e-05, "loss": 0.1515, "step": 2110 }, { "epoch": 3.95, "grad_norm": 6.450136184692383, "learning_rate": 3.362300848334368e-05, "loss": 0.1523, "step": 2120 }, { "epoch": 3.97, "grad_norm": 4.83671236038208, "learning_rate": 3.35195530726257e-05, "loss": 0.1428, "step": 2130 }, { "epoch": 3.99, "grad_norm": 3.104628324508667, "learning_rate": 3.341609766190772e-05, "loss": 0.1412, "step": 2140 }, { "epoch": 4.0, "eval_accuracy": 0.9521969746578482, "eval_loss": 0.12448973953723907, "eval_runtime": 75.9779, "eval_samples_per_second": 200.993, "eval_steps_per_second": 3.146, "step": 2148 }, { "epoch": 4.0, "grad_norm": 3.8630151748657227, "learning_rate": 3.331264225118974e-05, "loss": 0.1589, "step": 2150 }, { "epoch": 4.02, "grad_norm": 4.394106388092041, "learning_rate": 3.320918684047176e-05, "loss": 0.1574, "step": 2160 }, { "epoch": 4.04, "grad_norm": 3.3176400661468506, "learning_rate": 3.3105731429753776e-05, "loss": 0.1453, "step": 2170 }, { "epoch": 4.06, "grad_norm": 3.7267205715179443, "learning_rate": 3.30022760190358e-05, "loss": 0.1386, "step": 2180 }, { "epoch": 4.08, "grad_norm": 4.1653313636779785, "learning_rate": 3.289882060831782e-05, "loss": 0.1438, "step": 2190 }, { "epoch": 4.1, "grad_norm": 3.0098297595977783, "learning_rate": 3.2795365197599836e-05, "loss": 0.1407, "step": 2200 }, { "epoch": 4.12, "grad_norm": 4.614931106567383, "learning_rate": 3.269190978688186e-05, "loss": 0.1359, "step": 2210 }, { "epoch": 4.13, "grad_norm": 2.9281229972839355, "learning_rate": 3.258845437616387e-05, "loss": 0.1457, "step": 2220 }, { "epoch": 4.15, "grad_norm": 3.3955490589141846, "learning_rate": 3.2484998965445896e-05, "loss": 0.1272, "step": 2230 }, { "epoch": 4.17, "grad_norm": 3.2423532009124756, "learning_rate": 3.2381543554727914e-05, "loss": 0.1287, "step": 2240 }, { "epoch": 4.19, "grad_norm": 3.0419697761535645, "learning_rate": 3.227808814400993e-05, "loss": 0.1415, "step": 2250 }, { "epoch": 4.21, "grad_norm": 3.408339738845825, "learning_rate": 3.2174632733291956e-05, "loss": 0.1391, "step": 2260 }, { "epoch": 4.23, "grad_norm": 6.2242350578308105, "learning_rate": 3.207117732257397e-05, "loss": 0.1465, "step": 2270 }, { "epoch": 4.25, "grad_norm": 2.413308620452881, "learning_rate": 3.196772191185599e-05, "loss": 0.1311, "step": 2280 }, { "epoch": 4.26, "grad_norm": 4.887659549713135, "learning_rate": 3.186426650113801e-05, "loss": 0.1389, "step": 2290 }, { "epoch": 4.28, "grad_norm": 3.2473926544189453, "learning_rate": 3.176081109042003e-05, "loss": 0.1502, "step": 2300 }, { "epoch": 4.3, "grad_norm": 4.212912559509277, "learning_rate": 3.165735567970205e-05, "loss": 0.1313, "step": 2310 }, { "epoch": 4.32, "grad_norm": 4.970630645751953, "learning_rate": 3.155390026898407e-05, "loss": 0.137, "step": 2320 }, { "epoch": 4.34, "grad_norm": 2.669292688369751, "learning_rate": 3.145044485826609e-05, "loss": 0.1447, "step": 2330 }, { "epoch": 4.36, "grad_norm": 4.9535980224609375, "learning_rate": 3.1346989447548106e-05, "loss": 0.1385, "step": 2340 }, { "epoch": 4.38, "grad_norm": 4.139229774475098, "learning_rate": 3.124353403683013e-05, "loss": 0.1275, "step": 2350 }, { "epoch": 4.39, "grad_norm": 5.44993782043457, "learning_rate": 3.114007862611215e-05, "loss": 0.1364, "step": 2360 }, { "epoch": 4.41, "grad_norm": 2.844508171081543, "learning_rate": 3.1036623215394166e-05, "loss": 0.1359, "step": 2370 }, { "epoch": 4.43, "grad_norm": 3.190946578979492, "learning_rate": 3.093316780467619e-05, "loss": 0.1318, "step": 2380 }, { "epoch": 4.45, "grad_norm": 6.107606887817383, "learning_rate": 3.08297123939582e-05, "loss": 0.1169, "step": 2390 }, { "epoch": 4.47, "grad_norm": 3.5856211185455322, "learning_rate": 3.0726256983240227e-05, "loss": 0.1481, "step": 2400 }, { "epoch": 4.49, "grad_norm": 3.9483227729797363, "learning_rate": 3.0622801572522244e-05, "loss": 0.1397, "step": 2410 }, { "epoch": 4.51, "grad_norm": 4.955249786376953, "learning_rate": 3.051934616180426e-05, "loss": 0.1427, "step": 2420 }, { "epoch": 4.53, "grad_norm": 4.119729995727539, "learning_rate": 3.0415890751086283e-05, "loss": 0.1265, "step": 2430 }, { "epoch": 4.54, "grad_norm": 3.9766499996185303, "learning_rate": 3.03124353403683e-05, "loss": 0.1184, "step": 2440 }, { "epoch": 4.56, "grad_norm": 3.8164212703704834, "learning_rate": 3.0208979929650323e-05, "loss": 0.1247, "step": 2450 }, { "epoch": 4.58, "grad_norm": 3.525179862976074, "learning_rate": 3.0105524518932344e-05, "loss": 0.1415, "step": 2460 }, { "epoch": 4.6, "grad_norm": 3.0888924598693848, "learning_rate": 3.0002069108214358e-05, "loss": 0.1319, "step": 2470 }, { "epoch": 4.62, "grad_norm": 4.304419040679932, "learning_rate": 2.989861369749638e-05, "loss": 0.1439, "step": 2480 }, { "epoch": 4.64, "grad_norm": 8.166552543640137, "learning_rate": 2.97951582867784e-05, "loss": 0.1331, "step": 2490 }, { "epoch": 4.66, "grad_norm": 2.1165900230407715, "learning_rate": 2.969170287606042e-05, "loss": 0.1406, "step": 2500 }, { "epoch": 4.67, "grad_norm": 5.2040557861328125, "learning_rate": 2.958824746534244e-05, "loss": 0.1234, "step": 2510 }, { "epoch": 4.69, "grad_norm": 3.8155689239501953, "learning_rate": 2.948479205462446e-05, "loss": 0.1159, "step": 2520 }, { "epoch": 4.71, "grad_norm": 7.363270282745361, "learning_rate": 2.9381336643906475e-05, "loss": 0.1306, "step": 2530 }, { "epoch": 4.73, "grad_norm": 4.376212120056152, "learning_rate": 2.9277881233188497e-05, "loss": 0.1352, "step": 2540 }, { "epoch": 4.75, "grad_norm": 4.796603679656982, "learning_rate": 2.9174425822470518e-05, "loss": 0.1363, "step": 2550 }, { "epoch": 4.77, "grad_norm": 3.4734084606170654, "learning_rate": 2.9070970411752536e-05, "loss": 0.127, "step": 2560 }, { "epoch": 4.79, "grad_norm": 4.342591285705566, "learning_rate": 2.8967515001034557e-05, "loss": 0.1264, "step": 2570 }, { "epoch": 4.8, "grad_norm": 4.9679856300354, "learning_rate": 2.886405959031657e-05, "loss": 0.1397, "step": 2580 }, { "epoch": 4.82, "grad_norm": 3.1558454036712646, "learning_rate": 2.8760604179598593e-05, "loss": 0.1304, "step": 2590 }, { "epoch": 4.84, "grad_norm": 4.197761535644531, "learning_rate": 2.8657148768880614e-05, "loss": 0.1192, "step": 2600 }, { "epoch": 4.86, "grad_norm": 3.9379701614379883, "learning_rate": 2.8553693358162632e-05, "loss": 0.1355, "step": 2610 }, { "epoch": 4.88, "grad_norm": 4.20279598236084, "learning_rate": 2.8450237947444653e-05, "loss": 0.1333, "step": 2620 }, { "epoch": 4.9, "grad_norm": 5.210755348205566, "learning_rate": 2.8346782536726674e-05, "loss": 0.1239, "step": 2630 }, { "epoch": 4.92, "grad_norm": 5.406430244445801, "learning_rate": 2.824332712600869e-05, "loss": 0.1469, "step": 2640 }, { "epoch": 4.93, "grad_norm": 5.022087097167969, "learning_rate": 2.813987171529071e-05, "loss": 0.1207, "step": 2650 }, { "epoch": 4.95, "grad_norm": 3.3133649826049805, "learning_rate": 2.803641630457273e-05, "loss": 0.1271, "step": 2660 }, { "epoch": 4.97, "grad_norm": 3.654719591140747, "learning_rate": 2.793296089385475e-05, "loss": 0.1287, "step": 2670 }, { "epoch": 4.99, "grad_norm": 4.803737640380859, "learning_rate": 2.782950548313677e-05, "loss": 0.1289, "step": 2680 }, { "epoch": 5.0, "eval_accuracy": 0.9704668980420404, "eval_loss": 0.07738856226205826, "eval_runtime": 76.5353, "eval_samples_per_second": 199.529, "eval_steps_per_second": 3.123, "step": 2685 }, { "epoch": 5.01, "grad_norm": 4.131795406341553, "learning_rate": 2.772605007241879e-05, "loss": 0.1206, "step": 2690 }, { "epoch": 5.03, "grad_norm": 3.679658889770508, "learning_rate": 2.7622594661700806e-05, "loss": 0.1291, "step": 2700 }, { "epoch": 5.05, "grad_norm": 3.8007965087890625, "learning_rate": 2.7519139250982827e-05, "loss": 0.1119, "step": 2710 }, { "epoch": 5.07, "grad_norm": 4.17035436630249, "learning_rate": 2.741568384026485e-05, "loss": 0.1317, "step": 2720 }, { "epoch": 5.08, "grad_norm": 3.355526924133301, "learning_rate": 2.7312228429546866e-05, "loss": 0.1381, "step": 2730 }, { "epoch": 5.1, "grad_norm": 5.8981547355651855, "learning_rate": 2.7208773018828887e-05, "loss": 0.1296, "step": 2740 }, { "epoch": 5.12, "grad_norm": 2.47714900970459, "learning_rate": 2.7105317608110902e-05, "loss": 0.131, "step": 2750 }, { "epoch": 5.14, "grad_norm": 4.262291431427002, "learning_rate": 2.7001862197392923e-05, "loss": 0.1357, "step": 2760 }, { "epoch": 5.16, "grad_norm": 4.63747501373291, "learning_rate": 2.6898406786674944e-05, "loss": 0.1171, "step": 2770 }, { "epoch": 5.18, "grad_norm": 3.2632124423980713, "learning_rate": 2.6794951375956962e-05, "loss": 0.1307, "step": 2780 }, { "epoch": 5.2, "grad_norm": 4.751256942749023, "learning_rate": 2.6691495965238983e-05, "loss": 0.112, "step": 2790 }, { "epoch": 5.21, "grad_norm": 7.088289737701416, "learning_rate": 2.6588040554521005e-05, "loss": 0.1389, "step": 2800 }, { "epoch": 5.23, "grad_norm": 2.923245906829834, "learning_rate": 2.648458514380302e-05, "loss": 0.133, "step": 2810 }, { "epoch": 5.25, "grad_norm": 3.6290907859802246, "learning_rate": 2.638112973308504e-05, "loss": 0.1215, "step": 2820 }, { "epoch": 5.27, "grad_norm": 4.726309299468994, "learning_rate": 2.627767432236706e-05, "loss": 0.124, "step": 2830 }, { "epoch": 5.29, "grad_norm": 3.569528818130493, "learning_rate": 2.617421891164908e-05, "loss": 0.131, "step": 2840 }, { "epoch": 5.31, "grad_norm": 2.8678665161132812, "learning_rate": 2.60707635009311e-05, "loss": 0.1346, "step": 2850 }, { "epoch": 5.33, "grad_norm": 6.845192909240723, "learning_rate": 2.5967308090213122e-05, "loss": 0.142, "step": 2860 }, { "epoch": 5.34, "grad_norm": 3.2927472591400146, "learning_rate": 2.5863852679495136e-05, "loss": 0.1229, "step": 2870 }, { "epoch": 5.36, "grad_norm": 3.8850090503692627, "learning_rate": 2.5760397268777158e-05, "loss": 0.1057, "step": 2880 }, { "epoch": 5.38, "grad_norm": 4.47546911239624, "learning_rate": 2.565694185805918e-05, "loss": 0.1292, "step": 2890 }, { "epoch": 5.4, "grad_norm": 2.9944636821746826, "learning_rate": 2.5553486447341197e-05, "loss": 0.1298, "step": 2900 }, { "epoch": 5.42, "grad_norm": 3.4300310611724854, "learning_rate": 2.5450031036623218e-05, "loss": 0.1285, "step": 2910 }, { "epoch": 5.44, "grad_norm": 3.3256707191467285, "learning_rate": 2.5346575625905232e-05, "loss": 0.1253, "step": 2920 }, { "epoch": 5.46, "grad_norm": 4.314760684967041, "learning_rate": 2.5243120215187254e-05, "loss": 0.1353, "step": 2930 }, { "epoch": 5.47, "grad_norm": 5.66748571395874, "learning_rate": 2.5139664804469275e-05, "loss": 0.1284, "step": 2940 }, { "epoch": 5.49, "grad_norm": 4.710278511047363, "learning_rate": 2.5036209393751293e-05, "loss": 0.1339, "step": 2950 }, { "epoch": 5.51, "grad_norm": 2.889969825744629, "learning_rate": 2.4932753983033314e-05, "loss": 0.1213, "step": 2960 }, { "epoch": 5.53, "grad_norm": 5.408463001251221, "learning_rate": 2.482929857231533e-05, "loss": 0.1343, "step": 2970 }, { "epoch": 5.55, "grad_norm": 2.5208628177642822, "learning_rate": 2.4725843161597353e-05, "loss": 0.1121, "step": 2980 }, { "epoch": 5.57, "grad_norm": 3.910186290740967, "learning_rate": 2.462238775087937e-05, "loss": 0.1201, "step": 2990 }, { "epoch": 5.59, "grad_norm": 2.9305076599121094, "learning_rate": 2.4518932340161392e-05, "loss": 0.1188, "step": 3000 }, { "epoch": 5.61, "grad_norm": 3.034980297088623, "learning_rate": 2.4415476929443413e-05, "loss": 0.121, "step": 3010 }, { "epoch": 5.62, "grad_norm": 4.653752326965332, "learning_rate": 2.431202151872543e-05, "loss": 0.1283, "step": 3020 }, { "epoch": 5.64, "grad_norm": 3.6336913108825684, "learning_rate": 2.420856610800745e-05, "loss": 0.111, "step": 3030 }, { "epoch": 5.66, "grad_norm": 3.5738136768341064, "learning_rate": 2.4105110697289467e-05, "loss": 0.1179, "step": 3040 }, { "epoch": 5.68, "grad_norm": 2.7753243446350098, "learning_rate": 2.4001655286571488e-05, "loss": 0.1122, "step": 3050 }, { "epoch": 5.7, "grad_norm": 3.7840399742126465, "learning_rate": 2.389819987585351e-05, "loss": 0.117, "step": 3060 }, { "epoch": 5.72, "grad_norm": 4.982550144195557, "learning_rate": 2.3794744465135527e-05, "loss": 0.1179, "step": 3070 }, { "epoch": 5.74, "grad_norm": 2.420515775680542, "learning_rate": 2.3691289054417548e-05, "loss": 0.1368, "step": 3080 }, { "epoch": 5.75, "grad_norm": 3.5275652408599854, "learning_rate": 2.3587833643699566e-05, "loss": 0.1249, "step": 3090 }, { "epoch": 5.77, "grad_norm": 4.064232349395752, "learning_rate": 2.3484378232981584e-05, "loss": 0.1314, "step": 3100 }, { "epoch": 5.79, "grad_norm": 5.377870082855225, "learning_rate": 2.3380922822263605e-05, "loss": 0.1215, "step": 3110 }, { "epoch": 5.81, "grad_norm": 3.4903948307037354, "learning_rate": 2.3277467411545626e-05, "loss": 0.1182, "step": 3120 }, { "epoch": 5.83, "grad_norm": 6.624187469482422, "learning_rate": 2.3174012000827644e-05, "loss": 0.1233, "step": 3130 }, { "epoch": 5.85, "grad_norm": 4.476204872131348, "learning_rate": 2.3070556590109662e-05, "loss": 0.1284, "step": 3140 }, { "epoch": 5.87, "grad_norm": 2.996946096420288, "learning_rate": 2.2967101179391683e-05, "loss": 0.135, "step": 3150 }, { "epoch": 5.88, "grad_norm": 4.674262046813965, "learning_rate": 2.28636457686737e-05, "loss": 0.1167, "step": 3160 }, { "epoch": 5.9, "grad_norm": 3.4972784519195557, "learning_rate": 2.2760190357955722e-05, "loss": 0.1251, "step": 3170 }, { "epoch": 5.92, "grad_norm": 3.1503241062164307, "learning_rate": 2.2656734947237744e-05, "loss": 0.1169, "step": 3180 }, { "epoch": 5.94, "grad_norm": 3.190443277359009, "learning_rate": 2.255327953651976e-05, "loss": 0.1355, "step": 3190 }, { "epoch": 5.96, "grad_norm": 4.845892429351807, "learning_rate": 2.244982412580178e-05, "loss": 0.1268, "step": 3200 }, { "epoch": 5.98, "grad_norm": 3.408785343170166, "learning_rate": 2.2346368715083797e-05, "loss": 0.1153, "step": 3210 }, { "epoch": 6.0, "grad_norm": 3.0129294395446777, "learning_rate": 2.224291330436582e-05, "loss": 0.1116, "step": 3220 }, { "epoch": 6.0, "eval_accuracy": 0.9663414314714164, "eval_loss": 0.08886239677667618, "eval_runtime": 76.0016, "eval_samples_per_second": 200.93, "eval_steps_per_second": 3.145, "step": 3222 }, { "epoch": 6.01, "grad_norm": 4.935642242431641, "learning_rate": 2.213945789364784e-05, "loss": 0.1182, "step": 3230 }, { "epoch": 6.03, "grad_norm": 4.62550163269043, "learning_rate": 2.2036002482929857e-05, "loss": 0.105, "step": 3240 }, { "epoch": 6.05, "grad_norm": 5.272533416748047, "learning_rate": 2.193254707221188e-05, "loss": 0.1216, "step": 3250 }, { "epoch": 6.07, "grad_norm": 3.5938615798950195, "learning_rate": 2.1829091661493897e-05, "loss": 0.1134, "step": 3260 }, { "epoch": 6.09, "grad_norm": 3.716996431350708, "learning_rate": 2.1725636250775914e-05, "loss": 0.117, "step": 3270 }, { "epoch": 6.11, "grad_norm": 2.794499158859253, "learning_rate": 2.162218084005794e-05, "loss": 0.1197, "step": 3280 }, { "epoch": 6.13, "grad_norm": 3.517066717147827, "learning_rate": 2.1518725429339957e-05, "loss": 0.1158, "step": 3290 }, { "epoch": 6.15, "grad_norm": 3.3488523960113525, "learning_rate": 2.1415270018621975e-05, "loss": 0.1083, "step": 3300 }, { "epoch": 6.16, "grad_norm": 4.872901916503906, "learning_rate": 2.1311814607903992e-05, "loss": 0.1234, "step": 3310 }, { "epoch": 6.18, "grad_norm": 5.1622633934021, "learning_rate": 2.1208359197186014e-05, "loss": 0.1155, "step": 3320 }, { "epoch": 6.2, "grad_norm": 3.0708415508270264, "learning_rate": 2.110490378646803e-05, "loss": 0.1293, "step": 3330 }, { "epoch": 6.22, "grad_norm": 5.712008953094482, "learning_rate": 2.1001448375750053e-05, "loss": 0.1295, "step": 3340 }, { "epoch": 6.24, "grad_norm": 2.7026169300079346, "learning_rate": 2.0897992965032074e-05, "loss": 0.1242, "step": 3350 }, { "epoch": 6.26, "grad_norm": 2.5105152130126953, "learning_rate": 2.0794537554314092e-05, "loss": 0.1239, "step": 3360 }, { "epoch": 6.28, "grad_norm": 3.213020086288452, "learning_rate": 2.069108214359611e-05, "loss": 0.1108, "step": 3370 }, { "epoch": 6.29, "grad_norm": 4.593565940856934, "learning_rate": 2.0587626732878128e-05, "loss": 0.1085, "step": 3380 }, { "epoch": 6.31, "grad_norm": 4.210085868835449, "learning_rate": 2.0484171322160152e-05, "loss": 0.1153, "step": 3390 }, { "epoch": 6.33, "grad_norm": 3.647468328475952, "learning_rate": 2.038071591144217e-05, "loss": 0.1139, "step": 3400 }, { "epoch": 6.35, "grad_norm": 3.584791898727417, "learning_rate": 2.0277260500724188e-05, "loss": 0.1083, "step": 3410 }, { "epoch": 6.37, "grad_norm": 3.0671119689941406, "learning_rate": 2.017380509000621e-05, "loss": 0.1039, "step": 3420 }, { "epoch": 6.39, "grad_norm": 4.143247127532959, "learning_rate": 2.0070349679288227e-05, "loss": 0.1217, "step": 3430 }, { "epoch": 6.41, "grad_norm": 3.869572401046753, "learning_rate": 1.9966894268570248e-05, "loss": 0.1136, "step": 3440 }, { "epoch": 6.42, "grad_norm": 3.644425630569458, "learning_rate": 1.986343885785227e-05, "loss": 0.1107, "step": 3450 }, { "epoch": 6.44, "grad_norm": 4.0842814445495605, "learning_rate": 1.9759983447134287e-05, "loss": 0.113, "step": 3460 }, { "epoch": 6.46, "grad_norm": 4.737167835235596, "learning_rate": 1.9656528036416305e-05, "loss": 0.118, "step": 3470 }, { "epoch": 6.48, "grad_norm": 4.954039573669434, "learning_rate": 1.9553072625698323e-05, "loss": 0.1103, "step": 3480 }, { "epoch": 6.5, "grad_norm": 3.720627784729004, "learning_rate": 1.9449617214980344e-05, "loss": 0.1165, "step": 3490 }, { "epoch": 6.52, "grad_norm": 4.383377552032471, "learning_rate": 1.9346161804262365e-05, "loss": 0.1317, "step": 3500 }, { "epoch": 6.54, "grad_norm": 2.1662657260894775, "learning_rate": 1.9242706393544383e-05, "loss": 0.1094, "step": 3510 }, { "epoch": 6.55, "grad_norm": 3.569554328918457, "learning_rate": 1.9139250982826404e-05, "loss": 0.1179, "step": 3520 }, { "epoch": 6.57, "grad_norm": 3.2241714000701904, "learning_rate": 1.9035795572108422e-05, "loss": 0.1194, "step": 3530 }, { "epoch": 6.59, "grad_norm": 3.6238088607788086, "learning_rate": 1.893234016139044e-05, "loss": 0.1125, "step": 3540 }, { "epoch": 6.61, "grad_norm": 4.729239463806152, "learning_rate": 1.882888475067246e-05, "loss": 0.1197, "step": 3550 }, { "epoch": 6.63, "grad_norm": 3.336503744125366, "learning_rate": 1.8725429339954483e-05, "loss": 0.1158, "step": 3560 }, { "epoch": 6.65, "grad_norm": 2.9191136360168457, "learning_rate": 1.86219739292365e-05, "loss": 0.0991, "step": 3570 }, { "epoch": 6.67, "grad_norm": 4.706370830535889, "learning_rate": 1.8518518518518518e-05, "loss": 0.123, "step": 3580 }, { "epoch": 6.69, "grad_norm": 3.0669870376586914, "learning_rate": 1.841506310780054e-05, "loss": 0.1188, "step": 3590 }, { "epoch": 6.7, "grad_norm": 3.902052402496338, "learning_rate": 1.8311607697082557e-05, "loss": 0.1104, "step": 3600 }, { "epoch": 6.72, "grad_norm": 5.678684711456299, "learning_rate": 1.820815228636458e-05, "loss": 0.1058, "step": 3610 }, { "epoch": 6.74, "grad_norm": 4.781716823577881, "learning_rate": 1.81046968756466e-05, "loss": 0.1265, "step": 3620 }, { "epoch": 6.76, "grad_norm": 4.41150426864624, "learning_rate": 1.8001241464928618e-05, "loss": 0.1209, "step": 3630 }, { "epoch": 6.78, "grad_norm": 3.2814714908599854, "learning_rate": 1.7897786054210635e-05, "loss": 0.1039, "step": 3640 }, { "epoch": 6.8, "grad_norm": 3.8997206687927246, "learning_rate": 1.7794330643492653e-05, "loss": 0.1263, "step": 3650 }, { "epoch": 6.82, "grad_norm": 3.4347612857818604, "learning_rate": 1.7690875232774675e-05, "loss": 0.1119, "step": 3660 }, { "epoch": 6.83, "grad_norm": 3.8548810482025146, "learning_rate": 1.7587419822056696e-05, "loss": 0.1194, "step": 3670 }, { "epoch": 6.85, "grad_norm": 1.9823788404464722, "learning_rate": 1.7483964411338714e-05, "loss": 0.118, "step": 3680 }, { "epoch": 6.87, "grad_norm": 3.7170395851135254, "learning_rate": 1.7380509000620735e-05, "loss": 0.1097, "step": 3690 }, { "epoch": 6.89, "grad_norm": 2.754812717437744, "learning_rate": 1.7277053589902753e-05, "loss": 0.1059, "step": 3700 }, { "epoch": 6.91, "grad_norm": 5.865429401397705, "learning_rate": 1.717359817918477e-05, "loss": 0.1184, "step": 3710 }, { "epoch": 6.93, "grad_norm": 2.526935577392578, "learning_rate": 1.7070142768466792e-05, "loss": 0.1109, "step": 3720 }, { "epoch": 6.95, "grad_norm": 4.929475784301758, "learning_rate": 1.6966687357748813e-05, "loss": 0.115, "step": 3730 }, { "epoch": 6.96, "grad_norm": 3.5055129528045654, "learning_rate": 1.686323194703083e-05, "loss": 0.1042, "step": 3740 }, { "epoch": 6.98, "grad_norm": 5.299228191375732, "learning_rate": 1.675977653631285e-05, "loss": 0.1091, "step": 3750 }, { "epoch": 7.0, "eval_accuracy": 0.9717110863728636, "eval_loss": 0.07997328042984009, "eval_runtime": 77.3565, "eval_samples_per_second": 197.411, "eval_steps_per_second": 3.09, "step": 3759 }, { "epoch": 7.0, "grad_norm": 2.833702325820923, "learning_rate": 1.665632112559487e-05, "loss": 0.1151, "step": 3760 }, { "epoch": 7.02, "grad_norm": 3.2511253356933594, "learning_rate": 1.6552865714876888e-05, "loss": 0.1081, "step": 3770 }, { "epoch": 7.04, "grad_norm": 4.139963150024414, "learning_rate": 1.644941030415891e-05, "loss": 0.104, "step": 3780 }, { "epoch": 7.06, "grad_norm": 3.9693522453308105, "learning_rate": 1.634595489344093e-05, "loss": 0.1116, "step": 3790 }, { "epoch": 7.08, "grad_norm": 3.844640016555786, "learning_rate": 1.6242499482722948e-05, "loss": 0.114, "step": 3800 }, { "epoch": 7.09, "grad_norm": 3.2410988807678223, "learning_rate": 1.6139044072004966e-05, "loss": 0.1002, "step": 3810 }, { "epoch": 7.11, "grad_norm": 3.647073268890381, "learning_rate": 1.6035588661286984e-05, "loss": 0.1086, "step": 3820 }, { "epoch": 7.13, "grad_norm": 3.2348549365997314, "learning_rate": 1.5932133250569005e-05, "loss": 0.089, "step": 3830 }, { "epoch": 7.15, "grad_norm": 4.308054447174072, "learning_rate": 1.5828677839851026e-05, "loss": 0.1098, "step": 3840 }, { "epoch": 7.17, "grad_norm": 2.811333417892456, "learning_rate": 1.5725222429133044e-05, "loss": 0.1187, "step": 3850 }, { "epoch": 7.19, "grad_norm": 4.352880001068115, "learning_rate": 1.5621767018415065e-05, "loss": 0.1115, "step": 3860 }, { "epoch": 7.21, "grad_norm": 4.083710193634033, "learning_rate": 1.5518311607697083e-05, "loss": 0.1064, "step": 3870 }, { "epoch": 7.23, "grad_norm": 3.2910239696502686, "learning_rate": 1.54148561969791e-05, "loss": 0.1019, "step": 3880 }, { "epoch": 7.24, "grad_norm": 4.050919532775879, "learning_rate": 1.5311400786261122e-05, "loss": 0.1023, "step": 3890 }, { "epoch": 7.26, "grad_norm": 3.7136409282684326, "learning_rate": 1.5207945375543142e-05, "loss": 0.1072, "step": 3900 }, { "epoch": 7.28, "grad_norm": 2.733660936355591, "learning_rate": 1.5104489964825161e-05, "loss": 0.0913, "step": 3910 }, { "epoch": 7.3, "grad_norm": 5.974127292633057, "learning_rate": 1.5001034554107179e-05, "loss": 0.1121, "step": 3920 }, { "epoch": 7.32, "grad_norm": 3.1126036643981934, "learning_rate": 1.48975791433892e-05, "loss": 0.11, "step": 3930 }, { "epoch": 7.34, "grad_norm": 3.3671085834503174, "learning_rate": 1.479412373267122e-05, "loss": 0.1213, "step": 3940 }, { "epoch": 7.36, "grad_norm": 3.5733137130737305, "learning_rate": 1.4690668321953238e-05, "loss": 0.1109, "step": 3950 }, { "epoch": 7.37, "grad_norm": 3.739729642868042, "learning_rate": 1.4587212911235259e-05, "loss": 0.1012, "step": 3960 }, { "epoch": 7.39, "grad_norm": 3.9161570072174072, "learning_rate": 1.4483757500517278e-05, "loss": 0.1062, "step": 3970 }, { "epoch": 7.41, "grad_norm": 3.0634665489196777, "learning_rate": 1.4380302089799296e-05, "loss": 0.1158, "step": 3980 }, { "epoch": 7.43, "grad_norm": 6.085744380950928, "learning_rate": 1.4276846679081316e-05, "loss": 0.1133, "step": 3990 }, { "epoch": 7.45, "grad_norm": 4.016533851623535, "learning_rate": 1.4173391268363337e-05, "loss": 0.1201, "step": 4000 }, { "epoch": 7.47, "grad_norm": 4.826283931732178, "learning_rate": 1.4069935857645355e-05, "loss": 0.0988, "step": 4010 }, { "epoch": 7.49, "grad_norm": 4.33713436126709, "learning_rate": 1.3966480446927374e-05, "loss": 0.1128, "step": 4020 }, { "epoch": 7.5, "grad_norm": 3.433681011199951, "learning_rate": 1.3863025036209396e-05, "loss": 0.1273, "step": 4030 }, { "epoch": 7.52, "grad_norm": 2.7771129608154297, "learning_rate": 1.3759569625491414e-05, "loss": 0.1077, "step": 4040 }, { "epoch": 7.54, "grad_norm": 3.7656004428863525, "learning_rate": 1.3656114214773433e-05, "loss": 0.1012, "step": 4050 }, { "epoch": 7.56, "grad_norm": 3.985187530517578, "learning_rate": 1.3552658804055451e-05, "loss": 0.114, "step": 4060 }, { "epoch": 7.58, "grad_norm": 3.333801031112671, "learning_rate": 1.3449203393337472e-05, "loss": 0.1071, "step": 4070 }, { "epoch": 7.6, "grad_norm": 3.4079647064208984, "learning_rate": 1.3345747982619492e-05, "loss": 0.1104, "step": 4080 }, { "epoch": 7.62, "grad_norm": 3.1041195392608643, "learning_rate": 1.324229257190151e-05, "loss": 0.1129, "step": 4090 }, { "epoch": 7.64, "grad_norm": 3.2613961696624756, "learning_rate": 1.313883716118353e-05, "loss": 0.1109, "step": 4100 }, { "epoch": 7.65, "grad_norm": 3.5139191150665283, "learning_rate": 1.303538175046555e-05, "loss": 0.1173, "step": 4110 }, { "epoch": 7.67, "grad_norm": 3.3949713706970215, "learning_rate": 1.2931926339747568e-05, "loss": 0.1133, "step": 4120 }, { "epoch": 7.69, "grad_norm": 4.6892900466918945, "learning_rate": 1.282847092902959e-05, "loss": 0.1159, "step": 4130 }, { "epoch": 7.71, "grad_norm": 2.7756004333496094, "learning_rate": 1.2725015518311609e-05, "loss": 0.1074, "step": 4140 }, { "epoch": 7.73, "grad_norm": 2.3531765937805176, "learning_rate": 1.2621560107593627e-05, "loss": 0.1026, "step": 4150 }, { "epoch": 7.75, "grad_norm": 3.776615858078003, "learning_rate": 1.2518104696875646e-05, "loss": 0.1035, "step": 4160 }, { "epoch": 7.77, "grad_norm": 3.5298571586608887, "learning_rate": 1.2414649286157666e-05, "loss": 0.1033, "step": 4170 }, { "epoch": 7.78, "grad_norm": 2.347933769226074, "learning_rate": 1.2311193875439685e-05, "loss": 0.0973, "step": 4180 }, { "epoch": 7.8, "grad_norm": 2.9776358604431152, "learning_rate": 1.2207738464721707e-05, "loss": 0.1207, "step": 4190 }, { "epoch": 7.82, "grad_norm": 2.641087055206299, "learning_rate": 1.2104283054003724e-05, "loss": 0.1083, "step": 4200 }, { "epoch": 7.84, "grad_norm": 2.9380156993865967, "learning_rate": 1.2000827643285744e-05, "loss": 0.0985, "step": 4210 }, { "epoch": 7.86, "grad_norm": 2.4157328605651855, "learning_rate": 1.1897372232567764e-05, "loss": 0.0976, "step": 4220 }, { "epoch": 7.88, "grad_norm": 3.6187868118286133, "learning_rate": 1.1793916821849783e-05, "loss": 0.1199, "step": 4230 }, { "epoch": 7.9, "grad_norm": 2.72450852394104, "learning_rate": 1.1690461411131803e-05, "loss": 0.1057, "step": 4240 }, { "epoch": 7.91, "grad_norm": 3.189300298690796, "learning_rate": 1.1587006000413822e-05, "loss": 0.0851, "step": 4250 }, { "epoch": 7.93, "grad_norm": 2.51131272315979, "learning_rate": 1.1483550589695842e-05, "loss": 0.1037, "step": 4260 }, { "epoch": 7.95, "grad_norm": 2.9266738891601562, "learning_rate": 1.1380095178977861e-05, "loss": 0.1008, "step": 4270 }, { "epoch": 7.97, "grad_norm": 3.532125473022461, "learning_rate": 1.127663976825988e-05, "loss": 0.1028, "step": 4280 }, { "epoch": 7.99, "grad_norm": 3.3334052562713623, "learning_rate": 1.1173184357541899e-05, "loss": 0.1096, "step": 4290 }, { "epoch": 8.0, "eval_accuracy": 0.9757055857507694, "eval_loss": 0.06653288006782532, "eval_runtime": 77.4069, "eval_samples_per_second": 197.282, "eval_steps_per_second": 3.088, "step": 4296 }, { "epoch": 8.01, "grad_norm": 3.6473946571350098, "learning_rate": 1.106972894682392e-05, "loss": 0.1067, "step": 4300 }, { "epoch": 8.03, "grad_norm": 3.4407718181610107, "learning_rate": 1.096627353610594e-05, "loss": 0.0985, "step": 4310 }, { "epoch": 8.04, "grad_norm": 4.716196060180664, "learning_rate": 1.0862818125387957e-05, "loss": 0.1021, "step": 4320 }, { "epoch": 8.06, "grad_norm": 6.6525702476501465, "learning_rate": 1.0759362714669978e-05, "loss": 0.1124, "step": 4330 }, { "epoch": 8.08, "grad_norm": 3.4421017169952393, "learning_rate": 1.0655907303951996e-05, "loss": 0.1059, "step": 4340 }, { "epoch": 8.1, "grad_norm": 3.29632830619812, "learning_rate": 1.0552451893234016e-05, "loss": 0.1049, "step": 4350 }, { "epoch": 8.12, "grad_norm": 5.985255241394043, "learning_rate": 1.0448996482516037e-05, "loss": 0.0996, "step": 4360 }, { "epoch": 8.14, "grad_norm": 4.031270503997803, "learning_rate": 1.0345541071798055e-05, "loss": 0.1034, "step": 4370 }, { "epoch": 8.16, "grad_norm": 3.9531686305999756, "learning_rate": 1.0242085661080076e-05, "loss": 0.0969, "step": 4380 }, { "epoch": 8.18, "grad_norm": 4.336350440979004, "learning_rate": 1.0138630250362094e-05, "loss": 0.0993, "step": 4390 }, { "epoch": 8.19, "grad_norm": 3.5339159965515137, "learning_rate": 1.0035174839644113e-05, "loss": 0.1108, "step": 4400 }, { "epoch": 8.21, "grad_norm": 4.038322925567627, "learning_rate": 9.931719428926135e-06, "loss": 0.1026, "step": 4410 }, { "epoch": 8.23, "grad_norm": 5.179644584655762, "learning_rate": 9.828264018208153e-06, "loss": 0.1047, "step": 4420 }, { "epoch": 8.25, "grad_norm": 3.643061876296997, "learning_rate": 9.724808607490172e-06, "loss": 0.1037, "step": 4430 }, { "epoch": 8.27, "grad_norm": 2.6012673377990723, "learning_rate": 9.621353196772192e-06, "loss": 0.1082, "step": 4440 }, { "epoch": 8.29, "grad_norm": 6.382651329040527, "learning_rate": 9.517897786054211e-06, "loss": 0.1017, "step": 4450 }, { "epoch": 8.31, "grad_norm": 3.192500352859497, "learning_rate": 9.41444237533623e-06, "loss": 0.1021, "step": 4460 }, { "epoch": 8.32, "grad_norm": 2.353194236755371, "learning_rate": 9.31098696461825e-06, "loss": 0.0942, "step": 4470 }, { "epoch": 8.34, "grad_norm": 2.7383475303649902, "learning_rate": 9.20753155390027e-06, "loss": 0.0887, "step": 4480 }, { "epoch": 8.36, "grad_norm": 3.0728166103363037, "learning_rate": 9.10407614318229e-06, "loss": 0.1038, "step": 4490 }, { "epoch": 8.38, "grad_norm": 2.619554042816162, "learning_rate": 9.000620732464309e-06, "loss": 0.1013, "step": 4500 }, { "epoch": 8.4, "grad_norm": 5.080254554748535, "learning_rate": 8.897165321746327e-06, "loss": 0.1143, "step": 4510 }, { "epoch": 8.42, "grad_norm": 4.772169589996338, "learning_rate": 8.793709911028348e-06, "loss": 0.1027, "step": 4520 }, { "epoch": 8.44, "grad_norm": 2.42454195022583, "learning_rate": 8.690254500310367e-06, "loss": 0.0992, "step": 4530 }, { "epoch": 8.45, "grad_norm": 2.924750328063965, "learning_rate": 8.586799089592385e-06, "loss": 0.0977, "step": 4540 }, { "epoch": 8.47, "grad_norm": 3.605734348297119, "learning_rate": 8.483343678874407e-06, "loss": 0.0964, "step": 4550 }, { "epoch": 8.49, "grad_norm": 5.610400199890137, "learning_rate": 8.379888268156424e-06, "loss": 0.0883, "step": 4560 }, { "epoch": 8.51, "grad_norm": 2.485067367553711, "learning_rate": 8.276432857438444e-06, "loss": 0.1016, "step": 4570 }, { "epoch": 8.53, "grad_norm": 4.045931816101074, "learning_rate": 8.172977446720465e-06, "loss": 0.1047, "step": 4580 }, { "epoch": 8.55, "grad_norm": 3.8962624073028564, "learning_rate": 8.069522036002483e-06, "loss": 0.1028, "step": 4590 }, { "epoch": 8.57, "grad_norm": 2.916381359100342, "learning_rate": 7.966066625284502e-06, "loss": 0.1092, "step": 4600 }, { "epoch": 8.58, "grad_norm": 2.839132308959961, "learning_rate": 7.862611214566522e-06, "loss": 0.0993, "step": 4610 }, { "epoch": 8.6, "grad_norm": 3.5891973972320557, "learning_rate": 7.759155803848542e-06, "loss": 0.0932, "step": 4620 }, { "epoch": 8.62, "grad_norm": 3.9104928970336914, "learning_rate": 7.655700393130561e-06, "loss": 0.0892, "step": 4630 }, { "epoch": 8.64, "grad_norm": 4.489515781402588, "learning_rate": 7.552244982412581e-06, "loss": 0.0909, "step": 4640 }, { "epoch": 8.66, "grad_norm": 3.1181390285491943, "learning_rate": 7.4487895716946e-06, "loss": 0.0865, "step": 4650 }, { "epoch": 8.68, "grad_norm": 3.370128870010376, "learning_rate": 7.345334160976619e-06, "loss": 0.0902, "step": 4660 }, { "epoch": 8.7, "grad_norm": 3.6510777473449707, "learning_rate": 7.241878750258639e-06, "loss": 0.1041, "step": 4670 }, { "epoch": 8.72, "grad_norm": 4.543170928955078, "learning_rate": 7.138423339540658e-06, "loss": 0.1106, "step": 4680 }, { "epoch": 8.73, "grad_norm": 3.1991612911224365, "learning_rate": 7.0349679288226775e-06, "loss": 0.0993, "step": 4690 }, { "epoch": 8.75, "grad_norm": 2.5615463256835938, "learning_rate": 6.931512518104698e-06, "loss": 0.0927, "step": 4700 }, { "epoch": 8.77, "grad_norm": 5.079352855682373, "learning_rate": 6.8280571073867165e-06, "loss": 0.1004, "step": 4710 }, { "epoch": 8.79, "grad_norm": 2.056499481201172, "learning_rate": 6.724601696668736e-06, "loss": 0.0884, "step": 4720 }, { "epoch": 8.81, "grad_norm": 2.651646614074707, "learning_rate": 6.621146285950755e-06, "loss": 0.095, "step": 4730 }, { "epoch": 8.83, "grad_norm": 2.911651849746704, "learning_rate": 6.517690875232775e-06, "loss": 0.0971, "step": 4740 }, { "epoch": 8.85, "grad_norm": 2.585360527038574, "learning_rate": 6.414235464514795e-06, "loss": 0.0942, "step": 4750 }, { "epoch": 8.86, "grad_norm": 4.262210369110107, "learning_rate": 6.310780053796813e-06, "loss": 0.099, "step": 4760 }, { "epoch": 8.88, "grad_norm": 3.066347599029541, "learning_rate": 6.207324643078833e-06, "loss": 0.0997, "step": 4770 }, { "epoch": 8.9, "grad_norm": 4.1641740798950195, "learning_rate": 6.103869232360853e-06, "loss": 0.1004, "step": 4780 }, { "epoch": 8.92, "grad_norm": 4.297872066497803, "learning_rate": 6.000413821642872e-06, "loss": 0.0975, "step": 4790 }, { "epoch": 8.94, "grad_norm": 2.9514224529266357, "learning_rate": 5.8969584109248915e-06, "loss": 0.09, "step": 4800 }, { "epoch": 8.96, "grad_norm": 3.211758852005005, "learning_rate": 5.793503000206911e-06, "loss": 0.0915, "step": 4810 }, { "epoch": 8.98, "grad_norm": 3.523693084716797, "learning_rate": 5.690047589488931e-06, "loss": 0.0843, "step": 4820 }, { "epoch": 8.99, "grad_norm": 4.310064315795898, "learning_rate": 5.586592178770949e-06, "loss": 0.0996, "step": 4830 }, { "epoch": 9.0, "eval_accuracy": 0.9746578482090237, "eval_loss": 0.07076110690832138, "eval_runtime": 77.2249, "eval_samples_per_second": 197.747, "eval_steps_per_second": 3.095, "step": 4833 }, { "epoch": 9.01, "grad_norm": 4.004106044769287, "learning_rate": 5.48313676805297e-06, "loss": 0.0958, "step": 4840 }, { "epoch": 9.03, "grad_norm": 3.368622064590454, "learning_rate": 5.379681357334989e-06, "loss": 0.1035, "step": 4850 }, { "epoch": 9.05, "grad_norm": 2.3737103939056396, "learning_rate": 5.276225946617008e-06, "loss": 0.0846, "step": 4860 }, { "epoch": 9.07, "grad_norm": 3.6056108474731445, "learning_rate": 5.1727705358990274e-06, "loss": 0.0988, "step": 4870 }, { "epoch": 9.09, "grad_norm": 6.646406173706055, "learning_rate": 5.069315125181047e-06, "loss": 0.077, "step": 4880 }, { "epoch": 9.11, "grad_norm": 3.300297737121582, "learning_rate": 4.965859714463067e-06, "loss": 0.0959, "step": 4890 }, { "epoch": 9.12, "grad_norm": 3.297924518585205, "learning_rate": 4.862404303745086e-06, "loss": 0.0904, "step": 4900 }, { "epoch": 9.14, "grad_norm": 2.438100576400757, "learning_rate": 4.7589488930271056e-06, "loss": 0.0787, "step": 4910 }, { "epoch": 9.16, "grad_norm": 6.617523670196533, "learning_rate": 4.655493482309125e-06, "loss": 0.1144, "step": 4920 }, { "epoch": 9.18, "grad_norm": 4.281922817230225, "learning_rate": 4.552038071591145e-06, "loss": 0.1008, "step": 4930 }, { "epoch": 9.2, "grad_norm": 2.712520122528076, "learning_rate": 4.448582660873163e-06, "loss": 0.0886, "step": 4940 }, { "epoch": 9.22, "grad_norm": 4.191254615783691, "learning_rate": 4.345127250155184e-06, "loss": 0.0986, "step": 4950 }, { "epoch": 9.24, "grad_norm": 3.2903385162353516, "learning_rate": 4.241671839437203e-06, "loss": 0.0886, "step": 4960 }, { "epoch": 9.26, "grad_norm": 4.816535472869873, "learning_rate": 4.138216428719222e-06, "loss": 0.1063, "step": 4970 }, { "epoch": 9.27, "grad_norm": 3.22310209274292, "learning_rate": 4.0347610180012415e-06, "loss": 0.0978, "step": 4980 }, { "epoch": 9.29, "grad_norm": 3.7314705848693848, "learning_rate": 3.931305607283261e-06, "loss": 0.0842, "step": 4990 }, { "epoch": 9.31, "grad_norm": 3.6335864067077637, "learning_rate": 3.8278501965652806e-06, "loss": 0.1074, "step": 5000 }, { "epoch": 9.33, "grad_norm": 2.8816540241241455, "learning_rate": 3.7243947858473e-06, "loss": 0.093, "step": 5010 }, { "epoch": 9.35, "grad_norm": 4.274160385131836, "learning_rate": 3.6209393751293196e-06, "loss": 0.1024, "step": 5020 }, { "epoch": 9.37, "grad_norm": 2.640784502029419, "learning_rate": 3.5174839644113387e-06, "loss": 0.0984, "step": 5030 }, { "epoch": 9.39, "grad_norm": 4.0636396408081055, "learning_rate": 3.4140285536933583e-06, "loss": 0.0998, "step": 5040 }, { "epoch": 9.4, "grad_norm": 3.3350281715393066, "learning_rate": 3.3105731429753774e-06, "loss": 0.0825, "step": 5050 }, { "epoch": 9.42, "grad_norm": 3.7046918869018555, "learning_rate": 3.2071177322573973e-06, "loss": 0.0747, "step": 5060 }, { "epoch": 9.44, "grad_norm": 3.884317636489868, "learning_rate": 3.1036623215394165e-06, "loss": 0.0923, "step": 5070 }, { "epoch": 9.46, "grad_norm": 4.088473320007324, "learning_rate": 3.000206910821436e-06, "loss": 0.0885, "step": 5080 }, { "epoch": 9.48, "grad_norm": 2.4199376106262207, "learning_rate": 2.8967515001034555e-06, "loss": 0.1044, "step": 5090 }, { "epoch": 9.5, "grad_norm": 4.261946678161621, "learning_rate": 2.7932960893854746e-06, "loss": 0.0856, "step": 5100 }, { "epoch": 9.52, "grad_norm": 4.894256114959717, "learning_rate": 2.6898406786674946e-06, "loss": 0.0979, "step": 5110 }, { "epoch": 9.53, "grad_norm": 3.232664108276367, "learning_rate": 2.5863852679495137e-06, "loss": 0.0912, "step": 5120 }, { "epoch": 9.55, "grad_norm": 3.6954145431518555, "learning_rate": 2.4829298572315337e-06, "loss": 0.0837, "step": 5130 }, { "epoch": 9.57, "grad_norm": 3.2980313301086426, "learning_rate": 2.3794744465135528e-06, "loss": 0.0866, "step": 5140 }, { "epoch": 9.59, "grad_norm": 5.655994415283203, "learning_rate": 2.2760190357955723e-06, "loss": 0.0881, "step": 5150 }, { "epoch": 9.61, "grad_norm": 4.117016792297363, "learning_rate": 2.172563625077592e-06, "loss": 0.0938, "step": 5160 }, { "epoch": 9.63, "grad_norm": 4.604465007781982, "learning_rate": 2.069108214359611e-06, "loss": 0.0869, "step": 5170 }, { "epoch": 9.65, "grad_norm": 2.572514057159424, "learning_rate": 1.9656528036416305e-06, "loss": 0.0939, "step": 5180 }, { "epoch": 9.66, "grad_norm": 3.884051561355591, "learning_rate": 1.86219739292365e-06, "loss": 0.11, "step": 5190 }, { "epoch": 9.68, "grad_norm": 3.295647621154785, "learning_rate": 1.7587419822056694e-06, "loss": 0.0796, "step": 5200 }, { "epoch": 9.7, "grad_norm": 3.270512819290161, "learning_rate": 1.6552865714876887e-06, "loss": 0.1012, "step": 5210 }, { "epoch": 9.72, "grad_norm": 3.492386817932129, "learning_rate": 1.5518311607697082e-06, "loss": 0.1028, "step": 5220 }, { "epoch": 9.74, "grad_norm": 2.9747917652130127, "learning_rate": 1.4483757500517278e-06, "loss": 0.1021, "step": 5230 }, { "epoch": 9.76, "grad_norm": 3.2330212593078613, "learning_rate": 1.3449203393337473e-06, "loss": 0.0865, "step": 5240 }, { "epoch": 9.78, "grad_norm": 3.7194619178771973, "learning_rate": 1.2414649286157668e-06, "loss": 0.0992, "step": 5250 }, { "epoch": 9.8, "grad_norm": 5.062513828277588, "learning_rate": 1.1380095178977862e-06, "loss": 0.1016, "step": 5260 }, { "epoch": 9.81, "grad_norm": 3.4997618198394775, "learning_rate": 1.0345541071798055e-06, "loss": 0.077, "step": 5270 }, { "epoch": 9.83, "grad_norm": 3.2800211906433105, "learning_rate": 9.31098696461825e-07, "loss": 0.0966, "step": 5280 }, { "epoch": 9.85, "grad_norm": 5.51563835144043, "learning_rate": 8.276432857438443e-07, "loss": 0.0967, "step": 5290 }, { "epoch": 9.87, "grad_norm": 5.4373698234558105, "learning_rate": 7.241878750258639e-07, "loss": 0.1014, "step": 5300 }, { "epoch": 9.89, "grad_norm": 4.9278154373168945, "learning_rate": 6.207324643078834e-07, "loss": 0.1023, "step": 5310 }, { "epoch": 9.91, "grad_norm": 3.8460750579833984, "learning_rate": 5.172770535899027e-07, "loss": 0.0992, "step": 5320 }, { "epoch": 9.93, "grad_norm": 2.2577359676361084, "learning_rate": 4.1382164287192217e-07, "loss": 0.0822, "step": 5330 }, { "epoch": 9.94, "grad_norm": 3.778047561645508, "learning_rate": 3.103662321539417e-07, "loss": 0.0894, "step": 5340 }, { "epoch": 9.96, "grad_norm": 4.457272052764893, "learning_rate": 2.0691082143596109e-07, "loss": 0.0854, "step": 5350 }, { "epoch": 9.98, "grad_norm": 3.2312333583831787, "learning_rate": 1.0345541071798054e-07, "loss": 0.0858, "step": 5360 }, { "epoch": 10.0, "grad_norm": 4.007193088531494, "learning_rate": 0.0, "loss": 0.0992, "step": 5370 }, { "epoch": 10.0, "eval_accuracy": 0.9764913889070788, "eval_loss": 0.06747107207775116, "eval_runtime": 75.4344, "eval_samples_per_second": 202.441, "eval_steps_per_second": 3.168, "step": 5370 }, { "epoch": 10.0, "step": 5370, "total_flos": 3.4161822702270628e+19, "train_loss": 0.15793793185907148, "train_runtime": 13574.3496, "train_samples_per_second": 101.249, "train_steps_per_second": 0.396 } ], "logging_steps": 10, "max_steps": 5370, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.4161822702270628e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }