{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 371920, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 0.8909159302711487, "learning_rate": 3.125e-05, "loss": 6.228, "step": 1000 }, { "epoch": 0.11, "grad_norm": 1.025515079498291, "learning_rate": 6.25e-05, "loss": 5.0018, "step": 2000 }, { "epoch": 0.16, "grad_norm": 0.9241665005683899, "learning_rate": 9.375e-05, "loss": 4.6753, "step": 3000 }, { "epoch": 0.22, "grad_norm": 0.8754993677139282, "learning_rate": 0.000125, "loss": 4.4594, "step": 4000 }, { "epoch": 0.27, "grad_norm": 0.7558003664016724, "learning_rate": 0.00015625, "loss": 4.3007, "step": 5000 }, { "epoch": 0.32, "grad_norm": 0.7471924424171448, "learning_rate": 0.0001875, "loss": 4.1771, "step": 6000 }, { "epoch": 0.38, "grad_norm": 0.6875430345535278, "learning_rate": 0.00021875, "loss": 4.0791, "step": 7000 }, { "epoch": 0.43, "grad_norm": 0.6728542447090149, "learning_rate": 0.00025, "loss": 3.9784, "step": 8000 }, { "epoch": 0.48, "grad_norm": 0.6585622429847717, "learning_rate": 0.00028125000000000003, "loss": 3.9146, "step": 9000 }, { "epoch": 0.54, "grad_norm": 0.5960397124290466, "learning_rate": 0.00031246875000000003, "loss": 3.847, "step": 10000 }, { "epoch": 0.59, "grad_norm": 0.576011061668396, "learning_rate": 0.00034368749999999997, "loss": 3.8063, "step": 11000 }, { "epoch": 0.65, "grad_norm": 0.5716710686683655, "learning_rate": 0.0003749375, "loss": 3.7672, "step": 12000 }, { "epoch": 0.7, "grad_norm": 0.5205326080322266, "learning_rate": 0.00040615625, "loss": 3.7304, "step": 13000 }, { "epoch": 0.75, "grad_norm": 0.47286731004714966, "learning_rate": 0.00043740625, "loss": 3.6967, "step": 14000 }, { "epoch": 0.81, "grad_norm": 0.4275617301464081, "learning_rate": 0.00046865625, "loss": 3.6717, "step": 15000 }, { "epoch": 0.86, "grad_norm": 0.4085034430027008, "learning_rate": 0.000499875, "loss": 3.6497, "step": 16000 }, { "epoch": 0.91, "grad_norm": 0.40013808012008667, "learning_rate": 0.00053109375, "loss": 3.6225, "step": 17000 }, { "epoch": 0.97, "grad_norm": 0.3701670169830322, "learning_rate": 0.00056234375, "loss": 3.61, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.35835879185489444, "eval_loss": 3.795727014541626, "eval_runtime": 153.3517, "eval_samples_per_second": 377.674, "eval_steps_per_second": 5.901, "step": 18596 }, { "epoch": 1.02, "grad_norm": 0.34899449348449707, "learning_rate": 0.00059359375, "loss": 3.579, "step": 19000 }, { "epoch": 1.08, "grad_norm": 0.37780293822288513, "learning_rate": 0.0006248437500000001, "loss": 3.5464, "step": 20000 }, { "epoch": 1.13, "grad_norm": 0.3226105272769928, "learning_rate": 0.0006560625, "loss": 3.5387, "step": 21000 }, { "epoch": 1.18, "grad_norm": 0.3026217222213745, "learning_rate": 0.0006873125, "loss": 3.5284, "step": 22000 }, { "epoch": 1.24, "grad_norm": 0.2845563590526581, "learning_rate": 0.0007185625, "loss": 3.5209, "step": 23000 }, { "epoch": 1.29, "grad_norm": 0.28052306175231934, "learning_rate": 0.0007498125, "loss": 3.5094, "step": 24000 }, { "epoch": 1.34, "grad_norm": 0.34453320503234863, "learning_rate": 0.0007810312499999999, "loss": 3.4934, "step": 25000 }, { "epoch": 1.4, "grad_norm": 0.30819350481033325, "learning_rate": 0.00081228125, "loss": 3.4931, "step": 26000 }, { "epoch": 1.45, "grad_norm": 0.30646008253097534, "learning_rate": 0.0008435000000000001, "loss": 3.477, "step": 27000 }, { "epoch": 1.51, "grad_norm": 0.24530355632305145, "learning_rate": 0.0008747500000000001, "loss": 3.4684, "step": 28000 }, { "epoch": 1.56, "grad_norm": 0.287979394197464, "learning_rate": 0.0009059375, "loss": 3.4633, "step": 29000 }, { "epoch": 1.61, "grad_norm": 0.2318340390920639, "learning_rate": 0.0009371875, "loss": 3.4523, "step": 30000 }, { "epoch": 1.67, "grad_norm": 0.21640178561210632, "learning_rate": 0.0009684375, "loss": 3.4511, "step": 31000 }, { "epoch": 1.72, "grad_norm": 0.23932267725467682, "learning_rate": 0.0009996562500000001, "loss": 3.44, "step": 32000 }, { "epoch": 1.77, "grad_norm": 0.23123665153980255, "learning_rate": 0.0009970904918804424, "loss": 3.4308, "step": 33000 }, { "epoch": 1.83, "grad_norm": 0.22664424777030945, "learning_rate": 0.0009941486232054601, "loss": 3.4153, "step": 34000 }, { "epoch": 1.88, "grad_norm": 0.21310342848300934, "learning_rate": 0.0009912067545304777, "loss": 3.4106, "step": 35000 }, { "epoch": 1.94, "grad_norm": 0.2162679135799408, "learning_rate": 0.0009882678277241704, "loss": 3.4031, "step": 36000 }, { "epoch": 1.99, "grad_norm": 0.21360158920288086, "learning_rate": 0.0009853259590491882, "loss": 3.3861, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.38014852732029175, "eval_loss": 3.564697742462158, "eval_runtime": 154.5153, "eval_samples_per_second": 374.83, "eval_steps_per_second": 5.857, "step": 37192 }, { "epoch": 2.04, "grad_norm": 0.2235065996646881, "learning_rate": 0.0009823899741115557, "loss": 3.3397, "step": 38000 }, { "epoch": 2.1, "grad_norm": 0.2304013967514038, "learning_rate": 0.0009794481054365733, "loss": 3.3289, "step": 39000 }, { "epoch": 2.15, "grad_norm": 0.22438882291316986, "learning_rate": 0.000976506236761591, "loss": 3.326, "step": 40000 }, { "epoch": 2.2, "grad_norm": 0.21580879390239716, "learning_rate": 0.0009735643680866086, "loss": 3.3208, "step": 41000 }, { "epoch": 2.26, "grad_norm": 0.21414010226726532, "learning_rate": 0.0009706283831489763, "loss": 3.3202, "step": 42000 }, { "epoch": 2.31, "grad_norm": 0.20920276641845703, "learning_rate": 0.0009676865144739939, "loss": 3.3111, "step": 43000 }, { "epoch": 2.37, "grad_norm": 0.19549404084682465, "learning_rate": 0.0009647446457990115, "loss": 3.308, "step": 44000 }, { "epoch": 2.42, "grad_norm": 0.2047666609287262, "learning_rate": 0.0009618027771240292, "loss": 3.307, "step": 45000 }, { "epoch": 2.47, "grad_norm": 0.19435334205627441, "learning_rate": 0.0009588609084490469, "loss": 3.3007, "step": 46000 }, { "epoch": 2.53, "grad_norm": 0.20967498421669006, "learning_rate": 0.0009559190397740644, "loss": 3.293, "step": 47000 }, { "epoch": 2.58, "grad_norm": 0.22619305551052094, "learning_rate": 0.0009529801129677571, "loss": 3.2911, "step": 48000 }, { "epoch": 2.63, "grad_norm": 0.1997978389263153, "learning_rate": 0.0009500382442927748, "loss": 3.286, "step": 49000 }, { "epoch": 2.69, "grad_norm": 0.22306489944458008, "learning_rate": 0.0009470963756177925, "loss": 3.2839, "step": 50000 }, { "epoch": 2.74, "grad_norm": 0.20853149890899658, "learning_rate": 0.000944157448811485, "loss": 3.2745, "step": 51000 }, { "epoch": 2.8, "grad_norm": 0.20844997465610504, "learning_rate": 0.0009412155801365028, "loss": 3.2657, "step": 52000 }, { "epoch": 2.85, "grad_norm": 0.19329632818698883, "learning_rate": 0.0009382766533301954, "loss": 3.2683, "step": 53000 }, { "epoch": 2.9, "grad_norm": 0.23765866458415985, "learning_rate": 0.000935334784655213, "loss": 3.2637, "step": 54000 }, { "epoch": 2.96, "grad_norm": 0.2281341701745987, "learning_rate": 0.0009323929159802307, "loss": 3.2591, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.39145331347307755, "eval_loss": 3.4933156967163086, "eval_runtime": 154.9004, "eval_samples_per_second": 373.898, "eval_steps_per_second": 5.842, "step": 55788 }, { "epoch": 3.01, "grad_norm": 0.2204052358865738, "learning_rate": 0.0009294539891739233, "loss": 3.2431, "step": 56000 }, { "epoch": 3.07, "grad_norm": 0.2415602058172226, "learning_rate": 0.000926512120498941, "loss": 3.1919, "step": 57000 }, { "epoch": 3.12, "grad_norm": 0.19874005019664764, "learning_rate": 0.0009235702518239586, "loss": 3.196, "step": 58000 }, { "epoch": 3.17, "grad_norm": 0.1932218074798584, "learning_rate": 0.0009206283831489762, "loss": 3.1988, "step": 59000 }, { "epoch": 3.23, "grad_norm": 0.21343468129634857, "learning_rate": 0.0009176894563426689, "loss": 3.1961, "step": 60000 }, { "epoch": 3.28, "grad_norm": 0.2427150458097458, "learning_rate": 0.0009147475876676865, "loss": 3.1967, "step": 61000 }, { "epoch": 3.33, "grad_norm": 0.24537405371665955, "learning_rate": 0.0009118086608613791, "loss": 3.1992, "step": 62000 }, { "epoch": 3.39, "grad_norm": 0.20690293610095978, "learning_rate": 0.0009088667921863968, "loss": 3.1979, "step": 63000 }, { "epoch": 3.44, "grad_norm": 0.25561264157295227, "learning_rate": 0.0009059249235114144, "loss": 3.1956, "step": 64000 }, { "epoch": 3.5, "grad_norm": 0.20544248819351196, "learning_rate": 0.000902985996705107, "loss": 3.1969, "step": 65000 }, { "epoch": 3.55, "grad_norm": 0.19939689338207245, "learning_rate": 0.0009000441280301248, "loss": 3.1913, "step": 66000 }, { "epoch": 3.6, "grad_norm": 0.2552586495876312, "learning_rate": 0.0008971052012238174, "loss": 3.1915, "step": 67000 }, { "epoch": 3.66, "grad_norm": 0.23190921545028687, "learning_rate": 0.000894163332548835, "loss": 3.1966, "step": 68000 }, { "epoch": 3.71, "grad_norm": 0.24762412905693054, "learning_rate": 0.0008912244057425277, "loss": 3.1867, "step": 69000 }, { "epoch": 3.76, "grad_norm": 0.2145838439464569, "learning_rate": 0.0008882825370675454, "loss": 3.1905, "step": 70000 }, { "epoch": 3.82, "grad_norm": 0.2245662808418274, "learning_rate": 0.0008853436102612379, "loss": 3.1846, "step": 71000 }, { "epoch": 3.87, "grad_norm": 0.2066437155008316, "learning_rate": 0.0008824017415862556, "loss": 3.1832, "step": 72000 }, { "epoch": 3.93, "grad_norm": 0.21306444704532623, "learning_rate": 0.0008794628147799483, "loss": 3.1834, "step": 73000 }, { "epoch": 3.98, "grad_norm": 0.22630509734153748, "learning_rate": 0.000876520946104966, "loss": 3.1836, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.3972446414495344, "eval_loss": 3.4326329231262207, "eval_runtime": 154.0547, "eval_samples_per_second": 375.951, "eval_steps_per_second": 5.875, "step": 74384 }, { "epoch": 4.03, "grad_norm": 0.19157204031944275, "learning_rate": 0.0008735790774299835, "loss": 3.1363, "step": 75000 }, { "epoch": 4.09, "grad_norm": 0.20826587080955505, "learning_rate": 0.0008706401506236762, "loss": 3.1164, "step": 76000 }, { "epoch": 4.14, "grad_norm": 0.20335273444652557, "learning_rate": 0.0008676982819486939, "loss": 3.119, "step": 77000 }, { "epoch": 4.19, "grad_norm": 0.20953166484832764, "learning_rate": 0.0008647564132737114, "loss": 3.1245, "step": 78000 }, { "epoch": 4.25, "grad_norm": 0.20731399953365326, "learning_rate": 0.0008618145445987291, "loss": 3.1238, "step": 79000 }, { "epoch": 4.3, "grad_norm": 0.23300333321094513, "learning_rate": 0.0008588756177924218, "loss": 3.1252, "step": 80000 }, { "epoch": 4.36, "grad_norm": 0.20891304314136505, "learning_rate": 0.0008559337491174393, "loss": 3.1288, "step": 81000 }, { "epoch": 4.41, "grad_norm": 0.21623115241527557, "learning_rate": 0.000852994822311132, "loss": 3.1285, "step": 82000 }, { "epoch": 4.46, "grad_norm": 0.2474987506866455, "learning_rate": 0.0008500529536361497, "loss": 3.1262, "step": 83000 }, { "epoch": 4.52, "grad_norm": 0.2118910700082779, "learning_rate": 0.0008471140268298423, "loss": 3.127, "step": 84000 }, { "epoch": 4.57, "grad_norm": 0.24061526358127594, "learning_rate": 0.00084417215815486, "loss": 3.131, "step": 85000 }, { "epoch": 4.62, "grad_norm": 0.22474774718284607, "learning_rate": 0.0008412302894798777, "loss": 3.1276, "step": 86000 }, { "epoch": 4.68, "grad_norm": 0.19421574473381042, "learning_rate": 0.0008382884208048953, "loss": 3.127, "step": 87000 }, { "epoch": 4.73, "grad_norm": 0.20476099848747253, "learning_rate": 0.0008353494939985879, "loss": 3.128, "step": 88000 }, { "epoch": 4.79, "grad_norm": 0.23917576670646667, "learning_rate": 0.0008324076253236056, "loss": 3.1285, "step": 89000 }, { "epoch": 4.84, "grad_norm": 0.213016539812088, "learning_rate": 0.0008294657566486232, "loss": 3.1281, "step": 90000 }, { "epoch": 4.89, "grad_norm": 0.20960000157356262, "learning_rate": 0.0008265238879736408, "loss": 3.1314, "step": 91000 }, { "epoch": 4.95, "grad_norm": 0.1963956207036972, "learning_rate": 0.0008235849611673335, "loss": 3.1278, "step": 92000 }, { "epoch": 5.0, "eval_accuracy": 0.40181854228669783, "eval_loss": 3.4132988452911377, "eval_runtime": 154.1416, "eval_samples_per_second": 375.739, "eval_steps_per_second": 5.871, "step": 92980 }, { "epoch": 5.0, "grad_norm": 0.26301759481430054, "learning_rate": 0.0008206430924923511, "loss": 3.121, "step": 93000 }, { "epoch": 5.05, "grad_norm": 0.21968483924865723, "learning_rate": 0.0008177012238173688, "loss": 3.0579, "step": 94000 }, { "epoch": 5.11, "grad_norm": 0.23824404180049896, "learning_rate": 0.0008147622970110614, "loss": 3.0621, "step": 95000 }, { "epoch": 5.16, "grad_norm": 0.20740585029125214, "learning_rate": 0.000811820428336079, "loss": 3.0653, "step": 96000 }, { "epoch": 5.22, "grad_norm": 0.21885009109973907, "learning_rate": 0.0008088785596610967, "loss": 3.0721, "step": 97000 }, { "epoch": 5.27, "grad_norm": 0.27921879291534424, "learning_rate": 0.0008059425747234644, "loss": 3.0724, "step": 98000 }, { "epoch": 5.32, "grad_norm": 0.2494860142469406, "learning_rate": 0.0008030007060484819, "loss": 3.0771, "step": 99000 }, { "epoch": 5.38, "grad_norm": 0.2149578034877777, "learning_rate": 0.0008000588373734997, "loss": 3.0778, "step": 100000 }, { "epoch": 5.43, "grad_norm": 0.24940645694732666, "learning_rate": 0.0007971169686985174, "loss": 3.079, "step": 101000 }, { "epoch": 5.49, "grad_norm": 0.2192409187555313, "learning_rate": 0.00079417804189221, "loss": 3.0803, "step": 102000 }, { "epoch": 5.54, "grad_norm": 0.21941109001636505, "learning_rate": 0.0007912361732172276, "loss": 3.0771, "step": 103000 }, { "epoch": 5.59, "grad_norm": 0.21313978731632233, "learning_rate": 0.0007882943045422453, "loss": 3.0784, "step": 104000 }, { "epoch": 5.65, "grad_norm": 0.27852264046669006, "learning_rate": 0.0007853553777359379, "loss": 3.0842, "step": 105000 }, { "epoch": 5.7, "grad_norm": 0.27452683448791504, "learning_rate": 0.0007824164509296305, "loss": 3.0822, "step": 106000 }, { "epoch": 5.75, "grad_norm": 0.22761721909046173, "learning_rate": 0.0007794745822546482, "loss": 3.0824, "step": 107000 }, { "epoch": 5.81, "grad_norm": 0.21608448028564453, "learning_rate": 0.0007765356554483409, "loss": 3.0859, "step": 108000 }, { "epoch": 5.86, "grad_norm": 0.26219895482063293, "learning_rate": 0.0007735937867733584, "loss": 3.0817, "step": 109000 }, { "epoch": 5.92, "grad_norm": 0.2277887463569641, "learning_rate": 0.0007706519180983761, "loss": 3.0799, "step": 110000 }, { "epoch": 5.97, "grad_norm": 0.20768152177333832, "learning_rate": 0.0007677100494233938, "loss": 3.0795, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.40326990139989005, "eval_loss": 3.4124019145965576, "eval_runtime": 154.0264, "eval_samples_per_second": 376.02, "eval_steps_per_second": 5.876, "step": 111576 }, { "epoch": 6.02, "grad_norm": 0.20890597999095917, "learning_rate": 0.0007647681807484113, "loss": 3.049, "step": 112000 }, { "epoch": 6.08, "grad_norm": 0.23515057563781738, "learning_rate": 0.000761829253942104, "loss": 3.0134, "step": 113000 }, { "epoch": 6.13, "grad_norm": 0.23229765892028809, "learning_rate": 0.0007588873852671217, "loss": 3.0212, "step": 114000 }, { "epoch": 6.18, "grad_norm": 0.26751187443733215, "learning_rate": 0.0007559455165921393, "loss": 3.0246, "step": 115000 }, { "epoch": 6.24, "grad_norm": 0.25669464468955994, "learning_rate": 0.0007530095316545069, "loss": 3.0349, "step": 116000 }, { "epoch": 6.29, "grad_norm": 0.2544041872024536, "learning_rate": 0.0007500676629795246, "loss": 3.0371, "step": 117000 }, { "epoch": 6.35, "grad_norm": 0.23453743755817413, "learning_rate": 0.0007471257943045423, "loss": 3.0346, "step": 118000 }, { "epoch": 6.4, "grad_norm": 0.2783219814300537, "learning_rate": 0.0007441839256295599, "loss": 3.0379, "step": 119000 }, { "epoch": 6.45, "grad_norm": 0.24288877844810486, "learning_rate": 0.0007412420569545776, "loss": 3.0353, "step": 120000 }, { "epoch": 6.51, "grad_norm": 0.21807478368282318, "learning_rate": 0.0007383031301482702, "loss": 3.0422, "step": 121000 }, { "epoch": 6.56, "grad_norm": 0.2313011735677719, "learning_rate": 0.0007353642033419628, "loss": 3.0417, "step": 122000 }, { "epoch": 6.61, "grad_norm": 0.23170366883277893, "learning_rate": 0.0007324223346669805, "loss": 3.0423, "step": 123000 }, { "epoch": 6.67, "grad_norm": 0.21878008544445038, "learning_rate": 0.0007294804659919982, "loss": 3.0422, "step": 124000 }, { "epoch": 6.72, "grad_norm": 0.2428036332130432, "learning_rate": 0.0007265385973170157, "loss": 3.0479, "step": 125000 }, { "epoch": 6.78, "grad_norm": 0.22587792575359344, "learning_rate": 0.0007235967286420334, "loss": 3.0425, "step": 126000 }, { "epoch": 6.83, "grad_norm": 0.22501161694526672, "learning_rate": 0.000720657801835726, "loss": 3.0446, "step": 127000 }, { "epoch": 6.88, "grad_norm": 0.23849289119243622, "learning_rate": 0.0007177159331607437, "loss": 3.0415, "step": 128000 }, { "epoch": 6.94, "grad_norm": 0.23556004464626312, "learning_rate": 0.0007147770063544363, "loss": 3.0478, "step": 129000 }, { "epoch": 6.99, "grad_norm": 0.22832949459552765, "learning_rate": 0.000711835137679454, "loss": 3.0444, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.4052325248213447, "eval_loss": 3.3837218284606934, "eval_runtime": 154.0952, "eval_samples_per_second": 375.852, "eval_steps_per_second": 5.873, "step": 130172 }, { "epoch": 7.04, "grad_norm": 0.21719269454479218, "learning_rate": 0.0007088932690044717, "loss": 2.9885, "step": 131000 }, { "epoch": 7.1, "grad_norm": 0.24770499765872955, "learning_rate": 0.0007059543421981642, "loss": 2.9813, "step": 132000 }, { "epoch": 7.15, "grad_norm": 0.23300251364707947, "learning_rate": 0.000703012473523182, "loss": 2.9884, "step": 133000 }, { "epoch": 7.21, "grad_norm": 0.2238881140947342, "learning_rate": 0.0007000735467168746, "loss": 2.9932, "step": 134000 }, { "epoch": 7.26, "grad_norm": 0.22716933488845825, "learning_rate": 0.0006971316780418923, "loss": 2.9953, "step": 135000 }, { "epoch": 7.31, "grad_norm": 0.2647615075111389, "learning_rate": 0.0006941898093669099, "loss": 3.0028, "step": 136000 }, { "epoch": 7.37, "grad_norm": 0.2218490093946457, "learning_rate": 0.0006912508825606025, "loss": 3.0034, "step": 137000 }, { "epoch": 7.42, "grad_norm": 0.20896905660629272, "learning_rate": 0.0006883090138856202, "loss": 3.0015, "step": 138000 }, { "epoch": 7.47, "grad_norm": 0.24288326501846313, "learning_rate": 0.0006853700870793128, "loss": 3.0057, "step": 139000 }, { "epoch": 7.53, "grad_norm": 0.23336203396320343, "learning_rate": 0.0006824282184043304, "loss": 3.0104, "step": 140000 }, { "epoch": 7.58, "grad_norm": 0.22705671191215515, "learning_rate": 0.0006794863497293481, "loss": 3.0079, "step": 141000 }, { "epoch": 7.64, "grad_norm": 0.24445344507694244, "learning_rate": 0.0006765474229230408, "loss": 3.0095, "step": 142000 }, { "epoch": 7.69, "grad_norm": 0.24330651760101318, "learning_rate": 0.0006736055542480583, "loss": 3.0088, "step": 143000 }, { "epoch": 7.74, "grad_norm": 0.23043465614318848, "learning_rate": 0.000670666627441751, "loss": 3.0093, "step": 144000 }, { "epoch": 7.8, "grad_norm": 0.21775782108306885, "learning_rate": 0.0006677247587667687, "loss": 3.0126, "step": 145000 }, { "epoch": 7.85, "grad_norm": 0.2638954222202301, "learning_rate": 0.0006647858319604612, "loss": 3.014, "step": 146000 }, { "epoch": 7.9, "grad_norm": 0.24298764765262604, "learning_rate": 0.0006618439632854789, "loss": 3.0144, "step": 147000 }, { "epoch": 7.96, "grad_norm": 0.2365158647298813, "learning_rate": 0.0006589020946104966, "loss": 3.0115, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.4068079041596424, "eval_loss": 3.366936683654785, "eval_runtime": 153.7426, "eval_samples_per_second": 376.714, "eval_steps_per_second": 5.886, "step": 148768 }, { "epoch": 8.01, "grad_norm": 0.2332482486963272, "learning_rate": 0.0006559631678041892, "loss": 3.0018, "step": 149000 }, { "epoch": 8.07, "grad_norm": 0.2459622025489807, "learning_rate": 0.0006530242409978818, "loss": 2.9471, "step": 150000 }, { "epoch": 8.12, "grad_norm": 0.2830921411514282, "learning_rate": 0.0006500823723228996, "loss": 2.9543, "step": 151000 }, { "epoch": 8.17, "grad_norm": 0.267229825258255, "learning_rate": 0.0006471434455165921, "loss": 2.9631, "step": 152000 }, { "epoch": 8.23, "grad_norm": 0.259920597076416, "learning_rate": 0.0006442015768416098, "loss": 2.9617, "step": 153000 }, { "epoch": 8.28, "grad_norm": 0.2159169465303421, "learning_rate": 0.0006412597081666275, "loss": 2.9668, "step": 154000 }, { "epoch": 8.34, "grad_norm": 0.2374085932970047, "learning_rate": 0.0006383178394916452, "loss": 2.9728, "step": 155000 }, { "epoch": 8.39, "grad_norm": 0.24743901193141937, "learning_rate": 0.0006353759708166627, "loss": 2.9693, "step": 156000 }, { "epoch": 8.44, "grad_norm": 0.22295930981636047, "learning_rate": 0.0006324341021416804, "loss": 2.9758, "step": 157000 }, { "epoch": 8.5, "grad_norm": 0.2578998804092407, "learning_rate": 0.0006294922334666981, "loss": 2.9743, "step": 158000 }, { "epoch": 8.55, "grad_norm": 0.23009610176086426, "learning_rate": 0.0006265533066603907, "loss": 2.9815, "step": 159000 }, { "epoch": 8.6, "grad_norm": 0.2383948415517807, "learning_rate": 0.0006236143798540833, "loss": 2.9798, "step": 160000 }, { "epoch": 8.66, "grad_norm": 0.2457112967967987, "learning_rate": 0.000620675453047776, "loss": 2.9815, "step": 161000 }, { "epoch": 8.71, "grad_norm": 0.24992822110652924, "learning_rate": 0.0006177335843727936, "loss": 2.9797, "step": 162000 }, { "epoch": 8.77, "grad_norm": 0.24688223004341125, "learning_rate": 0.0006147946575664862, "loss": 2.9869, "step": 163000 }, { "epoch": 8.82, "grad_norm": 0.23152810335159302, "learning_rate": 0.0006118527888915038, "loss": 2.9833, "step": 164000 }, { "epoch": 8.87, "grad_norm": 0.2560078501701355, "learning_rate": 0.0006089109202165215, "loss": 2.9869, "step": 165000 }, { "epoch": 8.93, "grad_norm": 0.2425604909658432, "learning_rate": 0.0006059690515415393, "loss": 2.9856, "step": 166000 }, { "epoch": 8.98, "grad_norm": 0.22484102845191956, "learning_rate": 0.0006030271828665569, "loss": 2.9869, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.40730411942650524, "eval_loss": 3.3864247798919678, "eval_runtime": 154.1912, "eval_samples_per_second": 375.618, "eval_steps_per_second": 5.869, "step": 167364 }, { "epoch": 9.03, "grad_norm": 0.24037963151931763, "learning_rate": 0.0006000882560602495, "loss": 2.9434, "step": 168000 }, { "epoch": 9.09, "grad_norm": 0.23906952142715454, "learning_rate": 0.0005971463873852672, "loss": 2.9315, "step": 169000 }, { "epoch": 9.14, "grad_norm": 0.2632203698158264, "learning_rate": 0.0005942045187102848, "loss": 2.9304, "step": 170000 }, { "epoch": 9.2, "grad_norm": 0.2545742392539978, "learning_rate": 0.0005912655919039774, "loss": 2.9354, "step": 171000 }, { "epoch": 9.25, "grad_norm": 0.2435782551765442, "learning_rate": 0.0005883237232289951, "loss": 2.937, "step": 172000 }, { "epoch": 9.3, "grad_norm": 0.25050055980682373, "learning_rate": 0.0005853847964226877, "loss": 2.9435, "step": 173000 }, { "epoch": 9.36, "grad_norm": 0.30542534589767456, "learning_rate": 0.0005824429277477053, "loss": 2.9454, "step": 174000 }, { "epoch": 9.41, "grad_norm": 0.23577359318733215, "learning_rate": 0.000579501059072723, "loss": 2.9473, "step": 175000 }, { "epoch": 9.46, "grad_norm": 0.2357962429523468, "learning_rate": 0.0005765591903977406, "loss": 2.9438, "step": 176000 }, { "epoch": 9.52, "grad_norm": 0.27557680010795593, "learning_rate": 0.0005736202635914332, "loss": 2.9519, "step": 177000 }, { "epoch": 9.57, "grad_norm": 0.2616015672683716, "learning_rate": 0.0005706813367851259, "loss": 2.9517, "step": 178000 }, { "epoch": 9.63, "grad_norm": 0.2544803321361542, "learning_rate": 0.0005677394681101436, "loss": 2.9528, "step": 179000 }, { "epoch": 9.68, "grad_norm": 0.2523456811904907, "learning_rate": 0.0005647975994351611, "loss": 2.9551, "step": 180000 }, { "epoch": 9.73, "grad_norm": 0.24971884489059448, "learning_rate": 0.0005618557307601789, "loss": 2.9546, "step": 181000 }, { "epoch": 9.79, "grad_norm": 0.24730221927165985, "learning_rate": 0.0005589138620851966, "loss": 2.9569, "step": 182000 }, { "epoch": 9.84, "grad_norm": 0.230006605386734, "learning_rate": 0.0005559749352788892, "loss": 2.9582, "step": 183000 }, { "epoch": 9.89, "grad_norm": 0.22560349106788635, "learning_rate": 0.0005530389503412567, "loss": 2.9606, "step": 184000 }, { "epoch": 9.95, "grad_norm": 0.28621554374694824, "learning_rate": 0.0005500970816662745, "loss": 2.9651, "step": 185000 }, { "epoch": 10.0, "eval_accuracy": 0.4079115901065721, "eval_loss": 3.361621856689453, "eval_runtime": 154.014, "eval_samples_per_second": 376.05, "eval_steps_per_second": 5.876, "step": 185960 }, { "epoch": 10.0, "grad_norm": 0.2537492513656616, "learning_rate": 0.0005471552129912922, "loss": 2.9625, "step": 186000 }, { "epoch": 10.06, "grad_norm": 0.23095977306365967, "learning_rate": 0.0005442133443163097, "loss": 2.901, "step": 187000 }, { "epoch": 10.11, "grad_norm": 0.2696865200996399, "learning_rate": 0.0005412714756413274, "loss": 2.9027, "step": 188000 }, { "epoch": 10.16, "grad_norm": 0.26662537455558777, "learning_rate": 0.0005383325488350201, "loss": 2.912, "step": 189000 }, { "epoch": 10.22, "grad_norm": 0.24653853476047516, "learning_rate": 0.0005353906801600376, "loss": 2.9081, "step": 190000 }, { "epoch": 10.27, "grad_norm": 0.2434544712305069, "learning_rate": 0.0005324488114850553, "loss": 2.9184, "step": 191000 }, { "epoch": 10.32, "grad_norm": 0.25054100155830383, "learning_rate": 0.000529506942810073, "loss": 2.9183, "step": 192000 }, { "epoch": 10.38, "grad_norm": 0.27828437089920044, "learning_rate": 0.0005265650741350905, "loss": 2.9214, "step": 193000 }, { "epoch": 10.43, "grad_norm": 0.2823689579963684, "learning_rate": 0.0005236290891974582, "loss": 2.9229, "step": 194000 }, { "epoch": 10.49, "grad_norm": 0.26634788513183594, "learning_rate": 0.0005206872205224759, "loss": 2.9256, "step": 195000 }, { "epoch": 10.54, "grad_norm": 0.314277708530426, "learning_rate": 0.0005177453518474935, "loss": 2.9326, "step": 196000 }, { "epoch": 10.59, "grad_norm": 0.24600285291671753, "learning_rate": 0.0005148034831725112, "loss": 2.9236, "step": 197000 }, { "epoch": 10.65, "grad_norm": 0.2303924709558487, "learning_rate": 0.0005118645563662038, "loss": 2.9316, "step": 198000 }, { "epoch": 10.7, "grad_norm": 0.23870837688446045, "learning_rate": 0.0005089226876912215, "loss": 2.9352, "step": 199000 }, { "epoch": 10.76, "grad_norm": 0.2509617805480957, "learning_rate": 0.0005059808190162391, "loss": 2.9302, "step": 200000 }, { "epoch": 10.81, "grad_norm": 0.28766971826553345, "learning_rate": 0.0005030389503412568, "loss": 2.9309, "step": 201000 }, { "epoch": 10.86, "grad_norm": 0.26924803853034973, "learning_rate": 0.0005001000235349494, "loss": 2.9381, "step": 202000 }, { "epoch": 10.92, "grad_norm": 0.26498013734817505, "learning_rate": 0.0004971581548599671, "loss": 2.9364, "step": 203000 }, { "epoch": 10.97, "grad_norm": 0.2591700255870819, "learning_rate": 0.0004942162861849847, "loss": 2.9407, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.40898988079613996, "eval_loss": 3.3727080821990967, "eval_runtime": 154.0532, "eval_samples_per_second": 375.955, "eval_steps_per_second": 5.875, "step": 204556 }, { "epoch": 11.02, "grad_norm": 0.2849581837654114, "learning_rate": 0.0004912744175100024, "loss": 2.9093, "step": 205000 }, { "epoch": 11.08, "grad_norm": 0.26025569438934326, "learning_rate": 0.00048833843257237, "loss": 2.8745, "step": 206000 }, { "epoch": 11.13, "grad_norm": 0.2565370202064514, "learning_rate": 0.00048539656389738763, "loss": 2.8846, "step": 207000 }, { "epoch": 11.19, "grad_norm": 0.29611414670944214, "learning_rate": 0.00048245763709108024, "loss": 2.8862, "step": 208000 }, { "epoch": 11.24, "grad_norm": 0.23962269723415375, "learning_rate": 0.0004795157684160979, "loss": 2.8923, "step": 209000 }, { "epoch": 11.29, "grad_norm": 0.23374328017234802, "learning_rate": 0.0004765768416097905, "loss": 2.895, "step": 210000 }, { "epoch": 11.35, "grad_norm": 0.2842772305011749, "learning_rate": 0.0004736349729348082, "loss": 2.8978, "step": 211000 }, { "epoch": 11.4, "grad_norm": 0.2555365562438965, "learning_rate": 0.0004706960461285008, "loss": 2.8982, "step": 212000 }, { "epoch": 11.45, "grad_norm": 0.26069942116737366, "learning_rate": 0.0004677541774535185, "loss": 2.9035, "step": 213000 }, { "epoch": 11.51, "grad_norm": 0.2919773459434509, "learning_rate": 0.00046481230877853615, "loss": 2.9057, "step": 214000 }, { "epoch": 11.56, "grad_norm": 0.26192766427993774, "learning_rate": 0.0004618704401035538, "loss": 2.9039, "step": 215000 }, { "epoch": 11.62, "grad_norm": 0.2489086240530014, "learning_rate": 0.0004589285714285714, "loss": 2.9093, "step": 216000 }, { "epoch": 11.67, "grad_norm": 0.26663750410079956, "learning_rate": 0.00045598964462226406, "loss": 2.9134, "step": 217000 }, { "epoch": 11.72, "grad_norm": 0.26307132840156555, "learning_rate": 0.00045304777594728173, "loss": 2.9088, "step": 218000 }, { "epoch": 11.78, "grad_norm": 0.2765272259712219, "learning_rate": 0.00045010884914097434, "loss": 2.9124, "step": 219000 }, { "epoch": 11.83, "grad_norm": 0.2534734010696411, "learning_rate": 0.000447166980465992, "loss": 2.9139, "step": 220000 }, { "epoch": 11.88, "grad_norm": 0.2891595661640167, "learning_rate": 0.0004442251117910097, "loss": 2.9135, "step": 221000 }, { "epoch": 11.94, "grad_norm": 0.27639132738113403, "learning_rate": 0.0004412861849847023, "loss": 2.9144, "step": 222000 }, { "epoch": 11.99, "grad_norm": 0.2949223518371582, "learning_rate": 0.0004383443163097199, "loss": 2.9186, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.4090768830667313, "eval_loss": 3.386943817138672, "eval_runtime": 154.0531, "eval_samples_per_second": 375.955, "eval_steps_per_second": 5.875, "step": 223152 }, { "epoch": 12.05, "grad_norm": 0.2919670641422272, "learning_rate": 0.0004354053895034126, "loss": 2.8624, "step": 224000 }, { "epoch": 12.1, "grad_norm": 0.2778524160385132, "learning_rate": 0.0004324664626971052, "loss": 2.8607, "step": 225000 }, { "epoch": 12.15, "grad_norm": 0.2688823640346527, "learning_rate": 0.00042952459402212286, "loss": 2.8637, "step": 226000 }, { "epoch": 12.21, "grad_norm": 0.26661086082458496, "learning_rate": 0.0004265827253471405, "loss": 2.8716, "step": 227000 }, { "epoch": 12.26, "grad_norm": 0.2500409483909607, "learning_rate": 0.00042364085667215815, "loss": 2.8719, "step": 228000 }, { "epoch": 12.31, "grad_norm": 0.24592752754688263, "learning_rate": 0.0004207019298658508, "loss": 2.8779, "step": 229000 }, { "epoch": 12.37, "grad_norm": 0.2966313064098358, "learning_rate": 0.00041776006119086844, "loss": 2.8773, "step": 230000 }, { "epoch": 12.42, "grad_norm": 0.2876270115375519, "learning_rate": 0.0004148181925158861, "loss": 2.8777, "step": 231000 }, { "epoch": 12.48, "grad_norm": 0.25883278250694275, "learning_rate": 0.0004118792657095787, "loss": 2.8838, "step": 232000 }, { "epoch": 12.53, "grad_norm": 0.25680166482925415, "learning_rate": 0.0004089373970345964, "loss": 2.8847, "step": 233000 }, { "epoch": 12.58, "grad_norm": 0.28966328501701355, "learning_rate": 0.000405998470228289, "loss": 2.8847, "step": 234000 }, { "epoch": 12.64, "grad_norm": 0.2587486803531647, "learning_rate": 0.0004030566015533066, "loss": 2.8868, "step": 235000 }, { "epoch": 12.69, "grad_norm": 0.2806561291217804, "learning_rate": 0.00040011473287832435, "loss": 2.8887, "step": 236000 }, { "epoch": 12.74, "grad_norm": 0.25261130928993225, "learning_rate": 0.00039717580607201696, "loss": 2.8888, "step": 237000 }, { "epoch": 12.8, "grad_norm": 0.2473638653755188, "learning_rate": 0.00039423393739703463, "loss": 2.8931, "step": 238000 }, { "epoch": 12.85, "grad_norm": 0.2695077061653137, "learning_rate": 0.00039129206872205225, "loss": 2.8937, "step": 239000 }, { "epoch": 12.91, "grad_norm": 0.26754283905029297, "learning_rate": 0.00038835020004706987, "loss": 2.8929, "step": 240000 }, { "epoch": 12.96, "grad_norm": 0.2615933120250702, "learning_rate": 0.00038541127324076254, "loss": 2.8974, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.4096338991481772, "eval_loss": 3.3688578605651855, "eval_runtime": 154.4918, "eval_samples_per_second": 374.887, "eval_steps_per_second": 5.858, "step": 241748 }, { "epoch": 13.01, "grad_norm": 0.2606102526187897, "learning_rate": 0.00038246940456578016, "loss": 2.8775, "step": 242000 }, { "epoch": 13.07, "grad_norm": 0.27312567830085754, "learning_rate": 0.0003795275358907979, "loss": 2.837, "step": 243000 }, { "epoch": 13.12, "grad_norm": 0.2866615056991577, "learning_rate": 0.0003765886090844905, "loss": 2.8441, "step": 244000 }, { "epoch": 13.17, "grad_norm": 0.26024916768074036, "learning_rate": 0.0003736467404095081, "loss": 2.8453, "step": 245000 }, { "epoch": 13.23, "grad_norm": 0.3037625551223755, "learning_rate": 0.0003707107554718757, "loss": 2.849, "step": 246000 }, { "epoch": 13.28, "grad_norm": 0.25507059693336487, "learning_rate": 0.0003677688867968934, "loss": 2.8511, "step": 247000 }, { "epoch": 13.34, "grad_norm": 0.26291272044181824, "learning_rate": 0.00036482701812191106, "loss": 2.8608, "step": 248000 }, { "epoch": 13.39, "grad_norm": 0.2695772349834442, "learning_rate": 0.0003618851494469287, "loss": 2.8596, "step": 249000 }, { "epoch": 13.44, "grad_norm": 0.2734662592411041, "learning_rate": 0.00035894622264062134, "loss": 2.8626, "step": 250000 }, { "epoch": 13.5, "grad_norm": 0.34274160861968994, "learning_rate": 0.00035600435396563896, "loss": 2.8624, "step": 251000 }, { "epoch": 13.55, "grad_norm": 0.2600579857826233, "learning_rate": 0.00035306248529065663, "loss": 2.8641, "step": 252000 }, { "epoch": 13.61, "grad_norm": 0.27528443932533264, "learning_rate": 0.0003501235584843493, "loss": 2.8663, "step": 253000 }, { "epoch": 13.66, "grad_norm": 0.26855769753456116, "learning_rate": 0.0003471816898093669, "loss": 2.872, "step": 254000 }, { "epoch": 13.71, "grad_norm": 0.26368248462677, "learning_rate": 0.0003442427630030596, "loss": 2.8674, "step": 255000 }, { "epoch": 13.77, "grad_norm": 0.27387478947639465, "learning_rate": 0.0003413008943280772, "loss": 2.8665, "step": 256000 }, { "epoch": 13.82, "grad_norm": 0.30804863572120667, "learning_rate": 0.0003383590256530948, "loss": 2.8721, "step": 257000 }, { "epoch": 13.87, "grad_norm": 0.3087449371814728, "learning_rate": 0.0003354200988467875, "loss": 2.8765, "step": 258000 }, { "epoch": 13.93, "grad_norm": 0.26736271381378174, "learning_rate": 0.0003324782301718051, "loss": 2.8768, "step": 259000 }, { "epoch": 13.98, "grad_norm": 0.3191315233707428, "learning_rate": 0.00032953636149682283, "loss": 2.8733, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.40978962985337464, "eval_loss": 3.381140947341919, "eval_runtime": 153.947, "eval_samples_per_second": 376.214, "eval_steps_per_second": 5.879, "step": 260344 }, { "epoch": 14.04, "grad_norm": 0.2935682237148285, "learning_rate": 0.00032659743469051544, "loss": 2.8349, "step": 261000 }, { "epoch": 14.09, "grad_norm": 0.32009586691856384, "learning_rate": 0.0003236555660155331, "loss": 2.8245, "step": 262000 }, { "epoch": 14.14, "grad_norm": 0.2821045219898224, "learning_rate": 0.00032071369734055073, "loss": 2.8243, "step": 263000 }, { "epoch": 14.2, "grad_norm": 0.2877224087715149, "learning_rate": 0.00031777477053424334, "loss": 2.8294, "step": 264000 }, { "epoch": 14.25, "grad_norm": 0.28222963213920593, "learning_rate": 0.000314835843727936, "loss": 2.8345, "step": 265000 }, { "epoch": 14.3, "grad_norm": 0.2666056156158447, "learning_rate": 0.0003118939750529536, "loss": 2.8345, "step": 266000 }, { "epoch": 14.36, "grad_norm": 0.26124948263168335, "learning_rate": 0.0003089521063779713, "loss": 2.8419, "step": 267000 }, { "epoch": 14.41, "grad_norm": 0.3096939027309418, "learning_rate": 0.00030601023770298897, "loss": 2.8415, "step": 268000 }, { "epoch": 14.47, "grad_norm": 0.27581918239593506, "learning_rate": 0.0003030683690280066, "loss": 2.8435, "step": 269000 }, { "epoch": 14.52, "grad_norm": 0.29562026262283325, "learning_rate": 0.00030012944222169925, "loss": 2.8451, "step": 270000 }, { "epoch": 14.57, "grad_norm": 0.30838248133659363, "learning_rate": 0.0002971875735467169, "loss": 2.8491, "step": 271000 }, { "epoch": 14.63, "grad_norm": 0.2891804873943329, "learning_rate": 0.00029424864674040954, "loss": 2.8485, "step": 272000 }, { "epoch": 14.68, "grad_norm": 0.2642837166786194, "learning_rate": 0.00029130677806542716, "loss": 2.8478, "step": 273000 }, { "epoch": 14.73, "grad_norm": 0.3049638271331787, "learning_rate": 0.0002883649093904448, "loss": 2.8519, "step": 274000 }, { "epoch": 14.79, "grad_norm": 0.26046690344810486, "learning_rate": 0.0002854230407154625, "loss": 2.8514, "step": 275000 }, { "epoch": 14.84, "grad_norm": 0.2902365028858185, "learning_rate": 0.0002824841139091551, "loss": 2.8505, "step": 276000 }, { "epoch": 14.9, "grad_norm": 0.2648141384124756, "learning_rate": 0.0002795422452341728, "loss": 2.8525, "step": 277000 }, { "epoch": 14.95, "grad_norm": 0.27711859345436096, "learning_rate": 0.0002766003765591904, "loss": 2.8547, "step": 278000 }, { "epoch": 15.0, "eval_accuracy": 0.41021328724206096, "eval_loss": 3.3861238956451416, "eval_runtime": 154.0149, "eval_samples_per_second": 376.048, "eval_steps_per_second": 5.876, "step": 278940 }, { "epoch": 15.0, "grad_norm": 0.27470868825912476, "learning_rate": 0.000273658507884208, "loss": 2.8524, "step": 279000 }, { "epoch": 15.06, "grad_norm": 0.29804325103759766, "learning_rate": 0.0002707225229465757, "loss": 2.8018, "step": 280000 }, { "epoch": 15.11, "grad_norm": 0.3048715591430664, "learning_rate": 0.0002677806542715933, "loss": 2.8103, "step": 281000 }, { "epoch": 15.16, "grad_norm": 0.2895219624042511, "learning_rate": 0.00026483878559661097, "loss": 2.812, "step": 282000 }, { "epoch": 15.22, "grad_norm": 0.31599316000938416, "learning_rate": 0.00026189691692162865, "loss": 2.82, "step": 283000 }, { "epoch": 15.27, "grad_norm": 0.2961556315422058, "learning_rate": 0.00025895504824664626, "loss": 2.8214, "step": 284000 }, { "epoch": 15.33, "grad_norm": 0.3021540343761444, "learning_rate": 0.00025601612144033893, "loss": 2.8174, "step": 285000 }, { "epoch": 15.38, "grad_norm": 0.2883637845516205, "learning_rate": 0.00025307425276535655, "loss": 2.8197, "step": 286000 }, { "epoch": 15.43, "grad_norm": 0.2952839136123657, "learning_rate": 0.0002501323840903742, "loss": 2.8187, "step": 287000 }, { "epoch": 15.49, "grad_norm": 0.2684228718280792, "learning_rate": 0.00024719051541539184, "loss": 2.826, "step": 288000 }, { "epoch": 15.54, "grad_norm": 0.31188449263572693, "learning_rate": 0.0002442515886090845, "loss": 2.8287, "step": 289000 }, { "epoch": 15.59, "grad_norm": 0.2904526889324188, "learning_rate": 0.00024130971993410215, "loss": 2.8232, "step": 290000 }, { "epoch": 15.65, "grad_norm": 0.31149864196777344, "learning_rate": 0.0002383678512591198, "loss": 2.8305, "step": 291000 }, { "epoch": 15.7, "grad_norm": 0.30907660722732544, "learning_rate": 0.00023542892445281243, "loss": 2.8319, "step": 292000 }, { "epoch": 15.76, "grad_norm": 0.2970659136772156, "learning_rate": 0.00023248705577783008, "loss": 2.8319, "step": 293000 }, { "epoch": 15.81, "grad_norm": 0.32185524702072144, "learning_rate": 0.00022954812897152272, "loss": 2.8334, "step": 294000 }, { "epoch": 15.86, "grad_norm": 0.2747587561607361, "learning_rate": 0.00022660626029654036, "loss": 2.8351, "step": 295000 }, { "epoch": 15.92, "grad_norm": 0.32215383648872375, "learning_rate": 0.000223664391621558, "loss": 2.8343, "step": 296000 }, { "epoch": 15.97, "grad_norm": 0.30099472403526306, "learning_rate": 0.00022072252294657566, "loss": 2.8352, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.4099246009434271, "eval_loss": 3.3998661041259766, "eval_runtime": 154.387, "eval_samples_per_second": 375.142, "eval_steps_per_second": 5.862, "step": 297536 }, { "epoch": 16.02, "grad_norm": 0.3042096495628357, "learning_rate": 0.0002177835961402683, "loss": 2.8115, "step": 298000 }, { "epoch": 16.08, "grad_norm": 0.29497480392456055, "learning_rate": 0.00021484172746528597, "loss": 2.7915, "step": 299000 }, { "epoch": 16.13, "grad_norm": 0.2896369993686676, "learning_rate": 0.0002118998587903036, "loss": 2.7949, "step": 300000 }, { "epoch": 16.19, "grad_norm": 0.3014323115348816, "learning_rate": 0.00020896093198399625, "loss": 2.7968, "step": 301000 }, { "epoch": 16.24, "grad_norm": 0.2915323078632355, "learning_rate": 0.00020601906330901387, "loss": 2.7961, "step": 302000 }, { "epoch": 16.29, "grad_norm": 0.28494659066200256, "learning_rate": 0.00020307719463403154, "loss": 2.7993, "step": 303000 }, { "epoch": 16.35, "grad_norm": 0.2689177095890045, "learning_rate": 0.00020013826782772418, "loss": 2.8054, "step": 304000 }, { "epoch": 16.4, "grad_norm": 0.290145605802536, "learning_rate": 0.00019719639915274182, "loss": 2.808, "step": 305000 }, { "epoch": 16.46, "grad_norm": 0.306331992149353, "learning_rate": 0.00019425747234643446, "loss": 2.8077, "step": 306000 }, { "epoch": 16.51, "grad_norm": 0.28849080204963684, "learning_rate": 0.0001913156036714521, "loss": 2.808, "step": 307000 }, { "epoch": 16.56, "grad_norm": 0.33114251494407654, "learning_rate": 0.00018837373499646975, "loss": 2.8096, "step": 308000 }, { "epoch": 16.62, "grad_norm": 0.32337045669555664, "learning_rate": 0.0001854348081901624, "loss": 2.8101, "step": 309000 }, { "epoch": 16.67, "grad_norm": 0.3087981939315796, "learning_rate": 0.00018249293951518004, "loss": 2.8112, "step": 310000 }, { "epoch": 16.72, "grad_norm": 0.2813110947608948, "learning_rate": 0.00017955401270887267, "loss": 2.8153, "step": 311000 }, { "epoch": 16.78, "grad_norm": 0.28998225927352905, "learning_rate": 0.00017661214403389035, "loss": 2.815, "step": 312000 }, { "epoch": 16.83, "grad_norm": 0.29306545853614807, "learning_rate": 0.00017367321722758298, "loss": 2.8161, "step": 313000 }, { "epoch": 16.89, "grad_norm": 0.3053112328052521, "learning_rate": 0.0001707313485526006, "loss": 2.8119, "step": 314000 }, { "epoch": 16.94, "grad_norm": 0.3361365795135498, "learning_rate": 0.00016778947987761828, "loss": 2.8147, "step": 315000 }, { "epoch": 16.99, "grad_norm": 0.30090630054473877, "learning_rate": 0.00016485349493998588, "loss": 2.8173, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.4103711006270949, "eval_loss": 3.3992600440979004, "eval_runtime": 154.0327, "eval_samples_per_second": 376.005, "eval_steps_per_second": 5.875, "step": 316132 }, { "epoch": 17.05, "grad_norm": 0.28460776805877686, "learning_rate": 0.00016191162626500352, "loss": 2.7809, "step": 317000 }, { "epoch": 17.1, "grad_norm": 0.2945604920387268, "learning_rate": 0.0001589697575900212, "loss": 2.7761, "step": 318000 }, { "epoch": 17.15, "grad_norm": 0.2811717987060547, "learning_rate": 0.00015603083078371383, "loss": 2.7791, "step": 319000 }, { "epoch": 17.21, "grad_norm": 0.28703057765960693, "learning_rate": 0.00015308896210873148, "loss": 2.785, "step": 320000 }, { "epoch": 17.26, "grad_norm": 0.2974686622619629, "learning_rate": 0.00015014709343374912, "loss": 2.7862, "step": 321000 }, { "epoch": 17.32, "grad_norm": 0.29753968119621277, "learning_rate": 0.00014720522475876677, "loss": 2.7875, "step": 322000 }, { "epoch": 17.37, "grad_norm": 0.29779624938964844, "learning_rate": 0.0001442662979524594, "loss": 2.7876, "step": 323000 }, { "epoch": 17.42, "grad_norm": 0.281829833984375, "learning_rate": 0.00014132442927747705, "loss": 2.7909, "step": 324000 }, { "epoch": 17.48, "grad_norm": 0.3030970096588135, "learning_rate": 0.0001383825606024947, "loss": 2.7883, "step": 325000 }, { "epoch": 17.53, "grad_norm": 0.28257811069488525, "learning_rate": 0.00013544363379618734, "loss": 2.793, "step": 326000 }, { "epoch": 17.58, "grad_norm": 0.3062205910682678, "learning_rate": 0.00013250176512120498, "loss": 2.7923, "step": 327000 }, { "epoch": 17.64, "grad_norm": 0.29610031843185425, "learning_rate": 0.00012956283831489762, "loss": 2.7947, "step": 328000 }, { "epoch": 17.69, "grad_norm": 0.31379351019859314, "learning_rate": 0.0001266209696399153, "loss": 2.793, "step": 329000 }, { "epoch": 17.75, "grad_norm": 0.29375967383384705, "learning_rate": 0.00012367910096493294, "loss": 2.7971, "step": 330000 }, { "epoch": 17.8, "grad_norm": 0.30711761116981506, "learning_rate": 0.00012073723228995057, "loss": 2.7963, "step": 331000 }, { "epoch": 17.85, "grad_norm": 0.29258862137794495, "learning_rate": 0.00011780124735231819, "loss": 2.7937, "step": 332000 }, { "epoch": 17.91, "grad_norm": 0.2702758014202118, "learning_rate": 0.00011485937867733585, "loss": 2.7942, "step": 333000 }, { "epoch": 17.96, "grad_norm": 0.2748437523841858, "learning_rate": 0.00011191751000235349, "loss": 2.8005, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.4099377016714312, "eval_loss": 3.4154224395751953, "eval_runtime": 154.4547, "eval_samples_per_second": 374.977, "eval_steps_per_second": 5.859, "step": 334728 }, { "epoch": 18.01, "grad_norm": 0.29431185126304626, "learning_rate": 0.00010897858319604613, "loss": 2.7895, "step": 335000 }, { "epoch": 18.07, "grad_norm": 0.3129688799381256, "learning_rate": 0.00010603965638973876, "loss": 2.7669, "step": 336000 }, { "epoch": 18.12, "grad_norm": 0.29429852962493896, "learning_rate": 0.00010309778771475643, "loss": 2.7668, "step": 337000 }, { "epoch": 18.18, "grad_norm": 0.30273085832595825, "learning_rate": 0.00010015591903977407, "loss": 2.7695, "step": 338000 }, { "epoch": 18.23, "grad_norm": 0.3032689392566681, "learning_rate": 9.72169922334667e-05, "loss": 2.7712, "step": 339000 }, { "epoch": 18.28, "grad_norm": 0.3074301779270172, "learning_rate": 9.427512355848434e-05, "loss": 2.7691, "step": 340000 }, { "epoch": 18.34, "grad_norm": 0.30292806029319763, "learning_rate": 9.1333254883502e-05, "loss": 2.7725, "step": 341000 }, { "epoch": 18.39, "grad_norm": 0.31416478753089905, "learning_rate": 8.839432807719464e-05, "loss": 2.7718, "step": 342000 }, { "epoch": 18.44, "grad_norm": 0.3108297884464264, "learning_rate": 8.545245940221228e-05, "loss": 2.7724, "step": 343000 }, { "epoch": 18.5, "grad_norm": 0.31622520089149475, "learning_rate": 8.251059072722994e-05, "loss": 2.7771, "step": 344000 }, { "epoch": 18.55, "grad_norm": 0.2999779284000397, "learning_rate": 7.956872205224759e-05, "loss": 2.7739, "step": 345000 }, { "epoch": 18.61, "grad_norm": 0.28529179096221924, "learning_rate": 7.662685337726524e-05, "loss": 2.7776, "step": 346000 }, { "epoch": 18.66, "grad_norm": 0.3071877956390381, "learning_rate": 7.368792657095789e-05, "loss": 2.7764, "step": 347000 }, { "epoch": 18.71, "grad_norm": 0.3046530783176422, "learning_rate": 7.074605789597552e-05, "loss": 2.776, "step": 348000 }, { "epoch": 18.77, "grad_norm": 0.3025725781917572, "learning_rate": 6.780418922099317e-05, "loss": 2.7787, "step": 349000 }, { "epoch": 18.82, "grad_norm": 0.3011409044265747, "learning_rate": 6.48652624146858e-05, "loss": 2.7794, "step": 350000 }, { "epoch": 18.88, "grad_norm": 0.2958693206310272, "learning_rate": 6.192339373970346e-05, "loss": 2.7778, "step": 351000 }, { "epoch": 18.93, "grad_norm": 0.3003888726234436, "learning_rate": 5.898152506472111e-05, "loss": 2.7786, "step": 352000 }, { "epoch": 18.98, "grad_norm": 0.29960742592811584, "learning_rate": 5.604554012708873e-05, "loss": 2.7796, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.41019803665099974, "eval_loss": 3.4155313968658447, "eval_runtime": 154.2799, "eval_samples_per_second": 375.402, "eval_steps_per_second": 5.866, "step": 353324 }, { "epoch": 19.04, "grad_norm": 0.29620590806007385, "learning_rate": 5.310367145210638e-05, "loss": 2.7661, "step": 354000 }, { "epoch": 19.09, "grad_norm": 0.3027678430080414, "learning_rate": 5.016180277712403e-05, "loss": 2.7561, "step": 355000 }, { "epoch": 19.14, "grad_norm": 0.31339025497436523, "learning_rate": 4.721993410214168e-05, "loss": 2.757, "step": 356000 }, { "epoch": 19.2, "grad_norm": 0.31296485662460327, "learning_rate": 4.4278065427159335e-05, "loss": 2.7561, "step": 357000 }, { "epoch": 19.25, "grad_norm": 0.3046360909938812, "learning_rate": 4.1339138620851965e-05, "loss": 2.7601, "step": 358000 }, { "epoch": 19.31, "grad_norm": 0.3128166198730469, "learning_rate": 3.839726994586962e-05, "loss": 2.7618, "step": 359000 }, { "epoch": 19.36, "grad_norm": 0.3523208200931549, "learning_rate": 3.545540127088727e-05, "loss": 2.7576, "step": 360000 }, { "epoch": 19.41, "grad_norm": 0.30089735984802246, "learning_rate": 3.25164744645799e-05, "loss": 2.7602, "step": 361000 }, { "epoch": 19.47, "grad_norm": 0.3562370538711548, "learning_rate": 2.957460578959755e-05, "loss": 2.7599, "step": 362000 }, { "epoch": 19.52, "grad_norm": 0.33757689595222473, "learning_rate": 2.6635678983290187e-05, "loss": 2.7591, "step": 363000 }, { "epoch": 19.57, "grad_norm": 0.313703715801239, "learning_rate": 2.3693810308307837e-05, "loss": 2.7621, "step": 364000 }, { "epoch": 19.63, "grad_norm": 0.31429997086524963, "learning_rate": 2.075488350200047e-05, "loss": 2.7585, "step": 365000 }, { "epoch": 19.68, "grad_norm": 0.29419606924057007, "learning_rate": 1.7813014827018123e-05, "loss": 2.7598, "step": 366000 }, { "epoch": 19.74, "grad_norm": 0.28869205713272095, "learning_rate": 1.4874088020710754e-05, "loss": 2.7631, "step": 367000 }, { "epoch": 19.79, "grad_norm": 0.2999878227710724, "learning_rate": 1.1932219345728408e-05, "loss": 2.7615, "step": 368000 }, { "epoch": 19.84, "grad_norm": 0.31875109672546387, "learning_rate": 8.990350670746058e-06, "loss": 2.7607, "step": 369000 }, { "epoch": 19.9, "grad_norm": 0.2959778308868408, "learning_rate": 6.051423864438692e-06, "loss": 2.7631, "step": 370000 }, { "epoch": 19.95, "grad_norm": 0.3109806478023529, "learning_rate": 3.1095551894563426e-06, "loss": 2.7606, "step": 371000 }, { "epoch": 20.0, "eval_accuracy": 0.40996444059320364, "eval_loss": 3.424755573272705, "eval_runtime": 154.1805, "eval_samples_per_second": 375.644, "eval_steps_per_second": 5.87, "step": 371920 }, { "epoch": 20.0, "step": 371920, "total_flos": 1.5670047538944e+18, "train_loss": 3.0296430246833577, "train_runtime": 81436.2617, "train_samples_per_second": 146.143, "train_steps_per_second": 4.567 } ], "logging_steps": 1000, "max_steps": 371920, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.5670047538944e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }