{ "best_metric": 0.7176774740219116, "best_model_checkpoint": "./results/checkpoint-975000", "epoch": 5.0, "eval_steps": 5000, "global_step": 975040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005127994748933377, "grad_norm": 0.901538610458374, "learning_rate": 0.0007991795208401707, "loss": 0.9598, "step": 1000 }, { "epoch": 0.010255989497866755, "grad_norm": 0.8623629808425903, "learning_rate": 0.0007983590416803414, "loss": 0.9776, "step": 2000 }, { "epoch": 0.01538398424680013, "grad_norm": 0.7568956017494202, "learning_rate": 0.000797538562520512, "loss": 0.9766, "step": 3000 }, { "epoch": 0.02051197899573351, "grad_norm": 0.7210434675216675, "learning_rate": 0.0007967180833606827, "loss": 0.9764, "step": 4000 }, { "epoch": 0.025639973744666887, "grad_norm": 0.8768877983093262, "learning_rate": 0.0007958984246800132, "loss": 0.9709, "step": 5000 }, { "epoch": 0.025639973744666887, "eval_accuracy": 0.7612099241863869, "eval_loss": 0.9169855713844299, "eval_runtime": 69.8508, "eval_samples_per_second": 7219.146, "eval_steps_per_second": 28.203, "step": 5000 }, { "epoch": 0.03076796849360026, "grad_norm": 0.9542640447616577, "learning_rate": 0.0007950779455201838, "loss": 0.969, "step": 6000 }, { "epoch": 0.03589596324253364, "grad_norm": 0.8376867771148682, "learning_rate": 0.0007942574663603544, "loss": 0.9755, "step": 7000 }, { "epoch": 0.04102395799146702, "grad_norm": 0.6857182383537292, "learning_rate": 0.000793437807679685, "loss": 0.9711, "step": 8000 }, { "epoch": 0.0461519527404004, "grad_norm": 0.7409297227859497, "learning_rate": 0.0007926173285198557, "loss": 0.9656, "step": 9000 }, { "epoch": 0.051279947489333774, "grad_norm": 0.7136521339416504, "learning_rate": 0.0007917968493600262, "loss": 0.9635, "step": 10000 }, { "epoch": 0.051279947489333774, "eval_accuracy": 0.7623323543468389, "eval_loss": 0.9147374033927917, "eval_runtime": 70.4732, "eval_samples_per_second": 7155.392, "eval_steps_per_second": 27.954, "step": 10000 }, { "epoch": 0.056407942238267145, "grad_norm": 0.8227306008338928, "learning_rate": 0.0007909771906793567, "loss": 0.9623, "step": 11000 }, { "epoch": 0.06153593698720052, "grad_norm": 0.6881102919578552, "learning_rate": 0.0007901567115195274, "loss": 0.9636, "step": 12000 }, { "epoch": 0.0666639317361339, "grad_norm": 0.7201479077339172, "learning_rate": 0.0007893362323596982, "loss": 0.956, "step": 13000 }, { "epoch": 0.07179192648506728, "grad_norm": 0.7269867062568665, "learning_rate": 0.0007885165736790286, "loss": 0.9592, "step": 14000 }, { "epoch": 0.07691992123400065, "grad_norm": 0.7410030961036682, "learning_rate": 0.0007876960945191992, "loss": 0.9518, "step": 15000 }, { "epoch": 0.07691992123400065, "eval_accuracy": 0.7646228257873372, "eval_loss": 0.9081013202667236, "eval_runtime": 70.3401, "eval_samples_per_second": 7168.926, "eval_steps_per_second": 28.007, "step": 15000 }, { "epoch": 0.08204791598293404, "grad_norm": 0.7281767129898071, "learning_rate": 0.0007868764358385297, "loss": 0.9504, "step": 16000 }, { "epoch": 0.08717591073186741, "grad_norm": 0.7674609422683716, "learning_rate": 0.0007860559566787004, "loss": 0.9557, "step": 17000 }, { "epoch": 0.0923039054808008, "grad_norm": 0.6836472749710083, "learning_rate": 0.0007852354775188711, "loss": 0.9489, "step": 18000 }, { "epoch": 0.09743190022973416, "grad_norm": 0.6684597730636597, "learning_rate": 0.0007844158188382016, "loss": 0.9474, "step": 19000 }, { "epoch": 0.10255989497866755, "grad_norm": 0.6993370652198792, "learning_rate": 0.0007835953396783721, "loss": 0.9472, "step": 20000 }, { "epoch": 0.10255989497866755, "eval_accuracy": 0.7656302366027251, "eval_loss": 0.9044137597084045, "eval_runtime": 69.9478, "eval_samples_per_second": 7209.128, "eval_steps_per_second": 28.164, "step": 20000 }, { "epoch": 0.10768788972760092, "grad_norm": 0.785712480545044, "learning_rate": 0.0007827748605185429, "loss": 0.945, "step": 21000 }, { "epoch": 0.11281588447653429, "grad_norm": 0.6483029127120972, "learning_rate": 0.0007819543813587136, "loss": 0.938, "step": 22000 }, { "epoch": 0.11794387922546767, "grad_norm": 0.7053022980690002, "learning_rate": 0.0007811363636363637, "loss": 0.9426, "step": 23000 }, { "epoch": 0.12307187397440104, "grad_norm": 0.699554979801178, "learning_rate": 0.0007803158844765343, "loss": 0.9335, "step": 24000 }, { "epoch": 0.12819986872333441, "grad_norm": 0.6407724618911743, "learning_rate": 0.000779495405316705, "loss": 0.9443, "step": 25000 }, { "epoch": 0.12819986872333441, "eval_accuracy": 0.765967362269292, "eval_loss": 0.906114935874939, "eval_runtime": 69.886, "eval_samples_per_second": 7215.51, "eval_steps_per_second": 28.189, "step": 25000 }, { "epoch": 0.1333278634722678, "grad_norm": 0.746428906917572, "learning_rate": 0.0007786749261568757, "loss": 0.9375, "step": 26000 }, { "epoch": 0.13845585822120118, "grad_norm": 0.7493112683296204, "learning_rate": 0.0007778544469970463, "loss": 0.9388, "step": 27000 }, { "epoch": 0.14358385297013457, "grad_norm": 0.7842503190040588, "learning_rate": 0.0007770339678372169, "loss": 0.9295, "step": 28000 }, { "epoch": 0.14871184771906792, "grad_norm": 0.6593005061149597, "learning_rate": 0.0007762143091565475, "loss": 0.9277, "step": 29000 }, { "epoch": 0.1538398424680013, "grad_norm": 0.6147266626358032, "learning_rate": 0.0007753946504758779, "loss": 0.93, "step": 30000 }, { "epoch": 0.1538398424680013, "eval_accuracy": 0.7651384297479689, "eval_loss": 0.9070658087730408, "eval_runtime": 70.0619, "eval_samples_per_second": 7197.396, "eval_steps_per_second": 28.118, "step": 30000 }, { "epoch": 0.1589678372169347, "grad_norm": 0.6877672672271729, "learning_rate": 0.0007745749917952084, "loss": 0.9313, "step": 31000 }, { "epoch": 0.16409583196586808, "grad_norm": 0.6153438091278076, "learning_rate": 0.0007737545126353791, "loss": 0.9303, "step": 32000 }, { "epoch": 0.16922382671480143, "grad_norm": 0.6031501889228821, "learning_rate": 0.0007729340334755498, "loss": 0.9249, "step": 33000 }, { "epoch": 0.17435182146373482, "grad_norm": 0.8115866184234619, "learning_rate": 0.0007721135543157204, "loss": 0.9267, "step": 34000 }, { "epoch": 0.1794798162126682, "grad_norm": 0.6294234395027161, "learning_rate": 0.0007712930751558911, "loss": 0.9206, "step": 35000 }, { "epoch": 0.1794798162126682, "eval_accuracy": 0.7679782177157555, "eval_loss": 0.89626145362854, "eval_runtime": 70.2701, "eval_samples_per_second": 7176.072, "eval_steps_per_second": 28.035, "step": 35000 }, { "epoch": 0.1846078109616016, "grad_norm": 0.7004373073577881, "learning_rate": 0.0007704725959960617, "loss": 0.9247, "step": 36000 }, { "epoch": 0.18973580571053494, "grad_norm": 0.7466434240341187, "learning_rate": 0.0007696529373153922, "loss": 0.9278, "step": 37000 }, { "epoch": 0.19486380045946833, "grad_norm": 0.7003448009490967, "learning_rate": 0.0007688332786347226, "loss": 0.9206, "step": 38000 }, { "epoch": 0.1999917952084017, "grad_norm": 0.6374254822731018, "learning_rate": 0.0007680127994748934, "loss": 0.9187, "step": 39000 }, { "epoch": 0.2051197899573351, "grad_norm": 0.6395047903060913, "learning_rate": 0.0007671923203150641, "loss": 0.9214, "step": 40000 }, { "epoch": 0.2051197899573351, "eval_accuracy": 0.7692791261702723, "eval_loss": 0.8910465836524963, "eval_runtime": 69.9786, "eval_samples_per_second": 7205.961, "eval_steps_per_second": 28.151, "step": 40000 }, { "epoch": 0.21024778470626845, "grad_norm": 0.6510487794876099, "learning_rate": 0.0007663726616343946, "loss": 0.9129, "step": 41000 }, { "epoch": 0.21537577945520184, "grad_norm": 0.5432180762290955, "learning_rate": 0.0007655521824745652, "loss": 0.9162, "step": 42000 }, { "epoch": 0.22050377420413522, "grad_norm": 0.5783429145812988, "learning_rate": 0.0007647317033147358, "loss": 0.9143, "step": 43000 }, { "epoch": 0.22563176895306858, "grad_norm": 0.6667906045913696, "learning_rate": 0.0007639128651132261, "loss": 0.9165, "step": 44000 }, { "epoch": 0.23075976370200196, "grad_norm": 0.5841776728630066, "learning_rate": 0.0007630923859533968, "loss": 0.912, "step": 45000 }, { "epoch": 0.23075976370200196, "eval_accuracy": 0.76871592799789, "eval_loss": 0.8913866877555847, "eval_runtime": 71.0385, "eval_samples_per_second": 7098.444, "eval_steps_per_second": 27.731, "step": 45000 }, { "epoch": 0.23588775845093535, "grad_norm": 0.5653632879257202, "learning_rate": 0.0007622719067935674, "loss": 0.9085, "step": 46000 }, { "epoch": 0.24101575319986873, "grad_norm": 0.6264407634735107, "learning_rate": 0.0007614514276337381, "loss": 0.912, "step": 47000 }, { "epoch": 0.2461437479488021, "grad_norm": 0.6177555322647095, "learning_rate": 0.0007606309484739088, "loss": 0.9135, "step": 48000 }, { "epoch": 0.2512717426977355, "grad_norm": 0.7154836654663086, "learning_rate": 0.0007598104693140794, "loss": 0.909, "step": 49000 }, { "epoch": 0.25639973744666883, "grad_norm": 0.5617414712905884, "learning_rate": 0.00075899081063341, "loss": 0.9113, "step": 50000 }, { "epoch": 0.25639973744666883, "eval_accuracy": 0.7718630952498994, "eval_loss": 0.8801304697990417, "eval_runtime": 69.6775, "eval_samples_per_second": 7237.097, "eval_steps_per_second": 28.273, "step": 50000 }, { "epoch": 0.2615277321956022, "grad_norm": 0.6990838646888733, "learning_rate": 0.0007581703314735806, "loss": 0.9116, "step": 51000 }, { "epoch": 0.2666557269445356, "grad_norm": 0.6637808680534363, "learning_rate": 0.0007573498523137513, "loss": 0.9054, "step": 52000 }, { "epoch": 0.271783721693469, "grad_norm": 0.6528953313827515, "learning_rate": 0.000756529373153922, "loss": 0.9088, "step": 53000 }, { "epoch": 0.27691171644240237, "grad_norm": 0.5775809288024902, "learning_rate": 0.0007557097144732524, "loss": 0.9065, "step": 54000 }, { "epoch": 0.28203971119133575, "grad_norm": 0.6081883311271667, "learning_rate": 0.0007548892353134231, "loss": 0.9035, "step": 55000 }, { "epoch": 0.28203971119133575, "eval_accuracy": 0.7723469697360306, "eval_loss": 0.8803244233131409, "eval_runtime": 70.1579, "eval_samples_per_second": 7187.544, "eval_steps_per_second": 28.08, "step": 55000 }, { "epoch": 0.28716770594026914, "grad_norm": 0.6076958179473877, "learning_rate": 0.0007540695766327536, "loss": 0.8979, "step": 56000 }, { "epoch": 0.2922957006892025, "grad_norm": 0.6458191871643066, "learning_rate": 0.0007532490974729242, "loss": 0.8985, "step": 57000 }, { "epoch": 0.29742369543813585, "grad_norm": 0.6102216243743896, "learning_rate": 0.0007524294387922547, "loss": 0.8997, "step": 58000 }, { "epoch": 0.30255169018706923, "grad_norm": 0.6574801206588745, "learning_rate": 0.0007516089596324254, "loss": 0.8976, "step": 59000 }, { "epoch": 0.3076796849360026, "grad_norm": 0.5695379972457886, "learning_rate": 0.000750788480472596, "loss": 0.9035, "step": 60000 }, { "epoch": 0.3076796849360026, "eval_accuracy": 0.7717183295224912, "eval_loss": 0.8797768354415894, "eval_runtime": 70.2414, "eval_samples_per_second": 7178.995, "eval_steps_per_second": 28.046, "step": 60000 }, { "epoch": 0.312807679684936, "grad_norm": 0.6494637131690979, "learning_rate": 0.0007499680013127667, "loss": 0.9002, "step": 61000 }, { "epoch": 0.3179356744338694, "grad_norm": 0.5798928141593933, "learning_rate": 0.0007491475221529373, "loss": 0.8956, "step": 62000 }, { "epoch": 0.32306366918280277, "grad_norm": 0.5499163269996643, "learning_rate": 0.000748327042993108, "loss": 0.8904, "step": 63000 }, { "epoch": 0.32819166393173616, "grad_norm": 0.5835821032524109, "learning_rate": 0.0007475073843124385, "loss": 0.894, "step": 64000 }, { "epoch": 0.3333196586806695, "grad_norm": 0.5534561276435852, "learning_rate": 0.0007466877256317691, "loss": 0.8898, "step": 65000 }, { "epoch": 0.3333196586806695, "eval_accuracy": 0.7719364696596815, "eval_loss": 0.882189929485321, "eval_runtime": 70.3107, "eval_samples_per_second": 7171.92, "eval_steps_per_second": 28.018, "step": 65000 }, { "epoch": 0.33844765342960287, "grad_norm": 0.6137602925300598, "learning_rate": 0.0007458672464719396, "loss": 0.892, "step": 66000 }, { "epoch": 0.34357564817853625, "grad_norm": 0.5671562552452087, "learning_rate": 0.0007450467673121103, "loss": 0.8924, "step": 67000 }, { "epoch": 0.34870364292746964, "grad_norm": 0.6657189130783081, "learning_rate": 0.0007442271086314408, "loss": 0.891, "step": 68000 }, { "epoch": 0.353831637676403, "grad_norm": 0.5254055857658386, "learning_rate": 0.0007434066294716115, "loss": 0.8925, "step": 69000 }, { "epoch": 0.3589596324253364, "grad_norm": 0.6411523818969727, "learning_rate": 0.0007425861503117821, "loss": 0.8874, "step": 70000 }, { "epoch": 0.3589596324253364, "eval_accuracy": 0.7748020378254998, "eval_loss": 0.87027508020401, "eval_runtime": 70.6336, "eval_samples_per_second": 7139.135, "eval_steps_per_second": 27.89, "step": 70000 }, { "epoch": 0.3640876271742698, "grad_norm": 0.5713903307914734, "learning_rate": 0.0007417664916311126, "loss": 0.889, "step": 71000 }, { "epoch": 0.3692156219232032, "grad_norm": 0.5263519883155823, "learning_rate": 0.0007409460124712833, "loss": 0.8912, "step": 72000 }, { "epoch": 0.3743436166721365, "grad_norm": 0.6226451992988586, "learning_rate": 0.0007401263537906138, "loss": 0.8873, "step": 73000 }, { "epoch": 0.3794716114210699, "grad_norm": 0.5685300827026367, "learning_rate": 0.0007393058746307845, "loss": 0.8883, "step": 74000 }, { "epoch": 0.38459960617000327, "grad_norm": 0.5722889304161072, "learning_rate": 0.000738485395470955, "loss": 0.8848, "step": 75000 }, { "epoch": 0.38459960617000327, "eval_accuracy": 0.776444038130896, "eval_loss": 0.862265944480896, "eval_runtime": 69.9413, "eval_samples_per_second": 7209.803, "eval_steps_per_second": 28.166, "step": 75000 }, { "epoch": 0.38972760091893666, "grad_norm": 0.6220349669456482, "learning_rate": 0.0007376657367902856, "loss": 0.89, "step": 76000 }, { "epoch": 0.39485559566787004, "grad_norm": 0.5804027318954468, "learning_rate": 0.0007368452576304562, "loss": 0.8918, "step": 77000 }, { "epoch": 0.3999835904168034, "grad_norm": 0.6130577921867371, "learning_rate": 0.0007360255989497868, "loss": 0.8879, "step": 78000 }, { "epoch": 0.4051115851657368, "grad_norm": 0.6952462792396545, "learning_rate": 0.0007352051197899574, "loss": 0.8856, "step": 79000 }, { "epoch": 0.4102395799146702, "grad_norm": 0.5211403965950012, "learning_rate": 0.000734384640630128, "loss": 0.8817, "step": 80000 }, { "epoch": 0.4102395799146702, "eval_accuracy": 0.7765511251073348, "eval_loss": 0.8608506321907043, "eval_runtime": 70.0124, "eval_samples_per_second": 7202.477, "eval_steps_per_second": 28.138, "step": 80000 }, { "epoch": 0.4153675746636035, "grad_norm": 0.6103194952011108, "learning_rate": 0.0007335641614702986, "loss": 0.8809, "step": 81000 }, { "epoch": 0.4204955694125369, "grad_norm": 0.7082314491271973, "learning_rate": 0.0007327445027896292, "loss": 0.8816, "step": 82000 }, { "epoch": 0.4256235641614703, "grad_norm": 0.5109438300132751, "learning_rate": 0.0007319240236297998, "loss": 0.8783, "step": 83000 }, { "epoch": 0.4307515589104037, "grad_norm": 0.5687588453292847, "learning_rate": 0.0007311043649491304, "loss": 0.881, "step": 84000 }, { "epoch": 0.43587955365933706, "grad_norm": 0.5712786912918091, "learning_rate": 0.0007302838857893009, "loss": 0.8765, "step": 85000 }, { "epoch": 0.43587955365933706, "eval_accuracy": 0.7769001493268394, "eval_loss": 0.8598923683166504, "eval_runtime": 70.0898, "eval_samples_per_second": 7194.524, "eval_steps_per_second": 28.107, "step": 85000 }, { "epoch": 0.44100754840827044, "grad_norm": 0.5572083592414856, "learning_rate": 0.0007294642271086315, "loss": 0.8809, "step": 86000 }, { "epoch": 0.44613554315720383, "grad_norm": 0.6107764840126038, "learning_rate": 0.0007286437479488021, "loss": 0.8783, "step": 87000 }, { "epoch": 0.45126353790613716, "grad_norm": 0.5306351184844971, "learning_rate": 0.0007278232687889728, "loss": 0.8735, "step": 88000 }, { "epoch": 0.45639153265507054, "grad_norm": 0.6217735409736633, "learning_rate": 0.0007270036101083033, "loss": 0.8743, "step": 89000 }, { "epoch": 0.4615195274040039, "grad_norm": 0.6092381477355957, "learning_rate": 0.000726183130948474, "loss": 0.8763, "step": 90000 }, { "epoch": 0.4615195274040039, "eval_accuracy": 0.7786928646361125, "eval_loss": 0.8531858921051025, "eval_runtime": 70.2526, "eval_samples_per_second": 7177.859, "eval_steps_per_second": 28.042, "step": 90000 }, { "epoch": 0.4666475221529373, "grad_norm": 0.597750186920166, "learning_rate": 0.0007253626517886446, "loss": 0.876, "step": 91000 }, { "epoch": 0.4717755169018707, "grad_norm": 0.5622979402542114, "learning_rate": 0.0007245421726288153, "loss": 0.8734, "step": 92000 }, { "epoch": 0.4769035116508041, "grad_norm": 0.5731295943260193, "learning_rate": 0.0007237233344273055, "loss": 0.8717, "step": 93000 }, { "epoch": 0.48203150639973746, "grad_norm": 0.5203495025634766, "learning_rate": 0.0007229028552674763, "loss": 0.8742, "step": 94000 }, { "epoch": 0.48715950114867085, "grad_norm": 0.5816043615341187, "learning_rate": 0.000722082376107647, "loss": 0.8714, "step": 95000 }, { "epoch": 0.48715950114867085, "eval_accuracy": 0.7774316180247213, "eval_loss": 0.8571500778198242, "eval_runtime": 71.6304, "eval_samples_per_second": 7039.786, "eval_steps_per_second": 27.502, "step": 95000 }, { "epoch": 0.4922874958976042, "grad_norm": 0.5794270038604736, "learning_rate": 0.0007212618969478175, "loss": 0.8713, "step": 96000 }, { "epoch": 0.49741549064653756, "grad_norm": 0.5427232384681702, "learning_rate": 0.000720442238267148, "loss": 0.8745, "step": 97000 }, { "epoch": 0.502543485395471, "grad_norm": 0.6751123666763306, "learning_rate": 0.0007196217591073187, "loss": 0.8738, "step": 98000 }, { "epoch": 0.5076714801444043, "grad_norm": 0.5248042345046997, "learning_rate": 0.0007188021004266492, "loss": 0.8646, "step": 99000 }, { "epoch": 0.5127994748933377, "grad_norm": 0.5569424629211426, "learning_rate": 0.0007179816212668199, "loss": 0.869, "step": 100000 }, { "epoch": 0.5127994748933377, "eval_accuracy": 0.7795773237378114, "eval_loss": 0.8479480743408203, "eval_runtime": 70.8167, "eval_samples_per_second": 7120.674, "eval_steps_per_second": 27.818, "step": 100000 }, { "epoch": 0.5179274696422711, "grad_norm": 0.5133588910102844, "learning_rate": 0.0007171611421069905, "loss": 0.871, "step": 101000 }, { "epoch": 0.5230554643912044, "grad_norm": 0.5195496082305908, "learning_rate": 0.000716341483426321, "loss": 0.867, "step": 102000 }, { "epoch": 0.5281834591401379, "grad_norm": 0.5803874731063843, "learning_rate": 0.0007155210042664917, "loss": 0.8706, "step": 103000 }, { "epoch": 0.5333114538890712, "grad_norm": 0.6282073855400085, "learning_rate": 0.0007147005251066623, "loss": 0.8685, "step": 104000 }, { "epoch": 0.5384394486380046, "grad_norm": 0.5759472846984863, "learning_rate": 0.0007138816869051526, "loss": 0.8672, "step": 105000 }, { "epoch": 0.5384394486380046, "eval_accuracy": 0.7798450411789086, "eval_loss": 0.8479976654052734, "eval_runtime": 70.1068, "eval_samples_per_second": 7192.783, "eval_steps_per_second": 28.1, "step": 105000 }, { "epoch": 0.543567443386938, "grad_norm": 0.5341648459434509, "learning_rate": 0.0007130612077453233, "loss": 0.871, "step": 106000 }, { "epoch": 0.5486954381358713, "grad_norm": 0.5903869867324829, "learning_rate": 0.000712240728585494, "loss": 0.8673, "step": 107000 }, { "epoch": 0.5538234328848047, "grad_norm": 0.5092418789863586, "learning_rate": 0.0007114202494256646, "loss": 0.8636, "step": 108000 }, { "epoch": 0.5589514276337381, "grad_norm": 0.6551657915115356, "learning_rate": 0.0007105997702658353, "loss": 0.8609, "step": 109000 }, { "epoch": 0.5640794223826715, "grad_norm": 0.5433303713798523, "learning_rate": 0.0007097801115851658, "loss": 0.8632, "step": 110000 }, { "epoch": 0.5640794223826715, "eval_accuracy": 0.779192603859494, "eval_loss": 0.8520081043243408, "eval_runtime": 72.0954, "eval_samples_per_second": 6994.389, "eval_steps_per_second": 27.325, "step": 110000 }, { "epoch": 0.5692074171316048, "grad_norm": 0.5089520215988159, "learning_rate": 0.0007089596324253364, "loss": 0.8656, "step": 111000 }, { "epoch": 0.5743354118805383, "grad_norm": 0.47633206844329834, "learning_rate": 0.000708139153265507, "loss": 0.8602, "step": 112000 }, { "epoch": 0.5794634066294716, "grad_norm": 0.5215737223625183, "learning_rate": 0.0007073186741056778, "loss": 0.8646, "step": 113000 }, { "epoch": 0.584591401378405, "grad_norm": 0.614766538143158, "learning_rate": 0.0007064990154250082, "loss": 0.8637, "step": 114000 }, { "epoch": 0.5897193961273384, "grad_norm": 0.6264929175376892, "learning_rate": 0.0007056785362651788, "loss": 0.8592, "step": 115000 }, { "epoch": 0.5897193961273384, "eval_accuracy": 0.7810983554216748, "eval_loss": 0.8432644009590149, "eval_runtime": 72.0548, "eval_samples_per_second": 6998.331, "eval_steps_per_second": 27.34, "step": 115000 }, { "epoch": 0.5948473908762717, "grad_norm": 0.525632381439209, "learning_rate": 0.0007048580571053496, "loss": 0.8622, "step": 116000 }, { "epoch": 0.5999753856252051, "grad_norm": 0.496176153421402, "learning_rate": 0.00070403839842468, "loss": 0.8576, "step": 117000 }, { "epoch": 0.6051033803741385, "grad_norm": 0.5899348855018616, "learning_rate": 0.0007032179192648508, "loss": 0.8572, "step": 118000 }, { "epoch": 0.6102313751230719, "grad_norm": 0.6466697454452515, "learning_rate": 0.0007023974401050213, "loss": 0.8627, "step": 119000 }, { "epoch": 0.6153593698720052, "grad_norm": 0.5172029137611389, "learning_rate": 0.000701576960945192, "loss": 0.8607, "step": 120000 }, { "epoch": 0.6153593698720052, "eval_accuracy": 0.7810884399608934, "eval_loss": 0.8427762389183044, "eval_runtime": 71.927, "eval_samples_per_second": 7010.756, "eval_steps_per_second": 27.389, "step": 120000 }, { "epoch": 0.6204873646209387, "grad_norm": 0.5664732456207275, "learning_rate": 0.0007007573022645225, "loss": 0.8613, "step": 121000 }, { "epoch": 0.625615359369872, "grad_norm": 0.5575988292694092, "learning_rate": 0.0006999368231046932, "loss": 0.8575, "step": 122000 }, { "epoch": 0.6307433541188053, "grad_norm": 0.5931879281997681, "learning_rate": 0.0006991179849031835, "loss": 0.8503, "step": 123000 }, { "epoch": 0.6358713488677388, "grad_norm": 0.5649187564849854, "learning_rate": 0.0006982975057433542, "loss": 0.8559, "step": 124000 }, { "epoch": 0.6409993436166721, "grad_norm": 0.6426099538803101, "learning_rate": 0.0006974770265835248, "loss": 0.853, "step": 125000 }, { "epoch": 0.6409993436166721, "eval_accuracy": 0.7827423388192273, "eval_loss": 0.837541937828064, "eval_runtime": 72.3751, "eval_samples_per_second": 6967.35, "eval_steps_per_second": 27.219, "step": 125000 }, { "epoch": 0.6461273383656055, "grad_norm": 0.6391366720199585, "learning_rate": 0.0006966573679028554, "loss": 0.8635, "step": 126000 }, { "epoch": 0.6512553331145389, "grad_norm": 0.5831626653671265, "learning_rate": 0.0006958368887430259, "loss": 0.8537, "step": 127000 }, { "epoch": 0.6563833278634723, "grad_norm": 0.49990639090538025, "learning_rate": 0.0006950172300623565, "loss": 0.8565, "step": 128000 }, { "epoch": 0.6615113226124056, "grad_norm": 0.5582433938980103, "learning_rate": 0.0006941967509025271, "loss": 0.8497, "step": 129000 }, { "epoch": 0.666639317361339, "grad_norm": 0.49399054050445557, "learning_rate": 0.0006933762717426978, "loss": 0.8541, "step": 130000 }, { "epoch": 0.666639317361339, "eval_accuracy": 0.7805252417885111, "eval_loss": 0.8454738855361938, "eval_runtime": 70.1262, "eval_samples_per_second": 7190.793, "eval_steps_per_second": 28.092, "step": 130000 }, { "epoch": 0.6717673121102724, "grad_norm": 0.6758140325546265, "learning_rate": 0.0006925557925828684, "loss": 0.8531, "step": 131000 }, { "epoch": 0.6768953068592057, "grad_norm": 0.5472828149795532, "learning_rate": 0.0006917361339021989, "loss": 0.8531, "step": 132000 }, { "epoch": 0.6820233016081392, "grad_norm": 0.5796289443969727, "learning_rate": 0.0006909156547423695, "loss": 0.8509, "step": 133000 }, { "epoch": 0.6871512963570725, "grad_norm": 0.5476044416427612, "learning_rate": 0.0006900959960617001, "loss": 0.8464, "step": 134000 }, { "epoch": 0.692279291106006, "grad_norm": 0.5579943060874939, "learning_rate": 0.0006892755169018707, "loss": 0.8473, "step": 135000 }, { "epoch": 0.692279291106006, "eval_accuracy": 0.7838251071365537, "eval_loss": 0.8329602479934692, "eval_runtime": 70.2662, "eval_samples_per_second": 7176.466, "eval_steps_per_second": 28.036, "step": 135000 }, { "epoch": 0.6974072858549393, "grad_norm": 0.4959736168384552, "learning_rate": 0.0006884558582212013, "loss": 0.8483, "step": 136000 }, { "epoch": 0.7025352806038727, "grad_norm": 0.4995713531970978, "learning_rate": 0.0006876353790613718, "loss": 0.8518, "step": 137000 }, { "epoch": 0.707663275352806, "grad_norm": 0.5315651297569275, "learning_rate": 0.0006868148999015425, "loss": 0.8478, "step": 138000 }, { "epoch": 0.7127912701017394, "grad_norm": 0.556061327457428, "learning_rate": 0.000685995241220873, "loss": 0.8479, "step": 139000 }, { "epoch": 0.7179192648506728, "grad_norm": 0.5722927451133728, "learning_rate": 0.0006851747620610437, "loss": 0.8449, "step": 140000 }, { "epoch": 0.7179192648506728, "eval_accuracy": 0.7838469211502728, "eval_loss": 0.8305063247680664, "eval_runtime": 70.4959, "eval_samples_per_second": 7153.084, "eval_steps_per_second": 27.945, "step": 140000 }, { "epoch": 0.7230472595996061, "grad_norm": 0.5656773447990417, "learning_rate": 0.0006843542829012143, "loss": 0.8534, "step": 141000 }, { "epoch": 0.7281752543485396, "grad_norm": 0.7694585919380188, "learning_rate": 0.0006835346242205448, "loss": 0.8448, "step": 142000 }, { "epoch": 0.7333032490974729, "grad_norm": 0.5496963858604431, "learning_rate": 0.0006827141450607155, "loss": 0.8458, "step": 143000 }, { "epoch": 0.7384312438464063, "grad_norm": 0.519934892654419, "learning_rate": 0.000681894486380046, "loss": 0.8429, "step": 144000 }, { "epoch": 0.7435592385953397, "grad_norm": 0.6174483895301819, "learning_rate": 0.0006810740072202167, "loss": 0.8465, "step": 145000 }, { "epoch": 0.7435592385953397, "eval_accuracy": 0.7849554696656308, "eval_loss": 0.8273547887802124, "eval_runtime": 70.1807, "eval_samples_per_second": 7185.206, "eval_steps_per_second": 28.07, "step": 145000 }, { "epoch": 0.748687233344273, "grad_norm": 0.5116410851478577, "learning_rate": 0.0006802535280603873, "loss": 0.8458, "step": 146000 }, { "epoch": 0.7538152280932064, "grad_norm": 0.5362213850021362, "learning_rate": 0.0006794338693797178, "loss": 0.8455, "step": 147000 }, { "epoch": 0.7589432228421398, "grad_norm": 0.5534299612045288, "learning_rate": 0.0006786133902198884, "loss": 0.8468, "step": 148000 }, { "epoch": 0.7640712175910732, "grad_norm": 0.5589115619659424, "learning_rate": 0.0006777929110600591, "loss": 0.8419, "step": 149000 }, { "epoch": 0.7691992123400065, "grad_norm": 0.5336338877677917, "learning_rate": 0.0006769724319002297, "loss": 0.8423, "step": 150000 }, { "epoch": 0.7691992123400065, "eval_accuracy": 0.7836287810130824, "eval_loss": 0.8325037956237793, "eval_runtime": 70.4264, "eval_samples_per_second": 7160.14, "eval_steps_per_second": 27.972, "step": 150000 }, { "epoch": 0.77432720708894, "grad_norm": 0.5388402342796326, "learning_rate": 0.0006761527732195603, "loss": 0.8454, "step": 151000 }, { "epoch": 0.7794552018378733, "grad_norm": 0.5751635432243347, "learning_rate": 0.0006753322940597309, "loss": 0.8433, "step": 152000 }, { "epoch": 0.7845831965868066, "grad_norm": 0.6209118366241455, "learning_rate": 0.0006745118148999016, "loss": 0.8404, "step": 153000 }, { "epoch": 0.7897111913357401, "grad_norm": 0.5103615522384644, "learning_rate": 0.0006736913357400722, "loss": 0.8417, "step": 154000 }, { "epoch": 0.7948391860846734, "grad_norm": 0.5667718648910522, "learning_rate": 0.0006728724975385626, "loss": 0.8454, "step": 155000 }, { "epoch": 0.7948391860846734, "eval_accuracy": 0.7848602812421296, "eval_loss": 0.8270025253295898, "eval_runtime": 70.3951, "eval_samples_per_second": 7163.327, "eval_steps_per_second": 27.985, "step": 155000 }, { "epoch": 0.7999671808336069, "grad_norm": 0.5443304181098938, "learning_rate": 0.0006720520183787332, "loss": 0.8404, "step": 156000 }, { "epoch": 0.8050951755825402, "grad_norm": 0.585914134979248, "learning_rate": 0.0006712323596980638, "loss": 0.8435, "step": 157000 }, { "epoch": 0.8102231703314736, "grad_norm": 0.6673417687416077, "learning_rate": 0.0006704118805382343, "loss": 0.8345, "step": 158000 }, { "epoch": 0.815351165080407, "grad_norm": 0.49369016289711, "learning_rate": 0.000669591401378405, "loss": 0.8371, "step": 159000 }, { "epoch": 0.8204791598293404, "grad_norm": 0.5406259894371033, "learning_rate": 0.0006687709222185758, "loss": 0.8358, "step": 160000 }, { "epoch": 0.8204791598293404, "eval_accuracy": 0.7837596650953966, "eval_loss": 0.8327611684799194, "eval_runtime": 70.7171, "eval_samples_per_second": 7130.706, "eval_steps_per_second": 27.857, "step": 160000 }, { "epoch": 0.8256071545782737, "grad_norm": 0.5569799542427063, "learning_rate": 0.0006679512635379062, "loss": 0.8445, "step": 161000 }, { "epoch": 0.830735149327207, "grad_norm": 0.5280943512916565, "learning_rate": 0.0006671316048572367, "loss": 0.8393, "step": 162000 }, { "epoch": 0.8358631440761405, "grad_norm": 0.5504807829856873, "learning_rate": 0.0006663111256974073, "loss": 0.839, "step": 163000 }, { "epoch": 0.8409911388250738, "grad_norm": 0.5247480869293213, "learning_rate": 0.000665490646537578, "loss": 0.843, "step": 164000 }, { "epoch": 0.8461191335740073, "grad_norm": 0.5142805576324463, "learning_rate": 0.0006646709878569085, "loss": 0.8389, "step": 165000 }, { "epoch": 0.8461191335740073, "eval_accuracy": 0.7868037115552797, "eval_loss": 0.8208885788917542, "eval_runtime": 70.3239, "eval_samples_per_second": 7170.578, "eval_steps_per_second": 28.013, "step": 165000 }, { "epoch": 0.8512471283229406, "grad_norm": 0.6155256032943726, "learning_rate": 0.0006638505086970792, "loss": 0.8416, "step": 166000 }, { "epoch": 0.856375123071874, "grad_norm": 0.502625048160553, "learning_rate": 0.0006630300295372497, "loss": 0.831, "step": 167000 }, { "epoch": 0.8615031178208074, "grad_norm": 0.5640740990638733, "learning_rate": 0.0006622103708565802, "loss": 0.8362, "step": 168000 }, { "epoch": 0.8666311125697407, "grad_norm": 0.5358522534370422, "learning_rate": 0.0006613898916967509, "loss": 0.8343, "step": 169000 }, { "epoch": 0.8717591073186741, "grad_norm": 0.4705986976623535, "learning_rate": 0.0006605694125369216, "loss": 0.8332, "step": 170000 }, { "epoch": 0.8717591073186741, "eval_accuracy": 0.7833689959406104, "eval_loss": 0.8340306282043457, "eval_runtime": 70.0607, "eval_samples_per_second": 7197.521, "eval_steps_per_second": 28.118, "step": 170000 }, { "epoch": 0.8768871020676074, "grad_norm": 0.5243467092514038, "learning_rate": 0.0006597489333770922, "loss": 0.8372, "step": 171000 }, { "epoch": 0.8820150968165409, "grad_norm": 0.495029091835022, "learning_rate": 0.0006589300951755825, "loss": 0.8378, "step": 172000 }, { "epoch": 0.8871430915654742, "grad_norm": 0.5230892896652222, "learning_rate": 0.0006581096160157532, "loss": 0.8379, "step": 173000 }, { "epoch": 0.8922710863144077, "grad_norm": 0.5050895810127258, "learning_rate": 0.0006572899573350837, "loss": 0.837, "step": 174000 }, { "epoch": 0.897399081063341, "grad_norm": 0.539610743522644, "learning_rate": 0.0006564694781752544, "loss": 0.8357, "step": 175000 }, { "epoch": 0.897399081063341, "eval_accuracy": 0.7864408056906812, "eval_loss": 0.81998211145401, "eval_runtime": 70.0652, "eval_samples_per_second": 7197.051, "eval_steps_per_second": 28.117, "step": 175000 }, { "epoch": 0.9025270758122743, "grad_norm": 0.6129307746887207, "learning_rate": 0.0006556489990154251, "loss": 0.8317, "step": 176000 }, { "epoch": 0.9076550705612078, "grad_norm": 0.5198879241943359, "learning_rate": 0.0006548293403347555, "loss": 0.8305, "step": 177000 }, { "epoch": 0.9127830653101411, "grad_norm": 0.4718916118144989, "learning_rate": 0.0006540088611749262, "loss": 0.8351, "step": 178000 }, { "epoch": 0.9179110600590745, "grad_norm": 0.5416029095649719, "learning_rate": 0.0006531883820150968, "loss": 0.8308, "step": 179000 }, { "epoch": 0.9230390548080079, "grad_norm": 0.515774130821228, "learning_rate": 0.0006523687233344274, "loss": 0.8356, "step": 180000 }, { "epoch": 0.9230390548080079, "eval_accuracy": 0.7877159339471664, "eval_loss": 0.8161681890487671, "eval_runtime": 71.9316, "eval_samples_per_second": 7010.315, "eval_steps_per_second": 27.387, "step": 180000 }, { "epoch": 0.9281670495569413, "grad_norm": 0.494751900434494, "learning_rate": 0.000651548244174598, "loss": 0.8299, "step": 181000 }, { "epoch": 0.9332950443058746, "grad_norm": 0.5368635654449463, "learning_rate": 0.0006507285854939285, "loss": 0.8359, "step": 182000 }, { "epoch": 0.9384230390548081, "grad_norm": 0.6150837540626526, "learning_rate": 0.0006499081063340992, "loss": 0.8269, "step": 183000 }, { "epoch": 0.9435510338037414, "grad_norm": 0.5255937576293945, "learning_rate": 0.0006490884476534297, "loss": 0.8321, "step": 184000 }, { "epoch": 0.9486790285526747, "grad_norm": 0.5314843654632568, "learning_rate": 0.0006482679684936002, "loss": 0.835, "step": 185000 }, { "epoch": 0.9486790285526747, "eval_accuracy": 0.7874323517688191, "eval_loss": 0.8180950284004211, "eval_runtime": 70.7247, "eval_samples_per_second": 7129.945, "eval_steps_per_second": 27.854, "step": 185000 }, { "epoch": 0.9538070233016082, "grad_norm": 0.4751564860343933, "learning_rate": 0.000647447489333771, "loss": 0.8287, "step": 186000 }, { "epoch": 0.9589350180505415, "grad_norm": 0.5860036611557007, "learning_rate": 0.0006466270101739416, "loss": 0.8311, "step": 187000 }, { "epoch": 0.9640630127994749, "grad_norm": 0.5739655494689941, "learning_rate": 0.0006458065310141122, "loss": 0.8313, "step": 188000 }, { "epoch": 0.9691910075484083, "grad_norm": 0.5147101283073425, "learning_rate": 0.0006449876928126026, "loss": 0.829, "step": 189000 }, { "epoch": 0.9743190022973417, "grad_norm": 0.5122900605201721, "learning_rate": 0.0006441672136527732, "loss": 0.8298, "step": 190000 }, { "epoch": 0.9743190022973417, "eval_accuracy": 0.7873550111747243, "eval_loss": 0.8179843425750732, "eval_runtime": 70.5166, "eval_samples_per_second": 7150.985, "eval_steps_per_second": 27.937, "step": 190000 }, { "epoch": 0.979446997046275, "grad_norm": 0.6026971936225891, "learning_rate": 0.0006433467344929439, "loss": 0.8283, "step": 191000 }, { "epoch": 0.9845749917952084, "grad_norm": 0.5144930481910706, "learning_rate": 0.0006425262553331146, "loss": 0.8323, "step": 192000 }, { "epoch": 0.9897029865441418, "grad_norm": 0.5514624714851379, "learning_rate": 0.0006417065966524451, "loss": 0.8301, "step": 193000 }, { "epoch": 0.9948309812930751, "grad_norm": 0.4893347918987274, "learning_rate": 0.0006408861174926157, "loss": 0.8317, "step": 194000 }, { "epoch": 0.9999589760420086, "grad_norm": 0.6260170936584473, "learning_rate": 0.0006400656383327864, "loss": 0.8285, "step": 195000 }, { "epoch": 0.9999589760420086, "eval_accuracy": 0.7877952576334175, "eval_loss": 0.8153980374336243, "eval_runtime": 69.9977, "eval_samples_per_second": 7203.999, "eval_steps_per_second": 28.144, "step": 195000 }, { "epoch": 1.005086970790942, "grad_norm": 0.5064743757247925, "learning_rate": 0.0006392459796521169, "loss": 0.8134, "step": 196000 }, { "epoch": 1.0102149655398753, "grad_norm": 0.5456205010414124, "learning_rate": 0.0006384255004922876, "loss": 0.8135, "step": 197000 }, { "epoch": 1.0153429602888087, "grad_norm": 0.5631531476974487, "learning_rate": 0.0006376050213324581, "loss": 0.8119, "step": 198000 }, { "epoch": 1.020470955037742, "grad_norm": 0.4758036136627197, "learning_rate": 0.0006367845421726289, "loss": 0.812, "step": 199000 }, { "epoch": 1.0255989497866753, "grad_norm": 0.4660184979438782, "learning_rate": 0.0006359657039711192, "loss": 0.8138, "step": 200000 }, { "epoch": 1.0255989497866753, "eval_accuracy": 0.7889196708860258, "eval_loss": 0.8118866086006165, "eval_runtime": 70.6329, "eval_samples_per_second": 7139.205, "eval_steps_per_second": 27.891, "step": 200000 }, { "epoch": 1.0307269445356089, "grad_norm": 0.5638351440429688, "learning_rate": 0.0006351452248112899, "loss": 0.8188, "step": 201000 }, { "epoch": 1.0358549392845422, "grad_norm": 0.523722767829895, "learning_rate": 0.0006343247456514605, "loss": 0.8136, "step": 202000 }, { "epoch": 1.0409829340334755, "grad_norm": 0.5290976166725159, "learning_rate": 0.0006335042664916311, "loss": 0.8133, "step": 203000 }, { "epoch": 1.0461109287824089, "grad_norm": 0.5694358348846436, "learning_rate": 0.0006326846078109616, "loss": 0.814, "step": 204000 }, { "epoch": 1.0512389235313424, "grad_norm": 0.4659350514411926, "learning_rate": 0.0006318641286511323, "loss": 0.8104, "step": 205000 }, { "epoch": 1.0512389235313424, "eval_accuracy": 0.7886797167351164, "eval_loss": 0.8087360262870789, "eval_runtime": 70.4787, "eval_samples_per_second": 7154.827, "eval_steps_per_second": 27.952, "step": 205000 }, { "epoch": 1.0563669182802757, "grad_norm": 0.5546853542327881, "learning_rate": 0.0006310436494913029, "loss": 0.81, "step": 206000 }, { "epoch": 1.061494913029209, "grad_norm": 0.5060790777206421, "learning_rate": 0.0006302231703314736, "loss": 0.8108, "step": 207000 }, { "epoch": 1.0666229077781424, "grad_norm": 0.4691222012042999, "learning_rate": 0.0006294035116508041, "loss": 0.8136, "step": 208000 }, { "epoch": 1.0717509025270757, "grad_norm": 0.6137627959251404, "learning_rate": 0.0006285838529701347, "loss": 0.8088, "step": 209000 }, { "epoch": 1.0768788972760093, "grad_norm": 0.49290600419044495, "learning_rate": 0.0006277633738103052, "loss": 0.8162, "step": 210000 }, { "epoch": 1.0768788972760093, "eval_accuracy": 0.7894769197819392, "eval_loss": 0.8073009252548218, "eval_runtime": 70.1216, "eval_samples_per_second": 7191.261, "eval_steps_per_second": 28.094, "step": 210000 }, { "epoch": 1.0820068920249426, "grad_norm": 0.5331597924232483, "learning_rate": 0.0006269428946504759, "loss": 0.8161, "step": 211000 }, { "epoch": 1.087134886773876, "grad_norm": 0.4749445915222168, "learning_rate": 0.0006261232359698064, "loss": 0.8104, "step": 212000 }, { "epoch": 1.0922628815228093, "grad_norm": 0.5476503372192383, "learning_rate": 0.0006253027568099771, "loss": 0.8174, "step": 213000 }, { "epoch": 1.0973908762717426, "grad_norm": 0.5627039074897766, "learning_rate": 0.0006244830981293076, "loss": 0.8079, "step": 214000 }, { "epoch": 1.1025188710206761, "grad_norm": 0.5722908973693848, "learning_rate": 0.0006236626189694782, "loss": 0.8122, "step": 215000 }, { "epoch": 1.1025188710206761, "eval_accuracy": 0.7901551372993855, "eval_loss": 0.8053246140480042, "eval_runtime": 70.4003, "eval_samples_per_second": 7162.792, "eval_steps_per_second": 27.983, "step": 215000 }, { "epoch": 1.1076468657696095, "grad_norm": 0.523002028465271, "learning_rate": 0.0006228429602888086, "loss": 0.8151, "step": 216000 }, { "epoch": 1.1127748605185428, "grad_norm": 0.5497482419013977, "learning_rate": 0.0006220224811289794, "loss": 0.8087, "step": 217000 }, { "epoch": 1.1179028552674761, "grad_norm": 0.5973089337348938, "learning_rate": 0.0006212020019691501, "loss": 0.8182, "step": 218000 }, { "epoch": 1.1230308500164097, "grad_norm": 0.5367917418479919, "learning_rate": 0.0006203815228093206, "loss": 0.8055, "step": 219000 }, { "epoch": 1.128158844765343, "grad_norm": 0.5204382538795471, "learning_rate": 0.0006195610436494914, "loss": 0.807, "step": 220000 }, { "epoch": 1.128158844765343, "eval_accuracy": 0.7899627773602267, "eval_loss": 0.8064243793487549, "eval_runtime": 69.8978, "eval_samples_per_second": 7214.291, "eval_steps_per_second": 28.184, "step": 220000 }, { "epoch": 1.1332868395142763, "grad_norm": 0.5525519847869873, "learning_rate": 0.0006187413849688218, "loss": 0.8159, "step": 221000 }, { "epoch": 1.1384148342632097, "grad_norm": 0.6252419948577881, "learning_rate": 0.0006179217262881523, "loss": 0.8111, "step": 222000 }, { "epoch": 1.143542829012143, "grad_norm": 0.5321826934814453, "learning_rate": 0.000617101247128323, "loss": 0.8112, "step": 223000 }, { "epoch": 1.1486708237610765, "grad_norm": 0.5125843286514282, "learning_rate": 0.0006162815884476534, "loss": 0.8112, "step": 224000 }, { "epoch": 1.1537988185100099, "grad_norm": 0.660315752029419, "learning_rate": 0.0006154611092878241, "loss": 0.8114, "step": 225000 }, { "epoch": 1.1537988185100099, "eval_accuracy": 0.7906588427070794, "eval_loss": 0.8042543530464172, "eval_runtime": 70.3111, "eval_samples_per_second": 7171.883, "eval_steps_per_second": 28.018, "step": 225000 }, { "epoch": 1.1589268132589432, "grad_norm": 0.5146278142929077, "learning_rate": 0.0006146414506071546, "loss": 0.8126, "step": 226000 }, { "epoch": 1.1640548080078765, "grad_norm": 0.4868735671043396, "learning_rate": 0.0006138209714473253, "loss": 0.8091, "step": 227000 }, { "epoch": 1.16918280275681, "grad_norm": 0.5329962968826294, "learning_rate": 0.000613000492287496, "loss": 0.8126, "step": 228000 }, { "epoch": 1.1743107975057434, "grad_norm": 0.5590263605117798, "learning_rate": 0.0006121808336068264, "loss": 0.811, "step": 229000 }, { "epoch": 1.1794387922546767, "grad_norm": 0.508899986743927, "learning_rate": 0.0006113603544469971, "loss": 0.8165, "step": 230000 }, { "epoch": 1.1794387922546767, "eval_accuracy": 0.791109004626554, "eval_loss": 0.8041806817054749, "eval_runtime": 70.7247, "eval_samples_per_second": 7129.944, "eval_steps_per_second": 27.854, "step": 230000 }, { "epoch": 1.18456678700361, "grad_norm": 0.6163780093193054, "learning_rate": 0.0006105398752871677, "loss": 0.8093, "step": 231000 }, { "epoch": 1.1896947817525434, "grad_norm": 0.5574708580970764, "learning_rate": 0.0006097193961273384, "loss": 0.8119, "step": 232000 }, { "epoch": 1.194822776501477, "grad_norm": 0.6107341647148132, "learning_rate": 0.0006088997374466689, "loss": 0.8098, "step": 233000 }, { "epoch": 1.1999507712504103, "grad_norm": 0.47551023960113525, "learning_rate": 0.0006080792582868395, "loss": 0.8105, "step": 234000 }, { "epoch": 1.2050787659993436, "grad_norm": 0.5238052606582642, "learning_rate": 0.00060725959960617, "loss": 0.8124, "step": 235000 }, { "epoch": 1.2050787659993436, "eval_accuracy": 0.7910217485716778, "eval_loss": 0.8009098768234253, "eval_runtime": 72.1696, "eval_samples_per_second": 6987.192, "eval_steps_per_second": 27.297, "step": 235000 }, { "epoch": 1.210206760748277, "grad_norm": 0.5526208877563477, "learning_rate": 0.0006064391204463407, "loss": 0.8067, "step": 236000 }, { "epoch": 1.2153347554972105, "grad_norm": 0.6230306029319763, "learning_rate": 0.0006056194617656711, "loss": 0.8043, "step": 237000 }, { "epoch": 1.2204627502461438, "grad_norm": 0.5491142272949219, "learning_rate": 0.0006047989826058419, "loss": 0.8107, "step": 238000 }, { "epoch": 1.2255907449950771, "grad_norm": 0.48065313696861267, "learning_rate": 0.0006039785034460125, "loss": 0.8097, "step": 239000 }, { "epoch": 1.2307187397440105, "grad_norm": 0.6628726124763489, "learning_rate": 0.000603158844765343, "loss": 0.8092, "step": 240000 }, { "epoch": 1.2307187397440105, "eval_accuracy": 0.7914025022656828, "eval_loss": 0.8019245266914368, "eval_runtime": 70.3103, "eval_samples_per_second": 7171.963, "eval_steps_per_second": 28.019, "step": 240000 }, { "epoch": 1.2358467344929438, "grad_norm": 0.5481253862380981, "learning_rate": 0.0006023383656055136, "loss": 0.8065, "step": 241000 }, { "epoch": 1.2409747292418774, "grad_norm": 0.5324077606201172, "learning_rate": 0.0006015178864456843, "loss": 0.8092, "step": 242000 }, { "epoch": 1.2461027239908107, "grad_norm": 0.5364370346069336, "learning_rate": 0.0006006982277650148, "loss": 0.8072, "step": 243000 }, { "epoch": 1.251230718739744, "grad_norm": 0.5714884400367737, "learning_rate": 0.0005998777486051855, "loss": 0.8135, "step": 244000 }, { "epoch": 1.2563587134886773, "grad_norm": 0.4822482168674469, "learning_rate": 0.0005990572694453561, "loss": 0.8023, "step": 245000 }, { "epoch": 1.2563587134886773, "eval_accuracy": 0.7920886521517542, "eval_loss": 0.7978983521461487, "eval_runtime": 72.2855, "eval_samples_per_second": 6975.993, "eval_steps_per_second": 27.253, "step": 245000 }, { "epoch": 1.2614867082376109, "grad_norm": 0.49178367853164673, "learning_rate": 0.0005982376107646866, "loss": 0.8052, "step": 246000 }, { "epoch": 1.2666147029865442, "grad_norm": 0.6012019515037537, "learning_rate": 0.0005974171316048573, "loss": 0.8115, "step": 247000 }, { "epoch": 1.2717426977354775, "grad_norm": 0.42487451434135437, "learning_rate": 0.0005965966524450279, "loss": 0.8086, "step": 248000 }, { "epoch": 1.2768706924844109, "grad_norm": 0.5072160959243774, "learning_rate": 0.0005957761732851986, "loss": 0.803, "step": 249000 }, { "epoch": 1.2819986872333442, "grad_norm": 0.5228409171104431, "learning_rate": 0.000594956514604529, "loss": 0.8058, "step": 250000 }, { "epoch": 1.2819986872333442, "eval_accuracy": 0.7922056545889744, "eval_loss": 0.798753559589386, "eval_runtime": 72.5622, "eval_samples_per_second": 6949.391, "eval_steps_per_second": 27.149, "step": 250000 }, { "epoch": 1.2871266819822775, "grad_norm": 0.4831065237522125, "learning_rate": 0.0005941368559238596, "loss": 0.8047, "step": 251000 }, { "epoch": 1.292254676731211, "grad_norm": 0.5009732246398926, "learning_rate": 0.0005933163767640302, "loss": 0.8017, "step": 252000 }, { "epoch": 1.2973826714801444, "grad_norm": 0.5227653384208679, "learning_rate": 0.0005924967180833607, "loss": 0.8053, "step": 253000 }, { "epoch": 1.3025106662290777, "grad_norm": 0.586212158203125, "learning_rate": 0.0005916762389235314, "loss": 0.8045, "step": 254000 }, { "epoch": 1.3076386609780113, "grad_norm": 0.5137971043586731, "learning_rate": 0.000590855759763702, "loss": 0.8057, "step": 255000 }, { "epoch": 1.3076386609780113, "eval_accuracy": 0.7923087753811008, "eval_loss": 0.7976465225219727, "eval_runtime": 72.7749, "eval_samples_per_second": 6929.082, "eval_steps_per_second": 27.07, "step": 255000 }, { "epoch": 1.3127666557269446, "grad_norm": 0.5631904602050781, "learning_rate": 0.0005900361010830325, "loss": 0.8075, "step": 256000 }, { "epoch": 1.317894650475878, "grad_norm": 0.4949762523174286, "learning_rate": 0.0005892156219232032, "loss": 0.8033, "step": 257000 }, { "epoch": 1.3230226452248113, "grad_norm": 0.5236957669258118, "learning_rate": 0.0005883951427633738, "loss": 0.8045, "step": 258000 }, { "epoch": 1.3281506399737446, "grad_norm": 0.6979305744171143, "learning_rate": 0.0005875746636035445, "loss": 0.8053, "step": 259000 }, { "epoch": 1.333278634722678, "grad_norm": 0.5310340523719788, "learning_rate": 0.0005867558254020348, "loss": 0.8076, "step": 260000 }, { "epoch": 1.333278634722678, "eval_accuracy": 0.7921203816262545, "eval_loss": 0.7976046204566956, "eval_runtime": 72.1958, "eval_samples_per_second": 6984.659, "eval_steps_per_second": 27.287, "step": 260000 }, { "epoch": 1.3384066294716115, "grad_norm": 0.5367364287376404, "learning_rate": 0.0005859353462422055, "loss": 0.8025, "step": 261000 }, { "epoch": 1.3435346242205448, "grad_norm": 0.4976128935813904, "learning_rate": 0.0005851148670823761, "loss": 0.8074, "step": 262000 }, { "epoch": 1.3486626189694781, "grad_norm": 0.47957494854927063, "learning_rate": 0.0005842943879225468, "loss": 0.7947, "step": 263000 }, { "epoch": 1.3537906137184115, "grad_norm": 0.5330920219421387, "learning_rate": 0.0005834747292418773, "loss": 0.8085, "step": 264000 }, { "epoch": 1.358918608467345, "grad_norm": 0.4707110524177551, "learning_rate": 0.0005826550705612078, "loss": 0.805, "step": 265000 }, { "epoch": 1.358918608467345, "eval_accuracy": 0.7930286378338288, "eval_loss": 0.7952682375907898, "eval_runtime": 70.3067, "eval_samples_per_second": 7172.327, "eval_steps_per_second": 28.02, "step": 265000 }, { "epoch": 1.3640466032162784, "grad_norm": 0.5745394229888916, "learning_rate": 0.0005818345914013784, "loss": 0.7977, "step": 266000 }, { "epoch": 1.3691745979652117, "grad_norm": 0.5694977045059204, "learning_rate": 0.0005810141122415491, "loss": 0.8068, "step": 267000 }, { "epoch": 1.374302592714145, "grad_norm": 0.49434027075767517, "learning_rate": 0.0005801936330817198, "loss": 0.7991, "step": 268000 }, { "epoch": 1.3794305874630783, "grad_norm": 0.5822916030883789, "learning_rate": 0.0005793739744010503, "loss": 0.7992, "step": 269000 }, { "epoch": 1.384558582212012, "grad_norm": 0.5346013307571411, "learning_rate": 0.000578553495241221, "loss": 0.797, "step": 270000 }, { "epoch": 1.384558582212012, "eval_accuracy": 0.7926161546653234, "eval_loss": 0.7990232110023499, "eval_runtime": 70.9801, "eval_samples_per_second": 7104.283, "eval_steps_per_second": 27.754, "step": 270000 }, { "epoch": 1.3896865769609452, "grad_norm": 0.49978184700012207, "learning_rate": 0.0005777330160813915, "loss": 0.7982, "step": 271000 }, { "epoch": 1.3948145717098785, "grad_norm": 0.5729996562004089, "learning_rate": 0.000576913357400722, "loss": 0.7999, "step": 272000 }, { "epoch": 1.3999425664588119, "grad_norm": 0.5556117296218872, "learning_rate": 0.0005760928782408927, "loss": 0.7995, "step": 273000 }, { "epoch": 1.4050705612077454, "grad_norm": 0.5568280816078186, "learning_rate": 0.0005752732195602233, "loss": 0.7985, "step": 274000 }, { "epoch": 1.4101985559566788, "grad_norm": 0.4251072406768799, "learning_rate": 0.0005744527404003939, "loss": 0.7997, "step": 275000 }, { "epoch": 1.4101985559566788, "eval_accuracy": 0.7935204446885851, "eval_loss": 0.7928967475891113, "eval_runtime": 70.2273, "eval_samples_per_second": 7180.446, "eval_steps_per_second": 28.052, "step": 275000 }, { "epoch": 1.415326550705612, "grad_norm": 0.5283703804016113, "learning_rate": 0.0005736322612405645, "loss": 0.8001, "step": 276000 }, { "epoch": 1.4204545454545454, "grad_norm": 0.5075294971466064, "learning_rate": 0.000572812602559895, "loss": 0.8032, "step": 277000 }, { "epoch": 1.4255825402034787, "grad_norm": 0.5486775636672974, "learning_rate": 0.0005719921234000657, "loss": 0.796, "step": 278000 }, { "epoch": 1.430710534952412, "grad_norm": 0.5214180946350098, "learning_rate": 0.0005711724647193961, "loss": 0.7958, "step": 279000 }, { "epoch": 1.4358385297013456, "grad_norm": 0.5862768292427063, "learning_rate": 0.0005703519855595669, "loss": 0.8028, "step": 280000 }, { "epoch": 1.4358385297013456, "eval_accuracy": 0.7933221354729576, "eval_loss": 0.7933070063591003, "eval_runtime": 70.7406, "eval_samples_per_second": 7128.341, "eval_steps_per_second": 27.848, "step": 280000 }, { "epoch": 1.440966524450279, "grad_norm": 0.49619877338409424, "learning_rate": 0.0005695315063997374, "loss": 0.8002, "step": 281000 }, { "epoch": 1.4460945191992123, "grad_norm": 0.45815509557724, "learning_rate": 0.0005687110272399081, "loss": 0.7973, "step": 282000 }, { "epoch": 1.4512225139481458, "grad_norm": 0.5492725372314453, "learning_rate": 0.0005678913685592386, "loss": 0.8013, "step": 283000 }, { "epoch": 1.4563505086970792, "grad_norm": 0.5624783635139465, "learning_rate": 0.0005670717098785691, "loss": 0.7985, "step": 284000 }, { "epoch": 1.4614785034460125, "grad_norm": 0.625063419342041, "learning_rate": 0.0005662512307187398, "loss": 0.7981, "step": 285000 }, { "epoch": 1.4614785034460125, "eval_accuracy": 0.7934252562650839, "eval_loss": 0.7905462980270386, "eval_runtime": 70.8389, "eval_samples_per_second": 7118.449, "eval_steps_per_second": 27.81, "step": 285000 }, { "epoch": 1.4666064981949458, "grad_norm": 0.5240657329559326, "learning_rate": 0.00056543239251723, "loss": 0.802, "step": 286000 }, { "epoch": 1.4717344929438791, "grad_norm": 0.5398324131965637, "learning_rate": 0.0005646119133574008, "loss": 0.7964, "step": 287000 }, { "epoch": 1.4768624876928125, "grad_norm": 0.5304765701293945, "learning_rate": 0.0005637914341975715, "loss": 0.797, "step": 288000 }, { "epoch": 1.481990482441746, "grad_norm": 0.5485924482345581, "learning_rate": 0.000562970955037742, "loss": 0.7938, "step": 289000 }, { "epoch": 1.4871184771906794, "grad_norm": 0.5855067372322083, "learning_rate": 0.0005621504758779128, "loss": 0.8002, "step": 290000 }, { "epoch": 1.4871184771906794, "eval_accuracy": 0.7923662850536327, "eval_loss": 0.7965236306190491, "eval_runtime": 70.6279, "eval_samples_per_second": 7139.718, "eval_steps_per_second": 27.893, "step": 290000 }, { "epoch": 1.4922464719396127, "grad_norm": 0.5860921740531921, "learning_rate": 0.0005613308171972432, "loss": 0.7904, "step": 291000 }, { "epoch": 1.4973744666885462, "grad_norm": 0.49449965357780457, "learning_rate": 0.0005605103380374139, "loss": 0.7991, "step": 292000 }, { "epoch": 1.5025024614374796, "grad_norm": 0.5922234654426575, "learning_rate": 0.0005596898588775845, "loss": 0.7964, "step": 293000 }, { "epoch": 1.507630456186413, "grad_norm": 0.546913206577301, "learning_rate": 0.0005588693797177552, "loss": 0.7978, "step": 294000 }, { "epoch": 1.5127584509353462, "grad_norm": 0.5648984909057617, "learning_rate": 0.0005580497210370857, "loss": 0.7984, "step": 295000 }, { "epoch": 1.5127584509353462, "eval_accuracy": 0.7932785074455195, "eval_loss": 0.7914912104606628, "eval_runtime": 71.0254, "eval_samples_per_second": 7099.76, "eval_steps_per_second": 27.737, "step": 295000 }, { "epoch": 1.5178864456842796, "grad_norm": 0.6177080273628235, "learning_rate": 0.0005572292418772564, "loss": 0.7941, "step": 296000 }, { "epoch": 1.5230144404332129, "grad_norm": 0.528565526008606, "learning_rate": 0.0005564095831965868, "loss": 0.795, "step": 297000 }, { "epoch": 1.5281424351821462, "grad_norm": 0.4736846089363098, "learning_rate": 0.0005555891040367575, "loss": 0.794, "step": 298000 }, { "epoch": 1.5332704299310798, "grad_norm": 0.5611686706542969, "learning_rate": 0.0005547694453560879, "loss": 0.7892, "step": 299000 }, { "epoch": 1.538398424680013, "grad_norm": 0.6292598843574524, "learning_rate": 0.0005539489661962586, "loss": 0.7973, "step": 300000 }, { "epoch": 1.538398424680013, "eval_accuracy": 0.7931773697455494, "eval_loss": 0.7950462102890015, "eval_runtime": 70.2224, "eval_samples_per_second": 7180.942, "eval_steps_per_second": 28.054, "step": 300000 }, { "epoch": 1.5435264194289466, "grad_norm": 0.5387662649154663, "learning_rate": 0.0005531284870364294, "loss": 0.8025, "step": 301000 }, { "epoch": 1.54865441417788, "grad_norm": 0.8784258961677551, "learning_rate": 0.0005523088283557598, "loss": 0.7952, "step": 302000 }, { "epoch": 1.5537824089268133, "grad_norm": 0.5515384674072266, "learning_rate": 0.0005514883491959305, "loss": 0.794, "step": 303000 }, { "epoch": 1.5589104036757466, "grad_norm": 0.43967705965042114, "learning_rate": 0.0005506678700361011, "loss": 0.7904, "step": 304000 }, { "epoch": 1.56403839842468, "grad_norm": 0.5585535168647766, "learning_rate": 0.0005498473908762718, "loss": 0.7933, "step": 305000 }, { "epoch": 1.56403839842468, "eval_accuracy": 0.7950494087410737, "eval_loss": 0.7864640355110168, "eval_runtime": 71.0956, "eval_samples_per_second": 7092.745, "eval_steps_per_second": 27.709, "step": 305000 }, { "epoch": 1.5691663931736133, "grad_norm": 0.6416157484054565, "learning_rate": 0.0005490285526747621, "loss": 0.7961, "step": 306000 }, { "epoch": 1.5742943879225466, "grad_norm": 0.4676745533943176, "learning_rate": 0.0005482088939940925, "loss": 0.8016, "step": 307000 }, { "epoch": 1.5794223826714802, "grad_norm": 0.5629871487617493, "learning_rate": 0.0005473884148342633, "loss": 0.797, "step": 308000 }, { "epoch": 1.5845503774204135, "grad_norm": 0.5258574485778809, "learning_rate": 0.0005465679356744338, "loss": 0.7955, "step": 309000 }, { "epoch": 1.589678372169347, "grad_norm": 0.470490962266922, "learning_rate": 0.0005457482769937644, "loss": 0.7927, "step": 310000 }, { "epoch": 1.589678372169347, "eval_accuracy": 0.7946071791902242, "eval_loss": 0.7886035442352295, "eval_runtime": 70.8059, "eval_samples_per_second": 7121.762, "eval_steps_per_second": 27.823, "step": 310000 }, { "epoch": 1.5948063669182804, "grad_norm": 0.517235279083252, "learning_rate": 0.000544927797833935, "loss": 0.7927, "step": 311000 }, { "epoch": 1.5999343616672137, "grad_norm": 0.5009629726409912, "learning_rate": 0.0005441073186741057, "loss": 0.7927, "step": 312000 }, { "epoch": 1.605062356416147, "grad_norm": 0.48802587389945984, "learning_rate": 0.0005432876599934362, "loss": 0.7915, "step": 313000 }, { "epoch": 1.6101903511650804, "grad_norm": 0.5152111649513245, "learning_rate": 0.0005424671808336068, "loss": 0.7972, "step": 314000 }, { "epoch": 1.6153183459140137, "grad_norm": 0.5144213438034058, "learning_rate": 0.0005416467016737775, "loss": 0.799, "step": 315000 }, { "epoch": 1.6153183459140137, "eval_accuracy": 0.7953984329605781, "eval_loss": 0.784041166305542, "eval_runtime": 70.5336, "eval_samples_per_second": 7149.264, "eval_steps_per_second": 27.93, "step": 315000 }, { "epoch": 1.620446340662947, "grad_norm": 0.5271795988082886, "learning_rate": 0.0005408262225139482, "loss": 0.7932, "step": 316000 }, { "epoch": 1.6255743354118806, "grad_norm": 0.5851743817329407, "learning_rate": 0.0005400057433541188, "loss": 0.7947, "step": 317000 }, { "epoch": 1.630702330160814, "grad_norm": 0.5321177244186401, "learning_rate": 0.0005391860846734493, "loss": 0.7944, "step": 318000 }, { "epoch": 1.6358303249097474, "grad_norm": 0.6146435737609863, "learning_rate": 0.00053836560551362, "loss": 0.7928, "step": 319000 }, { "epoch": 1.6409583196586808, "grad_norm": 0.5450624823570251, "learning_rate": 0.0005375459468329504, "loss": 0.7961, "step": 320000 }, { "epoch": 1.6409583196586808, "eval_accuracy": 0.7900559826915716, "eval_loss": 0.8131775259971619, "eval_runtime": 70.478, "eval_samples_per_second": 7154.903, "eval_steps_per_second": 27.952, "step": 320000 }, { "epoch": 1.646086314407614, "grad_norm": 0.5331543684005737, "learning_rate": 0.0005367254676731211, "loss": 0.7874, "step": 321000 }, { "epoch": 1.6512143091565474, "grad_norm": 0.5814546942710876, "learning_rate": 0.0005359049885132918, "loss": 0.796, "step": 322000 }, { "epoch": 1.6563423039054808, "grad_norm": 0.5462503433227539, "learning_rate": 0.0005350845093534624, "loss": 0.7873, "step": 323000 }, { "epoch": 1.661470298654414, "grad_norm": 0.5096405744552612, "learning_rate": 0.000534264850672793, "loss": 0.7932, "step": 324000 }, { "epoch": 1.6665982934033474, "grad_norm": 0.5340434908866882, "learning_rate": 0.0005334443715129636, "loss": 0.7866, "step": 325000 }, { "epoch": 1.6665982934033474, "eval_accuracy": 0.795765305009489, "eval_loss": 0.7828590869903564, "eval_runtime": 70.4009, "eval_samples_per_second": 7162.738, "eval_steps_per_second": 27.983, "step": 325000 }, { "epoch": 1.671726288152281, "grad_norm": 0.5498859882354736, "learning_rate": 0.000532624712832294, "loss": 0.7957, "step": 326000 }, { "epoch": 1.6768542829012143, "grad_norm": 0.5334329605102539, "learning_rate": 0.0005318042336724648, "loss": 0.7935, "step": 327000 }, { "epoch": 1.6819822776501479, "grad_norm": 0.5516751408576965, "learning_rate": 0.0005309837545126354, "loss": 0.7898, "step": 328000 }, { "epoch": 1.6871102723990812, "grad_norm": 0.4836411476135254, "learning_rate": 0.0005301640958319659, "loss": 0.7903, "step": 329000 }, { "epoch": 1.6922382671480145, "grad_norm": 0.5234991312026978, "learning_rate": 0.0005293436166721366, "loss": 0.7898, "step": 330000 }, { "epoch": 1.6922382671480145, "eval_accuracy": 0.7959041214604284, "eval_loss": 0.78127521276474, "eval_runtime": 70.4799, "eval_samples_per_second": 7154.702, "eval_steps_per_second": 27.951, "step": 330000 }, { "epoch": 1.6973662618969478, "grad_norm": 0.5535818934440613, "learning_rate": 0.000528523957991467, "loss": 0.7923, "step": 331000 }, { "epoch": 1.7024942566458812, "grad_norm": 0.5635313391685486, "learning_rate": 0.0005277034788316378, "loss": 0.7882, "step": 332000 }, { "epoch": 1.7076222513948145, "grad_norm": 0.5740207433700562, "learning_rate": 0.0005268829996718083, "loss": 0.7887, "step": 333000 }, { "epoch": 1.7127502461437478, "grad_norm": 0.5098114013671875, "learning_rate": 0.000526062520511979, "loss": 0.7858, "step": 334000 }, { "epoch": 1.7178782408926814, "grad_norm": 0.6315898895263672, "learning_rate": 0.0005252428618313095, "loss": 0.7885, "step": 335000 }, { "epoch": 1.7178782408926814, "eval_accuracy": 0.7968698873405347, "eval_loss": 0.7796327471733093, "eval_runtime": 70.4022, "eval_samples_per_second": 7162.606, "eval_steps_per_second": 27.982, "step": 335000 }, { "epoch": 1.7230062356416147, "grad_norm": 0.5062967538833618, "learning_rate": 0.0005244223826714802, "loss": 0.7917, "step": 336000 }, { "epoch": 1.728134230390548, "grad_norm": 0.5790054798126221, "learning_rate": 0.0005236027239908107, "loss": 0.7898, "step": 337000 }, { "epoch": 1.7332622251394816, "grad_norm": 0.5439338684082031, "learning_rate": 0.0005227822448309813, "loss": 0.787, "step": 338000 }, { "epoch": 1.738390219888415, "grad_norm": 0.6131258010864258, "learning_rate": 0.000521961765671152, "loss": 0.7923, "step": 339000 }, { "epoch": 1.7435182146373482, "grad_norm": 0.5550510287284851, "learning_rate": 0.0005211421069904825, "loss": 0.7901, "step": 340000 }, { "epoch": 1.7435182146373482, "eval_accuracy": 0.7958188484977086, "eval_loss": 0.781705915927887, "eval_runtime": 70.8007, "eval_samples_per_second": 7122.293, "eval_steps_per_second": 27.825, "step": 340000 }, { "epoch": 1.7486462093862816, "grad_norm": 0.48614928126335144, "learning_rate": 0.0005203216278306532, "loss": 0.784, "step": 341000 }, { "epoch": 1.753774204135215, "grad_norm": 0.5720589756965637, "learning_rate": 0.0005195011486708237, "loss": 0.794, "step": 342000 }, { "epoch": 1.7589021988841482, "grad_norm": 0.4716200828552246, "learning_rate": 0.0005186814899901542, "loss": 0.7877, "step": 343000 }, { "epoch": 1.7640301936330816, "grad_norm": 0.5630565881729126, "learning_rate": 0.0005178610108303249, "loss": 0.7902, "step": 344000 }, { "epoch": 1.769158188382015, "grad_norm": 0.5793322920799255, "learning_rate": 0.0005170405316704956, "loss": 0.7916, "step": 345000 }, { "epoch": 1.769158188382015, "eval_accuracy": 0.7961777881779944, "eval_loss": 0.7823154926300049, "eval_runtime": 70.6721, "eval_samples_per_second": 7135.251, "eval_steps_per_second": 27.875, "step": 345000 }, { "epoch": 1.7742861831309484, "grad_norm": 0.5480318069458008, "learning_rate": 0.0005162208729898261, "loss": 0.7844, "step": 346000 }, { "epoch": 1.779414177879882, "grad_norm": 0.5857470035552979, "learning_rate": 0.0005154003938299967, "loss": 0.7906, "step": 347000 }, { "epoch": 1.7845421726288153, "grad_norm": 0.5360823273658752, "learning_rate": 0.0005145807351493272, "loss": 0.7909, "step": 348000 }, { "epoch": 1.7896701673777486, "grad_norm": 0.48977744579315186, "learning_rate": 0.0005137602559894979, "loss": 0.7876, "step": 349000 }, { "epoch": 1.794798162126682, "grad_norm": 0.513276219367981, "learning_rate": 0.0005129405973088284, "loss": 0.787, "step": 350000 }, { "epoch": 1.794798162126682, "eval_accuracy": 0.7968520395111282, "eval_loss": 0.7788918018341064, "eval_runtime": 70.4278, "eval_samples_per_second": 7160.004, "eval_steps_per_second": 27.972, "step": 350000 }, { "epoch": 1.7999261568756153, "grad_norm": 0.6035018563270569, "learning_rate": 0.0005121201181489991, "loss": 0.7845, "step": 351000 }, { "epoch": 1.8050541516245486, "grad_norm": 0.5028976202011108, "learning_rate": 0.0005112996389891697, "loss": 0.7854, "step": 352000 }, { "epoch": 1.810182146373482, "grad_norm": 0.48929738998413086, "learning_rate": 0.0005104799803085002, "loss": 0.7848, "step": 353000 }, { "epoch": 1.8153101411224155, "grad_norm": 0.4971577823162079, "learning_rate": 0.0005096595011486708, "loss": 0.7882, "step": 354000 }, { "epoch": 1.8204381358713488, "grad_norm": 0.5837960839271545, "learning_rate": 0.0005088398424680013, "loss": 0.7822, "step": 355000 }, { "epoch": 1.8204381358713488, "eval_accuracy": 0.7967806481935023, "eval_loss": 0.7787038087844849, "eval_runtime": 72.4468, "eval_samples_per_second": 6960.459, "eval_steps_per_second": 27.192, "step": 355000 }, { "epoch": 1.8255661306202824, "grad_norm": 0.5238701105117798, "learning_rate": 0.000508019363308172, "loss": 0.7797, "step": 356000 }, { "epoch": 1.8306941253692157, "grad_norm": 0.5829733610153198, "learning_rate": 0.0005071988841483427, "loss": 0.7821, "step": 357000 }, { "epoch": 1.835822120118149, "grad_norm": 0.5547536015510559, "learning_rate": 0.0005063784049885132, "loss": 0.7862, "step": 358000 }, { "epoch": 1.8409501148670824, "grad_norm": 0.5710061192512512, "learning_rate": 0.0005055587463078438, "loss": 0.7805, "step": 359000 }, { "epoch": 1.8460781096160157, "grad_norm": 0.5201205015182495, "learning_rate": 0.0005047382671480144, "loss": 0.7844, "step": 360000 }, { "epoch": 1.8460781096160157, "eval_accuracy": 0.7980775904637064, "eval_loss": 0.775397002696991, "eval_runtime": 70.5955, "eval_samples_per_second": 7142.994, "eval_steps_per_second": 27.905, "step": 360000 }, { "epoch": 1.851206104364949, "grad_norm": 0.5080453753471375, "learning_rate": 0.0005039177879881851, "loss": 0.787, "step": 361000 }, { "epoch": 1.8563340991138824, "grad_norm": 0.5578721165657043, "learning_rate": 0.0005030973088283558, "loss": 0.7854, "step": 362000 }, { "epoch": 1.861462093862816, "grad_norm": 0.5530651211738586, "learning_rate": 0.0005022776501476862, "loss": 0.7836, "step": 363000 }, { "epoch": 1.8665900886117492, "grad_norm": 0.5697611570358276, "learning_rate": 0.000501457170987857, "loss": 0.789, "step": 364000 }, { "epoch": 1.8717180833606828, "grad_norm": 0.48562246561050415, "learning_rate": 0.0005006375123071874, "loss": 0.7849, "step": 365000 }, { "epoch": 1.8717180833606828, "eval_accuracy": 0.7972189115600391, "eval_loss": 0.7775390148162842, "eval_runtime": 70.9915, "eval_samples_per_second": 7103.149, "eval_steps_per_second": 27.75, "step": 365000 }, { "epoch": 1.8768460781096161, "grad_norm": 0.5869441032409668, "learning_rate": 0.0004998178536265179, "loss": 0.7889, "step": 366000 }, { "epoch": 1.8819740728585495, "grad_norm": 0.5583035945892334, "learning_rate": 0.0004989973744666886, "loss": 0.784, "step": 367000 }, { "epoch": 1.8871020676074828, "grad_norm": 0.7010840177536011, "learning_rate": 0.000498177715786019, "loss": 0.785, "step": 368000 }, { "epoch": 1.892230062356416, "grad_norm": 0.6296639442443848, "learning_rate": 0.0004973572366261897, "loss": 0.7858, "step": 369000 }, { "epoch": 1.8973580571053494, "grad_norm": 0.5181257128715515, "learning_rate": 0.0004965367574663604, "loss": 0.7845, "step": 370000 }, { "epoch": 1.8973580571053494, "eval_accuracy": 0.7972744381404149, "eval_loss": 0.776125431060791, "eval_runtime": 71.3881, "eval_samples_per_second": 7063.681, "eval_steps_per_second": 27.596, "step": 370000 }, { "epoch": 1.9024860518542828, "grad_norm": 0.6372833847999573, "learning_rate": 0.0004957170987856909, "loss": 0.7888, "step": 371000 }, { "epoch": 1.9076140466032163, "grad_norm": 0.5316877961158752, "learning_rate": 0.0004948966196258616, "loss": 0.7826, "step": 372000 }, { "epoch": 1.9127420413521496, "grad_norm": 0.6046363115310669, "learning_rate": 0.000494076960945192, "loss": 0.7868, "step": 373000 }, { "epoch": 1.9178700361010832, "grad_norm": 0.4909999370574951, "learning_rate": 0.0004932564817853627, "loss": 0.7822, "step": 374000 }, { "epoch": 1.9229980308500165, "grad_norm": 0.6157692670822144, "learning_rate": 0.0004924360026255333, "loss": 0.7905, "step": 375000 }, { "epoch": 1.9229980308500165, "eval_accuracy": 0.7982957306008968, "eval_loss": 0.7735826373100281, "eval_runtime": 72.1539, "eval_samples_per_second": 6988.716, "eval_steps_per_second": 27.303, "step": 375000 }, { "epoch": 1.9281260255989499, "grad_norm": 0.5116710662841797, "learning_rate": 0.000491615523465704, "loss": 0.7829, "step": 376000 }, { "epoch": 1.9332540203478832, "grad_norm": 0.6274186968803406, "learning_rate": 0.0004907958647850345, "loss": 0.7822, "step": 377000 }, { "epoch": 1.9383820150968165, "grad_norm": 0.5890050530433655, "learning_rate": 0.000489976206104365, "loss": 0.7776, "step": 378000 }, { "epoch": 1.9435100098457498, "grad_norm": 0.5998618006706238, "learning_rate": 0.0004891557269445356, "loss": 0.7807, "step": 379000 }, { "epoch": 1.9486380045946832, "grad_norm": 0.6149361729621887, "learning_rate": 0.0004883360682638662, "loss": 0.788, "step": 380000 }, { "epoch": 1.9486380045946832, "eval_accuracy": 0.7978316870363282, "eval_loss": 0.7737977504730225, "eval_runtime": 72.3374, "eval_samples_per_second": 6970.982, "eval_steps_per_second": 27.233, "step": 380000 }, { "epoch": 1.9537659993436167, "grad_norm": 0.5185332298278809, "learning_rate": 0.00048751558910403676, "loss": 0.7807, "step": 381000 }, { "epoch": 1.95889399409255, "grad_norm": 0.5065966248512268, "learning_rate": 0.0004866959304233673, "loss": 0.7821, "step": 382000 }, { "epoch": 1.9640219888414834, "grad_norm": 0.5468112230300903, "learning_rate": 0.0004858754512635379, "loss": 0.7815, "step": 383000 }, { "epoch": 1.969149983590417, "grad_norm": 0.5651405453681946, "learning_rate": 0.00048505579258286845, "loss": 0.7814, "step": 384000 }, { "epoch": 1.9742779783393503, "grad_norm": 0.5032802820205688, "learning_rate": 0.00048423531342303906, "loss": 0.7832, "step": 385000 }, { "epoch": 1.9742779783393503, "eval_accuracy": 0.7980478440813623, "eval_loss": 0.7719081044197083, "eval_runtime": 72.6605, "eval_samples_per_second": 6939.985, "eval_steps_per_second": 27.112, "step": 385000 }, { "epoch": 1.9794059730882836, "grad_norm": 0.5570623278617859, "learning_rate": 0.0004834148342632097, "loss": 0.784, "step": 386000 }, { "epoch": 1.984533967837217, "grad_norm": 0.5148215293884277, "learning_rate": 0.00048259517558254026, "loss": 0.776, "step": 387000 }, { "epoch": 1.9896619625861502, "grad_norm": 0.5762397646903992, "learning_rate": 0.00048177469642271087, "loss": 0.783, "step": 388000 }, { "epoch": 1.9947899573350836, "grad_norm": 0.5391610264778137, "learning_rate": 0.0004809550377420414, "loss": 0.7848, "step": 389000 }, { "epoch": 1.999917952084017, "grad_norm": 0.5661218166351318, "learning_rate": 0.000480134558582212, "loss": 0.7787, "step": 390000 }, { "epoch": 1.999917952084017, "eval_accuracy": 0.7985733635027753, "eval_loss": 0.7710365056991577, "eval_runtime": 71.0173, "eval_samples_per_second": 7100.566, "eval_steps_per_second": 27.74, "step": 390000 }, { "epoch": 2.0050459468329502, "grad_norm": 0.5676838159561157, "learning_rate": 0.0004793140794223827, "loss": 0.7707, "step": 391000 }, { "epoch": 2.010173941581884, "grad_norm": 0.6141178011894226, "learning_rate": 0.0004784944207417132, "loss": 0.7632, "step": 392000 }, { "epoch": 2.0153019363308173, "grad_norm": 0.5664064884185791, "learning_rate": 0.0004776739415818838, "loss": 0.7652, "step": 393000 }, { "epoch": 2.0204299310797507, "grad_norm": 0.555926501750946, "learning_rate": 0.00047685428290121436, "loss": 0.7647, "step": 394000 }, { "epoch": 2.025557925828684, "grad_norm": 0.5670093297958374, "learning_rate": 0.0004760346242205448, "loss": 0.767, "step": 395000 }, { "epoch": 2.025557925828684, "eval_accuracy": 0.7984861074478992, "eval_loss": 0.7716742753982544, "eval_runtime": 70.4739, "eval_samples_per_second": 7155.313, "eval_steps_per_second": 27.954, "step": 395000 }, { "epoch": 2.0306859205776173, "grad_norm": 0.5309360027313232, "learning_rate": 0.00047521414506071545, "loss": 0.7649, "step": 396000 }, { "epoch": 2.0358139153265506, "grad_norm": 0.5965875387191772, "learning_rate": 0.00047439366590088617, "loss": 0.7625, "step": 397000 }, { "epoch": 2.040941910075484, "grad_norm": 0.5864492058753967, "learning_rate": 0.0004735731867410568, "loss": 0.7727, "step": 398000 }, { "epoch": 2.0460699048244173, "grad_norm": 0.5250040888786316, "learning_rate": 0.00047275270758122745, "loss": 0.7614, "step": 399000 }, { "epoch": 2.0511978995733506, "grad_norm": 0.5365161299705505, "learning_rate": 0.000471933048900558, "loss": 0.7666, "step": 400000 }, { "epoch": 2.0511978995733506, "eval_accuracy": 0.7989104891693422, "eval_loss": 0.769827663898468, "eval_runtime": 71.0886, "eval_samples_per_second": 7093.444, "eval_steps_per_second": 27.712, "step": 400000 }, { "epoch": 2.0563258943222844, "grad_norm": 0.5217652320861816, "learning_rate": 0.0004711125697407286, "loss": 0.769, "step": 401000 }, { "epoch": 2.0614538890712177, "grad_norm": 0.5363185405731201, "learning_rate": 0.00047029291106005913, "loss": 0.7726, "step": 402000 }, { "epoch": 2.066581883820151, "grad_norm": 0.5053459405899048, "learning_rate": 0.00046947243190022974, "loss": 0.7718, "step": 403000 }, { "epoch": 2.0717098785690844, "grad_norm": 0.5293710231781006, "learning_rate": 0.0004686519527404004, "loss": 0.7601, "step": 404000 }, { "epoch": 2.0768378733180177, "grad_norm": 0.520518958568573, "learning_rate": 0.00046783311453889076, "loss": 0.7631, "step": 405000 }, { "epoch": 2.0768378733180177, "eval_accuracy": 0.798196575993083, "eval_loss": 0.7718871235847473, "eval_runtime": 70.3742, "eval_samples_per_second": 7165.453, "eval_steps_per_second": 27.993, "step": 405000 }, { "epoch": 2.081965868066951, "grad_norm": 0.5407614707946777, "learning_rate": 0.00046701263537906137, "loss": 0.7654, "step": 406000 }, { "epoch": 2.0870938628158844, "grad_norm": 0.5390910506248474, "learning_rate": 0.0004661921562192321, "loss": 0.7657, "step": 407000 }, { "epoch": 2.0922218575648177, "grad_norm": 0.5459956526756287, "learning_rate": 0.0004653716770594027, "loss": 0.7575, "step": 408000 }, { "epoch": 2.097349852313751, "grad_norm": 0.647534191608429, "learning_rate": 0.00046455119789957336, "loss": 0.7614, "step": 409000 }, { "epoch": 2.102477847062685, "grad_norm": 0.6167773604393005, "learning_rate": 0.0004637307187397441, "loss": 0.7634, "step": 410000 }, { "epoch": 2.102477847062685, "eval_accuracy": 0.7994003129319422, "eval_loss": 0.7683917880058289, "eval_runtime": 71.0406, "eval_samples_per_second": 7098.237, "eval_steps_per_second": 27.731, "step": 410000 }, { "epoch": 2.107605841811618, "grad_norm": 0.5860775709152222, "learning_rate": 0.0004629110600590745, "loss": 0.7686, "step": 411000 }, { "epoch": 2.1127338365605515, "grad_norm": 0.6858440041542053, "learning_rate": 0.00046209058089924517, "loss": 0.7654, "step": 412000 }, { "epoch": 2.117861831309485, "grad_norm": 0.6214200258255005, "learning_rate": 0.0004612701017394159, "loss": 0.7693, "step": 413000 }, { "epoch": 2.122989826058418, "grad_norm": 0.5739907026290894, "learning_rate": 0.0004604504430587463, "loss": 0.7657, "step": 414000 }, { "epoch": 2.1281178208073515, "grad_norm": 0.5387310981750488, "learning_rate": 0.00045962996389891693, "loss": 0.7621, "step": 415000 }, { "epoch": 2.1281178208073515, "eval_accuracy": 0.7986586364654952, "eval_loss": 0.7706997990608215, "eval_runtime": 69.9541, "eval_samples_per_second": 7208.486, "eval_steps_per_second": 28.161, "step": 415000 }, { "epoch": 2.133245815556285, "grad_norm": 0.5783685445785522, "learning_rate": 0.00045881030521824747, "loss": 0.7629, "step": 416000 }, { "epoch": 2.138373810305218, "grad_norm": 0.4915275573730469, "learning_rate": 0.00045798982605841813, "loss": 0.7669, "step": 417000 }, { "epoch": 2.1435018050541514, "grad_norm": 0.5774083733558655, "learning_rate": 0.00045717016737774867, "loss": 0.7621, "step": 418000 }, { "epoch": 2.1486297998030848, "grad_norm": 0.6195769309997559, "learning_rate": 0.0004563496882179193, "loss": 0.7711, "step": 419000 }, { "epoch": 2.1537577945520185, "grad_norm": 0.5560759902000427, "learning_rate": 0.0004555300295372498, "loss": 0.7694, "step": 420000 }, { "epoch": 2.1537577945520185, "eval_accuracy": 0.7994161776691925, "eval_loss": 0.7699734568595886, "eval_runtime": 70.8926, "eval_samples_per_second": 7113.055, "eval_steps_per_second": 27.789, "step": 420000 }, { "epoch": 2.158885789300952, "grad_norm": 0.7171072959899902, "learning_rate": 0.0004547095503774204, "loss": 0.7674, "step": 421000 }, { "epoch": 2.164013784049885, "grad_norm": 0.5899352431297302, "learning_rate": 0.0004538898916967509, "loss": 0.7665, "step": 422000 }, { "epoch": 2.1691417787988185, "grad_norm": 0.44034343957901, "learning_rate": 0.0004530694125369216, "loss": 0.763, "step": 423000 }, { "epoch": 2.174269773547752, "grad_norm": 0.5452010631561279, "learning_rate": 0.00045224893337709223, "loss": 0.7658, "step": 424000 }, { "epoch": 2.179397768296685, "grad_norm": 0.5799173712730408, "learning_rate": 0.00045142927469642266, "loss": 0.7648, "step": 425000 }, { "epoch": 2.179397768296685, "eval_accuracy": 0.7994617887887868, "eval_loss": 0.7678025364875793, "eval_runtime": 70.6789, "eval_samples_per_second": 7134.561, "eval_steps_per_second": 27.873, "step": 425000 }, { "epoch": 2.1845257630456185, "grad_norm": 0.5448973178863525, "learning_rate": 0.00045060879553659343, "loss": 0.768, "step": 426000 }, { "epoch": 2.189653757794552, "grad_norm": 0.6005496382713318, "learning_rate": 0.00044978913685592386, "loss": 0.7636, "step": 427000 }, { "epoch": 2.194781752543485, "grad_norm": 0.5918001532554626, "learning_rate": 0.0004489686576960946, "loss": 0.7623, "step": 428000 }, { "epoch": 2.199909747292419, "grad_norm": 0.5451094508171082, "learning_rate": 0.0004481481785362652, "loss": 0.7672, "step": 429000 }, { "epoch": 2.2050377420413523, "grad_norm": 0.579969584941864, "learning_rate": 0.0004473285198555957, "loss": 0.7612, "step": 430000 }, { "epoch": 2.2050377420413523, "eval_accuracy": 0.79951334918485, "eval_loss": 0.7673205137252808, "eval_runtime": 71.0233, "eval_samples_per_second": 7099.971, "eval_steps_per_second": 27.737, "step": 430000 }, { "epoch": 2.2101657367902856, "grad_norm": 0.6007245182991028, "learning_rate": 0.0004465080406957664, "loss": 0.7678, "step": 431000 }, { "epoch": 2.215293731539219, "grad_norm": 0.6319281458854675, "learning_rate": 0.0004456883820150968, "loss": 0.7647, "step": 432000 }, { "epoch": 2.2204217262881523, "grad_norm": 0.6171641945838928, "learning_rate": 0.00044486790285526754, "loss": 0.7607, "step": 433000 }, { "epoch": 2.2255497210370856, "grad_norm": 0.644342839717865, "learning_rate": 0.00044404742369543815, "loss": 0.7617, "step": 434000 }, { "epoch": 2.230677715786019, "grad_norm": 0.6072486042976379, "learning_rate": 0.00044322776501476863, "loss": 0.7627, "step": 435000 }, { "epoch": 2.230677715786019, "eval_accuracy": 0.7996541487279455, "eval_loss": 0.7670552134513855, "eval_runtime": 71.2576, "eval_samples_per_second": 7076.621, "eval_steps_per_second": 27.646, "step": 435000 }, { "epoch": 2.2358057105349523, "grad_norm": 0.525732696056366, "learning_rate": 0.00044240728585493935, "loss": 0.7599, "step": 436000 }, { "epoch": 2.2409337052838856, "grad_norm": 0.6272632479667664, "learning_rate": 0.0004415876271742698, "loss": 0.7627, "step": 437000 }, { "epoch": 2.2460617000328194, "grad_norm": 0.6143770813941956, "learning_rate": 0.0004407671480144404, "loss": 0.7605, "step": 438000 }, { "epoch": 2.2511896947817527, "grad_norm": 0.5599572062492371, "learning_rate": 0.0004399466688546111, "loss": 0.7606, "step": 439000 }, { "epoch": 2.256317689530686, "grad_norm": 0.5624639987945557, "learning_rate": 0.0004391270101739416, "loss": 0.766, "step": 440000 }, { "epoch": 2.256317689530686, "eval_accuracy": 0.800255025651297, "eval_loss": 0.7649426460266113, "eval_runtime": 70.8234, "eval_samples_per_second": 7120.002, "eval_steps_per_second": 27.816, "step": 440000 }, { "epoch": 2.2614456842796193, "grad_norm": 0.6064088940620422, "learning_rate": 0.00043830735149327213, "loss": 0.7612, "step": 441000 }, { "epoch": 2.2665736790285527, "grad_norm": 0.6239656209945679, "learning_rate": 0.00043748687233344274, "loss": 0.7655, "step": 442000 }, { "epoch": 2.271701673777486, "grad_norm": 0.553521990776062, "learning_rate": 0.0004366663931736134, "loss": 0.7607, "step": 443000 }, { "epoch": 2.2768296685264193, "grad_norm": 0.5122565031051636, "learning_rate": 0.0004358459140137841, "loss": 0.7639, "step": 444000 }, { "epoch": 2.2819576632753527, "grad_norm": 0.5602110624313354, "learning_rate": 0.00043502625533311455, "loss": 0.7635, "step": 445000 }, { "epoch": 2.2819576632753527, "eval_accuracy": 0.8000408516984193, "eval_loss": 0.7652831077575684, "eval_runtime": 71.0631, "eval_samples_per_second": 7095.986, "eval_steps_per_second": 27.722, "step": 445000 }, { "epoch": 2.287085658024286, "grad_norm": 0.6772589683532715, "learning_rate": 0.00043420577617328527, "loss": 0.7651, "step": 446000 }, { "epoch": 2.2922136527732198, "grad_norm": 0.5864415168762207, "learning_rate": 0.0004333861174926157, "loss": 0.7581, "step": 447000 }, { "epoch": 2.297341647522153, "grad_norm": 0.6166993975639343, "learning_rate": 0.00043256563833278636, "loss": 0.7636, "step": 448000 }, { "epoch": 2.3024696422710864, "grad_norm": 0.5639147162437439, "learning_rate": 0.0004317459796521169, "loss": 0.7671, "step": 449000 }, { "epoch": 2.3075976370200197, "grad_norm": 0.56409752368927, "learning_rate": 0.0004309255004922875, "loss": 0.761, "step": 450000 }, { "epoch": 2.3075976370200197, "eval_accuracy": 0.8000130884082314, "eval_loss": 0.7647321224212646, "eval_runtime": 70.9452, "eval_samples_per_second": 7107.782, "eval_steps_per_second": 27.768, "step": 450000 }, { "epoch": 2.312725631768953, "grad_norm": 0.5483759045600891, "learning_rate": 0.00043010584181161804, "loss": 0.7638, "step": 451000 }, { "epoch": 2.3178536265178864, "grad_norm": 0.5161094069480896, "learning_rate": 0.00042928536265178865, "loss": 0.7657, "step": 452000 }, { "epoch": 2.3229816212668197, "grad_norm": 0.4951265752315521, "learning_rate": 0.0004284648834919593, "loss": 0.7587, "step": 453000 }, { "epoch": 2.328109616015753, "grad_norm": 0.5597565174102783, "learning_rate": 0.00042764440433213003, "loss": 0.7607, "step": 454000 }, { "epoch": 2.3332376107646864, "grad_norm": 0.6346337795257568, "learning_rate": 0.00042682474565146046, "loss": 0.7649, "step": 455000 }, { "epoch": 2.3332376107646864, "eval_accuracy": 0.8000943951866387, "eval_loss": 0.7661289572715759, "eval_runtime": 70.8231, "eval_samples_per_second": 7120.038, "eval_steps_per_second": 27.816, "step": 455000 }, { "epoch": 2.33836560551362, "grad_norm": 0.5165403485298157, "learning_rate": 0.00042600426649163107, "loss": 0.7653, "step": 456000 }, { "epoch": 2.3434936002625535, "grad_norm": 0.5810725092887878, "learning_rate": 0.0004251846078109616, "loss": 0.7664, "step": 457000 }, { "epoch": 2.348621595011487, "grad_norm": 0.6094423532485962, "learning_rate": 0.0004243641286511323, "loss": 0.7701, "step": 458000 }, { "epoch": 2.35374958976042, "grad_norm": 0.6010240912437439, "learning_rate": 0.000423543649491303, "loss": 0.7638, "step": 459000 }, { "epoch": 2.3588775845093535, "grad_norm": 0.5759085416793823, "learning_rate": 0.0004227239908106334, "loss": 0.7589, "step": 460000 }, { "epoch": 2.3588775845093535, "eval_accuracy": 0.8005048952629877, "eval_loss": 0.7629817724227905, "eval_runtime": 70.2507, "eval_samples_per_second": 7178.051, "eval_steps_per_second": 28.042, "step": 460000 }, { "epoch": 2.364005579258287, "grad_norm": 0.5739274024963379, "learning_rate": 0.0004219035116508041, "loss": 0.7583, "step": 461000 }, { "epoch": 2.36913357400722, "grad_norm": 0.47534069418907166, "learning_rate": 0.0004210838529701346, "loss": 0.7614, "step": 462000 }, { "epoch": 2.3742615687561535, "grad_norm": 0.5601282119750977, "learning_rate": 0.00042026337381030523, "loss": 0.7656, "step": 463000 }, { "epoch": 2.379389563505087, "grad_norm": 0.5563719272613525, "learning_rate": 0.00041944371512963577, "loss": 0.7572, "step": 464000 }, { "epoch": 2.3845175582540206, "grad_norm": 0.6598874926567078, "learning_rate": 0.0004186240564489662, "loss": 0.7586, "step": 465000 }, { "epoch": 2.3845175582540206, "eval_accuracy": 0.7987895205478094, "eval_loss": 0.7702716588973999, "eval_runtime": 70.6182, "eval_samples_per_second": 7140.692, "eval_steps_per_second": 27.896, "step": 465000 }, { "epoch": 2.389645553002954, "grad_norm": 0.5708329081535339, "learning_rate": 0.00041780357728913686, "loss": 0.7666, "step": 466000 }, { "epoch": 2.394773547751887, "grad_norm": 0.6055165529251099, "learning_rate": 0.0004169830981293076, "loss": 0.7662, "step": 467000 }, { "epoch": 2.3999015425008206, "grad_norm": 0.5853766798973083, "learning_rate": 0.0004161626189694782, "loss": 0.7625, "step": 468000 }, { "epoch": 2.405029537249754, "grad_norm": 0.5626079440116882, "learning_rate": 0.0004153429602888087, "loss": 0.7638, "step": 469000 }, { "epoch": 2.410157531998687, "grad_norm": 0.46508702635765076, "learning_rate": 0.00041452248112897934, "loss": 0.7595, "step": 470000 }, { "epoch": 2.410157531998687, "eval_accuracy": 0.8003343493375481, "eval_loss": 0.7640067934989929, "eval_runtime": 70.6075, "eval_samples_per_second": 7141.772, "eval_steps_per_second": 27.901, "step": 470000 }, { "epoch": 2.4152855267476205, "grad_norm": 0.6124268174171448, "learning_rate": 0.0004137028224483098, "loss": 0.7612, "step": 471000 }, { "epoch": 2.420413521496554, "grad_norm": 0.5748667120933533, "learning_rate": 0.00041288234328848054, "loss": 0.7553, "step": 472000 }, { "epoch": 2.425541516245487, "grad_norm": 0.6059972047805786, "learning_rate": 0.00041206268460781097, "loss": 0.7579, "step": 473000 }, { "epoch": 2.430669510994421, "grad_norm": 0.5620743036270142, "learning_rate": 0.0004112422054479816, "loss": 0.7601, "step": 474000 }, { "epoch": 2.4357975057433543, "grad_norm": 0.6134539246559143, "learning_rate": 0.0004104225467673121, "loss": 0.7622, "step": 475000 }, { "epoch": 2.4357975057433543, "eval_accuracy": 0.800485064341425, "eval_loss": 0.7626769542694092, "eval_runtime": 70.659, "eval_samples_per_second": 7136.569, "eval_steps_per_second": 27.88, "step": 475000 }, { "epoch": 2.4409255004922876, "grad_norm": 0.7367038130760193, "learning_rate": 0.0004096020676074828, "loss": 0.7633, "step": 476000 }, { "epoch": 2.446053495241221, "grad_norm": 0.5661228895187378, "learning_rate": 0.0004087815884476535, "loss": 0.7631, "step": 477000 }, { "epoch": 2.4511814899901543, "grad_norm": 0.5520285964012146, "learning_rate": 0.0004079619297669839, "loss": 0.7557, "step": 478000 }, { "epoch": 2.4563094847390876, "grad_norm": 0.5640761256217957, "learning_rate": 0.0004071414506071546, "loss": 0.7643, "step": 479000 }, { "epoch": 2.461437479488021, "grad_norm": 0.5398057699203491, "learning_rate": 0.00040632179192648507, "loss": 0.7593, "step": 480000 }, { "epoch": 2.461437479488021, "eval_accuracy": 0.8013318446921547, "eval_loss": 0.7604992389678955, "eval_runtime": 70.2675, "eval_samples_per_second": 7176.337, "eval_steps_per_second": 28.036, "step": 480000 }, { "epoch": 2.4665654742369543, "grad_norm": 0.5367211699485779, "learning_rate": 0.00040550131276665573, "loss": 0.7611, "step": 481000 }, { "epoch": 2.4716934689858876, "grad_norm": 0.48792392015457153, "learning_rate": 0.00040468083360682645, "loss": 0.7587, "step": 482000 }, { "epoch": 2.4768214637348214, "grad_norm": 0.5297552943229675, "learning_rate": 0.0004038611749261569, "loss": 0.7637, "step": 483000 }, { "epoch": 2.4819494584837547, "grad_norm": 0.531583845615387, "learning_rate": 0.00040304069576632754, "loss": 0.7647, "step": 484000 }, { "epoch": 2.487077453232688, "grad_norm": 0.6632963418960571, "learning_rate": 0.00040222021660649826, "loss": 0.7558, "step": 485000 }, { "epoch": 2.487077453232688, "eval_accuracy": 0.8012168253470907, "eval_loss": 0.7608583569526672, "eval_runtime": 70.9709, "eval_samples_per_second": 7105.208, "eval_steps_per_second": 27.758, "step": 485000 }, { "epoch": 2.4922054479816214, "grad_norm": 0.541089653968811, "learning_rate": 0.00040139973744666887, "loss": 0.7587, "step": 486000 }, { "epoch": 2.4973334427305547, "grad_norm": 0.6149650812149048, "learning_rate": 0.0004005800787659993, "loss": 0.7531, "step": 487000 }, { "epoch": 2.502461437479488, "grad_norm": 0.5699481964111328, "learning_rate": 0.00039975959960617, "loss": 0.7613, "step": 488000 }, { "epoch": 2.5075894322284213, "grad_norm": 0.614250898361206, "learning_rate": 0.00039893994092550056, "loss": 0.7568, "step": 489000 }, { "epoch": 2.5127174269773547, "grad_norm": 0.5428251624107361, "learning_rate": 0.00039811946176567117, "loss": 0.7599, "step": 490000 }, { "epoch": 2.5127174269773547, "eval_accuracy": 0.8002133807160152, "eval_loss": 0.76506108045578, "eval_runtime": 72.0723, "eval_samples_per_second": 6996.63, "eval_steps_per_second": 27.334, "step": 490000 }, { "epoch": 2.517845421726288, "grad_norm": 0.5326504707336426, "learning_rate": 0.00039729980308500165, "loss": 0.7614, "step": 491000 }, { "epoch": 2.5229734164752218, "grad_norm": 0.8246121406555176, "learning_rate": 0.0003964793239251723, "loss": 0.7592, "step": 492000 }, { "epoch": 2.5281014112241547, "grad_norm": 0.5567710995674133, "learning_rate": 0.0003956596652445028, "loss": 0.7612, "step": 493000 }, { "epoch": 2.5332294059730884, "grad_norm": 0.5717312693595886, "learning_rate": 0.00039484000656383333, "loss": 0.7566, "step": 494000 }, { "epoch": 2.5383574007220218, "grad_norm": 0.5597317814826965, "learning_rate": 0.00039401952740400394, "loss": 0.7587, "step": 495000 }, { "epoch": 2.5383574007220218, "eval_accuracy": 0.8016451732528462, "eval_loss": 0.7589249610900879, "eval_runtime": 72.0406, "eval_samples_per_second": 6999.709, "eval_steps_per_second": 27.346, "step": 495000 }, { "epoch": 2.543485395470955, "grad_norm": 0.5756000876426697, "learning_rate": 0.0003931990482441746, "loss": 0.7615, "step": 496000 }, { "epoch": 2.5486133902198884, "grad_norm": 0.6494652628898621, "learning_rate": 0.0003923793895635051, "loss": 0.7616, "step": 497000 }, { "epoch": 2.5537413849688217, "grad_norm": 0.48364412784576416, "learning_rate": 0.0003915589104036758, "loss": 0.7586, "step": 498000 }, { "epoch": 2.558869379717755, "grad_norm": 0.502427875995636, "learning_rate": 0.0003907384312438464, "loss": 0.7564, "step": 499000 }, { "epoch": 2.5639973744666884, "grad_norm": 0.5577316284179688, "learning_rate": 0.0003899179520840171, "loss": 0.7588, "step": 500000 }, { "epoch": 2.5639973744666884, "eval_accuracy": 0.8023571033369492, "eval_loss": 0.7570073008537292, "eval_runtime": 70.8941, "eval_samples_per_second": 7112.903, "eval_steps_per_second": 27.788, "step": 500000 }, { "epoch": 2.569125369215622, "grad_norm": 0.5779034495353699, "learning_rate": 0.00038909829340334756, "loss": 0.7622, "step": 501000 }, { "epoch": 2.574253363964555, "grad_norm": 0.5690110921859741, "learning_rate": 0.00038827781424351823, "loss": 0.7513, "step": 502000 }, { "epoch": 2.579381358713489, "grad_norm": 0.6035880446434021, "learning_rate": 0.0003874581555628487, "loss": 0.7558, "step": 503000 }, { "epoch": 2.584509353462422, "grad_norm": 0.520416796207428, "learning_rate": 0.0003866376764030194, "loss": 0.7567, "step": 504000 }, { "epoch": 2.5896373482113555, "grad_norm": 0.5084878206253052, "learning_rate": 0.00038581719724319004, "loss": 0.762, "step": 505000 }, { "epoch": 2.5896373482113555, "eval_accuracy": 0.8019803158272568, "eval_loss": 0.7566066384315491, "eval_runtime": 72.7272, "eval_samples_per_second": 6933.627, "eval_steps_per_second": 27.088, "step": 505000 }, { "epoch": 2.594765342960289, "grad_norm": 0.431702196598053, "learning_rate": 0.0003849975385625205, "loss": 0.7592, "step": 506000 }, { "epoch": 2.599893337709222, "grad_norm": 0.5581603050231934, "learning_rate": 0.0003841770594026912, "loss": 0.7547, "step": 507000 }, { "epoch": 2.6050213324581555, "grad_norm": 0.5758413076400757, "learning_rate": 0.00038335658024286185, "loss": 0.7615, "step": 508000 }, { "epoch": 2.610149327207089, "grad_norm": 0.5125038623809814, "learning_rate": 0.00038253692156219233, "loss": 0.7537, "step": 509000 }, { "epoch": 2.6152773219560226, "grad_norm": 0.518104612827301, "learning_rate": 0.000381716442402363, "loss": 0.7526, "step": 510000 }, { "epoch": 2.6152773219560226, "eval_accuracy": 0.801333827784311, "eval_loss": 0.7601718306541443, "eval_runtime": 72.6835, "eval_samples_per_second": 6937.791, "eval_steps_per_second": 27.104, "step": 510000 }, { "epoch": 2.6204053167049555, "grad_norm": 0.6345874667167664, "learning_rate": 0.00038089596324253366, "loss": 0.7562, "step": 511000 }, { "epoch": 2.6255333114538892, "grad_norm": 0.5293376445770264, "learning_rate": 0.00038007630456186414, "loss": 0.7594, "step": 512000 }, { "epoch": 2.6306613062028226, "grad_norm": 0.6707202792167664, "learning_rate": 0.0003792558254020348, "loss": 0.7539, "step": 513000 }, { "epoch": 2.635789300951756, "grad_norm": 0.5322111248970032, "learning_rate": 0.00037843534624220547, "loss": 0.7574, "step": 514000 }, { "epoch": 2.6409172957006892, "grad_norm": 0.6770355105400085, "learning_rate": 0.00037761568756153595, "loss": 0.7587, "step": 515000 }, { "epoch": 2.6409172957006892, "eval_accuracy": 0.8021270646468212, "eval_loss": 0.7560163140296936, "eval_runtime": 72.4103, "eval_samples_per_second": 6963.969, "eval_steps_per_second": 27.206, "step": 515000 }, { "epoch": 2.6460452904496226, "grad_norm": 0.5936743021011353, "learning_rate": 0.0003767952084017066, "loss": 0.7551, "step": 516000 }, { "epoch": 2.651173285198556, "grad_norm": 0.5725984573364258, "learning_rate": 0.0003759755497210371, "loss": 0.7603, "step": 517000 }, { "epoch": 2.656301279947489, "grad_norm": 0.5772015452384949, "learning_rate": 0.00037515507056120776, "loss": 0.7541, "step": 518000 }, { "epoch": 2.661429274696423, "grad_norm": 0.6508539915084839, "learning_rate": 0.00037433459140137843, "loss": 0.7572, "step": 519000 }, { "epoch": 2.666557269445356, "grad_norm": 0.569170355796814, "learning_rate": 0.0003735141122415491, "loss": 0.7522, "step": 520000 }, { "epoch": 2.666557269445356, "eval_accuracy": 0.8026208545937338, "eval_loss": 0.755728542804718, "eval_runtime": 72.4617, "eval_samples_per_second": 6959.031, "eval_steps_per_second": 27.187, "step": 520000 }, { "epoch": 2.6716852641942896, "grad_norm": 0.6192373633384705, "learning_rate": 0.0003726944535608796, "loss": 0.7514, "step": 521000 }, { "epoch": 2.676813258943223, "grad_norm": 0.5529782772064209, "learning_rate": 0.00037187397440105024, "loss": 0.75, "step": 522000 }, { "epoch": 2.6819412536921563, "grad_norm": 0.596807062625885, "learning_rate": 0.0003710543157203807, "loss": 0.7602, "step": 523000 }, { "epoch": 2.6870692484410896, "grad_norm": 0.6609178185462952, "learning_rate": 0.0003702338365605514, "loss": 0.7536, "step": 524000 }, { "epoch": 2.692197243190023, "grad_norm": 0.5860297679901123, "learning_rate": 0.00036941335740072205, "loss": 0.7546, "step": 525000 }, { "epoch": 2.692197243190023, "eval_accuracy": 0.8026208545937338, "eval_loss": 0.7542169094085693, "eval_runtime": 70.6137, "eval_samples_per_second": 7141.149, "eval_steps_per_second": 27.898, "step": 525000 }, { "epoch": 2.6973252379389563, "grad_norm": 0.48967838287353516, "learning_rate": 0.00036859369872005253, "loss": 0.7545, "step": 526000 }, { "epoch": 2.7024532326878896, "grad_norm": 0.5674000382423401, "learning_rate": 0.0003677732195602232, "loss": 0.7534, "step": 527000 }, { "epoch": 2.707581227436823, "grad_norm": 0.6165111064910889, "learning_rate": 0.0003669543813587135, "loss": 0.7531, "step": 528000 }, { "epoch": 2.7127092221857563, "grad_norm": 0.6512913107872009, "learning_rate": 0.00036613390219888416, "loss": 0.7603, "step": 529000 }, { "epoch": 2.71783721693469, "grad_norm": 0.5308037400245667, "learning_rate": 0.0003653134230390548, "loss": 0.7542, "step": 530000 }, { "epoch": 2.71783721693469, "eval_accuracy": 0.8029004705877687, "eval_loss": 0.7542742490768433, "eval_runtime": 70.9003, "eval_samples_per_second": 7112.287, "eval_steps_per_second": 27.786, "step": 530000 }, { "epoch": 2.7229652116836234, "grad_norm": 0.6076812148094177, "learning_rate": 0.0003644937643583853, "loss": 0.7526, "step": 531000 }, { "epoch": 2.7280932064325567, "grad_norm": 0.5457537174224854, "learning_rate": 0.000363673285198556, "loss": 0.7548, "step": 532000 }, { "epoch": 2.73322120118149, "grad_norm": 0.5145652890205383, "learning_rate": 0.00036285280603872664, "loss": 0.7527, "step": 533000 }, { "epoch": 2.7383491959304234, "grad_norm": 0.5338176488876343, "learning_rate": 0.0003620323268788973, "loss": 0.7594, "step": 534000 }, { "epoch": 2.7434771906793567, "grad_norm": 0.5713699460029602, "learning_rate": 0.0003612126681982278, "loss": 0.7509, "step": 535000 }, { "epoch": 2.7434771906793567, "eval_accuracy": 0.8029421155230505, "eval_loss": 0.754176914691925, "eval_runtime": 71.0137, "eval_samples_per_second": 7100.921, "eval_steps_per_second": 27.741, "step": 535000 }, { "epoch": 2.74860518542829, "grad_norm": 0.628857433795929, "learning_rate": 0.0003603921890383984, "loss": 0.7535, "step": 536000 }, { "epoch": 2.7537331801772233, "grad_norm": 0.5895251035690308, "learning_rate": 0.00035957253035772893, "loss": 0.7541, "step": 537000 }, { "epoch": 2.7588611749261567, "grad_norm": 0.5879444479942322, "learning_rate": 0.0003587520511978996, "loss": 0.7515, "step": 538000 }, { "epoch": 2.7639891696750905, "grad_norm": 0.545227587223053, "learning_rate": 0.00035793157203807026, "loss": 0.7481, "step": 539000 }, { "epoch": 2.769117164424024, "grad_norm": 0.715880274772644, "learning_rate": 0.00035711109287824087, "loss": 0.7515, "step": 540000 }, { "epoch": 2.769117164424024, "eval_accuracy": 0.8015698157509078, "eval_loss": 0.7585220336914062, "eval_runtime": 70.7574, "eval_samples_per_second": 7126.642, "eval_steps_per_second": 27.842, "step": 540000 }, { "epoch": 2.774245159172957, "grad_norm": 0.5264488458633423, "learning_rate": 0.0003562922546767312, "loss": 0.7567, "step": 541000 }, { "epoch": 2.7793731539218904, "grad_norm": 0.5310335755348206, "learning_rate": 0.0003554717755169019, "loss": 0.7561, "step": 542000 }, { "epoch": 2.7845011486708238, "grad_norm": 0.5482403635978699, "learning_rate": 0.00035465129635707255, "loss": 0.7466, "step": 543000 }, { "epoch": 2.789629143419757, "grad_norm": 0.6248590350151062, "learning_rate": 0.00035383163767640304, "loss": 0.7552, "step": 544000 }, { "epoch": 2.7947571381686904, "grad_norm": 0.5901440382003784, "learning_rate": 0.0003530111585165737, "loss": 0.7508, "step": 545000 }, { "epoch": 2.7947571381686904, "eval_accuracy": 0.8024403932075127, "eval_loss": 0.7553383708000183, "eval_runtime": 70.9934, "eval_samples_per_second": 7102.956, "eval_steps_per_second": 27.749, "step": 545000 }, { "epoch": 2.7998851329176238, "grad_norm": 0.5881081819534302, "learning_rate": 0.0003521914998359042, "loss": 0.7488, "step": 546000 }, { "epoch": 2.805013127666557, "grad_norm": 0.5565335154533386, "learning_rate": 0.00035137102067607485, "loss": 0.7536, "step": 547000 }, { "epoch": 2.810141122415491, "grad_norm": 0.6405097842216492, "learning_rate": 0.00035055136199540533, "loss": 0.7549, "step": 548000 }, { "epoch": 2.815269117164424, "grad_norm": 0.6065213680267334, "learning_rate": 0.0003497317033147358, "loss": 0.7565, "step": 549000 }, { "epoch": 2.8203971119133575, "grad_norm": 0.6667752861976624, "learning_rate": 0.0003489112241549065, "loss": 0.7523, "step": 550000 }, { "epoch": 2.8203971119133575, "eval_accuracy": 0.8027616541368294, "eval_loss": 0.7530876994132996, "eval_runtime": 71.1414, "eval_samples_per_second": 7088.176, "eval_steps_per_second": 27.691, "step": 550000 }, { "epoch": 2.825525106662291, "grad_norm": 0.6081520915031433, "learning_rate": 0.00034809074499507714, "loss": 0.7503, "step": 551000 }, { "epoch": 2.830653101411224, "grad_norm": 0.5587588548660278, "learning_rate": 0.0003472702658352478, "loss": 0.7511, "step": 552000 }, { "epoch": 2.8357810961601575, "grad_norm": 0.5523270964622498, "learning_rate": 0.00034644978667541847, "loss": 0.7577, "step": 553000 }, { "epoch": 2.840909090909091, "grad_norm": 0.5792860388755798, "learning_rate": 0.00034563012799474895, "loss": 0.7532, "step": 554000 }, { "epoch": 2.846037085658024, "grad_norm": 0.5903825163841248, "learning_rate": 0.0003448096488349196, "loss": 0.756, "step": 555000 }, { "epoch": 2.846037085658024, "eval_accuracy": 0.8034934151424951, "eval_loss": 0.7511287331581116, "eval_runtime": 70.8662, "eval_samples_per_second": 7115.706, "eval_steps_per_second": 27.799, "step": 555000 }, { "epoch": 2.8511650804069575, "grad_norm": 0.47514578700065613, "learning_rate": 0.0003439899901542501, "loss": 0.7546, "step": 556000 }, { "epoch": 2.8562930751558913, "grad_norm": 0.555206835269928, "learning_rate": 0.00034316951099442076, "loss": 0.7583, "step": 557000 }, { "epoch": 2.861421069904824, "grad_norm": 0.5832545757293701, "learning_rate": 0.0003423490318345914, "loss": 0.7492, "step": 558000 }, { "epoch": 2.866549064653758, "grad_norm": 0.5508819818496704, "learning_rate": 0.0003415285526747621, "loss": 0.7503, "step": 559000 }, { "epoch": 2.8716770594026912, "grad_norm": 0.5555607676506042, "learning_rate": 0.0003407097144732524, "loss": 0.7559, "step": 560000 }, { "epoch": 2.8716770594026912, "eval_accuracy": 0.8038345069933744, "eval_loss": 0.7500419020652771, "eval_runtime": 70.485, "eval_samples_per_second": 7154.192, "eval_steps_per_second": 27.949, "step": 560000 }, { "epoch": 2.8768050541516246, "grad_norm": 0.5687641501426697, "learning_rate": 0.00033988923531342306, "loss": 0.7513, "step": 561000 }, { "epoch": 2.881933048900558, "grad_norm": 0.5863436460494995, "learning_rate": 0.0003390687561535937, "loss": 0.7525, "step": 562000 }, { "epoch": 2.8870610436494912, "grad_norm": 0.5974256992340088, "learning_rate": 0.0003382482769937644, "loss": 0.7582, "step": 563000 }, { "epoch": 2.8921890383984246, "grad_norm": 0.6164770722389221, "learning_rate": 0.0003374294387922547, "loss": 0.7473, "step": 564000 }, { "epoch": 2.897317033147358, "grad_norm": 0.5947271585464478, "learning_rate": 0.00033660895963242535, "loss": 0.75, "step": 565000 }, { "epoch": 2.897317033147358, "eval_accuracy": 0.8038226084404368, "eval_loss": 0.7494056820869446, "eval_runtime": 70.4752, "eval_samples_per_second": 7155.186, "eval_steps_per_second": 27.953, "step": 565000 }, { "epoch": 2.9024450278962917, "grad_norm": 0.6441121101379395, "learning_rate": 0.000335788480472596, "loss": 0.7547, "step": 566000 }, { "epoch": 2.9075730226452245, "grad_norm": 0.6216950416564941, "learning_rate": 0.0003349680013127667, "loss": 0.7508, "step": 567000 }, { "epoch": 2.9127010173941583, "grad_norm": 0.5308820009231567, "learning_rate": 0.00033414834263209716, "loss": 0.7522, "step": 568000 }, { "epoch": 2.9178290121430916, "grad_norm": 0.6509214043617249, "learning_rate": 0.0003333278634722678, "loss": 0.7506, "step": 569000 }, { "epoch": 2.922957006892025, "grad_norm": 0.513090193271637, "learning_rate": 0.0003325073843124385, "loss": 0.7492, "step": 570000 }, { "epoch": 2.922957006892025, "eval_accuracy": 0.8034795334974012, "eval_loss": 0.7511353492736816, "eval_runtime": 70.7785, "eval_samples_per_second": 7124.523, "eval_steps_per_second": 27.833, "step": 570000 }, { "epoch": 2.9280850016409583, "grad_norm": 0.5987011194229126, "learning_rate": 0.00033168772563176897, "loss": 0.7439, "step": 571000 }, { "epoch": 2.9332129963898916, "grad_norm": 0.49355295300483704, "learning_rate": 0.00033086806695109945, "loss": 0.7464, "step": 572000 }, { "epoch": 2.938340991138825, "grad_norm": 0.6383023262023926, "learning_rate": 0.0003300475877912701, "loss": 0.7496, "step": 573000 }, { "epoch": 2.9434689858877583, "grad_norm": 0.5184949636459351, "learning_rate": 0.0003292271086314408, "loss": 0.7502, "step": 574000 }, { "epoch": 2.948596980636692, "grad_norm": 0.6014882922172546, "learning_rate": 0.00032840662947161145, "loss": 0.7481, "step": 575000 }, { "epoch": 2.948596980636692, "eval_accuracy": 0.8043858066128191, "eval_loss": 0.7470995783805847, "eval_runtime": 70.4753, "eval_samples_per_second": 7155.175, "eval_steps_per_second": 27.953, "step": 575000 }, { "epoch": 2.953724975385625, "grad_norm": 0.5628450512886047, "learning_rate": 0.00032758697079094193, "loss": 0.7464, "step": 576000 }, { "epoch": 2.9588529701345587, "grad_norm": 0.5449588894844055, "learning_rate": 0.0003267664916311126, "loss": 0.7455, "step": 577000 }, { "epoch": 2.963980964883492, "grad_norm": 0.565690815448761, "learning_rate": 0.00032594601247128326, "loss": 0.7503, "step": 578000 }, { "epoch": 2.9691089596324254, "grad_norm": 0.574501633644104, "learning_rate": 0.00032512717426977356, "loss": 0.7432, "step": 579000 }, { "epoch": 2.9742369543813587, "grad_norm": 0.578654944896698, "learning_rate": 0.0003243066951099442, "loss": 0.751, "step": 580000 }, { "epoch": 2.9742369543813587, "eval_accuracy": 0.804334246216756, "eval_loss": 0.7478451132774353, "eval_runtime": 70.7554, "eval_samples_per_second": 7126.846, "eval_steps_per_second": 27.842, "step": 580000 }, { "epoch": 2.979364949130292, "grad_norm": 0.5930513739585876, "learning_rate": 0.0003234862159501149, "loss": 0.7532, "step": 581000 }, { "epoch": 2.9844929438792254, "grad_norm": 0.5324290990829468, "learning_rate": 0.00032266573679028555, "loss": 0.7458, "step": 582000 }, { "epoch": 2.9896209386281587, "grad_norm": 0.5386386513710022, "learning_rate": 0.00032184607810961603, "loss": 0.7562, "step": 583000 }, { "epoch": 2.9947489333770925, "grad_norm": 0.49079427123069763, "learning_rate": 0.0003210255989497867, "loss": 0.7427, "step": 584000 }, { "epoch": 2.9998769281260254, "grad_norm": 0.5127983093261719, "learning_rate": 0.0003202051197899573, "loss": 0.7545, "step": 585000 }, { "epoch": 2.9998769281260254, "eval_accuracy": 0.8019426370762875, "eval_loss": 0.7594846487045288, "eval_runtime": 71.1196, "eval_samples_per_second": 7090.354, "eval_steps_per_second": 27.7, "step": 585000 }, { "epoch": 3.005004922874959, "grad_norm": 0.62957364320755, "learning_rate": 0.00031938628158844766, "loss": 0.7319, "step": 586000 }, { "epoch": 3.0101329176238925, "grad_norm": 0.6073950529098511, "learning_rate": 0.0003185658024286183, "loss": 0.7285, "step": 587000 }, { "epoch": 3.015260912372826, "grad_norm": 0.6438223719596863, "learning_rate": 0.000317745323268789, "loss": 0.7286, "step": 588000 }, { "epoch": 3.020388907121759, "grad_norm": 0.49073129892349243, "learning_rate": 0.00031692484410895965, "loss": 0.7329, "step": 589000 }, { "epoch": 3.0255169018706924, "grad_norm": 0.6797950267791748, "learning_rate": 0.00031610600590744996, "loss": 0.7299, "step": 590000 }, { "epoch": 3.0255169018706924, "eval_accuracy": 0.8041597341070037, "eval_loss": 0.7478121519088745, "eval_runtime": 70.5877, "eval_samples_per_second": 7143.782, "eval_steps_per_second": 27.909, "step": 590000 }, { "epoch": 3.0306448966196258, "grad_norm": 0.6050714254379272, "learning_rate": 0.0003152855267476206, "loss": 0.7359, "step": 591000 }, { "epoch": 3.035772891368559, "grad_norm": 0.5688833594322205, "learning_rate": 0.0003144650475877913, "loss": 0.7322, "step": 592000 }, { "epoch": 3.0409008861174924, "grad_norm": 0.6677442193031311, "learning_rate": 0.00031364456842796195, "loss": 0.73, "step": 593000 }, { "epoch": 3.046028880866426, "grad_norm": 0.5591084361076355, "learning_rate": 0.00031282490974729243, "loss": 0.7333, "step": 594000 }, { "epoch": 3.0511568756153595, "grad_norm": 0.6151507496833801, "learning_rate": 0.0003120044305874631, "loss": 0.7305, "step": 595000 }, { "epoch": 3.0511568756153595, "eval_accuracy": 0.804687236620573, "eval_loss": 0.7486827373504639, "eval_runtime": 70.8166, "eval_samples_per_second": 7120.686, "eval_steps_per_second": 27.818, "step": 595000 }, { "epoch": 3.056284870364293, "grad_norm": 0.6491912007331848, "learning_rate": 0.00031118395142763376, "loss": 0.734, "step": 596000 }, { "epoch": 3.061412865113226, "grad_norm": 0.5502904653549194, "learning_rate": 0.00031036429274696424, "loss": 0.7316, "step": 597000 }, { "epoch": 3.0665408598621595, "grad_norm": 0.5560014247894287, "learning_rate": 0.0003095438135871349, "loss": 0.7324, "step": 598000 }, { "epoch": 3.071668854611093, "grad_norm": 0.6535289883613586, "learning_rate": 0.0003087241549064654, "loss": 0.7342, "step": 599000 }, { "epoch": 3.076796849360026, "grad_norm": 0.5713477730751038, "learning_rate": 0.00030790367574663605, "loss": 0.7343, "step": 600000 }, { "epoch": 3.076796849360026, "eval_accuracy": 0.8047467293852613, "eval_loss": 0.7465729713439941, "eval_runtime": 70.5289, "eval_samples_per_second": 7149.74, "eval_steps_per_second": 27.932, "step": 600000 }, { "epoch": 3.0819248441089595, "grad_norm": 0.5471197366714478, "learning_rate": 0.00030708401706596654, "loss": 0.7319, "step": 601000 }, { "epoch": 3.087052838857893, "grad_norm": 0.6190933585166931, "learning_rate": 0.000306264358385297, "loss": 0.7287, "step": 602000 }, { "epoch": 3.092180833606826, "grad_norm": 0.6250334978103638, "learning_rate": 0.0003054438792254677, "loss": 0.7344, "step": 603000 }, { "epoch": 3.09730882835576, "grad_norm": 0.6196159720420837, "learning_rate": 0.00030462340006563835, "loss": 0.7326, "step": 604000 }, { "epoch": 3.1024368231046933, "grad_norm": 0.539490818977356, "learning_rate": 0.000303802920905809, "loss": 0.731, "step": 605000 }, { "epoch": 3.1024368231046933, "eval_accuracy": 0.8044611641147575, "eval_loss": 0.7472212314605713, "eval_runtime": 70.6792, "eval_samples_per_second": 7134.536, "eval_steps_per_second": 27.872, "step": 605000 }, { "epoch": 3.1075648178536266, "grad_norm": 0.7250561714172363, "learning_rate": 0.0003029824417459797, "loss": 0.7392, "step": 606000 }, { "epoch": 3.11269281260256, "grad_norm": 0.6296505331993103, "learning_rate": 0.00030216278306531016, "loss": 0.7361, "step": 607000 }, { "epoch": 3.1178208073514933, "grad_norm": 0.5384014844894409, "learning_rate": 0.0003013423039054808, "loss": 0.7315, "step": 608000 }, { "epoch": 3.1229488021004266, "grad_norm": 0.6182312369346619, "learning_rate": 0.0003005234657039711, "loss": 0.733, "step": 609000 }, { "epoch": 3.12807679684936, "grad_norm": 0.6149222254753113, "learning_rate": 0.0002997029865441418, "loss": 0.733, "step": 610000 }, { "epoch": 3.12807679684936, "eval_accuracy": 0.8045979974735405, "eval_loss": 0.7459588050842285, "eval_runtime": 70.7817, "eval_samples_per_second": 7124.2, "eval_steps_per_second": 27.832, "step": 610000 }, { "epoch": 3.1332047915982932, "grad_norm": 0.5092925429344177, "learning_rate": 0.00029888250738431245, "loss": 0.7366, "step": 611000 }, { "epoch": 3.1383327863472266, "grad_norm": 0.5847880840301514, "learning_rate": 0.0002980620282244831, "loss": 0.7341, "step": 612000 }, { "epoch": 3.1434607810961603, "grad_norm": 0.5842584371566772, "learning_rate": 0.0002972415490646538, "loss": 0.7385, "step": 613000 }, { "epoch": 3.1485887758450937, "grad_norm": 0.6642169952392578, "learning_rate": 0.00029642106990482444, "loss": 0.7367, "step": 614000 }, { "epoch": 3.153716770594027, "grad_norm": 0.5550722479820251, "learning_rate": 0.0002956014112241549, "loss": 0.7351, "step": 615000 }, { "epoch": 3.153716770594027, "eval_accuracy": 0.8043302800324433, "eval_loss": 0.7485695481300354, "eval_runtime": 71.2762, "eval_samples_per_second": 7074.772, "eval_steps_per_second": 27.639, "step": 615000 }, { "epoch": 3.1588447653429603, "grad_norm": 0.5302006602287292, "learning_rate": 0.0002947809320643256, "loss": 0.7338, "step": 616000 }, { "epoch": 3.1639727600918937, "grad_norm": 0.5731785893440247, "learning_rate": 0.00029396127338365607, "loss": 0.738, "step": 617000 }, { "epoch": 3.169100754840827, "grad_norm": 0.5970706939697266, "learning_rate": 0.00029314079422382674, "loss": 0.7352, "step": 618000 }, { "epoch": 3.1742287495897603, "grad_norm": 0.621990442276001, "learning_rate": 0.0002923211355431572, "loss": 0.7367, "step": 619000 }, { "epoch": 3.1793567443386936, "grad_norm": 0.6552351713180542, "learning_rate": 0.0002915006563833279, "loss": 0.7372, "step": 620000 }, { "epoch": 3.1793567443386936, "eval_accuracy": 0.8051572294616103, "eval_loss": 0.7445986270904541, "eval_runtime": 70.1908, "eval_samples_per_second": 7184.172, "eval_steps_per_second": 28.066, "step": 620000 }, { "epoch": 3.184484739087627, "grad_norm": 0.6487659215927124, "learning_rate": 0.0002906801772234985, "loss": 0.7294, "step": 621000 }, { "epoch": 3.1896127338365607, "grad_norm": 0.5325456261634827, "learning_rate": 0.00028986051854282903, "loss": 0.733, "step": 622000 }, { "epoch": 3.194740728585494, "grad_norm": 0.5794259309768677, "learning_rate": 0.0002890400393829997, "loss": 0.7311, "step": 623000 }, { "epoch": 3.1998687233344274, "grad_norm": 0.6754854321479797, "learning_rate": 0.0002882203807023302, "loss": 0.7314, "step": 624000 }, { "epoch": 3.2049967180833607, "grad_norm": 0.5375105142593384, "learning_rate": 0.00028739990154250084, "loss": 0.7299, "step": 625000 }, { "epoch": 3.2049967180833607, "eval_accuracy": 0.8044809950363203, "eval_loss": 0.7478250861167908, "eval_runtime": 72.0089, "eval_samples_per_second": 7002.79, "eval_steps_per_second": 27.358, "step": 625000 }, { "epoch": 3.210124712832294, "grad_norm": 0.6171442866325378, "learning_rate": 0.0002865802428618313, "loss": 0.7324, "step": 626000 }, { "epoch": 3.2152527075812274, "grad_norm": 0.7359910011291504, "learning_rate": 0.000285759763702002, "loss": 0.7356, "step": 627000 }, { "epoch": 3.2203807023301607, "grad_norm": 0.6429058909416199, "learning_rate": 0.00028494010502133247, "loss": 0.7311, "step": 628000 }, { "epoch": 3.225508697079094, "grad_norm": 0.4959796667098999, "learning_rate": 0.00028411962586150313, "loss": 0.7338, "step": 629000 }, { "epoch": 3.2306366918280274, "grad_norm": 0.5685999393463135, "learning_rate": 0.0002832991467016738, "loss": 0.7351, "step": 630000 }, { "epoch": 3.2306366918280274, "eval_accuracy": 0.804709050634292, "eval_loss": 0.7458400726318359, "eval_runtime": 71.0043, "eval_samples_per_second": 7101.866, "eval_steps_per_second": 27.745, "step": 630000 }, { "epoch": 3.235764686576961, "grad_norm": 0.5883785486221313, "learning_rate": 0.0002824794880210043, "loss": 0.7264, "step": 631000 }, { "epoch": 3.2408926813258945, "grad_norm": 0.5906899571418762, "learning_rate": 0.00028165900886117494, "loss": 0.7372, "step": 632000 }, { "epoch": 3.246020676074828, "grad_norm": 0.5407042503356934, "learning_rate": 0.00028083935018050543, "loss": 0.7344, "step": 633000 }, { "epoch": 3.251148670823761, "grad_norm": 0.5565351247787476, "learning_rate": 0.0002800196914998359, "loss": 0.7304, "step": 634000 }, { "epoch": 3.2562766655726945, "grad_norm": 0.7526431679725647, "learning_rate": 0.0002791992123400066, "loss": 0.7304, "step": 635000 }, { "epoch": 3.2562766655726945, "eval_accuracy": 0.8049271907714823, "eval_loss": 0.745959997177124, "eval_runtime": 72.0556, "eval_samples_per_second": 6998.247, "eval_steps_per_second": 27.34, "step": 635000 }, { "epoch": 3.261404660321628, "grad_norm": 0.7256116271018982, "learning_rate": 0.00027837955365933706, "loss": 0.7332, "step": 636000 }, { "epoch": 3.266532655070561, "grad_norm": 0.6029453277587891, "learning_rate": 0.0002775590744995077, "loss": 0.7307, "step": 637000 }, { "epoch": 3.2716606498194944, "grad_norm": 0.5765652656555176, "learning_rate": 0.0002767385953396784, "loss": 0.7335, "step": 638000 }, { "epoch": 3.2767886445684278, "grad_norm": 0.6440179347991943, "learning_rate": 0.00027591811617984905, "loss": 0.7373, "step": 639000 }, { "epoch": 3.2819166393173616, "grad_norm": 0.5710249543190002, "learning_rate": 0.00027509845749917953, "loss": 0.7335, "step": 640000 }, { "epoch": 3.2819166393173616, "eval_accuracy": 0.8048914951126693, "eval_loss": 0.7450812458992004, "eval_runtime": 72.3432, "eval_samples_per_second": 6970.429, "eval_steps_per_second": 27.231, "step": 640000 }, { "epoch": 3.287044634066295, "grad_norm": 0.677749752998352, "learning_rate": 0.0002742779783393502, "loss": 0.7315, "step": 641000 }, { "epoch": 3.292172628815228, "grad_norm": 0.5461249351501465, "learning_rate": 0.00027345749917952086, "loss": 0.7372, "step": 642000 }, { "epoch": 3.2973006235641615, "grad_norm": 0.547609269618988, "learning_rate": 0.00027263702001969147, "loss": 0.7408, "step": 643000 }, { "epoch": 3.302428618313095, "grad_norm": 0.664138674736023, "learning_rate": 0.000271817361339022, "loss": 0.7349, "step": 644000 }, { "epoch": 3.307556613062028, "grad_norm": 0.5541239976882935, "learning_rate": 0.0002709977026583525, "loss": 0.7351, "step": 645000 }, { "epoch": 3.307556613062028, "eval_accuracy": 0.8057561232928055, "eval_loss": 0.741638720035553, "eval_runtime": 72.2419, "eval_samples_per_second": 6980.197, "eval_steps_per_second": 27.269, "step": 645000 }, { "epoch": 3.3126846078109615, "grad_norm": 0.598354160785675, "learning_rate": 0.00027017722349852315, "loss": 0.7366, "step": 646000 }, { "epoch": 3.317812602559895, "grad_norm": 0.6116665005683899, "learning_rate": 0.0002693567443386938, "loss": 0.737, "step": 647000 }, { "epoch": 3.322940597308828, "grad_norm": 0.5967441201210022, "learning_rate": 0.0002685370856580243, "loss": 0.7286, "step": 648000 }, { "epoch": 3.328068592057762, "grad_norm": 0.5829706788063049, "learning_rate": 0.00026771660649819496, "loss": 0.737, "step": 649000 }, { "epoch": 3.3331965868066953, "grad_norm": 0.5304847955703735, "learning_rate": 0.00026689694781752545, "loss": 0.7324, "step": 650000 }, { "epoch": 3.3331965868066953, "eval_accuracy": 0.805809666781025, "eval_loss": 0.741962730884552, "eval_runtime": 72.4277, "eval_samples_per_second": 6962.292, "eval_steps_per_second": 27.2, "step": 650000 }, { "epoch": 3.3383245815556286, "grad_norm": 0.5310320854187012, "learning_rate": 0.0002660764686576961, "loss": 0.7363, "step": 651000 }, { "epoch": 3.343452576304562, "grad_norm": 0.6127032041549683, "learning_rate": 0.0002652559894978668, "loss": 0.7302, "step": 652000 }, { "epoch": 3.3485805710534953, "grad_norm": 0.4821667969226837, "learning_rate": 0.00026443633081719726, "loss": 0.728, "step": 653000 }, { "epoch": 3.3537085658024286, "grad_norm": 0.6435668468475342, "learning_rate": 0.0002636158516573679, "loss": 0.7299, "step": 654000 }, { "epoch": 3.358836560551362, "grad_norm": 0.5996416807174683, "learning_rate": 0.0002627961929766984, "loss": 0.732, "step": 655000 }, { "epoch": 3.358836560551362, "eval_accuracy": 0.8056668841457731, "eval_loss": 0.7425902485847473, "eval_runtime": 70.346, "eval_samples_per_second": 7168.33, "eval_steps_per_second": 28.004, "step": 655000 }, { "epoch": 3.3639645553002953, "grad_norm": 0.551065981388092, "learning_rate": 0.00026197571381686907, "loss": 0.7361, "step": 656000 }, { "epoch": 3.3690925500492286, "grad_norm": 0.6245192289352417, "learning_rate": 0.0002611552346570397, "loss": 0.7265, "step": 657000 }, { "epoch": 3.3742205447981624, "grad_norm": 0.5377309322357178, "learning_rate": 0.0002603347554972104, "loss": 0.7293, "step": 658000 }, { "epoch": 3.3793485395470957, "grad_norm": 0.5521800518035889, "learning_rate": 0.0002595150968165409, "loss": 0.7327, "step": 659000 }, { "epoch": 3.384476534296029, "grad_norm": 0.6193340420722961, "learning_rate": 0.00025869461765671154, "loss": 0.7286, "step": 660000 }, { "epoch": 3.384476534296029, "eval_accuracy": 0.8062082683044364, "eval_loss": 0.7417632341384888, "eval_runtime": 70.6013, "eval_samples_per_second": 7142.403, "eval_steps_per_second": 27.903, "step": 660000 }, { "epoch": 3.3896045290449623, "grad_norm": 0.6531165242195129, "learning_rate": 0.00025787413849688215, "loss": 0.7326, "step": 661000 }, { "epoch": 3.3947325237938957, "grad_norm": 0.6123143434524536, "learning_rate": 0.0002570544798162127, "loss": 0.7299, "step": 662000 }, { "epoch": 3.399860518542829, "grad_norm": 0.5394929647445679, "learning_rate": 0.0002562348211355432, "loss": 0.7312, "step": 663000 }, { "epoch": 3.4049885132917623, "grad_norm": 0.6724646687507629, "learning_rate": 0.00025541434197571384, "loss": 0.7347, "step": 664000 }, { "epoch": 3.4101165080406957, "grad_norm": 0.593758225440979, "learning_rate": 0.0002545938628158845, "loss": 0.7331, "step": 665000 }, { "epoch": 3.4101165080406957, "eval_accuracy": 0.8058909735594323, "eval_loss": 0.7419803738594055, "eval_runtime": 71.0621, "eval_samples_per_second": 7096.091, "eval_steps_per_second": 27.722, "step": 665000 }, { "epoch": 3.415244502789629, "grad_norm": 0.5918824672698975, "learning_rate": 0.00025377338365605517, "loss": 0.733, "step": 666000 }, { "epoch": 3.4203724975385628, "grad_norm": 0.5340002179145813, "learning_rate": 0.00025295290449622583, "loss": 0.7347, "step": 667000 }, { "epoch": 3.425500492287496, "grad_norm": 0.6719947457313538, "learning_rate": 0.0002521332458155563, "loss": 0.7325, "step": 668000 }, { "epoch": 3.4306284870364294, "grad_norm": 0.6139300465583801, "learning_rate": 0.0002513127666557269, "loss": 0.7324, "step": 669000 }, { "epoch": 3.4357564817853627, "grad_norm": 0.561059296131134, "learning_rate": 0.0002504931079750574, "loss": 0.729, "step": 670000 }, { "epoch": 3.4357564817853627, "eval_accuracy": 0.806450205547502, "eval_loss": 0.7402191758155823, "eval_runtime": 70.2373, "eval_samples_per_second": 7179.42, "eval_steps_per_second": 28.048, "step": 670000 }, { "epoch": 3.440884476534296, "grad_norm": 0.6199759244918823, "learning_rate": 0.0002496726288152281, "loss": 0.728, "step": 671000 }, { "epoch": 3.4460124712832294, "grad_norm": 0.5123302936553955, "learning_rate": 0.0002488529701345586, "loss": 0.7254, "step": 672000 }, { "epoch": 3.4511404660321627, "grad_norm": 0.5622628927230835, "learning_rate": 0.00024803249097472927, "loss": 0.7296, "step": 673000 }, { "epoch": 3.456268460781096, "grad_norm": 0.5848363041877747, "learning_rate": 0.00024721283229405975, "loss": 0.7318, "step": 674000 }, { "epoch": 3.4613964555300294, "grad_norm": 0.6006805896759033, "learning_rate": 0.0002463923531342304, "loss": 0.7336, "step": 675000 }, { "epoch": 3.4613964555300294, "eval_accuracy": 0.8062935412671562, "eval_loss": 0.7409020066261292, "eval_runtime": 70.322, "eval_samples_per_second": 7170.769, "eval_steps_per_second": 28.014, "step": 675000 }, { "epoch": 3.4665244502789627, "grad_norm": 0.4952269494533539, "learning_rate": 0.0002455726944535609, "loss": 0.7329, "step": 676000 }, { "epoch": 3.471652445027896, "grad_norm": 0.6394836902618408, "learning_rate": 0.00024475221529373156, "loss": 0.7323, "step": 677000 }, { "epoch": 3.47678043977683, "grad_norm": 0.5517870187759399, "learning_rate": 0.0002439317361339022, "loss": 0.7296, "step": 678000 }, { "epoch": 3.481908434525763, "grad_norm": 0.5815935730934143, "learning_rate": 0.00024311207745323268, "loss": 0.7268, "step": 679000 }, { "epoch": 3.4870364292746965, "grad_norm": 0.5354881286621094, "learning_rate": 0.00024229159829340335, "loss": 0.7275, "step": 680000 }, { "epoch": 3.4870364292746965, "eval_accuracy": 0.8063907127828137, "eval_loss": 0.7398457527160645, "eval_runtime": 71.0804, "eval_samples_per_second": 7094.266, "eval_steps_per_second": 27.715, "step": 680000 }, { "epoch": 3.49216442402363, "grad_norm": 0.6503774523735046, "learning_rate": 0.00024147193961273386, "loss": 0.7311, "step": 681000 }, { "epoch": 3.497292418772563, "grad_norm": 0.6554757356643677, "learning_rate": 0.00024065146045290452, "loss": 0.7304, "step": 682000 }, { "epoch": 3.5024204135214965, "grad_norm": 0.5777162313461304, "learning_rate": 0.00023983262225139482, "loss": 0.7309, "step": 683000 }, { "epoch": 3.50754840827043, "grad_norm": 0.6095039248466492, "learning_rate": 0.0002390121430915655, "loss": 0.7334, "step": 684000 }, { "epoch": 3.5126764030193636, "grad_norm": 0.6406295895576477, "learning_rate": 0.00023819166393173612, "loss": 0.7298, "step": 685000 }, { "epoch": 3.5126764030193636, "eval_accuracy": 0.8068706210846325, "eval_loss": 0.7388338446617126, "eval_runtime": 70.8653, "eval_samples_per_second": 7115.8, "eval_steps_per_second": 27.799, "step": 685000 }, { "epoch": 3.5178043977682965, "grad_norm": 0.59187912940979, "learning_rate": 0.0002373720052510666, "loss": 0.7317, "step": 686000 }, { "epoch": 3.5229323925172302, "grad_norm": 0.6433655023574829, "learning_rate": 0.0002365515260912373, "loss": 0.7306, "step": 687000 }, { "epoch": 3.5280603872661636, "grad_norm": 0.6265184283256531, "learning_rate": 0.00023573186741056778, "loss": 0.7257, "step": 688000 }, { "epoch": 3.533188382015097, "grad_norm": 0.5827895998954773, "learning_rate": 0.00023491138825073847, "loss": 0.736, "step": 689000 }, { "epoch": 3.53831637676403, "grad_norm": 0.6121138334274292, "learning_rate": 0.00023409090909090908, "loss": 0.724, "step": 690000 }, { "epoch": 3.53831637676403, "eval_accuracy": 0.8070411670100721, "eval_loss": 0.7364519834518433, "eval_runtime": 70.9665, "eval_samples_per_second": 7105.647, "eval_steps_per_second": 27.76, "step": 690000 }, { "epoch": 3.5434443715129635, "grad_norm": 0.6549979448318481, "learning_rate": 0.00023327042993107977, "loss": 0.7346, "step": 691000 }, { "epoch": 3.548572366261897, "grad_norm": 0.6074033975601196, "learning_rate": 0.0002324499507712504, "loss": 0.7272, "step": 692000 }, { "epoch": 3.55370036101083, "grad_norm": 0.6036473512649536, "learning_rate": 0.0002316302920905809, "loss": 0.7302, "step": 693000 }, { "epoch": 3.558828355759764, "grad_norm": 0.6517288684844971, "learning_rate": 0.00023081063340991138, "loss": 0.7329, "step": 694000 }, { "epoch": 3.563956350508697, "grad_norm": 0.6263485550880432, "learning_rate": 0.00022999015425008207, "loss": 0.7266, "step": 695000 }, { "epoch": 3.563956350508697, "eval_accuracy": 0.8072474085943248, "eval_loss": 0.7372848987579346, "eval_runtime": 70.6808, "eval_samples_per_second": 7134.373, "eval_steps_per_second": 27.872, "step": 695000 }, { "epoch": 3.5690843452576306, "grad_norm": 0.5630945563316345, "learning_rate": 0.00022917049556941255, "loss": 0.7274, "step": 696000 }, { "epoch": 3.574212340006564, "grad_norm": 0.579341471195221, "learning_rate": 0.0002283500164095832, "loss": 0.73, "step": 697000 }, { "epoch": 3.5793403347554973, "grad_norm": 0.589762806892395, "learning_rate": 0.00022752953724975385, "loss": 0.7255, "step": 698000 }, { "epoch": 3.5844683295044306, "grad_norm": 0.6535184979438782, "learning_rate": 0.00022670987856908433, "loss": 0.728, "step": 699000 }, { "epoch": 3.589596324253364, "grad_norm": 0.5404123067855835, "learning_rate": 0.00022588939940925502, "loss": 0.7282, "step": 700000 }, { "epoch": 3.589596324253364, "eval_accuracy": 0.8074278699805458, "eval_loss": 0.7370765209197998, "eval_runtime": 70.0132, "eval_samples_per_second": 7202.399, "eval_steps_per_second": 28.138, "step": 700000 }, { "epoch": 3.5947243190022973, "grad_norm": 0.5447523593902588, "learning_rate": 0.0002250689202494257, "loss": 0.7354, "step": 701000 }, { "epoch": 3.5998523137512306, "grad_norm": 0.5586708784103394, "learning_rate": 0.00022424844108959632, "loss": 0.7278, "step": 702000 }, { "epoch": 3.604980308500164, "grad_norm": 0.6701763272285461, "learning_rate": 0.0002234287824089268, "loss": 0.7297, "step": 703000 }, { "epoch": 3.6101083032490973, "grad_norm": 0.6514126062393188, "learning_rate": 0.00022260912372825732, "loss": 0.7244, "step": 704000 }, { "epoch": 3.615236297998031, "grad_norm": 0.662257969379425, "learning_rate": 0.00022178864456842798, "loss": 0.7272, "step": 705000 }, { "epoch": 3.615236297998031, "eval_accuracy": 0.8073326815570446, "eval_loss": 0.7360438108444214, "eval_runtime": 70.837, "eval_samples_per_second": 7118.642, "eval_steps_per_second": 27.81, "step": 705000 }, { "epoch": 3.6203642927469644, "grad_norm": 0.5558410882949829, "learning_rate": 0.00022096816540859862, "loss": 0.7297, "step": 706000 }, { "epoch": 3.6254922874958977, "grad_norm": 0.5577208995819092, "learning_rate": 0.00022014768624876928, "loss": 0.7301, "step": 707000 }, { "epoch": 3.630620282244831, "grad_norm": 0.5822728276252747, "learning_rate": 0.0002193280275680998, "loss": 0.7295, "step": 708000 }, { "epoch": 3.6357482769937643, "grad_norm": 0.5762273669242859, "learning_rate": 0.00021850836888743028, "loss": 0.723, "step": 709000 }, { "epoch": 3.6408762717426977, "grad_norm": 0.6955676674842834, "learning_rate": 0.00021768788972760094, "loss": 0.7227, "step": 710000 }, { "epoch": 3.6408762717426977, "eval_accuracy": 0.8072176622119807, "eval_loss": 0.7360370755195618, "eval_runtime": 70.929, "eval_samples_per_second": 7109.408, "eval_steps_per_second": 27.774, "step": 710000 }, { "epoch": 3.646004266491631, "grad_norm": 0.5888379216194153, "learning_rate": 0.00021686741056777158, "loss": 0.7304, "step": 711000 }, { "epoch": 3.6511322612405643, "grad_norm": 0.694804310798645, "learning_rate": 0.00021604693140794227, "loss": 0.7247, "step": 712000 }, { "epoch": 3.6562602559894977, "grad_norm": 0.6396298408508301, "learning_rate": 0.00021522809320643257, "loss": 0.7273, "step": 713000 }, { "epoch": 3.6613882507384314, "grad_norm": 0.613314688205719, "learning_rate": 0.00021440761404660323, "loss": 0.7297, "step": 714000 }, { "epoch": 3.6665162454873648, "grad_norm": 0.6243338584899902, "learning_rate": 0.00021358713488677387, "loss": 0.7275, "step": 715000 }, { "epoch": 3.6665162454873648, "eval_accuracy": 0.8072751718845127, "eval_loss": 0.7357719540596008, "eval_runtime": 71.5286, "eval_samples_per_second": 7049.814, "eval_steps_per_second": 27.541, "step": 715000 }, { "epoch": 3.671644240236298, "grad_norm": 0.6075868010520935, "learning_rate": 0.00021276747620610435, "loss": 0.7252, "step": 716000 }, { "epoch": 3.6767722349852314, "grad_norm": 0.6390559077262878, "learning_rate": 0.00021194699704627504, "loss": 0.7288, "step": 717000 }, { "epoch": 3.6819002297341648, "grad_norm": 0.6235975623130798, "learning_rate": 0.0002111265178864457, "loss": 0.732, "step": 718000 }, { "epoch": 3.687028224483098, "grad_norm": 0.5627439618110657, "learning_rate": 0.0002103068592057762, "loss": 0.726, "step": 719000 }, { "epoch": 3.6921562192320314, "grad_norm": 0.6966290473937988, "learning_rate": 0.00020948638004594683, "loss": 0.7299, "step": 720000 }, { "epoch": 3.6921562192320314, "eval_accuracy": 0.8062756934377497, "eval_loss": 0.7421655058860779, "eval_runtime": 70.8745, "eval_samples_per_second": 7114.872, "eval_steps_per_second": 27.796, "step": 720000 }, { "epoch": 3.6972842139809647, "grad_norm": 0.594695508480072, "learning_rate": 0.00020866590088611752, "loss": 0.7184, "step": 721000 }, { "epoch": 3.702412208729898, "grad_norm": 0.5712147951126099, "learning_rate": 0.00020784542172628818, "loss": 0.7256, "step": 722000 }, { "epoch": 3.707540203478832, "grad_norm": 0.5083569884300232, "learning_rate": 0.00020702658352477848, "loss": 0.7343, "step": 723000 }, { "epoch": 3.712668198227765, "grad_norm": 0.5372103452682495, "learning_rate": 0.00020620610436494915, "loss": 0.7324, "step": 724000 }, { "epoch": 3.7177961929766985, "grad_norm": 0.618584156036377, "learning_rate": 0.00020538562520511978, "loss": 0.7363, "step": 725000 }, { "epoch": 3.7177961929766985, "eval_accuracy": 0.8072355100413872, "eval_loss": 0.7361379265785217, "eval_runtime": 71.1457, "eval_samples_per_second": 7087.746, "eval_steps_per_second": 27.69, "step": 725000 }, { "epoch": 3.722924187725632, "grad_norm": 0.5576392412185669, "learning_rate": 0.00020456514604529048, "loss": 0.7306, "step": 726000 }, { "epoch": 3.728052182474565, "grad_norm": 0.5453084707260132, "learning_rate": 0.0002037446668854611, "loss": 0.7247, "step": 727000 }, { "epoch": 3.7331801772234985, "grad_norm": 0.6769667267799377, "learning_rate": 0.0002029250082047916, "loss": 0.729, "step": 728000 }, { "epoch": 3.738308171972432, "grad_norm": 0.5298287272453308, "learning_rate": 0.00020210452904496226, "loss": 0.7256, "step": 729000 }, { "epoch": 3.743436166721365, "grad_norm": 0.7398955225944519, "learning_rate": 0.00020128487036429274, "loss": 0.7274, "step": 730000 }, { "epoch": 3.743436166721365, "eval_accuracy": 0.808213174474431, "eval_loss": 0.7333736419677734, "eval_runtime": 71.0217, "eval_samples_per_second": 7100.124, "eval_steps_per_second": 27.738, "step": 730000 }, { "epoch": 3.7485641614702985, "grad_norm": 0.618033230304718, "learning_rate": 0.00020046439120446343, "loss": 0.7314, "step": 731000 }, { "epoch": 3.7536921562192322, "grad_norm": 0.6644644737243652, "learning_rate": 0.0001996447325237939, "loss": 0.727, "step": 732000 }, { "epoch": 3.7588201509681656, "grad_norm": 0.5424043536186218, "learning_rate": 0.00019882425336396458, "loss": 0.7295, "step": 733000 }, { "epoch": 3.763948145717099, "grad_norm": 0.6209110021591187, "learning_rate": 0.00019800377420413522, "loss": 0.7296, "step": 734000 }, { "epoch": 3.7690761404660322, "grad_norm": 0.6663910746574402, "learning_rate": 0.00019718329504430588, "loss": 0.7282, "step": 735000 }, { "epoch": 3.7690761404660322, "eval_accuracy": 0.8080525440097727, "eval_loss": 0.7346888780593872, "eval_runtime": 70.7959, "eval_samples_per_second": 7122.769, "eval_steps_per_second": 27.826, "step": 735000 }, { "epoch": 3.7742041352149656, "grad_norm": 0.6030405759811401, "learning_rate": 0.00019636363636363636, "loss": 0.7222, "step": 736000 }, { "epoch": 3.779332129963899, "grad_norm": 0.6797240972518921, "learning_rate": 0.00019554397768296687, "loss": 0.7228, "step": 737000 }, { "epoch": 3.784460124712832, "grad_norm": 0.6230988502502441, "learning_rate": 0.0001947234985231375, "loss": 0.7243, "step": 738000 }, { "epoch": 3.7895881194617655, "grad_norm": 0.6362661123275757, "learning_rate": 0.000193903839842468, "loss": 0.723, "step": 739000 }, { "epoch": 3.794716114210699, "grad_norm": 0.5619367957115173, "learning_rate": 0.00019308336068263868, "loss": 0.7239, "step": 740000 }, { "epoch": 3.794716114210699, "eval_accuracy": 0.8085384015880602, "eval_loss": 0.7326257228851318, "eval_runtime": 70.693, "eval_samples_per_second": 7133.135, "eval_steps_per_second": 27.867, "step": 740000 }, { "epoch": 3.7998441089596326, "grad_norm": 0.5680134296417236, "learning_rate": 0.00019226288152280935, "loss": 0.7277, "step": 741000 }, { "epoch": 3.8049721037085655, "grad_norm": 0.7252689599990845, "learning_rate": 0.00019144240236297999, "loss": 0.726, "step": 742000 }, { "epoch": 3.8101000984574993, "grad_norm": 0.6489447355270386, "learning_rate": 0.00019062274368231047, "loss": 0.7239, "step": 743000 }, { "epoch": 3.8152280932064326, "grad_norm": 0.5661848187446594, "learning_rate": 0.00018980226452248113, "loss": 0.721, "step": 744000 }, { "epoch": 3.820356087955366, "grad_norm": 0.5854164958000183, "learning_rate": 0.0001889817853626518, "loss": 0.7225, "step": 745000 }, { "epoch": 3.820356087955366, "eval_accuracy": 0.8076400608412674, "eval_loss": 0.7351976633071899, "eval_runtime": 70.65, "eval_samples_per_second": 7137.478, "eval_steps_per_second": 27.884, "step": 745000 }, { "epoch": 3.8254840827042993, "grad_norm": 0.6415901184082031, "learning_rate": 0.00018816212668198228, "loss": 0.7245, "step": 746000 }, { "epoch": 3.8306120774532326, "grad_norm": 0.6576703190803528, "learning_rate": 0.00018734164752215294, "loss": 0.725, "step": 747000 }, { "epoch": 3.835740072202166, "grad_norm": 0.6380520462989807, "learning_rate": 0.00018652280932064325, "loss": 0.7286, "step": 748000 }, { "epoch": 3.8408680669510993, "grad_norm": 0.5243638753890991, "learning_rate": 0.00018570233016081394, "loss": 0.729, "step": 749000 }, { "epoch": 3.845996061700033, "grad_norm": 0.6276266574859619, "learning_rate": 0.00018488267148014442, "loss": 0.7242, "step": 750000 }, { "epoch": 3.845996061700033, "eval_accuracy": 0.8085840127076546, "eval_loss": 0.7319746613502502, "eval_runtime": 71.054, "eval_samples_per_second": 7096.896, "eval_steps_per_second": 27.725, "step": 750000 }, { "epoch": 3.851124056448966, "grad_norm": 0.5431926846504211, "learning_rate": 0.00018406219232031508, "loss": 0.7222, "step": 751000 }, { "epoch": 3.8562520511978997, "grad_norm": 0.6457306742668152, "learning_rate": 0.00018324171316048572, "loss": 0.729, "step": 752000 }, { "epoch": 3.861380045946833, "grad_norm": 0.6358340978622437, "learning_rate": 0.00018242205447981623, "loss": 0.7269, "step": 753000 }, { "epoch": 3.8665080406957664, "grad_norm": 0.6079466342926025, "learning_rate": 0.00018160157531998687, "loss": 0.7287, "step": 754000 }, { "epoch": 3.8716360354446997, "grad_norm": 0.5899778008460999, "learning_rate": 0.00018078109616015756, "loss": 0.7291, "step": 755000 }, { "epoch": 3.8716360354446997, "eval_accuracy": 0.8088874258075647, "eval_loss": 0.7317127585411072, "eval_runtime": 72.0436, "eval_samples_per_second": 6999.412, "eval_steps_per_second": 27.345, "step": 755000 }, { "epoch": 3.876764030193633, "grad_norm": 0.5655895471572876, "learning_rate": 0.0001799606170003282, "loss": 0.726, "step": 756000 }, { "epoch": 3.8818920249425664, "grad_norm": 0.6597123146057129, "learning_rate": 0.0001791409583196587, "loss": 0.7246, "step": 757000 }, { "epoch": 3.8870200196914997, "grad_norm": 0.5834771990776062, "learning_rate": 0.0001783212996389892, "loss": 0.7239, "step": 758000 }, { "epoch": 3.8921480144404335, "grad_norm": 0.5592017769813538, "learning_rate": 0.00017750082047915982, "loss": 0.725, "step": 759000 }, { "epoch": 3.8972760091893663, "grad_norm": 0.6665492057800293, "learning_rate": 0.0001766803413193305, "loss": 0.7292, "step": 760000 }, { "epoch": 3.8972760091893663, "eval_accuracy": 0.8086831673154683, "eval_loss": 0.730987548828125, "eval_runtime": 70.5662, "eval_samples_per_second": 7145.956, "eval_steps_per_second": 27.917, "step": 760000 }, { "epoch": 3.9024040039383, "grad_norm": 0.58745276927948, "learning_rate": 0.00017586068263866097, "loss": 0.7232, "step": 761000 }, { "epoch": 3.9075319986872334, "grad_norm": 0.6152140498161316, "learning_rate": 0.00017504020347883166, "loss": 0.7236, "step": 762000 }, { "epoch": 3.9126599934361668, "grad_norm": 0.535243570804596, "learning_rate": 0.0001742197243190023, "loss": 0.7238, "step": 763000 }, { "epoch": 3.9177879881851, "grad_norm": 0.6166626811027527, "learning_rate": 0.00017339924515917296, "loss": 0.724, "step": 764000 }, { "epoch": 3.9229159829340334, "grad_norm": 0.6726566553115845, "learning_rate": 0.00017257958647850345, "loss": 0.7247, "step": 765000 }, { "epoch": 3.9229159829340334, "eval_accuracy": 0.8082944812528383, "eval_loss": 0.7309958934783936, "eval_runtime": 72.3573, "eval_samples_per_second": 6969.066, "eval_steps_per_second": 27.226, "step": 765000 }, { "epoch": 3.9280439776829668, "grad_norm": 0.5634755492210388, "learning_rate": 0.00017175992779783393, "loss": 0.7247, "step": 766000 }, { "epoch": 3.9331719724319, "grad_norm": 0.6358364224433899, "learning_rate": 0.0001709394486380046, "loss": 0.7247, "step": 767000 }, { "epoch": 3.938299967180834, "grad_norm": 0.7192628383636475, "learning_rate": 0.00017011896947817528, "loss": 0.716, "step": 768000 }, { "epoch": 3.9434279619297667, "grad_norm": 0.7193153500556946, "learning_rate": 0.00016929931079750577, "loss": 0.7243, "step": 769000 }, { "epoch": 3.9485559566787005, "grad_norm": 0.6458471417427063, "learning_rate": 0.0001684788316376764, "loss": 0.7286, "step": 770000 }, { "epoch": 3.9485559566787005, "eval_accuracy": 0.8083678556626205, "eval_loss": 0.7326467037200928, "eval_runtime": 72.9455, "eval_samples_per_second": 6912.87, "eval_steps_per_second": 27.006, "step": 770000 }, { "epoch": 3.953683951427634, "grad_norm": 0.5734179019927979, "learning_rate": 0.00016765835247784707, "loss": 0.7231, "step": 771000 }, { "epoch": 3.958811946176567, "grad_norm": 0.5269643068313599, "learning_rate": 0.00016683787331801773, "loss": 0.725, "step": 772000 }, { "epoch": 3.9639399409255005, "grad_norm": 0.5405349731445312, "learning_rate": 0.00016601821463734821, "loss": 0.7249, "step": 773000 }, { "epoch": 3.969067935674434, "grad_norm": 0.7156373262405396, "learning_rate": 0.00016519773547751888, "loss": 0.7268, "step": 774000 }, { "epoch": 3.974195930423367, "grad_norm": 0.6865078806877136, "learning_rate": 0.0001643780767968494, "loss": 0.7237, "step": 775000 }, { "epoch": 3.974195930423367, "eval_accuracy": 0.8088120683056262, "eval_loss": 0.7302601933479309, "eval_runtime": 72.7492, "eval_samples_per_second": 6931.523, "eval_steps_per_second": 27.079, "step": 775000 }, { "epoch": 3.9793239251723005, "grad_norm": 0.5540174245834351, "learning_rate": 0.00016355759763702002, "loss": 0.7218, "step": 776000 }, { "epoch": 3.9844519199212343, "grad_norm": 0.5312943458557129, "learning_rate": 0.00016273793895635053, "loss": 0.7238, "step": 777000 }, { "epoch": 3.989579914670167, "grad_norm": 0.5829684138298035, "learning_rate": 0.00016191745979652117, "loss": 0.7278, "step": 778000 }, { "epoch": 3.994707909419101, "grad_norm": 0.6079599857330322, "learning_rate": 0.00016109698063669184, "loss": 0.7226, "step": 779000 }, { "epoch": 3.9998359041680343, "grad_norm": 0.6079375147819519, "learning_rate": 0.00016027732195602232, "loss": 0.7187, "step": 780000 }, { "epoch": 3.9998359041680343, "eval_accuracy": 0.809048056272223, "eval_loss": 0.7297579050064087, "eval_runtime": 72.3866, "eval_samples_per_second": 6966.249, "eval_steps_per_second": 27.215, "step": 780000 }, { "epoch": 4.004963898916968, "grad_norm": 0.6389214992523193, "learning_rate": 0.00015945684279619298, "loss": 0.7078, "step": 781000 }, { "epoch": 4.0100918936659005, "grad_norm": 0.5486829280853271, "learning_rate": 0.00015863636363636365, "loss": 0.7073, "step": 782000 }, { "epoch": 4.015219888414834, "grad_norm": 0.5968378186225891, "learning_rate": 0.0001578158844765343, "loss": 0.7099, "step": 783000 }, { "epoch": 4.020347883163768, "grad_norm": 0.6569184064865112, "learning_rate": 0.0001569962257958648, "loss": 0.7124, "step": 784000 }, { "epoch": 4.025475877912701, "grad_norm": 0.6940632462501526, "learning_rate": 0.00015617574663603546, "loss": 0.7077, "step": 785000 }, { "epoch": 4.025475877912701, "eval_accuracy": 0.8084432131645589, "eval_loss": 0.7316287755966187, "eval_runtime": 70.6436, "eval_samples_per_second": 7138.124, "eval_steps_per_second": 27.886, "step": 785000 }, { "epoch": 4.030603872661635, "grad_norm": 0.5832222104072571, "learning_rate": 0.0001553552674762061, "loss": 0.7076, "step": 786000 }, { "epoch": 4.0357318674105676, "grad_norm": 0.7346569895744324, "learning_rate": 0.00015453478831637678, "loss": 0.7102, "step": 787000 }, { "epoch": 4.040859862159501, "grad_norm": 0.6240456700325012, "learning_rate": 0.00015371512963570727, "loss": 0.7077, "step": 788000 }, { "epoch": 4.045987856908434, "grad_norm": 0.6334419846534729, "learning_rate": 0.00015289465047587793, "loss": 0.71, "step": 789000 }, { "epoch": 4.051115851657368, "grad_norm": 0.6498461961746216, "learning_rate": 0.00015207417131604857, "loss": 0.7108, "step": 790000 }, { "epoch": 4.051115851657368, "eval_accuracy": 0.808385703492027, "eval_loss": 0.7316392660140991, "eval_runtime": 71.009, "eval_samples_per_second": 7101.399, "eval_steps_per_second": 27.743, "step": 790000 }, { "epoch": 4.056243846406301, "grad_norm": 0.618683934211731, "learning_rate": 0.00015125369215621923, "loss": 0.7098, "step": 791000 }, { "epoch": 4.061371841155235, "grad_norm": 0.5353243350982666, "learning_rate": 0.00015043403347554971, "loss": 0.7031, "step": 792000 }, { "epoch": 4.066499835904168, "grad_norm": 0.621579647064209, "learning_rate": 0.0001496135543157204, "loss": 0.7057, "step": 793000 }, { "epoch": 4.071627830653101, "grad_norm": 0.5752761363983154, "learning_rate": 0.0001487938956350509, "loss": 0.7089, "step": 794000 }, { "epoch": 4.076755825402035, "grad_norm": 0.5452781915664673, "learning_rate": 0.00014797341647522153, "loss": 0.7025, "step": 795000 }, { "epoch": 4.076755825402035, "eval_accuracy": 0.8093415539113518, "eval_loss": 0.7299850583076477, "eval_runtime": 70.8578, "eval_samples_per_second": 7116.544, "eval_steps_per_second": 27.802, "step": 795000 }, { "epoch": 4.081883820150968, "grad_norm": 0.6958083510398865, "learning_rate": 0.00014715375779455204, "loss": 0.7161, "step": 796000 }, { "epoch": 4.087011814899902, "grad_norm": 0.6346060037612915, "learning_rate": 0.00014633327863472267, "loss": 0.7077, "step": 797000 }, { "epoch": 4.092139809648835, "grad_norm": 0.5567407608032227, "learning_rate": 0.00014551361995405318, "loss": 0.706, "step": 798000 }, { "epoch": 4.097267804397768, "grad_norm": 0.582592248916626, "learning_rate": 0.00014469314079422382, "loss": 0.7061, "step": 799000 }, { "epoch": 4.102395799146701, "grad_norm": 0.6524437069892883, "learning_rate": 0.0001438734821135543, "loss": 0.708, "step": 800000 }, { "epoch": 4.102395799146701, "eval_accuracy": 0.8092661964094133, "eval_loss": 0.7295412421226501, "eval_runtime": 70.8008, "eval_samples_per_second": 7122.281, "eval_steps_per_second": 27.825, "step": 800000 }, { "epoch": 4.107523793895635, "grad_norm": 0.6798685789108276, "learning_rate": 0.0001430538234328848, "loss": 0.7061, "step": 801000 }, { "epoch": 4.112651788644569, "grad_norm": 0.5619680881500244, "learning_rate": 0.00014223334427305548, "loss": 0.7073, "step": 802000 }, { "epoch": 4.117779783393502, "grad_norm": 0.61456298828125, "learning_rate": 0.00014141286511322614, "loss": 0.7079, "step": 803000 }, { "epoch": 4.1229077781424355, "grad_norm": 0.619781494140625, "learning_rate": 0.00014059238595339678, "loss": 0.7088, "step": 804000 }, { "epoch": 4.128035772891368, "grad_norm": 0.8303067684173584, "learning_rate": 0.0001397727272727273, "loss": 0.7067, "step": 805000 }, { "epoch": 4.128035772891368, "eval_accuracy": 0.8093911312152587, "eval_loss": 0.728842556476593, "eval_runtime": 70.5007, "eval_samples_per_second": 7152.596, "eval_steps_per_second": 27.943, "step": 805000 }, { "epoch": 4.133163767640302, "grad_norm": 0.548308789730072, "learning_rate": 0.00013895224811289792, "loss": 0.7091, "step": 806000 }, { "epoch": 4.138291762389235, "grad_norm": 0.584027111530304, "learning_rate": 0.00013813258943222843, "loss": 0.7038, "step": 807000 }, { "epoch": 4.143419757138169, "grad_norm": 0.6365171670913696, "learning_rate": 0.00013731293075155892, "loss": 0.7089, "step": 808000 }, { "epoch": 4.148547751887102, "grad_norm": 0.6319549083709717, "learning_rate": 0.00013649245159172958, "loss": 0.7115, "step": 809000 }, { "epoch": 4.1536757466360354, "grad_norm": 0.6406691670417786, "learning_rate": 0.00013567197243190024, "loss": 0.7123, "step": 810000 }, { "epoch": 4.1536757466360354, "eval_accuracy": 0.8094129452289778, "eval_loss": 0.7286692261695862, "eval_runtime": 70.5388, "eval_samples_per_second": 7148.737, "eval_steps_per_second": 27.928, "step": 810000 }, { "epoch": 4.158803741384969, "grad_norm": 0.5951902866363525, "learning_rate": 0.00013485149327207088, "loss": 0.7072, "step": 811000 }, { "epoch": 4.163931736133902, "grad_norm": 0.6139009594917297, "learning_rate": 0.0001340318345914014, "loss": 0.7061, "step": 812000 }, { "epoch": 4.169059730882836, "grad_norm": 0.5775468349456787, "learning_rate": 0.00013321135543157203, "loss": 0.7077, "step": 813000 }, { "epoch": 4.174187725631769, "grad_norm": 0.6729329824447632, "learning_rate": 0.00013239087627174272, "loss": 0.7089, "step": 814000 }, { "epoch": 4.1793157203807025, "grad_norm": 0.6356979608535767, "learning_rate": 0.0001315712175910732, "loss": 0.707, "step": 815000 }, { "epoch": 4.1793157203807025, "eval_accuracy": 0.8094843365466037, "eval_loss": 0.7282999753952026, "eval_runtime": 70.9611, "eval_samples_per_second": 7106.19, "eval_steps_per_second": 27.762, "step": 815000 }, { "epoch": 4.184443715129635, "grad_norm": 0.6034815311431885, "learning_rate": 0.00013075073843124387, "loss": 0.7049, "step": 816000 }, { "epoch": 4.189571709878569, "grad_norm": 0.5859324932098389, "learning_rate": 0.0001299302592714145, "loss": 0.7051, "step": 817000 }, { "epoch": 4.194699704627502, "grad_norm": 0.6676262021064758, "learning_rate": 0.00012910978011158517, "loss": 0.71, "step": 818000 }, { "epoch": 4.199827699376436, "grad_norm": 0.6685138940811157, "learning_rate": 0.00012829012143091565, "loss": 0.7053, "step": 819000 }, { "epoch": 4.20495569412537, "grad_norm": 0.6142294406890869, "learning_rate": 0.00012746964227108634, "loss": 0.7033, "step": 820000 }, { "epoch": 4.20495569412537, "eval_accuracy": 0.8098571578719834, "eval_loss": 0.7282252907752991, "eval_runtime": 70.3811, "eval_samples_per_second": 7164.753, "eval_steps_per_second": 27.99, "step": 820000 }, { "epoch": 4.2100836888743025, "grad_norm": 0.503429114818573, "learning_rate": 0.0001266499835904168, "loss": 0.7065, "step": 821000 }, { "epoch": 4.215211683623236, "grad_norm": 0.6597088575363159, "learning_rate": 0.0001258295044305875, "loss": 0.7117, "step": 822000 }, { "epoch": 4.220339678372169, "grad_norm": 0.6281595230102539, "learning_rate": 0.00012500984574991797, "loss": 0.7079, "step": 823000 }, { "epoch": 4.225467673121103, "grad_norm": 0.4819941222667694, "learning_rate": 0.0001241893665900886, "loss": 0.7072, "step": 824000 }, { "epoch": 4.230595667870036, "grad_norm": 0.6821579337120056, "learning_rate": 0.00012336888743025927, "loss": 0.7128, "step": 825000 }, { "epoch": 4.230595667870036, "eval_accuracy": 0.809940447742547, "eval_loss": 0.7272255420684814, "eval_runtime": 70.5765, "eval_samples_per_second": 7144.914, "eval_steps_per_second": 27.913, "step": 825000 }, { "epoch": 4.23572366261897, "grad_norm": 0.5742472410202026, "learning_rate": 0.00012254840827042993, "loss": 0.7044, "step": 826000 }, { "epoch": 4.2408516573679025, "grad_norm": 0.5903806090354919, "learning_rate": 0.00012172874958976042, "loss": 0.71, "step": 827000 }, { "epoch": 4.245979652116836, "grad_norm": 0.6407065987586975, "learning_rate": 0.0001209090909090909, "loss": 0.712, "step": 828000 }, { "epoch": 4.251107646865769, "grad_norm": 0.5949096083641052, "learning_rate": 0.00012008861174926158, "loss": 0.7109, "step": 829000 }, { "epoch": 4.256235641614703, "grad_norm": 0.6764048337936401, "learning_rate": 0.00011926813258943224, "loss": 0.7053, "step": 830000 }, { "epoch": 4.256235641614703, "eval_accuracy": 0.8094744210858222, "eval_loss": 0.728439450263977, "eval_runtime": 70.6873, "eval_samples_per_second": 7133.718, "eval_steps_per_second": 27.869, "step": 830000 }, { "epoch": 4.261363636363637, "grad_norm": 0.5870431661605835, "learning_rate": 0.00011844847390876273, "loss": 0.7108, "step": 831000 }, { "epoch": 4.26649163111257, "grad_norm": 0.6865319609642029, "learning_rate": 0.00011762799474893338, "loss": 0.7084, "step": 832000 }, { "epoch": 4.271619625861503, "grad_norm": 0.6285982131958008, "learning_rate": 0.00011680751558910404, "loss": 0.7097, "step": 833000 }, { "epoch": 4.276747620610436, "grad_norm": 0.6233821511268616, "learning_rate": 0.0001159870364292747, "loss": 0.7081, "step": 834000 }, { "epoch": 4.28187561535937, "grad_norm": 0.5938854813575745, "learning_rate": 0.0001151673777486052, "loss": 0.7097, "step": 835000 }, { "epoch": 4.28187561535937, "eval_accuracy": 0.8098016312916078, "eval_loss": 0.726809561252594, "eval_runtime": 71.0425, "eval_samples_per_second": 7098.049, "eval_steps_per_second": 27.73, "step": 835000 }, { "epoch": 4.287003610108303, "grad_norm": 0.6070296168327332, "learning_rate": 0.00011434771906793568, "loss": 0.7051, "step": 836000 }, { "epoch": 4.292131604857237, "grad_norm": 0.7683534026145935, "learning_rate": 0.00011352723990810635, "loss": 0.7032, "step": 837000 }, { "epoch": 4.2972595996061695, "grad_norm": 0.786492645740509, "learning_rate": 0.000112706760748277, "loss": 0.7057, "step": 838000 }, { "epoch": 4.302387594355103, "grad_norm": 0.5988065600395203, "learning_rate": 0.00011188710206760748, "loss": 0.7018, "step": 839000 }, { "epoch": 4.307515589104037, "grad_norm": 0.8143409490585327, "learning_rate": 0.00011106662290777814, "loss": 0.7101, "step": 840000 }, { "epoch": 4.307515589104037, "eval_accuracy": 0.8097322230661381, "eval_loss": 0.7266760468482971, "eval_runtime": 70.5876, "eval_samples_per_second": 7143.794, "eval_steps_per_second": 27.909, "step": 840000 }, { "epoch": 4.31264358385297, "grad_norm": 0.5831992626190186, "learning_rate": 0.00011024614374794882, "loss": 0.7077, "step": 841000 }, { "epoch": 4.317771578601904, "grad_norm": 0.6808064579963684, "learning_rate": 0.00010942566458811947, "loss": 0.7043, "step": 842000 }, { "epoch": 4.322899573350837, "grad_norm": 0.7133612036705017, "learning_rate": 0.00010860600590744995, "loss": 0.7072, "step": 843000 }, { "epoch": 4.32802756809977, "grad_norm": 0.6466283798217773, "learning_rate": 0.00010778552674762062, "loss": 0.7064, "step": 844000 }, { "epoch": 4.333155562848703, "grad_norm": 0.6584567427635193, "learning_rate": 0.00010696504758779127, "loss": 0.7074, "step": 845000 }, { "epoch": 4.333155562848703, "eval_accuracy": 0.8101506555111122, "eval_loss": 0.7261218428611755, "eval_runtime": 70.6237, "eval_samples_per_second": 7140.143, "eval_steps_per_second": 27.894, "step": 845000 }, { "epoch": 4.338283557597637, "grad_norm": 0.6504273414611816, "learning_rate": 0.00010614538890712175, "loss": 0.7061, "step": 846000 }, { "epoch": 4.34341155234657, "grad_norm": 0.6829896569252014, "learning_rate": 0.00010532490974729243, "loss": 0.711, "step": 847000 }, { "epoch": 4.348539547095504, "grad_norm": 0.6022945642471313, "learning_rate": 0.00010450443058746309, "loss": 0.7078, "step": 848000 }, { "epoch": 4.3536675418444375, "grad_norm": 0.7128505110740662, "learning_rate": 0.00010368395142763374, "loss": 0.7056, "step": 849000 }, { "epoch": 4.35879553659337, "grad_norm": 0.579028844833374, "learning_rate": 0.00010286347226780439, "loss": 0.7034, "step": 850000 }, { "epoch": 4.35879553659337, "eval_accuracy": 0.8100614163640798, "eval_loss": 0.7257193922996521, "eval_runtime": 70.8414, "eval_samples_per_second": 7118.192, "eval_steps_per_second": 27.809, "step": 850000 }, { "epoch": 4.363923531342304, "grad_norm": 0.6017422080039978, "learning_rate": 0.00010204463406629472, "loss": 0.7071, "step": 851000 }, { "epoch": 4.369051526091237, "grad_norm": 0.6294065713882446, "learning_rate": 0.00010122415490646537, "loss": 0.7056, "step": 852000 }, { "epoch": 4.374179520840171, "grad_norm": 0.6739296913146973, "learning_rate": 0.00010040449622579587, "loss": 0.7089, "step": 853000 }, { "epoch": 4.379307515589104, "grad_norm": 0.6514387130737305, "learning_rate": 9.958401706596653e-05, "loss": 0.7031, "step": 854000 }, { "epoch": 4.3844355103380375, "grad_norm": 0.6753476858139038, "learning_rate": 9.87635379061372e-05, "loss": 0.7059, "step": 855000 }, { "epoch": 4.3844355103380375, "eval_accuracy": 0.8097857665543575, "eval_loss": 0.7262446880340576, "eval_runtime": 70.6365, "eval_samples_per_second": 7138.842, "eval_steps_per_second": 27.889, "step": 855000 }, { "epoch": 4.38956350508697, "grad_norm": 0.5851760506629944, "learning_rate": 9.794305874630785e-05, "loss": 0.7059, "step": 856000 }, { "epoch": 4.394691499835904, "grad_norm": 0.808464765548706, "learning_rate": 9.712340006563833e-05, "loss": 0.7078, "step": 857000 }, { "epoch": 4.399819494584838, "grad_norm": 0.6346122622489929, "learning_rate": 9.630292090580901e-05, "loss": 0.7078, "step": 858000 }, { "epoch": 4.404947489333771, "grad_norm": 0.6442052125930786, "learning_rate": 9.548326222513949e-05, "loss": 0.7065, "step": 859000 }, { "epoch": 4.4100754840827046, "grad_norm": 0.6481711864471436, "learning_rate": 9.466360354446997e-05, "loss": 0.7008, "step": 860000 }, { "epoch": 4.4100754840827046, "eval_accuracy": 0.8099979574150791, "eval_loss": 0.7247160077095032, "eval_runtime": 70.847, "eval_samples_per_second": 7117.629, "eval_steps_per_second": 27.806, "step": 860000 }, { "epoch": 4.415203478831637, "grad_norm": 0.6924198269844055, "learning_rate": 9.384312438464064e-05, "loss": 0.7081, "step": 861000 }, { "epoch": 4.420331473580571, "grad_norm": 0.6235934495925903, "learning_rate": 9.302264522481129e-05, "loss": 0.701, "step": 862000 }, { "epoch": 4.425459468329504, "grad_norm": 0.5865472555160522, "learning_rate": 9.220216606498195e-05, "loss": 0.712, "step": 863000 }, { "epoch": 4.430587463078438, "grad_norm": 0.7308680415153503, "learning_rate": 9.13816869051526e-05, "loss": 0.7049, "step": 864000 }, { "epoch": 4.435715457827371, "grad_norm": 0.5955784320831299, "learning_rate": 9.05620282244831e-05, "loss": 0.7021, "step": 865000 }, { "epoch": 4.435715457827371, "eval_accuracy": 0.8102874888698953, "eval_loss": 0.724114716053009, "eval_runtime": 70.3858, "eval_samples_per_second": 7164.267, "eval_steps_per_second": 27.989, "step": 865000 }, { "epoch": 4.4408434525763045, "grad_norm": 0.5804136395454407, "learning_rate": 8.974154906465376e-05, "loss": 0.7042, "step": 866000 }, { "epoch": 4.445971447325238, "grad_norm": 0.5142644047737122, "learning_rate": 8.892106990482441e-05, "loss": 0.7052, "step": 867000 }, { "epoch": 4.451099442074171, "grad_norm": 0.7222858667373657, "learning_rate": 8.810141122415491e-05, "loss": 0.7052, "step": 868000 }, { "epoch": 4.456227436823105, "grad_norm": 0.7241150140762329, "learning_rate": 8.728093206432557e-05, "loss": 0.7057, "step": 869000 }, { "epoch": 4.461355431572038, "grad_norm": 0.6935784220695496, "learning_rate": 8.646045290449622e-05, "loss": 0.707, "step": 870000 }, { "epoch": 4.461355431572038, "eval_accuracy": 0.810493730454148, "eval_loss": 0.724329948425293, "eval_runtime": 71.022, "eval_samples_per_second": 7100.095, "eval_steps_per_second": 27.738, "step": 870000 }, { "epoch": 4.466483426320972, "grad_norm": 0.6944177150726318, "learning_rate": 8.564079422382672e-05, "loss": 0.7078, "step": 871000 }, { "epoch": 4.4716114210699045, "grad_norm": 0.6552361845970154, "learning_rate": 8.482031506399738e-05, "loss": 0.7072, "step": 872000 }, { "epoch": 4.476739415818838, "grad_norm": 0.7402836084365845, "learning_rate": 8.399983590416803e-05, "loss": 0.7067, "step": 873000 }, { "epoch": 4.481867410567771, "grad_norm": 0.5813349485397339, "learning_rate": 8.31793567443387e-05, "loss": 0.7043, "step": 874000 }, { "epoch": 4.486995405316705, "grad_norm": 0.617222249507904, "learning_rate": 8.23596980636692e-05, "loss": 0.7034, "step": 875000 }, { "epoch": 4.486995405316705, "eval_accuracy": 0.8105631386796176, "eval_loss": 0.723841667175293, "eval_runtime": 70.6204, "eval_samples_per_second": 7140.477, "eval_steps_per_second": 27.896, "step": 875000 }, { "epoch": 4.492123400065639, "grad_norm": 0.5876939296722412, "learning_rate": 8.154003938299968e-05, "loss": 0.6986, "step": 876000 }, { "epoch": 4.497251394814572, "grad_norm": 0.6400925517082214, "learning_rate": 8.071956022317033e-05, "loss": 0.6971, "step": 877000 }, { "epoch": 4.502379389563505, "grad_norm": 0.5628241896629333, "learning_rate": 7.989908106334099e-05, "loss": 0.7097, "step": 878000 }, { "epoch": 4.507507384312438, "grad_norm": 0.6545447111129761, "learning_rate": 7.907942238267149e-05, "loss": 0.7048, "step": 879000 }, { "epoch": 4.512635379061372, "grad_norm": 0.5451228022575378, "learning_rate": 7.825894322284214e-05, "loss": 0.7055, "step": 880000 }, { "epoch": 4.512635379061372, "eval_accuracy": 0.8106325469050872, "eval_loss": 0.7232978940010071, "eval_runtime": 70.6814, "eval_samples_per_second": 7134.307, "eval_steps_per_second": 27.872, "step": 880000 }, { "epoch": 4.517763373810305, "grad_norm": 0.6168373823165894, "learning_rate": 7.743928454217264e-05, "loss": 0.7027, "step": 881000 }, { "epoch": 4.522891368559239, "grad_norm": 0.7063629031181335, "learning_rate": 7.66188053823433e-05, "loss": 0.7018, "step": 882000 }, { "epoch": 4.528019363308172, "grad_norm": 0.6569219827651978, "learning_rate": 7.579832622251395e-05, "loss": 0.7025, "step": 883000 }, { "epoch": 4.533147358057105, "grad_norm": 0.5398179888725281, "learning_rate": 7.497784706268461e-05, "loss": 0.7037, "step": 884000 }, { "epoch": 4.538275352806039, "grad_norm": 0.6475858092308044, "learning_rate": 7.415736790285526e-05, "loss": 0.7056, "step": 885000 }, { "epoch": 4.538275352806039, "eval_accuracy": 0.8106821242089941, "eval_loss": 0.7230600714683533, "eval_runtime": 70.6727, "eval_samples_per_second": 7135.193, "eval_steps_per_second": 27.875, "step": 885000 }, { "epoch": 4.543403347554972, "grad_norm": 0.5825057029724121, "learning_rate": 7.333852970134559e-05, "loss": 0.7081, "step": 886000 }, { "epoch": 4.548531342303906, "grad_norm": 0.513498067855835, "learning_rate": 7.251805054151624e-05, "loss": 0.7066, "step": 887000 }, { "epoch": 4.553659337052839, "grad_norm": 0.6001198887825012, "learning_rate": 7.169757138168691e-05, "loss": 0.7089, "step": 888000 }, { "epoch": 4.558787331801772, "grad_norm": 0.6619890332221985, "learning_rate": 7.08779127010174e-05, "loss": 0.7039, "step": 889000 }, { "epoch": 4.563915326550705, "grad_norm": 0.5484103560447693, "learning_rate": 7.005743354118805e-05, "loss": 0.7029, "step": 890000 }, { "epoch": 4.563915326550705, "eval_accuracy": 0.8108130082913083, "eval_loss": 0.7226137518882751, "eval_runtime": 71.5901, "eval_samples_per_second": 7043.757, "eval_steps_per_second": 27.518, "step": 890000 }, { "epoch": 4.569043321299639, "grad_norm": 0.6214715242385864, "learning_rate": 6.923695438135872e-05, "loss": 0.7055, "step": 891000 }, { "epoch": 4.574171316048572, "grad_norm": 0.6796917915344238, "learning_rate": 6.841647522152938e-05, "loss": 0.6988, "step": 892000 }, { "epoch": 4.579299310797506, "grad_norm": 0.6622291803359985, "learning_rate": 6.759599606170005e-05, "loss": 0.7013, "step": 893000 }, { "epoch": 4.5844273055464395, "grad_norm": 0.6859151721000671, "learning_rate": 6.677715786019035e-05, "loss": 0.7039, "step": 894000 }, { "epoch": 4.589555300295372, "grad_norm": 0.6275607943534851, "learning_rate": 6.595667870036103e-05, "loss": 0.7048, "step": 895000 }, { "epoch": 4.589555300295372, "eval_accuracy": 0.8110688271794678, "eval_loss": 0.7223987579345703, "eval_runtime": 70.3176, "eval_samples_per_second": 7171.217, "eval_steps_per_second": 28.016, "step": 895000 }, { "epoch": 4.594683295044306, "grad_norm": 0.5506250858306885, "learning_rate": 6.513619954053168e-05, "loss": 0.705, "step": 896000 }, { "epoch": 4.599811289793239, "grad_norm": 0.6328802704811096, "learning_rate": 6.431572038070234e-05, "loss": 0.704, "step": 897000 }, { "epoch": 4.604939284542173, "grad_norm": 0.5099455714225769, "learning_rate": 6.349606170003282e-05, "loss": 0.7036, "step": 898000 }, { "epoch": 4.610067279291106, "grad_norm": 0.6595134139060974, "learning_rate": 6.267558254020349e-05, "loss": 0.7027, "step": 899000 }, { "epoch": 4.6151952740400395, "grad_norm": 0.5882099866867065, "learning_rate": 6.185592385953397e-05, "loss": 0.7031, "step": 900000 }, { "epoch": 4.6151952740400395, "eval_accuracy": 0.8110370977049675, "eval_loss": 0.7221442461013794, "eval_runtime": 72.8563, "eval_samples_per_second": 6921.335, "eval_steps_per_second": 27.04, "step": 900000 }, { "epoch": 4.620323268788972, "grad_norm": 0.6113934516906738, "learning_rate": 6.103544469970463e-05, "loss": 0.7006, "step": 901000 }, { "epoch": 4.625451263537906, "grad_norm": 0.577958345413208, "learning_rate": 6.021496553987529e-05, "loss": 0.7055, "step": 902000 }, { "epoch": 4.63057925828684, "grad_norm": 0.6707080602645874, "learning_rate": 5.939530685920578e-05, "loss": 0.7013, "step": 903000 }, { "epoch": 4.635707253035773, "grad_norm": 0.6511906385421753, "learning_rate": 5.857482769937643e-05, "loss": 0.6999, "step": 904000 }, { "epoch": 4.640835247784707, "grad_norm": 0.6258724331855774, "learning_rate": 5.77543485395471e-05, "loss": 0.7034, "step": 905000 }, { "epoch": 4.640835247784707, "eval_accuracy": 0.8111858296166882, "eval_loss": 0.7216086387634277, "eval_runtime": 72.246, "eval_samples_per_second": 6979.801, "eval_steps_per_second": 27.268, "step": 905000 }, { "epoch": 4.6459632425336395, "grad_norm": 0.5974183678627014, "learning_rate": 5.693386937971775e-05, "loss": 0.7035, "step": 906000 }, { "epoch": 4.651091237282573, "grad_norm": 0.5262141823768616, "learning_rate": 5.611421069904824e-05, "loss": 0.7044, "step": 907000 }, { "epoch": 4.656219232031506, "grad_norm": 0.6684118509292603, "learning_rate": 5.5293731539218904e-05, "loss": 0.705, "step": 908000 }, { "epoch": 4.66134722678044, "grad_norm": 0.6512974500656128, "learning_rate": 5.4474072858549394e-05, "loss": 0.703, "step": 909000 }, { "epoch": 4.666475221529373, "grad_norm": 0.5594669580459595, "learning_rate": 5.365359369872005e-05, "loss": 0.7012, "step": 910000 }, { "epoch": 4.666475221529373, "eval_accuracy": 0.8113444769891902, "eval_loss": 0.7218356132507324, "eval_runtime": 73.0508, "eval_samples_per_second": 6902.904, "eval_steps_per_second": 26.968, "step": 910000 }, { "epoch": 4.6716032162783065, "grad_norm": 0.5940375328063965, "learning_rate": 5.283393501805054e-05, "loss": 0.7034, "step": 911000 }, { "epoch": 4.67673121102724, "grad_norm": 0.7923645377159119, "learning_rate": 5.2013455858221205e-05, "loss": 0.7016, "step": 912000 }, { "epoch": 4.681859205776173, "grad_norm": 0.6684789657592773, "learning_rate": 5.1193797177551695e-05, "loss": 0.701, "step": 913000 }, { "epoch": 4.686987200525107, "grad_norm": 0.5665594339370728, "learning_rate": 5.0373318017722345e-05, "loss": 0.6976, "step": 914000 }, { "epoch": 4.69211519527404, "grad_norm": 0.6844334006309509, "learning_rate": 4.9552838857893016e-05, "loss": 0.702, "step": 915000 }, { "epoch": 4.69211519527404, "eval_accuracy": 0.8114475977813165, "eval_loss": 0.7209221720695496, "eval_runtime": 70.6438, "eval_samples_per_second": 7138.104, "eval_steps_per_second": 27.886, "step": 915000 }, { "epoch": 4.697243190022974, "grad_norm": 0.5797888040542603, "learning_rate": 4.873235969806367e-05, "loss": 0.701, "step": 916000 }, { "epoch": 4.7023711847719065, "grad_norm": 0.5763701796531677, "learning_rate": 4.791270101739416e-05, "loss": 0.6996, "step": 917000 }, { "epoch": 4.70749917952084, "grad_norm": 0.6927294731140137, "learning_rate": 4.709222185756482e-05, "loss": 0.7001, "step": 918000 }, { "epoch": 4.712627174269773, "grad_norm": 0.6528701782226562, "learning_rate": 4.627256317689531e-05, "loss": 0.7005, "step": 919000 }, { "epoch": 4.717755169018707, "grad_norm": 0.639787495136261, "learning_rate": 4.545208401706597e-05, "loss": 0.7018, "step": 920000 }, { "epoch": 4.717755169018707, "eval_accuracy": 0.8114575132420979, "eval_loss": 0.7206711173057556, "eval_runtime": 71.2927, "eval_samples_per_second": 7073.134, "eval_steps_per_second": 27.633, "step": 920000 }, { "epoch": 4.722883163767641, "grad_norm": 0.6466658711433411, "learning_rate": 4.4632425336396457e-05, "loss": 0.701, "step": 921000 }, { "epoch": 4.728011158516574, "grad_norm": 0.6040979027748108, "learning_rate": 4.3811946176567114e-05, "loss": 0.6978, "step": 922000 }, { "epoch": 4.733139153265507, "grad_norm": 0.5377728939056396, "learning_rate": 4.2992287495897603e-05, "loss": 0.7065, "step": 923000 }, { "epoch": 4.73826714801444, "grad_norm": 0.6486812233924866, "learning_rate": 4.217180833606826e-05, "loss": 0.6991, "step": 924000 }, { "epoch": 4.743395142763374, "grad_norm": 0.6265415549278259, "learning_rate": 4.1351329176238924e-05, "loss": 0.7056, "step": 925000 }, { "epoch": 4.743395142763374, "eval_accuracy": 0.8115923635087247, "eval_loss": 0.7200772166252136, "eval_runtime": 70.1921, "eval_samples_per_second": 7184.043, "eval_steps_per_second": 28.066, "step": 925000 }, { "epoch": 4.748523137512307, "grad_norm": 0.6883388757705688, "learning_rate": 4.0531670495569414e-05, "loss": 0.7004, "step": 926000 }, { "epoch": 4.753651132261241, "grad_norm": 0.6205825805664062, "learning_rate": 3.971119133574007e-05, "loss": 0.7013, "step": 927000 }, { "epoch": 4.758779127010174, "grad_norm": 0.7139776945114136, "learning_rate": 3.8890712175910735e-05, "loss": 0.7043, "step": 928000 }, { "epoch": 4.763907121759107, "grad_norm": 0.5354844927787781, "learning_rate": 3.807023301608139e-05, "loss": 0.7057, "step": 929000 }, { "epoch": 4.769035116508041, "grad_norm": 0.96646648645401, "learning_rate": 3.725139481457171e-05, "loss": 0.7005, "step": 930000 }, { "epoch": 4.769035116508041, "eval_accuracy": 0.811774807987102, "eval_loss": 0.719876229763031, "eval_runtime": 70.278, "eval_samples_per_second": 7175.264, "eval_steps_per_second": 28.032, "step": 930000 }, { "epoch": 4.774163111256974, "grad_norm": 0.6708937287330627, "learning_rate": 3.643091565474237e-05, "loss": 0.7045, "step": 931000 }, { "epoch": 4.779291106005908, "grad_norm": 0.6520052552223206, "learning_rate": 3.561043649491303e-05, "loss": 0.7006, "step": 932000 }, { "epoch": 4.784419100754841, "grad_norm": 0.6138364672660828, "learning_rate": 3.478995733508369e-05, "loss": 0.7068, "step": 933000 }, { "epoch": 4.789547095503774, "grad_norm": 0.6665279269218445, "learning_rate": 3.3970298654414176e-05, "loss": 0.7006, "step": 934000 }, { "epoch": 4.794675090252707, "grad_norm": 0.6475928425788879, "learning_rate": 3.314981949458484e-05, "loss": 0.7005, "step": 935000 }, { "epoch": 4.794675090252707, "eval_accuracy": 0.8117192814067262, "eval_loss": 0.719744086265564, "eval_runtime": 70.9024, "eval_samples_per_second": 7112.072, "eval_steps_per_second": 27.785, "step": 935000 }, { "epoch": 4.799803085001641, "grad_norm": 0.7204559445381165, "learning_rate": 3.233016081391533e-05, "loss": 0.703, "step": 936000 }, { "epoch": 4.804931079750574, "grad_norm": 0.6928732991218567, "learning_rate": 3.1509681654085986e-05, "loss": 0.7024, "step": 937000 }, { "epoch": 4.810059074499508, "grad_norm": 0.6560456156730652, "learning_rate": 3.0690022973416476e-05, "loss": 0.7027, "step": 938000 }, { "epoch": 4.8151870692484415, "grad_norm": 0.9681175351142883, "learning_rate": 2.9869543813587137e-05, "loss": 0.7004, "step": 939000 }, { "epoch": 4.820315063997374, "grad_norm": 0.6823869943618774, "learning_rate": 2.9049064653757797e-05, "loss": 0.708, "step": 940000 }, { "epoch": 4.820315063997374, "eval_accuracy": 0.8116816026557571, "eval_loss": 0.7189022302627563, "eval_runtime": 70.9694, "eval_samples_per_second": 7105.359, "eval_steps_per_second": 27.758, "step": 940000 }, { "epoch": 4.825443058746308, "grad_norm": 0.6066316366195679, "learning_rate": 2.8228585493928454e-05, "loss": 0.7005, "step": 941000 }, { "epoch": 4.830571053495241, "grad_norm": 0.5331090688705444, "learning_rate": 2.7408926813258944e-05, "loss": 0.7024, "step": 942000 }, { "epoch": 4.835699048244175, "grad_norm": 0.671029269695282, "learning_rate": 2.6589268132589434e-05, "loss": 0.699, "step": 943000 }, { "epoch": 4.840827042993108, "grad_norm": 0.5964373350143433, "learning_rate": 2.5768788972760094e-05, "loss": 0.7024, "step": 944000 }, { "epoch": 4.8459550377420415, "grad_norm": 0.6053373217582703, "learning_rate": 2.4949130292090584e-05, "loss": 0.6956, "step": 945000 }, { "epoch": 4.8459550377420415, "eval_accuracy": 0.8117688587106331, "eval_loss": 0.7190364599227905, "eval_runtime": 70.746, "eval_samples_per_second": 7127.8, "eval_steps_per_second": 27.846, "step": 945000 }, { "epoch": 4.851083032490974, "grad_norm": 0.7100061178207397, "learning_rate": 2.4128651132261245e-05, "loss": 0.7026, "step": 946000 }, { "epoch": 4.856211027239908, "grad_norm": 0.6258633136749268, "learning_rate": 2.33081719724319e-05, "loss": 0.7027, "step": 947000 }, { "epoch": 4.861339021988842, "grad_norm": 0.5608850717544556, "learning_rate": 2.2488513291762388e-05, "loss": 0.7016, "step": 948000 }, { "epoch": 4.866467016737775, "grad_norm": 0.5873566269874573, "learning_rate": 2.166803413193305e-05, "loss": 0.6989, "step": 949000 }, { "epoch": 4.871595011486709, "grad_norm": 0.6754425168037415, "learning_rate": 2.084755497210371e-05, "loss": 0.7074, "step": 950000 }, { "epoch": 4.871595011486709, "eval_accuracy": 0.8119790664791984, "eval_loss": 0.718496561050415, "eval_runtime": 71.0902, "eval_samples_per_second": 7093.285, "eval_steps_per_second": 27.711, "step": 950000 }, { "epoch": 4.8767230062356415, "grad_norm": 0.7100470662117004, "learning_rate": 2.00278962914342e-05, "loss": 0.7073, "step": 951000 }, { "epoch": 4.881851000984575, "grad_norm": 0.5760778784751892, "learning_rate": 1.9207417131604856e-05, "loss": 0.7031, "step": 952000 }, { "epoch": 4.886978995733508, "grad_norm": 0.5999142527580261, "learning_rate": 1.8386937971775516e-05, "loss": 0.7052, "step": 953000 }, { "epoch": 4.892106990482442, "grad_norm": 0.7493179440498352, "learning_rate": 1.7566458811946177e-05, "loss": 0.6976, "step": 954000 }, { "epoch": 4.897234985231375, "grad_norm": 0.6732503771781921, "learning_rate": 1.6746800131276666e-05, "loss": 0.6964, "step": 955000 }, { "epoch": 4.897234985231375, "eval_accuracy": 0.8121079674693563, "eval_loss": 0.7184280157089233, "eval_runtime": 71.2166, "eval_samples_per_second": 7080.691, "eval_steps_per_second": 27.662, "step": 955000 }, { "epoch": 4.902362979980309, "grad_norm": 0.6185951232910156, "learning_rate": 1.5926320971447327e-05, "loss": 0.6985, "step": 956000 }, { "epoch": 4.907490974729242, "grad_norm": 0.6204145550727844, "learning_rate": 1.5106662290777815e-05, "loss": 0.7, "step": 957000 }, { "epoch": 4.912618969478175, "grad_norm": 0.583530068397522, "learning_rate": 1.4286183130948474e-05, "loss": 0.7001, "step": 958000 }, { "epoch": 4.917746964227109, "grad_norm": 0.714301586151123, "learning_rate": 1.3466524450278964e-05, "loss": 0.6982, "step": 959000 }, { "epoch": 4.922874958976042, "grad_norm": 0.5850209593772888, "learning_rate": 1.2646045290449624e-05, "loss": 0.7048, "step": 960000 }, { "epoch": 4.922874958976042, "eval_accuracy": 0.8119532862811668, "eval_loss": 0.7187953591346741, "eval_runtime": 70.1008, "eval_samples_per_second": 7193.397, "eval_steps_per_second": 28.102, "step": 960000 }, { "epoch": 4.928002953724976, "grad_norm": 0.6345271468162537, "learning_rate": 1.1826386609780112e-05, "loss": 0.6982, "step": 961000 }, { "epoch": 4.9331309484739085, "grad_norm": 0.6941786408424377, "learning_rate": 1.10067279291106e-05, "loss": 0.7005, "step": 962000 }, { "epoch": 4.938258943222842, "grad_norm": 0.6848387718200684, "learning_rate": 1.0186248769281261e-05, "loss": 0.7006, "step": 963000 }, { "epoch": 4.943386937971775, "grad_norm": 0.6272566318511963, "learning_rate": 9.36576960945192e-06, "loss": 0.7062, "step": 964000 }, { "epoch": 4.948514932720709, "grad_norm": 0.596750795841217, "learning_rate": 8.54611092878241e-06, "loss": 0.7018, "step": 965000 }, { "epoch": 4.948514932720709, "eval_accuracy": 0.8122428177359831, "eval_loss": 0.717775285243988, "eval_runtime": 70.2259, "eval_samples_per_second": 7180.585, "eval_steps_per_second": 28.052, "step": 965000 }, { "epoch": 4.953642927469643, "grad_norm": 0.5509336590766907, "learning_rate": 7.72563176895307e-06, "loss": 0.6972, "step": 966000 }, { "epoch": 4.958770922218576, "grad_norm": 0.5977193117141724, "learning_rate": 6.9051526091237285e-06, "loss": 0.6954, "step": 967000 }, { "epoch": 4.963898916967509, "grad_norm": 0.5693632364273071, "learning_rate": 6.085493928454218e-06, "loss": 0.7012, "step": 968000 }, { "epoch": 4.969026911716442, "grad_norm": 0.5690773725509644, "learning_rate": 5.265014768624877e-06, "loss": 0.7014, "step": 969000 }, { "epoch": 4.974154906465376, "grad_norm": 0.6249033808708191, "learning_rate": 4.444535608795537e-06, "loss": 0.7006, "step": 970000 }, { "epoch": 4.974154906465376, "eval_accuracy": 0.8120564070732931, "eval_loss": 0.7176830172538757, "eval_runtime": 71.2046, "eval_samples_per_second": 7081.891, "eval_steps_per_second": 27.667, "step": 970000 }, { "epoch": 4.979282901214309, "grad_norm": 0.6632362604141235, "learning_rate": 3.6240564489661962e-06, "loss": 0.7013, "step": 971000 }, { "epoch": 4.984410895963243, "grad_norm": 0.6778949499130249, "learning_rate": 2.8043977682966853e-06, "loss": 0.6983, "step": 972000 }, { "epoch": 4.989538890712176, "grad_norm": 0.5816048979759216, "learning_rate": 1.9839186084673453e-06, "loss": 0.7074, "step": 973000 }, { "epoch": 4.994666885461109, "grad_norm": 0.6206609010696411, "learning_rate": 1.1642599277978341e-06, "loss": 0.7042, "step": 974000 }, { "epoch": 4.999794880210043, "grad_norm": 0.6729464530944824, "learning_rate": 3.437807679684936e-07, "loss": 0.7005, "step": 975000 }, { "epoch": 4.999794880210043, "eval_accuracy": 0.812062356349762, "eval_loss": 0.7176774740219116, "eval_runtime": 71.0656, "eval_samples_per_second": 7095.738, "eval_steps_per_second": 27.721, "step": 975000 }, { "epoch": 5.0, "step": 975040, "total_flos": 2.144143543604609e+19, "train_loss": 0.7742696757605321, "train_runtime": 91371.0354, "train_samples_per_second": 2731.829, "train_steps_per_second": 10.671 } ], "logging_steps": 1000, "max_steps": 975040, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5000, "total_flos": 2.144143543604609e+19, "train_batch_size": 256, "trial_name": null, "trial_params": null }