diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,140353 @@ +{ + "best_metric": 2.2541162967681885, + "best_model_checkpoint": "./out/checkpoint-16000", + "epoch": 3.7488284910965324, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00018744142455482662, + "grad_norm": 93677.7890625, + "learning_rate": 1e-05, + "loss": 2.6898, + "step": 1 + }, + { + "epoch": 0.00037488284910965324, + "grad_norm": 74033.5859375, + "learning_rate": 2e-05, + "loss": 2.5578, + "step": 2 + }, + { + "epoch": 0.0005623242736644799, + "grad_norm": 70953.59375, + "learning_rate": 3e-05, + "loss": 2.5138, + "step": 3 + }, + { + "epoch": 0.0007497656982193065, + "grad_norm": 82630.5625, + "learning_rate": 4e-05, + "loss": 2.4641, + "step": 4 + }, + { + "epoch": 0.0009372071227741331, + "grad_norm": 73234.6484375, + "learning_rate": 5e-05, + "loss": 2.5099, + "step": 5 + }, + { + "epoch": 0.0011246485473289597, + "grad_norm": 65282.47265625, + "learning_rate": 6e-05, + "loss": 2.5056, + "step": 6 + }, + { + "epoch": 0.0013120899718837863, + "grad_norm": 57816.4609375, + "learning_rate": 7e-05, + "loss": 2.4064, + "step": 7 + }, + { + "epoch": 0.001499531396438613, + "grad_norm": 64720.3515625, + "learning_rate": 8e-05, + "loss": 2.4164, + "step": 8 + }, + { + "epoch": 0.0016869728209934396, + "grad_norm": 65362.07421875, + "learning_rate": 9e-05, + "loss": 2.4862, + "step": 9 + }, + { + "epoch": 0.0018744142455482662, + "grad_norm": 65064.578125, + "learning_rate": 0.0001, + "loss": 2.4924, + "step": 10 + }, + { + "epoch": 0.002061855670103093, + "grad_norm": 68526.484375, + "learning_rate": 9.999999938253242e-05, + "loss": 2.4442, + "step": 11 + }, + { + "epoch": 0.0022492970946579195, + "grad_norm": 65596.140625, + "learning_rate": 9.999999753012967e-05, + "loss": 2.5339, + "step": 12 + }, + { + "epoch": 0.002436738519212746, + "grad_norm": 66038.09375, + "learning_rate": 9.999999444279181e-05, + "loss": 2.4107, + "step": 13 + }, + { + "epoch": 0.0026241799437675727, + "grad_norm": 64423.1953125, + "learning_rate": 9.999999012051891e-05, + "loss": 2.3946, + "step": 14 + }, + { + "epoch": 0.0028116213683223993, + "grad_norm": 60430.453125, + "learning_rate": 9.99999845633111e-05, + "loss": 2.4492, + "step": 15 + }, + { + "epoch": 0.002999062792877226, + "grad_norm": 62844.1796875, + "learning_rate": 9.999997777116848e-05, + "loss": 2.4603, + "step": 16 + }, + { + "epoch": 0.0031865042174320526, + "grad_norm": 63830.96484375, + "learning_rate": 9.999996974409122e-05, + "loss": 2.3611, + "step": 17 + }, + { + "epoch": 0.003373945641986879, + "grad_norm": 75461.703125, + "learning_rate": 9.999996048207956e-05, + "loss": 2.4846, + "step": 18 + }, + { + "epoch": 0.003561387066541706, + "grad_norm": 63892.89453125, + "learning_rate": 9.999994998513369e-05, + "loss": 2.3961, + "step": 19 + }, + { + "epoch": 0.0037488284910965324, + "grad_norm": 64866.0, + "learning_rate": 9.999993825325389e-05, + "loss": 2.4227, + "step": 20 + }, + { + "epoch": 0.003936269915651359, + "grad_norm": 61415.91796875, + "learning_rate": 9.999992528644042e-05, + "loss": 2.382, + "step": 21 + }, + { + "epoch": 0.004123711340206186, + "grad_norm": 66786.59375, + "learning_rate": 9.999991108469364e-05, + "loss": 2.4088, + "step": 22 + }, + { + "epoch": 0.004311152764761012, + "grad_norm": 58976.44140625, + "learning_rate": 9.999989564801389e-05, + "loss": 2.4175, + "step": 23 + }, + { + "epoch": 0.004498594189315839, + "grad_norm": 63740.75, + "learning_rate": 9.999987897640153e-05, + "loss": 2.4167, + "step": 24 + }, + { + "epoch": 0.004686035613870665, + "grad_norm": 56891.82421875, + "learning_rate": 9.999986106985697e-05, + "loss": 2.3991, + "step": 25 + }, + { + "epoch": 0.004873477038425492, + "grad_norm": 58922.515625, + "learning_rate": 9.99998419283807e-05, + "loss": 2.3582, + "step": 26 + }, + { + "epoch": 0.005060918462980318, + "grad_norm": 55474.72265625, + "learning_rate": 9.999982155197314e-05, + "loss": 2.3703, + "step": 27 + }, + { + "epoch": 0.005248359887535145, + "grad_norm": 54813.66796875, + "learning_rate": 9.999979994063483e-05, + "loss": 2.3962, + "step": 28 + }, + { + "epoch": 0.005435801312089972, + "grad_norm": 58669.6640625, + "learning_rate": 9.999977709436625e-05, + "loss": 2.4127, + "step": 29 + }, + { + "epoch": 0.005623242736644799, + "grad_norm": 59393.09765625, + "learning_rate": 9.999975301316803e-05, + "loss": 2.4259, + "step": 30 + }, + { + "epoch": 0.005810684161199625, + "grad_norm": 61276.50390625, + "learning_rate": 9.999972769704073e-05, + "loss": 2.3495, + "step": 31 + }, + { + "epoch": 0.005998125585754452, + "grad_norm": 60808.12890625, + "learning_rate": 9.999970114598498e-05, + "loss": 2.3428, + "step": 32 + }, + { + "epoch": 0.006185567010309278, + "grad_norm": 58443.203125, + "learning_rate": 9.999967336000144e-05, + "loss": 2.3396, + "step": 33 + }, + { + "epoch": 0.006373008434864105, + "grad_norm": 61082.734375, + "learning_rate": 9.999964433909079e-05, + "loss": 2.4065, + "step": 34 + }, + { + "epoch": 0.006560449859418931, + "grad_norm": 61798.1484375, + "learning_rate": 9.999961408325376e-05, + "loss": 2.4025, + "step": 35 + }, + { + "epoch": 0.006747891283973758, + "grad_norm": 65027.140625, + "learning_rate": 9.99995825924911e-05, + "loss": 2.4878, + "step": 36 + }, + { + "epoch": 0.0069353327085285845, + "grad_norm": 58735.0703125, + "learning_rate": 9.999954986680354e-05, + "loss": 2.3812, + "step": 37 + }, + { + "epoch": 0.007122774133083412, + "grad_norm": 69721.1484375, + "learning_rate": 9.999951590619194e-05, + "loss": 2.4387, + "step": 38 + }, + { + "epoch": 0.007310215557638238, + "grad_norm": 65334.8046875, + "learning_rate": 9.999948071065713e-05, + "loss": 2.468, + "step": 39 + }, + { + "epoch": 0.007497656982193065, + "grad_norm": 62754.56640625, + "learning_rate": 9.999944428019997e-05, + "loss": 2.4726, + "step": 40 + }, + { + "epoch": 0.007685098406747891, + "grad_norm": 62421.1953125, + "learning_rate": 9.999940661482135e-05, + "loss": 2.3923, + "step": 41 + }, + { + "epoch": 0.007872539831302717, + "grad_norm": 60274.6875, + "learning_rate": 9.999936771452222e-05, + "loss": 2.4437, + "step": 42 + }, + { + "epoch": 0.008059981255857544, + "grad_norm": 54014.45703125, + "learning_rate": 9.999932757930353e-05, + "loss": 2.3929, + "step": 43 + }, + { + "epoch": 0.008247422680412371, + "grad_norm": 58264.5078125, + "learning_rate": 9.999928620916627e-05, + "loss": 2.3339, + "step": 44 + }, + { + "epoch": 0.008434864104967198, + "grad_norm": 56649.20703125, + "learning_rate": 9.999924360411148e-05, + "loss": 2.3765, + "step": 45 + }, + { + "epoch": 0.008622305529522024, + "grad_norm": 57137.98046875, + "learning_rate": 9.999919976414018e-05, + "loss": 2.3571, + "step": 46 + }, + { + "epoch": 0.00880974695407685, + "grad_norm": 58425.98828125, + "learning_rate": 9.999915468925349e-05, + "loss": 2.3923, + "step": 47 + }, + { + "epoch": 0.008997188378631678, + "grad_norm": 56112.78515625, + "learning_rate": 9.999910837945248e-05, + "loss": 2.3445, + "step": 48 + }, + { + "epoch": 0.009184629803186505, + "grad_norm": 59776.99609375, + "learning_rate": 9.999906083473833e-05, + "loss": 2.3854, + "step": 49 + }, + { + "epoch": 0.00937207122774133, + "grad_norm": 57685.2890625, + "learning_rate": 9.99990120551122e-05, + "loss": 2.3589, + "step": 50 + }, + { + "epoch": 0.009559512652296157, + "grad_norm": 54844.0078125, + "learning_rate": 9.999896204057531e-05, + "loss": 2.2879, + "step": 51 + }, + { + "epoch": 0.009746954076850984, + "grad_norm": 59345.56640625, + "learning_rate": 9.999891079112886e-05, + "loss": 2.304, + "step": 52 + }, + { + "epoch": 0.009934395501405811, + "grad_norm": 61168.6640625, + "learning_rate": 9.999885830677415e-05, + "loss": 2.3552, + "step": 53 + }, + { + "epoch": 0.010121836925960637, + "grad_norm": 55989.24609375, + "learning_rate": 9.999880458751247e-05, + "loss": 2.3153, + "step": 54 + }, + { + "epoch": 0.010309278350515464, + "grad_norm": 57621.015625, + "learning_rate": 9.999874963334514e-05, + "loss": 2.3321, + "step": 55 + }, + { + "epoch": 0.01049671977507029, + "grad_norm": 53103.51953125, + "learning_rate": 9.999869344427352e-05, + "loss": 2.3874, + "step": 56 + }, + { + "epoch": 0.010684161199625118, + "grad_norm": 60184.37890625, + "learning_rate": 9.999863602029898e-05, + "loss": 2.4492, + "step": 57 + }, + { + "epoch": 0.010871602624179943, + "grad_norm": 56337.33203125, + "learning_rate": 9.999857736142297e-05, + "loss": 2.3372, + "step": 58 + }, + { + "epoch": 0.01105904404873477, + "grad_norm": 51447.578125, + "learning_rate": 9.999851746764693e-05, + "loss": 2.3316, + "step": 59 + }, + { + "epoch": 0.011246485473289597, + "grad_norm": 55290.74609375, + "learning_rate": 9.999845633897231e-05, + "loss": 2.4284, + "step": 60 + }, + { + "epoch": 0.011433926897844424, + "grad_norm": 64272.93359375, + "learning_rate": 9.999839397540066e-05, + "loss": 2.4235, + "step": 61 + }, + { + "epoch": 0.01162136832239925, + "grad_norm": 58223.66796875, + "learning_rate": 9.99983303769335e-05, + "loss": 2.3912, + "step": 62 + }, + { + "epoch": 0.011808809746954077, + "grad_norm": 57433.16015625, + "learning_rate": 9.999826554357239e-05, + "loss": 2.3328, + "step": 63 + }, + { + "epoch": 0.011996251171508904, + "grad_norm": 54132.90625, + "learning_rate": 9.999819947531895e-05, + "loss": 2.396, + "step": 64 + }, + { + "epoch": 0.01218369259606373, + "grad_norm": 50619.75390625, + "learning_rate": 9.999813217217482e-05, + "loss": 2.3258, + "step": 65 + }, + { + "epoch": 0.012371134020618556, + "grad_norm": 58544.734375, + "learning_rate": 9.999806363414163e-05, + "loss": 2.3627, + "step": 66 + }, + { + "epoch": 0.012558575445173383, + "grad_norm": 62628.71484375, + "learning_rate": 9.99979938612211e-05, + "loss": 2.4574, + "step": 67 + }, + { + "epoch": 0.01274601686972821, + "grad_norm": 66866.2578125, + "learning_rate": 9.999792285341495e-05, + "loss": 2.3776, + "step": 68 + }, + { + "epoch": 0.012933458294283037, + "grad_norm": 58267.74609375, + "learning_rate": 9.999785061072493e-05, + "loss": 2.4259, + "step": 69 + }, + { + "epoch": 0.013120899718837863, + "grad_norm": 56524.45703125, + "learning_rate": 9.999777713315282e-05, + "loss": 2.4187, + "step": 70 + }, + { + "epoch": 0.01330834114339269, + "grad_norm": 58506.26171875, + "learning_rate": 9.999770242070043e-05, + "loss": 2.4584, + "step": 71 + }, + { + "epoch": 0.013495782567947517, + "grad_norm": 63846.73828125, + "learning_rate": 9.999762647336963e-05, + "loss": 2.3256, + "step": 72 + }, + { + "epoch": 0.013683223992502344, + "grad_norm": 62737.35546875, + "learning_rate": 9.999754929116225e-05, + "loss": 2.3778, + "step": 73 + }, + { + "epoch": 0.013870665417057169, + "grad_norm": 57540.734375, + "learning_rate": 9.999747087408024e-05, + "loss": 2.3909, + "step": 74 + }, + { + "epoch": 0.014058106841611996, + "grad_norm": 58507.0, + "learning_rate": 9.999739122212554e-05, + "loss": 2.3265, + "step": 75 + }, + { + "epoch": 0.014245548266166823, + "grad_norm": 60015.63671875, + "learning_rate": 9.999731033530009e-05, + "loss": 2.3507, + "step": 76 + }, + { + "epoch": 0.01443298969072165, + "grad_norm": 58881.62109375, + "learning_rate": 9.999722821360589e-05, + "loss": 2.2765, + "step": 77 + }, + { + "epoch": 0.014620431115276476, + "grad_norm": 64155.11328125, + "learning_rate": 9.999714485704496e-05, + "loss": 2.3054, + "step": 78 + }, + { + "epoch": 0.014807872539831303, + "grad_norm": 56310.828125, + "learning_rate": 9.99970602656194e-05, + "loss": 2.3335, + "step": 79 + }, + { + "epoch": 0.01499531396438613, + "grad_norm": 63017.21484375, + "learning_rate": 9.999697443933126e-05, + "loss": 2.3669, + "step": 80 + }, + { + "epoch": 0.015182755388940957, + "grad_norm": 57907.15234375, + "learning_rate": 9.999688737818269e-05, + "loss": 2.3824, + "step": 81 + }, + { + "epoch": 0.015370196813495782, + "grad_norm": 64209.125, + "learning_rate": 9.99967990821758e-05, + "loss": 2.3801, + "step": 82 + }, + { + "epoch": 0.015557638238050609, + "grad_norm": 53665.5078125, + "learning_rate": 9.999670955131282e-05, + "loss": 2.342, + "step": 83 + }, + { + "epoch": 0.015745079662605434, + "grad_norm": 56424.51953125, + "learning_rate": 9.999661878559592e-05, + "loss": 2.3907, + "step": 84 + }, + { + "epoch": 0.015932521087160263, + "grad_norm": 52563.109375, + "learning_rate": 9.999652678502735e-05, + "loss": 2.3262, + "step": 85 + }, + { + "epoch": 0.01611996251171509, + "grad_norm": 58680.22265625, + "learning_rate": 9.999643354960942e-05, + "loss": 2.4263, + "step": 86 + }, + { + "epoch": 0.016307403936269917, + "grad_norm": 53380.3046875, + "learning_rate": 9.999633907934438e-05, + "loss": 2.2875, + "step": 87 + }, + { + "epoch": 0.016494845360824743, + "grad_norm": 54820.42578125, + "learning_rate": 9.99962433742346e-05, + "loss": 2.3583, + "step": 88 + }, + { + "epoch": 0.016682286785379568, + "grad_norm": 60321.265625, + "learning_rate": 9.999614643428241e-05, + "loss": 2.3661, + "step": 89 + }, + { + "epoch": 0.016869728209934397, + "grad_norm": 53558.9375, + "learning_rate": 9.999604825949023e-05, + "loss": 2.3741, + "step": 90 + }, + { + "epoch": 0.017057169634489222, + "grad_norm": 54834.13671875, + "learning_rate": 9.999594884986048e-05, + "loss": 2.4132, + "step": 91 + }, + { + "epoch": 0.017244611059044047, + "grad_norm": 59353.84375, + "learning_rate": 9.999584820539562e-05, + "loss": 2.4397, + "step": 92 + }, + { + "epoch": 0.017432052483598876, + "grad_norm": 57698.796875, + "learning_rate": 9.999574632609811e-05, + "loss": 2.3451, + "step": 93 + }, + { + "epoch": 0.0176194939081537, + "grad_norm": 56182.26953125, + "learning_rate": 9.999564321197051e-05, + "loss": 2.3266, + "step": 94 + }, + { + "epoch": 0.01780693533270853, + "grad_norm": 62564.515625, + "learning_rate": 9.999553886301533e-05, + "loss": 2.3992, + "step": 95 + }, + { + "epoch": 0.017994376757263356, + "grad_norm": 58893.140625, + "learning_rate": 9.999543327923517e-05, + "loss": 2.4025, + "step": 96 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 56678.71875, + "learning_rate": 9.999532646063262e-05, + "loss": 2.4346, + "step": 97 + }, + { + "epoch": 0.01836925960637301, + "grad_norm": 57688.875, + "learning_rate": 9.999521840721033e-05, + "loss": 2.3584, + "step": 98 + }, + { + "epoch": 0.018556701030927835, + "grad_norm": 62169.08203125, + "learning_rate": 9.999510911897096e-05, + "loss": 2.3922, + "step": 99 + }, + { + "epoch": 0.01874414245548266, + "grad_norm": 59541.9296875, + "learning_rate": 9.999499859591724e-05, + "loss": 2.4136, + "step": 100 + }, + { + "epoch": 0.01893158388003749, + "grad_norm": 61325.67578125, + "learning_rate": 9.999488683805184e-05, + "loss": 2.3408, + "step": 101 + }, + { + "epoch": 0.019119025304592314, + "grad_norm": 58207.78125, + "learning_rate": 9.999477384537757e-05, + "loss": 2.3236, + "step": 102 + }, + { + "epoch": 0.01930646672914714, + "grad_norm": 57718.37890625, + "learning_rate": 9.99946596178972e-05, + "loss": 2.3519, + "step": 103 + }, + { + "epoch": 0.01949390815370197, + "grad_norm": 60371.8984375, + "learning_rate": 9.999454415561356e-05, + "loss": 2.461, + "step": 104 + }, + { + "epoch": 0.019681349578256794, + "grad_norm": 52459.41015625, + "learning_rate": 9.999442745852949e-05, + "loss": 2.336, + "step": 105 + }, + { + "epoch": 0.019868791002811623, + "grad_norm": 51809.09375, + "learning_rate": 9.99943095266479e-05, + "loss": 2.3075, + "step": 106 + }, + { + "epoch": 0.020056232427366448, + "grad_norm": 57940.54296875, + "learning_rate": 9.999419035997166e-05, + "loss": 2.3392, + "step": 107 + }, + { + "epoch": 0.020243673851921273, + "grad_norm": 54904.609375, + "learning_rate": 9.999406995850376e-05, + "loss": 2.367, + "step": 108 + }, + { + "epoch": 0.020431115276476102, + "grad_norm": 61224.65234375, + "learning_rate": 9.999394832224714e-05, + "loss": 2.4072, + "step": 109 + }, + { + "epoch": 0.020618556701030927, + "grad_norm": 64136.87109375, + "learning_rate": 9.99938254512048e-05, + "loss": 2.4354, + "step": 110 + }, + { + "epoch": 0.020805998125585753, + "grad_norm": 57798.70703125, + "learning_rate": 9.99937013453798e-05, + "loss": 2.4161, + "step": 111 + }, + { + "epoch": 0.02099343955014058, + "grad_norm": 58993.90234375, + "learning_rate": 9.99935760047752e-05, + "loss": 2.3406, + "step": 112 + }, + { + "epoch": 0.021180880974695407, + "grad_norm": 57831.52734375, + "learning_rate": 9.999344942939408e-05, + "loss": 2.3472, + "step": 113 + }, + { + "epoch": 0.021368322399250236, + "grad_norm": 57283.43359375, + "learning_rate": 9.99933216192396e-05, + "loss": 2.3074, + "step": 114 + }, + { + "epoch": 0.02155576382380506, + "grad_norm": 63869.8046875, + "learning_rate": 9.999319257431488e-05, + "loss": 2.4281, + "step": 115 + }, + { + "epoch": 0.021743205248359886, + "grad_norm": 56642.46875, + "learning_rate": 9.99930622946231e-05, + "loss": 2.4075, + "step": 116 + }, + { + "epoch": 0.021930646672914715, + "grad_norm": 57549.75, + "learning_rate": 9.999293078016751e-05, + "loss": 2.3743, + "step": 117 + }, + { + "epoch": 0.02211808809746954, + "grad_norm": 56498.4140625, + "learning_rate": 9.999279803095137e-05, + "loss": 2.3902, + "step": 118 + }, + { + "epoch": 0.022305529522024366, + "grad_norm": 57451.828125, + "learning_rate": 9.999266404697791e-05, + "loss": 2.3845, + "step": 119 + }, + { + "epoch": 0.022492970946579195, + "grad_norm": 56342.2109375, + "learning_rate": 9.999252882825048e-05, + "loss": 2.3876, + "step": 120 + }, + { + "epoch": 0.02268041237113402, + "grad_norm": 65026.98046875, + "learning_rate": 9.999239237477239e-05, + "loss": 2.4023, + "step": 121 + }, + { + "epoch": 0.02286785379568885, + "grad_norm": 63505.1875, + "learning_rate": 9.999225468654703e-05, + "loss": 2.3363, + "step": 122 + }, + { + "epoch": 0.023055295220243674, + "grad_norm": 57625.64453125, + "learning_rate": 9.99921157635778e-05, + "loss": 2.3574, + "step": 123 + }, + { + "epoch": 0.0232427366447985, + "grad_norm": 58583.7109375, + "learning_rate": 9.999197560586812e-05, + "loss": 2.4295, + "step": 124 + }, + { + "epoch": 0.023430178069353328, + "grad_norm": 57596.0234375, + "learning_rate": 9.999183421342147e-05, + "loss": 2.3551, + "step": 125 + }, + { + "epoch": 0.023617619493908153, + "grad_norm": 53533.25390625, + "learning_rate": 9.99916915862413e-05, + "loss": 2.34, + "step": 126 + }, + { + "epoch": 0.02380506091846298, + "grad_norm": 62265.37109375, + "learning_rate": 9.99915477243312e-05, + "loss": 2.3351, + "step": 127 + }, + { + "epoch": 0.023992502343017807, + "grad_norm": 60309.7890625, + "learning_rate": 9.999140262769467e-05, + "loss": 2.302, + "step": 128 + }, + { + "epoch": 0.024179943767572633, + "grad_norm": 58122.08984375, + "learning_rate": 9.99912562963353e-05, + "loss": 2.2965, + "step": 129 + }, + { + "epoch": 0.02436738519212746, + "grad_norm": 55708.6640625, + "learning_rate": 9.999110873025672e-05, + "loss": 2.4098, + "step": 130 + }, + { + "epoch": 0.024554826616682287, + "grad_norm": 55078.12890625, + "learning_rate": 9.999095992946257e-05, + "loss": 2.3063, + "step": 131 + }, + { + "epoch": 0.024742268041237112, + "grad_norm": 58921.94140625, + "learning_rate": 9.999080989395653e-05, + "loss": 2.3952, + "step": 132 + }, + { + "epoch": 0.02492970946579194, + "grad_norm": 56760.0390625, + "learning_rate": 9.99906586237423e-05, + "loss": 2.3531, + "step": 133 + }, + { + "epoch": 0.025117150890346766, + "grad_norm": 59925.4296875, + "learning_rate": 9.999050611882362e-05, + "loss": 2.327, + "step": 134 + }, + { + "epoch": 0.02530459231490159, + "grad_norm": 61144.24609375, + "learning_rate": 9.999035237920425e-05, + "loss": 2.3321, + "step": 135 + }, + { + "epoch": 0.02549203373945642, + "grad_norm": 61031.1640625, + "learning_rate": 9.999019740488798e-05, + "loss": 2.4268, + "step": 136 + }, + { + "epoch": 0.025679475164011246, + "grad_norm": 56623.125, + "learning_rate": 9.999004119587864e-05, + "loss": 2.3592, + "step": 137 + }, + { + "epoch": 0.025866916588566075, + "grad_norm": 57946.36328125, + "learning_rate": 9.99898837521801e-05, + "loss": 2.3196, + "step": 138 + }, + { + "epoch": 0.0260543580131209, + "grad_norm": 54820.13671875, + "learning_rate": 9.998972507379626e-05, + "loss": 2.3374, + "step": 139 + }, + { + "epoch": 0.026241799437675725, + "grad_norm": 60293.45703125, + "learning_rate": 9.998956516073101e-05, + "loss": 2.4145, + "step": 140 + }, + { + "epoch": 0.026429240862230554, + "grad_norm": 53998.453125, + "learning_rate": 9.998940401298833e-05, + "loss": 2.4383, + "step": 141 + }, + { + "epoch": 0.02661668228678538, + "grad_norm": 52783.72265625, + "learning_rate": 9.998924163057217e-05, + "loss": 2.3581, + "step": 142 + }, + { + "epoch": 0.026804123711340205, + "grad_norm": 58402.46484375, + "learning_rate": 9.998907801348655e-05, + "loss": 2.3328, + "step": 143 + }, + { + "epoch": 0.026991565135895033, + "grad_norm": 55760.8125, + "learning_rate": 9.998891316173551e-05, + "loss": 2.3846, + "step": 144 + }, + { + "epoch": 0.02717900656044986, + "grad_norm": 56531.25, + "learning_rate": 9.998874707532315e-05, + "loss": 2.3559, + "step": 145 + }, + { + "epoch": 0.027366447985004688, + "grad_norm": 65454.671875, + "learning_rate": 9.998857975425353e-05, + "loss": 2.3979, + "step": 146 + }, + { + "epoch": 0.027553889409559513, + "grad_norm": 53829.65625, + "learning_rate": 9.99884111985308e-05, + "loss": 2.3502, + "step": 147 + }, + { + "epoch": 0.027741330834114338, + "grad_norm": 56835.36328125, + "learning_rate": 9.998824140815913e-05, + "loss": 2.3768, + "step": 148 + }, + { + "epoch": 0.027928772258669167, + "grad_norm": 56971.953125, + "learning_rate": 9.99880703831427e-05, + "loss": 2.3432, + "step": 149 + }, + { + "epoch": 0.028116213683223992, + "grad_norm": 60235.27734375, + "learning_rate": 9.998789812348576e-05, + "loss": 2.2782, + "step": 150 + }, + { + "epoch": 0.028303655107778818, + "grad_norm": 54759.1015625, + "learning_rate": 9.998772462919254e-05, + "loss": 2.2789, + "step": 151 + }, + { + "epoch": 0.028491096532333646, + "grad_norm": 57113.71875, + "learning_rate": 9.998754990026733e-05, + "loss": 2.3726, + "step": 152 + }, + { + "epoch": 0.02867853795688847, + "grad_norm": 54158.11328125, + "learning_rate": 9.998737393671444e-05, + "loss": 2.3109, + "step": 153 + }, + { + "epoch": 0.0288659793814433, + "grad_norm": 57534.99609375, + "learning_rate": 9.998719673853822e-05, + "loss": 2.2899, + "step": 154 + }, + { + "epoch": 0.029053420805998126, + "grad_norm": 59143.90234375, + "learning_rate": 9.998701830574306e-05, + "loss": 2.4913, + "step": 155 + }, + { + "epoch": 0.02924086223055295, + "grad_norm": 61181.40625, + "learning_rate": 9.998683863833335e-05, + "loss": 2.3679, + "step": 156 + }, + { + "epoch": 0.02942830365510778, + "grad_norm": 55132.73828125, + "learning_rate": 9.998665773631355e-05, + "loss": 2.3947, + "step": 157 + }, + { + "epoch": 0.029615745079662605, + "grad_norm": 62728.22265625, + "learning_rate": 9.99864755996881e-05, + "loss": 2.4004, + "step": 158 + }, + { + "epoch": 0.02980318650421743, + "grad_norm": 59545.4453125, + "learning_rate": 9.998629222846152e-05, + "loss": 2.4392, + "step": 159 + }, + { + "epoch": 0.02999062792877226, + "grad_norm": 56033.53125, + "learning_rate": 9.998610762263833e-05, + "loss": 2.3505, + "step": 160 + }, + { + "epoch": 0.030178069353327085, + "grad_norm": 52792.37890625, + "learning_rate": 9.998592178222307e-05, + "loss": 2.317, + "step": 161 + }, + { + "epoch": 0.030365510777881913, + "grad_norm": 54669.171875, + "learning_rate": 9.998573470722037e-05, + "loss": 2.3606, + "step": 162 + }, + { + "epoch": 0.03055295220243674, + "grad_norm": 61085.90625, + "learning_rate": 9.998554639763482e-05, + "loss": 2.3706, + "step": 163 + }, + { + "epoch": 0.030740393626991564, + "grad_norm": 52318.12109375, + "learning_rate": 9.998535685347109e-05, + "loss": 2.3672, + "step": 164 + }, + { + "epoch": 0.030927835051546393, + "grad_norm": 50866.49609375, + "learning_rate": 9.998516607473385e-05, + "loss": 2.3287, + "step": 165 + }, + { + "epoch": 0.031115276476101218, + "grad_norm": 62216.890625, + "learning_rate": 9.998497406142781e-05, + "loss": 2.3692, + "step": 166 + }, + { + "epoch": 0.031302717900656044, + "grad_norm": 61478.109375, + "learning_rate": 9.998478081355773e-05, + "loss": 2.3164, + "step": 167 + }, + { + "epoch": 0.03149015932521087, + "grad_norm": 58559.7890625, + "learning_rate": 9.998458633112835e-05, + "loss": 2.3701, + "step": 168 + }, + { + "epoch": 0.0316776007497657, + "grad_norm": 67036.4609375, + "learning_rate": 9.99843906141445e-05, + "loss": 2.4074, + "step": 169 + }, + { + "epoch": 0.031865042174320526, + "grad_norm": 55874.15234375, + "learning_rate": 9.998419366261101e-05, + "loss": 2.437, + "step": 170 + }, + { + "epoch": 0.03205248359887535, + "grad_norm": 62936.26953125, + "learning_rate": 9.998399547653274e-05, + "loss": 2.4187, + "step": 171 + }, + { + "epoch": 0.03223992502343018, + "grad_norm": 57890.5703125, + "learning_rate": 9.99837960559146e-05, + "loss": 2.4211, + "step": 172 + }, + { + "epoch": 0.032427366447985, + "grad_norm": 65252.609375, + "learning_rate": 9.998359540076148e-05, + "loss": 2.3456, + "step": 173 + }, + { + "epoch": 0.032614807872539835, + "grad_norm": 63276.328125, + "learning_rate": 9.998339351107837e-05, + "loss": 2.404, + "step": 174 + }, + { + "epoch": 0.03280224929709466, + "grad_norm": 52940.89453125, + "learning_rate": 9.998319038687024e-05, + "loss": 2.4811, + "step": 175 + }, + { + "epoch": 0.032989690721649485, + "grad_norm": 54962.625, + "learning_rate": 9.998298602814213e-05, + "loss": 2.3154, + "step": 176 + }, + { + "epoch": 0.03317713214620431, + "grad_norm": 57469.796875, + "learning_rate": 9.998278043489906e-05, + "loss": 2.4056, + "step": 177 + }, + { + "epoch": 0.033364573570759136, + "grad_norm": 67455.2890625, + "learning_rate": 9.998257360714611e-05, + "loss": 2.3707, + "step": 178 + }, + { + "epoch": 0.03355201499531396, + "grad_norm": 58472.60546875, + "learning_rate": 9.99823655448884e-05, + "loss": 2.3798, + "step": 179 + }, + { + "epoch": 0.033739456419868794, + "grad_norm": 51719.9296875, + "learning_rate": 9.998215624813106e-05, + "loss": 2.3856, + "step": 180 + }, + { + "epoch": 0.03392689784442362, + "grad_norm": 53300.07421875, + "learning_rate": 9.998194571687927e-05, + "loss": 2.347, + "step": 181 + }, + { + "epoch": 0.034114339268978444, + "grad_norm": 52127.73828125, + "learning_rate": 9.998173395113822e-05, + "loss": 2.3974, + "step": 182 + }, + { + "epoch": 0.03430178069353327, + "grad_norm": 67426.0859375, + "learning_rate": 9.998152095091314e-05, + "loss": 2.3, + "step": 183 + }, + { + "epoch": 0.034489222118088095, + "grad_norm": 61279.4375, + "learning_rate": 9.998130671620929e-05, + "loss": 2.2826, + "step": 184 + }, + { + "epoch": 0.03467666354264293, + "grad_norm": 61986.11328125, + "learning_rate": 9.998109124703196e-05, + "loss": 2.3578, + "step": 185 + }, + { + "epoch": 0.03486410496719775, + "grad_norm": 63380.01953125, + "learning_rate": 9.998087454338651e-05, + "loss": 2.4001, + "step": 186 + }, + { + "epoch": 0.03505154639175258, + "grad_norm": 59033.71484375, + "learning_rate": 9.998065660527823e-05, + "loss": 2.4057, + "step": 187 + }, + { + "epoch": 0.0352389878163074, + "grad_norm": 59700.52734375, + "learning_rate": 9.998043743271254e-05, + "loss": 2.4003, + "step": 188 + }, + { + "epoch": 0.03542642924086223, + "grad_norm": 56740.265625, + "learning_rate": 9.998021702569486e-05, + "loss": 2.3479, + "step": 189 + }, + { + "epoch": 0.03561387066541706, + "grad_norm": 56647.10546875, + "learning_rate": 9.99799953842306e-05, + "loss": 2.4074, + "step": 190 + }, + { + "epoch": 0.035801312089971886, + "grad_norm": 61887.76953125, + "learning_rate": 9.997977250832527e-05, + "loss": 2.3816, + "step": 191 + }, + { + "epoch": 0.03598875351452671, + "grad_norm": 56419.65625, + "learning_rate": 9.997954839798436e-05, + "loss": 2.4093, + "step": 192 + }, + { + "epoch": 0.03617619493908154, + "grad_norm": 57438.10546875, + "learning_rate": 9.997932305321338e-05, + "loss": 2.3949, + "step": 193 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 63034.75390625, + "learning_rate": 9.997909647401794e-05, + "loss": 2.4595, + "step": 194 + }, + { + "epoch": 0.03655107778819119, + "grad_norm": 76751.625, + "learning_rate": 9.997886866040361e-05, + "loss": 2.5157, + "step": 195 + }, + { + "epoch": 0.03673851921274602, + "grad_norm": 60205.484375, + "learning_rate": 9.997863961237603e-05, + "loss": 2.3372, + "step": 196 + }, + { + "epoch": 0.036925960637300845, + "grad_norm": 60567.546875, + "learning_rate": 9.997840932994084e-05, + "loss": 2.4597, + "step": 197 + }, + { + "epoch": 0.03711340206185567, + "grad_norm": 50529.34375, + "learning_rate": 9.997817781310374e-05, + "loss": 2.361, + "step": 198 + }, + { + "epoch": 0.037300843486410495, + "grad_norm": 63714.83984375, + "learning_rate": 9.997794506187044e-05, + "loss": 2.4439, + "step": 199 + }, + { + "epoch": 0.03748828491096532, + "grad_norm": 57160.2578125, + "learning_rate": 9.997771107624672e-05, + "loss": 2.4011, + "step": 200 + }, + { + "epoch": 0.03767572633552015, + "grad_norm": 59941.29296875, + "learning_rate": 9.99774758562383e-05, + "loss": 2.3177, + "step": 201 + }, + { + "epoch": 0.03786316776007498, + "grad_norm": 53346.9921875, + "learning_rate": 9.997723940185104e-05, + "loss": 2.3875, + "step": 202 + }, + { + "epoch": 0.038050609184629804, + "grad_norm": 61381.76171875, + "learning_rate": 9.997700171309076e-05, + "loss": 2.3403, + "step": 203 + }, + { + "epoch": 0.03823805060918463, + "grad_norm": 59235.80078125, + "learning_rate": 9.997676278996333e-05, + "loss": 2.3022, + "step": 204 + }, + { + "epoch": 0.038425492033739454, + "grad_norm": 58141.59765625, + "learning_rate": 9.997652263247468e-05, + "loss": 2.3138, + "step": 205 + }, + { + "epoch": 0.03861293345829428, + "grad_norm": 54268.171875, + "learning_rate": 9.997628124063068e-05, + "loss": 2.3496, + "step": 206 + }, + { + "epoch": 0.03880037488284911, + "grad_norm": 59662.72265625, + "learning_rate": 9.997603861443735e-05, + "loss": 2.3976, + "step": 207 + }, + { + "epoch": 0.03898781630740394, + "grad_norm": 59804.30078125, + "learning_rate": 9.997579475390066e-05, + "loss": 2.3997, + "step": 208 + }, + { + "epoch": 0.03917525773195876, + "grad_norm": 53126.97265625, + "learning_rate": 9.997554965902662e-05, + "loss": 2.3138, + "step": 209 + }, + { + "epoch": 0.03936269915651359, + "grad_norm": 62162.765625, + "learning_rate": 9.997530332982132e-05, + "loss": 2.3769, + "step": 210 + }, + { + "epoch": 0.03955014058106841, + "grad_norm": 50746.31640625, + "learning_rate": 9.99750557662908e-05, + "loss": 2.3597, + "step": 211 + }, + { + "epoch": 0.039737582005623245, + "grad_norm": 54142.3359375, + "learning_rate": 9.997480696844121e-05, + "loss": 2.3682, + "step": 212 + }, + { + "epoch": 0.03992502343017807, + "grad_norm": 60448.5390625, + "learning_rate": 9.997455693627869e-05, + "loss": 2.3603, + "step": 213 + }, + { + "epoch": 0.040112464854732896, + "grad_norm": 54518.4921875, + "learning_rate": 9.997430566980939e-05, + "loss": 2.401, + "step": 214 + }, + { + "epoch": 0.04029990627928772, + "grad_norm": 57872.32421875, + "learning_rate": 9.997405316903953e-05, + "loss": 2.3428, + "step": 215 + }, + { + "epoch": 0.04048734770384255, + "grad_norm": 52959.26171875, + "learning_rate": 9.997379943397536e-05, + "loss": 2.3895, + "step": 216 + }, + { + "epoch": 0.04067478912839738, + "grad_norm": 55626.2421875, + "learning_rate": 9.997354446462313e-05, + "loss": 2.3337, + "step": 217 + }, + { + "epoch": 0.040862230552952204, + "grad_norm": 54783.2578125, + "learning_rate": 9.997328826098913e-05, + "loss": 2.4302, + "step": 218 + }, + { + "epoch": 0.04104967197750703, + "grad_norm": 54220.91015625, + "learning_rate": 9.997303082307972e-05, + "loss": 2.3141, + "step": 219 + }, + { + "epoch": 0.041237113402061855, + "grad_norm": 53245.75, + "learning_rate": 9.997277215090124e-05, + "loss": 2.3774, + "step": 220 + }, + { + "epoch": 0.04142455482661668, + "grad_norm": 55989.67578125, + "learning_rate": 9.997251224446008e-05, + "loss": 2.3708, + "step": 221 + }, + { + "epoch": 0.041611996251171506, + "grad_norm": 56543.3046875, + "learning_rate": 9.997225110376264e-05, + "loss": 2.4591, + "step": 222 + }, + { + "epoch": 0.04179943767572634, + "grad_norm": 55781.61328125, + "learning_rate": 9.997198872881541e-05, + "loss": 2.3651, + "step": 223 + }, + { + "epoch": 0.04198687910028116, + "grad_norm": 58294.82421875, + "learning_rate": 9.997172511962484e-05, + "loss": 2.3483, + "step": 224 + }, + { + "epoch": 0.04217432052483599, + "grad_norm": 70852.2421875, + "learning_rate": 9.997146027619744e-05, + "loss": 2.372, + "step": 225 + }, + { + "epoch": 0.042361761949390814, + "grad_norm": 52895.375, + "learning_rate": 9.997119419853977e-05, + "loss": 2.3608, + "step": 226 + }, + { + "epoch": 0.04254920337394564, + "grad_norm": 61430.140625, + "learning_rate": 9.997092688665839e-05, + "loss": 2.3457, + "step": 227 + }, + { + "epoch": 0.04273664479850047, + "grad_norm": 57719.94921875, + "learning_rate": 9.99706583405599e-05, + "loss": 2.3975, + "step": 228 + }, + { + "epoch": 0.0429240862230553, + "grad_norm": 52589.23046875, + "learning_rate": 9.997038856025094e-05, + "loss": 2.2705, + "step": 229 + }, + { + "epoch": 0.04311152764761012, + "grad_norm": 60540.203125, + "learning_rate": 9.997011754573818e-05, + "loss": 2.5171, + "step": 230 + }, + { + "epoch": 0.04329896907216495, + "grad_norm": 56712.703125, + "learning_rate": 9.996984529702829e-05, + "loss": 2.3992, + "step": 231 + }, + { + "epoch": 0.04348641049671977, + "grad_norm": 58696.65625, + "learning_rate": 9.996957181412801e-05, + "loss": 2.3634, + "step": 232 + }, + { + "epoch": 0.043673851921274605, + "grad_norm": 56949.76953125, + "learning_rate": 9.99692970970441e-05, + "loss": 2.3885, + "step": 233 + }, + { + "epoch": 0.04386129334582943, + "grad_norm": 60847.47265625, + "learning_rate": 9.996902114578333e-05, + "loss": 2.369, + "step": 234 + }, + { + "epoch": 0.044048734770384255, + "grad_norm": 54649.796875, + "learning_rate": 9.996874396035251e-05, + "loss": 2.3522, + "step": 235 + }, + { + "epoch": 0.04423617619493908, + "grad_norm": 58140.390625, + "learning_rate": 9.996846554075853e-05, + "loss": 2.3835, + "step": 236 + }, + { + "epoch": 0.044423617619493906, + "grad_norm": 58623.20703125, + "learning_rate": 9.996818588700822e-05, + "loss": 2.4166, + "step": 237 + }, + { + "epoch": 0.04461105904404873, + "grad_norm": 54111.39453125, + "learning_rate": 9.99679049991085e-05, + "loss": 2.3279, + "step": 238 + }, + { + "epoch": 0.044798500468603564, + "grad_norm": 50317.55859375, + "learning_rate": 9.996762287706632e-05, + "loss": 2.393, + "step": 239 + }, + { + "epoch": 0.04498594189315839, + "grad_norm": 55830.8359375, + "learning_rate": 9.996733952088863e-05, + "loss": 2.4283, + "step": 240 + }, + { + "epoch": 0.045173383317713214, + "grad_norm": 56188.8203125, + "learning_rate": 9.996705493058245e-05, + "loss": 2.4107, + "step": 241 + }, + { + "epoch": 0.04536082474226804, + "grad_norm": 59554.45703125, + "learning_rate": 9.996676910615479e-05, + "loss": 2.4634, + "step": 242 + }, + { + "epoch": 0.045548266166822865, + "grad_norm": 51152.79296875, + "learning_rate": 9.996648204761272e-05, + "loss": 2.317, + "step": 243 + }, + { + "epoch": 0.0457357075913777, + "grad_norm": 56476.296875, + "learning_rate": 9.996619375496332e-05, + "loss": 2.3686, + "step": 244 + }, + { + "epoch": 0.04592314901593252, + "grad_norm": 57774.44140625, + "learning_rate": 9.996590422821372e-05, + "loss": 2.327, + "step": 245 + }, + { + "epoch": 0.04611059044048735, + "grad_norm": 53147.41796875, + "learning_rate": 9.996561346737105e-05, + "loss": 2.4115, + "step": 246 + }, + { + "epoch": 0.04629803186504217, + "grad_norm": 50594.3671875, + "learning_rate": 9.996532147244253e-05, + "loss": 2.3008, + "step": 247 + }, + { + "epoch": 0.046485473289597, + "grad_norm": 61130.0, + "learning_rate": 9.996502824343534e-05, + "loss": 2.3438, + "step": 248 + }, + { + "epoch": 0.04667291471415183, + "grad_norm": 54164.2578125, + "learning_rate": 9.996473378035673e-05, + "loss": 2.3802, + "step": 249 + }, + { + "epoch": 0.046860356138706656, + "grad_norm": 52928.99609375, + "learning_rate": 9.996443808321399e-05, + "loss": 2.3968, + "step": 250 + }, + { + "epoch": 0.04704779756326148, + "grad_norm": 52758.0234375, + "learning_rate": 9.99641411520144e-05, + "loss": 2.3617, + "step": 251 + }, + { + "epoch": 0.04723523898781631, + "grad_norm": 52483.671875, + "learning_rate": 9.99638429867653e-05, + "loss": 2.3206, + "step": 252 + }, + { + "epoch": 0.04742268041237113, + "grad_norm": 57139.09375, + "learning_rate": 9.996354358747407e-05, + "loss": 2.374, + "step": 253 + }, + { + "epoch": 0.04761012183692596, + "grad_norm": 58880.390625, + "learning_rate": 9.996324295414808e-05, + "loss": 2.3657, + "step": 254 + }, + { + "epoch": 0.04779756326148079, + "grad_norm": 55501.92578125, + "learning_rate": 9.996294108679479e-05, + "loss": 2.3137, + "step": 255 + }, + { + "epoch": 0.047985004686035615, + "grad_norm": 56158.33203125, + "learning_rate": 9.99626379854216e-05, + "loss": 2.3215, + "step": 256 + }, + { + "epoch": 0.04817244611059044, + "grad_norm": 52760.13671875, + "learning_rate": 9.996233365003604e-05, + "loss": 2.386, + "step": 257 + }, + { + "epoch": 0.048359887535145266, + "grad_norm": 55723.45703125, + "learning_rate": 9.996202808064563e-05, + "loss": 2.3182, + "step": 258 + }, + { + "epoch": 0.04854732895970009, + "grad_norm": 54857.73828125, + "learning_rate": 9.996172127725789e-05, + "loss": 2.3537, + "step": 259 + }, + { + "epoch": 0.04873477038425492, + "grad_norm": 54675.2578125, + "learning_rate": 9.996141323988043e-05, + "loss": 2.3385, + "step": 260 + }, + { + "epoch": 0.04892221180880975, + "grad_norm": 60873.5625, + "learning_rate": 9.99611039685208e-05, + "loss": 2.3425, + "step": 261 + }, + { + "epoch": 0.049109653233364574, + "grad_norm": 59105.2890625, + "learning_rate": 9.996079346318671e-05, + "loss": 2.3575, + "step": 262 + }, + { + "epoch": 0.0492970946579194, + "grad_norm": 55924.96875, + "learning_rate": 9.99604817238858e-05, + "loss": 2.3417, + "step": 263 + }, + { + "epoch": 0.049484536082474224, + "grad_norm": 63410.16015625, + "learning_rate": 9.996016875062574e-05, + "loss": 2.4438, + "step": 264 + }, + { + "epoch": 0.04967197750702906, + "grad_norm": 71779.9140625, + "learning_rate": 9.99598545434143e-05, + "loss": 2.3587, + "step": 265 + }, + { + "epoch": 0.04985941893158388, + "grad_norm": 57001.8125, + "learning_rate": 9.995953910225921e-05, + "loss": 2.4169, + "step": 266 + }, + { + "epoch": 0.05004686035613871, + "grad_norm": 61791.19140625, + "learning_rate": 9.995922242716831e-05, + "loss": 2.3785, + "step": 267 + }, + { + "epoch": 0.05023430178069353, + "grad_norm": 64791.44921875, + "learning_rate": 9.995890451814937e-05, + "loss": 2.3409, + "step": 268 + }, + { + "epoch": 0.05042174320524836, + "grad_norm": 62814.05078125, + "learning_rate": 9.995858537521025e-05, + "loss": 2.4027, + "step": 269 + }, + { + "epoch": 0.05060918462980318, + "grad_norm": 56703.62109375, + "learning_rate": 9.995826499835888e-05, + "loss": 2.3394, + "step": 270 + }, + { + "epoch": 0.050796626054358016, + "grad_norm": 57305.65625, + "learning_rate": 9.995794338760311e-05, + "loss": 2.3382, + "step": 271 + }, + { + "epoch": 0.05098406747891284, + "grad_norm": 65871.1328125, + "learning_rate": 9.99576205429509e-05, + "loss": 2.4197, + "step": 272 + }, + { + "epoch": 0.051171508903467666, + "grad_norm": 58113.40234375, + "learning_rate": 9.995729646441025e-05, + "loss": 2.3112, + "step": 273 + }, + { + "epoch": 0.05135895032802249, + "grad_norm": 56835.3046875, + "learning_rate": 9.995697115198914e-05, + "loss": 2.3783, + "step": 274 + }, + { + "epoch": 0.05154639175257732, + "grad_norm": 59792.0859375, + "learning_rate": 9.995664460569562e-05, + "loss": 2.3606, + "step": 275 + }, + { + "epoch": 0.05173383317713215, + "grad_norm": 59126.44921875, + "learning_rate": 9.995631682553775e-05, + "loss": 2.3171, + "step": 276 + }, + { + "epoch": 0.051921274601686974, + "grad_norm": 57715.984375, + "learning_rate": 9.995598781152362e-05, + "loss": 2.3433, + "step": 277 + }, + { + "epoch": 0.0521087160262418, + "grad_norm": 56221.25390625, + "learning_rate": 9.995565756366136e-05, + "loss": 2.3482, + "step": 278 + }, + { + "epoch": 0.052296157450796625, + "grad_norm": 59860.796875, + "learning_rate": 9.995532608195912e-05, + "loss": 2.3802, + "step": 279 + }, + { + "epoch": 0.05248359887535145, + "grad_norm": 51387.171875, + "learning_rate": 9.995499336642511e-05, + "loss": 2.4252, + "step": 280 + }, + { + "epoch": 0.05267104029990628, + "grad_norm": 60394.33203125, + "learning_rate": 9.995465941706753e-05, + "loss": 2.3394, + "step": 281 + }, + { + "epoch": 0.05285848172446111, + "grad_norm": 59068.08203125, + "learning_rate": 9.995432423389461e-05, + "loss": 2.408, + "step": 282 + }, + { + "epoch": 0.05304592314901593, + "grad_norm": 54095.69140625, + "learning_rate": 9.995398781691465e-05, + "loss": 2.3905, + "step": 283 + }, + { + "epoch": 0.05323336457357076, + "grad_norm": 49612.9921875, + "learning_rate": 9.995365016613599e-05, + "loss": 2.3522, + "step": 284 + }, + { + "epoch": 0.053420805998125584, + "grad_norm": 51749.25, + "learning_rate": 9.99533112815669e-05, + "loss": 2.3636, + "step": 285 + }, + { + "epoch": 0.05360824742268041, + "grad_norm": 56058.25390625, + "learning_rate": 9.99529711632158e-05, + "loss": 2.3301, + "step": 286 + }, + { + "epoch": 0.05379568884723524, + "grad_norm": 56039.6875, + "learning_rate": 9.995262981109108e-05, + "loss": 2.4161, + "step": 287 + }, + { + "epoch": 0.05398313027179007, + "grad_norm": 56877.2890625, + "learning_rate": 9.995228722520116e-05, + "loss": 2.3661, + "step": 288 + }, + { + "epoch": 0.05417057169634489, + "grad_norm": 55679.9140625, + "learning_rate": 9.995194340555452e-05, + "loss": 2.3297, + "step": 289 + }, + { + "epoch": 0.05435801312089972, + "grad_norm": 52346.0859375, + "learning_rate": 9.995159835215964e-05, + "loss": 2.3592, + "step": 290 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 57825.65234375, + "learning_rate": 9.995125206502503e-05, + "loss": 2.3813, + "step": 291 + }, + { + "epoch": 0.054732895970009375, + "grad_norm": 52579.96875, + "learning_rate": 9.995090454415928e-05, + "loss": 2.38, + "step": 292 + }, + { + "epoch": 0.0549203373945642, + "grad_norm": 57737.14453125, + "learning_rate": 9.995055578957093e-05, + "loss": 2.4214, + "step": 293 + }, + { + "epoch": 0.055107778819119026, + "grad_norm": 56139.484375, + "learning_rate": 9.995020580126861e-05, + "loss": 2.3129, + "step": 294 + }, + { + "epoch": 0.05529522024367385, + "grad_norm": 55527.0234375, + "learning_rate": 9.994985457926098e-05, + "loss": 2.3371, + "step": 295 + }, + { + "epoch": 0.055482661668228676, + "grad_norm": 56021.6328125, + "learning_rate": 9.994950212355671e-05, + "loss": 2.4544, + "step": 296 + }, + { + "epoch": 0.05567010309278351, + "grad_norm": 57452.34765625, + "learning_rate": 9.99491484341645e-05, + "loss": 2.3862, + "step": 297 + }, + { + "epoch": 0.055857544517338334, + "grad_norm": 49745.8046875, + "learning_rate": 9.994879351109307e-05, + "loss": 2.4281, + "step": 298 + }, + { + "epoch": 0.05604498594189316, + "grad_norm": 56498.8125, + "learning_rate": 9.99484373543512e-05, + "loss": 2.4653, + "step": 299 + }, + { + "epoch": 0.056232427366447985, + "grad_norm": 53576.8125, + "learning_rate": 9.994807996394768e-05, + "loss": 2.407, + "step": 300 + }, + { + "epoch": 0.05641986879100281, + "grad_norm": 54570.203125, + "learning_rate": 9.994772133989137e-05, + "loss": 2.3718, + "step": 301 + }, + { + "epoch": 0.056607310215557635, + "grad_norm": 53904.2109375, + "learning_rate": 9.994736148219109e-05, + "loss": 2.3508, + "step": 302 + }, + { + "epoch": 0.05679475164011247, + "grad_norm": 57780.93359375, + "learning_rate": 9.994700039085572e-05, + "loss": 2.3899, + "step": 303 + }, + { + "epoch": 0.05698219306466729, + "grad_norm": 53681.54296875, + "learning_rate": 9.994663806589421e-05, + "loss": 2.3162, + "step": 304 + }, + { + "epoch": 0.05716963448922212, + "grad_norm": 56787.56640625, + "learning_rate": 9.99462745073155e-05, + "loss": 2.3824, + "step": 305 + }, + { + "epoch": 0.05735707591377694, + "grad_norm": 54512.4375, + "learning_rate": 9.994590971512856e-05, + "loss": 2.2648, + "step": 306 + }, + { + "epoch": 0.05754451733833177, + "grad_norm": 54392.37890625, + "learning_rate": 9.994554368934241e-05, + "loss": 2.3449, + "step": 307 + }, + { + "epoch": 0.0577319587628866, + "grad_norm": 61179.5, + "learning_rate": 9.994517642996607e-05, + "loss": 2.4342, + "step": 308 + }, + { + "epoch": 0.057919400187441426, + "grad_norm": 58236.16796875, + "learning_rate": 9.994480793700867e-05, + "loss": 2.3642, + "step": 309 + }, + { + "epoch": 0.05810684161199625, + "grad_norm": 52332.57421875, + "learning_rate": 9.994443821047923e-05, + "loss": 2.3973, + "step": 310 + }, + { + "epoch": 0.05829428303655108, + "grad_norm": 55853.08203125, + "learning_rate": 9.994406725038694e-05, + "loss": 2.4341, + "step": 311 + }, + { + "epoch": 0.0584817244611059, + "grad_norm": 55106.01171875, + "learning_rate": 9.994369505674092e-05, + "loss": 2.4019, + "step": 312 + }, + { + "epoch": 0.05866916588566073, + "grad_norm": 59131.16796875, + "learning_rate": 9.99433216295504e-05, + "loss": 2.3581, + "step": 313 + }, + { + "epoch": 0.05885660731021556, + "grad_norm": 58752.55859375, + "learning_rate": 9.99429469688246e-05, + "loss": 2.4009, + "step": 314 + }, + { + "epoch": 0.059044048734770385, + "grad_norm": 52216.56640625, + "learning_rate": 9.994257107457275e-05, + "loss": 2.3945, + "step": 315 + }, + { + "epoch": 0.05923149015932521, + "grad_norm": 54277.41796875, + "learning_rate": 9.994219394680415e-05, + "loss": 2.3414, + "step": 316 + }, + { + "epoch": 0.059418931583880036, + "grad_norm": 63572.95703125, + "learning_rate": 9.994181558552809e-05, + "loss": 2.4147, + "step": 317 + }, + { + "epoch": 0.05960637300843486, + "grad_norm": 55354.17578125, + "learning_rate": 9.994143599075397e-05, + "loss": 2.3475, + "step": 318 + }, + { + "epoch": 0.05979381443298969, + "grad_norm": 56100.76953125, + "learning_rate": 9.994105516249111e-05, + "loss": 2.4469, + "step": 319 + }, + { + "epoch": 0.05998125585754452, + "grad_norm": 56605.125, + "learning_rate": 9.994067310074892e-05, + "loss": 2.3631, + "step": 320 + }, + { + "epoch": 0.060168697282099344, + "grad_norm": 54693.0546875, + "learning_rate": 9.994028980553688e-05, + "loss": 2.3486, + "step": 321 + }, + { + "epoch": 0.06035613870665417, + "grad_norm": 51518.359375, + "learning_rate": 9.993990527686442e-05, + "loss": 2.3774, + "step": 322 + }, + { + "epoch": 0.060543580131208995, + "grad_norm": 55894.5625, + "learning_rate": 9.993951951474105e-05, + "loss": 2.417, + "step": 323 + }, + { + "epoch": 0.06073102155576383, + "grad_norm": 53081.140625, + "learning_rate": 9.99391325191763e-05, + "loss": 2.3651, + "step": 324 + }, + { + "epoch": 0.06091846298031865, + "grad_norm": 53624.15234375, + "learning_rate": 9.993874429017972e-05, + "loss": 2.3459, + "step": 325 + }, + { + "epoch": 0.06110590440487348, + "grad_norm": 52071.5078125, + "learning_rate": 9.99383548277609e-05, + "loss": 2.3764, + "step": 326 + }, + { + "epoch": 0.0612933458294283, + "grad_norm": 54894.8046875, + "learning_rate": 9.993796413192946e-05, + "loss": 2.3773, + "step": 327 + }, + { + "epoch": 0.06148078725398313, + "grad_norm": 56816.296875, + "learning_rate": 9.993757220269506e-05, + "loss": 2.3772, + "step": 328 + }, + { + "epoch": 0.061668228678537954, + "grad_norm": 56251.8671875, + "learning_rate": 9.993717904006736e-05, + "loss": 2.4005, + "step": 329 + }, + { + "epoch": 0.061855670103092786, + "grad_norm": 52001.5, + "learning_rate": 9.993678464405611e-05, + "loss": 2.4067, + "step": 330 + }, + { + "epoch": 0.06204311152764761, + "grad_norm": 62330.93359375, + "learning_rate": 9.9936389014671e-05, + "loss": 2.3824, + "step": 331 + }, + { + "epoch": 0.062230552952202436, + "grad_norm": 53282.60546875, + "learning_rate": 9.993599215192181e-05, + "loss": 2.4601, + "step": 332 + }, + { + "epoch": 0.06241799437675726, + "grad_norm": 58652.1015625, + "learning_rate": 9.993559405581838e-05, + "loss": 2.4018, + "step": 333 + }, + { + "epoch": 0.06260543580131209, + "grad_norm": 54134.4921875, + "learning_rate": 9.993519472637052e-05, + "loss": 2.317, + "step": 334 + }, + { + "epoch": 0.06279287722586692, + "grad_norm": 61178.23046875, + "learning_rate": 9.993479416358809e-05, + "loss": 2.3852, + "step": 335 + }, + { + "epoch": 0.06298031865042174, + "grad_norm": 52942.45703125, + "learning_rate": 9.993439236748098e-05, + "loss": 2.3672, + "step": 336 + }, + { + "epoch": 0.06316776007497657, + "grad_norm": 54442.26953125, + "learning_rate": 9.993398933805912e-05, + "loss": 2.4033, + "step": 337 + }, + { + "epoch": 0.0633552014995314, + "grad_norm": 56633.33203125, + "learning_rate": 9.993358507533245e-05, + "loss": 2.3924, + "step": 338 + }, + { + "epoch": 0.06354264292408622, + "grad_norm": 55072.171875, + "learning_rate": 9.9933179579311e-05, + "loss": 2.2687, + "step": 339 + }, + { + "epoch": 0.06373008434864105, + "grad_norm": 61216.4921875, + "learning_rate": 9.993277285000473e-05, + "loss": 2.3985, + "step": 340 + }, + { + "epoch": 0.06391752577319587, + "grad_norm": 54184.9765625, + "learning_rate": 9.993236488742372e-05, + "loss": 2.3439, + "step": 341 + }, + { + "epoch": 0.0641049671977507, + "grad_norm": 59394.05078125, + "learning_rate": 9.993195569157803e-05, + "loss": 2.3781, + "step": 342 + }, + { + "epoch": 0.06429240862230554, + "grad_norm": 51647.390625, + "learning_rate": 9.993154526247778e-05, + "loss": 2.4438, + "step": 343 + }, + { + "epoch": 0.06447985004686035, + "grad_norm": 50680.51171875, + "learning_rate": 9.993113360013309e-05, + "loss": 2.3986, + "step": 344 + }, + { + "epoch": 0.06466729147141519, + "grad_norm": 54839.4609375, + "learning_rate": 9.993072070455414e-05, + "loss": 2.368, + "step": 345 + }, + { + "epoch": 0.06485473289597, + "grad_norm": 55551.20703125, + "learning_rate": 9.993030657575113e-05, + "loss": 2.3481, + "step": 346 + }, + { + "epoch": 0.06504217432052484, + "grad_norm": 50628.453125, + "learning_rate": 9.992989121373428e-05, + "loss": 2.3859, + "step": 347 + }, + { + "epoch": 0.06522961574507967, + "grad_norm": 54001.4453125, + "learning_rate": 9.992947461851385e-05, + "loss": 2.4298, + "step": 348 + }, + { + "epoch": 0.06541705716963449, + "grad_norm": 56767.44140625, + "learning_rate": 9.992905679010013e-05, + "loss": 2.3632, + "step": 349 + }, + { + "epoch": 0.06560449859418932, + "grad_norm": 55567.84375, + "learning_rate": 9.992863772850345e-05, + "loss": 2.3839, + "step": 350 + }, + { + "epoch": 0.06579194001874414, + "grad_norm": 52418.02734375, + "learning_rate": 9.992821743373413e-05, + "loss": 2.4355, + "step": 351 + }, + { + "epoch": 0.06597938144329897, + "grad_norm": 52187.86328125, + "learning_rate": 9.992779590580259e-05, + "loss": 2.3049, + "step": 352 + }, + { + "epoch": 0.06616682286785379, + "grad_norm": 49334.1953125, + "learning_rate": 9.992737314471923e-05, + "loss": 2.3379, + "step": 353 + }, + { + "epoch": 0.06635426429240862, + "grad_norm": 53703.55078125, + "learning_rate": 9.992694915049448e-05, + "loss": 2.333, + "step": 354 + }, + { + "epoch": 0.06654170571696345, + "grad_norm": 56501.0390625, + "learning_rate": 9.99265239231388e-05, + "loss": 2.4306, + "step": 355 + }, + { + "epoch": 0.06672914714151827, + "grad_norm": 59491.0234375, + "learning_rate": 9.992609746266273e-05, + "loss": 2.3206, + "step": 356 + }, + { + "epoch": 0.0669165885660731, + "grad_norm": 55500.375, + "learning_rate": 9.992566976907676e-05, + "loss": 2.3427, + "step": 357 + }, + { + "epoch": 0.06710402999062792, + "grad_norm": 54324.1328125, + "learning_rate": 9.99252408423915e-05, + "loss": 2.3908, + "step": 358 + }, + { + "epoch": 0.06729147141518275, + "grad_norm": 56693.5703125, + "learning_rate": 9.992481068261753e-05, + "loss": 2.381, + "step": 359 + }, + { + "epoch": 0.06747891283973759, + "grad_norm": 56514.67578125, + "learning_rate": 9.992437928976544e-05, + "loss": 2.3437, + "step": 360 + }, + { + "epoch": 0.0676663542642924, + "grad_norm": 55918.55078125, + "learning_rate": 9.99239466638459e-05, + "loss": 2.3636, + "step": 361 + }, + { + "epoch": 0.06785379568884724, + "grad_norm": 50573.1015625, + "learning_rate": 9.992351280486962e-05, + "loss": 2.358, + "step": 362 + }, + { + "epoch": 0.06804123711340206, + "grad_norm": 60392.8203125, + "learning_rate": 9.99230777128473e-05, + "loss": 2.5165, + "step": 363 + }, + { + "epoch": 0.06822867853795689, + "grad_norm": 51747.0625, + "learning_rate": 9.992264138778969e-05, + "loss": 2.4292, + "step": 364 + }, + { + "epoch": 0.06841611996251172, + "grad_norm": 52211.1171875, + "learning_rate": 9.992220382970756e-05, + "loss": 2.3745, + "step": 365 + }, + { + "epoch": 0.06860356138706654, + "grad_norm": 54136.91015625, + "learning_rate": 9.992176503861172e-05, + "loss": 2.4072, + "step": 366 + }, + { + "epoch": 0.06879100281162137, + "grad_norm": 51104.84375, + "learning_rate": 9.992132501451302e-05, + "loss": 2.4598, + "step": 367 + }, + { + "epoch": 0.06897844423617619, + "grad_norm": 52048.453125, + "learning_rate": 9.99208837574223e-05, + "loss": 2.3603, + "step": 368 + }, + { + "epoch": 0.06916588566073102, + "grad_norm": 59758.90625, + "learning_rate": 9.992044126735048e-05, + "loss": 2.3912, + "step": 369 + }, + { + "epoch": 0.06935332708528585, + "grad_norm": 51502.21875, + "learning_rate": 9.99199975443085e-05, + "loss": 2.419, + "step": 370 + }, + { + "epoch": 0.06954076850984067, + "grad_norm": 56997.8828125, + "learning_rate": 9.991955258830727e-05, + "loss": 2.3754, + "step": 371 + }, + { + "epoch": 0.0697282099343955, + "grad_norm": 57669.671875, + "learning_rate": 9.991910639935785e-05, + "loss": 2.3913, + "step": 372 + }, + { + "epoch": 0.06991565135895032, + "grad_norm": 54532.94921875, + "learning_rate": 9.99186589774712e-05, + "loss": 2.3614, + "step": 373 + }, + { + "epoch": 0.07010309278350516, + "grad_norm": 53183.0546875, + "learning_rate": 9.991821032265838e-05, + "loss": 2.3193, + "step": 374 + }, + { + "epoch": 0.07029053420805999, + "grad_norm": 54560.3359375, + "learning_rate": 9.99177604349305e-05, + "loss": 2.3211, + "step": 375 + }, + { + "epoch": 0.0704779756326148, + "grad_norm": 57518.97265625, + "learning_rate": 9.991730931429866e-05, + "loss": 2.3456, + "step": 376 + }, + { + "epoch": 0.07066541705716964, + "grad_norm": 55339.8125, + "learning_rate": 9.9916856960774e-05, + "loss": 2.3076, + "step": 377 + }, + { + "epoch": 0.07085285848172446, + "grad_norm": 53755.10546875, + "learning_rate": 9.991640337436769e-05, + "loss": 2.3462, + "step": 378 + }, + { + "epoch": 0.07104029990627929, + "grad_norm": 61188.44140625, + "learning_rate": 9.991594855509093e-05, + "loss": 2.3316, + "step": 379 + }, + { + "epoch": 0.07122774133083412, + "grad_norm": 64452.72265625, + "learning_rate": 9.991549250295496e-05, + "loss": 2.2866, + "step": 380 + }, + { + "epoch": 0.07141518275538894, + "grad_norm": 51022.30859375, + "learning_rate": 9.991503521797104e-05, + "loss": 2.345, + "step": 381 + }, + { + "epoch": 0.07160262417994377, + "grad_norm": 58102.09765625, + "learning_rate": 9.991457670015048e-05, + "loss": 2.4058, + "step": 382 + }, + { + "epoch": 0.07179006560449859, + "grad_norm": 57396.26953125, + "learning_rate": 9.991411694950457e-05, + "loss": 2.3915, + "step": 383 + }, + { + "epoch": 0.07197750702905342, + "grad_norm": 57496.26953125, + "learning_rate": 9.99136559660447e-05, + "loss": 2.3677, + "step": 384 + }, + { + "epoch": 0.07216494845360824, + "grad_norm": 53963.8671875, + "learning_rate": 9.991319374978223e-05, + "loss": 2.4302, + "step": 385 + }, + { + "epoch": 0.07235238987816307, + "grad_norm": 55466.10546875, + "learning_rate": 9.991273030072858e-05, + "loss": 2.3379, + "step": 386 + }, + { + "epoch": 0.0725398313027179, + "grad_norm": 52150.94140625, + "learning_rate": 9.991226561889523e-05, + "loss": 2.3658, + "step": 387 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 56857.50390625, + "learning_rate": 9.991179970429362e-05, + "loss": 2.3898, + "step": 388 + }, + { + "epoch": 0.07291471415182756, + "grad_norm": 54339.55859375, + "learning_rate": 9.991133255693526e-05, + "loss": 2.4262, + "step": 389 + }, + { + "epoch": 0.07310215557638237, + "grad_norm": 57257.37890625, + "learning_rate": 9.991086417683171e-05, + "loss": 2.3579, + "step": 390 + }, + { + "epoch": 0.0732895970009372, + "grad_norm": 56674.1640625, + "learning_rate": 9.991039456399451e-05, + "loss": 2.3636, + "step": 391 + }, + { + "epoch": 0.07347703842549204, + "grad_norm": 49498.8515625, + "learning_rate": 9.99099237184353e-05, + "loss": 2.3212, + "step": 392 + }, + { + "epoch": 0.07366447985004686, + "grad_norm": 54244.4609375, + "learning_rate": 9.990945164016566e-05, + "loss": 2.292, + "step": 393 + }, + { + "epoch": 0.07385192127460169, + "grad_norm": 54938.296875, + "learning_rate": 9.990897832919728e-05, + "loss": 2.4146, + "step": 394 + }, + { + "epoch": 0.07403936269915651, + "grad_norm": 59105.78515625, + "learning_rate": 9.990850378554184e-05, + "loss": 2.3344, + "step": 395 + }, + { + "epoch": 0.07422680412371134, + "grad_norm": 58463.57421875, + "learning_rate": 9.990802800921107e-05, + "loss": 2.3933, + "step": 396 + }, + { + "epoch": 0.07441424554826617, + "grad_norm": 54764.04296875, + "learning_rate": 9.990755100021672e-05, + "loss": 2.3648, + "step": 397 + }, + { + "epoch": 0.07460168697282099, + "grad_norm": 54502.53515625, + "learning_rate": 9.990707275857056e-05, + "loss": 2.3264, + "step": 398 + }, + { + "epoch": 0.07478912839737582, + "grad_norm": 52109.28125, + "learning_rate": 9.99065932842844e-05, + "loss": 2.3535, + "step": 399 + }, + { + "epoch": 0.07497656982193064, + "grad_norm": 52621.625, + "learning_rate": 9.99061125773701e-05, + "loss": 2.3563, + "step": 400 + }, + { + "epoch": 0.07516401124648547, + "grad_norm": 53377.3203125, + "learning_rate": 9.990563063783953e-05, + "loss": 2.4037, + "step": 401 + }, + { + "epoch": 0.0753514526710403, + "grad_norm": 55744.5546875, + "learning_rate": 9.990514746570458e-05, + "loss": 2.4129, + "step": 402 + }, + { + "epoch": 0.07553889409559512, + "grad_norm": 51056.0625, + "learning_rate": 9.990466306097719e-05, + "loss": 2.3164, + "step": 403 + }, + { + "epoch": 0.07572633552014996, + "grad_norm": 50392.51171875, + "learning_rate": 9.990417742366931e-05, + "loss": 2.3412, + "step": 404 + }, + { + "epoch": 0.07591377694470477, + "grad_norm": 52542.5078125, + "learning_rate": 9.990369055379298e-05, + "loss": 2.4202, + "step": 405 + }, + { + "epoch": 0.07610121836925961, + "grad_norm": 53119.4296875, + "learning_rate": 9.990320245136017e-05, + "loss": 2.3046, + "step": 406 + }, + { + "epoch": 0.07628865979381444, + "grad_norm": 49789.5390625, + "learning_rate": 9.990271311638297e-05, + "loss": 2.3203, + "step": 407 + }, + { + "epoch": 0.07647610121836926, + "grad_norm": 54207.875, + "learning_rate": 9.990222254887345e-05, + "loss": 2.4524, + "step": 408 + }, + { + "epoch": 0.07666354264292409, + "grad_norm": 52743.2109375, + "learning_rate": 9.990173074884374e-05, + "loss": 2.3964, + "step": 409 + }, + { + "epoch": 0.07685098406747891, + "grad_norm": 60829.1875, + "learning_rate": 9.990123771630597e-05, + "loss": 2.4424, + "step": 410 + }, + { + "epoch": 0.07703842549203374, + "grad_norm": 50786.8671875, + "learning_rate": 9.990074345127234e-05, + "loss": 2.4117, + "step": 411 + }, + { + "epoch": 0.07722586691658856, + "grad_norm": 51583.87109375, + "learning_rate": 9.990024795375504e-05, + "loss": 2.347, + "step": 412 + }, + { + "epoch": 0.07741330834114339, + "grad_norm": 51167.6328125, + "learning_rate": 9.989975122376629e-05, + "loss": 2.352, + "step": 413 + }, + { + "epoch": 0.07760074976569822, + "grad_norm": 51346.16796875, + "learning_rate": 9.98992532613184e-05, + "loss": 2.3838, + "step": 414 + }, + { + "epoch": 0.07778819119025304, + "grad_norm": 52432.87890625, + "learning_rate": 9.989875406642365e-05, + "loss": 2.3189, + "step": 415 + }, + { + "epoch": 0.07797563261480787, + "grad_norm": 60625.51953125, + "learning_rate": 9.989825363909437e-05, + "loss": 2.3876, + "step": 416 + }, + { + "epoch": 0.07816307403936269, + "grad_norm": 48965.703125, + "learning_rate": 9.989775197934291e-05, + "loss": 2.338, + "step": 417 + }, + { + "epoch": 0.07835051546391752, + "grad_norm": 53220.95703125, + "learning_rate": 9.989724908718168e-05, + "loss": 2.3565, + "step": 418 + }, + { + "epoch": 0.07853795688847236, + "grad_norm": 54826.234375, + "learning_rate": 9.989674496262309e-05, + "loss": 2.3402, + "step": 419 + }, + { + "epoch": 0.07872539831302718, + "grad_norm": 60618.703125, + "learning_rate": 9.989623960567957e-05, + "loss": 2.371, + "step": 420 + }, + { + "epoch": 0.07891283973758201, + "grad_norm": 57067.0234375, + "learning_rate": 9.989573301636366e-05, + "loss": 2.401, + "step": 421 + }, + { + "epoch": 0.07910028116213683, + "grad_norm": 53052.3828125, + "learning_rate": 9.989522519468781e-05, + "loss": 2.3637, + "step": 422 + }, + { + "epoch": 0.07928772258669166, + "grad_norm": 53654.0390625, + "learning_rate": 9.98947161406646e-05, + "loss": 2.3281, + "step": 423 + }, + { + "epoch": 0.07947516401124649, + "grad_norm": 55345.91796875, + "learning_rate": 9.989420585430657e-05, + "loss": 2.3788, + "step": 424 + }, + { + "epoch": 0.07966260543580131, + "grad_norm": 56549.31640625, + "learning_rate": 9.989369433562636e-05, + "loss": 2.3849, + "step": 425 + }, + { + "epoch": 0.07985004686035614, + "grad_norm": 55179.6328125, + "learning_rate": 9.989318158463658e-05, + "loss": 2.3917, + "step": 426 + }, + { + "epoch": 0.08003748828491096, + "grad_norm": 53676.9375, + "learning_rate": 9.989266760134992e-05, + "loss": 2.4255, + "step": 427 + }, + { + "epoch": 0.08022492970946579, + "grad_norm": 56187.2421875, + "learning_rate": 9.989215238577904e-05, + "loss": 2.3969, + "step": 428 + }, + { + "epoch": 0.08041237113402062, + "grad_norm": 55691.3515625, + "learning_rate": 9.989163593793669e-05, + "loss": 2.3953, + "step": 429 + }, + { + "epoch": 0.08059981255857544, + "grad_norm": 51104.109375, + "learning_rate": 9.989111825783562e-05, + "loss": 2.2956, + "step": 430 + }, + { + "epoch": 0.08078725398313027, + "grad_norm": 60236.0703125, + "learning_rate": 9.989059934548861e-05, + "loss": 2.4136, + "step": 431 + }, + { + "epoch": 0.0809746954076851, + "grad_norm": 56910.13671875, + "learning_rate": 9.989007920090848e-05, + "loss": 2.4073, + "step": 432 + }, + { + "epoch": 0.08116213683223993, + "grad_norm": 225526.90625, + "learning_rate": 9.988955782410808e-05, + "loss": 2.7459, + "step": 433 + }, + { + "epoch": 0.08134957825679476, + "grad_norm": 56382.07421875, + "learning_rate": 9.988903521510029e-05, + "loss": 2.3915, + "step": 434 + }, + { + "epoch": 0.08153701968134958, + "grad_norm": 57017.0546875, + "learning_rate": 9.9888511373898e-05, + "loss": 2.4198, + "step": 435 + }, + { + "epoch": 0.08172446110590441, + "grad_norm": 56532.34765625, + "learning_rate": 9.988798630051418e-05, + "loss": 2.3451, + "step": 436 + }, + { + "epoch": 0.08191190253045923, + "grad_norm": 54771.55078125, + "learning_rate": 9.988745999496175e-05, + "loss": 2.3766, + "step": 437 + }, + { + "epoch": 0.08209934395501406, + "grad_norm": 54534.3984375, + "learning_rate": 9.988693245725374e-05, + "loss": 2.386, + "step": 438 + }, + { + "epoch": 0.08228678537956889, + "grad_norm": 53064.125, + "learning_rate": 9.988640368740319e-05, + "loss": 2.4141, + "step": 439 + }, + { + "epoch": 0.08247422680412371, + "grad_norm": 54412.23828125, + "learning_rate": 9.988587368542315e-05, + "loss": 2.3097, + "step": 440 + }, + { + "epoch": 0.08266166822867854, + "grad_norm": 209439.140625, + "learning_rate": 9.988534245132672e-05, + "loss": 2.7387, + "step": 441 + }, + { + "epoch": 0.08284910965323336, + "grad_norm": 54931.80078125, + "learning_rate": 9.988480998512699e-05, + "loss": 2.3831, + "step": 442 + }, + { + "epoch": 0.08303655107778819, + "grad_norm": 53611.0390625, + "learning_rate": 9.988427628683713e-05, + "loss": 2.3682, + "step": 443 + }, + { + "epoch": 0.08322399250234301, + "grad_norm": 56770.671875, + "learning_rate": 9.988374135647032e-05, + "loss": 2.3578, + "step": 444 + }, + { + "epoch": 0.08341143392689784, + "grad_norm": 57080.09765625, + "learning_rate": 9.988320519403978e-05, + "loss": 2.3442, + "step": 445 + }, + { + "epoch": 0.08359887535145268, + "grad_norm": 57863.328125, + "learning_rate": 9.988266779955875e-05, + "loss": 2.3842, + "step": 446 + }, + { + "epoch": 0.0837863167760075, + "grad_norm": 54397.94140625, + "learning_rate": 9.98821291730405e-05, + "loss": 2.3222, + "step": 447 + }, + { + "epoch": 0.08397375820056233, + "grad_norm": 55913.79296875, + "learning_rate": 9.988158931449832e-05, + "loss": 2.4013, + "step": 448 + }, + { + "epoch": 0.08416119962511714, + "grad_norm": 53146.99609375, + "learning_rate": 9.988104822394558e-05, + "loss": 2.368, + "step": 449 + }, + { + "epoch": 0.08434864104967198, + "grad_norm": 53153.265625, + "learning_rate": 9.988050590139559e-05, + "loss": 2.3159, + "step": 450 + }, + { + "epoch": 0.08453608247422681, + "grad_norm": 59721.16015625, + "learning_rate": 9.987996234686178e-05, + "loss": 2.4517, + "step": 451 + }, + { + "epoch": 0.08472352389878163, + "grad_norm": 59321.140625, + "learning_rate": 9.987941756035758e-05, + "loss": 2.3232, + "step": 452 + }, + { + "epoch": 0.08491096532333646, + "grad_norm": 50206.46875, + "learning_rate": 9.987887154189644e-05, + "loss": 2.347, + "step": 453 + }, + { + "epoch": 0.08509840674789128, + "grad_norm": 55031.125, + "learning_rate": 9.987832429149185e-05, + "loss": 2.3531, + "step": 454 + }, + { + "epoch": 0.08528584817244611, + "grad_norm": 57415.6328125, + "learning_rate": 9.987777580915729e-05, + "loss": 2.3691, + "step": 455 + }, + { + "epoch": 0.08547328959700094, + "grad_norm": 55298.71875, + "learning_rate": 9.987722609490637e-05, + "loss": 2.4797, + "step": 456 + }, + { + "epoch": 0.08566073102155576, + "grad_norm": 51031.83984375, + "learning_rate": 9.987667514875261e-05, + "loss": 2.3666, + "step": 457 + }, + { + "epoch": 0.0858481724461106, + "grad_norm": 53559.65625, + "learning_rate": 9.987612297070964e-05, + "loss": 2.3316, + "step": 458 + }, + { + "epoch": 0.08603561387066541, + "grad_norm": 58186.01953125, + "learning_rate": 9.98755695607911e-05, + "loss": 2.3601, + "step": 459 + }, + { + "epoch": 0.08622305529522024, + "grad_norm": 51893.40234375, + "learning_rate": 9.987501491901066e-05, + "loss": 2.3584, + "step": 460 + }, + { + "epoch": 0.08641049671977508, + "grad_norm": 57353.234375, + "learning_rate": 9.9874459045382e-05, + "loss": 2.3394, + "step": 461 + }, + { + "epoch": 0.0865979381443299, + "grad_norm": 58828.44140625, + "learning_rate": 9.987390193991888e-05, + "loss": 2.3885, + "step": 462 + }, + { + "epoch": 0.08678537956888473, + "grad_norm": 53893.8125, + "learning_rate": 9.987334360263504e-05, + "loss": 2.38, + "step": 463 + }, + { + "epoch": 0.08697282099343955, + "grad_norm": 50097.27734375, + "learning_rate": 9.987278403354429e-05, + "loss": 2.3548, + "step": 464 + }, + { + "epoch": 0.08716026241799438, + "grad_norm": 56488.94140625, + "learning_rate": 9.98722232326604e-05, + "loss": 2.3494, + "step": 465 + }, + { + "epoch": 0.08734770384254921, + "grad_norm": 55661.1796875, + "learning_rate": 9.98716611999973e-05, + "loss": 2.3649, + "step": 466 + }, + { + "epoch": 0.08753514526710403, + "grad_norm": 51958.22265625, + "learning_rate": 9.98710979355688e-05, + "loss": 2.3702, + "step": 467 + }, + { + "epoch": 0.08772258669165886, + "grad_norm": 56066.7890625, + "learning_rate": 9.987053343938886e-05, + "loss": 2.3856, + "step": 468 + }, + { + "epoch": 0.08791002811621368, + "grad_norm": 55580.33203125, + "learning_rate": 9.986996771147138e-05, + "loss": 2.2943, + "step": 469 + }, + { + "epoch": 0.08809746954076851, + "grad_norm": 51453.54296875, + "learning_rate": 9.986940075183036e-05, + "loss": 2.3439, + "step": 470 + }, + { + "epoch": 0.08828491096532334, + "grad_norm": 56472.58984375, + "learning_rate": 9.986883256047981e-05, + "loss": 2.3687, + "step": 471 + }, + { + "epoch": 0.08847235238987816, + "grad_norm": 57994.203125, + "learning_rate": 9.986826313743373e-05, + "loss": 2.3857, + "step": 472 + }, + { + "epoch": 0.088659793814433, + "grad_norm": 53638.99609375, + "learning_rate": 9.986769248270623e-05, + "loss": 2.3536, + "step": 473 + }, + { + "epoch": 0.08884723523898781, + "grad_norm": 55343.1171875, + "learning_rate": 9.986712059631136e-05, + "loss": 2.4212, + "step": 474 + }, + { + "epoch": 0.08903467666354264, + "grad_norm": 54481.171875, + "learning_rate": 9.986654747826328e-05, + "loss": 2.3098, + "step": 475 + }, + { + "epoch": 0.08922211808809746, + "grad_norm": 53143.01953125, + "learning_rate": 9.986597312857613e-05, + "loss": 2.3904, + "step": 476 + }, + { + "epoch": 0.0894095595126523, + "grad_norm": 55481.890625, + "learning_rate": 9.986539754726409e-05, + "loss": 2.3508, + "step": 477 + }, + { + "epoch": 0.08959700093720713, + "grad_norm": 50469.2109375, + "learning_rate": 9.986482073434137e-05, + "loss": 2.2944, + "step": 478 + }, + { + "epoch": 0.08978444236176195, + "grad_norm": 55472.30078125, + "learning_rate": 9.986424268982225e-05, + "loss": 2.4116, + "step": 479 + }, + { + "epoch": 0.08997188378631678, + "grad_norm": 56855.27734375, + "learning_rate": 9.986366341372097e-05, + "loss": 2.3943, + "step": 480 + }, + { + "epoch": 0.0901593252108716, + "grad_norm": 52068.6015625, + "learning_rate": 9.986308290605186e-05, + "loss": 2.3429, + "step": 481 + }, + { + "epoch": 0.09034676663542643, + "grad_norm": 56099.76171875, + "learning_rate": 9.986250116682926e-05, + "loss": 2.3448, + "step": 482 + }, + { + "epoch": 0.09053420805998126, + "grad_norm": 52931.8828125, + "learning_rate": 9.986191819606752e-05, + "loss": 2.3517, + "step": 483 + }, + { + "epoch": 0.09072164948453608, + "grad_norm": 54566.02734375, + "learning_rate": 9.986133399378104e-05, + "loss": 2.3201, + "step": 484 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 56105.75, + "learning_rate": 9.986074855998426e-05, + "loss": 2.326, + "step": 485 + }, + { + "epoch": 0.09109653233364573, + "grad_norm": 53722.39453125, + "learning_rate": 9.986016189469165e-05, + "loss": 2.3823, + "step": 486 + }, + { + "epoch": 0.09128397375820056, + "grad_norm": 54957.32421875, + "learning_rate": 9.985957399791767e-05, + "loss": 2.3121, + "step": 487 + }, + { + "epoch": 0.0914714151827554, + "grad_norm": 58934.34765625, + "learning_rate": 9.985898486967687e-05, + "loss": 2.439, + "step": 488 + }, + { + "epoch": 0.09165885660731021, + "grad_norm": 56684.07421875, + "learning_rate": 9.985839450998377e-05, + "loss": 2.3998, + "step": 489 + }, + { + "epoch": 0.09184629803186505, + "grad_norm": 51585.8046875, + "learning_rate": 9.985780291885298e-05, + "loss": 2.3234, + "step": 490 + }, + { + "epoch": 0.09203373945641986, + "grad_norm": 58691.19921875, + "learning_rate": 9.98572100962991e-05, + "loss": 2.4343, + "step": 491 + }, + { + "epoch": 0.0922211808809747, + "grad_norm": 58554.84375, + "learning_rate": 9.985661604233676e-05, + "loss": 2.3693, + "step": 492 + }, + { + "epoch": 0.09240862230552953, + "grad_norm": 54025.01953125, + "learning_rate": 9.985602075698063e-05, + "loss": 2.4239, + "step": 493 + }, + { + "epoch": 0.09259606373008435, + "grad_norm": 56128.44140625, + "learning_rate": 9.985542424024547e-05, + "loss": 2.4398, + "step": 494 + }, + { + "epoch": 0.09278350515463918, + "grad_norm": 57821.05078125, + "learning_rate": 9.985482649214595e-05, + "loss": 2.3888, + "step": 495 + }, + { + "epoch": 0.092970946579194, + "grad_norm": 52948.66015625, + "learning_rate": 9.985422751269684e-05, + "loss": 2.307, + "step": 496 + }, + { + "epoch": 0.09315838800374883, + "grad_norm": 59615.40625, + "learning_rate": 9.985362730191296e-05, + "loss": 2.409, + "step": 497 + }, + { + "epoch": 0.09334582942830366, + "grad_norm": 54056.7265625, + "learning_rate": 9.985302585980911e-05, + "loss": 2.3547, + "step": 498 + }, + { + "epoch": 0.09353327085285848, + "grad_norm": 52298.17578125, + "learning_rate": 9.985242318640018e-05, + "loss": 2.352, + "step": 499 + }, + { + "epoch": 0.09372071227741331, + "grad_norm": 56259.984375, + "learning_rate": 9.985181928170102e-05, + "loss": 2.3806, + "step": 500 + }, + { + "epoch": 0.09372071227741331, + "eval_loss": 2.3493378162384033, + "eval_runtime": 143.6281, + "eval_samples_per_second": 35.153, + "eval_steps_per_second": 1.761, + "step": 500 + }, + { + "epoch": 0.09390815370196813, + "grad_norm": 57820.4609375, + "learning_rate": 9.985121414572655e-05, + "loss": 2.3847, + "step": 501 + }, + { + "epoch": 0.09409559512652296, + "grad_norm": 55511.85546875, + "learning_rate": 9.985060777849173e-05, + "loss": 2.3626, + "step": 502 + }, + { + "epoch": 0.0942830365510778, + "grad_norm": 54740.9765625, + "learning_rate": 9.985000018001152e-05, + "loss": 2.4134, + "step": 503 + }, + { + "epoch": 0.09447047797563261, + "grad_norm": 50027.83203125, + "learning_rate": 9.984939135030096e-05, + "loss": 2.3679, + "step": 504 + }, + { + "epoch": 0.09465791940018745, + "grad_norm": 50894.58984375, + "learning_rate": 9.984878128937504e-05, + "loss": 2.3877, + "step": 505 + }, + { + "epoch": 0.09484536082474226, + "grad_norm": 58994.51953125, + "learning_rate": 9.984816999724886e-05, + "loss": 2.3377, + "step": 506 + }, + { + "epoch": 0.0950328022492971, + "grad_norm": 55091.046875, + "learning_rate": 9.984755747393753e-05, + "loss": 2.398, + "step": 507 + }, + { + "epoch": 0.09522024367385191, + "grad_norm": 53871.6953125, + "learning_rate": 9.984694371945613e-05, + "loss": 2.4123, + "step": 508 + }, + { + "epoch": 0.09540768509840675, + "grad_norm": 54310.61328125, + "learning_rate": 9.984632873381986e-05, + "loss": 2.3393, + "step": 509 + }, + { + "epoch": 0.09559512652296158, + "grad_norm": 52799.8515625, + "learning_rate": 9.98457125170439e-05, + "loss": 2.4305, + "step": 510 + }, + { + "epoch": 0.0957825679475164, + "grad_norm": 55304.72265625, + "learning_rate": 9.984509506914345e-05, + "loss": 2.2644, + "step": 511 + }, + { + "epoch": 0.09597000937207123, + "grad_norm": 127304.859375, + "learning_rate": 9.98444763901338e-05, + "loss": 2.673, + "step": 512 + }, + { + "epoch": 0.09615745079662605, + "grad_norm": 55107.2890625, + "learning_rate": 9.984385648003021e-05, + "loss": 2.4174, + "step": 513 + }, + { + "epoch": 0.09634489222118088, + "grad_norm": 55814.08203125, + "learning_rate": 9.984323533884796e-05, + "loss": 2.35, + "step": 514 + }, + { + "epoch": 0.09653233364573571, + "grad_norm": 50223.5234375, + "learning_rate": 9.984261296660244e-05, + "loss": 2.3261, + "step": 515 + }, + { + "epoch": 0.09671977507029053, + "grad_norm": 51428.00390625, + "learning_rate": 9.984198936330899e-05, + "loss": 2.3927, + "step": 516 + }, + { + "epoch": 0.09690721649484536, + "grad_norm": 53094.328125, + "learning_rate": 9.984136452898304e-05, + "loss": 2.3506, + "step": 517 + }, + { + "epoch": 0.09709465791940018, + "grad_norm": 51771.51171875, + "learning_rate": 9.984073846363999e-05, + "loss": 2.3113, + "step": 518 + }, + { + "epoch": 0.09728209934395501, + "grad_norm": 51857.3984375, + "learning_rate": 9.984011116729533e-05, + "loss": 2.4106, + "step": 519 + }, + { + "epoch": 0.09746954076850985, + "grad_norm": 53460.5625, + "learning_rate": 9.983948263996454e-05, + "loss": 2.3942, + "step": 520 + }, + { + "epoch": 0.09765698219306466, + "grad_norm": 55041.31640625, + "learning_rate": 9.983885288166315e-05, + "loss": 2.3723, + "step": 521 + }, + { + "epoch": 0.0978444236176195, + "grad_norm": 64311.53515625, + "learning_rate": 9.98382218924067e-05, + "loss": 2.375, + "step": 522 + }, + { + "epoch": 0.09803186504217432, + "grad_norm": 50068.3046875, + "learning_rate": 9.98375896722108e-05, + "loss": 2.3965, + "step": 523 + }, + { + "epoch": 0.09821930646672915, + "grad_norm": 52362.3359375, + "learning_rate": 9.983695622109105e-05, + "loss": 2.3337, + "step": 524 + }, + { + "epoch": 0.09840674789128398, + "grad_norm": 52675.62109375, + "learning_rate": 9.983632153906309e-05, + "loss": 2.4073, + "step": 525 + }, + { + "epoch": 0.0985941893158388, + "grad_norm": 53218.3046875, + "learning_rate": 9.983568562614261e-05, + "loss": 2.3089, + "step": 526 + }, + { + "epoch": 0.09878163074039363, + "grad_norm": 59134.98828125, + "learning_rate": 9.983504848234529e-05, + "loss": 2.3222, + "step": 527 + }, + { + "epoch": 0.09896907216494845, + "grad_norm": 49028.25390625, + "learning_rate": 9.98344101076869e-05, + "loss": 2.4221, + "step": 528 + }, + { + "epoch": 0.09915651358950328, + "grad_norm": 51213.5625, + "learning_rate": 9.983377050218318e-05, + "loss": 2.3313, + "step": 529 + }, + { + "epoch": 0.09934395501405811, + "grad_norm": 56383.05859375, + "learning_rate": 9.983312966584995e-05, + "loss": 2.3005, + "step": 530 + }, + { + "epoch": 0.09953139643861293, + "grad_norm": 52961.8984375, + "learning_rate": 9.983248759870301e-05, + "loss": 2.393, + "step": 531 + }, + { + "epoch": 0.09971883786316776, + "grad_norm": 52204.03125, + "learning_rate": 9.983184430075825e-05, + "loss": 2.3695, + "step": 532 + }, + { + "epoch": 0.09990627928772258, + "grad_norm": 57108.0234375, + "learning_rate": 9.983119977203153e-05, + "loss": 2.3461, + "step": 533 + }, + { + "epoch": 0.10009372071227741, + "grad_norm": 56499.02734375, + "learning_rate": 9.98305540125388e-05, + "loss": 2.3121, + "step": 534 + }, + { + "epoch": 0.10028116213683223, + "grad_norm": 53852.05859375, + "learning_rate": 9.982990702229598e-05, + "loss": 2.3669, + "step": 535 + }, + { + "epoch": 0.10046860356138707, + "grad_norm": 49932.890625, + "learning_rate": 9.982925880131908e-05, + "loss": 2.3496, + "step": 536 + }, + { + "epoch": 0.1006560449859419, + "grad_norm": 51468.7578125, + "learning_rate": 9.982860934962408e-05, + "loss": 2.3418, + "step": 537 + }, + { + "epoch": 0.10084348641049672, + "grad_norm": 51040.8125, + "learning_rate": 9.982795866722703e-05, + "loss": 2.3903, + "step": 538 + }, + { + "epoch": 0.10103092783505155, + "grad_norm": 50249.85546875, + "learning_rate": 9.9827306754144e-05, + "loss": 2.4112, + "step": 539 + }, + { + "epoch": 0.10121836925960637, + "grad_norm": 53389.31640625, + "learning_rate": 9.98266536103911e-05, + "loss": 2.3177, + "step": 540 + }, + { + "epoch": 0.1014058106841612, + "grad_norm": 58171.6796875, + "learning_rate": 9.982599923598446e-05, + "loss": 2.3689, + "step": 541 + }, + { + "epoch": 0.10159325210871603, + "grad_norm": 53907.80078125, + "learning_rate": 9.982534363094023e-05, + "loss": 2.3373, + "step": 542 + }, + { + "epoch": 0.10178069353327085, + "grad_norm": 53855.5546875, + "learning_rate": 9.982468679527462e-05, + "loss": 2.3338, + "step": 543 + }, + { + "epoch": 0.10196813495782568, + "grad_norm": 50742.76953125, + "learning_rate": 9.982402872900385e-05, + "loss": 2.4311, + "step": 544 + }, + { + "epoch": 0.1021555763823805, + "grad_norm": 51443.453125, + "learning_rate": 9.982336943214416e-05, + "loss": 2.3826, + "step": 545 + }, + { + "epoch": 0.10234301780693533, + "grad_norm": 55413.4765625, + "learning_rate": 9.982270890471185e-05, + "loss": 2.3805, + "step": 546 + }, + { + "epoch": 0.10253045923149016, + "grad_norm": 51298.34375, + "learning_rate": 9.982204714672322e-05, + "loss": 2.3631, + "step": 547 + }, + { + "epoch": 0.10271790065604498, + "grad_norm": 52196.39453125, + "learning_rate": 9.982138415819461e-05, + "loss": 2.3013, + "step": 548 + }, + { + "epoch": 0.10290534208059982, + "grad_norm": 52279.1484375, + "learning_rate": 9.982071993914243e-05, + "loss": 2.3983, + "step": 549 + }, + { + "epoch": 0.10309278350515463, + "grad_norm": 52113.3046875, + "learning_rate": 9.982005448958304e-05, + "loss": 2.3304, + "step": 550 + }, + { + "epoch": 0.10328022492970947, + "grad_norm": 52075.4765625, + "learning_rate": 9.98193878095329e-05, + "loss": 2.3394, + "step": 551 + }, + { + "epoch": 0.1034676663542643, + "grad_norm": 63102.25, + "learning_rate": 9.981871989900849e-05, + "loss": 2.3416, + "step": 552 + }, + { + "epoch": 0.10365510777881912, + "grad_norm": 52128.90234375, + "learning_rate": 9.981805075802627e-05, + "loss": 2.3555, + "step": 553 + }, + { + "epoch": 0.10384254920337395, + "grad_norm": 58984.77734375, + "learning_rate": 9.98173803866028e-05, + "loss": 2.3835, + "step": 554 + }, + { + "epoch": 0.10402999062792877, + "grad_norm": 57072.25390625, + "learning_rate": 9.981670878475461e-05, + "loss": 2.3737, + "step": 555 + }, + { + "epoch": 0.1042174320524836, + "grad_norm": 57815.52734375, + "learning_rate": 9.981603595249831e-05, + "loss": 2.361, + "step": 556 + }, + { + "epoch": 0.10440487347703843, + "grad_norm": 52824.3984375, + "learning_rate": 9.98153618898505e-05, + "loss": 2.4025, + "step": 557 + }, + { + "epoch": 0.10459231490159325, + "grad_norm": 55537.88671875, + "learning_rate": 9.981468659682784e-05, + "loss": 2.3896, + "step": 558 + }, + { + "epoch": 0.10477975632614808, + "grad_norm": 52889.03515625, + "learning_rate": 9.981401007344702e-05, + "loss": 2.3577, + "step": 559 + }, + { + "epoch": 0.1049671977507029, + "grad_norm": 54462.02734375, + "learning_rate": 9.981333231972475e-05, + "loss": 2.3615, + "step": 560 + }, + { + "epoch": 0.10515463917525773, + "grad_norm": 53802.5546875, + "learning_rate": 9.981265333567773e-05, + "loss": 2.343, + "step": 561 + }, + { + "epoch": 0.10534208059981257, + "grad_norm": 54234.0234375, + "learning_rate": 9.981197312132275e-05, + "loss": 2.3663, + "step": 562 + }, + { + "epoch": 0.10552952202436738, + "grad_norm": 51147.09765625, + "learning_rate": 9.981129167667662e-05, + "loss": 2.3562, + "step": 563 + }, + { + "epoch": 0.10571696344892222, + "grad_norm": 51836.0, + "learning_rate": 9.981060900175618e-05, + "loss": 2.3864, + "step": 564 + }, + { + "epoch": 0.10590440487347703, + "grad_norm": 55283.5, + "learning_rate": 9.980992509657827e-05, + "loss": 2.3576, + "step": 565 + }, + { + "epoch": 0.10609184629803187, + "grad_norm": 56550.0078125, + "learning_rate": 9.980923996115978e-05, + "loss": 2.385, + "step": 566 + }, + { + "epoch": 0.10627928772258668, + "grad_norm": 52767.24609375, + "learning_rate": 9.980855359551766e-05, + "loss": 2.3913, + "step": 567 + }, + { + "epoch": 0.10646672914714152, + "grad_norm": 50916.046875, + "learning_rate": 9.980786599966883e-05, + "loss": 2.3743, + "step": 568 + }, + { + "epoch": 0.10665417057169635, + "grad_norm": 49759.77734375, + "learning_rate": 9.98071771736303e-05, + "loss": 2.3483, + "step": 569 + }, + { + "epoch": 0.10684161199625117, + "grad_norm": 56294.72265625, + "learning_rate": 9.980648711741906e-05, + "loss": 2.3587, + "step": 570 + }, + { + "epoch": 0.107029053420806, + "grad_norm": 56248.25390625, + "learning_rate": 9.980579583105217e-05, + "loss": 2.4269, + "step": 571 + }, + { + "epoch": 0.10721649484536082, + "grad_norm": 56178.4375, + "learning_rate": 9.98051033145467e-05, + "loss": 2.372, + "step": 572 + }, + { + "epoch": 0.10740393626991565, + "grad_norm": 52239.3515625, + "learning_rate": 9.980440956791974e-05, + "loss": 2.3858, + "step": 573 + }, + { + "epoch": 0.10759137769447048, + "grad_norm": 55848.56640625, + "learning_rate": 9.980371459118842e-05, + "loss": 2.3021, + "step": 574 + }, + { + "epoch": 0.1077788191190253, + "grad_norm": 46817.171875, + "learning_rate": 9.980301838436995e-05, + "loss": 2.3568, + "step": 575 + }, + { + "epoch": 0.10796626054358013, + "grad_norm": 58019.125, + "learning_rate": 9.980232094748149e-05, + "loss": 2.4639, + "step": 576 + }, + { + "epoch": 0.10815370196813495, + "grad_norm": 51165.8203125, + "learning_rate": 9.980162228054026e-05, + "loss": 2.3987, + "step": 577 + }, + { + "epoch": 0.10834114339268978, + "grad_norm": 55384.0625, + "learning_rate": 9.980092238356356e-05, + "loss": 2.4475, + "step": 578 + }, + { + "epoch": 0.10852858481724462, + "grad_norm": 58745.30859375, + "learning_rate": 9.980022125656861e-05, + "loss": 2.3955, + "step": 579 + }, + { + "epoch": 0.10871602624179943, + "grad_norm": 52023.015625, + "learning_rate": 9.979951889957278e-05, + "loss": 2.3776, + "step": 580 + }, + { + "epoch": 0.10890346766635427, + "grad_norm": 58673.15625, + "learning_rate": 9.979881531259338e-05, + "loss": 2.3786, + "step": 581 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 51953.171875, + "learning_rate": 9.979811049564784e-05, + "loss": 2.4115, + "step": 582 + }, + { + "epoch": 0.10927835051546392, + "grad_norm": 52650.33984375, + "learning_rate": 9.979740444875351e-05, + "loss": 2.3691, + "step": 583 + }, + { + "epoch": 0.10946579194001875, + "grad_norm": 57630.05078125, + "learning_rate": 9.979669717192786e-05, + "loss": 2.2905, + "step": 584 + }, + { + "epoch": 0.10965323336457357, + "grad_norm": 53730.546875, + "learning_rate": 9.979598866518836e-05, + "loss": 2.4262, + "step": 585 + }, + { + "epoch": 0.1098406747891284, + "grad_norm": 55385.2734375, + "learning_rate": 9.979527892855249e-05, + "loss": 2.4068, + "step": 586 + }, + { + "epoch": 0.11002811621368322, + "grad_norm": 53642.0625, + "learning_rate": 9.97945679620378e-05, + "loss": 2.3587, + "step": 587 + }, + { + "epoch": 0.11021555763823805, + "grad_norm": 54145.5390625, + "learning_rate": 9.979385576566182e-05, + "loss": 2.3527, + "step": 588 + }, + { + "epoch": 0.11040299906279288, + "grad_norm": 53472.50390625, + "learning_rate": 9.979314233944218e-05, + "loss": 2.4232, + "step": 589 + }, + { + "epoch": 0.1105904404873477, + "grad_norm": 56996.53125, + "learning_rate": 9.979242768339647e-05, + "loss": 2.3987, + "step": 590 + }, + { + "epoch": 0.11077788191190253, + "grad_norm": 56247.03515625, + "learning_rate": 9.979171179754237e-05, + "loss": 2.3671, + "step": 591 + }, + { + "epoch": 0.11096532333645735, + "grad_norm": 51614.26953125, + "learning_rate": 9.979099468189755e-05, + "loss": 2.3152, + "step": 592 + }, + { + "epoch": 0.11115276476101218, + "grad_norm": 53986.8125, + "learning_rate": 9.979027633647968e-05, + "loss": 2.3864, + "step": 593 + }, + { + "epoch": 0.11134020618556702, + "grad_norm": 53330.53125, + "learning_rate": 9.978955676130656e-05, + "loss": 2.364, + "step": 594 + }, + { + "epoch": 0.11152764761012184, + "grad_norm": 54948.1640625, + "learning_rate": 9.978883595639596e-05, + "loss": 2.3513, + "step": 595 + }, + { + "epoch": 0.11171508903467667, + "grad_norm": 52801.0625, + "learning_rate": 9.978811392176563e-05, + "loss": 2.4133, + "step": 596 + }, + { + "epoch": 0.11190253045923149, + "grad_norm": 54013.17578125, + "learning_rate": 9.978739065743346e-05, + "loss": 2.4008, + "step": 597 + }, + { + "epoch": 0.11208997188378632, + "grad_norm": 53320.8203125, + "learning_rate": 9.978666616341729e-05, + "loss": 2.2978, + "step": 598 + }, + { + "epoch": 0.11227741330834114, + "grad_norm": 54645.79296875, + "learning_rate": 9.9785940439735e-05, + "loss": 2.4011, + "step": 599 + }, + { + "epoch": 0.11246485473289597, + "grad_norm": 57282.3671875, + "learning_rate": 9.978521348640455e-05, + "loss": 2.3434, + "step": 600 + }, + { + "epoch": 0.1126522961574508, + "grad_norm": 58739.30078125, + "learning_rate": 9.978448530344387e-05, + "loss": 2.4068, + "step": 601 + }, + { + "epoch": 0.11283973758200562, + "grad_norm": 56266.13671875, + "learning_rate": 9.978375589087094e-05, + "loss": 2.381, + "step": 602 + }, + { + "epoch": 0.11302717900656045, + "grad_norm": 53347.9140625, + "learning_rate": 9.978302524870378e-05, + "loss": 2.2932, + "step": 603 + }, + { + "epoch": 0.11321462043111527, + "grad_norm": 64770.80859375, + "learning_rate": 9.978229337696046e-05, + "loss": 2.3941, + "step": 604 + }, + { + "epoch": 0.1134020618556701, + "grad_norm": 59043.2890625, + "learning_rate": 9.978156027565901e-05, + "loss": 2.3263, + "step": 605 + }, + { + "epoch": 0.11358950328022493, + "grad_norm": 56487.77734375, + "learning_rate": 9.97808259448176e-05, + "loss": 2.4016, + "step": 606 + }, + { + "epoch": 0.11377694470477975, + "grad_norm": 50725.06640625, + "learning_rate": 9.978009038445431e-05, + "loss": 2.3268, + "step": 607 + }, + { + "epoch": 0.11396438612933459, + "grad_norm": 52580.30859375, + "learning_rate": 9.977935359458732e-05, + "loss": 2.343, + "step": 608 + }, + { + "epoch": 0.1141518275538894, + "grad_norm": 61892.80078125, + "learning_rate": 9.977861557523485e-05, + "loss": 2.3974, + "step": 609 + }, + { + "epoch": 0.11433926897844424, + "grad_norm": 53365.51171875, + "learning_rate": 9.977787632641511e-05, + "loss": 2.4058, + "step": 610 + }, + { + "epoch": 0.11452671040299907, + "grad_norm": 52730.2734375, + "learning_rate": 9.977713584814636e-05, + "loss": 2.3469, + "step": 611 + }, + { + "epoch": 0.11471415182755389, + "grad_norm": 53538.796875, + "learning_rate": 9.97763941404469e-05, + "loss": 2.4259, + "step": 612 + }, + { + "epoch": 0.11490159325210872, + "grad_norm": 55681.04296875, + "learning_rate": 9.977565120333503e-05, + "loss": 2.3685, + "step": 613 + }, + { + "epoch": 0.11508903467666354, + "grad_norm": 50145.28515625, + "learning_rate": 9.977490703682912e-05, + "loss": 2.4134, + "step": 614 + }, + { + "epoch": 0.11527647610121837, + "grad_norm": 47268.96484375, + "learning_rate": 9.977416164094753e-05, + "loss": 2.371, + "step": 615 + }, + { + "epoch": 0.1154639175257732, + "grad_norm": 50587.83203125, + "learning_rate": 9.97734150157087e-05, + "loss": 2.3876, + "step": 616 + }, + { + "epoch": 0.11565135895032802, + "grad_norm": 56339.8984375, + "learning_rate": 9.977266716113103e-05, + "loss": 2.3765, + "step": 617 + }, + { + "epoch": 0.11583880037488285, + "grad_norm": 50227.79296875, + "learning_rate": 9.977191807723301e-05, + "loss": 2.3755, + "step": 618 + }, + { + "epoch": 0.11602624179943767, + "grad_norm": 55145.34375, + "learning_rate": 9.977116776403315e-05, + "loss": 2.3551, + "step": 619 + }, + { + "epoch": 0.1162136832239925, + "grad_norm": 55573.1015625, + "learning_rate": 9.977041622154998e-05, + "loss": 2.3237, + "step": 620 + }, + { + "epoch": 0.11640112464854734, + "grad_norm": 52967.2421875, + "learning_rate": 9.976966344980208e-05, + "loss": 2.3504, + "step": 621 + }, + { + "epoch": 0.11658856607310215, + "grad_norm": 56959.14453125, + "learning_rate": 9.9768909448808e-05, + "loss": 2.3231, + "step": 622 + }, + { + "epoch": 0.11677600749765699, + "grad_norm": 56904.3671875, + "learning_rate": 9.976815421858636e-05, + "loss": 2.262, + "step": 623 + }, + { + "epoch": 0.1169634489222118, + "grad_norm": 57802.5234375, + "learning_rate": 9.976739775915586e-05, + "loss": 2.3074, + "step": 624 + }, + { + "epoch": 0.11715089034676664, + "grad_norm": 100609.640625, + "learning_rate": 9.976664007053515e-05, + "loss": 2.5695, + "step": 625 + }, + { + "epoch": 0.11733833177132146, + "grad_norm": 85497.1171875, + "learning_rate": 9.976588115274297e-05, + "loss": 2.5891, + "step": 626 + }, + { + "epoch": 0.11752577319587629, + "grad_norm": 56392.64453125, + "learning_rate": 9.976512100579804e-05, + "loss": 2.3694, + "step": 627 + }, + { + "epoch": 0.11771321462043112, + "grad_norm": 60073.42578125, + "learning_rate": 9.976435962971913e-05, + "loss": 2.4074, + "step": 628 + }, + { + "epoch": 0.11790065604498594, + "grad_norm": 53501.36328125, + "learning_rate": 9.976359702452507e-05, + "loss": 2.3668, + "step": 629 + }, + { + "epoch": 0.11808809746954077, + "grad_norm": 56914.15234375, + "learning_rate": 9.976283319023469e-05, + "loss": 2.363, + "step": 630 + }, + { + "epoch": 0.11827553889409559, + "grad_norm": 55335.68359375, + "learning_rate": 9.976206812686683e-05, + "loss": 2.3852, + "step": 631 + }, + { + "epoch": 0.11846298031865042, + "grad_norm": 60068.53125, + "learning_rate": 9.976130183444041e-05, + "loss": 2.2523, + "step": 632 + }, + { + "epoch": 0.11865042174320525, + "grad_norm": 51455.9765625, + "learning_rate": 9.976053431297437e-05, + "loss": 2.3952, + "step": 633 + }, + { + "epoch": 0.11883786316776007, + "grad_norm": 55938.09765625, + "learning_rate": 9.975976556248762e-05, + "loss": 2.3711, + "step": 634 + }, + { + "epoch": 0.1190253045923149, + "grad_norm": 54424.4921875, + "learning_rate": 9.97589955829992e-05, + "loss": 2.3831, + "step": 635 + }, + { + "epoch": 0.11921274601686972, + "grad_norm": 159498.390625, + "learning_rate": 9.975822437452809e-05, + "loss": 2.4132, + "step": 636 + }, + { + "epoch": 0.11940018744142455, + "grad_norm": 53058.90234375, + "learning_rate": 9.975745193709335e-05, + "loss": 2.3877, + "step": 637 + }, + { + "epoch": 0.11958762886597939, + "grad_norm": 54673.04296875, + "learning_rate": 9.975667827071406e-05, + "loss": 2.3343, + "step": 638 + }, + { + "epoch": 0.1197750702905342, + "grad_norm": 64423.703125, + "learning_rate": 9.975590337540932e-05, + "loss": 2.3764, + "step": 639 + }, + { + "epoch": 0.11996251171508904, + "grad_norm": 59035.19921875, + "learning_rate": 9.975512725119829e-05, + "loss": 2.4355, + "step": 640 + }, + { + "epoch": 0.12014995313964386, + "grad_norm": 53586.98046875, + "learning_rate": 9.975434989810013e-05, + "loss": 2.3381, + "step": 641 + }, + { + "epoch": 0.12033739456419869, + "grad_norm": 54299.03125, + "learning_rate": 9.975357131613401e-05, + "loss": 2.4251, + "step": 642 + }, + { + "epoch": 0.12052483598875352, + "grad_norm": 54777.0390625, + "learning_rate": 9.97527915053192e-05, + "loss": 2.3516, + "step": 643 + }, + { + "epoch": 0.12071227741330834, + "grad_norm": 56238.84765625, + "learning_rate": 9.975201046567495e-05, + "loss": 2.3714, + "step": 644 + }, + { + "epoch": 0.12089971883786317, + "grad_norm": 60382.7109375, + "learning_rate": 9.975122819722053e-05, + "loss": 2.3473, + "step": 645 + }, + { + "epoch": 0.12108716026241799, + "grad_norm": 53288.98828125, + "learning_rate": 9.97504446999753e-05, + "loss": 2.3607, + "step": 646 + }, + { + "epoch": 0.12127460168697282, + "grad_norm": 60638.8984375, + "learning_rate": 9.974965997395858e-05, + "loss": 2.3885, + "step": 647 + }, + { + "epoch": 0.12146204311152765, + "grad_norm": 51828.484375, + "learning_rate": 9.974887401918975e-05, + "loss": 2.3482, + "step": 648 + }, + { + "epoch": 0.12164948453608247, + "grad_norm": 56463.43359375, + "learning_rate": 9.974808683568824e-05, + "loss": 2.4035, + "step": 649 + }, + { + "epoch": 0.1218369259606373, + "grad_norm": 52138.87890625, + "learning_rate": 9.974729842347348e-05, + "loss": 2.403, + "step": 650 + }, + { + "epoch": 0.12202436738519212, + "grad_norm": 53371.78125, + "learning_rate": 9.974650878256495e-05, + "loss": 2.3702, + "step": 651 + }, + { + "epoch": 0.12221180880974696, + "grad_norm": 51808.2265625, + "learning_rate": 9.974571791298216e-05, + "loss": 2.2647, + "step": 652 + }, + { + "epoch": 0.12239925023430179, + "grad_norm": 56664.87890625, + "learning_rate": 9.974492581474461e-05, + "loss": 2.3543, + "step": 653 + }, + { + "epoch": 0.1225866916588566, + "grad_norm": 55516.15234375, + "learning_rate": 9.97441324878719e-05, + "loss": 2.4371, + "step": 654 + }, + { + "epoch": 0.12277413308341144, + "grad_norm": 54509.8125, + "learning_rate": 9.974333793238362e-05, + "loss": 2.3447, + "step": 655 + }, + { + "epoch": 0.12296157450796626, + "grad_norm": 54534.99609375, + "learning_rate": 9.974254214829937e-05, + "loss": 2.3362, + "step": 656 + }, + { + "epoch": 0.12314901593252109, + "grad_norm": 53118.2421875, + "learning_rate": 9.974174513563883e-05, + "loss": 2.3127, + "step": 657 + }, + { + "epoch": 0.12333645735707591, + "grad_norm": 54488.34375, + "learning_rate": 9.974094689442168e-05, + "loss": 2.3414, + "step": 658 + }, + { + "epoch": 0.12352389878163074, + "grad_norm": 54719.38671875, + "learning_rate": 9.974014742466762e-05, + "loss": 2.3276, + "step": 659 + }, + { + "epoch": 0.12371134020618557, + "grad_norm": 53933.95703125, + "learning_rate": 9.973934672639642e-05, + "loss": 2.4014, + "step": 660 + }, + { + "epoch": 0.12389878163074039, + "grad_norm": 53589.7421875, + "learning_rate": 9.973854479962784e-05, + "loss": 2.4154, + "step": 661 + }, + { + "epoch": 0.12408622305529522, + "grad_norm": 52307.9609375, + "learning_rate": 9.97377416443817e-05, + "loss": 2.3877, + "step": 662 + }, + { + "epoch": 0.12427366447985004, + "grad_norm": 51141.16796875, + "learning_rate": 9.973693726067781e-05, + "loss": 2.3359, + "step": 663 + }, + { + "epoch": 0.12446110590440487, + "grad_norm": 55213.5859375, + "learning_rate": 9.973613164853606e-05, + "loss": 2.3666, + "step": 664 + }, + { + "epoch": 0.1246485473289597, + "grad_norm": 62373.55078125, + "learning_rate": 9.973532480797635e-05, + "loss": 2.2828, + "step": 665 + }, + { + "epoch": 0.12483598875351452, + "grad_norm": 59229.2890625, + "learning_rate": 9.973451673901859e-05, + "loss": 2.2624, + "step": 666 + }, + { + "epoch": 0.12502343017806936, + "grad_norm": 54348.69921875, + "learning_rate": 9.973370744168275e-05, + "loss": 2.3966, + "step": 667 + }, + { + "epoch": 0.12521087160262417, + "grad_norm": 54282.85546875, + "learning_rate": 9.973289691598881e-05, + "loss": 2.3783, + "step": 668 + }, + { + "epoch": 0.12539831302717902, + "grad_norm": 58391.31640625, + "learning_rate": 9.973208516195682e-05, + "loss": 2.3514, + "step": 669 + }, + { + "epoch": 0.12558575445173384, + "grad_norm": 54428.62890625, + "learning_rate": 9.973127217960679e-05, + "loss": 2.3571, + "step": 670 + }, + { + "epoch": 0.12577319587628866, + "grad_norm": 55444.953125, + "learning_rate": 9.973045796895881e-05, + "loss": 2.3387, + "step": 671 + }, + { + "epoch": 0.12596063730084348, + "grad_norm": 57777.85546875, + "learning_rate": 9.972964253003301e-05, + "loss": 2.4032, + "step": 672 + }, + { + "epoch": 0.12614807872539832, + "grad_norm": 54984.98046875, + "learning_rate": 9.972882586284951e-05, + "loss": 2.3512, + "step": 673 + }, + { + "epoch": 0.12633552014995314, + "grad_norm": 56883.984375, + "learning_rate": 9.972800796742847e-05, + "loss": 2.3937, + "step": 674 + }, + { + "epoch": 0.12652296157450796, + "grad_norm": 56033.31640625, + "learning_rate": 9.972718884379012e-05, + "loss": 2.3169, + "step": 675 + }, + { + "epoch": 0.1267104029990628, + "grad_norm": 52166.88671875, + "learning_rate": 9.972636849195467e-05, + "loss": 2.3584, + "step": 676 + }, + { + "epoch": 0.12689784442361762, + "grad_norm": 52006.08984375, + "learning_rate": 9.972554691194241e-05, + "loss": 2.2674, + "step": 677 + }, + { + "epoch": 0.12708528584817244, + "grad_norm": 53685.98046875, + "learning_rate": 9.972472410377359e-05, + "loss": 2.3569, + "step": 678 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 55503.90234375, + "learning_rate": 9.972390006746855e-05, + "loss": 2.4281, + "step": 679 + }, + { + "epoch": 0.1274601686972821, + "grad_norm": 53304.90625, + "learning_rate": 9.972307480304766e-05, + "loss": 2.3438, + "step": 680 + }, + { + "epoch": 0.12764761012183692, + "grad_norm": 52979.6171875, + "learning_rate": 9.972224831053128e-05, + "loss": 2.3357, + "step": 681 + }, + { + "epoch": 0.12783505154639174, + "grad_norm": 53333.01953125, + "learning_rate": 9.972142058993985e-05, + "loss": 2.2698, + "step": 682 + }, + { + "epoch": 0.1280224929709466, + "grad_norm": 59075.296875, + "learning_rate": 9.972059164129376e-05, + "loss": 2.4032, + "step": 683 + }, + { + "epoch": 0.1282099343955014, + "grad_norm": 58694.140625, + "learning_rate": 9.971976146461356e-05, + "loss": 2.3068, + "step": 684 + }, + { + "epoch": 0.12839737582005623, + "grad_norm": 50750.22265625, + "learning_rate": 9.97189300599197e-05, + "loss": 2.3758, + "step": 685 + }, + { + "epoch": 0.12858481724461107, + "grad_norm": 55264.7578125, + "learning_rate": 9.971809742723273e-05, + "loss": 2.3723, + "step": 686 + }, + { + "epoch": 0.1287722586691659, + "grad_norm": 53144.8125, + "learning_rate": 9.971726356657321e-05, + "loss": 2.4027, + "step": 687 + }, + { + "epoch": 0.1289597000937207, + "grad_norm": 51057.6640625, + "learning_rate": 9.971642847796175e-05, + "loss": 2.3663, + "step": 688 + }, + { + "epoch": 0.12914714151827553, + "grad_norm": 51874.8125, + "learning_rate": 9.971559216141894e-05, + "loss": 2.4173, + "step": 689 + }, + { + "epoch": 0.12933458294283037, + "grad_norm": 53193.9765625, + "learning_rate": 9.971475461696547e-05, + "loss": 2.3698, + "step": 690 + }, + { + "epoch": 0.1295220243673852, + "grad_norm": 58613.77734375, + "learning_rate": 9.971391584462204e-05, + "loss": 2.4409, + "step": 691 + }, + { + "epoch": 0.12970946579194, + "grad_norm": 56457.9375, + "learning_rate": 9.971307584440932e-05, + "loss": 2.2829, + "step": 692 + }, + { + "epoch": 0.12989690721649486, + "grad_norm": 54272.8828125, + "learning_rate": 9.971223461634809e-05, + "loss": 2.3334, + "step": 693 + }, + { + "epoch": 0.13008434864104967, + "grad_norm": 54307.5703125, + "learning_rate": 9.97113921604591e-05, + "loss": 2.3649, + "step": 694 + }, + { + "epoch": 0.1302717900656045, + "grad_norm": 51081.17578125, + "learning_rate": 9.97105484767632e-05, + "loss": 2.3047, + "step": 695 + }, + { + "epoch": 0.13045923149015934, + "grad_norm": 55196.015625, + "learning_rate": 9.970970356528119e-05, + "loss": 2.3415, + "step": 696 + }, + { + "epoch": 0.13064667291471416, + "grad_norm": 58044.61328125, + "learning_rate": 9.970885742603397e-05, + "loss": 2.4177, + "step": 697 + }, + { + "epoch": 0.13083411433926898, + "grad_norm": 54942.6171875, + "learning_rate": 9.97080100590424e-05, + "loss": 2.3911, + "step": 698 + }, + { + "epoch": 0.1310215557638238, + "grad_norm": 55480.71875, + "learning_rate": 9.970716146432744e-05, + "loss": 2.4068, + "step": 699 + }, + { + "epoch": 0.13120899718837864, + "grad_norm": 60810.5546875, + "learning_rate": 9.970631164191002e-05, + "loss": 2.3068, + "step": 700 + }, + { + "epoch": 0.13139643861293346, + "grad_norm": 56674.5078125, + "learning_rate": 9.970546059181116e-05, + "loss": 2.368, + "step": 701 + }, + { + "epoch": 0.13158388003748828, + "grad_norm": 52834.2109375, + "learning_rate": 9.970460831405189e-05, + "loss": 2.4968, + "step": 702 + }, + { + "epoch": 0.13177132146204312, + "grad_norm": 55247.72265625, + "learning_rate": 9.97037548086532e-05, + "loss": 2.3626, + "step": 703 + }, + { + "epoch": 0.13195876288659794, + "grad_norm": 49280.99609375, + "learning_rate": 9.970290007563623e-05, + "loss": 2.3509, + "step": 704 + }, + { + "epoch": 0.13214620431115276, + "grad_norm": 50736.54296875, + "learning_rate": 9.970204411502206e-05, + "loss": 2.3418, + "step": 705 + }, + { + "epoch": 0.13233364573570758, + "grad_norm": 53912.15234375, + "learning_rate": 9.970118692683184e-05, + "loss": 2.4396, + "step": 706 + }, + { + "epoch": 0.13252108716026242, + "grad_norm": 56376.21875, + "learning_rate": 9.970032851108675e-05, + "loss": 2.4256, + "step": 707 + }, + { + "epoch": 0.13270852858481724, + "grad_norm": 55379.8828125, + "learning_rate": 9.969946886780798e-05, + "loss": 2.4399, + "step": 708 + }, + { + "epoch": 0.13289597000937206, + "grad_norm": 55278.12109375, + "learning_rate": 9.969860799701675e-05, + "loss": 2.376, + "step": 709 + }, + { + "epoch": 0.1330834114339269, + "grad_norm": 55198.66015625, + "learning_rate": 9.969774589873436e-05, + "loss": 2.3708, + "step": 710 + }, + { + "epoch": 0.13327085285848173, + "grad_norm": 51671.97265625, + "learning_rate": 9.969688257298207e-05, + "loss": 2.398, + "step": 711 + }, + { + "epoch": 0.13345829428303654, + "grad_norm": 55208.92578125, + "learning_rate": 9.969601801978121e-05, + "loss": 2.3958, + "step": 712 + }, + { + "epoch": 0.1336457357075914, + "grad_norm": 49909.5546875, + "learning_rate": 9.969515223915314e-05, + "loss": 2.3735, + "step": 713 + }, + { + "epoch": 0.1338331771321462, + "grad_norm": 52870.3984375, + "learning_rate": 9.969428523111924e-05, + "loss": 2.2964, + "step": 714 + }, + { + "epoch": 0.13402061855670103, + "grad_norm": 53071.87109375, + "learning_rate": 9.969341699570095e-05, + "loss": 2.3168, + "step": 715 + }, + { + "epoch": 0.13420805998125585, + "grad_norm": 50687.70703125, + "learning_rate": 9.969254753291965e-05, + "loss": 2.3887, + "step": 716 + }, + { + "epoch": 0.1343955014058107, + "grad_norm": 51898.93359375, + "learning_rate": 9.969167684279686e-05, + "loss": 2.369, + "step": 717 + }, + { + "epoch": 0.1345829428303655, + "grad_norm": 53068.61328125, + "learning_rate": 9.96908049253541e-05, + "loss": 2.3386, + "step": 718 + }, + { + "epoch": 0.13477038425492033, + "grad_norm": 50191.36328125, + "learning_rate": 9.968993178061286e-05, + "loss": 2.3085, + "step": 719 + }, + { + "epoch": 0.13495782567947517, + "grad_norm": 55938.953125, + "learning_rate": 9.968905740859474e-05, + "loss": 2.3714, + "step": 720 + }, + { + "epoch": 0.13514526710403, + "grad_norm": 50153.9765625, + "learning_rate": 9.968818180932133e-05, + "loss": 2.3653, + "step": 721 + }, + { + "epoch": 0.1353327085285848, + "grad_norm": 58839.609375, + "learning_rate": 9.968730498281423e-05, + "loss": 2.3947, + "step": 722 + }, + { + "epoch": 0.13552014995313966, + "grad_norm": 52550.72265625, + "learning_rate": 9.968642692909514e-05, + "loss": 2.3595, + "step": 723 + }, + { + "epoch": 0.13570759137769448, + "grad_norm": 51420.69140625, + "learning_rate": 9.968554764818572e-05, + "loss": 2.3151, + "step": 724 + }, + { + "epoch": 0.1358950328022493, + "grad_norm": 50100.1171875, + "learning_rate": 9.968466714010767e-05, + "loss": 2.3672, + "step": 725 + }, + { + "epoch": 0.1360824742268041, + "grad_norm": 53507.8359375, + "learning_rate": 9.968378540488279e-05, + "loss": 2.3589, + "step": 726 + }, + { + "epoch": 0.13626991565135896, + "grad_norm": 54613.4765625, + "learning_rate": 9.968290244253281e-05, + "loss": 2.3277, + "step": 727 + }, + { + "epoch": 0.13645735707591378, + "grad_norm": 77761.953125, + "learning_rate": 9.968201825307956e-05, + "loss": 2.574, + "step": 728 + }, + { + "epoch": 0.1366447985004686, + "grad_norm": 56969.88671875, + "learning_rate": 9.968113283654486e-05, + "loss": 2.4308, + "step": 729 + }, + { + "epoch": 0.13683223992502344, + "grad_norm": 58032.546875, + "learning_rate": 9.968024619295061e-05, + "loss": 2.3854, + "step": 730 + }, + { + "epoch": 0.13701968134957826, + "grad_norm": 54077.66796875, + "learning_rate": 9.967935832231867e-05, + "loss": 2.4379, + "step": 731 + }, + { + "epoch": 0.13720712277413308, + "grad_norm": 50557.81640625, + "learning_rate": 9.967846922467102e-05, + "loss": 2.3453, + "step": 732 + }, + { + "epoch": 0.1373945641986879, + "grad_norm": 54706.73828125, + "learning_rate": 9.967757890002956e-05, + "loss": 2.3565, + "step": 733 + }, + { + "epoch": 0.13758200562324274, + "grad_norm": 52305.19921875, + "learning_rate": 9.967668734841632e-05, + "loss": 2.3856, + "step": 734 + }, + { + "epoch": 0.13776944704779756, + "grad_norm": 56949.984375, + "learning_rate": 9.967579456985331e-05, + "loss": 2.3548, + "step": 735 + }, + { + "epoch": 0.13795688847235238, + "grad_norm": 53168.5390625, + "learning_rate": 9.967490056436257e-05, + "loss": 2.3451, + "step": 736 + }, + { + "epoch": 0.13814432989690723, + "grad_norm": 55912.0625, + "learning_rate": 9.96740053319662e-05, + "loss": 2.4005, + "step": 737 + }, + { + "epoch": 0.13833177132146204, + "grad_norm": 57834.46484375, + "learning_rate": 9.96731088726863e-05, + "loss": 2.3605, + "step": 738 + }, + { + "epoch": 0.13851921274601686, + "grad_norm": 53416.47265625, + "learning_rate": 9.967221118654502e-05, + "loss": 2.3423, + "step": 739 + }, + { + "epoch": 0.1387066541705717, + "grad_norm": 52960.96875, + "learning_rate": 9.967131227356453e-05, + "loss": 2.3787, + "step": 740 + }, + { + "epoch": 0.13889409559512653, + "grad_norm": 55327.3125, + "learning_rate": 9.967041213376701e-05, + "loss": 2.4518, + "step": 741 + }, + { + "epoch": 0.13908153701968134, + "grad_norm": 50347.99609375, + "learning_rate": 9.966951076717471e-05, + "loss": 2.3698, + "step": 742 + }, + { + "epoch": 0.13926897844423616, + "grad_norm": 51426.25390625, + "learning_rate": 9.966860817380989e-05, + "loss": 2.3692, + "step": 743 + }, + { + "epoch": 0.139456419868791, + "grad_norm": 51554.0625, + "learning_rate": 9.966770435369486e-05, + "loss": 2.375, + "step": 744 + }, + { + "epoch": 0.13964386129334583, + "grad_norm": 54622.43359375, + "learning_rate": 9.966679930685193e-05, + "loss": 2.3645, + "step": 745 + }, + { + "epoch": 0.13983130271790065, + "grad_norm": 51576.8515625, + "learning_rate": 9.966589303330342e-05, + "loss": 2.393, + "step": 746 + }, + { + "epoch": 0.1400187441424555, + "grad_norm": 50169.5546875, + "learning_rate": 9.966498553307178e-05, + "loss": 2.3165, + "step": 747 + }, + { + "epoch": 0.1402061855670103, + "grad_norm": 57277.3203125, + "learning_rate": 9.966407680617936e-05, + "loss": 2.3359, + "step": 748 + }, + { + "epoch": 0.14039362699156513, + "grad_norm": 51897.546875, + "learning_rate": 9.966316685264865e-05, + "loss": 2.3669, + "step": 749 + }, + { + "epoch": 0.14058106841611998, + "grad_norm": 55287.62109375, + "learning_rate": 9.96622556725021e-05, + "loss": 2.3387, + "step": 750 + }, + { + "epoch": 0.1407685098406748, + "grad_norm": 55510.31640625, + "learning_rate": 9.966134326576223e-05, + "loss": 2.3611, + "step": 751 + }, + { + "epoch": 0.1409559512652296, + "grad_norm": 54189.48828125, + "learning_rate": 9.966042963245157e-05, + "loss": 2.436, + "step": 752 + }, + { + "epoch": 0.14114339268978443, + "grad_norm": 55980.16015625, + "learning_rate": 9.965951477259268e-05, + "loss": 2.3982, + "step": 753 + }, + { + "epoch": 0.14133083411433928, + "grad_norm": 55014.71875, + "learning_rate": 9.965859868620815e-05, + "loss": 2.2987, + "step": 754 + }, + { + "epoch": 0.1415182755388941, + "grad_norm": 50251.55859375, + "learning_rate": 9.965768137332064e-05, + "loss": 2.2838, + "step": 755 + }, + { + "epoch": 0.1417057169634489, + "grad_norm": 52825.25, + "learning_rate": 9.965676283395275e-05, + "loss": 2.3889, + "step": 756 + }, + { + "epoch": 0.14189315838800376, + "grad_norm": 57230.0546875, + "learning_rate": 9.965584306812723e-05, + "loss": 2.4288, + "step": 757 + }, + { + "epoch": 0.14208059981255858, + "grad_norm": 52830.94140625, + "learning_rate": 9.965492207586674e-05, + "loss": 2.4333, + "step": 758 + }, + { + "epoch": 0.1422680412371134, + "grad_norm": 58050.0078125, + "learning_rate": 9.965399985719406e-05, + "loss": 2.3117, + "step": 759 + }, + { + "epoch": 0.14245548266166824, + "grad_norm": 54099.75390625, + "learning_rate": 9.965307641213197e-05, + "loss": 2.3024, + "step": 760 + }, + { + "epoch": 0.14264292408622306, + "grad_norm": 55041.30859375, + "learning_rate": 9.965215174070326e-05, + "loss": 2.4534, + "step": 761 + }, + { + "epoch": 0.14283036551077788, + "grad_norm": 54109.328125, + "learning_rate": 9.965122584293078e-05, + "loss": 2.3539, + "step": 762 + }, + { + "epoch": 0.1430178069353327, + "grad_norm": 56334.87109375, + "learning_rate": 9.96502987188374e-05, + "loss": 2.3677, + "step": 763 + }, + { + "epoch": 0.14320524835988754, + "grad_norm": 52740.65625, + "learning_rate": 9.964937036844602e-05, + "loss": 2.3362, + "step": 764 + }, + { + "epoch": 0.14339268978444236, + "grad_norm": 54728.58984375, + "learning_rate": 9.964844079177955e-05, + "loss": 2.3609, + "step": 765 + }, + { + "epoch": 0.14358013120899718, + "grad_norm": 53807.46875, + "learning_rate": 9.964750998886096e-05, + "loss": 2.3278, + "step": 766 + }, + { + "epoch": 0.14376757263355203, + "grad_norm": 53625.52734375, + "learning_rate": 9.964657795971327e-05, + "loss": 2.3498, + "step": 767 + }, + { + "epoch": 0.14395501405810684, + "grad_norm": 52604.48828125, + "learning_rate": 9.964564470435944e-05, + "loss": 2.3318, + "step": 768 + }, + { + "epoch": 0.14414245548266166, + "grad_norm": 49596.484375, + "learning_rate": 9.964471022282255e-05, + "loss": 2.3429, + "step": 769 + }, + { + "epoch": 0.14432989690721648, + "grad_norm": 50873.00390625, + "learning_rate": 9.964377451512571e-05, + "loss": 2.3666, + "step": 770 + }, + { + "epoch": 0.14451733833177133, + "grad_norm": 56415.11328125, + "learning_rate": 9.964283758129198e-05, + "loss": 2.3388, + "step": 771 + }, + { + "epoch": 0.14470477975632615, + "grad_norm": 56017.484375, + "learning_rate": 9.964189942134455e-05, + "loss": 2.4302, + "step": 772 + }, + { + "epoch": 0.14489222118088096, + "grad_norm": 50375.28515625, + "learning_rate": 9.964096003530654e-05, + "loss": 2.3648, + "step": 773 + }, + { + "epoch": 0.1450796626054358, + "grad_norm": 52865.9765625, + "learning_rate": 9.96400194232012e-05, + "loss": 2.362, + "step": 774 + }, + { + "epoch": 0.14526710402999063, + "grad_norm": 67672.6171875, + "learning_rate": 9.963907758505172e-05, + "loss": 2.2858, + "step": 775 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 57055.7890625, + "learning_rate": 9.963813452088138e-05, + "loss": 2.4085, + "step": 776 + }, + { + "epoch": 0.1456419868791003, + "grad_norm": 51529.05078125, + "learning_rate": 9.96371902307135e-05, + "loss": 2.3896, + "step": 777 + }, + { + "epoch": 0.1458294283036551, + "grad_norm": 53236.140625, + "learning_rate": 9.963624471457135e-05, + "loss": 2.3762, + "step": 778 + }, + { + "epoch": 0.14601686972820993, + "grad_norm": 55608.25390625, + "learning_rate": 9.963529797247832e-05, + "loss": 2.3481, + "step": 779 + }, + { + "epoch": 0.14620431115276475, + "grad_norm": 52885.296875, + "learning_rate": 9.963435000445779e-05, + "loss": 2.3617, + "step": 780 + }, + { + "epoch": 0.1463917525773196, + "grad_norm": 50361.18359375, + "learning_rate": 9.963340081053316e-05, + "loss": 2.3347, + "step": 781 + }, + { + "epoch": 0.1465791940018744, + "grad_norm": 57246.15234375, + "learning_rate": 9.963245039072788e-05, + "loss": 2.3094, + "step": 782 + }, + { + "epoch": 0.14676663542642923, + "grad_norm": 52524.62109375, + "learning_rate": 9.963149874506544e-05, + "loss": 2.3925, + "step": 783 + }, + { + "epoch": 0.14695407685098408, + "grad_norm": 53733.65234375, + "learning_rate": 9.963054587356931e-05, + "loss": 2.241, + "step": 784 + }, + { + "epoch": 0.1471415182755389, + "grad_norm": 50366.84765625, + "learning_rate": 9.962959177626303e-05, + "loss": 2.3058, + "step": 785 + }, + { + "epoch": 0.14732895970009371, + "grad_norm": 54877.91796875, + "learning_rate": 9.96286364531702e-05, + "loss": 2.3149, + "step": 786 + }, + { + "epoch": 0.14751640112464856, + "grad_norm": 60299.48046875, + "learning_rate": 9.96276799043144e-05, + "loss": 2.4684, + "step": 787 + }, + { + "epoch": 0.14770384254920338, + "grad_norm": 56698.2578125, + "learning_rate": 9.962672212971925e-05, + "loss": 2.3799, + "step": 788 + }, + { + "epoch": 0.1478912839737582, + "grad_norm": 58089.625, + "learning_rate": 9.962576312940839e-05, + "loss": 2.4118, + "step": 789 + }, + { + "epoch": 0.14807872539831302, + "grad_norm": 57517.83984375, + "learning_rate": 9.962480290340553e-05, + "loss": 2.4132, + "step": 790 + }, + { + "epoch": 0.14826616682286786, + "grad_norm": 53452.1796875, + "learning_rate": 9.962384145173437e-05, + "loss": 2.3483, + "step": 791 + }, + { + "epoch": 0.14845360824742268, + "grad_norm": 50664.19140625, + "learning_rate": 9.962287877441867e-05, + "loss": 2.2789, + "step": 792 + }, + { + "epoch": 0.1486410496719775, + "grad_norm": 50860.62890625, + "learning_rate": 9.96219148714822e-05, + "loss": 2.2493, + "step": 793 + }, + { + "epoch": 0.14882849109653234, + "grad_norm": 53775.05078125, + "learning_rate": 9.962094974294878e-05, + "loss": 2.3583, + "step": 794 + }, + { + "epoch": 0.14901593252108716, + "grad_norm": 56342.55859375, + "learning_rate": 9.961998338884222e-05, + "loss": 2.3408, + "step": 795 + }, + { + "epoch": 0.14920337394564198, + "grad_norm": 51114.75390625, + "learning_rate": 9.961901580918641e-05, + "loss": 2.4169, + "step": 796 + }, + { + "epoch": 0.1493908153701968, + "grad_norm": 55381.6796875, + "learning_rate": 9.961804700400523e-05, + "loss": 2.3854, + "step": 797 + }, + { + "epoch": 0.14957825679475165, + "grad_norm": 52674.90234375, + "learning_rate": 9.961707697332265e-05, + "loss": 2.3471, + "step": 798 + }, + { + "epoch": 0.14976569821930646, + "grad_norm": 56571.1640625, + "learning_rate": 9.961610571716258e-05, + "loss": 2.3542, + "step": 799 + }, + { + "epoch": 0.14995313964386128, + "grad_norm": 53205.3515625, + "learning_rate": 9.961513323554902e-05, + "loss": 2.4237, + "step": 800 + }, + { + "epoch": 0.15014058106841613, + "grad_norm": 56469.515625, + "learning_rate": 9.961415952850599e-05, + "loss": 2.3643, + "step": 801 + }, + { + "epoch": 0.15032802249297095, + "grad_norm": 59646.05859375, + "learning_rate": 9.961318459605758e-05, + "loss": 2.3808, + "step": 802 + }, + { + "epoch": 0.15051546391752577, + "grad_norm": 53972.8515625, + "learning_rate": 9.96122084382278e-05, + "loss": 2.3257, + "step": 803 + }, + { + "epoch": 0.1507029053420806, + "grad_norm": 52781.94921875, + "learning_rate": 9.96112310550408e-05, + "loss": 2.4131, + "step": 804 + }, + { + "epoch": 0.15089034676663543, + "grad_norm": 48387.67578125, + "learning_rate": 9.961025244652074e-05, + "loss": 2.3912, + "step": 805 + }, + { + "epoch": 0.15107778819119025, + "grad_norm": 48117.4453125, + "learning_rate": 9.960927261269173e-05, + "loss": 2.3337, + "step": 806 + }, + { + "epoch": 0.15126522961574507, + "grad_norm": 60362.05078125, + "learning_rate": 9.960829155357803e-05, + "loss": 2.3532, + "step": 807 + }, + { + "epoch": 0.1514526710402999, + "grad_norm": 54144.57421875, + "learning_rate": 9.960730926920383e-05, + "loss": 2.4087, + "step": 808 + }, + { + "epoch": 0.15164011246485473, + "grad_norm": 51685.0859375, + "learning_rate": 9.960632575959344e-05, + "loss": 2.3256, + "step": 809 + }, + { + "epoch": 0.15182755388940955, + "grad_norm": 54358.9453125, + "learning_rate": 9.960534102477108e-05, + "loss": 2.3597, + "step": 810 + }, + { + "epoch": 0.1520149953139644, + "grad_norm": 58133.30859375, + "learning_rate": 9.960435506476114e-05, + "loss": 2.3477, + "step": 811 + }, + { + "epoch": 0.15220243673851921, + "grad_norm": 55189.20703125, + "learning_rate": 9.960336787958793e-05, + "loss": 2.4235, + "step": 812 + }, + { + "epoch": 0.15238987816307403, + "grad_norm": 53221.83984375, + "learning_rate": 9.960237946927584e-05, + "loss": 2.4311, + "step": 813 + }, + { + "epoch": 0.15257731958762888, + "grad_norm": 56093.171875, + "learning_rate": 9.96013898338493e-05, + "loss": 2.3731, + "step": 814 + }, + { + "epoch": 0.1527647610121837, + "grad_norm": 53540.33984375, + "learning_rate": 9.960039897333273e-05, + "loss": 2.319, + "step": 815 + }, + { + "epoch": 0.15295220243673852, + "grad_norm": 57148.203125, + "learning_rate": 9.959940688775062e-05, + "loss": 2.3961, + "step": 816 + }, + { + "epoch": 0.15313964386129333, + "grad_norm": 51464.2265625, + "learning_rate": 9.959841357712746e-05, + "loss": 2.3086, + "step": 817 + }, + { + "epoch": 0.15332708528584818, + "grad_norm": 52429.7109375, + "learning_rate": 9.959741904148778e-05, + "loss": 2.37, + "step": 818 + }, + { + "epoch": 0.153514526710403, + "grad_norm": 57571.02734375, + "learning_rate": 9.959642328085618e-05, + "loss": 2.3614, + "step": 819 + }, + { + "epoch": 0.15370196813495782, + "grad_norm": 54554.3125, + "learning_rate": 9.959542629525722e-05, + "loss": 2.3513, + "step": 820 + }, + { + "epoch": 0.15388940955951266, + "grad_norm": 56006.8125, + "learning_rate": 9.959442808471551e-05, + "loss": 2.393, + "step": 821 + }, + { + "epoch": 0.15407685098406748, + "grad_norm": 51568.85546875, + "learning_rate": 9.959342864925576e-05, + "loss": 2.402, + "step": 822 + }, + { + "epoch": 0.1542642924086223, + "grad_norm": 49120.55078125, + "learning_rate": 9.959242798890261e-05, + "loss": 2.3698, + "step": 823 + }, + { + "epoch": 0.15445173383317712, + "grad_norm": 55603.96484375, + "learning_rate": 9.959142610368077e-05, + "loss": 2.3692, + "step": 824 + }, + { + "epoch": 0.15463917525773196, + "grad_norm": 51255.41015625, + "learning_rate": 9.9590422993615e-05, + "loss": 2.394, + "step": 825 + }, + { + "epoch": 0.15482661668228678, + "grad_norm": 53189.18359375, + "learning_rate": 9.95894186587301e-05, + "loss": 2.3189, + "step": 826 + }, + { + "epoch": 0.1550140581068416, + "grad_norm": 55906.00390625, + "learning_rate": 9.958841309905085e-05, + "loss": 2.3936, + "step": 827 + }, + { + "epoch": 0.15520149953139645, + "grad_norm": 50908.3828125, + "learning_rate": 9.958740631460207e-05, + "loss": 2.3173, + "step": 828 + }, + { + "epoch": 0.15538894095595127, + "grad_norm": 54882.46875, + "learning_rate": 9.958639830540865e-05, + "loss": 2.3922, + "step": 829 + }, + { + "epoch": 0.15557638238050608, + "grad_norm": 53820.15625, + "learning_rate": 9.95853890714955e-05, + "loss": 2.3029, + "step": 830 + }, + { + "epoch": 0.15576382380506093, + "grad_norm": 51760.984375, + "learning_rate": 9.95843786128875e-05, + "loss": 2.3407, + "step": 831 + }, + { + "epoch": 0.15595126522961575, + "grad_norm": 49765.55078125, + "learning_rate": 9.958336692960966e-05, + "loss": 2.2865, + "step": 832 + }, + { + "epoch": 0.15613870665417057, + "grad_norm": 52350.19921875, + "learning_rate": 9.958235402168693e-05, + "loss": 2.3479, + "step": 833 + }, + { + "epoch": 0.15632614807872539, + "grad_norm": 52677.140625, + "learning_rate": 9.958133988914436e-05, + "loss": 2.3325, + "step": 834 + }, + { + "epoch": 0.15651358950328023, + "grad_norm": 48056.88671875, + "learning_rate": 9.958032453200695e-05, + "loss": 2.3918, + "step": 835 + }, + { + "epoch": 0.15670103092783505, + "grad_norm": 54672.73828125, + "learning_rate": 9.957930795029981e-05, + "loss": 2.342, + "step": 836 + }, + { + "epoch": 0.15688847235238987, + "grad_norm": 52731.68359375, + "learning_rate": 9.957829014404806e-05, + "loss": 2.3186, + "step": 837 + }, + { + "epoch": 0.15707591377694471, + "grad_norm": 58930.85546875, + "learning_rate": 9.957727111327682e-05, + "loss": 2.3461, + "step": 838 + }, + { + "epoch": 0.15726335520149953, + "grad_norm": 52995.1640625, + "learning_rate": 9.957625085801125e-05, + "loss": 2.3358, + "step": 839 + }, + { + "epoch": 0.15745079662605435, + "grad_norm": 52610.66796875, + "learning_rate": 9.957522937827657e-05, + "loss": 2.3497, + "step": 840 + }, + { + "epoch": 0.1576382380506092, + "grad_norm": 48724.76171875, + "learning_rate": 9.9574206674098e-05, + "loss": 2.3597, + "step": 841 + }, + { + "epoch": 0.15782567947516402, + "grad_norm": 59505.51953125, + "learning_rate": 9.957318274550078e-05, + "loss": 2.4048, + "step": 842 + }, + { + "epoch": 0.15801312089971883, + "grad_norm": 56611.125, + "learning_rate": 9.957215759251024e-05, + "loss": 2.3517, + "step": 843 + }, + { + "epoch": 0.15820056232427365, + "grad_norm": 57170.1796875, + "learning_rate": 9.957113121515167e-05, + "loss": 2.3867, + "step": 844 + }, + { + "epoch": 0.1583880037488285, + "grad_norm": 54504.078125, + "learning_rate": 9.957010361345043e-05, + "loss": 2.3169, + "step": 845 + }, + { + "epoch": 0.15857544517338332, + "grad_norm": 48961.46875, + "learning_rate": 9.95690747874319e-05, + "loss": 2.3466, + "step": 846 + }, + { + "epoch": 0.15876288659793814, + "grad_norm": 56648.2265625, + "learning_rate": 9.956804473712148e-05, + "loss": 2.3205, + "step": 847 + }, + { + "epoch": 0.15895032802249298, + "grad_norm": 60193.5, + "learning_rate": 9.956701346254464e-05, + "loss": 2.3278, + "step": 848 + }, + { + "epoch": 0.1591377694470478, + "grad_norm": 54805.0703125, + "learning_rate": 9.956598096372682e-05, + "loss": 2.3798, + "step": 849 + }, + { + "epoch": 0.15932521087160262, + "grad_norm": 54374.90625, + "learning_rate": 9.956494724069354e-05, + "loss": 2.3645, + "step": 850 + }, + { + "epoch": 0.15951265229615746, + "grad_norm": 48109.78125, + "learning_rate": 9.956391229347032e-05, + "loss": 2.2936, + "step": 851 + }, + { + "epoch": 0.15970009372071228, + "grad_norm": 53837.1484375, + "learning_rate": 9.956287612208274e-05, + "loss": 2.3596, + "step": 852 + }, + { + "epoch": 0.1598875351452671, + "grad_norm": 53795.34375, + "learning_rate": 9.956183872655636e-05, + "loss": 2.4069, + "step": 853 + }, + { + "epoch": 0.16007497656982192, + "grad_norm": 52913.52734375, + "learning_rate": 9.956080010691684e-05, + "loss": 2.3377, + "step": 854 + }, + { + "epoch": 0.16026241799437677, + "grad_norm": 48135.35546875, + "learning_rate": 9.95597602631898e-05, + "loss": 2.386, + "step": 855 + }, + { + "epoch": 0.16044985941893158, + "grad_norm": 51964.11328125, + "learning_rate": 9.955871919540094e-05, + "loss": 2.3564, + "step": 856 + }, + { + "epoch": 0.1606373008434864, + "grad_norm": 53876.37109375, + "learning_rate": 9.955767690357597e-05, + "loss": 2.3873, + "step": 857 + }, + { + "epoch": 0.16082474226804125, + "grad_norm": 58111.57421875, + "learning_rate": 9.955663338774063e-05, + "loss": 2.4291, + "step": 858 + }, + { + "epoch": 0.16101218369259607, + "grad_norm": 52216.69140625, + "learning_rate": 9.95555886479207e-05, + "loss": 2.3662, + "step": 859 + }, + { + "epoch": 0.16119962511715089, + "grad_norm": 53052.859375, + "learning_rate": 9.955454268414197e-05, + "loss": 2.3694, + "step": 860 + }, + { + "epoch": 0.1613870665417057, + "grad_norm": 50975.06640625, + "learning_rate": 9.95534954964303e-05, + "loss": 2.3306, + "step": 861 + }, + { + "epoch": 0.16157450796626055, + "grad_norm": 52550.58984375, + "learning_rate": 9.955244708481153e-05, + "loss": 2.3256, + "step": 862 + }, + { + "epoch": 0.16176194939081537, + "grad_norm": 54151.5078125, + "learning_rate": 9.955139744931156e-05, + "loss": 2.3082, + "step": 863 + }, + { + "epoch": 0.1619493908153702, + "grad_norm": 65633.2265625, + "learning_rate": 9.955034658995633e-05, + "loss": 2.3645, + "step": 864 + }, + { + "epoch": 0.16213683223992503, + "grad_norm": 51952.87890625, + "learning_rate": 9.954929450677177e-05, + "loss": 2.3112, + "step": 865 + }, + { + "epoch": 0.16232427366447985, + "grad_norm": 56279.94140625, + "learning_rate": 9.954824119978389e-05, + "loss": 2.4215, + "step": 866 + }, + { + "epoch": 0.16251171508903467, + "grad_norm": 52234.62109375, + "learning_rate": 9.954718666901868e-05, + "loss": 2.3496, + "step": 867 + }, + { + "epoch": 0.16269915651358952, + "grad_norm": 52637.3515625, + "learning_rate": 9.95461309145022e-05, + "loss": 2.3903, + "step": 868 + }, + { + "epoch": 0.16288659793814433, + "grad_norm": 52761.84375, + "learning_rate": 9.954507393626054e-05, + "loss": 2.2924, + "step": 869 + }, + { + "epoch": 0.16307403936269915, + "grad_norm": 54614.7265625, + "learning_rate": 9.954401573431977e-05, + "loss": 2.3607, + "step": 870 + }, + { + "epoch": 0.16326148078725397, + "grad_norm": 56626.86328125, + "learning_rate": 9.954295630870604e-05, + "loss": 2.339, + "step": 871 + }, + { + "epoch": 0.16344892221180882, + "grad_norm": 55792.32421875, + "learning_rate": 9.954189565944554e-05, + "loss": 2.3146, + "step": 872 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 51895.0234375, + "learning_rate": 9.954083378656445e-05, + "loss": 2.4754, + "step": 873 + }, + { + "epoch": 0.16382380506091845, + "grad_norm": 51338.94140625, + "learning_rate": 9.953977069008899e-05, + "loss": 2.3975, + "step": 874 + }, + { + "epoch": 0.1640112464854733, + "grad_norm": 53117.2421875, + "learning_rate": 9.95387063700454e-05, + "loss": 2.3205, + "step": 875 + }, + { + "epoch": 0.16419868791002812, + "grad_norm": 50997.6015625, + "learning_rate": 9.953764082646003e-05, + "loss": 2.3664, + "step": 876 + }, + { + "epoch": 0.16438612933458294, + "grad_norm": 53246.90234375, + "learning_rate": 9.953657405935915e-05, + "loss": 2.3955, + "step": 877 + }, + { + "epoch": 0.16457357075913778, + "grad_norm": 53339.5625, + "learning_rate": 9.953550606876909e-05, + "loss": 2.323, + "step": 878 + }, + { + "epoch": 0.1647610121836926, + "grad_norm": 53207.60546875, + "learning_rate": 9.953443685471629e-05, + "loss": 2.3591, + "step": 879 + }, + { + "epoch": 0.16494845360824742, + "grad_norm": 50868.89453125, + "learning_rate": 9.95333664172271e-05, + "loss": 2.3452, + "step": 880 + }, + { + "epoch": 0.16513589503280224, + "grad_norm": 54867.25, + "learning_rate": 9.953229475632797e-05, + "loss": 2.4619, + "step": 881 + }, + { + "epoch": 0.16532333645735708, + "grad_norm": 54247.53515625, + "learning_rate": 9.95312218720454e-05, + "loss": 2.3496, + "step": 882 + }, + { + "epoch": 0.1655107778819119, + "grad_norm": 58636.390625, + "learning_rate": 9.953014776440586e-05, + "loss": 2.4019, + "step": 883 + }, + { + "epoch": 0.16569821930646672, + "grad_norm": 55726.16015625, + "learning_rate": 9.952907243343589e-05, + "loss": 2.4457, + "step": 884 + }, + { + "epoch": 0.16588566073102157, + "grad_norm": 50902.609375, + "learning_rate": 9.952799587916204e-05, + "loss": 2.3383, + "step": 885 + }, + { + "epoch": 0.16607310215557639, + "grad_norm": 53972.265625, + "learning_rate": 9.952691810161093e-05, + "loss": 2.4198, + "step": 886 + }, + { + "epoch": 0.1662605435801312, + "grad_norm": 56626.6484375, + "learning_rate": 9.952583910080912e-05, + "loss": 2.4289, + "step": 887 + }, + { + "epoch": 0.16644798500468602, + "grad_norm": 49679.65625, + "learning_rate": 9.95247588767833e-05, + "loss": 2.3696, + "step": 888 + }, + { + "epoch": 0.16663542642924087, + "grad_norm": 53512.05078125, + "learning_rate": 9.952367742956016e-05, + "loss": 2.4236, + "step": 889 + }, + { + "epoch": 0.1668228678537957, + "grad_norm": 53437.2890625, + "learning_rate": 9.95225947591664e-05, + "loss": 2.2932, + "step": 890 + }, + { + "epoch": 0.1670103092783505, + "grad_norm": 52848.4765625, + "learning_rate": 9.952151086562876e-05, + "loss": 2.303, + "step": 891 + }, + { + "epoch": 0.16719775070290535, + "grad_norm": 53970.26953125, + "learning_rate": 9.952042574897398e-05, + "loss": 2.3519, + "step": 892 + }, + { + "epoch": 0.16738519212746017, + "grad_norm": 51631.4296875, + "learning_rate": 9.95193394092289e-05, + "loss": 2.3489, + "step": 893 + }, + { + "epoch": 0.167572633552015, + "grad_norm": 56364.55859375, + "learning_rate": 9.951825184642035e-05, + "loss": 2.2771, + "step": 894 + }, + { + "epoch": 0.16776007497656983, + "grad_norm": 52024.65625, + "learning_rate": 9.951716306057516e-05, + "loss": 2.4655, + "step": 895 + }, + { + "epoch": 0.16794751640112465, + "grad_norm": 54111.8203125, + "learning_rate": 9.951607305172027e-05, + "loss": 2.3837, + "step": 896 + }, + { + "epoch": 0.16813495782567947, + "grad_norm": 51732.19921875, + "learning_rate": 9.951498181988257e-05, + "loss": 2.429, + "step": 897 + }, + { + "epoch": 0.1683223992502343, + "grad_norm": 51169.3046875, + "learning_rate": 9.951388936508901e-05, + "loss": 2.2746, + "step": 898 + }, + { + "epoch": 0.16850984067478914, + "grad_norm": 50240.32421875, + "learning_rate": 9.951279568736657e-05, + "loss": 2.3287, + "step": 899 + }, + { + "epoch": 0.16869728209934395, + "grad_norm": 49981.0234375, + "learning_rate": 9.951170078674228e-05, + "loss": 2.3061, + "step": 900 + }, + { + "epoch": 0.16888472352389877, + "grad_norm": 53604.265625, + "learning_rate": 9.951060466324316e-05, + "loss": 2.3538, + "step": 901 + }, + { + "epoch": 0.16907216494845362, + "grad_norm": 53444.95703125, + "learning_rate": 9.950950731689631e-05, + "loss": 2.4049, + "step": 902 + }, + { + "epoch": 0.16925960637300844, + "grad_norm": 50403.03125, + "learning_rate": 9.950840874772882e-05, + "loss": 2.4378, + "step": 903 + }, + { + "epoch": 0.16944704779756325, + "grad_norm": 48212.79296875, + "learning_rate": 9.950730895576781e-05, + "loss": 2.3648, + "step": 904 + }, + { + "epoch": 0.1696344892221181, + "grad_norm": 57417.63671875, + "learning_rate": 9.950620794104046e-05, + "loss": 2.3901, + "step": 905 + }, + { + "epoch": 0.16982193064667292, + "grad_norm": 56640.9453125, + "learning_rate": 9.950510570357396e-05, + "loss": 2.3558, + "step": 906 + }, + { + "epoch": 0.17000937207122774, + "grad_norm": 52221.76171875, + "learning_rate": 9.950400224339555e-05, + "loss": 2.3637, + "step": 907 + }, + { + "epoch": 0.17019681349578256, + "grad_norm": 57183.59765625, + "learning_rate": 9.950289756053245e-05, + "loss": 2.4564, + "step": 908 + }, + { + "epoch": 0.1703842549203374, + "grad_norm": 53333.4140625, + "learning_rate": 9.950179165501197e-05, + "loss": 2.368, + "step": 909 + }, + { + "epoch": 0.17057169634489222, + "grad_norm": 56551.71484375, + "learning_rate": 9.950068452686142e-05, + "loss": 2.2925, + "step": 910 + }, + { + "epoch": 0.17075913776944704, + "grad_norm": 56566.703125, + "learning_rate": 9.949957617610812e-05, + "loss": 2.3602, + "step": 911 + }, + { + "epoch": 0.17094657919400189, + "grad_norm": 53688.92578125, + "learning_rate": 9.949846660277949e-05, + "loss": 2.2885, + "step": 912 + }, + { + "epoch": 0.1711340206185567, + "grad_norm": 58906.8515625, + "learning_rate": 9.949735580690289e-05, + "loss": 2.3964, + "step": 913 + }, + { + "epoch": 0.17132146204311152, + "grad_norm": 54346.6640625, + "learning_rate": 9.949624378850578e-05, + "loss": 2.2528, + "step": 914 + }, + { + "epoch": 0.17150890346766634, + "grad_norm": 50562.671875, + "learning_rate": 9.949513054761563e-05, + "loss": 2.3196, + "step": 915 + }, + { + "epoch": 0.1716963448922212, + "grad_norm": 48754.58203125, + "learning_rate": 9.949401608425992e-05, + "loss": 2.368, + "step": 916 + }, + { + "epoch": 0.171883786316776, + "grad_norm": 48795.60546875, + "learning_rate": 9.94929003984662e-05, + "loss": 2.334, + "step": 917 + }, + { + "epoch": 0.17207122774133082, + "grad_norm": 50453.16796875, + "learning_rate": 9.949178349026198e-05, + "loss": 2.239, + "step": 918 + }, + { + "epoch": 0.17225866916588567, + "grad_norm": 49755.01953125, + "learning_rate": 9.949066535967489e-05, + "loss": 2.34, + "step": 919 + }, + { + "epoch": 0.1724461105904405, + "grad_norm": 51100.2890625, + "learning_rate": 9.948954600673253e-05, + "loss": 2.3302, + "step": 920 + }, + { + "epoch": 0.1726335520149953, + "grad_norm": 55134.5625, + "learning_rate": 9.948842543146255e-05, + "loss": 2.393, + "step": 921 + }, + { + "epoch": 0.17282099343955015, + "grad_norm": 52491.54296875, + "learning_rate": 9.948730363389262e-05, + "loss": 2.4421, + "step": 922 + }, + { + "epoch": 0.17300843486410497, + "grad_norm": 51554.0859375, + "learning_rate": 9.948618061405047e-05, + "loss": 2.4028, + "step": 923 + }, + { + "epoch": 0.1731958762886598, + "grad_norm": 59327.23046875, + "learning_rate": 9.948505637196379e-05, + "loss": 2.3015, + "step": 924 + }, + { + "epoch": 0.1733833177132146, + "grad_norm": 52057.41015625, + "learning_rate": 9.948393090766039e-05, + "loss": 2.3211, + "step": 925 + }, + { + "epoch": 0.17357075913776945, + "grad_norm": 54090.5546875, + "learning_rate": 9.948280422116805e-05, + "loss": 2.3783, + "step": 926 + }, + { + "epoch": 0.17375820056232427, + "grad_norm": 57558.39453125, + "learning_rate": 9.948167631251461e-05, + "loss": 2.3605, + "step": 927 + }, + { + "epoch": 0.1739456419868791, + "grad_norm": 48658.48828125, + "learning_rate": 9.94805471817279e-05, + "loss": 2.3601, + "step": 928 + }, + { + "epoch": 0.17413308341143394, + "grad_norm": 47995.84375, + "learning_rate": 9.947941682883585e-05, + "loss": 2.3939, + "step": 929 + }, + { + "epoch": 0.17432052483598875, + "grad_norm": 53420.125, + "learning_rate": 9.947828525386634e-05, + "loss": 2.3689, + "step": 930 + }, + { + "epoch": 0.17450796626054357, + "grad_norm": 52376.296875, + "learning_rate": 9.947715245684734e-05, + "loss": 2.3367, + "step": 931 + }, + { + "epoch": 0.17469540768509842, + "grad_norm": 57275.2421875, + "learning_rate": 9.947601843780683e-05, + "loss": 2.3505, + "step": 932 + }, + { + "epoch": 0.17488284910965324, + "grad_norm": 50082.6640625, + "learning_rate": 9.94748831967728e-05, + "loss": 2.3963, + "step": 933 + }, + { + "epoch": 0.17507029053420806, + "grad_norm": 51551.72265625, + "learning_rate": 9.94737467337733e-05, + "loss": 2.3611, + "step": 934 + }, + { + "epoch": 0.17525773195876287, + "grad_norm": 51562.79296875, + "learning_rate": 9.94726090488364e-05, + "loss": 2.2915, + "step": 935 + }, + { + "epoch": 0.17544517338331772, + "grad_norm": 51498.609375, + "learning_rate": 9.947147014199021e-05, + "loss": 2.3786, + "step": 936 + }, + { + "epoch": 0.17563261480787254, + "grad_norm": 54895.984375, + "learning_rate": 9.947033001326282e-05, + "loss": 2.315, + "step": 937 + }, + { + "epoch": 0.17582005623242736, + "grad_norm": 50382.1015625, + "learning_rate": 9.946918866268245e-05, + "loss": 2.2867, + "step": 938 + }, + { + "epoch": 0.1760074976569822, + "grad_norm": 50515.9140625, + "learning_rate": 9.946804609027724e-05, + "loss": 2.3415, + "step": 939 + }, + { + "epoch": 0.17619493908153702, + "grad_norm": 52193.8046875, + "learning_rate": 9.946690229607542e-05, + "loss": 2.2819, + "step": 940 + }, + { + "epoch": 0.17638238050609184, + "grad_norm": 53111.82421875, + "learning_rate": 9.946575728010528e-05, + "loss": 2.3996, + "step": 941 + }, + { + "epoch": 0.1765698219306467, + "grad_norm": 54500.3046875, + "learning_rate": 9.946461104239504e-05, + "loss": 2.3293, + "step": 942 + }, + { + "epoch": 0.1767572633552015, + "grad_norm": 55569.63671875, + "learning_rate": 9.946346358297305e-05, + "loss": 2.3642, + "step": 943 + }, + { + "epoch": 0.17694470477975632, + "grad_norm": 230759.171875, + "learning_rate": 9.946231490186763e-05, + "loss": 2.8579, + "step": 944 + }, + { + "epoch": 0.17713214620431114, + "grad_norm": 52710.01953125, + "learning_rate": 9.946116499910718e-05, + "loss": 2.4473, + "step": 945 + }, + { + "epoch": 0.177319587628866, + "grad_norm": 51165.38671875, + "learning_rate": 9.946001387472006e-05, + "loss": 2.344, + "step": 946 + }, + { + "epoch": 0.1775070290534208, + "grad_norm": 52227.609375, + "learning_rate": 9.945886152873474e-05, + "loss": 2.3949, + "step": 947 + }, + { + "epoch": 0.17769447047797562, + "grad_norm": 51976.49609375, + "learning_rate": 9.945770796117966e-05, + "loss": 2.3318, + "step": 948 + }, + { + "epoch": 0.17788191190253047, + "grad_norm": 49681.19921875, + "learning_rate": 9.945655317208331e-05, + "loss": 2.3447, + "step": 949 + }, + { + "epoch": 0.1780693533270853, + "grad_norm": 57590.00390625, + "learning_rate": 9.945539716147423e-05, + "loss": 2.3812, + "step": 950 + }, + { + "epoch": 0.1782567947516401, + "grad_norm": 52366.5546875, + "learning_rate": 9.945423992938094e-05, + "loss": 2.3607, + "step": 951 + }, + { + "epoch": 0.17844423617619493, + "grad_norm": 51199.265625, + "learning_rate": 9.945308147583206e-05, + "loss": 2.3404, + "step": 952 + }, + { + "epoch": 0.17863167760074977, + "grad_norm": 52477.515625, + "learning_rate": 9.945192180085618e-05, + "loss": 2.353, + "step": 953 + }, + { + "epoch": 0.1788191190253046, + "grad_norm": 53249.7265625, + "learning_rate": 9.945076090448195e-05, + "loss": 2.3378, + "step": 954 + }, + { + "epoch": 0.1790065604498594, + "grad_norm": 49834.3359375, + "learning_rate": 9.944959878673802e-05, + "loss": 2.446, + "step": 955 + }, + { + "epoch": 0.17919400187441425, + "grad_norm": 50906.90234375, + "learning_rate": 9.944843544765314e-05, + "loss": 2.3695, + "step": 956 + }, + { + "epoch": 0.17938144329896907, + "grad_norm": 56052.09375, + "learning_rate": 9.944727088725601e-05, + "loss": 2.35, + "step": 957 + }, + { + "epoch": 0.1795688847235239, + "grad_norm": 57616.640625, + "learning_rate": 9.944610510557538e-05, + "loss": 2.3854, + "step": 958 + }, + { + "epoch": 0.17975632614807874, + "grad_norm": 63399.3359375, + "learning_rate": 9.944493810264009e-05, + "loss": 2.4046, + "step": 959 + }, + { + "epoch": 0.17994376757263356, + "grad_norm": 56770.078125, + "learning_rate": 9.94437698784789e-05, + "loss": 2.2981, + "step": 960 + }, + { + "epoch": 0.18013120899718837, + "grad_norm": 55007.16796875, + "learning_rate": 9.944260043312073e-05, + "loss": 2.3247, + "step": 961 + }, + { + "epoch": 0.1803186504217432, + "grad_norm": 51196.4765625, + "learning_rate": 9.944142976659441e-05, + "loss": 2.3796, + "step": 962 + }, + { + "epoch": 0.18050609184629804, + "grad_norm": 52923.953125, + "learning_rate": 9.944025787892889e-05, + "loss": 2.3733, + "step": 963 + }, + { + "epoch": 0.18069353327085286, + "grad_norm": 51807.1171875, + "learning_rate": 9.943908477015309e-05, + "loss": 2.325, + "step": 964 + }, + { + "epoch": 0.18088097469540768, + "grad_norm": 53732.9765625, + "learning_rate": 9.9437910440296e-05, + "loss": 2.3666, + "step": 965 + }, + { + "epoch": 0.18106841611996252, + "grad_norm": 55278.98046875, + "learning_rate": 9.943673488938662e-05, + "loss": 2.3411, + "step": 966 + }, + { + "epoch": 0.18125585754451734, + "grad_norm": 52863.73046875, + "learning_rate": 9.9435558117454e-05, + "loss": 2.3351, + "step": 967 + }, + { + "epoch": 0.18144329896907216, + "grad_norm": 52856.61328125, + "learning_rate": 9.943438012452718e-05, + "loss": 2.3434, + "step": 968 + }, + { + "epoch": 0.181630740393627, + "grad_norm": 54307.53515625, + "learning_rate": 9.943320091063524e-05, + "loss": 2.446, + "step": 969 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 49754.12890625, + "learning_rate": 9.943202047580735e-05, + "loss": 2.3906, + "step": 970 + }, + { + "epoch": 0.18200562324273664, + "grad_norm": 53289.80859375, + "learning_rate": 9.943083882007265e-05, + "loss": 2.4228, + "step": 971 + }, + { + "epoch": 0.18219306466729146, + "grad_norm": 49948.9921875, + "learning_rate": 9.94296559434603e-05, + "loss": 2.3552, + "step": 972 + }, + { + "epoch": 0.1823805060918463, + "grad_norm": 53763.1328125, + "learning_rate": 9.942847184599955e-05, + "loss": 2.4217, + "step": 973 + }, + { + "epoch": 0.18256794751640112, + "grad_norm": 52112.015625, + "learning_rate": 9.942728652771962e-05, + "loss": 2.2922, + "step": 974 + }, + { + "epoch": 0.18275538894095594, + "grad_norm": 50303.04296875, + "learning_rate": 9.94260999886498e-05, + "loss": 2.3673, + "step": 975 + }, + { + "epoch": 0.1829428303655108, + "grad_norm": 49885.90234375, + "learning_rate": 9.94249122288194e-05, + "loss": 2.3362, + "step": 976 + }, + { + "epoch": 0.1831302717900656, + "grad_norm": 52851.31640625, + "learning_rate": 9.942372324825772e-05, + "loss": 2.3845, + "step": 977 + }, + { + "epoch": 0.18331771321462043, + "grad_norm": 55000.94140625, + "learning_rate": 9.942253304699418e-05, + "loss": 2.4166, + "step": 978 + }, + { + "epoch": 0.18350515463917524, + "grad_norm": 53345.9609375, + "learning_rate": 9.942134162505815e-05, + "loss": 2.2977, + "step": 979 + }, + { + "epoch": 0.1836925960637301, + "grad_norm": 56315.80078125, + "learning_rate": 9.942014898247906e-05, + "loss": 2.2869, + "step": 980 + }, + { + "epoch": 0.1838800374882849, + "grad_norm": 49644.63671875, + "learning_rate": 9.941895511928634e-05, + "loss": 2.3828, + "step": 981 + }, + { + "epoch": 0.18406747891283973, + "grad_norm": 52293.3828125, + "learning_rate": 9.941776003550953e-05, + "loss": 2.3467, + "step": 982 + }, + { + "epoch": 0.18425492033739457, + "grad_norm": 48540.375, + "learning_rate": 9.941656373117811e-05, + "loss": 2.316, + "step": 983 + }, + { + "epoch": 0.1844423617619494, + "grad_norm": 60059.99609375, + "learning_rate": 9.941536620632164e-05, + "loss": 2.3872, + "step": 984 + }, + { + "epoch": 0.1846298031865042, + "grad_norm": 52179.921875, + "learning_rate": 9.941416746096967e-05, + "loss": 2.3558, + "step": 985 + }, + { + "epoch": 0.18481724461105906, + "grad_norm": 50364.25, + "learning_rate": 9.941296749515185e-05, + "loss": 2.3476, + "step": 986 + }, + { + "epoch": 0.18500468603561387, + "grad_norm": 52493.15234375, + "learning_rate": 9.94117663088978e-05, + "loss": 2.4275, + "step": 987 + }, + { + "epoch": 0.1851921274601687, + "grad_norm": 53480.58203125, + "learning_rate": 9.941056390223717e-05, + "loss": 2.3844, + "step": 988 + }, + { + "epoch": 0.1853795688847235, + "grad_norm": 51812.55859375, + "learning_rate": 9.94093602751997e-05, + "loss": 2.3619, + "step": 989 + }, + { + "epoch": 0.18556701030927836, + "grad_norm": 59152.65625, + "learning_rate": 9.940815542781505e-05, + "loss": 2.2624, + "step": 990 + }, + { + "epoch": 0.18575445173383318, + "grad_norm": 54586.3359375, + "learning_rate": 9.940694936011305e-05, + "loss": 2.3647, + "step": 991 + }, + { + "epoch": 0.185941893158388, + "grad_norm": 47113.41015625, + "learning_rate": 9.940574207212345e-05, + "loss": 2.3348, + "step": 992 + }, + { + "epoch": 0.18612933458294284, + "grad_norm": 49720.13671875, + "learning_rate": 9.940453356387608e-05, + "loss": 2.2853, + "step": 993 + }, + { + "epoch": 0.18631677600749766, + "grad_norm": 49644.87890625, + "learning_rate": 9.940332383540078e-05, + "loss": 2.3915, + "step": 994 + }, + { + "epoch": 0.18650421743205248, + "grad_norm": 48474.36328125, + "learning_rate": 9.940211288672744e-05, + "loss": 2.4152, + "step": 995 + }, + { + "epoch": 0.18669165885660732, + "grad_norm": 51720.24609375, + "learning_rate": 9.940090071788595e-05, + "loss": 2.3292, + "step": 996 + }, + { + "epoch": 0.18687910028116214, + "grad_norm": 51407.05078125, + "learning_rate": 9.939968732890628e-05, + "loss": 2.3193, + "step": 997 + }, + { + "epoch": 0.18706654170571696, + "grad_norm": 52322.17578125, + "learning_rate": 9.939847271981837e-05, + "loss": 2.3451, + "step": 998 + }, + { + "epoch": 0.18725398313027178, + "grad_norm": 51757.3671875, + "learning_rate": 9.939725689065222e-05, + "loss": 2.3318, + "step": 999 + }, + { + "epoch": 0.18744142455482662, + "grad_norm": 52358.5859375, + "learning_rate": 9.939603984143788e-05, + "loss": 2.3183, + "step": 1000 + }, + { + "epoch": 0.18744142455482662, + "eval_loss": 2.3466055393218994, + "eval_runtime": 129.4332, + "eval_samples_per_second": 39.009, + "eval_steps_per_second": 1.955, + "step": 1000 + }, + { + "epoch": 0.18762886597938144, + "grad_norm": 58316.44921875, + "learning_rate": 9.939482157220541e-05, + "loss": 2.3734, + "step": 1001 + }, + { + "epoch": 0.18781630740393626, + "grad_norm": 51635.0703125, + "learning_rate": 9.939360208298487e-05, + "loss": 2.2966, + "step": 1002 + }, + { + "epoch": 0.1880037488284911, + "grad_norm": 53900.421875, + "learning_rate": 9.939238137380641e-05, + "loss": 2.3837, + "step": 1003 + }, + { + "epoch": 0.18819119025304593, + "grad_norm": 52008.7734375, + "learning_rate": 9.939115944470017e-05, + "loss": 2.3201, + "step": 1004 + }, + { + "epoch": 0.18837863167760074, + "grad_norm": 50090.0, + "learning_rate": 9.938993629569631e-05, + "loss": 2.3535, + "step": 1005 + }, + { + "epoch": 0.1885660731021556, + "grad_norm": 56718.48046875, + "learning_rate": 9.938871192682507e-05, + "loss": 2.3754, + "step": 1006 + }, + { + "epoch": 0.1887535145267104, + "grad_norm": 56955.4765625, + "learning_rate": 9.938748633811667e-05, + "loss": 2.3399, + "step": 1007 + }, + { + "epoch": 0.18894095595126523, + "grad_norm": 48648.3359375, + "learning_rate": 9.938625952960141e-05, + "loss": 2.4036, + "step": 1008 + }, + { + "epoch": 0.18912839737582005, + "grad_norm": 53715.6015625, + "learning_rate": 9.938503150130956e-05, + "loss": 2.4264, + "step": 1009 + }, + { + "epoch": 0.1893158388003749, + "grad_norm": 51977.2890625, + "learning_rate": 9.938380225327146e-05, + "loss": 2.3717, + "step": 1010 + }, + { + "epoch": 0.1895032802249297, + "grad_norm": 52056.3515625, + "learning_rate": 9.938257178551747e-05, + "loss": 2.3173, + "step": 1011 + }, + { + "epoch": 0.18969072164948453, + "grad_norm": 51487.1015625, + "learning_rate": 9.938134009807799e-05, + "loss": 2.3827, + "step": 1012 + }, + { + "epoch": 0.18987816307403937, + "grad_norm": 54302.40234375, + "learning_rate": 9.938010719098341e-05, + "loss": 2.3346, + "step": 1013 + }, + { + "epoch": 0.1900656044985942, + "grad_norm": 51455.546875, + "learning_rate": 9.937887306426423e-05, + "loss": 2.3655, + "step": 1014 + }, + { + "epoch": 0.190253045923149, + "grad_norm": 53490.36328125, + "learning_rate": 9.93776377179509e-05, + "loss": 2.3036, + "step": 1015 + }, + { + "epoch": 0.19044048734770383, + "grad_norm": 57020.3828125, + "learning_rate": 9.937640115207393e-05, + "loss": 2.3709, + "step": 1016 + }, + { + "epoch": 0.19062792877225868, + "grad_norm": 55788.28125, + "learning_rate": 9.937516336666387e-05, + "loss": 2.264, + "step": 1017 + }, + { + "epoch": 0.1908153701968135, + "grad_norm": 57989.46875, + "learning_rate": 9.93739243617513e-05, + "loss": 2.4351, + "step": 1018 + }, + { + "epoch": 0.1910028116213683, + "grad_norm": 46673.70703125, + "learning_rate": 9.93726841373668e-05, + "loss": 2.377, + "step": 1019 + }, + { + "epoch": 0.19119025304592316, + "grad_norm": 50715.6171875, + "learning_rate": 9.937144269354101e-05, + "loss": 2.4267, + "step": 1020 + }, + { + "epoch": 0.19137769447047798, + "grad_norm": 53851.9375, + "learning_rate": 9.937020003030462e-05, + "loss": 2.3392, + "step": 1021 + }, + { + "epoch": 0.1915651358950328, + "grad_norm": 55420.4375, + "learning_rate": 9.936895614768829e-05, + "loss": 2.3501, + "step": 1022 + }, + { + "epoch": 0.19175257731958764, + "grad_norm": 53474.9453125, + "learning_rate": 9.936771104572273e-05, + "loss": 2.4176, + "step": 1023 + }, + { + "epoch": 0.19194001874414246, + "grad_norm": 54024.38671875, + "learning_rate": 9.936646472443874e-05, + "loss": 2.3949, + "step": 1024 + }, + { + "epoch": 0.19212746016869728, + "grad_norm": 52602.23828125, + "learning_rate": 9.936521718386706e-05, + "loss": 2.2838, + "step": 1025 + }, + { + "epoch": 0.1923149015932521, + "grad_norm": 52307.7890625, + "learning_rate": 9.936396842403853e-05, + "loss": 2.332, + "step": 1026 + }, + { + "epoch": 0.19250234301780694, + "grad_norm": 52515.73046875, + "learning_rate": 9.936271844498397e-05, + "loss": 2.3481, + "step": 1027 + }, + { + "epoch": 0.19268978444236176, + "grad_norm": 67263.9921875, + "learning_rate": 9.936146724673428e-05, + "loss": 2.417, + "step": 1028 + }, + { + "epoch": 0.19287722586691658, + "grad_norm": 49595.75390625, + "learning_rate": 9.936021482932033e-05, + "loss": 2.3347, + "step": 1029 + }, + { + "epoch": 0.19306466729147143, + "grad_norm": 49127.3125, + "learning_rate": 9.935896119277309e-05, + "loss": 2.4273, + "step": 1030 + }, + { + "epoch": 0.19325210871602624, + "grad_norm": 51493.89453125, + "learning_rate": 9.935770633712348e-05, + "loss": 2.3301, + "step": 1031 + }, + { + "epoch": 0.19343955014058106, + "grad_norm": 55057.25, + "learning_rate": 9.935645026240252e-05, + "loss": 2.4146, + "step": 1032 + }, + { + "epoch": 0.1936269915651359, + "grad_norm": 49592.26953125, + "learning_rate": 9.935519296864124e-05, + "loss": 2.4713, + "step": 1033 + }, + { + "epoch": 0.19381443298969073, + "grad_norm": 47123.484375, + "learning_rate": 9.935393445587068e-05, + "loss": 2.3912, + "step": 1034 + }, + { + "epoch": 0.19400187441424555, + "grad_norm": 51530.234375, + "learning_rate": 9.935267472412193e-05, + "loss": 2.3328, + "step": 1035 + }, + { + "epoch": 0.19418931583880036, + "grad_norm": 49632.00390625, + "learning_rate": 9.93514137734261e-05, + "loss": 2.3715, + "step": 1036 + }, + { + "epoch": 0.1943767572633552, + "grad_norm": 60403.1171875, + "learning_rate": 9.935015160381433e-05, + "loss": 2.375, + "step": 1037 + }, + { + "epoch": 0.19456419868791003, + "grad_norm": 52997.03515625, + "learning_rate": 9.934888821531781e-05, + "loss": 2.3783, + "step": 1038 + }, + { + "epoch": 0.19475164011246485, + "grad_norm": 55312.90625, + "learning_rate": 9.934762360796772e-05, + "loss": 2.3405, + "step": 1039 + }, + { + "epoch": 0.1949390815370197, + "grad_norm": 54814.93359375, + "learning_rate": 9.934635778179531e-05, + "loss": 2.4516, + "step": 1040 + }, + { + "epoch": 0.1951265229615745, + "grad_norm": 56008.64453125, + "learning_rate": 9.934509073683184e-05, + "loss": 2.3682, + "step": 1041 + }, + { + "epoch": 0.19531396438612933, + "grad_norm": 52466.8359375, + "learning_rate": 9.934382247310862e-05, + "loss": 2.3273, + "step": 1042 + }, + { + "epoch": 0.19550140581068415, + "grad_norm": 52244.01953125, + "learning_rate": 9.934255299065695e-05, + "loss": 2.3587, + "step": 1043 + }, + { + "epoch": 0.195688847235239, + "grad_norm": 48661.99609375, + "learning_rate": 9.93412822895082e-05, + "loss": 2.3053, + "step": 1044 + }, + { + "epoch": 0.1958762886597938, + "grad_norm": 58958.4921875, + "learning_rate": 9.934001036969373e-05, + "loss": 2.4336, + "step": 1045 + }, + { + "epoch": 0.19606373008434863, + "grad_norm": 52010.3359375, + "learning_rate": 9.933873723124501e-05, + "loss": 2.2548, + "step": 1046 + }, + { + "epoch": 0.19625117150890348, + "grad_norm": 56796.56640625, + "learning_rate": 9.933746287419342e-05, + "loss": 2.2789, + "step": 1047 + }, + { + "epoch": 0.1964386129334583, + "grad_norm": 50980.453125, + "learning_rate": 9.933618729857046e-05, + "loss": 2.3534, + "step": 1048 + }, + { + "epoch": 0.1966260543580131, + "grad_norm": 55063.08984375, + "learning_rate": 9.933491050440766e-05, + "loss": 2.3997, + "step": 1049 + }, + { + "epoch": 0.19681349578256796, + "grad_norm": 55392.578125, + "learning_rate": 9.933363249173652e-05, + "loss": 2.3568, + "step": 1050 + }, + { + "epoch": 0.19700093720712278, + "grad_norm": 56292.60546875, + "learning_rate": 9.933235326058863e-05, + "loss": 2.2864, + "step": 1051 + }, + { + "epoch": 0.1971883786316776, + "grad_norm": 50023.171875, + "learning_rate": 9.933107281099556e-05, + "loss": 2.3976, + "step": 1052 + }, + { + "epoch": 0.19737582005623241, + "grad_norm": 49267.96875, + "learning_rate": 9.932979114298897e-05, + "loss": 2.3465, + "step": 1053 + }, + { + "epoch": 0.19756326148078726, + "grad_norm": 54805.83984375, + "learning_rate": 9.932850825660047e-05, + "loss": 2.3298, + "step": 1054 + }, + { + "epoch": 0.19775070290534208, + "grad_norm": 49512.76171875, + "learning_rate": 9.932722415186179e-05, + "loss": 2.3464, + "step": 1055 + }, + { + "epoch": 0.1979381443298969, + "grad_norm": 51741.54296875, + "learning_rate": 9.932593882880463e-05, + "loss": 2.3667, + "step": 1056 + }, + { + "epoch": 0.19812558575445174, + "grad_norm": 52333.30859375, + "learning_rate": 9.932465228746073e-05, + "loss": 2.3461, + "step": 1057 + }, + { + "epoch": 0.19831302717900656, + "grad_norm": 53687.19140625, + "learning_rate": 9.932336452786186e-05, + "loss": 2.4446, + "step": 1058 + }, + { + "epoch": 0.19850046860356138, + "grad_norm": 53156.96875, + "learning_rate": 9.932207555003984e-05, + "loss": 2.3109, + "step": 1059 + }, + { + "epoch": 0.19868791002811623, + "grad_norm": 49162.9375, + "learning_rate": 9.932078535402649e-05, + "loss": 2.3497, + "step": 1060 + }, + { + "epoch": 0.19887535145267105, + "grad_norm": 50666.640625, + "learning_rate": 9.931949393985372e-05, + "loss": 2.3624, + "step": 1061 + }, + { + "epoch": 0.19906279287722586, + "grad_norm": 52118.640625, + "learning_rate": 9.931820130755336e-05, + "loss": 2.3433, + "step": 1062 + }, + { + "epoch": 0.19925023430178068, + "grad_norm": 55451.65234375, + "learning_rate": 9.931690745715739e-05, + "loss": 2.3082, + "step": 1063 + }, + { + "epoch": 0.19943767572633553, + "grad_norm": 54182.9609375, + "learning_rate": 9.931561238869773e-05, + "loss": 2.412, + "step": 1064 + }, + { + "epoch": 0.19962511715089035, + "grad_norm": 51559.6875, + "learning_rate": 9.931431610220639e-05, + "loss": 2.3152, + "step": 1065 + }, + { + "epoch": 0.19981255857544516, + "grad_norm": 51137.66796875, + "learning_rate": 9.931301859771538e-05, + "loss": 2.3808, + "step": 1066 + }, + { + "epoch": 0.2, + "grad_norm": 54243.8203125, + "learning_rate": 9.931171987525673e-05, + "loss": 2.3256, + "step": 1067 + }, + { + "epoch": 0.20018744142455483, + "grad_norm": 51318.19140625, + "learning_rate": 9.931041993486257e-05, + "loss": 2.3639, + "step": 1068 + }, + { + "epoch": 0.20037488284910965, + "grad_norm": 54284.92578125, + "learning_rate": 9.930911877656494e-05, + "loss": 2.37, + "step": 1069 + }, + { + "epoch": 0.20056232427366447, + "grad_norm": 51820.92578125, + "learning_rate": 9.930781640039602e-05, + "loss": 2.3181, + "step": 1070 + }, + { + "epoch": 0.2007497656982193, + "grad_norm": 53838.05859375, + "learning_rate": 9.930651280638794e-05, + "loss": 2.3795, + "step": 1071 + }, + { + "epoch": 0.20093720712277413, + "grad_norm": 57371.30859375, + "learning_rate": 9.930520799457295e-05, + "loss": 2.3373, + "step": 1072 + }, + { + "epoch": 0.20112464854732895, + "grad_norm": 50461.16015625, + "learning_rate": 9.930390196498322e-05, + "loss": 2.3912, + "step": 1073 + }, + { + "epoch": 0.2013120899718838, + "grad_norm": 53050.796875, + "learning_rate": 9.930259471765107e-05, + "loss": 2.3422, + "step": 1074 + }, + { + "epoch": 0.2014995313964386, + "grad_norm": 51781.17578125, + "learning_rate": 9.930128625260872e-05, + "loss": 2.297, + "step": 1075 + }, + { + "epoch": 0.20168697282099343, + "grad_norm": 73023.8046875, + "learning_rate": 9.929997656988853e-05, + "loss": 2.4825, + "step": 1076 + }, + { + "epoch": 0.20187441424554828, + "grad_norm": 53231.1640625, + "learning_rate": 9.929866566952284e-05, + "loss": 2.3178, + "step": 1077 + }, + { + "epoch": 0.2020618556701031, + "grad_norm": 56471.6015625, + "learning_rate": 9.929735355154402e-05, + "loss": 2.3996, + "step": 1078 + }, + { + "epoch": 0.20224929709465791, + "grad_norm": 55995.8828125, + "learning_rate": 9.929604021598449e-05, + "loss": 2.4064, + "step": 1079 + }, + { + "epoch": 0.20243673851921273, + "grad_norm": 48760.6875, + "learning_rate": 9.929472566287667e-05, + "loss": 2.4006, + "step": 1080 + }, + { + "epoch": 0.20262417994376758, + "grad_norm": 52433.93359375, + "learning_rate": 9.929340989225305e-05, + "loss": 2.338, + "step": 1081 + }, + { + "epoch": 0.2028116213683224, + "grad_norm": 52654.5390625, + "learning_rate": 9.929209290414611e-05, + "loss": 2.3264, + "step": 1082 + }, + { + "epoch": 0.20299906279287722, + "grad_norm": 52051.4765625, + "learning_rate": 9.929077469858838e-05, + "loss": 2.3621, + "step": 1083 + }, + { + "epoch": 0.20318650421743206, + "grad_norm": 58602.484375, + "learning_rate": 9.928945527561242e-05, + "loss": 2.3721, + "step": 1084 + }, + { + "epoch": 0.20337394564198688, + "grad_norm": 49175.078125, + "learning_rate": 9.928813463525083e-05, + "loss": 2.3153, + "step": 1085 + }, + { + "epoch": 0.2035613870665417, + "grad_norm": 54402.36328125, + "learning_rate": 9.92868127775362e-05, + "loss": 2.4145, + "step": 1086 + }, + { + "epoch": 0.20374882849109655, + "grad_norm": 51598.9140625, + "learning_rate": 9.928548970250123e-05, + "loss": 2.3669, + "step": 1087 + }, + { + "epoch": 0.20393626991565136, + "grad_norm": 53609.10546875, + "learning_rate": 9.928416541017854e-05, + "loss": 2.2849, + "step": 1088 + }, + { + "epoch": 0.20412371134020618, + "grad_norm": 50693.2734375, + "learning_rate": 9.928283990060085e-05, + "loss": 2.3475, + "step": 1089 + }, + { + "epoch": 0.204311152764761, + "grad_norm": 52494.91796875, + "learning_rate": 9.928151317380093e-05, + "loss": 2.2883, + "step": 1090 + }, + { + "epoch": 0.20449859418931585, + "grad_norm": 51761.26171875, + "learning_rate": 9.928018522981152e-05, + "loss": 2.3465, + "step": 1091 + }, + { + "epoch": 0.20468603561387066, + "grad_norm": 56645.7421875, + "learning_rate": 9.927885606866543e-05, + "loss": 2.3315, + "step": 1092 + }, + { + "epoch": 0.20487347703842548, + "grad_norm": 51660.3203125, + "learning_rate": 9.927752569039548e-05, + "loss": 2.356, + "step": 1093 + }, + { + "epoch": 0.20506091846298033, + "grad_norm": 49820.41796875, + "learning_rate": 9.927619409503455e-05, + "loss": 2.3842, + "step": 1094 + }, + { + "epoch": 0.20524835988753515, + "grad_norm": 53608.5234375, + "learning_rate": 9.92748612826155e-05, + "loss": 2.3947, + "step": 1095 + }, + { + "epoch": 0.20543580131208997, + "grad_norm": 54034.3515625, + "learning_rate": 9.927352725317127e-05, + "loss": 2.337, + "step": 1096 + }, + { + "epoch": 0.2056232427366448, + "grad_norm": 47639.7421875, + "learning_rate": 9.927219200673479e-05, + "loss": 2.3238, + "step": 1097 + }, + { + "epoch": 0.20581068416119963, + "grad_norm": 50189.55859375, + "learning_rate": 9.927085554333906e-05, + "loss": 2.3271, + "step": 1098 + }, + { + "epoch": 0.20599812558575445, + "grad_norm": 52426.58203125, + "learning_rate": 9.926951786301706e-05, + "loss": 2.4546, + "step": 1099 + }, + { + "epoch": 0.20618556701030927, + "grad_norm": 49214.43359375, + "learning_rate": 9.926817896580187e-05, + "loss": 2.4445, + "step": 1100 + }, + { + "epoch": 0.2063730084348641, + "grad_norm": 48585.80859375, + "learning_rate": 9.926683885172655e-05, + "loss": 2.3394, + "step": 1101 + }, + { + "epoch": 0.20656044985941893, + "grad_norm": 53408.53125, + "learning_rate": 9.926549752082415e-05, + "loss": 2.2984, + "step": 1102 + }, + { + "epoch": 0.20674789128397375, + "grad_norm": 54599.69921875, + "learning_rate": 9.926415497312784e-05, + "loss": 2.3707, + "step": 1103 + }, + { + "epoch": 0.2069353327085286, + "grad_norm": 50814.33203125, + "learning_rate": 9.92628112086708e-05, + "loss": 2.3753, + "step": 1104 + }, + { + "epoch": 0.20712277413308341, + "grad_norm": 53577.421875, + "learning_rate": 9.926146622748618e-05, + "loss": 2.3617, + "step": 1105 + }, + { + "epoch": 0.20731021555763823, + "grad_norm": 49554.5390625, + "learning_rate": 9.92601200296072e-05, + "loss": 2.3364, + "step": 1106 + }, + { + "epoch": 0.20749765698219305, + "grad_norm": 54554.0390625, + "learning_rate": 9.925877261506713e-05, + "loss": 2.3825, + "step": 1107 + }, + { + "epoch": 0.2076850984067479, + "grad_norm": 49774.79296875, + "learning_rate": 9.925742398389926e-05, + "loss": 2.3843, + "step": 1108 + }, + { + "epoch": 0.20787253983130272, + "grad_norm": 55267.5078125, + "learning_rate": 9.925607413613686e-05, + "loss": 2.4161, + "step": 1109 + }, + { + "epoch": 0.20805998125585753, + "grad_norm": 52270.5546875, + "learning_rate": 9.925472307181329e-05, + "loss": 2.3271, + "step": 1110 + }, + { + "epoch": 0.20824742268041238, + "grad_norm": 53626.88671875, + "learning_rate": 9.925337079096193e-05, + "loss": 2.3648, + "step": 1111 + }, + { + "epoch": 0.2084348641049672, + "grad_norm": 55224.9375, + "learning_rate": 9.925201729361616e-05, + "loss": 2.3967, + "step": 1112 + }, + { + "epoch": 0.20862230552952202, + "grad_norm": 54422.79296875, + "learning_rate": 9.925066257980942e-05, + "loss": 2.3418, + "step": 1113 + }, + { + "epoch": 0.20880974695407686, + "grad_norm": 53501.1328125, + "learning_rate": 9.924930664957517e-05, + "loss": 2.2736, + "step": 1114 + }, + { + "epoch": 0.20899718837863168, + "grad_norm": 50323.2421875, + "learning_rate": 9.924794950294691e-05, + "loss": 2.3693, + "step": 1115 + }, + { + "epoch": 0.2091846298031865, + "grad_norm": 51871.9296875, + "learning_rate": 9.924659113995813e-05, + "loss": 2.365, + "step": 1116 + }, + { + "epoch": 0.20937207122774132, + "grad_norm": 52099.35546875, + "learning_rate": 9.924523156064241e-05, + "loss": 2.3326, + "step": 1117 + }, + { + "epoch": 0.20955951265229616, + "grad_norm": 53530.984375, + "learning_rate": 9.924387076503332e-05, + "loss": 2.4727, + "step": 1118 + }, + { + "epoch": 0.20974695407685098, + "grad_norm": 53501.11328125, + "learning_rate": 9.924250875316446e-05, + "loss": 2.4225, + "step": 1119 + }, + { + "epoch": 0.2099343955014058, + "grad_norm": 57575.08984375, + "learning_rate": 9.924114552506947e-05, + "loss": 2.3591, + "step": 1120 + }, + { + "epoch": 0.21012183692596065, + "grad_norm": 55411.71875, + "learning_rate": 9.923978108078205e-05, + "loss": 2.3476, + "step": 1121 + }, + { + "epoch": 0.21030927835051547, + "grad_norm": 54551.25, + "learning_rate": 9.923841542033587e-05, + "loss": 2.3502, + "step": 1122 + }, + { + "epoch": 0.21049671977507028, + "grad_norm": 55276.6484375, + "learning_rate": 9.923704854376466e-05, + "loss": 2.3067, + "step": 1123 + }, + { + "epoch": 0.21068416119962513, + "grad_norm": 51852.84375, + "learning_rate": 9.923568045110218e-05, + "loss": 2.4185, + "step": 1124 + }, + { + "epoch": 0.21087160262417995, + "grad_norm": 53368.63671875, + "learning_rate": 9.923431114238224e-05, + "loss": 2.472, + "step": 1125 + }, + { + "epoch": 0.21105904404873477, + "grad_norm": 64332.390625, + "learning_rate": 9.923294061763864e-05, + "loss": 2.3843, + "step": 1126 + }, + { + "epoch": 0.21124648547328959, + "grad_norm": 55966.62890625, + "learning_rate": 9.923156887690526e-05, + "loss": 2.3523, + "step": 1127 + }, + { + "epoch": 0.21143392689784443, + "grad_norm": 49855.06640625, + "learning_rate": 9.923019592021593e-05, + "loss": 2.3422, + "step": 1128 + }, + { + "epoch": 0.21162136832239925, + "grad_norm": 54666.4140625, + "learning_rate": 9.92288217476046e-05, + "loss": 2.3688, + "step": 1129 + }, + { + "epoch": 0.21180880974695407, + "grad_norm": 50078.03515625, + "learning_rate": 9.922744635910521e-05, + "loss": 2.3361, + "step": 1130 + }, + { + "epoch": 0.21199625117150891, + "grad_norm": 49117.2421875, + "learning_rate": 9.92260697547517e-05, + "loss": 2.3433, + "step": 1131 + }, + { + "epoch": 0.21218369259606373, + "grad_norm": 49863.15234375, + "learning_rate": 9.922469193457811e-05, + "loss": 2.36, + "step": 1132 + }, + { + "epoch": 0.21237113402061855, + "grad_norm": 50927.8359375, + "learning_rate": 9.922331289861844e-05, + "loss": 2.354, + "step": 1133 + }, + { + "epoch": 0.21255857544517337, + "grad_norm": 51738.45703125, + "learning_rate": 9.922193264690676e-05, + "loss": 2.3925, + "step": 1134 + }, + { + "epoch": 0.21274601686972822, + "grad_norm": 50750.29296875, + "learning_rate": 9.922055117947716e-05, + "loss": 2.4119, + "step": 1135 + }, + { + "epoch": 0.21293345829428303, + "grad_norm": 54524.97265625, + "learning_rate": 9.921916849636375e-05, + "loss": 2.3818, + "step": 1136 + }, + { + "epoch": 0.21312089971883785, + "grad_norm": 55244.046875, + "learning_rate": 9.921778459760071e-05, + "loss": 2.3503, + "step": 1137 + }, + { + "epoch": 0.2133083411433927, + "grad_norm": 52880.96484375, + "learning_rate": 9.92163994832222e-05, + "loss": 2.3418, + "step": 1138 + }, + { + "epoch": 0.21349578256794752, + "grad_norm": 53803.92578125, + "learning_rate": 9.921501315326242e-05, + "loss": 2.3517, + "step": 1139 + }, + { + "epoch": 0.21368322399250234, + "grad_norm": 51261.94140625, + "learning_rate": 9.921362560775565e-05, + "loss": 2.3603, + "step": 1140 + }, + { + "epoch": 0.21387066541705718, + "grad_norm": 50836.390625, + "learning_rate": 9.921223684673611e-05, + "loss": 2.3653, + "step": 1141 + }, + { + "epoch": 0.214058106841612, + "grad_norm": 56962.140625, + "learning_rate": 9.921084687023813e-05, + "loss": 2.3853, + "step": 1142 + }, + { + "epoch": 0.21424554826616682, + "grad_norm": 52220.95703125, + "learning_rate": 9.920945567829603e-05, + "loss": 2.3293, + "step": 1143 + }, + { + "epoch": 0.21443298969072164, + "grad_norm": 50913.69921875, + "learning_rate": 9.920806327094419e-05, + "loss": 2.323, + "step": 1144 + }, + { + "epoch": 0.21462043111527648, + "grad_norm": 57004.875, + "learning_rate": 9.920666964821696e-05, + "loss": 2.4459, + "step": 1145 + }, + { + "epoch": 0.2148078725398313, + "grad_norm": 51864.5234375, + "learning_rate": 9.920527481014881e-05, + "loss": 2.4111, + "step": 1146 + }, + { + "epoch": 0.21499531396438612, + "grad_norm": 51700.7734375, + "learning_rate": 9.920387875677417e-05, + "loss": 2.3988, + "step": 1147 + }, + { + "epoch": 0.21518275538894097, + "grad_norm": 51316.55859375, + "learning_rate": 9.92024814881275e-05, + "loss": 2.3516, + "step": 1148 + }, + { + "epoch": 0.21537019681349578, + "grad_norm": 56729.03125, + "learning_rate": 9.920108300424336e-05, + "loss": 2.4896, + "step": 1149 + }, + { + "epoch": 0.2155576382380506, + "grad_norm": 55227.171875, + "learning_rate": 9.919968330515623e-05, + "loss": 2.3483, + "step": 1150 + }, + { + "epoch": 0.21574507966260545, + "grad_norm": 52117.4140625, + "learning_rate": 9.919828239090072e-05, + "loss": 2.3458, + "step": 1151 + }, + { + "epoch": 0.21593252108716027, + "grad_norm": 55832.484375, + "learning_rate": 9.919688026151142e-05, + "loss": 2.3477, + "step": 1152 + }, + { + "epoch": 0.21611996251171509, + "grad_norm": 49093.6953125, + "learning_rate": 9.919547691702296e-05, + "loss": 2.4105, + "step": 1153 + }, + { + "epoch": 0.2163074039362699, + "grad_norm": 50604.56640625, + "learning_rate": 9.919407235747002e-05, + "loss": 2.3342, + "step": 1154 + }, + { + "epoch": 0.21649484536082475, + "grad_norm": 51257.203125, + "learning_rate": 9.919266658288726e-05, + "loss": 2.3998, + "step": 1155 + }, + { + "epoch": 0.21668228678537957, + "grad_norm": 57057.38671875, + "learning_rate": 9.919125959330942e-05, + "loss": 2.3356, + "step": 1156 + }, + { + "epoch": 0.2168697282099344, + "grad_norm": 54603.81640625, + "learning_rate": 9.918985138877124e-05, + "loss": 2.3567, + "step": 1157 + }, + { + "epoch": 0.21705716963448923, + "grad_norm": 55252.765625, + "learning_rate": 9.91884419693075e-05, + "loss": 2.32, + "step": 1158 + }, + { + "epoch": 0.21724461105904405, + "grad_norm": 59222.765625, + "learning_rate": 9.918703133495304e-05, + "loss": 2.3669, + "step": 1159 + }, + { + "epoch": 0.21743205248359887, + "grad_norm": 53397.39453125, + "learning_rate": 9.918561948574266e-05, + "loss": 2.3335, + "step": 1160 + }, + { + "epoch": 0.2176194939081537, + "grad_norm": 50831.953125, + "learning_rate": 9.918420642171124e-05, + "loss": 2.2911, + "step": 1161 + }, + { + "epoch": 0.21780693533270853, + "grad_norm": 51057.62109375, + "learning_rate": 9.918279214289369e-05, + "loss": 2.4247, + "step": 1162 + }, + { + "epoch": 0.21799437675726335, + "grad_norm": 54926.88671875, + "learning_rate": 9.918137664932497e-05, + "loss": 2.3844, + "step": 1163 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 53365.2734375, + "learning_rate": 9.917995994104e-05, + "loss": 2.4107, + "step": 1164 + }, + { + "epoch": 0.21836925960637302, + "grad_norm": 59865.76171875, + "learning_rate": 9.917854201807377e-05, + "loss": 2.3918, + "step": 1165 + }, + { + "epoch": 0.21855670103092784, + "grad_norm": 48277.95703125, + "learning_rate": 9.917712288046132e-05, + "loss": 2.3404, + "step": 1166 + }, + { + "epoch": 0.21874414245548265, + "grad_norm": 51429.328125, + "learning_rate": 9.917570252823769e-05, + "loss": 2.4214, + "step": 1167 + }, + { + "epoch": 0.2189315838800375, + "grad_norm": 56320.44140625, + "learning_rate": 9.917428096143798e-05, + "loss": 2.3582, + "step": 1168 + }, + { + "epoch": 0.21911902530459232, + "grad_norm": 54334.29296875, + "learning_rate": 9.917285818009725e-05, + "loss": 2.3234, + "step": 1169 + }, + { + "epoch": 0.21930646672914714, + "grad_norm": 59062.0390625, + "learning_rate": 9.91714341842507e-05, + "loss": 2.4573, + "step": 1170 + }, + { + "epoch": 0.21949390815370196, + "grad_norm": 50355.91796875, + "learning_rate": 9.917000897393348e-05, + "loss": 2.3672, + "step": 1171 + }, + { + "epoch": 0.2196813495782568, + "grad_norm": 54709.296875, + "learning_rate": 9.916858254918079e-05, + "loss": 2.3933, + "step": 1172 + }, + { + "epoch": 0.21986879100281162, + "grad_norm": 58880.8359375, + "learning_rate": 9.916715491002785e-05, + "loss": 2.3153, + "step": 1173 + }, + { + "epoch": 0.22005623242736644, + "grad_norm": 60973.6015625, + "learning_rate": 9.916572605650993e-05, + "loss": 2.3365, + "step": 1174 + }, + { + "epoch": 0.22024367385192128, + "grad_norm": 51313.8125, + "learning_rate": 9.916429598866232e-05, + "loss": 2.3857, + "step": 1175 + }, + { + "epoch": 0.2204311152764761, + "grad_norm": 53415.52734375, + "learning_rate": 9.916286470652036e-05, + "loss": 2.3942, + "step": 1176 + }, + { + "epoch": 0.22061855670103092, + "grad_norm": 49302.65625, + "learning_rate": 9.916143221011936e-05, + "loss": 2.3116, + "step": 1177 + }, + { + "epoch": 0.22080599812558577, + "grad_norm": 53391.375, + "learning_rate": 9.915999849949473e-05, + "loss": 2.3748, + "step": 1178 + }, + { + "epoch": 0.22099343955014059, + "grad_norm": 55496.078125, + "learning_rate": 9.915856357468188e-05, + "loss": 2.3067, + "step": 1179 + }, + { + "epoch": 0.2211808809746954, + "grad_norm": 49810.08984375, + "learning_rate": 9.915712743571624e-05, + "loss": 2.3195, + "step": 1180 + }, + { + "epoch": 0.22136832239925022, + "grad_norm": 55512.58203125, + "learning_rate": 9.915569008263328e-05, + "loss": 2.3233, + "step": 1181 + }, + { + "epoch": 0.22155576382380507, + "grad_norm": 54825.6953125, + "learning_rate": 9.915425151546852e-05, + "loss": 2.356, + "step": 1182 + }, + { + "epoch": 0.2217432052483599, + "grad_norm": 54426.0546875, + "learning_rate": 9.915281173425746e-05, + "loss": 2.439, + "step": 1183 + }, + { + "epoch": 0.2219306466729147, + "grad_norm": 52880.6484375, + "learning_rate": 9.91513707390357e-05, + "loss": 2.2518, + "step": 1184 + }, + { + "epoch": 0.22211808809746955, + "grad_norm": 49845.16796875, + "learning_rate": 9.91499285298388e-05, + "loss": 2.3729, + "step": 1185 + }, + { + "epoch": 0.22230552952202437, + "grad_norm": 59249.33984375, + "learning_rate": 9.914848510670239e-05, + "loss": 2.4187, + "step": 1186 + }, + { + "epoch": 0.2224929709465792, + "grad_norm": 49891.9375, + "learning_rate": 9.91470404696621e-05, + "loss": 2.3724, + "step": 1187 + }, + { + "epoch": 0.22268041237113403, + "grad_norm": 49549.234375, + "learning_rate": 9.914559461875365e-05, + "loss": 2.3722, + "step": 1188 + }, + { + "epoch": 0.22286785379568885, + "grad_norm": 49926.99609375, + "learning_rate": 9.914414755401272e-05, + "loss": 2.3793, + "step": 1189 + }, + { + "epoch": 0.22305529522024367, + "grad_norm": 55726.67578125, + "learning_rate": 9.914269927547507e-05, + "loss": 2.3851, + "step": 1190 + }, + { + "epoch": 0.2232427366447985, + "grad_norm": 52685.734375, + "learning_rate": 9.914124978317646e-05, + "loss": 2.3145, + "step": 1191 + }, + { + "epoch": 0.22343017806935334, + "grad_norm": 51084.0703125, + "learning_rate": 9.91397990771527e-05, + "loss": 2.3679, + "step": 1192 + }, + { + "epoch": 0.22361761949390815, + "grad_norm": 54019.05859375, + "learning_rate": 9.91383471574396e-05, + "loss": 2.3372, + "step": 1193 + }, + { + "epoch": 0.22380506091846297, + "grad_norm": 51776.70703125, + "learning_rate": 9.913689402407304e-05, + "loss": 2.3901, + "step": 1194 + }, + { + "epoch": 0.22399250234301782, + "grad_norm": 54336.74609375, + "learning_rate": 9.91354396770889e-05, + "loss": 2.4423, + "step": 1195 + }, + { + "epoch": 0.22417994376757264, + "grad_norm": 49166.09375, + "learning_rate": 9.913398411652311e-05, + "loss": 2.3268, + "step": 1196 + }, + { + "epoch": 0.22436738519212746, + "grad_norm": 53534.171875, + "learning_rate": 9.913252734241161e-05, + "loss": 2.3871, + "step": 1197 + }, + { + "epoch": 0.22455482661668227, + "grad_norm": 54338.078125, + "learning_rate": 9.91310693547904e-05, + "loss": 2.3982, + "step": 1198 + }, + { + "epoch": 0.22474226804123712, + "grad_norm": 49912.92578125, + "learning_rate": 9.912961015369546e-05, + "loss": 2.4066, + "step": 1199 + }, + { + "epoch": 0.22492970946579194, + "grad_norm": 55796.17578125, + "learning_rate": 9.912814973916287e-05, + "loss": 2.336, + "step": 1200 + }, + { + "epoch": 0.22511715089034676, + "grad_norm": 51169.76171875, + "learning_rate": 9.912668811122865e-05, + "loss": 2.2735, + "step": 1201 + }, + { + "epoch": 0.2253045923149016, + "grad_norm": 48795.48828125, + "learning_rate": 9.912522526992894e-05, + "loss": 2.3844, + "step": 1202 + }, + { + "epoch": 0.22549203373945642, + "grad_norm": 57529.23046875, + "learning_rate": 9.912376121529987e-05, + "loss": 2.381, + "step": 1203 + }, + { + "epoch": 0.22567947516401124, + "grad_norm": 56127.26171875, + "learning_rate": 9.912229594737757e-05, + "loss": 2.353, + "step": 1204 + }, + { + "epoch": 0.22586691658856609, + "grad_norm": 52075.921875, + "learning_rate": 9.912082946619825e-05, + "loss": 2.3282, + "step": 1205 + }, + { + "epoch": 0.2260543580131209, + "grad_norm": 51504.1484375, + "learning_rate": 9.911936177179813e-05, + "loss": 2.336, + "step": 1206 + }, + { + "epoch": 0.22624179943767572, + "grad_norm": 52137.875, + "learning_rate": 9.911789286421345e-05, + "loss": 2.3844, + "step": 1207 + }, + { + "epoch": 0.22642924086223054, + "grad_norm": 55204.25390625, + "learning_rate": 9.911642274348053e-05, + "loss": 2.3757, + "step": 1208 + }, + { + "epoch": 0.2266166822867854, + "grad_norm": 48393.234375, + "learning_rate": 9.91149514096356e-05, + "loss": 2.3526, + "step": 1209 + }, + { + "epoch": 0.2268041237113402, + "grad_norm": 48530.78125, + "learning_rate": 9.911347886271509e-05, + "loss": 2.3476, + "step": 1210 + }, + { + "epoch": 0.22699156513589502, + "grad_norm": 52339.27734375, + "learning_rate": 9.911200510275532e-05, + "loss": 2.4383, + "step": 1211 + }, + { + "epoch": 0.22717900656044987, + "grad_norm": 50372.56640625, + "learning_rate": 9.911053012979268e-05, + "loss": 2.3107, + "step": 1212 + }, + { + "epoch": 0.2273664479850047, + "grad_norm": 53760.828125, + "learning_rate": 9.910905394386363e-05, + "loss": 2.3412, + "step": 1213 + }, + { + "epoch": 0.2275538894095595, + "grad_norm": 49273.35546875, + "learning_rate": 9.910757654500462e-05, + "loss": 2.4361, + "step": 1214 + }, + { + "epoch": 0.22774133083411435, + "grad_norm": 53196.21875, + "learning_rate": 9.910609793325213e-05, + "loss": 2.3562, + "step": 1215 + }, + { + "epoch": 0.22792877225866917, + "grad_norm": 49282.37109375, + "learning_rate": 9.910461810864271e-05, + "loss": 2.3344, + "step": 1216 + }, + { + "epoch": 0.228116213683224, + "grad_norm": 49608.0546875, + "learning_rate": 9.910313707121288e-05, + "loss": 2.3139, + "step": 1217 + }, + { + "epoch": 0.2283036551077788, + "grad_norm": 51596.80078125, + "learning_rate": 9.910165482099921e-05, + "loss": 2.3175, + "step": 1218 + }, + { + "epoch": 0.22849109653233365, + "grad_norm": 54462.9921875, + "learning_rate": 9.910017135803832e-05, + "loss": 2.3311, + "step": 1219 + }, + { + "epoch": 0.22867853795688847, + "grad_norm": 53221.83203125, + "learning_rate": 9.90986866823669e-05, + "loss": 2.3436, + "step": 1220 + }, + { + "epoch": 0.2288659793814433, + "grad_norm": 56721.359375, + "learning_rate": 9.909720079402154e-05, + "loss": 2.3961, + "step": 1221 + }, + { + "epoch": 0.22905342080599814, + "grad_norm": 49476.82421875, + "learning_rate": 9.909571369303898e-05, + "loss": 2.3943, + "step": 1222 + }, + { + "epoch": 0.22924086223055296, + "grad_norm": 50715.91015625, + "learning_rate": 9.909422537945597e-05, + "loss": 2.2949, + "step": 1223 + }, + { + "epoch": 0.22942830365510777, + "grad_norm": 49019.64453125, + "learning_rate": 9.909273585330923e-05, + "loss": 2.3219, + "step": 1224 + }, + { + "epoch": 0.2296157450796626, + "grad_norm": 47085.26953125, + "learning_rate": 9.909124511463554e-05, + "loss": 2.403, + "step": 1225 + }, + { + "epoch": 0.22980318650421744, + "grad_norm": 52626.7890625, + "learning_rate": 9.908975316347177e-05, + "loss": 2.3515, + "step": 1226 + }, + { + "epoch": 0.22999062792877226, + "grad_norm": 52391.6171875, + "learning_rate": 9.908825999985473e-05, + "loss": 2.391, + "step": 1227 + }, + { + "epoch": 0.23017806935332707, + "grad_norm": 53122.33203125, + "learning_rate": 9.908676562382133e-05, + "loss": 2.3991, + "step": 1228 + }, + { + "epoch": 0.23036551077788192, + "grad_norm": 53021.8515625, + "learning_rate": 9.908527003540845e-05, + "loss": 2.2777, + "step": 1229 + }, + { + "epoch": 0.23055295220243674, + "grad_norm": 49909.2890625, + "learning_rate": 9.908377323465304e-05, + "loss": 2.3693, + "step": 1230 + }, + { + "epoch": 0.23074039362699156, + "grad_norm": 50074.3359375, + "learning_rate": 9.908227522159207e-05, + "loss": 2.3796, + "step": 1231 + }, + { + "epoch": 0.2309278350515464, + "grad_norm": 52894.13671875, + "learning_rate": 9.908077599626253e-05, + "loss": 2.3283, + "step": 1232 + }, + { + "epoch": 0.23111527647610122, + "grad_norm": 52441.08984375, + "learning_rate": 9.907927555870147e-05, + "loss": 2.3346, + "step": 1233 + }, + { + "epoch": 0.23130271790065604, + "grad_norm": 53246.43359375, + "learning_rate": 9.907777390894592e-05, + "loss": 2.3869, + "step": 1234 + }, + { + "epoch": 0.23149015932521086, + "grad_norm": 50695.44140625, + "learning_rate": 9.907627104703301e-05, + "loss": 2.3357, + "step": 1235 + }, + { + "epoch": 0.2316776007497657, + "grad_norm": 54787.6953125, + "learning_rate": 9.907476697299982e-05, + "loss": 2.4824, + "step": 1236 + }, + { + "epoch": 0.23186504217432052, + "grad_norm": 53779.0390625, + "learning_rate": 9.907326168688351e-05, + "loss": 2.3816, + "step": 1237 + }, + { + "epoch": 0.23205248359887534, + "grad_norm": 51710.18359375, + "learning_rate": 9.907175518872126e-05, + "loss": 2.3435, + "step": 1238 + }, + { + "epoch": 0.2322399250234302, + "grad_norm": 53226.53125, + "learning_rate": 9.907024747855028e-05, + "loss": 2.296, + "step": 1239 + }, + { + "epoch": 0.232427366447985, + "grad_norm": 48031.2265625, + "learning_rate": 9.906873855640781e-05, + "loss": 2.436, + "step": 1240 + }, + { + "epoch": 0.23261480787253982, + "grad_norm": 51441.74609375, + "learning_rate": 9.906722842233112e-05, + "loss": 2.4487, + "step": 1241 + }, + { + "epoch": 0.23280224929709467, + "grad_norm": 53677.0234375, + "learning_rate": 9.90657170763575e-05, + "loss": 2.3406, + "step": 1242 + }, + { + "epoch": 0.2329896907216495, + "grad_norm": 49318.09765625, + "learning_rate": 9.906420451852427e-05, + "loss": 2.3432, + "step": 1243 + }, + { + "epoch": 0.2331771321462043, + "grad_norm": 54931.9296875, + "learning_rate": 9.906269074886882e-05, + "loss": 2.3992, + "step": 1244 + }, + { + "epoch": 0.23336457357075913, + "grad_norm": 50977.19140625, + "learning_rate": 9.90611757674285e-05, + "loss": 2.3722, + "step": 1245 + }, + { + "epoch": 0.23355201499531397, + "grad_norm": 47543.2890625, + "learning_rate": 9.905965957424077e-05, + "loss": 2.391, + "step": 1246 + }, + { + "epoch": 0.2337394564198688, + "grad_norm": 51831.4375, + "learning_rate": 9.905814216934304e-05, + "loss": 2.3413, + "step": 1247 + }, + { + "epoch": 0.2339268978444236, + "grad_norm": 55583.2421875, + "learning_rate": 9.90566235527728e-05, + "loss": 2.3586, + "step": 1248 + }, + { + "epoch": 0.23411433926897846, + "grad_norm": 49569.35546875, + "learning_rate": 9.905510372456758e-05, + "loss": 2.3772, + "step": 1249 + }, + { + "epoch": 0.23430178069353327, + "grad_norm": 52024.9375, + "learning_rate": 9.90535826847649e-05, + "loss": 2.3522, + "step": 1250 + }, + { + "epoch": 0.2344892221180881, + "grad_norm": 56629.6171875, + "learning_rate": 9.905206043340231e-05, + "loss": 2.3391, + "step": 1251 + }, + { + "epoch": 0.2346766635426429, + "grad_norm": 56355.99609375, + "learning_rate": 9.905053697051743e-05, + "loss": 2.3272, + "step": 1252 + }, + { + "epoch": 0.23486410496719776, + "grad_norm": 51314.06640625, + "learning_rate": 9.904901229614788e-05, + "loss": 2.3139, + "step": 1253 + }, + { + "epoch": 0.23505154639175257, + "grad_norm": 55139.5546875, + "learning_rate": 9.904748641033133e-05, + "loss": 2.3951, + "step": 1254 + }, + { + "epoch": 0.2352389878163074, + "grad_norm": 52428.3984375, + "learning_rate": 9.904595931310543e-05, + "loss": 2.3876, + "step": 1255 + }, + { + "epoch": 0.23542642924086224, + "grad_norm": 56144.03125, + "learning_rate": 9.904443100450796e-05, + "loss": 2.3474, + "step": 1256 + }, + { + "epoch": 0.23561387066541706, + "grad_norm": 49307.91015625, + "learning_rate": 9.90429014845766e-05, + "loss": 2.3804, + "step": 1257 + }, + { + "epoch": 0.23580131208997188, + "grad_norm": 57053.38671875, + "learning_rate": 9.904137075334917e-05, + "loss": 2.3612, + "step": 1258 + }, + { + "epoch": 0.23598875351452672, + "grad_norm": 50306.546875, + "learning_rate": 9.903983881086347e-05, + "loss": 2.4618, + "step": 1259 + }, + { + "epoch": 0.23617619493908154, + "grad_norm": 50676.10546875, + "learning_rate": 9.903830565715732e-05, + "loss": 2.3098, + "step": 1260 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 51969.40234375, + "learning_rate": 9.903677129226861e-05, + "loss": 2.3631, + "step": 1261 + }, + { + "epoch": 0.23655107778819118, + "grad_norm": 52876.4375, + "learning_rate": 9.903523571623523e-05, + "loss": 2.3728, + "step": 1262 + }, + { + "epoch": 0.23673851921274602, + "grad_norm": 52753.31640625, + "learning_rate": 9.90336989290951e-05, + "loss": 2.3663, + "step": 1263 + }, + { + "epoch": 0.23692596063730084, + "grad_norm": 51694.80078125, + "learning_rate": 9.903216093088618e-05, + "loss": 2.3638, + "step": 1264 + }, + { + "epoch": 0.23711340206185566, + "grad_norm": 48634.04296875, + "learning_rate": 9.903062172164645e-05, + "loss": 2.3065, + "step": 1265 + }, + { + "epoch": 0.2373008434864105, + "grad_norm": 51819.39453125, + "learning_rate": 9.902908130141393e-05, + "loss": 2.3804, + "step": 1266 + }, + { + "epoch": 0.23748828491096532, + "grad_norm": 52582.8359375, + "learning_rate": 9.902753967022668e-05, + "loss": 2.3852, + "step": 1267 + }, + { + "epoch": 0.23767572633552014, + "grad_norm": 51956.921875, + "learning_rate": 9.902599682812275e-05, + "loss": 2.3352, + "step": 1268 + }, + { + "epoch": 0.237863167760075, + "grad_norm": 52779.453125, + "learning_rate": 9.902445277514028e-05, + "loss": 2.3959, + "step": 1269 + }, + { + "epoch": 0.2380506091846298, + "grad_norm": 52412.76953125, + "learning_rate": 9.902290751131737e-05, + "loss": 2.32, + "step": 1270 + }, + { + "epoch": 0.23823805060918463, + "grad_norm": 57103.06640625, + "learning_rate": 9.902136103669221e-05, + "loss": 2.3897, + "step": 1271 + }, + { + "epoch": 0.23842549203373944, + "grad_norm": 50866.37890625, + "learning_rate": 9.9019813351303e-05, + "loss": 2.3776, + "step": 1272 + }, + { + "epoch": 0.2386129334582943, + "grad_norm": 58020.03125, + "learning_rate": 9.901826445518795e-05, + "loss": 2.3475, + "step": 1273 + }, + { + "epoch": 0.2388003748828491, + "grad_norm": 52418.6171875, + "learning_rate": 9.901671434838532e-05, + "loss": 2.4103, + "step": 1274 + }, + { + "epoch": 0.23898781630740393, + "grad_norm": 57570.765625, + "learning_rate": 9.901516303093338e-05, + "loss": 2.3166, + "step": 1275 + }, + { + "epoch": 0.23917525773195877, + "grad_norm": 60854.65625, + "learning_rate": 9.901361050287049e-05, + "loss": 2.3288, + "step": 1276 + }, + { + "epoch": 0.2393626991565136, + "grad_norm": 50253.7265625, + "learning_rate": 9.901205676423495e-05, + "loss": 2.3962, + "step": 1277 + }, + { + "epoch": 0.2395501405810684, + "grad_norm": 51420.41015625, + "learning_rate": 9.901050181506517e-05, + "loss": 2.3913, + "step": 1278 + }, + { + "epoch": 0.23973758200562326, + "grad_norm": 52508.11328125, + "learning_rate": 9.900894565539953e-05, + "loss": 2.3208, + "step": 1279 + }, + { + "epoch": 0.23992502343017807, + "grad_norm": 47417.17578125, + "learning_rate": 9.900738828527646e-05, + "loss": 2.3462, + "step": 1280 + }, + { + "epoch": 0.2401124648547329, + "grad_norm": 54909.26953125, + "learning_rate": 9.900582970473445e-05, + "loss": 2.3924, + "step": 1281 + }, + { + "epoch": 0.2402999062792877, + "grad_norm": 49962.16015625, + "learning_rate": 9.900426991381198e-05, + "loss": 2.3219, + "step": 1282 + }, + { + "epoch": 0.24048734770384256, + "grad_norm": 50745.84375, + "learning_rate": 9.900270891254759e-05, + "loss": 2.3894, + "step": 1283 + }, + { + "epoch": 0.24067478912839738, + "grad_norm": 51366.79296875, + "learning_rate": 9.90011467009798e-05, + "loss": 2.3082, + "step": 1284 + }, + { + "epoch": 0.2408622305529522, + "grad_norm": 53778.40625, + "learning_rate": 9.899958327914722e-05, + "loss": 2.3737, + "step": 1285 + }, + { + "epoch": 0.24104967197750704, + "grad_norm": 50884.50390625, + "learning_rate": 9.899801864708849e-05, + "loss": 2.4489, + "step": 1286 + }, + { + "epoch": 0.24123711340206186, + "grad_norm": 51854.80859375, + "learning_rate": 9.89964528048422e-05, + "loss": 2.3276, + "step": 1287 + }, + { + "epoch": 0.24142455482661668, + "grad_norm": 49301.49609375, + "learning_rate": 9.899488575244706e-05, + "loss": 2.3089, + "step": 1288 + }, + { + "epoch": 0.2416119962511715, + "grad_norm": 52543.765625, + "learning_rate": 9.899331748994176e-05, + "loss": 2.3336, + "step": 1289 + }, + { + "epoch": 0.24179943767572634, + "grad_norm": 51347.41796875, + "learning_rate": 9.899174801736504e-05, + "loss": 2.2825, + "step": 1290 + }, + { + "epoch": 0.24198687910028116, + "grad_norm": 54380.5546875, + "learning_rate": 9.899017733475566e-05, + "loss": 2.3175, + "step": 1291 + }, + { + "epoch": 0.24217432052483598, + "grad_norm": 50387.54296875, + "learning_rate": 9.898860544215242e-05, + "loss": 2.4162, + "step": 1292 + }, + { + "epoch": 0.24236176194939082, + "grad_norm": 47797.4375, + "learning_rate": 9.898703233959416e-05, + "loss": 2.3727, + "step": 1293 + }, + { + "epoch": 0.24254920337394564, + "grad_norm": 51167.7734375, + "learning_rate": 9.898545802711969e-05, + "loss": 2.2739, + "step": 1294 + }, + { + "epoch": 0.24273664479850046, + "grad_norm": 53382.09375, + "learning_rate": 9.898388250476792e-05, + "loss": 2.3414, + "step": 1295 + }, + { + "epoch": 0.2429240862230553, + "grad_norm": 54196.38671875, + "learning_rate": 9.898230577257777e-05, + "loss": 2.3361, + "step": 1296 + }, + { + "epoch": 0.24311152764761013, + "grad_norm": 53105.36328125, + "learning_rate": 9.898072783058816e-05, + "loss": 2.2664, + "step": 1297 + }, + { + "epoch": 0.24329896907216494, + "grad_norm": 50730.71484375, + "learning_rate": 9.897914867883808e-05, + "loss": 2.3193, + "step": 1298 + }, + { + "epoch": 0.24348641049671976, + "grad_norm": 50765.32421875, + "learning_rate": 9.897756831736654e-05, + "loss": 2.3134, + "step": 1299 + }, + { + "epoch": 0.2436738519212746, + "grad_norm": 49563.65625, + "learning_rate": 9.897598674621255e-05, + "loss": 2.3312, + "step": 1300 + }, + { + "epoch": 0.24386129334582943, + "grad_norm": 58552.77734375, + "learning_rate": 9.89744039654152e-05, + "loss": 2.3726, + "step": 1301 + }, + { + "epoch": 0.24404873477038425, + "grad_norm": 55141.71875, + "learning_rate": 9.897281997501355e-05, + "loss": 2.4103, + "step": 1302 + }, + { + "epoch": 0.2442361761949391, + "grad_norm": 48830.97265625, + "learning_rate": 9.897123477504676e-05, + "loss": 2.4477, + "step": 1303 + }, + { + "epoch": 0.2444236176194939, + "grad_norm": 53646.09375, + "learning_rate": 9.896964836555396e-05, + "loss": 2.294, + "step": 1304 + }, + { + "epoch": 0.24461105904404873, + "grad_norm": 52994.1484375, + "learning_rate": 9.896806074657433e-05, + "loss": 2.3463, + "step": 1305 + }, + { + "epoch": 0.24479850046860357, + "grad_norm": 50343.0390625, + "learning_rate": 9.896647191814708e-05, + "loss": 2.3375, + "step": 1306 + }, + { + "epoch": 0.2449859418931584, + "grad_norm": 53703.921875, + "learning_rate": 9.896488188031148e-05, + "loss": 2.4368, + "step": 1307 + }, + { + "epoch": 0.2451733833177132, + "grad_norm": 51910.94140625, + "learning_rate": 9.896329063310677e-05, + "loss": 2.383, + "step": 1308 + }, + { + "epoch": 0.24536082474226803, + "grad_norm": 51780.03125, + "learning_rate": 9.896169817657226e-05, + "loss": 2.3037, + "step": 1309 + }, + { + "epoch": 0.24554826616682288, + "grad_norm": 52444.59375, + "learning_rate": 9.89601045107473e-05, + "loss": 2.4538, + "step": 1310 + }, + { + "epoch": 0.2457357075913777, + "grad_norm": 56958.15625, + "learning_rate": 9.895850963567121e-05, + "loss": 2.3655, + "step": 1311 + }, + { + "epoch": 0.2459231490159325, + "grad_norm": 53276.87109375, + "learning_rate": 9.895691355138342e-05, + "loss": 2.3557, + "step": 1312 + }, + { + "epoch": 0.24611059044048736, + "grad_norm": 51760.71875, + "learning_rate": 9.895531625792334e-05, + "loss": 2.3926, + "step": 1313 + }, + { + "epoch": 0.24629803186504218, + "grad_norm": 54516.28125, + "learning_rate": 9.895371775533042e-05, + "loss": 2.3433, + "step": 1314 + }, + { + "epoch": 0.246485473289597, + "grad_norm": 55060.9140625, + "learning_rate": 9.895211804364415e-05, + "loss": 2.3448, + "step": 1315 + }, + { + "epoch": 0.24667291471415181, + "grad_norm": 50857.7265625, + "learning_rate": 9.895051712290402e-05, + "loss": 2.4612, + "step": 1316 + }, + { + "epoch": 0.24686035613870666, + "grad_norm": 57799.07421875, + "learning_rate": 9.894891499314958e-05, + "loss": 2.4254, + "step": 1317 + }, + { + "epoch": 0.24704779756326148, + "grad_norm": 55423.94921875, + "learning_rate": 9.894731165442041e-05, + "loss": 2.3629, + "step": 1318 + }, + { + "epoch": 0.2472352389878163, + "grad_norm": 51489.19921875, + "learning_rate": 9.89457071067561e-05, + "loss": 2.3181, + "step": 1319 + }, + { + "epoch": 0.24742268041237114, + "grad_norm": 55108.08984375, + "learning_rate": 9.894410135019628e-05, + "loss": 2.3334, + "step": 1320 + }, + { + "epoch": 0.24761012183692596, + "grad_norm": 51174.6484375, + "learning_rate": 9.894249438478062e-05, + "loss": 2.3486, + "step": 1321 + }, + { + "epoch": 0.24779756326148078, + "grad_norm": 52529.296875, + "learning_rate": 9.89408862105488e-05, + "loss": 2.3897, + "step": 1322 + }, + { + "epoch": 0.24798500468603563, + "grad_norm": 53991.54296875, + "learning_rate": 9.893927682754056e-05, + "loss": 2.3493, + "step": 1323 + }, + { + "epoch": 0.24817244611059044, + "grad_norm": 53192.95703125, + "learning_rate": 9.893766623579561e-05, + "loss": 2.2711, + "step": 1324 + }, + { + "epoch": 0.24835988753514526, + "grad_norm": 50466.43359375, + "learning_rate": 9.893605443535377e-05, + "loss": 2.4143, + "step": 1325 + }, + { + "epoch": 0.24854732895970008, + "grad_norm": 52885.82421875, + "learning_rate": 9.893444142625481e-05, + "loss": 2.2991, + "step": 1326 + }, + { + "epoch": 0.24873477038425493, + "grad_norm": 53686.796875, + "learning_rate": 9.89328272085386e-05, + "loss": 2.3348, + "step": 1327 + }, + { + "epoch": 0.24892221180880975, + "grad_norm": 55671.53125, + "learning_rate": 9.893121178224501e-05, + "loss": 2.3337, + "step": 1328 + }, + { + "epoch": 0.24910965323336456, + "grad_norm": 55160.98046875, + "learning_rate": 9.892959514741394e-05, + "loss": 2.3237, + "step": 1329 + }, + { + "epoch": 0.2492970946579194, + "grad_norm": 52382.41015625, + "learning_rate": 9.892797730408527e-05, + "loss": 2.4178, + "step": 1330 + }, + { + "epoch": 0.24948453608247423, + "grad_norm": 52081.0859375, + "learning_rate": 9.892635825229904e-05, + "loss": 2.3461, + "step": 1331 + }, + { + "epoch": 0.24967197750702905, + "grad_norm": 47533.21875, + "learning_rate": 9.892473799209516e-05, + "loss": 2.3469, + "step": 1332 + }, + { + "epoch": 0.2498594189315839, + "grad_norm": 50674.95703125, + "learning_rate": 9.89231165235137e-05, + "loss": 2.2812, + "step": 1333 + }, + { + "epoch": 0.2500468603561387, + "grad_norm": 55816.2890625, + "learning_rate": 9.892149384659468e-05, + "loss": 2.3477, + "step": 1334 + }, + { + "epoch": 0.25023430178069356, + "grad_norm": 50699.8359375, + "learning_rate": 9.891986996137821e-05, + "loss": 2.3667, + "step": 1335 + }, + { + "epoch": 0.25042174320524835, + "grad_norm": 51371.08984375, + "learning_rate": 9.891824486790436e-05, + "loss": 2.3516, + "step": 1336 + }, + { + "epoch": 0.2506091846298032, + "grad_norm": 52705.3828125, + "learning_rate": 9.891661856621329e-05, + "loss": 2.3026, + "step": 1337 + }, + { + "epoch": 0.25079662605435804, + "grad_norm": 51108.578125, + "learning_rate": 9.891499105634517e-05, + "loss": 2.3059, + "step": 1338 + }, + { + "epoch": 0.25098406747891283, + "grad_norm": 49818.88671875, + "learning_rate": 9.891336233834018e-05, + "loss": 2.3818, + "step": 1339 + }, + { + "epoch": 0.2511715089034677, + "grad_norm": 54867.625, + "learning_rate": 9.891173241223857e-05, + "loss": 2.3625, + "step": 1340 + }, + { + "epoch": 0.25135895032802247, + "grad_norm": 53528.9140625, + "learning_rate": 9.891010127808055e-05, + "loss": 2.481, + "step": 1341 + }, + { + "epoch": 0.2515463917525773, + "grad_norm": 56732.45703125, + "learning_rate": 9.890846893590649e-05, + "loss": 2.3126, + "step": 1342 + }, + { + "epoch": 0.25173383317713216, + "grad_norm": 49651.4765625, + "learning_rate": 9.890683538575663e-05, + "loss": 2.3493, + "step": 1343 + }, + { + "epoch": 0.25192127460168695, + "grad_norm": 53591.671875, + "learning_rate": 9.890520062767135e-05, + "loss": 2.3914, + "step": 1344 + }, + { + "epoch": 0.2521087160262418, + "grad_norm": 50831.078125, + "learning_rate": 9.890356466169102e-05, + "loss": 2.3162, + "step": 1345 + }, + { + "epoch": 0.25229615745079664, + "grad_norm": 53780.87890625, + "learning_rate": 9.890192748785606e-05, + "loss": 2.3215, + "step": 1346 + }, + { + "epoch": 0.25248359887535143, + "grad_norm": 51535.50390625, + "learning_rate": 9.890028910620686e-05, + "loss": 2.3449, + "step": 1347 + }, + { + "epoch": 0.2526710402999063, + "grad_norm": 52434.76171875, + "learning_rate": 9.889864951678395e-05, + "loss": 2.351, + "step": 1348 + }, + { + "epoch": 0.2528584817244611, + "grad_norm": 51873.74609375, + "learning_rate": 9.889700871962779e-05, + "loss": 2.3166, + "step": 1349 + }, + { + "epoch": 0.2530459231490159, + "grad_norm": 50367.90234375, + "learning_rate": 9.88953667147789e-05, + "loss": 2.339, + "step": 1350 + }, + { + "epoch": 0.25323336457357076, + "grad_norm": 54705.515625, + "learning_rate": 9.889372350227785e-05, + "loss": 2.4332, + "step": 1351 + }, + { + "epoch": 0.2534208059981256, + "grad_norm": 49972.234375, + "learning_rate": 9.889207908216523e-05, + "loss": 2.3651, + "step": 1352 + }, + { + "epoch": 0.2536082474226804, + "grad_norm": 52768.80859375, + "learning_rate": 9.889043345448164e-05, + "loss": 2.4397, + "step": 1353 + }, + { + "epoch": 0.25379568884723525, + "grad_norm": 56043.6875, + "learning_rate": 9.888878661926772e-05, + "loss": 2.3972, + "step": 1354 + }, + { + "epoch": 0.2539831302717901, + "grad_norm": 55601.77734375, + "learning_rate": 9.888713857656418e-05, + "loss": 2.4307, + "step": 1355 + }, + { + "epoch": 0.2541705716963449, + "grad_norm": 50787.890625, + "learning_rate": 9.888548932641168e-05, + "loss": 2.3383, + "step": 1356 + }, + { + "epoch": 0.25435801312089973, + "grad_norm": 49568.43359375, + "learning_rate": 9.888383886885098e-05, + "loss": 2.3538, + "step": 1357 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 52651.68359375, + "learning_rate": 9.888218720392283e-05, + "loss": 2.3312, + "step": 1358 + }, + { + "epoch": 0.25473289597000937, + "grad_norm": 49651.671875, + "learning_rate": 9.888053433166805e-05, + "loss": 2.3427, + "step": 1359 + }, + { + "epoch": 0.2549203373945642, + "grad_norm": 54023.703125, + "learning_rate": 9.887888025212744e-05, + "loss": 2.3534, + "step": 1360 + }, + { + "epoch": 0.255107778819119, + "grad_norm": 52507.5078125, + "learning_rate": 9.887722496534187e-05, + "loss": 2.357, + "step": 1361 + }, + { + "epoch": 0.25529522024367385, + "grad_norm": 54502.875, + "learning_rate": 9.88755684713522e-05, + "loss": 2.3603, + "step": 1362 + }, + { + "epoch": 0.2554826616682287, + "grad_norm": 53533.0390625, + "learning_rate": 9.887391077019936e-05, + "loss": 2.3136, + "step": 1363 + }, + { + "epoch": 0.2556701030927835, + "grad_norm": 53445.96875, + "learning_rate": 9.887225186192429e-05, + "loss": 2.3486, + "step": 1364 + }, + { + "epoch": 0.25585754451733833, + "grad_norm": 51840.48046875, + "learning_rate": 9.887059174656796e-05, + "loss": 2.2891, + "step": 1365 + }, + { + "epoch": 0.2560449859418932, + "grad_norm": 54445.390625, + "learning_rate": 9.886893042417139e-05, + "loss": 2.3211, + "step": 1366 + }, + { + "epoch": 0.25623242736644797, + "grad_norm": 55333.3203125, + "learning_rate": 9.886726789477558e-05, + "loss": 2.3514, + "step": 1367 + }, + { + "epoch": 0.2564198687910028, + "grad_norm": 54913.51171875, + "learning_rate": 9.886560415842161e-05, + "loss": 2.3608, + "step": 1368 + }, + { + "epoch": 0.25660731021555766, + "grad_norm": 50044.50390625, + "learning_rate": 9.886393921515058e-05, + "loss": 2.4221, + "step": 1369 + }, + { + "epoch": 0.25679475164011245, + "grad_norm": 53087.5078125, + "learning_rate": 9.88622730650036e-05, + "loss": 2.3618, + "step": 1370 + }, + { + "epoch": 0.2569821930646673, + "grad_norm": 52626.94140625, + "learning_rate": 9.886060570802182e-05, + "loss": 2.3917, + "step": 1371 + }, + { + "epoch": 0.25716963448922214, + "grad_norm": 50659.41796875, + "learning_rate": 9.885893714424645e-05, + "loss": 2.3698, + "step": 1372 + }, + { + "epoch": 0.25735707591377693, + "grad_norm": 49438.8125, + "learning_rate": 9.885726737371867e-05, + "loss": 2.3874, + "step": 1373 + }, + { + "epoch": 0.2575445173383318, + "grad_norm": 52272.515625, + "learning_rate": 9.885559639647972e-05, + "loss": 2.3906, + "step": 1374 + }, + { + "epoch": 0.25773195876288657, + "grad_norm": 52497.51171875, + "learning_rate": 9.885392421257089e-05, + "loss": 2.3505, + "step": 1375 + }, + { + "epoch": 0.2579194001874414, + "grad_norm": 52873.81640625, + "learning_rate": 9.885225082203347e-05, + "loss": 2.3064, + "step": 1376 + }, + { + "epoch": 0.25810684161199626, + "grad_norm": 49285.40625, + "learning_rate": 9.885057622490879e-05, + "loss": 2.3056, + "step": 1377 + }, + { + "epoch": 0.25829428303655105, + "grad_norm": 53746.0546875, + "learning_rate": 9.884890042123822e-05, + "loss": 2.3584, + "step": 1378 + }, + { + "epoch": 0.2584817244611059, + "grad_norm": 47676.46875, + "learning_rate": 9.884722341106314e-05, + "loss": 2.3414, + "step": 1379 + }, + { + "epoch": 0.25866916588566075, + "grad_norm": 50006.0078125, + "learning_rate": 9.884554519442497e-05, + "loss": 2.3796, + "step": 1380 + }, + { + "epoch": 0.25885660731021554, + "grad_norm": 49407.73046875, + "learning_rate": 9.884386577136516e-05, + "loss": 2.3614, + "step": 1381 + }, + { + "epoch": 0.2590440487347704, + "grad_norm": 53207.5859375, + "learning_rate": 9.88421851419252e-05, + "loss": 2.3465, + "step": 1382 + }, + { + "epoch": 0.25923149015932523, + "grad_norm": 51657.90625, + "learning_rate": 9.884050330614658e-05, + "loss": 2.3111, + "step": 1383 + }, + { + "epoch": 0.25941893158388, + "grad_norm": 49662.9140625, + "learning_rate": 9.883882026407088e-05, + "loss": 2.4088, + "step": 1384 + }, + { + "epoch": 0.25960637300843487, + "grad_norm": 48905.5703125, + "learning_rate": 9.883713601573961e-05, + "loss": 2.4081, + "step": 1385 + }, + { + "epoch": 0.2597938144329897, + "grad_norm": 48419.3828125, + "learning_rate": 9.88354505611944e-05, + "loss": 2.3615, + "step": 1386 + }, + { + "epoch": 0.2599812558575445, + "grad_norm": 52475.96875, + "learning_rate": 9.88337639004769e-05, + "loss": 2.3617, + "step": 1387 + }, + { + "epoch": 0.26016869728209935, + "grad_norm": 58279.8359375, + "learning_rate": 9.883207603362871e-05, + "loss": 2.3423, + "step": 1388 + }, + { + "epoch": 0.2603561387066542, + "grad_norm": 53125.4609375, + "learning_rate": 9.883038696069159e-05, + "loss": 2.3693, + "step": 1389 + }, + { + "epoch": 0.260543580131209, + "grad_norm": 49533.7109375, + "learning_rate": 9.88286966817072e-05, + "loss": 2.3273, + "step": 1390 + }, + { + "epoch": 0.26073102155576383, + "grad_norm": 54546.94140625, + "learning_rate": 9.882700519671733e-05, + "loss": 2.2869, + "step": 1391 + }, + { + "epoch": 0.2609184629803187, + "grad_norm": 54809.45703125, + "learning_rate": 9.882531250576372e-05, + "loss": 2.4108, + "step": 1392 + }, + { + "epoch": 0.26110590440487347, + "grad_norm": 51016.84375, + "learning_rate": 9.88236186088882e-05, + "loss": 2.3676, + "step": 1393 + }, + { + "epoch": 0.2612933458294283, + "grad_norm": 54254.125, + "learning_rate": 9.882192350613259e-05, + "loss": 2.37, + "step": 1394 + }, + { + "epoch": 0.2614807872539831, + "grad_norm": 95217.1484375, + "learning_rate": 9.882022719753877e-05, + "loss": 2.6092, + "step": 1395 + }, + { + "epoch": 0.26166822867853795, + "grad_norm": 52571.88671875, + "learning_rate": 9.881852968314864e-05, + "loss": 2.4273, + "step": 1396 + }, + { + "epoch": 0.2618556701030928, + "grad_norm": 51988.51953125, + "learning_rate": 9.881683096300413e-05, + "loss": 2.3394, + "step": 1397 + }, + { + "epoch": 0.2620431115276476, + "grad_norm": 54691.703125, + "learning_rate": 9.881513103714717e-05, + "loss": 2.3187, + "step": 1398 + }, + { + "epoch": 0.26223055295220243, + "grad_norm": 50704.26953125, + "learning_rate": 9.881342990561977e-05, + "loss": 2.339, + "step": 1399 + }, + { + "epoch": 0.2624179943767573, + "grad_norm": 51262.72265625, + "learning_rate": 9.881172756846396e-05, + "loss": 2.3862, + "step": 1400 + }, + { + "epoch": 0.26260543580131207, + "grad_norm": 50126.97265625, + "learning_rate": 9.881002402572173e-05, + "loss": 2.3907, + "step": 1401 + }, + { + "epoch": 0.2627928772258669, + "grad_norm": 55727.92578125, + "learning_rate": 9.880831927743521e-05, + "loss": 2.2897, + "step": 1402 + }, + { + "epoch": 0.26298031865042176, + "grad_norm": 51779.55078125, + "learning_rate": 9.880661332364648e-05, + "loss": 2.3562, + "step": 1403 + }, + { + "epoch": 0.26316776007497655, + "grad_norm": 54671.90234375, + "learning_rate": 9.880490616439768e-05, + "loss": 2.3511, + "step": 1404 + }, + { + "epoch": 0.2633552014995314, + "grad_norm": 49939.88671875, + "learning_rate": 9.880319779973098e-05, + "loss": 2.2896, + "step": 1405 + }, + { + "epoch": 0.26354264292408625, + "grad_norm": 56144.0859375, + "learning_rate": 9.880148822968855e-05, + "loss": 2.4145, + "step": 1406 + }, + { + "epoch": 0.26373008434864104, + "grad_norm": 55455.859375, + "learning_rate": 9.879977745431265e-05, + "loss": 2.3783, + "step": 1407 + }, + { + "epoch": 0.2639175257731959, + "grad_norm": 46440.1875, + "learning_rate": 9.87980654736455e-05, + "loss": 2.3344, + "step": 1408 + }, + { + "epoch": 0.26410496719775073, + "grad_norm": 57235.109375, + "learning_rate": 9.879635228772942e-05, + "loss": 2.3423, + "step": 1409 + }, + { + "epoch": 0.2642924086223055, + "grad_norm": 51981.23046875, + "learning_rate": 9.87946378966067e-05, + "loss": 2.3915, + "step": 1410 + }, + { + "epoch": 0.26447985004686037, + "grad_norm": 53377.48828125, + "learning_rate": 9.879292230031968e-05, + "loss": 2.4603, + "step": 1411 + }, + { + "epoch": 0.26466729147141516, + "grad_norm": 51476.21484375, + "learning_rate": 9.879120549891074e-05, + "loss": 2.3548, + "step": 1412 + }, + { + "epoch": 0.26485473289597, + "grad_norm": 51707.52734375, + "learning_rate": 9.878948749242229e-05, + "loss": 2.3488, + "step": 1413 + }, + { + "epoch": 0.26504217432052485, + "grad_norm": 47956.40625, + "learning_rate": 9.878776828089675e-05, + "loss": 2.358, + "step": 1414 + }, + { + "epoch": 0.26522961574507964, + "grad_norm": 50520.53125, + "learning_rate": 9.878604786437659e-05, + "loss": 2.3675, + "step": 1415 + }, + { + "epoch": 0.2654170571696345, + "grad_norm": 54494.02734375, + "learning_rate": 9.87843262429043e-05, + "loss": 2.3609, + "step": 1416 + }, + { + "epoch": 0.26560449859418933, + "grad_norm": 52166.69921875, + "learning_rate": 9.87826034165224e-05, + "loss": 2.3605, + "step": 1417 + }, + { + "epoch": 0.2657919400187441, + "grad_norm": 56560.453125, + "learning_rate": 9.878087938527345e-05, + "loss": 2.3267, + "step": 1418 + }, + { + "epoch": 0.26597938144329897, + "grad_norm": 52592.51953125, + "learning_rate": 9.877915414920002e-05, + "loss": 2.3091, + "step": 1419 + }, + { + "epoch": 0.2661668228678538, + "grad_norm": 59762.33984375, + "learning_rate": 9.877742770834471e-05, + "loss": 2.2953, + "step": 1420 + }, + { + "epoch": 0.2663542642924086, + "grad_norm": 53837.8828125, + "learning_rate": 9.87757000627502e-05, + "loss": 2.3534, + "step": 1421 + }, + { + "epoch": 0.26654170571696345, + "grad_norm": 51664.84765625, + "learning_rate": 9.877397121245913e-05, + "loss": 2.3185, + "step": 1422 + }, + { + "epoch": 0.2667291471415183, + "grad_norm": 49015.25390625, + "learning_rate": 9.87722411575142e-05, + "loss": 2.4127, + "step": 1423 + }, + { + "epoch": 0.2669165885660731, + "grad_norm": 49396.3671875, + "learning_rate": 9.877050989795816e-05, + "loss": 2.3419, + "step": 1424 + }, + { + "epoch": 0.26710402999062793, + "grad_norm": 54846.62109375, + "learning_rate": 9.876877743383375e-05, + "loss": 2.4281, + "step": 1425 + }, + { + "epoch": 0.2672914714151828, + "grad_norm": 50743.72265625, + "learning_rate": 9.876704376518375e-05, + "loss": 2.3185, + "step": 1426 + }, + { + "epoch": 0.26747891283973757, + "grad_norm": 49067.19140625, + "learning_rate": 9.876530889205102e-05, + "loss": 2.3501, + "step": 1427 + }, + { + "epoch": 0.2676663542642924, + "grad_norm": 48754.23046875, + "learning_rate": 9.876357281447839e-05, + "loss": 2.3568, + "step": 1428 + }, + { + "epoch": 0.26785379568884726, + "grad_norm": 51102.05078125, + "learning_rate": 9.876183553250871e-05, + "loss": 2.3805, + "step": 1429 + }, + { + "epoch": 0.26804123711340205, + "grad_norm": 52709.22265625, + "learning_rate": 9.876009704618494e-05, + "loss": 2.3439, + "step": 1430 + }, + { + "epoch": 0.2682286785379569, + "grad_norm": 53397.75, + "learning_rate": 9.875835735554997e-05, + "loss": 2.3434, + "step": 1431 + }, + { + "epoch": 0.2684161199625117, + "grad_norm": 49590.12109375, + "learning_rate": 9.87566164606468e-05, + "loss": 2.3138, + "step": 1432 + }, + { + "epoch": 0.26860356138706654, + "grad_norm": 50603.36328125, + "learning_rate": 9.87548743615184e-05, + "loss": 2.3592, + "step": 1433 + }, + { + "epoch": 0.2687910028116214, + "grad_norm": 53580.9140625, + "learning_rate": 9.875313105820785e-05, + "loss": 2.3627, + "step": 1434 + }, + { + "epoch": 0.2689784442361762, + "grad_norm": 47986.58203125, + "learning_rate": 9.875138655075815e-05, + "loss": 2.3711, + "step": 1435 + }, + { + "epoch": 0.269165885660731, + "grad_norm": 53497.50390625, + "learning_rate": 9.874964083921241e-05, + "loss": 2.4529, + "step": 1436 + }, + { + "epoch": 0.26935332708528587, + "grad_norm": 49420.28515625, + "learning_rate": 9.874789392361374e-05, + "loss": 2.3706, + "step": 1437 + }, + { + "epoch": 0.26954076850984066, + "grad_norm": 48772.0546875, + "learning_rate": 9.87461458040053e-05, + "loss": 2.3418, + "step": 1438 + }, + { + "epoch": 0.2697282099343955, + "grad_norm": 56424.52734375, + "learning_rate": 9.874439648043026e-05, + "loss": 2.3267, + "step": 1439 + }, + { + "epoch": 0.26991565135895035, + "grad_norm": 54224.796875, + "learning_rate": 9.874264595293182e-05, + "loss": 2.3287, + "step": 1440 + }, + { + "epoch": 0.27010309278350514, + "grad_norm": 53291.9765625, + "learning_rate": 9.874089422155324e-05, + "loss": 2.3358, + "step": 1441 + }, + { + "epoch": 0.27029053420806, + "grad_norm": 51352.27734375, + "learning_rate": 9.873914128633775e-05, + "loss": 2.3219, + "step": 1442 + }, + { + "epoch": 0.27047797563261483, + "grad_norm": 52599.32421875, + "learning_rate": 9.873738714732865e-05, + "loss": 2.3555, + "step": 1443 + }, + { + "epoch": 0.2706654170571696, + "grad_norm": 60539.74609375, + "learning_rate": 9.873563180456928e-05, + "loss": 2.3345, + "step": 1444 + }, + { + "epoch": 0.27085285848172447, + "grad_norm": 56592.30078125, + "learning_rate": 9.8733875258103e-05, + "loss": 2.3806, + "step": 1445 + }, + { + "epoch": 0.2710402999062793, + "grad_norm": 51550.69921875, + "learning_rate": 9.87321175079732e-05, + "loss": 2.3673, + "step": 1446 + }, + { + "epoch": 0.2712277413308341, + "grad_norm": 55376.04296875, + "learning_rate": 9.873035855422326e-05, + "loss": 2.3755, + "step": 1447 + }, + { + "epoch": 0.27141518275538895, + "grad_norm": 52711.50390625, + "learning_rate": 9.872859839689666e-05, + "loss": 2.3139, + "step": 1448 + }, + { + "epoch": 0.27160262417994374, + "grad_norm": 52784.19921875, + "learning_rate": 9.872683703603684e-05, + "loss": 2.2896, + "step": 1449 + }, + { + "epoch": 0.2717900656044986, + "grad_norm": 55272.01953125, + "learning_rate": 9.872507447168734e-05, + "loss": 2.2703, + "step": 1450 + }, + { + "epoch": 0.27197750702905343, + "grad_norm": 58121.078125, + "learning_rate": 9.872331070389165e-05, + "loss": 2.3999, + "step": 1451 + }, + { + "epoch": 0.2721649484536082, + "grad_norm": 48345.58984375, + "learning_rate": 9.872154573269336e-05, + "loss": 2.3286, + "step": 1452 + }, + { + "epoch": 0.27235238987816307, + "grad_norm": 49958.09765625, + "learning_rate": 9.871977955813606e-05, + "loss": 2.3667, + "step": 1453 + }, + { + "epoch": 0.2725398313027179, + "grad_norm": 52541.84765625, + "learning_rate": 9.871801218026338e-05, + "loss": 2.3476, + "step": 1454 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 52906.55859375, + "learning_rate": 9.871624359911895e-05, + "loss": 2.3883, + "step": 1455 + }, + { + "epoch": 0.27291471415182755, + "grad_norm": 53030.00390625, + "learning_rate": 9.871447381474647e-05, + "loss": 2.3342, + "step": 1456 + }, + { + "epoch": 0.2731021555763824, + "grad_norm": 52410.6875, + "learning_rate": 9.871270282718966e-05, + "loss": 2.3762, + "step": 1457 + }, + { + "epoch": 0.2732895970009372, + "grad_norm": 54154.18359375, + "learning_rate": 9.871093063649223e-05, + "loss": 2.3517, + "step": 1458 + }, + { + "epoch": 0.27347703842549204, + "grad_norm": 54160.16015625, + "learning_rate": 9.870915724269798e-05, + "loss": 2.3628, + "step": 1459 + }, + { + "epoch": 0.2736644798500469, + "grad_norm": 53278.07421875, + "learning_rate": 9.870738264585069e-05, + "loss": 2.3379, + "step": 1460 + }, + { + "epoch": 0.2738519212746017, + "grad_norm": 53301.3203125, + "learning_rate": 9.870560684599422e-05, + "loss": 2.4127, + "step": 1461 + }, + { + "epoch": 0.2740393626991565, + "grad_norm": 51413.6484375, + "learning_rate": 9.870382984317238e-05, + "loss": 2.3825, + "step": 1462 + }, + { + "epoch": 0.27422680412371137, + "grad_norm": 52141.078125, + "learning_rate": 9.870205163742909e-05, + "loss": 2.4077, + "step": 1463 + }, + { + "epoch": 0.27441424554826616, + "grad_norm": 56844.76171875, + "learning_rate": 9.870027222880829e-05, + "loss": 2.311, + "step": 1464 + }, + { + "epoch": 0.274601686972821, + "grad_norm": 48039.9140625, + "learning_rate": 9.869849161735389e-05, + "loss": 2.2845, + "step": 1465 + }, + { + "epoch": 0.2747891283973758, + "grad_norm": 47855.0703125, + "learning_rate": 9.869670980310989e-05, + "loss": 2.3686, + "step": 1466 + }, + { + "epoch": 0.27497656982193064, + "grad_norm": 49487.03125, + "learning_rate": 9.869492678612029e-05, + "loss": 2.3791, + "step": 1467 + }, + { + "epoch": 0.2751640112464855, + "grad_norm": 47284.1171875, + "learning_rate": 9.869314256642915e-05, + "loss": 2.3267, + "step": 1468 + }, + { + "epoch": 0.2753514526710403, + "grad_norm": 49083.65625, + "learning_rate": 9.869135714408048e-05, + "loss": 2.3688, + "step": 1469 + }, + { + "epoch": 0.2755388940955951, + "grad_norm": 49202.62109375, + "learning_rate": 9.868957051911844e-05, + "loss": 2.3341, + "step": 1470 + }, + { + "epoch": 0.27572633552014997, + "grad_norm": 51532.04296875, + "learning_rate": 9.868778269158715e-05, + "loss": 2.3957, + "step": 1471 + }, + { + "epoch": 0.27591377694470476, + "grad_norm": 50666.44140625, + "learning_rate": 9.868599366153074e-05, + "loss": 2.2739, + "step": 1472 + }, + { + "epoch": 0.2761012183692596, + "grad_norm": 57487.0625, + "learning_rate": 9.868420342899341e-05, + "loss": 2.3696, + "step": 1473 + }, + { + "epoch": 0.27628865979381445, + "grad_norm": 58247.31640625, + "learning_rate": 9.868241199401937e-05, + "loss": 2.338, + "step": 1474 + }, + { + "epoch": 0.27647610121836924, + "grad_norm": 52189.81640625, + "learning_rate": 9.868061935665288e-05, + "loss": 2.3868, + "step": 1475 + }, + { + "epoch": 0.2766635426429241, + "grad_norm": 49200.1640625, + "learning_rate": 9.867882551693821e-05, + "loss": 2.3761, + "step": 1476 + }, + { + "epoch": 0.27685098406747893, + "grad_norm": 53999.296875, + "learning_rate": 9.867703047491966e-05, + "loss": 2.3409, + "step": 1477 + }, + { + "epoch": 0.2770384254920337, + "grad_norm": 54098.01953125, + "learning_rate": 9.867523423064156e-05, + "loss": 2.3059, + "step": 1478 + }, + { + "epoch": 0.27722586691658857, + "grad_norm": 50301.0859375, + "learning_rate": 9.867343678414829e-05, + "loss": 2.3091, + "step": 1479 + }, + { + "epoch": 0.2774133083411434, + "grad_norm": 54234.46484375, + "learning_rate": 9.867163813548424e-05, + "loss": 2.3396, + "step": 1480 + }, + { + "epoch": 0.2776007497656982, + "grad_norm": 50312.41796875, + "learning_rate": 9.866983828469382e-05, + "loss": 2.3807, + "step": 1481 + }, + { + "epoch": 0.27778819119025305, + "grad_norm": 55240.84765625, + "learning_rate": 9.866803723182149e-05, + "loss": 2.4086, + "step": 1482 + }, + { + "epoch": 0.2779756326148079, + "grad_norm": 51979.8046875, + "learning_rate": 9.866623497691177e-05, + "loss": 2.3456, + "step": 1483 + }, + { + "epoch": 0.2781630740393627, + "grad_norm": 62215.0625, + "learning_rate": 9.866443152000911e-05, + "loss": 2.4023, + "step": 1484 + }, + { + "epoch": 0.27835051546391754, + "grad_norm": 59869.70703125, + "learning_rate": 9.86626268611581e-05, + "loss": 2.3451, + "step": 1485 + }, + { + "epoch": 0.2785379568884723, + "grad_norm": 50961.09375, + "learning_rate": 9.86608210004033e-05, + "loss": 2.3343, + "step": 1486 + }, + { + "epoch": 0.2787253983130272, + "grad_norm": 50897.9296875, + "learning_rate": 9.865901393778931e-05, + "loss": 2.366, + "step": 1487 + }, + { + "epoch": 0.278912839737582, + "grad_norm": 51368.7109375, + "learning_rate": 9.865720567336076e-05, + "loss": 2.2831, + "step": 1488 + }, + { + "epoch": 0.2791002811621368, + "grad_norm": 51899.16015625, + "learning_rate": 9.865539620716232e-05, + "loss": 2.3606, + "step": 1489 + }, + { + "epoch": 0.27928772258669166, + "grad_norm": 49885.08984375, + "learning_rate": 9.865358553923867e-05, + "loss": 2.3701, + "step": 1490 + }, + { + "epoch": 0.2794751640112465, + "grad_norm": 48853.69921875, + "learning_rate": 9.865177366963456e-05, + "loss": 2.2679, + "step": 1491 + }, + { + "epoch": 0.2796626054358013, + "grad_norm": 51138.2734375, + "learning_rate": 9.864996059839469e-05, + "loss": 2.3796, + "step": 1492 + }, + { + "epoch": 0.27985004686035614, + "grad_norm": 51672.53125, + "learning_rate": 9.864814632556387e-05, + "loss": 2.407, + "step": 1493 + }, + { + "epoch": 0.280037488284911, + "grad_norm": 50431.3984375, + "learning_rate": 9.864633085118693e-05, + "loss": 2.3709, + "step": 1494 + }, + { + "epoch": 0.2802249297094658, + "grad_norm": 53574.1484375, + "learning_rate": 9.864451417530867e-05, + "loss": 2.3856, + "step": 1495 + }, + { + "epoch": 0.2804123711340206, + "grad_norm": 54451.6640625, + "learning_rate": 9.864269629797399e-05, + "loss": 2.365, + "step": 1496 + }, + { + "epoch": 0.28059981255857547, + "grad_norm": 53843.16015625, + "learning_rate": 9.864087721922776e-05, + "loss": 2.307, + "step": 1497 + }, + { + "epoch": 0.28078725398313026, + "grad_norm": 51672.94140625, + "learning_rate": 9.863905693911495e-05, + "loss": 2.3898, + "step": 1498 + }, + { + "epoch": 0.2809746954076851, + "grad_norm": 54839.16015625, + "learning_rate": 9.863723545768049e-05, + "loss": 2.375, + "step": 1499 + }, + { + "epoch": 0.28116213683223995, + "grad_norm": 52132.12890625, + "learning_rate": 9.863541277496936e-05, + "loss": 2.3511, + "step": 1500 + }, + { + "epoch": 0.28116213683223995, + "eval_loss": 2.343967914581299, + "eval_runtime": 131.0853, + "eval_samples_per_second": 38.517, + "eval_steps_per_second": 1.93, + "step": 1500 + }, + { + "epoch": 0.28134957825679474, + "grad_norm": 52631.3828125, + "learning_rate": 9.863358889102659e-05, + "loss": 2.3277, + "step": 1501 + }, + { + "epoch": 0.2815370196813496, + "grad_norm": 51864.1171875, + "learning_rate": 9.863176380589724e-05, + "loss": 2.3152, + "step": 1502 + }, + { + "epoch": 0.2817244611059044, + "grad_norm": 56093.21875, + "learning_rate": 9.862993751962638e-05, + "loss": 2.3819, + "step": 1503 + }, + { + "epoch": 0.2819119025304592, + "grad_norm": 51925.375, + "learning_rate": 9.86281100322591e-05, + "loss": 2.383, + "step": 1504 + }, + { + "epoch": 0.28209934395501407, + "grad_norm": 55749.6796875, + "learning_rate": 9.862628134384056e-05, + "loss": 2.3419, + "step": 1505 + }, + { + "epoch": 0.28228678537956886, + "grad_norm": 56685.69140625, + "learning_rate": 9.862445145441591e-05, + "loss": 2.3462, + "step": 1506 + }, + { + "epoch": 0.2824742268041237, + "grad_norm": 53733.28515625, + "learning_rate": 9.862262036403036e-05, + "loss": 2.3695, + "step": 1507 + }, + { + "epoch": 0.28266166822867855, + "grad_norm": 51366.10546875, + "learning_rate": 9.862078807272912e-05, + "loss": 2.2644, + "step": 1508 + }, + { + "epoch": 0.28284910965323334, + "grad_norm": 53764.23046875, + "learning_rate": 9.861895458055746e-05, + "loss": 2.2709, + "step": 1509 + }, + { + "epoch": 0.2830365510777882, + "grad_norm": 57679.01953125, + "learning_rate": 9.861711988756063e-05, + "loss": 2.3349, + "step": 1510 + }, + { + "epoch": 0.28322399250234304, + "grad_norm": 51414.6484375, + "learning_rate": 9.8615283993784e-05, + "loss": 2.3847, + "step": 1511 + }, + { + "epoch": 0.2834114339268978, + "grad_norm": 51471.1953125, + "learning_rate": 9.861344689927288e-05, + "loss": 2.4064, + "step": 1512 + }, + { + "epoch": 0.2835988753514527, + "grad_norm": 52136.5703125, + "learning_rate": 9.861160860407265e-05, + "loss": 2.3757, + "step": 1513 + }, + { + "epoch": 0.2837863167760075, + "grad_norm": 49023.20703125, + "learning_rate": 9.860976910822871e-05, + "loss": 2.3516, + "step": 1514 + }, + { + "epoch": 0.2839737582005623, + "grad_norm": 50752.32421875, + "learning_rate": 9.86079284117865e-05, + "loss": 2.3881, + "step": 1515 + }, + { + "epoch": 0.28416119962511716, + "grad_norm": 55138.015625, + "learning_rate": 9.860608651479148e-05, + "loss": 2.3149, + "step": 1516 + }, + { + "epoch": 0.284348641049672, + "grad_norm": 52080.8984375, + "learning_rate": 9.860424341728914e-05, + "loss": 2.3775, + "step": 1517 + }, + { + "epoch": 0.2845360824742268, + "grad_norm": 52405.44140625, + "learning_rate": 9.8602399119325e-05, + "loss": 2.3928, + "step": 1518 + }, + { + "epoch": 0.28472352389878164, + "grad_norm": 50635.72265625, + "learning_rate": 9.860055362094462e-05, + "loss": 2.353, + "step": 1519 + }, + { + "epoch": 0.2849109653233365, + "grad_norm": 50365.125, + "learning_rate": 9.859870692219357e-05, + "loss": 2.3719, + "step": 1520 + }, + { + "epoch": 0.2850984067478913, + "grad_norm": 50345.5546875, + "learning_rate": 9.859685902311747e-05, + "loss": 2.3226, + "step": 1521 + }, + { + "epoch": 0.2852858481724461, + "grad_norm": 67707.046875, + "learning_rate": 9.859500992376197e-05, + "loss": 2.3298, + "step": 1522 + }, + { + "epoch": 0.2854732895970009, + "grad_norm": 54625.37109375, + "learning_rate": 9.859315962417272e-05, + "loss": 2.368, + "step": 1523 + }, + { + "epoch": 0.28566073102155576, + "grad_norm": 79390.0390625, + "learning_rate": 9.859130812439543e-05, + "loss": 2.5453, + "step": 1524 + }, + { + "epoch": 0.2858481724461106, + "grad_norm": 50843.03125, + "learning_rate": 9.858945542447583e-05, + "loss": 2.3115, + "step": 1525 + }, + { + "epoch": 0.2860356138706654, + "grad_norm": 49736.28125, + "learning_rate": 9.858760152445967e-05, + "loss": 2.3134, + "step": 1526 + }, + { + "epoch": 0.28622305529522024, + "grad_norm": 52678.7109375, + "learning_rate": 9.858574642439276e-05, + "loss": 2.3997, + "step": 1527 + }, + { + "epoch": 0.2864104967197751, + "grad_norm": 56513.5, + "learning_rate": 9.85838901243209e-05, + "loss": 2.3748, + "step": 1528 + }, + { + "epoch": 0.2865979381443299, + "grad_norm": 52410.5234375, + "learning_rate": 9.858203262428996e-05, + "loss": 2.3357, + "step": 1529 + }, + { + "epoch": 0.2867853795688847, + "grad_norm": 52281.09375, + "learning_rate": 9.858017392434577e-05, + "loss": 2.3576, + "step": 1530 + }, + { + "epoch": 0.28697282099343957, + "grad_norm": 51633.2734375, + "learning_rate": 9.857831402453428e-05, + "loss": 2.315, + "step": 1531 + }, + { + "epoch": 0.28716026241799436, + "grad_norm": 54671.6875, + "learning_rate": 9.857645292490143e-05, + "loss": 2.3855, + "step": 1532 + }, + { + "epoch": 0.2873477038425492, + "grad_norm": 57079.4609375, + "learning_rate": 9.857459062549317e-05, + "loss": 2.4123, + "step": 1533 + }, + { + "epoch": 0.28753514526710405, + "grad_norm": 51614.64453125, + "learning_rate": 9.85727271263555e-05, + "loss": 2.3877, + "step": 1534 + }, + { + "epoch": 0.28772258669165884, + "grad_norm": 48757.65625, + "learning_rate": 9.857086242753444e-05, + "loss": 2.3512, + "step": 1535 + }, + { + "epoch": 0.2879100281162137, + "grad_norm": 50711.6796875, + "learning_rate": 9.856899652907604e-05, + "loss": 2.3175, + "step": 1536 + }, + { + "epoch": 0.28809746954076854, + "grad_norm": 55870.3125, + "learning_rate": 9.856712943102642e-05, + "loss": 2.3173, + "step": 1537 + }, + { + "epoch": 0.2882849109653233, + "grad_norm": 56323.7890625, + "learning_rate": 9.856526113343165e-05, + "loss": 2.287, + "step": 1538 + }, + { + "epoch": 0.2884723523898782, + "grad_norm": 47997.921875, + "learning_rate": 9.85633916363379e-05, + "loss": 2.3739, + "step": 1539 + }, + { + "epoch": 0.28865979381443296, + "grad_norm": 49077.38671875, + "learning_rate": 9.856152093979134e-05, + "loss": 2.3949, + "step": 1540 + }, + { + "epoch": 0.2888472352389878, + "grad_norm": 49948.8125, + "learning_rate": 9.855964904383818e-05, + "loss": 2.4069, + "step": 1541 + }, + { + "epoch": 0.28903467666354266, + "grad_norm": 52402.9375, + "learning_rate": 9.855777594852464e-05, + "loss": 2.3298, + "step": 1542 + }, + { + "epoch": 0.28922211808809745, + "grad_norm": 61042.56640625, + "learning_rate": 9.8555901653897e-05, + "loss": 2.393, + "step": 1543 + }, + { + "epoch": 0.2894095595126523, + "grad_norm": 49876.0859375, + "learning_rate": 9.855402616000154e-05, + "loss": 2.3597, + "step": 1544 + }, + { + "epoch": 0.28959700093720714, + "grad_norm": 50516.0234375, + "learning_rate": 9.855214946688459e-05, + "loss": 2.3231, + "step": 1545 + }, + { + "epoch": 0.28978444236176193, + "grad_norm": 52518.140625, + "learning_rate": 9.85502715745925e-05, + "loss": 2.3582, + "step": 1546 + }, + { + "epoch": 0.2899718837863168, + "grad_norm": 54322.1875, + "learning_rate": 9.854839248317162e-05, + "loss": 2.3645, + "step": 1547 + }, + { + "epoch": 0.2901593252108716, + "grad_norm": 49327.0390625, + "learning_rate": 9.854651219266842e-05, + "loss": 2.3715, + "step": 1548 + }, + { + "epoch": 0.2903467666354264, + "grad_norm": 53051.2890625, + "learning_rate": 9.854463070312929e-05, + "loss": 2.3998, + "step": 1549 + }, + { + "epoch": 0.29053420805998126, + "grad_norm": 57482.3515625, + "learning_rate": 9.854274801460073e-05, + "loss": 2.3708, + "step": 1550 + }, + { + "epoch": 0.2907216494845361, + "grad_norm": 50838.921875, + "learning_rate": 9.854086412712924e-05, + "loss": 2.3824, + "step": 1551 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 51271.12109375, + "learning_rate": 9.853897904076133e-05, + "loss": 2.3936, + "step": 1552 + }, + { + "epoch": 0.29109653233364574, + "grad_norm": 48382.4453125, + "learning_rate": 9.853709275554357e-05, + "loss": 2.2831, + "step": 1553 + }, + { + "epoch": 0.2912839737582006, + "grad_norm": 53042.08203125, + "learning_rate": 9.853520527152255e-05, + "loss": 2.3353, + "step": 1554 + }, + { + "epoch": 0.2914714151827554, + "grad_norm": 54945.65234375, + "learning_rate": 9.853331658874487e-05, + "loss": 2.3125, + "step": 1555 + }, + { + "epoch": 0.2916588566073102, + "grad_norm": 49263.30078125, + "learning_rate": 9.853142670725721e-05, + "loss": 2.3781, + "step": 1556 + }, + { + "epoch": 0.291846298031865, + "grad_norm": 48128.7890625, + "learning_rate": 9.852953562710624e-05, + "loss": 2.3124, + "step": 1557 + }, + { + "epoch": 0.29203373945641986, + "grad_norm": 55517.0078125, + "learning_rate": 9.852764334833865e-05, + "loss": 2.3878, + "step": 1558 + }, + { + "epoch": 0.2922211808809747, + "grad_norm": 50953.26171875, + "learning_rate": 9.852574987100119e-05, + "loss": 2.3454, + "step": 1559 + }, + { + "epoch": 0.2924086223055295, + "grad_norm": 49328.1328125, + "learning_rate": 9.852385519514062e-05, + "loss": 2.3615, + "step": 1560 + }, + { + "epoch": 0.29259606373008434, + "grad_norm": 54045.6015625, + "learning_rate": 9.852195932080373e-05, + "loss": 2.3823, + "step": 1561 + }, + { + "epoch": 0.2927835051546392, + "grad_norm": 55713.94140625, + "learning_rate": 9.852006224803737e-05, + "loss": 2.3314, + "step": 1562 + }, + { + "epoch": 0.292970946579194, + "grad_norm": 54267.5625, + "learning_rate": 9.851816397688838e-05, + "loss": 2.415, + "step": 1563 + }, + { + "epoch": 0.2931583880037488, + "grad_norm": 50604.109375, + "learning_rate": 9.851626450740363e-05, + "loss": 2.3993, + "step": 1564 + }, + { + "epoch": 0.2933458294283037, + "grad_norm": 53874.390625, + "learning_rate": 9.851436383963007e-05, + "loss": 2.395, + "step": 1565 + }, + { + "epoch": 0.29353327085285846, + "grad_norm": 51801.67578125, + "learning_rate": 9.851246197361459e-05, + "loss": 2.314, + "step": 1566 + }, + { + "epoch": 0.2937207122774133, + "grad_norm": 52900.28125, + "learning_rate": 9.851055890940422e-05, + "loss": 2.3605, + "step": 1567 + }, + { + "epoch": 0.29390815370196816, + "grad_norm": 53971.2890625, + "learning_rate": 9.850865464704594e-05, + "loss": 2.3551, + "step": 1568 + }, + { + "epoch": 0.29409559512652295, + "grad_norm": 57066.28125, + "learning_rate": 9.850674918658677e-05, + "loss": 2.2745, + "step": 1569 + }, + { + "epoch": 0.2942830365510778, + "grad_norm": 51837.66015625, + "learning_rate": 9.850484252807379e-05, + "loss": 2.2862, + "step": 1570 + }, + { + "epoch": 0.29447047797563264, + "grad_norm": 55349.45703125, + "learning_rate": 9.85029346715541e-05, + "loss": 2.37, + "step": 1571 + }, + { + "epoch": 0.29465791940018743, + "grad_norm": 47601.28125, + "learning_rate": 9.85010256170748e-05, + "loss": 2.3633, + "step": 1572 + }, + { + "epoch": 0.2948453608247423, + "grad_norm": 49563.734375, + "learning_rate": 9.849911536468305e-05, + "loss": 2.3177, + "step": 1573 + }, + { + "epoch": 0.2950328022492971, + "grad_norm": 49506.4140625, + "learning_rate": 9.849720391442602e-05, + "loss": 2.3795, + "step": 1574 + }, + { + "epoch": 0.2952202436738519, + "grad_norm": 47772.4296875, + "learning_rate": 9.849529126635095e-05, + "loss": 2.3204, + "step": 1575 + }, + { + "epoch": 0.29540768509840676, + "grad_norm": 49586.4609375, + "learning_rate": 9.849337742050505e-05, + "loss": 2.4137, + "step": 1576 + }, + { + "epoch": 0.29559512652296155, + "grad_norm": 57266.40625, + "learning_rate": 9.849146237693561e-05, + "loss": 2.4171, + "step": 1577 + }, + { + "epoch": 0.2957825679475164, + "grad_norm": 49298.3359375, + "learning_rate": 9.84895461356899e-05, + "loss": 2.2709, + "step": 1578 + }, + { + "epoch": 0.29597000937207124, + "grad_norm": 49845.87890625, + "learning_rate": 9.848762869681528e-05, + "loss": 2.3004, + "step": 1579 + }, + { + "epoch": 0.29615745079662603, + "grad_norm": 53218.71875, + "learning_rate": 9.848571006035909e-05, + "loss": 2.4244, + "step": 1580 + }, + { + "epoch": 0.2963448922211809, + "grad_norm": 57476.1796875, + "learning_rate": 9.848379022636873e-05, + "loss": 2.3111, + "step": 1581 + }, + { + "epoch": 0.2965323336457357, + "grad_norm": 54986.23828125, + "learning_rate": 9.848186919489162e-05, + "loss": 2.3956, + "step": 1582 + }, + { + "epoch": 0.2967197750702905, + "grad_norm": 54470.484375, + "learning_rate": 9.847994696597518e-05, + "loss": 2.327, + "step": 1583 + }, + { + "epoch": 0.29690721649484536, + "grad_norm": 50461.59765625, + "learning_rate": 9.847802353966692e-05, + "loss": 2.3801, + "step": 1584 + }, + { + "epoch": 0.2970946579194002, + "grad_norm": 56200.28515625, + "learning_rate": 9.847609891601432e-05, + "loss": 2.3297, + "step": 1585 + }, + { + "epoch": 0.297282099343955, + "grad_norm": 49283.76171875, + "learning_rate": 9.847417309506494e-05, + "loss": 2.3482, + "step": 1586 + }, + { + "epoch": 0.29746954076850984, + "grad_norm": 52996.1640625, + "learning_rate": 9.847224607686632e-05, + "loss": 2.3801, + "step": 1587 + }, + { + "epoch": 0.2976569821930647, + "grad_norm": 52996.21875, + "learning_rate": 9.847031786146608e-05, + "loss": 2.3486, + "step": 1588 + }, + { + "epoch": 0.2978444236176195, + "grad_norm": 47230.0234375, + "learning_rate": 9.846838844891182e-05, + "loss": 2.3634, + "step": 1589 + }, + { + "epoch": 0.2980318650421743, + "grad_norm": 47540.61328125, + "learning_rate": 9.846645783925121e-05, + "loss": 2.3704, + "step": 1590 + }, + { + "epoch": 0.2982193064667292, + "grad_norm": 48212.12890625, + "learning_rate": 9.846452603253193e-05, + "loss": 2.2914, + "step": 1591 + }, + { + "epoch": 0.29840674789128396, + "grad_norm": 53513.71875, + "learning_rate": 9.846259302880169e-05, + "loss": 2.4094, + "step": 1592 + }, + { + "epoch": 0.2985941893158388, + "grad_norm": 52391.921875, + "learning_rate": 9.846065882810823e-05, + "loss": 2.3567, + "step": 1593 + }, + { + "epoch": 0.2987816307403936, + "grad_norm": 55431.15234375, + "learning_rate": 9.845872343049933e-05, + "loss": 2.3886, + "step": 1594 + }, + { + "epoch": 0.29896907216494845, + "grad_norm": 50530.3828125, + "learning_rate": 9.845678683602279e-05, + "loss": 2.3587, + "step": 1595 + }, + { + "epoch": 0.2991565135895033, + "grad_norm": 51184.91796875, + "learning_rate": 9.845484904472646e-05, + "loss": 2.3448, + "step": 1596 + }, + { + "epoch": 0.2993439550140581, + "grad_norm": 55964.8046875, + "learning_rate": 9.845291005665815e-05, + "loss": 2.3741, + "step": 1597 + }, + { + "epoch": 0.29953139643861293, + "grad_norm": 48824.828125, + "learning_rate": 9.84509698718658e-05, + "loss": 2.3493, + "step": 1598 + }, + { + "epoch": 0.2997188378631678, + "grad_norm": 52798.28125, + "learning_rate": 9.84490284903973e-05, + "loss": 2.3642, + "step": 1599 + }, + { + "epoch": 0.29990627928772257, + "grad_norm": 72008.4140625, + "learning_rate": 9.844708591230062e-05, + "loss": 2.4501, + "step": 1600 + }, + { + "epoch": 0.3000937207122774, + "grad_norm": 51594.90234375, + "learning_rate": 9.844514213762373e-05, + "loss": 2.3675, + "step": 1601 + }, + { + "epoch": 0.30028116213683226, + "grad_norm": 50602.55859375, + "learning_rate": 9.844319716641463e-05, + "loss": 2.3485, + "step": 1602 + }, + { + "epoch": 0.30046860356138705, + "grad_norm": 49536.55859375, + "learning_rate": 9.844125099872137e-05, + "loss": 2.3553, + "step": 1603 + }, + { + "epoch": 0.3006560449859419, + "grad_norm": 52461.7578125, + "learning_rate": 9.843930363459204e-05, + "loss": 2.346, + "step": 1604 + }, + { + "epoch": 0.30084348641049674, + "grad_norm": 51299.8125, + "learning_rate": 9.843735507407468e-05, + "loss": 2.3396, + "step": 1605 + }, + { + "epoch": 0.30103092783505153, + "grad_norm": 47707.72265625, + "learning_rate": 9.843540531721747e-05, + "loss": 2.4235, + "step": 1606 + }, + { + "epoch": 0.3012183692596064, + "grad_norm": 49734.98046875, + "learning_rate": 9.843345436406854e-05, + "loss": 2.3604, + "step": 1607 + }, + { + "epoch": 0.3014058106841612, + "grad_norm": 53752.60546875, + "learning_rate": 9.843150221467608e-05, + "loss": 2.3604, + "step": 1608 + }, + { + "epoch": 0.301593252108716, + "grad_norm": 49943.18359375, + "learning_rate": 9.842954886908831e-05, + "loss": 2.3432, + "step": 1609 + }, + { + "epoch": 0.30178069353327086, + "grad_norm": 53216.53515625, + "learning_rate": 9.842759432735347e-05, + "loss": 2.3655, + "step": 1610 + }, + { + "epoch": 0.3019681349578257, + "grad_norm": 49762.6328125, + "learning_rate": 9.842563858951983e-05, + "loss": 2.3679, + "step": 1611 + }, + { + "epoch": 0.3021555763823805, + "grad_norm": 49396.59375, + "learning_rate": 9.842368165563571e-05, + "loss": 2.3367, + "step": 1612 + }, + { + "epoch": 0.30234301780693534, + "grad_norm": 53567.6015625, + "learning_rate": 9.842172352574944e-05, + "loss": 2.3141, + "step": 1613 + }, + { + "epoch": 0.30253045923149013, + "grad_norm": 50298.74609375, + "learning_rate": 9.841976419990937e-05, + "loss": 2.3326, + "step": 1614 + }, + { + "epoch": 0.302717900656045, + "grad_norm": 52129.16015625, + "learning_rate": 9.841780367816391e-05, + "loss": 2.3853, + "step": 1615 + }, + { + "epoch": 0.3029053420805998, + "grad_norm": 53706.78125, + "learning_rate": 9.841584196056148e-05, + "loss": 2.3415, + "step": 1616 + }, + { + "epoch": 0.3030927835051546, + "grad_norm": 51767.515625, + "learning_rate": 9.841387904715051e-05, + "loss": 2.3319, + "step": 1617 + }, + { + "epoch": 0.30328022492970946, + "grad_norm": 50790.62890625, + "learning_rate": 9.84119149379795e-05, + "loss": 2.3665, + "step": 1618 + }, + { + "epoch": 0.3034676663542643, + "grad_norm": 56914.01171875, + "learning_rate": 9.840994963309697e-05, + "loss": 2.3522, + "step": 1619 + }, + { + "epoch": 0.3036551077788191, + "grad_norm": 49211.3359375, + "learning_rate": 9.840798313255142e-05, + "loss": 2.3671, + "step": 1620 + }, + { + "epoch": 0.30384254920337395, + "grad_norm": 59622.8828125, + "learning_rate": 9.840601543639147e-05, + "loss": 2.4872, + "step": 1621 + }, + { + "epoch": 0.3040299906279288, + "grad_norm": 47688.2578125, + "learning_rate": 9.84040465446657e-05, + "loss": 2.4156, + "step": 1622 + }, + { + "epoch": 0.3042174320524836, + "grad_norm": 51361.6875, + "learning_rate": 9.840207645742275e-05, + "loss": 2.3956, + "step": 1623 + }, + { + "epoch": 0.30440487347703843, + "grad_norm": 52665.48828125, + "learning_rate": 9.840010517471123e-05, + "loss": 2.3191, + "step": 1624 + }, + { + "epoch": 0.3045923149015933, + "grad_norm": 51608.12890625, + "learning_rate": 9.83981326965799e-05, + "loss": 2.3308, + "step": 1625 + }, + { + "epoch": 0.30477975632614807, + "grad_norm": 49208.17578125, + "learning_rate": 9.839615902307742e-05, + "loss": 2.3124, + "step": 1626 + }, + { + "epoch": 0.3049671977507029, + "grad_norm": 53693.59375, + "learning_rate": 9.839418415425255e-05, + "loss": 2.3388, + "step": 1627 + }, + { + "epoch": 0.30515463917525776, + "grad_norm": 50564.7578125, + "learning_rate": 9.839220809015409e-05, + "loss": 2.3573, + "step": 1628 + }, + { + "epoch": 0.30534208059981255, + "grad_norm": 51473.4765625, + "learning_rate": 9.839023083083083e-05, + "loss": 2.3582, + "step": 1629 + }, + { + "epoch": 0.3055295220243674, + "grad_norm": 52552.88671875, + "learning_rate": 9.83882523763316e-05, + "loss": 2.3644, + "step": 1630 + }, + { + "epoch": 0.3057169634489222, + "grad_norm": 65046.68359375, + "learning_rate": 9.838627272670529e-05, + "loss": 2.3638, + "step": 1631 + }, + { + "epoch": 0.30590440487347703, + "grad_norm": 50180.03125, + "learning_rate": 9.838429188200076e-05, + "loss": 2.3633, + "step": 1632 + }, + { + "epoch": 0.3060918462980319, + "grad_norm": 48165.58203125, + "learning_rate": 9.838230984226696e-05, + "loss": 2.3617, + "step": 1633 + }, + { + "epoch": 0.30627928772258667, + "grad_norm": 51047.76953125, + "learning_rate": 9.838032660755283e-05, + "loss": 2.4279, + "step": 1634 + }, + { + "epoch": 0.3064667291471415, + "grad_norm": 48994.33203125, + "learning_rate": 9.837834217790737e-05, + "loss": 2.3703, + "step": 1635 + }, + { + "epoch": 0.30665417057169636, + "grad_norm": 53743.1328125, + "learning_rate": 9.837635655337958e-05, + "loss": 2.3199, + "step": 1636 + }, + { + "epoch": 0.30684161199625115, + "grad_norm": 54239.83984375, + "learning_rate": 9.837436973401848e-05, + "loss": 2.426, + "step": 1637 + }, + { + "epoch": 0.307029053420806, + "grad_norm": 69475.234375, + "learning_rate": 9.83723817198732e-05, + "loss": 2.458, + "step": 1638 + }, + { + "epoch": 0.30721649484536084, + "grad_norm": 51160.08984375, + "learning_rate": 9.837039251099278e-05, + "loss": 2.3359, + "step": 1639 + }, + { + "epoch": 0.30740393626991563, + "grad_norm": 52474.7890625, + "learning_rate": 9.83684021074264e-05, + "loss": 2.339, + "step": 1640 + }, + { + "epoch": 0.3075913776944705, + "grad_norm": 50535.73046875, + "learning_rate": 9.836641050922319e-05, + "loss": 2.3014, + "step": 1641 + }, + { + "epoch": 0.3077788191190253, + "grad_norm": 53649.4140625, + "learning_rate": 9.836441771643235e-05, + "loss": 2.3715, + "step": 1642 + }, + { + "epoch": 0.3079662605435801, + "grad_norm": 47926.390625, + "learning_rate": 9.836242372910309e-05, + "loss": 2.3663, + "step": 1643 + }, + { + "epoch": 0.30815370196813496, + "grad_norm": 53921.56640625, + "learning_rate": 9.836042854728467e-05, + "loss": 2.3461, + "step": 1644 + }, + { + "epoch": 0.3083411433926898, + "grad_norm": 54406.265625, + "learning_rate": 9.835843217102636e-05, + "loss": 2.3044, + "step": 1645 + }, + { + "epoch": 0.3085285848172446, + "grad_norm": 50457.41015625, + "learning_rate": 9.835643460037749e-05, + "loss": 2.396, + "step": 1646 + }, + { + "epoch": 0.30871602624179945, + "grad_norm": 49175.82421875, + "learning_rate": 9.835443583538737e-05, + "loss": 2.4119, + "step": 1647 + }, + { + "epoch": 0.30890346766635424, + "grad_norm": 52398.42578125, + "learning_rate": 9.835243587610536e-05, + "loss": 2.3745, + "step": 1648 + }, + { + "epoch": 0.3090909090909091, + "grad_norm": 47760.7578125, + "learning_rate": 9.83504347225809e-05, + "loss": 2.343, + "step": 1649 + }, + { + "epoch": 0.30927835051546393, + "grad_norm": 48988.57421875, + "learning_rate": 9.834843237486337e-05, + "loss": 2.322, + "step": 1650 + }, + { + "epoch": 0.3094657919400187, + "grad_norm": 49797.94921875, + "learning_rate": 9.834642883300227e-05, + "loss": 2.4101, + "step": 1651 + }, + { + "epoch": 0.30965323336457357, + "grad_norm": 51417.73046875, + "learning_rate": 9.834442409704704e-05, + "loss": 2.287, + "step": 1652 + }, + { + "epoch": 0.3098406747891284, + "grad_norm": 52747.6171875, + "learning_rate": 9.834241816704721e-05, + "loss": 2.3255, + "step": 1653 + }, + { + "epoch": 0.3100281162136832, + "grad_norm": 53458.41796875, + "learning_rate": 9.834041104305233e-05, + "loss": 2.3576, + "step": 1654 + }, + { + "epoch": 0.31021555763823805, + "grad_norm": 53274.0625, + "learning_rate": 9.833840272511199e-05, + "loss": 2.3572, + "step": 1655 + }, + { + "epoch": 0.3104029990627929, + "grad_norm": 49216.453125, + "learning_rate": 9.833639321327577e-05, + "loss": 2.4118, + "step": 1656 + }, + { + "epoch": 0.3105904404873477, + "grad_norm": 51931.734375, + "learning_rate": 9.83343825075933e-05, + "loss": 2.3774, + "step": 1657 + }, + { + "epoch": 0.31077788191190253, + "grad_norm": 54486.953125, + "learning_rate": 9.833237060811425e-05, + "loss": 2.3645, + "step": 1658 + }, + { + "epoch": 0.3109653233364574, + "grad_norm": 52607.515625, + "learning_rate": 9.83303575148883e-05, + "loss": 2.3168, + "step": 1659 + }, + { + "epoch": 0.31115276476101217, + "grad_norm": 51803.89453125, + "learning_rate": 9.83283432279652e-05, + "loss": 2.3903, + "step": 1660 + }, + { + "epoch": 0.311340206185567, + "grad_norm": 57337.1953125, + "learning_rate": 9.832632774739466e-05, + "loss": 2.3629, + "step": 1661 + }, + { + "epoch": 0.31152764761012186, + "grad_norm": 55103.16796875, + "learning_rate": 9.832431107322649e-05, + "loss": 2.3164, + "step": 1662 + }, + { + "epoch": 0.31171508903467665, + "grad_norm": 51962.5546875, + "learning_rate": 9.83222932055105e-05, + "loss": 2.3389, + "step": 1663 + }, + { + "epoch": 0.3119025304592315, + "grad_norm": 51624.99609375, + "learning_rate": 9.83202741442965e-05, + "loss": 2.4082, + "step": 1664 + }, + { + "epoch": 0.31208997188378634, + "grad_norm": 55193.09765625, + "learning_rate": 9.831825388963439e-05, + "loss": 2.3378, + "step": 1665 + }, + { + "epoch": 0.31227741330834113, + "grad_norm": 53118.55078125, + "learning_rate": 9.831623244157406e-05, + "loss": 2.3523, + "step": 1666 + }, + { + "epoch": 0.312464854732896, + "grad_norm": 54055.265625, + "learning_rate": 9.831420980016541e-05, + "loss": 2.3734, + "step": 1667 + }, + { + "epoch": 0.31265229615745077, + "grad_norm": 52329.87109375, + "learning_rate": 9.831218596545844e-05, + "loss": 2.2809, + "step": 1668 + }, + { + "epoch": 0.3128397375820056, + "grad_norm": 56385.98828125, + "learning_rate": 9.83101609375031e-05, + "loss": 2.4506, + "step": 1669 + }, + { + "epoch": 0.31302717900656046, + "grad_norm": 51579.04296875, + "learning_rate": 9.830813471634943e-05, + "loss": 2.3701, + "step": 1670 + }, + { + "epoch": 0.31321462043111525, + "grad_norm": 59621.4765625, + "learning_rate": 9.830610730204746e-05, + "loss": 2.4174, + "step": 1671 + }, + { + "epoch": 0.3134020618556701, + "grad_norm": 48061.38671875, + "learning_rate": 9.830407869464726e-05, + "loss": 2.4016, + "step": 1672 + }, + { + "epoch": 0.31358950328022495, + "grad_norm": 52589.32421875, + "learning_rate": 9.830204889419896e-05, + "loss": 2.3195, + "step": 1673 + }, + { + "epoch": 0.31377694470477974, + "grad_norm": 49879.08203125, + "learning_rate": 9.830001790075266e-05, + "loss": 2.3654, + "step": 1674 + }, + { + "epoch": 0.3139643861293346, + "grad_norm": 44288.1796875, + "learning_rate": 9.829798571435855e-05, + "loss": 2.3572, + "step": 1675 + }, + { + "epoch": 0.31415182755388943, + "grad_norm": 51170.859375, + "learning_rate": 9.829595233506681e-05, + "loss": 2.3974, + "step": 1676 + }, + { + "epoch": 0.3143392689784442, + "grad_norm": 49042.8203125, + "learning_rate": 9.829391776292767e-05, + "loss": 2.2837, + "step": 1677 + }, + { + "epoch": 0.31452671040299907, + "grad_norm": 46393.95703125, + "learning_rate": 9.829188199799138e-05, + "loss": 2.3221, + "step": 1678 + }, + { + "epoch": 0.3147141518275539, + "grad_norm": 50533.98828125, + "learning_rate": 9.828984504030819e-05, + "loss": 2.362, + "step": 1679 + }, + { + "epoch": 0.3149015932521087, + "grad_norm": 50514.10546875, + "learning_rate": 9.828780688992845e-05, + "loss": 2.3797, + "step": 1680 + }, + { + "epoch": 0.31508903467666355, + "grad_norm": 52574.3359375, + "learning_rate": 9.82857675469025e-05, + "loss": 2.394, + "step": 1681 + }, + { + "epoch": 0.3152764761012184, + "grad_norm": 53291.09765625, + "learning_rate": 9.828372701128068e-05, + "loss": 2.3593, + "step": 1682 + }, + { + "epoch": 0.3154639175257732, + "grad_norm": 51298.60546875, + "learning_rate": 9.828168528311341e-05, + "loss": 2.3664, + "step": 1683 + }, + { + "epoch": 0.31565135895032803, + "grad_norm": 50623.8046875, + "learning_rate": 9.827964236245111e-05, + "loss": 2.3164, + "step": 1684 + }, + { + "epoch": 0.3158388003748828, + "grad_norm": 53877.91796875, + "learning_rate": 9.827759824934424e-05, + "loss": 2.3213, + "step": 1685 + }, + { + "epoch": 0.31602624179943767, + "grad_norm": 47787.3515625, + "learning_rate": 9.827555294384328e-05, + "loss": 2.3404, + "step": 1686 + }, + { + "epoch": 0.3162136832239925, + "grad_norm": 56305.30078125, + "learning_rate": 9.827350644599876e-05, + "loss": 2.3248, + "step": 1687 + }, + { + "epoch": 0.3164011246485473, + "grad_norm": 50960.55859375, + "learning_rate": 9.827145875586122e-05, + "loss": 2.329, + "step": 1688 + }, + { + "epoch": 0.31658856607310215, + "grad_norm": 54124.015625, + "learning_rate": 9.826940987348121e-05, + "loss": 2.3205, + "step": 1689 + }, + { + "epoch": 0.316776007497657, + "grad_norm": 51884.21484375, + "learning_rate": 9.826735979890938e-05, + "loss": 2.3401, + "step": 1690 + }, + { + "epoch": 0.3169634489222118, + "grad_norm": 53389.5390625, + "learning_rate": 9.826530853219634e-05, + "loss": 2.3191, + "step": 1691 + }, + { + "epoch": 0.31715089034676663, + "grad_norm": 52669.4296875, + "learning_rate": 9.826325607339275e-05, + "loss": 2.3745, + "step": 1692 + }, + { + "epoch": 0.3173383317713215, + "grad_norm": 50567.08203125, + "learning_rate": 9.826120242254933e-05, + "loss": 2.3781, + "step": 1693 + }, + { + "epoch": 0.31752577319587627, + "grad_norm": 51070.34765625, + "learning_rate": 9.825914757971675e-05, + "loss": 2.2977, + "step": 1694 + }, + { + "epoch": 0.3177132146204311, + "grad_norm": 53049.06640625, + "learning_rate": 9.825709154494582e-05, + "loss": 2.3647, + "step": 1695 + }, + { + "epoch": 0.31790065604498596, + "grad_norm": 51834.86328125, + "learning_rate": 9.825503431828728e-05, + "loss": 2.3404, + "step": 1696 + }, + { + "epoch": 0.31808809746954075, + "grad_norm": 56435.921875, + "learning_rate": 9.825297589979196e-05, + "loss": 2.3527, + "step": 1697 + }, + { + "epoch": 0.3182755388940956, + "grad_norm": 53041.6484375, + "learning_rate": 9.825091628951071e-05, + "loss": 2.2735, + "step": 1698 + }, + { + "epoch": 0.31846298031865045, + "grad_norm": 53411.60546875, + "learning_rate": 9.824885548749436e-05, + "loss": 2.3711, + "step": 1699 + }, + { + "epoch": 0.31865042174320524, + "grad_norm": 52281.62890625, + "learning_rate": 9.824679349379386e-05, + "loss": 2.3794, + "step": 1700 + }, + { + "epoch": 0.3188378631677601, + "grad_norm": 49045.6953125, + "learning_rate": 9.824473030846009e-05, + "loss": 2.4081, + "step": 1701 + }, + { + "epoch": 0.31902530459231493, + "grad_norm": 49045.40625, + "learning_rate": 9.824266593154405e-05, + "loss": 2.3125, + "step": 1702 + }, + { + "epoch": 0.3192127460168697, + "grad_norm": 48998.703125, + "learning_rate": 9.82406003630967e-05, + "loss": 2.378, + "step": 1703 + }, + { + "epoch": 0.31940018744142457, + "grad_norm": 49478.39453125, + "learning_rate": 9.823853360316907e-05, + "loss": 2.3897, + "step": 1704 + }, + { + "epoch": 0.31958762886597936, + "grad_norm": 52400.6875, + "learning_rate": 9.823646565181219e-05, + "loss": 2.3725, + "step": 1705 + }, + { + "epoch": 0.3197750702905342, + "grad_norm": 51542.55859375, + "learning_rate": 9.823439650907717e-05, + "loss": 2.3999, + "step": 1706 + }, + { + "epoch": 0.31996251171508905, + "grad_norm": 59415.57421875, + "learning_rate": 9.823232617501507e-05, + "loss": 2.3715, + "step": 1707 + }, + { + "epoch": 0.32014995313964384, + "grad_norm": 56520.1640625, + "learning_rate": 9.823025464967706e-05, + "loss": 2.3133, + "step": 1708 + }, + { + "epoch": 0.3203373945641987, + "grad_norm": 50980.953125, + "learning_rate": 9.82281819331143e-05, + "loss": 2.3253, + "step": 1709 + }, + { + "epoch": 0.32052483598875353, + "grad_norm": 54598.07421875, + "learning_rate": 9.822610802537797e-05, + "loss": 2.29, + "step": 1710 + }, + { + "epoch": 0.3207122774133083, + "grad_norm": 48412.83203125, + "learning_rate": 9.822403292651929e-05, + "loss": 2.3458, + "step": 1711 + }, + { + "epoch": 0.32089971883786317, + "grad_norm": 49951.609375, + "learning_rate": 9.822195663658952e-05, + "loss": 2.2856, + "step": 1712 + }, + { + "epoch": 0.321087160262418, + "grad_norm": 57075.91796875, + "learning_rate": 9.821987915563995e-05, + "loss": 2.4247, + "step": 1713 + }, + { + "epoch": 0.3212746016869728, + "grad_norm": 52349.94921875, + "learning_rate": 9.821780048372187e-05, + "loss": 2.342, + "step": 1714 + }, + { + "epoch": 0.32146204311152765, + "grad_norm": 51574.6875, + "learning_rate": 9.821572062088665e-05, + "loss": 2.3807, + "step": 1715 + }, + { + "epoch": 0.3216494845360825, + "grad_norm": 57053.13671875, + "learning_rate": 9.821363956718563e-05, + "loss": 2.323, + "step": 1716 + }, + { + "epoch": 0.3218369259606373, + "grad_norm": 49053.25390625, + "learning_rate": 9.821155732267022e-05, + "loss": 2.3595, + "step": 1717 + }, + { + "epoch": 0.32202436738519213, + "grad_norm": 47924.25390625, + "learning_rate": 9.820947388739186e-05, + "loss": 2.3058, + "step": 1718 + }, + { + "epoch": 0.322211808809747, + "grad_norm": 48134.0625, + "learning_rate": 9.8207389261402e-05, + "loss": 2.3687, + "step": 1719 + }, + { + "epoch": 0.32239925023430177, + "grad_norm": 49758.53515625, + "learning_rate": 9.820530344475213e-05, + "loss": 2.3603, + "step": 1720 + }, + { + "epoch": 0.3225866916588566, + "grad_norm": 52780.7109375, + "learning_rate": 9.820321643749375e-05, + "loss": 2.248, + "step": 1721 + }, + { + "epoch": 0.3227741330834114, + "grad_norm": 53754.35546875, + "learning_rate": 9.820112823967844e-05, + "loss": 2.3432, + "step": 1722 + }, + { + "epoch": 0.32296157450796625, + "grad_norm": 52898.0, + "learning_rate": 9.819903885135775e-05, + "loss": 2.3651, + "step": 1723 + }, + { + "epoch": 0.3231490159325211, + "grad_norm": 58425.71875, + "learning_rate": 9.819694827258328e-05, + "loss": 2.3246, + "step": 1724 + }, + { + "epoch": 0.3233364573570759, + "grad_norm": 52524.3359375, + "learning_rate": 9.819485650340668e-05, + "loss": 2.3594, + "step": 1725 + }, + { + "epoch": 0.32352389878163074, + "grad_norm": 54922.140625, + "learning_rate": 9.819276354387962e-05, + "loss": 2.3871, + "step": 1726 + }, + { + "epoch": 0.3237113402061856, + "grad_norm": 62774.6796875, + "learning_rate": 9.819066939405377e-05, + "loss": 2.3514, + "step": 1727 + }, + { + "epoch": 0.3238987816307404, + "grad_norm": 52790.74609375, + "learning_rate": 9.818857405398088e-05, + "loss": 2.3453, + "step": 1728 + }, + { + "epoch": 0.3240862230552952, + "grad_norm": 50172.65625, + "learning_rate": 9.818647752371268e-05, + "loss": 2.2948, + "step": 1729 + }, + { + "epoch": 0.32427366447985007, + "grad_norm": 53979.1328125, + "learning_rate": 9.818437980330096e-05, + "loss": 2.4033, + "step": 1730 + }, + { + "epoch": 0.32446110590440486, + "grad_norm": 48833.0859375, + "learning_rate": 9.818228089279752e-05, + "loss": 2.3576, + "step": 1731 + }, + { + "epoch": 0.3246485473289597, + "grad_norm": 54231.453125, + "learning_rate": 9.818018079225423e-05, + "loss": 2.387, + "step": 1732 + }, + { + "epoch": 0.32483598875351455, + "grad_norm": 58373.93359375, + "learning_rate": 9.817807950172293e-05, + "loss": 2.3712, + "step": 1733 + }, + { + "epoch": 0.32502343017806934, + "grad_norm": 49187.22265625, + "learning_rate": 9.817597702125554e-05, + "loss": 2.32, + "step": 1734 + }, + { + "epoch": 0.3252108716026242, + "grad_norm": 50764.39453125, + "learning_rate": 9.817387335090398e-05, + "loss": 2.2981, + "step": 1735 + }, + { + "epoch": 0.32539831302717903, + "grad_norm": 52482.69140625, + "learning_rate": 9.817176849072019e-05, + "loss": 2.4242, + "step": 1736 + }, + { + "epoch": 0.3255857544517338, + "grad_norm": 54216.484375, + "learning_rate": 9.816966244075618e-05, + "loss": 2.3907, + "step": 1737 + }, + { + "epoch": 0.32577319587628867, + "grad_norm": 49269.83203125, + "learning_rate": 9.816755520106397e-05, + "loss": 2.3627, + "step": 1738 + }, + { + "epoch": 0.32596063730084346, + "grad_norm": 48885.51171875, + "learning_rate": 9.816544677169559e-05, + "loss": 2.2516, + "step": 1739 + }, + { + "epoch": 0.3261480787253983, + "grad_norm": 48183.74609375, + "learning_rate": 9.816333715270311e-05, + "loss": 2.3774, + "step": 1740 + }, + { + "epoch": 0.32633552014995315, + "grad_norm": 57892.078125, + "learning_rate": 9.816122634413867e-05, + "loss": 2.3642, + "step": 1741 + }, + { + "epoch": 0.32652296157450794, + "grad_norm": 48238.41015625, + "learning_rate": 9.815911434605437e-05, + "loss": 2.3128, + "step": 1742 + }, + { + "epoch": 0.3267104029990628, + "grad_norm": 49693.671875, + "learning_rate": 9.815700115850239e-05, + "loss": 2.3097, + "step": 1743 + }, + { + "epoch": 0.32689784442361763, + "grad_norm": 49915.6015625, + "learning_rate": 9.81548867815349e-05, + "loss": 2.3383, + "step": 1744 + }, + { + "epoch": 0.3270852858481724, + "grad_norm": 53097.22265625, + "learning_rate": 9.815277121520417e-05, + "loss": 2.3605, + "step": 1745 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 53709.53515625, + "learning_rate": 9.815065445956239e-05, + "loss": 2.3116, + "step": 1746 + }, + { + "epoch": 0.3274601686972821, + "grad_norm": 52997.3828125, + "learning_rate": 9.814853651466189e-05, + "loss": 2.3549, + "step": 1747 + }, + { + "epoch": 0.3276476101218369, + "grad_norm": 60318.09765625, + "learning_rate": 9.814641738055497e-05, + "loss": 2.328, + "step": 1748 + }, + { + "epoch": 0.32783505154639175, + "grad_norm": 51464.9609375, + "learning_rate": 9.814429705729395e-05, + "loss": 2.2491, + "step": 1749 + }, + { + "epoch": 0.3280224929709466, + "grad_norm": 57932.03515625, + "learning_rate": 9.814217554493121e-05, + "loss": 2.3826, + "step": 1750 + }, + { + "epoch": 0.3282099343955014, + "grad_norm": 50627.703125, + "learning_rate": 9.814005284351917e-05, + "loss": 2.3362, + "step": 1751 + }, + { + "epoch": 0.32839737582005624, + "grad_norm": 50747.8671875, + "learning_rate": 9.813792895311022e-05, + "loss": 2.3107, + "step": 1752 + }, + { + "epoch": 0.3285848172446111, + "grad_norm": 51726.15625, + "learning_rate": 9.813580387375685e-05, + "loss": 2.4099, + "step": 1753 + }, + { + "epoch": 0.3287722586691659, + "grad_norm": 54767.7265625, + "learning_rate": 9.813367760551153e-05, + "loss": 2.3364, + "step": 1754 + }, + { + "epoch": 0.3289597000937207, + "grad_norm": 53505.58203125, + "learning_rate": 9.813155014842679e-05, + "loss": 2.3403, + "step": 1755 + }, + { + "epoch": 0.32914714151827557, + "grad_norm": 50930.76171875, + "learning_rate": 9.812942150255516e-05, + "loss": 2.319, + "step": 1756 + }, + { + "epoch": 0.32933458294283036, + "grad_norm": 54751.73828125, + "learning_rate": 9.812729166794921e-05, + "loss": 2.2563, + "step": 1757 + }, + { + "epoch": 0.3295220243673852, + "grad_norm": 58584.9140625, + "learning_rate": 9.812516064466155e-05, + "loss": 2.4038, + "step": 1758 + }, + { + "epoch": 0.32970946579194, + "grad_norm": 54809.78515625, + "learning_rate": 9.812302843274482e-05, + "loss": 2.2945, + "step": 1759 + }, + { + "epoch": 0.32989690721649484, + "grad_norm": 52943.7734375, + "learning_rate": 9.812089503225168e-05, + "loss": 2.3607, + "step": 1760 + }, + { + "epoch": 0.3300843486410497, + "grad_norm": 52070.46875, + "learning_rate": 9.811876044323483e-05, + "loss": 2.4044, + "step": 1761 + }, + { + "epoch": 0.3302717900656045, + "grad_norm": 54337.9375, + "learning_rate": 9.811662466574698e-05, + "loss": 2.4331, + "step": 1762 + }, + { + "epoch": 0.3304592314901593, + "grad_norm": 52296.07421875, + "learning_rate": 9.811448769984088e-05, + "loss": 2.3358, + "step": 1763 + }, + { + "epoch": 0.33064667291471417, + "grad_norm": 52852.4921875, + "learning_rate": 9.811234954556933e-05, + "loss": 2.3695, + "step": 1764 + }, + { + "epoch": 0.33083411433926896, + "grad_norm": 52500.58984375, + "learning_rate": 9.811021020298511e-05, + "loss": 2.3597, + "step": 1765 + }, + { + "epoch": 0.3310215557638238, + "grad_norm": 48859.7890625, + "learning_rate": 9.810806967214108e-05, + "loss": 2.2846, + "step": 1766 + }, + { + "epoch": 0.33120899718837865, + "grad_norm": 59379.734375, + "learning_rate": 9.81059279530901e-05, + "loss": 2.3879, + "step": 1767 + }, + { + "epoch": 0.33139643861293344, + "grad_norm": 54539.890625, + "learning_rate": 9.810378504588506e-05, + "loss": 2.3714, + "step": 1768 + }, + { + "epoch": 0.3315838800374883, + "grad_norm": 75037.6015625, + "learning_rate": 9.810164095057892e-05, + "loss": 2.478, + "step": 1769 + }, + { + "epoch": 0.33177132146204313, + "grad_norm": 50049.421875, + "learning_rate": 9.80994956672246e-05, + "loss": 2.342, + "step": 1770 + }, + { + "epoch": 0.3319587628865979, + "grad_norm": 53989.65625, + "learning_rate": 9.80973491958751e-05, + "loss": 2.4123, + "step": 1771 + }, + { + "epoch": 0.33214620431115277, + "grad_norm": 53043.796875, + "learning_rate": 9.809520153658343e-05, + "loss": 2.4007, + "step": 1772 + }, + { + "epoch": 0.3323336457357076, + "grad_norm": 57321.8359375, + "learning_rate": 9.809305268940266e-05, + "loss": 2.3726, + "step": 1773 + }, + { + "epoch": 0.3325210871602624, + "grad_norm": 49779.30078125, + "learning_rate": 9.809090265438581e-05, + "loss": 2.3309, + "step": 1774 + }, + { + "epoch": 0.33270852858481725, + "grad_norm": 52090.8828125, + "learning_rate": 9.808875143158602e-05, + "loss": 2.3895, + "step": 1775 + }, + { + "epoch": 0.33289597000937204, + "grad_norm": 49064.44921875, + "learning_rate": 9.808659902105644e-05, + "loss": 2.4498, + "step": 1776 + }, + { + "epoch": 0.3330834114339269, + "grad_norm": 55309.82421875, + "learning_rate": 9.808444542285019e-05, + "loss": 2.3295, + "step": 1777 + }, + { + "epoch": 0.33327085285848174, + "grad_norm": 48020.0546875, + "learning_rate": 9.808229063702049e-05, + "loss": 2.3391, + "step": 1778 + }, + { + "epoch": 0.3334582942830365, + "grad_norm": 54631.140625, + "learning_rate": 9.808013466362054e-05, + "loss": 2.3606, + "step": 1779 + }, + { + "epoch": 0.3336457357075914, + "grad_norm": 49339.5625, + "learning_rate": 9.807797750270362e-05, + "loss": 2.3331, + "step": 1780 + }, + { + "epoch": 0.3338331771321462, + "grad_norm": 51778.9921875, + "learning_rate": 9.807581915432298e-05, + "loss": 2.262, + "step": 1781 + }, + { + "epoch": 0.334020618556701, + "grad_norm": 56825.66015625, + "learning_rate": 9.807365961853194e-05, + "loss": 2.3501, + "step": 1782 + }, + { + "epoch": 0.33420805998125586, + "grad_norm": 50782.71484375, + "learning_rate": 9.807149889538383e-05, + "loss": 2.31, + "step": 1783 + }, + { + "epoch": 0.3343955014058107, + "grad_norm": 52827.34765625, + "learning_rate": 9.806933698493201e-05, + "loss": 2.3569, + "step": 1784 + }, + { + "epoch": 0.3345829428303655, + "grad_norm": 53291.3125, + "learning_rate": 9.806717388722991e-05, + "loss": 2.3058, + "step": 1785 + }, + { + "epoch": 0.33477038425492034, + "grad_norm": 56480.95703125, + "learning_rate": 9.806500960233094e-05, + "loss": 2.4593, + "step": 1786 + }, + { + "epoch": 0.3349578256794752, + "grad_norm": 55490.0234375, + "learning_rate": 9.806284413028853e-05, + "loss": 2.3147, + "step": 1787 + }, + { + "epoch": 0.33514526710403, + "grad_norm": 50720.0546875, + "learning_rate": 9.80606774711562e-05, + "loss": 2.3409, + "step": 1788 + }, + { + "epoch": 0.3353327085285848, + "grad_norm": 53905.0546875, + "learning_rate": 9.805850962498744e-05, + "loss": 2.3806, + "step": 1789 + }, + { + "epoch": 0.33552014995313967, + "grad_norm": 53705.22265625, + "learning_rate": 9.80563405918358e-05, + "loss": 2.3115, + "step": 1790 + }, + { + "epoch": 0.33570759137769446, + "grad_norm": 53528.78515625, + "learning_rate": 9.805417037175486e-05, + "loss": 2.3736, + "step": 1791 + }, + { + "epoch": 0.3358950328022493, + "grad_norm": 50060.8984375, + "learning_rate": 9.805199896479821e-05, + "loss": 2.3044, + "step": 1792 + }, + { + "epoch": 0.33608247422680415, + "grad_norm": 49977.4765625, + "learning_rate": 9.804982637101949e-05, + "loss": 2.3507, + "step": 1793 + }, + { + "epoch": 0.33626991565135894, + "grad_norm": 53236.55078125, + "learning_rate": 9.804765259047235e-05, + "loss": 2.3943, + "step": 1794 + }, + { + "epoch": 0.3364573570759138, + "grad_norm": 50738.15625, + "learning_rate": 9.804547762321049e-05, + "loss": 2.3515, + "step": 1795 + }, + { + "epoch": 0.3366447985004686, + "grad_norm": 49599.03515625, + "learning_rate": 9.804330146928761e-05, + "loss": 2.3905, + "step": 1796 + }, + { + "epoch": 0.3368322399250234, + "grad_norm": 54853.96875, + "learning_rate": 9.80411241287575e-05, + "loss": 2.3649, + "step": 1797 + }, + { + "epoch": 0.33701968134957827, + "grad_norm": 57637.8828125, + "learning_rate": 9.80389456016739e-05, + "loss": 2.3421, + "step": 1798 + }, + { + "epoch": 0.33720712277413306, + "grad_norm": 50217.50390625, + "learning_rate": 9.803676588809063e-05, + "loss": 2.3482, + "step": 1799 + }, + { + "epoch": 0.3373945641986879, + "grad_norm": 51602.1640625, + "learning_rate": 9.803458498806151e-05, + "loss": 2.3521, + "step": 1800 + }, + { + "epoch": 0.33758200562324275, + "grad_norm": 51466.5390625, + "learning_rate": 9.803240290164042e-05, + "loss": 2.3542, + "step": 1801 + }, + { + "epoch": 0.33776944704779754, + "grad_norm": 53798.48046875, + "learning_rate": 9.803021962888127e-05, + "loss": 2.3163, + "step": 1802 + }, + { + "epoch": 0.3379568884723524, + "grad_norm": 53483.06640625, + "learning_rate": 9.802803516983795e-05, + "loss": 2.2992, + "step": 1803 + }, + { + "epoch": 0.33814432989690724, + "grad_norm": 52526.84375, + "learning_rate": 9.802584952456443e-05, + "loss": 2.3511, + "step": 1804 + }, + { + "epoch": 0.338331771321462, + "grad_norm": 48269.2734375, + "learning_rate": 9.80236626931147e-05, + "loss": 2.37, + "step": 1805 + }, + { + "epoch": 0.3385192127460169, + "grad_norm": 53453.14453125, + "learning_rate": 9.802147467554277e-05, + "loss": 2.3708, + "step": 1806 + }, + { + "epoch": 0.3387066541705717, + "grad_norm": 58330.04296875, + "learning_rate": 9.801928547190267e-05, + "loss": 2.3013, + "step": 1807 + }, + { + "epoch": 0.3388940955951265, + "grad_norm": 48386.71875, + "learning_rate": 9.801709508224848e-05, + "loss": 2.361, + "step": 1808 + }, + { + "epoch": 0.33908153701968136, + "grad_norm": 50349.53125, + "learning_rate": 9.80149035066343e-05, + "loss": 2.3536, + "step": 1809 + }, + { + "epoch": 0.3392689784442362, + "grad_norm": 54569.921875, + "learning_rate": 9.801271074511425e-05, + "loss": 2.3942, + "step": 1810 + }, + { + "epoch": 0.339456419868791, + "grad_norm": 50605.71484375, + "learning_rate": 9.801051679774249e-05, + "loss": 2.353, + "step": 1811 + }, + { + "epoch": 0.33964386129334584, + "grad_norm": 50832.19921875, + "learning_rate": 9.80083216645732e-05, + "loss": 2.3446, + "step": 1812 + }, + { + "epoch": 0.33983130271790063, + "grad_norm": 51670.7578125, + "learning_rate": 9.800612534566063e-05, + "loss": 2.3527, + "step": 1813 + }, + { + "epoch": 0.3400187441424555, + "grad_norm": 51170.98046875, + "learning_rate": 9.8003927841059e-05, + "loss": 2.2643, + "step": 1814 + }, + { + "epoch": 0.3402061855670103, + "grad_norm": 48456.06640625, + "learning_rate": 9.800172915082259e-05, + "loss": 2.3324, + "step": 1815 + }, + { + "epoch": 0.3403936269915651, + "grad_norm": 47783.91015625, + "learning_rate": 9.799952927500571e-05, + "loss": 2.3418, + "step": 1816 + }, + { + "epoch": 0.34058106841611996, + "grad_norm": 47898.14453125, + "learning_rate": 9.799732821366269e-05, + "loss": 2.3401, + "step": 1817 + }, + { + "epoch": 0.3407685098406748, + "grad_norm": 45111.765625, + "learning_rate": 9.799512596684787e-05, + "loss": 2.2999, + "step": 1818 + }, + { + "epoch": 0.3409559512652296, + "grad_norm": 48741.71875, + "learning_rate": 9.79929225346157e-05, + "loss": 2.3641, + "step": 1819 + }, + { + "epoch": 0.34114339268978444, + "grad_norm": 52506.45703125, + "learning_rate": 9.799071791702055e-05, + "loss": 2.3737, + "step": 1820 + }, + { + "epoch": 0.3413308341143393, + "grad_norm": 48892.31640625, + "learning_rate": 9.798851211411688e-05, + "loss": 2.421, + "step": 1821 + }, + { + "epoch": 0.3415182755388941, + "grad_norm": 57043.22265625, + "learning_rate": 9.798630512595919e-05, + "loss": 2.3307, + "step": 1822 + }, + { + "epoch": 0.3417057169634489, + "grad_norm": 48735.31640625, + "learning_rate": 9.798409695260198e-05, + "loss": 2.3009, + "step": 1823 + }, + { + "epoch": 0.34189315838800377, + "grad_norm": 56533.90625, + "learning_rate": 9.798188759409978e-05, + "loss": 2.3556, + "step": 1824 + }, + { + "epoch": 0.34208059981255856, + "grad_norm": 52971.26953125, + "learning_rate": 9.797967705050716e-05, + "loss": 2.4371, + "step": 1825 + }, + { + "epoch": 0.3422680412371134, + "grad_norm": 49848.015625, + "learning_rate": 9.797746532187873e-05, + "loss": 2.3433, + "step": 1826 + }, + { + "epoch": 0.34245548266166825, + "grad_norm": 53057.80859375, + "learning_rate": 9.797525240826911e-05, + "loss": 2.3423, + "step": 1827 + }, + { + "epoch": 0.34264292408622304, + "grad_norm": 53460.61328125, + "learning_rate": 9.797303830973297e-05, + "loss": 2.3523, + "step": 1828 + }, + { + "epoch": 0.3428303655107779, + "grad_norm": 51998.80859375, + "learning_rate": 9.797082302632496e-05, + "loss": 2.357, + "step": 1829 + }, + { + "epoch": 0.3430178069353327, + "grad_norm": 54237.0078125, + "learning_rate": 9.796860655809983e-05, + "loss": 2.3602, + "step": 1830 + }, + { + "epoch": 0.3432052483598875, + "grad_norm": 50822.49609375, + "learning_rate": 9.79663889051123e-05, + "loss": 2.439, + "step": 1831 + }, + { + "epoch": 0.3433926897844424, + "grad_norm": 46900.89453125, + "learning_rate": 9.796417006741717e-05, + "loss": 2.3591, + "step": 1832 + }, + { + "epoch": 0.34358013120899716, + "grad_norm": 49727.6875, + "learning_rate": 9.79619500450692e-05, + "loss": 2.3121, + "step": 1833 + }, + { + "epoch": 0.343767572633552, + "grad_norm": 54846.86328125, + "learning_rate": 9.795972883812328e-05, + "loss": 2.3307, + "step": 1834 + }, + { + "epoch": 0.34395501405810686, + "grad_norm": 58535.5, + "learning_rate": 9.795750644663422e-05, + "loss": 2.3911, + "step": 1835 + }, + { + "epoch": 0.34414245548266165, + "grad_norm": 54044.984375, + "learning_rate": 9.795528287065692e-05, + "loss": 2.3511, + "step": 1836 + }, + { + "epoch": 0.3443298969072165, + "grad_norm": 51040.23046875, + "learning_rate": 9.795305811024633e-05, + "loss": 2.3609, + "step": 1837 + }, + { + "epoch": 0.34451733833177134, + "grad_norm": 48721.01953125, + "learning_rate": 9.795083216545735e-05, + "loss": 2.3606, + "step": 1838 + }, + { + "epoch": 0.34470477975632613, + "grad_norm": 52674.62890625, + "learning_rate": 9.7948605036345e-05, + "loss": 2.4186, + "step": 1839 + }, + { + "epoch": 0.344892221180881, + "grad_norm": 49967.625, + "learning_rate": 9.794637672296427e-05, + "loss": 2.3686, + "step": 1840 + }, + { + "epoch": 0.3450796626054358, + "grad_norm": 50054.25390625, + "learning_rate": 9.794414722537021e-05, + "loss": 2.38, + "step": 1841 + }, + { + "epoch": 0.3452671040299906, + "grad_norm": 53845.71875, + "learning_rate": 9.794191654361786e-05, + "loss": 2.4245, + "step": 1842 + }, + { + "epoch": 0.34545454545454546, + "grad_norm": 48771.11328125, + "learning_rate": 9.793968467776233e-05, + "loss": 2.3316, + "step": 1843 + }, + { + "epoch": 0.3456419868791003, + "grad_norm": 51465.87890625, + "learning_rate": 9.793745162785875e-05, + "loss": 2.448, + "step": 1844 + }, + { + "epoch": 0.3458294283036551, + "grad_norm": 57800.5703125, + "learning_rate": 9.793521739396226e-05, + "loss": 2.3148, + "step": 1845 + }, + { + "epoch": 0.34601686972820994, + "grad_norm": 52216.3828125, + "learning_rate": 9.793298197612805e-05, + "loss": 2.4275, + "step": 1846 + }, + { + "epoch": 0.3462043111527648, + "grad_norm": 47891.64453125, + "learning_rate": 9.793074537441134e-05, + "loss": 2.3849, + "step": 1847 + }, + { + "epoch": 0.3463917525773196, + "grad_norm": 50065.7421875, + "learning_rate": 9.792850758886736e-05, + "loss": 2.3308, + "step": 1848 + }, + { + "epoch": 0.3465791940018744, + "grad_norm": 53173.40625, + "learning_rate": 9.792626861955136e-05, + "loss": 2.2992, + "step": 1849 + }, + { + "epoch": 0.3467666354264292, + "grad_norm": 54242.37109375, + "learning_rate": 9.792402846651869e-05, + "loss": 2.2727, + "step": 1850 + }, + { + "epoch": 0.34695407685098406, + "grad_norm": 51527.13671875, + "learning_rate": 9.792178712982464e-05, + "loss": 2.3097, + "step": 1851 + }, + { + "epoch": 0.3471415182755389, + "grad_norm": 53961.125, + "learning_rate": 9.791954460952459e-05, + "loss": 2.2937, + "step": 1852 + }, + { + "epoch": 0.3473289597000937, + "grad_norm": 54131.89453125, + "learning_rate": 9.79173009056739e-05, + "loss": 2.3291, + "step": 1853 + }, + { + "epoch": 0.34751640112464854, + "grad_norm": 50674.2421875, + "learning_rate": 9.791505601832802e-05, + "loss": 2.3443, + "step": 1854 + }, + { + "epoch": 0.3477038425492034, + "grad_norm": 51384.21484375, + "learning_rate": 9.791280994754238e-05, + "loss": 2.339, + "step": 1855 + }, + { + "epoch": 0.3478912839737582, + "grad_norm": 53070.09765625, + "learning_rate": 9.791056269337244e-05, + "loss": 2.3624, + "step": 1856 + }, + { + "epoch": 0.348078725398313, + "grad_norm": 49023.53125, + "learning_rate": 9.790831425587371e-05, + "loss": 2.3201, + "step": 1857 + }, + { + "epoch": 0.3482661668228679, + "grad_norm": 52131.0390625, + "learning_rate": 9.790606463510175e-05, + "loss": 2.359, + "step": 1858 + }, + { + "epoch": 0.34845360824742266, + "grad_norm": 51742.1640625, + "learning_rate": 9.790381383111211e-05, + "loss": 2.397, + "step": 1859 + }, + { + "epoch": 0.3486410496719775, + "grad_norm": 50676.02734375, + "learning_rate": 9.790156184396035e-05, + "loss": 2.4132, + "step": 1860 + }, + { + "epoch": 0.34882849109653236, + "grad_norm": 53921.96484375, + "learning_rate": 9.789930867370214e-05, + "loss": 2.3304, + "step": 1861 + }, + { + "epoch": 0.34901593252108715, + "grad_norm": 49888.265625, + "learning_rate": 9.78970543203931e-05, + "loss": 2.2963, + "step": 1862 + }, + { + "epoch": 0.349203373945642, + "grad_norm": 56577.92578125, + "learning_rate": 9.789479878408893e-05, + "loss": 2.3576, + "step": 1863 + }, + { + "epoch": 0.34939081537019684, + "grad_norm": 52168.4375, + "learning_rate": 9.78925420648453e-05, + "loss": 2.312, + "step": 1864 + }, + { + "epoch": 0.34957825679475163, + "grad_norm": 48608.64453125, + "learning_rate": 9.789028416271799e-05, + "loss": 2.3213, + "step": 1865 + }, + { + "epoch": 0.3497656982193065, + "grad_norm": 48598.38671875, + "learning_rate": 9.788802507776275e-05, + "loss": 2.4096, + "step": 1866 + }, + { + "epoch": 0.34995313964386127, + "grad_norm": 91033.3359375, + "learning_rate": 9.78857648100354e-05, + "loss": 2.7734, + "step": 1867 + }, + { + "epoch": 0.3501405810684161, + "grad_norm": 52207.02734375, + "learning_rate": 9.788350335959172e-05, + "loss": 2.3094, + "step": 1868 + }, + { + "epoch": 0.35032802249297096, + "grad_norm": 49553.35546875, + "learning_rate": 9.788124072648759e-05, + "loss": 2.3533, + "step": 1869 + }, + { + "epoch": 0.35051546391752575, + "grad_norm": 45514.08984375, + "learning_rate": 9.78789769107789e-05, + "loss": 2.3208, + "step": 1870 + }, + { + "epoch": 0.3507029053420806, + "grad_norm": 54675.73828125, + "learning_rate": 9.787671191252155e-05, + "loss": 2.3829, + "step": 1871 + }, + { + "epoch": 0.35089034676663544, + "grad_norm": 50802.72265625, + "learning_rate": 9.78744457317715e-05, + "loss": 2.345, + "step": 1872 + }, + { + "epoch": 0.35107778819119023, + "grad_norm": 51496.30859375, + "learning_rate": 9.787217836858471e-05, + "loss": 2.3336, + "step": 1873 + }, + { + "epoch": 0.3512652296157451, + "grad_norm": 53911.37890625, + "learning_rate": 9.786990982301716e-05, + "loss": 2.3871, + "step": 1874 + }, + { + "epoch": 0.3514526710402999, + "grad_norm": 49280.89453125, + "learning_rate": 9.786764009512492e-05, + "loss": 2.3191, + "step": 1875 + }, + { + "epoch": 0.3516401124648547, + "grad_norm": 52064.07421875, + "learning_rate": 9.786536918496403e-05, + "loss": 2.3282, + "step": 1876 + }, + { + "epoch": 0.35182755388940956, + "grad_norm": 49858.4765625, + "learning_rate": 9.786309709259058e-05, + "loss": 2.3835, + "step": 1877 + }, + { + "epoch": 0.3520149953139644, + "grad_norm": 49149.0, + "learning_rate": 9.786082381806067e-05, + "loss": 2.3058, + "step": 1878 + }, + { + "epoch": 0.3522024367385192, + "grad_norm": 48383.6015625, + "learning_rate": 9.785854936143049e-05, + "loss": 2.3507, + "step": 1879 + }, + { + "epoch": 0.35238987816307404, + "grad_norm": 50280.51171875, + "learning_rate": 9.785627372275618e-05, + "loss": 2.4589, + "step": 1880 + }, + { + "epoch": 0.3525773195876289, + "grad_norm": 51356.03125, + "learning_rate": 9.785399690209396e-05, + "loss": 2.3176, + "step": 1881 + }, + { + "epoch": 0.3527647610121837, + "grad_norm": 50451.83984375, + "learning_rate": 9.785171889950004e-05, + "loss": 2.4159, + "step": 1882 + }, + { + "epoch": 0.3529522024367385, + "grad_norm": 55143.4453125, + "learning_rate": 9.784943971503072e-05, + "loss": 2.321, + "step": 1883 + }, + { + "epoch": 0.3531396438612934, + "grad_norm": 52029.5, + "learning_rate": 9.784715934874228e-05, + "loss": 2.4243, + "step": 1884 + }, + { + "epoch": 0.35332708528584816, + "grad_norm": 53858.5703125, + "learning_rate": 9.784487780069103e-05, + "loss": 2.3589, + "step": 1885 + }, + { + "epoch": 0.353514526710403, + "grad_norm": 52984.59765625, + "learning_rate": 9.784259507093334e-05, + "loss": 2.2847, + "step": 1886 + }, + { + "epoch": 0.3537019681349578, + "grad_norm": 53184.72265625, + "learning_rate": 9.784031115952557e-05, + "loss": 2.3655, + "step": 1887 + }, + { + "epoch": 0.35388940955951265, + "grad_norm": 49445.5703125, + "learning_rate": 9.783802606652415e-05, + "loss": 2.2759, + "step": 1888 + }, + { + "epoch": 0.3540768509840675, + "grad_norm": 57161.3984375, + "learning_rate": 9.78357397919855e-05, + "loss": 2.36, + "step": 1889 + }, + { + "epoch": 0.3542642924086223, + "grad_norm": 48873.31640625, + "learning_rate": 9.78334523359661e-05, + "loss": 2.34, + "step": 1890 + }, + { + "epoch": 0.35445173383317713, + "grad_norm": 54775.1171875, + "learning_rate": 9.783116369852245e-05, + "loss": 2.3502, + "step": 1891 + }, + { + "epoch": 0.354639175257732, + "grad_norm": 48719.48046875, + "learning_rate": 9.782887387971105e-05, + "loss": 2.4014, + "step": 1892 + }, + { + "epoch": 0.35482661668228677, + "grad_norm": 52338.390625, + "learning_rate": 9.78265828795885e-05, + "loss": 2.2694, + "step": 1893 + }, + { + "epoch": 0.3550140581068416, + "grad_norm": 51298.8828125, + "learning_rate": 9.782429069821136e-05, + "loss": 2.3915, + "step": 1894 + }, + { + "epoch": 0.35520149953139646, + "grad_norm": 51806.75390625, + "learning_rate": 9.782199733563623e-05, + "loss": 2.3746, + "step": 1895 + }, + { + "epoch": 0.35538894095595125, + "grad_norm": 60556.75, + "learning_rate": 9.781970279191977e-05, + "loss": 2.4999, + "step": 1896 + }, + { + "epoch": 0.3555763823805061, + "grad_norm": 49798.92578125, + "learning_rate": 9.781740706711866e-05, + "loss": 2.3254, + "step": 1897 + }, + { + "epoch": 0.35576382380506094, + "grad_norm": 54512.87109375, + "learning_rate": 9.78151101612896e-05, + "loss": 2.3509, + "step": 1898 + }, + { + "epoch": 0.35595126522961573, + "grad_norm": 56074.703125, + "learning_rate": 9.78128120744893e-05, + "loss": 2.4439, + "step": 1899 + }, + { + "epoch": 0.3561387066541706, + "grad_norm": 53018.98828125, + "learning_rate": 9.781051280677453e-05, + "loss": 2.4597, + "step": 1900 + }, + { + "epoch": 0.3563261480787254, + "grad_norm": 49422.4921875, + "learning_rate": 9.780821235820208e-05, + "loss": 2.3411, + "step": 1901 + }, + { + "epoch": 0.3565135895032802, + "grad_norm": 55799.6015625, + "learning_rate": 9.780591072882877e-05, + "loss": 2.3646, + "step": 1902 + }, + { + "epoch": 0.35670103092783506, + "grad_norm": 53406.14453125, + "learning_rate": 9.780360791871145e-05, + "loss": 2.2669, + "step": 1903 + }, + { + "epoch": 0.35688847235238985, + "grad_norm": 49803.5, + "learning_rate": 9.780130392790698e-05, + "loss": 2.3485, + "step": 1904 + }, + { + "epoch": 0.3570759137769447, + "grad_norm": 50426.14453125, + "learning_rate": 9.779899875647229e-05, + "loss": 2.404, + "step": 1905 + }, + { + "epoch": 0.35726335520149954, + "grad_norm": 51874.203125, + "learning_rate": 9.779669240446431e-05, + "loss": 2.3052, + "step": 1906 + }, + { + "epoch": 0.35745079662605433, + "grad_norm": 52652.37890625, + "learning_rate": 9.779438487193999e-05, + "loss": 2.3554, + "step": 1907 + }, + { + "epoch": 0.3576382380506092, + "grad_norm": 55155.375, + "learning_rate": 9.779207615895633e-05, + "loss": 2.4562, + "step": 1908 + }, + { + "epoch": 0.357825679475164, + "grad_norm": 49467.34765625, + "learning_rate": 9.778976626557035e-05, + "loss": 2.3293, + "step": 1909 + }, + { + "epoch": 0.3580131208997188, + "grad_norm": 51923.23046875, + "learning_rate": 9.77874551918391e-05, + "loss": 2.3414, + "step": 1910 + }, + { + "epoch": 0.35820056232427366, + "grad_norm": 49378.2265625, + "learning_rate": 9.778514293781967e-05, + "loss": 2.3956, + "step": 1911 + }, + { + "epoch": 0.3583880037488285, + "grad_norm": 49036.37109375, + "learning_rate": 9.778282950356917e-05, + "loss": 2.3428, + "step": 1912 + }, + { + "epoch": 0.3585754451733833, + "grad_norm": 55697.90625, + "learning_rate": 9.778051488914472e-05, + "loss": 2.3479, + "step": 1913 + }, + { + "epoch": 0.35876288659793815, + "grad_norm": 55772.83203125, + "learning_rate": 9.777819909460351e-05, + "loss": 2.3724, + "step": 1914 + }, + { + "epoch": 0.358950328022493, + "grad_norm": 53278.69140625, + "learning_rate": 9.777588212000273e-05, + "loss": 2.3204, + "step": 1915 + }, + { + "epoch": 0.3591377694470478, + "grad_norm": 52015.34375, + "learning_rate": 9.777356396539958e-05, + "loss": 2.3084, + "step": 1916 + }, + { + "epoch": 0.35932521087160263, + "grad_norm": 51449.8984375, + "learning_rate": 9.777124463085137e-05, + "loss": 2.3257, + "step": 1917 + }, + { + "epoch": 0.3595126522961575, + "grad_norm": 50698.9921875, + "learning_rate": 9.776892411641534e-05, + "loss": 2.3226, + "step": 1918 + }, + { + "epoch": 0.35970009372071227, + "grad_norm": 55587.2265625, + "learning_rate": 9.776660242214881e-05, + "loss": 2.2824, + "step": 1919 + }, + { + "epoch": 0.3598875351452671, + "grad_norm": 50927.67578125, + "learning_rate": 9.776427954810914e-05, + "loss": 2.3251, + "step": 1920 + }, + { + "epoch": 0.3600749765698219, + "grad_norm": 51312.54296875, + "learning_rate": 9.77619554943537e-05, + "loss": 2.3769, + "step": 1921 + }, + { + "epoch": 0.36026241799437675, + "grad_norm": 54422.7265625, + "learning_rate": 9.775963026093987e-05, + "loss": 2.31, + "step": 1922 + }, + { + "epoch": 0.3604498594189316, + "grad_norm": 50281.90234375, + "learning_rate": 9.77573038479251e-05, + "loss": 2.3866, + "step": 1923 + }, + { + "epoch": 0.3606373008434864, + "grad_norm": 53338.984375, + "learning_rate": 9.775497625536684e-05, + "loss": 2.3295, + "step": 1924 + }, + { + "epoch": 0.36082474226804123, + "grad_norm": 57137.63671875, + "learning_rate": 9.775264748332258e-05, + "loss": 2.3469, + "step": 1925 + }, + { + "epoch": 0.3610121836925961, + "grad_norm": 50151.5390625, + "learning_rate": 9.775031753184984e-05, + "loss": 2.3457, + "step": 1926 + }, + { + "epoch": 0.36119962511715087, + "grad_norm": 49694.671875, + "learning_rate": 9.774798640100617e-05, + "loss": 2.3664, + "step": 1927 + }, + { + "epoch": 0.3613870665417057, + "grad_norm": 50724.984375, + "learning_rate": 9.774565409084914e-05, + "loss": 2.3463, + "step": 1928 + }, + { + "epoch": 0.36157450796626056, + "grad_norm": 47664.8828125, + "learning_rate": 9.774332060143637e-05, + "loss": 2.384, + "step": 1929 + }, + { + "epoch": 0.36176194939081535, + "grad_norm": 55592.5625, + "learning_rate": 9.774098593282546e-05, + "loss": 2.3311, + "step": 1930 + }, + { + "epoch": 0.3619493908153702, + "grad_norm": 55824.94140625, + "learning_rate": 9.773865008507412e-05, + "loss": 2.3609, + "step": 1931 + }, + { + "epoch": 0.36213683223992504, + "grad_norm": 50869.19140625, + "learning_rate": 9.773631305823999e-05, + "loss": 2.3253, + "step": 1932 + }, + { + "epoch": 0.36232427366447983, + "grad_norm": 48691.39453125, + "learning_rate": 9.773397485238083e-05, + "loss": 2.42, + "step": 1933 + }, + { + "epoch": 0.3625117150890347, + "grad_norm": 56542.7578125, + "learning_rate": 9.773163546755438e-05, + "loss": 2.3876, + "step": 1934 + }, + { + "epoch": 0.3626991565135895, + "grad_norm": 46200.29296875, + "learning_rate": 9.772929490381842e-05, + "loss": 2.3769, + "step": 1935 + }, + { + "epoch": 0.3628865979381443, + "grad_norm": 50262.1875, + "learning_rate": 9.772695316123076e-05, + "loss": 2.3766, + "step": 1936 + }, + { + "epoch": 0.36307403936269916, + "grad_norm": 50446.12890625, + "learning_rate": 9.772461023984923e-05, + "loss": 2.3081, + "step": 1937 + }, + { + "epoch": 0.363261480787254, + "grad_norm": 51865.890625, + "learning_rate": 9.77222661397317e-05, + "loss": 2.3621, + "step": 1938 + }, + { + "epoch": 0.3634489222118088, + "grad_norm": 50604.91796875, + "learning_rate": 9.771992086093609e-05, + "loss": 2.2988, + "step": 1939 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 47917.60546875, + "learning_rate": 9.771757440352027e-05, + "loss": 2.3422, + "step": 1940 + }, + { + "epoch": 0.36382380506091844, + "grad_norm": 54449.16796875, + "learning_rate": 9.771522676754225e-05, + "loss": 2.3043, + "step": 1941 + }, + { + "epoch": 0.3640112464854733, + "grad_norm": 52714.58203125, + "learning_rate": 9.771287795305999e-05, + "loss": 2.4122, + "step": 1942 + }, + { + "epoch": 0.36419868791002813, + "grad_norm": 50625.98828125, + "learning_rate": 9.771052796013152e-05, + "loss": 2.3907, + "step": 1943 + }, + { + "epoch": 0.3643861293345829, + "grad_norm": 53964.46484375, + "learning_rate": 9.770817678881484e-05, + "loss": 2.3379, + "step": 1944 + }, + { + "epoch": 0.36457357075913777, + "grad_norm": 50834.78125, + "learning_rate": 9.770582443916807e-05, + "loss": 2.3408, + "step": 1945 + }, + { + "epoch": 0.3647610121836926, + "grad_norm": 46078.59765625, + "learning_rate": 9.770347091124927e-05, + "loss": 2.3731, + "step": 1946 + }, + { + "epoch": 0.3649484536082474, + "grad_norm": 50685.8203125, + "learning_rate": 9.770111620511659e-05, + "loss": 2.3897, + "step": 1947 + }, + { + "epoch": 0.36513589503280225, + "grad_norm": 53217.0625, + "learning_rate": 9.76987603208282e-05, + "loss": 2.4228, + "step": 1948 + }, + { + "epoch": 0.3653233364573571, + "grad_norm": 51450.25, + "learning_rate": 9.769640325844226e-05, + "loss": 2.3866, + "step": 1949 + }, + { + "epoch": 0.3655107778819119, + "grad_norm": 52237.9765625, + "learning_rate": 9.7694045018017e-05, + "loss": 2.2487, + "step": 1950 + }, + { + "epoch": 0.36569821930646673, + "grad_norm": 52582.734375, + "learning_rate": 9.769168559961066e-05, + "loss": 2.3754, + "step": 1951 + }, + { + "epoch": 0.3658856607310216, + "grad_norm": 53176.02734375, + "learning_rate": 9.768932500328152e-05, + "loss": 2.4468, + "step": 1952 + }, + { + "epoch": 0.36607310215557637, + "grad_norm": 51613.4140625, + "learning_rate": 9.768696322908791e-05, + "loss": 2.3455, + "step": 1953 + }, + { + "epoch": 0.3662605435801312, + "grad_norm": 50158.8046875, + "learning_rate": 9.76846002770881e-05, + "loss": 2.3917, + "step": 1954 + }, + { + "epoch": 0.36644798500468606, + "grad_norm": 46289.140625, + "learning_rate": 9.76822361473405e-05, + "loss": 2.3696, + "step": 1955 + }, + { + "epoch": 0.36663542642924085, + "grad_norm": 48988.86328125, + "learning_rate": 9.767987083990348e-05, + "loss": 2.2979, + "step": 1956 + }, + { + "epoch": 0.3668228678537957, + "grad_norm": 55435.14453125, + "learning_rate": 9.767750435483549e-05, + "loss": 2.411, + "step": 1957 + }, + { + "epoch": 0.3670103092783505, + "grad_norm": 52889.921875, + "learning_rate": 9.767513669219493e-05, + "loss": 2.3179, + "step": 1958 + }, + { + "epoch": 0.36719775070290533, + "grad_norm": 52266.37109375, + "learning_rate": 9.767276785204033e-05, + "loss": 2.3368, + "step": 1959 + }, + { + "epoch": 0.3673851921274602, + "grad_norm": 48900.00390625, + "learning_rate": 9.767039783443014e-05, + "loss": 2.3021, + "step": 1960 + }, + { + "epoch": 0.36757263355201497, + "grad_norm": 50285.87890625, + "learning_rate": 9.766802663942295e-05, + "loss": 2.2604, + "step": 1961 + }, + { + "epoch": 0.3677600749765698, + "grad_norm": 50580.359375, + "learning_rate": 9.766565426707732e-05, + "loss": 2.326, + "step": 1962 + }, + { + "epoch": 0.36794751640112466, + "grad_norm": 51279.828125, + "learning_rate": 9.766328071745181e-05, + "loss": 2.3609, + "step": 1963 + }, + { + "epoch": 0.36813495782567945, + "grad_norm": 50928.19921875, + "learning_rate": 9.766090599060507e-05, + "loss": 2.319, + "step": 1964 + }, + { + "epoch": 0.3683223992502343, + "grad_norm": 56773.3984375, + "learning_rate": 9.765853008659574e-05, + "loss": 2.3106, + "step": 1965 + }, + { + "epoch": 0.36850984067478915, + "grad_norm": 52685.32421875, + "learning_rate": 9.765615300548253e-05, + "loss": 2.4719, + "step": 1966 + }, + { + "epoch": 0.36869728209934394, + "grad_norm": 50704.109375, + "learning_rate": 9.765377474732408e-05, + "loss": 2.4101, + "step": 1967 + }, + { + "epoch": 0.3688847235238988, + "grad_norm": 51181.42578125, + "learning_rate": 9.765139531217922e-05, + "loss": 2.3309, + "step": 1968 + }, + { + "epoch": 0.36907216494845363, + "grad_norm": 52023.26953125, + "learning_rate": 9.764901470010665e-05, + "loss": 2.3625, + "step": 1969 + }, + { + "epoch": 0.3692596063730084, + "grad_norm": 54251.9296875, + "learning_rate": 9.764663291116522e-05, + "loss": 2.2854, + "step": 1970 + }, + { + "epoch": 0.36944704779756327, + "grad_norm": 48773.78515625, + "learning_rate": 9.76442499454137e-05, + "loss": 2.3063, + "step": 1971 + }, + { + "epoch": 0.3696344892221181, + "grad_norm": 46697.5859375, + "learning_rate": 9.764186580291101e-05, + "loss": 2.4126, + "step": 1972 + }, + { + "epoch": 0.3698219306466729, + "grad_norm": 54273.84765625, + "learning_rate": 9.763948048371598e-05, + "loss": 2.3015, + "step": 1973 + }, + { + "epoch": 0.37000937207122775, + "grad_norm": 49937.7421875, + "learning_rate": 9.763709398788757e-05, + "loss": 2.3458, + "step": 1974 + }, + { + "epoch": 0.3701968134957826, + "grad_norm": 46757.98046875, + "learning_rate": 9.763470631548468e-05, + "loss": 2.3448, + "step": 1975 + }, + { + "epoch": 0.3703842549203374, + "grad_norm": 49674.296875, + "learning_rate": 9.763231746656632e-05, + "loss": 2.305, + "step": 1976 + }, + { + "epoch": 0.37057169634489223, + "grad_norm": 52311.47265625, + "learning_rate": 9.762992744119145e-05, + "loss": 2.3714, + "step": 1977 + }, + { + "epoch": 0.370759137769447, + "grad_norm": 51622.9921875, + "learning_rate": 9.762753623941915e-05, + "loss": 2.3258, + "step": 1978 + }, + { + "epoch": 0.37094657919400187, + "grad_norm": 51940.53515625, + "learning_rate": 9.762514386130845e-05, + "loss": 2.3975, + "step": 1979 + }, + { + "epoch": 0.3711340206185567, + "grad_norm": 51649.08984375, + "learning_rate": 9.762275030691844e-05, + "loss": 2.4456, + "step": 1980 + }, + { + "epoch": 0.3713214620431115, + "grad_norm": 51633.171875, + "learning_rate": 9.762035557630823e-05, + "loss": 2.3659, + "step": 1981 + }, + { + "epoch": 0.37150890346766635, + "grad_norm": 55226.609375, + "learning_rate": 9.761795966953699e-05, + "loss": 2.2845, + "step": 1982 + }, + { + "epoch": 0.3716963448922212, + "grad_norm": 50778.2265625, + "learning_rate": 9.761556258666389e-05, + "loss": 2.2881, + "step": 1983 + }, + { + "epoch": 0.371883786316776, + "grad_norm": 46740.19921875, + "learning_rate": 9.761316432774812e-05, + "loss": 2.383, + "step": 1984 + }, + { + "epoch": 0.37207122774133083, + "grad_norm": 50748.01171875, + "learning_rate": 9.761076489284892e-05, + "loss": 2.2691, + "step": 1985 + }, + { + "epoch": 0.3722586691658857, + "grad_norm": 54370.4375, + "learning_rate": 9.760836428202556e-05, + "loss": 2.339, + "step": 1986 + }, + { + "epoch": 0.37244611059044047, + "grad_norm": 51373.36328125, + "learning_rate": 9.760596249533734e-05, + "loss": 2.3717, + "step": 1987 + }, + { + "epoch": 0.3726335520149953, + "grad_norm": 51067.6328125, + "learning_rate": 9.760355953284354e-05, + "loss": 2.3144, + "step": 1988 + }, + { + "epoch": 0.37282099343955016, + "grad_norm": 55583.0703125, + "learning_rate": 9.760115539460355e-05, + "loss": 2.3583, + "step": 1989 + }, + { + "epoch": 0.37300843486410495, + "grad_norm": 50875.94140625, + "learning_rate": 9.759875008067674e-05, + "loss": 2.3333, + "step": 1990 + }, + { + "epoch": 0.3731958762886598, + "grad_norm": 51907.890625, + "learning_rate": 9.759634359112252e-05, + "loss": 2.3343, + "step": 1991 + }, + { + "epoch": 0.37338331771321465, + "grad_norm": 48478.11328125, + "learning_rate": 9.759393592600033e-05, + "loss": 2.3589, + "step": 1992 + }, + { + "epoch": 0.37357075913776944, + "grad_norm": 48703.19921875, + "learning_rate": 9.759152708536961e-05, + "loss": 2.2992, + "step": 1993 + }, + { + "epoch": 0.3737582005623243, + "grad_norm": 51127.171875, + "learning_rate": 9.758911706928988e-05, + "loss": 2.3529, + "step": 1994 + }, + { + "epoch": 0.3739456419868791, + "grad_norm": 49880.5625, + "learning_rate": 9.758670587782066e-05, + "loss": 2.3347, + "step": 1995 + }, + { + "epoch": 0.3741330834114339, + "grad_norm": 53560.29296875, + "learning_rate": 9.758429351102151e-05, + "loss": 2.374, + "step": 1996 + }, + { + "epoch": 0.37432052483598877, + "grad_norm": 56904.07421875, + "learning_rate": 9.7581879968952e-05, + "loss": 2.4095, + "step": 1997 + }, + { + "epoch": 0.37450796626054356, + "grad_norm": 60283.9140625, + "learning_rate": 9.757946525167174e-05, + "loss": 2.3449, + "step": 1998 + }, + { + "epoch": 0.3746954076850984, + "grad_norm": 49392.2734375, + "learning_rate": 9.757704935924037e-05, + "loss": 2.3733, + "step": 1999 + }, + { + "epoch": 0.37488284910965325, + "grad_norm": 50626.52734375, + "learning_rate": 9.757463229171758e-05, + "loss": 2.3662, + "step": 2000 + }, + { + "epoch": 0.37488284910965325, + "eval_loss": 2.338331699371338, + "eval_runtime": 129.0266, + "eval_samples_per_second": 39.131, + "eval_steps_per_second": 1.961, + "step": 2000 + }, + { + "epoch": 0.37507029053420804, + "grad_norm": 50585.30078125, + "learning_rate": 9.757221404916306e-05, + "loss": 2.3733, + "step": 2001 + }, + { + "epoch": 0.3752577319587629, + "grad_norm": 51904.39453125, + "learning_rate": 9.75697946316365e-05, + "loss": 2.3889, + "step": 2002 + }, + { + "epoch": 0.37544517338331773, + "grad_norm": 50689.83984375, + "learning_rate": 9.756737403919771e-05, + "loss": 2.3316, + "step": 2003 + }, + { + "epoch": 0.3756326148078725, + "grad_norm": 48160.54296875, + "learning_rate": 9.756495227190645e-05, + "loss": 2.2977, + "step": 2004 + }, + { + "epoch": 0.37582005623242737, + "grad_norm": 56086.03125, + "learning_rate": 9.756252932982255e-05, + "loss": 2.4325, + "step": 2005 + }, + { + "epoch": 0.3760074976569822, + "grad_norm": 47382.58203125, + "learning_rate": 9.756010521300582e-05, + "loss": 2.3582, + "step": 2006 + }, + { + "epoch": 0.376194939081537, + "grad_norm": 51345.58203125, + "learning_rate": 9.755767992151617e-05, + "loss": 2.2316, + "step": 2007 + }, + { + "epoch": 0.37638238050609185, + "grad_norm": 53322.1484375, + "learning_rate": 9.755525345541347e-05, + "loss": 2.3212, + "step": 2008 + }, + { + "epoch": 0.3765698219306467, + "grad_norm": 54630.109375, + "learning_rate": 9.755282581475769e-05, + "loss": 2.3308, + "step": 2009 + }, + { + "epoch": 0.3767572633552015, + "grad_norm": 54103.76171875, + "learning_rate": 9.755039699960875e-05, + "loss": 2.326, + "step": 2010 + }, + { + "epoch": 0.37694470477975633, + "grad_norm": 54917.765625, + "learning_rate": 9.754796701002664e-05, + "loss": 2.3265, + "step": 2011 + }, + { + "epoch": 0.3771321462043112, + "grad_norm": 52879.6484375, + "learning_rate": 9.754553584607143e-05, + "loss": 2.4129, + "step": 2012 + }, + { + "epoch": 0.37731958762886597, + "grad_norm": 57259.40234375, + "learning_rate": 9.754310350780309e-05, + "loss": 2.3889, + "step": 2013 + }, + { + "epoch": 0.3775070290534208, + "grad_norm": 49221.88671875, + "learning_rate": 9.754066999528175e-05, + "loss": 2.327, + "step": 2014 + }, + { + "epoch": 0.3776944704779756, + "grad_norm": 54997.66796875, + "learning_rate": 9.75382353085675e-05, + "loss": 2.3534, + "step": 2015 + }, + { + "epoch": 0.37788191190253045, + "grad_norm": 52235.49609375, + "learning_rate": 9.753579944772046e-05, + "loss": 2.3619, + "step": 2016 + }, + { + "epoch": 0.3780693533270853, + "grad_norm": 47836.234375, + "learning_rate": 9.753336241280082e-05, + "loss": 2.3962, + "step": 2017 + }, + { + "epoch": 0.3782567947516401, + "grad_norm": 52058.5234375, + "learning_rate": 9.753092420386875e-05, + "loss": 2.3616, + "step": 2018 + }, + { + "epoch": 0.37844423617619494, + "grad_norm": 52974.69921875, + "learning_rate": 9.752848482098447e-05, + "loss": 2.383, + "step": 2019 + }, + { + "epoch": 0.3786316776007498, + "grad_norm": 50354.34765625, + "learning_rate": 9.752604426420825e-05, + "loss": 2.3426, + "step": 2020 + }, + { + "epoch": 0.3788191190253046, + "grad_norm": 47950.87890625, + "learning_rate": 9.752360253360036e-05, + "loss": 2.3546, + "step": 2021 + }, + { + "epoch": 0.3790065604498594, + "grad_norm": 46669.49609375, + "learning_rate": 9.752115962922109e-05, + "loss": 2.3477, + "step": 2022 + }, + { + "epoch": 0.37919400187441427, + "grad_norm": 49349.6171875, + "learning_rate": 9.751871555113078e-05, + "loss": 2.3955, + "step": 2023 + }, + { + "epoch": 0.37938144329896906, + "grad_norm": 51278.84765625, + "learning_rate": 9.751627029938981e-05, + "loss": 2.2877, + "step": 2024 + }, + { + "epoch": 0.3795688847235239, + "grad_norm": 52741.87890625, + "learning_rate": 9.751382387405859e-05, + "loss": 2.4193, + "step": 2025 + }, + { + "epoch": 0.37975632614807875, + "grad_norm": 50635.3203125, + "learning_rate": 9.751137627519749e-05, + "loss": 2.3308, + "step": 2026 + }, + { + "epoch": 0.37994376757263354, + "grad_norm": 51481.3359375, + "learning_rate": 9.750892750286702e-05, + "loss": 2.4154, + "step": 2027 + }, + { + "epoch": 0.3801312089971884, + "grad_norm": 54656.4375, + "learning_rate": 9.750647755712763e-05, + "loss": 2.3347, + "step": 2028 + }, + { + "epoch": 0.38031865042174323, + "grad_norm": 57620.51953125, + "learning_rate": 9.750402643803985e-05, + "loss": 2.3663, + "step": 2029 + }, + { + "epoch": 0.380506091846298, + "grad_norm": 49788.0859375, + "learning_rate": 9.750157414566419e-05, + "loss": 2.2838, + "step": 2030 + }, + { + "epoch": 0.38069353327085287, + "grad_norm": 51486.27734375, + "learning_rate": 9.749912068006124e-05, + "loss": 2.2396, + "step": 2031 + }, + { + "epoch": 0.38088097469540766, + "grad_norm": 51821.37109375, + "learning_rate": 9.749666604129158e-05, + "loss": 2.3356, + "step": 2032 + }, + { + "epoch": 0.3810684161199625, + "grad_norm": 54515.6328125, + "learning_rate": 9.749421022941588e-05, + "loss": 2.3737, + "step": 2033 + }, + { + "epoch": 0.38125585754451735, + "grad_norm": 55916.66796875, + "learning_rate": 9.749175324449474e-05, + "loss": 2.317, + "step": 2034 + }, + { + "epoch": 0.38144329896907214, + "grad_norm": 49576.9296875, + "learning_rate": 9.748929508658886e-05, + "loss": 2.3823, + "step": 2035 + }, + { + "epoch": 0.381630740393627, + "grad_norm": 54555.23828125, + "learning_rate": 9.748683575575899e-05, + "loss": 2.302, + "step": 2036 + }, + { + "epoch": 0.38181818181818183, + "grad_norm": 51773.2109375, + "learning_rate": 9.748437525206582e-05, + "loss": 2.3493, + "step": 2037 + }, + { + "epoch": 0.3820056232427366, + "grad_norm": 48332.8984375, + "learning_rate": 9.748191357557016e-05, + "loss": 2.3589, + "step": 2038 + }, + { + "epoch": 0.38219306466729147, + "grad_norm": 53570.37890625, + "learning_rate": 9.74794507263328e-05, + "loss": 2.3346, + "step": 2039 + }, + { + "epoch": 0.3823805060918463, + "grad_norm": 53462.2109375, + "learning_rate": 9.747698670441456e-05, + "loss": 2.3424, + "step": 2040 + }, + { + "epoch": 0.3825679475164011, + "grad_norm": 55568.31640625, + "learning_rate": 9.74745215098763e-05, + "loss": 2.3819, + "step": 2041 + }, + { + "epoch": 0.38275538894095595, + "grad_norm": 51251.16015625, + "learning_rate": 9.747205514277891e-05, + "loss": 2.3352, + "step": 2042 + }, + { + "epoch": 0.3829428303655108, + "grad_norm": 57151.3671875, + "learning_rate": 9.74695876031833e-05, + "loss": 2.4642, + "step": 2043 + }, + { + "epoch": 0.3831302717900656, + "grad_norm": 50506.43359375, + "learning_rate": 9.746711889115043e-05, + "loss": 2.344, + "step": 2044 + }, + { + "epoch": 0.38331771321462044, + "grad_norm": 52242.80859375, + "learning_rate": 9.746464900674125e-05, + "loss": 2.3237, + "step": 2045 + }, + { + "epoch": 0.3835051546391753, + "grad_norm": 49389.04296875, + "learning_rate": 9.74621779500168e-05, + "loss": 2.4025, + "step": 2046 + }, + { + "epoch": 0.3836925960637301, + "grad_norm": 53702.61328125, + "learning_rate": 9.745970572103809e-05, + "loss": 2.3072, + "step": 2047 + }, + { + "epoch": 0.3838800374882849, + "grad_norm": 51695.98828125, + "learning_rate": 9.745723231986618e-05, + "loss": 2.3251, + "step": 2048 + }, + { + "epoch": 0.3840674789128397, + "grad_norm": 48015.07421875, + "learning_rate": 9.745475774656216e-05, + "loss": 2.3009, + "step": 2049 + }, + { + "epoch": 0.38425492033739456, + "grad_norm": 50256.61328125, + "learning_rate": 9.745228200118715e-05, + "loss": 2.2796, + "step": 2050 + }, + { + "epoch": 0.3844423617619494, + "grad_norm": 48255.82421875, + "learning_rate": 9.744980508380229e-05, + "loss": 2.3586, + "step": 2051 + }, + { + "epoch": 0.3846298031865042, + "grad_norm": 52566.16015625, + "learning_rate": 9.744732699446875e-05, + "loss": 2.4586, + "step": 2052 + }, + { + "epoch": 0.38481724461105904, + "grad_norm": 52143.5859375, + "learning_rate": 9.744484773324777e-05, + "loss": 2.2986, + "step": 2053 + }, + { + "epoch": 0.3850046860356139, + "grad_norm": 48069.56640625, + "learning_rate": 9.744236730020057e-05, + "loss": 2.2693, + "step": 2054 + }, + { + "epoch": 0.3851921274601687, + "grad_norm": 51903.33984375, + "learning_rate": 9.74398856953884e-05, + "loss": 2.2883, + "step": 2055 + }, + { + "epoch": 0.3853795688847235, + "grad_norm": 49537.71484375, + "learning_rate": 9.743740291887255e-05, + "loss": 2.3396, + "step": 2056 + }, + { + "epoch": 0.38556701030927837, + "grad_norm": 49500.546875, + "learning_rate": 9.743491897071436e-05, + "loss": 2.3999, + "step": 2057 + }, + { + "epoch": 0.38575445173383316, + "grad_norm": 49237.99609375, + "learning_rate": 9.743243385097516e-05, + "loss": 2.2542, + "step": 2058 + }, + { + "epoch": 0.385941893158388, + "grad_norm": 49955.54296875, + "learning_rate": 9.742994755971637e-05, + "loss": 2.2511, + "step": 2059 + }, + { + "epoch": 0.38612933458294285, + "grad_norm": 55039.75, + "learning_rate": 9.742746009699935e-05, + "loss": 2.4109, + "step": 2060 + }, + { + "epoch": 0.38631677600749764, + "grad_norm": 50561.44921875, + "learning_rate": 9.742497146288555e-05, + "loss": 2.283, + "step": 2061 + }, + { + "epoch": 0.3865042174320525, + "grad_norm": 50816.38671875, + "learning_rate": 9.742248165743646e-05, + "loss": 2.396, + "step": 2062 + }, + { + "epoch": 0.38669165885660733, + "grad_norm": 47144.38671875, + "learning_rate": 9.741999068071354e-05, + "loss": 2.3292, + "step": 2063 + }, + { + "epoch": 0.3868791002811621, + "grad_norm": 52841.46484375, + "learning_rate": 9.741749853277832e-05, + "loss": 2.3735, + "step": 2064 + }, + { + "epoch": 0.38706654170571697, + "grad_norm": 53587.9921875, + "learning_rate": 9.74150052136924e-05, + "loss": 2.3665, + "step": 2065 + }, + { + "epoch": 0.3872539831302718, + "grad_norm": 50384.4453125, + "learning_rate": 9.741251072351732e-05, + "loss": 2.3293, + "step": 2066 + }, + { + "epoch": 0.3874414245548266, + "grad_norm": 49201.171875, + "learning_rate": 9.741001506231467e-05, + "loss": 2.3962, + "step": 2067 + }, + { + "epoch": 0.38762886597938145, + "grad_norm": 50198.65625, + "learning_rate": 9.740751823014613e-05, + "loss": 2.4381, + "step": 2068 + }, + { + "epoch": 0.38781630740393624, + "grad_norm": 53692.640625, + "learning_rate": 9.740502022707336e-05, + "loss": 2.2872, + "step": 2069 + }, + { + "epoch": 0.3880037488284911, + "grad_norm": 49880.55859375, + "learning_rate": 9.740252105315803e-05, + "loss": 2.3051, + "step": 2070 + }, + { + "epoch": 0.38819119025304594, + "grad_norm": 50581.40234375, + "learning_rate": 9.740002070846192e-05, + "loss": 2.3942, + "step": 2071 + }, + { + "epoch": 0.3883786316776007, + "grad_norm": 51335.4609375, + "learning_rate": 9.739751919304674e-05, + "loss": 2.371, + "step": 2072 + }, + { + "epoch": 0.3885660731021556, + "grad_norm": 50713.21484375, + "learning_rate": 9.739501650697428e-05, + "loss": 2.3377, + "step": 2073 + }, + { + "epoch": 0.3887535145267104, + "grad_norm": 50213.8671875, + "learning_rate": 9.739251265030637e-05, + "loss": 2.3137, + "step": 2074 + }, + { + "epoch": 0.3889409559512652, + "grad_norm": 55296.56640625, + "learning_rate": 9.739000762310484e-05, + "loss": 2.3419, + "step": 2075 + }, + { + "epoch": 0.38912839737582006, + "grad_norm": 56543.01953125, + "learning_rate": 9.738750142543157e-05, + "loss": 2.3662, + "step": 2076 + }, + { + "epoch": 0.3893158388003749, + "grad_norm": 50837.05859375, + "learning_rate": 9.738499405734844e-05, + "loss": 2.4343, + "step": 2077 + }, + { + "epoch": 0.3895032802249297, + "grad_norm": 48902.03515625, + "learning_rate": 9.73824855189174e-05, + "loss": 2.3791, + "step": 2078 + }, + { + "epoch": 0.38969072164948454, + "grad_norm": 51087.3828125, + "learning_rate": 9.73799758102004e-05, + "loss": 2.3254, + "step": 2079 + }, + { + "epoch": 0.3898781630740394, + "grad_norm": 55839.734375, + "learning_rate": 9.737746493125943e-05, + "loss": 2.2945, + "step": 2080 + }, + { + "epoch": 0.3900656044985942, + "grad_norm": 51572.39453125, + "learning_rate": 9.737495288215649e-05, + "loss": 2.3457, + "step": 2081 + }, + { + "epoch": 0.390253045923149, + "grad_norm": 49675.77734375, + "learning_rate": 9.737243966295364e-05, + "loss": 2.3581, + "step": 2082 + }, + { + "epoch": 0.39044048734770387, + "grad_norm": 51342.59375, + "learning_rate": 9.736992527371296e-05, + "loss": 2.3221, + "step": 2083 + }, + { + "epoch": 0.39062792877225866, + "grad_norm": 50426.8984375, + "learning_rate": 9.736740971449653e-05, + "loss": 2.3824, + "step": 2084 + }, + { + "epoch": 0.3908153701968135, + "grad_norm": 56466.58984375, + "learning_rate": 9.73648929853665e-05, + "loss": 2.3335, + "step": 2085 + }, + { + "epoch": 0.3910028116213683, + "grad_norm": 50771.16796875, + "learning_rate": 9.736237508638501e-05, + "loss": 2.393, + "step": 2086 + }, + { + "epoch": 0.39119025304592314, + "grad_norm": 54806.421875, + "learning_rate": 9.735985601761428e-05, + "loss": 2.3724, + "step": 2087 + }, + { + "epoch": 0.391377694470478, + "grad_norm": 52493.20703125, + "learning_rate": 9.73573357791165e-05, + "loss": 2.33, + "step": 2088 + }, + { + "epoch": 0.3915651358950328, + "grad_norm": 62637.125, + "learning_rate": 9.735481437095392e-05, + "loss": 2.4064, + "step": 2089 + }, + { + "epoch": 0.3917525773195876, + "grad_norm": 53457.2578125, + "learning_rate": 9.735229179318884e-05, + "loss": 2.4302, + "step": 2090 + }, + { + "epoch": 0.39194001874414247, + "grad_norm": 50092.8984375, + "learning_rate": 9.734976804588354e-05, + "loss": 2.3789, + "step": 2091 + }, + { + "epoch": 0.39212746016869726, + "grad_norm": 50370.80078125, + "learning_rate": 9.734724312910035e-05, + "loss": 2.3077, + "step": 2092 + }, + { + "epoch": 0.3923149015932521, + "grad_norm": 53763.03515625, + "learning_rate": 9.734471704290164e-05, + "loss": 2.306, + "step": 2093 + }, + { + "epoch": 0.39250234301780695, + "grad_norm": 60529.39453125, + "learning_rate": 9.734218978734981e-05, + "loss": 2.3462, + "step": 2094 + }, + { + "epoch": 0.39268978444236174, + "grad_norm": 48769.4921875, + "learning_rate": 9.733966136250728e-05, + "loss": 2.3564, + "step": 2095 + }, + { + "epoch": 0.3928772258669166, + "grad_norm": 55443.25390625, + "learning_rate": 9.733713176843649e-05, + "loss": 2.3882, + "step": 2096 + }, + { + "epoch": 0.39306466729147144, + "grad_norm": 52461.7109375, + "learning_rate": 9.73346010051999e-05, + "loss": 2.2967, + "step": 2097 + }, + { + "epoch": 0.3932521087160262, + "grad_norm": 53786.7265625, + "learning_rate": 9.733206907286004e-05, + "loss": 2.308, + "step": 2098 + }, + { + "epoch": 0.3934395501405811, + "grad_norm": 51258.33984375, + "learning_rate": 9.732953597147944e-05, + "loss": 2.3198, + "step": 2099 + }, + { + "epoch": 0.3936269915651359, + "grad_norm": 50392.640625, + "learning_rate": 9.732700170112067e-05, + "loss": 2.3507, + "step": 2100 + }, + { + "epoch": 0.3938144329896907, + "grad_norm": 53406.94921875, + "learning_rate": 9.732446626184633e-05, + "loss": 2.3014, + "step": 2101 + }, + { + "epoch": 0.39400187441424556, + "grad_norm": 49982.515625, + "learning_rate": 9.732192965371902e-05, + "loss": 2.4496, + "step": 2102 + }, + { + "epoch": 0.3941893158388004, + "grad_norm": 55136.484375, + "learning_rate": 9.73193918768014e-05, + "loss": 2.2983, + "step": 2103 + }, + { + "epoch": 0.3943767572633552, + "grad_norm": 53494.80078125, + "learning_rate": 9.731685293115615e-05, + "loss": 2.3702, + "step": 2104 + }, + { + "epoch": 0.39456419868791004, + "grad_norm": 51116.21484375, + "learning_rate": 9.7314312816846e-05, + "loss": 2.2911, + "step": 2105 + }, + { + "epoch": 0.39475164011246483, + "grad_norm": 53702.33984375, + "learning_rate": 9.731177153393366e-05, + "loss": 2.3362, + "step": 2106 + }, + { + "epoch": 0.3949390815370197, + "grad_norm": 48022.21484375, + "learning_rate": 9.730922908248188e-05, + "loss": 2.288, + "step": 2107 + }, + { + "epoch": 0.3951265229615745, + "grad_norm": 48062.40625, + "learning_rate": 9.730668546255348e-05, + "loss": 2.3634, + "step": 2108 + }, + { + "epoch": 0.3953139643861293, + "grad_norm": 48712.578125, + "learning_rate": 9.73041406742113e-05, + "loss": 2.3231, + "step": 2109 + }, + { + "epoch": 0.39550140581068416, + "grad_norm": 51372.59375, + "learning_rate": 9.730159471751817e-05, + "loss": 2.3436, + "step": 2110 + }, + { + "epoch": 0.395688847235239, + "grad_norm": 49001.16796875, + "learning_rate": 9.729904759253699e-05, + "loss": 2.4632, + "step": 2111 + }, + { + "epoch": 0.3958762886597938, + "grad_norm": 51793.7734375, + "learning_rate": 9.729649929933065e-05, + "loss": 2.3863, + "step": 2112 + }, + { + "epoch": 0.39606373008434864, + "grad_norm": 51379.55859375, + "learning_rate": 9.72939498379621e-05, + "loss": 2.3782, + "step": 2113 + }, + { + "epoch": 0.3962511715089035, + "grad_norm": 54095.30078125, + "learning_rate": 9.729139920849429e-05, + "loss": 2.3316, + "step": 2114 + }, + { + "epoch": 0.3964386129334583, + "grad_norm": 51916.98046875, + "learning_rate": 9.728884741099025e-05, + "loss": 2.3447, + "step": 2115 + }, + { + "epoch": 0.3966260543580131, + "grad_norm": 48842.0390625, + "learning_rate": 9.728629444551298e-05, + "loss": 2.3155, + "step": 2116 + }, + { + "epoch": 0.39681349578256797, + "grad_norm": 52378.37109375, + "learning_rate": 9.728374031212556e-05, + "loss": 2.2729, + "step": 2117 + }, + { + "epoch": 0.39700093720712276, + "grad_norm": 47303.33203125, + "learning_rate": 9.728118501089105e-05, + "loss": 2.3016, + "step": 2118 + }, + { + "epoch": 0.3971883786316776, + "grad_norm": 53515.4609375, + "learning_rate": 9.727862854187257e-05, + "loss": 2.3145, + "step": 2119 + }, + { + "epoch": 0.39737582005623245, + "grad_norm": 48638.9140625, + "learning_rate": 9.727607090513327e-05, + "loss": 2.3744, + "step": 2120 + }, + { + "epoch": 0.39756326148078724, + "grad_norm": 60638.703125, + "learning_rate": 9.72735121007363e-05, + "loss": 2.3086, + "step": 2121 + }, + { + "epoch": 0.3977507029053421, + "grad_norm": 48695.12109375, + "learning_rate": 9.727095212874489e-05, + "loss": 2.3052, + "step": 2122 + }, + { + "epoch": 0.3979381443298969, + "grad_norm": 46820.296875, + "learning_rate": 9.726839098922222e-05, + "loss": 2.3564, + "step": 2123 + }, + { + "epoch": 0.3981255857544517, + "grad_norm": 54244.06640625, + "learning_rate": 9.726582868223161e-05, + "loss": 2.3281, + "step": 2124 + }, + { + "epoch": 0.3983130271790066, + "grad_norm": 50951.515625, + "learning_rate": 9.726326520783629e-05, + "loss": 2.2937, + "step": 2125 + }, + { + "epoch": 0.39850046860356136, + "grad_norm": 53899.9921875, + "learning_rate": 9.726070056609961e-05, + "loss": 2.2879, + "step": 2126 + }, + { + "epoch": 0.3986879100281162, + "grad_norm": 53200.4609375, + "learning_rate": 9.72581347570849e-05, + "loss": 2.3711, + "step": 2127 + }, + { + "epoch": 0.39887535145267106, + "grad_norm": 50927.85546875, + "learning_rate": 9.725556778085552e-05, + "loss": 2.3775, + "step": 2128 + }, + { + "epoch": 0.39906279287722585, + "grad_norm": 49832.31640625, + "learning_rate": 9.72529996374749e-05, + "loss": 2.2089, + "step": 2129 + }, + { + "epoch": 0.3992502343017807, + "grad_norm": 56821.65625, + "learning_rate": 9.725043032700644e-05, + "loss": 2.2743, + "step": 2130 + }, + { + "epoch": 0.39943767572633554, + "grad_norm": 52712.68359375, + "learning_rate": 9.724785984951362e-05, + "loss": 2.3124, + "step": 2131 + }, + { + "epoch": 0.39962511715089033, + "grad_norm": 47737.171875, + "learning_rate": 9.724528820505991e-05, + "loss": 2.345, + "step": 2132 + }, + { + "epoch": 0.3998125585754452, + "grad_norm": 51948.58203125, + "learning_rate": 9.724271539370886e-05, + "loss": 2.2687, + "step": 2133 + }, + { + "epoch": 0.4, + "grad_norm": 50864.75390625, + "learning_rate": 9.724014141552397e-05, + "loss": 2.3884, + "step": 2134 + }, + { + "epoch": 0.4001874414245548, + "grad_norm": 49359.66015625, + "learning_rate": 9.723756627056884e-05, + "loss": 2.3252, + "step": 2135 + }, + { + "epoch": 0.40037488284910966, + "grad_norm": 51483.74609375, + "learning_rate": 9.723498995890707e-05, + "loss": 2.3155, + "step": 2136 + }, + { + "epoch": 0.4005623242736645, + "grad_norm": 51630.328125, + "learning_rate": 9.723241248060228e-05, + "loss": 2.2994, + "step": 2137 + }, + { + "epoch": 0.4007497656982193, + "grad_norm": 53772.44921875, + "learning_rate": 9.722983383571814e-05, + "loss": 2.3609, + "step": 2138 + }, + { + "epoch": 0.40093720712277414, + "grad_norm": 52507.0546875, + "learning_rate": 9.722725402431835e-05, + "loss": 2.3548, + "step": 2139 + }, + { + "epoch": 0.40112464854732893, + "grad_norm": 51508.72265625, + "learning_rate": 9.722467304646661e-05, + "loss": 2.3316, + "step": 2140 + }, + { + "epoch": 0.4013120899718838, + "grad_norm": 49225.5234375, + "learning_rate": 9.722209090222668e-05, + "loss": 2.3475, + "step": 2141 + }, + { + "epoch": 0.4014995313964386, + "grad_norm": 50205.8671875, + "learning_rate": 9.721950759166231e-05, + "loss": 2.3442, + "step": 2142 + }, + { + "epoch": 0.4016869728209934, + "grad_norm": 56833.38671875, + "learning_rate": 9.721692311483735e-05, + "loss": 2.314, + "step": 2143 + }, + { + "epoch": 0.40187441424554826, + "grad_norm": 50970.4296875, + "learning_rate": 9.721433747181558e-05, + "loss": 2.3876, + "step": 2144 + }, + { + "epoch": 0.4020618556701031, + "grad_norm": 50398.484375, + "learning_rate": 9.721175066266091e-05, + "loss": 2.3656, + "step": 2145 + }, + { + "epoch": 0.4022492970946579, + "grad_norm": 51658.80078125, + "learning_rate": 9.72091626874372e-05, + "loss": 2.2995, + "step": 2146 + }, + { + "epoch": 0.40243673851921274, + "grad_norm": 51311.5, + "learning_rate": 9.720657354620837e-05, + "loss": 2.2988, + "step": 2147 + }, + { + "epoch": 0.4026241799437676, + "grad_norm": 49476.03515625, + "learning_rate": 9.720398323903839e-05, + "loss": 2.3157, + "step": 2148 + }, + { + "epoch": 0.4028116213683224, + "grad_norm": 48858.8359375, + "learning_rate": 9.720139176599124e-05, + "loss": 2.3699, + "step": 2149 + }, + { + "epoch": 0.4029990627928772, + "grad_norm": 53022.5703125, + "learning_rate": 9.719879912713088e-05, + "loss": 2.4277, + "step": 2150 + }, + { + "epoch": 0.4031865042174321, + "grad_norm": 51061.84765625, + "learning_rate": 9.719620532252138e-05, + "loss": 2.3535, + "step": 2151 + }, + { + "epoch": 0.40337394564198686, + "grad_norm": 46869.2578125, + "learning_rate": 9.719361035222682e-05, + "loss": 2.3256, + "step": 2152 + }, + { + "epoch": 0.4035613870665417, + "grad_norm": 47377.21875, + "learning_rate": 9.719101421631126e-05, + "loss": 2.3633, + "step": 2153 + }, + { + "epoch": 0.40374882849109656, + "grad_norm": 51816.80859375, + "learning_rate": 9.718841691483883e-05, + "loss": 2.3247, + "step": 2154 + }, + { + "epoch": 0.40393626991565135, + "grad_norm": 55464.5390625, + "learning_rate": 9.71858184478737e-05, + "loss": 2.2908, + "step": 2155 + }, + { + "epoch": 0.4041237113402062, + "grad_norm": 55432.19921875, + "learning_rate": 9.718321881548e-05, + "loss": 2.352, + "step": 2156 + }, + { + "epoch": 0.40431115276476104, + "grad_norm": 48787.43359375, + "learning_rate": 9.7180618017722e-05, + "loss": 2.3446, + "step": 2157 + }, + { + "epoch": 0.40449859418931583, + "grad_norm": 52613.8046875, + "learning_rate": 9.717801605466389e-05, + "loss": 2.3894, + "step": 2158 + }, + { + "epoch": 0.4046860356138707, + "grad_norm": 48160.12890625, + "learning_rate": 9.717541292636995e-05, + "loss": 2.4082, + "step": 2159 + }, + { + "epoch": 0.40487347703842547, + "grad_norm": 51708.10546875, + "learning_rate": 9.717280863290448e-05, + "loss": 2.4747, + "step": 2160 + }, + { + "epoch": 0.4050609184629803, + "grad_norm": 47561.578125, + "learning_rate": 9.717020317433179e-05, + "loss": 2.349, + "step": 2161 + }, + { + "epoch": 0.40524835988753516, + "grad_norm": 53089.78125, + "learning_rate": 9.716759655071626e-05, + "loss": 2.3221, + "step": 2162 + }, + { + "epoch": 0.40543580131208995, + "grad_norm": 54428.01953125, + "learning_rate": 9.716498876212222e-05, + "loss": 2.3184, + "step": 2163 + }, + { + "epoch": 0.4056232427366448, + "grad_norm": 70464.1953125, + "learning_rate": 9.716237980861413e-05, + "loss": 2.5324, + "step": 2164 + }, + { + "epoch": 0.40581068416119964, + "grad_norm": 51147.63671875, + "learning_rate": 9.715976969025639e-05, + "loss": 2.3012, + "step": 2165 + }, + { + "epoch": 0.40599812558575443, + "grad_norm": 54110.89453125, + "learning_rate": 9.715715840711348e-05, + "loss": 2.3971, + "step": 2166 + }, + { + "epoch": 0.4061855670103093, + "grad_norm": 53845.375, + "learning_rate": 9.715454595924991e-05, + "loss": 2.2928, + "step": 2167 + }, + { + "epoch": 0.4063730084348641, + "grad_norm": 51340.375, + "learning_rate": 9.71519323467302e-05, + "loss": 2.3151, + "step": 2168 + }, + { + "epoch": 0.4065604498594189, + "grad_norm": 55916.6796875, + "learning_rate": 9.714931756961886e-05, + "loss": 2.3256, + "step": 2169 + }, + { + "epoch": 0.40674789128397376, + "grad_norm": 52670.3515625, + "learning_rate": 9.714670162798054e-05, + "loss": 2.3929, + "step": 2170 + }, + { + "epoch": 0.4069353327085286, + "grad_norm": 50496.81640625, + "learning_rate": 9.714408452187981e-05, + "loss": 2.2821, + "step": 2171 + }, + { + "epoch": 0.4071227741330834, + "grad_norm": 51624.95703125, + "learning_rate": 9.71414662513813e-05, + "loss": 2.3624, + "step": 2172 + }, + { + "epoch": 0.40731021555763824, + "grad_norm": 53390.24609375, + "learning_rate": 9.713884681654971e-05, + "loss": 2.2801, + "step": 2173 + }, + { + "epoch": 0.4074976569821931, + "grad_norm": 52306.75390625, + "learning_rate": 9.71362262174497e-05, + "loss": 2.4521, + "step": 2174 + }, + { + "epoch": 0.4076850984067479, + "grad_norm": 55325.9375, + "learning_rate": 9.713360445414604e-05, + "loss": 2.3365, + "step": 2175 + }, + { + "epoch": 0.4078725398313027, + "grad_norm": 53949.64453125, + "learning_rate": 9.713098152670346e-05, + "loss": 2.3504, + "step": 2176 + }, + { + "epoch": 0.4080599812558575, + "grad_norm": 50298.76171875, + "learning_rate": 9.712835743518672e-05, + "loss": 2.344, + "step": 2177 + }, + { + "epoch": 0.40824742268041236, + "grad_norm": 54615.8671875, + "learning_rate": 9.712573217966067e-05, + "loss": 2.3772, + "step": 2178 + }, + { + "epoch": 0.4084348641049672, + "grad_norm": 52660.7265625, + "learning_rate": 9.712310576019013e-05, + "loss": 2.3972, + "step": 2179 + }, + { + "epoch": 0.408622305529522, + "grad_norm": 50292.8203125, + "learning_rate": 9.712047817683997e-05, + "loss": 2.363, + "step": 2180 + }, + { + "epoch": 0.40880974695407685, + "grad_norm": 49820.94140625, + "learning_rate": 9.71178494296751e-05, + "loss": 2.3189, + "step": 2181 + }, + { + "epoch": 0.4089971883786317, + "grad_norm": 54418.0, + "learning_rate": 9.711521951876044e-05, + "loss": 2.3554, + "step": 2182 + }, + { + "epoch": 0.4091846298031865, + "grad_norm": 51060.71875, + "learning_rate": 9.711258844416093e-05, + "loss": 2.3482, + "step": 2183 + }, + { + "epoch": 0.40937207122774133, + "grad_norm": 51042.43359375, + "learning_rate": 9.710995620594157e-05, + "loss": 2.3572, + "step": 2184 + }, + { + "epoch": 0.4095595126522962, + "grad_norm": 49365.56640625, + "learning_rate": 9.710732280416738e-05, + "loss": 2.3196, + "step": 2185 + }, + { + "epoch": 0.40974695407685097, + "grad_norm": 50473.05078125, + "learning_rate": 9.710468823890338e-05, + "loss": 2.3434, + "step": 2186 + }, + { + "epoch": 0.4099343955014058, + "grad_norm": 53713.109375, + "learning_rate": 9.710205251021468e-05, + "loss": 2.3182, + "step": 2187 + }, + { + "epoch": 0.41012183692596066, + "grad_norm": 51608.8515625, + "learning_rate": 9.709941561816631e-05, + "loss": 2.3793, + "step": 2188 + }, + { + "epoch": 0.41030927835051545, + "grad_norm": 52197.9921875, + "learning_rate": 9.709677756282348e-05, + "loss": 2.4257, + "step": 2189 + }, + { + "epoch": 0.4104967197750703, + "grad_norm": 57002.9140625, + "learning_rate": 9.709413834425127e-05, + "loss": 2.3427, + "step": 2190 + }, + { + "epoch": 0.41068416119962514, + "grad_norm": 51540.140625, + "learning_rate": 9.709149796251493e-05, + "loss": 2.367, + "step": 2191 + }, + { + "epoch": 0.41087160262417993, + "grad_norm": 54008.22265625, + "learning_rate": 9.708885641767963e-05, + "loss": 2.3777, + "step": 2192 + }, + { + "epoch": 0.4110590440487348, + "grad_norm": 52681.671875, + "learning_rate": 9.708621370981063e-05, + "loss": 2.3569, + "step": 2193 + }, + { + "epoch": 0.4112464854732896, + "grad_norm": 53098.8203125, + "learning_rate": 9.70835698389732e-05, + "loss": 2.4073, + "step": 2194 + }, + { + "epoch": 0.4114339268978444, + "grad_norm": 48950.81640625, + "learning_rate": 9.708092480523265e-05, + "loss": 2.3827, + "step": 2195 + }, + { + "epoch": 0.41162136832239926, + "grad_norm": 50937.32421875, + "learning_rate": 9.707827860865429e-05, + "loss": 2.3555, + "step": 2196 + }, + { + "epoch": 0.41180880974695405, + "grad_norm": 52374.3515625, + "learning_rate": 9.707563124930348e-05, + "loss": 2.3044, + "step": 2197 + }, + { + "epoch": 0.4119962511715089, + "grad_norm": 58401.83203125, + "learning_rate": 9.707298272724563e-05, + "loss": 2.4344, + "step": 2198 + }, + { + "epoch": 0.41218369259606374, + "grad_norm": 51754.84375, + "learning_rate": 9.707033304254611e-05, + "loss": 2.3333, + "step": 2199 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 53308.421875, + "learning_rate": 9.706768219527043e-05, + "loss": 2.3442, + "step": 2200 + }, + { + "epoch": 0.4125585754451734, + "grad_norm": 52180.7890625, + "learning_rate": 9.706503018548399e-05, + "loss": 2.3936, + "step": 2201 + }, + { + "epoch": 0.4127460168697282, + "grad_norm": 49902.76953125, + "learning_rate": 9.706237701325234e-05, + "loss": 2.3797, + "step": 2202 + }, + { + "epoch": 0.412933458294283, + "grad_norm": 49608.74609375, + "learning_rate": 9.705972267864099e-05, + "loss": 2.3916, + "step": 2203 + }, + { + "epoch": 0.41312089971883786, + "grad_norm": 54416.7265625, + "learning_rate": 9.705706718171552e-05, + "loss": 2.4409, + "step": 2204 + }, + { + "epoch": 0.4133083411433927, + "grad_norm": 49079.3359375, + "learning_rate": 9.705441052254147e-05, + "loss": 2.3529, + "step": 2205 + }, + { + "epoch": 0.4134957825679475, + "grad_norm": 52453.84765625, + "learning_rate": 9.705175270118452e-05, + "loss": 2.3107, + "step": 2206 + }, + { + "epoch": 0.41368322399250235, + "grad_norm": 58125.171875, + "learning_rate": 9.704909371771025e-05, + "loss": 2.3991, + "step": 2207 + }, + { + "epoch": 0.4138706654170572, + "grad_norm": 49559.0078125, + "learning_rate": 9.704643357218439e-05, + "loss": 2.3614, + "step": 2208 + }, + { + "epoch": 0.414058106841612, + "grad_norm": 50891.109375, + "learning_rate": 9.70437722646726e-05, + "loss": 2.3132, + "step": 2209 + }, + { + "epoch": 0.41424554826616683, + "grad_norm": 50562.88671875, + "learning_rate": 9.704110979524064e-05, + "loss": 2.3437, + "step": 2210 + }, + { + "epoch": 0.4144329896907217, + "grad_norm": 48893.93359375, + "learning_rate": 9.703844616395425e-05, + "loss": 2.3304, + "step": 2211 + }, + { + "epoch": 0.41462043111527647, + "grad_norm": 50027.58203125, + "learning_rate": 9.703578137087924e-05, + "loss": 2.3549, + "step": 2212 + }, + { + "epoch": 0.4148078725398313, + "grad_norm": 50305.23046875, + "learning_rate": 9.70331154160814e-05, + "loss": 2.3221, + "step": 2213 + }, + { + "epoch": 0.4149953139643861, + "grad_norm": 52539.64453125, + "learning_rate": 9.70304482996266e-05, + "loss": 2.3748, + "step": 2214 + }, + { + "epoch": 0.41518275538894095, + "grad_norm": 51053.58984375, + "learning_rate": 9.702778002158068e-05, + "loss": 2.3421, + "step": 2215 + }, + { + "epoch": 0.4153701968134958, + "grad_norm": 51661.71484375, + "learning_rate": 9.702511058200959e-05, + "loss": 2.3281, + "step": 2216 + }, + { + "epoch": 0.4155576382380506, + "grad_norm": 50949.7265625, + "learning_rate": 9.702243998097923e-05, + "loss": 2.2745, + "step": 2217 + }, + { + "epoch": 0.41574507966260543, + "grad_norm": 48151.48828125, + "learning_rate": 9.701976821855556e-05, + "loss": 2.376, + "step": 2218 + }, + { + "epoch": 0.4159325210871603, + "grad_norm": 48645.8359375, + "learning_rate": 9.70170952948046e-05, + "loss": 2.365, + "step": 2219 + }, + { + "epoch": 0.41611996251171507, + "grad_norm": 54149.828125, + "learning_rate": 9.701442120979232e-05, + "loss": 2.2934, + "step": 2220 + }, + { + "epoch": 0.4163074039362699, + "grad_norm": 46809.8515625, + "learning_rate": 9.70117459635848e-05, + "loss": 2.344, + "step": 2221 + }, + { + "epoch": 0.41649484536082476, + "grad_norm": 50617.27734375, + "learning_rate": 9.700906955624811e-05, + "loss": 2.2921, + "step": 2222 + }, + { + "epoch": 0.41668228678537955, + "grad_norm": 49888.78125, + "learning_rate": 9.700639198784833e-05, + "loss": 2.2922, + "step": 2223 + }, + { + "epoch": 0.4168697282099344, + "grad_norm": 47940.32421875, + "learning_rate": 9.700371325845163e-05, + "loss": 2.351, + "step": 2224 + }, + { + "epoch": 0.41705716963448924, + "grad_norm": 49073.55859375, + "learning_rate": 9.700103336812416e-05, + "loss": 2.3076, + "step": 2225 + }, + { + "epoch": 0.41724461105904403, + "grad_norm": 54624.3203125, + "learning_rate": 9.699835231693211e-05, + "loss": 2.3167, + "step": 2226 + }, + { + "epoch": 0.4174320524835989, + "grad_norm": 47975.48828125, + "learning_rate": 9.699567010494166e-05, + "loss": 2.3539, + "step": 2227 + }, + { + "epoch": 0.4176194939081537, + "grad_norm": 52357.44140625, + "learning_rate": 9.699298673221912e-05, + "loss": 2.2917, + "step": 2228 + }, + { + "epoch": 0.4178069353327085, + "grad_norm": 51209.58984375, + "learning_rate": 9.699030219883072e-05, + "loss": 2.2689, + "step": 2229 + }, + { + "epoch": 0.41799437675726336, + "grad_norm": 49909.37890625, + "learning_rate": 9.698761650484278e-05, + "loss": 2.3799, + "step": 2230 + }, + { + "epoch": 0.41818181818181815, + "grad_norm": 54104.15234375, + "learning_rate": 9.698492965032163e-05, + "loss": 2.2866, + "step": 2231 + }, + { + "epoch": 0.418369259606373, + "grad_norm": 54780.42578125, + "learning_rate": 9.698224163533365e-05, + "loss": 2.2957, + "step": 2232 + }, + { + "epoch": 0.41855670103092785, + "grad_norm": 52766.8359375, + "learning_rate": 9.697955245994519e-05, + "loss": 2.3413, + "step": 2233 + }, + { + "epoch": 0.41874414245548264, + "grad_norm": 48745.14453125, + "learning_rate": 9.697686212422271e-05, + "loss": 2.3243, + "step": 2234 + }, + { + "epoch": 0.4189315838800375, + "grad_norm": 50657.5859375, + "learning_rate": 9.697417062823264e-05, + "loss": 2.3544, + "step": 2235 + }, + { + "epoch": 0.41911902530459233, + "grad_norm": 54020.32421875, + "learning_rate": 9.697147797204147e-05, + "loss": 2.3188, + "step": 2236 + }, + { + "epoch": 0.4193064667291471, + "grad_norm": 54880.7421875, + "learning_rate": 9.696878415571567e-05, + "loss": 2.3698, + "step": 2237 + }, + { + "epoch": 0.41949390815370197, + "grad_norm": 56855.86328125, + "learning_rate": 9.696608917932181e-05, + "loss": 2.3975, + "step": 2238 + }, + { + "epoch": 0.4196813495782568, + "grad_norm": 49340.77734375, + "learning_rate": 9.696339304292644e-05, + "loss": 2.3343, + "step": 2239 + }, + { + "epoch": 0.4198687910028116, + "grad_norm": 49810.8125, + "learning_rate": 9.696069574659615e-05, + "loss": 2.2886, + "step": 2240 + }, + { + "epoch": 0.42005623242736645, + "grad_norm": 48708.98828125, + "learning_rate": 9.695799729039756e-05, + "loss": 2.3081, + "step": 2241 + }, + { + "epoch": 0.4202436738519213, + "grad_norm": 50933.74609375, + "learning_rate": 9.695529767439732e-05, + "loss": 2.3342, + "step": 2242 + }, + { + "epoch": 0.4204311152764761, + "grad_norm": 55489.7265625, + "learning_rate": 9.69525968986621e-05, + "loss": 2.3457, + "step": 2243 + }, + { + "epoch": 0.42061855670103093, + "grad_norm": 51846.6953125, + "learning_rate": 9.694989496325862e-05, + "loss": 2.404, + "step": 2244 + }, + { + "epoch": 0.4208059981255858, + "grad_norm": 51968.45703125, + "learning_rate": 9.69471918682536e-05, + "loss": 2.3386, + "step": 2245 + }, + { + "epoch": 0.42099343955014057, + "grad_norm": 53785.50390625, + "learning_rate": 9.694448761371382e-05, + "loss": 2.3012, + "step": 2246 + }, + { + "epoch": 0.4211808809746954, + "grad_norm": 48928.89453125, + "learning_rate": 9.694178219970604e-05, + "loss": 2.371, + "step": 2247 + }, + { + "epoch": 0.42136832239925026, + "grad_norm": 50806.99609375, + "learning_rate": 9.693907562629711e-05, + "loss": 2.486, + "step": 2248 + }, + { + "epoch": 0.42155576382380505, + "grad_norm": 51183.921875, + "learning_rate": 9.693636789355389e-05, + "loss": 2.3729, + "step": 2249 + }, + { + "epoch": 0.4217432052483599, + "grad_norm": 51734.83203125, + "learning_rate": 9.693365900154321e-05, + "loss": 2.3066, + "step": 2250 + }, + { + "epoch": 0.4219306466729147, + "grad_norm": 47154.04296875, + "learning_rate": 9.693094895033202e-05, + "loss": 2.3801, + "step": 2251 + }, + { + "epoch": 0.42211808809746953, + "grad_norm": 50264.4921875, + "learning_rate": 9.692823773998722e-05, + "loss": 2.3298, + "step": 2252 + }, + { + "epoch": 0.4223055295220244, + "grad_norm": 53527.45703125, + "learning_rate": 9.69255253705758e-05, + "loss": 2.3369, + "step": 2253 + }, + { + "epoch": 0.42249297094657917, + "grad_norm": 53763.46875, + "learning_rate": 9.692281184216473e-05, + "loss": 2.3247, + "step": 2254 + }, + { + "epoch": 0.422680412371134, + "grad_norm": 48296.2890625, + "learning_rate": 9.692009715482107e-05, + "loss": 2.3786, + "step": 2255 + }, + { + "epoch": 0.42286785379568886, + "grad_norm": 49892.48828125, + "learning_rate": 9.691738130861184e-05, + "loss": 2.3526, + "step": 2256 + }, + { + "epoch": 0.42305529522024365, + "grad_norm": 50383.70703125, + "learning_rate": 9.69146643036041e-05, + "loss": 2.3441, + "step": 2257 + }, + { + "epoch": 0.4232427366447985, + "grad_norm": 47735.12890625, + "learning_rate": 9.691194613986499e-05, + "loss": 2.3366, + "step": 2258 + }, + { + "epoch": 0.42343017806935335, + "grad_norm": 48627.13671875, + "learning_rate": 9.690922681746164e-05, + "loss": 2.3964, + "step": 2259 + }, + { + "epoch": 0.42361761949390814, + "grad_norm": 50745.00390625, + "learning_rate": 9.690650633646119e-05, + "loss": 2.3283, + "step": 2260 + }, + { + "epoch": 0.423805060918463, + "grad_norm": 52808.1171875, + "learning_rate": 9.690378469693086e-05, + "loss": 2.3681, + "step": 2261 + }, + { + "epoch": 0.42399250234301783, + "grad_norm": 53457.796875, + "learning_rate": 9.690106189893787e-05, + "loss": 2.3944, + "step": 2262 + }, + { + "epoch": 0.4241799437675726, + "grad_norm": 52105.35546875, + "learning_rate": 9.689833794254944e-05, + "loss": 2.3258, + "step": 2263 + }, + { + "epoch": 0.42436738519212747, + "grad_norm": 52032.62890625, + "learning_rate": 9.689561282783286e-05, + "loss": 2.3598, + "step": 2264 + }, + { + "epoch": 0.4245548266166823, + "grad_norm": 52464.38671875, + "learning_rate": 9.689288655485547e-05, + "loss": 2.3047, + "step": 2265 + }, + { + "epoch": 0.4247422680412371, + "grad_norm": 51222.61328125, + "learning_rate": 9.689015912368455e-05, + "loss": 2.3331, + "step": 2266 + }, + { + "epoch": 0.42492970946579195, + "grad_norm": 53586.55859375, + "learning_rate": 9.688743053438752e-05, + "loss": 2.3386, + "step": 2267 + }, + { + "epoch": 0.42511715089034674, + "grad_norm": 49477.30078125, + "learning_rate": 9.688470078703173e-05, + "loss": 2.3599, + "step": 2268 + }, + { + "epoch": 0.4253045923149016, + "grad_norm": 54392.39453125, + "learning_rate": 9.688196988168463e-05, + "loss": 2.2808, + "step": 2269 + }, + { + "epoch": 0.42549203373945643, + "grad_norm": 49152.109375, + "learning_rate": 9.687923781841366e-05, + "loss": 2.3806, + "step": 2270 + }, + { + "epoch": 0.4256794751640112, + "grad_norm": 51854.5390625, + "learning_rate": 9.687650459728628e-05, + "loss": 2.3692, + "step": 2271 + }, + { + "epoch": 0.42586691658856607, + "grad_norm": 56339.9296875, + "learning_rate": 9.687377021837002e-05, + "loss": 2.4429, + "step": 2272 + }, + { + "epoch": 0.4260543580131209, + "grad_norm": 52526.09765625, + "learning_rate": 9.687103468173241e-05, + "loss": 2.401, + "step": 2273 + }, + { + "epoch": 0.4262417994376757, + "grad_norm": 49030.859375, + "learning_rate": 9.686829798744102e-05, + "loss": 2.293, + "step": 2274 + }, + { + "epoch": 0.42642924086223055, + "grad_norm": 52178.65234375, + "learning_rate": 9.686556013556342e-05, + "loss": 2.2639, + "step": 2275 + }, + { + "epoch": 0.4266166822867854, + "grad_norm": 51602.99609375, + "learning_rate": 9.686282112616725e-05, + "loss": 2.4064, + "step": 2276 + }, + { + "epoch": 0.4268041237113402, + "grad_norm": 50712.94140625, + "learning_rate": 9.686008095932016e-05, + "loss": 2.398, + "step": 2277 + }, + { + "epoch": 0.42699156513589503, + "grad_norm": 51229.515625, + "learning_rate": 9.685733963508983e-05, + "loss": 2.3451, + "step": 2278 + }, + { + "epoch": 0.4271790065604499, + "grad_norm": 50380.5859375, + "learning_rate": 9.685459715354394e-05, + "loss": 2.368, + "step": 2279 + }, + { + "epoch": 0.42736644798500467, + "grad_norm": 53054.30078125, + "learning_rate": 9.685185351475028e-05, + "loss": 2.3389, + "step": 2280 + }, + { + "epoch": 0.4275538894095595, + "grad_norm": 48206.55859375, + "learning_rate": 9.684910871877657e-05, + "loss": 2.3187, + "step": 2281 + }, + { + "epoch": 0.42774133083411436, + "grad_norm": 50835.69140625, + "learning_rate": 9.684636276569062e-05, + "loss": 2.3509, + "step": 2282 + }, + { + "epoch": 0.42792877225866915, + "grad_norm": 53421.96875, + "learning_rate": 9.684361565556023e-05, + "loss": 2.3388, + "step": 2283 + }, + { + "epoch": 0.428116213683224, + "grad_norm": 52792.328125, + "learning_rate": 9.684086738845329e-05, + "loss": 2.3245, + "step": 2284 + }, + { + "epoch": 0.42830365510777885, + "grad_norm": 53740.97265625, + "learning_rate": 9.683811796443763e-05, + "loss": 2.3328, + "step": 2285 + }, + { + "epoch": 0.42849109653233364, + "grad_norm": 51407.04296875, + "learning_rate": 9.683536738358119e-05, + "loss": 2.3707, + "step": 2286 + }, + { + "epoch": 0.4286785379568885, + "grad_norm": 48725.375, + "learning_rate": 9.683261564595191e-05, + "loss": 2.2963, + "step": 2287 + }, + { + "epoch": 0.4288659793814433, + "grad_norm": 55300.73046875, + "learning_rate": 9.682986275161774e-05, + "loss": 2.3943, + "step": 2288 + }, + { + "epoch": 0.4290534208059981, + "grad_norm": 52001.9609375, + "learning_rate": 9.682710870064667e-05, + "loss": 2.2467, + "step": 2289 + }, + { + "epoch": 0.42924086223055297, + "grad_norm": 53979.2109375, + "learning_rate": 9.682435349310673e-05, + "loss": 2.4503, + "step": 2290 + }, + { + "epoch": 0.42942830365510776, + "grad_norm": 52401.3984375, + "learning_rate": 9.682159712906596e-05, + "loss": 2.2335, + "step": 2291 + }, + { + "epoch": 0.4296157450796626, + "grad_norm": 53430.09375, + "learning_rate": 9.681883960859246e-05, + "loss": 2.3991, + "step": 2292 + }, + { + "epoch": 0.42980318650421745, + "grad_norm": 54921.40625, + "learning_rate": 9.681608093175432e-05, + "loss": 2.3369, + "step": 2293 + }, + { + "epoch": 0.42999062792877224, + "grad_norm": 54137.59375, + "learning_rate": 9.681332109861967e-05, + "loss": 2.3664, + "step": 2294 + }, + { + "epoch": 0.4301780693533271, + "grad_norm": 49806.15625, + "learning_rate": 9.68105601092567e-05, + "loss": 2.2776, + "step": 2295 + }, + { + "epoch": 0.43036551077788193, + "grad_norm": 63469.17578125, + "learning_rate": 9.680779796373357e-05, + "loss": 2.3618, + "step": 2296 + }, + { + "epoch": 0.4305529522024367, + "grad_norm": 61747.1484375, + "learning_rate": 9.680503466211852e-05, + "loss": 2.4795, + "step": 2297 + }, + { + "epoch": 0.43074039362699157, + "grad_norm": 48514.53515625, + "learning_rate": 9.68022702044798e-05, + "loss": 2.3793, + "step": 2298 + }, + { + "epoch": 0.4309278350515464, + "grad_norm": 47625.0625, + "learning_rate": 9.67995045908857e-05, + "loss": 2.3931, + "step": 2299 + }, + { + "epoch": 0.4311152764761012, + "grad_norm": 56025.65625, + "learning_rate": 9.679673782140448e-05, + "loss": 2.3276, + "step": 2300 + }, + { + "epoch": 0.43130271790065605, + "grad_norm": 52583.484375, + "learning_rate": 9.679396989610454e-05, + "loss": 2.4067, + "step": 2301 + }, + { + "epoch": 0.4314901593252109, + "grad_norm": 52530.83984375, + "learning_rate": 9.679120081505419e-05, + "loss": 2.3174, + "step": 2302 + }, + { + "epoch": 0.4316776007497657, + "grad_norm": 52025.4375, + "learning_rate": 9.678843057832187e-05, + "loss": 2.2896, + "step": 2303 + }, + { + "epoch": 0.43186504217432053, + "grad_norm": 48935.10546875, + "learning_rate": 9.678565918597595e-05, + "loss": 2.2575, + "step": 2304 + }, + { + "epoch": 0.4320524835988753, + "grad_norm": 57517.5546875, + "learning_rate": 9.678288663808493e-05, + "loss": 2.3088, + "step": 2305 + }, + { + "epoch": 0.43223992502343017, + "grad_norm": 53535.62890625, + "learning_rate": 9.678011293471725e-05, + "loss": 2.3155, + "step": 2306 + }, + { + "epoch": 0.432427366447985, + "grad_norm": 53066.640625, + "learning_rate": 9.677733807594145e-05, + "loss": 2.3106, + "step": 2307 + }, + { + "epoch": 0.4326148078725398, + "grad_norm": 50025.73046875, + "learning_rate": 9.677456206182603e-05, + "loss": 2.3534, + "step": 2308 + }, + { + "epoch": 0.43280224929709465, + "grad_norm": 48770.078125, + "learning_rate": 9.677178489243956e-05, + "loss": 2.3339, + "step": 2309 + }, + { + "epoch": 0.4329896907216495, + "grad_norm": 48277.46484375, + "learning_rate": 9.676900656785066e-05, + "loss": 2.3613, + "step": 2310 + }, + { + "epoch": 0.4331771321462043, + "grad_norm": 60796.0, + "learning_rate": 9.676622708812795e-05, + "loss": 2.4018, + "step": 2311 + }, + { + "epoch": 0.43336457357075914, + "grad_norm": 55021.9453125, + "learning_rate": 9.676344645334003e-05, + "loss": 2.3886, + "step": 2312 + }, + { + "epoch": 0.433552014995314, + "grad_norm": 54175.79296875, + "learning_rate": 9.676066466355564e-05, + "loss": 2.3722, + "step": 2313 + }, + { + "epoch": 0.4337394564198688, + "grad_norm": 51774.15234375, + "learning_rate": 9.675788171884345e-05, + "loss": 2.3803, + "step": 2314 + }, + { + "epoch": 0.4339268978444236, + "grad_norm": 51512.53125, + "learning_rate": 9.675509761927221e-05, + "loss": 2.3412, + "step": 2315 + }, + { + "epoch": 0.43411433926897847, + "grad_norm": 53606.38671875, + "learning_rate": 9.675231236491067e-05, + "loss": 2.2832, + "step": 2316 + }, + { + "epoch": 0.43430178069353326, + "grad_norm": 50419.484375, + "learning_rate": 9.674952595582763e-05, + "loss": 2.3535, + "step": 2317 + }, + { + "epoch": 0.4344892221180881, + "grad_norm": 50548.67578125, + "learning_rate": 9.674673839209191e-05, + "loss": 2.3688, + "step": 2318 + }, + { + "epoch": 0.43467666354264295, + "grad_norm": 53361.5, + "learning_rate": 9.674394967377237e-05, + "loss": 2.4074, + "step": 2319 + }, + { + "epoch": 0.43486410496719774, + "grad_norm": 49975.84375, + "learning_rate": 9.674115980093785e-05, + "loss": 2.3652, + "step": 2320 + }, + { + "epoch": 0.4350515463917526, + "grad_norm": 56262.91796875, + "learning_rate": 9.67383687736573e-05, + "loss": 2.2906, + "step": 2321 + }, + { + "epoch": 0.4352389878163074, + "grad_norm": 48573.37890625, + "learning_rate": 9.673557659199965e-05, + "loss": 2.3801, + "step": 2322 + }, + { + "epoch": 0.4354264292408622, + "grad_norm": 51148.3671875, + "learning_rate": 9.673278325603384e-05, + "loss": 2.3621, + "step": 2323 + }, + { + "epoch": 0.43561387066541707, + "grad_norm": 51193.3828125, + "learning_rate": 9.672998876582889e-05, + "loss": 2.2838, + "step": 2324 + }, + { + "epoch": 0.43580131208997186, + "grad_norm": 51557.5078125, + "learning_rate": 9.672719312145378e-05, + "loss": 2.2945, + "step": 2325 + }, + { + "epoch": 0.4359887535145267, + "grad_norm": 54752.0234375, + "learning_rate": 9.672439632297759e-05, + "loss": 2.3682, + "step": 2326 + }, + { + "epoch": 0.43617619493908155, + "grad_norm": 55091.16796875, + "learning_rate": 9.67215983704694e-05, + "loss": 2.4345, + "step": 2327 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 50286.3671875, + "learning_rate": 9.671879926399829e-05, + "loss": 2.3438, + "step": 2328 + }, + { + "epoch": 0.4365510777881912, + "grad_norm": 52755.73046875, + "learning_rate": 9.671599900363342e-05, + "loss": 2.5102, + "step": 2329 + }, + { + "epoch": 0.43673851921274603, + "grad_norm": 55062.87109375, + "learning_rate": 9.671319758944393e-05, + "loss": 2.3261, + "step": 2330 + }, + { + "epoch": 0.4369259606373008, + "grad_norm": 55171.41796875, + "learning_rate": 9.671039502149905e-05, + "loss": 2.3624, + "step": 2331 + }, + { + "epoch": 0.43711340206185567, + "grad_norm": 54939.65625, + "learning_rate": 9.670759129986795e-05, + "loss": 2.3109, + "step": 2332 + }, + { + "epoch": 0.4373008434864105, + "grad_norm": 51943.66015625, + "learning_rate": 9.670478642461991e-05, + "loss": 2.3799, + "step": 2333 + }, + { + "epoch": 0.4374882849109653, + "grad_norm": 47763.12109375, + "learning_rate": 9.670198039582421e-05, + "loss": 2.3738, + "step": 2334 + }, + { + "epoch": 0.43767572633552015, + "grad_norm": 47017.4375, + "learning_rate": 9.669917321355013e-05, + "loss": 2.3889, + "step": 2335 + }, + { + "epoch": 0.437863167760075, + "grad_norm": 48189.921875, + "learning_rate": 9.669636487786701e-05, + "loss": 2.3528, + "step": 2336 + }, + { + "epoch": 0.4380506091846298, + "grad_norm": 54478.3046875, + "learning_rate": 9.669355538884424e-05, + "loss": 2.311, + "step": 2337 + }, + { + "epoch": 0.43823805060918464, + "grad_norm": 47505.56640625, + "learning_rate": 9.669074474655117e-05, + "loss": 2.3114, + "step": 2338 + }, + { + "epoch": 0.4384254920337395, + "grad_norm": 50129.23046875, + "learning_rate": 9.668793295105727e-05, + "loss": 2.3041, + "step": 2339 + }, + { + "epoch": 0.4386129334582943, + "grad_norm": 53162.15625, + "learning_rate": 9.668512000243194e-05, + "loss": 2.306, + "step": 2340 + }, + { + "epoch": 0.4388003748828491, + "grad_norm": 49076.94140625, + "learning_rate": 9.668230590074467e-05, + "loss": 2.381, + "step": 2341 + }, + { + "epoch": 0.4389878163074039, + "grad_norm": 49766.0625, + "learning_rate": 9.667949064606498e-05, + "loss": 2.2892, + "step": 2342 + }, + { + "epoch": 0.43917525773195876, + "grad_norm": 54444.4765625, + "learning_rate": 9.667667423846238e-05, + "loss": 2.3479, + "step": 2343 + }, + { + "epoch": 0.4393626991565136, + "grad_norm": 49470.02734375, + "learning_rate": 9.667385667800646e-05, + "loss": 2.3226, + "step": 2344 + }, + { + "epoch": 0.4395501405810684, + "grad_norm": 49234.32421875, + "learning_rate": 9.667103796476677e-05, + "loss": 2.3686, + "step": 2345 + }, + { + "epoch": 0.43973758200562324, + "grad_norm": 53412.546875, + "learning_rate": 9.666821809881297e-05, + "loss": 2.433, + "step": 2346 + }, + { + "epoch": 0.4399250234301781, + "grad_norm": 52204.875, + "learning_rate": 9.66653970802147e-05, + "loss": 2.3971, + "step": 2347 + }, + { + "epoch": 0.4401124648547329, + "grad_norm": 50102.21484375, + "learning_rate": 9.66625749090416e-05, + "loss": 2.3391, + "step": 2348 + }, + { + "epoch": 0.4402999062792877, + "grad_norm": 49774.90234375, + "learning_rate": 9.66597515853634e-05, + "loss": 2.3096, + "step": 2349 + }, + { + "epoch": 0.44048734770384257, + "grad_norm": 51254.01171875, + "learning_rate": 9.665692710924985e-05, + "loss": 2.297, + "step": 2350 + }, + { + "epoch": 0.44067478912839736, + "grad_norm": 48125.5546875, + "learning_rate": 9.665410148077067e-05, + "loss": 2.3377, + "step": 2351 + }, + { + "epoch": 0.4408622305529522, + "grad_norm": 51460.56640625, + "learning_rate": 9.665127469999568e-05, + "loss": 2.3793, + "step": 2352 + }, + { + "epoch": 0.44104967197750705, + "grad_norm": 51344.91796875, + "learning_rate": 9.664844676699471e-05, + "loss": 2.3354, + "step": 2353 + }, + { + "epoch": 0.44123711340206184, + "grad_norm": 52965.953125, + "learning_rate": 9.664561768183756e-05, + "loss": 2.4526, + "step": 2354 + }, + { + "epoch": 0.4414245548266167, + "grad_norm": 48837.1015625, + "learning_rate": 9.664278744459414e-05, + "loss": 2.4052, + "step": 2355 + }, + { + "epoch": 0.44161199625117153, + "grad_norm": 51792.76953125, + "learning_rate": 9.663995605533434e-05, + "loss": 2.4136, + "step": 2356 + }, + { + "epoch": 0.4417994376757263, + "grad_norm": 52327.21875, + "learning_rate": 9.663712351412809e-05, + "loss": 2.438, + "step": 2357 + }, + { + "epoch": 0.44198687910028117, + "grad_norm": 52243.80078125, + "learning_rate": 9.663428982104536e-05, + "loss": 2.3298, + "step": 2358 + }, + { + "epoch": 0.44217432052483596, + "grad_norm": 59351.4140625, + "learning_rate": 9.663145497615613e-05, + "loss": 2.3329, + "step": 2359 + }, + { + "epoch": 0.4423617619493908, + "grad_norm": 53091.35546875, + "learning_rate": 9.662861897953044e-05, + "loss": 2.436, + "step": 2360 + }, + { + "epoch": 0.44254920337394565, + "grad_norm": 54022.5546875, + "learning_rate": 9.662578183123829e-05, + "loss": 2.3449, + "step": 2361 + }, + { + "epoch": 0.44273664479850044, + "grad_norm": 53544.42578125, + "learning_rate": 9.66229435313498e-05, + "loss": 2.2811, + "step": 2362 + }, + { + "epoch": 0.4429240862230553, + "grad_norm": 47856.98046875, + "learning_rate": 9.662010407993505e-05, + "loss": 2.3724, + "step": 2363 + }, + { + "epoch": 0.44311152764761014, + "grad_norm": 49486.11328125, + "learning_rate": 9.661726347706415e-05, + "loss": 2.3678, + "step": 2364 + }, + { + "epoch": 0.44329896907216493, + "grad_norm": 51393.2109375, + "learning_rate": 9.66144217228073e-05, + "loss": 2.3071, + "step": 2365 + }, + { + "epoch": 0.4434864104967198, + "grad_norm": 53346.375, + "learning_rate": 9.661157881723469e-05, + "loss": 2.2989, + "step": 2366 + }, + { + "epoch": 0.4436738519212746, + "grad_norm": 46400.71875, + "learning_rate": 9.660873476041647e-05, + "loss": 2.3682, + "step": 2367 + }, + { + "epoch": 0.4438612933458294, + "grad_norm": 51337.0078125, + "learning_rate": 9.660588955242296e-05, + "loss": 2.3646, + "step": 2368 + }, + { + "epoch": 0.44404873477038426, + "grad_norm": 49932.12890625, + "learning_rate": 9.66030431933244e-05, + "loss": 2.3298, + "step": 2369 + }, + { + "epoch": 0.4442361761949391, + "grad_norm": 55140.15234375, + "learning_rate": 9.66001956831911e-05, + "loss": 2.2992, + "step": 2370 + }, + { + "epoch": 0.4444236176194939, + "grad_norm": 50369.49609375, + "learning_rate": 9.659734702209338e-05, + "loss": 2.3426, + "step": 2371 + }, + { + "epoch": 0.44461105904404874, + "grad_norm": 49423.4375, + "learning_rate": 9.659449721010161e-05, + "loss": 2.3471, + "step": 2372 + }, + { + "epoch": 0.4447985004686036, + "grad_norm": 51259.41015625, + "learning_rate": 9.659164624728616e-05, + "loss": 2.2923, + "step": 2373 + }, + { + "epoch": 0.4449859418931584, + "grad_norm": 56445.8203125, + "learning_rate": 9.658879413371744e-05, + "loss": 2.3468, + "step": 2374 + }, + { + "epoch": 0.4451733833177132, + "grad_norm": 49777.296875, + "learning_rate": 9.658594086946593e-05, + "loss": 2.355, + "step": 2375 + }, + { + "epoch": 0.44536082474226807, + "grad_norm": 45403.28515625, + "learning_rate": 9.658308645460208e-05, + "loss": 2.3627, + "step": 2376 + }, + { + "epoch": 0.44554826616682286, + "grad_norm": 49242.73046875, + "learning_rate": 9.65802308891964e-05, + "loss": 2.3772, + "step": 2377 + }, + { + "epoch": 0.4457357075913777, + "grad_norm": 44352.921875, + "learning_rate": 9.657737417331938e-05, + "loss": 2.3268, + "step": 2378 + }, + { + "epoch": 0.4459231490159325, + "grad_norm": 49310.78125, + "learning_rate": 9.657451630704162e-05, + "loss": 2.2575, + "step": 2379 + }, + { + "epoch": 0.44611059044048734, + "grad_norm": 51711.75, + "learning_rate": 9.657165729043369e-05, + "loss": 2.3374, + "step": 2380 + }, + { + "epoch": 0.4462980318650422, + "grad_norm": 54922.6875, + "learning_rate": 9.656879712356621e-05, + "loss": 2.3332, + "step": 2381 + }, + { + "epoch": 0.446485473289597, + "grad_norm": 54180.14453125, + "learning_rate": 9.65659358065098e-05, + "loss": 2.2989, + "step": 2382 + }, + { + "epoch": 0.4466729147141518, + "grad_norm": 48083.171875, + "learning_rate": 9.656307333933518e-05, + "loss": 2.3004, + "step": 2383 + }, + { + "epoch": 0.44686035613870667, + "grad_norm": 53785.3828125, + "learning_rate": 9.6560209722113e-05, + "loss": 2.3641, + "step": 2384 + }, + { + "epoch": 0.44704779756326146, + "grad_norm": 51501.31640625, + "learning_rate": 9.655734495491402e-05, + "loss": 2.3433, + "step": 2385 + }, + { + "epoch": 0.4472352389878163, + "grad_norm": 51770.29296875, + "learning_rate": 9.655447903780897e-05, + "loss": 2.3647, + "step": 2386 + }, + { + "epoch": 0.44742268041237115, + "grad_norm": 51119.5, + "learning_rate": 9.655161197086864e-05, + "loss": 2.401, + "step": 2387 + }, + { + "epoch": 0.44761012183692594, + "grad_norm": 56935.8359375, + "learning_rate": 9.654874375416386e-05, + "loss": 2.3134, + "step": 2388 + }, + { + "epoch": 0.4477975632614808, + "grad_norm": 51631.5234375, + "learning_rate": 9.654587438776546e-05, + "loss": 2.3431, + "step": 2389 + }, + { + "epoch": 0.44798500468603564, + "grad_norm": 52675.640625, + "learning_rate": 9.65430038717443e-05, + "loss": 2.3783, + "step": 2390 + }, + { + "epoch": 0.44817244611059043, + "grad_norm": 49265.11328125, + "learning_rate": 9.654013220617131e-05, + "loss": 2.3703, + "step": 2391 + }, + { + "epoch": 0.4483598875351453, + "grad_norm": 58788.97265625, + "learning_rate": 9.653725939111738e-05, + "loss": 2.4121, + "step": 2392 + }, + { + "epoch": 0.4485473289597001, + "grad_norm": 46378.31640625, + "learning_rate": 9.653438542665348e-05, + "loss": 2.3305, + "step": 2393 + }, + { + "epoch": 0.4487347703842549, + "grad_norm": 52618.21875, + "learning_rate": 9.65315103128506e-05, + "loss": 2.3881, + "step": 2394 + }, + { + "epoch": 0.44892221180880976, + "grad_norm": 51599.625, + "learning_rate": 9.652863404977973e-05, + "loss": 2.3163, + "step": 2395 + }, + { + "epoch": 0.44910965323336455, + "grad_norm": 49088.03515625, + "learning_rate": 9.652575663751194e-05, + "loss": 2.3172, + "step": 2396 + }, + { + "epoch": 0.4492970946579194, + "grad_norm": 56824.3359375, + "learning_rate": 9.652287807611827e-05, + "loss": 2.3618, + "step": 2397 + }, + { + "epoch": 0.44948453608247424, + "grad_norm": 49576.609375, + "learning_rate": 9.651999836566984e-05, + "loss": 2.3155, + "step": 2398 + }, + { + "epoch": 0.44967197750702903, + "grad_norm": 49532.109375, + "learning_rate": 9.651711750623776e-05, + "loss": 2.3195, + "step": 2399 + }, + { + "epoch": 0.4498594189315839, + "grad_norm": 51045.5234375, + "learning_rate": 9.651423549789318e-05, + "loss": 2.3335, + "step": 2400 + }, + { + "epoch": 0.4500468603561387, + "grad_norm": 48624.33984375, + "learning_rate": 9.65113523407073e-05, + "loss": 2.3699, + "step": 2401 + }, + { + "epoch": 0.4502343017806935, + "grad_norm": 49170.58984375, + "learning_rate": 9.650846803475131e-05, + "loss": 2.3804, + "step": 2402 + }, + { + "epoch": 0.45042174320524836, + "grad_norm": 52511.42578125, + "learning_rate": 9.650558258009648e-05, + "loss": 2.4246, + "step": 2403 + }, + { + "epoch": 0.4506091846298032, + "grad_norm": 51150.609375, + "learning_rate": 9.650269597681403e-05, + "loss": 2.3307, + "step": 2404 + }, + { + "epoch": 0.450796626054358, + "grad_norm": 49835.453125, + "learning_rate": 9.64998082249753e-05, + "loss": 2.3986, + "step": 2405 + }, + { + "epoch": 0.45098406747891284, + "grad_norm": 49902.82421875, + "learning_rate": 9.649691932465158e-05, + "loss": 2.4462, + "step": 2406 + }, + { + "epoch": 0.4511715089034677, + "grad_norm": 52969.26953125, + "learning_rate": 9.649402927591424e-05, + "loss": 2.3708, + "step": 2407 + }, + { + "epoch": 0.4513589503280225, + "grad_norm": 50541.3828125, + "learning_rate": 9.649113807883467e-05, + "loss": 2.3543, + "step": 2408 + }, + { + "epoch": 0.4515463917525773, + "grad_norm": 54190.67578125, + "learning_rate": 9.648824573348425e-05, + "loss": 2.3035, + "step": 2409 + }, + { + "epoch": 0.45173383317713217, + "grad_norm": 51039.015625, + "learning_rate": 9.648535223993444e-05, + "loss": 2.4305, + "step": 2410 + }, + { + "epoch": 0.45192127460168696, + "grad_norm": 59315.06640625, + "learning_rate": 9.64824575982567e-05, + "loss": 2.4365, + "step": 2411 + }, + { + "epoch": 0.4521087160262418, + "grad_norm": 48378.48828125, + "learning_rate": 9.647956180852254e-05, + "loss": 2.3514, + "step": 2412 + }, + { + "epoch": 0.4522961574507966, + "grad_norm": 50353.171875, + "learning_rate": 9.647666487080344e-05, + "loss": 2.3315, + "step": 2413 + }, + { + "epoch": 0.45248359887535144, + "grad_norm": 55765.4609375, + "learning_rate": 9.647376678517097e-05, + "loss": 2.2758, + "step": 2414 + }, + { + "epoch": 0.4526710402999063, + "grad_norm": 52678.8125, + "learning_rate": 9.647086755169673e-05, + "loss": 2.3286, + "step": 2415 + }, + { + "epoch": 0.4528584817244611, + "grad_norm": 55972.3515625, + "learning_rate": 9.646796717045231e-05, + "loss": 2.3185, + "step": 2416 + }, + { + "epoch": 0.4530459231490159, + "grad_norm": 54613.06640625, + "learning_rate": 9.646506564150935e-05, + "loss": 2.3319, + "step": 2417 + }, + { + "epoch": 0.4532333645735708, + "grad_norm": 49717.91015625, + "learning_rate": 9.64621629649395e-05, + "loss": 2.3393, + "step": 2418 + }, + { + "epoch": 0.45342080599812556, + "grad_norm": 49480.35546875, + "learning_rate": 9.645925914081449e-05, + "loss": 2.2427, + "step": 2419 + }, + { + "epoch": 0.4536082474226804, + "grad_norm": 51117.3046875, + "learning_rate": 9.645635416920599e-05, + "loss": 2.3519, + "step": 2420 + }, + { + "epoch": 0.45379568884723526, + "grad_norm": 51686.94921875, + "learning_rate": 9.645344805018577e-05, + "loss": 2.3405, + "step": 2421 + }, + { + "epoch": 0.45398313027179005, + "grad_norm": 55681.72265625, + "learning_rate": 9.645054078382562e-05, + "loss": 2.2914, + "step": 2422 + }, + { + "epoch": 0.4541705716963449, + "grad_norm": 53040.32421875, + "learning_rate": 9.644763237019734e-05, + "loss": 2.5913, + "step": 2423 + }, + { + "epoch": 0.45435801312089974, + "grad_norm": 51031.3046875, + "learning_rate": 9.644472280937275e-05, + "loss": 2.4089, + "step": 2424 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 51606.2265625, + "learning_rate": 9.644181210142374e-05, + "loss": 2.3916, + "step": 2425 + }, + { + "epoch": 0.4547328959700094, + "grad_norm": 52945.69921875, + "learning_rate": 9.643890024642217e-05, + "loss": 2.3606, + "step": 2426 + }, + { + "epoch": 0.4549203373945642, + "grad_norm": 49881.67578125, + "learning_rate": 9.643598724443999e-05, + "loss": 2.2962, + "step": 2427 + }, + { + "epoch": 0.455107778819119, + "grad_norm": 52457.54296875, + "learning_rate": 9.643307309554912e-05, + "loss": 2.2874, + "step": 2428 + }, + { + "epoch": 0.45529522024367386, + "grad_norm": 51788.82421875, + "learning_rate": 9.643015779982155e-05, + "loss": 2.344, + "step": 2429 + }, + { + "epoch": 0.4554826616682287, + "grad_norm": 47782.11328125, + "learning_rate": 9.642724135732925e-05, + "loss": 2.3154, + "step": 2430 + }, + { + "epoch": 0.4556701030927835, + "grad_norm": 53762.00390625, + "learning_rate": 9.642432376814432e-05, + "loss": 2.2798, + "step": 2431 + }, + { + "epoch": 0.45585754451733834, + "grad_norm": 53723.96484375, + "learning_rate": 9.642140503233877e-05, + "loss": 2.2359, + "step": 2432 + }, + { + "epoch": 0.45604498594189313, + "grad_norm": 48825.6328125, + "learning_rate": 9.64184851499847e-05, + "loss": 2.2938, + "step": 2433 + }, + { + "epoch": 0.456232427366448, + "grad_norm": 52162.6640625, + "learning_rate": 9.641556412115424e-05, + "loss": 2.3893, + "step": 2434 + }, + { + "epoch": 0.4564198687910028, + "grad_norm": 53263.8203125, + "learning_rate": 9.64126419459195e-05, + "loss": 2.4036, + "step": 2435 + }, + { + "epoch": 0.4566073102155576, + "grad_norm": 50472.796875, + "learning_rate": 9.640971862435271e-05, + "loss": 2.3615, + "step": 2436 + }, + { + "epoch": 0.45679475164011246, + "grad_norm": 52916.01953125, + "learning_rate": 9.640679415652602e-05, + "loss": 2.3808, + "step": 2437 + }, + { + "epoch": 0.4569821930646673, + "grad_norm": 51377.8671875, + "learning_rate": 9.64038685425117e-05, + "loss": 2.3165, + "step": 2438 + }, + { + "epoch": 0.4571696344892221, + "grad_norm": 53016.171875, + "learning_rate": 9.640094178238196e-05, + "loss": 2.3099, + "step": 2439 + }, + { + "epoch": 0.45735707591377694, + "grad_norm": 51529.62109375, + "learning_rate": 9.639801387620914e-05, + "loss": 2.3877, + "step": 2440 + }, + { + "epoch": 0.4575445173383318, + "grad_norm": 47794.484375, + "learning_rate": 9.639508482406552e-05, + "loss": 2.3035, + "step": 2441 + }, + { + "epoch": 0.4577319587628866, + "grad_norm": 55254.859375, + "learning_rate": 9.639215462602347e-05, + "loss": 2.3196, + "step": 2442 + }, + { + "epoch": 0.4579194001874414, + "grad_norm": 55322.91796875, + "learning_rate": 9.638922328215534e-05, + "loss": 2.2648, + "step": 2443 + }, + { + "epoch": 0.4581068416119963, + "grad_norm": 51165.578125, + "learning_rate": 9.638629079253353e-05, + "loss": 2.3342, + "step": 2444 + }, + { + "epoch": 0.45829428303655106, + "grad_norm": 48752.41796875, + "learning_rate": 9.63833571572305e-05, + "loss": 2.3876, + "step": 2445 + }, + { + "epoch": 0.4584817244611059, + "grad_norm": 50143.26171875, + "learning_rate": 9.638042237631868e-05, + "loss": 2.3284, + "step": 2446 + }, + { + "epoch": 0.45866916588566076, + "grad_norm": 51260.5703125, + "learning_rate": 9.637748644987057e-05, + "loss": 2.356, + "step": 2447 + }, + { + "epoch": 0.45885660731021555, + "grad_norm": 51102.9765625, + "learning_rate": 9.637454937795866e-05, + "loss": 2.2832, + "step": 2448 + }, + { + "epoch": 0.4590440487347704, + "grad_norm": 52984.18359375, + "learning_rate": 9.637161116065549e-05, + "loss": 2.3308, + "step": 2449 + }, + { + "epoch": 0.4592314901593252, + "grad_norm": 49483.8203125, + "learning_rate": 9.636867179803369e-05, + "loss": 2.3264, + "step": 2450 + }, + { + "epoch": 0.45941893158388003, + "grad_norm": 56108.296875, + "learning_rate": 9.636573129016577e-05, + "loss": 2.4036, + "step": 2451 + }, + { + "epoch": 0.4596063730084349, + "grad_norm": 49519.640625, + "learning_rate": 9.636278963712442e-05, + "loss": 2.3171, + "step": 2452 + }, + { + "epoch": 0.45979381443298967, + "grad_norm": 56978.078125, + "learning_rate": 9.635984683898227e-05, + "loss": 2.332, + "step": 2453 + }, + { + "epoch": 0.4599812558575445, + "grad_norm": 50164.8828125, + "learning_rate": 9.635690289581203e-05, + "loss": 2.3045, + "step": 2454 + }, + { + "epoch": 0.46016869728209936, + "grad_norm": 53565.1640625, + "learning_rate": 9.635395780768637e-05, + "loss": 2.3466, + "step": 2455 + }, + { + "epoch": 0.46035613870665415, + "grad_norm": 51984.1015625, + "learning_rate": 9.635101157467805e-05, + "loss": 2.3514, + "step": 2456 + }, + { + "epoch": 0.460543580131209, + "grad_norm": 48078.09765625, + "learning_rate": 9.634806419685984e-05, + "loss": 2.3471, + "step": 2457 + }, + { + "epoch": 0.46073102155576384, + "grad_norm": 48884.57421875, + "learning_rate": 9.634511567430455e-05, + "loss": 2.3471, + "step": 2458 + }, + { + "epoch": 0.46091846298031863, + "grad_norm": 52139.5625, + "learning_rate": 9.634216600708498e-05, + "loss": 2.4055, + "step": 2459 + }, + { + "epoch": 0.4611059044048735, + "grad_norm": 50789.39453125, + "learning_rate": 9.633921519527401e-05, + "loss": 2.346, + "step": 2460 + }, + { + "epoch": 0.4612933458294283, + "grad_norm": 50974.9609375, + "learning_rate": 9.633626323894448e-05, + "loss": 2.289, + "step": 2461 + }, + { + "epoch": 0.4614807872539831, + "grad_norm": 49733.07421875, + "learning_rate": 9.633331013816932e-05, + "loss": 2.3941, + "step": 2462 + }, + { + "epoch": 0.46166822867853796, + "grad_norm": 55467.66796875, + "learning_rate": 9.63303558930215e-05, + "loss": 2.3229, + "step": 2463 + }, + { + "epoch": 0.4618556701030928, + "grad_norm": 54027.46484375, + "learning_rate": 9.632740050357394e-05, + "loss": 2.3686, + "step": 2464 + }, + { + "epoch": 0.4620431115276476, + "grad_norm": 53052.43359375, + "learning_rate": 9.632444396989966e-05, + "loss": 2.2468, + "step": 2465 + }, + { + "epoch": 0.46223055295220244, + "grad_norm": 50674.5625, + "learning_rate": 9.632148629207166e-05, + "loss": 2.369, + "step": 2466 + }, + { + "epoch": 0.4624179943767573, + "grad_norm": 52008.3125, + "learning_rate": 9.631852747016302e-05, + "loss": 2.393, + "step": 2467 + }, + { + "epoch": 0.4626054358013121, + "grad_norm": 48734.9609375, + "learning_rate": 9.631556750424679e-05, + "loss": 2.2661, + "step": 2468 + }, + { + "epoch": 0.4627928772258669, + "grad_norm": 52436.296875, + "learning_rate": 9.63126063943961e-05, + "loss": 2.3026, + "step": 2469 + }, + { + "epoch": 0.4629803186504217, + "grad_norm": 53892.05078125, + "learning_rate": 9.630964414068409e-05, + "loss": 2.3834, + "step": 2470 + }, + { + "epoch": 0.46316776007497656, + "grad_norm": 72809.0546875, + "learning_rate": 9.630668074318389e-05, + "loss": 2.5758, + "step": 2471 + }, + { + "epoch": 0.4633552014995314, + "grad_norm": 53961.98828125, + "learning_rate": 9.630371620196872e-05, + "loss": 2.27, + "step": 2472 + }, + { + "epoch": 0.4635426429240862, + "grad_norm": 46890.15234375, + "learning_rate": 9.630075051711179e-05, + "loss": 2.3811, + "step": 2473 + }, + { + "epoch": 0.46373008434864105, + "grad_norm": 57003.74609375, + "learning_rate": 9.629778368868636e-05, + "loss": 2.3588, + "step": 2474 + }, + { + "epoch": 0.4639175257731959, + "grad_norm": 48612.8828125, + "learning_rate": 9.62948157167657e-05, + "loss": 2.301, + "step": 2475 + }, + { + "epoch": 0.4641049671977507, + "grad_norm": 51006.28515625, + "learning_rate": 9.629184660142311e-05, + "loss": 2.4034, + "step": 2476 + }, + { + "epoch": 0.46429240862230553, + "grad_norm": 51159.77734375, + "learning_rate": 9.628887634273191e-05, + "loss": 2.3193, + "step": 2477 + }, + { + "epoch": 0.4644798500468604, + "grad_norm": 54412.3125, + "learning_rate": 9.62859049407655e-05, + "loss": 2.2923, + "step": 2478 + }, + { + "epoch": 0.46466729147141517, + "grad_norm": 50653.2109375, + "learning_rate": 9.628293239559725e-05, + "loss": 2.3725, + "step": 2479 + }, + { + "epoch": 0.46485473289597, + "grad_norm": 48687.16015625, + "learning_rate": 9.627995870730058e-05, + "loss": 2.2971, + "step": 2480 + }, + { + "epoch": 0.46504217432052486, + "grad_norm": 55145.75, + "learning_rate": 9.627698387594892e-05, + "loss": 2.3772, + "step": 2481 + }, + { + "epoch": 0.46522961574507965, + "grad_norm": 50172.85546875, + "learning_rate": 9.627400790161576e-05, + "loss": 2.3509, + "step": 2482 + }, + { + "epoch": 0.4654170571696345, + "grad_norm": 52233.9609375, + "learning_rate": 9.62710307843746e-05, + "loss": 2.3332, + "step": 2483 + }, + { + "epoch": 0.46560449859418934, + "grad_norm": 48273.0, + "learning_rate": 9.626805252429896e-05, + "loss": 2.4216, + "step": 2484 + }, + { + "epoch": 0.46579194001874413, + "grad_norm": 53482.65625, + "learning_rate": 9.626507312146242e-05, + "loss": 2.3409, + "step": 2485 + }, + { + "epoch": 0.465979381443299, + "grad_norm": 52178.52734375, + "learning_rate": 9.626209257593857e-05, + "loss": 2.3551, + "step": 2486 + }, + { + "epoch": 0.46616682286785377, + "grad_norm": 51237.921875, + "learning_rate": 9.625911088780099e-05, + "loss": 2.3363, + "step": 2487 + }, + { + "epoch": 0.4663542642924086, + "grad_norm": 47611.265625, + "learning_rate": 9.625612805712337e-05, + "loss": 2.3908, + "step": 2488 + }, + { + "epoch": 0.46654170571696346, + "grad_norm": 48405.33984375, + "learning_rate": 9.625314408397935e-05, + "loss": 2.365, + "step": 2489 + }, + { + "epoch": 0.46672914714151825, + "grad_norm": 49593.03125, + "learning_rate": 9.625015896844264e-05, + "loss": 2.273, + "step": 2490 + }, + { + "epoch": 0.4669165885660731, + "grad_norm": 49408.84375, + "learning_rate": 9.624717271058696e-05, + "loss": 2.3637, + "step": 2491 + }, + { + "epoch": 0.46710402999062794, + "grad_norm": 51247.70703125, + "learning_rate": 9.624418531048608e-05, + "loss": 2.3689, + "step": 2492 + }, + { + "epoch": 0.46729147141518274, + "grad_norm": 50018.80859375, + "learning_rate": 9.624119676821379e-05, + "loss": 2.4073, + "step": 2493 + }, + { + "epoch": 0.4674789128397376, + "grad_norm": 55831.38671875, + "learning_rate": 9.623820708384388e-05, + "loss": 2.3956, + "step": 2494 + }, + { + "epoch": 0.4676663542642924, + "grad_norm": 49634.39453125, + "learning_rate": 9.62352162574502e-05, + "loss": 2.2968, + "step": 2495 + }, + { + "epoch": 0.4678537956888472, + "grad_norm": 65881.5703125, + "learning_rate": 9.623222428910664e-05, + "loss": 2.2925, + "step": 2496 + }, + { + "epoch": 0.46804123711340206, + "grad_norm": 47406.31640625, + "learning_rate": 9.622923117888704e-05, + "loss": 2.3462, + "step": 2497 + }, + { + "epoch": 0.4682286785379569, + "grad_norm": 48484.62890625, + "learning_rate": 9.622623692686541e-05, + "loss": 2.408, + "step": 2498 + }, + { + "epoch": 0.4684161199625117, + "grad_norm": 51272.1875, + "learning_rate": 9.622324153311566e-05, + "loss": 2.3971, + "step": 2499 + }, + { + "epoch": 0.46860356138706655, + "grad_norm": 50148.08203125, + "learning_rate": 9.622024499771174e-05, + "loss": 2.2736, + "step": 2500 + }, + { + "epoch": 0.46860356138706655, + "eval_loss": 2.3379557132720947, + "eval_runtime": 129.521, + "eval_samples_per_second": 38.982, + "eval_steps_per_second": 1.953, + "step": 2500 + }, + { + "epoch": 0.4687910028116214, + "grad_norm": 48877.68359375, + "learning_rate": 9.621724732072772e-05, + "loss": 2.2773, + "step": 2501 + }, + { + "epoch": 0.4689784442361762, + "grad_norm": 48861.53125, + "learning_rate": 9.621424850223761e-05, + "loss": 2.3368, + "step": 2502 + }, + { + "epoch": 0.46916588566073103, + "grad_norm": 50061.32421875, + "learning_rate": 9.621124854231548e-05, + "loss": 2.3216, + "step": 2503 + }, + { + "epoch": 0.4693533270852858, + "grad_norm": 52777.53515625, + "learning_rate": 9.620824744103543e-05, + "loss": 2.3474, + "step": 2504 + }, + { + "epoch": 0.46954076850984067, + "grad_norm": 50170.33984375, + "learning_rate": 9.620524519847156e-05, + "loss": 2.2771, + "step": 2505 + }, + { + "epoch": 0.4697282099343955, + "grad_norm": 53003.43359375, + "learning_rate": 9.620224181469805e-05, + "loss": 2.257, + "step": 2506 + }, + { + "epoch": 0.4699156513589503, + "grad_norm": 51221.48046875, + "learning_rate": 9.619923728978907e-05, + "loss": 2.3569, + "step": 2507 + }, + { + "epoch": 0.47010309278350515, + "grad_norm": 54776.66796875, + "learning_rate": 9.619623162381882e-05, + "loss": 2.3322, + "step": 2508 + }, + { + "epoch": 0.47029053420806, + "grad_norm": 59484.20703125, + "learning_rate": 9.619322481686154e-05, + "loss": 2.3371, + "step": 2509 + }, + { + "epoch": 0.4704779756326148, + "grad_norm": 48663.140625, + "learning_rate": 9.619021686899153e-05, + "loss": 2.404, + "step": 2510 + }, + { + "epoch": 0.47066541705716963, + "grad_norm": 53849.390625, + "learning_rate": 9.618720778028302e-05, + "loss": 2.3678, + "step": 2511 + }, + { + "epoch": 0.4708528584817245, + "grad_norm": 46674.16796875, + "learning_rate": 9.618419755081036e-05, + "loss": 2.3876, + "step": 2512 + }, + { + "epoch": 0.47104029990627927, + "grad_norm": 47218.12890625, + "learning_rate": 9.61811861806479e-05, + "loss": 2.322, + "step": 2513 + }, + { + "epoch": 0.4712277413308341, + "grad_norm": 48414.44140625, + "learning_rate": 9.617817366987003e-05, + "loss": 2.3273, + "step": 2514 + }, + { + "epoch": 0.47141518275538896, + "grad_norm": 46470.078125, + "learning_rate": 9.617516001855114e-05, + "loss": 2.3806, + "step": 2515 + }, + { + "epoch": 0.47160262417994375, + "grad_norm": 50654.109375, + "learning_rate": 9.617214522676566e-05, + "loss": 2.3341, + "step": 2516 + }, + { + "epoch": 0.4717900656044986, + "grad_norm": 52679.53125, + "learning_rate": 9.616912929458805e-05, + "loss": 2.2397, + "step": 2517 + }, + { + "epoch": 0.47197750702905344, + "grad_norm": 47829.953125, + "learning_rate": 9.61661122220928e-05, + "loss": 2.3198, + "step": 2518 + }, + { + "epoch": 0.47216494845360824, + "grad_norm": 54424.86328125, + "learning_rate": 9.616309400935445e-05, + "loss": 2.3479, + "step": 2519 + }, + { + "epoch": 0.4723523898781631, + "grad_norm": 59758.59765625, + "learning_rate": 9.616007465644752e-05, + "loss": 2.3462, + "step": 2520 + }, + { + "epoch": 0.4725398313027179, + "grad_norm": 45574.140625, + "learning_rate": 9.61570541634466e-05, + "loss": 2.3289, + "step": 2521 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 48818.515625, + "learning_rate": 9.615403253042627e-05, + "loss": 2.3641, + "step": 2522 + }, + { + "epoch": 0.47291471415182756, + "grad_norm": 57984.6328125, + "learning_rate": 9.615100975746119e-05, + "loss": 2.3445, + "step": 2523 + }, + { + "epoch": 0.47310215557638235, + "grad_norm": 49019.18359375, + "learning_rate": 9.614798584462601e-05, + "loss": 2.3424, + "step": 2524 + }, + { + "epoch": 0.4732895970009372, + "grad_norm": 56026.15234375, + "learning_rate": 9.61449607919954e-05, + "loss": 2.3229, + "step": 2525 + }, + { + "epoch": 0.47347703842549205, + "grad_norm": 51754.3203125, + "learning_rate": 9.61419345996441e-05, + "loss": 2.3552, + "step": 2526 + }, + { + "epoch": 0.47366447985004684, + "grad_norm": 51644.9453125, + "learning_rate": 9.613890726764683e-05, + "loss": 2.4159, + "step": 2527 + }, + { + "epoch": 0.4738519212746017, + "grad_norm": 50667.94140625, + "learning_rate": 9.613587879607836e-05, + "loss": 2.4233, + "step": 2528 + }, + { + "epoch": 0.47403936269915653, + "grad_norm": 53206.171875, + "learning_rate": 9.61328491850135e-05, + "loss": 2.3323, + "step": 2529 + }, + { + "epoch": 0.4742268041237113, + "grad_norm": 52028.0546875, + "learning_rate": 9.61298184345271e-05, + "loss": 2.3873, + "step": 2530 + }, + { + "epoch": 0.47441424554826617, + "grad_norm": 51806.56640625, + "learning_rate": 9.612678654469398e-05, + "loss": 2.2642, + "step": 2531 + }, + { + "epoch": 0.474601686972821, + "grad_norm": 50398.60546875, + "learning_rate": 9.612375351558902e-05, + "loss": 2.3307, + "step": 2532 + }, + { + "epoch": 0.4747891283973758, + "grad_norm": 53917.43359375, + "learning_rate": 9.612071934728718e-05, + "loss": 2.3366, + "step": 2533 + }, + { + "epoch": 0.47497656982193065, + "grad_norm": 60024.78125, + "learning_rate": 9.611768403986334e-05, + "loss": 2.4138, + "step": 2534 + }, + { + "epoch": 0.4751640112464855, + "grad_norm": 53692.17578125, + "learning_rate": 9.611464759339252e-05, + "loss": 2.369, + "step": 2535 + }, + { + "epoch": 0.4753514526710403, + "grad_norm": 51037.7890625, + "learning_rate": 9.611161000794969e-05, + "loss": 2.3268, + "step": 2536 + }, + { + "epoch": 0.47553889409559513, + "grad_norm": 50705.50390625, + "learning_rate": 9.610857128360986e-05, + "loss": 2.3105, + "step": 2537 + }, + { + "epoch": 0.47572633552015, + "grad_norm": 47636.26953125, + "learning_rate": 9.610553142044811e-05, + "loss": 2.3979, + "step": 2538 + }, + { + "epoch": 0.47591377694470477, + "grad_norm": 46196.62890625, + "learning_rate": 9.610249041853951e-05, + "loss": 2.281, + "step": 2539 + }, + { + "epoch": 0.4761012183692596, + "grad_norm": 48923.59375, + "learning_rate": 9.609944827795918e-05, + "loss": 2.2578, + "step": 2540 + }, + { + "epoch": 0.4762886597938144, + "grad_norm": 61429.29296875, + "learning_rate": 9.609640499878223e-05, + "loss": 2.3565, + "step": 2541 + }, + { + "epoch": 0.47647610121836925, + "grad_norm": 52592.4609375, + "learning_rate": 9.609336058108385e-05, + "loss": 2.3551, + "step": 2542 + }, + { + "epoch": 0.4766635426429241, + "grad_norm": 52734.3671875, + "learning_rate": 9.609031502493921e-05, + "loss": 2.3656, + "step": 2543 + }, + { + "epoch": 0.4768509840674789, + "grad_norm": 50470.57421875, + "learning_rate": 9.608726833042356e-05, + "loss": 2.3334, + "step": 2544 + }, + { + "epoch": 0.47703842549203374, + "grad_norm": 50622.24609375, + "learning_rate": 9.608422049761214e-05, + "loss": 2.3569, + "step": 2545 + }, + { + "epoch": 0.4772258669165886, + "grad_norm": 50622.3125, + "learning_rate": 9.608117152658022e-05, + "loss": 2.3567, + "step": 2546 + }, + { + "epoch": 0.47741330834114337, + "grad_norm": 54097.44140625, + "learning_rate": 9.607812141740309e-05, + "loss": 2.3899, + "step": 2547 + }, + { + "epoch": 0.4776007497656982, + "grad_norm": 54964.27734375, + "learning_rate": 9.607507017015613e-05, + "loss": 2.3122, + "step": 2548 + }, + { + "epoch": 0.47778819119025306, + "grad_norm": 51858.4765625, + "learning_rate": 9.607201778491465e-05, + "loss": 2.2569, + "step": 2549 + }, + { + "epoch": 0.47797563261480785, + "grad_norm": 56966.90234375, + "learning_rate": 9.606896426175408e-05, + "loss": 2.3446, + "step": 2550 + }, + { + "epoch": 0.4781630740393627, + "grad_norm": 52197.6875, + "learning_rate": 9.606590960074982e-05, + "loss": 2.3053, + "step": 2551 + }, + { + "epoch": 0.47835051546391755, + "grad_norm": 53755.9765625, + "learning_rate": 9.606285380197731e-05, + "loss": 2.3195, + "step": 2552 + }, + { + "epoch": 0.47853795688847234, + "grad_norm": 51831.953125, + "learning_rate": 9.605979686551204e-05, + "loss": 2.3331, + "step": 2553 + }, + { + "epoch": 0.4787253983130272, + "grad_norm": 51428.98828125, + "learning_rate": 9.605673879142951e-05, + "loss": 2.314, + "step": 2554 + }, + { + "epoch": 0.47891283973758203, + "grad_norm": 53848.55859375, + "learning_rate": 9.605367957980523e-05, + "loss": 2.375, + "step": 2555 + }, + { + "epoch": 0.4791002811621368, + "grad_norm": 55850.85546875, + "learning_rate": 9.605061923071479e-05, + "loss": 2.3595, + "step": 2556 + }, + { + "epoch": 0.47928772258669167, + "grad_norm": 48275.15625, + "learning_rate": 9.604755774423376e-05, + "loss": 2.4273, + "step": 2557 + }, + { + "epoch": 0.4794751640112465, + "grad_norm": 54667.484375, + "learning_rate": 9.604449512043774e-05, + "loss": 2.3113, + "step": 2558 + }, + { + "epoch": 0.4796626054358013, + "grad_norm": 51163.37109375, + "learning_rate": 9.604143135940239e-05, + "loss": 2.4264, + "step": 2559 + }, + { + "epoch": 0.47985004686035615, + "grad_norm": 53912.51171875, + "learning_rate": 9.603836646120339e-05, + "loss": 2.275, + "step": 2560 + }, + { + "epoch": 0.48003748828491094, + "grad_norm": 54520.87109375, + "learning_rate": 9.603530042591641e-05, + "loss": 2.3162, + "step": 2561 + }, + { + "epoch": 0.4802249297094658, + "grad_norm": 52898.82421875, + "learning_rate": 9.60322332536172e-05, + "loss": 2.3785, + "step": 2562 + }, + { + "epoch": 0.48041237113402063, + "grad_norm": 58903.98828125, + "learning_rate": 9.602916494438153e-05, + "loss": 2.2785, + "step": 2563 + }, + { + "epoch": 0.4805998125585754, + "grad_norm": 49269.1015625, + "learning_rate": 9.602609549828515e-05, + "loss": 2.3573, + "step": 2564 + }, + { + "epoch": 0.48078725398313027, + "grad_norm": 52711.14453125, + "learning_rate": 9.602302491540389e-05, + "loss": 2.309, + "step": 2565 + }, + { + "epoch": 0.4809746954076851, + "grad_norm": 57185.23828125, + "learning_rate": 9.601995319581358e-05, + "loss": 2.3354, + "step": 2566 + }, + { + "epoch": 0.4811621368322399, + "grad_norm": 58450.21875, + "learning_rate": 9.601688033959008e-05, + "loss": 2.3992, + "step": 2567 + }, + { + "epoch": 0.48134957825679475, + "grad_norm": 53329.203125, + "learning_rate": 9.601380634680931e-05, + "loss": 2.3976, + "step": 2568 + }, + { + "epoch": 0.4815370196813496, + "grad_norm": 52150.5859375, + "learning_rate": 9.60107312175472e-05, + "loss": 2.3489, + "step": 2569 + }, + { + "epoch": 0.4817244611059044, + "grad_norm": 46417.56640625, + "learning_rate": 9.600765495187965e-05, + "loss": 2.3293, + "step": 2570 + }, + { + "epoch": 0.48191190253045924, + "grad_norm": 50999.24609375, + "learning_rate": 9.60045775498827e-05, + "loss": 2.3571, + "step": 2571 + }, + { + "epoch": 0.4820993439550141, + "grad_norm": 49337.80859375, + "learning_rate": 9.600149901163231e-05, + "loss": 2.3666, + "step": 2572 + }, + { + "epoch": 0.48228678537956887, + "grad_norm": 52816.921875, + "learning_rate": 9.599841933720455e-05, + "loss": 2.2841, + "step": 2573 + }, + { + "epoch": 0.4824742268041237, + "grad_norm": 54745.0390625, + "learning_rate": 9.599533852667546e-05, + "loss": 2.4038, + "step": 2574 + }, + { + "epoch": 0.48266166822867856, + "grad_norm": 51141.0546875, + "learning_rate": 9.599225658012115e-05, + "loss": 2.315, + "step": 2575 + }, + { + "epoch": 0.48284910965323335, + "grad_norm": 50661.62109375, + "learning_rate": 9.598917349761771e-05, + "loss": 2.3529, + "step": 2576 + }, + { + "epoch": 0.4830365510777882, + "grad_norm": 48944.5625, + "learning_rate": 9.598608927924133e-05, + "loss": 2.3454, + "step": 2577 + }, + { + "epoch": 0.483223992502343, + "grad_norm": 51163.421875, + "learning_rate": 9.598300392506818e-05, + "loss": 2.3835, + "step": 2578 + }, + { + "epoch": 0.48341143392689784, + "grad_norm": 51721.54296875, + "learning_rate": 9.597991743517443e-05, + "loss": 2.2823, + "step": 2579 + }, + { + "epoch": 0.4835988753514527, + "grad_norm": 48302.3828125, + "learning_rate": 9.597682980963633e-05, + "loss": 2.3605, + "step": 2580 + }, + { + "epoch": 0.4837863167760075, + "grad_norm": 55893.1640625, + "learning_rate": 9.597374104853017e-05, + "loss": 2.3099, + "step": 2581 + }, + { + "epoch": 0.4839737582005623, + "grad_norm": 51035.9609375, + "learning_rate": 9.597065115193218e-05, + "loss": 2.2984, + "step": 2582 + }, + { + "epoch": 0.48416119962511717, + "grad_norm": 51052.69921875, + "learning_rate": 9.596756011991872e-05, + "loss": 2.3304, + "step": 2583 + }, + { + "epoch": 0.48434864104967196, + "grad_norm": 51960.828125, + "learning_rate": 9.596446795256613e-05, + "loss": 2.3258, + "step": 2584 + }, + { + "epoch": 0.4845360824742268, + "grad_norm": 48794.4609375, + "learning_rate": 9.596137464995078e-05, + "loss": 2.2591, + "step": 2585 + }, + { + "epoch": 0.48472352389878165, + "grad_norm": 51318.53125, + "learning_rate": 9.595828021214905e-05, + "loss": 2.3318, + "step": 2586 + }, + { + "epoch": 0.48491096532333644, + "grad_norm": 50914.3046875, + "learning_rate": 9.595518463923738e-05, + "loss": 2.3131, + "step": 2587 + }, + { + "epoch": 0.4850984067478913, + "grad_norm": 49638.62890625, + "learning_rate": 9.595208793129224e-05, + "loss": 2.3788, + "step": 2588 + }, + { + "epoch": 0.48528584817244613, + "grad_norm": 54799.47265625, + "learning_rate": 9.594899008839012e-05, + "loss": 2.2817, + "step": 2589 + }, + { + "epoch": 0.4854732895970009, + "grad_norm": 55356.55859375, + "learning_rate": 9.59458911106075e-05, + "loss": 2.3337, + "step": 2590 + }, + { + "epoch": 0.48566073102155577, + "grad_norm": 49220.14453125, + "learning_rate": 9.594279099802095e-05, + "loss": 2.3619, + "step": 2591 + }, + { + "epoch": 0.4858481724461106, + "grad_norm": 49980.27734375, + "learning_rate": 9.593968975070703e-05, + "loss": 2.3135, + "step": 2592 + }, + { + "epoch": 0.4860356138706654, + "grad_norm": 52777.7890625, + "learning_rate": 9.593658736874232e-05, + "loss": 2.3027, + "step": 2593 + }, + { + "epoch": 0.48622305529522025, + "grad_norm": 54573.38671875, + "learning_rate": 9.593348385220348e-05, + "loss": 2.2499, + "step": 2594 + }, + { + "epoch": 0.48641049671977504, + "grad_norm": 54904.34375, + "learning_rate": 9.593037920116711e-05, + "loss": 2.3712, + "step": 2595 + }, + { + "epoch": 0.4865979381443299, + "grad_norm": 52428.96484375, + "learning_rate": 9.592727341570996e-05, + "loss": 2.3159, + "step": 2596 + }, + { + "epoch": 0.48678537956888474, + "grad_norm": 52374.1484375, + "learning_rate": 9.592416649590869e-05, + "loss": 2.3801, + "step": 2597 + }, + { + "epoch": 0.4869728209934395, + "grad_norm": 53278.78125, + "learning_rate": 9.592105844184005e-05, + "loss": 2.3481, + "step": 2598 + }, + { + "epoch": 0.48716026241799437, + "grad_norm": 50856.6015625, + "learning_rate": 9.591794925358079e-05, + "loss": 2.3838, + "step": 2599 + }, + { + "epoch": 0.4873477038425492, + "grad_norm": 53510.65625, + "learning_rate": 9.591483893120774e-05, + "loss": 2.3724, + "step": 2600 + }, + { + "epoch": 0.487535145267104, + "grad_norm": 52962.8671875, + "learning_rate": 9.591172747479768e-05, + "loss": 2.3599, + "step": 2601 + }, + { + "epoch": 0.48772258669165885, + "grad_norm": 49286.83203125, + "learning_rate": 9.590861488442748e-05, + "loss": 2.273, + "step": 2602 + }, + { + "epoch": 0.4879100281162137, + "grad_norm": 47367.2890625, + "learning_rate": 9.590550116017401e-05, + "loss": 2.2552, + "step": 2603 + }, + { + "epoch": 0.4880974695407685, + "grad_norm": 53366.93359375, + "learning_rate": 9.590238630211419e-05, + "loss": 2.3626, + "step": 2604 + }, + { + "epoch": 0.48828491096532334, + "grad_norm": 52780.69140625, + "learning_rate": 9.589927031032493e-05, + "loss": 2.4183, + "step": 2605 + }, + { + "epoch": 0.4884723523898782, + "grad_norm": 52811.16796875, + "learning_rate": 9.589615318488321e-05, + "loss": 2.3828, + "step": 2606 + }, + { + "epoch": 0.488659793814433, + "grad_norm": 47671.2578125, + "learning_rate": 9.5893034925866e-05, + "loss": 2.3499, + "step": 2607 + }, + { + "epoch": 0.4888472352389878, + "grad_norm": 52398.359375, + "learning_rate": 9.588991553335035e-05, + "loss": 2.2305, + "step": 2608 + }, + { + "epoch": 0.48903467666354267, + "grad_norm": 47089.85546875, + "learning_rate": 9.588679500741328e-05, + "loss": 2.3514, + "step": 2609 + }, + { + "epoch": 0.48922211808809746, + "grad_norm": 53264.8359375, + "learning_rate": 9.588367334813185e-05, + "loss": 2.3907, + "step": 2610 + }, + { + "epoch": 0.4894095595126523, + "grad_norm": 53926.6796875, + "learning_rate": 9.588055055558318e-05, + "loss": 2.3731, + "step": 2611 + }, + { + "epoch": 0.48959700093720715, + "grad_norm": 53030.8125, + "learning_rate": 9.587742662984441e-05, + "loss": 2.4154, + "step": 2612 + }, + { + "epoch": 0.48978444236176194, + "grad_norm": 54196.38671875, + "learning_rate": 9.587430157099268e-05, + "loss": 2.3335, + "step": 2613 + }, + { + "epoch": 0.4899718837863168, + "grad_norm": 51065.359375, + "learning_rate": 9.587117537910518e-05, + "loss": 2.3787, + "step": 2614 + }, + { + "epoch": 0.4901593252108716, + "grad_norm": 52724.078125, + "learning_rate": 9.586804805425911e-05, + "loss": 2.2911, + "step": 2615 + }, + { + "epoch": 0.4903467666354264, + "grad_norm": 47903.58984375, + "learning_rate": 9.586491959653172e-05, + "loss": 2.3787, + "step": 2616 + }, + { + "epoch": 0.49053420805998127, + "grad_norm": 46813.26953125, + "learning_rate": 9.586179000600028e-05, + "loss": 2.3145, + "step": 2617 + }, + { + "epoch": 0.49072164948453606, + "grad_norm": 49721.8203125, + "learning_rate": 9.58586592827421e-05, + "loss": 2.3527, + "step": 2618 + }, + { + "epoch": 0.4909090909090909, + "grad_norm": 54983.90625, + "learning_rate": 9.585552742683449e-05, + "loss": 2.3193, + "step": 2619 + }, + { + "epoch": 0.49109653233364575, + "grad_norm": 50989.671875, + "learning_rate": 9.585239443835481e-05, + "loss": 2.2833, + "step": 2620 + }, + { + "epoch": 0.49128397375820054, + "grad_norm": 53539.0, + "learning_rate": 9.584926031738041e-05, + "loss": 2.2934, + "step": 2621 + }, + { + "epoch": 0.4914714151827554, + "grad_norm": 47169.234375, + "learning_rate": 9.584612506398875e-05, + "loss": 2.3599, + "step": 2622 + }, + { + "epoch": 0.49165885660731024, + "grad_norm": 51653.73828125, + "learning_rate": 9.584298867825725e-05, + "loss": 2.3185, + "step": 2623 + }, + { + "epoch": 0.491846298031865, + "grad_norm": 48529.953125, + "learning_rate": 9.583985116026334e-05, + "loss": 2.338, + "step": 2624 + }, + { + "epoch": 0.49203373945641987, + "grad_norm": 52031.6171875, + "learning_rate": 9.583671251008455e-05, + "loss": 2.3577, + "step": 2625 + }, + { + "epoch": 0.4922211808809747, + "grad_norm": 49078.0546875, + "learning_rate": 9.583357272779838e-05, + "loss": 2.306, + "step": 2626 + }, + { + "epoch": 0.4924086223055295, + "grad_norm": 48514.8125, + "learning_rate": 9.58304318134824e-05, + "loss": 2.3512, + "step": 2627 + }, + { + "epoch": 0.49259606373008435, + "grad_norm": 52055.73046875, + "learning_rate": 9.582728976721417e-05, + "loss": 2.404, + "step": 2628 + }, + { + "epoch": 0.4927835051546392, + "grad_norm": 49459.70703125, + "learning_rate": 9.58241465890713e-05, + "loss": 2.377, + "step": 2629 + }, + { + "epoch": 0.492970946579194, + "grad_norm": 51190.97265625, + "learning_rate": 9.582100227913142e-05, + "loss": 2.2877, + "step": 2630 + }, + { + "epoch": 0.49315838800374884, + "grad_norm": 56739.3125, + "learning_rate": 9.581785683747218e-05, + "loss": 2.3504, + "step": 2631 + }, + { + "epoch": 0.49334582942830363, + "grad_norm": 48622.86328125, + "learning_rate": 9.58147102641713e-05, + "loss": 2.348, + "step": 2632 + }, + { + "epoch": 0.4935332708528585, + "grad_norm": 47240.23828125, + "learning_rate": 9.581156255930647e-05, + "loss": 2.2782, + "step": 2633 + }, + { + "epoch": 0.4937207122774133, + "grad_norm": 66216.6171875, + "learning_rate": 9.580841372295543e-05, + "loss": 2.2774, + "step": 2634 + }, + { + "epoch": 0.4939081537019681, + "grad_norm": 52010.18359375, + "learning_rate": 9.580526375519597e-05, + "loss": 2.3489, + "step": 2635 + }, + { + "epoch": 0.49409559512652296, + "grad_norm": 48532.69921875, + "learning_rate": 9.580211265610588e-05, + "loss": 2.2297, + "step": 2636 + }, + { + "epoch": 0.4942830365510778, + "grad_norm": 49069.85546875, + "learning_rate": 9.5798960425763e-05, + "loss": 2.3354, + "step": 2637 + }, + { + "epoch": 0.4944704779756326, + "grad_norm": 57161.1171875, + "learning_rate": 9.579580706424517e-05, + "loss": 2.3191, + "step": 2638 + }, + { + "epoch": 0.49465791940018744, + "grad_norm": 47675.3984375, + "learning_rate": 9.579265257163029e-05, + "loss": 2.3375, + "step": 2639 + }, + { + "epoch": 0.4948453608247423, + "grad_norm": 49726.2109375, + "learning_rate": 9.578949694799624e-05, + "loss": 2.3176, + "step": 2640 + }, + { + "epoch": 0.4950328022492971, + "grad_norm": 54963.84375, + "learning_rate": 9.5786340193421e-05, + "loss": 2.2573, + "step": 2641 + }, + { + "epoch": 0.4952202436738519, + "grad_norm": 51662.3046875, + "learning_rate": 9.578318230798251e-05, + "loss": 2.2579, + "step": 2642 + }, + { + "epoch": 0.49540768509840677, + "grad_norm": 50683.59375, + "learning_rate": 9.578002329175879e-05, + "loss": 2.372, + "step": 2643 + }, + { + "epoch": 0.49559512652296156, + "grad_norm": 50619.74609375, + "learning_rate": 9.577686314482783e-05, + "loss": 2.2997, + "step": 2644 + }, + { + "epoch": 0.4957825679475164, + "grad_norm": 52060.50390625, + "learning_rate": 9.577370186726772e-05, + "loss": 2.3311, + "step": 2645 + }, + { + "epoch": 0.49597000937207125, + "grad_norm": 51934.13671875, + "learning_rate": 9.57705394591565e-05, + "loss": 2.3436, + "step": 2646 + }, + { + "epoch": 0.49615745079662604, + "grad_norm": 51175.9375, + "learning_rate": 9.576737592057232e-05, + "loss": 2.3243, + "step": 2647 + }, + { + "epoch": 0.4963448922211809, + "grad_norm": 45888.328125, + "learning_rate": 9.576421125159329e-05, + "loss": 2.3759, + "step": 2648 + }, + { + "epoch": 0.49653233364573574, + "grad_norm": 52850.34375, + "learning_rate": 9.576104545229756e-05, + "loss": 2.2922, + "step": 2649 + }, + { + "epoch": 0.4967197750702905, + "grad_norm": 53049.47265625, + "learning_rate": 9.575787852276335e-05, + "loss": 2.3478, + "step": 2650 + }, + { + "epoch": 0.49690721649484537, + "grad_norm": 48739.90234375, + "learning_rate": 9.575471046306886e-05, + "loss": 2.3041, + "step": 2651 + }, + { + "epoch": 0.49709465791940016, + "grad_norm": 48047.51953125, + "learning_rate": 9.575154127329234e-05, + "loss": 2.35, + "step": 2652 + }, + { + "epoch": 0.497282099343955, + "grad_norm": 56094.7421875, + "learning_rate": 9.574837095351208e-05, + "loss": 2.3402, + "step": 2653 + }, + { + "epoch": 0.49746954076850985, + "grad_norm": 44366.58203125, + "learning_rate": 9.574519950380636e-05, + "loss": 2.3082, + "step": 2654 + }, + { + "epoch": 0.49765698219306465, + "grad_norm": 49361.73828125, + "learning_rate": 9.574202692425354e-05, + "loss": 2.3309, + "step": 2655 + }, + { + "epoch": 0.4978444236176195, + "grad_norm": 49317.234375, + "learning_rate": 9.573885321493194e-05, + "loss": 2.3239, + "step": 2656 + }, + { + "epoch": 0.49803186504217434, + "grad_norm": 54840.8125, + "learning_rate": 9.573567837591998e-05, + "loss": 2.4437, + "step": 2657 + }, + { + "epoch": 0.49821930646672913, + "grad_norm": 51171.83203125, + "learning_rate": 9.573250240729605e-05, + "loss": 2.3267, + "step": 2658 + }, + { + "epoch": 0.498406747891284, + "grad_norm": 48432.65625, + "learning_rate": 9.572932530913861e-05, + "loss": 2.3239, + "step": 2659 + }, + { + "epoch": 0.4985941893158388, + "grad_norm": 46061.94140625, + "learning_rate": 9.572614708152613e-05, + "loss": 2.3963, + "step": 2660 + }, + { + "epoch": 0.4987816307403936, + "grad_norm": 51390.06640625, + "learning_rate": 9.572296772453709e-05, + "loss": 2.4026, + "step": 2661 + }, + { + "epoch": 0.49896907216494846, + "grad_norm": 49157.61328125, + "learning_rate": 9.571978723825003e-05, + "loss": 2.2932, + "step": 2662 + }, + { + "epoch": 0.4991565135895033, + "grad_norm": 50969.390625, + "learning_rate": 9.571660562274351e-05, + "loss": 2.3416, + "step": 2663 + }, + { + "epoch": 0.4993439550140581, + "grad_norm": 51674.4921875, + "learning_rate": 9.57134228780961e-05, + "loss": 2.3132, + "step": 2664 + }, + { + "epoch": 0.49953139643861294, + "grad_norm": 52357.68359375, + "learning_rate": 9.571023900438641e-05, + "loss": 2.2713, + "step": 2665 + }, + { + "epoch": 0.4997188378631678, + "grad_norm": 50772.62890625, + "learning_rate": 9.570705400169309e-05, + "loss": 2.3052, + "step": 2666 + }, + { + "epoch": 0.4999062792877226, + "grad_norm": 51512.10546875, + "learning_rate": 9.570386787009477e-05, + "loss": 2.2871, + "step": 2667 + }, + { + "epoch": 0.5000937207122774, + "grad_norm": 48705.5234375, + "learning_rate": 9.570068060967021e-05, + "loss": 2.2737, + "step": 2668 + }, + { + "epoch": 0.5002811621368323, + "grad_norm": 55290.0078125, + "learning_rate": 9.569749222049806e-05, + "loss": 2.3485, + "step": 2669 + }, + { + "epoch": 0.5004686035613871, + "grad_norm": 48181.453125, + "learning_rate": 9.569430270265711e-05, + "loss": 2.3032, + "step": 2670 + }, + { + "epoch": 0.5006560449859419, + "grad_norm": 49924.52734375, + "learning_rate": 9.569111205622613e-05, + "loss": 2.3407, + "step": 2671 + }, + { + "epoch": 0.5008434864104967, + "grad_norm": 53063.30078125, + "learning_rate": 9.568792028128391e-05, + "loss": 2.3344, + "step": 2672 + }, + { + "epoch": 0.5010309278350515, + "grad_norm": 49577.02734375, + "learning_rate": 9.56847273779093e-05, + "loss": 2.2961, + "step": 2673 + }, + { + "epoch": 0.5012183692596064, + "grad_norm": 60394.67578125, + "learning_rate": 9.568153334618117e-05, + "loss": 2.3289, + "step": 2674 + }, + { + "epoch": 0.5014058106841612, + "grad_norm": 51333.71875, + "learning_rate": 9.567833818617838e-05, + "loss": 2.3531, + "step": 2675 + }, + { + "epoch": 0.5015932521087161, + "grad_norm": 47345.35546875, + "learning_rate": 9.567514189797987e-05, + "loss": 2.3163, + "step": 2676 + }, + { + "epoch": 0.5017806935332708, + "grad_norm": 52711.76171875, + "learning_rate": 9.567194448166455e-05, + "loss": 2.4038, + "step": 2677 + }, + { + "epoch": 0.5019681349578257, + "grad_norm": 50573.53515625, + "learning_rate": 9.566874593731144e-05, + "loss": 2.2507, + "step": 2678 + }, + { + "epoch": 0.5021555763823805, + "grad_norm": 53105.8125, + "learning_rate": 9.566554626499949e-05, + "loss": 2.3777, + "step": 2679 + }, + { + "epoch": 0.5023430178069354, + "grad_norm": 48675.3125, + "learning_rate": 9.566234546480777e-05, + "loss": 2.3386, + "step": 2680 + }, + { + "epoch": 0.5025304592314902, + "grad_norm": 51244.3984375, + "learning_rate": 9.565914353681533e-05, + "loss": 2.3334, + "step": 2681 + }, + { + "epoch": 0.5027179006560449, + "grad_norm": 50423.734375, + "learning_rate": 9.565594048110122e-05, + "loss": 2.3209, + "step": 2682 + }, + { + "epoch": 0.5029053420805998, + "grad_norm": 51443.62890625, + "learning_rate": 9.565273629774459e-05, + "loss": 2.3064, + "step": 2683 + }, + { + "epoch": 0.5030927835051546, + "grad_norm": 58286.61328125, + "learning_rate": 9.564953098682454e-05, + "loss": 2.3814, + "step": 2684 + }, + { + "epoch": 0.5032802249297095, + "grad_norm": 49712.3359375, + "learning_rate": 9.564632454842028e-05, + "loss": 2.2869, + "step": 2685 + }, + { + "epoch": 0.5034676663542643, + "grad_norm": 48875.1640625, + "learning_rate": 9.5643116982611e-05, + "loss": 2.3584, + "step": 2686 + }, + { + "epoch": 0.5036551077788192, + "grad_norm": 52764.58984375, + "learning_rate": 9.563990828947588e-05, + "loss": 2.3594, + "step": 2687 + }, + { + "epoch": 0.5038425492033739, + "grad_norm": 52147.77734375, + "learning_rate": 9.563669846909421e-05, + "loss": 2.256, + "step": 2688 + }, + { + "epoch": 0.5040299906279287, + "grad_norm": 47102.140625, + "learning_rate": 9.563348752154527e-05, + "loss": 2.3469, + "step": 2689 + }, + { + "epoch": 0.5042174320524836, + "grad_norm": 48801.16015625, + "learning_rate": 9.563027544690833e-05, + "loss": 2.3573, + "step": 2690 + }, + { + "epoch": 0.5044048734770384, + "grad_norm": 50866.87890625, + "learning_rate": 9.562706224526275e-05, + "loss": 2.368, + "step": 2691 + }, + { + "epoch": 0.5045923149015933, + "grad_norm": 49627.59375, + "learning_rate": 9.562384791668789e-05, + "loss": 2.3253, + "step": 2692 + }, + { + "epoch": 0.5047797563261481, + "grad_norm": 50088.0234375, + "learning_rate": 9.562063246126314e-05, + "loss": 2.3242, + "step": 2693 + }, + { + "epoch": 0.5049671977507029, + "grad_norm": 53157.53515625, + "learning_rate": 9.561741587906792e-05, + "loss": 2.2813, + "step": 2694 + }, + { + "epoch": 0.5051546391752577, + "grad_norm": 58546.203125, + "learning_rate": 9.561419817018166e-05, + "loss": 2.3448, + "step": 2695 + }, + { + "epoch": 0.5053420805998126, + "grad_norm": 54606.7421875, + "learning_rate": 9.561097933468387e-05, + "loss": 2.3206, + "step": 2696 + }, + { + "epoch": 0.5055295220243674, + "grad_norm": 53344.1015625, + "learning_rate": 9.5607759372654e-05, + "loss": 2.35, + "step": 2697 + }, + { + "epoch": 0.5057169634489223, + "grad_norm": 52162.125, + "learning_rate": 9.560453828417163e-05, + "loss": 2.3623, + "step": 2698 + }, + { + "epoch": 0.505904404873477, + "grad_norm": 53764.01953125, + "learning_rate": 9.560131606931628e-05, + "loss": 2.355, + "step": 2699 + }, + { + "epoch": 0.5060918462980318, + "grad_norm": 53323.10546875, + "learning_rate": 9.559809272816753e-05, + "loss": 2.2881, + "step": 2700 + }, + { + "epoch": 0.5062792877225867, + "grad_norm": 49763.13671875, + "learning_rate": 9.559486826080504e-05, + "loss": 2.3374, + "step": 2701 + }, + { + "epoch": 0.5064667291471415, + "grad_norm": 55351.375, + "learning_rate": 9.55916426673084e-05, + "loss": 2.3473, + "step": 2702 + }, + { + "epoch": 0.5066541705716964, + "grad_norm": 50802.33984375, + "learning_rate": 9.55884159477573e-05, + "loss": 2.297, + "step": 2703 + }, + { + "epoch": 0.5068416119962512, + "grad_norm": 49454.4140625, + "learning_rate": 9.558518810223142e-05, + "loss": 2.3865, + "step": 2704 + }, + { + "epoch": 0.507029053420806, + "grad_norm": 53110.66796875, + "learning_rate": 9.55819591308105e-05, + "loss": 2.3348, + "step": 2705 + }, + { + "epoch": 0.5072164948453608, + "grad_norm": 47627.98046875, + "learning_rate": 9.55787290335743e-05, + "loss": 2.3172, + "step": 2706 + }, + { + "epoch": 0.5074039362699156, + "grad_norm": 49806.14453125, + "learning_rate": 9.557549781060257e-05, + "loss": 2.3718, + "step": 2707 + }, + { + "epoch": 0.5075913776944705, + "grad_norm": 46778.05078125, + "learning_rate": 9.557226546197515e-05, + "loss": 2.3259, + "step": 2708 + }, + { + "epoch": 0.5077788191190253, + "grad_norm": 49870.5546875, + "learning_rate": 9.556903198777185e-05, + "loss": 2.3395, + "step": 2709 + }, + { + "epoch": 0.5079662605435802, + "grad_norm": 49865.51953125, + "learning_rate": 9.556579738807255e-05, + "loss": 2.3091, + "step": 2710 + }, + { + "epoch": 0.5081537019681349, + "grad_norm": 49157.76953125, + "learning_rate": 9.556256166295711e-05, + "loss": 2.3661, + "step": 2711 + }, + { + "epoch": 0.5083411433926898, + "grad_norm": 48706.87109375, + "learning_rate": 9.555932481250549e-05, + "loss": 2.2982, + "step": 2712 + }, + { + "epoch": 0.5085285848172446, + "grad_norm": 47269.15234375, + "learning_rate": 9.555608683679762e-05, + "loss": 2.325, + "step": 2713 + }, + { + "epoch": 0.5087160262417995, + "grad_norm": 48913.23828125, + "learning_rate": 9.555284773591346e-05, + "loss": 2.35, + "step": 2714 + }, + { + "epoch": 0.5089034676663543, + "grad_norm": 54355.62890625, + "learning_rate": 9.554960750993301e-05, + "loss": 2.3102, + "step": 2715 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 50484.42578125, + "learning_rate": 9.554636615893632e-05, + "loss": 2.2451, + "step": 2716 + }, + { + "epoch": 0.5092783505154639, + "grad_norm": 54381.015625, + "learning_rate": 9.554312368300344e-05, + "loss": 2.3325, + "step": 2717 + }, + { + "epoch": 0.5094657919400187, + "grad_norm": 54790.87109375, + "learning_rate": 9.553988008221446e-05, + "loss": 2.4315, + "step": 2718 + }, + { + "epoch": 0.5096532333645736, + "grad_norm": 52564.72265625, + "learning_rate": 9.553663535664947e-05, + "loss": 2.3457, + "step": 2719 + }, + { + "epoch": 0.5098406747891284, + "grad_norm": 49103.44921875, + "learning_rate": 9.553338950638863e-05, + "loss": 2.3554, + "step": 2720 + }, + { + "epoch": 0.5100281162136833, + "grad_norm": 59507.33984375, + "learning_rate": 9.553014253151211e-05, + "loss": 2.2442, + "step": 2721 + }, + { + "epoch": 0.510215557638238, + "grad_norm": 55863.203125, + "learning_rate": 9.55268944321001e-05, + "loss": 2.4397, + "step": 2722 + }, + { + "epoch": 0.5104029990627929, + "grad_norm": 50850.765625, + "learning_rate": 9.55236452082328e-05, + "loss": 2.3247, + "step": 2723 + }, + { + "epoch": 0.5105904404873477, + "grad_norm": 50666.07421875, + "learning_rate": 9.552039485999051e-05, + "loss": 2.3023, + "step": 2724 + }, + { + "epoch": 0.5107778819119025, + "grad_norm": 49535.97265625, + "learning_rate": 9.55171433874535e-05, + "loss": 2.3505, + "step": 2725 + }, + { + "epoch": 0.5109653233364574, + "grad_norm": 51638.28515625, + "learning_rate": 9.551389079070203e-05, + "loss": 2.329, + "step": 2726 + }, + { + "epoch": 0.5111527647610122, + "grad_norm": 49507.9921875, + "learning_rate": 9.55106370698165e-05, + "loss": 2.3657, + "step": 2727 + }, + { + "epoch": 0.511340206185567, + "grad_norm": 50262.62890625, + "learning_rate": 9.55073822248772e-05, + "loss": 2.3361, + "step": 2728 + }, + { + "epoch": 0.5115276476101218, + "grad_norm": 49827.109375, + "learning_rate": 9.550412625596459e-05, + "loss": 2.3782, + "step": 2729 + }, + { + "epoch": 0.5117150890346767, + "grad_norm": 50976.15625, + "learning_rate": 9.550086916315904e-05, + "loss": 2.2758, + "step": 2730 + }, + { + "epoch": 0.5119025304592315, + "grad_norm": 46772.79296875, + "learning_rate": 9.549761094654102e-05, + "loss": 2.3791, + "step": 2731 + }, + { + "epoch": 0.5120899718837864, + "grad_norm": 61881.3359375, + "learning_rate": 9.5494351606191e-05, + "loss": 2.2981, + "step": 2732 + }, + { + "epoch": 0.5122774133083411, + "grad_norm": 54130.95703125, + "learning_rate": 9.549109114218948e-05, + "loss": 2.4448, + "step": 2733 + }, + { + "epoch": 0.5124648547328959, + "grad_norm": 51312.625, + "learning_rate": 9.548782955461699e-05, + "loss": 2.3453, + "step": 2734 + }, + { + "epoch": 0.5126522961574508, + "grad_norm": 48368.6953125, + "learning_rate": 9.548456684355409e-05, + "loss": 2.256, + "step": 2735 + }, + { + "epoch": 0.5128397375820056, + "grad_norm": 46049.796875, + "learning_rate": 9.548130300908136e-05, + "loss": 2.3321, + "step": 2736 + }, + { + "epoch": 0.5130271790065605, + "grad_norm": 51888.48046875, + "learning_rate": 9.547803805127941e-05, + "loss": 2.262, + "step": 2737 + }, + { + "epoch": 0.5132146204311153, + "grad_norm": 47964.44140625, + "learning_rate": 9.547477197022887e-05, + "loss": 2.3527, + "step": 2738 + }, + { + "epoch": 0.51340206185567, + "grad_norm": 49368.625, + "learning_rate": 9.547150476601044e-05, + "loss": 2.3326, + "step": 2739 + }, + { + "epoch": 0.5135895032802249, + "grad_norm": 50720.00390625, + "learning_rate": 9.54682364387048e-05, + "loss": 2.3454, + "step": 2740 + }, + { + "epoch": 0.5137769447047797, + "grad_norm": 53930.77734375, + "learning_rate": 9.546496698839264e-05, + "loss": 2.3506, + "step": 2741 + }, + { + "epoch": 0.5139643861293346, + "grad_norm": 50460.453125, + "learning_rate": 9.546169641515476e-05, + "loss": 2.3678, + "step": 2742 + }, + { + "epoch": 0.5141518275538894, + "grad_norm": 48400.26171875, + "learning_rate": 9.545842471907194e-05, + "loss": 2.3341, + "step": 2743 + }, + { + "epoch": 0.5143392689784443, + "grad_norm": 50933.22265625, + "learning_rate": 9.545515190022494e-05, + "loss": 2.3755, + "step": 2744 + }, + { + "epoch": 0.514526710402999, + "grad_norm": 54632.31640625, + "learning_rate": 9.545187795869464e-05, + "loss": 2.3989, + "step": 2745 + }, + { + "epoch": 0.5147141518275539, + "grad_norm": 50105.0078125, + "learning_rate": 9.544860289456188e-05, + "loss": 2.327, + "step": 2746 + }, + { + "epoch": 0.5149015932521087, + "grad_norm": 48166.04296875, + "learning_rate": 9.544532670790755e-05, + "loss": 2.3678, + "step": 2747 + }, + { + "epoch": 0.5150890346766636, + "grad_norm": 52324.72265625, + "learning_rate": 9.544204939881257e-05, + "loss": 2.2928, + "step": 2748 + }, + { + "epoch": 0.5152764761012184, + "grad_norm": 46650.84375, + "learning_rate": 9.543877096735788e-05, + "loss": 2.356, + "step": 2749 + }, + { + "epoch": 0.5154639175257731, + "grad_norm": 52167.0, + "learning_rate": 9.543549141362447e-05, + "loss": 2.3468, + "step": 2750 + }, + { + "epoch": 0.515651358950328, + "grad_norm": 48010.83203125, + "learning_rate": 9.543221073769333e-05, + "loss": 2.3427, + "step": 2751 + }, + { + "epoch": 0.5158388003748828, + "grad_norm": 48015.7109375, + "learning_rate": 9.542892893964548e-05, + "loss": 2.2919, + "step": 2752 + }, + { + "epoch": 0.5160262417994377, + "grad_norm": 46381.39453125, + "learning_rate": 9.542564601956198e-05, + "loss": 2.3671, + "step": 2753 + }, + { + "epoch": 0.5162136832239925, + "grad_norm": 48266.66015625, + "learning_rate": 9.542236197752393e-05, + "loss": 2.3261, + "step": 2754 + }, + { + "epoch": 0.5164011246485474, + "grad_norm": 51390.91796875, + "learning_rate": 9.541907681361243e-05, + "loss": 2.438, + "step": 2755 + }, + { + "epoch": 0.5165885660731021, + "grad_norm": 50251.35546875, + "learning_rate": 9.541579052790862e-05, + "loss": 2.3593, + "step": 2756 + }, + { + "epoch": 0.516776007497657, + "grad_norm": 49756.75390625, + "learning_rate": 9.541250312049366e-05, + "loss": 2.3297, + "step": 2757 + }, + { + "epoch": 0.5169634489222118, + "grad_norm": 50557.515625, + "learning_rate": 9.540921459144876e-05, + "loss": 2.369, + "step": 2758 + }, + { + "epoch": 0.5171508903467666, + "grad_norm": 49807.76953125, + "learning_rate": 9.540592494085512e-05, + "loss": 2.3676, + "step": 2759 + }, + { + "epoch": 0.5173383317713215, + "grad_norm": 50448.921875, + "learning_rate": 9.540263416879401e-05, + "loss": 2.4115, + "step": 2760 + }, + { + "epoch": 0.5175257731958763, + "grad_norm": 49390.75, + "learning_rate": 9.539934227534671e-05, + "loss": 2.3823, + "step": 2761 + }, + { + "epoch": 0.5177132146204311, + "grad_norm": 50481.0390625, + "learning_rate": 9.539604926059451e-05, + "loss": 2.301, + "step": 2762 + }, + { + "epoch": 0.5179006560449859, + "grad_norm": 48823.7109375, + "learning_rate": 9.539275512461874e-05, + "loss": 2.3422, + "step": 2763 + }, + { + "epoch": 0.5180880974695408, + "grad_norm": 46996.3984375, + "learning_rate": 9.538945986750079e-05, + "loss": 2.3375, + "step": 2764 + }, + { + "epoch": 0.5182755388940956, + "grad_norm": 51131.47265625, + "learning_rate": 9.538616348932203e-05, + "loss": 2.2832, + "step": 2765 + }, + { + "epoch": 0.5184629803186505, + "grad_norm": 49168.44140625, + "learning_rate": 9.538286599016386e-05, + "loss": 2.3285, + "step": 2766 + }, + { + "epoch": 0.5186504217432053, + "grad_norm": 52490.5390625, + "learning_rate": 9.537956737010773e-05, + "loss": 2.3747, + "step": 2767 + }, + { + "epoch": 0.51883786316776, + "grad_norm": 60386.87890625, + "learning_rate": 9.537626762923516e-05, + "loss": 2.3914, + "step": 2768 + }, + { + "epoch": 0.5190253045923149, + "grad_norm": 51717.58984375, + "learning_rate": 9.537296676762758e-05, + "loss": 2.4134, + "step": 2769 + }, + { + "epoch": 0.5192127460168697, + "grad_norm": 51475.7734375, + "learning_rate": 9.536966478536655e-05, + "loss": 2.3265, + "step": 2770 + }, + { + "epoch": 0.5194001874414246, + "grad_norm": 51956.34375, + "learning_rate": 9.536636168253364e-05, + "loss": 2.4159, + "step": 2771 + }, + { + "epoch": 0.5195876288659794, + "grad_norm": 52878.96484375, + "learning_rate": 9.536305745921039e-05, + "loss": 2.2793, + "step": 2772 + }, + { + "epoch": 0.5197750702905342, + "grad_norm": 50184.28125, + "learning_rate": 9.535975211547845e-05, + "loss": 2.3344, + "step": 2773 + }, + { + "epoch": 0.519962511715089, + "grad_norm": 49764.5234375, + "learning_rate": 9.535644565141944e-05, + "loss": 2.4375, + "step": 2774 + }, + { + "epoch": 0.5201499531396439, + "grad_norm": 49475.6171875, + "learning_rate": 9.535313806711504e-05, + "loss": 2.3776, + "step": 2775 + }, + { + "epoch": 0.5203373945641987, + "grad_norm": 69296.859375, + "learning_rate": 9.534982936264692e-05, + "loss": 2.5035, + "step": 2776 + }, + { + "epoch": 0.5205248359887535, + "grad_norm": 49305.37890625, + "learning_rate": 9.534651953809681e-05, + "loss": 2.3004, + "step": 2777 + }, + { + "epoch": 0.5207122774133084, + "grad_norm": 49558.78125, + "learning_rate": 9.534320859354646e-05, + "loss": 2.3318, + "step": 2778 + }, + { + "epoch": 0.5208997188378631, + "grad_norm": 51528.625, + "learning_rate": 9.533989652907764e-05, + "loss": 2.2874, + "step": 2779 + }, + { + "epoch": 0.521087160262418, + "grad_norm": 49602.1328125, + "learning_rate": 9.533658334477218e-05, + "loss": 2.2918, + "step": 2780 + }, + { + "epoch": 0.5212746016869728, + "grad_norm": 48712.64453125, + "learning_rate": 9.533326904071187e-05, + "loss": 2.2823, + "step": 2781 + }, + { + "epoch": 0.5214620431115277, + "grad_norm": 47479.3671875, + "learning_rate": 9.53299536169786e-05, + "loss": 2.3075, + "step": 2782 + }, + { + "epoch": 0.5216494845360825, + "grad_norm": 56167.203125, + "learning_rate": 9.532663707365425e-05, + "loss": 2.2974, + "step": 2783 + }, + { + "epoch": 0.5218369259606374, + "grad_norm": 51733.0390625, + "learning_rate": 9.532331941082072e-05, + "loss": 2.2922, + "step": 2784 + }, + { + "epoch": 0.5220243673851921, + "grad_norm": 50504.91015625, + "learning_rate": 9.532000062855996e-05, + "loss": 2.3814, + "step": 2785 + }, + { + "epoch": 0.5222118088097469, + "grad_norm": 51402.3984375, + "learning_rate": 9.531668072695397e-05, + "loss": 2.3151, + "step": 2786 + }, + { + "epoch": 0.5223992502343018, + "grad_norm": 52016.953125, + "learning_rate": 9.53133597060847e-05, + "loss": 2.3292, + "step": 2787 + }, + { + "epoch": 0.5225866916588566, + "grad_norm": 52432.125, + "learning_rate": 9.53100375660342e-05, + "loss": 2.3804, + "step": 2788 + }, + { + "epoch": 0.5227741330834115, + "grad_norm": 52583.4765625, + "learning_rate": 9.530671430688451e-05, + "loss": 2.34, + "step": 2789 + }, + { + "epoch": 0.5229615745079662, + "grad_norm": 48458.8203125, + "learning_rate": 9.530338992871772e-05, + "loss": 2.3388, + "step": 2790 + }, + { + "epoch": 0.523149015932521, + "grad_norm": 51235.46875, + "learning_rate": 9.530006443161594e-05, + "loss": 2.3741, + "step": 2791 + }, + { + "epoch": 0.5233364573570759, + "grad_norm": 49617.92578125, + "learning_rate": 9.529673781566131e-05, + "loss": 2.3732, + "step": 2792 + }, + { + "epoch": 0.5235238987816307, + "grad_norm": 54042.7421875, + "learning_rate": 9.529341008093596e-05, + "loss": 2.2558, + "step": 2793 + }, + { + "epoch": 0.5237113402061856, + "grad_norm": 51455.921875, + "learning_rate": 9.529008122752212e-05, + "loss": 2.3249, + "step": 2794 + }, + { + "epoch": 0.5238987816307404, + "grad_norm": 50271.34765625, + "learning_rate": 9.528675125550199e-05, + "loss": 2.3484, + "step": 2795 + }, + { + "epoch": 0.5240862230552952, + "grad_norm": 48094.70703125, + "learning_rate": 9.528342016495781e-05, + "loss": 2.32, + "step": 2796 + }, + { + "epoch": 0.52427366447985, + "grad_norm": 55093.52734375, + "learning_rate": 9.528008795597189e-05, + "loss": 2.3174, + "step": 2797 + }, + { + "epoch": 0.5244611059044049, + "grad_norm": 50972.34375, + "learning_rate": 9.527675462862649e-05, + "loss": 2.3225, + "step": 2798 + }, + { + "epoch": 0.5246485473289597, + "grad_norm": 48329.640625, + "learning_rate": 9.527342018300393e-05, + "loss": 2.3541, + "step": 2799 + }, + { + "epoch": 0.5248359887535146, + "grad_norm": 53802.625, + "learning_rate": 9.527008461918662e-05, + "loss": 2.2887, + "step": 2800 + }, + { + "epoch": 0.5250234301780694, + "grad_norm": 51595.53125, + "learning_rate": 9.52667479372569e-05, + "loss": 2.3409, + "step": 2801 + }, + { + "epoch": 0.5252108716026241, + "grad_norm": 54570.46484375, + "learning_rate": 9.526341013729721e-05, + "loss": 2.3593, + "step": 2802 + }, + { + "epoch": 0.525398313027179, + "grad_norm": 56015.6328125, + "learning_rate": 9.526007121938996e-05, + "loss": 2.3352, + "step": 2803 + }, + { + "epoch": 0.5255857544517338, + "grad_norm": 50381.69140625, + "learning_rate": 9.525673118361762e-05, + "loss": 2.3778, + "step": 2804 + }, + { + "epoch": 0.5257731958762887, + "grad_norm": 51888.92578125, + "learning_rate": 9.525339003006272e-05, + "loss": 2.2912, + "step": 2805 + }, + { + "epoch": 0.5259606373008435, + "grad_norm": 56993.51953125, + "learning_rate": 9.525004775880774e-05, + "loss": 2.3894, + "step": 2806 + }, + { + "epoch": 0.5261480787253983, + "grad_norm": 51274.609375, + "learning_rate": 9.524670436993525e-05, + "loss": 2.3573, + "step": 2807 + }, + { + "epoch": 0.5263355201499531, + "grad_norm": 50139.60546875, + "learning_rate": 9.524335986352783e-05, + "loss": 2.3417, + "step": 2808 + }, + { + "epoch": 0.526522961574508, + "grad_norm": 48605.19140625, + "learning_rate": 9.524001423966808e-05, + "loss": 2.3467, + "step": 2809 + }, + { + "epoch": 0.5267104029990628, + "grad_norm": 48157.66796875, + "learning_rate": 9.523666749843865e-05, + "loss": 2.3335, + "step": 2810 + }, + { + "epoch": 0.5268978444236176, + "grad_norm": 49666.046875, + "learning_rate": 9.523331963992216e-05, + "loss": 2.3387, + "step": 2811 + }, + { + "epoch": 0.5270852858481725, + "grad_norm": 49991.9375, + "learning_rate": 9.522997066420133e-05, + "loss": 2.3296, + "step": 2812 + }, + { + "epoch": 0.5272727272727272, + "grad_norm": 49884.890625, + "learning_rate": 9.522662057135887e-05, + "loss": 2.3297, + "step": 2813 + }, + { + "epoch": 0.5274601686972821, + "grad_norm": 56012.1171875, + "learning_rate": 9.52232693614775e-05, + "loss": 2.3571, + "step": 2814 + }, + { + "epoch": 0.5276476101218369, + "grad_norm": 50056.98828125, + "learning_rate": 9.521991703464002e-05, + "loss": 2.3576, + "step": 2815 + }, + { + "epoch": 0.5278350515463918, + "grad_norm": 52068.66796875, + "learning_rate": 9.521656359092923e-05, + "loss": 2.369, + "step": 2816 + }, + { + "epoch": 0.5280224929709466, + "grad_norm": 50191.7421875, + "learning_rate": 9.521320903042793e-05, + "loss": 2.3315, + "step": 2817 + }, + { + "epoch": 0.5282099343955015, + "grad_norm": 47704.30078125, + "learning_rate": 9.5209853353219e-05, + "loss": 2.3069, + "step": 2818 + }, + { + "epoch": 0.5283973758200562, + "grad_norm": 50944.84765625, + "learning_rate": 9.52064965593853e-05, + "loss": 2.2837, + "step": 2819 + }, + { + "epoch": 0.528584817244611, + "grad_norm": 48920.421875, + "learning_rate": 9.520313864900974e-05, + "loss": 2.3359, + "step": 2820 + }, + { + "epoch": 0.5287722586691659, + "grad_norm": 49742.69140625, + "learning_rate": 9.519977962217526e-05, + "loss": 2.2978, + "step": 2821 + }, + { + "epoch": 0.5289597000937207, + "grad_norm": 49520.3828125, + "learning_rate": 9.519641947896483e-05, + "loss": 2.3879, + "step": 2822 + }, + { + "epoch": 0.5291471415182756, + "grad_norm": 56814.20703125, + "learning_rate": 9.519305821946145e-05, + "loss": 2.4306, + "step": 2823 + }, + { + "epoch": 0.5293345829428303, + "grad_norm": 53254.078125, + "learning_rate": 9.51896958437481e-05, + "loss": 2.3857, + "step": 2824 + }, + { + "epoch": 0.5295220243673852, + "grad_norm": 50884.4921875, + "learning_rate": 9.518633235190787e-05, + "loss": 2.3427, + "step": 2825 + }, + { + "epoch": 0.52970946579194, + "grad_norm": 52977.57421875, + "learning_rate": 9.518296774402382e-05, + "loss": 2.3907, + "step": 2826 + }, + { + "epoch": 0.5298969072164949, + "grad_norm": 58915.6328125, + "learning_rate": 9.517960202017903e-05, + "loss": 2.2982, + "step": 2827 + }, + { + "epoch": 0.5300843486410497, + "grad_norm": 51705.63671875, + "learning_rate": 9.517623518045666e-05, + "loss": 2.3301, + "step": 2828 + }, + { + "epoch": 0.5302717900656045, + "grad_norm": 47956.4765625, + "learning_rate": 9.517286722493985e-05, + "loss": 2.2648, + "step": 2829 + }, + { + "epoch": 0.5304592314901593, + "grad_norm": 49675.71484375, + "learning_rate": 9.516949815371179e-05, + "loss": 2.4164, + "step": 2830 + }, + { + "epoch": 0.5306466729147141, + "grad_norm": 51796.046875, + "learning_rate": 9.516612796685568e-05, + "loss": 2.3694, + "step": 2831 + }, + { + "epoch": 0.530834114339269, + "grad_norm": 52303.12109375, + "learning_rate": 9.516275666445477e-05, + "loss": 2.3771, + "step": 2832 + }, + { + "epoch": 0.5310215557638238, + "grad_norm": 49959.24609375, + "learning_rate": 9.515938424659234e-05, + "loss": 2.3905, + "step": 2833 + }, + { + "epoch": 0.5312089971883787, + "grad_norm": 48879.85546875, + "learning_rate": 9.515601071335166e-05, + "loss": 2.3193, + "step": 2834 + }, + { + "epoch": 0.5313964386129335, + "grad_norm": 47542.63671875, + "learning_rate": 9.515263606481604e-05, + "loss": 2.3551, + "step": 2835 + }, + { + "epoch": 0.5315838800374882, + "grad_norm": 50535.4453125, + "learning_rate": 9.514926030106887e-05, + "loss": 2.3606, + "step": 2836 + }, + { + "epoch": 0.5317713214620431, + "grad_norm": 49200.48046875, + "learning_rate": 9.514588342219351e-05, + "loss": 2.3686, + "step": 2837 + }, + { + "epoch": 0.5319587628865979, + "grad_norm": 52511.46875, + "learning_rate": 9.514250542827335e-05, + "loss": 2.4774, + "step": 2838 + }, + { + "epoch": 0.5321462043111528, + "grad_norm": 47369.6640625, + "learning_rate": 9.513912631939185e-05, + "loss": 2.3237, + "step": 2839 + }, + { + "epoch": 0.5323336457357076, + "grad_norm": 49785.72265625, + "learning_rate": 9.513574609563244e-05, + "loss": 2.3305, + "step": 2840 + }, + { + "epoch": 0.5325210871602624, + "grad_norm": 52757.49609375, + "learning_rate": 9.513236475707862e-05, + "loss": 2.2759, + "step": 2841 + }, + { + "epoch": 0.5327085285848172, + "grad_norm": 47681.58203125, + "learning_rate": 9.512898230381392e-05, + "loss": 2.309, + "step": 2842 + }, + { + "epoch": 0.532895970009372, + "grad_norm": 51832.3203125, + "learning_rate": 9.512559873592187e-05, + "loss": 2.3026, + "step": 2843 + }, + { + "epoch": 0.5330834114339269, + "grad_norm": 51552.1484375, + "learning_rate": 9.512221405348601e-05, + "loss": 2.331, + "step": 2844 + }, + { + "epoch": 0.5332708528584817, + "grad_norm": 53030.47265625, + "learning_rate": 9.511882825658999e-05, + "loss": 2.3623, + "step": 2845 + }, + { + "epoch": 0.5334582942830366, + "grad_norm": 62660.79296875, + "learning_rate": 9.51154413453174e-05, + "loss": 2.3263, + "step": 2846 + }, + { + "epoch": 0.5336457357075913, + "grad_norm": 56148.19140625, + "learning_rate": 9.51120533197519e-05, + "loss": 2.3605, + "step": 2847 + }, + { + "epoch": 0.5338331771321462, + "grad_norm": 54250.265625, + "learning_rate": 9.510866417997717e-05, + "loss": 2.1996, + "step": 2848 + }, + { + "epoch": 0.534020618556701, + "grad_norm": 49655.39453125, + "learning_rate": 9.510527392607694e-05, + "loss": 2.3467, + "step": 2849 + }, + { + "epoch": 0.5342080599812559, + "grad_norm": 54719.62109375, + "learning_rate": 9.510188255813489e-05, + "loss": 2.3077, + "step": 2850 + }, + { + "epoch": 0.5343955014058107, + "grad_norm": 49255.78125, + "learning_rate": 9.509849007623483e-05, + "loss": 2.3842, + "step": 2851 + }, + { + "epoch": 0.5345829428303656, + "grad_norm": 50981.19921875, + "learning_rate": 9.509509648046054e-05, + "loss": 2.3636, + "step": 2852 + }, + { + "epoch": 0.5347703842549203, + "grad_norm": 49733.08203125, + "learning_rate": 9.509170177089583e-05, + "loss": 2.4004, + "step": 2853 + }, + { + "epoch": 0.5349578256794751, + "grad_norm": 52148.4296875, + "learning_rate": 9.508830594762457e-05, + "loss": 2.3698, + "step": 2854 + }, + { + "epoch": 0.53514526710403, + "grad_norm": 51332.1484375, + "learning_rate": 9.508490901073058e-05, + "loss": 2.3059, + "step": 2855 + }, + { + "epoch": 0.5353327085285848, + "grad_norm": 54375.6640625, + "learning_rate": 9.50815109602978e-05, + "loss": 2.3522, + "step": 2856 + }, + { + "epoch": 0.5355201499531397, + "grad_norm": 47302.80859375, + "learning_rate": 9.507811179641016e-05, + "loss": 2.2691, + "step": 2857 + }, + { + "epoch": 0.5357075913776945, + "grad_norm": 55301.37109375, + "learning_rate": 9.50747115191516e-05, + "loss": 2.3394, + "step": 2858 + }, + { + "epoch": 0.5358950328022493, + "grad_norm": 48449.57421875, + "learning_rate": 9.507131012860609e-05, + "loss": 2.3331, + "step": 2859 + }, + { + "epoch": 0.5360824742268041, + "grad_norm": 53447.0078125, + "learning_rate": 9.506790762485767e-05, + "loss": 2.3723, + "step": 2860 + }, + { + "epoch": 0.536269915651359, + "grad_norm": 49590.640625, + "learning_rate": 9.506450400799037e-05, + "loss": 2.3106, + "step": 2861 + }, + { + "epoch": 0.5364573570759138, + "grad_norm": 51848.1328125, + "learning_rate": 9.506109927808822e-05, + "loss": 2.3354, + "step": 2862 + }, + { + "epoch": 0.5366447985004686, + "grad_norm": 47379.55078125, + "learning_rate": 9.505769343523536e-05, + "loss": 2.2004, + "step": 2863 + }, + { + "epoch": 0.5368322399250234, + "grad_norm": 48446.4140625, + "learning_rate": 9.505428647951589e-05, + "loss": 2.2867, + "step": 2864 + }, + { + "epoch": 0.5370196813495782, + "grad_norm": 47581.640625, + "learning_rate": 9.505087841101396e-05, + "loss": 2.3715, + "step": 2865 + }, + { + "epoch": 0.5372071227741331, + "grad_norm": 50185.015625, + "learning_rate": 9.504746922981374e-05, + "loss": 2.394, + "step": 2866 + }, + { + "epoch": 0.5373945641986879, + "grad_norm": 49184.60546875, + "learning_rate": 9.504405893599944e-05, + "loss": 2.3386, + "step": 2867 + }, + { + "epoch": 0.5375820056232428, + "grad_norm": 48795.9296875, + "learning_rate": 9.504064752965527e-05, + "loss": 2.3389, + "step": 2868 + }, + { + "epoch": 0.5377694470477976, + "grad_norm": 53551.44140625, + "learning_rate": 9.503723501086551e-05, + "loss": 2.3066, + "step": 2869 + }, + { + "epoch": 0.5379568884723523, + "grad_norm": 48805.05078125, + "learning_rate": 9.503382137971444e-05, + "loss": 2.3455, + "step": 2870 + }, + { + "epoch": 0.5381443298969072, + "grad_norm": 46287.82421875, + "learning_rate": 9.503040663628636e-05, + "loss": 2.3074, + "step": 2871 + }, + { + "epoch": 0.538331771321462, + "grad_norm": 51041.22265625, + "learning_rate": 9.502699078066562e-05, + "loss": 2.3222, + "step": 2872 + }, + { + "epoch": 0.5385192127460169, + "grad_norm": 49890.91015625, + "learning_rate": 9.502357381293661e-05, + "loss": 2.3606, + "step": 2873 + }, + { + "epoch": 0.5387066541705717, + "grad_norm": 56579.41015625, + "learning_rate": 9.502015573318367e-05, + "loss": 2.2955, + "step": 2874 + }, + { + "epoch": 0.5388940955951266, + "grad_norm": 48786.10546875, + "learning_rate": 9.501673654149126e-05, + "loss": 2.3676, + "step": 2875 + }, + { + "epoch": 0.5390815370196813, + "grad_norm": 51175.11328125, + "learning_rate": 9.501331623794382e-05, + "loss": 2.2889, + "step": 2876 + }, + { + "epoch": 0.5392689784442362, + "grad_norm": 54869.125, + "learning_rate": 9.500989482262585e-05, + "loss": 2.2493, + "step": 2877 + }, + { + "epoch": 0.539456419868791, + "grad_norm": 47940.57421875, + "learning_rate": 9.500647229562183e-05, + "loss": 2.3477, + "step": 2878 + }, + { + "epoch": 0.5396438612933459, + "grad_norm": 53350.2421875, + "learning_rate": 9.500304865701628e-05, + "loss": 2.2672, + "step": 2879 + }, + { + "epoch": 0.5398313027179007, + "grad_norm": 53157.9765625, + "learning_rate": 9.499962390689379e-05, + "loss": 2.3896, + "step": 2880 + }, + { + "epoch": 0.5400187441424554, + "grad_norm": 47855.79296875, + "learning_rate": 9.499619804533891e-05, + "loss": 2.3549, + "step": 2881 + }, + { + "epoch": 0.5402061855670103, + "grad_norm": 53386.51171875, + "learning_rate": 9.499277107243629e-05, + "loss": 2.2333, + "step": 2882 + }, + { + "epoch": 0.5403936269915651, + "grad_norm": 53704.06640625, + "learning_rate": 9.498934298827057e-05, + "loss": 2.3199, + "step": 2883 + }, + { + "epoch": 0.54058106841612, + "grad_norm": 46732.55859375, + "learning_rate": 9.498591379292639e-05, + "loss": 2.3713, + "step": 2884 + }, + { + "epoch": 0.5407685098406748, + "grad_norm": 50897.4296875, + "learning_rate": 9.498248348648848e-05, + "loss": 2.4147, + "step": 2885 + }, + { + "epoch": 0.5409559512652297, + "grad_norm": 49532.3515625, + "learning_rate": 9.497905206904154e-05, + "loss": 2.3511, + "step": 2886 + }, + { + "epoch": 0.5411433926897844, + "grad_norm": 50153.2109375, + "learning_rate": 9.497561954067033e-05, + "loss": 2.3333, + "step": 2887 + }, + { + "epoch": 0.5413308341143392, + "grad_norm": 52819.265625, + "learning_rate": 9.497218590145963e-05, + "loss": 2.2957, + "step": 2888 + }, + { + "epoch": 0.5415182755388941, + "grad_norm": 49868.99609375, + "learning_rate": 9.496875115149424e-05, + "loss": 2.3933, + "step": 2889 + }, + { + "epoch": 0.5417057169634489, + "grad_norm": 51463.16015625, + "learning_rate": 9.496531529085901e-05, + "loss": 2.3212, + "step": 2890 + }, + { + "epoch": 0.5418931583880038, + "grad_norm": 52239.37109375, + "learning_rate": 9.496187831963878e-05, + "loss": 2.2816, + "step": 2891 + }, + { + "epoch": 0.5420805998125586, + "grad_norm": 49551.60546875, + "learning_rate": 9.495844023791846e-05, + "loss": 2.3137, + "step": 2892 + }, + { + "epoch": 0.5422680412371134, + "grad_norm": 50447.4375, + "learning_rate": 9.495500104578294e-05, + "loss": 2.3694, + "step": 2893 + }, + { + "epoch": 0.5424554826616682, + "grad_norm": 47554.65625, + "learning_rate": 9.49515607433172e-05, + "loss": 2.4228, + "step": 2894 + }, + { + "epoch": 0.542642924086223, + "grad_norm": 50637.4765625, + "learning_rate": 9.494811933060619e-05, + "loss": 2.2934, + "step": 2895 + }, + { + "epoch": 0.5428303655107779, + "grad_norm": 47342.26171875, + "learning_rate": 9.494467680773489e-05, + "loss": 2.355, + "step": 2896 + }, + { + "epoch": 0.5430178069353327, + "grad_norm": 51472.42578125, + "learning_rate": 9.494123317478835e-05, + "loss": 2.3364, + "step": 2897 + }, + { + "epoch": 0.5432052483598875, + "grad_norm": 49212.984375, + "learning_rate": 9.493778843185163e-05, + "loss": 2.2683, + "step": 2898 + }, + { + "epoch": 0.5433926897844423, + "grad_norm": 47259.765625, + "learning_rate": 9.493434257900978e-05, + "loss": 2.3954, + "step": 2899 + }, + { + "epoch": 0.5435801312089972, + "grad_norm": 52228.265625, + "learning_rate": 9.493089561634793e-05, + "loss": 2.3253, + "step": 2900 + }, + { + "epoch": 0.543767572633552, + "grad_norm": 52929.08984375, + "learning_rate": 9.492744754395123e-05, + "loss": 2.3263, + "step": 2901 + }, + { + "epoch": 0.5439550140581069, + "grad_norm": 52532.640625, + "learning_rate": 9.492399836190482e-05, + "loss": 2.3846, + "step": 2902 + }, + { + "epoch": 0.5441424554826617, + "grad_norm": 52948.80859375, + "learning_rate": 9.492054807029388e-05, + "loss": 2.3777, + "step": 2903 + }, + { + "epoch": 0.5443298969072164, + "grad_norm": 53371.47265625, + "learning_rate": 9.491709666920365e-05, + "loss": 2.3683, + "step": 2904 + }, + { + "epoch": 0.5445173383317713, + "grad_norm": 50188.984375, + "learning_rate": 9.491364415871937e-05, + "loss": 2.3698, + "step": 2905 + }, + { + "epoch": 0.5447047797563261, + "grad_norm": 51782.8203125, + "learning_rate": 9.491019053892631e-05, + "loss": 2.3107, + "step": 2906 + }, + { + "epoch": 0.544892221180881, + "grad_norm": 49026.71875, + "learning_rate": 9.490673580990976e-05, + "loss": 2.3479, + "step": 2907 + }, + { + "epoch": 0.5450796626054358, + "grad_norm": 48529.96484375, + "learning_rate": 9.490327997175507e-05, + "loss": 2.2946, + "step": 2908 + }, + { + "epoch": 0.5452671040299907, + "grad_norm": 48247.5, + "learning_rate": 9.489982302454759e-05, + "loss": 2.3529, + "step": 2909 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 47333.3125, + "learning_rate": 9.489636496837268e-05, + "loss": 2.3023, + "step": 2910 + }, + { + "epoch": 0.5456419868791003, + "grad_norm": 47253.19921875, + "learning_rate": 9.489290580331578e-05, + "loss": 2.3262, + "step": 2911 + }, + { + "epoch": 0.5458294283036551, + "grad_norm": 52896.37109375, + "learning_rate": 9.48894455294623e-05, + "loss": 2.2416, + "step": 2912 + }, + { + "epoch": 0.54601686972821, + "grad_norm": 48871.75390625, + "learning_rate": 9.488598414689771e-05, + "loss": 2.312, + "step": 2913 + }, + { + "epoch": 0.5462043111527648, + "grad_norm": 50999.4609375, + "learning_rate": 9.488252165570753e-05, + "loss": 2.3439, + "step": 2914 + }, + { + "epoch": 0.5463917525773195, + "grad_norm": 51257.87109375, + "learning_rate": 9.487905805597723e-05, + "loss": 2.3233, + "step": 2915 + }, + { + "epoch": 0.5465791940018744, + "grad_norm": 50134.77734375, + "learning_rate": 9.48755933477924e-05, + "loss": 2.3448, + "step": 2916 + }, + { + "epoch": 0.5467666354264292, + "grad_norm": 49724.95703125, + "learning_rate": 9.487212753123859e-05, + "loss": 2.3061, + "step": 2917 + }, + { + "epoch": 0.5469540768509841, + "grad_norm": 50496.48046875, + "learning_rate": 9.486866060640141e-05, + "loss": 2.348, + "step": 2918 + }, + { + "epoch": 0.5471415182755389, + "grad_norm": 50167.25, + "learning_rate": 9.486519257336648e-05, + "loss": 2.3545, + "step": 2919 + }, + { + "epoch": 0.5473289597000938, + "grad_norm": 51164.03515625, + "learning_rate": 9.486172343221947e-05, + "loss": 2.4182, + "step": 2920 + }, + { + "epoch": 0.5475164011246485, + "grad_norm": 52638.37890625, + "learning_rate": 9.485825318304604e-05, + "loss": 2.3363, + "step": 2921 + }, + { + "epoch": 0.5477038425492033, + "grad_norm": 53772.84765625, + "learning_rate": 9.485478182593193e-05, + "loss": 2.3267, + "step": 2922 + }, + { + "epoch": 0.5478912839737582, + "grad_norm": 50259.51171875, + "learning_rate": 9.485130936096285e-05, + "loss": 2.3506, + "step": 2923 + }, + { + "epoch": 0.548078725398313, + "grad_norm": 49140.87890625, + "learning_rate": 9.48478357882246e-05, + "loss": 2.3692, + "step": 2924 + }, + { + "epoch": 0.5482661668228679, + "grad_norm": 47656.9140625, + "learning_rate": 9.484436110780294e-05, + "loss": 2.3661, + "step": 2925 + }, + { + "epoch": 0.5484536082474227, + "grad_norm": 49649.8203125, + "learning_rate": 9.48408853197837e-05, + "loss": 2.3709, + "step": 2926 + }, + { + "epoch": 0.5486410496719775, + "grad_norm": 50951.48828125, + "learning_rate": 9.483740842425272e-05, + "loss": 2.3394, + "step": 2927 + }, + { + "epoch": 0.5488284910965323, + "grad_norm": 50282.9765625, + "learning_rate": 9.483393042129591e-05, + "loss": 2.3249, + "step": 2928 + }, + { + "epoch": 0.5490159325210872, + "grad_norm": 49093.359375, + "learning_rate": 9.483045131099915e-05, + "loss": 2.3825, + "step": 2929 + }, + { + "epoch": 0.549203373945642, + "grad_norm": 52095.375, + "learning_rate": 9.482697109344833e-05, + "loss": 2.3747, + "step": 2930 + }, + { + "epoch": 0.5493908153701968, + "grad_norm": 47247.26953125, + "learning_rate": 9.482348976872948e-05, + "loss": 2.3542, + "step": 2931 + }, + { + "epoch": 0.5495782567947516, + "grad_norm": 57798.421875, + "learning_rate": 9.482000733692853e-05, + "loss": 2.337, + "step": 2932 + }, + { + "epoch": 0.5497656982193064, + "grad_norm": 49718.28515625, + "learning_rate": 9.48165237981315e-05, + "loss": 2.3356, + "step": 2933 + }, + { + "epoch": 0.5499531396438613, + "grad_norm": 51683.5625, + "learning_rate": 9.481303915242445e-05, + "loss": 2.2661, + "step": 2934 + }, + { + "epoch": 0.5501405810684161, + "grad_norm": 50448.0234375, + "learning_rate": 9.480955339989344e-05, + "loss": 2.3436, + "step": 2935 + }, + { + "epoch": 0.550328022492971, + "grad_norm": 49756.40625, + "learning_rate": 9.480606654062453e-05, + "loss": 2.2931, + "step": 2936 + }, + { + "epoch": 0.5505154639175258, + "grad_norm": 51926.74609375, + "learning_rate": 9.48025785747039e-05, + "loss": 2.3149, + "step": 2937 + }, + { + "epoch": 0.5507029053420806, + "grad_norm": 52597.7109375, + "learning_rate": 9.479908950221765e-05, + "loss": 2.3598, + "step": 2938 + }, + { + "epoch": 0.5508903467666354, + "grad_norm": 49849.2109375, + "learning_rate": 9.479559932325197e-05, + "loss": 2.3645, + "step": 2939 + }, + { + "epoch": 0.5510777881911902, + "grad_norm": 48679.19140625, + "learning_rate": 9.479210803789308e-05, + "loss": 2.2828, + "step": 2940 + }, + { + "epoch": 0.5512652296157451, + "grad_norm": 53127.41796875, + "learning_rate": 9.478861564622718e-05, + "loss": 2.2862, + "step": 2941 + }, + { + "epoch": 0.5514526710402999, + "grad_norm": 56286.76171875, + "learning_rate": 9.478512214834055e-05, + "loss": 2.3381, + "step": 2942 + }, + { + "epoch": 0.5516401124648548, + "grad_norm": 48510.88671875, + "learning_rate": 9.478162754431947e-05, + "loss": 2.331, + "step": 2943 + }, + { + "epoch": 0.5518275538894095, + "grad_norm": 49881.46875, + "learning_rate": 9.477813183425026e-05, + "loss": 2.3458, + "step": 2944 + }, + { + "epoch": 0.5520149953139644, + "grad_norm": 51267.3125, + "learning_rate": 9.477463501821923e-05, + "loss": 2.3079, + "step": 2945 + }, + { + "epoch": 0.5522024367385192, + "grad_norm": 49418.96875, + "learning_rate": 9.477113709631277e-05, + "loss": 2.3852, + "step": 2946 + }, + { + "epoch": 0.552389878163074, + "grad_norm": 45944.98046875, + "learning_rate": 9.476763806861727e-05, + "loss": 2.3112, + "step": 2947 + }, + { + "epoch": 0.5525773195876289, + "grad_norm": 56056.52734375, + "learning_rate": 9.476413793521917e-05, + "loss": 2.2989, + "step": 2948 + }, + { + "epoch": 0.5527647610121837, + "grad_norm": 48977.125, + "learning_rate": 9.476063669620488e-05, + "loss": 2.3153, + "step": 2949 + }, + { + "epoch": 0.5529522024367385, + "grad_norm": 51653.05078125, + "learning_rate": 9.475713435166089e-05, + "loss": 2.3442, + "step": 2950 + }, + { + "epoch": 0.5531396438612933, + "grad_norm": 53510.05859375, + "learning_rate": 9.475363090167372e-05, + "loss": 2.3994, + "step": 2951 + }, + { + "epoch": 0.5533270852858482, + "grad_norm": 50209.6171875, + "learning_rate": 9.47501263463299e-05, + "loss": 2.3947, + "step": 2952 + }, + { + "epoch": 0.553514526710403, + "grad_norm": 50206.75, + "learning_rate": 9.474662068571598e-05, + "loss": 2.3369, + "step": 2953 + }, + { + "epoch": 0.5537019681349579, + "grad_norm": 54650.015625, + "learning_rate": 9.474311391991853e-05, + "loss": 2.2691, + "step": 2954 + }, + { + "epoch": 0.5538894095595126, + "grad_norm": 48790.26953125, + "learning_rate": 9.473960604902417e-05, + "loss": 2.3361, + "step": 2955 + }, + { + "epoch": 0.5540768509840674, + "grad_norm": 55122.7265625, + "learning_rate": 9.473609707311957e-05, + "loss": 2.4192, + "step": 2956 + }, + { + "epoch": 0.5542642924086223, + "grad_norm": 46598.671875, + "learning_rate": 9.473258699229135e-05, + "loss": 2.313, + "step": 2957 + }, + { + "epoch": 0.5544517338331771, + "grad_norm": 55926.35546875, + "learning_rate": 9.472907580662624e-05, + "loss": 2.3, + "step": 2958 + }, + { + "epoch": 0.554639175257732, + "grad_norm": 52265.77734375, + "learning_rate": 9.472556351621096e-05, + "loss": 2.3363, + "step": 2959 + }, + { + "epoch": 0.5548266166822868, + "grad_norm": 57067.18359375, + "learning_rate": 9.472205012113224e-05, + "loss": 2.3357, + "step": 2960 + }, + { + "epoch": 0.5550140581068416, + "grad_norm": 50298.625, + "learning_rate": 9.471853562147686e-05, + "loss": 2.3066, + "step": 2961 + }, + { + "epoch": 0.5552014995313964, + "grad_norm": 51017.9375, + "learning_rate": 9.471502001733164e-05, + "loss": 2.2418, + "step": 2962 + }, + { + "epoch": 0.5553889409559513, + "grad_norm": 50354.625, + "learning_rate": 9.471150330878337e-05, + "loss": 2.3289, + "step": 2963 + }, + { + "epoch": 0.5555763823805061, + "grad_norm": 49858.3515625, + "learning_rate": 9.470798549591897e-05, + "loss": 2.3751, + "step": 2964 + }, + { + "epoch": 0.555763823805061, + "grad_norm": 47527.63671875, + "learning_rate": 9.470446657882528e-05, + "loss": 2.3081, + "step": 2965 + }, + { + "epoch": 0.5559512652296158, + "grad_norm": 54704.2578125, + "learning_rate": 9.470094655758924e-05, + "loss": 2.2548, + "step": 2966 + }, + { + "epoch": 0.5561387066541705, + "grad_norm": 49557.35546875, + "learning_rate": 9.469742543229775e-05, + "loss": 2.3424, + "step": 2967 + }, + { + "epoch": 0.5563261480787254, + "grad_norm": 50029.46875, + "learning_rate": 9.469390320303782e-05, + "loss": 2.2586, + "step": 2968 + }, + { + "epoch": 0.5565135895032802, + "grad_norm": 53223.62109375, + "learning_rate": 9.469037986989642e-05, + "loss": 2.3199, + "step": 2969 + }, + { + "epoch": 0.5567010309278351, + "grad_norm": 49809.2265625, + "learning_rate": 9.468685543296059e-05, + "loss": 2.2747, + "step": 2970 + }, + { + "epoch": 0.5568884723523899, + "grad_norm": 54176.0390625, + "learning_rate": 9.468332989231735e-05, + "loss": 2.3757, + "step": 2971 + }, + { + "epoch": 0.5570759137769447, + "grad_norm": 49989.5703125, + "learning_rate": 9.467980324805381e-05, + "loss": 2.3519, + "step": 2972 + }, + { + "epoch": 0.5572633552014995, + "grad_norm": 53140.44921875, + "learning_rate": 9.467627550025705e-05, + "loss": 2.256, + "step": 2973 + }, + { + "epoch": 0.5574507966260543, + "grad_norm": 49476.1328125, + "learning_rate": 9.467274664901421e-05, + "loss": 2.3569, + "step": 2974 + }, + { + "epoch": 0.5576382380506092, + "grad_norm": 47872.6484375, + "learning_rate": 9.466921669441244e-05, + "loss": 2.3816, + "step": 2975 + }, + { + "epoch": 0.557825679475164, + "grad_norm": 49575.0703125, + "learning_rate": 9.466568563653895e-05, + "loss": 2.3331, + "step": 2976 + }, + { + "epoch": 0.5580131208997189, + "grad_norm": 51318.31640625, + "learning_rate": 9.466215347548094e-05, + "loss": 2.354, + "step": 2977 + }, + { + "epoch": 0.5582005623242736, + "grad_norm": 49093.62109375, + "learning_rate": 9.465862021132562e-05, + "loss": 2.3198, + "step": 2978 + }, + { + "epoch": 0.5583880037488285, + "grad_norm": 57427.72265625, + "learning_rate": 9.465508584416028e-05, + "loss": 2.2871, + "step": 2979 + }, + { + "epoch": 0.5585754451733833, + "grad_norm": 52336.421875, + "learning_rate": 9.465155037407223e-05, + "loss": 2.3824, + "step": 2980 + }, + { + "epoch": 0.5587628865979382, + "grad_norm": 49354.9609375, + "learning_rate": 9.464801380114879e-05, + "loss": 2.3224, + "step": 2981 + }, + { + "epoch": 0.558950328022493, + "grad_norm": 54161.37890625, + "learning_rate": 9.464447612547727e-05, + "loss": 2.4441, + "step": 2982 + }, + { + "epoch": 0.5591377694470478, + "grad_norm": 50860.08203125, + "learning_rate": 9.464093734714508e-05, + "loss": 2.3682, + "step": 2983 + }, + { + "epoch": 0.5593252108716026, + "grad_norm": 53007.51953125, + "learning_rate": 9.463739746623963e-05, + "loss": 2.2674, + "step": 2984 + }, + { + "epoch": 0.5595126522961574, + "grad_norm": 52510.1171875, + "learning_rate": 9.463385648284832e-05, + "loss": 2.3501, + "step": 2985 + }, + { + "epoch": 0.5597000937207123, + "grad_norm": 54531.546875, + "learning_rate": 9.463031439705863e-05, + "loss": 2.4001, + "step": 2986 + }, + { + "epoch": 0.5598875351452671, + "grad_norm": 49786.1328125, + "learning_rate": 9.462677120895803e-05, + "loss": 2.2958, + "step": 2987 + }, + { + "epoch": 0.560074976569822, + "grad_norm": 49478.12109375, + "learning_rate": 9.462322691863406e-05, + "loss": 2.3933, + "step": 2988 + }, + { + "epoch": 0.5602624179943767, + "grad_norm": 51968.90234375, + "learning_rate": 9.461968152617422e-05, + "loss": 2.3822, + "step": 2989 + }, + { + "epoch": 0.5604498594189316, + "grad_norm": 51679.3984375, + "learning_rate": 9.46161350316661e-05, + "loss": 2.3062, + "step": 2990 + }, + { + "epoch": 0.5606373008434864, + "grad_norm": 52033.359375, + "learning_rate": 9.46125874351973e-05, + "loss": 2.3381, + "step": 2991 + }, + { + "epoch": 0.5608247422680412, + "grad_norm": 54190.37890625, + "learning_rate": 9.460903873685541e-05, + "loss": 2.3746, + "step": 2992 + }, + { + "epoch": 0.5610121836925961, + "grad_norm": 47796.98828125, + "learning_rate": 9.460548893672812e-05, + "loss": 2.3488, + "step": 2993 + }, + { + "epoch": 0.5611996251171509, + "grad_norm": 46998.62109375, + "learning_rate": 9.460193803490309e-05, + "loss": 2.3348, + "step": 2994 + }, + { + "epoch": 0.5613870665417057, + "grad_norm": 49158.515625, + "learning_rate": 9.4598386031468e-05, + "loss": 2.372, + "step": 2995 + }, + { + "epoch": 0.5615745079662605, + "grad_norm": 53713.375, + "learning_rate": 9.45948329265106e-05, + "loss": 2.2983, + "step": 2996 + }, + { + "epoch": 0.5617619493908154, + "grad_norm": 49389.41015625, + "learning_rate": 9.459127872011866e-05, + "loss": 2.3865, + "step": 2997 + }, + { + "epoch": 0.5619493908153702, + "grad_norm": 47420.10546875, + "learning_rate": 9.458772341237993e-05, + "loss": 2.3407, + "step": 2998 + }, + { + "epoch": 0.562136832239925, + "grad_norm": 54772.48828125, + "learning_rate": 9.458416700338225e-05, + "loss": 2.3649, + "step": 2999 + }, + { + "epoch": 0.5623242736644799, + "grad_norm": 48292.671875, + "learning_rate": 9.458060949321346e-05, + "loss": 2.3342, + "step": 3000 + }, + { + "epoch": 0.5623242736644799, + "eval_loss": 2.333876371383667, + "eval_runtime": 131.6872, + "eval_samples_per_second": 38.341, + "eval_steps_per_second": 1.921, + "step": 3000 + }, + { + "epoch": 0.5625117150890346, + "grad_norm": 50550.65625, + "learning_rate": 9.45770508819614e-05, + "loss": 2.286, + "step": 3001 + }, + { + "epoch": 0.5626991565135895, + "grad_norm": 48394.40234375, + "learning_rate": 9.457349116971398e-05, + "loss": 2.2758, + "step": 3002 + }, + { + "epoch": 0.5628865979381443, + "grad_norm": 49461.26171875, + "learning_rate": 9.456993035655912e-05, + "loss": 2.335, + "step": 3003 + }, + { + "epoch": 0.5630740393626992, + "grad_norm": 49031.30859375, + "learning_rate": 9.456636844258478e-05, + "loss": 2.3461, + "step": 3004 + }, + { + "epoch": 0.563261480787254, + "grad_norm": 53709.0, + "learning_rate": 9.45628054278789e-05, + "loss": 2.324, + "step": 3005 + }, + { + "epoch": 0.5634489222118088, + "grad_norm": 50794.421875, + "learning_rate": 9.455924131252952e-05, + "loss": 2.3336, + "step": 3006 + }, + { + "epoch": 0.5636363636363636, + "grad_norm": 53280.875, + "learning_rate": 9.455567609662463e-05, + "loss": 2.4135, + "step": 3007 + }, + { + "epoch": 0.5638238050609184, + "grad_norm": 49422.99609375, + "learning_rate": 9.455210978025231e-05, + "loss": 2.3229, + "step": 3008 + }, + { + "epoch": 0.5640112464854733, + "grad_norm": 50744.91015625, + "learning_rate": 9.454854236350066e-05, + "loss": 2.4279, + "step": 3009 + }, + { + "epoch": 0.5641986879100281, + "grad_norm": 52751.04296875, + "learning_rate": 9.454497384645775e-05, + "loss": 2.351, + "step": 3010 + }, + { + "epoch": 0.564386129334583, + "grad_norm": 47693.26171875, + "learning_rate": 9.454140422921176e-05, + "loss": 2.3046, + "step": 3011 + }, + { + "epoch": 0.5645735707591377, + "grad_norm": 47716.55078125, + "learning_rate": 9.453783351185083e-05, + "loss": 2.3687, + "step": 3012 + }, + { + "epoch": 0.5647610121836926, + "grad_norm": 48005.0078125, + "learning_rate": 9.453426169446315e-05, + "loss": 2.3713, + "step": 3013 + }, + { + "epoch": 0.5649484536082474, + "grad_norm": 50847.48046875, + "learning_rate": 9.453068877713695e-05, + "loss": 2.3026, + "step": 3014 + }, + { + "epoch": 0.5651358950328023, + "grad_norm": 48851.90234375, + "learning_rate": 9.452711475996047e-05, + "loss": 2.3444, + "step": 3015 + }, + { + "epoch": 0.5653233364573571, + "grad_norm": 51359.95703125, + "learning_rate": 9.452353964302199e-05, + "loss": 2.2612, + "step": 3016 + }, + { + "epoch": 0.565510777881912, + "grad_norm": 52547.75, + "learning_rate": 9.451996342640978e-05, + "loss": 2.2657, + "step": 3017 + }, + { + "epoch": 0.5656982193064667, + "grad_norm": 51994.0390625, + "learning_rate": 9.451638611021223e-05, + "loss": 2.4182, + "step": 3018 + }, + { + "epoch": 0.5658856607310215, + "grad_norm": 54669.8515625, + "learning_rate": 9.451280769451765e-05, + "loss": 2.3067, + "step": 3019 + }, + { + "epoch": 0.5660731021555764, + "grad_norm": 52217.73828125, + "learning_rate": 9.450922817941442e-05, + "loss": 2.3409, + "step": 3020 + }, + { + "epoch": 0.5662605435801312, + "grad_norm": 47981.953125, + "learning_rate": 9.450564756499097e-05, + "loss": 2.37, + "step": 3021 + }, + { + "epoch": 0.5664479850046861, + "grad_norm": 53745.73046875, + "learning_rate": 9.450206585133572e-05, + "loss": 2.305, + "step": 3022 + }, + { + "epoch": 0.5666354264292408, + "grad_norm": 48855.62890625, + "learning_rate": 9.449848303853715e-05, + "loss": 2.3062, + "step": 3023 + }, + { + "epoch": 0.5668228678537957, + "grad_norm": 46690.5078125, + "learning_rate": 9.449489912668374e-05, + "loss": 2.2713, + "step": 3024 + }, + { + "epoch": 0.5670103092783505, + "grad_norm": 49370.88671875, + "learning_rate": 9.4491314115864e-05, + "loss": 2.3653, + "step": 3025 + }, + { + "epoch": 0.5671977507029053, + "grad_norm": 50235.328125, + "learning_rate": 9.44877280061665e-05, + "loss": 2.3669, + "step": 3026 + }, + { + "epoch": 0.5673851921274602, + "grad_norm": 49559.625, + "learning_rate": 9.44841407976798e-05, + "loss": 2.2855, + "step": 3027 + }, + { + "epoch": 0.567572633552015, + "grad_norm": 50410.1171875, + "learning_rate": 9.448055249049249e-05, + "loss": 2.3794, + "step": 3028 + }, + { + "epoch": 0.5677600749765698, + "grad_norm": 48592.2421875, + "learning_rate": 9.447696308469321e-05, + "loss": 2.283, + "step": 3029 + }, + { + "epoch": 0.5679475164011246, + "grad_norm": 51327.265625, + "learning_rate": 9.447337258037058e-05, + "loss": 2.4091, + "step": 3030 + }, + { + "epoch": 0.5681349578256795, + "grad_norm": 53036.3828125, + "learning_rate": 9.446978097761334e-05, + "loss": 2.3094, + "step": 3031 + }, + { + "epoch": 0.5683223992502343, + "grad_norm": 47264.1796875, + "learning_rate": 9.446618827651015e-05, + "loss": 2.3481, + "step": 3032 + }, + { + "epoch": 0.5685098406747892, + "grad_norm": 50087.28515625, + "learning_rate": 9.446259447714976e-05, + "loss": 2.359, + "step": 3033 + }, + { + "epoch": 0.568697282099344, + "grad_norm": 52466.4921875, + "learning_rate": 9.445899957962095e-05, + "loss": 2.3474, + "step": 3034 + }, + { + "epoch": 0.5688847235238987, + "grad_norm": 47739.48046875, + "learning_rate": 9.445540358401247e-05, + "loss": 2.3998, + "step": 3035 + }, + { + "epoch": 0.5690721649484536, + "grad_norm": 48906.66015625, + "learning_rate": 9.445180649041317e-05, + "loss": 2.3443, + "step": 3036 + }, + { + "epoch": 0.5692596063730084, + "grad_norm": 55744.25390625, + "learning_rate": 9.444820829891188e-05, + "loss": 2.271, + "step": 3037 + }, + { + "epoch": 0.5694470477975633, + "grad_norm": 51728.05078125, + "learning_rate": 9.444460900959748e-05, + "loss": 2.3429, + "step": 3038 + }, + { + "epoch": 0.5696344892221181, + "grad_norm": 53309.0859375, + "learning_rate": 9.444100862255884e-05, + "loss": 2.337, + "step": 3039 + }, + { + "epoch": 0.569821930646673, + "grad_norm": 51086.3203125, + "learning_rate": 9.443740713788492e-05, + "loss": 2.3129, + "step": 3040 + }, + { + "epoch": 0.5700093720712277, + "grad_norm": 53962.73046875, + "learning_rate": 9.443380455566467e-05, + "loss": 2.3453, + "step": 3041 + }, + { + "epoch": 0.5701968134957826, + "grad_norm": 48330.48046875, + "learning_rate": 9.443020087598704e-05, + "loss": 2.3474, + "step": 3042 + }, + { + "epoch": 0.5703842549203374, + "grad_norm": 49433.625, + "learning_rate": 9.442659609894106e-05, + "loss": 2.3381, + "step": 3043 + }, + { + "epoch": 0.5705716963448922, + "grad_norm": 54822.859375, + "learning_rate": 9.442299022461575e-05, + "loss": 2.393, + "step": 3044 + }, + { + "epoch": 0.5707591377694471, + "grad_norm": 48621.8984375, + "learning_rate": 9.441938325310019e-05, + "loss": 2.3288, + "step": 3045 + }, + { + "epoch": 0.5709465791940018, + "grad_norm": 54062.5390625, + "learning_rate": 9.441577518448344e-05, + "loss": 2.2926, + "step": 3046 + }, + { + "epoch": 0.5711340206185567, + "grad_norm": 49570.1171875, + "learning_rate": 9.441216601885463e-05, + "loss": 2.3871, + "step": 3047 + }, + { + "epoch": 0.5713214620431115, + "grad_norm": 47401.33203125, + "learning_rate": 9.44085557563029e-05, + "loss": 2.3745, + "step": 3048 + }, + { + "epoch": 0.5715089034676664, + "grad_norm": 50360.21875, + "learning_rate": 9.440494439691743e-05, + "loss": 2.2886, + "step": 3049 + }, + { + "epoch": 0.5716963448922212, + "grad_norm": 53059.6796875, + "learning_rate": 9.440133194078739e-05, + "loss": 2.399, + "step": 3050 + }, + { + "epoch": 0.571883786316776, + "grad_norm": 58036.1171875, + "learning_rate": 9.439771838800202e-05, + "loss": 2.4006, + "step": 3051 + }, + { + "epoch": 0.5720712277413308, + "grad_norm": 51160.28125, + "learning_rate": 9.439410373865058e-05, + "loss": 2.3002, + "step": 3052 + }, + { + "epoch": 0.5722586691658856, + "grad_norm": 52290.5, + "learning_rate": 9.439048799282234e-05, + "loss": 2.3168, + "step": 3053 + }, + { + "epoch": 0.5724461105904405, + "grad_norm": 50241.67578125, + "learning_rate": 9.438687115060658e-05, + "loss": 2.3655, + "step": 3054 + }, + { + "epoch": 0.5726335520149953, + "grad_norm": 51192.09375, + "learning_rate": 9.438325321209267e-05, + "loss": 2.3685, + "step": 3055 + }, + { + "epoch": 0.5728209934395502, + "grad_norm": 49574.8359375, + "learning_rate": 9.437963417736993e-05, + "loss": 2.3055, + "step": 3056 + }, + { + "epoch": 0.573008434864105, + "grad_norm": 50838.51953125, + "learning_rate": 9.437601404652778e-05, + "loss": 2.1943, + "step": 3057 + }, + { + "epoch": 0.5731958762886598, + "grad_norm": 55511.1484375, + "learning_rate": 9.437239281965563e-05, + "loss": 2.2592, + "step": 3058 + }, + { + "epoch": 0.5733833177132146, + "grad_norm": 48984.3984375, + "learning_rate": 9.436877049684287e-05, + "loss": 2.3479, + "step": 3059 + }, + { + "epoch": 0.5735707591377694, + "grad_norm": 47575.546875, + "learning_rate": 9.436514707817903e-05, + "loss": 2.3533, + "step": 3060 + }, + { + "epoch": 0.5737582005623243, + "grad_norm": 53592.8515625, + "learning_rate": 9.436152256375357e-05, + "loss": 2.3155, + "step": 3061 + }, + { + "epoch": 0.5739456419868791, + "grad_norm": 49339.0703125, + "learning_rate": 9.435789695365602e-05, + "loss": 2.3019, + "step": 3062 + }, + { + "epoch": 0.5741330834114339, + "grad_norm": 46096.8203125, + "learning_rate": 9.435427024797592e-05, + "loss": 2.2942, + "step": 3063 + }, + { + "epoch": 0.5743205248359887, + "grad_norm": 51568.7578125, + "learning_rate": 9.435064244680285e-05, + "loss": 2.3809, + "step": 3064 + }, + { + "epoch": 0.5745079662605436, + "grad_norm": 47701.01171875, + "learning_rate": 9.434701355022642e-05, + "loss": 2.2995, + "step": 3065 + }, + { + "epoch": 0.5746954076850984, + "grad_norm": 50362.58203125, + "learning_rate": 9.434338355833626e-05, + "loss": 2.3877, + "step": 3066 + }, + { + "epoch": 0.5748828491096533, + "grad_norm": 48166.89453125, + "learning_rate": 9.433975247122202e-05, + "loss": 2.3951, + "step": 3067 + }, + { + "epoch": 0.5750702905342081, + "grad_norm": 50386.42578125, + "learning_rate": 9.433612028897336e-05, + "loss": 2.2869, + "step": 3068 + }, + { + "epoch": 0.5752577319587628, + "grad_norm": 48447.98046875, + "learning_rate": 9.433248701168002e-05, + "loss": 2.2966, + "step": 3069 + }, + { + "epoch": 0.5754451733833177, + "grad_norm": 48350.77734375, + "learning_rate": 9.432885263943172e-05, + "loss": 2.405, + "step": 3070 + }, + { + "epoch": 0.5756326148078725, + "grad_norm": 49348.4609375, + "learning_rate": 9.432521717231826e-05, + "loss": 2.3405, + "step": 3071 + }, + { + "epoch": 0.5758200562324274, + "grad_norm": 52721.53515625, + "learning_rate": 9.432158061042938e-05, + "loss": 2.3306, + "step": 3072 + }, + { + "epoch": 0.5760074976569822, + "grad_norm": 54774.1640625, + "learning_rate": 9.431794295385494e-05, + "loss": 2.3466, + "step": 3073 + }, + { + "epoch": 0.5761949390815371, + "grad_norm": 50619.359375, + "learning_rate": 9.431430420268475e-05, + "loss": 2.3382, + "step": 3074 + }, + { + "epoch": 0.5763823805060918, + "grad_norm": 52618.65625, + "learning_rate": 9.431066435700872e-05, + "loss": 2.359, + "step": 3075 + }, + { + "epoch": 0.5765698219306467, + "grad_norm": 48974.20703125, + "learning_rate": 9.430702341691671e-05, + "loss": 2.2917, + "step": 3076 + }, + { + "epoch": 0.5767572633552015, + "grad_norm": 52608.171875, + "learning_rate": 9.430338138249868e-05, + "loss": 2.3196, + "step": 3077 + }, + { + "epoch": 0.5769447047797563, + "grad_norm": 51585.109375, + "learning_rate": 9.429973825384457e-05, + "loss": 2.3174, + "step": 3078 + }, + { + "epoch": 0.5771321462043112, + "grad_norm": 52683.32421875, + "learning_rate": 9.429609403104435e-05, + "loss": 2.2914, + "step": 3079 + }, + { + "epoch": 0.5773195876288659, + "grad_norm": 52016.734375, + "learning_rate": 9.429244871418805e-05, + "loss": 2.3462, + "step": 3080 + }, + { + "epoch": 0.5775070290534208, + "grad_norm": 51056.578125, + "learning_rate": 9.428880230336569e-05, + "loss": 2.4124, + "step": 3081 + }, + { + "epoch": 0.5776944704779756, + "grad_norm": 49690.9921875, + "learning_rate": 9.428515479866733e-05, + "loss": 2.3324, + "step": 3082 + }, + { + "epoch": 0.5778819119025305, + "grad_norm": 50853.29296875, + "learning_rate": 9.428150620018306e-05, + "loss": 2.3185, + "step": 3083 + }, + { + "epoch": 0.5780693533270853, + "grad_norm": 57983.01171875, + "learning_rate": 9.427785650800301e-05, + "loss": 2.3124, + "step": 3084 + }, + { + "epoch": 0.5782567947516402, + "grad_norm": 53734.0234375, + "learning_rate": 9.42742057222173e-05, + "loss": 2.3054, + "step": 3085 + }, + { + "epoch": 0.5784442361761949, + "grad_norm": 50756.66796875, + "learning_rate": 9.427055384291613e-05, + "loss": 2.2888, + "step": 3086 + }, + { + "epoch": 0.5786316776007497, + "grad_norm": 49869.046875, + "learning_rate": 9.426690087018965e-05, + "loss": 2.3493, + "step": 3087 + }, + { + "epoch": 0.5788191190253046, + "grad_norm": 48053.35546875, + "learning_rate": 9.426324680412812e-05, + "loss": 2.2999, + "step": 3088 + }, + { + "epoch": 0.5790065604498594, + "grad_norm": 51660.41796875, + "learning_rate": 9.42595916448218e-05, + "loss": 2.3359, + "step": 3089 + }, + { + "epoch": 0.5791940018744143, + "grad_norm": 48014.4609375, + "learning_rate": 9.425593539236092e-05, + "loss": 2.3079, + "step": 3090 + }, + { + "epoch": 0.5793814432989691, + "grad_norm": 48592.9921875, + "learning_rate": 9.425227804683585e-05, + "loss": 2.4002, + "step": 3091 + }, + { + "epoch": 0.5795688847235239, + "grad_norm": 54204.83203125, + "learning_rate": 9.424861960833687e-05, + "loss": 2.3051, + "step": 3092 + }, + { + "epoch": 0.5797563261480787, + "grad_norm": 50371.6328125, + "learning_rate": 9.424496007695435e-05, + "loss": 2.3173, + "step": 3093 + }, + { + "epoch": 0.5799437675726336, + "grad_norm": 48923.3125, + "learning_rate": 9.424129945277869e-05, + "loss": 2.4015, + "step": 3094 + }, + { + "epoch": 0.5801312089971884, + "grad_norm": 52624.0546875, + "learning_rate": 9.423763773590029e-05, + "loss": 2.3705, + "step": 3095 + }, + { + "epoch": 0.5803186504217432, + "grad_norm": 54574.078125, + "learning_rate": 9.423397492640957e-05, + "loss": 2.3468, + "step": 3096 + }, + { + "epoch": 0.580506091846298, + "grad_norm": 49487.26953125, + "learning_rate": 9.423031102439704e-05, + "loss": 2.3443, + "step": 3097 + }, + { + "epoch": 0.5806935332708528, + "grad_norm": 48980.7265625, + "learning_rate": 9.422664602995318e-05, + "loss": 2.3279, + "step": 3098 + }, + { + "epoch": 0.5808809746954077, + "grad_norm": 50383.421875, + "learning_rate": 9.422297994316849e-05, + "loss": 2.3564, + "step": 3099 + }, + { + "epoch": 0.5810684161199625, + "grad_norm": 51692.83984375, + "learning_rate": 9.421931276413354e-05, + "loss": 2.3545, + "step": 3100 + }, + { + "epoch": 0.5812558575445174, + "grad_norm": 45528.84765625, + "learning_rate": 9.42156444929389e-05, + "loss": 2.2894, + "step": 3101 + }, + { + "epoch": 0.5814432989690722, + "grad_norm": 53599.8203125, + "learning_rate": 9.421197512967515e-05, + "loss": 2.3193, + "step": 3102 + }, + { + "epoch": 0.5816307403936269, + "grad_norm": 48169.63671875, + "learning_rate": 9.420830467443296e-05, + "loss": 2.3399, + "step": 3103 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 51676.875, + "learning_rate": 9.420463312730294e-05, + "loss": 2.3301, + "step": 3104 + }, + { + "epoch": 0.5820056232427366, + "grad_norm": 50960.3203125, + "learning_rate": 9.42009604883758e-05, + "loss": 2.3743, + "step": 3105 + }, + { + "epoch": 0.5821930646672915, + "grad_norm": 53101.79296875, + "learning_rate": 9.419728675774225e-05, + "loss": 2.2944, + "step": 3106 + }, + { + "epoch": 0.5823805060918463, + "grad_norm": 50460.16796875, + "learning_rate": 9.419361193549301e-05, + "loss": 2.3428, + "step": 3107 + }, + { + "epoch": 0.5825679475164012, + "grad_norm": 57918.45703125, + "learning_rate": 9.418993602171886e-05, + "loss": 2.4845, + "step": 3108 + }, + { + "epoch": 0.5827553889409559, + "grad_norm": 60198.2578125, + "learning_rate": 9.418625901651059e-05, + "loss": 2.2287, + "step": 3109 + }, + { + "epoch": 0.5829428303655108, + "grad_norm": 52864.21484375, + "learning_rate": 9.4182580919959e-05, + "loss": 2.3195, + "step": 3110 + }, + { + "epoch": 0.5831302717900656, + "grad_norm": 51906.78125, + "learning_rate": 9.417890173215493e-05, + "loss": 2.3517, + "step": 3111 + }, + { + "epoch": 0.5833177132146204, + "grad_norm": 52430.4453125, + "learning_rate": 9.41752214531893e-05, + "loss": 2.3621, + "step": 3112 + }, + { + "epoch": 0.5835051546391753, + "grad_norm": 52575.1171875, + "learning_rate": 9.417154008315294e-05, + "loss": 2.3195, + "step": 3113 + }, + { + "epoch": 0.58369259606373, + "grad_norm": 50711.7578125, + "learning_rate": 9.416785762213684e-05, + "loss": 2.3022, + "step": 3114 + }, + { + "epoch": 0.5838800374882849, + "grad_norm": 51341.79296875, + "learning_rate": 9.41641740702319e-05, + "loss": 2.3193, + "step": 3115 + }, + { + "epoch": 0.5840674789128397, + "grad_norm": 51024.171875, + "learning_rate": 9.416048942752914e-05, + "loss": 2.3994, + "step": 3116 + }, + { + "epoch": 0.5842549203373946, + "grad_norm": 51512.7109375, + "learning_rate": 9.415680369411953e-05, + "loss": 2.278, + "step": 3117 + }, + { + "epoch": 0.5844423617619494, + "grad_norm": 51688.7578125, + "learning_rate": 9.415311687009412e-05, + "loss": 2.3593, + "step": 3118 + }, + { + "epoch": 0.5846298031865043, + "grad_norm": 49920.12890625, + "learning_rate": 9.414942895554397e-05, + "loss": 2.3685, + "step": 3119 + }, + { + "epoch": 0.584817244611059, + "grad_norm": 56783.91796875, + "learning_rate": 9.414573995056016e-05, + "loss": 2.349, + "step": 3120 + }, + { + "epoch": 0.5850046860356138, + "grad_norm": 49428.20703125, + "learning_rate": 9.414204985523383e-05, + "loss": 2.2703, + "step": 3121 + }, + { + "epoch": 0.5851921274601687, + "grad_norm": 46727.7734375, + "learning_rate": 9.413835866965608e-05, + "loss": 2.3165, + "step": 3122 + }, + { + "epoch": 0.5853795688847235, + "grad_norm": 51472.72265625, + "learning_rate": 9.41346663939181e-05, + "loss": 2.3051, + "step": 3123 + }, + { + "epoch": 0.5855670103092784, + "grad_norm": 50692.4453125, + "learning_rate": 9.41309730281111e-05, + "loss": 2.3642, + "step": 3124 + }, + { + "epoch": 0.5857544517338332, + "grad_norm": 53160.05078125, + "learning_rate": 9.412727857232625e-05, + "loss": 2.3333, + "step": 3125 + }, + { + "epoch": 0.585941893158388, + "grad_norm": 51318.89453125, + "learning_rate": 9.412358302665485e-05, + "loss": 2.2922, + "step": 3126 + }, + { + "epoch": 0.5861293345829428, + "grad_norm": 55089.05078125, + "learning_rate": 9.411988639118815e-05, + "loss": 2.3025, + "step": 3127 + }, + { + "epoch": 0.5863167760074977, + "grad_norm": 48802.8203125, + "learning_rate": 9.411618866601745e-05, + "loss": 2.3716, + "step": 3128 + }, + { + "epoch": 0.5865042174320525, + "grad_norm": 47614.42578125, + "learning_rate": 9.41124898512341e-05, + "loss": 2.2954, + "step": 3129 + }, + { + "epoch": 0.5866916588566073, + "grad_norm": 50440.06640625, + "learning_rate": 9.410878994692947e-05, + "loss": 2.3674, + "step": 3130 + }, + { + "epoch": 0.5868791002811622, + "grad_norm": 51661.6484375, + "learning_rate": 9.410508895319487e-05, + "loss": 2.3677, + "step": 3131 + }, + { + "epoch": 0.5870665417057169, + "grad_norm": 52481.046875, + "learning_rate": 9.410138687012178e-05, + "loss": 2.3395, + "step": 3132 + }, + { + "epoch": 0.5872539831302718, + "grad_norm": 51329.6953125, + "learning_rate": 9.409768369780161e-05, + "loss": 2.3632, + "step": 3133 + }, + { + "epoch": 0.5874414245548266, + "grad_norm": 48356.78515625, + "learning_rate": 9.409397943632583e-05, + "loss": 2.3211, + "step": 3134 + }, + { + "epoch": 0.5876288659793815, + "grad_norm": 47425.62109375, + "learning_rate": 9.409027408578593e-05, + "loss": 2.3521, + "step": 3135 + }, + { + "epoch": 0.5878163074039363, + "grad_norm": 50512.13671875, + "learning_rate": 9.408656764627339e-05, + "loss": 2.2762, + "step": 3136 + }, + { + "epoch": 0.588003748828491, + "grad_norm": 51447.2421875, + "learning_rate": 9.408286011787983e-05, + "loss": 2.2783, + "step": 3137 + }, + { + "epoch": 0.5881911902530459, + "grad_norm": 51953.23046875, + "learning_rate": 9.407915150069676e-05, + "loss": 2.3053, + "step": 3138 + }, + { + "epoch": 0.5883786316776007, + "grad_norm": 46353.3046875, + "learning_rate": 9.40754417948158e-05, + "loss": 2.3577, + "step": 3139 + }, + { + "epoch": 0.5885660731021556, + "grad_norm": 45037.30078125, + "learning_rate": 9.407173100032857e-05, + "loss": 2.3013, + "step": 3140 + }, + { + "epoch": 0.5887535145267104, + "grad_norm": 54412.953125, + "learning_rate": 9.406801911732672e-05, + "loss": 2.3593, + "step": 3141 + }, + { + "epoch": 0.5889409559512653, + "grad_norm": 51118.05859375, + "learning_rate": 9.406430614590195e-05, + "loss": 2.401, + "step": 3142 + }, + { + "epoch": 0.58912839737582, + "grad_norm": 50093.2265625, + "learning_rate": 9.406059208614593e-05, + "loss": 2.3632, + "step": 3143 + }, + { + "epoch": 0.5893158388003749, + "grad_norm": 50235.6796875, + "learning_rate": 9.405687693815041e-05, + "loss": 2.4004, + "step": 3144 + }, + { + "epoch": 0.5895032802249297, + "grad_norm": 56568.7578125, + "learning_rate": 9.405316070200716e-05, + "loss": 2.3354, + "step": 3145 + }, + { + "epoch": 0.5896907216494846, + "grad_norm": 52019.42578125, + "learning_rate": 9.404944337780797e-05, + "loss": 2.3877, + "step": 3146 + }, + { + "epoch": 0.5898781630740394, + "grad_norm": 54326.87890625, + "learning_rate": 9.404572496564462e-05, + "loss": 2.2134, + "step": 3147 + }, + { + "epoch": 0.5900656044985942, + "grad_norm": 56817.38671875, + "learning_rate": 9.404200546560897e-05, + "loss": 2.4258, + "step": 3148 + }, + { + "epoch": 0.590253045923149, + "grad_norm": 47690.9140625, + "learning_rate": 9.403828487779289e-05, + "loss": 2.3086, + "step": 3149 + }, + { + "epoch": 0.5904404873477038, + "grad_norm": 47566.41796875, + "learning_rate": 9.403456320228827e-05, + "loss": 2.3153, + "step": 3150 + }, + { + "epoch": 0.5906279287722587, + "grad_norm": 49757.05859375, + "learning_rate": 9.403084043918704e-05, + "loss": 2.3366, + "step": 3151 + }, + { + "epoch": 0.5908153701968135, + "grad_norm": 49192.9921875, + "learning_rate": 9.402711658858111e-05, + "loss": 2.3548, + "step": 3152 + }, + { + "epoch": 0.5910028116213684, + "grad_norm": 49640.046875, + "learning_rate": 9.402339165056252e-05, + "loss": 2.3374, + "step": 3153 + }, + { + "epoch": 0.5911902530459231, + "grad_norm": 47308.2265625, + "learning_rate": 9.401966562522322e-05, + "loss": 2.3327, + "step": 3154 + }, + { + "epoch": 0.5913776944704779, + "grad_norm": 49409.984375, + "learning_rate": 9.401593851265525e-05, + "loss": 2.3302, + "step": 3155 + }, + { + "epoch": 0.5915651358950328, + "grad_norm": 49923.69140625, + "learning_rate": 9.401221031295064e-05, + "loss": 2.4297, + "step": 3156 + }, + { + "epoch": 0.5917525773195876, + "grad_norm": 52229.92578125, + "learning_rate": 9.400848102620152e-05, + "loss": 2.3799, + "step": 3157 + }, + { + "epoch": 0.5919400187441425, + "grad_norm": 49615.3828125, + "learning_rate": 9.400475065249999e-05, + "loss": 2.3273, + "step": 3158 + }, + { + "epoch": 0.5921274601686973, + "grad_norm": 55544.26171875, + "learning_rate": 9.400101919193816e-05, + "loss": 2.314, + "step": 3159 + }, + { + "epoch": 0.5923149015932521, + "grad_norm": 53050.00390625, + "learning_rate": 9.399728664460821e-05, + "loss": 2.2438, + "step": 3160 + }, + { + "epoch": 0.5925023430178069, + "grad_norm": 50886.1171875, + "learning_rate": 9.399355301060231e-05, + "loss": 2.3044, + "step": 3161 + }, + { + "epoch": 0.5926897844423618, + "grad_norm": 49780.33984375, + "learning_rate": 9.398981829001269e-05, + "loss": 2.3254, + "step": 3162 + }, + { + "epoch": 0.5928772258669166, + "grad_norm": 50194.984375, + "learning_rate": 9.39860824829316e-05, + "loss": 2.2785, + "step": 3163 + }, + { + "epoch": 0.5930646672914714, + "grad_norm": 57657.21875, + "learning_rate": 9.39823455894513e-05, + "loss": 2.3223, + "step": 3164 + }, + { + "epoch": 0.5932521087160263, + "grad_norm": 46534.9609375, + "learning_rate": 9.397860760966408e-05, + "loss": 2.3563, + "step": 3165 + }, + { + "epoch": 0.593439550140581, + "grad_norm": 50018.41015625, + "learning_rate": 9.397486854366229e-05, + "loss": 2.3581, + "step": 3166 + }, + { + "epoch": 0.5936269915651359, + "grad_norm": 51359.31640625, + "learning_rate": 9.397112839153823e-05, + "loss": 2.3411, + "step": 3167 + }, + { + "epoch": 0.5938144329896907, + "grad_norm": 51112.69921875, + "learning_rate": 9.396738715338433e-05, + "loss": 2.3509, + "step": 3168 + }, + { + "epoch": 0.5940018744142456, + "grad_norm": 53865.40234375, + "learning_rate": 9.396364482929297e-05, + "loss": 2.2743, + "step": 3169 + }, + { + "epoch": 0.5941893158388004, + "grad_norm": 49329.125, + "learning_rate": 9.395990141935658e-05, + "loss": 2.3057, + "step": 3170 + }, + { + "epoch": 0.5943767572633551, + "grad_norm": 51545.2890625, + "learning_rate": 9.395615692366763e-05, + "loss": 2.4141, + "step": 3171 + }, + { + "epoch": 0.59456419868791, + "grad_norm": 44980.00390625, + "learning_rate": 9.395241134231858e-05, + "loss": 2.3881, + "step": 3172 + }, + { + "epoch": 0.5947516401124648, + "grad_norm": 47074.625, + "learning_rate": 9.394866467540196e-05, + "loss": 2.338, + "step": 3173 + }, + { + "epoch": 0.5949390815370197, + "grad_norm": 51205.19921875, + "learning_rate": 9.39449169230103e-05, + "loss": 2.2541, + "step": 3174 + }, + { + "epoch": 0.5951265229615745, + "grad_norm": 46415.671875, + "learning_rate": 9.394116808523617e-05, + "loss": 2.357, + "step": 3175 + }, + { + "epoch": 0.5953139643861294, + "grad_norm": 47253.33203125, + "learning_rate": 9.393741816217216e-05, + "loss": 2.3363, + "step": 3176 + }, + { + "epoch": 0.5955014058106841, + "grad_norm": 50415.703125, + "learning_rate": 9.393366715391089e-05, + "loss": 2.3236, + "step": 3177 + }, + { + "epoch": 0.595688847235239, + "grad_norm": 48592.68359375, + "learning_rate": 9.3929915060545e-05, + "loss": 2.3447, + "step": 3178 + }, + { + "epoch": 0.5958762886597938, + "grad_norm": 47346.58203125, + "learning_rate": 9.392616188216716e-05, + "loss": 2.3481, + "step": 3179 + }, + { + "epoch": 0.5960637300843487, + "grad_norm": 53912.16796875, + "learning_rate": 9.392240761887008e-05, + "loss": 2.3079, + "step": 3180 + }, + { + "epoch": 0.5962511715089035, + "grad_norm": 48045.78515625, + "learning_rate": 9.391865227074648e-05, + "loss": 2.2984, + "step": 3181 + }, + { + "epoch": 0.5964386129334583, + "grad_norm": 47446.46484375, + "learning_rate": 9.39148958378891e-05, + "loss": 2.3147, + "step": 3182 + }, + { + "epoch": 0.5966260543580131, + "grad_norm": 54138.8984375, + "learning_rate": 9.391113832039072e-05, + "loss": 2.3906, + "step": 3183 + }, + { + "epoch": 0.5968134957825679, + "grad_norm": 53561.7890625, + "learning_rate": 9.390737971834418e-05, + "loss": 2.3446, + "step": 3184 + }, + { + "epoch": 0.5970009372071228, + "grad_norm": 55534.39453125, + "learning_rate": 9.390362003184226e-05, + "loss": 2.3095, + "step": 3185 + }, + { + "epoch": 0.5971883786316776, + "grad_norm": 54813.44921875, + "learning_rate": 9.389985926097786e-05, + "loss": 2.2488, + "step": 3186 + }, + { + "epoch": 0.5973758200562325, + "grad_norm": 48453.84765625, + "learning_rate": 9.389609740584386e-05, + "loss": 2.3425, + "step": 3187 + }, + { + "epoch": 0.5975632614807872, + "grad_norm": 50871.37890625, + "learning_rate": 9.389233446653317e-05, + "loss": 2.3278, + "step": 3188 + }, + { + "epoch": 0.597750702905342, + "grad_norm": 57023.6875, + "learning_rate": 9.388857044313872e-05, + "loss": 2.3582, + "step": 3189 + }, + { + "epoch": 0.5979381443298969, + "grad_norm": 55437.8984375, + "learning_rate": 9.388480533575347e-05, + "loss": 2.2833, + "step": 3190 + }, + { + "epoch": 0.5981255857544517, + "grad_norm": 47374.98046875, + "learning_rate": 9.388103914447044e-05, + "loss": 2.3212, + "step": 3191 + }, + { + "epoch": 0.5983130271790066, + "grad_norm": 49908.49609375, + "learning_rate": 9.387727186938265e-05, + "loss": 2.3097, + "step": 3192 + }, + { + "epoch": 0.5985004686035614, + "grad_norm": 46606.28515625, + "learning_rate": 9.387350351058311e-05, + "loss": 2.3207, + "step": 3193 + }, + { + "epoch": 0.5986879100281162, + "grad_norm": 49728.8125, + "learning_rate": 9.38697340681649e-05, + "loss": 2.2604, + "step": 3194 + }, + { + "epoch": 0.598875351452671, + "grad_norm": 49090.1171875, + "learning_rate": 9.386596354222117e-05, + "loss": 2.4198, + "step": 3195 + }, + { + "epoch": 0.5990627928772259, + "grad_norm": 49798.27734375, + "learning_rate": 9.3862191932845e-05, + "loss": 2.3436, + "step": 3196 + }, + { + "epoch": 0.5992502343017807, + "grad_norm": 53905.62890625, + "learning_rate": 9.385841924012955e-05, + "loss": 2.2975, + "step": 3197 + }, + { + "epoch": 0.5994376757263356, + "grad_norm": 50761.8515625, + "learning_rate": 9.385464546416802e-05, + "loss": 2.3008, + "step": 3198 + }, + { + "epoch": 0.5996251171508904, + "grad_norm": 49473.55078125, + "learning_rate": 9.38508706050536e-05, + "loss": 2.3523, + "step": 3199 + }, + { + "epoch": 0.5998125585754451, + "grad_norm": 50075.87890625, + "learning_rate": 9.384709466287953e-05, + "loss": 2.285, + "step": 3200 + }, + { + "epoch": 0.6, + "grad_norm": 48940.4921875, + "learning_rate": 9.384331763773908e-05, + "loss": 2.399, + "step": 3201 + }, + { + "epoch": 0.6001874414245548, + "grad_norm": 51875.2578125, + "learning_rate": 9.383953952972551e-05, + "loss": 2.3256, + "step": 3202 + }, + { + "epoch": 0.6003748828491097, + "grad_norm": 51629.38671875, + "learning_rate": 9.383576033893217e-05, + "loss": 2.4104, + "step": 3203 + }, + { + "epoch": 0.6005623242736645, + "grad_norm": 48072.984375, + "learning_rate": 9.383198006545237e-05, + "loss": 2.321, + "step": 3204 + }, + { + "epoch": 0.6007497656982193, + "grad_norm": 50354.04296875, + "learning_rate": 9.38281987093795e-05, + "loss": 2.3011, + "step": 3205 + }, + { + "epoch": 0.6009372071227741, + "grad_norm": 49142.3125, + "learning_rate": 9.382441627080694e-05, + "loss": 2.3412, + "step": 3206 + }, + { + "epoch": 0.6011246485473289, + "grad_norm": 52910.5078125, + "learning_rate": 9.382063274982813e-05, + "loss": 2.3072, + "step": 3207 + }, + { + "epoch": 0.6013120899718838, + "grad_norm": 46386.97265625, + "learning_rate": 9.381684814653648e-05, + "loss": 2.3627, + "step": 3208 + }, + { + "epoch": 0.6014995313964386, + "grad_norm": 49114.7578125, + "learning_rate": 9.381306246102552e-05, + "loss": 2.2782, + "step": 3209 + }, + { + "epoch": 0.6016869728209935, + "grad_norm": 51733.20703125, + "learning_rate": 9.380927569338871e-05, + "loss": 2.3109, + "step": 3210 + }, + { + "epoch": 0.6018744142455482, + "grad_norm": 47473.76171875, + "learning_rate": 9.380548784371959e-05, + "loss": 2.3339, + "step": 3211 + }, + { + "epoch": 0.6020618556701031, + "grad_norm": 48195.734375, + "learning_rate": 9.38016989121117e-05, + "loss": 2.3537, + "step": 3212 + }, + { + "epoch": 0.6022492970946579, + "grad_norm": 53962.40625, + "learning_rate": 9.379790889865864e-05, + "loss": 2.2806, + "step": 3213 + }, + { + "epoch": 0.6024367385192128, + "grad_norm": 51986.5, + "learning_rate": 9.379411780345403e-05, + "loss": 2.3742, + "step": 3214 + }, + { + "epoch": 0.6026241799437676, + "grad_norm": 53346.328125, + "learning_rate": 9.37903256265915e-05, + "loss": 2.3117, + "step": 3215 + }, + { + "epoch": 0.6028116213683224, + "grad_norm": 47876.63671875, + "learning_rate": 9.378653236816467e-05, + "loss": 2.3642, + "step": 3216 + }, + { + "epoch": 0.6029990627928772, + "grad_norm": 55764.2109375, + "learning_rate": 9.378273802826727e-05, + "loss": 2.343, + "step": 3217 + }, + { + "epoch": 0.603186504217432, + "grad_norm": 52842.40625, + "learning_rate": 9.377894260699302e-05, + "loss": 2.3264, + "step": 3218 + }, + { + "epoch": 0.6033739456419869, + "grad_norm": 49290.88671875, + "learning_rate": 9.377514610443565e-05, + "loss": 2.3025, + "step": 3219 + }, + { + "epoch": 0.6035613870665417, + "grad_norm": 53198.88671875, + "learning_rate": 9.377134852068892e-05, + "loss": 2.2877, + "step": 3220 + }, + { + "epoch": 0.6037488284910966, + "grad_norm": 50934.390625, + "learning_rate": 9.376754985584662e-05, + "loss": 2.3802, + "step": 3221 + }, + { + "epoch": 0.6039362699156514, + "grad_norm": 51120.3515625, + "learning_rate": 9.376375011000258e-05, + "loss": 2.3326, + "step": 3222 + }, + { + "epoch": 0.6041237113402061, + "grad_norm": 48768.8828125, + "learning_rate": 9.375994928325067e-05, + "loss": 2.3635, + "step": 3223 + }, + { + "epoch": 0.604311152764761, + "grad_norm": 48620.66796875, + "learning_rate": 9.375614737568475e-05, + "loss": 2.3393, + "step": 3224 + }, + { + "epoch": 0.6044985941893158, + "grad_norm": 53361.25390625, + "learning_rate": 9.375234438739872e-05, + "loss": 2.3841, + "step": 3225 + }, + { + "epoch": 0.6046860356138707, + "grad_norm": 47289.87890625, + "learning_rate": 9.37485403184865e-05, + "loss": 2.4009, + "step": 3226 + }, + { + "epoch": 0.6048734770384255, + "grad_norm": 49627.12109375, + "learning_rate": 9.374473516904206e-05, + "loss": 2.3781, + "step": 3227 + }, + { + "epoch": 0.6050609184629803, + "grad_norm": 52330.77734375, + "learning_rate": 9.374092893915938e-05, + "loss": 2.3024, + "step": 3228 + }, + { + "epoch": 0.6052483598875351, + "grad_norm": 49768.78125, + "learning_rate": 9.373712162893247e-05, + "loss": 2.309, + "step": 3229 + }, + { + "epoch": 0.60543580131209, + "grad_norm": 50629.05078125, + "learning_rate": 9.373331323845535e-05, + "loss": 2.3202, + "step": 3230 + }, + { + "epoch": 0.6056232427366448, + "grad_norm": 48947.4921875, + "learning_rate": 9.372950376782209e-05, + "loss": 2.3604, + "step": 3231 + }, + { + "epoch": 0.6058106841611997, + "grad_norm": 46776.11328125, + "learning_rate": 9.372569321712678e-05, + "loss": 2.3626, + "step": 3232 + }, + { + "epoch": 0.6059981255857545, + "grad_norm": 50887.60546875, + "learning_rate": 9.372188158646356e-05, + "loss": 2.3558, + "step": 3233 + }, + { + "epoch": 0.6061855670103092, + "grad_norm": 50444.76171875, + "learning_rate": 9.371806887592654e-05, + "loss": 2.3725, + "step": 3234 + }, + { + "epoch": 0.6063730084348641, + "grad_norm": 54788.5546875, + "learning_rate": 9.371425508560992e-05, + "loss": 2.3499, + "step": 3235 + }, + { + "epoch": 0.6065604498594189, + "grad_norm": 55221.15625, + "learning_rate": 9.371044021560784e-05, + "loss": 2.3272, + "step": 3236 + }, + { + "epoch": 0.6067478912839738, + "grad_norm": 49242.5625, + "learning_rate": 9.370662426601457e-05, + "loss": 2.3306, + "step": 3237 + }, + { + "epoch": 0.6069353327085286, + "grad_norm": 47777.6328125, + "learning_rate": 9.370280723692436e-05, + "loss": 2.3731, + "step": 3238 + }, + { + "epoch": 0.6071227741330835, + "grad_norm": 52280.25390625, + "learning_rate": 9.369898912843147e-05, + "loss": 2.3091, + "step": 3239 + }, + { + "epoch": 0.6073102155576382, + "grad_norm": 44979.30859375, + "learning_rate": 9.369516994063019e-05, + "loss": 2.3653, + "step": 3240 + }, + { + "epoch": 0.607497656982193, + "grad_norm": 49142.5078125, + "learning_rate": 9.369134967361489e-05, + "loss": 2.369, + "step": 3241 + }, + { + "epoch": 0.6076850984067479, + "grad_norm": 52369.796875, + "learning_rate": 9.368752832747987e-05, + "loss": 2.3535, + "step": 3242 + }, + { + "epoch": 0.6078725398313027, + "grad_norm": 49049.0546875, + "learning_rate": 9.368370590231957e-05, + "loss": 2.3772, + "step": 3243 + }, + { + "epoch": 0.6080599812558576, + "grad_norm": 46684.15234375, + "learning_rate": 9.367988239822835e-05, + "loss": 2.3666, + "step": 3244 + }, + { + "epoch": 0.6082474226804123, + "grad_norm": 46937.74609375, + "learning_rate": 9.367605781530067e-05, + "loss": 2.3425, + "step": 3245 + }, + { + "epoch": 0.6084348641049672, + "grad_norm": 53421.734375, + "learning_rate": 9.367223215363099e-05, + "loss": 2.3439, + "step": 3246 + }, + { + "epoch": 0.608622305529522, + "grad_norm": 50301.66796875, + "learning_rate": 9.366840541331381e-05, + "loss": 2.4145, + "step": 3247 + }, + { + "epoch": 0.6088097469540769, + "grad_norm": 48702.44921875, + "learning_rate": 9.366457759444362e-05, + "loss": 2.4117, + "step": 3248 + }, + { + "epoch": 0.6089971883786317, + "grad_norm": 49330.40625, + "learning_rate": 9.366074869711497e-05, + "loss": 2.2777, + "step": 3249 + }, + { + "epoch": 0.6091846298031866, + "grad_norm": 45596.04296875, + "learning_rate": 9.365691872142245e-05, + "loss": 2.3107, + "step": 3250 + }, + { + "epoch": 0.6093720712277413, + "grad_norm": 48152.17578125, + "learning_rate": 9.365308766746063e-05, + "loss": 2.3165, + "step": 3251 + }, + { + "epoch": 0.6095595126522961, + "grad_norm": 48613.52734375, + "learning_rate": 9.364925553532414e-05, + "loss": 2.3812, + "step": 3252 + }, + { + "epoch": 0.609746954076851, + "grad_norm": 46721.26171875, + "learning_rate": 9.364542232510764e-05, + "loss": 2.3114, + "step": 3253 + }, + { + "epoch": 0.6099343955014058, + "grad_norm": 52360.78515625, + "learning_rate": 9.36415880369058e-05, + "loss": 2.3022, + "step": 3254 + }, + { + "epoch": 0.6101218369259607, + "grad_norm": 60631.1640625, + "learning_rate": 9.36377526708133e-05, + "loss": 2.408, + "step": 3255 + }, + { + "epoch": 0.6103092783505155, + "grad_norm": 53423.5703125, + "learning_rate": 9.363391622692489e-05, + "loss": 2.3244, + "step": 3256 + }, + { + "epoch": 0.6104967197750703, + "grad_norm": 48236.20703125, + "learning_rate": 9.363007870533532e-05, + "loss": 2.3327, + "step": 3257 + }, + { + "epoch": 0.6106841611996251, + "grad_norm": 49328.62109375, + "learning_rate": 9.362624010613939e-05, + "loss": 2.2585, + "step": 3258 + }, + { + "epoch": 0.6108716026241799, + "grad_norm": 51551.578125, + "learning_rate": 9.362240042943186e-05, + "loss": 2.4038, + "step": 3259 + }, + { + "epoch": 0.6110590440487348, + "grad_norm": 49786.87109375, + "learning_rate": 9.361855967530763e-05, + "loss": 2.3126, + "step": 3260 + }, + { + "epoch": 0.6112464854732896, + "grad_norm": 53380.10546875, + "learning_rate": 9.361471784386152e-05, + "loss": 2.3253, + "step": 3261 + }, + { + "epoch": 0.6114339268978444, + "grad_norm": 55957.19140625, + "learning_rate": 9.361087493518842e-05, + "loss": 2.3058, + "step": 3262 + }, + { + "epoch": 0.6116213683223992, + "grad_norm": 53537.2734375, + "learning_rate": 9.360703094938326e-05, + "loss": 2.3052, + "step": 3263 + }, + { + "epoch": 0.6118088097469541, + "grad_norm": 50101.39453125, + "learning_rate": 9.360318588654096e-05, + "loss": 2.3282, + "step": 3264 + }, + { + "epoch": 0.6119962511715089, + "grad_norm": 49079.6171875, + "learning_rate": 9.35993397467565e-05, + "loss": 2.3783, + "step": 3265 + }, + { + "epoch": 0.6121836925960638, + "grad_norm": 50576.078125, + "learning_rate": 9.359549253012488e-05, + "loss": 2.3341, + "step": 3266 + }, + { + "epoch": 0.6123711340206186, + "grad_norm": 52778.56640625, + "learning_rate": 9.359164423674112e-05, + "loss": 2.2634, + "step": 3267 + }, + { + "epoch": 0.6125585754451733, + "grad_norm": 53063.0625, + "learning_rate": 9.358779486670028e-05, + "loss": 2.3908, + "step": 3268 + }, + { + "epoch": 0.6127460168697282, + "grad_norm": 50162.75, + "learning_rate": 9.358394442009739e-05, + "loss": 2.3442, + "step": 3269 + }, + { + "epoch": 0.612933458294283, + "grad_norm": 54680.9296875, + "learning_rate": 9.35800928970276e-05, + "loss": 2.278, + "step": 3270 + }, + { + "epoch": 0.6131208997188379, + "grad_norm": 47015.75, + "learning_rate": 9.3576240297586e-05, + "loss": 2.3615, + "step": 3271 + }, + { + "epoch": 0.6133083411433927, + "grad_norm": 47778.33984375, + "learning_rate": 9.357238662186779e-05, + "loss": 2.3145, + "step": 3272 + }, + { + "epoch": 0.6134957825679476, + "grad_norm": 50387.13671875, + "learning_rate": 9.356853186996811e-05, + "loss": 2.2765, + "step": 3273 + }, + { + "epoch": 0.6136832239925023, + "grad_norm": 48117.80859375, + "learning_rate": 9.356467604198217e-05, + "loss": 2.3583, + "step": 3274 + }, + { + "epoch": 0.6138706654170571, + "grad_norm": 53511.52734375, + "learning_rate": 9.356081913800522e-05, + "loss": 2.391, + "step": 3275 + }, + { + "epoch": 0.614058106841612, + "grad_norm": 46078.90234375, + "learning_rate": 9.355696115813252e-05, + "loss": 2.2789, + "step": 3276 + }, + { + "epoch": 0.6142455482661668, + "grad_norm": 51145.79296875, + "learning_rate": 9.355310210245934e-05, + "loss": 2.3398, + "step": 3277 + }, + { + "epoch": 0.6144329896907217, + "grad_norm": 48547.5703125, + "learning_rate": 9.354924197108101e-05, + "loss": 2.2408, + "step": 3278 + }, + { + "epoch": 0.6146204311152764, + "grad_norm": 50110.94140625, + "learning_rate": 9.354538076409288e-05, + "loss": 2.4018, + "step": 3279 + }, + { + "epoch": 0.6148078725398313, + "grad_norm": 51752.74609375, + "learning_rate": 9.354151848159028e-05, + "loss": 2.3833, + "step": 3280 + }, + { + "epoch": 0.6149953139643861, + "grad_norm": 54505.69140625, + "learning_rate": 9.353765512366865e-05, + "loss": 2.2683, + "step": 3281 + }, + { + "epoch": 0.615182755388941, + "grad_norm": 51154.91796875, + "learning_rate": 9.353379069042336e-05, + "loss": 2.3812, + "step": 3282 + }, + { + "epoch": 0.6153701968134958, + "grad_norm": 49726.5234375, + "learning_rate": 9.35299251819499e-05, + "loss": 2.3655, + "step": 3283 + }, + { + "epoch": 0.6155576382380507, + "grad_norm": 50879.875, + "learning_rate": 9.352605859834372e-05, + "loss": 2.3279, + "step": 3284 + }, + { + "epoch": 0.6157450796626054, + "grad_norm": 54211.6875, + "learning_rate": 9.352219093970033e-05, + "loss": 2.3233, + "step": 3285 + }, + { + "epoch": 0.6159325210871602, + "grad_norm": 49815.53125, + "learning_rate": 9.351832220611524e-05, + "loss": 2.3147, + "step": 3286 + }, + { + "epoch": 0.6161199625117151, + "grad_norm": 54104.18359375, + "learning_rate": 9.351445239768402e-05, + "loss": 2.3531, + "step": 3287 + }, + { + "epoch": 0.6163074039362699, + "grad_norm": 50042.890625, + "learning_rate": 9.351058151450225e-05, + "loss": 2.3793, + "step": 3288 + }, + { + "epoch": 0.6164948453608248, + "grad_norm": 50624.8203125, + "learning_rate": 9.350670955666553e-05, + "loss": 2.3623, + "step": 3289 + }, + { + "epoch": 0.6166822867853796, + "grad_norm": 49209.39453125, + "learning_rate": 9.350283652426948e-05, + "loss": 2.3574, + "step": 3290 + }, + { + "epoch": 0.6168697282099344, + "grad_norm": 51484.26171875, + "learning_rate": 9.349896241740977e-05, + "loss": 2.3537, + "step": 3291 + }, + { + "epoch": 0.6170571696344892, + "grad_norm": 51112.71484375, + "learning_rate": 9.34950872361821e-05, + "loss": 2.2546, + "step": 3292 + }, + { + "epoch": 0.617244611059044, + "grad_norm": 54073.28515625, + "learning_rate": 9.349121098068215e-05, + "loss": 2.3418, + "step": 3293 + }, + { + "epoch": 0.6174320524835989, + "grad_norm": 47705.2734375, + "learning_rate": 9.348733365100568e-05, + "loss": 2.2785, + "step": 3294 + }, + { + "epoch": 0.6176194939081537, + "grad_norm": 51101.17578125, + "learning_rate": 9.348345524724847e-05, + "loss": 2.3315, + "step": 3295 + }, + { + "epoch": 0.6178069353327085, + "grad_norm": 54278.3984375, + "learning_rate": 9.347957576950627e-05, + "loss": 2.3098, + "step": 3296 + }, + { + "epoch": 0.6179943767572633, + "grad_norm": 48060.078125, + "learning_rate": 9.347569521787492e-05, + "loss": 2.3523, + "step": 3297 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 54394.97265625, + "learning_rate": 9.347181359245027e-05, + "loss": 2.3388, + "step": 3298 + }, + { + "epoch": 0.618369259606373, + "grad_norm": 52119.21484375, + "learning_rate": 9.346793089332819e-05, + "loss": 2.4092, + "step": 3299 + }, + { + "epoch": 0.6185567010309279, + "grad_norm": 50790.1953125, + "learning_rate": 9.346404712060458e-05, + "loss": 2.3065, + "step": 3300 + }, + { + "epoch": 0.6187441424554827, + "grad_norm": 51246.0390625, + "learning_rate": 9.346016227437535e-05, + "loss": 2.3699, + "step": 3301 + }, + { + "epoch": 0.6189315838800374, + "grad_norm": 50657.5, + "learning_rate": 9.345627635473647e-05, + "loss": 2.3627, + "step": 3302 + }, + { + "epoch": 0.6191190253045923, + "grad_norm": 51855.03125, + "learning_rate": 9.345238936178389e-05, + "loss": 2.3226, + "step": 3303 + }, + { + "epoch": 0.6193064667291471, + "grad_norm": 46988.4140625, + "learning_rate": 9.344850129561364e-05, + "loss": 2.342, + "step": 3304 + }, + { + "epoch": 0.619493908153702, + "grad_norm": 50511.52734375, + "learning_rate": 9.344461215632172e-05, + "loss": 2.3201, + "step": 3305 + }, + { + "epoch": 0.6196813495782568, + "grad_norm": 46885.98046875, + "learning_rate": 9.344072194400423e-05, + "loss": 2.3378, + "step": 3306 + }, + { + "epoch": 0.6198687910028117, + "grad_norm": 48278.90625, + "learning_rate": 9.343683065875721e-05, + "loss": 2.308, + "step": 3307 + }, + { + "epoch": 0.6200562324273664, + "grad_norm": 47058.359375, + "learning_rate": 9.34329383006768e-05, + "loss": 2.3551, + "step": 3308 + }, + { + "epoch": 0.6202436738519213, + "grad_norm": 50247.77734375, + "learning_rate": 9.342904486985913e-05, + "loss": 2.2976, + "step": 3309 + }, + { + "epoch": 0.6204311152764761, + "grad_norm": 45861.37890625, + "learning_rate": 9.342515036640036e-05, + "loss": 2.3367, + "step": 3310 + }, + { + "epoch": 0.6206185567010309, + "grad_norm": 50060.96484375, + "learning_rate": 9.342125479039668e-05, + "loss": 2.3234, + "step": 3311 + }, + { + "epoch": 0.6208059981255858, + "grad_norm": 55207.8984375, + "learning_rate": 9.341735814194429e-05, + "loss": 2.356, + "step": 3312 + }, + { + "epoch": 0.6209934395501406, + "grad_norm": 47623.05859375, + "learning_rate": 9.341346042113946e-05, + "loss": 2.3165, + "step": 3313 + }, + { + "epoch": 0.6211808809746954, + "grad_norm": 55868.05859375, + "learning_rate": 9.340956162807842e-05, + "loss": 2.2824, + "step": 3314 + }, + { + "epoch": 0.6213683223992502, + "grad_norm": 49419.75390625, + "learning_rate": 9.340566176285752e-05, + "loss": 2.3412, + "step": 3315 + }, + { + "epoch": 0.6215557638238051, + "grad_norm": 53309.765625, + "learning_rate": 9.340176082557303e-05, + "loss": 2.341, + "step": 3316 + }, + { + "epoch": 0.6217432052483599, + "grad_norm": 53711.77734375, + "learning_rate": 9.339785881632132e-05, + "loss": 2.2746, + "step": 3317 + }, + { + "epoch": 0.6219306466729148, + "grad_norm": 50783.93359375, + "learning_rate": 9.339395573519875e-05, + "loss": 2.3408, + "step": 3318 + }, + { + "epoch": 0.6221180880974695, + "grad_norm": 54527.5546875, + "learning_rate": 9.339005158230177e-05, + "loss": 2.208, + "step": 3319 + }, + { + "epoch": 0.6223055295220243, + "grad_norm": 53153.515625, + "learning_rate": 9.338614635772675e-05, + "loss": 2.3227, + "step": 3320 + }, + { + "epoch": 0.6224929709465792, + "grad_norm": 49198.5078125, + "learning_rate": 9.338224006157017e-05, + "loss": 2.2809, + "step": 3321 + }, + { + "epoch": 0.622680412371134, + "grad_norm": 49228.34765625, + "learning_rate": 9.33783326939285e-05, + "loss": 2.2725, + "step": 3322 + }, + { + "epoch": 0.6228678537956889, + "grad_norm": 53602.96484375, + "learning_rate": 9.337442425489827e-05, + "loss": 2.355, + "step": 3323 + }, + { + "epoch": 0.6230552952202437, + "grad_norm": 50178.15234375, + "learning_rate": 9.337051474457598e-05, + "loss": 2.3369, + "step": 3324 + }, + { + "epoch": 0.6232427366447985, + "grad_norm": 49958.94140625, + "learning_rate": 9.336660416305822e-05, + "loss": 2.3505, + "step": 3325 + }, + { + "epoch": 0.6234301780693533, + "grad_norm": 49468.1484375, + "learning_rate": 9.336269251044155e-05, + "loss": 2.2539, + "step": 3326 + }, + { + "epoch": 0.6236176194939081, + "grad_norm": 52100.1171875, + "learning_rate": 9.33587797868226e-05, + "loss": 2.2808, + "step": 3327 + }, + { + "epoch": 0.623805060918463, + "grad_norm": 50440.046875, + "learning_rate": 9.3354865992298e-05, + "loss": 2.3132, + "step": 3328 + }, + { + "epoch": 0.6239925023430178, + "grad_norm": 51667.1328125, + "learning_rate": 9.335095112696443e-05, + "loss": 2.3397, + "step": 3329 + }, + { + "epoch": 0.6241799437675727, + "grad_norm": 50539.66796875, + "learning_rate": 9.334703519091856e-05, + "loss": 2.2794, + "step": 3330 + }, + { + "epoch": 0.6243673851921274, + "grad_norm": 50924.74609375, + "learning_rate": 9.334311818425714e-05, + "loss": 2.2794, + "step": 3331 + }, + { + "epoch": 0.6245548266166823, + "grad_norm": 49547.18359375, + "learning_rate": 9.333920010707689e-05, + "loss": 2.3666, + "step": 3332 + }, + { + "epoch": 0.6247422680412371, + "grad_norm": 53834.4609375, + "learning_rate": 9.333528095947457e-05, + "loss": 2.4133, + "step": 3333 + }, + { + "epoch": 0.624929709465792, + "grad_norm": 51940.046875, + "learning_rate": 9.3331360741547e-05, + "loss": 2.3208, + "step": 3334 + }, + { + "epoch": 0.6251171508903468, + "grad_norm": 51438.5078125, + "learning_rate": 9.332743945339102e-05, + "loss": 2.3299, + "step": 3335 + }, + { + "epoch": 0.6253045923149015, + "grad_norm": 53309.28515625, + "learning_rate": 9.332351709510346e-05, + "loss": 2.3776, + "step": 3336 + }, + { + "epoch": 0.6254920337394564, + "grad_norm": 50356.0625, + "learning_rate": 9.331959366678117e-05, + "loss": 2.349, + "step": 3337 + }, + { + "epoch": 0.6256794751640112, + "grad_norm": 52617.75390625, + "learning_rate": 9.33156691685211e-05, + "loss": 2.2728, + "step": 3338 + }, + { + "epoch": 0.6258669165885661, + "grad_norm": 46148.921875, + "learning_rate": 9.331174360042016e-05, + "loss": 2.3299, + "step": 3339 + }, + { + "epoch": 0.6260543580131209, + "grad_norm": 50680.2109375, + "learning_rate": 9.33078169625753e-05, + "loss": 2.3144, + "step": 3340 + }, + { + "epoch": 0.6262417994376758, + "grad_norm": 51634.23046875, + "learning_rate": 9.330388925508351e-05, + "loss": 2.3407, + "step": 3341 + }, + { + "epoch": 0.6264292408622305, + "grad_norm": 52594.25, + "learning_rate": 9.329996047804182e-05, + "loss": 2.3504, + "step": 3342 + }, + { + "epoch": 0.6266166822867854, + "grad_norm": 49347.16796875, + "learning_rate": 9.329603063154722e-05, + "loss": 2.3761, + "step": 3343 + }, + { + "epoch": 0.6268041237113402, + "grad_norm": 49061.2734375, + "learning_rate": 9.329209971569681e-05, + "loss": 2.3539, + "step": 3344 + }, + { + "epoch": 0.626991565135895, + "grad_norm": 51439.24609375, + "learning_rate": 9.328816773058766e-05, + "loss": 2.2607, + "step": 3345 + }, + { + "epoch": 0.6271790065604499, + "grad_norm": 50869.859375, + "learning_rate": 9.328423467631691e-05, + "loss": 2.3895, + "step": 3346 + }, + { + "epoch": 0.6273664479850047, + "grad_norm": 50423.5625, + "learning_rate": 9.328030055298166e-05, + "loss": 2.382, + "step": 3347 + }, + { + "epoch": 0.6275538894095595, + "grad_norm": 50921.83203125, + "learning_rate": 9.327636536067911e-05, + "loss": 2.297, + "step": 3348 + }, + { + "epoch": 0.6277413308341143, + "grad_norm": 54475.90625, + "learning_rate": 9.327242909950644e-05, + "loss": 2.4323, + "step": 3349 + }, + { + "epoch": 0.6279287722586692, + "grad_norm": 51010.67578125, + "learning_rate": 9.326849176956087e-05, + "loss": 2.3223, + "step": 3350 + }, + { + "epoch": 0.628116213683224, + "grad_norm": 51175.01953125, + "learning_rate": 9.326455337093967e-05, + "loss": 2.3042, + "step": 3351 + }, + { + "epoch": 0.6283036551077789, + "grad_norm": 50881.203125, + "learning_rate": 9.326061390374008e-05, + "loss": 2.3129, + "step": 3352 + }, + { + "epoch": 0.6284910965323336, + "grad_norm": 51954.8203125, + "learning_rate": 9.325667336805941e-05, + "loss": 2.3966, + "step": 3353 + }, + { + "epoch": 0.6286785379568884, + "grad_norm": 52071.375, + "learning_rate": 9.3252731763995e-05, + "loss": 2.3268, + "step": 3354 + }, + { + "epoch": 0.6288659793814433, + "grad_norm": 56657.75390625, + "learning_rate": 9.324878909164421e-05, + "loss": 2.3552, + "step": 3355 + }, + { + "epoch": 0.6290534208059981, + "grad_norm": 50005.11328125, + "learning_rate": 9.324484535110439e-05, + "loss": 2.3555, + "step": 3356 + }, + { + "epoch": 0.629240862230553, + "grad_norm": 50317.828125, + "learning_rate": 9.324090054247294e-05, + "loss": 2.4017, + "step": 3357 + }, + { + "epoch": 0.6294283036551078, + "grad_norm": 50203.9765625, + "learning_rate": 9.323695466584732e-05, + "loss": 2.3036, + "step": 3358 + }, + { + "epoch": 0.6296157450796626, + "grad_norm": 56554.25, + "learning_rate": 9.323300772132497e-05, + "loss": 2.3551, + "step": 3359 + }, + { + "epoch": 0.6298031865042174, + "grad_norm": 52521.3125, + "learning_rate": 9.322905970900339e-05, + "loss": 2.4923, + "step": 3360 + }, + { + "epoch": 0.6299906279287723, + "grad_norm": 55302.3671875, + "learning_rate": 9.32251106289801e-05, + "loss": 2.3679, + "step": 3361 + }, + { + "epoch": 0.6301780693533271, + "grad_norm": 49917.66796875, + "learning_rate": 9.322116048135259e-05, + "loss": 2.3706, + "step": 3362 + }, + { + "epoch": 0.6303655107778819, + "grad_norm": 50246.1484375, + "learning_rate": 9.321720926621847e-05, + "loss": 2.2635, + "step": 3363 + }, + { + "epoch": 0.6305529522024368, + "grad_norm": 56386.359375, + "learning_rate": 9.321325698367531e-05, + "loss": 2.3992, + "step": 3364 + }, + { + "epoch": 0.6307403936269915, + "grad_norm": 49250.44921875, + "learning_rate": 9.320930363382073e-05, + "loss": 2.4144, + "step": 3365 + }, + { + "epoch": 0.6309278350515464, + "grad_norm": 47384.09375, + "learning_rate": 9.320534921675237e-05, + "loss": 2.265, + "step": 3366 + }, + { + "epoch": 0.6311152764761012, + "grad_norm": 49426.484375, + "learning_rate": 9.320139373256792e-05, + "loss": 2.2881, + "step": 3367 + }, + { + "epoch": 0.6313027179006561, + "grad_norm": 52060.0390625, + "learning_rate": 9.319743718136503e-05, + "loss": 2.4465, + "step": 3368 + }, + { + "epoch": 0.6314901593252109, + "grad_norm": 52377.65234375, + "learning_rate": 9.319347956324145e-05, + "loss": 2.3521, + "step": 3369 + }, + { + "epoch": 0.6316776007497656, + "grad_norm": 47881.171875, + "learning_rate": 9.318952087829493e-05, + "loss": 2.3486, + "step": 3370 + }, + { + "epoch": 0.6318650421743205, + "grad_norm": 46414.82421875, + "learning_rate": 9.318556112662324e-05, + "loss": 2.3488, + "step": 3371 + }, + { + "epoch": 0.6320524835988753, + "grad_norm": 51029.75, + "learning_rate": 9.31816003083242e-05, + "loss": 2.3653, + "step": 3372 + }, + { + "epoch": 0.6322399250234302, + "grad_norm": 49729.06640625, + "learning_rate": 9.317763842349558e-05, + "loss": 2.3797, + "step": 3373 + }, + { + "epoch": 0.632427366447985, + "grad_norm": 65368.62890625, + "learning_rate": 9.31736754722353e-05, + "loss": 2.4144, + "step": 3374 + }, + { + "epoch": 0.6326148078725399, + "grad_norm": 46737.57421875, + "learning_rate": 9.31697114546412e-05, + "loss": 2.3407, + "step": 3375 + }, + { + "epoch": 0.6328022492970946, + "grad_norm": 47838.515625, + "learning_rate": 9.316574637081122e-05, + "loss": 2.3442, + "step": 3376 + }, + { + "epoch": 0.6329896907216495, + "grad_norm": 50758.6796875, + "learning_rate": 9.316178022084323e-05, + "loss": 2.337, + "step": 3377 + }, + { + "epoch": 0.6331771321462043, + "grad_norm": 51845.4921875, + "learning_rate": 9.315781300483526e-05, + "loss": 2.3351, + "step": 3378 + }, + { + "epoch": 0.6333645735707591, + "grad_norm": 51402.6171875, + "learning_rate": 9.315384472288525e-05, + "loss": 2.2931, + "step": 3379 + }, + { + "epoch": 0.633552014995314, + "grad_norm": 52307.52734375, + "learning_rate": 9.314987537509123e-05, + "loss": 2.3531, + "step": 3380 + }, + { + "epoch": 0.6337394564198688, + "grad_norm": 56931.84765625, + "learning_rate": 9.314590496155123e-05, + "loss": 2.3468, + "step": 3381 + }, + { + "epoch": 0.6339268978444236, + "grad_norm": 47679.1484375, + "learning_rate": 9.314193348236332e-05, + "loss": 2.3405, + "step": 3382 + }, + { + "epoch": 0.6341143392689784, + "grad_norm": 55718.43359375, + "learning_rate": 9.313796093762559e-05, + "loss": 2.2484, + "step": 3383 + }, + { + "epoch": 0.6343017806935333, + "grad_norm": 49272.48046875, + "learning_rate": 9.313398732743615e-05, + "loss": 2.3254, + "step": 3384 + }, + { + "epoch": 0.6344892221180881, + "grad_norm": 48805.77734375, + "learning_rate": 9.313001265189313e-05, + "loss": 2.3724, + "step": 3385 + }, + { + "epoch": 0.634676663542643, + "grad_norm": 53525.14453125, + "learning_rate": 9.312603691109475e-05, + "loss": 2.3487, + "step": 3386 + }, + { + "epoch": 0.6348641049671977, + "grad_norm": 52903.12890625, + "learning_rate": 9.312206010513915e-05, + "loss": 2.3497, + "step": 3387 + }, + { + "epoch": 0.6350515463917525, + "grad_norm": 50487.0546875, + "learning_rate": 9.311808223412458e-05, + "loss": 2.3375, + "step": 3388 + }, + { + "epoch": 0.6352389878163074, + "grad_norm": 52040.1484375, + "learning_rate": 9.31141032981493e-05, + "loss": 2.3404, + "step": 3389 + }, + { + "epoch": 0.6354264292408622, + "grad_norm": 49964.796875, + "learning_rate": 9.311012329731154e-05, + "loss": 2.3371, + "step": 3390 + }, + { + "epoch": 0.6356138706654171, + "grad_norm": 50322.42578125, + "learning_rate": 9.310614223170963e-05, + "loss": 2.3359, + "step": 3391 + }, + { + "epoch": 0.6358013120899719, + "grad_norm": 48892.44140625, + "learning_rate": 9.31021601014419e-05, + "loss": 2.2688, + "step": 3392 + }, + { + "epoch": 0.6359887535145267, + "grad_norm": 53425.56640625, + "learning_rate": 9.309817690660671e-05, + "loss": 2.3105, + "step": 3393 + }, + { + "epoch": 0.6361761949390815, + "grad_norm": 51621.234375, + "learning_rate": 9.309419264730241e-05, + "loss": 2.3978, + "step": 3394 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 49938.984375, + "learning_rate": 9.309020732362743e-05, + "loss": 2.3535, + "step": 3395 + }, + { + "epoch": 0.6365510777881912, + "grad_norm": 55263.51953125, + "learning_rate": 9.30862209356802e-05, + "loss": 2.2526, + "step": 3396 + }, + { + "epoch": 0.636738519212746, + "grad_norm": 46711.625, + "learning_rate": 9.308223348355917e-05, + "loss": 2.2964, + "step": 3397 + }, + { + "epoch": 0.6369259606373009, + "grad_norm": 48328.2890625, + "learning_rate": 9.307824496736283e-05, + "loss": 2.2861, + "step": 3398 + }, + { + "epoch": 0.6371134020618556, + "grad_norm": 50729.55078125, + "learning_rate": 9.30742553871897e-05, + "loss": 2.2748, + "step": 3399 + }, + { + "epoch": 0.6373008434864105, + "grad_norm": 51126.64453125, + "learning_rate": 9.30702647431383e-05, + "loss": 2.347, + "step": 3400 + }, + { + "epoch": 0.6374882849109653, + "grad_norm": 53508.08984375, + "learning_rate": 9.30662730353072e-05, + "loss": 2.2761, + "step": 3401 + }, + { + "epoch": 0.6376757263355202, + "grad_norm": 50249.44140625, + "learning_rate": 9.3062280263795e-05, + "loss": 2.3584, + "step": 3402 + }, + { + "epoch": 0.637863167760075, + "grad_norm": 48561.09765625, + "learning_rate": 9.305828642870031e-05, + "loss": 2.3072, + "step": 3403 + }, + { + "epoch": 0.6380506091846299, + "grad_norm": 51189.81640625, + "learning_rate": 9.305429153012177e-05, + "loss": 2.2685, + "step": 3404 + }, + { + "epoch": 0.6382380506091846, + "grad_norm": 48424.125, + "learning_rate": 9.305029556815804e-05, + "loss": 2.2914, + "step": 3405 + }, + { + "epoch": 0.6384254920337394, + "grad_norm": 46446.5703125, + "learning_rate": 9.304629854290783e-05, + "loss": 2.3589, + "step": 3406 + }, + { + "epoch": 0.6386129334582943, + "grad_norm": 46114.390625, + "learning_rate": 9.304230045446986e-05, + "loss": 2.2922, + "step": 3407 + }, + { + "epoch": 0.6388003748828491, + "grad_norm": 49974.38671875, + "learning_rate": 9.303830130294288e-05, + "loss": 2.2705, + "step": 3408 + }, + { + "epoch": 0.638987816307404, + "grad_norm": 50284.40234375, + "learning_rate": 9.303430108842564e-05, + "loss": 2.3224, + "step": 3409 + }, + { + "epoch": 0.6391752577319587, + "grad_norm": 50640.01953125, + "learning_rate": 9.303029981101698e-05, + "loss": 2.3466, + "step": 3410 + }, + { + "epoch": 0.6393626991565136, + "grad_norm": 50435.8359375, + "learning_rate": 9.302629747081568e-05, + "loss": 2.3675, + "step": 3411 + }, + { + "epoch": 0.6395501405810684, + "grad_norm": 49388.15234375, + "learning_rate": 9.302229406792064e-05, + "loss": 2.2676, + "step": 3412 + }, + { + "epoch": 0.6397375820056233, + "grad_norm": 50071.5, + "learning_rate": 9.30182896024307e-05, + "loss": 2.3637, + "step": 3413 + }, + { + "epoch": 0.6399250234301781, + "grad_norm": 46887.4609375, + "learning_rate": 9.30142840744448e-05, + "loss": 2.3391, + "step": 3414 + }, + { + "epoch": 0.6401124648547329, + "grad_norm": 52511.6875, + "learning_rate": 9.301027748406183e-05, + "loss": 2.2985, + "step": 3415 + }, + { + "epoch": 0.6402999062792877, + "grad_norm": 49541.7109375, + "learning_rate": 9.300626983138076e-05, + "loss": 2.396, + "step": 3416 + }, + { + "epoch": 0.6404873477038425, + "grad_norm": 50042.84765625, + "learning_rate": 9.300226111650062e-05, + "loss": 2.3426, + "step": 3417 + }, + { + "epoch": 0.6406747891283974, + "grad_norm": 51926.5234375, + "learning_rate": 9.299825133952036e-05, + "loss": 2.2706, + "step": 3418 + }, + { + "epoch": 0.6408622305529522, + "grad_norm": 53102.97265625, + "learning_rate": 9.299424050053904e-05, + "loss": 2.2753, + "step": 3419 + }, + { + "epoch": 0.6410496719775071, + "grad_norm": 47514.484375, + "learning_rate": 9.299022859965573e-05, + "loss": 2.3016, + "step": 3420 + }, + { + "epoch": 0.6412371134020619, + "grad_norm": 45468.796875, + "learning_rate": 9.29862156369695e-05, + "loss": 2.3387, + "step": 3421 + }, + { + "epoch": 0.6414245548266166, + "grad_norm": 51850.0625, + "learning_rate": 9.29822016125795e-05, + "loss": 2.3308, + "step": 3422 + }, + { + "epoch": 0.6416119962511715, + "grad_norm": 50405.8671875, + "learning_rate": 9.297818652658482e-05, + "loss": 2.3354, + "step": 3423 + }, + { + "epoch": 0.6417994376757263, + "grad_norm": 48219.60546875, + "learning_rate": 9.297417037908467e-05, + "loss": 2.3655, + "step": 3424 + }, + { + "epoch": 0.6419868791002812, + "grad_norm": 53494.24609375, + "learning_rate": 9.297015317017823e-05, + "loss": 2.3074, + "step": 3425 + }, + { + "epoch": 0.642174320524836, + "grad_norm": 48970.4765625, + "learning_rate": 9.296613489996471e-05, + "loss": 2.3223, + "step": 3426 + }, + { + "epoch": 0.6423617619493908, + "grad_norm": 45927.73046875, + "learning_rate": 9.296211556854338e-05, + "loss": 2.3021, + "step": 3427 + }, + { + "epoch": 0.6425492033739456, + "grad_norm": 46759.5546875, + "learning_rate": 9.295809517601349e-05, + "loss": 2.3254, + "step": 3428 + }, + { + "epoch": 0.6427366447985005, + "grad_norm": 53536.1015625, + "learning_rate": 9.295407372247432e-05, + "loss": 2.3248, + "step": 3429 + }, + { + "epoch": 0.6429240862230553, + "grad_norm": 52491.90234375, + "learning_rate": 9.295005120802524e-05, + "loss": 2.3554, + "step": 3430 + }, + { + "epoch": 0.6431115276476101, + "grad_norm": 47082.078125, + "learning_rate": 9.294602763276558e-05, + "loss": 2.3119, + "step": 3431 + }, + { + "epoch": 0.643298969072165, + "grad_norm": 50451.9921875, + "learning_rate": 9.294200299679471e-05, + "loss": 2.2759, + "step": 3432 + }, + { + "epoch": 0.6434864104967197, + "grad_norm": 48205.89453125, + "learning_rate": 9.293797730021205e-05, + "loss": 2.3524, + "step": 3433 + }, + { + "epoch": 0.6436738519212746, + "grad_norm": 52823.44140625, + "learning_rate": 9.293395054311704e-05, + "loss": 2.3407, + "step": 3434 + }, + { + "epoch": 0.6438612933458294, + "grad_norm": 52536.109375, + "learning_rate": 9.292992272560909e-05, + "loss": 2.3211, + "step": 3435 + }, + { + "epoch": 0.6440487347703843, + "grad_norm": 48637.5859375, + "learning_rate": 9.292589384778772e-05, + "loss": 2.2929, + "step": 3436 + }, + { + "epoch": 0.6442361761949391, + "grad_norm": 50566.1015625, + "learning_rate": 9.292186390975241e-05, + "loss": 2.3404, + "step": 3437 + }, + { + "epoch": 0.644423617619494, + "grad_norm": 51337.5234375, + "learning_rate": 9.291783291160272e-05, + "loss": 2.3572, + "step": 3438 + }, + { + "epoch": 0.6446110590440487, + "grad_norm": 50187.25, + "learning_rate": 9.291380085343821e-05, + "loss": 2.3003, + "step": 3439 + }, + { + "epoch": 0.6447985004686035, + "grad_norm": 51233.69921875, + "learning_rate": 9.290976773535844e-05, + "loss": 2.3419, + "step": 3440 + }, + { + "epoch": 0.6449859418931584, + "grad_norm": 54013.2734375, + "learning_rate": 9.290573355746306e-05, + "loss": 2.3351, + "step": 3441 + }, + { + "epoch": 0.6451733833177132, + "grad_norm": 49389.98046875, + "learning_rate": 9.290169831985168e-05, + "loss": 2.3726, + "step": 3442 + }, + { + "epoch": 0.6453608247422681, + "grad_norm": 50427.1328125, + "learning_rate": 9.289766202262398e-05, + "loss": 2.3747, + "step": 3443 + }, + { + "epoch": 0.6455482661668228, + "grad_norm": 50428.21875, + "learning_rate": 9.289362466587963e-05, + "loss": 2.3718, + "step": 3444 + }, + { + "epoch": 0.6457357075913777, + "grad_norm": 53468.38671875, + "learning_rate": 9.288958624971838e-05, + "loss": 2.386, + "step": 3445 + }, + { + "epoch": 0.6459231490159325, + "grad_norm": 47783.12109375, + "learning_rate": 9.288554677423994e-05, + "loss": 2.3627, + "step": 3446 + }, + { + "epoch": 0.6461105904404874, + "grad_norm": 50624.140625, + "learning_rate": 9.288150623954411e-05, + "loss": 2.2835, + "step": 3447 + }, + { + "epoch": 0.6462980318650422, + "grad_norm": 50788.734375, + "learning_rate": 9.287746464573066e-05, + "loss": 2.2427, + "step": 3448 + }, + { + "epoch": 0.646485473289597, + "grad_norm": 52111.58203125, + "learning_rate": 9.287342199289944e-05, + "loss": 2.2928, + "step": 3449 + }, + { + "epoch": 0.6466729147141518, + "grad_norm": 53515.328125, + "learning_rate": 9.286937828115028e-05, + "loss": 2.35, + "step": 3450 + }, + { + "epoch": 0.6468603561387066, + "grad_norm": 51022.41796875, + "learning_rate": 9.286533351058304e-05, + "loss": 2.3756, + "step": 3451 + }, + { + "epoch": 0.6470477975632615, + "grad_norm": 53436.51171875, + "learning_rate": 9.286128768129765e-05, + "loss": 2.3002, + "step": 3452 + }, + { + "epoch": 0.6472352389878163, + "grad_norm": 49467.28515625, + "learning_rate": 9.285724079339404e-05, + "loss": 2.312, + "step": 3453 + }, + { + "epoch": 0.6474226804123712, + "grad_norm": 53648.63671875, + "learning_rate": 9.285319284697213e-05, + "loss": 2.3265, + "step": 3454 + }, + { + "epoch": 0.647610121836926, + "grad_norm": 54925.65625, + "learning_rate": 9.284914384213192e-05, + "loss": 2.3053, + "step": 3455 + }, + { + "epoch": 0.6477975632614807, + "grad_norm": 52881.8671875, + "learning_rate": 9.284509377897341e-05, + "loss": 2.3062, + "step": 3456 + }, + { + "epoch": 0.6479850046860356, + "grad_norm": 57929.48828125, + "learning_rate": 9.284104265759663e-05, + "loss": 2.2777, + "step": 3457 + }, + { + "epoch": 0.6481724461105904, + "grad_norm": 49254.55078125, + "learning_rate": 9.283699047810165e-05, + "loss": 2.2779, + "step": 3458 + }, + { + "epoch": 0.6483598875351453, + "grad_norm": 48239.59765625, + "learning_rate": 9.283293724058855e-05, + "loss": 2.3729, + "step": 3459 + }, + { + "epoch": 0.6485473289597001, + "grad_norm": 52329.421875, + "learning_rate": 9.282888294515742e-05, + "loss": 2.3265, + "step": 3460 + }, + { + "epoch": 0.6487347703842549, + "grad_norm": 49519.23046875, + "learning_rate": 9.28248275919084e-05, + "loss": 2.3353, + "step": 3461 + }, + { + "epoch": 0.6489222118088097, + "grad_norm": 50241.90234375, + "learning_rate": 9.282077118094168e-05, + "loss": 2.405, + "step": 3462 + }, + { + "epoch": 0.6491096532333646, + "grad_norm": 51314.0859375, + "learning_rate": 9.281671371235743e-05, + "loss": 2.351, + "step": 3463 + }, + { + "epoch": 0.6492970946579194, + "grad_norm": 50178.75, + "learning_rate": 9.281265518625585e-05, + "loss": 2.3061, + "step": 3464 + }, + { + "epoch": 0.6494845360824743, + "grad_norm": 49040.28515625, + "learning_rate": 9.280859560273721e-05, + "loss": 2.4203, + "step": 3465 + }, + { + "epoch": 0.6496719775070291, + "grad_norm": 51684.375, + "learning_rate": 9.280453496190175e-05, + "loss": 2.3482, + "step": 3466 + }, + { + "epoch": 0.6498594189315838, + "grad_norm": 51926.5703125, + "learning_rate": 9.280047326384979e-05, + "loss": 2.2733, + "step": 3467 + }, + { + "epoch": 0.6500468603561387, + "grad_norm": 51774.4140625, + "learning_rate": 9.279641050868162e-05, + "loss": 2.3714, + "step": 3468 + }, + { + "epoch": 0.6502343017806935, + "grad_norm": 49964.640625, + "learning_rate": 9.279234669649759e-05, + "loss": 2.3916, + "step": 3469 + }, + { + "epoch": 0.6504217432052484, + "grad_norm": 50271.3984375, + "learning_rate": 9.27882818273981e-05, + "loss": 2.2792, + "step": 3470 + }, + { + "epoch": 0.6506091846298032, + "grad_norm": 54322.78125, + "learning_rate": 9.27842159014835e-05, + "loss": 2.3207, + "step": 3471 + }, + { + "epoch": 0.6507966260543581, + "grad_norm": 46012.50390625, + "learning_rate": 9.278014891885425e-05, + "loss": 2.2872, + "step": 3472 + }, + { + "epoch": 0.6509840674789128, + "grad_norm": 50394.87109375, + "learning_rate": 9.277608087961078e-05, + "loss": 2.3156, + "step": 3473 + }, + { + "epoch": 0.6511715089034676, + "grad_norm": 48721.50390625, + "learning_rate": 9.277201178385357e-05, + "loss": 2.2786, + "step": 3474 + }, + { + "epoch": 0.6513589503280225, + "grad_norm": 48321.98828125, + "learning_rate": 9.276794163168314e-05, + "loss": 2.3442, + "step": 3475 + }, + { + "epoch": 0.6515463917525773, + "grad_norm": 45281.0859375, + "learning_rate": 9.276387042319998e-05, + "loss": 2.3022, + "step": 3476 + }, + { + "epoch": 0.6517338331771322, + "grad_norm": 52852.62890625, + "learning_rate": 9.275979815850469e-05, + "loss": 2.2988, + "step": 3477 + }, + { + "epoch": 0.6519212746016869, + "grad_norm": 52142.6640625, + "learning_rate": 9.27557248376978e-05, + "loss": 2.3078, + "step": 3478 + }, + { + "epoch": 0.6521087160262418, + "grad_norm": 46218.76171875, + "learning_rate": 9.275165046087995e-05, + "loss": 2.3598, + "step": 3479 + }, + { + "epoch": 0.6522961574507966, + "grad_norm": 50495.33203125, + "learning_rate": 9.274757502815176e-05, + "loss": 2.3276, + "step": 3480 + }, + { + "epoch": 0.6524835988753515, + "grad_norm": 49064.32421875, + "learning_rate": 9.274349853961389e-05, + "loss": 2.3316, + "step": 3481 + }, + { + "epoch": 0.6526710402999063, + "grad_norm": 48421.6171875, + "learning_rate": 9.273942099536703e-05, + "loss": 2.3924, + "step": 3482 + }, + { + "epoch": 0.6528584817244611, + "grad_norm": 52433.9296875, + "learning_rate": 9.273534239551189e-05, + "loss": 2.2226, + "step": 3483 + }, + { + "epoch": 0.6530459231490159, + "grad_norm": 46883.89453125, + "learning_rate": 9.273126274014918e-05, + "loss": 2.2522, + "step": 3484 + }, + { + "epoch": 0.6532333645735707, + "grad_norm": 49444.15234375, + "learning_rate": 9.272718202937968e-05, + "loss": 2.385, + "step": 3485 + }, + { + "epoch": 0.6534208059981256, + "grad_norm": 47696.203125, + "learning_rate": 9.272310026330421e-05, + "loss": 2.3592, + "step": 3486 + }, + { + "epoch": 0.6536082474226804, + "grad_norm": 52523.94921875, + "learning_rate": 9.271901744202354e-05, + "loss": 2.2576, + "step": 3487 + }, + { + "epoch": 0.6537956888472353, + "grad_norm": 48245.62109375, + "learning_rate": 9.271493356563851e-05, + "loss": 2.2976, + "step": 3488 + }, + { + "epoch": 0.6539831302717901, + "grad_norm": 50861.1015625, + "learning_rate": 9.271084863425002e-05, + "loss": 2.3078, + "step": 3489 + }, + { + "epoch": 0.6541705716963448, + "grad_norm": 58145.4296875, + "learning_rate": 9.270676264795893e-05, + "loss": 2.4413, + "step": 3490 + }, + { + "epoch": 0.6543580131208997, + "grad_norm": 49310.65625, + "learning_rate": 9.270267560686618e-05, + "loss": 2.3431, + "step": 3491 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 48016.94921875, + "learning_rate": 9.269858751107272e-05, + "loss": 2.3712, + "step": 3492 + }, + { + "epoch": 0.6547328959700094, + "grad_norm": 45084.84765625, + "learning_rate": 9.26944983606795e-05, + "loss": 2.3426, + "step": 3493 + }, + { + "epoch": 0.6549203373945642, + "grad_norm": 47701.671875, + "learning_rate": 9.269040815578751e-05, + "loss": 2.3575, + "step": 3494 + }, + { + "epoch": 0.6551077788191191, + "grad_norm": 46813.5390625, + "learning_rate": 9.26863168964978e-05, + "loss": 2.3089, + "step": 3495 + }, + { + "epoch": 0.6552952202436738, + "grad_norm": 51177.21484375, + "learning_rate": 9.268222458291141e-05, + "loss": 2.306, + "step": 3496 + }, + { + "epoch": 0.6554826616682287, + "grad_norm": 52191.49609375, + "learning_rate": 9.26781312151294e-05, + "loss": 2.3931, + "step": 3497 + }, + { + "epoch": 0.6556701030927835, + "grad_norm": 53310.0703125, + "learning_rate": 9.26740367932529e-05, + "loss": 2.2909, + "step": 3498 + }, + { + "epoch": 0.6558575445173384, + "grad_norm": 54664.24609375, + "learning_rate": 9.2669941317383e-05, + "loss": 2.3681, + "step": 3499 + }, + { + "epoch": 0.6560449859418932, + "grad_norm": 53215.5, + "learning_rate": 9.266584478762089e-05, + "loss": 2.2909, + "step": 3500 + }, + { + "epoch": 0.6560449859418932, + "eval_loss": 2.3258614540100098, + "eval_runtime": 130.2057, + "eval_samples_per_second": 38.777, + "eval_steps_per_second": 1.943, + "step": 3500 + }, + { + "epoch": 0.6562324273664479, + "grad_norm": 51697.8671875, + "learning_rate": 9.26617472040677e-05, + "loss": 2.3535, + "step": 3501 + }, + { + "epoch": 0.6564198687910028, + "grad_norm": 49294.5234375, + "learning_rate": 9.26576485668247e-05, + "loss": 2.3889, + "step": 3502 + }, + { + "epoch": 0.6566073102155576, + "grad_norm": 51913.40234375, + "learning_rate": 9.265354887599307e-05, + "loss": 2.3645, + "step": 3503 + }, + { + "epoch": 0.6567947516401125, + "grad_norm": 49777.515625, + "learning_rate": 9.264944813167409e-05, + "loss": 2.3635, + "step": 3504 + }, + { + "epoch": 0.6569821930646673, + "grad_norm": 50460.5078125, + "learning_rate": 9.264534633396902e-05, + "loss": 2.2689, + "step": 3505 + }, + { + "epoch": 0.6571696344892222, + "grad_norm": 49464.0390625, + "learning_rate": 9.26412434829792e-05, + "loss": 2.3161, + "step": 3506 + }, + { + "epoch": 0.6573570759137769, + "grad_norm": 48506.484375, + "learning_rate": 9.263713957880595e-05, + "loss": 2.3828, + "step": 3507 + }, + { + "epoch": 0.6575445173383317, + "grad_norm": 47924.90234375, + "learning_rate": 9.263303462155064e-05, + "loss": 2.3331, + "step": 3508 + }, + { + "epoch": 0.6577319587628866, + "grad_norm": 51792.48828125, + "learning_rate": 9.262892861131464e-05, + "loss": 2.3414, + "step": 3509 + }, + { + "epoch": 0.6579194001874414, + "grad_norm": 49746.53125, + "learning_rate": 9.262482154819938e-05, + "loss": 2.3432, + "step": 3510 + }, + { + "epoch": 0.6581068416119963, + "grad_norm": 50582.34765625, + "learning_rate": 9.262071343230628e-05, + "loss": 2.3801, + "step": 3511 + }, + { + "epoch": 0.6582942830365511, + "grad_norm": 45460.921875, + "learning_rate": 9.261660426373682e-05, + "loss": 2.3801, + "step": 3512 + }, + { + "epoch": 0.6584817244611059, + "grad_norm": 73238.796875, + "learning_rate": 9.26124940425925e-05, + "loss": 2.4107, + "step": 3513 + }, + { + "epoch": 0.6586691658856607, + "grad_norm": 53225.6796875, + "learning_rate": 9.26083827689748e-05, + "loss": 2.3427, + "step": 3514 + }, + { + "epoch": 0.6588566073102156, + "grad_norm": 48487.44921875, + "learning_rate": 9.260427044298532e-05, + "loss": 2.3338, + "step": 3515 + }, + { + "epoch": 0.6590440487347704, + "grad_norm": 50085.125, + "learning_rate": 9.260015706472558e-05, + "loss": 2.4224, + "step": 3516 + }, + { + "epoch": 0.6592314901593253, + "grad_norm": 47186.6171875, + "learning_rate": 9.25960426342972e-05, + "loss": 2.2775, + "step": 3517 + }, + { + "epoch": 0.65941893158388, + "grad_norm": 48533.453125, + "learning_rate": 9.259192715180178e-05, + "loss": 2.3511, + "step": 3518 + }, + { + "epoch": 0.6596063730084348, + "grad_norm": 51717.25390625, + "learning_rate": 9.258781061734099e-05, + "loss": 2.378, + "step": 3519 + }, + { + "epoch": 0.6597938144329897, + "grad_norm": 50434.65234375, + "learning_rate": 9.258369303101648e-05, + "loss": 2.3113, + "step": 3520 + }, + { + "epoch": 0.6599812558575445, + "grad_norm": 51492.05078125, + "learning_rate": 9.257957439292998e-05, + "loss": 2.3615, + "step": 3521 + }, + { + "epoch": 0.6601686972820994, + "grad_norm": 58604.2734375, + "learning_rate": 9.25754547031832e-05, + "loss": 2.5017, + "step": 3522 + }, + { + "epoch": 0.6603561387066542, + "grad_norm": 50743.3515625, + "learning_rate": 9.257133396187786e-05, + "loss": 2.299, + "step": 3523 + }, + { + "epoch": 0.660543580131209, + "grad_norm": 51290.80078125, + "learning_rate": 9.25672121691158e-05, + "loss": 2.2825, + "step": 3524 + }, + { + "epoch": 0.6607310215557638, + "grad_norm": 51050.7890625, + "learning_rate": 9.256308932499877e-05, + "loss": 2.393, + "step": 3525 + }, + { + "epoch": 0.6609184629803186, + "grad_norm": 51858.32421875, + "learning_rate": 9.255896542962863e-05, + "loss": 2.3039, + "step": 3526 + }, + { + "epoch": 0.6611059044048735, + "grad_norm": 47957.44921875, + "learning_rate": 9.255484048310722e-05, + "loss": 2.2515, + "step": 3527 + }, + { + "epoch": 0.6612933458294283, + "grad_norm": 51120.52734375, + "learning_rate": 9.255071448553642e-05, + "loss": 2.3246, + "step": 3528 + }, + { + "epoch": 0.6614807872539832, + "grad_norm": 51581.7421875, + "learning_rate": 9.254658743701814e-05, + "loss": 2.3037, + "step": 3529 + }, + { + "epoch": 0.6616682286785379, + "grad_norm": 48650.38671875, + "learning_rate": 9.254245933765431e-05, + "loss": 2.3638, + "step": 3530 + }, + { + "epoch": 0.6618556701030928, + "grad_norm": 47518.82421875, + "learning_rate": 9.25383301875469e-05, + "loss": 2.3364, + "step": 3531 + }, + { + "epoch": 0.6620431115276476, + "grad_norm": 46063.44921875, + "learning_rate": 9.253419998679787e-05, + "loss": 2.2748, + "step": 3532 + }, + { + "epoch": 0.6622305529522025, + "grad_norm": 58123.1875, + "learning_rate": 9.253006873550925e-05, + "loss": 2.3451, + "step": 3533 + }, + { + "epoch": 0.6624179943767573, + "grad_norm": 51893.91015625, + "learning_rate": 9.25259364337831e-05, + "loss": 2.3062, + "step": 3534 + }, + { + "epoch": 0.662605435801312, + "grad_norm": 55112.56640625, + "learning_rate": 9.252180308172142e-05, + "loss": 2.3208, + "step": 3535 + }, + { + "epoch": 0.6627928772258669, + "grad_norm": 57075.69921875, + "learning_rate": 9.251766867942635e-05, + "loss": 2.3033, + "step": 3536 + }, + { + "epoch": 0.6629803186504217, + "grad_norm": 51464.5, + "learning_rate": 9.251353322699998e-05, + "loss": 2.2768, + "step": 3537 + }, + { + "epoch": 0.6631677600749766, + "grad_norm": 44089.4609375, + "learning_rate": 9.250939672454447e-05, + "loss": 2.3633, + "step": 3538 + }, + { + "epoch": 0.6633552014995314, + "grad_norm": 52927.7578125, + "learning_rate": 9.250525917216197e-05, + "loss": 2.3554, + "step": 3539 + }, + { + "epoch": 0.6635426429240863, + "grad_norm": 47444.0859375, + "learning_rate": 9.250112056995468e-05, + "loss": 2.3311, + "step": 3540 + }, + { + "epoch": 0.663730084348641, + "grad_norm": 62170.984375, + "learning_rate": 9.249698091802482e-05, + "loss": 2.284, + "step": 3541 + }, + { + "epoch": 0.6639175257731958, + "grad_norm": 51152.5234375, + "learning_rate": 9.249284021647462e-05, + "loss": 2.2696, + "step": 3542 + }, + { + "epoch": 0.6641049671977507, + "grad_norm": 48711.45703125, + "learning_rate": 9.248869846540636e-05, + "loss": 2.3209, + "step": 3543 + }, + { + "epoch": 0.6642924086223055, + "grad_norm": 50980.2421875, + "learning_rate": 9.248455566492234e-05, + "loss": 2.2429, + "step": 3544 + }, + { + "epoch": 0.6644798500468604, + "grad_norm": 50389.7578125, + "learning_rate": 9.248041181512488e-05, + "loss": 2.3461, + "step": 3545 + }, + { + "epoch": 0.6646672914714152, + "grad_norm": 52235.1875, + "learning_rate": 9.247626691611633e-05, + "loss": 2.3077, + "step": 3546 + }, + { + "epoch": 0.66485473289597, + "grad_norm": 49043.015625, + "learning_rate": 9.247212096799904e-05, + "loss": 2.3488, + "step": 3547 + }, + { + "epoch": 0.6650421743205248, + "grad_norm": 51272.046875, + "learning_rate": 9.246797397087543e-05, + "loss": 2.3354, + "step": 3548 + }, + { + "epoch": 0.6652296157450797, + "grad_norm": 51912.484375, + "learning_rate": 9.246382592484794e-05, + "loss": 2.3313, + "step": 3549 + }, + { + "epoch": 0.6654170571696345, + "grad_norm": 47977.1796875, + "learning_rate": 9.2459676830019e-05, + "loss": 2.3509, + "step": 3550 + }, + { + "epoch": 0.6656044985941894, + "grad_norm": 46166.0703125, + "learning_rate": 9.245552668649107e-05, + "loss": 2.3881, + "step": 3551 + }, + { + "epoch": 0.6657919400187441, + "grad_norm": 51930.56640625, + "learning_rate": 9.24513754943667e-05, + "loss": 2.4435, + "step": 3552 + }, + { + "epoch": 0.6659793814432989, + "grad_norm": 47587.50390625, + "learning_rate": 9.244722325374839e-05, + "loss": 2.3331, + "step": 3553 + }, + { + "epoch": 0.6661668228678538, + "grad_norm": 48623.91796875, + "learning_rate": 9.24430699647387e-05, + "loss": 2.3232, + "step": 3554 + }, + { + "epoch": 0.6663542642924086, + "grad_norm": 48577.33203125, + "learning_rate": 9.24389156274402e-05, + "loss": 2.3122, + "step": 3555 + }, + { + "epoch": 0.6665417057169635, + "grad_norm": 49942.12109375, + "learning_rate": 9.243476024195552e-05, + "loss": 2.289, + "step": 3556 + }, + { + "epoch": 0.6667291471415183, + "grad_norm": 54323.03125, + "learning_rate": 9.243060380838726e-05, + "loss": 2.3044, + "step": 3557 + }, + { + "epoch": 0.666916588566073, + "grad_norm": 48961.94140625, + "learning_rate": 9.242644632683812e-05, + "loss": 2.2829, + "step": 3558 + }, + { + "epoch": 0.6671040299906279, + "grad_norm": 52508.56640625, + "learning_rate": 9.242228779741076e-05, + "loss": 2.3533, + "step": 3559 + }, + { + "epoch": 0.6672914714151827, + "grad_norm": 50826.1796875, + "learning_rate": 9.241812822020788e-05, + "loss": 2.3823, + "step": 3560 + }, + { + "epoch": 0.6674789128397376, + "grad_norm": 55696.765625, + "learning_rate": 9.241396759533222e-05, + "loss": 2.3517, + "step": 3561 + }, + { + "epoch": 0.6676663542642924, + "grad_norm": 53105.48828125, + "learning_rate": 9.240980592288658e-05, + "loss": 2.3788, + "step": 3562 + }, + { + "epoch": 0.6678537956888473, + "grad_norm": 53184.265625, + "learning_rate": 9.24056432029737e-05, + "loss": 2.3642, + "step": 3563 + }, + { + "epoch": 0.668041237113402, + "grad_norm": 51534.765625, + "learning_rate": 9.240147943569641e-05, + "loss": 2.3287, + "step": 3564 + }, + { + "epoch": 0.6682286785379569, + "grad_norm": 48900.98828125, + "learning_rate": 9.239731462115757e-05, + "loss": 2.2954, + "step": 3565 + }, + { + "epoch": 0.6684161199625117, + "grad_norm": 48815.140625, + "learning_rate": 9.239314875946001e-05, + "loss": 2.3245, + "step": 3566 + }, + { + "epoch": 0.6686035613870666, + "grad_norm": 50492.78125, + "learning_rate": 9.238898185070665e-05, + "loss": 2.2933, + "step": 3567 + }, + { + "epoch": 0.6687910028116214, + "grad_norm": 48240.30859375, + "learning_rate": 9.23848138950004e-05, + "loss": 2.3083, + "step": 3568 + }, + { + "epoch": 0.6689784442361761, + "grad_norm": 44546.3515625, + "learning_rate": 9.238064489244421e-05, + "loss": 2.2776, + "step": 3569 + }, + { + "epoch": 0.669165885660731, + "grad_norm": 50518.65234375, + "learning_rate": 9.2376474843141e-05, + "loss": 2.3349, + "step": 3570 + }, + { + "epoch": 0.6693533270852858, + "grad_norm": 48633.12890625, + "learning_rate": 9.237230374719384e-05, + "loss": 2.3121, + "step": 3571 + }, + { + "epoch": 0.6695407685098407, + "grad_norm": 49174.34375, + "learning_rate": 9.236813160470568e-05, + "loss": 2.3228, + "step": 3572 + }, + { + "epoch": 0.6697282099343955, + "grad_norm": 54111.26953125, + "learning_rate": 9.236395841577962e-05, + "loss": 2.2483, + "step": 3573 + }, + { + "epoch": 0.6699156513589504, + "grad_norm": 51828.52734375, + "learning_rate": 9.235978418051871e-05, + "loss": 2.2773, + "step": 3574 + }, + { + "epoch": 0.6701030927835051, + "grad_norm": 50818.58984375, + "learning_rate": 9.235560889902605e-05, + "loss": 2.3389, + "step": 3575 + }, + { + "epoch": 0.67029053420806, + "grad_norm": 54653.62109375, + "learning_rate": 9.235143257140477e-05, + "loss": 2.2735, + "step": 3576 + }, + { + "epoch": 0.6704779756326148, + "grad_norm": 50639.859375, + "learning_rate": 9.234725519775801e-05, + "loss": 2.434, + "step": 3577 + }, + { + "epoch": 0.6706654170571696, + "grad_norm": 50698.734375, + "learning_rate": 9.234307677818897e-05, + "loss": 2.3201, + "step": 3578 + }, + { + "epoch": 0.6708528584817245, + "grad_norm": 51413.140625, + "learning_rate": 9.233889731280081e-05, + "loss": 2.3373, + "step": 3579 + }, + { + "epoch": 0.6710402999062793, + "grad_norm": 52825.203125, + "learning_rate": 9.233471680169679e-05, + "loss": 2.3177, + "step": 3580 + }, + { + "epoch": 0.6712277413308341, + "grad_norm": 48598.640625, + "learning_rate": 9.233053524498014e-05, + "loss": 2.3226, + "step": 3581 + }, + { + "epoch": 0.6714151827553889, + "grad_norm": 47652.17578125, + "learning_rate": 9.232635264275416e-05, + "loss": 2.3064, + "step": 3582 + }, + { + "epoch": 0.6716026241799438, + "grad_norm": 51025.8671875, + "learning_rate": 9.232216899512214e-05, + "loss": 2.332, + "step": 3583 + }, + { + "epoch": 0.6717900656044986, + "grad_norm": 52702.30078125, + "learning_rate": 9.231798430218744e-05, + "loss": 2.269, + "step": 3584 + }, + { + "epoch": 0.6719775070290535, + "grad_norm": 51525.75, + "learning_rate": 9.231379856405337e-05, + "loss": 2.4031, + "step": 3585 + }, + { + "epoch": 0.6721649484536083, + "grad_norm": 53205.1796875, + "learning_rate": 9.230961178082333e-05, + "loss": 2.2831, + "step": 3586 + }, + { + "epoch": 0.672352389878163, + "grad_norm": 51610.96484375, + "learning_rate": 9.230542395260076e-05, + "loss": 2.3083, + "step": 3587 + }, + { + "epoch": 0.6725398313027179, + "grad_norm": 53448.23828125, + "learning_rate": 9.230123507948905e-05, + "loss": 2.4276, + "step": 3588 + }, + { + "epoch": 0.6727272727272727, + "grad_norm": 49292.71484375, + "learning_rate": 9.229704516159168e-05, + "loss": 2.2919, + "step": 3589 + }, + { + "epoch": 0.6729147141518276, + "grad_norm": 48269.1953125, + "learning_rate": 9.229285419901215e-05, + "loss": 2.358, + "step": 3590 + }, + { + "epoch": 0.6731021555763824, + "grad_norm": 54263.234375, + "learning_rate": 9.228866219185393e-05, + "loss": 2.326, + "step": 3591 + }, + { + "epoch": 0.6732895970009372, + "grad_norm": 51724.96484375, + "learning_rate": 9.22844691402206e-05, + "loss": 2.2629, + "step": 3592 + }, + { + "epoch": 0.673477038425492, + "grad_norm": 51527.90625, + "learning_rate": 9.22802750442157e-05, + "loss": 2.3427, + "step": 3593 + }, + { + "epoch": 0.6736644798500468, + "grad_norm": 47228.6953125, + "learning_rate": 9.227607990394282e-05, + "loss": 2.3568, + "step": 3594 + }, + { + "epoch": 0.6738519212746017, + "grad_norm": 50889.90625, + "learning_rate": 9.227188371950558e-05, + "loss": 2.3607, + "step": 3595 + }, + { + "epoch": 0.6740393626991565, + "grad_norm": 56086.20703125, + "learning_rate": 9.226768649100762e-05, + "loss": 2.3768, + "step": 3596 + }, + { + "epoch": 0.6742268041237114, + "grad_norm": 52102.65625, + "learning_rate": 9.22634882185526e-05, + "loss": 2.3811, + "step": 3597 + }, + { + "epoch": 0.6744142455482661, + "grad_norm": 53255.52734375, + "learning_rate": 9.225928890224422e-05, + "loss": 2.3688, + "step": 3598 + }, + { + "epoch": 0.674601686972821, + "grad_norm": 47085.625, + "learning_rate": 9.22550885421862e-05, + "loss": 2.3425, + "step": 3599 + }, + { + "epoch": 0.6747891283973758, + "grad_norm": 56502.27734375, + "learning_rate": 9.225088713848227e-05, + "loss": 2.2998, + "step": 3600 + }, + { + "epoch": 0.6749765698219307, + "grad_norm": 47808.19140625, + "learning_rate": 9.22466846912362e-05, + "loss": 2.3542, + "step": 3601 + }, + { + "epoch": 0.6751640112464855, + "grad_norm": 49896.359375, + "learning_rate": 9.22424812005518e-05, + "loss": 2.3345, + "step": 3602 + }, + { + "epoch": 0.6753514526710404, + "grad_norm": 48622.55078125, + "learning_rate": 9.223827666653288e-05, + "loss": 2.2701, + "step": 3603 + }, + { + "epoch": 0.6755388940955951, + "grad_norm": 50282.26171875, + "learning_rate": 9.22340710892833e-05, + "loss": 2.3465, + "step": 3604 + }, + { + "epoch": 0.6757263355201499, + "grad_norm": 48933.9453125, + "learning_rate": 9.222986446890691e-05, + "loss": 2.3153, + "step": 3605 + }, + { + "epoch": 0.6759137769447048, + "grad_norm": 51764.66796875, + "learning_rate": 9.222565680550764e-05, + "loss": 2.3883, + "step": 3606 + }, + { + "epoch": 0.6761012183692596, + "grad_norm": 47303.05859375, + "learning_rate": 9.222144809918936e-05, + "loss": 2.3071, + "step": 3607 + }, + { + "epoch": 0.6762886597938145, + "grad_norm": 48717.1953125, + "learning_rate": 9.221723835005607e-05, + "loss": 2.3509, + "step": 3608 + }, + { + "epoch": 0.6764761012183692, + "grad_norm": 49341.8046875, + "learning_rate": 9.221302755821173e-05, + "loss": 2.3659, + "step": 3609 + }, + { + "epoch": 0.676663542642924, + "grad_norm": 49785.49609375, + "learning_rate": 9.220881572376033e-05, + "loss": 2.3455, + "step": 3610 + }, + { + "epoch": 0.6768509840674789, + "grad_norm": 47367.3828125, + "learning_rate": 9.220460284680592e-05, + "loss": 2.334, + "step": 3611 + }, + { + "epoch": 0.6770384254920337, + "grad_norm": 50350.33984375, + "learning_rate": 9.220038892745253e-05, + "loss": 2.4296, + "step": 3612 + }, + { + "epoch": 0.6772258669165886, + "grad_norm": 47357.15234375, + "learning_rate": 9.219617396580424e-05, + "loss": 2.3533, + "step": 3613 + }, + { + "epoch": 0.6774133083411434, + "grad_norm": 50470.9140625, + "learning_rate": 9.219195796196518e-05, + "loss": 2.2952, + "step": 3614 + }, + { + "epoch": 0.6776007497656982, + "grad_norm": 46182.29296875, + "learning_rate": 9.218774091603945e-05, + "loss": 2.3052, + "step": 3615 + }, + { + "epoch": 0.677788191190253, + "grad_norm": 47312.609375, + "learning_rate": 9.21835228281312e-05, + "loss": 2.4346, + "step": 3616 + }, + { + "epoch": 0.6779756326148079, + "grad_norm": 51143.71875, + "learning_rate": 9.217930369834466e-05, + "loss": 2.312, + "step": 3617 + }, + { + "epoch": 0.6781630740393627, + "grad_norm": 53158.5078125, + "learning_rate": 9.217508352678399e-05, + "loss": 2.4198, + "step": 3618 + }, + { + "epoch": 0.6783505154639176, + "grad_norm": 45698.34765625, + "learning_rate": 9.217086231355343e-05, + "loss": 2.3371, + "step": 3619 + }, + { + "epoch": 0.6785379568884724, + "grad_norm": 53902.453125, + "learning_rate": 9.216664005875725e-05, + "loss": 2.3011, + "step": 3620 + }, + { + "epoch": 0.6787253983130271, + "grad_norm": 51591.5234375, + "learning_rate": 9.216241676249974e-05, + "loss": 2.2773, + "step": 3621 + }, + { + "epoch": 0.678912839737582, + "grad_norm": 45360.66015625, + "learning_rate": 9.21581924248852e-05, + "loss": 2.3655, + "step": 3622 + }, + { + "epoch": 0.6791002811621368, + "grad_norm": 46149.37890625, + "learning_rate": 9.215396704601795e-05, + "loss": 2.42, + "step": 3623 + }, + { + "epoch": 0.6792877225866917, + "grad_norm": 53791.296875, + "learning_rate": 9.21497406260024e-05, + "loss": 2.3567, + "step": 3624 + }, + { + "epoch": 0.6794751640112465, + "grad_norm": 48588.16796875, + "learning_rate": 9.214551316494286e-05, + "loss": 2.3673, + "step": 3625 + }, + { + "epoch": 0.6796626054358013, + "grad_norm": 49588.7421875, + "learning_rate": 9.214128466294382e-05, + "loss": 2.3127, + "step": 3626 + }, + { + "epoch": 0.6798500468603561, + "grad_norm": 51282.421875, + "learning_rate": 9.213705512010968e-05, + "loss": 2.375, + "step": 3627 + }, + { + "epoch": 0.680037488284911, + "grad_norm": 55582.421875, + "learning_rate": 9.213282453654491e-05, + "loss": 2.295, + "step": 3628 + }, + { + "epoch": 0.6802249297094658, + "grad_norm": 52347.3125, + "learning_rate": 9.212859291235399e-05, + "loss": 2.3091, + "step": 3629 + }, + { + "epoch": 0.6804123711340206, + "grad_norm": 49741.62109375, + "learning_rate": 9.212436024764144e-05, + "loss": 2.3504, + "step": 3630 + }, + { + "epoch": 0.6805998125585755, + "grad_norm": 47095.80859375, + "learning_rate": 9.212012654251183e-05, + "loss": 2.2934, + "step": 3631 + }, + { + "epoch": 0.6807872539831302, + "grad_norm": 52482.1796875, + "learning_rate": 9.211589179706968e-05, + "loss": 2.3461, + "step": 3632 + }, + { + "epoch": 0.6809746954076851, + "grad_norm": 50443.2421875, + "learning_rate": 9.211165601141962e-05, + "loss": 2.3837, + "step": 3633 + }, + { + "epoch": 0.6811621368322399, + "grad_norm": 51041.94921875, + "learning_rate": 9.210741918566626e-05, + "loss": 2.2868, + "step": 3634 + }, + { + "epoch": 0.6813495782567948, + "grad_norm": 48105.12109375, + "learning_rate": 9.210318131991422e-05, + "loss": 2.372, + "step": 3635 + }, + { + "epoch": 0.6815370196813496, + "grad_norm": 48057.96875, + "learning_rate": 9.209894241426822e-05, + "loss": 2.3865, + "step": 3636 + }, + { + "epoch": 0.6817244611059045, + "grad_norm": 50563.11328125, + "learning_rate": 9.209470246883289e-05, + "loss": 2.3767, + "step": 3637 + }, + { + "epoch": 0.6819119025304592, + "grad_norm": 49152.84375, + "learning_rate": 9.209046148371301e-05, + "loss": 2.327, + "step": 3638 + }, + { + "epoch": 0.682099343955014, + "grad_norm": 54111.0859375, + "learning_rate": 9.20862194590133e-05, + "loss": 2.3399, + "step": 3639 + }, + { + "epoch": 0.6822867853795689, + "grad_norm": 47908.81640625, + "learning_rate": 9.208197639483853e-05, + "loss": 2.3723, + "step": 3640 + }, + { + "epoch": 0.6824742268041237, + "grad_norm": 50742.13671875, + "learning_rate": 9.20777322912935e-05, + "loss": 2.3341, + "step": 3641 + }, + { + "epoch": 0.6826616682286786, + "grad_norm": 53935.86328125, + "learning_rate": 9.207348714848304e-05, + "loss": 2.2746, + "step": 3642 + }, + { + "epoch": 0.6828491096532333, + "grad_norm": 51512.11328125, + "learning_rate": 9.206924096651199e-05, + "loss": 2.3569, + "step": 3643 + }, + { + "epoch": 0.6830365510777882, + "grad_norm": 56571.88671875, + "learning_rate": 9.206499374548524e-05, + "loss": 2.3965, + "step": 3644 + }, + { + "epoch": 0.683223992502343, + "grad_norm": 52878.8515625, + "learning_rate": 9.206074548550769e-05, + "loss": 2.3883, + "step": 3645 + }, + { + "epoch": 0.6834114339268978, + "grad_norm": 49421.16015625, + "learning_rate": 9.205649618668424e-05, + "loss": 2.3038, + "step": 3646 + }, + { + "epoch": 0.6835988753514527, + "grad_norm": 51153.46875, + "learning_rate": 9.205224584911986e-05, + "loss": 2.3015, + "step": 3647 + }, + { + "epoch": 0.6837863167760075, + "grad_norm": 50708.359375, + "learning_rate": 9.204799447291955e-05, + "loss": 2.3456, + "step": 3648 + }, + { + "epoch": 0.6839737582005623, + "grad_norm": 51356.3203125, + "learning_rate": 9.204374205818829e-05, + "loss": 2.3448, + "step": 3649 + }, + { + "epoch": 0.6841611996251171, + "grad_norm": 50312.41015625, + "learning_rate": 9.20394886050311e-05, + "loss": 2.3363, + "step": 3650 + }, + { + "epoch": 0.684348641049672, + "grad_norm": 49981.609375, + "learning_rate": 9.203523411355304e-05, + "loss": 2.3698, + "step": 3651 + }, + { + "epoch": 0.6845360824742268, + "grad_norm": 53157.5234375, + "learning_rate": 9.203097858385922e-05, + "loss": 2.3402, + "step": 3652 + }, + { + "epoch": 0.6847235238987817, + "grad_norm": 49475.5, + "learning_rate": 9.202672201605471e-05, + "loss": 2.3758, + "step": 3653 + }, + { + "epoch": 0.6849109653233365, + "grad_norm": 49443.07421875, + "learning_rate": 9.202246441024464e-05, + "loss": 2.3996, + "step": 3654 + }, + { + "epoch": 0.6850984067478912, + "grad_norm": 48000.8359375, + "learning_rate": 9.201820576653422e-05, + "loss": 2.3161, + "step": 3655 + }, + { + "epoch": 0.6852858481724461, + "grad_norm": 47924.58203125, + "learning_rate": 9.201394608502856e-05, + "loss": 2.3579, + "step": 3656 + }, + { + "epoch": 0.6854732895970009, + "grad_norm": 55785.7421875, + "learning_rate": 9.200968536583293e-05, + "loss": 2.5483, + "step": 3657 + }, + { + "epoch": 0.6856607310215558, + "grad_norm": 50863.1953125, + "learning_rate": 9.200542360905251e-05, + "loss": 2.2857, + "step": 3658 + }, + { + "epoch": 0.6858481724461106, + "grad_norm": 47093.2421875, + "learning_rate": 9.20011608147926e-05, + "loss": 2.2837, + "step": 3659 + }, + { + "epoch": 0.6860356138706654, + "grad_norm": 54510.72265625, + "learning_rate": 9.199689698315848e-05, + "loss": 2.462, + "step": 3660 + }, + { + "epoch": 0.6862230552952202, + "grad_norm": 49791.8828125, + "learning_rate": 9.199263211425544e-05, + "loss": 2.3332, + "step": 3661 + }, + { + "epoch": 0.686410496719775, + "grad_norm": 62329.39453125, + "learning_rate": 9.198836620818884e-05, + "loss": 2.2078, + "step": 3662 + }, + { + "epoch": 0.6865979381443299, + "grad_norm": 48889.50390625, + "learning_rate": 9.198409926506404e-05, + "loss": 2.3692, + "step": 3663 + }, + { + "epoch": 0.6867853795688847, + "grad_norm": 49312.26171875, + "learning_rate": 9.197983128498641e-05, + "loss": 2.3317, + "step": 3664 + }, + { + "epoch": 0.6869728209934396, + "grad_norm": 52914.40234375, + "learning_rate": 9.197556226806137e-05, + "loss": 2.3535, + "step": 3665 + }, + { + "epoch": 0.6871602624179943, + "grad_norm": 48688.0703125, + "learning_rate": 9.197129221439438e-05, + "loss": 2.2069, + "step": 3666 + }, + { + "epoch": 0.6873477038425492, + "grad_norm": 49310.44921875, + "learning_rate": 9.196702112409088e-05, + "loss": 2.2741, + "step": 3667 + }, + { + "epoch": 0.687535145267104, + "grad_norm": 46827.62890625, + "learning_rate": 9.196274899725637e-05, + "loss": 2.3846, + "step": 3668 + }, + { + "epoch": 0.6877225866916589, + "grad_norm": 47946.83203125, + "learning_rate": 9.195847583399637e-05, + "loss": 2.3789, + "step": 3669 + }, + { + "epoch": 0.6879100281162137, + "grad_norm": 54139.26171875, + "learning_rate": 9.195420163441642e-05, + "loss": 2.4017, + "step": 3670 + }, + { + "epoch": 0.6880974695407686, + "grad_norm": 51084.390625, + "learning_rate": 9.194992639862208e-05, + "loss": 2.364, + "step": 3671 + }, + { + "epoch": 0.6882849109653233, + "grad_norm": 50010.45703125, + "learning_rate": 9.194565012671894e-05, + "loss": 2.3431, + "step": 3672 + }, + { + "epoch": 0.6884723523898781, + "grad_norm": 53301.0390625, + "learning_rate": 9.194137281881264e-05, + "loss": 2.3794, + "step": 3673 + }, + { + "epoch": 0.688659793814433, + "grad_norm": 49995.9609375, + "learning_rate": 9.193709447500877e-05, + "loss": 2.3857, + "step": 3674 + }, + { + "epoch": 0.6888472352389878, + "grad_norm": 51028.85546875, + "learning_rate": 9.193281509541308e-05, + "loss": 2.3731, + "step": 3675 + }, + { + "epoch": 0.6890346766635427, + "grad_norm": 59506.953125, + "learning_rate": 9.192853468013122e-05, + "loss": 2.3618, + "step": 3676 + }, + { + "epoch": 0.6892221180880975, + "grad_norm": 46752.9765625, + "learning_rate": 9.19242532292689e-05, + "loss": 2.3337, + "step": 3677 + }, + { + "epoch": 0.6894095595126523, + "grad_norm": 50071.015625, + "learning_rate": 9.191997074293188e-05, + "loss": 2.3304, + "step": 3678 + }, + { + "epoch": 0.6895970009372071, + "grad_norm": 47603.21484375, + "learning_rate": 9.191568722122595e-05, + "loss": 2.2991, + "step": 3679 + }, + { + "epoch": 0.689784442361762, + "grad_norm": 51304.015625, + "learning_rate": 9.191140266425689e-05, + "loss": 2.2438, + "step": 3680 + }, + { + "epoch": 0.6899718837863168, + "grad_norm": 48545.8984375, + "learning_rate": 9.190711707213051e-05, + "loss": 2.3168, + "step": 3681 + }, + { + "epoch": 0.6901593252108716, + "grad_norm": 52941.4765625, + "learning_rate": 9.190283044495267e-05, + "loss": 2.3016, + "step": 3682 + }, + { + "epoch": 0.6903467666354264, + "grad_norm": 50078.9609375, + "learning_rate": 9.189854278282924e-05, + "loss": 2.3779, + "step": 3683 + }, + { + "epoch": 0.6905342080599812, + "grad_norm": 48117.41796875, + "learning_rate": 9.189425408586614e-05, + "loss": 2.2902, + "step": 3684 + }, + { + "epoch": 0.6907216494845361, + "grad_norm": 46861.53125, + "learning_rate": 9.188996435416927e-05, + "loss": 2.3123, + "step": 3685 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 51506.859375, + "learning_rate": 9.18856735878446e-05, + "loss": 2.3408, + "step": 3686 + }, + { + "epoch": 0.6910965323336458, + "grad_norm": 48148.171875, + "learning_rate": 9.18813817869981e-05, + "loss": 2.3464, + "step": 3687 + }, + { + "epoch": 0.6912839737582006, + "grad_norm": 49398.4296875, + "learning_rate": 9.187708895173576e-05, + "loss": 2.3816, + "step": 3688 + }, + { + "epoch": 0.6914714151827553, + "grad_norm": 45164.41015625, + "learning_rate": 9.187279508216363e-05, + "loss": 2.3003, + "step": 3689 + }, + { + "epoch": 0.6916588566073102, + "grad_norm": 50726.91796875, + "learning_rate": 9.186850017838776e-05, + "loss": 2.4469, + "step": 3690 + }, + { + "epoch": 0.691846298031865, + "grad_norm": 48798.31640625, + "learning_rate": 9.186420424051419e-05, + "loss": 2.336, + "step": 3691 + }, + { + "epoch": 0.6920337394564199, + "grad_norm": 56597.8359375, + "learning_rate": 9.185990726864906e-05, + "loss": 2.2718, + "step": 3692 + }, + { + "epoch": 0.6922211808809747, + "grad_norm": 51201.609375, + "learning_rate": 9.185560926289851e-05, + "loss": 2.2351, + "step": 3693 + }, + { + "epoch": 0.6924086223055296, + "grad_norm": 56741.18359375, + "learning_rate": 9.185131022336865e-05, + "loss": 2.2988, + "step": 3694 + }, + { + "epoch": 0.6925960637300843, + "grad_norm": 51757.43359375, + "learning_rate": 9.184701015016572e-05, + "loss": 2.3243, + "step": 3695 + }, + { + "epoch": 0.6927835051546392, + "grad_norm": 51734.71875, + "learning_rate": 9.184270904339588e-05, + "loss": 2.3251, + "step": 3696 + }, + { + "epoch": 0.692970946579194, + "grad_norm": 56865.92578125, + "learning_rate": 9.183840690316536e-05, + "loss": 2.3109, + "step": 3697 + }, + { + "epoch": 0.6931583880037488, + "grad_norm": 50248.1640625, + "learning_rate": 9.183410372958046e-05, + "loss": 2.2026, + "step": 3698 + }, + { + "epoch": 0.6933458294283037, + "grad_norm": 50639.4375, + "learning_rate": 9.182979952274742e-05, + "loss": 2.3261, + "step": 3699 + }, + { + "epoch": 0.6935332708528584, + "grad_norm": 49571.5, + "learning_rate": 9.182549428277257e-05, + "loss": 2.3785, + "step": 3700 + }, + { + "epoch": 0.6937207122774133, + "grad_norm": 52456.6171875, + "learning_rate": 9.182118800976225e-05, + "loss": 2.3787, + "step": 3701 + }, + { + "epoch": 0.6939081537019681, + "grad_norm": 46005.57421875, + "learning_rate": 9.18168807038228e-05, + "loss": 2.3325, + "step": 3702 + }, + { + "epoch": 0.694095595126523, + "grad_norm": 51526.9453125, + "learning_rate": 9.18125723650606e-05, + "loss": 2.29, + "step": 3703 + }, + { + "epoch": 0.6942830365510778, + "grad_norm": 53033.24609375, + "learning_rate": 9.18082629935821e-05, + "loss": 2.3705, + "step": 3704 + }, + { + "epoch": 0.6944704779756327, + "grad_norm": 48598.1015625, + "learning_rate": 9.180395258949368e-05, + "loss": 2.3568, + "step": 3705 + }, + { + "epoch": 0.6946579194001874, + "grad_norm": 48669.89453125, + "learning_rate": 9.179964115290185e-05, + "loss": 2.3629, + "step": 3706 + }, + { + "epoch": 0.6948453608247422, + "grad_norm": 51068.62109375, + "learning_rate": 9.179532868391308e-05, + "loss": 2.3353, + "step": 3707 + }, + { + "epoch": 0.6950328022492971, + "grad_norm": 46828.6796875, + "learning_rate": 9.179101518263387e-05, + "loss": 2.332, + "step": 3708 + }, + { + "epoch": 0.6952202436738519, + "grad_norm": 52357.03515625, + "learning_rate": 9.178670064917077e-05, + "loss": 2.2881, + "step": 3709 + }, + { + "epoch": 0.6954076850984068, + "grad_norm": 51788.47265625, + "learning_rate": 9.178238508363033e-05, + "loss": 2.31, + "step": 3710 + }, + { + "epoch": 0.6955951265229616, + "grad_norm": 54647.96484375, + "learning_rate": 9.177806848611916e-05, + "loss": 2.2482, + "step": 3711 + }, + { + "epoch": 0.6957825679475164, + "grad_norm": 48506.99609375, + "learning_rate": 9.177375085674386e-05, + "loss": 2.3367, + "step": 3712 + }, + { + "epoch": 0.6959700093720712, + "grad_norm": 52139.61328125, + "learning_rate": 9.17694321956111e-05, + "loss": 2.3037, + "step": 3713 + }, + { + "epoch": 0.696157450796626, + "grad_norm": 51956.3515625, + "learning_rate": 9.176511250282749e-05, + "loss": 2.3541, + "step": 3714 + }, + { + "epoch": 0.6963448922211809, + "grad_norm": 52219.44921875, + "learning_rate": 9.176079177849977e-05, + "loss": 2.3704, + "step": 3715 + }, + { + "epoch": 0.6965323336457357, + "grad_norm": 49897.55859375, + "learning_rate": 9.175647002273462e-05, + "loss": 2.2549, + "step": 3716 + }, + { + "epoch": 0.6967197750702905, + "grad_norm": 56749.05078125, + "learning_rate": 9.175214723563881e-05, + "loss": 2.3379, + "step": 3717 + }, + { + "epoch": 0.6969072164948453, + "grad_norm": 50949.20703125, + "learning_rate": 9.174782341731908e-05, + "loss": 2.337, + "step": 3718 + }, + { + "epoch": 0.6970946579194002, + "grad_norm": 51601.4453125, + "learning_rate": 9.174349856788224e-05, + "loss": 2.3024, + "step": 3719 + }, + { + "epoch": 0.697282099343955, + "grad_norm": 49324.80078125, + "learning_rate": 9.173917268743512e-05, + "loss": 2.3675, + "step": 3720 + }, + { + "epoch": 0.6974695407685099, + "grad_norm": 50141.203125, + "learning_rate": 9.173484577608456e-05, + "loss": 2.3927, + "step": 3721 + }, + { + "epoch": 0.6976569821930647, + "grad_norm": 47316.765625, + "learning_rate": 9.17305178339374e-05, + "loss": 2.3484, + "step": 3722 + }, + { + "epoch": 0.6978444236176194, + "grad_norm": 49769.03515625, + "learning_rate": 9.172618886110056e-05, + "loss": 2.3475, + "step": 3723 + }, + { + "epoch": 0.6980318650421743, + "grad_norm": 48540.3203125, + "learning_rate": 9.172185885768095e-05, + "loss": 2.3975, + "step": 3724 + }, + { + "epoch": 0.6982193064667291, + "grad_norm": 58549.703125, + "learning_rate": 9.171752782378554e-05, + "loss": 2.4088, + "step": 3725 + }, + { + "epoch": 0.698406747891284, + "grad_norm": 52645.6015625, + "learning_rate": 9.171319575952126e-05, + "loss": 2.3064, + "step": 3726 + }, + { + "epoch": 0.6985941893158388, + "grad_norm": 51653.83203125, + "learning_rate": 9.170886266499513e-05, + "loss": 2.3266, + "step": 3727 + }, + { + "epoch": 0.6987816307403937, + "grad_norm": 48162.58203125, + "learning_rate": 9.170452854031416e-05, + "loss": 2.3489, + "step": 3728 + }, + { + "epoch": 0.6989690721649484, + "grad_norm": 48569.52734375, + "learning_rate": 9.170019338558542e-05, + "loss": 2.3444, + "step": 3729 + }, + { + "epoch": 0.6991565135895033, + "grad_norm": 50483.796875, + "learning_rate": 9.169585720091596e-05, + "loss": 2.4165, + "step": 3730 + }, + { + "epoch": 0.6993439550140581, + "grad_norm": 49105.5703125, + "learning_rate": 9.169151998641288e-05, + "loss": 2.3419, + "step": 3731 + }, + { + "epoch": 0.699531396438613, + "grad_norm": 51046.2109375, + "learning_rate": 9.168718174218332e-05, + "loss": 2.3152, + "step": 3732 + }, + { + "epoch": 0.6997188378631678, + "grad_norm": 51160.34375, + "learning_rate": 9.168284246833443e-05, + "loss": 2.315, + "step": 3733 + }, + { + "epoch": 0.6999062792877225, + "grad_norm": 47082.33203125, + "learning_rate": 9.167850216497335e-05, + "loss": 2.3211, + "step": 3734 + }, + { + "epoch": 0.7000937207122774, + "grad_norm": 55298.1875, + "learning_rate": 9.167416083220732e-05, + "loss": 2.3188, + "step": 3735 + }, + { + "epoch": 0.7002811621368322, + "grad_norm": 51328.796875, + "learning_rate": 9.166981847014355e-05, + "loss": 2.3177, + "step": 3736 + }, + { + "epoch": 0.7004686035613871, + "grad_norm": 50377.546875, + "learning_rate": 9.166547507888928e-05, + "loss": 2.3154, + "step": 3737 + }, + { + "epoch": 0.7006560449859419, + "grad_norm": 46664.83203125, + "learning_rate": 9.16611306585518e-05, + "loss": 2.3706, + "step": 3738 + }, + { + "epoch": 0.7008434864104968, + "grad_norm": 43400.78515625, + "learning_rate": 9.165678520923841e-05, + "loss": 2.3187, + "step": 3739 + }, + { + "epoch": 0.7010309278350515, + "grad_norm": 52345.7421875, + "learning_rate": 9.165243873105643e-05, + "loss": 2.3367, + "step": 3740 + }, + { + "epoch": 0.7012183692596063, + "grad_norm": 46841.88671875, + "learning_rate": 9.164809122411323e-05, + "loss": 2.3752, + "step": 3741 + }, + { + "epoch": 0.7014058106841612, + "grad_norm": 48335.359375, + "learning_rate": 9.164374268851616e-05, + "loss": 2.256, + "step": 3742 + }, + { + "epoch": 0.701593252108716, + "grad_norm": 51155.69140625, + "learning_rate": 9.163939312437264e-05, + "loss": 2.2872, + "step": 3743 + }, + { + "epoch": 0.7017806935332709, + "grad_norm": 53505.6328125, + "learning_rate": 9.163504253179009e-05, + "loss": 2.3724, + "step": 3744 + }, + { + "epoch": 0.7019681349578257, + "grad_norm": 50938.2578125, + "learning_rate": 9.163069091087597e-05, + "loss": 2.299, + "step": 3745 + }, + { + "epoch": 0.7021555763823805, + "grad_norm": 56126.57421875, + "learning_rate": 9.162633826173776e-05, + "loss": 2.2474, + "step": 3746 + }, + { + "epoch": 0.7023430178069353, + "grad_norm": 50542.13671875, + "learning_rate": 9.162198458448298e-05, + "loss": 2.3084, + "step": 3747 + }, + { + "epoch": 0.7025304592314902, + "grad_norm": 49607.7109375, + "learning_rate": 9.161762987921914e-05, + "loss": 2.3278, + "step": 3748 + }, + { + "epoch": 0.702717900656045, + "grad_norm": 48568.73828125, + "learning_rate": 9.161327414605379e-05, + "loss": 2.3646, + "step": 3749 + }, + { + "epoch": 0.7029053420805998, + "grad_norm": 50035.2578125, + "learning_rate": 9.160891738509452e-05, + "loss": 2.3228, + "step": 3750 + }, + { + "epoch": 0.7030927835051546, + "grad_norm": 47958.97265625, + "learning_rate": 9.160455959644894e-05, + "loss": 2.3664, + "step": 3751 + }, + { + "epoch": 0.7032802249297094, + "grad_norm": 46501.90625, + "learning_rate": 9.160020078022468e-05, + "loss": 2.2684, + "step": 3752 + }, + { + "epoch": 0.7034676663542643, + "grad_norm": 46673.1484375, + "learning_rate": 9.15958409365294e-05, + "loss": 2.2751, + "step": 3753 + }, + { + "epoch": 0.7036551077788191, + "grad_norm": 47611.66015625, + "learning_rate": 9.159148006547079e-05, + "loss": 2.3541, + "step": 3754 + }, + { + "epoch": 0.703842549203374, + "grad_norm": 49232.9453125, + "learning_rate": 9.158711816715652e-05, + "loss": 2.303, + "step": 3755 + }, + { + "epoch": 0.7040299906279288, + "grad_norm": 51794.77734375, + "learning_rate": 9.158275524169437e-05, + "loss": 2.2882, + "step": 3756 + }, + { + "epoch": 0.7042174320524835, + "grad_norm": 55267.3125, + "learning_rate": 9.157839128919207e-05, + "loss": 2.3845, + "step": 3757 + }, + { + "epoch": 0.7044048734770384, + "grad_norm": 52698.53125, + "learning_rate": 9.15740263097574e-05, + "loss": 2.4401, + "step": 3758 + }, + { + "epoch": 0.7045923149015932, + "grad_norm": 51711.50390625, + "learning_rate": 9.15696603034982e-05, + "loss": 2.3241, + "step": 3759 + }, + { + "epoch": 0.7047797563261481, + "grad_norm": 53408.96484375, + "learning_rate": 9.15652932705223e-05, + "loss": 2.3422, + "step": 3760 + }, + { + "epoch": 0.7049671977507029, + "grad_norm": 53429.15234375, + "learning_rate": 9.156092521093751e-05, + "loss": 2.3877, + "step": 3761 + }, + { + "epoch": 0.7051546391752578, + "grad_norm": 52518.24609375, + "learning_rate": 9.155655612485178e-05, + "loss": 2.3548, + "step": 3762 + }, + { + "epoch": 0.7053420805998125, + "grad_norm": 51270.6953125, + "learning_rate": 9.155218601237298e-05, + "loss": 2.2426, + "step": 3763 + }, + { + "epoch": 0.7055295220243674, + "grad_norm": 60577.32421875, + "learning_rate": 9.154781487360906e-05, + "loss": 2.3092, + "step": 3764 + }, + { + "epoch": 0.7057169634489222, + "grad_norm": 48989.421875, + "learning_rate": 9.154344270866798e-05, + "loss": 2.3169, + "step": 3765 + }, + { + "epoch": 0.705904404873477, + "grad_norm": 52516.3984375, + "learning_rate": 9.153906951765772e-05, + "loss": 2.3665, + "step": 3766 + }, + { + "epoch": 0.7060918462980319, + "grad_norm": 47191.4765625, + "learning_rate": 9.153469530068631e-05, + "loss": 2.368, + "step": 3767 + }, + { + "epoch": 0.7062792877225867, + "grad_norm": 56673.5, + "learning_rate": 9.153032005786178e-05, + "loss": 2.3509, + "step": 3768 + }, + { + "epoch": 0.7064667291471415, + "grad_norm": 62702.265625, + "learning_rate": 9.152594378929218e-05, + "loss": 2.3559, + "step": 3769 + }, + { + "epoch": 0.7066541705716963, + "grad_norm": 53677.84765625, + "learning_rate": 9.15215664950856e-05, + "loss": 2.3415, + "step": 3770 + }, + { + "epoch": 0.7068416119962512, + "grad_norm": 54458.22265625, + "learning_rate": 9.151718817535018e-05, + "loss": 2.3247, + "step": 3771 + }, + { + "epoch": 0.707029053420806, + "grad_norm": 51843.03125, + "learning_rate": 9.151280883019404e-05, + "loss": 2.3657, + "step": 3772 + }, + { + "epoch": 0.7072164948453609, + "grad_norm": 52562.06640625, + "learning_rate": 9.150842845972533e-05, + "loss": 2.2999, + "step": 3773 + }, + { + "epoch": 0.7074039362699156, + "grad_norm": 50436.00390625, + "learning_rate": 9.150404706405226e-05, + "loss": 2.2437, + "step": 3774 + }, + { + "epoch": 0.7075913776944704, + "grad_norm": 50814.6484375, + "learning_rate": 9.149966464328305e-05, + "loss": 2.3893, + "step": 3775 + }, + { + "epoch": 0.7077788191190253, + "grad_norm": 58808.06640625, + "learning_rate": 9.149528119752591e-05, + "loss": 2.4021, + "step": 3776 + }, + { + "epoch": 0.7079662605435801, + "grad_norm": 46562.9375, + "learning_rate": 9.149089672688913e-05, + "loss": 2.2932, + "step": 3777 + }, + { + "epoch": 0.708153701968135, + "grad_norm": 48686.68359375, + "learning_rate": 9.1486511231481e-05, + "loss": 2.4016, + "step": 3778 + }, + { + "epoch": 0.7083411433926898, + "grad_norm": 49888.57421875, + "learning_rate": 9.148212471140982e-05, + "loss": 2.3727, + "step": 3779 + }, + { + "epoch": 0.7085285848172446, + "grad_norm": 48910.7265625, + "learning_rate": 9.147773716678393e-05, + "loss": 2.3092, + "step": 3780 + }, + { + "epoch": 0.7087160262417994, + "grad_norm": 46924.33203125, + "learning_rate": 9.147334859771174e-05, + "loss": 2.3025, + "step": 3781 + }, + { + "epoch": 0.7089034676663543, + "grad_norm": 50696.60546875, + "learning_rate": 9.146895900430159e-05, + "loss": 2.3845, + "step": 3782 + }, + { + "epoch": 0.7090909090909091, + "grad_norm": 52657.5, + "learning_rate": 9.146456838666193e-05, + "loss": 2.3746, + "step": 3783 + }, + { + "epoch": 0.709278350515464, + "grad_norm": 48720.078125, + "learning_rate": 9.14601767449012e-05, + "loss": 2.2851, + "step": 3784 + }, + { + "epoch": 0.7094657919400188, + "grad_norm": 52055.1796875, + "learning_rate": 9.145578407912782e-05, + "loss": 2.3408, + "step": 3785 + }, + { + "epoch": 0.7096532333645735, + "grad_norm": 52750.06640625, + "learning_rate": 9.145139038945036e-05, + "loss": 2.2907, + "step": 3786 + }, + { + "epoch": 0.7098406747891284, + "grad_norm": 47881.7734375, + "learning_rate": 9.144699567597727e-05, + "loss": 2.3569, + "step": 3787 + }, + { + "epoch": 0.7100281162136832, + "grad_norm": 45295.90625, + "learning_rate": 9.144259993881714e-05, + "loss": 2.3399, + "step": 3788 + }, + { + "epoch": 0.7102155576382381, + "grad_norm": 51242.41015625, + "learning_rate": 9.143820317807853e-05, + "loss": 2.3078, + "step": 3789 + }, + { + "epoch": 0.7104029990627929, + "grad_norm": 50218.546875, + "learning_rate": 9.143380539387002e-05, + "loss": 2.2911, + "step": 3790 + }, + { + "epoch": 0.7105904404873477, + "grad_norm": 50051.26171875, + "learning_rate": 9.142940658630024e-05, + "loss": 2.2817, + "step": 3791 + }, + { + "epoch": 0.7107778819119025, + "grad_norm": 47846.56640625, + "learning_rate": 9.142500675547781e-05, + "loss": 2.286, + "step": 3792 + }, + { + "epoch": 0.7109653233364573, + "grad_norm": 53184.71875, + "learning_rate": 9.142060590151145e-05, + "loss": 2.3018, + "step": 3793 + }, + { + "epoch": 0.7111527647610122, + "grad_norm": 51669.2734375, + "learning_rate": 9.14162040245098e-05, + "loss": 2.3129, + "step": 3794 + }, + { + "epoch": 0.711340206185567, + "grad_norm": 51806.57421875, + "learning_rate": 9.141180112458163e-05, + "loss": 2.2403, + "step": 3795 + }, + { + "epoch": 0.7115276476101219, + "grad_norm": 52010.171875, + "learning_rate": 9.140739720183566e-05, + "loss": 2.3447, + "step": 3796 + }, + { + "epoch": 0.7117150890346766, + "grad_norm": 50152.30859375, + "learning_rate": 9.140299225638065e-05, + "loss": 2.3459, + "step": 3797 + }, + { + "epoch": 0.7119025304592315, + "grad_norm": 54202.37109375, + "learning_rate": 9.139858628832543e-05, + "loss": 2.3216, + "step": 3798 + }, + { + "epoch": 0.7120899718837863, + "grad_norm": 52193.3984375, + "learning_rate": 9.139417929777879e-05, + "loss": 2.4216, + "step": 3799 + }, + { + "epoch": 0.7122774133083412, + "grad_norm": 49520.6796875, + "learning_rate": 9.138977128484959e-05, + "loss": 2.3071, + "step": 3800 + }, + { + "epoch": 0.712464854732896, + "grad_norm": 54906.46484375, + "learning_rate": 9.138536224964671e-05, + "loss": 2.5574, + "step": 3801 + }, + { + "epoch": 0.7126522961574508, + "grad_norm": 49625.9140625, + "learning_rate": 9.138095219227903e-05, + "loss": 2.2222, + "step": 3802 + }, + { + "epoch": 0.7128397375820056, + "grad_norm": 52329.12890625, + "learning_rate": 9.137654111285548e-05, + "loss": 2.3171, + "step": 3803 + }, + { + "epoch": 0.7130271790065604, + "grad_norm": 45980.6015625, + "learning_rate": 9.137212901148501e-05, + "loss": 2.2988, + "step": 3804 + }, + { + "epoch": 0.7132146204311153, + "grad_norm": 53455.87890625, + "learning_rate": 9.136771588827662e-05, + "loss": 2.2861, + "step": 3805 + }, + { + "epoch": 0.7134020618556701, + "grad_norm": 52188.91796875, + "learning_rate": 9.136330174333924e-05, + "loss": 2.3434, + "step": 3806 + }, + { + "epoch": 0.713589503280225, + "grad_norm": 51330.5546875, + "learning_rate": 9.135888657678197e-05, + "loss": 2.3278, + "step": 3807 + }, + { + "epoch": 0.7137769447047797, + "grad_norm": 49172.359375, + "learning_rate": 9.135447038871381e-05, + "loss": 2.3775, + "step": 3808 + }, + { + "epoch": 0.7139643861293345, + "grad_norm": 50333.01953125, + "learning_rate": 9.135005317924386e-05, + "loss": 2.3739, + "step": 3809 + }, + { + "epoch": 0.7141518275538894, + "grad_norm": 48281.73046875, + "learning_rate": 9.134563494848119e-05, + "loss": 2.3536, + "step": 3810 + }, + { + "epoch": 0.7143392689784442, + "grad_norm": 51497.61328125, + "learning_rate": 9.134121569653494e-05, + "loss": 2.3445, + "step": 3811 + }, + { + "epoch": 0.7145267104029991, + "grad_norm": 49748.1171875, + "learning_rate": 9.133679542351428e-05, + "loss": 2.3873, + "step": 3812 + }, + { + "epoch": 0.7147141518275539, + "grad_norm": 45911.12109375, + "learning_rate": 9.133237412952835e-05, + "loss": 2.3345, + "step": 3813 + }, + { + "epoch": 0.7149015932521087, + "grad_norm": 49245.99609375, + "learning_rate": 9.132795181468638e-05, + "loss": 2.3569, + "step": 3814 + }, + { + "epoch": 0.7150890346766635, + "grad_norm": 46998.24609375, + "learning_rate": 9.132352847909758e-05, + "loss": 2.3019, + "step": 3815 + }, + { + "epoch": 0.7152764761012184, + "grad_norm": 45431.578125, + "learning_rate": 9.131910412287118e-05, + "loss": 2.2914, + "step": 3816 + }, + { + "epoch": 0.7154639175257732, + "grad_norm": 49997.6328125, + "learning_rate": 9.131467874611652e-05, + "loss": 2.3072, + "step": 3817 + }, + { + "epoch": 0.715651358950328, + "grad_norm": 46797.16015625, + "learning_rate": 9.131025234894284e-05, + "loss": 2.4014, + "step": 3818 + }, + { + "epoch": 0.7158388003748829, + "grad_norm": 49309.015625, + "learning_rate": 9.130582493145948e-05, + "loss": 2.3492, + "step": 3819 + }, + { + "epoch": 0.7160262417994376, + "grad_norm": 49606.26953125, + "learning_rate": 9.13013964937758e-05, + "loss": 2.2549, + "step": 3820 + }, + { + "epoch": 0.7162136832239925, + "grad_norm": 54442.53515625, + "learning_rate": 9.129696703600119e-05, + "loss": 2.3591, + "step": 3821 + }, + { + "epoch": 0.7164011246485473, + "grad_norm": 51960.8828125, + "learning_rate": 9.129253655824502e-05, + "loss": 2.3252, + "step": 3822 + }, + { + "epoch": 0.7165885660731022, + "grad_norm": 50490.87890625, + "learning_rate": 9.128810506061674e-05, + "loss": 2.3559, + "step": 3823 + }, + { + "epoch": 0.716776007497657, + "grad_norm": 47525.47265625, + "learning_rate": 9.12836725432258e-05, + "loss": 2.3812, + "step": 3824 + }, + { + "epoch": 0.7169634489222118, + "grad_norm": 45797.8125, + "learning_rate": 9.127923900618167e-05, + "loss": 2.3046, + "step": 3825 + }, + { + "epoch": 0.7171508903467666, + "grad_norm": 47731.97265625, + "learning_rate": 9.127480444959385e-05, + "loss": 2.3552, + "step": 3826 + }, + { + "epoch": 0.7173383317713214, + "grad_norm": 45975.36328125, + "learning_rate": 9.127036887357188e-05, + "loss": 2.2769, + "step": 3827 + }, + { + "epoch": 0.7175257731958763, + "grad_norm": 48357.58984375, + "learning_rate": 9.126593227822531e-05, + "loss": 2.3688, + "step": 3828 + }, + { + "epoch": 0.7177132146204311, + "grad_norm": 46161.86328125, + "learning_rate": 9.126149466366369e-05, + "loss": 2.3447, + "step": 3829 + }, + { + "epoch": 0.717900656044986, + "grad_norm": 51970.74609375, + "learning_rate": 9.125705602999668e-05, + "loss": 2.3333, + "step": 3830 + }, + { + "epoch": 0.7180880974695407, + "grad_norm": 49184.09765625, + "learning_rate": 9.125261637733387e-05, + "loss": 2.3779, + "step": 3831 + }, + { + "epoch": 0.7182755388940956, + "grad_norm": 47997.09765625, + "learning_rate": 9.124817570578491e-05, + "loss": 2.3234, + "step": 3832 + }, + { + "epoch": 0.7184629803186504, + "grad_norm": 51341.40625, + "learning_rate": 9.124373401545949e-05, + "loss": 2.5324, + "step": 3833 + }, + { + "epoch": 0.7186504217432053, + "grad_norm": 51979.0859375, + "learning_rate": 9.123929130646732e-05, + "loss": 2.2234, + "step": 3834 + }, + { + "epoch": 0.7188378631677601, + "grad_norm": 52541.49609375, + "learning_rate": 9.123484757891812e-05, + "loss": 2.3279, + "step": 3835 + }, + { + "epoch": 0.719025304592315, + "grad_norm": 47979.125, + "learning_rate": 9.123040283292165e-05, + "loss": 2.3412, + "step": 3836 + }, + { + "epoch": 0.7192127460168697, + "grad_norm": 48340.359375, + "learning_rate": 9.122595706858768e-05, + "loss": 2.3722, + "step": 3837 + }, + { + "epoch": 0.7194001874414245, + "grad_norm": 50072.59375, + "learning_rate": 9.122151028602601e-05, + "loss": 2.2994, + "step": 3838 + }, + { + "epoch": 0.7195876288659794, + "grad_norm": 49867.03515625, + "learning_rate": 9.121706248534649e-05, + "loss": 2.3169, + "step": 3839 + }, + { + "epoch": 0.7197750702905342, + "grad_norm": 46720.36328125, + "learning_rate": 9.121261366665899e-05, + "loss": 2.324, + "step": 3840 + }, + { + "epoch": 0.7199625117150891, + "grad_norm": 61144.12890625, + "learning_rate": 9.120816383007334e-05, + "loss": 2.352, + "step": 3841 + }, + { + "epoch": 0.7201499531396438, + "grad_norm": 48827.96875, + "learning_rate": 9.120371297569948e-05, + "loss": 2.3051, + "step": 3842 + }, + { + "epoch": 0.7203373945641987, + "grad_norm": 52191.70703125, + "learning_rate": 9.119926110364732e-05, + "loss": 2.327, + "step": 3843 + }, + { + "epoch": 0.7205248359887535, + "grad_norm": 52178.83203125, + "learning_rate": 9.119480821402681e-05, + "loss": 2.3551, + "step": 3844 + }, + { + "epoch": 0.7207122774133083, + "grad_norm": 47311.71875, + "learning_rate": 9.119035430694798e-05, + "loss": 2.3525, + "step": 3845 + }, + { + "epoch": 0.7208997188378632, + "grad_norm": 50681.5078125, + "learning_rate": 9.11858993825208e-05, + "loss": 2.3617, + "step": 3846 + }, + { + "epoch": 0.721087160262418, + "grad_norm": 51324.9921875, + "learning_rate": 9.118144344085529e-05, + "loss": 2.3701, + "step": 3847 + }, + { + "epoch": 0.7212746016869728, + "grad_norm": 45331.18359375, + "learning_rate": 9.117698648206152e-05, + "loss": 2.383, + "step": 3848 + }, + { + "epoch": 0.7214620431115276, + "grad_norm": 51826.95703125, + "learning_rate": 9.117252850624957e-05, + "loss": 2.2255, + "step": 3849 + }, + { + "epoch": 0.7216494845360825, + "grad_norm": 48248.08984375, + "learning_rate": 9.116806951352957e-05, + "loss": 2.3398, + "step": 3850 + }, + { + "epoch": 0.7218369259606373, + "grad_norm": 54617.91796875, + "learning_rate": 9.116360950401159e-05, + "loss": 2.3769, + "step": 3851 + }, + { + "epoch": 0.7220243673851922, + "grad_norm": 49192.70703125, + "learning_rate": 9.115914847780586e-05, + "loss": 2.346, + "step": 3852 + }, + { + "epoch": 0.722211808809747, + "grad_norm": 49567.9140625, + "learning_rate": 9.11546864350225e-05, + "loss": 2.3125, + "step": 3853 + }, + { + "epoch": 0.7223992502343017, + "grad_norm": 50977.78515625, + "learning_rate": 9.115022337577174e-05, + "loss": 2.2743, + "step": 3854 + }, + { + "epoch": 0.7225866916588566, + "grad_norm": 51203.87109375, + "learning_rate": 9.114575930016385e-05, + "loss": 2.3618, + "step": 3855 + }, + { + "epoch": 0.7227741330834114, + "grad_norm": 46842.453125, + "learning_rate": 9.114129420830903e-05, + "loss": 2.319, + "step": 3856 + }, + { + "epoch": 0.7229615745079663, + "grad_norm": 51080.890625, + "learning_rate": 9.113682810031758e-05, + "loss": 2.2869, + "step": 3857 + }, + { + "epoch": 0.7231490159325211, + "grad_norm": 46722.9375, + "learning_rate": 9.113236097629982e-05, + "loss": 2.2897, + "step": 3858 + }, + { + "epoch": 0.723336457357076, + "grad_norm": 47769.9140625, + "learning_rate": 9.112789283636606e-05, + "loss": 2.3479, + "step": 3859 + }, + { + "epoch": 0.7235238987816307, + "grad_norm": 51615.68359375, + "learning_rate": 9.112342368062668e-05, + "loss": 2.3407, + "step": 3860 + }, + { + "epoch": 0.7237113402061855, + "grad_norm": 48757.7109375, + "learning_rate": 9.111895350919204e-05, + "loss": 2.3612, + "step": 3861 + }, + { + "epoch": 0.7238987816307404, + "grad_norm": 48502.90234375, + "learning_rate": 9.111448232217258e-05, + "loss": 2.3154, + "step": 3862 + }, + { + "epoch": 0.7240862230552952, + "grad_norm": 51452.18359375, + "learning_rate": 9.11100101196787e-05, + "loss": 2.3045, + "step": 3863 + }, + { + "epoch": 0.7242736644798501, + "grad_norm": 46747.38671875, + "learning_rate": 9.110553690182087e-05, + "loss": 2.2904, + "step": 3864 + }, + { + "epoch": 0.7244611059044048, + "grad_norm": 47905.96484375, + "learning_rate": 9.110106266870957e-05, + "loss": 2.3501, + "step": 3865 + }, + { + "epoch": 0.7246485473289597, + "grad_norm": 49957.2734375, + "learning_rate": 9.109658742045532e-05, + "loss": 2.3877, + "step": 3866 + }, + { + "epoch": 0.7248359887535145, + "grad_norm": 52415.05859375, + "learning_rate": 9.109211115716864e-05, + "loss": 2.2884, + "step": 3867 + }, + { + "epoch": 0.7250234301780694, + "grad_norm": 52052.48828125, + "learning_rate": 9.10876338789601e-05, + "loss": 2.3724, + "step": 3868 + }, + { + "epoch": 0.7252108716026242, + "grad_norm": 48713.828125, + "learning_rate": 9.108315558594024e-05, + "loss": 2.3673, + "step": 3869 + }, + { + "epoch": 0.725398313027179, + "grad_norm": 54716.015625, + "learning_rate": 9.107867627821973e-05, + "loss": 2.3642, + "step": 3870 + }, + { + "epoch": 0.7255857544517338, + "grad_norm": 46635.703125, + "learning_rate": 9.107419595590918e-05, + "loss": 2.2714, + "step": 3871 + }, + { + "epoch": 0.7257731958762886, + "grad_norm": 50640.83203125, + "learning_rate": 9.106971461911923e-05, + "loss": 2.3149, + "step": 3872 + }, + { + "epoch": 0.7259606373008435, + "grad_norm": 51068.13671875, + "learning_rate": 9.106523226796057e-05, + "loss": 2.3057, + "step": 3873 + }, + { + "epoch": 0.7261480787253983, + "grad_norm": 50483.7578125, + "learning_rate": 9.106074890254393e-05, + "loss": 2.3563, + "step": 3874 + }, + { + "epoch": 0.7263355201499532, + "grad_norm": 45741.0859375, + "learning_rate": 9.105626452298002e-05, + "loss": 2.3306, + "step": 3875 + }, + { + "epoch": 0.726522961574508, + "grad_norm": 47708.94921875, + "learning_rate": 9.10517791293796e-05, + "loss": 2.3614, + "step": 3876 + }, + { + "epoch": 0.7267104029990628, + "grad_norm": 54189.15234375, + "learning_rate": 9.104729272185348e-05, + "loss": 2.3525, + "step": 3877 + }, + { + "epoch": 0.7268978444236176, + "grad_norm": 49506.7265625, + "learning_rate": 9.104280530051243e-05, + "loss": 2.3539, + "step": 3878 + }, + { + "epoch": 0.7270852858481724, + "grad_norm": 51433.6015625, + "learning_rate": 9.103831686546733e-05, + "loss": 2.6471, + "step": 3879 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 49396.078125, + "learning_rate": 9.103382741682898e-05, + "loss": 2.3774, + "step": 3880 + }, + { + "epoch": 0.7274601686972821, + "grad_norm": 46803.34375, + "learning_rate": 9.10293369547083e-05, + "loss": 2.342, + "step": 3881 + }, + { + "epoch": 0.7276476101218369, + "grad_norm": 51273.70703125, + "learning_rate": 9.102484547921621e-05, + "loss": 2.3428, + "step": 3882 + }, + { + "epoch": 0.7278350515463917, + "grad_norm": 44292.8125, + "learning_rate": 9.10203529904636e-05, + "loss": 2.3447, + "step": 3883 + }, + { + "epoch": 0.7280224929709466, + "grad_norm": 47997.3359375, + "learning_rate": 9.101585948856147e-05, + "loss": 2.3502, + "step": 3884 + }, + { + "epoch": 0.7282099343955014, + "grad_norm": 50583.76171875, + "learning_rate": 9.10113649736208e-05, + "loss": 2.2869, + "step": 3885 + }, + { + "epoch": 0.7283973758200563, + "grad_norm": 50976.1875, + "learning_rate": 9.100686944575257e-05, + "loss": 2.3187, + "step": 3886 + }, + { + "epoch": 0.7285848172446111, + "grad_norm": 46517.2578125, + "learning_rate": 9.100237290506783e-05, + "loss": 2.3326, + "step": 3887 + }, + { + "epoch": 0.7287722586691658, + "grad_norm": 52914.2578125, + "learning_rate": 9.099787535167764e-05, + "loss": 2.314, + "step": 3888 + }, + { + "epoch": 0.7289597000937207, + "grad_norm": 49244.16015625, + "learning_rate": 9.099337678569309e-05, + "loss": 2.3426, + "step": 3889 + }, + { + "epoch": 0.7291471415182755, + "grad_norm": 49590.4453125, + "learning_rate": 9.098887720722528e-05, + "loss": 2.3262, + "step": 3890 + }, + { + "epoch": 0.7293345829428304, + "grad_norm": 46868.77734375, + "learning_rate": 9.098437661638534e-05, + "loss": 2.3351, + "step": 3891 + }, + { + "epoch": 0.7295220243673852, + "grad_norm": 45129.37109375, + "learning_rate": 9.097987501328444e-05, + "loss": 2.3135, + "step": 3892 + }, + { + "epoch": 0.7297094657919401, + "grad_norm": 52991.55859375, + "learning_rate": 9.097537239803376e-05, + "loss": 2.3439, + "step": 3893 + }, + { + "epoch": 0.7298969072164948, + "grad_norm": 49734.75, + "learning_rate": 9.09708687707445e-05, + "loss": 2.3294, + "step": 3894 + }, + { + "epoch": 0.7300843486410497, + "grad_norm": 48419.19921875, + "learning_rate": 9.09663641315279e-05, + "loss": 2.3891, + "step": 3895 + }, + { + "epoch": 0.7302717900656045, + "grad_norm": 52308.38671875, + "learning_rate": 9.096185848049523e-05, + "loss": 2.2717, + "step": 3896 + }, + { + "epoch": 0.7304592314901593, + "grad_norm": 50121.03515625, + "learning_rate": 9.095735181775778e-05, + "loss": 2.3309, + "step": 3897 + }, + { + "epoch": 0.7306466729147142, + "grad_norm": 47626.3125, + "learning_rate": 9.095284414342681e-05, + "loss": 2.2954, + "step": 3898 + }, + { + "epoch": 0.7308341143392689, + "grad_norm": 49306.0703125, + "learning_rate": 9.09483354576137e-05, + "loss": 2.3347, + "step": 3899 + }, + { + "epoch": 0.7310215557638238, + "grad_norm": 51512.48828125, + "learning_rate": 9.094382576042979e-05, + "loss": 2.3053, + "step": 3900 + }, + { + "epoch": 0.7312089971883786, + "grad_norm": 50837.890625, + "learning_rate": 9.093931505198647e-05, + "loss": 2.2987, + "step": 3901 + }, + { + "epoch": 0.7313964386129335, + "grad_norm": 48207.87109375, + "learning_rate": 9.093480333239515e-05, + "loss": 2.3556, + "step": 3902 + }, + { + "epoch": 0.7315838800374883, + "grad_norm": 48703.546875, + "learning_rate": 9.093029060176726e-05, + "loss": 2.3476, + "step": 3903 + }, + { + "epoch": 0.7317713214620432, + "grad_norm": 48629.24609375, + "learning_rate": 9.092577686021423e-05, + "loss": 2.2915, + "step": 3904 + }, + { + "epoch": 0.7319587628865979, + "grad_norm": 47288.96484375, + "learning_rate": 9.092126210784761e-05, + "loss": 2.2425, + "step": 3905 + }, + { + "epoch": 0.7321462043111527, + "grad_norm": 48741.8671875, + "learning_rate": 9.091674634477887e-05, + "loss": 2.3482, + "step": 3906 + }, + { + "epoch": 0.7323336457357076, + "grad_norm": 48841.01953125, + "learning_rate": 9.091222957111953e-05, + "loss": 2.3156, + "step": 3907 + }, + { + "epoch": 0.7325210871602624, + "grad_norm": 50615.7890625, + "learning_rate": 9.090771178698116e-05, + "loss": 2.3637, + "step": 3908 + }, + { + "epoch": 0.7327085285848173, + "grad_norm": 55213.765625, + "learning_rate": 9.090319299247536e-05, + "loss": 2.2749, + "step": 3909 + }, + { + "epoch": 0.7328959700093721, + "grad_norm": 49459.57421875, + "learning_rate": 9.089867318771373e-05, + "loss": 2.3436, + "step": 3910 + }, + { + "epoch": 0.7330834114339269, + "grad_norm": 49519.7421875, + "learning_rate": 9.089415237280787e-05, + "loss": 2.2802, + "step": 3911 + }, + { + "epoch": 0.7332708528584817, + "grad_norm": 48487.5390625, + "learning_rate": 9.08896305478695e-05, + "loss": 2.3248, + "step": 3912 + }, + { + "epoch": 0.7334582942830365, + "grad_norm": 51047.203125, + "learning_rate": 9.088510771301025e-05, + "loss": 2.2859, + "step": 3913 + }, + { + "epoch": 0.7336457357075914, + "grad_norm": 47116.734375, + "learning_rate": 9.088058386834184e-05, + "loss": 2.291, + "step": 3914 + }, + { + "epoch": 0.7338331771321462, + "grad_norm": 47256.44921875, + "learning_rate": 9.087605901397603e-05, + "loss": 2.3119, + "step": 3915 + }, + { + "epoch": 0.734020618556701, + "grad_norm": 50267.9375, + "learning_rate": 9.087153315002454e-05, + "loss": 2.3121, + "step": 3916 + }, + { + "epoch": 0.7342080599812558, + "grad_norm": 51968.7109375, + "learning_rate": 9.086700627659919e-05, + "loss": 2.3578, + "step": 3917 + }, + { + "epoch": 0.7343955014058107, + "grad_norm": 48206.421875, + "learning_rate": 9.086247839381177e-05, + "loss": 2.3786, + "step": 3918 + }, + { + "epoch": 0.7345829428303655, + "grad_norm": 53693.28515625, + "learning_rate": 9.08579495017741e-05, + "loss": 2.3023, + "step": 3919 + }, + { + "epoch": 0.7347703842549204, + "grad_norm": 46123.15234375, + "learning_rate": 9.085341960059806e-05, + "loss": 2.3697, + "step": 3920 + }, + { + "epoch": 0.7349578256794752, + "grad_norm": 51795.65234375, + "learning_rate": 9.084888869039552e-05, + "loss": 2.2255, + "step": 3921 + }, + { + "epoch": 0.7351452671040299, + "grad_norm": 48811.16796875, + "learning_rate": 9.084435677127839e-05, + "loss": 2.3494, + "step": 3922 + }, + { + "epoch": 0.7353327085285848, + "grad_norm": 52101.29296875, + "learning_rate": 9.083982384335859e-05, + "loss": 2.2942, + "step": 3923 + }, + { + "epoch": 0.7355201499531396, + "grad_norm": 50610.59375, + "learning_rate": 9.08352899067481e-05, + "loss": 2.3409, + "step": 3924 + }, + { + "epoch": 0.7357075913776945, + "grad_norm": 47375.2578125, + "learning_rate": 9.083075496155891e-05, + "loss": 2.3476, + "step": 3925 + }, + { + "epoch": 0.7358950328022493, + "grad_norm": 51230.9375, + "learning_rate": 9.082621900790299e-05, + "loss": 2.3963, + "step": 3926 + }, + { + "epoch": 0.7360824742268042, + "grad_norm": 51875.25, + "learning_rate": 9.082168204589241e-05, + "loss": 2.2264, + "step": 3927 + }, + { + "epoch": 0.7362699156513589, + "grad_norm": 47590.21484375, + "learning_rate": 9.081714407563919e-05, + "loss": 2.2434, + "step": 3928 + }, + { + "epoch": 0.7364573570759138, + "grad_norm": 47924.921875, + "learning_rate": 9.081260509725545e-05, + "loss": 2.3425, + "step": 3929 + }, + { + "epoch": 0.7366447985004686, + "grad_norm": 51605.48046875, + "learning_rate": 9.080806511085328e-05, + "loss": 2.2919, + "step": 3930 + }, + { + "epoch": 0.7368322399250234, + "grad_norm": 49747.49609375, + "learning_rate": 9.080352411654478e-05, + "loss": 2.3338, + "step": 3931 + }, + { + "epoch": 0.7370196813495783, + "grad_norm": 48448.08984375, + "learning_rate": 9.079898211444216e-05, + "loss": 2.2736, + "step": 3932 + }, + { + "epoch": 0.737207122774133, + "grad_norm": 49697.01953125, + "learning_rate": 9.079443910465757e-05, + "loss": 2.3418, + "step": 3933 + }, + { + "epoch": 0.7373945641986879, + "grad_norm": 50502.36328125, + "learning_rate": 9.078989508730323e-05, + "loss": 2.3293, + "step": 3934 + }, + { + "epoch": 0.7375820056232427, + "grad_norm": 51971.921875, + "learning_rate": 9.078535006249136e-05, + "loss": 2.2888, + "step": 3935 + }, + { + "epoch": 0.7377694470477976, + "grad_norm": 48251.23046875, + "learning_rate": 9.078080403033424e-05, + "loss": 2.3322, + "step": 3936 + }, + { + "epoch": 0.7379568884723524, + "grad_norm": 49965.37109375, + "learning_rate": 9.077625699094412e-05, + "loss": 2.3001, + "step": 3937 + }, + { + "epoch": 0.7381443298969073, + "grad_norm": 50405.95703125, + "learning_rate": 9.077170894443333e-05, + "loss": 2.2558, + "step": 3938 + }, + { + "epoch": 0.738331771321462, + "grad_norm": 47410.6796875, + "learning_rate": 9.076715989091417e-05, + "loss": 2.2716, + "step": 3939 + }, + { + "epoch": 0.7385192127460168, + "grad_norm": 50249.17578125, + "learning_rate": 9.076260983049903e-05, + "loss": 2.3868, + "step": 3940 + }, + { + "epoch": 0.7387066541705717, + "grad_norm": 50300.3984375, + "learning_rate": 9.075805876330028e-05, + "loss": 2.3431, + "step": 3941 + }, + { + "epoch": 0.7388940955951265, + "grad_norm": 49196.71484375, + "learning_rate": 9.07535066894303e-05, + "loss": 2.3513, + "step": 3942 + }, + { + "epoch": 0.7390815370196814, + "grad_norm": 54900.10546875, + "learning_rate": 9.074895360900156e-05, + "loss": 2.3202, + "step": 3943 + }, + { + "epoch": 0.7392689784442362, + "grad_norm": 53429.8515625, + "learning_rate": 9.07443995221265e-05, + "loss": 2.2729, + "step": 3944 + }, + { + "epoch": 0.739456419868791, + "grad_norm": 54188.57421875, + "learning_rate": 9.073984442891759e-05, + "loss": 2.3335, + "step": 3945 + }, + { + "epoch": 0.7396438612933458, + "grad_norm": 49920.31640625, + "learning_rate": 9.073528832948735e-05, + "loss": 2.3252, + "step": 3946 + }, + { + "epoch": 0.7398313027179007, + "grad_norm": 49566.24609375, + "learning_rate": 9.07307312239483e-05, + "loss": 2.3391, + "step": 3947 + }, + { + "epoch": 0.7400187441424555, + "grad_norm": 60309.8515625, + "learning_rate": 9.072617311241301e-05, + "loss": 2.2916, + "step": 3948 + }, + { + "epoch": 0.7402061855670103, + "grad_norm": 56411.71484375, + "learning_rate": 9.072161399499401e-05, + "loss": 2.3853, + "step": 3949 + }, + { + "epoch": 0.7403936269915652, + "grad_norm": 47775.28125, + "learning_rate": 9.071705387180398e-05, + "loss": 2.3186, + "step": 3950 + }, + { + "epoch": 0.7405810684161199, + "grad_norm": 50545.890625, + "learning_rate": 9.071249274295548e-05, + "loss": 2.3403, + "step": 3951 + }, + { + "epoch": 0.7407685098406748, + "grad_norm": 54327.38671875, + "learning_rate": 9.070793060856122e-05, + "loss": 2.3119, + "step": 3952 + }, + { + "epoch": 0.7409559512652296, + "grad_norm": 48315.0078125, + "learning_rate": 9.070336746873383e-05, + "loss": 2.3367, + "step": 3953 + }, + { + "epoch": 0.7411433926897845, + "grad_norm": 53202.6796875, + "learning_rate": 9.069880332358604e-05, + "loss": 2.2631, + "step": 3954 + }, + { + "epoch": 0.7413308341143393, + "grad_norm": 49252.4296875, + "learning_rate": 9.069423817323059e-05, + "loss": 2.3616, + "step": 3955 + }, + { + "epoch": 0.741518275538894, + "grad_norm": 49777.9609375, + "learning_rate": 9.068967201778021e-05, + "loss": 2.3552, + "step": 3956 + }, + { + "epoch": 0.7417057169634489, + "grad_norm": 55218.9453125, + "learning_rate": 9.068510485734768e-05, + "loss": 2.2683, + "step": 3957 + }, + { + "epoch": 0.7418931583880037, + "grad_norm": 52169.9375, + "learning_rate": 9.068053669204582e-05, + "loss": 2.2912, + "step": 3958 + }, + { + "epoch": 0.7420805998125586, + "grad_norm": 51157.09375, + "learning_rate": 9.067596752198744e-05, + "loss": 2.2509, + "step": 3959 + }, + { + "epoch": 0.7422680412371134, + "grad_norm": 53139.44921875, + "learning_rate": 9.06713973472854e-05, + "loss": 2.3054, + "step": 3960 + }, + { + "epoch": 0.7424554826616683, + "grad_norm": 51218.0234375, + "learning_rate": 9.066682616805258e-05, + "loss": 2.2955, + "step": 3961 + }, + { + "epoch": 0.742642924086223, + "grad_norm": 52028.8515625, + "learning_rate": 9.066225398440186e-05, + "loss": 2.3132, + "step": 3962 + }, + { + "epoch": 0.7428303655107779, + "grad_norm": 60169.16796875, + "learning_rate": 9.06576807964462e-05, + "loss": 2.2914, + "step": 3963 + }, + { + "epoch": 0.7430178069353327, + "grad_norm": 52736.484375, + "learning_rate": 9.065310660429855e-05, + "loss": 2.4189, + "step": 3964 + }, + { + "epoch": 0.7432052483598875, + "grad_norm": 47802.58203125, + "learning_rate": 9.064853140807186e-05, + "loss": 2.3726, + "step": 3965 + }, + { + "epoch": 0.7433926897844424, + "grad_norm": 49392.57421875, + "learning_rate": 9.064395520787915e-05, + "loss": 2.3845, + "step": 3966 + }, + { + "epoch": 0.7435801312089972, + "grad_norm": 50422.40625, + "learning_rate": 9.063937800383345e-05, + "loss": 2.3596, + "step": 3967 + }, + { + "epoch": 0.743767572633552, + "grad_norm": 47017.1484375, + "learning_rate": 9.063479979604779e-05, + "loss": 2.297, + "step": 3968 + }, + { + "epoch": 0.7439550140581068, + "grad_norm": 47024.68359375, + "learning_rate": 9.063022058463527e-05, + "loss": 2.3037, + "step": 3969 + }, + { + "epoch": 0.7441424554826617, + "grad_norm": 50383.8515625, + "learning_rate": 9.062564036970897e-05, + "loss": 2.3296, + "step": 3970 + }, + { + "epoch": 0.7443298969072165, + "grad_norm": 55339.66015625, + "learning_rate": 9.062105915138202e-05, + "loss": 2.3252, + "step": 3971 + }, + { + "epoch": 0.7445173383317714, + "grad_norm": 46602.69921875, + "learning_rate": 9.061647692976758e-05, + "loss": 2.3939, + "step": 3972 + }, + { + "epoch": 0.7447047797563261, + "grad_norm": 53324.7734375, + "learning_rate": 9.061189370497883e-05, + "loss": 2.3384, + "step": 3973 + }, + { + "epoch": 0.7448922211808809, + "grad_norm": 51585.5546875, + "learning_rate": 9.060730947712895e-05, + "loss": 2.3994, + "step": 3974 + }, + { + "epoch": 0.7450796626054358, + "grad_norm": 47845.1484375, + "learning_rate": 9.060272424633117e-05, + "loss": 2.352, + "step": 3975 + }, + { + "epoch": 0.7452671040299906, + "grad_norm": 48339.7109375, + "learning_rate": 9.059813801269875e-05, + "loss": 2.3362, + "step": 3976 + }, + { + "epoch": 0.7454545454545455, + "grad_norm": 50723.34765625, + "learning_rate": 9.059355077634496e-05, + "loss": 2.2888, + "step": 3977 + }, + { + "epoch": 0.7456419868791003, + "grad_norm": 48308.85546875, + "learning_rate": 9.05889625373831e-05, + "loss": 2.3485, + "step": 3978 + }, + { + "epoch": 0.7458294283036551, + "grad_norm": 50441.25, + "learning_rate": 9.058437329592648e-05, + "loss": 2.3343, + "step": 3979 + }, + { + "epoch": 0.7460168697282099, + "grad_norm": 47627.14453125, + "learning_rate": 9.057978305208845e-05, + "loss": 2.3058, + "step": 3980 + }, + { + "epoch": 0.7462043111527648, + "grad_norm": 55322.953125, + "learning_rate": 9.057519180598241e-05, + "loss": 2.3273, + "step": 3981 + }, + { + "epoch": 0.7463917525773196, + "grad_norm": 45624.796875, + "learning_rate": 9.057059955772173e-05, + "loss": 2.3056, + "step": 3982 + }, + { + "epoch": 0.7465791940018744, + "grad_norm": 47608.2265625, + "learning_rate": 9.056600630741984e-05, + "loss": 2.2876, + "step": 3983 + }, + { + "epoch": 0.7467666354264293, + "grad_norm": 49929.51171875, + "learning_rate": 9.05614120551902e-05, + "loss": 2.3446, + "step": 3984 + }, + { + "epoch": 0.746954076850984, + "grad_norm": 47587.72265625, + "learning_rate": 9.055681680114626e-05, + "loss": 2.3731, + "step": 3985 + }, + { + "epoch": 0.7471415182755389, + "grad_norm": 48617.7265625, + "learning_rate": 9.055222054540152e-05, + "loss": 2.3149, + "step": 3986 + }, + { + "epoch": 0.7473289597000937, + "grad_norm": 47426.421875, + "learning_rate": 9.054762328806951e-05, + "loss": 2.3634, + "step": 3987 + }, + { + "epoch": 0.7475164011246486, + "grad_norm": 48798.7734375, + "learning_rate": 9.05430250292638e-05, + "loss": 2.3013, + "step": 3988 + }, + { + "epoch": 0.7477038425492034, + "grad_norm": 47264.125, + "learning_rate": 9.053842576909792e-05, + "loss": 2.2896, + "step": 3989 + }, + { + "epoch": 0.7478912839737581, + "grad_norm": 59082.60546875, + "learning_rate": 9.053382550768548e-05, + "loss": 2.2637, + "step": 3990 + }, + { + "epoch": 0.748078725398313, + "grad_norm": 53833.6328125, + "learning_rate": 9.05292242451401e-05, + "loss": 2.3354, + "step": 3991 + }, + { + "epoch": 0.7482661668228678, + "grad_norm": 48281.4609375, + "learning_rate": 9.052462198157543e-05, + "loss": 2.2761, + "step": 3992 + }, + { + "epoch": 0.7484536082474227, + "grad_norm": 49888.55078125, + "learning_rate": 9.052001871710514e-05, + "loss": 2.3315, + "step": 3993 + }, + { + "epoch": 0.7486410496719775, + "grad_norm": 50052.81640625, + "learning_rate": 9.051541445184293e-05, + "loss": 2.3893, + "step": 3994 + }, + { + "epoch": 0.7488284910965324, + "grad_norm": 44073.48046875, + "learning_rate": 9.051080918590251e-05, + "loss": 2.3528, + "step": 3995 + }, + { + "epoch": 0.7490159325210871, + "grad_norm": 53439.2890625, + "learning_rate": 9.050620291939762e-05, + "loss": 2.3548, + "step": 3996 + }, + { + "epoch": 0.749203373945642, + "grad_norm": 51208.22265625, + "learning_rate": 9.050159565244205e-05, + "loss": 2.2966, + "step": 3997 + }, + { + "epoch": 0.7493908153701968, + "grad_norm": 49963.3984375, + "learning_rate": 9.049698738514956e-05, + "loss": 2.3523, + "step": 3998 + }, + { + "epoch": 0.7495782567947517, + "grad_norm": 50184.80078125, + "learning_rate": 9.049237811763399e-05, + "loss": 2.3722, + "step": 3999 + }, + { + "epoch": 0.7497656982193065, + "grad_norm": 46615.50390625, + "learning_rate": 9.048776785000919e-05, + "loss": 2.3697, + "step": 4000 + }, + { + "epoch": 0.7497656982193065, + "eval_loss": 2.3236632347106934, + "eval_runtime": 130.4377, + "eval_samples_per_second": 38.708, + "eval_steps_per_second": 1.94, + "step": 4000 + }, + { + "epoch": 0.7499531396438613, + "grad_norm": 53330.1015625, + "learning_rate": 9.0483156582389e-05, + "loss": 2.3096, + "step": 4001 + }, + { + "epoch": 0.7501405810684161, + "grad_norm": 50077.28125, + "learning_rate": 9.047854431488734e-05, + "loss": 2.29, + "step": 4002 + }, + { + "epoch": 0.7503280224929709, + "grad_norm": 47845.8671875, + "learning_rate": 9.047393104761812e-05, + "loss": 2.3019, + "step": 4003 + }, + { + "epoch": 0.7505154639175258, + "grad_norm": 54923.58203125, + "learning_rate": 9.046931678069528e-05, + "loss": 2.3314, + "step": 4004 + }, + { + "epoch": 0.7507029053420806, + "grad_norm": 52324.48046875, + "learning_rate": 9.046470151423278e-05, + "loss": 2.351, + "step": 4005 + }, + { + "epoch": 0.7508903467666355, + "grad_norm": 48103.453125, + "learning_rate": 9.046008524834462e-05, + "loss": 2.3276, + "step": 4006 + }, + { + "epoch": 0.7510777881911902, + "grad_norm": 51461.2578125, + "learning_rate": 9.045546798314479e-05, + "loss": 2.2773, + "step": 4007 + }, + { + "epoch": 0.751265229615745, + "grad_norm": 50023.08203125, + "learning_rate": 9.045084971874738e-05, + "loss": 2.2754, + "step": 4008 + }, + { + "epoch": 0.7514526710402999, + "grad_norm": 50786.78515625, + "learning_rate": 9.044623045526641e-05, + "loss": 2.2655, + "step": 4009 + }, + { + "epoch": 0.7516401124648547, + "grad_norm": 53700.265625, + "learning_rate": 9.044161019281599e-05, + "loss": 2.3257, + "step": 4010 + }, + { + "epoch": 0.7518275538894096, + "grad_norm": 48836.734375, + "learning_rate": 9.043698893151025e-05, + "loss": 2.2778, + "step": 4011 + }, + { + "epoch": 0.7520149953139644, + "grad_norm": 46095.76953125, + "learning_rate": 9.043236667146328e-05, + "loss": 2.3166, + "step": 4012 + }, + { + "epoch": 0.7522024367385192, + "grad_norm": 51541.47265625, + "learning_rate": 9.04277434127893e-05, + "loss": 2.3068, + "step": 4013 + }, + { + "epoch": 0.752389878163074, + "grad_norm": 52755.80078125, + "learning_rate": 9.042311915560247e-05, + "loss": 2.3637, + "step": 4014 + }, + { + "epoch": 0.7525773195876289, + "grad_norm": 52473.02734375, + "learning_rate": 9.041849390001698e-05, + "loss": 2.3068, + "step": 4015 + }, + { + "epoch": 0.7527647610121837, + "grad_norm": 49620.87890625, + "learning_rate": 9.041386764614713e-05, + "loss": 2.2807, + "step": 4016 + }, + { + "epoch": 0.7529522024367385, + "grad_norm": 47378.30859375, + "learning_rate": 9.040924039410712e-05, + "loss": 2.4016, + "step": 4017 + }, + { + "epoch": 0.7531396438612934, + "grad_norm": 50429.58984375, + "learning_rate": 9.040461214401129e-05, + "loss": 2.27, + "step": 4018 + }, + { + "epoch": 0.7533270852858481, + "grad_norm": 50757.25390625, + "learning_rate": 9.03999828959739e-05, + "loss": 2.4151, + "step": 4019 + }, + { + "epoch": 0.753514526710403, + "grad_norm": 48169.62109375, + "learning_rate": 9.039535265010933e-05, + "loss": 2.3534, + "step": 4020 + }, + { + "epoch": 0.7537019681349578, + "grad_norm": 48460.26953125, + "learning_rate": 9.03907214065319e-05, + "loss": 2.322, + "step": 4021 + }, + { + "epoch": 0.7538894095595127, + "grad_norm": 49729.18359375, + "learning_rate": 9.038608916535602e-05, + "loss": 2.3513, + "step": 4022 + }, + { + "epoch": 0.7540768509840675, + "grad_norm": 46472.265625, + "learning_rate": 9.038145592669612e-05, + "loss": 2.2996, + "step": 4023 + }, + { + "epoch": 0.7542642924086224, + "grad_norm": 57242.25390625, + "learning_rate": 9.037682169066662e-05, + "loss": 2.2818, + "step": 4024 + }, + { + "epoch": 0.7544517338331771, + "grad_norm": 49629.76953125, + "learning_rate": 9.037218645738195e-05, + "loss": 2.3356, + "step": 4025 + }, + { + "epoch": 0.7546391752577319, + "grad_norm": 51700.01171875, + "learning_rate": 9.036755022695662e-05, + "loss": 2.295, + "step": 4026 + }, + { + "epoch": 0.7548266166822868, + "grad_norm": 49364.9921875, + "learning_rate": 9.036291299950514e-05, + "loss": 2.3436, + "step": 4027 + }, + { + "epoch": 0.7550140581068416, + "grad_norm": 50001.875, + "learning_rate": 9.035827477514204e-05, + "loss": 2.3269, + "step": 4028 + }, + { + "epoch": 0.7552014995313965, + "grad_norm": 50513.578125, + "learning_rate": 9.035363555398189e-05, + "loss": 2.245, + "step": 4029 + }, + { + "epoch": 0.7553889409559512, + "grad_norm": 48505.06640625, + "learning_rate": 9.034899533613926e-05, + "loss": 2.2789, + "step": 4030 + }, + { + "epoch": 0.7555763823805061, + "grad_norm": 51407.93359375, + "learning_rate": 9.034435412172874e-05, + "loss": 2.2938, + "step": 4031 + }, + { + "epoch": 0.7557638238050609, + "grad_norm": 51107.828125, + "learning_rate": 9.033971191086499e-05, + "loss": 2.3252, + "step": 4032 + }, + { + "epoch": 0.7559512652296158, + "grad_norm": 51335.05859375, + "learning_rate": 9.033506870366267e-05, + "loss": 2.3129, + "step": 4033 + }, + { + "epoch": 0.7561387066541706, + "grad_norm": 56670.94140625, + "learning_rate": 9.033042450023643e-05, + "loss": 2.2547, + "step": 4034 + }, + { + "epoch": 0.7563261480787254, + "grad_norm": 48837.9140625, + "learning_rate": 9.032577930070101e-05, + "loss": 2.3058, + "step": 4035 + }, + { + "epoch": 0.7565135895032802, + "grad_norm": 46964.87890625, + "learning_rate": 9.032113310517111e-05, + "loss": 2.345, + "step": 4036 + }, + { + "epoch": 0.756701030927835, + "grad_norm": 46718.98046875, + "learning_rate": 9.03164859137615e-05, + "loss": 2.3008, + "step": 4037 + }, + { + "epoch": 0.7568884723523899, + "grad_norm": 54924.76171875, + "learning_rate": 9.031183772658698e-05, + "loss": 2.186, + "step": 4038 + }, + { + "epoch": 0.7570759137769447, + "grad_norm": 52574.58203125, + "learning_rate": 9.030718854376231e-05, + "loss": 2.2943, + "step": 4039 + }, + { + "epoch": 0.7572633552014996, + "grad_norm": 49384.015625, + "learning_rate": 9.030253836540235e-05, + "loss": 2.2885, + "step": 4040 + }, + { + "epoch": 0.7574507966260544, + "grad_norm": 50869.08984375, + "learning_rate": 9.029788719162195e-05, + "loss": 2.329, + "step": 4041 + }, + { + "epoch": 0.7576382380506091, + "grad_norm": 57156.25, + "learning_rate": 9.029323502253598e-05, + "loss": 2.3419, + "step": 4042 + }, + { + "epoch": 0.757825679475164, + "grad_norm": 48018.09765625, + "learning_rate": 9.028858185825934e-05, + "loss": 2.3341, + "step": 4043 + }, + { + "epoch": 0.7580131208997188, + "grad_norm": 51389.265625, + "learning_rate": 9.028392769890697e-05, + "loss": 2.2544, + "step": 4044 + }, + { + "epoch": 0.7582005623242737, + "grad_norm": 49632.1953125, + "learning_rate": 9.027927254459381e-05, + "loss": 2.2989, + "step": 4045 + }, + { + "epoch": 0.7583880037488285, + "grad_norm": 52742.48828125, + "learning_rate": 9.027461639543484e-05, + "loss": 2.3505, + "step": 4046 + }, + { + "epoch": 0.7585754451733833, + "grad_norm": 51482.46484375, + "learning_rate": 9.026995925154507e-05, + "loss": 2.351, + "step": 4047 + }, + { + "epoch": 0.7587628865979381, + "grad_norm": 47899.0234375, + "learning_rate": 9.026530111303951e-05, + "loss": 2.344, + "step": 4048 + }, + { + "epoch": 0.758950328022493, + "grad_norm": 49314.19921875, + "learning_rate": 9.026064198003321e-05, + "loss": 2.3386, + "step": 4049 + }, + { + "epoch": 0.7591377694470478, + "grad_norm": 51102.3671875, + "learning_rate": 9.025598185264126e-05, + "loss": 2.3163, + "step": 4050 + }, + { + "epoch": 0.7593252108716027, + "grad_norm": 48927.46875, + "learning_rate": 9.025132073097875e-05, + "loss": 2.2902, + "step": 4051 + }, + { + "epoch": 0.7595126522961575, + "grad_norm": 50769.484375, + "learning_rate": 9.024665861516081e-05, + "loss": 2.3098, + "step": 4052 + }, + { + "epoch": 0.7597000937207122, + "grad_norm": 48916.23828125, + "learning_rate": 9.024199550530257e-05, + "loss": 2.2979, + "step": 4053 + }, + { + "epoch": 0.7598875351452671, + "grad_norm": 47578.625, + "learning_rate": 9.023733140151923e-05, + "loss": 2.3177, + "step": 4054 + }, + { + "epoch": 0.7600749765698219, + "grad_norm": 49295.640625, + "learning_rate": 9.023266630392597e-05, + "loss": 2.3267, + "step": 4055 + }, + { + "epoch": 0.7602624179943768, + "grad_norm": 55931.5546875, + "learning_rate": 9.0228000212638e-05, + "loss": 2.2367, + "step": 4056 + }, + { + "epoch": 0.7604498594189316, + "grad_norm": 48993.04296875, + "learning_rate": 9.02233331277706e-05, + "loss": 2.3102, + "step": 4057 + }, + { + "epoch": 0.7606373008434865, + "grad_norm": 48853.78125, + "learning_rate": 9.021866504943902e-05, + "loss": 2.29, + "step": 4058 + }, + { + "epoch": 0.7608247422680412, + "grad_norm": 53779.5625, + "learning_rate": 9.021399597775854e-05, + "loss": 2.3515, + "step": 4059 + }, + { + "epoch": 0.761012183692596, + "grad_norm": 51996.6875, + "learning_rate": 9.020932591284453e-05, + "loss": 2.3705, + "step": 4060 + }, + { + "epoch": 0.7611996251171509, + "grad_norm": 51344.1953125, + "learning_rate": 9.020465485481228e-05, + "loss": 2.3089, + "step": 4061 + }, + { + "epoch": 0.7613870665417057, + "grad_norm": 48587.8203125, + "learning_rate": 9.019998280377717e-05, + "loss": 2.3369, + "step": 4062 + }, + { + "epoch": 0.7615745079662606, + "grad_norm": 51426.046875, + "learning_rate": 9.019530975985463e-05, + "loss": 2.2584, + "step": 4063 + }, + { + "epoch": 0.7617619493908153, + "grad_norm": 49608.6953125, + "learning_rate": 9.019063572316004e-05, + "loss": 2.3275, + "step": 4064 + }, + { + "epoch": 0.7619493908153702, + "grad_norm": 48197.140625, + "learning_rate": 9.018596069380885e-05, + "loss": 2.338, + "step": 4065 + }, + { + "epoch": 0.762136832239925, + "grad_norm": 49281.23046875, + "learning_rate": 9.018128467191654e-05, + "loss": 2.3386, + "step": 4066 + }, + { + "epoch": 0.7623242736644799, + "grad_norm": 48526.65625, + "learning_rate": 9.017660765759861e-05, + "loss": 2.3405, + "step": 4067 + }, + { + "epoch": 0.7625117150890347, + "grad_norm": 49185.80859375, + "learning_rate": 9.017192965097055e-05, + "loss": 2.2919, + "step": 4068 + }, + { + "epoch": 0.7626991565135895, + "grad_norm": 50400.1796875, + "learning_rate": 9.016725065214789e-05, + "loss": 2.386, + "step": 4069 + }, + { + "epoch": 0.7628865979381443, + "grad_norm": 49349.171875, + "learning_rate": 9.016257066124623e-05, + "loss": 2.4165, + "step": 4070 + }, + { + "epoch": 0.7630740393626991, + "grad_norm": 49228.47265625, + "learning_rate": 9.015788967838115e-05, + "loss": 2.3899, + "step": 4071 + }, + { + "epoch": 0.763261480787254, + "grad_norm": 54667.18359375, + "learning_rate": 9.015320770366826e-05, + "loss": 2.3521, + "step": 4072 + }, + { + "epoch": 0.7634489222118088, + "grad_norm": 47081.2890625, + "learning_rate": 9.01485247372232e-05, + "loss": 2.3532, + "step": 4073 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 47564.0078125, + "learning_rate": 9.014384077916163e-05, + "loss": 2.3366, + "step": 4074 + }, + { + "epoch": 0.7638238050609185, + "grad_norm": 48345.5390625, + "learning_rate": 9.013915582959924e-05, + "loss": 2.2755, + "step": 4075 + }, + { + "epoch": 0.7640112464854732, + "grad_norm": 52679.8359375, + "learning_rate": 9.013446988865175e-05, + "loss": 2.442, + "step": 4076 + }, + { + "epoch": 0.7641986879100281, + "grad_norm": 51129.78515625, + "learning_rate": 9.012978295643488e-05, + "loss": 2.312, + "step": 4077 + }, + { + "epoch": 0.7643861293345829, + "grad_norm": 52547.23046875, + "learning_rate": 9.01250950330644e-05, + "loss": 2.3213, + "step": 4078 + }, + { + "epoch": 0.7645735707591378, + "grad_norm": 52268.984375, + "learning_rate": 9.01204061186561e-05, + "loss": 2.3259, + "step": 4079 + }, + { + "epoch": 0.7647610121836926, + "grad_norm": 47955.71484375, + "learning_rate": 9.011571621332578e-05, + "loss": 2.3751, + "step": 4080 + }, + { + "epoch": 0.7649484536082474, + "grad_norm": 50344.72265625, + "learning_rate": 9.011102531718928e-05, + "loss": 2.3182, + "step": 4081 + }, + { + "epoch": 0.7651358950328022, + "grad_norm": 47846.671875, + "learning_rate": 9.010633343036247e-05, + "loss": 2.3049, + "step": 4082 + }, + { + "epoch": 0.7653233364573571, + "grad_norm": 48807.51953125, + "learning_rate": 9.010164055296121e-05, + "loss": 2.3565, + "step": 4083 + }, + { + "epoch": 0.7655107778819119, + "grad_norm": 48078.05078125, + "learning_rate": 9.009694668510143e-05, + "loss": 2.3153, + "step": 4084 + }, + { + "epoch": 0.7656982193064668, + "grad_norm": 49623.421875, + "learning_rate": 9.009225182689905e-05, + "loss": 2.3426, + "step": 4085 + }, + { + "epoch": 0.7658856607310216, + "grad_norm": 44848.515625, + "learning_rate": 9.008755597847003e-05, + "loss": 2.3202, + "step": 4086 + }, + { + "epoch": 0.7660731021555763, + "grad_norm": 51330.796875, + "learning_rate": 9.008285913993035e-05, + "loss": 2.3602, + "step": 4087 + }, + { + "epoch": 0.7662605435801312, + "grad_norm": 50623.11328125, + "learning_rate": 9.007816131139602e-05, + "loss": 2.3439, + "step": 4088 + }, + { + "epoch": 0.766447985004686, + "grad_norm": 49400.71875, + "learning_rate": 9.007346249298307e-05, + "loss": 2.3438, + "step": 4089 + }, + { + "epoch": 0.7666354264292409, + "grad_norm": 50860.296875, + "learning_rate": 9.006876268480755e-05, + "loss": 2.3209, + "step": 4090 + }, + { + "epoch": 0.7668228678537957, + "grad_norm": 58001.00390625, + "learning_rate": 9.006406188698552e-05, + "loss": 2.2858, + "step": 4091 + }, + { + "epoch": 0.7670103092783506, + "grad_norm": 49652.4765625, + "learning_rate": 9.005936009963314e-05, + "loss": 2.3002, + "step": 4092 + }, + { + "epoch": 0.7671977507029053, + "grad_norm": 48992.92578125, + "learning_rate": 9.005465732286648e-05, + "loss": 2.275, + "step": 4093 + }, + { + "epoch": 0.7673851921274601, + "grad_norm": 46563.609375, + "learning_rate": 9.004995355680173e-05, + "loss": 2.3269, + "step": 4094 + }, + { + "epoch": 0.767572633552015, + "grad_norm": 52959.453125, + "learning_rate": 9.004524880155505e-05, + "loss": 2.3446, + "step": 4095 + }, + { + "epoch": 0.7677600749765698, + "grad_norm": 48943.64453125, + "learning_rate": 9.004054305724266e-05, + "loss": 2.3184, + "step": 4096 + }, + { + "epoch": 0.7679475164011247, + "grad_norm": 51842.15234375, + "learning_rate": 9.003583632398073e-05, + "loss": 2.32, + "step": 4097 + }, + { + "epoch": 0.7681349578256794, + "grad_norm": 56595.61328125, + "learning_rate": 9.003112860188559e-05, + "loss": 2.2666, + "step": 4098 + }, + { + "epoch": 0.7683223992502343, + "grad_norm": 51849.765625, + "learning_rate": 9.002641989107346e-05, + "loss": 2.3396, + "step": 4099 + }, + { + "epoch": 0.7685098406747891, + "grad_norm": 47534.03515625, + "learning_rate": 9.002171019166063e-05, + "loss": 2.3641, + "step": 4100 + }, + { + "epoch": 0.768697282099344, + "grad_norm": 50203.23828125, + "learning_rate": 9.001699950376347e-05, + "loss": 2.3751, + "step": 4101 + }, + { + "epoch": 0.7688847235238988, + "grad_norm": 51135.55859375, + "learning_rate": 9.00122878274983e-05, + "loss": 2.3264, + "step": 4102 + }, + { + "epoch": 0.7690721649484537, + "grad_norm": 50190.54296875, + "learning_rate": 9.00075751629815e-05, + "loss": 2.345, + "step": 4103 + }, + { + "epoch": 0.7692596063730084, + "grad_norm": 48942.06640625, + "learning_rate": 9.000286151032946e-05, + "loss": 2.3594, + "step": 4104 + }, + { + "epoch": 0.7694470477975632, + "grad_norm": 50603.78125, + "learning_rate": 8.999814686965862e-05, + "loss": 2.3863, + "step": 4105 + }, + { + "epoch": 0.7696344892221181, + "grad_norm": 50125.4453125, + "learning_rate": 8.999343124108541e-05, + "loss": 2.3629, + "step": 4106 + }, + { + "epoch": 0.7698219306466729, + "grad_norm": 51096.40625, + "learning_rate": 8.998871462472629e-05, + "loss": 2.4157, + "step": 4107 + }, + { + "epoch": 0.7700093720712278, + "grad_norm": 49521.7421875, + "learning_rate": 8.998399702069775e-05, + "loss": 2.3559, + "step": 4108 + }, + { + "epoch": 0.7701968134957826, + "grad_norm": 45401.9921875, + "learning_rate": 8.997927842911634e-05, + "loss": 2.3233, + "step": 4109 + }, + { + "epoch": 0.7703842549203374, + "grad_norm": 52766.1328125, + "learning_rate": 8.997455885009859e-05, + "loss": 2.4143, + "step": 4110 + }, + { + "epoch": 0.7705716963448922, + "grad_norm": 55865.2109375, + "learning_rate": 8.996983828376107e-05, + "loss": 2.3655, + "step": 4111 + }, + { + "epoch": 0.770759137769447, + "grad_norm": 49698.71875, + "learning_rate": 8.996511673022035e-05, + "loss": 2.2979, + "step": 4112 + }, + { + "epoch": 0.7709465791940019, + "grad_norm": 53957.17578125, + "learning_rate": 8.996039418959306e-05, + "loss": 2.2991, + "step": 4113 + }, + { + "epoch": 0.7711340206185567, + "grad_norm": 50424.22265625, + "learning_rate": 8.995567066199586e-05, + "loss": 2.2782, + "step": 4114 + }, + { + "epoch": 0.7713214620431116, + "grad_norm": 51379.09765625, + "learning_rate": 8.995094614754538e-05, + "loss": 2.3023, + "step": 4115 + }, + { + "epoch": 0.7715089034676663, + "grad_norm": 48622.6796875, + "learning_rate": 8.994622064635833e-05, + "loss": 2.2828, + "step": 4116 + }, + { + "epoch": 0.7716963448922212, + "grad_norm": 49292.81640625, + "learning_rate": 8.994149415855144e-05, + "loss": 2.3289, + "step": 4117 + }, + { + "epoch": 0.771883786316776, + "grad_norm": 52480.6796875, + "learning_rate": 8.993676668424142e-05, + "loss": 2.3143, + "step": 4118 + }, + { + "epoch": 0.7720712277413309, + "grad_norm": 55254.15625, + "learning_rate": 8.993203822354504e-05, + "loss": 2.274, + "step": 4119 + }, + { + "epoch": 0.7722586691658857, + "grad_norm": 52153.03515625, + "learning_rate": 8.992730877657907e-05, + "loss": 2.2381, + "step": 4120 + }, + { + "epoch": 0.7724461105904404, + "grad_norm": 47935.40625, + "learning_rate": 8.992257834346036e-05, + "loss": 2.3415, + "step": 4121 + }, + { + "epoch": 0.7726335520149953, + "grad_norm": 52084.14453125, + "learning_rate": 8.991784692430574e-05, + "loss": 2.39, + "step": 4122 + }, + { + "epoch": 0.7728209934395501, + "grad_norm": 52826.81640625, + "learning_rate": 8.991311451923203e-05, + "loss": 2.2965, + "step": 4123 + }, + { + "epoch": 0.773008434864105, + "grad_norm": 50218.5234375, + "learning_rate": 8.990838112835615e-05, + "loss": 2.3203, + "step": 4124 + }, + { + "epoch": 0.7731958762886598, + "grad_norm": 47812.9765625, + "learning_rate": 8.990364675179499e-05, + "loss": 2.3472, + "step": 4125 + }, + { + "epoch": 0.7733833177132147, + "grad_norm": 44814.24609375, + "learning_rate": 8.989891138966549e-05, + "loss": 2.3745, + "step": 4126 + }, + { + "epoch": 0.7735707591377694, + "grad_norm": 55519.25390625, + "learning_rate": 8.989417504208462e-05, + "loss": 2.3331, + "step": 4127 + }, + { + "epoch": 0.7737582005623242, + "grad_norm": 49006.23828125, + "learning_rate": 8.988943770916933e-05, + "loss": 2.3017, + "step": 4128 + }, + { + "epoch": 0.7739456419868791, + "grad_norm": 52138.25, + "learning_rate": 8.988469939103667e-05, + "loss": 2.4049, + "step": 4129 + }, + { + "epoch": 0.7741330834114339, + "grad_norm": 47943.58203125, + "learning_rate": 8.987996008780364e-05, + "loss": 2.2937, + "step": 4130 + }, + { + "epoch": 0.7743205248359888, + "grad_norm": 50775.828125, + "learning_rate": 8.987521979958729e-05, + "loss": 2.3045, + "step": 4131 + }, + { + "epoch": 0.7745079662605436, + "grad_norm": 48331.52734375, + "learning_rate": 8.987047852650472e-05, + "loss": 2.3154, + "step": 4132 + }, + { + "epoch": 0.7746954076850984, + "grad_norm": 47696.67578125, + "learning_rate": 8.986573626867303e-05, + "loss": 2.2995, + "step": 4133 + }, + { + "epoch": 0.7748828491096532, + "grad_norm": 50089.60546875, + "learning_rate": 8.986099302620933e-05, + "loss": 2.3322, + "step": 4134 + }, + { + "epoch": 0.7750702905342081, + "grad_norm": 53763.91796875, + "learning_rate": 8.985624879923079e-05, + "loss": 2.3393, + "step": 4135 + }, + { + "epoch": 0.7752577319587629, + "grad_norm": 51931.80859375, + "learning_rate": 8.985150358785457e-05, + "loss": 2.3143, + "step": 4136 + }, + { + "epoch": 0.7754451733833178, + "grad_norm": 54219.73046875, + "learning_rate": 8.984675739219788e-05, + "loss": 2.2722, + "step": 4137 + }, + { + "epoch": 0.7756326148078725, + "grad_norm": 48939.5625, + "learning_rate": 8.984201021237795e-05, + "loss": 2.3061, + "step": 4138 + }, + { + "epoch": 0.7758200562324273, + "grad_norm": 47536.8984375, + "learning_rate": 8.983726204851201e-05, + "loss": 2.2749, + "step": 4139 + }, + { + "epoch": 0.7760074976569822, + "grad_norm": 51333.62109375, + "learning_rate": 8.983251290071737e-05, + "loss": 2.3015, + "step": 4140 + }, + { + "epoch": 0.776194939081537, + "grad_norm": 47212.328125, + "learning_rate": 8.98277627691113e-05, + "loss": 2.2418, + "step": 4141 + }, + { + "epoch": 0.7763823805060919, + "grad_norm": 52108.92578125, + "learning_rate": 8.98230116538111e-05, + "loss": 2.2495, + "step": 4142 + }, + { + "epoch": 0.7765698219306467, + "grad_norm": 46286.5546875, + "learning_rate": 8.981825955493417e-05, + "loss": 2.3617, + "step": 4143 + }, + { + "epoch": 0.7767572633552015, + "grad_norm": 54836.83203125, + "learning_rate": 8.981350647259786e-05, + "loss": 2.3677, + "step": 4144 + }, + { + "epoch": 0.7769447047797563, + "grad_norm": 47871.3671875, + "learning_rate": 8.980875240691954e-05, + "loss": 2.2933, + "step": 4145 + }, + { + "epoch": 0.7771321462043111, + "grad_norm": 53418.40625, + "learning_rate": 8.980399735801666e-05, + "loss": 2.3724, + "step": 4146 + }, + { + "epoch": 0.777319587628866, + "grad_norm": 56393.4921875, + "learning_rate": 8.979924132600663e-05, + "loss": 2.2265, + "step": 4147 + }, + { + "epoch": 0.7775070290534208, + "grad_norm": 49926.95703125, + "learning_rate": 8.979448431100696e-05, + "loss": 2.3083, + "step": 4148 + }, + { + "epoch": 0.7776944704779757, + "grad_norm": 51933.27734375, + "learning_rate": 8.978972631313513e-05, + "loss": 2.3027, + "step": 4149 + }, + { + "epoch": 0.7778819119025304, + "grad_norm": 47349.47265625, + "learning_rate": 8.978496733250863e-05, + "loss": 2.331, + "step": 4150 + }, + { + "epoch": 0.7780693533270853, + "grad_norm": 49200.859375, + "learning_rate": 8.978020736924502e-05, + "loss": 2.2863, + "step": 4151 + }, + { + "epoch": 0.7782567947516401, + "grad_norm": 47428.47265625, + "learning_rate": 8.977544642346188e-05, + "loss": 2.3272, + "step": 4152 + }, + { + "epoch": 0.778444236176195, + "grad_norm": 49499.61328125, + "learning_rate": 8.977068449527677e-05, + "loss": 2.3428, + "step": 4153 + }, + { + "epoch": 0.7786316776007498, + "grad_norm": 49636.33203125, + "learning_rate": 8.976592158480732e-05, + "loss": 2.2606, + "step": 4154 + }, + { + "epoch": 0.7788191190253045, + "grad_norm": 49206.40625, + "learning_rate": 8.976115769217117e-05, + "loss": 2.386, + "step": 4155 + }, + { + "epoch": 0.7790065604498594, + "grad_norm": 49952.83984375, + "learning_rate": 8.975639281748596e-05, + "loss": 2.3544, + "step": 4156 + }, + { + "epoch": 0.7791940018744142, + "grad_norm": 49463.49609375, + "learning_rate": 8.975162696086941e-05, + "loss": 2.3262, + "step": 4157 + }, + { + "epoch": 0.7793814432989691, + "grad_norm": 48831.5859375, + "learning_rate": 8.97468601224392e-05, + "loss": 2.3454, + "step": 4158 + }, + { + "epoch": 0.7795688847235239, + "grad_norm": 50803.16015625, + "learning_rate": 8.974209230231309e-05, + "loss": 2.3399, + "step": 4159 + }, + { + "epoch": 0.7797563261480788, + "grad_norm": 48918.8828125, + "learning_rate": 8.973732350060881e-05, + "loss": 2.3634, + "step": 4160 + }, + { + "epoch": 0.7799437675726335, + "grad_norm": 50591.30859375, + "learning_rate": 8.973255371744417e-05, + "loss": 2.3059, + "step": 4161 + }, + { + "epoch": 0.7801312089971884, + "grad_norm": 52672.98828125, + "learning_rate": 8.972778295293697e-05, + "loss": 2.3632, + "step": 4162 + }, + { + "epoch": 0.7803186504217432, + "grad_norm": 49574.89453125, + "learning_rate": 8.972301120720503e-05, + "loss": 2.3154, + "step": 4163 + }, + { + "epoch": 0.780506091846298, + "grad_norm": 52080.2734375, + "learning_rate": 8.971823848036623e-05, + "loss": 2.2721, + "step": 4164 + }, + { + "epoch": 0.7806935332708529, + "grad_norm": 51998.07421875, + "learning_rate": 8.971346477253841e-05, + "loss": 2.3162, + "step": 4165 + }, + { + "epoch": 0.7808809746954077, + "grad_norm": 48386.109375, + "learning_rate": 8.970869008383952e-05, + "loss": 2.3557, + "step": 4166 + }, + { + "epoch": 0.7810684161199625, + "grad_norm": 47621.94140625, + "learning_rate": 8.970391441438746e-05, + "loss": 2.3125, + "step": 4167 + }, + { + "epoch": 0.7812558575445173, + "grad_norm": 49186.6015625, + "learning_rate": 8.96991377643002e-05, + "loss": 2.2239, + "step": 4168 + }, + { + "epoch": 0.7814432989690722, + "grad_norm": 50055.2265625, + "learning_rate": 8.96943601336957e-05, + "loss": 2.3797, + "step": 4169 + }, + { + "epoch": 0.781630740393627, + "grad_norm": 52614.6875, + "learning_rate": 8.968958152269195e-05, + "loss": 2.3163, + "step": 4170 + }, + { + "epoch": 0.7818181818181819, + "grad_norm": 51873.0625, + "learning_rate": 8.968480193140703e-05, + "loss": 2.3228, + "step": 4171 + }, + { + "epoch": 0.7820056232427366, + "grad_norm": 48529.91796875, + "learning_rate": 8.968002135995894e-05, + "loss": 2.3676, + "step": 4172 + }, + { + "epoch": 0.7821930646672914, + "grad_norm": 65958.3984375, + "learning_rate": 8.967523980846577e-05, + "loss": 2.3621, + "step": 4173 + }, + { + "epoch": 0.7823805060918463, + "grad_norm": 48661.0078125, + "learning_rate": 8.96704572770456e-05, + "loss": 2.4153, + "step": 4174 + }, + { + "epoch": 0.7825679475164011, + "grad_norm": 50151.83984375, + "learning_rate": 8.966567376581658e-05, + "loss": 2.3426, + "step": 4175 + }, + { + "epoch": 0.782755388940956, + "grad_norm": 53858.44140625, + "learning_rate": 8.966088927489686e-05, + "loss": 2.3643, + "step": 4176 + }, + { + "epoch": 0.7829428303655108, + "grad_norm": 54008.6796875, + "learning_rate": 8.965610380440459e-05, + "loss": 2.4124, + "step": 4177 + }, + { + "epoch": 0.7831302717900656, + "grad_norm": 50236.5546875, + "learning_rate": 8.965131735445795e-05, + "loss": 2.34, + "step": 4178 + }, + { + "epoch": 0.7833177132146204, + "grad_norm": 51461.03125, + "learning_rate": 8.964652992517521e-05, + "loss": 2.3905, + "step": 4179 + }, + { + "epoch": 0.7835051546391752, + "grad_norm": 46022.453125, + "learning_rate": 8.964174151667456e-05, + "loss": 2.3411, + "step": 4180 + }, + { + "epoch": 0.7836925960637301, + "grad_norm": 49342.0390625, + "learning_rate": 8.963695212907431e-05, + "loss": 2.3621, + "step": 4181 + }, + { + "epoch": 0.7838800374882849, + "grad_norm": 49899.1171875, + "learning_rate": 8.963216176249271e-05, + "loss": 2.3825, + "step": 4182 + }, + { + "epoch": 0.7840674789128398, + "grad_norm": 48351.06640625, + "learning_rate": 8.962737041704811e-05, + "loss": 2.3181, + "step": 4183 + }, + { + "epoch": 0.7842549203373945, + "grad_norm": 46196.7578125, + "learning_rate": 8.962257809285884e-05, + "loss": 2.3921, + "step": 4184 + }, + { + "epoch": 0.7844423617619494, + "grad_norm": 50528.66015625, + "learning_rate": 8.961778479004326e-05, + "loss": 2.3489, + "step": 4185 + }, + { + "epoch": 0.7846298031865042, + "grad_norm": 48065.56640625, + "learning_rate": 8.961299050871976e-05, + "loss": 2.3251, + "step": 4186 + }, + { + "epoch": 0.7848172446110591, + "grad_norm": 51741.38671875, + "learning_rate": 8.960819524900674e-05, + "loss": 2.352, + "step": 4187 + }, + { + "epoch": 0.7850046860356139, + "grad_norm": 52777.39453125, + "learning_rate": 8.960339901102268e-05, + "loss": 2.3532, + "step": 4188 + }, + { + "epoch": 0.7851921274601686, + "grad_norm": 49507.12109375, + "learning_rate": 8.9598601794886e-05, + "loss": 2.3267, + "step": 4189 + }, + { + "epoch": 0.7853795688847235, + "grad_norm": 47049.40625, + "learning_rate": 8.959380360071518e-05, + "loss": 2.3803, + "step": 4190 + }, + { + "epoch": 0.7855670103092783, + "grad_norm": 57902.45703125, + "learning_rate": 8.958900442862877e-05, + "loss": 2.3511, + "step": 4191 + }, + { + "epoch": 0.7857544517338332, + "grad_norm": 49384.1484375, + "learning_rate": 8.958420427874525e-05, + "loss": 2.3641, + "step": 4192 + }, + { + "epoch": 0.785941893158388, + "grad_norm": 50167.1875, + "learning_rate": 8.957940315118323e-05, + "loss": 2.3551, + "step": 4193 + }, + { + "epoch": 0.7861293345829429, + "grad_norm": 50506.13671875, + "learning_rate": 8.957460104606126e-05, + "loss": 2.3063, + "step": 4194 + }, + { + "epoch": 0.7863167760074976, + "grad_norm": 51670.22265625, + "learning_rate": 8.956979796349795e-05, + "loss": 2.3401, + "step": 4195 + }, + { + "epoch": 0.7865042174320525, + "grad_norm": 48748.08203125, + "learning_rate": 8.956499390361194e-05, + "loss": 2.2871, + "step": 4196 + }, + { + "epoch": 0.7866916588566073, + "grad_norm": 46740.6484375, + "learning_rate": 8.956018886652187e-05, + "loss": 2.3022, + "step": 4197 + }, + { + "epoch": 0.7868791002811621, + "grad_norm": 49585.3359375, + "learning_rate": 8.955538285234644e-05, + "loss": 2.3121, + "step": 4198 + }, + { + "epoch": 0.787066541705717, + "grad_norm": 49093.37109375, + "learning_rate": 8.955057586120432e-05, + "loss": 2.2879, + "step": 4199 + }, + { + "epoch": 0.7872539831302718, + "grad_norm": 50145.07421875, + "learning_rate": 8.954576789321426e-05, + "loss": 2.4031, + "step": 4200 + }, + { + "epoch": 0.7874414245548266, + "grad_norm": 46097.66015625, + "learning_rate": 8.9540958948495e-05, + "loss": 2.334, + "step": 4201 + }, + { + "epoch": 0.7876288659793814, + "grad_norm": 45227.484375, + "learning_rate": 8.953614902716533e-05, + "loss": 2.3621, + "step": 4202 + }, + { + "epoch": 0.7878163074039363, + "grad_norm": 51586.3359375, + "learning_rate": 8.953133812934403e-05, + "loss": 2.2991, + "step": 4203 + }, + { + "epoch": 0.7880037488284911, + "grad_norm": 50307.8046875, + "learning_rate": 8.952652625514994e-05, + "loss": 2.3253, + "step": 4204 + }, + { + "epoch": 0.788191190253046, + "grad_norm": 51603.390625, + "learning_rate": 8.95217134047019e-05, + "loss": 2.3255, + "step": 4205 + }, + { + "epoch": 0.7883786316776008, + "grad_norm": 44193.91796875, + "learning_rate": 8.951689957811877e-05, + "loss": 2.306, + "step": 4206 + }, + { + "epoch": 0.7885660731021555, + "grad_norm": 49639.46875, + "learning_rate": 8.951208477551946e-05, + "loss": 2.2764, + "step": 4207 + }, + { + "epoch": 0.7887535145267104, + "grad_norm": 45219.65625, + "learning_rate": 8.950726899702288e-05, + "loss": 2.2821, + "step": 4208 + }, + { + "epoch": 0.7889409559512652, + "grad_norm": 47271.453125, + "learning_rate": 8.950245224274798e-05, + "loss": 2.2601, + "step": 4209 + }, + { + "epoch": 0.7891283973758201, + "grad_norm": 53811.453125, + "learning_rate": 8.949763451281372e-05, + "loss": 2.3123, + "step": 4210 + }, + { + "epoch": 0.7893158388003749, + "grad_norm": 48483.33984375, + "learning_rate": 8.94928158073391e-05, + "loss": 2.3523, + "step": 4211 + }, + { + "epoch": 0.7895032802249297, + "grad_norm": 54800.3125, + "learning_rate": 8.948799612644314e-05, + "loss": 2.3643, + "step": 4212 + }, + { + "epoch": 0.7896907216494845, + "grad_norm": 46248.29296875, + "learning_rate": 8.948317547024487e-05, + "loss": 2.3502, + "step": 4213 + }, + { + "epoch": 0.7898781630740394, + "grad_norm": 51933.75, + "learning_rate": 8.947835383886336e-05, + "loss": 2.3011, + "step": 4214 + }, + { + "epoch": 0.7900656044985942, + "grad_norm": 52532.61328125, + "learning_rate": 8.947353123241769e-05, + "loss": 2.2859, + "step": 4215 + }, + { + "epoch": 0.790253045923149, + "grad_norm": 50040.8515625, + "learning_rate": 8.946870765102697e-05, + "loss": 2.3589, + "step": 4216 + }, + { + "epoch": 0.7904404873477039, + "grad_norm": 50799.734375, + "learning_rate": 8.946388309481035e-05, + "loss": 2.2852, + "step": 4217 + }, + { + "epoch": 0.7906279287722586, + "grad_norm": 53981.16015625, + "learning_rate": 8.945905756388698e-05, + "loss": 2.2568, + "step": 4218 + }, + { + "epoch": 0.7908153701968135, + "grad_norm": 47306.91796875, + "learning_rate": 8.945423105837605e-05, + "loss": 2.3236, + "step": 4219 + }, + { + "epoch": 0.7910028116213683, + "grad_norm": 51991.5625, + "learning_rate": 8.944940357839677e-05, + "loss": 2.2444, + "step": 4220 + }, + { + "epoch": 0.7911902530459232, + "grad_norm": 50576.3203125, + "learning_rate": 8.944457512406836e-05, + "loss": 2.2297, + "step": 4221 + }, + { + "epoch": 0.791377694470478, + "grad_norm": 53507.984375, + "learning_rate": 8.943974569551008e-05, + "loss": 2.2525, + "step": 4222 + }, + { + "epoch": 0.7915651358950329, + "grad_norm": 47071.1640625, + "learning_rate": 8.943491529284123e-05, + "loss": 2.3001, + "step": 4223 + }, + { + "epoch": 0.7917525773195876, + "grad_norm": 56263.875, + "learning_rate": 8.94300839161811e-05, + "loss": 2.3692, + "step": 4224 + }, + { + "epoch": 0.7919400187441424, + "grad_norm": 48677.3359375, + "learning_rate": 8.9425251565649e-05, + "loss": 2.3056, + "step": 4225 + }, + { + "epoch": 0.7921274601686973, + "grad_norm": 49373.0703125, + "learning_rate": 8.942041824136433e-05, + "loss": 2.3209, + "step": 4226 + }, + { + "epoch": 0.7923149015932521, + "grad_norm": 49263.02734375, + "learning_rate": 8.941558394344642e-05, + "loss": 2.3549, + "step": 4227 + }, + { + "epoch": 0.792502343017807, + "grad_norm": 46692.63671875, + "learning_rate": 8.94107486720147e-05, + "loss": 2.3334, + "step": 4228 + }, + { + "epoch": 0.7926897844423617, + "grad_norm": 49706.296875, + "learning_rate": 8.940591242718858e-05, + "loss": 2.4192, + "step": 4229 + }, + { + "epoch": 0.7928772258669166, + "grad_norm": 51058.8125, + "learning_rate": 8.940107520908751e-05, + "loss": 2.347, + "step": 4230 + }, + { + "epoch": 0.7930646672914714, + "grad_norm": 49206.83984375, + "learning_rate": 8.939623701783098e-05, + "loss": 2.3485, + "step": 4231 + }, + { + "epoch": 0.7932521087160262, + "grad_norm": 54417.2265625, + "learning_rate": 8.939139785353846e-05, + "loss": 2.307, + "step": 4232 + }, + { + "epoch": 0.7934395501405811, + "grad_norm": 50066.15234375, + "learning_rate": 8.938655771632949e-05, + "loss": 2.3953, + "step": 4233 + }, + { + "epoch": 0.7936269915651359, + "grad_norm": 49677.953125, + "learning_rate": 8.93817166063236e-05, + "loss": 2.3102, + "step": 4234 + }, + { + "epoch": 0.7938144329896907, + "grad_norm": 53130.7109375, + "learning_rate": 8.937687452364037e-05, + "loss": 2.3461, + "step": 4235 + }, + { + "epoch": 0.7940018744142455, + "grad_norm": 50461.61328125, + "learning_rate": 8.93720314683994e-05, + "loss": 2.326, + "step": 4236 + }, + { + "epoch": 0.7941893158388004, + "grad_norm": 47833.03125, + "learning_rate": 8.936718744072031e-05, + "loss": 2.2647, + "step": 4237 + }, + { + "epoch": 0.7943767572633552, + "grad_norm": 52623.1640625, + "learning_rate": 8.936234244072271e-05, + "loss": 2.3284, + "step": 4238 + }, + { + "epoch": 0.7945641986879101, + "grad_norm": 52052.6953125, + "learning_rate": 8.935749646852629e-05, + "loss": 2.3661, + "step": 4239 + }, + { + "epoch": 0.7947516401124649, + "grad_norm": 46112.60546875, + "learning_rate": 8.935264952425074e-05, + "loss": 2.3083, + "step": 4240 + }, + { + "epoch": 0.7949390815370196, + "grad_norm": 47824.5703125, + "learning_rate": 8.934780160801578e-05, + "loss": 2.3256, + "step": 4241 + }, + { + "epoch": 0.7951265229615745, + "grad_norm": 48578.69140625, + "learning_rate": 8.93429527199411e-05, + "loss": 2.3727, + "step": 4242 + }, + { + "epoch": 0.7953139643861293, + "grad_norm": 46080.58984375, + "learning_rate": 8.933810286014654e-05, + "loss": 2.3497, + "step": 4243 + }, + { + "epoch": 0.7955014058106842, + "grad_norm": 50837.2109375, + "learning_rate": 8.933325202875182e-05, + "loss": 2.3827, + "step": 4244 + }, + { + "epoch": 0.795688847235239, + "grad_norm": 49094.73046875, + "learning_rate": 8.932840022587677e-05, + "loss": 2.3555, + "step": 4245 + }, + { + "epoch": 0.7958762886597938, + "grad_norm": 48311.41796875, + "learning_rate": 8.932354745164121e-05, + "loss": 2.2955, + "step": 4246 + }, + { + "epoch": 0.7960637300843486, + "grad_norm": 49091.60546875, + "learning_rate": 8.931869370616506e-05, + "loss": 2.3509, + "step": 4247 + }, + { + "epoch": 0.7962511715089035, + "grad_norm": 50342.78515625, + "learning_rate": 8.931383898956812e-05, + "loss": 2.3118, + "step": 4248 + }, + { + "epoch": 0.7964386129334583, + "grad_norm": 55863.78125, + "learning_rate": 8.930898330197032e-05, + "loss": 2.2655, + "step": 4249 + }, + { + "epoch": 0.7966260543580131, + "grad_norm": 50142.2890625, + "learning_rate": 8.93041266434916e-05, + "loss": 2.342, + "step": 4250 + }, + { + "epoch": 0.796813495782568, + "grad_norm": 52012.8671875, + "learning_rate": 8.929926901425191e-05, + "loss": 2.2891, + "step": 4251 + }, + { + "epoch": 0.7970009372071227, + "grad_norm": 49260.765625, + "learning_rate": 8.929441041437126e-05, + "loss": 2.308, + "step": 4252 + }, + { + "epoch": 0.7971883786316776, + "grad_norm": 51651.578125, + "learning_rate": 8.928955084396958e-05, + "loss": 2.3481, + "step": 4253 + }, + { + "epoch": 0.7973758200562324, + "grad_norm": 49816.9921875, + "learning_rate": 8.928469030316696e-05, + "loss": 2.3308, + "step": 4254 + }, + { + "epoch": 0.7975632614807873, + "grad_norm": 52979.19140625, + "learning_rate": 8.927982879208341e-05, + "loss": 2.2651, + "step": 4255 + }, + { + "epoch": 0.7977507029053421, + "grad_norm": 55177.3046875, + "learning_rate": 8.9274966310839e-05, + "loss": 2.3902, + "step": 4256 + }, + { + "epoch": 0.797938144329897, + "grad_norm": 52317.05078125, + "learning_rate": 8.927010285955386e-05, + "loss": 2.3282, + "step": 4257 + }, + { + "epoch": 0.7981255857544517, + "grad_norm": 54047.65234375, + "learning_rate": 8.92652384383481e-05, + "loss": 2.3126, + "step": 4258 + }, + { + "epoch": 0.7983130271790065, + "grad_norm": 50605.98828125, + "learning_rate": 8.926037304734185e-05, + "loss": 2.3878, + "step": 4259 + }, + { + "epoch": 0.7985004686035614, + "grad_norm": 50687.68359375, + "learning_rate": 8.92555066866553e-05, + "loss": 2.3166, + "step": 4260 + }, + { + "epoch": 0.7986879100281162, + "grad_norm": 50351.90234375, + "learning_rate": 8.925063935640861e-05, + "loss": 2.3029, + "step": 4261 + }, + { + "epoch": 0.7988753514526711, + "grad_norm": 46539.73828125, + "learning_rate": 8.924577105672203e-05, + "loss": 2.3098, + "step": 4262 + }, + { + "epoch": 0.7990627928772258, + "grad_norm": 49001.61328125, + "learning_rate": 8.924090178771578e-05, + "loss": 2.2485, + "step": 4263 + }, + { + "epoch": 0.7992502343017807, + "grad_norm": 55668.12890625, + "learning_rate": 8.923603154951014e-05, + "loss": 2.3644, + "step": 4264 + }, + { + "epoch": 0.7994376757263355, + "grad_norm": 51974.80078125, + "learning_rate": 8.923116034222538e-05, + "loss": 2.3314, + "step": 4265 + }, + { + "epoch": 0.7996251171508904, + "grad_norm": 49397.46875, + "learning_rate": 8.922628816598184e-05, + "loss": 2.3083, + "step": 4266 + }, + { + "epoch": 0.7998125585754452, + "grad_norm": 48796.06640625, + "learning_rate": 8.922141502089984e-05, + "loss": 2.3516, + "step": 4267 + }, + { + "epoch": 0.8, + "grad_norm": 48217.9140625, + "learning_rate": 8.921654090709972e-05, + "loss": 2.3067, + "step": 4268 + }, + { + "epoch": 0.8001874414245548, + "grad_norm": 44723.16015625, + "learning_rate": 8.92116658247019e-05, + "loss": 2.3326, + "step": 4269 + }, + { + "epoch": 0.8003748828491096, + "grad_norm": 50742.13671875, + "learning_rate": 8.920678977382677e-05, + "loss": 2.3522, + "step": 4270 + }, + { + "epoch": 0.8005623242736645, + "grad_norm": 47094.8046875, + "learning_rate": 8.920191275459477e-05, + "loss": 2.2958, + "step": 4271 + }, + { + "epoch": 0.8007497656982193, + "grad_norm": 49170.15234375, + "learning_rate": 8.919703476712636e-05, + "loss": 2.267, + "step": 4272 + }, + { + "epoch": 0.8009372071227742, + "grad_norm": 50799.23046875, + "learning_rate": 8.919215581154198e-05, + "loss": 2.346, + "step": 4273 + }, + { + "epoch": 0.801124648547329, + "grad_norm": 48268.2265625, + "learning_rate": 8.918727588796219e-05, + "loss": 2.3577, + "step": 4274 + }, + { + "epoch": 0.8013120899718837, + "grad_norm": 55400.12109375, + "learning_rate": 8.918239499650748e-05, + "loss": 2.3646, + "step": 4275 + }, + { + "epoch": 0.8014995313964386, + "grad_norm": 48745.62890625, + "learning_rate": 8.91775131372984e-05, + "loss": 2.2846, + "step": 4276 + }, + { + "epoch": 0.8016869728209934, + "grad_norm": 50098.54296875, + "learning_rate": 8.917263031045557e-05, + "loss": 2.3935, + "step": 4277 + }, + { + "epoch": 0.8018744142455483, + "grad_norm": 49124.69921875, + "learning_rate": 8.916774651609956e-05, + "loss": 2.3413, + "step": 4278 + }, + { + "epoch": 0.8020618556701031, + "grad_norm": 49983.95703125, + "learning_rate": 8.916286175435098e-05, + "loss": 2.3674, + "step": 4279 + }, + { + "epoch": 0.8022492970946579, + "grad_norm": 48107.19921875, + "learning_rate": 8.915797602533048e-05, + "loss": 2.3973, + "step": 4280 + }, + { + "epoch": 0.8024367385192127, + "grad_norm": 47950.26953125, + "learning_rate": 8.915308932915875e-05, + "loss": 2.302, + "step": 4281 + }, + { + "epoch": 0.8026241799437676, + "grad_norm": 52393.88671875, + "learning_rate": 8.914820166595647e-05, + "loss": 2.3038, + "step": 4282 + }, + { + "epoch": 0.8028116213683224, + "grad_norm": 49565.5546875, + "learning_rate": 8.914331303584438e-05, + "loss": 2.3036, + "step": 4283 + }, + { + "epoch": 0.8029990627928772, + "grad_norm": 45166.69921875, + "learning_rate": 8.913842343894321e-05, + "loss": 2.3278, + "step": 4284 + }, + { + "epoch": 0.8031865042174321, + "grad_norm": 46932.82421875, + "learning_rate": 8.913353287537371e-05, + "loss": 2.2838, + "step": 4285 + }, + { + "epoch": 0.8033739456419868, + "grad_norm": 55347.8515625, + "learning_rate": 8.912864134525669e-05, + "loss": 2.2348, + "step": 4286 + }, + { + "epoch": 0.8035613870665417, + "grad_norm": 47762.30078125, + "learning_rate": 8.912374884871297e-05, + "loss": 2.2878, + "step": 4287 + }, + { + "epoch": 0.8037488284910965, + "grad_norm": 50664.83203125, + "learning_rate": 8.911885538586336e-05, + "loss": 2.3367, + "step": 4288 + }, + { + "epoch": 0.8039362699156514, + "grad_norm": 47429.75390625, + "learning_rate": 8.911396095682876e-05, + "loss": 2.3596, + "step": 4289 + }, + { + "epoch": 0.8041237113402062, + "grad_norm": 50298.890625, + "learning_rate": 8.910906556173003e-05, + "loss": 2.2724, + "step": 4290 + }, + { + "epoch": 0.8043111527647611, + "grad_norm": 49436.87890625, + "learning_rate": 8.910416920068807e-05, + "loss": 2.2836, + "step": 4291 + }, + { + "epoch": 0.8044985941893158, + "grad_norm": 53625.53125, + "learning_rate": 8.909927187382385e-05, + "loss": 2.3214, + "step": 4292 + }, + { + "epoch": 0.8046860356138706, + "grad_norm": 48454.83984375, + "learning_rate": 8.909437358125829e-05, + "loss": 2.3273, + "step": 4293 + }, + { + "epoch": 0.8048734770384255, + "grad_norm": 52006.14453125, + "learning_rate": 8.90894743231124e-05, + "loss": 2.312, + "step": 4294 + }, + { + "epoch": 0.8050609184629803, + "grad_norm": 48187.71875, + "learning_rate": 8.908457409950718e-05, + "loss": 2.3253, + "step": 4295 + }, + { + "epoch": 0.8052483598875352, + "grad_norm": 47970.73046875, + "learning_rate": 8.907967291056365e-05, + "loss": 2.2661, + "step": 4296 + }, + { + "epoch": 0.80543580131209, + "grad_norm": 53880.51171875, + "learning_rate": 8.907477075640286e-05, + "loss": 2.3289, + "step": 4297 + }, + { + "epoch": 0.8056232427366448, + "grad_norm": 47764.38671875, + "learning_rate": 8.906986763714588e-05, + "loss": 2.3503, + "step": 4298 + }, + { + "epoch": 0.8058106841611996, + "grad_norm": 50058.84765625, + "learning_rate": 8.906496355291385e-05, + "loss": 2.3317, + "step": 4299 + }, + { + "epoch": 0.8059981255857545, + "grad_norm": 49470.41796875, + "learning_rate": 8.906005850382787e-05, + "loss": 2.3065, + "step": 4300 + }, + { + "epoch": 0.8061855670103093, + "grad_norm": 50815.8828125, + "learning_rate": 8.905515249000908e-05, + "loss": 2.3137, + "step": 4301 + }, + { + "epoch": 0.8063730084348641, + "grad_norm": 52361.48046875, + "learning_rate": 8.905024551157866e-05, + "loss": 2.3452, + "step": 4302 + }, + { + "epoch": 0.8065604498594189, + "grad_norm": 47248.3828125, + "learning_rate": 8.90453375686578e-05, + "loss": 2.3753, + "step": 4303 + }, + { + "epoch": 0.8067478912839737, + "grad_norm": 48239.46484375, + "learning_rate": 8.904042866136773e-05, + "loss": 2.3479, + "step": 4304 + }, + { + "epoch": 0.8069353327085286, + "grad_norm": 46761.05859375, + "learning_rate": 8.903551878982968e-05, + "loss": 2.2863, + "step": 4305 + }, + { + "epoch": 0.8071227741330834, + "grad_norm": 48100.1484375, + "learning_rate": 8.903060795416494e-05, + "loss": 2.3585, + "step": 4306 + }, + { + "epoch": 0.8073102155576383, + "grad_norm": 48394.9453125, + "learning_rate": 8.902569615449476e-05, + "loss": 2.3436, + "step": 4307 + }, + { + "epoch": 0.8074976569821931, + "grad_norm": 48679.00390625, + "learning_rate": 8.902078339094051e-05, + "loss": 2.3367, + "step": 4308 + }, + { + "epoch": 0.8076850984067478, + "grad_norm": 52936.390625, + "learning_rate": 8.901586966362349e-05, + "loss": 2.3596, + "step": 4309 + }, + { + "epoch": 0.8078725398313027, + "grad_norm": 45765.359375, + "learning_rate": 8.901095497266509e-05, + "loss": 2.3128, + "step": 4310 + }, + { + "epoch": 0.8080599812558575, + "grad_norm": 48907.11328125, + "learning_rate": 8.900603931818667e-05, + "loss": 2.3199, + "step": 4311 + }, + { + "epoch": 0.8082474226804124, + "grad_norm": 48880.4140625, + "learning_rate": 8.900112270030965e-05, + "loss": 2.2809, + "step": 4312 + }, + { + "epoch": 0.8084348641049672, + "grad_norm": 49843.078125, + "learning_rate": 8.899620511915545e-05, + "loss": 2.3266, + "step": 4313 + }, + { + "epoch": 0.8086223055295221, + "grad_norm": 51231.6953125, + "learning_rate": 8.899128657484556e-05, + "loss": 2.4509, + "step": 4314 + }, + { + "epoch": 0.8088097469540768, + "grad_norm": 48721.17578125, + "learning_rate": 8.898636706750144e-05, + "loss": 2.3548, + "step": 4315 + }, + { + "epoch": 0.8089971883786317, + "grad_norm": 47638.078125, + "learning_rate": 8.898144659724461e-05, + "loss": 2.2846, + "step": 4316 + }, + { + "epoch": 0.8091846298031865, + "grad_norm": 48454.8984375, + "learning_rate": 8.897652516419658e-05, + "loss": 2.2968, + "step": 4317 + }, + { + "epoch": 0.8093720712277414, + "grad_norm": 51721.32421875, + "learning_rate": 8.897160276847891e-05, + "loss": 2.2423, + "step": 4318 + }, + { + "epoch": 0.8095595126522962, + "grad_norm": 43314.72265625, + "learning_rate": 8.896667941021318e-05, + "loss": 2.3235, + "step": 4319 + }, + { + "epoch": 0.8097469540768509, + "grad_norm": 53518.953125, + "learning_rate": 8.8961755089521e-05, + "loss": 2.2529, + "step": 4320 + }, + { + "epoch": 0.8099343955014058, + "grad_norm": 50284.71875, + "learning_rate": 8.895682980652398e-05, + "loss": 2.3269, + "step": 4321 + }, + { + "epoch": 0.8101218369259606, + "grad_norm": 50849.74609375, + "learning_rate": 8.895190356134376e-05, + "loss": 2.2829, + "step": 4322 + }, + { + "epoch": 0.8103092783505155, + "grad_norm": 54252.421875, + "learning_rate": 8.894697635410203e-05, + "loss": 2.3938, + "step": 4323 + }, + { + "epoch": 0.8104967197750703, + "grad_norm": 49405.47265625, + "learning_rate": 8.894204818492048e-05, + "loss": 2.3139, + "step": 4324 + }, + { + "epoch": 0.8106841611996252, + "grad_norm": 51865.06640625, + "learning_rate": 8.893711905392083e-05, + "loss": 2.402, + "step": 4325 + }, + { + "epoch": 0.8108716026241799, + "grad_norm": 47239.8125, + "learning_rate": 8.893218896122482e-05, + "loss": 2.2775, + "step": 4326 + }, + { + "epoch": 0.8110590440487347, + "grad_norm": 49680.65625, + "learning_rate": 8.892725790695421e-05, + "loss": 2.2742, + "step": 4327 + }, + { + "epoch": 0.8112464854732896, + "grad_norm": 49973.9765625, + "learning_rate": 8.892232589123082e-05, + "loss": 2.3268, + "step": 4328 + }, + { + "epoch": 0.8114339268978444, + "grad_norm": 48823.92578125, + "learning_rate": 8.891739291417644e-05, + "loss": 2.3449, + "step": 4329 + }, + { + "epoch": 0.8116213683223993, + "grad_norm": 57621.3359375, + "learning_rate": 8.891245897591292e-05, + "loss": 2.2991, + "step": 4330 + }, + { + "epoch": 0.8118088097469541, + "grad_norm": 45874.34765625, + "learning_rate": 8.890752407656209e-05, + "loss": 2.3586, + "step": 4331 + }, + { + "epoch": 0.8119962511715089, + "grad_norm": 55246.66015625, + "learning_rate": 8.890258821624586e-05, + "loss": 2.2056, + "step": 4332 + }, + { + "epoch": 0.8121836925960637, + "grad_norm": 53489.21484375, + "learning_rate": 8.889765139508616e-05, + "loss": 2.3847, + "step": 4333 + }, + { + "epoch": 0.8123711340206186, + "grad_norm": 48394.91015625, + "learning_rate": 8.88927136132049e-05, + "loss": 2.3221, + "step": 4334 + }, + { + "epoch": 0.8125585754451734, + "grad_norm": 46757.8359375, + "learning_rate": 8.888777487072401e-05, + "loss": 2.3491, + "step": 4335 + }, + { + "epoch": 0.8127460168697282, + "grad_norm": 45657.69921875, + "learning_rate": 8.888283516776553e-05, + "loss": 2.2755, + "step": 4336 + }, + { + "epoch": 0.812933458294283, + "grad_norm": 46143.83203125, + "learning_rate": 8.887789450445144e-05, + "loss": 2.3449, + "step": 4337 + }, + { + "epoch": 0.8131208997188378, + "grad_norm": 48099.9296875, + "learning_rate": 8.887295288090374e-05, + "loss": 2.3234, + "step": 4338 + }, + { + "epoch": 0.8133083411433927, + "grad_norm": 50414.0390625, + "learning_rate": 8.886801029724449e-05, + "loss": 2.2994, + "step": 4339 + }, + { + "epoch": 0.8134957825679475, + "grad_norm": 51376.5703125, + "learning_rate": 8.886306675359581e-05, + "loss": 2.2883, + "step": 4340 + }, + { + "epoch": 0.8136832239925024, + "grad_norm": 50941.6875, + "learning_rate": 8.885812225007975e-05, + "loss": 2.3133, + "step": 4341 + }, + { + "epoch": 0.8138706654170572, + "grad_norm": 48273.8359375, + "learning_rate": 8.885317678681846e-05, + "loss": 2.2916, + "step": 4342 + }, + { + "epoch": 0.814058106841612, + "grad_norm": 46855.4140625, + "learning_rate": 8.884823036393408e-05, + "loss": 2.2706, + "step": 4343 + }, + { + "epoch": 0.8142455482661668, + "grad_norm": 53542.125, + "learning_rate": 8.884328298154876e-05, + "loss": 2.3925, + "step": 4344 + }, + { + "epoch": 0.8144329896907216, + "grad_norm": 49215.43359375, + "learning_rate": 8.883833463978472e-05, + "loss": 2.3069, + "step": 4345 + }, + { + "epoch": 0.8146204311152765, + "grad_norm": 49833.34375, + "learning_rate": 8.883338533876417e-05, + "loss": 2.2785, + "step": 4346 + }, + { + "epoch": 0.8148078725398313, + "grad_norm": 51711.69140625, + "learning_rate": 8.882843507860937e-05, + "loss": 2.2875, + "step": 4347 + }, + { + "epoch": 0.8149953139643862, + "grad_norm": 54521.95703125, + "learning_rate": 8.882348385944254e-05, + "loss": 2.2741, + "step": 4348 + }, + { + "epoch": 0.8151827553889409, + "grad_norm": 47581.65234375, + "learning_rate": 8.881853168138601e-05, + "loss": 2.3045, + "step": 4349 + }, + { + "epoch": 0.8153701968134958, + "grad_norm": 49872.1796875, + "learning_rate": 8.881357854456206e-05, + "loss": 2.3226, + "step": 4350 + }, + { + "epoch": 0.8155576382380506, + "grad_norm": 48992.375, + "learning_rate": 8.880862444909304e-05, + "loss": 2.2652, + "step": 4351 + }, + { + "epoch": 0.8157450796626055, + "grad_norm": 52043.3828125, + "learning_rate": 8.880366939510133e-05, + "loss": 2.3845, + "step": 4352 + }, + { + "epoch": 0.8159325210871603, + "grad_norm": 49136.08984375, + "learning_rate": 8.879871338270928e-05, + "loss": 2.3332, + "step": 4353 + }, + { + "epoch": 0.816119962511715, + "grad_norm": 52061.3828125, + "learning_rate": 8.879375641203933e-05, + "loss": 2.4019, + "step": 4354 + }, + { + "epoch": 0.8163074039362699, + "grad_norm": 48441.1640625, + "learning_rate": 8.878879848321389e-05, + "loss": 2.3756, + "step": 4355 + }, + { + "epoch": 0.8164948453608247, + "grad_norm": 48081.46484375, + "learning_rate": 8.87838395963554e-05, + "loss": 2.3588, + "step": 4356 + }, + { + "epoch": 0.8166822867853796, + "grad_norm": 47759.66796875, + "learning_rate": 8.877887975158638e-05, + "loss": 2.3457, + "step": 4357 + }, + { + "epoch": 0.8168697282099344, + "grad_norm": 52948.18359375, + "learning_rate": 8.877391894902928e-05, + "loss": 2.3852, + "step": 4358 + }, + { + "epoch": 0.8170571696344893, + "grad_norm": 51303.2578125, + "learning_rate": 8.876895718880669e-05, + "loss": 2.3024, + "step": 4359 + }, + { + "epoch": 0.817244611059044, + "grad_norm": 48203.9765625, + "learning_rate": 8.87639944710411e-05, + "loss": 2.3684, + "step": 4360 + }, + { + "epoch": 0.8174320524835988, + "grad_norm": 47925.51171875, + "learning_rate": 8.87590307958551e-05, + "loss": 2.3192, + "step": 4361 + }, + { + "epoch": 0.8176194939081537, + "grad_norm": 50729.95703125, + "learning_rate": 8.87540661633713e-05, + "loss": 2.3702, + "step": 4362 + }, + { + "epoch": 0.8178069353327085, + "grad_norm": 48209.80859375, + "learning_rate": 8.874910057371231e-05, + "loss": 2.2648, + "step": 4363 + }, + { + "epoch": 0.8179943767572634, + "grad_norm": 49759.0234375, + "learning_rate": 8.874413402700076e-05, + "loss": 2.3613, + "step": 4364 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 51838.98828125, + "learning_rate": 8.873916652335936e-05, + "loss": 2.3114, + "step": 4365 + }, + { + "epoch": 0.818369259606373, + "grad_norm": 46608.4765625, + "learning_rate": 8.873419806291074e-05, + "loss": 2.3021, + "step": 4366 + }, + { + "epoch": 0.8185567010309278, + "grad_norm": 52464.9765625, + "learning_rate": 8.872922864577768e-05, + "loss": 2.2861, + "step": 4367 + }, + { + "epoch": 0.8187441424554827, + "grad_norm": 46135.75, + "learning_rate": 8.872425827208286e-05, + "loss": 2.3392, + "step": 4368 + }, + { + "epoch": 0.8189315838800375, + "grad_norm": 53887.515625, + "learning_rate": 8.871928694194909e-05, + "loss": 2.3688, + "step": 4369 + }, + { + "epoch": 0.8191190253045924, + "grad_norm": 50114.2890625, + "learning_rate": 8.871431465549913e-05, + "loss": 2.2625, + "step": 4370 + }, + { + "epoch": 0.8193064667291471, + "grad_norm": 49092.4296875, + "learning_rate": 8.870934141285578e-05, + "loss": 2.3479, + "step": 4371 + }, + { + "epoch": 0.8194939081537019, + "grad_norm": 53286.26171875, + "learning_rate": 8.870436721414188e-05, + "loss": 2.3207, + "step": 4372 + }, + { + "epoch": 0.8196813495782568, + "grad_norm": 47615.671875, + "learning_rate": 8.86993920594803e-05, + "loss": 2.3673, + "step": 4373 + }, + { + "epoch": 0.8198687910028116, + "grad_norm": 47711.63671875, + "learning_rate": 8.869441594899392e-05, + "loss": 2.3637, + "step": 4374 + }, + { + "epoch": 0.8200562324273665, + "grad_norm": 50651.078125, + "learning_rate": 8.86894388828056e-05, + "loss": 2.3157, + "step": 4375 + }, + { + "epoch": 0.8202436738519213, + "grad_norm": 53316.8046875, + "learning_rate": 8.868446086103834e-05, + "loss": 2.401, + "step": 4376 + }, + { + "epoch": 0.820431115276476, + "grad_norm": 48038.6875, + "learning_rate": 8.867948188381503e-05, + "loss": 2.2791, + "step": 4377 + }, + { + "epoch": 0.8206185567010309, + "grad_norm": 50642.1796875, + "learning_rate": 8.867450195125869e-05, + "loss": 2.329, + "step": 4378 + }, + { + "epoch": 0.8208059981255857, + "grad_norm": 52287.46484375, + "learning_rate": 8.866952106349227e-05, + "loss": 2.3327, + "step": 4379 + }, + { + "epoch": 0.8209934395501406, + "grad_norm": 50277.390625, + "learning_rate": 8.866453922063882e-05, + "loss": 2.3622, + "step": 4380 + }, + { + "epoch": 0.8211808809746954, + "grad_norm": 52534.1796875, + "learning_rate": 8.865955642282137e-05, + "loss": 2.2369, + "step": 4381 + }, + { + "epoch": 0.8213683223992503, + "grad_norm": 51972.91015625, + "learning_rate": 8.865457267016302e-05, + "loss": 2.2896, + "step": 4382 + }, + { + "epoch": 0.821555763823805, + "grad_norm": 56134.890625, + "learning_rate": 8.864958796278684e-05, + "loss": 2.2685, + "step": 4383 + }, + { + "epoch": 0.8217432052483599, + "grad_norm": 51841.234375, + "learning_rate": 8.864460230081594e-05, + "loss": 2.2529, + "step": 4384 + }, + { + "epoch": 0.8219306466729147, + "grad_norm": 47165.23828125, + "learning_rate": 8.863961568437347e-05, + "loss": 2.3225, + "step": 4385 + }, + { + "epoch": 0.8221180880974696, + "grad_norm": 47659.51171875, + "learning_rate": 8.863462811358258e-05, + "loss": 2.2986, + "step": 4386 + }, + { + "epoch": 0.8223055295220244, + "grad_norm": 50421.3984375, + "learning_rate": 8.862963958856647e-05, + "loss": 2.3877, + "step": 4387 + }, + { + "epoch": 0.8224929709465792, + "grad_norm": 48859.6953125, + "learning_rate": 8.862465010944835e-05, + "loss": 2.3018, + "step": 4388 + }, + { + "epoch": 0.822680412371134, + "grad_norm": 55569.12109375, + "learning_rate": 8.861965967635147e-05, + "loss": 2.278, + "step": 4389 + }, + { + "epoch": 0.8228678537956888, + "grad_norm": 49648.5859375, + "learning_rate": 8.861466828939904e-05, + "loss": 2.3086, + "step": 4390 + }, + { + "epoch": 0.8230552952202437, + "grad_norm": 52984.53515625, + "learning_rate": 8.860967594871436e-05, + "loss": 2.3855, + "step": 4391 + }, + { + "epoch": 0.8232427366447985, + "grad_norm": 50659.48828125, + "learning_rate": 8.860468265442077e-05, + "loss": 2.2945, + "step": 4392 + }, + { + "epoch": 0.8234301780693534, + "grad_norm": 50158.37109375, + "learning_rate": 8.859968840664154e-05, + "loss": 2.3277, + "step": 4393 + }, + { + "epoch": 0.8236176194939081, + "grad_norm": 52293.54296875, + "learning_rate": 8.859469320550008e-05, + "loss": 2.2885, + "step": 4394 + }, + { + "epoch": 0.823805060918463, + "grad_norm": 52525.171875, + "learning_rate": 8.858969705111972e-05, + "loss": 2.3545, + "step": 4395 + }, + { + "epoch": 0.8239925023430178, + "grad_norm": 50475.82421875, + "learning_rate": 8.858469994362388e-05, + "loss": 2.3482, + "step": 4396 + }, + { + "epoch": 0.8241799437675726, + "grad_norm": 49890.46484375, + "learning_rate": 8.857970188313598e-05, + "loss": 2.3451, + "step": 4397 + }, + { + "epoch": 0.8243673851921275, + "grad_norm": 49257.6484375, + "learning_rate": 8.857470286977946e-05, + "loss": 2.2736, + "step": 4398 + }, + { + "epoch": 0.8245548266166823, + "grad_norm": 58159.81640625, + "learning_rate": 8.85697029036778e-05, + "loss": 2.3069, + "step": 4399 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 55425.828125, + "learning_rate": 8.856470198495447e-05, + "loss": 2.2994, + "step": 4400 + }, + { + "epoch": 0.8249297094657919, + "grad_norm": 46761.96875, + "learning_rate": 8.8559700113733e-05, + "loss": 2.3196, + "step": 4401 + }, + { + "epoch": 0.8251171508903468, + "grad_norm": 49882.296875, + "learning_rate": 8.855469729013695e-05, + "loss": 2.3448, + "step": 4402 + }, + { + "epoch": 0.8253045923149016, + "grad_norm": 48399.953125, + "learning_rate": 8.854969351428985e-05, + "loss": 2.2917, + "step": 4403 + }, + { + "epoch": 0.8254920337394565, + "grad_norm": 50427.25, + "learning_rate": 8.85446887863153e-05, + "loss": 2.307, + "step": 4404 + }, + { + "epoch": 0.8256794751640113, + "grad_norm": 48759.26171875, + "learning_rate": 8.853968310633692e-05, + "loss": 2.3331, + "step": 4405 + }, + { + "epoch": 0.825866916588566, + "grad_norm": 50276.7265625, + "learning_rate": 8.853467647447831e-05, + "loss": 2.2697, + "step": 4406 + }, + { + "epoch": 0.8260543580131209, + "grad_norm": 45260.0390625, + "learning_rate": 8.852966889086318e-05, + "loss": 2.3001, + "step": 4407 + }, + { + "epoch": 0.8262417994376757, + "grad_norm": 47629.71875, + "learning_rate": 8.852466035561517e-05, + "loss": 2.3123, + "step": 4408 + }, + { + "epoch": 0.8264292408622306, + "grad_norm": 55767.25, + "learning_rate": 8.851965086885801e-05, + "loss": 2.3576, + "step": 4409 + }, + { + "epoch": 0.8266166822867854, + "grad_norm": 49471.58984375, + "learning_rate": 8.85146404307154e-05, + "loss": 2.351, + "step": 4410 + }, + { + "epoch": 0.8268041237113402, + "grad_norm": 48827.16015625, + "learning_rate": 8.850962904131111e-05, + "loss": 2.2856, + "step": 4411 + }, + { + "epoch": 0.826991565135895, + "grad_norm": 49024.09375, + "learning_rate": 8.850461670076891e-05, + "loss": 2.3135, + "step": 4412 + }, + { + "epoch": 0.8271790065604498, + "grad_norm": 52730.359375, + "learning_rate": 8.84996034092126e-05, + "loss": 2.2485, + "step": 4413 + }, + { + "epoch": 0.8273664479850047, + "grad_norm": 47970.72265625, + "learning_rate": 8.8494589166766e-05, + "loss": 2.3486, + "step": 4414 + }, + { + "epoch": 0.8275538894095595, + "grad_norm": 50309.6015625, + "learning_rate": 8.848957397355297e-05, + "loss": 2.4182, + "step": 4415 + }, + { + "epoch": 0.8277413308341144, + "grad_norm": 51980.875, + "learning_rate": 8.848455782969735e-05, + "loss": 2.3951, + "step": 4416 + }, + { + "epoch": 0.8279287722586691, + "grad_norm": 51623.59765625, + "learning_rate": 8.847954073532305e-05, + "loss": 2.3018, + "step": 4417 + }, + { + "epoch": 0.828116213683224, + "grad_norm": 48935.01953125, + "learning_rate": 8.8474522690554e-05, + "loss": 2.2424, + "step": 4418 + }, + { + "epoch": 0.8283036551077788, + "grad_norm": 50865.56640625, + "learning_rate": 8.84695036955141e-05, + "loss": 2.3912, + "step": 4419 + }, + { + "epoch": 0.8284910965323337, + "grad_norm": 55346.640625, + "learning_rate": 8.846448375032734e-05, + "loss": 2.3067, + "step": 4420 + }, + { + "epoch": 0.8286785379568885, + "grad_norm": 49793.02734375, + "learning_rate": 8.84594628551177e-05, + "loss": 2.3456, + "step": 4421 + }, + { + "epoch": 0.8288659793814434, + "grad_norm": 48889.9453125, + "learning_rate": 8.845444101000921e-05, + "loss": 2.3974, + "step": 4422 + }, + { + "epoch": 0.8290534208059981, + "grad_norm": 54536.05859375, + "learning_rate": 8.844941821512587e-05, + "loss": 2.2944, + "step": 4423 + }, + { + "epoch": 0.8292408622305529, + "grad_norm": 49146.46875, + "learning_rate": 8.844439447059174e-05, + "loss": 2.2965, + "step": 4424 + }, + { + "epoch": 0.8294283036551078, + "grad_norm": 47992.66796875, + "learning_rate": 8.843936977653094e-05, + "loss": 2.3427, + "step": 4425 + }, + { + "epoch": 0.8296157450796626, + "grad_norm": 53956.09375, + "learning_rate": 8.843434413306752e-05, + "loss": 2.2724, + "step": 4426 + }, + { + "epoch": 0.8298031865042175, + "grad_norm": 46991.23828125, + "learning_rate": 8.842931754032563e-05, + "loss": 2.2854, + "step": 4427 + }, + { + "epoch": 0.8299906279287722, + "grad_norm": 52439.6328125, + "learning_rate": 8.842428999842944e-05, + "loss": 2.3147, + "step": 4428 + }, + { + "epoch": 0.830178069353327, + "grad_norm": 49025.95703125, + "learning_rate": 8.841926150750308e-05, + "loss": 2.3658, + "step": 4429 + }, + { + "epoch": 0.8303655107778819, + "grad_norm": 48966.58203125, + "learning_rate": 8.841423206767078e-05, + "loss": 2.3232, + "step": 4430 + }, + { + "epoch": 0.8305529522024367, + "grad_norm": 48769.19140625, + "learning_rate": 8.840920167905676e-05, + "loss": 2.3475, + "step": 4431 + }, + { + "epoch": 0.8307403936269916, + "grad_norm": 52560.74609375, + "learning_rate": 8.840417034178525e-05, + "loss": 2.3578, + "step": 4432 + }, + { + "epoch": 0.8309278350515464, + "grad_norm": 53075.12890625, + "learning_rate": 8.839913805598053e-05, + "loss": 2.3963, + "step": 4433 + }, + { + "epoch": 0.8311152764761012, + "grad_norm": 52474.93359375, + "learning_rate": 8.839410482176689e-05, + "loss": 2.2312, + "step": 4434 + }, + { + "epoch": 0.831302717900656, + "grad_norm": 46066.86328125, + "learning_rate": 8.838907063926863e-05, + "loss": 2.3068, + "step": 4435 + }, + { + "epoch": 0.8314901593252109, + "grad_norm": 47258.35546875, + "learning_rate": 8.83840355086101e-05, + "loss": 2.3478, + "step": 4436 + }, + { + "epoch": 0.8316776007497657, + "grad_norm": 52229.41796875, + "learning_rate": 8.837899942991566e-05, + "loss": 2.3869, + "step": 4437 + }, + { + "epoch": 0.8318650421743206, + "grad_norm": 50024.08203125, + "learning_rate": 8.837396240330969e-05, + "loss": 2.3461, + "step": 4438 + }, + { + "epoch": 0.8320524835988754, + "grad_norm": 50924.70703125, + "learning_rate": 8.83689244289166e-05, + "loss": 2.2715, + "step": 4439 + }, + { + "epoch": 0.8322399250234301, + "grad_norm": 47052.5546875, + "learning_rate": 8.836388550686084e-05, + "loss": 2.301, + "step": 4440 + }, + { + "epoch": 0.832427366447985, + "grad_norm": 48398.90234375, + "learning_rate": 8.835884563726682e-05, + "loss": 2.3231, + "step": 4441 + }, + { + "epoch": 0.8326148078725398, + "grad_norm": 49992.203125, + "learning_rate": 8.835380482025906e-05, + "loss": 2.2798, + "step": 4442 + }, + { + "epoch": 0.8328022492970947, + "grad_norm": 48469.87109375, + "learning_rate": 8.834876305596205e-05, + "loss": 2.3316, + "step": 4443 + }, + { + "epoch": 0.8329896907216495, + "grad_norm": 49075.953125, + "learning_rate": 8.834372034450031e-05, + "loss": 2.3646, + "step": 4444 + }, + { + "epoch": 0.8331771321462043, + "grad_norm": 46972.83203125, + "learning_rate": 8.833867668599838e-05, + "loss": 2.3183, + "step": 4445 + }, + { + "epoch": 0.8333645735707591, + "grad_norm": 49580.28515625, + "learning_rate": 8.833363208058085e-05, + "loss": 2.2781, + "step": 4446 + }, + { + "epoch": 0.833552014995314, + "grad_norm": 50545.359375, + "learning_rate": 8.832858652837231e-05, + "loss": 2.3024, + "step": 4447 + }, + { + "epoch": 0.8337394564198688, + "grad_norm": 49004.71484375, + "learning_rate": 8.832354002949739e-05, + "loss": 2.3894, + "step": 4448 + }, + { + "epoch": 0.8339268978444236, + "grad_norm": 50323.109375, + "learning_rate": 8.831849258408069e-05, + "loss": 2.2883, + "step": 4449 + }, + { + "epoch": 0.8341143392689785, + "grad_norm": 48059.67578125, + "learning_rate": 8.831344419224692e-05, + "loss": 2.284, + "step": 4450 + }, + { + "epoch": 0.8343017806935332, + "grad_norm": 53790.734375, + "learning_rate": 8.830839485412075e-05, + "loss": 2.2672, + "step": 4451 + }, + { + "epoch": 0.8344892221180881, + "grad_norm": 54121.625, + "learning_rate": 8.830334456982689e-05, + "loss": 2.3489, + "step": 4452 + }, + { + "epoch": 0.8346766635426429, + "grad_norm": 49740.48046875, + "learning_rate": 8.82982933394901e-05, + "loss": 2.3763, + "step": 4453 + }, + { + "epoch": 0.8348641049671978, + "grad_norm": 52801.58203125, + "learning_rate": 8.82932411632351e-05, + "loss": 2.3195, + "step": 4454 + }, + { + "epoch": 0.8350515463917526, + "grad_norm": 49759.81640625, + "learning_rate": 8.82881880411867e-05, + "loss": 2.2251, + "step": 4455 + }, + { + "epoch": 0.8352389878163075, + "grad_norm": 55807.1875, + "learning_rate": 8.82831339734697e-05, + "loss": 2.2941, + "step": 4456 + }, + { + "epoch": 0.8354264292408622, + "grad_norm": 50920.55859375, + "learning_rate": 8.827807896020892e-05, + "loss": 2.3111, + "step": 4457 + }, + { + "epoch": 0.835613870665417, + "grad_norm": 49153.37890625, + "learning_rate": 8.827302300152922e-05, + "loss": 2.3648, + "step": 4458 + }, + { + "epoch": 0.8358013120899719, + "grad_norm": 51940.6640625, + "learning_rate": 8.826796609755546e-05, + "loss": 2.2558, + "step": 4459 + }, + { + "epoch": 0.8359887535145267, + "grad_norm": 58907.6953125, + "learning_rate": 8.826290824841257e-05, + "loss": 2.3884, + "step": 4460 + }, + { + "epoch": 0.8361761949390816, + "grad_norm": 50207.27734375, + "learning_rate": 8.825784945422544e-05, + "loss": 2.3679, + "step": 4461 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 52271.82421875, + "learning_rate": 8.825278971511903e-05, + "loss": 2.3424, + "step": 4462 + }, + { + "epoch": 0.8365510777881912, + "grad_norm": 51421.4921875, + "learning_rate": 8.824772903121831e-05, + "loss": 2.3164, + "step": 4463 + }, + { + "epoch": 0.836738519212746, + "grad_norm": 50557.4921875, + "learning_rate": 8.82426674026483e-05, + "loss": 2.3226, + "step": 4464 + }, + { + "epoch": 0.8369259606373008, + "grad_norm": 50536.03125, + "learning_rate": 8.823760482953397e-05, + "loss": 2.3633, + "step": 4465 + }, + { + "epoch": 0.8371134020618557, + "grad_norm": 46658.9140625, + "learning_rate": 8.823254131200036e-05, + "loss": 2.3154, + "step": 4466 + }, + { + "epoch": 0.8373008434864105, + "grad_norm": 51885.61328125, + "learning_rate": 8.822747685017257e-05, + "loss": 2.3356, + "step": 4467 + }, + { + "epoch": 0.8374882849109653, + "grad_norm": 54480.0390625, + "learning_rate": 8.822241144417566e-05, + "loss": 2.3003, + "step": 4468 + }, + { + "epoch": 0.8376757263355201, + "grad_norm": 46451.3515625, + "learning_rate": 8.821734509413475e-05, + "loss": 2.3168, + "step": 4469 + }, + { + "epoch": 0.837863167760075, + "grad_norm": 49944.3359375, + "learning_rate": 8.821227780017494e-05, + "loss": 2.4026, + "step": 4470 + }, + { + "epoch": 0.8380506091846298, + "grad_norm": 48737.55078125, + "learning_rate": 8.820720956242143e-05, + "loss": 2.3157, + "step": 4471 + }, + { + "epoch": 0.8382380506091847, + "grad_norm": 48604.5390625, + "learning_rate": 8.82021403809994e-05, + "loss": 2.2671, + "step": 4472 + }, + { + "epoch": 0.8384254920337395, + "grad_norm": 48000.76171875, + "learning_rate": 8.819707025603399e-05, + "loss": 2.3252, + "step": 4473 + }, + { + "epoch": 0.8386129334582942, + "grad_norm": 49803.79296875, + "learning_rate": 8.819199918765049e-05, + "loss": 2.4044, + "step": 4474 + }, + { + "epoch": 0.8388003748828491, + "grad_norm": 48465.296875, + "learning_rate": 8.818692717597413e-05, + "loss": 2.3242, + "step": 4475 + }, + { + "epoch": 0.8389878163074039, + "grad_norm": 45946.29296875, + "learning_rate": 8.818185422113017e-05, + "loss": 2.3288, + "step": 4476 + }, + { + "epoch": 0.8391752577319588, + "grad_norm": 46432.72265625, + "learning_rate": 8.817678032324392e-05, + "loss": 2.2912, + "step": 4477 + }, + { + "epoch": 0.8393626991565136, + "grad_norm": 55275.26953125, + "learning_rate": 8.817170548244067e-05, + "loss": 2.3012, + "step": 4478 + }, + { + "epoch": 0.8395501405810685, + "grad_norm": 49369.0078125, + "learning_rate": 8.816662969884581e-05, + "loss": 2.2435, + "step": 4479 + }, + { + "epoch": 0.8397375820056232, + "grad_norm": 51162.4453125, + "learning_rate": 8.816155297258469e-05, + "loss": 2.3505, + "step": 4480 + }, + { + "epoch": 0.839925023430178, + "grad_norm": 49219.2265625, + "learning_rate": 8.815647530378266e-05, + "loss": 2.3545, + "step": 4481 + }, + { + "epoch": 0.8401124648547329, + "grad_norm": 49045.8203125, + "learning_rate": 8.815139669256518e-05, + "loss": 2.3447, + "step": 4482 + }, + { + "epoch": 0.8402999062792877, + "grad_norm": 52399.63671875, + "learning_rate": 8.814631713905767e-05, + "loss": 2.4075, + "step": 4483 + }, + { + "epoch": 0.8404873477038426, + "grad_norm": 48895.171875, + "learning_rate": 8.814123664338557e-05, + "loss": 2.261, + "step": 4484 + }, + { + "epoch": 0.8406747891283973, + "grad_norm": 48371.03125, + "learning_rate": 8.813615520567437e-05, + "loss": 2.3715, + "step": 4485 + }, + { + "epoch": 0.8408622305529522, + "grad_norm": 48275.36328125, + "learning_rate": 8.81310728260496e-05, + "loss": 2.2902, + "step": 4486 + }, + { + "epoch": 0.841049671977507, + "grad_norm": 47911.0078125, + "learning_rate": 8.812598950463676e-05, + "loss": 2.3202, + "step": 4487 + }, + { + "epoch": 0.8412371134020619, + "grad_norm": 51984.37890625, + "learning_rate": 8.812090524156142e-05, + "loss": 2.4363, + "step": 4488 + }, + { + "epoch": 0.8414245548266167, + "grad_norm": 51347.921875, + "learning_rate": 8.811582003694913e-05, + "loss": 2.2483, + "step": 4489 + }, + { + "epoch": 0.8416119962511716, + "grad_norm": 50901.6953125, + "learning_rate": 8.81107338909255e-05, + "loss": 2.253, + "step": 4490 + }, + { + "epoch": 0.8417994376757263, + "grad_norm": 52012.03515625, + "learning_rate": 8.810564680361617e-05, + "loss": 2.3137, + "step": 4491 + }, + { + "epoch": 0.8419868791002811, + "grad_norm": 52057.7265625, + "learning_rate": 8.810055877514676e-05, + "loss": 2.4193, + "step": 4492 + }, + { + "epoch": 0.842174320524836, + "grad_norm": 49433.80078125, + "learning_rate": 8.809546980564295e-05, + "loss": 2.3415, + "step": 4493 + }, + { + "epoch": 0.8423617619493908, + "grad_norm": 48966.3125, + "learning_rate": 8.809037989523042e-05, + "loss": 2.3858, + "step": 4494 + }, + { + "epoch": 0.8425492033739457, + "grad_norm": 50343.34765625, + "learning_rate": 8.80852890440349e-05, + "loss": 2.3207, + "step": 4495 + }, + { + "epoch": 0.8427366447985005, + "grad_norm": 53102.0234375, + "learning_rate": 8.808019725218211e-05, + "loss": 2.3082, + "step": 4496 + }, + { + "epoch": 0.8429240862230553, + "grad_norm": 50391.26171875, + "learning_rate": 8.807510451979783e-05, + "loss": 2.3042, + "step": 4497 + }, + { + "epoch": 0.8431115276476101, + "grad_norm": 49702.36328125, + "learning_rate": 8.807001084700784e-05, + "loss": 2.3016, + "step": 4498 + }, + { + "epoch": 0.843298969072165, + "grad_norm": 49488.9765625, + "learning_rate": 8.806491623393791e-05, + "loss": 2.2917, + "step": 4499 + }, + { + "epoch": 0.8434864104967198, + "grad_norm": 51458.578125, + "learning_rate": 8.805982068071392e-05, + "loss": 2.2825, + "step": 4500 + }, + { + "epoch": 0.8434864104967198, + "eval_loss": 2.3208961486816406, + "eval_runtime": 128.0607, + "eval_samples_per_second": 39.427, + "eval_steps_per_second": 1.976, + "step": 4500 + }, + { + "epoch": 0.8436738519212746, + "grad_norm": 54415.68359375, + "learning_rate": 8.805472418746171e-05, + "loss": 2.3016, + "step": 4501 + }, + { + "epoch": 0.8438612933458294, + "grad_norm": 48797.2890625, + "learning_rate": 8.804962675430714e-05, + "loss": 2.3109, + "step": 4502 + }, + { + "epoch": 0.8440487347703842, + "grad_norm": 57436.76953125, + "learning_rate": 8.804452838137614e-05, + "loss": 2.2834, + "step": 4503 + }, + { + "epoch": 0.8442361761949391, + "grad_norm": 52194.890625, + "learning_rate": 8.803942906879459e-05, + "loss": 2.3572, + "step": 4504 + }, + { + "epoch": 0.8444236176194939, + "grad_norm": 50303.734375, + "learning_rate": 8.803432881668848e-05, + "loss": 2.2853, + "step": 4505 + }, + { + "epoch": 0.8446110590440488, + "grad_norm": 48329.98828125, + "learning_rate": 8.802922762518375e-05, + "loss": 2.3166, + "step": 4506 + }, + { + "epoch": 0.8447985004686036, + "grad_norm": 51278.9921875, + "learning_rate": 8.802412549440641e-05, + "loss": 2.2851, + "step": 4507 + }, + { + "epoch": 0.8449859418931583, + "grad_norm": 46112.75390625, + "learning_rate": 8.801902242448245e-05, + "loss": 2.3165, + "step": 4508 + }, + { + "epoch": 0.8451733833177132, + "grad_norm": 47656.86328125, + "learning_rate": 8.801391841553797e-05, + "loss": 2.3197, + "step": 4509 + }, + { + "epoch": 0.845360824742268, + "grad_norm": 46936.06640625, + "learning_rate": 8.800881346769896e-05, + "loss": 2.3619, + "step": 4510 + }, + { + "epoch": 0.8455482661668229, + "grad_norm": 46299.7265625, + "learning_rate": 8.800370758109153e-05, + "loss": 2.3255, + "step": 4511 + }, + { + "epoch": 0.8457357075913777, + "grad_norm": 48697.59375, + "learning_rate": 8.799860075584181e-05, + "loss": 2.3034, + "step": 4512 + }, + { + "epoch": 0.8459231490159326, + "grad_norm": 47504.1015625, + "learning_rate": 8.799349299207591e-05, + "loss": 2.3095, + "step": 4513 + }, + { + "epoch": 0.8461105904404873, + "grad_norm": 45025.04296875, + "learning_rate": 8.798838428992001e-05, + "loss": 2.3513, + "step": 4514 + }, + { + "epoch": 0.8462980318650422, + "grad_norm": 49522.984375, + "learning_rate": 8.798327464950025e-05, + "loss": 2.3217, + "step": 4515 + }, + { + "epoch": 0.846485473289597, + "grad_norm": 54683.83984375, + "learning_rate": 8.797816407094286e-05, + "loss": 2.2888, + "step": 4516 + }, + { + "epoch": 0.8466729147141518, + "grad_norm": 51616.609375, + "learning_rate": 8.797305255437405e-05, + "loss": 2.2977, + "step": 4517 + }, + { + "epoch": 0.8468603561387067, + "grad_norm": 49457.96875, + "learning_rate": 8.796794009992009e-05, + "loss": 2.3101, + "step": 4518 + }, + { + "epoch": 0.8470477975632614, + "grad_norm": 48842.33203125, + "learning_rate": 8.796282670770721e-05, + "loss": 2.2633, + "step": 4519 + }, + { + "epoch": 0.8472352389878163, + "grad_norm": 46513.99609375, + "learning_rate": 8.795771237786175e-05, + "loss": 2.2629, + "step": 4520 + }, + { + "epoch": 0.8474226804123711, + "grad_norm": 45894.61328125, + "learning_rate": 8.795259711051e-05, + "loss": 2.2937, + "step": 4521 + }, + { + "epoch": 0.847610121836926, + "grad_norm": 51287.96875, + "learning_rate": 8.794748090577831e-05, + "loss": 2.2884, + "step": 4522 + }, + { + "epoch": 0.8477975632614808, + "grad_norm": 45983.9453125, + "learning_rate": 8.794236376379306e-05, + "loss": 2.2867, + "step": 4523 + }, + { + "epoch": 0.8479850046860357, + "grad_norm": 49674.328125, + "learning_rate": 8.79372456846806e-05, + "loss": 2.3651, + "step": 4524 + }, + { + "epoch": 0.8481724461105904, + "grad_norm": 48471.87890625, + "learning_rate": 8.793212666856736e-05, + "loss": 2.4114, + "step": 4525 + }, + { + "epoch": 0.8483598875351452, + "grad_norm": 49680.65625, + "learning_rate": 8.792700671557977e-05, + "loss": 2.3037, + "step": 4526 + }, + { + "epoch": 0.8485473289597001, + "grad_norm": 48415.95703125, + "learning_rate": 8.792188582584429e-05, + "loss": 2.3461, + "step": 4527 + }, + { + "epoch": 0.8487347703842549, + "grad_norm": 46258.03515625, + "learning_rate": 8.791676399948739e-05, + "loss": 2.2634, + "step": 4528 + }, + { + "epoch": 0.8489222118088098, + "grad_norm": 49037.89453125, + "learning_rate": 8.791164123663557e-05, + "loss": 2.3366, + "step": 4529 + }, + { + "epoch": 0.8491096532333646, + "grad_norm": 46710.54296875, + "learning_rate": 8.790651753741538e-05, + "loss": 2.3143, + "step": 4530 + }, + { + "epoch": 0.8492970946579194, + "grad_norm": 47873.8515625, + "learning_rate": 8.790139290195335e-05, + "loss": 2.2652, + "step": 4531 + }, + { + "epoch": 0.8494845360824742, + "grad_norm": 51846.36328125, + "learning_rate": 8.789626733037606e-05, + "loss": 2.2528, + "step": 4532 + }, + { + "epoch": 0.849671977507029, + "grad_norm": 49244.6484375, + "learning_rate": 8.78911408228101e-05, + "loss": 2.2646, + "step": 4533 + }, + { + "epoch": 0.8498594189315839, + "grad_norm": 48506.6015625, + "learning_rate": 8.788601337938207e-05, + "loss": 2.428, + "step": 4534 + }, + { + "epoch": 0.8500468603561387, + "grad_norm": 47636.609375, + "learning_rate": 8.788088500021865e-05, + "loss": 2.3887, + "step": 4535 + }, + { + "epoch": 0.8502343017806935, + "grad_norm": 58673.69921875, + "learning_rate": 8.787575568544648e-05, + "loss": 2.3698, + "step": 4536 + }, + { + "epoch": 0.8504217432052483, + "grad_norm": 54147.16796875, + "learning_rate": 8.787062543519224e-05, + "loss": 2.3807, + "step": 4537 + }, + { + "epoch": 0.8506091846298032, + "grad_norm": 49853.2734375, + "learning_rate": 8.786549424958265e-05, + "loss": 2.3658, + "step": 4538 + }, + { + "epoch": 0.850796626054358, + "grad_norm": 47264.16796875, + "learning_rate": 8.786036212874446e-05, + "loss": 2.2664, + "step": 4539 + }, + { + "epoch": 0.8509840674789129, + "grad_norm": 47084.05078125, + "learning_rate": 8.785522907280442e-05, + "loss": 2.3089, + "step": 4540 + }, + { + "epoch": 0.8511715089034677, + "grad_norm": 48849.046875, + "learning_rate": 8.785009508188928e-05, + "loss": 2.3314, + "step": 4541 + }, + { + "epoch": 0.8513589503280224, + "grad_norm": 52213.70703125, + "learning_rate": 8.784496015612589e-05, + "loss": 2.3177, + "step": 4542 + }, + { + "epoch": 0.8515463917525773, + "grad_norm": 52517.03125, + "learning_rate": 8.783982429564103e-05, + "loss": 2.2703, + "step": 4543 + }, + { + "epoch": 0.8517338331771321, + "grad_norm": 52260.390625, + "learning_rate": 8.783468750056157e-05, + "loss": 2.3324, + "step": 4544 + }, + { + "epoch": 0.851921274601687, + "grad_norm": 51199.43359375, + "learning_rate": 8.782954977101439e-05, + "loss": 2.3144, + "step": 4545 + }, + { + "epoch": 0.8521087160262418, + "grad_norm": 51098.10546875, + "learning_rate": 8.782441110712639e-05, + "loss": 2.2741, + "step": 4546 + }, + { + "epoch": 0.8522961574507967, + "grad_norm": 47533.7265625, + "learning_rate": 8.781927150902444e-05, + "loss": 2.3697, + "step": 4547 + }, + { + "epoch": 0.8524835988753514, + "grad_norm": 59575.10546875, + "learning_rate": 8.781413097683553e-05, + "loss": 2.6141, + "step": 4548 + }, + { + "epoch": 0.8526710402999063, + "grad_norm": 68716.2265625, + "learning_rate": 8.780898951068663e-05, + "loss": 2.642, + "step": 4549 + }, + { + "epoch": 0.8528584817244611, + "grad_norm": 51319.09375, + "learning_rate": 8.78038471107047e-05, + "loss": 2.2694, + "step": 4550 + }, + { + "epoch": 0.853045923149016, + "grad_norm": 49661.59375, + "learning_rate": 8.779870377701675e-05, + "loss": 2.2416, + "step": 4551 + }, + { + "epoch": 0.8532333645735708, + "grad_norm": 49915.90234375, + "learning_rate": 8.779355950974983e-05, + "loss": 2.2978, + "step": 4552 + }, + { + "epoch": 0.8534208059981255, + "grad_norm": 48776.05859375, + "learning_rate": 8.778841430903099e-05, + "loss": 2.3381, + "step": 4553 + }, + { + "epoch": 0.8536082474226804, + "grad_norm": 47548.1875, + "learning_rate": 8.778326817498733e-05, + "loss": 2.4395, + "step": 4554 + }, + { + "epoch": 0.8537956888472352, + "grad_norm": 48775.40625, + "learning_rate": 8.77781211077459e-05, + "loss": 2.3039, + "step": 4555 + }, + { + "epoch": 0.8539831302717901, + "grad_norm": 53460.25390625, + "learning_rate": 8.777297310743389e-05, + "loss": 2.3357, + "step": 4556 + }, + { + "epoch": 0.8541705716963449, + "grad_norm": 48005.10546875, + "learning_rate": 8.77678241741784e-05, + "loss": 2.2734, + "step": 4557 + }, + { + "epoch": 0.8543580131208998, + "grad_norm": 53619.42578125, + "learning_rate": 8.77626743081066e-05, + "loss": 2.3257, + "step": 4558 + }, + { + "epoch": 0.8545454545454545, + "grad_norm": 53487.88671875, + "learning_rate": 8.775752350934574e-05, + "loss": 2.3878, + "step": 4559 + }, + { + "epoch": 0.8547328959700093, + "grad_norm": 49410.42578125, + "learning_rate": 8.775237177802298e-05, + "loss": 2.3613, + "step": 4560 + }, + { + "epoch": 0.8549203373945642, + "grad_norm": 50133.26171875, + "learning_rate": 8.77472191142656e-05, + "loss": 2.358, + "step": 4561 + }, + { + "epoch": 0.855107778819119, + "grad_norm": 52071.22265625, + "learning_rate": 8.774206551820084e-05, + "loss": 2.3105, + "step": 4562 + }, + { + "epoch": 0.8552952202436739, + "grad_norm": 50056.3046875, + "learning_rate": 8.773691098995599e-05, + "loss": 2.2881, + "step": 4563 + }, + { + "epoch": 0.8554826616682287, + "grad_norm": 50589.9375, + "learning_rate": 8.773175552965837e-05, + "loss": 2.2724, + "step": 4564 + }, + { + "epoch": 0.8556701030927835, + "grad_norm": 51399.015625, + "learning_rate": 8.772659913743531e-05, + "loss": 2.3111, + "step": 4565 + }, + { + "epoch": 0.8558575445173383, + "grad_norm": 48304.98828125, + "learning_rate": 8.772144181341416e-05, + "loss": 2.2504, + "step": 4566 + }, + { + "epoch": 0.8560449859418932, + "grad_norm": 45852.9296875, + "learning_rate": 8.77162835577223e-05, + "loss": 2.3501, + "step": 4567 + }, + { + "epoch": 0.856232427366448, + "grad_norm": 45855.0546875, + "learning_rate": 8.771112437048715e-05, + "loss": 2.3128, + "step": 4568 + }, + { + "epoch": 0.8564198687910028, + "grad_norm": 50626.87109375, + "learning_rate": 8.770596425183612e-05, + "loss": 2.2871, + "step": 4569 + }, + { + "epoch": 0.8566073102155577, + "grad_norm": 46904.4140625, + "learning_rate": 8.770080320189664e-05, + "loss": 2.2455, + "step": 4570 + }, + { + "epoch": 0.8567947516401124, + "grad_norm": 51886.44921875, + "learning_rate": 8.769564122079622e-05, + "loss": 2.3639, + "step": 4571 + }, + { + "epoch": 0.8569821930646673, + "grad_norm": 54120.0390625, + "learning_rate": 8.769047830866233e-05, + "loss": 2.3041, + "step": 4572 + }, + { + "epoch": 0.8571696344892221, + "grad_norm": 49769.57421875, + "learning_rate": 8.76853144656225e-05, + "loss": 2.2668, + "step": 4573 + }, + { + "epoch": 0.857357075913777, + "grad_norm": 50881.9921875, + "learning_rate": 8.768014969180424e-05, + "loss": 2.3043, + "step": 4574 + }, + { + "epoch": 0.8575445173383318, + "grad_norm": 49635.8046875, + "learning_rate": 8.767498398733516e-05, + "loss": 2.3392, + "step": 4575 + }, + { + "epoch": 0.8577319587628865, + "grad_norm": 55000.12109375, + "learning_rate": 8.766981735234281e-05, + "loss": 2.1284, + "step": 4576 + }, + { + "epoch": 0.8579194001874414, + "grad_norm": 52416.515625, + "learning_rate": 8.766464978695481e-05, + "loss": 2.3088, + "step": 4577 + }, + { + "epoch": 0.8581068416119962, + "grad_norm": 51201.40625, + "learning_rate": 8.765948129129879e-05, + "loss": 2.3198, + "step": 4578 + }, + { + "epoch": 0.8582942830365511, + "grad_norm": 51287.75390625, + "learning_rate": 8.765431186550241e-05, + "loss": 2.3269, + "step": 4579 + }, + { + "epoch": 0.8584817244611059, + "grad_norm": 48216.40625, + "learning_rate": 8.764914150969335e-05, + "loss": 2.3365, + "step": 4580 + }, + { + "epoch": 0.8586691658856608, + "grad_norm": 50505.625, + "learning_rate": 8.76439702239993e-05, + "loss": 2.3063, + "step": 4581 + }, + { + "epoch": 0.8588566073102155, + "grad_norm": 49368.81640625, + "learning_rate": 8.763879800854801e-05, + "loss": 2.2119, + "step": 4582 + }, + { + "epoch": 0.8590440487347704, + "grad_norm": 50949.265625, + "learning_rate": 8.76336248634672e-05, + "loss": 2.2973, + "step": 4583 + }, + { + "epoch": 0.8592314901593252, + "grad_norm": 49205.90625, + "learning_rate": 8.762845078888464e-05, + "loss": 2.2278, + "step": 4584 + }, + { + "epoch": 0.85941893158388, + "grad_norm": 52079.65625, + "learning_rate": 8.762327578492815e-05, + "loss": 2.3782, + "step": 4585 + }, + { + "epoch": 0.8596063730084349, + "grad_norm": 52703.7421875, + "learning_rate": 8.761809985172551e-05, + "loss": 2.3722, + "step": 4586 + }, + { + "epoch": 0.8597938144329897, + "grad_norm": 49663.01171875, + "learning_rate": 8.761292298940458e-05, + "loss": 2.3277, + "step": 4587 + }, + { + "epoch": 0.8599812558575445, + "grad_norm": 49482.15234375, + "learning_rate": 8.760774519809323e-05, + "loss": 2.3298, + "step": 4588 + }, + { + "epoch": 0.8601686972820993, + "grad_norm": 52746.390625, + "learning_rate": 8.76025664779193e-05, + "loss": 2.273, + "step": 4589 + }, + { + "epoch": 0.8603561387066542, + "grad_norm": 51022.19140625, + "learning_rate": 8.759738682901077e-05, + "loss": 2.2287, + "step": 4590 + }, + { + "epoch": 0.860543580131209, + "grad_norm": 54665.34765625, + "learning_rate": 8.759220625149551e-05, + "loss": 2.3445, + "step": 4591 + }, + { + "epoch": 0.8607310215557639, + "grad_norm": 47988.71484375, + "learning_rate": 8.75870247455015e-05, + "loss": 2.2786, + "step": 4592 + }, + { + "epoch": 0.8609184629803186, + "grad_norm": 50466.93359375, + "learning_rate": 8.758184231115671e-05, + "loss": 2.2888, + "step": 4593 + }, + { + "epoch": 0.8611059044048734, + "grad_norm": 51360.8203125, + "learning_rate": 8.757665894858914e-05, + "loss": 2.3662, + "step": 4594 + }, + { + "epoch": 0.8612933458294283, + "grad_norm": 45744.5078125, + "learning_rate": 8.75714746579268e-05, + "loss": 2.2977, + "step": 4595 + }, + { + "epoch": 0.8614807872539831, + "grad_norm": 48155.89453125, + "learning_rate": 8.756628943929776e-05, + "loss": 2.2838, + "step": 4596 + }, + { + "epoch": 0.861668228678538, + "grad_norm": 50375.13671875, + "learning_rate": 8.75611032928301e-05, + "loss": 2.262, + "step": 4597 + }, + { + "epoch": 0.8618556701030928, + "grad_norm": 47698.15625, + "learning_rate": 8.755591621865186e-05, + "loss": 2.3295, + "step": 4598 + }, + { + "epoch": 0.8620431115276476, + "grad_norm": 48089.609375, + "learning_rate": 8.755072821689118e-05, + "loss": 2.2881, + "step": 4599 + }, + { + "epoch": 0.8622305529522024, + "grad_norm": 49100.46875, + "learning_rate": 8.75455392876762e-05, + "loss": 2.3439, + "step": 4600 + }, + { + "epoch": 0.8624179943767573, + "grad_norm": 46792.328125, + "learning_rate": 8.754034943113509e-05, + "loss": 2.2523, + "step": 4601 + }, + { + "epoch": 0.8626054358013121, + "grad_norm": 46413.16796875, + "learning_rate": 8.753515864739601e-05, + "loss": 2.2371, + "step": 4602 + }, + { + "epoch": 0.862792877225867, + "grad_norm": 49714.609375, + "learning_rate": 8.752996693658718e-05, + "loss": 2.3787, + "step": 4603 + }, + { + "epoch": 0.8629803186504218, + "grad_norm": 46598.8359375, + "learning_rate": 8.752477429883684e-05, + "loss": 2.2725, + "step": 4604 + }, + { + "epoch": 0.8631677600749765, + "grad_norm": 50250.62890625, + "learning_rate": 8.75195807342732e-05, + "loss": 2.3297, + "step": 4605 + }, + { + "epoch": 0.8633552014995314, + "grad_norm": 53034.93359375, + "learning_rate": 8.751438624302457e-05, + "loss": 2.3361, + "step": 4606 + }, + { + "epoch": 0.8635426429240862, + "grad_norm": 51676.2890625, + "learning_rate": 8.750919082521924e-05, + "loss": 2.4165, + "step": 4607 + }, + { + "epoch": 0.8637300843486411, + "grad_norm": 48787.3671875, + "learning_rate": 8.750399448098554e-05, + "loss": 2.3052, + "step": 4608 + }, + { + "epoch": 0.8639175257731959, + "grad_norm": 48883.83203125, + "learning_rate": 8.749879721045178e-05, + "loss": 2.3053, + "step": 4609 + }, + { + "epoch": 0.8641049671977507, + "grad_norm": 50885.9375, + "learning_rate": 8.749359901374634e-05, + "loss": 2.3202, + "step": 4610 + }, + { + "epoch": 0.8642924086223055, + "grad_norm": 51107.81640625, + "learning_rate": 8.748839989099763e-05, + "loss": 2.2362, + "step": 4611 + }, + { + "epoch": 0.8644798500468603, + "grad_norm": 54373.66015625, + "learning_rate": 8.748319984233406e-05, + "loss": 2.2541, + "step": 4612 + }, + { + "epoch": 0.8646672914714152, + "grad_norm": 54893.921875, + "learning_rate": 8.747799886788404e-05, + "loss": 2.2697, + "step": 4613 + }, + { + "epoch": 0.86485473289597, + "grad_norm": 54201.11328125, + "learning_rate": 8.747279696777604e-05, + "loss": 2.3282, + "step": 4614 + }, + { + "epoch": 0.8650421743205249, + "grad_norm": 49248.04296875, + "learning_rate": 8.746759414213853e-05, + "loss": 2.2723, + "step": 4615 + }, + { + "epoch": 0.8652296157450796, + "grad_norm": 46875.6328125, + "learning_rate": 8.746239039110002e-05, + "loss": 2.3038, + "step": 4616 + }, + { + "epoch": 0.8654170571696345, + "grad_norm": 54326.9921875, + "learning_rate": 8.745718571478907e-05, + "loss": 2.3236, + "step": 4617 + }, + { + "epoch": 0.8656044985941893, + "grad_norm": 46645.61328125, + "learning_rate": 8.745198011333416e-05, + "loss": 2.3578, + "step": 4618 + }, + { + "epoch": 0.8657919400187442, + "grad_norm": 52158.29296875, + "learning_rate": 8.744677358686392e-05, + "loss": 2.2958, + "step": 4619 + }, + { + "epoch": 0.865979381443299, + "grad_norm": 52226.11328125, + "learning_rate": 8.744156613550691e-05, + "loss": 2.332, + "step": 4620 + }, + { + "epoch": 0.8661668228678538, + "grad_norm": 49373.1796875, + "learning_rate": 8.743635775939177e-05, + "loss": 2.3003, + "step": 4621 + }, + { + "epoch": 0.8663542642924086, + "grad_norm": 46803.0625, + "learning_rate": 8.743114845864713e-05, + "loss": 2.2619, + "step": 4622 + }, + { + "epoch": 0.8665417057169634, + "grad_norm": 48381.50390625, + "learning_rate": 8.742593823340165e-05, + "loss": 2.3082, + "step": 4623 + }, + { + "epoch": 0.8667291471415183, + "grad_norm": 49662.3359375, + "learning_rate": 8.742072708378402e-05, + "loss": 2.2973, + "step": 4624 + }, + { + "epoch": 0.8669165885660731, + "grad_norm": 49935.078125, + "learning_rate": 8.741551500992295e-05, + "loss": 2.3208, + "step": 4625 + }, + { + "epoch": 0.867104029990628, + "grad_norm": 46898.234375, + "learning_rate": 8.741030201194715e-05, + "loss": 2.3619, + "step": 4626 + }, + { + "epoch": 0.8672914714151827, + "grad_norm": 50255.55078125, + "learning_rate": 8.740508808998541e-05, + "loss": 2.2713, + "step": 4627 + }, + { + "epoch": 0.8674789128397375, + "grad_norm": 50348.8046875, + "learning_rate": 8.739987324416649e-05, + "loss": 2.2089, + "step": 4628 + }, + { + "epoch": 0.8676663542642924, + "grad_norm": 51369.62109375, + "learning_rate": 8.73946574746192e-05, + "loss": 2.2557, + "step": 4629 + }, + { + "epoch": 0.8678537956888472, + "grad_norm": 52431.15625, + "learning_rate": 8.738944078147234e-05, + "loss": 2.3555, + "step": 4630 + }, + { + "epoch": 0.8680412371134021, + "grad_norm": 53479.89453125, + "learning_rate": 8.738422316485478e-05, + "loss": 2.3085, + "step": 4631 + }, + { + "epoch": 0.8682286785379569, + "grad_norm": 50746.7890625, + "learning_rate": 8.737900462489536e-05, + "loss": 2.3479, + "step": 4632 + }, + { + "epoch": 0.8684161199625117, + "grad_norm": 46550.68359375, + "learning_rate": 8.737378516172301e-05, + "loss": 2.266, + "step": 4633 + }, + { + "epoch": 0.8686035613870665, + "grad_norm": 53020.23828125, + "learning_rate": 8.73685647754666e-05, + "loss": 2.3003, + "step": 4634 + }, + { + "epoch": 0.8687910028116214, + "grad_norm": 49058.46484375, + "learning_rate": 8.73633434662551e-05, + "loss": 2.3635, + "step": 4635 + }, + { + "epoch": 0.8689784442361762, + "grad_norm": 48716.85546875, + "learning_rate": 8.735812123421746e-05, + "loss": 2.3261, + "step": 4636 + }, + { + "epoch": 0.869165885660731, + "grad_norm": 51175.828125, + "learning_rate": 8.735289807948266e-05, + "loss": 2.3716, + "step": 4637 + }, + { + "epoch": 0.8693533270852859, + "grad_norm": 47830.88671875, + "learning_rate": 8.734767400217971e-05, + "loss": 2.2738, + "step": 4638 + }, + { + "epoch": 0.8695407685098406, + "grad_norm": 50411.296875, + "learning_rate": 8.734244900243763e-05, + "loss": 2.3955, + "step": 4639 + }, + { + "epoch": 0.8697282099343955, + "grad_norm": 53384.74609375, + "learning_rate": 8.733722308038548e-05, + "loss": 2.2862, + "step": 4640 + }, + { + "epoch": 0.8699156513589503, + "grad_norm": 51337.69140625, + "learning_rate": 8.733199623615231e-05, + "loss": 2.3363, + "step": 4641 + }, + { + "epoch": 0.8701030927835052, + "grad_norm": 53248.77734375, + "learning_rate": 8.732676846986725e-05, + "loss": 2.3095, + "step": 4642 + }, + { + "epoch": 0.87029053420806, + "grad_norm": 45406.80078125, + "learning_rate": 8.73215397816594e-05, + "loss": 2.3321, + "step": 4643 + }, + { + "epoch": 0.8704779756326148, + "grad_norm": 51577.359375, + "learning_rate": 8.73163101716579e-05, + "loss": 2.321, + "step": 4644 + }, + { + "epoch": 0.8706654170571696, + "grad_norm": 48149.02734375, + "learning_rate": 8.731107963999194e-05, + "loss": 2.2758, + "step": 4645 + }, + { + "epoch": 0.8708528584817244, + "grad_norm": 49196.609375, + "learning_rate": 8.730584818679066e-05, + "loss": 2.3471, + "step": 4646 + }, + { + "epoch": 0.8710402999062793, + "grad_norm": 50991.72265625, + "learning_rate": 8.730061581218332e-05, + "loss": 2.4421, + "step": 4647 + }, + { + "epoch": 0.8712277413308341, + "grad_norm": 55908.8046875, + "learning_rate": 8.729538251629912e-05, + "loss": 2.3134, + "step": 4648 + }, + { + "epoch": 0.871415182755389, + "grad_norm": 47793.5, + "learning_rate": 8.729014829926734e-05, + "loss": 2.3242, + "step": 4649 + }, + { + "epoch": 0.8716026241799437, + "grad_norm": 49069.2421875, + "learning_rate": 8.728491316121723e-05, + "loss": 2.3041, + "step": 4650 + }, + { + "epoch": 0.8717900656044986, + "grad_norm": 51043.60546875, + "learning_rate": 8.72796771022781e-05, + "loss": 2.2846, + "step": 4651 + }, + { + "epoch": 0.8719775070290534, + "grad_norm": 49139.0703125, + "learning_rate": 8.727444012257928e-05, + "loss": 2.3172, + "step": 4652 + }, + { + "epoch": 0.8721649484536083, + "grad_norm": 52410.421875, + "learning_rate": 8.726920222225013e-05, + "loss": 2.275, + "step": 4653 + }, + { + "epoch": 0.8723523898781631, + "grad_norm": 46173.8984375, + "learning_rate": 8.726396340142e-05, + "loss": 2.3315, + "step": 4654 + }, + { + "epoch": 0.872539831302718, + "grad_norm": 47484.05859375, + "learning_rate": 8.725872366021828e-05, + "loss": 2.3038, + "step": 4655 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 58669.79296875, + "learning_rate": 8.72534829987744e-05, + "loss": 2.315, + "step": 4656 + }, + { + "epoch": 0.8729147141518275, + "grad_norm": 47947.19140625, + "learning_rate": 8.724824141721779e-05, + "loss": 2.3023, + "step": 4657 + }, + { + "epoch": 0.8731021555763824, + "grad_norm": 54675.1953125, + "learning_rate": 8.72429989156779e-05, + "loss": 2.3738, + "step": 4658 + }, + { + "epoch": 0.8732895970009372, + "grad_norm": 49471.3515625, + "learning_rate": 8.723775549428423e-05, + "loss": 2.2922, + "step": 4659 + }, + { + "epoch": 0.8734770384254921, + "grad_norm": 49798.56640625, + "learning_rate": 8.723251115316628e-05, + "loss": 2.2845, + "step": 4660 + }, + { + "epoch": 0.8736644798500469, + "grad_norm": 47367.87890625, + "learning_rate": 8.722726589245359e-05, + "loss": 2.3532, + "step": 4661 + }, + { + "epoch": 0.8738519212746017, + "grad_norm": 48548.04296875, + "learning_rate": 8.722201971227569e-05, + "loss": 2.3659, + "step": 4662 + }, + { + "epoch": 0.8740393626991565, + "grad_norm": 50699.0703125, + "learning_rate": 8.721677261276215e-05, + "loss": 2.3228, + "step": 4663 + }, + { + "epoch": 0.8742268041237113, + "grad_norm": 47067.6328125, + "learning_rate": 8.721152459404258e-05, + "loss": 2.2576, + "step": 4664 + }, + { + "epoch": 0.8744142455482662, + "grad_norm": 49937.32421875, + "learning_rate": 8.720627565624661e-05, + "loss": 2.2846, + "step": 4665 + }, + { + "epoch": 0.874601686972821, + "grad_norm": 51146.65625, + "learning_rate": 8.720102579950387e-05, + "loss": 2.3237, + "step": 4666 + }, + { + "epoch": 0.8747891283973758, + "grad_norm": 48255.0859375, + "learning_rate": 8.719577502394401e-05, + "loss": 2.2547, + "step": 4667 + }, + { + "epoch": 0.8749765698219306, + "grad_norm": 54186.45703125, + "learning_rate": 8.719052332969675e-05, + "loss": 2.2725, + "step": 4668 + }, + { + "epoch": 0.8751640112464855, + "grad_norm": 49391.28125, + "learning_rate": 8.718527071689177e-05, + "loss": 2.2912, + "step": 4669 + }, + { + "epoch": 0.8753514526710403, + "grad_norm": 47337.46484375, + "learning_rate": 8.718001718565881e-05, + "loss": 2.4005, + "step": 4670 + }, + { + "epoch": 0.8755388940955952, + "grad_norm": 56083.46875, + "learning_rate": 8.717476273612765e-05, + "loss": 2.3259, + "step": 4671 + }, + { + "epoch": 0.87572633552015, + "grad_norm": 46998.3203125, + "learning_rate": 8.716950736842804e-05, + "loss": 2.3128, + "step": 4672 + }, + { + "epoch": 0.8759137769447047, + "grad_norm": 48201.9375, + "learning_rate": 8.716425108268978e-05, + "loss": 2.3049, + "step": 4673 + }, + { + "epoch": 0.8761012183692596, + "grad_norm": 48043.6015625, + "learning_rate": 8.71589938790427e-05, + "loss": 2.3422, + "step": 4674 + }, + { + "epoch": 0.8762886597938144, + "grad_norm": 49393.3046875, + "learning_rate": 8.715373575761667e-05, + "loss": 2.3009, + "step": 4675 + }, + { + "epoch": 0.8764761012183693, + "grad_norm": 49535.7265625, + "learning_rate": 8.714847671854151e-05, + "loss": 2.2662, + "step": 4676 + }, + { + "epoch": 0.8766635426429241, + "grad_norm": 49218.74609375, + "learning_rate": 8.714321676194715e-05, + "loss": 2.2699, + "step": 4677 + }, + { + "epoch": 0.876850984067479, + "grad_norm": 45859.8984375, + "learning_rate": 8.71379558879635e-05, + "loss": 2.3006, + "step": 4678 + }, + { + "epoch": 0.8770384254920337, + "grad_norm": 45722.01171875, + "learning_rate": 8.71326940967205e-05, + "loss": 2.2884, + "step": 4679 + }, + { + "epoch": 0.8772258669165885, + "grad_norm": 51805.14453125, + "learning_rate": 8.712743138834807e-05, + "loss": 2.2778, + "step": 4680 + }, + { + "epoch": 0.8774133083411434, + "grad_norm": 53515.13671875, + "learning_rate": 8.712216776297624e-05, + "loss": 2.3897, + "step": 4681 + }, + { + "epoch": 0.8776007497656982, + "grad_norm": 46813.828125, + "learning_rate": 8.711690322073498e-05, + "loss": 2.3362, + "step": 4682 + }, + { + "epoch": 0.8777881911902531, + "grad_norm": 50049.30859375, + "learning_rate": 8.711163776175435e-05, + "loss": 2.3435, + "step": 4683 + }, + { + "epoch": 0.8779756326148078, + "grad_norm": 46527.8125, + "learning_rate": 8.710637138616438e-05, + "loss": 2.4068, + "step": 4684 + }, + { + "epoch": 0.8781630740393627, + "grad_norm": 50810.0, + "learning_rate": 8.710110409409514e-05, + "loss": 2.303, + "step": 4685 + }, + { + "epoch": 0.8783505154639175, + "grad_norm": 53376.4453125, + "learning_rate": 8.709583588567673e-05, + "loss": 2.301, + "step": 4686 + }, + { + "epoch": 0.8785379568884724, + "grad_norm": 51179.67578125, + "learning_rate": 8.709056676103927e-05, + "loss": 2.2712, + "step": 4687 + }, + { + "epoch": 0.8787253983130272, + "grad_norm": 50943.26953125, + "learning_rate": 8.708529672031291e-05, + "loss": 2.2559, + "step": 4688 + }, + { + "epoch": 0.878912839737582, + "grad_norm": 50373.41796875, + "learning_rate": 8.708002576362779e-05, + "loss": 2.3143, + "step": 4689 + }, + { + "epoch": 0.8791002811621368, + "grad_norm": 45772.13671875, + "learning_rate": 8.707475389111411e-05, + "loss": 2.296, + "step": 4690 + }, + { + "epoch": 0.8792877225866916, + "grad_norm": 51810.6484375, + "learning_rate": 8.70694811029021e-05, + "loss": 2.3586, + "step": 4691 + }, + { + "epoch": 0.8794751640112465, + "grad_norm": 49295.20703125, + "learning_rate": 8.706420739912193e-05, + "loss": 2.3463, + "step": 4692 + }, + { + "epoch": 0.8796626054358013, + "grad_norm": 47444.984375, + "learning_rate": 8.705893277990391e-05, + "loss": 2.322, + "step": 4693 + }, + { + "epoch": 0.8798500468603562, + "grad_norm": 45882.36328125, + "learning_rate": 8.70536572453783e-05, + "loss": 2.2518, + "step": 4694 + }, + { + "epoch": 0.880037488284911, + "grad_norm": 50911.90625, + "learning_rate": 8.70483807956754e-05, + "loss": 2.2961, + "step": 4695 + }, + { + "epoch": 0.8802249297094658, + "grad_norm": 48781.125, + "learning_rate": 8.704310343092553e-05, + "loss": 2.3147, + "step": 4696 + }, + { + "epoch": 0.8804123711340206, + "grad_norm": 52991.8828125, + "learning_rate": 8.703782515125902e-05, + "loss": 2.2161, + "step": 4697 + }, + { + "epoch": 0.8805998125585754, + "grad_norm": 48289.01171875, + "learning_rate": 8.703254595680626e-05, + "loss": 2.3509, + "step": 4698 + }, + { + "epoch": 0.8807872539831303, + "grad_norm": 49862.30859375, + "learning_rate": 8.702726584769762e-05, + "loss": 2.3439, + "step": 4699 + }, + { + "epoch": 0.8809746954076851, + "grad_norm": 47984.8828125, + "learning_rate": 8.702198482406352e-05, + "loss": 2.3855, + "step": 4700 + }, + { + "epoch": 0.8811621368322399, + "grad_norm": 50120.75390625, + "learning_rate": 8.70167028860344e-05, + "loss": 2.354, + "step": 4701 + }, + { + "epoch": 0.8813495782567947, + "grad_norm": 53512.1796875, + "learning_rate": 8.701142003374071e-05, + "loss": 2.3044, + "step": 4702 + }, + { + "epoch": 0.8815370196813496, + "grad_norm": 45692.328125, + "learning_rate": 8.700613626731293e-05, + "loss": 2.3352, + "step": 4703 + }, + { + "epoch": 0.8817244611059044, + "grad_norm": 48149.80859375, + "learning_rate": 8.700085158688157e-05, + "loss": 2.3306, + "step": 4704 + }, + { + "epoch": 0.8819119025304593, + "grad_norm": 49543.15234375, + "learning_rate": 8.699556599257715e-05, + "loss": 2.2561, + "step": 4705 + }, + { + "epoch": 0.8820993439550141, + "grad_norm": 52632.3515625, + "learning_rate": 8.69902794845302e-05, + "loss": 2.3093, + "step": 4706 + }, + { + "epoch": 0.8822867853795688, + "grad_norm": 49961.28125, + "learning_rate": 8.698499206287131e-05, + "loss": 2.3078, + "step": 4707 + }, + { + "epoch": 0.8824742268041237, + "grad_norm": 48316.3984375, + "learning_rate": 8.697970372773107e-05, + "loss": 2.3193, + "step": 4708 + }, + { + "epoch": 0.8826616682286785, + "grad_norm": 48085.11328125, + "learning_rate": 8.69744144792401e-05, + "loss": 2.3067, + "step": 4709 + }, + { + "epoch": 0.8828491096532334, + "grad_norm": 49150.0859375, + "learning_rate": 8.696912431752902e-05, + "loss": 2.3219, + "step": 4710 + }, + { + "epoch": 0.8830365510777882, + "grad_norm": 50350.61328125, + "learning_rate": 8.69638332427285e-05, + "loss": 2.359, + "step": 4711 + }, + { + "epoch": 0.8832239925023431, + "grad_norm": 46314.625, + "learning_rate": 8.695854125496923e-05, + "loss": 2.3679, + "step": 4712 + }, + { + "epoch": 0.8834114339268978, + "grad_norm": 50812.78125, + "learning_rate": 8.69532483543819e-05, + "loss": 2.4238, + "step": 4713 + }, + { + "epoch": 0.8835988753514527, + "grad_norm": 48953.37109375, + "learning_rate": 8.694795454109727e-05, + "loss": 2.2736, + "step": 4714 + }, + { + "epoch": 0.8837863167760075, + "grad_norm": 48573.30859375, + "learning_rate": 8.694265981524606e-05, + "loss": 2.3369, + "step": 4715 + }, + { + "epoch": 0.8839737582005623, + "grad_norm": 53762.14453125, + "learning_rate": 8.693736417695903e-05, + "loss": 2.393, + "step": 4716 + }, + { + "epoch": 0.8841611996251172, + "grad_norm": 48618.90625, + "learning_rate": 8.6932067626367e-05, + "loss": 2.301, + "step": 4717 + }, + { + "epoch": 0.8843486410496719, + "grad_norm": 52316.43359375, + "learning_rate": 8.692677016360078e-05, + "loss": 2.3105, + "step": 4718 + }, + { + "epoch": 0.8845360824742268, + "grad_norm": 49166.6953125, + "learning_rate": 8.692147178879122e-05, + "loss": 2.328, + "step": 4719 + }, + { + "epoch": 0.8847235238987816, + "grad_norm": 51406.41796875, + "learning_rate": 8.691617250206918e-05, + "loss": 2.3259, + "step": 4720 + }, + { + "epoch": 0.8849109653233365, + "grad_norm": 52541.890625, + "learning_rate": 8.691087230356554e-05, + "loss": 2.2703, + "step": 4721 + }, + { + "epoch": 0.8850984067478913, + "grad_norm": 49324.2890625, + "learning_rate": 8.690557119341121e-05, + "loss": 2.3881, + "step": 4722 + }, + { + "epoch": 0.8852858481724462, + "grad_norm": 48763.32421875, + "learning_rate": 8.69002691717371e-05, + "loss": 2.3222, + "step": 4723 + }, + { + "epoch": 0.8854732895970009, + "grad_norm": 51590.0, + "learning_rate": 8.689496623867421e-05, + "loss": 2.3002, + "step": 4724 + }, + { + "epoch": 0.8856607310215557, + "grad_norm": 56717.421875, + "learning_rate": 8.688966239435348e-05, + "loss": 2.3091, + "step": 4725 + }, + { + "epoch": 0.8858481724461106, + "grad_norm": 51029.24609375, + "learning_rate": 8.688435763890592e-05, + "loss": 2.2489, + "step": 4726 + }, + { + "epoch": 0.8860356138706654, + "grad_norm": 54905.94140625, + "learning_rate": 8.687905197246254e-05, + "loss": 2.3297, + "step": 4727 + }, + { + "epoch": 0.8862230552952203, + "grad_norm": 47499.9921875, + "learning_rate": 8.68737453951544e-05, + "loss": 2.3291, + "step": 4728 + }, + { + "epoch": 0.8864104967197751, + "grad_norm": 47571.8359375, + "learning_rate": 8.686843790711254e-05, + "loss": 2.329, + "step": 4729 + }, + { + "epoch": 0.8865979381443299, + "grad_norm": 51605.26953125, + "learning_rate": 8.686312950846809e-05, + "loss": 2.3908, + "step": 4730 + }, + { + "epoch": 0.8867853795688847, + "grad_norm": 50061.35546875, + "learning_rate": 8.685782019935212e-05, + "loss": 2.2373, + "step": 4731 + }, + { + "epoch": 0.8869728209934395, + "grad_norm": 47362.734375, + "learning_rate": 8.685250997989578e-05, + "loss": 2.3328, + "step": 4732 + }, + { + "epoch": 0.8871602624179944, + "grad_norm": 49169.0546875, + "learning_rate": 8.684719885023023e-05, + "loss": 2.3068, + "step": 4733 + }, + { + "epoch": 0.8873477038425492, + "grad_norm": 48700.83203125, + "learning_rate": 8.684188681048662e-05, + "loss": 2.3113, + "step": 4734 + }, + { + "epoch": 0.887535145267104, + "grad_norm": 46832.140625, + "learning_rate": 8.683657386079618e-05, + "loss": 2.312, + "step": 4735 + }, + { + "epoch": 0.8877225866916588, + "grad_norm": 47560.55859375, + "learning_rate": 8.683126000129013e-05, + "loss": 2.375, + "step": 4736 + }, + { + "epoch": 0.8879100281162137, + "grad_norm": 49426.85546875, + "learning_rate": 8.682594523209973e-05, + "loss": 2.3081, + "step": 4737 + }, + { + "epoch": 0.8880974695407685, + "grad_norm": 50255.7265625, + "learning_rate": 8.68206295533562e-05, + "loss": 2.3104, + "step": 4738 + }, + { + "epoch": 0.8882849109653234, + "grad_norm": 45902.4765625, + "learning_rate": 8.681531296519086e-05, + "loss": 2.308, + "step": 4739 + }, + { + "epoch": 0.8884723523898782, + "grad_norm": 46240.44921875, + "learning_rate": 8.680999546773503e-05, + "loss": 2.317, + "step": 4740 + }, + { + "epoch": 0.8886597938144329, + "grad_norm": 45922.5859375, + "learning_rate": 8.680467706112004e-05, + "loss": 2.3571, + "step": 4741 + }, + { + "epoch": 0.8888472352389878, + "grad_norm": 54447.43359375, + "learning_rate": 8.679935774547725e-05, + "loss": 2.3324, + "step": 4742 + }, + { + "epoch": 0.8890346766635426, + "grad_norm": 53443.84765625, + "learning_rate": 8.679403752093803e-05, + "loss": 2.2999, + "step": 4743 + }, + { + "epoch": 0.8892221180880975, + "grad_norm": 48606.7578125, + "learning_rate": 8.678871638763377e-05, + "loss": 2.3253, + "step": 4744 + }, + { + "epoch": 0.8894095595126523, + "grad_norm": 55993.26953125, + "learning_rate": 8.678339434569594e-05, + "loss": 2.4046, + "step": 4745 + }, + { + "epoch": 0.8895970009372072, + "grad_norm": 48177.01171875, + "learning_rate": 8.677807139525593e-05, + "loss": 2.3113, + "step": 4746 + }, + { + "epoch": 0.8897844423617619, + "grad_norm": 50964.28125, + "learning_rate": 8.677274753644526e-05, + "loss": 2.2825, + "step": 4747 + }, + { + "epoch": 0.8899718837863168, + "grad_norm": 51196.53515625, + "learning_rate": 8.67674227693954e-05, + "loss": 2.2615, + "step": 4748 + }, + { + "epoch": 0.8901593252108716, + "grad_norm": 56749.3515625, + "learning_rate": 8.676209709423786e-05, + "loss": 2.3548, + "step": 4749 + }, + { + "epoch": 0.8903467666354264, + "grad_norm": 51015.4453125, + "learning_rate": 8.67567705111042e-05, + "loss": 2.2872, + "step": 4750 + }, + { + "epoch": 0.8905342080599813, + "grad_norm": 47316.72265625, + "learning_rate": 8.675144302012594e-05, + "loss": 2.2852, + "step": 4751 + }, + { + "epoch": 0.8907216494845361, + "grad_norm": 52093.24609375, + "learning_rate": 8.674611462143471e-05, + "loss": 2.2416, + "step": 4752 + }, + { + "epoch": 0.8909090909090909, + "grad_norm": 49092.796875, + "learning_rate": 8.674078531516207e-05, + "loss": 2.2752, + "step": 4753 + }, + { + "epoch": 0.8910965323336457, + "grad_norm": 57198.015625, + "learning_rate": 8.67354551014397e-05, + "loss": 2.3066, + "step": 4754 + }, + { + "epoch": 0.8912839737582006, + "grad_norm": 50955.0703125, + "learning_rate": 8.67301239803992e-05, + "loss": 2.2895, + "step": 4755 + }, + { + "epoch": 0.8914714151827554, + "grad_norm": 52177.23046875, + "learning_rate": 8.672479195217225e-05, + "loss": 2.3161, + "step": 4756 + }, + { + "epoch": 0.8916588566073103, + "grad_norm": 48882.60546875, + "learning_rate": 8.671945901689054e-05, + "loss": 2.2015, + "step": 4757 + }, + { + "epoch": 0.891846298031865, + "grad_norm": 50925.48046875, + "learning_rate": 8.671412517468583e-05, + "loss": 2.3038, + "step": 4758 + }, + { + "epoch": 0.8920337394564198, + "grad_norm": 53427.84375, + "learning_rate": 8.670879042568982e-05, + "loss": 2.3257, + "step": 4759 + }, + { + "epoch": 0.8922211808809747, + "grad_norm": 49324.23046875, + "learning_rate": 8.670345477003428e-05, + "loss": 2.3549, + "step": 4760 + }, + { + "epoch": 0.8924086223055295, + "grad_norm": 55780.484375, + "learning_rate": 8.669811820785098e-05, + "loss": 2.3063, + "step": 4761 + }, + { + "epoch": 0.8925960637300844, + "grad_norm": 50120.453125, + "learning_rate": 8.669278073927177e-05, + "loss": 2.3527, + "step": 4762 + }, + { + "epoch": 0.8927835051546392, + "grad_norm": 54860.76953125, + "learning_rate": 8.668744236442842e-05, + "loss": 2.2963, + "step": 4763 + }, + { + "epoch": 0.892970946579194, + "grad_norm": 50125.0390625, + "learning_rate": 8.668210308345282e-05, + "loss": 2.3048, + "step": 4764 + }, + { + "epoch": 0.8931583880037488, + "grad_norm": 49505.05078125, + "learning_rate": 8.667676289647683e-05, + "loss": 2.3375, + "step": 4765 + }, + { + "epoch": 0.8933458294283037, + "grad_norm": 48498.515625, + "learning_rate": 8.667142180363234e-05, + "loss": 2.2424, + "step": 4766 + }, + { + "epoch": 0.8935332708528585, + "grad_norm": 50243.296875, + "learning_rate": 8.666607980505131e-05, + "loss": 2.3536, + "step": 4767 + }, + { + "epoch": 0.8937207122774133, + "grad_norm": 50867.3828125, + "learning_rate": 8.666073690086562e-05, + "loss": 2.3407, + "step": 4768 + }, + { + "epoch": 0.8939081537019682, + "grad_norm": 47442.48828125, + "learning_rate": 8.665539309120727e-05, + "loss": 2.3296, + "step": 4769 + }, + { + "epoch": 0.8940955951265229, + "grad_norm": 45916.51953125, + "learning_rate": 8.665004837620821e-05, + "loss": 2.2784, + "step": 4770 + }, + { + "epoch": 0.8942830365510778, + "grad_norm": 55161.01171875, + "learning_rate": 8.664470275600049e-05, + "loss": 2.4531, + "step": 4771 + }, + { + "epoch": 0.8944704779756326, + "grad_norm": 47511.23828125, + "learning_rate": 8.663935623071613e-05, + "loss": 2.3388, + "step": 4772 + }, + { + "epoch": 0.8946579194001875, + "grad_norm": 48902.9140625, + "learning_rate": 8.663400880048718e-05, + "loss": 2.3836, + "step": 4773 + }, + { + "epoch": 0.8948453608247423, + "grad_norm": 48800.015625, + "learning_rate": 8.66286604654457e-05, + "loss": 2.3471, + "step": 4774 + }, + { + "epoch": 0.895032802249297, + "grad_norm": 48689.6328125, + "learning_rate": 8.66233112257238e-05, + "loss": 2.3019, + "step": 4775 + }, + { + "epoch": 0.8952202436738519, + "grad_norm": 45569.05859375, + "learning_rate": 8.661796108145358e-05, + "loss": 2.3177, + "step": 4776 + }, + { + "epoch": 0.8954076850984067, + "grad_norm": 49062.109375, + "learning_rate": 8.661261003276721e-05, + "loss": 2.3176, + "step": 4777 + }, + { + "epoch": 0.8955951265229616, + "grad_norm": 50576.8515625, + "learning_rate": 8.660725807979684e-05, + "loss": 2.2686, + "step": 4778 + }, + { + "epoch": 0.8957825679475164, + "grad_norm": 49207.078125, + "learning_rate": 8.660190522267466e-05, + "loss": 2.3099, + "step": 4779 + }, + { + "epoch": 0.8959700093720713, + "grad_norm": 50806.36328125, + "learning_rate": 8.659655146153287e-05, + "loss": 2.2901, + "step": 4780 + }, + { + "epoch": 0.896157450796626, + "grad_norm": 50433.9375, + "learning_rate": 8.65911967965037e-05, + "loss": 2.3022, + "step": 4781 + }, + { + "epoch": 0.8963448922211809, + "grad_norm": 49014.109375, + "learning_rate": 8.658584122771943e-05, + "loss": 2.3144, + "step": 4782 + }, + { + "epoch": 0.8965323336457357, + "grad_norm": 45938.20703125, + "learning_rate": 8.658048475531231e-05, + "loss": 2.2672, + "step": 4783 + }, + { + "epoch": 0.8967197750702905, + "grad_norm": 46801.28125, + "learning_rate": 8.657512737941464e-05, + "loss": 2.313, + "step": 4784 + }, + { + "epoch": 0.8969072164948454, + "grad_norm": 47151.24609375, + "learning_rate": 8.656976910015873e-05, + "loss": 2.349, + "step": 4785 + }, + { + "epoch": 0.8970946579194002, + "grad_norm": 46011.6171875, + "learning_rate": 8.656440991767696e-05, + "loss": 2.3431, + "step": 4786 + }, + { + "epoch": 0.897282099343955, + "grad_norm": 43697.234375, + "learning_rate": 8.655904983210166e-05, + "loss": 2.3319, + "step": 4787 + }, + { + "epoch": 0.8974695407685098, + "grad_norm": 51221.4296875, + "learning_rate": 8.655368884356523e-05, + "loss": 2.3113, + "step": 4788 + }, + { + "epoch": 0.8976569821930647, + "grad_norm": 51147.2265625, + "learning_rate": 8.654832695220008e-05, + "loss": 2.3175, + "step": 4789 + }, + { + "epoch": 0.8978444236176195, + "grad_norm": 44897.41015625, + "learning_rate": 8.654296415813864e-05, + "loss": 2.3618, + "step": 4790 + }, + { + "epoch": 0.8980318650421744, + "grad_norm": 50098.42578125, + "learning_rate": 8.653760046151336e-05, + "loss": 2.2488, + "step": 4791 + }, + { + "epoch": 0.8982193064667291, + "grad_norm": 58691.0, + "learning_rate": 8.653223586245673e-05, + "loss": 2.2572, + "step": 4792 + }, + { + "epoch": 0.8984067478912839, + "grad_norm": 48547.9296875, + "learning_rate": 8.652687036110123e-05, + "loss": 2.4104, + "step": 4793 + }, + { + "epoch": 0.8985941893158388, + "grad_norm": 48500.8125, + "learning_rate": 8.652150395757938e-05, + "loss": 2.3164, + "step": 4794 + }, + { + "epoch": 0.8987816307403936, + "grad_norm": 49236.234375, + "learning_rate": 8.651613665202375e-05, + "loss": 2.3424, + "step": 4795 + }, + { + "epoch": 0.8989690721649485, + "grad_norm": 50763.6953125, + "learning_rate": 8.651076844456689e-05, + "loss": 2.3044, + "step": 4796 + }, + { + "epoch": 0.8991565135895033, + "grad_norm": 55283.6875, + "learning_rate": 8.650539933534136e-05, + "loss": 2.2767, + "step": 4797 + }, + { + "epoch": 0.8993439550140581, + "grad_norm": 49363.7734375, + "learning_rate": 8.650002932447983e-05, + "loss": 2.3594, + "step": 4798 + }, + { + "epoch": 0.8995313964386129, + "grad_norm": 50587.10546875, + "learning_rate": 8.649465841211488e-05, + "loss": 2.2719, + "step": 4799 + }, + { + "epoch": 0.8997188378631678, + "grad_norm": 51692.12890625, + "learning_rate": 8.648928659837919e-05, + "loss": 2.3343, + "step": 4800 + }, + { + "epoch": 0.8999062792877226, + "grad_norm": 49258.55859375, + "learning_rate": 8.648391388340543e-05, + "loss": 2.279, + "step": 4801 + }, + { + "epoch": 0.9000937207122774, + "grad_norm": 48449.59765625, + "learning_rate": 8.647854026732629e-05, + "loss": 2.343, + "step": 4802 + }, + { + "epoch": 0.9002811621368323, + "grad_norm": 48131.9296875, + "learning_rate": 8.64731657502745e-05, + "loss": 2.2621, + "step": 4803 + }, + { + "epoch": 0.900468603561387, + "grad_norm": 52226.59375, + "learning_rate": 8.646779033238283e-05, + "loss": 2.3613, + "step": 4804 + }, + { + "epoch": 0.9006560449859419, + "grad_norm": 48070.78125, + "learning_rate": 8.6462414013784e-05, + "loss": 2.3238, + "step": 4805 + }, + { + "epoch": 0.9008434864104967, + "grad_norm": 45404.81640625, + "learning_rate": 8.645703679461082e-05, + "loss": 2.3205, + "step": 4806 + }, + { + "epoch": 0.9010309278350516, + "grad_norm": 53221.98828125, + "learning_rate": 8.64516586749961e-05, + "loss": 2.284, + "step": 4807 + }, + { + "epoch": 0.9012183692596064, + "grad_norm": 48946.65625, + "learning_rate": 8.644627965507266e-05, + "loss": 2.2889, + "step": 4808 + }, + { + "epoch": 0.9014058106841611, + "grad_norm": 50118.859375, + "learning_rate": 8.644089973497338e-05, + "loss": 2.38, + "step": 4809 + }, + { + "epoch": 0.901593252108716, + "grad_norm": 51698.70703125, + "learning_rate": 8.643551891483113e-05, + "loss": 2.3331, + "step": 4810 + }, + { + "epoch": 0.9017806935332708, + "grad_norm": 54429.0546875, + "learning_rate": 8.643013719477877e-05, + "loss": 2.3351, + "step": 4811 + }, + { + "epoch": 0.9019681349578257, + "grad_norm": 49135.7890625, + "learning_rate": 8.642475457494929e-05, + "loss": 2.2771, + "step": 4812 + }, + { + "epoch": 0.9021555763823805, + "grad_norm": 49282.58984375, + "learning_rate": 8.641937105547557e-05, + "loss": 2.3411, + "step": 4813 + }, + { + "epoch": 0.9023430178069354, + "grad_norm": 49530.421875, + "learning_rate": 8.641398663649063e-05, + "loss": 2.398, + "step": 4814 + }, + { + "epoch": 0.9025304592314901, + "grad_norm": 48635.484375, + "learning_rate": 8.64086013181274e-05, + "loss": 2.3396, + "step": 4815 + }, + { + "epoch": 0.902717900656045, + "grad_norm": 50885.64453125, + "learning_rate": 8.640321510051895e-05, + "loss": 2.3092, + "step": 4816 + }, + { + "epoch": 0.9029053420805998, + "grad_norm": 46528.48828125, + "learning_rate": 8.639782798379826e-05, + "loss": 2.217, + "step": 4817 + }, + { + "epoch": 0.9030927835051547, + "grad_norm": 48761.8828125, + "learning_rate": 8.639243996809842e-05, + "loss": 2.27, + "step": 4818 + }, + { + "epoch": 0.9032802249297095, + "grad_norm": 48503.25390625, + "learning_rate": 8.638705105355251e-05, + "loss": 2.3701, + "step": 4819 + }, + { + "epoch": 0.9034676663542643, + "grad_norm": 48120.7890625, + "learning_rate": 8.63816612402936e-05, + "loss": 2.3336, + "step": 4820 + }, + { + "epoch": 0.9036551077788191, + "grad_norm": 54862.859375, + "learning_rate": 8.637627052845482e-05, + "loss": 2.3196, + "step": 4821 + }, + { + "epoch": 0.9038425492033739, + "grad_norm": 54872.5703125, + "learning_rate": 8.637087891816934e-05, + "loss": 2.2424, + "step": 4822 + }, + { + "epoch": 0.9040299906279288, + "grad_norm": 52730.96875, + "learning_rate": 8.63654864095703e-05, + "loss": 2.3868, + "step": 4823 + }, + { + "epoch": 0.9042174320524836, + "grad_norm": 48985.1484375, + "learning_rate": 8.636009300279089e-05, + "loss": 2.3237, + "step": 4824 + }, + { + "epoch": 0.9044048734770385, + "grad_norm": 53915.85546875, + "learning_rate": 8.635469869796433e-05, + "loss": 2.3278, + "step": 4825 + }, + { + "epoch": 0.9045923149015932, + "grad_norm": 54189.60546875, + "learning_rate": 8.634930349522384e-05, + "loss": 2.4315, + "step": 4826 + }, + { + "epoch": 0.904779756326148, + "grad_norm": 47738.20703125, + "learning_rate": 8.634390739470268e-05, + "loss": 2.3063, + "step": 4827 + }, + { + "epoch": 0.9049671977507029, + "grad_norm": 46521.28125, + "learning_rate": 8.633851039653413e-05, + "loss": 2.3071, + "step": 4828 + }, + { + "epoch": 0.9051546391752577, + "grad_norm": 50904.04296875, + "learning_rate": 8.63331125008515e-05, + "loss": 2.3658, + "step": 4829 + }, + { + "epoch": 0.9053420805998126, + "grad_norm": 53286.95703125, + "learning_rate": 8.632771370778808e-05, + "loss": 2.3175, + "step": 4830 + }, + { + "epoch": 0.9055295220243674, + "grad_norm": 54679.58203125, + "learning_rate": 8.632231401747724e-05, + "loss": 2.3185, + "step": 4831 + }, + { + "epoch": 0.9057169634489222, + "grad_norm": 49695.4140625, + "learning_rate": 8.631691343005233e-05, + "loss": 2.4016, + "step": 4832 + }, + { + "epoch": 0.905904404873477, + "grad_norm": 52008.33203125, + "learning_rate": 8.631151194564674e-05, + "loss": 2.2087, + "step": 4833 + }, + { + "epoch": 0.9060918462980319, + "grad_norm": 48494.3671875, + "learning_rate": 8.63061095643939e-05, + "loss": 2.3426, + "step": 4834 + }, + { + "epoch": 0.9062792877225867, + "grad_norm": 53842.21484375, + "learning_rate": 8.630070628642721e-05, + "loss": 2.3707, + "step": 4835 + }, + { + "epoch": 0.9064667291471415, + "grad_norm": 49437.6875, + "learning_rate": 8.629530211188015e-05, + "loss": 2.1945, + "step": 4836 + }, + { + "epoch": 0.9066541705716964, + "grad_norm": 52053.375, + "learning_rate": 8.628989704088618e-05, + "loss": 2.2276, + "step": 4837 + }, + { + "epoch": 0.9068416119962511, + "grad_norm": 52876.484375, + "learning_rate": 8.628449107357882e-05, + "loss": 2.2535, + "step": 4838 + }, + { + "epoch": 0.907029053420806, + "grad_norm": 50319.609375, + "learning_rate": 8.627908421009154e-05, + "loss": 2.3277, + "step": 4839 + }, + { + "epoch": 0.9072164948453608, + "grad_norm": 44562.234375, + "learning_rate": 8.627367645055794e-05, + "loss": 2.2959, + "step": 4840 + }, + { + "epoch": 0.9074039362699157, + "grad_norm": 46784.5859375, + "learning_rate": 8.626826779511155e-05, + "loss": 2.3817, + "step": 4841 + }, + { + "epoch": 0.9075913776944705, + "grad_norm": 44046.15234375, + "learning_rate": 8.626285824388598e-05, + "loss": 2.3265, + "step": 4842 + }, + { + "epoch": 0.9077788191190254, + "grad_norm": 49106.53515625, + "learning_rate": 8.625744779701483e-05, + "loss": 2.3426, + "step": 4843 + }, + { + "epoch": 0.9079662605435801, + "grad_norm": 54308.07421875, + "learning_rate": 8.625203645463171e-05, + "loss": 2.2927, + "step": 4844 + }, + { + "epoch": 0.9081537019681349, + "grad_norm": 47593.89453125, + "learning_rate": 8.62466242168703e-05, + "loss": 2.3465, + "step": 4845 + }, + { + "epoch": 0.9083411433926898, + "grad_norm": 48910.30859375, + "learning_rate": 8.624121108386427e-05, + "loss": 2.2393, + "step": 4846 + }, + { + "epoch": 0.9085285848172446, + "grad_norm": 52669.1875, + "learning_rate": 8.623579705574731e-05, + "loss": 2.3583, + "step": 4847 + }, + { + "epoch": 0.9087160262417995, + "grad_norm": 52246.68359375, + "learning_rate": 8.623038213265314e-05, + "loss": 2.3487, + "step": 4848 + }, + { + "epoch": 0.9089034676663542, + "grad_norm": 51577.52734375, + "learning_rate": 8.622496631471551e-05, + "loss": 2.294, + "step": 4849 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 54429.1796875, + "learning_rate": 8.621954960206817e-05, + "loss": 2.2713, + "step": 4850 + }, + { + "epoch": 0.9092783505154639, + "grad_norm": 49038.671875, + "learning_rate": 8.621413199484493e-05, + "loss": 2.3206, + "step": 4851 + }, + { + "epoch": 0.9094657919400188, + "grad_norm": 47768.12890625, + "learning_rate": 8.620871349317958e-05, + "loss": 2.3519, + "step": 4852 + }, + { + "epoch": 0.9096532333645736, + "grad_norm": 50384.3671875, + "learning_rate": 8.620329409720593e-05, + "loss": 2.3212, + "step": 4853 + }, + { + "epoch": 0.9098406747891284, + "grad_norm": 54789.06640625, + "learning_rate": 8.619787380705787e-05, + "loss": 2.3231, + "step": 4854 + }, + { + "epoch": 0.9100281162136832, + "grad_norm": 51785.8515625, + "learning_rate": 8.619245262286926e-05, + "loss": 2.2869, + "step": 4855 + }, + { + "epoch": 0.910215557638238, + "grad_norm": 52563.1640625, + "learning_rate": 8.618703054477399e-05, + "loss": 2.3217, + "step": 4856 + }, + { + "epoch": 0.9104029990627929, + "grad_norm": 49860.00390625, + "learning_rate": 8.618160757290599e-05, + "loss": 2.3274, + "step": 4857 + }, + { + "epoch": 0.9105904404873477, + "grad_norm": 48503.3515625, + "learning_rate": 8.617618370739918e-05, + "loss": 2.277, + "step": 4858 + }, + { + "epoch": 0.9107778819119026, + "grad_norm": 46662.49609375, + "learning_rate": 8.617075894838756e-05, + "loss": 2.3197, + "step": 4859 + }, + { + "epoch": 0.9109653233364574, + "grad_norm": 48742.265625, + "learning_rate": 8.616533329600505e-05, + "loss": 2.3244, + "step": 4860 + }, + { + "epoch": 0.9111527647610121, + "grad_norm": 59915.56640625, + "learning_rate": 8.615990675038573e-05, + "loss": 2.4358, + "step": 4861 + }, + { + "epoch": 0.911340206185567, + "grad_norm": 50838.078125, + "learning_rate": 8.615447931166358e-05, + "loss": 2.368, + "step": 4862 + }, + { + "epoch": 0.9115276476101218, + "grad_norm": 54462.59765625, + "learning_rate": 8.614905097997268e-05, + "loss": 2.3276, + "step": 4863 + }, + { + "epoch": 0.9117150890346767, + "grad_norm": 49899.640625, + "learning_rate": 8.614362175544707e-05, + "loss": 2.2609, + "step": 4864 + }, + { + "epoch": 0.9119025304592315, + "grad_norm": 48756.7734375, + "learning_rate": 8.613819163822088e-05, + "loss": 2.3401, + "step": 4865 + }, + { + "epoch": 0.9120899718837863, + "grad_norm": 49889.6953125, + "learning_rate": 8.61327606284282e-05, + "loss": 2.3089, + "step": 4866 + }, + { + "epoch": 0.9122774133083411, + "grad_norm": 49251.4453125, + "learning_rate": 8.612732872620318e-05, + "loss": 2.2899, + "step": 4867 + }, + { + "epoch": 0.912464854732896, + "grad_norm": 50722.71875, + "learning_rate": 8.612189593167998e-05, + "loss": 2.404, + "step": 4868 + }, + { + "epoch": 0.9126522961574508, + "grad_norm": 49226.24609375, + "learning_rate": 8.611646224499278e-05, + "loss": 2.3067, + "step": 4869 + }, + { + "epoch": 0.9128397375820057, + "grad_norm": 50603.6796875, + "learning_rate": 8.611102766627581e-05, + "loss": 2.2935, + "step": 4870 + }, + { + "epoch": 0.9130271790065605, + "grad_norm": 51965.31640625, + "learning_rate": 8.610559219566325e-05, + "loss": 2.3259, + "step": 4871 + }, + { + "epoch": 0.9132146204311152, + "grad_norm": 54579.5859375, + "learning_rate": 8.610015583328938e-05, + "loss": 2.2517, + "step": 4872 + }, + { + "epoch": 0.9134020618556701, + "grad_norm": 48065.69921875, + "learning_rate": 8.609471857928849e-05, + "loss": 2.3163, + "step": 4873 + }, + { + "epoch": 0.9135895032802249, + "grad_norm": 48357.41796875, + "learning_rate": 8.608928043379482e-05, + "loss": 2.3511, + "step": 4874 + }, + { + "epoch": 0.9137769447047798, + "grad_norm": 47031.8046875, + "learning_rate": 8.608384139694272e-05, + "loss": 2.2495, + "step": 4875 + }, + { + "epoch": 0.9139643861293346, + "grad_norm": 50400.84765625, + "learning_rate": 8.607840146886652e-05, + "loss": 2.34, + "step": 4876 + }, + { + "epoch": 0.9141518275538895, + "grad_norm": 46910.18359375, + "learning_rate": 8.607296064970059e-05, + "loss": 2.3029, + "step": 4877 + }, + { + "epoch": 0.9143392689784442, + "grad_norm": 47160.9375, + "learning_rate": 8.60675189395793e-05, + "loss": 2.3228, + "step": 4878 + }, + { + "epoch": 0.914526710402999, + "grad_norm": 49001.375, + "learning_rate": 8.606207633863703e-05, + "loss": 2.3544, + "step": 4879 + }, + { + "epoch": 0.9147141518275539, + "grad_norm": 45741.26171875, + "learning_rate": 8.605663284700824e-05, + "loss": 2.3032, + "step": 4880 + }, + { + "epoch": 0.9149015932521087, + "grad_norm": 51361.70703125, + "learning_rate": 8.605118846482738e-05, + "loss": 2.2585, + "step": 4881 + }, + { + "epoch": 0.9150890346766636, + "grad_norm": 53131.515625, + "learning_rate": 8.604574319222892e-05, + "loss": 2.2697, + "step": 4882 + }, + { + "epoch": 0.9152764761012183, + "grad_norm": 46657.58203125, + "learning_rate": 8.604029702934731e-05, + "loss": 2.2872, + "step": 4883 + }, + { + "epoch": 0.9154639175257732, + "grad_norm": 47901.40234375, + "learning_rate": 8.60348499763171e-05, + "loss": 2.3503, + "step": 4884 + }, + { + "epoch": 0.915651358950328, + "grad_norm": 44836.59765625, + "learning_rate": 8.602940203327281e-05, + "loss": 2.3158, + "step": 4885 + }, + { + "epoch": 0.9158388003748829, + "grad_norm": 47888.0546875, + "learning_rate": 8.602395320034901e-05, + "loss": 2.3397, + "step": 4886 + }, + { + "epoch": 0.9160262417994377, + "grad_norm": 53108.0078125, + "learning_rate": 8.601850347768027e-05, + "loss": 2.2925, + "step": 4887 + }, + { + "epoch": 0.9162136832239925, + "grad_norm": 51815.45703125, + "learning_rate": 8.601305286540118e-05, + "loss": 2.2495, + "step": 4888 + }, + { + "epoch": 0.9164011246485473, + "grad_norm": 48402.58984375, + "learning_rate": 8.600760136364638e-05, + "loss": 2.3478, + "step": 4889 + }, + { + "epoch": 0.9165885660731021, + "grad_norm": 47492.00390625, + "learning_rate": 8.600214897255052e-05, + "loss": 2.3221, + "step": 4890 + }, + { + "epoch": 0.916776007497657, + "grad_norm": 46999.61328125, + "learning_rate": 8.599669569224827e-05, + "loss": 2.3151, + "step": 4891 + }, + { + "epoch": 0.9169634489222118, + "grad_norm": 48181.96875, + "learning_rate": 8.599124152287429e-05, + "loss": 2.3546, + "step": 4892 + }, + { + "epoch": 0.9171508903467667, + "grad_norm": 52851.43359375, + "learning_rate": 8.59857864645633e-05, + "loss": 2.3646, + "step": 4893 + }, + { + "epoch": 0.9173383317713215, + "grad_norm": 54227.8046875, + "learning_rate": 8.598033051745005e-05, + "loss": 2.3305, + "step": 4894 + }, + { + "epoch": 0.9175257731958762, + "grad_norm": 51861.4921875, + "learning_rate": 8.59748736816693e-05, + "loss": 2.2869, + "step": 4895 + }, + { + "epoch": 0.9177132146204311, + "grad_norm": 45534.68359375, + "learning_rate": 8.596941595735579e-05, + "loss": 2.2554, + "step": 4896 + }, + { + "epoch": 0.9179006560449859, + "grad_norm": 48738.3515625, + "learning_rate": 8.596395734464436e-05, + "loss": 2.3468, + "step": 4897 + }, + { + "epoch": 0.9180880974695408, + "grad_norm": 51470.94921875, + "learning_rate": 8.595849784366979e-05, + "loss": 2.424, + "step": 4898 + }, + { + "epoch": 0.9182755388940956, + "grad_norm": 48533.015625, + "learning_rate": 8.595303745456695e-05, + "loss": 2.3126, + "step": 4899 + }, + { + "epoch": 0.9184629803186504, + "grad_norm": 52517.36328125, + "learning_rate": 8.594757617747071e-05, + "loss": 2.3481, + "step": 4900 + }, + { + "epoch": 0.9186504217432052, + "grad_norm": 55722.50390625, + "learning_rate": 8.594211401251594e-05, + "loss": 2.3564, + "step": 4901 + }, + { + "epoch": 0.9188378631677601, + "grad_norm": 48910.47265625, + "learning_rate": 8.593665095983755e-05, + "loss": 2.3613, + "step": 4902 + }, + { + "epoch": 0.9190253045923149, + "grad_norm": 52910.5546875, + "learning_rate": 8.593118701957049e-05, + "loss": 2.3081, + "step": 4903 + }, + { + "epoch": 0.9192127460168698, + "grad_norm": 47132.20703125, + "learning_rate": 8.592572219184967e-05, + "loss": 2.2874, + "step": 4904 + }, + { + "epoch": 0.9194001874414246, + "grad_norm": 53028.33203125, + "learning_rate": 8.59202564768101e-05, + "loss": 2.3605, + "step": 4905 + }, + { + "epoch": 0.9195876288659793, + "grad_norm": 50746.4921875, + "learning_rate": 8.591478987458676e-05, + "loss": 2.305, + "step": 4906 + }, + { + "epoch": 0.9197750702905342, + "grad_norm": 45768.0546875, + "learning_rate": 8.590932238531469e-05, + "loss": 2.3064, + "step": 4907 + }, + { + "epoch": 0.919962511715089, + "grad_norm": 49286.546875, + "learning_rate": 8.59038540091289e-05, + "loss": 2.288, + "step": 4908 + }, + { + "epoch": 0.9201499531396439, + "grad_norm": 50150.4921875, + "learning_rate": 8.589838474616448e-05, + "loss": 2.2751, + "step": 4909 + }, + { + "epoch": 0.9203373945641987, + "grad_norm": 50201.3671875, + "learning_rate": 8.58929145965565e-05, + "loss": 2.3835, + "step": 4910 + }, + { + "epoch": 0.9205248359887536, + "grad_norm": 48859.5859375, + "learning_rate": 8.588744356044005e-05, + "loss": 2.3019, + "step": 4911 + }, + { + "epoch": 0.9207122774133083, + "grad_norm": 51991.82421875, + "learning_rate": 8.588197163795029e-05, + "loss": 2.2999, + "step": 4912 + }, + { + "epoch": 0.9208997188378631, + "grad_norm": 48385.078125, + "learning_rate": 8.587649882922233e-05, + "loss": 2.3022, + "step": 4913 + }, + { + "epoch": 0.921087160262418, + "grad_norm": 48108.6328125, + "learning_rate": 8.587102513439139e-05, + "loss": 2.3797, + "step": 4914 + }, + { + "epoch": 0.9212746016869728, + "grad_norm": 52912.921875, + "learning_rate": 8.586555055359262e-05, + "loss": 2.3972, + "step": 4915 + }, + { + "epoch": 0.9214620431115277, + "grad_norm": 49207.64453125, + "learning_rate": 8.586007508696126e-05, + "loss": 2.2758, + "step": 4916 + }, + { + "epoch": 0.9216494845360824, + "grad_norm": 46558.04296875, + "learning_rate": 8.585459873463253e-05, + "loss": 2.3452, + "step": 4917 + }, + { + "epoch": 0.9218369259606373, + "grad_norm": 46701.26171875, + "learning_rate": 8.584912149674169e-05, + "loss": 2.3082, + "step": 4918 + }, + { + "epoch": 0.9220243673851921, + "grad_norm": 49730.8359375, + "learning_rate": 8.584364337342405e-05, + "loss": 2.3541, + "step": 4919 + }, + { + "epoch": 0.922211808809747, + "grad_norm": 47142.1875, + "learning_rate": 8.583816436481488e-05, + "loss": 2.3363, + "step": 4920 + }, + { + "epoch": 0.9223992502343018, + "grad_norm": 53037.6015625, + "learning_rate": 8.58326844710495e-05, + "loss": 2.3018, + "step": 4921 + }, + { + "epoch": 0.9225866916588567, + "grad_norm": 47982.296875, + "learning_rate": 8.58272036922633e-05, + "loss": 2.3324, + "step": 4922 + }, + { + "epoch": 0.9227741330834114, + "grad_norm": 43260.96484375, + "learning_rate": 8.58217220285916e-05, + "loss": 2.3383, + "step": 4923 + }, + { + "epoch": 0.9229615745079662, + "grad_norm": 45623.0625, + "learning_rate": 8.581623948016982e-05, + "loss": 2.2816, + "step": 4924 + }, + { + "epoch": 0.9231490159325211, + "grad_norm": 53734.58203125, + "learning_rate": 8.581075604713335e-05, + "loss": 2.3195, + "step": 4925 + }, + { + "epoch": 0.9233364573570759, + "grad_norm": 47502.3203125, + "learning_rate": 8.580527172961766e-05, + "loss": 2.3983, + "step": 4926 + }, + { + "epoch": 0.9235238987816308, + "grad_norm": 51586.55078125, + "learning_rate": 8.579978652775815e-05, + "loss": 2.354, + "step": 4927 + }, + { + "epoch": 0.9237113402061856, + "grad_norm": 45672.44140625, + "learning_rate": 8.579430044169034e-05, + "loss": 2.334, + "step": 4928 + }, + { + "epoch": 0.9238987816307404, + "grad_norm": 46313.17578125, + "learning_rate": 8.578881347154971e-05, + "loss": 2.3363, + "step": 4929 + }, + { + "epoch": 0.9240862230552952, + "grad_norm": 48768.94140625, + "learning_rate": 8.57833256174718e-05, + "loss": 2.3502, + "step": 4930 + }, + { + "epoch": 0.92427366447985, + "grad_norm": 50564.75390625, + "learning_rate": 8.577783687959213e-05, + "loss": 2.3055, + "step": 4931 + }, + { + "epoch": 0.9244611059044049, + "grad_norm": 49702.26171875, + "learning_rate": 8.577234725804628e-05, + "loss": 2.3079, + "step": 4932 + }, + { + "epoch": 0.9246485473289597, + "grad_norm": 48942.66796875, + "learning_rate": 8.576685675296984e-05, + "loss": 2.3116, + "step": 4933 + }, + { + "epoch": 0.9248359887535146, + "grad_norm": 49653.2265625, + "learning_rate": 8.57613653644984e-05, + "loss": 2.3085, + "step": 4934 + }, + { + "epoch": 0.9250234301780693, + "grad_norm": 48199.23828125, + "learning_rate": 8.575587309276761e-05, + "loss": 2.3247, + "step": 4935 + }, + { + "epoch": 0.9252108716026242, + "grad_norm": 48230.3203125, + "learning_rate": 8.57503799379131e-05, + "loss": 2.3907, + "step": 4936 + }, + { + "epoch": 0.925398313027179, + "grad_norm": 48104.8125, + "learning_rate": 8.574488590007057e-05, + "loss": 2.2878, + "step": 4937 + }, + { + "epoch": 0.9255857544517339, + "grad_norm": 48082.73046875, + "learning_rate": 8.57393909793757e-05, + "loss": 2.3159, + "step": 4938 + }, + { + "epoch": 0.9257731958762887, + "grad_norm": 49179.37890625, + "learning_rate": 8.573389517596421e-05, + "loss": 2.4055, + "step": 4939 + }, + { + "epoch": 0.9259606373008434, + "grad_norm": 48052.40234375, + "learning_rate": 8.572839848997183e-05, + "loss": 2.3287, + "step": 4940 + }, + { + "epoch": 0.9261480787253983, + "grad_norm": 51274.37890625, + "learning_rate": 8.572290092153435e-05, + "loss": 2.3172, + "step": 4941 + }, + { + "epoch": 0.9263355201499531, + "grad_norm": 50902.43359375, + "learning_rate": 8.57174024707875e-05, + "loss": 2.3783, + "step": 4942 + }, + { + "epoch": 0.926522961574508, + "grad_norm": 49931.140625, + "learning_rate": 8.571190313786714e-05, + "loss": 2.3285, + "step": 4943 + }, + { + "epoch": 0.9267104029990628, + "grad_norm": 52841.9140625, + "learning_rate": 8.570640292290906e-05, + "loss": 2.2683, + "step": 4944 + }, + { + "epoch": 0.9268978444236177, + "grad_norm": 48020.1015625, + "learning_rate": 8.570090182604914e-05, + "loss": 2.2967, + "step": 4945 + }, + { + "epoch": 0.9270852858481724, + "grad_norm": 50015.99609375, + "learning_rate": 8.569539984742324e-05, + "loss": 2.3311, + "step": 4946 + }, + { + "epoch": 0.9272727272727272, + "grad_norm": 51817.85546875, + "learning_rate": 8.568989698716721e-05, + "loss": 2.315, + "step": 4947 + }, + { + "epoch": 0.9274601686972821, + "grad_norm": 51790.6953125, + "learning_rate": 8.568439324541701e-05, + "loss": 2.2899, + "step": 4948 + }, + { + "epoch": 0.9276476101218369, + "grad_norm": 51094.35546875, + "learning_rate": 8.567888862230858e-05, + "loss": 2.2978, + "step": 4949 + }, + { + "epoch": 0.9278350515463918, + "grad_norm": 53082.65625, + "learning_rate": 8.567338311797783e-05, + "loss": 2.3345, + "step": 4950 + }, + { + "epoch": 0.9280224929709466, + "grad_norm": 50659.171875, + "learning_rate": 8.566787673256078e-05, + "loss": 2.2814, + "step": 4951 + }, + { + "epoch": 0.9282099343955014, + "grad_norm": 52227.57421875, + "learning_rate": 8.566236946619343e-05, + "loss": 2.3051, + "step": 4952 + }, + { + "epoch": 0.9283973758200562, + "grad_norm": 48840.6015625, + "learning_rate": 8.565686131901178e-05, + "loss": 2.3705, + "step": 4953 + }, + { + "epoch": 0.9285848172446111, + "grad_norm": 50148.75390625, + "learning_rate": 8.565135229115188e-05, + "loss": 2.3346, + "step": 4954 + }, + { + "epoch": 0.9287722586691659, + "grad_norm": 53347.07421875, + "learning_rate": 8.564584238274982e-05, + "loss": 2.334, + "step": 4955 + }, + { + "epoch": 0.9289597000937208, + "grad_norm": 48105.20703125, + "learning_rate": 8.564033159394166e-05, + "loss": 2.3092, + "step": 4956 + }, + { + "epoch": 0.9291471415182755, + "grad_norm": 46072.71875, + "learning_rate": 8.56348199248635e-05, + "loss": 2.3322, + "step": 4957 + }, + { + "epoch": 0.9293345829428303, + "grad_norm": 51239.21484375, + "learning_rate": 8.562930737565152e-05, + "loss": 2.3356, + "step": 4958 + }, + { + "epoch": 0.9295220243673852, + "grad_norm": 46124.046875, + "learning_rate": 8.562379394644182e-05, + "loss": 2.3761, + "step": 4959 + }, + { + "epoch": 0.92970946579194, + "grad_norm": 50474.79296875, + "learning_rate": 8.561827963737061e-05, + "loss": 2.3244, + "step": 4960 + }, + { + "epoch": 0.9298969072164949, + "grad_norm": 47676.42578125, + "learning_rate": 8.561276444857407e-05, + "loss": 2.3146, + "step": 4961 + }, + { + "epoch": 0.9300843486410497, + "grad_norm": 47709.7734375, + "learning_rate": 8.560724838018841e-05, + "loss": 2.2152, + "step": 4962 + }, + { + "epoch": 0.9302717900656045, + "grad_norm": 50114.26953125, + "learning_rate": 8.560173143234989e-05, + "loss": 2.3145, + "step": 4963 + }, + { + "epoch": 0.9304592314901593, + "grad_norm": 46960.98828125, + "learning_rate": 8.559621360519477e-05, + "loss": 2.3142, + "step": 4964 + }, + { + "epoch": 0.9306466729147141, + "grad_norm": 49403.71484375, + "learning_rate": 8.55906948988593e-05, + "loss": 2.311, + "step": 4965 + }, + { + "epoch": 0.930834114339269, + "grad_norm": 46490.45703125, + "learning_rate": 8.558517531347983e-05, + "loss": 2.2733, + "step": 4966 + }, + { + "epoch": 0.9310215557638238, + "grad_norm": 49960.8515625, + "learning_rate": 8.557965484919266e-05, + "loss": 2.3199, + "step": 4967 + }, + { + "epoch": 0.9312089971883787, + "grad_norm": 51731.9453125, + "learning_rate": 8.557413350613414e-05, + "loss": 2.1832, + "step": 4968 + }, + { + "epoch": 0.9313964386129334, + "grad_norm": 47437.17578125, + "learning_rate": 8.556861128444066e-05, + "loss": 2.2662, + "step": 4969 + }, + { + "epoch": 0.9315838800374883, + "grad_norm": 46450.859375, + "learning_rate": 8.556308818424858e-05, + "loss": 2.2996, + "step": 4970 + }, + { + "epoch": 0.9317713214620431, + "grad_norm": 52669.80859375, + "learning_rate": 8.555756420569434e-05, + "loss": 2.3233, + "step": 4971 + }, + { + "epoch": 0.931958762886598, + "grad_norm": 50302.7109375, + "learning_rate": 8.555203934891436e-05, + "loss": 2.323, + "step": 4972 + }, + { + "epoch": 0.9321462043111528, + "grad_norm": 51533.38671875, + "learning_rate": 8.55465136140451e-05, + "loss": 2.3246, + "step": 4973 + }, + { + "epoch": 0.9323336457357075, + "grad_norm": 51508.08984375, + "learning_rate": 8.554098700122306e-05, + "loss": 2.3286, + "step": 4974 + }, + { + "epoch": 0.9325210871602624, + "grad_norm": 49778.203125, + "learning_rate": 8.553545951058471e-05, + "loss": 2.3044, + "step": 4975 + }, + { + "epoch": 0.9327085285848172, + "grad_norm": 52666.43359375, + "learning_rate": 8.552993114226658e-05, + "loss": 2.277, + "step": 4976 + }, + { + "epoch": 0.9328959700093721, + "grad_norm": 48621.05859375, + "learning_rate": 8.552440189640521e-05, + "loss": 2.2883, + "step": 4977 + }, + { + "epoch": 0.9330834114339269, + "grad_norm": 48447.203125, + "learning_rate": 8.551887177313718e-05, + "loss": 2.2442, + "step": 4978 + }, + { + "epoch": 0.9332708528584818, + "grad_norm": 49267.05859375, + "learning_rate": 8.551334077259904e-05, + "loss": 2.2909, + "step": 4979 + }, + { + "epoch": 0.9334582942830365, + "grad_norm": 49918.5546875, + "learning_rate": 8.550780889492746e-05, + "loss": 2.3126, + "step": 4980 + }, + { + "epoch": 0.9336457357075914, + "grad_norm": 51995.21484375, + "learning_rate": 8.550227614025902e-05, + "loss": 2.3433, + "step": 4981 + }, + { + "epoch": 0.9338331771321462, + "grad_norm": 54484.05859375, + "learning_rate": 8.549674250873041e-05, + "loss": 2.3387, + "step": 4982 + }, + { + "epoch": 0.934020618556701, + "grad_norm": 51367.3125, + "learning_rate": 8.549120800047826e-05, + "loss": 2.3762, + "step": 4983 + }, + { + "epoch": 0.9342080599812559, + "grad_norm": 53219.65234375, + "learning_rate": 8.548567261563932e-05, + "loss": 2.2727, + "step": 4984 + }, + { + "epoch": 0.9343955014058107, + "grad_norm": 50550.09375, + "learning_rate": 8.548013635435024e-05, + "loss": 2.3957, + "step": 4985 + }, + { + "epoch": 0.9345829428303655, + "grad_norm": 49176.640625, + "learning_rate": 8.547459921674778e-05, + "loss": 2.3165, + "step": 4986 + }, + { + "epoch": 0.9347703842549203, + "grad_norm": 46103.5625, + "learning_rate": 8.546906120296875e-05, + "loss": 2.2575, + "step": 4987 + }, + { + "epoch": 0.9349578256794752, + "grad_norm": 46691.2578125, + "learning_rate": 8.546352231314988e-05, + "loss": 2.4397, + "step": 4988 + }, + { + "epoch": 0.93514526710403, + "grad_norm": 52344.8671875, + "learning_rate": 8.545798254742799e-05, + "loss": 2.4803, + "step": 4989 + }, + { + "epoch": 0.9353327085285849, + "grad_norm": 47507.0546875, + "learning_rate": 8.545244190593991e-05, + "loss": 2.3154, + "step": 4990 + }, + { + "epoch": 0.9355201499531396, + "grad_norm": 52692.96484375, + "learning_rate": 8.544690038882245e-05, + "loss": 2.3133, + "step": 4991 + }, + { + "epoch": 0.9357075913776944, + "grad_norm": 47697.203125, + "learning_rate": 8.544135799621253e-05, + "loss": 2.3381, + "step": 4992 + }, + { + "epoch": 0.9358950328022493, + "grad_norm": 49926.0546875, + "learning_rate": 8.543581472824702e-05, + "loss": 2.2578, + "step": 4993 + }, + { + "epoch": 0.9360824742268041, + "grad_norm": 47822.5625, + "learning_rate": 8.543027058506283e-05, + "loss": 2.3091, + "step": 4994 + }, + { + "epoch": 0.936269915651359, + "grad_norm": 47656.49609375, + "learning_rate": 8.54247255667969e-05, + "loss": 2.2915, + "step": 4995 + }, + { + "epoch": 0.9364573570759138, + "grad_norm": 51442.984375, + "learning_rate": 8.541917967358615e-05, + "loss": 2.3513, + "step": 4996 + }, + { + "epoch": 0.9366447985004686, + "grad_norm": 47489.5078125, + "learning_rate": 8.541363290556761e-05, + "loss": 2.3261, + "step": 4997 + }, + { + "epoch": 0.9368322399250234, + "grad_norm": 46406.6328125, + "learning_rate": 8.540808526287824e-05, + "loss": 2.3368, + "step": 4998 + }, + { + "epoch": 0.9370196813495782, + "grad_norm": 47026.6875, + "learning_rate": 8.540253674565508e-05, + "loss": 2.2819, + "step": 4999 + }, + { + "epoch": 0.9372071227741331, + "grad_norm": 59740.36328125, + "learning_rate": 8.539698735403514e-05, + "loss": 2.1935, + "step": 5000 + }, + { + "epoch": 0.9372071227741331, + "eval_loss": 2.313755989074707, + "eval_runtime": 132.4236, + "eval_samples_per_second": 38.128, + "eval_steps_per_second": 1.911, + "step": 5000 + }, + { + "epoch": 0.9373945641986879, + "grad_norm": 47492.9375, + "learning_rate": 8.539143708815553e-05, + "loss": 2.2898, + "step": 5001 + }, + { + "epoch": 0.9375820056232428, + "grad_norm": 50569.30078125, + "learning_rate": 8.53858859481533e-05, + "loss": 2.3886, + "step": 5002 + }, + { + "epoch": 0.9377694470477975, + "grad_norm": 45426.125, + "learning_rate": 8.538033393416556e-05, + "loss": 2.288, + "step": 5003 + }, + { + "epoch": 0.9379568884723524, + "grad_norm": 49423.24609375, + "learning_rate": 8.537478104632946e-05, + "loss": 2.2977, + "step": 5004 + }, + { + "epoch": 0.9381443298969072, + "grad_norm": 46068.078125, + "learning_rate": 8.536922728478212e-05, + "loss": 2.3279, + "step": 5005 + }, + { + "epoch": 0.9383317713214621, + "grad_norm": 46896.38671875, + "learning_rate": 8.536367264966072e-05, + "loss": 2.322, + "step": 5006 + }, + { + "epoch": 0.9385192127460169, + "grad_norm": 53577.1953125, + "learning_rate": 8.535811714110246e-05, + "loss": 2.2669, + "step": 5007 + }, + { + "epoch": 0.9387066541705716, + "grad_norm": 48590.76953125, + "learning_rate": 8.535256075924453e-05, + "loss": 2.3233, + "step": 5008 + }, + { + "epoch": 0.9388940955951265, + "grad_norm": 48231.9609375, + "learning_rate": 8.534700350422422e-05, + "loss": 2.329, + "step": 5009 + }, + { + "epoch": 0.9390815370196813, + "grad_norm": 45274.40625, + "learning_rate": 8.534144537617874e-05, + "loss": 2.3379, + "step": 5010 + }, + { + "epoch": 0.9392689784442362, + "grad_norm": 47939.03515625, + "learning_rate": 8.533588637524537e-05, + "loss": 2.3093, + "step": 5011 + }, + { + "epoch": 0.939456419868791, + "grad_norm": 53735.25390625, + "learning_rate": 8.533032650156142e-05, + "loss": 2.3296, + "step": 5012 + }, + { + "epoch": 0.9396438612933459, + "grad_norm": 49954.33984375, + "learning_rate": 8.532476575526422e-05, + "loss": 2.2986, + "step": 5013 + }, + { + "epoch": 0.9398313027179006, + "grad_norm": 55465.7578125, + "learning_rate": 8.53192041364911e-05, + "loss": 2.3455, + "step": 5014 + }, + { + "epoch": 0.9400187441424555, + "grad_norm": 48726.15625, + "learning_rate": 8.531364164537942e-05, + "loss": 2.2994, + "step": 5015 + }, + { + "epoch": 0.9402061855670103, + "grad_norm": 45944.97265625, + "learning_rate": 8.53080782820666e-05, + "loss": 2.3444, + "step": 5016 + }, + { + "epoch": 0.9403936269915651, + "grad_norm": 50868.00390625, + "learning_rate": 8.530251404669001e-05, + "loss": 2.2633, + "step": 5017 + }, + { + "epoch": 0.94058106841612, + "grad_norm": 54766.08203125, + "learning_rate": 8.529694893938709e-05, + "loss": 2.2748, + "step": 5018 + }, + { + "epoch": 0.9407685098406748, + "grad_norm": 48420.6484375, + "learning_rate": 8.52913829602953e-05, + "loss": 2.2779, + "step": 5019 + }, + { + "epoch": 0.9409559512652296, + "grad_norm": 49437.265625, + "learning_rate": 8.528581610955211e-05, + "loss": 2.3552, + "step": 5020 + }, + { + "epoch": 0.9411433926897844, + "grad_norm": 52541.66796875, + "learning_rate": 8.528024838729501e-05, + "loss": 2.3969, + "step": 5021 + }, + { + "epoch": 0.9413308341143393, + "grad_norm": 57674.08984375, + "learning_rate": 8.52746797936615e-05, + "loss": 2.3195, + "step": 5022 + }, + { + "epoch": 0.9415182755388941, + "grad_norm": 53358.34375, + "learning_rate": 8.526911032878915e-05, + "loss": 2.3698, + "step": 5023 + }, + { + "epoch": 0.941705716963449, + "grad_norm": 51696.99609375, + "learning_rate": 8.52635399928155e-05, + "loss": 2.3409, + "step": 5024 + }, + { + "epoch": 0.9418931583880038, + "grad_norm": 46216.76171875, + "learning_rate": 8.525796878587813e-05, + "loss": 2.3114, + "step": 5025 + }, + { + "epoch": 0.9420805998125585, + "grad_norm": 47945.2734375, + "learning_rate": 8.525239670811464e-05, + "loss": 2.3593, + "step": 5026 + }, + { + "epoch": 0.9422680412371134, + "grad_norm": 53303.94140625, + "learning_rate": 8.524682375966267e-05, + "loss": 2.294, + "step": 5027 + }, + { + "epoch": 0.9424554826616682, + "grad_norm": 50374.5859375, + "learning_rate": 8.524124994065985e-05, + "loss": 2.2973, + "step": 5028 + }, + { + "epoch": 0.9426429240862231, + "grad_norm": 45961.8671875, + "learning_rate": 8.523567525124384e-05, + "loss": 2.3401, + "step": 5029 + }, + { + "epoch": 0.9428303655107779, + "grad_norm": 54295.65234375, + "learning_rate": 8.523009969155233e-05, + "loss": 2.2431, + "step": 5030 + }, + { + "epoch": 0.9430178069353327, + "grad_norm": 49560.1171875, + "learning_rate": 8.522452326172305e-05, + "loss": 2.2902, + "step": 5031 + }, + { + "epoch": 0.9432052483598875, + "grad_norm": 52298.421875, + "learning_rate": 8.52189459618937e-05, + "loss": 2.2887, + "step": 5032 + }, + { + "epoch": 0.9433926897844424, + "grad_norm": 50853.15625, + "learning_rate": 8.521336779220205e-05, + "loss": 2.2887, + "step": 5033 + }, + { + "epoch": 0.9435801312089972, + "grad_norm": 52495.6875, + "learning_rate": 8.520778875278586e-05, + "loss": 2.3794, + "step": 5034 + }, + { + "epoch": 0.943767572633552, + "grad_norm": 53360.85546875, + "learning_rate": 8.520220884378295e-05, + "loss": 2.2712, + "step": 5035 + }, + { + "epoch": 0.9439550140581069, + "grad_norm": 50228.55859375, + "learning_rate": 8.519662806533112e-05, + "loss": 2.2349, + "step": 5036 + }, + { + "epoch": 0.9441424554826616, + "grad_norm": 60239.7734375, + "learning_rate": 8.519104641756821e-05, + "loss": 2.452, + "step": 5037 + }, + { + "epoch": 0.9443298969072165, + "grad_norm": 48529.53125, + "learning_rate": 8.518546390063208e-05, + "loss": 2.3537, + "step": 5038 + }, + { + "epoch": 0.9445173383317713, + "grad_norm": 47167.87890625, + "learning_rate": 8.517988051466062e-05, + "loss": 2.2908, + "step": 5039 + }, + { + "epoch": 0.9447047797563262, + "grad_norm": 51474.09765625, + "learning_rate": 8.517429625979169e-05, + "loss": 2.2754, + "step": 5040 + }, + { + "epoch": 0.944892221180881, + "grad_norm": 50597.4140625, + "learning_rate": 8.516871113616327e-05, + "loss": 2.351, + "step": 5041 + }, + { + "epoch": 0.9450796626054359, + "grad_norm": 49092.9375, + "learning_rate": 8.516312514391328e-05, + "loss": 2.294, + "step": 5042 + }, + { + "epoch": 0.9452671040299906, + "grad_norm": 49552.859375, + "learning_rate": 8.515753828317969e-05, + "loss": 2.2542, + "step": 5043 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 47053.83203125, + "learning_rate": 8.515195055410048e-05, + "loss": 2.3617, + "step": 5044 + }, + { + "epoch": 0.9456419868791003, + "grad_norm": 50405.85546875, + "learning_rate": 8.514636195681367e-05, + "loss": 2.2422, + "step": 5045 + }, + { + "epoch": 0.9458294283036551, + "grad_norm": 50719.36328125, + "learning_rate": 8.514077249145729e-05, + "loss": 2.3696, + "step": 5046 + }, + { + "epoch": 0.94601686972821, + "grad_norm": 50899.5390625, + "learning_rate": 8.513518215816938e-05, + "loss": 2.2896, + "step": 5047 + }, + { + "epoch": 0.9462043111527647, + "grad_norm": 47790.33203125, + "learning_rate": 8.512959095708805e-05, + "loss": 2.3407, + "step": 5048 + }, + { + "epoch": 0.9463917525773196, + "grad_norm": 48437.38671875, + "learning_rate": 8.512399888835134e-05, + "loss": 2.2895, + "step": 5049 + }, + { + "epoch": 0.9465791940018744, + "grad_norm": 48454.078125, + "learning_rate": 8.511840595209741e-05, + "loss": 2.3131, + "step": 5050 + }, + { + "epoch": 0.9467666354264292, + "grad_norm": 48583.78125, + "learning_rate": 8.511281214846439e-05, + "loss": 2.2898, + "step": 5051 + }, + { + "epoch": 0.9469540768509841, + "grad_norm": 49908.3125, + "learning_rate": 8.510721747759043e-05, + "loss": 2.2699, + "step": 5052 + }, + { + "epoch": 0.9471415182755389, + "grad_norm": 51112.015625, + "learning_rate": 8.510162193961371e-05, + "loss": 2.2757, + "step": 5053 + }, + { + "epoch": 0.9473289597000937, + "grad_norm": 49446.7734375, + "learning_rate": 8.509602553467244e-05, + "loss": 2.3116, + "step": 5054 + }, + { + "epoch": 0.9475164011246485, + "grad_norm": 49862.82421875, + "learning_rate": 8.509042826290483e-05, + "loss": 2.3734, + "step": 5055 + }, + { + "epoch": 0.9477038425492034, + "grad_norm": 51961.8671875, + "learning_rate": 8.508483012444916e-05, + "loss": 2.3216, + "step": 5056 + }, + { + "epoch": 0.9478912839737582, + "grad_norm": 55377.01171875, + "learning_rate": 8.507923111944367e-05, + "loss": 2.275, + "step": 5057 + }, + { + "epoch": 0.9480787253983131, + "grad_norm": 50420.2421875, + "learning_rate": 8.507363124802666e-05, + "loss": 2.3078, + "step": 5058 + }, + { + "epoch": 0.9482661668228679, + "grad_norm": 50976.0546875, + "learning_rate": 8.506803051033641e-05, + "loss": 2.2501, + "step": 5059 + }, + { + "epoch": 0.9484536082474226, + "grad_norm": 50003.76953125, + "learning_rate": 8.506242890651127e-05, + "loss": 2.3454, + "step": 5060 + }, + { + "epoch": 0.9486410496719775, + "grad_norm": 49099.3828125, + "learning_rate": 8.505682643668962e-05, + "loss": 2.2771, + "step": 5061 + }, + { + "epoch": 0.9488284910965323, + "grad_norm": 51266.296875, + "learning_rate": 8.505122310100979e-05, + "loss": 2.3265, + "step": 5062 + }, + { + "epoch": 0.9490159325210872, + "grad_norm": 45202.61328125, + "learning_rate": 8.50456188996102e-05, + "loss": 2.2848, + "step": 5063 + }, + { + "epoch": 0.949203373945642, + "grad_norm": 49330.93359375, + "learning_rate": 8.504001383262925e-05, + "loss": 2.311, + "step": 5064 + }, + { + "epoch": 0.9493908153701968, + "grad_norm": 51722.28125, + "learning_rate": 8.50344079002054e-05, + "loss": 2.2665, + "step": 5065 + }, + { + "epoch": 0.9495782567947516, + "grad_norm": 48921.60546875, + "learning_rate": 8.50288011024771e-05, + "loss": 2.2812, + "step": 5066 + }, + { + "epoch": 0.9497656982193065, + "grad_norm": 51611.15234375, + "learning_rate": 8.502319343958282e-05, + "loss": 2.3051, + "step": 5067 + }, + { + "epoch": 0.9499531396438613, + "grad_norm": 50421.59375, + "learning_rate": 8.501758491166107e-05, + "loss": 2.2691, + "step": 5068 + }, + { + "epoch": 0.9501405810684161, + "grad_norm": 51512.2265625, + "learning_rate": 8.501197551885038e-05, + "loss": 2.2428, + "step": 5069 + }, + { + "epoch": 0.950328022492971, + "grad_norm": 48849.453125, + "learning_rate": 8.500636526128929e-05, + "loss": 2.3341, + "step": 5070 + }, + { + "epoch": 0.9505154639175257, + "grad_norm": 48532.93359375, + "learning_rate": 8.500075413911637e-05, + "loss": 2.3446, + "step": 5071 + }, + { + "epoch": 0.9507029053420806, + "grad_norm": 50921.2265625, + "learning_rate": 8.49951421524702e-05, + "loss": 2.3178, + "step": 5072 + }, + { + "epoch": 0.9508903467666354, + "grad_norm": 56108.98828125, + "learning_rate": 8.498952930148938e-05, + "loss": 2.34, + "step": 5073 + }, + { + "epoch": 0.9510777881911903, + "grad_norm": 51939.40234375, + "learning_rate": 8.498391558631255e-05, + "loss": 2.2721, + "step": 5074 + }, + { + "epoch": 0.9512652296157451, + "grad_norm": 44303.73046875, + "learning_rate": 8.497830100707835e-05, + "loss": 2.3369, + "step": 5075 + }, + { + "epoch": 0.9514526710403, + "grad_norm": 51176.1640625, + "learning_rate": 8.49726855639255e-05, + "loss": 2.3892, + "step": 5076 + }, + { + "epoch": 0.9516401124648547, + "grad_norm": 49439.7734375, + "learning_rate": 8.496706925699264e-05, + "loss": 2.3052, + "step": 5077 + }, + { + "epoch": 0.9518275538894095, + "grad_norm": 50704.62109375, + "learning_rate": 8.49614520864185e-05, + "loss": 2.3212, + "step": 5078 + }, + { + "epoch": 0.9520149953139644, + "grad_norm": 49617.05078125, + "learning_rate": 8.495583405234182e-05, + "loss": 2.3196, + "step": 5079 + }, + { + "epoch": 0.9522024367385192, + "grad_norm": 46396.37109375, + "learning_rate": 8.495021515490139e-05, + "loss": 2.3291, + "step": 5080 + }, + { + "epoch": 0.9523898781630741, + "grad_norm": 50001.70703125, + "learning_rate": 8.494459539423591e-05, + "loss": 2.3191, + "step": 5081 + }, + { + "epoch": 0.9525773195876288, + "grad_norm": 48332.546875, + "learning_rate": 8.493897477048427e-05, + "loss": 2.2596, + "step": 5082 + }, + { + "epoch": 0.9527647610121837, + "grad_norm": 48359.8359375, + "learning_rate": 8.493335328378525e-05, + "loss": 2.2559, + "step": 5083 + }, + { + "epoch": 0.9529522024367385, + "grad_norm": 48470.00390625, + "learning_rate": 8.492773093427769e-05, + "loss": 2.3184, + "step": 5084 + }, + { + "epoch": 0.9531396438612934, + "grad_norm": 51207.73046875, + "learning_rate": 8.492210772210044e-05, + "loss": 2.2875, + "step": 5085 + }, + { + "epoch": 0.9533270852858482, + "grad_norm": 49347.359375, + "learning_rate": 8.491648364739242e-05, + "loss": 2.2709, + "step": 5086 + }, + { + "epoch": 0.953514526710403, + "grad_norm": 47329.19921875, + "learning_rate": 8.491085871029254e-05, + "loss": 2.3771, + "step": 5087 + }, + { + "epoch": 0.9537019681349578, + "grad_norm": 49013.69140625, + "learning_rate": 8.490523291093968e-05, + "loss": 2.3502, + "step": 5088 + }, + { + "epoch": 0.9538894095595126, + "grad_norm": 47756.87890625, + "learning_rate": 8.489960624947284e-05, + "loss": 2.2492, + "step": 5089 + }, + { + "epoch": 0.9540768509840675, + "grad_norm": 57135.62109375, + "learning_rate": 8.489397872603098e-05, + "loss": 2.2894, + "step": 5090 + }, + { + "epoch": 0.9542642924086223, + "grad_norm": 50202.08984375, + "learning_rate": 8.488835034075309e-05, + "loss": 2.2427, + "step": 5091 + }, + { + "epoch": 0.9544517338331772, + "grad_norm": 48616.8046875, + "learning_rate": 8.488272109377816e-05, + "loss": 2.405, + "step": 5092 + }, + { + "epoch": 0.954639175257732, + "grad_norm": 48247.42578125, + "learning_rate": 8.487709098524525e-05, + "loss": 2.3195, + "step": 5093 + }, + { + "epoch": 0.9548266166822867, + "grad_norm": 43422.62890625, + "learning_rate": 8.487146001529343e-05, + "loss": 2.3116, + "step": 5094 + }, + { + "epoch": 0.9550140581068416, + "grad_norm": 53509.60546875, + "learning_rate": 8.486582818406174e-05, + "loss": 2.3171, + "step": 5095 + }, + { + "epoch": 0.9552014995313964, + "grad_norm": 50045.640625, + "learning_rate": 8.486019549168931e-05, + "loss": 2.2623, + "step": 5096 + }, + { + "epoch": 0.9553889409559513, + "grad_norm": 49399.33203125, + "learning_rate": 8.485456193831524e-05, + "loss": 2.2901, + "step": 5097 + }, + { + "epoch": 0.9555763823805061, + "grad_norm": 47907.1171875, + "learning_rate": 8.484892752407869e-05, + "loss": 2.304, + "step": 5098 + }, + { + "epoch": 0.9557638238050609, + "grad_norm": 48077.8125, + "learning_rate": 8.484329224911879e-05, + "loss": 2.3588, + "step": 5099 + }, + { + "epoch": 0.9559512652296157, + "grad_norm": 50356.421875, + "learning_rate": 8.483765611357477e-05, + "loss": 2.3652, + "step": 5100 + }, + { + "epoch": 0.9561387066541706, + "grad_norm": 52071.98046875, + "learning_rate": 8.48320191175858e-05, + "loss": 2.3179, + "step": 5101 + }, + { + "epoch": 0.9563261480787254, + "grad_norm": 48857.234375, + "learning_rate": 8.482638126129112e-05, + "loss": 2.3249, + "step": 5102 + }, + { + "epoch": 0.9565135895032802, + "grad_norm": 49234.4609375, + "learning_rate": 8.482074254482996e-05, + "loss": 2.293, + "step": 5103 + }, + { + "epoch": 0.9567010309278351, + "grad_norm": 47781.3671875, + "learning_rate": 8.481510296834164e-05, + "loss": 2.2748, + "step": 5104 + }, + { + "epoch": 0.9568884723523898, + "grad_norm": 54237.57421875, + "learning_rate": 8.48094625319654e-05, + "loss": 2.37, + "step": 5105 + }, + { + "epoch": 0.9570759137769447, + "grad_norm": 47370.16796875, + "learning_rate": 8.480382123584057e-05, + "loss": 2.2968, + "step": 5106 + }, + { + "epoch": 0.9572633552014995, + "grad_norm": 49480.88671875, + "learning_rate": 8.479817908010645e-05, + "loss": 2.2522, + "step": 5107 + }, + { + "epoch": 0.9574507966260544, + "grad_norm": 48349.06640625, + "learning_rate": 8.479253606490245e-05, + "loss": 2.3759, + "step": 5108 + }, + { + "epoch": 0.9576382380506092, + "grad_norm": 48505.95703125, + "learning_rate": 8.47868921903679e-05, + "loss": 2.2941, + "step": 5109 + }, + { + "epoch": 0.9578256794751641, + "grad_norm": 51344.0078125, + "learning_rate": 8.478124745664224e-05, + "loss": 2.3111, + "step": 5110 + }, + { + "epoch": 0.9580131208997188, + "grad_norm": 51346.44140625, + "learning_rate": 8.477560186386486e-05, + "loss": 2.3324, + "step": 5111 + }, + { + "epoch": 0.9582005623242736, + "grad_norm": 55214.69140625, + "learning_rate": 8.476995541217518e-05, + "loss": 2.2553, + "step": 5112 + }, + { + "epoch": 0.9583880037488285, + "grad_norm": 51435.9296875, + "learning_rate": 8.476430810171271e-05, + "loss": 2.3146, + "step": 5113 + }, + { + "epoch": 0.9585754451733833, + "grad_norm": 52917.93359375, + "learning_rate": 8.475865993261689e-05, + "loss": 2.3379, + "step": 5114 + }, + { + "epoch": 0.9587628865979382, + "grad_norm": 50442.1875, + "learning_rate": 8.475301090502723e-05, + "loss": 2.3498, + "step": 5115 + }, + { + "epoch": 0.958950328022493, + "grad_norm": 50030.59765625, + "learning_rate": 8.474736101908325e-05, + "loss": 2.2835, + "step": 5116 + }, + { + "epoch": 0.9591377694470478, + "grad_norm": 50796.66796875, + "learning_rate": 8.474171027492452e-05, + "loss": 2.294, + "step": 5117 + }, + { + "epoch": 0.9593252108716026, + "grad_norm": 53003.98046875, + "learning_rate": 8.473605867269058e-05, + "loss": 2.328, + "step": 5118 + }, + { + "epoch": 0.9595126522961575, + "grad_norm": 48031.2578125, + "learning_rate": 8.473040621252103e-05, + "loss": 2.3393, + "step": 5119 + }, + { + "epoch": 0.9597000937207123, + "grad_norm": 53832.8828125, + "learning_rate": 8.472475289455547e-05, + "loss": 2.3094, + "step": 5120 + }, + { + "epoch": 0.9598875351452671, + "grad_norm": 48598.3125, + "learning_rate": 8.471909871893354e-05, + "loss": 2.343, + "step": 5121 + }, + { + "epoch": 0.9600749765698219, + "grad_norm": 49202.30859375, + "learning_rate": 8.471344368579488e-05, + "loss": 2.2812, + "step": 5122 + }, + { + "epoch": 0.9602624179943767, + "grad_norm": 49276.4921875, + "learning_rate": 8.470778779527918e-05, + "loss": 2.3019, + "step": 5123 + }, + { + "epoch": 0.9604498594189316, + "grad_norm": 48381.90625, + "learning_rate": 8.47021310475261e-05, + "loss": 2.2956, + "step": 5124 + }, + { + "epoch": 0.9606373008434864, + "grad_norm": 51494.24609375, + "learning_rate": 8.469647344267541e-05, + "loss": 2.2902, + "step": 5125 + }, + { + "epoch": 0.9608247422680413, + "grad_norm": 51000.015625, + "learning_rate": 8.46908149808668e-05, + "loss": 2.2929, + "step": 5126 + }, + { + "epoch": 0.9610121836925961, + "grad_norm": 51071.20703125, + "learning_rate": 8.468515566224e-05, + "loss": 2.284, + "step": 5127 + }, + { + "epoch": 0.9611996251171508, + "grad_norm": 46322.59375, + "learning_rate": 8.467949548693485e-05, + "loss": 2.3875, + "step": 5128 + }, + { + "epoch": 0.9613870665417057, + "grad_norm": 50042.140625, + "learning_rate": 8.467383445509113e-05, + "loss": 2.2632, + "step": 5129 + }, + { + "epoch": 0.9615745079662605, + "grad_norm": 53609.640625, + "learning_rate": 8.466817256684865e-05, + "loss": 2.2982, + "step": 5130 + }, + { + "epoch": 0.9617619493908154, + "grad_norm": 52174.44140625, + "learning_rate": 8.466250982234727e-05, + "loss": 2.3515, + "step": 5131 + }, + { + "epoch": 0.9619493908153702, + "grad_norm": 48664.97265625, + "learning_rate": 8.465684622172683e-05, + "loss": 2.3604, + "step": 5132 + }, + { + "epoch": 0.9621368322399251, + "grad_norm": 51027.3203125, + "learning_rate": 8.46511817651272e-05, + "loss": 2.3213, + "step": 5133 + }, + { + "epoch": 0.9623242736644798, + "grad_norm": 51749.9921875, + "learning_rate": 8.464551645268833e-05, + "loss": 2.3314, + "step": 5134 + }, + { + "epoch": 0.9625117150890347, + "grad_norm": 52495.6328125, + "learning_rate": 8.46398502845501e-05, + "loss": 2.2634, + "step": 5135 + }, + { + "epoch": 0.9626991565135895, + "grad_norm": 48571.5, + "learning_rate": 8.46341832608525e-05, + "loss": 2.3126, + "step": 5136 + }, + { + "epoch": 0.9628865979381444, + "grad_norm": 51124.16015625, + "learning_rate": 8.462851538173546e-05, + "loss": 2.2517, + "step": 5137 + }, + { + "epoch": 0.9630740393626992, + "grad_norm": 46213.56640625, + "learning_rate": 8.4622846647339e-05, + "loss": 2.3205, + "step": 5138 + }, + { + "epoch": 0.9632614807872539, + "grad_norm": 48194.12109375, + "learning_rate": 8.461717705780312e-05, + "loss": 2.3531, + "step": 5139 + }, + { + "epoch": 0.9634489222118088, + "grad_norm": 50079.2578125, + "learning_rate": 8.461150661326785e-05, + "loss": 2.2546, + "step": 5140 + }, + { + "epoch": 0.9636363636363636, + "grad_norm": 53288.21875, + "learning_rate": 8.460583531387323e-05, + "loss": 2.2554, + "step": 5141 + }, + { + "epoch": 0.9638238050609185, + "grad_norm": 49984.3515625, + "learning_rate": 8.460016315975934e-05, + "loss": 2.2086, + "step": 5142 + }, + { + "epoch": 0.9640112464854733, + "grad_norm": 53865.6015625, + "learning_rate": 8.45944901510663e-05, + "loss": 2.3788, + "step": 5143 + }, + { + "epoch": 0.9641986879100282, + "grad_norm": 47535.69140625, + "learning_rate": 8.458881628793418e-05, + "loss": 2.2895, + "step": 5144 + }, + { + "epoch": 0.9643861293345829, + "grad_norm": 46867.3984375, + "learning_rate": 8.458314157050316e-05, + "loss": 2.3386, + "step": 5145 + }, + { + "epoch": 0.9645735707591377, + "grad_norm": 50697.921875, + "learning_rate": 8.457746599891338e-05, + "loss": 2.3044, + "step": 5146 + }, + { + "epoch": 0.9647610121836926, + "grad_norm": 49006.39453125, + "learning_rate": 8.457178957330503e-05, + "loss": 2.262, + "step": 5147 + }, + { + "epoch": 0.9649484536082474, + "grad_norm": 51328.2890625, + "learning_rate": 8.456611229381828e-05, + "loss": 2.3478, + "step": 5148 + }, + { + "epoch": 0.9651358950328023, + "grad_norm": 49823.609375, + "learning_rate": 8.456043416059339e-05, + "loss": 2.2001, + "step": 5149 + }, + { + "epoch": 0.9653233364573571, + "grad_norm": 49418.71484375, + "learning_rate": 8.455475517377057e-05, + "loss": 2.2701, + "step": 5150 + }, + { + "epoch": 0.9655107778819119, + "grad_norm": 46468.59375, + "learning_rate": 8.45490753334901e-05, + "loss": 2.2711, + "step": 5151 + }, + { + "epoch": 0.9656982193064667, + "grad_norm": 49574.328125, + "learning_rate": 8.454339463989226e-05, + "loss": 2.5291, + "step": 5152 + }, + { + "epoch": 0.9658856607310216, + "grad_norm": 52691.48046875, + "learning_rate": 8.453771309311737e-05, + "loss": 2.2927, + "step": 5153 + }, + { + "epoch": 0.9660731021555764, + "grad_norm": 50636.421875, + "learning_rate": 8.453203069330574e-05, + "loss": 2.3053, + "step": 5154 + }, + { + "epoch": 0.9662605435801312, + "grad_norm": 48392.23828125, + "learning_rate": 8.452634744059772e-05, + "loss": 2.3512, + "step": 5155 + }, + { + "epoch": 0.966447985004686, + "grad_norm": 47380.7890625, + "learning_rate": 8.452066333513369e-05, + "loss": 2.341, + "step": 5156 + }, + { + "epoch": 0.9666354264292408, + "grad_norm": 49529.7421875, + "learning_rate": 8.451497837705403e-05, + "loss": 2.2657, + "step": 5157 + }, + { + "epoch": 0.9668228678537957, + "grad_norm": 50963.88671875, + "learning_rate": 8.450929256649914e-05, + "loss": 2.3003, + "step": 5158 + }, + { + "epoch": 0.9670103092783505, + "grad_norm": 50137.9453125, + "learning_rate": 8.450360590360949e-05, + "loss": 2.3598, + "step": 5159 + }, + { + "epoch": 0.9671977507029054, + "grad_norm": 49948.23828125, + "learning_rate": 8.44979183885255e-05, + "loss": 2.3362, + "step": 5160 + }, + { + "epoch": 0.9673851921274602, + "grad_norm": 45615.6328125, + "learning_rate": 8.449223002138763e-05, + "loss": 2.3551, + "step": 5161 + }, + { + "epoch": 0.967572633552015, + "grad_norm": 45951.734375, + "learning_rate": 8.448654080233643e-05, + "loss": 2.2913, + "step": 5162 + }, + { + "epoch": 0.9677600749765698, + "grad_norm": 53099.015625, + "learning_rate": 8.448085073151237e-05, + "loss": 2.3287, + "step": 5163 + }, + { + "epoch": 0.9679475164011246, + "grad_norm": 46386.19921875, + "learning_rate": 8.447515980905601e-05, + "loss": 2.378, + "step": 5164 + }, + { + "epoch": 0.9681349578256795, + "grad_norm": 47301.3359375, + "learning_rate": 8.446946803510787e-05, + "loss": 2.3057, + "step": 5165 + }, + { + "epoch": 0.9683223992502343, + "grad_norm": 48683.23828125, + "learning_rate": 8.446377540980858e-05, + "loss": 2.3783, + "step": 5166 + }, + { + "epoch": 0.9685098406747892, + "grad_norm": 47944.671875, + "learning_rate": 8.445808193329874e-05, + "loss": 2.2214, + "step": 5167 + }, + { + "epoch": 0.9686972820993439, + "grad_norm": 51302.4140625, + "learning_rate": 8.445238760571893e-05, + "loss": 2.3048, + "step": 5168 + }, + { + "epoch": 0.9688847235238988, + "grad_norm": 46760.3671875, + "learning_rate": 8.444669242720982e-05, + "loss": 2.2591, + "step": 5169 + }, + { + "epoch": 0.9690721649484536, + "grad_norm": 46711.91796875, + "learning_rate": 8.444099639791208e-05, + "loss": 2.2886, + "step": 5170 + }, + { + "epoch": 0.9692596063730085, + "grad_norm": 51147.0390625, + "learning_rate": 8.443529951796636e-05, + "loss": 2.3636, + "step": 5171 + }, + { + "epoch": 0.9694470477975633, + "grad_norm": 50399.21875, + "learning_rate": 8.44296017875134e-05, + "loss": 2.2914, + "step": 5172 + }, + { + "epoch": 0.969634489222118, + "grad_norm": 45829.22265625, + "learning_rate": 8.442390320669391e-05, + "loss": 2.266, + "step": 5173 + }, + { + "epoch": 0.9698219306466729, + "grad_norm": 49065.98828125, + "learning_rate": 8.441820377564863e-05, + "loss": 2.3225, + "step": 5174 + }, + { + "epoch": 0.9700093720712277, + "grad_norm": 50458.859375, + "learning_rate": 8.441250349451837e-05, + "loss": 2.3865, + "step": 5175 + }, + { + "epoch": 0.9701968134957826, + "grad_norm": 53267.80859375, + "learning_rate": 8.440680236344387e-05, + "loss": 2.3288, + "step": 5176 + }, + { + "epoch": 0.9703842549203374, + "grad_norm": 53661.21875, + "learning_rate": 8.440110038256597e-05, + "loss": 2.3087, + "step": 5177 + }, + { + "epoch": 0.9705716963448923, + "grad_norm": 50116.4921875, + "learning_rate": 8.439539755202549e-05, + "loss": 2.2997, + "step": 5178 + }, + { + "epoch": 0.970759137769447, + "grad_norm": 46592.59765625, + "learning_rate": 8.438969387196329e-05, + "loss": 2.3702, + "step": 5179 + }, + { + "epoch": 0.9709465791940018, + "grad_norm": 47292.36328125, + "learning_rate": 8.438398934252022e-05, + "loss": 2.2497, + "step": 5180 + }, + { + "epoch": 0.9711340206185567, + "grad_norm": 49189.87890625, + "learning_rate": 8.43782839638372e-05, + "loss": 2.2747, + "step": 5181 + }, + { + "epoch": 0.9713214620431115, + "grad_norm": 47138.890625, + "learning_rate": 8.437257773605516e-05, + "loss": 2.2977, + "step": 5182 + }, + { + "epoch": 0.9715089034676664, + "grad_norm": 48874.171875, + "learning_rate": 8.436687065931499e-05, + "loss": 2.2799, + "step": 5183 + }, + { + "epoch": 0.9716963448922212, + "grad_norm": 45565.5390625, + "learning_rate": 8.436116273375768e-05, + "loss": 2.2964, + "step": 5184 + }, + { + "epoch": 0.971883786316776, + "grad_norm": 59961.45703125, + "learning_rate": 8.43554539595242e-05, + "loss": 2.3151, + "step": 5185 + }, + { + "epoch": 0.9720712277413308, + "grad_norm": 51943.890625, + "learning_rate": 8.434974433675555e-05, + "loss": 2.3269, + "step": 5186 + }, + { + "epoch": 0.9722586691658857, + "grad_norm": 52363.96484375, + "learning_rate": 8.434403386559275e-05, + "loss": 2.23, + "step": 5187 + }, + { + "epoch": 0.9724461105904405, + "grad_norm": 46295.23046875, + "learning_rate": 8.433832254617684e-05, + "loss": 2.3514, + "step": 5188 + }, + { + "epoch": 0.9726335520149954, + "grad_norm": 48407.64453125, + "learning_rate": 8.433261037864887e-05, + "loss": 2.4769, + "step": 5189 + }, + { + "epoch": 0.9728209934395501, + "grad_norm": 52735.73828125, + "learning_rate": 8.432689736314997e-05, + "loss": 2.3225, + "step": 5190 + }, + { + "epoch": 0.9730084348641049, + "grad_norm": 45933.546875, + "learning_rate": 8.432118349982118e-05, + "loss": 2.3177, + "step": 5191 + }, + { + "epoch": 0.9731958762886598, + "grad_norm": 48465.90625, + "learning_rate": 8.431546878880367e-05, + "loss": 2.2443, + "step": 5192 + }, + { + "epoch": 0.9733833177132146, + "grad_norm": 55341.79296875, + "learning_rate": 8.430975323023857e-05, + "loss": 2.2477, + "step": 5193 + }, + { + "epoch": 0.9735707591377695, + "grad_norm": 49892.01953125, + "learning_rate": 8.430403682426704e-05, + "loss": 2.3158, + "step": 5194 + }, + { + "epoch": 0.9737582005623243, + "grad_norm": 48313.171875, + "learning_rate": 8.42983195710303e-05, + "loss": 2.333, + "step": 5195 + }, + { + "epoch": 0.973945641986879, + "grad_norm": 48904.67578125, + "learning_rate": 8.42926014706695e-05, + "loss": 2.257, + "step": 5196 + }, + { + "epoch": 0.9741330834114339, + "grad_norm": 50737.87109375, + "learning_rate": 8.428688252332593e-05, + "loss": 2.2434, + "step": 5197 + }, + { + "epoch": 0.9743205248359887, + "grad_norm": 47254.23828125, + "learning_rate": 8.42811627291408e-05, + "loss": 2.3267, + "step": 5198 + }, + { + "epoch": 0.9745079662605436, + "grad_norm": 47439.08984375, + "learning_rate": 8.427544208825542e-05, + "loss": 2.3105, + "step": 5199 + }, + { + "epoch": 0.9746954076850984, + "grad_norm": 46056.671875, + "learning_rate": 8.426972060081103e-05, + "loss": 2.3088, + "step": 5200 + }, + { + "epoch": 0.9748828491096533, + "grad_norm": 52824.54296875, + "learning_rate": 8.426399826694899e-05, + "loss": 2.3127, + "step": 5201 + }, + { + "epoch": 0.975070290534208, + "grad_norm": 51949.78515625, + "learning_rate": 8.425827508681061e-05, + "loss": 2.3148, + "step": 5202 + }, + { + "epoch": 0.9752577319587629, + "grad_norm": 84289.3203125, + "learning_rate": 8.425255106053725e-05, + "loss": 2.5941, + "step": 5203 + }, + { + "epoch": 0.9754451733833177, + "grad_norm": 48207.87890625, + "learning_rate": 8.42468261882703e-05, + "loss": 2.3537, + "step": 5204 + }, + { + "epoch": 0.9756326148078726, + "grad_norm": 51982.5078125, + "learning_rate": 8.424110047015114e-05, + "loss": 2.2396, + "step": 5205 + }, + { + "epoch": 0.9758200562324274, + "grad_norm": 48491.359375, + "learning_rate": 8.42353739063212e-05, + "loss": 2.322, + "step": 5206 + }, + { + "epoch": 0.9760074976569822, + "grad_norm": 51971.0703125, + "learning_rate": 8.422964649692188e-05, + "loss": 2.2933, + "step": 5207 + }, + { + "epoch": 0.976194939081537, + "grad_norm": 49073.84375, + "learning_rate": 8.42239182420947e-05, + "loss": 2.3382, + "step": 5208 + }, + { + "epoch": 0.9763823805060918, + "grad_norm": 46874.0, + "learning_rate": 8.42181891419811e-05, + "loss": 2.2852, + "step": 5209 + }, + { + "epoch": 0.9765698219306467, + "grad_norm": 48158.4375, + "learning_rate": 8.42124591967226e-05, + "loss": 2.3051, + "step": 5210 + }, + { + "epoch": 0.9767572633552015, + "grad_norm": 46712.125, + "learning_rate": 8.42067284064607e-05, + "loss": 2.2791, + "step": 5211 + }, + { + "epoch": 0.9769447047797564, + "grad_norm": 48700.58203125, + "learning_rate": 8.420099677133699e-05, + "loss": 2.3037, + "step": 5212 + }, + { + "epoch": 0.9771321462043111, + "grad_norm": 49561.14453125, + "learning_rate": 8.419526429149298e-05, + "loss": 2.2789, + "step": 5213 + }, + { + "epoch": 0.977319587628866, + "grad_norm": 52324.578125, + "learning_rate": 8.418953096707027e-05, + "loss": 2.267, + "step": 5214 + }, + { + "epoch": 0.9775070290534208, + "grad_norm": 46988.4453125, + "learning_rate": 8.418379679821049e-05, + "loss": 2.2591, + "step": 5215 + }, + { + "epoch": 0.9776944704779756, + "grad_norm": 54242.6953125, + "learning_rate": 8.417806178505523e-05, + "loss": 2.2867, + "step": 5216 + }, + { + "epoch": 0.9778819119025305, + "grad_norm": 52089.7421875, + "learning_rate": 8.417232592774617e-05, + "loss": 2.2782, + "step": 5217 + }, + { + "epoch": 0.9780693533270853, + "grad_norm": 49800.58203125, + "learning_rate": 8.416658922642497e-05, + "loss": 2.2855, + "step": 5218 + }, + { + "epoch": 0.9782567947516401, + "grad_norm": 53255.99609375, + "learning_rate": 8.416085168123329e-05, + "loss": 2.345, + "step": 5219 + }, + { + "epoch": 0.9784442361761949, + "grad_norm": 50131.1484375, + "learning_rate": 8.415511329231289e-05, + "loss": 2.3031, + "step": 5220 + }, + { + "epoch": 0.9786316776007498, + "grad_norm": 56989.79296875, + "learning_rate": 8.414937405980545e-05, + "loss": 2.354, + "step": 5221 + }, + { + "epoch": 0.9788191190253046, + "grad_norm": 47998.3203125, + "learning_rate": 8.414363398385277e-05, + "loss": 2.259, + "step": 5222 + }, + { + "epoch": 0.9790065604498595, + "grad_norm": 56707.09375, + "learning_rate": 8.413789306459658e-05, + "loss": 2.234, + "step": 5223 + }, + { + "epoch": 0.9791940018744143, + "grad_norm": 55245.171875, + "learning_rate": 8.413215130217871e-05, + "loss": 2.306, + "step": 5224 + }, + { + "epoch": 0.979381443298969, + "grad_norm": 48221.25, + "learning_rate": 8.412640869674095e-05, + "loss": 2.2933, + "step": 5225 + }, + { + "epoch": 0.9795688847235239, + "grad_norm": 52590.0078125, + "learning_rate": 8.412066524842513e-05, + "loss": 2.2895, + "step": 5226 + }, + { + "epoch": 0.9797563261480787, + "grad_norm": 49387.984375, + "learning_rate": 8.411492095737312e-05, + "loss": 2.2811, + "step": 5227 + }, + { + "epoch": 0.9799437675726336, + "grad_norm": 53276.9921875, + "learning_rate": 8.41091758237268e-05, + "loss": 2.2261, + "step": 5228 + }, + { + "epoch": 0.9801312089971884, + "grad_norm": 50940.07421875, + "learning_rate": 8.410342984762805e-05, + "loss": 2.3168, + "step": 5229 + }, + { + "epoch": 0.9803186504217432, + "grad_norm": 48380.1796875, + "learning_rate": 8.409768302921881e-05, + "loss": 2.3058, + "step": 5230 + }, + { + "epoch": 0.980506091846298, + "grad_norm": 51774.09375, + "learning_rate": 8.4091935368641e-05, + "loss": 2.3354, + "step": 5231 + }, + { + "epoch": 0.9806935332708528, + "grad_norm": 45921.21484375, + "learning_rate": 8.408618686603658e-05, + "loss": 2.3261, + "step": 5232 + }, + { + "epoch": 0.9808809746954077, + "grad_norm": 47407.5859375, + "learning_rate": 8.408043752154755e-05, + "loss": 2.2839, + "step": 5233 + }, + { + "epoch": 0.9810684161199625, + "grad_norm": 45696.41015625, + "learning_rate": 8.40746873353159e-05, + "loss": 2.3117, + "step": 5234 + }, + { + "epoch": 0.9812558575445174, + "grad_norm": 48107.94921875, + "learning_rate": 8.406893630748365e-05, + "loss": 2.2744, + "step": 5235 + }, + { + "epoch": 0.9814432989690721, + "grad_norm": 47483.6796875, + "learning_rate": 8.406318443819284e-05, + "loss": 2.2454, + "step": 5236 + }, + { + "epoch": 0.981630740393627, + "grad_norm": 48748.86328125, + "learning_rate": 8.405743172758554e-05, + "loss": 2.3699, + "step": 5237 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 46061.30078125, + "learning_rate": 8.405167817580384e-05, + "loss": 2.2729, + "step": 5238 + }, + { + "epoch": 0.9820056232427367, + "grad_norm": 48915.93359375, + "learning_rate": 8.404592378298981e-05, + "loss": 2.2864, + "step": 5239 + }, + { + "epoch": 0.9821930646672915, + "grad_norm": 45321.69140625, + "learning_rate": 8.404016854928563e-05, + "loss": 2.2991, + "step": 5240 + }, + { + "epoch": 0.9823805060918464, + "grad_norm": 49487.5625, + "learning_rate": 8.403441247483341e-05, + "loss": 2.3797, + "step": 5241 + }, + { + "epoch": 0.9825679475164011, + "grad_norm": 52285.6171875, + "learning_rate": 8.402865555977534e-05, + "loss": 2.3539, + "step": 5242 + }, + { + "epoch": 0.9827553889409559, + "grad_norm": 52049.9609375, + "learning_rate": 8.402289780425359e-05, + "loss": 2.3716, + "step": 5243 + }, + { + "epoch": 0.9829428303655108, + "grad_norm": 48236.7265625, + "learning_rate": 8.401713920841036e-05, + "loss": 2.2777, + "step": 5244 + }, + { + "epoch": 0.9831302717900656, + "grad_norm": 51709.8203125, + "learning_rate": 8.401137977238791e-05, + "loss": 2.3684, + "step": 5245 + }, + { + "epoch": 0.9833177132146205, + "grad_norm": 49326.9375, + "learning_rate": 8.400561949632848e-05, + "loss": 2.2866, + "step": 5246 + }, + { + "epoch": 0.9835051546391752, + "grad_norm": 49255.1484375, + "learning_rate": 8.399985838037434e-05, + "loss": 2.283, + "step": 5247 + }, + { + "epoch": 0.98369259606373, + "grad_norm": 44822.953125, + "learning_rate": 8.399409642466778e-05, + "loss": 2.2774, + "step": 5248 + }, + { + "epoch": 0.9838800374882849, + "grad_norm": 49337.171875, + "learning_rate": 8.398833362935108e-05, + "loss": 2.2079, + "step": 5249 + }, + { + "epoch": 0.9840674789128397, + "grad_norm": 50096.84765625, + "learning_rate": 8.398256999456664e-05, + "loss": 2.3217, + "step": 5250 + }, + { + "epoch": 0.9842549203373946, + "grad_norm": 46553.10546875, + "learning_rate": 8.397680552045677e-05, + "loss": 2.275, + "step": 5251 + }, + { + "epoch": 0.9844423617619494, + "grad_norm": 46479.61328125, + "learning_rate": 8.397104020716386e-05, + "loss": 2.3364, + "step": 5252 + }, + { + "epoch": 0.9846298031865042, + "grad_norm": 49328.0703125, + "learning_rate": 8.39652740548303e-05, + "loss": 2.2976, + "step": 5253 + }, + { + "epoch": 0.984817244611059, + "grad_norm": 48234.97265625, + "learning_rate": 8.39595070635985e-05, + "loss": 2.4055, + "step": 5254 + }, + { + "epoch": 0.9850046860356139, + "grad_norm": 49466.953125, + "learning_rate": 8.395373923361092e-05, + "loss": 2.2387, + "step": 5255 + }, + { + "epoch": 0.9851921274601687, + "grad_norm": 49950.1796875, + "learning_rate": 8.394797056500998e-05, + "loss": 2.2521, + "step": 5256 + }, + { + "epoch": 0.9853795688847236, + "grad_norm": 51465.44140625, + "learning_rate": 8.394220105793819e-05, + "loss": 2.3181, + "step": 5257 + }, + { + "epoch": 0.9855670103092784, + "grad_norm": 51384.0234375, + "learning_rate": 8.393643071253805e-05, + "loss": 2.2835, + "step": 5258 + }, + { + "epoch": 0.9857544517338331, + "grad_norm": 51100.5703125, + "learning_rate": 8.393065952895207e-05, + "loss": 2.3809, + "step": 5259 + }, + { + "epoch": 0.985941893158388, + "grad_norm": 54094.734375, + "learning_rate": 8.392488750732277e-05, + "loss": 2.2645, + "step": 5260 + }, + { + "epoch": 0.9861293345829428, + "grad_norm": 48989.2109375, + "learning_rate": 8.391911464779276e-05, + "loss": 2.3695, + "step": 5261 + }, + { + "epoch": 0.9863167760074977, + "grad_norm": 50630.484375, + "learning_rate": 8.391334095050459e-05, + "loss": 2.3273, + "step": 5262 + }, + { + "epoch": 0.9865042174320525, + "grad_norm": 47436.10546875, + "learning_rate": 8.390756641560085e-05, + "loss": 2.3415, + "step": 5263 + }, + { + "epoch": 0.9866916588566073, + "grad_norm": 47062.90625, + "learning_rate": 8.390179104322419e-05, + "loss": 2.3875, + "step": 5264 + }, + { + "epoch": 0.9868791002811621, + "grad_norm": 50533.8671875, + "learning_rate": 8.389601483351725e-05, + "loss": 2.2452, + "step": 5265 + }, + { + "epoch": 0.987066541705717, + "grad_norm": 46574.69921875, + "learning_rate": 8.389023778662269e-05, + "loss": 2.3842, + "step": 5266 + }, + { + "epoch": 0.9872539831302718, + "grad_norm": 47560.3046875, + "learning_rate": 8.38844599026832e-05, + "loss": 2.2894, + "step": 5267 + }, + { + "epoch": 0.9874414245548266, + "grad_norm": 48746.76171875, + "learning_rate": 8.387868118184148e-05, + "loss": 2.329, + "step": 5268 + }, + { + "epoch": 0.9876288659793815, + "grad_norm": 51928.79296875, + "learning_rate": 8.387290162424025e-05, + "loss": 2.3422, + "step": 5269 + }, + { + "epoch": 0.9878163074039362, + "grad_norm": 47587.58203125, + "learning_rate": 8.386712123002227e-05, + "loss": 2.221, + "step": 5270 + }, + { + "epoch": 0.9880037488284911, + "grad_norm": 52255.75, + "learning_rate": 8.386133999933031e-05, + "loss": 2.2471, + "step": 5271 + }, + { + "epoch": 0.9881911902530459, + "grad_norm": 51262.1640625, + "learning_rate": 8.385555793230716e-05, + "loss": 2.2853, + "step": 5272 + }, + { + "epoch": 0.9883786316776008, + "grad_norm": 46303.93359375, + "learning_rate": 8.38497750290956e-05, + "loss": 2.3117, + "step": 5273 + }, + { + "epoch": 0.9885660731021556, + "grad_norm": 52173.359375, + "learning_rate": 8.384399128983851e-05, + "loss": 2.2868, + "step": 5274 + }, + { + "epoch": 0.9887535145267105, + "grad_norm": 48098.42578125, + "learning_rate": 8.38382067146787e-05, + "loss": 2.2744, + "step": 5275 + }, + { + "epoch": 0.9889409559512652, + "grad_norm": 49623.796875, + "learning_rate": 8.383242130375906e-05, + "loss": 2.3079, + "step": 5276 + }, + { + "epoch": 0.98912839737582, + "grad_norm": 47533.21484375, + "learning_rate": 8.382663505722249e-05, + "loss": 2.3111, + "step": 5277 + }, + { + "epoch": 0.9893158388003749, + "grad_norm": 48757.578125, + "learning_rate": 8.382084797521185e-05, + "loss": 2.3545, + "step": 5278 + }, + { + "epoch": 0.9895032802249297, + "grad_norm": 49905.36328125, + "learning_rate": 8.381506005787016e-05, + "loss": 2.2857, + "step": 5279 + }, + { + "epoch": 0.9896907216494846, + "grad_norm": 49960.7421875, + "learning_rate": 8.38092713053403e-05, + "loss": 2.3342, + "step": 5280 + }, + { + "epoch": 0.9898781630740393, + "grad_norm": 50154.48046875, + "learning_rate": 8.38034817177653e-05, + "loss": 2.3041, + "step": 5281 + }, + { + "epoch": 0.9900656044985942, + "grad_norm": 50253.43359375, + "learning_rate": 8.37976912952881e-05, + "loss": 2.2755, + "step": 5282 + }, + { + "epoch": 0.990253045923149, + "grad_norm": 49577.1484375, + "learning_rate": 8.379190003805178e-05, + "loss": 2.3394, + "step": 5283 + }, + { + "epoch": 0.9904404873477038, + "grad_norm": 48148.18359375, + "learning_rate": 8.37861079461993e-05, + "loss": 2.2601, + "step": 5284 + }, + { + "epoch": 0.9906279287722587, + "grad_norm": 46703.953125, + "learning_rate": 8.378031501987378e-05, + "loss": 2.3929, + "step": 5285 + }, + { + "epoch": 0.9908153701968135, + "grad_norm": 47450.0234375, + "learning_rate": 8.377452125921827e-05, + "loss": 2.3239, + "step": 5286 + }, + { + "epoch": 0.9910028116213683, + "grad_norm": 55722.13671875, + "learning_rate": 8.376872666437588e-05, + "loss": 2.3607, + "step": 5287 + }, + { + "epoch": 0.9911902530459231, + "grad_norm": 53763.33203125, + "learning_rate": 8.376293123548971e-05, + "loss": 2.4433, + "step": 5288 + }, + { + "epoch": 0.991377694470478, + "grad_norm": 52088.390625, + "learning_rate": 8.375713497270293e-05, + "loss": 2.311, + "step": 5289 + }, + { + "epoch": 0.9915651358950328, + "grad_norm": 51117.890625, + "learning_rate": 8.375133787615867e-05, + "loss": 2.3537, + "step": 5290 + }, + { + "epoch": 0.9917525773195877, + "grad_norm": 47533.6953125, + "learning_rate": 8.374553994600013e-05, + "loss": 2.3049, + "step": 5291 + }, + { + "epoch": 0.9919400187441425, + "grad_norm": 50756.27734375, + "learning_rate": 8.373974118237052e-05, + "loss": 2.3012, + "step": 5292 + }, + { + "epoch": 0.9921274601686972, + "grad_norm": 50754.44921875, + "learning_rate": 8.373394158541301e-05, + "loss": 2.3823, + "step": 5293 + }, + { + "epoch": 0.9923149015932521, + "grad_norm": 47858.34765625, + "learning_rate": 8.372814115527091e-05, + "loss": 2.2986, + "step": 5294 + }, + { + "epoch": 0.9925023430178069, + "grad_norm": 47234.39453125, + "learning_rate": 8.372233989208744e-05, + "loss": 2.3143, + "step": 5295 + }, + { + "epoch": 0.9926897844423618, + "grad_norm": 46745.12109375, + "learning_rate": 8.37165377960059e-05, + "loss": 2.3727, + "step": 5296 + }, + { + "epoch": 0.9928772258669166, + "grad_norm": 55278.22265625, + "learning_rate": 8.37107348671696e-05, + "loss": 2.3114, + "step": 5297 + }, + { + "epoch": 0.9930646672914715, + "grad_norm": 47025.71484375, + "learning_rate": 8.370493110572185e-05, + "loss": 2.3591, + "step": 5298 + }, + { + "epoch": 0.9932521087160262, + "grad_norm": 49530.875, + "learning_rate": 8.369912651180598e-05, + "loss": 2.3262, + "step": 5299 + }, + { + "epoch": 0.993439550140581, + "grad_norm": 51647.01171875, + "learning_rate": 8.36933210855654e-05, + "loss": 2.2516, + "step": 5300 + }, + { + "epoch": 0.9936269915651359, + "grad_norm": 47783.57421875, + "learning_rate": 8.368751482714347e-05, + "loss": 2.3719, + "step": 5301 + }, + { + "epoch": 0.9938144329896907, + "grad_norm": 48665.06640625, + "learning_rate": 8.368170773668359e-05, + "loss": 2.2905, + "step": 5302 + }, + { + "epoch": 0.9940018744142456, + "grad_norm": 51240.30859375, + "learning_rate": 8.36758998143292e-05, + "loss": 2.3651, + "step": 5303 + }, + { + "epoch": 0.9941893158388003, + "grad_norm": 48020.48046875, + "learning_rate": 8.367009106022375e-05, + "loss": 2.358, + "step": 5304 + }, + { + "epoch": 0.9943767572633552, + "grad_norm": 48687.7734375, + "learning_rate": 8.366428147451071e-05, + "loss": 2.3231, + "step": 5305 + }, + { + "epoch": 0.99456419868791, + "grad_norm": 49198.7265625, + "learning_rate": 8.365847105733355e-05, + "loss": 2.3085, + "step": 5306 + }, + { + "epoch": 0.9947516401124649, + "grad_norm": 45895.94921875, + "learning_rate": 8.365265980883581e-05, + "loss": 2.3182, + "step": 5307 + }, + { + "epoch": 0.9949390815370197, + "grad_norm": 51039.296875, + "learning_rate": 8.3646847729161e-05, + "loss": 2.2364, + "step": 5308 + }, + { + "epoch": 0.9951265229615746, + "grad_norm": 47420.70703125, + "learning_rate": 8.364103481845267e-05, + "loss": 2.3451, + "step": 5309 + }, + { + "epoch": 0.9953139643861293, + "grad_norm": 49950.19140625, + "learning_rate": 8.36352210768544e-05, + "loss": 2.31, + "step": 5310 + }, + { + "epoch": 0.9955014058106841, + "grad_norm": 44396.01953125, + "learning_rate": 8.362940650450978e-05, + "loss": 2.2997, + "step": 5311 + }, + { + "epoch": 0.995688847235239, + "grad_norm": 55480.01171875, + "learning_rate": 8.362359110156241e-05, + "loss": 2.1959, + "step": 5312 + }, + { + "epoch": 0.9958762886597938, + "grad_norm": 48981.984375, + "learning_rate": 8.361777486815594e-05, + "loss": 2.3082, + "step": 5313 + }, + { + "epoch": 0.9960637300843487, + "grad_norm": 44015.10546875, + "learning_rate": 8.361195780443401e-05, + "loss": 2.3418, + "step": 5314 + }, + { + "epoch": 0.9962511715089035, + "grad_norm": 53312.38671875, + "learning_rate": 8.360613991054032e-05, + "loss": 2.3278, + "step": 5315 + }, + { + "epoch": 0.9964386129334583, + "grad_norm": 48913.16796875, + "learning_rate": 8.360032118661853e-05, + "loss": 2.3774, + "step": 5316 + }, + { + "epoch": 0.9966260543580131, + "grad_norm": 47177.81640625, + "learning_rate": 8.359450163281237e-05, + "loss": 2.3485, + "step": 5317 + }, + { + "epoch": 0.996813495782568, + "grad_norm": 51044.91015625, + "learning_rate": 8.358868124926558e-05, + "loss": 2.2823, + "step": 5318 + }, + { + "epoch": 0.9970009372071228, + "grad_norm": 44777.08984375, + "learning_rate": 8.358286003612191e-05, + "loss": 2.3712, + "step": 5319 + }, + { + "epoch": 0.9971883786316776, + "grad_norm": 49476.578125, + "learning_rate": 8.357703799352514e-05, + "loss": 2.2556, + "step": 5320 + }, + { + "epoch": 0.9973758200562324, + "grad_norm": 47338.15625, + "learning_rate": 8.357121512161908e-05, + "loss": 2.289, + "step": 5321 + }, + { + "epoch": 0.9975632614807872, + "grad_norm": 50791.015625, + "learning_rate": 8.35653914205475e-05, + "loss": 2.3253, + "step": 5322 + }, + { + "epoch": 0.9977507029053421, + "grad_norm": 50257.15625, + "learning_rate": 8.355956689045427e-05, + "loss": 2.2598, + "step": 5323 + }, + { + "epoch": 0.9979381443298969, + "grad_norm": 48991.4140625, + "learning_rate": 8.355374153148328e-05, + "loss": 2.3153, + "step": 5324 + }, + { + "epoch": 0.9981255857544518, + "grad_norm": 48371.5546875, + "learning_rate": 8.354791534377836e-05, + "loss": 2.3101, + "step": 5325 + }, + { + "epoch": 0.9983130271790066, + "grad_norm": 46030.3046875, + "learning_rate": 8.354208832748342e-05, + "loss": 2.3126, + "step": 5326 + }, + { + "epoch": 0.9985004686035613, + "grad_norm": 50308.21484375, + "learning_rate": 8.35362604827424e-05, + "loss": 2.3173, + "step": 5327 + }, + { + "epoch": 0.9986879100281162, + "grad_norm": 47079.74609375, + "learning_rate": 8.353043180969921e-05, + "loss": 2.2637, + "step": 5328 + }, + { + "epoch": 0.998875351452671, + "grad_norm": 47069.4921875, + "learning_rate": 8.352460230849784e-05, + "loss": 2.3516, + "step": 5329 + }, + { + "epoch": 0.9990627928772259, + "grad_norm": 48670.23828125, + "learning_rate": 8.351877197928227e-05, + "loss": 2.324, + "step": 5330 + }, + { + "epoch": 0.9992502343017807, + "grad_norm": 49376.734375, + "learning_rate": 8.351294082219647e-05, + "loss": 2.3788, + "step": 5331 + }, + { + "epoch": 0.9994376757263356, + "grad_norm": 48068.9921875, + "learning_rate": 8.350710883738448e-05, + "loss": 2.3503, + "step": 5332 + }, + { + "epoch": 0.9996251171508903, + "grad_norm": 45784.1171875, + "learning_rate": 8.350127602499037e-05, + "loss": 2.2881, + "step": 5333 + }, + { + "epoch": 0.9998125585754452, + "grad_norm": 51446.20703125, + "learning_rate": 8.349544238515814e-05, + "loss": 2.3677, + "step": 5334 + }, + { + "epoch": 1.0, + "grad_norm": 65339.17578125, + "learning_rate": 8.348960791803193e-05, + "loss": 2.2532, + "step": 5335 + }, + { + "epoch": 1.0001874414245548, + "grad_norm": 47655.84765625, + "learning_rate": 8.348377262375583e-05, + "loss": 2.3089, + "step": 5336 + }, + { + "epoch": 1.0003748828491097, + "grad_norm": 48648.36328125, + "learning_rate": 8.347793650247396e-05, + "loss": 2.3226, + "step": 5337 + }, + { + "epoch": 1.0005623242736645, + "grad_norm": 48430.2734375, + "learning_rate": 8.347209955433044e-05, + "loss": 2.2522, + "step": 5338 + }, + { + "epoch": 1.0007497656982194, + "grad_norm": 47440.828125, + "learning_rate": 8.346626177946948e-05, + "loss": 2.1921, + "step": 5339 + }, + { + "epoch": 1.0009372071227742, + "grad_norm": 45285.28125, + "learning_rate": 8.346042317803525e-05, + "loss": 2.2631, + "step": 5340 + }, + { + "epoch": 1.0011246485473289, + "grad_norm": 50798.82421875, + "learning_rate": 8.345458375017193e-05, + "loss": 2.2063, + "step": 5341 + }, + { + "epoch": 1.0013120899718837, + "grad_norm": 47552.17578125, + "learning_rate": 8.344874349602376e-05, + "loss": 2.1973, + "step": 5342 + }, + { + "epoch": 1.0014995313964385, + "grad_norm": 50029.31640625, + "learning_rate": 8.344290241573502e-05, + "loss": 2.2488, + "step": 5343 + }, + { + "epoch": 1.0016869728209934, + "grad_norm": 54313.3203125, + "learning_rate": 8.343706050944993e-05, + "loss": 2.2321, + "step": 5344 + }, + { + "epoch": 1.0018744142455482, + "grad_norm": 46600.5390625, + "learning_rate": 8.34312177773128e-05, + "loss": 2.222, + "step": 5345 + }, + { + "epoch": 1.002061855670103, + "grad_norm": 48404.59375, + "learning_rate": 8.342537421946795e-05, + "loss": 2.2262, + "step": 5346 + }, + { + "epoch": 1.002249297094658, + "grad_norm": 47291.2265625, + "learning_rate": 8.341952983605968e-05, + "loss": 2.2695, + "step": 5347 + }, + { + "epoch": 1.0024367385192128, + "grad_norm": 50214.07421875, + "learning_rate": 8.341368462723234e-05, + "loss": 2.2098, + "step": 5348 + }, + { + "epoch": 1.0026241799437676, + "grad_norm": 55136.9140625, + "learning_rate": 8.340783859313033e-05, + "loss": 2.2045, + "step": 5349 + }, + { + "epoch": 1.0028116213683225, + "grad_norm": 51818.42578125, + "learning_rate": 8.340199173389802e-05, + "loss": 2.2298, + "step": 5350 + }, + { + "epoch": 1.0029990627928773, + "grad_norm": 49654.16015625, + "learning_rate": 8.339614404967982e-05, + "loss": 2.2093, + "step": 5351 + }, + { + "epoch": 1.0031865042174322, + "grad_norm": 52637.21484375, + "learning_rate": 8.339029554062015e-05, + "loss": 2.1862, + "step": 5352 + }, + { + "epoch": 1.0033739456419868, + "grad_norm": 47141.84765625, + "learning_rate": 8.338444620686348e-05, + "loss": 2.263, + "step": 5353 + }, + { + "epoch": 1.0035613870665416, + "grad_norm": 47186.53515625, + "learning_rate": 8.337859604855426e-05, + "loss": 2.2482, + "step": 5354 + }, + { + "epoch": 1.0037488284910965, + "grad_norm": 51079.96875, + "learning_rate": 8.337274506583702e-05, + "loss": 2.3097, + "step": 5355 + }, + { + "epoch": 1.0039362699156513, + "grad_norm": 67722.25, + "learning_rate": 8.336689325885622e-05, + "loss": 2.2969, + "step": 5356 + }, + { + "epoch": 1.0041237113402062, + "grad_norm": 48544.875, + "learning_rate": 8.336104062775643e-05, + "loss": 2.2705, + "step": 5357 + }, + { + "epoch": 1.004311152764761, + "grad_norm": 47284.69921875, + "learning_rate": 8.335518717268218e-05, + "loss": 2.3191, + "step": 5358 + }, + { + "epoch": 1.0044985941893159, + "grad_norm": 54864.1875, + "learning_rate": 8.334933289377806e-05, + "loss": 2.1937, + "step": 5359 + }, + { + "epoch": 1.0046860356138707, + "grad_norm": 48820.4765625, + "learning_rate": 8.334347779118866e-05, + "loss": 2.2804, + "step": 5360 + }, + { + "epoch": 1.0048734770384256, + "grad_norm": 49749.3828125, + "learning_rate": 8.333762186505858e-05, + "loss": 2.1902, + "step": 5361 + }, + { + "epoch": 1.0050609184629804, + "grad_norm": 50413.73828125, + "learning_rate": 8.333176511553246e-05, + "loss": 2.2103, + "step": 5362 + }, + { + "epoch": 1.0052483598875352, + "grad_norm": 47661.40234375, + "learning_rate": 8.332590754275497e-05, + "loss": 2.2414, + "step": 5363 + }, + { + "epoch": 1.0054358013120899, + "grad_norm": 45915.23828125, + "learning_rate": 8.332004914687075e-05, + "loss": 2.2367, + "step": 5364 + }, + { + "epoch": 1.0056232427366447, + "grad_norm": 49694.953125, + "learning_rate": 8.331418992802453e-05, + "loss": 2.2638, + "step": 5365 + }, + { + "epoch": 1.0058106841611996, + "grad_norm": 50848.140625, + "learning_rate": 8.330832988636101e-05, + "loss": 2.203, + "step": 5366 + }, + { + "epoch": 1.0059981255857544, + "grad_norm": 50156.546875, + "learning_rate": 8.330246902202494e-05, + "loss": 2.2474, + "step": 5367 + }, + { + "epoch": 1.0061855670103093, + "grad_norm": 49114.27734375, + "learning_rate": 8.329660733516104e-05, + "loss": 2.2391, + "step": 5368 + }, + { + "epoch": 1.006373008434864, + "grad_norm": 46932.30078125, + "learning_rate": 8.329074482591412e-05, + "loss": 2.2207, + "step": 5369 + }, + { + "epoch": 1.006560449859419, + "grad_norm": 50821.40625, + "learning_rate": 8.328488149442896e-05, + "loss": 2.1993, + "step": 5370 + }, + { + "epoch": 1.0067478912839738, + "grad_norm": 51995.6015625, + "learning_rate": 8.32790173408504e-05, + "loss": 2.2103, + "step": 5371 + }, + { + "epoch": 1.0069353327085286, + "grad_norm": 51222.55078125, + "learning_rate": 8.327315236532324e-05, + "loss": 2.3125, + "step": 5372 + }, + { + "epoch": 1.0071227741330835, + "grad_norm": 48280.93359375, + "learning_rate": 8.326728656799236e-05, + "loss": 2.2011, + "step": 5373 + }, + { + "epoch": 1.0073102155576383, + "grad_norm": 52552.03515625, + "learning_rate": 8.326141994900263e-05, + "loss": 2.2117, + "step": 5374 + }, + { + "epoch": 1.007497656982193, + "grad_norm": 53897.4765625, + "learning_rate": 8.325555250849896e-05, + "loss": 2.1622, + "step": 5375 + }, + { + "epoch": 1.0076850984067478, + "grad_norm": 48334.91015625, + "learning_rate": 8.324968424662625e-05, + "loss": 2.2291, + "step": 5376 + }, + { + "epoch": 1.0078725398313026, + "grad_norm": 47918.13671875, + "learning_rate": 8.324381516352945e-05, + "loss": 2.2387, + "step": 5377 + }, + { + "epoch": 1.0080599812558575, + "grad_norm": 49097.984375, + "learning_rate": 8.323794525935353e-05, + "loss": 2.2328, + "step": 5378 + }, + { + "epoch": 1.0082474226804123, + "grad_norm": 52743.71484375, + "learning_rate": 8.323207453424345e-05, + "loss": 2.2655, + "step": 5379 + }, + { + "epoch": 1.0084348641049672, + "grad_norm": 50688.6875, + "learning_rate": 8.32262029883442e-05, + "loss": 2.2098, + "step": 5380 + }, + { + "epoch": 1.008622305529522, + "grad_norm": 53702.97265625, + "learning_rate": 8.322033062180082e-05, + "loss": 2.1891, + "step": 5381 + }, + { + "epoch": 1.0088097469540769, + "grad_norm": 47666.60546875, + "learning_rate": 8.321445743475836e-05, + "loss": 2.261, + "step": 5382 + }, + { + "epoch": 1.0089971883786317, + "grad_norm": 50605.1484375, + "learning_rate": 8.320858342736184e-05, + "loss": 2.2427, + "step": 5383 + }, + { + "epoch": 1.0091846298031866, + "grad_norm": 49264.953125, + "learning_rate": 8.320270859975638e-05, + "loss": 2.2198, + "step": 5384 + }, + { + "epoch": 1.0093720712277414, + "grad_norm": 48962.48828125, + "learning_rate": 8.319683295208707e-05, + "loss": 2.2408, + "step": 5385 + }, + { + "epoch": 1.0095595126522963, + "grad_norm": 50480.61328125, + "learning_rate": 8.319095648449902e-05, + "loss": 2.1884, + "step": 5386 + }, + { + "epoch": 1.0097469540768509, + "grad_norm": 49899.625, + "learning_rate": 8.318507919713736e-05, + "loss": 2.2619, + "step": 5387 + }, + { + "epoch": 1.0099343955014057, + "grad_norm": 52572.328125, + "learning_rate": 8.31792010901473e-05, + "loss": 2.1759, + "step": 5388 + }, + { + "epoch": 1.0101218369259606, + "grad_norm": 49493.23828125, + "learning_rate": 8.317332216367398e-05, + "loss": 2.2199, + "step": 5389 + }, + { + "epoch": 1.0103092783505154, + "grad_norm": 48245.82421875, + "learning_rate": 8.316744241786261e-05, + "loss": 2.2092, + "step": 5390 + }, + { + "epoch": 1.0104967197750703, + "grad_norm": 58166.11328125, + "learning_rate": 8.316156185285841e-05, + "loss": 2.1601, + "step": 5391 + }, + { + "epoch": 1.0106841611996251, + "grad_norm": 47176.7109375, + "learning_rate": 8.315568046880663e-05, + "loss": 2.2364, + "step": 5392 + }, + { + "epoch": 1.01087160262418, + "grad_norm": 47206.12890625, + "learning_rate": 8.314979826585253e-05, + "loss": 2.2772, + "step": 5393 + }, + { + "epoch": 1.0110590440487348, + "grad_norm": 52466.69921875, + "learning_rate": 8.31439152441414e-05, + "loss": 2.2332, + "step": 5394 + }, + { + "epoch": 1.0112464854732897, + "grad_norm": 53955.1328125, + "learning_rate": 8.313803140381853e-05, + "loss": 2.2281, + "step": 5395 + }, + { + "epoch": 1.0114339268978445, + "grad_norm": 46899.84375, + "learning_rate": 8.313214674502923e-05, + "loss": 2.2312, + "step": 5396 + }, + { + "epoch": 1.0116213683223994, + "grad_norm": 50670.6171875, + "learning_rate": 8.312626126791888e-05, + "loss": 2.2754, + "step": 5397 + }, + { + "epoch": 1.011808809746954, + "grad_norm": 50289.72265625, + "learning_rate": 8.312037497263284e-05, + "loss": 2.2635, + "step": 5398 + }, + { + "epoch": 1.0119962511715088, + "grad_norm": 48374.9296875, + "learning_rate": 8.311448785931645e-05, + "loss": 2.2593, + "step": 5399 + }, + { + "epoch": 1.0121836925960637, + "grad_norm": 53075.3515625, + "learning_rate": 8.310859992811515e-05, + "loss": 2.215, + "step": 5400 + }, + { + "epoch": 1.0123711340206185, + "grad_norm": 49919.98828125, + "learning_rate": 8.310271117917437e-05, + "loss": 2.2541, + "step": 5401 + }, + { + "epoch": 1.0125585754451734, + "grad_norm": 51358.71484375, + "learning_rate": 8.309682161263954e-05, + "loss": 2.2328, + "step": 5402 + }, + { + "epoch": 1.0127460168697282, + "grad_norm": 48574.0625, + "learning_rate": 8.309093122865612e-05, + "loss": 2.223, + "step": 5403 + }, + { + "epoch": 1.012933458294283, + "grad_norm": 49047.1171875, + "learning_rate": 8.308504002736961e-05, + "loss": 2.2387, + "step": 5404 + }, + { + "epoch": 1.013120899718838, + "grad_norm": 50767.234375, + "learning_rate": 8.307914800892549e-05, + "loss": 2.1546, + "step": 5405 + }, + { + "epoch": 1.0133083411433927, + "grad_norm": 47979.12109375, + "learning_rate": 8.307325517346932e-05, + "loss": 2.2643, + "step": 5406 + }, + { + "epoch": 1.0134957825679476, + "grad_norm": 43773.3828125, + "learning_rate": 8.306736152114662e-05, + "loss": 2.2892, + "step": 5407 + }, + { + "epoch": 1.0136832239925024, + "grad_norm": 49850.921875, + "learning_rate": 8.306146705210296e-05, + "loss": 2.3005, + "step": 5408 + }, + { + "epoch": 1.013870665417057, + "grad_norm": 46579.0625, + "learning_rate": 8.305557176648393e-05, + "loss": 2.2225, + "step": 5409 + }, + { + "epoch": 1.014058106841612, + "grad_norm": 47925.59375, + "learning_rate": 8.304967566443515e-05, + "loss": 2.2559, + "step": 5410 + }, + { + "epoch": 1.0142455482661668, + "grad_norm": 51323.35546875, + "learning_rate": 8.30437787461022e-05, + "loss": 2.2559, + "step": 5411 + }, + { + "epoch": 1.0144329896907216, + "grad_norm": 51568.69921875, + "learning_rate": 8.303788101163078e-05, + "loss": 2.1604, + "step": 5412 + }, + { + "epoch": 1.0146204311152764, + "grad_norm": 50688.29296875, + "learning_rate": 8.303198246116652e-05, + "loss": 2.2576, + "step": 5413 + }, + { + "epoch": 1.0148078725398313, + "grad_norm": 55156.83984375, + "learning_rate": 8.302608309485514e-05, + "loss": 2.2308, + "step": 5414 + }, + { + "epoch": 1.0149953139643861, + "grad_norm": 51142.390625, + "learning_rate": 8.30201829128423e-05, + "loss": 2.2127, + "step": 5415 + }, + { + "epoch": 1.015182755388941, + "grad_norm": 47482.15625, + "learning_rate": 8.301428191527379e-05, + "loss": 2.1956, + "step": 5416 + }, + { + "epoch": 1.0153701968134958, + "grad_norm": 51423.69140625, + "learning_rate": 8.300838010229529e-05, + "loss": 2.2122, + "step": 5417 + }, + { + "epoch": 1.0155576382380507, + "grad_norm": 49163.03125, + "learning_rate": 8.300247747405261e-05, + "loss": 2.2079, + "step": 5418 + }, + { + "epoch": 1.0157450796626055, + "grad_norm": 48835.26953125, + "learning_rate": 8.299657403069153e-05, + "loss": 2.2642, + "step": 5419 + }, + { + "epoch": 1.0159325210871604, + "grad_norm": 50158.49609375, + "learning_rate": 8.299066977235782e-05, + "loss": 2.2017, + "step": 5420 + }, + { + "epoch": 1.016119962511715, + "grad_norm": 50547.22265625, + "learning_rate": 8.298476469919736e-05, + "loss": 2.2472, + "step": 5421 + }, + { + "epoch": 1.0163074039362698, + "grad_norm": 52763.4375, + "learning_rate": 8.297885881135599e-05, + "loss": 2.2612, + "step": 5422 + }, + { + "epoch": 1.0164948453608247, + "grad_norm": 53127.01171875, + "learning_rate": 8.297295210897955e-05, + "loss": 2.249, + "step": 5423 + }, + { + "epoch": 1.0166822867853795, + "grad_norm": 52127.5703125, + "learning_rate": 8.296704459221393e-05, + "loss": 2.228, + "step": 5424 + }, + { + "epoch": 1.0168697282099344, + "grad_norm": 47894.734375, + "learning_rate": 8.296113626120509e-05, + "loss": 2.3099, + "step": 5425 + }, + { + "epoch": 1.0170571696344892, + "grad_norm": 55922.578125, + "learning_rate": 8.295522711609888e-05, + "loss": 2.2714, + "step": 5426 + }, + { + "epoch": 1.017244611059044, + "grad_norm": 53567.171875, + "learning_rate": 8.29493171570413e-05, + "loss": 2.2252, + "step": 5427 + }, + { + "epoch": 1.017432052483599, + "grad_norm": 47914.4609375, + "learning_rate": 8.294340638417832e-05, + "loss": 2.3013, + "step": 5428 + }, + { + "epoch": 1.0176194939081538, + "grad_norm": 54149.94921875, + "learning_rate": 8.293749479765591e-05, + "loss": 2.1761, + "step": 5429 + }, + { + "epoch": 1.0178069353327086, + "grad_norm": 48181.109375, + "learning_rate": 8.293158239762006e-05, + "loss": 2.2338, + "step": 5430 + }, + { + "epoch": 1.0179943767572635, + "grad_norm": 51935.58984375, + "learning_rate": 8.292566918421685e-05, + "loss": 2.2067, + "step": 5431 + }, + { + "epoch": 1.018181818181818, + "grad_norm": 50244.859375, + "learning_rate": 8.29197551575923e-05, + "loss": 2.1193, + "step": 5432 + }, + { + "epoch": 1.018369259606373, + "grad_norm": 50520.0859375, + "learning_rate": 8.291384031789246e-05, + "loss": 2.2888, + "step": 5433 + }, + { + "epoch": 1.0185567010309278, + "grad_norm": 50802.5859375, + "learning_rate": 8.290792466526345e-05, + "loss": 2.2473, + "step": 5434 + }, + { + "epoch": 1.0187441424554826, + "grad_norm": 51724.421875, + "learning_rate": 8.290200819985135e-05, + "loss": 2.2505, + "step": 5435 + }, + { + "epoch": 1.0189315838800375, + "grad_norm": 45911.0859375, + "learning_rate": 8.289609092180231e-05, + "loss": 2.257, + "step": 5436 + }, + { + "epoch": 1.0191190253045923, + "grad_norm": 51193.234375, + "learning_rate": 8.289017283126249e-05, + "loss": 2.2096, + "step": 5437 + }, + { + "epoch": 1.0193064667291472, + "grad_norm": 46189.859375, + "learning_rate": 8.288425392837804e-05, + "loss": 2.3028, + "step": 5438 + }, + { + "epoch": 1.019493908153702, + "grad_norm": 48251.1875, + "learning_rate": 8.287833421329514e-05, + "loss": 2.2695, + "step": 5439 + }, + { + "epoch": 1.0196813495782568, + "grad_norm": 49227.48828125, + "learning_rate": 8.287241368616002e-05, + "loss": 2.2544, + "step": 5440 + }, + { + "epoch": 1.0198687910028117, + "grad_norm": 47301.34375, + "learning_rate": 8.28664923471189e-05, + "loss": 2.2781, + "step": 5441 + }, + { + "epoch": 1.0200562324273665, + "grad_norm": 52212.734375, + "learning_rate": 8.286057019631803e-05, + "loss": 2.2325, + "step": 5442 + }, + { + "epoch": 1.0202436738519212, + "grad_norm": 48509.90625, + "learning_rate": 8.285464723390368e-05, + "loss": 2.2022, + "step": 5443 + }, + { + "epoch": 1.020431115276476, + "grad_norm": 49407.58203125, + "learning_rate": 8.284872346002214e-05, + "loss": 2.2393, + "step": 5444 + }, + { + "epoch": 1.0206185567010309, + "grad_norm": 51169.5625, + "learning_rate": 8.284279887481973e-05, + "loss": 2.2558, + "step": 5445 + }, + { + "epoch": 1.0208059981255857, + "grad_norm": 47773.765625, + "learning_rate": 8.283687347844275e-05, + "loss": 2.264, + "step": 5446 + }, + { + "epoch": 1.0209934395501405, + "grad_norm": 50104.86328125, + "learning_rate": 8.283094727103759e-05, + "loss": 2.1964, + "step": 5447 + }, + { + "epoch": 1.0211808809746954, + "grad_norm": 46240.19140625, + "learning_rate": 8.282502025275059e-05, + "loss": 2.2294, + "step": 5448 + }, + { + "epoch": 1.0213683223992502, + "grad_norm": 51318.3984375, + "learning_rate": 8.281909242372816e-05, + "loss": 2.2702, + "step": 5449 + }, + { + "epoch": 1.021555763823805, + "grad_norm": 51366.03515625, + "learning_rate": 8.281316378411668e-05, + "loss": 2.2001, + "step": 5450 + }, + { + "epoch": 1.02174320524836, + "grad_norm": 50441.609375, + "learning_rate": 8.280723433406259e-05, + "loss": 2.2654, + "step": 5451 + }, + { + "epoch": 1.0219306466729148, + "grad_norm": 55241.01953125, + "learning_rate": 8.280130407371236e-05, + "loss": 2.2205, + "step": 5452 + }, + { + "epoch": 1.0221180880974696, + "grad_norm": 47943.50390625, + "learning_rate": 8.279537300321245e-05, + "loss": 2.2041, + "step": 5453 + }, + { + "epoch": 1.0223055295220245, + "grad_norm": 49458.234375, + "learning_rate": 8.278944112270935e-05, + "loss": 2.2555, + "step": 5454 + }, + { + "epoch": 1.022492970946579, + "grad_norm": 49684.37109375, + "learning_rate": 8.278350843234955e-05, + "loss": 2.2435, + "step": 5455 + }, + { + "epoch": 1.022680412371134, + "grad_norm": 51083.81640625, + "learning_rate": 8.27775749322796e-05, + "loss": 2.2108, + "step": 5456 + }, + { + "epoch": 1.0228678537956888, + "grad_norm": 50734.00390625, + "learning_rate": 8.277164062264606e-05, + "loss": 2.2523, + "step": 5457 + }, + { + "epoch": 1.0230552952202436, + "grad_norm": 49691.1015625, + "learning_rate": 8.276570550359548e-05, + "loss": 2.2205, + "step": 5458 + }, + { + "epoch": 1.0232427366447985, + "grad_norm": 54150.7265625, + "learning_rate": 8.275976957527445e-05, + "loss": 2.2529, + "step": 5459 + }, + { + "epoch": 1.0234301780693533, + "grad_norm": 52984.5078125, + "learning_rate": 8.27538328378296e-05, + "loss": 2.3124, + "step": 5460 + }, + { + "epoch": 1.0236176194939082, + "grad_norm": 53660.5234375, + "learning_rate": 8.274789529140752e-05, + "loss": 2.1877, + "step": 5461 + }, + { + "epoch": 1.023805060918463, + "grad_norm": 47993.359375, + "learning_rate": 8.27419569361549e-05, + "loss": 2.2525, + "step": 5462 + }, + { + "epoch": 1.0239925023430179, + "grad_norm": 49598.75, + "learning_rate": 8.273601777221839e-05, + "loss": 2.1837, + "step": 5463 + }, + { + "epoch": 1.0241799437675727, + "grad_norm": 48016.890625, + "learning_rate": 8.273007779974469e-05, + "loss": 2.2496, + "step": 5464 + }, + { + "epoch": 1.0243673851921276, + "grad_norm": 51379.84375, + "learning_rate": 8.272413701888048e-05, + "loss": 2.2824, + "step": 5465 + }, + { + "epoch": 1.0245548266166822, + "grad_norm": 51595.515625, + "learning_rate": 8.271819542977255e-05, + "loss": 2.1883, + "step": 5466 + }, + { + "epoch": 1.024742268041237, + "grad_norm": 48098.0234375, + "learning_rate": 8.271225303256759e-05, + "loss": 2.24, + "step": 5467 + }, + { + "epoch": 1.0249297094657919, + "grad_norm": 47993.11328125, + "learning_rate": 8.27063098274124e-05, + "loss": 2.2339, + "step": 5468 + }, + { + "epoch": 1.0251171508903467, + "grad_norm": 54316.81640625, + "learning_rate": 8.270036581445375e-05, + "loss": 2.2299, + "step": 5469 + }, + { + "epoch": 1.0253045923149016, + "grad_norm": 49383.1953125, + "learning_rate": 8.269442099383847e-05, + "loss": 2.1637, + "step": 5470 + }, + { + "epoch": 1.0254920337394564, + "grad_norm": 49829.90234375, + "learning_rate": 8.268847536571338e-05, + "loss": 2.1387, + "step": 5471 + }, + { + "epoch": 1.0256794751640113, + "grad_norm": 49217.81640625, + "learning_rate": 8.268252893022533e-05, + "loss": 2.2388, + "step": 5472 + }, + { + "epoch": 1.025866916588566, + "grad_norm": 49719.08984375, + "learning_rate": 8.26765816875212e-05, + "loss": 2.2263, + "step": 5473 + }, + { + "epoch": 1.026054358013121, + "grad_norm": 54751.71875, + "learning_rate": 8.267063363774785e-05, + "loss": 2.2409, + "step": 5474 + }, + { + "epoch": 1.0262417994376758, + "grad_norm": 51041.53125, + "learning_rate": 8.266468478105221e-05, + "loss": 2.33, + "step": 5475 + }, + { + "epoch": 1.0264292408622306, + "grad_norm": 48163.890625, + "learning_rate": 8.265873511758121e-05, + "loss": 2.2526, + "step": 5476 + }, + { + "epoch": 1.0266166822867855, + "grad_norm": 47903.109375, + "learning_rate": 8.26527846474818e-05, + "loss": 2.2599, + "step": 5477 + }, + { + "epoch": 1.02680412371134, + "grad_norm": 51259.40625, + "learning_rate": 8.264683337090093e-05, + "loss": 2.2749, + "step": 5478 + }, + { + "epoch": 1.026991565135895, + "grad_norm": 47205.3515625, + "learning_rate": 8.264088128798562e-05, + "loss": 2.2274, + "step": 5479 + }, + { + "epoch": 1.0271790065604498, + "grad_norm": 56563.953125, + "learning_rate": 8.263492839888285e-05, + "loss": 2.1849, + "step": 5480 + }, + { + "epoch": 1.0273664479850046, + "grad_norm": 50674.3046875, + "learning_rate": 8.262897470373967e-05, + "loss": 2.2445, + "step": 5481 + }, + { + "epoch": 1.0275538894095595, + "grad_norm": 50467.67578125, + "learning_rate": 8.26230202027031e-05, + "loss": 2.2116, + "step": 5482 + }, + { + "epoch": 1.0277413308341143, + "grad_norm": 48327.77734375, + "learning_rate": 8.261706489592025e-05, + "loss": 2.303, + "step": 5483 + }, + { + "epoch": 1.0279287722586692, + "grad_norm": 51412.46484375, + "learning_rate": 8.261110878353817e-05, + "loss": 2.2439, + "step": 5484 + }, + { + "epoch": 1.028116213683224, + "grad_norm": 50759.515625, + "learning_rate": 8.260515186570399e-05, + "loss": 2.2224, + "step": 5485 + }, + { + "epoch": 1.0283036551077789, + "grad_norm": 54179.98046875, + "learning_rate": 8.259919414256484e-05, + "loss": 2.2414, + "step": 5486 + }, + { + "epoch": 1.0284910965323337, + "grad_norm": 45429.6875, + "learning_rate": 8.259323561426786e-05, + "loss": 2.2841, + "step": 5487 + }, + { + "epoch": 1.0286785379568886, + "grad_norm": 52074.26171875, + "learning_rate": 8.258727628096021e-05, + "loss": 2.2223, + "step": 5488 + }, + { + "epoch": 1.0288659793814432, + "grad_norm": 46696.953125, + "learning_rate": 8.258131614278907e-05, + "loss": 2.2442, + "step": 5489 + }, + { + "epoch": 1.029053420805998, + "grad_norm": 51723.6171875, + "learning_rate": 8.257535519990169e-05, + "loss": 2.2053, + "step": 5490 + }, + { + "epoch": 1.0292408622305529, + "grad_norm": 52265.1953125, + "learning_rate": 8.256939345244527e-05, + "loss": 2.2252, + "step": 5491 + }, + { + "epoch": 1.0294283036551077, + "grad_norm": 50396.6796875, + "learning_rate": 8.256343090056703e-05, + "loss": 2.2032, + "step": 5492 + }, + { + "epoch": 1.0296157450796626, + "grad_norm": 49153.921875, + "learning_rate": 8.255746754441429e-05, + "loss": 2.2384, + "step": 5493 + }, + { + "epoch": 1.0298031865042174, + "grad_norm": 51907.61328125, + "learning_rate": 8.25515033841343e-05, + "loss": 2.2373, + "step": 5494 + }, + { + "epoch": 1.0299906279287723, + "grad_norm": 54910.2265625, + "learning_rate": 8.25455384198744e-05, + "loss": 2.495, + "step": 5495 + }, + { + "epoch": 1.0301780693533271, + "grad_norm": 49727.74609375, + "learning_rate": 8.253957265178189e-05, + "loss": 2.3046, + "step": 5496 + }, + { + "epoch": 1.030365510777882, + "grad_norm": 50502.91796875, + "learning_rate": 8.253360608000411e-05, + "loss": 2.5094, + "step": 5497 + }, + { + "epoch": 1.0305529522024368, + "grad_norm": 51889.3515625, + "learning_rate": 8.252763870468847e-05, + "loss": 2.2381, + "step": 5498 + }, + { + "epoch": 1.0307403936269917, + "grad_norm": 54245.57421875, + "learning_rate": 8.252167052598229e-05, + "loss": 2.2189, + "step": 5499 + }, + { + "epoch": 1.0309278350515463, + "grad_norm": 48121.98828125, + "learning_rate": 8.251570154403304e-05, + "loss": 2.417, + "step": 5500 + }, + { + "epoch": 1.0309278350515463, + "eval_loss": 2.314683675765991, + "eval_runtime": 133.2327, + "eval_samples_per_second": 37.896, + "eval_steps_per_second": 1.899, + "step": 5500 + }, + { + "epoch": 1.0311152764761011, + "grad_norm": 50045.1015625, + "learning_rate": 8.250973175898811e-05, + "loss": 2.2763, + "step": 5501 + }, + { + "epoch": 1.031302717900656, + "grad_norm": 57252.625, + "learning_rate": 8.250376117099495e-05, + "loss": 2.2082, + "step": 5502 + }, + { + "epoch": 1.0314901593252108, + "grad_norm": 54341.5078125, + "learning_rate": 8.249778978020104e-05, + "loss": 2.3445, + "step": 5503 + }, + { + "epoch": 1.0316776007497657, + "grad_norm": 49857.9140625, + "learning_rate": 8.249181758675384e-05, + "loss": 2.2731, + "step": 5504 + }, + { + "epoch": 1.0318650421743205, + "grad_norm": 48062.00390625, + "learning_rate": 8.248584459080088e-05, + "loss": 2.2342, + "step": 5505 + }, + { + "epoch": 1.0320524835988754, + "grad_norm": 51568.7265625, + "learning_rate": 8.247987079248969e-05, + "loss": 2.2699, + "step": 5506 + }, + { + "epoch": 1.0322399250234302, + "grad_norm": 62128.40625, + "learning_rate": 8.24738961919678e-05, + "loss": 2.1829, + "step": 5507 + }, + { + "epoch": 1.032427366447985, + "grad_norm": 48181.6328125, + "learning_rate": 8.246792078938275e-05, + "loss": 2.1891, + "step": 5508 + }, + { + "epoch": 1.03261480787254, + "grad_norm": 52000.89453125, + "learning_rate": 8.246194458488219e-05, + "loss": 2.2434, + "step": 5509 + }, + { + "epoch": 1.0328022492970947, + "grad_norm": 49577.41796875, + "learning_rate": 8.245596757861366e-05, + "loss": 2.2273, + "step": 5510 + }, + { + "epoch": 1.0329896907216496, + "grad_norm": 48797.1328125, + "learning_rate": 8.244998977072482e-05, + "loss": 2.2841, + "step": 5511 + }, + { + "epoch": 1.0331771321462042, + "grad_norm": 53833.421875, + "learning_rate": 8.24440111613633e-05, + "loss": 2.1975, + "step": 5512 + }, + { + "epoch": 1.033364573570759, + "grad_norm": 47061.46875, + "learning_rate": 8.243803175067677e-05, + "loss": 2.2847, + "step": 5513 + }, + { + "epoch": 1.033552014995314, + "grad_norm": 50590.68359375, + "learning_rate": 8.243205153881292e-05, + "loss": 2.1976, + "step": 5514 + }, + { + "epoch": 1.0337394564198688, + "grad_norm": 53118.375, + "learning_rate": 8.242607052591944e-05, + "loss": 2.2377, + "step": 5515 + }, + { + "epoch": 1.0339268978444236, + "grad_norm": 49126.7890625, + "learning_rate": 8.242008871214406e-05, + "loss": 2.3096, + "step": 5516 + }, + { + "epoch": 1.0341143392689784, + "grad_norm": 48399.31640625, + "learning_rate": 8.241410609763452e-05, + "loss": 2.2028, + "step": 5517 + }, + { + "epoch": 1.0343017806935333, + "grad_norm": 52042.67578125, + "learning_rate": 8.240812268253858e-05, + "loss": 2.2902, + "step": 5518 + }, + { + "epoch": 1.0344892221180881, + "grad_norm": 49140.94921875, + "learning_rate": 8.240213846700403e-05, + "loss": 2.2834, + "step": 5519 + }, + { + "epoch": 1.034676663542643, + "grad_norm": 49024.45703125, + "learning_rate": 8.239615345117869e-05, + "loss": 2.2644, + "step": 5520 + }, + { + "epoch": 1.0348641049671978, + "grad_norm": 50807.0234375, + "learning_rate": 8.239016763521032e-05, + "loss": 2.1608, + "step": 5521 + }, + { + "epoch": 1.0350515463917527, + "grad_norm": 64884.6875, + "learning_rate": 8.238418101924684e-05, + "loss": 2.1529, + "step": 5522 + }, + { + "epoch": 1.0352389878163073, + "grad_norm": 48653.953125, + "learning_rate": 8.237819360343606e-05, + "loss": 2.2207, + "step": 5523 + }, + { + "epoch": 1.0354264292408621, + "grad_norm": 58241.41015625, + "learning_rate": 8.237220538792588e-05, + "loss": 2.2611, + "step": 5524 + }, + { + "epoch": 1.035613870665417, + "grad_norm": 51383.5703125, + "learning_rate": 8.23662163728642e-05, + "loss": 2.1343, + "step": 5525 + }, + { + "epoch": 1.0358013120899718, + "grad_norm": 53597.3046875, + "learning_rate": 8.236022655839894e-05, + "loss": 2.2111, + "step": 5526 + }, + { + "epoch": 1.0359887535145267, + "grad_norm": 52480.40625, + "learning_rate": 8.235423594467804e-05, + "loss": 2.237, + "step": 5527 + }, + { + "epoch": 1.0361761949390815, + "grad_norm": 55604.9453125, + "learning_rate": 8.234824453184946e-05, + "loss": 2.1847, + "step": 5528 + }, + { + "epoch": 1.0363636363636364, + "grad_norm": 58030.42578125, + "learning_rate": 8.234225232006117e-05, + "loss": 2.1948, + "step": 5529 + }, + { + "epoch": 1.0365510777881912, + "grad_norm": 51221.23828125, + "learning_rate": 8.23362593094612e-05, + "loss": 2.2051, + "step": 5530 + }, + { + "epoch": 1.036738519212746, + "grad_norm": 55435.92578125, + "learning_rate": 8.233026550019753e-05, + "loss": 2.2701, + "step": 5531 + }, + { + "epoch": 1.036925960637301, + "grad_norm": 52218.765625, + "learning_rate": 8.232427089241823e-05, + "loss": 2.2597, + "step": 5532 + }, + { + "epoch": 1.0371134020618558, + "grad_norm": 53156.9375, + "learning_rate": 8.231827548627135e-05, + "loss": 2.2139, + "step": 5533 + }, + { + "epoch": 1.0373008434864106, + "grad_norm": 47856.68359375, + "learning_rate": 8.231227928190494e-05, + "loss": 2.316, + "step": 5534 + }, + { + "epoch": 1.0374882849109652, + "grad_norm": 54871.61328125, + "learning_rate": 8.230628227946716e-05, + "loss": 2.2769, + "step": 5535 + }, + { + "epoch": 1.03767572633552, + "grad_norm": 49532.34765625, + "learning_rate": 8.230028447910607e-05, + "loss": 2.3014, + "step": 5536 + }, + { + "epoch": 1.037863167760075, + "grad_norm": 52065.22265625, + "learning_rate": 8.229428588096985e-05, + "loss": 2.2998, + "step": 5537 + }, + { + "epoch": 1.0380506091846298, + "grad_norm": 52168.99609375, + "learning_rate": 8.228828648520663e-05, + "loss": 2.27, + "step": 5538 + }, + { + "epoch": 1.0382380506091846, + "grad_norm": 50845.5390625, + "learning_rate": 8.228228629196459e-05, + "loss": 2.2146, + "step": 5539 + }, + { + "epoch": 1.0384254920337395, + "grad_norm": 53359.92578125, + "learning_rate": 8.227628530139195e-05, + "loss": 2.2351, + "step": 5540 + }, + { + "epoch": 1.0386129334582943, + "grad_norm": 50845.67578125, + "learning_rate": 8.227028351363691e-05, + "loss": 2.2157, + "step": 5541 + }, + { + "epoch": 1.0388003748828492, + "grad_norm": 49615.671875, + "learning_rate": 8.226428092884769e-05, + "loss": 2.3132, + "step": 5542 + }, + { + "epoch": 1.038987816307404, + "grad_norm": 49788.69921875, + "learning_rate": 8.225827754717257e-05, + "loss": 2.2892, + "step": 5543 + }, + { + "epoch": 1.0391752577319588, + "grad_norm": 59761.34375, + "learning_rate": 8.22522733687598e-05, + "loss": 2.2224, + "step": 5544 + }, + { + "epoch": 1.0393626991565137, + "grad_norm": 49146.11328125, + "learning_rate": 8.224626839375772e-05, + "loss": 2.2569, + "step": 5545 + }, + { + "epoch": 1.0395501405810683, + "grad_norm": 59784.015625, + "learning_rate": 8.224026262231461e-05, + "loss": 2.2808, + "step": 5546 + }, + { + "epoch": 1.0397375820056232, + "grad_norm": 53230.1796875, + "learning_rate": 8.223425605457881e-05, + "loss": 2.2436, + "step": 5547 + }, + { + "epoch": 1.039925023430178, + "grad_norm": 53857.078125, + "learning_rate": 8.222824869069868e-05, + "loss": 2.2467, + "step": 5548 + }, + { + "epoch": 1.0401124648547329, + "grad_norm": 49327.58203125, + "learning_rate": 8.22222405308226e-05, + "loss": 2.1769, + "step": 5549 + }, + { + "epoch": 1.0402999062792877, + "grad_norm": 55470.63671875, + "learning_rate": 8.221623157509894e-05, + "loss": 2.1696, + "step": 5550 + }, + { + "epoch": 1.0404873477038425, + "grad_norm": 51281.109375, + "learning_rate": 8.221022182367613e-05, + "loss": 2.2249, + "step": 5551 + }, + { + "epoch": 1.0406747891283974, + "grad_norm": 48934.6875, + "learning_rate": 8.220421127670261e-05, + "loss": 2.2654, + "step": 5552 + }, + { + "epoch": 1.0408622305529522, + "grad_norm": 52532.375, + "learning_rate": 8.219819993432683e-05, + "loss": 2.2545, + "step": 5553 + }, + { + "epoch": 1.041049671977507, + "grad_norm": 50695.2578125, + "learning_rate": 8.219218779669724e-05, + "loss": 2.2771, + "step": 5554 + }, + { + "epoch": 1.041237113402062, + "grad_norm": 49406.77734375, + "learning_rate": 8.218617486396236e-05, + "loss": 2.2575, + "step": 5555 + }, + { + "epoch": 1.0414245548266168, + "grad_norm": 51609.5625, + "learning_rate": 8.21801611362707e-05, + "loss": 2.1965, + "step": 5556 + }, + { + "epoch": 1.0416119962511714, + "grad_norm": 50110.09765625, + "learning_rate": 8.217414661377077e-05, + "loss": 2.3149, + "step": 5557 + }, + { + "epoch": 1.0417994376757262, + "grad_norm": 47935.01171875, + "learning_rate": 8.216813129661113e-05, + "loss": 2.2861, + "step": 5558 + }, + { + "epoch": 1.041986879100281, + "grad_norm": 52519.3203125, + "learning_rate": 8.216211518494037e-05, + "loss": 2.2382, + "step": 5559 + }, + { + "epoch": 1.042174320524836, + "grad_norm": 51534.54296875, + "learning_rate": 8.215609827890704e-05, + "loss": 2.2372, + "step": 5560 + }, + { + "epoch": 1.0423617619493908, + "grad_norm": 51520.515625, + "learning_rate": 8.21500805786598e-05, + "loss": 2.2293, + "step": 5561 + }, + { + "epoch": 1.0425492033739456, + "grad_norm": 57371.2265625, + "learning_rate": 8.214406208434723e-05, + "loss": 2.2404, + "step": 5562 + }, + { + "epoch": 1.0427366447985005, + "grad_norm": 53756.609375, + "learning_rate": 8.213804279611801e-05, + "loss": 2.2437, + "step": 5563 + }, + { + "epoch": 1.0429240862230553, + "grad_norm": 54767.4453125, + "learning_rate": 8.213202271412079e-05, + "loss": 2.2817, + "step": 5564 + }, + { + "epoch": 1.0431115276476102, + "grad_norm": 52593.28125, + "learning_rate": 8.212600183850429e-05, + "loss": 2.2201, + "step": 5565 + }, + { + "epoch": 1.043298969072165, + "grad_norm": 50276.62109375, + "learning_rate": 8.211998016941719e-05, + "loss": 2.2649, + "step": 5566 + }, + { + "epoch": 1.0434864104967199, + "grad_norm": 50255.6640625, + "learning_rate": 8.211395770700823e-05, + "loss": 2.2146, + "step": 5567 + }, + { + "epoch": 1.0436738519212747, + "grad_norm": 51710.328125, + "learning_rate": 8.210793445142613e-05, + "loss": 2.2631, + "step": 5568 + }, + { + "epoch": 1.0438612933458293, + "grad_norm": 52443.68359375, + "learning_rate": 8.210191040281969e-05, + "loss": 2.2472, + "step": 5569 + }, + { + "epoch": 1.0440487347703842, + "grad_norm": 52797.11328125, + "learning_rate": 8.20958855613377e-05, + "loss": 2.2499, + "step": 5570 + }, + { + "epoch": 1.044236176194939, + "grad_norm": 53448.296875, + "learning_rate": 8.208985992712891e-05, + "loss": 2.2312, + "step": 5571 + }, + { + "epoch": 1.0444236176194939, + "grad_norm": 51397.9609375, + "learning_rate": 8.208383350034221e-05, + "loss": 2.2407, + "step": 5572 + }, + { + "epoch": 1.0446110590440487, + "grad_norm": 48932.90625, + "learning_rate": 8.207780628112643e-05, + "loss": 2.2791, + "step": 5573 + }, + { + "epoch": 1.0447985004686036, + "grad_norm": 51597.40234375, + "learning_rate": 8.20717782696304e-05, + "loss": 2.1538, + "step": 5574 + }, + { + "epoch": 1.0449859418931584, + "grad_norm": 49508.78515625, + "learning_rate": 8.206574946600305e-05, + "loss": 2.2449, + "step": 5575 + }, + { + "epoch": 1.0451733833177133, + "grad_norm": 52373.00390625, + "learning_rate": 8.205971987039324e-05, + "loss": 2.2889, + "step": 5576 + }, + { + "epoch": 1.045360824742268, + "grad_norm": 52843.43359375, + "learning_rate": 8.205368948294994e-05, + "loss": 2.2529, + "step": 5577 + }, + { + "epoch": 1.045548266166823, + "grad_norm": 52942.5, + "learning_rate": 8.204765830382205e-05, + "loss": 2.2007, + "step": 5578 + }, + { + "epoch": 1.0457357075913778, + "grad_norm": 47966.8515625, + "learning_rate": 8.204162633315855e-05, + "loss": 2.2347, + "step": 5579 + }, + { + "epoch": 1.0459231490159324, + "grad_norm": 50006.28125, + "learning_rate": 8.203559357110844e-05, + "loss": 2.2329, + "step": 5580 + }, + { + "epoch": 1.0461105904404873, + "grad_norm": 50708.1484375, + "learning_rate": 8.202956001782069e-05, + "loss": 2.3237, + "step": 5581 + }, + { + "epoch": 1.046298031865042, + "grad_norm": 47207.77734375, + "learning_rate": 8.202352567344433e-05, + "loss": 2.246, + "step": 5582 + }, + { + "epoch": 1.046485473289597, + "grad_norm": 49963.37109375, + "learning_rate": 8.201749053812841e-05, + "loss": 2.2121, + "step": 5583 + }, + { + "epoch": 1.0466729147141518, + "grad_norm": 52148.375, + "learning_rate": 8.2011454612022e-05, + "loss": 2.266, + "step": 5584 + }, + { + "epoch": 1.0468603561387066, + "grad_norm": 53816.33984375, + "learning_rate": 8.200541789527414e-05, + "loss": 2.2015, + "step": 5585 + }, + { + "epoch": 1.0470477975632615, + "grad_norm": 50611.3671875, + "learning_rate": 8.199938038803396e-05, + "loss": 2.2503, + "step": 5586 + }, + { + "epoch": 1.0472352389878163, + "grad_norm": 48887.5859375, + "learning_rate": 8.199334209045058e-05, + "loss": 2.168, + "step": 5587 + }, + { + "epoch": 1.0474226804123712, + "grad_norm": 49856.65625, + "learning_rate": 8.198730300267314e-05, + "loss": 2.2446, + "step": 5588 + }, + { + "epoch": 1.047610121836926, + "grad_norm": 52126.84375, + "learning_rate": 8.198126312485077e-05, + "loss": 2.192, + "step": 5589 + }, + { + "epoch": 1.0477975632614809, + "grad_norm": 51254.6953125, + "learning_rate": 8.197522245713267e-05, + "loss": 2.219, + "step": 5590 + }, + { + "epoch": 1.0479850046860357, + "grad_norm": 51937.34375, + "learning_rate": 8.196918099966803e-05, + "loss": 2.2571, + "step": 5591 + }, + { + "epoch": 1.0481724461105904, + "grad_norm": 49588.484375, + "learning_rate": 8.196313875260606e-05, + "loss": 2.1625, + "step": 5592 + }, + { + "epoch": 1.0483598875351452, + "grad_norm": 46407.765625, + "learning_rate": 8.195709571609602e-05, + "loss": 2.155, + "step": 5593 + }, + { + "epoch": 1.0485473289597, + "grad_norm": 52187.61328125, + "learning_rate": 8.195105189028714e-05, + "loss": 2.2104, + "step": 5594 + }, + { + "epoch": 1.0487347703842549, + "grad_norm": 52824.96484375, + "learning_rate": 8.194500727532871e-05, + "loss": 2.21, + "step": 5595 + }, + { + "epoch": 1.0489222118088097, + "grad_norm": 49055.03125, + "learning_rate": 8.193896187137e-05, + "loss": 2.1879, + "step": 5596 + }, + { + "epoch": 1.0491096532333646, + "grad_norm": 50499.01953125, + "learning_rate": 8.193291567856036e-05, + "loss": 2.4785, + "step": 5597 + }, + { + "epoch": 1.0492970946579194, + "grad_norm": 51052.83984375, + "learning_rate": 8.192686869704911e-05, + "loss": 2.2188, + "step": 5598 + }, + { + "epoch": 1.0494845360824743, + "grad_norm": 50814.46484375, + "learning_rate": 8.192082092698558e-05, + "loss": 2.3032, + "step": 5599 + }, + { + "epoch": 1.0496719775070291, + "grad_norm": 53938.71875, + "learning_rate": 8.191477236851916e-05, + "loss": 2.2282, + "step": 5600 + }, + { + "epoch": 1.049859418931584, + "grad_norm": 54145.14453125, + "learning_rate": 8.190872302179925e-05, + "loss": 2.2663, + "step": 5601 + }, + { + "epoch": 1.0500468603561388, + "grad_norm": 53240.45703125, + "learning_rate": 8.190267288697525e-05, + "loss": 2.2713, + "step": 5602 + }, + { + "epoch": 1.0502343017806934, + "grad_norm": 48836.48046875, + "learning_rate": 8.189662196419658e-05, + "loss": 2.2448, + "step": 5603 + }, + { + "epoch": 1.0504217432052483, + "grad_norm": 53605.3671875, + "learning_rate": 8.189057025361273e-05, + "loss": 2.1885, + "step": 5604 + }, + { + "epoch": 1.0506091846298031, + "grad_norm": 51342.71875, + "learning_rate": 8.188451775537311e-05, + "loss": 2.2939, + "step": 5605 + }, + { + "epoch": 1.050796626054358, + "grad_norm": 51101.77734375, + "learning_rate": 8.187846446962726e-05, + "loss": 2.2776, + "step": 5606 + }, + { + "epoch": 1.0509840674789128, + "grad_norm": 49156.87890625, + "learning_rate": 8.187241039652467e-05, + "loss": 2.2609, + "step": 5607 + }, + { + "epoch": 1.0511715089034677, + "grad_norm": 51105.62890625, + "learning_rate": 8.186635553621484e-05, + "loss": 2.2286, + "step": 5608 + }, + { + "epoch": 1.0513589503280225, + "grad_norm": 55059.2578125, + "learning_rate": 8.186029988884738e-05, + "loss": 2.2429, + "step": 5609 + }, + { + "epoch": 1.0515463917525774, + "grad_norm": 50777.671875, + "learning_rate": 8.18542434545718e-05, + "loss": 2.298, + "step": 5610 + }, + { + "epoch": 1.0517338331771322, + "grad_norm": 57441.45703125, + "learning_rate": 8.18481862335377e-05, + "loss": 2.1184, + "step": 5611 + }, + { + "epoch": 1.051921274601687, + "grad_norm": 52048.0234375, + "learning_rate": 8.18421282258947e-05, + "loss": 2.2591, + "step": 5612 + }, + { + "epoch": 1.052108716026242, + "grad_norm": 50215.58203125, + "learning_rate": 8.183606943179242e-05, + "loss": 2.2983, + "step": 5613 + }, + { + "epoch": 1.0522961574507965, + "grad_norm": 108494.3046875, + "learning_rate": 8.18300098513805e-05, + "loss": 2.2214, + "step": 5614 + }, + { + "epoch": 1.0524835988753514, + "grad_norm": 51630.828125, + "learning_rate": 8.18239494848086e-05, + "loss": 2.2099, + "step": 5615 + }, + { + "epoch": 1.0526710402999062, + "grad_norm": 53887.84765625, + "learning_rate": 8.181788833222642e-05, + "loss": 2.2461, + "step": 5616 + }, + { + "epoch": 1.052858481724461, + "grad_norm": 49327.046875, + "learning_rate": 8.181182639378364e-05, + "loss": 2.2367, + "step": 5617 + }, + { + "epoch": 1.053045923149016, + "grad_norm": 53008.9140625, + "learning_rate": 8.180576366962998e-05, + "loss": 2.2194, + "step": 5618 + }, + { + "epoch": 1.0532333645735708, + "grad_norm": 51365.2578125, + "learning_rate": 8.179970015991523e-05, + "loss": 2.2166, + "step": 5619 + }, + { + "epoch": 1.0534208059981256, + "grad_norm": 49245.61328125, + "learning_rate": 8.179363586478908e-05, + "loss": 2.2608, + "step": 5620 + }, + { + "epoch": 1.0536082474226804, + "grad_norm": 53225.2734375, + "learning_rate": 8.178757078440136e-05, + "loss": 2.2545, + "step": 5621 + }, + { + "epoch": 1.0537956888472353, + "grad_norm": 51123.23828125, + "learning_rate": 8.178150491890185e-05, + "loss": 2.2578, + "step": 5622 + }, + { + "epoch": 1.0539831302717901, + "grad_norm": 54730.80859375, + "learning_rate": 8.177543826844039e-05, + "loss": 2.2658, + "step": 5623 + }, + { + "epoch": 1.054170571696345, + "grad_norm": 46219.51953125, + "learning_rate": 8.176937083316678e-05, + "loss": 2.2488, + "step": 5624 + }, + { + "epoch": 1.0543580131208996, + "grad_norm": 56726.2421875, + "learning_rate": 8.176330261323091e-05, + "loss": 2.2924, + "step": 5625 + }, + { + "epoch": 1.0545454545454545, + "grad_norm": 51398.0546875, + "learning_rate": 8.175723360878265e-05, + "loss": 2.2066, + "step": 5626 + }, + { + "epoch": 1.0547328959700093, + "grad_norm": 49605.3828125, + "learning_rate": 8.17511638199719e-05, + "loss": 2.2869, + "step": 5627 + }, + { + "epoch": 1.0549203373945641, + "grad_norm": 51792.99609375, + "learning_rate": 8.174509324694856e-05, + "loss": 2.2341, + "step": 5628 + }, + { + "epoch": 1.055107778819119, + "grad_norm": 54179.796875, + "learning_rate": 8.173902188986256e-05, + "loss": 2.2099, + "step": 5629 + }, + { + "epoch": 1.0552952202436738, + "grad_norm": 58787.98828125, + "learning_rate": 8.17329497488639e-05, + "loss": 2.1789, + "step": 5630 + }, + { + "epoch": 1.0554826616682287, + "grad_norm": 48263.2421875, + "learning_rate": 8.17268768241025e-05, + "loss": 2.2256, + "step": 5631 + }, + { + "epoch": 1.0556701030927835, + "grad_norm": 51508.9921875, + "learning_rate": 8.172080311572838e-05, + "loss": 2.2514, + "step": 5632 + }, + { + "epoch": 1.0558575445173384, + "grad_norm": 49174.73046875, + "learning_rate": 8.171472862389155e-05, + "loss": 2.2421, + "step": 5633 + }, + { + "epoch": 1.0560449859418932, + "grad_norm": 49624.23828125, + "learning_rate": 8.170865334874206e-05, + "loss": 2.2939, + "step": 5634 + }, + { + "epoch": 1.056232427366448, + "grad_norm": 49188.93359375, + "learning_rate": 8.170257729042992e-05, + "loss": 2.2405, + "step": 5635 + }, + { + "epoch": 1.056419868791003, + "grad_norm": 49819.27734375, + "learning_rate": 8.169650044910524e-05, + "loss": 2.2101, + "step": 5636 + }, + { + "epoch": 1.0566073102155575, + "grad_norm": 49636.6796875, + "learning_rate": 8.169042282491809e-05, + "loss": 2.3165, + "step": 5637 + }, + { + "epoch": 1.0567947516401124, + "grad_norm": 48737.484375, + "learning_rate": 8.168434441801857e-05, + "loss": 2.2605, + "step": 5638 + }, + { + "epoch": 1.0569821930646672, + "grad_norm": 49247.37890625, + "learning_rate": 8.167826522855685e-05, + "loss": 2.3303, + "step": 5639 + }, + { + "epoch": 1.057169634489222, + "grad_norm": 54184.265625, + "learning_rate": 8.167218525668303e-05, + "loss": 2.2413, + "step": 5640 + }, + { + "epoch": 1.057357075913777, + "grad_norm": 47586.22265625, + "learning_rate": 8.166610450254731e-05, + "loss": 2.2413, + "step": 5641 + }, + { + "epoch": 1.0575445173383318, + "grad_norm": 48431.36328125, + "learning_rate": 8.166002296629984e-05, + "loss": 2.277, + "step": 5642 + }, + { + "epoch": 1.0577319587628866, + "grad_norm": 56540.25390625, + "learning_rate": 8.165394064809087e-05, + "loss": 2.2914, + "step": 5643 + }, + { + "epoch": 1.0579194001874415, + "grad_norm": 51378.73828125, + "learning_rate": 8.16478575480706e-05, + "loss": 2.1809, + "step": 5644 + }, + { + "epoch": 1.0581068416119963, + "grad_norm": 48294.0859375, + "learning_rate": 8.16417736663893e-05, + "loss": 2.2659, + "step": 5645 + }, + { + "epoch": 1.0582942830365512, + "grad_norm": 67580.9765625, + "learning_rate": 8.163568900319717e-05, + "loss": 2.2063, + "step": 5646 + }, + { + "epoch": 1.058481724461106, + "grad_norm": 50887.08984375, + "learning_rate": 8.162960355864458e-05, + "loss": 2.2329, + "step": 5647 + }, + { + "epoch": 1.0586691658856606, + "grad_norm": 51037.828125, + "learning_rate": 8.162351733288179e-05, + "loss": 2.2106, + "step": 5648 + }, + { + "epoch": 1.0588566073102155, + "grad_norm": 52122.45703125, + "learning_rate": 8.16174303260591e-05, + "loss": 2.2841, + "step": 5649 + }, + { + "epoch": 1.0590440487347703, + "grad_norm": 54992.28515625, + "learning_rate": 8.161134253832687e-05, + "loss": 2.2964, + "step": 5650 + }, + { + "epoch": 1.0592314901593252, + "grad_norm": 54201.1796875, + "learning_rate": 8.16052539698355e-05, + "loss": 2.2408, + "step": 5651 + }, + { + "epoch": 1.05941893158388, + "grad_norm": 54833.4375, + "learning_rate": 8.159916462073529e-05, + "loss": 2.219, + "step": 5652 + }, + { + "epoch": 1.0596063730084349, + "grad_norm": 50665.40625, + "learning_rate": 8.159307449117671e-05, + "loss": 2.1714, + "step": 5653 + }, + { + "epoch": 1.0597938144329897, + "grad_norm": 51659.83984375, + "learning_rate": 8.158698358131015e-05, + "loss": 2.1776, + "step": 5654 + }, + { + "epoch": 1.0599812558575445, + "grad_norm": 50546.0703125, + "learning_rate": 8.158089189128603e-05, + "loss": 2.2907, + "step": 5655 + }, + { + "epoch": 1.0601686972820994, + "grad_norm": 48940.1015625, + "learning_rate": 8.157479942125485e-05, + "loss": 2.2309, + "step": 5656 + }, + { + "epoch": 1.0603561387066542, + "grad_norm": 48784.1015625, + "learning_rate": 8.156870617136704e-05, + "loss": 2.229, + "step": 5657 + }, + { + "epoch": 1.060543580131209, + "grad_norm": 52705.59765625, + "learning_rate": 8.156261214177315e-05, + "loss": 2.1875, + "step": 5658 + }, + { + "epoch": 1.060731021555764, + "grad_norm": 55260.484375, + "learning_rate": 8.155651733262362e-05, + "loss": 2.2663, + "step": 5659 + }, + { + "epoch": 1.0609184629803186, + "grad_norm": 50326.8984375, + "learning_rate": 8.155042174406903e-05, + "loss": 2.2421, + "step": 5660 + }, + { + "epoch": 1.0611059044048734, + "grad_norm": 49337.9140625, + "learning_rate": 8.154432537625996e-05, + "loss": 2.2907, + "step": 5661 + }, + { + "epoch": 1.0612933458294282, + "grad_norm": 48823.54296875, + "learning_rate": 8.153822822934694e-05, + "loss": 2.2612, + "step": 5662 + }, + { + "epoch": 1.061480787253983, + "grad_norm": 52762.08203125, + "learning_rate": 8.153213030348055e-05, + "loss": 2.2163, + "step": 5663 + }, + { + "epoch": 1.061668228678538, + "grad_norm": 51048.1015625, + "learning_rate": 8.152603159881145e-05, + "loss": 2.2157, + "step": 5664 + }, + { + "epoch": 1.0618556701030928, + "grad_norm": 52075.03515625, + "learning_rate": 8.151993211549022e-05, + "loss": 2.2373, + "step": 5665 + }, + { + "epoch": 1.0620431115276476, + "grad_norm": 48401.125, + "learning_rate": 8.151383185366755e-05, + "loss": 2.2538, + "step": 5666 + }, + { + "epoch": 1.0622305529522025, + "grad_norm": 47471.62109375, + "learning_rate": 8.15077308134941e-05, + "loss": 2.203, + "step": 5667 + }, + { + "epoch": 1.0624179943767573, + "grad_norm": 55507.0390625, + "learning_rate": 8.150162899512053e-05, + "loss": 2.1941, + "step": 5668 + }, + { + "epoch": 1.0626054358013122, + "grad_norm": 56734.6953125, + "learning_rate": 8.149552639869757e-05, + "loss": 2.26, + "step": 5669 + }, + { + "epoch": 1.062792877225867, + "grad_norm": 47903.33984375, + "learning_rate": 8.148942302437595e-05, + "loss": 2.2398, + "step": 5670 + }, + { + "epoch": 1.0629803186504216, + "grad_norm": 53048.4375, + "learning_rate": 8.14833188723064e-05, + "loss": 2.2483, + "step": 5671 + }, + { + "epoch": 1.0631677600749765, + "grad_norm": 51560.75, + "learning_rate": 8.147721394263969e-05, + "loss": 2.3554, + "step": 5672 + }, + { + "epoch": 1.0633552014995313, + "grad_norm": 53460.09765625, + "learning_rate": 8.147110823552663e-05, + "loss": 2.2477, + "step": 5673 + }, + { + "epoch": 1.0635426429240862, + "grad_norm": 53955.609375, + "learning_rate": 8.146500175111797e-05, + "loss": 2.2305, + "step": 5674 + }, + { + "epoch": 1.063730084348641, + "grad_norm": 51413.9453125, + "learning_rate": 8.145889448956459e-05, + "loss": 2.1931, + "step": 5675 + }, + { + "epoch": 1.0639175257731959, + "grad_norm": 53301.98046875, + "learning_rate": 8.145278645101728e-05, + "loss": 2.2474, + "step": 5676 + }, + { + "epoch": 1.0641049671977507, + "grad_norm": 51272.171875, + "learning_rate": 8.144667763562694e-05, + "loss": 2.2971, + "step": 5677 + }, + { + "epoch": 1.0642924086223056, + "grad_norm": 48861.17578125, + "learning_rate": 8.14405680435444e-05, + "loss": 2.2537, + "step": 5678 + }, + { + "epoch": 1.0644798500468604, + "grad_norm": 48415.48046875, + "learning_rate": 8.143445767492063e-05, + "loss": 2.2303, + "step": 5679 + }, + { + "epoch": 1.0646672914714153, + "grad_norm": 49946.66796875, + "learning_rate": 8.14283465299065e-05, + "loss": 2.3518, + "step": 5680 + }, + { + "epoch": 1.06485473289597, + "grad_norm": 47502.21875, + "learning_rate": 8.142223460865296e-05, + "loss": 2.2642, + "step": 5681 + }, + { + "epoch": 1.0650421743205247, + "grad_norm": 48577.921875, + "learning_rate": 8.141612191131096e-05, + "loss": 2.2839, + "step": 5682 + }, + { + "epoch": 1.0652296157450796, + "grad_norm": 47840.921875, + "learning_rate": 8.141000843803148e-05, + "loss": 2.1869, + "step": 5683 + }, + { + "epoch": 1.0654170571696344, + "grad_norm": 51328.8203125, + "learning_rate": 8.140389418896553e-05, + "loss": 2.2766, + "step": 5684 + }, + { + "epoch": 1.0656044985941893, + "grad_norm": 50909.2421875, + "learning_rate": 8.139777916426408e-05, + "loss": 2.2404, + "step": 5685 + }, + { + "epoch": 1.065791940018744, + "grad_norm": 49119.08984375, + "learning_rate": 8.139166336407822e-05, + "loss": 2.2235, + "step": 5686 + }, + { + "epoch": 1.065979381443299, + "grad_norm": 52954.59375, + "learning_rate": 8.138554678855895e-05, + "loss": 2.2933, + "step": 5687 + }, + { + "epoch": 1.0661668228678538, + "grad_norm": 50278.35546875, + "learning_rate": 8.13794294378574e-05, + "loss": 2.2899, + "step": 5688 + }, + { + "epoch": 1.0663542642924086, + "grad_norm": 50373.32421875, + "learning_rate": 8.137331131212459e-05, + "loss": 2.282, + "step": 5689 + }, + { + "epoch": 1.0665417057169635, + "grad_norm": 49317.8125, + "learning_rate": 8.136719241151169e-05, + "loss": 2.2207, + "step": 5690 + }, + { + "epoch": 1.0667291471415183, + "grad_norm": 48598.78515625, + "learning_rate": 8.13610727361698e-05, + "loss": 2.2971, + "step": 5691 + }, + { + "epoch": 1.0669165885660732, + "grad_norm": 50786.39453125, + "learning_rate": 8.135495228625008e-05, + "loss": 2.2059, + "step": 5692 + }, + { + "epoch": 1.0671040299906278, + "grad_norm": 47405.625, + "learning_rate": 8.134883106190367e-05, + "loss": 2.2595, + "step": 5693 + }, + { + "epoch": 1.0672914714151827, + "grad_norm": 51529.23046875, + "learning_rate": 8.134270906328178e-05, + "loss": 2.1865, + "step": 5694 + }, + { + "epoch": 1.0674789128397375, + "grad_norm": 51693.37890625, + "learning_rate": 8.133658629053563e-05, + "loss": 2.2137, + "step": 5695 + }, + { + "epoch": 1.0676663542642923, + "grad_norm": 48758.90625, + "learning_rate": 8.133046274381642e-05, + "loss": 2.2692, + "step": 5696 + }, + { + "epoch": 1.0678537956888472, + "grad_norm": 45580.27734375, + "learning_rate": 8.13243384232754e-05, + "loss": 2.1995, + "step": 5697 + }, + { + "epoch": 1.068041237113402, + "grad_norm": 50817.28515625, + "learning_rate": 8.131821332906384e-05, + "loss": 2.2274, + "step": 5698 + }, + { + "epoch": 1.0682286785379569, + "grad_norm": 53759.54296875, + "learning_rate": 8.131208746133301e-05, + "loss": 2.2159, + "step": 5699 + }, + { + "epoch": 1.0684161199625117, + "grad_norm": 50298.24609375, + "learning_rate": 8.13059608202342e-05, + "loss": 2.1865, + "step": 5700 + }, + { + "epoch": 1.0686035613870666, + "grad_norm": 54822.46484375, + "learning_rate": 8.129983340591878e-05, + "loss": 2.1806, + "step": 5701 + }, + { + "epoch": 1.0687910028116214, + "grad_norm": 48675.83203125, + "learning_rate": 8.129370521853804e-05, + "loss": 2.2459, + "step": 5702 + }, + { + "epoch": 1.0689784442361763, + "grad_norm": 49895.1171875, + "learning_rate": 8.128757625824335e-05, + "loss": 2.1893, + "step": 5703 + }, + { + "epoch": 1.0691658856607311, + "grad_norm": 50893.40625, + "learning_rate": 8.128144652518609e-05, + "loss": 2.2549, + "step": 5704 + }, + { + "epoch": 1.069353327085286, + "grad_norm": 52707.875, + "learning_rate": 8.127531601951765e-05, + "loss": 2.2956, + "step": 5705 + }, + { + "epoch": 1.0695407685098406, + "grad_norm": 46957.5078125, + "learning_rate": 8.126918474138947e-05, + "loss": 2.2756, + "step": 5706 + }, + { + "epoch": 1.0697282099343954, + "grad_norm": 49373.20703125, + "learning_rate": 8.126305269095298e-05, + "loss": 2.2428, + "step": 5707 + }, + { + "epoch": 1.0699156513589503, + "grad_norm": 53915.265625, + "learning_rate": 8.12569198683596e-05, + "loss": 2.2184, + "step": 5708 + }, + { + "epoch": 1.0701030927835051, + "grad_norm": 51571.3046875, + "learning_rate": 8.125078627376083e-05, + "loss": 2.2685, + "step": 5709 + }, + { + "epoch": 1.07029053420806, + "grad_norm": 49987.734375, + "learning_rate": 8.124465190730815e-05, + "loss": 2.1985, + "step": 5710 + }, + { + "epoch": 1.0704779756326148, + "grad_norm": 51357.22265625, + "learning_rate": 8.123851676915309e-05, + "loss": 2.2008, + "step": 5711 + }, + { + "epoch": 1.0706654170571697, + "grad_norm": 52667.9140625, + "learning_rate": 8.123238085944715e-05, + "loss": 2.256, + "step": 5712 + }, + { + "epoch": 1.0708528584817245, + "grad_norm": 49441.38671875, + "learning_rate": 8.12262441783419e-05, + "loss": 2.2629, + "step": 5713 + }, + { + "epoch": 1.0710402999062794, + "grad_norm": 48606.27734375, + "learning_rate": 8.122010672598893e-05, + "loss": 2.2528, + "step": 5714 + }, + { + "epoch": 1.0712277413308342, + "grad_norm": 48015.65625, + "learning_rate": 8.121396850253977e-05, + "loss": 2.2503, + "step": 5715 + }, + { + "epoch": 1.071415182755389, + "grad_norm": 51942.07421875, + "learning_rate": 8.120782950814607e-05, + "loss": 2.2362, + "step": 5716 + }, + { + "epoch": 1.0716026241799437, + "grad_norm": 47427.2421875, + "learning_rate": 8.120168974295944e-05, + "loss": 2.251, + "step": 5717 + }, + { + "epoch": 1.0717900656044985, + "grad_norm": 51392.078125, + "learning_rate": 8.119554920713153e-05, + "loss": 2.2277, + "step": 5718 + }, + { + "epoch": 1.0719775070290534, + "grad_norm": 49490.84375, + "learning_rate": 8.1189407900814e-05, + "loss": 2.2434, + "step": 5719 + }, + { + "epoch": 1.0721649484536082, + "grad_norm": 50320.984375, + "learning_rate": 8.118326582415854e-05, + "loss": 2.2542, + "step": 5720 + }, + { + "epoch": 1.072352389878163, + "grad_norm": 56649.41796875, + "learning_rate": 8.117712297731682e-05, + "loss": 2.2371, + "step": 5721 + }, + { + "epoch": 1.072539831302718, + "grad_norm": 55457.3046875, + "learning_rate": 8.11709793604406e-05, + "loss": 2.2522, + "step": 5722 + }, + { + "epoch": 1.0727272727272728, + "grad_norm": 51300.5546875, + "learning_rate": 8.116483497368161e-05, + "loss": 2.2715, + "step": 5723 + }, + { + "epoch": 1.0729147141518276, + "grad_norm": 53932.01171875, + "learning_rate": 8.115868981719159e-05, + "loss": 2.4014, + "step": 5724 + }, + { + "epoch": 1.0731021555763824, + "grad_norm": 48189.05078125, + "learning_rate": 8.115254389112235e-05, + "loss": 2.2612, + "step": 5725 + }, + { + "epoch": 1.0732895970009373, + "grad_norm": 51269.0078125, + "learning_rate": 8.114639719562565e-05, + "loss": 2.3107, + "step": 5726 + }, + { + "epoch": 1.0734770384254921, + "grad_norm": 51947.2578125, + "learning_rate": 8.114024973085332e-05, + "loss": 2.2257, + "step": 5727 + }, + { + "epoch": 1.0736644798500468, + "grad_norm": 56554.6015625, + "learning_rate": 8.113410149695719e-05, + "loss": 2.3149, + "step": 5728 + }, + { + "epoch": 1.0738519212746016, + "grad_norm": 55504.98828125, + "learning_rate": 8.112795249408912e-05, + "loss": 2.3328, + "step": 5729 + }, + { + "epoch": 1.0740393626991565, + "grad_norm": 50584.68359375, + "learning_rate": 8.112180272240099e-05, + "loss": 2.2329, + "step": 5730 + }, + { + "epoch": 1.0742268041237113, + "grad_norm": 49733.95703125, + "learning_rate": 8.111565218204468e-05, + "loss": 2.2486, + "step": 5731 + }, + { + "epoch": 1.0744142455482661, + "grad_norm": 49317.6015625, + "learning_rate": 8.11095008731721e-05, + "loss": 2.2142, + "step": 5732 + }, + { + "epoch": 1.074601686972821, + "grad_norm": 49632.3984375, + "learning_rate": 8.110334879593518e-05, + "loss": 2.2129, + "step": 5733 + }, + { + "epoch": 1.0747891283973758, + "grad_norm": 51448.76171875, + "learning_rate": 8.109719595048589e-05, + "loss": 2.252, + "step": 5734 + }, + { + "epoch": 1.0749765698219307, + "grad_norm": 47890.21484375, + "learning_rate": 8.109104233697616e-05, + "loss": 2.2458, + "step": 5735 + }, + { + "epoch": 1.0751640112464855, + "grad_norm": 51803.84765625, + "learning_rate": 8.1084887955558e-05, + "loss": 2.2988, + "step": 5736 + }, + { + "epoch": 1.0753514526710404, + "grad_norm": 47671.8984375, + "learning_rate": 8.10787328063834e-05, + "loss": 2.198, + "step": 5737 + }, + { + "epoch": 1.0755388940955952, + "grad_norm": 52859.44140625, + "learning_rate": 8.107257688960442e-05, + "loss": 2.1553, + "step": 5738 + }, + { + "epoch": 1.0757263355201498, + "grad_norm": 49415.01953125, + "learning_rate": 8.106642020537305e-05, + "loss": 2.2739, + "step": 5739 + }, + { + "epoch": 1.0759137769447047, + "grad_norm": 50268.6796875, + "learning_rate": 8.10602627538414e-05, + "loss": 2.2672, + "step": 5740 + }, + { + "epoch": 1.0761012183692595, + "grad_norm": 53671.22265625, + "learning_rate": 8.10541045351615e-05, + "loss": 2.2229, + "step": 5741 + }, + { + "epoch": 1.0762886597938144, + "grad_norm": 49514.41796875, + "learning_rate": 8.104794554948551e-05, + "loss": 2.2378, + "step": 5742 + }, + { + "epoch": 1.0764761012183692, + "grad_norm": 52938.4140625, + "learning_rate": 8.104178579696551e-05, + "loss": 2.2648, + "step": 5743 + }, + { + "epoch": 1.076663542642924, + "grad_norm": 49469.546875, + "learning_rate": 8.103562527775363e-05, + "loss": 2.2847, + "step": 5744 + }, + { + "epoch": 1.076850984067479, + "grad_norm": 54281.33203125, + "learning_rate": 8.102946399200206e-05, + "loss": 2.1385, + "step": 5745 + }, + { + "epoch": 1.0770384254920338, + "grad_norm": 48764.27734375, + "learning_rate": 8.102330193986296e-05, + "loss": 2.2252, + "step": 5746 + }, + { + "epoch": 1.0772258669165886, + "grad_norm": 50906.30859375, + "learning_rate": 8.101713912148853e-05, + "loss": 2.2774, + "step": 5747 + }, + { + "epoch": 1.0774133083411435, + "grad_norm": 49589.63671875, + "learning_rate": 8.101097553703097e-05, + "loss": 2.2961, + "step": 5748 + }, + { + "epoch": 1.0776007497656983, + "grad_norm": 56716.02734375, + "learning_rate": 8.100481118664251e-05, + "loss": 2.5935, + "step": 5749 + }, + { + "epoch": 1.077788191190253, + "grad_norm": 49547.3515625, + "learning_rate": 8.099864607047544e-05, + "loss": 2.2451, + "step": 5750 + }, + { + "epoch": 1.0779756326148078, + "grad_norm": 52251.9296875, + "learning_rate": 8.0992480188682e-05, + "loss": 2.1559, + "step": 5751 + }, + { + "epoch": 1.0781630740393626, + "grad_norm": 48546.20703125, + "learning_rate": 8.098631354141444e-05, + "loss": 2.2577, + "step": 5752 + }, + { + "epoch": 1.0783505154639175, + "grad_norm": 53042.3515625, + "learning_rate": 8.098014612882515e-05, + "loss": 2.2942, + "step": 5753 + }, + { + "epoch": 1.0785379568884723, + "grad_norm": 52957.48046875, + "learning_rate": 8.09739779510664e-05, + "loss": 2.2612, + "step": 5754 + }, + { + "epoch": 1.0787253983130272, + "grad_norm": 50911.7734375, + "learning_rate": 8.096780900829057e-05, + "loss": 2.1749, + "step": 5755 + }, + { + "epoch": 1.078912839737582, + "grad_norm": 54337.67578125, + "learning_rate": 8.096163930064999e-05, + "loss": 2.1532, + "step": 5756 + }, + { + "epoch": 1.0791002811621369, + "grad_norm": 47899.7421875, + "learning_rate": 8.095546882829707e-05, + "loss": 2.2409, + "step": 5757 + }, + { + "epoch": 1.0792877225866917, + "grad_norm": 50965.25390625, + "learning_rate": 8.09492975913842e-05, + "loss": 2.2916, + "step": 5758 + }, + { + "epoch": 1.0794751640112465, + "grad_norm": 57177.4453125, + "learning_rate": 8.094312559006381e-05, + "loss": 2.2096, + "step": 5759 + }, + { + "epoch": 1.0796626054358014, + "grad_norm": 52224.32421875, + "learning_rate": 8.093695282448832e-05, + "loss": 2.2311, + "step": 5760 + }, + { + "epoch": 1.0798500468603562, + "grad_norm": 50925.80859375, + "learning_rate": 8.093077929481023e-05, + "loss": 2.2731, + "step": 5761 + }, + { + "epoch": 1.0800374882849109, + "grad_norm": 47040.91796875, + "learning_rate": 8.092460500118198e-05, + "loss": 2.2706, + "step": 5762 + }, + { + "epoch": 1.0802249297094657, + "grad_norm": 47420.359375, + "learning_rate": 8.09184299437561e-05, + "loss": 2.229, + "step": 5763 + }, + { + "epoch": 1.0804123711340206, + "grad_norm": 49365.25, + "learning_rate": 8.091225412268507e-05, + "loss": 2.2109, + "step": 5764 + }, + { + "epoch": 1.0805998125585754, + "grad_norm": 52361.8046875, + "learning_rate": 8.090607753812143e-05, + "loss": 2.1718, + "step": 5765 + }, + { + "epoch": 1.0807872539831302, + "grad_norm": 46784.5078125, + "learning_rate": 8.089990019021776e-05, + "loss": 2.2244, + "step": 5766 + }, + { + "epoch": 1.080974695407685, + "grad_norm": 48932.12109375, + "learning_rate": 8.089372207912661e-05, + "loss": 2.2296, + "step": 5767 + }, + { + "epoch": 1.08116213683224, + "grad_norm": 57451.13671875, + "learning_rate": 8.088754320500058e-05, + "loss": 2.2136, + "step": 5768 + }, + { + "epoch": 1.0813495782567948, + "grad_norm": 51962.2578125, + "learning_rate": 8.08813635679923e-05, + "loss": 2.3203, + "step": 5769 + }, + { + "epoch": 1.0815370196813496, + "grad_norm": 51095.3359375, + "learning_rate": 8.087518316825435e-05, + "loss": 2.1866, + "step": 5770 + }, + { + "epoch": 1.0817244611059045, + "grad_norm": 50208.76953125, + "learning_rate": 8.086900200593942e-05, + "loss": 2.1954, + "step": 5771 + }, + { + "epoch": 1.0819119025304593, + "grad_norm": 47483.1484375, + "learning_rate": 8.086282008120016e-05, + "loss": 2.2626, + "step": 5772 + }, + { + "epoch": 1.0820993439550142, + "grad_norm": 52097.7890625, + "learning_rate": 8.085663739418925e-05, + "loss": 2.2391, + "step": 5773 + }, + { + "epoch": 1.0822867853795688, + "grad_norm": 48933.06640625, + "learning_rate": 8.085045394505941e-05, + "loss": 2.2021, + "step": 5774 + }, + { + "epoch": 1.0824742268041236, + "grad_norm": 53837.73828125, + "learning_rate": 8.084426973396337e-05, + "loss": 2.2098, + "step": 5775 + }, + { + "epoch": 1.0826616682286785, + "grad_norm": 55882.71484375, + "learning_rate": 8.083808476105384e-05, + "loss": 2.2157, + "step": 5776 + }, + { + "epoch": 1.0828491096532333, + "grad_norm": 53350.90234375, + "learning_rate": 8.08318990264836e-05, + "loss": 2.29, + "step": 5777 + }, + { + "epoch": 1.0830365510777882, + "grad_norm": 51605.3515625, + "learning_rate": 8.082571253040543e-05, + "loss": 2.219, + "step": 5778 + }, + { + "epoch": 1.083223992502343, + "grad_norm": 49722.3984375, + "learning_rate": 8.081952527297213e-05, + "loss": 2.2538, + "step": 5779 + }, + { + "epoch": 1.0834114339268979, + "grad_norm": 48463.53125, + "learning_rate": 8.08133372543365e-05, + "loss": 2.2605, + "step": 5780 + }, + { + "epoch": 1.0835988753514527, + "grad_norm": 50114.70703125, + "learning_rate": 8.080714847465141e-05, + "loss": 2.2155, + "step": 5781 + }, + { + "epoch": 1.0837863167760076, + "grad_norm": 50455.91796875, + "learning_rate": 8.08009589340697e-05, + "loss": 2.2378, + "step": 5782 + }, + { + "epoch": 1.0839737582005624, + "grad_norm": 55554.27734375, + "learning_rate": 8.079476863274422e-05, + "loss": 2.2328, + "step": 5783 + }, + { + "epoch": 1.0841611996251173, + "grad_norm": 51673.4296875, + "learning_rate": 8.078857757082787e-05, + "loss": 2.1994, + "step": 5784 + }, + { + "epoch": 1.0843486410496719, + "grad_norm": 54759.0859375, + "learning_rate": 8.07823857484736e-05, + "loss": 2.2569, + "step": 5785 + }, + { + "epoch": 1.0845360824742267, + "grad_norm": 52097.38671875, + "learning_rate": 8.07761931658343e-05, + "loss": 2.2042, + "step": 5786 + }, + { + "epoch": 1.0847235238987816, + "grad_norm": 54797.58984375, + "learning_rate": 8.076999982306292e-05, + "loss": 2.265, + "step": 5787 + }, + { + "epoch": 1.0849109653233364, + "grad_norm": 52982.38671875, + "learning_rate": 8.076380572031246e-05, + "loss": 2.2165, + "step": 5788 + }, + { + "epoch": 1.0850984067478913, + "grad_norm": 52938.33203125, + "learning_rate": 8.075761085773587e-05, + "loss": 2.2139, + "step": 5789 + }, + { + "epoch": 1.085285848172446, + "grad_norm": 50114.8203125, + "learning_rate": 8.075141523548617e-05, + "loss": 2.1726, + "step": 5790 + }, + { + "epoch": 1.085473289597001, + "grad_norm": 55179.6328125, + "learning_rate": 8.074521885371639e-05, + "loss": 2.3078, + "step": 5791 + }, + { + "epoch": 1.0856607310215558, + "grad_norm": 53167.57421875, + "learning_rate": 8.073902171257957e-05, + "loss": 2.2617, + "step": 5792 + }, + { + "epoch": 1.0858481724461106, + "grad_norm": 56740.42578125, + "learning_rate": 8.073282381222875e-05, + "loss": 2.2886, + "step": 5793 + }, + { + "epoch": 1.0860356138706655, + "grad_norm": 50615.80078125, + "learning_rate": 8.072662515281703e-05, + "loss": 2.2295, + "step": 5794 + }, + { + "epoch": 1.0862230552952203, + "grad_norm": 51740.73046875, + "learning_rate": 8.072042573449751e-05, + "loss": 2.2195, + "step": 5795 + }, + { + "epoch": 1.086410496719775, + "grad_norm": 47014.421875, + "learning_rate": 8.071422555742331e-05, + "loss": 2.2098, + "step": 5796 + }, + { + "epoch": 1.0865979381443298, + "grad_norm": 48904.40234375, + "learning_rate": 8.070802462174755e-05, + "loss": 2.268, + "step": 5797 + }, + { + "epoch": 1.0867853795688847, + "grad_norm": 48565.5703125, + "learning_rate": 8.070182292762341e-05, + "loss": 2.2636, + "step": 5798 + }, + { + "epoch": 1.0869728209934395, + "grad_norm": 50671.76953125, + "learning_rate": 8.069562047520405e-05, + "loss": 2.299, + "step": 5799 + }, + { + "epoch": 1.0871602624179943, + "grad_norm": 47275.63671875, + "learning_rate": 8.068941726464265e-05, + "loss": 2.2835, + "step": 5800 + }, + { + "epoch": 1.0873477038425492, + "grad_norm": 51591.40234375, + "learning_rate": 8.068321329609243e-05, + "loss": 2.2341, + "step": 5801 + }, + { + "epoch": 1.087535145267104, + "grad_norm": 51015.046875, + "learning_rate": 8.067700856970663e-05, + "loss": 2.2576, + "step": 5802 + }, + { + "epoch": 1.0877225866916589, + "grad_norm": 49432.34765625, + "learning_rate": 8.06708030856385e-05, + "loss": 2.249, + "step": 5803 + }, + { + "epoch": 1.0879100281162137, + "grad_norm": 49462.51953125, + "learning_rate": 8.066459684404128e-05, + "loss": 2.2701, + "step": 5804 + }, + { + "epoch": 1.0880974695407686, + "grad_norm": 52066.87109375, + "learning_rate": 8.065838984506831e-05, + "loss": 2.1822, + "step": 5805 + }, + { + "epoch": 1.0882849109653234, + "grad_norm": 53109.734375, + "learning_rate": 8.065218208887282e-05, + "loss": 2.235, + "step": 5806 + }, + { + "epoch": 1.088472352389878, + "grad_norm": 48543.4296875, + "learning_rate": 8.064597357560819e-05, + "loss": 2.213, + "step": 5807 + }, + { + "epoch": 1.088659793814433, + "grad_norm": 51809.2890625, + "learning_rate": 8.063976430542774e-05, + "loss": 2.2689, + "step": 5808 + }, + { + "epoch": 1.0888472352389877, + "grad_norm": 49987.7265625, + "learning_rate": 8.063355427848484e-05, + "loss": 2.1788, + "step": 5809 + }, + { + "epoch": 1.0890346766635426, + "grad_norm": 56647.05859375, + "learning_rate": 8.062734349493288e-05, + "loss": 2.1915, + "step": 5810 + }, + { + "epoch": 1.0892221180880974, + "grad_norm": 53341.1171875, + "learning_rate": 8.062113195492523e-05, + "loss": 2.2783, + "step": 5811 + }, + { + "epoch": 1.0894095595126523, + "grad_norm": 50520.0703125, + "learning_rate": 8.061491965861533e-05, + "loss": 2.1963, + "step": 5812 + }, + { + "epoch": 1.0895970009372071, + "grad_norm": 50449.18359375, + "learning_rate": 8.06087066061566e-05, + "loss": 2.1876, + "step": 5813 + }, + { + "epoch": 1.089784442361762, + "grad_norm": 47644.01171875, + "learning_rate": 8.06024927977025e-05, + "loss": 2.227, + "step": 5814 + }, + { + "epoch": 1.0899718837863168, + "grad_norm": 48202.3828125, + "learning_rate": 8.059627823340651e-05, + "loss": 2.3016, + "step": 5815 + }, + { + "epoch": 1.0901593252108717, + "grad_norm": 53670.77734375, + "learning_rate": 8.05900629134221e-05, + "loss": 2.2374, + "step": 5816 + }, + { + "epoch": 1.0903467666354265, + "grad_norm": 51185.2265625, + "learning_rate": 8.058384683790282e-05, + "loss": 2.2069, + "step": 5817 + }, + { + "epoch": 1.0905342080599814, + "grad_norm": 58068.8828125, + "learning_rate": 8.057763000700217e-05, + "loss": 2.2595, + "step": 5818 + }, + { + "epoch": 1.090721649484536, + "grad_norm": 53762.41796875, + "learning_rate": 8.057141242087371e-05, + "loss": 2.2989, + "step": 5819 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 49445.9453125, + "learning_rate": 8.056519407967098e-05, + "loss": 2.2569, + "step": 5820 + }, + { + "epoch": 1.0910965323336457, + "grad_norm": 52414.54296875, + "learning_rate": 8.055897498354759e-05, + "loss": 2.2533, + "step": 5821 + }, + { + "epoch": 1.0912839737582005, + "grad_norm": 50955.609375, + "learning_rate": 8.055275513265714e-05, + "loss": 2.267, + "step": 5822 + }, + { + "epoch": 1.0914714151827554, + "grad_norm": 53655.30859375, + "learning_rate": 8.054653452715326e-05, + "loss": 2.2427, + "step": 5823 + }, + { + "epoch": 1.0916588566073102, + "grad_norm": 50779.640625, + "learning_rate": 8.054031316718958e-05, + "loss": 2.1766, + "step": 5824 + }, + { + "epoch": 1.091846298031865, + "grad_norm": 53192.328125, + "learning_rate": 8.053409105291975e-05, + "loss": 2.1789, + "step": 5825 + }, + { + "epoch": 1.09203373945642, + "grad_norm": 47294.96484375, + "learning_rate": 8.052786818449746e-05, + "loss": 2.2472, + "step": 5826 + }, + { + "epoch": 1.0922211808809748, + "grad_norm": 48239.30078125, + "learning_rate": 8.052164456207641e-05, + "loss": 2.2619, + "step": 5827 + }, + { + "epoch": 1.0924086223055296, + "grad_norm": 49390.1796875, + "learning_rate": 8.051542018581031e-05, + "loss": 2.1643, + "step": 5828 + }, + { + "epoch": 1.0925960637300844, + "grad_norm": 50135.72265625, + "learning_rate": 8.050919505585291e-05, + "loss": 2.1697, + "step": 5829 + }, + { + "epoch": 1.0927835051546393, + "grad_norm": 52366.7890625, + "learning_rate": 8.050296917235792e-05, + "loss": 2.2888, + "step": 5830 + }, + { + "epoch": 1.092970946579194, + "grad_norm": 55980.69140625, + "learning_rate": 8.049674253547916e-05, + "loss": 2.2294, + "step": 5831 + }, + { + "epoch": 1.0931583880037488, + "grad_norm": 51138.47265625, + "learning_rate": 8.04905151453704e-05, + "loss": 2.1793, + "step": 5832 + }, + { + "epoch": 1.0933458294283036, + "grad_norm": 48945.25390625, + "learning_rate": 8.048428700218544e-05, + "loss": 2.2398, + "step": 5833 + }, + { + "epoch": 1.0935332708528585, + "grad_norm": 51456.28125, + "learning_rate": 8.04780581060781e-05, + "loss": 2.2304, + "step": 5834 + }, + { + "epoch": 1.0937207122774133, + "grad_norm": 47690.3046875, + "learning_rate": 8.047182845720226e-05, + "loss": 2.2671, + "step": 5835 + }, + { + "epoch": 1.0939081537019681, + "grad_norm": 54723.51953125, + "learning_rate": 8.046559805571176e-05, + "loss": 2.3064, + "step": 5836 + }, + { + "epoch": 1.094095595126523, + "grad_norm": 52632.87890625, + "learning_rate": 8.045936690176048e-05, + "loss": 2.2565, + "step": 5837 + }, + { + "epoch": 1.0942830365510778, + "grad_norm": 51416.234375, + "learning_rate": 8.045313499550234e-05, + "loss": 2.2954, + "step": 5838 + }, + { + "epoch": 1.0944704779756327, + "grad_norm": 53664.53515625, + "learning_rate": 8.044690233709124e-05, + "loss": 2.1921, + "step": 5839 + }, + { + "epoch": 1.0946579194001875, + "grad_norm": 47580.99609375, + "learning_rate": 8.044066892668113e-05, + "loss": 2.2161, + "step": 5840 + }, + { + "epoch": 1.0948453608247424, + "grad_norm": 53875.0234375, + "learning_rate": 8.043443476442597e-05, + "loss": 2.1589, + "step": 5841 + }, + { + "epoch": 1.095032802249297, + "grad_norm": 51742.12890625, + "learning_rate": 8.042819985047972e-05, + "loss": 2.2659, + "step": 5842 + }, + { + "epoch": 1.0952202436738518, + "grad_norm": 51973.22265625, + "learning_rate": 8.042196418499639e-05, + "loss": 2.2521, + "step": 5843 + }, + { + "epoch": 1.0954076850984067, + "grad_norm": 56286.28125, + "learning_rate": 8.041572776812999e-05, + "loss": 2.2375, + "step": 5844 + }, + { + "epoch": 1.0955951265229615, + "grad_norm": 51335.5, + "learning_rate": 8.040949060003455e-05, + "loss": 2.1973, + "step": 5845 + }, + { + "epoch": 1.0957825679475164, + "grad_norm": 49342.4921875, + "learning_rate": 8.04032526808641e-05, + "loss": 2.2493, + "step": 5846 + }, + { + "epoch": 1.0959700093720712, + "grad_norm": 48643.359375, + "learning_rate": 8.039701401077275e-05, + "loss": 2.2303, + "step": 5847 + }, + { + "epoch": 1.096157450796626, + "grad_norm": 50873.29296875, + "learning_rate": 8.039077458991457e-05, + "loss": 2.1753, + "step": 5848 + }, + { + "epoch": 1.096344892221181, + "grad_norm": 48285.453125, + "learning_rate": 8.038453441844363e-05, + "loss": 2.2206, + "step": 5849 + }, + { + "epoch": 1.0965323336457358, + "grad_norm": 54345.6015625, + "learning_rate": 8.037829349651412e-05, + "loss": 2.2882, + "step": 5850 + }, + { + "epoch": 1.0967197750702906, + "grad_norm": 55353.71875, + "learning_rate": 8.037205182428012e-05, + "loss": 2.2772, + "step": 5851 + }, + { + "epoch": 1.0969072164948455, + "grad_norm": 50487.0859375, + "learning_rate": 8.036580940189582e-05, + "loss": 2.2718, + "step": 5852 + }, + { + "epoch": 1.0970946579194, + "grad_norm": 44549.359375, + "learning_rate": 8.035956622951539e-05, + "loss": 2.2532, + "step": 5853 + }, + { + "epoch": 1.097282099343955, + "grad_norm": 49394.515625, + "learning_rate": 8.035332230729306e-05, + "loss": 2.327, + "step": 5854 + }, + { + "epoch": 1.0974695407685098, + "grad_norm": 52781.6015625, + "learning_rate": 8.034707763538301e-05, + "loss": 2.2669, + "step": 5855 + }, + { + "epoch": 1.0976569821930646, + "grad_norm": 47646.8359375, + "learning_rate": 8.034083221393949e-05, + "loss": 2.2187, + "step": 5856 + }, + { + "epoch": 1.0978444236176195, + "grad_norm": 51050.39453125, + "learning_rate": 8.033458604311674e-05, + "loss": 2.2716, + "step": 5857 + }, + { + "epoch": 1.0980318650421743, + "grad_norm": 49148.640625, + "learning_rate": 8.032833912306906e-05, + "loss": 2.2825, + "step": 5858 + }, + { + "epoch": 1.0982193064667292, + "grad_norm": 51488.12890625, + "learning_rate": 8.032209145395072e-05, + "loss": 2.2629, + "step": 5859 + }, + { + "epoch": 1.098406747891284, + "grad_norm": 51411.79296875, + "learning_rate": 8.031584303591603e-05, + "loss": 2.302, + "step": 5860 + }, + { + "epoch": 1.0985941893158389, + "grad_norm": 50149.7265625, + "learning_rate": 8.030959386911932e-05, + "loss": 2.32, + "step": 5861 + }, + { + "epoch": 1.0987816307403937, + "grad_norm": 49730.8671875, + "learning_rate": 8.030334395371495e-05, + "loss": 2.2263, + "step": 5862 + }, + { + "epoch": 1.0989690721649485, + "grad_norm": 51929.1328125, + "learning_rate": 8.029709328985728e-05, + "loss": 2.5099, + "step": 5863 + }, + { + "epoch": 1.0991565135895032, + "grad_norm": 54489.9296875, + "learning_rate": 8.029084187770067e-05, + "loss": 2.2662, + "step": 5864 + }, + { + "epoch": 1.099343955014058, + "grad_norm": 53707.19140625, + "learning_rate": 8.028458971739954e-05, + "loss": 2.1826, + "step": 5865 + }, + { + "epoch": 1.0995313964386129, + "grad_norm": 51709.6328125, + "learning_rate": 8.02783368091083e-05, + "loss": 2.2139, + "step": 5866 + }, + { + "epoch": 1.0997188378631677, + "grad_norm": 53600.234375, + "learning_rate": 8.027208315298141e-05, + "loss": 2.2363, + "step": 5867 + }, + { + "epoch": 1.0999062792877226, + "grad_norm": 53394.41796875, + "learning_rate": 8.026582874917331e-05, + "loss": 2.2355, + "step": 5868 + }, + { + "epoch": 1.1000937207122774, + "grad_norm": 52577.7890625, + "learning_rate": 8.025957359783849e-05, + "loss": 2.2175, + "step": 5869 + }, + { + "epoch": 1.1002811621368322, + "grad_norm": 52024.33984375, + "learning_rate": 8.025331769913143e-05, + "loss": 2.2151, + "step": 5870 + }, + { + "epoch": 1.100468603561387, + "grad_norm": 48375.46875, + "learning_rate": 8.024706105320663e-05, + "loss": 2.2721, + "step": 5871 + }, + { + "epoch": 1.100656044985942, + "grad_norm": 54016.6015625, + "learning_rate": 8.024080366021866e-05, + "loss": 2.268, + "step": 5872 + }, + { + "epoch": 1.1008434864104968, + "grad_norm": 47106.3984375, + "learning_rate": 8.023454552032203e-05, + "loss": 2.2472, + "step": 5873 + }, + { + "epoch": 1.1010309278350516, + "grad_norm": 52002.28515625, + "learning_rate": 8.022828663367135e-05, + "loss": 2.2568, + "step": 5874 + }, + { + "epoch": 1.1012183692596063, + "grad_norm": 51542.8125, + "learning_rate": 8.022202700042116e-05, + "loss": 2.267, + "step": 5875 + }, + { + "epoch": 1.101405810684161, + "grad_norm": 52536.01953125, + "learning_rate": 8.021576662072609e-05, + "loss": 2.1853, + "step": 5876 + }, + { + "epoch": 1.101593252108716, + "grad_norm": 51880.65234375, + "learning_rate": 8.020950549474076e-05, + "loss": 2.2741, + "step": 5877 + }, + { + "epoch": 1.1017806935332708, + "grad_norm": 54070.50390625, + "learning_rate": 8.020324362261982e-05, + "loss": 2.1991, + "step": 5878 + }, + { + "epoch": 1.1019681349578256, + "grad_norm": 50276.73828125, + "learning_rate": 8.019698100451791e-05, + "loss": 2.2157, + "step": 5879 + }, + { + "epoch": 1.1021555763823805, + "grad_norm": 53407.02734375, + "learning_rate": 8.019071764058972e-05, + "loss": 2.2742, + "step": 5880 + }, + { + "epoch": 1.1023430178069353, + "grad_norm": 48952.7890625, + "learning_rate": 8.018445353098996e-05, + "loss": 2.2202, + "step": 5881 + }, + { + "epoch": 1.1025304592314902, + "grad_norm": 48583.296875, + "learning_rate": 8.017818867587332e-05, + "loss": 2.3155, + "step": 5882 + }, + { + "epoch": 1.102717900656045, + "grad_norm": 52133.5390625, + "learning_rate": 8.017192307539455e-05, + "loss": 2.238, + "step": 5883 + }, + { + "epoch": 1.1029053420805999, + "grad_norm": 53319.265625, + "learning_rate": 8.01656567297084e-05, + "loss": 2.1578, + "step": 5884 + }, + { + "epoch": 1.1030927835051547, + "grad_norm": 52179.953125, + "learning_rate": 8.015938963896965e-05, + "loss": 2.2587, + "step": 5885 + }, + { + "epoch": 1.1032802249297096, + "grad_norm": 50245.60546875, + "learning_rate": 8.015312180333306e-05, + "loss": 2.2399, + "step": 5886 + }, + { + "epoch": 1.1034676663542644, + "grad_norm": 52071.0234375, + "learning_rate": 8.014685322295346e-05, + "loss": 2.3402, + "step": 5887 + }, + { + "epoch": 1.103655107778819, + "grad_norm": 51777.41015625, + "learning_rate": 8.014058389798568e-05, + "loss": 2.2513, + "step": 5888 + }, + { + "epoch": 1.1038425492033739, + "grad_norm": 53482.515625, + "learning_rate": 8.013431382858455e-05, + "loss": 2.2731, + "step": 5889 + }, + { + "epoch": 1.1040299906279287, + "grad_norm": 46939.33203125, + "learning_rate": 8.012804301490493e-05, + "loss": 2.2576, + "step": 5890 + }, + { + "epoch": 1.1042174320524836, + "grad_norm": 49829.12109375, + "learning_rate": 8.012177145710172e-05, + "loss": 2.2212, + "step": 5891 + }, + { + "epoch": 1.1044048734770384, + "grad_norm": 51789.42578125, + "learning_rate": 8.011549915532981e-05, + "loss": 2.2505, + "step": 5892 + }, + { + "epoch": 1.1045923149015933, + "grad_norm": 50552.3046875, + "learning_rate": 8.010922610974412e-05, + "loss": 2.2607, + "step": 5893 + }, + { + "epoch": 1.104779756326148, + "grad_norm": 53990.29296875, + "learning_rate": 8.010295232049956e-05, + "loss": 2.2368, + "step": 5894 + }, + { + "epoch": 1.104967197750703, + "grad_norm": 47614.06640625, + "learning_rate": 8.009667778775112e-05, + "loss": 2.2469, + "step": 5895 + }, + { + "epoch": 1.1051546391752578, + "grad_norm": 60439.79296875, + "learning_rate": 8.009040251165376e-05, + "loss": 2.3916, + "step": 5896 + }, + { + "epoch": 1.1053420805998126, + "grad_norm": 49667.34765625, + "learning_rate": 8.008412649236247e-05, + "loss": 2.2874, + "step": 5897 + }, + { + "epoch": 1.1055295220243675, + "grad_norm": 53202.0390625, + "learning_rate": 8.007784973003227e-05, + "loss": 2.3202, + "step": 5898 + }, + { + "epoch": 1.1057169634489221, + "grad_norm": 50671.1015625, + "learning_rate": 8.007157222481816e-05, + "loss": 2.2291, + "step": 5899 + }, + { + "epoch": 1.105904404873477, + "grad_norm": 49080.12890625, + "learning_rate": 8.00652939768752e-05, + "loss": 2.2251, + "step": 5900 + }, + { + "epoch": 1.1060918462980318, + "grad_norm": 50316.51171875, + "learning_rate": 8.00590149863585e-05, + "loss": 2.1974, + "step": 5901 + }, + { + "epoch": 1.1062792877225867, + "grad_norm": 46204.0625, + "learning_rate": 8.005273525342307e-05, + "loss": 2.2805, + "step": 5902 + }, + { + "epoch": 1.1064667291471415, + "grad_norm": 53280.8359375, + "learning_rate": 8.004645477822404e-05, + "loss": 2.2401, + "step": 5903 + }, + { + "epoch": 1.1066541705716963, + "grad_norm": 52966.40625, + "learning_rate": 8.004017356091656e-05, + "loss": 2.2656, + "step": 5904 + }, + { + "epoch": 1.1068416119962512, + "grad_norm": 58557.46484375, + "learning_rate": 8.00338916016557e-05, + "loss": 2.2393, + "step": 5905 + }, + { + "epoch": 1.107029053420806, + "grad_norm": 53969.7890625, + "learning_rate": 8.002760890059669e-05, + "loss": 2.3367, + "step": 5906 + }, + { + "epoch": 1.1072164948453609, + "grad_norm": 47773.71875, + "learning_rate": 8.002132545789464e-05, + "loss": 2.1973, + "step": 5907 + }, + { + "epoch": 1.1074039362699157, + "grad_norm": 50392.953125, + "learning_rate": 8.00150412737048e-05, + "loss": 2.2569, + "step": 5908 + }, + { + "epoch": 1.1075913776944706, + "grad_norm": 48385.1953125, + "learning_rate": 8.000875634818235e-05, + "loss": 2.2453, + "step": 5909 + }, + { + "epoch": 1.1077788191190252, + "grad_norm": 49328.921875, + "learning_rate": 8.000247068148253e-05, + "loss": 2.2234, + "step": 5910 + }, + { + "epoch": 1.10796626054358, + "grad_norm": 56796.1953125, + "learning_rate": 7.999618427376058e-05, + "loss": 2.1851, + "step": 5911 + }, + { + "epoch": 1.108153701968135, + "grad_norm": 52439.046875, + "learning_rate": 7.998989712517178e-05, + "loss": 2.2025, + "step": 5912 + }, + { + "epoch": 1.1083411433926897, + "grad_norm": 52595.1796875, + "learning_rate": 7.998360923587138e-05, + "loss": 2.2566, + "step": 5913 + }, + { + "epoch": 1.1085285848172446, + "grad_norm": 51355.8515625, + "learning_rate": 7.997732060601472e-05, + "loss": 2.2122, + "step": 5914 + }, + { + "epoch": 1.1087160262417994, + "grad_norm": 52862.11328125, + "learning_rate": 7.99710312357571e-05, + "loss": 2.2416, + "step": 5915 + }, + { + "epoch": 1.1089034676663543, + "grad_norm": 48768.83984375, + "learning_rate": 7.996474112525387e-05, + "loss": 2.1998, + "step": 5916 + }, + { + "epoch": 1.1090909090909091, + "grad_norm": 53140.17578125, + "learning_rate": 7.99584502746604e-05, + "loss": 2.2784, + "step": 5917 + }, + { + "epoch": 1.109278350515464, + "grad_norm": 50835.0546875, + "learning_rate": 7.995215868413202e-05, + "loss": 2.238, + "step": 5918 + }, + { + "epoch": 1.1094657919400188, + "grad_norm": 51107.68359375, + "learning_rate": 7.994586635382418e-05, + "loss": 2.2761, + "step": 5919 + }, + { + "epoch": 1.1096532333645737, + "grad_norm": 48321.23046875, + "learning_rate": 7.993957328389225e-05, + "loss": 2.2299, + "step": 5920 + }, + { + "epoch": 1.1098406747891283, + "grad_norm": 50600.375, + "learning_rate": 7.993327947449167e-05, + "loss": 2.185, + "step": 5921 + }, + { + "epoch": 1.1100281162136831, + "grad_norm": 52331.1953125, + "learning_rate": 7.99269849257779e-05, + "loss": 2.202, + "step": 5922 + }, + { + "epoch": 1.110215557638238, + "grad_norm": 51693.29296875, + "learning_rate": 7.992068963790643e-05, + "loss": 2.2356, + "step": 5923 + }, + { + "epoch": 1.1104029990627928, + "grad_norm": 50715.81640625, + "learning_rate": 7.99143936110327e-05, + "loss": 2.2565, + "step": 5924 + }, + { + "epoch": 1.1105904404873477, + "grad_norm": 51003.32421875, + "learning_rate": 7.990809684531223e-05, + "loss": 2.325, + "step": 5925 + }, + { + "epoch": 1.1107778819119025, + "grad_norm": 53602.40234375, + "learning_rate": 7.990179934090054e-05, + "loss": 2.2649, + "step": 5926 + }, + { + "epoch": 1.1109653233364574, + "grad_norm": 55665.8125, + "learning_rate": 7.989550109795319e-05, + "loss": 2.27, + "step": 5927 + }, + { + "epoch": 1.1111527647610122, + "grad_norm": 49612.2265625, + "learning_rate": 7.98892021166257e-05, + "loss": 2.3266, + "step": 5928 + }, + { + "epoch": 1.111340206185567, + "grad_norm": 51143.5546875, + "learning_rate": 7.98829023970737e-05, + "loss": 2.2792, + "step": 5929 + }, + { + "epoch": 1.111527647610122, + "grad_norm": 49746.18359375, + "learning_rate": 7.987660193945273e-05, + "loss": 2.2053, + "step": 5930 + }, + { + "epoch": 1.1117150890346768, + "grad_norm": 49359.359375, + "learning_rate": 7.987030074391844e-05, + "loss": 2.2488, + "step": 5931 + }, + { + "epoch": 1.1119025304592314, + "grad_norm": 50556.63671875, + "learning_rate": 7.986399881062646e-05, + "loss": 2.1727, + "step": 5932 + }, + { + "epoch": 1.1120899718837862, + "grad_norm": 52733.57421875, + "learning_rate": 7.98576961397324e-05, + "loss": 2.2016, + "step": 5933 + }, + { + "epoch": 1.112277413308341, + "grad_norm": 47658.91796875, + "learning_rate": 7.985139273139197e-05, + "loss": 2.3066, + "step": 5934 + }, + { + "epoch": 1.112464854732896, + "grad_norm": 51353.234375, + "learning_rate": 7.984508858576084e-05, + "loss": 2.2253, + "step": 5935 + }, + { + "epoch": 1.1126522961574508, + "grad_norm": 49778.86328125, + "learning_rate": 7.983878370299473e-05, + "loss": 2.3215, + "step": 5936 + }, + { + "epoch": 1.1128397375820056, + "grad_norm": 52482.9765625, + "learning_rate": 7.983247808324932e-05, + "loss": 2.2525, + "step": 5937 + }, + { + "epoch": 1.1130271790065605, + "grad_norm": 51085.98828125, + "learning_rate": 7.98261717266804e-05, + "loss": 2.2576, + "step": 5938 + }, + { + "epoch": 1.1132146204311153, + "grad_norm": 55704.2734375, + "learning_rate": 7.98198646334437e-05, + "loss": 2.2702, + "step": 5939 + }, + { + "epoch": 1.1134020618556701, + "grad_norm": 51920.53515625, + "learning_rate": 7.981355680369502e-05, + "loss": 2.2412, + "step": 5940 + }, + { + "epoch": 1.113589503280225, + "grad_norm": 51327.83203125, + "learning_rate": 7.980724823759014e-05, + "loss": 2.1942, + "step": 5941 + }, + { + "epoch": 1.1137769447047798, + "grad_norm": 47494.76171875, + "learning_rate": 7.980093893528486e-05, + "loss": 2.2672, + "step": 5942 + }, + { + "epoch": 1.1139643861293347, + "grad_norm": 53233.2109375, + "learning_rate": 7.979462889693503e-05, + "loss": 2.2852, + "step": 5943 + }, + { + "epoch": 1.1141518275538893, + "grad_norm": 54247.67578125, + "learning_rate": 7.978831812269651e-05, + "loss": 2.2168, + "step": 5944 + }, + { + "epoch": 1.1143392689784442, + "grad_norm": 51615.92578125, + "learning_rate": 7.978200661272514e-05, + "loss": 2.2693, + "step": 5945 + }, + { + "epoch": 1.114526710402999, + "grad_norm": 49466.44140625, + "learning_rate": 7.977569436717682e-05, + "loss": 2.2399, + "step": 5946 + }, + { + "epoch": 1.1147141518275538, + "grad_norm": 49170.8125, + "learning_rate": 7.976938138620746e-05, + "loss": 2.2303, + "step": 5947 + }, + { + "epoch": 1.1149015932521087, + "grad_norm": 49575.38671875, + "learning_rate": 7.976306766997297e-05, + "loss": 2.2914, + "step": 5948 + }, + { + "epoch": 1.1150890346766635, + "grad_norm": 52319.6640625, + "learning_rate": 7.97567532186293e-05, + "loss": 2.2669, + "step": 5949 + }, + { + "epoch": 1.1152764761012184, + "grad_norm": 52661.6484375, + "learning_rate": 7.97504380323324e-05, + "loss": 2.2172, + "step": 5950 + }, + { + "epoch": 1.1154639175257732, + "grad_norm": 51654.76171875, + "learning_rate": 7.974412211123826e-05, + "loss": 2.2665, + "step": 5951 + }, + { + "epoch": 1.115651358950328, + "grad_norm": 50036.49609375, + "learning_rate": 7.973780545550287e-05, + "loss": 2.2607, + "step": 5952 + }, + { + "epoch": 1.115838800374883, + "grad_norm": 49791.53125, + "learning_rate": 7.973148806528224e-05, + "loss": 2.2245, + "step": 5953 + }, + { + "epoch": 1.1160262417994378, + "grad_norm": 54408.24609375, + "learning_rate": 7.972516994073238e-05, + "loss": 2.2109, + "step": 5954 + }, + { + "epoch": 1.1162136832239926, + "grad_norm": 50064.31640625, + "learning_rate": 7.971885108200939e-05, + "loss": 2.2684, + "step": 5955 + }, + { + "epoch": 1.1164011246485472, + "grad_norm": 54437.52734375, + "learning_rate": 7.971253148926929e-05, + "loss": 2.1991, + "step": 5956 + }, + { + "epoch": 1.116588566073102, + "grad_norm": 49648.66796875, + "learning_rate": 7.970621116266821e-05, + "loss": 2.233, + "step": 5957 + }, + { + "epoch": 1.116776007497657, + "grad_norm": 54104.28515625, + "learning_rate": 7.96998901023622e-05, + "loss": 2.2918, + "step": 5958 + }, + { + "epoch": 1.1169634489222118, + "grad_norm": 48883.515625, + "learning_rate": 7.969356830850743e-05, + "loss": 2.1824, + "step": 5959 + }, + { + "epoch": 1.1171508903467666, + "grad_norm": 49419.046875, + "learning_rate": 7.968724578126002e-05, + "loss": 2.2399, + "step": 5960 + }, + { + "epoch": 1.1173383317713215, + "grad_norm": 57466.0703125, + "learning_rate": 7.968092252077613e-05, + "loss": 2.2454, + "step": 5961 + }, + { + "epoch": 1.1175257731958763, + "grad_norm": 52341.265625, + "learning_rate": 7.967459852721193e-05, + "loss": 2.2536, + "step": 5962 + }, + { + "epoch": 1.1177132146204312, + "grad_norm": 48813.7734375, + "learning_rate": 7.966827380072361e-05, + "loss": 2.1827, + "step": 5963 + }, + { + "epoch": 1.117900656044986, + "grad_norm": 47030.0859375, + "learning_rate": 7.96619483414674e-05, + "loss": 2.237, + "step": 5964 + }, + { + "epoch": 1.1180880974695409, + "grad_norm": 50710.32421875, + "learning_rate": 7.965562214959953e-05, + "loss": 2.2896, + "step": 5965 + }, + { + "epoch": 1.1182755388940957, + "grad_norm": 50058.0390625, + "learning_rate": 7.964929522527624e-05, + "loss": 2.2225, + "step": 5966 + }, + { + "epoch": 1.1184629803186503, + "grad_norm": 48217.609375, + "learning_rate": 7.964296756865379e-05, + "loss": 2.3101, + "step": 5967 + }, + { + "epoch": 1.1186504217432052, + "grad_norm": 47957.83203125, + "learning_rate": 7.963663917988848e-05, + "loss": 2.2245, + "step": 5968 + }, + { + "epoch": 1.11883786316776, + "grad_norm": 52389.37890625, + "learning_rate": 7.963031005913662e-05, + "loss": 2.1464, + "step": 5969 + }, + { + "epoch": 1.1190253045923149, + "grad_norm": 51355.01953125, + "learning_rate": 7.962398020655451e-05, + "loss": 2.2177, + "step": 5970 + }, + { + "epoch": 1.1192127460168697, + "grad_norm": 47582.8203125, + "learning_rate": 7.961764962229848e-05, + "loss": 2.2699, + "step": 5971 + }, + { + "epoch": 1.1194001874414246, + "grad_norm": 51281.12109375, + "learning_rate": 7.96113183065249e-05, + "loss": 2.2632, + "step": 5972 + }, + { + "epoch": 1.1195876288659794, + "grad_norm": 50759.4140625, + "learning_rate": 7.960498625939019e-05, + "loss": 2.1816, + "step": 5973 + }, + { + "epoch": 1.1197750702905342, + "grad_norm": 52562.1953125, + "learning_rate": 7.959865348105066e-05, + "loss": 2.2084, + "step": 5974 + }, + { + "epoch": 1.119962511715089, + "grad_norm": 54268.31640625, + "learning_rate": 7.959231997166279e-05, + "loss": 2.2157, + "step": 5975 + }, + { + "epoch": 1.120149953139644, + "grad_norm": 51067.0703125, + "learning_rate": 7.958598573138298e-05, + "loss": 2.3023, + "step": 5976 + }, + { + "epoch": 1.1203373945641988, + "grad_norm": 52836.546875, + "learning_rate": 7.957965076036769e-05, + "loss": 2.231, + "step": 5977 + }, + { + "epoch": 1.1205248359887534, + "grad_norm": 49999.6484375, + "learning_rate": 7.957331505877335e-05, + "loss": 2.1406, + "step": 5978 + }, + { + "epoch": 1.1207122774133083, + "grad_norm": 52003.1171875, + "learning_rate": 7.956697862675649e-05, + "loss": 2.2645, + "step": 5979 + }, + { + "epoch": 1.120899718837863, + "grad_norm": 51512.2421875, + "learning_rate": 7.956064146447358e-05, + "loss": 2.2583, + "step": 5980 + }, + { + "epoch": 1.121087160262418, + "grad_norm": 51495.48828125, + "learning_rate": 7.955430357208116e-05, + "loss": 2.2459, + "step": 5981 + }, + { + "epoch": 1.1212746016869728, + "grad_norm": 52429.171875, + "learning_rate": 7.954796494973576e-05, + "loss": 2.2727, + "step": 5982 + }, + { + "epoch": 1.1214620431115276, + "grad_norm": 53087.91796875, + "learning_rate": 7.954162559759393e-05, + "loss": 2.2625, + "step": 5983 + }, + { + "epoch": 1.1216494845360825, + "grad_norm": 48650.234375, + "learning_rate": 7.953528551581225e-05, + "loss": 2.2388, + "step": 5984 + }, + { + "epoch": 1.1218369259606373, + "grad_norm": 55695.3984375, + "learning_rate": 7.952894470454733e-05, + "loss": 2.2334, + "step": 5985 + }, + { + "epoch": 1.1220243673851922, + "grad_norm": 51994.09375, + "learning_rate": 7.952260316395574e-05, + "loss": 2.1908, + "step": 5986 + }, + { + "epoch": 1.122211808809747, + "grad_norm": 49545.28515625, + "learning_rate": 7.951626089419412e-05, + "loss": 2.3214, + "step": 5987 + }, + { + "epoch": 1.1223992502343019, + "grad_norm": 49757.8046875, + "learning_rate": 7.950991789541915e-05, + "loss": 2.2539, + "step": 5988 + }, + { + "epoch": 1.1225866916588565, + "grad_norm": 50075.14453125, + "learning_rate": 7.950357416778744e-05, + "loss": 2.2863, + "step": 5989 + }, + { + "epoch": 1.1227741330834113, + "grad_norm": 53486.77734375, + "learning_rate": 7.949722971145572e-05, + "loss": 2.2198, + "step": 5990 + }, + { + "epoch": 1.1229615745079662, + "grad_norm": 48093.78125, + "learning_rate": 7.949088452658067e-05, + "loss": 2.214, + "step": 5991 + }, + { + "epoch": 1.123149015932521, + "grad_norm": 46378.765625, + "learning_rate": 7.9484538613319e-05, + "loss": 2.231, + "step": 5992 + }, + { + "epoch": 1.1233364573570759, + "grad_norm": 48819.125, + "learning_rate": 7.947819197182746e-05, + "loss": 2.1984, + "step": 5993 + }, + { + "epoch": 1.1235238987816307, + "grad_norm": 54639.91796875, + "learning_rate": 7.94718446022628e-05, + "loss": 2.3231, + "step": 5994 + }, + { + "epoch": 1.1237113402061856, + "grad_norm": 47652.4921875, + "learning_rate": 7.946549650478177e-05, + "loss": 2.289, + "step": 5995 + }, + { + "epoch": 1.1238987816307404, + "grad_norm": 54391.3203125, + "learning_rate": 7.945914767954119e-05, + "loss": 2.294, + "step": 5996 + }, + { + "epoch": 1.1240862230552953, + "grad_norm": 54706.65625, + "learning_rate": 7.945279812669785e-05, + "loss": 2.2468, + "step": 5997 + }, + { + "epoch": 1.12427366447985, + "grad_norm": 52760.35546875, + "learning_rate": 7.94464478464086e-05, + "loss": 2.2161, + "step": 5998 + }, + { + "epoch": 1.124461105904405, + "grad_norm": 50501.53515625, + "learning_rate": 7.944009683883027e-05, + "loss": 2.1901, + "step": 5999 + }, + { + "epoch": 1.1246485473289598, + "grad_norm": 51572.640625, + "learning_rate": 7.943374510411969e-05, + "loss": 2.188, + "step": 6000 + }, + { + "epoch": 1.1246485473289598, + "eval_loss": 2.311718225479126, + "eval_runtime": 128.8022, + "eval_samples_per_second": 39.2, + "eval_steps_per_second": 1.964, + "step": 6000 + }, + { + "epoch": 1.1248359887535144, + "grad_norm": 49281.1640625, + "learning_rate": 7.94273926424338e-05, + "loss": 2.2544, + "step": 6001 + }, + { + "epoch": 1.1250234301780693, + "grad_norm": 48296.4765625, + "learning_rate": 7.942103945392942e-05, + "loss": 2.2046, + "step": 6002 + }, + { + "epoch": 1.1252108716026241, + "grad_norm": 50107.17578125, + "learning_rate": 7.941468553876354e-05, + "loss": 2.2238, + "step": 6003 + }, + { + "epoch": 1.125398313027179, + "grad_norm": 53341.2109375, + "learning_rate": 7.940833089709307e-05, + "loss": 2.2653, + "step": 6004 + }, + { + "epoch": 1.1255857544517338, + "grad_norm": 48806.546875, + "learning_rate": 7.940197552907495e-05, + "loss": 2.3074, + "step": 6005 + }, + { + "epoch": 1.1257731958762887, + "grad_norm": 49885.6484375, + "learning_rate": 7.939561943486615e-05, + "loss": 2.165, + "step": 6006 + }, + { + "epoch": 1.1259606373008435, + "grad_norm": 47040.859375, + "learning_rate": 7.938926261462366e-05, + "loss": 2.2871, + "step": 6007 + }, + { + "epoch": 1.1261480787253983, + "grad_norm": 51340.1875, + "learning_rate": 7.93829050685045e-05, + "loss": 2.3615, + "step": 6008 + }, + { + "epoch": 1.1263355201499532, + "grad_norm": 54985.1875, + "learning_rate": 7.937654679666567e-05, + "loss": 2.1925, + "step": 6009 + }, + { + "epoch": 1.126522961574508, + "grad_norm": 52236.1640625, + "learning_rate": 7.937018779926421e-05, + "loss": 2.2543, + "step": 6010 + }, + { + "epoch": 1.1267104029990629, + "grad_norm": 51074.46484375, + "learning_rate": 7.936382807645721e-05, + "loss": 2.282, + "step": 6011 + }, + { + "epoch": 1.1268978444236177, + "grad_norm": 53600.19140625, + "learning_rate": 7.935746762840172e-05, + "loss": 2.1641, + "step": 6012 + }, + { + "epoch": 1.1270852858481724, + "grad_norm": 52394.52734375, + "learning_rate": 7.935110645525483e-05, + "loss": 2.1368, + "step": 6013 + }, + { + "epoch": 1.1272727272727272, + "grad_norm": 53108.85546875, + "learning_rate": 7.934474455717369e-05, + "loss": 2.2545, + "step": 6014 + }, + { + "epoch": 1.127460168697282, + "grad_norm": 50485.7421875, + "learning_rate": 7.933838193431538e-05, + "loss": 2.226, + "step": 6015 + }, + { + "epoch": 1.127647610121837, + "grad_norm": 53351.2578125, + "learning_rate": 7.93320185868371e-05, + "loss": 2.1313, + "step": 6016 + }, + { + "epoch": 1.1278350515463917, + "grad_norm": 50627.59375, + "learning_rate": 7.932565451489598e-05, + "loss": 2.2839, + "step": 6017 + }, + { + "epoch": 1.1280224929709466, + "grad_norm": 63055.97265625, + "learning_rate": 7.931928971864921e-05, + "loss": 2.2169, + "step": 6018 + }, + { + "epoch": 1.1282099343955014, + "grad_norm": 49143.72265625, + "learning_rate": 7.931292419825398e-05, + "loss": 2.2593, + "step": 6019 + }, + { + "epoch": 1.1283973758200563, + "grad_norm": 50114.76953125, + "learning_rate": 7.930655795386756e-05, + "loss": 2.1793, + "step": 6020 + }, + { + "epoch": 1.1285848172446111, + "grad_norm": 46990.10546875, + "learning_rate": 7.930019098564713e-05, + "loss": 2.3172, + "step": 6021 + }, + { + "epoch": 1.128772258669166, + "grad_norm": 52126.8515625, + "learning_rate": 7.929382329374998e-05, + "loss": 2.2614, + "step": 6022 + }, + { + "epoch": 1.1289597000937208, + "grad_norm": 49716.50390625, + "learning_rate": 7.928745487833338e-05, + "loss": 2.2453, + "step": 6023 + }, + { + "epoch": 1.1291471415182754, + "grad_norm": 51573.76171875, + "learning_rate": 7.928108573955462e-05, + "loss": 2.2275, + "step": 6024 + }, + { + "epoch": 1.1293345829428303, + "grad_norm": 51145.9453125, + "learning_rate": 7.927471587757098e-05, + "loss": 2.1513, + "step": 6025 + }, + { + "epoch": 1.1295220243673851, + "grad_norm": 54841.8984375, + "learning_rate": 7.926834529253984e-05, + "loss": 2.2408, + "step": 6026 + }, + { + "epoch": 1.12970946579194, + "grad_norm": 48138.97265625, + "learning_rate": 7.926197398461852e-05, + "loss": 2.2553, + "step": 6027 + }, + { + "epoch": 1.1298969072164948, + "grad_norm": 49003.109375, + "learning_rate": 7.925560195396435e-05, + "loss": 2.1771, + "step": 6028 + }, + { + "epoch": 1.1300843486410497, + "grad_norm": 47338.9375, + "learning_rate": 7.924922920073478e-05, + "loss": 2.2386, + "step": 6029 + }, + { + "epoch": 1.1302717900656045, + "grad_norm": 49367.11328125, + "learning_rate": 7.924285572508714e-05, + "loss": 2.2538, + "step": 6030 + }, + { + "epoch": 1.1304592314901594, + "grad_norm": 54133.828125, + "learning_rate": 7.92364815271789e-05, + "loss": 2.1921, + "step": 6031 + }, + { + "epoch": 1.1306466729147142, + "grad_norm": 49420.31640625, + "learning_rate": 7.923010660716746e-05, + "loss": 2.1974, + "step": 6032 + }, + { + "epoch": 1.130834114339269, + "grad_norm": 51466.95703125, + "learning_rate": 7.922373096521029e-05, + "loss": 2.2192, + "step": 6033 + }, + { + "epoch": 1.131021555763824, + "grad_norm": 53436.50390625, + "learning_rate": 7.921735460146485e-05, + "loss": 2.2593, + "step": 6034 + }, + { + "epoch": 1.1312089971883785, + "grad_norm": 50891.66796875, + "learning_rate": 7.921097751608863e-05, + "loss": 2.2747, + "step": 6035 + }, + { + "epoch": 1.1313964386129334, + "grad_norm": 51345.84375, + "learning_rate": 7.920459970923913e-05, + "loss": 2.3811, + "step": 6036 + }, + { + "epoch": 1.1315838800374882, + "grad_norm": 52074.2421875, + "learning_rate": 7.91982211810739e-05, + "loss": 2.2161, + "step": 6037 + }, + { + "epoch": 1.131771321462043, + "grad_norm": 48837.85546875, + "learning_rate": 7.919184193175045e-05, + "loss": 2.2042, + "step": 6038 + }, + { + "epoch": 1.131958762886598, + "grad_norm": 51359.79296875, + "learning_rate": 7.918546196142637e-05, + "loss": 2.2338, + "step": 6039 + }, + { + "epoch": 1.1321462043111528, + "grad_norm": 52787.328125, + "learning_rate": 7.91790812702592e-05, + "loss": 2.2919, + "step": 6040 + }, + { + "epoch": 1.1323336457357076, + "grad_norm": 48303.484375, + "learning_rate": 7.917269985840656e-05, + "loss": 2.2606, + "step": 6041 + }, + { + "epoch": 1.1325210871602625, + "grad_norm": 54838.4921875, + "learning_rate": 7.916631772602607e-05, + "loss": 2.2948, + "step": 6042 + }, + { + "epoch": 1.1327085285848173, + "grad_norm": 55328.6640625, + "learning_rate": 7.915993487327533e-05, + "loss": 2.257, + "step": 6043 + }, + { + "epoch": 1.1328959700093721, + "grad_norm": 49589.78125, + "learning_rate": 7.915355130031201e-05, + "loss": 2.239, + "step": 6044 + }, + { + "epoch": 1.133083411433927, + "grad_norm": 52220.09765625, + "learning_rate": 7.914716700729377e-05, + "loss": 2.2476, + "step": 6045 + }, + { + "epoch": 1.1332708528584816, + "grad_norm": 50035.265625, + "learning_rate": 7.914078199437829e-05, + "loss": 2.1728, + "step": 6046 + }, + { + "epoch": 1.1334582942830365, + "grad_norm": 52183.87109375, + "learning_rate": 7.913439626172328e-05, + "loss": 2.2952, + "step": 6047 + }, + { + "epoch": 1.1336457357075913, + "grad_norm": 49130.10546875, + "learning_rate": 7.912800980948645e-05, + "loss": 2.237, + "step": 6048 + }, + { + "epoch": 1.1338331771321462, + "grad_norm": 46144.44921875, + "learning_rate": 7.912162263782555e-05, + "loss": 2.2643, + "step": 6049 + }, + { + "epoch": 1.134020618556701, + "grad_norm": 53698.3203125, + "learning_rate": 7.911523474689834e-05, + "loss": 2.3275, + "step": 6050 + }, + { + "epoch": 1.1342080599812558, + "grad_norm": 51716.79296875, + "learning_rate": 7.910884613686256e-05, + "loss": 2.2741, + "step": 6051 + }, + { + "epoch": 1.1343955014058107, + "grad_norm": 48641.828125, + "learning_rate": 7.910245680787603e-05, + "loss": 2.2889, + "step": 6052 + }, + { + "epoch": 1.1345829428303655, + "grad_norm": 52295.46875, + "learning_rate": 7.909606676009655e-05, + "loss": 2.3045, + "step": 6053 + }, + { + "epoch": 1.1347703842549204, + "grad_norm": 54753.921875, + "learning_rate": 7.908967599368193e-05, + "loss": 2.2412, + "step": 6054 + }, + { + "epoch": 1.1349578256794752, + "grad_norm": 51249.74609375, + "learning_rate": 7.908328450879005e-05, + "loss": 2.2593, + "step": 6055 + }, + { + "epoch": 1.13514526710403, + "grad_norm": 49162.65625, + "learning_rate": 7.907689230557874e-05, + "loss": 2.3273, + "step": 6056 + }, + { + "epoch": 1.1353327085285847, + "grad_norm": 51184.5546875, + "learning_rate": 7.907049938420589e-05, + "loss": 2.2346, + "step": 6057 + }, + { + "epoch": 1.1355201499531398, + "grad_norm": 55579.0078125, + "learning_rate": 7.906410574482938e-05, + "loss": 2.2816, + "step": 6058 + }, + { + "epoch": 1.1357075913776944, + "grad_norm": 50200.6484375, + "learning_rate": 7.905771138760717e-05, + "loss": 2.1324, + "step": 6059 + }, + { + "epoch": 1.1358950328022492, + "grad_norm": 49857.76171875, + "learning_rate": 7.905131631269713e-05, + "loss": 2.2482, + "step": 6060 + }, + { + "epoch": 1.136082474226804, + "grad_norm": 50741.7734375, + "learning_rate": 7.904492052025727e-05, + "loss": 2.2131, + "step": 6061 + }, + { + "epoch": 1.136269915651359, + "grad_norm": 53732.734375, + "learning_rate": 7.903852401044552e-05, + "loss": 2.1772, + "step": 6062 + }, + { + "epoch": 1.1364573570759138, + "grad_norm": 52510.70703125, + "learning_rate": 7.903212678341987e-05, + "loss": 2.2762, + "step": 6063 + }, + { + "epoch": 1.1366447985004686, + "grad_norm": 48722.11328125, + "learning_rate": 7.902572883933834e-05, + "loss": 2.2129, + "step": 6064 + }, + { + "epoch": 1.1368322399250235, + "grad_norm": 54731.234375, + "learning_rate": 7.901933017835895e-05, + "loss": 2.2348, + "step": 6065 + }, + { + "epoch": 1.1370196813495783, + "grad_norm": 51921.23828125, + "learning_rate": 7.90129308006397e-05, + "loss": 2.2872, + "step": 6066 + }, + { + "epoch": 1.1372071227741332, + "grad_norm": 49061.06640625, + "learning_rate": 7.900653070633869e-05, + "loss": 2.2621, + "step": 6067 + }, + { + "epoch": 1.1373945641986878, + "grad_norm": 54001.58984375, + "learning_rate": 7.900012989561398e-05, + "loss": 2.1635, + "step": 6068 + }, + { + "epoch": 1.1375820056232429, + "grad_norm": 51883.19140625, + "learning_rate": 7.899372836862366e-05, + "loss": 2.1847, + "step": 6069 + }, + { + "epoch": 1.1377694470477975, + "grad_norm": 50640.25390625, + "learning_rate": 7.898732612552584e-05, + "loss": 2.1703, + "step": 6070 + }, + { + "epoch": 1.1379568884723523, + "grad_norm": 54235.76171875, + "learning_rate": 7.898092316647865e-05, + "loss": 2.2122, + "step": 6071 + }, + { + "epoch": 1.1381443298969072, + "grad_norm": 52404.38671875, + "learning_rate": 7.897451949164024e-05, + "loss": 2.301, + "step": 6072 + }, + { + "epoch": 1.138331771321462, + "grad_norm": 55928.1015625, + "learning_rate": 7.896811510116873e-05, + "loss": 2.2369, + "step": 6073 + }, + { + "epoch": 1.1385192127460169, + "grad_norm": 53495.41796875, + "learning_rate": 7.896170999522237e-05, + "loss": 2.2567, + "step": 6074 + }, + { + "epoch": 1.1387066541705717, + "grad_norm": 48583.65234375, + "learning_rate": 7.89553041739593e-05, + "loss": 2.214, + "step": 6075 + }, + { + "epoch": 1.1388940955951266, + "grad_norm": 52254.99609375, + "learning_rate": 7.894889763753776e-05, + "loss": 2.2225, + "step": 6076 + }, + { + "epoch": 1.1390815370196814, + "grad_norm": 52263.65234375, + "learning_rate": 7.8942490386116e-05, + "loss": 2.1846, + "step": 6077 + }, + { + "epoch": 1.1392689784442362, + "grad_norm": 50333.59765625, + "learning_rate": 7.893608241985224e-05, + "loss": 2.3206, + "step": 6078 + }, + { + "epoch": 1.139456419868791, + "grad_norm": 47326.328125, + "learning_rate": 7.892967373890476e-05, + "loss": 2.314, + "step": 6079 + }, + { + "epoch": 1.139643861293346, + "grad_norm": 52379.04296875, + "learning_rate": 7.892326434343184e-05, + "loss": 2.2221, + "step": 6080 + }, + { + "epoch": 1.1398313027179006, + "grad_norm": 50816.84765625, + "learning_rate": 7.891685423359181e-05, + "loss": 2.3152, + "step": 6081 + }, + { + "epoch": 1.1400187441424554, + "grad_norm": 51183.17578125, + "learning_rate": 7.891044340954297e-05, + "loss": 2.2211, + "step": 6082 + }, + { + "epoch": 1.1402061855670103, + "grad_norm": 50462.6796875, + "learning_rate": 7.890403187144364e-05, + "loss": 2.2586, + "step": 6083 + }, + { + "epoch": 1.140393626991565, + "grad_norm": 53165.46875, + "learning_rate": 7.889761961945221e-05, + "loss": 2.2061, + "step": 6084 + }, + { + "epoch": 1.14058106841612, + "grad_norm": 56619.19140625, + "learning_rate": 7.889120665372705e-05, + "loss": 2.1742, + "step": 6085 + }, + { + "epoch": 1.1407685098406748, + "grad_norm": 55510.04296875, + "learning_rate": 7.888479297442652e-05, + "loss": 2.2721, + "step": 6086 + }, + { + "epoch": 1.1409559512652296, + "grad_norm": 51705.55078125, + "learning_rate": 7.887837858170907e-05, + "loss": 2.1626, + "step": 6087 + }, + { + "epoch": 1.1411433926897845, + "grad_norm": 50653.15625, + "learning_rate": 7.88719634757331e-05, + "loss": 2.1301, + "step": 6088 + }, + { + "epoch": 1.1413308341143393, + "grad_norm": 50542.765625, + "learning_rate": 7.886554765665708e-05, + "loss": 2.2312, + "step": 6089 + }, + { + "epoch": 1.1415182755388942, + "grad_norm": 52134.609375, + "learning_rate": 7.885913112463945e-05, + "loss": 2.2681, + "step": 6090 + }, + { + "epoch": 1.141705716963449, + "grad_norm": 50220.26171875, + "learning_rate": 7.88527138798387e-05, + "loss": 2.2121, + "step": 6091 + }, + { + "epoch": 1.1418931583880036, + "grad_norm": 55381.6953125, + "learning_rate": 7.884629592241332e-05, + "loss": 2.2065, + "step": 6092 + }, + { + "epoch": 1.1420805998125585, + "grad_norm": 55029.5625, + "learning_rate": 7.883987725252183e-05, + "loss": 2.2934, + "step": 6093 + }, + { + "epoch": 1.1422680412371133, + "grad_norm": 52598.79296875, + "learning_rate": 7.883345787032277e-05, + "loss": 2.1982, + "step": 6094 + }, + { + "epoch": 1.1424554826616682, + "grad_norm": 51270.8359375, + "learning_rate": 7.882703777597469e-05, + "loss": 2.2528, + "step": 6095 + }, + { + "epoch": 1.142642924086223, + "grad_norm": 58565.73828125, + "learning_rate": 7.882061696963614e-05, + "loss": 2.1591, + "step": 6096 + }, + { + "epoch": 1.1428303655107779, + "grad_norm": 57574.18359375, + "learning_rate": 7.881419545146572e-05, + "loss": 2.1863, + "step": 6097 + }, + { + "epoch": 1.1430178069353327, + "grad_norm": 48831.17578125, + "learning_rate": 7.880777322162203e-05, + "loss": 2.2149, + "step": 6098 + }, + { + "epoch": 1.1432052483598876, + "grad_norm": 48479.1796875, + "learning_rate": 7.880135028026369e-05, + "loss": 2.2246, + "step": 6099 + }, + { + "epoch": 1.1433926897844424, + "grad_norm": 50914.70703125, + "learning_rate": 7.879492662754934e-05, + "loss": 2.2533, + "step": 6100 + }, + { + "epoch": 1.1435801312089973, + "grad_norm": 52123.05078125, + "learning_rate": 7.878850226363763e-05, + "loss": 2.226, + "step": 6101 + }, + { + "epoch": 1.143767572633552, + "grad_norm": 51010.27734375, + "learning_rate": 7.878207718868725e-05, + "loss": 2.2279, + "step": 6102 + }, + { + "epoch": 1.1439550140581067, + "grad_norm": 52548.25390625, + "learning_rate": 7.877565140285687e-05, + "loss": 2.2706, + "step": 6103 + }, + { + "epoch": 1.1441424554826616, + "grad_norm": 50589.875, + "learning_rate": 7.876922490630521e-05, + "loss": 2.1792, + "step": 6104 + }, + { + "epoch": 1.1443298969072164, + "grad_norm": 51018.953125, + "learning_rate": 7.8762797699191e-05, + "loss": 2.2587, + "step": 6105 + }, + { + "epoch": 1.1445173383317713, + "grad_norm": 49324.734375, + "learning_rate": 7.875636978167299e-05, + "loss": 2.2958, + "step": 6106 + }, + { + "epoch": 1.1447047797563261, + "grad_norm": 50590.9140625, + "learning_rate": 7.87499411539099e-05, + "loss": 2.261, + "step": 6107 + }, + { + "epoch": 1.144892221180881, + "grad_norm": 53275.4921875, + "learning_rate": 7.874351181606056e-05, + "loss": 2.2853, + "step": 6108 + }, + { + "epoch": 1.1450796626054358, + "grad_norm": 47101.640625, + "learning_rate": 7.873708176828374e-05, + "loss": 2.2157, + "step": 6109 + }, + { + "epoch": 1.1452671040299907, + "grad_norm": 52571.203125, + "learning_rate": 7.873065101073826e-05, + "loss": 2.2732, + "step": 6110 + }, + { + "epoch": 1.1454545454545455, + "grad_norm": 51837.51171875, + "learning_rate": 7.872421954358295e-05, + "loss": 2.2139, + "step": 6111 + }, + { + "epoch": 1.1456419868791003, + "grad_norm": 52485.08203125, + "learning_rate": 7.871778736697666e-05, + "loss": 2.3016, + "step": 6112 + }, + { + "epoch": 1.1458294283036552, + "grad_norm": 52518.90234375, + "learning_rate": 7.871135448107825e-05, + "loss": 2.2274, + "step": 6113 + }, + { + "epoch": 1.1460168697282098, + "grad_norm": 49562.50390625, + "learning_rate": 7.87049208860466e-05, + "loss": 2.2285, + "step": 6114 + }, + { + "epoch": 1.1462043111527647, + "grad_norm": 50613.453125, + "learning_rate": 7.869848658204065e-05, + "loss": 2.2228, + "step": 6115 + }, + { + "epoch": 1.1463917525773195, + "grad_norm": 51363.9609375, + "learning_rate": 7.869205156921926e-05, + "loss": 2.2167, + "step": 6116 + }, + { + "epoch": 1.1465791940018744, + "grad_norm": 56149.3203125, + "learning_rate": 7.868561584774141e-05, + "loss": 2.2018, + "step": 6117 + }, + { + "epoch": 1.1467666354264292, + "grad_norm": 48933.5625, + "learning_rate": 7.867917941776604e-05, + "loss": 2.2325, + "step": 6118 + }, + { + "epoch": 1.146954076850984, + "grad_norm": 53024.078125, + "learning_rate": 7.867274227945212e-05, + "loss": 2.2681, + "step": 6119 + }, + { + "epoch": 1.147141518275539, + "grad_norm": 49744.14453125, + "learning_rate": 7.866630443295864e-05, + "loss": 2.1983, + "step": 6120 + }, + { + "epoch": 1.1473289597000937, + "grad_norm": 47197.23828125, + "learning_rate": 7.865986587844461e-05, + "loss": 2.2331, + "step": 6121 + }, + { + "epoch": 1.1475164011246486, + "grad_norm": 50929.95703125, + "learning_rate": 7.865342661606903e-05, + "loss": 2.2216, + "step": 6122 + }, + { + "epoch": 1.1477038425492034, + "grad_norm": 49838.56640625, + "learning_rate": 7.864698664599099e-05, + "loss": 2.276, + "step": 6123 + }, + { + "epoch": 1.1478912839737583, + "grad_norm": 47167.53515625, + "learning_rate": 7.864054596836952e-05, + "loss": 2.2361, + "step": 6124 + }, + { + "epoch": 1.148078725398313, + "grad_norm": 51637.56640625, + "learning_rate": 7.863410458336369e-05, + "loss": 2.2999, + "step": 6125 + }, + { + "epoch": 1.148266166822868, + "grad_norm": 49990.6640625, + "learning_rate": 7.86276624911326e-05, + "loss": 2.2683, + "step": 6126 + }, + { + "epoch": 1.1484536082474226, + "grad_norm": 60033.8984375, + "learning_rate": 7.862121969183535e-05, + "loss": 2.2639, + "step": 6127 + }, + { + "epoch": 1.1486410496719774, + "grad_norm": 50321.5078125, + "learning_rate": 7.86147761856311e-05, + "loss": 2.1886, + "step": 6128 + }, + { + "epoch": 1.1488284910965323, + "grad_norm": 52697.45703125, + "learning_rate": 7.860833197267896e-05, + "loss": 2.2698, + "step": 6129 + }, + { + "epoch": 1.1490159325210871, + "grad_norm": 46109.6015625, + "learning_rate": 7.860188705313814e-05, + "loss": 2.241, + "step": 6130 + }, + { + "epoch": 1.149203373945642, + "grad_norm": 53852.16015625, + "learning_rate": 7.859544142716777e-05, + "loss": 2.2363, + "step": 6131 + }, + { + "epoch": 1.1493908153701968, + "grad_norm": 52166.796875, + "learning_rate": 7.858899509492707e-05, + "loss": 2.2031, + "step": 6132 + }, + { + "epoch": 1.1495782567947517, + "grad_norm": 48422.328125, + "learning_rate": 7.858254805657526e-05, + "loss": 2.2979, + "step": 6133 + }, + { + "epoch": 1.1497656982193065, + "grad_norm": 52132.0078125, + "learning_rate": 7.857610031227158e-05, + "loss": 2.2251, + "step": 6134 + }, + { + "epoch": 1.1499531396438614, + "grad_norm": 50394.1640625, + "learning_rate": 7.856965186217527e-05, + "loss": 2.2002, + "step": 6135 + }, + { + "epoch": 1.1501405810684162, + "grad_norm": 48859.15234375, + "learning_rate": 7.85632027064456e-05, + "loss": 2.3171, + "step": 6136 + }, + { + "epoch": 1.150328022492971, + "grad_norm": 51765.171875, + "learning_rate": 7.855675284524185e-05, + "loss": 2.3149, + "step": 6137 + }, + { + "epoch": 1.1505154639175257, + "grad_norm": 49064.08984375, + "learning_rate": 7.855030227872333e-05, + "loss": 2.2444, + "step": 6138 + }, + { + "epoch": 1.1507029053420805, + "grad_norm": 49587.51171875, + "learning_rate": 7.854385100704937e-05, + "loss": 2.2817, + "step": 6139 + }, + { + "epoch": 1.1508903467666354, + "grad_norm": 50318.60546875, + "learning_rate": 7.853739903037929e-05, + "loss": 2.229, + "step": 6140 + }, + { + "epoch": 1.1510777881911902, + "grad_norm": 51365.75, + "learning_rate": 7.853094634887246e-05, + "loss": 2.3516, + "step": 6141 + }, + { + "epoch": 1.151265229615745, + "grad_norm": 52159.703125, + "learning_rate": 7.852449296268822e-05, + "loss": 2.23, + "step": 6142 + }, + { + "epoch": 1.1514526710403, + "grad_norm": 49042.25390625, + "learning_rate": 7.851803887198602e-05, + "loss": 2.276, + "step": 6143 + }, + { + "epoch": 1.1516401124648548, + "grad_norm": 55700.87109375, + "learning_rate": 7.851158407692522e-05, + "loss": 2.168, + "step": 6144 + }, + { + "epoch": 1.1518275538894096, + "grad_norm": 53948.43359375, + "learning_rate": 7.850512857766526e-05, + "loss": 2.2458, + "step": 6145 + }, + { + "epoch": 1.1520149953139645, + "grad_norm": 47552.6640625, + "learning_rate": 7.849867237436557e-05, + "loss": 2.2905, + "step": 6146 + }, + { + "epoch": 1.1522024367385193, + "grad_norm": 50241.87109375, + "learning_rate": 7.849221546718563e-05, + "loss": 2.2407, + "step": 6147 + }, + { + "epoch": 1.1523898781630741, + "grad_norm": 47653.1328125, + "learning_rate": 7.848575785628491e-05, + "loss": 2.2733, + "step": 6148 + }, + { + "epoch": 1.1525773195876288, + "grad_norm": 51026.15625, + "learning_rate": 7.847929954182291e-05, + "loss": 2.3094, + "step": 6149 + }, + { + "epoch": 1.1527647610121836, + "grad_norm": 52223.92578125, + "learning_rate": 7.847284052395913e-05, + "loss": 2.2606, + "step": 6150 + }, + { + "epoch": 1.1529522024367385, + "grad_norm": 50302.0703125, + "learning_rate": 7.84663808028531e-05, + "loss": 2.2705, + "step": 6151 + }, + { + "epoch": 1.1531396438612933, + "grad_norm": 49027.515625, + "learning_rate": 7.845992037866438e-05, + "loss": 2.2889, + "step": 6152 + }, + { + "epoch": 1.1533270852858482, + "grad_norm": 46626.94921875, + "learning_rate": 7.845345925155252e-05, + "loss": 2.2476, + "step": 6153 + }, + { + "epoch": 1.153514526710403, + "grad_norm": 47197.109375, + "learning_rate": 7.84469974216771e-05, + "loss": 2.2319, + "step": 6154 + }, + { + "epoch": 1.1537019681349578, + "grad_norm": 52937.5078125, + "learning_rate": 7.844053488919774e-05, + "loss": 2.1887, + "step": 6155 + }, + { + "epoch": 1.1538894095595127, + "grad_norm": 47665.75, + "learning_rate": 7.843407165427403e-05, + "loss": 2.2489, + "step": 6156 + }, + { + "epoch": 1.1540768509840675, + "grad_norm": 50283.94140625, + "learning_rate": 7.842760771706562e-05, + "loss": 2.2004, + "step": 6157 + }, + { + "epoch": 1.1542642924086224, + "grad_norm": 46447.66015625, + "learning_rate": 7.842114307773216e-05, + "loss": 2.2016, + "step": 6158 + }, + { + "epoch": 1.1544517338331772, + "grad_norm": 54221.0546875, + "learning_rate": 7.841467773643331e-05, + "loss": 2.1552, + "step": 6159 + }, + { + "epoch": 1.1546391752577319, + "grad_norm": 50405.92578125, + "learning_rate": 7.840821169332877e-05, + "loss": 2.2931, + "step": 6160 + }, + { + "epoch": 1.1548266166822867, + "grad_norm": 52146.1953125, + "learning_rate": 7.840174494857822e-05, + "loss": 2.2841, + "step": 6161 + }, + { + "epoch": 1.1550140581068415, + "grad_norm": 49032.8515625, + "learning_rate": 7.839527750234139e-05, + "loss": 2.2374, + "step": 6162 + }, + { + "epoch": 1.1552014995313964, + "grad_norm": 52349.05859375, + "learning_rate": 7.838880935477803e-05, + "loss": 2.186, + "step": 6163 + }, + { + "epoch": 1.1553889409559512, + "grad_norm": 51708.7109375, + "learning_rate": 7.838234050604789e-05, + "loss": 2.3359, + "step": 6164 + }, + { + "epoch": 1.155576382380506, + "grad_norm": 48151.8359375, + "learning_rate": 7.837587095631073e-05, + "loss": 2.2134, + "step": 6165 + }, + { + "epoch": 1.155763823805061, + "grad_norm": 45546.796875, + "learning_rate": 7.836940070572635e-05, + "loss": 2.2129, + "step": 6166 + }, + { + "epoch": 1.1559512652296158, + "grad_norm": 50872.48828125, + "learning_rate": 7.836292975445456e-05, + "loss": 2.1999, + "step": 6167 + }, + { + "epoch": 1.1561387066541706, + "grad_norm": 51658.2421875, + "learning_rate": 7.835645810265517e-05, + "loss": 2.2082, + "step": 6168 + }, + { + "epoch": 1.1563261480787255, + "grad_norm": 50015.29296875, + "learning_rate": 7.834998575048802e-05, + "loss": 2.2257, + "step": 6169 + }, + { + "epoch": 1.1565135895032803, + "grad_norm": 59428.4296875, + "learning_rate": 7.8343512698113e-05, + "loss": 2.2564, + "step": 6170 + }, + { + "epoch": 1.156701030927835, + "grad_norm": 47541.64453125, + "learning_rate": 7.833703894568995e-05, + "loss": 2.2855, + "step": 6171 + }, + { + "epoch": 1.1568884723523898, + "grad_norm": 49221.7578125, + "learning_rate": 7.833056449337878e-05, + "loss": 2.2407, + "step": 6172 + }, + { + "epoch": 1.1570759137769446, + "grad_norm": 51765.79296875, + "learning_rate": 7.832408934133938e-05, + "loss": 2.2666, + "step": 6173 + }, + { + "epoch": 1.1572633552014995, + "grad_norm": 54226.125, + "learning_rate": 7.831761348973173e-05, + "loss": 2.1807, + "step": 6174 + }, + { + "epoch": 1.1574507966260543, + "grad_norm": 51960.4140625, + "learning_rate": 7.831113693871573e-05, + "loss": 2.2584, + "step": 6175 + }, + { + "epoch": 1.1576382380506092, + "grad_norm": 53136.49609375, + "learning_rate": 7.830465968845134e-05, + "loss": 2.1729, + "step": 6176 + }, + { + "epoch": 1.157825679475164, + "grad_norm": 52723.3203125, + "learning_rate": 7.829818173909857e-05, + "loss": 2.2832, + "step": 6177 + }, + { + "epoch": 1.1580131208997189, + "grad_norm": 48873.46484375, + "learning_rate": 7.829170309081739e-05, + "loss": 2.2413, + "step": 6178 + }, + { + "epoch": 1.1582005623242737, + "grad_norm": 49000.75390625, + "learning_rate": 7.828522374376783e-05, + "loss": 2.2059, + "step": 6179 + }, + { + "epoch": 1.1583880037488286, + "grad_norm": 55437.98828125, + "learning_rate": 7.827874369810993e-05, + "loss": 2.2717, + "step": 6180 + }, + { + "epoch": 1.1585754451733834, + "grad_norm": 47954.4609375, + "learning_rate": 7.827226295400369e-05, + "loss": 2.2445, + "step": 6181 + }, + { + "epoch": 1.158762886597938, + "grad_norm": 52950.75390625, + "learning_rate": 7.826578151160924e-05, + "loss": 2.2241, + "step": 6182 + }, + { + "epoch": 1.158950328022493, + "grad_norm": 49213.0703125, + "learning_rate": 7.825929937108662e-05, + "loss": 2.2061, + "step": 6183 + }, + { + "epoch": 1.1591377694470477, + "grad_norm": 58787.3046875, + "learning_rate": 7.825281653259595e-05, + "loss": 2.1956, + "step": 6184 + }, + { + "epoch": 1.1593252108716026, + "grad_norm": 52793.703125, + "learning_rate": 7.824633299629733e-05, + "loss": 2.3264, + "step": 6185 + }, + { + "epoch": 1.1595126522961574, + "grad_norm": 53809.1640625, + "learning_rate": 7.823984876235091e-05, + "loss": 2.3499, + "step": 6186 + }, + { + "epoch": 1.1597000937207123, + "grad_norm": 52829.625, + "learning_rate": 7.823336383091685e-05, + "loss": 2.3825, + "step": 6187 + }, + { + "epoch": 1.159887535145267, + "grad_norm": 53577.9765625, + "learning_rate": 7.82268782021553e-05, + "loss": 2.2716, + "step": 6188 + }, + { + "epoch": 1.160074976569822, + "grad_norm": 53981.4609375, + "learning_rate": 7.822039187622645e-05, + "loss": 2.347, + "step": 6189 + }, + { + "epoch": 1.1602624179943768, + "grad_norm": 49555.734375, + "learning_rate": 7.821390485329053e-05, + "loss": 2.2052, + "step": 6190 + }, + { + "epoch": 1.1604498594189316, + "grad_norm": 51485.5546875, + "learning_rate": 7.820741713350773e-05, + "loss": 2.2116, + "step": 6191 + }, + { + "epoch": 1.1606373008434865, + "grad_norm": 54034.8984375, + "learning_rate": 7.82009287170383e-05, + "loss": 2.256, + "step": 6192 + }, + { + "epoch": 1.1608247422680413, + "grad_norm": 50307.86328125, + "learning_rate": 7.819443960404248e-05, + "loss": 2.2859, + "step": 6193 + }, + { + "epoch": 1.1610121836925962, + "grad_norm": 54978.56640625, + "learning_rate": 7.818794979468056e-05, + "loss": 2.3189, + "step": 6194 + }, + { + "epoch": 1.1611996251171508, + "grad_norm": 53983.83984375, + "learning_rate": 7.818145928911285e-05, + "loss": 2.3034, + "step": 6195 + }, + { + "epoch": 1.1613870665417056, + "grad_norm": 49048.24609375, + "learning_rate": 7.81749680874996e-05, + "loss": 2.2494, + "step": 6196 + }, + { + "epoch": 1.1615745079662605, + "grad_norm": 52207.26953125, + "learning_rate": 7.816847619000118e-05, + "loss": 2.2935, + "step": 6197 + }, + { + "epoch": 1.1617619493908153, + "grad_norm": 50859.3125, + "learning_rate": 7.816198359677793e-05, + "loss": 2.2561, + "step": 6198 + }, + { + "epoch": 1.1619493908153702, + "grad_norm": 54162.9609375, + "learning_rate": 7.815549030799019e-05, + "loss": 2.2862, + "step": 6199 + }, + { + "epoch": 1.162136832239925, + "grad_norm": 51236.28125, + "learning_rate": 7.814899632379832e-05, + "loss": 2.2466, + "step": 6200 + }, + { + "epoch": 1.1623242736644799, + "grad_norm": 50256.53125, + "learning_rate": 7.81425016443628e-05, + "loss": 2.2128, + "step": 6201 + }, + { + "epoch": 1.1625117150890347, + "grad_norm": 49652.296875, + "learning_rate": 7.813600626984392e-05, + "loss": 2.287, + "step": 6202 + }, + { + "epoch": 1.1626991565135896, + "grad_norm": 48472.54296875, + "learning_rate": 7.812951020040217e-05, + "loss": 2.2449, + "step": 6203 + }, + { + "epoch": 1.1628865979381444, + "grad_norm": 52582.15234375, + "learning_rate": 7.8123013436198e-05, + "loss": 2.2337, + "step": 6204 + }, + { + "epoch": 1.1630740393626993, + "grad_norm": 50880.125, + "learning_rate": 7.811651597739187e-05, + "loss": 2.2306, + "step": 6205 + }, + { + "epoch": 1.1632614807872539, + "grad_norm": 49837.28515625, + "learning_rate": 7.811001782414422e-05, + "loss": 2.2245, + "step": 6206 + }, + { + "epoch": 1.1634489222118087, + "grad_norm": 54133.0859375, + "learning_rate": 7.810351897661561e-05, + "loss": 2.2526, + "step": 6207 + }, + { + "epoch": 1.1636363636363636, + "grad_norm": 48063.546875, + "learning_rate": 7.80970194349665e-05, + "loss": 2.2855, + "step": 6208 + }, + { + "epoch": 1.1638238050609184, + "grad_norm": 53176.21484375, + "learning_rate": 7.809051919935744e-05, + "loss": 2.2744, + "step": 6209 + }, + { + "epoch": 1.1640112464854733, + "grad_norm": 50599.671875, + "learning_rate": 7.808401826994898e-05, + "loss": 2.2423, + "step": 6210 + }, + { + "epoch": 1.1641986879100281, + "grad_norm": 51058.140625, + "learning_rate": 7.807751664690166e-05, + "loss": 2.2019, + "step": 6211 + }, + { + "epoch": 1.164386129334583, + "grad_norm": 52857.78515625, + "learning_rate": 7.80710143303761e-05, + "loss": 2.247, + "step": 6212 + }, + { + "epoch": 1.1645735707591378, + "grad_norm": 48825.82421875, + "learning_rate": 7.806451132053287e-05, + "loss": 2.3006, + "step": 6213 + }, + { + "epoch": 1.1647610121836927, + "grad_norm": 51644.0, + "learning_rate": 7.805800761753261e-05, + "loss": 2.2893, + "step": 6214 + }, + { + "epoch": 1.1649484536082475, + "grad_norm": 48342.14453125, + "learning_rate": 7.805150322153594e-05, + "loss": 2.2101, + "step": 6215 + }, + { + "epoch": 1.1651358950328023, + "grad_norm": 57042.85546875, + "learning_rate": 7.80449981327035e-05, + "loss": 2.2918, + "step": 6216 + }, + { + "epoch": 1.165323336457357, + "grad_norm": 50920.0234375, + "learning_rate": 7.803849235119597e-05, + "loss": 2.2668, + "step": 6217 + }, + { + "epoch": 1.1655107778819118, + "grad_norm": 46882.03515625, + "learning_rate": 7.803198587717403e-05, + "loss": 2.2577, + "step": 6218 + }, + { + "epoch": 1.1656982193064667, + "grad_norm": 50940.5625, + "learning_rate": 7.802547871079838e-05, + "loss": 2.2482, + "step": 6219 + }, + { + "epoch": 1.1658856607310215, + "grad_norm": 53771.39453125, + "learning_rate": 7.801897085222976e-05, + "loss": 2.175, + "step": 6220 + }, + { + "epoch": 1.1660731021555764, + "grad_norm": 54535.54296875, + "learning_rate": 7.801246230162886e-05, + "loss": 2.3207, + "step": 6221 + }, + { + "epoch": 1.1662605435801312, + "grad_norm": 50739.48828125, + "learning_rate": 7.800595305915648e-05, + "loss": 2.3281, + "step": 6222 + }, + { + "epoch": 1.166447985004686, + "grad_norm": 52695.16015625, + "learning_rate": 7.799944312497336e-05, + "loss": 2.2483, + "step": 6223 + }, + { + "epoch": 1.166635426429241, + "grad_norm": 49654.35546875, + "learning_rate": 7.79929324992403e-05, + "loss": 2.224, + "step": 6224 + }, + { + "epoch": 1.1668228678537957, + "grad_norm": 56068.625, + "learning_rate": 7.79864211821181e-05, + "loss": 2.1499, + "step": 6225 + }, + { + "epoch": 1.1670103092783506, + "grad_norm": 56727.4296875, + "learning_rate": 7.797990917376758e-05, + "loss": 2.1827, + "step": 6226 + }, + { + "epoch": 1.1671977507029054, + "grad_norm": 49329.16015625, + "learning_rate": 7.797339647434957e-05, + "loss": 2.2782, + "step": 6227 + }, + { + "epoch": 1.16738519212746, + "grad_norm": 52197.921875, + "learning_rate": 7.796688308402495e-05, + "loss": 2.2619, + "step": 6228 + }, + { + "epoch": 1.167572633552015, + "grad_norm": 50348.1640625, + "learning_rate": 7.796036900295458e-05, + "loss": 2.2797, + "step": 6229 + }, + { + "epoch": 1.1677600749765698, + "grad_norm": 50167.63671875, + "learning_rate": 7.795385423129933e-05, + "loss": 2.1721, + "step": 6230 + }, + { + "epoch": 1.1679475164011246, + "grad_norm": 48279.3046875, + "learning_rate": 7.794733876922015e-05, + "loss": 2.2315, + "step": 6231 + }, + { + "epoch": 1.1681349578256794, + "grad_norm": 50390.71484375, + "learning_rate": 7.794082261687791e-05, + "loss": 2.2342, + "step": 6232 + }, + { + "epoch": 1.1683223992502343, + "grad_norm": 51232.50390625, + "learning_rate": 7.79343057744336e-05, + "loss": 2.2763, + "step": 6233 + }, + { + "epoch": 1.1685098406747891, + "grad_norm": 52963.08203125, + "learning_rate": 7.792778824204815e-05, + "loss": 2.2167, + "step": 6234 + }, + { + "epoch": 1.168697282099344, + "grad_norm": 48904.11328125, + "learning_rate": 7.792127001988251e-05, + "loss": 2.2317, + "step": 6235 + }, + { + "epoch": 1.1688847235238988, + "grad_norm": 52013.3203125, + "learning_rate": 7.791475110809774e-05, + "loss": 2.2406, + "step": 6236 + }, + { + "epoch": 1.1690721649484537, + "grad_norm": 51008.140625, + "learning_rate": 7.790823150685478e-05, + "loss": 2.2281, + "step": 6237 + }, + { + "epoch": 1.1692596063730085, + "grad_norm": 51888.45703125, + "learning_rate": 7.790171121631471e-05, + "loss": 2.2297, + "step": 6238 + }, + { + "epoch": 1.1694470477975631, + "grad_norm": 50078.2109375, + "learning_rate": 7.789519023663852e-05, + "loss": 2.176, + "step": 6239 + }, + { + "epoch": 1.1696344892221182, + "grad_norm": 49566.609375, + "learning_rate": 7.788866856798733e-05, + "loss": 2.2189, + "step": 6240 + }, + { + "epoch": 1.1698219306466728, + "grad_norm": 51336.41015625, + "learning_rate": 7.788214621052216e-05, + "loss": 2.2636, + "step": 6241 + }, + { + "epoch": 1.1700093720712277, + "grad_norm": 51455.87890625, + "learning_rate": 7.787562316440413e-05, + "loss": 2.1285, + "step": 6242 + }, + { + "epoch": 1.1701968134957825, + "grad_norm": 50833.73828125, + "learning_rate": 7.786909942979436e-05, + "loss": 2.2796, + "step": 6243 + }, + { + "epoch": 1.1703842549203374, + "grad_norm": 57439.328125, + "learning_rate": 7.786257500685396e-05, + "loss": 2.3401, + "step": 6244 + }, + { + "epoch": 1.1705716963448922, + "grad_norm": 54043.89453125, + "learning_rate": 7.785604989574409e-05, + "loss": 2.2244, + "step": 6245 + }, + { + "epoch": 1.170759137769447, + "grad_norm": 49087.546875, + "learning_rate": 7.784952409662589e-05, + "loss": 2.2153, + "step": 6246 + }, + { + "epoch": 1.170946579194002, + "grad_norm": 56218.82421875, + "learning_rate": 7.784299760966056e-05, + "loss": 2.2154, + "step": 6247 + }, + { + "epoch": 1.1711340206185568, + "grad_norm": 52597.4296875, + "learning_rate": 7.78364704350093e-05, + "loss": 2.2568, + "step": 6248 + }, + { + "epoch": 1.1713214620431116, + "grad_norm": 53182.859375, + "learning_rate": 7.78299425728333e-05, + "loss": 2.2591, + "step": 6249 + }, + { + "epoch": 1.1715089034676662, + "grad_norm": 50499.02734375, + "learning_rate": 7.782341402329379e-05, + "loss": 2.2277, + "step": 6250 + }, + { + "epoch": 1.1716963448922213, + "grad_norm": 53056.35546875, + "learning_rate": 7.781688478655205e-05, + "loss": 2.2864, + "step": 6251 + }, + { + "epoch": 1.171883786316776, + "grad_norm": 52344.25, + "learning_rate": 7.781035486276931e-05, + "loss": 2.2104, + "step": 6252 + }, + { + "epoch": 1.1720712277413308, + "grad_norm": 51826.890625, + "learning_rate": 7.780382425210686e-05, + "loss": 2.2681, + "step": 6253 + }, + { + "epoch": 1.1722586691658856, + "grad_norm": 50741.57421875, + "learning_rate": 7.779729295472599e-05, + "loss": 2.2924, + "step": 6254 + }, + { + "epoch": 1.1724461105904405, + "grad_norm": 49971.07421875, + "learning_rate": 7.779076097078804e-05, + "loss": 2.2621, + "step": 6255 + }, + { + "epoch": 1.1726335520149953, + "grad_norm": 51059.046875, + "learning_rate": 7.77842283004543e-05, + "loss": 2.2511, + "step": 6256 + }, + { + "epoch": 1.1728209934395502, + "grad_norm": 52593.11328125, + "learning_rate": 7.777769494388618e-05, + "loss": 2.1904, + "step": 6257 + }, + { + "epoch": 1.173008434864105, + "grad_norm": 50810.98828125, + "learning_rate": 7.777116090124498e-05, + "loss": 2.2565, + "step": 6258 + }, + { + "epoch": 1.1731958762886598, + "grad_norm": 51484.11328125, + "learning_rate": 7.776462617269212e-05, + "loss": 2.2543, + "step": 6259 + }, + { + "epoch": 1.1733833177132147, + "grad_norm": 49339.6875, + "learning_rate": 7.775809075838899e-05, + "loss": 2.2251, + "step": 6260 + }, + { + "epoch": 1.1735707591377695, + "grad_norm": 49577.01171875, + "learning_rate": 7.775155465849701e-05, + "loss": 2.3025, + "step": 6261 + }, + { + "epoch": 1.1737582005623244, + "grad_norm": 63000.55859375, + "learning_rate": 7.774501787317761e-05, + "loss": 2.3877, + "step": 6262 + }, + { + "epoch": 1.173945641986879, + "grad_norm": 49189.0234375, + "learning_rate": 7.773848040259224e-05, + "loss": 2.2281, + "step": 6263 + }, + { + "epoch": 1.1741330834114339, + "grad_norm": 52433.80859375, + "learning_rate": 7.773194224690237e-05, + "loss": 2.2907, + "step": 6264 + }, + { + "epoch": 1.1743205248359887, + "grad_norm": 48863.3046875, + "learning_rate": 7.772540340626948e-05, + "loss": 2.2272, + "step": 6265 + }, + { + "epoch": 1.1745079662605435, + "grad_norm": 49882.05078125, + "learning_rate": 7.771886388085508e-05, + "loss": 2.2639, + "step": 6266 + }, + { + "epoch": 1.1746954076850984, + "grad_norm": 49278.828125, + "learning_rate": 7.771232367082066e-05, + "loss": 2.2078, + "step": 6267 + }, + { + "epoch": 1.1748828491096532, + "grad_norm": 48992.42578125, + "learning_rate": 7.770578277632781e-05, + "loss": 2.2056, + "step": 6268 + }, + { + "epoch": 1.175070290534208, + "grad_norm": 52450.28515625, + "learning_rate": 7.7699241197538e-05, + "loss": 2.2685, + "step": 6269 + }, + { + "epoch": 1.175257731958763, + "grad_norm": 52743.68359375, + "learning_rate": 7.769269893461288e-05, + "loss": 2.2343, + "step": 6270 + }, + { + "epoch": 1.1754451733833178, + "grad_norm": 49977.7421875, + "learning_rate": 7.768615598771398e-05, + "loss": 2.2607, + "step": 6271 + }, + { + "epoch": 1.1756326148078726, + "grad_norm": 48425.9453125, + "learning_rate": 7.767961235700294e-05, + "loss": 2.2316, + "step": 6272 + }, + { + "epoch": 1.1758200562324275, + "grad_norm": 52133.015625, + "learning_rate": 7.767306804264135e-05, + "loss": 2.3335, + "step": 6273 + }, + { + "epoch": 1.176007497656982, + "grad_norm": 51132.16796875, + "learning_rate": 7.766652304479087e-05, + "loss": 2.2051, + "step": 6274 + }, + { + "epoch": 1.176194939081537, + "grad_norm": 52406.484375, + "learning_rate": 7.765997736361312e-05, + "loss": 2.2293, + "step": 6275 + }, + { + "epoch": 1.1763823805060918, + "grad_norm": 60099.4140625, + "learning_rate": 7.765343099926981e-05, + "loss": 2.1394, + "step": 6276 + }, + { + "epoch": 1.1765698219306466, + "grad_norm": 52270.13671875, + "learning_rate": 7.764688395192261e-05, + "loss": 2.1936, + "step": 6277 + }, + { + "epoch": 1.1767572633552015, + "grad_norm": 47552.65234375, + "learning_rate": 7.764033622173322e-05, + "loss": 2.2364, + "step": 6278 + }, + { + "epoch": 1.1769447047797563, + "grad_norm": 47816.25, + "learning_rate": 7.763378780886335e-05, + "loss": 2.2564, + "step": 6279 + }, + { + "epoch": 1.1771321462043112, + "grad_norm": 46268.4296875, + "learning_rate": 7.762723871347475e-05, + "loss": 2.2525, + "step": 6280 + }, + { + "epoch": 1.177319587628866, + "grad_norm": 49028.109375, + "learning_rate": 7.762068893572916e-05, + "loss": 2.3286, + "step": 6281 + }, + { + "epoch": 1.1775070290534209, + "grad_norm": 55761.26171875, + "learning_rate": 7.761413847578839e-05, + "loss": 2.2714, + "step": 6282 + }, + { + "epoch": 1.1776944704779757, + "grad_norm": 53226.171875, + "learning_rate": 7.76075873338142e-05, + "loss": 2.2633, + "step": 6283 + }, + { + "epoch": 1.1778819119025306, + "grad_norm": 49773.91015625, + "learning_rate": 7.760103550996838e-05, + "loss": 2.2699, + "step": 6284 + }, + { + "epoch": 1.1780693533270852, + "grad_norm": 47441.95703125, + "learning_rate": 7.759448300441278e-05, + "loss": 2.2068, + "step": 6285 + }, + { + "epoch": 1.17825679475164, + "grad_norm": 49521.52734375, + "learning_rate": 7.758792981730922e-05, + "loss": 2.2803, + "step": 6286 + }, + { + "epoch": 1.1784442361761949, + "grad_norm": 48824.703125, + "learning_rate": 7.758137594881956e-05, + "loss": 2.2592, + "step": 6287 + }, + { + "epoch": 1.1786316776007497, + "grad_norm": 51120.1328125, + "learning_rate": 7.757482139910567e-05, + "loss": 2.2222, + "step": 6288 + }, + { + "epoch": 1.1788191190253046, + "grad_norm": 51477.859375, + "learning_rate": 7.756826616832946e-05, + "loss": 2.2302, + "step": 6289 + }, + { + "epoch": 1.1790065604498594, + "grad_norm": 47949.77734375, + "learning_rate": 7.756171025665282e-05, + "loss": 2.2975, + "step": 6290 + }, + { + "epoch": 1.1791940018744143, + "grad_norm": 50397.7890625, + "learning_rate": 7.755515366423765e-05, + "loss": 2.2607, + "step": 6291 + }, + { + "epoch": 1.179381443298969, + "grad_norm": 49327.6875, + "learning_rate": 7.754859639124593e-05, + "loss": 2.2373, + "step": 6292 + }, + { + "epoch": 1.179568884723524, + "grad_norm": 48541.18359375, + "learning_rate": 7.754203843783958e-05, + "loss": 2.2571, + "step": 6293 + }, + { + "epoch": 1.1797563261480788, + "grad_norm": 53353.03125, + "learning_rate": 7.753547980418061e-05, + "loss": 2.2264, + "step": 6294 + }, + { + "epoch": 1.1799437675726336, + "grad_norm": 51350.09375, + "learning_rate": 7.752892049043097e-05, + "loss": 2.2261, + "step": 6295 + }, + { + "epoch": 1.1801312089971883, + "grad_norm": 48641.01171875, + "learning_rate": 7.75223604967527e-05, + "loss": 2.1786, + "step": 6296 + }, + { + "epoch": 1.180318650421743, + "grad_norm": 51730.67578125, + "learning_rate": 7.751579982330778e-05, + "loss": 2.2047, + "step": 6297 + }, + { + "epoch": 1.180506091846298, + "grad_norm": 52200.65234375, + "learning_rate": 7.75092384702583e-05, + "loss": 2.3079, + "step": 6298 + }, + { + "epoch": 1.1806935332708528, + "grad_norm": 52748.59375, + "learning_rate": 7.750267643776629e-05, + "loss": 2.2348, + "step": 6299 + }, + { + "epoch": 1.1808809746954076, + "grad_norm": 55633.12890625, + "learning_rate": 7.749611372599386e-05, + "loss": 2.3108, + "step": 6300 + }, + { + "epoch": 1.1810684161199625, + "grad_norm": 54470.33984375, + "learning_rate": 7.748955033510304e-05, + "loss": 2.2974, + "step": 6301 + }, + { + "epoch": 1.1812558575445173, + "grad_norm": 52810.54296875, + "learning_rate": 7.748298626525596e-05, + "loss": 2.2599, + "step": 6302 + }, + { + "epoch": 1.1814432989690722, + "grad_norm": 56258.19140625, + "learning_rate": 7.747642151661477e-05, + "loss": 2.2497, + "step": 6303 + }, + { + "epoch": 1.181630740393627, + "grad_norm": 54272.57421875, + "learning_rate": 7.746985608934159e-05, + "loss": 2.198, + "step": 6304 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 51912.79296875, + "learning_rate": 7.746328998359858e-05, + "loss": 2.1629, + "step": 6305 + }, + { + "epoch": 1.1820056232427367, + "grad_norm": 56125.9296875, + "learning_rate": 7.745672319954791e-05, + "loss": 2.2675, + "step": 6306 + }, + { + "epoch": 1.1821930646672913, + "grad_norm": 55134.515625, + "learning_rate": 7.745015573735178e-05, + "loss": 2.2231, + "step": 6307 + }, + { + "epoch": 1.1823805060918464, + "grad_norm": 52130.41796875, + "learning_rate": 7.744358759717239e-05, + "loss": 2.2447, + "step": 6308 + }, + { + "epoch": 1.182567947516401, + "grad_norm": 55339.56640625, + "learning_rate": 7.743701877917195e-05, + "loss": 2.2823, + "step": 6309 + }, + { + "epoch": 1.1827553889409559, + "grad_norm": 52908.4296875, + "learning_rate": 7.743044928351275e-05, + "loss": 2.3244, + "step": 6310 + }, + { + "epoch": 1.1829428303655107, + "grad_norm": 50939.96875, + "learning_rate": 7.7423879110357e-05, + "loss": 2.2718, + "step": 6311 + }, + { + "epoch": 1.1831302717900656, + "grad_norm": 53509.1328125, + "learning_rate": 7.7417308259867e-05, + "loss": 2.2797, + "step": 6312 + }, + { + "epoch": 1.1833177132146204, + "grad_norm": 49525.0, + "learning_rate": 7.741073673220502e-05, + "loss": 2.3105, + "step": 6313 + }, + { + "epoch": 1.1835051546391753, + "grad_norm": 46523.109375, + "learning_rate": 7.740416452753338e-05, + "loss": 2.226, + "step": 6314 + }, + { + "epoch": 1.1836925960637301, + "grad_norm": 52862.56640625, + "learning_rate": 7.739759164601443e-05, + "loss": 2.2168, + "step": 6315 + }, + { + "epoch": 1.183880037488285, + "grad_norm": 58306.6796875, + "learning_rate": 7.739101808781046e-05, + "loss": 2.2312, + "step": 6316 + }, + { + "epoch": 1.1840674789128398, + "grad_norm": 57074.54296875, + "learning_rate": 7.738444385308387e-05, + "loss": 2.3183, + "step": 6317 + }, + { + "epoch": 1.1842549203373947, + "grad_norm": 49648.515625, + "learning_rate": 7.737786894199703e-05, + "loss": 2.1954, + "step": 6318 + }, + { + "epoch": 1.1844423617619495, + "grad_norm": 48997.76171875, + "learning_rate": 7.73712933547123e-05, + "loss": 2.2292, + "step": 6319 + }, + { + "epoch": 1.1846298031865041, + "grad_norm": 57494.4765625, + "learning_rate": 7.736471709139213e-05, + "loss": 2.1878, + "step": 6320 + }, + { + "epoch": 1.184817244611059, + "grad_norm": 48500.4375, + "learning_rate": 7.735814015219892e-05, + "loss": 2.2299, + "step": 6321 + }, + { + "epoch": 1.1850046860356138, + "grad_norm": 52949.33984375, + "learning_rate": 7.735156253729512e-05, + "loss": 2.1692, + "step": 6322 + }, + { + "epoch": 1.1851921274601687, + "grad_norm": 50994.73046875, + "learning_rate": 7.734498424684318e-05, + "loss": 2.2605, + "step": 6323 + }, + { + "epoch": 1.1853795688847235, + "grad_norm": 49565.2421875, + "learning_rate": 7.73384052810056e-05, + "loss": 2.3356, + "step": 6324 + }, + { + "epoch": 1.1855670103092784, + "grad_norm": 48930.17578125, + "learning_rate": 7.733182563994484e-05, + "loss": 2.2298, + "step": 6325 + }, + { + "epoch": 1.1857544517338332, + "grad_norm": 54044.234375, + "learning_rate": 7.732524532382343e-05, + "loss": 2.2477, + "step": 6326 + }, + { + "epoch": 1.185941893158388, + "grad_norm": 56264.44140625, + "learning_rate": 7.731866433280388e-05, + "loss": 2.2115, + "step": 6327 + }, + { + "epoch": 1.186129334582943, + "grad_norm": 49424.08984375, + "learning_rate": 7.731208266704875e-05, + "loss": 2.2671, + "step": 6328 + }, + { + "epoch": 1.1863167760074977, + "grad_norm": 48371.13671875, + "learning_rate": 7.730550032672057e-05, + "loss": 2.2224, + "step": 6329 + }, + { + "epoch": 1.1865042174320526, + "grad_norm": 53643.47265625, + "learning_rate": 7.729891731198195e-05, + "loss": 2.225, + "step": 6330 + }, + { + "epoch": 1.1866916588566072, + "grad_norm": 50822.4296875, + "learning_rate": 7.729233362299546e-05, + "loss": 2.2298, + "step": 6331 + }, + { + "epoch": 1.186879100281162, + "grad_norm": 49754.27734375, + "learning_rate": 7.728574925992372e-05, + "loss": 2.3245, + "step": 6332 + }, + { + "epoch": 1.187066541705717, + "grad_norm": 51805.51953125, + "learning_rate": 7.727916422292935e-05, + "loss": 2.164, + "step": 6333 + }, + { + "epoch": 1.1872539831302718, + "grad_norm": 50409.546875, + "learning_rate": 7.727257851217499e-05, + "loss": 2.2535, + "step": 6334 + }, + { + "epoch": 1.1874414245548266, + "grad_norm": 50153.73828125, + "learning_rate": 7.726599212782331e-05, + "loss": 2.171, + "step": 6335 + }, + { + "epoch": 1.1876288659793814, + "grad_norm": 49286.9140625, + "learning_rate": 7.725940507003695e-05, + "loss": 2.271, + "step": 6336 + }, + { + "epoch": 1.1878163074039363, + "grad_norm": 54841.578125, + "learning_rate": 7.725281733897864e-05, + "loss": 2.3362, + "step": 6337 + }, + { + "epoch": 1.1880037488284911, + "grad_norm": 56742.44140625, + "learning_rate": 7.724622893481108e-05, + "loss": 2.3047, + "step": 6338 + }, + { + "epoch": 1.188191190253046, + "grad_norm": 48143.609375, + "learning_rate": 7.7239639857697e-05, + "loss": 2.3022, + "step": 6339 + }, + { + "epoch": 1.1883786316776008, + "grad_norm": 51140.53125, + "learning_rate": 7.72330501077991e-05, + "loss": 2.2463, + "step": 6340 + }, + { + "epoch": 1.1885660731021557, + "grad_norm": 48619.1328125, + "learning_rate": 7.72264596852802e-05, + "loss": 2.341, + "step": 6341 + }, + { + "epoch": 1.1887535145267103, + "grad_norm": 50063.9453125, + "learning_rate": 7.721986859030303e-05, + "loss": 2.181, + "step": 6342 + }, + { + "epoch": 1.1889409559512651, + "grad_norm": 52580.09765625, + "learning_rate": 7.721327682303041e-05, + "loss": 2.2628, + "step": 6343 + }, + { + "epoch": 1.18912839737582, + "grad_norm": 49122.0859375, + "learning_rate": 7.720668438362512e-05, + "loss": 2.2444, + "step": 6344 + }, + { + "epoch": 1.1893158388003748, + "grad_norm": 47654.2265625, + "learning_rate": 7.720009127225002e-05, + "loss": 2.3009, + "step": 6345 + }, + { + "epoch": 1.1895032802249297, + "grad_norm": 50401.44921875, + "learning_rate": 7.719349748906792e-05, + "loss": 2.2318, + "step": 6346 + }, + { + "epoch": 1.1896907216494845, + "grad_norm": 49058.92578125, + "learning_rate": 7.718690303424169e-05, + "loss": 2.2168, + "step": 6347 + }, + { + "epoch": 1.1898781630740394, + "grad_norm": 47315.92578125, + "learning_rate": 7.71803079079342e-05, + "loss": 2.2208, + "step": 6348 + }, + { + "epoch": 1.1900656044985942, + "grad_norm": 55418.359375, + "learning_rate": 7.717371211030834e-05, + "loss": 2.1277, + "step": 6349 + }, + { + "epoch": 1.190253045923149, + "grad_norm": 49891.80078125, + "learning_rate": 7.716711564152704e-05, + "loss": 2.1596, + "step": 6350 + }, + { + "epoch": 1.190440487347704, + "grad_norm": 53377.21484375, + "learning_rate": 7.716051850175318e-05, + "loss": 2.2233, + "step": 6351 + }, + { + "epoch": 1.1906279287722588, + "grad_norm": 48286.28125, + "learning_rate": 7.715392069114976e-05, + "loss": 2.256, + "step": 6352 + }, + { + "epoch": 1.1908153701968134, + "grad_norm": 50858.30859375, + "learning_rate": 7.714732220987967e-05, + "loss": 2.2669, + "step": 6353 + }, + { + "epoch": 1.1910028116213682, + "grad_norm": 62061.484375, + "learning_rate": 7.714072305810594e-05, + "loss": 2.1832, + "step": 6354 + }, + { + "epoch": 1.191190253045923, + "grad_norm": 49760.91796875, + "learning_rate": 7.713412323599153e-05, + "loss": 2.2313, + "step": 6355 + }, + { + "epoch": 1.191377694470478, + "grad_norm": 46656.0859375, + "learning_rate": 7.712752274369945e-05, + "loss": 2.2824, + "step": 6356 + }, + { + "epoch": 1.1915651358950328, + "grad_norm": 49134.70703125, + "learning_rate": 7.712092158139275e-05, + "loss": 2.219, + "step": 6357 + }, + { + "epoch": 1.1917525773195876, + "grad_norm": 53183.40625, + "learning_rate": 7.711431974923443e-05, + "loss": 2.2449, + "step": 6358 + }, + { + "epoch": 1.1919400187441425, + "grad_norm": 49754.9453125, + "learning_rate": 7.710771724738757e-05, + "loss": 2.2688, + "step": 6359 + }, + { + "epoch": 1.1921274601686973, + "grad_norm": 53465.6640625, + "learning_rate": 7.710111407601526e-05, + "loss": 2.202, + "step": 6360 + }, + { + "epoch": 1.1923149015932522, + "grad_norm": 50816.78125, + "learning_rate": 7.709451023528056e-05, + "loss": 2.2527, + "step": 6361 + }, + { + "epoch": 1.192502343017807, + "grad_norm": 49690.84765625, + "learning_rate": 7.708790572534656e-05, + "loss": 2.1921, + "step": 6362 + }, + { + "epoch": 1.1926897844423618, + "grad_norm": 53134.6328125, + "learning_rate": 7.708130054637643e-05, + "loss": 2.1917, + "step": 6363 + }, + { + "epoch": 1.1928772258669165, + "grad_norm": 54854.953125, + "learning_rate": 7.70746946985333e-05, + "loss": 2.2201, + "step": 6364 + }, + { + "epoch": 1.1930646672914715, + "grad_norm": 52772.546875, + "learning_rate": 7.70680881819803e-05, + "loss": 2.2616, + "step": 6365 + }, + { + "epoch": 1.1932521087160262, + "grad_norm": 48112.14453125, + "learning_rate": 7.70614809968806e-05, + "loss": 2.2375, + "step": 6366 + }, + { + "epoch": 1.193439550140581, + "grad_norm": 52533.60546875, + "learning_rate": 7.705487314339744e-05, + "loss": 2.224, + "step": 6367 + }, + { + "epoch": 1.1936269915651359, + "grad_norm": 50363.14453125, + "learning_rate": 7.704826462169397e-05, + "loss": 2.2452, + "step": 6368 + }, + { + "epoch": 1.1938144329896907, + "grad_norm": 51518.66015625, + "learning_rate": 7.704165543193343e-05, + "loss": 2.2275, + "step": 6369 + }, + { + "epoch": 1.1940018744142455, + "grad_norm": 50600.015625, + "learning_rate": 7.703504557427905e-05, + "loss": 2.2487, + "step": 6370 + }, + { + "epoch": 1.1941893158388004, + "grad_norm": 51121.140625, + "learning_rate": 7.702843504889412e-05, + "loss": 2.2196, + "step": 6371 + }, + { + "epoch": 1.1943767572633552, + "grad_norm": 49772.578125, + "learning_rate": 7.702182385594188e-05, + "loss": 2.2632, + "step": 6372 + }, + { + "epoch": 1.19456419868791, + "grad_norm": 49221.76953125, + "learning_rate": 7.701521199558562e-05, + "loss": 2.2679, + "step": 6373 + }, + { + "epoch": 1.194751640112465, + "grad_norm": 51338.25390625, + "learning_rate": 7.700859946798867e-05, + "loss": 2.2519, + "step": 6374 + }, + { + "epoch": 1.1949390815370198, + "grad_norm": 51066.75390625, + "learning_rate": 7.70019862733143e-05, + "loss": 2.2582, + "step": 6375 + }, + { + "epoch": 1.1951265229615746, + "grad_norm": 50836.4609375, + "learning_rate": 7.699537241172589e-05, + "loss": 2.2456, + "step": 6376 + }, + { + "epoch": 1.1953139643861292, + "grad_norm": 51179.26171875, + "learning_rate": 7.698875788338677e-05, + "loss": 2.2567, + "step": 6377 + }, + { + "epoch": 1.195501405810684, + "grad_norm": 50986.61328125, + "learning_rate": 7.698214268846036e-05, + "loss": 2.2286, + "step": 6378 + }, + { + "epoch": 1.195688847235239, + "grad_norm": 53464.625, + "learning_rate": 7.697552682710996e-05, + "loss": 2.2961, + "step": 6379 + }, + { + "epoch": 1.1958762886597938, + "grad_norm": 51195.20703125, + "learning_rate": 7.696891029949905e-05, + "loss": 2.2887, + "step": 6380 + }, + { + "epoch": 1.1960637300843486, + "grad_norm": 51875.9375, + "learning_rate": 7.696229310579102e-05, + "loss": 2.2182, + "step": 6381 + }, + { + "epoch": 1.1962511715089035, + "grad_norm": 52673.46875, + "learning_rate": 7.69556752461493e-05, + "loss": 2.2264, + "step": 6382 + }, + { + "epoch": 1.1964386129334583, + "grad_norm": 51870.67578125, + "learning_rate": 7.694905672073736e-05, + "loss": 2.2644, + "step": 6383 + }, + { + "epoch": 1.1966260543580132, + "grad_norm": 49591.671875, + "learning_rate": 7.694243752971864e-05, + "loss": 2.2341, + "step": 6384 + }, + { + "epoch": 1.196813495782568, + "grad_norm": 50896.36328125, + "learning_rate": 7.693581767325667e-05, + "loss": 2.1816, + "step": 6385 + }, + { + "epoch": 1.1970009372071229, + "grad_norm": 49155.69921875, + "learning_rate": 7.69291971515149e-05, + "loss": 2.212, + "step": 6386 + }, + { + "epoch": 1.1971883786316777, + "grad_norm": 51362.65234375, + "learning_rate": 7.692257596465689e-05, + "loss": 2.1796, + "step": 6387 + }, + { + "epoch": 1.1973758200562323, + "grad_norm": 51765.49609375, + "learning_rate": 7.691595411284616e-05, + "loss": 2.2185, + "step": 6388 + }, + { + "epoch": 1.1975632614807872, + "grad_norm": 53600.8359375, + "learning_rate": 7.690933159624625e-05, + "loss": 2.283, + "step": 6389 + }, + { + "epoch": 1.197750702905342, + "grad_norm": 50712.06640625, + "learning_rate": 7.690270841502074e-05, + "loss": 2.2675, + "step": 6390 + }, + { + "epoch": 1.1979381443298969, + "grad_norm": 52172.76953125, + "learning_rate": 7.689608456933322e-05, + "loss": 2.2496, + "step": 6391 + }, + { + "epoch": 1.1981255857544517, + "grad_norm": 52519.015625, + "learning_rate": 7.688946005934727e-05, + "loss": 2.1491, + "step": 6392 + }, + { + "epoch": 1.1983130271790066, + "grad_norm": 54925.97265625, + "learning_rate": 7.688283488522655e-05, + "loss": 2.2568, + "step": 6393 + }, + { + "epoch": 1.1985004686035614, + "grad_norm": 47507.47265625, + "learning_rate": 7.687620904713461e-05, + "loss": 2.2287, + "step": 6394 + }, + { + "epoch": 1.1986879100281163, + "grad_norm": 48386.82421875, + "learning_rate": 7.686958254523521e-05, + "loss": 2.265, + "step": 6395 + }, + { + "epoch": 1.198875351452671, + "grad_norm": 52508.38671875, + "learning_rate": 7.686295537969193e-05, + "loss": 2.2709, + "step": 6396 + }, + { + "epoch": 1.199062792877226, + "grad_norm": 59101.7734375, + "learning_rate": 7.685632755066847e-05, + "loss": 2.2528, + "step": 6397 + }, + { + "epoch": 1.1992502343017808, + "grad_norm": 48184.09765625, + "learning_rate": 7.684969905832855e-05, + "loss": 2.2811, + "step": 6398 + }, + { + "epoch": 1.1994376757263354, + "grad_norm": 52203.84765625, + "learning_rate": 7.684306990283588e-05, + "loss": 2.1898, + "step": 6399 + }, + { + "epoch": 1.1996251171508903, + "grad_norm": 50904.05859375, + "learning_rate": 7.683644008435418e-05, + "loss": 2.254, + "step": 6400 + }, + { + "epoch": 1.199812558575445, + "grad_norm": 50385.5234375, + "learning_rate": 7.68298096030472e-05, + "loss": 2.2946, + "step": 6401 + }, + { + "epoch": 1.2, + "grad_norm": 49662.390625, + "learning_rate": 7.682317845907871e-05, + "loss": 2.2346, + "step": 6402 + }, + { + "epoch": 1.2001874414245548, + "grad_norm": 49927.32421875, + "learning_rate": 7.68165466526125e-05, + "loss": 2.2279, + "step": 6403 + }, + { + "epoch": 1.2003748828491096, + "grad_norm": 49296.625, + "learning_rate": 7.680991418381235e-05, + "loss": 2.2483, + "step": 6404 + }, + { + "epoch": 1.2005623242736645, + "grad_norm": 51033.77734375, + "learning_rate": 7.680328105284207e-05, + "loss": 2.1877, + "step": 6405 + }, + { + "epoch": 1.2007497656982193, + "grad_norm": 50015.7578125, + "learning_rate": 7.679664725986551e-05, + "loss": 2.1608, + "step": 6406 + }, + { + "epoch": 1.2009372071227742, + "grad_norm": 51599.04296875, + "learning_rate": 7.67900128050465e-05, + "loss": 2.1195, + "step": 6407 + }, + { + "epoch": 1.201124648547329, + "grad_norm": 51872.2734375, + "learning_rate": 7.678337768854892e-05, + "loss": 2.2176, + "step": 6408 + }, + { + "epoch": 1.2013120899718839, + "grad_norm": 50754.31640625, + "learning_rate": 7.677674191053661e-05, + "loss": 2.2947, + "step": 6409 + }, + { + "epoch": 1.2014995313964385, + "grad_norm": 55066.67578125, + "learning_rate": 7.677010547117351e-05, + "loss": 2.2737, + "step": 6410 + }, + { + "epoch": 1.2016869728209933, + "grad_norm": 53870.83984375, + "learning_rate": 7.676346837062352e-05, + "loss": 2.2792, + "step": 6411 + }, + { + "epoch": 1.2018744142455482, + "grad_norm": 47410.4375, + "learning_rate": 7.675683060905054e-05, + "loss": 2.2582, + "step": 6412 + }, + { + "epoch": 1.202061855670103, + "grad_norm": 49739.375, + "learning_rate": 7.675019218661854e-05, + "loss": 2.2796, + "step": 6413 + }, + { + "epoch": 1.2022492970946579, + "grad_norm": 48869.2421875, + "learning_rate": 7.674355310349148e-05, + "loss": 2.3155, + "step": 6414 + }, + { + "epoch": 1.2024367385192127, + "grad_norm": 48291.2109375, + "learning_rate": 7.673691335983334e-05, + "loss": 2.2641, + "step": 6415 + }, + { + "epoch": 1.2026241799437676, + "grad_norm": 47690.671875, + "learning_rate": 7.673027295580811e-05, + "loss": 2.2831, + "step": 6416 + }, + { + "epoch": 1.2028116213683224, + "grad_norm": 51172.51171875, + "learning_rate": 7.672363189157975e-05, + "loss": 2.3029, + "step": 6417 + }, + { + "epoch": 1.2029990627928773, + "grad_norm": 52596.5859375, + "learning_rate": 7.671699016731238e-05, + "loss": 2.2967, + "step": 6418 + }, + { + "epoch": 1.2031865042174321, + "grad_norm": 45944.7109375, + "learning_rate": 7.671034778316997e-05, + "loss": 2.2136, + "step": 6419 + }, + { + "epoch": 1.203373945641987, + "grad_norm": 51924.734375, + "learning_rate": 7.67037047393166e-05, + "loss": 2.2672, + "step": 6420 + }, + { + "epoch": 1.2035613870665416, + "grad_norm": 53156.8828125, + "learning_rate": 7.669706103591635e-05, + "loss": 2.1817, + "step": 6421 + }, + { + "epoch": 1.2037488284910967, + "grad_norm": 54140.9296875, + "learning_rate": 7.669041667313329e-05, + "loss": 2.3164, + "step": 6422 + }, + { + "epoch": 1.2039362699156513, + "grad_norm": 51236.3671875, + "learning_rate": 7.668377165113157e-05, + "loss": 2.2209, + "step": 6423 + }, + { + "epoch": 1.2041237113402061, + "grad_norm": 55334.9765625, + "learning_rate": 7.667712597007527e-05, + "loss": 2.1695, + "step": 6424 + }, + { + "epoch": 1.204311152764761, + "grad_norm": 50246.55078125, + "learning_rate": 7.667047963012854e-05, + "loss": 2.3069, + "step": 6425 + }, + { + "epoch": 1.2044985941893158, + "grad_norm": 48431.5703125, + "learning_rate": 7.666383263145556e-05, + "loss": 2.2219, + "step": 6426 + }, + { + "epoch": 1.2046860356138707, + "grad_norm": 49149.17578125, + "learning_rate": 7.665718497422048e-05, + "loss": 2.2428, + "step": 6427 + }, + { + "epoch": 1.2048734770384255, + "grad_norm": 51579.07421875, + "learning_rate": 7.665053665858748e-05, + "loss": 2.2276, + "step": 6428 + }, + { + "epoch": 1.2050609184629804, + "grad_norm": 49417.2109375, + "learning_rate": 7.664388768472078e-05, + "loss": 2.2431, + "step": 6429 + }, + { + "epoch": 1.2052483598875352, + "grad_norm": 54840.28125, + "learning_rate": 7.66372380527846e-05, + "loss": 2.1896, + "step": 6430 + }, + { + "epoch": 1.20543580131209, + "grad_norm": 50804.921875, + "learning_rate": 7.663058776294319e-05, + "loss": 2.2272, + "step": 6431 + }, + { + "epoch": 1.205623242736645, + "grad_norm": 54956.93359375, + "learning_rate": 7.662393681536077e-05, + "loss": 2.2876, + "step": 6432 + }, + { + "epoch": 1.2058106841611997, + "grad_norm": 58936.25, + "learning_rate": 7.661728521020164e-05, + "loss": 2.2354, + "step": 6433 + }, + { + "epoch": 1.2059981255857544, + "grad_norm": 53541.53515625, + "learning_rate": 7.661063294763007e-05, + "loss": 2.1978, + "step": 6434 + }, + { + "epoch": 1.2061855670103092, + "grad_norm": 49770.1328125, + "learning_rate": 7.660398002781037e-05, + "loss": 2.3269, + "step": 6435 + }, + { + "epoch": 1.206373008434864, + "grad_norm": 51601.78515625, + "learning_rate": 7.659732645090686e-05, + "loss": 2.259, + "step": 6436 + }, + { + "epoch": 1.206560449859419, + "grad_norm": 48921.65625, + "learning_rate": 7.659067221708385e-05, + "loss": 2.24, + "step": 6437 + }, + { + "epoch": 1.2067478912839738, + "grad_norm": 50483.15234375, + "learning_rate": 7.658401732650575e-05, + "loss": 2.233, + "step": 6438 + }, + { + "epoch": 1.2069353327085286, + "grad_norm": 51671.51953125, + "learning_rate": 7.657736177933686e-05, + "loss": 2.1802, + "step": 6439 + }, + { + "epoch": 1.2071227741330834, + "grad_norm": 54577.3828125, + "learning_rate": 7.65707055757416e-05, + "loss": 2.2646, + "step": 6440 + }, + { + "epoch": 1.2073102155576383, + "grad_norm": 51501.44140625, + "learning_rate": 7.656404871588435e-05, + "loss": 2.1684, + "step": 6441 + }, + { + "epoch": 1.2074976569821931, + "grad_norm": 56102.5390625, + "learning_rate": 7.655739119992955e-05, + "loss": 2.2473, + "step": 6442 + }, + { + "epoch": 1.207685098406748, + "grad_norm": 50489.17578125, + "learning_rate": 7.655073302804162e-05, + "loss": 2.2214, + "step": 6443 + }, + { + "epoch": 1.2078725398313028, + "grad_norm": 55412.39453125, + "learning_rate": 7.6544074200385e-05, + "loss": 2.1964, + "step": 6444 + }, + { + "epoch": 1.2080599812558575, + "grad_norm": 50031.5, + "learning_rate": 7.653741471712417e-05, + "loss": 2.2623, + "step": 6445 + }, + { + "epoch": 1.2082474226804123, + "grad_norm": 52675.9296875, + "learning_rate": 7.65307545784236e-05, + "loss": 2.2304, + "step": 6446 + }, + { + "epoch": 1.2084348641049671, + "grad_norm": 48168.19140625, + "learning_rate": 7.652409378444778e-05, + "loss": 2.253, + "step": 6447 + }, + { + "epoch": 1.208622305529522, + "grad_norm": 54008.05078125, + "learning_rate": 7.651743233536125e-05, + "loss": 2.2238, + "step": 6448 + }, + { + "epoch": 1.2088097469540768, + "grad_norm": 48301.4921875, + "learning_rate": 7.65107702313285e-05, + "loss": 2.2755, + "step": 6449 + }, + { + "epoch": 1.2089971883786317, + "grad_norm": 63171.9765625, + "learning_rate": 7.650410747251412e-05, + "loss": 2.246, + "step": 6450 + }, + { + "epoch": 1.2091846298031865, + "grad_norm": 47516.8203125, + "learning_rate": 7.649744405908266e-05, + "loss": 2.2729, + "step": 6451 + }, + { + "epoch": 1.2093720712277414, + "grad_norm": 52229.77734375, + "learning_rate": 7.649077999119864e-05, + "loss": 2.2328, + "step": 6452 + }, + { + "epoch": 1.2095595126522962, + "grad_norm": 47111.546875, + "learning_rate": 7.648411526902674e-05, + "loss": 2.1828, + "step": 6453 + }, + { + "epoch": 1.209746954076851, + "grad_norm": 50071.46484375, + "learning_rate": 7.647744989273151e-05, + "loss": 2.3049, + "step": 6454 + }, + { + "epoch": 1.209934395501406, + "grad_norm": 48885.25, + "learning_rate": 7.64707838624776e-05, + "loss": 2.2056, + "step": 6455 + }, + { + "epoch": 1.2101218369259605, + "grad_norm": 51438.66015625, + "learning_rate": 7.646411717842966e-05, + "loss": 2.1948, + "step": 6456 + }, + { + "epoch": 1.2103092783505154, + "grad_norm": 52001.6015625, + "learning_rate": 7.645744984075231e-05, + "loss": 2.2423, + "step": 6457 + }, + { + "epoch": 1.2104967197750702, + "grad_norm": 50833.29296875, + "learning_rate": 7.645078184961027e-05, + "loss": 2.1807, + "step": 6458 + }, + { + "epoch": 1.210684161199625, + "grad_norm": 49440.3671875, + "learning_rate": 7.644411320516822e-05, + "loss": 2.2493, + "step": 6459 + }, + { + "epoch": 1.21087160262418, + "grad_norm": 51562.84765625, + "learning_rate": 7.643744390759085e-05, + "loss": 2.2495, + "step": 6460 + }, + { + "epoch": 1.2110590440487348, + "grad_norm": 53825.25390625, + "learning_rate": 7.643077395704289e-05, + "loss": 2.1828, + "step": 6461 + }, + { + "epoch": 1.2112464854732896, + "grad_norm": 49450.87890625, + "learning_rate": 7.642410335368908e-05, + "loss": 2.1695, + "step": 6462 + }, + { + "epoch": 1.2114339268978445, + "grad_norm": 52785.58203125, + "learning_rate": 7.641743209769418e-05, + "loss": 2.1962, + "step": 6463 + }, + { + "epoch": 1.2116213683223993, + "grad_norm": 50174.11328125, + "learning_rate": 7.641076018922295e-05, + "loss": 2.2721, + "step": 6464 + }, + { + "epoch": 1.2118088097469542, + "grad_norm": 48357.69140625, + "learning_rate": 7.64040876284402e-05, + "loss": 2.2466, + "step": 6465 + }, + { + "epoch": 1.211996251171509, + "grad_norm": 52580.30078125, + "learning_rate": 7.639741441551072e-05, + "loss": 2.2531, + "step": 6466 + }, + { + "epoch": 1.2121836925960636, + "grad_norm": 51674.8984375, + "learning_rate": 7.639074055059931e-05, + "loss": 2.2007, + "step": 6467 + }, + { + "epoch": 1.2123711340206185, + "grad_norm": 53038.0703125, + "learning_rate": 7.638406603387086e-05, + "loss": 2.2947, + "step": 6468 + }, + { + "epoch": 1.2125585754451733, + "grad_norm": 54823.37890625, + "learning_rate": 7.637739086549014e-05, + "loss": 2.3103, + "step": 6469 + }, + { + "epoch": 1.2127460168697282, + "grad_norm": 52021.0703125, + "learning_rate": 7.63707150456221e-05, + "loss": 2.2513, + "step": 6470 + }, + { + "epoch": 1.212933458294283, + "grad_norm": 52213.484375, + "learning_rate": 7.636403857443157e-05, + "loss": 2.2833, + "step": 6471 + }, + { + "epoch": 1.2131208997188379, + "grad_norm": 49452.94140625, + "learning_rate": 7.635736145208346e-05, + "loss": 2.2873, + "step": 6472 + }, + { + "epoch": 1.2133083411433927, + "grad_norm": 48349.890625, + "learning_rate": 7.635068367874271e-05, + "loss": 2.2692, + "step": 6473 + }, + { + "epoch": 1.2134957825679475, + "grad_norm": 52864.8359375, + "learning_rate": 7.634400525457424e-05, + "loss": 2.2049, + "step": 6474 + }, + { + "epoch": 1.2136832239925024, + "grad_norm": 48306.09765625, + "learning_rate": 7.633732617974299e-05, + "loss": 2.2175, + "step": 6475 + }, + { + "epoch": 1.2138706654170572, + "grad_norm": 53801.28515625, + "learning_rate": 7.633064645441393e-05, + "loss": 2.2837, + "step": 6476 + }, + { + "epoch": 1.214058106841612, + "grad_norm": 60550.265625, + "learning_rate": 7.632396607875205e-05, + "loss": 2.1554, + "step": 6477 + }, + { + "epoch": 1.2142455482661667, + "grad_norm": 50851.5546875, + "learning_rate": 7.631728505292233e-05, + "loss": 2.2828, + "step": 6478 + }, + { + "epoch": 1.2144329896907216, + "grad_norm": 52300.74609375, + "learning_rate": 7.63106033770898e-05, + "loss": 2.268, + "step": 6479 + }, + { + "epoch": 1.2146204311152764, + "grad_norm": 52731.29296875, + "learning_rate": 7.630392105141945e-05, + "loss": 2.2109, + "step": 6480 + }, + { + "epoch": 1.2148078725398312, + "grad_norm": 52562.57421875, + "learning_rate": 7.629723807607638e-05, + "loss": 2.1752, + "step": 6481 + }, + { + "epoch": 1.214995313964386, + "grad_norm": 51282.17578125, + "learning_rate": 7.62905544512256e-05, + "loss": 2.2603, + "step": 6482 + }, + { + "epoch": 1.215182755388941, + "grad_norm": 46649.98046875, + "learning_rate": 7.628387017703225e-05, + "loss": 2.2037, + "step": 6483 + }, + { + "epoch": 1.2153701968134958, + "grad_norm": 51620.5234375, + "learning_rate": 7.627718525366136e-05, + "loss": 2.227, + "step": 6484 + }, + { + "epoch": 1.2155576382380506, + "grad_norm": 54443.8359375, + "learning_rate": 7.627049968127806e-05, + "loss": 2.2616, + "step": 6485 + }, + { + "epoch": 1.2157450796626055, + "grad_norm": 50525.375, + "learning_rate": 7.626381346004749e-05, + "loss": 2.1673, + "step": 6486 + }, + { + "epoch": 1.2159325210871603, + "grad_norm": 47833.87890625, + "learning_rate": 7.625712659013475e-05, + "loss": 2.2288, + "step": 6487 + }, + { + "epoch": 1.2161199625117152, + "grad_norm": 53356.23046875, + "learning_rate": 7.625043907170506e-05, + "loss": 2.2765, + "step": 6488 + }, + { + "epoch": 1.2163074039362698, + "grad_norm": 53444.21484375, + "learning_rate": 7.624375090492355e-05, + "loss": 2.2497, + "step": 6489 + }, + { + "epoch": 1.2164948453608249, + "grad_norm": 48081.55859375, + "learning_rate": 7.623706208995542e-05, + "loss": 2.2413, + "step": 6490 + }, + { + "epoch": 1.2166822867853795, + "grad_norm": 50406.328125, + "learning_rate": 7.623037262696586e-05, + "loss": 2.2424, + "step": 6491 + }, + { + "epoch": 1.2168697282099343, + "grad_norm": 48923.93359375, + "learning_rate": 7.622368251612011e-05, + "loss": 2.2649, + "step": 6492 + }, + { + "epoch": 1.2170571696344892, + "grad_norm": 50072.13671875, + "learning_rate": 7.621699175758341e-05, + "loss": 2.2432, + "step": 6493 + }, + { + "epoch": 1.217244611059044, + "grad_norm": 50154.3515625, + "learning_rate": 7.6210300351521e-05, + "loss": 2.2611, + "step": 6494 + }, + { + "epoch": 1.2174320524835989, + "grad_norm": 51792.03515625, + "learning_rate": 7.620360829809813e-05, + "loss": 2.2595, + "step": 6495 + }, + { + "epoch": 1.2176194939081537, + "grad_norm": 50088.9453125, + "learning_rate": 7.619691559748016e-05, + "loss": 2.1848, + "step": 6496 + }, + { + "epoch": 1.2178069353327086, + "grad_norm": 49401.65625, + "learning_rate": 7.61902222498323e-05, + "loss": 2.1908, + "step": 6497 + }, + { + "epoch": 1.2179943767572634, + "grad_norm": 49632.5, + "learning_rate": 7.618352825531991e-05, + "loss": 2.2763, + "step": 6498 + }, + { + "epoch": 1.2181818181818183, + "grad_norm": 47014.11328125, + "learning_rate": 7.617683361410835e-05, + "loss": 2.2121, + "step": 6499 + }, + { + "epoch": 1.218369259606373, + "grad_norm": 49687.4765625, + "learning_rate": 7.617013832636291e-05, + "loss": 2.1958, + "step": 6500 + }, + { + "epoch": 1.218369259606373, + "eval_loss": 2.3113834857940674, + "eval_runtime": 129.1637, + "eval_samples_per_second": 39.09, + "eval_steps_per_second": 1.959, + "step": 6500 + }, + { + "epoch": 1.218556701030928, + "grad_norm": 56564.515625, + "learning_rate": 7.616344239224898e-05, + "loss": 2.2391, + "step": 6501 + }, + { + "epoch": 1.2187441424554826, + "grad_norm": 49024.96875, + "learning_rate": 7.615674581193195e-05, + "loss": 2.2373, + "step": 6502 + }, + { + "epoch": 1.2189315838800374, + "grad_norm": 50235.61328125, + "learning_rate": 7.615004858557722e-05, + "loss": 2.2483, + "step": 6503 + }, + { + "epoch": 1.2191190253045923, + "grad_norm": 51867.54296875, + "learning_rate": 7.614335071335019e-05, + "loss": 2.2428, + "step": 6504 + }, + { + "epoch": 1.219306466729147, + "grad_norm": 49736.3203125, + "learning_rate": 7.613665219541628e-05, + "loss": 2.2385, + "step": 6505 + }, + { + "epoch": 1.219493908153702, + "grad_norm": 47260.328125, + "learning_rate": 7.612995303194096e-05, + "loss": 2.3035, + "step": 6506 + }, + { + "epoch": 1.2196813495782568, + "grad_norm": 50116.80859375, + "learning_rate": 7.612325322308969e-05, + "loss": 2.2514, + "step": 6507 + }, + { + "epoch": 1.2198687910028116, + "grad_norm": 49828.171875, + "learning_rate": 7.611655276902791e-05, + "loss": 2.1978, + "step": 6508 + }, + { + "epoch": 1.2200562324273665, + "grad_norm": 47279.6796875, + "learning_rate": 7.610985166992115e-05, + "loss": 2.197, + "step": 6509 + }, + { + "epoch": 1.2202436738519213, + "grad_norm": 48861.26953125, + "learning_rate": 7.610314992593491e-05, + "loss": 2.2236, + "step": 6510 + }, + { + "epoch": 1.2204311152764762, + "grad_norm": 49770.66796875, + "learning_rate": 7.60964475372347e-05, + "loss": 2.2189, + "step": 6511 + }, + { + "epoch": 1.220618556701031, + "grad_norm": 50410.11328125, + "learning_rate": 7.608974450398607e-05, + "loss": 2.2707, + "step": 6512 + }, + { + "epoch": 1.2208059981255857, + "grad_norm": 50833.4375, + "learning_rate": 7.608304082635458e-05, + "loss": 2.1943, + "step": 6513 + }, + { + "epoch": 1.2209934395501405, + "grad_norm": 51202.78125, + "learning_rate": 7.607633650450581e-05, + "loss": 2.293, + "step": 6514 + }, + { + "epoch": 1.2211808809746953, + "grad_norm": 50241.08203125, + "learning_rate": 7.606963153860531e-05, + "loss": 2.1942, + "step": 6515 + }, + { + "epoch": 1.2213683223992502, + "grad_norm": 56722.80859375, + "learning_rate": 7.606292592881874e-05, + "loss": 2.1271, + "step": 6516 + }, + { + "epoch": 1.221555763823805, + "grad_norm": 49497.859375, + "learning_rate": 7.605621967531166e-05, + "loss": 2.1787, + "step": 6517 + }, + { + "epoch": 1.2217432052483599, + "grad_norm": 53504.23828125, + "learning_rate": 7.604951277824977e-05, + "loss": 2.2115, + "step": 6518 + }, + { + "epoch": 1.2219306466729147, + "grad_norm": 52364.5, + "learning_rate": 7.604280523779867e-05, + "loss": 2.2546, + "step": 6519 + }, + { + "epoch": 1.2221180880974696, + "grad_norm": 52567.19140625, + "learning_rate": 7.603609705412404e-05, + "loss": 2.2683, + "step": 6520 + }, + { + "epoch": 1.2223055295220244, + "grad_norm": 52686.30859375, + "learning_rate": 7.602938822739159e-05, + "loss": 2.2351, + "step": 6521 + }, + { + "epoch": 1.2224929709465793, + "grad_norm": 55204.88671875, + "learning_rate": 7.602267875776698e-05, + "loss": 2.185, + "step": 6522 + }, + { + "epoch": 1.2226804123711341, + "grad_norm": 46879.18359375, + "learning_rate": 7.601596864541595e-05, + "loss": 2.2264, + "step": 6523 + }, + { + "epoch": 1.2228678537956887, + "grad_norm": 49358.6796875, + "learning_rate": 7.600925789050422e-05, + "loss": 2.2632, + "step": 6524 + }, + { + "epoch": 1.2230552952202436, + "grad_norm": 49443.62109375, + "learning_rate": 7.600254649319753e-05, + "loss": 2.2075, + "step": 6525 + }, + { + "epoch": 1.2232427366447984, + "grad_norm": 48430.62890625, + "learning_rate": 7.59958344536617e-05, + "loss": 2.2521, + "step": 6526 + }, + { + "epoch": 1.2234301780693533, + "grad_norm": 48750.73046875, + "learning_rate": 7.598912177206243e-05, + "loss": 2.2423, + "step": 6527 + }, + { + "epoch": 1.2236176194939081, + "grad_norm": 53061.125, + "learning_rate": 7.598240844856554e-05, + "loss": 2.204, + "step": 6528 + }, + { + "epoch": 1.223805060918463, + "grad_norm": 50162.58984375, + "learning_rate": 7.597569448333686e-05, + "loss": 2.2821, + "step": 6529 + }, + { + "epoch": 1.2239925023430178, + "grad_norm": 49968.9296875, + "learning_rate": 7.596897987654219e-05, + "loss": 2.2735, + "step": 6530 + }, + { + "epoch": 1.2241799437675727, + "grad_norm": 47452.140625, + "learning_rate": 7.596226462834741e-05, + "loss": 2.2313, + "step": 6531 + }, + { + "epoch": 1.2243673851921275, + "grad_norm": 49813.5703125, + "learning_rate": 7.595554873891835e-05, + "loss": 2.2137, + "step": 6532 + }, + { + "epoch": 1.2245548266166824, + "grad_norm": 51268.1640625, + "learning_rate": 7.594883220842088e-05, + "loss": 2.3019, + "step": 6533 + }, + { + "epoch": 1.2247422680412372, + "grad_norm": 49524.6015625, + "learning_rate": 7.594211503702088e-05, + "loss": 2.2956, + "step": 6534 + }, + { + "epoch": 1.2249297094657918, + "grad_norm": 49170.1171875, + "learning_rate": 7.59353972248843e-05, + "loss": 2.2515, + "step": 6535 + }, + { + "epoch": 1.2251171508903467, + "grad_norm": 53551.54296875, + "learning_rate": 7.592867877217704e-05, + "loss": 2.2506, + "step": 6536 + }, + { + "epoch": 1.2253045923149015, + "grad_norm": 55940.38671875, + "learning_rate": 7.592195967906501e-05, + "loss": 2.2102, + "step": 6537 + }, + { + "epoch": 1.2254920337394564, + "grad_norm": 52967.95703125, + "learning_rate": 7.591523994571418e-05, + "loss": 2.2192, + "step": 6538 + }, + { + "epoch": 1.2256794751640112, + "grad_norm": 48040.04296875, + "learning_rate": 7.590851957229055e-05, + "loss": 2.2278, + "step": 6539 + }, + { + "epoch": 1.225866916588566, + "grad_norm": 48551.2578125, + "learning_rate": 7.590179855896006e-05, + "loss": 2.2779, + "step": 6540 + }, + { + "epoch": 1.226054358013121, + "grad_norm": 48378.578125, + "learning_rate": 7.589507690588873e-05, + "loss": 2.2383, + "step": 6541 + }, + { + "epoch": 1.2262417994376758, + "grad_norm": 49864.35546875, + "learning_rate": 7.588835461324257e-05, + "loss": 2.1739, + "step": 6542 + }, + { + "epoch": 1.2264292408622306, + "grad_norm": 56748.08984375, + "learning_rate": 7.588163168118762e-05, + "loss": 2.2804, + "step": 6543 + }, + { + "epoch": 1.2266166822867854, + "grad_norm": 52596.734375, + "learning_rate": 7.587490810988992e-05, + "loss": 2.1325, + "step": 6544 + }, + { + "epoch": 1.2268041237113403, + "grad_norm": 48733.22265625, + "learning_rate": 7.586818389951553e-05, + "loss": 2.2281, + "step": 6545 + }, + { + "epoch": 1.226991565135895, + "grad_norm": 52522.140625, + "learning_rate": 7.586145905023056e-05, + "loss": 2.2366, + "step": 6546 + }, + { + "epoch": 1.22717900656045, + "grad_norm": 51351.93359375, + "learning_rate": 7.585473356220108e-05, + "loss": 2.2075, + "step": 6547 + }, + { + "epoch": 1.2273664479850046, + "grad_norm": 52548.25, + "learning_rate": 7.584800743559318e-05, + "loss": 2.2494, + "step": 6548 + }, + { + "epoch": 1.2275538894095595, + "grad_norm": 53657.109375, + "learning_rate": 7.584128067057301e-05, + "loss": 2.2532, + "step": 6549 + }, + { + "epoch": 1.2277413308341143, + "grad_norm": 49572.30078125, + "learning_rate": 7.583455326730673e-05, + "loss": 2.2035, + "step": 6550 + }, + { + "epoch": 1.2279287722586691, + "grad_norm": 54688.203125, + "learning_rate": 7.582782522596046e-05, + "loss": 2.2201, + "step": 6551 + }, + { + "epoch": 1.228116213683224, + "grad_norm": 54268.07421875, + "learning_rate": 7.582109654670042e-05, + "loss": 2.1856, + "step": 6552 + }, + { + "epoch": 1.2283036551077788, + "grad_norm": 49245.56640625, + "learning_rate": 7.581436722969274e-05, + "loss": 2.3124, + "step": 6553 + }, + { + "epoch": 1.2284910965323337, + "grad_norm": 50796.10546875, + "learning_rate": 7.580763727510369e-05, + "loss": 2.3234, + "step": 6554 + }, + { + "epoch": 1.2286785379568885, + "grad_norm": 54531.51171875, + "learning_rate": 7.580090668309942e-05, + "loss": 2.1484, + "step": 6555 + }, + { + "epoch": 1.2288659793814434, + "grad_norm": 51381.0, + "learning_rate": 7.579417545384623e-05, + "loss": 2.2349, + "step": 6556 + }, + { + "epoch": 1.2290534208059982, + "grad_norm": 52542.9140625, + "learning_rate": 7.578744358751035e-05, + "loss": 2.2822, + "step": 6557 + }, + { + "epoch": 1.229240862230553, + "grad_norm": 49210.13671875, + "learning_rate": 7.578071108425805e-05, + "loss": 2.2372, + "step": 6558 + }, + { + "epoch": 1.2294283036551077, + "grad_norm": 54563.375, + "learning_rate": 7.577397794425561e-05, + "loss": 2.2743, + "step": 6559 + }, + { + "epoch": 1.2296157450796625, + "grad_norm": 49233.01953125, + "learning_rate": 7.576724416766933e-05, + "loss": 2.2846, + "step": 6560 + }, + { + "epoch": 1.2298031865042174, + "grad_norm": 53117.5, + "learning_rate": 7.576050975466553e-05, + "loss": 2.3104, + "step": 6561 + }, + { + "epoch": 1.2299906279287722, + "grad_norm": 52662.21484375, + "learning_rate": 7.575377470541053e-05, + "loss": 2.178, + "step": 6562 + }, + { + "epoch": 1.230178069353327, + "grad_norm": 53956.6796875, + "learning_rate": 7.574703902007068e-05, + "loss": 2.2059, + "step": 6563 + }, + { + "epoch": 1.230365510777882, + "grad_norm": 53215.66015625, + "learning_rate": 7.574030269881236e-05, + "loss": 2.2617, + "step": 6564 + }, + { + "epoch": 1.2305529522024368, + "grad_norm": 50120.64453125, + "learning_rate": 7.573356574180193e-05, + "loss": 2.2407, + "step": 6565 + }, + { + "epoch": 1.2307403936269916, + "grad_norm": 49986.359375, + "learning_rate": 7.57268281492058e-05, + "loss": 2.2477, + "step": 6566 + }, + { + "epoch": 1.2309278350515465, + "grad_norm": 48978.2734375, + "learning_rate": 7.572008992119037e-05, + "loss": 2.2359, + "step": 6567 + }, + { + "epoch": 1.2311152764761013, + "grad_norm": 50524.67578125, + "learning_rate": 7.571335105792206e-05, + "loss": 2.1681, + "step": 6568 + }, + { + "epoch": 1.2313027179006562, + "grad_norm": 49336.7421875, + "learning_rate": 7.57066115595673e-05, + "loss": 2.2638, + "step": 6569 + }, + { + "epoch": 1.2314901593252108, + "grad_norm": 49393.9375, + "learning_rate": 7.569987142629258e-05, + "loss": 2.1923, + "step": 6570 + }, + { + "epoch": 1.2316776007497656, + "grad_norm": 54555.85546875, + "learning_rate": 7.569313065826435e-05, + "loss": 2.2129, + "step": 6571 + }, + { + "epoch": 1.2318650421743205, + "grad_norm": 51798.58203125, + "learning_rate": 7.568638925564911e-05, + "loss": 2.259, + "step": 6572 + }, + { + "epoch": 1.2320524835988753, + "grad_norm": 48001.203125, + "learning_rate": 7.567964721861336e-05, + "loss": 2.2379, + "step": 6573 + }, + { + "epoch": 1.2322399250234302, + "grad_norm": 57749.30859375, + "learning_rate": 7.56729045473236e-05, + "loss": 2.2166, + "step": 6574 + }, + { + "epoch": 1.232427366447985, + "grad_norm": 51598.890625, + "learning_rate": 7.566616124194641e-05, + "loss": 2.3015, + "step": 6575 + }, + { + "epoch": 1.2326148078725399, + "grad_norm": 51667.75, + "learning_rate": 7.56594173026483e-05, + "loss": 2.2723, + "step": 6576 + }, + { + "epoch": 1.2328022492970947, + "grad_norm": 51490.625, + "learning_rate": 7.565267272959585e-05, + "loss": 2.2008, + "step": 6577 + }, + { + "epoch": 1.2329896907216495, + "grad_norm": 52070.97265625, + "learning_rate": 7.564592752295566e-05, + "loss": 2.1662, + "step": 6578 + }, + { + "epoch": 1.2331771321462044, + "grad_norm": 50522.4375, + "learning_rate": 7.563918168289427e-05, + "loss": 2.2128, + "step": 6579 + }, + { + "epoch": 1.2333645735707592, + "grad_norm": 53154.1953125, + "learning_rate": 7.563243520957837e-05, + "loss": 2.1327, + "step": 6580 + }, + { + "epoch": 1.2335520149953139, + "grad_norm": 51664.5390625, + "learning_rate": 7.562568810317455e-05, + "loss": 2.2028, + "step": 6581 + }, + { + "epoch": 1.2337394564198687, + "grad_norm": 51855.57421875, + "learning_rate": 7.561894036384944e-05, + "loss": 2.2037, + "step": 6582 + }, + { + "epoch": 1.2339268978444236, + "grad_norm": 53016.48046875, + "learning_rate": 7.561219199176974e-05, + "loss": 2.2532, + "step": 6583 + }, + { + "epoch": 1.2341143392689784, + "grad_norm": 51031.8671875, + "learning_rate": 7.56054429871021e-05, + "loss": 2.2709, + "step": 6584 + }, + { + "epoch": 1.2343017806935332, + "grad_norm": 48999.7734375, + "learning_rate": 7.559869335001319e-05, + "loss": 2.2952, + "step": 6585 + }, + { + "epoch": 1.234489222118088, + "grad_norm": 49959.75390625, + "learning_rate": 7.559194308066977e-05, + "loss": 2.2604, + "step": 6586 + }, + { + "epoch": 1.234676663542643, + "grad_norm": 53236.51953125, + "learning_rate": 7.558519217923854e-05, + "loss": 2.2719, + "step": 6587 + }, + { + "epoch": 1.2348641049671978, + "grad_norm": 53869.37109375, + "learning_rate": 7.557844064588622e-05, + "loss": 2.2398, + "step": 6588 + }, + { + "epoch": 1.2350515463917526, + "grad_norm": 54514.0859375, + "learning_rate": 7.557168848077959e-05, + "loss": 2.2549, + "step": 6589 + }, + { + "epoch": 1.2352389878163075, + "grad_norm": 49141.1875, + "learning_rate": 7.55649356840854e-05, + "loss": 2.2963, + "step": 6590 + }, + { + "epoch": 1.2354264292408623, + "grad_norm": 52903.4921875, + "learning_rate": 7.555818225597044e-05, + "loss": 2.1594, + "step": 6591 + }, + { + "epoch": 1.235613870665417, + "grad_norm": 50360.640625, + "learning_rate": 7.555142819660153e-05, + "loss": 2.2151, + "step": 6592 + }, + { + "epoch": 1.2358013120899718, + "grad_norm": 50590.58203125, + "learning_rate": 7.554467350614546e-05, + "loss": 2.2144, + "step": 6593 + }, + { + "epoch": 1.2359887535145266, + "grad_norm": 54324.05859375, + "learning_rate": 7.553791818476908e-05, + "loss": 2.1728, + "step": 6594 + }, + { + "epoch": 1.2361761949390815, + "grad_norm": 52590.26171875, + "learning_rate": 7.553116223263922e-05, + "loss": 2.2655, + "step": 6595 + }, + { + "epoch": 1.2363636363636363, + "grad_norm": 51185.078125, + "learning_rate": 7.552440564992277e-05, + "loss": 2.2561, + "step": 6596 + }, + { + "epoch": 1.2365510777881912, + "grad_norm": 52441.5078125, + "learning_rate": 7.55176484367866e-05, + "loss": 2.297, + "step": 6597 + }, + { + "epoch": 1.236738519212746, + "grad_norm": 53730.8125, + "learning_rate": 7.551089059339756e-05, + "loss": 2.2128, + "step": 6598 + }, + { + "epoch": 1.2369259606373009, + "grad_norm": 48954.93359375, + "learning_rate": 7.550413211992261e-05, + "loss": 2.2144, + "step": 6599 + }, + { + "epoch": 1.2371134020618557, + "grad_norm": 54091.96484375, + "learning_rate": 7.549737301652869e-05, + "loss": 2.2243, + "step": 6600 + }, + { + "epoch": 1.2373008434864106, + "grad_norm": 52964.33203125, + "learning_rate": 7.549061328338269e-05, + "loss": 2.2157, + "step": 6601 + }, + { + "epoch": 1.2374882849109654, + "grad_norm": 47978.17578125, + "learning_rate": 7.548385292065161e-05, + "loss": 2.2837, + "step": 6602 + }, + { + "epoch": 1.23767572633552, + "grad_norm": 48405.375, + "learning_rate": 7.547709192850239e-05, + "loss": 2.1999, + "step": 6603 + }, + { + "epoch": 1.237863167760075, + "grad_norm": 51723.484375, + "learning_rate": 7.547033030710204e-05, + "loss": 2.2568, + "step": 6604 + }, + { + "epoch": 1.2380506091846297, + "grad_norm": 50497.83203125, + "learning_rate": 7.546356805661754e-05, + "loss": 2.2671, + "step": 6605 + }, + { + "epoch": 1.2382380506091846, + "grad_norm": 50226.75, + "learning_rate": 7.545680517721593e-05, + "loss": 2.2539, + "step": 6606 + }, + { + "epoch": 1.2384254920337394, + "grad_norm": 49170.87109375, + "learning_rate": 7.545004166906424e-05, + "loss": 2.1813, + "step": 6607 + }, + { + "epoch": 1.2386129334582943, + "grad_norm": 49025.7578125, + "learning_rate": 7.544327753232952e-05, + "loss": 2.3093, + "step": 6608 + }, + { + "epoch": 1.238800374882849, + "grad_norm": 56715.05078125, + "learning_rate": 7.543651276717883e-05, + "loss": 2.192, + "step": 6609 + }, + { + "epoch": 1.238987816307404, + "grad_norm": 51288.28125, + "learning_rate": 7.542974737377926e-05, + "loss": 2.3282, + "step": 6610 + }, + { + "epoch": 1.2391752577319588, + "grad_norm": 50118.1015625, + "learning_rate": 7.542298135229788e-05, + "loss": 2.273, + "step": 6611 + }, + { + "epoch": 1.2393626991565136, + "grad_norm": 50453.53515625, + "learning_rate": 7.541621470290183e-05, + "loss": 2.2491, + "step": 6612 + }, + { + "epoch": 1.2395501405810685, + "grad_norm": 46974.734375, + "learning_rate": 7.540944742575824e-05, + "loss": 2.2425, + "step": 6613 + }, + { + "epoch": 1.2397375820056233, + "grad_norm": 51813.93359375, + "learning_rate": 7.540267952103423e-05, + "loss": 2.2473, + "step": 6614 + }, + { + "epoch": 1.2399250234301782, + "grad_norm": 52110.80078125, + "learning_rate": 7.539591098889698e-05, + "loss": 2.2662, + "step": 6615 + }, + { + "epoch": 1.2401124648547328, + "grad_norm": 52345.28125, + "learning_rate": 7.538914182951365e-05, + "loss": 2.464, + "step": 6616 + }, + { + "epoch": 1.2402999062792877, + "grad_norm": 44305.640625, + "learning_rate": 7.538237204305144e-05, + "loss": 2.2301, + "step": 6617 + }, + { + "epoch": 1.2404873477038425, + "grad_norm": 47004.90234375, + "learning_rate": 7.537560162967754e-05, + "loss": 2.2481, + "step": 6618 + }, + { + "epoch": 1.2406747891283973, + "grad_norm": 49013.9921875, + "learning_rate": 7.53688305895592e-05, + "loss": 2.2425, + "step": 6619 + }, + { + "epoch": 1.2408622305529522, + "grad_norm": 51001.734375, + "learning_rate": 7.53620589228636e-05, + "loss": 2.2259, + "step": 6620 + }, + { + "epoch": 1.241049671977507, + "grad_norm": 51964.80078125, + "learning_rate": 7.535528662975805e-05, + "loss": 2.2902, + "step": 6621 + }, + { + "epoch": 1.2412371134020619, + "grad_norm": 48413.50390625, + "learning_rate": 7.534851371040979e-05, + "loss": 2.2345, + "step": 6622 + }, + { + "epoch": 1.2414245548266167, + "grad_norm": 50351.91015625, + "learning_rate": 7.534174016498611e-05, + "loss": 2.2409, + "step": 6623 + }, + { + "epoch": 1.2416119962511716, + "grad_norm": 53680.34765625, + "learning_rate": 7.533496599365431e-05, + "loss": 2.27, + "step": 6624 + }, + { + "epoch": 1.2417994376757264, + "grad_norm": 55763.98828125, + "learning_rate": 7.532819119658169e-05, + "loss": 2.2158, + "step": 6625 + }, + { + "epoch": 1.2419868791002813, + "grad_norm": 50628.8828125, + "learning_rate": 7.532141577393558e-05, + "loss": 2.2409, + "step": 6626 + }, + { + "epoch": 1.242174320524836, + "grad_norm": 54334.21484375, + "learning_rate": 7.531463972588333e-05, + "loss": 2.2987, + "step": 6627 + }, + { + "epoch": 1.2423617619493907, + "grad_norm": 49091.83203125, + "learning_rate": 7.530786305259231e-05, + "loss": 2.3016, + "step": 6628 + }, + { + "epoch": 1.2425492033739456, + "grad_norm": 49346.67578125, + "learning_rate": 7.530108575422988e-05, + "loss": 2.2543, + "step": 6629 + }, + { + "epoch": 1.2427366447985004, + "grad_norm": 49114.5, + "learning_rate": 7.529430783096343e-05, + "loss": 2.3552, + "step": 6630 + }, + { + "epoch": 1.2429240862230553, + "grad_norm": 51369.28515625, + "learning_rate": 7.528752928296037e-05, + "loss": 2.1754, + "step": 6631 + }, + { + "epoch": 1.2431115276476101, + "grad_norm": 53629.87109375, + "learning_rate": 7.528075011038813e-05, + "loss": 2.2464, + "step": 6632 + }, + { + "epoch": 1.243298969072165, + "grad_norm": 48870.1015625, + "learning_rate": 7.527397031341413e-05, + "loss": 2.3162, + "step": 6633 + }, + { + "epoch": 1.2434864104967198, + "grad_norm": 51531.40234375, + "learning_rate": 7.526718989220585e-05, + "loss": 2.2496, + "step": 6634 + }, + { + "epoch": 1.2436738519212747, + "grad_norm": 53556.41796875, + "learning_rate": 7.526040884693072e-05, + "loss": 2.1442, + "step": 6635 + }, + { + "epoch": 1.2438612933458295, + "grad_norm": 52680.17578125, + "learning_rate": 7.525362717775626e-05, + "loss": 2.2486, + "step": 6636 + }, + { + "epoch": 1.2440487347703844, + "grad_norm": 59546.38671875, + "learning_rate": 7.524684488484994e-05, + "loss": 2.2422, + "step": 6637 + }, + { + "epoch": 1.244236176194939, + "grad_norm": 55536.51953125, + "learning_rate": 7.524006196837927e-05, + "loss": 2.2053, + "step": 6638 + }, + { + "epoch": 1.2444236176194938, + "grad_norm": 51343.58203125, + "learning_rate": 7.523327842851182e-05, + "loss": 2.1886, + "step": 6639 + }, + { + "epoch": 1.2446110590440487, + "grad_norm": 48167.6328125, + "learning_rate": 7.522649426541511e-05, + "loss": 2.2471, + "step": 6640 + }, + { + "epoch": 1.2447985004686035, + "grad_norm": 47333.91796875, + "learning_rate": 7.521970947925667e-05, + "loss": 2.2789, + "step": 6641 + }, + { + "epoch": 1.2449859418931584, + "grad_norm": 52063.4140625, + "learning_rate": 7.521292407020413e-05, + "loss": 2.2177, + "step": 6642 + }, + { + "epoch": 1.2451733833177132, + "grad_norm": 50905.85546875, + "learning_rate": 7.520613803842504e-05, + "loss": 2.2564, + "step": 6643 + }, + { + "epoch": 1.245360824742268, + "grad_norm": 48678.45703125, + "learning_rate": 7.519935138408701e-05, + "loss": 2.226, + "step": 6644 + }, + { + "epoch": 1.245548266166823, + "grad_norm": 48570.1328125, + "learning_rate": 7.51925641073577e-05, + "loss": 2.2745, + "step": 6645 + }, + { + "epoch": 1.2457357075913777, + "grad_norm": 48383.85546875, + "learning_rate": 7.518577620840469e-05, + "loss": 2.2397, + "step": 6646 + }, + { + "epoch": 1.2459231490159326, + "grad_norm": 54921.52734375, + "learning_rate": 7.517898768739569e-05, + "loss": 2.2372, + "step": 6647 + }, + { + "epoch": 1.2461105904404874, + "grad_norm": 53528.3515625, + "learning_rate": 7.517219854449831e-05, + "loss": 2.2049, + "step": 6648 + }, + { + "epoch": 1.246298031865042, + "grad_norm": 50747.28515625, + "learning_rate": 7.516540877988028e-05, + "loss": 2.3043, + "step": 6649 + }, + { + "epoch": 1.246485473289597, + "grad_norm": 49365.84375, + "learning_rate": 7.515861839370927e-05, + "loss": 2.242, + "step": 6650 + }, + { + "epoch": 1.2466729147141518, + "grad_norm": 50323.421875, + "learning_rate": 7.515182738615303e-05, + "loss": 2.1853, + "step": 6651 + }, + { + "epoch": 1.2468603561387066, + "grad_norm": 48074.6640625, + "learning_rate": 7.514503575737926e-05, + "loss": 2.2175, + "step": 6652 + }, + { + "epoch": 1.2470477975632615, + "grad_norm": 50873.328125, + "learning_rate": 7.513824350755567e-05, + "loss": 2.258, + "step": 6653 + }, + { + "epoch": 1.2472352389878163, + "grad_norm": 50729.15625, + "learning_rate": 7.51314506368501e-05, + "loss": 2.3186, + "step": 6654 + }, + { + "epoch": 1.2474226804123711, + "grad_norm": 49058.4453125, + "learning_rate": 7.512465714543026e-05, + "loss": 2.2606, + "step": 6655 + }, + { + "epoch": 1.247610121836926, + "grad_norm": 51695.15234375, + "learning_rate": 7.511786303346397e-05, + "loss": 2.3062, + "step": 6656 + }, + { + "epoch": 1.2477975632614808, + "grad_norm": 51333.92578125, + "learning_rate": 7.511106830111903e-05, + "loss": 2.2294, + "step": 6657 + }, + { + "epoch": 1.2479850046860357, + "grad_norm": 57072.61328125, + "learning_rate": 7.510427294856324e-05, + "loss": 2.1958, + "step": 6658 + }, + { + "epoch": 1.2481724461105905, + "grad_norm": 46248.12890625, + "learning_rate": 7.509747697596448e-05, + "loss": 2.2673, + "step": 6659 + }, + { + "epoch": 1.2483598875351452, + "grad_norm": 54799.20703125, + "learning_rate": 7.509068038349057e-05, + "loss": 2.2844, + "step": 6660 + }, + { + "epoch": 1.2485473289597, + "grad_norm": 52844.703125, + "learning_rate": 7.508388317130937e-05, + "loss": 2.2719, + "step": 6661 + }, + { + "epoch": 1.2487347703842548, + "grad_norm": 53770.81640625, + "learning_rate": 7.507708533958882e-05, + "loss": 2.4196, + "step": 6662 + }, + { + "epoch": 1.2489222118088097, + "grad_norm": 53537.9140625, + "learning_rate": 7.507028688849673e-05, + "loss": 2.2628, + "step": 6663 + }, + { + "epoch": 1.2491096532333645, + "grad_norm": 51809.578125, + "learning_rate": 7.506348781820107e-05, + "loss": 2.2296, + "step": 6664 + }, + { + "epoch": 1.2492970946579194, + "grad_norm": 48143.99609375, + "learning_rate": 7.505668812886978e-05, + "loss": 2.2468, + "step": 6665 + }, + { + "epoch": 1.2494845360824742, + "grad_norm": 50901.19140625, + "learning_rate": 7.504988782067075e-05, + "loss": 2.2172, + "step": 6666 + }, + { + "epoch": 1.249671977507029, + "grad_norm": 48690.359375, + "learning_rate": 7.5043086893772e-05, + "loss": 2.2965, + "step": 6667 + }, + { + "epoch": 1.249859418931584, + "grad_norm": 52859.203125, + "learning_rate": 7.503628534834144e-05, + "loss": 2.2036, + "step": 6668 + }, + { + "epoch": 1.2500468603561388, + "grad_norm": 53053.33203125, + "learning_rate": 7.50294831845471e-05, + "loss": 2.3015, + "step": 6669 + }, + { + "epoch": 1.2502343017806936, + "grad_norm": 52119.38671875, + "learning_rate": 7.502268040255698e-05, + "loss": 2.2527, + "step": 6670 + }, + { + "epoch": 1.2504217432052482, + "grad_norm": 48341.33984375, + "learning_rate": 7.50158770025391e-05, + "loss": 2.2742, + "step": 6671 + }, + { + "epoch": 1.2506091846298033, + "grad_norm": 56235.05078125, + "learning_rate": 7.500907298466149e-05, + "loss": 2.2278, + "step": 6672 + }, + { + "epoch": 1.250796626054358, + "grad_norm": 47858.03125, + "learning_rate": 7.500226834909221e-05, + "loss": 2.2372, + "step": 6673 + }, + { + "epoch": 1.2509840674789128, + "grad_norm": 53880.28125, + "learning_rate": 7.49954630959993e-05, + "loss": 2.1971, + "step": 6674 + }, + { + "epoch": 1.2511715089034676, + "grad_norm": 48615.1875, + "learning_rate": 7.498865722555087e-05, + "loss": 2.2909, + "step": 6675 + }, + { + "epoch": 1.2513589503280225, + "grad_norm": 54918.921875, + "learning_rate": 7.498185073791499e-05, + "loss": 2.3271, + "step": 6676 + }, + { + "epoch": 1.2515463917525773, + "grad_norm": 53896.62109375, + "learning_rate": 7.497504363325981e-05, + "loss": 2.2027, + "step": 6677 + }, + { + "epoch": 1.2517338331771322, + "grad_norm": 48336.51953125, + "learning_rate": 7.496823591175342e-05, + "loss": 2.2564, + "step": 6678 + }, + { + "epoch": 1.251921274601687, + "grad_norm": 48423.22265625, + "learning_rate": 7.496142757356399e-05, + "loss": 2.2641, + "step": 6679 + }, + { + "epoch": 1.2521087160262419, + "grad_norm": 51163.5703125, + "learning_rate": 7.495461861885966e-05, + "loss": 2.1904, + "step": 6680 + }, + { + "epoch": 1.2522961574507967, + "grad_norm": 46238.3515625, + "learning_rate": 7.494780904780856e-05, + "loss": 2.2796, + "step": 6681 + }, + { + "epoch": 1.2524835988753513, + "grad_norm": 49327.75390625, + "learning_rate": 7.494099886057898e-05, + "loss": 2.2919, + "step": 6682 + }, + { + "epoch": 1.2526710402999064, + "grad_norm": 50499.76171875, + "learning_rate": 7.493418805733903e-05, + "loss": 2.242, + "step": 6683 + }, + { + "epoch": 1.252858481724461, + "grad_norm": 48521.55859375, + "learning_rate": 7.492737663825697e-05, + "loss": 2.2512, + "step": 6684 + }, + { + "epoch": 1.2530459231490159, + "grad_norm": 53113.16015625, + "learning_rate": 7.492056460350103e-05, + "loss": 2.2554, + "step": 6685 + }, + { + "epoch": 1.2532333645735707, + "grad_norm": 47566.85546875, + "learning_rate": 7.491375195323944e-05, + "loss": 2.177, + "step": 6686 + }, + { + "epoch": 1.2534208059981256, + "grad_norm": 51636.9140625, + "learning_rate": 7.490693868764049e-05, + "loss": 2.3122, + "step": 6687 + }, + { + "epoch": 1.2536082474226804, + "grad_norm": 47796.5546875, + "learning_rate": 7.490012480687246e-05, + "loss": 2.2435, + "step": 6688 + }, + { + "epoch": 1.2537956888472352, + "grad_norm": 52161.26953125, + "learning_rate": 7.48933103111036e-05, + "loss": 2.2128, + "step": 6689 + }, + { + "epoch": 1.25398313027179, + "grad_norm": 57023.921875, + "learning_rate": 7.488649520050228e-05, + "loss": 2.2867, + "step": 6690 + }, + { + "epoch": 1.254170571696345, + "grad_norm": 49940.71875, + "learning_rate": 7.487967947523678e-05, + "loss": 2.1937, + "step": 6691 + }, + { + "epoch": 1.2543580131208998, + "grad_norm": 48920.66015625, + "learning_rate": 7.487286313547546e-05, + "loss": 2.1983, + "step": 6692 + }, + { + "epoch": 1.2545454545454544, + "grad_norm": 49065.125, + "learning_rate": 7.486604618138666e-05, + "loss": 2.3155, + "step": 6693 + }, + { + "epoch": 1.2547328959700095, + "grad_norm": 56197.58203125, + "learning_rate": 7.485922861313876e-05, + "loss": 2.2195, + "step": 6694 + }, + { + "epoch": 1.254920337394564, + "grad_norm": 51873.609375, + "learning_rate": 7.485241043090016e-05, + "loss": 2.3063, + "step": 6695 + }, + { + "epoch": 1.255107778819119, + "grad_norm": 50248.10546875, + "learning_rate": 7.484559163483922e-05, + "loss": 2.2337, + "step": 6696 + }, + { + "epoch": 1.2552952202436738, + "grad_norm": 49485.40625, + "learning_rate": 7.48387722251244e-05, + "loss": 2.1988, + "step": 6697 + }, + { + "epoch": 1.2554826616682286, + "grad_norm": 53858.8671875, + "learning_rate": 7.48319522019241e-05, + "loss": 2.2993, + "step": 6698 + }, + { + "epoch": 1.2556701030927835, + "grad_norm": 50016.3203125, + "learning_rate": 7.482513156540677e-05, + "loss": 2.2485, + "step": 6699 + }, + { + "epoch": 1.2558575445173383, + "grad_norm": 55144.56640625, + "learning_rate": 7.481831031574088e-05, + "loss": 2.2055, + "step": 6700 + }, + { + "epoch": 1.2560449859418932, + "grad_norm": 53446.48828125, + "learning_rate": 7.48114884530949e-05, + "loss": 2.1915, + "step": 6701 + }, + { + "epoch": 1.256232427366448, + "grad_norm": 49413.97265625, + "learning_rate": 7.480466597763733e-05, + "loss": 2.2288, + "step": 6702 + }, + { + "epoch": 1.2564198687910029, + "grad_norm": 52035.32421875, + "learning_rate": 7.479784288953669e-05, + "loss": 2.2016, + "step": 6703 + }, + { + "epoch": 1.2566073102155577, + "grad_norm": 51423.55078125, + "learning_rate": 7.479101918896144e-05, + "loss": 2.2422, + "step": 6704 + }, + { + "epoch": 1.2567947516401126, + "grad_norm": 49165.92578125, + "learning_rate": 7.478419487608018e-05, + "loss": 2.2272, + "step": 6705 + }, + { + "epoch": 1.2569821930646672, + "grad_norm": 49802.30078125, + "learning_rate": 7.477736995106144e-05, + "loss": 2.256, + "step": 6706 + }, + { + "epoch": 1.2571696344892223, + "grad_norm": 53527.27734375, + "learning_rate": 7.477054441407379e-05, + "loss": 2.2888, + "step": 6707 + }, + { + "epoch": 1.2573570759137769, + "grad_norm": 54213.0390625, + "learning_rate": 7.476371826528579e-05, + "loss": 2.3325, + "step": 6708 + }, + { + "epoch": 1.2575445173383317, + "grad_norm": 51132.4140625, + "learning_rate": 7.475689150486608e-05, + "loss": 2.2222, + "step": 6709 + }, + { + "epoch": 1.2577319587628866, + "grad_norm": 53182.109375, + "learning_rate": 7.475006413298324e-05, + "loss": 2.1998, + "step": 6710 + }, + { + "epoch": 1.2579194001874414, + "grad_norm": 64740.09375, + "learning_rate": 7.47432361498059e-05, + "loss": 2.2229, + "step": 6711 + }, + { + "epoch": 1.2581068416119963, + "grad_norm": 52278.5078125, + "learning_rate": 7.47364075555027e-05, + "loss": 2.2083, + "step": 6712 + }, + { + "epoch": 1.258294283036551, + "grad_norm": 50053.671875, + "learning_rate": 7.472957835024233e-05, + "loss": 2.1873, + "step": 6713 + }, + { + "epoch": 1.258481724461106, + "grad_norm": 51468.203125, + "learning_rate": 7.47227485341934e-05, + "loss": 2.2339, + "step": 6714 + }, + { + "epoch": 1.2586691658856608, + "grad_norm": 51415.7109375, + "learning_rate": 7.471591810752466e-05, + "loss": 2.1831, + "step": 6715 + }, + { + "epoch": 1.2588566073102156, + "grad_norm": 52624.00390625, + "learning_rate": 7.470908707040478e-05, + "loss": 2.2975, + "step": 6716 + }, + { + "epoch": 1.2590440487347703, + "grad_norm": 50519.53125, + "learning_rate": 7.470225542300248e-05, + "loss": 2.2048, + "step": 6717 + }, + { + "epoch": 1.2592314901593253, + "grad_norm": 47397.76953125, + "learning_rate": 7.469542316548653e-05, + "loss": 2.2582, + "step": 6718 + }, + { + "epoch": 1.25941893158388, + "grad_norm": 52314.015625, + "learning_rate": 7.468859029802559e-05, + "loss": 2.2355, + "step": 6719 + }, + { + "epoch": 1.2596063730084348, + "grad_norm": 57090.49609375, + "learning_rate": 7.468175682078853e-05, + "loss": 2.1765, + "step": 6720 + }, + { + "epoch": 1.2597938144329897, + "grad_norm": 48440.69140625, + "learning_rate": 7.467492273394405e-05, + "loss": 2.1798, + "step": 6721 + }, + { + "epoch": 1.2599812558575445, + "grad_norm": 51075.20703125, + "learning_rate": 7.4668088037661e-05, + "loss": 2.2526, + "step": 6722 + }, + { + "epoch": 1.2601686972820993, + "grad_norm": 49647.99609375, + "learning_rate": 7.466125273210812e-05, + "loss": 2.2582, + "step": 6723 + }, + { + "epoch": 1.2603561387066542, + "grad_norm": 49357.23828125, + "learning_rate": 7.465441681745428e-05, + "loss": 2.211, + "step": 6724 + }, + { + "epoch": 1.260543580131209, + "grad_norm": 52984.71875, + "learning_rate": 7.464758029386833e-05, + "loss": 2.2362, + "step": 6725 + }, + { + "epoch": 1.2607310215557639, + "grad_norm": 48929.33984375, + "learning_rate": 7.464074316151908e-05, + "loss": 2.2628, + "step": 6726 + }, + { + "epoch": 1.2609184629803187, + "grad_norm": 48798.9453125, + "learning_rate": 7.463390542057543e-05, + "loss": 2.2757, + "step": 6727 + }, + { + "epoch": 1.2611059044048734, + "grad_norm": 49619.30078125, + "learning_rate": 7.462706707120626e-05, + "loss": 2.2447, + "step": 6728 + }, + { + "epoch": 1.2612933458294284, + "grad_norm": 53186.76171875, + "learning_rate": 7.462022811358045e-05, + "loss": 2.3084, + "step": 6729 + }, + { + "epoch": 1.261480787253983, + "grad_norm": 53120.23828125, + "learning_rate": 7.461338854786693e-05, + "loss": 2.1627, + "step": 6730 + }, + { + "epoch": 1.261668228678538, + "grad_norm": 50813.13671875, + "learning_rate": 7.460654837423464e-05, + "loss": 2.2037, + "step": 6731 + }, + { + "epoch": 1.2618556701030927, + "grad_norm": 48838.96484375, + "learning_rate": 7.459970759285248e-05, + "loss": 2.252, + "step": 6732 + }, + { + "epoch": 1.2620431115276476, + "grad_norm": 54703.87890625, + "learning_rate": 7.459286620388947e-05, + "loss": 2.2402, + "step": 6733 + }, + { + "epoch": 1.2622305529522024, + "grad_norm": 50051.78125, + "learning_rate": 7.458602420751453e-05, + "loss": 2.1986, + "step": 6734 + }, + { + "epoch": 1.2624179943767573, + "grad_norm": 49646.74609375, + "learning_rate": 7.457918160389667e-05, + "loss": 2.2279, + "step": 6735 + }, + { + "epoch": 1.2626054358013121, + "grad_norm": 52779.609375, + "learning_rate": 7.457233839320489e-05, + "loss": 2.2185, + "step": 6736 + }, + { + "epoch": 1.262792877225867, + "grad_norm": 50855.81640625, + "learning_rate": 7.456549457560821e-05, + "loss": 2.1883, + "step": 6737 + }, + { + "epoch": 1.2629803186504218, + "grad_norm": 59237.0078125, + "learning_rate": 7.455865015127567e-05, + "loss": 2.2619, + "step": 6738 + }, + { + "epoch": 1.2631677600749764, + "grad_norm": 53456.734375, + "learning_rate": 7.455180512037629e-05, + "loss": 2.2554, + "step": 6739 + }, + { + "epoch": 1.2633552014995315, + "grad_norm": 50335.87890625, + "learning_rate": 7.45449594830792e-05, + "loss": 2.2209, + "step": 6740 + }, + { + "epoch": 1.2635426429240861, + "grad_norm": 54774.54296875, + "learning_rate": 7.453811323955339e-05, + "loss": 2.2685, + "step": 6741 + }, + { + "epoch": 1.263730084348641, + "grad_norm": 53044.18359375, + "learning_rate": 7.453126638996802e-05, + "loss": 2.2342, + "step": 6742 + }, + { + "epoch": 1.2639175257731958, + "grad_norm": 55869.78515625, + "learning_rate": 7.452441893449217e-05, + "loss": 2.2139, + "step": 6743 + }, + { + "epoch": 1.2641049671977507, + "grad_norm": 53642.49609375, + "learning_rate": 7.451757087329496e-05, + "loss": 2.2435, + "step": 6744 + }, + { + "epoch": 1.2642924086223055, + "grad_norm": 54271.34765625, + "learning_rate": 7.451072220654556e-05, + "loss": 2.243, + "step": 6745 + }, + { + "epoch": 1.2644798500468604, + "grad_norm": 52505.0703125, + "learning_rate": 7.450387293441308e-05, + "loss": 2.2, + "step": 6746 + }, + { + "epoch": 1.2646672914714152, + "grad_norm": 49802.328125, + "learning_rate": 7.44970230570667e-05, + "loss": 2.2962, + "step": 6747 + }, + { + "epoch": 1.26485473289597, + "grad_norm": 48483.73046875, + "learning_rate": 7.449017257467565e-05, + "loss": 2.1969, + "step": 6748 + }, + { + "epoch": 1.265042174320525, + "grad_norm": 52863.1640625, + "learning_rate": 7.448332148740905e-05, + "loss": 2.2721, + "step": 6749 + }, + { + "epoch": 1.2652296157450795, + "grad_norm": 50807.32421875, + "learning_rate": 7.447646979543617e-05, + "loss": 2.2811, + "step": 6750 + }, + { + "epoch": 1.2654170571696346, + "grad_norm": 50467.5703125, + "learning_rate": 7.446961749892621e-05, + "loss": 2.2553, + "step": 6751 + }, + { + "epoch": 1.2656044985941892, + "grad_norm": 51374.2578125, + "learning_rate": 7.446276459804844e-05, + "loss": 2.2704, + "step": 6752 + }, + { + "epoch": 1.265791940018744, + "grad_norm": 55572.84375, + "learning_rate": 7.44559110929721e-05, + "loss": 2.2388, + "step": 6753 + }, + { + "epoch": 1.265979381443299, + "grad_norm": 50073.27734375, + "learning_rate": 7.444905698386644e-05, + "loss": 2.2856, + "step": 6754 + }, + { + "epoch": 1.2661668228678538, + "grad_norm": 49867.60546875, + "learning_rate": 7.44422022709008e-05, + "loss": 2.2539, + "step": 6755 + }, + { + "epoch": 1.2663542642924086, + "grad_norm": 49956.6640625, + "learning_rate": 7.443534695424443e-05, + "loss": 2.2498, + "step": 6756 + }, + { + "epoch": 1.2665417057169635, + "grad_norm": 47567.75390625, + "learning_rate": 7.442849103406668e-05, + "loss": 2.2498, + "step": 6757 + }, + { + "epoch": 1.2667291471415183, + "grad_norm": 55083.16796875, + "learning_rate": 7.442163451053687e-05, + "loss": 2.1872, + "step": 6758 + }, + { + "epoch": 1.2669165885660731, + "grad_norm": 52761.23046875, + "learning_rate": 7.441477738382437e-05, + "loss": 2.1659, + "step": 6759 + }, + { + "epoch": 1.267104029990628, + "grad_norm": 53367.9375, + "learning_rate": 7.44079196540985e-05, + "loss": 2.2709, + "step": 6760 + }, + { + "epoch": 1.2672914714151828, + "grad_norm": 49556.265625, + "learning_rate": 7.440106132152866e-05, + "loss": 2.2375, + "step": 6761 + }, + { + "epoch": 1.2674789128397377, + "grad_norm": 54634.875, + "learning_rate": 7.439420238628427e-05, + "loss": 2.2414, + "step": 6762 + }, + { + "epoch": 1.2676663542642923, + "grad_norm": 50539.30078125, + "learning_rate": 7.438734284853468e-05, + "loss": 2.2256, + "step": 6763 + }, + { + "epoch": 1.2678537956888474, + "grad_norm": 56051.8515625, + "learning_rate": 7.438048270844935e-05, + "loss": 2.3215, + "step": 6764 + }, + { + "epoch": 1.268041237113402, + "grad_norm": 49928.7890625, + "learning_rate": 7.43736219661977e-05, + "loss": 2.2412, + "step": 6765 + }, + { + "epoch": 1.2682286785379568, + "grad_norm": 50552.0234375, + "learning_rate": 7.43667606219492e-05, + "loss": 2.2103, + "step": 6766 + }, + { + "epoch": 1.2684161199625117, + "grad_norm": 49598.0234375, + "learning_rate": 7.435989867587328e-05, + "loss": 2.284, + "step": 6767 + }, + { + "epoch": 1.2686035613870665, + "grad_norm": 48323.765625, + "learning_rate": 7.435303612813949e-05, + "loss": 2.2372, + "step": 6768 + }, + { + "epoch": 1.2687910028116214, + "grad_norm": 49788.0625, + "learning_rate": 7.434617297891724e-05, + "loss": 2.2586, + "step": 6769 + }, + { + "epoch": 1.2689784442361762, + "grad_norm": 49464.18359375, + "learning_rate": 7.433930922837611e-05, + "loss": 2.2912, + "step": 6770 + }, + { + "epoch": 1.269165885660731, + "grad_norm": 50505.17578125, + "learning_rate": 7.433244487668559e-05, + "loss": 2.2492, + "step": 6771 + }, + { + "epoch": 1.269353327085286, + "grad_norm": 53141.859375, + "learning_rate": 7.432557992401521e-05, + "loss": 2.2235, + "step": 6772 + }, + { + "epoch": 1.2695407685098408, + "grad_norm": 52263.34765625, + "learning_rate": 7.431871437053457e-05, + "loss": 2.2235, + "step": 6773 + }, + { + "epoch": 1.2697282099343954, + "grad_norm": 52122.90234375, + "learning_rate": 7.431184821641319e-05, + "loss": 2.28, + "step": 6774 + }, + { + "epoch": 1.2699156513589505, + "grad_norm": 48689.99609375, + "learning_rate": 7.430498146182071e-05, + "loss": 2.2554, + "step": 6775 + }, + { + "epoch": 1.270103092783505, + "grad_norm": 51487.640625, + "learning_rate": 7.429811410692669e-05, + "loss": 2.2684, + "step": 6776 + }, + { + "epoch": 1.27029053420806, + "grad_norm": 48976.9140625, + "learning_rate": 7.429124615190074e-05, + "loss": 2.241, + "step": 6777 + }, + { + "epoch": 1.2704779756326148, + "grad_norm": 47844.18359375, + "learning_rate": 7.428437759691251e-05, + "loss": 2.2758, + "step": 6778 + }, + { + "epoch": 1.2706654170571696, + "grad_norm": 48329.37109375, + "learning_rate": 7.427750844213165e-05, + "loss": 2.2299, + "step": 6779 + }, + { + "epoch": 1.2708528584817245, + "grad_norm": 50991.8515625, + "learning_rate": 7.427063868772779e-05, + "loss": 2.2548, + "step": 6780 + }, + { + "epoch": 1.2710402999062793, + "grad_norm": 47882.1953125, + "learning_rate": 7.426376833387064e-05, + "loss": 2.3393, + "step": 6781 + }, + { + "epoch": 1.2712277413308342, + "grad_norm": 51993.99609375, + "learning_rate": 7.425689738072985e-05, + "loss": 2.2822, + "step": 6782 + }, + { + "epoch": 1.271415182755389, + "grad_norm": 50764.98828125, + "learning_rate": 7.425002582847517e-05, + "loss": 2.2285, + "step": 6783 + }, + { + "epoch": 1.2716026241799439, + "grad_norm": 46700.6484375, + "learning_rate": 7.424315367727627e-05, + "loss": 2.2135, + "step": 6784 + }, + { + "epoch": 1.2717900656044985, + "grad_norm": 52935.66015625, + "learning_rate": 7.423628092730293e-05, + "loss": 2.2052, + "step": 6785 + }, + { + "epoch": 1.2719775070290535, + "grad_norm": 49404.6640625, + "learning_rate": 7.422940757872485e-05, + "loss": 2.229, + "step": 6786 + }, + { + "epoch": 1.2721649484536082, + "grad_norm": 51292.1015625, + "learning_rate": 7.422253363171183e-05, + "loss": 2.2524, + "step": 6787 + }, + { + "epoch": 1.272352389878163, + "grad_norm": 48191.5, + "learning_rate": 7.421565908643362e-05, + "loss": 2.2304, + "step": 6788 + }, + { + "epoch": 1.2725398313027179, + "grad_norm": 49705.63671875, + "learning_rate": 7.420878394306004e-05, + "loss": 2.2457, + "step": 6789 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 55122.15625, + "learning_rate": 7.420190820176088e-05, + "loss": 2.1805, + "step": 6790 + }, + { + "epoch": 1.2729147141518276, + "grad_norm": 50722.25, + "learning_rate": 7.419503186270597e-05, + "loss": 2.2287, + "step": 6791 + }, + { + "epoch": 1.2731021555763824, + "grad_norm": 49274.80078125, + "learning_rate": 7.418815492606513e-05, + "loss": 2.2055, + "step": 6792 + }, + { + "epoch": 1.2732895970009372, + "grad_norm": 51816.7421875, + "learning_rate": 7.418127739200823e-05, + "loss": 2.32, + "step": 6793 + }, + { + "epoch": 1.273477038425492, + "grad_norm": 49135.765625, + "learning_rate": 7.417439926070514e-05, + "loss": 2.2584, + "step": 6794 + }, + { + "epoch": 1.273664479850047, + "grad_norm": 53226.67578125, + "learning_rate": 7.416752053232572e-05, + "loss": 2.2105, + "step": 6795 + }, + { + "epoch": 1.2738519212746016, + "grad_norm": 53377.54296875, + "learning_rate": 7.416064120703989e-05, + "loss": 2.2689, + "step": 6796 + }, + { + "epoch": 1.2740393626991566, + "grad_norm": 51717.52734375, + "learning_rate": 7.415376128501752e-05, + "loss": 2.2208, + "step": 6797 + }, + { + "epoch": 1.2742268041237113, + "grad_norm": 52809.61328125, + "learning_rate": 7.41468807664286e-05, + "loss": 2.3091, + "step": 6798 + }, + { + "epoch": 1.274414245548266, + "grad_norm": 51428.45703125, + "learning_rate": 7.413999965144298e-05, + "loss": 2.239, + "step": 6799 + }, + { + "epoch": 1.274601686972821, + "grad_norm": 52003.875, + "learning_rate": 7.413311794023072e-05, + "loss": 2.241, + "step": 6800 + }, + { + "epoch": 1.2747891283973758, + "grad_norm": 57664.3828125, + "learning_rate": 7.41262356329617e-05, + "loss": 2.2522, + "step": 6801 + }, + { + "epoch": 1.2749765698219306, + "grad_norm": 49857.421875, + "learning_rate": 7.411935272980596e-05, + "loss": 2.2383, + "step": 6802 + }, + { + "epoch": 1.2751640112464855, + "grad_norm": 49323.2421875, + "learning_rate": 7.411246923093348e-05, + "loss": 2.2289, + "step": 6803 + }, + { + "epoch": 1.2753514526710403, + "grad_norm": 49814.671875, + "learning_rate": 7.410558513651424e-05, + "loss": 2.2307, + "step": 6804 + }, + { + "epoch": 1.2755388940955952, + "grad_norm": 47909.14453125, + "learning_rate": 7.409870044671833e-05, + "loss": 2.3007, + "step": 6805 + }, + { + "epoch": 1.27572633552015, + "grad_norm": 53766.125, + "learning_rate": 7.409181516171578e-05, + "loss": 2.2024, + "step": 6806 + }, + { + "epoch": 1.2759137769447046, + "grad_norm": 51761.203125, + "learning_rate": 7.40849292816766e-05, + "loss": 2.2593, + "step": 6807 + }, + { + "epoch": 1.2761012183692597, + "grad_norm": 49983.9296875, + "learning_rate": 7.40780428067709e-05, + "loss": 2.2361, + "step": 6808 + }, + { + "epoch": 1.2762886597938143, + "grad_norm": 50009.14453125, + "learning_rate": 7.407115573716876e-05, + "loss": 2.2368, + "step": 6809 + }, + { + "epoch": 1.2764761012183692, + "grad_norm": 49380.7421875, + "learning_rate": 7.406426807304028e-05, + "loss": 2.2657, + "step": 6810 + }, + { + "epoch": 1.276663542642924, + "grad_norm": 52305.6015625, + "learning_rate": 7.405737981455558e-05, + "loss": 2.2798, + "step": 6811 + }, + { + "epoch": 1.2768509840674789, + "grad_norm": 51550.24609375, + "learning_rate": 7.405049096188479e-05, + "loss": 2.2307, + "step": 6812 + }, + { + "epoch": 1.2770384254920337, + "grad_norm": 55224.03515625, + "learning_rate": 7.404360151519807e-05, + "loss": 2.2996, + "step": 6813 + }, + { + "epoch": 1.2772258669165886, + "grad_norm": 55554.21484375, + "learning_rate": 7.403671147466555e-05, + "loss": 2.2523, + "step": 6814 + }, + { + "epoch": 1.2774133083411434, + "grad_norm": 51513.65234375, + "learning_rate": 7.402982084045742e-05, + "loss": 2.2552, + "step": 6815 + }, + { + "epoch": 1.2776007497656983, + "grad_norm": 48852.1015625, + "learning_rate": 7.402292961274387e-05, + "loss": 2.2411, + "step": 6816 + }, + { + "epoch": 1.277788191190253, + "grad_norm": 49098.21875, + "learning_rate": 7.40160377916951e-05, + "loss": 2.2495, + "step": 6817 + }, + { + "epoch": 1.277975632614808, + "grad_norm": 52283.90625, + "learning_rate": 7.400914537748136e-05, + "loss": 2.2247, + "step": 6818 + }, + { + "epoch": 1.2781630740393628, + "grad_norm": 54329.16015625, + "learning_rate": 7.400225237027283e-05, + "loss": 2.2422, + "step": 6819 + }, + { + "epoch": 1.2783505154639174, + "grad_norm": 50083.8984375, + "learning_rate": 7.399535877023979e-05, + "loss": 2.2243, + "step": 6820 + }, + { + "epoch": 1.2785379568884723, + "grad_norm": 54323.63671875, + "learning_rate": 7.398846457755252e-05, + "loss": 2.2124, + "step": 6821 + }, + { + "epoch": 1.2787253983130271, + "grad_norm": 52294.99609375, + "learning_rate": 7.398156979238126e-05, + "loss": 2.3273, + "step": 6822 + }, + { + "epoch": 1.278912839737582, + "grad_norm": 49113.54296875, + "learning_rate": 7.397467441489632e-05, + "loss": 2.1945, + "step": 6823 + }, + { + "epoch": 1.2791002811621368, + "grad_norm": 53428.9140625, + "learning_rate": 7.3967778445268e-05, + "loss": 2.2525, + "step": 6824 + }, + { + "epoch": 1.2792877225866917, + "grad_norm": 49522.47265625, + "learning_rate": 7.396088188366663e-05, + "loss": 2.1458, + "step": 6825 + }, + { + "epoch": 1.2794751640112465, + "grad_norm": 50673.59375, + "learning_rate": 7.395398473026256e-05, + "loss": 2.2469, + "step": 6826 + }, + { + "epoch": 1.2796626054358013, + "grad_norm": 50980.77734375, + "learning_rate": 7.394708698522609e-05, + "loss": 2.2115, + "step": 6827 + }, + { + "epoch": 1.2798500468603562, + "grad_norm": 52014.85546875, + "learning_rate": 7.394018864872765e-05, + "loss": 2.2578, + "step": 6828 + }, + { + "epoch": 1.280037488284911, + "grad_norm": 55812.63671875, + "learning_rate": 7.393328972093759e-05, + "loss": 2.1827, + "step": 6829 + }, + { + "epoch": 1.2802249297094659, + "grad_norm": 49456.47265625, + "learning_rate": 7.392639020202627e-05, + "loss": 2.2302, + "step": 6830 + }, + { + "epoch": 1.2804123711340205, + "grad_norm": 56162.75, + "learning_rate": 7.391949009216417e-05, + "loss": 2.2125, + "step": 6831 + }, + { + "epoch": 1.2805998125585756, + "grad_norm": 50178.2109375, + "learning_rate": 7.391258939152165e-05, + "loss": 2.211, + "step": 6832 + }, + { + "epoch": 1.2807872539831302, + "grad_norm": 52618.8203125, + "learning_rate": 7.39056881002692e-05, + "loss": 2.1866, + "step": 6833 + }, + { + "epoch": 1.280974695407685, + "grad_norm": 51010.6640625, + "learning_rate": 7.389878621857722e-05, + "loss": 2.2254, + "step": 6834 + }, + { + "epoch": 1.28116213683224, + "grad_norm": 49933.36328125, + "learning_rate": 7.389188374661623e-05, + "loss": 2.172, + "step": 6835 + }, + { + "epoch": 1.2813495782567947, + "grad_norm": 50160.83984375, + "learning_rate": 7.388498068455667e-05, + "loss": 2.2573, + "step": 6836 + }, + { + "epoch": 1.2815370196813496, + "grad_norm": 50873.83984375, + "learning_rate": 7.387807703256907e-05, + "loss": 2.2835, + "step": 6837 + }, + { + "epoch": 1.2817244611059044, + "grad_norm": 50561.44921875, + "learning_rate": 7.387117279082391e-05, + "loss": 2.2991, + "step": 6838 + }, + { + "epoch": 1.2819119025304593, + "grad_norm": 47062.1171875, + "learning_rate": 7.386426795949175e-05, + "loss": 2.2759, + "step": 6839 + }, + { + "epoch": 1.2820993439550141, + "grad_norm": 51316.88671875, + "learning_rate": 7.38573625387431e-05, + "loss": 2.3167, + "step": 6840 + }, + { + "epoch": 1.282286785379569, + "grad_norm": 53483.796875, + "learning_rate": 7.385045652874854e-05, + "loss": 2.2495, + "step": 6841 + }, + { + "epoch": 1.2824742268041236, + "grad_norm": 51970.68359375, + "learning_rate": 7.38435499296786e-05, + "loss": 2.279, + "step": 6842 + }, + { + "epoch": 1.2826616682286787, + "grad_norm": 51004.515625, + "learning_rate": 7.383664274170391e-05, + "loss": 2.2605, + "step": 6843 + }, + { + "epoch": 1.2828491096532333, + "grad_norm": 48161.109375, + "learning_rate": 7.382973496499505e-05, + "loss": 2.241, + "step": 6844 + }, + { + "epoch": 1.2830365510777881, + "grad_norm": 51097.7578125, + "learning_rate": 7.382282659972262e-05, + "loss": 2.2159, + "step": 6845 + }, + { + "epoch": 1.283223992502343, + "grad_norm": 52476.015625, + "learning_rate": 7.381591764605727e-05, + "loss": 2.1728, + "step": 6846 + }, + { + "epoch": 1.2834114339268978, + "grad_norm": 50202.3515625, + "learning_rate": 7.380900810416962e-05, + "loss": 2.3015, + "step": 6847 + }, + { + "epoch": 1.2835988753514527, + "grad_norm": 51859.484375, + "learning_rate": 7.380209797423035e-05, + "loss": 2.2017, + "step": 6848 + }, + { + "epoch": 1.2837863167760075, + "grad_norm": 48489.5234375, + "learning_rate": 7.379518725641012e-05, + "loss": 2.1719, + "step": 6849 + }, + { + "epoch": 1.2839737582005624, + "grad_norm": 51008.31640625, + "learning_rate": 7.378827595087961e-05, + "loss": 2.2478, + "step": 6850 + }, + { + "epoch": 1.2841611996251172, + "grad_norm": 52600.4921875, + "learning_rate": 7.378136405780954e-05, + "loss": 2.2495, + "step": 6851 + }, + { + "epoch": 1.284348641049672, + "grad_norm": 50603.765625, + "learning_rate": 7.377445157737058e-05, + "loss": 2.2167, + "step": 6852 + }, + { + "epoch": 1.2845360824742267, + "grad_norm": 50593.16015625, + "learning_rate": 7.376753850973351e-05, + "loss": 2.2724, + "step": 6853 + }, + { + "epoch": 1.2847235238987817, + "grad_norm": 50597.36328125, + "learning_rate": 7.376062485506907e-05, + "loss": 2.1933, + "step": 6854 + }, + { + "epoch": 1.2849109653233364, + "grad_norm": 52817.421875, + "learning_rate": 7.375371061354797e-05, + "loss": 2.163, + "step": 6855 + }, + { + "epoch": 1.2850984067478912, + "grad_norm": 55018.01953125, + "learning_rate": 7.374679578534103e-05, + "loss": 2.2322, + "step": 6856 + }, + { + "epoch": 1.285285848172446, + "grad_norm": 51328.18359375, + "learning_rate": 7.373988037061902e-05, + "loss": 2.2693, + "step": 6857 + }, + { + "epoch": 1.285473289597001, + "grad_norm": 50358.24609375, + "learning_rate": 7.373296436955277e-05, + "loss": 2.2281, + "step": 6858 + }, + { + "epoch": 1.2856607310215558, + "grad_norm": 48015.859375, + "learning_rate": 7.372604778231304e-05, + "loss": 2.2989, + "step": 6859 + }, + { + "epoch": 1.2858481724461106, + "grad_norm": 51835.359375, + "learning_rate": 7.371913060907068e-05, + "loss": 2.2172, + "step": 6860 + }, + { + "epoch": 1.2860356138706655, + "grad_norm": 48134.32421875, + "learning_rate": 7.371221284999658e-05, + "loss": 2.2322, + "step": 6861 + }, + { + "epoch": 1.2862230552952203, + "grad_norm": 49319.8203125, + "learning_rate": 7.370529450526154e-05, + "loss": 2.2692, + "step": 6862 + }, + { + "epoch": 1.2864104967197751, + "grad_norm": 54755.8984375, + "learning_rate": 7.369837557503648e-05, + "loss": 2.2241, + "step": 6863 + }, + { + "epoch": 1.2865979381443298, + "grad_norm": 52179.91796875, + "learning_rate": 7.369145605949226e-05, + "loss": 2.2415, + "step": 6864 + }, + { + "epoch": 1.2867853795688848, + "grad_norm": 46879.3046875, + "learning_rate": 7.368453595879978e-05, + "loss": 2.2729, + "step": 6865 + }, + { + "epoch": 1.2869728209934395, + "grad_norm": 48734.125, + "learning_rate": 7.367761527312999e-05, + "loss": 2.3212, + "step": 6866 + }, + { + "epoch": 1.2871602624179943, + "grad_norm": 52037.89453125, + "learning_rate": 7.367069400265378e-05, + "loss": 2.3739, + "step": 6867 + }, + { + "epoch": 1.2873477038425492, + "grad_norm": 54150.30859375, + "learning_rate": 7.366377214754213e-05, + "loss": 2.2055, + "step": 6868 + }, + { + "epoch": 1.287535145267104, + "grad_norm": 47046.1640625, + "learning_rate": 7.3656849707966e-05, + "loss": 2.1944, + "step": 6869 + }, + { + "epoch": 1.2877225866916588, + "grad_norm": 48356.70703125, + "learning_rate": 7.364992668409633e-05, + "loss": 2.2, + "step": 6870 + }, + { + "epoch": 1.2879100281162137, + "grad_norm": 51998.4765625, + "learning_rate": 7.364300307610415e-05, + "loss": 2.1981, + "step": 6871 + }, + { + "epoch": 1.2880974695407685, + "grad_norm": 51193.25390625, + "learning_rate": 7.363607888416043e-05, + "loss": 2.3018, + "step": 6872 + }, + { + "epoch": 1.2882849109653234, + "grad_norm": 48444.76953125, + "learning_rate": 7.362915410843623e-05, + "loss": 2.2704, + "step": 6873 + }, + { + "epoch": 1.2884723523898782, + "grad_norm": 50166.48046875, + "learning_rate": 7.362222874910254e-05, + "loss": 2.3748, + "step": 6874 + }, + { + "epoch": 1.2886597938144329, + "grad_norm": 51468.05859375, + "learning_rate": 7.361530280633044e-05, + "loss": 2.2448, + "step": 6875 + }, + { + "epoch": 1.288847235238988, + "grad_norm": 49843.37890625, + "learning_rate": 7.360837628029097e-05, + "loss": 2.2273, + "step": 6876 + }, + { + "epoch": 1.2890346766635425, + "grad_norm": 47752.984375, + "learning_rate": 7.360144917115521e-05, + "loss": 2.2192, + "step": 6877 + }, + { + "epoch": 1.2892221180880974, + "grad_norm": 54770.0234375, + "learning_rate": 7.359452147909428e-05, + "loss": 2.2175, + "step": 6878 + }, + { + "epoch": 1.2894095595126522, + "grad_norm": 52833.7578125, + "learning_rate": 7.358759320427923e-05, + "loss": 2.247, + "step": 6879 + }, + { + "epoch": 1.289597000937207, + "grad_norm": 53988.06640625, + "learning_rate": 7.358066434688123e-05, + "loss": 2.3272, + "step": 6880 + }, + { + "epoch": 1.289784442361762, + "grad_norm": 53986.46875, + "learning_rate": 7.357373490707138e-05, + "loss": 2.28, + "step": 6881 + }, + { + "epoch": 1.2899718837863168, + "grad_norm": 52446.80078125, + "learning_rate": 7.356680488502085e-05, + "loss": 2.2268, + "step": 6882 + }, + { + "epoch": 1.2901593252108716, + "grad_norm": 47840.234375, + "learning_rate": 7.355987428090079e-05, + "loss": 2.2068, + "step": 6883 + }, + { + "epoch": 1.2903467666354265, + "grad_norm": 48172.6015625, + "learning_rate": 7.355294309488238e-05, + "loss": 2.2346, + "step": 6884 + }, + { + "epoch": 1.2905342080599813, + "grad_norm": 52400.97265625, + "learning_rate": 7.354601132713681e-05, + "loss": 2.2392, + "step": 6885 + }, + { + "epoch": 1.2907216494845362, + "grad_norm": 52180.7890625, + "learning_rate": 7.353907897783531e-05, + "loss": 2.2591, + "step": 6886 + }, + { + "epoch": 1.290909090909091, + "grad_norm": 54622.2265625, + "learning_rate": 7.353214604714905e-05, + "loss": 2.2037, + "step": 6887 + }, + { + "epoch": 1.2910965323336456, + "grad_norm": 49444.90234375, + "learning_rate": 7.352521253524931e-05, + "loss": 2.2626, + "step": 6888 + }, + { + "epoch": 1.2912839737582007, + "grad_norm": 48625.66796875, + "learning_rate": 7.351827844230732e-05, + "loss": 2.2678, + "step": 6889 + }, + { + "epoch": 1.2914714151827553, + "grad_norm": 51330.48046875, + "learning_rate": 7.351134376849434e-05, + "loss": 2.1763, + "step": 6890 + }, + { + "epoch": 1.2916588566073102, + "grad_norm": 51070.01953125, + "learning_rate": 7.350440851398165e-05, + "loss": 2.2187, + "step": 6891 + }, + { + "epoch": 1.291846298031865, + "grad_norm": 47044.2890625, + "learning_rate": 7.349747267894054e-05, + "loss": 2.2142, + "step": 6892 + }, + { + "epoch": 1.2920337394564199, + "grad_norm": 47368.98828125, + "learning_rate": 7.349053626354232e-05, + "loss": 2.276, + "step": 6893 + }, + { + "epoch": 1.2922211808809747, + "grad_norm": 47257.85546875, + "learning_rate": 7.348359926795831e-05, + "loss": 2.1868, + "step": 6894 + }, + { + "epoch": 1.2924086223055296, + "grad_norm": 49717.09375, + "learning_rate": 7.347666169235987e-05, + "loss": 2.2093, + "step": 6895 + }, + { + "epoch": 1.2925960637300844, + "grad_norm": 48443.29296875, + "learning_rate": 7.346972353691828e-05, + "loss": 2.2403, + "step": 6896 + }, + { + "epoch": 1.2927835051546392, + "grad_norm": 50907.05078125, + "learning_rate": 7.346278480180497e-05, + "loss": 2.167, + "step": 6897 + }, + { + "epoch": 1.292970946579194, + "grad_norm": 51222.50390625, + "learning_rate": 7.345584548719131e-05, + "loss": 2.2135, + "step": 6898 + }, + { + "epoch": 1.2931583880037487, + "grad_norm": 47322.921875, + "learning_rate": 7.344890559324866e-05, + "loss": 2.2688, + "step": 6899 + }, + { + "epoch": 1.2933458294283038, + "grad_norm": 51488.63671875, + "learning_rate": 7.344196512014845e-05, + "loss": 2.2803, + "step": 6900 + }, + { + "epoch": 1.2935332708528584, + "grad_norm": 52956.12890625, + "learning_rate": 7.34350240680621e-05, + "loss": 2.228, + "step": 6901 + }, + { + "epoch": 1.2937207122774133, + "grad_norm": 47910.97265625, + "learning_rate": 7.342808243716102e-05, + "loss": 2.2245, + "step": 6902 + }, + { + "epoch": 1.293908153701968, + "grad_norm": 56729.4453125, + "learning_rate": 7.34211402276167e-05, + "loss": 2.2058, + "step": 6903 + }, + { + "epoch": 1.294095595126523, + "grad_norm": 48785.5625, + "learning_rate": 7.341419743960057e-05, + "loss": 2.1624, + "step": 6904 + }, + { + "epoch": 1.2942830365510778, + "grad_norm": 54221.09375, + "learning_rate": 7.340725407328412e-05, + "loss": 2.2237, + "step": 6905 + }, + { + "epoch": 1.2944704779756326, + "grad_norm": 50563.36328125, + "learning_rate": 7.340031012883885e-05, + "loss": 2.2063, + "step": 6906 + }, + { + "epoch": 1.2946579194001875, + "grad_norm": 51676.90234375, + "learning_rate": 7.339336560643626e-05, + "loss": 2.2977, + "step": 6907 + }, + { + "epoch": 1.2948453608247423, + "grad_norm": 49511.20703125, + "learning_rate": 7.338642050624787e-05, + "loss": 2.2913, + "step": 6908 + }, + { + "epoch": 1.2950328022492972, + "grad_norm": 47912.42578125, + "learning_rate": 7.337947482844519e-05, + "loss": 2.2326, + "step": 6909 + }, + { + "epoch": 1.2952202436738518, + "grad_norm": 53695.64453125, + "learning_rate": 7.337252857319982e-05, + "loss": 2.2225, + "step": 6910 + }, + { + "epoch": 1.2954076850984069, + "grad_norm": 51326.484375, + "learning_rate": 7.336558174068328e-05, + "loss": 2.2974, + "step": 6911 + }, + { + "epoch": 1.2955951265229615, + "grad_norm": 53998.0625, + "learning_rate": 7.335863433106718e-05, + "loss": 2.2717, + "step": 6912 + }, + { + "epoch": 1.2957825679475163, + "grad_norm": 50203.9765625, + "learning_rate": 7.335168634452308e-05, + "loss": 2.2326, + "step": 6913 + }, + { + "epoch": 1.2959700093720712, + "grad_norm": 49178.47265625, + "learning_rate": 7.334473778122262e-05, + "loss": 2.1853, + "step": 6914 + }, + { + "epoch": 1.296157450796626, + "grad_norm": 52709.91015625, + "learning_rate": 7.333778864133741e-05, + "loss": 2.299, + "step": 6915 + }, + { + "epoch": 1.2963448922211809, + "grad_norm": 49558.66015625, + "learning_rate": 7.333083892503906e-05, + "loss": 2.2319, + "step": 6916 + }, + { + "epoch": 1.2965323336457357, + "grad_norm": 52535.49609375, + "learning_rate": 7.332388863249924e-05, + "loss": 2.1733, + "step": 6917 + }, + { + "epoch": 1.2967197750702906, + "grad_norm": 51421.25390625, + "learning_rate": 7.331693776388961e-05, + "loss": 2.2569, + "step": 6918 + }, + { + "epoch": 1.2969072164948454, + "grad_norm": 53340.33984375, + "learning_rate": 7.330998631938187e-05, + "loss": 2.2718, + "step": 6919 + }, + { + "epoch": 1.2970946579194003, + "grad_norm": 52764.48828125, + "learning_rate": 7.330303429914767e-05, + "loss": 2.2078, + "step": 6920 + }, + { + "epoch": 1.2972820993439549, + "grad_norm": 51065.4375, + "learning_rate": 7.329608170335873e-05, + "loss": 2.1915, + "step": 6921 + }, + { + "epoch": 1.29746954076851, + "grad_norm": 50645.71875, + "learning_rate": 7.328912853218679e-05, + "loss": 2.2759, + "step": 6922 + }, + { + "epoch": 1.2976569821930646, + "grad_norm": 49577.28125, + "learning_rate": 7.328217478580355e-05, + "loss": 2.217, + "step": 6923 + }, + { + "epoch": 1.2978444236176194, + "grad_norm": 50219.44140625, + "learning_rate": 7.32752204643808e-05, + "loss": 2.2661, + "step": 6924 + }, + { + "epoch": 1.2980318650421743, + "grad_norm": 49752.9453125, + "learning_rate": 7.326826556809028e-05, + "loss": 2.2498, + "step": 6925 + }, + { + "epoch": 1.2982193064667291, + "grad_norm": 50119.76953125, + "learning_rate": 7.326131009710376e-05, + "loss": 2.2628, + "step": 6926 + }, + { + "epoch": 1.298406747891284, + "grad_norm": 50307.9453125, + "learning_rate": 7.325435405159303e-05, + "loss": 2.2413, + "step": 6927 + }, + { + "epoch": 1.2985941893158388, + "grad_norm": 54367.86328125, + "learning_rate": 7.324739743172993e-05, + "loss": 2.2092, + "step": 6928 + }, + { + "epoch": 1.2987816307403937, + "grad_norm": 50007.1640625, + "learning_rate": 7.324044023768624e-05, + "loss": 2.2612, + "step": 6929 + }, + { + "epoch": 1.2989690721649485, + "grad_norm": 47819.37109375, + "learning_rate": 7.32334824696338e-05, + "loss": 2.2343, + "step": 6930 + }, + { + "epoch": 1.2991565135895033, + "grad_norm": 54208.625, + "learning_rate": 7.322652412774448e-05, + "loss": 2.223, + "step": 6931 + }, + { + "epoch": 1.299343955014058, + "grad_norm": 51816.13671875, + "learning_rate": 7.321956521219011e-05, + "loss": 2.2153, + "step": 6932 + }, + { + "epoch": 1.299531396438613, + "grad_norm": 54069.375, + "learning_rate": 7.32126057231426e-05, + "loss": 2.3073, + "step": 6933 + }, + { + "epoch": 1.2997188378631677, + "grad_norm": 49242.0234375, + "learning_rate": 7.320564566077383e-05, + "loss": 2.2587, + "step": 6934 + }, + { + "epoch": 1.2999062792877225, + "grad_norm": 50283.58203125, + "learning_rate": 7.319868502525568e-05, + "loss": 2.2523, + "step": 6935 + }, + { + "epoch": 1.3000937207122774, + "grad_norm": 53483.859375, + "learning_rate": 7.31917238167601e-05, + "loss": 2.2535, + "step": 6936 + }, + { + "epoch": 1.3002811621368322, + "grad_norm": 53516.12109375, + "learning_rate": 7.318476203545901e-05, + "loss": 2.2681, + "step": 6937 + }, + { + "epoch": 1.300468603561387, + "grad_norm": 52428.82421875, + "learning_rate": 7.317779968152434e-05, + "loss": 2.2517, + "step": 6938 + }, + { + "epoch": 1.300656044985942, + "grad_norm": 52605.51171875, + "learning_rate": 7.317083675512808e-05, + "loss": 2.2196, + "step": 6939 + }, + { + "epoch": 1.3008434864104967, + "grad_norm": 54132.77734375, + "learning_rate": 7.316387325644219e-05, + "loss": 2.2152, + "step": 6940 + }, + { + "epoch": 1.3010309278350516, + "grad_norm": 53503.3671875, + "learning_rate": 7.315690918563868e-05, + "loss": 2.267, + "step": 6941 + }, + { + "epoch": 1.3012183692596064, + "grad_norm": 53155.78125, + "learning_rate": 7.314994454288953e-05, + "loss": 2.2646, + "step": 6942 + }, + { + "epoch": 1.3014058106841613, + "grad_norm": 52693.46875, + "learning_rate": 7.314297932836676e-05, + "loss": 2.2812, + "step": 6943 + }, + { + "epoch": 1.3015932521087161, + "grad_norm": 50545.19921875, + "learning_rate": 7.31360135422424e-05, + "loss": 2.1974, + "step": 6944 + }, + { + "epoch": 1.3017806935332707, + "grad_norm": 54789.24609375, + "learning_rate": 7.31290471846885e-05, + "loss": 2.1892, + "step": 6945 + }, + { + "epoch": 1.3019681349578258, + "grad_norm": 62292.875, + "learning_rate": 7.312208025587712e-05, + "loss": 2.2149, + "step": 6946 + }, + { + "epoch": 1.3021555763823804, + "grad_norm": 50695.49609375, + "learning_rate": 7.311511275598035e-05, + "loss": 2.2799, + "step": 6947 + }, + { + "epoch": 1.3023430178069353, + "grad_norm": 51178.234375, + "learning_rate": 7.310814468517025e-05, + "loss": 2.2387, + "step": 6948 + }, + { + "epoch": 1.3025304592314901, + "grad_norm": 53742.07421875, + "learning_rate": 7.310117604361896e-05, + "loss": 2.2439, + "step": 6949 + }, + { + "epoch": 1.302717900656045, + "grad_norm": 50747.01953125, + "learning_rate": 7.309420683149854e-05, + "loss": 2.2199, + "step": 6950 + }, + { + "epoch": 1.3029053420805998, + "grad_norm": 54403.41015625, + "learning_rate": 7.308723704898118e-05, + "loss": 2.2916, + "step": 6951 + }, + { + "epoch": 1.3030927835051547, + "grad_norm": 52995.6015625, + "learning_rate": 7.3080266696239e-05, + "loss": 2.2677, + "step": 6952 + }, + { + "epoch": 1.3032802249297095, + "grad_norm": 52114.47265625, + "learning_rate": 7.307329577344414e-05, + "loss": 2.2409, + "step": 6953 + }, + { + "epoch": 1.3034676663542644, + "grad_norm": 47572.69140625, + "learning_rate": 7.306632428076878e-05, + "loss": 2.2206, + "step": 6954 + }, + { + "epoch": 1.3036551077788192, + "grad_norm": 49175.296875, + "learning_rate": 7.305935221838513e-05, + "loss": 2.3354, + "step": 6955 + }, + { + "epoch": 1.3038425492033738, + "grad_norm": 46373.99609375, + "learning_rate": 7.305237958646539e-05, + "loss": 2.1692, + "step": 6956 + }, + { + "epoch": 1.304029990627929, + "grad_norm": 50876.421875, + "learning_rate": 7.304540638518176e-05, + "loss": 2.2876, + "step": 6957 + }, + { + "epoch": 1.3042174320524835, + "grad_norm": 47033.75, + "learning_rate": 7.303843261470646e-05, + "loss": 2.2308, + "step": 6958 + }, + { + "epoch": 1.3044048734770384, + "grad_norm": 49828.17578125, + "learning_rate": 7.303145827521174e-05, + "loss": 2.2075, + "step": 6959 + }, + { + "epoch": 1.3045923149015932, + "grad_norm": 48392.375, + "learning_rate": 7.302448336686987e-05, + "loss": 2.1898, + "step": 6960 + }, + { + "epoch": 1.304779756326148, + "grad_norm": 51538.421875, + "learning_rate": 7.30175078898531e-05, + "loss": 2.1772, + "step": 6961 + }, + { + "epoch": 1.304967197750703, + "grad_norm": 51535.82421875, + "learning_rate": 7.301053184433376e-05, + "loss": 2.1704, + "step": 6962 + }, + { + "epoch": 1.3051546391752578, + "grad_norm": 53731.52734375, + "learning_rate": 7.300355523048407e-05, + "loss": 2.2908, + "step": 6963 + }, + { + "epoch": 1.3053420805998126, + "grad_norm": 54489.64453125, + "learning_rate": 7.299657804847641e-05, + "loss": 2.246, + "step": 6964 + }, + { + "epoch": 1.3055295220243675, + "grad_norm": 53391.1640625, + "learning_rate": 7.29896002984831e-05, + "loss": 2.2856, + "step": 6965 + }, + { + "epoch": 1.3057169634489223, + "grad_norm": 52393.21484375, + "learning_rate": 7.298262198067646e-05, + "loss": 2.2357, + "step": 6966 + }, + { + "epoch": 1.305904404873477, + "grad_norm": 51689.34765625, + "learning_rate": 7.297564309522886e-05, + "loss": 2.2554, + "step": 6967 + }, + { + "epoch": 1.306091846298032, + "grad_norm": 50165.54296875, + "learning_rate": 7.296866364231265e-05, + "loss": 2.2453, + "step": 6968 + }, + { + "epoch": 1.3062792877225866, + "grad_norm": 53693.2578125, + "learning_rate": 7.296168362210023e-05, + "loss": 2.2373, + "step": 6969 + }, + { + "epoch": 1.3064667291471415, + "grad_norm": 48535.26953125, + "learning_rate": 7.295470303476401e-05, + "loss": 2.2526, + "step": 6970 + }, + { + "epoch": 1.3066541705716963, + "grad_norm": 55379.51953125, + "learning_rate": 7.294772188047638e-05, + "loss": 2.239, + "step": 6971 + }, + { + "epoch": 1.3068416119962512, + "grad_norm": 49642.9765625, + "learning_rate": 7.294074015940977e-05, + "loss": 2.2485, + "step": 6972 + }, + { + "epoch": 1.307029053420806, + "grad_norm": 49021.0234375, + "learning_rate": 7.293375787173661e-05, + "loss": 2.3065, + "step": 6973 + }, + { + "epoch": 1.3072164948453608, + "grad_norm": 51788.69140625, + "learning_rate": 7.292677501762939e-05, + "loss": 2.2484, + "step": 6974 + }, + { + "epoch": 1.3074039362699157, + "grad_norm": 51521.50390625, + "learning_rate": 7.291979159726053e-05, + "loss": 2.2026, + "step": 6975 + }, + { + "epoch": 1.3075913776944705, + "grad_norm": 50680.84375, + "learning_rate": 7.291280761080254e-05, + "loss": 2.2315, + "step": 6976 + }, + { + "epoch": 1.3077788191190254, + "grad_norm": 52761.2890625, + "learning_rate": 7.290582305842792e-05, + "loss": 2.2133, + "step": 6977 + }, + { + "epoch": 1.30796626054358, + "grad_norm": 49731.78515625, + "learning_rate": 7.289883794030915e-05, + "loss": 2.278, + "step": 6978 + }, + { + "epoch": 1.308153701968135, + "grad_norm": 52673.5625, + "learning_rate": 7.289185225661878e-05, + "loss": 2.1951, + "step": 6979 + }, + { + "epoch": 1.3083411433926897, + "grad_norm": 49786.44140625, + "learning_rate": 7.288486600752933e-05, + "loss": 2.215, + "step": 6980 + }, + { + "epoch": 1.3085285848172445, + "grad_norm": 49398.15625, + "learning_rate": 7.287787919321338e-05, + "loss": 2.2618, + "step": 6981 + }, + { + "epoch": 1.3087160262417994, + "grad_norm": 53257.9921875, + "learning_rate": 7.287089181384346e-05, + "loss": 2.2429, + "step": 6982 + }, + { + "epoch": 1.3089034676663542, + "grad_norm": 50321.03125, + "learning_rate": 7.286390386959217e-05, + "loss": 2.2381, + "step": 6983 + }, + { + "epoch": 1.309090909090909, + "grad_norm": 50183.03125, + "learning_rate": 7.285691536063211e-05, + "loss": 2.1921, + "step": 6984 + }, + { + "epoch": 1.309278350515464, + "grad_norm": 51615.31640625, + "learning_rate": 7.284992628713585e-05, + "loss": 2.2339, + "step": 6985 + }, + { + "epoch": 1.3094657919400188, + "grad_norm": 53091.8125, + "learning_rate": 7.284293664927605e-05, + "loss": 2.2568, + "step": 6986 + }, + { + "epoch": 1.3096532333645736, + "grad_norm": 53735.98828125, + "learning_rate": 7.283594644722534e-05, + "loss": 2.2438, + "step": 6987 + }, + { + "epoch": 1.3098406747891285, + "grad_norm": 51331.23828125, + "learning_rate": 7.282895568115634e-05, + "loss": 2.2265, + "step": 6988 + }, + { + "epoch": 1.310028116213683, + "grad_norm": 48175.8359375, + "learning_rate": 7.282196435124174e-05, + "loss": 2.243, + "step": 6989 + }, + { + "epoch": 1.3102155576382382, + "grad_norm": 48982.89453125, + "learning_rate": 7.281497245765421e-05, + "loss": 2.2336, + "step": 6990 + }, + { + "epoch": 1.3104029990627928, + "grad_norm": 50597.31640625, + "learning_rate": 7.280798000056644e-05, + "loss": 2.2421, + "step": 6991 + }, + { + "epoch": 1.3105904404873476, + "grad_norm": 60686.4609375, + "learning_rate": 7.280098698015114e-05, + "loss": 2.2414, + "step": 6992 + }, + { + "epoch": 1.3107778819119025, + "grad_norm": 50507.015625, + "learning_rate": 7.279399339658101e-05, + "loss": 2.2131, + "step": 6993 + }, + { + "epoch": 1.3109653233364573, + "grad_norm": 48810.41796875, + "learning_rate": 7.278699925002882e-05, + "loss": 2.2151, + "step": 6994 + }, + { + "epoch": 1.3111527647610122, + "grad_norm": 53203.21484375, + "learning_rate": 7.278000454066729e-05, + "loss": 2.2491, + "step": 6995 + }, + { + "epoch": 1.311340206185567, + "grad_norm": 50713.421875, + "learning_rate": 7.277300926866915e-05, + "loss": 2.2378, + "step": 6996 + }, + { + "epoch": 1.3115276476101219, + "grad_norm": 52439.20703125, + "learning_rate": 7.276601343420725e-05, + "loss": 2.2557, + "step": 6997 + }, + { + "epoch": 1.3117150890346767, + "grad_norm": 49700.76171875, + "learning_rate": 7.275901703745429e-05, + "loss": 2.2314, + "step": 6998 + }, + { + "epoch": 1.3119025304592316, + "grad_norm": 53788.21484375, + "learning_rate": 7.275202007858315e-05, + "loss": 2.1995, + "step": 6999 + }, + { + "epoch": 1.3120899718837864, + "grad_norm": 52565.3671875, + "learning_rate": 7.27450225577666e-05, + "loss": 2.2287, + "step": 7000 + }, + { + "epoch": 1.3120899718837864, + "eval_loss": 2.305553913116455, + "eval_runtime": 128.7438, + "eval_samples_per_second": 39.217, + "eval_steps_per_second": 1.965, + "step": 7000 + }, + { + "epoch": 1.3122774133083412, + "grad_norm": 48788.09375, + "learning_rate": 7.27380244751775e-05, + "loss": 2.2807, + "step": 7001 + }, + { + "epoch": 1.3124648547328959, + "grad_norm": 52230.48046875, + "learning_rate": 7.273102583098865e-05, + "loss": 2.2564, + "step": 7002 + }, + { + "epoch": 1.3126522961574507, + "grad_norm": 55416.2421875, + "learning_rate": 7.272402662537294e-05, + "loss": 2.2107, + "step": 7003 + }, + { + "epoch": 1.3128397375820056, + "grad_norm": 51209.25390625, + "learning_rate": 7.271702685850323e-05, + "loss": 2.1914, + "step": 7004 + }, + { + "epoch": 1.3130271790065604, + "grad_norm": 51951.6015625, + "learning_rate": 7.271002653055243e-05, + "loss": 2.1968, + "step": 7005 + }, + { + "epoch": 1.3132146204311153, + "grad_norm": 48346.078125, + "learning_rate": 7.27030256416934e-05, + "loss": 2.2347, + "step": 7006 + }, + { + "epoch": 1.31340206185567, + "grad_norm": 51341.69140625, + "learning_rate": 7.269602419209909e-05, + "loss": 2.2633, + "step": 7007 + }, + { + "epoch": 1.313589503280225, + "grad_norm": 55368.6640625, + "learning_rate": 7.268902218194238e-05, + "loss": 2.1766, + "step": 7008 + }, + { + "epoch": 1.3137769447047798, + "grad_norm": 54508.08203125, + "learning_rate": 7.268201961139627e-05, + "loss": 2.1931, + "step": 7009 + }, + { + "epoch": 1.3139643861293346, + "grad_norm": 51022.67578125, + "learning_rate": 7.267501648063366e-05, + "loss": 2.2809, + "step": 7010 + }, + { + "epoch": 1.3141518275538895, + "grad_norm": 54089.59375, + "learning_rate": 7.266801278982755e-05, + "loss": 2.2609, + "step": 7011 + }, + { + "epoch": 1.3143392689784443, + "grad_norm": 58231.09375, + "learning_rate": 7.266100853915093e-05, + "loss": 2.1977, + "step": 7012 + }, + { + "epoch": 1.314526710402999, + "grad_norm": 51281.3203125, + "learning_rate": 7.265400372877674e-05, + "loss": 2.2332, + "step": 7013 + }, + { + "epoch": 1.314714151827554, + "grad_norm": 52365.49609375, + "learning_rate": 7.264699835887807e-05, + "loss": 2.2727, + "step": 7014 + }, + { + "epoch": 1.3149015932521086, + "grad_norm": 55182.4375, + "learning_rate": 7.263999242962786e-05, + "loss": 2.282, + "step": 7015 + }, + { + "epoch": 1.3150890346766635, + "grad_norm": 46019.28515625, + "learning_rate": 7.263298594119923e-05, + "loss": 2.2447, + "step": 7016 + }, + { + "epoch": 1.3152764761012183, + "grad_norm": 53832.1796875, + "learning_rate": 7.262597889376516e-05, + "loss": 2.1878, + "step": 7017 + }, + { + "epoch": 1.3154639175257732, + "grad_norm": 52703.0546875, + "learning_rate": 7.261897128749877e-05, + "loss": 2.2471, + "step": 7018 + }, + { + "epoch": 1.315651358950328, + "grad_norm": 50137.43359375, + "learning_rate": 7.261196312257311e-05, + "loss": 2.2365, + "step": 7019 + }, + { + "epoch": 1.3158388003748829, + "grad_norm": 53245.1015625, + "learning_rate": 7.260495439916128e-05, + "loss": 2.1927, + "step": 7020 + }, + { + "epoch": 1.3160262417994377, + "grad_norm": 49776.953125, + "learning_rate": 7.259794511743638e-05, + "loss": 2.2422, + "step": 7021 + }, + { + "epoch": 1.3162136832239926, + "grad_norm": 46563.765625, + "learning_rate": 7.259093527757153e-05, + "loss": 2.2185, + "step": 7022 + }, + { + "epoch": 1.3164011246485474, + "grad_norm": 48796.421875, + "learning_rate": 7.258392487973986e-05, + "loss": 2.2129, + "step": 7023 + }, + { + "epoch": 1.316588566073102, + "grad_norm": 50964.23828125, + "learning_rate": 7.257691392411456e-05, + "loss": 2.2305, + "step": 7024 + }, + { + "epoch": 1.316776007497657, + "grad_norm": 50124.5234375, + "learning_rate": 7.256990241086873e-05, + "loss": 2.2355, + "step": 7025 + }, + { + "epoch": 1.3169634489222117, + "grad_norm": 55708.6484375, + "learning_rate": 7.256289034017557e-05, + "loss": 2.3738, + "step": 7026 + }, + { + "epoch": 1.3171508903467666, + "grad_norm": 50621.75390625, + "learning_rate": 7.255587771220829e-05, + "loss": 2.2459, + "step": 7027 + }, + { + "epoch": 1.3173383317713214, + "grad_norm": 49548.96484375, + "learning_rate": 7.254886452714005e-05, + "loss": 2.2448, + "step": 7028 + }, + { + "epoch": 1.3175257731958763, + "grad_norm": 49251.078125, + "learning_rate": 7.254185078514411e-05, + "loss": 2.2031, + "step": 7029 + }, + { + "epoch": 1.3177132146204311, + "grad_norm": 53091.0546875, + "learning_rate": 7.253483648639368e-05, + "loss": 2.2538, + "step": 7030 + }, + { + "epoch": 1.317900656044986, + "grad_norm": 51405.265625, + "learning_rate": 7.252782163106201e-05, + "loss": 2.2212, + "step": 7031 + }, + { + "epoch": 1.3180880974695408, + "grad_norm": 50630.72265625, + "learning_rate": 7.252080621932234e-05, + "loss": 2.2396, + "step": 7032 + }, + { + "epoch": 1.3182755388940957, + "grad_norm": 53637.65625, + "learning_rate": 7.251379025134796e-05, + "loss": 2.2028, + "step": 7033 + }, + { + "epoch": 1.3184629803186505, + "grad_norm": 46670.39453125, + "learning_rate": 7.250677372731215e-05, + "loss": 2.2972, + "step": 7034 + }, + { + "epoch": 1.3186504217432051, + "grad_norm": 51014.82421875, + "learning_rate": 7.249975664738822e-05, + "loss": 2.2418, + "step": 7035 + }, + { + "epoch": 1.3188378631677602, + "grad_norm": 52304.94921875, + "learning_rate": 7.249273901174946e-05, + "loss": 2.1752, + "step": 7036 + }, + { + "epoch": 1.3190253045923148, + "grad_norm": 52515.08984375, + "learning_rate": 7.248572082056922e-05, + "loss": 2.3063, + "step": 7037 + }, + { + "epoch": 1.3192127460168697, + "grad_norm": 45963.01171875, + "learning_rate": 7.24787020740208e-05, + "loss": 2.284, + "step": 7038 + }, + { + "epoch": 1.3194001874414245, + "grad_norm": 52394.02734375, + "learning_rate": 7.247168277227762e-05, + "loss": 2.2278, + "step": 7039 + }, + { + "epoch": 1.3195876288659794, + "grad_norm": 51259.68359375, + "learning_rate": 7.246466291551299e-05, + "loss": 2.2453, + "step": 7040 + }, + { + "epoch": 1.3197750702905342, + "grad_norm": 47323.6953125, + "learning_rate": 7.245764250390032e-05, + "loss": 2.2412, + "step": 7041 + }, + { + "epoch": 1.319962511715089, + "grad_norm": 48111.05078125, + "learning_rate": 7.245062153761299e-05, + "loss": 2.2776, + "step": 7042 + }, + { + "epoch": 1.320149953139644, + "grad_norm": 53016.53515625, + "learning_rate": 7.244360001682442e-05, + "loss": 2.2323, + "step": 7043 + }, + { + "epoch": 1.3203373945641987, + "grad_norm": 47472.4296875, + "learning_rate": 7.243657794170802e-05, + "loss": 2.224, + "step": 7044 + }, + { + "epoch": 1.3205248359887536, + "grad_norm": 50261.54296875, + "learning_rate": 7.242955531243725e-05, + "loss": 2.2987, + "step": 7045 + }, + { + "epoch": 1.3207122774133082, + "grad_norm": 48731.76953125, + "learning_rate": 7.242253212918554e-05, + "loss": 2.2668, + "step": 7046 + }, + { + "epoch": 1.3208997188378633, + "grad_norm": 49301.8203125, + "learning_rate": 7.241550839212635e-05, + "loss": 2.2118, + "step": 7047 + }, + { + "epoch": 1.321087160262418, + "grad_norm": 51986.375, + "learning_rate": 7.240848410143316e-05, + "loss": 2.2535, + "step": 7048 + }, + { + "epoch": 1.3212746016869727, + "grad_norm": 47835.46875, + "learning_rate": 7.240145925727948e-05, + "loss": 2.2478, + "step": 7049 + }, + { + "epoch": 1.3214620431115276, + "grad_norm": 57820.078125, + "learning_rate": 7.23944338598388e-05, + "loss": 2.2576, + "step": 7050 + }, + { + "epoch": 1.3216494845360824, + "grad_norm": 52520.1640625, + "learning_rate": 7.238740790928462e-05, + "loss": 2.2393, + "step": 7051 + }, + { + "epoch": 1.3218369259606373, + "grad_norm": 50245.078125, + "learning_rate": 7.23803814057905e-05, + "loss": 2.2827, + "step": 7052 + }, + { + "epoch": 1.3220243673851921, + "grad_norm": 48484.57421875, + "learning_rate": 7.237335434952999e-05, + "loss": 2.19, + "step": 7053 + }, + { + "epoch": 1.322211808809747, + "grad_norm": 49563.9921875, + "learning_rate": 7.236632674067662e-05, + "loss": 2.2345, + "step": 7054 + }, + { + "epoch": 1.3223992502343018, + "grad_norm": 49283.3828125, + "learning_rate": 7.235929857940399e-05, + "loss": 2.3227, + "step": 7055 + }, + { + "epoch": 1.3225866916588567, + "grad_norm": 49543.71875, + "learning_rate": 7.235226986588566e-05, + "loss": 2.313, + "step": 7056 + }, + { + "epoch": 1.3227741330834113, + "grad_norm": 53520.12109375, + "learning_rate": 7.234524060029526e-05, + "loss": 2.2002, + "step": 7057 + }, + { + "epoch": 1.3229615745079664, + "grad_norm": 54330.56640625, + "learning_rate": 7.233821078280635e-05, + "loss": 2.2308, + "step": 7058 + }, + { + "epoch": 1.323149015932521, + "grad_norm": 52468.16015625, + "learning_rate": 7.233118041359264e-05, + "loss": 2.1193, + "step": 7059 + }, + { + "epoch": 1.3233364573570758, + "grad_norm": 51318.5546875, + "learning_rate": 7.232414949282771e-05, + "loss": 2.287, + "step": 7060 + }, + { + "epoch": 1.3235238987816307, + "grad_norm": 49480.73046875, + "learning_rate": 7.231711802068523e-05, + "loss": 2.2014, + "step": 7061 + }, + { + "epoch": 1.3237113402061855, + "grad_norm": 49433.85546875, + "learning_rate": 7.231008599733887e-05, + "loss": 2.2114, + "step": 7062 + }, + { + "epoch": 1.3238987816307404, + "grad_norm": 52915.9921875, + "learning_rate": 7.230305342296232e-05, + "loss": 2.2017, + "step": 7063 + }, + { + "epoch": 1.3240862230552952, + "grad_norm": 50938.41796875, + "learning_rate": 7.229602029772926e-05, + "loss": 2.2566, + "step": 7064 + }, + { + "epoch": 1.32427366447985, + "grad_norm": 57453.6640625, + "learning_rate": 7.22889866218134e-05, + "loss": 2.2621, + "step": 7065 + }, + { + "epoch": 1.324461105904405, + "grad_norm": 55139.875, + "learning_rate": 7.228195239538847e-05, + "loss": 2.306, + "step": 7066 + }, + { + "epoch": 1.3246485473289598, + "grad_norm": 48219.328125, + "learning_rate": 7.227491761862822e-05, + "loss": 2.2162, + "step": 7067 + }, + { + "epoch": 1.3248359887535146, + "grad_norm": 50997.2734375, + "learning_rate": 7.226788229170637e-05, + "loss": 2.2337, + "step": 7068 + }, + { + "epoch": 1.3250234301780695, + "grad_norm": 47762.875, + "learning_rate": 7.22608464147967e-05, + "loss": 2.2121, + "step": 7069 + }, + { + "epoch": 1.325210871602624, + "grad_norm": 48267.50390625, + "learning_rate": 7.2253809988073e-05, + "loss": 2.2297, + "step": 7070 + }, + { + "epoch": 1.3253983130271791, + "grad_norm": 48257.0, + "learning_rate": 7.224677301170902e-05, + "loss": 2.2617, + "step": 7071 + }, + { + "epoch": 1.3255857544517338, + "grad_norm": 51399.90625, + "learning_rate": 7.223973548587862e-05, + "loss": 2.1859, + "step": 7072 + }, + { + "epoch": 1.3257731958762886, + "grad_norm": 52391.91015625, + "learning_rate": 7.223269741075555e-05, + "loss": 2.1402, + "step": 7073 + }, + { + "epoch": 1.3259606373008435, + "grad_norm": 50228.97265625, + "learning_rate": 7.222565878651371e-05, + "loss": 2.2231, + "step": 7074 + }, + { + "epoch": 1.3261480787253983, + "grad_norm": 52768.19140625, + "learning_rate": 7.221861961332689e-05, + "loss": 2.2744, + "step": 7075 + }, + { + "epoch": 1.3263355201499532, + "grad_norm": 56425.5859375, + "learning_rate": 7.221157989136899e-05, + "loss": 2.3262, + "step": 7076 + }, + { + "epoch": 1.326522961574508, + "grad_norm": 48270.39453125, + "learning_rate": 7.220453962081386e-05, + "loss": 2.1707, + "step": 7077 + }, + { + "epoch": 1.3267104029990628, + "grad_norm": 48810.55859375, + "learning_rate": 7.21974988018354e-05, + "loss": 2.2326, + "step": 7078 + }, + { + "epoch": 1.3268978444236177, + "grad_norm": 49756.78515625, + "learning_rate": 7.219045743460748e-05, + "loss": 2.2859, + "step": 7079 + }, + { + "epoch": 1.3270852858481725, + "grad_norm": 50809.29296875, + "learning_rate": 7.218341551930406e-05, + "loss": 2.2645, + "step": 7080 + }, + { + "epoch": 1.3272727272727272, + "grad_norm": 49956.9921875, + "learning_rate": 7.217637305609901e-05, + "loss": 2.2255, + "step": 7081 + }, + { + "epoch": 1.3274601686972822, + "grad_norm": 53489.3984375, + "learning_rate": 7.21693300451663e-05, + "loss": 2.1733, + "step": 7082 + }, + { + "epoch": 1.3276476101218369, + "grad_norm": 55582.71875, + "learning_rate": 7.216228648667988e-05, + "loss": 2.2822, + "step": 7083 + }, + { + "epoch": 1.3278350515463917, + "grad_norm": 50932.90234375, + "learning_rate": 7.215524238081372e-05, + "loss": 2.23, + "step": 7084 + }, + { + "epoch": 1.3280224929709465, + "grad_norm": 52629.1484375, + "learning_rate": 7.214819772774181e-05, + "loss": 2.2129, + "step": 7085 + }, + { + "epoch": 1.3282099343955014, + "grad_norm": 57946.42578125, + "learning_rate": 7.214115252763813e-05, + "loss": 2.1804, + "step": 7086 + }, + { + "epoch": 1.3283973758200562, + "grad_norm": 52758.0546875, + "learning_rate": 7.213410678067669e-05, + "loss": 2.1589, + "step": 7087 + }, + { + "epoch": 1.328584817244611, + "grad_norm": 52352.81640625, + "learning_rate": 7.21270604870315e-05, + "loss": 2.2641, + "step": 7088 + }, + { + "epoch": 1.328772258669166, + "grad_norm": 55397.94140625, + "learning_rate": 7.21200136468766e-05, + "loss": 2.2389, + "step": 7089 + }, + { + "epoch": 1.3289597000937208, + "grad_norm": 52614.16796875, + "learning_rate": 7.211296626038607e-05, + "loss": 2.3049, + "step": 7090 + }, + { + "epoch": 1.3291471415182756, + "grad_norm": 48976.56640625, + "learning_rate": 7.21059183277339e-05, + "loss": 2.2168, + "step": 7091 + }, + { + "epoch": 1.3293345829428302, + "grad_norm": 52654.6015625, + "learning_rate": 7.209886984909424e-05, + "loss": 2.2176, + "step": 7092 + }, + { + "epoch": 1.3295220243673853, + "grad_norm": 49804.68359375, + "learning_rate": 7.209182082464114e-05, + "loss": 2.213, + "step": 7093 + }, + { + "epoch": 1.32970946579194, + "grad_norm": 56264.40625, + "learning_rate": 7.208477125454871e-05, + "loss": 2.1988, + "step": 7094 + }, + { + "epoch": 1.3298969072164948, + "grad_norm": 51472.66796875, + "learning_rate": 7.207772113899105e-05, + "loss": 2.2348, + "step": 7095 + }, + { + "epoch": 1.3300843486410496, + "grad_norm": 54046.34765625, + "learning_rate": 7.20706704781423e-05, + "loss": 2.2027, + "step": 7096 + }, + { + "epoch": 1.3302717900656045, + "grad_norm": 50192.1015625, + "learning_rate": 7.206361927217662e-05, + "loss": 2.1256, + "step": 7097 + }, + { + "epoch": 1.3304592314901593, + "grad_norm": 49993.64453125, + "learning_rate": 7.205656752126814e-05, + "loss": 2.2948, + "step": 7098 + }, + { + "epoch": 1.3306466729147142, + "grad_norm": 55739.52734375, + "learning_rate": 7.204951522559105e-05, + "loss": 2.2819, + "step": 7099 + }, + { + "epoch": 1.330834114339269, + "grad_norm": 50840.7265625, + "learning_rate": 7.204246238531953e-05, + "loss": 2.243, + "step": 7100 + }, + { + "epoch": 1.3310215557638239, + "grad_norm": 45850.1953125, + "learning_rate": 7.203540900062773e-05, + "loss": 2.2529, + "step": 7101 + }, + { + "epoch": 1.3312089971883787, + "grad_norm": 52926.0859375, + "learning_rate": 7.202835507168994e-05, + "loss": 2.2046, + "step": 7102 + }, + { + "epoch": 1.3313964386129333, + "grad_norm": 50996.76171875, + "learning_rate": 7.202130059868031e-05, + "loss": 2.2299, + "step": 7103 + }, + { + "epoch": 1.3315838800374884, + "grad_norm": 47800.73828125, + "learning_rate": 7.201424558177312e-05, + "loss": 2.2229, + "step": 7104 + }, + { + "epoch": 1.331771321462043, + "grad_norm": 54944.80078125, + "learning_rate": 7.20071900211426e-05, + "loss": 2.298, + "step": 7105 + }, + { + "epoch": 1.3319587628865979, + "grad_norm": 52758.9453125, + "learning_rate": 7.200013391696302e-05, + "loss": 2.2286, + "step": 7106 + }, + { + "epoch": 1.3321462043111527, + "grad_norm": 54732.640625, + "learning_rate": 7.199307726940864e-05, + "loss": 2.2955, + "step": 7107 + }, + { + "epoch": 1.3323336457357076, + "grad_norm": 49930.0078125, + "learning_rate": 7.19860200786538e-05, + "loss": 2.1982, + "step": 7108 + }, + { + "epoch": 1.3325210871602624, + "grad_norm": 50506.1015625, + "learning_rate": 7.197896234487276e-05, + "loss": 2.2897, + "step": 7109 + }, + { + "epoch": 1.3327085285848173, + "grad_norm": 51359.18359375, + "learning_rate": 7.197190406823983e-05, + "loss": 2.2296, + "step": 7110 + }, + { + "epoch": 1.332895970009372, + "grad_norm": 53432.69921875, + "learning_rate": 7.196484524892937e-05, + "loss": 2.3073, + "step": 7111 + }, + { + "epoch": 1.333083411433927, + "grad_norm": 56351.5703125, + "learning_rate": 7.195778588711571e-05, + "loss": 2.2019, + "step": 7112 + }, + { + "epoch": 1.3332708528584818, + "grad_norm": 52306.16796875, + "learning_rate": 7.19507259829732e-05, + "loss": 2.2045, + "step": 7113 + }, + { + "epoch": 1.3334582942830364, + "grad_norm": 50862.40625, + "learning_rate": 7.194366553667621e-05, + "loss": 2.2724, + "step": 7114 + }, + { + "epoch": 1.3336457357075915, + "grad_norm": 49887.30859375, + "learning_rate": 7.193660454839916e-05, + "loss": 2.2233, + "step": 7115 + }, + { + "epoch": 1.333833177132146, + "grad_norm": 51942.8984375, + "learning_rate": 7.192954301831638e-05, + "loss": 2.2456, + "step": 7116 + }, + { + "epoch": 1.334020618556701, + "grad_norm": 46883.53515625, + "learning_rate": 7.192248094660235e-05, + "loss": 2.2928, + "step": 7117 + }, + { + "epoch": 1.3342080599812558, + "grad_norm": 57638.22265625, + "learning_rate": 7.191541833343145e-05, + "loss": 2.1353, + "step": 7118 + }, + { + "epoch": 1.3343955014058106, + "grad_norm": 54585.1875, + "learning_rate": 7.190835517897814e-05, + "loss": 2.2744, + "step": 7119 + }, + { + "epoch": 1.3345829428303655, + "grad_norm": 49669.984375, + "learning_rate": 7.190129148341686e-05, + "loss": 2.2289, + "step": 7120 + }, + { + "epoch": 1.3347703842549203, + "grad_norm": 51368.08203125, + "learning_rate": 7.189422724692206e-05, + "loss": 2.2087, + "step": 7121 + }, + { + "epoch": 1.3349578256794752, + "grad_norm": 49405.87109375, + "learning_rate": 7.188716246966825e-05, + "loss": 2.2075, + "step": 7122 + }, + { + "epoch": 1.33514526710403, + "grad_norm": 50985.80859375, + "learning_rate": 7.188009715182991e-05, + "loss": 2.235, + "step": 7123 + }, + { + "epoch": 1.3353327085285849, + "grad_norm": 51539.953125, + "learning_rate": 7.187303129358152e-05, + "loss": 2.2558, + "step": 7124 + }, + { + "epoch": 1.3355201499531397, + "grad_norm": 55429.19140625, + "learning_rate": 7.186596489509762e-05, + "loss": 2.1783, + "step": 7125 + }, + { + "epoch": 1.3357075913776946, + "grad_norm": 49140.875, + "learning_rate": 7.185889795655274e-05, + "loss": 2.2084, + "step": 7126 + }, + { + "epoch": 1.3358950328022492, + "grad_norm": 49553.2578125, + "learning_rate": 7.185183047812143e-05, + "loss": 2.2384, + "step": 7127 + }, + { + "epoch": 1.3360824742268043, + "grad_norm": 49638.62890625, + "learning_rate": 7.184476245997824e-05, + "loss": 2.2069, + "step": 7128 + }, + { + "epoch": 1.3362699156513589, + "grad_norm": 52052.96484375, + "learning_rate": 7.18376939022977e-05, + "loss": 2.2772, + "step": 7129 + }, + { + "epoch": 1.3364573570759137, + "grad_norm": 47101.87109375, + "learning_rate": 7.183062480525449e-05, + "loss": 2.2329, + "step": 7130 + }, + { + "epoch": 1.3366447985004686, + "grad_norm": 60618.4609375, + "learning_rate": 7.18235551690231e-05, + "loss": 2.2752, + "step": 7131 + }, + { + "epoch": 1.3368322399250234, + "grad_norm": 51069.91796875, + "learning_rate": 7.181648499377823e-05, + "loss": 2.3037, + "step": 7132 + }, + { + "epoch": 1.3370196813495783, + "grad_norm": 50959.16015625, + "learning_rate": 7.180941427969446e-05, + "loss": 2.5177, + "step": 7133 + }, + { + "epoch": 1.3372071227741331, + "grad_norm": 52782.7109375, + "learning_rate": 7.180234302694642e-05, + "loss": 2.2527, + "step": 7134 + }, + { + "epoch": 1.337394564198688, + "grad_norm": 51533.5078125, + "learning_rate": 7.179527123570878e-05, + "loss": 2.2414, + "step": 7135 + }, + { + "epoch": 1.3375820056232428, + "grad_norm": 52110.4453125, + "learning_rate": 7.178819890615619e-05, + "loss": 2.2635, + "step": 7136 + }, + { + "epoch": 1.3377694470477977, + "grad_norm": 53267.8984375, + "learning_rate": 7.178112603846335e-05, + "loss": 2.221, + "step": 7137 + }, + { + "epoch": 1.3379568884723523, + "grad_norm": 47188.72265625, + "learning_rate": 7.177405263280493e-05, + "loss": 2.2027, + "step": 7138 + }, + { + "epoch": 1.3381443298969073, + "grad_norm": 50415.94921875, + "learning_rate": 7.176697868935565e-05, + "loss": 2.2715, + "step": 7139 + }, + { + "epoch": 1.338331771321462, + "grad_norm": 50776.36328125, + "learning_rate": 7.17599042082902e-05, + "loss": 2.1865, + "step": 7140 + }, + { + "epoch": 1.3385192127460168, + "grad_norm": 51657.96484375, + "learning_rate": 7.175282918978335e-05, + "loss": 2.2344, + "step": 7141 + }, + { + "epoch": 1.3387066541705717, + "grad_norm": 49409.80078125, + "learning_rate": 7.17457536340098e-05, + "loss": 2.2615, + "step": 7142 + }, + { + "epoch": 1.3388940955951265, + "grad_norm": 54058.9921875, + "learning_rate": 7.173867754114433e-05, + "loss": 2.1704, + "step": 7143 + }, + { + "epoch": 1.3390815370196814, + "grad_norm": 50932.13671875, + "learning_rate": 7.173160091136172e-05, + "loss": 2.2961, + "step": 7144 + }, + { + "epoch": 1.3392689784442362, + "grad_norm": 51804.6875, + "learning_rate": 7.172452374483675e-05, + "loss": 2.2619, + "step": 7145 + }, + { + "epoch": 1.339456419868791, + "grad_norm": 52284.39453125, + "learning_rate": 7.17174460417442e-05, + "loss": 2.2214, + "step": 7146 + }, + { + "epoch": 1.339643861293346, + "grad_norm": 60040.2578125, + "learning_rate": 7.171036780225888e-05, + "loss": 2.2054, + "step": 7147 + }, + { + "epoch": 1.3398313027179007, + "grad_norm": 54305.2265625, + "learning_rate": 7.170328902655565e-05, + "loss": 2.209, + "step": 7148 + }, + { + "epoch": 1.3400187441424554, + "grad_norm": 48899.89453125, + "learning_rate": 7.169620971480929e-05, + "loss": 2.3261, + "step": 7149 + }, + { + "epoch": 1.3402061855670104, + "grad_norm": 54163.3046875, + "learning_rate": 7.16891298671947e-05, + "loss": 2.2243, + "step": 7150 + }, + { + "epoch": 1.340393626991565, + "grad_norm": 48101.859375, + "learning_rate": 7.168204948388671e-05, + "loss": 2.2488, + "step": 7151 + }, + { + "epoch": 1.34058106841612, + "grad_norm": 49652.67578125, + "learning_rate": 7.167496856506022e-05, + "loss": 2.257, + "step": 7152 + }, + { + "epoch": 1.3407685098406747, + "grad_norm": 46123.00390625, + "learning_rate": 7.16678871108901e-05, + "loss": 2.2256, + "step": 7153 + }, + { + "epoch": 1.3409559512652296, + "grad_norm": 51697.86328125, + "learning_rate": 7.166080512155127e-05, + "loss": 2.2794, + "step": 7154 + }, + { + "epoch": 1.3411433926897844, + "grad_norm": 48931.359375, + "learning_rate": 7.165372259721863e-05, + "loss": 2.2353, + "step": 7155 + }, + { + "epoch": 1.3413308341143393, + "grad_norm": 48984.23828125, + "learning_rate": 7.164663953806711e-05, + "loss": 2.2638, + "step": 7156 + }, + { + "epoch": 1.3415182755388941, + "grad_norm": 53493.90625, + "learning_rate": 7.163955594427167e-05, + "loss": 2.1813, + "step": 7157 + }, + { + "epoch": 1.341705716963449, + "grad_norm": 52719.28515625, + "learning_rate": 7.163247181600725e-05, + "loss": 2.2744, + "step": 7158 + }, + { + "epoch": 1.3418931583880038, + "grad_norm": 52255.0078125, + "learning_rate": 7.162538715344882e-05, + "loss": 2.2561, + "step": 7159 + }, + { + "epoch": 1.3420805998125585, + "grad_norm": 53984.859375, + "learning_rate": 7.161830195677137e-05, + "loss": 2.2593, + "step": 7160 + }, + { + "epoch": 1.3422680412371135, + "grad_norm": 51329.5703125, + "learning_rate": 7.161121622614989e-05, + "loss": 2.2858, + "step": 7161 + }, + { + "epoch": 1.3424554826616681, + "grad_norm": 47686.7734375, + "learning_rate": 7.160412996175939e-05, + "loss": 2.2322, + "step": 7162 + }, + { + "epoch": 1.342642924086223, + "grad_norm": 49314.296875, + "learning_rate": 7.15970431637749e-05, + "loss": 2.2761, + "step": 7163 + }, + { + "epoch": 1.3428303655107778, + "grad_norm": 53947.69140625, + "learning_rate": 7.158995583237142e-05, + "loss": 2.1915, + "step": 7164 + }, + { + "epoch": 1.3430178069353327, + "grad_norm": 52430.953125, + "learning_rate": 7.158286796772404e-05, + "loss": 2.145, + "step": 7165 + }, + { + "epoch": 1.3432052483598875, + "grad_norm": 53799.5703125, + "learning_rate": 7.157577957000781e-05, + "loss": 2.1918, + "step": 7166 + }, + { + "epoch": 1.3433926897844424, + "grad_norm": 51068.76171875, + "learning_rate": 7.156869063939778e-05, + "loss": 2.2508, + "step": 7167 + }, + { + "epoch": 1.3435801312089972, + "grad_norm": 51014.52734375, + "learning_rate": 7.156160117606906e-05, + "loss": 2.2287, + "step": 7168 + }, + { + "epoch": 1.343767572633552, + "grad_norm": 51947.87890625, + "learning_rate": 7.155451118019674e-05, + "loss": 2.2819, + "step": 7169 + }, + { + "epoch": 1.343955014058107, + "grad_norm": 49956.31640625, + "learning_rate": 7.154742065195595e-05, + "loss": 2.236, + "step": 7170 + }, + { + "epoch": 1.3441424554826615, + "grad_norm": 52973.36328125, + "learning_rate": 7.154032959152181e-05, + "loss": 2.2138, + "step": 7171 + }, + { + "epoch": 1.3443298969072166, + "grad_norm": 49994.9921875, + "learning_rate": 7.153323799906945e-05, + "loss": 2.2583, + "step": 7172 + }, + { + "epoch": 1.3445173383317712, + "grad_norm": 56573.1484375, + "learning_rate": 7.152614587477403e-05, + "loss": 2.2026, + "step": 7173 + }, + { + "epoch": 1.344704779756326, + "grad_norm": 53040.11328125, + "learning_rate": 7.151905321881071e-05, + "loss": 2.1783, + "step": 7174 + }, + { + "epoch": 1.344892221180881, + "grad_norm": 58297.03125, + "learning_rate": 7.151196003135468e-05, + "loss": 2.2933, + "step": 7175 + }, + { + "epoch": 1.3450796626054358, + "grad_norm": 53048.640625, + "learning_rate": 7.150486631258113e-05, + "loss": 2.2227, + "step": 7176 + }, + { + "epoch": 1.3452671040299906, + "grad_norm": 51012.015625, + "learning_rate": 7.149777206266527e-05, + "loss": 2.2621, + "step": 7177 + }, + { + "epoch": 1.3454545454545455, + "grad_norm": 51162.3359375, + "learning_rate": 7.149067728178232e-05, + "loss": 2.3359, + "step": 7178 + }, + { + "epoch": 1.3456419868791003, + "grad_norm": 51119.60546875, + "learning_rate": 7.148358197010747e-05, + "loss": 2.1956, + "step": 7179 + }, + { + "epoch": 1.3458294283036552, + "grad_norm": 51680.6015625, + "learning_rate": 7.147648612781604e-05, + "loss": 2.2544, + "step": 7180 + }, + { + "epoch": 1.34601686972821, + "grad_norm": 50296.13671875, + "learning_rate": 7.146938975508322e-05, + "loss": 2.2741, + "step": 7181 + }, + { + "epoch": 1.3462043111527648, + "grad_norm": 52896.58203125, + "learning_rate": 7.146229285208431e-05, + "loss": 2.2338, + "step": 7182 + }, + { + "epoch": 1.3463917525773197, + "grad_norm": 50416.58984375, + "learning_rate": 7.14551954189946e-05, + "loss": 2.2475, + "step": 7183 + }, + { + "epoch": 1.3465791940018743, + "grad_norm": 53558.6953125, + "learning_rate": 7.144809745598938e-05, + "loss": 2.2177, + "step": 7184 + }, + { + "epoch": 1.3467666354264292, + "grad_norm": 50359.77734375, + "learning_rate": 7.144099896324396e-05, + "loss": 2.275, + "step": 7185 + }, + { + "epoch": 1.346954076850984, + "grad_norm": 50159.04296875, + "learning_rate": 7.143389994093367e-05, + "loss": 2.2806, + "step": 7186 + }, + { + "epoch": 1.3471415182755389, + "grad_norm": 48307.5625, + "learning_rate": 7.142680038923382e-05, + "loss": 2.2228, + "step": 7187 + }, + { + "epoch": 1.3473289597000937, + "grad_norm": 51080.5625, + "learning_rate": 7.14197003083198e-05, + "loss": 2.201, + "step": 7188 + }, + { + "epoch": 1.3475164011246485, + "grad_norm": 50787.4296875, + "learning_rate": 7.141259969836693e-05, + "loss": 2.1202, + "step": 7189 + }, + { + "epoch": 1.3477038425492034, + "grad_norm": 56537.70703125, + "learning_rate": 7.140549855955064e-05, + "loss": 2.2467, + "step": 7190 + }, + { + "epoch": 1.3478912839737582, + "grad_norm": 54030.4140625, + "learning_rate": 7.139839689204628e-05, + "loss": 2.2984, + "step": 7191 + }, + { + "epoch": 1.348078725398313, + "grad_norm": 49957.01953125, + "learning_rate": 7.139129469602925e-05, + "loss": 2.2433, + "step": 7192 + }, + { + "epoch": 1.348266166822868, + "grad_norm": 54609.7890625, + "learning_rate": 7.1384191971675e-05, + "loss": 2.147, + "step": 7193 + }, + { + "epoch": 1.3484536082474228, + "grad_norm": 49232.8984375, + "learning_rate": 7.13770887191589e-05, + "loss": 2.1903, + "step": 7194 + }, + { + "epoch": 1.3486410496719774, + "grad_norm": 52553.3828125, + "learning_rate": 7.136998493865645e-05, + "loss": 2.2477, + "step": 7195 + }, + { + "epoch": 1.3488284910965325, + "grad_norm": 52798.8046875, + "learning_rate": 7.136288063034306e-05, + "loss": 2.256, + "step": 7196 + }, + { + "epoch": 1.349015932521087, + "grad_norm": 52908.109375, + "learning_rate": 7.135577579439422e-05, + "loss": 2.24, + "step": 7197 + }, + { + "epoch": 1.349203373945642, + "grad_norm": 47760.59375, + "learning_rate": 7.134867043098541e-05, + "loss": 2.2394, + "step": 7198 + }, + { + "epoch": 1.3493908153701968, + "grad_norm": 51288.671875, + "learning_rate": 7.134156454029211e-05, + "loss": 2.2346, + "step": 7199 + }, + { + "epoch": 1.3495782567947516, + "grad_norm": 51523.62890625, + "learning_rate": 7.133445812248985e-05, + "loss": 2.2081, + "step": 7200 + }, + { + "epoch": 1.3497656982193065, + "grad_norm": 48459.16015625, + "learning_rate": 7.132735117775414e-05, + "loss": 2.2289, + "step": 7201 + }, + { + "epoch": 1.3499531396438613, + "grad_norm": 50906.5390625, + "learning_rate": 7.132024370626048e-05, + "loss": 2.244, + "step": 7202 + }, + { + "epoch": 1.3501405810684162, + "grad_norm": 52337.50390625, + "learning_rate": 7.131313570818448e-05, + "loss": 2.2958, + "step": 7203 + }, + { + "epoch": 1.350328022492971, + "grad_norm": 52206.9765625, + "learning_rate": 7.130602718370162e-05, + "loss": 2.1663, + "step": 7204 + }, + { + "epoch": 1.3505154639175259, + "grad_norm": 49728.2890625, + "learning_rate": 7.129891813298755e-05, + "loss": 2.2256, + "step": 7205 + }, + { + "epoch": 1.3507029053420805, + "grad_norm": 57416.49609375, + "learning_rate": 7.12918085562178e-05, + "loss": 2.138, + "step": 7206 + }, + { + "epoch": 1.3508903467666356, + "grad_norm": 50434.08984375, + "learning_rate": 7.128469845356798e-05, + "loss": 2.164, + "step": 7207 + }, + { + "epoch": 1.3510777881911902, + "grad_norm": 54361.06640625, + "learning_rate": 7.12775878252137e-05, + "loss": 2.1807, + "step": 7208 + }, + { + "epoch": 1.351265229615745, + "grad_norm": 50989.12890625, + "learning_rate": 7.12704766713306e-05, + "loss": 2.2956, + "step": 7209 + }, + { + "epoch": 1.3514526710402999, + "grad_norm": 51087.171875, + "learning_rate": 7.12633649920943e-05, + "loss": 2.2505, + "step": 7210 + }, + { + "epoch": 1.3516401124648547, + "grad_norm": 53011.9375, + "learning_rate": 7.125625278768045e-05, + "loss": 2.2551, + "step": 7211 + }, + { + "epoch": 1.3518275538894096, + "grad_norm": 48921.37109375, + "learning_rate": 7.12491400582647e-05, + "loss": 2.2733, + "step": 7212 + }, + { + "epoch": 1.3520149953139644, + "grad_norm": 50989.9453125, + "learning_rate": 7.124202680402277e-05, + "loss": 2.324, + "step": 7213 + }, + { + "epoch": 1.3522024367385193, + "grad_norm": 50201.71875, + "learning_rate": 7.12349130251303e-05, + "loss": 2.2845, + "step": 7214 + }, + { + "epoch": 1.352389878163074, + "grad_norm": 52574.5859375, + "learning_rate": 7.122779872176302e-05, + "loss": 2.2854, + "step": 7215 + }, + { + "epoch": 1.352577319587629, + "grad_norm": 57723.3515625, + "learning_rate": 7.122068389409664e-05, + "loss": 2.2558, + "step": 7216 + }, + { + "epoch": 1.3527647610121836, + "grad_norm": 51767.8671875, + "learning_rate": 7.121356854230686e-05, + "loss": 2.2402, + "step": 7217 + }, + { + "epoch": 1.3529522024367386, + "grad_norm": 51954.171875, + "learning_rate": 7.120645266656945e-05, + "loss": 2.2666, + "step": 7218 + }, + { + "epoch": 1.3531396438612933, + "grad_norm": 51471.65625, + "learning_rate": 7.119933626706016e-05, + "loss": 2.2664, + "step": 7219 + }, + { + "epoch": 1.353327085285848, + "grad_norm": 48524.53125, + "learning_rate": 7.119221934395473e-05, + "loss": 2.2656, + "step": 7220 + }, + { + "epoch": 1.353514526710403, + "grad_norm": 50316.8046875, + "learning_rate": 7.1185101897429e-05, + "loss": 2.1679, + "step": 7221 + }, + { + "epoch": 1.3537019681349578, + "grad_norm": 47598.56640625, + "learning_rate": 7.117798392765867e-05, + "loss": 2.2223, + "step": 7222 + }, + { + "epoch": 1.3538894095595126, + "grad_norm": 52526.46484375, + "learning_rate": 7.117086543481963e-05, + "loss": 2.2025, + "step": 7223 + }, + { + "epoch": 1.3540768509840675, + "grad_norm": 52574.0, + "learning_rate": 7.116374641908763e-05, + "loss": 2.1942, + "step": 7224 + }, + { + "epoch": 1.3542642924086223, + "grad_norm": 53311.73046875, + "learning_rate": 7.115662688063857e-05, + "loss": 2.2414, + "step": 7225 + }, + { + "epoch": 1.3544517338331772, + "grad_norm": 52493.69921875, + "learning_rate": 7.114950681964823e-05, + "loss": 2.245, + "step": 7226 + }, + { + "epoch": 1.354639175257732, + "grad_norm": 53230.15234375, + "learning_rate": 7.11423862362925e-05, + "loss": 2.2517, + "step": 7227 + }, + { + "epoch": 1.3548266166822867, + "grad_norm": 55925.71875, + "learning_rate": 7.113526513074725e-05, + "loss": 2.3381, + "step": 7228 + }, + { + "epoch": 1.3550140581068417, + "grad_norm": 52722.4921875, + "learning_rate": 7.112814350318834e-05, + "loss": 2.2302, + "step": 7229 + }, + { + "epoch": 1.3552014995313963, + "grad_norm": 50475.0390625, + "learning_rate": 7.112102135379169e-05, + "loss": 2.229, + "step": 7230 + }, + { + "epoch": 1.3553889409559512, + "grad_norm": 50234.2734375, + "learning_rate": 7.111389868273319e-05, + "loss": 2.243, + "step": 7231 + }, + { + "epoch": 1.355576382380506, + "grad_norm": 49870.19140625, + "learning_rate": 7.110677549018875e-05, + "loss": 2.2531, + "step": 7232 + }, + { + "epoch": 1.3557638238050609, + "grad_norm": 52609.5703125, + "learning_rate": 7.109965177633435e-05, + "loss": 2.257, + "step": 7233 + }, + { + "epoch": 1.3559512652296157, + "grad_norm": 52245.46484375, + "learning_rate": 7.10925275413459e-05, + "loss": 2.2717, + "step": 7234 + }, + { + "epoch": 1.3561387066541706, + "grad_norm": 52140.8046875, + "learning_rate": 7.108540278539935e-05, + "loss": 2.209, + "step": 7235 + }, + { + "epoch": 1.3563261480787254, + "grad_norm": 50247.98046875, + "learning_rate": 7.10782775086707e-05, + "loss": 2.2503, + "step": 7236 + }, + { + "epoch": 1.3565135895032803, + "grad_norm": 58707.57421875, + "learning_rate": 7.107115171133593e-05, + "loss": 2.1854, + "step": 7237 + }, + { + "epoch": 1.3567010309278351, + "grad_norm": 51381.38671875, + "learning_rate": 7.106402539357103e-05, + "loss": 2.2019, + "step": 7238 + }, + { + "epoch": 1.3568884723523897, + "grad_norm": 48534.73828125, + "learning_rate": 7.1056898555552e-05, + "loss": 2.2643, + "step": 7239 + }, + { + "epoch": 1.3570759137769448, + "grad_norm": 49661.48046875, + "learning_rate": 7.10497711974549e-05, + "loss": 2.2223, + "step": 7240 + }, + { + "epoch": 1.3572633552014994, + "grad_norm": 55846.1796875, + "learning_rate": 7.104264331945573e-05, + "loss": 2.1601, + "step": 7241 + }, + { + "epoch": 1.3574507966260543, + "grad_norm": 50027.48046875, + "learning_rate": 7.103551492173054e-05, + "loss": 2.2386, + "step": 7242 + }, + { + "epoch": 1.3576382380506091, + "grad_norm": 51327.34375, + "learning_rate": 7.102838600445542e-05, + "loss": 2.25, + "step": 7243 + }, + { + "epoch": 1.357825679475164, + "grad_norm": 51136.9921875, + "learning_rate": 7.102125656780641e-05, + "loss": 2.1987, + "step": 7244 + }, + { + "epoch": 1.3580131208997188, + "grad_norm": 54243.9609375, + "learning_rate": 7.101412661195964e-05, + "loss": 2.2779, + "step": 7245 + }, + { + "epoch": 1.3582005623242737, + "grad_norm": 52847.81640625, + "learning_rate": 7.100699613709118e-05, + "loss": 2.2204, + "step": 7246 + }, + { + "epoch": 1.3583880037488285, + "grad_norm": 53499.51953125, + "learning_rate": 7.099986514337715e-05, + "loss": 2.2629, + "step": 7247 + }, + { + "epoch": 1.3585754451733834, + "grad_norm": 54143.328125, + "learning_rate": 7.099273363099368e-05, + "loss": 2.1892, + "step": 7248 + }, + { + "epoch": 1.3587628865979382, + "grad_norm": 52420.9453125, + "learning_rate": 7.098560160011692e-05, + "loss": 2.142, + "step": 7249 + }, + { + "epoch": 1.358950328022493, + "grad_norm": 50226.1953125, + "learning_rate": 7.097846905092298e-05, + "loss": 2.2217, + "step": 7250 + }, + { + "epoch": 1.359137769447048, + "grad_norm": 47949.82421875, + "learning_rate": 7.097133598358807e-05, + "loss": 2.2397, + "step": 7251 + }, + { + "epoch": 1.3593252108716025, + "grad_norm": 51310.8671875, + "learning_rate": 7.096420239828835e-05, + "loss": 2.3126, + "step": 7252 + }, + { + "epoch": 1.3595126522961576, + "grad_norm": 49035.60546875, + "learning_rate": 7.095706829520001e-05, + "loss": 2.274, + "step": 7253 + }, + { + "epoch": 1.3597000937207122, + "grad_norm": 51470.8984375, + "learning_rate": 7.094993367449926e-05, + "loss": 2.2788, + "step": 7254 + }, + { + "epoch": 1.359887535145267, + "grad_norm": 50196.71875, + "learning_rate": 7.09427985363623e-05, + "loss": 2.2624, + "step": 7255 + }, + { + "epoch": 1.360074976569822, + "grad_norm": 50358.2890625, + "learning_rate": 7.093566288096537e-05, + "loss": 2.2403, + "step": 7256 + }, + { + "epoch": 1.3602624179943767, + "grad_norm": 50108.67578125, + "learning_rate": 7.092852670848471e-05, + "loss": 2.1851, + "step": 7257 + }, + { + "epoch": 1.3604498594189316, + "grad_norm": 49455.03515625, + "learning_rate": 7.092139001909658e-05, + "loss": 2.2547, + "step": 7258 + }, + { + "epoch": 1.3606373008434864, + "grad_norm": 53725.4375, + "learning_rate": 7.091425281297725e-05, + "loss": 2.2408, + "step": 7259 + }, + { + "epoch": 1.3608247422680413, + "grad_norm": 50847.609375, + "learning_rate": 7.090711509030296e-05, + "loss": 2.2402, + "step": 7260 + }, + { + "epoch": 1.3610121836925961, + "grad_norm": 48768.71484375, + "learning_rate": 7.089997685125008e-05, + "loss": 2.1805, + "step": 7261 + }, + { + "epoch": 1.361199625117151, + "grad_norm": 50645.06640625, + "learning_rate": 7.089283809599482e-05, + "loss": 2.191, + "step": 7262 + }, + { + "epoch": 1.3613870665417056, + "grad_norm": 57774.22265625, + "learning_rate": 7.088569882471359e-05, + "loss": 2.2899, + "step": 7263 + }, + { + "epoch": 1.3615745079662607, + "grad_norm": 48943.234375, + "learning_rate": 7.087855903758265e-05, + "loss": 2.2518, + "step": 7264 + }, + { + "epoch": 1.3617619493908153, + "grad_norm": 50905.23828125, + "learning_rate": 7.087141873477838e-05, + "loss": 2.281, + "step": 7265 + }, + { + "epoch": 1.3619493908153701, + "grad_norm": 51833.890625, + "learning_rate": 7.086427791647714e-05, + "loss": 2.1986, + "step": 7266 + }, + { + "epoch": 1.362136832239925, + "grad_norm": 49569.203125, + "learning_rate": 7.085713658285526e-05, + "loss": 2.2703, + "step": 7267 + }, + { + "epoch": 1.3623242736644798, + "grad_norm": 48974.140625, + "learning_rate": 7.084999473408917e-05, + "loss": 2.2649, + "step": 7268 + }, + { + "epoch": 1.3625117150890347, + "grad_norm": 48008.9375, + "learning_rate": 7.084285237035523e-05, + "loss": 2.2441, + "step": 7269 + }, + { + "epoch": 1.3626991565135895, + "grad_norm": 51695.0234375, + "learning_rate": 7.083570949182987e-05, + "loss": 2.2394, + "step": 7270 + }, + { + "epoch": 1.3628865979381444, + "grad_norm": 52426.74609375, + "learning_rate": 7.082856609868948e-05, + "loss": 2.2269, + "step": 7271 + }, + { + "epoch": 1.3630740393626992, + "grad_norm": 52632.34765625, + "learning_rate": 7.082142219111053e-05, + "loss": 2.2146, + "step": 7272 + }, + { + "epoch": 1.363261480787254, + "grad_norm": 53573.33203125, + "learning_rate": 7.081427776926945e-05, + "loss": 2.2121, + "step": 7273 + }, + { + "epoch": 1.3634489222118087, + "grad_norm": 57406.70703125, + "learning_rate": 7.080713283334269e-05, + "loss": 2.2572, + "step": 7274 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 53597.78515625, + "learning_rate": 7.079998738350671e-05, + "loss": 2.2014, + "step": 7275 + }, + { + "epoch": 1.3638238050609184, + "grad_norm": 50432.4765625, + "learning_rate": 7.079284141993803e-05, + "loss": 2.2277, + "step": 7276 + }, + { + "epoch": 1.3640112464854732, + "grad_norm": 50033.72265625, + "learning_rate": 7.078569494281311e-05, + "loss": 2.2028, + "step": 7277 + }, + { + "epoch": 1.364198687910028, + "grad_norm": 47798.41796875, + "learning_rate": 7.077854795230849e-05, + "loss": 2.2947, + "step": 7278 + }, + { + "epoch": 1.364386129334583, + "grad_norm": 52210.94921875, + "learning_rate": 7.077140044860066e-05, + "loss": 2.2064, + "step": 7279 + }, + { + "epoch": 1.3645735707591378, + "grad_norm": 47603.78515625, + "learning_rate": 7.076425243186617e-05, + "loss": 2.2956, + "step": 7280 + }, + { + "epoch": 1.3647610121836926, + "grad_norm": 49371.42578125, + "learning_rate": 7.075710390228158e-05, + "loss": 2.2627, + "step": 7281 + }, + { + "epoch": 1.3649484536082475, + "grad_norm": 50024.52734375, + "learning_rate": 7.074995486002343e-05, + "loss": 2.2519, + "step": 7282 + }, + { + "epoch": 1.3651358950328023, + "grad_norm": 49398.58984375, + "learning_rate": 7.07428053052683e-05, + "loss": 2.2655, + "step": 7283 + }, + { + "epoch": 1.3653233364573572, + "grad_norm": 52274.8984375, + "learning_rate": 7.073565523819277e-05, + "loss": 2.2245, + "step": 7284 + }, + { + "epoch": 1.3655107778819118, + "grad_norm": 51924.359375, + "learning_rate": 7.072850465897344e-05, + "loss": 2.1877, + "step": 7285 + }, + { + "epoch": 1.3656982193064668, + "grad_norm": 51659.2109375, + "learning_rate": 7.072135356778691e-05, + "loss": 2.2617, + "step": 7286 + }, + { + "epoch": 1.3658856607310215, + "grad_norm": 48135.79296875, + "learning_rate": 7.071420196480983e-05, + "loss": 2.2055, + "step": 7287 + }, + { + "epoch": 1.3660731021555763, + "grad_norm": 51385.8046875, + "learning_rate": 7.070704985021881e-05, + "loss": 2.279, + "step": 7288 + }, + { + "epoch": 1.3662605435801312, + "grad_norm": 52327.00390625, + "learning_rate": 7.06998972241905e-05, + "loss": 2.1949, + "step": 7289 + }, + { + "epoch": 1.366447985004686, + "grad_norm": 48743.75390625, + "learning_rate": 7.069274408690157e-05, + "loss": 2.2086, + "step": 7290 + }, + { + "epoch": 1.3666354264292409, + "grad_norm": 58697.3671875, + "learning_rate": 7.06855904385287e-05, + "loss": 2.2242, + "step": 7291 + }, + { + "epoch": 1.3668228678537957, + "grad_norm": 52113.73828125, + "learning_rate": 7.067843627924854e-05, + "loss": 2.2024, + "step": 7292 + }, + { + "epoch": 1.3670103092783505, + "grad_norm": 48586.57421875, + "learning_rate": 7.067128160923784e-05, + "loss": 2.2141, + "step": 7293 + }, + { + "epoch": 1.3671977507029054, + "grad_norm": 49897.34765625, + "learning_rate": 7.066412642867329e-05, + "loss": 2.3386, + "step": 7294 + }, + { + "epoch": 1.3673851921274602, + "grad_norm": 56687.64453125, + "learning_rate": 7.065697073773157e-05, + "loss": 2.2796, + "step": 7295 + }, + { + "epoch": 1.3675726335520149, + "grad_norm": 52574.9296875, + "learning_rate": 7.064981453658948e-05, + "loss": 2.2411, + "step": 7296 + }, + { + "epoch": 1.36776007497657, + "grad_norm": 53550.26953125, + "learning_rate": 7.064265782542374e-05, + "loss": 2.1269, + "step": 7297 + }, + { + "epoch": 1.3679475164011246, + "grad_norm": 47975.03515625, + "learning_rate": 7.063550060441112e-05, + "loss": 2.2734, + "step": 7298 + }, + { + "epoch": 1.3681349578256794, + "grad_norm": 50273.875, + "learning_rate": 7.06283428737284e-05, + "loss": 2.1937, + "step": 7299 + }, + { + "epoch": 1.3683223992502342, + "grad_norm": 54104.36328125, + "learning_rate": 7.062118463355233e-05, + "loss": 2.2282, + "step": 7300 + }, + { + "epoch": 1.368509840674789, + "grad_norm": 49250.21875, + "learning_rate": 7.061402588405975e-05, + "loss": 2.1873, + "step": 7301 + }, + { + "epoch": 1.368697282099344, + "grad_norm": 49093.91796875, + "learning_rate": 7.060686662542745e-05, + "loss": 2.3041, + "step": 7302 + }, + { + "epoch": 1.3688847235238988, + "grad_norm": 49618.6953125, + "learning_rate": 7.059970685783228e-05, + "loss": 2.2425, + "step": 7303 + }, + { + "epoch": 1.3690721649484536, + "grad_norm": 52772.6875, + "learning_rate": 7.059254658145103e-05, + "loss": 2.1748, + "step": 7304 + }, + { + "epoch": 1.3692596063730085, + "grad_norm": 52084.14453125, + "learning_rate": 7.05853857964606e-05, + "loss": 2.1802, + "step": 7305 + }, + { + "epoch": 1.3694470477975633, + "grad_norm": 49589.35546875, + "learning_rate": 7.057822450303782e-05, + "loss": 2.2701, + "step": 7306 + }, + { + "epoch": 1.3696344892221182, + "grad_norm": 49943.515625, + "learning_rate": 7.057106270135958e-05, + "loss": 2.2633, + "step": 7307 + }, + { + "epoch": 1.369821930646673, + "grad_norm": 53726.87890625, + "learning_rate": 7.056390039160277e-05, + "loss": 2.2172, + "step": 7308 + }, + { + "epoch": 1.3700093720712276, + "grad_norm": 51456.52734375, + "learning_rate": 7.055673757394427e-05, + "loss": 2.2273, + "step": 7309 + }, + { + "epoch": 1.3701968134957827, + "grad_norm": 46897.9921875, + "learning_rate": 7.0549574248561e-05, + "loss": 2.2305, + "step": 7310 + }, + { + "epoch": 1.3703842549203373, + "grad_norm": 47337.0078125, + "learning_rate": 7.05424104156299e-05, + "loss": 2.2148, + "step": 7311 + }, + { + "epoch": 1.3705716963448922, + "grad_norm": 52895.05078125, + "learning_rate": 7.053524607532788e-05, + "loss": 2.2056, + "step": 7312 + }, + { + "epoch": 1.370759137769447, + "grad_norm": 50373.05859375, + "learning_rate": 7.052808122783191e-05, + "loss": 2.1705, + "step": 7313 + }, + { + "epoch": 1.3709465791940019, + "grad_norm": 53715.70703125, + "learning_rate": 7.052091587331897e-05, + "loss": 2.1884, + "step": 7314 + }, + { + "epoch": 1.3711340206185567, + "grad_norm": 53226.84765625, + "learning_rate": 7.0513750011966e-05, + "loss": 2.2283, + "step": 7315 + }, + { + "epoch": 1.3713214620431116, + "grad_norm": 49061.56640625, + "learning_rate": 7.050658364395e-05, + "loss": 2.2315, + "step": 7316 + }, + { + "epoch": 1.3715089034676664, + "grad_norm": 51592.3125, + "learning_rate": 7.049941676944798e-05, + "loss": 2.2202, + "step": 7317 + }, + { + "epoch": 1.3716963448922213, + "grad_norm": 47485.203125, + "learning_rate": 7.049224938863694e-05, + "loss": 2.2005, + "step": 7318 + }, + { + "epoch": 1.371883786316776, + "grad_norm": 51873.1015625, + "learning_rate": 7.04850815016939e-05, + "loss": 2.2462, + "step": 7319 + }, + { + "epoch": 1.3720712277413307, + "grad_norm": 53658.28125, + "learning_rate": 7.047791310879591e-05, + "loss": 2.2134, + "step": 7320 + }, + { + "epoch": 1.3722586691658858, + "grad_norm": 50206.30859375, + "learning_rate": 7.047074421012002e-05, + "loss": 2.1705, + "step": 7321 + }, + { + "epoch": 1.3724461105904404, + "grad_norm": 52143.703125, + "learning_rate": 7.046357480584328e-05, + "loss": 2.1929, + "step": 7322 + }, + { + "epoch": 1.3726335520149953, + "grad_norm": 50196.31640625, + "learning_rate": 7.045640489614278e-05, + "loss": 2.2861, + "step": 7323 + }, + { + "epoch": 1.37282099343955, + "grad_norm": 50284.31640625, + "learning_rate": 7.044923448119561e-05, + "loss": 2.1695, + "step": 7324 + }, + { + "epoch": 1.373008434864105, + "grad_norm": 50442.55859375, + "learning_rate": 7.044206356117884e-05, + "loss": 2.2272, + "step": 7325 + }, + { + "epoch": 1.3731958762886598, + "grad_norm": 49100.94140625, + "learning_rate": 7.043489213626963e-05, + "loss": 2.2757, + "step": 7326 + }, + { + "epoch": 1.3733833177132146, + "grad_norm": 52062.828125, + "learning_rate": 7.042772020664506e-05, + "loss": 2.2066, + "step": 7327 + }, + { + "epoch": 1.3735707591377695, + "grad_norm": 48171.640625, + "learning_rate": 7.042054777248229e-05, + "loss": 2.2861, + "step": 7328 + }, + { + "epoch": 1.3737582005623243, + "grad_norm": 53279.12890625, + "learning_rate": 7.041337483395847e-05, + "loss": 2.2264, + "step": 7329 + }, + { + "epoch": 1.3739456419868792, + "grad_norm": 51975.7734375, + "learning_rate": 7.040620139125075e-05, + "loss": 2.2795, + "step": 7330 + }, + { + "epoch": 1.3741330834114338, + "grad_norm": 50379.12109375, + "learning_rate": 7.039902744453632e-05, + "loss": 2.2757, + "step": 7331 + }, + { + "epoch": 1.3743205248359889, + "grad_norm": 51658.9453125, + "learning_rate": 7.039185299399236e-05, + "loss": 2.2577, + "step": 7332 + }, + { + "epoch": 1.3745079662605435, + "grad_norm": 47975.40625, + "learning_rate": 7.038467803979606e-05, + "loss": 2.2182, + "step": 7333 + }, + { + "epoch": 1.3746954076850983, + "grad_norm": 52387.5078125, + "learning_rate": 7.037750258212466e-05, + "loss": 2.1778, + "step": 7334 + }, + { + "epoch": 1.3748828491096532, + "grad_norm": 60742.31640625, + "learning_rate": 7.037032662115535e-05, + "loss": 2.2191, + "step": 7335 + }, + { + "epoch": 1.375070290534208, + "grad_norm": 49985.56640625, + "learning_rate": 7.036315015706537e-05, + "loss": 2.2694, + "step": 7336 + }, + { + "epoch": 1.3752577319587629, + "grad_norm": 48422.41796875, + "learning_rate": 7.035597319003199e-05, + "loss": 2.1924, + "step": 7337 + }, + { + "epoch": 1.3754451733833177, + "grad_norm": 51962.76171875, + "learning_rate": 7.034879572023249e-05, + "loss": 2.2475, + "step": 7338 + }, + { + "epoch": 1.3756326148078726, + "grad_norm": 52570.62109375, + "learning_rate": 7.03416177478441e-05, + "loss": 2.1571, + "step": 7339 + }, + { + "epoch": 1.3758200562324274, + "grad_norm": 50632.88671875, + "learning_rate": 7.033443927304412e-05, + "loss": 2.1977, + "step": 7340 + }, + { + "epoch": 1.3760074976569823, + "grad_norm": 48149.296875, + "learning_rate": 7.032726029600986e-05, + "loss": 2.2948, + "step": 7341 + }, + { + "epoch": 1.376194939081537, + "grad_norm": 46189.39453125, + "learning_rate": 7.032008081691863e-05, + "loss": 2.2281, + "step": 7342 + }, + { + "epoch": 1.376382380506092, + "grad_norm": 49144.93359375, + "learning_rate": 7.031290083594775e-05, + "loss": 2.2258, + "step": 7343 + }, + { + "epoch": 1.3765698219306466, + "grad_norm": 47784.62890625, + "learning_rate": 7.030572035327455e-05, + "loss": 2.2232, + "step": 7344 + }, + { + "epoch": 1.3767572633552014, + "grad_norm": 53424.296875, + "learning_rate": 7.029853936907638e-05, + "loss": 2.2154, + "step": 7345 + }, + { + "epoch": 1.3769447047797563, + "grad_norm": 53779.5234375, + "learning_rate": 7.029135788353061e-05, + "loss": 2.3052, + "step": 7346 + }, + { + "epoch": 1.3771321462043111, + "grad_norm": 48767.36328125, + "learning_rate": 7.02841758968146e-05, + "loss": 2.1726, + "step": 7347 + }, + { + "epoch": 1.377319587628866, + "grad_norm": 54429.02734375, + "learning_rate": 7.027699340910577e-05, + "loss": 2.3073, + "step": 7348 + }, + { + "epoch": 1.3775070290534208, + "grad_norm": 53216.02734375, + "learning_rate": 7.026981042058146e-05, + "loss": 2.1943, + "step": 7349 + }, + { + "epoch": 1.3776944704779757, + "grad_norm": 50633.5703125, + "learning_rate": 7.026262693141913e-05, + "loss": 2.2209, + "step": 7350 + }, + { + "epoch": 1.3778819119025305, + "grad_norm": 54430.0, + "learning_rate": 7.025544294179618e-05, + "loss": 2.3189, + "step": 7351 + }, + { + "epoch": 1.3780693533270854, + "grad_norm": 54457.640625, + "learning_rate": 7.024825845189005e-05, + "loss": 2.2853, + "step": 7352 + }, + { + "epoch": 1.37825679475164, + "grad_norm": 51439.4609375, + "learning_rate": 7.024107346187819e-05, + "loss": 2.3518, + "step": 7353 + }, + { + "epoch": 1.378444236176195, + "grad_norm": 54535.75390625, + "learning_rate": 7.023388797193808e-05, + "loss": 2.2127, + "step": 7354 + }, + { + "epoch": 1.3786316776007497, + "grad_norm": 52221.03515625, + "learning_rate": 7.022670198224714e-05, + "loss": 2.2589, + "step": 7355 + }, + { + "epoch": 1.3788191190253045, + "grad_norm": 52422.30078125, + "learning_rate": 7.02195154929829e-05, + "loss": 2.2909, + "step": 7356 + }, + { + "epoch": 1.3790065604498594, + "grad_norm": 48394.6328125, + "learning_rate": 7.021232850432284e-05, + "loss": 2.2503, + "step": 7357 + }, + { + "epoch": 1.3791940018744142, + "grad_norm": 49746.17578125, + "learning_rate": 7.020514101644446e-05, + "loss": 2.3371, + "step": 7358 + }, + { + "epoch": 1.379381443298969, + "grad_norm": 53926.46484375, + "learning_rate": 7.019795302952532e-05, + "loss": 2.1582, + "step": 7359 + }, + { + "epoch": 1.379568884723524, + "grad_norm": 55395.93359375, + "learning_rate": 7.01907645437429e-05, + "loss": 2.2554, + "step": 7360 + }, + { + "epoch": 1.3797563261480787, + "grad_norm": 50439.19140625, + "learning_rate": 7.018357555927479e-05, + "loss": 2.275, + "step": 7361 + }, + { + "epoch": 1.3799437675726336, + "grad_norm": 54243.0546875, + "learning_rate": 7.017638607629852e-05, + "loss": 2.2108, + "step": 7362 + }, + { + "epoch": 1.3801312089971884, + "grad_norm": 53892.2109375, + "learning_rate": 7.016919609499168e-05, + "loss": 2.2912, + "step": 7363 + }, + { + "epoch": 1.3803186504217433, + "grad_norm": 49750.66015625, + "learning_rate": 7.016200561553184e-05, + "loss": 2.2904, + "step": 7364 + }, + { + "epoch": 1.3805060918462981, + "grad_norm": 48046.3125, + "learning_rate": 7.015481463809661e-05, + "loss": 2.2736, + "step": 7365 + }, + { + "epoch": 1.3806935332708528, + "grad_norm": 57329.8046875, + "learning_rate": 7.014762316286359e-05, + "loss": 2.1703, + "step": 7366 + }, + { + "epoch": 1.3808809746954076, + "grad_norm": 53194.93359375, + "learning_rate": 7.01404311900104e-05, + "loss": 2.2235, + "step": 7367 + }, + { + "epoch": 1.3810684161199624, + "grad_norm": 54723.5859375, + "learning_rate": 7.013323871971465e-05, + "loss": 2.2164, + "step": 7368 + }, + { + "epoch": 1.3812558575445173, + "grad_norm": 52451.63671875, + "learning_rate": 7.012604575215404e-05, + "loss": 2.2903, + "step": 7369 + }, + { + "epoch": 1.3814432989690721, + "grad_norm": 51953.078125, + "learning_rate": 7.011885228750618e-05, + "loss": 2.2877, + "step": 7370 + }, + { + "epoch": 1.381630740393627, + "grad_norm": 50773.7109375, + "learning_rate": 7.011165832594875e-05, + "loss": 2.2241, + "step": 7371 + }, + { + "epoch": 1.3818181818181818, + "grad_norm": 53319.29296875, + "learning_rate": 7.010446386765944e-05, + "loss": 2.2233, + "step": 7372 + }, + { + "epoch": 1.3820056232427367, + "grad_norm": 50491.9609375, + "learning_rate": 7.009726891281593e-05, + "loss": 2.2437, + "step": 7373 + }, + { + "epoch": 1.3821930646672915, + "grad_norm": 49008.53515625, + "learning_rate": 7.009007346159595e-05, + "loss": 2.3171, + "step": 7374 + }, + { + "epoch": 1.3823805060918464, + "grad_norm": 49756.94140625, + "learning_rate": 7.008287751417718e-05, + "loss": 2.2412, + "step": 7375 + }, + { + "epoch": 1.3825679475164012, + "grad_norm": 51831.8203125, + "learning_rate": 7.007568107073739e-05, + "loss": 2.2864, + "step": 7376 + }, + { + "epoch": 1.3827553889409558, + "grad_norm": 53443.82421875, + "learning_rate": 7.006848413145431e-05, + "loss": 2.2235, + "step": 7377 + }, + { + "epoch": 1.382942830365511, + "grad_norm": 48839.12109375, + "learning_rate": 7.006128669650567e-05, + "loss": 2.2397, + "step": 7378 + }, + { + "epoch": 1.3831302717900655, + "grad_norm": 48986.54296875, + "learning_rate": 7.005408876606927e-05, + "loss": 2.3013, + "step": 7379 + }, + { + "epoch": 1.3833177132146204, + "grad_norm": 48382.35546875, + "learning_rate": 7.004689034032288e-05, + "loss": 2.2605, + "step": 7380 + }, + { + "epoch": 1.3835051546391752, + "grad_norm": 51222.0390625, + "learning_rate": 7.00396914194443e-05, + "loss": 2.1676, + "step": 7381 + }, + { + "epoch": 1.38369259606373, + "grad_norm": 49746.48828125, + "learning_rate": 7.003249200361131e-05, + "loss": 2.2104, + "step": 7382 + }, + { + "epoch": 1.383880037488285, + "grad_norm": 54869.6875, + "learning_rate": 7.002529209300173e-05, + "loss": 2.3642, + "step": 7383 + }, + { + "epoch": 1.3840674789128398, + "grad_norm": 52882.421875, + "learning_rate": 7.001809168779342e-05, + "loss": 2.3344, + "step": 7384 + }, + { + "epoch": 1.3842549203373946, + "grad_norm": 47212.13671875, + "learning_rate": 7.001089078816418e-05, + "loss": 2.2681, + "step": 7385 + }, + { + "epoch": 1.3844423617619495, + "grad_norm": 51199.1953125, + "learning_rate": 7.00036893942919e-05, + "loss": 2.1712, + "step": 7386 + }, + { + "epoch": 1.3846298031865043, + "grad_norm": 48735.1484375, + "learning_rate": 6.999648750635443e-05, + "loss": 2.2263, + "step": 7387 + }, + { + "epoch": 1.384817244611059, + "grad_norm": 52246.984375, + "learning_rate": 6.998928512452961e-05, + "loss": 2.1964, + "step": 7388 + }, + { + "epoch": 1.385004686035614, + "grad_norm": 49766.63671875, + "learning_rate": 6.998208224899541e-05, + "loss": 2.2738, + "step": 7389 + }, + { + "epoch": 1.3851921274601686, + "grad_norm": 54782.58984375, + "learning_rate": 6.997487887992964e-05, + "loss": 2.2495, + "step": 7390 + }, + { + "epoch": 1.3853795688847235, + "grad_norm": 46177.0546875, + "learning_rate": 6.996767501751029e-05, + "loss": 2.2488, + "step": 7391 + }, + { + "epoch": 1.3855670103092783, + "grad_norm": 49280.77734375, + "learning_rate": 6.996047066191525e-05, + "loss": 2.2507, + "step": 7392 + }, + { + "epoch": 1.3857544517338332, + "grad_norm": 46779.05859375, + "learning_rate": 6.995326581332247e-05, + "loss": 2.2682, + "step": 7393 + }, + { + "epoch": 1.385941893158388, + "grad_norm": 51777.60546875, + "learning_rate": 6.994606047190988e-05, + "loss": 2.2036, + "step": 7394 + }, + { + "epoch": 1.3861293345829429, + "grad_norm": 53098.07421875, + "learning_rate": 6.993885463785546e-05, + "loss": 2.248, + "step": 7395 + }, + { + "epoch": 1.3863167760074977, + "grad_norm": 47408.59765625, + "learning_rate": 6.993164831133718e-05, + "loss": 2.2203, + "step": 7396 + }, + { + "epoch": 1.3865042174320525, + "grad_norm": 52945.52734375, + "learning_rate": 6.992444149253305e-05, + "loss": 2.2292, + "step": 7397 + }, + { + "epoch": 1.3866916588566074, + "grad_norm": 53311.82421875, + "learning_rate": 6.991723418162102e-05, + "loss": 2.222, + "step": 7398 + }, + { + "epoch": 1.386879100281162, + "grad_norm": 48064.58984375, + "learning_rate": 6.991002637877914e-05, + "loss": 2.1702, + "step": 7399 + }, + { + "epoch": 1.387066541705717, + "grad_norm": 52525.45703125, + "learning_rate": 6.990281808418542e-05, + "loss": 2.2355, + "step": 7400 + }, + { + "epoch": 1.3872539831302717, + "grad_norm": 52818.3984375, + "learning_rate": 6.989560929801789e-05, + "loss": 2.2626, + "step": 7401 + }, + { + "epoch": 1.3874414245548266, + "grad_norm": 53188.84765625, + "learning_rate": 6.988840002045462e-05, + "loss": 2.285, + "step": 7402 + }, + { + "epoch": 1.3876288659793814, + "grad_norm": 56000.625, + "learning_rate": 6.988119025167364e-05, + "loss": 2.2258, + "step": 7403 + }, + { + "epoch": 1.3878163074039362, + "grad_norm": 52176.07421875, + "learning_rate": 6.987397999185306e-05, + "loss": 2.2494, + "step": 7404 + }, + { + "epoch": 1.388003748828491, + "grad_norm": 51456.44140625, + "learning_rate": 6.986676924117092e-05, + "loss": 2.2015, + "step": 7405 + }, + { + "epoch": 1.388191190253046, + "grad_norm": 50535.30859375, + "learning_rate": 6.985955799980536e-05, + "loss": 2.2449, + "step": 7406 + }, + { + "epoch": 1.3883786316776008, + "grad_norm": 47212.203125, + "learning_rate": 6.985234626793445e-05, + "loss": 2.2059, + "step": 7407 + }, + { + "epoch": 1.3885660731021556, + "grad_norm": 49459.2578125, + "learning_rate": 6.984513404573631e-05, + "loss": 2.2286, + "step": 7408 + }, + { + "epoch": 1.3887535145267105, + "grad_norm": 56733.96875, + "learning_rate": 6.983792133338911e-05, + "loss": 2.1857, + "step": 7409 + }, + { + "epoch": 1.388940955951265, + "grad_norm": 50774.296875, + "learning_rate": 6.983070813107098e-05, + "loss": 2.2198, + "step": 7410 + }, + { + "epoch": 1.3891283973758202, + "grad_norm": 48969.08984375, + "learning_rate": 6.982349443896004e-05, + "loss": 2.2455, + "step": 7411 + }, + { + "epoch": 1.3893158388003748, + "grad_norm": 54897.53125, + "learning_rate": 6.981628025723452e-05, + "loss": 2.2216, + "step": 7412 + }, + { + "epoch": 1.3895032802249296, + "grad_norm": 56361.4140625, + "learning_rate": 6.980906558607257e-05, + "loss": 2.2777, + "step": 7413 + }, + { + "epoch": 1.3896907216494845, + "grad_norm": 49217.64453125, + "learning_rate": 6.980185042565236e-05, + "loss": 2.2074, + "step": 7414 + }, + { + "epoch": 1.3898781630740393, + "grad_norm": 53620.51953125, + "learning_rate": 6.979463477615214e-05, + "loss": 2.2096, + "step": 7415 + }, + { + "epoch": 1.3900656044985942, + "grad_norm": 49968.625, + "learning_rate": 6.978741863775008e-05, + "loss": 2.2125, + "step": 7416 + }, + { + "epoch": 1.390253045923149, + "grad_norm": 59215.98046875, + "learning_rate": 6.978020201062447e-05, + "loss": 2.1787, + "step": 7417 + }, + { + "epoch": 1.3904404873477039, + "grad_norm": 53574.22265625, + "learning_rate": 6.977298489495347e-05, + "loss": 2.2178, + "step": 7418 + }, + { + "epoch": 1.3906279287722587, + "grad_norm": 48020.85546875, + "learning_rate": 6.976576729091542e-05, + "loss": 2.2715, + "step": 7419 + }, + { + "epoch": 1.3908153701968136, + "grad_norm": 51584.93359375, + "learning_rate": 6.975854919868851e-05, + "loss": 2.2178, + "step": 7420 + }, + { + "epoch": 1.3910028116213682, + "grad_norm": 49336.91796875, + "learning_rate": 6.975133061845108e-05, + "loss": 2.2747, + "step": 7421 + }, + { + "epoch": 1.3911902530459233, + "grad_norm": 48846.1875, + "learning_rate": 6.974411155038139e-05, + "loss": 2.2921, + "step": 7422 + }, + { + "epoch": 1.3913776944704779, + "grad_norm": 49852.99609375, + "learning_rate": 6.973689199465773e-05, + "loss": 2.2835, + "step": 7423 + }, + { + "epoch": 1.3915651358950327, + "grad_norm": 54687.35546875, + "learning_rate": 6.972967195145843e-05, + "loss": 2.2567, + "step": 7424 + }, + { + "epoch": 1.3917525773195876, + "grad_norm": 52768.68359375, + "learning_rate": 6.972245142096182e-05, + "loss": 2.258, + "step": 7425 + }, + { + "epoch": 1.3919400187441424, + "grad_norm": 52137.9765625, + "learning_rate": 6.971523040334622e-05, + "loss": 2.2334, + "step": 7426 + }, + { + "epoch": 1.3921274601686973, + "grad_norm": 55430.0078125, + "learning_rate": 6.970800889879e-05, + "loss": 2.2594, + "step": 7427 + }, + { + "epoch": 1.392314901593252, + "grad_norm": 52189.8984375, + "learning_rate": 6.97007869074715e-05, + "loss": 2.3152, + "step": 7428 + }, + { + "epoch": 1.392502343017807, + "grad_norm": 48934.1328125, + "learning_rate": 6.969356442956911e-05, + "loss": 2.2519, + "step": 7429 + }, + { + "epoch": 1.3926897844423618, + "grad_norm": 47474.06640625, + "learning_rate": 6.968634146526121e-05, + "loss": 2.2074, + "step": 7430 + }, + { + "epoch": 1.3928772258669166, + "grad_norm": 55934.28515625, + "learning_rate": 6.967911801472621e-05, + "loss": 2.2772, + "step": 7431 + }, + { + "epoch": 1.3930646672914715, + "grad_norm": 51156.27734375, + "learning_rate": 6.967189407814251e-05, + "loss": 2.2887, + "step": 7432 + }, + { + "epoch": 1.3932521087160263, + "grad_norm": 49801.375, + "learning_rate": 6.966466965568852e-05, + "loss": 2.2668, + "step": 7433 + }, + { + "epoch": 1.393439550140581, + "grad_norm": 51346.05859375, + "learning_rate": 6.96574447475427e-05, + "loss": 2.29, + "step": 7434 + }, + { + "epoch": 1.393626991565136, + "grad_norm": 51138.046875, + "learning_rate": 6.965021935388347e-05, + "loss": 2.1914, + "step": 7435 + }, + { + "epoch": 1.3938144329896907, + "grad_norm": 50668.32421875, + "learning_rate": 6.964299347488929e-05, + "loss": 2.2149, + "step": 7436 + }, + { + "epoch": 1.3940018744142455, + "grad_norm": 51452.62109375, + "learning_rate": 6.963576711073865e-05, + "loss": 2.2267, + "step": 7437 + }, + { + "epoch": 1.3941893158388003, + "grad_norm": 51733.9375, + "learning_rate": 6.962854026161002e-05, + "loss": 2.252, + "step": 7438 + }, + { + "epoch": 1.3943767572633552, + "grad_norm": 52242.32421875, + "learning_rate": 6.96213129276819e-05, + "loss": 2.2714, + "step": 7439 + }, + { + "epoch": 1.39456419868791, + "grad_norm": 51452.546875, + "learning_rate": 6.961408510913278e-05, + "loss": 2.285, + "step": 7440 + }, + { + "epoch": 1.3947516401124649, + "grad_norm": 53039.4296875, + "learning_rate": 6.96068568061412e-05, + "loss": 2.2237, + "step": 7441 + }, + { + "epoch": 1.3949390815370197, + "grad_norm": 49124.18359375, + "learning_rate": 6.959962801888567e-05, + "loss": 2.2434, + "step": 7442 + }, + { + "epoch": 1.3951265229615746, + "grad_norm": 47833.72265625, + "learning_rate": 6.959239874754473e-05, + "loss": 2.2017, + "step": 7443 + }, + { + "epoch": 1.3953139643861294, + "grad_norm": 51713.37109375, + "learning_rate": 6.958516899229694e-05, + "loss": 2.235, + "step": 7444 + }, + { + "epoch": 1.395501405810684, + "grad_norm": 50659.90234375, + "learning_rate": 6.95779387533209e-05, + "loss": 2.2259, + "step": 7445 + }, + { + "epoch": 1.3956888472352391, + "grad_norm": 51518.11328125, + "learning_rate": 6.957070803079511e-05, + "loss": 2.3028, + "step": 7446 + }, + { + "epoch": 1.3958762886597937, + "grad_norm": 49624.64453125, + "learning_rate": 6.956347682489825e-05, + "loss": 2.2134, + "step": 7447 + }, + { + "epoch": 1.3960637300843486, + "grad_norm": 51572.25, + "learning_rate": 6.955624513580885e-05, + "loss": 2.164, + "step": 7448 + }, + { + "epoch": 1.3962511715089034, + "grad_norm": 54723.734375, + "learning_rate": 6.954901296370556e-05, + "loss": 2.213, + "step": 7449 + }, + { + "epoch": 1.3964386129334583, + "grad_norm": 48099.578125, + "learning_rate": 6.954178030876699e-05, + "loss": 2.2274, + "step": 7450 + }, + { + "epoch": 1.3966260543580131, + "grad_norm": 55112.27734375, + "learning_rate": 6.953454717117179e-05, + "loss": 2.234, + "step": 7451 + }, + { + "epoch": 1.396813495782568, + "grad_norm": 50161.17578125, + "learning_rate": 6.952731355109859e-05, + "loss": 2.1644, + "step": 7452 + }, + { + "epoch": 1.3970009372071228, + "grad_norm": 50143.0, + "learning_rate": 6.952007944872607e-05, + "loss": 2.2331, + "step": 7453 + }, + { + "epoch": 1.3971883786316777, + "grad_norm": 49329.93359375, + "learning_rate": 6.95128448642329e-05, + "loss": 2.2434, + "step": 7454 + }, + { + "epoch": 1.3973758200562325, + "grad_norm": 52618.7109375, + "learning_rate": 6.950560979779777e-05, + "loss": 2.271, + "step": 7455 + }, + { + "epoch": 1.3975632614807871, + "grad_norm": 54093.8125, + "learning_rate": 6.949837424959935e-05, + "loss": 2.185, + "step": 7456 + }, + { + "epoch": 1.3977507029053422, + "grad_norm": 55727.00390625, + "learning_rate": 6.949113821981636e-05, + "loss": 2.2708, + "step": 7457 + }, + { + "epoch": 1.3979381443298968, + "grad_norm": 52259.328125, + "learning_rate": 6.948390170862755e-05, + "loss": 2.1566, + "step": 7458 + }, + { + "epoch": 1.3981255857544517, + "grad_norm": 52064.87890625, + "learning_rate": 6.947666471621161e-05, + "loss": 2.2157, + "step": 7459 + }, + { + "epoch": 1.3983130271790065, + "grad_norm": 48237.2734375, + "learning_rate": 6.946942724274732e-05, + "loss": 2.179, + "step": 7460 + }, + { + "epoch": 1.3985004686035614, + "grad_norm": 51048.7578125, + "learning_rate": 6.946218928841339e-05, + "loss": 2.2419, + "step": 7461 + }, + { + "epoch": 1.3986879100281162, + "grad_norm": 55035.01953125, + "learning_rate": 6.945495085338865e-05, + "loss": 2.2407, + "step": 7462 + }, + { + "epoch": 1.398875351452671, + "grad_norm": 76348.8046875, + "learning_rate": 6.944771193785184e-05, + "loss": 2.2475, + "step": 7463 + }, + { + "epoch": 1.399062792877226, + "grad_norm": 47787.1484375, + "learning_rate": 6.944047254198175e-05, + "loss": 2.2523, + "step": 7464 + }, + { + "epoch": 1.3992502343017807, + "grad_norm": 49495.75, + "learning_rate": 6.943323266595719e-05, + "loss": 2.2, + "step": 7465 + }, + { + "epoch": 1.3994376757263356, + "grad_norm": 49589.125, + "learning_rate": 6.942599230995699e-05, + "loss": 2.2304, + "step": 7466 + }, + { + "epoch": 1.3996251171508902, + "grad_norm": 52209.02734375, + "learning_rate": 6.941875147415996e-05, + "loss": 2.2239, + "step": 7467 + }, + { + "epoch": 1.3998125585754453, + "grad_norm": 53991.35546875, + "learning_rate": 6.941151015874494e-05, + "loss": 2.2676, + "step": 7468 + }, + { + "epoch": 1.4, + "grad_norm": 50195.5625, + "learning_rate": 6.940426836389082e-05, + "loss": 2.2475, + "step": 7469 + }, + { + "epoch": 1.4001874414245548, + "grad_norm": 52670.94140625, + "learning_rate": 6.939702608977639e-05, + "loss": 2.168, + "step": 7470 + }, + { + "epoch": 1.4003748828491096, + "grad_norm": 54636.76171875, + "learning_rate": 6.938978333658057e-05, + "loss": 2.2267, + "step": 7471 + }, + { + "epoch": 1.4005623242736644, + "grad_norm": 49463.4765625, + "learning_rate": 6.938254010448225e-05, + "loss": 2.1922, + "step": 7472 + }, + { + "epoch": 1.4007497656982193, + "grad_norm": 49209.375, + "learning_rate": 6.937529639366031e-05, + "loss": 2.2688, + "step": 7473 + }, + { + "epoch": 1.4009372071227741, + "grad_norm": 54272.45703125, + "learning_rate": 6.93680522042937e-05, + "loss": 2.2929, + "step": 7474 + }, + { + "epoch": 1.401124648547329, + "grad_norm": 55383.19140625, + "learning_rate": 6.936080753656129e-05, + "loss": 2.3201, + "step": 7475 + }, + { + "epoch": 1.4013120899718838, + "grad_norm": 48629.28125, + "learning_rate": 6.935356239064202e-05, + "loss": 2.2573, + "step": 7476 + }, + { + "epoch": 1.4014995313964387, + "grad_norm": 54855.9453125, + "learning_rate": 6.934631676671486e-05, + "loss": 2.2408, + "step": 7477 + }, + { + "epoch": 1.4016869728209933, + "grad_norm": 50870.04296875, + "learning_rate": 6.933907066495877e-05, + "loss": 2.2777, + "step": 7478 + }, + { + "epoch": 1.4018744142455484, + "grad_norm": 53025.09765625, + "learning_rate": 6.93318240855527e-05, + "loss": 2.2334, + "step": 7479 + }, + { + "epoch": 1.402061855670103, + "grad_norm": 53008.16796875, + "learning_rate": 6.932457702867564e-05, + "loss": 2.186, + "step": 7480 + }, + { + "epoch": 1.4022492970946578, + "grad_norm": 55084.54296875, + "learning_rate": 6.931732949450659e-05, + "loss": 2.2971, + "step": 7481 + }, + { + "epoch": 1.4024367385192127, + "grad_norm": 50930.69140625, + "learning_rate": 6.931008148322454e-05, + "loss": 2.2205, + "step": 7482 + }, + { + "epoch": 1.4026241799437675, + "grad_norm": 50297.54296875, + "learning_rate": 6.930283299500851e-05, + "loss": 2.2085, + "step": 7483 + }, + { + "epoch": 1.4028116213683224, + "grad_norm": 51826.2109375, + "learning_rate": 6.929558403003754e-05, + "loss": 2.352, + "step": 7484 + }, + { + "epoch": 1.4029990627928772, + "grad_norm": 53095.53125, + "learning_rate": 6.928833458849065e-05, + "loss": 2.1656, + "step": 7485 + }, + { + "epoch": 1.403186504217432, + "grad_norm": 51307.890625, + "learning_rate": 6.928108467054692e-05, + "loss": 2.1814, + "step": 7486 + }, + { + "epoch": 1.403373945641987, + "grad_norm": 48428.1484375, + "learning_rate": 6.927383427638539e-05, + "loss": 2.2497, + "step": 7487 + }, + { + "epoch": 1.4035613870665418, + "grad_norm": 56373.265625, + "learning_rate": 6.926658340618512e-05, + "loss": 2.189, + "step": 7488 + }, + { + "epoch": 1.4037488284910966, + "grad_norm": 49481.96875, + "learning_rate": 6.925933206012524e-05, + "loss": 2.3122, + "step": 7489 + }, + { + "epoch": 1.4039362699156515, + "grad_norm": 54518.359375, + "learning_rate": 6.925208023838485e-05, + "loss": 2.1971, + "step": 7490 + }, + { + "epoch": 1.404123711340206, + "grad_norm": 50881.1640625, + "learning_rate": 6.9244827941143e-05, + "loss": 2.2777, + "step": 7491 + }, + { + "epoch": 1.4043111527647612, + "grad_norm": 50899.4921875, + "learning_rate": 6.923757516857888e-05, + "loss": 2.2246, + "step": 7492 + }, + { + "epoch": 1.4044985941893158, + "grad_norm": 48843.58984375, + "learning_rate": 6.923032192087158e-05, + "loss": 2.2713, + "step": 7493 + }, + { + "epoch": 1.4046860356138706, + "grad_norm": 50393.74609375, + "learning_rate": 6.922306819820026e-05, + "loss": 2.2049, + "step": 7494 + }, + { + "epoch": 1.4048734770384255, + "grad_norm": 50547.8671875, + "learning_rate": 6.921581400074407e-05, + "loss": 2.2028, + "step": 7495 + }, + { + "epoch": 1.4050609184629803, + "grad_norm": 51239.54296875, + "learning_rate": 6.92085593286822e-05, + "loss": 2.2463, + "step": 7496 + }, + { + "epoch": 1.4052483598875352, + "grad_norm": 59318.8359375, + "learning_rate": 6.920130418219383e-05, + "loss": 2.3146, + "step": 7497 + }, + { + "epoch": 1.40543580131209, + "grad_norm": 48907.30078125, + "learning_rate": 6.919404856145812e-05, + "loss": 2.2544, + "step": 7498 + }, + { + "epoch": 1.4056232427366449, + "grad_norm": 48388.83203125, + "learning_rate": 6.918679246665432e-05, + "loss": 2.3062, + "step": 7499 + }, + { + "epoch": 1.4058106841611997, + "grad_norm": 56194.828125, + "learning_rate": 6.91795358979616e-05, + "loss": 2.1981, + "step": 7500 + }, + { + "epoch": 1.4058106841611997, + "eval_loss": 2.304192304611206, + "eval_runtime": 129.1431, + "eval_samples_per_second": 39.096, + "eval_steps_per_second": 1.959, + "step": 7500 + }, + { + "epoch": 1.4059981255857545, + "grad_norm": 51638.67578125, + "learning_rate": 6.917227885555922e-05, + "loss": 2.2449, + "step": 7501 + }, + { + "epoch": 1.4061855670103092, + "grad_norm": 45686.0703125, + "learning_rate": 6.91650213396264e-05, + "loss": 2.1995, + "step": 7502 + }, + { + "epoch": 1.4063730084348642, + "grad_norm": 47261.5546875, + "learning_rate": 6.915776335034241e-05, + "loss": 2.2235, + "step": 7503 + }, + { + "epoch": 1.4065604498594189, + "grad_norm": 46328.0703125, + "learning_rate": 6.91505048878865e-05, + "loss": 2.2544, + "step": 7504 + }, + { + "epoch": 1.4067478912839737, + "grad_norm": 50049.0546875, + "learning_rate": 6.914324595243795e-05, + "loss": 2.299, + "step": 7505 + }, + { + "epoch": 1.4069353327085286, + "grad_norm": 46844.6484375, + "learning_rate": 6.913598654417603e-05, + "loss": 2.2464, + "step": 7506 + }, + { + "epoch": 1.4071227741330834, + "grad_norm": 51998.390625, + "learning_rate": 6.912872666328009e-05, + "loss": 2.2569, + "step": 7507 + }, + { + "epoch": 1.4073102155576382, + "grad_norm": 48596.96484375, + "learning_rate": 6.912146630992937e-05, + "loss": 2.2497, + "step": 7508 + }, + { + "epoch": 1.407497656982193, + "grad_norm": 48524.31640625, + "learning_rate": 6.911420548430323e-05, + "loss": 2.2873, + "step": 7509 + }, + { + "epoch": 1.407685098406748, + "grad_norm": 49560.61328125, + "learning_rate": 6.910694418658101e-05, + "loss": 2.2119, + "step": 7510 + }, + { + "epoch": 1.4078725398313028, + "grad_norm": 53491.28515625, + "learning_rate": 6.909968241694203e-05, + "loss": 2.246, + "step": 7511 + }, + { + "epoch": 1.4080599812558576, + "grad_norm": 51810.91015625, + "learning_rate": 6.909242017556565e-05, + "loss": 2.2086, + "step": 7512 + }, + { + "epoch": 1.4082474226804123, + "grad_norm": 57120.4453125, + "learning_rate": 6.908515746263125e-05, + "loss": 2.2622, + "step": 7513 + }, + { + "epoch": 1.4084348641049673, + "grad_norm": 52597.7109375, + "learning_rate": 6.907789427831822e-05, + "loss": 2.2467, + "step": 7514 + }, + { + "epoch": 1.408622305529522, + "grad_norm": 52534.078125, + "learning_rate": 6.907063062280592e-05, + "loss": 2.3126, + "step": 7515 + }, + { + "epoch": 1.4088097469540768, + "grad_norm": 51030.47265625, + "learning_rate": 6.906336649627377e-05, + "loss": 2.1476, + "step": 7516 + }, + { + "epoch": 1.4089971883786316, + "grad_norm": 52660.3515625, + "learning_rate": 6.905610189890118e-05, + "loss": 2.1612, + "step": 7517 + }, + { + "epoch": 1.4091846298031865, + "grad_norm": 48614.0703125, + "learning_rate": 6.90488368308676e-05, + "loss": 2.295, + "step": 7518 + }, + { + "epoch": 1.4093720712277413, + "grad_norm": 48471.86328125, + "learning_rate": 6.904157129235242e-05, + "loss": 2.3375, + "step": 7519 + }, + { + "epoch": 1.4095595126522962, + "grad_norm": 49839.828125, + "learning_rate": 6.903430528353514e-05, + "loss": 2.1854, + "step": 7520 + }, + { + "epoch": 1.409746954076851, + "grad_norm": 56092.1796875, + "learning_rate": 6.902703880459517e-05, + "loss": 2.2443, + "step": 7521 + }, + { + "epoch": 1.4099343955014059, + "grad_norm": 51004.203125, + "learning_rate": 6.901977185571205e-05, + "loss": 2.2507, + "step": 7522 + }, + { + "epoch": 1.4101218369259607, + "grad_norm": 49062.8359375, + "learning_rate": 6.90125044370652e-05, + "loss": 2.2547, + "step": 7523 + }, + { + "epoch": 1.4103092783505153, + "grad_norm": 48435.5546875, + "learning_rate": 6.900523654883414e-05, + "loss": 2.2497, + "step": 7524 + }, + { + "epoch": 1.4104967197750704, + "grad_norm": 58519.97265625, + "learning_rate": 6.899796819119839e-05, + "loss": 2.2137, + "step": 7525 + }, + { + "epoch": 1.410684161199625, + "grad_norm": 49867.8984375, + "learning_rate": 6.899069936433744e-05, + "loss": 2.2395, + "step": 7526 + }, + { + "epoch": 1.4108716026241799, + "grad_norm": 48551.16015625, + "learning_rate": 6.898343006843086e-05, + "loss": 2.1806, + "step": 7527 + }, + { + "epoch": 1.4110590440487347, + "grad_norm": 52905.8203125, + "learning_rate": 6.897616030365815e-05, + "loss": 2.2159, + "step": 7528 + }, + { + "epoch": 1.4112464854732896, + "grad_norm": 52404.5859375, + "learning_rate": 6.89688900701989e-05, + "loss": 2.2688, + "step": 7529 + }, + { + "epoch": 1.4114339268978444, + "grad_norm": 52468.20703125, + "learning_rate": 6.896161936823264e-05, + "loss": 2.1717, + "step": 7530 + }, + { + "epoch": 1.4116213683223993, + "grad_norm": 50889.2734375, + "learning_rate": 6.895434819793898e-05, + "loss": 2.2776, + "step": 7531 + }, + { + "epoch": 1.411808809746954, + "grad_norm": 51377.41796875, + "learning_rate": 6.894707655949749e-05, + "loss": 2.2136, + "step": 7532 + }, + { + "epoch": 1.411996251171509, + "grad_norm": 56092.98828125, + "learning_rate": 6.893980445308776e-05, + "loss": 2.2183, + "step": 7533 + }, + { + "epoch": 1.4121836925960638, + "grad_norm": 49958.83984375, + "learning_rate": 6.893253187888943e-05, + "loss": 2.3319, + "step": 7534 + }, + { + "epoch": 1.4123711340206184, + "grad_norm": 53152.19140625, + "learning_rate": 6.892525883708212e-05, + "loss": 2.2384, + "step": 7535 + }, + { + "epoch": 1.4125585754451735, + "grad_norm": 53197.984375, + "learning_rate": 6.891798532784543e-05, + "loss": 2.272, + "step": 7536 + }, + { + "epoch": 1.4127460168697281, + "grad_norm": 52096.1875, + "learning_rate": 6.891071135135905e-05, + "loss": 2.2772, + "step": 7537 + }, + { + "epoch": 1.412933458294283, + "grad_norm": 51285.8046875, + "learning_rate": 6.890343690780261e-05, + "loss": 2.2739, + "step": 7538 + }, + { + "epoch": 1.4131208997188378, + "grad_norm": 51492.54296875, + "learning_rate": 6.889616199735578e-05, + "loss": 2.216, + "step": 7539 + }, + { + "epoch": 1.4133083411433927, + "grad_norm": 49980.703125, + "learning_rate": 6.888888662019826e-05, + "loss": 2.2246, + "step": 7540 + }, + { + "epoch": 1.4134957825679475, + "grad_norm": 53246.87890625, + "learning_rate": 6.888161077650971e-05, + "loss": 2.2586, + "step": 7541 + }, + { + "epoch": 1.4136832239925023, + "grad_norm": 50161.51171875, + "learning_rate": 6.887433446646988e-05, + "loss": 2.3255, + "step": 7542 + }, + { + "epoch": 1.4138706654170572, + "grad_norm": 48515.6640625, + "learning_rate": 6.886705769025844e-05, + "loss": 2.2379, + "step": 7543 + }, + { + "epoch": 1.414058106841612, + "grad_norm": 53753.01171875, + "learning_rate": 6.885978044805514e-05, + "loss": 2.298, + "step": 7544 + }, + { + "epoch": 1.4142455482661669, + "grad_norm": 52701.76171875, + "learning_rate": 6.885250274003973e-05, + "loss": 2.2288, + "step": 7545 + }, + { + "epoch": 1.4144329896907217, + "grad_norm": 51065.25390625, + "learning_rate": 6.884522456639193e-05, + "loss": 2.2839, + "step": 7546 + }, + { + "epoch": 1.4146204311152766, + "grad_norm": 48793.16015625, + "learning_rate": 6.883794592729151e-05, + "loss": 2.1738, + "step": 7547 + }, + { + "epoch": 1.4148078725398312, + "grad_norm": 52792.21875, + "learning_rate": 6.883066682291828e-05, + "loss": 2.3226, + "step": 7548 + }, + { + "epoch": 1.414995313964386, + "grad_norm": 50994.703125, + "learning_rate": 6.882338725345196e-05, + "loss": 2.2324, + "step": 7549 + }, + { + "epoch": 1.415182755388941, + "grad_norm": 58646.1015625, + "learning_rate": 6.88161072190724e-05, + "loss": 2.2091, + "step": 7550 + }, + { + "epoch": 1.4153701968134957, + "grad_norm": 48080.96875, + "learning_rate": 6.880882671995939e-05, + "loss": 2.2261, + "step": 7551 + }, + { + "epoch": 1.4155576382380506, + "grad_norm": 49719.7734375, + "learning_rate": 6.880154575629273e-05, + "loss": 2.2562, + "step": 7552 + }, + { + "epoch": 1.4157450796626054, + "grad_norm": 47865.52734375, + "learning_rate": 6.879426432825227e-05, + "loss": 2.1949, + "step": 7553 + }, + { + "epoch": 1.4159325210871603, + "grad_norm": 54136.21484375, + "learning_rate": 6.878698243601785e-05, + "loss": 2.2888, + "step": 7554 + }, + { + "epoch": 1.4161199625117151, + "grad_norm": 51340.3515625, + "learning_rate": 6.877970007976933e-05, + "loss": 2.2339, + "step": 7555 + }, + { + "epoch": 1.41630740393627, + "grad_norm": 50947.0859375, + "learning_rate": 6.877241725968656e-05, + "loss": 2.2371, + "step": 7556 + }, + { + "epoch": 1.4164948453608248, + "grad_norm": 55104.875, + "learning_rate": 6.876513397594942e-05, + "loss": 2.245, + "step": 7557 + }, + { + "epoch": 1.4166822867853797, + "grad_norm": 54854.65625, + "learning_rate": 6.87578502287378e-05, + "loss": 2.2699, + "step": 7558 + }, + { + "epoch": 1.4168697282099343, + "grad_norm": 49478.9453125, + "learning_rate": 6.87505660182316e-05, + "loss": 2.2169, + "step": 7559 + }, + { + "epoch": 1.4170571696344894, + "grad_norm": 47594.98046875, + "learning_rate": 6.874328134461072e-05, + "loss": 2.3189, + "step": 7560 + }, + { + "epoch": 1.417244611059044, + "grad_norm": 51104.7578125, + "learning_rate": 6.87359962080551e-05, + "loss": 2.3029, + "step": 7561 + }, + { + "epoch": 1.4174320524835988, + "grad_norm": 51607.66796875, + "learning_rate": 6.872871060874467e-05, + "loss": 2.2656, + "step": 7562 + }, + { + "epoch": 1.4176194939081537, + "grad_norm": 50386.3984375, + "learning_rate": 6.872142454685939e-05, + "loss": 2.1992, + "step": 7563 + }, + { + "epoch": 1.4178069353327085, + "grad_norm": 54200.93359375, + "learning_rate": 6.871413802257916e-05, + "loss": 2.2268, + "step": 7564 + }, + { + "epoch": 1.4179943767572634, + "grad_norm": 50410.33984375, + "learning_rate": 6.870685103608401e-05, + "loss": 2.2711, + "step": 7565 + }, + { + "epoch": 1.4181818181818182, + "grad_norm": 49771.24609375, + "learning_rate": 6.869956358755388e-05, + "loss": 2.2255, + "step": 7566 + }, + { + "epoch": 1.418369259606373, + "grad_norm": 55662.71875, + "learning_rate": 6.869227567716877e-05, + "loss": 2.3256, + "step": 7567 + }, + { + "epoch": 1.418556701030928, + "grad_norm": 49260.71484375, + "learning_rate": 6.868498730510869e-05, + "loss": 2.2223, + "step": 7568 + }, + { + "epoch": 1.4187441424554827, + "grad_norm": 49399.38671875, + "learning_rate": 6.867769847155365e-05, + "loss": 2.2993, + "step": 7569 + }, + { + "epoch": 1.4189315838800374, + "grad_norm": 53983.0078125, + "learning_rate": 6.867040917668368e-05, + "loss": 2.2793, + "step": 7570 + }, + { + "epoch": 1.4191190253045924, + "grad_norm": 47771.43359375, + "learning_rate": 6.866311942067881e-05, + "loss": 2.2574, + "step": 7571 + }, + { + "epoch": 1.419306466729147, + "grad_norm": 52279.94921875, + "learning_rate": 6.865582920371907e-05, + "loss": 2.1789, + "step": 7572 + }, + { + "epoch": 1.419493908153702, + "grad_norm": 47827.27734375, + "learning_rate": 6.864853852598455e-05, + "loss": 2.2877, + "step": 7573 + }, + { + "epoch": 1.4196813495782568, + "grad_norm": 49505.16796875, + "learning_rate": 6.864124738765529e-05, + "loss": 2.2543, + "step": 7574 + }, + { + "epoch": 1.4198687910028116, + "grad_norm": 50850.10546875, + "learning_rate": 6.863395578891141e-05, + "loss": 2.2204, + "step": 7575 + }, + { + "epoch": 1.4200562324273664, + "grad_norm": 51211.05859375, + "learning_rate": 6.862666372993298e-05, + "loss": 2.2824, + "step": 7576 + }, + { + "epoch": 1.4202436738519213, + "grad_norm": 55173.640625, + "learning_rate": 6.86193712109001e-05, + "loss": 2.277, + "step": 7577 + }, + { + "epoch": 1.4204311152764761, + "grad_norm": 47653.64453125, + "learning_rate": 6.861207823199289e-05, + "loss": 2.2004, + "step": 7578 + }, + { + "epoch": 1.420618556701031, + "grad_norm": 54796.2265625, + "learning_rate": 6.860478479339147e-05, + "loss": 2.2522, + "step": 7579 + }, + { + "epoch": 1.4208059981255858, + "grad_norm": 53381.69921875, + "learning_rate": 6.8597490895276e-05, + "loss": 2.4046, + "step": 7580 + }, + { + "epoch": 1.4209934395501405, + "grad_norm": 56142.296875, + "learning_rate": 6.859019653782661e-05, + "loss": 2.2258, + "step": 7581 + }, + { + "epoch": 1.4211808809746955, + "grad_norm": 53200.35546875, + "learning_rate": 6.858290172122348e-05, + "loss": 2.1963, + "step": 7582 + }, + { + "epoch": 1.4213683223992502, + "grad_norm": 51259.38671875, + "learning_rate": 6.857560644564677e-05, + "loss": 2.133, + "step": 7583 + }, + { + "epoch": 1.421555763823805, + "grad_norm": 54459.0625, + "learning_rate": 6.856831071127664e-05, + "loss": 2.217, + "step": 7584 + }, + { + "epoch": 1.4217432052483598, + "grad_norm": 51198.38671875, + "learning_rate": 6.856101451829333e-05, + "loss": 2.2652, + "step": 7585 + }, + { + "epoch": 1.4219306466729147, + "grad_norm": 55889.2109375, + "learning_rate": 6.855371786687703e-05, + "loss": 2.183, + "step": 7586 + }, + { + "epoch": 1.4221180880974695, + "grad_norm": 50620.53125, + "learning_rate": 6.854642075720794e-05, + "loss": 2.209, + "step": 7587 + }, + { + "epoch": 1.4223055295220244, + "grad_norm": 51336.90234375, + "learning_rate": 6.853912318946632e-05, + "loss": 2.1862, + "step": 7588 + }, + { + "epoch": 1.4224929709465792, + "grad_norm": 48596.8125, + "learning_rate": 6.853182516383238e-05, + "loss": 2.2463, + "step": 7589 + }, + { + "epoch": 1.422680412371134, + "grad_norm": 50440.95703125, + "learning_rate": 6.852452668048638e-05, + "loss": 2.2267, + "step": 7590 + }, + { + "epoch": 1.422867853795689, + "grad_norm": 50086.59375, + "learning_rate": 6.85172277396086e-05, + "loss": 2.2637, + "step": 7591 + }, + { + "epoch": 1.4230552952202435, + "grad_norm": 50972.46875, + "learning_rate": 6.850992834137929e-05, + "loss": 2.2392, + "step": 7592 + }, + { + "epoch": 1.4232427366447986, + "grad_norm": 51882.71875, + "learning_rate": 6.850262848597877e-05, + "loss": 2.2181, + "step": 7593 + }, + { + "epoch": 1.4234301780693532, + "grad_norm": 51049.546875, + "learning_rate": 6.84953281735873e-05, + "loss": 2.219, + "step": 7594 + }, + { + "epoch": 1.423617619493908, + "grad_norm": 52851.375, + "learning_rate": 6.84880274043852e-05, + "loss": 2.2822, + "step": 7595 + }, + { + "epoch": 1.423805060918463, + "grad_norm": 53043.6328125, + "learning_rate": 6.84807261785528e-05, + "loss": 2.2027, + "step": 7596 + }, + { + "epoch": 1.4239925023430178, + "grad_norm": 48044.87109375, + "learning_rate": 6.847342449627042e-05, + "loss": 2.2407, + "step": 7597 + }, + { + "epoch": 1.4241799437675726, + "grad_norm": 52616.203125, + "learning_rate": 6.846612235771843e-05, + "loss": 2.2071, + "step": 7598 + }, + { + "epoch": 1.4243673851921275, + "grad_norm": 49948.3046875, + "learning_rate": 6.845881976307712e-05, + "loss": 2.2496, + "step": 7599 + }, + { + "epoch": 1.4245548266166823, + "grad_norm": 50909.0625, + "learning_rate": 6.845151671252692e-05, + "loss": 2.2455, + "step": 7600 + }, + { + "epoch": 1.4247422680412372, + "grad_norm": 51488.30859375, + "learning_rate": 6.844421320624817e-05, + "loss": 2.1659, + "step": 7601 + }, + { + "epoch": 1.424929709465792, + "grad_norm": 50085.11328125, + "learning_rate": 6.843690924442127e-05, + "loss": 2.2372, + "step": 7602 + }, + { + "epoch": 1.4251171508903466, + "grad_norm": 53249.8046875, + "learning_rate": 6.842960482722662e-05, + "loss": 2.2241, + "step": 7603 + }, + { + "epoch": 1.4253045923149017, + "grad_norm": 47974.22265625, + "learning_rate": 6.842229995484462e-05, + "loss": 2.3007, + "step": 7604 + }, + { + "epoch": 1.4254920337394563, + "grad_norm": 50532.46875, + "learning_rate": 6.84149946274557e-05, + "loss": 2.2016, + "step": 7605 + }, + { + "epoch": 1.4256794751640112, + "grad_norm": 48145.41015625, + "learning_rate": 6.840768884524029e-05, + "loss": 2.182, + "step": 7606 + }, + { + "epoch": 1.425866916588566, + "grad_norm": 48691.23046875, + "learning_rate": 6.840038260837883e-05, + "loss": 2.1873, + "step": 7607 + }, + { + "epoch": 1.4260543580131209, + "grad_norm": 58211.88671875, + "learning_rate": 6.839307591705177e-05, + "loss": 2.2661, + "step": 7608 + }, + { + "epoch": 1.4262417994376757, + "grad_norm": 46230.66015625, + "learning_rate": 6.838576877143959e-05, + "loss": 2.2874, + "step": 7609 + }, + { + "epoch": 1.4264292408622306, + "grad_norm": 55659.91015625, + "learning_rate": 6.837846117172275e-05, + "loss": 2.2321, + "step": 7610 + }, + { + "epoch": 1.4266166822867854, + "grad_norm": 49995.52734375, + "learning_rate": 6.837115311808176e-05, + "loss": 2.213, + "step": 7611 + }, + { + "epoch": 1.4268041237113402, + "grad_norm": 54199.8515625, + "learning_rate": 6.836384461069708e-05, + "loss": 2.3102, + "step": 7612 + }, + { + "epoch": 1.426991565135895, + "grad_norm": 50913.82421875, + "learning_rate": 6.835653564974927e-05, + "loss": 2.2066, + "step": 7613 + }, + { + "epoch": 1.42717900656045, + "grad_norm": 51672.59765625, + "learning_rate": 6.834922623541882e-05, + "loss": 2.3438, + "step": 7614 + }, + { + "epoch": 1.4273664479850048, + "grad_norm": 52962.77734375, + "learning_rate": 6.834191636788628e-05, + "loss": 2.2637, + "step": 7615 + }, + { + "epoch": 1.4275538894095594, + "grad_norm": 50266.1328125, + "learning_rate": 6.833460604733218e-05, + "loss": 2.1644, + "step": 7616 + }, + { + "epoch": 1.4277413308341145, + "grad_norm": 50439.49609375, + "learning_rate": 6.832729527393708e-05, + "loss": 2.2679, + "step": 7617 + }, + { + "epoch": 1.427928772258669, + "grad_norm": 54572.94921875, + "learning_rate": 6.831998404788155e-05, + "loss": 2.2303, + "step": 7618 + }, + { + "epoch": 1.428116213683224, + "grad_norm": 47764.65234375, + "learning_rate": 6.831267236934616e-05, + "loss": 2.2148, + "step": 7619 + }, + { + "epoch": 1.4283036551077788, + "grad_norm": 50603.0859375, + "learning_rate": 6.830536023851152e-05, + "loss": 2.2443, + "step": 7620 + }, + { + "epoch": 1.4284910965323336, + "grad_norm": 52234.75, + "learning_rate": 6.829804765555821e-05, + "loss": 2.1697, + "step": 7621 + }, + { + "epoch": 1.4286785379568885, + "grad_norm": 54604.3671875, + "learning_rate": 6.829073462066685e-05, + "loss": 2.3052, + "step": 7622 + }, + { + "epoch": 1.4288659793814433, + "grad_norm": 55054.66015625, + "learning_rate": 6.828342113401805e-05, + "loss": 2.3001, + "step": 7623 + }, + { + "epoch": 1.4290534208059982, + "grad_norm": 49929.015625, + "learning_rate": 6.827610719579247e-05, + "loss": 2.245, + "step": 7624 + }, + { + "epoch": 1.429240862230553, + "grad_norm": 54292.9140625, + "learning_rate": 6.82687928061707e-05, + "loss": 2.2406, + "step": 7625 + }, + { + "epoch": 1.4294283036551079, + "grad_norm": 52466.7734375, + "learning_rate": 6.826147796533349e-05, + "loss": 2.2637, + "step": 7626 + }, + { + "epoch": 1.4296157450796625, + "grad_norm": 50755.1640625, + "learning_rate": 6.82541626734614e-05, + "loss": 2.2443, + "step": 7627 + }, + { + "epoch": 1.4298031865042176, + "grad_norm": 50065.51953125, + "learning_rate": 6.824684693073519e-05, + "loss": 2.2097, + "step": 7628 + }, + { + "epoch": 1.4299906279287722, + "grad_norm": 46326.89453125, + "learning_rate": 6.82395307373355e-05, + "loss": 2.2015, + "step": 7629 + }, + { + "epoch": 1.430178069353327, + "grad_norm": 49844.890625, + "learning_rate": 6.823221409344305e-05, + "loss": 2.269, + "step": 7630 + }, + { + "epoch": 1.4303655107778819, + "grad_norm": 58154.75390625, + "learning_rate": 6.822489699923855e-05, + "loss": 2.1973, + "step": 7631 + }, + { + "epoch": 1.4305529522024367, + "grad_norm": 52189.69140625, + "learning_rate": 6.821757945490274e-05, + "loss": 2.2363, + "step": 7632 + }, + { + "epoch": 1.4307403936269916, + "grad_norm": 48648.12109375, + "learning_rate": 6.821026146061633e-05, + "loss": 2.22, + "step": 7633 + }, + { + "epoch": 1.4309278350515464, + "grad_norm": 52668.5, + "learning_rate": 6.820294301656004e-05, + "loss": 2.2387, + "step": 7634 + }, + { + "epoch": 1.4311152764761013, + "grad_norm": 46912.91015625, + "learning_rate": 6.81956241229147e-05, + "loss": 2.2308, + "step": 7635 + }, + { + "epoch": 1.431302717900656, + "grad_norm": 50342.36328125, + "learning_rate": 6.818830477986102e-05, + "loss": 2.2233, + "step": 7636 + }, + { + "epoch": 1.431490159325211, + "grad_norm": 60098.30859375, + "learning_rate": 6.81809849875798e-05, + "loss": 2.4843, + "step": 7637 + }, + { + "epoch": 1.4316776007497656, + "grad_norm": 51859.74609375, + "learning_rate": 6.81736647462518e-05, + "loss": 2.1935, + "step": 7638 + }, + { + "epoch": 1.4318650421743206, + "grad_norm": 53460.19921875, + "learning_rate": 6.816634405605788e-05, + "loss": 2.3004, + "step": 7639 + }, + { + "epoch": 1.4320524835988753, + "grad_norm": 54865.69921875, + "learning_rate": 6.815902291717878e-05, + "loss": 2.3209, + "step": 7640 + }, + { + "epoch": 1.4322399250234301, + "grad_norm": 49107.77734375, + "learning_rate": 6.815170132979538e-05, + "loss": 2.2605, + "step": 7641 + }, + { + "epoch": 1.432427366447985, + "grad_norm": 51360.37109375, + "learning_rate": 6.814437929408848e-05, + "loss": 2.258, + "step": 7642 + }, + { + "epoch": 1.4326148078725398, + "grad_norm": 50971.53515625, + "learning_rate": 6.813705681023894e-05, + "loss": 2.2571, + "step": 7643 + }, + { + "epoch": 1.4328022492970947, + "grad_norm": 48032.6171875, + "learning_rate": 6.812973387842762e-05, + "loss": 2.2536, + "step": 7644 + }, + { + "epoch": 1.4329896907216495, + "grad_norm": 49270.54296875, + "learning_rate": 6.812241049883536e-05, + "loss": 2.2071, + "step": 7645 + }, + { + "epoch": 1.4331771321462043, + "grad_norm": 50681.23828125, + "learning_rate": 6.811508667164307e-05, + "loss": 2.2504, + "step": 7646 + }, + { + "epoch": 1.4333645735707592, + "grad_norm": 51406.34765625, + "learning_rate": 6.810776239703162e-05, + "loss": 2.2325, + "step": 7647 + }, + { + "epoch": 1.433552014995314, + "grad_norm": 50904.96875, + "learning_rate": 6.810043767518192e-05, + "loss": 2.1696, + "step": 7648 + }, + { + "epoch": 1.4337394564198687, + "grad_norm": 54327.32421875, + "learning_rate": 6.809311250627489e-05, + "loss": 2.2573, + "step": 7649 + }, + { + "epoch": 1.4339268978444237, + "grad_norm": 51880.30859375, + "learning_rate": 6.808578689049142e-05, + "loss": 2.2037, + "step": 7650 + }, + { + "epoch": 1.4341143392689784, + "grad_norm": 50165.42578125, + "learning_rate": 6.807846082801246e-05, + "loss": 2.2364, + "step": 7651 + }, + { + "epoch": 1.4343017806935332, + "grad_norm": 48844.55078125, + "learning_rate": 6.807113431901895e-05, + "loss": 2.2156, + "step": 7652 + }, + { + "epoch": 1.434489222118088, + "grad_norm": 52051.1015625, + "learning_rate": 6.806380736369187e-05, + "loss": 2.2667, + "step": 7653 + }, + { + "epoch": 1.434676663542643, + "grad_norm": 47400.63671875, + "learning_rate": 6.805647996221216e-05, + "loss": 2.3058, + "step": 7654 + }, + { + "epoch": 1.4348641049671977, + "grad_norm": 51258.4765625, + "learning_rate": 6.804915211476082e-05, + "loss": 2.2542, + "step": 7655 + }, + { + "epoch": 1.4350515463917526, + "grad_norm": 54500.25, + "learning_rate": 6.804182382151881e-05, + "loss": 2.1381, + "step": 7656 + }, + { + "epoch": 1.4352389878163074, + "grad_norm": 52403.82421875, + "learning_rate": 6.803449508266714e-05, + "loss": 2.2126, + "step": 7657 + }, + { + "epoch": 1.4354264292408623, + "grad_norm": 51713.7265625, + "learning_rate": 6.802716589838683e-05, + "loss": 2.2258, + "step": 7658 + }, + { + "epoch": 1.4356138706654171, + "grad_norm": 54834.3046875, + "learning_rate": 6.801983626885889e-05, + "loss": 2.1861, + "step": 7659 + }, + { + "epoch": 1.4358013120899717, + "grad_norm": 49901.90625, + "learning_rate": 6.801250619426437e-05, + "loss": 2.2295, + "step": 7660 + }, + { + "epoch": 1.4359887535145268, + "grad_norm": 53967.85546875, + "learning_rate": 6.800517567478429e-05, + "loss": 2.2642, + "step": 7661 + }, + { + "epoch": 1.4361761949390814, + "grad_norm": 47381.57421875, + "learning_rate": 6.79978447105997e-05, + "loss": 2.2479, + "step": 7662 + }, + { + "epoch": 1.4363636363636363, + "grad_norm": 48467.65234375, + "learning_rate": 6.799051330189172e-05, + "loss": 2.2303, + "step": 7663 + }, + { + "epoch": 1.4365510777881911, + "grad_norm": 49325.76171875, + "learning_rate": 6.798318144884136e-05, + "loss": 2.2066, + "step": 7664 + }, + { + "epoch": 1.436738519212746, + "grad_norm": 48691.91015625, + "learning_rate": 6.797584915162973e-05, + "loss": 2.2257, + "step": 7665 + }, + { + "epoch": 1.4369259606373008, + "grad_norm": 52890.671875, + "learning_rate": 6.796851641043794e-05, + "loss": 2.2802, + "step": 7666 + }, + { + "epoch": 1.4371134020618557, + "grad_norm": 49554.79296875, + "learning_rate": 6.796118322544708e-05, + "loss": 2.2359, + "step": 7667 + }, + { + "epoch": 1.4373008434864105, + "grad_norm": 48732.6328125, + "learning_rate": 6.795384959683831e-05, + "loss": 2.2809, + "step": 7668 + }, + { + "epoch": 1.4374882849109654, + "grad_norm": 51253.62109375, + "learning_rate": 6.794651552479273e-05, + "loss": 2.2504, + "step": 7669 + }, + { + "epoch": 1.4376757263355202, + "grad_norm": 52372.24609375, + "learning_rate": 6.793918100949145e-05, + "loss": 2.2549, + "step": 7670 + }, + { + "epoch": 1.437863167760075, + "grad_norm": 48692.59765625, + "learning_rate": 6.793184605111568e-05, + "loss": 2.2457, + "step": 7671 + }, + { + "epoch": 1.43805060918463, + "grad_norm": 52303.31640625, + "learning_rate": 6.792451064984655e-05, + "loss": 2.2364, + "step": 7672 + }, + { + "epoch": 1.4382380506091845, + "grad_norm": 51076.921875, + "learning_rate": 6.791717480586525e-05, + "loss": 2.2419, + "step": 7673 + }, + { + "epoch": 1.4384254920337396, + "grad_norm": 54011.3203125, + "learning_rate": 6.790983851935298e-05, + "loss": 2.1487, + "step": 7674 + }, + { + "epoch": 1.4386129334582942, + "grad_norm": 50705.25390625, + "learning_rate": 6.79025017904909e-05, + "loss": 2.2439, + "step": 7675 + }, + { + "epoch": 1.438800374882849, + "grad_norm": 49540.58203125, + "learning_rate": 6.789516461946023e-05, + "loss": 2.2348, + "step": 7676 + }, + { + "epoch": 1.438987816307404, + "grad_norm": 54726.5078125, + "learning_rate": 6.788782700644222e-05, + "loss": 2.2188, + "step": 7677 + }, + { + "epoch": 1.4391752577319588, + "grad_norm": 54384.30859375, + "learning_rate": 6.788048895161807e-05, + "loss": 2.2253, + "step": 7678 + }, + { + "epoch": 1.4393626991565136, + "grad_norm": 51883.0078125, + "learning_rate": 6.787315045516901e-05, + "loss": 2.2081, + "step": 7679 + }, + { + "epoch": 1.4395501405810684, + "grad_norm": 53095.90234375, + "learning_rate": 6.786581151727631e-05, + "loss": 2.2896, + "step": 7680 + }, + { + "epoch": 1.4397375820056233, + "grad_norm": 51676.91015625, + "learning_rate": 6.785847213812124e-05, + "loss": 2.2497, + "step": 7681 + }, + { + "epoch": 1.4399250234301781, + "grad_norm": 51529.18359375, + "learning_rate": 6.785113231788504e-05, + "loss": 2.1824, + "step": 7682 + }, + { + "epoch": 1.440112464854733, + "grad_norm": 57528.7109375, + "learning_rate": 6.784379205674904e-05, + "loss": 2.2476, + "step": 7683 + }, + { + "epoch": 1.4402999062792876, + "grad_norm": 60536.06640625, + "learning_rate": 6.783645135489452e-05, + "loss": 2.2208, + "step": 7684 + }, + { + "epoch": 1.4404873477038427, + "grad_norm": 48138.36328125, + "learning_rate": 6.782911021250274e-05, + "loss": 2.2309, + "step": 7685 + }, + { + "epoch": 1.4406747891283973, + "grad_norm": 53786.77734375, + "learning_rate": 6.782176862975509e-05, + "loss": 2.215, + "step": 7686 + }, + { + "epoch": 1.4408622305529522, + "grad_norm": 44802.83203125, + "learning_rate": 6.781442660683282e-05, + "loss": 2.1775, + "step": 7687 + }, + { + "epoch": 1.441049671977507, + "grad_norm": 46204.14453125, + "learning_rate": 6.780708414391735e-05, + "loss": 2.2911, + "step": 7688 + }, + { + "epoch": 1.4412371134020618, + "grad_norm": 50928.671875, + "learning_rate": 6.779974124118997e-05, + "loss": 2.2922, + "step": 7689 + }, + { + "epoch": 1.4414245548266167, + "grad_norm": 50182.62890625, + "learning_rate": 6.779239789883206e-05, + "loss": 2.289, + "step": 7690 + }, + { + "epoch": 1.4416119962511715, + "grad_norm": 50009.703125, + "learning_rate": 6.7785054117025e-05, + "loss": 2.2951, + "step": 7691 + }, + { + "epoch": 1.4417994376757264, + "grad_norm": 52446.734375, + "learning_rate": 6.777770989595014e-05, + "loss": 2.2129, + "step": 7692 + }, + { + "epoch": 1.4419868791002812, + "grad_norm": 48662.96875, + "learning_rate": 6.777036523578893e-05, + "loss": 2.2266, + "step": 7693 + }, + { + "epoch": 1.442174320524836, + "grad_norm": 49817.69140625, + "learning_rate": 6.776302013672271e-05, + "loss": 2.2415, + "step": 7694 + }, + { + "epoch": 1.4423617619493907, + "grad_norm": 50095.32421875, + "learning_rate": 6.775567459893293e-05, + "loss": 2.1897, + "step": 7695 + }, + { + "epoch": 1.4425492033739458, + "grad_norm": 49379.8046875, + "learning_rate": 6.7748328622601e-05, + "loss": 2.2292, + "step": 7696 + }, + { + "epoch": 1.4427366447985004, + "grad_norm": 49466.9453125, + "learning_rate": 6.774098220790836e-05, + "loss": 2.3046, + "step": 7697 + }, + { + "epoch": 1.4429240862230552, + "grad_norm": 44815.46875, + "learning_rate": 6.773363535503647e-05, + "loss": 2.1952, + "step": 7698 + }, + { + "epoch": 1.44311152764761, + "grad_norm": 54422.75, + "learning_rate": 6.77262880641668e-05, + "loss": 2.2455, + "step": 7699 + }, + { + "epoch": 1.443298969072165, + "grad_norm": 48128.4140625, + "learning_rate": 6.771894033548076e-05, + "loss": 2.2151, + "step": 7700 + }, + { + "epoch": 1.4434864104967198, + "grad_norm": 53776.03125, + "learning_rate": 6.77115921691599e-05, + "loss": 2.2475, + "step": 7701 + }, + { + "epoch": 1.4436738519212746, + "grad_norm": 51920.2109375, + "learning_rate": 6.770424356538566e-05, + "loss": 2.1402, + "step": 7702 + }, + { + "epoch": 1.4438612933458295, + "grad_norm": 54328.609375, + "learning_rate": 6.769689452433957e-05, + "loss": 2.2054, + "step": 7703 + }, + { + "epoch": 1.4440487347703843, + "grad_norm": 48057.96875, + "learning_rate": 6.768954504620311e-05, + "loss": 2.2549, + "step": 7704 + }, + { + "epoch": 1.4442361761949392, + "grad_norm": 53953.06640625, + "learning_rate": 6.768219513115782e-05, + "loss": 2.2058, + "step": 7705 + }, + { + "epoch": 1.4444236176194938, + "grad_norm": 51256.82421875, + "learning_rate": 6.767484477938526e-05, + "loss": 2.1874, + "step": 7706 + }, + { + "epoch": 1.4446110590440489, + "grad_norm": 52445.6484375, + "learning_rate": 6.766749399106694e-05, + "loss": 2.3294, + "step": 7707 + }, + { + "epoch": 1.4447985004686035, + "grad_norm": 56713.5078125, + "learning_rate": 6.766014276638442e-05, + "loss": 2.2379, + "step": 7708 + }, + { + "epoch": 1.4449859418931583, + "grad_norm": 55697.9296875, + "learning_rate": 6.765279110551926e-05, + "loss": 2.1707, + "step": 7709 + }, + { + "epoch": 1.4451733833177132, + "grad_norm": 50181.6171875, + "learning_rate": 6.764543900865307e-05, + "loss": 2.2289, + "step": 7710 + }, + { + "epoch": 1.445360824742268, + "grad_norm": 46950.06640625, + "learning_rate": 6.76380864759674e-05, + "loss": 2.2072, + "step": 7711 + }, + { + "epoch": 1.4455482661668229, + "grad_norm": 52075.79296875, + "learning_rate": 6.763073350764387e-05, + "loss": 2.2552, + "step": 7712 + }, + { + "epoch": 1.4457357075913777, + "grad_norm": 48149.70703125, + "learning_rate": 6.762338010386406e-05, + "loss": 2.2175, + "step": 7713 + }, + { + "epoch": 1.4459231490159326, + "grad_norm": 52312.79296875, + "learning_rate": 6.761602626480963e-05, + "loss": 2.2908, + "step": 7714 + }, + { + "epoch": 1.4461105904404874, + "grad_norm": 52459.76171875, + "learning_rate": 6.760867199066219e-05, + "loss": 2.1526, + "step": 7715 + }, + { + "epoch": 1.4462980318650422, + "grad_norm": 50136.4140625, + "learning_rate": 6.760131728160338e-05, + "loss": 2.2681, + "step": 7716 + }, + { + "epoch": 1.4464854732895969, + "grad_norm": 53216.5546875, + "learning_rate": 6.759396213781486e-05, + "loss": 2.2414, + "step": 7717 + }, + { + "epoch": 1.446672914714152, + "grad_norm": 49576.3125, + "learning_rate": 6.758660655947828e-05, + "loss": 2.294, + "step": 7718 + }, + { + "epoch": 1.4468603561387066, + "grad_norm": 48453.41015625, + "learning_rate": 6.757925054677531e-05, + "loss": 2.1857, + "step": 7719 + }, + { + "epoch": 1.4470477975632614, + "grad_norm": 54553.10546875, + "learning_rate": 6.757189409988764e-05, + "loss": 2.1842, + "step": 7720 + }, + { + "epoch": 1.4472352389878163, + "grad_norm": 52332.84375, + "learning_rate": 6.7564537218997e-05, + "loss": 2.2723, + "step": 7721 + }, + { + "epoch": 1.447422680412371, + "grad_norm": 52813.28125, + "learning_rate": 6.755717990428504e-05, + "loss": 2.3688, + "step": 7722 + }, + { + "epoch": 1.447610121836926, + "grad_norm": 50783.87109375, + "learning_rate": 6.754982215593349e-05, + "loss": 2.2507, + "step": 7723 + }, + { + "epoch": 1.4477975632614808, + "grad_norm": 48063.80859375, + "learning_rate": 6.75424639741241e-05, + "loss": 2.226, + "step": 7724 + }, + { + "epoch": 1.4479850046860356, + "grad_norm": 49589.40625, + "learning_rate": 6.75351053590386e-05, + "loss": 2.2789, + "step": 7725 + }, + { + "epoch": 1.4481724461105905, + "grad_norm": 48239.27734375, + "learning_rate": 6.752774631085871e-05, + "loss": 2.2969, + "step": 7726 + }, + { + "epoch": 1.4483598875351453, + "grad_norm": 50069.2265625, + "learning_rate": 6.752038682976623e-05, + "loss": 2.2682, + "step": 7727 + }, + { + "epoch": 1.4485473289597002, + "grad_norm": 49746.9921875, + "learning_rate": 6.75130269159429e-05, + "loss": 2.3031, + "step": 7728 + }, + { + "epoch": 1.448734770384255, + "grad_norm": 50908.01953125, + "learning_rate": 6.750566656957053e-05, + "loss": 2.2514, + "step": 7729 + }, + { + "epoch": 1.4489222118088096, + "grad_norm": 50916.6015625, + "learning_rate": 6.749830579083087e-05, + "loss": 2.249, + "step": 7730 + }, + { + "epoch": 1.4491096532333645, + "grad_norm": 49440.43359375, + "learning_rate": 6.749094457990577e-05, + "loss": 2.221, + "step": 7731 + }, + { + "epoch": 1.4492970946579193, + "grad_norm": 50634.80078125, + "learning_rate": 6.7483582936977e-05, + "loss": 2.1956, + "step": 7732 + }, + { + "epoch": 1.4494845360824742, + "grad_norm": 52927.0078125, + "learning_rate": 6.74762208622264e-05, + "loss": 2.1599, + "step": 7733 + }, + { + "epoch": 1.449671977507029, + "grad_norm": 47592.68359375, + "learning_rate": 6.74688583558358e-05, + "loss": 2.2424, + "step": 7734 + }, + { + "epoch": 1.4498594189315839, + "grad_norm": 48805.01953125, + "learning_rate": 6.746149541798705e-05, + "loss": 2.2564, + "step": 7735 + }, + { + "epoch": 1.4500468603561387, + "grad_norm": 49664.2578125, + "learning_rate": 6.7454132048862e-05, + "loss": 2.2528, + "step": 7736 + }, + { + "epoch": 1.4502343017806936, + "grad_norm": 56669.6328125, + "learning_rate": 6.744676824864254e-05, + "loss": 2.172, + "step": 7737 + }, + { + "epoch": 1.4504217432052484, + "grad_norm": 52250.60546875, + "learning_rate": 6.743940401751049e-05, + "loss": 2.2686, + "step": 7738 + }, + { + "epoch": 1.4506091846298033, + "grad_norm": 48243.203125, + "learning_rate": 6.74320393556478e-05, + "loss": 2.3016, + "step": 7739 + }, + { + "epoch": 1.450796626054358, + "grad_norm": 52863.33203125, + "learning_rate": 6.74246742632363e-05, + "loss": 2.2458, + "step": 7740 + }, + { + "epoch": 1.4509840674789127, + "grad_norm": 54486.19140625, + "learning_rate": 6.741730874045797e-05, + "loss": 2.3029, + "step": 7741 + }, + { + "epoch": 1.4511715089034678, + "grad_norm": 54178.71875, + "learning_rate": 6.74099427874947e-05, + "loss": 2.2402, + "step": 7742 + }, + { + "epoch": 1.4513589503280224, + "grad_norm": 50645.6953125, + "learning_rate": 6.740257640452839e-05, + "loss": 2.2075, + "step": 7743 + }, + { + "epoch": 1.4515463917525773, + "grad_norm": 47748.68359375, + "learning_rate": 6.739520959174103e-05, + "loss": 2.2521, + "step": 7744 + }, + { + "epoch": 1.4517338331771321, + "grad_norm": 52271.359375, + "learning_rate": 6.738784234931452e-05, + "loss": 2.2633, + "step": 7745 + }, + { + "epoch": 1.451921274601687, + "grad_norm": 51907.5703125, + "learning_rate": 6.738047467743087e-05, + "loss": 2.2559, + "step": 7746 + }, + { + "epoch": 1.4521087160262418, + "grad_norm": 49315.26171875, + "learning_rate": 6.737310657627203e-05, + "loss": 2.1985, + "step": 7747 + }, + { + "epoch": 1.4522961574507967, + "grad_norm": 50312.34765625, + "learning_rate": 6.736573804601997e-05, + "loss": 2.2723, + "step": 7748 + }, + { + "epoch": 1.4524835988753515, + "grad_norm": 53143.4375, + "learning_rate": 6.735836908685671e-05, + "loss": 2.2785, + "step": 7749 + }, + { + "epoch": 1.4526710402999063, + "grad_norm": 50998.015625, + "learning_rate": 6.735099969896422e-05, + "loss": 2.1571, + "step": 7750 + }, + { + "epoch": 1.4528584817244612, + "grad_norm": 55077.9140625, + "learning_rate": 6.734362988252454e-05, + "loss": 2.2208, + "step": 7751 + }, + { + "epoch": 1.4530459231490158, + "grad_norm": 49062.6953125, + "learning_rate": 6.733625963771968e-05, + "loss": 2.2489, + "step": 7752 + }, + { + "epoch": 1.4532333645735709, + "grad_norm": 49510.91796875, + "learning_rate": 6.73288889647317e-05, + "loss": 2.2791, + "step": 7753 + }, + { + "epoch": 1.4534208059981255, + "grad_norm": 52191.6328125, + "learning_rate": 6.732151786374264e-05, + "loss": 2.2532, + "step": 7754 + }, + { + "epoch": 1.4536082474226804, + "grad_norm": 51545.671875, + "learning_rate": 6.731414633493451e-05, + "loss": 2.2131, + "step": 7755 + }, + { + "epoch": 1.4537956888472352, + "grad_norm": 48090.63671875, + "learning_rate": 6.730677437848944e-05, + "loss": 2.2256, + "step": 7756 + }, + { + "epoch": 1.45398313027179, + "grad_norm": 58642.5703125, + "learning_rate": 6.729940199458948e-05, + "loss": 2.2871, + "step": 7757 + }, + { + "epoch": 1.454170571696345, + "grad_norm": 51324.1796875, + "learning_rate": 6.72920291834167e-05, + "loss": 2.1811, + "step": 7758 + }, + { + "epoch": 1.4543580131208997, + "grad_norm": 48234.26171875, + "learning_rate": 6.728465594515326e-05, + "loss": 2.2237, + "step": 7759 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 53033.7734375, + "learning_rate": 6.72772822799812e-05, + "loss": 2.1468, + "step": 7760 + }, + { + "epoch": 1.4547328959700094, + "grad_norm": 52796.07421875, + "learning_rate": 6.726990818808266e-05, + "loss": 2.1896, + "step": 7761 + }, + { + "epoch": 1.4549203373945643, + "grad_norm": 54086.23046875, + "learning_rate": 6.726253366963979e-05, + "loss": 2.2187, + "step": 7762 + }, + { + "epoch": 1.455107778819119, + "grad_norm": 52360.890625, + "learning_rate": 6.725515872483473e-05, + "loss": 2.1723, + "step": 7763 + }, + { + "epoch": 1.455295220243674, + "grad_norm": 57281.265625, + "learning_rate": 6.72477833538496e-05, + "loss": 2.3076, + "step": 7764 + }, + { + "epoch": 1.4554826616682286, + "grad_norm": 53234.28515625, + "learning_rate": 6.724040755686658e-05, + "loss": 2.1985, + "step": 7765 + }, + { + "epoch": 1.4556701030927834, + "grad_norm": 51013.85546875, + "learning_rate": 6.723303133406788e-05, + "loss": 2.2137, + "step": 7766 + }, + { + "epoch": 1.4558575445173383, + "grad_norm": 55381.12109375, + "learning_rate": 6.722565468563561e-05, + "loss": 2.3186, + "step": 7767 + }, + { + "epoch": 1.4560449859418931, + "grad_norm": 53747.4453125, + "learning_rate": 6.721827761175202e-05, + "loss": 2.2252, + "step": 7768 + }, + { + "epoch": 1.456232427366448, + "grad_norm": 60439.109375, + "learning_rate": 6.721090011259931e-05, + "loss": 2.1763, + "step": 7769 + }, + { + "epoch": 1.4564198687910028, + "grad_norm": 53272.94921875, + "learning_rate": 6.720352218835965e-05, + "loss": 2.1492, + "step": 7770 + }, + { + "epoch": 1.4566073102155577, + "grad_norm": 48282.18359375, + "learning_rate": 6.719614383921532e-05, + "loss": 2.2473, + "step": 7771 + }, + { + "epoch": 1.4567947516401125, + "grad_norm": 49051.75390625, + "learning_rate": 6.718876506534854e-05, + "loss": 2.242, + "step": 7772 + }, + { + "epoch": 1.4569821930646674, + "grad_norm": 55050.453125, + "learning_rate": 6.718138586694153e-05, + "loss": 2.2859, + "step": 7773 + }, + { + "epoch": 1.457169634489222, + "grad_norm": 53409.09375, + "learning_rate": 6.717400624417658e-05, + "loss": 2.2144, + "step": 7774 + }, + { + "epoch": 1.457357075913777, + "grad_norm": 52869.00390625, + "learning_rate": 6.716662619723592e-05, + "loss": 2.2567, + "step": 7775 + }, + { + "epoch": 1.4575445173383317, + "grad_norm": 51746.61328125, + "learning_rate": 6.715924572630187e-05, + "loss": 2.2777, + "step": 7776 + }, + { + "epoch": 1.4577319587628865, + "grad_norm": 55552.38671875, + "learning_rate": 6.71518648315567e-05, + "loss": 2.1406, + "step": 7777 + }, + { + "epoch": 1.4579194001874414, + "grad_norm": 51120.24609375, + "learning_rate": 6.714448351318268e-05, + "loss": 2.2425, + "step": 7778 + }, + { + "epoch": 1.4581068416119962, + "grad_norm": 48354.1796875, + "learning_rate": 6.713710177136218e-05, + "loss": 2.2391, + "step": 7779 + }, + { + "epoch": 1.458294283036551, + "grad_norm": 52071.1640625, + "learning_rate": 6.712971960627745e-05, + "loss": 2.2407, + "step": 7780 + }, + { + "epoch": 1.458481724461106, + "grad_norm": 51105.6015625, + "learning_rate": 6.712233701811089e-05, + "loss": 2.2633, + "step": 7781 + }, + { + "epoch": 1.4586691658856608, + "grad_norm": 45456.47265625, + "learning_rate": 6.711495400704477e-05, + "loss": 2.2221, + "step": 7782 + }, + { + "epoch": 1.4588566073102156, + "grad_norm": 48841.81640625, + "learning_rate": 6.71075705732615e-05, + "loss": 2.2286, + "step": 7783 + }, + { + "epoch": 1.4590440487347704, + "grad_norm": 51968.95703125, + "learning_rate": 6.71001867169434e-05, + "loss": 2.2748, + "step": 7784 + }, + { + "epoch": 1.459231490159325, + "grad_norm": 54306.890625, + "learning_rate": 6.709280243827287e-05, + "loss": 2.2428, + "step": 7785 + }, + { + "epoch": 1.4594189315838801, + "grad_norm": 50177.38671875, + "learning_rate": 6.708541773743229e-05, + "loss": 2.1487, + "step": 7786 + }, + { + "epoch": 1.4596063730084348, + "grad_norm": 51218.7109375, + "learning_rate": 6.707803261460403e-05, + "loss": 2.2432, + "step": 7787 + }, + { + "epoch": 1.4597938144329896, + "grad_norm": 54943.57421875, + "learning_rate": 6.707064706997052e-05, + "loss": 2.307, + "step": 7788 + }, + { + "epoch": 1.4599812558575445, + "grad_norm": 50573.21484375, + "learning_rate": 6.706326110371413e-05, + "loss": 2.249, + "step": 7789 + }, + { + "epoch": 1.4601686972820993, + "grad_norm": 46712.8828125, + "learning_rate": 6.705587471601734e-05, + "loss": 2.2039, + "step": 7790 + }, + { + "epoch": 1.4603561387066541, + "grad_norm": 50075.6484375, + "learning_rate": 6.704848790706256e-05, + "loss": 2.2539, + "step": 7791 + }, + { + "epoch": 1.460543580131209, + "grad_norm": 54513.1796875, + "learning_rate": 6.704110067703222e-05, + "loss": 2.226, + "step": 7792 + }, + { + "epoch": 1.4607310215557638, + "grad_norm": 49772.703125, + "learning_rate": 6.703371302610878e-05, + "loss": 2.2498, + "step": 7793 + }, + { + "epoch": 1.4609184629803187, + "grad_norm": 53065.78125, + "learning_rate": 6.702632495447474e-05, + "loss": 2.2393, + "step": 7794 + }, + { + "epoch": 1.4611059044048735, + "grad_norm": 53090.92578125, + "learning_rate": 6.701893646231252e-05, + "loss": 2.2211, + "step": 7795 + }, + { + "epoch": 1.4612933458294284, + "grad_norm": 50785.328125, + "learning_rate": 6.701154754980465e-05, + "loss": 2.2912, + "step": 7796 + }, + { + "epoch": 1.4614807872539832, + "grad_norm": 51259.49609375, + "learning_rate": 6.70041582171336e-05, + "loss": 2.349, + "step": 7797 + }, + { + "epoch": 1.4616682286785379, + "grad_norm": 52530.4140625, + "learning_rate": 6.69967684644819e-05, + "loss": 2.2582, + "step": 7798 + }, + { + "epoch": 1.461855670103093, + "grad_norm": 53366.1953125, + "learning_rate": 6.698937829203206e-05, + "loss": 2.2602, + "step": 7799 + }, + { + "epoch": 1.4620431115276475, + "grad_norm": 49128.87109375, + "learning_rate": 6.698198769996659e-05, + "loss": 2.2059, + "step": 7800 + }, + { + "epoch": 1.4622305529522024, + "grad_norm": 56829.078125, + "learning_rate": 6.697459668846804e-05, + "loss": 2.2471, + "step": 7801 + }, + { + "epoch": 1.4624179943767572, + "grad_norm": 50065.56640625, + "learning_rate": 6.696720525771898e-05, + "loss": 2.1935, + "step": 7802 + }, + { + "epoch": 1.462605435801312, + "grad_norm": 51426.78125, + "learning_rate": 6.695981340790193e-05, + "loss": 2.2364, + "step": 7803 + }, + { + "epoch": 1.462792877225867, + "grad_norm": 49735.8515625, + "learning_rate": 6.695242113919949e-05, + "loss": 2.2442, + "step": 7804 + }, + { + "epoch": 1.4629803186504218, + "grad_norm": 52369.8515625, + "learning_rate": 6.694502845179421e-05, + "loss": 2.1718, + "step": 7805 + }, + { + "epoch": 1.4631677600749766, + "grad_norm": 47135.6640625, + "learning_rate": 6.69376353458687e-05, + "loss": 2.2303, + "step": 7806 + }, + { + "epoch": 1.4633552014995315, + "grad_norm": 54102.63671875, + "learning_rate": 6.693024182160557e-05, + "loss": 2.2198, + "step": 7807 + }, + { + "epoch": 1.4635426429240863, + "grad_norm": 50239.91015625, + "learning_rate": 6.692284787918741e-05, + "loss": 2.2804, + "step": 7808 + }, + { + "epoch": 1.463730084348641, + "grad_norm": 52296.08203125, + "learning_rate": 6.691545351879685e-05, + "loss": 2.2106, + "step": 7809 + }, + { + "epoch": 1.463917525773196, + "grad_norm": 46034.50390625, + "learning_rate": 6.690805874061651e-05, + "loss": 2.2527, + "step": 7810 + }, + { + "epoch": 1.4641049671977506, + "grad_norm": 52954.875, + "learning_rate": 6.690066354482905e-05, + "loss": 2.2659, + "step": 7811 + }, + { + "epoch": 1.4642924086223055, + "grad_norm": 47528.91015625, + "learning_rate": 6.689326793161712e-05, + "loss": 2.2245, + "step": 7812 + }, + { + "epoch": 1.4644798500468603, + "grad_norm": 48494.7421875, + "learning_rate": 6.688587190116335e-05, + "loss": 2.2092, + "step": 7813 + }, + { + "epoch": 1.4646672914714152, + "grad_norm": 51911.89453125, + "learning_rate": 6.687847545365046e-05, + "loss": 2.2967, + "step": 7814 + }, + { + "epoch": 1.46485473289597, + "grad_norm": 51706.8984375, + "learning_rate": 6.687107858926112e-05, + "loss": 2.2912, + "step": 7815 + }, + { + "epoch": 1.4650421743205249, + "grad_norm": 49044.33984375, + "learning_rate": 6.686368130817798e-05, + "loss": 2.2169, + "step": 7816 + }, + { + "epoch": 1.4652296157450797, + "grad_norm": 51358.1953125, + "learning_rate": 6.685628361058379e-05, + "loss": 2.2645, + "step": 7817 + }, + { + "epoch": 1.4654170571696346, + "grad_norm": 49025.84765625, + "learning_rate": 6.684888549666125e-05, + "loss": 2.1079, + "step": 7818 + }, + { + "epoch": 1.4656044985941894, + "grad_norm": 50214.359375, + "learning_rate": 6.684148696659309e-05, + "loss": 2.2272, + "step": 7819 + }, + { + "epoch": 1.465791940018744, + "grad_norm": 52488.61328125, + "learning_rate": 6.683408802056202e-05, + "loss": 2.2438, + "step": 7820 + }, + { + "epoch": 1.465979381443299, + "grad_norm": 51106.30078125, + "learning_rate": 6.682668865875081e-05, + "loss": 2.2165, + "step": 7821 + }, + { + "epoch": 1.4661668228678537, + "grad_norm": 53665.51953125, + "learning_rate": 6.681928888134221e-05, + "loss": 2.2485, + "step": 7822 + }, + { + "epoch": 1.4663542642924086, + "grad_norm": 53063.21875, + "learning_rate": 6.681188868851898e-05, + "loss": 2.2088, + "step": 7823 + }, + { + "epoch": 1.4665417057169634, + "grad_norm": 50805.5546875, + "learning_rate": 6.68044880804639e-05, + "loss": 2.202, + "step": 7824 + }, + { + "epoch": 1.4667291471415183, + "grad_norm": 52313.5703125, + "learning_rate": 6.679708705735974e-05, + "loss": 2.275, + "step": 7825 + }, + { + "epoch": 1.466916588566073, + "grad_norm": 52925.203125, + "learning_rate": 6.678968561938932e-05, + "loss": 2.2789, + "step": 7826 + }, + { + "epoch": 1.467104029990628, + "grad_norm": 47768.72265625, + "learning_rate": 6.678228376673544e-05, + "loss": 2.2258, + "step": 7827 + }, + { + "epoch": 1.4672914714151828, + "grad_norm": 53017.44140625, + "learning_rate": 6.677488149958088e-05, + "loss": 2.2055, + "step": 7828 + }, + { + "epoch": 1.4674789128397376, + "grad_norm": 52544.921875, + "learning_rate": 6.676747881810851e-05, + "loss": 2.2969, + "step": 7829 + }, + { + "epoch": 1.4676663542642925, + "grad_norm": 48554.5625, + "learning_rate": 6.676007572250116e-05, + "loss": 2.1988, + "step": 7830 + }, + { + "epoch": 1.467853795688847, + "grad_norm": 51786.4453125, + "learning_rate": 6.675267221294167e-05, + "loss": 2.2452, + "step": 7831 + }, + { + "epoch": 1.4680412371134022, + "grad_norm": 51569.51953125, + "learning_rate": 6.67452682896129e-05, + "loss": 2.2499, + "step": 7832 + }, + { + "epoch": 1.4682286785379568, + "grad_norm": 50049.6171875, + "learning_rate": 6.67378639526977e-05, + "loss": 2.1487, + "step": 7833 + }, + { + "epoch": 1.4684161199625116, + "grad_norm": 50264.72265625, + "learning_rate": 6.673045920237897e-05, + "loss": 2.2434, + "step": 7834 + }, + { + "epoch": 1.4686035613870665, + "grad_norm": 49541.65234375, + "learning_rate": 6.672305403883958e-05, + "loss": 2.2539, + "step": 7835 + }, + { + "epoch": 1.4687910028116213, + "grad_norm": 50263.90625, + "learning_rate": 6.671564846226243e-05, + "loss": 2.2659, + "step": 7836 + }, + { + "epoch": 1.4689784442361762, + "grad_norm": 54839.83203125, + "learning_rate": 6.670824247283047e-05, + "loss": 2.2157, + "step": 7837 + }, + { + "epoch": 1.469165885660731, + "grad_norm": 55882.1015625, + "learning_rate": 6.670083607072655e-05, + "loss": 2.2176, + "step": 7838 + }, + { + "epoch": 1.4693533270852859, + "grad_norm": 55735.00390625, + "learning_rate": 6.669342925613364e-05, + "loss": 2.2735, + "step": 7839 + }, + { + "epoch": 1.4695407685098407, + "grad_norm": 51183.203125, + "learning_rate": 6.668602202923468e-05, + "loss": 2.2012, + "step": 7840 + }, + { + "epoch": 1.4697282099343956, + "grad_norm": 55016.12109375, + "learning_rate": 6.66786143902126e-05, + "loss": 2.1719, + "step": 7841 + }, + { + "epoch": 1.4699156513589502, + "grad_norm": 51643.8359375, + "learning_rate": 6.667120633925037e-05, + "loss": 2.2484, + "step": 7842 + }, + { + "epoch": 1.4701030927835053, + "grad_norm": 49719.60546875, + "learning_rate": 6.666379787653097e-05, + "loss": 2.2599, + "step": 7843 + }, + { + "epoch": 1.4702905342080599, + "grad_norm": 53967.421875, + "learning_rate": 6.665638900223736e-05, + "loss": 2.2074, + "step": 7844 + }, + { + "epoch": 1.4704779756326147, + "grad_norm": 57597.01171875, + "learning_rate": 6.664897971655253e-05, + "loss": 2.219, + "step": 7845 + }, + { + "epoch": 1.4706654170571696, + "grad_norm": 54264.97265625, + "learning_rate": 6.66415700196595e-05, + "loss": 2.5081, + "step": 7846 + }, + { + "epoch": 1.4708528584817244, + "grad_norm": 50817.703125, + "learning_rate": 6.663415991174125e-05, + "loss": 2.2566, + "step": 7847 + }, + { + "epoch": 1.4710402999062793, + "grad_norm": 50163.89453125, + "learning_rate": 6.662674939298083e-05, + "loss": 2.2515, + "step": 7848 + }, + { + "epoch": 1.4712277413308341, + "grad_norm": 51585.03125, + "learning_rate": 6.661933846356125e-05, + "loss": 2.2567, + "step": 7849 + }, + { + "epoch": 1.471415182755389, + "grad_norm": 55764.38671875, + "learning_rate": 6.661192712366556e-05, + "loss": 2.1855, + "step": 7850 + }, + { + "epoch": 1.4716026241799438, + "grad_norm": 55774.26171875, + "learning_rate": 6.66045153734768e-05, + "loss": 2.2521, + "step": 7851 + }, + { + "epoch": 1.4717900656044987, + "grad_norm": 53277.59765625, + "learning_rate": 6.659710321317806e-05, + "loss": 2.2231, + "step": 7852 + }, + { + "epoch": 1.4719775070290535, + "grad_norm": 53560.734375, + "learning_rate": 6.658969064295236e-05, + "loss": 2.3083, + "step": 7853 + }, + { + "epoch": 1.4721649484536083, + "grad_norm": 49848.70703125, + "learning_rate": 6.658227766298282e-05, + "loss": 2.2221, + "step": 7854 + }, + { + "epoch": 1.472352389878163, + "grad_norm": 52310.9765625, + "learning_rate": 6.657486427345253e-05, + "loss": 2.1895, + "step": 7855 + }, + { + "epoch": 1.472539831302718, + "grad_norm": 50164.8984375, + "learning_rate": 6.656745047454458e-05, + "loss": 2.28, + "step": 7856 + }, + { + "epoch": 1.4727272727272727, + "grad_norm": 53773.9375, + "learning_rate": 6.656003626644209e-05, + "loss": 2.2338, + "step": 7857 + }, + { + "epoch": 1.4729147141518275, + "grad_norm": 50371.6484375, + "learning_rate": 6.655262164932815e-05, + "loss": 2.2815, + "step": 7858 + }, + { + "epoch": 1.4731021555763824, + "grad_norm": 50638.9765625, + "learning_rate": 6.654520662338594e-05, + "loss": 2.2956, + "step": 7859 + }, + { + "epoch": 1.4732895970009372, + "grad_norm": 48722.046875, + "learning_rate": 6.653779118879858e-05, + "loss": 2.2609, + "step": 7860 + }, + { + "epoch": 1.473477038425492, + "grad_norm": 53114.53125, + "learning_rate": 6.65303753457492e-05, + "loss": 2.2193, + "step": 7861 + }, + { + "epoch": 1.473664479850047, + "grad_norm": 53580.609375, + "learning_rate": 6.652295909442099e-05, + "loss": 2.2843, + "step": 7862 + }, + { + "epoch": 1.4738519212746017, + "grad_norm": 48895.5546875, + "learning_rate": 6.65155424349971e-05, + "loss": 2.3161, + "step": 7863 + }, + { + "epoch": 1.4740393626991566, + "grad_norm": 49423.41015625, + "learning_rate": 6.650812536766073e-05, + "loss": 2.2587, + "step": 7864 + }, + { + "epoch": 1.4742268041237114, + "grad_norm": 49056.328125, + "learning_rate": 6.650070789259509e-05, + "loss": 2.204, + "step": 7865 + }, + { + "epoch": 1.474414245548266, + "grad_norm": 47397.8671875, + "learning_rate": 6.649329000998333e-05, + "loss": 2.2426, + "step": 7866 + }, + { + "epoch": 1.4746016869728211, + "grad_norm": 46724.140625, + "learning_rate": 6.64858717200087e-05, + "loss": 2.234, + "step": 7867 + }, + { + "epoch": 1.4747891283973757, + "grad_norm": 50983.3828125, + "learning_rate": 6.647845302285443e-05, + "loss": 2.2542, + "step": 7868 + }, + { + "epoch": 1.4749765698219306, + "grad_norm": 52432.49609375, + "learning_rate": 6.647103391870372e-05, + "loss": 2.3106, + "step": 7869 + }, + { + "epoch": 1.4751640112464854, + "grad_norm": 48645.8359375, + "learning_rate": 6.646361440773983e-05, + "loss": 2.2843, + "step": 7870 + }, + { + "epoch": 1.4753514526710403, + "grad_norm": 50000.76171875, + "learning_rate": 6.6456194490146e-05, + "loss": 2.1732, + "step": 7871 + }, + { + "epoch": 1.4755388940955951, + "grad_norm": 48566.640625, + "learning_rate": 6.644877416610551e-05, + "loss": 2.2148, + "step": 7872 + }, + { + "epoch": 1.47572633552015, + "grad_norm": 51629.5625, + "learning_rate": 6.644135343580162e-05, + "loss": 2.1927, + "step": 7873 + }, + { + "epoch": 1.4759137769447048, + "grad_norm": 47937.91796875, + "learning_rate": 6.643393229941762e-05, + "loss": 2.2394, + "step": 7874 + }, + { + "epoch": 1.4761012183692597, + "grad_norm": 48387.67578125, + "learning_rate": 6.64265107571368e-05, + "loss": 2.1824, + "step": 7875 + }, + { + "epoch": 1.4762886597938145, + "grad_norm": 53221.98828125, + "learning_rate": 6.641908880914246e-05, + "loss": 2.3454, + "step": 7876 + }, + { + "epoch": 1.4764761012183691, + "grad_norm": 53022.359375, + "learning_rate": 6.641166645561792e-05, + "loss": 2.2823, + "step": 7877 + }, + { + "epoch": 1.4766635426429242, + "grad_norm": 55053.625, + "learning_rate": 6.640424369674648e-05, + "loss": 2.2624, + "step": 7878 + }, + { + "epoch": 1.4768509840674788, + "grad_norm": 57054.921875, + "learning_rate": 6.63968205327115e-05, + "loss": 2.1544, + "step": 7879 + }, + { + "epoch": 1.4770384254920337, + "grad_norm": 52374.5625, + "learning_rate": 6.638939696369632e-05, + "loss": 2.2006, + "step": 7880 + }, + { + "epoch": 1.4772258669165885, + "grad_norm": 50660.2109375, + "learning_rate": 6.638197298988426e-05, + "loss": 2.2281, + "step": 7881 + }, + { + "epoch": 1.4774133083411434, + "grad_norm": 51155.31640625, + "learning_rate": 6.637454861145874e-05, + "loss": 2.1829, + "step": 7882 + }, + { + "epoch": 1.4776007497656982, + "grad_norm": 49749.47265625, + "learning_rate": 6.636712382860308e-05, + "loss": 2.2723, + "step": 7883 + }, + { + "epoch": 1.477788191190253, + "grad_norm": 52551.890625, + "learning_rate": 6.635969864150068e-05, + "loss": 2.2321, + "step": 7884 + }, + { + "epoch": 1.477975632614808, + "grad_norm": 55410.5234375, + "learning_rate": 6.635227305033495e-05, + "loss": 2.1711, + "step": 7885 + }, + { + "epoch": 1.4781630740393628, + "grad_norm": 51483.55859375, + "learning_rate": 6.634484705528925e-05, + "loss": 2.2717, + "step": 7886 + }, + { + "epoch": 1.4783505154639176, + "grad_norm": 51652.3515625, + "learning_rate": 6.633742065654703e-05, + "loss": 2.1913, + "step": 7887 + }, + { + "epoch": 1.4785379568884722, + "grad_norm": 49252.4609375, + "learning_rate": 6.63299938542917e-05, + "loss": 2.1774, + "step": 7888 + }, + { + "epoch": 1.4787253983130273, + "grad_norm": 50329.42578125, + "learning_rate": 6.632256664870671e-05, + "loss": 2.2378, + "step": 7889 + }, + { + "epoch": 1.478912839737582, + "grad_norm": 56039.79296875, + "learning_rate": 6.631513903997547e-05, + "loss": 2.283, + "step": 7890 + }, + { + "epoch": 1.4791002811621368, + "grad_norm": 56133.84765625, + "learning_rate": 6.630771102828145e-05, + "loss": 2.1279, + "step": 7891 + }, + { + "epoch": 1.4792877225866916, + "grad_norm": 50545.8515625, + "learning_rate": 6.630028261380811e-05, + "loss": 2.2443, + "step": 7892 + }, + { + "epoch": 1.4794751640112465, + "grad_norm": 52467.53125, + "learning_rate": 6.629285379673892e-05, + "loss": 2.2221, + "step": 7893 + }, + { + "epoch": 1.4796626054358013, + "grad_norm": 50305.3046875, + "learning_rate": 6.628542457725738e-05, + "loss": 2.2543, + "step": 7894 + }, + { + "epoch": 1.4798500468603561, + "grad_norm": 51634.078125, + "learning_rate": 6.627799495554698e-05, + "loss": 2.2116, + "step": 7895 + }, + { + "epoch": 1.480037488284911, + "grad_norm": 56112.44921875, + "learning_rate": 6.627056493179117e-05, + "loss": 2.1851, + "step": 7896 + }, + { + "epoch": 1.4802249297094658, + "grad_norm": 48778.9765625, + "learning_rate": 6.626313450617353e-05, + "loss": 2.2425, + "step": 7897 + }, + { + "epoch": 1.4804123711340207, + "grad_norm": 53435.39453125, + "learning_rate": 6.625570367887755e-05, + "loss": 2.2132, + "step": 7898 + }, + { + "epoch": 1.4805998125585753, + "grad_norm": 48352.44921875, + "learning_rate": 6.624827245008675e-05, + "loss": 2.2679, + "step": 7899 + }, + { + "epoch": 1.4807872539831304, + "grad_norm": 52406.8984375, + "learning_rate": 6.624084081998471e-05, + "loss": 2.2169, + "step": 7900 + }, + { + "epoch": 1.480974695407685, + "grad_norm": 54037.34765625, + "learning_rate": 6.623340878875492e-05, + "loss": 2.3124, + "step": 7901 + }, + { + "epoch": 1.4811621368322399, + "grad_norm": 54121.578125, + "learning_rate": 6.622597635658102e-05, + "loss": 2.2804, + "step": 7902 + }, + { + "epoch": 1.4813495782567947, + "grad_norm": 55366.109375, + "learning_rate": 6.621854352364652e-05, + "loss": 2.2576, + "step": 7903 + }, + { + "epoch": 1.4815370196813495, + "grad_norm": 54972.421875, + "learning_rate": 6.621111029013502e-05, + "loss": 2.2669, + "step": 7904 + }, + { + "epoch": 1.4817244611059044, + "grad_norm": 48062.8671875, + "learning_rate": 6.620367665623013e-05, + "loss": 2.2666, + "step": 7905 + }, + { + "epoch": 1.4819119025304592, + "grad_norm": 46926.78515625, + "learning_rate": 6.619624262211542e-05, + "loss": 2.2142, + "step": 7906 + }, + { + "epoch": 1.482099343955014, + "grad_norm": 49074.5546875, + "learning_rate": 6.618880818797452e-05, + "loss": 2.2789, + "step": 7907 + }, + { + "epoch": 1.482286785379569, + "grad_norm": 52527.7734375, + "learning_rate": 6.618137335399106e-05, + "loss": 2.2986, + "step": 7908 + }, + { + "epoch": 1.4824742268041238, + "grad_norm": 56114.328125, + "learning_rate": 6.617393812034865e-05, + "loss": 2.2355, + "step": 7909 + }, + { + "epoch": 1.4826616682286786, + "grad_norm": 56132.9140625, + "learning_rate": 6.616650248723093e-05, + "loss": 2.182, + "step": 7910 + }, + { + "epoch": 1.4828491096532335, + "grad_norm": 51260.33984375, + "learning_rate": 6.615906645482155e-05, + "loss": 2.2416, + "step": 7911 + }, + { + "epoch": 1.483036551077788, + "grad_norm": 50379.796875, + "learning_rate": 6.61516300233042e-05, + "loss": 2.2342, + "step": 7912 + }, + { + "epoch": 1.483223992502343, + "grad_norm": 51472.921875, + "learning_rate": 6.614419319286253e-05, + "loss": 2.1703, + "step": 7913 + }, + { + "epoch": 1.4834114339268978, + "grad_norm": 54061.0078125, + "learning_rate": 6.613675596368021e-05, + "loss": 2.3287, + "step": 7914 + }, + { + "epoch": 1.4835988753514526, + "grad_norm": 51891.6953125, + "learning_rate": 6.612931833594094e-05, + "loss": 2.2899, + "step": 7915 + }, + { + "epoch": 1.4837863167760075, + "grad_norm": 48706.01953125, + "learning_rate": 6.612188030982842e-05, + "loss": 2.2288, + "step": 7916 + }, + { + "epoch": 1.4839737582005623, + "grad_norm": 47877.36328125, + "learning_rate": 6.611444188552636e-05, + "loss": 2.2432, + "step": 7917 + }, + { + "epoch": 1.4841611996251172, + "grad_norm": 50240.1171875, + "learning_rate": 6.610700306321847e-05, + "loss": 2.2134, + "step": 7918 + }, + { + "epoch": 1.484348641049672, + "grad_norm": 53593.02734375, + "learning_rate": 6.60995638430885e-05, + "loss": 2.2111, + "step": 7919 + }, + { + "epoch": 1.4845360824742269, + "grad_norm": 50739.64453125, + "learning_rate": 6.609212422532018e-05, + "loss": 2.2073, + "step": 7920 + }, + { + "epoch": 1.4847235238987817, + "grad_norm": 51464.27734375, + "learning_rate": 6.608468421009724e-05, + "loss": 2.2058, + "step": 7921 + }, + { + "epoch": 1.4849109653233366, + "grad_norm": 49317.0078125, + "learning_rate": 6.607724379760345e-05, + "loss": 2.196, + "step": 7922 + }, + { + "epoch": 1.4850984067478912, + "grad_norm": 46572.796875, + "learning_rate": 6.606980298802261e-05, + "loss": 2.2196, + "step": 7923 + }, + { + "epoch": 1.4852858481724462, + "grad_norm": 48120.55078125, + "learning_rate": 6.606236178153843e-05, + "loss": 2.2787, + "step": 7924 + }, + { + "epoch": 1.4854732895970009, + "grad_norm": 53551.44140625, + "learning_rate": 6.605492017833478e-05, + "loss": 2.2164, + "step": 7925 + }, + { + "epoch": 1.4856607310215557, + "grad_norm": 50259.609375, + "learning_rate": 6.60474781785954e-05, + "loss": 2.1867, + "step": 7926 + }, + { + "epoch": 1.4858481724461106, + "grad_norm": 54942.84765625, + "learning_rate": 6.60400357825041e-05, + "loss": 2.2604, + "step": 7927 + }, + { + "epoch": 1.4860356138706654, + "grad_norm": 49936.91015625, + "learning_rate": 6.603259299024475e-05, + "loss": 2.1602, + "step": 7928 + }, + { + "epoch": 1.4862230552952203, + "grad_norm": 50971.70703125, + "learning_rate": 6.60251498020011e-05, + "loss": 2.2468, + "step": 7929 + }, + { + "epoch": 1.486410496719775, + "grad_norm": 55015.921875, + "learning_rate": 6.601770621795705e-05, + "loss": 2.1166, + "step": 7930 + }, + { + "epoch": 1.48659793814433, + "grad_norm": 46253.37890625, + "learning_rate": 6.601026223829641e-05, + "loss": 2.276, + "step": 7931 + }, + { + "epoch": 1.4867853795688848, + "grad_norm": 50019.24609375, + "learning_rate": 6.600281786320307e-05, + "loss": 2.2253, + "step": 7932 + }, + { + "epoch": 1.4869728209934396, + "grad_norm": 53352.46875, + "learning_rate": 6.599537309286085e-05, + "loss": 2.269, + "step": 7933 + }, + { + "epoch": 1.4871602624179943, + "grad_norm": 49559.96875, + "learning_rate": 6.598792792745367e-05, + "loss": 2.1761, + "step": 7934 + }, + { + "epoch": 1.4873477038425493, + "grad_norm": 52271.12109375, + "learning_rate": 6.59804823671654e-05, + "loss": 2.2087, + "step": 7935 + }, + { + "epoch": 1.487535145267104, + "grad_norm": 55488.203125, + "learning_rate": 6.597303641217994e-05, + "loss": 2.2433, + "step": 7936 + }, + { + "epoch": 1.4877225866916588, + "grad_norm": 48871.9453125, + "learning_rate": 6.596559006268117e-05, + "loss": 2.2438, + "step": 7937 + }, + { + "epoch": 1.4879100281162136, + "grad_norm": 48030.3984375, + "learning_rate": 6.595814331885305e-05, + "loss": 2.1686, + "step": 7938 + }, + { + "epoch": 1.4880974695407685, + "grad_norm": 52571.74609375, + "learning_rate": 6.595069618087946e-05, + "loss": 2.2989, + "step": 7939 + }, + { + "epoch": 1.4882849109653233, + "grad_norm": 55399.46875, + "learning_rate": 6.594324864894437e-05, + "loss": 2.2416, + "step": 7940 + }, + { + "epoch": 1.4884723523898782, + "grad_norm": 49819.6875, + "learning_rate": 6.593580072323168e-05, + "loss": 2.272, + "step": 7941 + }, + { + "epoch": 1.488659793814433, + "grad_norm": 49089.86328125, + "learning_rate": 6.592835240392541e-05, + "loss": 2.2285, + "step": 7942 + }, + { + "epoch": 1.4888472352389879, + "grad_norm": 52657.60546875, + "learning_rate": 6.592090369120946e-05, + "loss": 2.2174, + "step": 7943 + }, + { + "epoch": 1.4890346766635427, + "grad_norm": 47812.14453125, + "learning_rate": 6.591345458526784e-05, + "loss": 2.1902, + "step": 7944 + }, + { + "epoch": 1.4892221180880973, + "grad_norm": 49852.27734375, + "learning_rate": 6.590600508628453e-05, + "loss": 2.2187, + "step": 7945 + }, + { + "epoch": 1.4894095595126524, + "grad_norm": 52384.21484375, + "learning_rate": 6.589855519444352e-05, + "loss": 2.2784, + "step": 7946 + }, + { + "epoch": 1.489597000937207, + "grad_norm": 52947.7734375, + "learning_rate": 6.589110490992879e-05, + "loss": 2.1958, + "step": 7947 + }, + { + "epoch": 1.4897844423617619, + "grad_norm": 53558.01171875, + "learning_rate": 6.588365423292438e-05, + "loss": 2.1325, + "step": 7948 + }, + { + "epoch": 1.4899718837863167, + "grad_norm": 50646.5078125, + "learning_rate": 6.587620316361431e-05, + "loss": 2.2807, + "step": 7949 + }, + { + "epoch": 1.4901593252108716, + "grad_norm": 50999.0625, + "learning_rate": 6.586875170218261e-05, + "loss": 2.2668, + "step": 7950 + }, + { + "epoch": 1.4903467666354264, + "grad_norm": 48285.828125, + "learning_rate": 6.58612998488133e-05, + "loss": 2.1925, + "step": 7951 + }, + { + "epoch": 1.4905342080599813, + "grad_norm": 50602.08984375, + "learning_rate": 6.585384760369045e-05, + "loss": 2.1799, + "step": 7952 + }, + { + "epoch": 1.4907216494845361, + "grad_norm": 52976.046875, + "learning_rate": 6.584639496699813e-05, + "loss": 2.2585, + "step": 7953 + }, + { + "epoch": 1.490909090909091, + "grad_norm": 50501.77734375, + "learning_rate": 6.58389419389204e-05, + "loss": 2.2159, + "step": 7954 + }, + { + "epoch": 1.4910965323336458, + "grad_norm": 52020.33203125, + "learning_rate": 6.583148851964134e-05, + "loss": 2.2428, + "step": 7955 + }, + { + "epoch": 1.4912839737582004, + "grad_norm": 50157.34765625, + "learning_rate": 6.582403470934504e-05, + "loss": 2.2843, + "step": 7956 + }, + { + "epoch": 1.4914714151827555, + "grad_norm": 52775.6796875, + "learning_rate": 6.581658050821559e-05, + "loss": 2.1658, + "step": 7957 + }, + { + "epoch": 1.4916588566073101, + "grad_norm": 55168.15234375, + "learning_rate": 6.580912591643711e-05, + "loss": 2.175, + "step": 7958 + }, + { + "epoch": 1.491846298031865, + "grad_norm": 52152.7890625, + "learning_rate": 6.580167093419372e-05, + "loss": 2.2347, + "step": 7959 + }, + { + "epoch": 1.4920337394564198, + "grad_norm": 52220.390625, + "learning_rate": 6.579421556166955e-05, + "loss": 2.2811, + "step": 7960 + }, + { + "epoch": 1.4922211808809747, + "grad_norm": 58971.42578125, + "learning_rate": 6.578675979904872e-05, + "loss": 2.191, + "step": 7961 + }, + { + "epoch": 1.4924086223055295, + "grad_norm": 52645.97265625, + "learning_rate": 6.57793036465154e-05, + "loss": 2.2421, + "step": 7962 + }, + { + "epoch": 1.4925960637300844, + "grad_norm": 51158.47265625, + "learning_rate": 6.577184710425373e-05, + "loss": 2.2538, + "step": 7963 + }, + { + "epoch": 1.4927835051546392, + "grad_norm": 50405.70703125, + "learning_rate": 6.576439017244789e-05, + "loss": 2.1729, + "step": 7964 + }, + { + "epoch": 1.492970946579194, + "grad_norm": 52884.3671875, + "learning_rate": 6.575693285128205e-05, + "loss": 2.1949, + "step": 7965 + }, + { + "epoch": 1.493158388003749, + "grad_norm": 49409.6953125, + "learning_rate": 6.574947514094041e-05, + "loss": 2.3125, + "step": 7966 + }, + { + "epoch": 1.4933458294283035, + "grad_norm": 47838.2734375, + "learning_rate": 6.574201704160713e-05, + "loss": 2.2375, + "step": 7967 + }, + { + "epoch": 1.4935332708528586, + "grad_norm": 49322.015625, + "learning_rate": 6.573455855346646e-05, + "loss": 2.2168, + "step": 7968 + }, + { + "epoch": 1.4937207122774132, + "grad_norm": 57041.53125, + "learning_rate": 6.572709967670259e-05, + "loss": 2.3308, + "step": 7969 + }, + { + "epoch": 1.493908153701968, + "grad_norm": 54284.796875, + "learning_rate": 6.571964041149974e-05, + "loss": 2.2218, + "step": 7970 + }, + { + "epoch": 1.494095595126523, + "grad_norm": 50566.0234375, + "learning_rate": 6.571218075804216e-05, + "loss": 2.2251, + "step": 7971 + }, + { + "epoch": 1.4942830365510777, + "grad_norm": 51120.47265625, + "learning_rate": 6.570472071651408e-05, + "loss": 2.2143, + "step": 7972 + }, + { + "epoch": 1.4944704779756326, + "grad_norm": 52015.859375, + "learning_rate": 6.569726028709977e-05, + "loss": 2.1991, + "step": 7973 + }, + { + "epoch": 1.4946579194001874, + "grad_norm": 49774.39453125, + "learning_rate": 6.568979946998347e-05, + "loss": 2.2214, + "step": 7974 + }, + { + "epoch": 1.4948453608247423, + "grad_norm": 51078.5546875, + "learning_rate": 6.568233826534949e-05, + "loss": 2.2369, + "step": 7975 + }, + { + "epoch": 1.4950328022492971, + "grad_norm": 53632.98828125, + "learning_rate": 6.567487667338206e-05, + "loss": 2.2103, + "step": 7976 + }, + { + "epoch": 1.495220243673852, + "grad_norm": 52431.4375, + "learning_rate": 6.56674146942655e-05, + "loss": 2.2297, + "step": 7977 + }, + { + "epoch": 1.4954076850984068, + "grad_norm": 51709.69140625, + "learning_rate": 6.565995232818412e-05, + "loss": 2.2604, + "step": 7978 + }, + { + "epoch": 1.4955951265229617, + "grad_norm": 48024.984375, + "learning_rate": 6.565248957532222e-05, + "loss": 2.2515, + "step": 7979 + }, + { + "epoch": 1.4957825679475163, + "grad_norm": 54123.59765625, + "learning_rate": 6.564502643586412e-05, + "loss": 2.2078, + "step": 7980 + }, + { + "epoch": 1.4959700093720714, + "grad_norm": 56336.0078125, + "learning_rate": 6.563756290999415e-05, + "loss": 2.3701, + "step": 7981 + }, + { + "epoch": 1.496157450796626, + "grad_norm": 52867.8515625, + "learning_rate": 6.563009899789667e-05, + "loss": 2.1134, + "step": 7982 + }, + { + "epoch": 1.4963448922211808, + "grad_norm": 58605.421875, + "learning_rate": 6.562263469975599e-05, + "loss": 2.278, + "step": 7983 + }, + { + "epoch": 1.4965323336457357, + "grad_norm": 51643.3984375, + "learning_rate": 6.561517001575648e-05, + "loss": 2.2802, + "step": 7984 + }, + { + "epoch": 1.4967197750702905, + "grad_norm": 53484.16015625, + "learning_rate": 6.560770494608253e-05, + "loss": 2.2277, + "step": 7985 + }, + { + "epoch": 1.4969072164948454, + "grad_norm": 52794.1015625, + "learning_rate": 6.560023949091849e-05, + "loss": 2.1811, + "step": 7986 + }, + { + "epoch": 1.4970946579194002, + "grad_norm": 51403.5859375, + "learning_rate": 6.559277365044876e-05, + "loss": 2.2277, + "step": 7987 + }, + { + "epoch": 1.497282099343955, + "grad_norm": 50710.39453125, + "learning_rate": 6.558530742485775e-05, + "loss": 2.2104, + "step": 7988 + }, + { + "epoch": 1.49746954076851, + "grad_norm": 47930.30859375, + "learning_rate": 6.557784081432985e-05, + "loss": 2.2236, + "step": 7989 + }, + { + "epoch": 1.4976569821930648, + "grad_norm": 47367.50390625, + "learning_rate": 6.55703738190495e-05, + "loss": 2.263, + "step": 7990 + }, + { + "epoch": 1.4978444236176194, + "grad_norm": 56280.20703125, + "learning_rate": 6.556290643920106e-05, + "loss": 2.2668, + "step": 7991 + }, + { + "epoch": 1.4980318650421744, + "grad_norm": 51062.26953125, + "learning_rate": 6.555543867496904e-05, + "loss": 2.2791, + "step": 7992 + }, + { + "epoch": 1.498219306466729, + "grad_norm": 52547.32421875, + "learning_rate": 6.554797052653784e-05, + "loss": 2.3085, + "step": 7993 + }, + { + "epoch": 1.498406747891284, + "grad_norm": 52147.3203125, + "learning_rate": 6.554050199409192e-05, + "loss": 2.2235, + "step": 7994 + }, + { + "epoch": 1.4985941893158388, + "grad_norm": 58886.70703125, + "learning_rate": 6.553303307781578e-05, + "loss": 2.2496, + "step": 7995 + }, + { + "epoch": 1.4987816307403936, + "grad_norm": 52120.046875, + "learning_rate": 6.552556377789384e-05, + "loss": 2.2564, + "step": 7996 + }, + { + "epoch": 1.4989690721649485, + "grad_norm": 52476.14453125, + "learning_rate": 6.551809409451059e-05, + "loss": 2.3003, + "step": 7997 + }, + { + "epoch": 1.4991565135895033, + "grad_norm": 50846.78125, + "learning_rate": 6.551062402785055e-05, + "loss": 2.2805, + "step": 7998 + }, + { + "epoch": 1.4993439550140581, + "grad_norm": 55812.5703125, + "learning_rate": 6.550315357809822e-05, + "loss": 2.2072, + "step": 7999 + }, + { + "epoch": 1.499531396438613, + "grad_norm": 51260.21875, + "learning_rate": 6.549568274543808e-05, + "loss": 2.2616, + "step": 8000 + }, + { + "epoch": 1.499531396438613, + "eval_loss": 2.2991936206817627, + "eval_runtime": 127.7093, + "eval_samples_per_second": 39.535, + "eval_steps_per_second": 1.981, + "step": 8000 + }, + { + "epoch": 1.4997188378631678, + "grad_norm": 53247.21875, + "learning_rate": 6.548821153005468e-05, + "loss": 2.2832, + "step": 8001 + }, + { + "epoch": 1.4999062792877225, + "grad_norm": 50545.41015625, + "learning_rate": 6.548073993213251e-05, + "loss": 2.2161, + "step": 8002 + }, + { + "epoch": 1.5000937207122775, + "grad_norm": 46728.5390625, + "learning_rate": 6.547326795185615e-05, + "loss": 2.2946, + "step": 8003 + }, + { + "epoch": 1.5002811621368322, + "grad_norm": 51899.4609375, + "learning_rate": 6.546579558941013e-05, + "loss": 2.2608, + "step": 8004 + }, + { + "epoch": 1.5004686035613872, + "grad_norm": 52250.2421875, + "learning_rate": 6.545832284497903e-05, + "loss": 2.2249, + "step": 8005 + }, + { + "epoch": 1.5006560449859419, + "grad_norm": 55515.05859375, + "learning_rate": 6.545084971874738e-05, + "loss": 2.271, + "step": 8006 + }, + { + "epoch": 1.5008434864104967, + "grad_norm": 53056.09765625, + "learning_rate": 6.544337621089977e-05, + "loss": 2.2424, + "step": 8007 + }, + { + "epoch": 1.5010309278350515, + "grad_norm": 52497.40625, + "learning_rate": 6.54359023216208e-05, + "loss": 2.2245, + "step": 8008 + }, + { + "epoch": 1.5012183692596064, + "grad_norm": 55873.0703125, + "learning_rate": 6.542842805109504e-05, + "loss": 2.2128, + "step": 8009 + }, + { + "epoch": 1.5014058106841612, + "grad_norm": 53079.54296875, + "learning_rate": 6.542095339950714e-05, + "loss": 2.2834, + "step": 8010 + }, + { + "epoch": 1.501593252108716, + "grad_norm": 56013.08984375, + "learning_rate": 6.541347836704168e-05, + "loss": 2.2114, + "step": 8011 + }, + { + "epoch": 1.501780693533271, + "grad_norm": 60642.15234375, + "learning_rate": 6.540600295388326e-05, + "loss": 2.1666, + "step": 8012 + }, + { + "epoch": 1.5019681349578256, + "grad_norm": 51646.91015625, + "learning_rate": 6.539852716021655e-05, + "loss": 2.2435, + "step": 8013 + }, + { + "epoch": 1.5021555763823806, + "grad_norm": 51488.1953125, + "learning_rate": 6.53910509862262e-05, + "loss": 2.1794, + "step": 8014 + }, + { + "epoch": 1.5023430178069352, + "grad_norm": 49816.9609375, + "learning_rate": 6.538357443209683e-05, + "loss": 2.2107, + "step": 8015 + }, + { + "epoch": 1.5025304592314903, + "grad_norm": 54177.01171875, + "learning_rate": 6.537609749801313e-05, + "loss": 2.1873, + "step": 8016 + }, + { + "epoch": 1.502717900656045, + "grad_norm": 49484.08984375, + "learning_rate": 6.536862018415974e-05, + "loss": 2.2273, + "step": 8017 + }, + { + "epoch": 1.5029053420805998, + "grad_norm": 49712.0234375, + "learning_rate": 6.536114249072137e-05, + "loss": 2.2456, + "step": 8018 + }, + { + "epoch": 1.5030927835051546, + "grad_norm": 47955.95703125, + "learning_rate": 6.53536644178827e-05, + "loss": 2.3028, + "step": 8019 + }, + { + "epoch": 1.5032802249297095, + "grad_norm": 56445.70703125, + "learning_rate": 6.534618596582841e-05, + "loss": 2.1845, + "step": 8020 + }, + { + "epoch": 1.5034676663542643, + "grad_norm": 54134.546875, + "learning_rate": 6.533870713474323e-05, + "loss": 2.2625, + "step": 8021 + }, + { + "epoch": 1.5036551077788192, + "grad_norm": 47804.640625, + "learning_rate": 6.533122792481188e-05, + "loss": 2.2235, + "step": 8022 + }, + { + "epoch": 1.503842549203374, + "grad_norm": 53772.19140625, + "learning_rate": 6.532374833621907e-05, + "loss": 2.2983, + "step": 8023 + }, + { + "epoch": 1.5040299906279286, + "grad_norm": 50462.9921875, + "learning_rate": 6.531626836914953e-05, + "loss": 2.2179, + "step": 8024 + }, + { + "epoch": 1.5042174320524837, + "grad_norm": 57058.73046875, + "learning_rate": 6.530878802378805e-05, + "loss": 2.2497, + "step": 8025 + }, + { + "epoch": 1.5044048734770383, + "grad_norm": 45695.83203125, + "learning_rate": 6.530130730031934e-05, + "loss": 2.2426, + "step": 8026 + }, + { + "epoch": 1.5045923149015934, + "grad_norm": 52463.125, + "learning_rate": 6.529382619892816e-05, + "loss": 2.2292, + "step": 8027 + }, + { + "epoch": 1.504779756326148, + "grad_norm": 50299.08203125, + "learning_rate": 6.528634471979932e-05, + "loss": 2.2606, + "step": 8028 + }, + { + "epoch": 1.5049671977507029, + "grad_norm": 46739.80859375, + "learning_rate": 6.527886286311758e-05, + "loss": 2.2329, + "step": 8029 + }, + { + "epoch": 1.5051546391752577, + "grad_norm": 51857.703125, + "learning_rate": 6.527138062906774e-05, + "loss": 2.2059, + "step": 8030 + }, + { + "epoch": 1.5053420805998126, + "grad_norm": 50573.80859375, + "learning_rate": 6.52638980178346e-05, + "loss": 2.2152, + "step": 8031 + }, + { + "epoch": 1.5055295220243674, + "grad_norm": 50583.98828125, + "learning_rate": 6.525641502960296e-05, + "loss": 2.204, + "step": 8032 + }, + { + "epoch": 1.5057169634489223, + "grad_norm": 48284.0546875, + "learning_rate": 6.524893166455766e-05, + "loss": 2.2224, + "step": 8033 + }, + { + "epoch": 1.505904404873477, + "grad_norm": 49975.96484375, + "learning_rate": 6.524144792288351e-05, + "loss": 2.2347, + "step": 8034 + }, + { + "epoch": 1.5060918462980317, + "grad_norm": 49284.6796875, + "learning_rate": 6.523396380476536e-05, + "loss": 2.2297, + "step": 8035 + }, + { + "epoch": 1.5062792877225868, + "grad_norm": 54580.50390625, + "learning_rate": 6.522647931038804e-05, + "loss": 2.237, + "step": 8036 + }, + { + "epoch": 1.5064667291471414, + "grad_norm": 48804.4609375, + "learning_rate": 6.521899443993644e-05, + "loss": 2.2173, + "step": 8037 + }, + { + "epoch": 1.5066541705716965, + "grad_norm": 51764.63671875, + "learning_rate": 6.521150919359542e-05, + "loss": 2.2628, + "step": 8038 + }, + { + "epoch": 1.506841611996251, + "grad_norm": 50330.7890625, + "learning_rate": 6.520402357154982e-05, + "loss": 2.1416, + "step": 8039 + }, + { + "epoch": 1.507029053420806, + "grad_norm": 54844.66796875, + "learning_rate": 6.519653757398456e-05, + "loss": 2.1922, + "step": 8040 + }, + { + "epoch": 1.5072164948453608, + "grad_norm": 54663.65234375, + "learning_rate": 6.518905120108453e-05, + "loss": 2.2216, + "step": 8041 + }, + { + "epoch": 1.5074039362699156, + "grad_norm": 50801.84765625, + "learning_rate": 6.518156445303463e-05, + "loss": 2.2605, + "step": 8042 + }, + { + "epoch": 1.5075913776944705, + "grad_norm": 55232.2578125, + "learning_rate": 6.517407733001976e-05, + "loss": 2.1853, + "step": 8043 + }, + { + "epoch": 1.5077788191190253, + "grad_norm": 48450.4921875, + "learning_rate": 6.516658983222486e-05, + "loss": 2.2693, + "step": 8044 + }, + { + "epoch": 1.5079662605435802, + "grad_norm": 52511.296875, + "learning_rate": 6.515910195983484e-05, + "loss": 2.1914, + "step": 8045 + }, + { + "epoch": 1.5081537019681348, + "grad_norm": 53264.1171875, + "learning_rate": 6.515161371303468e-05, + "loss": 2.2702, + "step": 8046 + }, + { + "epoch": 1.5083411433926899, + "grad_norm": 50179.99609375, + "learning_rate": 6.514412509200929e-05, + "loss": 2.2223, + "step": 8047 + }, + { + "epoch": 1.5085285848172445, + "grad_norm": 56956.6953125, + "learning_rate": 6.513663609694365e-05, + "loss": 2.3142, + "step": 8048 + }, + { + "epoch": 1.5087160262417996, + "grad_norm": 50436.59765625, + "learning_rate": 6.512914672802272e-05, + "loss": 2.2604, + "step": 8049 + }, + { + "epoch": 1.5089034676663542, + "grad_norm": 49362.90625, + "learning_rate": 6.512165698543149e-05, + "loss": 2.2065, + "step": 8050 + }, + { + "epoch": 1.509090909090909, + "grad_norm": 47055.078125, + "learning_rate": 6.511416686935494e-05, + "loss": 2.2147, + "step": 8051 + }, + { + "epoch": 1.5092783505154639, + "grad_norm": 54503.2890625, + "learning_rate": 6.510667637997804e-05, + "loss": 2.2578, + "step": 8052 + }, + { + "epoch": 1.5094657919400187, + "grad_norm": 53281.265625, + "learning_rate": 6.509918551748584e-05, + "loss": 2.2335, + "step": 8053 + }, + { + "epoch": 1.5096532333645736, + "grad_norm": 50090.0625, + "learning_rate": 6.509169428206334e-05, + "loss": 2.2179, + "step": 8054 + }, + { + "epoch": 1.5098406747891284, + "grad_norm": 48865.26171875, + "learning_rate": 6.508420267389555e-05, + "loss": 2.2762, + "step": 8055 + }, + { + "epoch": 1.5100281162136833, + "grad_norm": 48593.22265625, + "learning_rate": 6.50767106931675e-05, + "loss": 2.2548, + "step": 8056 + }, + { + "epoch": 1.510215557638238, + "grad_norm": 55039.3828125, + "learning_rate": 6.506921834006425e-05, + "loss": 2.3049, + "step": 8057 + }, + { + "epoch": 1.510402999062793, + "grad_norm": 53127.6171875, + "learning_rate": 6.506172561477084e-05, + "loss": 2.2977, + "step": 8058 + }, + { + "epoch": 1.5105904404873476, + "grad_norm": 48723.5703125, + "learning_rate": 6.505423251747233e-05, + "loss": 2.2526, + "step": 8059 + }, + { + "epoch": 1.5107778819119027, + "grad_norm": 55762.07421875, + "learning_rate": 6.50467390483538e-05, + "loss": 2.3455, + "step": 8060 + }, + { + "epoch": 1.5109653233364573, + "grad_norm": 55589.97265625, + "learning_rate": 6.503924520760036e-05, + "loss": 2.3012, + "step": 8061 + }, + { + "epoch": 1.5111527647610123, + "grad_norm": 47545.265625, + "learning_rate": 6.503175099539701e-05, + "loss": 2.237, + "step": 8062 + }, + { + "epoch": 1.511340206185567, + "grad_norm": 49428.125, + "learning_rate": 6.502425641192893e-05, + "loss": 2.2388, + "step": 8063 + }, + { + "epoch": 1.5115276476101218, + "grad_norm": 47517.703125, + "learning_rate": 6.501676145738119e-05, + "loss": 2.2319, + "step": 8064 + }, + { + "epoch": 1.5117150890346767, + "grad_norm": 54366.75, + "learning_rate": 6.50092661319389e-05, + "loss": 2.1811, + "step": 8065 + }, + { + "epoch": 1.5119025304592315, + "grad_norm": 52108.7734375, + "learning_rate": 6.500177043578723e-05, + "loss": 2.3001, + "step": 8066 + }, + { + "epoch": 1.5120899718837864, + "grad_norm": 52336.86328125, + "learning_rate": 6.499427436911123e-05, + "loss": 2.2686, + "step": 8067 + }, + { + "epoch": 1.512277413308341, + "grad_norm": 52466.28515625, + "learning_rate": 6.498677793209613e-05, + "loss": 2.1941, + "step": 8068 + }, + { + "epoch": 1.512464854732896, + "grad_norm": 52819.2734375, + "learning_rate": 6.497928112492704e-05, + "loss": 2.1896, + "step": 8069 + }, + { + "epoch": 1.5126522961574507, + "grad_norm": 49149.875, + "learning_rate": 6.497178394778912e-05, + "loss": 2.226, + "step": 8070 + }, + { + "epoch": 1.5128397375820057, + "grad_norm": 49672.0390625, + "learning_rate": 6.496428640086755e-05, + "loss": 2.2478, + "step": 8071 + }, + { + "epoch": 1.5130271790065604, + "grad_norm": 52904.30078125, + "learning_rate": 6.49567884843475e-05, + "loss": 2.2433, + "step": 8072 + }, + { + "epoch": 1.5132146204311154, + "grad_norm": 50502.3359375, + "learning_rate": 6.494929019841417e-05, + "loss": 2.223, + "step": 8073 + }, + { + "epoch": 1.51340206185567, + "grad_norm": 48788.83984375, + "learning_rate": 6.494179154325278e-05, + "loss": 2.1823, + "step": 8074 + }, + { + "epoch": 1.513589503280225, + "grad_norm": 53024.21875, + "learning_rate": 6.493429251904845e-05, + "loss": 2.2256, + "step": 8075 + }, + { + "epoch": 1.5137769447047797, + "grad_norm": 56846.2890625, + "learning_rate": 6.492679312598651e-05, + "loss": 2.2171, + "step": 8076 + }, + { + "epoch": 1.5139643861293346, + "grad_norm": 50732.76953125, + "learning_rate": 6.491929336425211e-05, + "loss": 2.2311, + "step": 8077 + }, + { + "epoch": 1.5141518275538894, + "grad_norm": 49070.42578125, + "learning_rate": 6.491179323403053e-05, + "loss": 2.2337, + "step": 8078 + }, + { + "epoch": 1.5143392689784443, + "grad_norm": 50918.00390625, + "learning_rate": 6.490429273550698e-05, + "loss": 2.2123, + "step": 8079 + }, + { + "epoch": 1.5145267104029991, + "grad_norm": 50390.171875, + "learning_rate": 6.489679186886671e-05, + "loss": 2.2572, + "step": 8080 + }, + { + "epoch": 1.5147141518275538, + "grad_norm": 52347.96875, + "learning_rate": 6.488929063429501e-05, + "loss": 2.1999, + "step": 8081 + }, + { + "epoch": 1.5149015932521088, + "grad_norm": 49481.78515625, + "learning_rate": 6.488178903197713e-05, + "loss": 2.2444, + "step": 8082 + }, + { + "epoch": 1.5150890346766634, + "grad_norm": 56733.06640625, + "learning_rate": 6.487428706209835e-05, + "loss": 2.2144, + "step": 8083 + }, + { + "epoch": 1.5152764761012185, + "grad_norm": 48832.01953125, + "learning_rate": 6.486678472484398e-05, + "loss": 2.3144, + "step": 8084 + }, + { + "epoch": 1.5154639175257731, + "grad_norm": 48456.9453125, + "learning_rate": 6.485928202039929e-05, + "loss": 2.2839, + "step": 8085 + }, + { + "epoch": 1.515651358950328, + "grad_norm": 56425.26953125, + "learning_rate": 6.485177894894959e-05, + "loss": 2.3026, + "step": 8086 + }, + { + "epoch": 1.5158388003748828, + "grad_norm": 52989.109375, + "learning_rate": 6.484427551068022e-05, + "loss": 2.2353, + "step": 8087 + }, + { + "epoch": 1.5160262417994377, + "grad_norm": 49030.1875, + "learning_rate": 6.483677170577648e-05, + "loss": 2.2565, + "step": 8088 + }, + { + "epoch": 1.5162136832239925, + "grad_norm": 49733.14453125, + "learning_rate": 6.482926753442373e-05, + "loss": 2.2432, + "step": 8089 + }, + { + "epoch": 1.5164011246485474, + "grad_norm": 48882.5234375, + "learning_rate": 6.482176299680729e-05, + "loss": 2.2651, + "step": 8090 + }, + { + "epoch": 1.5165885660731022, + "grad_norm": 49844.71875, + "learning_rate": 6.481425809311252e-05, + "loss": 2.206, + "step": 8091 + }, + { + "epoch": 1.5167760074976568, + "grad_norm": 55440.65234375, + "learning_rate": 6.480675282352478e-05, + "loss": 2.2415, + "step": 8092 + }, + { + "epoch": 1.516963448922212, + "grad_norm": 52538.05859375, + "learning_rate": 6.479924718822945e-05, + "loss": 2.2766, + "step": 8093 + }, + { + "epoch": 1.5171508903467665, + "grad_norm": 49495.86328125, + "learning_rate": 6.47917411874119e-05, + "loss": 2.2333, + "step": 8094 + }, + { + "epoch": 1.5173383317713216, + "grad_norm": 55290.66796875, + "learning_rate": 6.478423482125751e-05, + "loss": 2.2067, + "step": 8095 + }, + { + "epoch": 1.5175257731958762, + "grad_norm": 52085.55078125, + "learning_rate": 6.477672808995171e-05, + "loss": 2.2315, + "step": 8096 + }, + { + "epoch": 1.517713214620431, + "grad_norm": 51423.8671875, + "learning_rate": 6.476922099367986e-05, + "loss": 2.2096, + "step": 8097 + }, + { + "epoch": 1.517900656044986, + "grad_norm": 54383.41796875, + "learning_rate": 6.47617135326274e-05, + "loss": 2.2418, + "step": 8098 + }, + { + "epoch": 1.5180880974695408, + "grad_norm": 53062.76953125, + "learning_rate": 6.475420570697978e-05, + "loss": 2.1699, + "step": 8099 + }, + { + "epoch": 1.5182755388940956, + "grad_norm": 49680.76953125, + "learning_rate": 6.474669751692239e-05, + "loss": 2.2498, + "step": 8100 + }, + { + "epoch": 1.5184629803186505, + "grad_norm": 50769.5546875, + "learning_rate": 6.473918896264069e-05, + "loss": 2.2695, + "step": 8101 + }, + { + "epoch": 1.5186504217432053, + "grad_norm": 50012.55078125, + "learning_rate": 6.473168004432015e-05, + "loss": 2.2502, + "step": 8102 + }, + { + "epoch": 1.51883786316776, + "grad_norm": 48555.703125, + "learning_rate": 6.472417076214619e-05, + "loss": 2.1931, + "step": 8103 + }, + { + "epoch": 1.519025304592315, + "grad_norm": 53264.43359375, + "learning_rate": 6.471666111630433e-05, + "loss": 2.2761, + "step": 8104 + }, + { + "epoch": 1.5192127460168696, + "grad_norm": 50214.8046875, + "learning_rate": 6.470915110697998e-05, + "loss": 2.2325, + "step": 8105 + }, + { + "epoch": 1.5194001874414247, + "grad_norm": 53787.2890625, + "learning_rate": 6.470164073435871e-05, + "loss": 2.1836, + "step": 8106 + }, + { + "epoch": 1.5195876288659793, + "grad_norm": 49808.6953125, + "learning_rate": 6.469412999862595e-05, + "loss": 2.2351, + "step": 8107 + }, + { + "epoch": 1.5197750702905342, + "grad_norm": 50294.96484375, + "learning_rate": 6.468661889996723e-05, + "loss": 2.2659, + "step": 8108 + }, + { + "epoch": 1.519962511715089, + "grad_norm": 48886.29296875, + "learning_rate": 6.467910743856807e-05, + "loss": 2.24, + "step": 8109 + }, + { + "epoch": 1.5201499531396439, + "grad_norm": 58029.14453125, + "learning_rate": 6.467159561461399e-05, + "loss": 2.3363, + "step": 8110 + }, + { + "epoch": 1.5203373945641987, + "grad_norm": 52250.6953125, + "learning_rate": 6.466408342829052e-05, + "loss": 2.225, + "step": 8111 + }, + { + "epoch": 1.5205248359887535, + "grad_norm": 47467.7890625, + "learning_rate": 6.465657087978321e-05, + "loss": 2.2607, + "step": 8112 + }, + { + "epoch": 1.5207122774133084, + "grad_norm": 54316.20703125, + "learning_rate": 6.464905796927758e-05, + "loss": 2.3476, + "step": 8113 + }, + { + "epoch": 1.520899718837863, + "grad_norm": 50855.421875, + "learning_rate": 6.464154469695922e-05, + "loss": 2.2194, + "step": 8114 + }, + { + "epoch": 1.521087160262418, + "grad_norm": 50458.82421875, + "learning_rate": 6.46340310630137e-05, + "loss": 2.2236, + "step": 8115 + }, + { + "epoch": 1.5212746016869727, + "grad_norm": 52587.8125, + "learning_rate": 6.462651706762656e-05, + "loss": 2.1995, + "step": 8116 + }, + { + "epoch": 1.5214620431115278, + "grad_norm": 52353.61328125, + "learning_rate": 6.461900271098344e-05, + "loss": 2.1963, + "step": 8117 + }, + { + "epoch": 1.5216494845360824, + "grad_norm": 49185.9921875, + "learning_rate": 6.46114879932699e-05, + "loss": 2.2245, + "step": 8118 + }, + { + "epoch": 1.5218369259606375, + "grad_norm": 52290.80078125, + "learning_rate": 6.460397291467155e-05, + "loss": 2.2458, + "step": 8119 + }, + { + "epoch": 1.522024367385192, + "grad_norm": 53666.15625, + "learning_rate": 6.4596457475374e-05, + "loss": 2.2422, + "step": 8120 + }, + { + "epoch": 1.522211808809747, + "grad_norm": 49906.46484375, + "learning_rate": 6.458894167556288e-05, + "loss": 2.2814, + "step": 8121 + }, + { + "epoch": 1.5223992502343018, + "grad_norm": 49882.55859375, + "learning_rate": 6.458142551542381e-05, + "loss": 2.2557, + "step": 8122 + }, + { + "epoch": 1.5225866916588566, + "grad_norm": 54054.1015625, + "learning_rate": 6.457390899514244e-05, + "loss": 2.2237, + "step": 8123 + }, + { + "epoch": 1.5227741330834115, + "grad_norm": 51136.91015625, + "learning_rate": 6.456639211490442e-05, + "loss": 2.1882, + "step": 8124 + }, + { + "epoch": 1.522961574507966, + "grad_norm": 56929.90234375, + "learning_rate": 6.455887487489538e-05, + "loss": 2.2226, + "step": 8125 + }, + { + "epoch": 1.5231490159325212, + "grad_norm": 51273.8984375, + "learning_rate": 6.455135727530102e-05, + "loss": 2.266, + "step": 8126 + }, + { + "epoch": 1.5233364573570758, + "grad_norm": 48951.3046875, + "learning_rate": 6.4543839316307e-05, + "loss": 2.3338, + "step": 8127 + }, + { + "epoch": 1.5235238987816309, + "grad_norm": 59905.05078125, + "learning_rate": 6.4536320998099e-05, + "loss": 2.2063, + "step": 8128 + }, + { + "epoch": 1.5237113402061855, + "grad_norm": 54858.08203125, + "learning_rate": 6.452880232086272e-05, + "loss": 2.1749, + "step": 8129 + }, + { + "epoch": 1.5238987816307406, + "grad_norm": 50442.44140625, + "learning_rate": 6.452128328478386e-05, + "loss": 2.1729, + "step": 8130 + }, + { + "epoch": 1.5240862230552952, + "grad_norm": 51315.98828125, + "learning_rate": 6.451376389004812e-05, + "loss": 2.1855, + "step": 8131 + }, + { + "epoch": 1.52427366447985, + "grad_norm": 47323.8203125, + "learning_rate": 6.450624413684126e-05, + "loss": 2.2878, + "step": 8132 + }, + { + "epoch": 1.5244611059044049, + "grad_norm": 56298.78515625, + "learning_rate": 6.449872402534894e-05, + "loss": 2.2526, + "step": 8133 + }, + { + "epoch": 1.5246485473289597, + "grad_norm": 50273.2109375, + "learning_rate": 6.449120355575696e-05, + "loss": 2.2943, + "step": 8134 + }, + { + "epoch": 1.5248359887535146, + "grad_norm": 63069.5234375, + "learning_rate": 6.448368272825102e-05, + "loss": 2.1561, + "step": 8135 + }, + { + "epoch": 1.5250234301780694, + "grad_norm": 47814.3984375, + "learning_rate": 6.447616154301692e-05, + "loss": 2.2316, + "step": 8136 + }, + { + "epoch": 1.5252108716026243, + "grad_norm": 49021.7265625, + "learning_rate": 6.446864000024038e-05, + "loss": 2.2502, + "step": 8137 + }, + { + "epoch": 1.5253983130271789, + "grad_norm": 47938.73828125, + "learning_rate": 6.44611181001072e-05, + "loss": 2.2611, + "step": 8138 + }, + { + "epoch": 1.525585754451734, + "grad_norm": 52746.3828125, + "learning_rate": 6.445359584280314e-05, + "loss": 2.2004, + "step": 8139 + }, + { + "epoch": 1.5257731958762886, + "grad_norm": 51698.453125, + "learning_rate": 6.444607322851402e-05, + "loss": 2.2729, + "step": 8140 + }, + { + "epoch": 1.5259606373008436, + "grad_norm": 51176.1875, + "learning_rate": 6.443855025742561e-05, + "loss": 2.211, + "step": 8141 + }, + { + "epoch": 1.5261480787253983, + "grad_norm": 47956.91796875, + "learning_rate": 6.443102692972373e-05, + "loss": 2.2261, + "step": 8142 + }, + { + "epoch": 1.526335520149953, + "grad_norm": 51297.921875, + "learning_rate": 6.44235032455942e-05, + "loss": 2.2432, + "step": 8143 + }, + { + "epoch": 1.526522961574508, + "grad_norm": 59834.08203125, + "learning_rate": 6.441597920522283e-05, + "loss": 2.2923, + "step": 8144 + }, + { + "epoch": 1.5267104029990628, + "grad_norm": 49316.9765625, + "learning_rate": 6.440845480879547e-05, + "loss": 2.2427, + "step": 8145 + }, + { + "epoch": 1.5268978444236176, + "grad_norm": 53376.5390625, + "learning_rate": 6.440093005649796e-05, + "loss": 2.2478, + "step": 8146 + }, + { + "epoch": 1.5270852858481725, + "grad_norm": 47764.14453125, + "learning_rate": 6.439340494851614e-05, + "loss": 2.2634, + "step": 8147 + }, + { + "epoch": 1.5272727272727273, + "grad_norm": 53581.6171875, + "learning_rate": 6.438587948503589e-05, + "loss": 2.2156, + "step": 8148 + }, + { + "epoch": 1.527460168697282, + "grad_norm": 49204.41015625, + "learning_rate": 6.437835366624306e-05, + "loss": 2.2653, + "step": 8149 + }, + { + "epoch": 1.527647610121837, + "grad_norm": 48708.2578125, + "learning_rate": 6.437082749232353e-05, + "loss": 2.2075, + "step": 8150 + }, + { + "epoch": 1.5278350515463917, + "grad_norm": 50108.9609375, + "learning_rate": 6.436330096346321e-05, + "loss": 2.2822, + "step": 8151 + }, + { + "epoch": 1.5280224929709467, + "grad_norm": 50154.421875, + "learning_rate": 6.435577407984797e-05, + "loss": 2.2754, + "step": 8152 + }, + { + "epoch": 1.5282099343955013, + "grad_norm": 50973.71484375, + "learning_rate": 6.434824684166371e-05, + "loss": 2.2715, + "step": 8153 + }, + { + "epoch": 1.5283973758200562, + "grad_norm": 53116.625, + "learning_rate": 6.43407192490964e-05, + "loss": 2.1984, + "step": 8154 + }, + { + "epoch": 1.528584817244611, + "grad_norm": 50847.48828125, + "learning_rate": 6.433319130233187e-05, + "loss": 2.2666, + "step": 8155 + }, + { + "epoch": 1.5287722586691659, + "grad_norm": 55455.1640625, + "learning_rate": 6.432566300155611e-05, + "loss": 2.2177, + "step": 8156 + }, + { + "epoch": 1.5289597000937207, + "grad_norm": 54087.421875, + "learning_rate": 6.431813434695506e-05, + "loss": 2.271, + "step": 8157 + }, + { + "epoch": 1.5291471415182756, + "grad_norm": 50429.1875, + "learning_rate": 6.431060533871465e-05, + "loss": 2.2304, + "step": 8158 + }, + { + "epoch": 1.5293345829428304, + "grad_norm": 47336.296875, + "learning_rate": 6.430307597702083e-05, + "loss": 2.2316, + "step": 8159 + }, + { + "epoch": 1.529522024367385, + "grad_norm": 50494.94921875, + "learning_rate": 6.429554626205959e-05, + "loss": 2.1763, + "step": 8160 + }, + { + "epoch": 1.5297094657919401, + "grad_norm": 48558.48046875, + "learning_rate": 6.428801619401688e-05, + "loss": 2.2231, + "step": 8161 + }, + { + "epoch": 1.5298969072164947, + "grad_norm": 52241.37109375, + "learning_rate": 6.428048577307872e-05, + "loss": 2.3621, + "step": 8162 + }, + { + "epoch": 1.5300843486410498, + "grad_norm": 50241.8515625, + "learning_rate": 6.427295499943104e-05, + "loss": 2.3, + "step": 8163 + }, + { + "epoch": 1.5302717900656044, + "grad_norm": 49205.53515625, + "learning_rate": 6.42654238732599e-05, + "loss": 2.1999, + "step": 8164 + }, + { + "epoch": 1.5304592314901593, + "grad_norm": 52152.6015625, + "learning_rate": 6.425789239475128e-05, + "loss": 2.2356, + "step": 8165 + }, + { + "epoch": 1.5306466729147141, + "grad_norm": 46360.72265625, + "learning_rate": 6.425036056409121e-05, + "loss": 2.2033, + "step": 8166 + }, + { + "epoch": 1.530834114339269, + "grad_norm": 48220.29296875, + "learning_rate": 6.424282838146572e-05, + "loss": 2.1853, + "step": 8167 + }, + { + "epoch": 1.5310215557638238, + "grad_norm": 54124.984375, + "learning_rate": 6.423529584706081e-05, + "loss": 2.1954, + "step": 8168 + }, + { + "epoch": 1.5312089971883787, + "grad_norm": 53544.21484375, + "learning_rate": 6.422776296106257e-05, + "loss": 2.1285, + "step": 8169 + }, + { + "epoch": 1.5313964386129335, + "grad_norm": 49144.64453125, + "learning_rate": 6.4220229723657e-05, + "loss": 2.2525, + "step": 8170 + }, + { + "epoch": 1.5315838800374881, + "grad_norm": 49120.36328125, + "learning_rate": 6.421269613503022e-05, + "loss": 2.2783, + "step": 8171 + }, + { + "epoch": 1.5317713214620432, + "grad_norm": 48206.3671875, + "learning_rate": 6.420516219536827e-05, + "loss": 2.2373, + "step": 8172 + }, + { + "epoch": 1.5319587628865978, + "grad_norm": 49162.40625, + "learning_rate": 6.419762790485723e-05, + "loss": 2.3263, + "step": 8173 + }, + { + "epoch": 1.532146204311153, + "grad_norm": 51198.33984375, + "learning_rate": 6.419009326368319e-05, + "loss": 2.1853, + "step": 8174 + }, + { + "epoch": 1.5323336457357075, + "grad_norm": 45889.48828125, + "learning_rate": 6.418255827203223e-05, + "loss": 2.2511, + "step": 8175 + }, + { + "epoch": 1.5325210871602624, + "grad_norm": 51863.55078125, + "learning_rate": 6.417502293009047e-05, + "loss": 2.2339, + "step": 8176 + }, + { + "epoch": 1.5327085285848172, + "grad_norm": 49034.015625, + "learning_rate": 6.416748723804404e-05, + "loss": 2.2325, + "step": 8177 + }, + { + "epoch": 1.532895970009372, + "grad_norm": 53825.71484375, + "learning_rate": 6.415995119607904e-05, + "loss": 2.2871, + "step": 8178 + }, + { + "epoch": 1.533083411433927, + "grad_norm": 55703.71484375, + "learning_rate": 6.415241480438159e-05, + "loss": 2.253, + "step": 8179 + }, + { + "epoch": 1.5332708528584817, + "grad_norm": 56004.4375, + "learning_rate": 6.414487806313785e-05, + "loss": 2.3106, + "step": 8180 + }, + { + "epoch": 1.5334582942830366, + "grad_norm": 50918.4921875, + "learning_rate": 6.413734097253397e-05, + "loss": 2.2621, + "step": 8181 + }, + { + "epoch": 1.5336457357075912, + "grad_norm": 47454.5078125, + "learning_rate": 6.412980353275609e-05, + "loss": 2.217, + "step": 8182 + }, + { + "epoch": 1.5338331771321463, + "grad_norm": 50222.9765625, + "learning_rate": 6.412226574399038e-05, + "loss": 2.2109, + "step": 8183 + }, + { + "epoch": 1.534020618556701, + "grad_norm": 53402.76953125, + "learning_rate": 6.411472760642302e-05, + "loss": 2.2171, + "step": 8184 + }, + { + "epoch": 1.534208059981256, + "grad_norm": 47995.0234375, + "learning_rate": 6.410718912024018e-05, + "loss": 2.2302, + "step": 8185 + }, + { + "epoch": 1.5343955014058106, + "grad_norm": 48842.94921875, + "learning_rate": 6.409965028562806e-05, + "loss": 2.2384, + "step": 8186 + }, + { + "epoch": 1.5345829428303657, + "grad_norm": 48084.49609375, + "learning_rate": 6.409211110277286e-05, + "loss": 2.2904, + "step": 8187 + }, + { + "epoch": 1.5347703842549203, + "grad_norm": 49110.890625, + "learning_rate": 6.408457157186078e-05, + "loss": 2.2388, + "step": 8188 + }, + { + "epoch": 1.5349578256794751, + "grad_norm": 51410.21484375, + "learning_rate": 6.407703169307807e-05, + "loss": 2.3172, + "step": 8189 + }, + { + "epoch": 1.53514526710403, + "grad_norm": 50289.81640625, + "learning_rate": 6.40694914666109e-05, + "loss": 2.247, + "step": 8190 + }, + { + "epoch": 1.5353327085285848, + "grad_norm": 50924.33203125, + "learning_rate": 6.406195089264554e-05, + "loss": 2.2498, + "step": 8191 + }, + { + "epoch": 1.5355201499531397, + "grad_norm": 48499.078125, + "learning_rate": 6.405440997136825e-05, + "loss": 2.2807, + "step": 8192 + }, + { + "epoch": 1.5357075913776945, + "grad_norm": 51083.7265625, + "learning_rate": 6.404686870296522e-05, + "loss": 2.1861, + "step": 8193 + }, + { + "epoch": 1.5358950328022494, + "grad_norm": 55520.46875, + "learning_rate": 6.403932708762276e-05, + "loss": 2.2027, + "step": 8194 + }, + { + "epoch": 1.536082474226804, + "grad_norm": 50371.59375, + "learning_rate": 6.403178512552712e-05, + "loss": 2.2476, + "step": 8195 + }, + { + "epoch": 1.536269915651359, + "grad_norm": 48853.2890625, + "learning_rate": 6.402424281686458e-05, + "loss": 2.2386, + "step": 8196 + }, + { + "epoch": 1.5364573570759137, + "grad_norm": 51700.27734375, + "learning_rate": 6.401670016182143e-05, + "loss": 2.2056, + "step": 8197 + }, + { + "epoch": 1.5366447985004688, + "grad_norm": 51791.640625, + "learning_rate": 6.400915716058395e-05, + "loss": 2.298, + "step": 8198 + }, + { + "epoch": 1.5368322399250234, + "grad_norm": 50355.671875, + "learning_rate": 6.400161381333847e-05, + "loss": 2.2415, + "step": 8199 + }, + { + "epoch": 1.5370196813495782, + "grad_norm": 51684.88671875, + "learning_rate": 6.399407012027125e-05, + "loss": 2.1605, + "step": 8200 + }, + { + "epoch": 1.537207122774133, + "grad_norm": 52827.83203125, + "learning_rate": 6.398652608156867e-05, + "loss": 2.332, + "step": 8201 + }, + { + "epoch": 1.537394564198688, + "grad_norm": 56462.1484375, + "learning_rate": 6.397898169741702e-05, + "loss": 2.165, + "step": 8202 + }, + { + "epoch": 1.5375820056232428, + "grad_norm": 50839.015625, + "learning_rate": 6.397143696800264e-05, + "loss": 2.1956, + "step": 8203 + }, + { + "epoch": 1.5377694470477976, + "grad_norm": 53611.9453125, + "learning_rate": 6.39638918935119e-05, + "loss": 2.2129, + "step": 8204 + }, + { + "epoch": 1.5379568884723525, + "grad_norm": 52701.6875, + "learning_rate": 6.395634647413112e-05, + "loss": 2.2775, + "step": 8205 + }, + { + "epoch": 1.538144329896907, + "grad_norm": 50512.4296875, + "learning_rate": 6.394880071004668e-05, + "loss": 2.1132, + "step": 8206 + }, + { + "epoch": 1.5383317713214621, + "grad_norm": 57779.96484375, + "learning_rate": 6.394125460144494e-05, + "loss": 2.2133, + "step": 8207 + }, + { + "epoch": 1.5385192127460168, + "grad_norm": 52290.5703125, + "learning_rate": 6.39337081485123e-05, + "loss": 2.2643, + "step": 8208 + }, + { + "epoch": 1.5387066541705718, + "grad_norm": 51342.25390625, + "learning_rate": 6.392616135143511e-05, + "loss": 2.1611, + "step": 8209 + }, + { + "epoch": 1.5388940955951265, + "grad_norm": 54893.1640625, + "learning_rate": 6.39186142103998e-05, + "loss": 2.1809, + "step": 8210 + }, + { + "epoch": 1.5390815370196813, + "grad_norm": 56584.5234375, + "learning_rate": 6.391106672559278e-05, + "loss": 2.2149, + "step": 8211 + }, + { + "epoch": 1.5392689784442362, + "grad_norm": 49557.37109375, + "learning_rate": 6.390351889720044e-05, + "loss": 2.2444, + "step": 8212 + }, + { + "epoch": 1.539456419868791, + "grad_norm": 51453.515625, + "learning_rate": 6.38959707254092e-05, + "loss": 2.2026, + "step": 8213 + }, + { + "epoch": 1.5396438612933459, + "grad_norm": 51340.4140625, + "learning_rate": 6.388842221040552e-05, + "loss": 2.2398, + "step": 8214 + }, + { + "epoch": 1.5398313027179007, + "grad_norm": 51018.71484375, + "learning_rate": 6.38808733523758e-05, + "loss": 2.2089, + "step": 8215 + }, + { + "epoch": 1.5400187441424555, + "grad_norm": 49810.7890625, + "learning_rate": 6.38733241515065e-05, + "loss": 2.2444, + "step": 8216 + }, + { + "epoch": 1.5402061855670102, + "grad_norm": 49584.79296875, + "learning_rate": 6.38657746079841e-05, + "loss": 2.1657, + "step": 8217 + }, + { + "epoch": 1.5403936269915652, + "grad_norm": 50009.23828125, + "learning_rate": 6.385822472199504e-05, + "loss": 2.23, + "step": 8218 + }, + { + "epoch": 1.5405810684161199, + "grad_norm": 51149.58203125, + "learning_rate": 6.38506744937258e-05, + "loss": 2.2104, + "step": 8219 + }, + { + "epoch": 1.540768509840675, + "grad_norm": 47931.015625, + "learning_rate": 6.384312392336284e-05, + "loss": 2.2945, + "step": 8220 + }, + { + "epoch": 1.5409559512652296, + "grad_norm": 50455.7109375, + "learning_rate": 6.383557301109269e-05, + "loss": 2.2373, + "step": 8221 + }, + { + "epoch": 1.5411433926897844, + "grad_norm": 53438.69140625, + "learning_rate": 6.38280217571018e-05, + "loss": 2.2456, + "step": 8222 + }, + { + "epoch": 1.5413308341143392, + "grad_norm": 57556.83984375, + "learning_rate": 6.382047016157671e-05, + "loss": 2.156, + "step": 8223 + }, + { + "epoch": 1.541518275538894, + "grad_norm": 55568.47265625, + "learning_rate": 6.381291822470393e-05, + "loss": 2.3028, + "step": 8224 + }, + { + "epoch": 1.541705716963449, + "grad_norm": 52911.515625, + "learning_rate": 6.380536594667e-05, + "loss": 2.2204, + "step": 8225 + }, + { + "epoch": 1.5418931583880038, + "grad_norm": 50521.81640625, + "learning_rate": 6.379781332766138e-05, + "loss": 2.1624, + "step": 8226 + }, + { + "epoch": 1.5420805998125586, + "grad_norm": 56934.4921875, + "learning_rate": 6.37902603678647e-05, + "loss": 2.1828, + "step": 8227 + }, + { + "epoch": 1.5422680412371133, + "grad_norm": 47526.171875, + "learning_rate": 6.378270706746645e-05, + "loss": 2.2558, + "step": 8228 + }, + { + "epoch": 1.5424554826616683, + "grad_norm": 53920.80078125, + "learning_rate": 6.377515342665321e-05, + "loss": 2.1483, + "step": 8229 + }, + { + "epoch": 1.542642924086223, + "grad_norm": 52441.62109375, + "learning_rate": 6.376759944561154e-05, + "loss": 2.2811, + "step": 8230 + }, + { + "epoch": 1.542830365510778, + "grad_norm": 49130.94921875, + "learning_rate": 6.376004512452803e-05, + "loss": 2.304, + "step": 8231 + }, + { + "epoch": 1.5430178069353326, + "grad_norm": 53139.5546875, + "learning_rate": 6.375249046358923e-05, + "loss": 2.2482, + "step": 8232 + }, + { + "epoch": 1.5432052483598875, + "grad_norm": 54110.48046875, + "learning_rate": 6.374493546298173e-05, + "loss": 2.2365, + "step": 8233 + }, + { + "epoch": 1.5433926897844423, + "grad_norm": 51248.04296875, + "learning_rate": 6.373738012289216e-05, + "loss": 2.255, + "step": 8234 + }, + { + "epoch": 1.5435801312089972, + "grad_norm": 50309.0703125, + "learning_rate": 6.372982444350713e-05, + "loss": 2.193, + "step": 8235 + }, + { + "epoch": 1.543767572633552, + "grad_norm": 57178.703125, + "learning_rate": 6.372226842501321e-05, + "loss": 2.2397, + "step": 8236 + }, + { + "epoch": 1.5439550140581069, + "grad_norm": 51548.53515625, + "learning_rate": 6.371471206759705e-05, + "loss": 2.1663, + "step": 8237 + }, + { + "epoch": 1.5441424554826617, + "grad_norm": 48846.09375, + "learning_rate": 6.370715537144529e-05, + "loss": 2.1264, + "step": 8238 + }, + { + "epoch": 1.5443298969072163, + "grad_norm": 54680.0703125, + "learning_rate": 6.369959833674457e-05, + "loss": 2.1151, + "step": 8239 + }, + { + "epoch": 1.5445173383317714, + "grad_norm": 53652.7109375, + "learning_rate": 6.369204096368154e-05, + "loss": 2.2615, + "step": 8240 + }, + { + "epoch": 1.544704779756326, + "grad_norm": 47015.68359375, + "learning_rate": 6.368448325244283e-05, + "loss": 2.2356, + "step": 8241 + }, + { + "epoch": 1.544892221180881, + "grad_norm": 46481.62109375, + "learning_rate": 6.367692520321514e-05, + "loss": 2.1468, + "step": 8242 + }, + { + "epoch": 1.5450796626054357, + "grad_norm": 52589.1171875, + "learning_rate": 6.366936681618512e-05, + "loss": 2.294, + "step": 8243 + }, + { + "epoch": 1.5452671040299908, + "grad_norm": 49965.47265625, + "learning_rate": 6.366180809153947e-05, + "loss": 2.2491, + "step": 8244 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 50782.6640625, + "learning_rate": 6.365424902946488e-05, + "loss": 2.1691, + "step": 8245 + }, + { + "epoch": 1.5456419868791003, + "grad_norm": 52145.9140625, + "learning_rate": 6.364668963014802e-05, + "loss": 2.2005, + "step": 8246 + }, + { + "epoch": 1.545829428303655, + "grad_norm": 53876.64453125, + "learning_rate": 6.363912989377565e-05, + "loss": 2.283, + "step": 8247 + }, + { + "epoch": 1.54601686972821, + "grad_norm": 50195.3671875, + "learning_rate": 6.363156982053443e-05, + "loss": 2.2078, + "step": 8248 + }, + { + "epoch": 1.5462043111527648, + "grad_norm": 50192.59765625, + "learning_rate": 6.362400941061112e-05, + "loss": 2.2051, + "step": 8249 + }, + { + "epoch": 1.5463917525773194, + "grad_norm": 52267.140625, + "learning_rate": 6.361644866419244e-05, + "loss": 2.3078, + "step": 8250 + }, + { + "epoch": 1.5465791940018745, + "grad_norm": 54508.15234375, + "learning_rate": 6.360888758146513e-05, + "loss": 2.196, + "step": 8251 + }, + { + "epoch": 1.5467666354264291, + "grad_norm": 51750.3515625, + "learning_rate": 6.360132616261593e-05, + "loss": 2.1865, + "step": 8252 + }, + { + "epoch": 1.5469540768509842, + "grad_norm": 51169.171875, + "learning_rate": 6.359376440783163e-05, + "loss": 2.233, + "step": 8253 + }, + { + "epoch": 1.5471415182755388, + "grad_norm": 55254.29296875, + "learning_rate": 6.358620231729897e-05, + "loss": 2.2726, + "step": 8254 + }, + { + "epoch": 1.5473289597000939, + "grad_norm": 50108.1015625, + "learning_rate": 6.357863989120471e-05, + "loss": 2.2382, + "step": 8255 + }, + { + "epoch": 1.5475164011246485, + "grad_norm": 51725.85546875, + "learning_rate": 6.357107712973565e-05, + "loss": 2.2612, + "step": 8256 + }, + { + "epoch": 1.5477038425492033, + "grad_norm": 48612.390625, + "learning_rate": 6.356351403307861e-05, + "loss": 2.1585, + "step": 8257 + }, + { + "epoch": 1.5478912839737582, + "grad_norm": 52014.46875, + "learning_rate": 6.355595060142033e-05, + "loss": 2.292, + "step": 8258 + }, + { + "epoch": 1.548078725398313, + "grad_norm": 48663.44140625, + "learning_rate": 6.354838683494766e-05, + "loss": 2.1663, + "step": 8259 + }, + { + "epoch": 1.5482661668228679, + "grad_norm": 54313.2890625, + "learning_rate": 6.35408227338474e-05, + "loss": 2.1888, + "step": 8260 + }, + { + "epoch": 1.5484536082474227, + "grad_norm": 50822.1953125, + "learning_rate": 6.353325829830636e-05, + "loss": 2.1939, + "step": 8261 + }, + { + "epoch": 1.5486410496719776, + "grad_norm": 50593.640625, + "learning_rate": 6.35256935285114e-05, + "loss": 2.2147, + "step": 8262 + }, + { + "epoch": 1.5488284910965322, + "grad_norm": 48652.87109375, + "learning_rate": 6.351812842464933e-05, + "loss": 2.2139, + "step": 8263 + }, + { + "epoch": 1.5490159325210873, + "grad_norm": 49117.34375, + "learning_rate": 6.351056298690705e-05, + "loss": 2.2618, + "step": 8264 + }, + { + "epoch": 1.549203373945642, + "grad_norm": 52773.3828125, + "learning_rate": 6.350299721547135e-05, + "loss": 2.2539, + "step": 8265 + }, + { + "epoch": 1.549390815370197, + "grad_norm": 51081.95703125, + "learning_rate": 6.349543111052913e-05, + "loss": 2.165, + "step": 8266 + }, + { + "epoch": 1.5495782567947516, + "grad_norm": 49151.390625, + "learning_rate": 6.348786467226726e-05, + "loss": 2.1947, + "step": 8267 + }, + { + "epoch": 1.5497656982193064, + "grad_norm": 50176.08203125, + "learning_rate": 6.348029790087263e-05, + "loss": 2.2269, + "step": 8268 + }, + { + "epoch": 1.5499531396438613, + "grad_norm": 50138.765625, + "learning_rate": 6.34727307965321e-05, + "loss": 2.2299, + "step": 8269 + }, + { + "epoch": 1.5501405810684161, + "grad_norm": 53910.34375, + "learning_rate": 6.34651633594326e-05, + "loss": 2.2938, + "step": 8270 + }, + { + "epoch": 1.550328022492971, + "grad_norm": 53008.7890625, + "learning_rate": 6.345759558976101e-05, + "loss": 2.2648, + "step": 8271 + }, + { + "epoch": 1.5505154639175258, + "grad_norm": 56405.44140625, + "learning_rate": 6.345002748770428e-05, + "loss": 2.292, + "step": 8272 + }, + { + "epoch": 1.5507029053420807, + "grad_norm": 52763.2578125, + "learning_rate": 6.34424590534493e-05, + "loss": 2.2275, + "step": 8273 + }, + { + "epoch": 1.5508903467666353, + "grad_norm": 59843.6796875, + "learning_rate": 6.343489028718301e-05, + "loss": 2.1075, + "step": 8274 + }, + { + "epoch": 1.5510777881911904, + "grad_norm": 48674.69921875, + "learning_rate": 6.342732118909235e-05, + "loss": 2.2758, + "step": 8275 + }, + { + "epoch": 1.551265229615745, + "grad_norm": 50746.2890625, + "learning_rate": 6.341975175936424e-05, + "loss": 2.2113, + "step": 8276 + }, + { + "epoch": 1.5514526710403, + "grad_norm": 51439.859375, + "learning_rate": 6.34121819981857e-05, + "loss": 2.1291, + "step": 8277 + }, + { + "epoch": 1.5516401124648547, + "grad_norm": 54854.1640625, + "learning_rate": 6.340461190574362e-05, + "loss": 2.2091, + "step": 8278 + }, + { + "epoch": 1.5518275538894095, + "grad_norm": 56638.703125, + "learning_rate": 6.339704148222503e-05, + "loss": 2.2495, + "step": 8279 + }, + { + "epoch": 1.5520149953139644, + "grad_norm": 49263.6796875, + "learning_rate": 6.338947072781687e-05, + "loss": 2.2556, + "step": 8280 + }, + { + "epoch": 1.5522024367385192, + "grad_norm": 54865.640625, + "learning_rate": 6.338189964270615e-05, + "loss": 2.1843, + "step": 8281 + }, + { + "epoch": 1.552389878163074, + "grad_norm": 52853.4921875, + "learning_rate": 6.337432822707985e-05, + "loss": 2.1833, + "step": 8282 + }, + { + "epoch": 1.552577319587629, + "grad_norm": 57107.140625, + "learning_rate": 6.336675648112499e-05, + "loss": 2.2211, + "step": 8283 + }, + { + "epoch": 1.5527647610121837, + "grad_norm": 51279.140625, + "learning_rate": 6.335918440502857e-05, + "loss": 2.2678, + "step": 8284 + }, + { + "epoch": 1.5529522024367384, + "grad_norm": 48074.046875, + "learning_rate": 6.335161199897763e-05, + "loss": 2.2593, + "step": 8285 + }, + { + "epoch": 1.5531396438612934, + "grad_norm": 48163.66015625, + "learning_rate": 6.334403926315917e-05, + "loss": 2.226, + "step": 8286 + }, + { + "epoch": 1.553327085285848, + "grad_norm": 54487.53515625, + "learning_rate": 6.333646619776026e-05, + "loss": 2.2477, + "step": 8287 + }, + { + "epoch": 1.5535145267104031, + "grad_norm": 51022.0859375, + "learning_rate": 6.33288928029679e-05, + "loss": 2.1824, + "step": 8288 + }, + { + "epoch": 1.5537019681349578, + "grad_norm": 49842.90625, + "learning_rate": 6.332131907896919e-05, + "loss": 2.2231, + "step": 8289 + }, + { + "epoch": 1.5538894095595126, + "grad_norm": 53218.89453125, + "learning_rate": 6.331374502595116e-05, + "loss": 2.3063, + "step": 8290 + }, + { + "epoch": 1.5540768509840674, + "grad_norm": 50805.93359375, + "learning_rate": 6.330617064410088e-05, + "loss": 2.2084, + "step": 8291 + }, + { + "epoch": 1.5542642924086223, + "grad_norm": 51319.0859375, + "learning_rate": 6.329859593360546e-05, + "loss": 2.2452, + "step": 8292 + }, + { + "epoch": 1.5544517338331771, + "grad_norm": 57723.9609375, + "learning_rate": 6.329102089465194e-05, + "loss": 2.2308, + "step": 8293 + }, + { + "epoch": 1.554639175257732, + "grad_norm": 55738.91015625, + "learning_rate": 6.328344552742745e-05, + "loss": 2.2132, + "step": 8294 + }, + { + "epoch": 1.5548266166822868, + "grad_norm": 49338.0703125, + "learning_rate": 6.327586983211907e-05, + "loss": 2.2306, + "step": 8295 + }, + { + "epoch": 1.5550140581068415, + "grad_norm": 47457.34375, + "learning_rate": 6.326829380891392e-05, + "loss": 2.2158, + "step": 8296 + }, + { + "epoch": 1.5552014995313965, + "grad_norm": 54424.20703125, + "learning_rate": 6.326071745799911e-05, + "loss": 2.2443, + "step": 8297 + }, + { + "epoch": 1.5553889409559511, + "grad_norm": 48885.25390625, + "learning_rate": 6.32531407795618e-05, + "loss": 2.2071, + "step": 8298 + }, + { + "epoch": 1.5555763823805062, + "grad_norm": 49857.34375, + "learning_rate": 6.324556377378905e-05, + "loss": 2.2463, + "step": 8299 + }, + { + "epoch": 1.5557638238050608, + "grad_norm": 51577.04296875, + "learning_rate": 6.323798644086808e-05, + "loss": 2.2612, + "step": 8300 + }, + { + "epoch": 1.555951265229616, + "grad_norm": 55303.40625, + "learning_rate": 6.3230408780986e-05, + "loss": 2.1685, + "step": 8301 + }, + { + "epoch": 1.5561387066541705, + "grad_norm": 52431.046875, + "learning_rate": 6.322283079432998e-05, + "loss": 2.2617, + "step": 8302 + }, + { + "epoch": 1.5563261480787254, + "grad_norm": 60522.1171875, + "learning_rate": 6.321525248108717e-05, + "loss": 2.4232, + "step": 8303 + }, + { + "epoch": 1.5565135895032802, + "grad_norm": 47040.29296875, + "learning_rate": 6.320767384144477e-05, + "loss": 2.2299, + "step": 8304 + }, + { + "epoch": 1.556701030927835, + "grad_norm": 49043.47265625, + "learning_rate": 6.320009487558996e-05, + "loss": 2.2022, + "step": 8305 + }, + { + "epoch": 1.55688847235239, + "grad_norm": 50036.875, + "learning_rate": 6.319251558370988e-05, + "loss": 2.2534, + "step": 8306 + }, + { + "epoch": 1.5570759137769445, + "grad_norm": 50795.80078125, + "learning_rate": 6.318493596599181e-05, + "loss": 2.2269, + "step": 8307 + }, + { + "epoch": 1.5572633552014996, + "grad_norm": 51639.05859375, + "learning_rate": 6.31773560226229e-05, + "loss": 2.2705, + "step": 8308 + }, + { + "epoch": 1.5574507966260542, + "grad_norm": 46890.1015625, + "learning_rate": 6.316977575379038e-05, + "loss": 2.2173, + "step": 8309 + }, + { + "epoch": 1.5576382380506093, + "grad_norm": 49957.22265625, + "learning_rate": 6.316219515968148e-05, + "loss": 2.2324, + "step": 8310 + }, + { + "epoch": 1.557825679475164, + "grad_norm": 54532.1328125, + "learning_rate": 6.315461424048343e-05, + "loss": 2.3194, + "step": 8311 + }, + { + "epoch": 1.558013120899719, + "grad_norm": 49788.84375, + "learning_rate": 6.314703299638345e-05, + "loss": 2.2323, + "step": 8312 + }, + { + "epoch": 1.5582005623242736, + "grad_norm": 50377.22265625, + "learning_rate": 6.313945142756881e-05, + "loss": 2.2805, + "step": 8313 + }, + { + "epoch": 1.5583880037488285, + "grad_norm": 50621.09765625, + "learning_rate": 6.313186953422676e-05, + "loss": 2.2413, + "step": 8314 + }, + { + "epoch": 1.5585754451733833, + "grad_norm": 50046.42578125, + "learning_rate": 6.312428731654454e-05, + "loss": 2.2806, + "step": 8315 + }, + { + "epoch": 1.5587628865979382, + "grad_norm": 49646.49609375, + "learning_rate": 6.311670477470944e-05, + "loss": 2.1713, + "step": 8316 + }, + { + "epoch": 1.558950328022493, + "grad_norm": 49252.61328125, + "learning_rate": 6.310912190890876e-05, + "loss": 2.114, + "step": 8317 + }, + { + "epoch": 1.5591377694470478, + "grad_norm": 53239.1875, + "learning_rate": 6.310153871932976e-05, + "loss": 2.2214, + "step": 8318 + }, + { + "epoch": 1.5593252108716027, + "grad_norm": 58840.3046875, + "learning_rate": 6.309395520615972e-05, + "loss": 2.2367, + "step": 8319 + }, + { + "epoch": 1.5595126522961573, + "grad_norm": 50388.05859375, + "learning_rate": 6.308637136958599e-05, + "loss": 2.2404, + "step": 8320 + }, + { + "epoch": 1.5597000937207124, + "grad_norm": 52315.91796875, + "learning_rate": 6.307878720979583e-05, + "loss": 2.2465, + "step": 8321 + }, + { + "epoch": 1.559887535145267, + "grad_norm": 53878.28125, + "learning_rate": 6.307120272697658e-05, + "loss": 2.2195, + "step": 8322 + }, + { + "epoch": 1.560074976569822, + "grad_norm": 53466.37109375, + "learning_rate": 6.306361792131559e-05, + "loss": 2.235, + "step": 8323 + }, + { + "epoch": 1.5602624179943767, + "grad_norm": 52671.296875, + "learning_rate": 6.305603279300017e-05, + "loss": 2.2699, + "step": 8324 + }, + { + "epoch": 1.5604498594189316, + "grad_norm": 57795.15625, + "learning_rate": 6.304844734221765e-05, + "loss": 2.2356, + "step": 8325 + }, + { + "epoch": 1.5606373008434864, + "grad_norm": 53270.203125, + "learning_rate": 6.304086156915542e-05, + "loss": 2.249, + "step": 8326 + }, + { + "epoch": 1.5608247422680412, + "grad_norm": 57867.80859375, + "learning_rate": 6.303327547400079e-05, + "loss": 2.2682, + "step": 8327 + }, + { + "epoch": 1.561012183692596, + "grad_norm": 56507.32421875, + "learning_rate": 6.30256890569412e-05, + "loss": 2.1586, + "step": 8328 + }, + { + "epoch": 1.561199625117151, + "grad_norm": 50664.95703125, + "learning_rate": 6.301810231816394e-05, + "loss": 2.3175, + "step": 8329 + }, + { + "epoch": 1.5613870665417058, + "grad_norm": 54452.68359375, + "learning_rate": 6.301051525785644e-05, + "loss": 2.2392, + "step": 8330 + }, + { + "epoch": 1.5615745079662604, + "grad_norm": 49956.93359375, + "learning_rate": 6.300292787620609e-05, + "loss": 2.2154, + "step": 8331 + }, + { + "epoch": 1.5617619493908155, + "grad_norm": 51575.859375, + "learning_rate": 6.299534017340026e-05, + "loss": 2.254, + "step": 8332 + }, + { + "epoch": 1.56194939081537, + "grad_norm": 51168.83984375, + "learning_rate": 6.29877521496264e-05, + "loss": 2.2924, + "step": 8333 + }, + { + "epoch": 1.5621368322399252, + "grad_norm": 48719.24609375, + "learning_rate": 6.298016380507188e-05, + "loss": 2.2772, + "step": 8334 + }, + { + "epoch": 1.5623242736644798, + "grad_norm": 49055.28125, + "learning_rate": 6.297257513992416e-05, + "loss": 2.1678, + "step": 8335 + }, + { + "epoch": 1.5625117150890346, + "grad_norm": 57253.734375, + "learning_rate": 6.296498615437064e-05, + "loss": 2.2423, + "step": 8336 + }, + { + "epoch": 1.5626991565135895, + "grad_norm": 49352.3828125, + "learning_rate": 6.295739684859878e-05, + "loss": 2.2183, + "step": 8337 + }, + { + "epoch": 1.5628865979381443, + "grad_norm": 52285.0078125, + "learning_rate": 6.294980722279602e-05, + "loss": 2.1842, + "step": 8338 + }, + { + "epoch": 1.5630740393626992, + "grad_norm": 47877.1796875, + "learning_rate": 6.294221727714981e-05, + "loss": 2.2022, + "step": 8339 + }, + { + "epoch": 1.563261480787254, + "grad_norm": 50058.79296875, + "learning_rate": 6.29346270118476e-05, + "loss": 2.2372, + "step": 8340 + }, + { + "epoch": 1.5634489222118089, + "grad_norm": 50311.05078125, + "learning_rate": 6.292703642707689e-05, + "loss": 2.2402, + "step": 8341 + }, + { + "epoch": 1.5636363636363635, + "grad_norm": 52224.8046875, + "learning_rate": 6.291944552302513e-05, + "loss": 2.3438, + "step": 8342 + }, + { + "epoch": 1.5638238050609186, + "grad_norm": 52211.35546875, + "learning_rate": 6.291185429987983e-05, + "loss": 2.2528, + "step": 8343 + }, + { + "epoch": 1.5640112464854732, + "grad_norm": 54322.93359375, + "learning_rate": 6.290426275782844e-05, + "loss": 2.2313, + "step": 8344 + }, + { + "epoch": 1.5641986879100283, + "grad_norm": 48552.9765625, + "learning_rate": 6.289667089705852e-05, + "loss": 2.207, + "step": 8345 + }, + { + "epoch": 1.5643861293345829, + "grad_norm": 54639.171875, + "learning_rate": 6.288907871775753e-05, + "loss": 2.3215, + "step": 8346 + }, + { + "epoch": 1.5645735707591377, + "grad_norm": 50714.40234375, + "learning_rate": 6.288148622011302e-05, + "loss": 2.2448, + "step": 8347 + }, + { + "epoch": 1.5647610121836926, + "grad_norm": 54368.24609375, + "learning_rate": 6.28738934043125e-05, + "loss": 2.2394, + "step": 8348 + }, + { + "epoch": 1.5649484536082474, + "grad_norm": 58623.921875, + "learning_rate": 6.286630027054349e-05, + "loss": 2.1619, + "step": 8349 + }, + { + "epoch": 1.5651358950328023, + "grad_norm": 53746.1015625, + "learning_rate": 6.285870681899357e-05, + "loss": 2.2023, + "step": 8350 + }, + { + "epoch": 1.565323336457357, + "grad_norm": 53510.3046875, + "learning_rate": 6.285111304985024e-05, + "loss": 2.3509, + "step": 8351 + }, + { + "epoch": 1.565510777881912, + "grad_norm": 53834.9765625, + "learning_rate": 6.284351896330109e-05, + "loss": 2.2378, + "step": 8352 + }, + { + "epoch": 1.5656982193064666, + "grad_norm": 50675.9296875, + "learning_rate": 6.283592455953366e-05, + "loss": 2.177, + "step": 8353 + }, + { + "epoch": 1.5658856607310216, + "grad_norm": 54203.16796875, + "learning_rate": 6.282832983873555e-05, + "loss": 2.3227, + "step": 8354 + }, + { + "epoch": 1.5660731021555763, + "grad_norm": 53285.51953125, + "learning_rate": 6.282073480109433e-05, + "loss": 2.2651, + "step": 8355 + }, + { + "epoch": 1.5662605435801313, + "grad_norm": 54197.58984375, + "learning_rate": 6.281313944679757e-05, + "loss": 2.1836, + "step": 8356 + }, + { + "epoch": 1.566447985004686, + "grad_norm": 50815.62890625, + "learning_rate": 6.280554377603289e-05, + "loss": 2.2173, + "step": 8357 + }, + { + "epoch": 1.5666354264292408, + "grad_norm": 48298.078125, + "learning_rate": 6.279794778898787e-05, + "loss": 2.2474, + "step": 8358 + }, + { + "epoch": 1.5668228678537957, + "grad_norm": 50076.38671875, + "learning_rate": 6.279035148585012e-05, + "loss": 2.2612, + "step": 8359 + }, + { + "epoch": 1.5670103092783505, + "grad_norm": 50939.3671875, + "learning_rate": 6.27827548668073e-05, + "loss": 2.2566, + "step": 8360 + }, + { + "epoch": 1.5671977507029053, + "grad_norm": 53246.3828125, + "learning_rate": 6.2775157932047e-05, + "loss": 2.1552, + "step": 8361 + }, + { + "epoch": 1.5673851921274602, + "grad_norm": 49811.0078125, + "learning_rate": 6.276756068175686e-05, + "loss": 2.1851, + "step": 8362 + }, + { + "epoch": 1.567572633552015, + "grad_norm": 47736.44921875, + "learning_rate": 6.275996311612452e-05, + "loss": 2.2699, + "step": 8363 + }, + { + "epoch": 1.5677600749765697, + "grad_norm": 51221.14453125, + "learning_rate": 6.275236523533763e-05, + "loss": 2.3133, + "step": 8364 + }, + { + "epoch": 1.5679475164011247, + "grad_norm": 50179.8203125, + "learning_rate": 6.274476703958386e-05, + "loss": 2.2426, + "step": 8365 + }, + { + "epoch": 1.5681349578256794, + "grad_norm": 53087.203125, + "learning_rate": 6.273716852905087e-05, + "loss": 2.2128, + "step": 8366 + }, + { + "epoch": 1.5683223992502344, + "grad_norm": 51463.3984375, + "learning_rate": 6.272956970392633e-05, + "loss": 2.2134, + "step": 8367 + }, + { + "epoch": 1.568509840674789, + "grad_norm": 54908.57421875, + "learning_rate": 6.272197056439792e-05, + "loss": 2.2392, + "step": 8368 + }, + { + "epoch": 1.5686972820993441, + "grad_norm": 48960.3203125, + "learning_rate": 6.271437111065333e-05, + "loss": 2.2739, + "step": 8369 + }, + { + "epoch": 1.5688847235238987, + "grad_norm": 48787.12109375, + "learning_rate": 6.270677134288027e-05, + "loss": 2.2421, + "step": 8370 + }, + { + "epoch": 1.5690721649484536, + "grad_norm": 50512.16796875, + "learning_rate": 6.269917126126642e-05, + "loss": 2.1157, + "step": 8371 + }, + { + "epoch": 1.5692596063730084, + "grad_norm": 51908.26171875, + "learning_rate": 6.26915708659995e-05, + "loss": 2.1596, + "step": 8372 + }, + { + "epoch": 1.5694470477975633, + "grad_norm": 48549.4140625, + "learning_rate": 6.268397015726725e-05, + "loss": 2.2409, + "step": 8373 + }, + { + "epoch": 1.5696344892221181, + "grad_norm": 46031.8125, + "learning_rate": 6.267636913525736e-05, + "loss": 2.2197, + "step": 8374 + }, + { + "epoch": 1.569821930646673, + "grad_norm": 53675.0859375, + "learning_rate": 6.266876780015762e-05, + "loss": 2.2247, + "step": 8375 + }, + { + "epoch": 1.5700093720712278, + "grad_norm": 50930.88671875, + "learning_rate": 6.266116615215572e-05, + "loss": 2.2867, + "step": 8376 + }, + { + "epoch": 1.5701968134957824, + "grad_norm": 47512.04296875, + "learning_rate": 6.265356419143941e-05, + "loss": 2.299, + "step": 8377 + }, + { + "epoch": 1.5703842549203375, + "grad_norm": 50560.99609375, + "learning_rate": 6.264596191819651e-05, + "loss": 2.2715, + "step": 8378 + }, + { + "epoch": 1.5705716963448921, + "grad_norm": 52947.796875, + "learning_rate": 6.263835933261472e-05, + "loss": 2.3662, + "step": 8379 + }, + { + "epoch": 1.5707591377694472, + "grad_norm": 51169.4609375, + "learning_rate": 6.263075643488187e-05, + "loss": 2.2331, + "step": 8380 + }, + { + "epoch": 1.5709465791940018, + "grad_norm": 48092.8203125, + "learning_rate": 6.262315322518569e-05, + "loss": 2.2462, + "step": 8381 + }, + { + "epoch": 1.5711340206185567, + "grad_norm": 52820.12890625, + "learning_rate": 6.261554970371398e-05, + "loss": 2.2294, + "step": 8382 + }, + { + "epoch": 1.5713214620431115, + "grad_norm": 51556.078125, + "learning_rate": 6.260794587065457e-05, + "loss": 2.2047, + "step": 8383 + }, + { + "epoch": 1.5715089034676664, + "grad_norm": 60475.05859375, + "learning_rate": 6.260034172619524e-05, + "loss": 2.2307, + "step": 8384 + }, + { + "epoch": 1.5716963448922212, + "grad_norm": 53021.44921875, + "learning_rate": 6.25927372705238e-05, + "loss": 2.1913, + "step": 8385 + }, + { + "epoch": 1.571883786316776, + "grad_norm": 47833.85546875, + "learning_rate": 6.258513250382808e-05, + "loss": 2.2371, + "step": 8386 + }, + { + "epoch": 1.572071227741331, + "grad_norm": 53635.0546875, + "learning_rate": 6.25775274262959e-05, + "loss": 2.2797, + "step": 8387 + }, + { + "epoch": 1.5722586691658855, + "grad_norm": 52259.9140625, + "learning_rate": 6.256992203811509e-05, + "loss": 2.256, + "step": 8388 + }, + { + "epoch": 1.5724461105904406, + "grad_norm": 50823.3125, + "learning_rate": 6.256231633947353e-05, + "loss": 2.2112, + "step": 8389 + }, + { + "epoch": 1.5726335520149952, + "grad_norm": 53125.9453125, + "learning_rate": 6.255471033055902e-05, + "loss": 2.2179, + "step": 8390 + }, + { + "epoch": 1.5728209934395503, + "grad_norm": 47722.85546875, + "learning_rate": 6.254710401155945e-05, + "loss": 2.2362, + "step": 8391 + }, + { + "epoch": 1.573008434864105, + "grad_norm": 52546.28515625, + "learning_rate": 6.253949738266268e-05, + "loss": 2.1904, + "step": 8392 + }, + { + "epoch": 1.5731958762886598, + "grad_norm": 51085.625, + "learning_rate": 6.253189044405658e-05, + "loss": 2.1697, + "step": 8393 + }, + { + "epoch": 1.5733833177132146, + "grad_norm": 48941.890625, + "learning_rate": 6.252428319592903e-05, + "loss": 2.2372, + "step": 8394 + }, + { + "epoch": 1.5735707591377694, + "grad_norm": 56145.55859375, + "learning_rate": 6.251667563846794e-05, + "loss": 2.2121, + "step": 8395 + }, + { + "epoch": 1.5737582005623243, + "grad_norm": 51085.7734375, + "learning_rate": 6.250906777186117e-05, + "loss": 2.2421, + "step": 8396 + }, + { + "epoch": 1.5739456419868791, + "grad_norm": 52244.60546875, + "learning_rate": 6.250145959629665e-05, + "loss": 2.2121, + "step": 8397 + }, + { + "epoch": 1.574133083411434, + "grad_norm": 49321.95703125, + "learning_rate": 6.249385111196229e-05, + "loss": 2.2531, + "step": 8398 + }, + { + "epoch": 1.5743205248359886, + "grad_norm": 52082.05859375, + "learning_rate": 6.248624231904601e-05, + "loss": 2.1481, + "step": 8399 + }, + { + "epoch": 1.5745079662605437, + "grad_norm": 47115.4296875, + "learning_rate": 6.247863321773573e-05, + "loss": 2.2563, + "step": 8400 + }, + { + "epoch": 1.5746954076850983, + "grad_norm": 49821.2421875, + "learning_rate": 6.247102380821939e-05, + "loss": 2.1922, + "step": 8401 + }, + { + "epoch": 1.5748828491096534, + "grad_norm": 55774.390625, + "learning_rate": 6.246341409068492e-05, + "loss": 2.172, + "step": 8402 + }, + { + "epoch": 1.575070290534208, + "grad_norm": 53044.28515625, + "learning_rate": 6.245580406532029e-05, + "loss": 2.1723, + "step": 8403 + }, + { + "epoch": 1.5752577319587628, + "grad_norm": 54811.9609375, + "learning_rate": 6.244819373231344e-05, + "loss": 2.2354, + "step": 8404 + }, + { + "epoch": 1.5754451733833177, + "grad_norm": 51915.8046875, + "learning_rate": 6.244058309185235e-05, + "loss": 2.2696, + "step": 8405 + }, + { + "epoch": 1.5756326148078725, + "grad_norm": 49787.125, + "learning_rate": 6.243297214412501e-05, + "loss": 2.2372, + "step": 8406 + }, + { + "epoch": 1.5758200562324274, + "grad_norm": 52385.5546875, + "learning_rate": 6.242536088931932e-05, + "loss": 2.2339, + "step": 8407 + }, + { + "epoch": 1.5760074976569822, + "grad_norm": 53999.1953125, + "learning_rate": 6.241774932762338e-05, + "loss": 2.21, + "step": 8408 + }, + { + "epoch": 1.576194939081537, + "grad_norm": 52746.43359375, + "learning_rate": 6.241013745922512e-05, + "loss": 2.2126, + "step": 8409 + }, + { + "epoch": 1.5763823805060917, + "grad_norm": 52691.6328125, + "learning_rate": 6.240252528431254e-05, + "loss": 2.1648, + "step": 8410 + }, + { + "epoch": 1.5765698219306468, + "grad_norm": 50495.58203125, + "learning_rate": 6.239491280307369e-05, + "loss": 2.2176, + "step": 8411 + }, + { + "epoch": 1.5767572633552014, + "grad_norm": 53531.18359375, + "learning_rate": 6.238730001569653e-05, + "loss": 2.2265, + "step": 8412 + }, + { + "epoch": 1.5769447047797565, + "grad_norm": 51824.66015625, + "learning_rate": 6.237968692236915e-05, + "loss": 2.259, + "step": 8413 + }, + { + "epoch": 1.577132146204311, + "grad_norm": 49494.55078125, + "learning_rate": 6.237207352327952e-05, + "loss": 2.2133, + "step": 8414 + }, + { + "epoch": 1.577319587628866, + "grad_norm": 54145.765625, + "learning_rate": 6.236445981861575e-05, + "loss": 2.2494, + "step": 8415 + }, + { + "epoch": 1.5775070290534208, + "grad_norm": 49848.51171875, + "learning_rate": 6.235684580856583e-05, + "loss": 2.2725, + "step": 8416 + }, + { + "epoch": 1.5776944704779756, + "grad_norm": 53795.0546875, + "learning_rate": 6.234923149331785e-05, + "loss": 2.3045, + "step": 8417 + }, + { + "epoch": 1.5778819119025305, + "grad_norm": 53167.39453125, + "learning_rate": 6.234161687305985e-05, + "loss": 2.2578, + "step": 8418 + }, + { + "epoch": 1.5780693533270853, + "grad_norm": 51271.421875, + "learning_rate": 6.233400194797993e-05, + "loss": 2.2717, + "step": 8419 + }, + { + "epoch": 1.5782567947516402, + "grad_norm": 51250.75, + "learning_rate": 6.232638671826616e-05, + "loss": 2.2373, + "step": 8420 + }, + { + "epoch": 1.5784442361761948, + "grad_norm": 50395.4140625, + "learning_rate": 6.231877118410661e-05, + "loss": 2.2463, + "step": 8421 + }, + { + "epoch": 1.5786316776007498, + "grad_norm": 52640.12109375, + "learning_rate": 6.231115534568938e-05, + "loss": 2.21, + "step": 8422 + }, + { + "epoch": 1.5788191190253045, + "grad_norm": 51851.58203125, + "learning_rate": 6.230353920320257e-05, + "loss": 2.1851, + "step": 8423 + }, + { + "epoch": 1.5790065604498595, + "grad_norm": 52678.05078125, + "learning_rate": 6.229592275683431e-05, + "loss": 2.2589, + "step": 8424 + }, + { + "epoch": 1.5791940018744142, + "grad_norm": 49685.65234375, + "learning_rate": 6.228830600677268e-05, + "loss": 2.2446, + "step": 8425 + }, + { + "epoch": 1.5793814432989692, + "grad_norm": 51726.53125, + "learning_rate": 6.228068895320584e-05, + "loss": 2.2139, + "step": 8426 + }, + { + "epoch": 1.5795688847235239, + "grad_norm": 54411.5625, + "learning_rate": 6.22730715963219e-05, + "loss": 2.2116, + "step": 8427 + }, + { + "epoch": 1.5797563261480787, + "grad_norm": 54431.47265625, + "learning_rate": 6.226545393630901e-05, + "loss": 2.2561, + "step": 8428 + }, + { + "epoch": 1.5799437675726336, + "grad_norm": 48698.94140625, + "learning_rate": 6.225783597335529e-05, + "loss": 2.1896, + "step": 8429 + }, + { + "epoch": 1.5801312089971884, + "grad_norm": 54053.140625, + "learning_rate": 6.225021770764893e-05, + "loss": 2.2008, + "step": 8430 + }, + { + "epoch": 1.5803186504217432, + "grad_norm": 49464.1484375, + "learning_rate": 6.224259913937806e-05, + "loss": 2.2273, + "step": 8431 + }, + { + "epoch": 1.5805060918462979, + "grad_norm": 54065.77734375, + "learning_rate": 6.223498026873088e-05, + "loss": 2.2279, + "step": 8432 + }, + { + "epoch": 1.580693533270853, + "grad_norm": 53859.4765625, + "learning_rate": 6.222736109589555e-05, + "loss": 2.1922, + "step": 8433 + }, + { + "epoch": 1.5808809746954076, + "grad_norm": 51795.03125, + "learning_rate": 6.221974162106024e-05, + "loss": 2.2068, + "step": 8434 + }, + { + "epoch": 1.5810684161199626, + "grad_norm": 46668.0625, + "learning_rate": 6.221212184441316e-05, + "loss": 2.2689, + "step": 8435 + }, + { + "epoch": 1.5812558575445173, + "grad_norm": 52853.7578125, + "learning_rate": 6.220450176614251e-05, + "loss": 2.172, + "step": 8436 + }, + { + "epoch": 1.5814432989690723, + "grad_norm": 57812.796875, + "learning_rate": 6.219688138643649e-05, + "loss": 2.2338, + "step": 8437 + }, + { + "epoch": 1.581630740393627, + "grad_norm": 48307.3984375, + "learning_rate": 6.218926070548332e-05, + "loss": 2.2702, + "step": 8438 + }, + { + "epoch": 1.5818181818181818, + "grad_norm": 56694.703125, + "learning_rate": 6.218163972347121e-05, + "loss": 2.264, + "step": 8439 + }, + { + "epoch": 1.5820056232427366, + "grad_norm": 50508.8828125, + "learning_rate": 6.217401844058838e-05, + "loss": 2.2093, + "step": 8440 + }, + { + "epoch": 1.5821930646672915, + "grad_norm": 48760.6171875, + "learning_rate": 6.21663968570231e-05, + "loss": 2.2625, + "step": 8441 + }, + { + "epoch": 1.5823805060918463, + "grad_norm": 51664.23046875, + "learning_rate": 6.215877497296357e-05, + "loss": 2.2513, + "step": 8442 + }, + { + "epoch": 1.5825679475164012, + "grad_norm": 49894.9921875, + "learning_rate": 6.215115278859808e-05, + "loss": 2.2484, + "step": 8443 + }, + { + "epoch": 1.582755388940956, + "grad_norm": 51726.04296875, + "learning_rate": 6.214353030411485e-05, + "loss": 2.257, + "step": 8444 + }, + { + "epoch": 1.5829428303655106, + "grad_norm": 47498.04296875, + "learning_rate": 6.213590751970217e-05, + "loss": 2.2529, + "step": 8445 + }, + { + "epoch": 1.5831302717900657, + "grad_norm": 55146.47265625, + "learning_rate": 6.212828443554833e-05, + "loss": 2.2143, + "step": 8446 + }, + { + "epoch": 1.5833177132146203, + "grad_norm": 54842.2578125, + "learning_rate": 6.212066105184157e-05, + "loss": 2.164, + "step": 8447 + }, + { + "epoch": 1.5835051546391754, + "grad_norm": 50939.1015625, + "learning_rate": 6.21130373687702e-05, + "loss": 2.2377, + "step": 8448 + }, + { + "epoch": 1.58369259606373, + "grad_norm": 52582.16015625, + "learning_rate": 6.21054133865225e-05, + "loss": 2.1591, + "step": 8449 + }, + { + "epoch": 1.5838800374882849, + "grad_norm": 59734.828125, + "learning_rate": 6.20977891052868e-05, + "loss": 2.2914, + "step": 8450 + }, + { + "epoch": 1.5840674789128397, + "grad_norm": 55191.875, + "learning_rate": 6.20901645252514e-05, + "loss": 2.2373, + "step": 8451 + }, + { + "epoch": 1.5842549203373946, + "grad_norm": 47775.54296875, + "learning_rate": 6.208253964660459e-05, + "loss": 2.2448, + "step": 8452 + }, + { + "epoch": 1.5844423617619494, + "grad_norm": 48359.03515625, + "learning_rate": 6.207491446953474e-05, + "loss": 2.3195, + "step": 8453 + }, + { + "epoch": 1.5846298031865043, + "grad_norm": 47897.26953125, + "learning_rate": 6.206728899423014e-05, + "loss": 2.2484, + "step": 8454 + }, + { + "epoch": 1.584817244611059, + "grad_norm": 52898.234375, + "learning_rate": 6.205966322087915e-05, + "loss": 2.1847, + "step": 8455 + }, + { + "epoch": 1.5850046860356137, + "grad_norm": 52490.6328125, + "learning_rate": 6.205203714967012e-05, + "loss": 2.2165, + "step": 8456 + }, + { + "epoch": 1.5851921274601688, + "grad_norm": 51904.65234375, + "learning_rate": 6.20444107807914e-05, + "loss": 2.1337, + "step": 8457 + }, + { + "epoch": 1.5853795688847234, + "grad_norm": 51475.6015625, + "learning_rate": 6.203678411443134e-05, + "loss": 2.2378, + "step": 8458 + }, + { + "epoch": 1.5855670103092785, + "grad_norm": 55564.82421875, + "learning_rate": 6.202915715077832e-05, + "loss": 2.1959, + "step": 8459 + }, + { + "epoch": 1.5857544517338331, + "grad_norm": 51815.03515625, + "learning_rate": 6.202152989002072e-05, + "loss": 2.2691, + "step": 8460 + }, + { + "epoch": 1.585941893158388, + "grad_norm": 52440.6015625, + "learning_rate": 6.201390233234692e-05, + "loss": 2.1803, + "step": 8461 + }, + { + "epoch": 1.5861293345829428, + "grad_norm": 52102.609375, + "learning_rate": 6.20062744779453e-05, + "loss": 2.228, + "step": 8462 + }, + { + "epoch": 1.5863167760074977, + "grad_norm": 55518.73046875, + "learning_rate": 6.199864632700427e-05, + "loss": 2.2109, + "step": 8463 + }, + { + "epoch": 1.5865042174320525, + "grad_norm": 55799.23046875, + "learning_rate": 6.199101787971225e-05, + "loss": 2.1399, + "step": 8464 + }, + { + "epoch": 1.5866916588566073, + "grad_norm": 51141.77734375, + "learning_rate": 6.19833891362576e-05, + "loss": 2.2916, + "step": 8465 + }, + { + "epoch": 1.5868791002811622, + "grad_norm": 48106.96484375, + "learning_rate": 6.197576009682881e-05, + "loss": 2.2464, + "step": 8466 + }, + { + "epoch": 1.5870665417057168, + "grad_norm": 53877.76171875, + "learning_rate": 6.196813076161425e-05, + "loss": 2.2186, + "step": 8467 + }, + { + "epoch": 1.5872539831302719, + "grad_norm": 50497.73828125, + "learning_rate": 6.196050113080238e-05, + "loss": 2.1979, + "step": 8468 + }, + { + "epoch": 1.5874414245548265, + "grad_norm": 53321.17578125, + "learning_rate": 6.195287120458164e-05, + "loss": 2.2074, + "step": 8469 + }, + { + "epoch": 1.5876288659793816, + "grad_norm": 57825.9375, + "learning_rate": 6.194524098314048e-05, + "loss": 2.2026, + "step": 8470 + }, + { + "epoch": 1.5878163074039362, + "grad_norm": 49128.1484375, + "learning_rate": 6.193761046666736e-05, + "loss": 2.2494, + "step": 8471 + }, + { + "epoch": 1.588003748828491, + "grad_norm": 52222.984375, + "learning_rate": 6.192997965535072e-05, + "loss": 2.2263, + "step": 8472 + }, + { + "epoch": 1.588191190253046, + "grad_norm": 50588.046875, + "learning_rate": 6.192234854937908e-05, + "loss": 2.1393, + "step": 8473 + }, + { + "epoch": 1.5883786316776007, + "grad_norm": 49999.046875, + "learning_rate": 6.191471714894088e-05, + "loss": 2.3063, + "step": 8474 + }, + { + "epoch": 1.5885660731021556, + "grad_norm": 48074.359375, + "learning_rate": 6.19070854542246e-05, + "loss": 2.253, + "step": 8475 + }, + { + "epoch": 1.5887535145267104, + "grad_norm": 50232.125, + "learning_rate": 6.189945346541875e-05, + "loss": 2.1677, + "step": 8476 + }, + { + "epoch": 1.5889409559512653, + "grad_norm": 49859.41796875, + "learning_rate": 6.189182118271183e-05, + "loss": 2.2177, + "step": 8477 + }, + { + "epoch": 1.58912839737582, + "grad_norm": 59088.9921875, + "learning_rate": 6.188418860629235e-05, + "loss": 2.2347, + "step": 8478 + }, + { + "epoch": 1.589315838800375, + "grad_norm": 52997.6171875, + "learning_rate": 6.187655573634881e-05, + "loss": 2.1689, + "step": 8479 + }, + { + "epoch": 1.5895032802249296, + "grad_norm": 49898.2734375, + "learning_rate": 6.186892257306975e-05, + "loss": 2.2418, + "step": 8480 + }, + { + "epoch": 1.5896907216494847, + "grad_norm": 48621.15234375, + "learning_rate": 6.186128911664368e-05, + "loss": 2.2247, + "step": 8481 + }, + { + "epoch": 1.5898781630740393, + "grad_norm": 49827.8828125, + "learning_rate": 6.185365536725915e-05, + "loss": 2.2617, + "step": 8482 + }, + { + "epoch": 1.5900656044985944, + "grad_norm": 47702.29296875, + "learning_rate": 6.184602132510472e-05, + "loss": 2.2467, + "step": 8483 + }, + { + "epoch": 1.590253045923149, + "grad_norm": 60497.86328125, + "learning_rate": 6.183838699036891e-05, + "loss": 2.2262, + "step": 8484 + }, + { + "epoch": 1.5904404873477038, + "grad_norm": 53839.31640625, + "learning_rate": 6.183075236324028e-05, + "loss": 2.2181, + "step": 8485 + }, + { + "epoch": 1.5906279287722587, + "grad_norm": 51262.34765625, + "learning_rate": 6.182311744390742e-05, + "loss": 2.2495, + "step": 8486 + }, + { + "epoch": 1.5908153701968135, + "grad_norm": 50469.98828125, + "learning_rate": 6.181548223255886e-05, + "loss": 2.2651, + "step": 8487 + }, + { + "epoch": 1.5910028116213684, + "grad_norm": 50380.2578125, + "learning_rate": 6.180784672938325e-05, + "loss": 2.1602, + "step": 8488 + }, + { + "epoch": 1.591190253045923, + "grad_norm": 51164.59375, + "learning_rate": 6.180021093456911e-05, + "loss": 2.1789, + "step": 8489 + }, + { + "epoch": 1.591377694470478, + "grad_norm": 51583.05859375, + "learning_rate": 6.179257484830506e-05, + "loss": 2.2257, + "step": 8490 + }, + { + "epoch": 1.5915651358950327, + "grad_norm": 52822.37890625, + "learning_rate": 6.17849384707797e-05, + "loss": 2.2341, + "step": 8491 + }, + { + "epoch": 1.5917525773195877, + "grad_norm": 47970.51953125, + "learning_rate": 6.177730180218164e-05, + "loss": 2.2358, + "step": 8492 + }, + { + "epoch": 1.5919400187441424, + "grad_norm": 48563.75, + "learning_rate": 6.176966484269949e-05, + "loss": 2.232, + "step": 8493 + }, + { + "epoch": 1.5921274601686974, + "grad_norm": 50852.5, + "learning_rate": 6.17620275925219e-05, + "loss": 2.253, + "step": 8494 + }, + { + "epoch": 1.592314901593252, + "grad_norm": 53849.171875, + "learning_rate": 6.175439005183746e-05, + "loss": 2.2402, + "step": 8495 + }, + { + "epoch": 1.592502343017807, + "grad_norm": 48701.16796875, + "learning_rate": 6.174675222083483e-05, + "loss": 2.2559, + "step": 8496 + }, + { + "epoch": 1.5926897844423618, + "grad_norm": 51153.1015625, + "learning_rate": 6.173911409970265e-05, + "loss": 2.2478, + "step": 8497 + }, + { + "epoch": 1.5928772258669166, + "grad_norm": 55045.48046875, + "learning_rate": 6.173147568862957e-05, + "loss": 2.2945, + "step": 8498 + }, + { + "epoch": 1.5930646672914714, + "grad_norm": 50341.20703125, + "learning_rate": 6.172383698780427e-05, + "loss": 2.2768, + "step": 8499 + }, + { + "epoch": 1.5932521087160263, + "grad_norm": 46064.734375, + "learning_rate": 6.171619799741536e-05, + "loss": 2.2128, + "step": 8500 + }, + { + "epoch": 1.5932521087160263, + "eval_loss": 2.2929718494415283, + "eval_runtime": 130.2182, + "eval_samples_per_second": 38.773, + "eval_steps_per_second": 1.943, + "step": 8500 + }, + { + "epoch": 1.5934395501405811, + "grad_norm": 54219.74609375, + "learning_rate": 6.170855871765157e-05, + "loss": 2.2162, + "step": 8501 + }, + { + "epoch": 1.5936269915651358, + "grad_norm": 50264.9375, + "learning_rate": 6.170091914870155e-05, + "loss": 2.1448, + "step": 8502 + }, + { + "epoch": 1.5938144329896908, + "grad_norm": 53039.69140625, + "learning_rate": 6.169327929075401e-05, + "loss": 2.2366, + "step": 8503 + }, + { + "epoch": 1.5940018744142455, + "grad_norm": 51744.21875, + "learning_rate": 6.168563914399763e-05, + "loss": 2.236, + "step": 8504 + }, + { + "epoch": 1.5941893158388005, + "grad_norm": 48705.37109375, + "learning_rate": 6.16779987086211e-05, + "loss": 2.1452, + "step": 8505 + }, + { + "epoch": 1.5943767572633551, + "grad_norm": 47238.35546875, + "learning_rate": 6.167035798481316e-05, + "loss": 2.1928, + "step": 8506 + }, + { + "epoch": 1.59456419868791, + "grad_norm": 53335.4140625, + "learning_rate": 6.16627169727625e-05, + "loss": 2.1951, + "step": 8507 + }, + { + "epoch": 1.5947516401124648, + "grad_norm": 60062.4609375, + "learning_rate": 6.165507567265787e-05, + "loss": 2.1727, + "step": 8508 + }, + { + "epoch": 1.5949390815370197, + "grad_norm": 52446.125, + "learning_rate": 6.164743408468795e-05, + "loss": 2.219, + "step": 8509 + }, + { + "epoch": 1.5951265229615745, + "grad_norm": 51222.33984375, + "learning_rate": 6.163979220904154e-05, + "loss": 2.2905, + "step": 8510 + }, + { + "epoch": 1.5953139643861294, + "grad_norm": 55662.59375, + "learning_rate": 6.163215004590733e-05, + "loss": 2.229, + "step": 8511 + }, + { + "epoch": 1.5955014058106842, + "grad_norm": 50332.85546875, + "learning_rate": 6.162450759547411e-05, + "loss": 2.296, + "step": 8512 + }, + { + "epoch": 1.5956888472352388, + "grad_norm": 54649.28515625, + "learning_rate": 6.16168648579306e-05, + "loss": 2.2125, + "step": 8513 + }, + { + "epoch": 1.595876288659794, + "grad_norm": 58619.27734375, + "learning_rate": 6.160922183346563e-05, + "loss": 2.2658, + "step": 8514 + }, + { + "epoch": 1.5960637300843485, + "grad_norm": 50486.234375, + "learning_rate": 6.160157852226789e-05, + "loss": 2.2121, + "step": 8515 + }, + { + "epoch": 1.5962511715089036, + "grad_norm": 56399.3671875, + "learning_rate": 6.159393492452623e-05, + "loss": 2.1872, + "step": 8516 + }, + { + "epoch": 1.5964386129334582, + "grad_norm": 54312.2734375, + "learning_rate": 6.15862910404294e-05, + "loss": 2.2585, + "step": 8517 + }, + { + "epoch": 1.596626054358013, + "grad_norm": 51378.8515625, + "learning_rate": 6.157864687016621e-05, + "loss": 2.2706, + "step": 8518 + }, + { + "epoch": 1.596813495782568, + "grad_norm": 51990.1875, + "learning_rate": 6.157100241392543e-05, + "loss": 2.2251, + "step": 8519 + }, + { + "epoch": 1.5970009372071228, + "grad_norm": 52056.546875, + "learning_rate": 6.156335767189593e-05, + "loss": 2.3477, + "step": 8520 + }, + { + "epoch": 1.5971883786316776, + "grad_norm": 49719.5859375, + "learning_rate": 6.155571264426646e-05, + "loss": 2.2551, + "step": 8521 + }, + { + "epoch": 1.5973758200562325, + "grad_norm": 49252.09765625, + "learning_rate": 6.154806733122588e-05, + "loss": 2.2453, + "step": 8522 + }, + { + "epoch": 1.5975632614807873, + "grad_norm": 48494.28125, + "learning_rate": 6.154042173296302e-05, + "loss": 2.2094, + "step": 8523 + }, + { + "epoch": 1.597750702905342, + "grad_norm": 55847.23828125, + "learning_rate": 6.153277584966668e-05, + "loss": 2.2464, + "step": 8524 + }, + { + "epoch": 1.597938144329897, + "grad_norm": 50149.94140625, + "learning_rate": 6.152512968152575e-05, + "loss": 2.2741, + "step": 8525 + }, + { + "epoch": 1.5981255857544516, + "grad_norm": 51755.39453125, + "learning_rate": 6.151748322872905e-05, + "loss": 2.2739, + "step": 8526 + }, + { + "epoch": 1.5983130271790067, + "grad_norm": 47288.62890625, + "learning_rate": 6.150983649146545e-05, + "loss": 2.1912, + "step": 8527 + }, + { + "epoch": 1.5985004686035613, + "grad_norm": 52322.30078125, + "learning_rate": 6.15021894699238e-05, + "loss": 2.1635, + "step": 8528 + }, + { + "epoch": 1.5986879100281162, + "grad_norm": 48743.921875, + "learning_rate": 6.1494542164293e-05, + "loss": 2.2895, + "step": 8529 + }, + { + "epoch": 1.598875351452671, + "grad_norm": 52034.73828125, + "learning_rate": 6.148689457476188e-05, + "loss": 2.3346, + "step": 8530 + }, + { + "epoch": 1.5990627928772259, + "grad_norm": 54529.8515625, + "learning_rate": 6.147924670151941e-05, + "loss": 2.2313, + "step": 8531 + }, + { + "epoch": 1.5992502343017807, + "grad_norm": 51058.0625, + "learning_rate": 6.147159854475439e-05, + "loss": 2.2183, + "step": 8532 + }, + { + "epoch": 1.5994376757263356, + "grad_norm": 49843.45703125, + "learning_rate": 6.146395010465578e-05, + "loss": 2.1837, + "step": 8533 + }, + { + "epoch": 1.5996251171508904, + "grad_norm": 49008.546875, + "learning_rate": 6.145630138141245e-05, + "loss": 2.2044, + "step": 8534 + }, + { + "epoch": 1.599812558575445, + "grad_norm": 51346.89453125, + "learning_rate": 6.144865237521334e-05, + "loss": 2.2455, + "step": 8535 + }, + { + "epoch": 1.6, + "grad_norm": 49554.73046875, + "learning_rate": 6.144100308624735e-05, + "loss": 2.3141, + "step": 8536 + }, + { + "epoch": 1.6001874414245547, + "grad_norm": 55850.0859375, + "learning_rate": 6.143335351470342e-05, + "loss": 2.1367, + "step": 8537 + }, + { + "epoch": 1.6003748828491098, + "grad_norm": 51275.28515625, + "learning_rate": 6.142570366077048e-05, + "loss": 2.2198, + "step": 8538 + }, + { + "epoch": 1.6005623242736644, + "grad_norm": 53120.5234375, + "learning_rate": 6.141805352463748e-05, + "loss": 2.2274, + "step": 8539 + }, + { + "epoch": 1.6007497656982193, + "grad_norm": 50022.84765625, + "learning_rate": 6.141040310649336e-05, + "loss": 2.2498, + "step": 8540 + }, + { + "epoch": 1.600937207122774, + "grad_norm": 51492.453125, + "learning_rate": 6.140275240652706e-05, + "loss": 2.2338, + "step": 8541 + }, + { + "epoch": 1.601124648547329, + "grad_norm": 48406.953125, + "learning_rate": 6.139510142492758e-05, + "loss": 2.2103, + "step": 8542 + }, + { + "epoch": 1.6013120899718838, + "grad_norm": 53490.11328125, + "learning_rate": 6.138745016188386e-05, + "loss": 2.2292, + "step": 8543 + }, + { + "epoch": 1.6014995313964386, + "grad_norm": 49190.47265625, + "learning_rate": 6.13797986175849e-05, + "loss": 2.2214, + "step": 8544 + }, + { + "epoch": 1.6016869728209935, + "grad_norm": 48977.56640625, + "learning_rate": 6.137214679221965e-05, + "loss": 2.2215, + "step": 8545 + }, + { + "epoch": 1.601874414245548, + "grad_norm": 53957.08984375, + "learning_rate": 6.136449468597714e-05, + "loss": 2.176, + "step": 8546 + }, + { + "epoch": 1.6020618556701032, + "grad_norm": 51304.05859375, + "learning_rate": 6.135684229904632e-05, + "loss": 2.2371, + "step": 8547 + }, + { + "epoch": 1.6022492970946578, + "grad_norm": 50512.4921875, + "learning_rate": 6.134918963161623e-05, + "loss": 2.1911, + "step": 8548 + }, + { + "epoch": 1.6024367385192129, + "grad_norm": 51451.2578125, + "learning_rate": 6.134153668387587e-05, + "loss": 2.1841, + "step": 8549 + }, + { + "epoch": 1.6026241799437675, + "grad_norm": 58344.49609375, + "learning_rate": 6.133388345601425e-05, + "loss": 2.2237, + "step": 8550 + }, + { + "epoch": 1.6028116213683226, + "grad_norm": 48158.4765625, + "learning_rate": 6.132622994822042e-05, + "loss": 2.2165, + "step": 8551 + }, + { + "epoch": 1.6029990627928772, + "grad_norm": 54413.15625, + "learning_rate": 6.13185761606834e-05, + "loss": 2.2067, + "step": 8552 + }, + { + "epoch": 1.603186504217432, + "grad_norm": 50974.578125, + "learning_rate": 6.13109220935922e-05, + "loss": 2.1661, + "step": 8553 + }, + { + "epoch": 1.6033739456419869, + "grad_norm": 51661.5859375, + "learning_rate": 6.130326774713591e-05, + "loss": 2.315, + "step": 8554 + }, + { + "epoch": 1.6035613870665417, + "grad_norm": 49874.08203125, + "learning_rate": 6.129561312150355e-05, + "loss": 2.1901, + "step": 8555 + }, + { + "epoch": 1.6037488284910966, + "grad_norm": 50026.32421875, + "learning_rate": 6.128795821688419e-05, + "loss": 2.2022, + "step": 8556 + }, + { + "epoch": 1.6039362699156514, + "grad_norm": 49599.359375, + "learning_rate": 6.12803030334669e-05, + "loss": 2.18, + "step": 8557 + }, + { + "epoch": 1.6041237113402063, + "grad_norm": 50152.03125, + "learning_rate": 6.127264757144074e-05, + "loss": 2.2205, + "step": 8558 + }, + { + "epoch": 1.6043111527647609, + "grad_norm": 50718.6015625, + "learning_rate": 6.126499183099482e-05, + "loss": 2.2306, + "step": 8559 + }, + { + "epoch": 1.604498594189316, + "grad_norm": 49667.33984375, + "learning_rate": 6.125733581231818e-05, + "loss": 2.2404, + "step": 8560 + }, + { + "epoch": 1.6046860356138706, + "grad_norm": 54958.14453125, + "learning_rate": 6.124967951559998e-05, + "loss": 2.2508, + "step": 8561 + }, + { + "epoch": 1.6048734770384256, + "grad_norm": 51712.66796875, + "learning_rate": 6.124202294102925e-05, + "loss": 2.2142, + "step": 8562 + }, + { + "epoch": 1.6050609184629803, + "grad_norm": 51532.45703125, + "learning_rate": 6.123436608879513e-05, + "loss": 2.2888, + "step": 8563 + }, + { + "epoch": 1.6052483598875351, + "grad_norm": 51269.0546875, + "learning_rate": 6.122670895908675e-05, + "loss": 2.2266, + "step": 8564 + }, + { + "epoch": 1.60543580131209, + "grad_norm": 51049.64453125, + "learning_rate": 6.12190515520932e-05, + "loss": 2.1947, + "step": 8565 + }, + { + "epoch": 1.6056232427366448, + "grad_norm": 54411.80859375, + "learning_rate": 6.121139386800364e-05, + "loss": 2.2294, + "step": 8566 + }, + { + "epoch": 1.6058106841611997, + "grad_norm": 50738.35546875, + "learning_rate": 6.120373590700717e-05, + "loss": 2.2036, + "step": 8567 + }, + { + "epoch": 1.6059981255857545, + "grad_norm": 49151.20703125, + "learning_rate": 6.119607766929296e-05, + "loss": 2.2616, + "step": 8568 + }, + { + "epoch": 1.6061855670103093, + "grad_norm": 51122.046875, + "learning_rate": 6.118841915505013e-05, + "loss": 2.2317, + "step": 8569 + }, + { + "epoch": 1.606373008434864, + "grad_norm": 47142.42578125, + "learning_rate": 6.118076036446787e-05, + "loss": 2.291, + "step": 8570 + }, + { + "epoch": 1.606560449859419, + "grad_norm": 51078.41015625, + "learning_rate": 6.117310129773531e-05, + "loss": 2.2374, + "step": 8571 + }, + { + "epoch": 1.6067478912839737, + "grad_norm": 50995.41015625, + "learning_rate": 6.116544195504166e-05, + "loss": 2.1759, + "step": 8572 + }, + { + "epoch": 1.6069353327085287, + "grad_norm": 51805.3125, + "learning_rate": 6.115778233657604e-05, + "loss": 2.2144, + "step": 8573 + }, + { + "epoch": 1.6071227741330834, + "grad_norm": 53369.6796875, + "learning_rate": 6.11501224425277e-05, + "loss": 2.2439, + "step": 8574 + }, + { + "epoch": 1.6073102155576382, + "grad_norm": 54908.86328125, + "learning_rate": 6.114246227308576e-05, + "loss": 2.1865, + "step": 8575 + }, + { + "epoch": 1.607497656982193, + "grad_norm": 55414.37109375, + "learning_rate": 6.113480182843945e-05, + "loss": 2.2596, + "step": 8576 + }, + { + "epoch": 1.607685098406748, + "grad_norm": 48296.65625, + "learning_rate": 6.112714110877797e-05, + "loss": 2.2291, + "step": 8577 + }, + { + "epoch": 1.6078725398313027, + "grad_norm": 51588.515625, + "learning_rate": 6.111948011429054e-05, + "loss": 2.2129, + "step": 8578 + }, + { + "epoch": 1.6080599812558576, + "grad_norm": 47006.0625, + "learning_rate": 6.111181884516637e-05, + "loss": 2.1878, + "step": 8579 + }, + { + "epoch": 1.6082474226804124, + "grad_norm": 51722.2734375, + "learning_rate": 6.110415730159467e-05, + "loss": 2.2329, + "step": 8580 + }, + { + "epoch": 1.608434864104967, + "grad_norm": 49795.71875, + "learning_rate": 6.109649548376471e-05, + "loss": 2.3177, + "step": 8581 + }, + { + "epoch": 1.6086223055295221, + "grad_norm": 54521.71875, + "learning_rate": 6.108883339186567e-05, + "loss": 2.2669, + "step": 8582 + }, + { + "epoch": 1.6088097469540767, + "grad_norm": 53265.51953125, + "learning_rate": 6.108117102608684e-05, + "loss": 2.2221, + "step": 8583 + }, + { + "epoch": 1.6089971883786318, + "grad_norm": 55592.65625, + "learning_rate": 6.107350838661743e-05, + "loss": 2.1736, + "step": 8584 + }, + { + "epoch": 1.6091846298031864, + "grad_norm": 51314.765625, + "learning_rate": 6.106584547364673e-05, + "loss": 2.2422, + "step": 8585 + }, + { + "epoch": 1.6093720712277413, + "grad_norm": 48588.00390625, + "learning_rate": 6.105818228736399e-05, + "loss": 2.2768, + "step": 8586 + }, + { + "epoch": 1.6095595126522961, + "grad_norm": 58082.5390625, + "learning_rate": 6.105051882795851e-05, + "loss": 2.1494, + "step": 8587 + }, + { + "epoch": 1.609746954076851, + "grad_norm": 49708.99609375, + "learning_rate": 6.10428550956195e-05, + "loss": 2.1891, + "step": 8588 + }, + { + "epoch": 1.6099343955014058, + "grad_norm": 52557.13671875, + "learning_rate": 6.103519109053633e-05, + "loss": 2.1805, + "step": 8589 + }, + { + "epoch": 1.6101218369259607, + "grad_norm": 51632.50390625, + "learning_rate": 6.102752681289823e-05, + "loss": 2.1851, + "step": 8590 + }, + { + "epoch": 1.6103092783505155, + "grad_norm": 51031.484375, + "learning_rate": 6.10198622628945e-05, + "loss": 2.2339, + "step": 8591 + }, + { + "epoch": 1.6104967197750701, + "grad_norm": 46546.8828125, + "learning_rate": 6.101219744071448e-05, + "loss": 2.2446, + "step": 8592 + }, + { + "epoch": 1.6106841611996252, + "grad_norm": 54395.06640625, + "learning_rate": 6.1004532346547446e-05, + "loss": 2.2657, + "step": 8593 + }, + { + "epoch": 1.6108716026241798, + "grad_norm": 55863.796875, + "learning_rate": 6.099686698058275e-05, + "loss": 2.3042, + "step": 8594 + }, + { + "epoch": 1.611059044048735, + "grad_norm": 53949.1328125, + "learning_rate": 6.098920134300967e-05, + "loss": 2.2452, + "step": 8595 + }, + { + "epoch": 1.6112464854732895, + "grad_norm": 48496.51953125, + "learning_rate": 6.098153543401759e-05, + "loss": 2.2369, + "step": 8596 + }, + { + "epoch": 1.6114339268978444, + "grad_norm": 47567.94921875, + "learning_rate": 6.09738692537958e-05, + "loss": 2.2456, + "step": 8597 + }, + { + "epoch": 1.6116213683223992, + "grad_norm": 49818.9609375, + "learning_rate": 6.0966202802533676e-05, + "loss": 2.2435, + "step": 8598 + }, + { + "epoch": 1.611808809746954, + "grad_norm": 52284.625, + "learning_rate": 6.095853608042057e-05, + "loss": 2.255, + "step": 8599 + }, + { + "epoch": 1.611996251171509, + "grad_norm": 50026.31640625, + "learning_rate": 6.095086908764582e-05, + "loss": 2.2785, + "step": 8600 + }, + { + "epoch": 1.6121836925960638, + "grad_norm": 47262.35546875, + "learning_rate": 6.094320182439881e-05, + "loss": 2.2227, + "step": 8601 + }, + { + "epoch": 1.6123711340206186, + "grad_norm": 48085.87890625, + "learning_rate": 6.09355342908689e-05, + "loss": 2.1615, + "step": 8602 + }, + { + "epoch": 1.6125585754451732, + "grad_norm": 50994.203125, + "learning_rate": 6.092786648724547e-05, + "loss": 2.1188, + "step": 8603 + }, + { + "epoch": 1.6127460168697283, + "grad_norm": 48138.859375, + "learning_rate": 6.0920198413717896e-05, + "loss": 2.1886, + "step": 8604 + }, + { + "epoch": 1.612933458294283, + "grad_norm": 47827.625, + "learning_rate": 6.091253007047559e-05, + "loss": 2.1741, + "step": 8605 + }, + { + "epoch": 1.613120899718838, + "grad_norm": 52772.28515625, + "learning_rate": 6.090486145770794e-05, + "loss": 2.2788, + "step": 8606 + }, + { + "epoch": 1.6133083411433926, + "grad_norm": 54876.3671875, + "learning_rate": 6.0897192575604365e-05, + "loss": 2.2318, + "step": 8607 + }, + { + "epoch": 1.6134957825679477, + "grad_norm": 50054.5859375, + "learning_rate": 6.088952342435423e-05, + "loss": 2.2249, + "step": 8608 + }, + { + "epoch": 1.6136832239925023, + "grad_norm": 48128.87890625, + "learning_rate": 6.088185400414701e-05, + "loss": 2.1752, + "step": 8609 + }, + { + "epoch": 1.6138706654170571, + "grad_norm": 49664.69140625, + "learning_rate": 6.08741843151721e-05, + "loss": 2.2405, + "step": 8610 + }, + { + "epoch": 1.614058106841612, + "grad_norm": 51470.1640625, + "learning_rate": 6.086651435761892e-05, + "loss": 2.2817, + "step": 8611 + }, + { + "epoch": 1.6142455482661668, + "grad_norm": 51516.12890625, + "learning_rate": 6.085884413167694e-05, + "loss": 2.2043, + "step": 8612 + }, + { + "epoch": 1.6144329896907217, + "grad_norm": 51022.91015625, + "learning_rate": 6.085117363753559e-05, + "loss": 2.2853, + "step": 8613 + }, + { + "epoch": 1.6146204311152763, + "grad_norm": 48623.83203125, + "learning_rate": 6.0843502875384305e-05, + "loss": 2.2641, + "step": 8614 + }, + { + "epoch": 1.6148078725398314, + "grad_norm": 51350.56640625, + "learning_rate": 6.083583184541256e-05, + "loss": 2.2362, + "step": 8615 + }, + { + "epoch": 1.614995313964386, + "grad_norm": 51947.66796875, + "learning_rate": 6.082816054780983e-05, + "loss": 2.2317, + "step": 8616 + }, + { + "epoch": 1.615182755388941, + "grad_norm": 49333.96484375, + "learning_rate": 6.082048898276558e-05, + "loss": 2.1397, + "step": 8617 + }, + { + "epoch": 1.6153701968134957, + "grad_norm": 49052.6953125, + "learning_rate": 6.0812817150469245e-05, + "loss": 2.2272, + "step": 8618 + }, + { + "epoch": 1.6155576382380508, + "grad_norm": 47678.609375, + "learning_rate": 6.0805145051110376e-05, + "loss": 2.216, + "step": 8619 + }, + { + "epoch": 1.6157450796626054, + "grad_norm": 52683.4453125, + "learning_rate": 6.079747268487843e-05, + "loss": 2.2089, + "step": 8620 + }, + { + "epoch": 1.6159325210871602, + "grad_norm": 49701.32421875, + "learning_rate": 6.078980005196291e-05, + "loss": 2.2156, + "step": 8621 + }, + { + "epoch": 1.616119962511715, + "grad_norm": 46536.0546875, + "learning_rate": 6.078212715255331e-05, + "loss": 2.2467, + "step": 8622 + }, + { + "epoch": 1.61630740393627, + "grad_norm": 51197.3515625, + "learning_rate": 6.077445398683914e-05, + "loss": 2.2551, + "step": 8623 + }, + { + "epoch": 1.6164948453608248, + "grad_norm": 53234.59375, + "learning_rate": 6.076678055500995e-05, + "loss": 2.0963, + "step": 8624 + }, + { + "epoch": 1.6166822867853796, + "grad_norm": 51092.97265625, + "learning_rate": 6.075910685725522e-05, + "loss": 2.2071, + "step": 8625 + }, + { + "epoch": 1.6168697282099345, + "grad_norm": 51372.40234375, + "learning_rate": 6.0751432893764504e-05, + "loss": 2.226, + "step": 8626 + }, + { + "epoch": 1.617057169634489, + "grad_norm": 51170.84765625, + "learning_rate": 6.0743758664727345e-05, + "loss": 2.2284, + "step": 8627 + }, + { + "epoch": 1.6172446110590442, + "grad_norm": 52449.1484375, + "learning_rate": 6.073608417033326e-05, + "loss": 2.1548, + "step": 8628 + }, + { + "epoch": 1.6174320524835988, + "grad_norm": 52795.7734375, + "learning_rate": 6.0728409410771824e-05, + "loss": 2.2212, + "step": 8629 + }, + { + "epoch": 1.6176194939081538, + "grad_norm": 47744.07421875, + "learning_rate": 6.072073438623259e-05, + "loss": 2.274, + "step": 8630 + }, + { + "epoch": 1.6178069353327085, + "grad_norm": 52912.53125, + "learning_rate": 6.0713059096905104e-05, + "loss": 2.2639, + "step": 8631 + }, + { + "epoch": 1.6179943767572633, + "grad_norm": 52071.9375, + "learning_rate": 6.070538354297897e-05, + "loss": 2.2209, + "step": 8632 + }, + { + "epoch": 1.6181818181818182, + "grad_norm": 45438.5078125, + "learning_rate": 6.069770772464371e-05, + "loss": 2.2511, + "step": 8633 + }, + { + "epoch": 1.618369259606373, + "grad_norm": 47576.03125, + "learning_rate": 6.069003164208896e-05, + "loss": 2.1956, + "step": 8634 + }, + { + "epoch": 1.6185567010309279, + "grad_norm": 52782.47265625, + "learning_rate": 6.0682355295504287e-05, + "loss": 2.2202, + "step": 8635 + }, + { + "epoch": 1.6187441424554827, + "grad_norm": 52086.48046875, + "learning_rate": 6.067467868507929e-05, + "loss": 2.2254, + "step": 8636 + }, + { + "epoch": 1.6189315838800376, + "grad_norm": 50340.90625, + "learning_rate": 6.066700181100358e-05, + "loss": 2.2415, + "step": 8637 + }, + { + "epoch": 1.6191190253045922, + "grad_norm": 49064.85546875, + "learning_rate": 6.0659324673466724e-05, + "loss": 2.2331, + "step": 8638 + }, + { + "epoch": 1.6193064667291472, + "grad_norm": 49711.203125, + "learning_rate": 6.06516472726584e-05, + "loss": 2.2251, + "step": 8639 + }, + { + "epoch": 1.6194939081537019, + "grad_norm": 51257.3984375, + "learning_rate": 6.064396960876817e-05, + "loss": 2.2798, + "step": 8640 + }, + { + "epoch": 1.619681349578257, + "grad_norm": 51709.78125, + "learning_rate": 6.06362916819857e-05, + "loss": 2.2174, + "step": 8641 + }, + { + "epoch": 1.6198687910028116, + "grad_norm": 52033.0, + "learning_rate": 6.062861349250062e-05, + "loss": 2.1957, + "step": 8642 + }, + { + "epoch": 1.6200562324273664, + "grad_norm": 50359.87109375, + "learning_rate": 6.062093504050257e-05, + "loss": 2.2194, + "step": 8643 + }, + { + "epoch": 1.6202436738519213, + "grad_norm": 49751.21484375, + "learning_rate": 6.061325632618119e-05, + "loss": 2.2016, + "step": 8644 + }, + { + "epoch": 1.620431115276476, + "grad_norm": 51988.68359375, + "learning_rate": 6.060557734972615e-05, + "loss": 2.211, + "step": 8645 + }, + { + "epoch": 1.620618556701031, + "grad_norm": 47569.30078125, + "learning_rate": 6.059789811132708e-05, + "loss": 2.2327, + "step": 8646 + }, + { + "epoch": 1.6208059981255858, + "grad_norm": 55129.11328125, + "learning_rate": 6.0590218611173686e-05, + "loss": 2.2538, + "step": 8647 + }, + { + "epoch": 1.6209934395501406, + "grad_norm": 51660.203125, + "learning_rate": 6.058253884945562e-05, + "loss": 2.2532, + "step": 8648 + }, + { + "epoch": 1.6211808809746953, + "grad_norm": 48203.4375, + "learning_rate": 6.0574858826362545e-05, + "loss": 2.235, + "step": 8649 + }, + { + "epoch": 1.6213683223992503, + "grad_norm": 50347.03515625, + "learning_rate": 6.0567178542084205e-05, + "loss": 2.2179, + "step": 8650 + }, + { + "epoch": 1.621555763823805, + "grad_norm": 52411.41015625, + "learning_rate": 6.055949799681021e-05, + "loss": 2.2587, + "step": 8651 + }, + { + "epoch": 1.62174320524836, + "grad_norm": 49551.07421875, + "learning_rate": 6.0551817190730344e-05, + "loss": 2.2085, + "step": 8652 + }, + { + "epoch": 1.6219306466729146, + "grad_norm": 49523.91796875, + "learning_rate": 6.054413612403423e-05, + "loss": 2.2015, + "step": 8653 + }, + { + "epoch": 1.6221180880974695, + "grad_norm": 49710.046875, + "learning_rate": 6.053645479691167e-05, + "loss": 2.2133, + "step": 8654 + }, + { + "epoch": 1.6223055295220243, + "grad_norm": 50622.25, + "learning_rate": 6.0528773209552314e-05, + "loss": 2.2331, + "step": 8655 + }, + { + "epoch": 1.6224929709465792, + "grad_norm": 54815.2578125, + "learning_rate": 6.052109136214592e-05, + "loss": 2.235, + "step": 8656 + }, + { + "epoch": 1.622680412371134, + "grad_norm": 48027.3046875, + "learning_rate": 6.05134092548822e-05, + "loss": 2.2003, + "step": 8657 + }, + { + "epoch": 1.6228678537956889, + "grad_norm": 50178.3203125, + "learning_rate": 6.050572688795091e-05, + "loss": 2.2555, + "step": 8658 + }, + { + "epoch": 1.6230552952202437, + "grad_norm": 50927.34765625, + "learning_rate": 6.0498044261541774e-05, + "loss": 2.2687, + "step": 8659 + }, + { + "epoch": 1.6232427366447983, + "grad_norm": 52448.44921875, + "learning_rate": 6.0490361375844585e-05, + "loss": 2.2354, + "step": 8660 + }, + { + "epoch": 1.6234301780693534, + "grad_norm": 51852.140625, + "learning_rate": 6.048267823104905e-05, + "loss": 2.2449, + "step": 8661 + }, + { + "epoch": 1.623617619493908, + "grad_norm": 49955.77734375, + "learning_rate": 6.047499482734496e-05, + "loss": 2.2007, + "step": 8662 + }, + { + "epoch": 1.623805060918463, + "grad_norm": 51193.96875, + "learning_rate": 6.046731116492207e-05, + "loss": 2.1961, + "step": 8663 + }, + { + "epoch": 1.6239925023430177, + "grad_norm": 49589.984375, + "learning_rate": 6.045962724397016e-05, + "loss": 2.148, + "step": 8664 + }, + { + "epoch": 1.6241799437675728, + "grad_norm": 51884.3515625, + "learning_rate": 6.045194306467904e-05, + "loss": 2.2258, + "step": 8665 + }, + { + "epoch": 1.6243673851921274, + "grad_norm": 51077.6953125, + "learning_rate": 6.044425862723846e-05, + "loss": 2.2833, + "step": 8666 + }, + { + "epoch": 1.6245548266166823, + "grad_norm": 52381.98828125, + "learning_rate": 6.043657393183825e-05, + "loss": 2.158, + "step": 8667 + }, + { + "epoch": 1.6247422680412371, + "grad_norm": 53010.4296875, + "learning_rate": 6.0428888978668177e-05, + "loss": 2.1451, + "step": 8668 + }, + { + "epoch": 1.624929709465792, + "grad_norm": 47417.8359375, + "learning_rate": 6.042120376791807e-05, + "loss": 2.2441, + "step": 8669 + }, + { + "epoch": 1.6251171508903468, + "grad_norm": 51186.16796875, + "learning_rate": 6.041351829977775e-05, + "loss": 2.1917, + "step": 8670 + }, + { + "epoch": 1.6253045923149014, + "grad_norm": 58789.3828125, + "learning_rate": 6.0405832574437014e-05, + "loss": 2.2458, + "step": 8671 + }, + { + "epoch": 1.6254920337394565, + "grad_norm": 58017.01171875, + "learning_rate": 6.039814659208571e-05, + "loss": 2.2284, + "step": 8672 + }, + { + "epoch": 1.6256794751640111, + "grad_norm": 50127.65234375, + "learning_rate": 6.039046035291368e-05, + "loss": 2.2104, + "step": 8673 + }, + { + "epoch": 1.6258669165885662, + "grad_norm": 49514.40234375, + "learning_rate": 6.038277385711074e-05, + "loss": 2.273, + "step": 8674 + }, + { + "epoch": 1.6260543580131208, + "grad_norm": 52369.66015625, + "learning_rate": 6.037508710486675e-05, + "loss": 2.2072, + "step": 8675 + }, + { + "epoch": 1.6262417994376759, + "grad_norm": 48769.203125, + "learning_rate": 6.0367400096371574e-05, + "loss": 2.2267, + "step": 8676 + }, + { + "epoch": 1.6264292408622305, + "grad_norm": 52067.41015625, + "learning_rate": 6.035971283181504e-05, + "loss": 2.2903, + "step": 8677 + }, + { + "epoch": 1.6266166822867854, + "grad_norm": 53353.34375, + "learning_rate": 6.035202531138703e-05, + "loss": 2.2471, + "step": 8678 + }, + { + "epoch": 1.6268041237113402, + "grad_norm": 50562.23828125, + "learning_rate": 6.0344337535277415e-05, + "loss": 2.1829, + "step": 8679 + }, + { + "epoch": 1.626991565135895, + "grad_norm": 56076.86328125, + "learning_rate": 6.033664950367609e-05, + "loss": 2.1111, + "step": 8680 + }, + { + "epoch": 1.62717900656045, + "grad_norm": 48654.3046875, + "learning_rate": 6.032896121677291e-05, + "loss": 2.2378, + "step": 8681 + }, + { + "epoch": 1.6273664479850047, + "grad_norm": 51643.8671875, + "learning_rate": 6.032127267475779e-05, + "loss": 2.2089, + "step": 8682 + }, + { + "epoch": 1.6275538894095596, + "grad_norm": 52466.6640625, + "learning_rate": 6.03135838778206e-05, + "loss": 2.2412, + "step": 8683 + }, + { + "epoch": 1.6277413308341142, + "grad_norm": 50161.0, + "learning_rate": 6.030589482615128e-05, + "loss": 2.2123, + "step": 8684 + }, + { + "epoch": 1.6279287722586693, + "grad_norm": 49862.5703125, + "learning_rate": 6.02982055199397e-05, + "loss": 2.225, + "step": 8685 + }, + { + "epoch": 1.628116213683224, + "grad_norm": 49260.296875, + "learning_rate": 6.0290515959375804e-05, + "loss": 2.2373, + "step": 8686 + }, + { + "epoch": 1.628303655107779, + "grad_norm": 54192.4609375, + "learning_rate": 6.0282826144649506e-05, + "loss": 2.2208, + "step": 8687 + }, + { + "epoch": 1.6284910965323336, + "grad_norm": 53517.640625, + "learning_rate": 6.027513607595075e-05, + "loss": 2.263, + "step": 8688 + }, + { + "epoch": 1.6286785379568884, + "grad_norm": 50783.6328125, + "learning_rate": 6.026744575346943e-05, + "loss": 2.2636, + "step": 8689 + }, + { + "epoch": 1.6288659793814433, + "grad_norm": 47645.3515625, + "learning_rate": 6.0259755177395525e-05, + "loss": 2.2055, + "step": 8690 + }, + { + "epoch": 1.6290534208059981, + "grad_norm": 53126.0859375, + "learning_rate": 6.025206434791896e-05, + "loss": 2.3007, + "step": 8691 + }, + { + "epoch": 1.629240862230553, + "grad_norm": 50367.421875, + "learning_rate": 6.02443732652297e-05, + "loss": 2.2272, + "step": 8692 + }, + { + "epoch": 1.6294283036551078, + "grad_norm": 54940.796875, + "learning_rate": 6.02366819295177e-05, + "loss": 2.2946, + "step": 8693 + }, + { + "epoch": 1.6296157450796627, + "grad_norm": 48216.08203125, + "learning_rate": 6.022899034097294e-05, + "loss": 2.2675, + "step": 8694 + }, + { + "epoch": 1.6298031865042173, + "grad_norm": 51870.65234375, + "learning_rate": 6.022129849978538e-05, + "loss": 2.1198, + "step": 8695 + }, + { + "epoch": 1.6299906279287724, + "grad_norm": 49598.953125, + "learning_rate": 6.0213606406144985e-05, + "loss": 2.3047, + "step": 8696 + }, + { + "epoch": 1.630178069353327, + "grad_norm": 53832.5859375, + "learning_rate": 6.020591406024177e-05, + "loss": 2.1891, + "step": 8697 + }, + { + "epoch": 1.630365510777882, + "grad_norm": 51966.58984375, + "learning_rate": 6.01982214622657e-05, + "loss": 2.1953, + "step": 8698 + }, + { + "epoch": 1.6305529522024367, + "grad_norm": 50003.07421875, + "learning_rate": 6.019052861240678e-05, + "loss": 2.2564, + "step": 8699 + }, + { + "epoch": 1.6307403936269915, + "grad_norm": 47906.73828125, + "learning_rate": 6.018283551085503e-05, + "loss": 2.2275, + "step": 8700 + }, + { + "epoch": 1.6309278350515464, + "grad_norm": 51243.265625, + "learning_rate": 6.017514215780042e-05, + "loss": 2.2606, + "step": 8701 + }, + { + "epoch": 1.6311152764761012, + "grad_norm": 52573.27734375, + "learning_rate": 6.0167448553433026e-05, + "loss": 2.1941, + "step": 8702 + }, + { + "epoch": 1.631302717900656, + "grad_norm": 48227.68359375, + "learning_rate": 6.0159754697942816e-05, + "loss": 2.259, + "step": 8703 + }, + { + "epoch": 1.631490159325211, + "grad_norm": 49356.1875, + "learning_rate": 6.0152060591519835e-05, + "loss": 2.2056, + "step": 8704 + }, + { + "epoch": 1.6316776007497658, + "grad_norm": 48824.46875, + "learning_rate": 6.014436623435412e-05, + "loss": 2.2321, + "step": 8705 + }, + { + "epoch": 1.6318650421743204, + "grad_norm": 52328.83203125, + "learning_rate": 6.013667162663571e-05, + "loss": 2.2493, + "step": 8706 + }, + { + "epoch": 1.6320524835988754, + "grad_norm": 51090.078125, + "learning_rate": 6.0128976768554654e-05, + "loss": 2.321, + "step": 8707 + }, + { + "epoch": 1.63223992502343, + "grad_norm": 56538.90234375, + "learning_rate": 6.0121281660301e-05, + "loss": 2.1982, + "step": 8708 + }, + { + "epoch": 1.6324273664479851, + "grad_norm": 50070.08203125, + "learning_rate": 6.011358630206482e-05, + "loss": 2.2393, + "step": 8709 + }, + { + "epoch": 1.6326148078725398, + "grad_norm": 50810.13671875, + "learning_rate": 6.010589069403618e-05, + "loss": 2.2343, + "step": 8710 + }, + { + "epoch": 1.6328022492970946, + "grad_norm": 51268.828125, + "learning_rate": 6.009819483640512e-05, + "loss": 2.2827, + "step": 8711 + }, + { + "epoch": 1.6329896907216495, + "grad_norm": 49280.93359375, + "learning_rate": 6.009049872936175e-05, + "loss": 2.3057, + "step": 8712 + }, + { + "epoch": 1.6331771321462043, + "grad_norm": 53932.55078125, + "learning_rate": 6.0082802373096145e-05, + "loss": 2.2415, + "step": 8713 + }, + { + "epoch": 1.6333645735707591, + "grad_norm": 51852.12109375, + "learning_rate": 6.00751057677984e-05, + "loss": 2.2522, + "step": 8714 + }, + { + "epoch": 1.633552014995314, + "grad_norm": 57412.734375, + "learning_rate": 6.0067408913658596e-05, + "loss": 2.214, + "step": 8715 + }, + { + "epoch": 1.6337394564198688, + "grad_norm": 52144.8671875, + "learning_rate": 6.005971181086685e-05, + "loss": 2.2028, + "step": 8716 + }, + { + "epoch": 1.6339268978444235, + "grad_norm": 49954.515625, + "learning_rate": 6.005201445961326e-05, + "loss": 2.1901, + "step": 8717 + }, + { + "epoch": 1.6341143392689785, + "grad_norm": 55082.85546875, + "learning_rate": 6.004431686008795e-05, + "loss": 2.3655, + "step": 8718 + }, + { + "epoch": 1.6343017806935332, + "grad_norm": 55185.86328125, + "learning_rate": 6.003661901248103e-05, + "loss": 2.2412, + "step": 8719 + }, + { + "epoch": 1.6344892221180882, + "grad_norm": 56182.9921875, + "learning_rate": 6.0028920916982634e-05, + "loss": 2.1822, + "step": 8720 + }, + { + "epoch": 1.6346766635426428, + "grad_norm": 48992.546875, + "learning_rate": 6.00212225737829e-05, + "loss": 2.2127, + "step": 8721 + }, + { + "epoch": 1.6348641049671977, + "grad_norm": 50712.0390625, + "learning_rate": 6.001352398307195e-05, + "loss": 2.1582, + "step": 8722 + }, + { + "epoch": 1.6350515463917525, + "grad_norm": 50235.984375, + "learning_rate": 6.000582514503996e-05, + "loss": 2.1881, + "step": 8723 + }, + { + "epoch": 1.6352389878163074, + "grad_norm": 47282.1171875, + "learning_rate": 5.999812605987704e-05, + "loss": 2.2415, + "step": 8724 + }, + { + "epoch": 1.6354264292408622, + "grad_norm": 50906.01171875, + "learning_rate": 5.9990426727773383e-05, + "loss": 2.2273, + "step": 8725 + }, + { + "epoch": 1.635613870665417, + "grad_norm": 52364.046875, + "learning_rate": 5.9982727148919124e-05, + "loss": 2.2207, + "step": 8726 + }, + { + "epoch": 1.635801312089972, + "grad_norm": 51415.56640625, + "learning_rate": 5.9975027323504476e-05, + "loss": 2.2438, + "step": 8727 + }, + { + "epoch": 1.6359887535145266, + "grad_norm": 48200.95703125, + "learning_rate": 5.996732725171956e-05, + "loss": 2.2317, + "step": 8728 + }, + { + "epoch": 1.6361761949390816, + "grad_norm": 47775.01953125, + "learning_rate": 5.9959626933754585e-05, + "loss": 2.2022, + "step": 8729 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 55100.1171875, + "learning_rate": 5.995192636979976e-05, + "loss": 2.2052, + "step": 8730 + }, + { + "epoch": 1.6365510777881913, + "grad_norm": 54943.828125, + "learning_rate": 5.994422556004523e-05, + "loss": 2.1842, + "step": 8731 + }, + { + "epoch": 1.636738519212746, + "grad_norm": 54195.82421875, + "learning_rate": 5.9936524504681234e-05, + "loss": 2.2337, + "step": 8732 + }, + { + "epoch": 1.636925960637301, + "grad_norm": 50940.9140625, + "learning_rate": 5.9928823203897965e-05, + "loss": 2.227, + "step": 8733 + }, + { + "epoch": 1.6371134020618556, + "grad_norm": 52647.62109375, + "learning_rate": 5.992112165788564e-05, + "loss": 2.3067, + "step": 8734 + }, + { + "epoch": 1.6373008434864105, + "grad_norm": 58786.265625, + "learning_rate": 5.991341986683446e-05, + "loss": 2.2384, + "step": 8735 + }, + { + "epoch": 1.6374882849109653, + "grad_norm": 53020.390625, + "learning_rate": 5.990571783093467e-05, + "loss": 2.2041, + "step": 8736 + }, + { + "epoch": 1.6376757263355202, + "grad_norm": 51403.91015625, + "learning_rate": 5.9898015550376485e-05, + "loss": 2.2259, + "step": 8737 + }, + { + "epoch": 1.637863167760075, + "grad_norm": 52964.47265625, + "learning_rate": 5.989031302535018e-05, + "loss": 2.1775, + "step": 8738 + }, + { + "epoch": 1.6380506091846299, + "grad_norm": 54349.42578125, + "learning_rate": 5.9882610256045926e-05, + "loss": 2.1547, + "step": 8739 + }, + { + "epoch": 1.6382380506091847, + "grad_norm": 51364.67578125, + "learning_rate": 5.987490724265403e-05, + "loss": 2.1431, + "step": 8740 + }, + { + "epoch": 1.6384254920337393, + "grad_norm": 54261.5078125, + "learning_rate": 5.986720398536473e-05, + "loss": 2.2651, + "step": 8741 + }, + { + "epoch": 1.6386129334582944, + "grad_norm": 51786.0625, + "learning_rate": 5.985950048436828e-05, + "loss": 2.1975, + "step": 8742 + }, + { + "epoch": 1.638800374882849, + "grad_norm": 52255.1953125, + "learning_rate": 5.9851796739854945e-05, + "loss": 2.289, + "step": 8743 + }, + { + "epoch": 1.638987816307404, + "grad_norm": 57291.8125, + "learning_rate": 5.984409275201501e-05, + "loss": 2.1703, + "step": 8744 + }, + { + "epoch": 1.6391752577319587, + "grad_norm": 52656.015625, + "learning_rate": 5.9836388521038754e-05, + "loss": 2.2713, + "step": 8745 + }, + { + "epoch": 1.6393626991565136, + "grad_norm": 49725.95703125, + "learning_rate": 5.982868404711643e-05, + "loss": 2.204, + "step": 8746 + }, + { + "epoch": 1.6395501405810684, + "grad_norm": 50550.08984375, + "learning_rate": 5.9820979330438386e-05, + "loss": 2.2442, + "step": 8747 + }, + { + "epoch": 1.6397375820056233, + "grad_norm": 51336.62890625, + "learning_rate": 5.981327437119487e-05, + "loss": 2.1973, + "step": 8748 + }, + { + "epoch": 1.639925023430178, + "grad_norm": 48454.40234375, + "learning_rate": 5.98055691695762e-05, + "loss": 2.2342, + "step": 8749 + }, + { + "epoch": 1.640112464854733, + "grad_norm": 50785.9296875, + "learning_rate": 5.9797863725772684e-05, + "loss": 2.2338, + "step": 8750 + }, + { + "epoch": 1.6402999062792878, + "grad_norm": 53176.4140625, + "learning_rate": 5.979015803997464e-05, + "loss": 2.2337, + "step": 8751 + }, + { + "epoch": 1.6404873477038424, + "grad_norm": 51218.01953125, + "learning_rate": 5.978245211237238e-05, + "loss": 2.1872, + "step": 8752 + }, + { + "epoch": 1.6406747891283975, + "grad_norm": 50472.38671875, + "learning_rate": 5.977474594315625e-05, + "loss": 2.2288, + "step": 8753 + }, + { + "epoch": 1.640862230552952, + "grad_norm": 53840.78125, + "learning_rate": 5.976703953251654e-05, + "loss": 2.264, + "step": 8754 + }, + { + "epoch": 1.6410496719775072, + "grad_norm": 51362.1328125, + "learning_rate": 5.975933288064365e-05, + "loss": 2.2047, + "step": 8755 + }, + { + "epoch": 1.6412371134020618, + "grad_norm": 51456.6875, + "learning_rate": 5.975162598772788e-05, + "loss": 2.2019, + "step": 8756 + }, + { + "epoch": 1.6414245548266166, + "grad_norm": 56556.55859375, + "learning_rate": 5.974391885395958e-05, + "loss": 2.2267, + "step": 8757 + }, + { + "epoch": 1.6416119962511715, + "grad_norm": 50844.09765625, + "learning_rate": 5.973621147952914e-05, + "loss": 2.2283, + "step": 8758 + }, + { + "epoch": 1.6417994376757263, + "grad_norm": 49510.88671875, + "learning_rate": 5.972850386462687e-05, + "loss": 2.1496, + "step": 8759 + }, + { + "epoch": 1.6419868791002812, + "grad_norm": 51200.81640625, + "learning_rate": 5.972079600944319e-05, + "loss": 2.1806, + "step": 8760 + }, + { + "epoch": 1.642174320524836, + "grad_norm": 50371.2890625, + "learning_rate": 5.971308791416844e-05, + "loss": 2.1944, + "step": 8761 + }, + { + "epoch": 1.6423617619493909, + "grad_norm": 49890.95703125, + "learning_rate": 5.970537957899301e-05, + "loss": 2.2312, + "step": 8762 + }, + { + "epoch": 1.6425492033739455, + "grad_norm": 47477.96875, + "learning_rate": 5.96976710041073e-05, + "loss": 2.257, + "step": 8763 + }, + { + "epoch": 1.6427366447985006, + "grad_norm": 50715.92578125, + "learning_rate": 5.968996218970168e-05, + "loss": 2.2047, + "step": 8764 + }, + { + "epoch": 1.6429240862230552, + "grad_norm": 50650.91015625, + "learning_rate": 5.9682253135966556e-05, + "loss": 2.2151, + "step": 8765 + }, + { + "epoch": 1.6431115276476103, + "grad_norm": 53165.65234375, + "learning_rate": 5.967454384309234e-05, + "loss": 2.1709, + "step": 8766 + }, + { + "epoch": 1.6432989690721649, + "grad_norm": 48892.12890625, + "learning_rate": 5.9666834311269425e-05, + "loss": 2.1771, + "step": 8767 + }, + { + "epoch": 1.6434864104967197, + "grad_norm": 48938.1484375, + "learning_rate": 5.965912454068825e-05, + "loss": 2.1966, + "step": 8768 + }, + { + "epoch": 1.6436738519212746, + "grad_norm": 55338.79296875, + "learning_rate": 5.965141453153921e-05, + "loss": 2.1399, + "step": 8769 + }, + { + "epoch": 1.6438612933458294, + "grad_norm": 52993.42578125, + "learning_rate": 5.964370428401276e-05, + "loss": 2.3778, + "step": 8770 + }, + { + "epoch": 1.6440487347703843, + "grad_norm": 53092.796875, + "learning_rate": 5.96359937982993e-05, + "loss": 2.2539, + "step": 8771 + }, + { + "epoch": 1.6442361761949391, + "grad_norm": 49322.5703125, + "learning_rate": 5.962828307458929e-05, + "loss": 2.2782, + "step": 8772 + }, + { + "epoch": 1.644423617619494, + "grad_norm": 52896.3984375, + "learning_rate": 5.962057211307318e-05, + "loss": 2.2294, + "step": 8773 + }, + { + "epoch": 1.6446110590440486, + "grad_norm": 46708.6796875, + "learning_rate": 5.961286091394139e-05, + "loss": 2.2348, + "step": 8774 + }, + { + "epoch": 1.6447985004686037, + "grad_norm": 51012.7265625, + "learning_rate": 5.9605149477384425e-05, + "loss": 2.2078, + "step": 8775 + }, + { + "epoch": 1.6449859418931583, + "grad_norm": 50989.88671875, + "learning_rate": 5.9597437803592714e-05, + "loss": 2.3103, + "step": 8776 + }, + { + "epoch": 1.6451733833177133, + "grad_norm": 57571.30078125, + "learning_rate": 5.958972589275672e-05, + "loss": 2.2303, + "step": 8777 + }, + { + "epoch": 1.645360824742268, + "grad_norm": 52356.33984375, + "learning_rate": 5.958201374506695e-05, + "loss": 2.1528, + "step": 8778 + }, + { + "epoch": 1.6455482661668228, + "grad_norm": 49390.78125, + "learning_rate": 5.9574301360713845e-05, + "loss": 2.2547, + "step": 8779 + }, + { + "epoch": 1.6457357075913777, + "grad_norm": 53355.8671875, + "learning_rate": 5.956658873988792e-05, + "loss": 2.2632, + "step": 8780 + }, + { + "epoch": 1.6459231490159325, + "grad_norm": 48332.2734375, + "learning_rate": 5.955887588277965e-05, + "loss": 2.2256, + "step": 8781 + }, + { + "epoch": 1.6461105904404874, + "grad_norm": 54388.04296875, + "learning_rate": 5.955116278957954e-05, + "loss": 2.2816, + "step": 8782 + }, + { + "epoch": 1.6462980318650422, + "grad_norm": 51446.78515625, + "learning_rate": 5.9543449460478095e-05, + "loss": 2.2021, + "step": 8783 + }, + { + "epoch": 1.646485473289597, + "grad_norm": 49716.77734375, + "learning_rate": 5.9535735895665814e-05, + "loss": 2.2048, + "step": 8784 + }, + { + "epoch": 1.6466729147141517, + "grad_norm": 56177.48828125, + "learning_rate": 5.9528022095333236e-05, + "loss": 2.1953, + "step": 8785 + }, + { + "epoch": 1.6468603561387067, + "grad_norm": 51798.953125, + "learning_rate": 5.952030805967086e-05, + "loss": 2.1642, + "step": 8786 + }, + { + "epoch": 1.6470477975632614, + "grad_norm": 50544.36328125, + "learning_rate": 5.951259378886921e-05, + "loss": 2.1742, + "step": 8787 + }, + { + "epoch": 1.6472352389878164, + "grad_norm": 52734.0859375, + "learning_rate": 5.950487928311884e-05, + "loss": 2.1426, + "step": 8788 + }, + { + "epoch": 1.647422680412371, + "grad_norm": 52691.078125, + "learning_rate": 5.949716454261025e-05, + "loss": 2.2553, + "step": 8789 + }, + { + "epoch": 1.6476101218369261, + "grad_norm": 51514.1796875, + "learning_rate": 5.948944956753404e-05, + "loss": 2.2814, + "step": 8790 + }, + { + "epoch": 1.6477975632614807, + "grad_norm": 50801.83203125, + "learning_rate": 5.948173435808071e-05, + "loss": 2.2237, + "step": 8791 + }, + { + "epoch": 1.6479850046860356, + "grad_norm": 50964.66796875, + "learning_rate": 5.9474018914440835e-05, + "loss": 2.3175, + "step": 8792 + }, + { + "epoch": 1.6481724461105904, + "grad_norm": 54837.14453125, + "learning_rate": 5.946630323680498e-05, + "loss": 2.2163, + "step": 8793 + }, + { + "epoch": 1.6483598875351453, + "grad_norm": 49687.21875, + "learning_rate": 5.945858732536371e-05, + "loss": 2.2786, + "step": 8794 + }, + { + "epoch": 1.6485473289597001, + "grad_norm": 52078.09375, + "learning_rate": 5.945087118030759e-05, + "loss": 2.1544, + "step": 8795 + }, + { + "epoch": 1.6487347703842548, + "grad_norm": 53173.7265625, + "learning_rate": 5.9443154801827213e-05, + "loss": 2.1976, + "step": 8796 + }, + { + "epoch": 1.6489222118088098, + "grad_norm": 47321.95703125, + "learning_rate": 5.9435438190113144e-05, + "loss": 2.17, + "step": 8797 + }, + { + "epoch": 1.6491096532333644, + "grad_norm": 54749.66015625, + "learning_rate": 5.9427721345356005e-05, + "loss": 2.1995, + "step": 8798 + }, + { + "epoch": 1.6492970946579195, + "grad_norm": 54260.06640625, + "learning_rate": 5.942000426774636e-05, + "loss": 2.17, + "step": 8799 + }, + { + "epoch": 1.6494845360824741, + "grad_norm": 50851.42578125, + "learning_rate": 5.941228695747482e-05, + "loss": 2.2323, + "step": 8800 + }, + { + "epoch": 1.6496719775070292, + "grad_norm": 49952.09765625, + "learning_rate": 5.940456941473199e-05, + "loss": 2.2113, + "step": 8801 + }, + { + "epoch": 1.6498594189315838, + "grad_norm": 50599.6484375, + "learning_rate": 5.939685163970851e-05, + "loss": 2.2681, + "step": 8802 + }, + { + "epoch": 1.6500468603561387, + "grad_norm": 47339.05078125, + "learning_rate": 5.938913363259496e-05, + "loss": 2.2748, + "step": 8803 + }, + { + "epoch": 1.6502343017806935, + "grad_norm": 50097.078125, + "learning_rate": 5.9381415393581985e-05, + "loss": 2.2479, + "step": 8804 + }, + { + "epoch": 1.6504217432052484, + "grad_norm": 55186.12890625, + "learning_rate": 5.9373696922860224e-05, + "loss": 2.125, + "step": 8805 + }, + { + "epoch": 1.6506091846298032, + "grad_norm": 48213.8046875, + "learning_rate": 5.936597822062029e-05, + "loss": 2.243, + "step": 8806 + }, + { + "epoch": 1.650796626054358, + "grad_norm": 50432.640625, + "learning_rate": 5.935825928705283e-05, + "loss": 2.2376, + "step": 8807 + }, + { + "epoch": 1.650984067478913, + "grad_norm": 50885.734375, + "learning_rate": 5.935054012234852e-05, + "loss": 2.2243, + "step": 8808 + }, + { + "epoch": 1.6511715089034675, + "grad_norm": 46310.7734375, + "learning_rate": 5.934282072669798e-05, + "loss": 2.2685, + "step": 8809 + }, + { + "epoch": 1.6513589503280226, + "grad_norm": 51215.0703125, + "learning_rate": 5.9335101100291876e-05, + "loss": 2.2321, + "step": 8810 + }, + { + "epoch": 1.6515463917525772, + "grad_norm": 48485.28125, + "learning_rate": 5.932738124332089e-05, + "loss": 2.1926, + "step": 8811 + }, + { + "epoch": 1.6517338331771323, + "grad_norm": 52321.6875, + "learning_rate": 5.931966115597567e-05, + "loss": 2.2629, + "step": 8812 + }, + { + "epoch": 1.651921274601687, + "grad_norm": 49350.01171875, + "learning_rate": 5.931194083844691e-05, + "loss": 2.1965, + "step": 8813 + }, + { + "epoch": 1.6521087160262418, + "grad_norm": 51734.26171875, + "learning_rate": 5.9304220290925285e-05, + "loss": 2.2222, + "step": 8814 + }, + { + "epoch": 1.6522961574507966, + "grad_norm": 51083.22265625, + "learning_rate": 5.929649951360148e-05, + "loss": 2.2343, + "step": 8815 + }, + { + "epoch": 1.6524835988753515, + "grad_norm": 49959.5625, + "learning_rate": 5.9288778506666195e-05, + "loss": 2.2467, + "step": 8816 + }, + { + "epoch": 1.6526710402999063, + "grad_norm": 47534.05859375, + "learning_rate": 5.92810572703101e-05, + "loss": 2.2495, + "step": 8817 + }, + { + "epoch": 1.6528584817244611, + "grad_norm": 50845.9140625, + "learning_rate": 5.927333580472395e-05, + "loss": 2.1913, + "step": 8818 + }, + { + "epoch": 1.653045923149016, + "grad_norm": 56665.59765625, + "learning_rate": 5.926561411009841e-05, + "loss": 2.27, + "step": 8819 + }, + { + "epoch": 1.6532333645735706, + "grad_norm": 50663.609375, + "learning_rate": 5.9257892186624234e-05, + "loss": 2.2709, + "step": 8820 + }, + { + "epoch": 1.6534208059981257, + "grad_norm": 50077.91796875, + "learning_rate": 5.925017003449211e-05, + "loss": 2.2313, + "step": 8821 + }, + { + "epoch": 1.6536082474226803, + "grad_norm": 48145.41015625, + "learning_rate": 5.924244765389279e-05, + "loss": 2.24, + "step": 8822 + }, + { + "epoch": 1.6537956888472354, + "grad_norm": 49051.79296875, + "learning_rate": 5.923472504501698e-05, + "loss": 2.2569, + "step": 8823 + }, + { + "epoch": 1.65398313027179, + "grad_norm": 51575.875, + "learning_rate": 5.922700220805545e-05, + "loss": 2.25, + "step": 8824 + }, + { + "epoch": 1.6541705716963448, + "grad_norm": 49164.3828125, + "learning_rate": 5.9219279143198916e-05, + "loss": 2.195, + "step": 8825 + }, + { + "epoch": 1.6543580131208997, + "grad_norm": 49905.140625, + "learning_rate": 5.9211555850638154e-05, + "loss": 2.2003, + "step": 8826 + }, + { + "epoch": 1.6545454545454545, + "grad_norm": 49378.9921875, + "learning_rate": 5.92038323305639e-05, + "loss": 2.1847, + "step": 8827 + }, + { + "epoch": 1.6547328959700094, + "grad_norm": 51275.3203125, + "learning_rate": 5.919610858316692e-05, + "loss": 2.24, + "step": 8828 + }, + { + "epoch": 1.6549203373945642, + "grad_norm": 50026.41015625, + "learning_rate": 5.9188384608637984e-05, + "loss": 2.1773, + "step": 8829 + }, + { + "epoch": 1.655107778819119, + "grad_norm": 52865.5234375, + "learning_rate": 5.9180660407167855e-05, + "loss": 2.1854, + "step": 8830 + }, + { + "epoch": 1.6552952202436737, + "grad_norm": 52188.0078125, + "learning_rate": 5.917293597894732e-05, + "loss": 2.2433, + "step": 8831 + }, + { + "epoch": 1.6554826616682288, + "grad_norm": 55848.62890625, + "learning_rate": 5.916521132416715e-05, + "loss": 2.2174, + "step": 8832 + }, + { + "epoch": 1.6556701030927834, + "grad_norm": 52338.0, + "learning_rate": 5.915748644301815e-05, + "loss": 2.1898, + "step": 8833 + }, + { + "epoch": 1.6558575445173385, + "grad_norm": 52154.109375, + "learning_rate": 5.914976133569111e-05, + "loss": 2.17, + "step": 8834 + }, + { + "epoch": 1.656044985941893, + "grad_norm": 50775.0390625, + "learning_rate": 5.914203600237683e-05, + "loss": 2.1991, + "step": 8835 + }, + { + "epoch": 1.656232427366448, + "grad_norm": 56235.0078125, + "learning_rate": 5.9134310443266105e-05, + "loss": 2.0541, + "step": 8836 + }, + { + "epoch": 1.6564198687910028, + "grad_norm": 52320.265625, + "learning_rate": 5.912658465854975e-05, + "loss": 2.1734, + "step": 8837 + }, + { + "epoch": 1.6566073102155576, + "grad_norm": 53717.0, + "learning_rate": 5.911885864841861e-05, + "loss": 2.3025, + "step": 8838 + }, + { + "epoch": 1.6567947516401125, + "grad_norm": 48594.52734375, + "learning_rate": 5.911113241306346e-05, + "loss": 2.2464, + "step": 8839 + }, + { + "epoch": 1.6569821930646673, + "grad_norm": 53486.92578125, + "learning_rate": 5.910340595267516e-05, + "loss": 2.2277, + "step": 8840 + }, + { + "epoch": 1.6571696344892222, + "grad_norm": 52238.41796875, + "learning_rate": 5.909567926744455e-05, + "loss": 2.2026, + "step": 8841 + }, + { + "epoch": 1.6573570759137768, + "grad_norm": 54460.47265625, + "learning_rate": 5.9087952357562446e-05, + "loss": 2.2443, + "step": 8842 + }, + { + "epoch": 1.6575445173383319, + "grad_norm": 49050.9375, + "learning_rate": 5.9080225223219696e-05, + "loss": 2.2426, + "step": 8843 + }, + { + "epoch": 1.6577319587628865, + "grad_norm": 51953.12109375, + "learning_rate": 5.907249786460716e-05, + "loss": 2.2157, + "step": 8844 + }, + { + "epoch": 1.6579194001874415, + "grad_norm": 53357.859375, + "learning_rate": 5.906477028191568e-05, + "loss": 2.2838, + "step": 8845 + }, + { + "epoch": 1.6581068416119962, + "grad_norm": 53380.3671875, + "learning_rate": 5.905704247533614e-05, + "loss": 2.212, + "step": 8846 + }, + { + "epoch": 1.6582942830365512, + "grad_norm": 50839.5625, + "learning_rate": 5.9049314445059377e-05, + "loss": 2.2412, + "step": 8847 + }, + { + "epoch": 1.6584817244611059, + "grad_norm": 51757.390625, + "learning_rate": 5.904158619127631e-05, + "loss": 2.2709, + "step": 8848 + }, + { + "epoch": 1.6586691658856607, + "grad_norm": 50380.21875, + "learning_rate": 5.903385771417777e-05, + "loss": 2.2191, + "step": 8849 + }, + { + "epoch": 1.6588566073102156, + "grad_norm": 49158.78515625, + "learning_rate": 5.9026129013954644e-05, + "loss": 2.2412, + "step": 8850 + }, + { + "epoch": 1.6590440487347704, + "grad_norm": 51348.9296875, + "learning_rate": 5.901840009079784e-05, + "loss": 2.2272, + "step": 8851 + }, + { + "epoch": 1.6592314901593253, + "grad_norm": 49090.84375, + "learning_rate": 5.901067094489825e-05, + "loss": 2.2682, + "step": 8852 + }, + { + "epoch": 1.6594189315838799, + "grad_norm": 50426.67578125, + "learning_rate": 5.900294157644677e-05, + "loss": 2.158, + "step": 8853 + }, + { + "epoch": 1.659606373008435, + "grad_norm": 51867.17578125, + "learning_rate": 5.8995211985634314e-05, + "loss": 2.2964, + "step": 8854 + }, + { + "epoch": 1.6597938144329896, + "grad_norm": 57348.29296875, + "learning_rate": 5.898748217265178e-05, + "loss": 2.275, + "step": 8855 + }, + { + "epoch": 1.6599812558575446, + "grad_norm": 47959.6328125, + "learning_rate": 5.897975213769008e-05, + "loss": 2.2309, + "step": 8856 + }, + { + "epoch": 1.6601686972820993, + "grad_norm": 49975.3984375, + "learning_rate": 5.897202188094015e-05, + "loss": 2.2438, + "step": 8857 + }, + { + "epoch": 1.6603561387066543, + "grad_norm": 49543.62890625, + "learning_rate": 5.896429140259292e-05, + "loss": 2.2199, + "step": 8858 + }, + { + "epoch": 1.660543580131209, + "grad_norm": 55231.1484375, + "learning_rate": 5.8956560702839304e-05, + "loss": 2.1746, + "step": 8859 + }, + { + "epoch": 1.6607310215557638, + "grad_norm": 47695.51171875, + "learning_rate": 5.894882978187025e-05, + "loss": 2.2311, + "step": 8860 + }, + { + "epoch": 1.6609184629803186, + "grad_norm": 50936.02734375, + "learning_rate": 5.894109863987671e-05, + "loss": 2.132, + "step": 8861 + }, + { + "epoch": 1.6611059044048735, + "grad_norm": 49321.28125, + "learning_rate": 5.8933367277049614e-05, + "loss": 2.2125, + "step": 8862 + }, + { + "epoch": 1.6612933458294283, + "grad_norm": 53216.05859375, + "learning_rate": 5.892563569357995e-05, + "loss": 2.3008, + "step": 8863 + }, + { + "epoch": 1.6614807872539832, + "grad_norm": 51434.24609375, + "learning_rate": 5.8917903889658636e-05, + "loss": 2.2895, + "step": 8864 + }, + { + "epoch": 1.661668228678538, + "grad_norm": 50306.859375, + "learning_rate": 5.8910171865476674e-05, + "loss": 2.186, + "step": 8865 + }, + { + "epoch": 1.6618556701030927, + "grad_norm": 50132.69921875, + "learning_rate": 5.8902439621225e-05, + "loss": 2.2029, + "step": 8866 + }, + { + "epoch": 1.6620431115276477, + "grad_norm": 49621.53515625, + "learning_rate": 5.889470715709462e-05, + "loss": 2.2087, + "step": 8867 + }, + { + "epoch": 1.6622305529522023, + "grad_norm": 50516.30859375, + "learning_rate": 5.8886974473276504e-05, + "loss": 2.2642, + "step": 8868 + }, + { + "epoch": 1.6624179943767574, + "grad_norm": 53582.58203125, + "learning_rate": 5.887924156996165e-05, + "loss": 2.261, + "step": 8869 + }, + { + "epoch": 1.662605435801312, + "grad_norm": 53514.6796875, + "learning_rate": 5.887150844734104e-05, + "loss": 2.2215, + "step": 8870 + }, + { + "epoch": 1.6627928772258669, + "grad_norm": 50404.42578125, + "learning_rate": 5.886377510560566e-05, + "loss": 2.1997, + "step": 8871 + }, + { + "epoch": 1.6629803186504217, + "grad_norm": 49155.48828125, + "learning_rate": 5.885604154494654e-05, + "loss": 2.1989, + "step": 8872 + }, + { + "epoch": 1.6631677600749766, + "grad_norm": 51353.609375, + "learning_rate": 5.884830776555467e-05, + "loss": 2.2955, + "step": 8873 + }, + { + "epoch": 1.6633552014995314, + "grad_norm": 51944.84765625, + "learning_rate": 5.8840573767621074e-05, + "loss": 2.214, + "step": 8874 + }, + { + "epoch": 1.6635426429240863, + "grad_norm": 55011.23828125, + "learning_rate": 5.883283955133675e-05, + "loss": 2.2167, + "step": 8875 + }, + { + "epoch": 1.6637300843486411, + "grad_norm": 55263.58984375, + "learning_rate": 5.882510511689276e-05, + "loss": 2.2665, + "step": 8876 + }, + { + "epoch": 1.6639175257731957, + "grad_norm": 52297.04296875, + "learning_rate": 5.881737046448011e-05, + "loss": 2.2451, + "step": 8877 + }, + { + "epoch": 1.6641049671977508, + "grad_norm": 53152.60546875, + "learning_rate": 5.8809635594289845e-05, + "loss": 2.1798, + "step": 8878 + }, + { + "epoch": 1.6642924086223054, + "grad_norm": 54298.92578125, + "learning_rate": 5.8801900506512994e-05, + "loss": 2.243, + "step": 8879 + }, + { + "epoch": 1.6644798500468605, + "grad_norm": 51459.43359375, + "learning_rate": 5.8794165201340624e-05, + "loss": 2.2406, + "step": 8880 + }, + { + "epoch": 1.6646672914714151, + "grad_norm": 55442.51171875, + "learning_rate": 5.8786429678963765e-05, + "loss": 2.2236, + "step": 8881 + }, + { + "epoch": 1.66485473289597, + "grad_norm": 49106.19921875, + "learning_rate": 5.877869393957349e-05, + "loss": 2.2563, + "step": 8882 + }, + { + "epoch": 1.6650421743205248, + "grad_norm": 55275.75, + "learning_rate": 5.8770957983360867e-05, + "loss": 2.264, + "step": 8883 + }, + { + "epoch": 1.6652296157450797, + "grad_norm": 47583.50390625, + "learning_rate": 5.8763221810516934e-05, + "loss": 2.2335, + "step": 8884 + }, + { + "epoch": 1.6654170571696345, + "grad_norm": 49389.26171875, + "learning_rate": 5.8755485421232794e-05, + "loss": 2.2034, + "step": 8885 + }, + { + "epoch": 1.6656044985941894, + "grad_norm": 48625.68359375, + "learning_rate": 5.874774881569951e-05, + "loss": 2.2322, + "step": 8886 + }, + { + "epoch": 1.6657919400187442, + "grad_norm": 51322.0546875, + "learning_rate": 5.874001199410818e-05, + "loss": 2.2731, + "step": 8887 + }, + { + "epoch": 1.6659793814432988, + "grad_norm": 51394.52734375, + "learning_rate": 5.873227495664988e-05, + "loss": 2.2654, + "step": 8888 + }, + { + "epoch": 1.666166822867854, + "grad_norm": 53982.89453125, + "learning_rate": 5.8724537703515714e-05, + "loss": 2.2182, + "step": 8889 + }, + { + "epoch": 1.6663542642924085, + "grad_norm": 47175.05859375, + "learning_rate": 5.8716800234896765e-05, + "loss": 2.1546, + "step": 8890 + }, + { + "epoch": 1.6665417057169636, + "grad_norm": 48826.25, + "learning_rate": 5.870906255098417e-05, + "loss": 2.2532, + "step": 8891 + }, + { + "epoch": 1.6667291471415182, + "grad_norm": 52616.8203125, + "learning_rate": 5.8701324651969004e-05, + "loss": 2.243, + "step": 8892 + }, + { + "epoch": 1.666916588566073, + "grad_norm": 50478.21484375, + "learning_rate": 5.869358653804242e-05, + "loss": 2.1795, + "step": 8893 + }, + { + "epoch": 1.667104029990628, + "grad_norm": 50260.89453125, + "learning_rate": 5.8685848209395514e-05, + "loss": 2.1321, + "step": 8894 + }, + { + "epoch": 1.6672914714151827, + "grad_norm": 53783.83203125, + "learning_rate": 5.867810966621941e-05, + "loss": 2.2262, + "step": 8895 + }, + { + "epoch": 1.6674789128397376, + "grad_norm": 50454.58203125, + "learning_rate": 5.8670370908705265e-05, + "loss": 2.2385, + "step": 8896 + }, + { + "epoch": 1.6676663542642924, + "grad_norm": 52295.34765625, + "learning_rate": 5.866263193704418e-05, + "loss": 2.3328, + "step": 8897 + }, + { + "epoch": 1.6678537956888473, + "grad_norm": 54307.73046875, + "learning_rate": 5.865489275142733e-05, + "loss": 2.1769, + "step": 8898 + }, + { + "epoch": 1.668041237113402, + "grad_norm": 55158.62109375, + "learning_rate": 5.864715335204585e-05, + "loss": 2.1621, + "step": 8899 + }, + { + "epoch": 1.668228678537957, + "grad_norm": 50542.9765625, + "learning_rate": 5.863941373909088e-05, + "loss": 2.2272, + "step": 8900 + }, + { + "epoch": 1.6684161199625116, + "grad_norm": 49109.734375, + "learning_rate": 5.863167391275359e-05, + "loss": 2.2602, + "step": 8901 + }, + { + "epoch": 1.6686035613870667, + "grad_norm": 50572.15234375, + "learning_rate": 5.862393387322517e-05, + "loss": 2.2664, + "step": 8902 + }, + { + "epoch": 1.6687910028116213, + "grad_norm": 51729.6015625, + "learning_rate": 5.861619362069675e-05, + "loss": 2.3141, + "step": 8903 + }, + { + "epoch": 1.6689784442361761, + "grad_norm": 53895.390625, + "learning_rate": 5.860845315535952e-05, + "loss": 2.2301, + "step": 8904 + }, + { + "epoch": 1.669165885660731, + "grad_norm": 51684.77734375, + "learning_rate": 5.860071247740465e-05, + "loss": 2.2136, + "step": 8905 + }, + { + "epoch": 1.6693533270852858, + "grad_norm": 49367.97265625, + "learning_rate": 5.859297158702335e-05, + "loss": 2.2294, + "step": 8906 + }, + { + "epoch": 1.6695407685098407, + "grad_norm": 50078.296875, + "learning_rate": 5.8585230484406784e-05, + "loss": 2.2591, + "step": 8907 + }, + { + "epoch": 1.6697282099343955, + "grad_norm": 55830.16796875, + "learning_rate": 5.8577489169746144e-05, + "loss": 2.2956, + "step": 8908 + }, + { + "epoch": 1.6699156513589504, + "grad_norm": 54695.90625, + "learning_rate": 5.856974764323265e-05, + "loss": 2.2486, + "step": 8909 + }, + { + "epoch": 1.670103092783505, + "grad_norm": 53833.13671875, + "learning_rate": 5.8562005905057516e-05, + "loss": 2.2988, + "step": 8910 + }, + { + "epoch": 1.67029053420806, + "grad_norm": 50662.80859375, + "learning_rate": 5.8554263955411926e-05, + "loss": 2.2372, + "step": 8911 + }, + { + "epoch": 1.6704779756326147, + "grad_norm": 50872.73828125, + "learning_rate": 5.85465217944871e-05, + "loss": 2.1701, + "step": 8912 + }, + { + "epoch": 1.6706654170571698, + "grad_norm": 50008.52734375, + "learning_rate": 5.8538779422474276e-05, + "loss": 2.2373, + "step": 8913 + }, + { + "epoch": 1.6708528584817244, + "grad_norm": 50764.5234375, + "learning_rate": 5.853103683956467e-05, + "loss": 2.2706, + "step": 8914 + }, + { + "epoch": 1.6710402999062794, + "grad_norm": 49211.2890625, + "learning_rate": 5.8523294045949504e-05, + "loss": 2.2107, + "step": 8915 + }, + { + "epoch": 1.671227741330834, + "grad_norm": 54917.66015625, + "learning_rate": 5.851555104182004e-05, + "loss": 2.1962, + "step": 8916 + }, + { + "epoch": 1.671415182755389, + "grad_norm": 53766.953125, + "learning_rate": 5.8507807827367487e-05, + "loss": 2.267, + "step": 8917 + }, + { + "epoch": 1.6716026241799438, + "grad_norm": 52900.94921875, + "learning_rate": 5.850006440278313e-05, + "loss": 2.1947, + "step": 8918 + }, + { + "epoch": 1.6717900656044986, + "grad_norm": 48497.578125, + "learning_rate": 5.84923207682582e-05, + "loss": 2.2813, + "step": 8919 + }, + { + "epoch": 1.6719775070290535, + "grad_norm": 57805.96875, + "learning_rate": 5.8484576923983946e-05, + "loss": 2.2932, + "step": 8920 + }, + { + "epoch": 1.6721649484536083, + "grad_norm": 51256.7109375, + "learning_rate": 5.847683287015166e-05, + "loss": 2.2828, + "step": 8921 + }, + { + "epoch": 1.6723523898781631, + "grad_norm": 51559.3125, + "learning_rate": 5.8469088606952584e-05, + "loss": 2.2413, + "step": 8922 + }, + { + "epoch": 1.6725398313027178, + "grad_norm": 50244.21484375, + "learning_rate": 5.8461344134577986e-05, + "loss": 2.2125, + "step": 8923 + }, + { + "epoch": 1.6727272727272728, + "grad_norm": 47826.34375, + "learning_rate": 5.845359945321919e-05, + "loss": 2.2751, + "step": 8924 + }, + { + "epoch": 1.6729147141518275, + "grad_norm": 54378.7421875, + "learning_rate": 5.844585456306742e-05, + "loss": 2.2283, + "step": 8925 + }, + { + "epoch": 1.6731021555763825, + "grad_norm": 47600.8046875, + "learning_rate": 5.843810946431401e-05, + "loss": 2.1992, + "step": 8926 + }, + { + "epoch": 1.6732895970009372, + "grad_norm": 53282.79296875, + "learning_rate": 5.8430364157150216e-05, + "loss": 2.252, + "step": 8927 + }, + { + "epoch": 1.673477038425492, + "grad_norm": 53396.2109375, + "learning_rate": 5.842261864176737e-05, + "loss": 2.2033, + "step": 8928 + }, + { + "epoch": 1.6736644798500468, + "grad_norm": 49815.1953125, + "learning_rate": 5.841487291835675e-05, + "loss": 2.2627, + "step": 8929 + }, + { + "epoch": 1.6738519212746017, + "grad_norm": 47865.14453125, + "learning_rate": 5.8407126987109696e-05, + "loss": 2.2547, + "step": 8930 + }, + { + "epoch": 1.6740393626991565, + "grad_norm": 52339.36328125, + "learning_rate": 5.83993808482175e-05, + "loss": 2.3288, + "step": 8931 + }, + { + "epoch": 1.6742268041237114, + "grad_norm": 47635.02734375, + "learning_rate": 5.8391634501871485e-05, + "loss": 2.2218, + "step": 8932 + }, + { + "epoch": 1.6744142455482662, + "grad_norm": 49949.87890625, + "learning_rate": 5.8383887948262986e-05, + "loss": 2.2318, + "step": 8933 + }, + { + "epoch": 1.6746016869728209, + "grad_norm": 53440.0234375, + "learning_rate": 5.8376141187583324e-05, + "loss": 2.2873, + "step": 8934 + }, + { + "epoch": 1.674789128397376, + "grad_norm": 51646.12109375, + "learning_rate": 5.836839422002381e-05, + "loss": 2.2295, + "step": 8935 + }, + { + "epoch": 1.6749765698219305, + "grad_norm": 51112.359375, + "learning_rate": 5.836064704577584e-05, + "loss": 2.2921, + "step": 8936 + }, + { + "epoch": 1.6751640112464856, + "grad_norm": 57316.12109375, + "learning_rate": 5.8352899665030726e-05, + "loss": 2.25, + "step": 8937 + }, + { + "epoch": 1.6753514526710402, + "grad_norm": 47820.94140625, + "learning_rate": 5.8345152077979805e-05, + "loss": 2.2274, + "step": 8938 + }, + { + "epoch": 1.675538894095595, + "grad_norm": 53246.58203125, + "learning_rate": 5.8337404284814455e-05, + "loss": 2.2204, + "step": 8939 + }, + { + "epoch": 1.67572633552015, + "grad_norm": 52893.8671875, + "learning_rate": 5.832965628572602e-05, + "loss": 2.2525, + "step": 8940 + }, + { + "epoch": 1.6759137769447048, + "grad_norm": 49963.62109375, + "learning_rate": 5.832190808090589e-05, + "loss": 2.2306, + "step": 8941 + }, + { + "epoch": 1.6761012183692596, + "grad_norm": 49414.2109375, + "learning_rate": 5.831415967054541e-05, + "loss": 2.1871, + "step": 8942 + }, + { + "epoch": 1.6762886597938145, + "grad_norm": 49918.11328125, + "learning_rate": 5.830641105483596e-05, + "loss": 2.2205, + "step": 8943 + }, + { + "epoch": 1.6764761012183693, + "grad_norm": 50511.32421875, + "learning_rate": 5.8298662233968925e-05, + "loss": 2.2028, + "step": 8944 + }, + { + "epoch": 1.676663542642924, + "grad_norm": 46859.86328125, + "learning_rate": 5.82909132081357e-05, + "loss": 2.2176, + "step": 8945 + }, + { + "epoch": 1.676850984067479, + "grad_norm": 51346.3046875, + "learning_rate": 5.8283163977527666e-05, + "loss": 2.2154, + "step": 8946 + }, + { + "epoch": 1.6770384254920336, + "grad_norm": 54575.05859375, + "learning_rate": 5.8275414542336236e-05, + "loss": 2.1495, + "step": 8947 + }, + { + "epoch": 1.6772258669165887, + "grad_norm": 48855.7890625, + "learning_rate": 5.826766490275277e-05, + "loss": 2.2706, + "step": 8948 + }, + { + "epoch": 1.6774133083411433, + "grad_norm": 49066.8515625, + "learning_rate": 5.8259915058968715e-05, + "loss": 2.1971, + "step": 8949 + }, + { + "epoch": 1.6776007497656982, + "grad_norm": 47220.2734375, + "learning_rate": 5.825216501117545e-05, + "loss": 2.1989, + "step": 8950 + }, + { + "epoch": 1.677788191190253, + "grad_norm": 49460.69140625, + "learning_rate": 5.8244414759564435e-05, + "loss": 2.2408, + "step": 8951 + }, + { + "epoch": 1.6779756326148079, + "grad_norm": 52778.09765625, + "learning_rate": 5.823666430432706e-05, + "loss": 2.2121, + "step": 8952 + }, + { + "epoch": 1.6781630740393627, + "grad_norm": 54525.234375, + "learning_rate": 5.8228913645654736e-05, + "loss": 2.2412, + "step": 8953 + }, + { + "epoch": 1.6783505154639176, + "grad_norm": 53452.09765625, + "learning_rate": 5.822116278373894e-05, + "loss": 2.3181, + "step": 8954 + }, + { + "epoch": 1.6785379568884724, + "grad_norm": 53282.953125, + "learning_rate": 5.821341171877106e-05, + "loss": 2.1882, + "step": 8955 + }, + { + "epoch": 1.678725398313027, + "grad_norm": 53451.8125, + "learning_rate": 5.8205660450942575e-05, + "loss": 2.1769, + "step": 8956 + }, + { + "epoch": 1.678912839737582, + "grad_norm": 48938.28125, + "learning_rate": 5.81979089804449e-05, + "loss": 2.2495, + "step": 8957 + }, + { + "epoch": 1.6791002811621367, + "grad_norm": 51566.953125, + "learning_rate": 5.819015730746952e-05, + "loss": 2.2037, + "step": 8958 + }, + { + "epoch": 1.6792877225866918, + "grad_norm": 52859.3359375, + "learning_rate": 5.818240543220785e-05, + "loss": 2.2021, + "step": 8959 + }, + { + "epoch": 1.6794751640112464, + "grad_norm": 49710.59765625, + "learning_rate": 5.817465335485139e-05, + "loss": 2.1858, + "step": 8960 + }, + { + "epoch": 1.6796626054358013, + "grad_norm": 49099.14453125, + "learning_rate": 5.816690107559158e-05, + "loss": 2.1772, + "step": 8961 + }, + { + "epoch": 1.679850046860356, + "grad_norm": 52429.59375, + "learning_rate": 5.815914859461993e-05, + "loss": 2.2386, + "step": 8962 + }, + { + "epoch": 1.680037488284911, + "grad_norm": 50966.66015625, + "learning_rate": 5.815139591212786e-05, + "loss": 2.2244, + "step": 8963 + }, + { + "epoch": 1.6802249297094658, + "grad_norm": 56344.0390625, + "learning_rate": 5.8143643028306894e-05, + "loss": 2.2335, + "step": 8964 + }, + { + "epoch": 1.6804123711340206, + "grad_norm": 48773.77734375, + "learning_rate": 5.81358899433485e-05, + "loss": 2.2288, + "step": 8965 + }, + { + "epoch": 1.6805998125585755, + "grad_norm": 71904.15625, + "learning_rate": 5.8128136657444167e-05, + "loss": 2.2122, + "step": 8966 + }, + { + "epoch": 1.6807872539831301, + "grad_norm": 49059.5078125, + "learning_rate": 5.812038317078541e-05, + "loss": 2.2249, + "step": 8967 + }, + { + "epoch": 1.6809746954076852, + "grad_norm": 50797.03125, + "learning_rate": 5.8112629483563685e-05, + "loss": 2.1752, + "step": 8968 + }, + { + "epoch": 1.6811621368322398, + "grad_norm": 47341.66796875, + "learning_rate": 5.8104875595970564e-05, + "loss": 2.1957, + "step": 8969 + }, + { + "epoch": 1.6813495782567949, + "grad_norm": 49114.07421875, + "learning_rate": 5.80971215081975e-05, + "loss": 2.2082, + "step": 8970 + }, + { + "epoch": 1.6815370196813495, + "grad_norm": 52466.765625, + "learning_rate": 5.808936722043605e-05, + "loss": 2.2568, + "step": 8971 + }, + { + "epoch": 1.6817244611059046, + "grad_norm": 56234.27734375, + "learning_rate": 5.808161273287772e-05, + "loss": 2.2245, + "step": 8972 + }, + { + "epoch": 1.6819119025304592, + "grad_norm": 54152.984375, + "learning_rate": 5.807385804571403e-05, + "loss": 2.2126, + "step": 8973 + }, + { + "epoch": 1.682099343955014, + "grad_norm": 52605.1484375, + "learning_rate": 5.8066103159136496e-05, + "loss": 2.1624, + "step": 8974 + }, + { + "epoch": 1.6822867853795689, + "grad_norm": 47708.984375, + "learning_rate": 5.8058348073336686e-05, + "loss": 2.2382, + "step": 8975 + }, + { + "epoch": 1.6824742268041237, + "grad_norm": 48906.0546875, + "learning_rate": 5.805059278850613e-05, + "loss": 2.2523, + "step": 8976 + }, + { + "epoch": 1.6826616682286786, + "grad_norm": 48339.6875, + "learning_rate": 5.804283730483637e-05, + "loss": 2.2341, + "step": 8977 + }, + { + "epoch": 1.6828491096532332, + "grad_norm": 48783.12109375, + "learning_rate": 5.803508162251894e-05, + "loss": 2.2554, + "step": 8978 + }, + { + "epoch": 1.6830365510777883, + "grad_norm": 53002.22265625, + "learning_rate": 5.8027325741745445e-05, + "loss": 2.1553, + "step": 8979 + }, + { + "epoch": 1.683223992502343, + "grad_norm": 55323.25390625, + "learning_rate": 5.801956966270739e-05, + "loss": 2.2681, + "step": 8980 + }, + { + "epoch": 1.683411433926898, + "grad_norm": 52232.73828125, + "learning_rate": 5.801181338559636e-05, + "loss": 2.2423, + "step": 8981 + }, + { + "epoch": 1.6835988753514526, + "grad_norm": 50277.84375, + "learning_rate": 5.800405691060393e-05, + "loss": 2.216, + "step": 8982 + }, + { + "epoch": 1.6837863167760077, + "grad_norm": 48877.55078125, + "learning_rate": 5.799630023792166e-05, + "loss": 2.2364, + "step": 8983 + }, + { + "epoch": 1.6839737582005623, + "grad_norm": 53956.61328125, + "learning_rate": 5.798854336774116e-05, + "loss": 2.2342, + "step": 8984 + }, + { + "epoch": 1.6841611996251171, + "grad_norm": 49665.6953125, + "learning_rate": 5.7980786300253976e-05, + "loss": 2.2249, + "step": 8985 + }, + { + "epoch": 1.684348641049672, + "grad_norm": 46500.9609375, + "learning_rate": 5.797302903565174e-05, + "loss": 2.2119, + "step": 8986 + }, + { + "epoch": 1.6845360824742268, + "grad_norm": 51012.859375, + "learning_rate": 5.796527157412601e-05, + "loss": 2.2174, + "step": 8987 + }, + { + "epoch": 1.6847235238987817, + "grad_norm": 47319.80078125, + "learning_rate": 5.795751391586841e-05, + "loss": 2.2353, + "step": 8988 + }, + { + "epoch": 1.6849109653233365, + "grad_norm": 54678.22265625, + "learning_rate": 5.7949756061070525e-05, + "loss": 2.2443, + "step": 8989 + }, + { + "epoch": 1.6850984067478914, + "grad_norm": 52701.87109375, + "learning_rate": 5.794199800992397e-05, + "loss": 2.3028, + "step": 8990 + }, + { + "epoch": 1.685285848172446, + "grad_norm": 46638.9140625, + "learning_rate": 5.7934239762620366e-05, + "loss": 2.2129, + "step": 8991 + }, + { + "epoch": 1.685473289597001, + "grad_norm": 52574.53515625, + "learning_rate": 5.792648131935133e-05, + "loss": 2.2516, + "step": 8992 + }, + { + "epoch": 1.6856607310215557, + "grad_norm": 53618.515625, + "learning_rate": 5.791872268030847e-05, + "loss": 2.2095, + "step": 8993 + }, + { + "epoch": 1.6858481724461107, + "grad_norm": 49843.33984375, + "learning_rate": 5.791096384568344e-05, + "loss": 2.2435, + "step": 8994 + }, + { + "epoch": 1.6860356138706654, + "grad_norm": 52946.83203125, + "learning_rate": 5.7903204815667855e-05, + "loss": 2.1833, + "step": 8995 + }, + { + "epoch": 1.6862230552952202, + "grad_norm": 50757.6796875, + "learning_rate": 5.789544559045336e-05, + "loss": 2.2228, + "step": 8996 + }, + { + "epoch": 1.686410496719775, + "grad_norm": 49031.39453125, + "learning_rate": 5.7887686170231606e-05, + "loss": 2.2048, + "step": 8997 + }, + { + "epoch": 1.68659793814433, + "grad_norm": 56289.27734375, + "learning_rate": 5.787992655519421e-05, + "loss": 2.2475, + "step": 8998 + }, + { + "epoch": 1.6867853795688847, + "grad_norm": 54005.921875, + "learning_rate": 5.787216674553285e-05, + "loss": 2.2515, + "step": 8999 + }, + { + "epoch": 1.6869728209934396, + "grad_norm": 49722.63671875, + "learning_rate": 5.786440674143918e-05, + "loss": 2.1983, + "step": 9000 + }, + { + "epoch": 1.6869728209934396, + "eval_loss": 2.2926251888275146, + "eval_runtime": 127.8705, + "eval_samples_per_second": 39.485, + "eval_steps_per_second": 1.979, + "step": 9000 + }, + { + "epoch": 1.6871602624179944, + "grad_norm": 49321.484375, + "learning_rate": 5.785664654310485e-05, + "loss": 2.213, + "step": 9001 + }, + { + "epoch": 1.687347703842549, + "grad_norm": 50262.140625, + "learning_rate": 5.784888615072155e-05, + "loss": 2.2197, + "step": 9002 + }, + { + "epoch": 1.6875351452671041, + "grad_norm": 55377.84375, + "learning_rate": 5.7841125564480936e-05, + "loss": 2.2049, + "step": 9003 + }, + { + "epoch": 1.6877225866916588, + "grad_norm": 48128.53125, + "learning_rate": 5.7833364784574685e-05, + "loss": 2.2722, + "step": 9004 + }, + { + "epoch": 1.6879100281162138, + "grad_norm": 52204.2890625, + "learning_rate": 5.7825603811194474e-05, + "loss": 2.2128, + "step": 9005 + }, + { + "epoch": 1.6880974695407684, + "grad_norm": 50286.26171875, + "learning_rate": 5.7817842644532003e-05, + "loss": 2.2164, + "step": 9006 + }, + { + "epoch": 1.6882849109653233, + "grad_norm": 52808.87109375, + "learning_rate": 5.781008128477896e-05, + "loss": 2.2249, + "step": 9007 + }, + { + "epoch": 1.6884723523898781, + "grad_norm": 49890.078125, + "learning_rate": 5.780231973212703e-05, + "loss": 2.2298, + "step": 9008 + }, + { + "epoch": 1.688659793814433, + "grad_norm": 53387.734375, + "learning_rate": 5.779455798676792e-05, + "loss": 2.2063, + "step": 9009 + }, + { + "epoch": 1.6888472352389878, + "grad_norm": 51572.59765625, + "learning_rate": 5.778679604889332e-05, + "loss": 2.2398, + "step": 9010 + }, + { + "epoch": 1.6890346766635427, + "grad_norm": 49930.3046875, + "learning_rate": 5.777903391869497e-05, + "loss": 2.1644, + "step": 9011 + }, + { + "epoch": 1.6892221180880975, + "grad_norm": 52996.4765625, + "learning_rate": 5.777127159636456e-05, + "loss": 2.1999, + "step": 9012 + }, + { + "epoch": 1.6894095595126521, + "grad_norm": 51005.73046875, + "learning_rate": 5.776350908209381e-05, + "loss": 2.244, + "step": 9013 + }, + { + "epoch": 1.6895970009372072, + "grad_norm": 53821.51171875, + "learning_rate": 5.775574637607447e-05, + "loss": 2.2301, + "step": 9014 + }, + { + "epoch": 1.6897844423617618, + "grad_norm": 56866.06640625, + "learning_rate": 5.774798347849823e-05, + "loss": 2.1715, + "step": 9015 + }, + { + "epoch": 1.689971883786317, + "grad_norm": 50393.99609375, + "learning_rate": 5.774022038955685e-05, + "loss": 2.1151, + "step": 9016 + }, + { + "epoch": 1.6901593252108715, + "grad_norm": 48258.66796875, + "learning_rate": 5.773245710944207e-05, + "loss": 2.2173, + "step": 9017 + }, + { + "epoch": 1.6903467666354264, + "grad_norm": 48517.3828125, + "learning_rate": 5.772469363834562e-05, + "loss": 2.2147, + "step": 9018 + }, + { + "epoch": 1.6905342080599812, + "grad_norm": 46689.69921875, + "learning_rate": 5.7716929976459245e-05, + "loss": 2.2295, + "step": 9019 + }, + { + "epoch": 1.690721649484536, + "grad_norm": 53470.00390625, + "learning_rate": 5.7709166123974724e-05, + "loss": 2.1743, + "step": 9020 + }, + { + "epoch": 1.690909090909091, + "grad_norm": 49162.72265625, + "learning_rate": 5.7701402081083776e-05, + "loss": 2.1909, + "step": 9021 + }, + { + "epoch": 1.6910965323336458, + "grad_norm": 52200.296875, + "learning_rate": 5.769363784797819e-05, + "loss": 2.2435, + "step": 9022 + }, + { + "epoch": 1.6912839737582006, + "grad_norm": 54867.75, + "learning_rate": 5.7685873424849726e-05, + "loss": 2.1676, + "step": 9023 + }, + { + "epoch": 1.6914714151827552, + "grad_norm": 55758.05078125, + "learning_rate": 5.767810881189014e-05, + "loss": 2.1397, + "step": 9024 + }, + { + "epoch": 1.6916588566073103, + "grad_norm": 55543.80859375, + "learning_rate": 5.7670344009291236e-05, + "loss": 2.2074, + "step": 9025 + }, + { + "epoch": 1.691846298031865, + "grad_norm": 48508.87109375, + "learning_rate": 5.7662579017244766e-05, + "loss": 2.1687, + "step": 9026 + }, + { + "epoch": 1.69203373945642, + "grad_norm": 51466.7734375, + "learning_rate": 5.7654813835942546e-05, + "loss": 2.2155, + "step": 9027 + }, + { + "epoch": 1.6922211808809746, + "grad_norm": 50356.421875, + "learning_rate": 5.7647048465576325e-05, + "loss": 2.2012, + "step": 9028 + }, + { + "epoch": 1.6924086223055297, + "grad_norm": 53795.19921875, + "learning_rate": 5.763928290633795e-05, + "loss": 2.2127, + "step": 9029 + }, + { + "epoch": 1.6925960637300843, + "grad_norm": 49284.390625, + "learning_rate": 5.763151715841918e-05, + "loss": 2.2301, + "step": 9030 + }, + { + "epoch": 1.6927835051546392, + "grad_norm": 53769.21484375, + "learning_rate": 5.762375122201182e-05, + "loss": 2.2474, + "step": 9031 + }, + { + "epoch": 1.692970946579194, + "grad_norm": 53930.0078125, + "learning_rate": 5.76159850973077e-05, + "loss": 2.2455, + "step": 9032 + }, + { + "epoch": 1.6931583880037488, + "grad_norm": 50150.22265625, + "learning_rate": 5.7608218784498623e-05, + "loss": 2.219, + "step": 9033 + }, + { + "epoch": 1.6933458294283037, + "grad_norm": 55048.51953125, + "learning_rate": 5.7600452283776394e-05, + "loss": 2.1905, + "step": 9034 + }, + { + "epoch": 1.6935332708528583, + "grad_norm": 54169.671875, + "learning_rate": 5.759268559533287e-05, + "loss": 2.2072, + "step": 9035 + }, + { + "epoch": 1.6937207122774134, + "grad_norm": 55256.0703125, + "learning_rate": 5.7584918719359835e-05, + "loss": 2.1981, + "step": 9036 + }, + { + "epoch": 1.693908153701968, + "grad_norm": 54075.65234375, + "learning_rate": 5.7577151656049157e-05, + "loss": 2.281, + "step": 9037 + }, + { + "epoch": 1.694095595126523, + "grad_norm": 47693.453125, + "learning_rate": 5.756938440559264e-05, + "loss": 2.2187, + "step": 9038 + }, + { + "epoch": 1.6942830365510777, + "grad_norm": 54094.62109375, + "learning_rate": 5.756161696818215e-05, + "loss": 2.2285, + "step": 9039 + }, + { + "epoch": 1.6944704779756328, + "grad_norm": 53287.4765625, + "learning_rate": 5.7553849344009535e-05, + "loss": 2.135, + "step": 9040 + }, + { + "epoch": 1.6946579194001874, + "grad_norm": 57644.6953125, + "learning_rate": 5.7546081533266616e-05, + "loss": 2.2353, + "step": 9041 + }, + { + "epoch": 1.6948453608247422, + "grad_norm": 50393.19921875, + "learning_rate": 5.753831353614529e-05, + "loss": 2.2623, + "step": 9042 + }, + { + "epoch": 1.695032802249297, + "grad_norm": 51444.07421875, + "learning_rate": 5.753054535283737e-05, + "loss": 2.2763, + "step": 9043 + }, + { + "epoch": 1.695220243673852, + "grad_norm": 49548.07421875, + "learning_rate": 5.7522776983534765e-05, + "loss": 2.2013, + "step": 9044 + }, + { + "epoch": 1.6954076850984068, + "grad_norm": 48642.96484375, + "learning_rate": 5.751500842842931e-05, + "loss": 2.2058, + "step": 9045 + }, + { + "epoch": 1.6955951265229616, + "grad_norm": 52877.07421875, + "learning_rate": 5.750723968771289e-05, + "loss": 2.2054, + "step": 9046 + }, + { + "epoch": 1.6957825679475165, + "grad_norm": 54839.7890625, + "learning_rate": 5.749947076157738e-05, + "loss": 2.2597, + "step": 9047 + }, + { + "epoch": 1.695970009372071, + "grad_norm": 53845.64453125, + "learning_rate": 5.749170165021467e-05, + "loss": 2.1364, + "step": 9048 + }, + { + "epoch": 1.6961574507966262, + "grad_norm": 49694.06640625, + "learning_rate": 5.748393235381664e-05, + "loss": 2.2196, + "step": 9049 + }, + { + "epoch": 1.6963448922211808, + "grad_norm": 49500.29296875, + "learning_rate": 5.747616287257519e-05, + "loss": 2.2584, + "step": 9050 + }, + { + "epoch": 1.6965323336457359, + "grad_norm": 53149.62890625, + "learning_rate": 5.74683932066822e-05, + "loss": 2.2268, + "step": 9051 + }, + { + "epoch": 1.6967197750702905, + "grad_norm": 47925.58984375, + "learning_rate": 5.746062335632959e-05, + "loss": 2.2478, + "step": 9052 + }, + { + "epoch": 1.6969072164948453, + "grad_norm": 49649.1484375, + "learning_rate": 5.7452853321709244e-05, + "loss": 2.165, + "step": 9053 + }, + { + "epoch": 1.6970946579194002, + "grad_norm": 55467.81640625, + "learning_rate": 5.744508310301309e-05, + "loss": 2.2729, + "step": 9054 + }, + { + "epoch": 1.697282099343955, + "grad_norm": 49835.0234375, + "learning_rate": 5.7437312700433044e-05, + "loss": 2.1553, + "step": 9055 + }, + { + "epoch": 1.6974695407685099, + "grad_norm": 49635.42578125, + "learning_rate": 5.7429542114161005e-05, + "loss": 2.2227, + "step": 9056 + }, + { + "epoch": 1.6976569821930647, + "grad_norm": 49335.59375, + "learning_rate": 5.742177134438892e-05, + "loss": 2.2736, + "step": 9057 + }, + { + "epoch": 1.6978444236176196, + "grad_norm": 53055.55078125, + "learning_rate": 5.74140003913087e-05, + "loss": 2.2333, + "step": 9058 + }, + { + "epoch": 1.6980318650421742, + "grad_norm": 46161.84765625, + "learning_rate": 5.7406229255112275e-05, + "loss": 2.184, + "step": 9059 + }, + { + "epoch": 1.6982193064667293, + "grad_norm": 58375.51171875, + "learning_rate": 5.73984579359916e-05, + "loss": 2.2282, + "step": 9060 + }, + { + "epoch": 1.6984067478912839, + "grad_norm": 48616.01953125, + "learning_rate": 5.7390686434138607e-05, + "loss": 2.2904, + "step": 9061 + }, + { + "epoch": 1.698594189315839, + "grad_norm": 53156.55078125, + "learning_rate": 5.738291474974523e-05, + "loss": 2.2173, + "step": 9062 + }, + { + "epoch": 1.6987816307403936, + "grad_norm": 53196.078125, + "learning_rate": 5.737514288300344e-05, + "loss": 2.1637, + "step": 9063 + }, + { + "epoch": 1.6989690721649484, + "grad_norm": 50056.41796875, + "learning_rate": 5.7367370834105185e-05, + "loss": 2.2684, + "step": 9064 + }, + { + "epoch": 1.6991565135895033, + "grad_norm": 51512.828125, + "learning_rate": 5.735959860324242e-05, + "loss": 2.2725, + "step": 9065 + }, + { + "epoch": 1.699343955014058, + "grad_norm": 54139.4765625, + "learning_rate": 5.7351826190607106e-05, + "loss": 2.2711, + "step": 9066 + }, + { + "epoch": 1.699531396438613, + "grad_norm": 54308.33984375, + "learning_rate": 5.7344053596391226e-05, + "loss": 2.2441, + "step": 9067 + }, + { + "epoch": 1.6997188378631678, + "grad_norm": 48623.328125, + "learning_rate": 5.7336280820786724e-05, + "loss": 2.2526, + "step": 9068 + }, + { + "epoch": 1.6999062792877226, + "grad_norm": 54825.34765625, + "learning_rate": 5.732850786398561e-05, + "loss": 2.1408, + "step": 9069 + }, + { + "epoch": 1.7000937207122773, + "grad_norm": 53156.82421875, + "learning_rate": 5.732073472617987e-05, + "loss": 2.2489, + "step": 9070 + }, + { + "epoch": 1.7002811621368323, + "grad_norm": 57401.1796875, + "learning_rate": 5.731296140756145e-05, + "loss": 2.1205, + "step": 9071 + }, + { + "epoch": 1.700468603561387, + "grad_norm": 58075.73828125, + "learning_rate": 5.730518790832238e-05, + "loss": 2.1456, + "step": 9072 + }, + { + "epoch": 1.700656044985942, + "grad_norm": 49032.25, + "learning_rate": 5.7297414228654636e-05, + "loss": 2.2181, + "step": 9073 + }, + { + "epoch": 1.7008434864104967, + "grad_norm": 57067.24609375, + "learning_rate": 5.728964036875021e-05, + "loss": 2.1678, + "step": 9074 + }, + { + "epoch": 1.7010309278350515, + "grad_norm": 51938.515625, + "learning_rate": 5.7281866328801126e-05, + "loss": 2.1582, + "step": 9075 + }, + { + "epoch": 1.7012183692596063, + "grad_norm": 54501.17578125, + "learning_rate": 5.727409210899939e-05, + "loss": 2.2021, + "step": 9076 + }, + { + "epoch": 1.7014058106841612, + "grad_norm": 52230.56640625, + "learning_rate": 5.726631770953701e-05, + "loss": 2.1998, + "step": 9077 + }, + { + "epoch": 1.701593252108716, + "grad_norm": 45922.48828125, + "learning_rate": 5.725854313060599e-05, + "loss": 2.1617, + "step": 9078 + }, + { + "epoch": 1.7017806935332709, + "grad_norm": 54723.421875, + "learning_rate": 5.725076837239838e-05, + "loss": 2.2488, + "step": 9079 + }, + { + "epoch": 1.7019681349578257, + "grad_norm": 55280.31640625, + "learning_rate": 5.724299343510618e-05, + "loss": 2.2235, + "step": 9080 + }, + { + "epoch": 1.7021555763823804, + "grad_norm": 48879.68359375, + "learning_rate": 5.723521831892144e-05, + "loss": 2.2601, + "step": 9081 + }, + { + "epoch": 1.7023430178069354, + "grad_norm": 51382.0625, + "learning_rate": 5.7227443024036184e-05, + "loss": 2.2033, + "step": 9082 + }, + { + "epoch": 1.70253045923149, + "grad_norm": 55306.53515625, + "learning_rate": 5.7219667550642465e-05, + "loss": 2.1923, + "step": 9083 + }, + { + "epoch": 1.7027179006560451, + "grad_norm": 51185.46875, + "learning_rate": 5.721189189893231e-05, + "loss": 2.2323, + "step": 9084 + }, + { + "epoch": 1.7029053420805997, + "grad_norm": 55123.9375, + "learning_rate": 5.720411606909779e-05, + "loss": 2.2219, + "step": 9085 + }, + { + "epoch": 1.7030927835051546, + "grad_norm": 50807.51171875, + "learning_rate": 5.719634006133092e-05, + "loss": 2.2136, + "step": 9086 + }, + { + "epoch": 1.7032802249297094, + "grad_norm": 49360.9609375, + "learning_rate": 5.7188563875823796e-05, + "loss": 2.2269, + "step": 9087 + }, + { + "epoch": 1.7034676663542643, + "grad_norm": 50996.41015625, + "learning_rate": 5.718078751276846e-05, + "loss": 2.2391, + "step": 9088 + }, + { + "epoch": 1.7036551077788191, + "grad_norm": 50645.10546875, + "learning_rate": 5.717301097235698e-05, + "loss": 2.2464, + "step": 9089 + }, + { + "epoch": 1.703842549203374, + "grad_norm": 51340.6328125, + "learning_rate": 5.716523425478143e-05, + "loss": 2.1124, + "step": 9090 + }, + { + "epoch": 1.7040299906279288, + "grad_norm": 55283.6015625, + "learning_rate": 5.715745736023389e-05, + "loss": 2.2083, + "step": 9091 + }, + { + "epoch": 1.7042174320524834, + "grad_norm": 53972.3359375, + "learning_rate": 5.7149680288906436e-05, + "loss": 2.2713, + "step": 9092 + }, + { + "epoch": 1.7044048734770385, + "grad_norm": 56717.7890625, + "learning_rate": 5.714190304099114e-05, + "loss": 2.1945, + "step": 9093 + }, + { + "epoch": 1.7045923149015931, + "grad_norm": 51195.20703125, + "learning_rate": 5.71341256166801e-05, + "loss": 2.2048, + "step": 9094 + }, + { + "epoch": 1.7047797563261482, + "grad_norm": 50619.21484375, + "learning_rate": 5.71263480161654e-05, + "loss": 2.2266, + "step": 9095 + }, + { + "epoch": 1.7049671977507028, + "grad_norm": 56902.12890625, + "learning_rate": 5.711857023963915e-05, + "loss": 2.25, + "step": 9096 + }, + { + "epoch": 1.705154639175258, + "grad_norm": 49513.26171875, + "learning_rate": 5.711079228729344e-05, + "loss": 2.2541, + "step": 9097 + }, + { + "epoch": 1.7053420805998125, + "grad_norm": 51684.8125, + "learning_rate": 5.7103014159320375e-05, + "loss": 2.2436, + "step": 9098 + }, + { + "epoch": 1.7055295220243674, + "grad_norm": 48094.08984375, + "learning_rate": 5.7095235855912086e-05, + "loss": 2.1907, + "step": 9099 + }, + { + "epoch": 1.7057169634489222, + "grad_norm": 50753.8984375, + "learning_rate": 5.708745737726067e-05, + "loss": 2.2243, + "step": 9100 + }, + { + "epoch": 1.705904404873477, + "grad_norm": 52135.59765625, + "learning_rate": 5.7079678723558225e-05, + "loss": 2.1929, + "step": 9101 + }, + { + "epoch": 1.706091846298032, + "grad_norm": 52406.7734375, + "learning_rate": 5.7071899894996915e-05, + "loss": 2.327, + "step": 9102 + }, + { + "epoch": 1.7062792877225867, + "grad_norm": 47180.59375, + "learning_rate": 5.7064120891768846e-05, + "loss": 2.2024, + "step": 9103 + }, + { + "epoch": 1.7064667291471416, + "grad_norm": 47493.5859375, + "learning_rate": 5.705634171406614e-05, + "loss": 2.2283, + "step": 9104 + }, + { + "epoch": 1.7066541705716962, + "grad_norm": 55917.1640625, + "learning_rate": 5.704856236208096e-05, + "loss": 2.3551, + "step": 9105 + }, + { + "epoch": 1.7068416119962513, + "grad_norm": 53610.15234375, + "learning_rate": 5.704078283600539e-05, + "loss": 2.2694, + "step": 9106 + }, + { + "epoch": 1.707029053420806, + "grad_norm": 47856.08203125, + "learning_rate": 5.703300313603165e-05, + "loss": 2.2207, + "step": 9107 + }, + { + "epoch": 1.707216494845361, + "grad_norm": 56290.21875, + "learning_rate": 5.7025223262351834e-05, + "loss": 2.2525, + "step": 9108 + }, + { + "epoch": 1.7074039362699156, + "grad_norm": 47999.8515625, + "learning_rate": 5.701744321515812e-05, + "loss": 2.2411, + "step": 9109 + }, + { + "epoch": 1.7075913776944704, + "grad_norm": 46087.46484375, + "learning_rate": 5.700966299464264e-05, + "loss": 2.265, + "step": 9110 + }, + { + "epoch": 1.7077788191190253, + "grad_norm": 49164.0390625, + "learning_rate": 5.700188260099759e-05, + "loss": 2.2457, + "step": 9111 + }, + { + "epoch": 1.7079662605435801, + "grad_norm": 50730.80859375, + "learning_rate": 5.699410203441511e-05, + "loss": 2.1957, + "step": 9112 + }, + { + "epoch": 1.708153701968135, + "grad_norm": 57381.3828125, + "learning_rate": 5.6986321295087384e-05, + "loss": 2.1501, + "step": 9113 + }, + { + "epoch": 1.7083411433926898, + "grad_norm": 51928.91796875, + "learning_rate": 5.697854038320657e-05, + "loss": 2.2717, + "step": 9114 + }, + { + "epoch": 1.7085285848172447, + "grad_norm": 49494.48046875, + "learning_rate": 5.697075929896487e-05, + "loss": 2.2102, + "step": 9115 + }, + { + "epoch": 1.7087160262417993, + "grad_norm": 53690.828125, + "learning_rate": 5.696297804255444e-05, + "loss": 2.2625, + "step": 9116 + }, + { + "epoch": 1.7089034676663544, + "grad_norm": 50058.9765625, + "learning_rate": 5.6955196614167485e-05, + "loss": 2.2842, + "step": 9117 + }, + { + "epoch": 1.709090909090909, + "grad_norm": 52851.12109375, + "learning_rate": 5.694741501399619e-05, + "loss": 2.1678, + "step": 9118 + }, + { + "epoch": 1.709278350515464, + "grad_norm": 50248.2265625, + "learning_rate": 5.693963324223275e-05, + "loss": 2.2481, + "step": 9119 + }, + { + "epoch": 1.7094657919400187, + "grad_norm": 50424.19921875, + "learning_rate": 5.6931851299069386e-05, + "loss": 2.2007, + "step": 9120 + }, + { + "epoch": 1.7096532333645735, + "grad_norm": 51515.45703125, + "learning_rate": 5.6924069184698246e-05, + "loss": 2.2475, + "step": 9121 + }, + { + "epoch": 1.7098406747891284, + "grad_norm": 56419.4921875, + "learning_rate": 5.691628689931161e-05, + "loss": 2.1518, + "step": 9122 + }, + { + "epoch": 1.7100281162136832, + "grad_norm": 51024.14453125, + "learning_rate": 5.690850444310163e-05, + "loss": 2.2646, + "step": 9123 + }, + { + "epoch": 1.710215557638238, + "grad_norm": 52844.1015625, + "learning_rate": 5.690072181626056e-05, + "loss": 2.2129, + "step": 9124 + }, + { + "epoch": 1.710402999062793, + "grad_norm": 55917.671875, + "learning_rate": 5.68929390189806e-05, + "loss": 2.2195, + "step": 9125 + }, + { + "epoch": 1.7105904404873478, + "grad_norm": 49110.37890625, + "learning_rate": 5.688515605145398e-05, + "loss": 2.3119, + "step": 9126 + }, + { + "epoch": 1.7107778819119024, + "grad_norm": 51771.86328125, + "learning_rate": 5.687737291387294e-05, + "loss": 2.1724, + "step": 9127 + }, + { + "epoch": 1.7109653233364575, + "grad_norm": 49109.671875, + "learning_rate": 5.686958960642972e-05, + "loss": 2.1532, + "step": 9128 + }, + { + "epoch": 1.711152764761012, + "grad_norm": 51059.56640625, + "learning_rate": 5.686180612931651e-05, + "loss": 2.2094, + "step": 9129 + }, + { + "epoch": 1.7113402061855671, + "grad_norm": 53975.80859375, + "learning_rate": 5.685402248272561e-05, + "loss": 2.2582, + "step": 9130 + }, + { + "epoch": 1.7115276476101218, + "grad_norm": 49619.12109375, + "learning_rate": 5.684623866684924e-05, + "loss": 2.2572, + "step": 9131 + }, + { + "epoch": 1.7117150890346766, + "grad_norm": 48725.375, + "learning_rate": 5.683845468187964e-05, + "loss": 2.2538, + "step": 9132 + }, + { + "epoch": 1.7119025304592315, + "grad_norm": 49582.0625, + "learning_rate": 5.683067052800909e-05, + "loss": 2.2082, + "step": 9133 + }, + { + "epoch": 1.7120899718837863, + "grad_norm": 50690.140625, + "learning_rate": 5.682288620542981e-05, + "loss": 2.1828, + "step": 9134 + }, + { + "epoch": 1.7122774133083412, + "grad_norm": 51237.859375, + "learning_rate": 5.681510171433411e-05, + "loss": 2.1733, + "step": 9135 + }, + { + "epoch": 1.712464854732896, + "grad_norm": 51575.3828125, + "learning_rate": 5.680731705491421e-05, + "loss": 2.194, + "step": 9136 + }, + { + "epoch": 1.7126522961574508, + "grad_norm": 52226.44140625, + "learning_rate": 5.679953222736243e-05, + "loss": 2.2715, + "step": 9137 + }, + { + "epoch": 1.7128397375820055, + "grad_norm": 52823.45703125, + "learning_rate": 5.679174723187101e-05, + "loss": 2.2325, + "step": 9138 + }, + { + "epoch": 1.7130271790065605, + "grad_norm": 47675.51171875, + "learning_rate": 5.6783962068632234e-05, + "loss": 2.2289, + "step": 9139 + }, + { + "epoch": 1.7132146204311152, + "grad_norm": 51331.2421875, + "learning_rate": 5.6776176737838394e-05, + "loss": 2.239, + "step": 9140 + }, + { + "epoch": 1.7134020618556702, + "grad_norm": 53319.70703125, + "learning_rate": 5.6768391239681775e-05, + "loss": 2.1842, + "step": 9141 + }, + { + "epoch": 1.7135895032802249, + "grad_norm": 58643.75, + "learning_rate": 5.6760605574354665e-05, + "loss": 2.1678, + "step": 9142 + }, + { + "epoch": 1.7137769447047797, + "grad_norm": 50460.0546875, + "learning_rate": 5.675281974204938e-05, + "loss": 2.1457, + "step": 9143 + }, + { + "epoch": 1.7139643861293345, + "grad_norm": 48306.35546875, + "learning_rate": 5.6745033742958175e-05, + "loss": 2.2238, + "step": 9144 + }, + { + "epoch": 1.7141518275538894, + "grad_norm": 49999.87890625, + "learning_rate": 5.67372475772734e-05, + "loss": 2.1998, + "step": 9145 + }, + { + "epoch": 1.7143392689784442, + "grad_norm": 51513.50390625, + "learning_rate": 5.672946124518734e-05, + "loss": 2.1925, + "step": 9146 + }, + { + "epoch": 1.714526710402999, + "grad_norm": 52180.515625, + "learning_rate": 5.6721674746892306e-05, + "loss": 2.314, + "step": 9147 + }, + { + "epoch": 1.714714151827554, + "grad_norm": 51364.93359375, + "learning_rate": 5.671388808258064e-05, + "loss": 2.2436, + "step": 9148 + }, + { + "epoch": 1.7149015932521086, + "grad_norm": 51963.27734375, + "learning_rate": 5.670610125244461e-05, + "loss": 2.1828, + "step": 9149 + }, + { + "epoch": 1.7150890346766636, + "grad_norm": 49195.8125, + "learning_rate": 5.66983142566766e-05, + "loss": 2.2371, + "step": 9150 + }, + { + "epoch": 1.7152764761012183, + "grad_norm": 46936.0546875, + "learning_rate": 5.66905270954689e-05, + "loss": 2.2496, + "step": 9151 + }, + { + "epoch": 1.7154639175257733, + "grad_norm": 47997.23828125, + "learning_rate": 5.668273976901387e-05, + "loss": 2.1959, + "step": 9152 + }, + { + "epoch": 1.715651358950328, + "grad_norm": 54005.32421875, + "learning_rate": 5.6674952277503813e-05, + "loss": 2.1899, + "step": 9153 + }, + { + "epoch": 1.715838800374883, + "grad_norm": 51294.4609375, + "learning_rate": 5.6667164621131097e-05, + "loss": 2.2286, + "step": 9154 + }, + { + "epoch": 1.7160262417994376, + "grad_norm": 49041.5078125, + "learning_rate": 5.665937680008806e-05, + "loss": 2.3359, + "step": 9155 + }, + { + "epoch": 1.7162136832239925, + "grad_norm": 55383.4140625, + "learning_rate": 5.665158881456705e-05, + "loss": 2.1818, + "step": 9156 + }, + { + "epoch": 1.7164011246485473, + "grad_norm": 50528.98828125, + "learning_rate": 5.6643800664760417e-05, + "loss": 2.2593, + "step": 9157 + }, + { + "epoch": 1.7165885660731022, + "grad_norm": 46669.984375, + "learning_rate": 5.663601235086053e-05, + "loss": 2.1828, + "step": 9158 + }, + { + "epoch": 1.716776007497657, + "grad_norm": 52604.6875, + "learning_rate": 5.6628223873059726e-05, + "loss": 2.2308, + "step": 9159 + }, + { + "epoch": 1.7169634489222116, + "grad_norm": 47092.77734375, + "learning_rate": 5.662043523155041e-05, + "loss": 2.2304, + "step": 9160 + }, + { + "epoch": 1.7171508903467667, + "grad_norm": 50560.390625, + "learning_rate": 5.6612646426524904e-05, + "loss": 2.225, + "step": 9161 + }, + { + "epoch": 1.7173383317713213, + "grad_norm": 49701.34765625, + "learning_rate": 5.660485745817562e-05, + "loss": 2.2291, + "step": 9162 + }, + { + "epoch": 1.7175257731958764, + "grad_norm": 52978.07421875, + "learning_rate": 5.659706832669492e-05, + "loss": 2.1609, + "step": 9163 + }, + { + "epoch": 1.717713214620431, + "grad_norm": 50339.28125, + "learning_rate": 5.658927903227517e-05, + "loss": 2.2565, + "step": 9164 + }, + { + "epoch": 1.717900656044986, + "grad_norm": 49452.17578125, + "learning_rate": 5.658148957510879e-05, + "loss": 2.2569, + "step": 9165 + }, + { + "epoch": 1.7180880974695407, + "grad_norm": 51691.71484375, + "learning_rate": 5.657369995538814e-05, + "loss": 2.1743, + "step": 9166 + }, + { + "epoch": 1.7182755388940956, + "grad_norm": 48023.91796875, + "learning_rate": 5.6565910173305625e-05, + "loss": 2.2197, + "step": 9167 + }, + { + "epoch": 1.7184629803186504, + "grad_norm": 51512.03515625, + "learning_rate": 5.655812022905364e-05, + "loss": 2.1736, + "step": 9168 + }, + { + "epoch": 1.7186504217432053, + "grad_norm": 50839.00390625, + "learning_rate": 5.655033012282459e-05, + "loss": 2.2123, + "step": 9169 + }, + { + "epoch": 1.71883786316776, + "grad_norm": 48491.546875, + "learning_rate": 5.6542539854810874e-05, + "loss": 2.2536, + "step": 9170 + }, + { + "epoch": 1.719025304592315, + "grad_norm": 48803.63671875, + "learning_rate": 5.6534749425204915e-05, + "loss": 2.2091, + "step": 9171 + }, + { + "epoch": 1.7192127460168698, + "grad_norm": 48749.06640625, + "learning_rate": 5.6526958834199105e-05, + "loss": 2.192, + "step": 9172 + }, + { + "epoch": 1.7194001874414244, + "grad_norm": 51980.51953125, + "learning_rate": 5.6519168081985896e-05, + "loss": 2.2598, + "step": 9173 + }, + { + "epoch": 1.7195876288659795, + "grad_norm": 56359.4765625, + "learning_rate": 5.6511377168757674e-05, + "loss": 2.1712, + "step": 9174 + }, + { + "epoch": 1.7197750702905341, + "grad_norm": 52125.76171875, + "learning_rate": 5.650358609470687e-05, + "loss": 2.1951, + "step": 9175 + }, + { + "epoch": 1.7199625117150892, + "grad_norm": 48966.93359375, + "learning_rate": 5.649579486002593e-05, + "loss": 2.2664, + "step": 9176 + }, + { + "epoch": 1.7201499531396438, + "grad_norm": 49765.125, + "learning_rate": 5.648800346490728e-05, + "loss": 2.267, + "step": 9177 + }, + { + "epoch": 1.7203373945641987, + "grad_norm": 48484.6328125, + "learning_rate": 5.6480211909543354e-05, + "loss": 2.2139, + "step": 9178 + }, + { + "epoch": 1.7205248359887535, + "grad_norm": 50904.453125, + "learning_rate": 5.6472420194126595e-05, + "loss": 2.2181, + "step": 9179 + }, + { + "epoch": 1.7207122774133083, + "grad_norm": 50387.5703125, + "learning_rate": 5.646462831884946e-05, + "loss": 2.2296, + "step": 9180 + }, + { + "epoch": 1.7208997188378632, + "grad_norm": 51341.26171875, + "learning_rate": 5.6456836283904366e-05, + "loss": 2.2572, + "step": 9181 + }, + { + "epoch": 1.721087160262418, + "grad_norm": 46740.84765625, + "learning_rate": 5.644904408948381e-05, + "loss": 2.2294, + "step": 9182 + }, + { + "epoch": 1.7212746016869729, + "grad_norm": 54181.3046875, + "learning_rate": 5.644125173578021e-05, + "loss": 2.2333, + "step": 9183 + }, + { + "epoch": 1.7214620431115275, + "grad_norm": 54171.69921875, + "learning_rate": 5.643345922298604e-05, + "loss": 2.1516, + "step": 9184 + }, + { + "epoch": 1.7216494845360826, + "grad_norm": 53278.7109375, + "learning_rate": 5.642566655129378e-05, + "loss": 2.191, + "step": 9185 + }, + { + "epoch": 1.7218369259606372, + "grad_norm": 56056.16796875, + "learning_rate": 5.6417873720895884e-05, + "loss": 2.2675, + "step": 9186 + }, + { + "epoch": 1.7220243673851923, + "grad_norm": 52585.4375, + "learning_rate": 5.6410080731984815e-05, + "loss": 2.263, + "step": 9187 + }, + { + "epoch": 1.722211808809747, + "grad_norm": 57093.6875, + "learning_rate": 5.640228758475309e-05, + "loss": 2.2508, + "step": 9188 + }, + { + "epoch": 1.7223992502343017, + "grad_norm": 50723.2265625, + "learning_rate": 5.639449427939314e-05, + "loss": 2.2436, + "step": 9189 + }, + { + "epoch": 1.7225866916588566, + "grad_norm": 48611.46484375, + "learning_rate": 5.638670081609748e-05, + "loss": 2.2525, + "step": 9190 + }, + { + "epoch": 1.7227741330834114, + "grad_norm": 51028.9140625, + "learning_rate": 5.637890719505859e-05, + "loss": 2.2045, + "step": 9191 + }, + { + "epoch": 1.7229615745079663, + "grad_norm": 50868.42578125, + "learning_rate": 5.6371113416468965e-05, + "loss": 2.1971, + "step": 9192 + }, + { + "epoch": 1.7231490159325211, + "grad_norm": 50758.2734375, + "learning_rate": 5.63633194805211e-05, + "loss": 2.2129, + "step": 9193 + }, + { + "epoch": 1.723336457357076, + "grad_norm": 53360.45703125, + "learning_rate": 5.635552538740748e-05, + "loss": 2.2338, + "step": 9194 + }, + { + "epoch": 1.7235238987816306, + "grad_norm": 50514.95703125, + "learning_rate": 5.634773113732065e-05, + "loss": 2.2618, + "step": 9195 + }, + { + "epoch": 1.7237113402061857, + "grad_norm": 53687.9296875, + "learning_rate": 5.633993673045307e-05, + "loss": 2.2498, + "step": 9196 + }, + { + "epoch": 1.7238987816307403, + "grad_norm": 54355.26171875, + "learning_rate": 5.633214216699727e-05, + "loss": 2.2151, + "step": 9197 + }, + { + "epoch": 1.7240862230552954, + "grad_norm": 52073.00390625, + "learning_rate": 5.632434744714577e-05, + "loss": 2.2254, + "step": 9198 + }, + { + "epoch": 1.72427366447985, + "grad_norm": 59219.84375, + "learning_rate": 5.63165525710911e-05, + "loss": 2.1784, + "step": 9199 + }, + { + "epoch": 1.7244611059044048, + "grad_norm": 51965.64453125, + "learning_rate": 5.630875753902576e-05, + "loss": 2.2743, + "step": 9200 + }, + { + "epoch": 1.7246485473289597, + "grad_norm": 53921.04296875, + "learning_rate": 5.63009623511423e-05, + "loss": 2.2214, + "step": 9201 + }, + { + "epoch": 1.7248359887535145, + "grad_norm": 49396.77734375, + "learning_rate": 5.629316700763323e-05, + "loss": 2.221, + "step": 9202 + }, + { + "epoch": 1.7250234301780694, + "grad_norm": 46926.62890625, + "learning_rate": 5.628537150869109e-05, + "loss": 2.2459, + "step": 9203 + }, + { + "epoch": 1.7252108716026242, + "grad_norm": 50393.51171875, + "learning_rate": 5.627757585450842e-05, + "loss": 2.1838, + "step": 9204 + }, + { + "epoch": 1.725398313027179, + "grad_norm": 53744.30859375, + "learning_rate": 5.626978004527777e-05, + "loss": 2.3037, + "step": 9205 + }, + { + "epoch": 1.7255857544517337, + "grad_norm": 52137.72265625, + "learning_rate": 5.62619840811917e-05, + "loss": 2.2021, + "step": 9206 + }, + { + "epoch": 1.7257731958762887, + "grad_norm": 48890.296875, + "learning_rate": 5.6254187962442716e-05, + "loss": 2.1895, + "step": 9207 + }, + { + "epoch": 1.7259606373008434, + "grad_norm": 54022.23828125, + "learning_rate": 5.6246391689223424e-05, + "loss": 2.1483, + "step": 9208 + }, + { + "epoch": 1.7261480787253984, + "grad_norm": 53277.21875, + "learning_rate": 5.623859526172632e-05, + "loss": 2.1213, + "step": 9209 + }, + { + "epoch": 1.726335520149953, + "grad_norm": 51704.00390625, + "learning_rate": 5.6230798680144036e-05, + "loss": 2.1521, + "step": 9210 + }, + { + "epoch": 1.7265229615745081, + "grad_norm": 50070.265625, + "learning_rate": 5.6223001944669096e-05, + "loss": 2.1878, + "step": 9211 + }, + { + "epoch": 1.7267104029990628, + "grad_norm": 48409.01171875, + "learning_rate": 5.621520505549406e-05, + "loss": 2.2451, + "step": 9212 + }, + { + "epoch": 1.7268978444236176, + "grad_norm": 53598.6484375, + "learning_rate": 5.6207408012811524e-05, + "loss": 2.171, + "step": 9213 + }, + { + "epoch": 1.7270852858481724, + "grad_norm": 51727.53125, + "learning_rate": 5.619961081681406e-05, + "loss": 2.2476, + "step": 9214 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 50299.52734375, + "learning_rate": 5.6191813467694265e-05, + "loss": 2.2436, + "step": 9215 + }, + { + "epoch": 1.7274601686972821, + "grad_norm": 49563.7734375, + "learning_rate": 5.618401596564469e-05, + "loss": 2.1649, + "step": 9216 + }, + { + "epoch": 1.7276476101218368, + "grad_norm": 48551.4453125, + "learning_rate": 5.617621831085793e-05, + "loss": 2.2362, + "step": 9217 + }, + { + "epoch": 1.7278350515463918, + "grad_norm": 47347.7421875, + "learning_rate": 5.6168420503526586e-05, + "loss": 2.1992, + "step": 9218 + }, + { + "epoch": 1.7280224929709465, + "grad_norm": 47827.390625, + "learning_rate": 5.6160622543843256e-05, + "loss": 2.217, + "step": 9219 + }, + { + "epoch": 1.7282099343955015, + "grad_norm": 52367.77734375, + "learning_rate": 5.615282443200054e-05, + "loss": 2.1486, + "step": 9220 + }, + { + "epoch": 1.7283973758200561, + "grad_norm": 55678.796875, + "learning_rate": 5.614502616819105e-05, + "loss": 2.1997, + "step": 9221 + }, + { + "epoch": 1.7285848172446112, + "grad_norm": 55600.73046875, + "learning_rate": 5.613722775260735e-05, + "loss": 2.3358, + "step": 9222 + }, + { + "epoch": 1.7287722586691658, + "grad_norm": 49025.83203125, + "learning_rate": 5.6129429185442107e-05, + "loss": 2.2282, + "step": 9223 + }, + { + "epoch": 1.7289597000937207, + "grad_norm": 51027.7578125, + "learning_rate": 5.612163046688791e-05, + "loss": 2.2222, + "step": 9224 + }, + { + "epoch": 1.7291471415182755, + "grad_norm": 49468.2109375, + "learning_rate": 5.6113831597137367e-05, + "loss": 2.1592, + "step": 9225 + }, + { + "epoch": 1.7293345829428304, + "grad_norm": 50481.46875, + "learning_rate": 5.610603257638311e-05, + "loss": 2.2547, + "step": 9226 + }, + { + "epoch": 1.7295220243673852, + "grad_norm": 50167.91015625, + "learning_rate": 5.609823340481777e-05, + "loss": 2.18, + "step": 9227 + }, + { + "epoch": 1.72970946579194, + "grad_norm": 53797.28515625, + "learning_rate": 5.609043408263396e-05, + "loss": 2.2236, + "step": 9228 + }, + { + "epoch": 1.729896907216495, + "grad_norm": 53583.17578125, + "learning_rate": 5.608263461002433e-05, + "loss": 2.2999, + "step": 9229 + }, + { + "epoch": 1.7300843486410495, + "grad_norm": 52062.58203125, + "learning_rate": 5.607483498718152e-05, + "loss": 2.2308, + "step": 9230 + }, + { + "epoch": 1.7302717900656046, + "grad_norm": 55005.3515625, + "learning_rate": 5.6067035214298156e-05, + "loss": 2.3156, + "step": 9231 + }, + { + "epoch": 1.7304592314901592, + "grad_norm": 55102.81640625, + "learning_rate": 5.605923529156689e-05, + "loss": 2.2579, + "step": 9232 + }, + { + "epoch": 1.7306466729147143, + "grad_norm": 49049.90625, + "learning_rate": 5.605143521918037e-05, + "loss": 2.1416, + "step": 9233 + }, + { + "epoch": 1.730834114339269, + "grad_norm": 51877.7109375, + "learning_rate": 5.604363499733124e-05, + "loss": 2.1935, + "step": 9234 + }, + { + "epoch": 1.7310215557638238, + "grad_norm": 50913.0390625, + "learning_rate": 5.603583462621217e-05, + "loss": 2.2503, + "step": 9235 + }, + { + "epoch": 1.7312089971883786, + "grad_norm": 51940.12890625, + "learning_rate": 5.6028034106015815e-05, + "loss": 2.2815, + "step": 9236 + }, + { + "epoch": 1.7313964386129335, + "grad_norm": 49889.515625, + "learning_rate": 5.602023343693481e-05, + "loss": 2.2695, + "step": 9237 + }, + { + "epoch": 1.7315838800374883, + "grad_norm": 48301.85546875, + "learning_rate": 5.601243261916186e-05, + "loss": 2.2229, + "step": 9238 + }, + { + "epoch": 1.7317713214620432, + "grad_norm": 53721.94921875, + "learning_rate": 5.6004631652889615e-05, + "loss": 2.2277, + "step": 9239 + }, + { + "epoch": 1.731958762886598, + "grad_norm": 49792.43359375, + "learning_rate": 5.599683053831075e-05, + "loss": 2.2281, + "step": 9240 + }, + { + "epoch": 1.7321462043111526, + "grad_norm": 53944.765625, + "learning_rate": 5.5989029275617956e-05, + "loss": 2.1695, + "step": 9241 + }, + { + "epoch": 1.7323336457357077, + "grad_norm": 49442.2734375, + "learning_rate": 5.5981227865003896e-05, + "loss": 2.2337, + "step": 9242 + }, + { + "epoch": 1.7325210871602623, + "grad_norm": 51017.94921875, + "learning_rate": 5.597342630666127e-05, + "loss": 2.1817, + "step": 9243 + }, + { + "epoch": 1.7327085285848174, + "grad_norm": 51175.8359375, + "learning_rate": 5.596562460078276e-05, + "loss": 2.2092, + "step": 9244 + }, + { + "epoch": 1.732895970009372, + "grad_norm": 51294.69140625, + "learning_rate": 5.595782274756105e-05, + "loss": 2.1906, + "step": 9245 + }, + { + "epoch": 1.7330834114339269, + "grad_norm": 50480.6875, + "learning_rate": 5.5950020747188836e-05, + "loss": 2.2356, + "step": 9246 + }, + { + "epoch": 1.7332708528584817, + "grad_norm": 55384.5, + "learning_rate": 5.594221859985883e-05, + "loss": 2.209, + "step": 9247 + }, + { + "epoch": 1.7334582942830365, + "grad_norm": 53046.1171875, + "learning_rate": 5.593441630576374e-05, + "loss": 2.1824, + "step": 9248 + }, + { + "epoch": 1.7336457357075914, + "grad_norm": 48918.3828125, + "learning_rate": 5.5926613865096246e-05, + "loss": 2.226, + "step": 9249 + }, + { + "epoch": 1.7338331771321462, + "grad_norm": 56355.609375, + "learning_rate": 5.5918811278049076e-05, + "loss": 2.2043, + "step": 9250 + }, + { + "epoch": 1.734020618556701, + "grad_norm": 48873.54296875, + "learning_rate": 5.591100854481495e-05, + "loss": 2.1688, + "step": 9251 + }, + { + "epoch": 1.7342080599812557, + "grad_norm": 51015.765625, + "learning_rate": 5.5903205665586554e-05, + "loss": 2.2441, + "step": 9252 + }, + { + "epoch": 1.7343955014058108, + "grad_norm": 45652.29296875, + "learning_rate": 5.589540264055666e-05, + "loss": 2.1823, + "step": 9253 + }, + { + "epoch": 1.7345829428303654, + "grad_norm": 55918.2109375, + "learning_rate": 5.588759946991795e-05, + "loss": 2.2467, + "step": 9254 + }, + { + "epoch": 1.7347703842549205, + "grad_norm": 51602.1015625, + "learning_rate": 5.587979615386317e-05, + "loss": 2.2454, + "step": 9255 + }, + { + "epoch": 1.734957825679475, + "grad_norm": 50352.06640625, + "learning_rate": 5.587199269258505e-05, + "loss": 2.1814, + "step": 9256 + }, + { + "epoch": 1.73514526710403, + "grad_norm": 51794.26953125, + "learning_rate": 5.586418908627632e-05, + "loss": 2.2625, + "step": 9257 + }, + { + "epoch": 1.7353327085285848, + "grad_norm": 51215.25390625, + "learning_rate": 5.585638533512973e-05, + "loss": 2.1824, + "step": 9258 + }, + { + "epoch": 1.7355201499531396, + "grad_norm": 51571.48046875, + "learning_rate": 5.584858143933801e-05, + "loss": 2.2907, + "step": 9259 + }, + { + "epoch": 1.7357075913776945, + "grad_norm": 51141.02734375, + "learning_rate": 5.584077739909391e-05, + "loss": 2.2209, + "step": 9260 + }, + { + "epoch": 1.7358950328022493, + "grad_norm": 49229.75, + "learning_rate": 5.583297321459018e-05, + "loss": 2.2416, + "step": 9261 + }, + { + "epoch": 1.7360824742268042, + "grad_norm": 52227.99609375, + "learning_rate": 5.582516888601957e-05, + "loss": 2.243, + "step": 9262 + }, + { + "epoch": 1.7362699156513588, + "grad_norm": 47240.4375, + "learning_rate": 5.581736441357485e-05, + "loss": 2.2143, + "step": 9263 + }, + { + "epoch": 1.7364573570759139, + "grad_norm": 49824.02734375, + "learning_rate": 5.580955979744876e-05, + "loss": 2.2289, + "step": 9264 + }, + { + "epoch": 1.7366447985004685, + "grad_norm": 51233.97265625, + "learning_rate": 5.580175503783409e-05, + "loss": 2.2424, + "step": 9265 + }, + { + "epoch": 1.7368322399250236, + "grad_norm": 51486.30859375, + "learning_rate": 5.579395013492359e-05, + "loss": 2.1735, + "step": 9266 + }, + { + "epoch": 1.7370196813495782, + "grad_norm": 51215.39453125, + "learning_rate": 5.578614508891003e-05, + "loss": 2.2752, + "step": 9267 + }, + { + "epoch": 1.737207122774133, + "grad_norm": 49669.94140625, + "learning_rate": 5.577833989998619e-05, + "loss": 2.2341, + "step": 9268 + }, + { + "epoch": 1.7373945641986879, + "grad_norm": 47363.765625, + "learning_rate": 5.5770534568344834e-05, + "loss": 2.2406, + "step": 9269 + }, + { + "epoch": 1.7375820056232427, + "grad_norm": 52919.85546875, + "learning_rate": 5.576272909417877e-05, + "loss": 2.2254, + "step": 9270 + }, + { + "epoch": 1.7377694470477976, + "grad_norm": 61826.328125, + "learning_rate": 5.5754923477680775e-05, + "loss": 2.1086, + "step": 9271 + }, + { + "epoch": 1.7379568884723524, + "grad_norm": 55961.1875, + "learning_rate": 5.5747117719043606e-05, + "loss": 2.1578, + "step": 9272 + }, + { + "epoch": 1.7381443298969073, + "grad_norm": 53180.69140625, + "learning_rate": 5.573931181846011e-05, + "loss": 2.1875, + "step": 9273 + }, + { + "epoch": 1.7383317713214619, + "grad_norm": 51220.30078125, + "learning_rate": 5.573150577612303e-05, + "loss": 2.2089, + "step": 9274 + }, + { + "epoch": 1.738519212746017, + "grad_norm": 50781.47265625, + "learning_rate": 5.5723699592225196e-05, + "loss": 2.2548, + "step": 9275 + }, + { + "epoch": 1.7387066541705716, + "grad_norm": 52454.2109375, + "learning_rate": 5.571589326695941e-05, + "loss": 2.1812, + "step": 9276 + }, + { + "epoch": 1.7388940955951266, + "grad_norm": 52777.58203125, + "learning_rate": 5.5708086800518456e-05, + "loss": 2.2533, + "step": 9277 + }, + { + "epoch": 1.7390815370196813, + "grad_norm": 51309.62109375, + "learning_rate": 5.5700280193095165e-05, + "loss": 2.2239, + "step": 9278 + }, + { + "epoch": 1.7392689784442363, + "grad_norm": 49741.359375, + "learning_rate": 5.569247344488235e-05, + "loss": 2.2539, + "step": 9279 + }, + { + "epoch": 1.739456419868791, + "grad_norm": 52575.23046875, + "learning_rate": 5.5684666556072806e-05, + "loss": 2.2328, + "step": 9280 + }, + { + "epoch": 1.7396438612933458, + "grad_norm": 54410.390625, + "learning_rate": 5.5676859526859396e-05, + "loss": 2.2485, + "step": 9281 + }, + { + "epoch": 1.7398313027179007, + "grad_norm": 52222.76953125, + "learning_rate": 5.566905235743488e-05, + "loss": 2.2133, + "step": 9282 + }, + { + "epoch": 1.7400187441424555, + "grad_norm": 50187.41796875, + "learning_rate": 5.566124504799216e-05, + "loss": 2.2273, + "step": 9283 + }, + { + "epoch": 1.7402061855670103, + "grad_norm": 58180.39453125, + "learning_rate": 5.5653437598724e-05, + "loss": 2.1165, + "step": 9284 + }, + { + "epoch": 1.7403936269915652, + "grad_norm": 52315.39453125, + "learning_rate": 5.564563000982327e-05, + "loss": 2.1619, + "step": 9285 + }, + { + "epoch": 1.74058106841612, + "grad_norm": 50983.7109375, + "learning_rate": 5.563782228148281e-05, + "loss": 2.2063, + "step": 9286 + }, + { + "epoch": 1.7407685098406747, + "grad_norm": 48454.41796875, + "learning_rate": 5.5630014413895425e-05, + "loss": 2.2553, + "step": 9287 + }, + { + "epoch": 1.7409559512652297, + "grad_norm": 49378.54296875, + "learning_rate": 5.5622206407254e-05, + "loss": 2.1999, + "step": 9288 + }, + { + "epoch": 1.7411433926897844, + "grad_norm": 50039.7109375, + "learning_rate": 5.5614398261751357e-05, + "loss": 2.2408, + "step": 9289 + }, + { + "epoch": 1.7413308341143394, + "grad_norm": 49482.875, + "learning_rate": 5.5606589977580356e-05, + "loss": 2.2564, + "step": 9290 + }, + { + "epoch": 1.741518275538894, + "grad_norm": 52682.375, + "learning_rate": 5.5598781554933854e-05, + "loss": 2.2404, + "step": 9291 + }, + { + "epoch": 1.741705716963449, + "grad_norm": 50063.41015625, + "learning_rate": 5.559097299400471e-05, + "loss": 2.2512, + "step": 9292 + }, + { + "epoch": 1.7418931583880037, + "grad_norm": 54285.34765625, + "learning_rate": 5.558316429498578e-05, + "loss": 2.3001, + "step": 9293 + }, + { + "epoch": 1.7420805998125586, + "grad_norm": 48696.2109375, + "learning_rate": 5.557535545806994e-05, + "loss": 2.2295, + "step": 9294 + }, + { + "epoch": 1.7422680412371134, + "grad_norm": 52123.515625, + "learning_rate": 5.556754648345003e-05, + "loss": 2.2117, + "step": 9295 + }, + { + "epoch": 1.7424554826616683, + "grad_norm": 50842.82421875, + "learning_rate": 5.555973737131895e-05, + "loss": 2.2711, + "step": 9296 + }, + { + "epoch": 1.7426429240862231, + "grad_norm": 52888.91015625, + "learning_rate": 5.5551928121869565e-05, + "loss": 2.2928, + "step": 9297 + }, + { + "epoch": 1.7428303655107777, + "grad_norm": 48479.5625, + "learning_rate": 5.554411873529476e-05, + "loss": 2.2273, + "step": 9298 + }, + { + "epoch": 1.7430178069353328, + "grad_norm": 53916.75390625, + "learning_rate": 5.5536309211787405e-05, + "loss": 2.1368, + "step": 9299 + }, + { + "epoch": 1.7432052483598874, + "grad_norm": 56330.5546875, + "learning_rate": 5.552849955154038e-05, + "loss": 2.1955, + "step": 9300 + }, + { + "epoch": 1.7433926897844425, + "grad_norm": 49749.32421875, + "learning_rate": 5.55206897547466e-05, + "loss": 2.2115, + "step": 9301 + }, + { + "epoch": 1.7435801312089971, + "grad_norm": 50514.05078125, + "learning_rate": 5.551287982159893e-05, + "loss": 2.1574, + "step": 9302 + }, + { + "epoch": 1.743767572633552, + "grad_norm": 50278.8671875, + "learning_rate": 5.5505069752290286e-05, + "loss": 2.2307, + "step": 9303 + }, + { + "epoch": 1.7439550140581068, + "grad_norm": 52821.33203125, + "learning_rate": 5.549725954701355e-05, + "loss": 2.163, + "step": 9304 + }, + { + "epoch": 1.7441424554826617, + "grad_norm": 49310.62890625, + "learning_rate": 5.548944920596163e-05, + "loss": 2.2208, + "step": 9305 + }, + { + "epoch": 1.7443298969072165, + "grad_norm": 52150.48828125, + "learning_rate": 5.548163872932743e-05, + "loss": 2.241, + "step": 9306 + }, + { + "epoch": 1.7445173383317714, + "grad_norm": 49213.59765625, + "learning_rate": 5.547382811730386e-05, + "loss": 2.1811, + "step": 9307 + }, + { + "epoch": 1.7447047797563262, + "grad_norm": 49674.07421875, + "learning_rate": 5.546601737008383e-05, + "loss": 2.2558, + "step": 9308 + }, + { + "epoch": 1.7448922211808808, + "grad_norm": 47426.08984375, + "learning_rate": 5.5458206487860274e-05, + "loss": 2.2336, + "step": 9309 + }, + { + "epoch": 1.745079662605436, + "grad_norm": 51178.4765625, + "learning_rate": 5.5450395470826076e-05, + "loss": 2.218, + "step": 9310 + }, + { + "epoch": 1.7452671040299905, + "grad_norm": 49825.171875, + "learning_rate": 5.54425843191742e-05, + "loss": 2.2104, + "step": 9311 + }, + { + "epoch": 1.7454545454545456, + "grad_norm": 50652.0390625, + "learning_rate": 5.543477303309752e-05, + "loss": 2.2452, + "step": 9312 + }, + { + "epoch": 1.7456419868791002, + "grad_norm": 63741.828125, + "learning_rate": 5.542696161278901e-05, + "loss": 2.2535, + "step": 9313 + }, + { + "epoch": 1.745829428303655, + "grad_norm": 51531.6015625, + "learning_rate": 5.541915005844158e-05, + "loss": 2.2044, + "step": 9314 + }, + { + "epoch": 1.74601686972821, + "grad_norm": 52959.6640625, + "learning_rate": 5.541133837024816e-05, + "loss": 2.2864, + "step": 9315 + }, + { + "epoch": 1.7462043111527648, + "grad_norm": 52645.046875, + "learning_rate": 5.540352654840172e-05, + "loss": 2.2758, + "step": 9316 + }, + { + "epoch": 1.7463917525773196, + "grad_norm": 53582.828125, + "learning_rate": 5.539571459309516e-05, + "loss": 2.1919, + "step": 9317 + }, + { + "epoch": 1.7465791940018744, + "grad_norm": 47173.23046875, + "learning_rate": 5.538790250452145e-05, + "loss": 2.2345, + "step": 9318 + }, + { + "epoch": 1.7467666354264293, + "grad_norm": 49636.03515625, + "learning_rate": 5.538009028287353e-05, + "loss": 2.2083, + "step": 9319 + }, + { + "epoch": 1.746954076850984, + "grad_norm": 47348.859375, + "learning_rate": 5.5372277928344365e-05, + "loss": 2.2562, + "step": 9320 + }, + { + "epoch": 1.747141518275539, + "grad_norm": 54627.03125, + "learning_rate": 5.536446544112689e-05, + "loss": 2.3151, + "step": 9321 + }, + { + "epoch": 1.7473289597000936, + "grad_norm": 47908.94921875, + "learning_rate": 5.535665282141408e-05, + "loss": 2.2145, + "step": 9322 + }, + { + "epoch": 1.7475164011246487, + "grad_norm": 54409.48828125, + "learning_rate": 5.534884006939889e-05, + "loss": 2.2003, + "step": 9323 + }, + { + "epoch": 1.7477038425492033, + "grad_norm": 49779.203125, + "learning_rate": 5.53410271852743e-05, + "loss": 2.1906, + "step": 9324 + }, + { + "epoch": 1.7478912839737581, + "grad_norm": 52386.76953125, + "learning_rate": 5.5333214169233225e-05, + "loss": 2.2431, + "step": 9325 + }, + { + "epoch": 1.748078725398313, + "grad_norm": 50760.48828125, + "learning_rate": 5.5325401021468714e-05, + "loss": 2.2496, + "step": 9326 + }, + { + "epoch": 1.7482661668228678, + "grad_norm": 52365.88671875, + "learning_rate": 5.531758774217368e-05, + "loss": 2.2465, + "step": 9327 + }, + { + "epoch": 1.7484536082474227, + "grad_norm": 53100.27734375, + "learning_rate": 5.530977433154113e-05, + "loss": 2.2402, + "step": 9328 + }, + { + "epoch": 1.7486410496719775, + "grad_norm": 53627.9609375, + "learning_rate": 5.530196078976405e-05, + "loss": 2.3051, + "step": 9329 + }, + { + "epoch": 1.7488284910965324, + "grad_norm": 49859.42578125, + "learning_rate": 5.529414711703539e-05, + "loss": 2.2006, + "step": 9330 + }, + { + "epoch": 1.749015932521087, + "grad_norm": 46455.95703125, + "learning_rate": 5.5286333313548175e-05, + "loss": 2.2244, + "step": 9331 + }, + { + "epoch": 1.749203373945642, + "grad_norm": 50318.99609375, + "learning_rate": 5.527851937949537e-05, + "loss": 2.2018, + "step": 9332 + }, + { + "epoch": 1.7493908153701967, + "grad_norm": 50714.94921875, + "learning_rate": 5.527070531506999e-05, + "loss": 2.2923, + "step": 9333 + }, + { + "epoch": 1.7495782567947518, + "grad_norm": 46401.85546875, + "learning_rate": 5.5262891120465024e-05, + "loss": 2.2331, + "step": 9334 + }, + { + "epoch": 1.7497656982193064, + "grad_norm": 50320.375, + "learning_rate": 5.5255076795873465e-05, + "loss": 2.2153, + "step": 9335 + }, + { + "epoch": 1.7499531396438615, + "grad_norm": 52057.0078125, + "learning_rate": 5.524726234148832e-05, + "loss": 2.1885, + "step": 9336 + }, + { + "epoch": 1.750140581068416, + "grad_norm": 49629.05859375, + "learning_rate": 5.52394477575026e-05, + "loss": 2.2242, + "step": 9337 + }, + { + "epoch": 1.750328022492971, + "grad_norm": 52145.46484375, + "learning_rate": 5.523163304410931e-05, + "loss": 2.1601, + "step": 9338 + }, + { + "epoch": 1.7505154639175258, + "grad_norm": 48202.06640625, + "learning_rate": 5.5223818201501475e-05, + "loss": 2.1869, + "step": 9339 + }, + { + "epoch": 1.7507029053420806, + "grad_norm": 53832.8359375, + "learning_rate": 5.521600322987211e-05, + "loss": 2.2061, + "step": 9340 + }, + { + "epoch": 1.7508903467666355, + "grad_norm": 50936.65625, + "learning_rate": 5.5208188129414205e-05, + "loss": 2.1785, + "step": 9341 + }, + { + "epoch": 1.75107778819119, + "grad_norm": 49370.515625, + "learning_rate": 5.520037290032082e-05, + "loss": 2.172, + "step": 9342 + }, + { + "epoch": 1.7512652296157452, + "grad_norm": 53328.00390625, + "learning_rate": 5.519255754278496e-05, + "loss": 2.193, + "step": 9343 + }, + { + "epoch": 1.7514526710402998, + "grad_norm": 51504.2109375, + "learning_rate": 5.518474205699968e-05, + "loss": 2.2099, + "step": 9344 + }, + { + "epoch": 1.7516401124648548, + "grad_norm": 56704.19140625, + "learning_rate": 5.517692644315796e-05, + "loss": 2.2226, + "step": 9345 + }, + { + "epoch": 1.7518275538894095, + "grad_norm": 50929.94921875, + "learning_rate": 5.51691107014529e-05, + "loss": 2.2533, + "step": 9346 + }, + { + "epoch": 1.7520149953139645, + "grad_norm": 50813.27734375, + "learning_rate": 5.516129483207749e-05, + "loss": 2.2619, + "step": 9347 + }, + { + "epoch": 1.7522024367385192, + "grad_norm": 54600.1796875, + "learning_rate": 5.5153478835224806e-05, + "loss": 2.2872, + "step": 9348 + }, + { + "epoch": 1.752389878163074, + "grad_norm": 49972.1015625, + "learning_rate": 5.514566271108785e-05, + "loss": 2.2578, + "step": 9349 + }, + { + "epoch": 1.7525773195876289, + "grad_norm": 47318.55078125, + "learning_rate": 5.513784645985972e-05, + "loss": 2.2588, + "step": 9350 + }, + { + "epoch": 1.7527647610121837, + "grad_norm": 49712.78515625, + "learning_rate": 5.513003008173343e-05, + "loss": 2.2173, + "step": 9351 + }, + { + "epoch": 1.7529522024367385, + "grad_norm": 48380.69140625, + "learning_rate": 5.512221357690206e-05, + "loss": 2.2352, + "step": 9352 + }, + { + "epoch": 1.7531396438612934, + "grad_norm": 56988.79296875, + "learning_rate": 5.5114396945558645e-05, + "loss": 2.2027, + "step": 9353 + }, + { + "epoch": 1.7533270852858482, + "grad_norm": 54433.375, + "learning_rate": 5.510658018789627e-05, + "loss": 2.2519, + "step": 9354 + }, + { + "epoch": 1.7535145267104029, + "grad_norm": 51430.90234375, + "learning_rate": 5.509876330410797e-05, + "loss": 2.2565, + "step": 9355 + }, + { + "epoch": 1.753701968134958, + "grad_norm": 52569.03515625, + "learning_rate": 5.509094629438683e-05, + "loss": 2.2406, + "step": 9356 + }, + { + "epoch": 1.7538894095595126, + "grad_norm": 49703.44140625, + "learning_rate": 5.5083129158925926e-05, + "loss": 2.22, + "step": 9357 + }, + { + "epoch": 1.7540768509840676, + "grad_norm": 49017.97265625, + "learning_rate": 5.507531189791831e-05, + "loss": 2.2388, + "step": 9358 + }, + { + "epoch": 1.7542642924086223, + "grad_norm": 47919.47265625, + "learning_rate": 5.506749451155708e-05, + "loss": 2.2598, + "step": 9359 + }, + { + "epoch": 1.754451733833177, + "grad_norm": 48144.45703125, + "learning_rate": 5.505967700003529e-05, + "loss": 2.2042, + "step": 9360 + }, + { + "epoch": 1.754639175257732, + "grad_norm": 49570.56640625, + "learning_rate": 5.505185936354607e-05, + "loss": 2.3162, + "step": 9361 + }, + { + "epoch": 1.7548266166822868, + "grad_norm": 53380.65234375, + "learning_rate": 5.504404160228245e-05, + "loss": 2.2368, + "step": 9362 + }, + { + "epoch": 1.7550140581068416, + "grad_norm": 47776.07421875, + "learning_rate": 5.503622371643754e-05, + "loss": 2.2221, + "step": 9363 + }, + { + "epoch": 1.7552014995313965, + "grad_norm": 51868.625, + "learning_rate": 5.502840570620445e-05, + "loss": 2.1451, + "step": 9364 + }, + { + "epoch": 1.7553889409559513, + "grad_norm": 47755.58984375, + "learning_rate": 5.502058757177624e-05, + "loss": 2.2479, + "step": 9365 + }, + { + "epoch": 1.755576382380506, + "grad_norm": 51593.51953125, + "learning_rate": 5.5012769313346034e-05, + "loss": 2.1622, + "step": 9366 + }, + { + "epoch": 1.755763823805061, + "grad_norm": 52054.72265625, + "learning_rate": 5.500495093110693e-05, + "loss": 2.2105, + "step": 9367 + }, + { + "epoch": 1.7559512652296156, + "grad_norm": 53933.79296875, + "learning_rate": 5.499713242525202e-05, + "loss": 2.272, + "step": 9368 + }, + { + "epoch": 1.7561387066541707, + "grad_norm": 53582.375, + "learning_rate": 5.498931379597442e-05, + "loss": 2.1398, + "step": 9369 + }, + { + "epoch": 1.7563261480787253, + "grad_norm": 50370.31640625, + "learning_rate": 5.498149504346723e-05, + "loss": 2.1797, + "step": 9370 + }, + { + "epoch": 1.7565135895032802, + "grad_norm": 50080.3359375, + "learning_rate": 5.497367616792359e-05, + "loss": 2.2467, + "step": 9371 + }, + { + "epoch": 1.756701030927835, + "grad_norm": 52879.41015625, + "learning_rate": 5.4965857169536596e-05, + "loss": 2.2232, + "step": 9372 + }, + { + "epoch": 1.7568884723523899, + "grad_norm": 48657.48046875, + "learning_rate": 5.495803804849935e-05, + "loss": 2.2291, + "step": 9373 + }, + { + "epoch": 1.7570759137769447, + "grad_norm": 52514.5546875, + "learning_rate": 5.4950218805005006e-05, + "loss": 2.3444, + "step": 9374 + }, + { + "epoch": 1.7572633552014996, + "grad_norm": 53369.4140625, + "learning_rate": 5.494239943924666e-05, + "loss": 2.2191, + "step": 9375 + }, + { + "epoch": 1.7574507966260544, + "grad_norm": 52816.8984375, + "learning_rate": 5.4934579951417476e-05, + "loss": 2.2086, + "step": 9376 + }, + { + "epoch": 1.757638238050609, + "grad_norm": 54933.58203125, + "learning_rate": 5.492676034171055e-05, + "loss": 2.2095, + "step": 9377 + }, + { + "epoch": 1.757825679475164, + "grad_norm": 55409.48046875, + "learning_rate": 5.4918940610319036e-05, + "loss": 2.1502, + "step": 9378 + }, + { + "epoch": 1.7580131208997187, + "grad_norm": 50173.23828125, + "learning_rate": 5.4911120757436064e-05, + "loss": 2.2068, + "step": 9379 + }, + { + "epoch": 1.7582005623242738, + "grad_norm": 50227.7421875, + "learning_rate": 5.490330078325479e-05, + "loss": 2.2176, + "step": 9380 + }, + { + "epoch": 1.7583880037488284, + "grad_norm": 52643.8515625, + "learning_rate": 5.489548068796833e-05, + "loss": 2.2404, + "step": 9381 + }, + { + "epoch": 1.7585754451733833, + "grad_norm": 47365.7734375, + "learning_rate": 5.488766047176985e-05, + "loss": 2.2205, + "step": 9382 + }, + { + "epoch": 1.7587628865979381, + "grad_norm": 50784.8515625, + "learning_rate": 5.487984013485249e-05, + "loss": 2.2291, + "step": 9383 + }, + { + "epoch": 1.758950328022493, + "grad_norm": 55344.5078125, + "learning_rate": 5.48720196774094e-05, + "loss": 2.148, + "step": 9384 + }, + { + "epoch": 1.7591377694470478, + "grad_norm": 49476.78515625, + "learning_rate": 5.486419909963375e-05, + "loss": 2.2088, + "step": 9385 + }, + { + "epoch": 1.7593252108716027, + "grad_norm": 53060.7578125, + "learning_rate": 5.485637840171869e-05, + "loss": 2.202, + "step": 9386 + }, + { + "epoch": 1.7595126522961575, + "grad_norm": 50789.7734375, + "learning_rate": 5.484855758385738e-05, + "loss": 2.2298, + "step": 9387 + }, + { + "epoch": 1.7597000937207121, + "grad_norm": 53637.2421875, + "learning_rate": 5.484073664624297e-05, + "loss": 2.2321, + "step": 9388 + }, + { + "epoch": 1.7598875351452672, + "grad_norm": 49837.23828125, + "learning_rate": 5.483291558906866e-05, + "loss": 2.3248, + "step": 9389 + }, + { + "epoch": 1.7600749765698218, + "grad_norm": 50825.28515625, + "learning_rate": 5.482509441252759e-05, + "loss": 2.2691, + "step": 9390 + }, + { + "epoch": 1.7602624179943769, + "grad_norm": 52038.83203125, + "learning_rate": 5.481727311681294e-05, + "loss": 2.1802, + "step": 9391 + }, + { + "epoch": 1.7604498594189315, + "grad_norm": 49079.5703125, + "learning_rate": 5.48094517021179e-05, + "loss": 2.2267, + "step": 9392 + }, + { + "epoch": 1.7606373008434866, + "grad_norm": 51687.01953125, + "learning_rate": 5.480163016863563e-05, + "loss": 2.1633, + "step": 9393 + }, + { + "epoch": 1.7608247422680412, + "grad_norm": 60620.07421875, + "learning_rate": 5.479380851655932e-05, + "loss": 2.2946, + "step": 9394 + }, + { + "epoch": 1.761012183692596, + "grad_norm": 51750.90234375, + "learning_rate": 5.478598674608216e-05, + "loss": 2.2756, + "step": 9395 + }, + { + "epoch": 1.761199625117151, + "grad_norm": 51057.08203125, + "learning_rate": 5.477816485739734e-05, + "loss": 2.2284, + "step": 9396 + }, + { + "epoch": 1.7613870665417057, + "grad_norm": 51464.3828125, + "learning_rate": 5.477034285069803e-05, + "loss": 2.2395, + "step": 9397 + }, + { + "epoch": 1.7615745079662606, + "grad_norm": 52183.4921875, + "learning_rate": 5.4762520726177444e-05, + "loss": 2.1573, + "step": 9398 + }, + { + "epoch": 1.7617619493908152, + "grad_norm": 49814.45703125, + "learning_rate": 5.4754698484028766e-05, + "loss": 2.2155, + "step": 9399 + }, + { + "epoch": 1.7619493908153703, + "grad_norm": 47881.375, + "learning_rate": 5.47468761244452e-05, + "loss": 2.2711, + "step": 9400 + }, + { + "epoch": 1.762136832239925, + "grad_norm": 57861.40234375, + "learning_rate": 5.473905364761994e-05, + "loss": 2.2073, + "step": 9401 + }, + { + "epoch": 1.76232427366448, + "grad_norm": 51085.890625, + "learning_rate": 5.4731231053746226e-05, + "loss": 2.1883, + "step": 9402 + }, + { + "epoch": 1.7625117150890346, + "grad_norm": 49005.26171875, + "learning_rate": 5.4723408343017214e-05, + "loss": 2.2173, + "step": 9403 + }, + { + "epoch": 1.7626991565135897, + "grad_norm": 50287.46875, + "learning_rate": 5.471558551562616e-05, + "loss": 2.1999, + "step": 9404 + }, + { + "epoch": 1.7628865979381443, + "grad_norm": 52063.59765625, + "learning_rate": 5.470776257176624e-05, + "loss": 2.2578, + "step": 9405 + }, + { + "epoch": 1.7630740393626991, + "grad_norm": 55035.59765625, + "learning_rate": 5.46999395116307e-05, + "loss": 2.3726, + "step": 9406 + }, + { + "epoch": 1.763261480787254, + "grad_norm": 49410.0703125, + "learning_rate": 5.469211633541272e-05, + "loss": 2.2201, + "step": 9407 + }, + { + "epoch": 1.7634489222118088, + "grad_norm": 52080.78515625, + "learning_rate": 5.468429304330558e-05, + "loss": 2.1214, + "step": 9408 + }, + { + "epoch": 1.7636363636363637, + "grad_norm": 51104.2890625, + "learning_rate": 5.467646963550247e-05, + "loss": 2.2173, + "step": 9409 + }, + { + "epoch": 1.7638238050609185, + "grad_norm": 52487.078125, + "learning_rate": 5.466864611219662e-05, + "loss": 2.1722, + "step": 9410 + }, + { + "epoch": 1.7640112464854734, + "grad_norm": 52036.109375, + "learning_rate": 5.466082247358126e-05, + "loss": 2.1814, + "step": 9411 + }, + { + "epoch": 1.764198687910028, + "grad_norm": 51465.578125, + "learning_rate": 5.465299871984964e-05, + "loss": 2.154, + "step": 9412 + }, + { + "epoch": 1.764386129334583, + "grad_norm": 51936.21875, + "learning_rate": 5.464517485119497e-05, + "loss": 2.2682, + "step": 9413 + }, + { + "epoch": 1.7645735707591377, + "grad_norm": 49173.03125, + "learning_rate": 5.463735086781051e-05, + "loss": 2.2345, + "step": 9414 + }, + { + "epoch": 1.7647610121836927, + "grad_norm": 56229.8125, + "learning_rate": 5.4629526769889497e-05, + "loss": 2.2574, + "step": 9415 + }, + { + "epoch": 1.7649484536082474, + "grad_norm": 54532.61328125, + "learning_rate": 5.4621702557625165e-05, + "loss": 2.161, + "step": 9416 + }, + { + "epoch": 1.7651358950328022, + "grad_norm": 52722.0703125, + "learning_rate": 5.461387823121079e-05, + "loss": 2.2726, + "step": 9417 + }, + { + "epoch": 1.765323336457357, + "grad_norm": 51146.8984375, + "learning_rate": 5.4606053790839586e-05, + "loss": 2.245, + "step": 9418 + }, + { + "epoch": 1.765510777881912, + "grad_norm": 51867.15234375, + "learning_rate": 5.459822923670484e-05, + "loss": 2.2561, + "step": 9419 + }, + { + "epoch": 1.7656982193064668, + "grad_norm": 51527.359375, + "learning_rate": 5.4590404568999786e-05, + "loss": 2.2061, + "step": 9420 + }, + { + "epoch": 1.7658856607310216, + "grad_norm": 50846.40234375, + "learning_rate": 5.458257978791769e-05, + "loss": 2.1945, + "step": 9421 + }, + { + "epoch": 1.7660731021555764, + "grad_norm": 48939.27734375, + "learning_rate": 5.457475489365182e-05, + "loss": 2.2896, + "step": 9422 + }, + { + "epoch": 1.766260543580131, + "grad_norm": 51781.4921875, + "learning_rate": 5.456692988639542e-05, + "loss": 2.201, + "step": 9423 + }, + { + "epoch": 1.7664479850046861, + "grad_norm": 49655.1484375, + "learning_rate": 5.4559104766341805e-05, + "loss": 2.1938, + "step": 9424 + }, + { + "epoch": 1.7666354264292408, + "grad_norm": 49609.9921875, + "learning_rate": 5.45512795336842e-05, + "loss": 2.2411, + "step": 9425 + }, + { + "epoch": 1.7668228678537958, + "grad_norm": 52198.046875, + "learning_rate": 5.454345418861587e-05, + "loss": 2.2266, + "step": 9426 + }, + { + "epoch": 1.7670103092783505, + "grad_norm": 48024.87890625, + "learning_rate": 5.4535628731330134e-05, + "loss": 2.2272, + "step": 9427 + }, + { + "epoch": 1.7671977507029053, + "grad_norm": 48567.79296875, + "learning_rate": 5.4527803162020243e-05, + "loss": 2.2434, + "step": 9428 + }, + { + "epoch": 1.7673851921274601, + "grad_norm": 52809.66796875, + "learning_rate": 5.4519977480879466e-05, + "loss": 2.1906, + "step": 9429 + }, + { + "epoch": 1.767572633552015, + "grad_norm": 51503.109375, + "learning_rate": 5.451215168810114e-05, + "loss": 2.2293, + "step": 9430 + }, + { + "epoch": 1.7677600749765698, + "grad_norm": 48533.53515625, + "learning_rate": 5.4504325783878483e-05, + "loss": 2.239, + "step": 9431 + }, + { + "epoch": 1.7679475164011247, + "grad_norm": 48341.9140625, + "learning_rate": 5.449649976840485e-05, + "loss": 2.2377, + "step": 9432 + }, + { + "epoch": 1.7681349578256795, + "grad_norm": 49998.99609375, + "learning_rate": 5.4488673641873475e-05, + "loss": 2.217, + "step": 9433 + }, + { + "epoch": 1.7683223992502342, + "grad_norm": 50517.609375, + "learning_rate": 5.448084740447771e-05, + "loss": 2.238, + "step": 9434 + }, + { + "epoch": 1.7685098406747892, + "grad_norm": 48038.7265625, + "learning_rate": 5.44730210564108e-05, + "loss": 2.1914, + "step": 9435 + }, + { + "epoch": 1.7686972820993438, + "grad_norm": 51519.51171875, + "learning_rate": 5.4465194597866075e-05, + "loss": 2.2453, + "step": 9436 + }, + { + "epoch": 1.768884723523899, + "grad_norm": 51449.40234375, + "learning_rate": 5.445736802903685e-05, + "loss": 2.1931, + "step": 9437 + }, + { + "epoch": 1.7690721649484535, + "grad_norm": 51338.4765625, + "learning_rate": 5.4449541350116386e-05, + "loss": 2.2905, + "step": 9438 + }, + { + "epoch": 1.7692596063730084, + "grad_norm": 50156.5, + "learning_rate": 5.444171456129804e-05, + "loss": 2.1426, + "step": 9439 + }, + { + "epoch": 1.7694470477975632, + "grad_norm": 51182.5078125, + "learning_rate": 5.44338876627751e-05, + "loss": 2.2819, + "step": 9440 + }, + { + "epoch": 1.769634489222118, + "grad_norm": 53176.171875, + "learning_rate": 5.442606065474087e-05, + "loss": 2.2125, + "step": 9441 + }, + { + "epoch": 1.769821930646673, + "grad_norm": 51540.48828125, + "learning_rate": 5.441823353738869e-05, + "loss": 2.2059, + "step": 9442 + }, + { + "epoch": 1.7700093720712278, + "grad_norm": 53461.7890625, + "learning_rate": 5.4410406310911866e-05, + "loss": 2.2117, + "step": 9443 + }, + { + "epoch": 1.7701968134957826, + "grad_norm": 53548.71484375, + "learning_rate": 5.440257897550373e-05, + "loss": 2.2038, + "step": 9444 + }, + { + "epoch": 1.7703842549203372, + "grad_norm": 54197.71484375, + "learning_rate": 5.43947515313576e-05, + "loss": 2.1905, + "step": 9445 + }, + { + "epoch": 1.7705716963448923, + "grad_norm": 50715.78515625, + "learning_rate": 5.4386923978666796e-05, + "loss": 2.2403, + "step": 9446 + }, + { + "epoch": 1.770759137769447, + "grad_norm": 50106.73828125, + "learning_rate": 5.4379096317624675e-05, + "loss": 2.1945, + "step": 9447 + }, + { + "epoch": 1.770946579194002, + "grad_norm": 49943.1875, + "learning_rate": 5.437126854842454e-05, + "loss": 2.2753, + "step": 9448 + }, + { + "epoch": 1.7711340206185566, + "grad_norm": 49679.20703125, + "learning_rate": 5.4363440671259744e-05, + "loss": 2.2603, + "step": 9449 + }, + { + "epoch": 1.7713214620431117, + "grad_norm": 52953.71484375, + "learning_rate": 5.435561268632362e-05, + "loss": 2.1315, + "step": 9450 + }, + { + "epoch": 1.7715089034676663, + "grad_norm": 51835.8671875, + "learning_rate": 5.4347784593809505e-05, + "loss": 2.2125, + "step": 9451 + }, + { + "epoch": 1.7716963448922212, + "grad_norm": 53509.25, + "learning_rate": 5.4339956393910766e-05, + "loss": 2.193, + "step": 9452 + }, + { + "epoch": 1.771883786316776, + "grad_norm": 48737.6953125, + "learning_rate": 5.43321280868207e-05, + "loss": 2.2668, + "step": 9453 + }, + { + "epoch": 1.7720712277413309, + "grad_norm": 50866.17578125, + "learning_rate": 5.4324299672732704e-05, + "loss": 2.2708, + "step": 9454 + }, + { + "epoch": 1.7722586691658857, + "grad_norm": 50597.703125, + "learning_rate": 5.431647115184011e-05, + "loss": 2.2355, + "step": 9455 + }, + { + "epoch": 1.7724461105904403, + "grad_norm": 48257.05859375, + "learning_rate": 5.430864252433628e-05, + "loss": 2.1682, + "step": 9456 + }, + { + "epoch": 1.7726335520149954, + "grad_norm": 49081.44921875, + "learning_rate": 5.4300813790414564e-05, + "loss": 2.2461, + "step": 9457 + }, + { + "epoch": 1.77282099343955, + "grad_norm": 57454.07421875, + "learning_rate": 5.429298495026832e-05, + "loss": 2.2176, + "step": 9458 + }, + { + "epoch": 1.773008434864105, + "grad_norm": 52368.80078125, + "learning_rate": 5.428515600409091e-05, + "loss": 2.1371, + "step": 9459 + }, + { + "epoch": 1.7731958762886597, + "grad_norm": 51634.4453125, + "learning_rate": 5.427732695207571e-05, + "loss": 2.0826, + "step": 9460 + }, + { + "epoch": 1.7733833177132148, + "grad_norm": 51172.7734375, + "learning_rate": 5.426949779441607e-05, + "loss": 2.1945, + "step": 9461 + }, + { + "epoch": 1.7735707591377694, + "grad_norm": 52864.19921875, + "learning_rate": 5.426166853130539e-05, + "loss": 2.2387, + "step": 9462 + }, + { + "epoch": 1.7737582005623242, + "grad_norm": 51645.44921875, + "learning_rate": 5.425383916293701e-05, + "loss": 2.1876, + "step": 9463 + }, + { + "epoch": 1.773945641986879, + "grad_norm": 55784.88671875, + "learning_rate": 5.424600968950432e-05, + "loss": 2.2499, + "step": 9464 + }, + { + "epoch": 1.774133083411434, + "grad_norm": 56062.9453125, + "learning_rate": 5.42381801112007e-05, + "loss": 2.2278, + "step": 9465 + }, + { + "epoch": 1.7743205248359888, + "grad_norm": 51478.06640625, + "learning_rate": 5.423035042821951e-05, + "loss": 2.1914, + "step": 9466 + }, + { + "epoch": 1.7745079662605436, + "grad_norm": 51068.6015625, + "learning_rate": 5.422252064075417e-05, + "loss": 2.196, + "step": 9467 + }, + { + "epoch": 1.7746954076850985, + "grad_norm": 51006.9921875, + "learning_rate": 5.421469074899802e-05, + "loss": 2.2073, + "step": 9468 + }, + { + "epoch": 1.774882849109653, + "grad_norm": 52536.48046875, + "learning_rate": 5.42068607531445e-05, + "loss": 2.2815, + "step": 9469 + }, + { + "epoch": 1.7750702905342082, + "grad_norm": 51063.33984375, + "learning_rate": 5.419903065338695e-05, + "loss": 2.2338, + "step": 9470 + }, + { + "epoch": 1.7752577319587628, + "grad_norm": 57693.71484375, + "learning_rate": 5.41912004499188e-05, + "loss": 2.2776, + "step": 9471 + }, + { + "epoch": 1.7754451733833179, + "grad_norm": 52564.62109375, + "learning_rate": 5.4183370142933433e-05, + "loss": 2.1879, + "step": 9472 + }, + { + "epoch": 1.7756326148078725, + "grad_norm": 52159.5, + "learning_rate": 5.417553973262424e-05, + "loss": 2.273, + "step": 9473 + }, + { + "epoch": 1.7758200562324273, + "grad_norm": 50961.91015625, + "learning_rate": 5.416770921918464e-05, + "loss": 2.2053, + "step": 9474 + }, + { + "epoch": 1.7760074976569822, + "grad_norm": 51235.5234375, + "learning_rate": 5.415987860280802e-05, + "loss": 2.2862, + "step": 9475 + }, + { + "epoch": 1.776194939081537, + "grad_norm": 52802.43359375, + "learning_rate": 5.4152047883687784e-05, + "loss": 2.1848, + "step": 9476 + }, + { + "epoch": 1.7763823805060919, + "grad_norm": 53002.1015625, + "learning_rate": 5.414421706201737e-05, + "loss": 2.2094, + "step": 9477 + }, + { + "epoch": 1.7765698219306467, + "grad_norm": 46370.078125, + "learning_rate": 5.4136386137990145e-05, + "loss": 2.27, + "step": 9478 + }, + { + "epoch": 1.7767572633552016, + "grad_norm": 49195.64453125, + "learning_rate": 5.412855511179955e-05, + "loss": 2.2503, + "step": 9479 + }, + { + "epoch": 1.7769447047797562, + "grad_norm": 47594.24609375, + "learning_rate": 5.412072398363901e-05, + "loss": 2.2446, + "step": 9480 + }, + { + "epoch": 1.7771321462043113, + "grad_norm": 48961.41015625, + "learning_rate": 5.41128927537019e-05, + "loss": 2.2643, + "step": 9481 + }, + { + "epoch": 1.7773195876288659, + "grad_norm": 48799.0546875, + "learning_rate": 5.410506142218171e-05, + "loss": 2.1701, + "step": 9482 + }, + { + "epoch": 1.777507029053421, + "grad_norm": 50204.4375, + "learning_rate": 5.40972299892718e-05, + "loss": 2.2516, + "step": 9483 + }, + { + "epoch": 1.7776944704779756, + "grad_norm": 52248.04296875, + "learning_rate": 5.408939845516564e-05, + "loss": 2.2282, + "step": 9484 + }, + { + "epoch": 1.7778819119025304, + "grad_norm": 48970.92578125, + "learning_rate": 5.408156682005663e-05, + "loss": 2.1454, + "step": 9485 + }, + { + "epoch": 1.7780693533270853, + "grad_norm": 46992.1875, + "learning_rate": 5.40737350841382e-05, + "loss": 2.2359, + "step": 9486 + }, + { + "epoch": 1.7782567947516401, + "grad_norm": 52655.05859375, + "learning_rate": 5.406590324760381e-05, + "loss": 2.1691, + "step": 9487 + }, + { + "epoch": 1.778444236176195, + "grad_norm": 56977.50390625, + "learning_rate": 5.405807131064687e-05, + "loss": 2.2351, + "step": 9488 + }, + { + "epoch": 1.7786316776007498, + "grad_norm": 49720.1796875, + "learning_rate": 5.405023927346083e-05, + "loss": 2.2066, + "step": 9489 + }, + { + "epoch": 1.7788191190253047, + "grad_norm": 49587.6328125, + "learning_rate": 5.404240713623916e-05, + "loss": 2.166, + "step": 9490 + }, + { + "epoch": 1.7790065604498593, + "grad_norm": 55933.56640625, + "learning_rate": 5.403457489917524e-05, + "loss": 2.3072, + "step": 9491 + }, + { + "epoch": 1.7791940018744143, + "grad_norm": 48167.421875, + "learning_rate": 5.402674256246256e-05, + "loss": 2.2393, + "step": 9492 + }, + { + "epoch": 1.779381443298969, + "grad_norm": 48973.10546875, + "learning_rate": 5.4018910126294563e-05, + "loss": 2.2381, + "step": 9493 + }, + { + "epoch": 1.779568884723524, + "grad_norm": 48832.7578125, + "learning_rate": 5.40110775908647e-05, + "loss": 2.2109, + "step": 9494 + }, + { + "epoch": 1.7797563261480787, + "grad_norm": 47616.41015625, + "learning_rate": 5.400324495636643e-05, + "loss": 2.2599, + "step": 9495 + }, + { + "epoch": 1.7799437675726335, + "grad_norm": 48350.140625, + "learning_rate": 5.399541222299317e-05, + "loss": 2.1762, + "step": 9496 + }, + { + "epoch": 1.7801312089971884, + "grad_norm": 49776.046875, + "learning_rate": 5.3987579390938435e-05, + "loss": 2.2396, + "step": 9497 + }, + { + "epoch": 1.7803186504217432, + "grad_norm": 52878.92578125, + "learning_rate": 5.397974646039565e-05, + "loss": 2.2466, + "step": 9498 + }, + { + "epoch": 1.780506091846298, + "grad_norm": 49991.6171875, + "learning_rate": 5.3971913431558285e-05, + "loss": 2.2354, + "step": 9499 + }, + { + "epoch": 1.780693533270853, + "grad_norm": 53679.79296875, + "learning_rate": 5.396408030461981e-05, + "loss": 2.2668, + "step": 9500 + }, + { + "epoch": 1.780693533270853, + "eval_loss": 2.2844395637512207, + "eval_runtime": 127.8194, + "eval_samples_per_second": 39.501, + "eval_steps_per_second": 1.979, + "step": 9500 + }, + { + "epoch": 1.7808809746954077, + "grad_norm": 50355.83203125, + "learning_rate": 5.395624707977369e-05, + "loss": 2.186, + "step": 9501 + }, + { + "epoch": 1.7810684161199624, + "grad_norm": 48862.8125, + "learning_rate": 5.394841375721339e-05, + "loss": 2.2419, + "step": 9502 + }, + { + "epoch": 1.7812558575445174, + "grad_norm": 48446.578125, + "learning_rate": 5.394058033713241e-05, + "loss": 2.2359, + "step": 9503 + }, + { + "epoch": 1.781443298969072, + "grad_norm": 49043.6171875, + "learning_rate": 5.393274681972418e-05, + "loss": 2.1927, + "step": 9504 + }, + { + "epoch": 1.7816307403936271, + "grad_norm": 46234.96875, + "learning_rate": 5.3924913205182224e-05, + "loss": 2.17, + "step": 9505 + }, + { + "epoch": 1.7818181818181817, + "grad_norm": 51334.9140625, + "learning_rate": 5.3917079493699986e-05, + "loss": 2.2125, + "step": 9506 + }, + { + "epoch": 1.7820056232427366, + "grad_norm": 51832.84765625, + "learning_rate": 5.3909245685470966e-05, + "loss": 2.0891, + "step": 9507 + }, + { + "epoch": 1.7821930646672914, + "grad_norm": 56590.890625, + "learning_rate": 5.3901411780688647e-05, + "loss": 2.2078, + "step": 9508 + }, + { + "epoch": 1.7823805060918463, + "grad_norm": 53327.8828125, + "learning_rate": 5.389357777954651e-05, + "loss": 2.2594, + "step": 9509 + }, + { + "epoch": 1.7825679475164011, + "grad_norm": 50215.85546875, + "learning_rate": 5.388574368223807e-05, + "loss": 2.2388, + "step": 9510 + }, + { + "epoch": 1.782755388940956, + "grad_norm": 50395.36328125, + "learning_rate": 5.387790948895677e-05, + "loss": 2.1776, + "step": 9511 + }, + { + "epoch": 1.7829428303655108, + "grad_norm": 52815.17578125, + "learning_rate": 5.387007519989615e-05, + "loss": 2.1664, + "step": 9512 + }, + { + "epoch": 1.7831302717900654, + "grad_norm": 46426.0546875, + "learning_rate": 5.38622408152497e-05, + "loss": 2.2586, + "step": 9513 + }, + { + "epoch": 1.7833177132146205, + "grad_norm": 55415.22265625, + "learning_rate": 5.38544063352109e-05, + "loss": 2.2475, + "step": 9514 + }, + { + "epoch": 1.7835051546391751, + "grad_norm": 52203.2421875, + "learning_rate": 5.3846571759973255e-05, + "loss": 2.1843, + "step": 9515 + }, + { + "epoch": 1.7836925960637302, + "grad_norm": 57475.9609375, + "learning_rate": 5.383873708973027e-05, + "loss": 2.2119, + "step": 9516 + }, + { + "epoch": 1.7838800374882848, + "grad_norm": 59699.171875, + "learning_rate": 5.383090232467546e-05, + "loss": 2.2403, + "step": 9517 + }, + { + "epoch": 1.78406747891284, + "grad_norm": 52398.640625, + "learning_rate": 5.3823067465002344e-05, + "loss": 2.241, + "step": 9518 + }, + { + "epoch": 1.7842549203373945, + "grad_norm": 50925.0, + "learning_rate": 5.381523251090439e-05, + "loss": 2.2395, + "step": 9519 + }, + { + "epoch": 1.7844423617619494, + "grad_norm": 49270.59765625, + "learning_rate": 5.380739746257517e-05, + "loss": 2.215, + "step": 9520 + }, + { + "epoch": 1.7846298031865042, + "grad_norm": 53789.6328125, + "learning_rate": 5.379956232020815e-05, + "loss": 2.1966, + "step": 9521 + }, + { + "epoch": 1.784817244611059, + "grad_norm": 55135.41796875, + "learning_rate": 5.379172708399688e-05, + "loss": 2.1966, + "step": 9522 + }, + { + "epoch": 1.785004686035614, + "grad_norm": 49202.08984375, + "learning_rate": 5.378389175413486e-05, + "loss": 2.2256, + "step": 9523 + }, + { + "epoch": 1.7851921274601685, + "grad_norm": 52843.171875, + "learning_rate": 5.377605633081562e-05, + "loss": 2.2324, + "step": 9524 + }, + { + "epoch": 1.7853795688847236, + "grad_norm": 48994.3046875, + "learning_rate": 5.376822081423269e-05, + "loss": 2.2945, + "step": 9525 + }, + { + "epoch": 1.7855670103092782, + "grad_norm": 48157.03515625, + "learning_rate": 5.3760385204579565e-05, + "loss": 2.2302, + "step": 9526 + }, + { + "epoch": 1.7857544517338333, + "grad_norm": 50783.53515625, + "learning_rate": 5.375254950204984e-05, + "loss": 2.2284, + "step": 9527 + }, + { + "epoch": 1.785941893158388, + "grad_norm": 52401.69140625, + "learning_rate": 5.374471370683698e-05, + "loss": 2.2681, + "step": 9528 + }, + { + "epoch": 1.786129334582943, + "grad_norm": 49630.84765625, + "learning_rate": 5.3736877819134556e-05, + "loss": 2.2231, + "step": 9529 + }, + { + "epoch": 1.7863167760074976, + "grad_norm": 53386.47265625, + "learning_rate": 5.37290418391361e-05, + "loss": 2.2151, + "step": 9530 + }, + { + "epoch": 1.7865042174320525, + "grad_norm": 48531.0078125, + "learning_rate": 5.3721205767035134e-05, + "loss": 2.2594, + "step": 9531 + }, + { + "epoch": 1.7866916588566073, + "grad_norm": 51204.85546875, + "learning_rate": 5.3713369603025207e-05, + "loss": 2.2008, + "step": 9532 + }, + { + "epoch": 1.7868791002811621, + "grad_norm": 56976.36328125, + "learning_rate": 5.3705533347299886e-05, + "loss": 2.2964, + "step": 9533 + }, + { + "epoch": 1.787066541705717, + "grad_norm": 57397.73046875, + "learning_rate": 5.369769700005268e-05, + "loss": 2.2616, + "step": 9534 + }, + { + "epoch": 1.7872539831302718, + "grad_norm": 52457.15234375, + "learning_rate": 5.3689860561477154e-05, + "loss": 2.2444, + "step": 9535 + }, + { + "epoch": 1.7874414245548267, + "grad_norm": 55629.1796875, + "learning_rate": 5.3682024031766864e-05, + "loss": 2.1991, + "step": 9536 + }, + { + "epoch": 1.7876288659793813, + "grad_norm": 51128.89453125, + "learning_rate": 5.367418741111534e-05, + "loss": 2.2711, + "step": 9537 + }, + { + "epoch": 1.7878163074039364, + "grad_norm": 51972.3828125, + "learning_rate": 5.366635069971617e-05, + "loss": 2.2228, + "step": 9538 + }, + { + "epoch": 1.788003748828491, + "grad_norm": 51850.859375, + "learning_rate": 5.365851389776287e-05, + "loss": 2.2093, + "step": 9539 + }, + { + "epoch": 1.788191190253046, + "grad_norm": 54196.16796875, + "learning_rate": 5.365067700544903e-05, + "loss": 2.1053, + "step": 9540 + }, + { + "epoch": 1.7883786316776007, + "grad_norm": 49114.8515625, + "learning_rate": 5.3642840022968196e-05, + "loss": 2.203, + "step": 9541 + }, + { + "epoch": 1.7885660731021555, + "grad_norm": 52040.83984375, + "learning_rate": 5.3635002950513956e-05, + "loss": 2.3327, + "step": 9542 + }, + { + "epoch": 1.7887535145267104, + "grad_norm": 50328.2890625, + "learning_rate": 5.362716578827984e-05, + "loss": 2.281, + "step": 9543 + }, + { + "epoch": 1.7889409559512652, + "grad_norm": 52449.04296875, + "learning_rate": 5.3619328536459433e-05, + "loss": 2.173, + "step": 9544 + }, + { + "epoch": 1.78912839737582, + "grad_norm": 54022.7265625, + "learning_rate": 5.36114911952463e-05, + "loss": 2.2583, + "step": 9545 + }, + { + "epoch": 1.789315838800375, + "grad_norm": 53098.04296875, + "learning_rate": 5.360365376483402e-05, + "loss": 2.2499, + "step": 9546 + }, + { + "epoch": 1.7895032802249298, + "grad_norm": 49913.0, + "learning_rate": 5.3595816245416165e-05, + "loss": 2.2344, + "step": 9547 + }, + { + "epoch": 1.7896907216494844, + "grad_norm": 48069.2734375, + "learning_rate": 5.358797863718632e-05, + "loss": 2.2542, + "step": 9548 + }, + { + "epoch": 1.7898781630740395, + "grad_norm": 50748.45703125, + "learning_rate": 5.358014094033804e-05, + "loss": 2.201, + "step": 9549 + }, + { + "epoch": 1.790065604498594, + "grad_norm": 50286.5703125, + "learning_rate": 5.357230315506493e-05, + "loss": 2.1799, + "step": 9550 + }, + { + "epoch": 1.7902530459231492, + "grad_norm": 51816.1328125, + "learning_rate": 5.356446528156055e-05, + "loss": 2.1274, + "step": 9551 + }, + { + "epoch": 1.7904404873477038, + "grad_norm": 52206.6171875, + "learning_rate": 5.3556627320018516e-05, + "loss": 2.287, + "step": 9552 + }, + { + "epoch": 1.7906279287722586, + "grad_norm": 49306.2578125, + "learning_rate": 5.3548789270632395e-05, + "loss": 2.1904, + "step": 9553 + }, + { + "epoch": 1.7908153701968135, + "grad_norm": 51640.32421875, + "learning_rate": 5.3540951133595765e-05, + "loss": 2.1647, + "step": 9554 + }, + { + "epoch": 1.7910028116213683, + "grad_norm": 55174.73828125, + "learning_rate": 5.353311290910226e-05, + "loss": 2.2125, + "step": 9555 + }, + { + "epoch": 1.7911902530459232, + "grad_norm": 47454.66796875, + "learning_rate": 5.352527459734542e-05, + "loss": 2.221, + "step": 9556 + }, + { + "epoch": 1.791377694470478, + "grad_norm": 47298.30078125, + "learning_rate": 5.351743619851888e-05, + "loss": 2.2333, + "step": 9557 + }, + { + "epoch": 1.7915651358950329, + "grad_norm": 51521.859375, + "learning_rate": 5.350959771281623e-05, + "loss": 2.1761, + "step": 9558 + }, + { + "epoch": 1.7917525773195875, + "grad_norm": 50092.9375, + "learning_rate": 5.3501759140431054e-05, + "loss": 2.2358, + "step": 9559 + }, + { + "epoch": 1.7919400187441425, + "grad_norm": 49256.53125, + "learning_rate": 5.349392048155697e-05, + "loss": 2.2522, + "step": 9560 + }, + { + "epoch": 1.7921274601686972, + "grad_norm": 54338.796875, + "learning_rate": 5.3486081736387585e-05, + "loss": 2.2593, + "step": 9561 + }, + { + "epoch": 1.7923149015932522, + "grad_norm": 49820.390625, + "learning_rate": 5.347824290511652e-05, + "loss": 2.2426, + "step": 9562 + }, + { + "epoch": 1.7925023430178069, + "grad_norm": 49717.84765625, + "learning_rate": 5.3470403987937336e-05, + "loss": 2.1429, + "step": 9563 + }, + { + "epoch": 1.7926897844423617, + "grad_norm": 50124.39453125, + "learning_rate": 5.3462564985043685e-05, + "loss": 2.1928, + "step": 9564 + }, + { + "epoch": 1.7928772258669166, + "grad_norm": 49442.58984375, + "learning_rate": 5.345472589662916e-05, + "loss": 2.1929, + "step": 9565 + }, + { + "epoch": 1.7930646672914714, + "grad_norm": 51211.93359375, + "learning_rate": 5.344688672288739e-05, + "loss": 2.3133, + "step": 9566 + }, + { + "epoch": 1.7932521087160262, + "grad_norm": 46127.77734375, + "learning_rate": 5.3439047464011984e-05, + "loss": 2.1498, + "step": 9567 + }, + { + "epoch": 1.793439550140581, + "grad_norm": 51756.5234375, + "learning_rate": 5.3431208120196584e-05, + "loss": 2.1821, + "step": 9568 + }, + { + "epoch": 1.793626991565136, + "grad_norm": 52495.0625, + "learning_rate": 5.342336869163476e-05, + "loss": 2.2596, + "step": 9569 + }, + { + "epoch": 1.7938144329896906, + "grad_norm": 50261.5859375, + "learning_rate": 5.341552917852019e-05, + "loss": 2.1808, + "step": 9570 + }, + { + "epoch": 1.7940018744142456, + "grad_norm": 49120.296875, + "learning_rate": 5.340768958104647e-05, + "loss": 2.2311, + "step": 9571 + }, + { + "epoch": 1.7941893158388003, + "grad_norm": 49664.55859375, + "learning_rate": 5.339984989940724e-05, + "loss": 2.2782, + "step": 9572 + }, + { + "epoch": 1.7943767572633553, + "grad_norm": 50740.9765625, + "learning_rate": 5.3392010133796125e-05, + "loss": 2.3365, + "step": 9573 + }, + { + "epoch": 1.79456419868791, + "grad_norm": 51909.44921875, + "learning_rate": 5.338417028440674e-05, + "loss": 2.1614, + "step": 9574 + }, + { + "epoch": 1.794751640112465, + "grad_norm": 51245.63671875, + "learning_rate": 5.337633035143276e-05, + "loss": 2.2819, + "step": 9575 + }, + { + "epoch": 1.7949390815370196, + "grad_norm": 47402.20703125, + "learning_rate": 5.33684903350678e-05, + "loss": 2.2639, + "step": 9576 + }, + { + "epoch": 1.7951265229615745, + "grad_norm": 56782.78515625, + "learning_rate": 5.3360650235505485e-05, + "loss": 2.3181, + "step": 9577 + }, + { + "epoch": 1.7953139643861293, + "grad_norm": 51898.90234375, + "learning_rate": 5.335281005293946e-05, + "loss": 2.1704, + "step": 9578 + }, + { + "epoch": 1.7955014058106842, + "grad_norm": 51063.37890625, + "learning_rate": 5.334496978756338e-05, + "loss": 2.1249, + "step": 9579 + }, + { + "epoch": 1.795688847235239, + "grad_norm": 50671.92578125, + "learning_rate": 5.333712943957088e-05, + "loss": 2.2253, + "step": 9580 + }, + { + "epoch": 1.7958762886597937, + "grad_norm": 47013.1484375, + "learning_rate": 5.3329289009155614e-05, + "loss": 2.3038, + "step": 9581 + }, + { + "epoch": 1.7960637300843487, + "grad_norm": 50605.41015625, + "learning_rate": 5.332144849651122e-05, + "loss": 2.2374, + "step": 9582 + }, + { + "epoch": 1.7962511715089033, + "grad_norm": 50447.96875, + "learning_rate": 5.3313607901831374e-05, + "loss": 2.2529, + "step": 9583 + }, + { + "epoch": 1.7964386129334584, + "grad_norm": 48366.50390625, + "learning_rate": 5.3305767225309677e-05, + "loss": 2.2215, + "step": 9584 + }, + { + "epoch": 1.796626054358013, + "grad_norm": 50165.90625, + "learning_rate": 5.3297926467139844e-05, + "loss": 2.1566, + "step": 9585 + }, + { + "epoch": 1.796813495782568, + "grad_norm": 49544.1484375, + "learning_rate": 5.329008562751549e-05, + "loss": 2.2443, + "step": 9586 + }, + { + "epoch": 1.7970009372071227, + "grad_norm": 50353.36328125, + "learning_rate": 5.328224470663028e-05, + "loss": 2.1815, + "step": 9587 + }, + { + "epoch": 1.7971883786316776, + "grad_norm": 49130.50390625, + "learning_rate": 5.327440370467789e-05, + "loss": 2.2485, + "step": 9588 + }, + { + "epoch": 1.7973758200562324, + "grad_norm": 57142.50390625, + "learning_rate": 5.326656262185198e-05, + "loss": 2.1944, + "step": 9589 + }, + { + "epoch": 1.7975632614807873, + "grad_norm": 50416.6484375, + "learning_rate": 5.325872145834621e-05, + "loss": 2.2273, + "step": 9590 + }, + { + "epoch": 1.7977507029053421, + "grad_norm": 47359.73046875, + "learning_rate": 5.325088021435424e-05, + "loss": 2.2032, + "step": 9591 + }, + { + "epoch": 1.797938144329897, + "grad_norm": 53319.96484375, + "learning_rate": 5.324303889006973e-05, + "loss": 2.2491, + "step": 9592 + }, + { + "epoch": 1.7981255857544518, + "grad_norm": 51387.99609375, + "learning_rate": 5.3235197485686374e-05, + "loss": 2.2473, + "step": 9593 + }, + { + "epoch": 1.7983130271790064, + "grad_norm": 50293.30859375, + "learning_rate": 5.322735600139783e-05, + "loss": 2.23, + "step": 9594 + }, + { + "epoch": 1.7985004686035615, + "grad_norm": 54793.00390625, + "learning_rate": 5.321951443739779e-05, + "loss": 2.1188, + "step": 9595 + }, + { + "epoch": 1.7986879100281161, + "grad_norm": 53391.80078125, + "learning_rate": 5.3211672793879916e-05, + "loss": 2.1831, + "step": 9596 + }, + { + "epoch": 1.7988753514526712, + "grad_norm": 50894.12890625, + "learning_rate": 5.320383107103787e-05, + "loss": 2.2769, + "step": 9597 + }, + { + "epoch": 1.7990627928772258, + "grad_norm": 50087.45703125, + "learning_rate": 5.319598926906537e-05, + "loss": 2.1974, + "step": 9598 + }, + { + "epoch": 1.7992502343017807, + "grad_norm": 51269.6015625, + "learning_rate": 5.318814738815606e-05, + "loss": 2.2795, + "step": 9599 + }, + { + "epoch": 1.7994376757263355, + "grad_norm": 50578.4921875, + "learning_rate": 5.3180305428503665e-05, + "loss": 2.2672, + "step": 9600 + }, + { + "epoch": 1.7996251171508904, + "grad_norm": 61023.78515625, + "learning_rate": 5.317246339030183e-05, + "loss": 2.0957, + "step": 9601 + }, + { + "epoch": 1.7998125585754452, + "grad_norm": 52293.97265625, + "learning_rate": 5.316462127374428e-05, + "loss": 2.1947, + "step": 9602 + }, + { + "epoch": 1.8, + "grad_norm": 49381.59375, + "learning_rate": 5.315677907902468e-05, + "loss": 2.2466, + "step": 9603 + }, + { + "epoch": 1.800187441424555, + "grad_norm": 54502.58984375, + "learning_rate": 5.3148936806336714e-05, + "loss": 2.2185, + "step": 9604 + }, + { + "epoch": 1.8003748828491095, + "grad_norm": 52547.27734375, + "learning_rate": 5.314109445587412e-05, + "loss": 2.1863, + "step": 9605 + }, + { + "epoch": 1.8005623242736646, + "grad_norm": 49574.53125, + "learning_rate": 5.313325202783055e-05, + "loss": 2.2246, + "step": 9606 + }, + { + "epoch": 1.8007497656982192, + "grad_norm": 51439.015625, + "learning_rate": 5.3125409522399706e-05, + "loss": 2.1627, + "step": 9607 + }, + { + "epoch": 1.8009372071227743, + "grad_norm": 54122.2421875, + "learning_rate": 5.311756693977531e-05, + "loss": 2.2382, + "step": 9608 + }, + { + "epoch": 1.801124648547329, + "grad_norm": 50835.89453125, + "learning_rate": 5.3109724280151055e-05, + "loss": 2.187, + "step": 9609 + }, + { + "epoch": 1.8013120899718837, + "grad_norm": 55306.8203125, + "learning_rate": 5.310188154372064e-05, + "loss": 2.1457, + "step": 9610 + }, + { + "epoch": 1.8014995313964386, + "grad_norm": 52015.5, + "learning_rate": 5.309403873067779e-05, + "loss": 2.2507, + "step": 9611 + }, + { + "epoch": 1.8016869728209934, + "grad_norm": 53838.7109375, + "learning_rate": 5.3086195841216155e-05, + "loss": 2.2413, + "step": 9612 + }, + { + "epoch": 1.8018744142455483, + "grad_norm": 48251.72265625, + "learning_rate": 5.307835287552951e-05, + "loss": 2.2585, + "step": 9613 + }, + { + "epoch": 1.8020618556701031, + "grad_norm": 52853.53515625, + "learning_rate": 5.3070509833811546e-05, + "loss": 2.1639, + "step": 9614 + }, + { + "epoch": 1.802249297094658, + "grad_norm": 58048.05859375, + "learning_rate": 5.306266671625596e-05, + "loss": 2.1721, + "step": 9615 + }, + { + "epoch": 1.8024367385192126, + "grad_norm": 51637.140625, + "learning_rate": 5.3054823523056474e-05, + "loss": 2.2633, + "step": 9616 + }, + { + "epoch": 1.8026241799437677, + "grad_norm": 48182.1484375, + "learning_rate": 5.304698025440681e-05, + "loss": 2.2814, + "step": 9617 + }, + { + "epoch": 1.8028116213683223, + "grad_norm": 56006.81640625, + "learning_rate": 5.3039136910500696e-05, + "loss": 2.1326, + "step": 9618 + }, + { + "epoch": 1.8029990627928774, + "grad_norm": 51365.44921875, + "learning_rate": 5.303129349153182e-05, + "loss": 2.2118, + "step": 9619 + }, + { + "epoch": 1.803186504217432, + "grad_norm": 51926.12890625, + "learning_rate": 5.3023449997693944e-05, + "loss": 2.2059, + "step": 9620 + }, + { + "epoch": 1.8033739456419868, + "grad_norm": 52700.50390625, + "learning_rate": 5.301560642918076e-05, + "loss": 2.134, + "step": 9621 + }, + { + "epoch": 1.8035613870665417, + "grad_norm": 49569.0078125, + "learning_rate": 5.300776278618601e-05, + "loss": 2.2004, + "step": 9622 + }, + { + "epoch": 1.8037488284910965, + "grad_norm": 48412.109375, + "learning_rate": 5.299991906890341e-05, + "loss": 2.1669, + "step": 9623 + }, + { + "epoch": 1.8039362699156514, + "grad_norm": 51645.77734375, + "learning_rate": 5.299207527752672e-05, + "loss": 2.2657, + "step": 9624 + }, + { + "epoch": 1.8041237113402062, + "grad_norm": 52160.5234375, + "learning_rate": 5.298423141224963e-05, + "loss": 2.2986, + "step": 9625 + }, + { + "epoch": 1.804311152764761, + "grad_norm": 50391.78515625, + "learning_rate": 5.297638747326591e-05, + "loss": 2.2679, + "step": 9626 + }, + { + "epoch": 1.8044985941893157, + "grad_norm": 54957.4140625, + "learning_rate": 5.296854346076926e-05, + "loss": 2.202, + "step": 9627 + }, + { + "epoch": 1.8046860356138708, + "grad_norm": 51531.0859375, + "learning_rate": 5.296069937495346e-05, + "loss": 2.2199, + "step": 9628 + }, + { + "epoch": 1.8048734770384254, + "grad_norm": 50493.6328125, + "learning_rate": 5.2952855216012195e-05, + "loss": 2.2412, + "step": 9629 + }, + { + "epoch": 1.8050609184629804, + "grad_norm": 48403.56640625, + "learning_rate": 5.294501098413925e-05, + "loss": 2.1974, + "step": 9630 + }, + { + "epoch": 1.805248359887535, + "grad_norm": 47197.3671875, + "learning_rate": 5.293716667952837e-05, + "loss": 2.2156, + "step": 9631 + }, + { + "epoch": 1.8054358013120901, + "grad_norm": 50029.0234375, + "learning_rate": 5.292932230237324e-05, + "loss": 2.2052, + "step": 9632 + }, + { + "epoch": 1.8056232427366448, + "grad_norm": 48981.6328125, + "learning_rate": 5.2921477852867685e-05, + "loss": 2.2161, + "step": 9633 + }, + { + "epoch": 1.8058106841611996, + "grad_norm": 50326.46484375, + "learning_rate": 5.2913633331205384e-05, + "loss": 2.2326, + "step": 9634 + }, + { + "epoch": 1.8059981255857545, + "grad_norm": 51630.80859375, + "learning_rate": 5.290578873758014e-05, + "loss": 2.2619, + "step": 9635 + }, + { + "epoch": 1.8061855670103093, + "grad_norm": 55876.8125, + "learning_rate": 5.289794407218568e-05, + "loss": 2.2218, + "step": 9636 + }, + { + "epoch": 1.8063730084348641, + "grad_norm": 51557.609375, + "learning_rate": 5.289009933521575e-05, + "loss": 2.202, + "step": 9637 + }, + { + "epoch": 1.8065604498594188, + "grad_norm": 53003.69921875, + "learning_rate": 5.28822545268641e-05, + "loss": 2.2245, + "step": 9638 + }, + { + "epoch": 1.8067478912839738, + "grad_norm": 46116.0546875, + "learning_rate": 5.287440964732453e-05, + "loss": 2.1797, + "step": 9639 + }, + { + "epoch": 1.8069353327085285, + "grad_norm": 47203.82421875, + "learning_rate": 5.2866564696790744e-05, + "loss": 2.2618, + "step": 9640 + }, + { + "epoch": 1.8071227741330835, + "grad_norm": 53717.82421875, + "learning_rate": 5.2858719675456545e-05, + "loss": 2.2642, + "step": 9641 + }, + { + "epoch": 1.8073102155576382, + "grad_norm": 50360.953125, + "learning_rate": 5.2850874583515666e-05, + "loss": 2.2274, + "step": 9642 + }, + { + "epoch": 1.8074976569821932, + "grad_norm": 56763.70703125, + "learning_rate": 5.284302942116188e-05, + "loss": 2.2266, + "step": 9643 + }, + { + "epoch": 1.8076850984067478, + "grad_norm": 49495.0703125, + "learning_rate": 5.283518418858896e-05, + "loss": 2.2211, + "step": 9644 + }, + { + "epoch": 1.8078725398313027, + "grad_norm": 56008.5703125, + "learning_rate": 5.282733888599066e-05, + "loss": 2.1952, + "step": 9645 + }, + { + "epoch": 1.8080599812558575, + "grad_norm": 53259.875, + "learning_rate": 5.281949351356077e-05, + "loss": 2.1809, + "step": 9646 + }, + { + "epoch": 1.8082474226804124, + "grad_norm": 48507.97265625, + "learning_rate": 5.281164807149302e-05, + "loss": 2.1941, + "step": 9647 + }, + { + "epoch": 1.8084348641049672, + "grad_norm": 52930.05078125, + "learning_rate": 5.280380255998124e-05, + "loss": 2.1068, + "step": 9648 + }, + { + "epoch": 1.808622305529522, + "grad_norm": 50751.5703125, + "learning_rate": 5.2795956979219154e-05, + "loss": 2.2104, + "step": 9649 + }, + { + "epoch": 1.808809746954077, + "grad_norm": 50939.75, + "learning_rate": 5.278811132940055e-05, + "loss": 2.2466, + "step": 9650 + }, + { + "epoch": 1.8089971883786315, + "grad_norm": 48895.41796875, + "learning_rate": 5.2780265610719216e-05, + "loss": 2.2566, + "step": 9651 + }, + { + "epoch": 1.8091846298031866, + "grad_norm": 48507.6953125, + "learning_rate": 5.277241982336892e-05, + "loss": 2.253, + "step": 9652 + }, + { + "epoch": 1.8093720712277412, + "grad_norm": 49910.95703125, + "learning_rate": 5.276457396754346e-05, + "loss": 2.2427, + "step": 9653 + }, + { + "epoch": 1.8095595126522963, + "grad_norm": 49609.796875, + "learning_rate": 5.27567280434366e-05, + "loss": 2.1874, + "step": 9654 + }, + { + "epoch": 1.809746954076851, + "grad_norm": 52707.21875, + "learning_rate": 5.2748882051242136e-05, + "loss": 2.2366, + "step": 9655 + }, + { + "epoch": 1.8099343955014058, + "grad_norm": 51900.7109375, + "learning_rate": 5.274103599115385e-05, + "loss": 2.2988, + "step": 9656 + }, + { + "epoch": 1.8101218369259606, + "grad_norm": 52245.8671875, + "learning_rate": 5.273318986336552e-05, + "loss": 2.2302, + "step": 9657 + }, + { + "epoch": 1.8103092783505155, + "grad_norm": 54714.671875, + "learning_rate": 5.272534366807096e-05, + "loss": 2.2084, + "step": 9658 + }, + { + "epoch": 1.8104967197750703, + "grad_norm": 51348.234375, + "learning_rate": 5.2717497405463935e-05, + "loss": 2.2769, + "step": 9659 + }, + { + "epoch": 1.8106841611996252, + "grad_norm": 53455.171875, + "learning_rate": 5.2709651075738244e-05, + "loss": 2.2176, + "step": 9660 + }, + { + "epoch": 1.81087160262418, + "grad_norm": 51792.8359375, + "learning_rate": 5.2701804679087695e-05, + "loss": 2.2401, + "step": 9661 + }, + { + "epoch": 1.8110590440487346, + "grad_norm": 53042.015625, + "learning_rate": 5.269395821570605e-05, + "loss": 2.1563, + "step": 9662 + }, + { + "epoch": 1.8112464854732897, + "grad_norm": 53027.52734375, + "learning_rate": 5.268611168578717e-05, + "loss": 2.1552, + "step": 9663 + }, + { + "epoch": 1.8114339268978443, + "grad_norm": 58744.6328125, + "learning_rate": 5.267826508952478e-05, + "loss": 2.1856, + "step": 9664 + }, + { + "epoch": 1.8116213683223994, + "grad_norm": 49891.046875, + "learning_rate": 5.2670418427112724e-05, + "loss": 2.2511, + "step": 9665 + }, + { + "epoch": 1.811808809746954, + "grad_norm": 49227.36328125, + "learning_rate": 5.266257169874479e-05, + "loss": 2.1561, + "step": 9666 + }, + { + "epoch": 1.8119962511715089, + "grad_norm": 55396.3515625, + "learning_rate": 5.265472490461479e-05, + "loss": 2.2056, + "step": 9667 + }, + { + "epoch": 1.8121836925960637, + "grad_norm": 51208.078125, + "learning_rate": 5.2646878044916537e-05, + "loss": 2.2056, + "step": 9668 + }, + { + "epoch": 1.8123711340206186, + "grad_norm": 47004.07421875, + "learning_rate": 5.263903111984383e-05, + "loss": 2.2407, + "step": 9669 + }, + { + "epoch": 1.8125585754451734, + "grad_norm": 50644.69140625, + "learning_rate": 5.263118412959045e-05, + "loss": 2.1743, + "step": 9670 + }, + { + "epoch": 1.8127460168697282, + "grad_norm": 52307.8671875, + "learning_rate": 5.2623337074350264e-05, + "loss": 2.1358, + "step": 9671 + }, + { + "epoch": 1.812933458294283, + "grad_norm": 53115.82421875, + "learning_rate": 5.2615489954317024e-05, + "loss": 2.1565, + "step": 9672 + }, + { + "epoch": 1.8131208997188377, + "grad_norm": 50008.59375, + "learning_rate": 5.2607642769684605e-05, + "loss": 2.2756, + "step": 9673 + }, + { + "epoch": 1.8133083411433928, + "grad_norm": 54225.640625, + "learning_rate": 5.259979552064677e-05, + "loss": 2.253, + "step": 9674 + }, + { + "epoch": 1.8134957825679474, + "grad_norm": 53624.453125, + "learning_rate": 5.259194820739736e-05, + "loss": 2.1857, + "step": 9675 + }, + { + "epoch": 1.8136832239925025, + "grad_norm": 52156.765625, + "learning_rate": 5.25841008301302e-05, + "loss": 2.2092, + "step": 9676 + }, + { + "epoch": 1.813870665417057, + "grad_norm": 51050.328125, + "learning_rate": 5.257625338903907e-05, + "loss": 2.1871, + "step": 9677 + }, + { + "epoch": 1.814058106841612, + "grad_norm": 52052.99609375, + "learning_rate": 5.2568405884317864e-05, + "loss": 2.2141, + "step": 9678 + }, + { + "epoch": 1.8142455482661668, + "grad_norm": 51413.734375, + "learning_rate": 5.256055831616034e-05, + "loss": 2.1927, + "step": 9679 + }, + { + "epoch": 1.8144329896907216, + "grad_norm": 52687.53125, + "learning_rate": 5.2552710684760344e-05, + "loss": 2.2001, + "step": 9680 + }, + { + "epoch": 1.8146204311152765, + "grad_norm": 49005.390625, + "learning_rate": 5.2544862990311696e-05, + "loss": 2.1846, + "step": 9681 + }, + { + "epoch": 1.8148078725398313, + "grad_norm": 48787.13671875, + "learning_rate": 5.253701523300824e-05, + "loss": 2.2083, + "step": 9682 + }, + { + "epoch": 1.8149953139643862, + "grad_norm": 51533.0390625, + "learning_rate": 5.25291674130438e-05, + "loss": 2.1409, + "step": 9683 + }, + { + "epoch": 1.8151827553889408, + "grad_norm": 50258.234375, + "learning_rate": 5.252131953061221e-05, + "loss": 2.2713, + "step": 9684 + }, + { + "epoch": 1.8153701968134959, + "grad_norm": 51742.2734375, + "learning_rate": 5.2513471585907284e-05, + "loss": 2.2358, + "step": 9685 + }, + { + "epoch": 1.8155576382380505, + "grad_norm": 50651.05078125, + "learning_rate": 5.2505623579122874e-05, + "loss": 2.2402, + "step": 9686 + }, + { + "epoch": 1.8157450796626056, + "grad_norm": 52833.3984375, + "learning_rate": 5.249777551045281e-05, + "loss": 2.236, + "step": 9687 + }, + { + "epoch": 1.8159325210871602, + "grad_norm": 49042.9296875, + "learning_rate": 5.248992738009093e-05, + "loss": 2.2398, + "step": 9688 + }, + { + "epoch": 1.816119962511715, + "grad_norm": 52241.35546875, + "learning_rate": 5.248207918823108e-05, + "loss": 2.2477, + "step": 9689 + }, + { + "epoch": 1.8163074039362699, + "grad_norm": 51259.7109375, + "learning_rate": 5.247423093506707e-05, + "loss": 2.1387, + "step": 9690 + }, + { + "epoch": 1.8164948453608247, + "grad_norm": 49582.51171875, + "learning_rate": 5.2466382620792785e-05, + "loss": 2.2643, + "step": 9691 + }, + { + "epoch": 1.8166822867853796, + "grad_norm": 48394.62890625, + "learning_rate": 5.2458534245602033e-05, + "loss": 2.2419, + "step": 9692 + }, + { + "epoch": 1.8168697282099344, + "grad_norm": 50267.1328125, + "learning_rate": 5.245068580968868e-05, + "loss": 2.2397, + "step": 9693 + }, + { + "epoch": 1.8170571696344893, + "grad_norm": 49857.16015625, + "learning_rate": 5.244283731324656e-05, + "loss": 2.21, + "step": 9694 + }, + { + "epoch": 1.817244611059044, + "grad_norm": 52585.75390625, + "learning_rate": 5.243498875646953e-05, + "loss": 2.1772, + "step": 9695 + }, + { + "epoch": 1.817432052483599, + "grad_norm": 52357.4375, + "learning_rate": 5.242714013955144e-05, + "loss": 2.217, + "step": 9696 + }, + { + "epoch": 1.8176194939081536, + "grad_norm": 47581.05859375, + "learning_rate": 5.241929146268611e-05, + "loss": 2.204, + "step": 9697 + }, + { + "epoch": 1.8178069353327087, + "grad_norm": 53323.16015625, + "learning_rate": 5.241144272606744e-05, + "loss": 2.2355, + "step": 9698 + }, + { + "epoch": 1.8179943767572633, + "grad_norm": 54874.53515625, + "learning_rate": 5.240359392988927e-05, + "loss": 2.1631, + "step": 9699 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 55762.4296875, + "learning_rate": 5.2395745074345424e-05, + "loss": 2.2183, + "step": 9700 + }, + { + "epoch": 1.818369259606373, + "grad_norm": 55188.3125, + "learning_rate": 5.23878961596298e-05, + "loss": 2.2439, + "step": 9701 + }, + { + "epoch": 1.8185567010309278, + "grad_norm": 56898.578125, + "learning_rate": 5.238004718593622e-05, + "loss": 2.1623, + "step": 9702 + }, + { + "epoch": 1.8187441424554827, + "grad_norm": 49690.79296875, + "learning_rate": 5.237219815345857e-05, + "loss": 2.2257, + "step": 9703 + }, + { + "epoch": 1.8189315838800375, + "grad_norm": 53157.4609375, + "learning_rate": 5.23643490623907e-05, + "loss": 2.1879, + "step": 9704 + }, + { + "epoch": 1.8191190253045924, + "grad_norm": 52816.26953125, + "learning_rate": 5.235649991292647e-05, + "loss": 2.2148, + "step": 9705 + }, + { + "epoch": 1.819306466729147, + "grad_norm": 52240.64453125, + "learning_rate": 5.234865070525976e-05, + "loss": 2.2093, + "step": 9706 + }, + { + "epoch": 1.819493908153702, + "grad_norm": 49780.72265625, + "learning_rate": 5.2340801439584406e-05, + "loss": 2.1866, + "step": 9707 + }, + { + "epoch": 1.8196813495782567, + "grad_norm": 54391.24609375, + "learning_rate": 5.2332952116094306e-05, + "loss": 2.0525, + "step": 9708 + }, + { + "epoch": 1.8198687910028117, + "grad_norm": 53139.2265625, + "learning_rate": 5.23251027349833e-05, + "loss": 2.2302, + "step": 9709 + }, + { + "epoch": 1.8200562324273664, + "grad_norm": 46272.71484375, + "learning_rate": 5.231725329644528e-05, + "loss": 2.1811, + "step": 9710 + }, + { + "epoch": 1.8202436738519214, + "grad_norm": 49976.671875, + "learning_rate": 5.2309403800674095e-05, + "loss": 2.2479, + "step": 9711 + }, + { + "epoch": 1.820431115276476, + "grad_norm": 50820.796875, + "learning_rate": 5.2301554247863635e-05, + "loss": 2.1832, + "step": 9712 + }, + { + "epoch": 1.820618556701031, + "grad_norm": 52449.69921875, + "learning_rate": 5.229370463820778e-05, + "loss": 2.1851, + "step": 9713 + }, + { + "epoch": 1.8208059981255857, + "grad_norm": 50933.87890625, + "learning_rate": 5.22858549719004e-05, + "loss": 2.2467, + "step": 9714 + }, + { + "epoch": 1.8209934395501406, + "grad_norm": 55484.5078125, + "learning_rate": 5.2278005249135345e-05, + "loss": 2.1772, + "step": 9715 + }, + { + "epoch": 1.8211808809746954, + "grad_norm": 49776.7109375, + "learning_rate": 5.227015547010652e-05, + "loss": 2.1983, + "step": 9716 + }, + { + "epoch": 1.8213683223992503, + "grad_norm": 54462.41015625, + "learning_rate": 5.22623056350078e-05, + "loss": 2.2637, + "step": 9717 + }, + { + "epoch": 1.8215557638238051, + "grad_norm": 52560.0078125, + "learning_rate": 5.225445574403307e-05, + "loss": 2.2754, + "step": 9718 + }, + { + "epoch": 1.8217432052483598, + "grad_norm": 56572.9453125, + "learning_rate": 5.2246605797376206e-05, + "loss": 2.263, + "step": 9719 + }, + { + "epoch": 1.8219306466729148, + "grad_norm": 53099.2421875, + "learning_rate": 5.223875579523108e-05, + "loss": 2.2048, + "step": 9720 + }, + { + "epoch": 1.8221180880974694, + "grad_norm": 50973.2265625, + "learning_rate": 5.2230905737791605e-05, + "loss": 2.2325, + "step": 9721 + }, + { + "epoch": 1.8223055295220245, + "grad_norm": 51189.81640625, + "learning_rate": 5.2223055625251636e-05, + "loss": 2.1974, + "step": 9722 + }, + { + "epoch": 1.8224929709465791, + "grad_norm": 53096.93359375, + "learning_rate": 5.2215205457805074e-05, + "loss": 2.2058, + "step": 9723 + }, + { + "epoch": 1.822680412371134, + "grad_norm": 49067.9609375, + "learning_rate": 5.2207355235645814e-05, + "loss": 2.2679, + "step": 9724 + }, + { + "epoch": 1.8228678537956888, + "grad_norm": 48961.5703125, + "learning_rate": 5.219950495896774e-05, + "loss": 2.2047, + "step": 9725 + }, + { + "epoch": 1.8230552952202437, + "grad_norm": 52603.46875, + "learning_rate": 5.2191654627964746e-05, + "loss": 2.2102, + "step": 9726 + }, + { + "epoch": 1.8232427366447985, + "grad_norm": 51651.2421875, + "learning_rate": 5.218380424283072e-05, + "loss": 2.1732, + "step": 9727 + }, + { + "epoch": 1.8234301780693534, + "grad_norm": 49753.2265625, + "learning_rate": 5.217595380375958e-05, + "loss": 2.2648, + "step": 9728 + }, + { + "epoch": 1.8236176194939082, + "grad_norm": 55697.83203125, + "learning_rate": 5.216810331094519e-05, + "loss": 2.2835, + "step": 9729 + }, + { + "epoch": 1.8238050609184628, + "grad_norm": 46540.42578125, + "learning_rate": 5.2160252764581464e-05, + "loss": 2.2417, + "step": 9730 + }, + { + "epoch": 1.823992502343018, + "grad_norm": 50262.984375, + "learning_rate": 5.215240216486228e-05, + "loss": 2.1571, + "step": 9731 + }, + { + "epoch": 1.8241799437675725, + "grad_norm": 50231.2265625, + "learning_rate": 5.2144551511981565e-05, + "loss": 2.2927, + "step": 9732 + }, + { + "epoch": 1.8243673851921276, + "grad_norm": 52852.109375, + "learning_rate": 5.21367008061332e-05, + "loss": 2.2445, + "step": 9733 + }, + { + "epoch": 1.8245548266166822, + "grad_norm": 53048.59375, + "learning_rate": 5.212885004751111e-05, + "loss": 2.2538, + "step": 9734 + }, + { + "epoch": 1.824742268041237, + "grad_norm": 56812.921875, + "learning_rate": 5.212099923630916e-05, + "loss": 2.1879, + "step": 9735 + }, + { + "epoch": 1.824929709465792, + "grad_norm": 52808.2890625, + "learning_rate": 5.211314837272131e-05, + "loss": 2.1704, + "step": 9736 + }, + { + "epoch": 1.8251171508903468, + "grad_norm": 47956.08203125, + "learning_rate": 5.210529745694141e-05, + "loss": 2.2695, + "step": 9737 + }, + { + "epoch": 1.8253045923149016, + "grad_norm": 49254.921875, + "learning_rate": 5.2097446489163404e-05, + "loss": 2.256, + "step": 9738 + }, + { + "epoch": 1.8254920337394565, + "grad_norm": 47984.47265625, + "learning_rate": 5.2089595469581185e-05, + "loss": 2.2935, + "step": 9739 + }, + { + "epoch": 1.8256794751640113, + "grad_norm": 46927.84765625, + "learning_rate": 5.2081744398388665e-05, + "loss": 2.2427, + "step": 9740 + }, + { + "epoch": 1.825866916588566, + "grad_norm": 47496.50390625, + "learning_rate": 5.207389327577976e-05, + "loss": 2.214, + "step": 9741 + }, + { + "epoch": 1.826054358013121, + "grad_norm": 55956.0078125, + "learning_rate": 5.20660421019484e-05, + "loss": 2.1889, + "step": 9742 + }, + { + "epoch": 1.8262417994376756, + "grad_norm": 51302.15234375, + "learning_rate": 5.2058190877088456e-05, + "loss": 2.2864, + "step": 9743 + }, + { + "epoch": 1.8264292408622307, + "grad_norm": 47951.28125, + "learning_rate": 5.205033960139386e-05, + "loss": 2.2611, + "step": 9744 + }, + { + "epoch": 1.8266166822867853, + "grad_norm": 52050.5546875, + "learning_rate": 5.204248827505854e-05, + "loss": 2.2611, + "step": 9745 + }, + { + "epoch": 1.8268041237113402, + "grad_norm": 50551.9609375, + "learning_rate": 5.203463689827641e-05, + "loss": 2.1979, + "step": 9746 + }, + { + "epoch": 1.826991565135895, + "grad_norm": 50376.6171875, + "learning_rate": 5.202678547124139e-05, + "loss": 2.1736, + "step": 9747 + }, + { + "epoch": 1.8271790065604498, + "grad_norm": 52583.2734375, + "learning_rate": 5.201893399414739e-05, + "loss": 2.1444, + "step": 9748 + }, + { + "epoch": 1.8273664479850047, + "grad_norm": 50919.54296875, + "learning_rate": 5.201108246718835e-05, + "loss": 2.1944, + "step": 9749 + }, + { + "epoch": 1.8275538894095595, + "grad_norm": 49653.73046875, + "learning_rate": 5.200323089055816e-05, + "loss": 2.2963, + "step": 9750 + }, + { + "epoch": 1.8277413308341144, + "grad_norm": 54161.7265625, + "learning_rate": 5.19953792644508e-05, + "loss": 2.1569, + "step": 9751 + }, + { + "epoch": 1.827928772258669, + "grad_norm": 46355.37890625, + "learning_rate": 5.198752758906012e-05, + "loss": 2.192, + "step": 9752 + }, + { + "epoch": 1.828116213683224, + "grad_norm": 48678.87890625, + "learning_rate": 5.1979675864580114e-05, + "loss": 2.2286, + "step": 9753 + }, + { + "epoch": 1.8283036551077787, + "grad_norm": 51040.19140625, + "learning_rate": 5.197182409120466e-05, + "loss": 2.1885, + "step": 9754 + }, + { + "epoch": 1.8284910965323338, + "grad_norm": 50813.703125, + "learning_rate": 5.196397226912771e-05, + "loss": 2.2117, + "step": 9755 + }, + { + "epoch": 1.8286785379568884, + "grad_norm": 49974.08984375, + "learning_rate": 5.195612039854321e-05, + "loss": 2.2429, + "step": 9756 + }, + { + "epoch": 1.8288659793814435, + "grad_norm": 52969.27734375, + "learning_rate": 5.1948268479645055e-05, + "loss": 2.153, + "step": 9757 + }, + { + "epoch": 1.829053420805998, + "grad_norm": 52485.0390625, + "learning_rate": 5.19404165126272e-05, + "loss": 2.2214, + "step": 9758 + }, + { + "epoch": 1.829240862230553, + "grad_norm": 56150.25390625, + "learning_rate": 5.193256449768358e-05, + "loss": 2.2375, + "step": 9759 + }, + { + "epoch": 1.8294283036551078, + "grad_norm": 49930.77734375, + "learning_rate": 5.19247124350081e-05, + "loss": 2.263, + "step": 9760 + }, + { + "epoch": 1.8296157450796626, + "grad_norm": 52504.98046875, + "learning_rate": 5.191686032479474e-05, + "loss": 2.1865, + "step": 9761 + }, + { + "epoch": 1.8298031865042175, + "grad_norm": 52153.9765625, + "learning_rate": 5.1909008167237415e-05, + "loss": 2.2109, + "step": 9762 + }, + { + "epoch": 1.829990627928772, + "grad_norm": 49783.3671875, + "learning_rate": 5.1901155962530044e-05, + "loss": 2.2297, + "step": 9763 + }, + { + "epoch": 1.8301780693533272, + "grad_norm": 51787.2578125, + "learning_rate": 5.18933037108666e-05, + "loss": 2.236, + "step": 9764 + }, + { + "epoch": 1.8303655107778818, + "grad_norm": 47029.6953125, + "learning_rate": 5.1885451412441e-05, + "loss": 2.2705, + "step": 9765 + }, + { + "epoch": 1.8305529522024369, + "grad_norm": 54240.84765625, + "learning_rate": 5.187759906744721e-05, + "loss": 2.2421, + "step": 9766 + }, + { + "epoch": 1.8307403936269915, + "grad_norm": 50521.734375, + "learning_rate": 5.186974667607914e-05, + "loss": 2.152, + "step": 9767 + }, + { + "epoch": 1.8309278350515465, + "grad_norm": 50585.23828125, + "learning_rate": 5.1861894238530776e-05, + "loss": 2.2827, + "step": 9768 + }, + { + "epoch": 1.8311152764761012, + "grad_norm": 45524.51953125, + "learning_rate": 5.185404175499602e-05, + "loss": 2.192, + "step": 9769 + }, + { + "epoch": 1.831302717900656, + "grad_norm": 56337.90234375, + "learning_rate": 5.184618922566883e-05, + "loss": 2.1878, + "step": 9770 + }, + { + "epoch": 1.8314901593252109, + "grad_norm": 49333.83984375, + "learning_rate": 5.183833665074318e-05, + "loss": 2.3081, + "step": 9771 + }, + { + "epoch": 1.8316776007497657, + "grad_norm": 46512.046875, + "learning_rate": 5.183048403041298e-05, + "loss": 2.3152, + "step": 9772 + }, + { + "epoch": 1.8318650421743206, + "grad_norm": 46960.29296875, + "learning_rate": 5.182263136487221e-05, + "loss": 2.2187, + "step": 9773 + }, + { + "epoch": 1.8320524835988754, + "grad_norm": 50955.2109375, + "learning_rate": 5.18147786543148e-05, + "loss": 2.2636, + "step": 9774 + }, + { + "epoch": 1.8322399250234302, + "grad_norm": 51007.92578125, + "learning_rate": 5.180692589893471e-05, + "loss": 2.2128, + "step": 9775 + }, + { + "epoch": 1.8324273664479849, + "grad_norm": 55655.53515625, + "learning_rate": 5.1799073098925896e-05, + "loss": 2.2456, + "step": 9776 + }, + { + "epoch": 1.83261480787254, + "grad_norm": 52929.8671875, + "learning_rate": 5.179122025448232e-05, + "loss": 2.1753, + "step": 9777 + }, + { + "epoch": 1.8328022492970946, + "grad_norm": 48484.8125, + "learning_rate": 5.17833673657979e-05, + "loss": 2.1954, + "step": 9778 + }, + { + "epoch": 1.8329896907216496, + "grad_norm": 55220.5, + "learning_rate": 5.177551443306664e-05, + "loss": 2.2028, + "step": 9779 + }, + { + "epoch": 1.8331771321462043, + "grad_norm": 56168.91015625, + "learning_rate": 5.1767661456482477e-05, + "loss": 2.2239, + "step": 9780 + }, + { + "epoch": 1.833364573570759, + "grad_norm": 51100.1484375, + "learning_rate": 5.1759808436239356e-05, + "loss": 2.2554, + "step": 9781 + }, + { + "epoch": 1.833552014995314, + "grad_norm": 54972.46484375, + "learning_rate": 5.175195537253125e-05, + "loss": 2.1619, + "step": 9782 + }, + { + "epoch": 1.8337394564198688, + "grad_norm": 53405.00390625, + "learning_rate": 5.174410226555212e-05, + "loss": 2.1146, + "step": 9783 + }, + { + "epoch": 1.8339268978444236, + "grad_norm": 49471.8828125, + "learning_rate": 5.1736249115495926e-05, + "loss": 2.1848, + "step": 9784 + }, + { + "epoch": 1.8341143392689785, + "grad_norm": 49896.2578125, + "learning_rate": 5.172839592255662e-05, + "loss": 2.2398, + "step": 9785 + }, + { + "epoch": 1.8343017806935333, + "grad_norm": 49566.80859375, + "learning_rate": 5.17205426869282e-05, + "loss": 2.1858, + "step": 9786 + }, + { + "epoch": 1.834489222118088, + "grad_norm": 50491.4140625, + "learning_rate": 5.171268940880458e-05, + "loss": 2.1859, + "step": 9787 + }, + { + "epoch": 1.834676663542643, + "grad_norm": 50225.89453125, + "learning_rate": 5.170483608837977e-05, + "loss": 2.2049, + "step": 9788 + }, + { + "epoch": 1.8348641049671977, + "grad_norm": 54722.8828125, + "learning_rate": 5.169698272584771e-05, + "loss": 2.2069, + "step": 9789 + }, + { + "epoch": 1.8350515463917527, + "grad_norm": 52828.40625, + "learning_rate": 5.1689129321402375e-05, + "loss": 2.2248, + "step": 9790 + }, + { + "epoch": 1.8352389878163073, + "grad_norm": 53586.7265625, + "learning_rate": 5.1681275875237736e-05, + "loss": 2.2487, + "step": 9791 + }, + { + "epoch": 1.8354264292408622, + "grad_norm": 48781.3125, + "learning_rate": 5.1673422387547774e-05, + "loss": 2.1735, + "step": 9792 + }, + { + "epoch": 1.835613870665417, + "grad_norm": 54445.9765625, + "learning_rate": 5.1665568858526433e-05, + "loss": 2.2025, + "step": 9793 + }, + { + "epoch": 1.8358013120899719, + "grad_norm": 47558.828125, + "learning_rate": 5.165771528836771e-05, + "loss": 2.1987, + "step": 9794 + }, + { + "epoch": 1.8359887535145267, + "grad_norm": 52612.0546875, + "learning_rate": 5.164986167726558e-05, + "loss": 2.1699, + "step": 9795 + }, + { + "epoch": 1.8361761949390816, + "grad_norm": 53472.93359375, + "learning_rate": 5.1642008025413977e-05, + "loss": 2.2172, + "step": 9796 + }, + { + "epoch": 1.8363636363636364, + "grad_norm": 47681.98046875, + "learning_rate": 5.1634154333006926e-05, + "loss": 2.2753, + "step": 9797 + }, + { + "epoch": 1.836551077788191, + "grad_norm": 48269.37890625, + "learning_rate": 5.1626300600238376e-05, + "loss": 2.2301, + "step": 9798 + }, + { + "epoch": 1.8367385192127461, + "grad_norm": 46756.28125, + "learning_rate": 5.161844682730231e-05, + "loss": 2.2583, + "step": 9799 + }, + { + "epoch": 1.8369259606373007, + "grad_norm": 49904.37890625, + "learning_rate": 5.1610593014392714e-05, + "loss": 2.1837, + "step": 9800 + }, + { + "epoch": 1.8371134020618558, + "grad_norm": 55634.3046875, + "learning_rate": 5.160273916170355e-05, + "loss": 2.2701, + "step": 9801 + }, + { + "epoch": 1.8373008434864104, + "grad_norm": 51174.6640625, + "learning_rate": 5.15948852694288e-05, + "loss": 2.262, + "step": 9802 + }, + { + "epoch": 1.8374882849109653, + "grad_norm": 48074.3515625, + "learning_rate": 5.158703133776246e-05, + "loss": 2.2205, + "step": 9803 + }, + { + "epoch": 1.8376757263355201, + "grad_norm": 52624.23046875, + "learning_rate": 5.15791773668985e-05, + "loss": 2.2424, + "step": 9804 + }, + { + "epoch": 1.837863167760075, + "grad_norm": 51131.55078125, + "learning_rate": 5.157132335703091e-05, + "loss": 2.2248, + "step": 9805 + }, + { + "epoch": 1.8380506091846298, + "grad_norm": 54077.5859375, + "learning_rate": 5.156346930835366e-05, + "loss": 2.1816, + "step": 9806 + }, + { + "epoch": 1.8382380506091847, + "grad_norm": 50939.48046875, + "learning_rate": 5.155561522106077e-05, + "loss": 2.2205, + "step": 9807 + }, + { + "epoch": 1.8384254920337395, + "grad_norm": 53956.734375, + "learning_rate": 5.154776109534617e-05, + "loss": 2.1397, + "step": 9808 + }, + { + "epoch": 1.8386129334582941, + "grad_norm": 51289.43359375, + "learning_rate": 5.15399069314039e-05, + "loss": 2.2417, + "step": 9809 + }, + { + "epoch": 1.8388003748828492, + "grad_norm": 50030.796875, + "learning_rate": 5.1532052729427914e-05, + "loss": 2.2377, + "step": 9810 + }, + { + "epoch": 1.8389878163074038, + "grad_norm": 53867.87109375, + "learning_rate": 5.152419848961222e-05, + "loss": 2.1747, + "step": 9811 + }, + { + "epoch": 1.839175257731959, + "grad_norm": 49051.2265625, + "learning_rate": 5.15163442121508e-05, + "loss": 2.2222, + "step": 9812 + }, + { + "epoch": 1.8393626991565135, + "grad_norm": 46482.953125, + "learning_rate": 5.150848989723763e-05, + "loss": 2.2636, + "step": 9813 + }, + { + "epoch": 1.8395501405810686, + "grad_norm": 55134.3671875, + "learning_rate": 5.1500635545066734e-05, + "loss": 2.3254, + "step": 9814 + }, + { + "epoch": 1.8397375820056232, + "grad_norm": 52110.296875, + "learning_rate": 5.1492781155832084e-05, + "loss": 2.1747, + "step": 9815 + }, + { + "epoch": 1.839925023430178, + "grad_norm": 51588.17578125, + "learning_rate": 5.1484926729727656e-05, + "loss": 2.1563, + "step": 9816 + }, + { + "epoch": 1.840112464854733, + "grad_norm": 51307.63671875, + "learning_rate": 5.147707226694748e-05, + "loss": 2.2638, + "step": 9817 + }, + { + "epoch": 1.8402999062792877, + "grad_norm": 48894.11328125, + "learning_rate": 5.146921776768552e-05, + "loss": 2.2615, + "step": 9818 + }, + { + "epoch": 1.8404873477038426, + "grad_norm": 52537.71875, + "learning_rate": 5.14613632321358e-05, + "loss": 2.226, + "step": 9819 + }, + { + "epoch": 1.8406747891283972, + "grad_norm": 49638.40625, + "learning_rate": 5.14535086604923e-05, + "loss": 2.1963, + "step": 9820 + }, + { + "epoch": 1.8408622305529523, + "grad_norm": 52965.34765625, + "learning_rate": 5.144565405294901e-05, + "loss": 2.1734, + "step": 9821 + }, + { + "epoch": 1.841049671977507, + "grad_norm": 48622.05078125, + "learning_rate": 5.143779940969995e-05, + "loss": 2.1865, + "step": 9822 + }, + { + "epoch": 1.841237113402062, + "grad_norm": 50339.890625, + "learning_rate": 5.142994473093909e-05, + "loss": 2.2458, + "step": 9823 + }, + { + "epoch": 1.8414245548266166, + "grad_norm": 49403.62109375, + "learning_rate": 5.142209001686048e-05, + "loss": 2.1798, + "step": 9824 + }, + { + "epoch": 1.8416119962511717, + "grad_norm": 50298.24609375, + "learning_rate": 5.141423526765807e-05, + "loss": 2.2284, + "step": 9825 + }, + { + "epoch": 1.8417994376757263, + "grad_norm": 51711.09375, + "learning_rate": 5.140638048352589e-05, + "loss": 2.2379, + "step": 9826 + }, + { + "epoch": 1.8419868791002811, + "grad_norm": 52767.66796875, + "learning_rate": 5.1398525664657946e-05, + "loss": 2.1784, + "step": 9827 + }, + { + "epoch": 1.842174320524836, + "grad_norm": 51523.76953125, + "learning_rate": 5.139067081124821e-05, + "loss": 2.2313, + "step": 9828 + }, + { + "epoch": 1.8423617619493908, + "grad_norm": 48452.43359375, + "learning_rate": 5.138281592349072e-05, + "loss": 2.2631, + "step": 9829 + }, + { + "epoch": 1.8425492033739457, + "grad_norm": 55763.953125, + "learning_rate": 5.137496100157947e-05, + "loss": 2.242, + "step": 9830 + }, + { + "epoch": 1.8427366447985005, + "grad_norm": 48309.51171875, + "learning_rate": 5.136710604570846e-05, + "loss": 2.2792, + "step": 9831 + }, + { + "epoch": 1.8429240862230554, + "grad_norm": 53953.421875, + "learning_rate": 5.13592510560717e-05, + "loss": 2.1587, + "step": 9832 + }, + { + "epoch": 1.84311152764761, + "grad_norm": 53976.96484375, + "learning_rate": 5.135139603286321e-05, + "loss": 2.2413, + "step": 9833 + }, + { + "epoch": 1.843298969072165, + "grad_norm": 51776.0859375, + "learning_rate": 5.1343540976276985e-05, + "loss": 2.1784, + "step": 9834 + }, + { + "epoch": 1.8434864104967197, + "grad_norm": 52293.9375, + "learning_rate": 5.1335685886507044e-05, + "loss": 2.1492, + "step": 9835 + }, + { + "epoch": 1.8436738519212748, + "grad_norm": 49904.30859375, + "learning_rate": 5.132783076374738e-05, + "loss": 2.2496, + "step": 9836 + }, + { + "epoch": 1.8438612933458294, + "grad_norm": 47646.2265625, + "learning_rate": 5.131997560819204e-05, + "loss": 2.2767, + "step": 9837 + }, + { + "epoch": 1.8440487347703842, + "grad_norm": 52112.81640625, + "learning_rate": 5.131212042003498e-05, + "loss": 2.2491, + "step": 9838 + }, + { + "epoch": 1.844236176194939, + "grad_norm": 51875.76171875, + "learning_rate": 5.130426519947028e-05, + "loss": 2.1953, + "step": 9839 + }, + { + "epoch": 1.844423617619494, + "grad_norm": 51327.109375, + "learning_rate": 5.1296409946691913e-05, + "loss": 2.1787, + "step": 9840 + }, + { + "epoch": 1.8446110590440488, + "grad_norm": 48283.1640625, + "learning_rate": 5.1288554661893896e-05, + "loss": 2.2674, + "step": 9841 + }, + { + "epoch": 1.8447985004686036, + "grad_norm": 50151.3359375, + "learning_rate": 5.1280699345270256e-05, + "loss": 2.1926, + "step": 9842 + }, + { + "epoch": 1.8449859418931585, + "grad_norm": 54770.6484375, + "learning_rate": 5.127284399701498e-05, + "loss": 2.2452, + "step": 9843 + }, + { + "epoch": 1.845173383317713, + "grad_norm": 50724.72265625, + "learning_rate": 5.1264988617322144e-05, + "loss": 2.175, + "step": 9844 + }, + { + "epoch": 1.8453608247422681, + "grad_norm": 54183.42578125, + "learning_rate": 5.125713320638571e-05, + "loss": 2.1827, + "step": 9845 + }, + { + "epoch": 1.8455482661668228, + "grad_norm": 52632.91796875, + "learning_rate": 5.1249277764399714e-05, + "loss": 2.271, + "step": 9846 + }, + { + "epoch": 1.8457357075913778, + "grad_norm": 54481.66015625, + "learning_rate": 5.1241422291558174e-05, + "loss": 2.1692, + "step": 9847 + }, + { + "epoch": 1.8459231490159325, + "grad_norm": 49596.296875, + "learning_rate": 5.123356678805512e-05, + "loss": 2.1976, + "step": 9848 + }, + { + "epoch": 1.8461105904404873, + "grad_norm": 52390.59765625, + "learning_rate": 5.1225711254084565e-05, + "loss": 2.2539, + "step": 9849 + }, + { + "epoch": 1.8462980318650422, + "grad_norm": 51861.7890625, + "learning_rate": 5.121785568984053e-05, + "loss": 2.2468, + "step": 9850 + }, + { + "epoch": 1.846485473289597, + "grad_norm": 50633.59375, + "learning_rate": 5.121000009551703e-05, + "loss": 2.2788, + "step": 9851 + }, + { + "epoch": 1.8466729147141518, + "grad_norm": 50874.58984375, + "learning_rate": 5.120214447130811e-05, + "loss": 2.2667, + "step": 9852 + }, + { + "epoch": 1.8468603561387067, + "grad_norm": 53236.48828125, + "learning_rate": 5.119428881740778e-05, + "loss": 2.2464, + "step": 9853 + }, + { + "epoch": 1.8470477975632615, + "grad_norm": 51642.3828125, + "learning_rate": 5.118643313401006e-05, + "loss": 2.217, + "step": 9854 + }, + { + "epoch": 1.8472352389878162, + "grad_norm": 50564.75, + "learning_rate": 5.117857742130898e-05, + "loss": 2.2356, + "step": 9855 + }, + { + "epoch": 1.8474226804123712, + "grad_norm": 50649.21484375, + "learning_rate": 5.117072167949856e-05, + "loss": 2.2315, + "step": 9856 + }, + { + "epoch": 1.8476101218369259, + "grad_norm": 48897.60546875, + "learning_rate": 5.116286590877284e-05, + "loss": 2.1457, + "step": 9857 + }, + { + "epoch": 1.847797563261481, + "grad_norm": 54884.13671875, + "learning_rate": 5.115501010932583e-05, + "loss": 2.1622, + "step": 9858 + }, + { + "epoch": 1.8479850046860355, + "grad_norm": 53910.125, + "learning_rate": 5.1147154281351587e-05, + "loss": 2.2175, + "step": 9859 + }, + { + "epoch": 1.8481724461105904, + "grad_norm": 52782.4765625, + "learning_rate": 5.11392984250441e-05, + "loss": 2.1558, + "step": 9860 + }, + { + "epoch": 1.8483598875351452, + "grad_norm": 52795.34375, + "learning_rate": 5.113144254059743e-05, + "loss": 2.2872, + "step": 9861 + }, + { + "epoch": 1.8485473289597, + "grad_norm": 59274.48828125, + "learning_rate": 5.112358662820559e-05, + "loss": 2.1478, + "step": 9862 + }, + { + "epoch": 1.848734770384255, + "grad_norm": 52896.58203125, + "learning_rate": 5.111573068806262e-05, + "loss": 2.3463, + "step": 9863 + }, + { + "epoch": 1.8489222118088098, + "grad_norm": 53224.7578125, + "learning_rate": 5.1107874720362546e-05, + "loss": 2.265, + "step": 9864 + }, + { + "epoch": 1.8491096532333646, + "grad_norm": 53102.53515625, + "learning_rate": 5.1100018725299416e-05, + "loss": 2.094, + "step": 9865 + }, + { + "epoch": 1.8492970946579192, + "grad_norm": 50930.96875, + "learning_rate": 5.1092162703067224e-05, + "loss": 2.2078, + "step": 9866 + }, + { + "epoch": 1.8494845360824743, + "grad_norm": 52695.54296875, + "learning_rate": 5.108430665386006e-05, + "loss": 2.3254, + "step": 9867 + }, + { + "epoch": 1.849671977507029, + "grad_norm": 57539.55859375, + "learning_rate": 5.107645057787191e-05, + "loss": 2.1707, + "step": 9868 + }, + { + "epoch": 1.849859418931584, + "grad_norm": 52393.85546875, + "learning_rate": 5.106859447529683e-05, + "loss": 2.2237, + "step": 9869 + }, + { + "epoch": 1.8500468603561386, + "grad_norm": 50114.25390625, + "learning_rate": 5.106073834632886e-05, + "loss": 2.2537, + "step": 9870 + }, + { + "epoch": 1.8502343017806935, + "grad_norm": 54930.38671875, + "learning_rate": 5.105288219116201e-05, + "loss": 2.1644, + "step": 9871 + }, + { + "epoch": 1.8504217432052483, + "grad_norm": 56814.6875, + "learning_rate": 5.104502600999035e-05, + "loss": 2.0723, + "step": 9872 + }, + { + "epoch": 1.8506091846298032, + "grad_norm": 48428.82421875, + "learning_rate": 5.103716980300789e-05, + "loss": 2.2298, + "step": 9873 + }, + { + "epoch": 1.850796626054358, + "grad_norm": 50070.87890625, + "learning_rate": 5.102931357040869e-05, + "loss": 2.2222, + "step": 9874 + }, + { + "epoch": 1.8509840674789129, + "grad_norm": 50850.90234375, + "learning_rate": 5.1021457312386776e-05, + "loss": 2.2058, + "step": 9875 + }, + { + "epoch": 1.8511715089034677, + "grad_norm": 48610.4296875, + "learning_rate": 5.1013601029136183e-05, + "loss": 2.2498, + "step": 9876 + }, + { + "epoch": 1.8513589503280223, + "grad_norm": 51016.8984375, + "learning_rate": 5.100574472085097e-05, + "loss": 2.2615, + "step": 9877 + }, + { + "epoch": 1.8515463917525774, + "grad_norm": 50318.48828125, + "learning_rate": 5.099788838772516e-05, + "loss": 2.2061, + "step": 9878 + }, + { + "epoch": 1.851733833177132, + "grad_norm": 53804.03125, + "learning_rate": 5.0990032029952805e-05, + "loss": 2.1938, + "step": 9879 + }, + { + "epoch": 1.851921274601687, + "grad_norm": 51893.73828125, + "learning_rate": 5.098217564772795e-05, + "loss": 2.1876, + "step": 9880 + }, + { + "epoch": 1.8521087160262417, + "grad_norm": 52299.015625, + "learning_rate": 5.0974319241244615e-05, + "loss": 2.2734, + "step": 9881 + }, + { + "epoch": 1.8522961574507968, + "grad_norm": 47281.03515625, + "learning_rate": 5.096646281069687e-05, + "loss": 2.2511, + "step": 9882 + }, + { + "epoch": 1.8524835988753514, + "grad_norm": 55007.578125, + "learning_rate": 5.095860635627874e-05, + "loss": 2.2149, + "step": 9883 + }, + { + "epoch": 1.8526710402999063, + "grad_norm": 46971.5625, + "learning_rate": 5.0950749878184265e-05, + "loss": 2.2862, + "step": 9884 + }, + { + "epoch": 1.852858481724461, + "grad_norm": 58296.1875, + "learning_rate": 5.094289337660753e-05, + "loss": 2.2276, + "step": 9885 + }, + { + "epoch": 1.853045923149016, + "grad_norm": 44894.52734375, + "learning_rate": 5.093503685174251e-05, + "loss": 2.2228, + "step": 9886 + }, + { + "epoch": 1.8532333645735708, + "grad_norm": 52167.546875, + "learning_rate": 5.0927180303783326e-05, + "loss": 2.2561, + "step": 9887 + }, + { + "epoch": 1.8534208059981254, + "grad_norm": 47833.578125, + "learning_rate": 5.0919323732923984e-05, + "loss": 2.2339, + "step": 9888 + }, + { + "epoch": 1.8536082474226805, + "grad_norm": 51596.17578125, + "learning_rate": 5.091146713935853e-05, + "loss": 2.2142, + "step": 9889 + }, + { + "epoch": 1.8537956888472351, + "grad_norm": 55664.2109375, + "learning_rate": 5.090361052328102e-05, + "loss": 2.2238, + "step": 9890 + }, + { + "epoch": 1.8539831302717902, + "grad_norm": 49367.84375, + "learning_rate": 5.089575388488551e-05, + "loss": 2.2026, + "step": 9891 + }, + { + "epoch": 1.8541705716963448, + "grad_norm": 51863.86328125, + "learning_rate": 5.0887897224366034e-05, + "loss": 2.2269, + "step": 9892 + }, + { + "epoch": 1.8543580131208999, + "grad_norm": 48737.88671875, + "learning_rate": 5.0880040541916653e-05, + "loss": 2.2211, + "step": 9893 + }, + { + "epoch": 1.8545454545454545, + "grad_norm": 51892.58984375, + "learning_rate": 5.087218383773139e-05, + "loss": 2.2584, + "step": 9894 + }, + { + "epoch": 1.8547328959700093, + "grad_norm": 55853.4140625, + "learning_rate": 5.086432711200435e-05, + "loss": 2.2184, + "step": 9895 + }, + { + "epoch": 1.8549203373945642, + "grad_norm": 49310.31640625, + "learning_rate": 5.085647036492953e-05, + "loss": 2.1376, + "step": 9896 + }, + { + "epoch": 1.855107778819119, + "grad_norm": 52429.44140625, + "learning_rate": 5.084861359670101e-05, + "loss": 2.2482, + "step": 9897 + }, + { + "epoch": 1.8552952202436739, + "grad_norm": 52726.20703125, + "learning_rate": 5.084075680751283e-05, + "loss": 2.2203, + "step": 9898 + }, + { + "epoch": 1.8554826616682287, + "grad_norm": 51327.82421875, + "learning_rate": 5.083289999755905e-05, + "loss": 2.2079, + "step": 9899 + }, + { + "epoch": 1.8556701030927836, + "grad_norm": 55735.140625, + "learning_rate": 5.0825043167033725e-05, + "loss": 2.2385, + "step": 9900 + }, + { + "epoch": 1.8558575445173382, + "grad_norm": 49951.296875, + "learning_rate": 5.0817186316130884e-05, + "loss": 2.197, + "step": 9901 + }, + { + "epoch": 1.8560449859418933, + "grad_norm": 50698.8359375, + "learning_rate": 5.080932944504463e-05, + "loss": 2.1829, + "step": 9902 + }, + { + "epoch": 1.856232427366448, + "grad_norm": 50902.73828125, + "learning_rate": 5.080147255396897e-05, + "loss": 2.2283, + "step": 9903 + }, + { + "epoch": 1.856419868791003, + "grad_norm": 47012.609375, + "learning_rate": 5.079361564309797e-05, + "loss": 2.2301, + "step": 9904 + }, + { + "epoch": 1.8566073102155576, + "grad_norm": 49492.70703125, + "learning_rate": 5.078575871262571e-05, + "loss": 2.1898, + "step": 9905 + }, + { + "epoch": 1.8567947516401124, + "grad_norm": 50401.62109375, + "learning_rate": 5.077790176274621e-05, + "loss": 2.2134, + "step": 9906 + }, + { + "epoch": 1.8569821930646673, + "grad_norm": 54183.19140625, + "learning_rate": 5.077004479365356e-05, + "loss": 2.1493, + "step": 9907 + }, + { + "epoch": 1.8571696344892221, + "grad_norm": 52280.5078125, + "learning_rate": 5.076218780554181e-05, + "loss": 2.1919, + "step": 9908 + }, + { + "epoch": 1.857357075913777, + "grad_norm": 52047.20703125, + "learning_rate": 5.075433079860499e-05, + "loss": 2.181, + "step": 9909 + }, + { + "epoch": 1.8575445173383318, + "grad_norm": 52010.95703125, + "learning_rate": 5.0746473773037185e-05, + "loss": 2.294, + "step": 9910 + }, + { + "epoch": 1.8577319587628867, + "grad_norm": 49874.44921875, + "learning_rate": 5.073861672903244e-05, + "loss": 2.1994, + "step": 9911 + }, + { + "epoch": 1.8579194001874413, + "grad_norm": 52057.6328125, + "learning_rate": 5.0730759666784824e-05, + "loss": 2.2488, + "step": 9912 + }, + { + "epoch": 1.8581068416119964, + "grad_norm": 52643.6953125, + "learning_rate": 5.07229025864884e-05, + "loss": 2.1843, + "step": 9913 + }, + { + "epoch": 1.858294283036551, + "grad_norm": 56381.828125, + "learning_rate": 5.0715045488337196e-05, + "loss": 2.1799, + "step": 9914 + }, + { + "epoch": 1.858481724461106, + "grad_norm": 47327.87109375, + "learning_rate": 5.0707188372525316e-05, + "loss": 2.2537, + "step": 9915 + }, + { + "epoch": 1.8586691658856607, + "grad_norm": 52025.375, + "learning_rate": 5.069933123924678e-05, + "loss": 2.2494, + "step": 9916 + }, + { + "epoch": 1.8588566073102155, + "grad_norm": 50097.57421875, + "learning_rate": 5.06914740886957e-05, + "loss": 2.2488, + "step": 9917 + }, + { + "epoch": 1.8590440487347704, + "grad_norm": 51829.87890625, + "learning_rate": 5.068361692106609e-05, + "loss": 2.1881, + "step": 9918 + }, + { + "epoch": 1.8592314901593252, + "grad_norm": 52408.55078125, + "learning_rate": 5.067575973655202e-05, + "loss": 2.1634, + "step": 9919 + }, + { + "epoch": 1.85941893158388, + "grad_norm": 48948.23828125, + "learning_rate": 5.0667902535347566e-05, + "loss": 2.1398, + "step": 9920 + }, + { + "epoch": 1.859606373008435, + "grad_norm": 48658.4296875, + "learning_rate": 5.066004531764679e-05, + "loss": 2.1642, + "step": 9921 + }, + { + "epoch": 1.8597938144329897, + "grad_norm": 49030.734375, + "learning_rate": 5.065218808364376e-05, + "loss": 2.2766, + "step": 9922 + }, + { + "epoch": 1.8599812558575444, + "grad_norm": 51873.1796875, + "learning_rate": 5.064433083353252e-05, + "loss": 2.1661, + "step": 9923 + }, + { + "epoch": 1.8601686972820994, + "grad_norm": 51559.75, + "learning_rate": 5.0636473567507136e-05, + "loss": 2.2182, + "step": 9924 + }, + { + "epoch": 1.860356138706654, + "grad_norm": 50769.2890625, + "learning_rate": 5.0628616285761685e-05, + "loss": 2.2185, + "step": 9925 + }, + { + "epoch": 1.8605435801312091, + "grad_norm": 51091.2734375, + "learning_rate": 5.062075898849023e-05, + "loss": 2.2003, + "step": 9926 + }, + { + "epoch": 1.8607310215557638, + "grad_norm": 53653.0859375, + "learning_rate": 5.0612901675886824e-05, + "loss": 2.2109, + "step": 9927 + }, + { + "epoch": 1.8609184629803186, + "grad_norm": 53834.34765625, + "learning_rate": 5.060504434814556e-05, + "loss": 2.2048, + "step": 9928 + }, + { + "epoch": 1.8611059044048734, + "grad_norm": 53995.39453125, + "learning_rate": 5.0597187005460464e-05, + "loss": 2.3074, + "step": 9929 + }, + { + "epoch": 1.8612933458294283, + "grad_norm": 47542.453125, + "learning_rate": 5.058932964802564e-05, + "loss": 2.2363, + "step": 9930 + }, + { + "epoch": 1.8614807872539831, + "grad_norm": 47085.55078125, + "learning_rate": 5.058147227603512e-05, + "loss": 2.2034, + "step": 9931 + }, + { + "epoch": 1.861668228678538, + "grad_norm": 54316.28515625, + "learning_rate": 5.0573614889683006e-05, + "loss": 2.1838, + "step": 9932 + }, + { + "epoch": 1.8618556701030928, + "grad_norm": 51652.24609375, + "learning_rate": 5.056575748916335e-05, + "loss": 2.1391, + "step": 9933 + }, + { + "epoch": 1.8620431115276475, + "grad_norm": 51516.91015625, + "learning_rate": 5.0557900074670195e-05, + "loss": 2.1833, + "step": 9934 + }, + { + "epoch": 1.8622305529522025, + "grad_norm": 47216.78515625, + "learning_rate": 5.0550042646397646e-05, + "loss": 2.1357, + "step": 9935 + }, + { + "epoch": 1.8624179943767571, + "grad_norm": 50495.84765625, + "learning_rate": 5.054218520453976e-05, + "loss": 2.1809, + "step": 9936 + }, + { + "epoch": 1.8626054358013122, + "grad_norm": 49910.91796875, + "learning_rate": 5.05343277492906e-05, + "loss": 2.1911, + "step": 9937 + }, + { + "epoch": 1.8627928772258668, + "grad_norm": 48783.66796875, + "learning_rate": 5.052647028084424e-05, + "loss": 2.1439, + "step": 9938 + }, + { + "epoch": 1.862980318650422, + "grad_norm": 51137.015625, + "learning_rate": 5.051861279939474e-05, + "loss": 2.2038, + "step": 9939 + }, + { + "epoch": 1.8631677600749765, + "grad_norm": 55258.234375, + "learning_rate": 5.051075530513617e-05, + "loss": 2.1605, + "step": 9940 + }, + { + "epoch": 1.8633552014995314, + "grad_norm": 47305.0234375, + "learning_rate": 5.050289779826262e-05, + "loss": 2.2038, + "step": 9941 + }, + { + "epoch": 1.8635426429240862, + "grad_norm": 47678.75, + "learning_rate": 5.049504027896814e-05, + "loss": 2.2584, + "step": 9942 + }, + { + "epoch": 1.863730084348641, + "grad_norm": 51537.25390625, + "learning_rate": 5.0487182747446814e-05, + "loss": 2.1892, + "step": 9943 + }, + { + "epoch": 1.863917525773196, + "grad_norm": 50252.4921875, + "learning_rate": 5.0479325203892694e-05, + "loss": 2.257, + "step": 9944 + }, + { + "epoch": 1.8641049671977505, + "grad_norm": 55327.44140625, + "learning_rate": 5.047146764849987e-05, + "loss": 2.2575, + "step": 9945 + }, + { + "epoch": 1.8642924086223056, + "grad_norm": 50142.90625, + "learning_rate": 5.046361008146241e-05, + "loss": 2.2462, + "step": 9946 + }, + { + "epoch": 1.8644798500468602, + "grad_norm": 50703.94921875, + "learning_rate": 5.045575250297437e-05, + "loss": 2.2022, + "step": 9947 + }, + { + "epoch": 1.8646672914714153, + "grad_norm": 52607.3515625, + "learning_rate": 5.0447894913229845e-05, + "loss": 2.1492, + "step": 9948 + }, + { + "epoch": 1.86485473289597, + "grad_norm": 52241.04296875, + "learning_rate": 5.0440037312422884e-05, + "loss": 2.2063, + "step": 9949 + }, + { + "epoch": 1.865042174320525, + "grad_norm": 52547.74609375, + "learning_rate": 5.0432179700747597e-05, + "loss": 2.195, + "step": 9950 + }, + { + "epoch": 1.8652296157450796, + "grad_norm": 50188.4296875, + "learning_rate": 5.0424322078398e-05, + "loss": 2.2121, + "step": 9951 + }, + { + "epoch": 1.8654170571696345, + "grad_norm": 50605.0625, + "learning_rate": 5.041646444556823e-05, + "loss": 2.1935, + "step": 9952 + }, + { + "epoch": 1.8656044985941893, + "grad_norm": 49988.625, + "learning_rate": 5.0408606802452296e-05, + "loss": 2.1966, + "step": 9953 + }, + { + "epoch": 1.8657919400187442, + "grad_norm": 55190.1015625, + "learning_rate": 5.040074914924432e-05, + "loss": 2.2641, + "step": 9954 + }, + { + "epoch": 1.865979381443299, + "grad_norm": 49533.51953125, + "learning_rate": 5.0392891486138363e-05, + "loss": 2.2038, + "step": 9955 + }, + { + "epoch": 1.8661668228678538, + "grad_norm": 53438.2578125, + "learning_rate": 5.0385033813328486e-05, + "loss": 2.2224, + "step": 9956 + }, + { + "epoch": 1.8663542642924087, + "grad_norm": 56109.8125, + "learning_rate": 5.0377176131008774e-05, + "loss": 2.2503, + "step": 9957 + }, + { + "epoch": 1.8665417057169633, + "grad_norm": 52587.90234375, + "learning_rate": 5.036931843937331e-05, + "loss": 2.194, + "step": 9958 + }, + { + "epoch": 1.8667291471415184, + "grad_norm": 54515.09375, + "learning_rate": 5.036146073861615e-05, + "loss": 2.1901, + "step": 9959 + }, + { + "epoch": 1.866916588566073, + "grad_norm": 49541.58203125, + "learning_rate": 5.035360302893139e-05, + "loss": 2.259, + "step": 9960 + }, + { + "epoch": 1.867104029990628, + "grad_norm": 48666.55859375, + "learning_rate": 5.034574531051308e-05, + "loss": 2.24, + "step": 9961 + }, + { + "epoch": 1.8672914714151827, + "grad_norm": 52173.79296875, + "learning_rate": 5.033788758355532e-05, + "loss": 2.2638, + "step": 9962 + }, + { + "epoch": 1.8674789128397375, + "grad_norm": 53699.5859375, + "learning_rate": 5.033002984825217e-05, + "loss": 2.1671, + "step": 9963 + }, + { + "epoch": 1.8676663542642924, + "grad_norm": 47828.2890625, + "learning_rate": 5.03221721047977e-05, + "loss": 2.2198, + "step": 9964 + }, + { + "epoch": 1.8678537956888472, + "grad_norm": 48982.71875, + "learning_rate": 5.031431435338603e-05, + "loss": 2.215, + "step": 9965 + }, + { + "epoch": 1.868041237113402, + "grad_norm": 49810.98046875, + "learning_rate": 5.030645659421117e-05, + "loss": 2.2228, + "step": 9966 + }, + { + "epoch": 1.868228678537957, + "grad_norm": 50774.8671875, + "learning_rate": 5.029859882746723e-05, + "loss": 2.2115, + "step": 9967 + }, + { + "epoch": 1.8684161199625118, + "grad_norm": 48034.54296875, + "learning_rate": 5.02907410533483e-05, + "loss": 2.2072, + "step": 9968 + }, + { + "epoch": 1.8686035613870664, + "grad_norm": 53575.390625, + "learning_rate": 5.028288327204844e-05, + "loss": 2.2053, + "step": 9969 + }, + { + "epoch": 1.8687910028116215, + "grad_norm": 52691.578125, + "learning_rate": 5.027502548376173e-05, + "loss": 2.2038, + "step": 9970 + }, + { + "epoch": 1.868978444236176, + "grad_norm": 51115.07421875, + "learning_rate": 5.026716768868225e-05, + "loss": 2.225, + "step": 9971 + }, + { + "epoch": 1.8691658856607312, + "grad_norm": 55030.6953125, + "learning_rate": 5.025930988700406e-05, + "loss": 2.103, + "step": 9972 + }, + { + "epoch": 1.8693533270852858, + "grad_norm": 51964.859375, + "learning_rate": 5.025145207892128e-05, + "loss": 2.2245, + "step": 9973 + }, + { + "epoch": 1.8695407685098406, + "grad_norm": 47875.6328125, + "learning_rate": 5.0243594264627924e-05, + "loss": 2.1679, + "step": 9974 + }, + { + "epoch": 1.8697282099343955, + "grad_norm": 52530.94140625, + "learning_rate": 5.0235736444318135e-05, + "loss": 2.2339, + "step": 9975 + }, + { + "epoch": 1.8699156513589503, + "grad_norm": 50329.16796875, + "learning_rate": 5.022787861818594e-05, + "loss": 2.2666, + "step": 9976 + }, + { + "epoch": 1.8701030927835052, + "grad_norm": 50043.92578125, + "learning_rate": 5.022002078642546e-05, + "loss": 2.1947, + "step": 9977 + }, + { + "epoch": 1.87029053420806, + "grad_norm": 50652.33984375, + "learning_rate": 5.021216294923075e-05, + "loss": 2.1649, + "step": 9978 + }, + { + "epoch": 1.8704779756326149, + "grad_norm": 59436.94140625, + "learning_rate": 5.0204305106795855e-05, + "loss": 2.2761, + "step": 9979 + }, + { + "epoch": 1.8706654170571695, + "grad_norm": 49544.796875, + "learning_rate": 5.019644725931493e-05, + "loss": 2.2212, + "step": 9980 + }, + { + "epoch": 1.8708528584817246, + "grad_norm": 51542.6875, + "learning_rate": 5.0188589406982e-05, + "loss": 2.2475, + "step": 9981 + }, + { + "epoch": 1.8710402999062792, + "grad_norm": 50131.30078125, + "learning_rate": 5.018073154999115e-05, + "loss": 2.1825, + "step": 9982 + }, + { + "epoch": 1.8712277413308342, + "grad_norm": 47850.4765625, + "learning_rate": 5.017287368853646e-05, + "loss": 2.1882, + "step": 9983 + }, + { + "epoch": 1.8714151827553889, + "grad_norm": 53954.52734375, + "learning_rate": 5.0165015822812026e-05, + "loss": 2.2925, + "step": 9984 + }, + { + "epoch": 1.8716026241799437, + "grad_norm": 51012.9609375, + "learning_rate": 5.015715795301191e-05, + "loss": 2.3758, + "step": 9985 + }, + { + "epoch": 1.8717900656044986, + "grad_norm": 50442.08203125, + "learning_rate": 5.014930007933021e-05, + "loss": 2.2041, + "step": 9986 + }, + { + "epoch": 1.8719775070290534, + "grad_norm": 51680.7578125, + "learning_rate": 5.014144220196096e-05, + "loss": 2.1683, + "step": 9987 + }, + { + "epoch": 1.8721649484536083, + "grad_norm": 51579.765625, + "learning_rate": 5.0133584321098305e-05, + "loss": 2.1854, + "step": 9988 + }, + { + "epoch": 1.872352389878163, + "grad_norm": 47834.87890625, + "learning_rate": 5.012572643693626e-05, + "loss": 2.1735, + "step": 9989 + }, + { + "epoch": 1.872539831302718, + "grad_norm": 55574.83984375, + "learning_rate": 5.0117868549668955e-05, + "loss": 2.2529, + "step": 9990 + }, + { + "epoch": 1.8727272727272726, + "grad_norm": 57281.3125, + "learning_rate": 5.011001065949045e-05, + "loss": 2.1522, + "step": 9991 + }, + { + "epoch": 1.8729147141518276, + "grad_norm": 50714.7890625, + "learning_rate": 5.0102152766594814e-05, + "loss": 2.1711, + "step": 9992 + }, + { + "epoch": 1.8731021555763823, + "grad_norm": 51066.26171875, + "learning_rate": 5.0094294871176154e-05, + "loss": 2.2249, + "step": 9993 + }, + { + "epoch": 1.8732895970009373, + "grad_norm": 51691.4375, + "learning_rate": 5.0086436973428505e-05, + "loss": 2.1611, + "step": 9994 + }, + { + "epoch": 1.873477038425492, + "grad_norm": 49406.51953125, + "learning_rate": 5.007857907354601e-05, + "loss": 2.2021, + "step": 9995 + }, + { + "epoch": 1.873664479850047, + "grad_norm": 51959.8125, + "learning_rate": 5.007072117172269e-05, + "loss": 2.2766, + "step": 9996 + }, + { + "epoch": 1.8738519212746017, + "grad_norm": 50986.1171875, + "learning_rate": 5.006286326815265e-05, + "loss": 2.2693, + "step": 9997 + }, + { + "epoch": 1.8740393626991565, + "grad_norm": 51481.32421875, + "learning_rate": 5.0055005363029974e-05, + "loss": 2.2952, + "step": 9998 + }, + { + "epoch": 1.8742268041237113, + "grad_norm": 54896.9765625, + "learning_rate": 5.0047147456548725e-05, + "loss": 2.2364, + "step": 9999 + }, + { + "epoch": 1.8744142455482662, + "grad_norm": 49314.9375, + "learning_rate": 5.0039289548903015e-05, + "loss": 2.236, + "step": 10000 + }, + { + "epoch": 1.8744142455482662, + "eval_loss": 2.28055739402771, + "eval_runtime": 129.1114, + "eval_samples_per_second": 39.106, + "eval_steps_per_second": 1.96, + "step": 10000 + }, + { + "epoch": 1.874601686972821, + "grad_norm": 52091.12109375, + "learning_rate": 5.00314316402869e-05, + "loss": 2.202, + "step": 10001 + }, + { + "epoch": 1.8747891283973757, + "grad_norm": 51244.0078125, + "learning_rate": 5.002357373089444e-05, + "loss": 2.2051, + "step": 10002 + }, + { + "epoch": 1.8749765698219307, + "grad_norm": 48493.7421875, + "learning_rate": 5.001571582091977e-05, + "loss": 2.2438, + "step": 10003 + }, + { + "epoch": 1.8751640112464854, + "grad_norm": 51672.55078125, + "learning_rate": 5.0007857910556924e-05, + "loss": 2.281, + "step": 10004 + }, + { + "epoch": 1.8753514526710404, + "grad_norm": 50810.3671875, + "learning_rate": 5e-05, + "loss": 2.2019, + "step": 10005 + }, + { + "epoch": 1.875538894095595, + "grad_norm": 52030.5234375, + "learning_rate": 4.999214208944308e-05, + "loss": 2.2578, + "step": 10006 + }, + { + "epoch": 1.8757263355201501, + "grad_norm": 46866.046875, + "learning_rate": 4.9984284179080246e-05, + "loss": 2.2641, + "step": 10007 + }, + { + "epoch": 1.8759137769447047, + "grad_norm": 52347.7109375, + "learning_rate": 4.997642626910556e-05, + "loss": 2.2365, + "step": 10008 + }, + { + "epoch": 1.8761012183692596, + "grad_norm": 47214.328125, + "learning_rate": 4.9968568359713114e-05, + "loss": 2.2175, + "step": 10009 + }, + { + "epoch": 1.8762886597938144, + "grad_norm": 49156.2265625, + "learning_rate": 4.996071045109699e-05, + "loss": 2.1354, + "step": 10010 + }, + { + "epoch": 1.8764761012183693, + "grad_norm": 51808.47265625, + "learning_rate": 4.995285254345128e-05, + "loss": 2.1857, + "step": 10011 + }, + { + "epoch": 1.8766635426429241, + "grad_norm": 54516.94140625, + "learning_rate": 4.994499463697004e-05, + "loss": 2.2112, + "step": 10012 + }, + { + "epoch": 1.876850984067479, + "grad_norm": 46925.53125, + "learning_rate": 4.993713673184736e-05, + "loss": 2.264, + "step": 10013 + }, + { + "epoch": 1.8770384254920338, + "grad_norm": 51234.9609375, + "learning_rate": 4.992927882827732e-05, + "loss": 2.1655, + "step": 10014 + }, + { + "epoch": 1.8772258669165884, + "grad_norm": 51082.84765625, + "learning_rate": 4.992142092645402e-05, + "loss": 2.1926, + "step": 10015 + }, + { + "epoch": 1.8774133083411435, + "grad_norm": 49930.828125, + "learning_rate": 4.99135630265715e-05, + "loss": 2.2615, + "step": 10016 + }, + { + "epoch": 1.8776007497656981, + "grad_norm": 55018.57421875, + "learning_rate": 4.990570512882386e-05, + "loss": 2.1713, + "step": 10017 + }, + { + "epoch": 1.8777881911902532, + "grad_norm": 50737.23828125, + "learning_rate": 4.9897847233405184e-05, + "loss": 2.2044, + "step": 10018 + }, + { + "epoch": 1.8779756326148078, + "grad_norm": 50550.6171875, + "learning_rate": 4.9889989340509564e-05, + "loss": 2.2106, + "step": 10019 + }, + { + "epoch": 1.8781630740393627, + "grad_norm": 49914.32421875, + "learning_rate": 4.9882131450331056e-05, + "loss": 2.2028, + "step": 10020 + }, + { + "epoch": 1.8783505154639175, + "grad_norm": 49852.6328125, + "learning_rate": 4.987427356306374e-05, + "loss": 2.2236, + "step": 10021 + }, + { + "epoch": 1.8785379568884724, + "grad_norm": 55286.23046875, + "learning_rate": 4.986641567890172e-05, + "loss": 2.195, + "step": 10022 + }, + { + "epoch": 1.8787253983130272, + "grad_norm": 59031.1328125, + "learning_rate": 4.985855779803905e-05, + "loss": 2.0428, + "step": 10023 + }, + { + "epoch": 1.878912839737582, + "grad_norm": 48031.94921875, + "learning_rate": 4.985069992066981e-05, + "loss": 2.2355, + "step": 10024 + }, + { + "epoch": 1.879100281162137, + "grad_norm": 53585.02734375, + "learning_rate": 4.9842842046988094e-05, + "loss": 2.2497, + "step": 10025 + }, + { + "epoch": 1.8792877225866915, + "grad_norm": 48788.578125, + "learning_rate": 4.9834984177187986e-05, + "loss": 2.1975, + "step": 10026 + }, + { + "epoch": 1.8794751640112466, + "grad_norm": 53967.76953125, + "learning_rate": 4.982712631146354e-05, + "loss": 2.2459, + "step": 10027 + }, + { + "epoch": 1.8796626054358012, + "grad_norm": 49912.25, + "learning_rate": 4.9819268450008864e-05, + "loss": 2.2011, + "step": 10028 + }, + { + "epoch": 1.8798500468603563, + "grad_norm": 51288.25, + "learning_rate": 4.9811410593018015e-05, + "loss": 2.1673, + "step": 10029 + }, + { + "epoch": 1.880037488284911, + "grad_norm": 50160.60546875, + "learning_rate": 4.980355274068509e-05, + "loss": 2.1929, + "step": 10030 + }, + { + "epoch": 1.8802249297094658, + "grad_norm": 52076.75, + "learning_rate": 4.979569489320415e-05, + "loss": 2.2015, + "step": 10031 + }, + { + "epoch": 1.8804123711340206, + "grad_norm": 52073.140625, + "learning_rate": 4.9787837050769263e-05, + "loss": 2.1775, + "step": 10032 + }, + { + "epoch": 1.8805998125585754, + "grad_norm": 52312.93359375, + "learning_rate": 4.977997921357457e-05, + "loss": 2.2799, + "step": 10033 + }, + { + "epoch": 1.8807872539831303, + "grad_norm": 50816.93359375, + "learning_rate": 4.977212138181406e-05, + "loss": 2.2242, + "step": 10034 + }, + { + "epoch": 1.8809746954076851, + "grad_norm": 49395.91015625, + "learning_rate": 4.9764263555681877e-05, + "loss": 2.2218, + "step": 10035 + }, + { + "epoch": 1.88116213683224, + "grad_norm": 53887.87890625, + "learning_rate": 4.9756405735372074e-05, + "loss": 2.2607, + "step": 10036 + }, + { + "epoch": 1.8813495782567946, + "grad_norm": 46086.6640625, + "learning_rate": 4.9748547921078746e-05, + "loss": 2.2615, + "step": 10037 + }, + { + "epoch": 1.8815370196813497, + "grad_norm": 52899.828125, + "learning_rate": 4.9740690112995946e-05, + "loss": 2.2373, + "step": 10038 + }, + { + "epoch": 1.8817244611059043, + "grad_norm": 47877.52734375, + "learning_rate": 4.9732832311317765e-05, + "loss": 2.2114, + "step": 10039 + }, + { + "epoch": 1.8819119025304594, + "grad_norm": 49795.40234375, + "learning_rate": 4.972497451623827e-05, + "loss": 2.2244, + "step": 10040 + }, + { + "epoch": 1.882099343955014, + "grad_norm": 50960.93359375, + "learning_rate": 4.9717116727951576e-05, + "loss": 2.1968, + "step": 10041 + }, + { + "epoch": 1.8822867853795688, + "grad_norm": 51963.30859375, + "learning_rate": 4.9709258946651706e-05, + "loss": 2.1869, + "step": 10042 + }, + { + "epoch": 1.8824742268041237, + "grad_norm": 53382.2578125, + "learning_rate": 4.970140117253277e-05, + "loss": 2.2785, + "step": 10043 + }, + { + "epoch": 1.8826616682286785, + "grad_norm": 50649.578125, + "learning_rate": 4.969354340578884e-05, + "loss": 2.2659, + "step": 10044 + }, + { + "epoch": 1.8828491096532334, + "grad_norm": 47309.265625, + "learning_rate": 4.9685685646614e-05, + "loss": 2.2156, + "step": 10045 + }, + { + "epoch": 1.8830365510777882, + "grad_norm": 54281.5625, + "learning_rate": 4.96778278952023e-05, + "loss": 2.2877, + "step": 10046 + }, + { + "epoch": 1.883223992502343, + "grad_norm": 51766.07421875, + "learning_rate": 4.966997015174783e-05, + "loss": 2.1455, + "step": 10047 + }, + { + "epoch": 1.8834114339268977, + "grad_norm": 49636.421875, + "learning_rate": 4.9662112416444704e-05, + "loss": 2.1895, + "step": 10048 + }, + { + "epoch": 1.8835988753514528, + "grad_norm": 53243.37109375, + "learning_rate": 4.9654254689486924e-05, + "loss": 2.1326, + "step": 10049 + }, + { + "epoch": 1.8837863167760074, + "grad_norm": 51903.3046875, + "learning_rate": 4.964639697106862e-05, + "loss": 2.1932, + "step": 10050 + }, + { + "epoch": 1.8839737582005625, + "grad_norm": 52820.34765625, + "learning_rate": 4.9638539261383847e-05, + "loss": 2.2166, + "step": 10051 + }, + { + "epoch": 1.884161199625117, + "grad_norm": 51377.8125, + "learning_rate": 4.963068156062671e-05, + "loss": 2.2046, + "step": 10052 + }, + { + "epoch": 1.884348641049672, + "grad_norm": 50044.76953125, + "learning_rate": 4.962282386899124e-05, + "loss": 2.2152, + "step": 10053 + }, + { + "epoch": 1.8845360824742268, + "grad_norm": 48754.21484375, + "learning_rate": 4.9614966186671525e-05, + "loss": 2.1656, + "step": 10054 + }, + { + "epoch": 1.8847235238987816, + "grad_norm": 60119.75, + "learning_rate": 4.960710851386164e-05, + "loss": 2.3553, + "step": 10055 + }, + { + "epoch": 1.8849109653233365, + "grad_norm": 51959.16015625, + "learning_rate": 4.95992508507557e-05, + "loss": 2.1827, + "step": 10056 + }, + { + "epoch": 1.8850984067478913, + "grad_norm": 52436.96875, + "learning_rate": 4.959139319754771e-05, + "loss": 2.2608, + "step": 10057 + }, + { + "epoch": 1.8852858481724462, + "grad_norm": 50942.1953125, + "learning_rate": 4.9583535554431785e-05, + "loss": 2.2499, + "step": 10058 + }, + { + "epoch": 1.8854732895970008, + "grad_norm": 60746.2578125, + "learning_rate": 4.9575677921602e-05, + "loss": 2.2958, + "step": 10059 + }, + { + "epoch": 1.8856607310215558, + "grad_norm": 50137.453125, + "learning_rate": 4.956782029925243e-05, + "loss": 2.2577, + "step": 10060 + }, + { + "epoch": 1.8858481724461105, + "grad_norm": 51575.2109375, + "learning_rate": 4.955996268757712e-05, + "loss": 2.1995, + "step": 10061 + }, + { + "epoch": 1.8860356138706655, + "grad_norm": 51361.125, + "learning_rate": 4.955210508677015e-05, + "loss": 2.2258, + "step": 10062 + }, + { + "epoch": 1.8862230552952202, + "grad_norm": 51219.390625, + "learning_rate": 4.9544247497025646e-05, + "loss": 2.1944, + "step": 10063 + }, + { + "epoch": 1.8864104967197752, + "grad_norm": 50370.6484375, + "learning_rate": 4.9536389918537604e-05, + "loss": 2.2341, + "step": 10064 + }, + { + "epoch": 1.8865979381443299, + "grad_norm": 47874.4609375, + "learning_rate": 4.952853235150014e-05, + "loss": 2.2578, + "step": 10065 + }, + { + "epoch": 1.8867853795688847, + "grad_norm": 53601.3125, + "learning_rate": 4.952067479610731e-05, + "loss": 2.0953, + "step": 10066 + }, + { + "epoch": 1.8869728209934395, + "grad_norm": 52830.87109375, + "learning_rate": 4.9512817252553205e-05, + "loss": 2.2806, + "step": 10067 + }, + { + "epoch": 1.8871602624179944, + "grad_norm": 51833.328125, + "learning_rate": 4.950495972103187e-05, + "loss": 2.1971, + "step": 10068 + }, + { + "epoch": 1.8873477038425492, + "grad_norm": 47456.625, + "learning_rate": 4.949710220173739e-05, + "loss": 2.1967, + "step": 10069 + }, + { + "epoch": 1.8875351452671039, + "grad_norm": 56461.24609375, + "learning_rate": 4.9489244694863825e-05, + "loss": 2.2204, + "step": 10070 + }, + { + "epoch": 1.887722586691659, + "grad_norm": 49189.7578125, + "learning_rate": 4.948138720060528e-05, + "loss": 2.1657, + "step": 10071 + }, + { + "epoch": 1.8879100281162136, + "grad_norm": 50235.70703125, + "learning_rate": 4.947352971915577e-05, + "loss": 2.2009, + "step": 10072 + }, + { + "epoch": 1.8880974695407686, + "grad_norm": 50463.5625, + "learning_rate": 4.9465672250709406e-05, + "loss": 2.2136, + "step": 10073 + }, + { + "epoch": 1.8882849109653232, + "grad_norm": 52556.0, + "learning_rate": 4.9457814795460246e-05, + "loss": 2.2402, + "step": 10074 + }, + { + "epoch": 1.8884723523898783, + "grad_norm": 48908.140625, + "learning_rate": 4.944995735360236e-05, + "loss": 2.26, + "step": 10075 + }, + { + "epoch": 1.888659793814433, + "grad_norm": 54571.484375, + "learning_rate": 4.944209992532981e-05, + "loss": 2.2649, + "step": 10076 + }, + { + "epoch": 1.8888472352389878, + "grad_norm": 49016.66796875, + "learning_rate": 4.943424251083667e-05, + "loss": 2.2639, + "step": 10077 + }, + { + "epoch": 1.8890346766635426, + "grad_norm": 50326.7265625, + "learning_rate": 4.942638511031701e-05, + "loss": 2.265, + "step": 10078 + }, + { + "epoch": 1.8892221180880975, + "grad_norm": 51595.43359375, + "learning_rate": 4.941852772396489e-05, + "loss": 2.2591, + "step": 10079 + }, + { + "epoch": 1.8894095595126523, + "grad_norm": 50506.09375, + "learning_rate": 4.941067035197437e-05, + "loss": 2.1783, + "step": 10080 + }, + { + "epoch": 1.8895970009372072, + "grad_norm": 51135.30078125, + "learning_rate": 4.940281299453954e-05, + "loss": 2.2593, + "step": 10081 + }, + { + "epoch": 1.889784442361762, + "grad_norm": 53468.29296875, + "learning_rate": 4.939495565185446e-05, + "loss": 2.2023, + "step": 10082 + }, + { + "epoch": 1.8899718837863166, + "grad_norm": 51866.18359375, + "learning_rate": 4.938709832411319e-05, + "loss": 2.2614, + "step": 10083 + }, + { + "epoch": 1.8901593252108717, + "grad_norm": 50977.1484375, + "learning_rate": 4.937924101150978e-05, + "loss": 2.1889, + "step": 10084 + }, + { + "epoch": 1.8903467666354263, + "grad_norm": 52182.1328125, + "learning_rate": 4.937138371423831e-05, + "loss": 2.2294, + "step": 10085 + }, + { + "epoch": 1.8905342080599814, + "grad_norm": 50787.15234375, + "learning_rate": 4.936352643249288e-05, + "loss": 2.2225, + "step": 10086 + }, + { + "epoch": 1.890721649484536, + "grad_norm": 46916.96484375, + "learning_rate": 4.9355669166467495e-05, + "loss": 2.2631, + "step": 10087 + }, + { + "epoch": 1.8909090909090909, + "grad_norm": 51291.328125, + "learning_rate": 4.934781191635625e-05, + "loss": 2.2736, + "step": 10088 + }, + { + "epoch": 1.8910965323336457, + "grad_norm": 51956.421875, + "learning_rate": 4.9339954682353214e-05, + "loss": 2.1694, + "step": 10089 + }, + { + "epoch": 1.8912839737582006, + "grad_norm": 51469.26953125, + "learning_rate": 4.933209746465244e-05, + "loss": 2.2178, + "step": 10090 + }, + { + "epoch": 1.8914714151827554, + "grad_norm": 51338.08984375, + "learning_rate": 4.932424026344798e-05, + "loss": 2.1637, + "step": 10091 + }, + { + "epoch": 1.8916588566073103, + "grad_norm": 49947.875, + "learning_rate": 4.9316383078933924e-05, + "loss": 2.2348, + "step": 10092 + }, + { + "epoch": 1.891846298031865, + "grad_norm": 49222.12109375, + "learning_rate": 4.930852591130433e-05, + "loss": 2.2578, + "step": 10093 + }, + { + "epoch": 1.8920337394564197, + "grad_norm": 53122.6328125, + "learning_rate": 4.930066876075323e-05, + "loss": 2.2315, + "step": 10094 + }, + { + "epoch": 1.8922211808809748, + "grad_norm": 57037.40625, + "learning_rate": 4.9292811627474696e-05, + "loss": 2.1924, + "step": 10095 + }, + { + "epoch": 1.8924086223055294, + "grad_norm": 50000.15625, + "learning_rate": 4.928495451166281e-05, + "loss": 2.1945, + "step": 10096 + }, + { + "epoch": 1.8925960637300845, + "grad_norm": 58099.0078125, + "learning_rate": 4.927709741351163e-05, + "loss": 2.2318, + "step": 10097 + }, + { + "epoch": 1.8927835051546391, + "grad_norm": 47856.3671875, + "learning_rate": 4.926924033321519e-05, + "loss": 2.2381, + "step": 10098 + }, + { + "epoch": 1.892970946579194, + "grad_norm": 51613.171875, + "learning_rate": 4.926138327096758e-05, + "loss": 2.2427, + "step": 10099 + }, + { + "epoch": 1.8931583880037488, + "grad_norm": 49787.734375, + "learning_rate": 4.925352622696282e-05, + "loss": 2.1897, + "step": 10100 + }, + { + "epoch": 1.8933458294283037, + "grad_norm": 52957.4609375, + "learning_rate": 4.9245669201395036e-05, + "loss": 2.2433, + "step": 10101 + }, + { + "epoch": 1.8935332708528585, + "grad_norm": 53159.6640625, + "learning_rate": 4.923781219445821e-05, + "loss": 2.1842, + "step": 10102 + }, + { + "epoch": 1.8937207122774133, + "grad_norm": 49759.76953125, + "learning_rate": 4.9229955206346447e-05, + "loss": 2.146, + "step": 10103 + }, + { + "epoch": 1.8939081537019682, + "grad_norm": 51927.22265625, + "learning_rate": 4.92220982372538e-05, + "loss": 2.2213, + "step": 10104 + }, + { + "epoch": 1.8940955951265228, + "grad_norm": 49520.64453125, + "learning_rate": 4.921424128737431e-05, + "loss": 2.1634, + "step": 10105 + }, + { + "epoch": 1.8942830365510779, + "grad_norm": 51770.19140625, + "learning_rate": 4.9206384356902035e-05, + "loss": 2.2403, + "step": 10106 + }, + { + "epoch": 1.8944704779756325, + "grad_norm": 50803.296875, + "learning_rate": 4.9198527446031044e-05, + "loss": 2.2364, + "step": 10107 + }, + { + "epoch": 1.8946579194001876, + "grad_norm": 57798.1328125, + "learning_rate": 4.9190670554955395e-05, + "loss": 2.2003, + "step": 10108 + }, + { + "epoch": 1.8948453608247422, + "grad_norm": 51916.30078125, + "learning_rate": 4.918281368386912e-05, + "loss": 2.1587, + "step": 10109 + }, + { + "epoch": 1.895032802249297, + "grad_norm": 54843.66015625, + "learning_rate": 4.917495683296629e-05, + "loss": 2.2164, + "step": 10110 + }, + { + "epoch": 1.895220243673852, + "grad_norm": 50391.484375, + "learning_rate": 4.916710000244095e-05, + "loss": 2.2037, + "step": 10111 + }, + { + "epoch": 1.8954076850984067, + "grad_norm": 52510.4921875, + "learning_rate": 4.915924319248718e-05, + "loss": 2.2679, + "step": 10112 + }, + { + "epoch": 1.8955951265229616, + "grad_norm": 51460.21484375, + "learning_rate": 4.9151386403299e-05, + "loss": 2.2105, + "step": 10113 + }, + { + "epoch": 1.8957825679475164, + "grad_norm": 48526.4765625, + "learning_rate": 4.914352963507048e-05, + "loss": 2.2914, + "step": 10114 + }, + { + "epoch": 1.8959700093720713, + "grad_norm": 50559.765625, + "learning_rate": 4.913567288799565e-05, + "loss": 2.2183, + "step": 10115 + }, + { + "epoch": 1.896157450796626, + "grad_norm": 52129.578125, + "learning_rate": 4.9127816162268614e-05, + "loss": 2.2358, + "step": 10116 + }, + { + "epoch": 1.896344892221181, + "grad_norm": 48590.703125, + "learning_rate": 4.911995945808336e-05, + "loss": 2.251, + "step": 10117 + }, + { + "epoch": 1.8965323336457356, + "grad_norm": 51273.8125, + "learning_rate": 4.911210277563397e-05, + "loss": 2.2287, + "step": 10118 + }, + { + "epoch": 1.8967197750702907, + "grad_norm": 51660.8671875, + "learning_rate": 4.9104246115114504e-05, + "loss": 2.3387, + "step": 10119 + }, + { + "epoch": 1.8969072164948453, + "grad_norm": 52435.00390625, + "learning_rate": 4.909638947671898e-05, + "loss": 2.2072, + "step": 10120 + }, + { + "epoch": 1.8970946579194004, + "grad_norm": 48509.8046875, + "learning_rate": 4.908853286064148e-05, + "loss": 2.1605, + "step": 10121 + }, + { + "epoch": 1.897282099343955, + "grad_norm": 53411.73828125, + "learning_rate": 4.908067626707603e-05, + "loss": 2.2108, + "step": 10122 + }, + { + "epoch": 1.8974695407685098, + "grad_norm": 50618.4765625, + "learning_rate": 4.907281969621669e-05, + "loss": 2.264, + "step": 10123 + }, + { + "epoch": 1.8976569821930647, + "grad_norm": 49169.26171875, + "learning_rate": 4.9064963148257496e-05, + "loss": 2.2027, + "step": 10124 + }, + { + "epoch": 1.8978444236176195, + "grad_norm": 62780.4765625, + "learning_rate": 4.905710662339248e-05, + "loss": 2.1834, + "step": 10125 + }, + { + "epoch": 1.8980318650421744, + "grad_norm": 47859.3046875, + "learning_rate": 4.904925012181573e-05, + "loss": 2.1781, + "step": 10126 + }, + { + "epoch": 1.898219306466729, + "grad_norm": 56934.37890625, + "learning_rate": 4.904139364372128e-05, + "loss": 2.1702, + "step": 10127 + }, + { + "epoch": 1.898406747891284, + "grad_norm": 48106.3203125, + "learning_rate": 4.903353718930315e-05, + "loss": 2.2204, + "step": 10128 + }, + { + "epoch": 1.8985941893158387, + "grad_norm": 50667.3828125, + "learning_rate": 4.9025680758755396e-05, + "loss": 2.1948, + "step": 10129 + }, + { + "epoch": 1.8987816307403937, + "grad_norm": 52072.4453125, + "learning_rate": 4.9017824352272074e-05, + "loss": 2.1698, + "step": 10130 + }, + { + "epoch": 1.8989690721649484, + "grad_norm": 44447.30078125, + "learning_rate": 4.900996797004721e-05, + "loss": 2.2054, + "step": 10131 + }, + { + "epoch": 1.8991565135895034, + "grad_norm": 51758.765625, + "learning_rate": 4.9002111612274845e-05, + "loss": 2.2134, + "step": 10132 + }, + { + "epoch": 1.899343955014058, + "grad_norm": 47248.53515625, + "learning_rate": 4.8994255279149035e-05, + "loss": 2.1935, + "step": 10133 + }, + { + "epoch": 1.899531396438613, + "grad_norm": 53170.90234375, + "learning_rate": 4.8986398970863835e-05, + "loss": 2.1787, + "step": 10134 + }, + { + "epoch": 1.8997188378631678, + "grad_norm": 53982.92578125, + "learning_rate": 4.8978542687613236e-05, + "loss": 2.3063, + "step": 10135 + }, + { + "epoch": 1.8999062792877226, + "grad_norm": 52182.4609375, + "learning_rate": 4.897068642959132e-05, + "loss": 2.2021, + "step": 10136 + }, + { + "epoch": 1.9000937207122774, + "grad_norm": 51872.3125, + "learning_rate": 4.8962830196992115e-05, + "loss": 2.2178, + "step": 10137 + }, + { + "epoch": 1.9002811621368323, + "grad_norm": 57010.46875, + "learning_rate": 4.895497399000967e-05, + "loss": 2.3626, + "step": 10138 + }, + { + "epoch": 1.9004686035613871, + "grad_norm": 50055.66015625, + "learning_rate": 4.8947117808838007e-05, + "loss": 2.2065, + "step": 10139 + }, + { + "epoch": 1.9006560449859418, + "grad_norm": 46626.4140625, + "learning_rate": 4.8939261653671156e-05, + "loss": 2.1938, + "step": 10140 + }, + { + "epoch": 1.9008434864104968, + "grad_norm": 53088.19921875, + "learning_rate": 4.893140552470317e-05, + "loss": 2.1923, + "step": 10141 + }, + { + "epoch": 1.9010309278350515, + "grad_norm": 49652.08203125, + "learning_rate": 4.89235494221281e-05, + "loss": 2.1949, + "step": 10142 + }, + { + "epoch": 1.9012183692596065, + "grad_norm": 56720.1796875, + "learning_rate": 4.891569334613995e-05, + "loss": 2.2475, + "step": 10143 + }, + { + "epoch": 1.9014058106841611, + "grad_norm": 59438.45703125, + "learning_rate": 4.890783729693277e-05, + "loss": 2.2311, + "step": 10144 + }, + { + "epoch": 1.901593252108716, + "grad_norm": 50128.69140625, + "learning_rate": 4.88999812747006e-05, + "loss": 2.202, + "step": 10145 + }, + { + "epoch": 1.9017806935332708, + "grad_norm": 49632.32421875, + "learning_rate": 4.8892125279637466e-05, + "loss": 2.2672, + "step": 10146 + }, + { + "epoch": 1.9019681349578257, + "grad_norm": 56631.82421875, + "learning_rate": 4.888426931193739e-05, + "loss": 2.2346, + "step": 10147 + }, + { + "epoch": 1.9021555763823805, + "grad_norm": 50623.46484375, + "learning_rate": 4.887641337179441e-05, + "loss": 2.2267, + "step": 10148 + }, + { + "epoch": 1.9023430178069354, + "grad_norm": 49200.6171875, + "learning_rate": 4.8868557459402594e-05, + "loss": 2.2496, + "step": 10149 + }, + { + "epoch": 1.9025304592314902, + "grad_norm": 50757.0625, + "learning_rate": 4.886070157495591e-05, + "loss": 2.1696, + "step": 10150 + }, + { + "epoch": 1.9027179006560448, + "grad_norm": 47611.0859375, + "learning_rate": 4.8852845718648425e-05, + "loss": 2.2195, + "step": 10151 + }, + { + "epoch": 1.9029053420806, + "grad_norm": 48087.609375, + "learning_rate": 4.884498989067417e-05, + "loss": 2.2559, + "step": 10152 + }, + { + "epoch": 1.9030927835051545, + "grad_norm": 53344.4140625, + "learning_rate": 4.8837134091227176e-05, + "loss": 2.1825, + "step": 10153 + }, + { + "epoch": 1.9032802249297096, + "grad_norm": 51204.38671875, + "learning_rate": 4.8829278320501455e-05, + "loss": 2.2344, + "step": 10154 + }, + { + "epoch": 1.9034676663542642, + "grad_norm": 52584.1796875, + "learning_rate": 4.8821422578691026e-05, + "loss": 2.181, + "step": 10155 + }, + { + "epoch": 1.903655107778819, + "grad_norm": 46517.78515625, + "learning_rate": 4.881356686598995e-05, + "loss": 2.1357, + "step": 10156 + }, + { + "epoch": 1.903842549203374, + "grad_norm": 49377.72265625, + "learning_rate": 4.880571118259223e-05, + "loss": 2.1755, + "step": 10157 + }, + { + "epoch": 1.9040299906279288, + "grad_norm": 49278.59375, + "learning_rate": 4.87978555286919e-05, + "loss": 2.3014, + "step": 10158 + }, + { + "epoch": 1.9042174320524836, + "grad_norm": 48747.15234375, + "learning_rate": 4.8789999904482966e-05, + "loss": 2.1931, + "step": 10159 + }, + { + "epoch": 1.9044048734770385, + "grad_norm": 54874.46875, + "learning_rate": 4.8782144310159486e-05, + "loss": 2.1798, + "step": 10160 + }, + { + "epoch": 1.9045923149015933, + "grad_norm": 48945.37109375, + "learning_rate": 4.8774288745915454e-05, + "loss": 2.1671, + "step": 10161 + }, + { + "epoch": 1.904779756326148, + "grad_norm": 48716.64453125, + "learning_rate": 4.8766433211944894e-05, + "loss": 2.204, + "step": 10162 + }, + { + "epoch": 1.904967197750703, + "grad_norm": 52271.03125, + "learning_rate": 4.8758577708441824e-05, + "loss": 2.2374, + "step": 10163 + }, + { + "epoch": 1.9051546391752576, + "grad_norm": 50474.37890625, + "learning_rate": 4.875072223560031e-05, + "loss": 2.1304, + "step": 10164 + }, + { + "epoch": 1.9053420805998127, + "grad_norm": 48248.18359375, + "learning_rate": 4.8742866793614304e-05, + "loss": 2.1503, + "step": 10165 + }, + { + "epoch": 1.9055295220243673, + "grad_norm": 53410.0703125, + "learning_rate": 4.873501138267787e-05, + "loss": 2.325, + "step": 10166 + }, + { + "epoch": 1.9057169634489222, + "grad_norm": 48478.97265625, + "learning_rate": 4.8727156002985015e-05, + "loss": 2.2332, + "step": 10167 + }, + { + "epoch": 1.905904404873477, + "grad_norm": 51132.26953125, + "learning_rate": 4.871930065472976e-05, + "loss": 2.2032, + "step": 10168 + }, + { + "epoch": 1.9060918462980319, + "grad_norm": 55922.20703125, + "learning_rate": 4.8711445338106115e-05, + "loss": 2.2079, + "step": 10169 + }, + { + "epoch": 1.9062792877225867, + "grad_norm": 51609.40234375, + "learning_rate": 4.87035900533081e-05, + "loss": 2.1688, + "step": 10170 + }, + { + "epoch": 1.9064667291471415, + "grad_norm": 50637.1796875, + "learning_rate": 4.869573480052972e-05, + "loss": 2.2286, + "step": 10171 + }, + { + "epoch": 1.9066541705716964, + "grad_norm": 54439.39453125, + "learning_rate": 4.8687879579965025e-05, + "loss": 2.181, + "step": 10172 + }, + { + "epoch": 1.906841611996251, + "grad_norm": 51325.30859375, + "learning_rate": 4.868002439180798e-05, + "loss": 2.2655, + "step": 10173 + }, + { + "epoch": 1.907029053420806, + "grad_norm": 53108.58203125, + "learning_rate": 4.867216923625263e-05, + "loss": 2.2594, + "step": 10174 + }, + { + "epoch": 1.9072164948453607, + "grad_norm": 51532.6328125, + "learning_rate": 4.8664314113492975e-05, + "loss": 2.2267, + "step": 10175 + }, + { + "epoch": 1.9074039362699158, + "grad_norm": 51323.4453125, + "learning_rate": 4.865645902372303e-05, + "loss": 2.1534, + "step": 10176 + }, + { + "epoch": 1.9075913776944704, + "grad_norm": 49613.48046875, + "learning_rate": 4.864860396713681e-05, + "loss": 2.2363, + "step": 10177 + }, + { + "epoch": 1.9077788191190255, + "grad_norm": 50994.71484375, + "learning_rate": 4.86407489439283e-05, + "loss": 2.231, + "step": 10178 + }, + { + "epoch": 1.90796626054358, + "grad_norm": 52466.23046875, + "learning_rate": 4.863289395429157e-05, + "loss": 2.2544, + "step": 10179 + }, + { + "epoch": 1.908153701968135, + "grad_norm": 50560.3828125, + "learning_rate": 4.8625038998420544e-05, + "loss": 2.3028, + "step": 10180 + }, + { + "epoch": 1.9083411433926898, + "grad_norm": 50626.1328125, + "learning_rate": 4.861718407650929e-05, + "loss": 2.2134, + "step": 10181 + }, + { + "epoch": 1.9085285848172446, + "grad_norm": 53023.6796875, + "learning_rate": 4.8609329188751794e-05, + "loss": 2.1792, + "step": 10182 + }, + { + "epoch": 1.9087160262417995, + "grad_norm": 50673.390625, + "learning_rate": 4.860147433534208e-05, + "loss": 2.2339, + "step": 10183 + }, + { + "epoch": 1.908903467666354, + "grad_norm": 52334.43359375, + "learning_rate": 4.859361951647412e-05, + "loss": 2.2293, + "step": 10184 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 52066.28125, + "learning_rate": 4.8585764732341935e-05, + "loss": 2.2103, + "step": 10185 + }, + { + "epoch": 1.9092783505154638, + "grad_norm": 53513.38671875, + "learning_rate": 4.857790998313954e-05, + "loss": 2.2463, + "step": 10186 + }, + { + "epoch": 1.9094657919400189, + "grad_norm": 49391.23828125, + "learning_rate": 4.857005526906092e-05, + "loss": 2.1978, + "step": 10187 + }, + { + "epoch": 1.9096532333645735, + "grad_norm": 49910.30859375, + "learning_rate": 4.8562200590300053e-05, + "loss": 2.1649, + "step": 10188 + }, + { + "epoch": 1.9098406747891286, + "grad_norm": 53483.38671875, + "learning_rate": 4.8554345947051e-05, + "loss": 2.1637, + "step": 10189 + }, + { + "epoch": 1.9100281162136832, + "grad_norm": 48632.1953125, + "learning_rate": 4.854649133950772e-05, + "loss": 2.2692, + "step": 10190 + }, + { + "epoch": 1.910215557638238, + "grad_norm": 49088.73828125, + "learning_rate": 4.8538636767864215e-05, + "loss": 2.2123, + "step": 10191 + }, + { + "epoch": 1.9104029990627929, + "grad_norm": 53758.80859375, + "learning_rate": 4.853078223231448e-05, + "loss": 2.214, + "step": 10192 + }, + { + "epoch": 1.9105904404873477, + "grad_norm": 50246.39453125, + "learning_rate": 4.8522927733052524e-05, + "loss": 2.1883, + "step": 10193 + }, + { + "epoch": 1.9107778819119026, + "grad_norm": 64883.06640625, + "learning_rate": 4.851507327027236e-05, + "loss": 2.1905, + "step": 10194 + }, + { + "epoch": 1.9109653233364574, + "grad_norm": 51452.6171875, + "learning_rate": 4.8507218844167934e-05, + "loss": 2.1786, + "step": 10195 + }, + { + "epoch": 1.9111527647610123, + "grad_norm": 53278.16015625, + "learning_rate": 4.849936445493327e-05, + "loss": 2.0884, + "step": 10196 + }, + { + "epoch": 1.9113402061855669, + "grad_norm": 50358.71484375, + "learning_rate": 4.8491510102762365e-05, + "loss": 2.1863, + "step": 10197 + }, + { + "epoch": 1.911527647610122, + "grad_norm": 52380.6015625, + "learning_rate": 4.848365578784921e-05, + "loss": 2.226, + "step": 10198 + }, + { + "epoch": 1.9117150890346766, + "grad_norm": 56191.17578125, + "learning_rate": 4.847580151038779e-05, + "loss": 2.1027, + "step": 10199 + }, + { + "epoch": 1.9119025304592316, + "grad_norm": 53136.40625, + "learning_rate": 4.846794727057209e-05, + "loss": 2.1626, + "step": 10200 + }, + { + "epoch": 1.9120899718837863, + "grad_norm": 49146.5859375, + "learning_rate": 4.846009306859612e-05, + "loss": 2.2248, + "step": 10201 + }, + { + "epoch": 1.9122774133083411, + "grad_norm": 50592.26171875, + "learning_rate": 4.845223890465384e-05, + "loss": 2.169, + "step": 10202 + }, + { + "epoch": 1.912464854732896, + "grad_norm": 52755.54296875, + "learning_rate": 4.8444384778939246e-05, + "loss": 2.2299, + "step": 10203 + }, + { + "epoch": 1.9126522961574508, + "grad_norm": 53093.10546875, + "learning_rate": 4.843653069164634e-05, + "loss": 2.2318, + "step": 10204 + }, + { + "epoch": 1.9128397375820057, + "grad_norm": 51569.78125, + "learning_rate": 4.8428676642969104e-05, + "loss": 2.2626, + "step": 10205 + }, + { + "epoch": 1.9130271790065605, + "grad_norm": 52104.421875, + "learning_rate": 4.842082263310151e-05, + "loss": 2.3169, + "step": 10206 + }, + { + "epoch": 1.9132146204311153, + "grad_norm": 50053.8046875, + "learning_rate": 4.841296866223755e-05, + "loss": 2.2689, + "step": 10207 + }, + { + "epoch": 1.91340206185567, + "grad_norm": 49441.4296875, + "learning_rate": 4.84051147305712e-05, + "loss": 2.2385, + "step": 10208 + }, + { + "epoch": 1.913589503280225, + "grad_norm": 51695.421875, + "learning_rate": 4.8397260838296475e-05, + "loss": 2.2231, + "step": 10209 + }, + { + "epoch": 1.9137769447047797, + "grad_norm": 49840.5078125, + "learning_rate": 4.83894069856073e-05, + "loss": 2.1993, + "step": 10210 + }, + { + "epoch": 1.9139643861293347, + "grad_norm": 52233.25, + "learning_rate": 4.8381553172697694e-05, + "loss": 2.1934, + "step": 10211 + }, + { + "epoch": 1.9141518275538894, + "grad_norm": 49278.09765625, + "learning_rate": 4.837369939976162e-05, + "loss": 2.2631, + "step": 10212 + }, + { + "epoch": 1.9143392689784442, + "grad_norm": 52806.0703125, + "learning_rate": 4.8365845666993086e-05, + "loss": 2.2207, + "step": 10213 + }, + { + "epoch": 1.914526710402999, + "grad_norm": 53385.13671875, + "learning_rate": 4.835799197458603e-05, + "loss": 2.2524, + "step": 10214 + }, + { + "epoch": 1.914714151827554, + "grad_norm": 58586.27734375, + "learning_rate": 4.835013832273444e-05, + "loss": 2.1697, + "step": 10215 + }, + { + "epoch": 1.9149015932521087, + "grad_norm": 53357.36328125, + "learning_rate": 4.834228471163231e-05, + "loss": 2.1631, + "step": 10216 + }, + { + "epoch": 1.9150890346766636, + "grad_norm": 50429.08203125, + "learning_rate": 4.833443114147358e-05, + "loss": 2.1809, + "step": 10217 + }, + { + "epoch": 1.9152764761012184, + "grad_norm": 46957.109375, + "learning_rate": 4.832657761245224e-05, + "loss": 2.2199, + "step": 10218 + }, + { + "epoch": 1.915463917525773, + "grad_norm": 49316.16796875, + "learning_rate": 4.831872412476226e-05, + "loss": 2.1392, + "step": 10219 + }, + { + "epoch": 1.9156513589503281, + "grad_norm": 52832.25390625, + "learning_rate": 4.831087067859764e-05, + "loss": 2.2152, + "step": 10220 + }, + { + "epoch": 1.9158388003748827, + "grad_norm": 51978.60546875, + "learning_rate": 4.8303017274152305e-05, + "loss": 2.1534, + "step": 10221 + }, + { + "epoch": 1.9160262417994378, + "grad_norm": 49347.1484375, + "learning_rate": 4.829516391162024e-05, + "loss": 2.1815, + "step": 10222 + }, + { + "epoch": 1.9162136832239924, + "grad_norm": 54417.77734375, + "learning_rate": 4.828731059119543e-05, + "loss": 2.2579, + "step": 10223 + }, + { + "epoch": 1.9164011246485473, + "grad_norm": 52955.234375, + "learning_rate": 4.827945731307183e-05, + "loss": 2.2076, + "step": 10224 + }, + { + "epoch": 1.9165885660731021, + "grad_norm": 54995.94140625, + "learning_rate": 4.827160407744339e-05, + "loss": 2.1932, + "step": 10225 + }, + { + "epoch": 1.916776007497657, + "grad_norm": 54470.421875, + "learning_rate": 4.826375088450408e-05, + "loss": 2.1557, + "step": 10226 + }, + { + "epoch": 1.9169634489222118, + "grad_norm": 49754.05078125, + "learning_rate": 4.825589773444791e-05, + "loss": 2.2099, + "step": 10227 + }, + { + "epoch": 1.9171508903467667, + "grad_norm": 54716.07421875, + "learning_rate": 4.824804462746876e-05, + "loss": 2.2325, + "step": 10228 + }, + { + "epoch": 1.9173383317713215, + "grad_norm": 47923.65234375, + "learning_rate": 4.8240191563760655e-05, + "loss": 2.2209, + "step": 10229 + }, + { + "epoch": 1.9175257731958761, + "grad_norm": 48892.73828125, + "learning_rate": 4.823233854351754e-05, + "loss": 2.1961, + "step": 10230 + }, + { + "epoch": 1.9177132146204312, + "grad_norm": 52331.6328125, + "learning_rate": 4.822448556693337e-05, + "loss": 2.2122, + "step": 10231 + }, + { + "epoch": 1.9179006560449858, + "grad_norm": 48959.37109375, + "learning_rate": 4.8216632634202105e-05, + "loss": 2.3982, + "step": 10232 + }, + { + "epoch": 1.918088097469541, + "grad_norm": 49984.0234375, + "learning_rate": 4.820877974551769e-05, + "loss": 2.2197, + "step": 10233 + }, + { + "epoch": 1.9182755388940955, + "grad_norm": 53245.5625, + "learning_rate": 4.820092690107411e-05, + "loss": 2.2851, + "step": 10234 + }, + { + "epoch": 1.9184629803186504, + "grad_norm": 47763.87890625, + "learning_rate": 4.81930741010653e-05, + "loss": 2.2605, + "step": 10235 + }, + { + "epoch": 1.9186504217432052, + "grad_norm": 50675.73046875, + "learning_rate": 4.818522134568521e-05, + "loss": 2.2431, + "step": 10236 + }, + { + "epoch": 1.91883786316776, + "grad_norm": 48098.32421875, + "learning_rate": 4.8177368635127804e-05, + "loss": 2.1845, + "step": 10237 + }, + { + "epoch": 1.919025304592315, + "grad_norm": 47719.640625, + "learning_rate": 4.816951596958703e-05, + "loss": 2.1878, + "step": 10238 + }, + { + "epoch": 1.9192127460168698, + "grad_norm": 48124.328125, + "learning_rate": 4.816166334925684e-05, + "loss": 2.2328, + "step": 10239 + }, + { + "epoch": 1.9194001874414246, + "grad_norm": 53333.40625, + "learning_rate": 4.815381077433118e-05, + "loss": 2.295, + "step": 10240 + }, + { + "epoch": 1.9195876288659792, + "grad_norm": 55079.0078125, + "learning_rate": 4.814595824500398e-05, + "loss": 2.2324, + "step": 10241 + }, + { + "epoch": 1.9197750702905343, + "grad_norm": 52960.21484375, + "learning_rate": 4.8138105761469256e-05, + "loss": 2.1427, + "step": 10242 + }, + { + "epoch": 1.919962511715089, + "grad_norm": 49449.125, + "learning_rate": 4.813025332392086e-05, + "loss": 2.1479, + "step": 10243 + }, + { + "epoch": 1.920149953139644, + "grad_norm": 49707.953125, + "learning_rate": 4.8122400932552794e-05, + "loss": 2.186, + "step": 10244 + }, + { + "epoch": 1.9203373945641986, + "grad_norm": 50265.20703125, + "learning_rate": 4.8114548587559e-05, + "loss": 2.1785, + "step": 10245 + }, + { + "epoch": 1.9205248359887537, + "grad_norm": 54106.1796875, + "learning_rate": 4.810669628913341e-05, + "loss": 2.2669, + "step": 10246 + }, + { + "epoch": 1.9207122774133083, + "grad_norm": 53837.84765625, + "learning_rate": 4.809884403746997e-05, + "loss": 2.2069, + "step": 10247 + }, + { + "epoch": 1.9208997188378631, + "grad_norm": 56242.20703125, + "learning_rate": 4.8090991832762596e-05, + "loss": 2.2059, + "step": 10248 + }, + { + "epoch": 1.921087160262418, + "grad_norm": 52622.08984375, + "learning_rate": 4.808313967520526e-05, + "loss": 2.2478, + "step": 10249 + }, + { + "epoch": 1.9212746016869728, + "grad_norm": 51363.90625, + "learning_rate": 4.80752875649919e-05, + "loss": 2.1708, + "step": 10250 + }, + { + "epoch": 1.9214620431115277, + "grad_norm": 50519.88671875, + "learning_rate": 4.806743550231644e-05, + "loss": 2.1692, + "step": 10251 + }, + { + "epoch": 1.9216494845360823, + "grad_norm": 52857.38671875, + "learning_rate": 4.805958348737281e-05, + "loss": 2.2057, + "step": 10252 + }, + { + "epoch": 1.9218369259606374, + "grad_norm": 49612.65234375, + "learning_rate": 4.805173152035495e-05, + "loss": 2.151, + "step": 10253 + }, + { + "epoch": 1.922024367385192, + "grad_norm": 49915.78125, + "learning_rate": 4.804387960145681e-05, + "loss": 2.2064, + "step": 10254 + }, + { + "epoch": 1.922211808809747, + "grad_norm": 52526.08203125, + "learning_rate": 4.80360277308723e-05, + "loss": 2.2919, + "step": 10255 + }, + { + "epoch": 1.9223992502343017, + "grad_norm": 50006.46484375, + "learning_rate": 4.802817590879534e-05, + "loss": 2.2554, + "step": 10256 + }, + { + "epoch": 1.9225866916588568, + "grad_norm": 46805.96484375, + "learning_rate": 4.802032413541991e-05, + "loss": 2.1813, + "step": 10257 + }, + { + "epoch": 1.9227741330834114, + "grad_norm": 47936.59375, + "learning_rate": 4.801247241093988e-05, + "loss": 2.2517, + "step": 10258 + }, + { + "epoch": 1.9229615745079662, + "grad_norm": 46756.71875, + "learning_rate": 4.8004620735549214e-05, + "loss": 2.1912, + "step": 10259 + }, + { + "epoch": 1.923149015932521, + "grad_norm": 50219.59765625, + "learning_rate": 4.799676910944183e-05, + "loss": 2.2723, + "step": 10260 + }, + { + "epoch": 1.923336457357076, + "grad_norm": 54112.2421875, + "learning_rate": 4.7988917532811666e-05, + "loss": 2.1212, + "step": 10261 + }, + { + "epoch": 1.9235238987816308, + "grad_norm": 50310.5859375, + "learning_rate": 4.798106600585261e-05, + "loss": 2.2215, + "step": 10262 + }, + { + "epoch": 1.9237113402061856, + "grad_norm": 45857.74609375, + "learning_rate": 4.797321452875862e-05, + "loss": 2.3218, + "step": 10263 + }, + { + "epoch": 1.9238987816307405, + "grad_norm": 49133.7265625, + "learning_rate": 4.796536310172358e-05, + "loss": 2.1918, + "step": 10264 + }, + { + "epoch": 1.924086223055295, + "grad_norm": 50715.98828125, + "learning_rate": 4.7957511724941476e-05, + "loss": 2.1788, + "step": 10265 + }, + { + "epoch": 1.9242736644798502, + "grad_norm": 49836.61328125, + "learning_rate": 4.7949660398606146e-05, + "loss": 2.1585, + "step": 10266 + }, + { + "epoch": 1.9244611059044048, + "grad_norm": 49897.765625, + "learning_rate": 4.7941809122911555e-05, + "loss": 2.1779, + "step": 10267 + }, + { + "epoch": 1.9246485473289598, + "grad_norm": 48729.01953125, + "learning_rate": 4.793395789805162e-05, + "loss": 2.1862, + "step": 10268 + }, + { + "epoch": 1.9248359887535145, + "grad_norm": 49699.78515625, + "learning_rate": 4.792610672422025e-05, + "loss": 2.2211, + "step": 10269 + }, + { + "epoch": 1.9250234301780693, + "grad_norm": 50359.55078125, + "learning_rate": 4.791825560161134e-05, + "loss": 2.2625, + "step": 10270 + }, + { + "epoch": 1.9252108716026242, + "grad_norm": 47886.80859375, + "learning_rate": 4.7910404530418813e-05, + "loss": 2.1948, + "step": 10271 + }, + { + "epoch": 1.925398313027179, + "grad_norm": 49216.37890625, + "learning_rate": 4.7902553510836615e-05, + "loss": 2.2134, + "step": 10272 + }, + { + "epoch": 1.9255857544517339, + "grad_norm": 49101.99609375, + "learning_rate": 4.789470254305859e-05, + "loss": 2.164, + "step": 10273 + }, + { + "epoch": 1.9257731958762887, + "grad_norm": 52403.47265625, + "learning_rate": 4.78868516272787e-05, + "loss": 2.1349, + "step": 10274 + }, + { + "epoch": 1.9259606373008435, + "grad_norm": 49263.203125, + "learning_rate": 4.7879000763690835e-05, + "loss": 2.2031, + "step": 10275 + }, + { + "epoch": 1.9261480787253982, + "grad_norm": 52683.5078125, + "learning_rate": 4.7871149952488905e-05, + "loss": 2.1834, + "step": 10276 + }, + { + "epoch": 1.9263355201499532, + "grad_norm": 49624.01953125, + "learning_rate": 4.7863299193866804e-05, + "loss": 2.1932, + "step": 10277 + }, + { + "epoch": 1.9265229615745079, + "grad_norm": 48443.78125, + "learning_rate": 4.785544848801845e-05, + "loss": 2.2077, + "step": 10278 + }, + { + "epoch": 1.926710402999063, + "grad_norm": 48514.87109375, + "learning_rate": 4.784759783513772e-05, + "loss": 2.2567, + "step": 10279 + }, + { + "epoch": 1.9268978444236176, + "grad_norm": 49913.7890625, + "learning_rate": 4.783974723541856e-05, + "loss": 2.189, + "step": 10280 + }, + { + "epoch": 1.9270852858481724, + "grad_norm": 50721.08203125, + "learning_rate": 4.7831896689054825e-05, + "loss": 2.1417, + "step": 10281 + }, + { + "epoch": 1.9272727272727272, + "grad_norm": 52364.37109375, + "learning_rate": 4.782404619624043e-05, + "loss": 2.2066, + "step": 10282 + }, + { + "epoch": 1.927460168697282, + "grad_norm": 52296.94140625, + "learning_rate": 4.781619575716929e-05, + "loss": 2.1708, + "step": 10283 + }, + { + "epoch": 1.927647610121837, + "grad_norm": 52165.26953125, + "learning_rate": 4.7808345372035265e-05, + "loss": 2.1739, + "step": 10284 + }, + { + "epoch": 1.9278350515463918, + "grad_norm": 52780.99609375, + "learning_rate": 4.7800495041032265e-05, + "loss": 2.1522, + "step": 10285 + }, + { + "epoch": 1.9280224929709466, + "grad_norm": 51481.5, + "learning_rate": 4.7792644764354184e-05, + "loss": 2.2023, + "step": 10286 + }, + { + "epoch": 1.9282099343955013, + "grad_norm": 56088.3203125, + "learning_rate": 4.7784794542194945e-05, + "loss": 2.1479, + "step": 10287 + }, + { + "epoch": 1.9283973758200563, + "grad_norm": 46545.921875, + "learning_rate": 4.7776944374748376e-05, + "loss": 2.2451, + "step": 10288 + }, + { + "epoch": 1.928584817244611, + "grad_norm": 50770.92578125, + "learning_rate": 4.7769094262208414e-05, + "loss": 2.273, + "step": 10289 + }, + { + "epoch": 1.928772258669166, + "grad_norm": 52123.625, + "learning_rate": 4.776124420476892e-05, + "loss": 2.1465, + "step": 10290 + }, + { + "epoch": 1.9289597000937206, + "grad_norm": 51989.3828125, + "learning_rate": 4.775339420262381e-05, + "loss": 2.2123, + "step": 10291 + }, + { + "epoch": 1.9291471415182755, + "grad_norm": 50748.07421875, + "learning_rate": 4.7745544255966946e-05, + "loss": 2.2236, + "step": 10292 + }, + { + "epoch": 1.9293345829428303, + "grad_norm": 56689.92578125, + "learning_rate": 4.773769436499221e-05, + "loss": 2.371, + "step": 10293 + }, + { + "epoch": 1.9295220243673852, + "grad_norm": 51309.7421875, + "learning_rate": 4.772984452989348e-05, + "loss": 2.2232, + "step": 10294 + }, + { + "epoch": 1.92970946579194, + "grad_norm": 48729.27734375, + "learning_rate": 4.7721994750864674e-05, + "loss": 2.2073, + "step": 10295 + }, + { + "epoch": 1.9298969072164949, + "grad_norm": 50322.9921875, + "learning_rate": 4.7714145028099614e-05, + "loss": 2.2114, + "step": 10296 + }, + { + "epoch": 1.9300843486410497, + "grad_norm": 47654.3671875, + "learning_rate": 4.770629536179222e-05, + "loss": 2.2699, + "step": 10297 + }, + { + "epoch": 1.9302717900656043, + "grad_norm": 53992.19921875, + "learning_rate": 4.769844575213637e-05, + "loss": 2.2251, + "step": 10298 + }, + { + "epoch": 1.9304592314901594, + "grad_norm": 50226.96875, + "learning_rate": 4.769059619932592e-05, + "loss": 2.2463, + "step": 10299 + }, + { + "epoch": 1.930646672914714, + "grad_norm": 53147.65625, + "learning_rate": 4.768274670355474e-05, + "loss": 2.1982, + "step": 10300 + }, + { + "epoch": 1.930834114339269, + "grad_norm": 50220.94921875, + "learning_rate": 4.76748972650167e-05, + "loss": 2.2362, + "step": 10301 + }, + { + "epoch": 1.9310215557638237, + "grad_norm": 48489.84765625, + "learning_rate": 4.766704788390572e-05, + "loss": 2.1717, + "step": 10302 + }, + { + "epoch": 1.9312089971883788, + "grad_norm": 50466.60546875, + "learning_rate": 4.76591985604156e-05, + "loss": 2.2398, + "step": 10303 + }, + { + "epoch": 1.9313964386129334, + "grad_norm": 49061.16015625, + "learning_rate": 4.765134929474025e-05, + "loss": 2.2336, + "step": 10304 + }, + { + "epoch": 1.9315838800374883, + "grad_norm": 52630.5234375, + "learning_rate": 4.764350008707353e-05, + "loss": 2.2648, + "step": 10305 + }, + { + "epoch": 1.9317713214620431, + "grad_norm": 46380.8984375, + "learning_rate": 4.763565093760931e-05, + "loss": 2.2469, + "step": 10306 + }, + { + "epoch": 1.931958762886598, + "grad_norm": 59591.984375, + "learning_rate": 4.762780184654144e-05, + "loss": 2.1381, + "step": 10307 + }, + { + "epoch": 1.9321462043111528, + "grad_norm": 48656.34375, + "learning_rate": 4.7619952814063786e-05, + "loss": 2.2685, + "step": 10308 + }, + { + "epoch": 1.9323336457357074, + "grad_norm": 51203.8515625, + "learning_rate": 4.76121038403702e-05, + "loss": 2.2102, + "step": 10309 + }, + { + "epoch": 1.9325210871602625, + "grad_norm": 46543.94921875, + "learning_rate": 4.760425492565459e-05, + "loss": 2.2291, + "step": 10310 + }, + { + "epoch": 1.9327085285848171, + "grad_norm": 48438.74609375, + "learning_rate": 4.759640607011074e-05, + "loss": 2.2163, + "step": 10311 + }, + { + "epoch": 1.9328959700093722, + "grad_norm": 50267.0234375, + "learning_rate": 4.758855727393255e-05, + "loss": 2.2404, + "step": 10312 + }, + { + "epoch": 1.9330834114339268, + "grad_norm": 49787.87109375, + "learning_rate": 4.758070853731389e-05, + "loss": 2.1543, + "step": 10313 + }, + { + "epoch": 1.9332708528584819, + "grad_norm": 49980.3515625, + "learning_rate": 4.7572859860448575e-05, + "loss": 2.1942, + "step": 10314 + }, + { + "epoch": 1.9334582942830365, + "grad_norm": 52217.21875, + "learning_rate": 4.756501124353048e-05, + "loss": 2.1953, + "step": 10315 + }, + { + "epoch": 1.9336457357075914, + "grad_norm": 48649.296875, + "learning_rate": 4.7557162686753445e-05, + "loss": 2.1353, + "step": 10316 + }, + { + "epoch": 1.9338331771321462, + "grad_norm": 47548.97265625, + "learning_rate": 4.7549314190311336e-05, + "loss": 2.1914, + "step": 10317 + }, + { + "epoch": 1.934020618556701, + "grad_norm": 57834.65625, + "learning_rate": 4.7541465754397985e-05, + "loss": 2.2185, + "step": 10318 + }, + { + "epoch": 1.934208059981256, + "grad_norm": 45094.30078125, + "learning_rate": 4.7533617379207226e-05, + "loss": 2.2561, + "step": 10319 + }, + { + "epoch": 1.9343955014058107, + "grad_norm": 47352.5703125, + "learning_rate": 4.752576906493293e-05, + "loss": 2.2437, + "step": 10320 + }, + { + "epoch": 1.9345829428303656, + "grad_norm": 48701.50390625, + "learning_rate": 4.751792081176895e-05, + "loss": 2.1723, + "step": 10321 + }, + { + "epoch": 1.9347703842549202, + "grad_norm": 54449.40234375, + "learning_rate": 4.751007261990909e-05, + "loss": 2.214, + "step": 10322 + }, + { + "epoch": 1.9349578256794753, + "grad_norm": 50262.05859375, + "learning_rate": 4.7502224489547204e-05, + "loss": 2.2209, + "step": 10323 + }, + { + "epoch": 1.93514526710403, + "grad_norm": 48282.44140625, + "learning_rate": 4.7494376420877144e-05, + "loss": 2.2233, + "step": 10324 + }, + { + "epoch": 1.935332708528585, + "grad_norm": 50360.97265625, + "learning_rate": 4.7486528414092734e-05, + "loss": 2.2615, + "step": 10325 + }, + { + "epoch": 1.9355201499531396, + "grad_norm": 53251.6875, + "learning_rate": 4.7478680469387796e-05, + "loss": 2.217, + "step": 10326 + }, + { + "epoch": 1.9357075913776944, + "grad_norm": 47281.30859375, + "learning_rate": 4.7470832586956204e-05, + "loss": 2.2627, + "step": 10327 + }, + { + "epoch": 1.9358950328022493, + "grad_norm": 48841.5, + "learning_rate": 4.746298476699177e-05, + "loss": 2.2084, + "step": 10328 + }, + { + "epoch": 1.9360824742268041, + "grad_norm": 56203.3046875, + "learning_rate": 4.745513700968831e-05, + "loss": 2.143, + "step": 10329 + }, + { + "epoch": 1.936269915651359, + "grad_norm": 51217.18359375, + "learning_rate": 4.7447289315239675e-05, + "loss": 2.2303, + "step": 10330 + }, + { + "epoch": 1.9364573570759138, + "grad_norm": 51726.1015625, + "learning_rate": 4.743944168383968e-05, + "loss": 2.2061, + "step": 10331 + }, + { + "epoch": 1.9366447985004687, + "grad_norm": 51415.75, + "learning_rate": 4.743159411568217e-05, + "loss": 2.2216, + "step": 10332 + }, + { + "epoch": 1.9368322399250233, + "grad_norm": 48804.80859375, + "learning_rate": 4.742374661096093e-05, + "loss": 2.2103, + "step": 10333 + }, + { + "epoch": 1.9370196813495784, + "grad_norm": 51260.80078125, + "learning_rate": 4.7415899169869806e-05, + "loss": 2.1992, + "step": 10334 + }, + { + "epoch": 1.937207122774133, + "grad_norm": 50577.921875, + "learning_rate": 4.740805179260264e-05, + "loss": 2.1748, + "step": 10335 + }, + { + "epoch": 1.937394564198688, + "grad_norm": 50723.15625, + "learning_rate": 4.740020447935324e-05, + "loss": 2.2425, + "step": 10336 + }, + { + "epoch": 1.9375820056232427, + "grad_norm": 48586.25390625, + "learning_rate": 4.739235723031541e-05, + "loss": 2.2357, + "step": 10337 + }, + { + "epoch": 1.9377694470477975, + "grad_norm": 49063.703125, + "learning_rate": 4.7384510045682975e-05, + "loss": 2.1654, + "step": 10338 + }, + { + "epoch": 1.9379568884723524, + "grad_norm": 51340.2578125, + "learning_rate": 4.7376662925649754e-05, + "loss": 2.1689, + "step": 10339 + }, + { + "epoch": 1.9381443298969072, + "grad_norm": 51671.80078125, + "learning_rate": 4.736881587040955e-05, + "loss": 2.213, + "step": 10340 + }, + { + "epoch": 1.938331771321462, + "grad_norm": 49162.83984375, + "learning_rate": 4.736096888015618e-05, + "loss": 2.249, + "step": 10341 + }, + { + "epoch": 1.938519212746017, + "grad_norm": 48160.09375, + "learning_rate": 4.735312195508346e-05, + "loss": 2.2463, + "step": 10342 + }, + { + "epoch": 1.9387066541705718, + "grad_norm": 53546.2109375, + "learning_rate": 4.7345275095385214e-05, + "loss": 2.1892, + "step": 10343 + }, + { + "epoch": 1.9388940955951264, + "grad_norm": 49064.58203125, + "learning_rate": 4.733742830125522e-05, + "loss": 2.2136, + "step": 10344 + }, + { + "epoch": 1.9390815370196814, + "grad_norm": 50480.58984375, + "learning_rate": 4.732958157288729e-05, + "loss": 2.2459, + "step": 10345 + }, + { + "epoch": 1.939268978444236, + "grad_norm": 50766.01171875, + "learning_rate": 4.732173491047523e-05, + "loss": 2.2533, + "step": 10346 + }, + { + "epoch": 1.9394564198687911, + "grad_norm": 50108.41796875, + "learning_rate": 4.7313888314212864e-05, + "loss": 2.2776, + "step": 10347 + }, + { + "epoch": 1.9396438612933458, + "grad_norm": 53221.4609375, + "learning_rate": 4.730604178429396e-05, + "loss": 2.2062, + "step": 10348 + }, + { + "epoch": 1.9398313027179006, + "grad_norm": 54061.625, + "learning_rate": 4.729819532091232e-05, + "loss": 2.2304, + "step": 10349 + }, + { + "epoch": 1.9400187441424555, + "grad_norm": 56299.0625, + "learning_rate": 4.7290348924261754e-05, + "loss": 2.1969, + "step": 10350 + }, + { + "epoch": 1.9402061855670103, + "grad_norm": 50162.31640625, + "learning_rate": 4.7282502594536084e-05, + "loss": 2.2208, + "step": 10351 + }, + { + "epoch": 1.9403936269915651, + "grad_norm": 51665.55078125, + "learning_rate": 4.7274656331929054e-05, + "loss": 2.1428, + "step": 10352 + }, + { + "epoch": 1.94058106841612, + "grad_norm": 48581.05859375, + "learning_rate": 4.7266810136634485e-05, + "loss": 2.2354, + "step": 10353 + }, + { + "epoch": 1.9407685098406748, + "grad_norm": 51556.2265625, + "learning_rate": 4.7258964008846166e-05, + "loss": 2.2065, + "step": 10354 + }, + { + "epoch": 1.9409559512652295, + "grad_norm": 50066.44921875, + "learning_rate": 4.7251117948757876e-05, + "loss": 2.1888, + "step": 10355 + }, + { + "epoch": 1.9411433926897845, + "grad_norm": 50259.4375, + "learning_rate": 4.724327195656341e-05, + "loss": 2.2015, + "step": 10356 + }, + { + "epoch": 1.9413308341143392, + "grad_norm": 51445.04296875, + "learning_rate": 4.7235426032456544e-05, + "loss": 2.1745, + "step": 10357 + }, + { + "epoch": 1.9415182755388942, + "grad_norm": 51218.9921875, + "learning_rate": 4.7227580176631096e-05, + "loss": 2.1917, + "step": 10358 + }, + { + "epoch": 1.9417057169634488, + "grad_norm": 50098.44140625, + "learning_rate": 4.7219734389280795e-05, + "loss": 2.2415, + "step": 10359 + }, + { + "epoch": 1.941893158388004, + "grad_norm": 50429.48046875, + "learning_rate": 4.721188867059946e-05, + "loss": 2.1878, + "step": 10360 + }, + { + "epoch": 1.9420805998125585, + "grad_norm": 52219.23828125, + "learning_rate": 4.7204043020780864e-05, + "loss": 2.1934, + "step": 10361 + }, + { + "epoch": 1.9422680412371134, + "grad_norm": 53425.97265625, + "learning_rate": 4.719619744001879e-05, + "loss": 2.1956, + "step": 10362 + }, + { + "epoch": 1.9424554826616682, + "grad_norm": 50968.88671875, + "learning_rate": 4.718835192850698e-05, + "loss": 2.2649, + "step": 10363 + }, + { + "epoch": 1.942642924086223, + "grad_norm": 54776.4140625, + "learning_rate": 4.7180506486439234e-05, + "loss": 2.2817, + "step": 10364 + }, + { + "epoch": 1.942830365510778, + "grad_norm": 50379.2578125, + "learning_rate": 4.717266111400934e-05, + "loss": 2.1566, + "step": 10365 + }, + { + "epoch": 1.9430178069353325, + "grad_norm": 52263.46875, + "learning_rate": 4.7164815811411045e-05, + "loss": 2.207, + "step": 10366 + }, + { + "epoch": 1.9432052483598876, + "grad_norm": 49395.21484375, + "learning_rate": 4.7156970578838125e-05, + "loss": 2.2045, + "step": 10367 + }, + { + "epoch": 1.9433926897844422, + "grad_norm": 50404.43359375, + "learning_rate": 4.714912541648434e-05, + "loss": 2.2161, + "step": 10368 + }, + { + "epoch": 1.9435801312089973, + "grad_norm": 52457.6015625, + "learning_rate": 4.7141280324543466e-05, + "loss": 2.199, + "step": 10369 + }, + { + "epoch": 1.943767572633552, + "grad_norm": 51071.21484375, + "learning_rate": 4.713343530320926e-05, + "loss": 2.1991, + "step": 10370 + }, + { + "epoch": 1.943955014058107, + "grad_norm": 48325.12890625, + "learning_rate": 4.7125590352675484e-05, + "loss": 2.2624, + "step": 10371 + }, + { + "epoch": 1.9441424554826616, + "grad_norm": 55994.8671875, + "learning_rate": 4.711774547313589e-05, + "loss": 2.1502, + "step": 10372 + }, + { + "epoch": 1.9443298969072165, + "grad_norm": 48955.18359375, + "learning_rate": 4.7109900664784275e-05, + "loss": 2.1788, + "step": 10373 + }, + { + "epoch": 1.9445173383317713, + "grad_norm": 49684.7265625, + "learning_rate": 4.710205592781434e-05, + "loss": 2.1912, + "step": 10374 + }, + { + "epoch": 1.9447047797563262, + "grad_norm": 53596.3359375, + "learning_rate": 4.709421126241987e-05, + "loss": 2.202, + "step": 10375 + }, + { + "epoch": 1.944892221180881, + "grad_norm": 50863.953125, + "learning_rate": 4.7086366668794614e-05, + "loss": 2.1693, + "step": 10376 + }, + { + "epoch": 1.9450796626054359, + "grad_norm": 52491.8515625, + "learning_rate": 4.7078522147132334e-05, + "loss": 2.2549, + "step": 10377 + }, + { + "epoch": 1.9452671040299907, + "grad_norm": 58219.13671875, + "learning_rate": 4.7070677697626764e-05, + "loss": 2.2034, + "step": 10378 + }, + { + "epoch": 1.9454545454545453, + "grad_norm": 50821.63671875, + "learning_rate": 4.7062833320471646e-05, + "loss": 2.2351, + "step": 10379 + }, + { + "epoch": 1.9456419868791004, + "grad_norm": 51749.8515625, + "learning_rate": 4.7054989015860766e-05, + "loss": 2.2285, + "step": 10380 + }, + { + "epoch": 1.945829428303655, + "grad_norm": 55218.51953125, + "learning_rate": 4.704714478398781e-05, + "loss": 2.2099, + "step": 10381 + }, + { + "epoch": 1.94601686972821, + "grad_norm": 55170.76171875, + "learning_rate": 4.703930062504656e-05, + "loss": 2.1866, + "step": 10382 + }, + { + "epoch": 1.9462043111527647, + "grad_norm": 51895.37109375, + "learning_rate": 4.703145653923074e-05, + "loss": 2.2217, + "step": 10383 + }, + { + "epoch": 1.9463917525773196, + "grad_norm": 49982.46484375, + "learning_rate": 4.702361252673411e-05, + "loss": 2.1859, + "step": 10384 + }, + { + "epoch": 1.9465791940018744, + "grad_norm": 49139.76953125, + "learning_rate": 4.701576858775038e-05, + "loss": 2.2089, + "step": 10385 + }, + { + "epoch": 1.9467666354264292, + "grad_norm": 51184.234375, + "learning_rate": 4.7007924722473296e-05, + "loss": 2.2947, + "step": 10386 + }, + { + "epoch": 1.946954076850984, + "grad_norm": 48930.5859375, + "learning_rate": 4.700008093109658e-05, + "loss": 2.2487, + "step": 10387 + }, + { + "epoch": 1.947141518275539, + "grad_norm": 49063.05859375, + "learning_rate": 4.699223721381401e-05, + "loss": 2.1892, + "step": 10388 + }, + { + "epoch": 1.9473289597000938, + "grad_norm": 47629.73828125, + "learning_rate": 4.6984393570819255e-05, + "loss": 2.2298, + "step": 10389 + }, + { + "epoch": 1.9475164011246484, + "grad_norm": 50765.70703125, + "learning_rate": 4.697655000230607e-05, + "loss": 2.2233, + "step": 10390 + }, + { + "epoch": 1.9477038425492035, + "grad_norm": 51385.14453125, + "learning_rate": 4.6968706508468186e-05, + "loss": 2.2509, + "step": 10391 + }, + { + "epoch": 1.947891283973758, + "grad_norm": 48643.9140625, + "learning_rate": 4.696086308949933e-05, + "loss": 2.2141, + "step": 10392 + }, + { + "epoch": 1.9480787253983132, + "grad_norm": 53680.96875, + "learning_rate": 4.6953019745593204e-05, + "loss": 2.2144, + "step": 10393 + }, + { + "epoch": 1.9482661668228678, + "grad_norm": 58249.234375, + "learning_rate": 4.6945176476943524e-05, + "loss": 2.2676, + "step": 10394 + }, + { + "epoch": 1.9484536082474226, + "grad_norm": 49881.39453125, + "learning_rate": 4.693733328374407e-05, + "loss": 2.202, + "step": 10395 + }, + { + "epoch": 1.9486410496719775, + "grad_norm": 48617.2421875, + "learning_rate": 4.6929490166188465e-05, + "loss": 2.3216, + "step": 10396 + }, + { + "epoch": 1.9488284910965323, + "grad_norm": 50230.3125, + "learning_rate": 4.692164712447049e-05, + "loss": 2.2182, + "step": 10397 + }, + { + "epoch": 1.9490159325210872, + "grad_norm": 48719.6640625, + "learning_rate": 4.691380415878384e-05, + "loss": 2.1493, + "step": 10398 + }, + { + "epoch": 1.949203373945642, + "grad_norm": 50983.8984375, + "learning_rate": 4.690596126932224e-05, + "loss": 2.2297, + "step": 10399 + }, + { + "epoch": 1.9493908153701969, + "grad_norm": 52968.49609375, + "learning_rate": 4.689811845627937e-05, + "loss": 2.1607, + "step": 10400 + }, + { + "epoch": 1.9495782567947515, + "grad_norm": 49687.3828125, + "learning_rate": 4.689027571984896e-05, + "loss": 2.262, + "step": 10401 + }, + { + "epoch": 1.9497656982193066, + "grad_norm": 53283.21875, + "learning_rate": 4.688243306022469e-05, + "loss": 2.2506, + "step": 10402 + }, + { + "epoch": 1.9499531396438612, + "grad_norm": 52216.7578125, + "learning_rate": 4.687459047760031e-05, + "loss": 2.1937, + "step": 10403 + }, + { + "epoch": 1.9501405810684163, + "grad_norm": 50628.33203125, + "learning_rate": 4.6866747972169466e-05, + "loss": 2.2763, + "step": 10404 + }, + { + "epoch": 1.9503280224929709, + "grad_norm": 54492.54296875, + "learning_rate": 4.6858905544125895e-05, + "loss": 2.2703, + "step": 10405 + }, + { + "epoch": 1.9505154639175257, + "grad_norm": 53668.31640625, + "learning_rate": 4.6851063193663284e-05, + "loss": 2.2427, + "step": 10406 + }, + { + "epoch": 1.9507029053420806, + "grad_norm": 48544.640625, + "learning_rate": 4.6843220920975346e-05, + "loss": 2.2044, + "step": 10407 + }, + { + "epoch": 1.9508903467666354, + "grad_norm": 49097.3984375, + "learning_rate": 4.683537872625574e-05, + "loss": 2.2165, + "step": 10408 + }, + { + "epoch": 1.9510777881911903, + "grad_norm": 52215.8671875, + "learning_rate": 4.682753660969818e-05, + "loss": 2.2068, + "step": 10409 + }, + { + "epoch": 1.9512652296157451, + "grad_norm": 52306.25, + "learning_rate": 4.681969457149637e-05, + "loss": 2.1806, + "step": 10410 + }, + { + "epoch": 1.9514526710403, + "grad_norm": 51461.33984375, + "learning_rate": 4.6811852611843957e-05, + "loss": 2.2003, + "step": 10411 + }, + { + "epoch": 1.9516401124648546, + "grad_norm": 51289.625, + "learning_rate": 4.680401073093464e-05, + "loss": 2.2702, + "step": 10412 + }, + { + "epoch": 1.9518275538894096, + "grad_norm": 50178.3203125, + "learning_rate": 4.679616892896213e-05, + "loss": 2.2333, + "step": 10413 + }, + { + "epoch": 1.9520149953139643, + "grad_norm": 55106.92578125, + "learning_rate": 4.678832720612011e-05, + "loss": 2.1522, + "step": 10414 + }, + { + "epoch": 1.9522024367385193, + "grad_norm": 51338.88671875, + "learning_rate": 4.678048556260223e-05, + "loss": 2.1843, + "step": 10415 + }, + { + "epoch": 1.952389878163074, + "grad_norm": 53172.75390625, + "learning_rate": 4.677264399860218e-05, + "loss": 2.2728, + "step": 10416 + }, + { + "epoch": 1.9525773195876288, + "grad_norm": 49832.93359375, + "learning_rate": 4.676480251431363e-05, + "loss": 2.2346, + "step": 10417 + }, + { + "epoch": 1.9527647610121837, + "grad_norm": 51324.6015625, + "learning_rate": 4.6756961109930294e-05, + "loss": 2.2479, + "step": 10418 + }, + { + "epoch": 1.9529522024367385, + "grad_norm": 52911.01953125, + "learning_rate": 4.674911978564578e-05, + "loss": 2.2602, + "step": 10419 + }, + { + "epoch": 1.9531396438612934, + "grad_norm": 48398.60546875, + "learning_rate": 4.6741278541653804e-05, + "loss": 2.1972, + "step": 10420 + }, + { + "epoch": 1.9533270852858482, + "grad_norm": 51309.85546875, + "learning_rate": 4.673343737814804e-05, + "loss": 2.2226, + "step": 10421 + }, + { + "epoch": 1.953514526710403, + "grad_norm": 52012.1328125, + "learning_rate": 4.672559629532212e-05, + "loss": 2.2428, + "step": 10422 + }, + { + "epoch": 1.9537019681349577, + "grad_norm": 59090.09375, + "learning_rate": 4.6717755293369724e-05, + "loss": 2.1371, + "step": 10423 + }, + { + "epoch": 1.9538894095595127, + "grad_norm": 53896.3359375, + "learning_rate": 4.670991437248452e-05, + "loss": 2.1996, + "step": 10424 + }, + { + "epoch": 1.9540768509840674, + "grad_norm": 50375.4765625, + "learning_rate": 4.670207353286018e-05, + "loss": 2.1797, + "step": 10425 + }, + { + "epoch": 1.9542642924086224, + "grad_norm": 49834.47265625, + "learning_rate": 4.6694232774690335e-05, + "loss": 2.2234, + "step": 10426 + }, + { + "epoch": 1.954451733833177, + "grad_norm": 46420.69140625, + "learning_rate": 4.6686392098168644e-05, + "loss": 2.2048, + "step": 10427 + }, + { + "epoch": 1.9546391752577321, + "grad_norm": 51438.26171875, + "learning_rate": 4.6678551503488774e-05, + "loss": 2.181, + "step": 10428 + }, + { + "epoch": 1.9548266166822867, + "grad_norm": 50549.50390625, + "learning_rate": 4.6670710990844404e-05, + "loss": 2.2317, + "step": 10429 + }, + { + "epoch": 1.9550140581068416, + "grad_norm": 50137.8828125, + "learning_rate": 4.6662870560429126e-05, + "loss": 2.2067, + "step": 10430 + }, + { + "epoch": 1.9552014995313964, + "grad_norm": 52001.0234375, + "learning_rate": 4.665503021243664e-05, + "loss": 2.1635, + "step": 10431 + }, + { + "epoch": 1.9553889409559513, + "grad_norm": 52135.91015625, + "learning_rate": 4.6647189947060545e-05, + "loss": 2.2454, + "step": 10432 + }, + { + "epoch": 1.9555763823805061, + "grad_norm": 50892.33203125, + "learning_rate": 4.663934976449455e-05, + "loss": 2.2021, + "step": 10433 + }, + { + "epoch": 1.9557638238050608, + "grad_norm": 53902.8828125, + "learning_rate": 4.6631509664932214e-05, + "loss": 2.226, + "step": 10434 + }, + { + "epoch": 1.9559512652296158, + "grad_norm": 52844.62890625, + "learning_rate": 4.662366964856724e-05, + "loss": 2.2144, + "step": 10435 + }, + { + "epoch": 1.9561387066541704, + "grad_norm": 51711.69921875, + "learning_rate": 4.661582971559326e-05, + "loss": 2.2824, + "step": 10436 + }, + { + "epoch": 1.9563261480787255, + "grad_norm": 55944.91015625, + "learning_rate": 4.660798986620389e-05, + "loss": 2.1843, + "step": 10437 + }, + { + "epoch": 1.9565135895032801, + "grad_norm": 54594.8984375, + "learning_rate": 4.660015010059277e-05, + "loss": 2.2473, + "step": 10438 + }, + { + "epoch": 1.9567010309278352, + "grad_norm": 49274.7421875, + "learning_rate": 4.659231041895354e-05, + "loss": 2.2558, + "step": 10439 + }, + { + "epoch": 1.9568884723523898, + "grad_norm": 53932.08984375, + "learning_rate": 4.658447082147983e-05, + "loss": 2.1562, + "step": 10440 + }, + { + "epoch": 1.9570759137769447, + "grad_norm": 48811.63671875, + "learning_rate": 4.657663130836525e-05, + "loss": 2.1593, + "step": 10441 + }, + { + "epoch": 1.9572633552014995, + "grad_norm": 50799.03125, + "learning_rate": 4.656879187980343e-05, + "loss": 2.2114, + "step": 10442 + }, + { + "epoch": 1.9574507966260544, + "grad_norm": 54294.76171875, + "learning_rate": 4.6560952535988015e-05, + "loss": 2.262, + "step": 10443 + }, + { + "epoch": 1.9576382380506092, + "grad_norm": 53213.0859375, + "learning_rate": 4.655311327711262e-05, + "loss": 2.1793, + "step": 10444 + }, + { + "epoch": 1.957825679475164, + "grad_norm": 48040.5703125, + "learning_rate": 4.654527410337085e-05, + "loss": 2.2322, + "step": 10445 + }, + { + "epoch": 1.958013120899719, + "grad_norm": 50905.75390625, + "learning_rate": 4.653743501495633e-05, + "loss": 2.2818, + "step": 10446 + }, + { + "epoch": 1.9582005623242735, + "grad_norm": 50005.92578125, + "learning_rate": 4.6529596012062676e-05, + "loss": 2.2578, + "step": 10447 + }, + { + "epoch": 1.9583880037488286, + "grad_norm": 55165.26953125, + "learning_rate": 4.6521757094883514e-05, + "loss": 2.1785, + "step": 10448 + }, + { + "epoch": 1.9585754451733832, + "grad_norm": 48000.0703125, + "learning_rate": 4.651391826361242e-05, + "loss": 2.1801, + "step": 10449 + }, + { + "epoch": 1.9587628865979383, + "grad_norm": 52078.36328125, + "learning_rate": 4.6506079518443023e-05, + "loss": 2.2252, + "step": 10450 + }, + { + "epoch": 1.958950328022493, + "grad_norm": 52907.13671875, + "learning_rate": 4.649824085956897e-05, + "loss": 2.2311, + "step": 10451 + }, + { + "epoch": 1.9591377694470478, + "grad_norm": 48152.34375, + "learning_rate": 4.649040228718379e-05, + "loss": 2.2085, + "step": 10452 + }, + { + "epoch": 1.9593252108716026, + "grad_norm": 49839.30859375, + "learning_rate": 4.6482563801481135e-05, + "loss": 2.1865, + "step": 10453 + }, + { + "epoch": 1.9595126522961575, + "grad_norm": 48854.78515625, + "learning_rate": 4.6474725402654587e-05, + "loss": 2.1831, + "step": 10454 + }, + { + "epoch": 1.9597000937207123, + "grad_norm": 52091.68359375, + "learning_rate": 4.6466887090897774e-05, + "loss": 2.1906, + "step": 10455 + }, + { + "epoch": 1.9598875351452671, + "grad_norm": 47754.70703125, + "learning_rate": 4.645904886640425e-05, + "loss": 2.1982, + "step": 10456 + }, + { + "epoch": 1.960074976569822, + "grad_norm": 50769.98046875, + "learning_rate": 4.645121072936762e-05, + "loss": 2.2471, + "step": 10457 + }, + { + "epoch": 1.9602624179943766, + "grad_norm": 53217.4375, + "learning_rate": 4.644337267998149e-05, + "loss": 2.158, + "step": 10458 + }, + { + "epoch": 1.9604498594189317, + "grad_norm": 49268.0859375, + "learning_rate": 4.6435534718439454e-05, + "loss": 2.2143, + "step": 10459 + }, + { + "epoch": 1.9606373008434863, + "grad_norm": 48428.61328125, + "learning_rate": 4.642769684493509e-05, + "loss": 2.1745, + "step": 10460 + }, + { + "epoch": 1.9608247422680414, + "grad_norm": 54609.72265625, + "learning_rate": 4.6419859059661965e-05, + "loss": 2.1855, + "step": 10461 + }, + { + "epoch": 1.961012183692596, + "grad_norm": 50483.6875, + "learning_rate": 4.6412021362813694e-05, + "loss": 2.1961, + "step": 10462 + }, + { + "epoch": 1.9611996251171508, + "grad_norm": 48962.53515625, + "learning_rate": 4.640418375458385e-05, + "loss": 2.1837, + "step": 10463 + }, + { + "epoch": 1.9613870665417057, + "grad_norm": 48883.453125, + "learning_rate": 4.639634623516599e-05, + "loss": 2.2207, + "step": 10464 + }, + { + "epoch": 1.9615745079662605, + "grad_norm": 62293.2734375, + "learning_rate": 4.63885088047537e-05, + "loss": 2.1169, + "step": 10465 + }, + { + "epoch": 1.9617619493908154, + "grad_norm": 50793.80859375, + "learning_rate": 4.638067146354059e-05, + "loss": 2.1584, + "step": 10466 + }, + { + "epoch": 1.9619493908153702, + "grad_norm": 54370.5, + "learning_rate": 4.637283421172017e-05, + "loss": 2.2031, + "step": 10467 + }, + { + "epoch": 1.962136832239925, + "grad_norm": 48905.66796875, + "learning_rate": 4.6364997049486056e-05, + "loss": 2.2206, + "step": 10468 + }, + { + "epoch": 1.9623242736644797, + "grad_norm": 52705.55859375, + "learning_rate": 4.63571599770318e-05, + "loss": 2.2811, + "step": 10469 + }, + { + "epoch": 1.9625117150890348, + "grad_norm": 51072.55078125, + "learning_rate": 4.634932299455098e-05, + "loss": 2.2212, + "step": 10470 + }, + { + "epoch": 1.9626991565135894, + "grad_norm": 54416.08203125, + "learning_rate": 4.634148610223714e-05, + "loss": 2.1934, + "step": 10471 + }, + { + "epoch": 1.9628865979381445, + "grad_norm": 49707.86328125, + "learning_rate": 4.633364930028384e-05, + "loss": 2.2839, + "step": 10472 + }, + { + "epoch": 1.963074039362699, + "grad_norm": 53932.09765625, + "learning_rate": 4.6325812588884656e-05, + "loss": 2.148, + "step": 10473 + }, + { + "epoch": 1.963261480787254, + "grad_norm": 49844.4765625, + "learning_rate": 4.631797596823315e-05, + "loss": 2.1809, + "step": 10474 + }, + { + "epoch": 1.9634489222118088, + "grad_norm": 51672.6796875, + "learning_rate": 4.631013943852285e-05, + "loss": 2.1989, + "step": 10475 + }, + { + "epoch": 1.9636363636363636, + "grad_norm": 51535.5, + "learning_rate": 4.630230299994732e-05, + "loss": 2.1808, + "step": 10476 + }, + { + "epoch": 1.9638238050609185, + "grad_norm": 57315.0390625, + "learning_rate": 4.629446665270013e-05, + "loss": 2.1531, + "step": 10477 + }, + { + "epoch": 1.9640112464854733, + "grad_norm": 53755.50390625, + "learning_rate": 4.62866303969748e-05, + "loss": 2.2033, + "step": 10478 + }, + { + "epoch": 1.9641986879100282, + "grad_norm": 49711.79296875, + "learning_rate": 4.627879423296488e-05, + "loss": 2.2165, + "step": 10479 + }, + { + "epoch": 1.9643861293345828, + "grad_norm": 49141.16796875, + "learning_rate": 4.6270958160863906e-05, + "loss": 2.302, + "step": 10480 + }, + { + "epoch": 1.9645735707591379, + "grad_norm": 50919.33984375, + "learning_rate": 4.626312218086546e-05, + "loss": 2.2927, + "step": 10481 + }, + { + "epoch": 1.9647610121836925, + "grad_norm": 53264.671875, + "learning_rate": 4.625528629316302e-05, + "loss": 2.1915, + "step": 10482 + }, + { + "epoch": 1.9649484536082475, + "grad_norm": 71625.40625, + "learning_rate": 4.6247450497950175e-05, + "loss": 2.4929, + "step": 10483 + }, + { + "epoch": 1.9651358950328022, + "grad_norm": 54883.72265625, + "learning_rate": 4.6239614795420426e-05, + "loss": 2.208, + "step": 10484 + }, + { + "epoch": 1.9653233364573572, + "grad_norm": 47780.6484375, + "learning_rate": 4.623177918576733e-05, + "loss": 2.2012, + "step": 10485 + }, + { + "epoch": 1.9655107778819119, + "grad_norm": 50177.796875, + "learning_rate": 4.6223943669184394e-05, + "loss": 2.1902, + "step": 10486 + }, + { + "epoch": 1.9656982193064667, + "grad_norm": 53005.765625, + "learning_rate": 4.6216108245865145e-05, + "loss": 2.2545, + "step": 10487 + }, + { + "epoch": 1.9658856607310216, + "grad_norm": 52639.89453125, + "learning_rate": 4.620827291600312e-05, + "loss": 2.2171, + "step": 10488 + }, + { + "epoch": 1.9660731021555764, + "grad_norm": 53238.66796875, + "learning_rate": 4.6200437679791855e-05, + "loss": 2.2287, + "step": 10489 + }, + { + "epoch": 1.9662605435801312, + "grad_norm": 51562.33984375, + "learning_rate": 4.619260253742484e-05, + "loss": 2.2377, + "step": 10490 + }, + { + "epoch": 1.9664479850046859, + "grad_norm": 49491.01953125, + "learning_rate": 4.61847674890956e-05, + "loss": 2.2226, + "step": 10491 + }, + { + "epoch": 1.966635426429241, + "grad_norm": 48528.140625, + "learning_rate": 4.617693253499768e-05, + "loss": 2.1577, + "step": 10492 + }, + { + "epoch": 1.9668228678537956, + "grad_norm": 49770.984375, + "learning_rate": 4.616909767532455e-05, + "loss": 2.2407, + "step": 10493 + }, + { + "epoch": 1.9670103092783506, + "grad_norm": 51163.3125, + "learning_rate": 4.616126291026974e-05, + "loss": 2.1816, + "step": 10494 + }, + { + "epoch": 1.9671977507029053, + "grad_norm": 48385.2890625, + "learning_rate": 4.615342824002675e-05, + "loss": 2.2262, + "step": 10495 + }, + { + "epoch": 1.9673851921274603, + "grad_norm": 50823.37109375, + "learning_rate": 4.6145593664789135e-05, + "loss": 2.2636, + "step": 10496 + }, + { + "epoch": 1.967572633552015, + "grad_norm": 51582.23828125, + "learning_rate": 4.613775918475032e-05, + "loss": 2.2581, + "step": 10497 + }, + { + "epoch": 1.9677600749765698, + "grad_norm": 47347.9140625, + "learning_rate": 4.612992480010385e-05, + "loss": 2.2465, + "step": 10498 + }, + { + "epoch": 1.9679475164011246, + "grad_norm": 46377.99609375, + "learning_rate": 4.612209051104323e-05, + "loss": 2.2345, + "step": 10499 + }, + { + "epoch": 1.9681349578256795, + "grad_norm": 51676.3125, + "learning_rate": 4.611425631776195e-05, + "loss": 2.1669, + "step": 10500 + }, + { + "epoch": 1.9681349578256795, + "eval_loss": 2.279844284057617, + "eval_runtime": 131.7288, + "eval_samples_per_second": 38.329, + "eval_steps_per_second": 1.921, + "step": 10500 + }, + { + "epoch": 1.9683223992502343, + "grad_norm": 50897.4453125, + "learning_rate": 4.61064222204535e-05, + "loss": 2.198, + "step": 10501 + }, + { + "epoch": 1.9685098406747892, + "grad_norm": 52623.63671875, + "learning_rate": 4.6098588219311365e-05, + "loss": 2.1303, + "step": 10502 + }, + { + "epoch": 1.968697282099344, + "grad_norm": 55152.33984375, + "learning_rate": 4.609075431452903e-05, + "loss": 2.1869, + "step": 10503 + }, + { + "epoch": 1.9688847235238986, + "grad_norm": 50326.3046875, + "learning_rate": 4.608292050630004e-05, + "loss": 2.2633, + "step": 10504 + }, + { + "epoch": 1.9690721649484537, + "grad_norm": 53586.4765625, + "learning_rate": 4.607508679481779e-05, + "loss": 2.2316, + "step": 10505 + }, + { + "epoch": 1.9692596063730083, + "grad_norm": 52731.59375, + "learning_rate": 4.606725318027582e-05, + "loss": 2.2354, + "step": 10506 + }, + { + "epoch": 1.9694470477975634, + "grad_norm": 53259.42578125, + "learning_rate": 4.605941966286761e-05, + "loss": 2.1339, + "step": 10507 + }, + { + "epoch": 1.969634489222118, + "grad_norm": 50087.90234375, + "learning_rate": 4.605158624278661e-05, + "loss": 2.1942, + "step": 10508 + }, + { + "epoch": 1.9698219306466729, + "grad_norm": 46018.6796875, + "learning_rate": 4.604375292022632e-05, + "loss": 2.2034, + "step": 10509 + }, + { + "epoch": 1.9700093720712277, + "grad_norm": 52230.75390625, + "learning_rate": 4.603591969538019e-05, + "loss": 2.2308, + "step": 10510 + }, + { + "epoch": 1.9701968134957826, + "grad_norm": 48866.69140625, + "learning_rate": 4.602808656844173e-05, + "loss": 2.3136, + "step": 10511 + }, + { + "epoch": 1.9703842549203374, + "grad_norm": 50616.171875, + "learning_rate": 4.602025353960436e-05, + "loss": 2.2018, + "step": 10512 + }, + { + "epoch": 1.9705716963448923, + "grad_norm": 51012.91796875, + "learning_rate": 4.6012420609061577e-05, + "loss": 2.097, + "step": 10513 + }, + { + "epoch": 1.9707591377694471, + "grad_norm": 53316.75390625, + "learning_rate": 4.6004587777006824e-05, + "loss": 2.1932, + "step": 10514 + }, + { + "epoch": 1.9709465791940017, + "grad_norm": 57261.16015625, + "learning_rate": 4.599675504363359e-05, + "loss": 2.2548, + "step": 10515 + }, + { + "epoch": 1.9711340206185568, + "grad_norm": 53879.65625, + "learning_rate": 4.598892240913531e-05, + "loss": 2.2024, + "step": 10516 + }, + { + "epoch": 1.9713214620431114, + "grad_norm": 53296.21875, + "learning_rate": 4.598108987370544e-05, + "loss": 2.2532, + "step": 10517 + }, + { + "epoch": 1.9715089034676665, + "grad_norm": 47817.875, + "learning_rate": 4.5973257437537456e-05, + "loss": 2.2732, + "step": 10518 + }, + { + "epoch": 1.9716963448922211, + "grad_norm": 51963.015625, + "learning_rate": 4.596542510082478e-05, + "loss": 2.1288, + "step": 10519 + }, + { + "epoch": 1.971883786316776, + "grad_norm": 52607.80859375, + "learning_rate": 4.595759286376086e-05, + "loss": 2.2026, + "step": 10520 + }, + { + "epoch": 1.9720712277413308, + "grad_norm": 50071.96875, + "learning_rate": 4.5949760726539165e-05, + "loss": 2.2055, + "step": 10521 + }, + { + "epoch": 1.9722586691658857, + "grad_norm": 51019.328125, + "learning_rate": 4.594192868935314e-05, + "loss": 2.1977, + "step": 10522 + }, + { + "epoch": 1.9724461105904405, + "grad_norm": 51322.88671875, + "learning_rate": 4.5934096752396206e-05, + "loss": 2.1857, + "step": 10523 + }, + { + "epoch": 1.9726335520149954, + "grad_norm": 55778.7578125, + "learning_rate": 4.592626491586181e-05, + "loss": 2.1898, + "step": 10524 + }, + { + "epoch": 1.9728209934395502, + "grad_norm": 55648.171875, + "learning_rate": 4.5918433179943375e-05, + "loss": 2.1825, + "step": 10525 + }, + { + "epoch": 1.9730084348641048, + "grad_norm": 49663.4296875, + "learning_rate": 4.5910601544834384e-05, + "loss": 2.2141, + "step": 10526 + }, + { + "epoch": 1.97319587628866, + "grad_norm": 53750.78125, + "learning_rate": 4.59027700107282e-05, + "loss": 2.3193, + "step": 10527 + }, + { + "epoch": 1.9733833177132145, + "grad_norm": 56884.9296875, + "learning_rate": 4.5894938577818294e-05, + "loss": 2.2165, + "step": 10528 + }, + { + "epoch": 1.9735707591377696, + "grad_norm": 52634.15234375, + "learning_rate": 4.588710724629809e-05, + "loss": 2.2095, + "step": 10529 + }, + { + "epoch": 1.9737582005623242, + "grad_norm": 55238.4296875, + "learning_rate": 4.587927601636101e-05, + "loss": 2.1859, + "step": 10530 + }, + { + "epoch": 1.973945641986879, + "grad_norm": 46851.1796875, + "learning_rate": 4.587144488820046e-05, + "loss": 2.2038, + "step": 10531 + }, + { + "epoch": 1.974133083411434, + "grad_norm": 56680.2734375, + "learning_rate": 4.586361386200987e-05, + "loss": 2.1904, + "step": 10532 + }, + { + "epoch": 1.9743205248359887, + "grad_norm": 49701.921875, + "learning_rate": 4.585578293798266e-05, + "loss": 2.2279, + "step": 10533 + }, + { + "epoch": 1.9745079662605436, + "grad_norm": 52942.90234375, + "learning_rate": 4.5847952116312234e-05, + "loss": 2.2017, + "step": 10534 + }, + { + "epoch": 1.9746954076850984, + "grad_norm": 47861.453125, + "learning_rate": 4.584012139719199e-05, + "loss": 2.2015, + "step": 10535 + }, + { + "epoch": 1.9748828491096533, + "grad_norm": 49856.0859375, + "learning_rate": 4.5832290780815365e-05, + "loss": 2.2157, + "step": 10536 + }, + { + "epoch": 1.975070290534208, + "grad_norm": 52732.640625, + "learning_rate": 4.5824460267375774e-05, + "loss": 2.0562, + "step": 10537 + }, + { + "epoch": 1.975257731958763, + "grad_norm": 47302.47265625, + "learning_rate": 4.5816629857066585e-05, + "loss": 2.1702, + "step": 10538 + }, + { + "epoch": 1.9754451733833176, + "grad_norm": 53415.9453125, + "learning_rate": 4.580879955008121e-05, + "loss": 2.1938, + "step": 10539 + }, + { + "epoch": 1.9756326148078727, + "grad_norm": 52130.05078125, + "learning_rate": 4.580096934661306e-05, + "loss": 2.2422, + "step": 10540 + }, + { + "epoch": 1.9758200562324273, + "grad_norm": 51808.80078125, + "learning_rate": 4.579313924685553e-05, + "loss": 2.2477, + "step": 10541 + }, + { + "epoch": 1.9760074976569824, + "grad_norm": 53253.4296875, + "learning_rate": 4.578530925100199e-05, + "loss": 2.2339, + "step": 10542 + }, + { + "epoch": 1.976194939081537, + "grad_norm": 49028.78125, + "learning_rate": 4.5777479359245846e-05, + "loss": 2.2383, + "step": 10543 + }, + { + "epoch": 1.9763823805060918, + "grad_norm": 53046.09375, + "learning_rate": 4.5769649571780494e-05, + "loss": 2.2221, + "step": 10544 + }, + { + "epoch": 1.9765698219306467, + "grad_norm": 53034.0546875, + "learning_rate": 4.576181988879932e-05, + "loss": 2.1998, + "step": 10545 + }, + { + "epoch": 1.9767572633552015, + "grad_norm": 51172.96484375, + "learning_rate": 4.575399031049569e-05, + "loss": 2.166, + "step": 10546 + }, + { + "epoch": 1.9769447047797564, + "grad_norm": 52464.6875, + "learning_rate": 4.5746160837063e-05, + "loss": 2.2511, + "step": 10547 + }, + { + "epoch": 1.977132146204311, + "grad_norm": 53231.0078125, + "learning_rate": 4.573833146869463e-05, + "loss": 2.2602, + "step": 10548 + }, + { + "epoch": 1.977319587628866, + "grad_norm": 45650.125, + "learning_rate": 4.573050220558393e-05, + "loss": 2.1789, + "step": 10549 + }, + { + "epoch": 1.9775070290534207, + "grad_norm": 48937.0859375, + "learning_rate": 4.572267304792429e-05, + "loss": 2.2135, + "step": 10550 + }, + { + "epoch": 1.9776944704779758, + "grad_norm": 54164.0703125, + "learning_rate": 4.571484399590909e-05, + "loss": 2.2746, + "step": 10551 + }, + { + "epoch": 1.9778819119025304, + "grad_norm": 50459.625, + "learning_rate": 4.570701504973169e-05, + "loss": 2.2737, + "step": 10552 + }, + { + "epoch": 1.9780693533270854, + "grad_norm": 48817.82421875, + "learning_rate": 4.569918620958545e-05, + "loss": 2.1602, + "step": 10553 + }, + { + "epoch": 1.97825679475164, + "grad_norm": 57698.87109375, + "learning_rate": 4.5691357475663735e-05, + "loss": 2.2173, + "step": 10554 + }, + { + "epoch": 1.978444236176195, + "grad_norm": 54422.96875, + "learning_rate": 4.5683528848159895e-05, + "loss": 2.2414, + "step": 10555 + }, + { + "epoch": 1.9786316776007498, + "grad_norm": 48613.65234375, + "learning_rate": 4.5675700327267314e-05, + "loss": 2.2241, + "step": 10556 + }, + { + "epoch": 1.9788191190253046, + "grad_norm": 57204.2890625, + "learning_rate": 4.566787191317932e-05, + "loss": 2.1706, + "step": 10557 + }, + { + "epoch": 1.9790065604498595, + "grad_norm": 52598.28125, + "learning_rate": 4.566004360608925e-05, + "loss": 2.2629, + "step": 10558 + }, + { + "epoch": 1.9791940018744143, + "grad_norm": 56380.34375, + "learning_rate": 4.5652215406190493e-05, + "loss": 2.2352, + "step": 10559 + }, + { + "epoch": 1.9793814432989691, + "grad_norm": 50483.5, + "learning_rate": 4.5644387313676396e-05, + "loss": 2.2138, + "step": 10560 + }, + { + "epoch": 1.9795688847235238, + "grad_norm": 53475.84765625, + "learning_rate": 4.563655932874027e-05, + "loss": 2.2383, + "step": 10561 + }, + { + "epoch": 1.9797563261480788, + "grad_norm": 52293.71484375, + "learning_rate": 4.562873145157547e-05, + "loss": 2.2313, + "step": 10562 + }, + { + "epoch": 1.9799437675726335, + "grad_norm": 51893.66796875, + "learning_rate": 4.562090368237534e-05, + "loss": 2.2502, + "step": 10563 + }, + { + "epoch": 1.9801312089971885, + "grad_norm": 51183.89453125, + "learning_rate": 4.561307602133321e-05, + "loss": 2.2089, + "step": 10564 + }, + { + "epoch": 1.9803186504217432, + "grad_norm": 54313.921875, + "learning_rate": 4.5605248468642404e-05, + "loss": 2.1557, + "step": 10565 + }, + { + "epoch": 1.980506091846298, + "grad_norm": 50075.0546875, + "learning_rate": 4.559742102449627e-05, + "loss": 2.2203, + "step": 10566 + }, + { + "epoch": 1.9806935332708528, + "grad_norm": 50591.7421875, + "learning_rate": 4.5589593689088146e-05, + "loss": 2.2616, + "step": 10567 + }, + { + "epoch": 1.9808809746954077, + "grad_norm": 51774.75390625, + "learning_rate": 4.558176646261132e-05, + "loss": 2.1986, + "step": 10568 + }, + { + "epoch": 1.9810684161199625, + "grad_norm": 51320.28515625, + "learning_rate": 4.5573939345259134e-05, + "loss": 2.1646, + "step": 10569 + }, + { + "epoch": 1.9812558575445174, + "grad_norm": 53224.37890625, + "learning_rate": 4.5566112337224914e-05, + "loss": 2.1932, + "step": 10570 + }, + { + "epoch": 1.9814432989690722, + "grad_norm": 47811.1875, + "learning_rate": 4.555828543870198e-05, + "loss": 2.1409, + "step": 10571 + }, + { + "epoch": 1.9816307403936269, + "grad_norm": 47791.12890625, + "learning_rate": 4.5550458649883625e-05, + "loss": 2.2459, + "step": 10572 + }, + { + "epoch": 1.981818181818182, + "grad_norm": 52657.53125, + "learning_rate": 4.554263197096316e-05, + "loss": 2.208, + "step": 10573 + }, + { + "epoch": 1.9820056232427365, + "grad_norm": 52833.24609375, + "learning_rate": 4.553480540213394e-05, + "loss": 2.2498, + "step": 10574 + }, + { + "epoch": 1.9821930646672916, + "grad_norm": 54625.625, + "learning_rate": 4.552697894358921e-05, + "loss": 2.0928, + "step": 10575 + }, + { + "epoch": 1.9823805060918462, + "grad_norm": 55476.01953125, + "learning_rate": 4.55191525955223e-05, + "loss": 2.1951, + "step": 10576 + }, + { + "epoch": 1.982567947516401, + "grad_norm": 53069.1015625, + "learning_rate": 4.551132635812652e-05, + "loss": 2.2234, + "step": 10577 + }, + { + "epoch": 1.982755388940956, + "grad_norm": 50624.54296875, + "learning_rate": 4.550350023159517e-05, + "loss": 2.2086, + "step": 10578 + }, + { + "epoch": 1.9829428303655108, + "grad_norm": 51677.8671875, + "learning_rate": 4.549567421612152e-05, + "loss": 2.2388, + "step": 10579 + }, + { + "epoch": 1.9831302717900656, + "grad_norm": 52659.52734375, + "learning_rate": 4.548784831189887e-05, + "loss": 2.1937, + "step": 10580 + }, + { + "epoch": 1.9833177132146205, + "grad_norm": 50668.8046875, + "learning_rate": 4.5480022519120525e-05, + "loss": 2.2073, + "step": 10581 + }, + { + "epoch": 1.9835051546391753, + "grad_norm": 49945.76171875, + "learning_rate": 4.5472196837979775e-05, + "loss": 2.2933, + "step": 10582 + }, + { + "epoch": 1.98369259606373, + "grad_norm": 51288.77734375, + "learning_rate": 4.546437126866988e-05, + "loss": 2.261, + "step": 10583 + }, + { + "epoch": 1.983880037488285, + "grad_norm": 58300.68359375, + "learning_rate": 4.545654581138414e-05, + "loss": 2.16, + "step": 10584 + }, + { + "epoch": 1.9840674789128396, + "grad_norm": 48724.9375, + "learning_rate": 4.544872046631582e-05, + "loss": 2.215, + "step": 10585 + }, + { + "epoch": 1.9842549203373947, + "grad_norm": 52775.00390625, + "learning_rate": 4.544089523365822e-05, + "loss": 2.1706, + "step": 10586 + }, + { + "epoch": 1.9844423617619493, + "grad_norm": 52733.84375, + "learning_rate": 4.543307011360458e-05, + "loss": 2.2202, + "step": 10587 + }, + { + "epoch": 1.9846298031865042, + "grad_norm": 53235.16015625, + "learning_rate": 4.542524510634818e-05, + "loss": 2.2207, + "step": 10588 + }, + { + "epoch": 1.984817244611059, + "grad_norm": 53880.01171875, + "learning_rate": 4.541742021208233e-05, + "loss": 2.2502, + "step": 10589 + }, + { + "epoch": 1.9850046860356139, + "grad_norm": 52804.87109375, + "learning_rate": 4.5409595431000226e-05, + "loss": 2.1784, + "step": 10590 + }, + { + "epoch": 1.9851921274601687, + "grad_norm": 52017.69140625, + "learning_rate": 4.540177076329517e-05, + "loss": 2.229, + "step": 10591 + }, + { + "epoch": 1.9853795688847236, + "grad_norm": 53585.71875, + "learning_rate": 4.539394620916041e-05, + "loss": 2.2146, + "step": 10592 + }, + { + "epoch": 1.9855670103092784, + "grad_norm": 56040.5546875, + "learning_rate": 4.538612176878923e-05, + "loss": 2.1106, + "step": 10593 + }, + { + "epoch": 1.985754451733833, + "grad_norm": 49465.2578125, + "learning_rate": 4.5378297442374846e-05, + "loss": 2.243, + "step": 10594 + }, + { + "epoch": 1.985941893158388, + "grad_norm": 52046.7734375, + "learning_rate": 4.537047323011052e-05, + "loss": 2.1781, + "step": 10595 + }, + { + "epoch": 1.9861293345829427, + "grad_norm": 51317.8984375, + "learning_rate": 4.536264913218949e-05, + "loss": 2.2172, + "step": 10596 + }, + { + "epoch": 1.9863167760074978, + "grad_norm": 53631.03515625, + "learning_rate": 4.535482514880505e-05, + "loss": 2.2454, + "step": 10597 + }, + { + "epoch": 1.9865042174320524, + "grad_norm": 55789.6953125, + "learning_rate": 4.5347001280150375e-05, + "loss": 2.2069, + "step": 10598 + }, + { + "epoch": 1.9866916588566073, + "grad_norm": 45937.54296875, + "learning_rate": 4.5339177526418744e-05, + "loss": 2.2641, + "step": 10599 + }, + { + "epoch": 1.986879100281162, + "grad_norm": 50610.98828125, + "learning_rate": 4.533135388780338e-05, + "loss": 2.2263, + "step": 10600 + }, + { + "epoch": 1.987066541705717, + "grad_norm": 52204.91796875, + "learning_rate": 4.5323530364497545e-05, + "loss": 2.2299, + "step": 10601 + }, + { + "epoch": 1.9872539831302718, + "grad_norm": 51613.21875, + "learning_rate": 4.5315706956694434e-05, + "loss": 2.2069, + "step": 10602 + }, + { + "epoch": 1.9874414245548266, + "grad_norm": 54648.84765625, + "learning_rate": 4.530788366458727e-05, + "loss": 2.1637, + "step": 10603 + }, + { + "epoch": 1.9876288659793815, + "grad_norm": 53125.61328125, + "learning_rate": 4.5300060488369335e-05, + "loss": 2.1661, + "step": 10604 + }, + { + "epoch": 1.9878163074039361, + "grad_norm": 47919.265625, + "learning_rate": 4.529223742823378e-05, + "loss": 2.3016, + "step": 10605 + }, + { + "epoch": 1.9880037488284912, + "grad_norm": 51928.7421875, + "learning_rate": 4.528441448437386e-05, + "loss": 2.2135, + "step": 10606 + }, + { + "epoch": 1.9881911902530458, + "grad_norm": 48481.875, + "learning_rate": 4.527659165698279e-05, + "loss": 2.1746, + "step": 10607 + }, + { + "epoch": 1.9883786316776009, + "grad_norm": 53913.71875, + "learning_rate": 4.52687689462538e-05, + "loss": 2.2013, + "step": 10608 + }, + { + "epoch": 1.9885660731021555, + "grad_norm": 58417.75, + "learning_rate": 4.526094635238006e-05, + "loss": 2.1795, + "step": 10609 + }, + { + "epoch": 1.9887535145267106, + "grad_norm": 49403.7421875, + "learning_rate": 4.5253123875554815e-05, + "loss": 2.2206, + "step": 10610 + }, + { + "epoch": 1.9889409559512652, + "grad_norm": 50949.10546875, + "learning_rate": 4.524530151597123e-05, + "loss": 2.2579, + "step": 10611 + }, + { + "epoch": 1.98912839737582, + "grad_norm": 51469.859375, + "learning_rate": 4.523747927382258e-05, + "loss": 2.1973, + "step": 10612 + }, + { + "epoch": 1.9893158388003749, + "grad_norm": 51406.6640625, + "learning_rate": 4.522965714930198e-05, + "loss": 2.2683, + "step": 10613 + }, + { + "epoch": 1.9895032802249297, + "grad_norm": 53376.75, + "learning_rate": 4.522183514260267e-05, + "loss": 2.1461, + "step": 10614 + }, + { + "epoch": 1.9896907216494846, + "grad_norm": 49000.171875, + "learning_rate": 4.521401325391785e-05, + "loss": 2.2607, + "step": 10615 + }, + { + "epoch": 1.9898781630740392, + "grad_norm": 52879.609375, + "learning_rate": 4.520619148344069e-05, + "loss": 2.1942, + "step": 10616 + }, + { + "epoch": 1.9900656044985943, + "grad_norm": 49213.90625, + "learning_rate": 4.519836983136438e-05, + "loss": 2.2817, + "step": 10617 + }, + { + "epoch": 1.990253045923149, + "grad_norm": 52658.51171875, + "learning_rate": 4.51905482978821e-05, + "loss": 2.2515, + "step": 10618 + }, + { + "epoch": 1.990440487347704, + "grad_norm": 52849.796875, + "learning_rate": 4.518272688318708e-05, + "loss": 2.3055, + "step": 10619 + }, + { + "epoch": 1.9906279287722586, + "grad_norm": 50739.63671875, + "learning_rate": 4.517490558747242e-05, + "loss": 2.1462, + "step": 10620 + }, + { + "epoch": 1.9908153701968136, + "grad_norm": 49397.39453125, + "learning_rate": 4.5167084410931346e-05, + "loss": 2.2653, + "step": 10621 + }, + { + "epoch": 1.9910028116213683, + "grad_norm": 50065.75, + "learning_rate": 4.5159263353757034e-05, + "loss": 2.1988, + "step": 10622 + }, + { + "epoch": 1.9911902530459231, + "grad_norm": 47995.03125, + "learning_rate": 4.515144241614264e-05, + "loss": 2.2141, + "step": 10623 + }, + { + "epoch": 1.991377694470478, + "grad_norm": 45139.3203125, + "learning_rate": 4.514362159828133e-05, + "loss": 2.2052, + "step": 10624 + }, + { + "epoch": 1.9915651358950328, + "grad_norm": 53917.66015625, + "learning_rate": 4.513580090036626e-05, + "loss": 2.308, + "step": 10625 + }, + { + "epoch": 1.9917525773195877, + "grad_norm": 52415.52734375, + "learning_rate": 4.51279803225906e-05, + "loss": 2.1938, + "step": 10626 + }, + { + "epoch": 1.9919400187441425, + "grad_norm": 51171.55859375, + "learning_rate": 4.512015986514753e-05, + "loss": 2.268, + "step": 10627 + }, + { + "epoch": 1.9921274601686974, + "grad_norm": 49512.6328125, + "learning_rate": 4.5112339528230154e-05, + "loss": 2.1867, + "step": 10628 + }, + { + "epoch": 1.992314901593252, + "grad_norm": 47582.35546875, + "learning_rate": 4.510451931203168e-05, + "loss": 2.1879, + "step": 10629 + }, + { + "epoch": 1.992502343017807, + "grad_norm": 56206.8984375, + "learning_rate": 4.509669921674523e-05, + "loss": 2.2167, + "step": 10630 + }, + { + "epoch": 1.9926897844423617, + "grad_norm": 51750.7734375, + "learning_rate": 4.508887924256394e-05, + "loss": 2.1512, + "step": 10631 + }, + { + "epoch": 1.9928772258669167, + "grad_norm": 48183.51171875, + "learning_rate": 4.5081059389680976e-05, + "loss": 2.2087, + "step": 10632 + }, + { + "epoch": 1.9930646672914714, + "grad_norm": 49516.0546875, + "learning_rate": 4.507323965828946e-05, + "loss": 2.1928, + "step": 10633 + }, + { + "epoch": 1.9932521087160262, + "grad_norm": 50095.39453125, + "learning_rate": 4.506542004858255e-05, + "loss": 2.1524, + "step": 10634 + }, + { + "epoch": 1.993439550140581, + "grad_norm": 52743.39453125, + "learning_rate": 4.5057600560753355e-05, + "loss": 2.3298, + "step": 10635 + }, + { + "epoch": 1.993626991565136, + "grad_norm": 54400.00390625, + "learning_rate": 4.5049781194995006e-05, + "loss": 2.168, + "step": 10636 + }, + { + "epoch": 1.9938144329896907, + "grad_norm": 54087.37109375, + "learning_rate": 4.504196195150066e-05, + "loss": 2.1658, + "step": 10637 + }, + { + "epoch": 1.9940018744142456, + "grad_norm": 48540.4609375, + "learning_rate": 4.503414283046343e-05, + "loss": 2.2963, + "step": 10638 + }, + { + "epoch": 1.9941893158388004, + "grad_norm": 45926.5546875, + "learning_rate": 4.502632383207642e-05, + "loss": 2.2384, + "step": 10639 + }, + { + "epoch": 1.994376757263355, + "grad_norm": 54289.28515625, + "learning_rate": 4.501850495653277e-05, + "loss": 2.2191, + "step": 10640 + }, + { + "epoch": 1.9945641986879101, + "grad_norm": 56436.45703125, + "learning_rate": 4.501068620402558e-05, + "loss": 2.2015, + "step": 10641 + }, + { + "epoch": 1.9947516401124648, + "grad_norm": 54417.76171875, + "learning_rate": 4.5002867574748e-05, + "loss": 2.3159, + "step": 10642 + }, + { + "epoch": 1.9949390815370198, + "grad_norm": 46584.8828125, + "learning_rate": 4.499504906889308e-05, + "loss": 2.28, + "step": 10643 + }, + { + "epoch": 1.9951265229615744, + "grad_norm": 47974.16796875, + "learning_rate": 4.498723068665397e-05, + "loss": 2.156, + "step": 10644 + }, + { + "epoch": 1.9953139643861293, + "grad_norm": 50623.8046875, + "learning_rate": 4.497941242822377e-05, + "loss": 2.2601, + "step": 10645 + }, + { + "epoch": 1.9955014058106841, + "grad_norm": 53518.125, + "learning_rate": 4.497159429379557e-05, + "loss": 2.2039, + "step": 10646 + }, + { + "epoch": 1.995688847235239, + "grad_norm": 53876.9765625, + "learning_rate": 4.496377628356247e-05, + "loss": 2.1896, + "step": 10647 + }, + { + "epoch": 1.9958762886597938, + "grad_norm": 53004.3359375, + "learning_rate": 4.4955958397717564e-05, + "loss": 2.1246, + "step": 10648 + }, + { + "epoch": 1.9960637300843487, + "grad_norm": 51081.39453125, + "learning_rate": 4.494814063645396e-05, + "loss": 2.2218, + "step": 10649 + }, + { + "epoch": 1.9962511715089035, + "grad_norm": 48776.82421875, + "learning_rate": 4.494032299996471e-05, + "loss": 2.266, + "step": 10650 + }, + { + "epoch": 1.9964386129334581, + "grad_norm": 49852.38671875, + "learning_rate": 4.493250548844292e-05, + "loss": 2.1758, + "step": 10651 + }, + { + "epoch": 1.9966260543580132, + "grad_norm": 53552.88671875, + "learning_rate": 4.4924688102081694e-05, + "loss": 2.2471, + "step": 10652 + }, + { + "epoch": 1.9968134957825678, + "grad_norm": 51296.54296875, + "learning_rate": 4.4916870841074086e-05, + "loss": 2.1847, + "step": 10653 + }, + { + "epoch": 1.997000937207123, + "grad_norm": 46274.71484375, + "learning_rate": 4.490905370561318e-05, + "loss": 2.177, + "step": 10654 + }, + { + "epoch": 1.9971883786316775, + "grad_norm": 52508.77734375, + "learning_rate": 4.490123669589204e-05, + "loss": 2.1408, + "step": 10655 + }, + { + "epoch": 1.9973758200562324, + "grad_norm": 48803.10546875, + "learning_rate": 4.489341981210373e-05, + "loss": 2.2129, + "step": 10656 + }, + { + "epoch": 1.9975632614807872, + "grad_norm": 56835.95703125, + "learning_rate": 4.4885603054441366e-05, + "loss": 2.1503, + "step": 10657 + }, + { + "epoch": 1.997750702905342, + "grad_norm": 49993.34765625, + "learning_rate": 4.4877786423097945e-05, + "loss": 2.2372, + "step": 10658 + }, + { + "epoch": 1.997938144329897, + "grad_norm": 47630.40234375, + "learning_rate": 4.4869969918266566e-05, + "loss": 2.2401, + "step": 10659 + }, + { + "epoch": 1.9981255857544518, + "grad_norm": 48436.8828125, + "learning_rate": 4.48621535401403e-05, + "loss": 2.2365, + "step": 10660 + }, + { + "epoch": 1.9983130271790066, + "grad_norm": 49497.07421875, + "learning_rate": 4.4854337288912154e-05, + "loss": 2.2283, + "step": 10661 + }, + { + "epoch": 1.9985004686035612, + "grad_norm": 53899.7421875, + "learning_rate": 4.484652116477521e-05, + "loss": 2.2426, + "step": 10662 + }, + { + "epoch": 1.9986879100281163, + "grad_norm": 53765.234375, + "learning_rate": 4.4838705167922515e-05, + "loss": 2.2226, + "step": 10663 + }, + { + "epoch": 1.998875351452671, + "grad_norm": 44745.63671875, + "learning_rate": 4.483088929854712e-05, + "loss": 2.2182, + "step": 10664 + }, + { + "epoch": 1.999062792877226, + "grad_norm": 53360.41796875, + "learning_rate": 4.482307355684204e-05, + "loss": 2.1851, + "step": 10665 + }, + { + "epoch": 1.9992502343017806, + "grad_norm": 51824.1171875, + "learning_rate": 4.481525794300033e-05, + "loss": 2.1866, + "step": 10666 + }, + { + "epoch": 1.9994376757263357, + "grad_norm": 52193.93359375, + "learning_rate": 4.480744245721504e-05, + "loss": 2.2101, + "step": 10667 + }, + { + "epoch": 1.9996251171508903, + "grad_norm": 51766.26171875, + "learning_rate": 4.4799627099679194e-05, + "loss": 2.1638, + "step": 10668 + }, + { + "epoch": 1.9998125585754452, + "grad_norm": 49506.8125, + "learning_rate": 4.479181187058581e-05, + "loss": 2.2542, + "step": 10669 + }, + { + "epoch": 2.0, + "grad_norm": 69274.734375, + "learning_rate": 4.478399677012791e-05, + "loss": 2.1865, + "step": 10670 + }, + { + "epoch": 2.0001874414245546, + "grad_norm": 50754.83984375, + "learning_rate": 4.4776181798498544e-05, + "loss": 2.1591, + "step": 10671 + }, + { + "epoch": 2.0003748828491097, + "grad_norm": 51622.703125, + "learning_rate": 4.4768366955890705e-05, + "loss": 2.0958, + "step": 10672 + }, + { + "epoch": 2.0005623242736643, + "grad_norm": 48464.4140625, + "learning_rate": 4.47605522424974e-05, + "loss": 2.0884, + "step": 10673 + }, + { + "epoch": 2.0007497656982194, + "grad_norm": 48876.92578125, + "learning_rate": 4.475273765851168e-05, + "loss": 2.1397, + "step": 10674 + }, + { + "epoch": 2.000937207122774, + "grad_norm": 53878.765625, + "learning_rate": 4.474492320412655e-05, + "loss": 2.1757, + "step": 10675 + }, + { + "epoch": 2.001124648547329, + "grad_norm": 55768.74609375, + "learning_rate": 4.4737108879534995e-05, + "loss": 2.1175, + "step": 10676 + }, + { + "epoch": 2.0013120899718837, + "grad_norm": 51254.921875, + "learning_rate": 4.472929468493002e-05, + "loss": 2.11, + "step": 10677 + }, + { + "epoch": 2.0014995313964388, + "grad_norm": 47553.28125, + "learning_rate": 4.472148062050464e-05, + "loss": 2.2931, + "step": 10678 + }, + { + "epoch": 2.0016869728209934, + "grad_norm": 51580.80859375, + "learning_rate": 4.471366668645185e-05, + "loss": 2.1491, + "step": 10679 + }, + { + "epoch": 2.0018744142455485, + "grad_norm": 53136.58203125, + "learning_rate": 4.4705852882964624e-05, + "loss": 2.1109, + "step": 10680 + }, + { + "epoch": 2.002061855670103, + "grad_norm": 53357.8671875, + "learning_rate": 4.469803921023597e-05, + "loss": 2.1878, + "step": 10681 + }, + { + "epoch": 2.0022492970946577, + "grad_norm": 52732.06640625, + "learning_rate": 4.469022566845887e-05, + "loss": 2.182, + "step": 10682 + }, + { + "epoch": 2.0024367385192128, + "grad_norm": 49065.0703125, + "learning_rate": 4.468241225782633e-05, + "loss": 2.1765, + "step": 10683 + }, + { + "epoch": 2.0026241799437674, + "grad_norm": 51728.984375, + "learning_rate": 4.4674598978531304e-05, + "loss": 2.1581, + "step": 10684 + }, + { + "epoch": 2.0028116213683225, + "grad_norm": 51295.87890625, + "learning_rate": 4.4666785830766773e-05, + "loss": 2.089, + "step": 10685 + }, + { + "epoch": 2.002999062792877, + "grad_norm": 51178.5625, + "learning_rate": 4.465897281472573e-05, + "loss": 2.1447, + "step": 10686 + }, + { + "epoch": 2.003186504217432, + "grad_norm": 52424.03515625, + "learning_rate": 4.465115993060112e-05, + "loss": 2.1176, + "step": 10687 + }, + { + "epoch": 2.003373945641987, + "grad_norm": 52027.484375, + "learning_rate": 4.464334717858593e-05, + "loss": 2.143, + "step": 10688 + }, + { + "epoch": 2.003561387066542, + "grad_norm": 54223.5546875, + "learning_rate": 4.463553455887311e-05, + "loss": 2.1933, + "step": 10689 + }, + { + "epoch": 2.0037488284910965, + "grad_norm": 53122.78125, + "learning_rate": 4.462772207165566e-05, + "loss": 2.1146, + "step": 10690 + }, + { + "epoch": 2.0039362699156515, + "grad_norm": 55320.66015625, + "learning_rate": 4.461990971712648e-05, + "loss": 2.1816, + "step": 10691 + }, + { + "epoch": 2.004123711340206, + "grad_norm": 50344.609375, + "learning_rate": 4.461209749547856e-05, + "loss": 2.1411, + "step": 10692 + }, + { + "epoch": 2.004311152764761, + "grad_norm": 49128.72265625, + "learning_rate": 4.460428540690485e-05, + "loss": 2.1509, + "step": 10693 + }, + { + "epoch": 2.004498594189316, + "grad_norm": 50342.421875, + "learning_rate": 4.459647345159831e-05, + "loss": 2.1644, + "step": 10694 + }, + { + "epoch": 2.0046860356138705, + "grad_norm": 51082.29296875, + "learning_rate": 4.458866162975185e-05, + "loss": 2.1086, + "step": 10695 + }, + { + "epoch": 2.0048734770384256, + "grad_norm": 50928.63671875, + "learning_rate": 4.458084994155843e-05, + "loss": 2.146, + "step": 10696 + }, + { + "epoch": 2.00506091846298, + "grad_norm": 49849.77734375, + "learning_rate": 4.4573038387210994e-05, + "loss": 2.1651, + "step": 10697 + }, + { + "epoch": 2.0052483598875352, + "grad_norm": 49905.1015625, + "learning_rate": 4.456522696690249e-05, + "loss": 2.1199, + "step": 10698 + }, + { + "epoch": 2.00543580131209, + "grad_norm": 53075.2421875, + "learning_rate": 4.455741568082582e-05, + "loss": 2.2152, + "step": 10699 + }, + { + "epoch": 2.005623242736645, + "grad_norm": 50765.6640625, + "learning_rate": 4.454960452917393e-05, + "loss": 2.0561, + "step": 10700 + }, + { + "epoch": 2.0058106841611996, + "grad_norm": 53367.17578125, + "learning_rate": 4.4541793512139744e-05, + "loss": 2.0487, + "step": 10701 + }, + { + "epoch": 2.0059981255857546, + "grad_norm": 52158.0234375, + "learning_rate": 4.453398262991618e-05, + "loss": 2.1421, + "step": 10702 + }, + { + "epoch": 2.0061855670103093, + "grad_norm": 50360.2421875, + "learning_rate": 4.4526171882696146e-05, + "loss": 2.0834, + "step": 10703 + }, + { + "epoch": 2.0063730084348643, + "grad_norm": 51143.9921875, + "learning_rate": 4.4518361270672574e-05, + "loss": 2.0924, + "step": 10704 + }, + { + "epoch": 2.006560449859419, + "grad_norm": 52738.5234375, + "learning_rate": 4.451055079403839e-05, + "loss": 2.1696, + "step": 10705 + }, + { + "epoch": 2.0067478912839736, + "grad_norm": 46164.7734375, + "learning_rate": 4.4502740452986464e-05, + "loss": 2.1553, + "step": 10706 + }, + { + "epoch": 2.0069353327085286, + "grad_norm": 52764.71484375, + "learning_rate": 4.4494930247709725e-05, + "loss": 2.2279, + "step": 10707 + }, + { + "epoch": 2.0071227741330833, + "grad_norm": 53033.7265625, + "learning_rate": 4.4487120178401074e-05, + "loss": 2.1243, + "step": 10708 + }, + { + "epoch": 2.0073102155576383, + "grad_norm": 52900.99609375, + "learning_rate": 4.4479310245253416e-05, + "loss": 2.0834, + "step": 10709 + }, + { + "epoch": 2.007497656982193, + "grad_norm": 53186.0078125, + "learning_rate": 4.447150044845963e-05, + "loss": 2.1181, + "step": 10710 + }, + { + "epoch": 2.007685098406748, + "grad_norm": 51525.28125, + "learning_rate": 4.44636907882126e-05, + "loss": 2.1509, + "step": 10711 + }, + { + "epoch": 2.0078725398313026, + "grad_norm": 53083.29296875, + "learning_rate": 4.445588126470526e-05, + "loss": 2.1002, + "step": 10712 + }, + { + "epoch": 2.0080599812558577, + "grad_norm": 51542.10546875, + "learning_rate": 4.444807187813044e-05, + "loss": 2.1069, + "step": 10713 + }, + { + "epoch": 2.0082474226804123, + "grad_norm": 53566.62890625, + "learning_rate": 4.444026262868105e-05, + "loss": 2.0672, + "step": 10714 + }, + { + "epoch": 2.0084348641049674, + "grad_norm": 51092.93359375, + "learning_rate": 4.443245351654997e-05, + "loss": 2.1439, + "step": 10715 + }, + { + "epoch": 2.008622305529522, + "grad_norm": 48415.12109375, + "learning_rate": 4.442464454193008e-05, + "loss": 2.1509, + "step": 10716 + }, + { + "epoch": 2.0088097469540767, + "grad_norm": 47946.82421875, + "learning_rate": 4.4416835705014233e-05, + "loss": 2.168, + "step": 10717 + }, + { + "epoch": 2.0089971883786317, + "grad_norm": 53885.2890625, + "learning_rate": 4.4409027005995294e-05, + "loss": 2.1908, + "step": 10718 + }, + { + "epoch": 2.0091846298031864, + "grad_norm": 50250.015625, + "learning_rate": 4.440121844506614e-05, + "loss": 2.1312, + "step": 10719 + }, + { + "epoch": 2.0093720712277414, + "grad_norm": 50050.515625, + "learning_rate": 4.439341002241966e-05, + "loss": 2.1599, + "step": 10720 + }, + { + "epoch": 2.009559512652296, + "grad_norm": 54542.91015625, + "learning_rate": 4.438560173824865e-05, + "loss": 2.2121, + "step": 10721 + }, + { + "epoch": 2.009746954076851, + "grad_norm": 56560.03515625, + "learning_rate": 4.437779359274601e-05, + "loss": 2.1031, + "step": 10722 + }, + { + "epoch": 2.0099343955014057, + "grad_norm": 56124.40234375, + "learning_rate": 4.436998558610458e-05, + "loss": 2.1272, + "step": 10723 + }, + { + "epoch": 2.010121836925961, + "grad_norm": 56249.3046875, + "learning_rate": 4.4362177718517216e-05, + "loss": 2.1657, + "step": 10724 + }, + { + "epoch": 2.0103092783505154, + "grad_norm": 60052.703125, + "learning_rate": 4.4354369990176743e-05, + "loss": 2.0769, + "step": 10725 + }, + { + "epoch": 2.0104967197750705, + "grad_norm": 51902.26171875, + "learning_rate": 4.434656240127601e-05, + "loss": 2.1834, + "step": 10726 + }, + { + "epoch": 2.010684161199625, + "grad_norm": 47109.71484375, + "learning_rate": 4.4338754952007865e-05, + "loss": 2.1755, + "step": 10727 + }, + { + "epoch": 2.0108716026241797, + "grad_norm": 53087.26171875, + "learning_rate": 4.4330947642565124e-05, + "loss": 2.118, + "step": 10728 + }, + { + "epoch": 2.011059044048735, + "grad_norm": 53224.109375, + "learning_rate": 4.4323140473140616e-05, + "loss": 2.2202, + "step": 10729 + }, + { + "epoch": 2.0112464854732894, + "grad_norm": 53814.984375, + "learning_rate": 4.4315333443927185e-05, + "loss": 2.1188, + "step": 10730 + }, + { + "epoch": 2.0114339268978445, + "grad_norm": 49533.72265625, + "learning_rate": 4.4307526555117665e-05, + "loss": 2.1022, + "step": 10731 + }, + { + "epoch": 2.011621368322399, + "grad_norm": 46600.1796875, + "learning_rate": 4.429971980690485e-05, + "loss": 2.1279, + "step": 10732 + }, + { + "epoch": 2.011808809746954, + "grad_norm": 47822.12890625, + "learning_rate": 4.4291913199481556e-05, + "loss": 2.1394, + "step": 10733 + }, + { + "epoch": 2.011996251171509, + "grad_norm": 52756.078125, + "learning_rate": 4.4284106733040596e-05, + "loss": 2.1409, + "step": 10734 + }, + { + "epoch": 2.012183692596064, + "grad_norm": 52151.796875, + "learning_rate": 4.427630040777483e-05, + "loss": 2.1386, + "step": 10735 + }, + { + "epoch": 2.0123711340206185, + "grad_norm": 54387.08203125, + "learning_rate": 4.4268494223876976e-05, + "loss": 2.0984, + "step": 10736 + }, + { + "epoch": 2.0125585754451736, + "grad_norm": 54476.1484375, + "learning_rate": 4.4260688181539905e-05, + "loss": 2.154, + "step": 10737 + }, + { + "epoch": 2.012746016869728, + "grad_norm": 59507.1484375, + "learning_rate": 4.42528822809564e-05, + "loss": 2.1575, + "step": 10738 + }, + { + "epoch": 2.012933458294283, + "grad_norm": 54220.6171875, + "learning_rate": 4.424507652231925e-05, + "loss": 2.1557, + "step": 10739 + }, + { + "epoch": 2.013120899718838, + "grad_norm": 54089.58984375, + "learning_rate": 4.4237270905821244e-05, + "loss": 2.1355, + "step": 10740 + }, + { + "epoch": 2.0133083411433925, + "grad_norm": 53539.03515625, + "learning_rate": 4.422946543165517e-05, + "loss": 2.0953, + "step": 10741 + }, + { + "epoch": 2.0134957825679476, + "grad_norm": 54909.23828125, + "learning_rate": 4.4221660100013836e-05, + "loss": 2.1499, + "step": 10742 + }, + { + "epoch": 2.013683223992502, + "grad_norm": 54884.73828125, + "learning_rate": 4.421385491108999e-05, + "loss": 2.1551, + "step": 10743 + }, + { + "epoch": 2.0138706654170573, + "grad_norm": 47985.12109375, + "learning_rate": 4.420604986507642e-05, + "loss": 2.1673, + "step": 10744 + }, + { + "epoch": 2.014058106841612, + "grad_norm": 53894.73046875, + "learning_rate": 4.4198244962165914e-05, + "loss": 2.1814, + "step": 10745 + }, + { + "epoch": 2.014245548266167, + "grad_norm": 51965.08203125, + "learning_rate": 4.4190440202551245e-05, + "loss": 2.1631, + "step": 10746 + }, + { + "epoch": 2.0144329896907216, + "grad_norm": 55870.3671875, + "learning_rate": 4.4182635586425164e-05, + "loss": 2.1574, + "step": 10747 + }, + { + "epoch": 2.0146204311152767, + "grad_norm": 52047.0390625, + "learning_rate": 4.4174831113980434e-05, + "loss": 2.2017, + "step": 10748 + }, + { + "epoch": 2.0148078725398313, + "grad_norm": 58313.9140625, + "learning_rate": 4.416702678540982e-05, + "loss": 2.1348, + "step": 10749 + }, + { + "epoch": 2.014995313964386, + "grad_norm": 52477.3828125, + "learning_rate": 4.4159222600906114e-05, + "loss": 2.1824, + "step": 10750 + }, + { + "epoch": 2.015182755388941, + "grad_norm": 51739.99609375, + "learning_rate": 4.4151418560662e-05, + "loss": 2.1578, + "step": 10751 + }, + { + "epoch": 2.0153701968134956, + "grad_norm": 50089.18359375, + "learning_rate": 4.414361466487027e-05, + "loss": 2.1377, + "step": 10752 + }, + { + "epoch": 2.0155576382380507, + "grad_norm": 49933.92578125, + "learning_rate": 4.413581091372368e-05, + "loss": 2.103, + "step": 10753 + }, + { + "epoch": 2.0157450796626053, + "grad_norm": 52027.2734375, + "learning_rate": 4.4128007307414956e-05, + "loss": 2.1069, + "step": 10754 + }, + { + "epoch": 2.0159325210871604, + "grad_norm": 53624.0703125, + "learning_rate": 4.412020384613684e-05, + "loss": 2.1915, + "step": 10755 + }, + { + "epoch": 2.016119962511715, + "grad_norm": 55064.75390625, + "learning_rate": 4.4112400530082056e-05, + "loss": 2.1398, + "step": 10756 + }, + { + "epoch": 2.01630740393627, + "grad_norm": 52833.3046875, + "learning_rate": 4.4104597359443354e-05, + "loss": 2.116, + "step": 10757 + }, + { + "epoch": 2.0164948453608247, + "grad_norm": 53373.08203125, + "learning_rate": 4.409679433441345e-05, + "loss": 2.0821, + "step": 10758 + }, + { + "epoch": 2.0166822867853798, + "grad_norm": 53996.73046875, + "learning_rate": 4.4088991455185056e-05, + "loss": 2.2074, + "step": 10759 + }, + { + "epoch": 2.0168697282099344, + "grad_norm": 52243.16015625, + "learning_rate": 4.408118872195093e-05, + "loss": 2.087, + "step": 10760 + }, + { + "epoch": 2.017057169634489, + "grad_norm": 55653.33984375, + "learning_rate": 4.407338613490377e-05, + "loss": 2.0283, + "step": 10761 + }, + { + "epoch": 2.017244611059044, + "grad_norm": 53014.6484375, + "learning_rate": 4.406558369423628e-05, + "loss": 2.122, + "step": 10762 + }, + { + "epoch": 2.0174320524835987, + "grad_norm": 51448.67578125, + "learning_rate": 4.405778140014118e-05, + "loss": 2.1958, + "step": 10763 + }, + { + "epoch": 2.0176194939081538, + "grad_norm": 53249.6015625, + "learning_rate": 4.4049979252811155e-05, + "loss": 2.194, + "step": 10764 + }, + { + "epoch": 2.0178069353327084, + "grad_norm": 49230.50390625, + "learning_rate": 4.404217725243897e-05, + "loss": 2.098, + "step": 10765 + }, + { + "epoch": 2.0179943767572635, + "grad_norm": 52544.6171875, + "learning_rate": 4.403437539921725e-05, + "loss": 2.1953, + "step": 10766 + }, + { + "epoch": 2.018181818181818, + "grad_norm": 52362.1328125, + "learning_rate": 4.402657369333873e-05, + "loss": 2.1017, + "step": 10767 + }, + { + "epoch": 2.018369259606373, + "grad_norm": 53741.171875, + "learning_rate": 4.401877213499611e-05, + "loss": 2.1455, + "step": 10768 + }, + { + "epoch": 2.0185567010309278, + "grad_norm": 52627.359375, + "learning_rate": 4.4010970724382055e-05, + "loss": 2.0591, + "step": 10769 + }, + { + "epoch": 2.018744142455483, + "grad_norm": 53594.10546875, + "learning_rate": 4.4003169461689254e-05, + "loss": 2.1987, + "step": 10770 + }, + { + "epoch": 2.0189315838800375, + "grad_norm": 53932.63671875, + "learning_rate": 4.3995368347110397e-05, + "loss": 2.1198, + "step": 10771 + }, + { + "epoch": 2.0191190253045925, + "grad_norm": 57517.90234375, + "learning_rate": 4.398756738083816e-05, + "loss": 2.1855, + "step": 10772 + }, + { + "epoch": 2.019306466729147, + "grad_norm": 50669.171875, + "learning_rate": 4.397976656306521e-05, + "loss": 2.0666, + "step": 10773 + }, + { + "epoch": 2.0194939081537018, + "grad_norm": 49678.45703125, + "learning_rate": 4.39719658939842e-05, + "loss": 2.0864, + "step": 10774 + }, + { + "epoch": 2.019681349578257, + "grad_norm": 48675.1640625, + "learning_rate": 4.396416537378783e-05, + "loss": 2.1686, + "step": 10775 + }, + { + "epoch": 2.0198687910028115, + "grad_norm": 53050.421875, + "learning_rate": 4.395636500266877e-05, + "loss": 2.1601, + "step": 10776 + }, + { + "epoch": 2.0200562324273665, + "grad_norm": 54427.5234375, + "learning_rate": 4.394856478081964e-05, + "loss": 2.1704, + "step": 10777 + }, + { + "epoch": 2.020243673851921, + "grad_norm": 51629.69921875, + "learning_rate": 4.394076470843312e-05, + "loss": 2.1679, + "step": 10778 + }, + { + "epoch": 2.0204311152764762, + "grad_norm": 50380.078125, + "learning_rate": 4.393296478570185e-05, + "loss": 2.1937, + "step": 10779 + }, + { + "epoch": 2.020618556701031, + "grad_norm": 52531.19921875, + "learning_rate": 4.392516501281849e-05, + "loss": 2.114, + "step": 10780 + }, + { + "epoch": 2.020805998125586, + "grad_norm": 52063.0546875, + "learning_rate": 4.391736538997568e-05, + "loss": 2.0722, + "step": 10781 + }, + { + "epoch": 2.0209934395501405, + "grad_norm": 47941.8671875, + "learning_rate": 4.390956591736604e-05, + "loss": 2.1292, + "step": 10782 + }, + { + "epoch": 2.0211808809746956, + "grad_norm": 51277.41796875, + "learning_rate": 4.3901766595182257e-05, + "loss": 2.2028, + "step": 10783 + }, + { + "epoch": 2.0213683223992502, + "grad_norm": 56418.921875, + "learning_rate": 4.3893967423616905e-05, + "loss": 2.1402, + "step": 10784 + }, + { + "epoch": 2.021555763823805, + "grad_norm": 53967.25, + "learning_rate": 4.388616840286265e-05, + "loss": 2.1326, + "step": 10785 + }, + { + "epoch": 2.02174320524836, + "grad_norm": 52463.0390625, + "learning_rate": 4.387836953311211e-05, + "loss": 2.1378, + "step": 10786 + }, + { + "epoch": 2.0219306466729146, + "grad_norm": 50520.13671875, + "learning_rate": 4.387057081455791e-05, + "loss": 2.1379, + "step": 10787 + }, + { + "epoch": 2.0221180880974696, + "grad_norm": 51230.1796875, + "learning_rate": 4.3862772247392655e-05, + "loss": 2.1406, + "step": 10788 + }, + { + "epoch": 2.0223055295220242, + "grad_norm": 49320.0390625, + "learning_rate": 4.3854973831808965e-05, + "loss": 2.1358, + "step": 10789 + }, + { + "epoch": 2.0224929709465793, + "grad_norm": 51163.6875, + "learning_rate": 4.384717556799946e-05, + "loss": 2.1343, + "step": 10790 + }, + { + "epoch": 2.022680412371134, + "grad_norm": 50537.57421875, + "learning_rate": 4.3839377456156756e-05, + "loss": 2.1733, + "step": 10791 + }, + { + "epoch": 2.022867853795689, + "grad_norm": 51497.73046875, + "learning_rate": 4.3831579496473425e-05, + "loss": 2.2019, + "step": 10792 + }, + { + "epoch": 2.0230552952202436, + "grad_norm": 57855.5625, + "learning_rate": 4.382378168914208e-05, + "loss": 2.2033, + "step": 10793 + }, + { + "epoch": 2.0232427366447987, + "grad_norm": 53978.17578125, + "learning_rate": 4.381598403435533e-05, + "loss": 2.1241, + "step": 10794 + }, + { + "epoch": 2.0234301780693533, + "grad_norm": 51350.12109375, + "learning_rate": 4.380818653230576e-05, + "loss": 2.1696, + "step": 10795 + }, + { + "epoch": 2.023617619493908, + "grad_norm": 53546.1953125, + "learning_rate": 4.380038918318595e-05, + "loss": 2.1674, + "step": 10796 + }, + { + "epoch": 2.023805060918463, + "grad_norm": 54366.19140625, + "learning_rate": 4.379259198718847e-05, + "loss": 2.1862, + "step": 10797 + }, + { + "epoch": 2.0239925023430176, + "grad_norm": 55573.3671875, + "learning_rate": 4.378479494450596e-05, + "loss": 2.0985, + "step": 10798 + }, + { + "epoch": 2.0241799437675727, + "grad_norm": 53855.76171875, + "learning_rate": 4.377699805533092e-05, + "loss": 2.1629, + "step": 10799 + }, + { + "epoch": 2.0243673851921273, + "grad_norm": 54666.97265625, + "learning_rate": 4.3769201319855975e-05, + "loss": 2.1778, + "step": 10800 + }, + { + "epoch": 2.0245548266166824, + "grad_norm": 49873.67578125, + "learning_rate": 4.376140473827367e-05, + "loss": 2.1393, + "step": 10801 + }, + { + "epoch": 2.024742268041237, + "grad_norm": 53571.62109375, + "learning_rate": 4.37536083107766e-05, + "loss": 2.165, + "step": 10802 + }, + { + "epoch": 2.024929709465792, + "grad_norm": 52525.296875, + "learning_rate": 4.3745812037557295e-05, + "loss": 2.0403, + "step": 10803 + }, + { + "epoch": 2.0251171508903467, + "grad_norm": 49465.7265625, + "learning_rate": 4.3738015918808305e-05, + "loss": 2.1991, + "step": 10804 + }, + { + "epoch": 2.025304592314902, + "grad_norm": 51241.44140625, + "learning_rate": 4.373021995472222e-05, + "loss": 2.1614, + "step": 10805 + }, + { + "epoch": 2.0254920337394564, + "grad_norm": 50852.26171875, + "learning_rate": 4.372242414549158e-05, + "loss": 2.0567, + "step": 10806 + }, + { + "epoch": 2.025679475164011, + "grad_norm": 50202.9921875, + "learning_rate": 4.371462849130892e-05, + "loss": 2.1671, + "step": 10807 + }, + { + "epoch": 2.025866916588566, + "grad_norm": 50163.48828125, + "learning_rate": 4.370683299236678e-05, + "loss": 2.1405, + "step": 10808 + }, + { + "epoch": 2.0260543580131207, + "grad_norm": 53417.046875, + "learning_rate": 4.369903764885771e-05, + "loss": 2.1682, + "step": 10809 + }, + { + "epoch": 2.026241799437676, + "grad_norm": 50243.6015625, + "learning_rate": 4.369124246097425e-05, + "loss": 2.1379, + "step": 10810 + }, + { + "epoch": 2.0264292408622304, + "grad_norm": 49118.82421875, + "learning_rate": 4.3683447428908914e-05, + "loss": 2.1532, + "step": 10811 + }, + { + "epoch": 2.0266166822867855, + "grad_norm": 52614.828125, + "learning_rate": 4.3675652552854225e-05, + "loss": 2.115, + "step": 10812 + }, + { + "epoch": 2.02680412371134, + "grad_norm": 50640.77734375, + "learning_rate": 4.366785783300275e-05, + "loss": 2.1319, + "step": 10813 + }, + { + "epoch": 2.026991565135895, + "grad_norm": 53380.30859375, + "learning_rate": 4.366006326954695e-05, + "loss": 2.1218, + "step": 10814 + }, + { + "epoch": 2.02717900656045, + "grad_norm": 51820.26953125, + "learning_rate": 4.365226886267937e-05, + "loss": 2.1594, + "step": 10815 + }, + { + "epoch": 2.027366447985005, + "grad_norm": 51883.60546875, + "learning_rate": 4.364447461259252e-05, + "loss": 2.1327, + "step": 10816 + }, + { + "epoch": 2.0275538894095595, + "grad_norm": 53372.28125, + "learning_rate": 4.363668051947892e-05, + "loss": 2.0946, + "step": 10817 + }, + { + "epoch": 2.027741330834114, + "grad_norm": 54827.51953125, + "learning_rate": 4.3628886583531047e-05, + "loss": 2.1092, + "step": 10818 + }, + { + "epoch": 2.027928772258669, + "grad_norm": 54153.7578125, + "learning_rate": 4.362109280494142e-05, + "loss": 2.2031, + "step": 10819 + }, + { + "epoch": 2.028116213683224, + "grad_norm": 50449.34765625, + "learning_rate": 4.361329918390252e-05, + "loss": 2.1397, + "step": 10820 + }, + { + "epoch": 2.028303655107779, + "grad_norm": 58454.98828125, + "learning_rate": 4.360550572060688e-05, + "loss": 2.1468, + "step": 10821 + }, + { + "epoch": 2.0284910965323335, + "grad_norm": 50331.26953125, + "learning_rate": 4.359771241524692e-05, + "loss": 2.0643, + "step": 10822 + }, + { + "epoch": 2.0286785379568886, + "grad_norm": 49304.82421875, + "learning_rate": 4.3589919268015176e-05, + "loss": 2.1791, + "step": 10823 + }, + { + "epoch": 2.028865979381443, + "grad_norm": 51726.5546875, + "learning_rate": 4.358212627910413e-05, + "loss": 2.1527, + "step": 10824 + }, + { + "epoch": 2.0290534208059983, + "grad_norm": 49384.3203125, + "learning_rate": 4.3574333448706235e-05, + "loss": 2.057, + "step": 10825 + }, + { + "epoch": 2.029240862230553, + "grad_norm": 50156.76953125, + "learning_rate": 4.356654077701397e-05, + "loss": 2.1787, + "step": 10826 + }, + { + "epoch": 2.029428303655108, + "grad_norm": 54109.37109375, + "learning_rate": 4.3558748264219797e-05, + "loss": 2.1163, + "step": 10827 + }, + { + "epoch": 2.0296157450796626, + "grad_norm": 53082.6953125, + "learning_rate": 4.3550955910516224e-05, + "loss": 2.143, + "step": 10828 + }, + { + "epoch": 2.0298031865042176, + "grad_norm": 54196.72265625, + "learning_rate": 4.354316371609564e-05, + "loss": 2.0439, + "step": 10829 + }, + { + "epoch": 2.0299906279287723, + "grad_norm": 52040.546875, + "learning_rate": 4.3535371681150554e-05, + "loss": 2.1092, + "step": 10830 + }, + { + "epoch": 2.030178069353327, + "grad_norm": 53907.69921875, + "learning_rate": 4.352757980587341e-05, + "loss": 2.1143, + "step": 10831 + }, + { + "epoch": 2.030365510777882, + "grad_norm": 52667.00390625, + "learning_rate": 4.351978809045666e-05, + "loss": 2.0954, + "step": 10832 + }, + { + "epoch": 2.0305529522024366, + "grad_norm": 51411.23828125, + "learning_rate": 4.351199653509274e-05, + "loss": 2.1242, + "step": 10833 + }, + { + "epoch": 2.0307403936269917, + "grad_norm": 57611.328125, + "learning_rate": 4.350420513997408e-05, + "loss": 2.1017, + "step": 10834 + }, + { + "epoch": 2.0309278350515463, + "grad_norm": 59695.8046875, + "learning_rate": 4.349641390529313e-05, + "loss": 2.069, + "step": 10835 + }, + { + "epoch": 2.0311152764761014, + "grad_norm": 59884.765625, + "learning_rate": 4.348862283124235e-05, + "loss": 2.1254, + "step": 10836 + }, + { + "epoch": 2.031302717900656, + "grad_norm": 52161.6484375, + "learning_rate": 4.348083191801412e-05, + "loss": 2.1052, + "step": 10837 + }, + { + "epoch": 2.031490159325211, + "grad_norm": 49004.3359375, + "learning_rate": 4.347304116580089e-05, + "loss": 2.1331, + "step": 10838 + }, + { + "epoch": 2.0316776007497657, + "grad_norm": 53757.6328125, + "learning_rate": 4.34652505747951e-05, + "loss": 2.1313, + "step": 10839 + }, + { + "epoch": 2.0318650421743207, + "grad_norm": 51407.296875, + "learning_rate": 4.345746014518913e-05, + "loss": 2.1188, + "step": 10840 + }, + { + "epoch": 2.0320524835988754, + "grad_norm": 49461.80078125, + "learning_rate": 4.344966987717543e-05, + "loss": 2.1678, + "step": 10841 + }, + { + "epoch": 2.03223992502343, + "grad_norm": 57504.109375, + "learning_rate": 4.344187977094636e-05, + "loss": 2.0991, + "step": 10842 + }, + { + "epoch": 2.032427366447985, + "grad_norm": 57432.67578125, + "learning_rate": 4.343408982669439e-05, + "loss": 2.0853, + "step": 10843 + }, + { + "epoch": 2.0326148078725397, + "grad_norm": 51801.86328125, + "learning_rate": 4.3426300044611865e-05, + "loss": 2.1343, + "step": 10844 + }, + { + "epoch": 2.0328022492970947, + "grad_norm": 59075.04296875, + "learning_rate": 4.341851042489122e-05, + "loss": 2.0672, + "step": 10845 + }, + { + "epoch": 2.0329896907216494, + "grad_norm": 56407.8125, + "learning_rate": 4.3410720967724824e-05, + "loss": 2.1062, + "step": 10846 + }, + { + "epoch": 2.0331771321462044, + "grad_norm": 49049.62109375, + "learning_rate": 4.3402931673305096e-05, + "loss": 2.1732, + "step": 10847 + }, + { + "epoch": 2.033364573570759, + "grad_norm": 52429.65234375, + "learning_rate": 4.339514254182439e-05, + "loss": 2.1287, + "step": 10848 + }, + { + "epoch": 2.033552014995314, + "grad_norm": 52169.14453125, + "learning_rate": 4.33873535734751e-05, + "loss": 2.1485, + "step": 10849 + }, + { + "epoch": 2.0337394564198688, + "grad_norm": 50759.859375, + "learning_rate": 4.337956476844959e-05, + "loss": 2.0925, + "step": 10850 + }, + { + "epoch": 2.033926897844424, + "grad_norm": 51117.73828125, + "learning_rate": 4.3371776126940286e-05, + "loss": 2.172, + "step": 10851 + }, + { + "epoch": 2.0341143392689784, + "grad_norm": 54411.6796875, + "learning_rate": 4.336398764913948e-05, + "loss": 2.1195, + "step": 10852 + }, + { + "epoch": 2.034301780693533, + "grad_norm": 52619.28125, + "learning_rate": 4.3356199335239575e-05, + "loss": 2.1011, + "step": 10853 + }, + { + "epoch": 2.034489222118088, + "grad_norm": 47813.00390625, + "learning_rate": 4.334841118543296e-05, + "loss": 2.1941, + "step": 10854 + }, + { + "epoch": 2.0346766635426428, + "grad_norm": 50956.52734375, + "learning_rate": 4.334062319991195e-05, + "loss": 2.174, + "step": 10855 + }, + { + "epoch": 2.034864104967198, + "grad_norm": 51676.9453125, + "learning_rate": 4.333283537886891e-05, + "loss": 2.158, + "step": 10856 + }, + { + "epoch": 2.0350515463917525, + "grad_norm": 54260.54296875, + "learning_rate": 4.3325047722496185e-05, + "loss": 2.1638, + "step": 10857 + }, + { + "epoch": 2.0352389878163075, + "grad_norm": 53631.1015625, + "learning_rate": 4.331726023098616e-05, + "loss": 2.2264, + "step": 10858 + }, + { + "epoch": 2.035426429240862, + "grad_norm": 51427.5859375, + "learning_rate": 4.33094729045311e-05, + "loss": 2.1267, + "step": 10859 + }, + { + "epoch": 2.035613870665417, + "grad_norm": 53449.58984375, + "learning_rate": 4.3301685743323404e-05, + "loss": 2.1253, + "step": 10860 + }, + { + "epoch": 2.035801312089972, + "grad_norm": 51536.7109375, + "learning_rate": 4.329389874755538e-05, + "loss": 2.1004, + "step": 10861 + }, + { + "epoch": 2.035988753514527, + "grad_norm": 51186.94140625, + "learning_rate": 4.328611191741939e-05, + "loss": 2.1698, + "step": 10862 + }, + { + "epoch": 2.0361761949390815, + "grad_norm": 52100.5859375, + "learning_rate": 4.3278325253107706e-05, + "loss": 2.1649, + "step": 10863 + }, + { + "epoch": 2.036363636363636, + "grad_norm": 49924.05078125, + "learning_rate": 4.327053875481267e-05, + "loss": 2.0619, + "step": 10864 + }, + { + "epoch": 2.036551077788191, + "grad_norm": 52542.26953125, + "learning_rate": 4.326275242272662e-05, + "loss": 2.1256, + "step": 10865 + }, + { + "epoch": 2.036738519212746, + "grad_norm": 53735.6953125, + "learning_rate": 4.325496625704184e-05, + "loss": 2.2007, + "step": 10866 + }, + { + "epoch": 2.036925960637301, + "grad_norm": 53227.015625, + "learning_rate": 4.324718025795064e-05, + "loss": 2.1799, + "step": 10867 + }, + { + "epoch": 2.0371134020618555, + "grad_norm": 56051.8828125, + "learning_rate": 4.3239394425645327e-05, + "loss": 2.141, + "step": 10868 + }, + { + "epoch": 2.0373008434864106, + "grad_norm": 54074.84375, + "learning_rate": 4.323160876031823e-05, + "loss": 2.1773, + "step": 10869 + }, + { + "epoch": 2.0374882849109652, + "grad_norm": 53112.33203125, + "learning_rate": 4.322382326216161e-05, + "loss": 2.1442, + "step": 10870 + }, + { + "epoch": 2.0376757263355203, + "grad_norm": 51232.765625, + "learning_rate": 4.321603793136777e-05, + "loss": 2.1562, + "step": 10871 + }, + { + "epoch": 2.037863167760075, + "grad_norm": 54364.91015625, + "learning_rate": 4.3208252768128995e-05, + "loss": 2.0754, + "step": 10872 + }, + { + "epoch": 2.03805060918463, + "grad_norm": 52886.13671875, + "learning_rate": 4.320046777263759e-05, + "loss": 2.0431, + "step": 10873 + }, + { + "epoch": 2.0382380506091846, + "grad_norm": 51922.32421875, + "learning_rate": 4.3192682945085794e-05, + "loss": 2.1203, + "step": 10874 + }, + { + "epoch": 2.0384254920337392, + "grad_norm": 55494.08203125, + "learning_rate": 4.3184898285665895e-05, + "loss": 2.1193, + "step": 10875 + }, + { + "epoch": 2.0386129334582943, + "grad_norm": 55497.76953125, + "learning_rate": 4.317711379457019e-05, + "loss": 2.2666, + "step": 10876 + }, + { + "epoch": 2.038800374882849, + "grad_norm": 50409.2421875, + "learning_rate": 4.3169329471990935e-05, + "loss": 2.1255, + "step": 10877 + }, + { + "epoch": 2.038987816307404, + "grad_norm": 50155.03515625, + "learning_rate": 4.3161545318120374e-05, + "loss": 2.0873, + "step": 10878 + }, + { + "epoch": 2.0391752577319586, + "grad_norm": 49974.7109375, + "learning_rate": 4.315376133315078e-05, + "loss": 2.1494, + "step": 10879 + }, + { + "epoch": 2.0393626991565137, + "grad_norm": 50327.90625, + "learning_rate": 4.314597751727441e-05, + "loss": 2.1331, + "step": 10880 + }, + { + "epoch": 2.0395501405810683, + "grad_norm": 53225.83203125, + "learning_rate": 4.3138193870683504e-05, + "loss": 2.1762, + "step": 10881 + }, + { + "epoch": 2.0397375820056234, + "grad_norm": 60313.1171875, + "learning_rate": 4.31304103935703e-05, + "loss": 2.0733, + "step": 10882 + }, + { + "epoch": 2.039925023430178, + "grad_norm": 51812.51953125, + "learning_rate": 4.312262708612706e-05, + "loss": 2.2148, + "step": 10883 + }, + { + "epoch": 2.040112464854733, + "grad_norm": 55004.66015625, + "learning_rate": 4.311484394854603e-05, + "loss": 2.1039, + "step": 10884 + }, + { + "epoch": 2.0402999062792877, + "grad_norm": 53541.4296875, + "learning_rate": 4.3107060981019414e-05, + "loss": 2.1227, + "step": 10885 + }, + { + "epoch": 2.0404873477038423, + "grad_norm": 50305.640625, + "learning_rate": 4.309927818373945e-05, + "loss": 2.1695, + "step": 10886 + }, + { + "epoch": 2.0406747891283974, + "grad_norm": 55156.4921875, + "learning_rate": 4.3091495556898385e-05, + "loss": 2.1315, + "step": 10887 + }, + { + "epoch": 2.040862230552952, + "grad_norm": 57303.765625, + "learning_rate": 4.308371310068842e-05, + "loss": 2.2164, + "step": 10888 + }, + { + "epoch": 2.041049671977507, + "grad_norm": 48179.8828125, + "learning_rate": 4.3075930815301766e-05, + "loss": 2.1238, + "step": 10889 + }, + { + "epoch": 2.0412371134020617, + "grad_norm": 52125.7265625, + "learning_rate": 4.306814870093063e-05, + "loss": 2.1879, + "step": 10890 + }, + { + "epoch": 2.0414245548266168, + "grad_norm": 51699.2109375, + "learning_rate": 4.306036675776725e-05, + "loss": 2.1259, + "step": 10891 + }, + { + "epoch": 2.0416119962511714, + "grad_norm": 53129.8125, + "learning_rate": 4.305258498600382e-05, + "loss": 2.1669, + "step": 10892 + }, + { + "epoch": 2.0417994376757265, + "grad_norm": 52863.55859375, + "learning_rate": 4.304480338583252e-05, + "loss": 2.1377, + "step": 10893 + }, + { + "epoch": 2.041986879100281, + "grad_norm": 46219.390625, + "learning_rate": 4.303702195744557e-05, + "loss": 2.1236, + "step": 10894 + }, + { + "epoch": 2.042174320524836, + "grad_norm": 54177.4140625, + "learning_rate": 4.302924070103515e-05, + "loss": 2.1553, + "step": 10895 + }, + { + "epoch": 2.042361761949391, + "grad_norm": 53060.3046875, + "learning_rate": 4.3021459616793443e-05, + "loss": 2.139, + "step": 10896 + }, + { + "epoch": 2.042549203373946, + "grad_norm": 52690.37890625, + "learning_rate": 4.301367870491262e-05, + "loss": 2.1526, + "step": 10897 + }, + { + "epoch": 2.0427366447985005, + "grad_norm": 54483.265625, + "learning_rate": 4.300589796558489e-05, + "loss": 2.2012, + "step": 10898 + }, + { + "epoch": 2.042924086223055, + "grad_norm": 55373.2421875, + "learning_rate": 4.299811739900242e-05, + "loss": 2.1659, + "step": 10899 + }, + { + "epoch": 2.04311152764761, + "grad_norm": 49739.8203125, + "learning_rate": 4.299033700535736e-05, + "loss": 2.1415, + "step": 10900 + }, + { + "epoch": 2.043298969072165, + "grad_norm": 49171.51171875, + "learning_rate": 4.2982556784841895e-05, + "loss": 2.1237, + "step": 10901 + }, + { + "epoch": 2.04348641049672, + "grad_norm": 50416.234375, + "learning_rate": 4.297477673764818e-05, + "loss": 2.161, + "step": 10902 + }, + { + "epoch": 2.0436738519212745, + "grad_norm": 48807.4609375, + "learning_rate": 4.2966996863968374e-05, + "loss": 2.1622, + "step": 10903 + }, + { + "epoch": 2.0438612933458296, + "grad_norm": 55223.5859375, + "learning_rate": 4.295921716399461e-05, + "loss": 2.1345, + "step": 10904 + }, + { + "epoch": 2.044048734770384, + "grad_norm": 49312.01953125, + "learning_rate": 4.295143763791906e-05, + "loss": 2.1715, + "step": 10905 + }, + { + "epoch": 2.0442361761949392, + "grad_norm": 54047.515625, + "learning_rate": 4.2943658285933866e-05, + "loss": 2.2086, + "step": 10906 + }, + { + "epoch": 2.044423617619494, + "grad_norm": 56374.69921875, + "learning_rate": 4.293587910823117e-05, + "loss": 2.1053, + "step": 10907 + }, + { + "epoch": 2.044611059044049, + "grad_norm": 51087.625, + "learning_rate": 4.292810010500309e-05, + "loss": 2.1806, + "step": 10908 + }, + { + "epoch": 2.0447985004686036, + "grad_norm": 53188.23828125, + "learning_rate": 4.292032127644177e-05, + "loss": 2.1287, + "step": 10909 + }, + { + "epoch": 2.044985941893158, + "grad_norm": 50450.66796875, + "learning_rate": 4.291254262273935e-05, + "loss": 2.099, + "step": 10910 + }, + { + "epoch": 2.0451733833177133, + "grad_norm": 54424.3203125, + "learning_rate": 4.2904764144087925e-05, + "loss": 2.1359, + "step": 10911 + }, + { + "epoch": 2.045360824742268, + "grad_norm": 51590.125, + "learning_rate": 4.289698584067963e-05, + "loss": 2.2085, + "step": 10912 + }, + { + "epoch": 2.045548266166823, + "grad_norm": 52674.93359375, + "learning_rate": 4.288920771270656e-05, + "loss": 2.2447, + "step": 10913 + }, + { + "epoch": 2.0457357075913776, + "grad_norm": 53309.01953125, + "learning_rate": 4.2881429760360875e-05, + "loss": 2.11, + "step": 10914 + }, + { + "epoch": 2.0459231490159326, + "grad_norm": 55217.3671875, + "learning_rate": 4.287365198383461e-05, + "loss": 2.1483, + "step": 10915 + }, + { + "epoch": 2.0461105904404873, + "grad_norm": 49973.453125, + "learning_rate": 4.2865874383319916e-05, + "loss": 2.0891, + "step": 10916 + }, + { + "epoch": 2.0462980318650423, + "grad_norm": 51667.27734375, + "learning_rate": 4.285809695900888e-05, + "loss": 2.1954, + "step": 10917 + }, + { + "epoch": 2.046485473289597, + "grad_norm": 55712.37890625, + "learning_rate": 4.285031971109359e-05, + "loss": 2.1121, + "step": 10918 + }, + { + "epoch": 2.046672914714152, + "grad_norm": 52222.0546875, + "learning_rate": 4.284254263976612e-05, + "loss": 2.1185, + "step": 10919 + }, + { + "epoch": 2.0468603561387066, + "grad_norm": 57074.3828125, + "learning_rate": 4.283476574521857e-05, + "loss": 2.1937, + "step": 10920 + }, + { + "epoch": 2.0470477975632613, + "grad_norm": 50554.8203125, + "learning_rate": 4.282698902764304e-05, + "loss": 2.0878, + "step": 10921 + }, + { + "epoch": 2.0472352389878163, + "grad_norm": 51634.89453125, + "learning_rate": 4.281921248723155e-05, + "loss": 2.0569, + "step": 10922 + }, + { + "epoch": 2.047422680412371, + "grad_norm": 52143.6796875, + "learning_rate": 4.2811436124176216e-05, + "loss": 2.1486, + "step": 10923 + }, + { + "epoch": 2.047610121836926, + "grad_norm": 59445.51171875, + "learning_rate": 4.2803659938669084e-05, + "loss": 2.0372, + "step": 10924 + }, + { + "epoch": 2.0477975632614807, + "grad_norm": 50419.2890625, + "learning_rate": 4.2795883930902235e-05, + "loss": 2.112, + "step": 10925 + }, + { + "epoch": 2.0479850046860357, + "grad_norm": 52501.34765625, + "learning_rate": 4.27881081010677e-05, + "loss": 2.0339, + "step": 10926 + }, + { + "epoch": 2.0481724461105904, + "grad_norm": 53019.51171875, + "learning_rate": 4.278033244935755e-05, + "loss": 2.1899, + "step": 10927 + }, + { + "epoch": 2.0483598875351454, + "grad_norm": 51132.72265625, + "learning_rate": 4.277255697596381e-05, + "loss": 2.1476, + "step": 10928 + }, + { + "epoch": 2.0485473289597, + "grad_norm": 56256.53515625, + "learning_rate": 4.276478168107858e-05, + "loss": 2.1032, + "step": 10929 + }, + { + "epoch": 2.048734770384255, + "grad_norm": 54134.62890625, + "learning_rate": 4.275700656489383e-05, + "loss": 2.1002, + "step": 10930 + }, + { + "epoch": 2.0489222118088097, + "grad_norm": 53309.96875, + "learning_rate": 4.274923162760163e-05, + "loss": 2.1519, + "step": 10931 + }, + { + "epoch": 2.0491096532333644, + "grad_norm": 52466.25390625, + "learning_rate": 4.274145686939402e-05, + "loss": 2.1134, + "step": 10932 + }, + { + "epoch": 2.0492970946579194, + "grad_norm": 51348.171875, + "learning_rate": 4.273368229046301e-05, + "loss": 2.1103, + "step": 10933 + }, + { + "epoch": 2.049484536082474, + "grad_norm": 56795.3203125, + "learning_rate": 4.272590789100063e-05, + "loss": 2.1355, + "step": 10934 + }, + { + "epoch": 2.049671977507029, + "grad_norm": 54244.4375, + "learning_rate": 4.271813367119887e-05, + "loss": 2.1607, + "step": 10935 + }, + { + "epoch": 2.0498594189315837, + "grad_norm": 52498.875, + "learning_rate": 4.2710359631249804e-05, + "loss": 2.1227, + "step": 10936 + }, + { + "epoch": 2.050046860356139, + "grad_norm": 56009.625, + "learning_rate": 4.270258577134538e-05, + "loss": 2.1479, + "step": 10937 + }, + { + "epoch": 2.0502343017806934, + "grad_norm": 54550.359375, + "learning_rate": 4.269481209167763e-05, + "loss": 2.1443, + "step": 10938 + }, + { + "epoch": 2.0504217432052485, + "grad_norm": 54240.859375, + "learning_rate": 4.268703859243856e-05, + "loss": 2.1044, + "step": 10939 + }, + { + "epoch": 2.050609184629803, + "grad_norm": 53362.71484375, + "learning_rate": 4.267926527382015e-05, + "loss": 2.1407, + "step": 10940 + }, + { + "epoch": 2.050796626054358, + "grad_norm": 53214.1015625, + "learning_rate": 4.2671492136014394e-05, + "loss": 2.0855, + "step": 10941 + }, + { + "epoch": 2.050984067478913, + "grad_norm": 51262.875, + "learning_rate": 4.266371917921328e-05, + "loss": 2.1804, + "step": 10942 + }, + { + "epoch": 2.051171508903468, + "grad_norm": 52407.65234375, + "learning_rate": 4.265594640360878e-05, + "loss": 2.1723, + "step": 10943 + }, + { + "epoch": 2.0513589503280225, + "grad_norm": 54728.6328125, + "learning_rate": 4.264817380939292e-05, + "loss": 2.1315, + "step": 10944 + }, + { + "epoch": 2.051546391752577, + "grad_norm": 52609.1328125, + "learning_rate": 4.264040139675759e-05, + "loss": 2.1613, + "step": 10945 + }, + { + "epoch": 2.051733833177132, + "grad_norm": 51832.05078125, + "learning_rate": 4.263262916589482e-05, + "loss": 2.0999, + "step": 10946 + }, + { + "epoch": 2.051921274601687, + "grad_norm": 54024.6484375, + "learning_rate": 4.262485711699656e-05, + "loss": 2.12, + "step": 10947 + }, + { + "epoch": 2.052108716026242, + "grad_norm": 50327.14453125, + "learning_rate": 4.261708525025478e-05, + "loss": 2.144, + "step": 10948 + }, + { + "epoch": 2.0522961574507965, + "grad_norm": 52269.73828125, + "learning_rate": 4.260931356586141e-05, + "loss": 2.1635, + "step": 10949 + }, + { + "epoch": 2.0524835988753516, + "grad_norm": 54382.18359375, + "learning_rate": 4.26015420640084e-05, + "loss": 2.2005, + "step": 10950 + }, + { + "epoch": 2.052671040299906, + "grad_norm": 52684.08984375, + "learning_rate": 4.259377074488774e-05, + "loss": 2.1598, + "step": 10951 + }, + { + "epoch": 2.0528584817244613, + "grad_norm": 53402.08984375, + "learning_rate": 4.258599960869131e-05, + "loss": 2.1241, + "step": 10952 + }, + { + "epoch": 2.053045923149016, + "grad_norm": 52462.83203125, + "learning_rate": 4.2578228655611085e-05, + "loss": 2.0523, + "step": 10953 + }, + { + "epoch": 2.053233364573571, + "grad_norm": 51484.734375, + "learning_rate": 4.2570457885839e-05, + "loss": 2.1778, + "step": 10954 + }, + { + "epoch": 2.0534208059981256, + "grad_norm": 48925.8125, + "learning_rate": 4.2562687299566974e-05, + "loss": 2.182, + "step": 10955 + }, + { + "epoch": 2.05360824742268, + "grad_norm": 48665.68359375, + "learning_rate": 4.255491689698692e-05, + "loss": 2.1284, + "step": 10956 + }, + { + "epoch": 2.0537956888472353, + "grad_norm": 53865.421875, + "learning_rate": 4.254714667829076e-05, + "loss": 2.1452, + "step": 10957 + }, + { + "epoch": 2.05398313027179, + "grad_norm": 53076.01953125, + "learning_rate": 4.2539376643670415e-05, + "loss": 2.1597, + "step": 10958 + }, + { + "epoch": 2.054170571696345, + "grad_norm": 57755.92578125, + "learning_rate": 4.253160679331782e-05, + "loss": 2.1195, + "step": 10959 + }, + { + "epoch": 2.0543580131208996, + "grad_norm": 49587.109375, + "learning_rate": 4.252383712742483e-05, + "loss": 2.1636, + "step": 10960 + }, + { + "epoch": 2.0545454545454547, + "grad_norm": 54149.84375, + "learning_rate": 4.2516067646183364e-05, + "loss": 2.1857, + "step": 10961 + }, + { + "epoch": 2.0547328959700093, + "grad_norm": 49874.12109375, + "learning_rate": 4.250829834978535e-05, + "loss": 2.1763, + "step": 10962 + }, + { + "epoch": 2.0549203373945644, + "grad_norm": 53851.19921875, + "learning_rate": 4.2500529238422634e-05, + "loss": 2.0984, + "step": 10963 + }, + { + "epoch": 2.055107778819119, + "grad_norm": 55663.03125, + "learning_rate": 4.249276031228713e-05, + "loss": 2.0766, + "step": 10964 + }, + { + "epoch": 2.055295220243674, + "grad_norm": 54767.56640625, + "learning_rate": 4.248499157157071e-05, + "loss": 2.1859, + "step": 10965 + }, + { + "epoch": 2.0554826616682287, + "grad_norm": 52090.4296875, + "learning_rate": 4.247722301646526e-05, + "loss": 2.1842, + "step": 10966 + }, + { + "epoch": 2.0556701030927833, + "grad_norm": 60244.09375, + "learning_rate": 4.2469454647162646e-05, + "loss": 2.0405, + "step": 10967 + }, + { + "epoch": 2.0558575445173384, + "grad_norm": 62161.7890625, + "learning_rate": 4.246168646385472e-05, + "loss": 2.1006, + "step": 10968 + }, + { + "epoch": 2.056044985941893, + "grad_norm": 52332.88671875, + "learning_rate": 4.2453918466733375e-05, + "loss": 2.1179, + "step": 10969 + }, + { + "epoch": 2.056232427366448, + "grad_norm": 56938.19921875, + "learning_rate": 4.244615065599048e-05, + "loss": 2.1266, + "step": 10970 + }, + { + "epoch": 2.0564198687910027, + "grad_norm": 53271.2890625, + "learning_rate": 4.2438383031817854e-05, + "loss": 2.056, + "step": 10971 + }, + { + "epoch": 2.0566073102155578, + "grad_norm": 53036.2421875, + "learning_rate": 4.2430615594407364e-05, + "loss": 2.121, + "step": 10972 + }, + { + "epoch": 2.0567947516401124, + "grad_norm": 49216.4609375, + "learning_rate": 4.242284834395084e-05, + "loss": 2.1091, + "step": 10973 + }, + { + "epoch": 2.0569821930646675, + "grad_norm": 53481.41796875, + "learning_rate": 4.241508128064018e-05, + "loss": 2.1333, + "step": 10974 + }, + { + "epoch": 2.057169634489222, + "grad_norm": 53234.296875, + "learning_rate": 4.240731440466714e-05, + "loss": 2.1918, + "step": 10975 + }, + { + "epoch": 2.057357075913777, + "grad_norm": 58174.0546875, + "learning_rate": 4.2399547716223604e-05, + "loss": 2.1873, + "step": 10976 + }, + { + "epoch": 2.0575445173383318, + "grad_norm": 50252.54296875, + "learning_rate": 4.2391781215501395e-05, + "loss": 2.1847, + "step": 10977 + }, + { + "epoch": 2.0577319587628864, + "grad_norm": 53955.5, + "learning_rate": 4.2384014902692307e-05, + "loss": 2.1212, + "step": 10978 + }, + { + "epoch": 2.0579194001874415, + "grad_norm": 55612.55859375, + "learning_rate": 4.237624877798819e-05, + "loss": 2.1009, + "step": 10979 + }, + { + "epoch": 2.058106841611996, + "grad_norm": 51121.1484375, + "learning_rate": 4.236848284158084e-05, + "loss": 2.168, + "step": 10980 + }, + { + "epoch": 2.058294283036551, + "grad_norm": 50897.01171875, + "learning_rate": 4.236071709366207e-05, + "loss": 2.121, + "step": 10981 + }, + { + "epoch": 2.0584817244611058, + "grad_norm": 52973.15625, + "learning_rate": 4.235295153442368e-05, + "loss": 2.1876, + "step": 10982 + }, + { + "epoch": 2.058669165885661, + "grad_norm": 52748.72265625, + "learning_rate": 4.234518616405746e-05, + "loss": 2.138, + "step": 10983 + }, + { + "epoch": 2.0588566073102155, + "grad_norm": 51805.421875, + "learning_rate": 4.233742098275523e-05, + "loss": 2.1347, + "step": 10984 + }, + { + "epoch": 2.0590440487347705, + "grad_norm": 50839.7265625, + "learning_rate": 4.2329655990708776e-05, + "loss": 2.0975, + "step": 10985 + }, + { + "epoch": 2.059231490159325, + "grad_norm": 50835.73046875, + "learning_rate": 4.232189118810987e-05, + "loss": 2.1565, + "step": 10986 + }, + { + "epoch": 2.0594189315838802, + "grad_norm": 56424.98828125, + "learning_rate": 4.231412657515029e-05, + "loss": 2.094, + "step": 10987 + }, + { + "epoch": 2.059606373008435, + "grad_norm": 50891.58984375, + "learning_rate": 4.230636215202181e-05, + "loss": 2.1592, + "step": 10988 + }, + { + "epoch": 2.0597938144329895, + "grad_norm": 52669.32421875, + "learning_rate": 4.2298597918916236e-05, + "loss": 2.1538, + "step": 10989 + }, + { + "epoch": 2.0599812558575445, + "grad_norm": 52240.1015625, + "learning_rate": 4.229083387602529e-05, + "loss": 2.1037, + "step": 10990 + }, + { + "epoch": 2.060168697282099, + "grad_norm": 53020.7109375, + "learning_rate": 4.228307002354075e-05, + "loss": 2.2412, + "step": 10991 + }, + { + "epoch": 2.0603561387066542, + "grad_norm": 48230.734375, + "learning_rate": 4.227530636165439e-05, + "loss": 2.1672, + "step": 10992 + }, + { + "epoch": 2.060543580131209, + "grad_norm": 51887.2109375, + "learning_rate": 4.226754289055794e-05, + "loss": 2.1064, + "step": 10993 + }, + { + "epoch": 2.060731021555764, + "grad_norm": 53253.203125, + "learning_rate": 4.2259779610443154e-05, + "loss": 2.0837, + "step": 10994 + }, + { + "epoch": 2.0609184629803186, + "grad_norm": 51578.0078125, + "learning_rate": 4.225201652150177e-05, + "loss": 2.159, + "step": 10995 + }, + { + "epoch": 2.0611059044048736, + "grad_norm": 57611.45703125, + "learning_rate": 4.2244253623925556e-05, + "loss": 2.2102, + "step": 10996 + }, + { + "epoch": 2.0612933458294282, + "grad_norm": 59084.41796875, + "learning_rate": 4.22364909179062e-05, + "loss": 2.108, + "step": 10997 + }, + { + "epoch": 2.0614807872539833, + "grad_norm": 53399.8515625, + "learning_rate": 4.2228728403635446e-05, + "loss": 2.1622, + "step": 10998 + }, + { + "epoch": 2.061668228678538, + "grad_norm": 51111.01953125, + "learning_rate": 4.2220966081305034e-05, + "loss": 2.2613, + "step": 10999 + }, + { + "epoch": 2.0618556701030926, + "grad_norm": 51848.7890625, + "learning_rate": 4.221320395110669e-05, + "loss": 2.2065, + "step": 11000 + }, + { + "epoch": 2.0618556701030926, + "eval_loss": 2.2797443866729736, + "eval_runtime": 124.0926, + "eval_samples_per_second": 40.687, + "eval_steps_per_second": 2.039, + "step": 11000 + }, + { + "epoch": 2.0620431115276476, + "grad_norm": 51655.24609375, + "learning_rate": 4.22054420132321e-05, + "loss": 2.1359, + "step": 11001 + }, + { + "epoch": 2.0622305529522023, + "grad_norm": 53330.15234375, + "learning_rate": 4.2197680267872987e-05, + "loss": 2.0835, + "step": 11002 + }, + { + "epoch": 2.0624179943767573, + "grad_norm": 53510.0390625, + "learning_rate": 4.218991871522105e-05, + "loss": 2.177, + "step": 11003 + }, + { + "epoch": 2.062605435801312, + "grad_norm": 55764.57421875, + "learning_rate": 4.218215735546801e-05, + "loss": 2.1388, + "step": 11004 + }, + { + "epoch": 2.062792877225867, + "grad_norm": 50618.734375, + "learning_rate": 4.217439618880553e-05, + "loss": 2.1521, + "step": 11005 + }, + { + "epoch": 2.0629803186504216, + "grad_norm": 55300.0703125, + "learning_rate": 4.216663521542531e-05, + "loss": 2.2049, + "step": 11006 + }, + { + "epoch": 2.0631677600749767, + "grad_norm": 52214.0, + "learning_rate": 4.215887443551908e-05, + "loss": 2.183, + "step": 11007 + }, + { + "epoch": 2.0633552014995313, + "grad_norm": 50319.32421875, + "learning_rate": 4.215111384927846e-05, + "loss": 2.178, + "step": 11008 + }, + { + "epoch": 2.0635426429240864, + "grad_norm": 50530.921875, + "learning_rate": 4.2143353456895154e-05, + "loss": 2.181, + "step": 11009 + }, + { + "epoch": 2.063730084348641, + "grad_norm": 53346.34765625, + "learning_rate": 4.213559325856083e-05, + "loss": 2.1094, + "step": 11010 + }, + { + "epoch": 2.0639175257731956, + "grad_norm": 49787.14453125, + "learning_rate": 4.2127833254467166e-05, + "loss": 2.1286, + "step": 11011 + }, + { + "epoch": 2.0641049671977507, + "grad_norm": 50926.625, + "learning_rate": 4.212007344480581e-05, + "loss": 2.145, + "step": 11012 + }, + { + "epoch": 2.0642924086223053, + "grad_norm": 55943.98046875, + "learning_rate": 4.2112313829768405e-05, + "loss": 2.0492, + "step": 11013 + }, + { + "epoch": 2.0644798500468604, + "grad_norm": 52036.2578125, + "learning_rate": 4.2104554409546635e-05, + "loss": 2.2712, + "step": 11014 + }, + { + "epoch": 2.064667291471415, + "grad_norm": 54931.54296875, + "learning_rate": 4.209679518433215e-05, + "loss": 2.1086, + "step": 11015 + }, + { + "epoch": 2.06485473289597, + "grad_norm": 51884.80078125, + "learning_rate": 4.2089036154316564e-05, + "loss": 2.132, + "step": 11016 + }, + { + "epoch": 2.0650421743205247, + "grad_norm": 47623.2734375, + "learning_rate": 4.208127731969153e-05, + "loss": 2.1493, + "step": 11017 + }, + { + "epoch": 2.06522961574508, + "grad_norm": 55554.9609375, + "learning_rate": 4.2073518680648685e-05, + "loss": 2.0882, + "step": 11018 + }, + { + "epoch": 2.0654170571696344, + "grad_norm": 54139.5234375, + "learning_rate": 4.2065760237379645e-05, + "loss": 2.1118, + "step": 11019 + }, + { + "epoch": 2.0656044985941895, + "grad_norm": 54364.9765625, + "learning_rate": 4.205800199007604e-05, + "loss": 2.1757, + "step": 11020 + }, + { + "epoch": 2.065791940018744, + "grad_norm": 51739.65234375, + "learning_rate": 4.205024393892947e-05, + "loss": 2.2157, + "step": 11021 + }, + { + "epoch": 2.065979381443299, + "grad_norm": 50802.4296875, + "learning_rate": 4.204248608413161e-05, + "loss": 2.1781, + "step": 11022 + }, + { + "epoch": 2.066166822867854, + "grad_norm": 52980.44140625, + "learning_rate": 4.203472842587399e-05, + "loss": 2.0989, + "step": 11023 + }, + { + "epoch": 2.0663542642924084, + "grad_norm": 53458.67578125, + "learning_rate": 4.2026970964348265e-05, + "loss": 2.1725, + "step": 11024 + }, + { + "epoch": 2.0665417057169635, + "grad_norm": 52712.52734375, + "learning_rate": 4.201921369974602e-05, + "loss": 2.1692, + "step": 11025 + }, + { + "epoch": 2.066729147141518, + "grad_norm": 53610.0546875, + "learning_rate": 4.201145663225886e-05, + "loss": 2.0649, + "step": 11026 + }, + { + "epoch": 2.066916588566073, + "grad_norm": 55821.31640625, + "learning_rate": 4.200369976207835e-05, + "loss": 2.1409, + "step": 11027 + }, + { + "epoch": 2.067104029990628, + "grad_norm": 51204.3203125, + "learning_rate": 4.199594308939608e-05, + "loss": 2.1365, + "step": 11028 + }, + { + "epoch": 2.067291471415183, + "grad_norm": 56005.92578125, + "learning_rate": 4.198818661440364e-05, + "loss": 2.1211, + "step": 11029 + }, + { + "epoch": 2.0674789128397375, + "grad_norm": 52589.43359375, + "learning_rate": 4.198043033729263e-05, + "loss": 2.1217, + "step": 11030 + }, + { + "epoch": 2.0676663542642926, + "grad_norm": 56978.75390625, + "learning_rate": 4.197267425825457e-05, + "loss": 2.1372, + "step": 11031 + }, + { + "epoch": 2.067853795688847, + "grad_norm": 57383.78125, + "learning_rate": 4.196491837748106e-05, + "loss": 2.1359, + "step": 11032 + }, + { + "epoch": 2.0680412371134023, + "grad_norm": 53785.4296875, + "learning_rate": 4.195716269516365e-05, + "loss": 2.0531, + "step": 11033 + }, + { + "epoch": 2.068228678537957, + "grad_norm": 53746.0234375, + "learning_rate": 4.194940721149388e-05, + "loss": 2.1871, + "step": 11034 + }, + { + "epoch": 2.0684161199625115, + "grad_norm": 51778.40625, + "learning_rate": 4.1941651926663326e-05, + "loss": 2.1367, + "step": 11035 + }, + { + "epoch": 2.0686035613870666, + "grad_norm": 49070.078125, + "learning_rate": 4.1933896840863496e-05, + "loss": 2.1026, + "step": 11036 + }, + { + "epoch": 2.068791002811621, + "grad_norm": 54341.52734375, + "learning_rate": 4.1926141954286004e-05, + "loss": 2.1903, + "step": 11037 + }, + { + "epoch": 2.0689784442361763, + "grad_norm": 55425.30078125, + "learning_rate": 4.1918387267122295e-05, + "loss": 2.145, + "step": 11038 + }, + { + "epoch": 2.069165885660731, + "grad_norm": 53709.06640625, + "learning_rate": 4.191063277956395e-05, + "loss": 2.1758, + "step": 11039 + }, + { + "epoch": 2.069353327085286, + "grad_norm": 51756.36328125, + "learning_rate": 4.19028784918025e-05, + "loss": 2.1707, + "step": 11040 + }, + { + "epoch": 2.0695407685098406, + "grad_norm": 52542.6171875, + "learning_rate": 4.1895124404029454e-05, + "loss": 2.1245, + "step": 11041 + }, + { + "epoch": 2.0697282099343957, + "grad_norm": 51492.58984375, + "learning_rate": 4.188737051643632e-05, + "loss": 2.1577, + "step": 11042 + }, + { + "epoch": 2.0699156513589503, + "grad_norm": 52865.90234375, + "learning_rate": 4.187961682921461e-05, + "loss": 2.2298, + "step": 11043 + }, + { + "epoch": 2.0701030927835053, + "grad_norm": 50833.26953125, + "learning_rate": 4.187186334255584e-05, + "loss": 2.1398, + "step": 11044 + }, + { + "epoch": 2.07029053420806, + "grad_norm": 48741.4453125, + "learning_rate": 4.186411005665152e-05, + "loss": 2.1494, + "step": 11045 + }, + { + "epoch": 2.0704779756326146, + "grad_norm": 53815.94140625, + "learning_rate": 4.185635697169312e-05, + "loss": 2.101, + "step": 11046 + }, + { + "epoch": 2.0706654170571697, + "grad_norm": 57930.3359375, + "learning_rate": 4.184860408787214e-05, + "loss": 2.1322, + "step": 11047 + }, + { + "epoch": 2.0708528584817243, + "grad_norm": 52097.17578125, + "learning_rate": 4.1840851405380096e-05, + "loss": 2.1568, + "step": 11048 + }, + { + "epoch": 2.0710402999062794, + "grad_norm": 51929.62890625, + "learning_rate": 4.183309892440842e-05, + "loss": 2.1518, + "step": 11049 + }, + { + "epoch": 2.071227741330834, + "grad_norm": 56046.2578125, + "learning_rate": 4.182534664514862e-05, + "loss": 2.2078, + "step": 11050 + }, + { + "epoch": 2.071415182755389, + "grad_norm": 53303.640625, + "learning_rate": 4.1817594567792146e-05, + "loss": 2.1671, + "step": 11051 + }, + { + "epoch": 2.0716026241799437, + "grad_norm": 53354.87890625, + "learning_rate": 4.180984269253051e-05, + "loss": 2.1564, + "step": 11052 + }, + { + "epoch": 2.0717900656044987, + "grad_norm": 50946.390625, + "learning_rate": 4.1802091019555104e-05, + "loss": 2.1456, + "step": 11053 + }, + { + "epoch": 2.0719775070290534, + "grad_norm": 53554.375, + "learning_rate": 4.1794339549057436e-05, + "loss": 2.1007, + "step": 11054 + }, + { + "epoch": 2.0721649484536084, + "grad_norm": 49721.6328125, + "learning_rate": 4.178658828122894e-05, + "loss": 2.1579, + "step": 11055 + }, + { + "epoch": 2.072352389878163, + "grad_norm": 51816.296875, + "learning_rate": 4.177883721626108e-05, + "loss": 2.1639, + "step": 11056 + }, + { + "epoch": 2.072539831302718, + "grad_norm": 58131.96875, + "learning_rate": 4.177108635434527e-05, + "loss": 2.1652, + "step": 11057 + }, + { + "epoch": 2.0727272727272728, + "grad_norm": 52810.2578125, + "learning_rate": 4.176333569567295e-05, + "loss": 2.1794, + "step": 11058 + }, + { + "epoch": 2.0729147141518274, + "grad_norm": 55966.26171875, + "learning_rate": 4.175558524043558e-05, + "loss": 2.17, + "step": 11059 + }, + { + "epoch": 2.0731021555763824, + "grad_norm": 51274.31640625, + "learning_rate": 4.1747834988824555e-05, + "loss": 2.1889, + "step": 11060 + }, + { + "epoch": 2.073289597000937, + "grad_norm": 54343.2734375, + "learning_rate": 4.174008494103129e-05, + "loss": 2.092, + "step": 11061 + }, + { + "epoch": 2.073477038425492, + "grad_norm": 53990.0, + "learning_rate": 4.173233509724723e-05, + "loss": 2.1441, + "step": 11062 + }, + { + "epoch": 2.0736644798500468, + "grad_norm": 49396.30859375, + "learning_rate": 4.172458545766379e-05, + "loss": 2.1251, + "step": 11063 + }, + { + "epoch": 2.073851921274602, + "grad_norm": 54423.02734375, + "learning_rate": 4.1716836022472345e-05, + "loss": 2.1549, + "step": 11064 + }, + { + "epoch": 2.0740393626991565, + "grad_norm": 52037.42578125, + "learning_rate": 4.170908679186431e-05, + "loss": 2.1837, + "step": 11065 + }, + { + "epoch": 2.0742268041237115, + "grad_norm": 53623.98828125, + "learning_rate": 4.1701337766031066e-05, + "loss": 2.1514, + "step": 11066 + }, + { + "epoch": 2.074414245548266, + "grad_norm": 52598.14453125, + "learning_rate": 4.169358894516406e-05, + "loss": 2.105, + "step": 11067 + }, + { + "epoch": 2.074601686972821, + "grad_norm": 51089.96484375, + "learning_rate": 4.1685840329454606e-05, + "loss": 2.1242, + "step": 11068 + }, + { + "epoch": 2.074789128397376, + "grad_norm": 49831.05859375, + "learning_rate": 4.1678091919094125e-05, + "loss": 2.1691, + "step": 11069 + }, + { + "epoch": 2.0749765698219305, + "grad_norm": 50685.2578125, + "learning_rate": 4.167034371427399e-05, + "loss": 2.1852, + "step": 11070 + }, + { + "epoch": 2.0751640112464855, + "grad_norm": 54317.7890625, + "learning_rate": 4.1662595715185563e-05, + "loss": 2.0897, + "step": 11071 + }, + { + "epoch": 2.07535145267104, + "grad_norm": 51340.77734375, + "learning_rate": 4.1654847922020206e-05, + "loss": 2.1207, + "step": 11072 + }, + { + "epoch": 2.075538894095595, + "grad_norm": 51173.75, + "learning_rate": 4.164710033496929e-05, + "loss": 2.1983, + "step": 11073 + }, + { + "epoch": 2.07572633552015, + "grad_norm": 55312.65625, + "learning_rate": 4.1639352954224176e-05, + "loss": 2.1255, + "step": 11074 + }, + { + "epoch": 2.075913776944705, + "grad_norm": 54368.8046875, + "learning_rate": 4.163160577997619e-05, + "loss": 2.1607, + "step": 11075 + }, + { + "epoch": 2.0761012183692595, + "grad_norm": 53678.9140625, + "learning_rate": 4.162385881241668e-05, + "loss": 2.2106, + "step": 11076 + }, + { + "epoch": 2.0762886597938146, + "grad_norm": 50226.46875, + "learning_rate": 4.161611205173702e-05, + "loss": 2.1034, + "step": 11077 + }, + { + "epoch": 2.0764761012183692, + "grad_norm": 48726.015625, + "learning_rate": 4.160836549812852e-05, + "loss": 2.1805, + "step": 11078 + }, + { + "epoch": 2.0766635426429243, + "grad_norm": 54653.75, + "learning_rate": 4.1600619151782505e-05, + "loss": 2.1399, + "step": 11079 + }, + { + "epoch": 2.076850984067479, + "grad_norm": 53669.98828125, + "learning_rate": 4.159287301289031e-05, + "loss": 2.1149, + "step": 11080 + }, + { + "epoch": 2.0770384254920335, + "grad_norm": 52756.8984375, + "learning_rate": 4.158512708164324e-05, + "loss": 2.1192, + "step": 11081 + }, + { + "epoch": 2.0772258669165886, + "grad_norm": 52066.68359375, + "learning_rate": 4.157738135823265e-05, + "loss": 2.0926, + "step": 11082 + }, + { + "epoch": 2.0774133083411432, + "grad_norm": 50907.53125, + "learning_rate": 4.156963584284979e-05, + "loss": 2.1232, + "step": 11083 + }, + { + "epoch": 2.0776007497656983, + "grad_norm": 51937.54296875, + "learning_rate": 4.156189053568601e-05, + "loss": 2.1297, + "step": 11084 + }, + { + "epoch": 2.077788191190253, + "grad_norm": 49081.7578125, + "learning_rate": 4.155414543693259e-05, + "loss": 2.1168, + "step": 11085 + }, + { + "epoch": 2.077975632614808, + "grad_norm": 55437.51171875, + "learning_rate": 4.1546400546780834e-05, + "loss": 2.154, + "step": 11086 + }, + { + "epoch": 2.0781630740393626, + "grad_norm": 53460.3125, + "learning_rate": 4.153865586542202e-05, + "loss": 2.1732, + "step": 11087 + }, + { + "epoch": 2.0783505154639177, + "grad_norm": 47552.796875, + "learning_rate": 4.1530911393047434e-05, + "loss": 2.1059, + "step": 11088 + }, + { + "epoch": 2.0785379568884723, + "grad_norm": 57644.09375, + "learning_rate": 4.152316712984836e-05, + "loss": 2.1599, + "step": 11089 + }, + { + "epoch": 2.0787253983130274, + "grad_norm": 56309.75390625, + "learning_rate": 4.1515423076016066e-05, + "loss": 2.1075, + "step": 11090 + }, + { + "epoch": 2.078912839737582, + "grad_norm": 49769.66015625, + "learning_rate": 4.150767923174181e-05, + "loss": 2.1205, + "step": 11091 + }, + { + "epoch": 2.0791002811621366, + "grad_norm": 52613.71875, + "learning_rate": 4.149993559721687e-05, + "loss": 2.178, + "step": 11092 + }, + { + "epoch": 2.0792877225866917, + "grad_norm": 54451.95703125, + "learning_rate": 4.149219217263252e-05, + "loss": 2.2234, + "step": 11093 + }, + { + "epoch": 2.0794751640112463, + "grad_norm": 56502.890625, + "learning_rate": 4.148444895817998e-05, + "loss": 2.1286, + "step": 11094 + }, + { + "epoch": 2.0796626054358014, + "grad_norm": 54404.953125, + "learning_rate": 4.147670595405051e-05, + "loss": 2.2275, + "step": 11095 + }, + { + "epoch": 2.079850046860356, + "grad_norm": 56778.828125, + "learning_rate": 4.146896316043535e-05, + "loss": 2.1646, + "step": 11096 + }, + { + "epoch": 2.080037488284911, + "grad_norm": 51754.8828125, + "learning_rate": 4.146122057752574e-05, + "loss": 2.1698, + "step": 11097 + }, + { + "epoch": 2.0802249297094657, + "grad_norm": 56837.0859375, + "learning_rate": 4.145347820551292e-05, + "loss": 2.1025, + "step": 11098 + }, + { + "epoch": 2.0804123711340208, + "grad_norm": 52794.57421875, + "learning_rate": 4.1445736044588086e-05, + "loss": 2.1913, + "step": 11099 + }, + { + "epoch": 2.0805998125585754, + "grad_norm": 54761.484375, + "learning_rate": 4.143799409494249e-05, + "loss": 2.0924, + "step": 11100 + }, + { + "epoch": 2.0807872539831305, + "grad_norm": 59705.05078125, + "learning_rate": 4.1430252356767354e-05, + "loss": 2.1439, + "step": 11101 + }, + { + "epoch": 2.080974695407685, + "grad_norm": 51739.95703125, + "learning_rate": 4.142251083025386e-05, + "loss": 2.1253, + "step": 11102 + }, + { + "epoch": 2.0811621368322397, + "grad_norm": 53408.3828125, + "learning_rate": 4.1414769515593235e-05, + "loss": 2.1447, + "step": 11103 + }, + { + "epoch": 2.081349578256795, + "grad_norm": 53147.34765625, + "learning_rate": 4.1407028412976675e-05, + "loss": 2.1788, + "step": 11104 + }, + { + "epoch": 2.0815370196813494, + "grad_norm": 54609.4375, + "learning_rate": 4.139928752259537e-05, + "loss": 2.1105, + "step": 11105 + }, + { + "epoch": 2.0817244611059045, + "grad_norm": 53362.00390625, + "learning_rate": 4.139154684464049e-05, + "loss": 2.0937, + "step": 11106 + }, + { + "epoch": 2.081911902530459, + "grad_norm": 47433.6484375, + "learning_rate": 4.1383806379303256e-05, + "loss": 2.1406, + "step": 11107 + }, + { + "epoch": 2.082099343955014, + "grad_norm": 55318.01953125, + "learning_rate": 4.137606612677485e-05, + "loss": 2.1876, + "step": 11108 + }, + { + "epoch": 2.082286785379569, + "grad_norm": 51913.23046875, + "learning_rate": 4.136832608724641e-05, + "loss": 2.1291, + "step": 11109 + }, + { + "epoch": 2.082474226804124, + "grad_norm": 59108.42578125, + "learning_rate": 4.1360586260909126e-05, + "loss": 2.1021, + "step": 11110 + }, + { + "epoch": 2.0826616682286785, + "grad_norm": 51810.5859375, + "learning_rate": 4.135284664795417e-05, + "loss": 2.2105, + "step": 11111 + }, + { + "epoch": 2.0828491096532336, + "grad_norm": 50837.56640625, + "learning_rate": 4.13451072485727e-05, + "loss": 2.0942, + "step": 11112 + }, + { + "epoch": 2.083036551077788, + "grad_norm": 52767.8359375, + "learning_rate": 4.133736806295584e-05, + "loss": 2.1608, + "step": 11113 + }, + { + "epoch": 2.083223992502343, + "grad_norm": 55317.20703125, + "learning_rate": 4.1329629091294753e-05, + "loss": 2.1097, + "step": 11114 + }, + { + "epoch": 2.083411433926898, + "grad_norm": 55233.7734375, + "learning_rate": 4.132189033378061e-05, + "loss": 2.1547, + "step": 11115 + }, + { + "epoch": 2.0835988753514525, + "grad_norm": 49262.69140625, + "learning_rate": 4.1314151790604504e-05, + "loss": 2.0835, + "step": 11116 + }, + { + "epoch": 2.0837863167760076, + "grad_norm": 52674.74609375, + "learning_rate": 4.130641346195759e-05, + "loss": 2.1687, + "step": 11117 + }, + { + "epoch": 2.083973758200562, + "grad_norm": 56556.18359375, + "learning_rate": 4.1298675348030994e-05, + "loss": 2.1222, + "step": 11118 + }, + { + "epoch": 2.0841611996251173, + "grad_norm": 54883.828125, + "learning_rate": 4.129093744901585e-05, + "loss": 2.0808, + "step": 11119 + }, + { + "epoch": 2.084348641049672, + "grad_norm": 52386.81640625, + "learning_rate": 4.128319976510324e-05, + "loss": 2.1105, + "step": 11120 + }, + { + "epoch": 2.084536082474227, + "grad_norm": 50171.44921875, + "learning_rate": 4.127546229648429e-05, + "loss": 2.1912, + "step": 11121 + }, + { + "epoch": 2.0847235238987816, + "grad_norm": 50579.9296875, + "learning_rate": 4.126772504335012e-05, + "loss": 2.1854, + "step": 11122 + }, + { + "epoch": 2.0849109653233366, + "grad_norm": 61990.20703125, + "learning_rate": 4.1259988005891834e-05, + "loss": 2.1243, + "step": 11123 + }, + { + "epoch": 2.0850984067478913, + "grad_norm": 51677.9609375, + "learning_rate": 4.1252251184300496e-05, + "loss": 2.1963, + "step": 11124 + }, + { + "epoch": 2.085285848172446, + "grad_norm": 50506.75390625, + "learning_rate": 4.124451457876722e-05, + "loss": 2.1291, + "step": 11125 + }, + { + "epoch": 2.085473289597001, + "grad_norm": 53686.1171875, + "learning_rate": 4.123677818948307e-05, + "loss": 2.1962, + "step": 11126 + }, + { + "epoch": 2.0856607310215556, + "grad_norm": 57086.0859375, + "learning_rate": 4.122904201663916e-05, + "loss": 2.2106, + "step": 11127 + }, + { + "epoch": 2.0858481724461106, + "grad_norm": 53872.60546875, + "learning_rate": 4.1221306060426525e-05, + "loss": 2.1598, + "step": 11128 + }, + { + "epoch": 2.0860356138706653, + "grad_norm": 54358.5234375, + "learning_rate": 4.121357032103623e-05, + "loss": 2.1985, + "step": 11129 + }, + { + "epoch": 2.0862230552952203, + "grad_norm": 47304.421875, + "learning_rate": 4.12058347986594e-05, + "loss": 2.1783, + "step": 11130 + }, + { + "epoch": 2.086410496719775, + "grad_norm": 57956.58203125, + "learning_rate": 4.119809949348701e-05, + "loss": 2.172, + "step": 11131 + }, + { + "epoch": 2.08659793814433, + "grad_norm": 54351.875, + "learning_rate": 4.119036440571017e-05, + "loss": 2.1455, + "step": 11132 + }, + { + "epoch": 2.0867853795688847, + "grad_norm": 52604.71484375, + "learning_rate": 4.11826295355199e-05, + "loss": 2.1548, + "step": 11133 + }, + { + "epoch": 2.0869728209934397, + "grad_norm": 51942.28515625, + "learning_rate": 4.1174894883107255e-05, + "loss": 2.148, + "step": 11134 + }, + { + "epoch": 2.0871602624179943, + "grad_norm": 55427.64453125, + "learning_rate": 4.116716044866326e-05, + "loss": 2.1475, + "step": 11135 + }, + { + "epoch": 2.0873477038425494, + "grad_norm": 55388.484375, + "learning_rate": 4.115942623237894e-05, + "loss": 2.2227, + "step": 11136 + }, + { + "epoch": 2.087535145267104, + "grad_norm": 51270.0859375, + "learning_rate": 4.1151692234445336e-05, + "loss": 2.0679, + "step": 11137 + }, + { + "epoch": 2.0877225866916587, + "grad_norm": 48865.8984375, + "learning_rate": 4.114395845505348e-05, + "loss": 2.0937, + "step": 11138 + }, + { + "epoch": 2.0879100281162137, + "grad_norm": 50543.60546875, + "learning_rate": 4.113622489439435e-05, + "loss": 2.1527, + "step": 11139 + }, + { + "epoch": 2.0880974695407684, + "grad_norm": 54053.78125, + "learning_rate": 4.112849155265898e-05, + "loss": 2.1527, + "step": 11140 + }, + { + "epoch": 2.0882849109653234, + "grad_norm": 55270.078125, + "learning_rate": 4.1120758430038356e-05, + "loss": 2.1179, + "step": 11141 + }, + { + "epoch": 2.088472352389878, + "grad_norm": 54275.92578125, + "learning_rate": 4.11130255267235e-05, + "loss": 2.0635, + "step": 11142 + }, + { + "epoch": 2.088659793814433, + "grad_norm": 54240.99609375, + "learning_rate": 4.110529284290539e-05, + "loss": 2.1309, + "step": 11143 + }, + { + "epoch": 2.0888472352389877, + "grad_norm": 53439.01171875, + "learning_rate": 4.1097560378774996e-05, + "loss": 2.1684, + "step": 11144 + }, + { + "epoch": 2.089034676663543, + "grad_norm": 52984.08203125, + "learning_rate": 4.108982813452336e-05, + "loss": 2.1552, + "step": 11145 + }, + { + "epoch": 2.0892221180880974, + "grad_norm": 54254.49609375, + "learning_rate": 4.108209611034137e-05, + "loss": 2.0829, + "step": 11146 + }, + { + "epoch": 2.0894095595126525, + "grad_norm": 59935.6171875, + "learning_rate": 4.1074364306420067e-05, + "loss": 2.1118, + "step": 11147 + }, + { + "epoch": 2.089597000937207, + "grad_norm": 53702.16796875, + "learning_rate": 4.1066632722950384e-05, + "loss": 2.1696, + "step": 11148 + }, + { + "epoch": 2.0897844423617618, + "grad_norm": 50994.68359375, + "learning_rate": 4.105890136012331e-05, + "loss": 2.1333, + "step": 11149 + }, + { + "epoch": 2.089971883786317, + "grad_norm": 52189.015625, + "learning_rate": 4.105117021812976e-05, + "loss": 2.1477, + "step": 11150 + }, + { + "epoch": 2.0901593252108714, + "grad_norm": 55624.83984375, + "learning_rate": 4.1043439297160714e-05, + "loss": 2.1572, + "step": 11151 + }, + { + "epoch": 2.0903467666354265, + "grad_norm": 54433.34375, + "learning_rate": 4.1035708597407085e-05, + "loss": 2.1349, + "step": 11152 + }, + { + "epoch": 2.090534208059981, + "grad_norm": 51906.7265625, + "learning_rate": 4.1027978119059867e-05, + "loss": 2.1487, + "step": 11153 + }, + { + "epoch": 2.090721649484536, + "grad_norm": 52121.5234375, + "learning_rate": 4.1020247862309924e-05, + "loss": 2.1751, + "step": 11154 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 49705.75, + "learning_rate": 4.101251782734823e-05, + "loss": 2.1044, + "step": 11155 + }, + { + "epoch": 2.091096532333646, + "grad_norm": 56379.40625, + "learning_rate": 4.1004788014365704e-05, + "loss": 2.1151, + "step": 11156 + }, + { + "epoch": 2.0912839737582005, + "grad_norm": 50985.2421875, + "learning_rate": 4.0997058423553234e-05, + "loss": 2.1455, + "step": 11157 + }, + { + "epoch": 2.0914714151827556, + "grad_norm": 48948.94140625, + "learning_rate": 4.098932905510175e-05, + "loss": 2.1572, + "step": 11158 + }, + { + "epoch": 2.09165885660731, + "grad_norm": 50639.47265625, + "learning_rate": 4.0981599909202155e-05, + "loss": 2.1272, + "step": 11159 + }, + { + "epoch": 2.091846298031865, + "grad_norm": 49762.72265625, + "learning_rate": 4.0973870986045374e-05, + "loss": 2.1694, + "step": 11160 + }, + { + "epoch": 2.09203373945642, + "grad_norm": 49897.09375, + "learning_rate": 4.096614228582225e-05, + "loss": 2.159, + "step": 11161 + }, + { + "epoch": 2.0922211808809745, + "grad_norm": 53529.4375, + "learning_rate": 4.095841380872371e-05, + "loss": 2.154, + "step": 11162 + }, + { + "epoch": 2.0924086223055296, + "grad_norm": 53046.65234375, + "learning_rate": 4.0950685554940615e-05, + "loss": 2.1639, + "step": 11163 + }, + { + "epoch": 2.092596063730084, + "grad_norm": 57153.18359375, + "learning_rate": 4.094295752466387e-05, + "loss": 2.1363, + "step": 11164 + }, + { + "epoch": 2.0927835051546393, + "grad_norm": 50815.6015625, + "learning_rate": 4.093522971808433e-05, + "loss": 2.0577, + "step": 11165 + }, + { + "epoch": 2.092970946579194, + "grad_norm": 52753.61328125, + "learning_rate": 4.092750213539285e-05, + "loss": 2.1754, + "step": 11166 + }, + { + "epoch": 2.093158388003749, + "grad_norm": 51032.12890625, + "learning_rate": 4.091977477678031e-05, + "loss": 2.1356, + "step": 11167 + }, + { + "epoch": 2.0933458294283036, + "grad_norm": 55049.2578125, + "learning_rate": 4.091204764243758e-05, + "loss": 2.1362, + "step": 11168 + }, + { + "epoch": 2.0935332708528587, + "grad_norm": 53891.3046875, + "learning_rate": 4.090432073255546e-05, + "loss": 2.1808, + "step": 11169 + }, + { + "epoch": 2.0937207122774133, + "grad_norm": 51414.51953125, + "learning_rate": 4.089659404732483e-05, + "loss": 2.1628, + "step": 11170 + }, + { + "epoch": 2.093908153701968, + "grad_norm": 55378.4765625, + "learning_rate": 4.088886758693655e-05, + "loss": 2.0828, + "step": 11171 + }, + { + "epoch": 2.094095595126523, + "grad_norm": 54299.78125, + "learning_rate": 4.088114135158141e-05, + "loss": 2.1591, + "step": 11172 + }, + { + "epoch": 2.0942830365510776, + "grad_norm": 57627.04296875, + "learning_rate": 4.087341534145025e-05, + "loss": 2.0675, + "step": 11173 + }, + { + "epoch": 2.0944704779756327, + "grad_norm": 51839.9921875, + "learning_rate": 4.086568955673389e-05, + "loss": 2.2157, + "step": 11174 + }, + { + "epoch": 2.0946579194001873, + "grad_norm": 53454.1484375, + "learning_rate": 4.0857963997623194e-05, + "loss": 2.1738, + "step": 11175 + }, + { + "epoch": 2.0948453608247424, + "grad_norm": 55626.171875, + "learning_rate": 4.08502386643089e-05, + "loss": 2.1684, + "step": 11176 + }, + { + "epoch": 2.095032802249297, + "grad_norm": 51686.91015625, + "learning_rate": 4.084251355698185e-05, + "loss": 2.1142, + "step": 11177 + }, + { + "epoch": 2.095220243673852, + "grad_norm": 49745.7890625, + "learning_rate": 4.0834788675832855e-05, + "loss": 2.1174, + "step": 11178 + }, + { + "epoch": 2.0954076850984067, + "grad_norm": 59191.2890625, + "learning_rate": 4.08270640210527e-05, + "loss": 2.1924, + "step": 11179 + }, + { + "epoch": 2.0955951265229618, + "grad_norm": 57595.4296875, + "learning_rate": 4.0819339592832164e-05, + "loss": 2.1493, + "step": 11180 + }, + { + "epoch": 2.0957825679475164, + "grad_norm": 55437.2421875, + "learning_rate": 4.0811615391362034e-05, + "loss": 2.125, + "step": 11181 + }, + { + "epoch": 2.0959700093720715, + "grad_norm": 53680.53515625, + "learning_rate": 4.080389141683308e-05, + "loss": 2.0704, + "step": 11182 + }, + { + "epoch": 2.096157450796626, + "grad_norm": 52242.41015625, + "learning_rate": 4.079616766943612e-05, + "loss": 2.1775, + "step": 11183 + }, + { + "epoch": 2.0963448922211807, + "grad_norm": 50085.71875, + "learning_rate": 4.078844414936185e-05, + "loss": 2.1656, + "step": 11184 + }, + { + "epoch": 2.0965323336457358, + "grad_norm": 53909.2109375, + "learning_rate": 4.078072085680108e-05, + "loss": 2.1507, + "step": 11185 + }, + { + "epoch": 2.0967197750702904, + "grad_norm": 50848.83203125, + "learning_rate": 4.077299779194456e-05, + "loss": 2.1322, + "step": 11186 + }, + { + "epoch": 2.0969072164948455, + "grad_norm": 51676.30859375, + "learning_rate": 4.0765274954983024e-05, + "loss": 2.1785, + "step": 11187 + }, + { + "epoch": 2.0970946579194, + "grad_norm": 49538.05859375, + "learning_rate": 4.075755234610722e-05, + "loss": 2.163, + "step": 11188 + }, + { + "epoch": 2.097282099343955, + "grad_norm": 54403.171875, + "learning_rate": 4.07498299655079e-05, + "loss": 2.1773, + "step": 11189 + }, + { + "epoch": 2.0974695407685098, + "grad_norm": 53884.015625, + "learning_rate": 4.0742107813375785e-05, + "loss": 2.152, + "step": 11190 + }, + { + "epoch": 2.097656982193065, + "grad_norm": 52633.89453125, + "learning_rate": 4.0734385889901596e-05, + "loss": 2.1306, + "step": 11191 + }, + { + "epoch": 2.0978444236176195, + "grad_norm": 55406.9609375, + "learning_rate": 4.072666419527605e-05, + "loss": 2.1533, + "step": 11192 + }, + { + "epoch": 2.0980318650421745, + "grad_norm": 52883.1171875, + "learning_rate": 4.07189427296899e-05, + "loss": 2.179, + "step": 11193 + }, + { + "epoch": 2.098219306466729, + "grad_norm": 53491.140625, + "learning_rate": 4.071122149333383e-05, + "loss": 2.1562, + "step": 11194 + }, + { + "epoch": 2.098406747891284, + "grad_norm": 53150.83984375, + "learning_rate": 4.070350048639854e-05, + "loss": 2.1757, + "step": 11195 + }, + { + "epoch": 2.098594189315839, + "grad_norm": 50248.80859375, + "learning_rate": 4.0695779709074726e-05, + "loss": 2.1848, + "step": 11196 + }, + { + "epoch": 2.0987816307403935, + "grad_norm": 53744.25, + "learning_rate": 4.0688059161553085e-05, + "loss": 2.1658, + "step": 11197 + }, + { + "epoch": 2.0989690721649485, + "grad_norm": 51963.2578125, + "learning_rate": 4.068033884402435e-05, + "loss": 2.0782, + "step": 11198 + }, + { + "epoch": 2.099156513589503, + "grad_norm": 51985.36328125, + "learning_rate": 4.067261875667911e-05, + "loss": 2.0901, + "step": 11199 + }, + { + "epoch": 2.0993439550140582, + "grad_norm": 53750.1171875, + "learning_rate": 4.066489889970813e-05, + "loss": 2.1661, + "step": 11200 + }, + { + "epoch": 2.099531396438613, + "grad_norm": 49288.99609375, + "learning_rate": 4.0657179273302035e-05, + "loss": 2.1167, + "step": 11201 + }, + { + "epoch": 2.099718837863168, + "grad_norm": 52947.671875, + "learning_rate": 4.06494598776515e-05, + "loss": 2.1144, + "step": 11202 + }, + { + "epoch": 2.0999062792877226, + "grad_norm": 60614.265625, + "learning_rate": 4.0641740712947175e-05, + "loss": 2.0839, + "step": 11203 + }, + { + "epoch": 2.1000937207122776, + "grad_norm": 55781.0859375, + "learning_rate": 4.063402177937972e-05, + "loss": 2.129, + "step": 11204 + }, + { + "epoch": 2.1002811621368322, + "grad_norm": 52261.50390625, + "learning_rate": 4.06263030771398e-05, + "loss": 2.1338, + "step": 11205 + }, + { + "epoch": 2.100468603561387, + "grad_norm": 53238.80078125, + "learning_rate": 4.061858460641803e-05, + "loss": 2.1006, + "step": 11206 + }, + { + "epoch": 2.100656044985942, + "grad_norm": 50708.65625, + "learning_rate": 4.061086636740504e-05, + "loss": 2.1766, + "step": 11207 + }, + { + "epoch": 2.1008434864104966, + "grad_norm": 58479.81640625, + "learning_rate": 4.06031483602915e-05, + "loss": 2.1384, + "step": 11208 + }, + { + "epoch": 2.1010309278350516, + "grad_norm": 54787.05078125, + "learning_rate": 4.0595430585268016e-05, + "loss": 2.1341, + "step": 11209 + }, + { + "epoch": 2.1012183692596063, + "grad_norm": 55150.5390625, + "learning_rate": 4.058771304252519e-05, + "loss": 2.1565, + "step": 11210 + }, + { + "epoch": 2.1014058106841613, + "grad_norm": 53561.9296875, + "learning_rate": 4.057999573225365e-05, + "loss": 2.0789, + "step": 11211 + }, + { + "epoch": 2.101593252108716, + "grad_norm": 59274.3671875, + "learning_rate": 4.057227865464402e-05, + "loss": 2.0449, + "step": 11212 + }, + { + "epoch": 2.101780693533271, + "grad_norm": 54520.8828125, + "learning_rate": 4.056456180988687e-05, + "loss": 2.0898, + "step": 11213 + }, + { + "epoch": 2.1019681349578256, + "grad_norm": 55224.6640625, + "learning_rate": 4.055684519817279e-05, + "loss": 2.1517, + "step": 11214 + }, + { + "epoch": 2.1021555763823807, + "grad_norm": 52742.1328125, + "learning_rate": 4.054912881969241e-05, + "loss": 2.2058, + "step": 11215 + }, + { + "epoch": 2.1023430178069353, + "grad_norm": 55014.1328125, + "learning_rate": 4.05414126746363e-05, + "loss": 2.1454, + "step": 11216 + }, + { + "epoch": 2.10253045923149, + "grad_norm": 52872.13671875, + "learning_rate": 4.0533696763195026e-05, + "loss": 2.1764, + "step": 11217 + }, + { + "epoch": 2.102717900656045, + "grad_norm": 50170.9375, + "learning_rate": 4.052598108555917e-05, + "loss": 2.1373, + "step": 11218 + }, + { + "epoch": 2.1029053420805996, + "grad_norm": 55466.73046875, + "learning_rate": 4.05182656419193e-05, + "loss": 2.1342, + "step": 11219 + }, + { + "epoch": 2.1030927835051547, + "grad_norm": 55398.8125, + "learning_rate": 4.051055043246599e-05, + "loss": 2.1393, + "step": 11220 + }, + { + "epoch": 2.1032802249297093, + "grad_norm": 58855.0546875, + "learning_rate": 4.050283545738976e-05, + "loss": 2.1835, + "step": 11221 + }, + { + "epoch": 2.1034676663542644, + "grad_norm": 52850.734375, + "learning_rate": 4.049512071688118e-05, + "loss": 2.143, + "step": 11222 + }, + { + "epoch": 2.103655107778819, + "grad_norm": 56601.84375, + "learning_rate": 4.048740621113079e-05, + "loss": 2.4516, + "step": 11223 + }, + { + "epoch": 2.103842549203374, + "grad_norm": 59745.94921875, + "learning_rate": 4.0479691940329154e-05, + "loss": 2.1451, + "step": 11224 + }, + { + "epoch": 2.1040299906279287, + "grad_norm": 53135.12890625, + "learning_rate": 4.0471977904666775e-05, + "loss": 2.1234, + "step": 11225 + }, + { + "epoch": 2.104217432052484, + "grad_norm": 54022.0859375, + "learning_rate": 4.0464264104334184e-05, + "loss": 2.1192, + "step": 11226 + }, + { + "epoch": 2.1044048734770384, + "grad_norm": 49893.56640625, + "learning_rate": 4.045655053952192e-05, + "loss": 2.1785, + "step": 11227 + }, + { + "epoch": 2.104592314901593, + "grad_norm": 52999.95703125, + "learning_rate": 4.044883721042047e-05, + "loss": 2.1112, + "step": 11228 + }, + { + "epoch": 2.104779756326148, + "grad_norm": 57685.796875, + "learning_rate": 4.0441124117220355e-05, + "loss": 2.1095, + "step": 11229 + }, + { + "epoch": 2.1049671977507027, + "grad_norm": 49550.50390625, + "learning_rate": 4.043341126011208e-05, + "loss": 2.1502, + "step": 11230 + }, + { + "epoch": 2.105154639175258, + "grad_norm": 58978.0078125, + "learning_rate": 4.0425698639286166e-05, + "loss": 2.0978, + "step": 11231 + }, + { + "epoch": 2.1053420805998124, + "grad_norm": 50848.79296875, + "learning_rate": 4.041798625493307e-05, + "loss": 2.1379, + "step": 11232 + }, + { + "epoch": 2.1055295220243675, + "grad_norm": 53262.25390625, + "learning_rate": 4.0410274107243286e-05, + "loss": 2.0798, + "step": 11233 + }, + { + "epoch": 2.105716963448922, + "grad_norm": 50237.328125, + "learning_rate": 4.0402562196407304e-05, + "loss": 2.1409, + "step": 11234 + }, + { + "epoch": 2.105904404873477, + "grad_norm": 50213.26953125, + "learning_rate": 4.039485052261559e-05, + "loss": 2.1489, + "step": 11235 + }, + { + "epoch": 2.106091846298032, + "grad_norm": 58934.33984375, + "learning_rate": 4.038713908605862e-05, + "loss": 2.2288, + "step": 11236 + }, + { + "epoch": 2.106279287722587, + "grad_norm": 48983.7890625, + "learning_rate": 4.037942788692683e-05, + "loss": 2.1076, + "step": 11237 + }, + { + "epoch": 2.1064667291471415, + "grad_norm": 56707.578125, + "learning_rate": 4.037171692541071e-05, + "loss": 2.1295, + "step": 11238 + }, + { + "epoch": 2.106654170571696, + "grad_norm": 56247.76171875, + "learning_rate": 4.036400620170071e-05, + "loss": 2.1013, + "step": 11239 + }, + { + "epoch": 2.106841611996251, + "grad_norm": 56681.7578125, + "learning_rate": 4.035629571598726e-05, + "loss": 2.1586, + "step": 11240 + }, + { + "epoch": 2.107029053420806, + "grad_norm": 53460.125, + "learning_rate": 4.0348585468460795e-05, + "loss": 2.1215, + "step": 11241 + }, + { + "epoch": 2.107216494845361, + "grad_norm": 55724.40234375, + "learning_rate": 4.0340875459311764e-05, + "loss": 2.051, + "step": 11242 + }, + { + "epoch": 2.1074039362699155, + "grad_norm": 51486.08984375, + "learning_rate": 4.033316568873059e-05, + "loss": 2.181, + "step": 11243 + }, + { + "epoch": 2.1075913776944706, + "grad_norm": 52120.23828125, + "learning_rate": 4.032545615690767e-05, + "loss": 2.1744, + "step": 11244 + }, + { + "epoch": 2.107778819119025, + "grad_norm": 51731.640625, + "learning_rate": 4.031774686403344e-05, + "loss": 2.1398, + "step": 11245 + }, + { + "epoch": 2.1079662605435803, + "grad_norm": 56829.55859375, + "learning_rate": 4.031003781029834e-05, + "loss": 2.0834, + "step": 11246 + }, + { + "epoch": 2.108153701968135, + "grad_norm": 55341.5859375, + "learning_rate": 4.030232899589271e-05, + "loss": 2.1818, + "step": 11247 + }, + { + "epoch": 2.10834114339269, + "grad_norm": 57611.50390625, + "learning_rate": 4.029462042100699e-05, + "loss": 2.1728, + "step": 11248 + }, + { + "epoch": 2.1085285848172446, + "grad_norm": 52725.62890625, + "learning_rate": 4.028691208583157e-05, + "loss": 2.181, + "step": 11249 + }, + { + "epoch": 2.108716026241799, + "grad_norm": 50631.54296875, + "learning_rate": 4.027920399055683e-05, + "loss": 2.1358, + "step": 11250 + }, + { + "epoch": 2.1089034676663543, + "grad_norm": 54187.5625, + "learning_rate": 4.027149613537314e-05, + "loss": 2.1247, + "step": 11251 + }, + { + "epoch": 2.109090909090909, + "grad_norm": 56569.8046875, + "learning_rate": 4.026378852047087e-05, + "loss": 2.1467, + "step": 11252 + }, + { + "epoch": 2.109278350515464, + "grad_norm": 52550.73828125, + "learning_rate": 4.025608114604044e-05, + "loss": 2.131, + "step": 11253 + }, + { + "epoch": 2.1094657919400186, + "grad_norm": 57838.75390625, + "learning_rate": 4.024837401227213e-05, + "loss": 2.1497, + "step": 11254 + }, + { + "epoch": 2.1096532333645737, + "grad_norm": 49144.41015625, + "learning_rate": 4.024066711935635e-05, + "loss": 2.0714, + "step": 11255 + }, + { + "epoch": 2.1098406747891283, + "grad_norm": 56778.22265625, + "learning_rate": 4.023296046748346e-05, + "loss": 2.14, + "step": 11256 + }, + { + "epoch": 2.1100281162136834, + "grad_norm": 50672.9296875, + "learning_rate": 4.022525405684377e-05, + "loss": 2.1636, + "step": 11257 + }, + { + "epoch": 2.110215557638238, + "grad_norm": 52000.8203125, + "learning_rate": 4.0217547887627626e-05, + "loss": 2.0801, + "step": 11258 + }, + { + "epoch": 2.110402999062793, + "grad_norm": 52340.17578125, + "learning_rate": 4.0209841960025375e-05, + "loss": 2.1584, + "step": 11259 + }, + { + "epoch": 2.1105904404873477, + "grad_norm": 53422.02734375, + "learning_rate": 4.0202136274227314e-05, + "loss": 2.1259, + "step": 11260 + }, + { + "epoch": 2.1107778819119027, + "grad_norm": 57740.32421875, + "learning_rate": 4.019443083042382e-05, + "loss": 2.1091, + "step": 11261 + }, + { + "epoch": 2.1109653233364574, + "grad_norm": 51885.23046875, + "learning_rate": 4.018672562880514e-05, + "loss": 2.1354, + "step": 11262 + }, + { + "epoch": 2.111152764761012, + "grad_norm": 55958.99609375, + "learning_rate": 4.0179020669561626e-05, + "loss": 2.1705, + "step": 11263 + }, + { + "epoch": 2.111340206185567, + "grad_norm": 55015.54296875, + "learning_rate": 4.017131595288356e-05, + "loss": 2.0608, + "step": 11264 + }, + { + "epoch": 2.1115276476101217, + "grad_norm": 53496.29296875, + "learning_rate": 4.0163611478961264e-05, + "loss": 2.0922, + "step": 11265 + }, + { + "epoch": 2.1117150890346768, + "grad_norm": 54817.09765625, + "learning_rate": 4.0155907247985e-05, + "loss": 2.1263, + "step": 11266 + }, + { + "epoch": 2.1119025304592314, + "grad_norm": 51485.71875, + "learning_rate": 4.014820326014506e-05, + "loss": 2.1356, + "step": 11267 + }, + { + "epoch": 2.1120899718837864, + "grad_norm": 62794.734375, + "learning_rate": 4.0140499515631744e-05, + "loss": 2.1648, + "step": 11268 + }, + { + "epoch": 2.112277413308341, + "grad_norm": 50596.3671875, + "learning_rate": 4.0132796014635285e-05, + "loss": 2.1448, + "step": 11269 + }, + { + "epoch": 2.112464854732896, + "grad_norm": 52966.67578125, + "learning_rate": 4.012509275734598e-05, + "loss": 2.1044, + "step": 11270 + }, + { + "epoch": 2.1126522961574508, + "grad_norm": 55151.10546875, + "learning_rate": 4.011738974395408e-05, + "loss": 2.1326, + "step": 11271 + }, + { + "epoch": 2.112839737582006, + "grad_norm": 51757.22265625, + "learning_rate": 4.010968697464985e-05, + "loss": 2.1423, + "step": 11272 + }, + { + "epoch": 2.1130271790065605, + "grad_norm": 52874.51171875, + "learning_rate": 4.010198444962352e-05, + "loss": 2.0799, + "step": 11273 + }, + { + "epoch": 2.113214620431115, + "grad_norm": 48413.5625, + "learning_rate": 4.009428216906535e-05, + "loss": 2.127, + "step": 11274 + }, + { + "epoch": 2.11340206185567, + "grad_norm": 57066.125, + "learning_rate": 4.008658013316554e-05, + "loss": 2.1817, + "step": 11275 + }, + { + "epoch": 2.1135895032802248, + "grad_norm": 50779.3359375, + "learning_rate": 4.007887834211439e-05, + "loss": 2.1819, + "step": 11276 + }, + { + "epoch": 2.11377694470478, + "grad_norm": 54536.28125, + "learning_rate": 4.0071176796102047e-05, + "loss": 2.0829, + "step": 11277 + }, + { + "epoch": 2.1139643861293345, + "grad_norm": 54156.546875, + "learning_rate": 4.006347549531877e-05, + "loss": 2.1443, + "step": 11278 + }, + { + "epoch": 2.1141518275538895, + "grad_norm": 54913.640625, + "learning_rate": 4.0055774439954773e-05, + "loss": 2.1412, + "step": 11279 + }, + { + "epoch": 2.114339268978444, + "grad_norm": 54183.25390625, + "learning_rate": 4.0048073630200265e-05, + "loss": 2.1373, + "step": 11280 + }, + { + "epoch": 2.114526710402999, + "grad_norm": 62547.0234375, + "learning_rate": 4.004037306624542e-05, + "loss": 2.1927, + "step": 11281 + }, + { + "epoch": 2.114714151827554, + "grad_norm": 56499.2265625, + "learning_rate": 4.0032672748280456e-05, + "loss": 2.0851, + "step": 11282 + }, + { + "epoch": 2.114901593252109, + "grad_norm": 52843.7265625, + "learning_rate": 4.0024972676495556e-05, + "loss": 2.151, + "step": 11283 + }, + { + "epoch": 2.1150890346766635, + "grad_norm": 51967.8671875, + "learning_rate": 4.001727285108088e-05, + "loss": 2.1365, + "step": 11284 + }, + { + "epoch": 2.115276476101218, + "grad_norm": 54827.53125, + "learning_rate": 4.000957327222663e-05, + "loss": 2.0856, + "step": 11285 + }, + { + "epoch": 2.1154639175257732, + "grad_norm": 52875.91015625, + "learning_rate": 4.000187394012296e-05, + "loss": 2.0838, + "step": 11286 + }, + { + "epoch": 2.115651358950328, + "grad_norm": 51909.33984375, + "learning_rate": 3.999417485496006e-05, + "loss": 2.1226, + "step": 11287 + }, + { + "epoch": 2.115838800374883, + "grad_norm": 52542.5546875, + "learning_rate": 3.998647601692806e-05, + "loss": 2.1041, + "step": 11288 + }, + { + "epoch": 2.1160262417994375, + "grad_norm": 48742.0859375, + "learning_rate": 3.9978777426217114e-05, + "loss": 2.0798, + "step": 11289 + }, + { + "epoch": 2.1162136832239926, + "grad_norm": 48470.0859375, + "learning_rate": 3.9971079083017364e-05, + "loss": 2.2217, + "step": 11290 + }, + { + "epoch": 2.1164011246485472, + "grad_norm": 56394.94140625, + "learning_rate": 3.996338098751899e-05, + "loss": 2.1719, + "step": 11291 + }, + { + "epoch": 2.1165885660731023, + "grad_norm": 53489.86328125, + "learning_rate": 3.9955683139912056e-05, + "loss": 2.1066, + "step": 11292 + }, + { + "epoch": 2.116776007497657, + "grad_norm": 53538.92578125, + "learning_rate": 3.994798554038674e-05, + "loss": 2.1419, + "step": 11293 + }, + { + "epoch": 2.116963448922212, + "grad_norm": 57779.90625, + "learning_rate": 3.9940288189133155e-05, + "loss": 2.0846, + "step": 11294 + }, + { + "epoch": 2.1171508903467666, + "grad_norm": 55543.45703125, + "learning_rate": 3.993259108634142e-05, + "loss": 2.144, + "step": 11295 + }, + { + "epoch": 2.1173383317713212, + "grad_norm": 52386.35546875, + "learning_rate": 3.992489423220161e-05, + "loss": 2.194, + "step": 11296 + }, + { + "epoch": 2.1175257731958763, + "grad_norm": 52206.19921875, + "learning_rate": 3.991719762690386e-05, + "loss": 2.085, + "step": 11297 + }, + { + "epoch": 2.117713214620431, + "grad_norm": 53290.140625, + "learning_rate": 3.9909501270638266e-05, + "loss": 2.1395, + "step": 11298 + }, + { + "epoch": 2.117900656044986, + "grad_norm": 51582.5546875, + "learning_rate": 3.99018051635949e-05, + "loss": 2.1343, + "step": 11299 + }, + { + "epoch": 2.1180880974695406, + "grad_norm": 53519.203125, + "learning_rate": 3.989410930596383e-05, + "loss": 2.2043, + "step": 11300 + }, + { + "epoch": 2.1182755388940957, + "grad_norm": 50980.03515625, + "learning_rate": 3.988641369793518e-05, + "loss": 2.0929, + "step": 11301 + }, + { + "epoch": 2.1184629803186503, + "grad_norm": 50570.51171875, + "learning_rate": 3.9878718339699e-05, + "loss": 2.1643, + "step": 11302 + }, + { + "epoch": 2.1186504217432054, + "grad_norm": 54102.328125, + "learning_rate": 3.987102323144536e-05, + "loss": 2.1382, + "step": 11303 + }, + { + "epoch": 2.11883786316776, + "grad_norm": 56533.26953125, + "learning_rate": 3.986332837336431e-05, + "loss": 2.1622, + "step": 11304 + }, + { + "epoch": 2.119025304592315, + "grad_norm": 56367.5234375, + "learning_rate": 3.9855633765645883e-05, + "loss": 2.1514, + "step": 11305 + }, + { + "epoch": 2.1192127460168697, + "grad_norm": 54192.7109375, + "learning_rate": 3.984793940848019e-05, + "loss": 2.1724, + "step": 11306 + }, + { + "epoch": 2.1194001874414248, + "grad_norm": 53435.82421875, + "learning_rate": 3.98402453020572e-05, + "loss": 2.1427, + "step": 11307 + }, + { + "epoch": 2.1195876288659794, + "grad_norm": 54209.47265625, + "learning_rate": 3.9832551446566986e-05, + "loss": 2.1198, + "step": 11308 + }, + { + "epoch": 2.119775070290534, + "grad_norm": 52148.01171875, + "learning_rate": 3.982485784219958e-05, + "loss": 2.0954, + "step": 11309 + }, + { + "epoch": 2.119962511715089, + "grad_norm": 58228.48046875, + "learning_rate": 3.981716448914499e-05, + "loss": 2.1503, + "step": 11310 + }, + { + "epoch": 2.1201499531396437, + "grad_norm": 54365.66796875, + "learning_rate": 3.980947138759322e-05, + "loss": 2.1818, + "step": 11311 + }, + { + "epoch": 2.120337394564199, + "grad_norm": 55363.50390625, + "learning_rate": 3.980177853773431e-05, + "loss": 2.1376, + "step": 11312 + }, + { + "epoch": 2.1205248359887534, + "grad_norm": 54226.24609375, + "learning_rate": 3.979408593975825e-05, + "loss": 2.0058, + "step": 11313 + }, + { + "epoch": 2.1207122774133085, + "grad_norm": 48756.76171875, + "learning_rate": 3.9786393593855026e-05, + "loss": 2.1944, + "step": 11314 + }, + { + "epoch": 2.120899718837863, + "grad_norm": 50596.90625, + "learning_rate": 3.9778701500214625e-05, + "loss": 2.1436, + "step": 11315 + }, + { + "epoch": 2.121087160262418, + "grad_norm": 57000.421875, + "learning_rate": 3.977100965902706e-05, + "loss": 1.9927, + "step": 11316 + }, + { + "epoch": 2.121274601686973, + "grad_norm": 53913.4453125, + "learning_rate": 3.9763318070482305e-05, + "loss": 2.1442, + "step": 11317 + }, + { + "epoch": 2.121462043111528, + "grad_norm": 51521.83203125, + "learning_rate": 3.9755626734770304e-05, + "loss": 2.1232, + "step": 11318 + }, + { + "epoch": 2.1216494845360825, + "grad_norm": 54312.71875, + "learning_rate": 3.9747935652081045e-05, + "loss": 2.1343, + "step": 11319 + }, + { + "epoch": 2.121836925960637, + "grad_norm": 52766.80859375, + "learning_rate": 3.974024482260448e-05, + "loss": 2.1165, + "step": 11320 + }, + { + "epoch": 2.122024367385192, + "grad_norm": 55340.7109375, + "learning_rate": 3.973255424653059e-05, + "loss": 2.1645, + "step": 11321 + }, + { + "epoch": 2.122211808809747, + "grad_norm": 57060.8984375, + "learning_rate": 3.9724863924049264e-05, + "loss": 2.0074, + "step": 11322 + }, + { + "epoch": 2.122399250234302, + "grad_norm": 51071.640625, + "learning_rate": 3.971717385535049e-05, + "loss": 2.1447, + "step": 11323 + }, + { + "epoch": 2.1225866916588565, + "grad_norm": 50096.42578125, + "learning_rate": 3.9709484040624215e-05, + "loss": 2.0485, + "step": 11324 + }, + { + "epoch": 2.1227741330834116, + "grad_norm": 52086.2265625, + "learning_rate": 3.9701794480060304e-05, + "loss": 2.1782, + "step": 11325 + }, + { + "epoch": 2.122961574507966, + "grad_norm": 50591.39453125, + "learning_rate": 3.969410517384874e-05, + "loss": 2.1693, + "step": 11326 + }, + { + "epoch": 2.1231490159325213, + "grad_norm": 54552.70703125, + "learning_rate": 3.9686416122179404e-05, + "loss": 2.2244, + "step": 11327 + }, + { + "epoch": 2.123336457357076, + "grad_norm": 54344.22265625, + "learning_rate": 3.967872732524224e-05, + "loss": 2.0448, + "step": 11328 + }, + { + "epoch": 2.123523898781631, + "grad_norm": 56482.8203125, + "learning_rate": 3.967103878322711e-05, + "loss": 2.1919, + "step": 11329 + }, + { + "epoch": 2.1237113402061856, + "grad_norm": 53142.74609375, + "learning_rate": 3.966335049632392e-05, + "loss": 2.0875, + "step": 11330 + }, + { + "epoch": 2.12389878163074, + "grad_norm": 50891.140625, + "learning_rate": 3.965566246472258e-05, + "loss": 2.1252, + "step": 11331 + }, + { + "epoch": 2.1240862230552953, + "grad_norm": 52092.30859375, + "learning_rate": 3.9647974688612984e-05, + "loss": 2.1431, + "step": 11332 + }, + { + "epoch": 2.12427366447985, + "grad_norm": 51265.44921875, + "learning_rate": 3.9640287168184977e-05, + "loss": 2.1707, + "step": 11333 + }, + { + "epoch": 2.124461105904405, + "grad_norm": 53605.7265625, + "learning_rate": 3.9632599903628444e-05, + "loss": 2.2462, + "step": 11334 + }, + { + "epoch": 2.1246485473289596, + "grad_norm": 51886.19140625, + "learning_rate": 3.962491289513325e-05, + "loss": 2.1764, + "step": 11335 + }, + { + "epoch": 2.1248359887535146, + "grad_norm": 54429.9765625, + "learning_rate": 3.961722614288927e-05, + "loss": 2.1214, + "step": 11336 + }, + { + "epoch": 2.1250234301780693, + "grad_norm": 53301.33984375, + "learning_rate": 3.960953964708633e-05, + "loss": 2.1401, + "step": 11337 + }, + { + "epoch": 2.1252108716026243, + "grad_norm": 52027.421875, + "learning_rate": 3.960185340791428e-05, + "loss": 2.1431, + "step": 11338 + }, + { + "epoch": 2.125398313027179, + "grad_norm": 51977.875, + "learning_rate": 3.9594167425563e-05, + "loss": 2.1384, + "step": 11339 + }, + { + "epoch": 2.125585754451734, + "grad_norm": 56059.30078125, + "learning_rate": 3.958648170022227e-05, + "loss": 2.1205, + "step": 11340 + }, + { + "epoch": 2.1257731958762887, + "grad_norm": 50095.421875, + "learning_rate": 3.9578796232081936e-05, + "loss": 2.1112, + "step": 11341 + }, + { + "epoch": 2.1259606373008433, + "grad_norm": 56478.15625, + "learning_rate": 3.9571111021331835e-05, + "loss": 2.1241, + "step": 11342 + }, + { + "epoch": 2.1261480787253983, + "grad_norm": 54944.95703125, + "learning_rate": 3.956342606816178e-05, + "loss": 2.1946, + "step": 11343 + }, + { + "epoch": 2.126335520149953, + "grad_norm": 53658.7421875, + "learning_rate": 3.955574137276156e-05, + "loss": 2.0637, + "step": 11344 + }, + { + "epoch": 2.126522961574508, + "grad_norm": 52436.91796875, + "learning_rate": 3.954805693532097e-05, + "loss": 2.1946, + "step": 11345 + }, + { + "epoch": 2.1267104029990627, + "grad_norm": 53296.1171875, + "learning_rate": 3.954037275602984e-05, + "loss": 2.0447, + "step": 11346 + }, + { + "epoch": 2.1268978444236177, + "grad_norm": 58578.87890625, + "learning_rate": 3.953268883507795e-05, + "loss": 2.1316, + "step": 11347 + }, + { + "epoch": 2.1270852858481724, + "grad_norm": 54236.28125, + "learning_rate": 3.952500517265506e-05, + "loss": 2.094, + "step": 11348 + }, + { + "epoch": 2.1272727272727274, + "grad_norm": 59646.13671875, + "learning_rate": 3.951732176895096e-05, + "loss": 2.1504, + "step": 11349 + }, + { + "epoch": 2.127460168697282, + "grad_norm": 56322.62890625, + "learning_rate": 3.950963862415544e-05, + "loss": 2.1411, + "step": 11350 + }, + { + "epoch": 2.127647610121837, + "grad_norm": 55126.8203125, + "learning_rate": 3.950195573845823e-05, + "loss": 2.1515, + "step": 11351 + }, + { + "epoch": 2.1278350515463917, + "grad_norm": 53792.78515625, + "learning_rate": 3.94942731120491e-05, + "loss": 2.163, + "step": 11352 + }, + { + "epoch": 2.1280224929709464, + "grad_norm": 55046.91015625, + "learning_rate": 3.94865907451178e-05, + "loss": 2.075, + "step": 11353 + }, + { + "epoch": 2.1282099343955014, + "grad_norm": 53098.40625, + "learning_rate": 3.94789086378541e-05, + "loss": 2.1135, + "step": 11354 + }, + { + "epoch": 2.128397375820056, + "grad_norm": 58832.1484375, + "learning_rate": 3.947122679044769e-05, + "loss": 2.1298, + "step": 11355 + }, + { + "epoch": 2.128584817244611, + "grad_norm": 55893.1328125, + "learning_rate": 3.946354520308834e-05, + "loss": 2.1357, + "step": 11356 + }, + { + "epoch": 2.1287722586691658, + "grad_norm": 51902.06640625, + "learning_rate": 3.945586387596576e-05, + "loss": 2.1617, + "step": 11357 + }, + { + "epoch": 2.128959700093721, + "grad_norm": 57876.9765625, + "learning_rate": 3.944818280926969e-05, + "loss": 2.0941, + "step": 11358 + }, + { + "epoch": 2.1291471415182754, + "grad_norm": 51866.00390625, + "learning_rate": 3.94405020031898e-05, + "loss": 2.0797, + "step": 11359 + }, + { + "epoch": 2.1293345829428305, + "grad_norm": 54078.07421875, + "learning_rate": 3.9432821457915814e-05, + "loss": 2.1193, + "step": 11360 + }, + { + "epoch": 2.129522024367385, + "grad_norm": 50661.01171875, + "learning_rate": 3.9425141173637446e-05, + "loss": 2.1361, + "step": 11361 + }, + { + "epoch": 2.12970946579194, + "grad_norm": 58752.4765625, + "learning_rate": 3.94174611505444e-05, + "loss": 2.1712, + "step": 11362 + }, + { + "epoch": 2.129896907216495, + "grad_norm": 47802.87890625, + "learning_rate": 3.9409781388826325e-05, + "loss": 2.1725, + "step": 11363 + }, + { + "epoch": 2.1300843486410495, + "grad_norm": 54782.9921875, + "learning_rate": 3.940210188867292e-05, + "loss": 2.1679, + "step": 11364 + }, + { + "epoch": 2.1302717900656045, + "grad_norm": 51714.2890625, + "learning_rate": 3.939442265027387e-05, + "loss": 2.1473, + "step": 11365 + }, + { + "epoch": 2.130459231490159, + "grad_norm": 51914.94921875, + "learning_rate": 3.938674367381882e-05, + "loss": 2.0921, + "step": 11366 + }, + { + "epoch": 2.130646672914714, + "grad_norm": 56355.72265625, + "learning_rate": 3.937906495949744e-05, + "loss": 2.1935, + "step": 11367 + }, + { + "epoch": 2.130834114339269, + "grad_norm": 53547.04296875, + "learning_rate": 3.937138650749937e-05, + "loss": 2.116, + "step": 11368 + }, + { + "epoch": 2.131021555763824, + "grad_norm": 51156.6875, + "learning_rate": 3.936370831801431e-05, + "loss": 2.2526, + "step": 11369 + }, + { + "epoch": 2.1312089971883785, + "grad_norm": 50655.46484375, + "learning_rate": 3.9356030391231836e-05, + "loss": 2.1295, + "step": 11370 + }, + { + "epoch": 2.1313964386129336, + "grad_norm": 54875.79296875, + "learning_rate": 3.934835272734162e-05, + "loss": 2.1726, + "step": 11371 + }, + { + "epoch": 2.131583880037488, + "grad_norm": 54473.58203125, + "learning_rate": 3.934067532653328e-05, + "loss": 2.1826, + "step": 11372 + }, + { + "epoch": 2.1317713214620433, + "grad_norm": 59777.03125, + "learning_rate": 3.933299818899645e-05, + "loss": 2.1195, + "step": 11373 + }, + { + "epoch": 2.131958762886598, + "grad_norm": 49851.80859375, + "learning_rate": 3.932532131492073e-05, + "loss": 2.1649, + "step": 11374 + }, + { + "epoch": 2.1321462043111525, + "grad_norm": 52110.5078125, + "learning_rate": 3.931764470449572e-05, + "loss": 2.0689, + "step": 11375 + }, + { + "epoch": 2.1323336457357076, + "grad_norm": 56703.87890625, + "learning_rate": 3.930996835791103e-05, + "loss": 2.145, + "step": 11376 + }, + { + "epoch": 2.1325210871602622, + "grad_norm": 49788.11328125, + "learning_rate": 3.93022922753563e-05, + "loss": 2.147, + "step": 11377 + }, + { + "epoch": 2.1327085285848173, + "grad_norm": 58732.91015625, + "learning_rate": 3.9294616457021043e-05, + "loss": 2.0485, + "step": 11378 + }, + { + "epoch": 2.132895970009372, + "grad_norm": 58622.6875, + "learning_rate": 3.9286940903094895e-05, + "loss": 2.2197, + "step": 11379 + }, + { + "epoch": 2.133083411433927, + "grad_norm": 52564.01953125, + "learning_rate": 3.9279265613767425e-05, + "loss": 2.1153, + "step": 11380 + }, + { + "epoch": 2.1332708528584816, + "grad_norm": 55805.51953125, + "learning_rate": 3.927159058922819e-05, + "loss": 2.1143, + "step": 11381 + }, + { + "epoch": 2.1334582942830367, + "grad_norm": 58301.921875, + "learning_rate": 3.926391582966675e-05, + "loss": 2.1377, + "step": 11382 + }, + { + "epoch": 2.1336457357075913, + "grad_norm": 49050.37109375, + "learning_rate": 3.925624133527266e-05, + "loss": 2.1235, + "step": 11383 + }, + { + "epoch": 2.1338331771321464, + "grad_norm": 51169.171875, + "learning_rate": 3.9248567106235515e-05, + "loss": 2.1874, + "step": 11384 + }, + { + "epoch": 2.134020618556701, + "grad_norm": 57777.4765625, + "learning_rate": 3.9240893142744784e-05, + "loss": 2.1146, + "step": 11385 + }, + { + "epoch": 2.1342080599812556, + "grad_norm": 52645.9375, + "learning_rate": 3.923321944499006e-05, + "loss": 2.0979, + "step": 11386 + }, + { + "epoch": 2.1343955014058107, + "grad_norm": 55733.37109375, + "learning_rate": 3.922554601316085e-05, + "loss": 2.1212, + "step": 11387 + }, + { + "epoch": 2.1345829428303653, + "grad_norm": 51299.21484375, + "learning_rate": 3.9217872847446705e-05, + "loss": 2.1349, + "step": 11388 + }, + { + "epoch": 2.1347703842549204, + "grad_norm": 48474.76171875, + "learning_rate": 3.921019994803711e-05, + "loss": 2.1024, + "step": 11389 + }, + { + "epoch": 2.134957825679475, + "grad_norm": 55503.34375, + "learning_rate": 3.9202527315121576e-05, + "loss": 2.0888, + "step": 11390 + }, + { + "epoch": 2.13514526710403, + "grad_norm": 59061.6953125, + "learning_rate": 3.9194854948889615e-05, + "loss": 2.163, + "step": 11391 + }, + { + "epoch": 2.1353327085285847, + "grad_norm": 55653.203125, + "learning_rate": 3.918718284953076e-05, + "loss": 2.1478, + "step": 11392 + }, + { + "epoch": 2.1355201499531398, + "grad_norm": 58341.0078125, + "learning_rate": 3.917951101723444e-05, + "loss": 2.1132, + "step": 11393 + }, + { + "epoch": 2.1357075913776944, + "grad_norm": 54561.734375, + "learning_rate": 3.9171839452190175e-05, + "loss": 2.2049, + "step": 11394 + }, + { + "epoch": 2.1358950328022495, + "grad_norm": 51726.9296875, + "learning_rate": 3.916416815458744e-05, + "loss": 2.1101, + "step": 11395 + }, + { + "epoch": 2.136082474226804, + "grad_norm": 49747.39453125, + "learning_rate": 3.915649712461571e-05, + "loss": 2.1826, + "step": 11396 + }, + { + "epoch": 2.136269915651359, + "grad_norm": 58944.48046875, + "learning_rate": 3.914882636246443e-05, + "loss": 2.2097, + "step": 11397 + }, + { + "epoch": 2.1364573570759138, + "grad_norm": 49455.28515625, + "learning_rate": 3.914115586832306e-05, + "loss": 2.2962, + "step": 11398 + }, + { + "epoch": 2.1366447985004684, + "grad_norm": 51752.3046875, + "learning_rate": 3.913348564238109e-05, + "loss": 2.1607, + "step": 11399 + }, + { + "epoch": 2.1368322399250235, + "grad_norm": 50529.6953125, + "learning_rate": 3.912581568482792e-05, + "loss": 2.142, + "step": 11400 + }, + { + "epoch": 2.137019681349578, + "grad_norm": 52541.46484375, + "learning_rate": 3.9118145995853e-05, + "loss": 2.1774, + "step": 11401 + }, + { + "epoch": 2.137207122774133, + "grad_norm": 54793.859375, + "learning_rate": 3.911047657564577e-05, + "loss": 2.1302, + "step": 11402 + }, + { + "epoch": 2.137394564198688, + "grad_norm": 59490.95703125, + "learning_rate": 3.910280742439566e-05, + "loss": 2.0977, + "step": 11403 + }, + { + "epoch": 2.137582005623243, + "grad_norm": 57610.11328125, + "learning_rate": 3.9095138542292063e-05, + "loss": 2.2314, + "step": 11404 + }, + { + "epoch": 2.1377694470477975, + "grad_norm": 53902.6484375, + "learning_rate": 3.908746992952441e-05, + "loss": 2.1389, + "step": 11405 + }, + { + "epoch": 2.1379568884723525, + "grad_norm": 55037.97265625, + "learning_rate": 3.9079801586282116e-05, + "loss": 2.0922, + "step": 11406 + }, + { + "epoch": 2.138144329896907, + "grad_norm": 50719.7109375, + "learning_rate": 3.907213351275455e-05, + "loss": 2.0987, + "step": 11407 + }, + { + "epoch": 2.1383317713214622, + "grad_norm": 54746.421875, + "learning_rate": 3.906446570913111e-05, + "loss": 2.1658, + "step": 11408 + }, + { + "epoch": 2.138519212746017, + "grad_norm": 52674.203125, + "learning_rate": 3.9056798175601195e-05, + "loss": 2.1086, + "step": 11409 + }, + { + "epoch": 2.138706654170572, + "grad_norm": 51296.41015625, + "learning_rate": 3.904913091235419e-05, + "loss": 2.129, + "step": 11410 + }, + { + "epoch": 2.1388940955951266, + "grad_norm": 50615.06640625, + "learning_rate": 3.904146391957944e-05, + "loss": 2.125, + "step": 11411 + }, + { + "epoch": 2.139081537019681, + "grad_norm": 54063.37109375, + "learning_rate": 3.903379719746633e-05, + "loss": 2.1778, + "step": 11412 + }, + { + "epoch": 2.1392689784442362, + "grad_norm": 50936.9765625, + "learning_rate": 3.902613074620419e-05, + "loss": 2.1369, + "step": 11413 + }, + { + "epoch": 2.139456419868791, + "grad_norm": 51166.67578125, + "learning_rate": 3.9018464565982436e-05, + "loss": 2.1444, + "step": 11414 + }, + { + "epoch": 2.139643861293346, + "grad_norm": 52675.58203125, + "learning_rate": 3.9010798656990335e-05, + "loss": 2.113, + "step": 11415 + }, + { + "epoch": 2.1398313027179006, + "grad_norm": 54560.6328125, + "learning_rate": 3.9003133019417264e-05, + "loss": 2.0581, + "step": 11416 + }, + { + "epoch": 2.1400187441424556, + "grad_norm": 52320.88671875, + "learning_rate": 3.899546765345256e-05, + "loss": 2.1624, + "step": 11417 + }, + { + "epoch": 2.1402061855670103, + "grad_norm": 56181.4375, + "learning_rate": 3.898780255928553e-05, + "loss": 2.0598, + "step": 11418 + }, + { + "epoch": 2.1403936269915653, + "grad_norm": 49452.17578125, + "learning_rate": 3.8980137737105504e-05, + "loss": 2.1667, + "step": 11419 + }, + { + "epoch": 2.14058106841612, + "grad_norm": 50985.984375, + "learning_rate": 3.8972473187101785e-05, + "loss": 2.1388, + "step": 11420 + }, + { + "epoch": 2.140768509840675, + "grad_norm": 48917.7109375, + "learning_rate": 3.89648089094637e-05, + "loss": 2.1754, + "step": 11421 + }, + { + "epoch": 2.1409559512652296, + "grad_norm": 51661.31640625, + "learning_rate": 3.895714490438051e-05, + "loss": 2.1438, + "step": 11422 + }, + { + "epoch": 2.1411433926897843, + "grad_norm": 51951.2109375, + "learning_rate": 3.8949481172041504e-05, + "loss": 2.1878, + "step": 11423 + }, + { + "epoch": 2.1413308341143393, + "grad_norm": 53692.24609375, + "learning_rate": 3.8941817712636005e-05, + "loss": 2.1304, + "step": 11424 + }, + { + "epoch": 2.141518275538894, + "grad_norm": 53217.29296875, + "learning_rate": 3.893415452635328e-05, + "loss": 2.1566, + "step": 11425 + }, + { + "epoch": 2.141705716963449, + "grad_norm": 49512.7265625, + "learning_rate": 3.892649161338258e-05, + "loss": 2.1381, + "step": 11426 + }, + { + "epoch": 2.1418931583880036, + "grad_norm": 51873.796875, + "learning_rate": 3.891882897391318e-05, + "loss": 2.2374, + "step": 11427 + }, + { + "epoch": 2.1420805998125587, + "grad_norm": 55503.79296875, + "learning_rate": 3.891116660813434e-05, + "loss": 2.1069, + "step": 11428 + }, + { + "epoch": 2.1422680412371133, + "grad_norm": 50994.62890625, + "learning_rate": 3.890350451623532e-05, + "loss": 2.2054, + "step": 11429 + }, + { + "epoch": 2.1424554826616684, + "grad_norm": 53854.36328125, + "learning_rate": 3.889584269840533e-05, + "loss": 2.124, + "step": 11430 + }, + { + "epoch": 2.142642924086223, + "grad_norm": 53716.23828125, + "learning_rate": 3.888818115483363e-05, + "loss": 2.1482, + "step": 11431 + }, + { + "epoch": 2.142830365510778, + "grad_norm": 55113.16015625, + "learning_rate": 3.8880519885709454e-05, + "loss": 2.1586, + "step": 11432 + }, + { + "epoch": 2.1430178069353327, + "grad_norm": 52221.5, + "learning_rate": 3.887285889122203e-05, + "loss": 2.164, + "step": 11433 + }, + { + "epoch": 2.1432052483598873, + "grad_norm": 56953.0234375, + "learning_rate": 3.886519817156056e-05, + "loss": 2.2109, + "step": 11434 + }, + { + "epoch": 2.1433926897844424, + "grad_norm": 55981.43359375, + "learning_rate": 3.8857537726914256e-05, + "loss": 2.1323, + "step": 11435 + }, + { + "epoch": 2.143580131208997, + "grad_norm": 51175.25, + "learning_rate": 3.8849877557472335e-05, + "loss": 2.2134, + "step": 11436 + }, + { + "epoch": 2.143767572633552, + "grad_norm": 56710.25390625, + "learning_rate": 3.8842217663423965e-05, + "loss": 2.1094, + "step": 11437 + }, + { + "epoch": 2.1439550140581067, + "grad_norm": 53409.1953125, + "learning_rate": 3.883455804495835e-05, + "loss": 2.1711, + "step": 11438 + }, + { + "epoch": 2.144142455482662, + "grad_norm": 53484.640625, + "learning_rate": 3.882689870226468e-05, + "loss": 2.1317, + "step": 11439 + }, + { + "epoch": 2.1443298969072164, + "grad_norm": 56065.03125, + "learning_rate": 3.8819239635532137e-05, + "loss": 2.1437, + "step": 11440 + }, + { + "epoch": 2.1445173383317715, + "grad_norm": 53651.2265625, + "learning_rate": 3.8811580844949875e-05, + "loss": 2.1919, + "step": 11441 + }, + { + "epoch": 2.144704779756326, + "grad_norm": 55109.046875, + "learning_rate": 3.8803922330707056e-05, + "loss": 2.0733, + "step": 11442 + }, + { + "epoch": 2.144892221180881, + "grad_norm": 53714.18359375, + "learning_rate": 3.879626409299284e-05, + "loss": 2.1074, + "step": 11443 + }, + { + "epoch": 2.145079662605436, + "grad_norm": 53412.0078125, + "learning_rate": 3.878860613199638e-05, + "loss": 2.1201, + "step": 11444 + }, + { + "epoch": 2.1452671040299904, + "grad_norm": 53753.25, + "learning_rate": 3.8780948447906806e-05, + "loss": 2.0923, + "step": 11445 + }, + { + "epoch": 2.1454545454545455, + "grad_norm": 59334.73046875, + "learning_rate": 3.877329104091325e-05, + "loss": 2.1228, + "step": 11446 + }, + { + "epoch": 2.1456419868791, + "grad_norm": 55684.96484375, + "learning_rate": 3.8765633911204886e-05, + "loss": 2.108, + "step": 11447 + }, + { + "epoch": 2.145829428303655, + "grad_norm": 58159.88671875, + "learning_rate": 3.875797705897076e-05, + "loss": 2.0919, + "step": 11448 + }, + { + "epoch": 2.14601686972821, + "grad_norm": 55271.15234375, + "learning_rate": 3.875032048440004e-05, + "loss": 2.18, + "step": 11449 + }, + { + "epoch": 2.146204311152765, + "grad_norm": 56665.8359375, + "learning_rate": 3.874266418768181e-05, + "loss": 2.198, + "step": 11450 + }, + { + "epoch": 2.1463917525773195, + "grad_norm": 53134.375, + "learning_rate": 3.87350081690052e-05, + "loss": 2.1608, + "step": 11451 + }, + { + "epoch": 2.1465791940018746, + "grad_norm": 55890.640625, + "learning_rate": 3.8727352428559274e-05, + "loss": 2.1626, + "step": 11452 + }, + { + "epoch": 2.146766635426429, + "grad_norm": 54743.46484375, + "learning_rate": 3.8719696966533105e-05, + "loss": 2.1826, + "step": 11453 + }, + { + "epoch": 2.1469540768509843, + "grad_norm": 51702.18359375, + "learning_rate": 3.871204178311582e-05, + "loss": 2.1652, + "step": 11454 + }, + { + "epoch": 2.147141518275539, + "grad_norm": 54691.453125, + "learning_rate": 3.870438687849647e-05, + "loss": 2.0466, + "step": 11455 + }, + { + "epoch": 2.1473289597000935, + "grad_norm": 56586.4375, + "learning_rate": 3.869673225286411e-05, + "loss": 2.0941, + "step": 11456 + }, + { + "epoch": 2.1475164011246486, + "grad_norm": 49834.6171875, + "learning_rate": 3.868907790640781e-05, + "loss": 2.1441, + "step": 11457 + }, + { + "epoch": 2.147703842549203, + "grad_norm": 51159.4453125, + "learning_rate": 3.8681423839316614e-05, + "loss": 2.1604, + "step": 11458 + }, + { + "epoch": 2.1478912839737583, + "grad_norm": 56221.33203125, + "learning_rate": 3.867377005177959e-05, + "loss": 2.052, + "step": 11459 + }, + { + "epoch": 2.148078725398313, + "grad_norm": 53839.42578125, + "learning_rate": 3.866611654398575e-05, + "loss": 2.1134, + "step": 11460 + }, + { + "epoch": 2.148266166822868, + "grad_norm": 54426.0234375, + "learning_rate": 3.865846331612413e-05, + "loss": 2.1726, + "step": 11461 + }, + { + "epoch": 2.1484536082474226, + "grad_norm": 52258.15625, + "learning_rate": 3.86508103683838e-05, + "loss": 2.1256, + "step": 11462 + }, + { + "epoch": 2.1486410496719777, + "grad_norm": 55375.109375, + "learning_rate": 3.864315770095369e-05, + "loss": 2.1173, + "step": 11463 + }, + { + "epoch": 2.1488284910965323, + "grad_norm": 53361.546875, + "learning_rate": 3.863550531402288e-05, + "loss": 2.1775, + "step": 11464 + }, + { + "epoch": 2.1490159325210874, + "grad_norm": 54314.3125, + "learning_rate": 3.862785320778035e-05, + "loss": 2.0623, + "step": 11465 + }, + { + "epoch": 2.149203373945642, + "grad_norm": 48200.5078125, + "learning_rate": 3.862020138241512e-05, + "loss": 2.1228, + "step": 11466 + }, + { + "epoch": 2.1493908153701966, + "grad_norm": 50022.51171875, + "learning_rate": 3.861254983811615e-05, + "loss": 2.1427, + "step": 11467 + }, + { + "epoch": 2.1495782567947517, + "grad_norm": 53249.99609375, + "learning_rate": 3.860489857507243e-05, + "loss": 2.1812, + "step": 11468 + }, + { + "epoch": 2.1497656982193063, + "grad_norm": 53491.62890625, + "learning_rate": 3.859724759347293e-05, + "loss": 2.2139, + "step": 11469 + }, + { + "epoch": 2.1499531396438614, + "grad_norm": 55569.3828125, + "learning_rate": 3.858959689350666e-05, + "loss": 2.1165, + "step": 11470 + }, + { + "epoch": 2.150140581068416, + "grad_norm": 57137.78125, + "learning_rate": 3.858194647536253e-05, + "loss": 2.0949, + "step": 11471 + }, + { + "epoch": 2.150328022492971, + "grad_norm": 57359.7421875, + "learning_rate": 3.857429633922953e-05, + "loss": 2.1661, + "step": 11472 + }, + { + "epoch": 2.1505154639175257, + "grad_norm": 54401.640625, + "learning_rate": 3.856664648529659e-05, + "loss": 2.0927, + "step": 11473 + }, + { + "epoch": 2.1507029053420808, + "grad_norm": 50631.0625, + "learning_rate": 3.8558996913752666e-05, + "loss": 2.1474, + "step": 11474 + }, + { + "epoch": 2.1508903467666354, + "grad_norm": 49884.703125, + "learning_rate": 3.855134762478668e-05, + "loss": 2.1283, + "step": 11475 + }, + { + "epoch": 2.1510777881911904, + "grad_norm": 53100.1640625, + "learning_rate": 3.8543698618587554e-05, + "loss": 2.1174, + "step": 11476 + }, + { + "epoch": 2.151265229615745, + "grad_norm": 51659.734375, + "learning_rate": 3.853604989534425e-05, + "loss": 2.1991, + "step": 11477 + }, + { + "epoch": 2.1514526710402997, + "grad_norm": 53484.63671875, + "learning_rate": 3.8528401455245614e-05, + "loss": 2.1368, + "step": 11478 + }, + { + "epoch": 2.1516401124648548, + "grad_norm": 58724.59765625, + "learning_rate": 3.8520753298480604e-05, + "loss": 2.1543, + "step": 11479 + }, + { + "epoch": 2.1518275538894094, + "grad_norm": 55220.203125, + "learning_rate": 3.851310542523811e-05, + "loss": 2.1594, + "step": 11480 + }, + { + "epoch": 2.1520149953139645, + "grad_norm": 55335.15234375, + "learning_rate": 3.850545783570702e-05, + "loss": 2.1426, + "step": 11481 + }, + { + "epoch": 2.152202436738519, + "grad_norm": 58897.53515625, + "learning_rate": 3.849781053007621e-05, + "loss": 2.1332, + "step": 11482 + }, + { + "epoch": 2.152389878163074, + "grad_norm": 63009.94140625, + "learning_rate": 3.849016350853457e-05, + "loss": 2.2453, + "step": 11483 + }, + { + "epoch": 2.1525773195876288, + "grad_norm": 50445.125, + "learning_rate": 3.848251677127095e-05, + "loss": 2.1438, + "step": 11484 + }, + { + "epoch": 2.152764761012184, + "grad_norm": 58823.9921875, + "learning_rate": 3.8474870318474276e-05, + "loss": 2.0971, + "step": 11485 + }, + { + "epoch": 2.1529522024367385, + "grad_norm": 55510.0546875, + "learning_rate": 3.846722415033333e-05, + "loss": 2.1064, + "step": 11486 + }, + { + "epoch": 2.1531396438612935, + "grad_norm": 55254.45703125, + "learning_rate": 3.8459578267037e-05, + "loss": 2.1847, + "step": 11487 + }, + { + "epoch": 2.153327085285848, + "grad_norm": 56400.88671875, + "learning_rate": 3.845193266877412e-05, + "loss": 2.1249, + "step": 11488 + }, + { + "epoch": 2.1535145267104028, + "grad_norm": 57009.49609375, + "learning_rate": 3.844428735573355e-05, + "loss": 2.1009, + "step": 11489 + }, + { + "epoch": 2.153701968134958, + "grad_norm": 57049.23046875, + "learning_rate": 3.8436642328104086e-05, + "loss": 2.0243, + "step": 11490 + }, + { + "epoch": 2.1538894095595125, + "grad_norm": 59219.91796875, + "learning_rate": 3.842899758607456e-05, + "loss": 2.0539, + "step": 11491 + }, + { + "epoch": 2.1540768509840675, + "grad_norm": 55108.54296875, + "learning_rate": 3.8421353129833816e-05, + "loss": 2.1104, + "step": 11492 + }, + { + "epoch": 2.154264292408622, + "grad_norm": 49637.8203125, + "learning_rate": 3.84137089595706e-05, + "loss": 2.0864, + "step": 11493 + }, + { + "epoch": 2.1544517338331772, + "grad_norm": 52027.484375, + "learning_rate": 3.8406065075473775e-05, + "loss": 2.0785, + "step": 11494 + }, + { + "epoch": 2.154639175257732, + "grad_norm": 53735.4765625, + "learning_rate": 3.8398421477732104e-05, + "loss": 2.1638, + "step": 11495 + }, + { + "epoch": 2.154826616682287, + "grad_norm": 54147.84765625, + "learning_rate": 3.8390778166534394e-05, + "loss": 2.1752, + "step": 11496 + }, + { + "epoch": 2.1550140581068415, + "grad_norm": 54133.5390625, + "learning_rate": 3.8383135142069405e-05, + "loss": 2.1529, + "step": 11497 + }, + { + "epoch": 2.1552014995313966, + "grad_norm": 54488.61328125, + "learning_rate": 3.837549240452591e-05, + "loss": 2.152, + "step": 11498 + }, + { + "epoch": 2.1553889409559512, + "grad_norm": 53485.2109375, + "learning_rate": 3.836784995409267e-05, + "loss": 2.111, + "step": 11499 + }, + { + "epoch": 2.155576382380506, + "grad_norm": 57659.0859375, + "learning_rate": 3.836020779095849e-05, + "loss": 2.2052, + "step": 11500 + }, + { + "epoch": 2.155576382380506, + "eval_loss": 2.280621290206909, + "eval_runtime": 132.2265, + "eval_samples_per_second": 38.184, + "eval_steps_per_second": 1.913, + "step": 11500 + }, + { + "epoch": 2.155763823805061, + "grad_norm": 55975.76953125, + "learning_rate": 3.835256591531205e-05, + "loss": 2.1335, + "step": 11501 + }, + { + "epoch": 2.1559512652296156, + "grad_norm": 48716.3125, + "learning_rate": 3.834492432734215e-05, + "loss": 2.1956, + "step": 11502 + }, + { + "epoch": 2.1561387066541706, + "grad_norm": 52955.64453125, + "learning_rate": 3.8337283027237506e-05, + "loss": 2.14, + "step": 11503 + }, + { + "epoch": 2.1563261480787252, + "grad_norm": 51525.9375, + "learning_rate": 3.832964201518685e-05, + "loss": 2.1508, + "step": 11504 + }, + { + "epoch": 2.1565135895032803, + "grad_norm": 53498.22265625, + "learning_rate": 3.83220012913789e-05, + "loss": 2.0684, + "step": 11505 + }, + { + "epoch": 2.156701030927835, + "grad_norm": 54637.13671875, + "learning_rate": 3.831436085600237e-05, + "loss": 2.181, + "step": 11506 + }, + { + "epoch": 2.15688847235239, + "grad_norm": 50971.94140625, + "learning_rate": 3.830672070924601e-05, + "loss": 2.171, + "step": 11507 + }, + { + "epoch": 2.1570759137769446, + "grad_norm": 54587.5, + "learning_rate": 3.8299080851298455e-05, + "loss": 2.1045, + "step": 11508 + }, + { + "epoch": 2.1572633552014997, + "grad_norm": 53392.8046875, + "learning_rate": 3.829144128234844e-05, + "loss": 2.1662, + "step": 11509 + }, + { + "epoch": 2.1574507966260543, + "grad_norm": 55693.49609375, + "learning_rate": 3.828380200258464e-05, + "loss": 2.1149, + "step": 11510 + }, + { + "epoch": 2.1576382380506094, + "grad_norm": 53516.078125, + "learning_rate": 3.827616301219577e-05, + "loss": 2.1345, + "step": 11511 + }, + { + "epoch": 2.157825679475164, + "grad_norm": 52325.015625, + "learning_rate": 3.826852431137044e-05, + "loss": 2.1492, + "step": 11512 + }, + { + "epoch": 2.1580131208997186, + "grad_norm": 51250.57421875, + "learning_rate": 3.826088590029736e-05, + "loss": 2.1593, + "step": 11513 + }, + { + "epoch": 2.1582005623242737, + "grad_norm": 50737.3984375, + "learning_rate": 3.825324777916517e-05, + "loss": 2.1554, + "step": 11514 + }, + { + "epoch": 2.1583880037488283, + "grad_norm": 51474.1171875, + "learning_rate": 3.824560994816256e-05, + "loss": 2.1492, + "step": 11515 + }, + { + "epoch": 2.1585754451733834, + "grad_norm": 53651.38671875, + "learning_rate": 3.823797240747811e-05, + "loss": 2.1175, + "step": 11516 + }, + { + "epoch": 2.158762886597938, + "grad_norm": 54288.1640625, + "learning_rate": 3.82303351573005e-05, + "loss": 2.0902, + "step": 11517 + }, + { + "epoch": 2.158950328022493, + "grad_norm": 51081.4609375, + "learning_rate": 3.822269819781837e-05, + "loss": 2.1786, + "step": 11518 + }, + { + "epoch": 2.1591377694470477, + "grad_norm": 57818.21875, + "learning_rate": 3.821506152922031e-05, + "loss": 2.1677, + "step": 11519 + }, + { + "epoch": 2.159325210871603, + "grad_norm": 53318.76953125, + "learning_rate": 3.820742515169495e-05, + "loss": 2.1336, + "step": 11520 + }, + { + "epoch": 2.1595126522961574, + "grad_norm": 49636.6640625, + "learning_rate": 3.81997890654309e-05, + "loss": 2.1161, + "step": 11521 + }, + { + "epoch": 2.1597000937207125, + "grad_norm": 53905.97265625, + "learning_rate": 3.8192153270616776e-05, + "loss": 2.1361, + "step": 11522 + }, + { + "epoch": 2.159887535145267, + "grad_norm": 48871.66796875, + "learning_rate": 3.8184517767441144e-05, + "loss": 2.1853, + "step": 11523 + }, + { + "epoch": 2.1600749765698217, + "grad_norm": 55431.2578125, + "learning_rate": 3.8176882556092594e-05, + "loss": 2.1162, + "step": 11524 + }, + { + "epoch": 2.160262417994377, + "grad_norm": 54090.14453125, + "learning_rate": 3.8169247636759724e-05, + "loss": 2.1303, + "step": 11525 + }, + { + "epoch": 2.1604498594189314, + "grad_norm": 58206.25390625, + "learning_rate": 3.816161300963111e-05, + "loss": 2.2066, + "step": 11526 + }, + { + "epoch": 2.1606373008434865, + "grad_norm": 52456.69140625, + "learning_rate": 3.8153978674895294e-05, + "loss": 2.1349, + "step": 11527 + }, + { + "epoch": 2.160824742268041, + "grad_norm": 54005.16796875, + "learning_rate": 3.814634463274085e-05, + "loss": 2.1401, + "step": 11528 + }, + { + "epoch": 2.161012183692596, + "grad_norm": 61435.90625, + "learning_rate": 3.813871088335631e-05, + "loss": 2.1195, + "step": 11529 + }, + { + "epoch": 2.161199625117151, + "grad_norm": 53960.1328125, + "learning_rate": 3.8131077426930276e-05, + "loss": 2.1265, + "step": 11530 + }, + { + "epoch": 2.161387066541706, + "grad_norm": 58289.01953125, + "learning_rate": 3.812344426365119e-05, + "loss": 2.2279, + "step": 11531 + }, + { + "epoch": 2.1615745079662605, + "grad_norm": 59225.484375, + "learning_rate": 3.811581139370766e-05, + "loss": 2.0574, + "step": 11532 + }, + { + "epoch": 2.1617619493908156, + "grad_norm": 53275.26171875, + "learning_rate": 3.810817881728818e-05, + "loss": 2.1121, + "step": 11533 + }, + { + "epoch": 2.16194939081537, + "grad_norm": 51803.87109375, + "learning_rate": 3.810054653458126e-05, + "loss": 2.1225, + "step": 11534 + }, + { + "epoch": 2.1621368322399253, + "grad_norm": 53223.95703125, + "learning_rate": 3.8092914545775414e-05, + "loss": 2.169, + "step": 11535 + }, + { + "epoch": 2.16232427366448, + "grad_norm": 55339.609375, + "learning_rate": 3.808528285105914e-05, + "loss": 2.12, + "step": 11536 + }, + { + "epoch": 2.1625117150890345, + "grad_norm": 53331.765625, + "learning_rate": 3.807765145062094e-05, + "loss": 2.1309, + "step": 11537 + }, + { + "epoch": 2.1626991565135896, + "grad_norm": 54506.7578125, + "learning_rate": 3.8070020344649284e-05, + "loss": 2.1395, + "step": 11538 + }, + { + "epoch": 2.162886597938144, + "grad_norm": 55188.16796875, + "learning_rate": 3.806238953333264e-05, + "loss": 2.167, + "step": 11539 + }, + { + "epoch": 2.1630740393626993, + "grad_norm": 53787.28515625, + "learning_rate": 3.805475901685952e-05, + "loss": 2.1124, + "step": 11540 + }, + { + "epoch": 2.163261480787254, + "grad_norm": 56243.625, + "learning_rate": 3.804712879541837e-05, + "loss": 2.1605, + "step": 11541 + }, + { + "epoch": 2.163448922211809, + "grad_norm": 50307.8046875, + "learning_rate": 3.803949886919763e-05, + "loss": 2.1277, + "step": 11542 + }, + { + "epoch": 2.1636363636363636, + "grad_norm": 47990.2109375, + "learning_rate": 3.803186923838576e-05, + "loss": 2.2029, + "step": 11543 + }, + { + "epoch": 2.1638238050609186, + "grad_norm": 49119.109375, + "learning_rate": 3.8024239903171216e-05, + "loss": 2.1007, + "step": 11544 + }, + { + "epoch": 2.1640112464854733, + "grad_norm": 55553.6640625, + "learning_rate": 3.8016610863742405e-05, + "loss": 2.1893, + "step": 11545 + }, + { + "epoch": 2.1641986879100283, + "grad_norm": 52156.24609375, + "learning_rate": 3.8008982120287764e-05, + "loss": 2.1384, + "step": 11546 + }, + { + "epoch": 2.164386129334583, + "grad_norm": 58572.3828125, + "learning_rate": 3.800135367299573e-05, + "loss": 2.0976, + "step": 11547 + }, + { + "epoch": 2.1645735707591376, + "grad_norm": 55189.3203125, + "learning_rate": 3.799372552205471e-05, + "loss": 2.0847, + "step": 11548 + }, + { + "epoch": 2.1647610121836927, + "grad_norm": 49346.13671875, + "learning_rate": 3.7986097667653084e-05, + "loss": 2.1374, + "step": 11549 + }, + { + "epoch": 2.1649484536082473, + "grad_norm": 49337.8828125, + "learning_rate": 3.797847010997928e-05, + "loss": 2.1092, + "step": 11550 + }, + { + "epoch": 2.1651358950328023, + "grad_norm": 54484.4765625, + "learning_rate": 3.7970842849221674e-05, + "loss": 2.1559, + "step": 11551 + }, + { + "epoch": 2.165323336457357, + "grad_norm": 52821.515625, + "learning_rate": 3.796321588556867e-05, + "loss": 2.0266, + "step": 11552 + }, + { + "epoch": 2.165510777881912, + "grad_norm": 60872.28515625, + "learning_rate": 3.795558921920861e-05, + "loss": 2.1338, + "step": 11553 + }, + { + "epoch": 2.1656982193064667, + "grad_norm": 54074.61328125, + "learning_rate": 3.7947962850329877e-05, + "loss": 2.1918, + "step": 11554 + }, + { + "epoch": 2.1658856607310217, + "grad_norm": 52555.23828125, + "learning_rate": 3.7940336779120846e-05, + "loss": 2.1928, + "step": 11555 + }, + { + "epoch": 2.1660731021555764, + "grad_norm": 56234.9453125, + "learning_rate": 3.7932711005769874e-05, + "loss": 2.137, + "step": 11556 + }, + { + "epoch": 2.1662605435801314, + "grad_norm": 54298.64453125, + "learning_rate": 3.792508553046528e-05, + "loss": 2.1553, + "step": 11557 + }, + { + "epoch": 2.166447985004686, + "grad_norm": 54068.8828125, + "learning_rate": 3.7917460353395415e-05, + "loss": 2.1261, + "step": 11558 + }, + { + "epoch": 2.1666354264292407, + "grad_norm": 53319.515625, + "learning_rate": 3.790983547474862e-05, + "loss": 2.2573, + "step": 11559 + }, + { + "epoch": 2.1668228678537957, + "grad_norm": 48864.32421875, + "learning_rate": 3.790221089471321e-05, + "loss": 2.1094, + "step": 11560 + }, + { + "epoch": 2.1670103092783504, + "grad_norm": 51516.81640625, + "learning_rate": 3.7894586613477506e-05, + "loss": 2.1806, + "step": 11561 + }, + { + "epoch": 2.1671977507029054, + "grad_norm": 50975.66796875, + "learning_rate": 3.7886962631229804e-05, + "loss": 2.1651, + "step": 11562 + }, + { + "epoch": 2.16738519212746, + "grad_norm": 49260.15234375, + "learning_rate": 3.787933894815846e-05, + "loss": 2.1729, + "step": 11563 + }, + { + "epoch": 2.167572633552015, + "grad_norm": 55836.21484375, + "learning_rate": 3.787171556445169e-05, + "loss": 2.2098, + "step": 11564 + }, + { + "epoch": 2.1677600749765698, + "grad_norm": 54799.16796875, + "learning_rate": 3.786409248029783e-05, + "loss": 2.1369, + "step": 11565 + }, + { + "epoch": 2.167947516401125, + "grad_norm": 60693.3046875, + "learning_rate": 3.7856469695885164e-05, + "loss": 2.1442, + "step": 11566 + }, + { + "epoch": 2.1681349578256794, + "grad_norm": 56242.828125, + "learning_rate": 3.784884721140195e-05, + "loss": 2.1286, + "step": 11567 + }, + { + "epoch": 2.1683223992502345, + "grad_norm": 57521.9921875, + "learning_rate": 3.7841225027036446e-05, + "loss": 2.1522, + "step": 11568 + }, + { + "epoch": 2.168509840674789, + "grad_norm": 56918.91015625, + "learning_rate": 3.7833603142976916e-05, + "loss": 2.1566, + "step": 11569 + }, + { + "epoch": 2.1686972820993438, + "grad_norm": 52218.046875, + "learning_rate": 3.7825981559411615e-05, + "loss": 2.181, + "step": 11570 + }, + { + "epoch": 2.168884723523899, + "grad_norm": 50336.171875, + "learning_rate": 3.781836027652881e-05, + "loss": 2.0899, + "step": 11571 + }, + { + "epoch": 2.1690721649484535, + "grad_norm": 52670.2265625, + "learning_rate": 3.781073929451669e-05, + "loss": 2.1156, + "step": 11572 + }, + { + "epoch": 2.1692596063730085, + "grad_norm": 53129.8359375, + "learning_rate": 3.780311861356351e-05, + "loss": 2.1181, + "step": 11573 + }, + { + "epoch": 2.169447047797563, + "grad_norm": 57675.79296875, + "learning_rate": 3.77954982338575e-05, + "loss": 2.1277, + "step": 11574 + }, + { + "epoch": 2.169634489222118, + "grad_norm": 51749.67578125, + "learning_rate": 3.7787878155586845e-05, + "loss": 2.1301, + "step": 11575 + }, + { + "epoch": 2.169821930646673, + "grad_norm": 50591.67578125, + "learning_rate": 3.778025837893977e-05, + "loss": 2.1192, + "step": 11576 + }, + { + "epoch": 2.170009372071228, + "grad_norm": 46468.73828125, + "learning_rate": 3.777263890410445e-05, + "loss": 2.1628, + "step": 11577 + }, + { + "epoch": 2.1701968134957825, + "grad_norm": 57453.4765625, + "learning_rate": 3.7765019731269145e-05, + "loss": 2.186, + "step": 11578 + }, + { + "epoch": 2.1703842549203376, + "grad_norm": 60035.4921875, + "learning_rate": 3.775740086062194e-05, + "loss": 2.2559, + "step": 11579 + }, + { + "epoch": 2.170571696344892, + "grad_norm": 48467.2265625, + "learning_rate": 3.7749782292351084e-05, + "loss": 2.0973, + "step": 11580 + }, + { + "epoch": 2.170759137769447, + "grad_norm": 54526.015625, + "learning_rate": 3.7742164026644716e-05, + "loss": 2.1674, + "step": 11581 + }, + { + "epoch": 2.170946579194002, + "grad_norm": 58205.5859375, + "learning_rate": 3.773454606369102e-05, + "loss": 2.0905, + "step": 11582 + }, + { + "epoch": 2.1711340206185565, + "grad_norm": 58336.453125, + "learning_rate": 3.772692840367812e-05, + "loss": 2.1072, + "step": 11583 + }, + { + "epoch": 2.1713214620431116, + "grad_norm": 54657.453125, + "learning_rate": 3.7719311046794166e-05, + "loss": 2.0267, + "step": 11584 + }, + { + "epoch": 2.1715089034676662, + "grad_norm": 53018.1875, + "learning_rate": 3.771169399322732e-05, + "loss": 2.1581, + "step": 11585 + }, + { + "epoch": 2.1716963448922213, + "grad_norm": 51640.46484375, + "learning_rate": 3.7704077243165704e-05, + "loss": 2.0909, + "step": 11586 + }, + { + "epoch": 2.171883786316776, + "grad_norm": 51837.68359375, + "learning_rate": 3.7696460796797434e-05, + "loss": 2.1154, + "step": 11587 + }, + { + "epoch": 2.172071227741331, + "grad_norm": 55272.26171875, + "learning_rate": 3.768884465431063e-05, + "loss": 2.1837, + "step": 11588 + }, + { + "epoch": 2.1722586691658856, + "grad_norm": 51825.20703125, + "learning_rate": 3.768122881589341e-05, + "loss": 2.1617, + "step": 11589 + }, + { + "epoch": 2.1724461105904407, + "grad_norm": 55343.63671875, + "learning_rate": 3.767361328173386e-05, + "loss": 2.1442, + "step": 11590 + }, + { + "epoch": 2.1726335520149953, + "grad_norm": 51305.59375, + "learning_rate": 3.766599805202008e-05, + "loss": 2.1797, + "step": 11591 + }, + { + "epoch": 2.17282099343955, + "grad_norm": 51558.7890625, + "learning_rate": 3.765838312694013e-05, + "loss": 2.1177, + "step": 11592 + }, + { + "epoch": 2.173008434864105, + "grad_norm": 57273.96484375, + "learning_rate": 3.765076850668217e-05, + "loss": 2.1302, + "step": 11593 + }, + { + "epoch": 2.1731958762886596, + "grad_norm": 53511.21875, + "learning_rate": 3.764315419143417e-05, + "loss": 2.1308, + "step": 11594 + }, + { + "epoch": 2.1733833177132147, + "grad_norm": 53484.84765625, + "learning_rate": 3.763554018138426e-05, + "loss": 2.1217, + "step": 11595 + }, + { + "epoch": 2.1735707591377693, + "grad_norm": 53870.1015625, + "learning_rate": 3.7627926476720474e-05, + "loss": 2.0644, + "step": 11596 + }, + { + "epoch": 2.1737582005623244, + "grad_norm": 52070.59765625, + "learning_rate": 3.762031307763087e-05, + "loss": 2.1719, + "step": 11597 + }, + { + "epoch": 2.173945641986879, + "grad_norm": 55040.61328125, + "learning_rate": 3.7612699984303476e-05, + "loss": 2.089, + "step": 11598 + }, + { + "epoch": 2.174133083411434, + "grad_norm": 55679.3828125, + "learning_rate": 3.7605087196926325e-05, + "loss": 2.2463, + "step": 11599 + }, + { + "epoch": 2.1743205248359887, + "grad_norm": 50539.73046875, + "learning_rate": 3.759747471568748e-05, + "loss": 2.0973, + "step": 11600 + }, + { + "epoch": 2.1745079662605438, + "grad_norm": 51229.94921875, + "learning_rate": 3.75898625407749e-05, + "loss": 2.1532, + "step": 11601 + }, + { + "epoch": 2.1746954076850984, + "grad_norm": 50516.80859375, + "learning_rate": 3.758225067237663e-05, + "loss": 2.1055, + "step": 11602 + }, + { + "epoch": 2.174882849109653, + "grad_norm": 52690.984375, + "learning_rate": 3.757463911068067e-05, + "loss": 2.0604, + "step": 11603 + }, + { + "epoch": 2.175070290534208, + "grad_norm": 57014.0703125, + "learning_rate": 3.756702785587502e-05, + "loss": 2.1744, + "step": 11604 + }, + { + "epoch": 2.1752577319587627, + "grad_norm": 58187.859375, + "learning_rate": 3.755941690814766e-05, + "loss": 2.1318, + "step": 11605 + }, + { + "epoch": 2.1754451733833178, + "grad_norm": 50462.73828125, + "learning_rate": 3.755180626768657e-05, + "loss": 2.1531, + "step": 11606 + }, + { + "epoch": 2.1756326148078724, + "grad_norm": 54670.80859375, + "learning_rate": 3.7544195934679714e-05, + "loss": 2.1893, + "step": 11607 + }, + { + "epoch": 2.1758200562324275, + "grad_norm": 59991.5390625, + "learning_rate": 3.7536585909315105e-05, + "loss": 2.0623, + "step": 11608 + }, + { + "epoch": 2.176007497656982, + "grad_norm": 55997.77734375, + "learning_rate": 3.7528976191780626e-05, + "loss": 2.101, + "step": 11609 + }, + { + "epoch": 2.176194939081537, + "grad_norm": 50092.5546875, + "learning_rate": 3.752136678226428e-05, + "loss": 2.1395, + "step": 11610 + }, + { + "epoch": 2.176382380506092, + "grad_norm": 55877.29296875, + "learning_rate": 3.7513757680954e-05, + "loss": 2.1431, + "step": 11611 + }, + { + "epoch": 2.176569821930647, + "grad_norm": 54864.03125, + "learning_rate": 3.7506148888037725e-05, + "loss": 2.0946, + "step": 11612 + }, + { + "epoch": 2.1767572633552015, + "grad_norm": 57584.51953125, + "learning_rate": 3.749854040370336e-05, + "loss": 2.1355, + "step": 11613 + }, + { + "epoch": 2.176944704779756, + "grad_norm": 53306.29296875, + "learning_rate": 3.749093222813884e-05, + "loss": 2.1182, + "step": 11614 + }, + { + "epoch": 2.177132146204311, + "grad_norm": 57608.4609375, + "learning_rate": 3.7483324361532086e-05, + "loss": 2.1371, + "step": 11615 + }, + { + "epoch": 2.177319587628866, + "grad_norm": 51959.1328125, + "learning_rate": 3.7475716804070984e-05, + "loss": 2.1356, + "step": 11616 + }, + { + "epoch": 2.177507029053421, + "grad_norm": 55396.87890625, + "learning_rate": 3.746810955594343e-05, + "loss": 2.1778, + "step": 11617 + }, + { + "epoch": 2.1776944704779755, + "grad_norm": 53839.7734375, + "learning_rate": 3.746050261733732e-05, + "loss": 2.102, + "step": 11618 + }, + { + "epoch": 2.1778819119025306, + "grad_norm": 53651.76953125, + "learning_rate": 3.745289598844056e-05, + "loss": 2.1696, + "step": 11619 + }, + { + "epoch": 2.178069353327085, + "grad_norm": 56938.1953125, + "learning_rate": 3.744528966944099e-05, + "loss": 2.1285, + "step": 11620 + }, + { + "epoch": 2.1782567947516402, + "grad_norm": 57586.46875, + "learning_rate": 3.743768366052649e-05, + "loss": 2.1061, + "step": 11621 + }, + { + "epoch": 2.178444236176195, + "grad_norm": 53487.78125, + "learning_rate": 3.74300779618849e-05, + "loss": 2.1814, + "step": 11622 + }, + { + "epoch": 2.17863167760075, + "grad_norm": 56055.77734375, + "learning_rate": 3.742247257370413e-05, + "loss": 2.116, + "step": 11623 + }, + { + "epoch": 2.1788191190253046, + "grad_norm": 53850.69921875, + "learning_rate": 3.7414867496171934e-05, + "loss": 2.1438, + "step": 11624 + }, + { + "epoch": 2.179006560449859, + "grad_norm": 52501.94140625, + "learning_rate": 3.740726272947621e-05, + "loss": 2.1475, + "step": 11625 + }, + { + "epoch": 2.1791940018744143, + "grad_norm": 50538.30078125, + "learning_rate": 3.7399658273804775e-05, + "loss": 2.1639, + "step": 11626 + }, + { + "epoch": 2.179381443298969, + "grad_norm": 58024.09375, + "learning_rate": 3.7392054129345446e-05, + "loss": 2.1995, + "step": 11627 + }, + { + "epoch": 2.179568884723524, + "grad_norm": 53278.46484375, + "learning_rate": 3.7384450296286024e-05, + "loss": 2.1577, + "step": 11628 + }, + { + "epoch": 2.1797563261480786, + "grad_norm": 54681.39453125, + "learning_rate": 3.7376846774814325e-05, + "loss": 2.1537, + "step": 11629 + }, + { + "epoch": 2.1799437675726336, + "grad_norm": 51793.55859375, + "learning_rate": 3.736924356511816e-05, + "loss": 2.2199, + "step": 11630 + }, + { + "epoch": 2.1801312089971883, + "grad_norm": 51943.2578125, + "learning_rate": 3.736164066738529e-05, + "loss": 2.1571, + "step": 11631 + }, + { + "epoch": 2.1803186504217433, + "grad_norm": 50815.4140625, + "learning_rate": 3.73540380818035e-05, + "loss": 2.1356, + "step": 11632 + }, + { + "epoch": 2.180506091846298, + "grad_norm": 54752.2578125, + "learning_rate": 3.7346435808560576e-05, + "loss": 2.1419, + "step": 11633 + }, + { + "epoch": 2.180693533270853, + "grad_norm": 52947.23828125, + "learning_rate": 3.73388338478443e-05, + "loss": 2.1481, + "step": 11634 + }, + { + "epoch": 2.1808809746954076, + "grad_norm": 56344.4296875, + "learning_rate": 3.73312321998424e-05, + "loss": 2.0748, + "step": 11635 + }, + { + "epoch": 2.1810684161199627, + "grad_norm": 52386.19140625, + "learning_rate": 3.732363086474264e-05, + "loss": 2.1078, + "step": 11636 + }, + { + "epoch": 2.1812558575445173, + "grad_norm": 56265.83203125, + "learning_rate": 3.731602984273275e-05, + "loss": 2.1383, + "step": 11637 + }, + { + "epoch": 2.181443298969072, + "grad_norm": 53147.92578125, + "learning_rate": 3.7308429134000513e-05, + "loss": 2.13, + "step": 11638 + }, + { + "epoch": 2.181630740393627, + "grad_norm": 49406.7421875, + "learning_rate": 3.730082873873359e-05, + "loss": 2.1638, + "step": 11639 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 58857.75390625, + "learning_rate": 3.729322865711974e-05, + "loss": 2.1076, + "step": 11640 + }, + { + "epoch": 2.1820056232427367, + "grad_norm": 53254.1875, + "learning_rate": 3.728562888934668e-05, + "loss": 2.155, + "step": 11641 + }, + { + "epoch": 2.1821930646672913, + "grad_norm": 52238.07421875, + "learning_rate": 3.727802943560209e-05, + "loss": 2.1486, + "step": 11642 + }, + { + "epoch": 2.1823805060918464, + "grad_norm": 50988.42578125, + "learning_rate": 3.7270430296073675e-05, + "loss": 2.2186, + "step": 11643 + }, + { + "epoch": 2.182567947516401, + "grad_norm": 53009.078125, + "learning_rate": 3.7262831470949136e-05, + "loss": 2.1723, + "step": 11644 + }, + { + "epoch": 2.182755388940956, + "grad_norm": 52252.12109375, + "learning_rate": 3.725523296041615e-05, + "loss": 2.1265, + "step": 11645 + }, + { + "epoch": 2.1829428303655107, + "grad_norm": 54191.54296875, + "learning_rate": 3.724763476466238e-05, + "loss": 2.2309, + "step": 11646 + }, + { + "epoch": 2.183130271790066, + "grad_norm": 52693.29296875, + "learning_rate": 3.7240036883875487e-05, + "loss": 2.1204, + "step": 11647 + }, + { + "epoch": 2.1833177132146204, + "grad_norm": 57187.90234375, + "learning_rate": 3.723243931824315e-05, + "loss": 2.128, + "step": 11648 + }, + { + "epoch": 2.183505154639175, + "grad_norm": 53303.484375, + "learning_rate": 3.722484206795301e-05, + "loss": 2.1388, + "step": 11649 + }, + { + "epoch": 2.18369259606373, + "grad_norm": 55514.6328125, + "learning_rate": 3.7217245133192715e-05, + "loss": 2.0847, + "step": 11650 + }, + { + "epoch": 2.1838800374882847, + "grad_norm": 56781.2109375, + "learning_rate": 3.7209648514149875e-05, + "loss": 2.1033, + "step": 11651 + }, + { + "epoch": 2.18406747891284, + "grad_norm": 52717.3984375, + "learning_rate": 3.720205221101214e-05, + "loss": 2.0792, + "step": 11652 + }, + { + "epoch": 2.1842549203373944, + "grad_norm": 57271.69140625, + "learning_rate": 3.719445622396714e-05, + "loss": 2.2004, + "step": 11653 + }, + { + "epoch": 2.1844423617619495, + "grad_norm": 53584.3046875, + "learning_rate": 3.718686055320244e-05, + "loss": 2.1612, + "step": 11654 + }, + { + "epoch": 2.184629803186504, + "grad_norm": 49412.50390625, + "learning_rate": 3.717926519890568e-05, + "loss": 2.1542, + "step": 11655 + }, + { + "epoch": 2.184817244611059, + "grad_norm": 56423.109375, + "learning_rate": 3.717167016126447e-05, + "loss": 2.155, + "step": 11656 + }, + { + "epoch": 2.185004686035614, + "grad_norm": 50362.74609375, + "learning_rate": 3.716407544046634e-05, + "loss": 2.1094, + "step": 11657 + }, + { + "epoch": 2.185192127460169, + "grad_norm": 52523.0546875, + "learning_rate": 3.715648103669892e-05, + "loss": 2.1832, + "step": 11658 + }, + { + "epoch": 2.1853795688847235, + "grad_norm": 55004.33203125, + "learning_rate": 3.714888695014977e-05, + "loss": 2.154, + "step": 11659 + }, + { + "epoch": 2.1855670103092786, + "grad_norm": 51430.37109375, + "learning_rate": 3.714129318100645e-05, + "loss": 2.1928, + "step": 11660 + }, + { + "epoch": 2.185754451733833, + "grad_norm": 56720.47265625, + "learning_rate": 3.713369972945652e-05, + "loss": 2.1895, + "step": 11661 + }, + { + "epoch": 2.185941893158388, + "grad_norm": 49756.24609375, + "learning_rate": 3.712610659568751e-05, + "loss": 2.091, + "step": 11662 + }, + { + "epoch": 2.186129334582943, + "grad_norm": 50657.2890625, + "learning_rate": 3.711851377988698e-05, + "loss": 2.0725, + "step": 11663 + }, + { + "epoch": 2.1863167760074975, + "grad_norm": 53437.09765625, + "learning_rate": 3.711092128224247e-05, + "loss": 2.2282, + "step": 11664 + }, + { + "epoch": 2.1865042174320526, + "grad_norm": 50477.57421875, + "learning_rate": 3.710332910294149e-05, + "loss": 2.1186, + "step": 11665 + }, + { + "epoch": 2.186691658856607, + "grad_norm": 47019.36328125, + "learning_rate": 3.709573724217156e-05, + "loss": 2.0851, + "step": 11666 + }, + { + "epoch": 2.1868791002811623, + "grad_norm": 53036.03125, + "learning_rate": 3.708814570012018e-05, + "loss": 2.0831, + "step": 11667 + }, + { + "epoch": 2.187066541705717, + "grad_norm": 56389.96484375, + "learning_rate": 3.7080554476974884e-05, + "loss": 2.1037, + "step": 11668 + }, + { + "epoch": 2.187253983130272, + "grad_norm": 52972.734375, + "learning_rate": 3.7072963572923126e-05, + "loss": 2.1273, + "step": 11669 + }, + { + "epoch": 2.1874414245548266, + "grad_norm": 53457.828125, + "learning_rate": 3.70653729881524e-05, + "loss": 2.1507, + "step": 11670 + }, + { + "epoch": 2.1876288659793817, + "grad_norm": 50163.6171875, + "learning_rate": 3.705778272285021e-05, + "loss": 2.0869, + "step": 11671 + }, + { + "epoch": 2.1878163074039363, + "grad_norm": 51399.42578125, + "learning_rate": 3.705019277720399e-05, + "loss": 2.1971, + "step": 11672 + }, + { + "epoch": 2.188003748828491, + "grad_norm": 51224.25390625, + "learning_rate": 3.704260315140122e-05, + "loss": 2.0802, + "step": 11673 + }, + { + "epoch": 2.188191190253046, + "grad_norm": 59997.265625, + "learning_rate": 3.703501384562936e-05, + "loss": 2.1248, + "step": 11674 + }, + { + "epoch": 2.1883786316776006, + "grad_norm": 49613.75, + "learning_rate": 3.7027424860075855e-05, + "loss": 2.1489, + "step": 11675 + }, + { + "epoch": 2.1885660731021557, + "grad_norm": 51299.453125, + "learning_rate": 3.7019836194928125e-05, + "loss": 2.1144, + "step": 11676 + }, + { + "epoch": 2.1887535145267103, + "grad_norm": 53075.05859375, + "learning_rate": 3.7012247850373604e-05, + "loss": 2.1504, + "step": 11677 + }, + { + "epoch": 2.1889409559512654, + "grad_norm": 51874.90234375, + "learning_rate": 3.700465982659973e-05, + "loss": 2.2213, + "step": 11678 + }, + { + "epoch": 2.18912839737582, + "grad_norm": 55119.35546875, + "learning_rate": 3.699707212379393e-05, + "loss": 2.1074, + "step": 11679 + }, + { + "epoch": 2.189315838800375, + "grad_norm": 52775.45703125, + "learning_rate": 3.698948474214357e-05, + "loss": 2.1348, + "step": 11680 + }, + { + "epoch": 2.1895032802249297, + "grad_norm": 55371.9375, + "learning_rate": 3.698189768183607e-05, + "loss": 2.1531, + "step": 11681 + }, + { + "epoch": 2.1896907216494848, + "grad_norm": 52145.07421875, + "learning_rate": 3.697431094305882e-05, + "loss": 2.1308, + "step": 11682 + }, + { + "epoch": 2.1898781630740394, + "grad_norm": 50915.26953125, + "learning_rate": 3.6966724525999205e-05, + "loss": 2.136, + "step": 11683 + }, + { + "epoch": 2.190065604498594, + "grad_norm": 52628.94921875, + "learning_rate": 3.69591384308446e-05, + "loss": 2.1077, + "step": 11684 + }, + { + "epoch": 2.190253045923149, + "grad_norm": 52232.17578125, + "learning_rate": 3.6951552657782343e-05, + "loss": 2.0687, + "step": 11685 + }, + { + "epoch": 2.1904404873477037, + "grad_norm": 54412.61328125, + "learning_rate": 3.6943967206999864e-05, + "loss": 2.0992, + "step": 11686 + }, + { + "epoch": 2.1906279287722588, + "grad_norm": 50240.95703125, + "learning_rate": 3.6936382078684426e-05, + "loss": 2.1905, + "step": 11687 + }, + { + "epoch": 2.1908153701968134, + "grad_norm": 52239.70703125, + "learning_rate": 3.692879727302342e-05, + "loss": 2.0824, + "step": 11688 + }, + { + "epoch": 2.1910028116213685, + "grad_norm": 50072.85546875, + "learning_rate": 3.6921212790204186e-05, + "loss": 2.1484, + "step": 11689 + }, + { + "epoch": 2.191190253045923, + "grad_norm": 50763.30078125, + "learning_rate": 3.6913628630414044e-05, + "loss": 2.1039, + "step": 11690 + }, + { + "epoch": 2.191377694470478, + "grad_norm": 52813.98046875, + "learning_rate": 3.69060447938403e-05, + "loss": 2.1717, + "step": 11691 + }, + { + "epoch": 2.1915651358950328, + "grad_norm": 48175.53125, + "learning_rate": 3.6898461280670255e-05, + "loss": 2.1504, + "step": 11692 + }, + { + "epoch": 2.191752577319588, + "grad_norm": 55393.62109375, + "learning_rate": 3.689087809109125e-05, + "loss": 2.1814, + "step": 11693 + }, + { + "epoch": 2.1919400187441425, + "grad_norm": 50024.3125, + "learning_rate": 3.6883295225290557e-05, + "loss": 2.1212, + "step": 11694 + }, + { + "epoch": 2.192127460168697, + "grad_norm": 51865.44921875, + "learning_rate": 3.6875712683455465e-05, + "loss": 2.1589, + "step": 11695 + }, + { + "epoch": 2.192314901593252, + "grad_norm": 54324.9609375, + "learning_rate": 3.6868130465773256e-05, + "loss": 2.1283, + "step": 11696 + }, + { + "epoch": 2.1925023430178068, + "grad_norm": 53121.78515625, + "learning_rate": 3.686054857243121e-05, + "loss": 2.0688, + "step": 11697 + }, + { + "epoch": 2.192689784442362, + "grad_norm": 50959.56640625, + "learning_rate": 3.6852967003616564e-05, + "loss": 2.1064, + "step": 11698 + }, + { + "epoch": 2.1928772258669165, + "grad_norm": 57101.765625, + "learning_rate": 3.6845385759516585e-05, + "loss": 2.1491, + "step": 11699 + }, + { + "epoch": 2.1930646672914715, + "grad_norm": 56270.77734375, + "learning_rate": 3.683780484031851e-05, + "loss": 2.1128, + "step": 11700 + }, + { + "epoch": 2.193252108716026, + "grad_norm": 55034.84765625, + "learning_rate": 3.683022424620963e-05, + "loss": 2.15, + "step": 11701 + }, + { + "epoch": 2.1934395501405812, + "grad_norm": 52935.578125, + "learning_rate": 3.68226439773771e-05, + "loss": 2.1175, + "step": 11702 + }, + { + "epoch": 2.193626991565136, + "grad_norm": 54865.484375, + "learning_rate": 3.68150640340082e-05, + "loss": 2.0518, + "step": 11703 + }, + { + "epoch": 2.193814432989691, + "grad_norm": 55423.05859375, + "learning_rate": 3.6807484416290116e-05, + "loss": 2.1008, + "step": 11704 + }, + { + "epoch": 2.1940018744142455, + "grad_norm": 51014.6796875, + "learning_rate": 3.679990512441007e-05, + "loss": 2.1583, + "step": 11705 + }, + { + "epoch": 2.1941893158388, + "grad_norm": 53273.54296875, + "learning_rate": 3.6792326158555245e-05, + "loss": 2.1585, + "step": 11706 + }, + { + "epoch": 2.1943767572633552, + "grad_norm": 54427.95703125, + "learning_rate": 3.678474751891284e-05, + "loss": 2.1428, + "step": 11707 + }, + { + "epoch": 2.19456419868791, + "grad_norm": 54903.609375, + "learning_rate": 3.677716920567003e-05, + "loss": 2.1207, + "step": 11708 + }, + { + "epoch": 2.194751640112465, + "grad_norm": 50962.4453125, + "learning_rate": 3.676959121901402e-05, + "loss": 2.1041, + "step": 11709 + }, + { + "epoch": 2.1949390815370196, + "grad_norm": 55891.05859375, + "learning_rate": 3.676201355913193e-05, + "loss": 2.1218, + "step": 11710 + }, + { + "epoch": 2.1951265229615746, + "grad_norm": 52730.6796875, + "learning_rate": 3.675443622621095e-05, + "loss": 2.1697, + "step": 11711 + }, + { + "epoch": 2.1953139643861292, + "grad_norm": 55637.47265625, + "learning_rate": 3.674685922043824e-05, + "loss": 2.139, + "step": 11712 + }, + { + "epoch": 2.1955014058106843, + "grad_norm": 56037.90234375, + "learning_rate": 3.67392825420009e-05, + "loss": 2.0929, + "step": 11713 + }, + { + "epoch": 2.195688847235239, + "grad_norm": 50666.2890625, + "learning_rate": 3.673170619108609e-05, + "loss": 2.1404, + "step": 11714 + }, + { + "epoch": 2.195876288659794, + "grad_norm": 54304.21875, + "learning_rate": 3.672413016788093e-05, + "loss": 2.1247, + "step": 11715 + }, + { + "epoch": 2.1960637300843486, + "grad_norm": 54727.109375, + "learning_rate": 3.671655447257257e-05, + "loss": 2.1351, + "step": 11716 + }, + { + "epoch": 2.1962511715089033, + "grad_norm": 58638.8828125, + "learning_rate": 3.670897910534806e-05, + "loss": 2.0863, + "step": 11717 + }, + { + "epoch": 2.1964386129334583, + "grad_norm": 56364.03125, + "learning_rate": 3.6701404066394554e-05, + "loss": 2.1377, + "step": 11718 + }, + { + "epoch": 2.196626054358013, + "grad_norm": 56599.15234375, + "learning_rate": 3.6693829355899125e-05, + "loss": 2.12, + "step": 11719 + }, + { + "epoch": 2.196813495782568, + "grad_norm": 55403.125, + "learning_rate": 3.668625497404886e-05, + "loss": 2.1452, + "step": 11720 + }, + { + "epoch": 2.1970009372071226, + "grad_norm": 51686.859375, + "learning_rate": 3.667868092103083e-05, + "loss": 2.1243, + "step": 11721 + }, + { + "epoch": 2.1971883786316777, + "grad_norm": 58279.828125, + "learning_rate": 3.667110719703211e-05, + "loss": 2.1318, + "step": 11722 + }, + { + "epoch": 2.1973758200562323, + "grad_norm": 52590.31640625, + "learning_rate": 3.666353380223975e-05, + "loss": 2.1757, + "step": 11723 + }, + { + "epoch": 2.1975632614807874, + "grad_norm": 51192.09765625, + "learning_rate": 3.6655960736840845e-05, + "loss": 2.1623, + "step": 11724 + }, + { + "epoch": 2.197750702905342, + "grad_norm": 54135.35546875, + "learning_rate": 3.664838800102237e-05, + "loss": 2.0861, + "step": 11725 + }, + { + "epoch": 2.197938144329897, + "grad_norm": 52127.6796875, + "learning_rate": 3.6640815594971424e-05, + "loss": 2.1922, + "step": 11726 + }, + { + "epoch": 2.1981255857544517, + "grad_norm": 45971.078125, + "learning_rate": 3.6633243518875014e-05, + "loss": 2.1215, + "step": 11727 + }, + { + "epoch": 2.1983130271790063, + "grad_norm": 54271.10546875, + "learning_rate": 3.662567177292016e-05, + "loss": 2.178, + "step": 11728 + }, + { + "epoch": 2.1985004686035614, + "grad_norm": 52475.65234375, + "learning_rate": 3.661810035729386e-05, + "loss": 2.117, + "step": 11729 + }, + { + "epoch": 2.198687910028116, + "grad_norm": 53968.2265625, + "learning_rate": 3.6610529272183125e-05, + "loss": 2.1195, + "step": 11730 + }, + { + "epoch": 2.198875351452671, + "grad_norm": 52223.03125, + "learning_rate": 3.660295851777499e-05, + "loss": 2.2154, + "step": 11731 + }, + { + "epoch": 2.1990627928772257, + "grad_norm": 56382.01171875, + "learning_rate": 3.6595388094256375e-05, + "loss": 2.1714, + "step": 11732 + }, + { + "epoch": 2.199250234301781, + "grad_norm": 51391.70703125, + "learning_rate": 3.658781800181431e-05, + "loss": 2.1401, + "step": 11733 + }, + { + "epoch": 2.1994376757263354, + "grad_norm": 54222.51953125, + "learning_rate": 3.6580248240635756e-05, + "loss": 2.1219, + "step": 11734 + }, + { + "epoch": 2.1996251171508905, + "grad_norm": 52763.46875, + "learning_rate": 3.657267881090768e-05, + "loss": 2.1531, + "step": 11735 + }, + { + "epoch": 2.199812558575445, + "grad_norm": 54240.25, + "learning_rate": 3.656510971281701e-05, + "loss": 2.1239, + "step": 11736 + }, + { + "epoch": 2.2, + "grad_norm": 51941.73828125, + "learning_rate": 3.655754094655072e-05, + "loss": 2.1033, + "step": 11737 + }, + { + "epoch": 2.200187441424555, + "grad_norm": 50120.21484375, + "learning_rate": 3.6549972512295736e-05, + "loss": 2.1315, + "step": 11738 + }, + { + "epoch": 2.2003748828491094, + "grad_norm": 57717.78125, + "learning_rate": 3.6542404410239e-05, + "loss": 2.1207, + "step": 11739 + }, + { + "epoch": 2.2005623242736645, + "grad_norm": 55065.53125, + "learning_rate": 3.6534836640567406e-05, + "loss": 2.1845, + "step": 11740 + }, + { + "epoch": 2.200749765698219, + "grad_norm": 56240.18359375, + "learning_rate": 3.6527269203467906e-05, + "loss": 2.0515, + "step": 11741 + }, + { + "epoch": 2.200937207122774, + "grad_norm": 50367.8828125, + "learning_rate": 3.651970209912739e-05, + "loss": 2.1777, + "step": 11742 + }, + { + "epoch": 2.201124648547329, + "grad_norm": 53169.6484375, + "learning_rate": 3.6512135327732756e-05, + "loss": 2.2186, + "step": 11743 + }, + { + "epoch": 2.201312089971884, + "grad_norm": 61803.46484375, + "learning_rate": 3.650456888947088e-05, + "loss": 2.227, + "step": 11744 + }, + { + "epoch": 2.2014995313964385, + "grad_norm": 52259.95703125, + "learning_rate": 3.6497002784528663e-05, + "loss": 2.1668, + "step": 11745 + }, + { + "epoch": 2.2016869728209936, + "grad_norm": 52307.88671875, + "learning_rate": 3.6489437013092986e-05, + "loss": 2.145, + "step": 11746 + }, + { + "epoch": 2.201874414245548, + "grad_norm": 51458.83984375, + "learning_rate": 3.648187157535068e-05, + "loss": 2.1169, + "step": 11747 + }, + { + "epoch": 2.2020618556701033, + "grad_norm": 51595.23828125, + "learning_rate": 3.6474306471488605e-05, + "loss": 2.1177, + "step": 11748 + }, + { + "epoch": 2.202249297094658, + "grad_norm": 53851.9765625, + "learning_rate": 3.646674170169364e-05, + "loss": 2.1246, + "step": 11749 + }, + { + "epoch": 2.2024367385192125, + "grad_norm": 55825.24609375, + "learning_rate": 3.645917726615262e-05, + "loss": 2.0966, + "step": 11750 + }, + { + "epoch": 2.2026241799437676, + "grad_norm": 55393.265625, + "learning_rate": 3.6451613165052356e-05, + "loss": 2.1176, + "step": 11751 + }, + { + "epoch": 2.202811621368322, + "grad_norm": 50508.8359375, + "learning_rate": 3.644404939857968e-05, + "loss": 2.1315, + "step": 11752 + }, + { + "epoch": 2.2029990627928773, + "grad_norm": 55448.359375, + "learning_rate": 3.6436485966921416e-05, + "loss": 2.1543, + "step": 11753 + }, + { + "epoch": 2.203186504217432, + "grad_norm": 56262.0390625, + "learning_rate": 3.642892287026436e-05, + "loss": 2.119, + "step": 11754 + }, + { + "epoch": 2.203373945641987, + "grad_norm": 53357.98828125, + "learning_rate": 3.642136010879529e-05, + "loss": 2.1291, + "step": 11755 + }, + { + "epoch": 2.2035613870665416, + "grad_norm": 58224.8984375, + "learning_rate": 3.6413797682701046e-05, + "loss": 2.1164, + "step": 11756 + }, + { + "epoch": 2.2037488284910967, + "grad_norm": 51867.515625, + "learning_rate": 3.6406235592168383e-05, + "loss": 2.1138, + "step": 11757 + }, + { + "epoch": 2.2039362699156513, + "grad_norm": 53483.45703125, + "learning_rate": 3.639867383738407e-05, + "loss": 2.0437, + "step": 11758 + }, + { + "epoch": 2.2041237113402063, + "grad_norm": 50427.38671875, + "learning_rate": 3.639111241853488e-05, + "loss": 2.1085, + "step": 11759 + }, + { + "epoch": 2.204311152764761, + "grad_norm": 53038.48828125, + "learning_rate": 3.638355133580757e-05, + "loss": 2.1563, + "step": 11760 + }, + { + "epoch": 2.204498594189316, + "grad_norm": 51934.52734375, + "learning_rate": 3.63759905893889e-05, + "loss": 2.0908, + "step": 11761 + }, + { + "epoch": 2.2046860356138707, + "grad_norm": 52839.3125, + "learning_rate": 3.6368430179465584e-05, + "loss": 2.1738, + "step": 11762 + }, + { + "epoch": 2.2048734770384253, + "grad_norm": 56456.109375, + "learning_rate": 3.636087010622436e-05, + "loss": 2.1615, + "step": 11763 + }, + { + "epoch": 2.2050609184629804, + "grad_norm": 56312.81640625, + "learning_rate": 3.6353310369851966e-05, + "loss": 2.1587, + "step": 11764 + }, + { + "epoch": 2.205248359887535, + "grad_norm": 54861.3984375, + "learning_rate": 3.634575097053513e-05, + "loss": 2.1292, + "step": 11765 + }, + { + "epoch": 2.20543580131209, + "grad_norm": 48694.58984375, + "learning_rate": 3.6338191908460535e-05, + "loss": 2.1356, + "step": 11766 + }, + { + "epoch": 2.2056232427366447, + "grad_norm": 49593.10546875, + "learning_rate": 3.6330633183814875e-05, + "loss": 2.2023, + "step": 11767 + }, + { + "epoch": 2.2058106841611997, + "grad_norm": 50390.8984375, + "learning_rate": 3.632307479678487e-05, + "loss": 2.1456, + "step": 11768 + }, + { + "epoch": 2.2059981255857544, + "grad_norm": 50846.79296875, + "learning_rate": 3.631551674755718e-05, + "loss": 2.1539, + "step": 11769 + }, + { + "epoch": 2.2061855670103094, + "grad_norm": 50170.05078125, + "learning_rate": 3.630795903631847e-05, + "loss": 2.1324, + "step": 11770 + }, + { + "epoch": 2.206373008434864, + "grad_norm": 52284.33984375, + "learning_rate": 3.6300401663255426e-05, + "loss": 2.1102, + "step": 11771 + }, + { + "epoch": 2.206560449859419, + "grad_norm": 50162.53125, + "learning_rate": 3.629284462855471e-05, + "loss": 2.1327, + "step": 11772 + }, + { + "epoch": 2.2067478912839738, + "grad_norm": 57474.6953125, + "learning_rate": 3.628528793240296e-05, + "loss": 2.1354, + "step": 11773 + }, + { + "epoch": 2.206935332708529, + "grad_norm": 52169.18359375, + "learning_rate": 3.6277731574986806e-05, + "loss": 2.1435, + "step": 11774 + }, + { + "epoch": 2.2071227741330834, + "grad_norm": 52898.90234375, + "learning_rate": 3.627017555649289e-05, + "loss": 2.1482, + "step": 11775 + }, + { + "epoch": 2.207310215557638, + "grad_norm": 55507.40234375, + "learning_rate": 3.626261987710785e-05, + "loss": 2.1305, + "step": 11776 + }, + { + "epoch": 2.207497656982193, + "grad_norm": 51515.08984375, + "learning_rate": 3.625506453701828e-05, + "loss": 2.1328, + "step": 11777 + }, + { + "epoch": 2.2076850984067478, + "grad_norm": 56989.1875, + "learning_rate": 3.6247509536410786e-05, + "loss": 2.1363, + "step": 11778 + }, + { + "epoch": 2.207872539831303, + "grad_norm": 54728.11328125, + "learning_rate": 3.623995487547198e-05, + "loss": 2.2057, + "step": 11779 + }, + { + "epoch": 2.2080599812558575, + "grad_norm": 56834.94140625, + "learning_rate": 3.623240055438847e-05, + "loss": 2.1791, + "step": 11780 + }, + { + "epoch": 2.2082474226804125, + "grad_norm": 52174.78125, + "learning_rate": 3.6224846573346795e-05, + "loss": 2.1777, + "step": 11781 + }, + { + "epoch": 2.208434864104967, + "grad_norm": 53788.4765625, + "learning_rate": 3.621729293253355e-05, + "loss": 2.0371, + "step": 11782 + }, + { + "epoch": 2.208622305529522, + "grad_norm": 54743.19140625, + "learning_rate": 3.620973963213532e-05, + "loss": 2.0626, + "step": 11783 + }, + { + "epoch": 2.208809746954077, + "grad_norm": 56027.828125, + "learning_rate": 3.620218667233862e-05, + "loss": 2.1295, + "step": 11784 + }, + { + "epoch": 2.208997188378632, + "grad_norm": 55835.0078125, + "learning_rate": 3.619463405333002e-05, + "loss": 2.1635, + "step": 11785 + }, + { + "epoch": 2.2091846298031865, + "grad_norm": 62989.9453125, + "learning_rate": 3.6187081775296066e-05, + "loss": 2.1047, + "step": 11786 + }, + { + "epoch": 2.209372071227741, + "grad_norm": 53095.4921875, + "learning_rate": 3.6179529838423305e-05, + "loss": 2.1611, + "step": 11787 + }, + { + "epoch": 2.209559512652296, + "grad_norm": 57283.125, + "learning_rate": 3.617197824289821e-05, + "loss": 2.1801, + "step": 11788 + }, + { + "epoch": 2.209746954076851, + "grad_norm": 57107.6953125, + "learning_rate": 3.616442698890733e-05, + "loss": 2.0591, + "step": 11789 + }, + { + "epoch": 2.209934395501406, + "grad_norm": 58680.7734375, + "learning_rate": 3.6156876076637156e-05, + "loss": 2.1597, + "step": 11790 + }, + { + "epoch": 2.2101218369259605, + "grad_norm": 49810.546875, + "learning_rate": 3.614932550627422e-05, + "loss": 2.1592, + "step": 11791 + }, + { + "epoch": 2.2103092783505156, + "grad_norm": 56236.07421875, + "learning_rate": 3.614177527800497e-05, + "loss": 2.134, + "step": 11792 + }, + { + "epoch": 2.2104967197750702, + "grad_norm": 53237.08203125, + "learning_rate": 3.61342253920159e-05, + "loss": 2.1124, + "step": 11793 + }, + { + "epoch": 2.2106841611996253, + "grad_norm": 49914.9140625, + "learning_rate": 3.612667584849351e-05, + "loss": 2.0886, + "step": 11794 + }, + { + "epoch": 2.21087160262418, + "grad_norm": 55045.6015625, + "learning_rate": 3.611912664762421e-05, + "loss": 2.1337, + "step": 11795 + }, + { + "epoch": 2.211059044048735, + "grad_norm": 52373.015625, + "learning_rate": 3.611157778959449e-05, + "loss": 2.1284, + "step": 11796 + }, + { + "epoch": 2.2112464854732896, + "grad_norm": 50751.04296875, + "learning_rate": 3.61040292745908e-05, + "loss": 2.2205, + "step": 11797 + }, + { + "epoch": 2.2114339268978442, + "grad_norm": 58713.8125, + "learning_rate": 3.6096481102799575e-05, + "loss": 2.1213, + "step": 11798 + }, + { + "epoch": 2.2116213683223993, + "grad_norm": 60355.1953125, + "learning_rate": 3.6088933274407236e-05, + "loss": 2.0858, + "step": 11799 + }, + { + "epoch": 2.211808809746954, + "grad_norm": 50765.8046875, + "learning_rate": 3.6081385789600206e-05, + "loss": 2.1635, + "step": 11800 + }, + { + "epoch": 2.211996251171509, + "grad_norm": 54228.20703125, + "learning_rate": 3.6073838648564885e-05, + "loss": 2.1789, + "step": 11801 + }, + { + "epoch": 2.2121836925960636, + "grad_norm": 50440.89453125, + "learning_rate": 3.606629185148773e-05, + "loss": 2.1663, + "step": 11802 + }, + { + "epoch": 2.2123711340206187, + "grad_norm": 59054.08203125, + "learning_rate": 3.6058745398555074e-05, + "loss": 2.137, + "step": 11803 + }, + { + "epoch": 2.2125585754451733, + "grad_norm": 47913.91796875, + "learning_rate": 3.605119928995334e-05, + "loss": 2.0848, + "step": 11804 + }, + { + "epoch": 2.2127460168697284, + "grad_norm": 51424.94140625, + "learning_rate": 3.604365352586889e-05, + "loss": 2.0446, + "step": 11805 + }, + { + "epoch": 2.212933458294283, + "grad_norm": 53563.1875, + "learning_rate": 3.603610810648813e-05, + "loss": 2.1196, + "step": 11806 + }, + { + "epoch": 2.213120899718838, + "grad_norm": 58791.6640625, + "learning_rate": 3.602856303199737e-05, + "loss": 2.1667, + "step": 11807 + }, + { + "epoch": 2.2133083411433927, + "grad_norm": 55017.66796875, + "learning_rate": 3.6021018302582987e-05, + "loss": 2.1921, + "step": 11808 + }, + { + "epoch": 2.2134957825679473, + "grad_norm": 55291.05078125, + "learning_rate": 3.601347391843135e-05, + "loss": 2.167, + "step": 11809 + }, + { + "epoch": 2.2136832239925024, + "grad_norm": 52117.671875, + "learning_rate": 3.600592987972875e-05, + "loss": 2.1242, + "step": 11810 + }, + { + "epoch": 2.213870665417057, + "grad_norm": 51226.14453125, + "learning_rate": 3.599838618666155e-05, + "loss": 2.1829, + "step": 11811 + }, + { + "epoch": 2.214058106841612, + "grad_norm": 58200.33984375, + "learning_rate": 3.599084283941605e-05, + "loss": 2.1319, + "step": 11812 + }, + { + "epoch": 2.2142455482661667, + "grad_norm": 54056.8984375, + "learning_rate": 3.598329983817859e-05, + "loss": 2.1311, + "step": 11813 + }, + { + "epoch": 2.2144329896907218, + "grad_norm": 51534.25, + "learning_rate": 3.597575718313543e-05, + "loss": 2.1583, + "step": 11814 + }, + { + "epoch": 2.2146204311152764, + "grad_norm": 51189.7109375, + "learning_rate": 3.5968214874472895e-05, + "loss": 2.135, + "step": 11815 + }, + { + "epoch": 2.2148078725398315, + "grad_norm": 60024.14453125, + "learning_rate": 3.596067291237724e-05, + "loss": 2.1487, + "step": 11816 + }, + { + "epoch": 2.214995313964386, + "grad_norm": 48109.17578125, + "learning_rate": 3.59531312970348e-05, + "loss": 2.099, + "step": 11817 + }, + { + "epoch": 2.215182755388941, + "grad_norm": 52438.30078125, + "learning_rate": 3.594559002863177e-05, + "loss": 2.1343, + "step": 11818 + }, + { + "epoch": 2.215370196813496, + "grad_norm": 50064.10546875, + "learning_rate": 3.5938049107354454e-05, + "loss": 2.1432, + "step": 11819 + }, + { + "epoch": 2.2155576382380504, + "grad_norm": 56281.5078125, + "learning_rate": 3.5930508533389094e-05, + "loss": 2.1472, + "step": 11820 + }, + { + "epoch": 2.2157450796626055, + "grad_norm": 50115.60546875, + "learning_rate": 3.592296830692194e-05, + "loss": 2.1841, + "step": 11821 + }, + { + "epoch": 2.21593252108716, + "grad_norm": 54127.78125, + "learning_rate": 3.591542842813922e-05, + "loss": 2.0735, + "step": 11822 + }, + { + "epoch": 2.216119962511715, + "grad_norm": 53531.03515625, + "learning_rate": 3.590788889722714e-05, + "loss": 2.0704, + "step": 11823 + }, + { + "epoch": 2.21630740393627, + "grad_norm": 51158.40625, + "learning_rate": 3.590034971437196e-05, + "loss": 2.1621, + "step": 11824 + }, + { + "epoch": 2.216494845360825, + "grad_norm": 52624.27734375, + "learning_rate": 3.589281087975983e-05, + "loss": 2.1374, + "step": 11825 + }, + { + "epoch": 2.2166822867853795, + "grad_norm": 51678.0234375, + "learning_rate": 3.588527239357699e-05, + "loss": 2.1197, + "step": 11826 + }, + { + "epoch": 2.2168697282099346, + "grad_norm": 48878.90234375, + "learning_rate": 3.5877734256009633e-05, + "loss": 2.1012, + "step": 11827 + }, + { + "epoch": 2.217057169634489, + "grad_norm": 53231.23828125, + "learning_rate": 3.587019646724393e-05, + "loss": 2.1356, + "step": 11828 + }, + { + "epoch": 2.2172446110590442, + "grad_norm": 50307.44140625, + "learning_rate": 3.586265902746605e-05, + "loss": 2.0711, + "step": 11829 + }, + { + "epoch": 2.217432052483599, + "grad_norm": 56217.0859375, + "learning_rate": 3.585512193686216e-05, + "loss": 2.1412, + "step": 11830 + }, + { + "epoch": 2.2176194939081535, + "grad_norm": 55662.3203125, + "learning_rate": 3.584758519561841e-05, + "loss": 2.1173, + "step": 11831 + }, + { + "epoch": 2.2178069353327086, + "grad_norm": 51044.203125, + "learning_rate": 3.584004880392099e-05, + "loss": 2.1931, + "step": 11832 + }, + { + "epoch": 2.217994376757263, + "grad_norm": 49712.45703125, + "learning_rate": 3.583251276195597e-05, + "loss": 2.1915, + "step": 11833 + }, + { + "epoch": 2.2181818181818183, + "grad_norm": 53041.8359375, + "learning_rate": 3.5824977069909526e-05, + "loss": 2.1438, + "step": 11834 + }, + { + "epoch": 2.218369259606373, + "grad_norm": 55386.7265625, + "learning_rate": 3.581744172796778e-05, + "loss": 2.1433, + "step": 11835 + }, + { + "epoch": 2.218556701030928, + "grad_norm": 50323.26953125, + "learning_rate": 3.580990673631683e-05, + "loss": 2.1856, + "step": 11836 + }, + { + "epoch": 2.2187441424554826, + "grad_norm": 52547.33203125, + "learning_rate": 3.580237209514279e-05, + "loss": 2.1335, + "step": 11837 + }, + { + "epoch": 2.2189315838800376, + "grad_norm": 50295.30859375, + "learning_rate": 3.5794837804631745e-05, + "loss": 2.0759, + "step": 11838 + }, + { + "epoch": 2.2191190253045923, + "grad_norm": 50288.22265625, + "learning_rate": 3.5787303864969795e-05, + "loss": 2.1646, + "step": 11839 + }, + { + "epoch": 2.2193064667291473, + "grad_norm": 55701.51171875, + "learning_rate": 3.5779770276343005e-05, + "loss": 2.1918, + "step": 11840 + }, + { + "epoch": 2.219493908153702, + "grad_norm": 52457.50390625, + "learning_rate": 3.577223703893745e-05, + "loss": 2.1394, + "step": 11841 + }, + { + "epoch": 2.2196813495782566, + "grad_norm": 53444.44140625, + "learning_rate": 3.5764704152939196e-05, + "loss": 2.1102, + "step": 11842 + }, + { + "epoch": 2.2198687910028116, + "grad_norm": 54331.5234375, + "learning_rate": 3.57571716185343e-05, + "loss": 2.1325, + "step": 11843 + }, + { + "epoch": 2.2200562324273663, + "grad_norm": 51779.8359375, + "learning_rate": 3.57496394359088e-05, + "loss": 2.1565, + "step": 11844 + }, + { + "epoch": 2.2202436738519213, + "grad_norm": 57087.22265625, + "learning_rate": 3.574210760524872e-05, + "loss": 2.1445, + "step": 11845 + }, + { + "epoch": 2.220431115276476, + "grad_norm": 54532.703125, + "learning_rate": 3.573457612674009e-05, + "loss": 2.1251, + "step": 11846 + }, + { + "epoch": 2.220618556701031, + "grad_norm": 51839.88671875, + "learning_rate": 3.572704500056897e-05, + "loss": 2.0818, + "step": 11847 + }, + { + "epoch": 2.2208059981255857, + "grad_norm": 55209.8203125, + "learning_rate": 3.57195142269213e-05, + "loss": 2.0937, + "step": 11848 + }, + { + "epoch": 2.2209934395501407, + "grad_norm": 55717.1796875, + "learning_rate": 3.5711983805983115e-05, + "loss": 2.1588, + "step": 11849 + }, + { + "epoch": 2.2211808809746953, + "grad_norm": 54963.71484375, + "learning_rate": 3.5704453737940425e-05, + "loss": 2.089, + "step": 11850 + }, + { + "epoch": 2.2213683223992504, + "grad_norm": 52850.9765625, + "learning_rate": 3.569692402297918e-05, + "loss": 2.1357, + "step": 11851 + }, + { + "epoch": 2.221555763823805, + "grad_norm": 51801.15625, + "learning_rate": 3.568939466128537e-05, + "loss": 2.1217, + "step": 11852 + }, + { + "epoch": 2.2217432052483597, + "grad_norm": 57760.546875, + "learning_rate": 3.568186565304495e-05, + "loss": 2.1447, + "step": 11853 + }, + { + "epoch": 2.2219306466729147, + "grad_norm": 47750.9375, + "learning_rate": 3.5674336998443905e-05, + "loss": 2.1517, + "step": 11854 + }, + { + "epoch": 2.2221180880974694, + "grad_norm": 53604.78125, + "learning_rate": 3.566680869766815e-05, + "loss": 2.1251, + "step": 11855 + }, + { + "epoch": 2.2223055295220244, + "grad_norm": 54217.46875, + "learning_rate": 3.565928075090362e-05, + "loss": 2.1793, + "step": 11856 + }, + { + "epoch": 2.222492970946579, + "grad_norm": 51207.21875, + "learning_rate": 3.565175315833628e-05, + "loss": 2.1126, + "step": 11857 + }, + { + "epoch": 2.222680412371134, + "grad_norm": 54508.84375, + "learning_rate": 3.564422592015204e-05, + "loss": 2.1012, + "step": 11858 + }, + { + "epoch": 2.2228678537956887, + "grad_norm": 50930.9453125, + "learning_rate": 3.56366990365368e-05, + "loss": 2.1952, + "step": 11859 + }, + { + "epoch": 2.223055295220244, + "grad_norm": 51749.6875, + "learning_rate": 3.562917250767647e-05, + "loss": 2.1018, + "step": 11860 + }, + { + "epoch": 2.2232427366447984, + "grad_norm": 57784.8828125, + "learning_rate": 3.5621646333756945e-05, + "loss": 2.1015, + "step": 11861 + }, + { + "epoch": 2.2234301780693535, + "grad_norm": 55937.3203125, + "learning_rate": 3.561412051496413e-05, + "loss": 2.1033, + "step": 11862 + }, + { + "epoch": 2.223617619493908, + "grad_norm": 51299.0625, + "learning_rate": 3.560659505148386e-05, + "loss": 2.1318, + "step": 11863 + }, + { + "epoch": 2.2238050609184628, + "grad_norm": 54807.2890625, + "learning_rate": 3.559906994350205e-05, + "loss": 2.1008, + "step": 11864 + }, + { + "epoch": 2.223992502343018, + "grad_norm": 55245.60546875, + "learning_rate": 3.5591545191204545e-05, + "loss": 2.2163, + "step": 11865 + }, + { + "epoch": 2.2241799437675724, + "grad_norm": 50020.96875, + "learning_rate": 3.5584020794777186e-05, + "loss": 2.1433, + "step": 11866 + }, + { + "epoch": 2.2243673851921275, + "grad_norm": 61486.2890625, + "learning_rate": 3.557649675440581e-05, + "loss": 2.1784, + "step": 11867 + }, + { + "epoch": 2.224554826616682, + "grad_norm": 49160.9921875, + "learning_rate": 3.556897307027628e-05, + "loss": 2.1108, + "step": 11868 + }, + { + "epoch": 2.224742268041237, + "grad_norm": 51985.14453125, + "learning_rate": 3.556144974257441e-05, + "loss": 2.0826, + "step": 11869 + }, + { + "epoch": 2.224929709465792, + "grad_norm": 59327.03515625, + "learning_rate": 3.5553926771485994e-05, + "loss": 2.112, + "step": 11870 + }, + { + "epoch": 2.225117150890347, + "grad_norm": 58160.88671875, + "learning_rate": 3.5546404157196853e-05, + "loss": 2.1319, + "step": 11871 + }, + { + "epoch": 2.2253045923149015, + "grad_norm": 55611.1171875, + "learning_rate": 3.5538881899892805e-05, + "loss": 2.0967, + "step": 11872 + }, + { + "epoch": 2.2254920337394566, + "grad_norm": 49900.65625, + "learning_rate": 3.5531359999759626e-05, + "loss": 2.1101, + "step": 11873 + }, + { + "epoch": 2.225679475164011, + "grad_norm": 51232.25, + "learning_rate": 3.5523838456983095e-05, + "loss": 2.1574, + "step": 11874 + }, + { + "epoch": 2.2258669165885663, + "grad_norm": 52811.21484375, + "learning_rate": 3.551631727174898e-05, + "loss": 2.122, + "step": 11875 + }, + { + "epoch": 2.226054358013121, + "grad_norm": 53607.53515625, + "learning_rate": 3.550879644424304e-05, + "loss": 2.0772, + "step": 11876 + }, + { + "epoch": 2.2262417994376755, + "grad_norm": 51800.30078125, + "learning_rate": 3.5501275974651075e-05, + "loss": 2.1574, + "step": 11877 + }, + { + "epoch": 2.2264292408622306, + "grad_norm": 53806.71875, + "learning_rate": 3.549375586315876e-05, + "loss": 2.1988, + "step": 11878 + }, + { + "epoch": 2.226616682286785, + "grad_norm": 53753.94921875, + "learning_rate": 3.5486236109951874e-05, + "loss": 2.125, + "step": 11879 + }, + { + "epoch": 2.2268041237113403, + "grad_norm": 54225.22265625, + "learning_rate": 3.547871671521616e-05, + "loss": 2.1847, + "step": 11880 + }, + { + "epoch": 2.226991565135895, + "grad_norm": 55570.25, + "learning_rate": 3.5471197679137294e-05, + "loss": 2.1297, + "step": 11881 + }, + { + "epoch": 2.22717900656045, + "grad_norm": 54165.375, + "learning_rate": 3.546367900190101e-05, + "loss": 2.1678, + "step": 11882 + }, + { + "epoch": 2.2273664479850046, + "grad_norm": 51005.74609375, + "learning_rate": 3.5456160683693016e-05, + "loss": 2.1694, + "step": 11883 + }, + { + "epoch": 2.2275538894095597, + "grad_norm": 53419.9296875, + "learning_rate": 3.5448642724699e-05, + "loss": 2.0971, + "step": 11884 + }, + { + "epoch": 2.2277413308341143, + "grad_norm": 50631.87109375, + "learning_rate": 3.5441125125104635e-05, + "loss": 2.157, + "step": 11885 + }, + { + "epoch": 2.2279287722586694, + "grad_norm": 59233.33984375, + "learning_rate": 3.54336078850956e-05, + "loss": 2.1757, + "step": 11886 + }, + { + "epoch": 2.228116213683224, + "grad_norm": 54241.7109375, + "learning_rate": 3.542609100485756e-05, + "loss": 2.1345, + "step": 11887 + }, + { + "epoch": 2.2283036551077786, + "grad_norm": 51644.703125, + "learning_rate": 3.54185744845762e-05, + "loss": 2.1294, + "step": 11888 + }, + { + "epoch": 2.2284910965323337, + "grad_norm": 56402.45703125, + "learning_rate": 3.541105832443713e-05, + "loss": 2.0855, + "step": 11889 + }, + { + "epoch": 2.2286785379568883, + "grad_norm": 50877.578125, + "learning_rate": 3.5403542524626006e-05, + "loss": 2.1168, + "step": 11890 + }, + { + "epoch": 2.2288659793814434, + "grad_norm": 53228.02734375, + "learning_rate": 3.5396027085328464e-05, + "loss": 2.1259, + "step": 11891 + }, + { + "epoch": 2.229053420805998, + "grad_norm": 50165.29296875, + "learning_rate": 3.538851200673011e-05, + "loss": 2.1235, + "step": 11892 + }, + { + "epoch": 2.229240862230553, + "grad_norm": 57394.79296875, + "learning_rate": 3.538099728901657e-05, + "loss": 2.1464, + "step": 11893 + }, + { + "epoch": 2.2294283036551077, + "grad_norm": 48369.578125, + "learning_rate": 3.537348293237343e-05, + "loss": 2.1599, + "step": 11894 + }, + { + "epoch": 2.2296157450796628, + "grad_norm": 48284.67578125, + "learning_rate": 3.536596893698633e-05, + "loss": 2.1346, + "step": 11895 + }, + { + "epoch": 2.2298031865042174, + "grad_norm": 56063.4375, + "learning_rate": 3.535845530304078e-05, + "loss": 2.1366, + "step": 11896 + }, + { + "epoch": 2.2299906279287725, + "grad_norm": 51246.47265625, + "learning_rate": 3.535094203072242e-05, + "loss": 2.183, + "step": 11897 + }, + { + "epoch": 2.230178069353327, + "grad_norm": 54153.828125, + "learning_rate": 3.534342912021681e-05, + "loss": 2.1653, + "step": 11898 + }, + { + "epoch": 2.230365510777882, + "grad_norm": 53004.6875, + "learning_rate": 3.5335916571709494e-05, + "loss": 2.1375, + "step": 11899 + }, + { + "epoch": 2.2305529522024368, + "grad_norm": 53268.65625, + "learning_rate": 3.532840438538603e-05, + "loss": 2.1568, + "step": 11900 + }, + { + "epoch": 2.2307403936269914, + "grad_norm": 52421.01953125, + "learning_rate": 3.532089256143193e-05, + "loss": 2.1081, + "step": 11901 + }, + { + "epoch": 2.2309278350515465, + "grad_norm": 51443.7265625, + "learning_rate": 3.531338110003277e-05, + "loss": 2.1477, + "step": 11902 + }, + { + "epoch": 2.231115276476101, + "grad_norm": 54150.28515625, + "learning_rate": 3.5305870001374065e-05, + "loss": 2.081, + "step": 11903 + }, + { + "epoch": 2.231302717900656, + "grad_norm": 51018.19921875, + "learning_rate": 3.529835926564131e-05, + "loss": 2.1077, + "step": 11904 + }, + { + "epoch": 2.2314901593252108, + "grad_norm": 52935.4609375, + "learning_rate": 3.5290848893020015e-05, + "loss": 2.1788, + "step": 11905 + }, + { + "epoch": 2.231677600749766, + "grad_norm": 55085.05078125, + "learning_rate": 3.52833388836957e-05, + "loss": 2.2086, + "step": 11906 + }, + { + "epoch": 2.2318650421743205, + "grad_norm": 51305.1015625, + "learning_rate": 3.527582923785382e-05, + "loss": 2.147, + "step": 11907 + }, + { + "epoch": 2.2320524835988755, + "grad_norm": 52897.1796875, + "learning_rate": 3.5268319955679865e-05, + "loss": 2.1379, + "step": 11908 + }, + { + "epoch": 2.23223992502343, + "grad_norm": 57983.93359375, + "learning_rate": 3.5260811037359306e-05, + "loss": 2.1785, + "step": 11909 + }, + { + "epoch": 2.2324273664479852, + "grad_norm": 54431.44921875, + "learning_rate": 3.525330248307763e-05, + "loss": 2.1519, + "step": 11910 + }, + { + "epoch": 2.23261480787254, + "grad_norm": 50079.9140625, + "learning_rate": 3.524579429302023e-05, + "loss": 2.0708, + "step": 11911 + }, + { + "epoch": 2.2328022492970945, + "grad_norm": 52728.62109375, + "learning_rate": 3.523828646737259e-05, + "loss": 2.1407, + "step": 11912 + }, + { + "epoch": 2.2329896907216495, + "grad_norm": 54369.84765625, + "learning_rate": 3.5230779006320144e-05, + "loss": 2.098, + "step": 11913 + }, + { + "epoch": 2.233177132146204, + "grad_norm": 55075.3359375, + "learning_rate": 3.5223271910048315e-05, + "loss": 2.1708, + "step": 11914 + }, + { + "epoch": 2.2333645735707592, + "grad_norm": 54241.43359375, + "learning_rate": 3.52157651787425e-05, + "loss": 2.1518, + "step": 11915 + }, + { + "epoch": 2.233552014995314, + "grad_norm": 55202.16015625, + "learning_rate": 3.52082588125881e-05, + "loss": 2.156, + "step": 11916 + }, + { + "epoch": 2.233739456419869, + "grad_norm": 60645.65234375, + "learning_rate": 3.520075281177055e-05, + "loss": 2.2441, + "step": 11917 + }, + { + "epoch": 2.2339268978444236, + "grad_norm": 53173.578125, + "learning_rate": 3.5193247176475224e-05, + "loss": 2.1371, + "step": 11918 + }, + { + "epoch": 2.2341143392689786, + "grad_norm": 53333.29296875, + "learning_rate": 3.518574190688749e-05, + "loss": 2.2072, + "step": 11919 + }, + { + "epoch": 2.2343017806935332, + "grad_norm": 56520.08203125, + "learning_rate": 3.5178237003192714e-05, + "loss": 2.02, + "step": 11920 + }, + { + "epoch": 2.2344892221180883, + "grad_norm": 52851.70703125, + "learning_rate": 3.517073246557629e-05, + "loss": 2.0648, + "step": 11921 + }, + { + "epoch": 2.234676663542643, + "grad_norm": 52112.68359375, + "learning_rate": 3.516322829422353e-05, + "loss": 2.1343, + "step": 11922 + }, + { + "epoch": 2.2348641049671976, + "grad_norm": 51659.31640625, + "learning_rate": 3.51557244893198e-05, + "loss": 2.0868, + "step": 11923 + }, + { + "epoch": 2.2350515463917526, + "grad_norm": 54913.0390625, + "learning_rate": 3.514822105105041e-05, + "loss": 2.0683, + "step": 11924 + }, + { + "epoch": 2.2352389878163073, + "grad_norm": 54150.3984375, + "learning_rate": 3.514071797960074e-05, + "loss": 2.0919, + "step": 11925 + }, + { + "epoch": 2.2354264292408623, + "grad_norm": 54200.2421875, + "learning_rate": 3.513321527515604e-05, + "loss": 2.1068, + "step": 11926 + }, + { + "epoch": 2.235613870665417, + "grad_norm": 51250.75390625, + "learning_rate": 3.5125712937901653e-05, + "loss": 2.1548, + "step": 11927 + }, + { + "epoch": 2.235801312089972, + "grad_norm": 51681.52734375, + "learning_rate": 3.511821096802288e-05, + "loss": 2.1471, + "step": 11928 + }, + { + "epoch": 2.2359887535145266, + "grad_norm": 55145.5859375, + "learning_rate": 3.5110709365705005e-05, + "loss": 2.1556, + "step": 11929 + }, + { + "epoch": 2.2361761949390817, + "grad_norm": 56009.47265625, + "learning_rate": 3.5103208131133306e-05, + "loss": 2.1878, + "step": 11930 + }, + { + "epoch": 2.2363636363636363, + "grad_norm": 53212.94921875, + "learning_rate": 3.509570726449304e-05, + "loss": 2.116, + "step": 11931 + }, + { + "epoch": 2.2365510777881914, + "grad_norm": 52560.7734375, + "learning_rate": 3.508820676596949e-05, + "loss": 2.2039, + "step": 11932 + }, + { + "epoch": 2.236738519212746, + "grad_norm": 52564.70703125, + "learning_rate": 3.50807066357479e-05, + "loss": 2.0397, + "step": 11933 + }, + { + "epoch": 2.2369259606373006, + "grad_norm": 51593.2265625, + "learning_rate": 3.50732068740135e-05, + "loss": 2.1424, + "step": 11934 + }, + { + "epoch": 2.2371134020618557, + "grad_norm": 53597.4453125, + "learning_rate": 3.506570748095154e-05, + "loss": 2.151, + "step": 11935 + }, + { + "epoch": 2.2373008434864103, + "grad_norm": 57972.09375, + "learning_rate": 3.5058208456747254e-05, + "loss": 2.1228, + "step": 11936 + }, + { + "epoch": 2.2374882849109654, + "grad_norm": 53962.1953125, + "learning_rate": 3.505070980158584e-05, + "loss": 2.1272, + "step": 11937 + }, + { + "epoch": 2.23767572633552, + "grad_norm": 47780.4375, + "learning_rate": 3.504321151565251e-05, + "loss": 2.1689, + "step": 11938 + }, + { + "epoch": 2.237863167760075, + "grad_norm": 52697.0234375, + "learning_rate": 3.503571359913246e-05, + "loss": 2.1668, + "step": 11939 + }, + { + "epoch": 2.2380506091846297, + "grad_norm": 52105.1796875, + "learning_rate": 3.50282160522109e-05, + "loss": 2.14, + "step": 11940 + }, + { + "epoch": 2.238238050609185, + "grad_norm": 51373.546875, + "learning_rate": 3.5020718875072974e-05, + "loss": 2.0553, + "step": 11941 + }, + { + "epoch": 2.2384254920337394, + "grad_norm": 52628.82421875, + "learning_rate": 3.501322206790388e-05, + "loss": 2.1257, + "step": 11942 + }, + { + "epoch": 2.2386129334582945, + "grad_norm": 54606.96484375, + "learning_rate": 3.5005725630888765e-05, + "loss": 2.1633, + "step": 11943 + }, + { + "epoch": 2.238800374882849, + "grad_norm": 53062.046875, + "learning_rate": 3.4998229564212806e-05, + "loss": 2.1411, + "step": 11944 + }, + { + "epoch": 2.2389878163074037, + "grad_norm": 54542.828125, + "learning_rate": 3.49907338680611e-05, + "loss": 2.1171, + "step": 11945 + }, + { + "epoch": 2.239175257731959, + "grad_norm": 50319.3984375, + "learning_rate": 3.4983238542618825e-05, + "loss": 2.1142, + "step": 11946 + }, + { + "epoch": 2.2393626991565134, + "grad_norm": 56602.69140625, + "learning_rate": 3.4975743588071084e-05, + "loss": 2.1736, + "step": 11947 + }, + { + "epoch": 2.2395501405810685, + "grad_norm": 50785.9140625, + "learning_rate": 3.4968249004602996e-05, + "loss": 2.1598, + "step": 11948 + }, + { + "epoch": 2.239737582005623, + "grad_norm": 55335.76171875, + "learning_rate": 3.496075479239966e-05, + "loss": 2.1519, + "step": 11949 + }, + { + "epoch": 2.239925023430178, + "grad_norm": 55176.69140625, + "learning_rate": 3.4953260951646185e-05, + "loss": 2.2354, + "step": 11950 + }, + { + "epoch": 2.240112464854733, + "grad_norm": 60729.77734375, + "learning_rate": 3.494576748252767e-05, + "loss": 2.1693, + "step": 11951 + }, + { + "epoch": 2.240299906279288, + "grad_norm": 52052.9140625, + "learning_rate": 3.493827438522917e-05, + "loss": 2.1271, + "step": 11952 + }, + { + "epoch": 2.2404873477038425, + "grad_norm": 53963.52734375, + "learning_rate": 3.4930781659935765e-05, + "loss": 2.155, + "step": 11953 + }, + { + "epoch": 2.2406747891283976, + "grad_norm": 49888.34765625, + "learning_rate": 3.492328930683251e-05, + "loss": 2.0725, + "step": 11954 + }, + { + "epoch": 2.240862230552952, + "grad_norm": 53119.33984375, + "learning_rate": 3.491579732610448e-05, + "loss": 2.1086, + "step": 11955 + }, + { + "epoch": 2.241049671977507, + "grad_norm": 53928.25, + "learning_rate": 3.4908305717936676e-05, + "loss": 2.1176, + "step": 11956 + }, + { + "epoch": 2.241237113402062, + "grad_norm": 55613.8359375, + "learning_rate": 3.490081448251416e-05, + "loss": 2.1281, + "step": 11957 + }, + { + "epoch": 2.2414245548266165, + "grad_norm": 58309.70703125, + "learning_rate": 3.4893323620021954e-05, + "loss": 2.1291, + "step": 11958 + }, + { + "epoch": 2.2416119962511716, + "grad_norm": 54002.12890625, + "learning_rate": 3.488583313064508e-05, + "loss": 2.0921, + "step": 11959 + }, + { + "epoch": 2.241799437675726, + "grad_norm": 55034.5390625, + "learning_rate": 3.4878343014568525e-05, + "loss": 2.1909, + "step": 11960 + }, + { + "epoch": 2.2419868791002813, + "grad_norm": 53635.23046875, + "learning_rate": 3.487085327197729e-05, + "loss": 2.125, + "step": 11961 + }, + { + "epoch": 2.242174320524836, + "grad_norm": 54738.828125, + "learning_rate": 3.4863363903056365e-05, + "loss": 2.1483, + "step": 11962 + }, + { + "epoch": 2.242361761949391, + "grad_norm": 58589.421875, + "learning_rate": 3.4855874907990725e-05, + "loss": 2.0483, + "step": 11963 + }, + { + "epoch": 2.2425492033739456, + "grad_norm": 53397.58203125, + "learning_rate": 3.484838628696533e-05, + "loss": 2.1665, + "step": 11964 + }, + { + "epoch": 2.2427366447985007, + "grad_norm": 53661.75390625, + "learning_rate": 3.484089804016516e-05, + "loss": 2.0801, + "step": 11965 + }, + { + "epoch": 2.2429240862230553, + "grad_norm": 54569.93359375, + "learning_rate": 3.483341016777516e-05, + "loss": 2.0661, + "step": 11966 + }, + { + "epoch": 2.24311152764761, + "grad_norm": 51417.83984375, + "learning_rate": 3.4825922669980254e-05, + "loss": 2.1206, + "step": 11967 + }, + { + "epoch": 2.243298969072165, + "grad_norm": 55451.90625, + "learning_rate": 3.481843554696539e-05, + "loss": 2.1054, + "step": 11968 + }, + { + "epoch": 2.2434864104967196, + "grad_norm": 53473.2890625, + "learning_rate": 3.4810948798915466e-05, + "loss": 2.0958, + "step": 11969 + }, + { + "epoch": 2.2436738519212747, + "grad_norm": 58540.0078125, + "learning_rate": 3.4803462426015454e-05, + "loss": 2.07, + "step": 11970 + }, + { + "epoch": 2.2438612933458293, + "grad_norm": 54288.47265625, + "learning_rate": 3.479597642845018e-05, + "loss": 2.1438, + "step": 11971 + }, + { + "epoch": 2.2440487347703844, + "grad_norm": 57139.81640625, + "learning_rate": 3.478849080640459e-05, + "loss": 2.1615, + "step": 11972 + }, + { + "epoch": 2.244236176194939, + "grad_norm": 53837.625, + "learning_rate": 3.4781005560063556e-05, + "loss": 2.111, + "step": 11973 + }, + { + "epoch": 2.244423617619494, + "grad_norm": 57487.80078125, + "learning_rate": 3.477352068961196e-05, + "loss": 2.0818, + "step": 11974 + }, + { + "epoch": 2.2446110590440487, + "grad_norm": 54284.97265625, + "learning_rate": 3.4766036195234654e-05, + "loss": 2.0891, + "step": 11975 + }, + { + "epoch": 2.2447985004686037, + "grad_norm": 57498.015625, + "learning_rate": 3.47585520771165e-05, + "loss": 2.1402, + "step": 11976 + }, + { + "epoch": 2.2449859418931584, + "grad_norm": 54646.7734375, + "learning_rate": 3.475106833544236e-05, + "loss": 2.0686, + "step": 11977 + }, + { + "epoch": 2.245173383317713, + "grad_norm": 52427.01171875, + "learning_rate": 3.4743584970397055e-05, + "loss": 2.1638, + "step": 11978 + }, + { + "epoch": 2.245360824742268, + "grad_norm": 53031.84765625, + "learning_rate": 3.473610198216541e-05, + "loss": 2.0595, + "step": 11979 + }, + { + "epoch": 2.2455482661668227, + "grad_norm": 50561.73046875, + "learning_rate": 3.4728619370932265e-05, + "loss": 2.1501, + "step": 11980 + }, + { + "epoch": 2.2457357075913777, + "grad_norm": 51335.19921875, + "learning_rate": 3.472113713688243e-05, + "loss": 2.1151, + "step": 11981 + }, + { + "epoch": 2.2459231490159324, + "grad_norm": 60038.5078125, + "learning_rate": 3.4713655280200684e-05, + "loss": 2.1689, + "step": 11982 + }, + { + "epoch": 2.2461105904404874, + "grad_norm": 49481.76171875, + "learning_rate": 3.4706173801071844e-05, + "loss": 2.1121, + "step": 11983 + }, + { + "epoch": 2.246298031865042, + "grad_norm": 54982.98046875, + "learning_rate": 3.469869269968068e-05, + "loss": 2.207, + "step": 11984 + }, + { + "epoch": 2.246485473289597, + "grad_norm": 50284.3203125, + "learning_rate": 3.469121197621197e-05, + "loss": 2.0083, + "step": 11985 + }, + { + "epoch": 2.2466729147141518, + "grad_norm": 52419.828125, + "learning_rate": 3.4683731630850474e-05, + "loss": 2.1107, + "step": 11986 + }, + { + "epoch": 2.246860356138707, + "grad_norm": 50611.09765625, + "learning_rate": 3.467625166378093e-05, + "loss": 2.1787, + "step": 11987 + }, + { + "epoch": 2.2470477975632615, + "grad_norm": 53290.30078125, + "learning_rate": 3.466877207518815e-05, + "loss": 2.2225, + "step": 11988 + }, + { + "epoch": 2.247235238987816, + "grad_norm": 55067.65625, + "learning_rate": 3.466129286525678e-05, + "loss": 2.1157, + "step": 11989 + }, + { + "epoch": 2.247422680412371, + "grad_norm": 58367.56640625, + "learning_rate": 3.4653814034171594e-05, + "loss": 2.128, + "step": 11990 + }, + { + "epoch": 2.2476101218369258, + "grad_norm": 51745.3359375, + "learning_rate": 3.464633558211732e-05, + "loss": 2.1453, + "step": 11991 + }, + { + "epoch": 2.247797563261481, + "grad_norm": 57447.44921875, + "learning_rate": 3.463885750927864e-05, + "loss": 2.164, + "step": 11992 + }, + { + "epoch": 2.2479850046860355, + "grad_norm": 49528.609375, + "learning_rate": 3.463137981584027e-05, + "loss": 2.0678, + "step": 11993 + }, + { + "epoch": 2.2481724461105905, + "grad_norm": 49713.84765625, + "learning_rate": 3.4623902501986884e-05, + "loss": 2.1685, + "step": 11994 + }, + { + "epoch": 2.248359887535145, + "grad_norm": 59740.41796875, + "learning_rate": 3.461642556790317e-05, + "loss": 2.0766, + "step": 11995 + }, + { + "epoch": 2.2485473289597, + "grad_norm": 55919.37109375, + "learning_rate": 3.460894901377382e-05, + "loss": 2.3349, + "step": 11996 + }, + { + "epoch": 2.248734770384255, + "grad_norm": 54454.28125, + "learning_rate": 3.460147283978346e-05, + "loss": 2.1666, + "step": 11997 + }, + { + "epoch": 2.24892221180881, + "grad_norm": 53993.0, + "learning_rate": 3.459399704611675e-05, + "loss": 2.2275, + "step": 11998 + }, + { + "epoch": 2.2491096532333645, + "grad_norm": 55715.359375, + "learning_rate": 3.458652163295835e-05, + "loss": 2.1679, + "step": 11999 + }, + { + "epoch": 2.2492970946579196, + "grad_norm": 55755.91015625, + "learning_rate": 3.457904660049288e-05, + "loss": 2.1607, + "step": 12000 + }, + { + "epoch": 2.2492970946579196, + "eval_loss": 2.276926279067993, + "eval_runtime": 129.2061, + "eval_samples_per_second": 39.077, + "eval_steps_per_second": 1.958, + "step": 12000 + }, + { + "epoch": 2.2494845360824742, + "grad_norm": 53484.39453125, + "learning_rate": 3.457157194890496e-05, + "loss": 2.1191, + "step": 12001 + }, + { + "epoch": 2.249671977507029, + "grad_norm": 54388.28125, + "learning_rate": 3.45640976783792e-05, + "loss": 2.1259, + "step": 12002 + }, + { + "epoch": 2.249859418931584, + "grad_norm": 53919.984375, + "learning_rate": 3.4556623789100254e-05, + "loss": 2.1262, + "step": 12003 + }, + { + "epoch": 2.2500468603561385, + "grad_norm": 51704.89453125, + "learning_rate": 3.4549150281252636e-05, + "loss": 2.1633, + "step": 12004 + }, + { + "epoch": 2.2502343017806936, + "grad_norm": 50195.55078125, + "learning_rate": 3.454167715502099e-05, + "loss": 2.1067, + "step": 12005 + }, + { + "epoch": 2.2504217432052482, + "grad_norm": 54460.7109375, + "learning_rate": 3.4534204410589866e-05, + "loss": 2.0897, + "step": 12006 + }, + { + "epoch": 2.2506091846298033, + "grad_norm": 53432.640625, + "learning_rate": 3.452673204814386e-05, + "loss": 2.1107, + "step": 12007 + }, + { + "epoch": 2.250796626054358, + "grad_norm": 53683.80078125, + "learning_rate": 3.4519260067867496e-05, + "loss": 2.1059, + "step": 12008 + }, + { + "epoch": 2.250984067478913, + "grad_norm": 58754.2421875, + "learning_rate": 3.4511788469945335e-05, + "loss": 2.0499, + "step": 12009 + }, + { + "epoch": 2.2511715089034676, + "grad_norm": 56733.15234375, + "learning_rate": 3.450431725456192e-05, + "loss": 2.0701, + "step": 12010 + }, + { + "epoch": 2.2513589503280222, + "grad_norm": 59941.97265625, + "learning_rate": 3.4496846421901796e-05, + "loss": 2.1326, + "step": 12011 + }, + { + "epoch": 2.2515463917525773, + "grad_norm": 51904.94140625, + "learning_rate": 3.448937597214945e-05, + "loss": 2.2511, + "step": 12012 + }, + { + "epoch": 2.2517338331771324, + "grad_norm": 58171.58984375, + "learning_rate": 3.448190590548941e-05, + "loss": 2.1222, + "step": 12013 + }, + { + "epoch": 2.251921274601687, + "grad_norm": 53960.59765625, + "learning_rate": 3.4474436222106176e-05, + "loss": 2.138, + "step": 12014 + }, + { + "epoch": 2.2521087160262416, + "grad_norm": 54134.4453125, + "learning_rate": 3.446696692218424e-05, + "loss": 2.1455, + "step": 12015 + }, + { + "epoch": 2.2522961574507967, + "grad_norm": 52660.3203125, + "learning_rate": 3.445949800590809e-05, + "loss": 2.1733, + "step": 12016 + }, + { + "epoch": 2.2524835988753513, + "grad_norm": 61965.2890625, + "learning_rate": 3.445202947346216e-05, + "loss": 2.1781, + "step": 12017 + }, + { + "epoch": 2.2526710402999064, + "grad_norm": 52259.875, + "learning_rate": 3.444456132503098e-05, + "loss": 2.1897, + "step": 12018 + }, + { + "epoch": 2.252858481724461, + "grad_norm": 56652.75390625, + "learning_rate": 3.443709356079894e-05, + "loss": 2.072, + "step": 12019 + }, + { + "epoch": 2.253045923149016, + "grad_norm": 56027.45703125, + "learning_rate": 3.442962618095053e-05, + "loss": 2.2256, + "step": 12020 + }, + { + "epoch": 2.2532333645735707, + "grad_norm": 55663.05078125, + "learning_rate": 3.4422159185670156e-05, + "loss": 2.0867, + "step": 12021 + }, + { + "epoch": 2.2534208059981258, + "grad_norm": 57346.01171875, + "learning_rate": 3.441469257514226e-05, + "loss": 2.0713, + "step": 12022 + }, + { + "epoch": 2.2536082474226804, + "grad_norm": 51896.01953125, + "learning_rate": 3.440722634955125e-05, + "loss": 2.1027, + "step": 12023 + }, + { + "epoch": 2.2537956888472355, + "grad_norm": 52534.0703125, + "learning_rate": 3.4399760509081526e-05, + "loss": 2.1435, + "step": 12024 + }, + { + "epoch": 2.25398313027179, + "grad_norm": 57589.10546875, + "learning_rate": 3.439229505391748e-05, + "loss": 2.1307, + "step": 12025 + }, + { + "epoch": 2.2541705716963447, + "grad_norm": 54426.5859375, + "learning_rate": 3.4384829984243546e-05, + "loss": 2.2218, + "step": 12026 + }, + { + "epoch": 2.2543580131209, + "grad_norm": 52171.703125, + "learning_rate": 3.437736530024403e-05, + "loss": 2.0826, + "step": 12027 + }, + { + "epoch": 2.2545454545454544, + "grad_norm": 52688.36328125, + "learning_rate": 3.436990100210335e-05, + "loss": 2.1992, + "step": 12028 + }, + { + "epoch": 2.2547328959700095, + "grad_norm": 51045.7265625, + "learning_rate": 3.436243709000586e-05, + "loss": 2.1956, + "step": 12029 + }, + { + "epoch": 2.254920337394564, + "grad_norm": 52127.85546875, + "learning_rate": 3.4354973564135895e-05, + "loss": 2.1505, + "step": 12030 + }, + { + "epoch": 2.255107778819119, + "grad_norm": 54791.38671875, + "learning_rate": 3.434751042467779e-05, + "loss": 2.1543, + "step": 12031 + }, + { + "epoch": 2.255295220243674, + "grad_norm": 52839.28125, + "learning_rate": 3.4340047671815876e-05, + "loss": 2.1251, + "step": 12032 + }, + { + "epoch": 2.255482661668229, + "grad_norm": 51233.0703125, + "learning_rate": 3.433258530573452e-05, + "loss": 2.1383, + "step": 12033 + }, + { + "epoch": 2.2556701030927835, + "grad_norm": 57397.08984375, + "learning_rate": 3.4325123326617955e-05, + "loss": 2.1016, + "step": 12034 + }, + { + "epoch": 2.2558575445173386, + "grad_norm": 55206.2265625, + "learning_rate": 3.431766173465053e-05, + "loss": 2.1148, + "step": 12035 + }, + { + "epoch": 2.256044985941893, + "grad_norm": 53758.22265625, + "learning_rate": 3.431020053001654e-05, + "loss": 2.1069, + "step": 12036 + }, + { + "epoch": 2.256232427366448, + "grad_norm": 51413.4375, + "learning_rate": 3.4302739712900245e-05, + "loss": 2.2137, + "step": 12037 + }, + { + "epoch": 2.256419868791003, + "grad_norm": 51263.4765625, + "learning_rate": 3.429527928348593e-05, + "loss": 2.155, + "step": 12038 + }, + { + "epoch": 2.2566073102155575, + "grad_norm": 52821.94921875, + "learning_rate": 3.4287819241957854e-05, + "loss": 2.1841, + "step": 12039 + }, + { + "epoch": 2.2567947516401126, + "grad_norm": 53803.44921875, + "learning_rate": 3.4280359588500256e-05, + "loss": 2.1401, + "step": 12040 + }, + { + "epoch": 2.256982193064667, + "grad_norm": 54673.671875, + "learning_rate": 3.427290032329743e-05, + "loss": 2.121, + "step": 12041 + }, + { + "epoch": 2.2571696344892223, + "grad_norm": 50875.921875, + "learning_rate": 3.426544144653354e-05, + "loss": 2.0545, + "step": 12042 + }, + { + "epoch": 2.257357075913777, + "grad_norm": 54261.36328125, + "learning_rate": 3.4257982958392864e-05, + "loss": 2.1028, + "step": 12043 + }, + { + "epoch": 2.257544517338332, + "grad_norm": 52679.12109375, + "learning_rate": 3.425052485905961e-05, + "loss": 2.1099, + "step": 12044 + }, + { + "epoch": 2.2577319587628866, + "grad_norm": 52946.2890625, + "learning_rate": 3.424306714871795e-05, + "loss": 2.1269, + "step": 12045 + }, + { + "epoch": 2.2579194001874416, + "grad_norm": 55254.91015625, + "learning_rate": 3.423560982755212e-05, + "loss": 2.1957, + "step": 12046 + }, + { + "epoch": 2.2581068416119963, + "grad_norm": 56833.02734375, + "learning_rate": 3.422815289574627e-05, + "loss": 2.134, + "step": 12047 + }, + { + "epoch": 2.258294283036551, + "grad_norm": 57069.11328125, + "learning_rate": 3.422069635348462e-05, + "loss": 2.1192, + "step": 12048 + }, + { + "epoch": 2.258481724461106, + "grad_norm": 57236.53515625, + "learning_rate": 3.421324020095129e-05, + "loss": 2.1712, + "step": 12049 + }, + { + "epoch": 2.2586691658856606, + "grad_norm": 53666.66796875, + "learning_rate": 3.420578443833046e-05, + "loss": 2.1684, + "step": 12050 + }, + { + "epoch": 2.2588566073102156, + "grad_norm": 48888.48828125, + "learning_rate": 3.419832906580629e-05, + "loss": 2.1501, + "step": 12051 + }, + { + "epoch": 2.2590440487347703, + "grad_norm": 53095.8828125, + "learning_rate": 3.4190874083562904e-05, + "loss": 2.1135, + "step": 12052 + }, + { + "epoch": 2.2592314901593253, + "grad_norm": 53204.296875, + "learning_rate": 3.418341949178442e-05, + "loss": 2.1572, + "step": 12053 + }, + { + "epoch": 2.25941893158388, + "grad_norm": 47767.34375, + "learning_rate": 3.4175965290654975e-05, + "loss": 2.173, + "step": 12054 + }, + { + "epoch": 2.259606373008435, + "grad_norm": 55569.546875, + "learning_rate": 3.416851148035866e-05, + "loss": 2.1008, + "step": 12055 + }, + { + "epoch": 2.2597938144329897, + "grad_norm": 55082.46875, + "learning_rate": 3.416105806107961e-05, + "loss": 2.0532, + "step": 12056 + }, + { + "epoch": 2.2599812558575447, + "grad_norm": 61420.109375, + "learning_rate": 3.4153605033001865e-05, + "loss": 2.0589, + "step": 12057 + }, + { + "epoch": 2.2601686972820993, + "grad_norm": 53705.0546875, + "learning_rate": 3.414615239630954e-05, + "loss": 2.1787, + "step": 12058 + }, + { + "epoch": 2.260356138706654, + "grad_norm": 57364.34375, + "learning_rate": 3.4138700151186706e-05, + "loss": 2.1843, + "step": 12059 + }, + { + "epoch": 2.260543580131209, + "grad_norm": 51572.7109375, + "learning_rate": 3.413124829781741e-05, + "loss": 2.1307, + "step": 12060 + }, + { + "epoch": 2.2607310215557637, + "grad_norm": 55160.37109375, + "learning_rate": 3.4123796836385705e-05, + "loss": 2.1621, + "step": 12061 + }, + { + "epoch": 2.2609184629803187, + "grad_norm": 56967.015625, + "learning_rate": 3.411634576707561e-05, + "loss": 2.0645, + "step": 12062 + }, + { + "epoch": 2.2611059044048734, + "grad_norm": 54183.5078125, + "learning_rate": 3.410889509007122e-05, + "loss": 2.0722, + "step": 12063 + }, + { + "epoch": 2.2612933458294284, + "grad_norm": 53191.890625, + "learning_rate": 3.4101444805556494e-05, + "loss": 2.1718, + "step": 12064 + }, + { + "epoch": 2.261480787253983, + "grad_norm": 53632.89453125, + "learning_rate": 3.409399491371548e-05, + "loss": 2.0826, + "step": 12065 + }, + { + "epoch": 2.261668228678538, + "grad_norm": 56328.5, + "learning_rate": 3.408654541473216e-05, + "loss": 2.1511, + "step": 12066 + }, + { + "epoch": 2.2618556701030927, + "grad_norm": 51965.72265625, + "learning_rate": 3.407909630879055e-05, + "loss": 2.1375, + "step": 12067 + }, + { + "epoch": 2.262043111527648, + "grad_norm": 52228.87109375, + "learning_rate": 3.4071647596074604e-05, + "loss": 2.1651, + "step": 12068 + }, + { + "epoch": 2.2622305529522024, + "grad_norm": 55955.671875, + "learning_rate": 3.4064199276768316e-05, + "loss": 2.1073, + "step": 12069 + }, + { + "epoch": 2.262417994376757, + "grad_norm": 49893.203125, + "learning_rate": 3.405675135105564e-05, + "loss": 2.1999, + "step": 12070 + }, + { + "epoch": 2.262605435801312, + "grad_norm": 55095.39453125, + "learning_rate": 3.4049303819120556e-05, + "loss": 2.0942, + "step": 12071 + }, + { + "epoch": 2.2627928772258668, + "grad_norm": 52667.9140625, + "learning_rate": 3.404185668114697e-05, + "loss": 2.0469, + "step": 12072 + }, + { + "epoch": 2.262980318650422, + "grad_norm": 54798.70703125, + "learning_rate": 3.4034409937318825e-05, + "loss": 2.0743, + "step": 12073 + }, + { + "epoch": 2.2631677600749764, + "grad_norm": 54000.54296875, + "learning_rate": 3.402696358782008e-05, + "loss": 2.0459, + "step": 12074 + }, + { + "epoch": 2.2633552014995315, + "grad_norm": 51167.77734375, + "learning_rate": 3.401951763283461e-05, + "loss": 2.1154, + "step": 12075 + }, + { + "epoch": 2.263542642924086, + "grad_norm": 55297.0625, + "learning_rate": 3.401207207254633e-05, + "loss": 2.0416, + "step": 12076 + }, + { + "epoch": 2.263730084348641, + "grad_norm": 52076.6796875, + "learning_rate": 3.4004626907139145e-05, + "loss": 2.1101, + "step": 12077 + }, + { + "epoch": 2.263917525773196, + "grad_norm": 54208.87890625, + "learning_rate": 3.399718213679695e-05, + "loss": 2.1296, + "step": 12078 + }, + { + "epoch": 2.264104967197751, + "grad_norm": 57248.05078125, + "learning_rate": 3.39897377617036e-05, + "loss": 2.1303, + "step": 12079 + }, + { + "epoch": 2.2642924086223055, + "grad_norm": 52747.33984375, + "learning_rate": 3.398229378204295e-05, + "loss": 2.1226, + "step": 12080 + }, + { + "epoch": 2.26447985004686, + "grad_norm": 56516.39453125, + "learning_rate": 3.39748501979989e-05, + "loss": 2.1247, + "step": 12081 + }, + { + "epoch": 2.264667291471415, + "grad_norm": 53036.78515625, + "learning_rate": 3.396740700975527e-05, + "loss": 2.2029, + "step": 12082 + }, + { + "epoch": 2.26485473289597, + "grad_norm": 59304.5234375, + "learning_rate": 3.39599642174959e-05, + "loss": 2.2403, + "step": 12083 + }, + { + "epoch": 2.265042174320525, + "grad_norm": 54889.85546875, + "learning_rate": 3.395252182140461e-05, + "loss": 2.1823, + "step": 12084 + }, + { + "epoch": 2.2652296157450795, + "grad_norm": 55467.58984375, + "learning_rate": 3.3945079821665236e-05, + "loss": 2.1292, + "step": 12085 + }, + { + "epoch": 2.2654170571696346, + "grad_norm": 53827.73046875, + "learning_rate": 3.393763821846158e-05, + "loss": 2.0957, + "step": 12086 + }, + { + "epoch": 2.265604498594189, + "grad_norm": 53988.61328125, + "learning_rate": 3.3930197011977406e-05, + "loss": 2.0908, + "step": 12087 + }, + { + "epoch": 2.2657919400187443, + "grad_norm": 55944.33203125, + "learning_rate": 3.3922756202396544e-05, + "loss": 2.178, + "step": 12088 + }, + { + "epoch": 2.265979381443299, + "grad_norm": 60514.296875, + "learning_rate": 3.3915315789902775e-05, + "loss": 2.0977, + "step": 12089 + }, + { + "epoch": 2.266166822867854, + "grad_norm": 53820.08984375, + "learning_rate": 3.390787577467984e-05, + "loss": 2.117, + "step": 12090 + }, + { + "epoch": 2.2663542642924086, + "grad_norm": 54921.99609375, + "learning_rate": 3.390043615691151e-05, + "loss": 2.1166, + "step": 12091 + }, + { + "epoch": 2.2665417057169632, + "grad_norm": 57318.48046875, + "learning_rate": 3.389299693678153e-05, + "loss": 2.1404, + "step": 12092 + }, + { + "epoch": 2.2667291471415183, + "grad_norm": 55593.58203125, + "learning_rate": 3.388555811447366e-05, + "loss": 2.1091, + "step": 12093 + }, + { + "epoch": 2.266916588566073, + "grad_norm": 56127.296875, + "learning_rate": 3.3878119690171595e-05, + "loss": 2.1381, + "step": 12094 + }, + { + "epoch": 2.267104029990628, + "grad_norm": 53647.42578125, + "learning_rate": 3.387068166405906e-05, + "loss": 2.043, + "step": 12095 + }, + { + "epoch": 2.2672914714151826, + "grad_norm": 55868.0234375, + "learning_rate": 3.3863244036319795e-05, + "loss": 2.1599, + "step": 12096 + }, + { + "epoch": 2.2674789128397377, + "grad_norm": 54813.140625, + "learning_rate": 3.385580680713748e-05, + "loss": 2.2022, + "step": 12097 + }, + { + "epoch": 2.2676663542642923, + "grad_norm": 57787.8359375, + "learning_rate": 3.3848369976695803e-05, + "loss": 2.0463, + "step": 12098 + }, + { + "epoch": 2.2678537956888474, + "grad_norm": 51883.5625, + "learning_rate": 3.3840933545178445e-05, + "loss": 2.2144, + "step": 12099 + }, + { + "epoch": 2.268041237113402, + "grad_norm": 53554.1640625, + "learning_rate": 3.3833497512769086e-05, + "loss": 2.1435, + "step": 12100 + }, + { + "epoch": 2.268228678537957, + "grad_norm": 59350.09765625, + "learning_rate": 3.382606187965137e-05, + "loss": 2.0744, + "step": 12101 + }, + { + "epoch": 2.2684161199625117, + "grad_norm": 55434.69921875, + "learning_rate": 3.381862664600894e-05, + "loss": 2.136, + "step": 12102 + }, + { + "epoch": 2.2686035613870663, + "grad_norm": 49097.79296875, + "learning_rate": 3.3811191812025477e-05, + "loss": 2.1268, + "step": 12103 + }, + { + "epoch": 2.2687910028116214, + "grad_norm": 56599.94921875, + "learning_rate": 3.3803757377884585e-05, + "loss": 2.1438, + "step": 12104 + }, + { + "epoch": 2.268978444236176, + "grad_norm": 56076.33203125, + "learning_rate": 3.3796323343769883e-05, + "loss": 2.1815, + "step": 12105 + }, + { + "epoch": 2.269165885660731, + "grad_norm": 60173.1171875, + "learning_rate": 3.378888970986498e-05, + "loss": 2.1468, + "step": 12106 + }, + { + "epoch": 2.2693533270852857, + "grad_norm": 56563.7890625, + "learning_rate": 3.378145647635349e-05, + "loss": 2.1298, + "step": 12107 + }, + { + "epoch": 2.2695407685098408, + "grad_norm": 51296.91015625, + "learning_rate": 3.377402364341901e-05, + "loss": 2.1035, + "step": 12108 + }, + { + "epoch": 2.2697282099343954, + "grad_norm": 57115.2265625, + "learning_rate": 3.376659121124508e-05, + "loss": 2.1328, + "step": 12109 + }, + { + "epoch": 2.2699156513589505, + "grad_norm": 50559.1015625, + "learning_rate": 3.375915918001531e-05, + "loss": 2.1381, + "step": 12110 + }, + { + "epoch": 2.270103092783505, + "grad_norm": 49107.33984375, + "learning_rate": 3.375172754991325e-05, + "loss": 2.1117, + "step": 12111 + }, + { + "epoch": 2.27029053420806, + "grad_norm": 61043.30859375, + "learning_rate": 3.374429632112247e-05, + "loss": 2.1179, + "step": 12112 + }, + { + "epoch": 2.2704779756326148, + "grad_norm": 48945.5390625, + "learning_rate": 3.373686549382648e-05, + "loss": 2.1145, + "step": 12113 + }, + { + "epoch": 2.2706654170571694, + "grad_norm": 55165.390625, + "learning_rate": 3.372943506820883e-05, + "loss": 2.1443, + "step": 12114 + }, + { + "epoch": 2.2708528584817245, + "grad_norm": 55261.90625, + "learning_rate": 3.372200504445305e-05, + "loss": 2.2006, + "step": 12115 + }, + { + "epoch": 2.2710402999062795, + "grad_norm": 52816.46875, + "learning_rate": 3.371457542274263e-05, + "loss": 2.1711, + "step": 12116 + }, + { + "epoch": 2.271227741330834, + "grad_norm": 56108.06640625, + "learning_rate": 3.3707146203261074e-05, + "loss": 2.1578, + "step": 12117 + }, + { + "epoch": 2.271415182755389, + "grad_norm": 63142.203125, + "learning_rate": 3.3699717386191884e-05, + "loss": 2.2206, + "step": 12118 + }, + { + "epoch": 2.271602624179944, + "grad_norm": 57713.140625, + "learning_rate": 3.3692288971718565e-05, + "loss": 2.1321, + "step": 12119 + }, + { + "epoch": 2.2717900656044985, + "grad_norm": 54856.66796875, + "learning_rate": 3.368486096002454e-05, + "loss": 2.0929, + "step": 12120 + }, + { + "epoch": 2.2719775070290535, + "grad_norm": 55628.6953125, + "learning_rate": 3.36774333512933e-05, + "loss": 2.1194, + "step": 12121 + }, + { + "epoch": 2.272164948453608, + "grad_norm": 59142.00390625, + "learning_rate": 3.36700061457083e-05, + "loss": 2.1614, + "step": 12122 + }, + { + "epoch": 2.2723523898781632, + "grad_norm": 55529.00390625, + "learning_rate": 3.366257934345298e-05, + "loss": 2.1305, + "step": 12123 + }, + { + "epoch": 2.272539831302718, + "grad_norm": 52971.9765625, + "learning_rate": 3.3655152944710765e-05, + "loss": 2.1085, + "step": 12124 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 51741.97265625, + "learning_rate": 3.364772694966507e-05, + "loss": 2.1643, + "step": 12125 + }, + { + "epoch": 2.2729147141518276, + "grad_norm": 54522.40625, + "learning_rate": 3.364030135849935e-05, + "loss": 2.1418, + "step": 12126 + }, + { + "epoch": 2.2731021555763826, + "grad_norm": 58540.8046875, + "learning_rate": 3.3632876171396936e-05, + "loss": 2.1971, + "step": 12127 + }, + { + "epoch": 2.2732895970009372, + "grad_norm": 48503.9140625, + "learning_rate": 3.3625451388541275e-05, + "loss": 2.1069, + "step": 12128 + }, + { + "epoch": 2.273477038425492, + "grad_norm": 55278.29296875, + "learning_rate": 3.361802701011574e-05, + "loss": 2.1373, + "step": 12129 + }, + { + "epoch": 2.273664479850047, + "grad_norm": 51559.0546875, + "learning_rate": 3.36106030363037e-05, + "loss": 2.1001, + "step": 12130 + }, + { + "epoch": 2.2738519212746016, + "grad_norm": 51969.25, + "learning_rate": 3.3603179467288514e-05, + "loss": 2.1415, + "step": 12131 + }, + { + "epoch": 2.2740393626991566, + "grad_norm": 56133.87109375, + "learning_rate": 3.359575630325354e-05, + "loss": 2.196, + "step": 12132 + }, + { + "epoch": 2.2742268041237113, + "grad_norm": 50664.375, + "learning_rate": 3.358833354438209e-05, + "loss": 2.1514, + "step": 12133 + }, + { + "epoch": 2.2744142455482663, + "grad_norm": 50812.6875, + "learning_rate": 3.358091119085757e-05, + "loss": 2.1493, + "step": 12134 + }, + { + "epoch": 2.274601686972821, + "grad_norm": 60858.8671875, + "learning_rate": 3.357348924286321e-05, + "loss": 2.0742, + "step": 12135 + }, + { + "epoch": 2.2747891283973756, + "grad_norm": 53901.015625, + "learning_rate": 3.356606770058239e-05, + "loss": 2.0821, + "step": 12136 + }, + { + "epoch": 2.2749765698219306, + "grad_norm": 56459.5859375, + "learning_rate": 3.355864656419839e-05, + "loss": 2.0711, + "step": 12137 + }, + { + "epoch": 2.2751640112464857, + "grad_norm": 53453.66015625, + "learning_rate": 3.3551225833894504e-05, + "loss": 2.2145, + "step": 12138 + }, + { + "epoch": 2.2753514526710403, + "grad_norm": 49971.8203125, + "learning_rate": 3.3543805509854016e-05, + "loss": 2.1306, + "step": 12139 + }, + { + "epoch": 2.275538894095595, + "grad_norm": 57210.15625, + "learning_rate": 3.353638559226018e-05, + "loss": 2.1719, + "step": 12140 + }, + { + "epoch": 2.27572633552015, + "grad_norm": 55308.30078125, + "learning_rate": 3.352896608129631e-05, + "loss": 2.1549, + "step": 12141 + }, + { + "epoch": 2.2759137769447046, + "grad_norm": 49467.93359375, + "learning_rate": 3.3521546977145583e-05, + "loss": 2.1462, + "step": 12142 + }, + { + "epoch": 2.2761012183692597, + "grad_norm": 54875.62109375, + "learning_rate": 3.3514128279991296e-05, + "loss": 2.1083, + "step": 12143 + }, + { + "epoch": 2.2762886597938143, + "grad_norm": 50727.890625, + "learning_rate": 3.350670999001667e-05, + "loss": 2.113, + "step": 12144 + }, + { + "epoch": 2.2764761012183694, + "grad_norm": 53100.7578125, + "learning_rate": 3.349929210740493e-05, + "loss": 2.0773, + "step": 12145 + }, + { + "epoch": 2.276663542642924, + "grad_norm": 54530.71875, + "learning_rate": 3.349187463233927e-05, + "loss": 2.1276, + "step": 12146 + }, + { + "epoch": 2.276850984067479, + "grad_norm": 52000.76171875, + "learning_rate": 3.34844575650029e-05, + "loss": 2.0882, + "step": 12147 + }, + { + "epoch": 2.2770384254920337, + "grad_norm": 48139.7578125, + "learning_rate": 3.347704090557902e-05, + "loss": 2.1532, + "step": 12148 + }, + { + "epoch": 2.277225866916589, + "grad_norm": 54182.4921875, + "learning_rate": 3.346962465425082e-05, + "loss": 2.1292, + "step": 12149 + }, + { + "epoch": 2.2774133083411434, + "grad_norm": 52642.30078125, + "learning_rate": 3.346220881120144e-05, + "loss": 2.1374, + "step": 12150 + }, + { + "epoch": 2.277600749765698, + "grad_norm": 58864.2265625, + "learning_rate": 3.345479337661407e-05, + "loss": 2.1794, + "step": 12151 + }, + { + "epoch": 2.277788191190253, + "grad_norm": 58937.6640625, + "learning_rate": 3.3447378350671844e-05, + "loss": 2.1659, + "step": 12152 + }, + { + "epoch": 2.2779756326148077, + "grad_norm": 52258.5703125, + "learning_rate": 3.343996373355793e-05, + "loss": 2.1023, + "step": 12153 + }, + { + "epoch": 2.278163074039363, + "grad_norm": 58204.296875, + "learning_rate": 3.343254952545543e-05, + "loss": 2.118, + "step": 12154 + }, + { + "epoch": 2.2783505154639174, + "grad_norm": 51694.33984375, + "learning_rate": 3.3425135726547466e-05, + "loss": 2.113, + "step": 12155 + }, + { + "epoch": 2.2785379568884725, + "grad_norm": 52258.33984375, + "learning_rate": 3.341772233701719e-05, + "loss": 2.1465, + "step": 12156 + }, + { + "epoch": 2.278725398313027, + "grad_norm": 51329.296875, + "learning_rate": 3.3410309357047656e-05, + "loss": 2.1358, + "step": 12157 + }, + { + "epoch": 2.278912839737582, + "grad_norm": 56536.55078125, + "learning_rate": 3.3402896786821957e-05, + "loss": 2.1505, + "step": 12158 + }, + { + "epoch": 2.279100281162137, + "grad_norm": 55497.3359375, + "learning_rate": 3.33954846265232e-05, + "loss": 2.1481, + "step": 12159 + }, + { + "epoch": 2.279287722586692, + "grad_norm": 54968.90234375, + "learning_rate": 3.3388072876334456e-05, + "loss": 2.1763, + "step": 12160 + }, + { + "epoch": 2.2794751640112465, + "grad_norm": 51805.24609375, + "learning_rate": 3.338066153643876e-05, + "loss": 2.1306, + "step": 12161 + }, + { + "epoch": 2.279662605435801, + "grad_norm": 57595.671875, + "learning_rate": 3.3373250607019187e-05, + "loss": 2.1096, + "step": 12162 + }, + { + "epoch": 2.279850046860356, + "grad_norm": 56463.08203125, + "learning_rate": 3.336584008825875e-05, + "loss": 2.1464, + "step": 12163 + }, + { + "epoch": 2.280037488284911, + "grad_norm": 56304.0234375, + "learning_rate": 3.335842998034052e-05, + "loss": 2.1468, + "step": 12164 + }, + { + "epoch": 2.280224929709466, + "grad_norm": 55919.8984375, + "learning_rate": 3.3351020283447474e-05, + "loss": 2.1655, + "step": 12165 + }, + { + "epoch": 2.2804123711340205, + "grad_norm": 54160.8828125, + "learning_rate": 3.334361099776265e-05, + "loss": 2.0752, + "step": 12166 + }, + { + "epoch": 2.2805998125585756, + "grad_norm": 53477.0234375, + "learning_rate": 3.333620212346903e-05, + "loss": 2.1141, + "step": 12167 + }, + { + "epoch": 2.28078725398313, + "grad_norm": 55150.92578125, + "learning_rate": 3.332879366074963e-05, + "loss": 2.1712, + "step": 12168 + }, + { + "epoch": 2.2809746954076853, + "grad_norm": 55689.8046875, + "learning_rate": 3.33213856097874e-05, + "loss": 2.2193, + "step": 12169 + }, + { + "epoch": 2.28116213683224, + "grad_norm": 53785.62109375, + "learning_rate": 3.331397797076533e-05, + "loss": 2.2693, + "step": 12170 + }, + { + "epoch": 2.281349578256795, + "grad_norm": 55953.48828125, + "learning_rate": 3.330657074386637e-05, + "loss": 2.0966, + "step": 12171 + }, + { + "epoch": 2.2815370196813496, + "grad_norm": 56874.41796875, + "learning_rate": 3.329916392927346e-05, + "loss": 2.1429, + "step": 12172 + }, + { + "epoch": 2.281724461105904, + "grad_norm": 58143.7734375, + "learning_rate": 3.329175752716954e-05, + "loss": 2.1522, + "step": 12173 + }, + { + "epoch": 2.2819119025304593, + "grad_norm": 54969.4375, + "learning_rate": 3.3284351537737554e-05, + "loss": 2.1461, + "step": 12174 + }, + { + "epoch": 2.282099343955014, + "grad_norm": 57585.80078125, + "learning_rate": 3.3276945961160435e-05, + "loss": 2.1206, + "step": 12175 + }, + { + "epoch": 2.282286785379569, + "grad_norm": 53634.234375, + "learning_rate": 3.326954079762104e-05, + "loss": 2.1023, + "step": 12176 + }, + { + "epoch": 2.2824742268041236, + "grad_norm": 54730.21875, + "learning_rate": 3.326213604730232e-05, + "loss": 2.2175, + "step": 12177 + }, + { + "epoch": 2.2826616682286787, + "grad_norm": 52239.609375, + "learning_rate": 3.325473171038711e-05, + "loss": 2.181, + "step": 12178 + }, + { + "epoch": 2.2828491096532333, + "grad_norm": 53667.59765625, + "learning_rate": 3.324732778705835e-05, + "loss": 2.1332, + "step": 12179 + }, + { + "epoch": 2.2830365510777884, + "grad_norm": 53963.40625, + "learning_rate": 3.323992427749884e-05, + "loss": 2.1937, + "step": 12180 + }, + { + "epoch": 2.283223992502343, + "grad_norm": 58889.94140625, + "learning_rate": 3.3232521181891484e-05, + "loss": 2.1668, + "step": 12181 + }, + { + "epoch": 2.283411433926898, + "grad_norm": 53668.45703125, + "learning_rate": 3.322511850041913e-05, + "loss": 2.1083, + "step": 12182 + }, + { + "epoch": 2.2835988753514527, + "grad_norm": 54633.3671875, + "learning_rate": 3.321771623326458e-05, + "loss": 2.1868, + "step": 12183 + }, + { + "epoch": 2.2837863167760073, + "grad_norm": 55738.6796875, + "learning_rate": 3.321031438061069e-05, + "loss": 2.1876, + "step": 12184 + }, + { + "epoch": 2.2839737582005624, + "grad_norm": 55270.42578125, + "learning_rate": 3.3202912942640266e-05, + "loss": 2.1643, + "step": 12185 + }, + { + "epoch": 2.284161199625117, + "grad_norm": 57093.06640625, + "learning_rate": 3.319551191953612e-05, + "loss": 2.127, + "step": 12186 + }, + { + "epoch": 2.284348641049672, + "grad_norm": 55505.9609375, + "learning_rate": 3.318811131148104e-05, + "loss": 2.1828, + "step": 12187 + }, + { + "epoch": 2.2845360824742267, + "grad_norm": 53731.59375, + "learning_rate": 3.318071111865779e-05, + "loss": 2.0737, + "step": 12188 + }, + { + "epoch": 2.2847235238987817, + "grad_norm": 53936.07421875, + "learning_rate": 3.317331134124919e-05, + "loss": 2.0859, + "step": 12189 + }, + { + "epoch": 2.2849109653233364, + "grad_norm": 49648.30078125, + "learning_rate": 3.316591197943799e-05, + "loss": 2.1082, + "step": 12190 + }, + { + "epoch": 2.2850984067478914, + "grad_norm": 58251.98046875, + "learning_rate": 3.3158513033406925e-05, + "loss": 2.1909, + "step": 12191 + }, + { + "epoch": 2.285285848172446, + "grad_norm": 51220.609375, + "learning_rate": 3.315111450333876e-05, + "loss": 2.2078, + "step": 12192 + }, + { + "epoch": 2.285473289597001, + "grad_norm": 56901.47265625, + "learning_rate": 3.3143716389416204e-05, + "loss": 2.1854, + "step": 12193 + }, + { + "epoch": 2.2856607310215558, + "grad_norm": 52122.2421875, + "learning_rate": 3.313631869182203e-05, + "loss": 2.0786, + "step": 12194 + }, + { + "epoch": 2.2858481724461104, + "grad_norm": 51181.62109375, + "learning_rate": 3.31289214107389e-05, + "loss": 2.145, + "step": 12195 + }, + { + "epoch": 2.2860356138706655, + "grad_norm": 52057.26953125, + "learning_rate": 3.3121524546349536e-05, + "loss": 2.1033, + "step": 12196 + }, + { + "epoch": 2.28622305529522, + "grad_norm": 50732.55859375, + "learning_rate": 3.311412809883665e-05, + "loss": 2.1962, + "step": 12197 + }, + { + "epoch": 2.286410496719775, + "grad_norm": 51705.54296875, + "learning_rate": 3.3106732068382894e-05, + "loss": 2.1386, + "step": 12198 + }, + { + "epoch": 2.2865979381443298, + "grad_norm": 54941.15625, + "learning_rate": 3.309933645517095e-05, + "loss": 2.1247, + "step": 12199 + }, + { + "epoch": 2.286785379568885, + "grad_norm": 56647.453125, + "learning_rate": 3.309194125938349e-05, + "loss": 2.1864, + "step": 12200 + }, + { + "epoch": 2.2869728209934395, + "grad_norm": 51845.296875, + "learning_rate": 3.308454648120317e-05, + "loss": 2.123, + "step": 12201 + }, + { + "epoch": 2.2871602624179945, + "grad_norm": 52072.578125, + "learning_rate": 3.3077152120812605e-05, + "loss": 2.1523, + "step": 12202 + }, + { + "epoch": 2.287347703842549, + "grad_norm": 52273.6953125, + "learning_rate": 3.3069758178394434e-05, + "loss": 2.1211, + "step": 12203 + }, + { + "epoch": 2.287535145267104, + "grad_norm": 51071.484375, + "learning_rate": 3.3062364654131296e-05, + "loss": 2.1457, + "step": 12204 + }, + { + "epoch": 2.287722586691659, + "grad_norm": 52063.8203125, + "learning_rate": 3.3054971548205804e-05, + "loss": 2.0813, + "step": 12205 + }, + { + "epoch": 2.2879100281162135, + "grad_norm": 52564.6484375, + "learning_rate": 3.304757886080053e-05, + "loss": 2.1294, + "step": 12206 + }, + { + "epoch": 2.2880974695407685, + "grad_norm": 53336.98046875, + "learning_rate": 3.304018659209808e-05, + "loss": 2.1712, + "step": 12207 + }, + { + "epoch": 2.288284910965323, + "grad_norm": 54512.79296875, + "learning_rate": 3.303279474228103e-05, + "loss": 2.0984, + "step": 12208 + }, + { + "epoch": 2.2884723523898782, + "grad_norm": 55880.16015625, + "learning_rate": 3.302540331153197e-05, + "loss": 2.1467, + "step": 12209 + }, + { + "epoch": 2.288659793814433, + "grad_norm": 58759.87890625, + "learning_rate": 3.3018012300033416e-05, + "loss": 2.1513, + "step": 12210 + }, + { + "epoch": 2.288847235238988, + "grad_norm": 55927.3046875, + "learning_rate": 3.301062170796795e-05, + "loss": 2.1218, + "step": 12211 + }, + { + "epoch": 2.2890346766635425, + "grad_norm": 53721.9921875, + "learning_rate": 3.300323153551812e-05, + "loss": 2.1974, + "step": 12212 + }, + { + "epoch": 2.2892221180880976, + "grad_norm": 55334.9453125, + "learning_rate": 3.29958417828664e-05, + "loss": 2.2243, + "step": 12213 + }, + { + "epoch": 2.2894095595126522, + "grad_norm": 55597.69140625, + "learning_rate": 3.298845245019536e-05, + "loss": 2.1429, + "step": 12214 + }, + { + "epoch": 2.2895970009372073, + "grad_norm": 56718.1484375, + "learning_rate": 3.298106353768748e-05, + "loss": 2.1722, + "step": 12215 + }, + { + "epoch": 2.289784442361762, + "grad_norm": 54004.26171875, + "learning_rate": 3.297367504552529e-05, + "loss": 2.167, + "step": 12216 + }, + { + "epoch": 2.2899718837863166, + "grad_norm": 58478.35546875, + "learning_rate": 3.296628697389123e-05, + "loss": 2.1505, + "step": 12217 + }, + { + "epoch": 2.2901593252108716, + "grad_norm": 53421.8671875, + "learning_rate": 3.2958899322967785e-05, + "loss": 2.0354, + "step": 12218 + }, + { + "epoch": 2.2903467666354262, + "grad_norm": 53144.50390625, + "learning_rate": 3.295151209293745e-05, + "loss": 2.1564, + "step": 12219 + }, + { + "epoch": 2.2905342080599813, + "grad_norm": 51331.484375, + "learning_rate": 3.2944125283982666e-05, + "loss": 2.1144, + "step": 12220 + }, + { + "epoch": 2.290721649484536, + "grad_norm": 60455.1640625, + "learning_rate": 3.293673889628587e-05, + "loss": 2.1436, + "step": 12221 + }, + { + "epoch": 2.290909090909091, + "grad_norm": 51634.8515625, + "learning_rate": 3.29293529300295e-05, + "loss": 2.0832, + "step": 12222 + }, + { + "epoch": 2.2910965323336456, + "grad_norm": 52102.8046875, + "learning_rate": 3.292196738539598e-05, + "loss": 2.1222, + "step": 12223 + }, + { + "epoch": 2.2912839737582007, + "grad_norm": 53862.9296875, + "learning_rate": 3.291458226256773e-05, + "loss": 2.0737, + "step": 12224 + }, + { + "epoch": 2.2914714151827553, + "grad_norm": 51952.61328125, + "learning_rate": 3.290719756172714e-05, + "loss": 2.1641, + "step": 12225 + }, + { + "epoch": 2.2916588566073104, + "grad_norm": 54527.7265625, + "learning_rate": 3.289981328305659e-05, + "loss": 2.109, + "step": 12226 + }, + { + "epoch": 2.291846298031865, + "grad_norm": 54547.60546875, + "learning_rate": 3.289242942673852e-05, + "loss": 2.1765, + "step": 12227 + }, + { + "epoch": 2.2920337394564196, + "grad_norm": 55590.92578125, + "learning_rate": 3.2885045992955234e-05, + "loss": 2.1849, + "step": 12228 + }, + { + "epoch": 2.2922211808809747, + "grad_norm": 54890.51953125, + "learning_rate": 3.287766298188913e-05, + "loss": 2.1675, + "step": 12229 + }, + { + "epoch": 2.2924086223055293, + "grad_norm": 53167.7421875, + "learning_rate": 3.287028039372255e-05, + "loss": 2.1236, + "step": 12230 + }, + { + "epoch": 2.2925960637300844, + "grad_norm": 53255.83984375, + "learning_rate": 3.286289822863785e-05, + "loss": 2.1513, + "step": 12231 + }, + { + "epoch": 2.292783505154639, + "grad_norm": 53383.046875, + "learning_rate": 3.285551648681733e-05, + "loss": 2.1589, + "step": 12232 + }, + { + "epoch": 2.292970946579194, + "grad_norm": 59663.28515625, + "learning_rate": 3.2848135168443314e-05, + "loss": 2.1151, + "step": 12233 + }, + { + "epoch": 2.2931583880037487, + "grad_norm": 54310.0, + "learning_rate": 3.284075427369814e-05, + "loss": 2.1078, + "step": 12234 + }, + { + "epoch": 2.293345829428304, + "grad_norm": 55073.3046875, + "learning_rate": 3.283337380276409e-05, + "loss": 2.0975, + "step": 12235 + }, + { + "epoch": 2.2935332708528584, + "grad_norm": 52399.76171875, + "learning_rate": 3.282599375582344e-05, + "loss": 2.0809, + "step": 12236 + }, + { + "epoch": 2.2937207122774135, + "grad_norm": 57546.03125, + "learning_rate": 3.281861413305848e-05, + "loss": 2.1753, + "step": 12237 + }, + { + "epoch": 2.293908153701968, + "grad_norm": 54805.671875, + "learning_rate": 3.281123493465148e-05, + "loss": 2.2121, + "step": 12238 + }, + { + "epoch": 2.2940955951265227, + "grad_norm": 52563.56640625, + "learning_rate": 3.280385616078469e-05, + "loss": 2.1294, + "step": 12239 + }, + { + "epoch": 2.294283036551078, + "grad_norm": 50042.96484375, + "learning_rate": 3.279647781164035e-05, + "loss": 2.1838, + "step": 12240 + }, + { + "epoch": 2.294470477975633, + "grad_norm": 53726.23046875, + "learning_rate": 3.27890998874007e-05, + "loss": 2.1573, + "step": 12241 + }, + { + "epoch": 2.2946579194001875, + "grad_norm": 55422.26953125, + "learning_rate": 3.278172238824799e-05, + "loss": 2.1457, + "step": 12242 + }, + { + "epoch": 2.294845360824742, + "grad_norm": 51937.4453125, + "learning_rate": 3.27743453143644e-05, + "loss": 2.1368, + "step": 12243 + }, + { + "epoch": 2.295032802249297, + "grad_norm": 48843.7109375, + "learning_rate": 3.276696866593214e-05, + "loss": 2.0705, + "step": 12244 + }, + { + "epoch": 2.295220243673852, + "grad_norm": 49773.06640625, + "learning_rate": 3.275959244313341e-05, + "loss": 2.0928, + "step": 12245 + }, + { + "epoch": 2.295407685098407, + "grad_norm": 51289.72265625, + "learning_rate": 3.2752216646150415e-05, + "loss": 2.1028, + "step": 12246 + }, + { + "epoch": 2.2955951265229615, + "grad_norm": 51908.10546875, + "learning_rate": 3.274484127516529e-05, + "loss": 2.1771, + "step": 12247 + }, + { + "epoch": 2.2957825679475166, + "grad_norm": 51075.29296875, + "learning_rate": 3.2737466330360213e-05, + "loss": 2.0986, + "step": 12248 + }, + { + "epoch": 2.295970009372071, + "grad_norm": 55950.7109375, + "learning_rate": 3.273009181191734e-05, + "loss": 2.2263, + "step": 12249 + }, + { + "epoch": 2.296157450796626, + "grad_norm": 54081.1171875, + "learning_rate": 3.272271772001882e-05, + "loss": 2.1508, + "step": 12250 + }, + { + "epoch": 2.296344892221181, + "grad_norm": 57713.0390625, + "learning_rate": 3.271534405484676e-05, + "loss": 2.1061, + "step": 12251 + }, + { + "epoch": 2.296532333645736, + "grad_norm": 57793.70703125, + "learning_rate": 3.270797081658329e-05, + "loss": 2.1922, + "step": 12252 + }, + { + "epoch": 2.2967197750702906, + "grad_norm": 54044.13671875, + "learning_rate": 3.270059800541053e-05, + "loss": 2.1375, + "step": 12253 + }, + { + "epoch": 2.296907216494845, + "grad_norm": 54970.0625, + "learning_rate": 3.269322562151057e-05, + "loss": 2.1637, + "step": 12254 + }, + { + "epoch": 2.2970946579194003, + "grad_norm": 54294.84765625, + "learning_rate": 3.268585366506549e-05, + "loss": 2.1533, + "step": 12255 + }, + { + "epoch": 2.297282099343955, + "grad_norm": 52333.703125, + "learning_rate": 3.267848213625737e-05, + "loss": 2.122, + "step": 12256 + }, + { + "epoch": 2.29746954076851, + "grad_norm": 51217.7109375, + "learning_rate": 3.2671111035268314e-05, + "loss": 2.0805, + "step": 12257 + }, + { + "epoch": 2.2976569821930646, + "grad_norm": 55899.90625, + "learning_rate": 3.2663740362280315e-05, + "loss": 2.17, + "step": 12258 + }, + { + "epoch": 2.2978444236176196, + "grad_norm": 48559.421875, + "learning_rate": 3.2656370117475466e-05, + "loss": 2.1317, + "step": 12259 + }, + { + "epoch": 2.2980318650421743, + "grad_norm": 56231.2578125, + "learning_rate": 3.2649000301035786e-05, + "loss": 2.1194, + "step": 12260 + }, + { + "epoch": 2.2982193064667293, + "grad_norm": 51334.8671875, + "learning_rate": 3.264163091314332e-05, + "loss": 2.1764, + "step": 12261 + }, + { + "epoch": 2.298406747891284, + "grad_norm": 52493.3125, + "learning_rate": 3.2634261953980046e-05, + "loss": 2.1094, + "step": 12262 + }, + { + "epoch": 2.298594189315839, + "grad_norm": 55593.9921875, + "learning_rate": 3.2626893423727986e-05, + "loss": 2.1741, + "step": 12263 + }, + { + "epoch": 2.2987816307403937, + "grad_norm": 52091.6640625, + "learning_rate": 3.261952532256913e-05, + "loss": 2.1005, + "step": 12264 + }, + { + "epoch": 2.2989690721649483, + "grad_norm": 55333.97265625, + "learning_rate": 3.2612157650685494e-05, + "loss": 2.1892, + "step": 12265 + }, + { + "epoch": 2.2991565135895033, + "grad_norm": 57509.64453125, + "learning_rate": 3.260479040825899e-05, + "loss": 2.0715, + "step": 12266 + }, + { + "epoch": 2.299343955014058, + "grad_norm": 50692.98046875, + "learning_rate": 3.259742359547161e-05, + "loss": 2.1231, + "step": 12267 + }, + { + "epoch": 2.299531396438613, + "grad_norm": 52595.38671875, + "learning_rate": 3.259005721250532e-05, + "loss": 2.1919, + "step": 12268 + }, + { + "epoch": 2.2997188378631677, + "grad_norm": 56706.6484375, + "learning_rate": 3.258269125954204e-05, + "loss": 2.1594, + "step": 12269 + }, + { + "epoch": 2.2999062792877227, + "grad_norm": 53999.609375, + "learning_rate": 3.25753257367637e-05, + "loss": 2.1059, + "step": 12270 + }, + { + "epoch": 2.3000937207122774, + "grad_norm": 51863.17578125, + "learning_rate": 3.2567960644352215e-05, + "loss": 2.0996, + "step": 12271 + }, + { + "epoch": 2.3002811621368324, + "grad_norm": 49956.8984375, + "learning_rate": 3.256059598248953e-05, + "loss": 2.1579, + "step": 12272 + }, + { + "epoch": 2.300468603561387, + "grad_norm": 51296.8046875, + "learning_rate": 3.255323175135748e-05, + "loss": 2.1532, + "step": 12273 + }, + { + "epoch": 2.300656044985942, + "grad_norm": 51153.66015625, + "learning_rate": 3.2545867951138e-05, + "loss": 2.2035, + "step": 12274 + }, + { + "epoch": 2.3008434864104967, + "grad_norm": 55993.5, + "learning_rate": 3.253850458201295e-05, + "loss": 2.135, + "step": 12275 + }, + { + "epoch": 2.3010309278350514, + "grad_norm": 56089.06640625, + "learning_rate": 3.253114164416421e-05, + "loss": 2.1449, + "step": 12276 + }, + { + "epoch": 2.3012183692596064, + "grad_norm": 54003.17578125, + "learning_rate": 3.252377913777361e-05, + "loss": 2.083, + "step": 12277 + }, + { + "epoch": 2.301405810684161, + "grad_norm": 54981.859375, + "learning_rate": 3.251641706302301e-05, + "loss": 2.1355, + "step": 12278 + }, + { + "epoch": 2.301593252108716, + "grad_norm": 57173.99609375, + "learning_rate": 3.250905542009425e-05, + "loss": 2.0762, + "step": 12279 + }, + { + "epoch": 2.3017806935332707, + "grad_norm": 51622.6796875, + "learning_rate": 3.250169420916913e-05, + "loss": 2.1695, + "step": 12280 + }, + { + "epoch": 2.301968134957826, + "grad_norm": 55440.21484375, + "learning_rate": 3.249433343042947e-05, + "loss": 2.0832, + "step": 12281 + }, + { + "epoch": 2.3021555763823804, + "grad_norm": 53537.43359375, + "learning_rate": 3.248697308405709e-05, + "loss": 2.1702, + "step": 12282 + }, + { + "epoch": 2.3023430178069355, + "grad_norm": 54242.2890625, + "learning_rate": 3.247961317023378e-05, + "loss": 2.1476, + "step": 12283 + }, + { + "epoch": 2.30253045923149, + "grad_norm": 52112.703125, + "learning_rate": 3.247225368914129e-05, + "loss": 2.1709, + "step": 12284 + }, + { + "epoch": 2.302717900656045, + "grad_norm": 49110.18359375, + "learning_rate": 3.246489464096141e-05, + "loss": 2.1358, + "step": 12285 + }, + { + "epoch": 2.3029053420806, + "grad_norm": 59844.0625, + "learning_rate": 3.24575360258759e-05, + "loss": 2.1176, + "step": 12286 + }, + { + "epoch": 2.3030927835051545, + "grad_norm": 52361.98828125, + "learning_rate": 3.245017784406652e-05, + "loss": 2.1023, + "step": 12287 + }, + { + "epoch": 2.3032802249297095, + "grad_norm": 54458.5390625, + "learning_rate": 3.2442820095714976e-05, + "loss": 2.1169, + "step": 12288 + }, + { + "epoch": 2.303467666354264, + "grad_norm": 53702.2109375, + "learning_rate": 3.243546278100301e-05, + "loss": 2.0688, + "step": 12289 + }, + { + "epoch": 2.303655107778819, + "grad_norm": 60453.27734375, + "learning_rate": 3.242810590011235e-05, + "loss": 2.1324, + "step": 12290 + }, + { + "epoch": 2.303842549203374, + "grad_norm": 54409.15234375, + "learning_rate": 3.2420749453224704e-05, + "loss": 2.2499, + "step": 12291 + }, + { + "epoch": 2.304029990627929, + "grad_norm": 52408.94921875, + "learning_rate": 3.241339344052174e-05, + "loss": 2.146, + "step": 12292 + }, + { + "epoch": 2.3042174320524835, + "grad_norm": 49513.078125, + "learning_rate": 3.2406037862185156e-05, + "loss": 2.1509, + "step": 12293 + }, + { + "epoch": 2.3044048734770386, + "grad_norm": 52081.109375, + "learning_rate": 3.239868271839664e-05, + "loss": 2.1249, + "step": 12294 + }, + { + "epoch": 2.304592314901593, + "grad_norm": 52131.59375, + "learning_rate": 3.2391328009337826e-05, + "loss": 2.093, + "step": 12295 + }, + { + "epoch": 2.3047797563261483, + "grad_norm": 52863.8125, + "learning_rate": 3.238397373519037e-05, + "loss": 2.1055, + "step": 12296 + }, + { + "epoch": 2.304967197750703, + "grad_norm": 51532.0390625, + "learning_rate": 3.237661989613594e-05, + "loss": 2.147, + "step": 12297 + }, + { + "epoch": 2.3051546391752575, + "grad_norm": 52964.76953125, + "learning_rate": 3.236926649235615e-05, + "loss": 2.0661, + "step": 12298 + }, + { + "epoch": 2.3053420805998126, + "grad_norm": 51672.2578125, + "learning_rate": 3.2361913524032615e-05, + "loss": 2.1344, + "step": 12299 + }, + { + "epoch": 2.3055295220243672, + "grad_norm": 52447.16796875, + "learning_rate": 3.235456099134695e-05, + "loss": 2.1151, + "step": 12300 + }, + { + "epoch": 2.3057169634489223, + "grad_norm": 53692.34375, + "learning_rate": 3.234720889448074e-05, + "loss": 2.0917, + "step": 12301 + }, + { + "epoch": 2.305904404873477, + "grad_norm": 50671.203125, + "learning_rate": 3.2339857233615604e-05, + "loss": 2.152, + "step": 12302 + }, + { + "epoch": 2.306091846298032, + "grad_norm": 53488.39453125, + "learning_rate": 3.2332506008933086e-05, + "loss": 2.1464, + "step": 12303 + }, + { + "epoch": 2.3062792877225866, + "grad_norm": 53740.421875, + "learning_rate": 3.232515522061475e-05, + "loss": 2.2129, + "step": 12304 + }, + { + "epoch": 2.3064667291471417, + "grad_norm": 60700.28515625, + "learning_rate": 3.231780486884217e-05, + "loss": 2.1945, + "step": 12305 + }, + { + "epoch": 2.3066541705716963, + "grad_norm": 55455.51953125, + "learning_rate": 3.2310454953796906e-05, + "loss": 2.174, + "step": 12306 + }, + { + "epoch": 2.3068416119962514, + "grad_norm": 51379.57421875, + "learning_rate": 3.230310547566046e-05, + "loss": 2.1208, + "step": 12307 + }, + { + "epoch": 2.307029053420806, + "grad_norm": 53237.6015625, + "learning_rate": 3.2295756434614357e-05, + "loss": 2.1239, + "step": 12308 + }, + { + "epoch": 2.3072164948453606, + "grad_norm": 55906.09375, + "learning_rate": 3.2288407830840123e-05, + "loss": 2.085, + "step": 12309 + }, + { + "epoch": 2.3074039362699157, + "grad_norm": 54577.19921875, + "learning_rate": 3.228105966451924e-05, + "loss": 2.217, + "step": 12310 + }, + { + "epoch": 2.3075913776944703, + "grad_norm": 53385.8359375, + "learning_rate": 3.227371193583321e-05, + "loss": 2.168, + "step": 12311 + }, + { + "epoch": 2.3077788191190254, + "grad_norm": 54123.69921875, + "learning_rate": 3.226636464496352e-05, + "loss": 2.0708, + "step": 12312 + }, + { + "epoch": 2.30796626054358, + "grad_norm": 51799.93359375, + "learning_rate": 3.2259017792091636e-05, + "loss": 2.1083, + "step": 12313 + }, + { + "epoch": 2.308153701968135, + "grad_norm": 55666.859375, + "learning_rate": 3.2251671377399015e-05, + "loss": 2.1122, + "step": 12314 + }, + { + "epoch": 2.3083411433926897, + "grad_norm": 58708.55859375, + "learning_rate": 3.2244325401067086e-05, + "loss": 2.1712, + "step": 12315 + }, + { + "epoch": 2.3085285848172448, + "grad_norm": 56748.3984375, + "learning_rate": 3.2236979863277304e-05, + "loss": 2.1641, + "step": 12316 + }, + { + "epoch": 2.3087160262417994, + "grad_norm": 54979.3359375, + "learning_rate": 3.22296347642111e-05, + "loss": 2.1477, + "step": 12317 + }, + { + "epoch": 2.3089034676663545, + "grad_norm": 53570.82421875, + "learning_rate": 3.222229010404987e-05, + "loss": 2.1715, + "step": 12318 + }, + { + "epoch": 2.309090909090909, + "grad_norm": 55600.85546875, + "learning_rate": 3.2214945882975014e-05, + "loss": 2.0587, + "step": 12319 + }, + { + "epoch": 2.3092783505154637, + "grad_norm": 53104.4296875, + "learning_rate": 3.220760210116795e-05, + "loss": 2.1006, + "step": 12320 + }, + { + "epoch": 2.3094657919400188, + "grad_norm": 52013.515625, + "learning_rate": 3.220025875881004e-05, + "loss": 2.1195, + "step": 12321 + }, + { + "epoch": 2.3096532333645734, + "grad_norm": 53083.6796875, + "learning_rate": 3.219291585608266e-05, + "loss": 2.2241, + "step": 12322 + }, + { + "epoch": 2.3098406747891285, + "grad_norm": 51223.66796875, + "learning_rate": 3.218557339316717e-05, + "loss": 2.1956, + "step": 12323 + }, + { + "epoch": 2.310028116213683, + "grad_norm": 58669.79296875, + "learning_rate": 3.217823137024494e-05, + "loss": 2.2061, + "step": 12324 + }, + { + "epoch": 2.310215557638238, + "grad_norm": 55127.26953125, + "learning_rate": 3.217088978749727e-05, + "loss": 2.0965, + "step": 12325 + }, + { + "epoch": 2.310402999062793, + "grad_norm": 54602.8671875, + "learning_rate": 3.216354864510549e-05, + "loss": 2.097, + "step": 12326 + }, + { + "epoch": 2.310590440487348, + "grad_norm": 51815.11328125, + "learning_rate": 3.215620794325096e-05, + "loss": 2.133, + "step": 12327 + }, + { + "epoch": 2.3107778819119025, + "grad_norm": 52472.15625, + "learning_rate": 3.214886768211496e-05, + "loss": 2.1485, + "step": 12328 + }, + { + "epoch": 2.3109653233364575, + "grad_norm": 52613.6875, + "learning_rate": 3.2141527861878776e-05, + "loss": 2.1325, + "step": 12329 + }, + { + "epoch": 2.311152764761012, + "grad_norm": 51566.74609375, + "learning_rate": 3.213418848272369e-05, + "loss": 2.1505, + "step": 12330 + }, + { + "epoch": 2.311340206185567, + "grad_norm": 55920.46875, + "learning_rate": 3.212684954483099e-05, + "loss": 2.094, + "step": 12331 + }, + { + "epoch": 2.311527647610122, + "grad_norm": 56392.69921875, + "learning_rate": 3.211951104838196e-05, + "loss": 2.1015, + "step": 12332 + }, + { + "epoch": 2.3117150890346765, + "grad_norm": 55498.1875, + "learning_rate": 3.211217299355779e-05, + "loss": 2.0986, + "step": 12333 + }, + { + "epoch": 2.3119025304592316, + "grad_norm": 54350.71484375, + "learning_rate": 3.2104835380539764e-05, + "loss": 2.1298, + "step": 12334 + }, + { + "epoch": 2.312089971883786, + "grad_norm": 54006.1953125, + "learning_rate": 3.2097498209509124e-05, + "loss": 2.1234, + "step": 12335 + }, + { + "epoch": 2.3122774133083412, + "grad_norm": 61229.19140625, + "learning_rate": 3.2090161480647036e-05, + "loss": 2.0552, + "step": 12336 + }, + { + "epoch": 2.312464854732896, + "grad_norm": 53925.66796875, + "learning_rate": 3.208282519413475e-05, + "loss": 2.1184, + "step": 12337 + }, + { + "epoch": 2.312652296157451, + "grad_norm": 52080.0234375, + "learning_rate": 3.207548935015345e-05, + "loss": 2.1504, + "step": 12338 + }, + { + "epoch": 2.3128397375820056, + "grad_norm": 53083.171875, + "learning_rate": 3.206815394888434e-05, + "loss": 2.0689, + "step": 12339 + }, + { + "epoch": 2.3130271790065606, + "grad_norm": 54962.5078125, + "learning_rate": 3.206081899050857e-05, + "loss": 2.0931, + "step": 12340 + }, + { + "epoch": 2.3132146204311153, + "grad_norm": 60474.23046875, + "learning_rate": 3.205348447520729e-05, + "loss": 2.101, + "step": 12341 + }, + { + "epoch": 2.31340206185567, + "grad_norm": 58296.8515625, + "learning_rate": 3.20461504031617e-05, + "loss": 2.133, + "step": 12342 + }, + { + "epoch": 2.313589503280225, + "grad_norm": 55021.76953125, + "learning_rate": 3.203881677455293e-05, + "loss": 2.09, + "step": 12343 + }, + { + "epoch": 2.3137769447047796, + "grad_norm": 50584.3828125, + "learning_rate": 3.2031483589562063e-05, + "loss": 2.2499, + "step": 12344 + }, + { + "epoch": 2.3139643861293346, + "grad_norm": 55122.41015625, + "learning_rate": 3.202415084837027e-05, + "loss": 2.1014, + "step": 12345 + }, + { + "epoch": 2.3141518275538893, + "grad_norm": 51566.08203125, + "learning_rate": 3.201681855115865e-05, + "loss": 2.135, + "step": 12346 + }, + { + "epoch": 2.3143392689784443, + "grad_norm": 48949.83984375, + "learning_rate": 3.2009486698108306e-05, + "loss": 2.1362, + "step": 12347 + }, + { + "epoch": 2.314526710402999, + "grad_norm": 51637.75390625, + "learning_rate": 3.20021552894003e-05, + "loss": 2.1238, + "step": 12348 + }, + { + "epoch": 2.314714151827554, + "grad_norm": 50431.05859375, + "learning_rate": 3.199482432521571e-05, + "loss": 2.1417, + "step": 12349 + }, + { + "epoch": 2.3149015932521086, + "grad_norm": 54289.46875, + "learning_rate": 3.198749380573566e-05, + "loss": 2.1547, + "step": 12350 + }, + { + "epoch": 2.3150890346766637, + "grad_norm": 52852.71875, + "learning_rate": 3.198016373114111e-05, + "loss": 2.172, + "step": 12351 + }, + { + "epoch": 2.3152764761012183, + "grad_norm": 56411.31640625, + "learning_rate": 3.1972834101613175e-05, + "loss": 2.1237, + "step": 12352 + }, + { + "epoch": 2.315463917525773, + "grad_norm": 55656.5625, + "learning_rate": 3.196550491733287e-05, + "loss": 2.1802, + "step": 12353 + }, + { + "epoch": 2.315651358950328, + "grad_norm": 51258.953125, + "learning_rate": 3.195817617848121e-05, + "loss": 2.1764, + "step": 12354 + }, + { + "epoch": 2.3158388003748827, + "grad_norm": 51378.8125, + "learning_rate": 3.1950847885239196e-05, + "loss": 2.1868, + "step": 12355 + }, + { + "epoch": 2.3160262417994377, + "grad_norm": 54141.1640625, + "learning_rate": 3.1943520037787845e-05, + "loss": 2.1198, + "step": 12356 + }, + { + "epoch": 2.3162136832239923, + "grad_norm": 53004.69921875, + "learning_rate": 3.1936192636308124e-05, + "loss": 2.1198, + "step": 12357 + }, + { + "epoch": 2.3164011246485474, + "grad_norm": 52935.44140625, + "learning_rate": 3.1928865680981065e-05, + "loss": 2.0911, + "step": 12358 + }, + { + "epoch": 2.316588566073102, + "grad_norm": 51399.984375, + "learning_rate": 3.1921539171987556e-05, + "loss": 2.0693, + "step": 12359 + }, + { + "epoch": 2.316776007497657, + "grad_norm": 52781.62890625, + "learning_rate": 3.19142131095086e-05, + "loss": 2.0451, + "step": 12360 + }, + { + "epoch": 2.3169634489222117, + "grad_norm": 56233.53515625, + "learning_rate": 3.190688749372513e-05, + "loss": 2.1015, + "step": 12361 + }, + { + "epoch": 2.317150890346767, + "grad_norm": 53823.91015625, + "learning_rate": 3.18995623248181e-05, + "loss": 2.1274, + "step": 12362 + }, + { + "epoch": 2.3173383317713214, + "grad_norm": 56379.26953125, + "learning_rate": 3.189223760296839e-05, + "loss": 2.08, + "step": 12363 + }, + { + "epoch": 2.317525773195876, + "grad_norm": 51862.6328125, + "learning_rate": 3.1884913328356936e-05, + "loss": 2.1528, + "step": 12364 + }, + { + "epoch": 2.317713214620431, + "grad_norm": 55145.2109375, + "learning_rate": 3.187758950116466e-05, + "loss": 2.0759, + "step": 12365 + }, + { + "epoch": 2.317900656044986, + "grad_norm": 53851.765625, + "learning_rate": 3.18702661215724e-05, + "loss": 2.1428, + "step": 12366 + }, + { + "epoch": 2.318088097469541, + "grad_norm": 55660.671875, + "learning_rate": 3.186294318976106e-05, + "loss": 2.1029, + "step": 12367 + }, + { + "epoch": 2.3182755388940954, + "grad_norm": 55081.81640625, + "learning_rate": 3.185562070591153e-05, + "loss": 2.0565, + "step": 12368 + }, + { + "epoch": 2.3184629803186505, + "grad_norm": 69512.234375, + "learning_rate": 3.1848298670204634e-05, + "loss": 2.1174, + "step": 12369 + }, + { + "epoch": 2.318650421743205, + "grad_norm": 52184.828125, + "learning_rate": 3.184097708282123e-05, + "loss": 2.0949, + "step": 12370 + }, + { + "epoch": 2.31883786316776, + "grad_norm": 52912.59765625, + "learning_rate": 3.183365594394214e-05, + "loss": 2.1324, + "step": 12371 + }, + { + "epoch": 2.319025304592315, + "grad_norm": 54977.93359375, + "learning_rate": 3.182633525374819e-05, + "loss": 2.1367, + "step": 12372 + }, + { + "epoch": 2.31921274601687, + "grad_norm": 59481.27734375, + "learning_rate": 3.181901501242022e-05, + "loss": 2.1613, + "step": 12373 + }, + { + "epoch": 2.3194001874414245, + "grad_norm": 51393.5625, + "learning_rate": 3.181169522013898e-05, + "loss": 2.1771, + "step": 12374 + }, + { + "epoch": 2.319587628865979, + "grad_norm": 54342.3984375, + "learning_rate": 3.1804375877085305e-05, + "loss": 2.1008, + "step": 12375 + }, + { + "epoch": 2.319775070290534, + "grad_norm": 55306.37109375, + "learning_rate": 3.1797056983439954e-05, + "loss": 2.1506, + "step": 12376 + }, + { + "epoch": 2.3199625117150893, + "grad_norm": 54198.22265625, + "learning_rate": 3.17897385393837e-05, + "loss": 2.1011, + "step": 12377 + }, + { + "epoch": 2.320149953139644, + "grad_norm": 51217.69140625, + "learning_rate": 3.178242054509728e-05, + "loss": 2.0986, + "step": 12378 + }, + { + "epoch": 2.3203373945641985, + "grad_norm": 50707.6640625, + "learning_rate": 3.1775103000761444e-05, + "loss": 2.1323, + "step": 12379 + }, + { + "epoch": 2.3205248359887536, + "grad_norm": 50350.359375, + "learning_rate": 3.176778590655697e-05, + "loss": 2.0976, + "step": 12380 + }, + { + "epoch": 2.320712277413308, + "grad_norm": 50949.89453125, + "learning_rate": 3.176046926266451e-05, + "loss": 2.1172, + "step": 12381 + }, + { + "epoch": 2.3208997188378633, + "grad_norm": 51354.66796875, + "learning_rate": 3.175315306926483e-05, + "loss": 2.1424, + "step": 12382 + }, + { + "epoch": 2.321087160262418, + "grad_norm": 55419.671875, + "learning_rate": 3.17458373265386e-05, + "loss": 2.0492, + "step": 12383 + }, + { + "epoch": 2.321274601686973, + "grad_norm": 52478.96484375, + "learning_rate": 3.173852203466654e-05, + "loss": 2.2742, + "step": 12384 + }, + { + "epoch": 2.3214620431115276, + "grad_norm": 54945.2890625, + "learning_rate": 3.17312071938293e-05, + "loss": 2.0843, + "step": 12385 + }, + { + "epoch": 2.3216494845360827, + "grad_norm": 54810.3984375, + "learning_rate": 3.172389280420755e-05, + "loss": 2.174, + "step": 12386 + }, + { + "epoch": 2.3218369259606373, + "grad_norm": 53626.09375, + "learning_rate": 3.171657886598195e-05, + "loss": 2.1394, + "step": 12387 + }, + { + "epoch": 2.3220243673851924, + "grad_norm": 56550.55078125, + "learning_rate": 3.170926537933317e-05, + "loss": 2.1258, + "step": 12388 + }, + { + "epoch": 2.322211808809747, + "grad_norm": 53611.5625, + "learning_rate": 3.170195234444179e-05, + "loss": 2.1034, + "step": 12389 + }, + { + "epoch": 2.3223992502343016, + "grad_norm": 56312.296875, + "learning_rate": 3.169463976148848e-05, + "loss": 2.1839, + "step": 12390 + }, + { + "epoch": 2.3225866916588567, + "grad_norm": 52674.35546875, + "learning_rate": 3.168732763065384e-05, + "loss": 2.177, + "step": 12391 + }, + { + "epoch": 2.3227741330834113, + "grad_norm": 49535.984375, + "learning_rate": 3.168001595211846e-05, + "loss": 2.093, + "step": 12392 + }, + { + "epoch": 2.3229615745079664, + "grad_norm": 59370.37109375, + "learning_rate": 3.167270472606293e-05, + "loss": 2.2319, + "step": 12393 + }, + { + "epoch": 2.323149015932521, + "grad_norm": 51918.59765625, + "learning_rate": 3.166539395266783e-05, + "loss": 2.1095, + "step": 12394 + }, + { + "epoch": 2.323336457357076, + "grad_norm": 51414.984375, + "learning_rate": 3.165808363211375e-05, + "loss": 2.1055, + "step": 12395 + }, + { + "epoch": 2.3235238987816307, + "grad_norm": 54146.7578125, + "learning_rate": 3.16507737645812e-05, + "loss": 2.065, + "step": 12396 + }, + { + "epoch": 2.3237113402061857, + "grad_norm": 53883.37890625, + "learning_rate": 3.164346435025074e-05, + "loss": 2.1028, + "step": 12397 + }, + { + "epoch": 2.3238987816307404, + "grad_norm": 55642.20703125, + "learning_rate": 3.1636155389302926e-05, + "loss": 2.0178, + "step": 12398 + }, + { + "epoch": 2.3240862230552954, + "grad_norm": 50940.66796875, + "learning_rate": 3.1628846881918274e-05, + "loss": 2.1195, + "step": 12399 + }, + { + "epoch": 2.32427366447985, + "grad_norm": 54563.1640625, + "learning_rate": 3.1621538828277265e-05, + "loss": 2.091, + "step": 12400 + }, + { + "epoch": 2.3244611059044047, + "grad_norm": 53216.2265625, + "learning_rate": 3.1614231228560434e-05, + "loss": 2.0624, + "step": 12401 + }, + { + "epoch": 2.3246485473289598, + "grad_norm": 58752.48828125, + "learning_rate": 3.160692408294823e-05, + "loss": 2.1144, + "step": 12402 + }, + { + "epoch": 2.3248359887535144, + "grad_norm": 50368.02734375, + "learning_rate": 3.159961739162119e-05, + "loss": 2.1532, + "step": 12403 + }, + { + "epoch": 2.3250234301780695, + "grad_norm": 54911.78125, + "learning_rate": 3.159231115475972e-05, + "loss": 2.1567, + "step": 12404 + }, + { + "epoch": 2.325210871602624, + "grad_norm": 54623.27734375, + "learning_rate": 3.1585005372544305e-05, + "loss": 2.1182, + "step": 12405 + }, + { + "epoch": 2.325398313027179, + "grad_norm": 52297.34765625, + "learning_rate": 3.157770004515539e-05, + "loss": 2.141, + "step": 12406 + }, + { + "epoch": 2.3255857544517338, + "grad_norm": 52384.84765625, + "learning_rate": 3.1570395172773394e-05, + "loss": 2.2043, + "step": 12407 + }, + { + "epoch": 2.325773195876289, + "grad_norm": 54689.0390625, + "learning_rate": 3.1563090755578734e-05, + "loss": 2.1041, + "step": 12408 + }, + { + "epoch": 2.3259606373008435, + "grad_norm": 51735.703125, + "learning_rate": 3.155578679375183e-05, + "loss": 2.111, + "step": 12409 + }, + { + "epoch": 2.3261480787253985, + "grad_norm": 51913.99609375, + "learning_rate": 3.1548483287473094e-05, + "loss": 2.1353, + "step": 12410 + }, + { + "epoch": 2.326335520149953, + "grad_norm": 55558.0703125, + "learning_rate": 3.154118023692288e-05, + "loss": 2.1076, + "step": 12411 + }, + { + "epoch": 2.3265229615745078, + "grad_norm": 56413.72265625, + "learning_rate": 3.153387764228159e-05, + "loss": 2.1464, + "step": 12412 + }, + { + "epoch": 2.326710402999063, + "grad_norm": 54848.1640625, + "learning_rate": 3.1526575503729574e-05, + "loss": 2.0576, + "step": 12413 + }, + { + "epoch": 2.3268978444236175, + "grad_norm": 58565.2734375, + "learning_rate": 3.151927382144721e-05, + "loss": 2.1776, + "step": 12414 + }, + { + "epoch": 2.3270852858481725, + "grad_norm": 58112.2734375, + "learning_rate": 3.15119725956148e-05, + "loss": 2.2015, + "step": 12415 + }, + { + "epoch": 2.327272727272727, + "grad_norm": 59896.1875, + "learning_rate": 3.1504671826412714e-05, + "loss": 2.1159, + "step": 12416 + }, + { + "epoch": 2.3274601686972822, + "grad_norm": 53256.796875, + "learning_rate": 3.1497371514021234e-05, + "loss": 2.1392, + "step": 12417 + }, + { + "epoch": 2.327647610121837, + "grad_norm": 55116.48828125, + "learning_rate": 3.149007165862073e-05, + "loss": 2.153, + "step": 12418 + }, + { + "epoch": 2.327835051546392, + "grad_norm": 55550.97265625, + "learning_rate": 3.1482772260391406e-05, + "loss": 2.1188, + "step": 12419 + }, + { + "epoch": 2.3280224929709465, + "grad_norm": 53413.3046875, + "learning_rate": 3.147547331951362e-05, + "loss": 2.1365, + "step": 12420 + }, + { + "epoch": 2.3282099343955016, + "grad_norm": 54209.92578125, + "learning_rate": 3.146817483616764e-05, + "loss": 2.2261, + "step": 12421 + }, + { + "epoch": 2.3283973758200562, + "grad_norm": 61320.80859375, + "learning_rate": 3.1460876810533705e-05, + "loss": 2.1372, + "step": 12422 + }, + { + "epoch": 2.328584817244611, + "grad_norm": 58862.44140625, + "learning_rate": 3.145357924279206e-05, + "loss": 2.1213, + "step": 12423 + }, + { + "epoch": 2.328772258669166, + "grad_norm": 48875.46875, + "learning_rate": 3.1446282133122986e-05, + "loss": 2.1418, + "step": 12424 + }, + { + "epoch": 2.3289597000937206, + "grad_norm": 51378.8515625, + "learning_rate": 3.143898548170668e-05, + "loss": 2.0916, + "step": 12425 + }, + { + "epoch": 2.3291471415182756, + "grad_norm": 54882.875, + "learning_rate": 3.1431689288723366e-05, + "loss": 2.0985, + "step": 12426 + }, + { + "epoch": 2.3293345829428302, + "grad_norm": 53270.3828125, + "learning_rate": 3.1424393554353246e-05, + "loss": 2.1099, + "step": 12427 + }, + { + "epoch": 2.3295220243673853, + "grad_norm": 55851.3125, + "learning_rate": 3.141709827877652e-05, + "loss": 2.1218, + "step": 12428 + }, + { + "epoch": 2.32970946579194, + "grad_norm": 58082.953125, + "learning_rate": 3.140980346217339e-05, + "loss": 2.1623, + "step": 12429 + }, + { + "epoch": 2.329896907216495, + "grad_norm": 53559.140625, + "learning_rate": 3.140250910472401e-05, + "loss": 2.1824, + "step": 12430 + }, + { + "epoch": 2.3300843486410496, + "grad_norm": 52756.80078125, + "learning_rate": 3.139521520660853e-05, + "loss": 2.1566, + "step": 12431 + }, + { + "epoch": 2.3302717900656047, + "grad_norm": 49057.26171875, + "learning_rate": 3.1387921768007134e-05, + "loss": 2.115, + "step": 12432 + }, + { + "epoch": 2.3304592314901593, + "grad_norm": 54911.83984375, + "learning_rate": 3.138062878909992e-05, + "loss": 2.0721, + "step": 12433 + }, + { + "epoch": 2.330646672914714, + "grad_norm": 53924.55859375, + "learning_rate": 3.137333627006703e-05, + "loss": 2.1216, + "step": 12434 + }, + { + "epoch": 2.330834114339269, + "grad_norm": 52891.38671875, + "learning_rate": 3.1366044211088584e-05, + "loss": 2.1276, + "step": 12435 + }, + { + "epoch": 2.3310215557638236, + "grad_norm": 55780.8359375, + "learning_rate": 3.135875261234472e-05, + "loss": 2.0395, + "step": 12436 + }, + { + "epoch": 2.3312089971883787, + "grad_norm": 54303.5859375, + "learning_rate": 3.1351461474015466e-05, + "loss": 2.1496, + "step": 12437 + }, + { + "epoch": 2.3313964386129333, + "grad_norm": 54015.52734375, + "learning_rate": 3.134417079628094e-05, + "loss": 2.1352, + "step": 12438 + }, + { + "epoch": 2.3315838800374884, + "grad_norm": 54695.8046875, + "learning_rate": 3.133688057932121e-05, + "loss": 2.1906, + "step": 12439 + }, + { + "epoch": 2.331771321462043, + "grad_norm": 51817.91015625, + "learning_rate": 3.132959082331635e-05, + "loss": 2.1766, + "step": 12440 + }, + { + "epoch": 2.331958762886598, + "grad_norm": 52206.86328125, + "learning_rate": 3.1322301528446366e-05, + "loss": 2.0419, + "step": 12441 + }, + { + "epoch": 2.3321462043111527, + "grad_norm": 54961.875, + "learning_rate": 3.131501269489131e-05, + "loss": 2.1187, + "step": 12442 + }, + { + "epoch": 2.332333645735708, + "grad_norm": 53530.046875, + "learning_rate": 3.130772432283124e-05, + "loss": 2.1177, + "step": 12443 + }, + { + "epoch": 2.3325210871602624, + "grad_norm": 51246.375, + "learning_rate": 3.130043641244614e-05, + "loss": 2.0418, + "step": 12444 + }, + { + "epoch": 2.332708528584817, + "grad_norm": 59242.25390625, + "learning_rate": 3.129314896391601e-05, + "loss": 2.1642, + "step": 12445 + }, + { + "epoch": 2.332895970009372, + "grad_norm": 62268.54296875, + "learning_rate": 3.128586197742085e-05, + "loss": 2.1248, + "step": 12446 + }, + { + "epoch": 2.3330834114339267, + "grad_norm": 62973.8046875, + "learning_rate": 3.127857545314063e-05, + "loss": 2.0723, + "step": 12447 + }, + { + "epoch": 2.333270852858482, + "grad_norm": 52680.8125, + "learning_rate": 3.127128939125533e-05, + "loss": 2.1658, + "step": 12448 + }, + { + "epoch": 2.3334582942830364, + "grad_norm": 48184.9921875, + "learning_rate": 3.12640037919449e-05, + "loss": 2.1483, + "step": 12449 + }, + { + "epoch": 2.3336457357075915, + "grad_norm": 51591.24609375, + "learning_rate": 3.1256718655389274e-05, + "loss": 2.1848, + "step": 12450 + }, + { + "epoch": 2.333833177132146, + "grad_norm": 53188.140625, + "learning_rate": 3.124943398176842e-05, + "loss": 2.2074, + "step": 12451 + }, + { + "epoch": 2.334020618556701, + "grad_norm": 57249.08984375, + "learning_rate": 3.124214977126221e-05, + "loss": 2.0661, + "step": 12452 + }, + { + "epoch": 2.334208059981256, + "grad_norm": 54098.15625, + "learning_rate": 3.123486602405059e-05, + "loss": 2.1259, + "step": 12453 + }, + { + "epoch": 2.334395501405811, + "grad_norm": 53387.69921875, + "learning_rate": 3.122758274031345e-05, + "loss": 2.12, + "step": 12454 + }, + { + "epoch": 2.3345829428303655, + "grad_norm": 58837.703125, + "learning_rate": 3.122029992023069e-05, + "loss": 2.0629, + "step": 12455 + }, + { + "epoch": 2.33477038425492, + "grad_norm": 50183.75, + "learning_rate": 3.1213017563982164e-05, + "loss": 2.1254, + "step": 12456 + }, + { + "epoch": 2.334957825679475, + "grad_norm": 55696.87890625, + "learning_rate": 3.120573567174773e-05, + "loss": 2.057, + "step": 12457 + }, + { + "epoch": 2.33514526710403, + "grad_norm": 56512.79296875, + "learning_rate": 3.119845424370727e-05, + "loss": 2.1326, + "step": 12458 + }, + { + "epoch": 2.335332708528585, + "grad_norm": 51109.55859375, + "learning_rate": 3.119117328004063e-05, + "loss": 2.1312, + "step": 12459 + }, + { + "epoch": 2.3355201499531395, + "grad_norm": 57949.7265625, + "learning_rate": 3.118389278092761e-05, + "loss": 2.1265, + "step": 12460 + }, + { + "epoch": 2.3357075913776946, + "grad_norm": 54949.55859375, + "learning_rate": 3.117661274654804e-05, + "loss": 2.1426, + "step": 12461 + }, + { + "epoch": 2.335895032802249, + "grad_norm": 62746.1640625, + "learning_rate": 3.116933317708174e-05, + "loss": 2.0573, + "step": 12462 + }, + { + "epoch": 2.3360824742268043, + "grad_norm": 56793.1953125, + "learning_rate": 3.1162054072708494e-05, + "loss": 2.1118, + "step": 12463 + }, + { + "epoch": 2.336269915651359, + "grad_norm": 53761.69921875, + "learning_rate": 3.1154775433608085e-05, + "loss": 2.1748, + "step": 12464 + }, + { + "epoch": 2.336457357075914, + "grad_norm": 55055.84765625, + "learning_rate": 3.114749725996028e-05, + "loss": 2.1426, + "step": 12465 + }, + { + "epoch": 2.3366447985004686, + "grad_norm": 53081.25, + "learning_rate": 3.114021955194487e-05, + "loss": 2.1167, + "step": 12466 + }, + { + "epoch": 2.336832239925023, + "grad_norm": 56462.51171875, + "learning_rate": 3.113294230974156e-05, + "loss": 2.1179, + "step": 12467 + }, + { + "epoch": 2.3370196813495783, + "grad_norm": 55114.1328125, + "learning_rate": 3.112566553353013e-05, + "loss": 2.0866, + "step": 12468 + }, + { + "epoch": 2.337207122774133, + "grad_norm": 51375.94140625, + "learning_rate": 3.1118389223490294e-05, + "loss": 2.1557, + "step": 12469 + }, + { + "epoch": 2.337394564198688, + "grad_norm": 57342.06640625, + "learning_rate": 3.111111337980176e-05, + "loss": 2.1483, + "step": 12470 + }, + { + "epoch": 2.3375820056232426, + "grad_norm": 59815.5390625, + "learning_rate": 3.1103838002644236e-05, + "loss": 2.1692, + "step": 12471 + }, + { + "epoch": 2.3377694470477977, + "grad_norm": 53323.6875, + "learning_rate": 3.1096563092197404e-05, + "loss": 2.1431, + "step": 12472 + }, + { + "epoch": 2.3379568884723523, + "grad_norm": 52542.1875, + "learning_rate": 3.108928864864098e-05, + "loss": 2.1458, + "step": 12473 + }, + { + "epoch": 2.3381443298969073, + "grad_norm": 57292.328125, + "learning_rate": 3.108201467215458e-05, + "loss": 2.144, + "step": 12474 + }, + { + "epoch": 2.338331771321462, + "grad_norm": 51873.27734375, + "learning_rate": 3.10747411629179e-05, + "loss": 2.1384, + "step": 12475 + }, + { + "epoch": 2.338519212746017, + "grad_norm": 51726.328125, + "learning_rate": 3.106746812111058e-05, + "loss": 2.0749, + "step": 12476 + }, + { + "epoch": 2.3387066541705717, + "grad_norm": 55117.18359375, + "learning_rate": 3.106019554691225e-05, + "loss": 2.1286, + "step": 12477 + }, + { + "epoch": 2.3388940955951263, + "grad_norm": 53700.56640625, + "learning_rate": 3.105292344050253e-05, + "loss": 2.169, + "step": 12478 + }, + { + "epoch": 2.3390815370196814, + "grad_norm": 53823.78515625, + "learning_rate": 3.104565180206104e-05, + "loss": 2.1419, + "step": 12479 + }, + { + "epoch": 2.3392689784442364, + "grad_norm": 54388.8046875, + "learning_rate": 3.103838063176736e-05, + "loss": 2.2721, + "step": 12480 + }, + { + "epoch": 2.339456419868791, + "grad_norm": 53563.48046875, + "learning_rate": 3.103110992980113e-05, + "loss": 2.1361, + "step": 12481 + }, + { + "epoch": 2.3396438612933457, + "grad_norm": 55466.609375, + "learning_rate": 3.102383969634186e-05, + "loss": 2.1017, + "step": 12482 + }, + { + "epoch": 2.3398313027179007, + "grad_norm": 58197.32421875, + "learning_rate": 3.101656993156915e-05, + "loss": 2.0752, + "step": 12483 + }, + { + "epoch": 2.3400187441424554, + "grad_norm": 54137.54296875, + "learning_rate": 3.100930063566256e-05, + "loss": 2.1519, + "step": 12484 + }, + { + "epoch": 2.3402061855670104, + "grad_norm": 53138.69921875, + "learning_rate": 3.1002031808801624e-05, + "loss": 2.2176, + "step": 12485 + }, + { + "epoch": 2.340393626991565, + "grad_norm": 50005.61328125, + "learning_rate": 3.099476345116587e-05, + "loss": 2.1744, + "step": 12486 + }, + { + "epoch": 2.34058106841612, + "grad_norm": 55722.640625, + "learning_rate": 3.098749556293481e-05, + "loss": 2.1098, + "step": 12487 + }, + { + "epoch": 2.3407685098406747, + "grad_norm": 53660.953125, + "learning_rate": 3.098022814428797e-05, + "loss": 2.1532, + "step": 12488 + }, + { + "epoch": 2.3409559512652294, + "grad_norm": 54894.3828125, + "learning_rate": 3.0972961195404826e-05, + "loss": 2.1081, + "step": 12489 + }, + { + "epoch": 2.3411433926897844, + "grad_norm": 50315.87109375, + "learning_rate": 3.096569471646487e-05, + "loss": 2.1467, + "step": 12490 + }, + { + "epoch": 2.3413308341143395, + "grad_norm": 52141.20703125, + "learning_rate": 3.095842870764758e-05, + "loss": 2.1737, + "step": 12491 + }, + { + "epoch": 2.341518275538894, + "grad_norm": 53588.97265625, + "learning_rate": 3.0951163169132417e-05, + "loss": 2.165, + "step": 12492 + }, + { + "epoch": 2.3417057169634488, + "grad_norm": 49762.8125, + "learning_rate": 3.0943898101098825e-05, + "loss": 2.1482, + "step": 12493 + }, + { + "epoch": 2.341893158388004, + "grad_norm": 55628.578125, + "learning_rate": 3.093663350372624e-05, + "loss": 2.175, + "step": 12494 + }, + { + "epoch": 2.3420805998125585, + "grad_norm": 49776.21484375, + "learning_rate": 3.092936937719408e-05, + "loss": 2.1146, + "step": 12495 + }, + { + "epoch": 2.3422680412371135, + "grad_norm": 53228.3515625, + "learning_rate": 3.09221057216818e-05, + "loss": 2.1943, + "step": 12496 + }, + { + "epoch": 2.342455482661668, + "grad_norm": 52368.6171875, + "learning_rate": 3.0914842537368746e-05, + "loss": 2.1244, + "step": 12497 + }, + { + "epoch": 2.342642924086223, + "grad_norm": 51564.65234375, + "learning_rate": 3.090757982443435e-05, + "loss": 2.0901, + "step": 12498 + }, + { + "epoch": 2.342830365510778, + "grad_norm": 55410.453125, + "learning_rate": 3.090031758305798e-05, + "loss": 2.1623, + "step": 12499 + }, + { + "epoch": 2.3430178069353325, + "grad_norm": 55731.75390625, + "learning_rate": 3.089305581341901e-05, + "loss": 2.1639, + "step": 12500 + }, + { + "epoch": 2.3430178069353325, + "eval_loss": 2.2749242782592773, + "eval_runtime": 131.1421, + "eval_samples_per_second": 38.5, + "eval_steps_per_second": 1.929, + "step": 12500 + }, + { + "epoch": 2.3432052483598875, + "grad_norm": 55105.51171875, + "learning_rate": 3.088579451569678e-05, + "loss": 2.2037, + "step": 12501 + }, + { + "epoch": 2.3433926897844426, + "grad_norm": 57173.0390625, + "learning_rate": 3.087853369007064e-05, + "loss": 2.1036, + "step": 12502 + }, + { + "epoch": 2.343580131208997, + "grad_norm": 56506.62109375, + "learning_rate": 3.087127333671993e-05, + "loss": 2.1228, + "step": 12503 + }, + { + "epoch": 2.343767572633552, + "grad_norm": 53582.62890625, + "learning_rate": 3.086401345582397e-05, + "loss": 2.1515, + "step": 12504 + }, + { + "epoch": 2.343955014058107, + "grad_norm": 56216.3359375, + "learning_rate": 3.085675404756205e-05, + "loss": 2.1388, + "step": 12505 + }, + { + "epoch": 2.3441424554826615, + "grad_norm": 54411.79296875, + "learning_rate": 3.08494951121135e-05, + "loss": 2.1564, + "step": 12506 + }, + { + "epoch": 2.3443298969072166, + "grad_norm": 50674.5390625, + "learning_rate": 3.084223664965761e-05, + "loss": 2.1105, + "step": 12507 + }, + { + "epoch": 2.3445173383317712, + "grad_norm": 52131.89453125, + "learning_rate": 3.0834978660373617e-05, + "loss": 2.1379, + "step": 12508 + }, + { + "epoch": 2.3447047797563263, + "grad_norm": 53725.06640625, + "learning_rate": 3.0827721144440795e-05, + "loss": 2.1739, + "step": 12509 + }, + { + "epoch": 2.344892221180881, + "grad_norm": 55389.125, + "learning_rate": 3.08204641020384e-05, + "loss": 2.1996, + "step": 12510 + }, + { + "epoch": 2.345079662605436, + "grad_norm": 54560.7734375, + "learning_rate": 3.081320753334571e-05, + "loss": 2.1452, + "step": 12511 + }, + { + "epoch": 2.3452671040299906, + "grad_norm": 52541.28125, + "learning_rate": 3.080595143854188e-05, + "loss": 2.0954, + "step": 12512 + }, + { + "epoch": 2.3454545454545457, + "grad_norm": 52842.41015625, + "learning_rate": 3.0798695817806184e-05, + "loss": 2.1228, + "step": 12513 + }, + { + "epoch": 2.3456419868791003, + "grad_norm": 51060.984375, + "learning_rate": 3.07914406713178e-05, + "loss": 2.1567, + "step": 12514 + }, + { + "epoch": 2.345829428303655, + "grad_norm": 52268.3125, + "learning_rate": 3.0784185999255937e-05, + "loss": 2.1405, + "step": 12515 + }, + { + "epoch": 2.34601686972821, + "grad_norm": 50760.50390625, + "learning_rate": 3.077693180179976e-05, + "loss": 2.1168, + "step": 12516 + }, + { + "epoch": 2.3462043111527646, + "grad_norm": 55192.96875, + "learning_rate": 3.076967807912844e-05, + "loss": 2.1422, + "step": 12517 + }, + { + "epoch": 2.3463917525773197, + "grad_norm": 54161.78125, + "learning_rate": 3.076242483142115e-05, + "loss": 2.1709, + "step": 12518 + }, + { + "epoch": 2.3465791940018743, + "grad_norm": 54027.85546875, + "learning_rate": 3.0755172058857016e-05, + "loss": 2.1816, + "step": 12519 + }, + { + "epoch": 2.3467666354264294, + "grad_norm": 54487.07421875, + "learning_rate": 3.0747919761615166e-05, + "loss": 2.0969, + "step": 12520 + }, + { + "epoch": 2.346954076850984, + "grad_norm": 54051.28125, + "learning_rate": 3.0740667939874756e-05, + "loss": 2.1482, + "step": 12521 + }, + { + "epoch": 2.347141518275539, + "grad_norm": 53486.3125, + "learning_rate": 3.073341659381488e-05, + "loss": 2.1512, + "step": 12522 + }, + { + "epoch": 2.3473289597000937, + "grad_norm": 55719.32421875, + "learning_rate": 3.072616572361463e-05, + "loss": 2.1245, + "step": 12523 + }, + { + "epoch": 2.3475164011246488, + "grad_norm": 59672.4921875, + "learning_rate": 3.0718915329453095e-05, + "loss": 2.2102, + "step": 12524 + }, + { + "epoch": 2.3477038425492034, + "grad_norm": 51110.77734375, + "learning_rate": 3.0711665411509345e-05, + "loss": 2.128, + "step": 12525 + }, + { + "epoch": 2.347891283973758, + "grad_norm": 50518.76171875, + "learning_rate": 3.070441596996248e-05, + "loss": 2.0853, + "step": 12526 + }, + { + "epoch": 2.348078725398313, + "grad_norm": 57647.5, + "learning_rate": 3.06971670049915e-05, + "loss": 2.0424, + "step": 12527 + }, + { + "epoch": 2.3482661668228677, + "grad_norm": 52305.6484375, + "learning_rate": 3.068991851677546e-05, + "loss": 2.1449, + "step": 12528 + }, + { + "epoch": 2.3484536082474228, + "grad_norm": 52516.9609375, + "learning_rate": 3.0682670505493435e-05, + "loss": 2.1244, + "step": 12529 + }, + { + "epoch": 2.3486410496719774, + "grad_norm": 55965.328125, + "learning_rate": 3.067542297132436e-05, + "loss": 2.1799, + "step": 12530 + }, + { + "epoch": 2.3488284910965325, + "grad_norm": 50424.59765625, + "learning_rate": 3.066817591444731e-05, + "loss": 2.0913, + "step": 12531 + }, + { + "epoch": 2.349015932521087, + "grad_norm": 53965.97265625, + "learning_rate": 3.0660929335041235e-05, + "loss": 2.0908, + "step": 12532 + }, + { + "epoch": 2.349203373945642, + "grad_norm": 57589.0859375, + "learning_rate": 3.0653683233285147e-05, + "loss": 2.1529, + "step": 12533 + }, + { + "epoch": 2.349390815370197, + "grad_norm": 51664.72265625, + "learning_rate": 3.064643760935799e-05, + "loss": 2.1572, + "step": 12534 + }, + { + "epoch": 2.349578256794752, + "grad_norm": 54889.359375, + "learning_rate": 3.063919246343872e-05, + "loss": 2.0868, + "step": 12535 + }, + { + "epoch": 2.3497656982193065, + "grad_norm": 59635.796875, + "learning_rate": 3.063194779570632e-05, + "loss": 2.1266, + "step": 12536 + }, + { + "epoch": 2.349953139643861, + "grad_norm": 49918.77734375, + "learning_rate": 3.0624703606339685e-05, + "loss": 2.059, + "step": 12537 + }, + { + "epoch": 2.350140581068416, + "grad_norm": 54527.5546875, + "learning_rate": 3.0617459895517754e-05, + "loss": 2.1318, + "step": 12538 + }, + { + "epoch": 2.350328022492971, + "grad_norm": 51823.30859375, + "learning_rate": 3.061021666341943e-05, + "loss": 2.1449, + "step": 12539 + }, + { + "epoch": 2.350515463917526, + "grad_norm": 56553.4609375, + "learning_rate": 3.060297391022362e-05, + "loss": 2.1478, + "step": 12540 + }, + { + "epoch": 2.3507029053420805, + "grad_norm": 54360.75, + "learning_rate": 3.0595731636109215e-05, + "loss": 2.1456, + "step": 12541 + }, + { + "epoch": 2.3508903467666356, + "grad_norm": 59807.375, + "learning_rate": 3.058848984125506e-05, + "loss": 2.0868, + "step": 12542 + }, + { + "epoch": 2.35107778819119, + "grad_norm": 58710.65625, + "learning_rate": 3.058124852584004e-05, + "loss": 2.1037, + "step": 12543 + }, + { + "epoch": 2.3512652296157452, + "grad_norm": 53210.1015625, + "learning_rate": 3.057400769004303e-05, + "loss": 2.1698, + "step": 12544 + }, + { + "epoch": 2.3514526710403, + "grad_norm": 52041.1640625, + "learning_rate": 3.0566767334042815e-05, + "loss": 2.1005, + "step": 12545 + }, + { + "epoch": 2.351640112464855, + "grad_norm": 55019.27734375, + "learning_rate": 3.055952745801826e-05, + "loss": 2.109, + "step": 12546 + }, + { + "epoch": 2.3518275538894096, + "grad_norm": 54728.82421875, + "learning_rate": 3.055228806214817e-05, + "loss": 2.1387, + "step": 12547 + }, + { + "epoch": 2.352014995313964, + "grad_norm": 54447.77734375, + "learning_rate": 3.054504914661137e-05, + "loss": 2.167, + "step": 12548 + }, + { + "epoch": 2.3522024367385193, + "grad_norm": 53721.91796875, + "learning_rate": 3.053781071158661e-05, + "loss": 2.1118, + "step": 12549 + }, + { + "epoch": 2.352389878163074, + "grad_norm": 51242.640625, + "learning_rate": 3.053057275725269e-05, + "loss": 2.1061, + "step": 12550 + }, + { + "epoch": 2.352577319587629, + "grad_norm": 56164.30859375, + "learning_rate": 3.052333528378839e-05, + "loss": 2.1133, + "step": 12551 + }, + { + "epoch": 2.3527647610121836, + "grad_norm": 50557.87890625, + "learning_rate": 3.0516098291372465e-05, + "loss": 2.1269, + "step": 12552 + }, + { + "epoch": 2.3529522024367386, + "grad_norm": 53875.59765625, + "learning_rate": 3.050886178018364e-05, + "loss": 2.1784, + "step": 12553 + }, + { + "epoch": 2.3531396438612933, + "grad_norm": 54066.26953125, + "learning_rate": 3.050162575040066e-05, + "loss": 2.1389, + "step": 12554 + }, + { + "epoch": 2.3533270852858483, + "grad_norm": 50365.28125, + "learning_rate": 3.0494390202202238e-05, + "loss": 2.1455, + "step": 12555 + }, + { + "epoch": 2.353514526710403, + "grad_norm": 50825.609375, + "learning_rate": 3.0487155135767108e-05, + "loss": 2.1214, + "step": 12556 + }, + { + "epoch": 2.353701968134958, + "grad_norm": 49075.703125, + "learning_rate": 3.0479920551273932e-05, + "loss": 2.0992, + "step": 12557 + }, + { + "epoch": 2.3538894095595126, + "grad_norm": 49536.19140625, + "learning_rate": 3.04726864489014e-05, + "loss": 2.131, + "step": 12558 + }, + { + "epoch": 2.3540768509840673, + "grad_norm": 53604.22265625, + "learning_rate": 3.0465452828828234e-05, + "loss": 2.1838, + "step": 12559 + }, + { + "epoch": 2.3542642924086223, + "grad_norm": 54434.75390625, + "learning_rate": 3.0458219691233015e-05, + "loss": 2.144, + "step": 12560 + }, + { + "epoch": 2.354451733833177, + "grad_norm": 51805.71875, + "learning_rate": 3.045098703629444e-05, + "loss": 2.0936, + "step": 12561 + }, + { + "epoch": 2.354639175257732, + "grad_norm": 51659.33984375, + "learning_rate": 3.0443754864191154e-05, + "loss": 2.1502, + "step": 12562 + }, + { + "epoch": 2.3548266166822867, + "grad_norm": 51088.11328125, + "learning_rate": 3.0436523175101773e-05, + "loss": 2.1602, + "step": 12563 + }, + { + "epoch": 2.3550140581068417, + "grad_norm": 50659.49609375, + "learning_rate": 3.0429291969204886e-05, + "loss": 2.2197, + "step": 12564 + }, + { + "epoch": 2.3552014995313963, + "grad_norm": 51905.29296875, + "learning_rate": 3.0422061246679113e-05, + "loss": 2.0544, + "step": 12565 + }, + { + "epoch": 2.3553889409559514, + "grad_norm": 52592.0078125, + "learning_rate": 3.041483100770305e-05, + "loss": 2.1354, + "step": 12566 + }, + { + "epoch": 2.355576382380506, + "grad_norm": 57587.078125, + "learning_rate": 3.0407601252455283e-05, + "loss": 2.1316, + "step": 12567 + }, + { + "epoch": 2.355763823805061, + "grad_norm": 57116.765625, + "learning_rate": 3.040037198111435e-05, + "loss": 2.2415, + "step": 12568 + }, + { + "epoch": 2.3559512652296157, + "grad_norm": 55376.734375, + "learning_rate": 3.039314319385882e-05, + "loss": 2.1449, + "step": 12569 + }, + { + "epoch": 2.3561387066541704, + "grad_norm": 54681.9375, + "learning_rate": 3.038591489086724e-05, + "loss": 2.2272, + "step": 12570 + }, + { + "epoch": 2.3563261480787254, + "grad_norm": 55944.94140625, + "learning_rate": 3.0378687072318117e-05, + "loss": 2.1256, + "step": 12571 + }, + { + "epoch": 2.35651358950328, + "grad_norm": 56083.33984375, + "learning_rate": 3.037145973838999e-05, + "loss": 2.1433, + "step": 12572 + }, + { + "epoch": 2.356701030927835, + "grad_norm": 49681.6875, + "learning_rate": 3.0364232889261345e-05, + "loss": 2.0978, + "step": 12573 + }, + { + "epoch": 2.3568884723523897, + "grad_norm": 50852.73828125, + "learning_rate": 3.0357006525110727e-05, + "loss": 2.1334, + "step": 12574 + }, + { + "epoch": 2.357075913776945, + "grad_norm": 56863.8359375, + "learning_rate": 3.0349780646116543e-05, + "loss": 2.1542, + "step": 12575 + }, + { + "epoch": 2.3572633552014994, + "grad_norm": 52223.1953125, + "learning_rate": 3.0342555252457316e-05, + "loss": 2.1789, + "step": 12576 + }, + { + "epoch": 2.3574507966260545, + "grad_norm": 54227.8359375, + "learning_rate": 3.0335330344311486e-05, + "loss": 2.1426, + "step": 12577 + }, + { + "epoch": 2.357638238050609, + "grad_norm": 55358.609375, + "learning_rate": 3.032810592185751e-05, + "loss": 2.0959, + "step": 12578 + }, + { + "epoch": 2.357825679475164, + "grad_norm": 54617.36328125, + "learning_rate": 3.0320881985273797e-05, + "loss": 2.106, + "step": 12579 + }, + { + "epoch": 2.358013120899719, + "grad_norm": 56575.765625, + "learning_rate": 3.0313658534738794e-05, + "loss": 2.1177, + "step": 12580 + }, + { + "epoch": 2.3582005623242734, + "grad_norm": 53366.6640625, + "learning_rate": 3.0306435570430887e-05, + "loss": 2.1387, + "step": 12581 + }, + { + "epoch": 2.3583880037488285, + "grad_norm": 54568.921875, + "learning_rate": 3.0299213092528522e-05, + "loss": 2.1599, + "step": 12582 + }, + { + "epoch": 2.358575445173383, + "grad_norm": 55320.62890625, + "learning_rate": 3.0291991101210015e-05, + "loss": 2.1522, + "step": 12583 + }, + { + "epoch": 2.358762886597938, + "grad_norm": 51155.91015625, + "learning_rate": 3.028476959665379e-05, + "loss": 2.078, + "step": 12584 + }, + { + "epoch": 2.358950328022493, + "grad_norm": 53523.1875, + "learning_rate": 3.0277548579038205e-05, + "loss": 2.1135, + "step": 12585 + }, + { + "epoch": 2.359137769447048, + "grad_norm": 55105.4296875, + "learning_rate": 3.0270328048541585e-05, + "loss": 2.212, + "step": 12586 + }, + { + "epoch": 2.3593252108716025, + "grad_norm": 52851.91796875, + "learning_rate": 3.0263108005342288e-05, + "loss": 2.1431, + "step": 12587 + }, + { + "epoch": 2.3595126522961576, + "grad_norm": 53785.33984375, + "learning_rate": 3.025588844961862e-05, + "loss": 2.2516, + "step": 12588 + }, + { + "epoch": 2.359700093720712, + "grad_norm": 52880.19921875, + "learning_rate": 3.0248669381548945e-05, + "loss": 2.1147, + "step": 12589 + }, + { + "epoch": 2.3598875351452673, + "grad_norm": 57078.65234375, + "learning_rate": 3.024145080131149e-05, + "loss": 2.1092, + "step": 12590 + }, + { + "epoch": 2.360074976569822, + "grad_norm": 51822.26171875, + "learning_rate": 3.02342327090846e-05, + "loss": 2.1029, + "step": 12591 + }, + { + "epoch": 2.3602624179943765, + "grad_norm": 57573.98828125, + "learning_rate": 3.022701510504653e-05, + "loss": 2.0515, + "step": 12592 + }, + { + "epoch": 2.3604498594189316, + "grad_norm": 48907.578125, + "learning_rate": 3.0219797989375564e-05, + "loss": 2.1096, + "step": 12593 + }, + { + "epoch": 2.360637300843486, + "grad_norm": 56318.5859375, + "learning_rate": 3.0212581362249924e-05, + "loss": 2.0793, + "step": 12594 + }, + { + "epoch": 2.3608247422680413, + "grad_norm": 57659.4609375, + "learning_rate": 3.0205365223847882e-05, + "loss": 2.1062, + "step": 12595 + }, + { + "epoch": 2.361012183692596, + "grad_norm": 63384.00390625, + "learning_rate": 3.0198149574347633e-05, + "loss": 2.1657, + "step": 12596 + }, + { + "epoch": 2.361199625117151, + "grad_norm": 52076.73046875, + "learning_rate": 3.0190934413927456e-05, + "loss": 2.0923, + "step": 12597 + }, + { + "epoch": 2.3613870665417056, + "grad_norm": 56887.4375, + "learning_rate": 3.018371974276548e-05, + "loss": 2.1715, + "step": 12598 + }, + { + "epoch": 2.3615745079662607, + "grad_norm": 56861.45703125, + "learning_rate": 3.017650556103995e-05, + "loss": 2.0819, + "step": 12599 + }, + { + "epoch": 2.3617619493908153, + "grad_norm": 53298.25, + "learning_rate": 3.0169291868929038e-05, + "loss": 2.112, + "step": 12600 + }, + { + "epoch": 2.3619493908153704, + "grad_norm": 55804.5859375, + "learning_rate": 3.0162078666610894e-05, + "loss": 2.1324, + "step": 12601 + }, + { + "epoch": 2.362136832239925, + "grad_norm": 52369.015625, + "learning_rate": 3.015486595426369e-05, + "loss": 2.137, + "step": 12602 + }, + { + "epoch": 2.3623242736644796, + "grad_norm": 52828.9140625, + "learning_rate": 3.0147653732065556e-05, + "loss": 2.1201, + "step": 12603 + }, + { + "epoch": 2.3625117150890347, + "grad_norm": 53075.78125, + "learning_rate": 3.0140442000194667e-05, + "loss": 2.1484, + "step": 12604 + }, + { + "epoch": 2.3626991565135897, + "grad_norm": 51947.6640625, + "learning_rate": 3.013323075882908e-05, + "loss": 2.0932, + "step": 12605 + }, + { + "epoch": 2.3628865979381444, + "grad_norm": 57071.1328125, + "learning_rate": 3.012602000814695e-05, + "loss": 2.1508, + "step": 12606 + }, + { + "epoch": 2.363074039362699, + "grad_norm": 54853.20703125, + "learning_rate": 3.011880974832635e-05, + "loss": 2.1649, + "step": 12607 + }, + { + "epoch": 2.363261480787254, + "grad_norm": 52668.90234375, + "learning_rate": 3.011159997954539e-05, + "loss": 2.1833, + "step": 12608 + }, + { + "epoch": 2.3634489222118087, + "grad_norm": 54310.41015625, + "learning_rate": 3.010439070198211e-05, + "loss": 2.1497, + "step": 12609 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 55734.171875, + "learning_rate": 3.0097181915814586e-05, + "loss": 2.1182, + "step": 12610 + }, + { + "epoch": 2.3638238050609184, + "grad_norm": 51408.92578125, + "learning_rate": 3.0089973621220856e-05, + "loss": 2.1115, + "step": 12611 + }, + { + "epoch": 2.3640112464854734, + "grad_norm": 57612.296875, + "learning_rate": 3.0082765818379e-05, + "loss": 2.1086, + "step": 12612 + }, + { + "epoch": 2.364198687910028, + "grad_norm": 55350.0625, + "learning_rate": 3.0075558507466972e-05, + "loss": 2.1571, + "step": 12613 + }, + { + "epoch": 2.3643861293345827, + "grad_norm": 56719.859375, + "learning_rate": 3.0068351688662822e-05, + "loss": 2.0531, + "step": 12614 + }, + { + "epoch": 2.3645735707591378, + "grad_norm": 52790.37109375, + "learning_rate": 3.0061145362144548e-05, + "loss": 2.1164, + "step": 12615 + }, + { + "epoch": 2.364761012183693, + "grad_norm": 58621.89453125, + "learning_rate": 3.0053939528090135e-05, + "loss": 2.0702, + "step": 12616 + }, + { + "epoch": 2.3649484536082475, + "grad_norm": 52719.53125, + "learning_rate": 3.004673418667755e-05, + "loss": 2.131, + "step": 12617 + }, + { + "epoch": 2.365135895032802, + "grad_norm": 49349.82421875, + "learning_rate": 3.003952933808475e-05, + "loss": 2.1373, + "step": 12618 + }, + { + "epoch": 2.365323336457357, + "grad_norm": 51364.55859375, + "learning_rate": 3.0032324982489724e-05, + "loss": 2.1231, + "step": 12619 + }, + { + "epoch": 2.3655107778819118, + "grad_norm": 52192.90625, + "learning_rate": 3.0025121120070365e-05, + "loss": 2.2181, + "step": 12620 + }, + { + "epoch": 2.365698219306467, + "grad_norm": 51734.7109375, + "learning_rate": 3.001791775100461e-05, + "loss": 2.1076, + "step": 12621 + }, + { + "epoch": 2.3658856607310215, + "grad_norm": 55025.5390625, + "learning_rate": 3.0010714875470387e-05, + "loss": 2.087, + "step": 12622 + }, + { + "epoch": 2.3660731021555765, + "grad_norm": 54178.56640625, + "learning_rate": 3.0003512493645595e-05, + "loss": 2.1634, + "step": 12623 + }, + { + "epoch": 2.366260543580131, + "grad_norm": 57987.421875, + "learning_rate": 2.999631060570811e-05, + "loss": 2.1386, + "step": 12624 + }, + { + "epoch": 2.3664479850046862, + "grad_norm": 53591.47265625, + "learning_rate": 2.9989109211835827e-05, + "loss": 2.1852, + "step": 12625 + }, + { + "epoch": 2.366635426429241, + "grad_norm": 49453.80078125, + "learning_rate": 2.9981908312206604e-05, + "loss": 2.1051, + "step": 12626 + }, + { + "epoch": 2.366822867853796, + "grad_norm": 57295.08984375, + "learning_rate": 2.9974707906998284e-05, + "loss": 2.1489, + "step": 12627 + }, + { + "epoch": 2.3670103092783505, + "grad_norm": 56022.75, + "learning_rate": 2.9967507996388705e-05, + "loss": 2.0875, + "step": 12628 + }, + { + "epoch": 2.367197750702905, + "grad_norm": 56109.20703125, + "learning_rate": 2.9960308580555718e-05, + "loss": 2.1807, + "step": 12629 + }, + { + "epoch": 2.3673851921274602, + "grad_norm": 53394.61328125, + "learning_rate": 2.995310965967713e-05, + "loss": 2.1081, + "step": 12630 + }, + { + "epoch": 2.367572633552015, + "grad_norm": 49502.8515625, + "learning_rate": 2.994591123393074e-05, + "loss": 2.0751, + "step": 12631 + }, + { + "epoch": 2.36776007497657, + "grad_norm": 56203.22265625, + "learning_rate": 2.993871330349434e-05, + "loss": 2.0978, + "step": 12632 + }, + { + "epoch": 2.3679475164011246, + "grad_norm": 54610.73828125, + "learning_rate": 2.993151586854571e-05, + "loss": 2.1448, + "step": 12633 + }, + { + "epoch": 2.3681349578256796, + "grad_norm": 53648.6328125, + "learning_rate": 2.9924318929262628e-05, + "loss": 2.1042, + "step": 12634 + }, + { + "epoch": 2.3683223992502342, + "grad_norm": 52417.30078125, + "learning_rate": 2.9917122485822834e-05, + "loss": 2.2001, + "step": 12635 + }, + { + "epoch": 2.3685098406747893, + "grad_norm": 55808.03125, + "learning_rate": 2.9909926538404066e-05, + "loss": 2.1044, + "step": 12636 + }, + { + "epoch": 2.368697282099344, + "grad_norm": 51303.75, + "learning_rate": 2.990273108718407e-05, + "loss": 2.1498, + "step": 12637 + }, + { + "epoch": 2.368884723523899, + "grad_norm": 54535.8828125, + "learning_rate": 2.9895536132340574e-05, + "loss": 2.0731, + "step": 12638 + }, + { + "epoch": 2.3690721649484536, + "grad_norm": 51927.81640625, + "learning_rate": 2.9888341674051257e-05, + "loss": 2.1392, + "step": 12639 + }, + { + "epoch": 2.3692596063730083, + "grad_norm": 53444.6640625, + "learning_rate": 2.9881147712493828e-05, + "loss": 2.1377, + "step": 12640 + }, + { + "epoch": 2.3694470477975633, + "grad_norm": 56705.859375, + "learning_rate": 2.987395424784598e-05, + "loss": 2.0917, + "step": 12641 + }, + { + "epoch": 2.369634489222118, + "grad_norm": 51257.296875, + "learning_rate": 2.986676128028535e-05, + "loss": 2.0701, + "step": 12642 + }, + { + "epoch": 2.369821930646673, + "grad_norm": 54881.30078125, + "learning_rate": 2.985956880998961e-05, + "loss": 2.162, + "step": 12643 + }, + { + "epoch": 2.3700093720712276, + "grad_norm": 61339.12109375, + "learning_rate": 2.9852376837136414e-05, + "loss": 2.074, + "step": 12644 + }, + { + "epoch": 2.3701968134957827, + "grad_norm": 51333.12109375, + "learning_rate": 2.9845185361903398e-05, + "loss": 2.1356, + "step": 12645 + }, + { + "epoch": 2.3703842549203373, + "grad_norm": 54984.140625, + "learning_rate": 2.9837994384468165e-05, + "loss": 2.1568, + "step": 12646 + }, + { + "epoch": 2.3705716963448924, + "grad_norm": 53842.32421875, + "learning_rate": 2.983080390500833e-05, + "loss": 2.1523, + "step": 12647 + }, + { + "epoch": 2.370759137769447, + "grad_norm": 53915.25390625, + "learning_rate": 2.9823613923701486e-05, + "loss": 2.1508, + "step": 12648 + }, + { + "epoch": 2.370946579194002, + "grad_norm": 52410.64453125, + "learning_rate": 2.9816424440725232e-05, + "loss": 2.1377, + "step": 12649 + }, + { + "epoch": 2.3711340206185567, + "grad_norm": 53996.8515625, + "learning_rate": 2.9809235456257118e-05, + "loss": 2.0666, + "step": 12650 + }, + { + "epoch": 2.3713214620431113, + "grad_norm": 52414.125, + "learning_rate": 2.9802046970474694e-05, + "loss": 2.1609, + "step": 12651 + }, + { + "epoch": 2.3715089034676664, + "grad_norm": 56085.85546875, + "learning_rate": 2.979485898355554e-05, + "loss": 2.1119, + "step": 12652 + }, + { + "epoch": 2.371696344892221, + "grad_norm": 51627.41796875, + "learning_rate": 2.978767149567718e-05, + "loss": 2.0757, + "step": 12653 + }, + { + "epoch": 2.371883786316776, + "grad_norm": 54844.38671875, + "learning_rate": 2.9780484507017114e-05, + "loss": 2.16, + "step": 12654 + }, + { + "epoch": 2.3720712277413307, + "grad_norm": 50346.87890625, + "learning_rate": 2.977329801775287e-05, + "loss": 2.1382, + "step": 12655 + }, + { + "epoch": 2.372258669165886, + "grad_norm": 53231.6796875, + "learning_rate": 2.976611202806195e-05, + "loss": 2.1334, + "step": 12656 + }, + { + "epoch": 2.3724461105904404, + "grad_norm": 57681.9140625, + "learning_rate": 2.975892653812182e-05, + "loss": 2.095, + "step": 12657 + }, + { + "epoch": 2.3726335520149955, + "grad_norm": 53594.96875, + "learning_rate": 2.9751741548109945e-05, + "loss": 2.1598, + "step": 12658 + }, + { + "epoch": 2.37282099343955, + "grad_norm": 56921.00390625, + "learning_rate": 2.974455705820382e-05, + "loss": 2.1373, + "step": 12659 + }, + { + "epoch": 2.373008434864105, + "grad_norm": 54389.62109375, + "learning_rate": 2.973737306858088e-05, + "loss": 2.1533, + "step": 12660 + }, + { + "epoch": 2.37319587628866, + "grad_norm": 50578.015625, + "learning_rate": 2.9730189579418544e-05, + "loss": 2.1066, + "step": 12661 + }, + { + "epoch": 2.3733833177132144, + "grad_norm": 54532.0546875, + "learning_rate": 2.9723006590894253e-05, + "loss": 2.1954, + "step": 12662 + }, + { + "epoch": 2.3735707591377695, + "grad_norm": 52753.23046875, + "learning_rate": 2.9715824103185398e-05, + "loss": 2.1871, + "step": 12663 + }, + { + "epoch": 2.373758200562324, + "grad_norm": 56944.94140625, + "learning_rate": 2.970864211646941e-05, + "loss": 2.1864, + "step": 12664 + }, + { + "epoch": 2.373945641986879, + "grad_norm": 49964.66796875, + "learning_rate": 2.970146063092363e-05, + "loss": 2.1494, + "step": 12665 + }, + { + "epoch": 2.374133083411434, + "grad_norm": 52971.47265625, + "learning_rate": 2.9694279646725455e-05, + "loss": 2.1654, + "step": 12666 + }, + { + "epoch": 2.374320524835989, + "grad_norm": 55349.86328125, + "learning_rate": 2.968709916405228e-05, + "loss": 2.137, + "step": 12667 + }, + { + "epoch": 2.3745079662605435, + "grad_norm": 56522.80859375, + "learning_rate": 2.967991918308138e-05, + "loss": 2.1737, + "step": 12668 + }, + { + "epoch": 2.3746954076850986, + "grad_norm": 53119.0078125, + "learning_rate": 2.9672739703990144e-05, + "loss": 2.1587, + "step": 12669 + }, + { + "epoch": 2.374882849109653, + "grad_norm": 51995.80078125, + "learning_rate": 2.9665560726955883e-05, + "loss": 2.1576, + "step": 12670 + }, + { + "epoch": 2.3750702905342083, + "grad_norm": 56891.703125, + "learning_rate": 2.9658382252155913e-05, + "loss": 2.108, + "step": 12671 + }, + { + "epoch": 2.375257731958763, + "grad_norm": 53515.73828125, + "learning_rate": 2.965120427976753e-05, + "loss": 2.1056, + "step": 12672 + }, + { + "epoch": 2.3754451733833175, + "grad_norm": 52539.34375, + "learning_rate": 2.9644026809968006e-05, + "loss": 2.1181, + "step": 12673 + }, + { + "epoch": 2.3756326148078726, + "grad_norm": 54812.98046875, + "learning_rate": 2.963684984293462e-05, + "loss": 2.1178, + "step": 12674 + }, + { + "epoch": 2.375820056232427, + "grad_norm": 58301.97265625, + "learning_rate": 2.962967337884468e-05, + "loss": 2.133, + "step": 12675 + }, + { + "epoch": 2.3760074976569823, + "grad_norm": 53710.9140625, + "learning_rate": 2.962249741787536e-05, + "loss": 2.1703, + "step": 12676 + }, + { + "epoch": 2.376194939081537, + "grad_norm": 52806.81640625, + "learning_rate": 2.9615321960203947e-05, + "loss": 2.1223, + "step": 12677 + }, + { + "epoch": 2.376382380506092, + "grad_norm": 55779.5390625, + "learning_rate": 2.9608147006007653e-05, + "loss": 2.1741, + "step": 12678 + }, + { + "epoch": 2.3765698219306466, + "grad_norm": 53742.25, + "learning_rate": 2.9600972555463702e-05, + "loss": 2.1421, + "step": 12679 + }, + { + "epoch": 2.3767572633552017, + "grad_norm": 56395.83984375, + "learning_rate": 2.9593798608749263e-05, + "loss": 2.1918, + "step": 12680 + }, + { + "epoch": 2.3769447047797563, + "grad_norm": 49895.39453125, + "learning_rate": 2.9586625166041538e-05, + "loss": 2.1891, + "step": 12681 + }, + { + "epoch": 2.3771321462043113, + "grad_norm": 55586.54296875, + "learning_rate": 2.9579452227517734e-05, + "loss": 2.0847, + "step": 12682 + }, + { + "epoch": 2.377319587628866, + "grad_norm": 52583.0390625, + "learning_rate": 2.9572279793354952e-05, + "loss": 2.1931, + "step": 12683 + }, + { + "epoch": 2.3775070290534206, + "grad_norm": 52340.16796875, + "learning_rate": 2.956510786373038e-05, + "loss": 2.1059, + "step": 12684 + }, + { + "epoch": 2.3776944704779757, + "grad_norm": 51561.87109375, + "learning_rate": 2.9557936438821155e-05, + "loss": 2.1124, + "step": 12685 + }, + { + "epoch": 2.3778819119025303, + "grad_norm": 54768.87890625, + "learning_rate": 2.9550765518804407e-05, + "loss": 2.1298, + "step": 12686 + }, + { + "epoch": 2.3780693533270854, + "grad_norm": 51421.4296875, + "learning_rate": 2.9543595103857226e-05, + "loss": 2.15, + "step": 12687 + }, + { + "epoch": 2.37825679475164, + "grad_norm": 55795.828125, + "learning_rate": 2.9536425194156725e-05, + "loss": 2.0839, + "step": 12688 + }, + { + "epoch": 2.378444236176195, + "grad_norm": 53574.96484375, + "learning_rate": 2.9529255789879984e-05, + "loss": 2.1156, + "step": 12689 + }, + { + "epoch": 2.3786316776007497, + "grad_norm": 57631.921875, + "learning_rate": 2.9522086891204108e-05, + "loss": 2.056, + "step": 12690 + }, + { + "epoch": 2.3788191190253047, + "grad_norm": 51624.34765625, + "learning_rate": 2.951491849830611e-05, + "loss": 2.1472, + "step": 12691 + }, + { + "epoch": 2.3790065604498594, + "grad_norm": 61904.203125, + "learning_rate": 2.950775061136307e-05, + "loss": 2.0664, + "step": 12692 + }, + { + "epoch": 2.3791940018744144, + "grad_norm": 51009.91015625, + "learning_rate": 2.9500583230552025e-05, + "loss": 2.1426, + "step": 12693 + }, + { + "epoch": 2.379381443298969, + "grad_norm": 55286.47265625, + "learning_rate": 2.9493416356050006e-05, + "loss": 2.123, + "step": 12694 + }, + { + "epoch": 2.3795688847235237, + "grad_norm": 54598.5625, + "learning_rate": 2.948624998803401e-05, + "loss": 2.0646, + "step": 12695 + }, + { + "epoch": 2.3797563261480787, + "grad_norm": 53291.7578125, + "learning_rate": 2.9479084126681028e-05, + "loss": 2.1855, + "step": 12696 + }, + { + "epoch": 2.3799437675726334, + "grad_norm": 59341.04296875, + "learning_rate": 2.9471918772168093e-05, + "loss": 2.1632, + "step": 12697 + }, + { + "epoch": 2.3801312089971884, + "grad_norm": 56731.2890625, + "learning_rate": 2.9464753924672117e-05, + "loss": 2.165, + "step": 12698 + }, + { + "epoch": 2.380318650421743, + "grad_norm": 56779.046875, + "learning_rate": 2.945758958437011e-05, + "loss": 2.1618, + "step": 12699 + }, + { + "epoch": 2.380506091846298, + "grad_norm": 53890.16796875, + "learning_rate": 2.9450425751439005e-05, + "loss": 2.1658, + "step": 12700 + }, + { + "epoch": 2.3806935332708528, + "grad_norm": 51354.21875, + "learning_rate": 2.9443262426055752e-05, + "loss": 2.1246, + "step": 12701 + }, + { + "epoch": 2.380880974695408, + "grad_norm": 52252.8359375, + "learning_rate": 2.943609960839725e-05, + "loss": 2.1432, + "step": 12702 + }, + { + "epoch": 2.3810684161199624, + "grad_norm": 56020.1875, + "learning_rate": 2.942893729864043e-05, + "loss": 2.095, + "step": 12703 + }, + { + "epoch": 2.3812558575445175, + "grad_norm": 56236.65625, + "learning_rate": 2.9421775496962177e-05, + "loss": 2.1841, + "step": 12704 + }, + { + "epoch": 2.381443298969072, + "grad_norm": 52536.14453125, + "learning_rate": 2.941461420353942e-05, + "loss": 2.1392, + "step": 12705 + }, + { + "epoch": 2.3816307403936268, + "grad_norm": 56103.55078125, + "learning_rate": 2.9407453418548968e-05, + "loss": 2.0959, + "step": 12706 + }, + { + "epoch": 2.381818181818182, + "grad_norm": 50332.08203125, + "learning_rate": 2.9400293142167736e-05, + "loss": 2.1352, + "step": 12707 + }, + { + "epoch": 2.3820056232427365, + "grad_norm": 52638.15625, + "learning_rate": 2.939313337457255e-05, + "loss": 2.0895, + "step": 12708 + }, + { + "epoch": 2.3821930646672915, + "grad_norm": 53382.65625, + "learning_rate": 2.9385974115940262e-05, + "loss": 2.0936, + "step": 12709 + }, + { + "epoch": 2.382380506091846, + "grad_norm": 55759.875, + "learning_rate": 2.937881536644768e-05, + "loss": 2.2176, + "step": 12710 + }, + { + "epoch": 2.382567947516401, + "grad_norm": 53137.42578125, + "learning_rate": 2.9371657126271614e-05, + "loss": 2.0627, + "step": 12711 + }, + { + "epoch": 2.382755388940956, + "grad_norm": 54231.08984375, + "learning_rate": 2.9364499395588897e-05, + "loss": 2.1254, + "step": 12712 + }, + { + "epoch": 2.382942830365511, + "grad_norm": 54176.92578125, + "learning_rate": 2.9357342174576275e-05, + "loss": 2.1451, + "step": 12713 + }, + { + "epoch": 2.3831302717900655, + "grad_norm": 52235.88671875, + "learning_rate": 2.9350185463410524e-05, + "loss": 2.0985, + "step": 12714 + }, + { + "epoch": 2.3833177132146206, + "grad_norm": 54415.046875, + "learning_rate": 2.934302926226843e-05, + "loss": 2.0822, + "step": 12715 + }, + { + "epoch": 2.3835051546391752, + "grad_norm": 54632.17578125, + "learning_rate": 2.9335873571326743e-05, + "loss": 2.0984, + "step": 12716 + }, + { + "epoch": 2.38369259606373, + "grad_norm": 55296.30859375, + "learning_rate": 2.932871839076218e-05, + "loss": 2.174, + "step": 12717 + }, + { + "epoch": 2.383880037488285, + "grad_norm": 49831.3203125, + "learning_rate": 2.9321563720751468e-05, + "loss": 2.1604, + "step": 12718 + }, + { + "epoch": 2.3840674789128395, + "grad_norm": 56062.0625, + "learning_rate": 2.9314409561471313e-05, + "loss": 2.1559, + "step": 12719 + }, + { + "epoch": 2.3842549203373946, + "grad_norm": 54554.359375, + "learning_rate": 2.930725591309845e-05, + "loss": 2.1086, + "step": 12720 + }, + { + "epoch": 2.3844423617619492, + "grad_norm": 57729.48828125, + "learning_rate": 2.9300102775809507e-05, + "loss": 2.122, + "step": 12721 + }, + { + "epoch": 2.3846298031865043, + "grad_norm": 53491.296875, + "learning_rate": 2.92929501497812e-05, + "loss": 2.182, + "step": 12722 + }, + { + "epoch": 2.384817244611059, + "grad_norm": 52636.3203125, + "learning_rate": 2.928579803519019e-05, + "loss": 2.1305, + "step": 12723 + }, + { + "epoch": 2.385004686035614, + "grad_norm": 58537.13671875, + "learning_rate": 2.9278646432213097e-05, + "loss": 2.0822, + "step": 12724 + }, + { + "epoch": 2.3851921274601686, + "grad_norm": 56965.4296875, + "learning_rate": 2.9271495341026576e-05, + "loss": 2.1174, + "step": 12725 + }, + { + "epoch": 2.3853795688847237, + "grad_norm": 50638.3203125, + "learning_rate": 2.9264344761807237e-05, + "loss": 2.1289, + "step": 12726 + }, + { + "epoch": 2.3855670103092783, + "grad_norm": 50928.3984375, + "learning_rate": 2.9257194694731716e-05, + "loss": 2.1283, + "step": 12727 + }, + { + "epoch": 2.385754451733833, + "grad_norm": 50294.03125, + "learning_rate": 2.9250045139976585e-05, + "loss": 2.1355, + "step": 12728 + }, + { + "epoch": 2.385941893158388, + "grad_norm": 57486.68359375, + "learning_rate": 2.924289609771842e-05, + "loss": 2.1063, + "step": 12729 + }, + { + "epoch": 2.386129334582943, + "grad_norm": 55103.40625, + "learning_rate": 2.923574756813382e-05, + "loss": 2.1664, + "step": 12730 + }, + { + "epoch": 2.3863167760074977, + "grad_norm": 53299.84765625, + "learning_rate": 2.922859955139935e-05, + "loss": 2.1227, + "step": 12731 + }, + { + "epoch": 2.3865042174320523, + "grad_norm": 55606.65625, + "learning_rate": 2.9221452047691523e-05, + "loss": 2.1145, + "step": 12732 + }, + { + "epoch": 2.3866916588566074, + "grad_norm": 50264.1484375, + "learning_rate": 2.9214305057186897e-05, + "loss": 2.1234, + "step": 12733 + }, + { + "epoch": 2.386879100281162, + "grad_norm": 54381.828125, + "learning_rate": 2.920715858006197e-05, + "loss": 2.2117, + "step": 12734 + }, + { + "epoch": 2.387066541705717, + "grad_norm": 54041.265625, + "learning_rate": 2.92000126164933e-05, + "loss": 2.1512, + "step": 12735 + }, + { + "epoch": 2.3872539831302717, + "grad_norm": 50800.63671875, + "learning_rate": 2.9192867166657323e-05, + "loss": 2.1624, + "step": 12736 + }, + { + "epoch": 2.3874414245548268, + "grad_norm": 54152.83203125, + "learning_rate": 2.918572223073056e-05, + "loss": 2.0789, + "step": 12737 + }, + { + "epoch": 2.3876288659793814, + "grad_norm": 49713.8671875, + "learning_rate": 2.9178577808889484e-05, + "loss": 2.114, + "step": 12738 + }, + { + "epoch": 2.387816307403936, + "grad_norm": 63041.46875, + "learning_rate": 2.9171433901310534e-05, + "loss": 2.144, + "step": 12739 + }, + { + "epoch": 2.388003748828491, + "grad_norm": 54900.25, + "learning_rate": 2.9164290508170143e-05, + "loss": 2.1547, + "step": 12740 + }, + { + "epoch": 2.388191190253046, + "grad_norm": 55711.328125, + "learning_rate": 2.9157147629644765e-05, + "loss": 2.1591, + "step": 12741 + }, + { + "epoch": 2.388378631677601, + "grad_norm": 52870.328125, + "learning_rate": 2.915000526591084e-05, + "loss": 2.1, + "step": 12742 + }, + { + "epoch": 2.3885660731021554, + "grad_norm": 53750.59375, + "learning_rate": 2.914286341714475e-05, + "loss": 2.1726, + "step": 12743 + }, + { + "epoch": 2.3887535145267105, + "grad_norm": 53601.50390625, + "learning_rate": 2.9135722083522876e-05, + "loss": 2.1577, + "step": 12744 + }, + { + "epoch": 2.388940955951265, + "grad_norm": 53178.078125, + "learning_rate": 2.9128581265221634e-05, + "loss": 2.1537, + "step": 12745 + }, + { + "epoch": 2.38912839737582, + "grad_norm": 50475.234375, + "learning_rate": 2.912144096241737e-05, + "loss": 2.1215, + "step": 12746 + }, + { + "epoch": 2.389315838800375, + "grad_norm": 50480.9609375, + "learning_rate": 2.911430117528643e-05, + "loss": 2.1258, + "step": 12747 + }, + { + "epoch": 2.38950328022493, + "grad_norm": 52455.39453125, + "learning_rate": 2.9107161904005164e-05, + "loss": 2.1772, + "step": 12748 + }, + { + "epoch": 2.3896907216494845, + "grad_norm": 52612.28515625, + "learning_rate": 2.9100023148749938e-05, + "loss": 2.1903, + "step": 12749 + }, + { + "epoch": 2.3898781630740396, + "grad_norm": 56559.25390625, + "learning_rate": 2.9092884909697038e-05, + "loss": 2.1413, + "step": 12750 + }, + { + "epoch": 2.390065604498594, + "grad_norm": 55486.82421875, + "learning_rate": 2.908574718702276e-05, + "loss": 2.1698, + "step": 12751 + }, + { + "epoch": 2.3902530459231492, + "grad_norm": 53816.79296875, + "learning_rate": 2.9078609980903426e-05, + "loss": 2.1064, + "step": 12752 + }, + { + "epoch": 2.390440487347704, + "grad_norm": 55016.578125, + "learning_rate": 2.9071473291515307e-05, + "loss": 2.1146, + "step": 12753 + }, + { + "epoch": 2.3906279287722585, + "grad_norm": 51966.5546875, + "learning_rate": 2.9064337119034635e-05, + "loss": 2.1291, + "step": 12754 + }, + { + "epoch": 2.3908153701968136, + "grad_norm": 50887.83984375, + "learning_rate": 2.90572014636377e-05, + "loss": 2.1305, + "step": 12755 + }, + { + "epoch": 2.391002811621368, + "grad_norm": 55738.515625, + "learning_rate": 2.9050066325500754e-05, + "loss": 2.1177, + "step": 12756 + }, + { + "epoch": 2.3911902530459233, + "grad_norm": 56532.4609375, + "learning_rate": 2.9042931704800002e-05, + "loss": 2.1361, + "step": 12757 + }, + { + "epoch": 2.391377694470478, + "grad_norm": 54815.45703125, + "learning_rate": 2.9035797601711656e-05, + "loss": 2.1155, + "step": 12758 + }, + { + "epoch": 2.391565135895033, + "grad_norm": 52484.828125, + "learning_rate": 2.9028664016411945e-05, + "loss": 2.1002, + "step": 12759 + }, + { + "epoch": 2.3917525773195876, + "grad_norm": 51552.57421875, + "learning_rate": 2.902153094907702e-05, + "loss": 2.1845, + "step": 12760 + }, + { + "epoch": 2.3919400187441426, + "grad_norm": 57463.42578125, + "learning_rate": 2.90143983998831e-05, + "loss": 2.2125, + "step": 12761 + }, + { + "epoch": 2.3921274601686973, + "grad_norm": 55200.2421875, + "learning_rate": 2.9007266369006335e-05, + "loss": 2.1603, + "step": 12762 + }, + { + "epoch": 2.3923149015932523, + "grad_norm": 49569.20703125, + "learning_rate": 2.900013485662285e-05, + "loss": 2.0812, + "step": 12763 + }, + { + "epoch": 2.392502343017807, + "grad_norm": 56093.40625, + "learning_rate": 2.8993003862908828e-05, + "loss": 2.0687, + "step": 12764 + }, + { + "epoch": 2.3926897844423616, + "grad_norm": 51802.91015625, + "learning_rate": 2.8985873388040365e-05, + "loss": 2.1217, + "step": 12765 + }, + { + "epoch": 2.3928772258669166, + "grad_norm": 50514.9765625, + "learning_rate": 2.897874343219359e-05, + "loss": 2.1094, + "step": 12766 + }, + { + "epoch": 2.3930646672914713, + "grad_norm": 52471.3984375, + "learning_rate": 2.8971613995544584e-05, + "loss": 2.0911, + "step": 12767 + }, + { + "epoch": 2.3932521087160263, + "grad_norm": 57077.3828125, + "learning_rate": 2.8964485078269465e-05, + "loss": 2.1719, + "step": 12768 + }, + { + "epoch": 2.393439550140581, + "grad_norm": 50213.46875, + "learning_rate": 2.89573566805443e-05, + "loss": 2.129, + "step": 12769 + }, + { + "epoch": 2.393626991565136, + "grad_norm": 55487.46875, + "learning_rate": 2.8950228802545116e-05, + "loss": 2.1978, + "step": 12770 + }, + { + "epoch": 2.3938144329896907, + "grad_norm": 52596.01953125, + "learning_rate": 2.894310144444799e-05, + "loss": 2.1642, + "step": 12771 + }, + { + "epoch": 2.3940018744142457, + "grad_norm": 51444.96875, + "learning_rate": 2.8935974606428977e-05, + "loss": 2.1571, + "step": 12772 + }, + { + "epoch": 2.3941893158388003, + "grad_norm": 53828.359375, + "learning_rate": 2.892884828866408e-05, + "loss": 2.0635, + "step": 12773 + }, + { + "epoch": 2.3943767572633554, + "grad_norm": 53698.48828125, + "learning_rate": 2.8921722491329296e-05, + "loss": 2.1556, + "step": 12774 + }, + { + "epoch": 2.39456419868791, + "grad_norm": 53430.83203125, + "learning_rate": 2.8914597214600657e-05, + "loss": 2.1869, + "step": 12775 + }, + { + "epoch": 2.3947516401124647, + "grad_norm": 51735.359375, + "learning_rate": 2.890747245865413e-05, + "loss": 2.1721, + "step": 12776 + }, + { + "epoch": 2.3949390815370197, + "grad_norm": 51976.0859375, + "learning_rate": 2.8900348223665663e-05, + "loss": 2.1751, + "step": 12777 + }, + { + "epoch": 2.3951265229615744, + "grad_norm": 53107.89453125, + "learning_rate": 2.8893224509811234e-05, + "loss": 2.1108, + "step": 12778 + }, + { + "epoch": 2.3953139643861294, + "grad_norm": 56099.99609375, + "learning_rate": 2.8886101317266843e-05, + "loss": 2.1288, + "step": 12779 + }, + { + "epoch": 2.395501405810684, + "grad_norm": 55362.28515625, + "learning_rate": 2.8878978646208333e-05, + "loss": 2.0877, + "step": 12780 + }, + { + "epoch": 2.395688847235239, + "grad_norm": 54252.63671875, + "learning_rate": 2.8871856496811668e-05, + "loss": 2.1433, + "step": 12781 + }, + { + "epoch": 2.3958762886597937, + "grad_norm": 53682.609375, + "learning_rate": 2.8864734869252767e-05, + "loss": 2.1684, + "step": 12782 + }, + { + "epoch": 2.396063730084349, + "grad_norm": 55340.69921875, + "learning_rate": 2.885761376370752e-05, + "loss": 2.1134, + "step": 12783 + }, + { + "epoch": 2.3962511715089034, + "grad_norm": 53481.54296875, + "learning_rate": 2.8850493180351778e-05, + "loss": 2.1181, + "step": 12784 + }, + { + "epoch": 2.3964386129334585, + "grad_norm": 51871.28515625, + "learning_rate": 2.8843373119361455e-05, + "loss": 2.1367, + "step": 12785 + }, + { + "epoch": 2.396626054358013, + "grad_norm": 49353.19140625, + "learning_rate": 2.8836253580912365e-05, + "loss": 2.1379, + "step": 12786 + }, + { + "epoch": 2.3968134957825677, + "grad_norm": 50479.78515625, + "learning_rate": 2.8829134565180394e-05, + "loss": 2.1612, + "step": 12787 + }, + { + "epoch": 2.397000937207123, + "grad_norm": 53033.2890625, + "learning_rate": 2.882201607234133e-05, + "loss": 2.1519, + "step": 12788 + }, + { + "epoch": 2.3971883786316774, + "grad_norm": 52544.56640625, + "learning_rate": 2.881489810257103e-05, + "loss": 2.1017, + "step": 12789 + }, + { + "epoch": 2.3973758200562325, + "grad_norm": 50999.51171875, + "learning_rate": 2.8807780656045257e-05, + "loss": 2.079, + "step": 12790 + }, + { + "epoch": 2.397563261480787, + "grad_norm": 52143.9765625, + "learning_rate": 2.8800663732939858e-05, + "loss": 2.1466, + "step": 12791 + }, + { + "epoch": 2.397750702905342, + "grad_norm": 54406.46484375, + "learning_rate": 2.879354733343057e-05, + "loss": 2.1663, + "step": 12792 + }, + { + "epoch": 2.397938144329897, + "grad_norm": 53393.4921875, + "learning_rate": 2.8786431457693146e-05, + "loss": 2.1635, + "step": 12793 + }, + { + "epoch": 2.398125585754452, + "grad_norm": 51967.0546875, + "learning_rate": 2.8779316105903385e-05, + "loss": 2.1819, + "step": 12794 + }, + { + "epoch": 2.3983130271790065, + "grad_norm": 60814.21875, + "learning_rate": 2.877220127823699e-05, + "loss": 2.1844, + "step": 12795 + }, + { + "epoch": 2.3985004686035616, + "grad_norm": 53980.2734375, + "learning_rate": 2.876508697486971e-05, + "loss": 2.1848, + "step": 12796 + }, + { + "epoch": 2.398687910028116, + "grad_norm": 52709.6796875, + "learning_rate": 2.8757973195977235e-05, + "loss": 2.0698, + "step": 12797 + }, + { + "epoch": 2.398875351452671, + "grad_norm": 55980.48046875, + "learning_rate": 2.8750859941735297e-05, + "loss": 2.1158, + "step": 12798 + }, + { + "epoch": 2.399062792877226, + "grad_norm": 54901.82421875, + "learning_rate": 2.874374721231957e-05, + "loss": 2.2091, + "step": 12799 + }, + { + "epoch": 2.3992502343017805, + "grad_norm": 52404.19921875, + "learning_rate": 2.873663500790571e-05, + "loss": 2.1106, + "step": 12800 + }, + { + "epoch": 2.3994376757263356, + "grad_norm": 52976.2421875, + "learning_rate": 2.87295233286694e-05, + "loss": 2.1171, + "step": 12801 + }, + { + "epoch": 2.39962511715089, + "grad_norm": 52328.19140625, + "learning_rate": 2.8722412174786324e-05, + "loss": 2.1062, + "step": 12802 + }, + { + "epoch": 2.3998125585754453, + "grad_norm": 53469.84765625, + "learning_rate": 2.871530154643204e-05, + "loss": 2.1409, + "step": 12803 + }, + { + "epoch": 2.4, + "grad_norm": 52252.32421875, + "learning_rate": 2.870819144378221e-05, + "loss": 2.1267, + "step": 12804 + }, + { + "epoch": 2.400187441424555, + "grad_norm": 56996.89453125, + "learning_rate": 2.870108186701247e-05, + "loss": 2.0899, + "step": 12805 + }, + { + "epoch": 2.4003748828491096, + "grad_norm": 53451.86328125, + "learning_rate": 2.8693972816298387e-05, + "loss": 2.1292, + "step": 12806 + }, + { + "epoch": 2.4005623242736647, + "grad_norm": 56838.89453125, + "learning_rate": 2.8686864291815542e-05, + "loss": 2.1105, + "step": 12807 + }, + { + "epoch": 2.4007497656982193, + "grad_norm": 49675.6484375, + "learning_rate": 2.8679756293739505e-05, + "loss": 2.1219, + "step": 12808 + }, + { + "epoch": 2.400937207122774, + "grad_norm": 52087.37890625, + "learning_rate": 2.867264882224589e-05, + "loss": 2.1136, + "step": 12809 + }, + { + "epoch": 2.401124648547329, + "grad_norm": 55490.21875, + "learning_rate": 2.866554187751016e-05, + "loss": 2.1874, + "step": 12810 + }, + { + "epoch": 2.4013120899718836, + "grad_norm": 56684.65625, + "learning_rate": 2.8658435459707883e-05, + "loss": 2.1432, + "step": 12811 + }, + { + "epoch": 2.4014995313964387, + "grad_norm": 50591.09765625, + "learning_rate": 2.86513295690146e-05, + "loss": 2.1279, + "step": 12812 + }, + { + "epoch": 2.4016869728209933, + "grad_norm": 57400.09375, + "learning_rate": 2.8644224205605798e-05, + "loss": 2.1128, + "step": 12813 + }, + { + "epoch": 2.4018744142455484, + "grad_norm": 50854.921875, + "learning_rate": 2.863711936965695e-05, + "loss": 2.112, + "step": 12814 + }, + { + "epoch": 2.402061855670103, + "grad_norm": 50071.24609375, + "learning_rate": 2.8630015061343572e-05, + "loss": 2.1177, + "step": 12815 + }, + { + "epoch": 2.402249297094658, + "grad_norm": 54329.20703125, + "learning_rate": 2.8622911280841102e-05, + "loss": 2.2019, + "step": 12816 + }, + { + "epoch": 2.4024367385192127, + "grad_norm": 57725.921875, + "learning_rate": 2.8615808028325032e-05, + "loss": 2.1894, + "step": 12817 + }, + { + "epoch": 2.4026241799437678, + "grad_norm": 54972.046875, + "learning_rate": 2.860870530397075e-05, + "loss": 2.0982, + "step": 12818 + }, + { + "epoch": 2.4028116213683224, + "grad_norm": 57362.0234375, + "learning_rate": 2.8601603107953735e-05, + "loss": 2.1067, + "step": 12819 + }, + { + "epoch": 2.402999062792877, + "grad_norm": 61265.75, + "learning_rate": 2.8594501440449372e-05, + "loss": 2.1569, + "step": 12820 + }, + { + "epoch": 2.403186504217432, + "grad_norm": 63705.36328125, + "learning_rate": 2.858740030163306e-05, + "loss": 2.1234, + "step": 12821 + }, + { + "epoch": 2.4033739456419867, + "grad_norm": 53610.0703125, + "learning_rate": 2.8580299691680222e-05, + "loss": 2.142, + "step": 12822 + }, + { + "epoch": 2.4035613870665418, + "grad_norm": 58469.4453125, + "learning_rate": 2.857319961076618e-05, + "loss": 2.1661, + "step": 12823 + }, + { + "epoch": 2.4037488284910964, + "grad_norm": 56505.22265625, + "learning_rate": 2.856610005906636e-05, + "loss": 2.2221, + "step": 12824 + }, + { + "epoch": 2.4039362699156515, + "grad_norm": 61250.359375, + "learning_rate": 2.855900103675605e-05, + "loss": 2.1196, + "step": 12825 + }, + { + "epoch": 2.404123711340206, + "grad_norm": 57319.640625, + "learning_rate": 2.8551902544010633e-05, + "loss": 2.1356, + "step": 12826 + }, + { + "epoch": 2.404311152764761, + "grad_norm": 50862.1796875, + "learning_rate": 2.8544804581005403e-05, + "loss": 2.1467, + "step": 12827 + }, + { + "epoch": 2.4044985941893158, + "grad_norm": 48753.4609375, + "learning_rate": 2.85377071479157e-05, + "loss": 2.1163, + "step": 12828 + }, + { + "epoch": 2.404686035613871, + "grad_norm": 57999.41796875, + "learning_rate": 2.85306102449168e-05, + "loss": 2.0887, + "step": 12829 + }, + { + "epoch": 2.4048734770384255, + "grad_norm": 49763.34765625, + "learning_rate": 2.8523513872183982e-05, + "loss": 2.1472, + "step": 12830 + }, + { + "epoch": 2.40506091846298, + "grad_norm": 56215.984375, + "learning_rate": 2.8516418029892517e-05, + "loss": 2.1604, + "step": 12831 + }, + { + "epoch": 2.405248359887535, + "grad_norm": 53514.5703125, + "learning_rate": 2.8509322718217713e-05, + "loss": 2.1246, + "step": 12832 + }, + { + "epoch": 2.40543580131209, + "grad_norm": 54108.50390625, + "learning_rate": 2.850222793733474e-05, + "loss": 2.1424, + "step": 12833 + }, + { + "epoch": 2.405623242736645, + "grad_norm": 55018.23828125, + "learning_rate": 2.8495133687418863e-05, + "loss": 2.1947, + "step": 12834 + }, + { + "epoch": 2.4058106841611995, + "grad_norm": 51437.71875, + "learning_rate": 2.848803996864532e-05, + "loss": 2.1351, + "step": 12835 + }, + { + "epoch": 2.4059981255857545, + "grad_norm": 54861.26953125, + "learning_rate": 2.8480946781189298e-05, + "loss": 2.1519, + "step": 12836 + }, + { + "epoch": 2.406185567010309, + "grad_norm": 52301.3203125, + "learning_rate": 2.8473854125225974e-05, + "loss": 2.1399, + "step": 12837 + }, + { + "epoch": 2.4063730084348642, + "grad_norm": 56656.19140625, + "learning_rate": 2.846676200093057e-05, + "loss": 2.1383, + "step": 12838 + }, + { + "epoch": 2.406560449859419, + "grad_norm": 53842.62109375, + "learning_rate": 2.8459670408478216e-05, + "loss": 2.1378, + "step": 12839 + }, + { + "epoch": 2.406747891283974, + "grad_norm": 53535.671875, + "learning_rate": 2.845257934804406e-05, + "loss": 2.1614, + "step": 12840 + }, + { + "epoch": 2.4069353327085286, + "grad_norm": 53083.94140625, + "learning_rate": 2.8445488819803255e-05, + "loss": 2.1545, + "step": 12841 + }, + { + "epoch": 2.407122774133083, + "grad_norm": 53001.4765625, + "learning_rate": 2.8438398823930957e-05, + "loss": 2.1786, + "step": 12842 + }, + { + "epoch": 2.4073102155576382, + "grad_norm": 50630.2265625, + "learning_rate": 2.8431309360602242e-05, + "loss": 2.1832, + "step": 12843 + }, + { + "epoch": 2.4074976569821933, + "grad_norm": 55669.09765625, + "learning_rate": 2.842422042999221e-05, + "loss": 2.09, + "step": 12844 + }, + { + "epoch": 2.407685098406748, + "grad_norm": 53725.390625, + "learning_rate": 2.8417132032275977e-05, + "loss": 2.1338, + "step": 12845 + }, + { + "epoch": 2.4078725398313026, + "grad_norm": 60962.01171875, + "learning_rate": 2.841004416762858e-05, + "loss": 2.1, + "step": 12846 + }, + { + "epoch": 2.4080599812558576, + "grad_norm": 54416.65625, + "learning_rate": 2.8402956836225124e-05, + "loss": 2.1471, + "step": 12847 + }, + { + "epoch": 2.4082474226804123, + "grad_norm": 53476.01953125, + "learning_rate": 2.8395870038240612e-05, + "loss": 2.1628, + "step": 12848 + }, + { + "epoch": 2.4084348641049673, + "grad_norm": 53703.6328125, + "learning_rate": 2.8388783773850112e-05, + "loss": 2.139, + "step": 12849 + }, + { + "epoch": 2.408622305529522, + "grad_norm": 49593.296875, + "learning_rate": 2.8381698043228642e-05, + "loss": 2.0994, + "step": 12850 + }, + { + "epoch": 2.408809746954077, + "grad_norm": 54382.4765625, + "learning_rate": 2.8374612846551186e-05, + "loss": 2.0101, + "step": 12851 + }, + { + "epoch": 2.4089971883786316, + "grad_norm": 56829.53515625, + "learning_rate": 2.8367528183992763e-05, + "loss": 2.1561, + "step": 12852 + }, + { + "epoch": 2.4091846298031863, + "grad_norm": 55794.49609375, + "learning_rate": 2.836044405572833e-05, + "loss": 2.086, + "step": 12853 + }, + { + "epoch": 2.4093720712277413, + "grad_norm": 54028.6015625, + "learning_rate": 2.83533604619329e-05, + "loss": 2.1923, + "step": 12854 + }, + { + "epoch": 2.4095595126522964, + "grad_norm": 54184.671875, + "learning_rate": 2.8346277402781395e-05, + "loss": 2.1962, + "step": 12855 + }, + { + "epoch": 2.409746954076851, + "grad_norm": 52774.68359375, + "learning_rate": 2.8339194878448744e-05, + "loss": 2.2017, + "step": 12856 + }, + { + "epoch": 2.4099343955014056, + "grad_norm": 49742.359375, + "learning_rate": 2.8332112889109897e-05, + "loss": 2.0653, + "step": 12857 + }, + { + "epoch": 2.4101218369259607, + "grad_norm": 54153.86328125, + "learning_rate": 2.8325031434939786e-05, + "loss": 2.1236, + "step": 12858 + }, + { + "epoch": 2.4103092783505153, + "grad_norm": 54778.98046875, + "learning_rate": 2.83179505161133e-05, + "loss": 2.1949, + "step": 12859 + }, + { + "epoch": 2.4104967197750704, + "grad_norm": 52705.58203125, + "learning_rate": 2.8310870132805305e-05, + "loss": 2.073, + "step": 12860 + }, + { + "epoch": 2.410684161199625, + "grad_norm": 53868.1171875, + "learning_rate": 2.830379028519072e-05, + "loss": 2.1388, + "step": 12861 + }, + { + "epoch": 2.41087160262418, + "grad_norm": 51794.70703125, + "learning_rate": 2.829671097344438e-05, + "loss": 2.174, + "step": 12862 + }, + { + "epoch": 2.4110590440487347, + "grad_norm": 57005.86328125, + "learning_rate": 2.8289632197741124e-05, + "loss": 2.159, + "step": 12863 + }, + { + "epoch": 2.41124648547329, + "grad_norm": 51552.4140625, + "learning_rate": 2.8282553958255797e-05, + "loss": 2.0819, + "step": 12864 + }, + { + "epoch": 2.4114339268978444, + "grad_norm": 55892.19140625, + "learning_rate": 2.827547625516326e-05, + "loss": 2.1673, + "step": 12865 + }, + { + "epoch": 2.4116213683223995, + "grad_norm": 57531.0546875, + "learning_rate": 2.8268399088638287e-05, + "loss": 2.117, + "step": 12866 + }, + { + "epoch": 2.411808809746954, + "grad_norm": 55204.99609375, + "learning_rate": 2.8261322458855664e-05, + "loss": 2.143, + "step": 12867 + }, + { + "epoch": 2.4119962511715087, + "grad_norm": 54067.0859375, + "learning_rate": 2.825424636599021e-05, + "loss": 2.12, + "step": 12868 + }, + { + "epoch": 2.412183692596064, + "grad_norm": 50956.81640625, + "learning_rate": 2.8247170810216685e-05, + "loss": 2.171, + "step": 12869 + }, + { + "epoch": 2.4123711340206184, + "grad_norm": 52865.72265625, + "learning_rate": 2.8240095791709807e-05, + "loss": 2.1222, + "step": 12870 + }, + { + "epoch": 2.4125585754451735, + "grad_norm": 56828.14453125, + "learning_rate": 2.823302131064436e-05, + "loss": 2.1578, + "step": 12871 + }, + { + "epoch": 2.412746016869728, + "grad_norm": 51455.2421875, + "learning_rate": 2.822594736719507e-05, + "loss": 2.1852, + "step": 12872 + }, + { + "epoch": 2.412933458294283, + "grad_norm": 54530.078125, + "learning_rate": 2.821887396153666e-05, + "loss": 2.1232, + "step": 12873 + }, + { + "epoch": 2.413120899718838, + "grad_norm": 55828.44140625, + "learning_rate": 2.821180109384381e-05, + "loss": 2.0857, + "step": 12874 + }, + { + "epoch": 2.413308341143393, + "grad_norm": 55237.734375, + "learning_rate": 2.8204728764291233e-05, + "loss": 2.094, + "step": 12875 + }, + { + "epoch": 2.4134957825679475, + "grad_norm": 54042.015625, + "learning_rate": 2.8197656973053604e-05, + "loss": 2.1304, + "step": 12876 + }, + { + "epoch": 2.4136832239925026, + "grad_norm": 57795.8671875, + "learning_rate": 2.8190585720305563e-05, + "loss": 2.1408, + "step": 12877 + }, + { + "epoch": 2.413870665417057, + "grad_norm": 51463.1953125, + "learning_rate": 2.8183515006221796e-05, + "loss": 2.2535, + "step": 12878 + }, + { + "epoch": 2.414058106841612, + "grad_norm": 51872.828125, + "learning_rate": 2.8176444830976896e-05, + "loss": 2.154, + "step": 12879 + }, + { + "epoch": 2.414245548266167, + "grad_norm": 52882.60546875, + "learning_rate": 2.816937519474554e-05, + "loss": 2.1156, + "step": 12880 + }, + { + "epoch": 2.4144329896907215, + "grad_norm": 58868.4765625, + "learning_rate": 2.816230609770229e-05, + "loss": 2.1633, + "step": 12881 + }, + { + "epoch": 2.4146204311152766, + "grad_norm": 49324.41015625, + "learning_rate": 2.815523754002179e-05, + "loss": 2.1396, + "step": 12882 + }, + { + "epoch": 2.414807872539831, + "grad_norm": 54137.734375, + "learning_rate": 2.8148169521878576e-05, + "loss": 2.1016, + "step": 12883 + }, + { + "epoch": 2.4149953139643863, + "grad_norm": 58501.078125, + "learning_rate": 2.8141102043447264e-05, + "loss": 2.1039, + "step": 12884 + }, + { + "epoch": 2.415182755388941, + "grad_norm": 57756.2265625, + "learning_rate": 2.8134035104902396e-05, + "loss": 2.1153, + "step": 12885 + }, + { + "epoch": 2.415370196813496, + "grad_norm": 58168.72265625, + "learning_rate": 2.8126968706418488e-05, + "loss": 2.122, + "step": 12886 + }, + { + "epoch": 2.4155576382380506, + "grad_norm": 52165.26171875, + "learning_rate": 2.81199028481701e-05, + "loss": 2.1369, + "step": 12887 + }, + { + "epoch": 2.4157450796626057, + "grad_norm": 50219.6796875, + "learning_rate": 2.8112837530331754e-05, + "loss": 2.1133, + "step": 12888 + }, + { + "epoch": 2.4159325210871603, + "grad_norm": 57121.76171875, + "learning_rate": 2.8105772753077947e-05, + "loss": 2.1451, + "step": 12889 + }, + { + "epoch": 2.416119962511715, + "grad_norm": 57139.8125, + "learning_rate": 2.809870851658315e-05, + "loss": 2.2509, + "step": 12890 + }, + { + "epoch": 2.41630740393627, + "grad_norm": 57296.19921875, + "learning_rate": 2.8091644821021874e-05, + "loss": 2.1121, + "step": 12891 + }, + { + "epoch": 2.4164948453608246, + "grad_norm": 53313.28125, + "learning_rate": 2.8084581666568566e-05, + "loss": 2.1631, + "step": 12892 + }, + { + "epoch": 2.4166822867853797, + "grad_norm": 50445.41015625, + "learning_rate": 2.807751905339766e-05, + "loss": 2.1086, + "step": 12893 + }, + { + "epoch": 2.4168697282099343, + "grad_norm": 52493.05859375, + "learning_rate": 2.807045698168361e-05, + "loss": 2.0826, + "step": 12894 + }, + { + "epoch": 2.4170571696344894, + "grad_norm": 55063.91015625, + "learning_rate": 2.8063395451600884e-05, + "loss": 2.1585, + "step": 12895 + }, + { + "epoch": 2.417244611059044, + "grad_norm": 52013.76953125, + "learning_rate": 2.8056334463323797e-05, + "loss": 2.1599, + "step": 12896 + }, + { + "epoch": 2.417432052483599, + "grad_norm": 59609.234375, + "learning_rate": 2.8049274017026804e-05, + "loss": 2.0895, + "step": 12897 + }, + { + "epoch": 2.4176194939081537, + "grad_norm": 54519.44140625, + "learning_rate": 2.8042214112884314e-05, + "loss": 2.0662, + "step": 12898 + }, + { + "epoch": 2.4178069353327087, + "grad_norm": 54938.03515625, + "learning_rate": 2.8035154751070653e-05, + "loss": 2.0964, + "step": 12899 + }, + { + "epoch": 2.4179943767572634, + "grad_norm": 55576.55859375, + "learning_rate": 2.802809593176018e-05, + "loss": 2.1434, + "step": 12900 + }, + { + "epoch": 2.418181818181818, + "grad_norm": 52801.890625, + "learning_rate": 2.8021037655127248e-05, + "loss": 2.1361, + "step": 12901 + }, + { + "epoch": 2.418369259606373, + "grad_norm": 53186.2109375, + "learning_rate": 2.8013979921346206e-05, + "loss": 2.167, + "step": 12902 + }, + { + "epoch": 2.4185567010309277, + "grad_norm": 53619.57421875, + "learning_rate": 2.8006922730591356e-05, + "loss": 2.1019, + "step": 12903 + }, + { + "epoch": 2.4187441424554827, + "grad_norm": 50479.62890625, + "learning_rate": 2.7999866083036984e-05, + "loss": 2.156, + "step": 12904 + }, + { + "epoch": 2.4189315838800374, + "grad_norm": 48466.62890625, + "learning_rate": 2.7992809978857415e-05, + "loss": 2.1256, + "step": 12905 + }, + { + "epoch": 2.4191190253045924, + "grad_norm": 55223.078125, + "learning_rate": 2.7985754418226905e-05, + "loss": 2.1766, + "step": 12906 + }, + { + "epoch": 2.419306466729147, + "grad_norm": 51115.64453125, + "learning_rate": 2.79786994013197e-05, + "loss": 2.1255, + "step": 12907 + }, + { + "epoch": 2.419493908153702, + "grad_norm": 53405.05078125, + "learning_rate": 2.7971644928310087e-05, + "loss": 2.1554, + "step": 12908 + }, + { + "epoch": 2.4196813495782568, + "grad_norm": 55537.38671875, + "learning_rate": 2.7964590999372266e-05, + "loss": 2.1144, + "step": 12909 + }, + { + "epoch": 2.419868791002812, + "grad_norm": 53821.0546875, + "learning_rate": 2.7957537614680496e-05, + "loss": 2.0458, + "step": 12910 + }, + { + "epoch": 2.4200562324273664, + "grad_norm": 60519.92578125, + "learning_rate": 2.7950484774408948e-05, + "loss": 2.0631, + "step": 12911 + }, + { + "epoch": 2.420243673851921, + "grad_norm": 51432.22265625, + "learning_rate": 2.7943432478731863e-05, + "loss": 2.1216, + "step": 12912 + }, + { + "epoch": 2.420431115276476, + "grad_norm": 52500.234375, + "learning_rate": 2.7936380727823374e-05, + "loss": 2.1201, + "step": 12913 + }, + { + "epoch": 2.4206185567010308, + "grad_norm": 56106.07421875, + "learning_rate": 2.79293295218577e-05, + "loss": 2.2212, + "step": 12914 + }, + { + "epoch": 2.420805998125586, + "grad_norm": 53256.265625, + "learning_rate": 2.7922278861008966e-05, + "loss": 2.1726, + "step": 12915 + }, + { + "epoch": 2.4209934395501405, + "grad_norm": 55103.77734375, + "learning_rate": 2.7915228745451305e-05, + "loss": 2.1071, + "step": 12916 + }, + { + "epoch": 2.4211808809746955, + "grad_norm": 57026.91015625, + "learning_rate": 2.7908179175358877e-05, + "loss": 2.1172, + "step": 12917 + }, + { + "epoch": 2.42136832239925, + "grad_norm": 55915.0625, + "learning_rate": 2.7901130150905762e-05, + "loss": 2.1373, + "step": 12918 + }, + { + "epoch": 2.421555763823805, + "grad_norm": 50623.41015625, + "learning_rate": 2.7894081672266102e-05, + "loss": 2.1425, + "step": 12919 + }, + { + "epoch": 2.42174320524836, + "grad_norm": 53991.9765625, + "learning_rate": 2.7887033739613943e-05, + "loss": 2.1237, + "step": 12920 + }, + { + "epoch": 2.421930646672915, + "grad_norm": 56694.35546875, + "learning_rate": 2.78799863531234e-05, + "loss": 2.144, + "step": 12921 + }, + { + "epoch": 2.4221180880974695, + "grad_norm": 54475.91015625, + "learning_rate": 2.7872939512968522e-05, + "loss": 2.1052, + "step": 12922 + }, + { + "epoch": 2.422305529522024, + "grad_norm": 48908.53125, + "learning_rate": 2.7865893219323323e-05, + "loss": 2.1451, + "step": 12923 + }, + { + "epoch": 2.4224929709465792, + "grad_norm": 53134.9921875, + "learning_rate": 2.7858847472361865e-05, + "loss": 2.1076, + "step": 12924 + }, + { + "epoch": 2.422680412371134, + "grad_norm": 56453.1796875, + "learning_rate": 2.785180227225821e-05, + "loss": 2.2146, + "step": 12925 + }, + { + "epoch": 2.422867853795689, + "grad_norm": 55427.109375, + "learning_rate": 2.784475761918628e-05, + "loss": 2.1703, + "step": 12926 + }, + { + "epoch": 2.4230552952202435, + "grad_norm": 54428.6640625, + "learning_rate": 2.783771351332011e-05, + "loss": 2.1741, + "step": 12927 + }, + { + "epoch": 2.4232427366447986, + "grad_norm": 55797.1796875, + "learning_rate": 2.783066995483371e-05, + "loss": 2.151, + "step": 12928 + }, + { + "epoch": 2.4234301780693532, + "grad_norm": 55715.8515625, + "learning_rate": 2.7823626943901014e-05, + "loss": 2.1327, + "step": 12929 + }, + { + "epoch": 2.4236176194939083, + "grad_norm": 57024.03125, + "learning_rate": 2.781658448069596e-05, + "loss": 2.0702, + "step": 12930 + }, + { + "epoch": 2.423805060918463, + "grad_norm": 52253.5078125, + "learning_rate": 2.780954256539253e-05, + "loss": 2.1487, + "step": 12931 + }, + { + "epoch": 2.423992502343018, + "grad_norm": 51889.17578125, + "learning_rate": 2.780250119816463e-05, + "loss": 2.2009, + "step": 12932 + }, + { + "epoch": 2.4241799437675726, + "grad_norm": 52412.98046875, + "learning_rate": 2.7795460379186146e-05, + "loss": 2.1313, + "step": 12933 + }, + { + "epoch": 2.4243673851921272, + "grad_norm": 54432.28515625, + "learning_rate": 2.7788420108631008e-05, + "loss": 2.1341, + "step": 12934 + }, + { + "epoch": 2.4245548266166823, + "grad_norm": 53111.6171875, + "learning_rate": 2.7781380386673112e-05, + "loss": 2.0884, + "step": 12935 + }, + { + "epoch": 2.424742268041237, + "grad_norm": 52583.40625, + "learning_rate": 2.777434121348631e-05, + "loss": 2.1868, + "step": 12936 + }, + { + "epoch": 2.424929709465792, + "grad_norm": 52456.51171875, + "learning_rate": 2.776730258924445e-05, + "loss": 2.1619, + "step": 12937 + }, + { + "epoch": 2.4251171508903466, + "grad_norm": 52745.12109375, + "learning_rate": 2.776026451412141e-05, + "loss": 2.152, + "step": 12938 + }, + { + "epoch": 2.4253045923149017, + "grad_norm": 56281.1015625, + "learning_rate": 2.775322698829098e-05, + "loss": 2.1526, + "step": 12939 + }, + { + "epoch": 2.4254920337394563, + "grad_norm": 51345.4296875, + "learning_rate": 2.774619001192702e-05, + "loss": 2.155, + "step": 12940 + }, + { + "epoch": 2.4256794751640114, + "grad_norm": 50734.71484375, + "learning_rate": 2.7739153585203298e-05, + "loss": 2.1471, + "step": 12941 + }, + { + "epoch": 2.425866916588566, + "grad_norm": 53267.90625, + "learning_rate": 2.7732117708293638e-05, + "loss": 2.2165, + "step": 12942 + }, + { + "epoch": 2.426054358013121, + "grad_norm": 53645.3046875, + "learning_rate": 2.772508238137178e-05, + "loss": 2.1045, + "step": 12943 + }, + { + "epoch": 2.4262417994376757, + "grad_norm": 53317.2734375, + "learning_rate": 2.771804760461153e-05, + "loss": 2.1309, + "step": 12944 + }, + { + "epoch": 2.4264292408622303, + "grad_norm": 58481.7890625, + "learning_rate": 2.7711013378186613e-05, + "loss": 2.1429, + "step": 12945 + }, + { + "epoch": 2.4266166822867854, + "grad_norm": 56383.1328125, + "learning_rate": 2.7703979702270744e-05, + "loss": 2.105, + "step": 12946 + }, + { + "epoch": 2.42680412371134, + "grad_norm": 53169.390625, + "learning_rate": 2.769694657703769e-05, + "loss": 2.0826, + "step": 12947 + }, + { + "epoch": 2.426991565135895, + "grad_norm": 49905.65234375, + "learning_rate": 2.7689914002661143e-05, + "loss": 2.1106, + "step": 12948 + }, + { + "epoch": 2.4271790065604497, + "grad_norm": 54089.75390625, + "learning_rate": 2.7682881979314774e-05, + "loss": 2.1675, + "step": 12949 + }, + { + "epoch": 2.427366447985005, + "grad_norm": 54196.5390625, + "learning_rate": 2.7675850507172284e-05, + "loss": 2.1455, + "step": 12950 + }, + { + "epoch": 2.4275538894095594, + "grad_norm": 56243.09765625, + "learning_rate": 2.7668819586407368e-05, + "loss": 2.175, + "step": 12951 + }, + { + "epoch": 2.4277413308341145, + "grad_norm": 49864.98828125, + "learning_rate": 2.7661789217193647e-05, + "loss": 2.1402, + "step": 12952 + }, + { + "epoch": 2.427928772258669, + "grad_norm": 54810.84765625, + "learning_rate": 2.7654759399704754e-05, + "loss": 2.1175, + "step": 12953 + }, + { + "epoch": 2.428116213683224, + "grad_norm": 52815.33203125, + "learning_rate": 2.7647730134114337e-05, + "loss": 2.1333, + "step": 12954 + }, + { + "epoch": 2.428303655107779, + "grad_norm": 53224.66015625, + "learning_rate": 2.7640701420596037e-05, + "loss": 2.1572, + "step": 12955 + }, + { + "epoch": 2.4284910965323334, + "grad_norm": 55077.734375, + "learning_rate": 2.7633673259323388e-05, + "loss": 2.1088, + "step": 12956 + }, + { + "epoch": 2.4286785379568885, + "grad_norm": 53886.7734375, + "learning_rate": 2.762664565047001e-05, + "loss": 2.1431, + "step": 12957 + }, + { + "epoch": 2.428865979381443, + "grad_norm": 52582.91015625, + "learning_rate": 2.7619618594209494e-05, + "loss": 2.118, + "step": 12958 + }, + { + "epoch": 2.429053420805998, + "grad_norm": 53299.57421875, + "learning_rate": 2.7612592090715388e-05, + "loss": 2.1595, + "step": 12959 + }, + { + "epoch": 2.429240862230553, + "grad_norm": 55357.78125, + "learning_rate": 2.760556614016121e-05, + "loss": 2.1495, + "step": 12960 + }, + { + "epoch": 2.429428303655108, + "grad_norm": 52669.24609375, + "learning_rate": 2.7598540742720534e-05, + "loss": 2.1682, + "step": 12961 + }, + { + "epoch": 2.4296157450796625, + "grad_norm": 58906.0546875, + "learning_rate": 2.7591515898566855e-05, + "loss": 2.1025, + "step": 12962 + }, + { + "epoch": 2.4298031865042176, + "grad_norm": 51103.984375, + "learning_rate": 2.7584491607873664e-05, + "loss": 2.0798, + "step": 12963 + }, + { + "epoch": 2.429990627928772, + "grad_norm": 55952.1953125, + "learning_rate": 2.7577467870814466e-05, + "loss": 2.1196, + "step": 12964 + }, + { + "epoch": 2.4301780693533273, + "grad_norm": 52999.16015625, + "learning_rate": 2.757044468756276e-05, + "loss": 2.1726, + "step": 12965 + }, + { + "epoch": 2.430365510777882, + "grad_norm": 57666.6875, + "learning_rate": 2.756342205829199e-05, + "loss": 2.2136, + "step": 12966 + }, + { + "epoch": 2.4305529522024365, + "grad_norm": 50115.1484375, + "learning_rate": 2.7556399983175584e-05, + "loss": 2.1537, + "step": 12967 + }, + { + "epoch": 2.4307403936269916, + "grad_norm": 57905.28515625, + "learning_rate": 2.754937846238702e-05, + "loss": 2.1359, + "step": 12968 + }, + { + "epoch": 2.4309278350515466, + "grad_norm": 57411.1875, + "learning_rate": 2.754235749609968e-05, + "loss": 2.1306, + "step": 12969 + }, + { + "epoch": 2.4311152764761013, + "grad_norm": 53270.71484375, + "learning_rate": 2.753533708448702e-05, + "loss": 2.1011, + "step": 12970 + }, + { + "epoch": 2.431302717900656, + "grad_norm": 57675.73828125, + "learning_rate": 2.75283172277224e-05, + "loss": 2.0969, + "step": 12971 + }, + { + "epoch": 2.431490159325211, + "grad_norm": 54399.765625, + "learning_rate": 2.7521297925979195e-05, + "loss": 2.1175, + "step": 12972 + }, + { + "epoch": 2.4316776007497656, + "grad_norm": 51364.6875, + "learning_rate": 2.7514279179430802e-05, + "loss": 2.0941, + "step": 12973 + }, + { + "epoch": 2.4318650421743206, + "grad_norm": 55777.78125, + "learning_rate": 2.7507260988250545e-05, + "loss": 2.1131, + "step": 12974 + }, + { + "epoch": 2.4320524835988753, + "grad_norm": 51814.94921875, + "learning_rate": 2.75002433526118e-05, + "loss": 2.1724, + "step": 12975 + }, + { + "epoch": 2.4322399250234303, + "grad_norm": 53990.61328125, + "learning_rate": 2.7493226272687845e-05, + "loss": 2.1747, + "step": 12976 + }, + { + "epoch": 2.432427366447985, + "grad_norm": 54935.29296875, + "learning_rate": 2.7486209748652048e-05, + "loss": 2.1045, + "step": 12977 + }, + { + "epoch": 2.4326148078725396, + "grad_norm": 53069.41796875, + "learning_rate": 2.747919378067768e-05, + "loss": 2.0956, + "step": 12978 + }, + { + "epoch": 2.4328022492970947, + "grad_norm": 51123.140625, + "learning_rate": 2.7472178368938005e-05, + "loss": 2.1892, + "step": 12979 + }, + { + "epoch": 2.4329896907216497, + "grad_norm": 54632.12890625, + "learning_rate": 2.7465163513606317e-05, + "loss": 2.1843, + "step": 12980 + }, + { + "epoch": 2.4331771321462043, + "grad_norm": 49344.1328125, + "learning_rate": 2.745814921485589e-05, + "loss": 2.1213, + "step": 12981 + }, + { + "epoch": 2.433364573570759, + "grad_norm": 54898.68359375, + "learning_rate": 2.7451135472859956e-05, + "loss": 2.105, + "step": 12982 + }, + { + "epoch": 2.433552014995314, + "grad_norm": 55449.16796875, + "learning_rate": 2.744412228779172e-05, + "loss": 2.0798, + "step": 12983 + }, + { + "epoch": 2.4337394564198687, + "grad_norm": 55839.09375, + "learning_rate": 2.7437109659824432e-05, + "loss": 2.2131, + "step": 12984 + }, + { + "epoch": 2.4339268978444237, + "grad_norm": 56825.2734375, + "learning_rate": 2.74300975891313e-05, + "loss": 2.0681, + "step": 12985 + }, + { + "epoch": 2.4341143392689784, + "grad_norm": 50320.390625, + "learning_rate": 2.742308607588546e-05, + "loss": 2.1244, + "step": 12986 + }, + { + "epoch": 2.4343017806935334, + "grad_norm": 50737.7578125, + "learning_rate": 2.7416075120260127e-05, + "loss": 2.1153, + "step": 12987 + }, + { + "epoch": 2.434489222118088, + "grad_norm": 54355.1875, + "learning_rate": 2.7409064722428495e-05, + "loss": 2.198, + "step": 12988 + }, + { + "epoch": 2.434676663542643, + "grad_norm": 54447.890625, + "learning_rate": 2.740205488256364e-05, + "loss": 2.1398, + "step": 12989 + }, + { + "epoch": 2.4348641049671977, + "grad_norm": 59156.484375, + "learning_rate": 2.7395045600838726e-05, + "loss": 2.0982, + "step": 12990 + }, + { + "epoch": 2.435051546391753, + "grad_norm": 57751.984375, + "learning_rate": 2.7388036877426903e-05, + "loss": 2.1313, + "step": 12991 + }, + { + "epoch": 2.4352389878163074, + "grad_norm": 54492.85546875, + "learning_rate": 2.7381028712501243e-05, + "loss": 2.1179, + "step": 12992 + }, + { + "epoch": 2.435426429240862, + "grad_norm": 53144.99609375, + "learning_rate": 2.7374021106234838e-05, + "loss": 2.1204, + "step": 12993 + }, + { + "epoch": 2.435613870665417, + "grad_norm": 52887.2578125, + "learning_rate": 2.736701405880078e-05, + "loss": 2.1377, + "step": 12994 + }, + { + "epoch": 2.4358013120899717, + "grad_norm": 54198.07421875, + "learning_rate": 2.7360007570372137e-05, + "loss": 2.1623, + "step": 12995 + }, + { + "epoch": 2.435988753514527, + "grad_norm": 55350.296875, + "learning_rate": 2.7353001641121962e-05, + "loss": 2.1189, + "step": 12996 + }, + { + "epoch": 2.4361761949390814, + "grad_norm": 54147.6484375, + "learning_rate": 2.734599627122326e-05, + "loss": 2.1247, + "step": 12997 + }, + { + "epoch": 2.4363636363636365, + "grad_norm": 52110.984375, + "learning_rate": 2.7338991460849108e-05, + "loss": 2.1167, + "step": 12998 + }, + { + "epoch": 2.436551077788191, + "grad_norm": 49228.1015625, + "learning_rate": 2.7331987210172455e-05, + "loss": 2.1146, + "step": 12999 + }, + { + "epoch": 2.436738519212746, + "grad_norm": 57747.1875, + "learning_rate": 2.7324983519366353e-05, + "loss": 2.1701, + "step": 13000 + }, + { + "epoch": 2.436738519212746, + "eval_loss": 2.270242214202881, + "eval_runtime": 128.2171, + "eval_samples_per_second": 39.379, + "eval_steps_per_second": 1.973, + "step": 13000 + }, + { + "epoch": 2.436925960637301, + "grad_norm": 56415.50390625, + "learning_rate": 2.7317980388603758e-05, + "loss": 2.1455, + "step": 13001 + }, + { + "epoch": 2.437113402061856, + "grad_norm": 56045.828125, + "learning_rate": 2.7310977818057628e-05, + "loss": 2.1585, + "step": 13002 + }, + { + "epoch": 2.4373008434864105, + "grad_norm": 50537.72265625, + "learning_rate": 2.730397580790094e-05, + "loss": 2.1388, + "step": 13003 + }, + { + "epoch": 2.437488284910965, + "grad_norm": 52761.7109375, + "learning_rate": 2.7296974358306604e-05, + "loss": 2.1468, + "step": 13004 + }, + { + "epoch": 2.43767572633552, + "grad_norm": 53725.55859375, + "learning_rate": 2.7289973469447588e-05, + "loss": 2.1824, + "step": 13005 + }, + { + "epoch": 2.437863167760075, + "grad_norm": 54934.69140625, + "learning_rate": 2.7282973141496765e-05, + "loss": 2.122, + "step": 13006 + }, + { + "epoch": 2.43805060918463, + "grad_norm": 59081.61328125, + "learning_rate": 2.7275973374627074e-05, + "loss": 2.0933, + "step": 13007 + }, + { + "epoch": 2.4382380506091845, + "grad_norm": 55208.75390625, + "learning_rate": 2.7268974169011376e-05, + "loss": 2.1009, + "step": 13008 + }, + { + "epoch": 2.4384254920337396, + "grad_norm": 50495.62109375, + "learning_rate": 2.7261975524822518e-05, + "loss": 2.1142, + "step": 13009 + }, + { + "epoch": 2.438612933458294, + "grad_norm": 57316.0390625, + "learning_rate": 2.7254977442233394e-05, + "loss": 2.0876, + "step": 13010 + }, + { + "epoch": 2.4388003748828493, + "grad_norm": 54212.765625, + "learning_rate": 2.7247979921416856e-05, + "loss": 2.0964, + "step": 13011 + }, + { + "epoch": 2.438987816307404, + "grad_norm": 52979.671875, + "learning_rate": 2.7240982962545713e-05, + "loss": 2.1663, + "step": 13012 + }, + { + "epoch": 2.439175257731959, + "grad_norm": 54669.6484375, + "learning_rate": 2.7233986565792763e-05, + "loss": 2.2444, + "step": 13013 + }, + { + "epoch": 2.4393626991565136, + "grad_norm": 50414.83203125, + "learning_rate": 2.7226990731330847e-05, + "loss": 2.0857, + "step": 13014 + }, + { + "epoch": 2.4395501405810682, + "grad_norm": 48774.74609375, + "learning_rate": 2.7219995459332743e-05, + "loss": 2.1944, + "step": 13015 + }, + { + "epoch": 2.4397375820056233, + "grad_norm": 56297.07421875, + "learning_rate": 2.7213000749971186e-05, + "loss": 2.1047, + "step": 13016 + }, + { + "epoch": 2.439925023430178, + "grad_norm": 54550.26171875, + "learning_rate": 2.7206006603418977e-05, + "loss": 2.1489, + "step": 13017 + }, + { + "epoch": 2.440112464854733, + "grad_norm": 53818.8203125, + "learning_rate": 2.7199013019848886e-05, + "loss": 2.1129, + "step": 13018 + }, + { + "epoch": 2.4402999062792876, + "grad_norm": 58077.71484375, + "learning_rate": 2.719201999943357e-05, + "loss": 2.1272, + "step": 13019 + }, + { + "epoch": 2.4404873477038427, + "grad_norm": 53507.46875, + "learning_rate": 2.7185027542345786e-05, + "loss": 2.0751, + "step": 13020 + }, + { + "epoch": 2.4406747891283973, + "grad_norm": 50497.8359375, + "learning_rate": 2.717803564875827e-05, + "loss": 2.0411, + "step": 13021 + }, + { + "epoch": 2.4408622305529524, + "grad_norm": 54227.94140625, + "learning_rate": 2.717104431884368e-05, + "loss": 2.2144, + "step": 13022 + }, + { + "epoch": 2.441049671977507, + "grad_norm": 54846.53125, + "learning_rate": 2.716405355277468e-05, + "loss": 2.12, + "step": 13023 + }, + { + "epoch": 2.441237113402062, + "grad_norm": 51250.46875, + "learning_rate": 2.7157063350723967e-05, + "loss": 2.1684, + "step": 13024 + }, + { + "epoch": 2.4414245548266167, + "grad_norm": 50223.65625, + "learning_rate": 2.7150073712864154e-05, + "loss": 2.1861, + "step": 13025 + }, + { + "epoch": 2.4416119962511713, + "grad_norm": 54386.0859375, + "learning_rate": 2.714308463936791e-05, + "loss": 2.0885, + "step": 13026 + }, + { + "epoch": 2.4417994376757264, + "grad_norm": 54138.77734375, + "learning_rate": 2.713609613040783e-05, + "loss": 2.1117, + "step": 13027 + }, + { + "epoch": 2.441986879100281, + "grad_norm": 53108.7421875, + "learning_rate": 2.712910818615655e-05, + "loss": 2.1031, + "step": 13028 + }, + { + "epoch": 2.442174320524836, + "grad_norm": 54468.10546875, + "learning_rate": 2.7122120806786645e-05, + "loss": 2.0626, + "step": 13029 + }, + { + "epoch": 2.4423617619493907, + "grad_norm": 55907.25390625, + "learning_rate": 2.711513399247067e-05, + "loss": 2.123, + "step": 13030 + }, + { + "epoch": 2.4425492033739458, + "grad_norm": 51976.77734375, + "learning_rate": 2.710814774338124e-05, + "loss": 2.0855, + "step": 13031 + }, + { + "epoch": 2.4427366447985004, + "grad_norm": 56012.30859375, + "learning_rate": 2.7101162059690856e-05, + "loss": 2.1197, + "step": 13032 + }, + { + "epoch": 2.4429240862230555, + "grad_norm": 53394.3125, + "learning_rate": 2.7094176941572104e-05, + "loss": 2.087, + "step": 13033 + }, + { + "epoch": 2.44311152764761, + "grad_norm": 55291.390625, + "learning_rate": 2.708719238919746e-05, + "loss": 2.1809, + "step": 13034 + }, + { + "epoch": 2.443298969072165, + "grad_norm": 58808.97265625, + "learning_rate": 2.708020840273948e-05, + "loss": 2.1594, + "step": 13035 + }, + { + "epoch": 2.4434864104967198, + "grad_norm": 54794.140625, + "learning_rate": 2.7073224982370614e-05, + "loss": 2.1168, + "step": 13036 + }, + { + "epoch": 2.4436738519212744, + "grad_norm": 55188.8671875, + "learning_rate": 2.7066242128263387e-05, + "loss": 2.0327, + "step": 13037 + }, + { + "epoch": 2.4438612933458295, + "grad_norm": 56279.140625, + "learning_rate": 2.7059259840590247e-05, + "loss": 2.0705, + "step": 13038 + }, + { + "epoch": 2.444048734770384, + "grad_norm": 53566.60546875, + "learning_rate": 2.7052278119523628e-05, + "loss": 2.1619, + "step": 13039 + }, + { + "epoch": 2.444236176194939, + "grad_norm": 56570.50390625, + "learning_rate": 2.7045296965235984e-05, + "loss": 2.1348, + "step": 13040 + }, + { + "epoch": 2.444423617619494, + "grad_norm": 57627.6796875, + "learning_rate": 2.7038316377899787e-05, + "loss": 2.0796, + "step": 13041 + }, + { + "epoch": 2.444611059044049, + "grad_norm": 55802.765625, + "learning_rate": 2.7031336357687358e-05, + "loss": 2.0941, + "step": 13042 + }, + { + "epoch": 2.4447985004686035, + "grad_norm": 50590.0703125, + "learning_rate": 2.7024356904771146e-05, + "loss": 2.1318, + "step": 13043 + }, + { + "epoch": 2.4449859418931585, + "grad_norm": 50275.3359375, + "learning_rate": 2.701737801932355e-05, + "loss": 2.1449, + "step": 13044 + }, + { + "epoch": 2.445173383317713, + "grad_norm": 54434.41796875, + "learning_rate": 2.701039970151692e-05, + "loss": 2.1404, + "step": 13045 + }, + { + "epoch": 2.4453608247422682, + "grad_norm": 52726.97265625, + "learning_rate": 2.700342195152359e-05, + "loss": 2.1812, + "step": 13046 + }, + { + "epoch": 2.445548266166823, + "grad_norm": 58327.43359375, + "learning_rate": 2.6996444769515926e-05, + "loss": 2.0976, + "step": 13047 + }, + { + "epoch": 2.4457357075913775, + "grad_norm": 56851.9765625, + "learning_rate": 2.6989468155666287e-05, + "loss": 2.1674, + "step": 13048 + }, + { + "epoch": 2.4459231490159326, + "grad_norm": 53371.91015625, + "learning_rate": 2.6982492110146914e-05, + "loss": 2.1603, + "step": 13049 + }, + { + "epoch": 2.446110590440487, + "grad_norm": 54732.78515625, + "learning_rate": 2.6975516633130138e-05, + "loss": 2.1108, + "step": 13050 + }, + { + "epoch": 2.4462980318650422, + "grad_norm": 55923.2578125, + "learning_rate": 2.6968541724788278e-05, + "loss": 2.1475, + "step": 13051 + }, + { + "epoch": 2.446485473289597, + "grad_norm": 49995.27734375, + "learning_rate": 2.696156738529357e-05, + "loss": 2.0735, + "step": 13052 + }, + { + "epoch": 2.446672914714152, + "grad_norm": 53728.16015625, + "learning_rate": 2.6954593614818257e-05, + "loss": 2.1665, + "step": 13053 + }, + { + "epoch": 2.4468603561387066, + "grad_norm": 52368.984375, + "learning_rate": 2.6947620413534625e-05, + "loss": 2.1136, + "step": 13054 + }, + { + "epoch": 2.4470477975632616, + "grad_norm": 54966.6640625, + "learning_rate": 2.694064778161488e-05, + "loss": 2.1915, + "step": 13055 + }, + { + "epoch": 2.4472352389878163, + "grad_norm": 50105.28515625, + "learning_rate": 2.693367571923122e-05, + "loss": 2.1207, + "step": 13056 + }, + { + "epoch": 2.4474226804123713, + "grad_norm": 52263.609375, + "learning_rate": 2.6926704226555867e-05, + "loss": 2.0768, + "step": 13057 + }, + { + "epoch": 2.447610121836926, + "grad_norm": 55240.10546875, + "learning_rate": 2.691973330376102e-05, + "loss": 2.1509, + "step": 13058 + }, + { + "epoch": 2.4477975632614806, + "grad_norm": 57930.47265625, + "learning_rate": 2.6912762951018832e-05, + "loss": 2.137, + "step": 13059 + }, + { + "epoch": 2.4479850046860356, + "grad_norm": 52667.59765625, + "learning_rate": 2.6905793168501453e-05, + "loss": 2.1359, + "step": 13060 + }, + { + "epoch": 2.4481724461105903, + "grad_norm": 53897.546875, + "learning_rate": 2.689882395638106e-05, + "loss": 2.1704, + "step": 13061 + }, + { + "epoch": 2.4483598875351453, + "grad_norm": 57624.3671875, + "learning_rate": 2.689185531482974e-05, + "loss": 2.0765, + "step": 13062 + }, + { + "epoch": 2.4485473289597, + "grad_norm": 59211.60546875, + "learning_rate": 2.6884887244019664e-05, + "loss": 2.1101, + "step": 13063 + }, + { + "epoch": 2.448734770384255, + "grad_norm": 59302.32421875, + "learning_rate": 2.6877919744122894e-05, + "loss": 2.1802, + "step": 13064 + }, + { + "epoch": 2.4489222118088096, + "grad_norm": 53879.54296875, + "learning_rate": 2.6870952815311512e-05, + "loss": 2.0808, + "step": 13065 + }, + { + "epoch": 2.4491096532333647, + "grad_norm": 49163.5390625, + "learning_rate": 2.6863986457757605e-05, + "loss": 2.1638, + "step": 13066 + }, + { + "epoch": 2.4492970946579193, + "grad_norm": 54071.40625, + "learning_rate": 2.685702067163326e-05, + "loss": 2.1665, + "step": 13067 + }, + { + "epoch": 2.4494845360824744, + "grad_norm": 52610.140625, + "learning_rate": 2.685005545711049e-05, + "loss": 2.1588, + "step": 13068 + }, + { + "epoch": 2.449671977507029, + "grad_norm": 48306.87109375, + "learning_rate": 2.6843090814361328e-05, + "loss": 2.1151, + "step": 13069 + }, + { + "epoch": 2.4498594189315837, + "grad_norm": 49319.23046875, + "learning_rate": 2.683612674355781e-05, + "loss": 2.1161, + "step": 13070 + }, + { + "epoch": 2.4500468603561387, + "grad_norm": 53820.0546875, + "learning_rate": 2.6829163244871936e-05, + "loss": 2.1504, + "step": 13071 + }, + { + "epoch": 2.4502343017806933, + "grad_norm": 55891.859375, + "learning_rate": 2.6822200318475666e-05, + "loss": 2.0813, + "step": 13072 + }, + { + "epoch": 2.4504217432052484, + "grad_norm": 54055.5625, + "learning_rate": 2.6815237964540996e-05, + "loss": 2.0482, + "step": 13073 + }, + { + "epoch": 2.450609184629803, + "grad_norm": 53702.42578125, + "learning_rate": 2.680827618323991e-05, + "loss": 2.1215, + "step": 13074 + }, + { + "epoch": 2.450796626054358, + "grad_norm": 56449.515625, + "learning_rate": 2.6801314974744335e-05, + "loss": 2.0873, + "step": 13075 + }, + { + "epoch": 2.4509840674789127, + "grad_norm": 54002.3203125, + "learning_rate": 2.6794354339226178e-05, + "loss": 2.0988, + "step": 13076 + }, + { + "epoch": 2.451171508903468, + "grad_norm": 56311.71875, + "learning_rate": 2.6787394276857415e-05, + "loss": 2.0872, + "step": 13077 + }, + { + "epoch": 2.4513589503280224, + "grad_norm": 101008.625, + "learning_rate": 2.6780434787809906e-05, + "loss": 2.1069, + "step": 13078 + }, + { + "epoch": 2.4515463917525775, + "grad_norm": 54900.23046875, + "learning_rate": 2.6773475872255534e-05, + "loss": 2.1764, + "step": 13079 + }, + { + "epoch": 2.451733833177132, + "grad_norm": 54684.79296875, + "learning_rate": 2.67665175303662e-05, + "loss": 2.1803, + "step": 13080 + }, + { + "epoch": 2.4519212746016867, + "grad_norm": 56063.83203125, + "learning_rate": 2.6759559762313767e-05, + "loss": 2.1275, + "step": 13081 + }, + { + "epoch": 2.452108716026242, + "grad_norm": 56328.69921875, + "learning_rate": 2.6752602568270092e-05, + "loss": 2.1327, + "step": 13082 + }, + { + "epoch": 2.4522961574507964, + "grad_norm": 61912.2421875, + "learning_rate": 2.6745645948406965e-05, + "loss": 2.0877, + "step": 13083 + }, + { + "epoch": 2.4524835988753515, + "grad_norm": 56633.83203125, + "learning_rate": 2.6738689902896253e-05, + "loss": 2.0346, + "step": 13084 + }, + { + "epoch": 2.452671040299906, + "grad_norm": 58466.85546875, + "learning_rate": 2.673173443190975e-05, + "loss": 2.0933, + "step": 13085 + }, + { + "epoch": 2.452858481724461, + "grad_norm": 50634.66796875, + "learning_rate": 2.6724779535619205e-05, + "loss": 2.1504, + "step": 13086 + }, + { + "epoch": 2.453045923149016, + "grad_norm": 54454.50390625, + "learning_rate": 2.671782521419644e-05, + "loss": 2.1871, + "step": 13087 + }, + { + "epoch": 2.453233364573571, + "grad_norm": 56381.70703125, + "learning_rate": 2.6710871467813226e-05, + "loss": 2.0636, + "step": 13088 + }, + { + "epoch": 2.4534208059981255, + "grad_norm": 55179.3984375, + "learning_rate": 2.6703918296641285e-05, + "loss": 2.185, + "step": 13089 + }, + { + "epoch": 2.4536082474226806, + "grad_norm": 51030.8515625, + "learning_rate": 2.669696570085234e-05, + "loss": 2.1622, + "step": 13090 + }, + { + "epoch": 2.453795688847235, + "grad_norm": 59248.4296875, + "learning_rate": 2.6690013680618152e-05, + "loss": 2.0754, + "step": 13091 + }, + { + "epoch": 2.45398313027179, + "grad_norm": 50684.296875, + "learning_rate": 2.6683062236110378e-05, + "loss": 2.105, + "step": 13092 + }, + { + "epoch": 2.454170571696345, + "grad_norm": 56309.1796875, + "learning_rate": 2.6676111367500766e-05, + "loss": 2.162, + "step": 13093 + }, + { + "epoch": 2.4543580131209, + "grad_norm": 50513.1171875, + "learning_rate": 2.6669161074960958e-05, + "loss": 2.1996, + "step": 13094 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 50082.76171875, + "learning_rate": 2.6662211358662604e-05, + "loss": 2.1432, + "step": 13095 + }, + { + "epoch": 2.454732895970009, + "grad_norm": 54006.7734375, + "learning_rate": 2.665526221877737e-05, + "loss": 2.0579, + "step": 13096 + }, + { + "epoch": 2.4549203373945643, + "grad_norm": 61039.5859375, + "learning_rate": 2.6648313655476913e-05, + "loss": 2.1259, + "step": 13097 + }, + { + "epoch": 2.455107778819119, + "grad_norm": 53260.89453125, + "learning_rate": 2.664136566893283e-05, + "loss": 2.1783, + "step": 13098 + }, + { + "epoch": 2.455295220243674, + "grad_norm": 54031.35546875, + "learning_rate": 2.6634418259316717e-05, + "loss": 2.1873, + "step": 13099 + }, + { + "epoch": 2.4554826616682286, + "grad_norm": 56349.6015625, + "learning_rate": 2.6627471426800193e-05, + "loss": 2.0776, + "step": 13100 + }, + { + "epoch": 2.4556701030927837, + "grad_norm": 52738.6015625, + "learning_rate": 2.6620525171554823e-05, + "loss": 2.0666, + "step": 13101 + }, + { + "epoch": 2.4558575445173383, + "grad_norm": 52442.1484375, + "learning_rate": 2.661357949375215e-05, + "loss": 2.1164, + "step": 13102 + }, + { + "epoch": 2.456044985941893, + "grad_norm": 54398.078125, + "learning_rate": 2.6606634393563745e-05, + "loss": 2.04, + "step": 13103 + }, + { + "epoch": 2.456232427366448, + "grad_norm": 54341.77734375, + "learning_rate": 2.659968987116116e-05, + "loss": 2.1505, + "step": 13104 + }, + { + "epoch": 2.456419868791003, + "grad_norm": 53129.09375, + "learning_rate": 2.6592745926715888e-05, + "loss": 2.1102, + "step": 13105 + }, + { + "epoch": 2.4566073102155577, + "grad_norm": 57575.5, + "learning_rate": 2.658580256039943e-05, + "loss": 2.1823, + "step": 13106 + }, + { + "epoch": 2.4567947516401123, + "grad_norm": 56890.375, + "learning_rate": 2.6578859772383312e-05, + "loss": 2.1389, + "step": 13107 + }, + { + "epoch": 2.4569821930646674, + "grad_norm": 53284.18359375, + "learning_rate": 2.6571917562838993e-05, + "loss": 2.0677, + "step": 13108 + }, + { + "epoch": 2.457169634489222, + "grad_norm": 50096.51953125, + "learning_rate": 2.6564975931937913e-05, + "loss": 2.1598, + "step": 13109 + }, + { + "epoch": 2.457357075913777, + "grad_norm": 58064.8515625, + "learning_rate": 2.6558034879851546e-05, + "loss": 2.2612, + "step": 13110 + }, + { + "epoch": 2.4575445173383317, + "grad_norm": 59231.6640625, + "learning_rate": 2.6551094406751365e-05, + "loss": 2.1094, + "step": 13111 + }, + { + "epoch": 2.4577319587628867, + "grad_norm": 52824.171875, + "learning_rate": 2.6544154512808705e-05, + "loss": 2.0971, + "step": 13112 + }, + { + "epoch": 2.4579194001874414, + "grad_norm": 57645.390625, + "learning_rate": 2.6537215198195022e-05, + "loss": 2.1168, + "step": 13113 + }, + { + "epoch": 2.4581068416119964, + "grad_norm": 55959.73828125, + "learning_rate": 2.653027646308172e-05, + "loss": 2.1324, + "step": 13114 + }, + { + "epoch": 2.458294283036551, + "grad_norm": 51363.3515625, + "learning_rate": 2.6523338307640165e-05, + "loss": 2.1388, + "step": 13115 + }, + { + "epoch": 2.458481724461106, + "grad_norm": 58735.3046875, + "learning_rate": 2.651640073204169e-05, + "loss": 2.1636, + "step": 13116 + }, + { + "epoch": 2.4586691658856608, + "grad_norm": 51854.671875, + "learning_rate": 2.6509463736457695e-05, + "loss": 2.1071, + "step": 13117 + }, + { + "epoch": 2.4588566073102154, + "grad_norm": 51330.6953125, + "learning_rate": 2.6502527321059466e-05, + "loss": 2.1858, + "step": 13118 + }, + { + "epoch": 2.4590440487347704, + "grad_norm": 53651.05078125, + "learning_rate": 2.6495591486018367e-05, + "loss": 2.1366, + "step": 13119 + }, + { + "epoch": 2.459231490159325, + "grad_norm": 50386.05859375, + "learning_rate": 2.648865623150567e-05, + "loss": 2.1629, + "step": 13120 + }, + { + "epoch": 2.45941893158388, + "grad_norm": 53636.34375, + "learning_rate": 2.6481721557692694e-05, + "loss": 2.1131, + "step": 13121 + }, + { + "epoch": 2.4596063730084348, + "grad_norm": 53690.23046875, + "learning_rate": 2.6474787464750695e-05, + "loss": 2.2139, + "step": 13122 + }, + { + "epoch": 2.45979381443299, + "grad_norm": 52610.390625, + "learning_rate": 2.6467853952850953e-05, + "loss": 2.1213, + "step": 13123 + }, + { + "epoch": 2.4599812558575445, + "grad_norm": 56754.29296875, + "learning_rate": 2.6460921022164714e-05, + "loss": 2.2337, + "step": 13124 + }, + { + "epoch": 2.4601686972820995, + "grad_norm": 54392.51953125, + "learning_rate": 2.6453988672863185e-05, + "loss": 2.1892, + "step": 13125 + }, + { + "epoch": 2.460356138706654, + "grad_norm": 51620.90234375, + "learning_rate": 2.6447056905117628e-05, + "loss": 2.1394, + "step": 13126 + }, + { + "epoch": 2.460543580131209, + "grad_norm": 49412.55859375, + "learning_rate": 2.644012571909921e-05, + "loss": 2.069, + "step": 13127 + }, + { + "epoch": 2.460731021555764, + "grad_norm": 53939.5390625, + "learning_rate": 2.643319511497916e-05, + "loss": 2.1175, + "step": 13128 + }, + { + "epoch": 2.4609184629803185, + "grad_norm": 51138.33203125, + "learning_rate": 2.6426265092928614e-05, + "loss": 2.1942, + "step": 13129 + }, + { + "epoch": 2.4611059044048735, + "grad_norm": 52581.3359375, + "learning_rate": 2.6419335653118783e-05, + "loss": 2.1332, + "step": 13130 + }, + { + "epoch": 2.461293345829428, + "grad_norm": 55077.953125, + "learning_rate": 2.641240679572078e-05, + "loss": 2.0796, + "step": 13131 + }, + { + "epoch": 2.4614807872539832, + "grad_norm": 52632.55078125, + "learning_rate": 2.6405478520905736e-05, + "loss": 2.1504, + "step": 13132 + }, + { + "epoch": 2.461668228678538, + "grad_norm": 51848.37109375, + "learning_rate": 2.6398550828844776e-05, + "loss": 2.1575, + "step": 13133 + }, + { + "epoch": 2.461855670103093, + "grad_norm": 56256.88671875, + "learning_rate": 2.6391623719709048e-05, + "loss": 2.1504, + "step": 13134 + }, + { + "epoch": 2.4620431115276475, + "grad_norm": 57750.40234375, + "learning_rate": 2.638469719366957e-05, + "loss": 2.1543, + "step": 13135 + }, + { + "epoch": 2.4622305529522026, + "grad_norm": 54170.9453125, + "learning_rate": 2.6377771250897455e-05, + "loss": 2.1702, + "step": 13136 + }, + { + "epoch": 2.4624179943767572, + "grad_norm": 56992.02734375, + "learning_rate": 2.6370845891563778e-05, + "loss": 2.1291, + "step": 13137 + }, + { + "epoch": 2.4626054358013123, + "grad_norm": 55012.1328125, + "learning_rate": 2.6363921115839573e-05, + "loss": 2.1792, + "step": 13138 + }, + { + "epoch": 2.462792877225867, + "grad_norm": 56011.265625, + "learning_rate": 2.635699692389586e-05, + "loss": 2.1647, + "step": 13139 + }, + { + "epoch": 2.4629803186504216, + "grad_norm": 60629.6875, + "learning_rate": 2.635007331590366e-05, + "loss": 2.0789, + "step": 13140 + }, + { + "epoch": 2.4631677600749766, + "grad_norm": 51483.4609375, + "learning_rate": 2.6343150292034035e-05, + "loss": 2.1187, + "step": 13141 + }, + { + "epoch": 2.4633552014995312, + "grad_norm": 57938.703125, + "learning_rate": 2.633622785245788e-05, + "loss": 2.1127, + "step": 13142 + }, + { + "epoch": 2.4635426429240863, + "grad_norm": 50266.19140625, + "learning_rate": 2.6329305997346214e-05, + "loss": 2.046, + "step": 13143 + }, + { + "epoch": 2.463730084348641, + "grad_norm": 55217.22265625, + "learning_rate": 2.6322384726870024e-05, + "loss": 2.1533, + "step": 13144 + }, + { + "epoch": 2.463917525773196, + "grad_norm": 55707.97265625, + "learning_rate": 2.6315464041200232e-05, + "loss": 2.116, + "step": 13145 + }, + { + "epoch": 2.4641049671977506, + "grad_norm": 55217.9921875, + "learning_rate": 2.6308543940507758e-05, + "loss": 2.0353, + "step": 13146 + }, + { + "epoch": 2.4642924086223057, + "grad_norm": 54188.609375, + "learning_rate": 2.6301624424963546e-05, + "loss": 2.0914, + "step": 13147 + }, + { + "epoch": 2.4644798500468603, + "grad_norm": 59279.8125, + "learning_rate": 2.6294705494738458e-05, + "loss": 2.1883, + "step": 13148 + }, + { + "epoch": 2.4646672914714154, + "grad_norm": 54915.90625, + "learning_rate": 2.628778715000344e-05, + "loss": 2.0758, + "step": 13149 + }, + { + "epoch": 2.46485473289597, + "grad_norm": 54172.12890625, + "learning_rate": 2.6280869390929313e-05, + "loss": 2.1168, + "step": 13150 + }, + { + "epoch": 2.4650421743205246, + "grad_norm": 51409.12890625, + "learning_rate": 2.627395221768698e-05, + "loss": 2.1615, + "step": 13151 + }, + { + "epoch": 2.4652296157450797, + "grad_norm": 51365.51171875, + "learning_rate": 2.6267035630447267e-05, + "loss": 2.0986, + "step": 13152 + }, + { + "epoch": 2.4654170571696343, + "grad_norm": 52595.41015625, + "learning_rate": 2.626011962938098e-05, + "loss": 2.134, + "step": 13153 + }, + { + "epoch": 2.4656044985941894, + "grad_norm": 52215.0546875, + "learning_rate": 2.6253204214658978e-05, + "loss": 2.0759, + "step": 13154 + }, + { + "epoch": 2.465791940018744, + "grad_norm": 53340.73828125, + "learning_rate": 2.624628938645203e-05, + "loss": 2.2159, + "step": 13155 + }, + { + "epoch": 2.465979381443299, + "grad_norm": 55264.82421875, + "learning_rate": 2.6239375144930956e-05, + "loss": 2.1653, + "step": 13156 + }, + { + "epoch": 2.4661668228678537, + "grad_norm": 53681.94140625, + "learning_rate": 2.6232461490266502e-05, + "loss": 2.1417, + "step": 13157 + }, + { + "epoch": 2.466354264292409, + "grad_norm": 56403.5, + "learning_rate": 2.622554842262942e-05, + "loss": 2.122, + "step": 13158 + }, + { + "epoch": 2.4665417057169634, + "grad_norm": 54090.64453125, + "learning_rate": 2.6218635942190474e-05, + "loss": 2.1494, + "step": 13159 + }, + { + "epoch": 2.4667291471415185, + "grad_norm": 54217.05859375, + "learning_rate": 2.62117240491204e-05, + "loss": 2.1197, + "step": 13160 + }, + { + "epoch": 2.466916588566073, + "grad_norm": 51624.9296875, + "learning_rate": 2.6204812743589902e-05, + "loss": 2.1596, + "step": 13161 + }, + { + "epoch": 2.4671040299906277, + "grad_norm": 58161.40234375, + "learning_rate": 2.6197902025769653e-05, + "loss": 2.1319, + "step": 13162 + }, + { + "epoch": 2.467291471415183, + "grad_norm": 54072.2109375, + "learning_rate": 2.6190991895830376e-05, + "loss": 2.1045, + "step": 13163 + }, + { + "epoch": 2.4674789128397374, + "grad_norm": 54843.359375, + "learning_rate": 2.6184082353942756e-05, + "loss": 2.0961, + "step": 13164 + }, + { + "epoch": 2.4676663542642925, + "grad_norm": 54715.64453125, + "learning_rate": 2.617717340027739e-05, + "loss": 2.0856, + "step": 13165 + }, + { + "epoch": 2.467853795688847, + "grad_norm": 53525.71484375, + "learning_rate": 2.6170265035004958e-05, + "loss": 2.1693, + "step": 13166 + }, + { + "epoch": 2.468041237113402, + "grad_norm": 54575.38671875, + "learning_rate": 2.61633572582961e-05, + "loss": 2.1165, + "step": 13167 + }, + { + "epoch": 2.468228678537957, + "grad_norm": 48467.53515625, + "learning_rate": 2.6156450070321413e-05, + "loss": 2.1668, + "step": 13168 + }, + { + "epoch": 2.468416119962512, + "grad_norm": 54719.14453125, + "learning_rate": 2.614954347125148e-05, + "loss": 2.1614, + "step": 13169 + }, + { + "epoch": 2.4686035613870665, + "grad_norm": 68500.8125, + "learning_rate": 2.6142637461256915e-05, + "loss": 2.0885, + "step": 13170 + }, + { + "epoch": 2.4687910028116216, + "grad_norm": 55105.10546875, + "learning_rate": 2.6135732040508275e-05, + "loss": 2.1899, + "step": 13171 + }, + { + "epoch": 2.468978444236176, + "grad_norm": 57656.546875, + "learning_rate": 2.6128827209176098e-05, + "loss": 2.1447, + "step": 13172 + }, + { + "epoch": 2.469165885660731, + "grad_norm": 54136.5703125, + "learning_rate": 2.6121922967430934e-05, + "loss": 2.0451, + "step": 13173 + }, + { + "epoch": 2.469353327085286, + "grad_norm": 56308.390625, + "learning_rate": 2.6115019315443335e-05, + "loss": 2.1572, + "step": 13174 + }, + { + "epoch": 2.4695407685098405, + "grad_norm": 57279.76171875, + "learning_rate": 2.610811625338379e-05, + "loss": 2.1703, + "step": 13175 + }, + { + "epoch": 2.4697282099343956, + "grad_norm": 49643.765625, + "learning_rate": 2.6101213781422784e-05, + "loss": 2.1397, + "step": 13176 + }, + { + "epoch": 2.46991565135895, + "grad_norm": 52371.49609375, + "learning_rate": 2.6094311899730826e-05, + "loss": 2.1317, + "step": 13177 + }, + { + "epoch": 2.4701030927835053, + "grad_norm": 54618.328125, + "learning_rate": 2.6087410608478346e-05, + "loss": 2.1421, + "step": 13178 + }, + { + "epoch": 2.47029053420806, + "grad_norm": 54657.30078125, + "learning_rate": 2.608050990783585e-05, + "loss": 2.1839, + "step": 13179 + }, + { + "epoch": 2.470477975632615, + "grad_norm": 50440.98046875, + "learning_rate": 2.6073609797973726e-05, + "loss": 2.1151, + "step": 13180 + }, + { + "epoch": 2.4706654170571696, + "grad_norm": 52500.66796875, + "learning_rate": 2.6066710279062434e-05, + "loss": 2.1296, + "step": 13181 + }, + { + "epoch": 2.4708528584817246, + "grad_norm": 53007.05859375, + "learning_rate": 2.6059811351272367e-05, + "loss": 2.0784, + "step": 13182 + }, + { + "epoch": 2.4710402999062793, + "grad_norm": 56597.7734375, + "learning_rate": 2.6052913014773906e-05, + "loss": 2.1581, + "step": 13183 + }, + { + "epoch": 2.471227741330834, + "grad_norm": 53785.46875, + "learning_rate": 2.6046015269737468e-05, + "loss": 2.1499, + "step": 13184 + }, + { + "epoch": 2.471415182755389, + "grad_norm": 53522.890625, + "learning_rate": 2.603911811633337e-05, + "loss": 2.0774, + "step": 13185 + }, + { + "epoch": 2.4716026241799436, + "grad_norm": 56111.30078125, + "learning_rate": 2.6032221554732013e-05, + "loss": 2.1195, + "step": 13186 + }, + { + "epoch": 2.4717900656044987, + "grad_norm": 51517.76953125, + "learning_rate": 2.6025325585103704e-05, + "loss": 2.0882, + "step": 13187 + }, + { + "epoch": 2.4719775070290533, + "grad_norm": 54021.26171875, + "learning_rate": 2.6018430207618753e-05, + "loss": 2.137, + "step": 13188 + }, + { + "epoch": 2.4721649484536083, + "grad_norm": 59235.7109375, + "learning_rate": 2.6011535422447487e-05, + "loss": 2.1763, + "step": 13189 + }, + { + "epoch": 2.472352389878163, + "grad_norm": 59147.8046875, + "learning_rate": 2.6004641229760208e-05, + "loss": 2.1794, + "step": 13190 + }, + { + "epoch": 2.472539831302718, + "grad_norm": 52581.3359375, + "learning_rate": 2.5997747629727182e-05, + "loss": 2.1732, + "step": 13191 + }, + { + "epoch": 2.4727272727272727, + "grad_norm": 56196.5390625, + "learning_rate": 2.5990854622518645e-05, + "loss": 2.0872, + "step": 13192 + }, + { + "epoch": 2.4729147141518277, + "grad_norm": 56596.99609375, + "learning_rate": 2.598396220830488e-05, + "loss": 2.0937, + "step": 13193 + }, + { + "epoch": 2.4731021555763824, + "grad_norm": 55237.0625, + "learning_rate": 2.597707038725615e-05, + "loss": 2.1767, + "step": 13194 + }, + { + "epoch": 2.473289597000937, + "grad_norm": 58217.87890625, + "learning_rate": 2.597017915954259e-05, + "loss": 2.0253, + "step": 13195 + }, + { + "epoch": 2.473477038425492, + "grad_norm": 54049.40625, + "learning_rate": 2.596328852533445e-05, + "loss": 2.1477, + "step": 13196 + }, + { + "epoch": 2.4736644798500467, + "grad_norm": 56246.796875, + "learning_rate": 2.5956398484801946e-05, + "loss": 2.13, + "step": 13197 + }, + { + "epoch": 2.4738519212746017, + "grad_norm": 52354.6640625, + "learning_rate": 2.5949509038115216e-05, + "loss": 2.1005, + "step": 13198 + }, + { + "epoch": 2.4740393626991564, + "grad_norm": 62379.70703125, + "learning_rate": 2.5942620185444422e-05, + "loss": 2.196, + "step": 13199 + }, + { + "epoch": 2.4742268041237114, + "grad_norm": 51756.4296875, + "learning_rate": 2.5935731926959726e-05, + "loss": 2.1502, + "step": 13200 + }, + { + "epoch": 2.474414245548266, + "grad_norm": 54415.25390625, + "learning_rate": 2.5928844262831266e-05, + "loss": 2.1394, + "step": 13201 + }, + { + "epoch": 2.474601686972821, + "grad_norm": 50290.58203125, + "learning_rate": 2.592195719322912e-05, + "loss": 2.1349, + "step": 13202 + }, + { + "epoch": 2.4747891283973757, + "grad_norm": 58505.953125, + "learning_rate": 2.5915070718323407e-05, + "loss": 2.1691, + "step": 13203 + }, + { + "epoch": 2.474976569821931, + "grad_norm": 53110.0625, + "learning_rate": 2.5908184838284245e-05, + "loss": 2.1712, + "step": 13204 + }, + { + "epoch": 2.4751640112464854, + "grad_norm": 50872.87109375, + "learning_rate": 2.5901299553281677e-05, + "loss": 2.1326, + "step": 13205 + }, + { + "epoch": 2.47535145267104, + "grad_norm": 53927.71484375, + "learning_rate": 2.589441486348575e-05, + "loss": 2.0876, + "step": 13206 + }, + { + "epoch": 2.475538894095595, + "grad_norm": 50096.79296875, + "learning_rate": 2.5887530769066547e-05, + "loss": 2.1472, + "step": 13207 + }, + { + "epoch": 2.47572633552015, + "grad_norm": 56309.984375, + "learning_rate": 2.5880647270194063e-05, + "loss": 2.1203, + "step": 13208 + }, + { + "epoch": 2.475913776944705, + "grad_norm": 53850.12890625, + "learning_rate": 2.5873764367038307e-05, + "loss": 2.2174, + "step": 13209 + }, + { + "epoch": 2.4761012183692594, + "grad_norm": 50760.578125, + "learning_rate": 2.5866882059769303e-05, + "loss": 2.1054, + "step": 13210 + }, + { + "epoch": 2.4762886597938145, + "grad_norm": 51807.359375, + "learning_rate": 2.5860000348557012e-05, + "loss": 2.1327, + "step": 13211 + }, + { + "epoch": 2.476476101218369, + "grad_norm": 52272.703125, + "learning_rate": 2.5853119233571433e-05, + "loss": 2.0305, + "step": 13212 + }, + { + "epoch": 2.476663542642924, + "grad_norm": 57860.4765625, + "learning_rate": 2.584623871498248e-05, + "loss": 2.1125, + "step": 13213 + }, + { + "epoch": 2.476850984067479, + "grad_norm": 56835.41015625, + "learning_rate": 2.5839358792960133e-05, + "loss": 2.1589, + "step": 13214 + }, + { + "epoch": 2.477038425492034, + "grad_norm": 56732.0078125, + "learning_rate": 2.583247946767428e-05, + "loss": 2.1494, + "step": 13215 + }, + { + "epoch": 2.4772258669165885, + "grad_norm": 51876.109375, + "learning_rate": 2.5825600739294875e-05, + "loss": 2.1559, + "step": 13216 + }, + { + "epoch": 2.477413308341143, + "grad_norm": 53296.296875, + "learning_rate": 2.5818722607991787e-05, + "loss": 2.131, + "step": 13217 + }, + { + "epoch": 2.477600749765698, + "grad_norm": 54652.23046875, + "learning_rate": 2.581184507393487e-05, + "loss": 2.0509, + "step": 13218 + }, + { + "epoch": 2.4777881911902533, + "grad_norm": 53603.109375, + "learning_rate": 2.580496813729403e-05, + "loss": 2.1181, + "step": 13219 + }, + { + "epoch": 2.477975632614808, + "grad_norm": 54154.24609375, + "learning_rate": 2.579809179823912e-05, + "loss": 2.141, + "step": 13220 + }, + { + "epoch": 2.4781630740393625, + "grad_norm": 55748.4609375, + "learning_rate": 2.5791216056939972e-05, + "loss": 2.1028, + "step": 13221 + }, + { + "epoch": 2.4783505154639176, + "grad_norm": 51797.6015625, + "learning_rate": 2.5784340913566378e-05, + "loss": 2.1614, + "step": 13222 + }, + { + "epoch": 2.4785379568884722, + "grad_norm": 54144.7890625, + "learning_rate": 2.5777466368288183e-05, + "loss": 2.1229, + "step": 13223 + }, + { + "epoch": 2.4787253983130273, + "grad_norm": 54061.16015625, + "learning_rate": 2.5770592421275175e-05, + "loss": 2.1342, + "step": 13224 + }, + { + "epoch": 2.478912839737582, + "grad_norm": 52414.9609375, + "learning_rate": 2.576371907269709e-05, + "loss": 2.085, + "step": 13225 + }, + { + "epoch": 2.479100281162137, + "grad_norm": 52861.21484375, + "learning_rate": 2.5756846322723727e-05, + "loss": 2.0552, + "step": 13226 + }, + { + "epoch": 2.4792877225866916, + "grad_norm": 55611.828125, + "learning_rate": 2.574997417152486e-05, + "loss": 2.1221, + "step": 13227 + }, + { + "epoch": 2.4794751640112467, + "grad_norm": 52910.2265625, + "learning_rate": 2.5743102619270155e-05, + "loss": 2.161, + "step": 13228 + }, + { + "epoch": 2.4796626054358013, + "grad_norm": 56308.78515625, + "learning_rate": 2.5736231666129364e-05, + "loss": 2.0874, + "step": 13229 + }, + { + "epoch": 2.4798500468603564, + "grad_norm": 48699.15234375, + "learning_rate": 2.5729361312272216e-05, + "loss": 2.1474, + "step": 13230 + }, + { + "epoch": 2.480037488284911, + "grad_norm": 54436.48828125, + "learning_rate": 2.572249155786838e-05, + "loss": 2.0356, + "step": 13231 + }, + { + "epoch": 2.4802249297094656, + "grad_norm": 59996.29296875, + "learning_rate": 2.571562240308749e-05, + "loss": 2.1607, + "step": 13232 + }, + { + "epoch": 2.4804123711340207, + "grad_norm": 52500.5546875, + "learning_rate": 2.5708753848099255e-05, + "loss": 2.1187, + "step": 13233 + }, + { + "epoch": 2.4805998125585753, + "grad_norm": 49681.54296875, + "learning_rate": 2.570188589307333e-05, + "loss": 2.1158, + "step": 13234 + }, + { + "epoch": 2.4807872539831304, + "grad_norm": 56193.19921875, + "learning_rate": 2.5695018538179306e-05, + "loss": 2.1537, + "step": 13235 + }, + { + "epoch": 2.480974695407685, + "grad_norm": 55255.6640625, + "learning_rate": 2.56881517835868e-05, + "loss": 2.0751, + "step": 13236 + }, + { + "epoch": 2.48116213683224, + "grad_norm": 54445.14453125, + "learning_rate": 2.5681285629465447e-05, + "loss": 2.139, + "step": 13237 + }, + { + "epoch": 2.4813495782567947, + "grad_norm": 51532.8515625, + "learning_rate": 2.5674420075984806e-05, + "loss": 2.1202, + "step": 13238 + }, + { + "epoch": 2.4815370196813498, + "grad_norm": 57113.3671875, + "learning_rate": 2.566755512331443e-05, + "loss": 2.1107, + "step": 13239 + }, + { + "epoch": 2.4817244611059044, + "grad_norm": 50301.390625, + "learning_rate": 2.5660690771623917e-05, + "loss": 2.1206, + "step": 13240 + }, + { + "epoch": 2.4819119025304595, + "grad_norm": 54406.7734375, + "learning_rate": 2.5653827021082767e-05, + "loss": 2.1689, + "step": 13241 + }, + { + "epoch": 2.482099343955014, + "grad_norm": 57818.8046875, + "learning_rate": 2.5646963871860542e-05, + "loss": 2.1387, + "step": 13242 + }, + { + "epoch": 2.4822867853795687, + "grad_norm": 57196.7734375, + "learning_rate": 2.564010132412671e-05, + "loss": 2.1241, + "step": 13243 + }, + { + "epoch": 2.4824742268041238, + "grad_norm": 57184.1796875, + "learning_rate": 2.5633239378050822e-05, + "loss": 2.2281, + "step": 13244 + }, + { + "epoch": 2.4826616682286784, + "grad_norm": 55214.640625, + "learning_rate": 2.5626378033802302e-05, + "loss": 2.0744, + "step": 13245 + }, + { + "epoch": 2.4828491096532335, + "grad_norm": 54443.734375, + "learning_rate": 2.5619517291550666e-05, + "loss": 2.1638, + "step": 13246 + }, + { + "epoch": 2.483036551077788, + "grad_norm": 52597.75, + "learning_rate": 2.5612657151465343e-05, + "loss": 2.1136, + "step": 13247 + }, + { + "epoch": 2.483223992502343, + "grad_norm": 51695.8515625, + "learning_rate": 2.560579761371575e-05, + "loss": 2.1763, + "step": 13248 + }, + { + "epoch": 2.483411433926898, + "grad_norm": 52973.890625, + "learning_rate": 2.5598938678471345e-05, + "loss": 2.1496, + "step": 13249 + }, + { + "epoch": 2.483598875351453, + "grad_norm": 51798.48046875, + "learning_rate": 2.5592080345901527e-05, + "loss": 2.12, + "step": 13250 + }, + { + "epoch": 2.4837863167760075, + "grad_norm": 52770.06640625, + "learning_rate": 2.5585222616175653e-05, + "loss": 2.0926, + "step": 13251 + }, + { + "epoch": 2.4839737582005625, + "grad_norm": 61076.33984375, + "learning_rate": 2.5578365489463125e-05, + "loss": 2.1165, + "step": 13252 + }, + { + "epoch": 2.484161199625117, + "grad_norm": 51750.25, + "learning_rate": 2.5571508965933326e-05, + "loss": 2.0921, + "step": 13253 + }, + { + "epoch": 2.484348641049672, + "grad_norm": 53089.0234375, + "learning_rate": 2.5564653045755582e-05, + "loss": 2.1115, + "step": 13254 + }, + { + "epoch": 2.484536082474227, + "grad_norm": 55248.1640625, + "learning_rate": 2.5557797729099215e-05, + "loss": 2.139, + "step": 13255 + }, + { + "epoch": 2.4847235238987815, + "grad_norm": 53162.53125, + "learning_rate": 2.555094301613355e-05, + "loss": 2.1995, + "step": 13256 + }, + { + "epoch": 2.4849109653233366, + "grad_norm": 51756.4765625, + "learning_rate": 2.554408890702793e-05, + "loss": 2.111, + "step": 13257 + }, + { + "epoch": 2.485098406747891, + "grad_norm": 56521.4453125, + "learning_rate": 2.5537235401951576e-05, + "loss": 2.0822, + "step": 13258 + }, + { + "epoch": 2.4852858481724462, + "grad_norm": 55063.7734375, + "learning_rate": 2.553038250107378e-05, + "loss": 2.1074, + "step": 13259 + }, + { + "epoch": 2.485473289597001, + "grad_norm": 53145.984375, + "learning_rate": 2.552353020456384e-05, + "loss": 2.1232, + "step": 13260 + }, + { + "epoch": 2.485660731021556, + "grad_norm": 57617.59765625, + "learning_rate": 2.5516678512590962e-05, + "loss": 2.1119, + "step": 13261 + }, + { + "epoch": 2.4858481724461106, + "grad_norm": 53439.01171875, + "learning_rate": 2.550982742532437e-05, + "loss": 2.1243, + "step": 13262 + }, + { + "epoch": 2.4860356138706656, + "grad_norm": 56457.75, + "learning_rate": 2.55029769429333e-05, + "loss": 2.0963, + "step": 13263 + }, + { + "epoch": 2.4862230552952203, + "grad_norm": 53450.734375, + "learning_rate": 2.5496127065586946e-05, + "loss": 2.0828, + "step": 13264 + }, + { + "epoch": 2.486410496719775, + "grad_norm": 53279.52734375, + "learning_rate": 2.5489277793454458e-05, + "loss": 2.1296, + "step": 13265 + }, + { + "epoch": 2.48659793814433, + "grad_norm": 53831.09375, + "learning_rate": 2.548242912670503e-05, + "loss": 2.1858, + "step": 13266 + }, + { + "epoch": 2.4867853795688846, + "grad_norm": 54200.12890625, + "learning_rate": 2.5475581065507843e-05, + "loss": 2.0994, + "step": 13267 + }, + { + "epoch": 2.4869728209934396, + "grad_norm": 51381.4453125, + "learning_rate": 2.5468733610032003e-05, + "loss": 2.1739, + "step": 13268 + }, + { + "epoch": 2.4871602624179943, + "grad_norm": 62246.91015625, + "learning_rate": 2.5461886760446613e-05, + "loss": 2.0193, + "step": 13269 + }, + { + "epoch": 2.4873477038425493, + "grad_norm": 52695.6796875, + "learning_rate": 2.5455040516920825e-05, + "loss": 2.1314, + "step": 13270 + }, + { + "epoch": 2.487535145267104, + "grad_norm": 53835.9765625, + "learning_rate": 2.54481948796237e-05, + "loss": 2.0662, + "step": 13271 + }, + { + "epoch": 2.487722586691659, + "grad_norm": 52266.171875, + "learning_rate": 2.544134984872435e-05, + "loss": 2.1103, + "step": 13272 + }, + { + "epoch": 2.4879100281162136, + "grad_norm": 58645.5546875, + "learning_rate": 2.5434505424391796e-05, + "loss": 2.054, + "step": 13273 + }, + { + "epoch": 2.4880974695407687, + "grad_norm": 54741.35546875, + "learning_rate": 2.542766160679513e-05, + "loss": 2.1238, + "step": 13274 + }, + { + "epoch": 2.4882849109653233, + "grad_norm": 59271.8203125, + "learning_rate": 2.5420818396103335e-05, + "loss": 2.0221, + "step": 13275 + }, + { + "epoch": 2.488472352389878, + "grad_norm": 54663.703125, + "learning_rate": 2.5413975792485485e-05, + "loss": 2.123, + "step": 13276 + }, + { + "epoch": 2.488659793814433, + "grad_norm": 53914.91796875, + "learning_rate": 2.5407133796110556e-05, + "loss": 2.094, + "step": 13277 + }, + { + "epoch": 2.4888472352389877, + "grad_norm": 53759.48046875, + "learning_rate": 2.5400292407147517e-05, + "loss": 2.1164, + "step": 13278 + }, + { + "epoch": 2.4890346766635427, + "grad_norm": 58547.76953125, + "learning_rate": 2.539345162576538e-05, + "loss": 2.1913, + "step": 13279 + }, + { + "epoch": 2.4892221180880973, + "grad_norm": 59283.28515625, + "learning_rate": 2.538661145213309e-05, + "loss": 2.1116, + "step": 13280 + }, + { + "epoch": 2.4894095595126524, + "grad_norm": 58220.05078125, + "learning_rate": 2.5379771886419557e-05, + "loss": 2.4541, + "step": 13281 + }, + { + "epoch": 2.489597000937207, + "grad_norm": 51740.5859375, + "learning_rate": 2.5372932928793748e-05, + "loss": 2.1281, + "step": 13282 + }, + { + "epoch": 2.489784442361762, + "grad_norm": 55224.37109375, + "learning_rate": 2.536609457942458e-05, + "loss": 2.1275, + "step": 13283 + }, + { + "epoch": 2.4899718837863167, + "grad_norm": 52224.734375, + "learning_rate": 2.535925683848094e-05, + "loss": 2.1373, + "step": 13284 + }, + { + "epoch": 2.490159325210872, + "grad_norm": 49680.13671875, + "learning_rate": 2.5352419706131686e-05, + "loss": 2.1196, + "step": 13285 + }, + { + "epoch": 2.4903467666354264, + "grad_norm": 50266.8671875, + "learning_rate": 2.5345583182545707e-05, + "loss": 2.1362, + "step": 13286 + }, + { + "epoch": 2.490534208059981, + "grad_norm": 53306.890625, + "learning_rate": 2.5338747267891903e-05, + "loss": 2.1821, + "step": 13287 + }, + { + "epoch": 2.490721649484536, + "grad_norm": 56126.38671875, + "learning_rate": 2.5331911962339028e-05, + "loss": 2.1473, + "step": 13288 + }, + { + "epoch": 2.4909090909090907, + "grad_norm": 53883.0625, + "learning_rate": 2.5325077266055942e-05, + "loss": 2.1634, + "step": 13289 + }, + { + "epoch": 2.491096532333646, + "grad_norm": 54506.2109375, + "learning_rate": 2.5318243179211475e-05, + "loss": 2.1377, + "step": 13290 + }, + { + "epoch": 2.4912839737582004, + "grad_norm": 54028.9140625, + "learning_rate": 2.531140970197441e-05, + "loss": 2.1539, + "step": 13291 + }, + { + "epoch": 2.4914714151827555, + "grad_norm": 55137.44921875, + "learning_rate": 2.5304576834513488e-05, + "loss": 2.1728, + "step": 13292 + }, + { + "epoch": 2.49165885660731, + "grad_norm": 52744.62109375, + "learning_rate": 2.5297744576997523e-05, + "loss": 2.1894, + "step": 13293 + }, + { + "epoch": 2.491846298031865, + "grad_norm": 53522.44921875, + "learning_rate": 2.529091292959524e-05, + "loss": 2.1173, + "step": 13294 + }, + { + "epoch": 2.49203373945642, + "grad_norm": 51463.78125, + "learning_rate": 2.5284081892475352e-05, + "loss": 2.1768, + "step": 13295 + }, + { + "epoch": 2.492221180880975, + "grad_norm": 55858.109375, + "learning_rate": 2.5277251465806595e-05, + "loss": 2.0958, + "step": 13296 + }, + { + "epoch": 2.4924086223055295, + "grad_norm": 54256.20703125, + "learning_rate": 2.5270421649757692e-05, + "loss": 2.1717, + "step": 13297 + }, + { + "epoch": 2.492596063730084, + "grad_norm": 52752.21484375, + "learning_rate": 2.5263592444497315e-05, + "loss": 2.176, + "step": 13298 + }, + { + "epoch": 2.492783505154639, + "grad_norm": 62637.53515625, + "learning_rate": 2.525676385019411e-05, + "loss": 2.1353, + "step": 13299 + }, + { + "epoch": 2.492970946579194, + "grad_norm": 53398.46875, + "learning_rate": 2.524993586701678e-05, + "loss": 2.1352, + "step": 13300 + }, + { + "epoch": 2.493158388003749, + "grad_norm": 56145.046875, + "learning_rate": 2.5243108495133927e-05, + "loss": 2.1321, + "step": 13301 + }, + { + "epoch": 2.4933458294283035, + "grad_norm": 53956.71875, + "learning_rate": 2.5236281734714212e-05, + "loss": 2.1546, + "step": 13302 + }, + { + "epoch": 2.4935332708528586, + "grad_norm": 52137.640625, + "learning_rate": 2.5229455585926232e-05, + "loss": 2.149, + "step": 13303 + }, + { + "epoch": 2.493720712277413, + "grad_norm": 56709.0234375, + "learning_rate": 2.5222630048938565e-05, + "loss": 2.1497, + "step": 13304 + }, + { + "epoch": 2.4939081537019683, + "grad_norm": 61182.65234375, + "learning_rate": 2.5215805123919833e-05, + "loss": 2.2194, + "step": 13305 + }, + { + "epoch": 2.494095595126523, + "grad_norm": 57764.26953125, + "learning_rate": 2.5208980811038564e-05, + "loss": 2.0837, + "step": 13306 + }, + { + "epoch": 2.494283036551078, + "grad_norm": 52934.0078125, + "learning_rate": 2.5202157110463342e-05, + "loss": 2.1621, + "step": 13307 + }, + { + "epoch": 2.4944704779756326, + "grad_norm": 55817.44140625, + "learning_rate": 2.519533402236267e-05, + "loss": 2.1559, + "step": 13308 + }, + { + "epoch": 2.494657919400187, + "grad_norm": 53905.59765625, + "learning_rate": 2.5188511546905103e-05, + "loss": 2.1945, + "step": 13309 + }, + { + "epoch": 2.4948453608247423, + "grad_norm": 53173.76171875, + "learning_rate": 2.5181689684259137e-05, + "loss": 2.0373, + "step": 13310 + }, + { + "epoch": 2.495032802249297, + "grad_norm": 58400.640625, + "learning_rate": 2.517486843459324e-05, + "loss": 2.143, + "step": 13311 + }, + { + "epoch": 2.495220243673852, + "grad_norm": 54281.83984375, + "learning_rate": 2.5168047798075905e-05, + "loss": 2.1454, + "step": 13312 + }, + { + "epoch": 2.4954076850984066, + "grad_norm": 52472.37109375, + "learning_rate": 2.5161227774875616e-05, + "loss": 2.0923, + "step": 13313 + }, + { + "epoch": 2.4955951265229617, + "grad_norm": 57587.4921875, + "learning_rate": 2.5154408365160792e-05, + "loss": 2.1307, + "step": 13314 + }, + { + "epoch": 2.4957825679475163, + "grad_norm": 55319.59375, + "learning_rate": 2.514758956909985e-05, + "loss": 2.1594, + "step": 13315 + }, + { + "epoch": 2.4959700093720714, + "grad_norm": 55668.91015625, + "learning_rate": 2.514077138686125e-05, + "loss": 2.0624, + "step": 13316 + }, + { + "epoch": 2.496157450796626, + "grad_norm": 52895.16796875, + "learning_rate": 2.513395381861336e-05, + "loss": 2.156, + "step": 13317 + }, + { + "epoch": 2.496344892221181, + "grad_norm": 54958.57421875, + "learning_rate": 2.5127136864524554e-05, + "loss": 2.117, + "step": 13318 + }, + { + "epoch": 2.4965323336457357, + "grad_norm": 51353.98828125, + "learning_rate": 2.512032052476322e-05, + "loss": 2.0714, + "step": 13319 + }, + { + "epoch": 2.4967197750702903, + "grad_norm": 52424.640625, + "learning_rate": 2.5113504799497743e-05, + "loss": 2.097, + "step": 13320 + }, + { + "epoch": 2.4969072164948454, + "grad_norm": 53055.515625, + "learning_rate": 2.51066896888964e-05, + "loss": 2.1028, + "step": 13321 + }, + { + "epoch": 2.4970946579194, + "grad_norm": 53232.578125, + "learning_rate": 2.5099875193127552e-05, + "loss": 2.0775, + "step": 13322 + }, + { + "epoch": 2.497282099343955, + "grad_norm": 54979.70703125, + "learning_rate": 2.5093061312359512e-05, + "loss": 2.1175, + "step": 13323 + }, + { + "epoch": 2.4974695407685097, + "grad_norm": 58693.89453125, + "learning_rate": 2.5086248046760574e-05, + "loss": 2.0871, + "step": 13324 + }, + { + "epoch": 2.4976569821930648, + "grad_norm": 52700.375, + "learning_rate": 2.5079435396498984e-05, + "loss": 2.1571, + "step": 13325 + }, + { + "epoch": 2.4978444236176194, + "grad_norm": 53414.43359375, + "learning_rate": 2.507262336174303e-05, + "loss": 2.127, + "step": 13326 + }, + { + "epoch": 2.4980318650421744, + "grad_norm": 51756.5078125, + "learning_rate": 2.5065811942660978e-05, + "loss": 2.1458, + "step": 13327 + }, + { + "epoch": 2.498219306466729, + "grad_norm": 51183.81640625, + "learning_rate": 2.5059001139421045e-05, + "loss": 2.2004, + "step": 13328 + }, + { + "epoch": 2.498406747891284, + "grad_norm": 53823.5, + "learning_rate": 2.5052190952191434e-05, + "loss": 2.1568, + "step": 13329 + }, + { + "epoch": 2.4985941893158388, + "grad_norm": 52026.2890625, + "learning_rate": 2.5045381381140375e-05, + "loss": 2.1181, + "step": 13330 + }, + { + "epoch": 2.4987816307403934, + "grad_norm": 50130.3125, + "learning_rate": 2.5038572426436018e-05, + "loss": 2.2168, + "step": 13331 + }, + { + "epoch": 2.4989690721649485, + "grad_norm": 59106.37109375, + "learning_rate": 2.5031764088246584e-05, + "loss": 2.0993, + "step": 13332 + }, + { + "epoch": 2.4991565135895035, + "grad_norm": 56679.453125, + "learning_rate": 2.5024956366740206e-05, + "loss": 2.2017, + "step": 13333 + }, + { + "epoch": 2.499343955014058, + "grad_norm": 56824.39453125, + "learning_rate": 2.5018149262085e-05, + "loss": 2.1306, + "step": 13334 + }, + { + "epoch": 2.4995313964386128, + "grad_norm": 56647.9375, + "learning_rate": 2.5011342774449143e-05, + "loss": 2.1273, + "step": 13335 + }, + { + "epoch": 2.499718837863168, + "grad_norm": 54896.0859375, + "learning_rate": 2.5004536904000696e-05, + "loss": 2.0769, + "step": 13336 + }, + { + "epoch": 2.4999062792877225, + "grad_norm": 58217.0078125, + "learning_rate": 2.499773165090781e-05, + "loss": 2.0781, + "step": 13337 + }, + { + "epoch": 2.5000937207122775, + "grad_norm": 56663.421875, + "learning_rate": 2.4990927015338512e-05, + "loss": 2.1162, + "step": 13338 + }, + { + "epoch": 2.500281162136832, + "grad_norm": 54271.68359375, + "learning_rate": 2.4984122997460907e-05, + "loss": 2.1148, + "step": 13339 + }, + { + "epoch": 2.5004686035613872, + "grad_norm": 55075.64453125, + "learning_rate": 2.497731959744303e-05, + "loss": 2.1664, + "step": 13340 + }, + { + "epoch": 2.500656044985942, + "grad_norm": 52720.03515625, + "learning_rate": 2.4970516815452904e-05, + "loss": 2.1238, + "step": 13341 + }, + { + "epoch": 2.5008434864104965, + "grad_norm": 52839.96875, + "learning_rate": 2.4963714651658558e-05, + "loss": 2.0161, + "step": 13342 + }, + { + "epoch": 2.5010309278350515, + "grad_norm": 51722.8046875, + "learning_rate": 2.4956913106228035e-05, + "loss": 2.1916, + "step": 13343 + }, + { + "epoch": 2.5012183692596066, + "grad_norm": 55854.5, + "learning_rate": 2.495011217932926e-05, + "loss": 2.1631, + "step": 13344 + }, + { + "epoch": 2.5014058106841612, + "grad_norm": 55731.51171875, + "learning_rate": 2.494331187113023e-05, + "loss": 2.1508, + "step": 13345 + }, + { + "epoch": 2.501593252108716, + "grad_norm": 58708.93359375, + "learning_rate": 2.4936512181798933e-05, + "loss": 2.1199, + "step": 13346 + }, + { + "epoch": 2.501780693533271, + "grad_norm": 55017.03515625, + "learning_rate": 2.4929713111503283e-05, + "loss": 2.1643, + "step": 13347 + }, + { + "epoch": 2.5019681349578256, + "grad_norm": 61640.234375, + "learning_rate": 2.4922914660411202e-05, + "loss": 2.1978, + "step": 13348 + }, + { + "epoch": 2.5021555763823806, + "grad_norm": 56416.5703125, + "learning_rate": 2.4916116828690612e-05, + "loss": 2.1379, + "step": 13349 + }, + { + "epoch": 2.5023430178069352, + "grad_norm": 56408.34765625, + "learning_rate": 2.490931961650946e-05, + "loss": 2.1161, + "step": 13350 + }, + { + "epoch": 2.5025304592314903, + "grad_norm": 52223.7265625, + "learning_rate": 2.4902523024035536e-05, + "loss": 2.1172, + "step": 13351 + }, + { + "epoch": 2.502717900656045, + "grad_norm": 54066.15625, + "learning_rate": 2.4895727051436757e-05, + "loss": 2.1969, + "step": 13352 + }, + { + "epoch": 2.5029053420805996, + "grad_norm": 54536.5234375, + "learning_rate": 2.488893169888099e-05, + "loss": 2.0846, + "step": 13353 + }, + { + "epoch": 2.5030927835051546, + "grad_norm": 54947.85546875, + "learning_rate": 2.4882136966536057e-05, + "loss": 2.0793, + "step": 13354 + }, + { + "epoch": 2.5032802249297097, + "grad_norm": 54967.57421875, + "learning_rate": 2.4875342854569755e-05, + "loss": 2.0764, + "step": 13355 + }, + { + "epoch": 2.5034676663542643, + "grad_norm": 56086.4296875, + "learning_rate": 2.4868549363149924e-05, + "loss": 2.1838, + "step": 13356 + }, + { + "epoch": 2.503655107778819, + "grad_norm": 51874.6953125, + "learning_rate": 2.4861756492444328e-05, + "loss": 2.0647, + "step": 13357 + }, + { + "epoch": 2.503842549203374, + "grad_norm": 51728.0703125, + "learning_rate": 2.485496424262077e-05, + "loss": 2.1333, + "step": 13358 + }, + { + "epoch": 2.5040299906279286, + "grad_norm": 54651.6796875, + "learning_rate": 2.4848172613846976e-05, + "loss": 2.13, + "step": 13359 + }, + { + "epoch": 2.5042174320524837, + "grad_norm": 53453.359375, + "learning_rate": 2.4841381606290724e-05, + "loss": 2.1995, + "step": 13360 + }, + { + "epoch": 2.5044048734770383, + "grad_norm": 50500.2265625, + "learning_rate": 2.483459122011973e-05, + "loss": 2.1159, + "step": 13361 + }, + { + "epoch": 2.5045923149015934, + "grad_norm": 57466.390625, + "learning_rate": 2.4827801455501687e-05, + "loss": 2.0938, + "step": 13362 + }, + { + "epoch": 2.504779756326148, + "grad_norm": 56069.0078125, + "learning_rate": 2.4821012312604337e-05, + "loss": 2.1523, + "step": 13363 + }, + { + "epoch": 2.5049671977507026, + "grad_norm": 56951.28515625, + "learning_rate": 2.4814223791595308e-05, + "loss": 2.1565, + "step": 13364 + }, + { + "epoch": 2.5051546391752577, + "grad_norm": 59615.42578125, + "learning_rate": 2.4807435892642322e-05, + "loss": 2.308, + "step": 13365 + }, + { + "epoch": 2.505342080599813, + "grad_norm": 56567.38671875, + "learning_rate": 2.4800648615912986e-05, + "loss": 2.2081, + "step": 13366 + }, + { + "epoch": 2.5055295220243674, + "grad_norm": 57141.35546875, + "learning_rate": 2.4793861961574976e-05, + "loss": 2.2668, + "step": 13367 + }, + { + "epoch": 2.505716963448922, + "grad_norm": 61602.08984375, + "learning_rate": 2.4787075929795883e-05, + "loss": 2.0654, + "step": 13368 + }, + { + "epoch": 2.505904404873477, + "grad_norm": 51078.73828125, + "learning_rate": 2.4780290520743337e-05, + "loss": 2.1216, + "step": 13369 + }, + { + "epoch": 2.5060918462980317, + "grad_norm": 52222.05078125, + "learning_rate": 2.4773505734584922e-05, + "loss": 2.0976, + "step": 13370 + }, + { + "epoch": 2.506279287722587, + "grad_norm": 55156.99609375, + "learning_rate": 2.4766721571488188e-05, + "loss": 2.1631, + "step": 13371 + }, + { + "epoch": 2.5064667291471414, + "grad_norm": 58923.6328125, + "learning_rate": 2.4759938031620716e-05, + "loss": 2.1622, + "step": 13372 + }, + { + "epoch": 2.5066541705716965, + "grad_norm": 50609.06640625, + "learning_rate": 2.4753155115150083e-05, + "loss": 2.0611, + "step": 13373 + }, + { + "epoch": 2.506841611996251, + "grad_norm": 61738.62109375, + "learning_rate": 2.4746372822243756e-05, + "loss": 2.1666, + "step": 13374 + }, + { + "epoch": 2.5070290534208057, + "grad_norm": 52148.82421875, + "learning_rate": 2.4739591153069274e-05, + "loss": 2.124, + "step": 13375 + }, + { + "epoch": 2.507216494845361, + "grad_norm": 60664.94140625, + "learning_rate": 2.4732810107794157e-05, + "loss": 2.2498, + "step": 13376 + }, + { + "epoch": 2.507403936269916, + "grad_norm": 58845.16015625, + "learning_rate": 2.472602968658588e-05, + "loss": 2.1609, + "step": 13377 + }, + { + "epoch": 2.5075913776944705, + "grad_norm": 51260.625, + "learning_rate": 2.4719249889611873e-05, + "loss": 2.1302, + "step": 13378 + }, + { + "epoch": 2.507778819119025, + "grad_norm": 52143.18359375, + "learning_rate": 2.471247071703962e-05, + "loss": 2.1757, + "step": 13379 + }, + { + "epoch": 2.50796626054358, + "grad_norm": 53020.328125, + "learning_rate": 2.4705692169036593e-05, + "loss": 2.1849, + "step": 13380 + }, + { + "epoch": 2.508153701968135, + "grad_norm": 59247.9140625, + "learning_rate": 2.469891424577014e-05, + "loss": 2.1419, + "step": 13381 + }, + { + "epoch": 2.50834114339269, + "grad_norm": 55904.53515625, + "learning_rate": 2.4692136947407695e-05, + "loss": 2.1063, + "step": 13382 + }, + { + "epoch": 2.5085285848172445, + "grad_norm": 53032.88671875, + "learning_rate": 2.4685360274116677e-05, + "loss": 2.1438, + "step": 13383 + }, + { + "epoch": 2.5087160262417996, + "grad_norm": 57068.546875, + "learning_rate": 2.4678584226064438e-05, + "loss": 2.0802, + "step": 13384 + }, + { + "epoch": 2.508903467666354, + "grad_norm": 54666.13671875, + "learning_rate": 2.467180880341832e-05, + "loss": 2.1167, + "step": 13385 + }, + { + "epoch": 2.509090909090909, + "grad_norm": 50370.9921875, + "learning_rate": 2.466503400634571e-05, + "loss": 2.143, + "step": 13386 + }, + { + "epoch": 2.509278350515464, + "grad_norm": 57840.421875, + "learning_rate": 2.465825983501389e-05, + "loss": 2.151, + "step": 13387 + }, + { + "epoch": 2.509465791940019, + "grad_norm": 53561.859375, + "learning_rate": 2.4651486289590215e-05, + "loss": 2.0307, + "step": 13388 + }, + { + "epoch": 2.5096532333645736, + "grad_norm": 55459.65234375, + "learning_rate": 2.4644713370241944e-05, + "loss": 2.1569, + "step": 13389 + }, + { + "epoch": 2.509840674789128, + "grad_norm": 55589.7578125, + "learning_rate": 2.46379410771364e-05, + "loss": 2.1241, + "step": 13390 + }, + { + "epoch": 2.5100281162136833, + "grad_norm": 55254.6015625, + "learning_rate": 2.463116941044083e-05, + "loss": 2.0054, + "step": 13391 + }, + { + "epoch": 2.510215557638238, + "grad_norm": 52885.2578125, + "learning_rate": 2.4624398370322464e-05, + "loss": 2.0859, + "step": 13392 + }, + { + "epoch": 2.510402999062793, + "grad_norm": 53076.69921875, + "learning_rate": 2.461762795694858e-05, + "loss": 2.0863, + "step": 13393 + }, + { + "epoch": 2.5105904404873476, + "grad_norm": 51589.52734375, + "learning_rate": 2.461085817048635e-05, + "loss": 2.1552, + "step": 13394 + }, + { + "epoch": 2.5107778819119027, + "grad_norm": 52756.1796875, + "learning_rate": 2.4604089011103034e-05, + "loss": 2.1151, + "step": 13395 + }, + { + "epoch": 2.5109653233364573, + "grad_norm": 58027.0703125, + "learning_rate": 2.4597320478965785e-05, + "loss": 2.1575, + "step": 13396 + }, + { + "epoch": 2.5111527647610123, + "grad_norm": 57354.30078125, + "learning_rate": 2.4590552574241772e-05, + "loss": 2.1425, + "step": 13397 + }, + { + "epoch": 2.511340206185567, + "grad_norm": 53698.89453125, + "learning_rate": 2.4583785297098167e-05, + "loss": 2.0857, + "step": 13398 + }, + { + "epoch": 2.511527647610122, + "grad_norm": 55879.0546875, + "learning_rate": 2.457701864770213e-05, + "loss": 2.0942, + "step": 13399 + }, + { + "epoch": 2.5117150890346767, + "grad_norm": 55591.046875, + "learning_rate": 2.4570252626220768e-05, + "loss": 2.1519, + "step": 13400 + }, + { + "epoch": 2.5119025304592313, + "grad_norm": 56635.64453125, + "learning_rate": 2.4563487232821175e-05, + "loss": 2.1285, + "step": 13401 + }, + { + "epoch": 2.5120899718837864, + "grad_norm": 52984.5078125, + "learning_rate": 2.4556722467670497e-05, + "loss": 2.1709, + "step": 13402 + }, + { + "epoch": 2.512277413308341, + "grad_norm": 54549.4609375, + "learning_rate": 2.4549958330935778e-05, + "loss": 2.1327, + "step": 13403 + }, + { + "epoch": 2.512464854732896, + "grad_norm": 60185.2421875, + "learning_rate": 2.4543194822784077e-05, + "loss": 2.1148, + "step": 13404 + }, + { + "epoch": 2.5126522961574507, + "grad_norm": 58007.0390625, + "learning_rate": 2.4536431943382453e-05, + "loss": 2.1096, + "step": 13405 + }, + { + "epoch": 2.5128397375820057, + "grad_norm": 52702.48046875, + "learning_rate": 2.452966969289797e-05, + "loss": 2.1488, + "step": 13406 + }, + { + "epoch": 2.5130271790065604, + "grad_norm": 52832.22265625, + "learning_rate": 2.4522908071497625e-05, + "loss": 2.0955, + "step": 13407 + }, + { + "epoch": 2.5132146204311154, + "grad_norm": 54937.08984375, + "learning_rate": 2.4516147079348396e-05, + "loss": 2.1351, + "step": 13408 + }, + { + "epoch": 2.51340206185567, + "grad_norm": 49770.9765625, + "learning_rate": 2.450938671661731e-05, + "loss": 2.1683, + "step": 13409 + }, + { + "epoch": 2.513589503280225, + "grad_norm": 57790.71875, + "learning_rate": 2.4502626983471333e-05, + "loss": 2.1262, + "step": 13410 + }, + { + "epoch": 2.5137769447047797, + "grad_norm": 53471.19140625, + "learning_rate": 2.4495867880077383e-05, + "loss": 2.1282, + "step": 13411 + }, + { + "epoch": 2.5139643861293344, + "grad_norm": 54672.203125, + "learning_rate": 2.4489109406602436e-05, + "loss": 2.1789, + "step": 13412 + }, + { + "epoch": 2.5141518275538894, + "grad_norm": 53988.30859375, + "learning_rate": 2.4482351563213424e-05, + "loss": 2.1295, + "step": 13413 + }, + { + "epoch": 2.5143392689784445, + "grad_norm": 51839.0625, + "learning_rate": 2.4475594350077254e-05, + "loss": 2.0757, + "step": 13414 + }, + { + "epoch": 2.514526710402999, + "grad_norm": 51351.6171875, + "learning_rate": 2.446883776736078e-05, + "loss": 2.0323, + "step": 13415 + }, + { + "epoch": 2.5147141518275538, + "grad_norm": 55942.72265625, + "learning_rate": 2.4462081815230935e-05, + "loss": 2.0804, + "step": 13416 + }, + { + "epoch": 2.514901593252109, + "grad_norm": 55169.23828125, + "learning_rate": 2.4455326493854564e-05, + "loss": 2.1035, + "step": 13417 + }, + { + "epoch": 2.5150890346766634, + "grad_norm": 49830.5625, + "learning_rate": 2.4448571803398484e-05, + "loss": 2.1388, + "step": 13418 + }, + { + "epoch": 2.5152764761012185, + "grad_norm": 57884.28125, + "learning_rate": 2.4441817744029553e-05, + "loss": 2.1995, + "step": 13419 + }, + { + "epoch": 2.515463917525773, + "grad_norm": 55763.0859375, + "learning_rate": 2.4435064315914606e-05, + "loss": 2.0879, + "step": 13420 + }, + { + "epoch": 2.515651358950328, + "grad_norm": 57579.078125, + "learning_rate": 2.442831151922043e-05, + "loss": 2.1108, + "step": 13421 + }, + { + "epoch": 2.515838800374883, + "grad_norm": 57523.76953125, + "learning_rate": 2.4421559354113783e-05, + "loss": 2.0167, + "step": 13422 + }, + { + "epoch": 2.5160262417994375, + "grad_norm": 60010.6953125, + "learning_rate": 2.4414807820761476e-05, + "loss": 2.1368, + "step": 13423 + }, + { + "epoch": 2.5162136832239925, + "grad_norm": 55554.6875, + "learning_rate": 2.4408056919330225e-05, + "loss": 2.1561, + "step": 13424 + }, + { + "epoch": 2.5164011246485476, + "grad_norm": 53096.80859375, + "learning_rate": 2.440130664998681e-05, + "loss": 2.1673, + "step": 13425 + }, + { + "epoch": 2.516588566073102, + "grad_norm": 63058.09765625, + "learning_rate": 2.4394557012897927e-05, + "loss": 2.1, + "step": 13426 + }, + { + "epoch": 2.516776007497657, + "grad_norm": 54856.7265625, + "learning_rate": 2.4387808008230266e-05, + "loss": 2.1866, + "step": 13427 + }, + { + "epoch": 2.516963448922212, + "grad_norm": 60686.37890625, + "learning_rate": 2.438105963615055e-05, + "loss": 2.2122, + "step": 13428 + }, + { + "epoch": 2.5171508903467665, + "grad_norm": 50893.63671875, + "learning_rate": 2.437431189682546e-05, + "loss": 2.1732, + "step": 13429 + }, + { + "epoch": 2.5173383317713216, + "grad_norm": 51139.55859375, + "learning_rate": 2.4367564790421638e-05, + "loss": 2.07, + "step": 13430 + }, + { + "epoch": 2.5175257731958762, + "grad_norm": 58233.90234375, + "learning_rate": 2.4360818317105715e-05, + "loss": 2.1515, + "step": 13431 + }, + { + "epoch": 2.5177132146204313, + "grad_norm": 57091.51953125, + "learning_rate": 2.4354072477044366e-05, + "loss": 2.1076, + "step": 13432 + }, + { + "epoch": 2.517900656044986, + "grad_norm": 53958.1953125, + "learning_rate": 2.4347327270404162e-05, + "loss": 2.1163, + "step": 13433 + }, + { + "epoch": 2.5180880974695405, + "grad_norm": 54203.61328125, + "learning_rate": 2.4340582697351705e-05, + "loss": 2.0884, + "step": 13434 + }, + { + "epoch": 2.5182755388940956, + "grad_norm": 59506.82421875, + "learning_rate": 2.433383875805359e-05, + "loss": 2.2124, + "step": 13435 + }, + { + "epoch": 2.5184629803186507, + "grad_norm": 54455.0546875, + "learning_rate": 2.4327095452676412e-05, + "loss": 2.0843, + "step": 13436 + }, + { + "epoch": 2.5186504217432053, + "grad_norm": 53027.8828125, + "learning_rate": 2.4320352781386658e-05, + "loss": 2.128, + "step": 13437 + }, + { + "epoch": 2.51883786316776, + "grad_norm": 50502.23046875, + "learning_rate": 2.4313610744350895e-05, + "loss": 2.1342, + "step": 13438 + }, + { + "epoch": 2.519025304592315, + "grad_norm": 58042.99609375, + "learning_rate": 2.430686934173566e-05, + "loss": 2.1276, + "step": 13439 + }, + { + "epoch": 2.5192127460168696, + "grad_norm": 57044.14453125, + "learning_rate": 2.430012857370744e-05, + "loss": 2.1082, + "step": 13440 + }, + { + "epoch": 2.5194001874414247, + "grad_norm": 55207.609375, + "learning_rate": 2.4293388440432708e-05, + "loss": 2.1098, + "step": 13441 + }, + { + "epoch": 2.5195876288659793, + "grad_norm": 51092.60546875, + "learning_rate": 2.4286648942077955e-05, + "loss": 2.1533, + "step": 13442 + }, + { + "epoch": 2.5197750702905344, + "grad_norm": 59030.46484375, + "learning_rate": 2.427991007880967e-05, + "loss": 2.1401, + "step": 13443 + }, + { + "epoch": 2.519962511715089, + "grad_norm": 52608.3515625, + "learning_rate": 2.4273171850794218e-05, + "loss": 2.1419, + "step": 13444 + }, + { + "epoch": 2.5201499531396436, + "grad_norm": 53065.63671875, + "learning_rate": 2.4266434258198067e-05, + "loss": 2.0851, + "step": 13445 + }, + { + "epoch": 2.5203373945641987, + "grad_norm": 50858.12890625, + "learning_rate": 2.425969730118765e-05, + "loss": 2.0572, + "step": 13446 + }, + { + "epoch": 2.5205248359887538, + "grad_norm": 53567.1953125, + "learning_rate": 2.4252960979929334e-05, + "loss": 2.1477, + "step": 13447 + }, + { + "epoch": 2.5207122774133084, + "grad_norm": 53421.859375, + "learning_rate": 2.424622529458948e-05, + "loss": 2.0891, + "step": 13448 + }, + { + "epoch": 2.520899718837863, + "grad_norm": 55151.6484375, + "learning_rate": 2.423949024533449e-05, + "loss": 2.115, + "step": 13449 + }, + { + "epoch": 2.521087160262418, + "grad_norm": 57296.91015625, + "learning_rate": 2.4232755832330678e-05, + "loss": 2.1215, + "step": 13450 + }, + { + "epoch": 2.5212746016869727, + "grad_norm": 52382.9453125, + "learning_rate": 2.4226022055744402e-05, + "loss": 2.1222, + "step": 13451 + }, + { + "epoch": 2.5214620431115278, + "grad_norm": 55287.546875, + "learning_rate": 2.421928891574195e-05, + "loss": 2.0743, + "step": 13452 + }, + { + "epoch": 2.5216494845360824, + "grad_norm": 54963.046875, + "learning_rate": 2.4212556412489652e-05, + "loss": 2.1321, + "step": 13453 + }, + { + "epoch": 2.5218369259606375, + "grad_norm": 53775.5234375, + "learning_rate": 2.420582454615376e-05, + "loss": 2.1658, + "step": 13454 + }, + { + "epoch": 2.522024367385192, + "grad_norm": 53231.02734375, + "learning_rate": 2.419909331690058e-05, + "loss": 2.1579, + "step": 13455 + }, + { + "epoch": 2.5222118088097467, + "grad_norm": 55041.7421875, + "learning_rate": 2.4192362724896338e-05, + "loss": 2.1586, + "step": 13456 + }, + { + "epoch": 2.522399250234302, + "grad_norm": 56751.03515625, + "learning_rate": 2.4185632770307258e-05, + "loss": 2.1638, + "step": 13457 + }, + { + "epoch": 2.522586691658857, + "grad_norm": 53179.37109375, + "learning_rate": 2.4178903453299607e-05, + "loss": 2.1211, + "step": 13458 + }, + { + "epoch": 2.5227741330834115, + "grad_norm": 58713.57421875, + "learning_rate": 2.4172174774039535e-05, + "loss": 2.3332, + "step": 13459 + }, + { + "epoch": 2.522961574507966, + "grad_norm": 51834.48828125, + "learning_rate": 2.416544673269328e-05, + "loss": 2.0861, + "step": 13460 + }, + { + "epoch": 2.523149015932521, + "grad_norm": 50766.30859375, + "learning_rate": 2.415871932942698e-05, + "loss": 2.1565, + "step": 13461 + }, + { + "epoch": 2.523336457357076, + "grad_norm": 52618.01171875, + "learning_rate": 2.4151992564406828e-05, + "loss": 2.0915, + "step": 13462 + }, + { + "epoch": 2.523523898781631, + "grad_norm": 55174.171875, + "learning_rate": 2.4145266437798948e-05, + "loss": 2.1096, + "step": 13463 + }, + { + "epoch": 2.5237113402061855, + "grad_norm": 49752.0625, + "learning_rate": 2.4138540949769443e-05, + "loss": 2.1895, + "step": 13464 + }, + { + "epoch": 2.5238987816307406, + "grad_norm": 50207.1953125, + "learning_rate": 2.413181610048445e-05, + "loss": 2.1817, + "step": 13465 + }, + { + "epoch": 2.524086223055295, + "grad_norm": 53609.34375, + "learning_rate": 2.4125091890110098e-05, + "loss": 2.0555, + "step": 13466 + }, + { + "epoch": 2.52427366447985, + "grad_norm": 55870.56640625, + "learning_rate": 2.4118368318812388e-05, + "loss": 2.1093, + "step": 13467 + }, + { + "epoch": 2.524461105904405, + "grad_norm": 51669.41796875, + "learning_rate": 2.411164538675743e-05, + "loss": 2.1502, + "step": 13468 + }, + { + "epoch": 2.52464854732896, + "grad_norm": 55314.47265625, + "learning_rate": 2.4104923094111283e-05, + "loss": 2.1693, + "step": 13469 + }, + { + "epoch": 2.5248359887535146, + "grad_norm": 63397.23828125, + "learning_rate": 2.4098201441039957e-05, + "loss": 2.0818, + "step": 13470 + }, + { + "epoch": 2.525023430178069, + "grad_norm": 56346.0078125, + "learning_rate": 2.4091480427709463e-05, + "loss": 2.1439, + "step": 13471 + }, + { + "epoch": 2.5252108716026243, + "grad_norm": 49586.5390625, + "learning_rate": 2.4084760054285804e-05, + "loss": 2.157, + "step": 13472 + }, + { + "epoch": 2.525398313027179, + "grad_norm": 54898.63671875, + "learning_rate": 2.4078040320935012e-05, + "loss": 2.1763, + "step": 13473 + }, + { + "epoch": 2.525585754451734, + "grad_norm": 51834.203125, + "learning_rate": 2.407132122782298e-05, + "loss": 2.1964, + "step": 13474 + }, + { + "epoch": 2.5257731958762886, + "grad_norm": 55784.88671875, + "learning_rate": 2.4064602775115695e-05, + "loss": 2.0873, + "step": 13475 + }, + { + "epoch": 2.5259606373008436, + "grad_norm": 54143.22265625, + "learning_rate": 2.4057884962979115e-05, + "loss": 2.0667, + "step": 13476 + }, + { + "epoch": 2.5261480787253983, + "grad_norm": 54832.9140625, + "learning_rate": 2.4051167791579144e-05, + "loss": 2.1754, + "step": 13477 + }, + { + "epoch": 2.526335520149953, + "grad_norm": 54854.890625, + "learning_rate": 2.4044451261081662e-05, + "loss": 2.1591, + "step": 13478 + }, + { + "epoch": 2.526522961574508, + "grad_norm": 50701.88671875, + "learning_rate": 2.4037735371652603e-05, + "loss": 2.1737, + "step": 13479 + }, + { + "epoch": 2.526710402999063, + "grad_norm": 54752.90234375, + "learning_rate": 2.4031020123457802e-05, + "loss": 2.1419, + "step": 13480 + }, + { + "epoch": 2.5268978444236176, + "grad_norm": 51306.48828125, + "learning_rate": 2.4024305516663152e-05, + "loss": 2.1147, + "step": 13481 + }, + { + "epoch": 2.5270852858481723, + "grad_norm": 51926.6640625, + "learning_rate": 2.4017591551434458e-05, + "loss": 2.1195, + "step": 13482 + }, + { + "epoch": 2.5272727272727273, + "grad_norm": 53570.98046875, + "learning_rate": 2.401087822793759e-05, + "loss": 2.1469, + "step": 13483 + }, + { + "epoch": 2.527460168697282, + "grad_norm": 57092.27734375, + "learning_rate": 2.4004165546338315e-05, + "loss": 2.1741, + "step": 13484 + }, + { + "epoch": 2.527647610121837, + "grad_norm": 61368.9140625, + "learning_rate": 2.399745350680246e-05, + "loss": 2.1054, + "step": 13485 + }, + { + "epoch": 2.5278350515463917, + "grad_norm": 49689.9453125, + "learning_rate": 2.3990742109495795e-05, + "loss": 2.16, + "step": 13486 + }, + { + "epoch": 2.5280224929709467, + "grad_norm": 50236.00390625, + "learning_rate": 2.398403135458406e-05, + "loss": 2.1186, + "step": 13487 + }, + { + "epoch": 2.5282099343955013, + "grad_norm": 53777.1484375, + "learning_rate": 2.397732124223303e-05, + "loss": 2.1482, + "step": 13488 + }, + { + "epoch": 2.528397375820056, + "grad_norm": 51333.49609375, + "learning_rate": 2.3970611772608435e-05, + "loss": 2.1651, + "step": 13489 + }, + { + "epoch": 2.528584817244611, + "grad_norm": 49978.484375, + "learning_rate": 2.3963902945875964e-05, + "loss": 2.1588, + "step": 13490 + }, + { + "epoch": 2.528772258669166, + "grad_norm": 54137.9296875, + "learning_rate": 2.395719476220133e-05, + "loss": 2.1328, + "step": 13491 + }, + { + "epoch": 2.5289597000937207, + "grad_norm": 55567.5078125, + "learning_rate": 2.395048722175024e-05, + "loss": 2.0513, + "step": 13492 + }, + { + "epoch": 2.5291471415182754, + "grad_norm": 54591.703125, + "learning_rate": 2.3943780324688343e-05, + "loss": 2.1499, + "step": 13493 + }, + { + "epoch": 2.5293345829428304, + "grad_norm": 52819.203125, + "learning_rate": 2.393707407118127e-05, + "loss": 2.1511, + "step": 13494 + }, + { + "epoch": 2.529522024367385, + "grad_norm": 61702.265625, + "learning_rate": 2.3930368461394675e-05, + "loss": 2.1345, + "step": 13495 + }, + { + "epoch": 2.52970946579194, + "grad_norm": 51127.83984375, + "learning_rate": 2.392366349549422e-05, + "loss": 2.1976, + "step": 13496 + }, + { + "epoch": 2.5298969072164947, + "grad_norm": 54592.609375, + "learning_rate": 2.391695917364543e-05, + "loss": 2.1767, + "step": 13497 + }, + { + "epoch": 2.53008434864105, + "grad_norm": 57560.16015625, + "learning_rate": 2.391025549601393e-05, + "loss": 2.1139, + "step": 13498 + }, + { + "epoch": 2.5302717900656044, + "grad_norm": 57661.80859375, + "learning_rate": 2.3903552462765312e-05, + "loss": 2.1428, + "step": 13499 + }, + { + "epoch": 2.530459231490159, + "grad_norm": 53007.8984375, + "learning_rate": 2.3896850074065115e-05, + "loss": 2.2695, + "step": 13500 + }, + { + "epoch": 2.530459231490159, + "eval_loss": 2.2662506103515625, + "eval_runtime": 128.7577, + "eval_samples_per_second": 39.213, + "eval_steps_per_second": 1.965, + "step": 13500 + }, + { + "epoch": 2.530646672914714, + "grad_norm": 53399.140625, + "learning_rate": 2.3890148330078854e-05, + "loss": 2.1245, + "step": 13501 + }, + { + "epoch": 2.530834114339269, + "grad_norm": 52842.53125, + "learning_rate": 2.38834472309721e-05, + "loss": 2.0943, + "step": 13502 + }, + { + "epoch": 2.531021555763824, + "grad_norm": 51310.9296875, + "learning_rate": 2.3876746776910343e-05, + "loss": 2.1688, + "step": 13503 + }, + { + "epoch": 2.5312089971883784, + "grad_norm": 55285.8984375, + "learning_rate": 2.3870046968059047e-05, + "loss": 2.1268, + "step": 13504 + }, + { + "epoch": 2.5313964386129335, + "grad_norm": 55445.16015625, + "learning_rate": 2.3863347804583712e-05, + "loss": 2.1346, + "step": 13505 + }, + { + "epoch": 2.531583880037488, + "grad_norm": 49610.89453125, + "learning_rate": 2.3856649286649822e-05, + "loss": 2.1333, + "step": 13506 + }, + { + "epoch": 2.531771321462043, + "grad_norm": 55884.55859375, + "learning_rate": 2.38499514144228e-05, + "loss": 2.1393, + "step": 13507 + }, + { + "epoch": 2.531958762886598, + "grad_norm": 57717.578125, + "learning_rate": 2.3843254188068053e-05, + "loss": 2.0406, + "step": 13508 + }, + { + "epoch": 2.532146204311153, + "grad_norm": 57725.12109375, + "learning_rate": 2.3836557607751033e-05, + "loss": 2.1182, + "step": 13509 + }, + { + "epoch": 2.5323336457357075, + "grad_norm": 54805.9453125, + "learning_rate": 2.38298616736371e-05, + "loss": 2.2044, + "step": 13510 + }, + { + "epoch": 2.532521087160262, + "grad_norm": 50966.2578125, + "learning_rate": 2.3823166385891676e-05, + "loss": 2.1324, + "step": 13511 + }, + { + "epoch": 2.532708528584817, + "grad_norm": 57190.53125, + "learning_rate": 2.381647174468008e-05, + "loss": 2.0387, + "step": 13512 + }, + { + "epoch": 2.5328959700093723, + "grad_norm": 56359.078125, + "learning_rate": 2.3809777750167707e-05, + "loss": 2.1082, + "step": 13513 + }, + { + "epoch": 2.533083411433927, + "grad_norm": 58294.70703125, + "learning_rate": 2.3803084402519864e-05, + "loss": 2.1595, + "step": 13514 + }, + { + "epoch": 2.5332708528584815, + "grad_norm": 57460.890625, + "learning_rate": 2.3796391701901855e-05, + "loss": 2.1326, + "step": 13515 + }, + { + "epoch": 2.5334582942830366, + "grad_norm": 53781.44921875, + "learning_rate": 2.3789699648479013e-05, + "loss": 2.1204, + "step": 13516 + }, + { + "epoch": 2.533645735707591, + "grad_norm": 57196.203125, + "learning_rate": 2.3783008242416592e-05, + "loss": 2.2015, + "step": 13517 + }, + { + "epoch": 2.5338331771321463, + "grad_norm": 52802.21875, + "learning_rate": 2.3776317483879895e-05, + "loss": 2.147, + "step": 13518 + }, + { + "epoch": 2.534020618556701, + "grad_norm": 53327.63671875, + "learning_rate": 2.3769627373034158e-05, + "loss": 2.1641, + "step": 13519 + }, + { + "epoch": 2.534208059981256, + "grad_norm": 51213.5234375, + "learning_rate": 2.3762937910044596e-05, + "loss": 2.1352, + "step": 13520 + }, + { + "epoch": 2.5343955014058106, + "grad_norm": 53519.0234375, + "learning_rate": 2.375624909507645e-05, + "loss": 2.1005, + "step": 13521 + }, + { + "epoch": 2.5345829428303657, + "grad_norm": 52892.69140625, + "learning_rate": 2.3749560928294945e-05, + "loss": 2.1521, + "step": 13522 + }, + { + "epoch": 2.5347703842549203, + "grad_norm": 51846.48046875, + "learning_rate": 2.3742873409865252e-05, + "loss": 2.1433, + "step": 13523 + }, + { + "epoch": 2.5349578256794754, + "grad_norm": 53714.609375, + "learning_rate": 2.373618653995252e-05, + "loss": 2.1571, + "step": 13524 + }, + { + "epoch": 2.53514526710403, + "grad_norm": 53021.89453125, + "learning_rate": 2.3729500318721954e-05, + "loss": 2.149, + "step": 13525 + }, + { + "epoch": 2.5353327085285846, + "grad_norm": 54378.40234375, + "learning_rate": 2.3722814746338668e-05, + "loss": 2.2031, + "step": 13526 + }, + { + "epoch": 2.5355201499531397, + "grad_norm": 61742.93359375, + "learning_rate": 2.371612982296777e-05, + "loss": 2.1276, + "step": 13527 + }, + { + "epoch": 2.5357075913776947, + "grad_norm": 54685.94921875, + "learning_rate": 2.3709445548774385e-05, + "loss": 2.0691, + "step": 13528 + }, + { + "epoch": 2.5358950328022494, + "grad_norm": 49851.921875, + "learning_rate": 2.370276192392365e-05, + "loss": 2.0695, + "step": 13529 + }, + { + "epoch": 2.536082474226804, + "grad_norm": 50600.72265625, + "learning_rate": 2.3696078948580558e-05, + "loss": 2.1539, + "step": 13530 + }, + { + "epoch": 2.536269915651359, + "grad_norm": 54516.26953125, + "learning_rate": 2.368939662291022e-05, + "loss": 2.0911, + "step": 13531 + }, + { + "epoch": 2.5364573570759137, + "grad_norm": 51476.37109375, + "learning_rate": 2.368271494707769e-05, + "loss": 2.1771, + "step": 13532 + }, + { + "epoch": 2.5366447985004688, + "grad_norm": 54755.6484375, + "learning_rate": 2.3676033921247977e-05, + "loss": 2.1607, + "step": 13533 + }, + { + "epoch": 2.5368322399250234, + "grad_norm": 52502.2109375, + "learning_rate": 2.3669353545586077e-05, + "loss": 2.1368, + "step": 13534 + }, + { + "epoch": 2.5370196813495784, + "grad_norm": 50912.1796875, + "learning_rate": 2.366267382025701e-05, + "loss": 2.0844, + "step": 13535 + }, + { + "epoch": 2.537207122774133, + "grad_norm": 54274.0546875, + "learning_rate": 2.3655994745425764e-05, + "loss": 2.1248, + "step": 13536 + }, + { + "epoch": 2.5373945641986877, + "grad_norm": 55449.921875, + "learning_rate": 2.3649316321257297e-05, + "loss": 2.1474, + "step": 13537 + }, + { + "epoch": 2.5375820056232428, + "grad_norm": 55027.58984375, + "learning_rate": 2.3642638547916533e-05, + "loss": 2.1364, + "step": 13538 + }, + { + "epoch": 2.537769447047798, + "grad_norm": 53821.74609375, + "learning_rate": 2.3635961425568447e-05, + "loss": 2.1526, + "step": 13539 + }, + { + "epoch": 2.5379568884723525, + "grad_norm": 52260.22265625, + "learning_rate": 2.3629284954377935e-05, + "loss": 2.1904, + "step": 13540 + }, + { + "epoch": 2.538144329896907, + "grad_norm": 49623.0234375, + "learning_rate": 2.3622609134509866e-05, + "loss": 2.0886, + "step": 13541 + }, + { + "epoch": 2.538331771321462, + "grad_norm": 56860.83203125, + "learning_rate": 2.3615933966129177e-05, + "loss": 2.1804, + "step": 13542 + }, + { + "epoch": 2.5385192127460168, + "grad_norm": 53968.33984375, + "learning_rate": 2.3609259449400688e-05, + "loss": 2.1623, + "step": 13543 + }, + { + "epoch": 2.538706654170572, + "grad_norm": 53427.08984375, + "learning_rate": 2.36025855844893e-05, + "loss": 2.1048, + "step": 13544 + }, + { + "epoch": 2.5388940955951265, + "grad_norm": 55121.9765625, + "learning_rate": 2.3595912371559803e-05, + "loss": 2.0862, + "step": 13545 + }, + { + "epoch": 2.5390815370196815, + "grad_norm": 54347.5625, + "learning_rate": 2.358923981077705e-05, + "loss": 2.1113, + "step": 13546 + }, + { + "epoch": 2.539268978444236, + "grad_norm": 56019.0, + "learning_rate": 2.3582567902305818e-05, + "loss": 2.1329, + "step": 13547 + }, + { + "epoch": 2.539456419868791, + "grad_norm": 53818.5703125, + "learning_rate": 2.357589664631093e-05, + "loss": 2.1293, + "step": 13548 + }, + { + "epoch": 2.539643861293346, + "grad_norm": 53862.48046875, + "learning_rate": 2.3569226042957126e-05, + "loss": 2.0981, + "step": 13549 + }, + { + "epoch": 2.539831302717901, + "grad_norm": 50911.6171875, + "learning_rate": 2.356255609240916e-05, + "loss": 2.087, + "step": 13550 + }, + { + "epoch": 2.5400187441424555, + "grad_norm": 51484.46875, + "learning_rate": 2.3555886794831778e-05, + "loss": 2.0898, + "step": 13551 + }, + { + "epoch": 2.54020618556701, + "grad_norm": 51782.23046875, + "learning_rate": 2.3549218150389728e-05, + "loss": 2.1631, + "step": 13552 + }, + { + "epoch": 2.5403936269915652, + "grad_norm": 54894.69140625, + "learning_rate": 2.3542550159247693e-05, + "loss": 2.0929, + "step": 13553 + }, + { + "epoch": 2.54058106841612, + "grad_norm": 52101.71484375, + "learning_rate": 2.3535882821570348e-05, + "loss": 2.086, + "step": 13554 + }, + { + "epoch": 2.540768509840675, + "grad_norm": 54116.921875, + "learning_rate": 2.3529216137522408e-05, + "loss": 2.1516, + "step": 13555 + }, + { + "epoch": 2.5409559512652296, + "grad_norm": 53129.453125, + "learning_rate": 2.352255010726851e-05, + "loss": 2.1369, + "step": 13556 + }, + { + "epoch": 2.5411433926897846, + "grad_norm": 55526.62109375, + "learning_rate": 2.3515884730973275e-05, + "loss": 2.0908, + "step": 13557 + }, + { + "epoch": 2.5413308341143392, + "grad_norm": 53921.17578125, + "learning_rate": 2.3509220008801346e-05, + "loss": 2.1725, + "step": 13558 + }, + { + "epoch": 2.541518275538894, + "grad_norm": 51329.953125, + "learning_rate": 2.3502555940917382e-05, + "loss": 2.0953, + "step": 13559 + }, + { + "epoch": 2.541705716963449, + "grad_norm": 52401.7890625, + "learning_rate": 2.349589252748589e-05, + "loss": 2.0918, + "step": 13560 + }, + { + "epoch": 2.541893158388004, + "grad_norm": 55267.89453125, + "learning_rate": 2.3489229768671488e-05, + "loss": 2.148, + "step": 13561 + }, + { + "epoch": 2.5420805998125586, + "grad_norm": 53867.22265625, + "learning_rate": 2.3482567664638762e-05, + "loss": 2.1328, + "step": 13562 + }, + { + "epoch": 2.5422680412371133, + "grad_norm": 51742.6640625, + "learning_rate": 2.347590621555223e-05, + "loss": 2.151, + "step": 13563 + }, + { + "epoch": 2.5424554826616683, + "grad_norm": 55087.44921875, + "learning_rate": 2.3469245421576408e-05, + "loss": 2.1703, + "step": 13564 + }, + { + "epoch": 2.542642924086223, + "grad_norm": 53790.51171875, + "learning_rate": 2.3462585282875825e-05, + "loss": 2.15, + "step": 13565 + }, + { + "epoch": 2.542830365510778, + "grad_norm": 57278.99609375, + "learning_rate": 2.3455925799615e-05, + "loss": 2.0736, + "step": 13566 + }, + { + "epoch": 2.5430178069353326, + "grad_norm": 49957.09765625, + "learning_rate": 2.3449266971958394e-05, + "loss": 2.1421, + "step": 13567 + }, + { + "epoch": 2.5432052483598877, + "grad_norm": 50330.4140625, + "learning_rate": 2.3442608800070452e-05, + "loss": 2.2105, + "step": 13568 + }, + { + "epoch": 2.5433926897844423, + "grad_norm": 55261.48046875, + "learning_rate": 2.343595128411566e-05, + "loss": 2.0561, + "step": 13569 + }, + { + "epoch": 2.543580131208997, + "grad_norm": 56095.01953125, + "learning_rate": 2.3429294424258423e-05, + "loss": 2.1572, + "step": 13570 + }, + { + "epoch": 2.543767572633552, + "grad_norm": 50314.15625, + "learning_rate": 2.3422638220663157e-05, + "loss": 2.1835, + "step": 13571 + }, + { + "epoch": 2.543955014058107, + "grad_norm": 52838.65234375, + "learning_rate": 2.341598267349428e-05, + "loss": 2.1483, + "step": 13572 + }, + { + "epoch": 2.5441424554826617, + "grad_norm": 53637.79296875, + "learning_rate": 2.3409327782916145e-05, + "loss": 2.1536, + "step": 13573 + }, + { + "epoch": 2.5443298969072163, + "grad_norm": 47336.87109375, + "learning_rate": 2.3402673549093162e-05, + "loss": 2.1212, + "step": 13574 + }, + { + "epoch": 2.5445173383317714, + "grad_norm": 50456.26953125, + "learning_rate": 2.3396019972189637e-05, + "loss": 2.159, + "step": 13575 + }, + { + "epoch": 2.544704779756326, + "grad_norm": 59714.0390625, + "learning_rate": 2.3389367052369943e-05, + "loss": 2.157, + "step": 13576 + }, + { + "epoch": 2.544892221180881, + "grad_norm": 48630.84765625, + "learning_rate": 2.3382714789798366e-05, + "loss": 2.0932, + "step": 13577 + }, + { + "epoch": 2.5450796626054357, + "grad_norm": 53466.66796875, + "learning_rate": 2.337606318463924e-05, + "loss": 2.1943, + "step": 13578 + }, + { + "epoch": 2.545267104029991, + "grad_norm": 55411.0390625, + "learning_rate": 2.3369412237056837e-05, + "loss": 2.1617, + "step": 13579 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 63130.5625, + "learning_rate": 2.3362761947215406e-05, + "loss": 2.1372, + "step": 13580 + }, + { + "epoch": 2.5456419868791, + "grad_norm": 54691.1953125, + "learning_rate": 2.3356112315279217e-05, + "loss": 2.1113, + "step": 13581 + }, + { + "epoch": 2.545829428303655, + "grad_norm": 53967.33203125, + "learning_rate": 2.3349463341412546e-05, + "loss": 2.1738, + "step": 13582 + }, + { + "epoch": 2.54601686972821, + "grad_norm": 51514.2109375, + "learning_rate": 2.334281502577954e-05, + "loss": 2.1251, + "step": 13583 + }, + { + "epoch": 2.546204311152765, + "grad_norm": 56410.7578125, + "learning_rate": 2.3336167368544447e-05, + "loss": 2.0805, + "step": 13584 + }, + { + "epoch": 2.5463917525773194, + "grad_norm": 56073.82421875, + "learning_rate": 2.3329520369871467e-05, + "loss": 2.1458, + "step": 13585 + }, + { + "epoch": 2.5465791940018745, + "grad_norm": 47915.9140625, + "learning_rate": 2.332287402992475e-05, + "loss": 2.1099, + "step": 13586 + }, + { + "epoch": 2.546766635426429, + "grad_norm": 53565.11328125, + "learning_rate": 2.331622834886844e-05, + "loss": 2.0675, + "step": 13587 + }, + { + "epoch": 2.546954076850984, + "grad_norm": 58879.03125, + "learning_rate": 2.3309583326866695e-05, + "loss": 2.0481, + "step": 13588 + }, + { + "epoch": 2.547141518275539, + "grad_norm": 54781.1875, + "learning_rate": 2.3302938964083675e-05, + "loss": 2.1732, + "step": 13589 + }, + { + "epoch": 2.547328959700094, + "grad_norm": 55830.9296875, + "learning_rate": 2.3296295260683414e-05, + "loss": 2.0833, + "step": 13590 + }, + { + "epoch": 2.5475164011246485, + "grad_norm": 52562.1875, + "learning_rate": 2.3289652216830037e-05, + "loss": 2.1512, + "step": 13591 + }, + { + "epoch": 2.547703842549203, + "grad_norm": 53599.87890625, + "learning_rate": 2.3283009832687636e-05, + "loss": 2.1328, + "step": 13592 + }, + { + "epoch": 2.547891283973758, + "grad_norm": 53950.41015625, + "learning_rate": 2.3276368108420253e-05, + "loss": 2.194, + "step": 13593 + }, + { + "epoch": 2.5480787253983133, + "grad_norm": 53007.390625, + "learning_rate": 2.3269727044191914e-05, + "loss": 2.157, + "step": 13594 + }, + { + "epoch": 2.548266166822868, + "grad_norm": 51995.59375, + "learning_rate": 2.326308664016668e-05, + "loss": 2.0479, + "step": 13595 + }, + { + "epoch": 2.5484536082474225, + "grad_norm": 54630.9140625, + "learning_rate": 2.3256446896508537e-05, + "loss": 2.0938, + "step": 13596 + }, + { + "epoch": 2.5486410496719776, + "grad_norm": 55738.6484375, + "learning_rate": 2.3249807813381468e-05, + "loss": 2.1661, + "step": 13597 + }, + { + "epoch": 2.548828491096532, + "grad_norm": 56580.59375, + "learning_rate": 2.324316939094946e-05, + "loss": 2.1764, + "step": 13598 + }, + { + "epoch": 2.5490159325210873, + "grad_norm": 57917.578125, + "learning_rate": 2.3236531629376496e-05, + "loss": 2.1002, + "step": 13599 + }, + { + "epoch": 2.549203373945642, + "grad_norm": 52772.328125, + "learning_rate": 2.3229894528826507e-05, + "loss": 2.0632, + "step": 13600 + }, + { + "epoch": 2.549390815370197, + "grad_norm": 50322.36328125, + "learning_rate": 2.322325808946339e-05, + "loss": 2.1336, + "step": 13601 + }, + { + "epoch": 2.5495782567947516, + "grad_norm": 55470.265625, + "learning_rate": 2.3216622311451102e-05, + "loss": 2.1486, + "step": 13602 + }, + { + "epoch": 2.549765698219306, + "grad_norm": 57518.5234375, + "learning_rate": 2.3209987194953504e-05, + "loss": 2.1623, + "step": 13603 + }, + { + "epoch": 2.5499531396438613, + "grad_norm": 57642.69921875, + "learning_rate": 2.32033527401345e-05, + "loss": 2.1218, + "step": 13604 + }, + { + "epoch": 2.5501405810684163, + "grad_norm": 58378.84765625, + "learning_rate": 2.3196718947157924e-05, + "loss": 2.1486, + "step": 13605 + }, + { + "epoch": 2.550328022492971, + "grad_norm": 51319.0859375, + "learning_rate": 2.3190085816187662e-05, + "loss": 2.0857, + "step": 13606 + }, + { + "epoch": 2.5505154639175256, + "grad_norm": 49510.2421875, + "learning_rate": 2.3183453347387498e-05, + "loss": 2.1587, + "step": 13607 + }, + { + "epoch": 2.5507029053420807, + "grad_norm": 54490.39453125, + "learning_rate": 2.3176821540921283e-05, + "loss": 2.1327, + "step": 13608 + }, + { + "epoch": 2.5508903467666353, + "grad_norm": 52391.96875, + "learning_rate": 2.3170190396952805e-05, + "loss": 2.2135, + "step": 13609 + }, + { + "epoch": 2.5510777881911904, + "grad_norm": 50003.30078125, + "learning_rate": 2.3163559915645822e-05, + "loss": 2.1547, + "step": 13610 + }, + { + "epoch": 2.551265229615745, + "grad_norm": 57979.16015625, + "learning_rate": 2.3156930097164133e-05, + "loss": 2.111, + "step": 13611 + }, + { + "epoch": 2.5514526710403, + "grad_norm": 53110.97265625, + "learning_rate": 2.3150300941671465e-05, + "loss": 2.1234, + "step": 13612 + }, + { + "epoch": 2.5516401124648547, + "grad_norm": 54467.2890625, + "learning_rate": 2.3143672449331534e-05, + "loss": 2.0963, + "step": 13613 + }, + { + "epoch": 2.5518275538894093, + "grad_norm": 55186.53125, + "learning_rate": 2.3137044620308075e-05, + "loss": 2.1539, + "step": 13614 + }, + { + "epoch": 2.5520149953139644, + "grad_norm": 57637.6484375, + "learning_rate": 2.31304174547648e-05, + "loss": 2.1671, + "step": 13615 + }, + { + "epoch": 2.5522024367385194, + "grad_norm": 55395.76953125, + "learning_rate": 2.3123790952865387e-05, + "loss": 2.1282, + "step": 13616 + }, + { + "epoch": 2.552389878163074, + "grad_norm": 55270.42578125, + "learning_rate": 2.3117165114773466e-05, + "loss": 2.1542, + "step": 13617 + }, + { + "epoch": 2.5525773195876287, + "grad_norm": 53148.3359375, + "learning_rate": 2.3110539940652733e-05, + "loss": 2.1693, + "step": 13618 + }, + { + "epoch": 2.5527647610121837, + "grad_norm": 53482.25, + "learning_rate": 2.31039154306668e-05, + "loss": 2.1439, + "step": 13619 + }, + { + "epoch": 2.5529522024367384, + "grad_norm": 55735.75, + "learning_rate": 2.3097291584979265e-05, + "loss": 2.1499, + "step": 13620 + }, + { + "epoch": 2.5531396438612934, + "grad_norm": 55435.75390625, + "learning_rate": 2.3090668403753746e-05, + "loss": 2.1699, + "step": 13621 + }, + { + "epoch": 2.553327085285848, + "grad_norm": 53645.45703125, + "learning_rate": 2.3084045887153853e-05, + "loss": 2.1367, + "step": 13622 + }, + { + "epoch": 2.553514526710403, + "grad_norm": 54388.00390625, + "learning_rate": 2.3077424035343127e-05, + "loss": 2.1382, + "step": 13623 + }, + { + "epoch": 2.5537019681349578, + "grad_norm": 49591.90625, + "learning_rate": 2.3070802848485102e-05, + "loss": 2.1163, + "step": 13624 + }, + { + "epoch": 2.5538894095595124, + "grad_norm": 56861.86328125, + "learning_rate": 2.3064182326743355e-05, + "loss": 2.0777, + "step": 13625 + }, + { + "epoch": 2.5540768509840674, + "grad_norm": 55282.78125, + "learning_rate": 2.305756247028138e-05, + "loss": 2.2298, + "step": 13626 + }, + { + "epoch": 2.5542642924086225, + "grad_norm": 53607.171875, + "learning_rate": 2.305094327926266e-05, + "loss": 2.1493, + "step": 13627 + }, + { + "epoch": 2.554451733833177, + "grad_norm": 51903.57421875, + "learning_rate": 2.3044324753850698e-05, + "loss": 2.1501, + "step": 13628 + }, + { + "epoch": 2.5546391752577318, + "grad_norm": 51603.07421875, + "learning_rate": 2.3037706894208994e-05, + "loss": 2.0256, + "step": 13629 + }, + { + "epoch": 2.554826616682287, + "grad_norm": 53302.10546875, + "learning_rate": 2.3031089700500964e-05, + "loss": 2.1197, + "step": 13630 + }, + { + "epoch": 2.5550140581068415, + "grad_norm": 54633.96484375, + "learning_rate": 2.302447317289004e-05, + "loss": 2.1403, + "step": 13631 + }, + { + "epoch": 2.5552014995313965, + "grad_norm": 56790.67578125, + "learning_rate": 2.301785731153967e-05, + "loss": 2.1373, + "step": 13632 + }, + { + "epoch": 2.555388940955951, + "grad_norm": 55959.15234375, + "learning_rate": 2.3011242116613217e-05, + "loss": 2.1811, + "step": 13633 + }, + { + "epoch": 2.555576382380506, + "grad_norm": 54629.82421875, + "learning_rate": 2.300462758827412e-05, + "loss": 2.0907, + "step": 13634 + }, + { + "epoch": 2.555763823805061, + "grad_norm": 54380.421875, + "learning_rate": 2.2998013726685718e-05, + "loss": 2.1111, + "step": 13635 + }, + { + "epoch": 2.555951265229616, + "grad_norm": 54637.58984375, + "learning_rate": 2.2991400532011353e-05, + "loss": 2.1526, + "step": 13636 + }, + { + "epoch": 2.5561387066541705, + "grad_norm": 50413.2890625, + "learning_rate": 2.2984788004414374e-05, + "loss": 2.0918, + "step": 13637 + }, + { + "epoch": 2.5563261480787256, + "grad_norm": 53202.1796875, + "learning_rate": 2.2978176144058126e-05, + "loss": 2.1304, + "step": 13638 + }, + { + "epoch": 2.5565135895032802, + "grad_norm": 54775.234375, + "learning_rate": 2.297156495110589e-05, + "loss": 2.1167, + "step": 13639 + }, + { + "epoch": 2.556701030927835, + "grad_norm": 55388.96484375, + "learning_rate": 2.296495442572094e-05, + "loss": 2.0231, + "step": 13640 + }, + { + "epoch": 2.55688847235239, + "grad_norm": 54147.56640625, + "learning_rate": 2.2958344568066587e-05, + "loss": 2.1454, + "step": 13641 + }, + { + "epoch": 2.5570759137769445, + "grad_norm": 55912.890625, + "learning_rate": 2.2951735378306054e-05, + "loss": 2.1834, + "step": 13642 + }, + { + "epoch": 2.5572633552014996, + "grad_norm": 52581.40625, + "learning_rate": 2.294512685660258e-05, + "loss": 2.2005, + "step": 13643 + }, + { + "epoch": 2.5574507966260542, + "grad_norm": 56357.52734375, + "learning_rate": 2.293851900311939e-05, + "loss": 2.1503, + "step": 13644 + }, + { + "epoch": 2.5576382380506093, + "grad_norm": 56303.2265625, + "learning_rate": 2.2931911818019714e-05, + "loss": 2.1443, + "step": 13645 + }, + { + "epoch": 2.557825679475164, + "grad_norm": 54376.859375, + "learning_rate": 2.292530530146672e-05, + "loss": 2.0843, + "step": 13646 + }, + { + "epoch": 2.558013120899719, + "grad_norm": 50192.0625, + "learning_rate": 2.2918699453623567e-05, + "loss": 2.1427, + "step": 13647 + }, + { + "epoch": 2.5582005623242736, + "grad_norm": 61885.9765625, + "learning_rate": 2.2912094274653446e-05, + "loss": 2.1524, + "step": 13648 + }, + { + "epoch": 2.5583880037488287, + "grad_norm": 58001.18359375, + "learning_rate": 2.2905489764719474e-05, + "loss": 2.1784, + "step": 13649 + }, + { + "epoch": 2.5585754451733833, + "grad_norm": 53360.21484375, + "learning_rate": 2.2898885923984753e-05, + "loss": 2.1612, + "step": 13650 + }, + { + "epoch": 2.558762886597938, + "grad_norm": 51791.91015625, + "learning_rate": 2.289228275261242e-05, + "loss": 2.1022, + "step": 13651 + }, + { + "epoch": 2.558950328022493, + "grad_norm": 52018.89453125, + "learning_rate": 2.288568025076559e-05, + "loss": 2.1098, + "step": 13652 + }, + { + "epoch": 2.559137769447048, + "grad_norm": 55862.9453125, + "learning_rate": 2.287907841860727e-05, + "loss": 2.0609, + "step": 13653 + }, + { + "epoch": 2.5593252108716027, + "grad_norm": 51814.1015625, + "learning_rate": 2.2872477256300544e-05, + "loss": 2.1192, + "step": 13654 + }, + { + "epoch": 2.5595126522961573, + "grad_norm": 59936.41796875, + "learning_rate": 2.286587676400848e-05, + "loss": 2.1477, + "step": 13655 + }, + { + "epoch": 2.5597000937207124, + "grad_norm": 55325.05859375, + "learning_rate": 2.2859276941894086e-05, + "loss": 2.1482, + "step": 13656 + }, + { + "epoch": 2.559887535145267, + "grad_norm": 53946.65625, + "learning_rate": 2.2852677790120335e-05, + "loss": 2.1486, + "step": 13657 + }, + { + "epoch": 2.560074976569822, + "grad_norm": 52726.609375, + "learning_rate": 2.2846079308850253e-05, + "loss": 2.1194, + "step": 13658 + }, + { + "epoch": 2.5602624179943767, + "grad_norm": 60070.07421875, + "learning_rate": 2.2839481498246825e-05, + "loss": 2.1621, + "step": 13659 + }, + { + "epoch": 2.5604498594189318, + "grad_norm": 51713.375, + "learning_rate": 2.2832884358472982e-05, + "loss": 2.1146, + "step": 13660 + }, + { + "epoch": 2.5606373008434864, + "grad_norm": 56048.8671875, + "learning_rate": 2.282628788969166e-05, + "loss": 2.2823, + "step": 13661 + }, + { + "epoch": 2.560824742268041, + "grad_norm": 50290.23046875, + "learning_rate": 2.2819692092065813e-05, + "loss": 2.1492, + "step": 13662 + }, + { + "epoch": 2.561012183692596, + "grad_norm": 54790.49609375, + "learning_rate": 2.281309696575832e-05, + "loss": 2.1029, + "step": 13663 + }, + { + "epoch": 2.561199625117151, + "grad_norm": 51047.65234375, + "learning_rate": 2.280650251093209e-05, + "loss": 2.1594, + "step": 13664 + }, + { + "epoch": 2.561387066541706, + "grad_norm": 55421.28125, + "learning_rate": 2.2799908727749997e-05, + "loss": 2.1039, + "step": 13665 + }, + { + "epoch": 2.5615745079662604, + "grad_norm": 55303.89453125, + "learning_rate": 2.2793315616374876e-05, + "loss": 2.3044, + "step": 13666 + }, + { + "epoch": 2.5617619493908155, + "grad_norm": 54233.7109375, + "learning_rate": 2.2786723176969604e-05, + "loss": 2.1219, + "step": 13667 + }, + { + "epoch": 2.56194939081537, + "grad_norm": 59235.40234375, + "learning_rate": 2.2780131409696966e-05, + "loss": 2.2253, + "step": 13668 + }, + { + "epoch": 2.562136832239925, + "grad_norm": 49597.8828125, + "learning_rate": 2.277354031471981e-05, + "loss": 2.1663, + "step": 13669 + }, + { + "epoch": 2.56232427366448, + "grad_norm": 58454.14453125, + "learning_rate": 2.276694989220089e-05, + "loss": 2.191, + "step": 13670 + }, + { + "epoch": 2.562511715089035, + "grad_norm": 54938.07421875, + "learning_rate": 2.2760360142303016e-05, + "loss": 2.1072, + "step": 13671 + }, + { + "epoch": 2.5626991565135895, + "grad_norm": 51980.23046875, + "learning_rate": 2.2753771065188933e-05, + "loss": 2.1112, + "step": 13672 + }, + { + "epoch": 2.562886597938144, + "grad_norm": 50935.19921875, + "learning_rate": 2.274718266102136e-05, + "loss": 2.1302, + "step": 13673 + }, + { + "epoch": 2.563074039362699, + "grad_norm": 57885.14453125, + "learning_rate": 2.2740594929963044e-05, + "loss": 2.1761, + "step": 13674 + }, + { + "epoch": 2.5632614807872542, + "grad_norm": 55625.0, + "learning_rate": 2.2734007872176723e-05, + "loss": 2.086, + "step": 13675 + }, + { + "epoch": 2.563448922211809, + "grad_norm": 51269.31640625, + "learning_rate": 2.2727421487825024e-05, + "loss": 2.1694, + "step": 13676 + }, + { + "epoch": 2.5636363636363635, + "grad_norm": 54475.6328125, + "learning_rate": 2.272083577707065e-05, + "loss": 2.1496, + "step": 13677 + }, + { + "epoch": 2.5638238050609186, + "grad_norm": 52831.55078125, + "learning_rate": 2.2714250740076288e-05, + "loss": 2.1123, + "step": 13678 + }, + { + "epoch": 2.564011246485473, + "grad_norm": 62473.84765625, + "learning_rate": 2.2707666377004556e-05, + "loss": 2.2138, + "step": 13679 + }, + { + "epoch": 2.5641986879100283, + "grad_norm": 56583.984375, + "learning_rate": 2.2701082688018055e-05, + "loss": 2.1198, + "step": 13680 + }, + { + "epoch": 2.564386129334583, + "grad_norm": 54522.1171875, + "learning_rate": 2.2694499673279424e-05, + "loss": 2.116, + "step": 13681 + }, + { + "epoch": 2.564573570759138, + "grad_norm": 60385.6875, + "learning_rate": 2.268791733295128e-05, + "loss": 2.1074, + "step": 13682 + }, + { + "epoch": 2.5647610121836926, + "grad_norm": 55862.30078125, + "learning_rate": 2.2681335667196135e-05, + "loss": 2.03, + "step": 13683 + }, + { + "epoch": 2.564948453608247, + "grad_norm": 55605.4921875, + "learning_rate": 2.267475467617658e-05, + "loss": 2.2308, + "step": 13684 + }, + { + "epoch": 2.5651358950328023, + "grad_norm": 56856.79296875, + "learning_rate": 2.2668174360055173e-05, + "loss": 2.1492, + "step": 13685 + }, + { + "epoch": 2.5653233364573573, + "grad_norm": 55496.7109375, + "learning_rate": 2.2661594718994427e-05, + "loss": 2.1465, + "step": 13686 + }, + { + "epoch": 2.565510777881912, + "grad_norm": 53349.18359375, + "learning_rate": 2.265501575315682e-05, + "loss": 2.1819, + "step": 13687 + }, + { + "epoch": 2.5656982193064666, + "grad_norm": 55530.99609375, + "learning_rate": 2.2648437462704896e-05, + "loss": 2.1434, + "step": 13688 + }, + { + "epoch": 2.5658856607310216, + "grad_norm": 51942.515625, + "learning_rate": 2.264185984780109e-05, + "loss": 2.1549, + "step": 13689 + }, + { + "epoch": 2.5660731021555763, + "grad_norm": 55469.18359375, + "learning_rate": 2.2635282908607886e-05, + "loss": 2.1444, + "step": 13690 + }, + { + "epoch": 2.5662605435801313, + "grad_norm": 53566.9296875, + "learning_rate": 2.26287066452877e-05, + "loss": 2.1413, + "step": 13691 + }, + { + "epoch": 2.566447985004686, + "grad_norm": 49538.9375, + "learning_rate": 2.2622131058002986e-05, + "loss": 2.1346, + "step": 13692 + }, + { + "epoch": 2.566635426429241, + "grad_norm": 52378.5, + "learning_rate": 2.261555614691615e-05, + "loss": 2.1351, + "step": 13693 + }, + { + "epoch": 2.5668228678537957, + "grad_norm": 55151.04296875, + "learning_rate": 2.260898191218955e-05, + "loss": 2.1672, + "step": 13694 + }, + { + "epoch": 2.5670103092783503, + "grad_norm": 58805.66015625, + "learning_rate": 2.2602408353985593e-05, + "loss": 2.0278, + "step": 13695 + }, + { + "epoch": 2.5671977507029053, + "grad_norm": 58323.6328125, + "learning_rate": 2.259583547246662e-05, + "loss": 2.0559, + "step": 13696 + }, + { + "epoch": 2.5673851921274604, + "grad_norm": 54013.92578125, + "learning_rate": 2.2589263267794995e-05, + "loss": 2.1929, + "step": 13697 + }, + { + "epoch": 2.567572633552015, + "grad_norm": 56386.9296875, + "learning_rate": 2.258269174013301e-05, + "loss": 2.1319, + "step": 13698 + }, + { + "epoch": 2.5677600749765697, + "grad_norm": 54562.3515625, + "learning_rate": 2.257612088964301e-05, + "loss": 2.1005, + "step": 13699 + }, + { + "epoch": 2.5679475164011247, + "grad_norm": 58875.9921875, + "learning_rate": 2.2569550716487252e-05, + "loss": 2.0649, + "step": 13700 + }, + { + "epoch": 2.5681349578256794, + "grad_norm": 57814.7578125, + "learning_rate": 2.256298122082805e-05, + "loss": 2.1524, + "step": 13701 + }, + { + "epoch": 2.5683223992502344, + "grad_norm": 58034.484375, + "learning_rate": 2.2556412402827632e-05, + "loss": 2.141, + "step": 13702 + }, + { + "epoch": 2.568509840674789, + "grad_norm": 52645.22265625, + "learning_rate": 2.254984426264823e-05, + "loss": 2.1308, + "step": 13703 + }, + { + "epoch": 2.568697282099344, + "grad_norm": 52238.01953125, + "learning_rate": 2.2543276800452084e-05, + "loss": 2.1766, + "step": 13704 + }, + { + "epoch": 2.5688847235238987, + "grad_norm": 57643.65625, + "learning_rate": 2.2536710016401442e-05, + "loss": 2.0995, + "step": 13705 + }, + { + "epoch": 2.5690721649484534, + "grad_norm": 55054.02734375, + "learning_rate": 2.2530143910658417e-05, + "loss": 2.1159, + "step": 13706 + }, + { + "epoch": 2.5692596063730084, + "grad_norm": 53110.5234375, + "learning_rate": 2.252357848338523e-05, + "loss": 2.1543, + "step": 13707 + }, + { + "epoch": 2.5694470477975635, + "grad_norm": 49360.8125, + "learning_rate": 2.2517013734744043e-05, + "loss": 2.1907, + "step": 13708 + }, + { + "epoch": 2.569634489222118, + "grad_norm": 50122.1640625, + "learning_rate": 2.2510449664896982e-05, + "loss": 2.0865, + "step": 13709 + }, + { + "epoch": 2.5698219306466727, + "grad_norm": 51187.21484375, + "learning_rate": 2.2503886274006154e-05, + "loss": 2.1223, + "step": 13710 + }, + { + "epoch": 2.570009372071228, + "grad_norm": 54701.81640625, + "learning_rate": 2.2497323562233706e-05, + "loss": 2.14, + "step": 13711 + }, + { + "epoch": 2.5701968134957824, + "grad_norm": 55353.7421875, + "learning_rate": 2.249076152974171e-05, + "loss": 2.0478, + "step": 13712 + }, + { + "epoch": 2.5703842549203375, + "grad_norm": 56812.046875, + "learning_rate": 2.2484200176692222e-05, + "loss": 2.148, + "step": 13713 + }, + { + "epoch": 2.570571696344892, + "grad_norm": 54295.26171875, + "learning_rate": 2.2477639503247312e-05, + "loss": 2.3815, + "step": 13714 + }, + { + "epoch": 2.570759137769447, + "grad_norm": 53149.9609375, + "learning_rate": 2.2471079509569043e-05, + "loss": 2.1555, + "step": 13715 + }, + { + "epoch": 2.570946579194002, + "grad_norm": 53322.71875, + "learning_rate": 2.2464520195819417e-05, + "loss": 2.1642, + "step": 13716 + }, + { + "epoch": 2.5711340206185564, + "grad_norm": 53920.08203125, + "learning_rate": 2.2457961562160423e-05, + "loss": 2.1223, + "step": 13717 + }, + { + "epoch": 2.5713214620431115, + "grad_norm": 52144.796875, + "learning_rate": 2.245140360875409e-05, + "loss": 2.211, + "step": 13718 + }, + { + "epoch": 2.5715089034676666, + "grad_norm": 56502.7578125, + "learning_rate": 2.2444846335762347e-05, + "loss": 2.1401, + "step": 13719 + }, + { + "epoch": 2.571696344892221, + "grad_norm": 52623.4609375, + "learning_rate": 2.2438289743347195e-05, + "loss": 2.1204, + "step": 13720 + }, + { + "epoch": 2.571883786316776, + "grad_norm": 56262.55859375, + "learning_rate": 2.243173383167054e-05, + "loss": 2.1231, + "step": 13721 + }, + { + "epoch": 2.572071227741331, + "grad_norm": 55299.38671875, + "learning_rate": 2.2425178600894324e-05, + "loss": 2.2235, + "step": 13722 + }, + { + "epoch": 2.5722586691658855, + "grad_norm": 57000.1796875, + "learning_rate": 2.2418624051180453e-05, + "loss": 2.174, + "step": 13723 + }, + { + "epoch": 2.5724461105904406, + "grad_norm": 55671.89453125, + "learning_rate": 2.2412070182690786e-05, + "loss": 2.1368, + "step": 13724 + }, + { + "epoch": 2.572633552014995, + "grad_norm": 58270.4921875, + "learning_rate": 2.240551699558724e-05, + "loss": 2.1119, + "step": 13725 + }, + { + "epoch": 2.5728209934395503, + "grad_norm": 54720.171875, + "learning_rate": 2.2398964490031628e-05, + "loss": 2.1557, + "step": 13726 + }, + { + "epoch": 2.573008434864105, + "grad_norm": 60369.296875, + "learning_rate": 2.2392412666185818e-05, + "loss": 2.1694, + "step": 13727 + }, + { + "epoch": 2.5731958762886595, + "grad_norm": 50767.37109375, + "learning_rate": 2.2385861524211634e-05, + "loss": 2.084, + "step": 13728 + }, + { + "epoch": 2.5733833177132146, + "grad_norm": 51882.203125, + "learning_rate": 2.2379311064270842e-05, + "loss": 2.1504, + "step": 13729 + }, + { + "epoch": 2.5735707591377697, + "grad_norm": 55972.23828125, + "learning_rate": 2.2372761286525256e-05, + "loss": 2.2016, + "step": 13730 + }, + { + "epoch": 2.5737582005623243, + "grad_norm": 53075.38671875, + "learning_rate": 2.2366212191136666e-05, + "loss": 2.1098, + "step": 13731 + }, + { + "epoch": 2.573945641986879, + "grad_norm": 54045.99609375, + "learning_rate": 2.235966377826681e-05, + "loss": 2.159, + "step": 13732 + }, + { + "epoch": 2.574133083411434, + "grad_norm": 57085.60546875, + "learning_rate": 2.23531160480774e-05, + "loss": 2.1077, + "step": 13733 + }, + { + "epoch": 2.5743205248359886, + "grad_norm": 54579.125, + "learning_rate": 2.2346569000730182e-05, + "loss": 2.1844, + "step": 13734 + }, + { + "epoch": 2.5745079662605437, + "grad_norm": 53146.03515625, + "learning_rate": 2.234002263638689e-05, + "loss": 2.1247, + "step": 13735 + }, + { + "epoch": 2.5746954076850983, + "grad_norm": 53202.6015625, + "learning_rate": 2.2333476955209144e-05, + "loss": 2.0993, + "step": 13736 + }, + { + "epoch": 2.5748828491096534, + "grad_norm": 54245.46484375, + "learning_rate": 2.2326931957358642e-05, + "loss": 2.1372, + "step": 13737 + }, + { + "epoch": 2.575070290534208, + "grad_norm": 55801.2109375, + "learning_rate": 2.2320387642997064e-05, + "loss": 2.124, + "step": 13738 + }, + { + "epoch": 2.5752577319587626, + "grad_norm": 54766.92578125, + "learning_rate": 2.2313844012286027e-05, + "loss": 2.0822, + "step": 13739 + }, + { + "epoch": 2.5754451733833177, + "grad_norm": 55996.9609375, + "learning_rate": 2.2307301065387127e-05, + "loss": 2.0952, + "step": 13740 + }, + { + "epoch": 2.5756326148078728, + "grad_norm": 54869.91796875, + "learning_rate": 2.2300758802462003e-05, + "loss": 2.0972, + "step": 13741 + }, + { + "epoch": 2.5758200562324274, + "grad_norm": 56337.04296875, + "learning_rate": 2.2294217223672227e-05, + "loss": 2.112, + "step": 13742 + }, + { + "epoch": 2.576007497656982, + "grad_norm": 55539.4921875, + "learning_rate": 2.2287676329179342e-05, + "loss": 2.1375, + "step": 13743 + }, + { + "epoch": 2.576194939081537, + "grad_norm": 53697.51171875, + "learning_rate": 2.2281136119144923e-05, + "loss": 2.2113, + "step": 13744 + }, + { + "epoch": 2.5763823805060917, + "grad_norm": 49082.22265625, + "learning_rate": 2.2274596593730524e-05, + "loss": 2.1647, + "step": 13745 + }, + { + "epoch": 2.5765698219306468, + "grad_norm": 50248.79296875, + "learning_rate": 2.2268057753097644e-05, + "loss": 2.146, + "step": 13746 + }, + { + "epoch": 2.5767572633552014, + "grad_norm": 56342.80859375, + "learning_rate": 2.226151959740776e-05, + "loss": 2.1141, + "step": 13747 + }, + { + "epoch": 2.5769447047797565, + "grad_norm": 49541.15234375, + "learning_rate": 2.2254982126822398e-05, + "loss": 2.1125, + "step": 13748 + }, + { + "epoch": 2.577132146204311, + "grad_norm": 54725.703125, + "learning_rate": 2.2248445341503005e-05, + "loss": 2.1254, + "step": 13749 + }, + { + "epoch": 2.5773195876288657, + "grad_norm": 56495.25390625, + "learning_rate": 2.2241909241611014e-05, + "loss": 2.1246, + "step": 13750 + }, + { + "epoch": 2.5775070290534208, + "grad_norm": 51165.25390625, + "learning_rate": 2.2235373827307878e-05, + "loss": 2.0736, + "step": 13751 + }, + { + "epoch": 2.577694470477976, + "grad_norm": 56561.296875, + "learning_rate": 2.222883909875503e-05, + "loss": 2.1554, + "step": 13752 + }, + { + "epoch": 2.5778819119025305, + "grad_norm": 58003.51953125, + "learning_rate": 2.2222305056113845e-05, + "loss": 2.1092, + "step": 13753 + }, + { + "epoch": 2.578069353327085, + "grad_norm": 52909.40234375, + "learning_rate": 2.2215771699545696e-05, + "loss": 2.0888, + "step": 13754 + }, + { + "epoch": 2.57825679475164, + "grad_norm": 58991.87890625, + "learning_rate": 2.2209239029211982e-05, + "loss": 2.1842, + "step": 13755 + }, + { + "epoch": 2.578444236176195, + "grad_norm": 56374.7734375, + "learning_rate": 2.2202707045274012e-05, + "loss": 2.1467, + "step": 13756 + }, + { + "epoch": 2.57863167760075, + "grad_norm": 52878.80078125, + "learning_rate": 2.219617574789316e-05, + "loss": 2.117, + "step": 13757 + }, + { + "epoch": 2.5788191190253045, + "grad_norm": 55858.03125, + "learning_rate": 2.218964513723072e-05, + "loss": 2.1727, + "step": 13758 + }, + { + "epoch": 2.5790065604498595, + "grad_norm": 57540.51953125, + "learning_rate": 2.2183115213447965e-05, + "loss": 2.1038, + "step": 13759 + }, + { + "epoch": 2.579194001874414, + "grad_norm": 53705.03125, + "learning_rate": 2.21765859767062e-05, + "loss": 2.1419, + "step": 13760 + }, + { + "epoch": 2.5793814432989692, + "grad_norm": 51945.25, + "learning_rate": 2.2170057427166707e-05, + "loss": 2.1905, + "step": 13761 + }, + { + "epoch": 2.579568884723524, + "grad_norm": 53946.42578125, + "learning_rate": 2.2163529564990713e-05, + "loss": 2.1647, + "step": 13762 + }, + { + "epoch": 2.579756326148079, + "grad_norm": 57633.4375, + "learning_rate": 2.2157002390339433e-05, + "loss": 2.1205, + "step": 13763 + }, + { + "epoch": 2.5799437675726336, + "grad_norm": 53901.38671875, + "learning_rate": 2.2150475903374112e-05, + "loss": 2.1424, + "step": 13764 + }, + { + "epoch": 2.580131208997188, + "grad_norm": 56523.8515625, + "learning_rate": 2.214395010425593e-05, + "loss": 2.1554, + "step": 13765 + }, + { + "epoch": 2.5803186504217432, + "grad_norm": 53774.74609375, + "learning_rate": 2.2137424993146037e-05, + "loss": 2.1544, + "step": 13766 + }, + { + "epoch": 2.580506091846298, + "grad_norm": 50802.33984375, + "learning_rate": 2.213090057020563e-05, + "loss": 2.1298, + "step": 13767 + }, + { + "epoch": 2.580693533270853, + "grad_norm": 52816.24609375, + "learning_rate": 2.2124376835595884e-05, + "loss": 2.1732, + "step": 13768 + }, + { + "epoch": 2.5808809746954076, + "grad_norm": 54398.07421875, + "learning_rate": 2.211785378947785e-05, + "loss": 2.0267, + "step": 13769 + }, + { + "epoch": 2.5810684161199626, + "grad_norm": 52471.0, + "learning_rate": 2.2111331432012676e-05, + "loss": 2.1098, + "step": 13770 + }, + { + "epoch": 2.5812558575445173, + "grad_norm": 50034.90625, + "learning_rate": 2.2104809763361484e-05, + "loss": 2.1511, + "step": 13771 + }, + { + "epoch": 2.5814432989690723, + "grad_norm": 51676.421875, + "learning_rate": 2.2098288783685317e-05, + "loss": 2.1641, + "step": 13772 + }, + { + "epoch": 2.581630740393627, + "grad_norm": 57033.41796875, + "learning_rate": 2.209176849314522e-05, + "loss": 2.1661, + "step": 13773 + }, + { + "epoch": 2.581818181818182, + "grad_norm": 53505.16015625, + "learning_rate": 2.2085248891902266e-05, + "loss": 2.1458, + "step": 13774 + }, + { + "epoch": 2.5820056232427366, + "grad_norm": 57931.5625, + "learning_rate": 2.2078729980117485e-05, + "loss": 2.1529, + "step": 13775 + }, + { + "epoch": 2.5821930646672913, + "grad_norm": 54189.6484375, + "learning_rate": 2.2072211757951877e-05, + "loss": 2.1926, + "step": 13776 + }, + { + "epoch": 2.5823805060918463, + "grad_norm": 54739.54296875, + "learning_rate": 2.2065694225566407e-05, + "loss": 2.0948, + "step": 13777 + }, + { + "epoch": 2.5825679475164014, + "grad_norm": 53900.703125, + "learning_rate": 2.2059177383122093e-05, + "loss": 2.069, + "step": 13778 + }, + { + "epoch": 2.582755388940956, + "grad_norm": 55912.578125, + "learning_rate": 2.205266123077987e-05, + "loss": 2.0922, + "step": 13779 + }, + { + "epoch": 2.5829428303655106, + "grad_norm": 63460.94140625, + "learning_rate": 2.2046145768700667e-05, + "loss": 2.0857, + "step": 13780 + }, + { + "epoch": 2.5831302717900657, + "grad_norm": 54179.40625, + "learning_rate": 2.2039630997045434e-05, + "loss": 2.1145, + "step": 13781 + }, + { + "epoch": 2.5833177132146203, + "grad_norm": 54289.16796875, + "learning_rate": 2.203311691597505e-05, + "loss": 2.1489, + "step": 13782 + }, + { + "epoch": 2.5835051546391754, + "grad_norm": 54355.828125, + "learning_rate": 2.2026603525650434e-05, + "loss": 2.1929, + "step": 13783 + }, + { + "epoch": 2.58369259606373, + "grad_norm": 57910.3828125, + "learning_rate": 2.2020090826232426e-05, + "loss": 2.078, + "step": 13784 + }, + { + "epoch": 2.583880037488285, + "grad_norm": 51872.21875, + "learning_rate": 2.2013578817881914e-05, + "loss": 2.1256, + "step": 13785 + }, + { + "epoch": 2.5840674789128397, + "grad_norm": 54841.3984375, + "learning_rate": 2.2007067500759703e-05, + "loss": 2.1099, + "step": 13786 + }, + { + "epoch": 2.5842549203373943, + "grad_norm": 52089.82421875, + "learning_rate": 2.200055687502665e-05, + "loss": 2.1126, + "step": 13787 + }, + { + "epoch": 2.5844423617619494, + "grad_norm": 53685.50390625, + "learning_rate": 2.1994046940843543e-05, + "loss": 2.1641, + "step": 13788 + }, + { + "epoch": 2.5846298031865045, + "grad_norm": 53359.04296875, + "learning_rate": 2.198753769837114e-05, + "loss": 2.1382, + "step": 13789 + }, + { + "epoch": 2.584817244611059, + "grad_norm": 61540.87109375, + "learning_rate": 2.1981029147770266e-05, + "loss": 2.054, + "step": 13790 + }, + { + "epoch": 2.5850046860356137, + "grad_norm": 53688.0078125, + "learning_rate": 2.197452128920162e-05, + "loss": 2.1421, + "step": 13791 + }, + { + "epoch": 2.585192127460169, + "grad_norm": 57120.6875, + "learning_rate": 2.196801412282598e-05, + "loss": 2.1554, + "step": 13792 + }, + { + "epoch": 2.5853795688847234, + "grad_norm": 55730.015625, + "learning_rate": 2.1961507648804035e-05, + "loss": 2.1354, + "step": 13793 + }, + { + "epoch": 2.5855670103092785, + "grad_norm": 54286.6171875, + "learning_rate": 2.1955001867296508e-05, + "loss": 2.1036, + "step": 13794 + }, + { + "epoch": 2.585754451733833, + "grad_norm": 51524.26953125, + "learning_rate": 2.1948496778464083e-05, + "loss": 2.177, + "step": 13795 + }, + { + "epoch": 2.585941893158388, + "grad_norm": 52836.28515625, + "learning_rate": 2.1941992382467396e-05, + "loss": 2.1045, + "step": 13796 + }, + { + "epoch": 2.586129334582943, + "grad_norm": 58878.43359375, + "learning_rate": 2.193548867946712e-05, + "loss": 2.1168, + "step": 13797 + }, + { + "epoch": 2.5863167760074974, + "grad_norm": 53727.0859375, + "learning_rate": 2.1928985669623924e-05, + "loss": 2.1769, + "step": 13798 + }, + { + "epoch": 2.5865042174320525, + "grad_norm": 51791.47265625, + "learning_rate": 2.192248335309835e-05, + "loss": 2.1264, + "step": 13799 + }, + { + "epoch": 2.5866916588566076, + "grad_norm": 54140.48046875, + "learning_rate": 2.1915981730051032e-05, + "loss": 2.2129, + "step": 13800 + }, + { + "epoch": 2.586879100281162, + "grad_norm": 55368.66796875, + "learning_rate": 2.190948080064258e-05, + "loss": 2.1705, + "step": 13801 + }, + { + "epoch": 2.587066541705717, + "grad_norm": 53945.1875, + "learning_rate": 2.1902980565033526e-05, + "loss": 2.1744, + "step": 13802 + }, + { + "epoch": 2.587253983130272, + "grad_norm": 52180.48828125, + "learning_rate": 2.189648102338441e-05, + "loss": 2.1841, + "step": 13803 + }, + { + "epoch": 2.5874414245548265, + "grad_norm": 52061.9453125, + "learning_rate": 2.1889982175855783e-05, + "loss": 2.1054, + "step": 13804 + }, + { + "epoch": 2.5876288659793816, + "grad_norm": 58161.1484375, + "learning_rate": 2.1883484022608164e-05, + "loss": 2.0835, + "step": 13805 + }, + { + "epoch": 2.587816307403936, + "grad_norm": 58290.6015625, + "learning_rate": 2.187698656380201e-05, + "loss": 2.0535, + "step": 13806 + }, + { + "epoch": 2.5880037488284913, + "grad_norm": 56829.5859375, + "learning_rate": 2.1870489799597825e-05, + "loss": 2.1162, + "step": 13807 + }, + { + "epoch": 2.588191190253046, + "grad_norm": 54453.8359375, + "learning_rate": 2.1863993730156096e-05, + "loss": 2.0896, + "step": 13808 + }, + { + "epoch": 2.5883786316776005, + "grad_norm": 55932.35546875, + "learning_rate": 2.1857498355637242e-05, + "loss": 2.0775, + "step": 13809 + }, + { + "epoch": 2.5885660731021556, + "grad_norm": 57692.30078125, + "learning_rate": 2.185100367620167e-05, + "loss": 2.0782, + "step": 13810 + }, + { + "epoch": 2.5887535145267107, + "grad_norm": 55869.0625, + "learning_rate": 2.184450969200983e-05, + "loss": 2.1093, + "step": 13811 + }, + { + "epoch": 2.5889409559512653, + "grad_norm": 54441.49609375, + "learning_rate": 2.183801640322208e-05, + "loss": 2.1401, + "step": 13812 + }, + { + "epoch": 2.58912839737582, + "grad_norm": 54352.41796875, + "learning_rate": 2.1831523809998826e-05, + "loss": 2.1814, + "step": 13813 + }, + { + "epoch": 2.589315838800375, + "grad_norm": 51584.3125, + "learning_rate": 2.1825031912500398e-05, + "loss": 2.1104, + "step": 13814 + }, + { + "epoch": 2.5895032802249296, + "grad_norm": 49937.0703125, + "learning_rate": 2.1818540710887174e-05, + "loss": 2.1742, + "step": 13815 + }, + { + "epoch": 2.5896907216494847, + "grad_norm": 49013.53515625, + "learning_rate": 2.1812050205319435e-05, + "loss": 2.1651, + "step": 13816 + }, + { + "epoch": 2.5898781630740393, + "grad_norm": 55977.51953125, + "learning_rate": 2.180556039595753e-05, + "loss": 2.1185, + "step": 13817 + }, + { + "epoch": 2.5900656044985944, + "grad_norm": 54088.38671875, + "learning_rate": 2.1799071282961724e-05, + "loss": 2.1356, + "step": 13818 + }, + { + "epoch": 2.590253045923149, + "grad_norm": 55094.0625, + "learning_rate": 2.179258286649228e-05, + "loss": 2.1536, + "step": 13819 + }, + { + "epoch": 2.5904404873477036, + "grad_norm": 57062.73046875, + "learning_rate": 2.1786095146709485e-05, + "loss": 2.1632, + "step": 13820 + }, + { + "epoch": 2.5906279287722587, + "grad_norm": 62599.33203125, + "learning_rate": 2.177960812377356e-05, + "loss": 2.0901, + "step": 13821 + }, + { + "epoch": 2.5908153701968137, + "grad_norm": 49534.53515625, + "learning_rate": 2.1773121797844708e-05, + "loss": 2.106, + "step": 13822 + }, + { + "epoch": 2.5910028116213684, + "grad_norm": 53586.171875, + "learning_rate": 2.176663616908315e-05, + "loss": 2.1695, + "step": 13823 + }, + { + "epoch": 2.591190253045923, + "grad_norm": 52915.328125, + "learning_rate": 2.1760151237649095e-05, + "loss": 2.1475, + "step": 13824 + }, + { + "epoch": 2.591377694470478, + "grad_norm": 54017.28125, + "learning_rate": 2.1753667003702687e-05, + "loss": 2.1169, + "step": 13825 + }, + { + "epoch": 2.5915651358950327, + "grad_norm": 53720.1171875, + "learning_rate": 2.1747183467404064e-05, + "loss": 2.1847, + "step": 13826 + }, + { + "epoch": 2.5917525773195877, + "grad_norm": 51227.86328125, + "learning_rate": 2.174070062891338e-05, + "loss": 2.147, + "step": 13827 + }, + { + "epoch": 2.5919400187441424, + "grad_norm": 57329.62890625, + "learning_rate": 2.173421848839079e-05, + "loss": 2.1086, + "step": 13828 + }, + { + "epoch": 2.5921274601686974, + "grad_norm": 56597.2578125, + "learning_rate": 2.1727737045996315e-05, + "loss": 2.2108, + "step": 13829 + }, + { + "epoch": 2.592314901593252, + "grad_norm": 56201.94921875, + "learning_rate": 2.1721256301890086e-05, + "loss": 2.125, + "step": 13830 + }, + { + "epoch": 2.5925023430178067, + "grad_norm": 53021.609375, + "learning_rate": 2.1714776256232178e-05, + "loss": 2.1364, + "step": 13831 + }, + { + "epoch": 2.5926897844423618, + "grad_norm": 52527.96875, + "learning_rate": 2.1708296909182624e-05, + "loss": 2.1425, + "step": 13832 + }, + { + "epoch": 2.592877225866917, + "grad_norm": 58291.8671875, + "learning_rate": 2.1701818260901434e-05, + "loss": 2.1704, + "step": 13833 + }, + { + "epoch": 2.5930646672914714, + "grad_norm": 51801.3671875, + "learning_rate": 2.1695340311548667e-05, + "loss": 2.1707, + "step": 13834 + }, + { + "epoch": 2.593252108716026, + "grad_norm": 55745.00390625, + "learning_rate": 2.168886306128429e-05, + "loss": 2.0877, + "step": 13835 + }, + { + "epoch": 2.593439550140581, + "grad_norm": 58508.42578125, + "learning_rate": 2.1682386510268276e-05, + "loss": 2.1354, + "step": 13836 + }, + { + "epoch": 2.5936269915651358, + "grad_norm": 56845.1484375, + "learning_rate": 2.1675910658660603e-05, + "loss": 2.2678, + "step": 13837 + }, + { + "epoch": 2.593814432989691, + "grad_norm": 53206.73828125, + "learning_rate": 2.1669435506621226e-05, + "loss": 2.1385, + "step": 13838 + }, + { + "epoch": 2.5940018744142455, + "grad_norm": 53690.37109375, + "learning_rate": 2.166296105431007e-05, + "loss": 2.1097, + "step": 13839 + }, + { + "epoch": 2.5941893158388005, + "grad_norm": 51683.65625, + "learning_rate": 2.1656487301887014e-05, + "loss": 2.1364, + "step": 13840 + }, + { + "epoch": 2.594376757263355, + "grad_norm": 52825.83203125, + "learning_rate": 2.1650014249511992e-05, + "loss": 2.1471, + "step": 13841 + }, + { + "epoch": 2.5945641986879098, + "grad_norm": 51984.21875, + "learning_rate": 2.1643541897344837e-05, + "loss": 2.1223, + "step": 13842 + }, + { + "epoch": 2.594751640112465, + "grad_norm": 53104.0859375, + "learning_rate": 2.163707024554546e-05, + "loss": 2.1425, + "step": 13843 + }, + { + "epoch": 2.59493908153702, + "grad_norm": 60608.27734375, + "learning_rate": 2.1630599294273652e-05, + "loss": 2.1193, + "step": 13844 + }, + { + "epoch": 2.5951265229615745, + "grad_norm": 52999.59375, + "learning_rate": 2.162412904368928e-05, + "loss": 2.1564, + "step": 13845 + }, + { + "epoch": 2.595313964386129, + "grad_norm": 55533.55078125, + "learning_rate": 2.1617659493952124e-05, + "loss": 2.1804, + "step": 13846 + }, + { + "epoch": 2.5955014058106842, + "grad_norm": 50059.89453125, + "learning_rate": 2.1611190645221967e-05, + "loss": 2.1257, + "step": 13847 + }, + { + "epoch": 2.595688847235239, + "grad_norm": 54458.84765625, + "learning_rate": 2.160472249765862e-05, + "loss": 2.1517, + "step": 13848 + }, + { + "epoch": 2.595876288659794, + "grad_norm": 51454.984375, + "learning_rate": 2.1598255051421784e-05, + "loss": 2.1335, + "step": 13849 + }, + { + "epoch": 2.5960637300843485, + "grad_norm": 52494.70703125, + "learning_rate": 2.1591788306671246e-05, + "loss": 2.1312, + "step": 13850 + }, + { + "epoch": 2.5962511715089036, + "grad_norm": 55512.640625, + "learning_rate": 2.1585322263566704e-05, + "loss": 2.1702, + "step": 13851 + }, + { + "epoch": 2.5964386129334582, + "grad_norm": 52929.6484375, + "learning_rate": 2.1578856922267847e-05, + "loss": 2.0228, + "step": 13852 + }, + { + "epoch": 2.596626054358013, + "grad_norm": 53192.3203125, + "learning_rate": 2.1572392282934373e-05, + "loss": 2.1479, + "step": 13853 + }, + { + "epoch": 2.596813495782568, + "grad_norm": 58161.5859375, + "learning_rate": 2.1565928345725974e-05, + "loss": 2.1879, + "step": 13854 + }, + { + "epoch": 2.597000937207123, + "grad_norm": 54814.5234375, + "learning_rate": 2.1559465110802274e-05, + "loss": 2.1443, + "step": 13855 + }, + { + "epoch": 2.5971883786316776, + "grad_norm": 58674.02734375, + "learning_rate": 2.15530025783229e-05, + "loss": 2.1603, + "step": 13856 + }, + { + "epoch": 2.5973758200562322, + "grad_norm": 50161.1015625, + "learning_rate": 2.154654074844749e-05, + "loss": 2.1628, + "step": 13857 + }, + { + "epoch": 2.5975632614807873, + "grad_norm": 53423.1328125, + "learning_rate": 2.154007962133564e-05, + "loss": 2.1166, + "step": 13858 + }, + { + "epoch": 2.597750702905342, + "grad_norm": 58591.28515625, + "learning_rate": 2.153361919714691e-05, + "loss": 2.076, + "step": 13859 + }, + { + "epoch": 2.597938144329897, + "grad_norm": 54222.5859375, + "learning_rate": 2.152715947604087e-05, + "loss": 2.1625, + "step": 13860 + }, + { + "epoch": 2.5981255857544516, + "grad_norm": 60884.984375, + "learning_rate": 2.1520700458177118e-05, + "loss": 2.2123, + "step": 13861 + }, + { + "epoch": 2.5983130271790067, + "grad_norm": 51288.48828125, + "learning_rate": 2.1514242143715097e-05, + "loss": 2.1692, + "step": 13862 + }, + { + "epoch": 2.5985004686035613, + "grad_norm": 54376.9921875, + "learning_rate": 2.1507784532814367e-05, + "loss": 2.1486, + "step": 13863 + }, + { + "epoch": 2.598687910028116, + "grad_norm": 53220.515625, + "learning_rate": 2.1501327625634436e-05, + "loss": 2.1521, + "step": 13864 + }, + { + "epoch": 2.598875351452671, + "grad_norm": 56207.65625, + "learning_rate": 2.1494871422334768e-05, + "loss": 2.1167, + "step": 13865 + }, + { + "epoch": 2.599062792877226, + "grad_norm": 51554.359375, + "learning_rate": 2.1488415923074795e-05, + "loss": 2.0654, + "step": 13866 + }, + { + "epoch": 2.5992502343017807, + "grad_norm": 55593.39453125, + "learning_rate": 2.1481961128013982e-05, + "loss": 2.1085, + "step": 13867 + }, + { + "epoch": 2.5994376757263353, + "grad_norm": 55040.26171875, + "learning_rate": 2.1475507037311775e-05, + "loss": 2.2233, + "step": 13868 + }, + { + "epoch": 2.5996251171508904, + "grad_norm": 53726.87890625, + "learning_rate": 2.1469053651127565e-05, + "loss": 2.1082, + "step": 13869 + }, + { + "epoch": 2.599812558575445, + "grad_norm": 53554.87109375, + "learning_rate": 2.1462600969620716e-05, + "loss": 2.118, + "step": 13870 + }, + { + "epoch": 2.6, + "grad_norm": 60669.83203125, + "learning_rate": 2.1456148992950646e-05, + "loss": 2.0322, + "step": 13871 + }, + { + "epoch": 2.6001874414245547, + "grad_norm": 54440.984375, + "learning_rate": 2.1449697721276667e-05, + "loss": 2.2138, + "step": 13872 + }, + { + "epoch": 2.60037488284911, + "grad_norm": 53090.265625, + "learning_rate": 2.1443247154758154e-05, + "loss": 2.1292, + "step": 13873 + }, + { + "epoch": 2.6005623242736644, + "grad_norm": 56794.65625, + "learning_rate": 2.1436797293554416e-05, + "loss": 2.1344, + "step": 13874 + }, + { + "epoch": 2.600749765698219, + "grad_norm": 53090.56640625, + "learning_rate": 2.1430348137824734e-05, + "loss": 2.1333, + "step": 13875 + }, + { + "epoch": 2.600937207122774, + "grad_norm": 61106.6328125, + "learning_rate": 2.1423899687728432e-05, + "loss": 2.1029, + "step": 13876 + }, + { + "epoch": 2.601124648547329, + "grad_norm": 51374.73828125, + "learning_rate": 2.1417451943424736e-05, + "loss": 2.1183, + "step": 13877 + }, + { + "epoch": 2.601312089971884, + "grad_norm": 58542.75390625, + "learning_rate": 2.141100490507294e-05, + "loss": 2.1726, + "step": 13878 + }, + { + "epoch": 2.6014995313964384, + "grad_norm": 54308.703125, + "learning_rate": 2.1404558572832235e-05, + "loss": 2.12, + "step": 13879 + }, + { + "epoch": 2.6016869728209935, + "grad_norm": 54519.1640625, + "learning_rate": 2.1398112946861883e-05, + "loss": 2.1076, + "step": 13880 + }, + { + "epoch": 2.601874414245548, + "grad_norm": 58637.52734375, + "learning_rate": 2.1391668027321047e-05, + "loss": 2.1895, + "step": 13881 + }, + { + "epoch": 2.602061855670103, + "grad_norm": 55310.13671875, + "learning_rate": 2.138522381436891e-05, + "loss": 2.1524, + "step": 13882 + }, + { + "epoch": 2.602249297094658, + "grad_norm": 55543.98046875, + "learning_rate": 2.1378780308164648e-05, + "loss": 2.0791, + "step": 13883 + }, + { + "epoch": 2.602436738519213, + "grad_norm": 57722.85546875, + "learning_rate": 2.137233750886742e-05, + "loss": 2.102, + "step": 13884 + }, + { + "epoch": 2.6026241799437675, + "grad_norm": 51240.44140625, + "learning_rate": 2.1365895416636335e-05, + "loss": 2.1906, + "step": 13885 + }, + { + "epoch": 2.6028116213683226, + "grad_norm": 57061.96484375, + "learning_rate": 2.1359454031630492e-05, + "loss": 2.2023, + "step": 13886 + }, + { + "epoch": 2.602999062792877, + "grad_norm": 54326.2265625, + "learning_rate": 2.135301335400902e-05, + "loss": 2.1616, + "step": 13887 + }, + { + "epoch": 2.6031865042174323, + "grad_norm": 52845.640625, + "learning_rate": 2.1346573383930978e-05, + "loss": 2.1091, + "step": 13888 + }, + { + "epoch": 2.603373945641987, + "grad_norm": 50680.7109375, + "learning_rate": 2.1340134121555407e-05, + "loss": 2.1473, + "step": 13889 + }, + { + "epoch": 2.6035613870665415, + "grad_norm": 55745.8359375, + "learning_rate": 2.1333695567041363e-05, + "loss": 2.1257, + "step": 13890 + }, + { + "epoch": 2.6037488284910966, + "grad_norm": 51844.07421875, + "learning_rate": 2.1327257720547907e-05, + "loss": 2.092, + "step": 13891 + }, + { + "epoch": 2.6039362699156516, + "grad_norm": 57250.1953125, + "learning_rate": 2.1320820582233974e-05, + "loss": 2.2067, + "step": 13892 + }, + { + "epoch": 2.6041237113402063, + "grad_norm": 54591.171875, + "learning_rate": 2.1314384152258594e-05, + "loss": 2.1631, + "step": 13893 + }, + { + "epoch": 2.604311152764761, + "grad_norm": 57203.48828125, + "learning_rate": 2.1307948430780744e-05, + "loss": 2.1382, + "step": 13894 + }, + { + "epoch": 2.604498594189316, + "grad_norm": 56389.8984375, + "learning_rate": 2.1301513417959373e-05, + "loss": 2.0704, + "step": 13895 + }, + { + "epoch": 2.6046860356138706, + "grad_norm": 57624.6953125, + "learning_rate": 2.1295079113953397e-05, + "loss": 2.1088, + "step": 13896 + }, + { + "epoch": 2.6048734770384256, + "grad_norm": 51233.73046875, + "learning_rate": 2.1288645518921763e-05, + "loss": 2.1198, + "step": 13897 + }, + { + "epoch": 2.6050609184629803, + "grad_norm": 57145.921875, + "learning_rate": 2.1282212633023345e-05, + "loss": 2.189, + "step": 13898 + }, + { + "epoch": 2.6052483598875353, + "grad_norm": 49765.17578125, + "learning_rate": 2.1275780456417062e-05, + "loss": 2.1631, + "step": 13899 + }, + { + "epoch": 2.60543580131209, + "grad_norm": 56052.17578125, + "learning_rate": 2.126934898926174e-05, + "loss": 2.1612, + "step": 13900 + }, + { + "epoch": 2.6056232427366446, + "grad_norm": 54555.71875, + "learning_rate": 2.1262918231716266e-05, + "loss": 2.11, + "step": 13901 + }, + { + "epoch": 2.6058106841611997, + "grad_norm": 56493.703125, + "learning_rate": 2.1256488183939455e-05, + "loss": 2.1188, + "step": 13902 + }, + { + "epoch": 2.6059981255857547, + "grad_norm": 57893.22265625, + "learning_rate": 2.1250058846090098e-05, + "loss": 2.169, + "step": 13903 + }, + { + "epoch": 2.6061855670103093, + "grad_norm": 57115.421875, + "learning_rate": 2.124363021832703e-05, + "loss": 2.0808, + "step": 13904 + }, + { + "epoch": 2.606373008434864, + "grad_norm": 56560.421875, + "learning_rate": 2.1237202300809002e-05, + "loss": 2.1476, + "step": 13905 + }, + { + "epoch": 2.606560449859419, + "grad_norm": 54099.65625, + "learning_rate": 2.1230775093694795e-05, + "loss": 2.1546, + "step": 13906 + }, + { + "epoch": 2.6067478912839737, + "grad_norm": 51375.93359375, + "learning_rate": 2.1224348597143128e-05, + "loss": 2.0975, + "step": 13907 + }, + { + "epoch": 2.6069353327085287, + "grad_norm": 55237.88671875, + "learning_rate": 2.1217922811312764e-05, + "loss": 2.1112, + "step": 13908 + }, + { + "epoch": 2.6071227741330834, + "grad_norm": 50464.828125, + "learning_rate": 2.1211497736362367e-05, + "loss": 2.1329, + "step": 13909 + }, + { + "epoch": 2.6073102155576384, + "grad_norm": 52048.79296875, + "learning_rate": 2.1205073372450674e-05, + "loss": 2.1285, + "step": 13910 + }, + { + "epoch": 2.607497656982193, + "grad_norm": 59007.01953125, + "learning_rate": 2.119864971973633e-05, + "loss": 2.1184, + "step": 13911 + }, + { + "epoch": 2.6076850984067477, + "grad_norm": 53892.984375, + "learning_rate": 2.1192226778377984e-05, + "loss": 2.192, + "step": 13912 + }, + { + "epoch": 2.6078725398313027, + "grad_norm": 53570.67578125, + "learning_rate": 2.1185804548534282e-05, + "loss": 2.103, + "step": 13913 + }, + { + "epoch": 2.608059981255858, + "grad_norm": 53057.4296875, + "learning_rate": 2.1179383030363888e-05, + "loss": 2.0975, + "step": 13914 + }, + { + "epoch": 2.6082474226804124, + "grad_norm": 52381.26953125, + "learning_rate": 2.117296222402533e-05, + "loss": 2.0704, + "step": 13915 + }, + { + "epoch": 2.608434864104967, + "grad_norm": 50614.2265625, + "learning_rate": 2.116654212967723e-05, + "loss": 2.154, + "step": 13916 + }, + { + "epoch": 2.608622305529522, + "grad_norm": 57349.625, + "learning_rate": 2.1160122747478174e-05, + "loss": 2.1337, + "step": 13917 + }, + { + "epoch": 2.6088097469540767, + "grad_norm": 52510.546875, + "learning_rate": 2.1153704077586695e-05, + "loss": 2.0865, + "step": 13918 + }, + { + "epoch": 2.608997188378632, + "grad_norm": 51017.625, + "learning_rate": 2.114728612016131e-05, + "loss": 2.0459, + "step": 13919 + }, + { + "epoch": 2.6091846298031864, + "grad_norm": 56598.1640625, + "learning_rate": 2.1140868875360547e-05, + "loss": 2.089, + "step": 13920 + }, + { + "epoch": 2.6093720712277415, + "grad_norm": 53220.88671875, + "learning_rate": 2.1134452343342943e-05, + "loss": 2.1158, + "step": 13921 + }, + { + "epoch": 2.609559512652296, + "grad_norm": 53347.35546875, + "learning_rate": 2.1128036524266903e-05, + "loss": 2.0967, + "step": 13922 + }, + { + "epoch": 2.6097469540768508, + "grad_norm": 56757.98828125, + "learning_rate": 2.1121621418290933e-05, + "loss": 2.1286, + "step": 13923 + }, + { + "epoch": 2.609934395501406, + "grad_norm": 58322.49609375, + "learning_rate": 2.1115207025573486e-05, + "loss": 2.0748, + "step": 13924 + }, + { + "epoch": 2.610121836925961, + "grad_norm": 51254.93359375, + "learning_rate": 2.1108793346272982e-05, + "loss": 2.0687, + "step": 13925 + }, + { + "epoch": 2.6103092783505155, + "grad_norm": 53753.99609375, + "learning_rate": 2.11023803805478e-05, + "loss": 2.1805, + "step": 13926 + }, + { + "epoch": 2.61049671977507, + "grad_norm": 49884.984375, + "learning_rate": 2.1095968128556378e-05, + "loss": 2.09, + "step": 13927 + }, + { + "epoch": 2.610684161199625, + "grad_norm": 51765.0859375, + "learning_rate": 2.1089556590457044e-05, + "loss": 2.0917, + "step": 13928 + }, + { + "epoch": 2.61087160262418, + "grad_norm": 50986.10546875, + "learning_rate": 2.1083145766408202e-05, + "loss": 2.0622, + "step": 13929 + }, + { + "epoch": 2.611059044048735, + "grad_norm": 54081.2890625, + "learning_rate": 2.1076735656568153e-05, + "loss": 2.1025, + "step": 13930 + }, + { + "epoch": 2.6112464854732895, + "grad_norm": 52542.046875, + "learning_rate": 2.1070326261095248e-05, + "loss": 2.1457, + "step": 13931 + }, + { + "epoch": 2.6114339268978446, + "grad_norm": 54697.44921875, + "learning_rate": 2.106391758014778e-05, + "loss": 2.1081, + "step": 13932 + }, + { + "epoch": 2.611621368322399, + "grad_norm": 51737.61328125, + "learning_rate": 2.105750961388401e-05, + "loss": 2.0491, + "step": 13933 + }, + { + "epoch": 2.611808809746954, + "grad_norm": 58449.078125, + "learning_rate": 2.1051102362462245e-05, + "loss": 2.2534, + "step": 13934 + }, + { + "epoch": 2.611996251171509, + "grad_norm": 54241.15234375, + "learning_rate": 2.1044695826040704e-05, + "loss": 2.1696, + "step": 13935 + }, + { + "epoch": 2.612183692596064, + "grad_norm": 63121.06640625, + "learning_rate": 2.103829000477765e-05, + "loss": 2.178, + "step": 13936 + }, + { + "epoch": 2.6123711340206186, + "grad_norm": 60201.64453125, + "learning_rate": 2.1031884898831272e-05, + "loss": 2.1293, + "step": 13937 + }, + { + "epoch": 2.6125585754451732, + "grad_norm": 54136.63671875, + "learning_rate": 2.1025480508359792e-05, + "loss": 2.0904, + "step": 13938 + }, + { + "epoch": 2.6127460168697283, + "grad_norm": 58300.34375, + "learning_rate": 2.1019076833521357e-05, + "loss": 2.1472, + "step": 13939 + }, + { + "epoch": 2.612933458294283, + "grad_norm": 49496.74609375, + "learning_rate": 2.101267387447417e-05, + "loss": 2.1688, + "step": 13940 + }, + { + "epoch": 2.613120899718838, + "grad_norm": 54329.7578125, + "learning_rate": 2.1006271631376363e-05, + "loss": 2.0855, + "step": 13941 + }, + { + "epoch": 2.6133083411433926, + "grad_norm": 55481.66015625, + "learning_rate": 2.0999870104386026e-05, + "loss": 2.2508, + "step": 13942 + }, + { + "epoch": 2.6134957825679477, + "grad_norm": 49881.0625, + "learning_rate": 2.0993469293661322e-05, + "loss": 2.0829, + "step": 13943 + }, + { + "epoch": 2.6136832239925023, + "grad_norm": 53365.23046875, + "learning_rate": 2.098706919936032e-05, + "loss": 2.1317, + "step": 13944 + }, + { + "epoch": 2.613870665417057, + "grad_norm": 51455.359375, + "learning_rate": 2.0980669821641074e-05, + "loss": 2.1128, + "step": 13945 + }, + { + "epoch": 2.614058106841612, + "grad_norm": 50507.69921875, + "learning_rate": 2.0974271160661656e-05, + "loss": 2.1186, + "step": 13946 + }, + { + "epoch": 2.614245548266167, + "grad_norm": 57486.6015625, + "learning_rate": 2.096787321658013e-05, + "loss": 2.0993, + "step": 13947 + }, + { + "epoch": 2.6144329896907217, + "grad_norm": 55050.91796875, + "learning_rate": 2.0961475989554495e-05, + "loss": 2.1625, + "step": 13948 + }, + { + "epoch": 2.6146204311152763, + "grad_norm": 59117.19921875, + "learning_rate": 2.0955079479742735e-05, + "loss": 2.0511, + "step": 13949 + }, + { + "epoch": 2.6148078725398314, + "grad_norm": 55830.328125, + "learning_rate": 2.0948683687302874e-05, + "loss": 2.0902, + "step": 13950 + }, + { + "epoch": 2.614995313964386, + "grad_norm": 59234.5234375, + "learning_rate": 2.094228861239286e-05, + "loss": 2.1037, + "step": 13951 + }, + { + "epoch": 2.615182755388941, + "grad_norm": 62030.75390625, + "learning_rate": 2.0935894255170618e-05, + "loss": 2.0729, + "step": 13952 + }, + { + "epoch": 2.6153701968134957, + "grad_norm": 56467.28515625, + "learning_rate": 2.092950061579411e-05, + "loss": 2.1219, + "step": 13953 + }, + { + "epoch": 2.6155576382380508, + "grad_norm": 54420.6328125, + "learning_rate": 2.0923107694421268e-05, + "loss": 2.143, + "step": 13954 + }, + { + "epoch": 2.6157450796626054, + "grad_norm": 54922.3828125, + "learning_rate": 2.0916715491209965e-05, + "loss": 2.182, + "step": 13955 + }, + { + "epoch": 2.61593252108716, + "grad_norm": 59904.296875, + "learning_rate": 2.0910324006318067e-05, + "loss": 2.0949, + "step": 13956 + }, + { + "epoch": 2.616119962511715, + "grad_norm": 60444.4453125, + "learning_rate": 2.0903933239903466e-05, + "loss": 2.1481, + "step": 13957 + }, + { + "epoch": 2.61630740393627, + "grad_norm": 50778.20703125, + "learning_rate": 2.089754319212399e-05, + "loss": 2.1541, + "step": 13958 + }, + { + "epoch": 2.6164948453608248, + "grad_norm": 52109.3515625, + "learning_rate": 2.0891153863137448e-05, + "loss": 2.1904, + "step": 13959 + }, + { + "epoch": 2.6166822867853794, + "grad_norm": 48837.3359375, + "learning_rate": 2.088476525310167e-05, + "loss": 2.1229, + "step": 13960 + }, + { + "epoch": 2.6168697282099345, + "grad_norm": 55118.08203125, + "learning_rate": 2.087837736217445e-05, + "loss": 2.1348, + "step": 13961 + }, + { + "epoch": 2.617057169634489, + "grad_norm": 56803.94140625, + "learning_rate": 2.0871990190513558e-05, + "loss": 2.0156, + "step": 13962 + }, + { + "epoch": 2.617244611059044, + "grad_norm": 52504.3125, + "learning_rate": 2.0865603738276723e-05, + "loss": 2.1558, + "step": 13963 + }, + { + "epoch": 2.617432052483599, + "grad_norm": 52554.84765625, + "learning_rate": 2.085921800562172e-05, + "loss": 2.096, + "step": 13964 + }, + { + "epoch": 2.617619493908154, + "grad_norm": 47808.40234375, + "learning_rate": 2.0852832992706238e-05, + "loss": 2.1957, + "step": 13965 + }, + { + "epoch": 2.6178069353327085, + "grad_norm": 53733.33984375, + "learning_rate": 2.0846448699688004e-05, + "loss": 2.1688, + "step": 13966 + }, + { + "epoch": 2.617994376757263, + "grad_norm": 54201.3046875, + "learning_rate": 2.0840065126724696e-05, + "loss": 2.1086, + "step": 13967 + }, + { + "epoch": 2.618181818181818, + "grad_norm": 53892.58203125, + "learning_rate": 2.083368227397395e-05, + "loss": 2.1712, + "step": 13968 + }, + { + "epoch": 2.6183692596063732, + "grad_norm": 51645.53125, + "learning_rate": 2.0827300141593436e-05, + "loss": 2.1852, + "step": 13969 + }, + { + "epoch": 2.618556701030928, + "grad_norm": 56283.33203125, + "learning_rate": 2.08209187297408e-05, + "loss": 2.204, + "step": 13970 + }, + { + "epoch": 2.6187441424554825, + "grad_norm": 56027.56640625, + "learning_rate": 2.081453803857365e-05, + "loss": 2.1169, + "step": 13971 + }, + { + "epoch": 2.6189315838800376, + "grad_norm": 52929.875, + "learning_rate": 2.0808158068249544e-05, + "loss": 2.1335, + "step": 13972 + }, + { + "epoch": 2.619119025304592, + "grad_norm": 56344.25390625, + "learning_rate": 2.0801778818926108e-05, + "loss": 2.0421, + "step": 13973 + }, + { + "epoch": 2.6193064667291472, + "grad_norm": 57668.5625, + "learning_rate": 2.079540029076088e-05, + "loss": 2.1547, + "step": 13974 + }, + { + "epoch": 2.619493908153702, + "grad_norm": 55798.98046875, + "learning_rate": 2.078902248391138e-05, + "loss": 2.1186, + "step": 13975 + }, + { + "epoch": 2.619681349578257, + "grad_norm": 53473.6953125, + "learning_rate": 2.0782645398535154e-05, + "loss": 2.1114, + "step": 13976 + }, + { + "epoch": 2.6198687910028116, + "grad_norm": 54663.3359375, + "learning_rate": 2.0776269034789725e-05, + "loss": 2.1344, + "step": 13977 + }, + { + "epoch": 2.620056232427366, + "grad_norm": 57595.0703125, + "learning_rate": 2.0769893392832556e-05, + "loss": 2.1479, + "step": 13978 + }, + { + "epoch": 2.6202436738519213, + "grad_norm": 52801.70703125, + "learning_rate": 2.0763518472821107e-05, + "loss": 2.2173, + "step": 13979 + }, + { + "epoch": 2.6204311152764763, + "grad_norm": 53659.8984375, + "learning_rate": 2.075714427491287e-05, + "loss": 2.0528, + "step": 13980 + }, + { + "epoch": 2.620618556701031, + "grad_norm": 54838.828125, + "learning_rate": 2.0750770799265252e-05, + "loss": 2.1372, + "step": 13981 + }, + { + "epoch": 2.6208059981255856, + "grad_norm": 58786.359375, + "learning_rate": 2.074439804603565e-05, + "loss": 2.1184, + "step": 13982 + }, + { + "epoch": 2.6209934395501406, + "grad_norm": 49663.96484375, + "learning_rate": 2.073802601538149e-05, + "loss": 2.1345, + "step": 13983 + }, + { + "epoch": 2.6211808809746953, + "grad_norm": 55423.0625, + "learning_rate": 2.0731654707460184e-05, + "loss": 2.1862, + "step": 13984 + }, + { + "epoch": 2.6213683223992503, + "grad_norm": 52372.2734375, + "learning_rate": 2.072528412242903e-05, + "loss": 2.1759, + "step": 13985 + }, + { + "epoch": 2.621555763823805, + "grad_norm": 55959.82421875, + "learning_rate": 2.0718914260445393e-05, + "loss": 2.1335, + "step": 13986 + }, + { + "epoch": 2.62174320524836, + "grad_norm": 58632.32421875, + "learning_rate": 2.0712545121666632e-05, + "loss": 2.1768, + "step": 13987 + }, + { + "epoch": 2.6219306466729146, + "grad_norm": 54063.0390625, + "learning_rate": 2.0706176706250037e-05, + "loss": 2.1387, + "step": 13988 + }, + { + "epoch": 2.6221180880974693, + "grad_norm": 65357.31640625, + "learning_rate": 2.0699809014352878e-05, + "loss": 2.1632, + "step": 13989 + }, + { + "epoch": 2.6223055295220243, + "grad_norm": 58108.7734375, + "learning_rate": 2.0693442046132466e-05, + "loss": 2.0831, + "step": 13990 + }, + { + "epoch": 2.6224929709465794, + "grad_norm": 56807.30859375, + "learning_rate": 2.068707580174602e-05, + "loss": 2.2371, + "step": 13991 + }, + { + "epoch": 2.622680412371134, + "grad_norm": 56268.9453125, + "learning_rate": 2.0680710281350817e-05, + "loss": 2.1666, + "step": 13992 + }, + { + "epoch": 2.6228678537956887, + "grad_norm": 57970.0, + "learning_rate": 2.0674345485104035e-05, + "loss": 2.0862, + "step": 13993 + }, + { + "epoch": 2.6230552952202437, + "grad_norm": 61714.83984375, + "learning_rate": 2.0667981413162923e-05, + "loss": 2.0824, + "step": 13994 + }, + { + "epoch": 2.6232427366447983, + "grad_norm": 59708.49609375, + "learning_rate": 2.0661618065684618e-05, + "loss": 2.0484, + "step": 13995 + }, + { + "epoch": 2.6234301780693534, + "grad_norm": 52014.8515625, + "learning_rate": 2.0655255442826327e-05, + "loss": 2.135, + "step": 13996 + }, + { + "epoch": 2.623617619493908, + "grad_norm": 57807.453125, + "learning_rate": 2.064889354474518e-05, + "loss": 2.1049, + "step": 13997 + }, + { + "epoch": 2.623805060918463, + "grad_norm": 54955.6328125, + "learning_rate": 2.0642532371598296e-05, + "loss": 2.0979, + "step": 13998 + }, + { + "epoch": 2.6239925023430177, + "grad_norm": 50502.19921875, + "learning_rate": 2.063617192354281e-05, + "loss": 2.0558, + "step": 13999 + }, + { + "epoch": 2.624179943767573, + "grad_norm": 52401.4765625, + "learning_rate": 2.0629812200735793e-05, + "loss": 2.1198, + "step": 14000 + }, + { + "epoch": 2.624179943767573, + "eval_loss": 2.263713836669922, + "eval_runtime": 126.1743, + "eval_samples_per_second": 40.016, + "eval_steps_per_second": 2.005, + "step": 14000 + }, + { + "epoch": 2.6243673851921274, + "grad_norm": 59624.3828125, + "learning_rate": 2.0623453203334347e-05, + "loss": 2.2521, + "step": 14001 + }, + { + "epoch": 2.6245548266166825, + "grad_norm": 49697.1484375, + "learning_rate": 2.061709493149551e-05, + "loss": 2.1429, + "step": 14002 + }, + { + "epoch": 2.624742268041237, + "grad_norm": 56267.03515625, + "learning_rate": 2.061073738537635e-05, + "loss": 2.1554, + "step": 14003 + }, + { + "epoch": 2.6249297094657917, + "grad_norm": 54451.84765625, + "learning_rate": 2.0604380565133868e-05, + "loss": 2.1617, + "step": 14004 + }, + { + "epoch": 2.625117150890347, + "grad_norm": 56528.99609375, + "learning_rate": 2.059802447092506e-05, + "loss": 2.1159, + "step": 14005 + }, + { + "epoch": 2.6253045923149014, + "grad_norm": 54553.5234375, + "learning_rate": 2.0591669102906924e-05, + "loss": 2.2705, + "step": 14006 + }, + { + "epoch": 2.6254920337394565, + "grad_norm": 55474.05078125, + "learning_rate": 2.0585314461236473e-05, + "loss": 2.1972, + "step": 14007 + }, + { + "epoch": 2.625679475164011, + "grad_norm": 53224.39453125, + "learning_rate": 2.057896054607058e-05, + "loss": 2.0864, + "step": 14008 + }, + { + "epoch": 2.625866916588566, + "grad_norm": 52834.95703125, + "learning_rate": 2.0572607357566213e-05, + "loss": 2.0809, + "step": 14009 + }, + { + "epoch": 2.626054358013121, + "grad_norm": 59414.96484375, + "learning_rate": 2.0566254895880315e-05, + "loss": 2.1494, + "step": 14010 + }, + { + "epoch": 2.626241799437676, + "grad_norm": 56935.59765625, + "learning_rate": 2.0559903161169757e-05, + "loss": 2.0835, + "step": 14011 + }, + { + "epoch": 2.6264292408622305, + "grad_norm": 54488.953125, + "learning_rate": 2.0553552153591405e-05, + "loss": 2.1596, + "step": 14012 + }, + { + "epoch": 2.6266166822867856, + "grad_norm": 52577.125, + "learning_rate": 2.0547201873302134e-05, + "loss": 2.0797, + "step": 14013 + }, + { + "epoch": 2.62680412371134, + "grad_norm": 56166.61328125, + "learning_rate": 2.0540852320458826e-05, + "loss": 2.1783, + "step": 14014 + }, + { + "epoch": 2.626991565135895, + "grad_norm": 53319.48046875, + "learning_rate": 2.053450349521824e-05, + "loss": 2.0957, + "step": 14015 + }, + { + "epoch": 2.62717900656045, + "grad_norm": 59220.46484375, + "learning_rate": 2.052815539773721e-05, + "loss": 2.042, + "step": 14016 + }, + { + "epoch": 2.627366447985005, + "grad_norm": 53616.91015625, + "learning_rate": 2.0521808028172552e-05, + "loss": 2.129, + "step": 14017 + }, + { + "epoch": 2.6275538894095596, + "grad_norm": 52781.3125, + "learning_rate": 2.0515461386681017e-05, + "loss": 2.1095, + "step": 14018 + }, + { + "epoch": 2.627741330834114, + "grad_norm": 53623.375, + "learning_rate": 2.050911547341934e-05, + "loss": 2.0917, + "step": 14019 + }, + { + "epoch": 2.6279287722586693, + "grad_norm": 53245.5, + "learning_rate": 2.0502770288544292e-05, + "loss": 2.1888, + "step": 14020 + }, + { + "epoch": 2.628116213683224, + "grad_norm": 55459.1171875, + "learning_rate": 2.049642583221256e-05, + "loss": 2.1281, + "step": 14021 + }, + { + "epoch": 2.628303655107779, + "grad_norm": 49965.49609375, + "learning_rate": 2.049008210458087e-05, + "loss": 2.1713, + "step": 14022 + }, + { + "epoch": 2.6284910965323336, + "grad_norm": 56691.9140625, + "learning_rate": 2.0483739105805876e-05, + "loss": 2.2868, + "step": 14023 + }, + { + "epoch": 2.6286785379568887, + "grad_norm": 59061.41015625, + "learning_rate": 2.0477396836044278e-05, + "loss": 2.0482, + "step": 14024 + }, + { + "epoch": 2.6288659793814433, + "grad_norm": 56725.828125, + "learning_rate": 2.047105529545268e-05, + "loss": 2.1792, + "step": 14025 + }, + { + "epoch": 2.629053420805998, + "grad_norm": 56861.62109375, + "learning_rate": 2.0464714484187752e-05, + "loss": 2.0255, + "step": 14026 + }, + { + "epoch": 2.629240862230553, + "grad_norm": 51924.921875, + "learning_rate": 2.0458374402406084e-05, + "loss": 2.1425, + "step": 14027 + }, + { + "epoch": 2.629428303655108, + "grad_norm": 52984.8671875, + "learning_rate": 2.0452035050264245e-05, + "loss": 2.1463, + "step": 14028 + }, + { + "epoch": 2.6296157450796627, + "grad_norm": 50312.0, + "learning_rate": 2.0445696427918848e-05, + "loss": 2.1476, + "step": 14029 + }, + { + "epoch": 2.6298031865042173, + "grad_norm": 51305.15625, + "learning_rate": 2.043935853552642e-05, + "loss": 2.0556, + "step": 14030 + }, + { + "epoch": 2.6299906279287724, + "grad_norm": 56228.50390625, + "learning_rate": 2.0433021373243523e-05, + "loss": 2.1469, + "step": 14031 + }, + { + "epoch": 2.630178069353327, + "grad_norm": 55248.234375, + "learning_rate": 2.0426684941226648e-05, + "loss": 2.1635, + "step": 14032 + }, + { + "epoch": 2.630365510777882, + "grad_norm": 58424.21875, + "learning_rate": 2.042034923963233e-05, + "loss": 2.1228, + "step": 14033 + }, + { + "epoch": 2.6305529522024367, + "grad_norm": 52488.37109375, + "learning_rate": 2.0414014268617037e-05, + "loss": 2.079, + "step": 14034 + }, + { + "epoch": 2.6307403936269917, + "grad_norm": 50878.9296875, + "learning_rate": 2.0407680028337213e-05, + "loss": 2.1572, + "step": 14035 + }, + { + "epoch": 2.6309278350515464, + "grad_norm": 53111.51171875, + "learning_rate": 2.0401346518949328e-05, + "loss": 2.1589, + "step": 14036 + }, + { + "epoch": 2.631115276476101, + "grad_norm": 52968.2109375, + "learning_rate": 2.0395013740609843e-05, + "loss": 2.124, + "step": 14037 + }, + { + "epoch": 2.631302717900656, + "grad_norm": 51968.30859375, + "learning_rate": 2.0388681693475098e-05, + "loss": 2.1223, + "step": 14038 + }, + { + "epoch": 2.631490159325211, + "grad_norm": 54964.59375, + "learning_rate": 2.0382350377701522e-05, + "loss": 2.1782, + "step": 14039 + }, + { + "epoch": 2.6316776007497658, + "grad_norm": 56202.61328125, + "learning_rate": 2.037601979344551e-05, + "loss": 2.0904, + "step": 14040 + }, + { + "epoch": 2.6318650421743204, + "grad_norm": 59288.80859375, + "learning_rate": 2.03696899408634e-05, + "loss": 2.2214, + "step": 14041 + }, + { + "epoch": 2.6320524835988754, + "grad_norm": 52528.69140625, + "learning_rate": 2.0363360820111522e-05, + "loss": 2.1558, + "step": 14042 + }, + { + "epoch": 2.63223992502343, + "grad_norm": 51731.43359375, + "learning_rate": 2.035703243134622e-05, + "loss": 2.1317, + "step": 14043 + }, + { + "epoch": 2.632427366447985, + "grad_norm": 55276.2890625, + "learning_rate": 2.0350704774723783e-05, + "loss": 2.0957, + "step": 14044 + }, + { + "epoch": 2.6326148078725398, + "grad_norm": 52190.1015625, + "learning_rate": 2.034437785040048e-05, + "loss": 2.1402, + "step": 14045 + }, + { + "epoch": 2.632802249297095, + "grad_norm": 50298.484375, + "learning_rate": 2.03380516585326e-05, + "loss": 2.177, + "step": 14046 + }, + { + "epoch": 2.6329896907216495, + "grad_norm": 48850.91015625, + "learning_rate": 2.0331726199276395e-05, + "loss": 2.1083, + "step": 14047 + }, + { + "epoch": 2.633177132146204, + "grad_norm": 52795.0234375, + "learning_rate": 2.03254014727881e-05, + "loss": 2.1358, + "step": 14048 + }, + { + "epoch": 2.633364573570759, + "grad_norm": 55020.8671875, + "learning_rate": 2.0319077479223886e-05, + "loss": 2.0477, + "step": 14049 + }, + { + "epoch": 2.633552014995314, + "grad_norm": 51481.625, + "learning_rate": 2.0312754218739998e-05, + "loss": 2.0712, + "step": 14050 + }, + { + "epoch": 2.633739456419869, + "grad_norm": 50139.17578125, + "learning_rate": 2.0306431691492572e-05, + "loss": 2.1048, + "step": 14051 + }, + { + "epoch": 2.6339268978444235, + "grad_norm": 56907.77734375, + "learning_rate": 2.0300109897637803e-05, + "loss": 2.0897, + "step": 14052 + }, + { + "epoch": 2.6341143392689785, + "grad_norm": 53380.6171875, + "learning_rate": 2.02937888373318e-05, + "loss": 2.1336, + "step": 14053 + }, + { + "epoch": 2.634301780693533, + "grad_norm": 52043.953125, + "learning_rate": 2.028746851073071e-05, + "loss": 2.1261, + "step": 14054 + }, + { + "epoch": 2.6344892221180882, + "grad_norm": 52962.20703125, + "learning_rate": 2.0281148917990628e-05, + "loss": 2.0755, + "step": 14055 + }, + { + "epoch": 2.634676663542643, + "grad_norm": 56500.8671875, + "learning_rate": 2.027483005926762e-05, + "loss": 2.1694, + "step": 14056 + }, + { + "epoch": 2.634864104967198, + "grad_norm": 57722.765625, + "learning_rate": 2.0268511934717782e-05, + "loss": 2.1152, + "step": 14057 + }, + { + "epoch": 2.6350515463917525, + "grad_norm": 51907.625, + "learning_rate": 2.0262194544497137e-05, + "loss": 2.1569, + "step": 14058 + }, + { + "epoch": 2.635238987816307, + "grad_norm": 55619.58984375, + "learning_rate": 2.025587788876175e-05, + "loss": 2.1079, + "step": 14059 + }, + { + "epoch": 2.6354264292408622, + "grad_norm": 55312.43359375, + "learning_rate": 2.0249561967667617e-05, + "loss": 2.118, + "step": 14060 + }, + { + "epoch": 2.6356138706654173, + "grad_norm": 54010.91796875, + "learning_rate": 2.024324678137071e-05, + "loss": 2.1365, + "step": 14061 + }, + { + "epoch": 2.635801312089972, + "grad_norm": 53697.62109375, + "learning_rate": 2.0236932330027032e-05, + "loss": 2.1034, + "step": 14062 + }, + { + "epoch": 2.6359887535145266, + "grad_norm": 55450.91796875, + "learning_rate": 2.023061861379255e-05, + "loss": 2.0793, + "step": 14063 + }, + { + "epoch": 2.6361761949390816, + "grad_norm": 57054.9765625, + "learning_rate": 2.0224305632823193e-05, + "loss": 2.1463, + "step": 14064 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 53741.29296875, + "learning_rate": 2.0217993387274864e-05, + "loss": 2.1782, + "step": 14065 + }, + { + "epoch": 2.6365510777881913, + "grad_norm": 53223.58203125, + "learning_rate": 2.0211681877303485e-05, + "loss": 2.1258, + "step": 14066 + }, + { + "epoch": 2.636738519212746, + "grad_norm": 54583.75, + "learning_rate": 2.020537110306498e-05, + "loss": 2.1072, + "step": 14067 + }, + { + "epoch": 2.636925960637301, + "grad_norm": 50370.02734375, + "learning_rate": 2.019906106471514e-05, + "loss": 2.1591, + "step": 14068 + }, + { + "epoch": 2.6371134020618556, + "grad_norm": 54079.4375, + "learning_rate": 2.0192751762409863e-05, + "loss": 2.1488, + "step": 14069 + }, + { + "epoch": 2.6373008434864103, + "grad_norm": 49747.484375, + "learning_rate": 2.0186443196304983e-05, + "loss": 2.1095, + "step": 14070 + }, + { + "epoch": 2.6374882849109653, + "grad_norm": 58251.91015625, + "learning_rate": 2.0180135366556302e-05, + "loss": 2.19, + "step": 14071 + }, + { + "epoch": 2.6376757263355204, + "grad_norm": 55608.0703125, + "learning_rate": 2.0173828273319602e-05, + "loss": 2.0803, + "step": 14072 + }, + { + "epoch": 2.637863167760075, + "grad_norm": 54474.62109375, + "learning_rate": 2.016752191675068e-05, + "loss": 2.1444, + "step": 14073 + }, + { + "epoch": 2.6380506091846296, + "grad_norm": 52553.35546875, + "learning_rate": 2.0161216297005296e-05, + "loss": 2.1678, + "step": 14074 + }, + { + "epoch": 2.6382380506091847, + "grad_norm": 58994.36328125, + "learning_rate": 2.0154911414239168e-05, + "loss": 2.1776, + "step": 14075 + }, + { + "epoch": 2.6384254920337393, + "grad_norm": 55693.09765625, + "learning_rate": 2.0148607268608033e-05, + "loss": 2.0874, + "step": 14076 + }, + { + "epoch": 2.6386129334582944, + "grad_norm": 55576.390625, + "learning_rate": 2.0142303860267607e-05, + "loss": 2.1439, + "step": 14077 + }, + { + "epoch": 2.638800374882849, + "grad_norm": 55497.984375, + "learning_rate": 2.0136001189373573e-05, + "loss": 2.1567, + "step": 14078 + }, + { + "epoch": 2.638987816307404, + "grad_norm": 52288.73828125, + "learning_rate": 2.012969925608157e-05, + "loss": 2.154, + "step": 14079 + }, + { + "epoch": 2.6391752577319587, + "grad_norm": 57626.8359375, + "learning_rate": 2.0123398060547283e-05, + "loss": 2.1598, + "step": 14080 + }, + { + "epoch": 2.6393626991565133, + "grad_norm": 54264.25390625, + "learning_rate": 2.011709760292633e-05, + "loss": 2.0593, + "step": 14081 + }, + { + "epoch": 2.6395501405810684, + "grad_norm": 59386.6328125, + "learning_rate": 2.01107978833743e-05, + "loss": 2.146, + "step": 14082 + }, + { + "epoch": 2.6397375820056235, + "grad_norm": 57144.6953125, + "learning_rate": 2.010449890204683e-05, + "loss": 2.0938, + "step": 14083 + }, + { + "epoch": 2.639925023430178, + "grad_norm": 54946.09375, + "learning_rate": 2.0098200659099464e-05, + "loss": 2.115, + "step": 14084 + }, + { + "epoch": 2.6401124648547327, + "grad_norm": 53573.4296875, + "learning_rate": 2.0091903154687784e-05, + "loss": 2.1577, + "step": 14085 + }, + { + "epoch": 2.640299906279288, + "grad_norm": 51445.24609375, + "learning_rate": 2.008560638896731e-05, + "loss": 2.13, + "step": 14086 + }, + { + "epoch": 2.6404873477038424, + "grad_norm": 55405.18359375, + "learning_rate": 2.0079310362093588e-05, + "loss": 2.1657, + "step": 14087 + }, + { + "epoch": 2.6406747891283975, + "grad_norm": 57522.52734375, + "learning_rate": 2.007301507422209e-05, + "loss": 2.0512, + "step": 14088 + }, + { + "epoch": 2.640862230552952, + "grad_norm": 50679.37890625, + "learning_rate": 2.0066720525508338e-05, + "loss": 2.1141, + "step": 14089 + }, + { + "epoch": 2.641049671977507, + "grad_norm": 53371.73828125, + "learning_rate": 2.0060426716107776e-05, + "loss": 2.1193, + "step": 14090 + }, + { + "epoch": 2.641237113402062, + "grad_norm": 49101.39453125, + "learning_rate": 2.0054133646175837e-05, + "loss": 2.1439, + "step": 14091 + }, + { + "epoch": 2.6414245548266164, + "grad_norm": 49985.41015625, + "learning_rate": 2.0047841315867978e-05, + "loss": 2.1259, + "step": 14092 + }, + { + "epoch": 2.6416119962511715, + "grad_norm": 51279.0, + "learning_rate": 2.004154972533962e-05, + "loss": 2.0924, + "step": 14093 + }, + { + "epoch": 2.6417994376757266, + "grad_norm": 60454.49609375, + "learning_rate": 2.0035258874746144e-05, + "loss": 2.1725, + "step": 14094 + }, + { + "epoch": 2.641986879100281, + "grad_norm": 53680.7265625, + "learning_rate": 2.00289687642429e-05, + "loss": 2.1969, + "step": 14095 + }, + { + "epoch": 2.642174320524836, + "grad_norm": 56481.953125, + "learning_rate": 2.0022679393985293e-05, + "loss": 2.1261, + "step": 14096 + }, + { + "epoch": 2.642361761949391, + "grad_norm": 53838.01953125, + "learning_rate": 2.001639076412864e-05, + "loss": 2.1803, + "step": 14097 + }, + { + "epoch": 2.6425492033739455, + "grad_norm": 55005.6484375, + "learning_rate": 2.001010287482824e-05, + "loss": 2.0886, + "step": 14098 + }, + { + "epoch": 2.6427366447985006, + "grad_norm": 53153.08984375, + "learning_rate": 2.0003815726239416e-05, + "loss": 2.1203, + "step": 14099 + }, + { + "epoch": 2.642924086223055, + "grad_norm": 63333.9765625, + "learning_rate": 1.999752931851749e-05, + "loss": 2.0998, + "step": 14100 + }, + { + "epoch": 2.6431115276476103, + "grad_norm": 55050.23046875, + "learning_rate": 1.9991243651817654e-05, + "loss": 2.1539, + "step": 14101 + }, + { + "epoch": 2.643298969072165, + "grad_norm": 52122.91015625, + "learning_rate": 1.9984958726295195e-05, + "loss": 2.0865, + "step": 14102 + }, + { + "epoch": 2.6434864104967195, + "grad_norm": 53579.6484375, + "learning_rate": 1.9978674542105357e-05, + "loss": 2.1498, + "step": 14103 + }, + { + "epoch": 2.6436738519212746, + "grad_norm": 55529.42578125, + "learning_rate": 1.9972391099403332e-05, + "loss": 2.0557, + "step": 14104 + }, + { + "epoch": 2.6438612933458296, + "grad_norm": 60861.21484375, + "learning_rate": 1.99661083983443e-05, + "loss": 2.1156, + "step": 14105 + }, + { + "epoch": 2.6440487347703843, + "grad_norm": 52423.734375, + "learning_rate": 1.995982643908345e-05, + "loss": 2.107, + "step": 14106 + }, + { + "epoch": 2.644236176194939, + "grad_norm": 57996.85546875, + "learning_rate": 1.995354522177596e-05, + "loss": 2.0558, + "step": 14107 + }, + { + "epoch": 2.644423617619494, + "grad_norm": 54078.77734375, + "learning_rate": 1.994726474657695e-05, + "loss": 2.1927, + "step": 14108 + }, + { + "epoch": 2.6446110590440486, + "grad_norm": 52130.09375, + "learning_rate": 1.994098501364151e-05, + "loss": 2.1, + "step": 14109 + }, + { + "epoch": 2.6447985004686037, + "grad_norm": 53479.4609375, + "learning_rate": 1.9934706023124793e-05, + "loss": 2.0522, + "step": 14110 + }, + { + "epoch": 2.6449859418931583, + "grad_norm": 55659.6328125, + "learning_rate": 1.992842777518185e-05, + "loss": 2.1463, + "step": 14111 + }, + { + "epoch": 2.6451733833177133, + "grad_norm": 50641.38671875, + "learning_rate": 1.9922150269967744e-05, + "loss": 2.0984, + "step": 14112 + }, + { + "epoch": 2.645360824742268, + "grad_norm": 59186.37109375, + "learning_rate": 1.9915873507637545e-05, + "loss": 2.0926, + "step": 14113 + }, + { + "epoch": 2.6455482661668226, + "grad_norm": 56904.41796875, + "learning_rate": 1.9909597488346243e-05, + "loss": 2.0986, + "step": 14114 + }, + { + "epoch": 2.6457357075913777, + "grad_norm": 55631.03125, + "learning_rate": 1.9903322212248892e-05, + "loss": 2.0519, + "step": 14115 + }, + { + "epoch": 2.6459231490159327, + "grad_norm": 55460.1328125, + "learning_rate": 1.989704767950044e-05, + "loss": 2.1214, + "step": 14116 + }, + { + "epoch": 2.6461105904404874, + "grad_norm": 52019.66796875, + "learning_rate": 1.9890773890255902e-05, + "loss": 2.1544, + "step": 14117 + }, + { + "epoch": 2.646298031865042, + "grad_norm": 56847.45703125, + "learning_rate": 1.9884500844670195e-05, + "loss": 2.1858, + "step": 14118 + }, + { + "epoch": 2.646485473289597, + "grad_norm": 52959.609375, + "learning_rate": 1.9878228542898285e-05, + "loss": 2.0534, + "step": 14119 + }, + { + "epoch": 2.6466729147141517, + "grad_norm": 53374.25390625, + "learning_rate": 1.987195698509508e-05, + "loss": 2.075, + "step": 14120 + }, + { + "epoch": 2.6468603561387067, + "grad_norm": 58009.1328125, + "learning_rate": 1.986568617141546e-05, + "loss": 2.183, + "step": 14121 + }, + { + "epoch": 2.6470477975632614, + "grad_norm": 49991.98828125, + "learning_rate": 1.9859416102014316e-05, + "loss": 2.1239, + "step": 14122 + }, + { + "epoch": 2.6472352389878164, + "grad_norm": 51016.12109375, + "learning_rate": 1.985314677704654e-05, + "loss": 2.1422, + "step": 14123 + }, + { + "epoch": 2.647422680412371, + "grad_norm": 57878.16796875, + "learning_rate": 1.984687819666695e-05, + "loss": 2.1635, + "step": 14124 + }, + { + "epoch": 2.647610121836926, + "grad_norm": 60431.41015625, + "learning_rate": 1.984061036103036e-05, + "loss": 2.0458, + "step": 14125 + }, + { + "epoch": 2.6477975632614807, + "grad_norm": 52601.015625, + "learning_rate": 1.9834343270291607e-05, + "loss": 2.1051, + "step": 14126 + }, + { + "epoch": 2.647985004686036, + "grad_norm": 55425.18359375, + "learning_rate": 1.982807692460546e-05, + "loss": 2.1302, + "step": 14127 + }, + { + "epoch": 2.6481724461105904, + "grad_norm": 51842.31640625, + "learning_rate": 1.9821811324126682e-05, + "loss": 2.1029, + "step": 14128 + }, + { + "epoch": 2.648359887535145, + "grad_norm": 51957.66796875, + "learning_rate": 1.9815546469010037e-05, + "loss": 2.0878, + "step": 14129 + }, + { + "epoch": 2.6485473289597, + "grad_norm": 52664.63671875, + "learning_rate": 1.9809282359410297e-05, + "loss": 2.1368, + "step": 14130 + }, + { + "epoch": 2.6487347703842548, + "grad_norm": 51091.62890625, + "learning_rate": 1.9803018995482104e-05, + "loss": 2.1007, + "step": 14131 + }, + { + "epoch": 2.64892221180881, + "grad_norm": 51235.5625, + "learning_rate": 1.9796756377380188e-05, + "loss": 2.1325, + "step": 14132 + }, + { + "epoch": 2.6491096532333644, + "grad_norm": 57448.375, + "learning_rate": 1.979049450525925e-05, + "loss": 2.1466, + "step": 14133 + }, + { + "epoch": 2.6492970946579195, + "grad_norm": 54149.03125, + "learning_rate": 1.9784233379273927e-05, + "loss": 2.1506, + "step": 14134 + }, + { + "epoch": 2.649484536082474, + "grad_norm": 57713.2109375, + "learning_rate": 1.9777972999578847e-05, + "loss": 2.1421, + "step": 14135 + }, + { + "epoch": 2.649671977507029, + "grad_norm": 57032.875, + "learning_rate": 1.9771713366328676e-05, + "loss": 2.0322, + "step": 14136 + }, + { + "epoch": 2.649859418931584, + "grad_norm": 52787.86328125, + "learning_rate": 1.9765454479677987e-05, + "loss": 2.1407, + "step": 14137 + }, + { + "epoch": 2.650046860356139, + "grad_norm": 55121.5390625, + "learning_rate": 1.9759196339781356e-05, + "loss": 2.1056, + "step": 14138 + }, + { + "epoch": 2.6502343017806935, + "grad_norm": 58262.4453125, + "learning_rate": 1.9752938946793363e-05, + "loss": 2.0852, + "step": 14139 + }, + { + "epoch": 2.650421743205248, + "grad_norm": 52392.26171875, + "learning_rate": 1.9746682300868586e-05, + "loss": 2.1149, + "step": 14140 + }, + { + "epoch": 2.650609184629803, + "grad_norm": 58344.3671875, + "learning_rate": 1.9740426402161532e-05, + "loss": 2.122, + "step": 14141 + }, + { + "epoch": 2.6507966260543583, + "grad_norm": 57004.18359375, + "learning_rate": 1.9734171250826695e-05, + "loss": 2.1266, + "step": 14142 + }, + { + "epoch": 2.650984067478913, + "grad_norm": 52453.36328125, + "learning_rate": 1.97279168470186e-05, + "loss": 2.1569, + "step": 14143 + }, + { + "epoch": 2.6511715089034675, + "grad_norm": 55719.01953125, + "learning_rate": 1.97216631908917e-05, + "loss": 2.0839, + "step": 14144 + }, + { + "epoch": 2.6513589503280226, + "grad_norm": 51292.5625, + "learning_rate": 1.9715410282600476e-05, + "loss": 2.0878, + "step": 14145 + }, + { + "epoch": 2.6515463917525772, + "grad_norm": 51273.33203125, + "learning_rate": 1.9709158122299343e-05, + "loss": 2.1301, + "step": 14146 + }, + { + "epoch": 2.6517338331771323, + "grad_norm": 54905.7109375, + "learning_rate": 1.9702906710142744e-05, + "loss": 2.1753, + "step": 14147 + }, + { + "epoch": 2.651921274601687, + "grad_norm": 54175.69921875, + "learning_rate": 1.9696656046285052e-05, + "loss": 2.0981, + "step": 14148 + }, + { + "epoch": 2.652108716026242, + "grad_norm": 51325.82421875, + "learning_rate": 1.969040613088068e-05, + "loss": 2.0815, + "step": 14149 + }, + { + "epoch": 2.6522961574507966, + "grad_norm": 52979.390625, + "learning_rate": 1.9684156964083983e-05, + "loss": 2.1627, + "step": 14150 + }, + { + "epoch": 2.6524835988753512, + "grad_norm": 51356.625, + "learning_rate": 1.967790854604929e-05, + "loss": 2.1539, + "step": 14151 + }, + { + "epoch": 2.6526710402999063, + "grad_norm": 53324.609375, + "learning_rate": 1.9671660876930954e-05, + "loss": 2.0871, + "step": 14152 + }, + { + "epoch": 2.6528584817244614, + "grad_norm": 56417.40625, + "learning_rate": 1.9665413956883277e-05, + "loss": 2.1126, + "step": 14153 + }, + { + "epoch": 2.653045923149016, + "grad_norm": 58614.265625, + "learning_rate": 1.9659167786060527e-05, + "loss": 2.2053, + "step": 14154 + }, + { + "epoch": 2.6532333645735706, + "grad_norm": 54174.5078125, + "learning_rate": 1.9652922364616994e-05, + "loss": 2.2155, + "step": 14155 + }, + { + "epoch": 2.6534208059981257, + "grad_norm": 53306.70703125, + "learning_rate": 1.9646677692706956e-05, + "loss": 2.0668, + "step": 14156 + }, + { + "epoch": 2.6536082474226803, + "grad_norm": 54654.09765625, + "learning_rate": 1.9640433770484618e-05, + "loss": 2.0815, + "step": 14157 + }, + { + "epoch": 2.6537956888472354, + "grad_norm": 54826.4375, + "learning_rate": 1.963419059810419e-05, + "loss": 2.1283, + "step": 14158 + }, + { + "epoch": 2.65398313027179, + "grad_norm": 51059.3984375, + "learning_rate": 1.9627948175719885e-05, + "loss": 2.0916, + "step": 14159 + }, + { + "epoch": 2.654170571696345, + "grad_norm": 52333.10546875, + "learning_rate": 1.9621706503485913e-05, + "loss": 2.1088, + "step": 14160 + }, + { + "epoch": 2.6543580131208997, + "grad_norm": 55081.6640625, + "learning_rate": 1.9615465581556375e-05, + "loss": 2.1593, + "step": 14161 + }, + { + "epoch": 2.6545454545454543, + "grad_norm": 59717.0546875, + "learning_rate": 1.9609225410085442e-05, + "loss": 2.0629, + "step": 14162 + }, + { + "epoch": 2.6547328959700094, + "grad_norm": 49881.5078125, + "learning_rate": 1.9602985989227256e-05, + "loss": 2.1595, + "step": 14163 + }, + { + "epoch": 2.6549203373945645, + "grad_norm": 57810.21484375, + "learning_rate": 1.9596747319135906e-05, + "loss": 2.0702, + "step": 14164 + }, + { + "epoch": 2.655107778819119, + "grad_norm": 59800.546875, + "learning_rate": 1.959050939996546e-05, + "loss": 2.1009, + "step": 14165 + }, + { + "epoch": 2.6552952202436737, + "grad_norm": 56549.0390625, + "learning_rate": 1.9584272231870023e-05, + "loss": 2.2429, + "step": 14166 + }, + { + "epoch": 2.6554826616682288, + "grad_norm": 56515.29296875, + "learning_rate": 1.9578035815003627e-05, + "loss": 2.1295, + "step": 14167 + }, + { + "epoch": 2.6556701030927834, + "grad_norm": 50696.80859375, + "learning_rate": 1.9571800149520287e-05, + "loss": 2.1173, + "step": 14168 + }, + { + "epoch": 2.6558575445173385, + "grad_norm": 53910.0703125, + "learning_rate": 1.9565565235574034e-05, + "loss": 2.1327, + "step": 14169 + }, + { + "epoch": 2.656044985941893, + "grad_norm": 57996.36328125, + "learning_rate": 1.9559331073318876e-05, + "loss": 2.1056, + "step": 14170 + }, + { + "epoch": 2.656232427366448, + "grad_norm": 54556.94921875, + "learning_rate": 1.9553097662908777e-05, + "loss": 2.0918, + "step": 14171 + }, + { + "epoch": 2.656419868791003, + "grad_norm": 57205.92578125, + "learning_rate": 1.9546865004497665e-05, + "loss": 2.0879, + "step": 14172 + }, + { + "epoch": 2.6566073102155574, + "grad_norm": 54916.04296875, + "learning_rate": 1.954063309823953e-05, + "loss": 2.1279, + "step": 14173 + }, + { + "epoch": 2.6567947516401125, + "grad_norm": 57636.6875, + "learning_rate": 1.9534401944288244e-05, + "loss": 2.1216, + "step": 14174 + }, + { + "epoch": 2.6569821930646675, + "grad_norm": 51545.7109375, + "learning_rate": 1.952817154279775e-05, + "loss": 2.1259, + "step": 14175 + }, + { + "epoch": 2.657169634489222, + "grad_norm": 54662.01953125, + "learning_rate": 1.9521941893921912e-05, + "loss": 2.1346, + "step": 14176 + }, + { + "epoch": 2.657357075913777, + "grad_norm": 53659.890625, + "learning_rate": 1.951571299781458e-05, + "loss": 2.1151, + "step": 14177 + }, + { + "epoch": 2.657544517338332, + "grad_norm": 55956.47265625, + "learning_rate": 1.950948485462963e-05, + "loss": 2.1943, + "step": 14178 + }, + { + "epoch": 2.6577319587628865, + "grad_norm": 49792.47265625, + "learning_rate": 1.9503257464520846e-05, + "loss": 2.0762, + "step": 14179 + }, + { + "epoch": 2.6579194001874415, + "grad_norm": 53313.03125, + "learning_rate": 1.9497030827642088e-05, + "loss": 2.1676, + "step": 14180 + }, + { + "epoch": 2.658106841611996, + "grad_norm": 56406.2109375, + "learning_rate": 1.94908049441471e-05, + "loss": 2.0839, + "step": 14181 + }, + { + "epoch": 2.6582942830365512, + "grad_norm": 51969.77734375, + "learning_rate": 1.9484579814189698e-05, + "loss": 2.0892, + "step": 14182 + }, + { + "epoch": 2.658481724461106, + "grad_norm": 54845.3671875, + "learning_rate": 1.9478355437923606e-05, + "loss": 2.1219, + "step": 14183 + }, + { + "epoch": 2.6586691658856605, + "grad_norm": 51790.03125, + "learning_rate": 1.9472131815502547e-05, + "loss": 2.1257, + "step": 14184 + }, + { + "epoch": 2.6588566073102156, + "grad_norm": 54321.79296875, + "learning_rate": 1.9465908947080252e-05, + "loss": 2.1191, + "step": 14185 + }, + { + "epoch": 2.6590440487347706, + "grad_norm": 52664.921875, + "learning_rate": 1.945968683281043e-05, + "loss": 2.0997, + "step": 14186 + }, + { + "epoch": 2.6592314901593253, + "grad_norm": 53904.09375, + "learning_rate": 1.9453465472846755e-05, + "loss": 2.1231, + "step": 14187 + }, + { + "epoch": 2.65941893158388, + "grad_norm": 53943.125, + "learning_rate": 1.9447244867342855e-05, + "loss": 2.1237, + "step": 14188 + }, + { + "epoch": 2.659606373008435, + "grad_norm": 53712.9140625, + "learning_rate": 1.944102501645242e-05, + "loss": 2.1797, + "step": 14189 + }, + { + "epoch": 2.6597938144329896, + "grad_norm": 54472.265625, + "learning_rate": 1.9434805920329042e-05, + "loss": 2.1154, + "step": 14190 + }, + { + "epoch": 2.6599812558575446, + "grad_norm": 58214.08984375, + "learning_rate": 1.9428587579126306e-05, + "loss": 2.099, + "step": 14191 + }, + { + "epoch": 2.6601686972820993, + "grad_norm": 53833.94921875, + "learning_rate": 1.942236999299783e-05, + "loss": 2.2425, + "step": 14192 + }, + { + "epoch": 2.6603561387066543, + "grad_norm": 52889.40625, + "learning_rate": 1.94161531620972e-05, + "loss": 2.1787, + "step": 14193 + }, + { + "epoch": 2.660543580131209, + "grad_norm": 52552.078125, + "learning_rate": 1.94099370865779e-05, + "loss": 2.127, + "step": 14194 + }, + { + "epoch": 2.6607310215557636, + "grad_norm": 50678.9140625, + "learning_rate": 1.9403721766593497e-05, + "loss": 2.1306, + "step": 14195 + }, + { + "epoch": 2.6609184629803186, + "grad_norm": 54103.44140625, + "learning_rate": 1.939750720229751e-05, + "loss": 2.1189, + "step": 14196 + }, + { + "epoch": 2.6611059044048737, + "grad_norm": 56619.26171875, + "learning_rate": 1.9391293393843425e-05, + "loss": 2.1062, + "step": 14197 + }, + { + "epoch": 2.6612933458294283, + "grad_norm": 62668.94140625, + "learning_rate": 1.938508034138468e-05, + "loss": 2.1182, + "step": 14198 + }, + { + "epoch": 2.661480787253983, + "grad_norm": 50783.16796875, + "learning_rate": 1.9378868045074768e-05, + "loss": 2.0957, + "step": 14199 + }, + { + "epoch": 2.661668228678538, + "grad_norm": 51817.37109375, + "learning_rate": 1.9372656505067127e-05, + "loss": 2.1313, + "step": 14200 + }, + { + "epoch": 2.6618556701030927, + "grad_norm": 52434.515625, + "learning_rate": 1.9366445721515165e-05, + "loss": 2.0599, + "step": 14201 + }, + { + "epoch": 2.6620431115276477, + "grad_norm": 51638.50390625, + "learning_rate": 1.9360235694572255e-05, + "loss": 2.1714, + "step": 14202 + }, + { + "epoch": 2.6622305529522023, + "grad_norm": 54718.05078125, + "learning_rate": 1.935402642439182e-05, + "loss": 2.1198, + "step": 14203 + }, + { + "epoch": 2.6624179943767574, + "grad_norm": 55354.8515625, + "learning_rate": 1.934781791112718e-05, + "loss": 2.1228, + "step": 14204 + }, + { + "epoch": 2.662605435801312, + "grad_norm": 55090.19921875, + "learning_rate": 1.934161015493171e-05, + "loss": 2.1023, + "step": 14205 + }, + { + "epoch": 2.6627928772258667, + "grad_norm": 54060.66015625, + "learning_rate": 1.9335403155958725e-05, + "loss": 2.0868, + "step": 14206 + }, + { + "epoch": 2.6629803186504217, + "grad_norm": 51410.07421875, + "learning_rate": 1.932919691436151e-05, + "loss": 2.1283, + "step": 14207 + }, + { + "epoch": 2.663167760074977, + "grad_norm": 56839.55078125, + "learning_rate": 1.9322991430293374e-05, + "loss": 2.1849, + "step": 14208 + }, + { + "epoch": 2.6633552014995314, + "grad_norm": 58813.109375, + "learning_rate": 1.9316786703907564e-05, + "loss": 2.206, + "step": 14209 + }, + { + "epoch": 2.663542642924086, + "grad_norm": 50428.140625, + "learning_rate": 1.9310582735357357e-05, + "loss": 2.14, + "step": 14210 + }, + { + "epoch": 2.663730084348641, + "grad_norm": 54121.18359375, + "learning_rate": 1.9304379524795956e-05, + "loss": 2.1145, + "step": 14211 + }, + { + "epoch": 2.6639175257731957, + "grad_norm": 61467.87109375, + "learning_rate": 1.9298177072376595e-05, + "loss": 2.1843, + "step": 14212 + }, + { + "epoch": 2.664104967197751, + "grad_norm": 55590.8359375, + "learning_rate": 1.9291975378252452e-05, + "loss": 2.1597, + "step": 14213 + }, + { + "epoch": 2.6642924086223054, + "grad_norm": 60581.85546875, + "learning_rate": 1.9285774442576693e-05, + "loss": 2.0626, + "step": 14214 + }, + { + "epoch": 2.6644798500468605, + "grad_norm": 56441.17578125, + "learning_rate": 1.927957426550248e-05, + "loss": 2.06, + "step": 14215 + }, + { + "epoch": 2.664667291471415, + "grad_norm": 54258.41796875, + "learning_rate": 1.927337484718299e-05, + "loss": 2.1315, + "step": 14216 + }, + { + "epoch": 2.6648547328959697, + "grad_norm": 59002.74609375, + "learning_rate": 1.926717618777127e-05, + "loss": 2.1822, + "step": 14217 + }, + { + "epoch": 2.665042174320525, + "grad_norm": 53234.40625, + "learning_rate": 1.9260978287420445e-05, + "loss": 2.1208, + "step": 14218 + }, + { + "epoch": 2.66522961574508, + "grad_norm": 55482.671875, + "learning_rate": 1.9254781146283622e-05, + "loss": 2.123, + "step": 14219 + }, + { + "epoch": 2.6654170571696345, + "grad_norm": 54250.61328125, + "learning_rate": 1.924858476451385e-05, + "loss": 2.0752, + "step": 14220 + }, + { + "epoch": 2.665604498594189, + "grad_norm": 55247.43359375, + "learning_rate": 1.9242389142264137e-05, + "loss": 2.161, + "step": 14221 + }, + { + "epoch": 2.665791940018744, + "grad_norm": 53751.4765625, + "learning_rate": 1.9236194279687542e-05, + "loss": 2.0556, + "step": 14222 + }, + { + "epoch": 2.665979381443299, + "grad_norm": 51784.04296875, + "learning_rate": 1.9230000176937096e-05, + "loss": 2.1576, + "step": 14223 + }, + { + "epoch": 2.666166822867854, + "grad_norm": 51877.390625, + "learning_rate": 1.922380683416572e-05, + "loss": 2.17, + "step": 14224 + }, + { + "epoch": 2.6663542642924085, + "grad_norm": 54993.81640625, + "learning_rate": 1.9217614251526405e-05, + "loss": 2.1542, + "step": 14225 + }, + { + "epoch": 2.6665417057169636, + "grad_norm": 51545.58984375, + "learning_rate": 1.9211422429172134e-05, + "loss": 2.1267, + "step": 14226 + }, + { + "epoch": 2.666729147141518, + "grad_norm": 57357.3046875, + "learning_rate": 1.9205231367255805e-05, + "loss": 2.1456, + "step": 14227 + }, + { + "epoch": 2.666916588566073, + "grad_norm": 53799.12890625, + "learning_rate": 1.9199041065930316e-05, + "loss": 2.0874, + "step": 14228 + }, + { + "epoch": 2.667104029990628, + "grad_norm": 54646.234375, + "learning_rate": 1.9192851525348604e-05, + "loss": 2.097, + "step": 14229 + }, + { + "epoch": 2.667291471415183, + "grad_norm": 50167.37109375, + "learning_rate": 1.9186662745663496e-05, + "loss": 2.1678, + "step": 14230 + }, + { + "epoch": 2.6674789128397376, + "grad_norm": 50850.3828125, + "learning_rate": 1.9180474727027887e-05, + "loss": 2.0974, + "step": 14231 + }, + { + "epoch": 2.667666354264292, + "grad_norm": 55599.55859375, + "learning_rate": 1.917428746959457e-05, + "loss": 2.2825, + "step": 14232 + }, + { + "epoch": 2.6678537956888473, + "grad_norm": 52820.640625, + "learning_rate": 1.9168100973516413e-05, + "loss": 2.062, + "step": 14233 + }, + { + "epoch": 2.668041237113402, + "grad_norm": 48177.2421875, + "learning_rate": 1.9161915238946183e-05, + "loss": 2.0825, + "step": 14234 + }, + { + "epoch": 2.668228678537957, + "grad_norm": 51840.828125, + "learning_rate": 1.915573026603665e-05, + "loss": 2.0724, + "step": 14235 + }, + { + "epoch": 2.6684161199625116, + "grad_norm": 52192.57421875, + "learning_rate": 1.91495460549406e-05, + "loss": 2.1271, + "step": 14236 + }, + { + "epoch": 2.6686035613870667, + "grad_norm": 49892.453125, + "learning_rate": 1.914336260581075e-05, + "loss": 2.1271, + "step": 14237 + }, + { + "epoch": 2.6687910028116213, + "grad_norm": 58599.51953125, + "learning_rate": 1.9137179918799856e-05, + "loss": 2.0944, + "step": 14238 + }, + { + "epoch": 2.668978444236176, + "grad_norm": 58939.67578125, + "learning_rate": 1.9130997994060583e-05, + "loss": 2.1219, + "step": 14239 + }, + { + "epoch": 2.669165885660731, + "grad_norm": 53798.64453125, + "learning_rate": 1.9124816831745663e-05, + "loss": 2.1338, + "step": 14240 + }, + { + "epoch": 2.669353327085286, + "grad_norm": 50301.56640625, + "learning_rate": 1.9118636432007713e-05, + "loss": 2.1764, + "step": 14241 + }, + { + "epoch": 2.6695407685098407, + "grad_norm": 51047.02734375, + "learning_rate": 1.9112456794999422e-05, + "loss": 2.1547, + "step": 14242 + }, + { + "epoch": 2.6697282099343953, + "grad_norm": 57440.0234375, + "learning_rate": 1.9106277920873404e-05, + "loss": 2.2286, + "step": 14243 + }, + { + "epoch": 2.6699156513589504, + "grad_norm": 53624.44921875, + "learning_rate": 1.9100099809782252e-05, + "loss": 2.1318, + "step": 14244 + }, + { + "epoch": 2.670103092783505, + "grad_norm": 55005.11328125, + "learning_rate": 1.9093922461878566e-05, + "loss": 2.1838, + "step": 14245 + }, + { + "epoch": 2.67029053420806, + "grad_norm": 51564.59765625, + "learning_rate": 1.9087745877314968e-05, + "loss": 2.1858, + "step": 14246 + }, + { + "epoch": 2.6704779756326147, + "grad_norm": 56554.0390625, + "learning_rate": 1.9081570056243925e-05, + "loss": 2.0736, + "step": 14247 + }, + { + "epoch": 2.6706654170571698, + "grad_norm": 56853.83984375, + "learning_rate": 1.907539499881802e-05, + "loss": 2.1069, + "step": 14248 + }, + { + "epoch": 2.6708528584817244, + "grad_norm": 58576.99609375, + "learning_rate": 1.9069220705189777e-05, + "loss": 2.1282, + "step": 14249 + }, + { + "epoch": 2.6710402999062794, + "grad_norm": 51217.15625, + "learning_rate": 1.906304717551169e-05, + "loss": 2.1417, + "step": 14250 + }, + { + "epoch": 2.671227741330834, + "grad_norm": 53350.484375, + "learning_rate": 1.9056874409936197e-05, + "loss": 2.1323, + "step": 14251 + }, + { + "epoch": 2.671415182755389, + "grad_norm": 49199.55078125, + "learning_rate": 1.90507024086158e-05, + "loss": 2.1126, + "step": 14252 + }, + { + "epoch": 2.6716026241799438, + "grad_norm": 59822.30078125, + "learning_rate": 1.9044531171702955e-05, + "loss": 2.0503, + "step": 14253 + }, + { + "epoch": 2.6717900656044984, + "grad_norm": 48995.40234375, + "learning_rate": 1.9038360699350023e-05, + "loss": 2.1128, + "step": 14254 + }, + { + "epoch": 2.6719775070290535, + "grad_norm": 52716.54296875, + "learning_rate": 1.9032190991709437e-05, + "loss": 2.1152, + "step": 14255 + }, + { + "epoch": 2.6721649484536085, + "grad_norm": 54228.72265625, + "learning_rate": 1.9026022048933606e-05, + "loss": 2.1204, + "step": 14256 + }, + { + "epoch": 2.672352389878163, + "grad_norm": 59247.5546875, + "learning_rate": 1.9019853871174866e-05, + "loss": 2.1217, + "step": 14257 + }, + { + "epoch": 2.6725398313027178, + "grad_norm": 56550.30859375, + "learning_rate": 1.9013686458585554e-05, + "loss": 2.1292, + "step": 14258 + }, + { + "epoch": 2.672727272727273, + "grad_norm": 53347.54296875, + "learning_rate": 1.900751981131803e-05, + "loss": 2.0975, + "step": 14259 + }, + { + "epoch": 2.6729147141518275, + "grad_norm": 51922.0078125, + "learning_rate": 1.9001353929524567e-05, + "loss": 2.1356, + "step": 14260 + }, + { + "epoch": 2.6731021555763825, + "grad_norm": 56500.1328125, + "learning_rate": 1.899518881335749e-05, + "loss": 2.1024, + "step": 14261 + }, + { + "epoch": 2.673289597000937, + "grad_norm": 53324.15625, + "learning_rate": 1.898902446296903e-05, + "loss": 2.1314, + "step": 14262 + }, + { + "epoch": 2.6734770384254922, + "grad_norm": 52645.53515625, + "learning_rate": 1.8982860878511478e-05, + "loss": 2.138, + "step": 14263 + }, + { + "epoch": 2.673664479850047, + "grad_norm": 52681.1953125, + "learning_rate": 1.897669806013705e-05, + "loss": 2.1695, + "step": 14264 + }, + { + "epoch": 2.6738519212746015, + "grad_norm": 54222.3671875, + "learning_rate": 1.8970536007997942e-05, + "loss": 2.0675, + "step": 14265 + }, + { + "epoch": 2.6740393626991565, + "grad_norm": 56149.3515625, + "learning_rate": 1.8964374722246376e-05, + "loss": 2.1327, + "step": 14266 + }, + { + "epoch": 2.6742268041237116, + "grad_norm": 59854.63671875, + "learning_rate": 1.89582142030345e-05, + "loss": 2.1102, + "step": 14267 + }, + { + "epoch": 2.6744142455482662, + "grad_norm": 53725.96875, + "learning_rate": 1.8952054450514506e-05, + "loss": 2.1001, + "step": 14268 + }, + { + "epoch": 2.674601686972821, + "grad_norm": 54068.69140625, + "learning_rate": 1.8945895464838516e-05, + "loss": 2.0361, + "step": 14269 + }, + { + "epoch": 2.674789128397376, + "grad_norm": 52946.8515625, + "learning_rate": 1.8939737246158622e-05, + "loss": 2.1613, + "step": 14270 + }, + { + "epoch": 2.6749765698219305, + "grad_norm": 54143.46484375, + "learning_rate": 1.8933579794626948e-05, + "loss": 2.111, + "step": 14271 + }, + { + "epoch": 2.6751640112464856, + "grad_norm": 54973.5703125, + "learning_rate": 1.8927423110395597e-05, + "loss": 2.1454, + "step": 14272 + }, + { + "epoch": 2.6753514526710402, + "grad_norm": 57361.90234375, + "learning_rate": 1.8921267193616608e-05, + "loss": 2.162, + "step": 14273 + }, + { + "epoch": 2.6755388940955953, + "grad_norm": 53989.609375, + "learning_rate": 1.8915112044442007e-05, + "loss": 2.1131, + "step": 14274 + }, + { + "epoch": 2.67572633552015, + "grad_norm": 50898.80078125, + "learning_rate": 1.8908957663023852e-05, + "loss": 2.1311, + "step": 14275 + }, + { + "epoch": 2.6759137769447046, + "grad_norm": 51137.58984375, + "learning_rate": 1.8902804049514133e-05, + "loss": 2.2033, + "step": 14276 + }, + { + "epoch": 2.6761012183692596, + "grad_norm": 56889.2109375, + "learning_rate": 1.8896651204064824e-05, + "loss": 2.0891, + "step": 14277 + }, + { + "epoch": 2.6762886597938147, + "grad_norm": 52971.20703125, + "learning_rate": 1.88904991268279e-05, + "loss": 2.0909, + "step": 14278 + }, + { + "epoch": 2.6764761012183693, + "grad_norm": 54853.69921875, + "learning_rate": 1.888434781795533e-05, + "loss": 2.1711, + "step": 14279 + }, + { + "epoch": 2.676663542642924, + "grad_norm": 52050.0390625, + "learning_rate": 1.8878197277599026e-05, + "loss": 2.1466, + "step": 14280 + }, + { + "epoch": 2.676850984067479, + "grad_norm": 55091.27734375, + "learning_rate": 1.8872047505910882e-05, + "loss": 2.0734, + "step": 14281 + }, + { + "epoch": 2.6770384254920336, + "grad_norm": 52662.46484375, + "learning_rate": 1.8865898503042828e-05, + "loss": 2.176, + "step": 14282 + }, + { + "epoch": 2.6772258669165887, + "grad_norm": 57555.48828125, + "learning_rate": 1.885975026914671e-05, + "loss": 2.0875, + "step": 14283 + }, + { + "epoch": 2.6774133083411433, + "grad_norm": 58539.68359375, + "learning_rate": 1.885360280437437e-05, + "loss": 2.089, + "step": 14284 + }, + { + "epoch": 2.6776007497656984, + "grad_norm": 51473.30859375, + "learning_rate": 1.8847456108877664e-05, + "loss": 2.1521, + "step": 14285 + }, + { + "epoch": 2.677788191190253, + "grad_norm": 52505.109375, + "learning_rate": 1.8841310182808415e-05, + "loss": 2.1165, + "step": 14286 + }, + { + "epoch": 2.6779756326148076, + "grad_norm": 52625.71484375, + "learning_rate": 1.8835165026318408e-05, + "loss": 2.1171, + "step": 14287 + }, + { + "epoch": 2.6781630740393627, + "grad_norm": 53372.80859375, + "learning_rate": 1.8829020639559403e-05, + "loss": 2.0822, + "step": 14288 + }, + { + "epoch": 2.678350515463918, + "grad_norm": 53532.1015625, + "learning_rate": 1.882287702268319e-05, + "loss": 2.1193, + "step": 14289 + }, + { + "epoch": 2.6785379568884724, + "grad_norm": 53542.1484375, + "learning_rate": 1.8816734175841487e-05, + "loss": 2.1582, + "step": 14290 + }, + { + "epoch": 2.678725398313027, + "grad_norm": 51121.24609375, + "learning_rate": 1.8810592099186007e-05, + "loss": 2.1609, + "step": 14291 + }, + { + "epoch": 2.678912839737582, + "grad_norm": 59360.61328125, + "learning_rate": 1.8804450792868473e-05, + "loss": 2.1911, + "step": 14292 + }, + { + "epoch": 2.6791002811621367, + "grad_norm": 54990.19140625, + "learning_rate": 1.8798310257040567e-05, + "loss": 2.0764, + "step": 14293 + }, + { + "epoch": 2.679287722586692, + "grad_norm": 52459.73046875, + "learning_rate": 1.8792170491853944e-05, + "loss": 2.1288, + "step": 14294 + }, + { + "epoch": 2.6794751640112464, + "grad_norm": 63035.80078125, + "learning_rate": 1.878603149746023e-05, + "loss": 2.1923, + "step": 14295 + }, + { + "epoch": 2.6796626054358015, + "grad_norm": 53952.7265625, + "learning_rate": 1.8779893274011096e-05, + "loss": 2.1808, + "step": 14296 + }, + { + "epoch": 2.679850046860356, + "grad_norm": 54874.8671875, + "learning_rate": 1.8773755821658097e-05, + "loss": 2.104, + "step": 14297 + }, + { + "epoch": 2.6800374882849107, + "grad_norm": 52404.80859375, + "learning_rate": 1.8767619140552854e-05, + "loss": 2.1165, + "step": 14298 + }, + { + "epoch": 2.680224929709466, + "grad_norm": 58857.33984375, + "learning_rate": 1.8761483230846934e-05, + "loss": 2.1013, + "step": 14299 + }, + { + "epoch": 2.680412371134021, + "grad_norm": 51875.8671875, + "learning_rate": 1.875534809269186e-05, + "loss": 2.0941, + "step": 14300 + }, + { + "epoch": 2.6805998125585755, + "grad_norm": 53623.93359375, + "learning_rate": 1.8749213726239173e-05, + "loss": 2.0918, + "step": 14301 + }, + { + "epoch": 2.68078725398313, + "grad_norm": 59880.890625, + "learning_rate": 1.874308013164041e-05, + "loss": 2.0948, + "step": 14302 + }, + { + "epoch": 2.680974695407685, + "grad_norm": 53404.09375, + "learning_rate": 1.8736947309047044e-05, + "loss": 2.1077, + "step": 14303 + }, + { + "epoch": 2.68116213683224, + "grad_norm": 51784.7890625, + "learning_rate": 1.8730815258610524e-05, + "loss": 2.1127, + "step": 14304 + }, + { + "epoch": 2.681349578256795, + "grad_norm": 58399.359375, + "learning_rate": 1.872468398048235e-05, + "loss": 2.1459, + "step": 14305 + }, + { + "epoch": 2.6815370196813495, + "grad_norm": 49541.1875, + "learning_rate": 1.8718553474813926e-05, + "loss": 2.1806, + "step": 14306 + }, + { + "epoch": 2.6817244611059046, + "grad_norm": 52477.1015625, + "learning_rate": 1.8712423741756662e-05, + "loss": 2.1012, + "step": 14307 + }, + { + "epoch": 2.681911902530459, + "grad_norm": 55556.36328125, + "learning_rate": 1.8706294781461963e-05, + "loss": 2.1166, + "step": 14308 + }, + { + "epoch": 2.682099343955014, + "grad_norm": 59099.83984375, + "learning_rate": 1.8700166594081246e-05, + "loss": 2.1806, + "step": 14309 + }, + { + "epoch": 2.682286785379569, + "grad_norm": 60331.88671875, + "learning_rate": 1.86940391797658e-05, + "loss": 2.1497, + "step": 14310 + }, + { + "epoch": 2.682474226804124, + "grad_norm": 51773.3125, + "learning_rate": 1.8687912538666995e-05, + "loss": 2.139, + "step": 14311 + }, + { + "epoch": 2.6826616682286786, + "grad_norm": 51036.703125, + "learning_rate": 1.8681786670936173e-05, + "loss": 2.1459, + "step": 14312 + }, + { + "epoch": 2.682849109653233, + "grad_norm": 52564.32421875, + "learning_rate": 1.8675661576724616e-05, + "loss": 2.0963, + "step": 14313 + }, + { + "epoch": 2.6830365510777883, + "grad_norm": 55849.453125, + "learning_rate": 1.866953725618359e-05, + "loss": 2.2073, + "step": 14314 + }, + { + "epoch": 2.683223992502343, + "grad_norm": 52663.41015625, + "learning_rate": 1.866341370946437e-05, + "loss": 2.1456, + "step": 14315 + }, + { + "epoch": 2.683411433926898, + "grad_norm": 52667.7890625, + "learning_rate": 1.865729093671822e-05, + "loss": 2.1277, + "step": 14316 + }, + { + "epoch": 2.6835988753514526, + "grad_norm": 56370.9453125, + "learning_rate": 1.8651168938096346e-05, + "loss": 2.0427, + "step": 14317 + }, + { + "epoch": 2.6837863167760077, + "grad_norm": 58171.57421875, + "learning_rate": 1.8645047713749942e-05, + "loss": 2.1364, + "step": 14318 + }, + { + "epoch": 2.6839737582005623, + "grad_norm": 51933.296875, + "learning_rate": 1.863892726383022e-05, + "loss": 2.1242, + "step": 14319 + }, + { + "epoch": 2.684161199625117, + "grad_norm": 56482.90234375, + "learning_rate": 1.8632807588488332e-05, + "loss": 2.096, + "step": 14320 + }, + { + "epoch": 2.684348641049672, + "grad_norm": 51286.046875, + "learning_rate": 1.8626688687875415e-05, + "loss": 2.0977, + "step": 14321 + }, + { + "epoch": 2.684536082474227, + "grad_norm": 53099.30859375, + "learning_rate": 1.862057056214263e-05, + "loss": 2.1304, + "step": 14322 + }, + { + "epoch": 2.6847235238987817, + "grad_norm": 58413.23828125, + "learning_rate": 1.861445321144105e-05, + "loss": 2.1493, + "step": 14323 + }, + { + "epoch": 2.6849109653233363, + "grad_norm": 58850.10546875, + "learning_rate": 1.8608336635921804e-05, + "loss": 2.1325, + "step": 14324 + }, + { + "epoch": 2.6850984067478914, + "grad_norm": 52211.0703125, + "learning_rate": 1.8602220835735924e-05, + "loss": 2.1615, + "step": 14325 + }, + { + "epoch": 2.685285848172446, + "grad_norm": 53347.6484375, + "learning_rate": 1.859610581103449e-05, + "loss": 2.1545, + "step": 14326 + }, + { + "epoch": 2.685473289597001, + "grad_norm": 54612.96484375, + "learning_rate": 1.8589991561968524e-05, + "loss": 2.1248, + "step": 14327 + }, + { + "epoch": 2.6856607310215557, + "grad_norm": 51205.86328125, + "learning_rate": 1.858387808868905e-05, + "loss": 2.1574, + "step": 14328 + }, + { + "epoch": 2.6858481724461107, + "grad_norm": 57892.89453125, + "learning_rate": 1.8577765391347058e-05, + "loss": 2.1599, + "step": 14329 + }, + { + "epoch": 2.6860356138706654, + "grad_norm": 60261.46875, + "learning_rate": 1.8571653470093503e-05, + "loss": 2.1484, + "step": 14330 + }, + { + "epoch": 2.68622305529522, + "grad_norm": 52251.203125, + "learning_rate": 1.8565542325079382e-05, + "loss": 2.1137, + "step": 14331 + }, + { + "epoch": 2.686410496719775, + "grad_norm": 54431.54296875, + "learning_rate": 1.8559431956455586e-05, + "loss": 2.1461, + "step": 14332 + }, + { + "epoch": 2.68659793814433, + "grad_norm": 54337.09375, + "learning_rate": 1.8553322364373083e-05, + "loss": 2.078, + "step": 14333 + }, + { + "epoch": 2.6867853795688847, + "grad_norm": 55067.74609375, + "learning_rate": 1.8547213548982724e-05, + "loss": 2.1668, + "step": 14334 + }, + { + "epoch": 2.6869728209934394, + "grad_norm": 51815.6875, + "learning_rate": 1.8541105510435425e-05, + "loss": 2.0712, + "step": 14335 + }, + { + "epoch": 2.6871602624179944, + "grad_norm": 56230.25, + "learning_rate": 1.853499824888204e-05, + "loss": 2.1277, + "step": 14336 + }, + { + "epoch": 2.687347703842549, + "grad_norm": 57124.90234375, + "learning_rate": 1.852889176447338e-05, + "loss": 2.166, + "step": 14337 + }, + { + "epoch": 2.687535145267104, + "grad_norm": 54966.859375, + "learning_rate": 1.85227860573603e-05, + "loss": 2.1251, + "step": 14338 + }, + { + "epoch": 2.6877225866916588, + "grad_norm": 53954.61328125, + "learning_rate": 1.8516681127693615e-05, + "loss": 2.0886, + "step": 14339 + }, + { + "epoch": 2.687910028116214, + "grad_norm": 52655.90625, + "learning_rate": 1.851057697562406e-05, + "loss": 2.0822, + "step": 14340 + }, + { + "epoch": 2.6880974695407684, + "grad_norm": 55156.23046875, + "learning_rate": 1.8504473601302423e-05, + "loss": 2.0462, + "step": 14341 + }, + { + "epoch": 2.688284910965323, + "grad_norm": 53888.0, + "learning_rate": 1.849837100487948e-05, + "loss": 2.0992, + "step": 14342 + }, + { + "epoch": 2.688472352389878, + "grad_norm": 52158.984375, + "learning_rate": 1.8492269186505922e-05, + "loss": 2.0978, + "step": 14343 + }, + { + "epoch": 2.688659793814433, + "grad_norm": 53016.9921875, + "learning_rate": 1.848616814633245e-05, + "loss": 2.1819, + "step": 14344 + }, + { + "epoch": 2.688847235238988, + "grad_norm": 49217.95703125, + "learning_rate": 1.8480067884509767e-05, + "loss": 2.109, + "step": 14345 + }, + { + "epoch": 2.6890346766635425, + "grad_norm": 60181.46484375, + "learning_rate": 1.8473968401188575e-05, + "loss": 2.1004, + "step": 14346 + }, + { + "epoch": 2.6892221180880975, + "grad_norm": 54334.19140625, + "learning_rate": 1.8467869696519457e-05, + "loss": 2.0807, + "step": 14347 + }, + { + "epoch": 2.689409559512652, + "grad_norm": 60231.28515625, + "learning_rate": 1.846177177065307e-05, + "loss": 2.1534, + "step": 14348 + }, + { + "epoch": 2.689597000937207, + "grad_norm": 57067.5390625, + "learning_rate": 1.8455674623740055e-05, + "loss": 2.1061, + "step": 14349 + }, + { + "epoch": 2.689784442361762, + "grad_norm": 53232.75, + "learning_rate": 1.844957825593097e-05, + "loss": 2.1614, + "step": 14350 + }, + { + "epoch": 2.689971883786317, + "grad_norm": 56622.30078125, + "learning_rate": 1.844348266737639e-05, + "loss": 2.1303, + "step": 14351 + }, + { + "epoch": 2.6901593252108715, + "grad_norm": 55760.34375, + "learning_rate": 1.8437387858226886e-05, + "loss": 2.1379, + "step": 14352 + }, + { + "epoch": 2.690346766635426, + "grad_norm": 54909.41796875, + "learning_rate": 1.843129382863296e-05, + "loss": 2.0585, + "step": 14353 + }, + { + "epoch": 2.6905342080599812, + "grad_norm": 54308.0859375, + "learning_rate": 1.8425200578745166e-05, + "loss": 2.0833, + "step": 14354 + }, + { + "epoch": 2.6907216494845363, + "grad_norm": 56741.1875, + "learning_rate": 1.8419108108713968e-05, + "loss": 2.0927, + "step": 14355 + }, + { + "epoch": 2.690909090909091, + "grad_norm": 58472.59765625, + "learning_rate": 1.8413016418689865e-05, + "loss": 2.0646, + "step": 14356 + }, + { + "epoch": 2.6910965323336455, + "grad_norm": 54997.31640625, + "learning_rate": 1.8406925508823293e-05, + "loss": 2.1436, + "step": 14357 + }, + { + "epoch": 2.6912839737582006, + "grad_norm": 52872.6484375, + "learning_rate": 1.840083537926471e-05, + "loss": 2.1053, + "step": 14358 + }, + { + "epoch": 2.6914714151827552, + "grad_norm": 57403.74609375, + "learning_rate": 1.8394746030164528e-05, + "loss": 2.1309, + "step": 14359 + }, + { + "epoch": 2.6916588566073103, + "grad_norm": 52738.7734375, + "learning_rate": 1.8388657461673124e-05, + "loss": 2.0715, + "step": 14360 + }, + { + "epoch": 2.691846298031865, + "grad_norm": 51375.3515625, + "learning_rate": 1.8382569673940915e-05, + "loss": 2.1107, + "step": 14361 + }, + { + "epoch": 2.69203373945642, + "grad_norm": 55975.296875, + "learning_rate": 1.837648266711824e-05, + "loss": 2.0556, + "step": 14362 + }, + { + "epoch": 2.6922211808809746, + "grad_norm": 54050.359375, + "learning_rate": 1.8370396441355426e-05, + "loss": 2.1322, + "step": 14363 + }, + { + "epoch": 2.6924086223055297, + "grad_norm": 53672.6640625, + "learning_rate": 1.8364310996802813e-05, + "loss": 2.1171, + "step": 14364 + }, + { + "epoch": 2.6925960637300843, + "grad_norm": 52443.30078125, + "learning_rate": 1.8358226333610717e-05, + "loss": 2.1384, + "step": 14365 + }, + { + "epoch": 2.6927835051546394, + "grad_norm": 49936.90625, + "learning_rate": 1.835214245192941e-05, + "loss": 2.0565, + "step": 14366 + }, + { + "epoch": 2.692970946579194, + "grad_norm": 54025.2734375, + "learning_rate": 1.8346059351909133e-05, + "loss": 2.0581, + "step": 14367 + }, + { + "epoch": 2.6931583880037486, + "grad_norm": 51354.234375, + "learning_rate": 1.8339977033700152e-05, + "loss": 2.1417, + "step": 14368 + }, + { + "epoch": 2.6933458294283037, + "grad_norm": 50106.00390625, + "learning_rate": 1.833389549745272e-05, + "loss": 2.1506, + "step": 14369 + }, + { + "epoch": 2.6935332708528583, + "grad_norm": 59660.15625, + "learning_rate": 1.8327814743316984e-05, + "loss": 2.1066, + "step": 14370 + }, + { + "epoch": 2.6937207122774134, + "grad_norm": 56811.4921875, + "learning_rate": 1.8321734771443155e-05, + "loss": 2.1056, + "step": 14371 + }, + { + "epoch": 2.693908153701968, + "grad_norm": 54085.28125, + "learning_rate": 1.831565558198143e-05, + "loss": 2.1659, + "step": 14372 + }, + { + "epoch": 2.694095595126523, + "grad_norm": 56589.9375, + "learning_rate": 1.8309577175081926e-05, + "loss": 2.1668, + "step": 14373 + }, + { + "epoch": 2.6942830365510777, + "grad_norm": 57010.86328125, + "learning_rate": 1.8303499550894765e-05, + "loss": 2.106, + "step": 14374 + }, + { + "epoch": 2.6944704779756328, + "grad_norm": 52095.0234375, + "learning_rate": 1.829742270957009e-05, + "loss": 2.1792, + "step": 14375 + }, + { + "epoch": 2.6946579194001874, + "grad_norm": 51828.09375, + "learning_rate": 1.8291346651257963e-05, + "loss": 2.0729, + "step": 14376 + }, + { + "epoch": 2.6948453608247425, + "grad_norm": 51363.82421875, + "learning_rate": 1.8285271376108453e-05, + "loss": 2.1615, + "step": 14377 + }, + { + "epoch": 2.695032802249297, + "grad_norm": 56593.2109375, + "learning_rate": 1.8279196884271614e-05, + "loss": 2.1309, + "step": 14378 + }, + { + "epoch": 2.6952202436738517, + "grad_norm": 52994.78515625, + "learning_rate": 1.827312317589751e-05, + "loss": 2.0757, + "step": 14379 + }, + { + "epoch": 2.695407685098407, + "grad_norm": 56218.05078125, + "learning_rate": 1.8267050251136124e-05, + "loss": 2.089, + "step": 14380 + }, + { + "epoch": 2.695595126522962, + "grad_norm": 50753.1640625, + "learning_rate": 1.8260978110137438e-05, + "loss": 2.1128, + "step": 14381 + }, + { + "epoch": 2.6957825679475165, + "grad_norm": 52645.9296875, + "learning_rate": 1.8254906753051464e-05, + "loss": 2.1479, + "step": 14382 + }, + { + "epoch": 2.695970009372071, + "grad_norm": 55267.54296875, + "learning_rate": 1.824883618002811e-05, + "loss": 2.0957, + "step": 14383 + }, + { + "epoch": 2.696157450796626, + "grad_norm": 49957.625, + "learning_rate": 1.8242766391217364e-05, + "loss": 2.103, + "step": 14384 + }, + { + "epoch": 2.696344892221181, + "grad_norm": 56356.27734375, + "learning_rate": 1.823669738676909e-05, + "loss": 2.1799, + "step": 14385 + }, + { + "epoch": 2.696532333645736, + "grad_norm": 50432.40234375, + "learning_rate": 1.8230629166833223e-05, + "loss": 2.1084, + "step": 14386 + }, + { + "epoch": 2.6967197750702905, + "grad_norm": 57738.03125, + "learning_rate": 1.8224561731559637e-05, + "loss": 2.072, + "step": 14387 + }, + { + "epoch": 2.6969072164948455, + "grad_norm": 84626.921875, + "learning_rate": 1.821849508109815e-05, + "loss": 2.1053, + "step": 14388 + }, + { + "epoch": 2.6970946579194, + "grad_norm": 55318.4375, + "learning_rate": 1.821242921559865e-05, + "loss": 2.0679, + "step": 14389 + }, + { + "epoch": 2.697282099343955, + "grad_norm": 54427.50390625, + "learning_rate": 1.8206364135210925e-05, + "loss": 2.1462, + "step": 14390 + }, + { + "epoch": 2.69746954076851, + "grad_norm": 49818.29296875, + "learning_rate": 1.8200299840084793e-05, + "loss": 2.0998, + "step": 14391 + }, + { + "epoch": 2.697656982193065, + "grad_norm": 54472.1015625, + "learning_rate": 1.819423633037003e-05, + "loss": 2.0852, + "step": 14392 + }, + { + "epoch": 2.6978444236176196, + "grad_norm": 55597.76953125, + "learning_rate": 1.8188173606216374e-05, + "loss": 2.1135, + "step": 14393 + }, + { + "epoch": 2.698031865042174, + "grad_norm": 49402.31640625, + "learning_rate": 1.818211166777359e-05, + "loss": 2.1872, + "step": 14394 + }, + { + "epoch": 2.6982193064667293, + "grad_norm": 53310.8125, + "learning_rate": 1.8176050515191405e-05, + "loss": 2.1228, + "step": 14395 + }, + { + "epoch": 2.698406747891284, + "grad_norm": 57089.734375, + "learning_rate": 1.8169990148619513e-05, + "loss": 2.1213, + "step": 14396 + }, + { + "epoch": 2.698594189315839, + "grad_norm": 51185.64453125, + "learning_rate": 1.8163930568207583e-05, + "loss": 2.1239, + "step": 14397 + }, + { + "epoch": 2.6987816307403936, + "grad_norm": 60266.98046875, + "learning_rate": 1.815787177410529e-05, + "loss": 2.1547, + "step": 14398 + }, + { + "epoch": 2.6989690721649486, + "grad_norm": 53546.828125, + "learning_rate": 1.8151813766462316e-05, + "loss": 2.0962, + "step": 14399 + }, + { + "epoch": 2.6991565135895033, + "grad_norm": 53241.265625, + "learning_rate": 1.8145756545428216e-05, + "loss": 2.128, + "step": 14400 + }, + { + "epoch": 2.699343955014058, + "grad_norm": 55841.296875, + "learning_rate": 1.813970011115263e-05, + "loss": 2.0633, + "step": 14401 + }, + { + "epoch": 2.699531396438613, + "grad_norm": 78415.40625, + "learning_rate": 1.8133644463785178e-05, + "loss": 2.11, + "step": 14402 + }, + { + "epoch": 2.699718837863168, + "grad_norm": 59772.30078125, + "learning_rate": 1.8127589603475354e-05, + "loss": 2.1062, + "step": 14403 + }, + { + "epoch": 2.6999062792877226, + "grad_norm": 54158.03125, + "learning_rate": 1.8121535530372748e-05, + "loss": 2.1878, + "step": 14404 + }, + { + "epoch": 2.7000937207122773, + "grad_norm": 56502.01171875, + "learning_rate": 1.8115482244626897e-05, + "loss": 2.1496, + "step": 14405 + }, + { + "epoch": 2.7002811621368323, + "grad_norm": 53826.09375, + "learning_rate": 1.8109429746387296e-05, + "loss": 2.194, + "step": 14406 + }, + { + "epoch": 2.700468603561387, + "grad_norm": 57115.25390625, + "learning_rate": 1.810337803580342e-05, + "loss": 2.1395, + "step": 14407 + }, + { + "epoch": 2.700656044985942, + "grad_norm": 53531.95703125, + "learning_rate": 1.8097327113024747e-05, + "loss": 2.1687, + "step": 14408 + }, + { + "epoch": 2.7008434864104967, + "grad_norm": 62875.99609375, + "learning_rate": 1.8091276978200754e-05, + "loss": 2.1066, + "step": 14409 + }, + { + "epoch": 2.7010309278350517, + "grad_norm": 52156.41796875, + "learning_rate": 1.8085227631480845e-05, + "loss": 2.1541, + "step": 14410 + }, + { + "epoch": 2.7012183692596063, + "grad_norm": 52360.93359375, + "learning_rate": 1.8079179073014424e-05, + "loss": 2.059, + "step": 14411 + }, + { + "epoch": 2.701405810684161, + "grad_norm": 57021.8359375, + "learning_rate": 1.8073131302950908e-05, + "loss": 2.0263, + "step": 14412 + }, + { + "epoch": 2.701593252108716, + "grad_norm": 53753.640625, + "learning_rate": 1.8067084321439637e-05, + "loss": 2.1939, + "step": 14413 + }, + { + "epoch": 2.701780693533271, + "grad_norm": 53175.4765625, + "learning_rate": 1.806103812863e-05, + "loss": 2.0713, + "step": 14414 + }, + { + "epoch": 2.7019681349578257, + "grad_norm": 58499.203125, + "learning_rate": 1.8054992724671303e-05, + "loss": 2.1644, + "step": 14415 + }, + { + "epoch": 2.7021555763823804, + "grad_norm": 56861.90234375, + "learning_rate": 1.8048948109712865e-05, + "loss": 2.0179, + "step": 14416 + }, + { + "epoch": 2.7023430178069354, + "grad_norm": 53662.53515625, + "learning_rate": 1.8042904283903994e-05, + "loss": 2.1591, + "step": 14417 + }, + { + "epoch": 2.70253045923149, + "grad_norm": 53149.6796875, + "learning_rate": 1.803686124739394e-05, + "loss": 2.1222, + "step": 14418 + }, + { + "epoch": 2.702717900656045, + "grad_norm": 55485.93359375, + "learning_rate": 1.803081900033198e-05, + "loss": 2.0946, + "step": 14419 + }, + { + "epoch": 2.7029053420805997, + "grad_norm": 51373.64453125, + "learning_rate": 1.8024777542867337e-05, + "loss": 2.1073, + "step": 14420 + }, + { + "epoch": 2.703092783505155, + "grad_norm": 55830.3671875, + "learning_rate": 1.8018736875149246e-05, + "loss": 2.0992, + "step": 14421 + }, + { + "epoch": 2.7032802249297094, + "grad_norm": 54061.77734375, + "learning_rate": 1.801269699732689e-05, + "loss": 2.1412, + "step": 14422 + }, + { + "epoch": 2.703467666354264, + "grad_norm": 58202.40234375, + "learning_rate": 1.8006657909549424e-05, + "loss": 2.0955, + "step": 14423 + }, + { + "epoch": 2.703655107778819, + "grad_norm": 56743.3359375, + "learning_rate": 1.800061961196603e-05, + "loss": 2.1643, + "step": 14424 + }, + { + "epoch": 2.703842549203374, + "grad_norm": 57174.97265625, + "learning_rate": 1.799458210472586e-05, + "loss": 2.172, + "step": 14425 + }, + { + "epoch": 2.704029990627929, + "grad_norm": 58762.55078125, + "learning_rate": 1.7988545387978023e-05, + "loss": 2.1633, + "step": 14426 + }, + { + "epoch": 2.7042174320524834, + "grad_norm": 52876.265625, + "learning_rate": 1.798250946187159e-05, + "loss": 2.1827, + "step": 14427 + }, + { + "epoch": 2.7044048734770385, + "grad_norm": 53055.5859375, + "learning_rate": 1.7976474326555676e-05, + "loss": 2.156, + "step": 14428 + }, + { + "epoch": 2.704592314901593, + "grad_norm": 63041.65625, + "learning_rate": 1.7970439982179327e-05, + "loss": 2.1182, + "step": 14429 + }, + { + "epoch": 2.704779756326148, + "grad_norm": 54282.83203125, + "learning_rate": 1.7964406428891572e-05, + "loss": 2.1394, + "step": 14430 + }, + { + "epoch": 2.704967197750703, + "grad_norm": 53214.53125, + "learning_rate": 1.795837366684144e-05, + "loss": 2.1234, + "step": 14431 + }, + { + "epoch": 2.705154639175258, + "grad_norm": 49376.7578125, + "learning_rate": 1.795234169617797e-05, + "loss": 2.0949, + "step": 14432 + }, + { + "epoch": 2.7053420805998125, + "grad_norm": 54496.07421875, + "learning_rate": 1.7946310517050075e-05, + "loss": 2.1456, + "step": 14433 + }, + { + "epoch": 2.705529522024367, + "grad_norm": 54371.59375, + "learning_rate": 1.7940280129606758e-05, + "loss": 2.1459, + "step": 14434 + }, + { + "epoch": 2.705716963448922, + "grad_norm": 53189.4765625, + "learning_rate": 1.7934250533996965e-05, + "loss": 2.1466, + "step": 14435 + }, + { + "epoch": 2.7059044048734773, + "grad_norm": 54461.09375, + "learning_rate": 1.792822173036961e-05, + "loss": 2.0843, + "step": 14436 + }, + { + "epoch": 2.706091846298032, + "grad_norm": 52942.140625, + "learning_rate": 1.7922193718873586e-05, + "loss": 2.1359, + "step": 14437 + }, + { + "epoch": 2.7062792877225865, + "grad_norm": 52133.76171875, + "learning_rate": 1.7916166499657783e-05, + "loss": 2.1666, + "step": 14438 + }, + { + "epoch": 2.7064667291471416, + "grad_norm": 54220.046875, + "learning_rate": 1.791014007287109e-05, + "loss": 2.0713, + "step": 14439 + }, + { + "epoch": 2.706654170571696, + "grad_norm": 49913.7109375, + "learning_rate": 1.790411443866233e-05, + "loss": 2.0871, + "step": 14440 + }, + { + "epoch": 2.7068416119962513, + "grad_norm": 57786.05859375, + "learning_rate": 1.7898089597180312e-05, + "loss": 2.1188, + "step": 14441 + }, + { + "epoch": 2.707029053420806, + "grad_norm": 53174.921875, + "learning_rate": 1.7892065548573877e-05, + "loss": 2.0976, + "step": 14442 + }, + { + "epoch": 2.707216494845361, + "grad_norm": 55771.328125, + "learning_rate": 1.78860422929918e-05, + "loss": 2.0723, + "step": 14443 + }, + { + "epoch": 2.7074039362699156, + "grad_norm": 55395.625, + "learning_rate": 1.7880019830582815e-05, + "loss": 2.1498, + "step": 14444 + }, + { + "epoch": 2.7075913776944702, + "grad_norm": 52532.828125, + "learning_rate": 1.7873998161495725e-05, + "loss": 2.0541, + "step": 14445 + }, + { + "epoch": 2.7077788191190253, + "grad_norm": 54691.4765625, + "learning_rate": 1.7867977285879206e-05, + "loss": 2.141, + "step": 14446 + }, + { + "epoch": 2.7079662605435804, + "grad_norm": 54006.7734375, + "learning_rate": 1.7861957203882e-05, + "loss": 2.1713, + "step": 14447 + }, + { + "epoch": 2.708153701968135, + "grad_norm": 58295.40625, + "learning_rate": 1.7855937915652778e-05, + "loss": 2.0916, + "step": 14448 + }, + { + "epoch": 2.7083411433926896, + "grad_norm": 52424.5546875, + "learning_rate": 1.784991942134022e-05, + "loss": 2.126, + "step": 14449 + }, + { + "epoch": 2.7085285848172447, + "grad_norm": 52549.65234375, + "learning_rate": 1.7843901721092958e-05, + "loss": 2.1149, + "step": 14450 + }, + { + "epoch": 2.7087160262417993, + "grad_norm": 51818.4296875, + "learning_rate": 1.7837884815059646e-05, + "loss": 2.1544, + "step": 14451 + }, + { + "epoch": 2.7089034676663544, + "grad_norm": 51563.42578125, + "learning_rate": 1.7831868703388882e-05, + "loss": 2.142, + "step": 14452 + }, + { + "epoch": 2.709090909090909, + "grad_norm": 58080.08203125, + "learning_rate": 1.782585338622924e-05, + "loss": 2.1137, + "step": 14453 + }, + { + "epoch": 2.709278350515464, + "grad_norm": 51599.1015625, + "learning_rate": 1.7819838863729304e-05, + "loss": 2.1845, + "step": 14454 + }, + { + "epoch": 2.7094657919400187, + "grad_norm": 62752.15234375, + "learning_rate": 1.781382513603766e-05, + "loss": 2.0613, + "step": 14455 + }, + { + "epoch": 2.7096532333645733, + "grad_norm": 61178.59765625, + "learning_rate": 1.7807812203302766e-05, + "loss": 2.1303, + "step": 14456 + }, + { + "epoch": 2.7098406747891284, + "grad_norm": 54054.04296875, + "learning_rate": 1.780180006567318e-05, + "loss": 2.1418, + "step": 14457 + }, + { + "epoch": 2.7100281162136834, + "grad_norm": 55954.10546875, + "learning_rate": 1.7795788723297396e-05, + "loss": 2.1883, + "step": 14458 + }, + { + "epoch": 2.710215557638238, + "grad_norm": 50041.625, + "learning_rate": 1.7789778176323884e-05, + "loss": 2.1342, + "step": 14459 + }, + { + "epoch": 2.7104029990627927, + "grad_norm": 59839.62109375, + "learning_rate": 1.778376842490107e-05, + "loss": 2.0799, + "step": 14460 + }, + { + "epoch": 2.7105904404873478, + "grad_norm": 52194.6640625, + "learning_rate": 1.7777759469177407e-05, + "loss": 2.1434, + "step": 14461 + }, + { + "epoch": 2.7107778819119024, + "grad_norm": 53050.00390625, + "learning_rate": 1.7771751309301342e-05, + "loss": 2.1754, + "step": 14462 + }, + { + "epoch": 2.7109653233364575, + "grad_norm": 54274.83203125, + "learning_rate": 1.7765743945421204e-05, + "loss": 2.1211, + "step": 14463 + }, + { + "epoch": 2.711152764761012, + "grad_norm": 54092.06640625, + "learning_rate": 1.7759737377685398e-05, + "loss": 2.0993, + "step": 14464 + }, + { + "epoch": 2.711340206185567, + "grad_norm": 55494.01171875, + "learning_rate": 1.775373160624229e-05, + "loss": 2.1038, + "step": 14465 + }, + { + "epoch": 2.7115276476101218, + "grad_norm": 54835.6484375, + "learning_rate": 1.774772663124021e-05, + "loss": 2.2039, + "step": 14466 + }, + { + "epoch": 2.7117150890346764, + "grad_norm": 54994.38671875, + "learning_rate": 1.7741722452827446e-05, + "loss": 2.1614, + "step": 14467 + }, + { + "epoch": 2.7119025304592315, + "grad_norm": 55709.859375, + "learning_rate": 1.773571907115233e-05, + "loss": 2.0707, + "step": 14468 + }, + { + "epoch": 2.7120899718837865, + "grad_norm": 49830.828125, + "learning_rate": 1.7729716486363124e-05, + "loss": 2.1367, + "step": 14469 + }, + { + "epoch": 2.712277413308341, + "grad_norm": 50135.203125, + "learning_rate": 1.772371469860806e-05, + "loss": 2.1265, + "step": 14470 + }, + { + "epoch": 2.712464854732896, + "grad_norm": 53914.53125, + "learning_rate": 1.7717713708035403e-05, + "loss": 2.1609, + "step": 14471 + }, + { + "epoch": 2.712652296157451, + "grad_norm": 52584.2265625, + "learning_rate": 1.7711713514793375e-05, + "loss": 2.1195, + "step": 14472 + }, + { + "epoch": 2.7128397375820055, + "grad_norm": 54833.359375, + "learning_rate": 1.7705714119030166e-05, + "loss": 2.1449, + "step": 14473 + }, + { + "epoch": 2.7130271790065605, + "grad_norm": 52191.3359375, + "learning_rate": 1.769971552089393e-05, + "loss": 2.1308, + "step": 14474 + }, + { + "epoch": 2.713214620431115, + "grad_norm": 55840.50390625, + "learning_rate": 1.7693717720532855e-05, + "loss": 2.0484, + "step": 14475 + }, + { + "epoch": 2.7134020618556702, + "grad_norm": 52336.94921875, + "learning_rate": 1.7687720718095052e-05, + "loss": 2.1011, + "step": 14476 + }, + { + "epoch": 2.713589503280225, + "grad_norm": 56170.3828125, + "learning_rate": 1.768172451372867e-05, + "loss": 2.1835, + "step": 14477 + }, + { + "epoch": 2.7137769447047795, + "grad_norm": 53555.76953125, + "learning_rate": 1.7675729107581773e-05, + "loss": 2.1762, + "step": 14478 + }, + { + "epoch": 2.7139643861293345, + "grad_norm": 52183.78515625, + "learning_rate": 1.766973449980248e-05, + "loss": 2.157, + "step": 14479 + }, + { + "epoch": 2.7141518275538896, + "grad_norm": 55034.6171875, + "learning_rate": 1.766374069053881e-05, + "loss": 2.1033, + "step": 14480 + }, + { + "epoch": 2.7143392689784442, + "grad_norm": 55142.9296875, + "learning_rate": 1.7657747679938836e-05, + "loss": 2.0893, + "step": 14481 + }, + { + "epoch": 2.714526710402999, + "grad_norm": 58178.1171875, + "learning_rate": 1.765175546815056e-05, + "loss": 2.2106, + "step": 14482 + }, + { + "epoch": 2.714714151827554, + "grad_norm": 54685.38671875, + "learning_rate": 1.7645764055321966e-05, + "loss": 2.1138, + "step": 14483 + }, + { + "epoch": 2.7149015932521086, + "grad_norm": 56373.0390625, + "learning_rate": 1.763977344160107e-05, + "loss": 2.0208, + "step": 14484 + }, + { + "epoch": 2.7150890346766636, + "grad_norm": 58454.28125, + "learning_rate": 1.763378362713582e-05, + "loss": 2.0523, + "step": 14485 + }, + { + "epoch": 2.7152764761012183, + "grad_norm": 57229.19140625, + "learning_rate": 1.7627794612074127e-05, + "loss": 2.1261, + "step": 14486 + }, + { + "epoch": 2.7154639175257733, + "grad_norm": 54091.0859375, + "learning_rate": 1.762180639656394e-05, + "loss": 2.1027, + "step": 14487 + }, + { + "epoch": 2.715651358950328, + "grad_norm": 57188.6640625, + "learning_rate": 1.761581898075317e-05, + "loss": 2.0845, + "step": 14488 + }, + { + "epoch": 2.715838800374883, + "grad_norm": 52929.75, + "learning_rate": 1.7609832364789687e-05, + "loss": 2.1602, + "step": 14489 + }, + { + "epoch": 2.7160262417994376, + "grad_norm": 54961.4609375, + "learning_rate": 1.760384654882133e-05, + "loss": 2.099, + "step": 14490 + }, + { + "epoch": 2.7162136832239927, + "grad_norm": 50652.60546875, + "learning_rate": 1.7597861532995963e-05, + "loss": 2.0553, + "step": 14491 + }, + { + "epoch": 2.7164011246485473, + "grad_norm": 55485.23046875, + "learning_rate": 1.759187731746144e-05, + "loss": 2.1984, + "step": 14492 + }, + { + "epoch": 2.716588566073102, + "grad_norm": 58456.3515625, + "learning_rate": 1.7585893902365498e-05, + "loss": 2.1547, + "step": 14493 + }, + { + "epoch": 2.716776007497657, + "grad_norm": 57616.26171875, + "learning_rate": 1.757991128785595e-05, + "loss": 2.0634, + "step": 14494 + }, + { + "epoch": 2.7169634489222116, + "grad_norm": 54716.51171875, + "learning_rate": 1.7573929474080574e-05, + "loss": 2.1287, + "step": 14495 + }, + { + "epoch": 2.7171508903467667, + "grad_norm": 55677.421875, + "learning_rate": 1.75679484611871e-05, + "loss": 2.1096, + "step": 14496 + }, + { + "epoch": 2.7173383317713213, + "grad_norm": 50491.67578125, + "learning_rate": 1.7561968249323235e-05, + "loss": 2.0916, + "step": 14497 + }, + { + "epoch": 2.7175257731958764, + "grad_norm": 53079.2578125, + "learning_rate": 1.7555988838636713e-05, + "loss": 2.106, + "step": 14498 + }, + { + "epoch": 2.717713214620431, + "grad_norm": 53539.5, + "learning_rate": 1.7550010229275204e-05, + "loss": 2.1145, + "step": 14499 + }, + { + "epoch": 2.717900656044986, + "grad_norm": 51391.19140625, + "learning_rate": 1.7544032421386353e-05, + "loss": 2.175, + "step": 14500 + }, + { + "epoch": 2.717900656044986, + "eval_loss": 2.2608094215393066, + "eval_runtime": 130.8149, + "eval_samples_per_second": 38.597, + "eval_steps_per_second": 1.934, + "step": 14500 + }, + { + "epoch": 2.7180880974695407, + "grad_norm": 51608.46875, + "learning_rate": 1.753805541511782e-05, + "loss": 2.125, + "step": 14501 + }, + { + "epoch": 2.718275538894096, + "grad_norm": 56409.87109375, + "learning_rate": 1.7532079210617246e-05, + "loss": 2.0622, + "step": 14502 + }, + { + "epoch": 2.7184629803186504, + "grad_norm": 51835.921875, + "learning_rate": 1.7526103808032223e-05, + "loss": 2.1455, + "step": 14503 + }, + { + "epoch": 2.718650421743205, + "grad_norm": 60258.67578125, + "learning_rate": 1.7520129207510317e-05, + "loss": 2.1578, + "step": 14504 + }, + { + "epoch": 2.71883786316776, + "grad_norm": 51140.92578125, + "learning_rate": 1.7514155409199123e-05, + "loss": 2.13, + "step": 14505 + }, + { + "epoch": 2.719025304592315, + "grad_norm": 55599.3515625, + "learning_rate": 1.7508182413246155e-05, + "loss": 2.0958, + "step": 14506 + }, + { + "epoch": 2.71921274601687, + "grad_norm": 51638.8828125, + "learning_rate": 1.7502210219798975e-05, + "loss": 2.2028, + "step": 14507 + }, + { + "epoch": 2.7194001874414244, + "grad_norm": 54148.7890625, + "learning_rate": 1.7496238829005068e-05, + "loss": 2.0903, + "step": 14508 + }, + { + "epoch": 2.7195876288659795, + "grad_norm": 56605.33984375, + "learning_rate": 1.7490268241011902e-05, + "loss": 2.1217, + "step": 14509 + }, + { + "epoch": 2.719775070290534, + "grad_norm": 54006.08984375, + "learning_rate": 1.7484298455966962e-05, + "loss": 2.1011, + "step": 14510 + }, + { + "epoch": 2.719962511715089, + "grad_norm": 58819.6328125, + "learning_rate": 1.7478329474017707e-05, + "loss": 2.1146, + "step": 14511 + }, + { + "epoch": 2.720149953139644, + "grad_norm": 55244.265625, + "learning_rate": 1.747236129531155e-05, + "loss": 2.1286, + "step": 14512 + }, + { + "epoch": 2.720337394564199, + "grad_norm": 53596.7890625, + "learning_rate": 1.746639391999588e-05, + "loss": 2.1857, + "step": 14513 + }, + { + "epoch": 2.7205248359887535, + "grad_norm": 54592.73046875, + "learning_rate": 1.7460427348218118e-05, + "loss": 2.0774, + "step": 14514 + }, + { + "epoch": 2.720712277413308, + "grad_norm": 54135.140625, + "learning_rate": 1.7454461580125615e-05, + "loss": 2.1312, + "step": 14515 + }, + { + "epoch": 2.720899718837863, + "grad_norm": 53730.8046875, + "learning_rate": 1.744849661586569e-05, + "loss": 2.1598, + "step": 14516 + }, + { + "epoch": 2.7210871602624183, + "grad_norm": 50902.36328125, + "learning_rate": 1.7442532455585702e-05, + "loss": 2.165, + "step": 14517 + }, + { + "epoch": 2.721274601686973, + "grad_norm": 55985.4296875, + "learning_rate": 1.7436569099432964e-05, + "loss": 2.0995, + "step": 14518 + }, + { + "epoch": 2.7214620431115275, + "grad_norm": 52258.7265625, + "learning_rate": 1.743060654755475e-05, + "loss": 2.1501, + "step": 14519 + }, + { + "epoch": 2.7216494845360826, + "grad_norm": 57447.4765625, + "learning_rate": 1.7424644800098312e-05, + "loss": 2.1063, + "step": 14520 + }, + { + "epoch": 2.721836925960637, + "grad_norm": 50937.73828125, + "learning_rate": 1.7418683857210932e-05, + "loss": 2.1191, + "step": 14521 + }, + { + "epoch": 2.7220243673851923, + "grad_norm": 51451.2890625, + "learning_rate": 1.7412723719039814e-05, + "loss": 2.3763, + "step": 14522 + }, + { + "epoch": 2.722211808809747, + "grad_norm": 49900.93359375, + "learning_rate": 1.7406764385732156e-05, + "loss": 2.1001, + "step": 14523 + }, + { + "epoch": 2.722399250234302, + "grad_norm": 59143.171875, + "learning_rate": 1.740080585743516e-05, + "loss": 2.1255, + "step": 14524 + }, + { + "epoch": 2.7225866916588566, + "grad_norm": 59144.453125, + "learning_rate": 1.7394848134296026e-05, + "loss": 2.1444, + "step": 14525 + }, + { + "epoch": 2.722774133083411, + "grad_norm": 51655.02734375, + "learning_rate": 1.738889121646184e-05, + "loss": 2.1687, + "step": 14526 + }, + { + "epoch": 2.7229615745079663, + "grad_norm": 50339.953125, + "learning_rate": 1.7382935104079758e-05, + "loss": 2.163, + "step": 14527 + }, + { + "epoch": 2.7231490159325213, + "grad_norm": 53308.56640625, + "learning_rate": 1.73769797972969e-05, + "loss": 2.0893, + "step": 14528 + }, + { + "epoch": 2.723336457357076, + "grad_norm": 61337.6171875, + "learning_rate": 1.7371025296260353e-05, + "loss": 2.049, + "step": 14529 + }, + { + "epoch": 2.7235238987816306, + "grad_norm": 52665.18359375, + "learning_rate": 1.7365071601117156e-05, + "loss": 2.1453, + "step": 14530 + }, + { + "epoch": 2.7237113402061857, + "grad_norm": 57855.31640625, + "learning_rate": 1.735911871201438e-05, + "loss": 2.1443, + "step": 14531 + }, + { + "epoch": 2.7238987816307403, + "grad_norm": 56042.83203125, + "learning_rate": 1.735316662909907e-05, + "loss": 2.0933, + "step": 14532 + }, + { + "epoch": 2.7240862230552954, + "grad_norm": 50741.515625, + "learning_rate": 1.734721535251822e-05, + "loss": 2.1187, + "step": 14533 + }, + { + "epoch": 2.72427366447985, + "grad_norm": 52522.671875, + "learning_rate": 1.7341264882418797e-05, + "loss": 2.1118, + "step": 14534 + }, + { + "epoch": 2.724461105904405, + "grad_norm": 52754.17578125, + "learning_rate": 1.73353152189478e-05, + "loss": 2.1193, + "step": 14535 + }, + { + "epoch": 2.7246485473289597, + "grad_norm": 50086.9609375, + "learning_rate": 1.7329366362252153e-05, + "loss": 2.1703, + "step": 14536 + }, + { + "epoch": 2.7248359887535143, + "grad_norm": 48646.24609375, + "learning_rate": 1.7323418312478816e-05, + "loss": 2.1228, + "step": 14537 + }, + { + "epoch": 2.7250234301780694, + "grad_norm": 60782.44921875, + "learning_rate": 1.7317471069774683e-05, + "loss": 2.1667, + "step": 14538 + }, + { + "epoch": 2.7252108716026244, + "grad_norm": 52793.91796875, + "learning_rate": 1.7311524634286624e-05, + "loss": 2.1409, + "step": 14539 + }, + { + "epoch": 2.725398313027179, + "grad_norm": 51747.80859375, + "learning_rate": 1.7305579006161543e-05, + "loss": 2.1229, + "step": 14540 + }, + { + "epoch": 2.7255857544517337, + "grad_norm": 49545.21484375, + "learning_rate": 1.7299634185546253e-05, + "loss": 2.129, + "step": 14541 + }, + { + "epoch": 2.7257731958762887, + "grad_norm": 52525.390625, + "learning_rate": 1.7293690172587618e-05, + "loss": 2.118, + "step": 14542 + }, + { + "epoch": 2.7259606373008434, + "grad_norm": 51719.875, + "learning_rate": 1.7287746967432416e-05, + "loss": 2.0943, + "step": 14543 + }, + { + "epoch": 2.7261480787253984, + "grad_norm": 63305.875, + "learning_rate": 1.7281804570227468e-05, + "loss": 2.0967, + "step": 14544 + }, + { + "epoch": 2.726335520149953, + "grad_norm": 54207.05859375, + "learning_rate": 1.7275862981119524e-05, + "loss": 2.1306, + "step": 14545 + }, + { + "epoch": 2.726522961574508, + "grad_norm": 50933.96484375, + "learning_rate": 1.726992220025532e-05, + "loss": 2.134, + "step": 14546 + }, + { + "epoch": 2.7267104029990628, + "grad_norm": 52085.46484375, + "learning_rate": 1.726398222778161e-05, + "loss": 2.1188, + "step": 14547 + }, + { + "epoch": 2.7268978444236174, + "grad_norm": 54289.87890625, + "learning_rate": 1.7258043063845126e-05, + "loss": 2.169, + "step": 14548 + }, + { + "epoch": 2.7270852858481724, + "grad_norm": 56947.328125, + "learning_rate": 1.725210470859249e-05, + "loss": 2.1783, + "step": 14549 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 53569.640625, + "learning_rate": 1.724616716217041e-05, + "loss": 2.0894, + "step": 14550 + }, + { + "epoch": 2.727460168697282, + "grad_norm": 51713.796875, + "learning_rate": 1.7240230424725562e-05, + "loss": 2.1065, + "step": 14551 + }, + { + "epoch": 2.7276476101218368, + "grad_norm": 52705.14453125, + "learning_rate": 1.7234294496404542e-05, + "loss": 2.2812, + "step": 14552 + }, + { + "epoch": 2.727835051546392, + "grad_norm": 55427.65625, + "learning_rate": 1.7228359377353948e-05, + "loss": 2.1845, + "step": 14553 + }, + { + "epoch": 2.7280224929709465, + "grad_norm": 48306.54296875, + "learning_rate": 1.7222425067720387e-05, + "loss": 2.1161, + "step": 14554 + }, + { + "epoch": 2.7282099343955015, + "grad_norm": 53903.51953125, + "learning_rate": 1.7216491567650465e-05, + "loss": 2.068, + "step": 14555 + }, + { + "epoch": 2.728397375820056, + "grad_norm": 56036.26953125, + "learning_rate": 1.7210558877290668e-05, + "loss": 2.1159, + "step": 14556 + }, + { + "epoch": 2.728584817244611, + "grad_norm": 59009.3671875, + "learning_rate": 1.7204626996787548e-05, + "loss": 2.1328, + "step": 14557 + }, + { + "epoch": 2.728772258669166, + "grad_norm": 54864.84765625, + "learning_rate": 1.7198695926287643e-05, + "loss": 2.0512, + "step": 14558 + }, + { + "epoch": 2.7289597000937205, + "grad_norm": 50732.65625, + "learning_rate": 1.7192765665937422e-05, + "loss": 2.1588, + "step": 14559 + }, + { + "epoch": 2.7291471415182755, + "grad_norm": 64077.0390625, + "learning_rate": 1.7186836215883333e-05, + "loss": 2.1408, + "step": 14560 + }, + { + "epoch": 2.7293345829428306, + "grad_norm": 57452.1015625, + "learning_rate": 1.7180907576271865e-05, + "loss": 2.1376, + "step": 14561 + }, + { + "epoch": 2.7295220243673852, + "grad_norm": 50986.890625, + "learning_rate": 1.7174979747249413e-05, + "loss": 2.0716, + "step": 14562 + }, + { + "epoch": 2.72970946579194, + "grad_norm": 59510.171875, + "learning_rate": 1.716905272896242e-05, + "loss": 2.1775, + "step": 14563 + }, + { + "epoch": 2.729896907216495, + "grad_norm": 61536.078125, + "learning_rate": 1.7163126521557237e-05, + "loss": 2.0948, + "step": 14564 + }, + { + "epoch": 2.7300843486410495, + "grad_norm": 52344.87890625, + "learning_rate": 1.7157201125180277e-05, + "loss": 2.1216, + "step": 14565 + }, + { + "epoch": 2.7302717900656046, + "grad_norm": 54364.90625, + "learning_rate": 1.7151276539977867e-05, + "loss": 2.1701, + "step": 14566 + }, + { + "epoch": 2.7304592314901592, + "grad_norm": 52946.30078125, + "learning_rate": 1.7145352766096322e-05, + "loss": 2.125, + "step": 14567 + }, + { + "epoch": 2.7306466729147143, + "grad_norm": 55202.7578125, + "learning_rate": 1.7139429803681984e-05, + "loss": 2.1273, + "step": 14568 + }, + { + "epoch": 2.730834114339269, + "grad_norm": 56156.0703125, + "learning_rate": 1.7133507652881104e-05, + "loss": 2.0735, + "step": 14569 + }, + { + "epoch": 2.7310215557638235, + "grad_norm": 56249.8984375, + "learning_rate": 1.7127586313839995e-05, + "loss": 2.0843, + "step": 14570 + }, + { + "epoch": 2.7312089971883786, + "grad_norm": 51749.7890625, + "learning_rate": 1.7121665786704865e-05, + "loss": 2.1505, + "step": 14571 + }, + { + "epoch": 2.7313964386129337, + "grad_norm": 52699.55859375, + "learning_rate": 1.711574607162198e-05, + "loss": 2.1869, + "step": 14572 + }, + { + "epoch": 2.7315838800374883, + "grad_norm": 57279.2578125, + "learning_rate": 1.7109827168737512e-05, + "loss": 2.1781, + "step": 14573 + }, + { + "epoch": 2.731771321462043, + "grad_norm": 52512.5625, + "learning_rate": 1.7103909078197694e-05, + "loss": 2.221, + "step": 14574 + }, + { + "epoch": 2.731958762886598, + "grad_norm": 53882.12109375, + "learning_rate": 1.7097991800148667e-05, + "loss": 2.1008, + "step": 14575 + }, + { + "epoch": 2.7321462043111526, + "grad_norm": 54972.59765625, + "learning_rate": 1.709207533473657e-05, + "loss": 2.2158, + "step": 14576 + }, + { + "epoch": 2.7323336457357077, + "grad_norm": 54602.3359375, + "learning_rate": 1.708615968210754e-05, + "loss": 2.218, + "step": 14577 + }, + { + "epoch": 2.7325210871602623, + "grad_norm": 52362.921875, + "learning_rate": 1.7080244842407734e-05, + "loss": 2.1391, + "step": 14578 + }, + { + "epoch": 2.7327085285848174, + "grad_norm": 52316.19921875, + "learning_rate": 1.7074330815783156e-05, + "loss": 2.1142, + "step": 14579 + }, + { + "epoch": 2.732895970009372, + "grad_norm": 53880.75390625, + "learning_rate": 1.7068417602379926e-05, + "loss": 2.1, + "step": 14580 + }, + { + "epoch": 2.7330834114339266, + "grad_norm": 54178.375, + "learning_rate": 1.7062505202344104e-05, + "loss": 2.1063, + "step": 14581 + }, + { + "epoch": 2.7332708528584817, + "grad_norm": 52989.953125, + "learning_rate": 1.7056593615821694e-05, + "loss": 2.0998, + "step": 14582 + }, + { + "epoch": 2.7334582942830368, + "grad_norm": 56979.59765625, + "learning_rate": 1.705068284295869e-05, + "loss": 2.1539, + "step": 14583 + }, + { + "epoch": 2.7336457357075914, + "grad_norm": 52170.02734375, + "learning_rate": 1.7044772883901107e-05, + "loss": 2.188, + "step": 14584 + }, + { + "epoch": 2.733833177132146, + "grad_norm": 49836.01171875, + "learning_rate": 1.703886373879494e-05, + "loss": 2.0913, + "step": 14585 + }, + { + "epoch": 2.734020618556701, + "grad_norm": 59736.6484375, + "learning_rate": 1.7032955407786067e-05, + "loss": 2.0434, + "step": 14586 + }, + { + "epoch": 2.7342080599812557, + "grad_norm": 52958.8359375, + "learning_rate": 1.7027047891020452e-05, + "loss": 2.1086, + "step": 14587 + }, + { + "epoch": 2.734395501405811, + "grad_norm": 56504.921875, + "learning_rate": 1.7021141188644023e-05, + "loss": 2.0853, + "step": 14588 + }, + { + "epoch": 2.7345829428303654, + "grad_norm": 54038.80859375, + "learning_rate": 1.7015235300802652e-05, + "loss": 2.1105, + "step": 14589 + }, + { + "epoch": 2.7347703842549205, + "grad_norm": 57006.91015625, + "learning_rate": 1.7009330227642184e-05, + "loss": 2.1067, + "step": 14590 + }, + { + "epoch": 2.734957825679475, + "grad_norm": 49850.4609375, + "learning_rate": 1.70034259693085e-05, + "loss": 2.1545, + "step": 14591 + }, + { + "epoch": 2.7351452671040297, + "grad_norm": 53091.91796875, + "learning_rate": 1.69975225259474e-05, + "loss": 2.0957, + "step": 14592 + }, + { + "epoch": 2.735332708528585, + "grad_norm": 50417.75, + "learning_rate": 1.6991619897704725e-05, + "loss": 2.069, + "step": 14593 + }, + { + "epoch": 2.73552014995314, + "grad_norm": 53577.38671875, + "learning_rate": 1.698571808472622e-05, + "loss": 2.1126, + "step": 14594 + }, + { + "epoch": 2.7357075913776945, + "grad_norm": 54305.41015625, + "learning_rate": 1.6979817087157694e-05, + "loss": 2.1177, + "step": 14595 + }, + { + "epoch": 2.735895032802249, + "grad_norm": 56599.75, + "learning_rate": 1.6973916905144877e-05, + "loss": 2.0691, + "step": 14596 + }, + { + "epoch": 2.736082474226804, + "grad_norm": 60828.83203125, + "learning_rate": 1.6968017538833474e-05, + "loss": 2.1246, + "step": 14597 + }, + { + "epoch": 2.736269915651359, + "grad_norm": 50263.921875, + "learning_rate": 1.696211898836923e-05, + "loss": 2.0793, + "step": 14598 + }, + { + "epoch": 2.736457357075914, + "grad_norm": 50367.97265625, + "learning_rate": 1.6956221253897797e-05, + "loss": 2.0448, + "step": 14599 + }, + { + "epoch": 2.7366447985004685, + "grad_norm": 57511.1015625, + "learning_rate": 1.695032433556487e-05, + "loss": 2.1151, + "step": 14600 + }, + { + "epoch": 2.7368322399250236, + "grad_norm": 53706.0859375, + "learning_rate": 1.6944428233516084e-05, + "loss": 2.0855, + "step": 14601 + }, + { + "epoch": 2.737019681349578, + "grad_norm": 57462.21484375, + "learning_rate": 1.6938532947897046e-05, + "loss": 2.0982, + "step": 14602 + }, + { + "epoch": 2.737207122774133, + "grad_norm": 56531.51171875, + "learning_rate": 1.693263847885338e-05, + "loss": 2.1424, + "step": 14603 + }, + { + "epoch": 2.737394564198688, + "grad_norm": 52950.2109375, + "learning_rate": 1.692674482653069e-05, + "loss": 2.0943, + "step": 14604 + }, + { + "epoch": 2.737582005623243, + "grad_norm": 53939.77734375, + "learning_rate": 1.692085199107452e-05, + "loss": 2.0662, + "step": 14605 + }, + { + "epoch": 2.7377694470477976, + "grad_norm": 54352.00390625, + "learning_rate": 1.69149599726304e-05, + "loss": 2.1799, + "step": 14606 + }, + { + "epoch": 2.737956888472352, + "grad_norm": 54946.68359375, + "learning_rate": 1.690906877134388e-05, + "loss": 2.0699, + "step": 14607 + }, + { + "epoch": 2.7381443298969073, + "grad_norm": 52239.0625, + "learning_rate": 1.6903178387360487e-05, + "loss": 2.1, + "step": 14608 + }, + { + "epoch": 2.738331771321462, + "grad_norm": 58344.375, + "learning_rate": 1.689728882082564e-05, + "loss": 2.0906, + "step": 14609 + }, + { + "epoch": 2.738519212746017, + "grad_norm": 58083.71484375, + "learning_rate": 1.6891400071884842e-05, + "loss": 2.0869, + "step": 14610 + }, + { + "epoch": 2.7387066541705716, + "grad_norm": 57751.05859375, + "learning_rate": 1.6885512140683556e-05, + "loss": 2.1687, + "step": 14611 + }, + { + "epoch": 2.7388940955951266, + "grad_norm": 55203.25, + "learning_rate": 1.6879625027367186e-05, + "loss": 2.0743, + "step": 14612 + }, + { + "epoch": 2.7390815370196813, + "grad_norm": 52279.4765625, + "learning_rate": 1.6873738732081125e-05, + "loss": 2.1074, + "step": 14613 + }, + { + "epoch": 2.7392689784442363, + "grad_norm": 56301.67578125, + "learning_rate": 1.6867853254970778e-05, + "loss": 2.2529, + "step": 14614 + }, + { + "epoch": 2.739456419868791, + "grad_norm": 58364.09375, + "learning_rate": 1.68619685961815e-05, + "loss": 2.0878, + "step": 14615 + }, + { + "epoch": 2.739643861293346, + "grad_norm": 53812.0, + "learning_rate": 1.685608475585862e-05, + "loss": 2.1097, + "step": 14616 + }, + { + "epoch": 2.7398313027179007, + "grad_norm": 52964.41015625, + "learning_rate": 1.685020173414747e-05, + "loss": 2.1448, + "step": 14617 + }, + { + "epoch": 2.7400187441424553, + "grad_norm": 55560.58203125, + "learning_rate": 1.6844319531193375e-05, + "loss": 2.0561, + "step": 14618 + }, + { + "epoch": 2.7402061855670103, + "grad_norm": 55373.92578125, + "learning_rate": 1.6838438147141605e-05, + "loss": 2.1377, + "step": 14619 + }, + { + "epoch": 2.7403936269915654, + "grad_norm": 52377.40625, + "learning_rate": 1.6832557582137397e-05, + "loss": 2.1163, + "step": 14620 + }, + { + "epoch": 2.74058106841612, + "grad_norm": 61864.7734375, + "learning_rate": 1.6826677836326037e-05, + "loss": 2.2294, + "step": 14621 + }, + { + "epoch": 2.7407685098406747, + "grad_norm": 54763.69140625, + "learning_rate": 1.682079890985272e-05, + "loss": 2.1208, + "step": 14622 + }, + { + "epoch": 2.7409559512652297, + "grad_norm": 50986.43359375, + "learning_rate": 1.6814920802862637e-05, + "loss": 2.1207, + "step": 14623 + }, + { + "epoch": 2.7411433926897844, + "grad_norm": 52006.59765625, + "learning_rate": 1.6809043515500988e-05, + "loss": 2.118, + "step": 14624 + }, + { + "epoch": 2.7413308341143394, + "grad_norm": 58099.109375, + "learning_rate": 1.6803167047912942e-05, + "loss": 2.1428, + "step": 14625 + }, + { + "epoch": 2.741518275538894, + "grad_norm": 54364.5390625, + "learning_rate": 1.6797291400243638e-05, + "loss": 2.1155, + "step": 14626 + }, + { + "epoch": 2.741705716963449, + "grad_norm": 50903.52734375, + "learning_rate": 1.6791416572638163e-05, + "loss": 2.1467, + "step": 14627 + }, + { + "epoch": 2.7418931583880037, + "grad_norm": 54083.515625, + "learning_rate": 1.6785542565241663e-05, + "loss": 2.1621, + "step": 14628 + }, + { + "epoch": 2.7420805998125584, + "grad_norm": 55635.2109375, + "learning_rate": 1.677966937819918e-05, + "loss": 2.1361, + "step": 14629 + }, + { + "epoch": 2.7422680412371134, + "grad_norm": 53064.3203125, + "learning_rate": 1.677379701165581e-05, + "loss": 2.1512, + "step": 14630 + }, + { + "epoch": 2.7424554826616685, + "grad_norm": 53906.9453125, + "learning_rate": 1.6767925465756578e-05, + "loss": 2.156, + "step": 14631 + }, + { + "epoch": 2.742642924086223, + "grad_norm": 54367.02734375, + "learning_rate": 1.6762054740646487e-05, + "loss": 2.0594, + "step": 14632 + }, + { + "epoch": 2.7428303655107777, + "grad_norm": 56370.02734375, + "learning_rate": 1.6756184836470545e-05, + "loss": 2.139, + "step": 14633 + }, + { + "epoch": 2.743017806935333, + "grad_norm": 54761.02734375, + "learning_rate": 1.675031575337376e-05, + "loss": 2.06, + "step": 14634 + }, + { + "epoch": 2.7432052483598874, + "grad_norm": 54169.83203125, + "learning_rate": 1.674444749150106e-05, + "loss": 2.1078, + "step": 14635 + }, + { + "epoch": 2.7433926897844425, + "grad_norm": 58952.58203125, + "learning_rate": 1.673858005099737e-05, + "loss": 2.1235, + "step": 14636 + }, + { + "epoch": 2.743580131208997, + "grad_norm": 55188.1875, + "learning_rate": 1.6732713432007652e-05, + "loss": 2.1627, + "step": 14637 + }, + { + "epoch": 2.743767572633552, + "grad_norm": 50511.32421875, + "learning_rate": 1.672684763467678e-05, + "loss": 2.1289, + "step": 14638 + }, + { + "epoch": 2.743955014058107, + "grad_norm": 52480.58203125, + "learning_rate": 1.6720982659149614e-05, + "loss": 2.1268, + "step": 14639 + }, + { + "epoch": 2.7441424554826614, + "grad_norm": 51722.55078125, + "learning_rate": 1.671511850557103e-05, + "loss": 2.1002, + "step": 14640 + }, + { + "epoch": 2.7443298969072165, + "grad_norm": 55990.203125, + "learning_rate": 1.6709255174085897e-05, + "loss": 2.1194, + "step": 14641 + }, + { + "epoch": 2.7445173383317716, + "grad_norm": 56632.4609375, + "learning_rate": 1.6703392664838967e-05, + "loss": 2.1608, + "step": 14642 + }, + { + "epoch": 2.744704779756326, + "grad_norm": 55658.13671875, + "learning_rate": 1.669753097797507e-05, + "loss": 2.1507, + "step": 14643 + }, + { + "epoch": 2.744892221180881, + "grad_norm": 54399.703125, + "learning_rate": 1.669167011363899e-05, + "loss": 2.1145, + "step": 14644 + }, + { + "epoch": 2.745079662605436, + "grad_norm": 54026.96875, + "learning_rate": 1.6685810071975482e-05, + "loss": 2.151, + "step": 14645 + }, + { + "epoch": 2.7452671040299905, + "grad_norm": 57491.484375, + "learning_rate": 1.667995085312925e-05, + "loss": 2.1851, + "step": 14646 + }, + { + "epoch": 2.7454545454545456, + "grad_norm": 54004.765625, + "learning_rate": 1.667409245724504e-05, + "loss": 2.2182, + "step": 14647 + }, + { + "epoch": 2.7456419868791, + "grad_norm": 54549.94140625, + "learning_rate": 1.666823488446755e-05, + "loss": 2.1019, + "step": 14648 + }, + { + "epoch": 2.7458294283036553, + "grad_norm": 50364.1015625, + "learning_rate": 1.6662378134941437e-05, + "loss": 2.1086, + "step": 14649 + }, + { + "epoch": 2.74601686972821, + "grad_norm": 57473.63671875, + "learning_rate": 1.6656522208811354e-05, + "loss": 2.1666, + "step": 14650 + }, + { + "epoch": 2.7462043111527645, + "grad_norm": 57219.90625, + "learning_rate": 1.6650667106221953e-05, + "loss": 2.1353, + "step": 14651 + }, + { + "epoch": 2.7463917525773196, + "grad_norm": 56276.265625, + "learning_rate": 1.6644812827317842e-05, + "loss": 2.0626, + "step": 14652 + }, + { + "epoch": 2.7465791940018747, + "grad_norm": 52277.60546875, + "learning_rate": 1.6638959372243583e-05, + "loss": 2.0907, + "step": 14653 + }, + { + "epoch": 2.7467666354264293, + "grad_norm": 56331.23828125, + "learning_rate": 1.66331067411438e-05, + "loss": 2.15, + "step": 14654 + }, + { + "epoch": 2.746954076850984, + "grad_norm": 58710.5078125, + "learning_rate": 1.6627254934163e-05, + "loss": 2.1193, + "step": 14655 + }, + { + "epoch": 2.747141518275539, + "grad_norm": 53786.5, + "learning_rate": 1.6621403951445748e-05, + "loss": 2.1168, + "step": 14656 + }, + { + "epoch": 2.7473289597000936, + "grad_norm": 48901.109375, + "learning_rate": 1.6615553793136528e-05, + "loss": 2.1059, + "step": 14657 + }, + { + "epoch": 2.7475164011246487, + "grad_norm": 55977.66796875, + "learning_rate": 1.6609704459379864e-05, + "loss": 2.169, + "step": 14658 + }, + { + "epoch": 2.7477038425492033, + "grad_norm": 51601.0625, + "learning_rate": 1.660385595032019e-05, + "loss": 2.0674, + "step": 14659 + }, + { + "epoch": 2.7478912839737584, + "grad_norm": 53844.44140625, + "learning_rate": 1.6598008266101995e-05, + "loss": 2.1766, + "step": 14660 + }, + { + "epoch": 2.748078725398313, + "grad_norm": 57521.953125, + "learning_rate": 1.6592161406869684e-05, + "loss": 2.1144, + "step": 14661 + }, + { + "epoch": 2.7482661668228676, + "grad_norm": 58323.57421875, + "learning_rate": 1.658631537276766e-05, + "loss": 2.158, + "step": 14662 + }, + { + "epoch": 2.7484536082474227, + "grad_norm": 49454.640625, + "learning_rate": 1.6580470163940343e-05, + "loss": 2.1012, + "step": 14663 + }, + { + "epoch": 2.7486410496719778, + "grad_norm": 52662.38671875, + "learning_rate": 1.657462578053206e-05, + "loss": 2.1738, + "step": 14664 + }, + { + "epoch": 2.7488284910965324, + "grad_norm": 49160.1875, + "learning_rate": 1.656878222268721e-05, + "loss": 2.1345, + "step": 14665 + }, + { + "epoch": 2.749015932521087, + "grad_norm": 56605.4140625, + "learning_rate": 1.6562939490550072e-05, + "loss": 2.1435, + "step": 14666 + }, + { + "epoch": 2.749203373945642, + "grad_norm": 52784.53125, + "learning_rate": 1.655709758426499e-05, + "loss": 2.1784, + "step": 14667 + }, + { + "epoch": 2.7493908153701967, + "grad_norm": 51797.9765625, + "learning_rate": 1.6551256503976243e-05, + "loss": 2.061, + "step": 14668 + }, + { + "epoch": 2.7495782567947518, + "grad_norm": 53303.5234375, + "learning_rate": 1.6545416249828078e-05, + "loss": 2.2342, + "step": 14669 + }, + { + "epoch": 2.7497656982193064, + "grad_norm": 51568.47265625, + "learning_rate": 1.6539576821964757e-05, + "loss": 2.1395, + "step": 14670 + }, + { + "epoch": 2.7499531396438615, + "grad_norm": 56215.109375, + "learning_rate": 1.6533738220530538e-05, + "loss": 2.1584, + "step": 14671 + }, + { + "epoch": 2.750140581068416, + "grad_norm": 55641.953125, + "learning_rate": 1.6527900445669565e-05, + "loss": 2.0972, + "step": 14672 + }, + { + "epoch": 2.7503280224929707, + "grad_norm": 60685.265625, + "learning_rate": 1.6522063497526046e-05, + "loss": 2.1239, + "step": 14673 + }, + { + "epoch": 2.7505154639175258, + "grad_norm": 56421.328125, + "learning_rate": 1.6516227376244175e-05, + "loss": 2.1466, + "step": 14674 + }, + { + "epoch": 2.750702905342081, + "grad_norm": 55004.49609375, + "learning_rate": 1.6510392081968078e-05, + "loss": 2.1653, + "step": 14675 + }, + { + "epoch": 2.7508903467666355, + "grad_norm": 53177.8359375, + "learning_rate": 1.650455761484186e-05, + "loss": 2.0685, + "step": 14676 + }, + { + "epoch": 2.75107778819119, + "grad_norm": 57704.46875, + "learning_rate": 1.649872397500964e-05, + "loss": 2.1336, + "step": 14677 + }, + { + "epoch": 2.751265229615745, + "grad_norm": 55070.4375, + "learning_rate": 1.649289116261553e-05, + "loss": 2.0874, + "step": 14678 + }, + { + "epoch": 2.7514526710403, + "grad_norm": 53366.203125, + "learning_rate": 1.6487059177803543e-05, + "loss": 2.1238, + "step": 14679 + }, + { + "epoch": 2.751640112464855, + "grad_norm": 52230.2890625, + "learning_rate": 1.648122802071774e-05, + "loss": 2.0971, + "step": 14680 + }, + { + "epoch": 2.7518275538894095, + "grad_norm": 54289.7109375, + "learning_rate": 1.647539769150216e-05, + "loss": 2.1359, + "step": 14681 + }, + { + "epoch": 2.7520149953139645, + "grad_norm": 51222.5859375, + "learning_rate": 1.6469568190300793e-05, + "loss": 2.1392, + "step": 14682 + }, + { + "epoch": 2.752202436738519, + "grad_norm": 54103.49609375, + "learning_rate": 1.6463739517257603e-05, + "loss": 2.1907, + "step": 14683 + }, + { + "epoch": 2.752389878163074, + "grad_norm": 53314.6328125, + "learning_rate": 1.6457911672516585e-05, + "loss": 2.0529, + "step": 14684 + }, + { + "epoch": 2.752577319587629, + "grad_norm": 51900.59765625, + "learning_rate": 1.6452084656221646e-05, + "loss": 2.2412, + "step": 14685 + }, + { + "epoch": 2.752764761012184, + "grad_norm": 56780.9921875, + "learning_rate": 1.6446258468516733e-05, + "loss": 2.0953, + "step": 14686 + }, + { + "epoch": 2.7529522024367385, + "grad_norm": 52177.5625, + "learning_rate": 1.6440433109545717e-05, + "loss": 2.1539, + "step": 14687 + }, + { + "epoch": 2.753139643861293, + "grad_norm": 52386.55078125, + "learning_rate": 1.643460857945251e-05, + "loss": 2.1332, + "step": 14688 + }, + { + "epoch": 2.7533270852858482, + "grad_norm": 56116.77734375, + "learning_rate": 1.6428784878380936e-05, + "loss": 2.1366, + "step": 14689 + }, + { + "epoch": 2.753514526710403, + "grad_norm": 58305.80078125, + "learning_rate": 1.6422962006474867e-05, + "loss": 2.1855, + "step": 14690 + }, + { + "epoch": 2.753701968134958, + "grad_norm": 54864.37109375, + "learning_rate": 1.6417139963878102e-05, + "loss": 2.1132, + "step": 14691 + }, + { + "epoch": 2.7538894095595126, + "grad_norm": 57504.72265625, + "learning_rate": 1.641131875073442e-05, + "loss": 2.0858, + "step": 14692 + }, + { + "epoch": 2.7540768509840676, + "grad_norm": 49660.6171875, + "learning_rate": 1.640549836718764e-05, + "loss": 2.1, + "step": 14693 + }, + { + "epoch": 2.7542642924086223, + "grad_norm": 53345.9453125, + "learning_rate": 1.6399678813381485e-05, + "loss": 2.1904, + "step": 14694 + }, + { + "epoch": 2.754451733833177, + "grad_norm": 53335.265625, + "learning_rate": 1.6393860089459696e-05, + "loss": 2.0575, + "step": 14695 + }, + { + "epoch": 2.754639175257732, + "grad_norm": 56171.7734375, + "learning_rate": 1.6388042195565977e-05, + "loss": 2.1066, + "step": 14696 + }, + { + "epoch": 2.754826616682287, + "grad_norm": 54665.60546875, + "learning_rate": 1.6382225131844065e-05, + "loss": 2.0793, + "step": 14697 + }, + { + "epoch": 2.7550140581068416, + "grad_norm": 51926.4765625, + "learning_rate": 1.63764088984376e-05, + "loss": 2.1129, + "step": 14698 + }, + { + "epoch": 2.7552014995313963, + "grad_norm": 54129.01171875, + "learning_rate": 1.6370593495490228e-05, + "loss": 2.1434, + "step": 14699 + }, + { + "epoch": 2.7553889409559513, + "grad_norm": 62287.98046875, + "learning_rate": 1.6364778923145596e-05, + "loss": 2.1648, + "step": 14700 + }, + { + "epoch": 2.755576382380506, + "grad_norm": 57105.71484375, + "learning_rate": 1.6358965181547348e-05, + "loss": 2.2106, + "step": 14701 + }, + { + "epoch": 2.755763823805061, + "grad_norm": 50437.7578125, + "learning_rate": 1.6353152270839012e-05, + "loss": 2.1383, + "step": 14702 + }, + { + "epoch": 2.7559512652296156, + "grad_norm": 50322.09765625, + "learning_rate": 1.634734019116419e-05, + "loss": 2.1891, + "step": 14703 + }, + { + "epoch": 2.7561387066541707, + "grad_norm": 56517.04296875, + "learning_rate": 1.634152894266645e-05, + "loss": 2.1775, + "step": 14704 + }, + { + "epoch": 2.7563261480787253, + "grad_norm": 49770.1953125, + "learning_rate": 1.6335718525489302e-05, + "loss": 2.1077, + "step": 14705 + }, + { + "epoch": 2.75651358950328, + "grad_norm": 56059.7421875, + "learning_rate": 1.632990893977625e-05, + "loss": 2.1536, + "step": 14706 + }, + { + "epoch": 2.756701030927835, + "grad_norm": 55448.8984375, + "learning_rate": 1.632410018567081e-05, + "loss": 2.1212, + "step": 14707 + }, + { + "epoch": 2.75688847235239, + "grad_norm": 58987.06640625, + "learning_rate": 1.6318292263316432e-05, + "loss": 2.1449, + "step": 14708 + }, + { + "epoch": 2.7570759137769447, + "grad_norm": 58455.41015625, + "learning_rate": 1.6312485172856547e-05, + "loss": 2.0791, + "step": 14709 + }, + { + "epoch": 2.7572633552014993, + "grad_norm": 57711.19921875, + "learning_rate": 1.6306678914434602e-05, + "loss": 2.1455, + "step": 14710 + }, + { + "epoch": 2.7574507966260544, + "grad_norm": 54829.46875, + "learning_rate": 1.6300873488194024e-05, + "loss": 2.2298, + "step": 14711 + }, + { + "epoch": 2.757638238050609, + "grad_norm": 53639.48828125, + "learning_rate": 1.6295068894278177e-05, + "loss": 2.0741, + "step": 14712 + }, + { + "epoch": 2.757825679475164, + "grad_norm": 54170.984375, + "learning_rate": 1.628926513283041e-05, + "loss": 2.0913, + "step": 14713 + }, + { + "epoch": 2.7580131208997187, + "grad_norm": 56128.48828125, + "learning_rate": 1.628346220399411e-05, + "loss": 2.1444, + "step": 14714 + }, + { + "epoch": 2.758200562324274, + "grad_norm": 51015.43359375, + "learning_rate": 1.6277660107912562e-05, + "loss": 2.1385, + "step": 14715 + }, + { + "epoch": 2.7583880037488284, + "grad_norm": 53036.1171875, + "learning_rate": 1.62718588447291e-05, + "loss": 2.1935, + "step": 14716 + }, + { + "epoch": 2.758575445173383, + "grad_norm": 55912.3828125, + "learning_rate": 1.6266058414586983e-05, + "loss": 2.1537, + "step": 14717 + }, + { + "epoch": 2.758762886597938, + "grad_norm": 50991.5546875, + "learning_rate": 1.6260258817629502e-05, + "loss": 2.1527, + "step": 14718 + }, + { + "epoch": 2.758950328022493, + "grad_norm": 55534.83984375, + "learning_rate": 1.625446005399988e-05, + "loss": 2.1209, + "step": 14719 + }, + { + "epoch": 2.759137769447048, + "grad_norm": 55637.1015625, + "learning_rate": 1.6248662123841328e-05, + "loss": 2.1035, + "step": 14720 + }, + { + "epoch": 2.7593252108716024, + "grad_norm": 56462.89453125, + "learning_rate": 1.6242865027297084e-05, + "loss": 2.172, + "step": 14721 + }, + { + "epoch": 2.7595126522961575, + "grad_norm": 56110.0625, + "learning_rate": 1.6237068764510288e-05, + "loss": 2.0462, + "step": 14722 + }, + { + "epoch": 2.759700093720712, + "grad_norm": 55524.9921875, + "learning_rate": 1.623127333562413e-05, + "loss": 2.1122, + "step": 14723 + }, + { + "epoch": 2.759887535145267, + "grad_norm": 53191.90234375, + "learning_rate": 1.6225478740781746e-05, + "loss": 2.1626, + "step": 14724 + }, + { + "epoch": 2.760074976569822, + "grad_norm": 53300.25390625, + "learning_rate": 1.621968498012623e-05, + "loss": 2.1231, + "step": 14725 + }, + { + "epoch": 2.760262417994377, + "grad_norm": 51496.58203125, + "learning_rate": 1.6213892053800696e-05, + "loss": 2.1464, + "step": 14726 + }, + { + "epoch": 2.7604498594189315, + "grad_norm": 56364.5859375, + "learning_rate": 1.6208099961948242e-05, + "loss": 2.0824, + "step": 14727 + }, + { + "epoch": 2.7606373008434866, + "grad_norm": 51878.14453125, + "learning_rate": 1.6202308704711904e-05, + "loss": 2.1549, + "step": 14728 + }, + { + "epoch": 2.760824742268041, + "grad_norm": 51226.5, + "learning_rate": 1.6196518282234708e-05, + "loss": 2.068, + "step": 14729 + }, + { + "epoch": 2.7610121836925963, + "grad_norm": 55177.921875, + "learning_rate": 1.6190728694659703e-05, + "loss": 2.1977, + "step": 14730 + }, + { + "epoch": 2.761199625117151, + "grad_norm": 53683.5390625, + "learning_rate": 1.6184939942129863e-05, + "loss": 2.1722, + "step": 14731 + }, + { + "epoch": 2.7613870665417055, + "grad_norm": 54888.15234375, + "learning_rate": 1.6179152024788143e-05, + "loss": 2.1604, + "step": 14732 + }, + { + "epoch": 2.7615745079662606, + "grad_norm": 59824.92578125, + "learning_rate": 1.6173364942777525e-05, + "loss": 2.1371, + "step": 14733 + }, + { + "epoch": 2.761761949390815, + "grad_norm": 55591.64453125, + "learning_rate": 1.6167578696240965e-05, + "loss": 2.159, + "step": 14734 + }, + { + "epoch": 2.7619493908153703, + "grad_norm": 59350.12890625, + "learning_rate": 1.6161793285321313e-05, + "loss": 2.2015, + "step": 14735 + }, + { + "epoch": 2.762136832239925, + "grad_norm": 50616.1328125, + "learning_rate": 1.6156008710161497e-05, + "loss": 2.1406, + "step": 14736 + }, + { + "epoch": 2.76232427366448, + "grad_norm": 54318.75390625, + "learning_rate": 1.6150224970904403e-05, + "loss": 2.139, + "step": 14737 + }, + { + "epoch": 2.7625117150890346, + "grad_norm": 50367.2578125, + "learning_rate": 1.6144442067692866e-05, + "loss": 2.1652, + "step": 14738 + }, + { + "epoch": 2.7626991565135897, + "grad_norm": 52760.67578125, + "learning_rate": 1.6138660000669697e-05, + "loss": 2.1097, + "step": 14739 + }, + { + "epoch": 2.7628865979381443, + "grad_norm": 53639.01171875, + "learning_rate": 1.613287876997773e-05, + "loss": 2.0879, + "step": 14740 + }, + { + "epoch": 2.7630740393626994, + "grad_norm": 54699.328125, + "learning_rate": 1.6127098375759758e-05, + "loss": 2.0746, + "step": 14741 + }, + { + "epoch": 2.763261480787254, + "grad_norm": 50219.45703125, + "learning_rate": 1.6121318818158536e-05, + "loss": 2.1534, + "step": 14742 + }, + { + "epoch": 2.7634489222118086, + "grad_norm": 52881.67578125, + "learning_rate": 1.6115540097316806e-05, + "loss": 2.1233, + "step": 14743 + }, + { + "epoch": 2.7636363636363637, + "grad_norm": 54526.16015625, + "learning_rate": 1.6109762213377315e-05, + "loss": 2.124, + "step": 14744 + }, + { + "epoch": 2.7638238050609187, + "grad_norm": 55590.8046875, + "learning_rate": 1.610398516648275e-05, + "loss": 2.1906, + "step": 14745 + }, + { + "epoch": 2.7640112464854734, + "grad_norm": 52749.53125, + "learning_rate": 1.6098208956775813e-05, + "loss": 2.1573, + "step": 14746 + }, + { + "epoch": 2.764198687910028, + "grad_norm": 53130.390625, + "learning_rate": 1.6092433584399163e-05, + "loss": 2.1194, + "step": 14747 + }, + { + "epoch": 2.764386129334583, + "grad_norm": 54796.4921875, + "learning_rate": 1.6086659049495422e-05, + "loss": 2.2203, + "step": 14748 + }, + { + "epoch": 2.7645735707591377, + "grad_norm": 55756.2578125, + "learning_rate": 1.6080885352207253e-05, + "loss": 2.0774, + "step": 14749 + }, + { + "epoch": 2.7647610121836927, + "grad_norm": 60197.73046875, + "learning_rate": 1.6075112492677226e-05, + "loss": 2.0847, + "step": 14750 + }, + { + "epoch": 2.7649484536082474, + "grad_norm": 55274.1953125, + "learning_rate": 1.606934047104795e-05, + "loss": 2.0792, + "step": 14751 + }, + { + "epoch": 2.7651358950328024, + "grad_norm": 59906.5859375, + "learning_rate": 1.6063569287461954e-05, + "loss": 2.0932, + "step": 14752 + }, + { + "epoch": 2.765323336457357, + "grad_norm": 62578.328125, + "learning_rate": 1.6057798942061813e-05, + "loss": 2.1626, + "step": 14753 + }, + { + "epoch": 2.7655107778819117, + "grad_norm": 49113.23046875, + "learning_rate": 1.6052029434990034e-05, + "loss": 2.136, + "step": 14754 + }, + { + "epoch": 2.7656982193064668, + "grad_norm": 55229.60546875, + "learning_rate": 1.604626076638909e-05, + "loss": 2.1714, + "step": 14755 + }, + { + "epoch": 2.765885660731022, + "grad_norm": 57164.44140625, + "learning_rate": 1.6040492936401498e-05, + "loss": 2.05, + "step": 14756 + }, + { + "epoch": 2.7660731021555764, + "grad_norm": 53839.4921875, + "learning_rate": 1.6034725945169703e-05, + "loss": 2.1594, + "step": 14757 + }, + { + "epoch": 2.766260543580131, + "grad_norm": 53639.47265625, + "learning_rate": 1.602895979283615e-05, + "loss": 2.1083, + "step": 14758 + }, + { + "epoch": 2.766447985004686, + "grad_norm": 52315.23828125, + "learning_rate": 1.602319447954323e-05, + "loss": 2.1365, + "step": 14759 + }, + { + "epoch": 2.7666354264292408, + "grad_norm": 59563.9609375, + "learning_rate": 1.6017430005433366e-05, + "loss": 2.1608, + "step": 14760 + }, + { + "epoch": 2.766822867853796, + "grad_norm": 53002.6953125, + "learning_rate": 1.6011666370648924e-05, + "loss": 2.1735, + "step": 14761 + }, + { + "epoch": 2.7670103092783505, + "grad_norm": 56986.484375, + "learning_rate": 1.600590357533224e-05, + "loss": 2.1012, + "step": 14762 + }, + { + "epoch": 2.7671977507029055, + "grad_norm": 52423.18359375, + "learning_rate": 1.6000141619625663e-05, + "loss": 2.1464, + "step": 14763 + }, + { + "epoch": 2.76738519212746, + "grad_norm": 54446.27734375, + "learning_rate": 1.5994380503671536e-05, + "loss": 2.1787, + "step": 14764 + }, + { + "epoch": 2.7675726335520148, + "grad_norm": 56612.44140625, + "learning_rate": 1.5988620227612094e-05, + "loss": 1.9983, + "step": 14765 + }, + { + "epoch": 2.76776007497657, + "grad_norm": 54551.69140625, + "learning_rate": 1.5982860791589633e-05, + "loss": 2.2414, + "step": 14766 + }, + { + "epoch": 2.767947516401125, + "grad_norm": 60608.16796875, + "learning_rate": 1.5977102195746423e-05, + "loss": 2.035, + "step": 14767 + }, + { + "epoch": 2.7681349578256795, + "grad_norm": 53654.38671875, + "learning_rate": 1.5971344440224677e-05, + "loss": 2.1083, + "step": 14768 + }, + { + "epoch": 2.768322399250234, + "grad_norm": 53011.265625, + "learning_rate": 1.5965587525166593e-05, + "loss": 2.1676, + "step": 14769 + }, + { + "epoch": 2.7685098406747892, + "grad_norm": 55217.171875, + "learning_rate": 1.595983145071437e-05, + "loss": 2.1482, + "step": 14770 + }, + { + "epoch": 2.768697282099344, + "grad_norm": 54726.12890625, + "learning_rate": 1.5954076217010188e-05, + "loss": 2.0933, + "step": 14771 + }, + { + "epoch": 2.768884723523899, + "grad_norm": 60475.6171875, + "learning_rate": 1.5948321824196182e-05, + "loss": 2.1884, + "step": 14772 + }, + { + "epoch": 2.7690721649484535, + "grad_norm": 50546.11328125, + "learning_rate": 1.5942568272414464e-05, + "loss": 2.1394, + "step": 14773 + }, + { + "epoch": 2.7692596063730086, + "grad_norm": 53084.9296875, + "learning_rate": 1.5936815561807173e-05, + "loss": 2.0986, + "step": 14774 + }, + { + "epoch": 2.7694470477975632, + "grad_norm": 54959.5, + "learning_rate": 1.5931063692516373e-05, + "loss": 2.0608, + "step": 14775 + }, + { + "epoch": 2.769634489222118, + "grad_norm": 55991.07421875, + "learning_rate": 1.5925312664684112e-05, + "loss": 2.1663, + "step": 14776 + }, + { + "epoch": 2.769821930646673, + "grad_norm": 54673.23046875, + "learning_rate": 1.5919562478452464e-05, + "loss": 2.151, + "step": 14777 + }, + { + "epoch": 2.770009372071228, + "grad_norm": 54318.73828125, + "learning_rate": 1.5913813133963423e-05, + "loss": 2.1582, + "step": 14778 + }, + { + "epoch": 2.7701968134957826, + "grad_norm": 54277.34765625, + "learning_rate": 1.5908064631359015e-05, + "loss": 2.0555, + "step": 14779 + }, + { + "epoch": 2.7703842549203372, + "grad_norm": 52807.1484375, + "learning_rate": 1.59023169707812e-05, + "loss": 2.0797, + "step": 14780 + }, + { + "epoch": 2.7705716963448923, + "grad_norm": 57119.69921875, + "learning_rate": 1.5896570152371958e-05, + "loss": 2.0913, + "step": 14781 + }, + { + "epoch": 2.770759137769447, + "grad_norm": 52040.8203125, + "learning_rate": 1.5890824176273207e-05, + "loss": 2.1183, + "step": 14782 + }, + { + "epoch": 2.770946579194002, + "grad_norm": 56298.69921875, + "learning_rate": 1.588507904262689e-05, + "loss": 2.0661, + "step": 14783 + }, + { + "epoch": 2.7711340206185566, + "grad_norm": 56840.40234375, + "learning_rate": 1.5879334751574887e-05, + "loss": 2.1696, + "step": 14784 + }, + { + "epoch": 2.7713214620431117, + "grad_norm": 53091.0703125, + "learning_rate": 1.5873591303259068e-05, + "loss": 2.1586, + "step": 14785 + }, + { + "epoch": 2.7715089034676663, + "grad_norm": 56957.34765625, + "learning_rate": 1.5867848697821292e-05, + "loss": 2.0625, + "step": 14786 + }, + { + "epoch": 2.771696344892221, + "grad_norm": 53431.35546875, + "learning_rate": 1.5862106935403432e-05, + "loss": 2.1058, + "step": 14787 + }, + { + "epoch": 2.771883786316776, + "grad_norm": 51304.67578125, + "learning_rate": 1.585636601614724e-05, + "loss": 2.1082, + "step": 14788 + }, + { + "epoch": 2.772071227741331, + "grad_norm": 53346.9140625, + "learning_rate": 1.5850625940194542e-05, + "loss": 2.1201, + "step": 14789 + }, + { + "epoch": 2.7722586691658857, + "grad_norm": 56080.55859375, + "learning_rate": 1.5844886707687123e-05, + "loss": 2.1699, + "step": 14790 + }, + { + "epoch": 2.7724461105904403, + "grad_norm": 57205.5546875, + "learning_rate": 1.5839148318766718e-05, + "loss": 2.2199, + "step": 14791 + }, + { + "epoch": 2.7726335520149954, + "grad_norm": 54963.83984375, + "learning_rate": 1.5833410773575042e-05, + "loss": 2.1546, + "step": 14792 + }, + { + "epoch": 2.77282099343955, + "grad_norm": 57313.90625, + "learning_rate": 1.5827674072253824e-05, + "loss": 2.0245, + "step": 14793 + }, + { + "epoch": 2.773008434864105, + "grad_norm": 52155.06640625, + "learning_rate": 1.5821938214944787e-05, + "loss": 2.1357, + "step": 14794 + }, + { + "epoch": 2.7731958762886597, + "grad_norm": 58595.16015625, + "learning_rate": 1.5816203201789524e-05, + "loss": 2.0452, + "step": 14795 + }, + { + "epoch": 2.773383317713215, + "grad_norm": 55569.3125, + "learning_rate": 1.5810469032929726e-05, + "loss": 2.1069, + "step": 14796 + }, + { + "epoch": 2.7735707591377694, + "grad_norm": 61081.078125, + "learning_rate": 1.5804735708507035e-05, + "loss": 2.1319, + "step": 14797 + }, + { + "epoch": 2.773758200562324, + "grad_norm": 57007.69921875, + "learning_rate": 1.5799003228663033e-05, + "loss": 2.116, + "step": 14798 + }, + { + "epoch": 2.773945641986879, + "grad_norm": 55545.33984375, + "learning_rate": 1.5793271593539294e-05, + "loss": 2.1693, + "step": 14799 + }, + { + "epoch": 2.774133083411434, + "grad_norm": 56048.21484375, + "learning_rate": 1.578754080327741e-05, + "loss": 2.1372, + "step": 14800 + }, + { + "epoch": 2.774320524835989, + "grad_norm": 53401.9453125, + "learning_rate": 1.5781810858018896e-05, + "loss": 2.0474, + "step": 14801 + }, + { + "epoch": 2.7745079662605434, + "grad_norm": 57049.81640625, + "learning_rate": 1.577608175790531e-05, + "loss": 2.1347, + "step": 14802 + }, + { + "epoch": 2.7746954076850985, + "grad_norm": 59287.70703125, + "learning_rate": 1.577035350307811e-05, + "loss": 2.0519, + "step": 14803 + }, + { + "epoch": 2.774882849109653, + "grad_norm": 54819.53125, + "learning_rate": 1.5764626093678818e-05, + "loss": 2.0562, + "step": 14804 + }, + { + "epoch": 2.775070290534208, + "grad_norm": 52674.171875, + "learning_rate": 1.575889952984888e-05, + "loss": 2.1012, + "step": 14805 + }, + { + "epoch": 2.775257731958763, + "grad_norm": 52923.33984375, + "learning_rate": 1.5753173811729705e-05, + "loss": 2.1523, + "step": 14806 + }, + { + "epoch": 2.775445173383318, + "grad_norm": 50989.9296875, + "learning_rate": 1.5747448939462757e-05, + "loss": 2.0864, + "step": 14807 + }, + { + "epoch": 2.7756326148078725, + "grad_norm": 53585.84765625, + "learning_rate": 1.5741724913189393e-05, + "loss": 2.1192, + "step": 14808 + }, + { + "epoch": 2.775820056232427, + "grad_norm": 53469.3828125, + "learning_rate": 1.5736001733051026e-05, + "loss": 2.126, + "step": 14809 + }, + { + "epoch": 2.776007497656982, + "grad_norm": 54954.71875, + "learning_rate": 1.573027939918897e-05, + "loss": 2.1998, + "step": 14810 + }, + { + "epoch": 2.7761949390815372, + "grad_norm": 56331.9921875, + "learning_rate": 1.5724557911744602e-05, + "loss": 2.0545, + "step": 14811 + }, + { + "epoch": 2.776382380506092, + "grad_norm": 56825.375, + "learning_rate": 1.57188372708592e-05, + "loss": 2.0259, + "step": 14812 + }, + { + "epoch": 2.7765698219306465, + "grad_norm": 56184.8359375, + "learning_rate": 1.5713117476674083e-05, + "loss": 2.1151, + "step": 14813 + }, + { + "epoch": 2.7767572633552016, + "grad_norm": 61648.8125, + "learning_rate": 1.5707398529330512e-05, + "loss": 2.1348, + "step": 14814 + }, + { + "epoch": 2.776944704779756, + "grad_norm": 53971.9375, + "learning_rate": 1.5701680428969717e-05, + "loss": 2.1064, + "step": 14815 + }, + { + "epoch": 2.7771321462043113, + "grad_norm": 52709.7734375, + "learning_rate": 1.569596317573297e-05, + "loss": 2.14, + "step": 14816 + }, + { + "epoch": 2.777319587628866, + "grad_norm": 60210.43359375, + "learning_rate": 1.5690246769761452e-05, + "loss": 1.9945, + "step": 14817 + }, + { + "epoch": 2.777507029053421, + "grad_norm": 54697.6328125, + "learning_rate": 1.568453121119634e-05, + "loss": 2.1319, + "step": 14818 + }, + { + "epoch": 2.7776944704779756, + "grad_norm": 55151.40625, + "learning_rate": 1.5678816500178824e-05, + "loss": 2.0732, + "step": 14819 + }, + { + "epoch": 2.77788191190253, + "grad_norm": 50594.7421875, + "learning_rate": 1.5673102636850046e-05, + "loss": 2.1159, + "step": 14820 + }, + { + "epoch": 2.7780693533270853, + "grad_norm": 53153.58984375, + "learning_rate": 1.5667389621351137e-05, + "loss": 2.1051, + "step": 14821 + }, + { + "epoch": 2.7782567947516403, + "grad_norm": 55350.8125, + "learning_rate": 1.566167745382317e-05, + "loss": 2.1804, + "step": 14822 + }, + { + "epoch": 2.778444236176195, + "grad_norm": 55716.7890625, + "learning_rate": 1.5655966134407267e-05, + "loss": 2.1353, + "step": 14823 + }, + { + "epoch": 2.7786316776007496, + "grad_norm": 54983.76171875, + "learning_rate": 1.5650255663244472e-05, + "loss": 2.1605, + "step": 14824 + }, + { + "epoch": 2.7788191190253047, + "grad_norm": 54424.0859375, + "learning_rate": 1.564454604047581e-05, + "loss": 2.1293, + "step": 14825 + }, + { + "epoch": 2.7790065604498593, + "grad_norm": 56210.25390625, + "learning_rate": 1.5638837266242325e-05, + "loss": 2.1735, + "step": 14826 + }, + { + "epoch": 2.7791940018744143, + "grad_norm": 56692.4375, + "learning_rate": 1.563312934068502e-05, + "loss": 2.1277, + "step": 14827 + }, + { + "epoch": 2.779381443298969, + "grad_norm": 53381.78515625, + "learning_rate": 1.5627422263944864e-05, + "loss": 2.0984, + "step": 14828 + }, + { + "epoch": 2.779568884723524, + "grad_norm": 53086.875, + "learning_rate": 1.5621716036162798e-05, + "loss": 2.1126, + "step": 14829 + }, + { + "epoch": 2.7797563261480787, + "grad_norm": 53792.8671875, + "learning_rate": 1.561601065747979e-05, + "loss": 2.1034, + "step": 14830 + }, + { + "epoch": 2.7799437675726333, + "grad_norm": 51865.75390625, + "learning_rate": 1.5610306128036733e-05, + "loss": 2.0994, + "step": 14831 + }, + { + "epoch": 2.7801312089971884, + "grad_norm": 54515.31640625, + "learning_rate": 1.560460244797452e-05, + "loss": 2.1778, + "step": 14832 + }, + { + "epoch": 2.7803186504217434, + "grad_norm": 56827.94140625, + "learning_rate": 1.5598899617434033e-05, + "loss": 2.1453, + "step": 14833 + }, + { + "epoch": 2.780506091846298, + "grad_norm": 54505.9609375, + "learning_rate": 1.5593197636556133e-05, + "loss": 2.0259, + "step": 14834 + }, + { + "epoch": 2.7806935332708527, + "grad_norm": 50437.44140625, + "learning_rate": 1.5587496505481648e-05, + "loss": 2.1309, + "step": 14835 + }, + { + "epoch": 2.7808809746954077, + "grad_norm": 52432.46484375, + "learning_rate": 1.5581796224351363e-05, + "loss": 2.1262, + "step": 14836 + }, + { + "epoch": 2.7810684161199624, + "grad_norm": 54519.35546875, + "learning_rate": 1.5576096793306105e-05, + "loss": 2.0351, + "step": 14837 + }, + { + "epoch": 2.7812558575445174, + "grad_norm": 55144.98828125, + "learning_rate": 1.5570398212486608e-05, + "loss": 2.155, + "step": 14838 + }, + { + "epoch": 2.781443298969072, + "grad_norm": 60980.3671875, + "learning_rate": 1.556470048203365e-05, + "loss": 2.1286, + "step": 14839 + }, + { + "epoch": 2.781630740393627, + "grad_norm": 55259.42578125, + "learning_rate": 1.5559003602087952e-05, + "loss": 2.0546, + "step": 14840 + }, + { + "epoch": 2.7818181818181817, + "grad_norm": 51532.09765625, + "learning_rate": 1.555330757279019e-05, + "loss": 2.1501, + "step": 14841 + }, + { + "epoch": 2.7820056232427364, + "grad_norm": 59407.234375, + "learning_rate": 1.554761239428107e-05, + "loss": 2.1594, + "step": 14842 + }, + { + "epoch": 2.7821930646672914, + "grad_norm": 56725.5234375, + "learning_rate": 1.5541918066701273e-05, + "loss": 2.0723, + "step": 14843 + }, + { + "epoch": 2.7823805060918465, + "grad_norm": 58366.453125, + "learning_rate": 1.5536224590191424e-05, + "loss": 2.2039, + "step": 14844 + }, + { + "epoch": 2.782567947516401, + "grad_norm": 59122.9609375, + "learning_rate": 1.5530531964892125e-05, + "loss": 2.0187, + "step": 14845 + }, + { + "epoch": 2.7827553889409558, + "grad_norm": 53368.23046875, + "learning_rate": 1.552484019094402e-05, + "loss": 2.0659, + "step": 14846 + }, + { + "epoch": 2.782942830365511, + "grad_norm": 53139.19921875, + "learning_rate": 1.5519149268487658e-05, + "loss": 2.1479, + "step": 14847 + }, + { + "epoch": 2.7831302717900654, + "grad_norm": 63036.5546875, + "learning_rate": 1.551345919766359e-05, + "loss": 2.1486, + "step": 14848 + }, + { + "epoch": 2.7833177132146205, + "grad_norm": 59229.28125, + "learning_rate": 1.550776997861237e-05, + "loss": 2.0921, + "step": 14849 + }, + { + "epoch": 2.783505154639175, + "grad_norm": 52776.12109375, + "learning_rate": 1.550208161147452e-05, + "loss": 2.0649, + "step": 14850 + }, + { + "epoch": 2.78369259606373, + "grad_norm": 53875.61328125, + "learning_rate": 1.5496394096390537e-05, + "loss": 2.0962, + "step": 14851 + }, + { + "epoch": 2.783880037488285, + "grad_norm": 55164.640625, + "learning_rate": 1.549070743350086e-05, + "loss": 2.0725, + "step": 14852 + }, + { + "epoch": 2.78406747891284, + "grad_norm": 51079.12109375, + "learning_rate": 1.548502162294599e-05, + "loss": 2.1442, + "step": 14853 + }, + { + "epoch": 2.7842549203373945, + "grad_norm": 54559.33203125, + "learning_rate": 1.5479336664866333e-05, + "loss": 2.0934, + "step": 14854 + }, + { + "epoch": 2.7844423617619496, + "grad_norm": 56919.4296875, + "learning_rate": 1.547365255940229e-05, + "loss": 2.1033, + "step": 14855 + }, + { + "epoch": 2.784629803186504, + "grad_norm": 55305.20703125, + "learning_rate": 1.5467969306694262e-05, + "loss": 2.0204, + "step": 14856 + }, + { + "epoch": 2.784817244611059, + "grad_norm": 50373.0078125, + "learning_rate": 1.5462286906882657e-05, + "loss": 2.1302, + "step": 14857 + }, + { + "epoch": 2.785004686035614, + "grad_norm": 53944.796875, + "learning_rate": 1.545660536010775e-05, + "loss": 2.1089, + "step": 14858 + }, + { + "epoch": 2.7851921274601685, + "grad_norm": 55472.61328125, + "learning_rate": 1.5450924666509908e-05, + "loss": 2.1315, + "step": 14859 + }, + { + "epoch": 2.7853795688847236, + "grad_norm": 55386.7421875, + "learning_rate": 1.5445244826229448e-05, + "loss": 2.1986, + "step": 14860 + }, + { + "epoch": 2.7855670103092782, + "grad_norm": 57496.54296875, + "learning_rate": 1.543956583940664e-05, + "loss": 2.1126, + "step": 14861 + }, + { + "epoch": 2.7857544517338333, + "grad_norm": 52945.86328125, + "learning_rate": 1.543388770618173e-05, + "loss": 2.1176, + "step": 14862 + }, + { + "epoch": 2.785941893158388, + "grad_norm": 53848.51171875, + "learning_rate": 1.5428210426694978e-05, + "loss": 2.1341, + "step": 14863 + }, + { + "epoch": 2.786129334582943, + "grad_norm": 54157.81640625, + "learning_rate": 1.5422534001086625e-05, + "loss": 2.1295, + "step": 14864 + }, + { + "epoch": 2.7863167760074976, + "grad_norm": 52602.0625, + "learning_rate": 1.541685842949685e-05, + "loss": 2.0765, + "step": 14865 + }, + { + "epoch": 2.7865042174320527, + "grad_norm": 54626.8203125, + "learning_rate": 1.541118371206582e-05, + "loss": 2.1548, + "step": 14866 + }, + { + "epoch": 2.7866916588566073, + "grad_norm": 50843.015625, + "learning_rate": 1.5405509848933718e-05, + "loss": 2.1232, + "step": 14867 + }, + { + "epoch": 2.786879100281162, + "grad_norm": 57444.171875, + "learning_rate": 1.5399836840240654e-05, + "loss": 2.1109, + "step": 14868 + }, + { + "epoch": 2.787066541705717, + "grad_norm": 54962.1484375, + "learning_rate": 1.539416468612678e-05, + "loss": 2.1628, + "step": 14869 + }, + { + "epoch": 2.787253983130272, + "grad_norm": 54819.5625, + "learning_rate": 1.538849338673217e-05, + "loss": 2.0833, + "step": 14870 + }, + { + "epoch": 2.7874414245548267, + "grad_norm": 54976.74609375, + "learning_rate": 1.5382822942196883e-05, + "loss": 2.2074, + "step": 14871 + }, + { + "epoch": 2.7876288659793813, + "grad_norm": 56892.48046875, + "learning_rate": 1.5377153352661006e-05, + "loss": 2.1617, + "step": 14872 + }, + { + "epoch": 2.7878163074039364, + "grad_norm": 51435.01171875, + "learning_rate": 1.5371484618264537e-05, + "loss": 2.1234, + "step": 14873 + }, + { + "epoch": 2.788003748828491, + "grad_norm": 51346.1875, + "learning_rate": 1.536581673914751e-05, + "loss": 2.1031, + "step": 14874 + }, + { + "epoch": 2.788191190253046, + "grad_norm": 55704.7578125, + "learning_rate": 1.5360149715449895e-05, + "loss": 2.1899, + "step": 14875 + }, + { + "epoch": 2.7883786316776007, + "grad_norm": 53238.45703125, + "learning_rate": 1.5354483547311688e-05, + "loss": 2.1284, + "step": 14876 + }, + { + "epoch": 2.7885660731021558, + "grad_norm": 56151.62890625, + "learning_rate": 1.5348818234872814e-05, + "loss": 2.0454, + "step": 14877 + }, + { + "epoch": 2.7887535145267104, + "grad_norm": 57952.171875, + "learning_rate": 1.5343153778273188e-05, + "loss": 2.1615, + "step": 14878 + }, + { + "epoch": 2.788940955951265, + "grad_norm": 54774.2109375, + "learning_rate": 1.5337490177652736e-05, + "loss": 2.14, + "step": 14879 + }, + { + "epoch": 2.78912839737582, + "grad_norm": 56510.4140625, + "learning_rate": 1.5331827433151363e-05, + "loss": 2.1294, + "step": 14880 + }, + { + "epoch": 2.789315838800375, + "grad_norm": 55209.703125, + "learning_rate": 1.532616554490888e-05, + "loss": 2.1739, + "step": 14881 + }, + { + "epoch": 2.7895032802249298, + "grad_norm": 49721.0234375, + "learning_rate": 1.5320504513065143e-05, + "loss": 2.109, + "step": 14882 + }, + { + "epoch": 2.7896907216494844, + "grad_norm": 53224.8515625, + "learning_rate": 1.5314844337759997e-05, + "loss": 2.1574, + "step": 14883 + }, + { + "epoch": 2.7898781630740395, + "grad_norm": 55083.1328125, + "learning_rate": 1.5309185019133232e-05, + "loss": 2.0732, + "step": 14884 + }, + { + "epoch": 2.790065604498594, + "grad_norm": 58773.24609375, + "learning_rate": 1.5303526557324604e-05, + "loss": 2.1323, + "step": 14885 + }, + { + "epoch": 2.790253045923149, + "grad_norm": 53788.84765625, + "learning_rate": 1.5297868952473882e-05, + "loss": 2.1785, + "step": 14886 + }, + { + "epoch": 2.790440487347704, + "grad_norm": 54078.234375, + "learning_rate": 1.529221220472084e-05, + "loss": 2.0666, + "step": 14887 + }, + { + "epoch": 2.790627928772259, + "grad_norm": 58641.43359375, + "learning_rate": 1.5286556314205123e-05, + "loss": 2.1024, + "step": 14888 + }, + { + "epoch": 2.7908153701968135, + "grad_norm": 51776.94140625, + "learning_rate": 1.528090128106646e-05, + "loss": 2.1235, + "step": 14889 + }, + { + "epoch": 2.791002811621368, + "grad_norm": 59989.171875, + "learning_rate": 1.5275247105444534e-05, + "loss": 2.0775, + "step": 14890 + }, + { + "epoch": 2.791190253045923, + "grad_norm": 53768.16015625, + "learning_rate": 1.5269593787478987e-05, + "loss": 2.0837, + "step": 14891 + }, + { + "epoch": 2.7913776944704782, + "grad_norm": 53474.09765625, + "learning_rate": 1.5263941327309428e-05, + "loss": 2.0982, + "step": 14892 + }, + { + "epoch": 2.791565135895033, + "grad_norm": 57421.703125, + "learning_rate": 1.5258289725075498e-05, + "loss": 2.1162, + "step": 14893 + }, + { + "epoch": 2.7917525773195875, + "grad_norm": 56596.6328125, + "learning_rate": 1.525263898091675e-05, + "loss": 2.1588, + "step": 14894 + }, + { + "epoch": 2.7919400187441425, + "grad_norm": 50173.203125, + "learning_rate": 1.5246989094972791e-05, + "loss": 2.0662, + "step": 14895 + }, + { + "epoch": 2.792127460168697, + "grad_norm": 57781.95703125, + "learning_rate": 1.524134006738312e-05, + "loss": 2.1626, + "step": 14896 + }, + { + "epoch": 2.7923149015932522, + "grad_norm": 57348.84765625, + "learning_rate": 1.5235691898287301e-05, + "loss": 2.1389, + "step": 14897 + }, + { + "epoch": 2.792502343017807, + "grad_norm": 53928.1875, + "learning_rate": 1.523004458782481e-05, + "loss": 2.1074, + "step": 14898 + }, + { + "epoch": 2.792689784442362, + "grad_norm": 51012.87890625, + "learning_rate": 1.5224398136135148e-05, + "loss": 2.1457, + "step": 14899 + }, + { + "epoch": 2.7928772258669166, + "grad_norm": 54593.203125, + "learning_rate": 1.5218752543357767e-05, + "loss": 2.1732, + "step": 14900 + }, + { + "epoch": 2.793064667291471, + "grad_norm": 59737.60546875, + "learning_rate": 1.5213107809632088e-05, + "loss": 2.1341, + "step": 14901 + }, + { + "epoch": 2.7932521087160262, + "grad_norm": 51170.29296875, + "learning_rate": 1.520746393509756e-05, + "loss": 2.1209, + "step": 14902 + }, + { + "epoch": 2.7934395501405813, + "grad_norm": 57116.1328125, + "learning_rate": 1.5201820919893544e-05, + "loss": 2.1102, + "step": 14903 + }, + { + "epoch": 2.793626991565136, + "grad_norm": 54870.5546875, + "learning_rate": 1.5196178764159452e-05, + "loss": 2.1607, + "step": 14904 + }, + { + "epoch": 2.7938144329896906, + "grad_norm": 50948.53125, + "learning_rate": 1.519053746803461e-05, + "loss": 2.093, + "step": 14905 + }, + { + "epoch": 2.7940018744142456, + "grad_norm": 55021.76171875, + "learning_rate": 1.5184897031658374e-05, + "loss": 2.097, + "step": 14906 + }, + { + "epoch": 2.7941893158388003, + "grad_norm": 50555.4140625, + "learning_rate": 1.5179257455170044e-05, + "loss": 2.039, + "step": 14907 + }, + { + "epoch": 2.7943767572633553, + "grad_norm": 50756.40625, + "learning_rate": 1.517361873870889e-05, + "loss": 2.0767, + "step": 14908 + }, + { + "epoch": 2.79456419868791, + "grad_norm": 55606.1640625, + "learning_rate": 1.5167980882414201e-05, + "loss": 2.047, + "step": 14909 + }, + { + "epoch": 2.794751640112465, + "grad_norm": 57163.18359375, + "learning_rate": 1.5162343886425256e-05, + "loss": 2.0698, + "step": 14910 + }, + { + "epoch": 2.7949390815370196, + "grad_norm": 56549.76171875, + "learning_rate": 1.5156707750881215e-05, + "loss": 2.1151, + "step": 14911 + }, + { + "epoch": 2.7951265229615743, + "grad_norm": 57412.2421875, + "learning_rate": 1.5151072475921324e-05, + "loss": 2.1116, + "step": 14912 + }, + { + "epoch": 2.7953139643861293, + "grad_norm": 55415.84375, + "learning_rate": 1.5145438061684774e-05, + "loss": 2.1622, + "step": 14913 + }, + { + "epoch": 2.7955014058106844, + "grad_norm": 52756.17578125, + "learning_rate": 1.5139804508310712e-05, + "loss": 2.1194, + "step": 14914 + }, + { + "epoch": 2.795688847235239, + "grad_norm": 51605.29296875, + "learning_rate": 1.5134171815938263e-05, + "loss": 2.1179, + "step": 14915 + }, + { + "epoch": 2.7958762886597937, + "grad_norm": 53995.9375, + "learning_rate": 1.5128539984706592e-05, + "loss": 2.0825, + "step": 14916 + }, + { + "epoch": 2.7960637300843487, + "grad_norm": 51928.71875, + "learning_rate": 1.512290901475476e-05, + "loss": 2.212, + "step": 14917 + }, + { + "epoch": 2.7962511715089033, + "grad_norm": 59686.37890625, + "learning_rate": 1.511727890622185e-05, + "loss": 2.1536, + "step": 14918 + }, + { + "epoch": 2.7964386129334584, + "grad_norm": 57243.58203125, + "learning_rate": 1.5111649659246919e-05, + "loss": 2.1137, + "step": 14919 + }, + { + "epoch": 2.796626054358013, + "grad_norm": 54809.1484375, + "learning_rate": 1.5106021273969029e-05, + "loss": 2.1267, + "step": 14920 + }, + { + "epoch": 2.796813495782568, + "grad_norm": 52204.83203125, + "learning_rate": 1.510039375052717e-05, + "loss": 2.132, + "step": 14921 + }, + { + "epoch": 2.7970009372071227, + "grad_norm": 61793.3203125, + "learning_rate": 1.5094767089060318e-05, + "loss": 2.2172, + "step": 14922 + }, + { + "epoch": 2.7971883786316774, + "grad_norm": 53771.7734375, + "learning_rate": 1.5089141289707487e-05, + "loss": 2.0974, + "step": 14923 + }, + { + "epoch": 2.7973758200562324, + "grad_norm": 51233.4765625, + "learning_rate": 1.5083516352607579e-05, + "loss": 2.1298, + "step": 14924 + }, + { + "epoch": 2.7975632614807875, + "grad_norm": 60705.41796875, + "learning_rate": 1.5077892277899568e-05, + "loss": 2.1863, + "step": 14925 + }, + { + "epoch": 2.797750702905342, + "grad_norm": 54416.4375, + "learning_rate": 1.5072269065722328e-05, + "loss": 2.0889, + "step": 14926 + }, + { + "epoch": 2.7979381443298967, + "grad_norm": 51088.9609375, + "learning_rate": 1.5066646716214766e-05, + "loss": 2.0788, + "step": 14927 + }, + { + "epoch": 2.798125585754452, + "grad_norm": 58078.44140625, + "learning_rate": 1.5061025229515746e-05, + "loss": 2.0877, + "step": 14928 + }, + { + "epoch": 2.7983130271790064, + "grad_norm": 59881.07421875, + "learning_rate": 1.5055404605764083e-05, + "loss": 2.0749, + "step": 14929 + }, + { + "epoch": 2.7985004686035615, + "grad_norm": 50356.62109375, + "learning_rate": 1.5049784845098636e-05, + "loss": 2.0933, + "step": 14930 + }, + { + "epoch": 2.798687910028116, + "grad_norm": 52954.46875, + "learning_rate": 1.5044165947658173e-05, + "loss": 2.1274, + "step": 14931 + }, + { + "epoch": 2.798875351452671, + "grad_norm": 52243.3671875, + "learning_rate": 1.5038547913581507e-05, + "loss": 2.2181, + "step": 14932 + }, + { + "epoch": 2.799062792877226, + "grad_norm": 55938.03515625, + "learning_rate": 1.5032930743007384e-05, + "loss": 2.0213, + "step": 14933 + }, + { + "epoch": 2.7992502343017804, + "grad_norm": 51140.11328125, + "learning_rate": 1.5027314436074514e-05, + "loss": 2.1977, + "step": 14934 + }, + { + "epoch": 2.7994376757263355, + "grad_norm": 59812.09375, + "learning_rate": 1.5021698992921635e-05, + "loss": 2.1496, + "step": 14935 + }, + { + "epoch": 2.7996251171508906, + "grad_norm": 55638.78125, + "learning_rate": 1.5016084413687459e-05, + "loss": 2.0985, + "step": 14936 + }, + { + "epoch": 2.799812558575445, + "grad_norm": 56586.015625, + "learning_rate": 1.5010470698510642e-05, + "loss": 2.1496, + "step": 14937 + }, + { + "epoch": 2.8, + "grad_norm": 64207.55859375, + "learning_rate": 1.5004857847529818e-05, + "loss": 2.1362, + "step": 14938 + }, + { + "epoch": 2.800187441424555, + "grad_norm": 55872.50390625, + "learning_rate": 1.4999245860883632e-05, + "loss": 2.13, + "step": 14939 + }, + { + "epoch": 2.8003748828491095, + "grad_norm": 52690.5, + "learning_rate": 1.4993634738710726e-05, + "loss": 2.1626, + "step": 14940 + }, + { + "epoch": 2.8005623242736646, + "grad_norm": 48109.62890625, + "learning_rate": 1.498802448114962e-05, + "loss": 2.1131, + "step": 14941 + }, + { + "epoch": 2.800749765698219, + "grad_norm": 57350.78125, + "learning_rate": 1.498241508833892e-05, + "loss": 2.097, + "step": 14942 + }, + { + "epoch": 2.8009372071227743, + "grad_norm": 54436.83984375, + "learning_rate": 1.4976806560417184e-05, + "loss": 2.157, + "step": 14943 + }, + { + "epoch": 2.801124648547329, + "grad_norm": 55621.38671875, + "learning_rate": 1.4971198897522914e-05, + "loss": 2.1511, + "step": 14944 + }, + { + "epoch": 2.8013120899718835, + "grad_norm": 57510.43359375, + "learning_rate": 1.49655920997946e-05, + "loss": 2.1593, + "step": 14945 + }, + { + "epoch": 2.8014995313964386, + "grad_norm": 51931.69921875, + "learning_rate": 1.4959986167370755e-05, + "loss": 2.0771, + "step": 14946 + }, + { + "epoch": 2.8016869728209937, + "grad_norm": 55420.20703125, + "learning_rate": 1.495438110038982e-05, + "loss": 2.1031, + "step": 14947 + }, + { + "epoch": 2.8018744142455483, + "grad_norm": 53215.6171875, + "learning_rate": 1.4948776898990224e-05, + "loss": 2.1902, + "step": 14948 + }, + { + "epoch": 2.802061855670103, + "grad_norm": 54323.94921875, + "learning_rate": 1.4943173563310387e-05, + "loss": 2.1203, + "step": 14949 + }, + { + "epoch": 2.802249297094658, + "grad_norm": 52269.46875, + "learning_rate": 1.4937571093488734e-05, + "loss": 2.0935, + "step": 14950 + }, + { + "epoch": 2.8024367385192126, + "grad_norm": 59582.5, + "learning_rate": 1.4931969489663611e-05, + "loss": 2.122, + "step": 14951 + }, + { + "epoch": 2.8026241799437677, + "grad_norm": 52043.85546875, + "learning_rate": 1.4926368751973358e-05, + "loss": 2.1302, + "step": 14952 + }, + { + "epoch": 2.8028116213683223, + "grad_norm": 57254.03125, + "learning_rate": 1.492076888055634e-05, + "loss": 2.1504, + "step": 14953 + }, + { + "epoch": 2.8029990627928774, + "grad_norm": 58660.40625, + "learning_rate": 1.4915169875550855e-05, + "loss": 2.2114, + "step": 14954 + }, + { + "epoch": 2.803186504217432, + "grad_norm": 56982.453125, + "learning_rate": 1.4909571737095168e-05, + "loss": 2.1425, + "step": 14955 + }, + { + "epoch": 2.8033739456419866, + "grad_norm": 55453.65625, + "learning_rate": 1.4903974465327559e-05, + "loss": 2.1244, + "step": 14956 + }, + { + "epoch": 2.8035613870665417, + "grad_norm": 55536.56640625, + "learning_rate": 1.4898378060386298e-05, + "loss": 2.1565, + "step": 14957 + }, + { + "epoch": 2.8037488284910967, + "grad_norm": 54070.8125, + "learning_rate": 1.4892782522409588e-05, + "loss": 2.1358, + "step": 14958 + }, + { + "epoch": 2.8039362699156514, + "grad_norm": 55234.9140625, + "learning_rate": 1.4887187851535617e-05, + "loss": 2.1189, + "step": 14959 + }, + { + "epoch": 2.804123711340206, + "grad_norm": 51823.98828125, + "learning_rate": 1.4881594047902597e-05, + "loss": 2.0745, + "step": 14960 + }, + { + "epoch": 2.804311152764761, + "grad_norm": 58379.54296875, + "learning_rate": 1.4876001111648657e-05, + "loss": 2.0751, + "step": 14961 + }, + { + "epoch": 2.8044985941893157, + "grad_norm": 60891.1875, + "learning_rate": 1.4870409042911965e-05, + "loss": 2.1042, + "step": 14962 + }, + { + "epoch": 2.8046860356138708, + "grad_norm": 51733.484375, + "learning_rate": 1.4864817841830625e-05, + "loss": 2.1121, + "step": 14963 + }, + { + "epoch": 2.8048734770384254, + "grad_norm": 59498.921875, + "learning_rate": 1.4859227508542717e-05, + "loss": 2.0992, + "step": 14964 + }, + { + "epoch": 2.8050609184629804, + "grad_norm": 54435.87890625, + "learning_rate": 1.4853638043186324e-05, + "loss": 2.2089, + "step": 14965 + }, + { + "epoch": 2.805248359887535, + "grad_norm": 53378.64453125, + "learning_rate": 1.484804944589952e-05, + "loss": 2.1656, + "step": 14966 + }, + { + "epoch": 2.80543580131209, + "grad_norm": 55481.21875, + "learning_rate": 1.4842461716820321e-05, + "loss": 2.1113, + "step": 14967 + }, + { + "epoch": 2.8056232427366448, + "grad_norm": 53490.0234375, + "learning_rate": 1.4836874856086719e-05, + "loss": 2.1716, + "step": 14968 + }, + { + "epoch": 2.8058106841612, + "grad_norm": 54485.54296875, + "learning_rate": 1.4831288863836734e-05, + "loss": 2.114, + "step": 14969 + }, + { + "epoch": 2.8059981255857545, + "grad_norm": 59321.64453125, + "learning_rate": 1.4825703740208319e-05, + "loss": 2.127, + "step": 14970 + }, + { + "epoch": 2.806185567010309, + "grad_norm": 59454.14453125, + "learning_rate": 1.48201194853394e-05, + "loss": 2.1444, + "step": 14971 + }, + { + "epoch": 2.806373008434864, + "grad_norm": 54640.5234375, + "learning_rate": 1.481453609936792e-05, + "loss": 2.1039, + "step": 14972 + }, + { + "epoch": 2.8065604498594188, + "grad_norm": 55768.46484375, + "learning_rate": 1.4808953582431811e-05, + "loss": 2.1112, + "step": 14973 + }, + { + "epoch": 2.806747891283974, + "grad_norm": 52753.15625, + "learning_rate": 1.4803371934668892e-05, + "loss": 2.1619, + "step": 14974 + }, + { + "epoch": 2.8069353327085285, + "grad_norm": 60279.68359375, + "learning_rate": 1.479779115621705e-05, + "loss": 2.0853, + "step": 14975 + }, + { + "epoch": 2.8071227741330835, + "grad_norm": 53961.5, + "learning_rate": 1.4792211247214144e-05, + "loss": 2.0235, + "step": 14976 + }, + { + "epoch": 2.807310215557638, + "grad_norm": 51433.23828125, + "learning_rate": 1.4786632207797967e-05, + "loss": 2.1071, + "step": 14977 + }, + { + "epoch": 2.8074976569821932, + "grad_norm": 56176.71875, + "learning_rate": 1.4781054038106312e-05, + "loss": 2.0892, + "step": 14978 + }, + { + "epoch": 2.807685098406748, + "grad_norm": 60706.546875, + "learning_rate": 1.4775476738276955e-05, + "loss": 2.0657, + "step": 14979 + }, + { + "epoch": 2.807872539831303, + "grad_norm": 58041.921875, + "learning_rate": 1.476990030844767e-05, + "loss": 2.1001, + "step": 14980 + }, + { + "epoch": 2.8080599812558575, + "grad_norm": 52274.45703125, + "learning_rate": 1.4764324748756176e-05, + "loss": 2.1345, + "step": 14981 + }, + { + "epoch": 2.808247422680412, + "grad_norm": 53312.76171875, + "learning_rate": 1.4758750059340153e-05, + "loss": 2.0508, + "step": 14982 + }, + { + "epoch": 2.8084348641049672, + "grad_norm": 55233.01171875, + "learning_rate": 1.4753176240337336e-05, + "loss": 2.1648, + "step": 14983 + }, + { + "epoch": 2.8086223055295223, + "grad_norm": 54447.81640625, + "learning_rate": 1.4747603291885364e-05, + "loss": 2.1021, + "step": 14984 + }, + { + "epoch": 2.808809746954077, + "grad_norm": 60735.61328125, + "learning_rate": 1.474203121412187e-05, + "loss": 2.1491, + "step": 14985 + }, + { + "epoch": 2.8089971883786315, + "grad_norm": 52802.51171875, + "learning_rate": 1.4736460007184506e-05, + "loss": 2.1165, + "step": 14986 + }, + { + "epoch": 2.8091846298031866, + "grad_norm": 55299.515625, + "learning_rate": 1.4730889671210845e-05, + "loss": 2.2198, + "step": 14987 + }, + { + "epoch": 2.8093720712277412, + "grad_norm": 54315.79296875, + "learning_rate": 1.4725320206338499e-05, + "loss": 2.1569, + "step": 14988 + }, + { + "epoch": 2.8095595126522963, + "grad_norm": 51329.33984375, + "learning_rate": 1.4719751612704996e-05, + "loss": 2.1276, + "step": 14989 + }, + { + "epoch": 2.809746954076851, + "grad_norm": 60573.37890625, + "learning_rate": 1.4714183890447896e-05, + "loss": 2.1369, + "step": 14990 + }, + { + "epoch": 2.809934395501406, + "grad_norm": 55537.5234375, + "learning_rate": 1.4708617039704698e-05, + "loss": 2.1063, + "step": 14991 + }, + { + "epoch": 2.8101218369259606, + "grad_norm": 51869.296875, + "learning_rate": 1.470305106061291e-05, + "loss": 2.1052, + "step": 14992 + }, + { + "epoch": 2.8103092783505152, + "grad_norm": 53358.98828125, + "learning_rate": 1.4697485953310004e-05, + "loss": 2.1394, + "step": 14993 + }, + { + "epoch": 2.8104967197750703, + "grad_norm": 57540.06640625, + "learning_rate": 1.4691921717933405e-05, + "loss": 2.1271, + "step": 14994 + }, + { + "epoch": 2.8106841611996254, + "grad_norm": 59963.125, + "learning_rate": 1.4686358354620566e-05, + "loss": 2.1122, + "step": 14995 + }, + { + "epoch": 2.81087160262418, + "grad_norm": 56458.890625, + "learning_rate": 1.4680795863508907e-05, + "loss": 2.1154, + "step": 14996 + }, + { + "epoch": 2.8110590440487346, + "grad_norm": 58117.921875, + "learning_rate": 1.4675234244735792e-05, + "loss": 2.0391, + "step": 14997 + }, + { + "epoch": 2.8112464854732897, + "grad_norm": 59709.16015625, + "learning_rate": 1.466967349843858e-05, + "loss": 2.0347, + "step": 14998 + }, + { + "epoch": 2.8114339268978443, + "grad_norm": 51240.2890625, + "learning_rate": 1.4664113624754644e-05, + "loss": 2.093, + "step": 14999 + }, + { + "epoch": 2.8116213683223994, + "grad_norm": 55343.75390625, + "learning_rate": 1.465855462382128e-05, + "loss": 2.1307, + "step": 15000 + }, + { + "epoch": 2.8116213683223994, + "eval_loss": 2.259845733642578, + "eval_runtime": 129.9499, + "eval_samples_per_second": 38.853, + "eval_steps_per_second": 1.947, + "step": 15000 + }, + { + "epoch": 2.811808809746954, + "grad_norm": 57488.73828125, + "learning_rate": 1.465299649577579e-05, + "loss": 2.1049, + "step": 15001 + }, + { + "epoch": 2.811996251171509, + "grad_norm": 52965.36328125, + "learning_rate": 1.464743924075545e-05, + "loss": 2.1134, + "step": 15002 + }, + { + "epoch": 2.8121836925960637, + "grad_norm": 55876.671875, + "learning_rate": 1.4641882858897565e-05, + "loss": 1.989, + "step": 15003 + }, + { + "epoch": 2.8123711340206183, + "grad_norm": 54018.640625, + "learning_rate": 1.4636327350339291e-05, + "loss": 2.1376, + "step": 15004 + }, + { + "epoch": 2.8125585754451734, + "grad_norm": 53186.70703125, + "learning_rate": 1.4630772715217889e-05, + "loss": 2.067, + "step": 15005 + }, + { + "epoch": 2.8127460168697285, + "grad_norm": 54002.2734375, + "learning_rate": 1.4625218953670556e-05, + "loss": 2.1384, + "step": 15006 + }, + { + "epoch": 2.812933458294283, + "grad_norm": 57324.40625, + "learning_rate": 1.4619666065834454e-05, + "loss": 2.0257, + "step": 15007 + }, + { + "epoch": 2.8131208997188377, + "grad_norm": 53046.50390625, + "learning_rate": 1.4614114051846712e-05, + "loss": 2.0675, + "step": 15008 + }, + { + "epoch": 2.813308341143393, + "grad_norm": 52219.28515625, + "learning_rate": 1.4608562911844487e-05, + "loss": 2.127, + "step": 15009 + }, + { + "epoch": 2.8134957825679474, + "grad_norm": 56624.39453125, + "learning_rate": 1.4603012645964875e-05, + "loss": 2.1744, + "step": 15010 + }, + { + "epoch": 2.8136832239925025, + "grad_norm": 58431.7734375, + "learning_rate": 1.459746325434494e-05, + "loss": 2.122, + "step": 15011 + }, + { + "epoch": 2.813870665417057, + "grad_norm": 53662.921875, + "learning_rate": 1.4591914737121764e-05, + "loss": 2.1238, + "step": 15012 + }, + { + "epoch": 2.814058106841612, + "grad_norm": 56168.52734375, + "learning_rate": 1.4586367094432401e-05, + "loss": 2.1778, + "step": 15013 + }, + { + "epoch": 2.814245548266167, + "grad_norm": 54938.94921875, + "learning_rate": 1.4580820326413857e-05, + "loss": 2.1209, + "step": 15014 + }, + { + "epoch": 2.8144329896907214, + "grad_norm": 54886.1015625, + "learning_rate": 1.4575274433203118e-05, + "loss": 2.1014, + "step": 15015 + }, + { + "epoch": 2.8146204311152765, + "grad_norm": 53560.32421875, + "learning_rate": 1.4569729414937183e-05, + "loss": 2.0926, + "step": 15016 + }, + { + "epoch": 2.8148078725398316, + "grad_norm": 53031.859375, + "learning_rate": 1.4564185271752978e-05, + "loss": 2.1304, + "step": 15017 + }, + { + "epoch": 2.814995313964386, + "grad_norm": 55087.7265625, + "learning_rate": 1.455864200378747e-05, + "loss": 2.131, + "step": 15018 + }, + { + "epoch": 2.815182755388941, + "grad_norm": 57571.0390625, + "learning_rate": 1.455309961117754e-05, + "loss": 2.1291, + "step": 15019 + }, + { + "epoch": 2.815370196813496, + "grad_norm": 54560.2421875, + "learning_rate": 1.454755809406011e-05, + "loss": 2.1048, + "step": 15020 + }, + { + "epoch": 2.8155576382380505, + "grad_norm": 55034.84375, + "learning_rate": 1.454201745257201e-05, + "loss": 2.0482, + "step": 15021 + }, + { + "epoch": 2.8157450796626056, + "grad_norm": 54190.0546875, + "learning_rate": 1.4536477686850126e-05, + "loss": 2.0997, + "step": 15022 + }, + { + "epoch": 2.81593252108716, + "grad_norm": 55643.203125, + "learning_rate": 1.4530938797031263e-05, + "loss": 2.2129, + "step": 15023 + }, + { + "epoch": 2.8161199625117153, + "grad_norm": 54448.06640625, + "learning_rate": 1.4525400783252213e-05, + "loss": 2.1538, + "step": 15024 + }, + { + "epoch": 2.81630740393627, + "grad_norm": 54978.23828125, + "learning_rate": 1.4519863645649778e-05, + "loss": 2.0735, + "step": 15025 + }, + { + "epoch": 2.8164948453608245, + "grad_norm": 52213.953125, + "learning_rate": 1.4514327384360715e-05, + "loss": 2.1326, + "step": 15026 + }, + { + "epoch": 2.8166822867853796, + "grad_norm": 52948.609375, + "learning_rate": 1.4508791999521742e-05, + "loss": 2.0723, + "step": 15027 + }, + { + "epoch": 2.8168697282099346, + "grad_norm": 57030.79296875, + "learning_rate": 1.4503257491269595e-05, + "loss": 2.1256, + "step": 15028 + }, + { + "epoch": 2.8170571696344893, + "grad_norm": 54672.734375, + "learning_rate": 1.4497723859740975e-05, + "loss": 2.2114, + "step": 15029 + }, + { + "epoch": 2.817244611059044, + "grad_norm": 59157.671875, + "learning_rate": 1.449219110507255e-05, + "loss": 2.2526, + "step": 15030 + }, + { + "epoch": 2.817432052483599, + "grad_norm": 56941.79296875, + "learning_rate": 1.448665922740095e-05, + "loss": 2.1276, + "step": 15031 + }, + { + "epoch": 2.8176194939081536, + "grad_norm": 52168.828125, + "learning_rate": 1.4481128226862827e-05, + "loss": 2.1402, + "step": 15032 + }, + { + "epoch": 2.8178069353327087, + "grad_norm": 50700.7578125, + "learning_rate": 1.4475598103594817e-05, + "loss": 2.1092, + "step": 15033 + }, + { + "epoch": 2.8179943767572633, + "grad_norm": 53252.171875, + "learning_rate": 1.4470068857733443e-05, + "loss": 2.081, + "step": 15034 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 62321.66796875, + "learning_rate": 1.4464540489415301e-05, + "loss": 2.1727, + "step": 15035 + }, + { + "epoch": 2.818369259606373, + "grad_norm": 52796.0390625, + "learning_rate": 1.4459012998776954e-05, + "loss": 2.114, + "step": 15036 + }, + { + "epoch": 2.8185567010309276, + "grad_norm": 59704.00390625, + "learning_rate": 1.4453486385954906e-05, + "loss": 2.1011, + "step": 15037 + }, + { + "epoch": 2.8187441424554827, + "grad_norm": 55021.484375, + "learning_rate": 1.4447960651085641e-05, + "loss": 2.0441, + "step": 15038 + }, + { + "epoch": 2.8189315838800377, + "grad_norm": 52884.94140625, + "learning_rate": 1.444243579430567e-05, + "loss": 2.1305, + "step": 15039 + }, + { + "epoch": 2.8191190253045924, + "grad_norm": 53355.9453125, + "learning_rate": 1.4436911815751437e-05, + "loss": 2.1322, + "step": 15040 + }, + { + "epoch": 2.819306466729147, + "grad_norm": 57374.59375, + "learning_rate": 1.4431388715559357e-05, + "loss": 2.105, + "step": 15041 + }, + { + "epoch": 2.819493908153702, + "grad_norm": 55558.44921875, + "learning_rate": 1.4425866493865859e-05, + "loss": 2.0901, + "step": 15042 + }, + { + "epoch": 2.8196813495782567, + "grad_norm": 59756.953125, + "learning_rate": 1.4420345150807351e-05, + "loss": 2.0895, + "step": 15043 + }, + { + "epoch": 2.8198687910028117, + "grad_norm": 51067.13671875, + "learning_rate": 1.441482468652019e-05, + "loss": 2.1155, + "step": 15044 + }, + { + "epoch": 2.8200562324273664, + "grad_norm": 54129.08984375, + "learning_rate": 1.4409305101140702e-05, + "loss": 2.0831, + "step": 15045 + }, + { + "epoch": 2.8202436738519214, + "grad_norm": 52828.5390625, + "learning_rate": 1.4403786394805258e-05, + "loss": 2.1587, + "step": 15046 + }, + { + "epoch": 2.820431115276476, + "grad_norm": 54930.15234375, + "learning_rate": 1.4398268567650114e-05, + "loss": 2.0979, + "step": 15047 + }, + { + "epoch": 2.8206185567010307, + "grad_norm": 51550.6953125, + "learning_rate": 1.43927516198116e-05, + "loss": 2.0853, + "step": 15048 + }, + { + "epoch": 2.8208059981255857, + "grad_norm": 59619.20703125, + "learning_rate": 1.4387235551425954e-05, + "loss": 2.0885, + "step": 15049 + }, + { + "epoch": 2.820993439550141, + "grad_norm": 54873.1953125, + "learning_rate": 1.4381720362629402e-05, + "loss": 2.0626, + "step": 15050 + }, + { + "epoch": 2.8211808809746954, + "grad_norm": 53326.734375, + "learning_rate": 1.4376206053558177e-05, + "loss": 2.1963, + "step": 15051 + }, + { + "epoch": 2.82136832239925, + "grad_norm": 52512.6015625, + "learning_rate": 1.4370692624348486e-05, + "loss": 2.1316, + "step": 15052 + }, + { + "epoch": 2.821555763823805, + "grad_norm": 57988.1796875, + "learning_rate": 1.4365180075136503e-05, + "loss": 2.1225, + "step": 15053 + }, + { + "epoch": 2.8217432052483598, + "grad_norm": 51386.69140625, + "learning_rate": 1.4359668406058347e-05, + "loss": 2.1543, + "step": 15054 + }, + { + "epoch": 2.821930646672915, + "grad_norm": 51378.2109375, + "learning_rate": 1.4354157617250191e-05, + "loss": 2.1277, + "step": 15055 + }, + { + "epoch": 2.8221180880974694, + "grad_norm": 53712.48828125, + "learning_rate": 1.4348647708848128e-05, + "loss": 2.1249, + "step": 15056 + }, + { + "epoch": 2.8223055295220245, + "grad_norm": 55276.9453125, + "learning_rate": 1.4343138680988227e-05, + "loss": 2.0755, + "step": 15057 + }, + { + "epoch": 2.822492970946579, + "grad_norm": 51668.16796875, + "learning_rate": 1.4337630533806574e-05, + "loss": 2.1626, + "step": 15058 + }, + { + "epoch": 2.8226804123711338, + "grad_norm": 57512.32421875, + "learning_rate": 1.4332123267439217e-05, + "loss": 2.0771, + "step": 15059 + }, + { + "epoch": 2.822867853795689, + "grad_norm": 55148.83984375, + "learning_rate": 1.432661688202218e-05, + "loss": 2.1457, + "step": 15060 + }, + { + "epoch": 2.823055295220244, + "grad_norm": 55718.6171875, + "learning_rate": 1.4321111377691438e-05, + "loss": 2.0797, + "step": 15061 + }, + { + "epoch": 2.8232427366447985, + "grad_norm": 50244.09375, + "learning_rate": 1.4315606754583e-05, + "loss": 2.1313, + "step": 15062 + }, + { + "epoch": 2.823430178069353, + "grad_norm": 54639.29296875, + "learning_rate": 1.431010301283281e-05, + "loss": 2.1012, + "step": 15063 + }, + { + "epoch": 2.823617619493908, + "grad_norm": 52384.609375, + "learning_rate": 1.4304600152576785e-05, + "loss": 2.0918, + "step": 15064 + }, + { + "epoch": 2.823805060918463, + "grad_norm": 57269.9921875, + "learning_rate": 1.429909817395086e-05, + "loss": 2.1958, + "step": 15065 + }, + { + "epoch": 2.823992502343018, + "grad_norm": 49983.05078125, + "learning_rate": 1.4293597077090954e-05, + "loss": 2.124, + "step": 15066 + }, + { + "epoch": 2.8241799437675725, + "grad_norm": 54696.05859375, + "learning_rate": 1.4288096862132872e-05, + "loss": 2.1311, + "step": 15067 + }, + { + "epoch": 2.8243673851921276, + "grad_norm": 53848.9375, + "learning_rate": 1.4282597529212499e-05, + "loss": 2.0701, + "step": 15068 + }, + { + "epoch": 2.8245548266166822, + "grad_norm": 57073.76171875, + "learning_rate": 1.4277099078465673e-05, + "loss": 2.1306, + "step": 15069 + }, + { + "epoch": 2.824742268041237, + "grad_norm": 57514.87109375, + "learning_rate": 1.4271601510028187e-05, + "loss": 2.115, + "step": 15070 + }, + { + "epoch": 2.824929709465792, + "grad_norm": 53791.82421875, + "learning_rate": 1.4266104824035803e-05, + "loss": 2.1354, + "step": 15071 + }, + { + "epoch": 2.825117150890347, + "grad_norm": 56693.6640625, + "learning_rate": 1.4260609020624298e-05, + "loss": 2.1743, + "step": 15072 + }, + { + "epoch": 2.8253045923149016, + "grad_norm": 51843.234375, + "learning_rate": 1.4255114099929434e-05, + "loss": 2.139, + "step": 15073 + }, + { + "epoch": 2.8254920337394562, + "grad_norm": 48102.47265625, + "learning_rate": 1.4249620062086905e-05, + "loss": 2.2076, + "step": 15074 + }, + { + "epoch": 2.8256794751640113, + "grad_norm": 54525.875, + "learning_rate": 1.4244126907232396e-05, + "loss": 2.1087, + "step": 15075 + }, + { + "epoch": 2.825866916588566, + "grad_norm": 54325.43359375, + "learning_rate": 1.4238634635501608e-05, + "loss": 2.0543, + "step": 15076 + }, + { + "epoch": 2.826054358013121, + "grad_norm": 51815.79296875, + "learning_rate": 1.4233143247030161e-05, + "loss": 2.1456, + "step": 15077 + }, + { + "epoch": 2.8262417994376756, + "grad_norm": 56546.80859375, + "learning_rate": 1.4227652741953722e-05, + "loss": 2.1118, + "step": 15078 + }, + { + "epoch": 2.8264292408622307, + "grad_norm": 54466.60546875, + "learning_rate": 1.4222163120407883e-05, + "loss": 2.0854, + "step": 15079 + }, + { + "epoch": 2.8266166822867853, + "grad_norm": 55872.05078125, + "learning_rate": 1.4216674382528212e-05, + "loss": 2.1034, + "step": 15080 + }, + { + "epoch": 2.82680412371134, + "grad_norm": 52515.50390625, + "learning_rate": 1.4211186528450299e-05, + "loss": 2.0958, + "step": 15081 + }, + { + "epoch": 2.826991565135895, + "grad_norm": 54079.9609375, + "learning_rate": 1.420569955830967e-05, + "loss": 2.0971, + "step": 15082 + }, + { + "epoch": 2.82717900656045, + "grad_norm": 54987.1171875, + "learning_rate": 1.4200213472241863e-05, + "loss": 2.1385, + "step": 15083 + }, + { + "epoch": 2.8273664479850047, + "grad_norm": 60359.74609375, + "learning_rate": 1.4194728270382358e-05, + "loss": 2.1023, + "step": 15084 + }, + { + "epoch": 2.8275538894095593, + "grad_norm": 52980.60546875, + "learning_rate": 1.4189243952866655e-05, + "loss": 2.101, + "step": 15085 + }, + { + "epoch": 2.8277413308341144, + "grad_norm": 53699.609375, + "learning_rate": 1.4183760519830197e-05, + "loss": 2.1129, + "step": 15086 + }, + { + "epoch": 2.827928772258669, + "grad_norm": 51939.59375, + "learning_rate": 1.4178277971408404e-05, + "loss": 2.1738, + "step": 15087 + }, + { + "epoch": 2.828116213683224, + "grad_norm": 53216.95703125, + "learning_rate": 1.4172796307736702e-05, + "loss": 2.0788, + "step": 15088 + }, + { + "epoch": 2.8283036551077787, + "grad_norm": 55883.03125, + "learning_rate": 1.4167315528950498e-05, + "loss": 2.1374, + "step": 15089 + }, + { + "epoch": 2.8284910965323338, + "grad_norm": 53687.32421875, + "learning_rate": 1.416183563518514e-05, + "loss": 2.1447, + "step": 15090 + }, + { + "epoch": 2.8286785379568884, + "grad_norm": 54688.31640625, + "learning_rate": 1.4156356626575957e-05, + "loss": 2.0445, + "step": 15091 + }, + { + "epoch": 2.8288659793814435, + "grad_norm": 51894.9375, + "learning_rate": 1.4150878503258314e-05, + "loss": 2.114, + "step": 15092 + }, + { + "epoch": 2.829053420805998, + "grad_norm": 55155.1875, + "learning_rate": 1.4145401265367492e-05, + "loss": 2.0706, + "step": 15093 + }, + { + "epoch": 2.829240862230553, + "grad_norm": 53169.390625, + "learning_rate": 1.4139924913038754e-05, + "loss": 2.1102, + "step": 15094 + }, + { + "epoch": 2.829428303655108, + "grad_norm": 61960.21875, + "learning_rate": 1.4134449446407384e-05, + "loss": 2.098, + "step": 15095 + }, + { + "epoch": 2.8296157450796624, + "grad_norm": 54252.0703125, + "learning_rate": 1.4128974865608636e-05, + "loss": 2.1305, + "step": 15096 + }, + { + "epoch": 2.8298031865042175, + "grad_norm": 54770.91015625, + "learning_rate": 1.4123501170777676e-05, + "loss": 2.1578, + "step": 15097 + }, + { + "epoch": 2.829990627928772, + "grad_norm": 54327.07421875, + "learning_rate": 1.4118028362049718e-05, + "loss": 2.1555, + "step": 15098 + }, + { + "epoch": 2.830178069353327, + "grad_norm": 57555.1640625, + "learning_rate": 1.411255643955996e-05, + "loss": 2.1858, + "step": 15099 + }, + { + "epoch": 2.830365510777882, + "grad_norm": 54595.29296875, + "learning_rate": 1.410708540344352e-05, + "loss": 2.115, + "step": 15100 + }, + { + "epoch": 2.830552952202437, + "grad_norm": 57662.828125, + "learning_rate": 1.4101615253835527e-05, + "loss": 2.1518, + "step": 15101 + }, + { + "epoch": 2.8307403936269915, + "grad_norm": 57175.45703125, + "learning_rate": 1.4096145990871107e-05, + "loss": 2.0984, + "step": 15102 + }, + { + "epoch": 2.8309278350515465, + "grad_norm": 52821.0703125, + "learning_rate": 1.4090677614685316e-05, + "loss": 2.1468, + "step": 15103 + }, + { + "epoch": 2.831115276476101, + "grad_norm": 53212.94140625, + "learning_rate": 1.4085210125413246e-05, + "loss": 2.1193, + "step": 15104 + }, + { + "epoch": 2.8313027179006562, + "grad_norm": 52965.8984375, + "learning_rate": 1.4079743523189903e-05, + "loss": 2.1083, + "step": 15105 + }, + { + "epoch": 2.831490159325211, + "grad_norm": 55537.31640625, + "learning_rate": 1.4074277808150344e-05, + "loss": 2.1796, + "step": 15106 + }, + { + "epoch": 2.8316776007497655, + "grad_norm": 60327.1171875, + "learning_rate": 1.4068812980429541e-05, + "loss": 2.0919, + "step": 15107 + }, + { + "epoch": 2.8318650421743206, + "grad_norm": 55535.8828125, + "learning_rate": 1.406334904016246e-05, + "loss": 2.1108, + "step": 15108 + }, + { + "epoch": 2.8320524835988756, + "grad_norm": 52533.9609375, + "learning_rate": 1.4057885987484077e-05, + "loss": 2.147, + "step": 15109 + }, + { + "epoch": 2.8322399250234302, + "grad_norm": 54686.859375, + "learning_rate": 1.4052423822529298e-05, + "loss": 2.1271, + "step": 15110 + }, + { + "epoch": 2.832427366447985, + "grad_norm": 53700.33984375, + "learning_rate": 1.4046962545433057e-05, + "loss": 2.1461, + "step": 15111 + }, + { + "epoch": 2.83261480787254, + "grad_norm": 54350.9375, + "learning_rate": 1.4041502156330211e-05, + "loss": 2.1951, + "step": 15112 + }, + { + "epoch": 2.8328022492970946, + "grad_norm": 53872.98046875, + "learning_rate": 1.4036042655355658e-05, + "loss": 2.1493, + "step": 15113 + }, + { + "epoch": 2.8329896907216496, + "grad_norm": 49377.7109375, + "learning_rate": 1.4030584042644213e-05, + "loss": 2.1464, + "step": 15114 + }, + { + "epoch": 2.8331771321462043, + "grad_norm": 50707.8515625, + "learning_rate": 1.4025126318330716e-05, + "loss": 2.1272, + "step": 15115 + }, + { + "epoch": 2.8333645735707593, + "grad_norm": 55970.5, + "learning_rate": 1.4019669482549958e-05, + "loss": 2.1243, + "step": 15116 + }, + { + "epoch": 2.833552014995314, + "grad_norm": 52189.25390625, + "learning_rate": 1.4014213535436698e-05, + "loss": 2.0308, + "step": 15117 + }, + { + "epoch": 2.8337394564198686, + "grad_norm": 52576.7109375, + "learning_rate": 1.4008758477125711e-05, + "loss": 2.1246, + "step": 15118 + }, + { + "epoch": 2.8339268978444236, + "grad_norm": 57468.1328125, + "learning_rate": 1.4003304307751752e-05, + "loss": 2.1705, + "step": 15119 + }, + { + "epoch": 2.8341143392689787, + "grad_norm": 59322.13671875, + "learning_rate": 1.3997851027449483e-05, + "loss": 2.0551, + "step": 15120 + }, + { + "epoch": 2.8343017806935333, + "grad_norm": 56998.421875, + "learning_rate": 1.399239863635361e-05, + "loss": 2.0675, + "step": 15121 + }, + { + "epoch": 2.834489222118088, + "grad_norm": 59405.9453125, + "learning_rate": 1.3986947134598821e-05, + "loss": 2.0933, + "step": 15122 + }, + { + "epoch": 2.834676663542643, + "grad_norm": 54762.3125, + "learning_rate": 1.3981496522319749e-05, + "loss": 2.142, + "step": 15123 + }, + { + "epoch": 2.8348641049671977, + "grad_norm": 59844.3046875, + "learning_rate": 1.3976046799650999e-05, + "loss": 2.1786, + "step": 15124 + }, + { + "epoch": 2.8350515463917527, + "grad_norm": 58312.63671875, + "learning_rate": 1.397059796672719e-05, + "loss": 2.0495, + "step": 15125 + }, + { + "epoch": 2.8352389878163073, + "grad_norm": 58552.2890625, + "learning_rate": 1.396515002368292e-05, + "loss": 2.09, + "step": 15126 + }, + { + "epoch": 2.8354264292408624, + "grad_norm": 60453.32421875, + "learning_rate": 1.3959702970652705e-05, + "loss": 2.1153, + "step": 15127 + }, + { + "epoch": 2.835613870665417, + "grad_norm": 55103.3125, + "learning_rate": 1.3954256807771094e-05, + "loss": 2.0616, + "step": 15128 + }, + { + "epoch": 2.8358013120899717, + "grad_norm": 54583.85546875, + "learning_rate": 1.3948811535172617e-05, + "loss": 2.1282, + "step": 15129 + }, + { + "epoch": 2.8359887535145267, + "grad_norm": 54814.671875, + "learning_rate": 1.394336715299176e-05, + "loss": 2.1683, + "step": 15130 + }, + { + "epoch": 2.836176194939082, + "grad_norm": 57863.640625, + "learning_rate": 1.3937923661362973e-05, + "loss": 2.0657, + "step": 15131 + }, + { + "epoch": 2.8363636363636364, + "grad_norm": 63047.05859375, + "learning_rate": 1.3932481060420728e-05, + "loss": 2.1018, + "step": 15132 + }, + { + "epoch": 2.836551077788191, + "grad_norm": 55967.375, + "learning_rate": 1.392703935029942e-05, + "loss": 2.0986, + "step": 15133 + }, + { + "epoch": 2.836738519212746, + "grad_norm": 52424.29296875, + "learning_rate": 1.392159853113349e-05, + "loss": 2.1235, + "step": 15134 + }, + { + "epoch": 2.8369259606373007, + "grad_norm": 49931.5390625, + "learning_rate": 1.3916158603057283e-05, + "loss": 2.1497, + "step": 15135 + }, + { + "epoch": 2.837113402061856, + "grad_norm": 55930.15625, + "learning_rate": 1.3910719566205193e-05, + "loss": 2.1057, + "step": 15136 + }, + { + "epoch": 2.8373008434864104, + "grad_norm": 56493.9453125, + "learning_rate": 1.3905281420711536e-05, + "loss": 2.1353, + "step": 15137 + }, + { + "epoch": 2.8374882849109655, + "grad_norm": 54094.64453125, + "learning_rate": 1.3899844166710618e-05, + "loss": 2.1323, + "step": 15138 + }, + { + "epoch": 2.83767572633552, + "grad_norm": 52136.3671875, + "learning_rate": 1.3894407804336757e-05, + "loss": 2.1514, + "step": 15139 + }, + { + "epoch": 2.8378631677600747, + "grad_norm": 51509.3515625, + "learning_rate": 1.3888972333724199e-05, + "loss": 2.1192, + "step": 15140 + }, + { + "epoch": 2.83805060918463, + "grad_norm": 53221.375, + "learning_rate": 1.388353775500722e-05, + "loss": 2.1501, + "step": 15141 + }, + { + "epoch": 2.838238050609185, + "grad_norm": 53478.59765625, + "learning_rate": 1.387810406832003e-05, + "loss": 2.0586, + "step": 15142 + }, + { + "epoch": 2.8384254920337395, + "grad_norm": 61539.3828125, + "learning_rate": 1.3872671273796828e-05, + "loss": 2.0897, + "step": 15143 + }, + { + "epoch": 2.838612933458294, + "grad_norm": 50996.1875, + "learning_rate": 1.38672393715718e-05, + "loss": 2.1626, + "step": 15144 + }, + { + "epoch": 2.838800374882849, + "grad_norm": 49843.80078125, + "learning_rate": 1.3861808361779132e-05, + "loss": 2.0862, + "step": 15145 + }, + { + "epoch": 2.838987816307404, + "grad_norm": 51033.33984375, + "learning_rate": 1.3856378244552942e-05, + "loss": 2.1438, + "step": 15146 + }, + { + "epoch": 2.839175257731959, + "grad_norm": 55746.1640625, + "learning_rate": 1.3850949020027332e-05, + "loss": 2.1507, + "step": 15147 + }, + { + "epoch": 2.8393626991565135, + "grad_norm": 52769.62109375, + "learning_rate": 1.3845520688336411e-05, + "loss": 2.1897, + "step": 15148 + }, + { + "epoch": 2.8395501405810686, + "grad_norm": 55511.01171875, + "learning_rate": 1.3840093249614288e-05, + "loss": 2.1436, + "step": 15149 + }, + { + "epoch": 2.839737582005623, + "grad_norm": 53446.3984375, + "learning_rate": 1.3834666703994953e-05, + "loss": 2.1282, + "step": 15150 + }, + { + "epoch": 2.839925023430178, + "grad_norm": 57227.72265625, + "learning_rate": 1.3829241051612457e-05, + "loss": 2.1621, + "step": 15151 + }, + { + "epoch": 2.840112464854733, + "grad_norm": 53960.12109375, + "learning_rate": 1.3823816292600822e-05, + "loss": 2.101, + "step": 15152 + }, + { + "epoch": 2.840299906279288, + "grad_norm": 53786.57421875, + "learning_rate": 1.3818392427094029e-05, + "loss": 2.1953, + "step": 15153 + }, + { + "epoch": 2.8404873477038426, + "grad_norm": 51282.95703125, + "learning_rate": 1.3812969455226016e-05, + "loss": 2.1398, + "step": 15154 + }, + { + "epoch": 2.840674789128397, + "grad_norm": 54311.61328125, + "learning_rate": 1.3807547377130754e-05, + "loss": 2.1887, + "step": 15155 + }, + { + "epoch": 2.8408622305529523, + "grad_norm": 59303.015625, + "learning_rate": 1.3802126192942144e-05, + "loss": 2.0358, + "step": 15156 + }, + { + "epoch": 2.841049671977507, + "grad_norm": 55778.33984375, + "learning_rate": 1.3796705902794076e-05, + "loss": 2.0901, + "step": 15157 + }, + { + "epoch": 2.841237113402062, + "grad_norm": 58081.53125, + "learning_rate": 1.3791286506820434e-05, + "loss": 2.1407, + "step": 15158 + }, + { + "epoch": 2.8414245548266166, + "grad_norm": 52629.30078125, + "learning_rate": 1.3785868005155078e-05, + "loss": 2.1411, + "step": 15159 + }, + { + "epoch": 2.8416119962511717, + "grad_norm": 57232.171875, + "learning_rate": 1.3780450397931838e-05, + "loss": 2.1186, + "step": 15160 + }, + { + "epoch": 2.8417994376757263, + "grad_norm": 53706.71484375, + "learning_rate": 1.3775033685284494e-05, + "loss": 2.2258, + "step": 15161 + }, + { + "epoch": 2.841986879100281, + "grad_norm": 53887.64453125, + "learning_rate": 1.3769617867346868e-05, + "loss": 2.1663, + "step": 15162 + }, + { + "epoch": 2.842174320524836, + "grad_norm": 52474.03515625, + "learning_rate": 1.3764202944252707e-05, + "loss": 2.093, + "step": 15163 + }, + { + "epoch": 2.842361761949391, + "grad_norm": 52480.921875, + "learning_rate": 1.375878891613574e-05, + "loss": 2.1155, + "step": 15164 + }, + { + "epoch": 2.8425492033739457, + "grad_norm": 52849.203125, + "learning_rate": 1.3753375783129696e-05, + "loss": 2.1159, + "step": 15165 + }, + { + "epoch": 2.8427366447985003, + "grad_norm": 55701.3203125, + "learning_rate": 1.3747963545368297e-05, + "loss": 2.1051, + "step": 15166 + }, + { + "epoch": 2.8429240862230554, + "grad_norm": 53220.00390625, + "learning_rate": 1.3742552202985193e-05, + "loss": 2.1198, + "step": 15167 + }, + { + "epoch": 2.84311152764761, + "grad_norm": 54548.4296875, + "learning_rate": 1.3737141756114025e-05, + "loss": 2.0534, + "step": 15168 + }, + { + "epoch": 2.843298969072165, + "grad_norm": 58345.0859375, + "learning_rate": 1.3731732204888453e-05, + "loss": 2.1078, + "step": 15169 + }, + { + "epoch": 2.8434864104967197, + "grad_norm": 58919.06640625, + "learning_rate": 1.372632354944206e-05, + "loss": 2.1743, + "step": 15170 + }, + { + "epoch": 2.8436738519212748, + "grad_norm": 60992.42578125, + "learning_rate": 1.3720915789908468e-05, + "loss": 2.1719, + "step": 15171 + }, + { + "epoch": 2.8438612933458294, + "grad_norm": 51982.484375, + "learning_rate": 1.3715508926421211e-05, + "loss": 2.1141, + "step": 15172 + }, + { + "epoch": 2.844048734770384, + "grad_norm": 54687.4375, + "learning_rate": 1.3710102959113824e-05, + "loss": 2.129, + "step": 15173 + }, + { + "epoch": 2.844236176194939, + "grad_norm": 51991.01953125, + "learning_rate": 1.370469788811985e-05, + "loss": 2.1815, + "step": 15174 + }, + { + "epoch": 2.844423617619494, + "grad_norm": 56156.32421875, + "learning_rate": 1.3699293713572791e-05, + "loss": 2.0943, + "step": 15175 + }, + { + "epoch": 2.8446110590440488, + "grad_norm": 53204.328125, + "learning_rate": 1.3693890435606116e-05, + "loss": 2.1683, + "step": 15176 + }, + { + "epoch": 2.8447985004686034, + "grad_norm": 57199.16796875, + "learning_rate": 1.3688488054353254e-05, + "loss": 2.1311, + "step": 15177 + }, + { + "epoch": 2.8449859418931585, + "grad_norm": 57057.671875, + "learning_rate": 1.3683086569947678e-05, + "loss": 2.141, + "step": 15178 + }, + { + "epoch": 2.845173383317713, + "grad_norm": 57741.296875, + "learning_rate": 1.367768598252278e-05, + "loss": 2.1744, + "step": 15179 + }, + { + "epoch": 2.845360824742268, + "grad_norm": 59398.625, + "learning_rate": 1.3672286292211928e-05, + "loss": 2.0964, + "step": 15180 + }, + { + "epoch": 2.8455482661668228, + "grad_norm": 56800.13671875, + "learning_rate": 1.3666887499148507e-05, + "loss": 2.0771, + "step": 15181 + }, + { + "epoch": 2.845735707591378, + "grad_norm": 54613.23046875, + "learning_rate": 1.3661489603465872e-05, + "loss": 2.096, + "step": 15182 + }, + { + "epoch": 2.8459231490159325, + "grad_norm": 55017.09375, + "learning_rate": 1.3656092605297333e-05, + "loss": 2.1392, + "step": 15183 + }, + { + "epoch": 2.846110590440487, + "grad_norm": 57640.90625, + "learning_rate": 1.3650696504776166e-05, + "loss": 2.1686, + "step": 15184 + }, + { + "epoch": 2.846298031865042, + "grad_norm": 55325.58984375, + "learning_rate": 1.3645301302035685e-05, + "loss": 2.1526, + "step": 15185 + }, + { + "epoch": 2.8464854732895972, + "grad_norm": 52017.69921875, + "learning_rate": 1.3639906997209129e-05, + "loss": 2.1303, + "step": 15186 + }, + { + "epoch": 2.846672914714152, + "grad_norm": 53067.84375, + "learning_rate": 1.363451359042971e-05, + "loss": 2.0641, + "step": 15187 + }, + { + "epoch": 2.8468603561387065, + "grad_norm": 57643.72265625, + "learning_rate": 1.362912108183066e-05, + "loss": 2.1349, + "step": 15188 + }, + { + "epoch": 2.8470477975632615, + "grad_norm": 56651.765625, + "learning_rate": 1.3623729471545177e-05, + "loss": 2.186, + "step": 15189 + }, + { + "epoch": 2.847235238987816, + "grad_norm": 60986.69921875, + "learning_rate": 1.3618338759706412e-05, + "loss": 2.1805, + "step": 15190 + }, + { + "epoch": 2.8474226804123712, + "grad_norm": 54246.375, + "learning_rate": 1.3612948946447496e-05, + "loss": 2.0994, + "step": 15191 + }, + { + "epoch": 2.847610121836926, + "grad_norm": 54573.375, + "learning_rate": 1.3607560031901584e-05, + "loss": 2.0928, + "step": 15192 + }, + { + "epoch": 2.847797563261481, + "grad_norm": 59704.16796875, + "learning_rate": 1.3602172016201753e-05, + "loss": 2.1303, + "step": 15193 + }, + { + "epoch": 2.8479850046860355, + "grad_norm": 52481.69921875, + "learning_rate": 1.3596784899481063e-05, + "loss": 2.1443, + "step": 15194 + }, + { + "epoch": 2.84817244611059, + "grad_norm": 50379.44921875, + "learning_rate": 1.3591398681872608e-05, + "loss": 2.1186, + "step": 15195 + }, + { + "epoch": 2.8483598875351452, + "grad_norm": 51703.98828125, + "learning_rate": 1.3586013363509387e-05, + "loss": 2.146, + "step": 15196 + }, + { + "epoch": 2.8485473289597003, + "grad_norm": 54193.640625, + "learning_rate": 1.3580628944524437e-05, + "loss": 2.0997, + "step": 15197 + }, + { + "epoch": 2.848734770384255, + "grad_norm": 53430.13671875, + "learning_rate": 1.357524542505072e-05, + "loss": 2.1026, + "step": 15198 + }, + { + "epoch": 2.8489222118088096, + "grad_norm": 52028.6796875, + "learning_rate": 1.3569862805221234e-05, + "loss": 2.2934, + "step": 15199 + }, + { + "epoch": 2.8491096532333646, + "grad_norm": 53317.296875, + "learning_rate": 1.3564481085168883e-05, + "loss": 2.0741, + "step": 15200 + }, + { + "epoch": 2.8492970946579192, + "grad_norm": 53327.83203125, + "learning_rate": 1.3559100265026625e-05, + "loss": 2.1409, + "step": 15201 + }, + { + "epoch": 2.8494845360824743, + "grad_norm": 51227.99609375, + "learning_rate": 1.355372034492735e-05, + "loss": 2.1557, + "step": 15202 + }, + { + "epoch": 2.849671977507029, + "grad_norm": 51523.9140625, + "learning_rate": 1.354834132500391e-05, + "loss": 2.1219, + "step": 15203 + }, + { + "epoch": 2.849859418931584, + "grad_norm": 63890.9921875, + "learning_rate": 1.3542963205389197e-05, + "loss": 2.0883, + "step": 15204 + }, + { + "epoch": 2.8500468603561386, + "grad_norm": 54752.7421875, + "learning_rate": 1.3537585986216007e-05, + "loss": 2.1287, + "step": 15205 + }, + { + "epoch": 2.8502343017806933, + "grad_norm": 58915.234375, + "learning_rate": 1.3532209667617185e-05, + "loss": 2.237, + "step": 15206 + }, + { + "epoch": 2.8504217432052483, + "grad_norm": 58204.8046875, + "learning_rate": 1.352683424972549e-05, + "loss": 2.0431, + "step": 15207 + }, + { + "epoch": 2.8506091846298034, + "grad_norm": 52017.03125, + "learning_rate": 1.3521459732673713e-05, + "loss": 2.1183, + "step": 15208 + }, + { + "epoch": 2.850796626054358, + "grad_norm": 55116.28515625, + "learning_rate": 1.3516086116594584e-05, + "loss": 2.1164, + "step": 15209 + }, + { + "epoch": 2.8509840674789126, + "grad_norm": 54996.5859375, + "learning_rate": 1.3510713401620817e-05, + "loss": 2.1822, + "step": 15210 + }, + { + "epoch": 2.8511715089034677, + "grad_norm": 54130.75390625, + "learning_rate": 1.3505341587885118e-05, + "loss": 2.1902, + "step": 15211 + }, + { + "epoch": 2.8513589503280223, + "grad_norm": 52859.625, + "learning_rate": 1.349997067552019e-05, + "loss": 2.1662, + "step": 15212 + }, + { + "epoch": 2.8515463917525774, + "grad_norm": 56457.3203125, + "learning_rate": 1.349460066465864e-05, + "loss": 2.1036, + "step": 15213 + }, + { + "epoch": 2.851733833177132, + "grad_norm": 51372.18359375, + "learning_rate": 1.3489231555433118e-05, + "loss": 2.1684, + "step": 15214 + }, + { + "epoch": 2.851921274601687, + "grad_norm": 57372.68359375, + "learning_rate": 1.3483863347976261e-05, + "loss": 2.1204, + "step": 15215 + }, + { + "epoch": 2.8521087160262417, + "grad_norm": 58601.49609375, + "learning_rate": 1.347849604242063e-05, + "loss": 2.1317, + "step": 15216 + }, + { + "epoch": 2.852296157450797, + "grad_norm": 52436.87890625, + "learning_rate": 1.3473129638898784e-05, + "loss": 2.0832, + "step": 15217 + }, + { + "epoch": 2.8524835988753514, + "grad_norm": 56619.15234375, + "learning_rate": 1.3467764137543277e-05, + "loss": 2.1198, + "step": 15218 + }, + { + "epoch": 2.8526710402999065, + "grad_norm": 48243.70703125, + "learning_rate": 1.3462399538486663e-05, + "loss": 2.1574, + "step": 15219 + }, + { + "epoch": 2.852858481724461, + "grad_norm": 55187.03125, + "learning_rate": 1.3457035841861377e-05, + "loss": 2.1472, + "step": 15220 + }, + { + "epoch": 2.8530459231490157, + "grad_norm": 53536.90234375, + "learning_rate": 1.3451673047799922e-05, + "loss": 2.1276, + "step": 15221 + }, + { + "epoch": 2.853233364573571, + "grad_norm": 57241.8984375, + "learning_rate": 1.3446311156434782e-05, + "loss": 2.114, + "step": 15222 + }, + { + "epoch": 2.8534208059981254, + "grad_norm": 51831.65234375, + "learning_rate": 1.3440950167898358e-05, + "loss": 2.1262, + "step": 15223 + }, + { + "epoch": 2.8536082474226805, + "grad_norm": 53908.17578125, + "learning_rate": 1.3435590082323052e-05, + "loss": 2.1078, + "step": 15224 + }, + { + "epoch": 2.853795688847235, + "grad_norm": 52970.4609375, + "learning_rate": 1.3430230899841278e-05, + "loss": 2.1043, + "step": 15225 + }, + { + "epoch": 2.85398313027179, + "grad_norm": 54207.05859375, + "learning_rate": 1.3424872620585371e-05, + "loss": 2.1536, + "step": 15226 + }, + { + "epoch": 2.854170571696345, + "grad_norm": 59498.94140625, + "learning_rate": 1.3419515244687708e-05, + "loss": 2.1657, + "step": 15227 + }, + { + "epoch": 2.8543580131209, + "grad_norm": 52078.16015625, + "learning_rate": 1.3414158772280576e-05, + "loss": 2.1546, + "step": 15228 + }, + { + "epoch": 2.8545454545454545, + "grad_norm": 51320.03125, + "learning_rate": 1.3408803203496306e-05, + "loss": 2.1084, + "step": 15229 + }, + { + "epoch": 2.8547328959700096, + "grad_norm": 54286.1640625, + "learning_rate": 1.3403448538467134e-05, + "loss": 2.1566, + "step": 15230 + }, + { + "epoch": 2.854920337394564, + "grad_norm": 58362.79296875, + "learning_rate": 1.3398094777325354e-05, + "loss": 2.0547, + "step": 15231 + }, + { + "epoch": 2.855107778819119, + "grad_norm": 50126.3671875, + "learning_rate": 1.3392741920203177e-05, + "loss": 2.0429, + "step": 15232 + }, + { + "epoch": 2.855295220243674, + "grad_norm": 56949.359375, + "learning_rate": 1.3387389967232799e-05, + "loss": 2.0591, + "step": 15233 + }, + { + "epoch": 2.855482661668229, + "grad_norm": 57020.328125, + "learning_rate": 1.338203891854643e-05, + "loss": 2.1374, + "step": 15234 + }, + { + "epoch": 2.8556701030927836, + "grad_norm": 56813.046875, + "learning_rate": 1.337668877427623e-05, + "loss": 2.1335, + "step": 15235 + }, + { + "epoch": 2.855857544517338, + "grad_norm": 57267.01171875, + "learning_rate": 1.3371339534554318e-05, + "loss": 2.1647, + "step": 15236 + }, + { + "epoch": 2.8560449859418933, + "grad_norm": 59223.48046875, + "learning_rate": 1.3365991199512828e-05, + "loss": 2.4077, + "step": 15237 + }, + { + "epoch": 2.856232427366448, + "grad_norm": 54611.1796875, + "learning_rate": 1.336064376928387e-05, + "loss": 2.1078, + "step": 15238 + }, + { + "epoch": 2.856419868791003, + "grad_norm": 54263.1875, + "learning_rate": 1.3355297243999515e-05, + "loss": 2.1609, + "step": 15239 + }, + { + "epoch": 2.8566073102155576, + "grad_norm": 56172.31640625, + "learning_rate": 1.334995162379179e-05, + "loss": 2.1405, + "step": 15240 + }, + { + "epoch": 2.8567947516401127, + "grad_norm": 59577.84375, + "learning_rate": 1.334460690879274e-05, + "loss": 2.0879, + "step": 15241 + }, + { + "epoch": 2.8569821930646673, + "grad_norm": 62436.5703125, + "learning_rate": 1.3339263099134403e-05, + "loss": 2.1518, + "step": 15242 + }, + { + "epoch": 2.857169634489222, + "grad_norm": 55511.609375, + "learning_rate": 1.3333920194948713e-05, + "loss": 2.0991, + "step": 15243 + }, + { + "epoch": 2.857357075913777, + "grad_norm": 55343.80859375, + "learning_rate": 1.3328578196367653e-05, + "loss": 2.0968, + "step": 15244 + }, + { + "epoch": 2.857544517338332, + "grad_norm": 54780.20703125, + "learning_rate": 1.3323237103523179e-05, + "loss": 2.1503, + "step": 15245 + }, + { + "epoch": 2.8577319587628867, + "grad_norm": 56819.04296875, + "learning_rate": 1.3317896916547195e-05, + "loss": 2.131, + "step": 15246 + }, + { + "epoch": 2.8579194001874413, + "grad_norm": 57293.1953125, + "learning_rate": 1.3312557635571588e-05, + "loss": 2.1609, + "step": 15247 + }, + { + "epoch": 2.8581068416119964, + "grad_norm": 51235.16796875, + "learning_rate": 1.3307219260728255e-05, + "loss": 2.0714, + "step": 15248 + }, + { + "epoch": 2.858294283036551, + "grad_norm": 53883.984375, + "learning_rate": 1.3301881792149034e-05, + "loss": 2.121, + "step": 15249 + }, + { + "epoch": 2.858481724461106, + "grad_norm": 54425.56640625, + "learning_rate": 1.3296545229965734e-05, + "loss": 2.1454, + "step": 15250 + }, + { + "epoch": 2.8586691658856607, + "grad_norm": 52407.55078125, + "learning_rate": 1.3291209574310182e-05, + "loss": 2.1016, + "step": 15251 + }, + { + "epoch": 2.8588566073102157, + "grad_norm": 53207.64453125, + "learning_rate": 1.3285874825314177e-05, + "loss": 2.1121, + "step": 15252 + }, + { + "epoch": 2.8590440487347704, + "grad_norm": 53172.1953125, + "learning_rate": 1.3280540983109463e-05, + "loss": 2.1014, + "step": 15253 + }, + { + "epoch": 2.859231490159325, + "grad_norm": 53284.57421875, + "learning_rate": 1.3275208047827764e-05, + "loss": 2.0833, + "step": 15254 + }, + { + "epoch": 2.85941893158388, + "grad_norm": 51217.453125, + "learning_rate": 1.3269876019600824e-05, + "loss": 2.1584, + "step": 15255 + }, + { + "epoch": 2.859606373008435, + "grad_norm": 58929.7890625, + "learning_rate": 1.3264544898560311e-05, + "loss": 2.1434, + "step": 15256 + }, + { + "epoch": 2.8597938144329897, + "grad_norm": 50806.6171875, + "learning_rate": 1.3259214684837929e-05, + "loss": 2.1405, + "step": 15257 + }, + { + "epoch": 2.8599812558575444, + "grad_norm": 54604.11328125, + "learning_rate": 1.325388537856529e-05, + "loss": 2.1691, + "step": 15258 + }, + { + "epoch": 2.8601686972820994, + "grad_norm": 57977.5078125, + "learning_rate": 1.3248556979874061e-05, + "loss": 2.0528, + "step": 15259 + }, + { + "epoch": 2.860356138706654, + "grad_norm": 56455.125, + "learning_rate": 1.324322948889582e-05, + "loss": 2.0478, + "step": 15260 + }, + { + "epoch": 2.860543580131209, + "grad_norm": 53196.6484375, + "learning_rate": 1.3237902905762139e-05, + "loss": 2.129, + "step": 15261 + }, + { + "epoch": 2.8607310215557638, + "grad_norm": 58890.84765625, + "learning_rate": 1.3232577230604615e-05, + "loss": 2.1313, + "step": 15262 + }, + { + "epoch": 2.860918462980319, + "grad_norm": 62295.52734375, + "learning_rate": 1.322725246355474e-05, + "loss": 2.1199, + "step": 15263 + }, + { + "epoch": 2.8611059044048734, + "grad_norm": 49879.93359375, + "learning_rate": 1.3221928604744077e-05, + "loss": 2.1055, + "step": 15264 + }, + { + "epoch": 2.861293345829428, + "grad_norm": 51836.91015625, + "learning_rate": 1.3216605654304087e-05, + "loss": 2.0751, + "step": 15265 + }, + { + "epoch": 2.861480787253983, + "grad_norm": 53453.625, + "learning_rate": 1.3211283612366232e-05, + "loss": 2.0773, + "step": 15266 + }, + { + "epoch": 2.861668228678538, + "grad_norm": 55058.65234375, + "learning_rate": 1.3205962479061978e-05, + "loss": 2.0984, + "step": 15267 + }, + { + "epoch": 2.861855670103093, + "grad_norm": 53342.015625, + "learning_rate": 1.320064225452276e-05, + "loss": 2.1214, + "step": 15268 + }, + { + "epoch": 2.8620431115276475, + "grad_norm": 51180.80078125, + "learning_rate": 1.3195322938879968e-05, + "loss": 2.1701, + "step": 15269 + }, + { + "epoch": 2.8622305529522025, + "grad_norm": 55714.13671875, + "learning_rate": 1.3190004532264965e-05, + "loss": 2.1654, + "step": 15270 + }, + { + "epoch": 2.862417994376757, + "grad_norm": 57467.71484375, + "learning_rate": 1.318468703480913e-05, + "loss": 2.1078, + "step": 15271 + }, + { + "epoch": 2.862605435801312, + "grad_norm": 55127.26171875, + "learning_rate": 1.3179370446643818e-05, + "loss": 2.1766, + "step": 15272 + }, + { + "epoch": 2.862792877225867, + "grad_norm": 58902.734375, + "learning_rate": 1.317405476790029e-05, + "loss": 2.0707, + "step": 15273 + }, + { + "epoch": 2.862980318650422, + "grad_norm": 54867.890625, + "learning_rate": 1.3168739998709861e-05, + "loss": 2.1332, + "step": 15274 + }, + { + "epoch": 2.8631677600749765, + "grad_norm": 53174.01953125, + "learning_rate": 1.3163426139203817e-05, + "loss": 2.0971, + "step": 15275 + }, + { + "epoch": 2.863355201499531, + "grad_norm": 55159.41015625, + "learning_rate": 1.3158113189513387e-05, + "loss": 2.0969, + "step": 15276 + }, + { + "epoch": 2.8635426429240862, + "grad_norm": 57112.1640625, + "learning_rate": 1.3152801149769784e-05, + "loss": 2.1169, + "step": 15277 + }, + { + "epoch": 2.8637300843486413, + "grad_norm": 54240.3828125, + "learning_rate": 1.3147490020104236e-05, + "loss": 2.0494, + "step": 15278 + }, + { + "epoch": 2.863917525773196, + "grad_norm": 52280.625, + "learning_rate": 1.3142179800647897e-05, + "loss": 2.1664, + "step": 15279 + }, + { + "epoch": 2.8641049671977505, + "grad_norm": 56751.4453125, + "learning_rate": 1.3136870491531922e-05, + "loss": 2.0964, + "step": 15280 + }, + { + "epoch": 2.8642924086223056, + "grad_norm": 54465.19921875, + "learning_rate": 1.313156209288745e-05, + "loss": 2.1089, + "step": 15281 + }, + { + "epoch": 2.8644798500468602, + "grad_norm": 60336.625, + "learning_rate": 1.3126254604845607e-05, + "loss": 2.1239, + "step": 15282 + }, + { + "epoch": 2.8646672914714153, + "grad_norm": 58838.09765625, + "learning_rate": 1.3120948027537466e-05, + "loss": 2.1416, + "step": 15283 + }, + { + "epoch": 2.86485473289597, + "grad_norm": 53185.23828125, + "learning_rate": 1.3115642361094083e-05, + "loss": 2.0667, + "step": 15284 + }, + { + "epoch": 2.865042174320525, + "grad_norm": 51128.87109375, + "learning_rate": 1.3110337605646528e-05, + "loss": 2.1423, + "step": 15285 + }, + { + "epoch": 2.8652296157450796, + "grad_norm": 54156.609375, + "learning_rate": 1.310503376132579e-05, + "loss": 2.1164, + "step": 15286 + }, + { + "epoch": 2.8654170571696342, + "grad_norm": 53770.8828125, + "learning_rate": 1.3099730828262897e-05, + "loss": 2.1484, + "step": 15287 + }, + { + "epoch": 2.8656044985941893, + "grad_norm": 58754.5625, + "learning_rate": 1.3094428806588809e-05, + "loss": 2.0666, + "step": 15288 + }, + { + "epoch": 2.8657919400187444, + "grad_norm": 52674.15625, + "learning_rate": 1.3089127696434472e-05, + "loss": 2.1684, + "step": 15289 + }, + { + "epoch": 2.865979381443299, + "grad_norm": 55255.23046875, + "learning_rate": 1.3083827497930834e-05, + "loss": 2.0994, + "step": 15290 + }, + { + "epoch": 2.8661668228678536, + "grad_norm": 55238.00390625, + "learning_rate": 1.3078528211208785e-05, + "loss": 2.148, + "step": 15291 + }, + { + "epoch": 2.8663542642924087, + "grad_norm": 50968.6484375, + "learning_rate": 1.3073229836399226e-05, + "loss": 2.1463, + "step": 15292 + }, + { + "epoch": 2.8665417057169633, + "grad_norm": 54746.05859375, + "learning_rate": 1.3067932373633007e-05, + "loss": 1.9979, + "step": 15293 + }, + { + "epoch": 2.8667291471415184, + "grad_norm": 50269.02734375, + "learning_rate": 1.3062635823040986e-05, + "loss": 2.0929, + "step": 15294 + }, + { + "epoch": 2.866916588566073, + "grad_norm": 55610.3046875, + "learning_rate": 1.3057340184753975e-05, + "loss": 2.1256, + "step": 15295 + }, + { + "epoch": 2.867104029990628, + "grad_norm": 57349.6875, + "learning_rate": 1.3052045458902745e-05, + "loss": 2.1285, + "step": 15296 + }, + { + "epoch": 2.8672914714151827, + "grad_norm": 55474.609375, + "learning_rate": 1.3046751645618088e-05, + "loss": 2.0515, + "step": 15297 + }, + { + "epoch": 2.8674789128397373, + "grad_norm": 51171.40625, + "learning_rate": 1.3041458745030772e-05, + "loss": 2.0654, + "step": 15298 + }, + { + "epoch": 2.8676663542642924, + "grad_norm": 51704.80078125, + "learning_rate": 1.3036166757271506e-05, + "loss": 2.1138, + "step": 15299 + }, + { + "epoch": 2.8678537956888475, + "grad_norm": 57630.59375, + "learning_rate": 1.3030875682470984e-05, + "loss": 2.1097, + "step": 15300 + }, + { + "epoch": 2.868041237113402, + "grad_norm": 51129.765625, + "learning_rate": 1.3025585520759915e-05, + "loss": 2.1007, + "step": 15301 + }, + { + "epoch": 2.8682286785379567, + "grad_norm": 56818.73828125, + "learning_rate": 1.3020296272268945e-05, + "loss": 2.0943, + "step": 15302 + }, + { + "epoch": 2.868416119962512, + "grad_norm": 57682.09375, + "learning_rate": 1.3015007937128693e-05, + "loss": 2.0968, + "step": 15303 + }, + { + "epoch": 2.8686035613870664, + "grad_norm": 53107.6640625, + "learning_rate": 1.3009720515469798e-05, + "loss": 2.1925, + "step": 15304 + }, + { + "epoch": 2.8687910028116215, + "grad_norm": 60834.65625, + "learning_rate": 1.3004434007422878e-05, + "loss": 2.1582, + "step": 15305 + }, + { + "epoch": 2.868978444236176, + "grad_norm": 54401.67578125, + "learning_rate": 1.2999148413118445e-05, + "loss": 2.1941, + "step": 15306 + }, + { + "epoch": 2.869165885660731, + "grad_norm": 52239.83984375, + "learning_rate": 1.2993863732687067e-05, + "loss": 2.0622, + "step": 15307 + }, + { + "epoch": 2.869353327085286, + "grad_norm": 55100.01953125, + "learning_rate": 1.2988579966259296e-05, + "loss": 2.0545, + "step": 15308 + }, + { + "epoch": 2.8695407685098404, + "grad_norm": 58805.02734375, + "learning_rate": 1.2983297113965615e-05, + "loss": 2.1042, + "step": 15309 + }, + { + "epoch": 2.8697282099343955, + "grad_norm": 56887.76953125, + "learning_rate": 1.2978015175936481e-05, + "loss": 2.0262, + "step": 15310 + }, + { + "epoch": 2.8699156513589505, + "grad_norm": 58322.265625, + "learning_rate": 1.2972734152302379e-05, + "loss": 2.1781, + "step": 15311 + }, + { + "epoch": 2.870103092783505, + "grad_norm": 52217.80078125, + "learning_rate": 1.296745404319375e-05, + "loss": 2.1469, + "step": 15312 + }, + { + "epoch": 2.87029053420806, + "grad_norm": 53325.0234375, + "learning_rate": 1.2962174848740992e-05, + "loss": 2.1316, + "step": 15313 + }, + { + "epoch": 2.870477975632615, + "grad_norm": 56259.95703125, + "learning_rate": 1.2956896569074478e-05, + "loss": 2.0545, + "step": 15314 + }, + { + "epoch": 2.8706654170571695, + "grad_norm": 54001.3828125, + "learning_rate": 1.2951619204324611e-05, + "loss": 2.275, + "step": 15315 + }, + { + "epoch": 2.8708528584817246, + "grad_norm": 53285.35546875, + "learning_rate": 1.294634275462171e-05, + "loss": 2.1041, + "step": 15316 + }, + { + "epoch": 2.871040299906279, + "grad_norm": 57967.921875, + "learning_rate": 1.2941067220096092e-05, + "loss": 2.0778, + "step": 15317 + }, + { + "epoch": 2.8712277413308342, + "grad_norm": 51756.8203125, + "learning_rate": 1.2935792600878077e-05, + "loss": 2.1246, + "step": 15318 + }, + { + "epoch": 2.871415182755389, + "grad_norm": 53263.48046875, + "learning_rate": 1.293051889709792e-05, + "loss": 2.097, + "step": 15319 + }, + { + "epoch": 2.8716026241799435, + "grad_norm": 57900.94140625, + "learning_rate": 1.2925246108885896e-05, + "loss": 2.15, + "step": 15320 + }, + { + "epoch": 2.8717900656044986, + "grad_norm": 52649.25, + "learning_rate": 1.2919974236372212e-05, + "loss": 2.1032, + "step": 15321 + }, + { + "epoch": 2.8719775070290536, + "grad_norm": 53389.0234375, + "learning_rate": 1.2914703279687106e-05, + "loss": 2.088, + "step": 15322 + }, + { + "epoch": 2.8721649484536083, + "grad_norm": 59934.80078125, + "learning_rate": 1.2909433238960728e-05, + "loss": 2.0271, + "step": 15323 + }, + { + "epoch": 2.872352389878163, + "grad_norm": 53802.9609375, + "learning_rate": 1.290416411432328e-05, + "loss": 2.0804, + "step": 15324 + }, + { + "epoch": 2.872539831302718, + "grad_norm": 52231.44140625, + "learning_rate": 1.2898895905904879e-05, + "loss": 2.1344, + "step": 15325 + }, + { + "epoch": 2.8727272727272726, + "grad_norm": 55313.53515625, + "learning_rate": 1.2893628613835634e-05, + "loss": 2.0872, + "step": 15326 + }, + { + "epoch": 2.8729147141518276, + "grad_norm": 55832.57421875, + "learning_rate": 1.2888362238245655e-05, + "loss": 2.0921, + "step": 15327 + }, + { + "epoch": 2.8731021555763823, + "grad_norm": 54157.640625, + "learning_rate": 1.2883096779265036e-05, + "loss": 2.1151, + "step": 15328 + }, + { + "epoch": 2.8732895970009373, + "grad_norm": 57149.2109375, + "learning_rate": 1.287783223702378e-05, + "loss": 2.1396, + "step": 15329 + }, + { + "epoch": 2.873477038425492, + "grad_norm": 57283.5703125, + "learning_rate": 1.2872568611651936e-05, + "loss": 2.1011, + "step": 15330 + }, + { + "epoch": 2.873664479850047, + "grad_norm": 53544.6484375, + "learning_rate": 1.2867305903279525e-05, + "loss": 2.1062, + "step": 15331 + }, + { + "epoch": 2.8738519212746017, + "grad_norm": 62806.03515625, + "learning_rate": 1.2862044112036515e-05, + "loss": 2.1689, + "step": 15332 + }, + { + "epoch": 2.8740393626991567, + "grad_norm": 54873.91015625, + "learning_rate": 1.2856783238052856e-05, + "loss": 2.0784, + "step": 15333 + }, + { + "epoch": 2.8742268041237113, + "grad_norm": 51832.9609375, + "learning_rate": 1.2851523281458488e-05, + "loss": 2.1782, + "step": 15334 + }, + { + "epoch": 2.874414245548266, + "grad_norm": 57184.83984375, + "learning_rate": 1.284626424238336e-05, + "loss": 2.1529, + "step": 15335 + }, + { + "epoch": 2.874601686972821, + "grad_norm": 53465.77734375, + "learning_rate": 1.2841006120957305e-05, + "loss": 2.0659, + "step": 15336 + }, + { + "epoch": 2.8747891283973757, + "grad_norm": 54433.5703125, + "learning_rate": 1.2835748917310225e-05, + "loss": 2.0832, + "step": 15337 + }, + { + "epoch": 2.8749765698219307, + "grad_norm": 55164.484375, + "learning_rate": 1.2830492631571977e-05, + "loss": 2.0736, + "step": 15338 + }, + { + "epoch": 2.8751640112464854, + "grad_norm": 57638.453125, + "learning_rate": 1.282523726387237e-05, + "loss": 2.1038, + "step": 15339 + }, + { + "epoch": 2.8753514526710404, + "grad_norm": 57018.12890625, + "learning_rate": 1.2819982814341192e-05, + "loss": 2.1463, + "step": 15340 + }, + { + "epoch": 2.875538894095595, + "grad_norm": 54469.65234375, + "learning_rate": 1.2814729283108246e-05, + "loss": 2.1554, + "step": 15341 + }, + { + "epoch": 2.87572633552015, + "grad_norm": 56664.4375, + "learning_rate": 1.2809476670303256e-05, + "loss": 2.0898, + "step": 15342 + }, + { + "epoch": 2.8759137769447047, + "grad_norm": 55982.671875, + "learning_rate": 1.2804224976055995e-05, + "loss": 2.1489, + "step": 15343 + }, + { + "epoch": 2.87610121836926, + "grad_norm": 55155.35546875, + "learning_rate": 1.2798974200496134e-05, + "loss": 2.0638, + "step": 15344 + }, + { + "epoch": 2.8762886597938144, + "grad_norm": 64341.55078125, + "learning_rate": 1.2793724343753393e-05, + "loss": 2.207, + "step": 15345 + }, + { + "epoch": 2.876476101218369, + "grad_norm": 53132.92578125, + "learning_rate": 1.278847540595743e-05, + "loss": 2.1577, + "step": 15346 + }, + { + "epoch": 2.876663542642924, + "grad_norm": 56563.42578125, + "learning_rate": 1.2783227387237856e-05, + "loss": 2.1307, + "step": 15347 + }, + { + "epoch": 2.876850984067479, + "grad_norm": 53462.796875, + "learning_rate": 1.2777980287724328e-05, + "loss": 2.0942, + "step": 15348 + }, + { + "epoch": 2.877038425492034, + "grad_norm": 52473.98828125, + "learning_rate": 1.2772734107546413e-05, + "loss": 2.1258, + "step": 15349 + }, + { + "epoch": 2.8772258669165884, + "grad_norm": 52813.82421875, + "learning_rate": 1.276748884683372e-05, + "loss": 2.1468, + "step": 15350 + }, + { + "epoch": 2.8774133083411435, + "grad_norm": 57781.54296875, + "learning_rate": 1.276224450571576e-05, + "loss": 2.1615, + "step": 15351 + }, + { + "epoch": 2.877600749765698, + "grad_norm": 51888.36328125, + "learning_rate": 1.27570010843221e-05, + "loss": 2.0974, + "step": 15352 + }, + { + "epoch": 2.877788191190253, + "grad_norm": 56575.2265625, + "learning_rate": 1.2751758582782209e-05, + "loss": 2.1067, + "step": 15353 + }, + { + "epoch": 2.877975632614808, + "grad_norm": 50982.01171875, + "learning_rate": 1.2746517001225605e-05, + "loss": 2.1368, + "step": 15354 + }, + { + "epoch": 2.878163074039363, + "grad_norm": 52043.078125, + "learning_rate": 1.2741276339781728e-05, + "loss": 2.0866, + "step": 15355 + }, + { + "epoch": 2.8783505154639175, + "grad_norm": 53057.51953125, + "learning_rate": 1.2736036598580003e-05, + "loss": 2.1008, + "step": 15356 + }, + { + "epoch": 2.878537956888472, + "grad_norm": 51114.5625, + "learning_rate": 1.273079777774988e-05, + "loss": 2.0997, + "step": 15357 + }, + { + "epoch": 2.878725398313027, + "grad_norm": 51005.77734375, + "learning_rate": 1.2725559877420729e-05, + "loss": 2.0932, + "step": 15358 + }, + { + "epoch": 2.8789128397375823, + "grad_norm": 55321.9140625, + "learning_rate": 1.2720322897721909e-05, + "loss": 2.0409, + "step": 15359 + }, + { + "epoch": 2.879100281162137, + "grad_norm": 54732.5546875, + "learning_rate": 1.271508683878278e-05, + "loss": 2.1328, + "step": 15360 + }, + { + "epoch": 2.8792877225866915, + "grad_norm": 56123.29296875, + "learning_rate": 1.2709851700732677e-05, + "loss": 2.1705, + "step": 15361 + }, + { + "epoch": 2.8794751640112466, + "grad_norm": 56397.7890625, + "learning_rate": 1.2704617483700892e-05, + "loss": 2.007, + "step": 15362 + }, + { + "epoch": 2.879662605435801, + "grad_norm": 53314.33984375, + "learning_rate": 1.2699384187816688e-05, + "loss": 2.1789, + "step": 15363 + }, + { + "epoch": 2.8798500468603563, + "grad_norm": 52457.140625, + "learning_rate": 1.2694151813209326e-05, + "loss": 2.0648, + "step": 15364 + }, + { + "epoch": 2.880037488284911, + "grad_norm": 54710.734375, + "learning_rate": 1.2688920360008082e-05, + "loss": 2.1174, + "step": 15365 + }, + { + "epoch": 2.880224929709466, + "grad_norm": 52465.97265625, + "learning_rate": 1.26836898283421e-05, + "loss": 2.1103, + "step": 15366 + }, + { + "epoch": 2.8804123711340206, + "grad_norm": 58989.984375, + "learning_rate": 1.2678460218340599e-05, + "loss": 2.0707, + "step": 15367 + }, + { + "epoch": 2.8805998125585752, + "grad_norm": 60606.9453125, + "learning_rate": 1.2673231530132757e-05, + "loss": 2.0835, + "step": 15368 + }, + { + "epoch": 2.8807872539831303, + "grad_norm": 51993.00390625, + "learning_rate": 1.2668003763847703e-05, + "loss": 2.1442, + "step": 15369 + }, + { + "epoch": 2.8809746954076854, + "grad_norm": 53488.03125, + "learning_rate": 1.2662776919614533e-05, + "loss": 2.0872, + "step": 15370 + }, + { + "epoch": 2.88116213683224, + "grad_norm": 53739.8203125, + "learning_rate": 1.2657550997562385e-05, + "loss": 2.162, + "step": 15371 + }, + { + "epoch": 2.8813495782567946, + "grad_norm": 53523.98828125, + "learning_rate": 1.2652325997820313e-05, + "loss": 2.1785, + "step": 15372 + }, + { + "epoch": 2.8815370196813497, + "grad_norm": 58834.890625, + "learning_rate": 1.264710192051735e-05, + "loss": 2.1826, + "step": 15373 + }, + { + "epoch": 2.8817244611059043, + "grad_norm": 51213.40234375, + "learning_rate": 1.2641878765782538e-05, + "loss": 2.1802, + "step": 15374 + }, + { + "epoch": 2.8819119025304594, + "grad_norm": 54518.99609375, + "learning_rate": 1.2636656533744906e-05, + "loss": 2.0197, + "step": 15375 + }, + { + "epoch": 2.882099343955014, + "grad_norm": 61724.36328125, + "learning_rate": 1.2631435224533411e-05, + "loss": 2.1083, + "step": 15376 + }, + { + "epoch": 2.882286785379569, + "grad_norm": 57826.5703125, + "learning_rate": 1.2626214838277001e-05, + "loss": 2.1384, + "step": 15377 + }, + { + "epoch": 2.8824742268041237, + "grad_norm": 56602.2578125, + "learning_rate": 1.2620995375104644e-05, + "loss": 2.0763, + "step": 15378 + }, + { + "epoch": 2.8826616682286783, + "grad_norm": 53824.015625, + "learning_rate": 1.2615776835145226e-05, + "loss": 2.1178, + "step": 15379 + }, + { + "epoch": 2.8828491096532334, + "grad_norm": 52341.0, + "learning_rate": 1.2610559218527669e-05, + "loss": 2.1285, + "step": 15380 + }, + { + "epoch": 2.8830365510777884, + "grad_norm": 59221.4453125, + "learning_rate": 1.2605342525380814e-05, + "loss": 2.143, + "step": 15381 + }, + { + "epoch": 2.883223992502343, + "grad_norm": 51879.34765625, + "learning_rate": 1.2600126755833507e-05, + "loss": 2.1224, + "step": 15382 + }, + { + "epoch": 2.8834114339268977, + "grad_norm": 50264.45703125, + "learning_rate": 1.2594911910014579e-05, + "loss": 2.1394, + "step": 15383 + }, + { + "epoch": 2.8835988753514528, + "grad_norm": 65166.5078125, + "learning_rate": 1.2589697988052846e-05, + "loss": 2.1757, + "step": 15384 + }, + { + "epoch": 2.8837863167760074, + "grad_norm": 55980.73828125, + "learning_rate": 1.2584484990077067e-05, + "loss": 2.1059, + "step": 15385 + }, + { + "epoch": 2.8839737582005625, + "grad_norm": 53769.390625, + "learning_rate": 1.2579272916215985e-05, + "loss": 2.0991, + "step": 15386 + }, + { + "epoch": 2.884161199625117, + "grad_norm": 53889.9921875, + "learning_rate": 1.257406176659836e-05, + "loss": 2.1547, + "step": 15387 + }, + { + "epoch": 2.884348641049672, + "grad_norm": 57331.69921875, + "learning_rate": 1.2568851541352889e-05, + "loss": 2.167, + "step": 15388 + }, + { + "epoch": 2.8845360824742268, + "grad_norm": 54989.1015625, + "learning_rate": 1.2563642240608236e-05, + "loss": 2.1863, + "step": 15389 + }, + { + "epoch": 2.8847235238987814, + "grad_norm": 52971.3984375, + "learning_rate": 1.2558433864493085e-05, + "loss": 2.1713, + "step": 15390 + }, + { + "epoch": 2.8849109653233365, + "grad_norm": 54819.3984375, + "learning_rate": 1.2553226413136088e-05, + "loss": 2.1651, + "step": 15391 + }, + { + "epoch": 2.8850984067478915, + "grad_norm": 52455.51171875, + "learning_rate": 1.2548019886665845e-05, + "loss": 2.0927, + "step": 15392 + }, + { + "epoch": 2.885285848172446, + "grad_norm": 51280.0234375, + "learning_rate": 1.2542814285210941e-05, + "loss": 2.1139, + "step": 15393 + }, + { + "epoch": 2.885473289597001, + "grad_norm": 50439.59765625, + "learning_rate": 1.2537609608899974e-05, + "loss": 2.1008, + "step": 15394 + }, + { + "epoch": 2.885660731021556, + "grad_norm": 54597.8984375, + "learning_rate": 1.2532405857861485e-05, + "loss": 2.1094, + "step": 15395 + }, + { + "epoch": 2.8858481724461105, + "grad_norm": 59917.4609375, + "learning_rate": 1.2527203032223972e-05, + "loss": 2.0787, + "step": 15396 + }, + { + "epoch": 2.8860356138706655, + "grad_norm": 52194.51171875, + "learning_rate": 1.2522001132115963e-05, + "loss": 2.1167, + "step": 15397 + }, + { + "epoch": 2.88622305529522, + "grad_norm": 53087.5, + "learning_rate": 1.2516800157665964e-05, + "loss": 2.1205, + "step": 15398 + }, + { + "epoch": 2.8864104967197752, + "grad_norm": 50243.1484375, + "learning_rate": 1.2511600109002375e-05, + "loss": 2.0968, + "step": 15399 + }, + { + "epoch": 2.88659793814433, + "grad_norm": 63388.1640625, + "learning_rate": 1.2506400986253659e-05, + "loss": 2.1601, + "step": 15400 + }, + { + "epoch": 2.8867853795688845, + "grad_norm": 51586.4921875, + "learning_rate": 1.2501202789548238e-05, + "loss": 2.0656, + "step": 15401 + }, + { + "epoch": 2.8869728209934395, + "grad_norm": 51690.55859375, + "learning_rate": 1.2496005519014492e-05, + "loss": 2.0738, + "step": 15402 + }, + { + "epoch": 2.8871602624179946, + "grad_norm": 57103.03515625, + "learning_rate": 1.2490809174780772e-05, + "loss": 2.1191, + "step": 15403 + }, + { + "epoch": 2.8873477038425492, + "grad_norm": 54688.6171875, + "learning_rate": 1.2485613756975434e-05, + "loss": 2.1121, + "step": 15404 + }, + { + "epoch": 2.887535145267104, + "grad_norm": 56227.65625, + "learning_rate": 1.2480419265726811e-05, + "loss": 2.106, + "step": 15405 + }, + { + "epoch": 2.887722586691659, + "grad_norm": 52487.53515625, + "learning_rate": 1.247522570116319e-05, + "loss": 2.0745, + "step": 15406 + }, + { + "epoch": 2.8879100281162136, + "grad_norm": 55696.34375, + "learning_rate": 1.2470033063412828e-05, + "loss": 2.1396, + "step": 15407 + }, + { + "epoch": 2.8880974695407686, + "grad_norm": 54217.453125, + "learning_rate": 1.2464841352604e-05, + "loss": 2.1128, + "step": 15408 + }, + { + "epoch": 2.8882849109653232, + "grad_norm": 54211.640625, + "learning_rate": 1.2459650568864917e-05, + "loss": 2.0981, + "step": 15409 + }, + { + "epoch": 2.8884723523898783, + "grad_norm": 59335.3359375, + "learning_rate": 1.2454460712323807e-05, + "loss": 2.1035, + "step": 15410 + }, + { + "epoch": 2.888659793814433, + "grad_norm": 57216.58203125, + "learning_rate": 1.2449271783108835e-05, + "loss": 2.0328, + "step": 15411 + }, + { + "epoch": 2.8888472352389876, + "grad_norm": 54125.78125, + "learning_rate": 1.2444083781348153e-05, + "loss": 2.136, + "step": 15412 + }, + { + "epoch": 2.8890346766635426, + "grad_norm": 54460.8359375, + "learning_rate": 1.2438896707169923e-05, + "loss": 2.1119, + "step": 15413 + }, + { + "epoch": 2.8892221180880977, + "grad_norm": 52683.5859375, + "learning_rate": 1.2433710560702233e-05, + "loss": 2.1726, + "step": 15414 + }, + { + "epoch": 2.8894095595126523, + "grad_norm": 50339.22265625, + "learning_rate": 1.24285253420732e-05, + "loss": 2.1302, + "step": 15415 + }, + { + "epoch": 2.889597000937207, + "grad_norm": 58923.83203125, + "learning_rate": 1.2423341051410864e-05, + "loss": 2.1382, + "step": 15416 + }, + { + "epoch": 2.889784442361762, + "grad_norm": 53703.890625, + "learning_rate": 1.2418157688843302e-05, + "loss": 2.0867, + "step": 15417 + }, + { + "epoch": 2.8899718837863166, + "grad_norm": 53447.39453125, + "learning_rate": 1.2412975254498516e-05, + "loss": 2.1395, + "step": 15418 + }, + { + "epoch": 2.8901593252108717, + "grad_norm": 57722.26953125, + "learning_rate": 1.2407793748504499e-05, + "loss": 2.1796, + "step": 15419 + }, + { + "epoch": 2.8903467666354263, + "grad_norm": 54001.0078125, + "learning_rate": 1.2402613170989236e-05, + "loss": 2.1327, + "step": 15420 + }, + { + "epoch": 2.8905342080599814, + "grad_norm": 59385.6953125, + "learning_rate": 1.239743352208071e-05, + "loss": 2.1424, + "step": 15421 + }, + { + "epoch": 2.890721649484536, + "grad_norm": 60011.6875, + "learning_rate": 1.2392254801906788e-05, + "loss": 2.0976, + "step": 15422 + }, + { + "epoch": 2.8909090909090907, + "grad_norm": 53450.15625, + "learning_rate": 1.238707701059542e-05, + "loss": 2.1285, + "step": 15423 + }, + { + "epoch": 2.8910965323336457, + "grad_norm": 53928.46875, + "learning_rate": 1.23819001482745e-05, + "loss": 2.167, + "step": 15424 + }, + { + "epoch": 2.891283973758201, + "grad_norm": 53318.7109375, + "learning_rate": 1.2376724215071872e-05, + "loss": 2.0822, + "step": 15425 + }, + { + "epoch": 2.8914714151827554, + "grad_norm": 54532.6171875, + "learning_rate": 1.2371549211115363e-05, + "loss": 2.1956, + "step": 15426 + }, + { + "epoch": 2.89165885660731, + "grad_norm": 53575.3671875, + "learning_rate": 1.23663751365328e-05, + "loss": 2.1609, + "step": 15427 + }, + { + "epoch": 2.891846298031865, + "grad_norm": 57137.16796875, + "learning_rate": 1.236120199145201e-05, + "loss": 2.0947, + "step": 15428 + }, + { + "epoch": 2.8920337394564197, + "grad_norm": 56057.4140625, + "learning_rate": 1.23560297760007e-05, + "loss": 2.0804, + "step": 15429 + }, + { + "epoch": 2.892221180880975, + "grad_norm": 59973.578125, + "learning_rate": 1.2350858490306649e-05, + "loss": 2.112, + "step": 15430 + }, + { + "epoch": 2.8924086223055294, + "grad_norm": 55063.23046875, + "learning_rate": 1.2345688134497597e-05, + "loss": 2.1201, + "step": 15431 + }, + { + "epoch": 2.8925960637300845, + "grad_norm": 57465.77734375, + "learning_rate": 1.2340518708701226e-05, + "loss": 2.176, + "step": 15432 + }, + { + "epoch": 2.892783505154639, + "grad_norm": 54875.0859375, + "learning_rate": 1.2335350213045199e-05, + "loss": 2.1825, + "step": 15433 + }, + { + "epoch": 2.8929709465791937, + "grad_norm": 60092.10546875, + "learning_rate": 1.2330182647657212e-05, + "loss": 2.052, + "step": 15434 + }, + { + "epoch": 2.893158388003749, + "grad_norm": 55285.8125, + "learning_rate": 1.2325016012664852e-05, + "loss": 2.1174, + "step": 15435 + }, + { + "epoch": 2.893345829428304, + "grad_norm": 56610.1171875, + "learning_rate": 1.2319850308195769e-05, + "loss": 2.1281, + "step": 15436 + }, + { + "epoch": 2.8935332708528585, + "grad_norm": 52499.85546875, + "learning_rate": 1.2314685534377513e-05, + "loss": 2.1193, + "step": 15437 + }, + { + "epoch": 2.893720712277413, + "grad_norm": 58404.16015625, + "learning_rate": 1.2309521691337677e-05, + "loss": 2.1106, + "step": 15438 + }, + { + "epoch": 2.893908153701968, + "grad_norm": 55038.0234375, + "learning_rate": 1.2304358779203779e-05, + "loss": 2.1139, + "step": 15439 + }, + { + "epoch": 2.894095595126523, + "grad_norm": 52348.32421875, + "learning_rate": 1.2299196798103362e-05, + "loss": 2.1468, + "step": 15440 + }, + { + "epoch": 2.894283036551078, + "grad_norm": 58830.87109375, + "learning_rate": 1.2294035748163901e-05, + "loss": 2.1199, + "step": 15441 + }, + { + "epoch": 2.8944704779756325, + "grad_norm": 59746.859375, + "learning_rate": 1.2288875629512859e-05, + "loss": 2.1455, + "step": 15442 + }, + { + "epoch": 2.8946579194001876, + "grad_norm": 53717.35546875, + "learning_rate": 1.2283716442277704e-05, + "loss": 2.0867, + "step": 15443 + }, + { + "epoch": 2.894845360824742, + "grad_norm": 56520.11328125, + "learning_rate": 1.2278558186585842e-05, + "loss": 2.0944, + "step": 15444 + }, + { + "epoch": 2.895032802249297, + "grad_norm": 56143.58203125, + "learning_rate": 1.2273400862564699e-05, + "loss": 2.1033, + "step": 15445 + }, + { + "epoch": 2.895220243673852, + "grad_norm": 50226.76171875, + "learning_rate": 1.2268244470341633e-05, + "loss": 2.1331, + "step": 15446 + }, + { + "epoch": 2.895407685098407, + "grad_norm": 55146.1640625, + "learning_rate": 1.2263089010044016e-05, + "loss": 2.0983, + "step": 15447 + }, + { + "epoch": 2.8955951265229616, + "grad_norm": 51341.63671875, + "learning_rate": 1.2257934481799177e-05, + "loss": 2.1334, + "step": 15448 + }, + { + "epoch": 2.895782567947516, + "grad_norm": 51463.58203125, + "learning_rate": 1.225278088573441e-05, + "loss": 2.0465, + "step": 15449 + }, + { + "epoch": 2.8959700093720713, + "grad_norm": 59322.671875, + "learning_rate": 1.2247628221977014e-05, + "loss": 2.0889, + "step": 15450 + }, + { + "epoch": 2.896157450796626, + "grad_norm": 51198.76171875, + "learning_rate": 1.2242476490654281e-05, + "loss": 2.1572, + "step": 15451 + }, + { + "epoch": 2.896344892221181, + "grad_norm": 57703.2265625, + "learning_rate": 1.2237325691893397e-05, + "loss": 2.174, + "step": 15452 + }, + { + "epoch": 2.8965323336457356, + "grad_norm": 55674.48828125, + "learning_rate": 1.2232175825821612e-05, + "loss": 2.1897, + "step": 15453 + }, + { + "epoch": 2.8967197750702907, + "grad_norm": 51444.84375, + "learning_rate": 1.222702689256613e-05, + "loss": 2.1594, + "step": 15454 + }, + { + "epoch": 2.8969072164948453, + "grad_norm": 52084.93359375, + "learning_rate": 1.2221878892254107e-05, + "loss": 2.148, + "step": 15455 + }, + { + "epoch": 2.8970946579194004, + "grad_norm": 56144.89453125, + "learning_rate": 1.2216731825012683e-05, + "loss": 2.0863, + "step": 15456 + }, + { + "epoch": 2.897282099343955, + "grad_norm": 58254.90625, + "learning_rate": 1.2211585690968996e-05, + "loss": 2.1688, + "step": 15457 + }, + { + "epoch": 2.89746954076851, + "grad_norm": 52444.02734375, + "learning_rate": 1.2206440490250181e-05, + "loss": 2.134, + "step": 15458 + }, + { + "epoch": 2.8976569821930647, + "grad_norm": 52680.07421875, + "learning_rate": 1.2201296222983255e-05, + "loss": 2.1541, + "step": 15459 + }, + { + "epoch": 2.8978444236176193, + "grad_norm": 54569.6015625, + "learning_rate": 1.2196152889295304e-05, + "loss": 2.1916, + "step": 15460 + }, + { + "epoch": 2.8980318650421744, + "grad_norm": 57432.703125, + "learning_rate": 1.2191010489313376e-05, + "loss": 2.0058, + "step": 15461 + }, + { + "epoch": 2.898219306466729, + "grad_norm": 52789.13671875, + "learning_rate": 1.218586902316447e-05, + "loss": 2.1961, + "step": 15462 + }, + { + "epoch": 2.898406747891284, + "grad_norm": 52371.7890625, + "learning_rate": 1.2180728490975563e-05, + "loss": 2.0932, + "step": 15463 + }, + { + "epoch": 2.8985941893158387, + "grad_norm": 55703.66015625, + "learning_rate": 1.217558889287364e-05, + "loss": 2.0566, + "step": 15464 + }, + { + "epoch": 2.8987816307403937, + "grad_norm": 57016.0390625, + "learning_rate": 1.2170450228985614e-05, + "loss": 2.1182, + "step": 15465 + }, + { + "epoch": 2.8989690721649484, + "grad_norm": 54067.65234375, + "learning_rate": 1.2165312499438435e-05, + "loss": 2.1323, + "step": 15466 + }, + { + "epoch": 2.8991565135895034, + "grad_norm": 58592.2578125, + "learning_rate": 1.2160175704358973e-05, + "loss": 2.1815, + "step": 15467 + }, + { + "epoch": 2.899343955014058, + "grad_norm": 53396.6015625, + "learning_rate": 1.2155039843874127e-05, + "loss": 2.1168, + "step": 15468 + }, + { + "epoch": 2.899531396438613, + "grad_norm": 55530.5078125, + "learning_rate": 1.2149904918110727e-05, + "loss": 2.0812, + "step": 15469 + }, + { + "epoch": 2.8997188378631678, + "grad_norm": 57223.875, + "learning_rate": 1.2144770927195587e-05, + "loss": 2.1853, + "step": 15470 + }, + { + "epoch": 2.8999062792877224, + "grad_norm": 58570.625, + "learning_rate": 1.2139637871255544e-05, + "loss": 2.0366, + "step": 15471 + }, + { + "epoch": 2.9000937207122774, + "grad_norm": 55537.05859375, + "learning_rate": 1.2134505750417341e-05, + "loss": 2.0526, + "step": 15472 + }, + { + "epoch": 2.9002811621368325, + "grad_norm": 54400.39453125, + "learning_rate": 1.2129374564807766e-05, + "loss": 2.1576, + "step": 15473 + }, + { + "epoch": 2.900468603561387, + "grad_norm": 55413.71875, + "learning_rate": 1.212424431455354e-05, + "loss": 2.1731, + "step": 15474 + }, + { + "epoch": 2.9006560449859418, + "grad_norm": 53348.4765625, + "learning_rate": 1.2119114999781362e-05, + "loss": 2.1184, + "step": 15475 + }, + { + "epoch": 2.900843486410497, + "grad_norm": 52877.1328125, + "learning_rate": 1.2113986620617923e-05, + "loss": 2.0959, + "step": 15476 + }, + { + "epoch": 2.9010309278350515, + "grad_norm": 59529.6640625, + "learning_rate": 1.2108859177189912e-05, + "loss": 2.1148, + "step": 15477 + }, + { + "epoch": 2.9012183692596065, + "grad_norm": 58460.28125, + "learning_rate": 1.2103732669623958e-05, + "loss": 2.1024, + "step": 15478 + }, + { + "epoch": 2.901405810684161, + "grad_norm": 59849.609375, + "learning_rate": 1.2098607098046654e-05, + "loss": 2.1509, + "step": 15479 + }, + { + "epoch": 2.901593252108716, + "grad_norm": 58275.921875, + "learning_rate": 1.2093482462584616e-05, + "loss": 2.1085, + "step": 15480 + }, + { + "epoch": 2.901780693533271, + "grad_norm": 54638.0625, + "learning_rate": 1.2088358763364443e-05, + "loss": 2.1796, + "step": 15481 + }, + { + "epoch": 2.9019681349578255, + "grad_norm": 55583.38671875, + "learning_rate": 1.2083236000512626e-05, + "loss": 2.126, + "step": 15482 + }, + { + "epoch": 2.9021555763823805, + "grad_norm": 60864.6953125, + "learning_rate": 1.2078114174155724e-05, + "loss": 2.0857, + "step": 15483 + }, + { + "epoch": 2.9023430178069356, + "grad_norm": 54391.55859375, + "learning_rate": 1.2072993284420247e-05, + "loss": 2.1125, + "step": 15484 + }, + { + "epoch": 2.9025304592314902, + "grad_norm": 52368.39453125, + "learning_rate": 1.206787333143266e-05, + "loss": 2.0766, + "step": 15485 + }, + { + "epoch": 2.902717900656045, + "grad_norm": 53818.578125, + "learning_rate": 1.2062754315319414e-05, + "loss": 2.1041, + "step": 15486 + }, + { + "epoch": 2.9029053420806, + "grad_norm": 51508.5859375, + "learning_rate": 1.205763623620696e-05, + "loss": 2.1115, + "step": 15487 + }, + { + "epoch": 2.9030927835051545, + "grad_norm": 60432.77734375, + "learning_rate": 1.2052519094221698e-05, + "loss": 2.1266, + "step": 15488 + }, + { + "epoch": 2.9032802249297096, + "grad_norm": 56827.54296875, + "learning_rate": 1.204740288949e-05, + "loss": 2.1683, + "step": 15489 + }, + { + "epoch": 2.9034676663542642, + "grad_norm": 51928.8984375, + "learning_rate": 1.2042287622138243e-05, + "loss": 2.1323, + "step": 15490 + }, + { + "epoch": 2.9036551077788193, + "grad_norm": 52908.17578125, + "learning_rate": 1.2037173292292786e-05, + "loss": 2.1516, + "step": 15491 + }, + { + "epoch": 2.903842549203374, + "grad_norm": 53116.2109375, + "learning_rate": 1.2032059900079928e-05, + "loss": 2.0697, + "step": 15492 + }, + { + "epoch": 2.9040299906279285, + "grad_norm": 52362.31640625, + "learning_rate": 1.2026947445625953e-05, + "loss": 2.1443, + "step": 15493 + }, + { + "epoch": 2.9042174320524836, + "grad_norm": 51195.76171875, + "learning_rate": 1.2021835929057151e-05, + "loss": 2.1174, + "step": 15494 + }, + { + "epoch": 2.9044048734770387, + "grad_norm": 58216.61328125, + "learning_rate": 1.2016725350499768e-05, + "loss": 2.072, + "step": 15495 + }, + { + "epoch": 2.9045923149015933, + "grad_norm": 56319.76171875, + "learning_rate": 1.201161571008001e-05, + "loss": 2.1186, + "step": 15496 + }, + { + "epoch": 2.904779756326148, + "grad_norm": 53948.51171875, + "learning_rate": 1.2006507007924083e-05, + "loss": 2.1813, + "step": 15497 + }, + { + "epoch": 2.904967197750703, + "grad_norm": 54728.47265625, + "learning_rate": 1.2001399244158196e-05, + "loss": 2.144, + "step": 15498 + }, + { + "epoch": 2.9051546391752576, + "grad_norm": 58941.875, + "learning_rate": 1.1996292418908478e-05, + "loss": 2.0765, + "step": 15499 + }, + { + "epoch": 2.9053420805998127, + "grad_norm": 56195.55078125, + "learning_rate": 1.1991186532301052e-05, + "loss": 2.1222, + "step": 15500 + }, + { + "epoch": 2.9053420805998127, + "eval_loss": 2.2569265365600586, + "eval_runtime": 125.8471, + "eval_samples_per_second": 40.12, + "eval_steps_per_second": 2.01, + "step": 15500 + }, + { + "epoch": 2.9055295220243673, + "grad_norm": 52727.0703125, + "learning_rate": 1.1986081584462056e-05, + "loss": 2.1512, + "step": 15501 + }, + { + "epoch": 2.9057169634489224, + "grad_norm": 61787.921875, + "learning_rate": 1.1980977575517539e-05, + "loss": 2.0605, + "step": 15502 + }, + { + "epoch": 2.905904404873477, + "grad_norm": 50387.8671875, + "learning_rate": 1.1975874505593599e-05, + "loss": 2.0781, + "step": 15503 + }, + { + "epoch": 2.9060918462980316, + "grad_norm": 56018.10546875, + "learning_rate": 1.1970772374816264e-05, + "loss": 2.0999, + "step": 15504 + }, + { + "epoch": 2.9062792877225867, + "grad_norm": 55133.9765625, + "learning_rate": 1.1965671183311528e-05, + "loss": 2.1854, + "step": 15505 + }, + { + "epoch": 2.9064667291471418, + "grad_norm": 56181.9921875, + "learning_rate": 1.19605709312054e-05, + "loss": 2.1578, + "step": 15506 + }, + { + "epoch": 2.9066541705716964, + "grad_norm": 54402.90234375, + "learning_rate": 1.195547161862387e-05, + "loss": 2.1408, + "step": 15507 + }, + { + "epoch": 2.906841611996251, + "grad_norm": 54532.046875, + "learning_rate": 1.1950373245692865e-05, + "loss": 2.1346, + "step": 15508 + }, + { + "epoch": 2.907029053420806, + "grad_norm": 56445.07421875, + "learning_rate": 1.1945275812538292e-05, + "loss": 2.137, + "step": 15509 + }, + { + "epoch": 2.9072164948453607, + "grad_norm": 52612.55078125, + "learning_rate": 1.1940179319286082e-05, + "loss": 2.1167, + "step": 15510 + }, + { + "epoch": 2.907403936269916, + "grad_norm": 54472.91015625, + "learning_rate": 1.19350837660621e-05, + "loss": 2.149, + "step": 15511 + }, + { + "epoch": 2.9075913776944704, + "grad_norm": 52663.37109375, + "learning_rate": 1.1929989152992178e-05, + "loss": 2.1306, + "step": 15512 + }, + { + "epoch": 2.9077788191190255, + "grad_norm": 51119.0078125, + "learning_rate": 1.192489548020217e-05, + "loss": 2.1339, + "step": 15513 + }, + { + "epoch": 2.90796626054358, + "grad_norm": 51619.17578125, + "learning_rate": 1.1919802747817904e-05, + "loss": 2.057, + "step": 15514 + }, + { + "epoch": 2.9081537019681347, + "grad_norm": 50982.44140625, + "learning_rate": 1.1914710955965113e-05, + "loss": 2.1242, + "step": 15515 + }, + { + "epoch": 2.90834114339269, + "grad_norm": 51236.609375, + "learning_rate": 1.1909620104769581e-05, + "loss": 2.1271, + "step": 15516 + }, + { + "epoch": 2.908528584817245, + "grad_norm": 54824.859375, + "learning_rate": 1.1904530194357061e-05, + "loss": 2.0872, + "step": 15517 + }, + { + "epoch": 2.9087160262417995, + "grad_norm": 52780.93359375, + "learning_rate": 1.1899441224853258e-05, + "loss": 2.1018, + "step": 15518 + }, + { + "epoch": 2.908903467666354, + "grad_norm": 55166.8984375, + "learning_rate": 1.1894353196383839e-05, + "loss": 2.1091, + "step": 15519 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 54470.3359375, + "learning_rate": 1.1889266109074493e-05, + "loss": 2.2688, + "step": 15520 + }, + { + "epoch": 2.909278350515464, + "grad_norm": 55483.953125, + "learning_rate": 1.1884179963050879e-05, + "loss": 2.0657, + "step": 15521 + }, + { + "epoch": 2.909465791940019, + "grad_norm": 51478.171875, + "learning_rate": 1.1879094758438603e-05, + "loss": 2.1058, + "step": 15522 + }, + { + "epoch": 2.9096532333645735, + "grad_norm": 58442.45703125, + "learning_rate": 1.187401049536324e-05, + "loss": 2.0467, + "step": 15523 + }, + { + "epoch": 2.9098406747891286, + "grad_norm": 58234.6328125, + "learning_rate": 1.1868927173950411e-05, + "loss": 2.0933, + "step": 15524 + }, + { + "epoch": 2.910028116213683, + "grad_norm": 55387.203125, + "learning_rate": 1.1863844794325635e-05, + "loss": 2.0479, + "step": 15525 + }, + { + "epoch": 2.910215557638238, + "grad_norm": 50601.22265625, + "learning_rate": 1.1858763356614438e-05, + "loss": 2.2007, + "step": 15526 + }, + { + "epoch": 2.910402999062793, + "grad_norm": 53421.5546875, + "learning_rate": 1.185368286094235e-05, + "loss": 2.1136, + "step": 15527 + }, + { + "epoch": 2.910590440487348, + "grad_norm": 57338.55859375, + "learning_rate": 1.1848603307434825e-05, + "loss": 2.1209, + "step": 15528 + }, + { + "epoch": 2.9107778819119026, + "grad_norm": 54195.5703125, + "learning_rate": 1.1843524696217345e-05, + "loss": 2.1354, + "step": 15529 + }, + { + "epoch": 2.910965323336457, + "grad_norm": 56935.3359375, + "learning_rate": 1.1838447027415323e-05, + "loss": 2.1645, + "step": 15530 + }, + { + "epoch": 2.9111527647610123, + "grad_norm": 52589.734375, + "learning_rate": 1.1833370301154195e-05, + "loss": 2.1437, + "step": 15531 + }, + { + "epoch": 2.911340206185567, + "grad_norm": 53219.20703125, + "learning_rate": 1.1828294517559318e-05, + "loss": 2.13, + "step": 15532 + }, + { + "epoch": 2.911527647610122, + "grad_norm": 60090.98046875, + "learning_rate": 1.1823219676756098e-05, + "loss": 2.0734, + "step": 15533 + }, + { + "epoch": 2.9117150890346766, + "grad_norm": 57673.1875, + "learning_rate": 1.1818145778869849e-05, + "loss": 2.0628, + "step": 15534 + }, + { + "epoch": 2.9119025304592316, + "grad_norm": 52785.32421875, + "learning_rate": 1.181307282402589e-05, + "loss": 2.1389, + "step": 15535 + }, + { + "epoch": 2.9120899718837863, + "grad_norm": 56131.453125, + "learning_rate": 1.1808000812349512e-05, + "loss": 2.124, + "step": 15536 + }, + { + "epoch": 2.912277413308341, + "grad_norm": 56653.25390625, + "learning_rate": 1.1802929743966018e-05, + "loss": 2.1514, + "step": 15537 + }, + { + "epoch": 2.912464854732896, + "grad_norm": 55683.33203125, + "learning_rate": 1.1797859619000635e-05, + "loss": 2.2092, + "step": 15538 + }, + { + "epoch": 2.912652296157451, + "grad_norm": 53765.48828125, + "learning_rate": 1.1792790437578572e-05, + "loss": 2.132, + "step": 15539 + }, + { + "epoch": 2.9128397375820057, + "grad_norm": 55444.5078125, + "learning_rate": 1.1787722199825068e-05, + "loss": 2.057, + "step": 15540 + }, + { + "epoch": 2.9130271790065603, + "grad_norm": 54940.18359375, + "learning_rate": 1.1782654905865276e-05, + "loss": 2.1496, + "step": 15541 + }, + { + "epoch": 2.9132146204311153, + "grad_norm": 55051.49609375, + "learning_rate": 1.1777588555824347e-05, + "loss": 2.1224, + "step": 15542 + }, + { + "epoch": 2.91340206185567, + "grad_norm": 58448.046875, + "learning_rate": 1.177252314982743e-05, + "loss": 2.1094, + "step": 15543 + }, + { + "epoch": 2.913589503280225, + "grad_norm": 51210.984375, + "learning_rate": 1.1767458687999656e-05, + "loss": 2.1194, + "step": 15544 + }, + { + "epoch": 2.9137769447047797, + "grad_norm": 55898.515625, + "learning_rate": 1.176239517046605e-05, + "loss": 2.1138, + "step": 15545 + }, + { + "epoch": 2.9139643861293347, + "grad_norm": 56833.40625, + "learning_rate": 1.1757332597351706e-05, + "loss": 2.1244, + "step": 15546 + }, + { + "epoch": 2.9141518275538894, + "grad_norm": 60460.2734375, + "learning_rate": 1.175227096878168e-05, + "loss": 2.1777, + "step": 15547 + }, + { + "epoch": 2.914339268978444, + "grad_norm": 59103.48046875, + "learning_rate": 1.1747210284880977e-05, + "loss": 2.1495, + "step": 15548 + }, + { + "epoch": 2.914526710402999, + "grad_norm": 54070.50390625, + "learning_rate": 1.1742150545774566e-05, + "loss": 2.1708, + "step": 15549 + }, + { + "epoch": 2.914714151827554, + "grad_norm": 48872.3359375, + "learning_rate": 1.1737091751587437e-05, + "loss": 2.1486, + "step": 15550 + }, + { + "epoch": 2.9149015932521087, + "grad_norm": 57019.58984375, + "learning_rate": 1.1732033902444561e-05, + "loss": 2.1033, + "step": 15551 + }, + { + "epoch": 2.9150890346766634, + "grad_norm": 53117.984375, + "learning_rate": 1.17269769984708e-05, + "loss": 2.1373, + "step": 15552 + }, + { + "epoch": 2.9152764761012184, + "grad_norm": 61639.53125, + "learning_rate": 1.172192103979109e-05, + "loss": 2.1678, + "step": 15553 + }, + { + "epoch": 2.915463917525773, + "grad_norm": 51893.16796875, + "learning_rate": 1.1716866026530315e-05, + "loss": 2.1019, + "step": 15554 + }, + { + "epoch": 2.915651358950328, + "grad_norm": 56072.58203125, + "learning_rate": 1.1711811958813317e-05, + "loss": 2.1503, + "step": 15555 + }, + { + "epoch": 2.9158388003748827, + "grad_norm": 57192.0859375, + "learning_rate": 1.1706758836764903e-05, + "loss": 2.0965, + "step": 15556 + }, + { + "epoch": 2.916026241799438, + "grad_norm": 58698.48046875, + "learning_rate": 1.1701706660509914e-05, + "loss": 2.1787, + "step": 15557 + }, + { + "epoch": 2.9162136832239924, + "grad_norm": 53422.2734375, + "learning_rate": 1.1696655430173104e-05, + "loss": 2.1541, + "step": 15558 + }, + { + "epoch": 2.916401124648547, + "grad_norm": 54405.359375, + "learning_rate": 1.1691605145879259e-05, + "loss": 2.1634, + "step": 15559 + }, + { + "epoch": 2.916588566073102, + "grad_norm": 57262.53515625, + "learning_rate": 1.168655580775308e-05, + "loss": 2.112, + "step": 15560 + }, + { + "epoch": 2.916776007497657, + "grad_norm": 53731.40234375, + "learning_rate": 1.1681507415919312e-05, + "loss": 2.1632, + "step": 15561 + }, + { + "epoch": 2.916963448922212, + "grad_norm": 56948.71875, + "learning_rate": 1.1676459970502623e-05, + "loss": 2.0231, + "step": 15562 + }, + { + "epoch": 2.9171508903467664, + "grad_norm": 57762.45703125, + "learning_rate": 1.1671413471627696e-05, + "loss": 2.0649, + "step": 15563 + }, + { + "epoch": 2.9173383317713215, + "grad_norm": 54826.59375, + "learning_rate": 1.1666367919419158e-05, + "loss": 2.136, + "step": 15564 + }, + { + "epoch": 2.917525773195876, + "grad_norm": 54772.109375, + "learning_rate": 1.1661323314001626e-05, + "loss": 2.1432, + "step": 15565 + }, + { + "epoch": 2.917713214620431, + "grad_norm": 54478.4921875, + "learning_rate": 1.1656279655499707e-05, + "loss": 2.0805, + "step": 15566 + }, + { + "epoch": 2.917900656044986, + "grad_norm": 52073.3671875, + "learning_rate": 1.1651236944037974e-05, + "loss": 2.1204, + "step": 15567 + }, + { + "epoch": 2.918088097469541, + "grad_norm": 55312.36328125, + "learning_rate": 1.1646195179740954e-05, + "loss": 2.1082, + "step": 15568 + }, + { + "epoch": 2.9182755388940955, + "grad_norm": 54040.5078125, + "learning_rate": 1.1641154362733181e-05, + "loss": 2.1247, + "step": 15569 + }, + { + "epoch": 2.91846298031865, + "grad_norm": 53271.4296875, + "learning_rate": 1.1636114493139184e-05, + "loss": 2.1486, + "step": 15570 + }, + { + "epoch": 2.918650421743205, + "grad_norm": 48886.33984375, + "learning_rate": 1.1631075571083416e-05, + "loss": 2.0969, + "step": 15571 + }, + { + "epoch": 2.9188378631677603, + "grad_norm": 62447.9609375, + "learning_rate": 1.162603759669032e-05, + "loss": 2.1308, + "step": 15572 + }, + { + "epoch": 2.919025304592315, + "grad_norm": 52236.9375, + "learning_rate": 1.1621000570084345e-05, + "loss": 2.1353, + "step": 15573 + }, + { + "epoch": 2.9192127460168695, + "grad_norm": 53943.4296875, + "learning_rate": 1.1615964491389918e-05, + "loss": 2.1968, + "step": 15574 + }, + { + "epoch": 2.9194001874414246, + "grad_norm": 51861.7890625, + "learning_rate": 1.1610929360731381e-05, + "loss": 2.1482, + "step": 15575 + }, + { + "epoch": 2.9195876288659792, + "grad_norm": 56007.43359375, + "learning_rate": 1.1605895178233116e-05, + "loss": 2.0944, + "step": 15576 + }, + { + "epoch": 2.9197750702905343, + "grad_norm": 55116.53125, + "learning_rate": 1.1600861944019476e-05, + "loss": 2.0658, + "step": 15577 + }, + { + "epoch": 2.919962511715089, + "grad_norm": 55083.9296875, + "learning_rate": 1.1595829658214762e-05, + "loss": 2.2012, + "step": 15578 + }, + { + "epoch": 2.920149953139644, + "grad_norm": 50921.73046875, + "learning_rate": 1.1590798320943247e-05, + "loss": 2.0694, + "step": 15579 + }, + { + "epoch": 2.9203373945641986, + "grad_norm": 60184.20703125, + "learning_rate": 1.1585767932329228e-05, + "loss": 2.1648, + "step": 15580 + }, + { + "epoch": 2.9205248359887537, + "grad_norm": 54258.8671875, + "learning_rate": 1.158073849249694e-05, + "loss": 2.0597, + "step": 15581 + }, + { + "epoch": 2.9207122774133083, + "grad_norm": 54986.93359375, + "learning_rate": 1.1575710001570583e-05, + "loss": 2.0949, + "step": 15582 + }, + { + "epoch": 2.9208997188378634, + "grad_norm": 50247.13671875, + "learning_rate": 1.157068245967437e-05, + "loss": 2.1619, + "step": 15583 + }, + { + "epoch": 2.921087160262418, + "grad_norm": 53262.62890625, + "learning_rate": 1.1565655866932496e-05, + "loss": 2.0857, + "step": 15584 + }, + { + "epoch": 2.9212746016869726, + "grad_norm": 57380.62109375, + "learning_rate": 1.1560630223469082e-05, + "loss": 2.1268, + "step": 15585 + }, + { + "epoch": 2.9214620431115277, + "grad_norm": 53207.63671875, + "learning_rate": 1.1555605529408259e-05, + "loss": 2.1039, + "step": 15586 + }, + { + "epoch": 2.9216494845360823, + "grad_norm": 52658.44921875, + "learning_rate": 1.1550581784874143e-05, + "loss": 2.0859, + "step": 15587 + }, + { + "epoch": 2.9218369259606374, + "grad_norm": 57540.82421875, + "learning_rate": 1.1545558989990796e-05, + "loss": 2.0991, + "step": 15588 + }, + { + "epoch": 2.922024367385192, + "grad_norm": 54579.7421875, + "learning_rate": 1.1540537144882296e-05, + "loss": 2.0745, + "step": 15589 + }, + { + "epoch": 2.922211808809747, + "grad_norm": 55967.47265625, + "learning_rate": 1.1535516249672657e-05, + "loss": 2.1679, + "step": 15590 + }, + { + "epoch": 2.9223992502343017, + "grad_norm": 54535.73828125, + "learning_rate": 1.1530496304485905e-05, + "loss": 2.0901, + "step": 15591 + }, + { + "epoch": 2.9225866916588568, + "grad_norm": 52065.703125, + "learning_rate": 1.152547730944602e-05, + "loss": 2.0639, + "step": 15592 + }, + { + "epoch": 2.9227741330834114, + "grad_norm": 59107.6484375, + "learning_rate": 1.1520459264676948e-05, + "loss": 2.1367, + "step": 15593 + }, + { + "epoch": 2.9229615745079665, + "grad_norm": 57336.03125, + "learning_rate": 1.1515442170302659e-05, + "loss": 2.1332, + "step": 15594 + }, + { + "epoch": 2.923149015932521, + "grad_norm": 55887.59765625, + "learning_rate": 1.1510426026447035e-05, + "loss": 2.0937, + "step": 15595 + }, + { + "epoch": 2.9233364573570757, + "grad_norm": 56573.421875, + "learning_rate": 1.1505410833234004e-05, + "loss": 2.127, + "step": 15596 + }, + { + "epoch": 2.9235238987816308, + "grad_norm": 59125.15234375, + "learning_rate": 1.150039659078741e-05, + "loss": 2.1169, + "step": 15597 + }, + { + "epoch": 2.923711340206186, + "grad_norm": 56675.62890625, + "learning_rate": 1.1495383299231094e-05, + "loss": 2.1486, + "step": 15598 + }, + { + "epoch": 2.9238987816307405, + "grad_norm": 58046.5, + "learning_rate": 1.149037095868889e-05, + "loss": 2.1216, + "step": 15599 + }, + { + "epoch": 2.924086223055295, + "grad_norm": 58211.18359375, + "learning_rate": 1.148535956928461e-05, + "loss": 2.09, + "step": 15600 + }, + { + "epoch": 2.92427366447985, + "grad_norm": 52111.42578125, + "learning_rate": 1.148034913114201e-05, + "loss": 2.1269, + "step": 15601 + }, + { + "epoch": 2.924461105904405, + "grad_norm": 57171.0546875, + "learning_rate": 1.1475339644384831e-05, + "loss": 2.1725, + "step": 15602 + }, + { + "epoch": 2.92464854732896, + "grad_norm": 56066.48828125, + "learning_rate": 1.1470331109136816e-05, + "loss": 2.1264, + "step": 15603 + }, + { + "epoch": 2.9248359887535145, + "grad_norm": 55152.52734375, + "learning_rate": 1.1465323525521698e-05, + "loss": 2.1079, + "step": 15604 + }, + { + "epoch": 2.9250234301780695, + "grad_norm": 53609.9765625, + "learning_rate": 1.1460316893663098e-05, + "loss": 2.1463, + "step": 15605 + }, + { + "epoch": 2.925210871602624, + "grad_norm": 50487.71484375, + "learning_rate": 1.1455311213684705e-05, + "loss": 2.11, + "step": 15606 + }, + { + "epoch": 2.925398313027179, + "grad_norm": 53868.7734375, + "learning_rate": 1.1450306485710177e-05, + "loss": 2.1867, + "step": 15607 + }, + { + "epoch": 2.925585754451734, + "grad_norm": 50024.62109375, + "learning_rate": 1.144530270986307e-05, + "loss": 2.1774, + "step": 15608 + }, + { + "epoch": 2.925773195876289, + "grad_norm": 53246.05859375, + "learning_rate": 1.1440299886266997e-05, + "loss": 2.1894, + "step": 15609 + }, + { + "epoch": 2.9259606373008435, + "grad_norm": 51920.23828125, + "learning_rate": 1.1435298015045543e-05, + "loss": 2.1522, + "step": 15610 + }, + { + "epoch": 2.926148078725398, + "grad_norm": 55767.62890625, + "learning_rate": 1.1430297096322223e-05, + "loss": 2.2093, + "step": 15611 + }, + { + "epoch": 2.9263355201499532, + "grad_norm": 63219.7890625, + "learning_rate": 1.1425297130220552e-05, + "loss": 2.1632, + "step": 15612 + }, + { + "epoch": 2.926522961574508, + "grad_norm": 55041.39453125, + "learning_rate": 1.142029811686402e-05, + "loss": 2.1149, + "step": 15613 + }, + { + "epoch": 2.926710402999063, + "grad_norm": 54761.50390625, + "learning_rate": 1.1415300056376127e-05, + "loss": 2.0836, + "step": 15614 + }, + { + "epoch": 2.9268978444236176, + "grad_norm": 51010.734375, + "learning_rate": 1.141030294888029e-05, + "loss": 2.0916, + "step": 15615 + }, + { + "epoch": 2.9270852858481726, + "grad_norm": 53358.28125, + "learning_rate": 1.1405306794499925e-05, + "loss": 2.0944, + "step": 15616 + }, + { + "epoch": 2.9272727272727272, + "grad_norm": 54187.4375, + "learning_rate": 1.1400311593358465e-05, + "loss": 2.0994, + "step": 15617 + }, + { + "epoch": 2.927460168697282, + "grad_norm": 53758.44140625, + "learning_rate": 1.1395317345579243e-05, + "loss": 2.0668, + "step": 15618 + }, + { + "epoch": 2.927647610121837, + "grad_norm": 52018.94140625, + "learning_rate": 1.1390324051285645e-05, + "loss": 2.0925, + "step": 15619 + }, + { + "epoch": 2.927835051546392, + "grad_norm": 59939.38671875, + "learning_rate": 1.1385331710600982e-05, + "loss": 2.0696, + "step": 15620 + }, + { + "epoch": 2.9280224929709466, + "grad_norm": 59333.4375, + "learning_rate": 1.1380340323648553e-05, + "loss": 2.0835, + "step": 15621 + }, + { + "epoch": 2.9282099343955013, + "grad_norm": 52335.99609375, + "learning_rate": 1.1375349890551657e-05, + "loss": 2.0809, + "step": 15622 + }, + { + "epoch": 2.9283973758200563, + "grad_norm": 52139.89453125, + "learning_rate": 1.1370360411433528e-05, + "loss": 2.1737, + "step": 15623 + }, + { + "epoch": 2.928584817244611, + "grad_norm": 58465.5859375, + "learning_rate": 1.136537188641743e-05, + "loss": 2.1276, + "step": 15624 + }, + { + "epoch": 2.928772258669166, + "grad_norm": 53056.65625, + "learning_rate": 1.1360384315626538e-05, + "loss": 2.1264, + "step": 15625 + }, + { + "epoch": 2.9289597000937206, + "grad_norm": 61525.0, + "learning_rate": 1.1355397699184073e-05, + "loss": 2.1513, + "step": 15626 + }, + { + "epoch": 2.9291471415182757, + "grad_norm": 50249.41796875, + "learning_rate": 1.1350412037213176e-05, + "loss": 2.0921, + "step": 15627 + }, + { + "epoch": 2.9293345829428303, + "grad_norm": 52226.0703125, + "learning_rate": 1.1345427329836983e-05, + "loss": 2.1772, + "step": 15628 + }, + { + "epoch": 2.929522024367385, + "grad_norm": 56021.69921875, + "learning_rate": 1.1340443577178616e-05, + "loss": 2.1931, + "step": 15629 + }, + { + "epoch": 2.92970946579194, + "grad_norm": 50236.421875, + "learning_rate": 1.133546077936119e-05, + "loss": 2.0623, + "step": 15630 + }, + { + "epoch": 2.929896907216495, + "grad_norm": 53650.95703125, + "learning_rate": 1.1330478936507744e-05, + "loss": 2.087, + "step": 15631 + }, + { + "epoch": 2.9300843486410497, + "grad_norm": 54424.4375, + "learning_rate": 1.1325498048741323e-05, + "loss": 2.1327, + "step": 15632 + }, + { + "epoch": 2.9302717900656043, + "grad_norm": 57710.0078125, + "learning_rate": 1.1320518116184975e-05, + "loss": 2.1408, + "step": 15633 + }, + { + "epoch": 2.9304592314901594, + "grad_norm": 53671.421875, + "learning_rate": 1.1315539138961678e-05, + "loss": 2.1002, + "step": 15634 + }, + { + "epoch": 2.930646672914714, + "grad_norm": 51844.6640625, + "learning_rate": 1.131056111719439e-05, + "loss": 2.163, + "step": 15635 + }, + { + "epoch": 2.930834114339269, + "grad_norm": 53617.10546875, + "learning_rate": 1.130558405100609e-05, + "loss": 2.0845, + "step": 15636 + }, + { + "epoch": 2.9310215557638237, + "grad_norm": 55117.3046875, + "learning_rate": 1.130060794051972e-05, + "loss": 2.1448, + "step": 15637 + }, + { + "epoch": 2.931208997188379, + "grad_norm": 55018.8671875, + "learning_rate": 1.1295632785858129e-05, + "loss": 2.1082, + "step": 15638 + }, + { + "epoch": 2.9313964386129334, + "grad_norm": 52082.66015625, + "learning_rate": 1.1290658587144232e-05, + "loss": 2.0846, + "step": 15639 + }, + { + "epoch": 2.931583880037488, + "grad_norm": 55220.61328125, + "learning_rate": 1.128568534450089e-05, + "loss": 2.11, + "step": 15640 + }, + { + "epoch": 2.931771321462043, + "grad_norm": 54000.23828125, + "learning_rate": 1.128071305805093e-05, + "loss": 2.1493, + "step": 15641 + }, + { + "epoch": 2.931958762886598, + "grad_norm": 57270.1484375, + "learning_rate": 1.1275741727917138e-05, + "loss": 2.1293, + "step": 15642 + }, + { + "epoch": 2.932146204311153, + "grad_norm": 56304.74609375, + "learning_rate": 1.127077135422232e-05, + "loss": 2.1208, + "step": 15643 + }, + { + "epoch": 2.9323336457357074, + "grad_norm": 55865.1484375, + "learning_rate": 1.1265801937089254e-05, + "loss": 2.0735, + "step": 15644 + }, + { + "epoch": 2.9325210871602625, + "grad_norm": 49917.09375, + "learning_rate": 1.1260833476640658e-05, + "loss": 2.153, + "step": 15645 + }, + { + "epoch": 2.932708528584817, + "grad_norm": 53417.4140625, + "learning_rate": 1.1255865972999236e-05, + "loss": 2.15, + "step": 15646 + }, + { + "epoch": 2.932895970009372, + "grad_norm": 52556.20703125, + "learning_rate": 1.1250899426287703e-05, + "loss": 2.1642, + "step": 15647 + }, + { + "epoch": 2.933083411433927, + "grad_norm": 54192.70703125, + "learning_rate": 1.124593383662872e-05, + "loss": 2.1633, + "step": 15648 + }, + { + "epoch": 2.933270852858482, + "grad_norm": 53545.09375, + "learning_rate": 1.1240969204144908e-05, + "loss": 2.083, + "step": 15649 + }, + { + "epoch": 2.9334582942830365, + "grad_norm": 56231.59375, + "learning_rate": 1.123600552895892e-05, + "loss": 2.2281, + "step": 15650 + }, + { + "epoch": 2.933645735707591, + "grad_norm": 61346.6015625, + "learning_rate": 1.1231042811193321e-05, + "loss": 2.0113, + "step": 15651 + }, + { + "epoch": 2.933833177132146, + "grad_norm": 55366.390625, + "learning_rate": 1.1226081050970716e-05, + "loss": 2.1633, + "step": 15652 + }, + { + "epoch": 2.9340206185567013, + "grad_norm": 55737.56640625, + "learning_rate": 1.1221120248413625e-05, + "loss": 2.0799, + "step": 15653 + }, + { + "epoch": 2.934208059981256, + "grad_norm": 56297.29296875, + "learning_rate": 1.12161604036446e-05, + "loss": 2.0787, + "step": 15654 + }, + { + "epoch": 2.9343955014058105, + "grad_norm": 56554.890625, + "learning_rate": 1.1211201516786112e-05, + "loss": 2.1279, + "step": 15655 + }, + { + "epoch": 2.9345829428303656, + "grad_norm": 52300.43359375, + "learning_rate": 1.1206243587960674e-05, + "loss": 2.1251, + "step": 15656 + }, + { + "epoch": 2.93477038425492, + "grad_norm": 55938.38671875, + "learning_rate": 1.1201286617290724e-05, + "loss": 2.1418, + "step": 15657 + }, + { + "epoch": 2.9349578256794753, + "grad_norm": 56597.16796875, + "learning_rate": 1.1196330604898674e-05, + "loss": 2.1592, + "step": 15658 + }, + { + "epoch": 2.93514526710403, + "grad_norm": 53374.44140625, + "learning_rate": 1.1191375550906947e-05, + "loss": 2.0005, + "step": 15659 + }, + { + "epoch": 2.935332708528585, + "grad_norm": 50018.0390625, + "learning_rate": 1.1186421455437956e-05, + "loss": 2.103, + "step": 15660 + }, + { + "epoch": 2.9355201499531396, + "grad_norm": 56665.52734375, + "learning_rate": 1.1181468318614008e-05, + "loss": 2.1516, + "step": 15661 + }, + { + "epoch": 2.935707591377694, + "grad_norm": 54931.9296875, + "learning_rate": 1.1176516140557463e-05, + "loss": 2.0611, + "step": 15662 + }, + { + "epoch": 2.9358950328022493, + "grad_norm": 53878.98828125, + "learning_rate": 1.1171564921390647e-05, + "loss": 2.057, + "step": 15663 + }, + { + "epoch": 2.9360824742268044, + "grad_norm": 51017.5703125, + "learning_rate": 1.1166614661235836e-05, + "loss": 2.1259, + "step": 15664 + }, + { + "epoch": 2.936269915651359, + "grad_norm": 51392.18359375, + "learning_rate": 1.1161665360215278e-05, + "loss": 2.046, + "step": 15665 + }, + { + "epoch": 2.9364573570759136, + "grad_norm": 51804.1875, + "learning_rate": 1.1156717018451235e-05, + "loss": 2.112, + "step": 15666 + }, + { + "epoch": 2.9366447985004687, + "grad_norm": 54527.484375, + "learning_rate": 1.1151769636065945e-05, + "loss": 2.0727, + "step": 15667 + }, + { + "epoch": 2.9368322399250233, + "grad_norm": 51968.03125, + "learning_rate": 1.1146823213181551e-05, + "loss": 2.1226, + "step": 15668 + }, + { + "epoch": 2.9370196813495784, + "grad_norm": 55751.640625, + "learning_rate": 1.1141877749920248e-05, + "loss": 2.1274, + "step": 15669 + }, + { + "epoch": 2.937207122774133, + "grad_norm": 54448.109375, + "learning_rate": 1.1136933246404202e-05, + "loss": 2.1124, + "step": 15670 + }, + { + "epoch": 2.937394564198688, + "grad_norm": 52534.171875, + "learning_rate": 1.1131989702755512e-05, + "loss": 2.0787, + "step": 15671 + }, + { + "epoch": 2.9375820056232427, + "grad_norm": 52511.9921875, + "learning_rate": 1.1127047119096274e-05, + "loss": 2.1155, + "step": 15672 + }, + { + "epoch": 2.9377694470477973, + "grad_norm": 51169.95703125, + "learning_rate": 1.1122105495548584e-05, + "loss": 2.1538, + "step": 15673 + }, + { + "epoch": 2.9379568884723524, + "grad_norm": 53954.61328125, + "learning_rate": 1.1117164832234473e-05, + "loss": 2.1462, + "step": 15674 + }, + { + "epoch": 2.9381443298969074, + "grad_norm": 51435.765625, + "learning_rate": 1.1112225129275988e-05, + "loss": 2.1201, + "step": 15675 + }, + { + "epoch": 2.938331771321462, + "grad_norm": 57474.1015625, + "learning_rate": 1.1107286386795113e-05, + "loss": 2.0791, + "step": 15676 + }, + { + "epoch": 2.9385192127460167, + "grad_norm": 55221.8046875, + "learning_rate": 1.1102348604913848e-05, + "loss": 2.1831, + "step": 15677 + }, + { + "epoch": 2.9387066541705718, + "grad_norm": 53674.06640625, + "learning_rate": 1.1097411783754146e-05, + "loss": 2.0643, + "step": 15678 + }, + { + "epoch": 2.9388940955951264, + "grad_norm": 57198.484375, + "learning_rate": 1.1092475923437917e-05, + "loss": 2.0761, + "step": 15679 + }, + { + "epoch": 2.9390815370196814, + "grad_norm": 60945.640625, + "learning_rate": 1.1087541024087105e-05, + "loss": 2.0843, + "step": 15680 + }, + { + "epoch": 2.939268978444236, + "grad_norm": 56220.4375, + "learning_rate": 1.1082607085823566e-05, + "loss": 2.0969, + "step": 15681 + }, + { + "epoch": 2.939456419868791, + "grad_norm": 50801.7265625, + "learning_rate": 1.1077674108769188e-05, + "loss": 2.1424, + "step": 15682 + }, + { + "epoch": 2.9396438612933458, + "grad_norm": 52234.00390625, + "learning_rate": 1.107274209304578e-05, + "loss": 2.1374, + "step": 15683 + }, + { + "epoch": 2.9398313027179004, + "grad_norm": 56845.27734375, + "learning_rate": 1.1067811038775189e-05, + "loss": 2.1325, + "step": 15684 + }, + { + "epoch": 2.9400187441424555, + "grad_norm": 56949.40625, + "learning_rate": 1.1062880946079174e-05, + "loss": 2.1332, + "step": 15685 + }, + { + "epoch": 2.9402061855670105, + "grad_norm": 53056.49609375, + "learning_rate": 1.1057951815079526e-05, + "loss": 2.0618, + "step": 15686 + }, + { + "epoch": 2.940393626991565, + "grad_norm": 55251.05078125, + "learning_rate": 1.1053023645897986e-05, + "loss": 2.0572, + "step": 15687 + }, + { + "epoch": 2.9405810684161198, + "grad_norm": 58033.13671875, + "learning_rate": 1.1048096438656247e-05, + "loss": 2.0505, + "step": 15688 + }, + { + "epoch": 2.940768509840675, + "grad_norm": 52949.12890625, + "learning_rate": 1.1043170193476043e-05, + "loss": 2.066, + "step": 15689 + }, + { + "epoch": 2.9409559512652295, + "grad_norm": 60261.703125, + "learning_rate": 1.1038244910479023e-05, + "loss": 2.0932, + "step": 15690 + }, + { + "epoch": 2.9411433926897845, + "grad_norm": 51111.3671875, + "learning_rate": 1.1033320589786828e-05, + "loss": 2.1029, + "step": 15691 + }, + { + "epoch": 2.941330834114339, + "grad_norm": 50388.57421875, + "learning_rate": 1.1028397231521092e-05, + "loss": 2.1289, + "step": 15692 + }, + { + "epoch": 2.9415182755388942, + "grad_norm": 53046.328125, + "learning_rate": 1.1023474835803433e-05, + "loss": 2.094, + "step": 15693 + }, + { + "epoch": 2.941705716963449, + "grad_norm": 56394.41015625, + "learning_rate": 1.1018553402755411e-05, + "loss": 2.0545, + "step": 15694 + }, + { + "epoch": 2.941893158388004, + "grad_norm": 54822.83984375, + "learning_rate": 1.1013632932498563e-05, + "loss": 2.1178, + "step": 15695 + }, + { + "epoch": 2.9420805998125585, + "grad_norm": 52220.66015625, + "learning_rate": 1.1008713425154438e-05, + "loss": 2.0389, + "step": 15696 + }, + { + "epoch": 2.9422680412371136, + "grad_norm": 55682.05859375, + "learning_rate": 1.1003794880844564e-05, + "loss": 2.1133, + "step": 15697 + }, + { + "epoch": 2.9424554826616682, + "grad_norm": 54028.578125, + "learning_rate": 1.0998877299690368e-05, + "loss": 2.1389, + "step": 15698 + }, + { + "epoch": 2.942642924086223, + "grad_norm": 53256.7421875, + "learning_rate": 1.0993960681813342e-05, + "loss": 2.0803, + "step": 15699 + }, + { + "epoch": 2.942830365510778, + "grad_norm": 55094.84375, + "learning_rate": 1.0989045027334926e-05, + "loss": 2.0723, + "step": 15700 + }, + { + "epoch": 2.9430178069353325, + "grad_norm": 52944.53125, + "learning_rate": 1.0984130336376518e-05, + "loss": 2.1762, + "step": 15701 + }, + { + "epoch": 2.9432052483598876, + "grad_norm": 55615.66015625, + "learning_rate": 1.0979216609059495e-05, + "loss": 2.126, + "step": 15702 + }, + { + "epoch": 2.9433926897844422, + "grad_norm": 52740.88671875, + "learning_rate": 1.0974303845505241e-05, + "loss": 2.0641, + "step": 15703 + }, + { + "epoch": 2.9435801312089973, + "grad_norm": 55985.22265625, + "learning_rate": 1.0969392045835086e-05, + "loss": 2.1639, + "step": 15704 + }, + { + "epoch": 2.943767572633552, + "grad_norm": 50047.59765625, + "learning_rate": 1.0964481210170329e-05, + "loss": 2.1559, + "step": 15705 + }, + { + "epoch": 2.943955014058107, + "grad_norm": 52699.76171875, + "learning_rate": 1.095957133863228e-05, + "loss": 2.1566, + "step": 15706 + }, + { + "epoch": 2.9441424554826616, + "grad_norm": 57638.05078125, + "learning_rate": 1.095466243134221e-05, + "loss": 2.1177, + "step": 15707 + }, + { + "epoch": 2.9443298969072167, + "grad_norm": 55920.56640625, + "learning_rate": 1.094975448842136e-05, + "loss": 2.1322, + "step": 15708 + }, + { + "epoch": 2.9445173383317713, + "grad_norm": 58670.96875, + "learning_rate": 1.094484750999093e-05, + "loss": 2.0961, + "step": 15709 + }, + { + "epoch": 2.944704779756326, + "grad_norm": 50692.07421875, + "learning_rate": 1.093994149617214e-05, + "loss": 2.1378, + "step": 15710 + }, + { + "epoch": 2.944892221180881, + "grad_norm": 58356.05078125, + "learning_rate": 1.0935036447086144e-05, + "loss": 2.1842, + "step": 15711 + }, + { + "epoch": 2.945079662605436, + "grad_norm": 60177.6875, + "learning_rate": 1.0930132362854112e-05, + "loss": 2.1348, + "step": 15712 + }, + { + "epoch": 2.9452671040299907, + "grad_norm": 52569.44921875, + "learning_rate": 1.0925229243597158e-05, + "loss": 2.0952, + "step": 15713 + }, + { + "epoch": 2.9454545454545453, + "grad_norm": 62277.2109375, + "learning_rate": 1.092032708943636e-05, + "loss": 2.0542, + "step": 15714 + }, + { + "epoch": 2.9456419868791004, + "grad_norm": 55464.3671875, + "learning_rate": 1.091542590049282e-05, + "loss": 2.0992, + "step": 15715 + }, + { + "epoch": 2.945829428303655, + "grad_norm": 57911.85546875, + "learning_rate": 1.0910525676887601e-05, + "loss": 2.0513, + "step": 15716 + }, + { + "epoch": 2.94601686972821, + "grad_norm": 57805.9453125, + "learning_rate": 1.0905626418741716e-05, + "loss": 2.0947, + "step": 15717 + }, + { + "epoch": 2.9462043111527647, + "grad_norm": 60083.55078125, + "learning_rate": 1.0900728126176158e-05, + "loss": 2.1314, + "step": 15718 + }, + { + "epoch": 2.94639175257732, + "grad_norm": 53865.8828125, + "learning_rate": 1.089583079931194e-05, + "loss": 2.16, + "step": 15719 + }, + { + "epoch": 2.9465791940018744, + "grad_norm": 51070.8359375, + "learning_rate": 1.0890934438269996e-05, + "loss": 2.1245, + "step": 15720 + }, + { + "epoch": 2.946766635426429, + "grad_norm": 57393.98828125, + "learning_rate": 1.0886039043171254e-05, + "loss": 2.0751, + "step": 15721 + }, + { + "epoch": 2.946954076850984, + "grad_norm": 50481.8828125, + "learning_rate": 1.0881144614136634e-05, + "loss": 2.1373, + "step": 15722 + }, + { + "epoch": 2.947141518275539, + "grad_norm": 49694.40234375, + "learning_rate": 1.0876251151287042e-05, + "loss": 2.1094, + "step": 15723 + }, + { + "epoch": 2.947328959700094, + "grad_norm": 53809.34375, + "learning_rate": 1.087135865474332e-05, + "loss": 2.1915, + "step": 15724 + }, + { + "epoch": 2.9475164011246484, + "grad_norm": 55473.1875, + "learning_rate": 1.086646712462629e-05, + "loss": 2.1353, + "step": 15725 + }, + { + "epoch": 2.9477038425492035, + "grad_norm": 56166.2578125, + "learning_rate": 1.0861576561056802e-05, + "loss": 2.1471, + "step": 15726 + }, + { + "epoch": 2.947891283973758, + "grad_norm": 55177.43359375, + "learning_rate": 1.0856686964155632e-05, + "loss": 2.1507, + "step": 15727 + }, + { + "epoch": 2.948078725398313, + "grad_norm": 55432.78515625, + "learning_rate": 1.0851798334043527e-05, + "loss": 2.194, + "step": 15728 + }, + { + "epoch": 2.948266166822868, + "grad_norm": 53920.7734375, + "learning_rate": 1.0846910670841248e-05, + "loss": 2.1418, + "step": 15729 + }, + { + "epoch": 2.948453608247423, + "grad_norm": 54851.22265625, + "learning_rate": 1.0842023974669523e-05, + "loss": 2.0861, + "step": 15730 + }, + { + "epoch": 2.9486410496719775, + "grad_norm": 53083.59765625, + "learning_rate": 1.0837138245649042e-05, + "loss": 2.1012, + "step": 15731 + }, + { + "epoch": 2.948828491096532, + "grad_norm": 54799.796875, + "learning_rate": 1.083225348390045e-05, + "loss": 2.1953, + "step": 15732 + }, + { + "epoch": 2.949015932521087, + "grad_norm": 56528.64453125, + "learning_rate": 1.0827369689544436e-05, + "loss": 2.1283, + "step": 15733 + }, + { + "epoch": 2.9492033739456422, + "grad_norm": 59277.5078125, + "learning_rate": 1.0822486862701603e-05, + "loss": 2.0708, + "step": 15734 + }, + { + "epoch": 2.949390815370197, + "grad_norm": 52490.3046875, + "learning_rate": 1.0817605003492532e-05, + "loss": 2.0996, + "step": 15735 + }, + { + "epoch": 2.9495782567947515, + "grad_norm": 58980.55859375, + "learning_rate": 1.081272411203782e-05, + "loss": 2.0292, + "step": 15736 + }, + { + "epoch": 2.9497656982193066, + "grad_norm": 53672.98828125, + "learning_rate": 1.0807844188458028e-05, + "loss": 2.1396, + "step": 15737 + }, + { + "epoch": 2.949953139643861, + "grad_norm": 51561.26953125, + "learning_rate": 1.080296523287367e-05, + "loss": 2.1505, + "step": 15738 + }, + { + "epoch": 2.9501405810684163, + "grad_norm": 53916.86328125, + "learning_rate": 1.0798087245405237e-05, + "loss": 2.07, + "step": 15739 + }, + { + "epoch": 2.950328022492971, + "grad_norm": 60180.98046875, + "learning_rate": 1.0793210226173234e-05, + "loss": 2.1002, + "step": 15740 + }, + { + "epoch": 2.950515463917526, + "grad_norm": 59789.19140625, + "learning_rate": 1.0788334175298098e-05, + "loss": 2.1088, + "step": 15741 + }, + { + "epoch": 2.9507029053420806, + "grad_norm": 54416.94140625, + "learning_rate": 1.0783459092900277e-05, + "loss": 2.194, + "step": 15742 + }, + { + "epoch": 2.950890346766635, + "grad_norm": 52578.796875, + "learning_rate": 1.0778584979100176e-05, + "loss": 2.0706, + "step": 15743 + }, + { + "epoch": 2.9510777881911903, + "grad_norm": 56869.13671875, + "learning_rate": 1.0773711834018162e-05, + "loss": 2.1609, + "step": 15744 + }, + { + "epoch": 2.9512652296157453, + "grad_norm": 56294.484375, + "learning_rate": 1.076883965777462e-05, + "loss": 2.0779, + "step": 15745 + }, + { + "epoch": 2.9514526710403, + "grad_norm": 54519.953125, + "learning_rate": 1.0763968450489858e-05, + "loss": 2.1417, + "step": 15746 + }, + { + "epoch": 2.9516401124648546, + "grad_norm": 53651.5078125, + "learning_rate": 1.0759098212284224e-05, + "loss": 2.1075, + "step": 15747 + }, + { + "epoch": 2.9518275538894096, + "grad_norm": 51228.19140625, + "learning_rate": 1.0754228943277972e-05, + "loss": 2.1232, + "step": 15748 + }, + { + "epoch": 2.9520149953139643, + "grad_norm": 53469.2265625, + "learning_rate": 1.0749360643591395e-05, + "loss": 2.1344, + "step": 15749 + }, + { + "epoch": 2.9522024367385193, + "grad_norm": 55126.7734375, + "learning_rate": 1.0744493313344722e-05, + "loss": 2.0736, + "step": 15750 + }, + { + "epoch": 2.952389878163074, + "grad_norm": 57662.02734375, + "learning_rate": 1.073962695265815e-05, + "loss": 2.0839, + "step": 15751 + }, + { + "epoch": 2.952577319587629, + "grad_norm": 59910.86328125, + "learning_rate": 1.0734761561651895e-05, + "loss": 2.2568, + "step": 15752 + }, + { + "epoch": 2.9527647610121837, + "grad_norm": 49910.06640625, + "learning_rate": 1.0729897140446149e-05, + "loss": 2.1648, + "step": 15753 + }, + { + "epoch": 2.9529522024367383, + "grad_norm": 52042.6796875, + "learning_rate": 1.0725033689161002e-05, + "loss": 2.178, + "step": 15754 + }, + { + "epoch": 2.9531396438612934, + "grad_norm": 51014.75, + "learning_rate": 1.0720171207916602e-05, + "loss": 2.0936, + "step": 15755 + }, + { + "epoch": 2.9533270852858484, + "grad_norm": 51522.9921875, + "learning_rate": 1.0715309696833059e-05, + "loss": 2.1977, + "step": 15756 + }, + { + "epoch": 2.953514526710403, + "grad_norm": 58231.87890625, + "learning_rate": 1.0710449156030434e-05, + "loss": 2.1043, + "step": 15757 + }, + { + "epoch": 2.9537019681349577, + "grad_norm": 58150.54296875, + "learning_rate": 1.0705589585628755e-05, + "loss": 2.0958, + "step": 15758 + }, + { + "epoch": 2.9538894095595127, + "grad_norm": 52343.61328125, + "learning_rate": 1.0700730985748075e-05, + "loss": 2.103, + "step": 15759 + }, + { + "epoch": 2.9540768509840674, + "grad_norm": 61879.21875, + "learning_rate": 1.0695873356508412e-05, + "loss": 2.0395, + "step": 15760 + }, + { + "epoch": 2.9542642924086224, + "grad_norm": 59993.58203125, + "learning_rate": 1.0691016698029693e-05, + "loss": 2.1587, + "step": 15761 + }, + { + "epoch": 2.954451733833177, + "grad_norm": 51700.7265625, + "learning_rate": 1.0686161010431895e-05, + "loss": 2.1107, + "step": 15762 + }, + { + "epoch": 2.954639175257732, + "grad_norm": 55501.40625, + "learning_rate": 1.0681306293834964e-05, + "loss": 2.0245, + "step": 15763 + }, + { + "epoch": 2.9548266166822867, + "grad_norm": 57561.55078125, + "learning_rate": 1.067645254835879e-05, + "loss": 2.0814, + "step": 15764 + }, + { + "epoch": 2.9550140581068414, + "grad_norm": 56307.61328125, + "learning_rate": 1.0671599774123236e-05, + "loss": 2.0827, + "step": 15765 + }, + { + "epoch": 2.9552014995313964, + "grad_norm": 57344.21484375, + "learning_rate": 1.0666747971248197e-05, + "loss": 2.019, + "step": 15766 + }, + { + "epoch": 2.9553889409559515, + "grad_norm": 52100.98046875, + "learning_rate": 1.0661897139853471e-05, + "loss": 2.1144, + "step": 15767 + }, + { + "epoch": 2.955576382380506, + "grad_norm": 53006.953125, + "learning_rate": 1.0657047280058896e-05, + "loss": 2.1504, + "step": 15768 + }, + { + "epoch": 2.9557638238050608, + "grad_norm": 52466.7265625, + "learning_rate": 1.0652198391984231e-05, + "loss": 2.1011, + "step": 15769 + }, + { + "epoch": 2.955951265229616, + "grad_norm": 53802.82421875, + "learning_rate": 1.0647350475749262e-05, + "loss": 2.0825, + "step": 15770 + }, + { + "epoch": 2.9561387066541704, + "grad_norm": 51623.16015625, + "learning_rate": 1.0642503531473708e-05, + "loss": 2.1698, + "step": 15771 + }, + { + "epoch": 2.9563261480787255, + "grad_norm": 52812.83203125, + "learning_rate": 1.0637657559277297e-05, + "loss": 2.1445, + "step": 15772 + }, + { + "epoch": 2.95651358950328, + "grad_norm": 53698.80859375, + "learning_rate": 1.063281255927971e-05, + "loss": 2.0635, + "step": 15773 + }, + { + "epoch": 2.956701030927835, + "grad_norm": 55350.53125, + "learning_rate": 1.0627968531600601e-05, + "loss": 2.0669, + "step": 15774 + }, + { + "epoch": 2.95688847235239, + "grad_norm": 58091.9765625, + "learning_rate": 1.0623125476359635e-05, + "loss": 2.1255, + "step": 15775 + }, + { + "epoch": 2.9570759137769445, + "grad_norm": 56466.92578125, + "learning_rate": 1.0618283393676404e-05, + "loss": 2.1284, + "step": 15776 + }, + { + "epoch": 2.9572633552014995, + "grad_norm": 52678.484375, + "learning_rate": 1.0613442283670527e-05, + "loss": 2.0799, + "step": 15777 + }, + { + "epoch": 2.9574507966260546, + "grad_norm": 56117.76171875, + "learning_rate": 1.0608602146461549e-05, + "loss": 2.1828, + "step": 15778 + }, + { + "epoch": 2.957638238050609, + "grad_norm": 52371.47265625, + "learning_rate": 1.060376298216904e-05, + "loss": 2.0875, + "step": 15779 + }, + { + "epoch": 2.957825679475164, + "grad_norm": 55746.78125, + "learning_rate": 1.0598924790912502e-05, + "loss": 2.1416, + "step": 15780 + }, + { + "epoch": 2.958013120899719, + "grad_norm": 50925.53515625, + "learning_rate": 1.0594087572811428e-05, + "loss": 2.1366, + "step": 15781 + }, + { + "epoch": 2.9582005623242735, + "grad_norm": 59634.65234375, + "learning_rate": 1.05892513279853e-05, + "loss": 2.1344, + "step": 15782 + }, + { + "epoch": 2.9583880037488286, + "grad_norm": 56426.26171875, + "learning_rate": 1.0584416056553592e-05, + "loss": 2.1042, + "step": 15783 + }, + { + "epoch": 2.9585754451733832, + "grad_norm": 54341.92578125, + "learning_rate": 1.0579581758635682e-05, + "loss": 2.1006, + "step": 15784 + }, + { + "epoch": 2.9587628865979383, + "grad_norm": 51407.54296875, + "learning_rate": 1.057474843435099e-05, + "loss": 2.0772, + "step": 15785 + }, + { + "epoch": 2.958950328022493, + "grad_norm": 53028.29296875, + "learning_rate": 1.056991608381891e-05, + "loss": 2.0944, + "step": 15786 + }, + { + "epoch": 2.9591377694470475, + "grad_norm": 49973.8515625, + "learning_rate": 1.0565084707158784e-05, + "loss": 2.1789, + "step": 15787 + }, + { + "epoch": 2.9593252108716026, + "grad_norm": 50269.71484375, + "learning_rate": 1.0560254304489919e-05, + "loss": 2.0986, + "step": 15788 + }, + { + "epoch": 2.9595126522961577, + "grad_norm": 49334.76953125, + "learning_rate": 1.0555424875931641e-05, + "loss": 2.0937, + "step": 15789 + }, + { + "epoch": 2.9597000937207123, + "grad_norm": 53911.90234375, + "learning_rate": 1.0550596421603254e-05, + "loss": 2.0717, + "step": 15790 + }, + { + "epoch": 2.959887535145267, + "grad_norm": 57385.484375, + "learning_rate": 1.0545768941623963e-05, + "loss": 2.0934, + "step": 15791 + }, + { + "epoch": 2.960074976569822, + "grad_norm": 53060.578125, + "learning_rate": 1.0540942436113022e-05, + "loss": 2.0999, + "step": 15792 + }, + { + "epoch": 2.9602624179943766, + "grad_norm": 53246.6796875, + "learning_rate": 1.0536116905189663e-05, + "loss": 2.1371, + "step": 15793 + }, + { + "epoch": 2.9604498594189317, + "grad_norm": 53890.0390625, + "learning_rate": 1.0531292348973048e-05, + "loss": 2.108, + "step": 15794 + }, + { + "epoch": 2.9606373008434863, + "grad_norm": 52747.609375, + "learning_rate": 1.0526468767582326e-05, + "loss": 2.1303, + "step": 15795 + }, + { + "epoch": 2.9608247422680414, + "grad_norm": 55873.078125, + "learning_rate": 1.0521646161136661e-05, + "loss": 2.1753, + "step": 15796 + }, + { + "epoch": 2.961012183692596, + "grad_norm": 53538.47265625, + "learning_rate": 1.0516824529755137e-05, + "loss": 2.0818, + "step": 15797 + }, + { + "epoch": 2.9611996251171506, + "grad_norm": 56085.2109375, + "learning_rate": 1.051200387355687e-05, + "loss": 2.1194, + "step": 15798 + }, + { + "epoch": 2.9613870665417057, + "grad_norm": 61190.46875, + "learning_rate": 1.0507184192660896e-05, + "loss": 2.1515, + "step": 15799 + }, + { + "epoch": 2.9615745079662608, + "grad_norm": 52977.4921875, + "learning_rate": 1.0502365487186282e-05, + "loss": 2.1332, + "step": 15800 + }, + { + "epoch": 2.9617619493908154, + "grad_norm": 55663.70703125, + "learning_rate": 1.0497547757252035e-05, + "loss": 2.1519, + "step": 15801 + }, + { + "epoch": 2.96194939081537, + "grad_norm": 59981.84765625, + "learning_rate": 1.049273100297713e-05, + "loss": 2.0485, + "step": 15802 + }, + { + "epoch": 2.962136832239925, + "grad_norm": 53072.53125, + "learning_rate": 1.0487915224480554e-05, + "loss": 2.1287, + "step": 15803 + }, + { + "epoch": 2.9623242736644797, + "grad_norm": 54628.265625, + "learning_rate": 1.0483100421881237e-05, + "loss": 2.0736, + "step": 15804 + }, + { + "epoch": 2.9625117150890348, + "grad_norm": 53513.28515625, + "learning_rate": 1.0478286595298114e-05, + "loss": 2.0963, + "step": 15805 + }, + { + "epoch": 2.9626991565135894, + "grad_norm": 56486.859375, + "learning_rate": 1.0473473744850076e-05, + "loss": 2.1088, + "step": 15806 + }, + { + "epoch": 2.9628865979381445, + "grad_norm": 52755.0703125, + "learning_rate": 1.0468661870655971e-05, + "loss": 2.0959, + "step": 15807 + }, + { + "epoch": 2.963074039362699, + "grad_norm": 57761.4921875, + "learning_rate": 1.0463850972834672e-05, + "loss": 2.0613, + "step": 15808 + }, + { + "epoch": 2.9632614807872537, + "grad_norm": 54327.01171875, + "learning_rate": 1.0459041051505004e-05, + "loss": 2.0166, + "step": 15809 + }, + { + "epoch": 2.963448922211809, + "grad_norm": 53311.30078125, + "learning_rate": 1.0454232106785755e-05, + "loss": 2.0456, + "step": 15810 + }, + { + "epoch": 2.963636363636364, + "grad_norm": 53889.9140625, + "learning_rate": 1.0449424138795689e-05, + "loss": 2.0417, + "step": 15811 + }, + { + "epoch": 2.9638238050609185, + "grad_norm": 58137.75, + "learning_rate": 1.0444617147653568e-05, + "loss": 2.0781, + "step": 15812 + }, + { + "epoch": 2.964011246485473, + "grad_norm": 56698.578125, + "learning_rate": 1.0439811133478145e-05, + "loss": 2.1201, + "step": 15813 + }, + { + "epoch": 2.964198687910028, + "grad_norm": 52696.63671875, + "learning_rate": 1.0435006096388073e-05, + "loss": 2.1199, + "step": 15814 + }, + { + "epoch": 2.964386129334583, + "grad_norm": 54695.69140625, + "learning_rate": 1.0430202036502046e-05, + "loss": 2.142, + "step": 15815 + }, + { + "epoch": 2.964573570759138, + "grad_norm": 53258.3203125, + "learning_rate": 1.0425398953938747e-05, + "loss": 2.1437, + "step": 15816 + }, + { + "epoch": 2.9647610121836925, + "grad_norm": 55860.53125, + "learning_rate": 1.0420596848816782e-05, + "loss": 2.0965, + "step": 15817 + }, + { + "epoch": 2.9649484536082475, + "grad_norm": 60919.30078125, + "learning_rate": 1.0415795721254745e-05, + "loss": 2.0673, + "step": 15818 + }, + { + "epoch": 2.965135895032802, + "grad_norm": 55151.3046875, + "learning_rate": 1.0410995571371245e-05, + "loss": 2.1242, + "step": 15819 + }, + { + "epoch": 2.9653233364573572, + "grad_norm": 56726.890625, + "learning_rate": 1.0406196399284829e-05, + "loss": 2.1216, + "step": 15820 + }, + { + "epoch": 2.965510777881912, + "grad_norm": 58422.9921875, + "learning_rate": 1.0401398205114015e-05, + "loss": 2.0778, + "step": 15821 + }, + { + "epoch": 2.965698219306467, + "grad_norm": 55476.4375, + "learning_rate": 1.0396600988977324e-05, + "loss": 2.0784, + "step": 15822 + }, + { + "epoch": 2.9658856607310216, + "grad_norm": 54837.8203125, + "learning_rate": 1.0391804750993255e-05, + "loss": 2.1739, + "step": 15823 + }, + { + "epoch": 2.966073102155576, + "grad_norm": 52139.16015625, + "learning_rate": 1.0387009491280253e-05, + "loss": 2.1149, + "step": 15824 + }, + { + "epoch": 2.9662605435801312, + "grad_norm": 57189.68359375, + "learning_rate": 1.0382215209956746e-05, + "loss": 2.1045, + "step": 15825 + }, + { + "epoch": 2.966447985004686, + "grad_norm": 53111.37890625, + "learning_rate": 1.0377421907141177e-05, + "loss": 2.0672, + "step": 15826 + }, + { + "epoch": 2.966635426429241, + "grad_norm": 57575.19921875, + "learning_rate": 1.0372629582951893e-05, + "loss": 2.1114, + "step": 15827 + }, + { + "epoch": 2.9668228678537956, + "grad_norm": 51175.22265625, + "learning_rate": 1.03678382375073e-05, + "loss": 2.1217, + "step": 15828 + }, + { + "epoch": 2.9670103092783506, + "grad_norm": 57608.46484375, + "learning_rate": 1.0363047870925701e-05, + "loss": 2.046, + "step": 15829 + }, + { + "epoch": 2.9671977507029053, + "grad_norm": 57476.1796875, + "learning_rate": 1.035825848332545e-05, + "loss": 2.1243, + "step": 15830 + }, + { + "epoch": 2.9673851921274603, + "grad_norm": 51933.2421875, + "learning_rate": 1.0353470074824812e-05, + "loss": 2.1251, + "step": 15831 + }, + { + "epoch": 2.967572633552015, + "grad_norm": 55272.984375, + "learning_rate": 1.0348682645542051e-05, + "loss": 2.1169, + "step": 15832 + }, + { + "epoch": 2.96776007497657, + "grad_norm": 57384.0546875, + "learning_rate": 1.0343896195595432e-05, + "loss": 2.1255, + "step": 15833 + }, + { + "epoch": 2.9679475164011246, + "grad_norm": 52380.4140625, + "learning_rate": 1.0339110725103145e-05, + "loss": 2.1182, + "step": 15834 + }, + { + "epoch": 2.9681349578256793, + "grad_norm": 51280.0078125, + "learning_rate": 1.0334326234183422e-05, + "loss": 2.1315, + "step": 15835 + }, + { + "epoch": 2.9683223992502343, + "grad_norm": 56280.67578125, + "learning_rate": 1.0329542722954411e-05, + "loss": 2.0932, + "step": 15836 + }, + { + "epoch": 2.9685098406747894, + "grad_norm": 61333.4765625, + "learning_rate": 1.0324760191534245e-05, + "loss": 2.0482, + "step": 15837 + }, + { + "epoch": 2.968697282099344, + "grad_norm": 56714.80078125, + "learning_rate": 1.031997864004106e-05, + "loss": 2.1028, + "step": 15838 + }, + { + "epoch": 2.9688847235238986, + "grad_norm": 54135.82421875, + "learning_rate": 1.0315198068592979e-05, + "loss": 2.1605, + "step": 15839 + }, + { + "epoch": 2.9690721649484537, + "grad_norm": 48105.90625, + "learning_rate": 1.0310418477308048e-05, + "loss": 2.1086, + "step": 15840 + }, + { + "epoch": 2.9692596063730083, + "grad_norm": 54782.1875, + "learning_rate": 1.0305639866304312e-05, + "loss": 2.0788, + "step": 15841 + }, + { + "epoch": 2.9694470477975634, + "grad_norm": 55043.36328125, + "learning_rate": 1.0300862235699815e-05, + "loss": 2.1608, + "step": 15842 + }, + { + "epoch": 2.969634489222118, + "grad_norm": 52262.171875, + "learning_rate": 1.0296085585612552e-05, + "loss": 2.1097, + "step": 15843 + }, + { + "epoch": 2.969821930646673, + "grad_norm": 58331.515625, + "learning_rate": 1.029130991616049e-05, + "loss": 2.2205, + "step": 15844 + }, + { + "epoch": 2.9700093720712277, + "grad_norm": 55017.046875, + "learning_rate": 1.0286535227461585e-05, + "loss": 2.1174, + "step": 15845 + }, + { + "epoch": 2.9701968134957824, + "grad_norm": 57797.0625, + "learning_rate": 1.0281761519633799e-05, + "loss": 2.0068, + "step": 15846 + }, + { + "epoch": 2.9703842549203374, + "grad_norm": 49295.05078125, + "learning_rate": 1.0276988792794979e-05, + "loss": 2.1403, + "step": 15847 + }, + { + "epoch": 2.9705716963448925, + "grad_norm": 53977.40234375, + "learning_rate": 1.0272217047063037e-05, + "loss": 2.1511, + "step": 15848 + }, + { + "epoch": 2.970759137769447, + "grad_norm": 54105.28515625, + "learning_rate": 1.0267446282555838e-05, + "loss": 2.1001, + "step": 15849 + }, + { + "epoch": 2.9709465791940017, + "grad_norm": 54841.54296875, + "learning_rate": 1.0262676499391205e-05, + "loss": 2.1499, + "step": 15850 + }, + { + "epoch": 2.971134020618557, + "grad_norm": 52634.1171875, + "learning_rate": 1.0257907697686925e-05, + "loss": 2.0403, + "step": 15851 + }, + { + "epoch": 2.9713214620431114, + "grad_norm": 53457.5703125, + "learning_rate": 1.0253139877560797e-05, + "loss": 2.155, + "step": 15852 + }, + { + "epoch": 2.9715089034676665, + "grad_norm": 53810.38671875, + "learning_rate": 1.0248373039130599e-05, + "loss": 2.1786, + "step": 15853 + }, + { + "epoch": 2.971696344892221, + "grad_norm": 49637.91796875, + "learning_rate": 1.0243607182514047e-05, + "loss": 2.158, + "step": 15854 + }, + { + "epoch": 2.971883786316776, + "grad_norm": 57803.734375, + "learning_rate": 1.0238842307828838e-05, + "loss": 2.2063, + "step": 15855 + }, + { + "epoch": 2.972071227741331, + "grad_norm": 54008.90625, + "learning_rate": 1.0234078415192688e-05, + "loss": 2.0842, + "step": 15856 + }, + { + "epoch": 2.9722586691658854, + "grad_norm": 54794.3671875, + "learning_rate": 1.0229315504723242e-05, + "loss": 2.1177, + "step": 15857 + }, + { + "epoch": 2.9724461105904405, + "grad_norm": 53874.54296875, + "learning_rate": 1.0224553576538126e-05, + "loss": 2.1581, + "step": 15858 + }, + { + "epoch": 2.9726335520149956, + "grad_norm": 54019.8359375, + "learning_rate": 1.0219792630754981e-05, + "loss": 2.0984, + "step": 15859 + }, + { + "epoch": 2.97282099343955, + "grad_norm": 55343.828125, + "learning_rate": 1.021503266749137e-05, + "loss": 2.1004, + "step": 15860 + }, + { + "epoch": 2.973008434864105, + "grad_norm": 53770.0, + "learning_rate": 1.0210273686864886e-05, + "loss": 2.1477, + "step": 15861 + }, + { + "epoch": 2.97319587628866, + "grad_norm": 54341.390625, + "learning_rate": 1.0205515688993039e-05, + "loss": 2.0861, + "step": 15862 + }, + { + "epoch": 2.9733833177132145, + "grad_norm": 58056.21484375, + "learning_rate": 1.0200758673993372e-05, + "loss": 2.1867, + "step": 15863 + }, + { + "epoch": 2.9735707591377696, + "grad_norm": 55973.94140625, + "learning_rate": 1.019600264198335e-05, + "loss": 2.1628, + "step": 15864 + }, + { + "epoch": 2.973758200562324, + "grad_norm": 55966.7109375, + "learning_rate": 1.0191247593080472e-05, + "loss": 2.1814, + "step": 15865 + }, + { + "epoch": 2.9739456419868793, + "grad_norm": 55189.9765625, + "learning_rate": 1.0186493527402163e-05, + "loss": 2.068, + "step": 15866 + }, + { + "epoch": 2.974133083411434, + "grad_norm": 48714.890625, + "learning_rate": 1.0181740445065834e-05, + "loss": 2.2285, + "step": 15867 + }, + { + "epoch": 2.9743205248359885, + "grad_norm": 56462.6015625, + "learning_rate": 1.0176988346188893e-05, + "loss": 2.0899, + "step": 15868 + }, + { + "epoch": 2.9745079662605436, + "grad_norm": 54993.94921875, + "learning_rate": 1.0172237230888715e-05, + "loss": 2.0805, + "step": 15869 + }, + { + "epoch": 2.9746954076850987, + "grad_norm": 52447.46875, + "learning_rate": 1.0167487099282646e-05, + "loss": 1.9973, + "step": 15870 + }, + { + "epoch": 2.9748828491096533, + "grad_norm": 57052.62890625, + "learning_rate": 1.0162737951487988e-05, + "loss": 2.0146, + "step": 15871 + }, + { + "epoch": 2.975070290534208, + "grad_norm": 55372.40234375, + "learning_rate": 1.0157989787622063e-05, + "loss": 2.0863, + "step": 15872 + }, + { + "epoch": 2.975257731958763, + "grad_norm": 55954.76953125, + "learning_rate": 1.0153242607802138e-05, + "loss": 2.1146, + "step": 15873 + }, + { + "epoch": 2.9754451733833176, + "grad_norm": 57605.55078125, + "learning_rate": 1.0148496412145441e-05, + "loss": 2.0742, + "step": 15874 + }, + { + "epoch": 2.9756326148078727, + "grad_norm": 51217.765625, + "learning_rate": 1.014375120076922e-05, + "loss": 2.1275, + "step": 15875 + }, + { + "epoch": 2.9758200562324273, + "grad_norm": 58180.51171875, + "learning_rate": 1.0139006973790693e-05, + "loss": 2.1315, + "step": 15876 + }, + { + "epoch": 2.9760074976569824, + "grad_norm": 50955.47265625, + "learning_rate": 1.0134263731326987e-05, + "loss": 2.1211, + "step": 15877 + }, + { + "epoch": 2.976194939081537, + "grad_norm": 54472.2734375, + "learning_rate": 1.012952147349528e-05, + "loss": 2.0974, + "step": 15878 + }, + { + "epoch": 2.9763823805060916, + "grad_norm": 50726.44140625, + "learning_rate": 1.0124780200412715e-05, + "loss": 2.0851, + "step": 15879 + }, + { + "epoch": 2.9765698219306467, + "grad_norm": 51703.91015625, + "learning_rate": 1.0120039912196377e-05, + "loss": 2.0845, + "step": 15880 + }, + { + "epoch": 2.9767572633552017, + "grad_norm": 55335.68359375, + "learning_rate": 1.0115300608963336e-05, + "loss": 2.1179, + "step": 15881 + }, + { + "epoch": 2.9769447047797564, + "grad_norm": 57307.6015625, + "learning_rate": 1.0110562290830672e-05, + "loss": 2.0894, + "step": 15882 + }, + { + "epoch": 2.977132146204311, + "grad_norm": 54083.55078125, + "learning_rate": 1.01058249579154e-05, + "loss": 2.044, + "step": 15883 + }, + { + "epoch": 2.977319587628866, + "grad_norm": 58994.01953125, + "learning_rate": 1.010108861033452e-05, + "loss": 2.0986, + "step": 15884 + }, + { + "epoch": 2.9775070290534207, + "grad_norm": 54516.515625, + "learning_rate": 1.0096353248205014e-05, + "loss": 2.1431, + "step": 15885 + }, + { + "epoch": 2.9776944704779758, + "grad_norm": 56466.76171875, + "learning_rate": 1.0091618871643865e-05, + "loss": 2.151, + "step": 15886 + }, + { + "epoch": 2.9778819119025304, + "grad_norm": 57385.578125, + "learning_rate": 1.0086885480767988e-05, + "loss": 2.0587, + "step": 15887 + }, + { + "epoch": 2.9780693533270854, + "grad_norm": 53340.57421875, + "learning_rate": 1.0082153075694279e-05, + "loss": 2.1125, + "step": 15888 + }, + { + "epoch": 2.97825679475164, + "grad_norm": 55665.17578125, + "learning_rate": 1.0077421656539649e-05, + "loss": 2.1205, + "step": 15889 + }, + { + "epoch": 2.9784442361761947, + "grad_norm": 57490.75, + "learning_rate": 1.0072691223420926e-05, + "loss": 2.0988, + "step": 15890 + }, + { + "epoch": 2.9786316776007498, + "grad_norm": 57938.7890625, + "learning_rate": 1.0067961776454976e-05, + "loss": 2.1236, + "step": 15891 + }, + { + "epoch": 2.978819119025305, + "grad_norm": 54234.53125, + "learning_rate": 1.0063233315758586e-05, + "loss": 2.1235, + "step": 15892 + }, + { + "epoch": 2.9790065604498595, + "grad_norm": 59675.4765625, + "learning_rate": 1.0058505841448573e-05, + "loss": 2.0396, + "step": 15893 + }, + { + "epoch": 2.979194001874414, + "grad_norm": 52921.21875, + "learning_rate": 1.0053779353641662e-05, + "loss": 2.1758, + "step": 15894 + }, + { + "epoch": 2.979381443298969, + "grad_norm": 53140.1640625, + "learning_rate": 1.004905385245462e-05, + "loss": 2.1254, + "step": 15895 + }, + { + "epoch": 2.9795688847235238, + "grad_norm": 54425.75, + "learning_rate": 1.0044329338004155e-05, + "loss": 2.0672, + "step": 15896 + }, + { + "epoch": 2.979756326148079, + "grad_norm": 59701.69140625, + "learning_rate": 1.0039605810406943e-05, + "loss": 2.0875, + "step": 15897 + }, + { + "epoch": 2.9799437675726335, + "grad_norm": 55813.55859375, + "learning_rate": 1.0034883269779666e-05, + "loss": 2.1141, + "step": 15898 + }, + { + "epoch": 2.9801312089971885, + "grad_norm": 55470.13671875, + "learning_rate": 1.0030161716238955e-05, + "loss": 2.1547, + "step": 15899 + }, + { + "epoch": 2.980318650421743, + "grad_norm": 53624.65234375, + "learning_rate": 1.002544114990142e-05, + "loss": 2.0748, + "step": 15900 + }, + { + "epoch": 2.980506091846298, + "grad_norm": 51614.296875, + "learning_rate": 1.0020721570883657e-05, + "loss": 2.0843, + "step": 15901 + }, + { + "epoch": 2.980693533270853, + "grad_norm": 52658.4609375, + "learning_rate": 1.0016002979302252e-05, + "loss": 2.0537, + "step": 15902 + }, + { + "epoch": 2.980880974695408, + "grad_norm": 55936.53515625, + "learning_rate": 1.0011285375273732e-05, + "loss": 2.1175, + "step": 15903 + }, + { + "epoch": 2.9810684161199625, + "grad_norm": 52244.43359375, + "learning_rate": 1.0006568758914603e-05, + "loss": 2.2432, + "step": 15904 + }, + { + "epoch": 2.981255857544517, + "grad_norm": 55172.921875, + "learning_rate": 1.0001853130341377e-05, + "loss": 2.1371, + "step": 15905 + }, + { + "epoch": 2.9814432989690722, + "grad_norm": 54070.75390625, + "learning_rate": 9.997138489670543e-06, + "loss": 2.0467, + "step": 15906 + }, + { + "epoch": 2.981630740393627, + "grad_norm": 52228.859375, + "learning_rate": 9.992424837018506e-06, + "loss": 2.1293, + "step": 15907 + }, + { + "epoch": 2.981818181818182, + "grad_norm": 53206.9921875, + "learning_rate": 9.987712172501696e-06, + "loss": 2.0981, + "step": 15908 + }, + { + "epoch": 2.9820056232427365, + "grad_norm": 52993.1328125, + "learning_rate": 9.983000496236533e-06, + "loss": 2.0917, + "step": 15909 + }, + { + "epoch": 2.9821930646672916, + "grad_norm": 56131.22265625, + "learning_rate": 9.978289808339375e-06, + "loss": 2.1336, + "step": 15910 + }, + { + "epoch": 2.9823805060918462, + "grad_norm": 53434.109375, + "learning_rate": 9.97358010892656e-06, + "loss": 2.1003, + "step": 15911 + }, + { + "epoch": 2.982567947516401, + "grad_norm": 51664.46875, + "learning_rate": 9.968871398114433e-06, + "loss": 2.1472, + "step": 15912 + }, + { + "epoch": 2.982755388940956, + "grad_norm": 57690.35546875, + "learning_rate": 9.964163676019278e-06, + "loss": 2.1751, + "step": 15913 + }, + { + "epoch": 2.982942830365511, + "grad_norm": 55353.140625, + "learning_rate": 9.959456942757362e-06, + "loss": 2.1496, + "step": 15914 + }, + { + "epoch": 2.9831302717900656, + "grad_norm": 52136.46484375, + "learning_rate": 9.954751198444945e-06, + "loss": 2.1209, + "step": 15915 + }, + { + "epoch": 2.9833177132146202, + "grad_norm": 54787.16796875, + "learning_rate": 9.950046443198269e-06, + "loss": 2.143, + "step": 15916 + }, + { + "epoch": 2.9835051546391753, + "grad_norm": 55630.53515625, + "learning_rate": 9.945342677133523e-06, + "loss": 2.1128, + "step": 15917 + }, + { + "epoch": 2.98369259606373, + "grad_norm": 56141.5078125, + "learning_rate": 9.940639900366866e-06, + "loss": 2.1413, + "step": 15918 + }, + { + "epoch": 2.983880037488285, + "grad_norm": 52679.4296875, + "learning_rate": 9.935938113014476e-06, + "loss": 2.0233, + "step": 15919 + }, + { + "epoch": 2.9840674789128396, + "grad_norm": 55433.68359375, + "learning_rate": 9.93123731519246e-06, + "loss": 2.1103, + "step": 15920 + }, + { + "epoch": 2.9842549203373947, + "grad_norm": 56316.421875, + "learning_rate": 9.926537507016943e-06, + "loss": 2.1419, + "step": 15921 + }, + { + "epoch": 2.9844423617619493, + "grad_norm": 53644.484375, + "learning_rate": 9.921838688603985e-06, + "loss": 2.0909, + "step": 15922 + }, + { + "epoch": 2.984629803186504, + "grad_norm": 53012.60546875, + "learning_rate": 9.917140860069656e-06, + "loss": 2.1167, + "step": 15923 + }, + { + "epoch": 2.984817244611059, + "grad_norm": 50246.35546875, + "learning_rate": 9.91244402152997e-06, + "loss": 2.1151, + "step": 15924 + }, + { + "epoch": 2.985004686035614, + "grad_norm": 57552.828125, + "learning_rate": 9.90774817310096e-06, + "loss": 2.0422, + "step": 15925 + }, + { + "epoch": 2.9851921274601687, + "grad_norm": 59053.53515625, + "learning_rate": 9.903053314898586e-06, + "loss": 2.1123, + "step": 15926 + }, + { + "epoch": 2.9853795688847233, + "grad_norm": 56561.14453125, + "learning_rate": 9.898359447038797e-06, + "loss": 2.1328, + "step": 15927 + }, + { + "epoch": 2.9855670103092784, + "grad_norm": 50071.48828125, + "learning_rate": 9.893666569637545e-06, + "loss": 2.1124, + "step": 15928 + }, + { + "epoch": 2.985754451733833, + "grad_norm": 56302.6171875, + "learning_rate": 9.888974682810737e-06, + "loss": 2.0979, + "step": 15929 + }, + { + "epoch": 2.985941893158388, + "grad_norm": 53298.984375, + "learning_rate": 9.884283786674231e-06, + "loss": 2.1114, + "step": 15930 + }, + { + "epoch": 2.9861293345829427, + "grad_norm": 55163.14453125, + "learning_rate": 9.879593881343908e-06, + "loss": 2.1588, + "step": 15931 + }, + { + "epoch": 2.986316776007498, + "grad_norm": 53867.69921875, + "learning_rate": 9.874904966935611e-06, + "loss": 2.1318, + "step": 15932 + }, + { + "epoch": 2.9865042174320524, + "grad_norm": 54043.078125, + "learning_rate": 9.870217043565138e-06, + "loss": 2.1679, + "step": 15933 + }, + { + "epoch": 2.986691658856607, + "grad_norm": 50841.0625, + "learning_rate": 9.865530111348259e-06, + "loss": 2.1008, + "step": 15934 + }, + { + "epoch": 2.986879100281162, + "grad_norm": 55071.62890625, + "learning_rate": 9.860844170400768e-06, + "loss": 2.0657, + "step": 15935 + }, + { + "epoch": 2.987066541705717, + "grad_norm": 54755.65625, + "learning_rate": 9.856159220838384e-06, + "loss": 2.1565, + "step": 15936 + }, + { + "epoch": 2.987253983130272, + "grad_norm": 57536.83984375, + "learning_rate": 9.85147526277681e-06, + "loss": 2.2592, + "step": 15937 + }, + { + "epoch": 2.9874414245548264, + "grad_norm": 56297.75390625, + "learning_rate": 9.84679229633174e-06, + "loss": 2.1155, + "step": 15938 + }, + { + "epoch": 2.9876288659793815, + "grad_norm": 59421.28515625, + "learning_rate": 9.84211032161887e-06, + "loss": 2.1794, + "step": 15939 + }, + { + "epoch": 2.987816307403936, + "grad_norm": 56916.6171875, + "learning_rate": 9.83742933875378e-06, + "loss": 2.1517, + "step": 15940 + }, + { + "epoch": 2.988003748828491, + "grad_norm": 53707.48046875, + "learning_rate": 9.832749347852115e-06, + "loss": 2.2826, + "step": 15941 + }, + { + "epoch": 2.988191190253046, + "grad_norm": 54465.5234375, + "learning_rate": 9.828070349029478e-06, + "loss": 2.1233, + "step": 15942 + }, + { + "epoch": 2.988378631677601, + "grad_norm": 52378.421875, + "learning_rate": 9.823392342401417e-06, + "loss": 2.1228, + "step": 15943 + }, + { + "epoch": 2.9885660731021555, + "grad_norm": 53986.375, + "learning_rate": 9.818715328083461e-06, + "loss": 2.0633, + "step": 15944 + }, + { + "epoch": 2.9887535145267106, + "grad_norm": 52548.703125, + "learning_rate": 9.814039306191148e-06, + "loss": 2.159, + "step": 15945 + }, + { + "epoch": 2.988940955951265, + "grad_norm": 52757.140625, + "learning_rate": 9.809364276839967e-06, + "loss": 2.1114, + "step": 15946 + }, + { + "epoch": 2.9891283973758203, + "grad_norm": 53718.9453125, + "learning_rate": 9.804690240145386e-06, + "loss": 2.1338, + "step": 15947 + }, + { + "epoch": 2.989315838800375, + "grad_norm": 58544.515625, + "learning_rate": 9.800017196222827e-06, + "loss": 2.1071, + "step": 15948 + }, + { + "epoch": 2.9895032802249295, + "grad_norm": 56595.18359375, + "learning_rate": 9.795345145187734e-06, + "loss": 2.0929, + "step": 15949 + }, + { + "epoch": 2.9896907216494846, + "grad_norm": 52576.3828125, + "learning_rate": 9.790674087155482e-06, + "loss": 2.1551, + "step": 15950 + }, + { + "epoch": 2.989878163074039, + "grad_norm": 57291.4453125, + "learning_rate": 9.786004022241457e-06, + "loss": 2.0744, + "step": 15951 + }, + { + "epoch": 2.9900656044985943, + "grad_norm": 59480.8828125, + "learning_rate": 9.781334950560994e-06, + "loss": 2.1199, + "step": 15952 + }, + { + "epoch": 2.990253045923149, + "grad_norm": 50004.66796875, + "learning_rate": 9.7766668722294e-06, + "loss": 2.0985, + "step": 15953 + }, + { + "epoch": 2.990440487347704, + "grad_norm": 56971.015625, + "learning_rate": 9.771999787362002e-06, + "loss": 2.1551, + "step": 15954 + }, + { + "epoch": 2.9906279287722586, + "grad_norm": 50976.5234375, + "learning_rate": 9.767333696074038e-06, + "loss": 2.0854, + "step": 15955 + }, + { + "epoch": 2.9908153701968136, + "grad_norm": 53526.984375, + "learning_rate": 9.762668598480778e-06, + "loss": 2.0833, + "step": 15956 + }, + { + "epoch": 2.9910028116213683, + "grad_norm": 50836.78515625, + "learning_rate": 9.758004494697426e-06, + "loss": 2.169, + "step": 15957 + }, + { + "epoch": 2.9911902530459233, + "grad_norm": 50881.0703125, + "learning_rate": 9.753341384839198e-06, + "loss": 2.0607, + "step": 15958 + }, + { + "epoch": 2.991377694470478, + "grad_norm": 53066.51953125, + "learning_rate": 9.74867926902126e-06, + "loss": 2.1648, + "step": 15959 + }, + { + "epoch": 2.9915651358950326, + "grad_norm": 56589.89453125, + "learning_rate": 9.744018147358746e-06, + "loss": 2.1685, + "step": 15960 + }, + { + "epoch": 2.9917525773195877, + "grad_norm": 52873.85546875, + "learning_rate": 9.739358019966788e-06, + "loss": 2.1302, + "step": 15961 + }, + { + "epoch": 2.9919400187441427, + "grad_norm": 52991.125, + "learning_rate": 9.734698886960503e-06, + "loss": 2.1559, + "step": 15962 + }, + { + "epoch": 2.9921274601686974, + "grad_norm": 54094.34765625, + "learning_rate": 9.730040748454949e-06, + "loss": 2.1323, + "step": 15963 + }, + { + "epoch": 2.992314901593252, + "grad_norm": 51404.34375, + "learning_rate": 9.725383604565169e-06, + "loss": 2.1289, + "step": 15964 + }, + { + "epoch": 2.992502343017807, + "grad_norm": 55134.37890625, + "learning_rate": 9.72072745540621e-06, + "loss": 2.1779, + "step": 15965 + }, + { + "epoch": 2.9926897844423617, + "grad_norm": 56286.2421875, + "learning_rate": 9.716072301093054e-06, + "loss": 2.1378, + "step": 15966 + }, + { + "epoch": 2.9928772258669167, + "grad_norm": 55362.73828125, + "learning_rate": 9.711418141740675e-06, + "loss": 2.0356, + "step": 15967 + }, + { + "epoch": 2.9930646672914714, + "grad_norm": 52904.94140625, + "learning_rate": 9.706764977464029e-06, + "loss": 2.0979, + "step": 15968 + }, + { + "epoch": 2.9932521087160264, + "grad_norm": 56326.64453125, + "learning_rate": 9.702112808378078e-06, + "loss": 2.1468, + "step": 15969 + }, + { + "epoch": 2.993439550140581, + "grad_norm": 55765.4921875, + "learning_rate": 9.697461634597665e-06, + "loss": 2.0585, + "step": 15970 + }, + { + "epoch": 2.9936269915651357, + "grad_norm": 53509.4375, + "learning_rate": 9.692811456237693e-06, + "loss": 2.1603, + "step": 15971 + }, + { + "epoch": 2.9938144329896907, + "grad_norm": 58895.546875, + "learning_rate": 9.688162273413038e-06, + "loss": 2.0932, + "step": 15972 + }, + { + "epoch": 2.994001874414246, + "grad_norm": 52262.45703125, + "learning_rate": 9.68351408623851e-06, + "loss": 2.1492, + "step": 15973 + }, + { + "epoch": 2.9941893158388004, + "grad_norm": 52380.28515625, + "learning_rate": 9.678866894828897e-06, + "loss": 2.1201, + "step": 15974 + }, + { + "epoch": 2.994376757263355, + "grad_norm": 57046.4921875, + "learning_rate": 9.674220699299008e-06, + "loss": 2.1434, + "step": 15975 + }, + { + "epoch": 2.99456419868791, + "grad_norm": 51824.29296875, + "learning_rate": 9.669575499763572e-06, + "loss": 2.1373, + "step": 15976 + }, + { + "epoch": 2.9947516401124648, + "grad_norm": 55584.66015625, + "learning_rate": 9.664931296337342e-06, + "loss": 2.13, + "step": 15977 + }, + { + "epoch": 2.99493908153702, + "grad_norm": 55083.32421875, + "learning_rate": 9.660288089135005e-06, + "loss": 2.1367, + "step": 15978 + }, + { + "epoch": 2.9951265229615744, + "grad_norm": 53556.4375, + "learning_rate": 9.655645878271264e-06, + "loss": 2.184, + "step": 15979 + }, + { + "epoch": 2.9953139643861295, + "grad_norm": 55808.1875, + "learning_rate": 9.651004663860758e-06, + "loss": 2.1349, + "step": 15980 + }, + { + "epoch": 2.995501405810684, + "grad_norm": 53181.26953125, + "learning_rate": 9.646364446018113e-06, + "loss": 2.0793, + "step": 15981 + }, + { + "epoch": 2.9956888472352388, + "grad_norm": 56431.61328125, + "learning_rate": 9.641725224857962e-06, + "loss": 2.1113, + "step": 15982 + }, + { + "epoch": 2.995876288659794, + "grad_norm": 56886.6484375, + "learning_rate": 9.63708700049486e-06, + "loss": 2.1059, + "step": 15983 + }, + { + "epoch": 2.996063730084349, + "grad_norm": 56836.984375, + "learning_rate": 9.63244977304339e-06, + "loss": 2.0764, + "step": 15984 + }, + { + "epoch": 2.9962511715089035, + "grad_norm": 50944.1640625, + "learning_rate": 9.627813542618058e-06, + "loss": 2.1631, + "step": 15985 + }, + { + "epoch": 2.996438612933458, + "grad_norm": 53562.41015625, + "learning_rate": 9.623178309333403e-06, + "loss": 2.2048, + "step": 15986 + }, + { + "epoch": 2.996626054358013, + "grad_norm": 54455.4140625, + "learning_rate": 9.618544073303881e-06, + "loss": 2.0842, + "step": 15987 + }, + { + "epoch": 2.996813495782568, + "grad_norm": 56442.37109375, + "learning_rate": 9.613910834643974e-06, + "loss": 2.1431, + "step": 15988 + }, + { + "epoch": 2.997000937207123, + "grad_norm": 53726.87890625, + "learning_rate": 9.609278593468113e-06, + "loss": 2.0946, + "step": 15989 + }, + { + "epoch": 2.9971883786316775, + "grad_norm": 51660.4609375, + "learning_rate": 9.604647349890689e-06, + "loss": 2.1763, + "step": 15990 + }, + { + "epoch": 2.9973758200562326, + "grad_norm": 53737.734375, + "learning_rate": 9.600017104026104e-06, + "loss": 2.1136, + "step": 15991 + }, + { + "epoch": 2.9975632614807872, + "grad_norm": 56570.2890625, + "learning_rate": 9.595387855988741e-06, + "loss": 2.1126, + "step": 15992 + }, + { + "epoch": 2.997750702905342, + "grad_norm": 55845.78125, + "learning_rate": 9.590759605892885e-06, + "loss": 2.1242, + "step": 15993 + }, + { + "epoch": 2.997938144329897, + "grad_norm": 54853.55078125, + "learning_rate": 9.586132353852878e-06, + "loss": 2.1067, + "step": 15994 + }, + { + "epoch": 2.998125585754452, + "grad_norm": 54891.56640625, + "learning_rate": 9.581506099983018e-06, + "loss": 2.1037, + "step": 15995 + }, + { + "epoch": 2.9983130271790066, + "grad_norm": 61577.78125, + "learning_rate": 9.576880844397552e-06, + "loss": 2.1667, + "step": 15996 + }, + { + "epoch": 2.9985004686035612, + "grad_norm": 50424.578125, + "learning_rate": 9.57225658721071e-06, + "loss": 2.0653, + "step": 15997 + }, + { + "epoch": 2.9986879100281163, + "grad_norm": 53835.9921875, + "learning_rate": 9.567633328536713e-06, + "loss": 2.0902, + "step": 15998 + }, + { + "epoch": 2.998875351452671, + "grad_norm": 53111.4609375, + "learning_rate": 9.563011068489774e-06, + "loss": 2.0864, + "step": 15999 + }, + { + "epoch": 2.999062792877226, + "grad_norm": 53694.6328125, + "learning_rate": 9.558389807184014e-06, + "loss": 2.1535, + "step": 16000 + }, + { + "epoch": 2.999062792877226, + "eval_loss": 2.2541162967681885, + "eval_runtime": 128.4934, + "eval_samples_per_second": 39.294, + "eval_steps_per_second": 1.969, + "step": 16000 + }, + { + "epoch": 2.9992502343017806, + "grad_norm": 57802.89453125, + "learning_rate": 9.553769544733594e-06, + "loss": 2.1264, + "step": 16001 + }, + { + "epoch": 2.9994376757263357, + "grad_norm": 55138.92578125, + "learning_rate": 9.549150281252633e-06, + "loss": 2.1177, + "step": 16002 + }, + { + "epoch": 2.9996251171508903, + "grad_norm": 54790.578125, + "learning_rate": 9.544532016855218e-06, + "loss": 2.1309, + "step": 16003 + }, + { + "epoch": 2.999812558575445, + "grad_norm": 56496.5703125, + "learning_rate": 9.539914751655399e-06, + "loss": 2.0816, + "step": 16004 + }, + { + "epoch": 3.0, + "grad_norm": 69339.953125, + "learning_rate": 9.535298485767235e-06, + "loss": 2.1821, + "step": 16005 + }, + { + "epoch": 3.0001874414245546, + "grad_norm": 50965.03125, + "learning_rate": 9.530683219304731e-06, + "loss": 2.0701, + "step": 16006 + }, + { + "epoch": 3.0003748828491097, + "grad_norm": 50245.36328125, + "learning_rate": 9.526068952381888e-06, + "loss": 2.0534, + "step": 16007 + }, + { + "epoch": 3.0005623242736643, + "grad_norm": 50429.3671875, + "learning_rate": 9.521455685112657e-06, + "loss": 2.0966, + "step": 16008 + }, + { + "epoch": 3.0007497656982194, + "grad_norm": 52093.68359375, + "learning_rate": 9.516843417611005e-06, + "loss": 2.1466, + "step": 16009 + }, + { + "epoch": 3.000937207122774, + "grad_norm": 51273.4921875, + "learning_rate": 9.51223214999083e-06, + "loss": 2.0599, + "step": 16010 + }, + { + "epoch": 3.001124648547329, + "grad_norm": 54036.8046875, + "learning_rate": 9.507621882366013e-06, + "loss": 2.1053, + "step": 16011 + }, + { + "epoch": 3.0013120899718837, + "grad_norm": 53453.66015625, + "learning_rate": 9.503012614850454e-06, + "loss": 2.096, + "step": 16012 + }, + { + "epoch": 3.0014995313964388, + "grad_norm": 53412.671875, + "learning_rate": 9.49840434755796e-06, + "loss": 2.0893, + "step": 16013 + }, + { + "epoch": 3.0016869728209934, + "grad_norm": 54137.70703125, + "learning_rate": 9.493797080602385e-06, + "loss": 2.044, + "step": 16014 + }, + { + "epoch": 3.0018744142455485, + "grad_norm": 51429.8359375, + "learning_rate": 9.489190814097492e-06, + "loss": 2.1118, + "step": 16015 + }, + { + "epoch": 3.002061855670103, + "grad_norm": 51983.75, + "learning_rate": 9.484585548157077e-06, + "loss": 2.0057, + "step": 16016 + }, + { + "epoch": 3.0022492970946577, + "grad_norm": 54770.7890625, + "learning_rate": 9.479981282894851e-06, + "loss": 2.1574, + "step": 16017 + }, + { + "epoch": 3.0024367385192128, + "grad_norm": 54711.40234375, + "learning_rate": 9.475378018424568e-06, + "loss": 1.9847, + "step": 16018 + }, + { + "epoch": 3.0026241799437674, + "grad_norm": 54729.09375, + "learning_rate": 9.470775754859907e-06, + "loss": 2.0518, + "step": 16019 + }, + { + "epoch": 3.0028116213683225, + "grad_norm": 54696.953125, + "learning_rate": 9.466174492314527e-06, + "loss": 2.0498, + "step": 16020 + }, + { + "epoch": 3.002999062792877, + "grad_norm": 60693.0, + "learning_rate": 9.461574230902081e-06, + "loss": 1.9655, + "step": 16021 + }, + { + "epoch": 3.003186504217432, + "grad_norm": 57170.1875, + "learning_rate": 9.45697497073622e-06, + "loss": 2.1151, + "step": 16022 + }, + { + "epoch": 3.003373945641987, + "grad_norm": 54717.39453125, + "learning_rate": 9.452376711930488e-06, + "loss": 2.088, + "step": 16023 + }, + { + "epoch": 3.003561387066542, + "grad_norm": 50552.44921875, + "learning_rate": 9.447779454598482e-06, + "loss": 2.0531, + "step": 16024 + }, + { + "epoch": 3.0037488284910965, + "grad_norm": 54342.73046875, + "learning_rate": 9.443183198853756e-06, + "loss": 2.1477, + "step": 16025 + }, + { + "epoch": 3.0039362699156515, + "grad_norm": 54070.48046875, + "learning_rate": 9.438587944809824e-06, + "loss": 2.0961, + "step": 16026 + }, + { + "epoch": 3.004123711340206, + "grad_norm": 54090.37109375, + "learning_rate": 9.433993692580168e-06, + "loss": 2.0399, + "step": 16027 + }, + { + "epoch": 3.004311152764761, + "grad_norm": 59514.88671875, + "learning_rate": 9.429400442278285e-06, + "loss": 2.0646, + "step": 16028 + }, + { + "epoch": 3.004498594189316, + "grad_norm": 53134.95703125, + "learning_rate": 9.42480819401761e-06, + "loss": 2.0386, + "step": 16029 + }, + { + "epoch": 3.0046860356138705, + "grad_norm": 52831.68359375, + "learning_rate": 9.420216947911553e-06, + "loss": 1.9617, + "step": 16030 + }, + { + "epoch": 3.0048734770384256, + "grad_norm": 52907.734375, + "learning_rate": 9.415626704073526e-06, + "loss": 2.0799, + "step": 16031 + }, + { + "epoch": 3.00506091846298, + "grad_norm": 55607.5625, + "learning_rate": 9.411037462616917e-06, + "loss": 2.0476, + "step": 16032 + }, + { + "epoch": 3.0052483598875352, + "grad_norm": 53313.48046875, + "learning_rate": 9.406449223655056e-06, + "loss": 2.0764, + "step": 16033 + }, + { + "epoch": 3.00543580131209, + "grad_norm": 54899.73828125, + "learning_rate": 9.401861987301253e-06, + "loss": 2.0458, + "step": 16034 + }, + { + "epoch": 3.005623242736645, + "grad_norm": 52366.27734375, + "learning_rate": 9.397275753668838e-06, + "loss": 2.0961, + "step": 16035 + }, + { + "epoch": 3.0058106841611996, + "grad_norm": 54466.59765625, + "learning_rate": 9.392690522871073e-06, + "loss": 2.0537, + "step": 16036 + }, + { + "epoch": 3.0059981255857546, + "grad_norm": 61847.42578125, + "learning_rate": 9.388106295021181e-06, + "loss": 2.0154, + "step": 16037 + }, + { + "epoch": 3.0061855670103093, + "grad_norm": 53486.23046875, + "learning_rate": 9.38352307023242e-06, + "loss": 2.0305, + "step": 16038 + }, + { + "epoch": 3.0063730084348643, + "grad_norm": 51104.359375, + "learning_rate": 9.37894084861799e-06, + "loss": 2.1068, + "step": 16039 + }, + { + "epoch": 3.006560449859419, + "grad_norm": 60406.33203125, + "learning_rate": 9.374359630291052e-06, + "loss": 1.9927, + "step": 16040 + }, + { + "epoch": 3.0067478912839736, + "grad_norm": 54688.84375, + "learning_rate": 9.369779415364743e-06, + "loss": 2.0572, + "step": 16041 + }, + { + "epoch": 3.0069353327085286, + "grad_norm": 52119.375, + "learning_rate": 9.365200203952223e-06, + "loss": 2.1135, + "step": 16042 + }, + { + "epoch": 3.0071227741330833, + "grad_norm": 55514.65625, + "learning_rate": 9.360621996166563e-06, + "loss": 2.0947, + "step": 16043 + }, + { + "epoch": 3.0073102155576383, + "grad_norm": 53879.0625, + "learning_rate": 9.35604479212086e-06, + "loss": 2.0814, + "step": 16044 + }, + { + "epoch": 3.007497656982193, + "grad_norm": 51999.41015625, + "learning_rate": 9.351468591928153e-06, + "loss": 2.0879, + "step": 16045 + }, + { + "epoch": 3.007685098406748, + "grad_norm": 52790.671875, + "learning_rate": 9.346893395701461e-06, + "loss": 2.0543, + "step": 16046 + }, + { + "epoch": 3.0078725398313026, + "grad_norm": 53703.4453125, + "learning_rate": 9.34231920355379e-06, + "loss": 2.0443, + "step": 16047 + }, + { + "epoch": 3.0080599812558577, + "grad_norm": 55663.703125, + "learning_rate": 9.337746015598137e-06, + "loss": 1.982, + "step": 16048 + }, + { + "epoch": 3.0082474226804123, + "grad_norm": 60037.23828125, + "learning_rate": 9.33317383194744e-06, + "loss": 2.0396, + "step": 16049 + }, + { + "epoch": 3.0084348641049674, + "grad_norm": 54492.20703125, + "learning_rate": 9.328602652714607e-06, + "loss": 1.9774, + "step": 16050 + }, + { + "epoch": 3.008622305529522, + "grad_norm": 54341.8046875, + "learning_rate": 9.324032478012573e-06, + "loss": 2.0925, + "step": 16051 + }, + { + "epoch": 3.0088097469540767, + "grad_norm": 50615.03515625, + "learning_rate": 9.319463307954202e-06, + "loss": 2.0943, + "step": 16052 + }, + { + "epoch": 3.0089971883786317, + "grad_norm": 60127.18359375, + "learning_rate": 9.314895142652325e-06, + "loss": 1.9697, + "step": 16053 + }, + { + "epoch": 3.0091846298031864, + "grad_norm": 48492.91796875, + "learning_rate": 9.31032798221979e-06, + "loss": 2.1156, + "step": 16054 + }, + { + "epoch": 3.0093720712277414, + "grad_norm": 53688.89453125, + "learning_rate": 9.305761826769415e-06, + "loss": 2.115, + "step": 16055 + }, + { + "epoch": 3.009559512652296, + "grad_norm": 56369.8359375, + "learning_rate": 9.301196676413965e-06, + "loss": 2.0364, + "step": 16056 + }, + { + "epoch": 3.009746954076851, + "grad_norm": 52380.796875, + "learning_rate": 9.296632531266169e-06, + "loss": 2.0318, + "step": 16057 + }, + { + "epoch": 3.0099343955014057, + "grad_norm": 54365.1328125, + "learning_rate": 9.292069391438796e-06, + "loss": 2.0996, + "step": 16058 + }, + { + "epoch": 3.010121836925961, + "grad_norm": 59329.640625, + "learning_rate": 9.287507257044526e-06, + "loss": 2.0766, + "step": 16059 + }, + { + "epoch": 3.0103092783505154, + "grad_norm": 51144.67578125, + "learning_rate": 9.282946128196036e-06, + "loss": 2.0124, + "step": 16060 + }, + { + "epoch": 3.0104967197750705, + "grad_norm": 48425.06640625, + "learning_rate": 9.278386005005979e-06, + "loss": 2.0893, + "step": 16061 + }, + { + "epoch": 3.010684161199625, + "grad_norm": 54980.28125, + "learning_rate": 9.273826887587006e-06, + "loss": 2.1386, + "step": 16062 + }, + { + "epoch": 3.0108716026241797, + "grad_norm": 57673.03515625, + "learning_rate": 9.26926877605171e-06, + "loss": 2.0575, + "step": 16063 + }, + { + "epoch": 3.011059044048735, + "grad_norm": 58369.42578125, + "learning_rate": 9.26471167051265e-06, + "loss": 2.1007, + "step": 16064 + }, + { + "epoch": 3.0112464854732894, + "grad_norm": 53835.890625, + "learning_rate": 9.260155571082414e-06, + "loss": 2.1226, + "step": 16065 + }, + { + "epoch": 3.0114339268978445, + "grad_norm": 54187.22265625, + "learning_rate": 9.255600477873515e-06, + "loss": 2.0593, + "step": 16066 + }, + { + "epoch": 3.011621368322399, + "grad_norm": 58067.5546875, + "learning_rate": 9.251046390998441e-06, + "loss": 2.0107, + "step": 16067 + }, + { + "epoch": 3.011808809746954, + "grad_norm": 59823.9609375, + "learning_rate": 9.24649331056971e-06, + "loss": 2.0833, + "step": 16068 + }, + { + "epoch": 3.011996251171509, + "grad_norm": 52886.25390625, + "learning_rate": 9.241941236699736e-06, + "loss": 2.0199, + "step": 16069 + }, + { + "epoch": 3.012183692596064, + "grad_norm": 53443.59765625, + "learning_rate": 9.237390169500986e-06, + "loss": 2.0271, + "step": 16070 + }, + { + "epoch": 3.0123711340206185, + "grad_norm": 51460.23046875, + "learning_rate": 9.232840109085838e-06, + "loss": 2.0842, + "step": 16071 + }, + { + "epoch": 3.0125585754451736, + "grad_norm": 54837.6484375, + "learning_rate": 9.228291055566696e-06, + "loss": 2.0869, + "step": 16072 + }, + { + "epoch": 3.012746016869728, + "grad_norm": 54778.62109375, + "learning_rate": 9.223743009055885e-06, + "loss": 2.0335, + "step": 16073 + }, + { + "epoch": 3.012933458294283, + "grad_norm": 53499.87890625, + "learning_rate": 9.219195969665772e-06, + "loss": 2.0899, + "step": 16074 + }, + { + "epoch": 3.013120899718838, + "grad_norm": 54535.46484375, + "learning_rate": 9.214649937508647e-06, + "loss": 2.0169, + "step": 16075 + }, + { + "epoch": 3.0133083411433925, + "grad_norm": 55323.9375, + "learning_rate": 9.210104912696777e-06, + "loss": 2.1183, + "step": 16076 + }, + { + "epoch": 3.0134957825679476, + "grad_norm": 55428.5390625, + "learning_rate": 9.20556089534244e-06, + "loss": 2.0082, + "step": 16077 + }, + { + "epoch": 3.013683223992502, + "grad_norm": 57570.828125, + "learning_rate": 9.20101788555785e-06, + "loss": 2.0209, + "step": 16078 + }, + { + "epoch": 3.0138706654170573, + "grad_norm": 54667.2578125, + "learning_rate": 9.19647588345523e-06, + "loss": 2.0356, + "step": 16079 + }, + { + "epoch": 3.014058106841612, + "grad_norm": 60347.6953125, + "learning_rate": 9.191934889146742e-06, + "loss": 2.111, + "step": 16080 + }, + { + "epoch": 3.014245548266167, + "grad_norm": 52456.98828125, + "learning_rate": 9.187394902744567e-06, + "loss": 2.0439, + "step": 16081 + }, + { + "epoch": 3.0144329896907216, + "grad_norm": 52121.9921875, + "learning_rate": 9.182855924360823e-06, + "loss": 2.0257, + "step": 16082 + }, + { + "epoch": 3.0146204311152767, + "grad_norm": 61552.36328125, + "learning_rate": 9.178317954107602e-06, + "loss": 2.1065, + "step": 16083 + }, + { + "epoch": 3.0148078725398313, + "grad_norm": 50818.82421875, + "learning_rate": 9.173780992097003e-06, + "loss": 2.0319, + "step": 16084 + }, + { + "epoch": 3.014995313964386, + "grad_norm": 58063.72265625, + "learning_rate": 9.16924503844111e-06, + "loss": 2.0543, + "step": 16085 + }, + { + "epoch": 3.015182755388941, + "grad_norm": 55790.0625, + "learning_rate": 9.1647100932519e-06, + "loss": 2.0677, + "step": 16086 + }, + { + "epoch": 3.0153701968134956, + "grad_norm": 55185.08203125, + "learning_rate": 9.160176156641408e-06, + "loss": 2.0743, + "step": 16087 + }, + { + "epoch": 3.0155576382380507, + "grad_norm": 52849.1875, + "learning_rate": 9.155643228721622e-06, + "loss": 2.0633, + "step": 16088 + }, + { + "epoch": 3.0157450796626053, + "grad_norm": 62375.83203125, + "learning_rate": 9.1511113096045e-06, + "loss": 2.0917, + "step": 16089 + }, + { + "epoch": 3.0159325210871604, + "grad_norm": 58562.6484375, + "learning_rate": 9.146580399401949e-06, + "loss": 2.1038, + "step": 16090 + }, + { + "epoch": 3.016119962511715, + "grad_norm": 56160.53515625, + "learning_rate": 9.142050498225901e-06, + "loss": 2.0849, + "step": 16091 + }, + { + "epoch": 3.01630740393627, + "grad_norm": 51774.25, + "learning_rate": 9.137521606188254e-06, + "loss": 1.9951, + "step": 16092 + }, + { + "epoch": 3.0164948453608247, + "grad_norm": 57051.2265625, + "learning_rate": 9.132993723400818e-06, + "loss": 2.0839, + "step": 16093 + }, + { + "epoch": 3.0166822867853798, + "grad_norm": 55252.74609375, + "learning_rate": 9.128466849975453e-06, + "loss": 2.0778, + "step": 16094 + }, + { + "epoch": 3.0168697282099344, + "grad_norm": 57160.3046875, + "learning_rate": 9.123940986023981e-06, + "loss": 2.0313, + "step": 16095 + }, + { + "epoch": 3.017057169634489, + "grad_norm": 58050.7421875, + "learning_rate": 9.119416131658171e-06, + "loss": 2.1131, + "step": 16096 + }, + { + "epoch": 3.017244611059044, + "grad_norm": 58789.9609375, + "learning_rate": 9.114892286989763e-06, + "loss": 2.1, + "step": 16097 + }, + { + "epoch": 3.0174320524835987, + "grad_norm": 58325.0390625, + "learning_rate": 9.110369452130524e-06, + "loss": 2.0524, + "step": 16098 + }, + { + "epoch": 3.0176194939081538, + "grad_norm": 58171.87109375, + "learning_rate": 9.105847627192126e-06, + "loss": 2.0465, + "step": 16099 + }, + { + "epoch": 3.0178069353327084, + "grad_norm": 55551.12109375, + "learning_rate": 9.101326812286292e-06, + "loss": 2.078, + "step": 16100 + }, + { + "epoch": 3.0179943767572635, + "grad_norm": 53170.65625, + "learning_rate": 9.096807007524644e-06, + "loss": 2.0477, + "step": 16101 + }, + { + "epoch": 3.018181818181818, + "grad_norm": 53933.96484375, + "learning_rate": 9.09228821301884e-06, + "loss": 2.0924, + "step": 16102 + }, + { + "epoch": 3.018369259606373, + "grad_norm": 56747.62109375, + "learning_rate": 9.087770428880472e-06, + "loss": 2.0718, + "step": 16103 + }, + { + "epoch": 3.0185567010309278, + "grad_norm": 53713.76171875, + "learning_rate": 9.083253655221142e-06, + "loss": 2.0848, + "step": 16104 + }, + { + "epoch": 3.018744142455483, + "grad_norm": 56902.2890625, + "learning_rate": 9.078737892152395e-06, + "loss": 2.0931, + "step": 16105 + }, + { + "epoch": 3.0189315838800375, + "grad_norm": 58560.54296875, + "learning_rate": 9.074223139785759e-06, + "loss": 2.0885, + "step": 16106 + }, + { + "epoch": 3.0191190253045925, + "grad_norm": 57169.4296875, + "learning_rate": 9.069709398232762e-06, + "loss": 2.1109, + "step": 16107 + }, + { + "epoch": 3.019306466729147, + "grad_norm": 51878.11328125, + "learning_rate": 9.065196667604859e-06, + "loss": 2.0972, + "step": 16108 + }, + { + "epoch": 3.0194939081537018, + "grad_norm": 58198.5703125, + "learning_rate": 9.060684948013543e-06, + "loss": 2.1239, + "step": 16109 + }, + { + "epoch": 3.019681349578257, + "grad_norm": 52543.4921875, + "learning_rate": 9.056174239570215e-06, + "loss": 2.0563, + "step": 16110 + }, + { + "epoch": 3.0198687910028115, + "grad_norm": 53069.83203125, + "learning_rate": 9.051664542386313e-06, + "loss": 2.0708, + "step": 16111 + }, + { + "epoch": 3.0200562324273665, + "grad_norm": 51309.421875, + "learning_rate": 9.047155856573204e-06, + "loss": 2.0827, + "step": 16112 + }, + { + "epoch": 3.020243673851921, + "grad_norm": 51371.125, + "learning_rate": 9.042648182242235e-06, + "loss": 2.0679, + "step": 16113 + }, + { + "epoch": 3.0204311152764762, + "grad_norm": 53495.96484375, + "learning_rate": 9.03814151950476e-06, + "loss": 2.0537, + "step": 16114 + }, + { + "epoch": 3.020618556701031, + "grad_norm": 57403.35546875, + "learning_rate": 9.033635868472107e-06, + "loss": 2.114, + "step": 16115 + }, + { + "epoch": 3.020805998125586, + "grad_norm": 52019.7734375, + "learning_rate": 9.029131229255506e-06, + "loss": 2.0854, + "step": 16116 + }, + { + "epoch": 3.0209934395501405, + "grad_norm": 55576.8046875, + "learning_rate": 9.02462760196624e-06, + "loss": 2.0667, + "step": 16117 + }, + { + "epoch": 3.0211808809746956, + "grad_norm": 50413.4921875, + "learning_rate": 9.020124986715568e-06, + "loss": 2.0303, + "step": 16118 + }, + { + "epoch": 3.0213683223992502, + "grad_norm": 54479.99609375, + "learning_rate": 9.01562338361467e-06, + "loss": 2.0979, + "step": 16119 + }, + { + "epoch": 3.021555763823805, + "grad_norm": 53225.42578125, + "learning_rate": 9.011122792774728e-06, + "loss": 2.0648, + "step": 16120 + }, + { + "epoch": 3.02174320524836, + "grad_norm": 53112.7421875, + "learning_rate": 9.006623214306926e-06, + "loss": 2.1728, + "step": 16121 + }, + { + "epoch": 3.0219306466729146, + "grad_norm": 58818.84765625, + "learning_rate": 9.002124648322374e-06, + "loss": 2.1175, + "step": 16122 + }, + { + "epoch": 3.0221180880974696, + "grad_norm": 54197.3046875, + "learning_rate": 8.997627094932182e-06, + "loss": 1.9957, + "step": 16123 + }, + { + "epoch": 3.0223055295220242, + "grad_norm": 55254.94140625, + "learning_rate": 8.993130554247441e-06, + "loss": 2.1286, + "step": 16124 + }, + { + "epoch": 3.0224929709465793, + "grad_norm": 55751.33203125, + "learning_rate": 8.988635026379221e-06, + "loss": 2.0478, + "step": 16125 + }, + { + "epoch": 3.022680412371134, + "grad_norm": 54654.859375, + "learning_rate": 8.984140511438543e-06, + "loss": 2.0211, + "step": 16126 + }, + { + "epoch": 3.022867853795689, + "grad_norm": 53479.73046875, + "learning_rate": 8.979647009536402e-06, + "loss": 2.1141, + "step": 16127 + }, + { + "epoch": 3.0230552952202436, + "grad_norm": 52659.3984375, + "learning_rate": 8.975154520783807e-06, + "loss": 2.0564, + "step": 16128 + }, + { + "epoch": 3.0232427366447987, + "grad_norm": 60990.91015625, + "learning_rate": 8.9706630452917e-06, + "loss": 2.0418, + "step": 16129 + }, + { + "epoch": 3.0234301780693533, + "grad_norm": 51837.78125, + "learning_rate": 8.966172583171028e-06, + "loss": 2.0728, + "step": 16130 + }, + { + "epoch": 3.023617619493908, + "grad_norm": 56296.43359375, + "learning_rate": 8.961683134532684e-06, + "loss": 2.0765, + "step": 16131 + }, + { + "epoch": 3.023805060918463, + "grad_norm": 57185.99609375, + "learning_rate": 8.95719469948757e-06, + "loss": 2.0658, + "step": 16132 + }, + { + "epoch": 3.0239925023430176, + "grad_norm": 52833.125, + "learning_rate": 8.952707278146532e-06, + "loss": 2.0721, + "step": 16133 + }, + { + "epoch": 3.0241799437675727, + "grad_norm": 56636.109375, + "learning_rate": 8.948220870620394e-06, + "loss": 1.9242, + "step": 16134 + }, + { + "epoch": 3.0243673851921273, + "grad_norm": 54363.43359375, + "learning_rate": 8.943735477019987e-06, + "loss": 2.1276, + "step": 16135 + }, + { + "epoch": 3.0245548266166824, + "grad_norm": 53787.44140625, + "learning_rate": 8.939251097456076e-06, + "loss": 2.0841, + "step": 16136 + }, + { + "epoch": 3.024742268041237, + "grad_norm": 63038.9609375, + "learning_rate": 8.934767732039433e-06, + "loss": 2.0699, + "step": 16137 + }, + { + "epoch": 3.024929709465792, + "grad_norm": 55738.3828125, + "learning_rate": 8.930285380880793e-06, + "loss": 2.091, + "step": 16138 + }, + { + "epoch": 3.0251171508903467, + "grad_norm": 51796.15625, + "learning_rate": 8.925804044090836e-06, + "loss": 2.0638, + "step": 16139 + }, + { + "epoch": 3.025304592314902, + "grad_norm": 57056.87109375, + "learning_rate": 8.921323721780273e-06, + "loss": 2.0484, + "step": 16140 + }, + { + "epoch": 3.0254920337394564, + "grad_norm": 58569.921875, + "learning_rate": 8.91684441405976e-06, + "loss": 2.0283, + "step": 16141 + }, + { + "epoch": 3.025679475164011, + "grad_norm": 55053.953125, + "learning_rate": 8.912366121039927e-06, + "loss": 1.9903, + "step": 16142 + }, + { + "epoch": 3.025866916588566, + "grad_norm": 54196.5234375, + "learning_rate": 8.90788884283137e-06, + "loss": 2.0001, + "step": 16143 + }, + { + "epoch": 3.0260543580131207, + "grad_norm": 55843.63671875, + "learning_rate": 8.903412579544684e-06, + "loss": 2.0501, + "step": 16144 + }, + { + "epoch": 3.026241799437676, + "grad_norm": 54269.17578125, + "learning_rate": 8.898937331290447e-06, + "loss": 2.0912, + "step": 16145 + }, + { + "epoch": 3.0264292408622304, + "grad_norm": 56042.6875, + "learning_rate": 8.89446309817914e-06, + "loss": 2.1752, + "step": 16146 + }, + { + "epoch": 3.0266166822867855, + "grad_norm": 52145.765625, + "learning_rate": 8.889989880321303e-06, + "loss": 2.0575, + "step": 16147 + }, + { + "epoch": 3.02680412371134, + "grad_norm": 53509.4140625, + "learning_rate": 8.885517677827431e-06, + "loss": 2.1098, + "step": 16148 + }, + { + "epoch": 3.026991565135895, + "grad_norm": 54745.6328125, + "learning_rate": 8.881046490807965e-06, + "loss": 2.0569, + "step": 16149 + }, + { + "epoch": 3.02717900656045, + "grad_norm": 51589.51171875, + "learning_rate": 8.87657631937333e-06, + "loss": 2.0361, + "step": 16150 + }, + { + "epoch": 3.027366447985005, + "grad_norm": 59613.5390625, + "learning_rate": 8.872107163633953e-06, + "loss": 2.1286, + "step": 16151 + }, + { + "epoch": 3.0275538894095595, + "grad_norm": 53271.67578125, + "learning_rate": 8.867639023700203e-06, + "loss": 2.087, + "step": 16152 + }, + { + "epoch": 3.027741330834114, + "grad_norm": 55291.29296875, + "learning_rate": 8.863171899682432e-06, + "loss": 2.0801, + "step": 16153 + }, + { + "epoch": 3.027928772258669, + "grad_norm": 52416.44921875, + "learning_rate": 8.858705791690975e-06, + "loss": 2.018, + "step": 16154 + }, + { + "epoch": 3.028116213683224, + "grad_norm": 58879.22265625, + "learning_rate": 8.854240699836163e-06, + "loss": 2.0312, + "step": 16155 + }, + { + "epoch": 3.028303655107779, + "grad_norm": 56717.265625, + "learning_rate": 8.849776624228256e-06, + "loss": 2.0106, + "step": 16156 + }, + { + "epoch": 3.0284910965323335, + "grad_norm": 50506.54296875, + "learning_rate": 8.845313564977503e-06, + "loss": 2.0879, + "step": 16157 + }, + { + "epoch": 3.0286785379568886, + "grad_norm": 58521.39453125, + "learning_rate": 8.84085152219416e-06, + "loss": 2.0374, + "step": 16158 + }, + { + "epoch": 3.028865979381443, + "grad_norm": 53165.375, + "learning_rate": 8.836390495988406e-06, + "loss": 2.074, + "step": 16159 + }, + { + "epoch": 3.0290534208059983, + "grad_norm": 52867.71484375, + "learning_rate": 8.831930486470452e-06, + "loss": 2.1147, + "step": 16160 + }, + { + "epoch": 3.029240862230553, + "grad_norm": 52421.55859375, + "learning_rate": 8.827471493750439e-06, + "loss": 2.0893, + "step": 16161 + }, + { + "epoch": 3.029428303655108, + "grad_norm": 50532.2265625, + "learning_rate": 8.823013517938483e-06, + "loss": 2.0997, + "step": 16162 + }, + { + "epoch": 3.0296157450796626, + "grad_norm": 51837.8671875, + "learning_rate": 8.818556559144724e-06, + "loss": 2.0806, + "step": 16163 + }, + { + "epoch": 3.0298031865042176, + "grad_norm": 55329.52734375, + "learning_rate": 8.814100617479215e-06, + "loss": 2.0687, + "step": 16164 + }, + { + "epoch": 3.0299906279287723, + "grad_norm": 62526.91796875, + "learning_rate": 8.809645693052026e-06, + "loss": 2.1362, + "step": 16165 + }, + { + "epoch": 3.030178069353327, + "grad_norm": 55837.296875, + "learning_rate": 8.805191785973182e-06, + "loss": 2.0868, + "step": 16166 + }, + { + "epoch": 3.030365510777882, + "grad_norm": 50981.65625, + "learning_rate": 8.800738896352695e-06, + "loss": 2.0489, + "step": 16167 + }, + { + "epoch": 3.0305529522024366, + "grad_norm": 54439.79296875, + "learning_rate": 8.796287024300549e-06, + "loss": 2.0728, + "step": 16168 + }, + { + "epoch": 3.0307403936269917, + "grad_norm": 54008.09375, + "learning_rate": 8.791836169926676e-06, + "loss": 2.0262, + "step": 16169 + }, + { + "epoch": 3.0309278350515463, + "grad_norm": 65725.8671875, + "learning_rate": 8.787386333341019e-06, + "loss": 2.0448, + "step": 16170 + }, + { + "epoch": 3.0311152764761014, + "grad_norm": 52631.77734375, + "learning_rate": 8.782937514653505e-06, + "loss": 2.0935, + "step": 16171 + }, + { + "epoch": 3.031302717900656, + "grad_norm": 58419.88671875, + "learning_rate": 8.778489713973992e-06, + "loss": 2.0827, + "step": 16172 + }, + { + "epoch": 3.031490159325211, + "grad_norm": 55308.578125, + "learning_rate": 8.774042931412329e-06, + "loss": 2.0922, + "step": 16173 + }, + { + "epoch": 3.0316776007497657, + "grad_norm": 57272.7109375, + "learning_rate": 8.769597167078364e-06, + "loss": 2.0022, + "step": 16174 + }, + { + "epoch": 3.0318650421743207, + "grad_norm": 54988.4140625, + "learning_rate": 8.765152421081895e-06, + "loss": 1.9927, + "step": 16175 + }, + { + "epoch": 3.0320524835988754, + "grad_norm": 56213.59765625, + "learning_rate": 8.760708693532687e-06, + "loss": 2.117, + "step": 16176 + }, + { + "epoch": 3.03223992502343, + "grad_norm": 53781.53515625, + "learning_rate": 8.756265984540507e-06, + "loss": 2.0246, + "step": 16177 + }, + { + "epoch": 3.032427366447985, + "grad_norm": 55139.09375, + "learning_rate": 8.75182429421511e-06, + "loss": 2.0701, + "step": 16178 + }, + { + "epoch": 3.0326148078725397, + "grad_norm": 58218.43359375, + "learning_rate": 8.747383622666144e-06, + "loss": 2.1368, + "step": 16179 + }, + { + "epoch": 3.0328022492970947, + "grad_norm": 55400.2109375, + "learning_rate": 8.742943970003325e-06, + "loss": 2.0343, + "step": 16180 + }, + { + "epoch": 3.0329896907216494, + "grad_norm": 52358.546875, + "learning_rate": 8.738505336336305e-06, + "loss": 2.0954, + "step": 16181 + }, + { + "epoch": 3.0331771321462044, + "grad_norm": 57795.4765625, + "learning_rate": 8.734067721774714e-06, + "loss": 2.0478, + "step": 16182 + }, + { + "epoch": 3.033364573570759, + "grad_norm": 59029.6953125, + "learning_rate": 8.729631126428128e-06, + "loss": 2.016, + "step": 16183 + }, + { + "epoch": 3.033552014995314, + "grad_norm": 58110.6484375, + "learning_rate": 8.725195550406152e-06, + "loss": 2.0592, + "step": 16184 + }, + { + "epoch": 3.0337394564198688, + "grad_norm": 54332.9140625, + "learning_rate": 8.72076099381834e-06, + "loss": 2.072, + "step": 16185 + }, + { + "epoch": 3.033926897844424, + "grad_norm": 55759.734375, + "learning_rate": 8.716327456774215e-06, + "loss": 2.0674, + "step": 16186 + }, + { + "epoch": 3.0341143392689784, + "grad_norm": 57204.03515625, + "learning_rate": 8.711894939383265e-06, + "loss": 2.1112, + "step": 16187 + }, + { + "epoch": 3.034301780693533, + "grad_norm": 50438.8203125, + "learning_rate": 8.707463441754987e-06, + "loss": 2.0433, + "step": 16188 + }, + { + "epoch": 3.034489222118088, + "grad_norm": 60520.11328125, + "learning_rate": 8.703032963998831e-06, + "loss": 2.066, + "step": 16189 + }, + { + "epoch": 3.0346766635426428, + "grad_norm": 50590.96484375, + "learning_rate": 8.698603506224201e-06, + "loss": 2.1012, + "step": 16190 + }, + { + "epoch": 3.034864104967198, + "grad_norm": 52802.56640625, + "learning_rate": 8.694175068540528e-06, + "loss": 2.0883, + "step": 16191 + }, + { + "epoch": 3.0350515463917525, + "grad_norm": 61619.015625, + "learning_rate": 8.689747651057168e-06, + "loss": 2.0756, + "step": 16192 + }, + { + "epoch": 3.0352389878163075, + "grad_norm": 52813.65234375, + "learning_rate": 8.685321253883493e-06, + "loss": 2.0342, + "step": 16193 + }, + { + "epoch": 3.035426429240862, + "grad_norm": 58919.37890625, + "learning_rate": 8.680895877128809e-06, + "loss": 2.122, + "step": 16194 + }, + { + "epoch": 3.035613870665417, + "grad_norm": 54312.61328125, + "learning_rate": 8.676471520902435e-06, + "loss": 2.0212, + "step": 16195 + }, + { + "epoch": 3.035801312089972, + "grad_norm": 58928.70703125, + "learning_rate": 8.672048185313625e-06, + "loss": 2.066, + "step": 16196 + }, + { + "epoch": 3.035988753514527, + "grad_norm": 56279.2109375, + "learning_rate": 8.667625870471657e-06, + "loss": 2.0841, + "step": 16197 + }, + { + "epoch": 3.0361761949390815, + "grad_norm": 56319.9609375, + "learning_rate": 8.66320457648574e-06, + "loss": 2.0161, + "step": 16198 + }, + { + "epoch": 3.036363636363636, + "grad_norm": 49997.5078125, + "learning_rate": 8.658784303465061e-06, + "loss": 2.093, + "step": 16199 + }, + { + "epoch": 3.036551077788191, + "grad_norm": 51475.765625, + "learning_rate": 8.654365051518815e-06, + "loss": 2.0222, + "step": 16200 + }, + { + "epoch": 3.036738519212746, + "grad_norm": 58323.7890625, + "learning_rate": 8.649946820756155e-06, + "loss": 2.0378, + "step": 16201 + }, + { + "epoch": 3.036925960637301, + "grad_norm": 58563.30859375, + "learning_rate": 8.645529611286197e-06, + "loss": 2.0803, + "step": 16202 + }, + { + "epoch": 3.0371134020618555, + "grad_norm": 58551.671875, + "learning_rate": 8.641113423218033e-06, + "loss": 2.0721, + "step": 16203 + }, + { + "epoch": 3.0373008434864106, + "grad_norm": 50913.7890625, + "learning_rate": 8.636698256660752e-06, + "loss": 2.0838, + "step": 16204 + }, + { + "epoch": 3.0374882849109652, + "grad_norm": 56959.62109375, + "learning_rate": 8.632284111723398e-06, + "loss": 2.0662, + "step": 16205 + }, + { + "epoch": 3.0376757263355203, + "grad_norm": 54219.5625, + "learning_rate": 8.627870988514985e-06, + "loss": 2.0931, + "step": 16206 + }, + { + "epoch": 3.037863167760075, + "grad_norm": 54176.421875, + "learning_rate": 8.623458887144514e-06, + "loss": 2.0464, + "step": 16207 + }, + { + "epoch": 3.03805060918463, + "grad_norm": 56437.7421875, + "learning_rate": 8.619047807720986e-06, + "loss": 2.1093, + "step": 16208 + }, + { + "epoch": 3.0382380506091846, + "grad_norm": 50176.33203125, + "learning_rate": 8.614637750353305e-06, + "loss": 2.0609, + "step": 16209 + }, + { + "epoch": 3.0384254920337392, + "grad_norm": 52667.76953125, + "learning_rate": 8.61022871515041e-06, + "loss": 2.0839, + "step": 16210 + }, + { + "epoch": 3.0386129334582943, + "grad_norm": 50057.41015625, + "learning_rate": 8.605820702221218e-06, + "loss": 2.0307, + "step": 16211 + }, + { + "epoch": 3.038800374882849, + "grad_norm": 53592.31640625, + "learning_rate": 8.601413711674588e-06, + "loss": 2.1374, + "step": 16212 + }, + { + "epoch": 3.038987816307404, + "grad_norm": 53655.06640625, + "learning_rate": 8.59700774361935e-06, + "loss": 2.1282, + "step": 16213 + }, + { + "epoch": 3.0391752577319586, + "grad_norm": 54373.8671875, + "learning_rate": 8.592602798164356e-06, + "loss": 2.0382, + "step": 16214 + }, + { + "epoch": 3.0393626991565137, + "grad_norm": 54468.33984375, + "learning_rate": 8.588198875418375e-06, + "loss": 2.0586, + "step": 16215 + }, + { + "epoch": 3.0395501405810683, + "grad_norm": 52570.7734375, + "learning_rate": 8.5837959754902e-06, + "loss": 2.0227, + "step": 16216 + }, + { + "epoch": 3.0397375820056234, + "grad_norm": 52300.765625, + "learning_rate": 8.579394098488558e-06, + "loss": 2.0648, + "step": 16217 + }, + { + "epoch": 3.039925023430178, + "grad_norm": 54585.63671875, + "learning_rate": 8.574993244522189e-06, + "loss": 2.0729, + "step": 16218 + }, + { + "epoch": 3.040112464854733, + "grad_norm": 51074.67578125, + "learning_rate": 8.570593413699785e-06, + "loss": 2.103, + "step": 16219 + }, + { + "epoch": 3.0402999062792877, + "grad_norm": 56337.90234375, + "learning_rate": 8.56619460612999e-06, + "loss": 2.0337, + "step": 16220 + }, + { + "epoch": 3.0404873477038423, + "grad_norm": 52502.04296875, + "learning_rate": 8.56179682192148e-06, + "loss": 2.0421, + "step": 16221 + }, + { + "epoch": 3.0406747891283974, + "grad_norm": 59001.37890625, + "learning_rate": 8.557400061182857e-06, + "loss": 2.0519, + "step": 16222 + }, + { + "epoch": 3.040862230552952, + "grad_norm": 55357.44140625, + "learning_rate": 8.553004324022729e-06, + "loss": 2.0919, + "step": 16223 + }, + { + "epoch": 3.041049671977507, + "grad_norm": 54917.3203125, + "learning_rate": 8.548609610549645e-06, + "loss": 2.0703, + "step": 16224 + }, + { + "epoch": 3.0412371134020617, + "grad_norm": 54426.09765625, + "learning_rate": 8.544215920872178e-06, + "loss": 2.0694, + "step": 16225 + }, + { + "epoch": 3.0414245548266168, + "grad_norm": 53208.671875, + "learning_rate": 8.539823255098816e-06, + "loss": 2.0552, + "step": 16226 + }, + { + "epoch": 3.0416119962511714, + "grad_norm": 52817.92578125, + "learning_rate": 8.535431613338074e-06, + "loss": 2.0385, + "step": 16227 + }, + { + "epoch": 3.0417994376757265, + "grad_norm": 54626.0234375, + "learning_rate": 8.531040995698413e-06, + "loss": 2.1245, + "step": 16228 + }, + { + "epoch": 3.041986879100281, + "grad_norm": 58038.97265625, + "learning_rate": 8.526651402288266e-06, + "loss": 2.0659, + "step": 16229 + }, + { + "epoch": 3.042174320524836, + "grad_norm": 56792.53515625, + "learning_rate": 8.522262833216065e-06, + "loss": 2.0878, + "step": 16230 + }, + { + "epoch": 3.042361761949391, + "grad_norm": 53828.1015625, + "learning_rate": 8.517875288590199e-06, + "loss": 2.1132, + "step": 16231 + }, + { + "epoch": 3.042549203373946, + "grad_norm": 56804.6953125, + "learning_rate": 8.513488768519018e-06, + "loss": 2.0498, + "step": 16232 + }, + { + "epoch": 3.0427366447985005, + "grad_norm": 55451.48828125, + "learning_rate": 8.509103273110874e-06, + "loss": 2.0917, + "step": 16233 + }, + { + "epoch": 3.042924086223055, + "grad_norm": 58231.4296875, + "learning_rate": 8.504718802474099e-06, + "loss": 2.1032, + "step": 16234 + }, + { + "epoch": 3.04311152764761, + "grad_norm": 56613.94921875, + "learning_rate": 8.500335356716972e-06, + "loss": 1.9752, + "step": 16235 + }, + { + "epoch": 3.043298969072165, + "grad_norm": 57310.14453125, + "learning_rate": 8.495952935947742e-06, + "loss": 2.0899, + "step": 16236 + }, + { + "epoch": 3.04348641049672, + "grad_norm": 54814.49609375, + "learning_rate": 8.491571540274662e-06, + "loss": 1.9615, + "step": 16237 + }, + { + "epoch": 3.0436738519212745, + "grad_norm": 55918.36328125, + "learning_rate": 8.48719116980598e-06, + "loss": 2.0695, + "step": 16238 + }, + { + "epoch": 3.0438612933458296, + "grad_norm": 56211.8203125, + "learning_rate": 8.482811824649822e-06, + "loss": 2.1501, + "step": 16239 + }, + { + "epoch": 3.044048734770384, + "grad_norm": 52179.57421875, + "learning_rate": 8.47843350491439e-06, + "loss": 2.1258, + "step": 16240 + }, + { + "epoch": 3.0442361761949392, + "grad_norm": 54963.7578125, + "learning_rate": 8.474056210707827e-06, + "loss": 2.1486, + "step": 16241 + }, + { + "epoch": 3.044423617619494, + "grad_norm": 52967.24609375, + "learning_rate": 8.46967994213823e-06, + "loss": 2.0712, + "step": 16242 + }, + { + "epoch": 3.044611059044049, + "grad_norm": 56846.0703125, + "learning_rate": 8.465304699313692e-06, + "loss": 2.1011, + "step": 16243 + }, + { + "epoch": 3.0447985004686036, + "grad_norm": 57505.84375, + "learning_rate": 8.460930482342283e-06, + "loss": 2.005, + "step": 16244 + }, + { + "epoch": 3.044985941893158, + "grad_norm": 58627.0, + "learning_rate": 8.456557291332034e-06, + "loss": 2.1194, + "step": 16245 + }, + { + "epoch": 3.0451733833177133, + "grad_norm": 53360.34765625, + "learning_rate": 8.452185126390949e-06, + "loss": 2.0609, + "step": 16246 + }, + { + "epoch": 3.045360824742268, + "grad_norm": 63114.77734375, + "learning_rate": 8.447813987627023e-06, + "loss": 2.0015, + "step": 16247 + }, + { + "epoch": 3.045548266166823, + "grad_norm": 55150.4765625, + "learning_rate": 8.443443875148233e-06, + "loss": 1.9724, + "step": 16248 + }, + { + "epoch": 3.0457357075913776, + "grad_norm": 54375.91796875, + "learning_rate": 8.439074789062496e-06, + "loss": 2.0731, + "step": 16249 + }, + { + "epoch": 3.0459231490159326, + "grad_norm": 58634.4140625, + "learning_rate": 8.43470672947772e-06, + "loss": 2.1183, + "step": 16250 + }, + { + "epoch": 3.0461105904404873, + "grad_norm": 54683.44140625, + "learning_rate": 8.430339696501804e-06, + "loss": 2.0736, + "step": 16251 + }, + { + "epoch": 3.0462980318650423, + "grad_norm": 60511.4140625, + "learning_rate": 8.425973690242595e-06, + "loss": 2.1156, + "step": 16252 + }, + { + "epoch": 3.046485473289597, + "grad_norm": 56378.609375, + "learning_rate": 8.421608710807943e-06, + "loss": 2.1069, + "step": 16253 + }, + { + "epoch": 3.046672914714152, + "grad_norm": 57124.203125, + "learning_rate": 8.417244758305648e-06, + "loss": 2.0948, + "step": 16254 + }, + { + "epoch": 3.0468603561387066, + "grad_norm": 56595.4296875, + "learning_rate": 8.412881832843484e-06, + "loss": 2.0948, + "step": 16255 + }, + { + "epoch": 3.0470477975632613, + "grad_norm": 51323.8359375, + "learning_rate": 8.408519934529219e-06, + "loss": 2.024, + "step": 16256 + }, + { + "epoch": 3.0472352389878163, + "grad_norm": 53190.01171875, + "learning_rate": 8.4041590634706e-06, + "loss": 2.0422, + "step": 16257 + }, + { + "epoch": 3.047422680412371, + "grad_norm": 56603.359375, + "learning_rate": 8.399799219775323e-06, + "loss": 2.0961, + "step": 16258 + }, + { + "epoch": 3.047610121836926, + "grad_norm": 63209.69921875, + "learning_rate": 8.395440403551059e-06, + "loss": 2.0578, + "step": 16259 + }, + { + "epoch": 3.0477975632614807, + "grad_norm": 55277.47265625, + "learning_rate": 8.391082614905488e-06, + "loss": 2.0266, + "step": 16260 + }, + { + "epoch": 3.0479850046860357, + "grad_norm": 55489.1796875, + "learning_rate": 8.386725853946226e-06, + "loss": 2.0877, + "step": 16261 + }, + { + "epoch": 3.0481724461105904, + "grad_norm": 55364.51171875, + "learning_rate": 8.382370120780875e-06, + "loss": 2.0545, + "step": 16262 + }, + { + "epoch": 3.0483598875351454, + "grad_norm": 53789.3125, + "learning_rate": 8.378015415517021e-06, + "loss": 2.0957, + "step": 16263 + }, + { + "epoch": 3.0485473289597, + "grad_norm": 51202.56640625, + "learning_rate": 8.373661738262239e-06, + "loss": 2.0794, + "step": 16264 + }, + { + "epoch": 3.048734770384255, + "grad_norm": 52978.9375, + "learning_rate": 8.369309089124039e-06, + "loss": 2.0577, + "step": 16265 + }, + { + "epoch": 3.0489222118088097, + "grad_norm": 54756.65625, + "learning_rate": 8.364957468209916e-06, + "loss": 2.0581, + "step": 16266 + }, + { + "epoch": 3.0491096532333644, + "grad_norm": 56207.71875, + "learning_rate": 8.360606875627381e-06, + "loss": 2.0687, + "step": 16267 + }, + { + "epoch": 3.0492970946579194, + "grad_norm": 58676.9765625, + "learning_rate": 8.356257311483861e-06, + "loss": 2.0397, + "step": 16268 + }, + { + "epoch": 3.049484536082474, + "grad_norm": 55310.21484375, + "learning_rate": 8.351908775886785e-06, + "loss": 2.0554, + "step": 16269 + }, + { + "epoch": 3.049671977507029, + "grad_norm": 54690.3984375, + "learning_rate": 8.347561268943572e-06, + "loss": 2.0567, + "step": 16270 + }, + { + "epoch": 3.0498594189315837, + "grad_norm": 64289.28515625, + "learning_rate": 8.343214790761606e-06, + "loss": 2.0752, + "step": 16271 + }, + { + "epoch": 3.050046860356139, + "grad_norm": 53553.140625, + "learning_rate": 8.338869341448208e-06, + "loss": 2.059, + "step": 16272 + }, + { + "epoch": 3.0502343017806934, + "grad_norm": 53717.33203125, + "learning_rate": 8.33452492111072e-06, + "loss": 2.0953, + "step": 16273 + }, + { + "epoch": 3.0504217432052485, + "grad_norm": 64282.46875, + "learning_rate": 8.330181529856463e-06, + "loss": 2.0253, + "step": 16274 + }, + { + "epoch": 3.050609184629803, + "grad_norm": 56068.9765625, + "learning_rate": 8.325839167792693e-06, + "loss": 2.1412, + "step": 16275 + }, + { + "epoch": 3.050796626054358, + "grad_norm": 57913.71875, + "learning_rate": 8.321497835026648e-06, + "loss": 2.0549, + "step": 16276 + }, + { + "epoch": 3.050984067478913, + "grad_norm": 52351.16796875, + "learning_rate": 8.317157531665576e-06, + "loss": 2.0426, + "step": 16277 + }, + { + "epoch": 3.051171508903468, + "grad_norm": 54834.49609375, + "learning_rate": 8.312818257816679e-06, + "loss": 2.0601, + "step": 16278 + }, + { + "epoch": 3.0513589503280225, + "grad_norm": 55129.7734375, + "learning_rate": 8.308480013587122e-06, + "loss": 2.0287, + "step": 16279 + }, + { + "epoch": 3.051546391752577, + "grad_norm": 53190.73046875, + "learning_rate": 8.304142799084047e-06, + "loss": 2.0364, + "step": 16280 + }, + { + "epoch": 3.051733833177132, + "grad_norm": 56092.36328125, + "learning_rate": 8.299806614414596e-06, + "loss": 2.098, + "step": 16281 + }, + { + "epoch": 3.051921274601687, + "grad_norm": 52692.74609375, + "learning_rate": 8.295471459685844e-06, + "loss": 2.043, + "step": 16282 + }, + { + "epoch": 3.052108716026242, + "grad_norm": 56248.99609375, + "learning_rate": 8.291137335004884e-06, + "loss": 2.0819, + "step": 16283 + }, + { + "epoch": 3.0522961574507965, + "grad_norm": 54397.03515625, + "learning_rate": 8.286804240478763e-06, + "loss": 2.0578, + "step": 16284 + }, + { + "epoch": 3.0524835988753516, + "grad_norm": 53694.03515625, + "learning_rate": 8.282472176214479e-06, + "loss": 2.0463, + "step": 16285 + }, + { + "epoch": 3.052671040299906, + "grad_norm": 52023.578125, + "learning_rate": 8.278141142319057e-06, + "loss": 2.0449, + "step": 16286 + }, + { + "epoch": 3.0528584817244613, + "grad_norm": 54136.93359375, + "learning_rate": 8.273811138899445e-06, + "loss": 2.011, + "step": 16287 + }, + { + "epoch": 3.053045923149016, + "grad_norm": 52708.8515625, + "learning_rate": 8.269482166062609e-06, + "loss": 1.9899, + "step": 16288 + }, + { + "epoch": 3.053233364573571, + "grad_norm": 52558.90625, + "learning_rate": 8.265154223915444e-06, + "loss": 2.0187, + "step": 16289 + }, + { + "epoch": 3.0534208059981256, + "grad_norm": 55672.26953125, + "learning_rate": 8.260827312564878e-06, + "loss": 2.1149, + "step": 16290 + }, + { + "epoch": 3.05360824742268, + "grad_norm": 62453.2421875, + "learning_rate": 8.256501432117759e-06, + "loss": 2.0162, + "step": 16291 + }, + { + "epoch": 3.0537956888472353, + "grad_norm": 56078.6015625, + "learning_rate": 8.25217658268092e-06, + "loss": 2.039, + "step": 16292 + }, + { + "epoch": 3.05398313027179, + "grad_norm": 50810.98046875, + "learning_rate": 8.247852764361196e-06, + "loss": 2.0542, + "step": 16293 + }, + { + "epoch": 3.054170571696345, + "grad_norm": 57244.51953125, + "learning_rate": 8.243529977265385e-06, + "loss": 2.1161, + "step": 16294 + }, + { + "epoch": 3.0543580131208996, + "grad_norm": 53900.59375, + "learning_rate": 8.23920822150025e-06, + "loss": 2.114, + "step": 16295 + }, + { + "epoch": 3.0545454545454547, + "grad_norm": 52879.01953125, + "learning_rate": 8.234887497172511e-06, + "loss": 2.1127, + "step": 16296 + }, + { + "epoch": 3.0547328959700093, + "grad_norm": 59434.9453125, + "learning_rate": 8.230567804388916e-06, + "loss": 2.1209, + "step": 16297 + }, + { + "epoch": 3.0549203373945644, + "grad_norm": 60230.48828125, + "learning_rate": 8.226249143256143e-06, + "loss": 2.049, + "step": 16298 + }, + { + "epoch": 3.055107778819119, + "grad_norm": 51856.0234375, + "learning_rate": 8.221931513880843e-06, + "loss": 2.1146, + "step": 16299 + }, + { + "epoch": 3.055295220243674, + "grad_norm": 52917.60546875, + "learning_rate": 8.217614916369664e-06, + "loss": 2.0495, + "step": 16300 + }, + { + "epoch": 3.0554826616682287, + "grad_norm": 53494.0859375, + "learning_rate": 8.213299350829256e-06, + "loss": 2.0079, + "step": 16301 + }, + { + "epoch": 3.0556701030927833, + "grad_norm": 54219.52734375, + "learning_rate": 8.208984817366144e-06, + "loss": 2.0382, + "step": 16302 + }, + { + "epoch": 3.0558575445173384, + "grad_norm": 56131.1796875, + "learning_rate": 8.204671316086931e-06, + "loss": 2.0149, + "step": 16303 + }, + { + "epoch": 3.056044985941893, + "grad_norm": 54061.5078125, + "learning_rate": 8.20035884709816e-06, + "loss": 2.0785, + "step": 16304 + }, + { + "epoch": 3.056232427366448, + "grad_norm": 53843.875, + "learning_rate": 8.19604741050633e-06, + "loss": 2.0916, + "step": 16305 + }, + { + "epoch": 3.0564198687910027, + "grad_norm": 53043.9765625, + "learning_rate": 8.191737006417916e-06, + "loss": 2.1022, + "step": 16306 + }, + { + "epoch": 3.0566073102155578, + "grad_norm": 52434.90625, + "learning_rate": 8.187427634939403e-06, + "loss": 2.0769, + "step": 16307 + }, + { + "epoch": 3.0567947516401124, + "grad_norm": 53363.1171875, + "learning_rate": 8.183119296177211e-06, + "loss": 2.0525, + "step": 16308 + }, + { + "epoch": 3.0569821930646675, + "grad_norm": 62337.515625, + "learning_rate": 8.178811990237762e-06, + "loss": 2.0724, + "step": 16309 + }, + { + "epoch": 3.057169634489222, + "grad_norm": 52033.7109375, + "learning_rate": 8.174505717227426e-06, + "loss": 2.0823, + "step": 16310 + }, + { + "epoch": 3.057357075913777, + "grad_norm": 58480.3515625, + "learning_rate": 8.170200477252583e-06, + "loss": 2.1056, + "step": 16311 + }, + { + "epoch": 3.0575445173383318, + "grad_norm": 56452.0390625, + "learning_rate": 8.165896270419542e-06, + "loss": 2.0586, + "step": 16312 + }, + { + "epoch": 3.0577319587628864, + "grad_norm": 53964.84375, + "learning_rate": 8.161593096834635e-06, + "loss": 2.0766, + "step": 16313 + }, + { + "epoch": 3.0579194001874415, + "grad_norm": 61382.296875, + "learning_rate": 8.157290956604142e-06, + "loss": 2.0573, + "step": 16314 + }, + { + "epoch": 3.058106841611996, + "grad_norm": 53142.8515625, + "learning_rate": 8.152989849834291e-06, + "loss": 2.1002, + "step": 16315 + }, + { + "epoch": 3.058294283036551, + "grad_norm": 54015.53125, + "learning_rate": 8.148689776631352e-06, + "loss": 2.0726, + "step": 16316 + }, + { + "epoch": 3.0584817244611058, + "grad_norm": 56770.2734375, + "learning_rate": 8.144390737101498e-06, + "loss": 2.0195, + "step": 16317 + }, + { + "epoch": 3.058669165885661, + "grad_norm": 56440.2890625, + "learning_rate": 8.140092731350945e-06, + "loss": 2.0167, + "step": 16318 + }, + { + "epoch": 3.0588566073102155, + "grad_norm": 55423.9453125, + "learning_rate": 8.135795759485815e-06, + "loss": 2.0687, + "step": 16319 + }, + { + "epoch": 3.0590440487347705, + "grad_norm": 56604.56640625, + "learning_rate": 8.13149982161226e-06, + "loss": 2.0141, + "step": 16320 + }, + { + "epoch": 3.059231490159325, + "grad_norm": 55597.03515625, + "learning_rate": 8.127204917836379e-06, + "loss": 2.0702, + "step": 16321 + }, + { + "epoch": 3.0594189315838802, + "grad_norm": 60715.0625, + "learning_rate": 8.122911048264237e-06, + "loss": 2.1312, + "step": 16322 + }, + { + "epoch": 3.059606373008435, + "grad_norm": 53530.7265625, + "learning_rate": 8.118618213001894e-06, + "loss": 2.1172, + "step": 16323 + }, + { + "epoch": 3.0597938144329895, + "grad_norm": 58933.515625, + "learning_rate": 8.11432641215541e-06, + "loss": 2.0339, + "step": 16324 + }, + { + "epoch": 3.0599812558575445, + "grad_norm": 56551.01953125, + "learning_rate": 8.110035645830733e-06, + "loss": 2.0354, + "step": 16325 + }, + { + "epoch": 3.060168697282099, + "grad_norm": 56730.734375, + "learning_rate": 8.105745914133861e-06, + "loss": 2.1105, + "step": 16326 + }, + { + "epoch": 3.0603561387066542, + "grad_norm": 53281.3359375, + "learning_rate": 8.10145721717076e-06, + "loss": 2.0876, + "step": 16327 + }, + { + "epoch": 3.060543580131209, + "grad_norm": 53137.484375, + "learning_rate": 8.097169555047346e-06, + "loss": 2.0964, + "step": 16328 + }, + { + "epoch": 3.060731021555764, + "grad_norm": 56762.8125, + "learning_rate": 8.092882927869505e-06, + "loss": 2.052, + "step": 16329 + }, + { + "epoch": 3.0609184629803186, + "grad_norm": 61807.12109375, + "learning_rate": 8.08859733574312e-06, + "loss": 2.0935, + "step": 16330 + }, + { + "epoch": 3.0611059044048736, + "grad_norm": 56027.40625, + "learning_rate": 8.084312778774066e-06, + "loss": 2.0991, + "step": 16331 + }, + { + "epoch": 3.0612933458294282, + "grad_norm": 55395.2578125, + "learning_rate": 8.080029257068123e-06, + "loss": 2.1239, + "step": 16332 + }, + { + "epoch": 3.0614807872539833, + "grad_norm": 56039.078125, + "learning_rate": 8.075746770731102e-06, + "loss": 2.1011, + "step": 16333 + }, + { + "epoch": 3.061668228678538, + "grad_norm": 57428.69140625, + "learning_rate": 8.071465319868792e-06, + "loss": 2.0454, + "step": 16334 + }, + { + "epoch": 3.0618556701030926, + "grad_norm": 56329.171875, + "learning_rate": 8.067184904586928e-06, + "loss": 2.0983, + "step": 16335 + }, + { + "epoch": 3.0620431115276476, + "grad_norm": 60401.4140625, + "learning_rate": 8.062905524991221e-06, + "loss": 1.9687, + "step": 16336 + }, + { + "epoch": 3.0622305529522023, + "grad_norm": 56182.03125, + "learning_rate": 8.058627181187384e-06, + "loss": 2.1092, + "step": 16337 + }, + { + "epoch": 3.0624179943767573, + "grad_norm": 61900.66796875, + "learning_rate": 8.054349873281064e-06, + "loss": 1.9921, + "step": 16338 + }, + { + "epoch": 3.062605435801312, + "grad_norm": 53061.1953125, + "learning_rate": 8.050073601377938e-06, + "loss": 2.1323, + "step": 16339 + }, + { + "epoch": 3.062792877225867, + "grad_norm": 52142.6953125, + "learning_rate": 8.04579836558359e-06, + "loss": 2.0637, + "step": 16340 + }, + { + "epoch": 3.0629803186504216, + "grad_norm": 56100.3203125, + "learning_rate": 8.04152416600364e-06, + "loss": 2.1189, + "step": 16341 + }, + { + "epoch": 3.0631677600749767, + "grad_norm": 50078.5859375, + "learning_rate": 8.037251002743639e-06, + "loss": 2.0339, + "step": 16342 + }, + { + "epoch": 3.0633552014995313, + "grad_norm": 56220.84375, + "learning_rate": 8.032978875909124e-06, + "loss": 2.0952, + "step": 16343 + }, + { + "epoch": 3.0635426429240864, + "grad_norm": 54802.40625, + "learning_rate": 8.028707785605632e-06, + "loss": 2.1099, + "step": 16344 + }, + { + "epoch": 3.063730084348641, + "grad_norm": 58424.5, + "learning_rate": 8.024437731938627e-06, + "loss": 2.1275, + "step": 16345 + }, + { + "epoch": 3.0639175257731956, + "grad_norm": 53547.5546875, + "learning_rate": 8.020168715013598e-06, + "loss": 2.0954, + "step": 16346 + }, + { + "epoch": 3.0641049671977507, + "grad_norm": 55691.0546875, + "learning_rate": 8.015900734935983e-06, + "loss": 2.0398, + "step": 16347 + }, + { + "epoch": 3.0642924086223053, + "grad_norm": 51980.35546875, + "learning_rate": 8.011633791811168e-06, + "loss": 2.1301, + "step": 16348 + }, + { + "epoch": 3.0644798500468604, + "grad_norm": 55041.64453125, + "learning_rate": 8.00736788574456e-06, + "loss": 2.0453, + "step": 16349 + }, + { + "epoch": 3.064667291471415, + "grad_norm": 57608.8046875, + "learning_rate": 8.003103016841534e-06, + "loss": 2.0807, + "step": 16350 + }, + { + "epoch": 3.06485473289597, + "grad_norm": 51036.6875, + "learning_rate": 7.998839185207414e-06, + "loss": 2.0619, + "step": 16351 + }, + { + "epoch": 3.0650421743205247, + "grad_norm": 52721.9140625, + "learning_rate": 7.994576390947495e-06, + "loss": 1.9952, + "step": 16352 + }, + { + "epoch": 3.06522961574508, + "grad_norm": 53226.38671875, + "learning_rate": 7.990314634167085e-06, + "loss": 2.0102, + "step": 16353 + }, + { + "epoch": 3.0654170571696344, + "grad_norm": 52814.1171875, + "learning_rate": 7.986053914971454e-06, + "loss": 2.0435, + "step": 16354 + }, + { + "epoch": 3.0656044985941895, + "grad_norm": 57369.734375, + "learning_rate": 7.9817942334658e-06, + "loss": 2.0938, + "step": 16355 + }, + { + "epoch": 3.065791940018744, + "grad_norm": 55310.21484375, + "learning_rate": 7.977535589755353e-06, + "loss": 2.0874, + "step": 16356 + }, + { + "epoch": 3.065979381443299, + "grad_norm": 63393.046875, + "learning_rate": 7.973277983945299e-06, + "loss": 2.1443, + "step": 16357 + }, + { + "epoch": 3.066166822867854, + "grad_norm": 53585.53515625, + "learning_rate": 7.969021416140799e-06, + "loss": 2.1126, + "step": 16358 + }, + { + "epoch": 3.0663542642924084, + "grad_norm": 55320.1328125, + "learning_rate": 7.964765886446957e-06, + "loss": 2.0582, + "step": 16359 + }, + { + "epoch": 3.0665417057169635, + "grad_norm": 68866.9453125, + "learning_rate": 7.960511394968912e-06, + "loss": 2.1336, + "step": 16360 + }, + { + "epoch": 3.066729147141518, + "grad_norm": 57125.42578125, + "learning_rate": 7.956257941811729e-06, + "loss": 2.0505, + "step": 16361 + }, + { + "epoch": 3.066916588566073, + "grad_norm": 63420.31640625, + "learning_rate": 7.952005527080458e-06, + "loss": 2.0182, + "step": 16362 + }, + { + "epoch": 3.067104029990628, + "grad_norm": 58812.796875, + "learning_rate": 7.947754150880132e-06, + "loss": 2.0942, + "step": 16363 + }, + { + "epoch": 3.067291471415183, + "grad_norm": 51715.8125, + "learning_rate": 7.943503813315767e-06, + "loss": 2.0303, + "step": 16364 + }, + { + "epoch": 3.0674789128397375, + "grad_norm": 55953.953125, + "learning_rate": 7.939254514492328e-06, + "loss": 2.0823, + "step": 16365 + }, + { + "epoch": 3.0676663542642926, + "grad_norm": 59753.875, + "learning_rate": 7.935006254514765e-06, + "loss": 2.07, + "step": 16366 + }, + { + "epoch": 3.067853795688847, + "grad_norm": 61308.48828125, + "learning_rate": 7.930759033488017e-06, + "loss": 2.0568, + "step": 16367 + }, + { + "epoch": 3.0680412371134023, + "grad_norm": 51382.82421875, + "learning_rate": 7.926512851516976e-06, + "loss": 2.0904, + "step": 16368 + }, + { + "epoch": 3.068228678537957, + "grad_norm": 55761.71875, + "learning_rate": 7.92226770870651e-06, + "loss": 2.0377, + "step": 16369 + }, + { + "epoch": 3.0684161199625115, + "grad_norm": 55015.25390625, + "learning_rate": 7.918023605161479e-06, + "loss": 2.0699, + "step": 16370 + }, + { + "epoch": 3.0686035613870666, + "grad_norm": 59199.6484375, + "learning_rate": 7.91378054098671e-06, + "loss": 2.0709, + "step": 16371 + }, + { + "epoch": 3.068791002811621, + "grad_norm": 55494.82421875, + "learning_rate": 7.909538516287002e-06, + "loss": 2.0306, + "step": 16372 + }, + { + "epoch": 3.0689784442361763, + "grad_norm": 52554.546875, + "learning_rate": 7.905297531167104e-06, + "loss": 2.093, + "step": 16373 + }, + { + "epoch": 3.069165885660731, + "grad_norm": 55697.15625, + "learning_rate": 7.901057585731798e-06, + "loss": 2.0555, + "step": 16374 + }, + { + "epoch": 3.069353327085286, + "grad_norm": 55496.22265625, + "learning_rate": 7.896818680085772e-06, + "loss": 2.0758, + "step": 16375 + }, + { + "epoch": 3.0695407685098406, + "grad_norm": 62238.35546875, + "learning_rate": 7.89258081433375e-06, + "loss": 2.0089, + "step": 16376 + }, + { + "epoch": 3.0697282099343957, + "grad_norm": 53038.03125, + "learning_rate": 7.888343988580387e-06, + "loss": 2.0766, + "step": 16377 + }, + { + "epoch": 3.0699156513589503, + "grad_norm": 50006.328125, + "learning_rate": 7.884108202930324e-06, + "loss": 2.104, + "step": 16378 + }, + { + "epoch": 3.0701030927835053, + "grad_norm": 57082.76953125, + "learning_rate": 7.87987345748818e-06, + "loss": 2.0926, + "step": 16379 + }, + { + "epoch": 3.07029053420806, + "grad_norm": 59733.06640625, + "learning_rate": 7.87563975235856e-06, + "loss": 2.0192, + "step": 16380 + }, + { + "epoch": 3.0704779756326146, + "grad_norm": 52651.41796875, + "learning_rate": 7.871407087646026e-06, + "loss": 2.1012, + "step": 16381 + }, + { + "epoch": 3.0706654170571697, + "grad_norm": 53013.8203125, + "learning_rate": 7.867175463455107e-06, + "loss": 2.0807, + "step": 16382 + }, + { + "epoch": 3.0708528584817243, + "grad_norm": 57563.17578125, + "learning_rate": 7.862944879890339e-06, + "loss": 2.1042, + "step": 16383 + }, + { + "epoch": 3.0710402999062794, + "grad_norm": 51180.5625, + "learning_rate": 7.858715337056195e-06, + "loss": 2.0616, + "step": 16384 + }, + { + "epoch": 3.071227741330834, + "grad_norm": 53604.9140625, + "learning_rate": 7.85448683505714e-06, + "loss": 1.9797, + "step": 16385 + }, + { + "epoch": 3.071415182755389, + "grad_norm": 53692.3515625, + "learning_rate": 7.850259373997614e-06, + "loss": 2.0683, + "step": 16386 + }, + { + "epoch": 3.0716026241799437, + "grad_norm": 64486.48828125, + "learning_rate": 7.846032953982046e-06, + "loss": 2.1161, + "step": 16387 + }, + { + "epoch": 3.0717900656044987, + "grad_norm": 56657.86328125, + "learning_rate": 7.841807575114813e-06, + "loss": 2.1213, + "step": 16388 + }, + { + "epoch": 3.0719775070290534, + "grad_norm": 56317.71484375, + "learning_rate": 7.837583237500262e-06, + "loss": 2.1508, + "step": 16389 + }, + { + "epoch": 3.0721649484536084, + "grad_norm": 63092.4375, + "learning_rate": 7.833359941242752e-06, + "loss": 2.0356, + "step": 16390 + }, + { + "epoch": 3.072352389878163, + "grad_norm": 55844.359375, + "learning_rate": 7.82913768644658e-06, + "loss": 2.0651, + "step": 16391 + }, + { + "epoch": 3.072539831302718, + "grad_norm": 54005.01953125, + "learning_rate": 7.82491647321602e-06, + "loss": 2.0919, + "step": 16392 + }, + { + "epoch": 3.0727272727272728, + "grad_norm": 57601.21484375, + "learning_rate": 7.820696301655345e-06, + "loss": 2.1545, + "step": 16393 + }, + { + "epoch": 3.0729147141518274, + "grad_norm": 54486.2421875, + "learning_rate": 7.816477171868791e-06, + "loss": 2.0916, + "step": 16394 + }, + { + "epoch": 3.0731021555763824, + "grad_norm": 58022.5859375, + "learning_rate": 7.812259083960566e-06, + "loss": 1.97, + "step": 16395 + }, + { + "epoch": 3.073289597000937, + "grad_norm": 53605.5859375, + "learning_rate": 7.808042038034829e-06, + "loss": 2.0945, + "step": 16396 + }, + { + "epoch": 3.073477038425492, + "grad_norm": 56833.99609375, + "learning_rate": 7.803826034195766e-06, + "loss": 2.0681, + "step": 16397 + }, + { + "epoch": 3.0736644798500468, + "grad_norm": 52618.43359375, + "learning_rate": 7.799611072547485e-06, + "loss": 2.1138, + "step": 16398 + }, + { + "epoch": 3.073851921274602, + "grad_norm": 52544.2109375, + "learning_rate": 7.79539715319409e-06, + "loss": 2.0707, + "step": 16399 + }, + { + "epoch": 3.0740393626991565, + "grad_norm": 52454.65625, + "learning_rate": 7.791184276239677e-06, + "loss": 2.0601, + "step": 16400 + }, + { + "epoch": 3.0742268041237115, + "grad_norm": 59417.62890625, + "learning_rate": 7.786972441788276e-06, + "loss": 2.1163, + "step": 16401 + }, + { + "epoch": 3.074414245548266, + "grad_norm": 51564.203125, + "learning_rate": 7.782761649943943e-06, + "loss": 2.0305, + "step": 16402 + }, + { + "epoch": 3.074601686972821, + "grad_norm": 61116.9296875, + "learning_rate": 7.778551900810644e-06, + "loss": 2.0107, + "step": 16403 + }, + { + "epoch": 3.074789128397376, + "grad_norm": 54349.921875, + "learning_rate": 7.774343194492384e-06, + "loss": 2.1123, + "step": 16404 + }, + { + "epoch": 3.0749765698219305, + "grad_norm": 59478.0859375, + "learning_rate": 7.770135531093086e-06, + "loss": 2.035, + "step": 16405 + }, + { + "epoch": 3.0751640112464855, + "grad_norm": 53525.9921875, + "learning_rate": 7.765928910716707e-06, + "loss": 2.077, + "step": 16406 + }, + { + "epoch": 3.07535145267104, + "grad_norm": 57454.34765625, + "learning_rate": 7.761723333467124e-06, + "loss": 2.0732, + "step": 16407 + }, + { + "epoch": 3.075538894095595, + "grad_norm": 53463.89453125, + "learning_rate": 7.757518799448199e-06, + "loss": 2.0748, + "step": 16408 + }, + { + "epoch": 3.07572633552015, + "grad_norm": 59252.30859375, + "learning_rate": 7.753315308763792e-06, + "loss": 2.0465, + "step": 16409 + }, + { + "epoch": 3.075913776944705, + "grad_norm": 55406.60546875, + "learning_rate": 7.749112861517737e-06, + "loss": 1.9886, + "step": 16410 + }, + { + "epoch": 3.0761012183692595, + "grad_norm": 59360.74609375, + "learning_rate": 7.744911457813815e-06, + "loss": 2.071, + "step": 16411 + }, + { + "epoch": 3.0762886597938146, + "grad_norm": 54960.66796875, + "learning_rate": 7.740711097755781e-06, + "loss": 2.1416, + "step": 16412 + }, + { + "epoch": 3.0764761012183692, + "grad_norm": 56660.8046875, + "learning_rate": 7.736511781447403e-06, + "loss": 2.016, + "step": 16413 + }, + { + "epoch": 3.0766635426429243, + "grad_norm": 53282.12109375, + "learning_rate": 7.732313508992396e-06, + "loss": 2.0535, + "step": 16414 + }, + { + "epoch": 3.076850984067479, + "grad_norm": 52216.578125, + "learning_rate": 7.728116280494424e-06, + "loss": 2.0912, + "step": 16415 + }, + { + "epoch": 3.0770384254920335, + "grad_norm": 52328.5, + "learning_rate": 7.72392009605718e-06, + "loss": 2.1038, + "step": 16416 + }, + { + "epoch": 3.0772258669165886, + "grad_norm": 58537.1953125, + "learning_rate": 7.71972495578432e-06, + "loss": 2.0895, + "step": 16417 + }, + { + "epoch": 3.0774133083411432, + "grad_norm": 62199.6953125, + "learning_rate": 7.715530859779412e-06, + "loss": 2.0368, + "step": 16418 + }, + { + "epoch": 3.0776007497656983, + "grad_norm": 55910.96484375, + "learning_rate": 7.71133780814607e-06, + "loss": 2.0593, + "step": 16419 + }, + { + "epoch": 3.077788191190253, + "grad_norm": 56429.02734375, + "learning_rate": 7.707145800987864e-06, + "loss": 2.0316, + "step": 16420 + }, + { + "epoch": 3.077975632614808, + "grad_norm": 50696.484375, + "learning_rate": 7.702954838408327e-06, + "loss": 2.0345, + "step": 16421 + }, + { + "epoch": 3.0781630740393626, + "grad_norm": 51763.52734375, + "learning_rate": 7.698764920510953e-06, + "loss": 2.0848, + "step": 16422 + }, + { + "epoch": 3.0783505154639177, + "grad_norm": 56963.46484375, + "learning_rate": 7.694576047399244e-06, + "loss": 2.0651, + "step": 16423 + }, + { + "epoch": 3.0785379568884723, + "grad_norm": 56384.28125, + "learning_rate": 7.69038821917668e-06, + "loss": 2.0368, + "step": 16424 + }, + { + "epoch": 3.0787253983130274, + "grad_norm": 53651.453125, + "learning_rate": 7.686201435946644e-06, + "loss": 2.1111, + "step": 16425 + }, + { + "epoch": 3.078912839737582, + "grad_norm": 54469.1484375, + "learning_rate": 7.682015697812577e-06, + "loss": 2.0553, + "step": 16426 + }, + { + "epoch": 3.0791002811621366, + "grad_norm": 57356.4140625, + "learning_rate": 7.677831004877867e-06, + "loss": 2.0643, + "step": 16427 + }, + { + "epoch": 3.0792877225866917, + "grad_norm": 55238.3515625, + "learning_rate": 7.673647357245855e-06, + "loss": 2.0552, + "step": 16428 + }, + { + "epoch": 3.0794751640112463, + "grad_norm": 52920.12890625, + "learning_rate": 7.669464755019866e-06, + "loss": 2.0373, + "step": 16429 + }, + { + "epoch": 3.0796626054358014, + "grad_norm": 53095.98828125, + "learning_rate": 7.66528319830323e-06, + "loss": 2.0594, + "step": 16430 + }, + { + "epoch": 3.079850046860356, + "grad_norm": 56714.8515625, + "learning_rate": 7.661102687199201e-06, + "loss": 2.103, + "step": 16431 + }, + { + "epoch": 3.080037488284911, + "grad_norm": 54573.4453125, + "learning_rate": 7.656923221811052e-06, + "loss": 2.0777, + "step": 16432 + }, + { + "epoch": 3.0802249297094657, + "grad_norm": 55266.8828125, + "learning_rate": 7.652744802241985e-06, + "loss": 2.1111, + "step": 16433 + }, + { + "epoch": 3.0804123711340208, + "grad_norm": 57371.87890625, + "learning_rate": 7.648567428595232e-06, + "loss": 2.0813, + "step": 16434 + }, + { + "epoch": 3.0805998125585754, + "grad_norm": 53724.265625, + "learning_rate": 7.644391100973947e-06, + "loss": 2.1295, + "step": 16435 + }, + { + "epoch": 3.0807872539831305, + "grad_norm": 53782.51953125, + "learning_rate": 7.640215819481294e-06, + "loss": 2.0283, + "step": 16436 + }, + { + "epoch": 3.080974695407685, + "grad_norm": 55192.8515625, + "learning_rate": 7.636041584220387e-06, + "loss": 2.0386, + "step": 16437 + }, + { + "epoch": 3.0811621368322397, + "grad_norm": 60315.97265625, + "learning_rate": 7.631868395294323e-06, + "loss": 2.0551, + "step": 16438 + }, + { + "epoch": 3.081349578256795, + "grad_norm": 54644.19921875, + "learning_rate": 7.6276962528061825e-06, + "loss": 2.0467, + "step": 16439 + }, + { + "epoch": 3.0815370196813494, + "grad_norm": 53678.265625, + "learning_rate": 7.623525156859013e-06, + "loss": 2.0647, + "step": 16440 + }, + { + "epoch": 3.0817244611059045, + "grad_norm": 57169.51171875, + "learning_rate": 7.61935510755582e-06, + "loss": 2.0329, + "step": 16441 + }, + { + "epoch": 3.081911902530459, + "grad_norm": 54682.9375, + "learning_rate": 7.6151861049996055e-06, + "loss": 2.077, + "step": 16442 + }, + { + "epoch": 3.082099343955014, + "grad_norm": 50377.6328125, + "learning_rate": 7.611018149293353e-06, + "loss": 2.1244, + "step": 16443 + }, + { + "epoch": 3.082286785379569, + "grad_norm": 53746.33203125, + "learning_rate": 7.606851240539991e-06, + "loss": 2.0197, + "step": 16444 + }, + { + "epoch": 3.082474226804124, + "grad_norm": 62177.7421875, + "learning_rate": 7.602685378842434e-06, + "loss": 2.0419, + "step": 16445 + }, + { + "epoch": 3.0826616682286785, + "grad_norm": 56020.17578125, + "learning_rate": 7.59852056430358e-06, + "loss": 2.1517, + "step": 16446 + }, + { + "epoch": 3.0828491096532336, + "grad_norm": 56086.8203125, + "learning_rate": 7.594356797026308e-06, + "loss": 2.0965, + "step": 16447 + }, + { + "epoch": 3.083036551077788, + "grad_norm": 63045.15234375, + "learning_rate": 7.590194077113427e-06, + "loss": 2.0511, + "step": 16448 + }, + { + "epoch": 3.083223992502343, + "grad_norm": 55460.76953125, + "learning_rate": 7.58603240466777e-06, + "loss": 2.0204, + "step": 16449 + }, + { + "epoch": 3.083411433926898, + "grad_norm": 55943.73046875, + "learning_rate": 7.581871779792127e-06, + "loss": 2.0382, + "step": 16450 + }, + { + "epoch": 3.0835988753514525, + "grad_norm": 55323.86328125, + "learning_rate": 7.57771220258926e-06, + "loss": 2.0924, + "step": 16451 + }, + { + "epoch": 3.0837863167760076, + "grad_norm": 55785.41015625, + "learning_rate": 7.573553673161887e-06, + "loss": 2.0819, + "step": 16452 + }, + { + "epoch": 3.083973758200562, + "grad_norm": 58941.23046875, + "learning_rate": 7.569396191612743e-06, + "loss": 1.9965, + "step": 16453 + }, + { + "epoch": 3.0841611996251173, + "grad_norm": 59004.79296875, + "learning_rate": 7.565239758044501e-06, + "loss": 2.0603, + "step": 16454 + }, + { + "epoch": 3.084348641049672, + "grad_norm": 53375.828125, + "learning_rate": 7.561084372559807e-06, + "loss": 2.1007, + "step": 16455 + }, + { + "epoch": 3.084536082474227, + "grad_norm": 51644.78515625, + "learning_rate": 7.556930035261306e-06, + "loss": 2.1487, + "step": 16456 + }, + { + "epoch": 3.0847235238987816, + "grad_norm": 54975.80859375, + "learning_rate": 7.552776746251616e-06, + "loss": 2.099, + "step": 16457 + }, + { + "epoch": 3.0849109653233366, + "grad_norm": 55434.90234375, + "learning_rate": 7.548624505633312e-06, + "loss": 2.0417, + "step": 16458 + }, + { + "epoch": 3.0850984067478913, + "grad_norm": 55908.36328125, + "learning_rate": 7.544473313508927e-06, + "loss": 2.0822, + "step": 16459 + }, + { + "epoch": 3.085285848172446, + "grad_norm": 55281.328125, + "learning_rate": 7.540323169981018e-06, + "loss": 2.1044, + "step": 16460 + }, + { + "epoch": 3.085473289597001, + "grad_norm": 56806.12890625, + "learning_rate": 7.5361740751520695e-06, + "loss": 2.0623, + "step": 16461 + }, + { + "epoch": 3.0856607310215556, + "grad_norm": 53130.69140625, + "learning_rate": 7.532026029124573e-06, + "loss": 2.0054, + "step": 16462 + }, + { + "epoch": 3.0858481724461106, + "grad_norm": 54839.9296875, + "learning_rate": 7.527879032000962e-06, + "loss": 2.0784, + "step": 16463 + }, + { + "epoch": 3.0860356138706653, + "grad_norm": 55109.5703125, + "learning_rate": 7.523733083883689e-06, + "loss": 2.1681, + "step": 16464 + }, + { + "epoch": 3.0862230552952203, + "grad_norm": 52204.5546875, + "learning_rate": 7.51958818487512e-06, + "loss": 2.1582, + "step": 16465 + }, + { + "epoch": 3.086410496719775, + "grad_norm": 54577.0625, + "learning_rate": 7.515444335077665e-06, + "loss": 2.0852, + "step": 16466 + }, + { + "epoch": 3.08659793814433, + "grad_norm": 55141.06640625, + "learning_rate": 7.511301534593646e-06, + "loss": 2.1272, + "step": 16467 + }, + { + "epoch": 3.0867853795688847, + "grad_norm": 54069.16796875, + "learning_rate": 7.507159783525386e-06, + "loss": 2.1029, + "step": 16468 + }, + { + "epoch": 3.0869728209934397, + "grad_norm": 54846.53125, + "learning_rate": 7.503019081975193e-06, + "loss": 2.0721, + "step": 16469 + }, + { + "epoch": 3.0871602624179943, + "grad_norm": 53039.43359375, + "learning_rate": 7.49887943004533e-06, + "loss": 2.0964, + "step": 16470 + }, + { + "epoch": 3.0873477038425494, + "grad_norm": 52445.296875, + "learning_rate": 7.494740827838032e-06, + "loss": 2.1402, + "step": 16471 + }, + { + "epoch": 3.087535145267104, + "grad_norm": 53771.84765625, + "learning_rate": 7.490603275455527e-06, + "loss": 2.0394, + "step": 16472 + }, + { + "epoch": 3.0877225866916587, + "grad_norm": 59154.921875, + "learning_rate": 7.486466773000023e-06, + "loss": 2.1236, + "step": 16473 + }, + { + "epoch": 3.0879100281162137, + "grad_norm": 62335.90625, + "learning_rate": 7.482331320573665e-06, + "loss": 1.9731, + "step": 16474 + }, + { + "epoch": 3.0880974695407684, + "grad_norm": 54622.33203125, + "learning_rate": 7.478196918278585e-06, + "loss": 2.0694, + "step": 16475 + }, + { + "epoch": 3.0882849109653234, + "grad_norm": 61078.0, + "learning_rate": 7.474063566216915e-06, + "loss": 2.1151, + "step": 16476 + }, + { + "epoch": 3.088472352389878, + "grad_norm": 59650.5390625, + "learning_rate": 7.469931264490759e-06, + "loss": 2.0988, + "step": 16477 + }, + { + "epoch": 3.088659793814433, + "grad_norm": 55407.078125, + "learning_rate": 7.46580001320214e-06, + "loss": 2.0359, + "step": 16478 + }, + { + "epoch": 3.0888472352389877, + "grad_norm": 55368.8671875, + "learning_rate": 7.4616698124531095e-06, + "loss": 2.1248, + "step": 16479 + }, + { + "epoch": 3.089034676663543, + "grad_norm": 54398.5703125, + "learning_rate": 7.457540662345697e-06, + "loss": 2.0279, + "step": 16480 + }, + { + "epoch": 3.0892221180880974, + "grad_norm": 59162.890625, + "learning_rate": 7.4534125629818765e-06, + "loss": 2.0829, + "step": 16481 + }, + { + "epoch": 3.0894095595126525, + "grad_norm": 53273.2734375, + "learning_rate": 7.4492855144635885e-06, + "loss": 2.0171, + "step": 16482 + }, + { + "epoch": 3.089597000937207, + "grad_norm": 57912.37890625, + "learning_rate": 7.445159516892791e-06, + "loss": 2.0828, + "step": 16483 + }, + { + "epoch": 3.0897844423617618, + "grad_norm": 59956.8515625, + "learning_rate": 7.44103457037138e-06, + "loss": 2.1247, + "step": 16484 + }, + { + "epoch": 3.089971883786317, + "grad_norm": 55532.7109375, + "learning_rate": 7.43691067500123e-06, + "loss": 2.0631, + "step": 16485 + }, + { + "epoch": 3.0901593252108714, + "grad_norm": 51622.97265625, + "learning_rate": 7.432787830884197e-06, + "loss": 2.0758, + "step": 16486 + }, + { + "epoch": 3.0903467666354265, + "grad_norm": 57478.62890625, + "learning_rate": 7.428666038122128e-06, + "loss": 2.0645, + "step": 16487 + }, + { + "epoch": 3.090534208059981, + "grad_norm": 53499.00390625, + "learning_rate": 7.424545296816815e-06, + "loss": 2.0854, + "step": 16488 + }, + { + "epoch": 3.090721649484536, + "grad_norm": 53688.203125, + "learning_rate": 7.4204256070700195e-06, + "loss": 2.0665, + "step": 16489 + }, + { + "epoch": 3.090909090909091, + "grad_norm": 57966.65625, + "learning_rate": 7.416306968983522e-06, + "loss": 2.0014, + "step": 16490 + }, + { + "epoch": 3.091096532333646, + "grad_norm": 55495.234375, + "learning_rate": 7.4121893826590185e-06, + "loss": 2.0017, + "step": 16491 + }, + { + "epoch": 3.0912839737582005, + "grad_norm": 54742.63671875, + "learning_rate": 7.408072848198228e-06, + "loss": 2.0905, + "step": 16492 + }, + { + "epoch": 3.0914714151827556, + "grad_norm": 55234.7734375, + "learning_rate": 7.403957365702824e-06, + "loss": 2.1436, + "step": 16493 + }, + { + "epoch": 3.09165885660731, + "grad_norm": 54903.77734375, + "learning_rate": 7.399842935274431e-06, + "loss": 2.0513, + "step": 16494 + }, + { + "epoch": 3.091846298031865, + "grad_norm": 60162.38671875, + "learning_rate": 7.395729557014697e-06, + "loss": 1.9682, + "step": 16495 + }, + { + "epoch": 3.09203373945642, + "grad_norm": 57007.48046875, + "learning_rate": 7.391617231025194e-06, + "loss": 2.0619, + "step": 16496 + }, + { + "epoch": 3.0922211808809745, + "grad_norm": 49804.0546875, + "learning_rate": 7.387505957407515e-06, + "loss": 2.0758, + "step": 16497 + }, + { + "epoch": 3.0924086223055296, + "grad_norm": 57201.171875, + "learning_rate": 7.383395736263177e-06, + "loss": 2.1533, + "step": 16498 + }, + { + "epoch": 3.092596063730084, + "grad_norm": 57197.94921875, + "learning_rate": 7.3792865676937275e-06, + "loss": 2.124, + "step": 16499 + }, + { + "epoch": 3.0927835051546393, + "grad_norm": 55174.8203125, + "learning_rate": 7.375178451800641e-06, + "loss": 2.049, + "step": 16500 + }, + { + "epoch": 3.0927835051546393, + "eval_loss": 2.2604148387908936, + "eval_runtime": 131.628, + "eval_samples_per_second": 38.358, + "eval_steps_per_second": 1.922, + "step": 16500 + }, + { + "epoch": 3.092970946579194, + "grad_norm": 57311.8671875, + "learning_rate": 7.371071388685369e-06, + "loss": 2.0641, + "step": 16501 + }, + { + "epoch": 3.093158388003749, + "grad_norm": 62896.2421875, + "learning_rate": 7.366965378449364e-06, + "loss": 2.0163, + "step": 16502 + }, + { + "epoch": 3.0933458294283036, + "grad_norm": 58222.359375, + "learning_rate": 7.362860421194051e-06, + "loss": 2.0605, + "step": 16503 + }, + { + "epoch": 3.0935332708528587, + "grad_norm": 55495.71484375, + "learning_rate": 7.358756517020804e-06, + "loss": 2.0666, + "step": 16504 + }, + { + "epoch": 3.0937207122774133, + "grad_norm": 54245.17578125, + "learning_rate": 7.3546536660309806e-06, + "loss": 2.0435, + "step": 16505 + }, + { + "epoch": 3.093908153701968, + "grad_norm": 52570.85546875, + "learning_rate": 7.350551868325928e-06, + "loss": 2.1152, + "step": 16506 + }, + { + "epoch": 3.094095595126523, + "grad_norm": 56009.921875, + "learning_rate": 7.346451124006947e-06, + "loss": 2.052, + "step": 16507 + }, + { + "epoch": 3.0942830365510776, + "grad_norm": 60945.72265625, + "learning_rate": 7.342351433175315e-06, + "loss": 2.2246, + "step": 16508 + }, + { + "epoch": 3.0944704779756327, + "grad_norm": 61317.140625, + "learning_rate": 7.338252795932293e-06, + "loss": 2.1171, + "step": 16509 + }, + { + "epoch": 3.0946579194001873, + "grad_norm": 53250.0234375, + "learning_rate": 7.33415521237914e-06, + "loss": 2.0876, + "step": 16510 + }, + { + "epoch": 3.0948453608247424, + "grad_norm": 52327.484375, + "learning_rate": 7.330058682617014e-06, + "loss": 2.085, + "step": 16511 + }, + { + "epoch": 3.095032802249297, + "grad_norm": 55681.00390625, + "learning_rate": 7.3259632067471106e-06, + "loss": 2.1812, + "step": 16512 + }, + { + "epoch": 3.095220243673852, + "grad_norm": 56314.42578125, + "learning_rate": 7.3218687848706054e-06, + "loss": 2.0494, + "step": 16513 + }, + { + "epoch": 3.0954076850984067, + "grad_norm": 54293.5625, + "learning_rate": 7.317775417088607e-06, + "loss": 2.063, + "step": 16514 + }, + { + "epoch": 3.0955951265229618, + "grad_norm": 53973.72265625, + "learning_rate": 7.313683103502206e-06, + "loss": 2.0783, + "step": 16515 + }, + { + "epoch": 3.0957825679475164, + "grad_norm": 58455.83984375, + "learning_rate": 7.3095918442124875e-06, + "loss": 2.0529, + "step": 16516 + }, + { + "epoch": 3.0959700093720715, + "grad_norm": 56268.4765625, + "learning_rate": 7.305501639320511e-06, + "loss": 2.0803, + "step": 16517 + }, + { + "epoch": 3.096157450796626, + "grad_norm": 55161.203125, + "learning_rate": 7.301412488927295e-06, + "loss": 2.0865, + "step": 16518 + }, + { + "epoch": 3.0963448922211807, + "grad_norm": 54005.921875, + "learning_rate": 7.2973243931338145e-06, + "loss": 2.0392, + "step": 16519 + }, + { + "epoch": 3.0965323336457358, + "grad_norm": 59353.40234375, + "learning_rate": 7.293237352041072e-06, + "loss": 2.0588, + "step": 16520 + }, + { + "epoch": 3.0967197750702904, + "grad_norm": 57216.2890625, + "learning_rate": 7.289151365749997e-06, + "loss": 2.0747, + "step": 16521 + }, + { + "epoch": 3.0969072164948455, + "grad_norm": 55319.54296875, + "learning_rate": 7.285066434361492e-06, + "loss": 2.039, + "step": 16522 + }, + { + "epoch": 3.0970946579194, + "grad_norm": 56602.9453125, + "learning_rate": 7.280982557976479e-06, + "loss": 2.0484, + "step": 16523 + }, + { + "epoch": 3.097282099343955, + "grad_norm": 57034.55078125, + "learning_rate": 7.2768997366958025e-06, + "loss": 2.0572, + "step": 16524 + }, + { + "epoch": 3.0974695407685098, + "grad_norm": 58393.9453125, + "learning_rate": 7.272817970620322e-06, + "loss": 2.0811, + "step": 16525 + }, + { + "epoch": 3.097656982193065, + "grad_norm": 53422.515625, + "learning_rate": 7.268737259850828e-06, + "loss": 2.0261, + "step": 16526 + }, + { + "epoch": 3.0978444236176195, + "grad_norm": 59134.8046875, + "learning_rate": 7.264657604488129e-06, + "loss": 2.0715, + "step": 16527 + }, + { + "epoch": 3.0980318650421745, + "grad_norm": 53514.94140625, + "learning_rate": 7.2605790046329725e-06, + "loss": 2.0992, + "step": 16528 + }, + { + "epoch": 3.098219306466729, + "grad_norm": 54629.0859375, + "learning_rate": 7.256501460386111e-06, + "loss": 2.2507, + "step": 16529 + }, + { + "epoch": 3.098406747891284, + "grad_norm": 50835.35546875, + "learning_rate": 7.252424971848249e-06, + "loss": 2.1001, + "step": 16530 + }, + { + "epoch": 3.098594189315839, + "grad_norm": 54056.71875, + "learning_rate": 7.248349539120058e-06, + "loss": 2.0963, + "step": 16531 + }, + { + "epoch": 3.0987816307403935, + "grad_norm": 58516.0546875, + "learning_rate": 7.2442751623022e-06, + "loss": 2.0828, + "step": 16532 + }, + { + "epoch": 3.0989690721649485, + "grad_norm": 55667.90234375, + "learning_rate": 7.240201841495336e-06, + "loss": 2.0333, + "step": 16533 + }, + { + "epoch": 3.099156513589503, + "grad_norm": 58669.890625, + "learning_rate": 7.236129576800027e-06, + "loss": 2.1019, + "step": 16534 + }, + { + "epoch": 3.0993439550140582, + "grad_norm": 53606.83984375, + "learning_rate": 7.232058368316869e-06, + "loss": 2.0113, + "step": 16535 + }, + { + "epoch": 3.099531396438613, + "grad_norm": 50500.3671875, + "learning_rate": 7.2279882161464364e-06, + "loss": 2.0788, + "step": 16536 + }, + { + "epoch": 3.099718837863168, + "grad_norm": 59088.2890625, + "learning_rate": 7.223919120389239e-06, + "loss": 2.0069, + "step": 16537 + }, + { + "epoch": 3.0999062792877226, + "grad_norm": 48974.453125, + "learning_rate": 7.219851081145762e-06, + "loss": 2.0394, + "step": 16538 + }, + { + "epoch": 3.1000937207122776, + "grad_norm": 56367.1875, + "learning_rate": 7.215784098516504e-06, + "loss": 2.0173, + "step": 16539 + }, + { + "epoch": 3.1002811621368322, + "grad_norm": 55817.16015625, + "learning_rate": 7.211718172601928e-06, + "loss": 2.1054, + "step": 16540 + }, + { + "epoch": 3.100468603561387, + "grad_norm": 56990.36328125, + "learning_rate": 7.207653303502415e-06, + "loss": 2.0808, + "step": 16541 + }, + { + "epoch": 3.100656044985942, + "grad_norm": 56103.52734375, + "learning_rate": 7.203589491318385e-06, + "loss": 2.1714, + "step": 16542 + }, + { + "epoch": 3.1008434864104966, + "grad_norm": 55675.26171875, + "learning_rate": 7.1995267361502194e-06, + "loss": 2.0416, + "step": 16543 + }, + { + "epoch": 3.1010309278350516, + "grad_norm": 62666.3125, + "learning_rate": 7.195465038098254e-06, + "loss": 2.0895, + "step": 16544 + }, + { + "epoch": 3.1012183692596063, + "grad_norm": 56071.6953125, + "learning_rate": 7.191404397262791e-06, + "loss": 2.1111, + "step": 16545 + }, + { + "epoch": 3.1014058106841613, + "grad_norm": 55637.7109375, + "learning_rate": 7.187344813744151e-06, + "loss": 2.0722, + "step": 16546 + }, + { + "epoch": 3.101593252108716, + "grad_norm": 57270.90625, + "learning_rate": 7.183286287642576e-06, + "loss": 1.9673, + "step": 16547 + }, + { + "epoch": 3.101780693533271, + "grad_norm": 53057.55078125, + "learning_rate": 7.179228819058325e-06, + "loss": 2.0847, + "step": 16548 + }, + { + "epoch": 3.1019681349578256, + "grad_norm": 52982.56640625, + "learning_rate": 7.175172408091596e-06, + "loss": 2.0298, + "step": 16549 + }, + { + "epoch": 3.1021555763823807, + "grad_norm": 53902.8828125, + "learning_rate": 7.171117054842591e-06, + "loss": 2.0613, + "step": 16550 + }, + { + "epoch": 3.1023430178069353, + "grad_norm": 57142.59765625, + "learning_rate": 7.16706275941147e-06, + "loss": 2.0313, + "step": 16551 + }, + { + "epoch": 3.10253045923149, + "grad_norm": 54447.64453125, + "learning_rate": 7.163009521898356e-06, + "loss": 2.1004, + "step": 16552 + }, + { + "epoch": 3.102717900656045, + "grad_norm": 54425.453125, + "learning_rate": 7.1589573424033775e-06, + "loss": 2.0271, + "step": 16553 + }, + { + "epoch": 3.1029053420805996, + "grad_norm": 51248.3671875, + "learning_rate": 7.154906221026597e-06, + "loss": 2.0904, + "step": 16554 + }, + { + "epoch": 3.1030927835051547, + "grad_norm": 51325.31640625, + "learning_rate": 7.150856157868091e-06, + "loss": 2.0974, + "step": 16555 + }, + { + "epoch": 3.1032802249297093, + "grad_norm": 56925.83203125, + "learning_rate": 7.146807153027879e-06, + "loss": 2.0219, + "step": 16556 + }, + { + "epoch": 3.1034676663542644, + "grad_norm": 57483.8828125, + "learning_rate": 7.142759206605976e-06, + "loss": 2.0754, + "step": 16557 + }, + { + "epoch": 3.103655107778819, + "grad_norm": 54647.60546875, + "learning_rate": 7.138712318702345e-06, + "loss": 2.0962, + "step": 16558 + }, + { + "epoch": 3.103842549203374, + "grad_norm": 55317.2421875, + "learning_rate": 7.134666489416963e-06, + "loss": 2.0274, + "step": 16559 + }, + { + "epoch": 3.1040299906279287, + "grad_norm": 56159.73828125, + "learning_rate": 7.130621718849739e-06, + "loss": 1.9807, + "step": 16560 + }, + { + "epoch": 3.104217432052484, + "grad_norm": 60484.23828125, + "learning_rate": 7.126578007100571e-06, + "loss": 2.1213, + "step": 16561 + }, + { + "epoch": 3.1044048734770384, + "grad_norm": 51047.0546875, + "learning_rate": 7.122535354269339e-06, + "loss": 2.0871, + "step": 16562 + }, + { + "epoch": 3.104592314901593, + "grad_norm": 52457.296875, + "learning_rate": 7.118493760455913e-06, + "loss": 2.0348, + "step": 16563 + }, + { + "epoch": 3.104779756326148, + "grad_norm": 55278.6015625, + "learning_rate": 7.11445322576007e-06, + "loss": 2.0832, + "step": 16564 + }, + { + "epoch": 3.1049671977507027, + "grad_norm": 54070.48828125, + "learning_rate": 7.110413750281631e-06, + "loss": 2.1072, + "step": 16565 + }, + { + "epoch": 3.105154639175258, + "grad_norm": 53288.08984375, + "learning_rate": 7.106375334120379e-06, + "loss": 2.0342, + "step": 16566 + }, + { + "epoch": 3.1053420805998124, + "grad_norm": 53049.47265625, + "learning_rate": 7.102337977376044e-06, + "loss": 2.0739, + "step": 16567 + }, + { + "epoch": 3.1055295220243675, + "grad_norm": 54002.8984375, + "learning_rate": 7.098301680148334e-06, + "loss": 2.0615, + "step": 16568 + }, + { + "epoch": 3.105716963448922, + "grad_norm": 52529.98046875, + "learning_rate": 7.094266442536945e-06, + "loss": 2.0263, + "step": 16569 + }, + { + "epoch": 3.105904404873477, + "grad_norm": 56094.51953125, + "learning_rate": 7.090232264641572e-06, + "loss": 2.1416, + "step": 16570 + }, + { + "epoch": 3.106091846298032, + "grad_norm": 53840.203125, + "learning_rate": 7.086199146561801e-06, + "loss": 2.0692, + "step": 16571 + }, + { + "epoch": 3.106279287722587, + "grad_norm": 58830.88671875, + "learning_rate": 7.082167088397279e-06, + "loss": 2.0371, + "step": 16572 + }, + { + "epoch": 3.1064667291471415, + "grad_norm": 55076.75390625, + "learning_rate": 7.078136090247595e-06, + "loss": 2.155, + "step": 16573 + }, + { + "epoch": 3.106654170571696, + "grad_norm": 52056.4609375, + "learning_rate": 7.074106152212301e-06, + "loss": 2.1103, + "step": 16574 + }, + { + "epoch": 3.106841611996251, + "grad_norm": 54017.24609375, + "learning_rate": 7.070077274390918e-06, + "loss": 2.0465, + "step": 16575 + }, + { + "epoch": 3.107029053420806, + "grad_norm": 59724.44921875, + "learning_rate": 7.066049456882978e-06, + "loss": 1.9988, + "step": 16576 + }, + { + "epoch": 3.107216494845361, + "grad_norm": 50723.71484375, + "learning_rate": 7.062022699787957e-06, + "loss": 2.1349, + "step": 16577 + }, + { + "epoch": 3.1074039362699155, + "grad_norm": 57315.15234375, + "learning_rate": 7.0579970032052855e-06, + "loss": 2.1286, + "step": 16578 + }, + { + "epoch": 3.1075913776944706, + "grad_norm": 56466.27734375, + "learning_rate": 7.053972367234418e-06, + "loss": 2.0284, + "step": 16579 + }, + { + "epoch": 3.107778819119025, + "grad_norm": 56864.1640625, + "learning_rate": 7.049948791974764e-06, + "loss": 2.0437, + "step": 16580 + }, + { + "epoch": 3.1079662605435803, + "grad_norm": 54634.23046875, + "learning_rate": 7.045926277525688e-06, + "loss": 2.0355, + "step": 16581 + }, + { + "epoch": 3.108153701968135, + "grad_norm": 53563.3671875, + "learning_rate": 7.041904823986534e-06, + "loss": 2.0537, + "step": 16582 + }, + { + "epoch": 3.10834114339269, + "grad_norm": 57029.6953125, + "learning_rate": 7.037884431456643e-06, + "loss": 2.0835, + "step": 16583 + }, + { + "epoch": 3.1085285848172446, + "grad_norm": 52908.35546875, + "learning_rate": 7.033865100035297e-06, + "loss": 2.0103, + "step": 16584 + }, + { + "epoch": 3.108716026241799, + "grad_norm": 62896.078125, + "learning_rate": 7.029846829821785e-06, + "loss": 2.0801, + "step": 16585 + }, + { + "epoch": 3.1089034676663543, + "grad_norm": 60687.5859375, + "learning_rate": 7.025829620915342e-06, + "loss": 2.0978, + "step": 16586 + }, + { + "epoch": 3.109090909090909, + "grad_norm": 60543.57421875, + "learning_rate": 7.0218134734151845e-06, + "loss": 2.0278, + "step": 16587 + }, + { + "epoch": 3.109278350515464, + "grad_norm": 51578.05078125, + "learning_rate": 7.01779838742051e-06, + "loss": 2.0107, + "step": 16588 + }, + { + "epoch": 3.1094657919400186, + "grad_norm": 52888.59375, + "learning_rate": 7.0137843630305015e-06, + "loss": 2.1126, + "step": 16589 + }, + { + "epoch": 3.1096532333645737, + "grad_norm": 53428.6015625, + "learning_rate": 7.009771400344284e-06, + "loss": 2.0307, + "step": 16590 + }, + { + "epoch": 3.1098406747891283, + "grad_norm": 49177.171875, + "learning_rate": 7.005759499460962e-06, + "loss": 2.0843, + "step": 16591 + }, + { + "epoch": 3.1100281162136834, + "grad_norm": 58130.95703125, + "learning_rate": 7.001748660479657e-06, + "loss": 2.0862, + "step": 16592 + }, + { + "epoch": 3.110215557638238, + "grad_norm": 54080.69140625, + "learning_rate": 6.997738883499405e-06, + "loss": 2.062, + "step": 16593 + }, + { + "epoch": 3.110402999062793, + "grad_norm": 53479.3203125, + "learning_rate": 6.993730168619239e-06, + "loss": 2.0069, + "step": 16594 + }, + { + "epoch": 3.1105904404873477, + "grad_norm": 52346.84765625, + "learning_rate": 6.989722515938179e-06, + "loss": 2.0657, + "step": 16595 + }, + { + "epoch": 3.1107778819119027, + "grad_norm": 59169.796875, + "learning_rate": 6.98571592555522e-06, + "loss": 2.069, + "step": 16596 + }, + { + "epoch": 3.1109653233364574, + "grad_norm": 52540.2109375, + "learning_rate": 6.981710397569314e-06, + "loss": 2.0354, + "step": 16597 + }, + { + "epoch": 3.111152764761012, + "grad_norm": 53811.18359375, + "learning_rate": 6.97770593207937e-06, + "loss": 2.0903, + "step": 16598 + }, + { + "epoch": 3.111340206185567, + "grad_norm": 54411.61328125, + "learning_rate": 6.973702529184323e-06, + "loss": 2.048, + "step": 16599 + }, + { + "epoch": 3.1115276476101217, + "grad_norm": 54441.62890625, + "learning_rate": 6.969700188983042e-06, + "loss": 2.1003, + "step": 16600 + }, + { + "epoch": 3.1117150890346768, + "grad_norm": 56776.16015625, + "learning_rate": 6.965698911574365e-06, + "loss": 2.0734, + "step": 16601 + }, + { + "epoch": 3.1119025304592314, + "grad_norm": 58062.703125, + "learning_rate": 6.961698697057129e-06, + "loss": 2.0426, + "step": 16602 + }, + { + "epoch": 3.1120899718837864, + "grad_norm": 56674.2734375, + "learning_rate": 6.957699545530144e-06, + "loss": 2.0066, + "step": 16603 + }, + { + "epoch": 3.112277413308341, + "grad_norm": 51138.6796875, + "learning_rate": 6.953701457092176e-06, + "loss": 2.0727, + "step": 16604 + }, + { + "epoch": 3.112464854732896, + "grad_norm": 62675.671875, + "learning_rate": 6.949704431841964e-06, + "loss": 2.0789, + "step": 16605 + }, + { + "epoch": 3.1126522961574508, + "grad_norm": 59658.6328125, + "learning_rate": 6.945708469878242e-06, + "loss": 2.0813, + "step": 16606 + }, + { + "epoch": 3.112839737582006, + "grad_norm": 57451.0078125, + "learning_rate": 6.941713571299702e-06, + "loss": 2.1042, + "step": 16607 + }, + { + "epoch": 3.1130271790065605, + "grad_norm": 52663.26171875, + "learning_rate": 6.937719736205006e-06, + "loss": 2.0329, + "step": 16608 + }, + { + "epoch": 3.113214620431115, + "grad_norm": 51778.6796875, + "learning_rate": 6.9337269646927935e-06, + "loss": 2.1257, + "step": 16609 + }, + { + "epoch": 3.11340206185567, + "grad_norm": 55658.26171875, + "learning_rate": 6.929735256861703e-06, + "loss": 2.0614, + "step": 16610 + }, + { + "epoch": 3.1135895032802248, + "grad_norm": 55766.515625, + "learning_rate": 6.925744612810309e-06, + "loss": 2.0855, + "step": 16611 + }, + { + "epoch": 3.11377694470478, + "grad_norm": 54807.3125, + "learning_rate": 6.921755032637167e-06, + "loss": 2.1091, + "step": 16612 + }, + { + "epoch": 3.1139643861293345, + "grad_norm": 56488.28515625, + "learning_rate": 6.917766516440838e-06, + "loss": 2.1149, + "step": 16613 + }, + { + "epoch": 3.1141518275538895, + "grad_norm": 57001.92578125, + "learning_rate": 6.913779064319803e-06, + "loss": 2.039, + "step": 16614 + }, + { + "epoch": 3.114339268978444, + "grad_norm": 54517.5234375, + "learning_rate": 6.909792676372578e-06, + "loss": 2.0634, + "step": 16615 + }, + { + "epoch": 3.114526710402999, + "grad_norm": 53318.6484375, + "learning_rate": 6.9058073526976015e-06, + "loss": 2.1034, + "step": 16616 + }, + { + "epoch": 3.114714151827554, + "grad_norm": 54270.9140625, + "learning_rate": 6.901823093393306e-06, + "loss": 2.0691, + "step": 16617 + }, + { + "epoch": 3.114901593252109, + "grad_norm": 53006.390625, + "learning_rate": 6.897839898558106e-06, + "loss": 2.0687, + "step": 16618 + }, + { + "epoch": 3.1150890346766635, + "grad_norm": 53679.80859375, + "learning_rate": 6.893857768290373e-06, + "loss": 2.0788, + "step": 16619 + }, + { + "epoch": 3.115276476101218, + "grad_norm": 57710.01953125, + "learning_rate": 6.889876702688475e-06, + "loss": 2.1751, + "step": 16620 + }, + { + "epoch": 3.1154639175257732, + "grad_norm": 56551.16015625, + "learning_rate": 6.8858967018507194e-06, + "loss": 2.1024, + "step": 16621 + }, + { + "epoch": 3.115651358950328, + "grad_norm": 51769.640625, + "learning_rate": 6.881917765875423e-06, + "loss": 2.1078, + "step": 16622 + }, + { + "epoch": 3.115838800374883, + "grad_norm": 55981.19921875, + "learning_rate": 6.877939894860857e-06, + "loss": 2.1063, + "step": 16623 + }, + { + "epoch": 3.1160262417994375, + "grad_norm": 59352.2265625, + "learning_rate": 6.873963088905255e-06, + "loss": 2.0711, + "step": 16624 + }, + { + "epoch": 3.1162136832239926, + "grad_norm": 51608.578125, + "learning_rate": 6.869987348106855e-06, + "loss": 2.1103, + "step": 16625 + }, + { + "epoch": 3.1164011246485472, + "grad_norm": 58318.65234375, + "learning_rate": 6.866012672563871e-06, + "loss": 2.0584, + "step": 16626 + }, + { + "epoch": 3.1165885660731023, + "grad_norm": 50091.10546875, + "learning_rate": 6.86203906237442e-06, + "loss": 2.0273, + "step": 16627 + }, + { + "epoch": 3.116776007497657, + "grad_norm": 56487.0546875, + "learning_rate": 6.85806651763668e-06, + "loss": 2.0461, + "step": 16628 + }, + { + "epoch": 3.116963448922212, + "grad_norm": 60992.171875, + "learning_rate": 6.854095038448777e-06, + "loss": 2.0253, + "step": 16629 + }, + { + "epoch": 3.1171508903467666, + "grad_norm": 55732.3984375, + "learning_rate": 6.850124624908783e-06, + "loss": 1.9964, + "step": 16630 + }, + { + "epoch": 3.1173383317713212, + "grad_norm": 54717.37890625, + "learning_rate": 6.846155277114758e-06, + "loss": 2.1591, + "step": 16631 + }, + { + "epoch": 3.1175257731958763, + "grad_norm": 52588.43359375, + "learning_rate": 6.842186995164745e-06, + "loss": 2.0795, + "step": 16632 + }, + { + "epoch": 3.117713214620431, + "grad_norm": 55487.31640625, + "learning_rate": 6.838219779156779e-06, + "loss": 2.0283, + "step": 16633 + }, + { + "epoch": 3.117900656044986, + "grad_norm": 59210.765625, + "learning_rate": 6.834253629188803e-06, + "loss": 2.0803, + "step": 16634 + }, + { + "epoch": 3.1180880974695406, + "grad_norm": 57388.8671875, + "learning_rate": 6.830288545358798e-06, + "loss": 2.0831, + "step": 16635 + }, + { + "epoch": 3.1182755388940957, + "grad_norm": 57285.87109375, + "learning_rate": 6.8263245277647106e-06, + "loss": 2.0376, + "step": 16636 + }, + { + "epoch": 3.1184629803186503, + "grad_norm": 53565.6015625, + "learning_rate": 6.822361576504427e-06, + "loss": 2.0995, + "step": 16637 + }, + { + "epoch": 3.1186504217432054, + "grad_norm": 58341.26953125, + "learning_rate": 6.81839969167582e-06, + "loss": 2.0246, + "step": 16638 + }, + { + "epoch": 3.11883786316776, + "grad_norm": 52297.15625, + "learning_rate": 6.8144388733767715e-06, + "loss": 2.0492, + "step": 16639 + }, + { + "epoch": 3.119025304592315, + "grad_norm": 53986.01171875, + "learning_rate": 6.810479121705076e-06, + "loss": 2.059, + "step": 16640 + }, + { + "epoch": 3.1192127460168697, + "grad_norm": 59412.29296875, + "learning_rate": 6.806520436758562e-06, + "loss": 2.1464, + "step": 16641 + }, + { + "epoch": 3.1194001874414248, + "grad_norm": 54483.34765625, + "learning_rate": 6.802562818634984e-06, + "loss": 2.049, + "step": 16642 + }, + { + "epoch": 3.1195876288659794, + "grad_norm": 54740.51953125, + "learning_rate": 6.798606267432106e-06, + "loss": 2.1243, + "step": 16643 + }, + { + "epoch": 3.119775070290534, + "grad_norm": 59846.15625, + "learning_rate": 6.79465078324763e-06, + "loss": 2.0947, + "step": 16644 + }, + { + "epoch": 3.119962511715089, + "grad_norm": 57192.4765625, + "learning_rate": 6.790696366179278e-06, + "loss": 2.0996, + "step": 16645 + }, + { + "epoch": 3.1201499531396437, + "grad_norm": 53236.0703125, + "learning_rate": 6.786743016324698e-06, + "loss": 2.093, + "step": 16646 + }, + { + "epoch": 3.120337394564199, + "grad_norm": 54374.58984375, + "learning_rate": 6.78279073378153e-06, + "loss": 2.057, + "step": 16647 + }, + { + "epoch": 3.1205248359887534, + "grad_norm": 58160.47265625, + "learning_rate": 6.778839518647412e-06, + "loss": 2.076, + "step": 16648 + }, + { + "epoch": 3.1207122774133085, + "grad_norm": 58594.7734375, + "learning_rate": 6.77488937101991e-06, + "loss": 2.14, + "step": 16649 + }, + { + "epoch": 3.120899718837863, + "grad_norm": 57397.20703125, + "learning_rate": 6.770940290996608e-06, + "loss": 2.0536, + "step": 16650 + }, + { + "epoch": 3.121087160262418, + "grad_norm": 57945.57421875, + "learning_rate": 6.766992278675022e-06, + "loss": 2.071, + "step": 16651 + }, + { + "epoch": 3.121274601686973, + "grad_norm": 53818.94921875, + "learning_rate": 6.76304533415269e-06, + "loss": 2.0747, + "step": 16652 + }, + { + "epoch": 3.121462043111528, + "grad_norm": 60135.75390625, + "learning_rate": 6.759099457527074e-06, + "loss": 1.9897, + "step": 16653 + }, + { + "epoch": 3.1216494845360825, + "grad_norm": 55414.0703125, + "learning_rate": 6.755154648895629e-06, + "loss": 2.1149, + "step": 16654 + }, + { + "epoch": 3.121836925960637, + "grad_norm": 57622.80078125, + "learning_rate": 6.751210908355799e-06, + "loss": 2.0831, + "step": 16655 + }, + { + "epoch": 3.122024367385192, + "grad_norm": 56716.51171875, + "learning_rate": 6.747268236005011e-06, + "loss": 2.0983, + "step": 16656 + }, + { + "epoch": 3.122211808809747, + "grad_norm": 56230.984375, + "learning_rate": 6.743326631940594e-06, + "loss": 2.0736, + "step": 16657 + }, + { + "epoch": 3.122399250234302, + "grad_norm": 54831.7734375, + "learning_rate": 6.739386096259925e-06, + "loss": 2.0927, + "step": 16658 + }, + { + "epoch": 3.1225866916588565, + "grad_norm": 57231.96875, + "learning_rate": 6.735446629060344e-06, + "loss": 2.0862, + "step": 16659 + }, + { + "epoch": 3.1227741330834116, + "grad_norm": 58150.1875, + "learning_rate": 6.731508230439138e-06, + "loss": 2.0829, + "step": 16660 + }, + { + "epoch": 3.122961574507966, + "grad_norm": 52631.30078125, + "learning_rate": 6.727570900493568e-06, + "loss": 2.0482, + "step": 16661 + }, + { + "epoch": 3.1231490159325213, + "grad_norm": 52574.9453125, + "learning_rate": 6.723634639320892e-06, + "loss": 2.08, + "step": 16662 + }, + { + "epoch": 3.123336457357076, + "grad_norm": 62088.80859375, + "learning_rate": 6.719699447018357e-06, + "loss": 2.0553, + "step": 16663 + }, + { + "epoch": 3.123523898781631, + "grad_norm": 58471.2421875, + "learning_rate": 6.71576532368311e-06, + "loss": 2.0265, + "step": 16664 + }, + { + "epoch": 3.1237113402061856, + "grad_norm": 55569.50390625, + "learning_rate": 6.711832269412338e-06, + "loss": 2.1176, + "step": 16665 + }, + { + "epoch": 3.12389878163074, + "grad_norm": 53408.265625, + "learning_rate": 6.707900284303198e-06, + "loss": 2.0624, + "step": 16666 + }, + { + "epoch": 3.1240862230552953, + "grad_norm": 57831.21875, + "learning_rate": 6.7039693684527895e-06, + "loss": 2.0946, + "step": 16667 + }, + { + "epoch": 3.12427366447985, + "grad_norm": 57875.0390625, + "learning_rate": 6.700039521958196e-06, + "loss": 2.0491, + "step": 16668 + }, + { + "epoch": 3.124461105904405, + "grad_norm": 55601.25390625, + "learning_rate": 6.696110744916495e-06, + "loss": 2.0878, + "step": 16669 + }, + { + "epoch": 3.1246485473289596, + "grad_norm": 53429.5078125, + "learning_rate": 6.692183037424704e-06, + "loss": 2.0144, + "step": 16670 + }, + { + "epoch": 3.1248359887535146, + "grad_norm": 60011.33203125, + "learning_rate": 6.688256399579857e-06, + "loss": 2.0252, + "step": 16671 + }, + { + "epoch": 3.1250234301780693, + "grad_norm": 55506.921875, + "learning_rate": 6.684330831478908e-06, + "loss": 2.0049, + "step": 16672 + }, + { + "epoch": 3.1252108716026243, + "grad_norm": 51840.19140625, + "learning_rate": 6.680406333218836e-06, + "loss": 2.07, + "step": 16673 + }, + { + "epoch": 3.125398313027179, + "grad_norm": 54071.97265625, + "learning_rate": 6.676482904896569e-06, + "loss": 2.0929, + "step": 16674 + }, + { + "epoch": 3.125585754451734, + "grad_norm": 54619.09375, + "learning_rate": 6.672560546608991e-06, + "loss": 2.0575, + "step": 16675 + }, + { + "epoch": 3.1257731958762887, + "grad_norm": 54994.82421875, + "learning_rate": 6.668639258453002e-06, + "loss": 2.0551, + "step": 16676 + }, + { + "epoch": 3.1259606373008433, + "grad_norm": 57591.97265625, + "learning_rate": 6.664719040525435e-06, + "loss": 2.117, + "step": 16677 + }, + { + "epoch": 3.1261480787253983, + "grad_norm": 56197.9921875, + "learning_rate": 6.660799892923131e-06, + "loss": 2.0331, + "step": 16678 + }, + { + "epoch": 3.126335520149953, + "grad_norm": 60025.484375, + "learning_rate": 6.656881815742882e-06, + "loss": 2.1316, + "step": 16679 + }, + { + "epoch": 3.126522961574508, + "grad_norm": 53916.90625, + "learning_rate": 6.652964809081441e-06, + "loss": 2.0251, + "step": 16680 + }, + { + "epoch": 3.1267104029990627, + "grad_norm": 59975.37890625, + "learning_rate": 6.649048873035574e-06, + "loss": 2.1048, + "step": 16681 + }, + { + "epoch": 3.1268978444236177, + "grad_norm": 61045.109375, + "learning_rate": 6.645134007702003e-06, + "loss": 2.0489, + "step": 16682 + }, + { + "epoch": 3.1270852858481724, + "grad_norm": 56117.8046875, + "learning_rate": 6.641220213177418e-06, + "loss": 2.0939, + "step": 16683 + }, + { + "epoch": 3.1272727272727274, + "grad_norm": 62613.8984375, + "learning_rate": 6.6373074895584585e-06, + "loss": 2.0019, + "step": 16684 + }, + { + "epoch": 3.127460168697282, + "grad_norm": 54587.3046875, + "learning_rate": 6.633395836941786e-06, + "loss": 2.05, + "step": 16685 + }, + { + "epoch": 3.127647610121837, + "grad_norm": 53107.71484375, + "learning_rate": 6.629485255424034e-06, + "loss": 2.0281, + "step": 16686 + }, + { + "epoch": 3.1278350515463917, + "grad_norm": 52063.01171875, + "learning_rate": 6.6255757451017474e-06, + "loss": 2.0971, + "step": 16687 + }, + { + "epoch": 3.1280224929709464, + "grad_norm": 55393.87109375, + "learning_rate": 6.621667306071499e-06, + "loss": 2.0356, + "step": 16688 + }, + { + "epoch": 3.1282099343955014, + "grad_norm": 56890.53125, + "learning_rate": 6.61775993842984e-06, + "loss": 2.0766, + "step": 16689 + }, + { + "epoch": 3.128397375820056, + "grad_norm": 54819.91796875, + "learning_rate": 6.61385364227326e-06, + "loss": 2.0738, + "step": 16690 + }, + { + "epoch": 3.128584817244611, + "grad_norm": 55614.359375, + "learning_rate": 6.609948417698242e-06, + "loss": 2.1122, + "step": 16691 + }, + { + "epoch": 3.1287722586691658, + "grad_norm": 57409.8515625, + "learning_rate": 6.606044264801248e-06, + "loss": 2.0314, + "step": 16692 + }, + { + "epoch": 3.128959700093721, + "grad_norm": 54517.53515625, + "learning_rate": 6.602141183678695e-06, + "loss": 2.0757, + "step": 16693 + }, + { + "epoch": 3.1291471415182754, + "grad_norm": 58481.52734375, + "learning_rate": 6.598239174426985e-06, + "loss": 2.0396, + "step": 16694 + }, + { + "epoch": 3.1293345829428305, + "grad_norm": 51613.79296875, + "learning_rate": 6.594338237142494e-06, + "loss": 2.0505, + "step": 16695 + }, + { + "epoch": 3.129522024367385, + "grad_norm": 60084.06640625, + "learning_rate": 6.590438371921581e-06, + "loss": 2.0427, + "step": 16696 + }, + { + "epoch": 3.12970946579194, + "grad_norm": 55249.84375, + "learning_rate": 6.586539578860562e-06, + "loss": 2.0176, + "step": 16697 + }, + { + "epoch": 3.129896907216495, + "grad_norm": 54091.5, + "learning_rate": 6.582641858055716e-06, + "loss": 2.1244, + "step": 16698 + }, + { + "epoch": 3.1300843486410495, + "grad_norm": 54870.2265625, + "learning_rate": 6.5787452096033375e-06, + "loss": 2.1174, + "step": 16699 + }, + { + "epoch": 3.1302717900656045, + "grad_norm": 54804.34375, + "learning_rate": 6.574849633599639e-06, + "loss": 2.1029, + "step": 16700 + }, + { + "epoch": 3.130459231490159, + "grad_norm": 59663.1015625, + "learning_rate": 6.570955130140871e-06, + "loss": 2.0578, + "step": 16701 + }, + { + "epoch": 3.130646672914714, + "grad_norm": 51456.77734375, + "learning_rate": 6.567061699323191e-06, + "loss": 2.0085, + "step": 16702 + }, + { + "epoch": 3.130834114339269, + "grad_norm": 56783.91796875, + "learning_rate": 6.5631693412427865e-06, + "loss": 2.0992, + "step": 16703 + }, + { + "epoch": 3.131021555763824, + "grad_norm": 56639.578125, + "learning_rate": 6.559278055995783e-06, + "loss": 1.9999, + "step": 16704 + }, + { + "epoch": 3.1312089971883785, + "grad_norm": 57971.8984375, + "learning_rate": 6.5553878436782786e-06, + "loss": 2.0443, + "step": 16705 + }, + { + "epoch": 3.1313964386129336, + "grad_norm": 57128.9453125, + "learning_rate": 6.551498704386378e-06, + "loss": 2.068, + "step": 16706 + }, + { + "epoch": 3.131583880037488, + "grad_norm": 52351.6875, + "learning_rate": 6.5476106382161175e-06, + "loss": 2.034, + "step": 16707 + }, + { + "epoch": 3.1317713214620433, + "grad_norm": 62241.03125, + "learning_rate": 6.54372364526355e-06, + "loss": 2.0032, + "step": 16708 + }, + { + "epoch": 3.131958762886598, + "grad_norm": 58002.23046875, + "learning_rate": 6.539837725624665e-06, + "loss": 2.0623, + "step": 16709 + }, + { + "epoch": 3.1321462043111525, + "grad_norm": 57758.484375, + "learning_rate": 6.535952879395429e-06, + "loss": 2.0927, + "step": 16710 + }, + { + "epoch": 3.1323336457357076, + "grad_norm": 51937.79296875, + "learning_rate": 6.5320691066718065e-06, + "loss": 2.0695, + "step": 16711 + }, + { + "epoch": 3.1325210871602622, + "grad_norm": 58793.36328125, + "learning_rate": 6.52818640754973e-06, + "loss": 2.0497, + "step": 16712 + }, + { + "epoch": 3.1327085285848173, + "grad_norm": 56887.859375, + "learning_rate": 6.524304782125085e-06, + "loss": 2.0767, + "step": 16713 + }, + { + "epoch": 3.132895970009372, + "grad_norm": 57850.9375, + "learning_rate": 6.520424230493738e-06, + "loss": 2.1198, + "step": 16714 + }, + { + "epoch": 3.133083411433927, + "grad_norm": 51110.7578125, + "learning_rate": 6.516544752751547e-06, + "loss": 2.027, + "step": 16715 + }, + { + "epoch": 3.1332708528584816, + "grad_norm": 62732.46875, + "learning_rate": 6.512666348994328e-06, + "loss": 2.1053, + "step": 16716 + }, + { + "epoch": 3.1334582942830367, + "grad_norm": 60596.9609375, + "learning_rate": 6.508789019317857e-06, + "loss": 1.9419, + "step": 16717 + }, + { + "epoch": 3.1336457357075913, + "grad_norm": 52002.0078125, + "learning_rate": 6.504912763817905e-06, + "loss": 2.0497, + "step": 16718 + }, + { + "epoch": 3.1338331771321464, + "grad_norm": 53769.46875, + "learning_rate": 6.501037582590242e-06, + "loss": 2.0733, + "step": 16719 + }, + { + "epoch": 3.134020618556701, + "grad_norm": 53375.30078125, + "learning_rate": 6.497163475730528e-06, + "loss": 2.0052, + "step": 16720 + }, + { + "epoch": 3.1342080599812556, + "grad_norm": 53479.02734375, + "learning_rate": 6.493290443334477e-06, + "loss": 2.0904, + "step": 16721 + }, + { + "epoch": 3.1343955014058107, + "grad_norm": 55472.55078125, + "learning_rate": 6.489418485497756e-06, + "loss": 2.1599, + "step": 16722 + }, + { + "epoch": 3.1345829428303653, + "grad_norm": 58243.14453125, + "learning_rate": 6.485547602315984e-06, + "loss": 2.0868, + "step": 16723 + }, + { + "epoch": 3.1347703842549204, + "grad_norm": 53531.91796875, + "learning_rate": 6.481677793884761e-06, + "loss": 2.1088, + "step": 16724 + }, + { + "epoch": 3.134957825679475, + "grad_norm": 56000.984375, + "learning_rate": 6.477809060299672e-06, + "loss": 2.0345, + "step": 16725 + }, + { + "epoch": 3.13514526710403, + "grad_norm": 57260.99609375, + "learning_rate": 6.473941401656281e-06, + "loss": 2.0269, + "step": 16726 + }, + { + "epoch": 3.1353327085285847, + "grad_norm": 58653.98828125, + "learning_rate": 6.470074818050109e-06, + "loss": 2.065, + "step": 16727 + }, + { + "epoch": 3.1355201499531398, + "grad_norm": 53437.78515625, + "learning_rate": 6.466209309576638e-06, + "loss": 2.0506, + "step": 16728 + }, + { + "epoch": 3.1357075913776944, + "grad_norm": 55120.29296875, + "learning_rate": 6.462344876331367e-06, + "loss": 2.0507, + "step": 16729 + }, + { + "epoch": 3.1358950328022495, + "grad_norm": 53078.546875, + "learning_rate": 6.458481518409726e-06, + "loss": 2.0851, + "step": 16730 + }, + { + "epoch": 3.136082474226804, + "grad_norm": 63243.6796875, + "learning_rate": 6.454619235907134e-06, + "loss": 2.1531, + "step": 16731 + }, + { + "epoch": 3.136269915651359, + "grad_norm": 54159.60546875, + "learning_rate": 6.450758028918996e-06, + "loss": 2.0872, + "step": 16732 + }, + { + "epoch": 3.1364573570759138, + "grad_norm": 58817.3671875, + "learning_rate": 6.446897897540666e-06, + "loss": 2.0975, + "step": 16733 + }, + { + "epoch": 3.1366447985004684, + "grad_norm": 48801.37109375, + "learning_rate": 6.443038841867494e-06, + "loss": 2.0823, + "step": 16734 + }, + { + "epoch": 3.1368322399250235, + "grad_norm": 61122.88671875, + "learning_rate": 6.439180861994787e-06, + "loss": 2.0773, + "step": 16735 + }, + { + "epoch": 3.137019681349578, + "grad_norm": 60817.3125, + "learning_rate": 6.435323958017842e-06, + "loss": 2.112, + "step": 16736 + }, + { + "epoch": 3.137207122774133, + "grad_norm": 53514.55859375, + "learning_rate": 6.431468130031904e-06, + "loss": 2.0565, + "step": 16737 + }, + { + "epoch": 3.137394564198688, + "grad_norm": 54740.8046875, + "learning_rate": 6.4276133781322245e-06, + "loss": 2.0844, + "step": 16738 + }, + { + "epoch": 3.137582005623243, + "grad_norm": 56022.73046875, + "learning_rate": 6.423759702414001e-06, + "loss": 2.0989, + "step": 16739 + }, + { + "epoch": 3.1377694470477975, + "grad_norm": 55545.25, + "learning_rate": 6.4199071029724055e-06, + "loss": 2.0847, + "step": 16740 + }, + { + "epoch": 3.1379568884723525, + "grad_norm": 53826.74609375, + "learning_rate": 6.416055579902608e-06, + "loss": 2.1387, + "step": 16741 + }, + { + "epoch": 3.138144329896907, + "grad_norm": 52718.21875, + "learning_rate": 6.412205133299731e-06, + "loss": 2.0094, + "step": 16742 + }, + { + "epoch": 3.1383317713214622, + "grad_norm": 55301.26953125, + "learning_rate": 6.408355763258883e-06, + "loss": 2.0083, + "step": 16743 + }, + { + "epoch": 3.138519212746017, + "grad_norm": 58187.30078125, + "learning_rate": 6.404507469875115e-06, + "loss": 2.092, + "step": 16744 + }, + { + "epoch": 3.138706654170572, + "grad_norm": 56006.796875, + "learning_rate": 6.400660253243507e-06, + "loss": 2.0563, + "step": 16745 + }, + { + "epoch": 3.1388940955951266, + "grad_norm": 55818.46875, + "learning_rate": 6.396814113459054e-06, + "loss": 2.0909, + "step": 16746 + }, + { + "epoch": 3.139081537019681, + "grad_norm": 59737.2890625, + "learning_rate": 6.392969050616754e-06, + "loss": 2.1025, + "step": 16747 + }, + { + "epoch": 3.1392689784442362, + "grad_norm": 53670.83203125, + "learning_rate": 6.389125064811585e-06, + "loss": 2.039, + "step": 16748 + }, + { + "epoch": 3.139456419868791, + "grad_norm": 54843.4296875, + "learning_rate": 6.3852821561385e-06, + "loss": 2.1047, + "step": 16749 + }, + { + "epoch": 3.139643861293346, + "grad_norm": 59515.13671875, + "learning_rate": 6.381440324692378e-06, + "loss": 2.1149, + "step": 16750 + }, + { + "epoch": 3.1398313027179006, + "grad_norm": 58138.3515625, + "learning_rate": 6.377599570568133e-06, + "loss": 2.0427, + "step": 16751 + }, + { + "epoch": 3.1400187441424556, + "grad_norm": 55976.75, + "learning_rate": 6.373759893860626e-06, + "loss": 2.0376, + "step": 16752 + }, + { + "epoch": 3.1402061855670103, + "grad_norm": 53642.0, + "learning_rate": 6.369921294664688e-06, + "loss": 2.0824, + "step": 16753 + }, + { + "epoch": 3.1403936269915653, + "grad_norm": 56351.0, + "learning_rate": 6.366083773075115e-06, + "loss": 1.9735, + "step": 16754 + }, + { + "epoch": 3.14058106841612, + "grad_norm": 54280.0625, + "learning_rate": 6.362247329186705e-06, + "loss": 2.073, + "step": 16755 + }, + { + "epoch": 3.140768509840675, + "grad_norm": 57131.57421875, + "learning_rate": 6.358411963094218e-06, + "loss": 2.0876, + "step": 16756 + }, + { + "epoch": 3.1409559512652296, + "grad_norm": 54749.796875, + "learning_rate": 6.354577674892371e-06, + "loss": 2.06, + "step": 16757 + }, + { + "epoch": 3.1411433926897843, + "grad_norm": 60941.95703125, + "learning_rate": 6.3507444646758615e-06, + "loss": 2.0267, + "step": 16758 + }, + { + "epoch": 3.1413308341143393, + "grad_norm": 60132.1953125, + "learning_rate": 6.346912332539378e-06, + "loss": 2.0803, + "step": 16759 + }, + { + "epoch": 3.141518275538894, + "grad_norm": 64340.7734375, + "learning_rate": 6.3430812785775676e-06, + "loss": 2.0348, + "step": 16760 + }, + { + "epoch": 3.141705716963449, + "grad_norm": 52511.0390625, + "learning_rate": 6.339251302885035e-06, + "loss": 2.0721, + "step": 16761 + }, + { + "epoch": 3.1418931583880036, + "grad_norm": 56558.94921875, + "learning_rate": 6.335422405556396e-06, + "loss": 2.0353, + "step": 16762 + }, + { + "epoch": 3.1420805998125587, + "grad_norm": 51512.32421875, + "learning_rate": 6.331594586686201e-06, + "loss": 2.1068, + "step": 16763 + }, + { + "epoch": 3.1422680412371133, + "grad_norm": 55802.71484375, + "learning_rate": 6.32776784636902e-06, + "loss": 2.1373, + "step": 16764 + }, + { + "epoch": 3.1424554826616684, + "grad_norm": 53728.65234375, + "learning_rate": 6.323942184699333e-06, + "loss": 2.0689, + "step": 16765 + }, + { + "epoch": 3.142642924086223, + "grad_norm": 53766.6171875, + "learning_rate": 6.320117601771664e-06, + "loss": 2.044, + "step": 16766 + }, + { + "epoch": 3.142830365510778, + "grad_norm": 59099.79296875, + "learning_rate": 6.316294097680442e-06, + "loss": 2.0542, + "step": 16767 + }, + { + "epoch": 3.1430178069353327, + "grad_norm": 51480.078125, + "learning_rate": 6.3124716725201314e-06, + "loss": 2.0472, + "step": 16768 + }, + { + "epoch": 3.1432052483598873, + "grad_norm": 63283.9453125, + "learning_rate": 6.308650326385135e-06, + "loss": 2.0251, + "step": 16769 + }, + { + "epoch": 3.1433926897844424, + "grad_norm": 57514.18359375, + "learning_rate": 6.3048300593698096e-06, + "loss": 2.0555, + "step": 16770 + }, + { + "epoch": 3.143580131208997, + "grad_norm": 54017.421875, + "learning_rate": 6.301010871568547e-06, + "loss": 2.0758, + "step": 16771 + }, + { + "epoch": 3.143767572633552, + "grad_norm": 57218.4609375, + "learning_rate": 6.297192763075654e-06, + "loss": 2.0703, + "step": 16772 + }, + { + "epoch": 3.1439550140581067, + "grad_norm": 51671.3203125, + "learning_rate": 6.293375733985429e-06, + "loss": 2.1486, + "step": 16773 + }, + { + "epoch": 3.144142455482662, + "grad_norm": 58996.27734375, + "learning_rate": 6.289559784392157e-06, + "loss": 2.0673, + "step": 16774 + }, + { + "epoch": 3.1443298969072164, + "grad_norm": 52883.43359375, + "learning_rate": 6.285744914390102e-06, + "loss": 2.0934, + "step": 16775 + }, + { + "epoch": 3.1445173383317715, + "grad_norm": 55721.71875, + "learning_rate": 6.281931124073465e-06, + "loss": 2.0001, + "step": 16776 + }, + { + "epoch": 3.144704779756326, + "grad_norm": 62610.76953125, + "learning_rate": 6.278118413536444e-06, + "loss": 2.0466, + "step": 16777 + }, + { + "epoch": 3.144892221180881, + "grad_norm": 56160.93359375, + "learning_rate": 6.274306782873208e-06, + "loss": 2.0218, + "step": 16778 + }, + { + "epoch": 3.145079662605436, + "grad_norm": 54696.1484375, + "learning_rate": 6.270496232177925e-06, + "loss": 2.0687, + "step": 16779 + }, + { + "epoch": 3.1452671040299904, + "grad_norm": 53044.2890625, + "learning_rate": 6.266686761544665e-06, + "loss": 2.0044, + "step": 16780 + }, + { + "epoch": 3.1454545454545455, + "grad_norm": 54495.515625, + "learning_rate": 6.2628783710675435e-06, + "loss": 2.0724, + "step": 16781 + }, + { + "epoch": 3.1456419868791, + "grad_norm": 60067.68359375, + "learning_rate": 6.259071060840632e-06, + "loss": 2.0839, + "step": 16782 + }, + { + "epoch": 3.145829428303655, + "grad_norm": 57335.6875, + "learning_rate": 6.255264830957952e-06, + "loss": 2.1663, + "step": 16783 + }, + { + "epoch": 3.14601686972821, + "grad_norm": 55990.2421875, + "learning_rate": 6.251459681513505e-06, + "loss": 2.001, + "step": 16784 + }, + { + "epoch": 3.146204311152765, + "grad_norm": 55061.4296875, + "learning_rate": 6.247655612601294e-06, + "loss": 2.0162, + "step": 16785 + }, + { + "epoch": 3.1463917525773195, + "grad_norm": 52428.49609375, + "learning_rate": 6.24385262431526e-06, + "loss": 2.0364, + "step": 16786 + }, + { + "epoch": 3.1465791940018746, + "grad_norm": 57001.51171875, + "learning_rate": 6.2400507167493295e-06, + "loss": 2.0564, + "step": 16787 + }, + { + "epoch": 3.146766635426429, + "grad_norm": 59377.8046875, + "learning_rate": 6.236249889997409e-06, + "loss": 2.1366, + "step": 16788 + }, + { + "epoch": 3.1469540768509843, + "grad_norm": 54294.859375, + "learning_rate": 6.2324501441533846e-06, + "loss": 2.0605, + "step": 16789 + }, + { + "epoch": 3.147141518275539, + "grad_norm": 55351.80859375, + "learning_rate": 6.228651479311098e-06, + "loss": 2.0297, + "step": 16790 + }, + { + "epoch": 3.1473289597000935, + "grad_norm": 55514.83203125, + "learning_rate": 6.224853895564359e-06, + "loss": 2.07, + "step": 16791 + }, + { + "epoch": 3.1475164011246486, + "grad_norm": 61232.8671875, + "learning_rate": 6.221057393006985e-06, + "loss": 1.9827, + "step": 16792 + }, + { + "epoch": 3.147703842549203, + "grad_norm": 50840.59765625, + "learning_rate": 6.217261971732724e-06, + "loss": 2.0672, + "step": 16793 + }, + { + "epoch": 3.1478912839737583, + "grad_norm": 53001.671875, + "learning_rate": 6.2134676318353345e-06, + "loss": 2.0965, + "step": 16794 + }, + { + "epoch": 3.148078725398313, + "grad_norm": 58669.51171875, + "learning_rate": 6.209674373408514e-06, + "loss": 2.0464, + "step": 16795 + }, + { + "epoch": 3.148266166822868, + "grad_norm": 52218.3828125, + "learning_rate": 6.205882196545976e-06, + "loss": 2.0249, + "step": 16796 + }, + { + "epoch": 3.1484536082474226, + "grad_norm": 53019.83203125, + "learning_rate": 6.202091101341351e-06, + "loss": 2.1351, + "step": 16797 + }, + { + "epoch": 3.1486410496719777, + "grad_norm": 54821.53515625, + "learning_rate": 6.198301087888303e-06, + "loss": 2.1078, + "step": 16798 + }, + { + "epoch": 3.1488284910965323, + "grad_norm": 52059.86328125, + "learning_rate": 6.19451215628043e-06, + "loss": 2.068, + "step": 16799 + }, + { + "epoch": 3.1490159325210874, + "grad_norm": 53745.38671875, + "learning_rate": 6.190724306611301e-06, + "loss": 2.0501, + "step": 16800 + }, + { + "epoch": 3.149203373945642, + "grad_norm": 54635.10546875, + "learning_rate": 6.186937538974497e-06, + "loss": 2.132, + "step": 16801 + }, + { + "epoch": 3.1493908153701966, + "grad_norm": 58602.328125, + "learning_rate": 6.183151853463526e-06, + "loss": 2.0927, + "step": 16802 + }, + { + "epoch": 3.1495782567947517, + "grad_norm": 58228.671875, + "learning_rate": 6.179367250171886e-06, + "loss": 2.001, + "step": 16803 + }, + { + "epoch": 3.1497656982193063, + "grad_norm": 59495.83203125, + "learning_rate": 6.175583729193057e-06, + "loss": 2.0369, + "step": 16804 + }, + { + "epoch": 3.1499531396438614, + "grad_norm": 55052.171875, + "learning_rate": 6.171801290620504e-06, + "loss": 2.1654, + "step": 16805 + }, + { + "epoch": 3.150140581068416, + "grad_norm": 61022.3984375, + "learning_rate": 6.168019934547636e-06, + "loss": 2.0337, + "step": 16806 + }, + { + "epoch": 3.150328022492971, + "grad_norm": 59978.2890625, + "learning_rate": 6.164239661067839e-06, + "loss": 2.0037, + "step": 16807 + }, + { + "epoch": 3.1505154639175257, + "grad_norm": 55256.59375, + "learning_rate": 6.1604604702744926e-06, + "loss": 2.1014, + "step": 16808 + }, + { + "epoch": 3.1507029053420808, + "grad_norm": 56371.8125, + "learning_rate": 6.15668236226094e-06, + "loss": 1.9941, + "step": 16809 + }, + { + "epoch": 3.1508903467666354, + "grad_norm": 56069.25, + "learning_rate": 6.152905337120474e-06, + "loss": 2.1028, + "step": 16810 + }, + { + "epoch": 3.1510777881911904, + "grad_norm": 53064.171875, + "learning_rate": 6.149129394946396e-06, + "loss": 2.0163, + "step": 16811 + }, + { + "epoch": 3.151265229615745, + "grad_norm": 52877.40234375, + "learning_rate": 6.1453545358319934e-06, + "loss": 2.0449, + "step": 16812 + }, + { + "epoch": 3.1514526710402997, + "grad_norm": 51746.65234375, + "learning_rate": 6.141580759870458e-06, + "loss": 2.0179, + "step": 16813 + }, + { + "epoch": 3.1516401124648548, + "grad_norm": 58028.51171875, + "learning_rate": 6.13780806715501e-06, + "loss": 2.1156, + "step": 16814 + }, + { + "epoch": 3.1518275538894094, + "grad_norm": 56307.52734375, + "learning_rate": 6.134036457778841e-06, + "loss": 2.1149, + "step": 16815 + }, + { + "epoch": 3.1520149953139645, + "grad_norm": 58188.56640625, + "learning_rate": 6.130265931835106e-06, + "loss": 2.0407, + "step": 16816 + }, + { + "epoch": 3.152202436738519, + "grad_norm": 60486.953125, + "learning_rate": 6.1264964894169105e-06, + "loss": 2.0605, + "step": 16817 + }, + { + "epoch": 3.152389878163074, + "grad_norm": 59408.984375, + "learning_rate": 6.122728130617367e-06, + "loss": 2.1218, + "step": 16818 + }, + { + "epoch": 3.1525773195876288, + "grad_norm": 58280.87109375, + "learning_rate": 6.118960855529565e-06, + "loss": 2.1109, + "step": 16819 + }, + { + "epoch": 3.152764761012184, + "grad_norm": 55425.3828125, + "learning_rate": 6.115194664246532e-06, + "loss": 2.0335, + "step": 16820 + }, + { + "epoch": 3.1529522024367385, + "grad_norm": 53166.828125, + "learning_rate": 6.1114295568612865e-06, + "loss": 2.1092, + "step": 16821 + }, + { + "epoch": 3.1531396438612935, + "grad_norm": 53605.52734375, + "learning_rate": 6.107665533466839e-06, + "loss": 1.995, + "step": 16822 + }, + { + "epoch": 3.153327085285848, + "grad_norm": 60189.73828125, + "learning_rate": 6.103902594156135e-06, + "loss": 2.0878, + "step": 16823 + }, + { + "epoch": 3.1535145267104028, + "grad_norm": 52348.6484375, + "learning_rate": 6.100140739022136e-06, + "loss": 2.0643, + "step": 16824 + }, + { + "epoch": 3.153701968134958, + "grad_norm": 56001.34765625, + "learning_rate": 6.096379968157745e-06, + "loss": 2.014, + "step": 16825 + }, + { + "epoch": 3.1538894095595125, + "grad_norm": 50183.8359375, + "learning_rate": 6.0926202816558365e-06, + "loss": 2.0654, + "step": 16826 + }, + { + "epoch": 3.1540768509840675, + "grad_norm": 51547.76953125, + "learning_rate": 6.088861679609287e-06, + "loss": 2.0801, + "step": 16827 + }, + { + "epoch": 3.154264292408622, + "grad_norm": 53685.04296875, + "learning_rate": 6.0851041621109105e-06, + "loss": 2.1144, + "step": 16828 + }, + { + "epoch": 3.1544517338331772, + "grad_norm": 51861.85546875, + "learning_rate": 6.081347729253539e-06, + "loss": 2.0413, + "step": 16829 + }, + { + "epoch": 3.154639175257732, + "grad_norm": 60059.0625, + "learning_rate": 6.077592381129927e-06, + "loss": 2.118, + "step": 16830 + }, + { + "epoch": 3.154826616682287, + "grad_norm": 55643.54296875, + "learning_rate": 6.073838117832847e-06, + "loss": 2.088, + "step": 16831 + }, + { + "epoch": 3.1550140581068415, + "grad_norm": 54427.5, + "learning_rate": 6.070084939455018e-06, + "loss": 2.0764, + "step": 16832 + }, + { + "epoch": 3.1552014995313966, + "grad_norm": 54074.6171875, + "learning_rate": 6.066332846089118e-06, + "loss": 2.0712, + "step": 16833 + }, + { + "epoch": 3.1553889409559512, + "grad_norm": 61599.875, + "learning_rate": 6.062581837827835e-06, + "loss": 2.1146, + "step": 16834 + }, + { + "epoch": 3.155576382380506, + "grad_norm": 54361.4453125, + "learning_rate": 6.05883191476383e-06, + "loss": 2.0706, + "step": 16835 + }, + { + "epoch": 3.155763823805061, + "grad_norm": 58359.796875, + "learning_rate": 6.055083076989704e-06, + "loss": 2.0471, + "step": 16836 + }, + { + "epoch": 3.1559512652296156, + "grad_norm": 58125.30078125, + "learning_rate": 6.051335324598046e-06, + "loss": 2.0514, + "step": 16837 + }, + { + "epoch": 3.1561387066541706, + "grad_norm": 54234.7578125, + "learning_rate": 6.04758865768143e-06, + "loss": 2.0313, + "step": 16838 + }, + { + "epoch": 3.1563261480787252, + "grad_norm": 54920.25390625, + "learning_rate": 6.04384307633239e-06, + "loss": 2.0833, + "step": 16839 + }, + { + "epoch": 3.1565135895032803, + "grad_norm": 61323.765625, + "learning_rate": 6.040098580643422e-06, + "loss": 2.0519, + "step": 16840 + }, + { + "epoch": 3.156701030927835, + "grad_norm": 58243.64453125, + "learning_rate": 6.036355170707025e-06, + "loss": 2.1041, + "step": 16841 + }, + { + "epoch": 3.15688847235239, + "grad_norm": 55103.296875, + "learning_rate": 6.032612846615682e-06, + "loss": 2.0402, + "step": 16842 + }, + { + "epoch": 3.1570759137769446, + "grad_norm": 52315.07421875, + "learning_rate": 6.028871608461772e-06, + "loss": 2.1423, + "step": 16843 + }, + { + "epoch": 3.1572633552014997, + "grad_norm": 60333.609375, + "learning_rate": 6.025131456337724e-06, + "loss": 2.0364, + "step": 16844 + }, + { + "epoch": 3.1574507966260543, + "grad_norm": 57720.546875, + "learning_rate": 6.021392390335928e-06, + "loss": 2.1655, + "step": 16845 + }, + { + "epoch": 3.1576382380506094, + "grad_norm": 55984.12890625, + "learning_rate": 6.017654410548718e-06, + "loss": 2.0578, + "step": 16846 + }, + { + "epoch": 3.157825679475164, + "grad_norm": 53983.55078125, + "learning_rate": 6.013917517068412e-06, + "loss": 2.0742, + "step": 16847 + }, + { + "epoch": 3.1580131208997186, + "grad_norm": 54910.2421875, + "learning_rate": 6.010181709987311e-06, + "loss": 2.0907, + "step": 16848 + }, + { + "epoch": 3.1582005623242737, + "grad_norm": 52985.4453125, + "learning_rate": 6.006446989397701e-06, + "loss": 2.0792, + "step": 16849 + }, + { + "epoch": 3.1583880037488283, + "grad_norm": 59369.234375, + "learning_rate": 6.002713355391809e-06, + "loss": 2.0282, + "step": 16850 + }, + { + "epoch": 3.1585754451733834, + "grad_norm": 54380.890625, + "learning_rate": 5.998980808061849e-06, + "loss": 2.1459, + "step": 16851 + }, + { + "epoch": 3.158762886597938, + "grad_norm": 56108.28515625, + "learning_rate": 5.995249347500021e-06, + "loss": 2.1061, + "step": 16852 + }, + { + "epoch": 3.158950328022493, + "grad_norm": 56496.984375, + "learning_rate": 5.991518973798471e-06, + "loss": 2.0482, + "step": 16853 + }, + { + "epoch": 3.1591377694470477, + "grad_norm": 54896.3828125, + "learning_rate": 5.987789687049356e-06, + "loss": 2.0463, + "step": 16854 + }, + { + "epoch": 3.159325210871603, + "grad_norm": 55953.28125, + "learning_rate": 5.984061487344772e-06, + "loss": 2.0224, + "step": 16855 + }, + { + "epoch": 3.1595126522961574, + "grad_norm": 58232.046875, + "learning_rate": 5.980334374776797e-06, + "loss": 2.0834, + "step": 16856 + }, + { + "epoch": 3.1597000937207125, + "grad_norm": 61485.6015625, + "learning_rate": 5.976608349437496e-06, + "loss": 2.0789, + "step": 16857 + }, + { + "epoch": 3.159887535145267, + "grad_norm": 55863.6328125, + "learning_rate": 5.972883411418878e-06, + "loss": 2.0497, + "step": 16858 + }, + { + "epoch": 3.1600749765698217, + "grad_norm": 57701.39453125, + "learning_rate": 5.969159560812971e-06, + "loss": 2.1106, + "step": 16859 + }, + { + "epoch": 3.160262417994377, + "grad_norm": 53351.25390625, + "learning_rate": 5.965436797711726e-06, + "loss": 2.0616, + "step": 16860 + }, + { + "epoch": 3.1604498594189314, + "grad_norm": 58213.35546875, + "learning_rate": 5.961715122207112e-06, + "loss": 1.9916, + "step": 16861 + }, + { + "epoch": 3.1606373008434865, + "grad_norm": 56872.42578125, + "learning_rate": 5.957994534391037e-06, + "loss": 2.0393, + "step": 16862 + }, + { + "epoch": 3.160824742268041, + "grad_norm": 57431.71484375, + "learning_rate": 5.9542750343553834e-06, + "loss": 2.0491, + "step": 16863 + }, + { + "epoch": 3.161012183692596, + "grad_norm": 54392.92578125, + "learning_rate": 5.950556622192033e-06, + "loss": 2.0907, + "step": 16864 + }, + { + "epoch": 3.161199625117151, + "grad_norm": 56446.890625, + "learning_rate": 5.9468392979928455e-06, + "loss": 2.1177, + "step": 16865 + }, + { + "epoch": 3.161387066541706, + "grad_norm": 53512.625, + "learning_rate": 5.943123061849587e-06, + "loss": 2.1052, + "step": 16866 + }, + { + "epoch": 3.1615745079662605, + "grad_norm": 58723.71484375, + "learning_rate": 5.939407913854072e-06, + "loss": 2.0232, + "step": 16867 + }, + { + "epoch": 3.1617619493908156, + "grad_norm": 56000.10546875, + "learning_rate": 5.935693854098062e-06, + "loss": 2.035, + "step": 16868 + }, + { + "epoch": 3.16194939081537, + "grad_norm": 57636.30078125, + "learning_rate": 5.931980882673288e-06, + "loss": 2.0677, + "step": 16869 + }, + { + "epoch": 3.1621368322399253, + "grad_norm": 59177.84375, + "learning_rate": 5.928268999671438e-06, + "loss": 2.0476, + "step": 16870 + }, + { + "epoch": 3.16232427366448, + "grad_norm": 54464.23046875, + "learning_rate": 5.924558205184205e-06, + "loss": 2.1027, + "step": 16871 + }, + { + "epoch": 3.1625117150890345, + "grad_norm": 53684.15234375, + "learning_rate": 5.9208484993032556e-06, + "loss": 2.2308, + "step": 16872 + }, + { + "epoch": 3.1626991565135896, + "grad_norm": 58107.9609375, + "learning_rate": 5.917139882120181e-06, + "loss": 2.0062, + "step": 16873 + }, + { + "epoch": 3.162886597938144, + "grad_norm": 51979.83203125, + "learning_rate": 5.9134323537266e-06, + "loss": 2.0951, + "step": 16874 + }, + { + "epoch": 3.1630740393626993, + "grad_norm": 50351.40234375, + "learning_rate": 5.909725914214093e-06, + "loss": 2.0275, + "step": 16875 + }, + { + "epoch": 3.163261480787254, + "grad_norm": 53461.34375, + "learning_rate": 5.906020563674186e-06, + "loss": 2.0718, + "step": 16876 + }, + { + "epoch": 3.163448922211809, + "grad_norm": 70422.1796875, + "learning_rate": 5.9023163021983966e-06, + "loss": 2.0883, + "step": 16877 + }, + { + "epoch": 3.1636363636363636, + "grad_norm": 56569.625, + "learning_rate": 5.898613129878228e-06, + "loss": 2.119, + "step": 16878 + }, + { + "epoch": 3.1638238050609186, + "grad_norm": 54012.44140625, + "learning_rate": 5.894911046805129e-06, + "loss": 2.0561, + "step": 16879 + }, + { + "epoch": 3.1640112464854733, + "grad_norm": 60702.5, + "learning_rate": 5.891210053070551e-06, + "loss": 2.0055, + "step": 16880 + }, + { + "epoch": 3.1641986879100283, + "grad_norm": 58599.68359375, + "learning_rate": 5.8875101487658905e-06, + "loss": 2.0654, + "step": 16881 + }, + { + "epoch": 3.164386129334583, + "grad_norm": 52189.03125, + "learning_rate": 5.883811333982542e-06, + "loss": 2.0455, + "step": 16882 + }, + { + "epoch": 3.1645735707591376, + "grad_norm": 59696.796875, + "learning_rate": 5.88011360881186e-06, + "loss": 2.1528, + "step": 16883 + }, + { + "epoch": 3.1647610121836927, + "grad_norm": 51544.05859375, + "learning_rate": 5.8764169733451615e-06, + "loss": 2.1025, + "step": 16884 + }, + { + "epoch": 3.1649484536082473, + "grad_norm": 54398.2109375, + "learning_rate": 5.872721427673761e-06, + "loss": 1.9933, + "step": 16885 + }, + { + "epoch": 3.1651358950328023, + "grad_norm": 59397.93359375, + "learning_rate": 5.869026971888924e-06, + "loss": 2.0382, + "step": 16886 + }, + { + "epoch": 3.165323336457357, + "grad_norm": 56036.859375, + "learning_rate": 5.865333606081908e-06, + "loss": 2.1198, + "step": 16887 + }, + { + "epoch": 3.165510777881912, + "grad_norm": 61424.24609375, + "learning_rate": 5.8616413303439256e-06, + "loss": 2.1361, + "step": 16888 + }, + { + "epoch": 3.1656982193064667, + "grad_norm": 55307.06640625, + "learning_rate": 5.857950144766189e-06, + "loss": 1.994, + "step": 16889 + }, + { + "epoch": 3.1658856607310217, + "grad_norm": 56095.29296875, + "learning_rate": 5.854260049439836e-06, + "loss": 2.024, + "step": 16890 + }, + { + "epoch": 3.1660731021555764, + "grad_norm": 50937.578125, + "learning_rate": 5.850571044456038e-06, + "loss": 2.1184, + "step": 16891 + }, + { + "epoch": 3.1662605435801314, + "grad_norm": 51082.7578125, + "learning_rate": 5.846883129905895e-06, + "loss": 2.0697, + "step": 16892 + }, + { + "epoch": 3.166447985004686, + "grad_norm": 55039.7734375, + "learning_rate": 5.843196305880477e-06, + "loss": 2.1556, + "step": 16893 + }, + { + "epoch": 3.1666354264292407, + "grad_norm": 54624.0703125, + "learning_rate": 5.8395105724708686e-06, + "loss": 2.0256, + "step": 16894 + }, + { + "epoch": 3.1668228678537957, + "grad_norm": 53901.00390625, + "learning_rate": 5.835825929768113e-06, + "loss": 2.0959, + "step": 16895 + }, + { + "epoch": 3.1670103092783504, + "grad_norm": 53445.6796875, + "learning_rate": 5.83214237786317e-06, + "loss": 2.0761, + "step": 16896 + }, + { + "epoch": 3.1671977507029054, + "grad_norm": 52338.04296875, + "learning_rate": 5.828459916847051e-06, + "loss": 2.0544, + "step": 16897 + }, + { + "epoch": 3.16738519212746, + "grad_norm": 48907.76953125, + "learning_rate": 5.824778546810716e-06, + "loss": 2.1086, + "step": 16898 + }, + { + "epoch": 3.167572633552015, + "grad_norm": 65185.69140625, + "learning_rate": 5.821098267845071e-06, + "loss": 2.1087, + "step": 16899 + }, + { + "epoch": 3.1677600749765698, + "grad_norm": 55552.46484375, + "learning_rate": 5.817419080041014e-06, + "loss": 2.0136, + "step": 16900 + }, + { + "epoch": 3.167947516401125, + "grad_norm": 55573.19921875, + "learning_rate": 5.813740983489429e-06, + "loss": 2.1074, + "step": 16901 + }, + { + "epoch": 3.1681349578256794, + "grad_norm": 53282.73046875, + "learning_rate": 5.810063978281155e-06, + "loss": 2.1119, + "step": 16902 + }, + { + "epoch": 3.1683223992502345, + "grad_norm": 56054.06640625, + "learning_rate": 5.806388064506996e-06, + "loss": 2.0201, + "step": 16903 + }, + { + "epoch": 3.168509840674789, + "grad_norm": 58322.18359375, + "learning_rate": 5.802713242257757e-06, + "loss": 2.1106, + "step": 16904 + }, + { + "epoch": 3.1686972820993438, + "grad_norm": 57944.5390625, + "learning_rate": 5.7990395116242046e-06, + "loss": 2.0692, + "step": 16905 + }, + { + "epoch": 3.168884723523899, + "grad_norm": 52782.328125, + "learning_rate": 5.795366872697072e-06, + "loss": 2.0585, + "step": 16906 + }, + { + "epoch": 3.1690721649484535, + "grad_norm": 58756.26953125, + "learning_rate": 5.791695325567054e-06, + "loss": 2.0941, + "step": 16907 + }, + { + "epoch": 3.1692596063730085, + "grad_norm": 60440.69921875, + "learning_rate": 5.788024870324854e-06, + "loss": 2.0459, + "step": 16908 + }, + { + "epoch": 3.169447047797563, + "grad_norm": 59067.37890625, + "learning_rate": 5.784355507061117e-06, + "loss": 2.1165, + "step": 16909 + }, + { + "epoch": 3.169634489222118, + "grad_norm": 51190.0078125, + "learning_rate": 5.780687235866466e-06, + "loss": 2.1028, + "step": 16910 + }, + { + "epoch": 3.169821930646673, + "grad_norm": 60432.171875, + "learning_rate": 5.777020056831512e-06, + "loss": 2.0591, + "step": 16911 + }, + { + "epoch": 3.170009372071228, + "grad_norm": 52653.44140625, + "learning_rate": 5.77335397004683e-06, + "loss": 2.047, + "step": 16912 + }, + { + "epoch": 3.1701968134957825, + "grad_norm": 56322.1875, + "learning_rate": 5.769688975602966e-06, + "loss": 2.0659, + "step": 16913 + }, + { + "epoch": 3.1703842549203376, + "grad_norm": 54544.078125, + "learning_rate": 5.766025073590431e-06, + "loss": 2.083, + "step": 16914 + }, + { + "epoch": 3.170571696344892, + "grad_norm": 53810.40625, + "learning_rate": 5.762362264099735e-06, + "loss": 2.0426, + "step": 16915 + }, + { + "epoch": 3.170759137769447, + "grad_norm": 57558.1484375, + "learning_rate": 5.758700547221324e-06, + "loss": 2.0708, + "step": 16916 + }, + { + "epoch": 3.170946579194002, + "grad_norm": 56207.63671875, + "learning_rate": 5.755039923045663e-06, + "loss": 2.0895, + "step": 16917 + }, + { + "epoch": 3.1711340206185565, + "grad_norm": 53070.375, + "learning_rate": 5.751380391663147e-06, + "loss": 2.0297, + "step": 16918 + }, + { + "epoch": 3.1713214620431116, + "grad_norm": 59379.8828125, + "learning_rate": 5.747721953164159e-06, + "loss": 2.1266, + "step": 16919 + }, + { + "epoch": 3.1715089034676662, + "grad_norm": 52853.25390625, + "learning_rate": 5.744064607639066e-06, + "loss": 2.0973, + "step": 16920 + }, + { + "epoch": 3.1716963448922213, + "grad_norm": 52504.93359375, + "learning_rate": 5.7404083551782065e-06, + "loss": 2.0459, + "step": 16921 + }, + { + "epoch": 3.171883786316776, + "grad_norm": 54091.484375, + "learning_rate": 5.73675319587188e-06, + "loss": 2.0655, + "step": 16922 + }, + { + "epoch": 3.172071227741331, + "grad_norm": 54688.359375, + "learning_rate": 5.733099129810349e-06, + "loss": 2.0771, + "step": 16923 + }, + { + "epoch": 3.1722586691658856, + "grad_norm": 53548.7265625, + "learning_rate": 5.7294461570838895e-06, + "loss": 2.0583, + "step": 16924 + }, + { + "epoch": 3.1724461105904407, + "grad_norm": 56007.734375, + "learning_rate": 5.725794277782709e-06, + "loss": 2.047, + "step": 16925 + }, + { + "epoch": 3.1726335520149953, + "grad_norm": 60252.32421875, + "learning_rate": 5.722143491997001e-06, + "loss": 2.062, + "step": 16926 + }, + { + "epoch": 3.17282099343955, + "grad_norm": 53784.29296875, + "learning_rate": 5.7184937998169375e-06, + "loss": 2.0161, + "step": 16927 + }, + { + "epoch": 3.173008434864105, + "grad_norm": 59813.1796875, + "learning_rate": 5.714845201332675e-06, + "loss": 2.0896, + "step": 16928 + }, + { + "epoch": 3.1731958762886596, + "grad_norm": 56618.20703125, + "learning_rate": 5.711197696634324e-06, + "loss": 2.0988, + "step": 16929 + }, + { + "epoch": 3.1733833177132147, + "grad_norm": 50864.76171875, + "learning_rate": 5.707551285811957e-06, + "loss": 2.0535, + "step": 16930 + }, + { + "epoch": 3.1735707591377693, + "grad_norm": 53597.078125, + "learning_rate": 5.703905968955653e-06, + "loss": 2.0846, + "step": 16931 + }, + { + "epoch": 3.1737582005623244, + "grad_norm": 55788.98046875, + "learning_rate": 5.700261746155444e-06, + "loss": 2.0914, + "step": 16932 + }, + { + "epoch": 3.173945641986879, + "grad_norm": 52249.6875, + "learning_rate": 5.696618617501326e-06, + "loss": 2.0265, + "step": 16933 + }, + { + "epoch": 3.174133083411434, + "grad_norm": 54973.2890625, + "learning_rate": 5.692976583083287e-06, + "loss": 1.9889, + "step": 16934 + }, + { + "epoch": 3.1743205248359887, + "grad_norm": 55141.37890625, + "learning_rate": 5.689335642991295e-06, + "loss": 2.0663, + "step": 16935 + }, + { + "epoch": 3.1745079662605438, + "grad_norm": 57474.6640625, + "learning_rate": 5.685695797315255e-06, + "loss": 2.0621, + "step": 16936 + }, + { + "epoch": 3.1746954076850984, + "grad_norm": 53328.24609375, + "learning_rate": 5.6820570461450675e-06, + "loss": 2.1015, + "step": 16937 + }, + { + "epoch": 3.174882849109653, + "grad_norm": 56391.5546875, + "learning_rate": 5.678419389570627e-06, + "loss": 2.0511, + "step": 16938 + }, + { + "epoch": 3.175070290534208, + "grad_norm": 56097.05859375, + "learning_rate": 5.674782827681757e-06, + "loss": 2.0457, + "step": 16939 + }, + { + "epoch": 3.1752577319587627, + "grad_norm": 60714.93359375, + "learning_rate": 5.671147360568274e-06, + "loss": 2.1565, + "step": 16940 + }, + { + "epoch": 3.1754451733833178, + "grad_norm": 54591.484375, + "learning_rate": 5.667512988319984e-06, + "loss": 2.0405, + "step": 16941 + }, + { + "epoch": 3.1756326148078724, + "grad_norm": 53602.703125, + "learning_rate": 5.663879711026648e-06, + "loss": 2.2049, + "step": 16942 + }, + { + "epoch": 3.1758200562324275, + "grad_norm": 61626.25390625, + "learning_rate": 5.660247528778001e-06, + "loss": 2.1182, + "step": 16943 + }, + { + "epoch": 3.176007497656982, + "grad_norm": 53297.79296875, + "learning_rate": 5.656616441663748e-06, + "loss": 2.0676, + "step": 16944 + }, + { + "epoch": 3.176194939081537, + "grad_norm": 48246.80078125, + "learning_rate": 5.652986449773584e-06, + "loss": 2.1246, + "step": 16945 + }, + { + "epoch": 3.176382380506092, + "grad_norm": 55123.1171875, + "learning_rate": 5.649357553197149e-06, + "loss": 2.0632, + "step": 16946 + }, + { + "epoch": 3.176569821930647, + "grad_norm": 55409.453125, + "learning_rate": 5.6457297520240874e-06, + "loss": 2.086, + "step": 16947 + }, + { + "epoch": 3.1767572633552015, + "grad_norm": 62220.87890625, + "learning_rate": 5.6421030463439995e-06, + "loss": 2.0822, + "step": 16948 + }, + { + "epoch": 3.176944704779756, + "grad_norm": 56446.64453125, + "learning_rate": 5.638477436246442e-06, + "loss": 2.0655, + "step": 16949 + }, + { + "epoch": 3.177132146204311, + "grad_norm": 51181.48828125, + "learning_rate": 5.634852921820976e-06, + "loss": 2.0479, + "step": 16950 + }, + { + "epoch": 3.177319587628866, + "grad_norm": 59432.05859375, + "learning_rate": 5.631229503157132e-06, + "loss": 2.0618, + "step": 16951 + }, + { + "epoch": 3.177507029053421, + "grad_norm": 52396.640625, + "learning_rate": 5.627607180344396e-06, + "loss": 2.0816, + "step": 16952 + }, + { + "epoch": 3.1776944704779755, + "grad_norm": 54477.90234375, + "learning_rate": 5.623985953472222e-06, + "loss": 2.0438, + "step": 16953 + }, + { + "epoch": 3.1778819119025306, + "grad_norm": 57968.35546875, + "learning_rate": 5.620365822630075e-06, + "loss": 2.1077, + "step": 16954 + }, + { + "epoch": 3.178069353327085, + "grad_norm": 60200.76171875, + "learning_rate": 5.616746787907351e-06, + "loss": 2.1027, + "step": 16955 + }, + { + "epoch": 3.1782567947516402, + "grad_norm": 60478.84375, + "learning_rate": 5.613128849393423e-06, + "loss": 2.0528, + "step": 16956 + }, + { + "epoch": 3.178444236176195, + "grad_norm": 49135.9140625, + "learning_rate": 5.60951200717767e-06, + "loss": 2.06, + "step": 16957 + }, + { + "epoch": 3.17863167760075, + "grad_norm": 59702.77734375, + "learning_rate": 5.605896261349436e-06, + "loss": 1.9975, + "step": 16958 + }, + { + "epoch": 3.1788191190253046, + "grad_norm": 51405.05078125, + "learning_rate": 5.6022816119979836e-06, + "loss": 2.0717, + "step": 16959 + }, + { + "epoch": 3.179006560449859, + "grad_norm": 54572.19921875, + "learning_rate": 5.598668059212614e-06, + "loss": 2.0591, + "step": 16960 + }, + { + "epoch": 3.1791940018744143, + "grad_norm": 52896.2890625, + "learning_rate": 5.595055603082588e-06, + "loss": 2.141, + "step": 16961 + }, + { + "epoch": 3.179381443298969, + "grad_norm": 60740.4375, + "learning_rate": 5.591444243697114e-06, + "loss": 1.9901, + "step": 16962 + }, + { + "epoch": 3.179568884723524, + "grad_norm": 57729.5859375, + "learning_rate": 5.587833981145379e-06, + "loss": 2.09, + "step": 16963 + }, + { + "epoch": 3.1797563261480786, + "grad_norm": 52007.77734375, + "learning_rate": 5.5842248155165696e-06, + "loss": 2.0994, + "step": 16964 + }, + { + "epoch": 3.1799437675726336, + "grad_norm": 53266.84375, + "learning_rate": 5.580616746899836e-06, + "loss": 2.0447, + "step": 16965 + }, + { + "epoch": 3.1801312089971883, + "grad_norm": 64039.30859375, + "learning_rate": 5.577009775384262e-06, + "loss": 2.0827, + "step": 16966 + }, + { + "epoch": 3.1803186504217433, + "grad_norm": 60433.3203125, + "learning_rate": 5.573403901058949e-06, + "loss": 2.0801, + "step": 16967 + }, + { + "epoch": 3.180506091846298, + "grad_norm": 54196.7890625, + "learning_rate": 5.56979912401297e-06, + "loss": 2.1308, + "step": 16968 + }, + { + "epoch": 3.180693533270853, + "grad_norm": 59680.10546875, + "learning_rate": 5.566195444335348e-06, + "loss": 2.0903, + "step": 16969 + }, + { + "epoch": 3.1808809746954076, + "grad_norm": 51897.60546875, + "learning_rate": 5.562592862115079e-06, + "loss": 2.0786, + "step": 16970 + }, + { + "epoch": 3.1810684161199627, + "grad_norm": 55081.390625, + "learning_rate": 5.5589913774411636e-06, + "loss": 2.0304, + "step": 16971 + }, + { + "epoch": 3.1812558575445173, + "grad_norm": 53910.890625, + "learning_rate": 5.555390990402531e-06, + "loss": 2.0894, + "step": 16972 + }, + { + "epoch": 3.181443298969072, + "grad_norm": 55257.8984375, + "learning_rate": 5.551791701088132e-06, + "loss": 2.1377, + "step": 16973 + }, + { + "epoch": 3.181630740393627, + "grad_norm": 54388.04296875, + "learning_rate": 5.548193509586835e-06, + "loss": 2.099, + "step": 16974 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 60848.5, + "learning_rate": 5.544596415987535e-06, + "loss": 2.0111, + "step": 16975 + }, + { + "epoch": 3.1820056232427367, + "grad_norm": 56246.92578125, + "learning_rate": 5.541000420379061e-06, + "loss": 2.1068, + "step": 16976 + }, + { + "epoch": 3.1821930646672913, + "grad_norm": 57657.0390625, + "learning_rate": 5.5374055228502415e-06, + "loss": 2.0572, + "step": 16977 + }, + { + "epoch": 3.1823805060918464, + "grad_norm": 55698.453125, + "learning_rate": 5.533811723489863e-06, + "loss": 2.0921, + "step": 16978 + }, + { + "epoch": 3.182567947516401, + "grad_norm": 59654.09765625, + "learning_rate": 5.530219022386668e-06, + "loss": 2.0801, + "step": 16979 + }, + { + "epoch": 3.182755388940956, + "grad_norm": 61373.25390625, + "learning_rate": 5.526627419629421e-06, + "loss": 2.1419, + "step": 16980 + }, + { + "epoch": 3.1829428303655107, + "grad_norm": 57661.28125, + "learning_rate": 5.523036915306801e-06, + "loss": 2.0884, + "step": 16981 + }, + { + "epoch": 3.183130271790066, + "grad_norm": 53680.20703125, + "learning_rate": 5.51944750950752e-06, + "loss": 2.0829, + "step": 16982 + }, + { + "epoch": 3.1833177132146204, + "grad_norm": 57932.6953125, + "learning_rate": 5.5158592023202005e-06, + "loss": 2.0876, + "step": 16983 + }, + { + "epoch": 3.183505154639175, + "grad_norm": 52586.78515625, + "learning_rate": 5.512271993833501e-06, + "loss": 2.0679, + "step": 16984 + }, + { + "epoch": 3.18369259606373, + "grad_norm": 57371.3828125, + "learning_rate": 5.508685884136e-06, + "loss": 2.1111, + "step": 16985 + }, + { + "epoch": 3.1838800374882847, + "grad_norm": 51858.71484375, + "learning_rate": 5.50510087331626e-06, + "loss": 2.07, + "step": 16986 + }, + { + "epoch": 3.18406747891284, + "grad_norm": 61467.91015625, + "learning_rate": 5.501516961462844e-06, + "loss": 2.0558, + "step": 16987 + }, + { + "epoch": 3.1842549203373944, + "grad_norm": 53539.16015625, + "learning_rate": 5.497934148664286e-06, + "loss": 2.0993, + "step": 16988 + }, + { + "epoch": 3.1844423617619495, + "grad_norm": 56973.87890625, + "learning_rate": 5.494352435009037e-06, + "loss": 1.9859, + "step": 16989 + }, + { + "epoch": 3.184629803186504, + "grad_norm": 54614.6328125, + "learning_rate": 5.4907718205855825e-06, + "loss": 2.0634, + "step": 16990 + }, + { + "epoch": 3.184817244611059, + "grad_norm": 55493.69140625, + "learning_rate": 5.487192305482363e-06, + "loss": 2.0619, + "step": 16991 + }, + { + "epoch": 3.185004686035614, + "grad_norm": 56823.56640625, + "learning_rate": 5.483613889787781e-06, + "loss": 2.1186, + "step": 16992 + }, + { + "epoch": 3.185192127460169, + "grad_norm": 54754.78515625, + "learning_rate": 5.480036573590214e-06, + "loss": 2.0177, + "step": 16993 + }, + { + "epoch": 3.1853795688847235, + "grad_norm": 58439.6328125, + "learning_rate": 5.476460356978031e-06, + "loss": 2.1381, + "step": 16994 + }, + { + "epoch": 3.1855670103092786, + "grad_norm": 52478.671875, + "learning_rate": 5.472885240039549e-06, + "loss": 2.0682, + "step": 16995 + }, + { + "epoch": 3.185754451733833, + "grad_norm": 54926.06640625, + "learning_rate": 5.469311222863066e-06, + "loss": 2.1066, + "step": 16996 + }, + { + "epoch": 3.185941893158388, + "grad_norm": 53522.84765625, + "learning_rate": 5.465738305536855e-06, + "loss": 2.0679, + "step": 16997 + }, + { + "epoch": 3.186129334582943, + "grad_norm": 48241.8046875, + "learning_rate": 5.462166488149184e-06, + "loss": 2.0257, + "step": 16998 + }, + { + "epoch": 3.1863167760074975, + "grad_norm": 55512.1484375, + "learning_rate": 5.458595770788255e-06, + "loss": 2.0744, + "step": 16999 + }, + { + "epoch": 3.1865042174320526, + "grad_norm": 52793.734375, + "learning_rate": 5.455026153542247e-06, + "loss": 2.0786, + "step": 17000 + }, + { + "epoch": 3.1865042174320526, + "eval_loss": 2.258635997772217, + "eval_runtime": 128.4706, + "eval_samples_per_second": 39.301, + "eval_steps_per_second": 1.969, + "step": 17000 + }, + { + "epoch": 3.186691658856607, + "grad_norm": 58757.1796875, + "learning_rate": 5.451457636499352e-06, + "loss": 2.2419, + "step": 17001 + }, + { + "epoch": 3.1868791002811623, + "grad_norm": 61042.43359375, + "learning_rate": 5.447890219747686e-06, + "loss": 2.0349, + "step": 17002 + }, + { + "epoch": 3.187066541705717, + "grad_norm": 52778.6171875, + "learning_rate": 5.444323903375381e-06, + "loss": 2.1154, + "step": 17003 + }, + { + "epoch": 3.187253983130272, + "grad_norm": 54502.51171875, + "learning_rate": 5.440758687470498e-06, + "loss": 1.9896, + "step": 17004 + }, + { + "epoch": 3.1874414245548266, + "grad_norm": 51397.23828125, + "learning_rate": 5.437194572121113e-06, + "loss": 2.1252, + "step": 17005 + }, + { + "epoch": 3.1876288659793817, + "grad_norm": 59889.625, + "learning_rate": 5.433631557415248e-06, + "loss": 2.0186, + "step": 17006 + }, + { + "epoch": 3.1878163074039363, + "grad_norm": 54716.10546875, + "learning_rate": 5.430069643440888e-06, + "loss": 1.9565, + "step": 17007 + }, + { + "epoch": 3.188003748828491, + "grad_norm": 61895.8359375, + "learning_rate": 5.426508830286031e-06, + "loss": 2.0649, + "step": 17008 + }, + { + "epoch": 3.188191190253046, + "grad_norm": 58718.390625, + "learning_rate": 5.422949118038606e-06, + "loss": 2.1166, + "step": 17009 + }, + { + "epoch": 3.1883786316776006, + "grad_norm": 57179.48828125, + "learning_rate": 5.419390506786559e-06, + "loss": 2.0993, + "step": 17010 + }, + { + "epoch": 3.1885660731021557, + "grad_norm": 61470.22265625, + "learning_rate": 5.415832996617759e-06, + "loss": 2.0646, + "step": 17011 + }, + { + "epoch": 3.1887535145267103, + "grad_norm": 56600.4296875, + "learning_rate": 5.412276587620074e-06, + "loss": 2.0755, + "step": 17012 + }, + { + "epoch": 3.1889409559512654, + "grad_norm": 55994.71875, + "learning_rate": 5.408721279881346e-06, + "loss": 2.1118, + "step": 17013 + }, + { + "epoch": 3.18912839737582, + "grad_norm": 55743.96484375, + "learning_rate": 5.405167073489404e-06, + "loss": 2.0506, + "step": 17014 + }, + { + "epoch": 3.189315838800375, + "grad_norm": 52676.75, + "learning_rate": 5.40161396853201e-06, + "loss": 2.0945, + "step": 17015 + }, + { + "epoch": 3.1895032802249297, + "grad_norm": 62665.80078125, + "learning_rate": 5.3980619650969235e-06, + "loss": 2.0122, + "step": 17016 + }, + { + "epoch": 3.1896907216494848, + "grad_norm": 53582.33984375, + "learning_rate": 5.394511063271879e-06, + "loss": 2.063, + "step": 17017 + }, + { + "epoch": 3.1898781630740394, + "grad_norm": 58599.95703125, + "learning_rate": 5.390961263144595e-06, + "loss": 2.1061, + "step": 17018 + }, + { + "epoch": 3.190065604498594, + "grad_norm": 54836.62109375, + "learning_rate": 5.3874125648027184e-06, + "loss": 2.0741, + "step": 17019 + }, + { + "epoch": 3.190253045923149, + "grad_norm": 56937.68359375, + "learning_rate": 5.383864968333907e-06, + "loss": 2.0332, + "step": 17020 + }, + { + "epoch": 3.1904404873477037, + "grad_norm": 57741.2109375, + "learning_rate": 5.380318473825791e-06, + "loss": 2.0996, + "step": 17021 + }, + { + "epoch": 3.1906279287722588, + "grad_norm": 53802.765625, + "learning_rate": 5.376773081365966e-06, + "loss": 2.0818, + "step": 17022 + }, + { + "epoch": 3.1908153701968134, + "grad_norm": 58026.79296875, + "learning_rate": 5.3732287910419735e-06, + "loss": 2.1296, + "step": 17023 + }, + { + "epoch": 3.1910028116213685, + "grad_norm": 57285.68359375, + "learning_rate": 5.369685602941388e-06, + "loss": 2.0144, + "step": 17024 + }, + { + "epoch": 3.191190253045923, + "grad_norm": 55034.0546875, + "learning_rate": 5.366143517151695e-06, + "loss": 2.0827, + "step": 17025 + }, + { + "epoch": 3.191377694470478, + "grad_norm": 56937.3203125, + "learning_rate": 5.362602533760386e-06, + "loss": 2.0918, + "step": 17026 + }, + { + "epoch": 3.1915651358950328, + "grad_norm": 60143.8125, + "learning_rate": 5.3590626528549185e-06, + "loss": 2.069, + "step": 17027 + }, + { + "epoch": 3.191752577319588, + "grad_norm": 57997.0859375, + "learning_rate": 5.3555238745227335e-06, + "loss": 2.0732, + "step": 17028 + }, + { + "epoch": 3.1919400187441425, + "grad_norm": 61171.140625, + "learning_rate": 5.351986198851233e-06, + "loss": 2.0975, + "step": 17029 + }, + { + "epoch": 3.192127460168697, + "grad_norm": 55544.55078125, + "learning_rate": 5.348449625927771e-06, + "loss": 2.057, + "step": 17030 + }, + { + "epoch": 3.192314901593252, + "grad_norm": 55194.44921875, + "learning_rate": 5.34491415583972e-06, + "loss": 1.983, + "step": 17031 + }, + { + "epoch": 3.1925023430178068, + "grad_norm": 55547.49609375, + "learning_rate": 5.341379788674389e-06, + "loss": 2.0836, + "step": 17032 + }, + { + "epoch": 3.192689784442362, + "grad_norm": 60871.86328125, + "learning_rate": 5.33784652451908e-06, + "loss": 2.0223, + "step": 17033 + }, + { + "epoch": 3.1928772258669165, + "grad_norm": 52954.7265625, + "learning_rate": 5.3343143634610515e-06, + "loss": 2.1063, + "step": 17034 + }, + { + "epoch": 3.1930646672914715, + "grad_norm": 51920.09765625, + "learning_rate": 5.330783305587555e-06, + "loss": 2.0545, + "step": 17035 + }, + { + "epoch": 3.193252108716026, + "grad_norm": 55752.58984375, + "learning_rate": 5.3272533509858e-06, + "loss": 2.0456, + "step": 17036 + }, + { + "epoch": 3.1934395501405812, + "grad_norm": 51926.91796875, + "learning_rate": 5.323724499742955e-06, + "loss": 2.0665, + "step": 17037 + }, + { + "epoch": 3.193626991565136, + "grad_norm": 51829.08984375, + "learning_rate": 5.3201967519462045e-06, + "loss": 2.1246, + "step": 17038 + }, + { + "epoch": 3.193814432989691, + "grad_norm": 53326.875, + "learning_rate": 5.316670107682653e-06, + "loss": 2.031, + "step": 17039 + }, + { + "epoch": 3.1940018744142455, + "grad_norm": 55824.33203125, + "learning_rate": 5.31314456703943e-06, + "loss": 2.074, + "step": 17040 + }, + { + "epoch": 3.1941893158388, + "grad_norm": 53194.9765625, + "learning_rate": 5.309620130103593e-06, + "loss": 2.0546, + "step": 17041 + }, + { + "epoch": 3.1943767572633552, + "grad_norm": 58748.09375, + "learning_rate": 5.30609679696219e-06, + "loss": 2.1333, + "step": 17042 + }, + { + "epoch": 3.19456419868791, + "grad_norm": 58308.59765625, + "learning_rate": 5.302574567702251e-06, + "loss": 2.01, + "step": 17043 + }, + { + "epoch": 3.194751640112465, + "grad_norm": 51747.421875, + "learning_rate": 5.299053442410779e-06, + "loss": 2.0764, + "step": 17044 + }, + { + "epoch": 3.1949390815370196, + "grad_norm": 58563.4609375, + "learning_rate": 5.2955334211747306e-06, + "loss": 2.0594, + "step": 17045 + }, + { + "epoch": 3.1951265229615746, + "grad_norm": 54620.18359375, + "learning_rate": 5.292014504081039e-06, + "loss": 2.073, + "step": 17046 + }, + { + "epoch": 3.1953139643861292, + "grad_norm": 55089.81640625, + "learning_rate": 5.288496691216626e-06, + "loss": 2.0796, + "step": 17047 + }, + { + "epoch": 3.1955014058106843, + "grad_norm": 51887.5390625, + "learning_rate": 5.284979982668387e-06, + "loss": 2.0458, + "step": 17048 + }, + { + "epoch": 3.195688847235239, + "grad_norm": 56514.09375, + "learning_rate": 5.281464378523149e-06, + "loss": 2.0564, + "step": 17049 + }, + { + "epoch": 3.195876288659794, + "grad_norm": 60618.22265625, + "learning_rate": 5.277949878867766e-06, + "loss": 2.1098, + "step": 17050 + }, + { + "epoch": 3.1960637300843486, + "grad_norm": 58204.5234375, + "learning_rate": 5.274436483789058e-06, + "loss": 2.1024, + "step": 17051 + }, + { + "epoch": 3.1962511715089033, + "grad_norm": 56423.95703125, + "learning_rate": 5.270924193373761e-06, + "loss": 2.0289, + "step": 17052 + }, + { + "epoch": 3.1964386129334583, + "grad_norm": 52450.48046875, + "learning_rate": 5.267413007708649e-06, + "loss": 2.1298, + "step": 17053 + }, + { + "epoch": 3.196626054358013, + "grad_norm": 54644.609375, + "learning_rate": 5.263902926880443e-06, + "loss": 2.0757, + "step": 17054 + }, + { + "epoch": 3.196813495782568, + "grad_norm": 55016.6796875, + "learning_rate": 5.2603939509758336e-06, + "loss": 2.0191, + "step": 17055 + }, + { + "epoch": 3.1970009372071226, + "grad_norm": 55633.0, + "learning_rate": 5.25688608008148e-06, + "loss": 2.1557, + "step": 17056 + }, + { + "epoch": 3.1971883786316777, + "grad_norm": 52652.99609375, + "learning_rate": 5.253379314284029e-06, + "loss": 2.3209, + "step": 17057 + }, + { + "epoch": 3.1973758200562323, + "grad_norm": 55966.72265625, + "learning_rate": 5.249873653670107e-06, + "loss": 2.0886, + "step": 17058 + }, + { + "epoch": 3.1975632614807874, + "grad_norm": 63269.65625, + "learning_rate": 5.246369098326281e-06, + "loss": 2.0473, + "step": 17059 + }, + { + "epoch": 3.197750702905342, + "grad_norm": 57051.9765625, + "learning_rate": 5.242865648339112e-06, + "loss": 2.034, + "step": 17060 + }, + { + "epoch": 3.197938144329897, + "grad_norm": 50172.96484375, + "learning_rate": 5.239363303795136e-06, + "loss": 2.065, + "step": 17061 + }, + { + "epoch": 3.1981255857544517, + "grad_norm": 53453.2421875, + "learning_rate": 5.235862064780856e-06, + "loss": 2.029, + "step": 17062 + }, + { + "epoch": 3.1983130271790063, + "grad_norm": 50873.62890625, + "learning_rate": 5.2323619313827345e-06, + "loss": 2.1316, + "step": 17063 + }, + { + "epoch": 3.1985004686035614, + "grad_norm": 52864.7578125, + "learning_rate": 5.2288629036872425e-06, + "loss": 2.0741, + "step": 17064 + }, + { + "epoch": 3.198687910028116, + "grad_norm": 54115.91015625, + "learning_rate": 5.225364981780778e-06, + "loss": 2.0694, + "step": 17065 + }, + { + "epoch": 3.198875351452671, + "grad_norm": 57992.515625, + "learning_rate": 5.22186816574976e-06, + "loss": 2.0369, + "step": 17066 + }, + { + "epoch": 3.1990627928772257, + "grad_norm": 53593.9921875, + "learning_rate": 5.218372455680531e-06, + "loss": 2.1004, + "step": 17067 + }, + { + "epoch": 3.199250234301781, + "grad_norm": 58292.390625, + "learning_rate": 5.21487785165945e-06, + "loss": 2.124, + "step": 17068 + }, + { + "epoch": 3.1994376757263354, + "grad_norm": 58300.3125, + "learning_rate": 5.211384353772819e-06, + "loss": 1.9702, + "step": 17069 + }, + { + "epoch": 3.1996251171508905, + "grad_norm": 57897.4921875, + "learning_rate": 5.207891962106925e-06, + "loss": 2.0322, + "step": 17070 + }, + { + "epoch": 3.199812558575445, + "grad_norm": 56958.82421875, + "learning_rate": 5.2044006767480334e-06, + "loss": 2.1503, + "step": 17071 + }, + { + "epoch": 3.2, + "grad_norm": 58146.27734375, + "learning_rate": 5.200910497782352e-06, + "loss": 2.0831, + "step": 17072 + }, + { + "epoch": 3.200187441424555, + "grad_norm": 58466.578125, + "learning_rate": 5.1974214252961e-06, + "loss": 2.1131, + "step": 17073 + }, + { + "epoch": 3.2003748828491094, + "grad_norm": 58707.86328125, + "learning_rate": 5.193933459375461e-06, + "loss": 2.0848, + "step": 17074 + }, + { + "epoch": 3.2005623242736645, + "grad_norm": 55660.41796875, + "learning_rate": 5.1904466001065745e-06, + "loss": 2.0778, + "step": 17075 + }, + { + "epoch": 3.200749765698219, + "grad_norm": 55629.33203125, + "learning_rate": 5.186960847575551e-06, + "loss": 2.0346, + "step": 17076 + }, + { + "epoch": 3.200937207122774, + "grad_norm": 52105.56640625, + "learning_rate": 5.1834762018685035e-06, + "loss": 2.0478, + "step": 17077 + }, + { + "epoch": 3.201124648547329, + "grad_norm": 58534.43359375, + "learning_rate": 5.179992663071487e-06, + "loss": 2.1243, + "step": 17078 + }, + { + "epoch": 3.201312089971884, + "grad_norm": 60381.01171875, + "learning_rate": 5.176510231270531e-06, + "loss": 2.1375, + "step": 17079 + }, + { + "epoch": 3.2014995313964385, + "grad_norm": 57107.54296875, + "learning_rate": 5.173028906551663e-06, + "loss": 2.0471, + "step": 17080 + }, + { + "epoch": 3.2016869728209936, + "grad_norm": 53403.5, + "learning_rate": 5.169548689000875e-06, + "loss": 2.0602, + "step": 17081 + }, + { + "epoch": 3.201874414245548, + "grad_norm": 59142.06640625, + "learning_rate": 5.166069578704097e-06, + "loss": 2.0397, + "step": 17082 + }, + { + "epoch": 3.2020618556701033, + "grad_norm": 56222.8984375, + "learning_rate": 5.162591575747267e-06, + "loss": 2.073, + "step": 17083 + }, + { + "epoch": 3.202249297094658, + "grad_norm": 50972.83203125, + "learning_rate": 5.1591146802163095e-06, + "loss": 2.077, + "step": 17084 + }, + { + "epoch": 3.2024367385192125, + "grad_norm": 56999.296875, + "learning_rate": 5.1556388921970735e-06, + "loss": 2.0801, + "step": 17085 + }, + { + "epoch": 3.2026241799437676, + "grad_norm": 55727.00390625, + "learning_rate": 5.152164211775412e-06, + "loss": 2.0182, + "step": 17086 + }, + { + "epoch": 3.202811621368322, + "grad_norm": 57838.08984375, + "learning_rate": 5.148690639037157e-06, + "loss": 1.9753, + "step": 17087 + }, + { + "epoch": 3.2029990627928773, + "grad_norm": 57582.60546875, + "learning_rate": 5.145218174068078e-06, + "loss": 2.031, + "step": 17088 + }, + { + "epoch": 3.203186504217432, + "grad_norm": 56992.01171875, + "learning_rate": 5.141746816953968e-06, + "loss": 2.1134, + "step": 17089 + }, + { + "epoch": 3.203373945641987, + "grad_norm": 65119.2421875, + "learning_rate": 5.1382765677805474e-06, + "loss": 2.0279, + "step": 17090 + }, + { + "epoch": 3.2035613870665416, + "grad_norm": 56183.640625, + "learning_rate": 5.134807426633537e-06, + "loss": 2.1412, + "step": 17091 + }, + { + "epoch": 3.2037488284910967, + "grad_norm": 58247.0625, + "learning_rate": 5.131339393598611e-06, + "loss": 2.0183, + "step": 17092 + }, + { + "epoch": 3.2039362699156513, + "grad_norm": 64454.6328125, + "learning_rate": 5.127872468761424e-06, + "loss": 2.0992, + "step": 17093 + }, + { + "epoch": 3.2041237113402063, + "grad_norm": 59762.20703125, + "learning_rate": 5.124406652207614e-06, + "loss": 2.0584, + "step": 17094 + }, + { + "epoch": 3.204311152764761, + "grad_norm": 56911.40234375, + "learning_rate": 5.1209419440227726e-06, + "loss": 2.0076, + "step": 17095 + }, + { + "epoch": 3.204498594189316, + "grad_norm": 60457.28515625, + "learning_rate": 5.117478344292486e-06, + "loss": 2.058, + "step": 17096 + }, + { + "epoch": 3.2046860356138707, + "grad_norm": 52978.953125, + "learning_rate": 5.114015853102289e-06, + "loss": 2.0282, + "step": 17097 + }, + { + "epoch": 3.2048734770384253, + "grad_norm": 55183.37109375, + "learning_rate": 5.110554470537709e-06, + "loss": 2.063, + "step": 17098 + }, + { + "epoch": 3.2050609184629804, + "grad_norm": 60939.88671875, + "learning_rate": 5.107094196684226e-06, + "loss": 2.0485, + "step": 17099 + }, + { + "epoch": 3.205248359887535, + "grad_norm": 57888.61328125, + "learning_rate": 5.103635031627319e-06, + "loss": 2.0914, + "step": 17100 + }, + { + "epoch": 3.20543580131209, + "grad_norm": 53031.86328125, + "learning_rate": 5.10017697545242e-06, + "loss": 2.1174, + "step": 17101 + }, + { + "epoch": 3.2056232427366447, + "grad_norm": 53712.9609375, + "learning_rate": 5.096720028244928e-06, + "loss": 2.0958, + "step": 17102 + }, + { + "epoch": 3.2058106841611997, + "grad_norm": 55610.67578125, + "learning_rate": 5.093264190090235e-06, + "loss": 2.0697, + "step": 17103 + }, + { + "epoch": 3.2059981255857544, + "grad_norm": 56588.640625, + "learning_rate": 5.089809461073702e-06, + "loss": 2.0961, + "step": 17104 + }, + { + "epoch": 3.2061855670103094, + "grad_norm": 58032.1640625, + "learning_rate": 5.086355841280638e-06, + "loss": 2.052, + "step": 17105 + }, + { + "epoch": 3.206373008434864, + "grad_norm": 51929.9765625, + "learning_rate": 5.082903330796352e-06, + "loss": 2.1032, + "step": 17106 + }, + { + "epoch": 3.206560449859419, + "grad_norm": 54484.3828125, + "learning_rate": 5.079451929706125e-06, + "loss": 2.0089, + "step": 17107 + }, + { + "epoch": 3.2067478912839738, + "grad_norm": 55319.265625, + "learning_rate": 5.076001638095201e-06, + "loss": 2.0782, + "step": 17108 + }, + { + "epoch": 3.206935332708529, + "grad_norm": 55265.89453125, + "learning_rate": 5.072552456048779e-06, + "loss": 2.1077, + "step": 17109 + }, + { + "epoch": 3.2071227741330834, + "grad_norm": 54977.59375, + "learning_rate": 5.069104383652063e-06, + "loss": 2.0995, + "step": 17110 + }, + { + "epoch": 3.207310215557638, + "grad_norm": 56140.703125, + "learning_rate": 5.065657420990228e-06, + "loss": 2.083, + "step": 17111 + }, + { + "epoch": 3.207497656982193, + "grad_norm": 53537.2578125, + "learning_rate": 5.062211568148384e-06, + "loss": 2.0698, + "step": 17112 + }, + { + "epoch": 3.2076850984067478, + "grad_norm": 56428.47265625, + "learning_rate": 5.058766825211647e-06, + "loss": 2.0762, + "step": 17113 + }, + { + "epoch": 3.207872539831303, + "grad_norm": 52901.7578125, + "learning_rate": 5.055323192265115e-06, + "loss": 2.0918, + "step": 17114 + }, + { + "epoch": 3.2080599812558575, + "grad_norm": 53532.24609375, + "learning_rate": 5.051880669393827e-06, + "loss": 2.0921, + "step": 17115 + }, + { + "epoch": 3.2082474226804125, + "grad_norm": 54878.046875, + "learning_rate": 5.048439256682802e-06, + "loss": 1.9913, + "step": 17116 + }, + { + "epoch": 3.208434864104967, + "grad_norm": 55601.23046875, + "learning_rate": 5.044998954217056e-06, + "loss": 2.0689, + "step": 17117 + }, + { + "epoch": 3.208622305529522, + "grad_norm": 55279.3984375, + "learning_rate": 5.0415597620815545e-06, + "loss": 1.9806, + "step": 17118 + }, + { + "epoch": 3.208809746954077, + "grad_norm": 55421.01171875, + "learning_rate": 5.038121680361224e-06, + "loss": 2.0381, + "step": 17119 + }, + { + "epoch": 3.208997188378632, + "grad_norm": 55867.921875, + "learning_rate": 5.034684709140991e-06, + "loss": 2.1201, + "step": 17120 + }, + { + "epoch": 3.2091846298031865, + "grad_norm": 53357.84765625, + "learning_rate": 5.0312488485057654e-06, + "loss": 2.0327, + "step": 17121 + }, + { + "epoch": 3.209372071227741, + "grad_norm": 60252.8046875, + "learning_rate": 5.027814098540385e-06, + "loss": 2.0014, + "step": 17122 + }, + { + "epoch": 3.209559512652296, + "grad_norm": 53052.078125, + "learning_rate": 5.0243804593296755e-06, + "loss": 2.1103, + "step": 17123 + }, + { + "epoch": 3.209746954076851, + "grad_norm": 55479.375, + "learning_rate": 5.020947930958469e-06, + "loss": 2.0876, + "step": 17124 + }, + { + "epoch": 3.209934395501406, + "grad_norm": 56610.1328125, + "learning_rate": 5.017516513511527e-06, + "loss": 2.0647, + "step": 17125 + }, + { + "epoch": 3.2101218369259605, + "grad_norm": 59107.73828125, + "learning_rate": 5.014086207073615e-06, + "loss": 1.9414, + "step": 17126 + }, + { + "epoch": 3.2103092783505156, + "grad_norm": 60335.51171875, + "learning_rate": 5.010657011729436e-06, + "loss": 1.9894, + "step": 17127 + }, + { + "epoch": 3.2104967197750702, + "grad_norm": 66497.59375, + "learning_rate": 5.007228927563712e-06, + "loss": 2.1247, + "step": 17128 + }, + { + "epoch": 3.2106841611996253, + "grad_norm": 56956.4921875, + "learning_rate": 5.003801954661086e-06, + "loss": 2.0978, + "step": 17129 + }, + { + "epoch": 3.21087160262418, + "grad_norm": 53081.65625, + "learning_rate": 5.000376093106224e-06, + "loss": 2.1007, + "step": 17130 + }, + { + "epoch": 3.211059044048735, + "grad_norm": 56504.5, + "learning_rate": 4.99695134298373e-06, + "loss": 2.072, + "step": 17131 + }, + { + "epoch": 3.2112464854732896, + "grad_norm": 55617.3984375, + "learning_rate": 4.993527704378181e-06, + "loss": 2.0476, + "step": 17132 + }, + { + "epoch": 3.2114339268978442, + "grad_norm": 55333.4453125, + "learning_rate": 4.99010517737416e-06, + "loss": 2.0176, + "step": 17133 + }, + { + "epoch": 3.2116213683223993, + "grad_norm": 65147.5625, + "learning_rate": 4.986683762056182e-06, + "loss": 2.0386, + "step": 17134 + }, + { + "epoch": 3.211808809746954, + "grad_norm": 55042.2421875, + "learning_rate": 4.983263458508741e-06, + "loss": 2.0662, + "step": 17135 + }, + { + "epoch": 3.211996251171509, + "grad_norm": 54296.25, + "learning_rate": 4.97984426681633e-06, + "loss": 2.0489, + "step": 17136 + }, + { + "epoch": 3.2121836925960636, + "grad_norm": 56995.2265625, + "learning_rate": 4.97642618706341e-06, + "loss": 2.0381, + "step": 17137 + }, + { + "epoch": 3.2123711340206187, + "grad_norm": 56888.09375, + "learning_rate": 4.973009219334385e-06, + "loss": 2.0309, + "step": 17138 + }, + { + "epoch": 3.2125585754451733, + "grad_norm": 55293.23046875, + "learning_rate": 4.969593363713643e-06, + "loss": 2.0363, + "step": 17139 + }, + { + "epoch": 3.2127460168697284, + "grad_norm": 54055.65234375, + "learning_rate": 4.966178620285572e-06, + "loss": 2.0527, + "step": 17140 + }, + { + "epoch": 3.212933458294283, + "grad_norm": 55578.47265625, + "learning_rate": 4.9627649891345055e-06, + "loss": 2.0697, + "step": 17141 + }, + { + "epoch": 3.213120899718838, + "grad_norm": 55061.53515625, + "learning_rate": 4.959352470344742e-06, + "loss": 2.0056, + "step": 17142 + }, + { + "epoch": 3.2133083411433927, + "grad_norm": 58103.40625, + "learning_rate": 4.9559410640005744e-06, + "loss": 2.1339, + "step": 17143 + }, + { + "epoch": 3.2134957825679473, + "grad_norm": 55471.609375, + "learning_rate": 4.95253077018627e-06, + "loss": 2.2113, + "step": 17144 + }, + { + "epoch": 3.2136832239925024, + "grad_norm": 51840.3984375, + "learning_rate": 4.949121588986049e-06, + "loss": 2.022, + "step": 17145 + }, + { + "epoch": 3.213870665417057, + "grad_norm": 59083.0625, + "learning_rate": 4.945713520484113e-06, + "loss": 2.0461, + "step": 17146 + }, + { + "epoch": 3.214058106841612, + "grad_norm": 52034.52734375, + "learning_rate": 4.942306564764643e-06, + "loss": 2.0366, + "step": 17147 + }, + { + "epoch": 3.2142455482661667, + "grad_norm": 53463.671875, + "learning_rate": 4.938900721911788e-06, + "loss": 2.049, + "step": 17148 + }, + { + "epoch": 3.2144329896907218, + "grad_norm": 59710.43359375, + "learning_rate": 4.935495992009648e-06, + "loss": 2.0623, + "step": 17149 + }, + { + "epoch": 3.2146204311152764, + "grad_norm": 56370.59765625, + "learning_rate": 4.9320923751423335e-06, + "loss": 2.0768, + "step": 17150 + }, + { + "epoch": 3.2148078725398315, + "grad_norm": 55108.6953125, + "learning_rate": 4.928689871393915e-06, + "loss": 2.0024, + "step": 17151 + }, + { + "epoch": 3.214995313964386, + "grad_norm": 56400.5546875, + "learning_rate": 4.925288480848417e-06, + "loss": 2.0669, + "step": 17152 + }, + { + "epoch": 3.215182755388941, + "grad_norm": 60521.6328125, + "learning_rate": 4.921888203589847e-06, + "loss": 2.1122, + "step": 17153 + }, + { + "epoch": 3.215370196813496, + "grad_norm": 53504.76171875, + "learning_rate": 4.918489039702201e-06, + "loss": 2.0816, + "step": 17154 + }, + { + "epoch": 3.2155576382380504, + "grad_norm": 52829.7265625, + "learning_rate": 4.915090989269422e-06, + "loss": 2.0718, + "step": 17155 + }, + { + "epoch": 3.2157450796626055, + "grad_norm": 55300.13671875, + "learning_rate": 4.911694052375448e-06, + "loss": 2.124, + "step": 17156 + }, + { + "epoch": 3.21593252108716, + "grad_norm": 63178.36328125, + "learning_rate": 4.908298229104174e-06, + "loss": 2.0848, + "step": 17157 + }, + { + "epoch": 3.216119962511715, + "grad_norm": 58967.921875, + "learning_rate": 4.904903519539461e-06, + "loss": 2.0765, + "step": 17158 + }, + { + "epoch": 3.21630740393627, + "grad_norm": 58113.87109375, + "learning_rate": 4.901509923765174e-06, + "loss": 2.1129, + "step": 17159 + }, + { + "epoch": 3.216494845360825, + "grad_norm": 54136.546875, + "learning_rate": 4.8981174418651135e-06, + "loss": 2.0978, + "step": 17160 + }, + { + "epoch": 3.2166822867853795, + "grad_norm": 55510.171875, + "learning_rate": 4.894726073923084e-06, + "loss": 2.039, + "step": 17161 + }, + { + "epoch": 3.2168697282099346, + "grad_norm": 59004.73046875, + "learning_rate": 4.89133582002283e-06, + "loss": 2.0971, + "step": 17162 + }, + { + "epoch": 3.217057169634489, + "grad_norm": 58906.40234375, + "learning_rate": 4.887946680248112e-06, + "loss": 2.1076, + "step": 17163 + }, + { + "epoch": 3.2172446110590442, + "grad_norm": 54064.09765625, + "learning_rate": 4.8845586546826196e-06, + "loss": 2.051, + "step": 17164 + }, + { + "epoch": 3.217432052483599, + "grad_norm": 56152.01953125, + "learning_rate": 4.881171743410024e-06, + "loss": 2.0716, + "step": 17165 + }, + { + "epoch": 3.2176194939081535, + "grad_norm": 56059.703125, + "learning_rate": 4.877785946513985e-06, + "loss": 2.1302, + "step": 17166 + }, + { + "epoch": 3.2178069353327086, + "grad_norm": 51894.1875, + "learning_rate": 4.874401264078149e-06, + "loss": 2.0713, + "step": 17167 + }, + { + "epoch": 3.217994376757263, + "grad_norm": 56882.7890625, + "learning_rate": 4.871017696186092e-06, + "loss": 2.1081, + "step": 17168 + }, + { + "epoch": 3.2181818181818183, + "grad_norm": 52923.36328125, + "learning_rate": 4.867635242921376e-06, + "loss": 2.0753, + "step": 17169 + }, + { + "epoch": 3.218369259606373, + "grad_norm": 51287.22265625, + "learning_rate": 4.864253904367566e-06, + "loss": 2.0514, + "step": 17170 + }, + { + "epoch": 3.218556701030928, + "grad_norm": 53517.25, + "learning_rate": 4.8608736806081685e-06, + "loss": 1.9994, + "step": 17171 + }, + { + "epoch": 3.2187441424554826, + "grad_norm": 54676.9296875, + "learning_rate": 4.8574945717266505e-06, + "loss": 2.0771, + "step": 17172 + }, + { + "epoch": 3.2189315838800376, + "grad_norm": 56988.1875, + "learning_rate": 4.854116577806495e-06, + "loss": 2.133, + "step": 17173 + }, + { + "epoch": 3.2191190253045923, + "grad_norm": 52116.109375, + "learning_rate": 4.850739698931145e-06, + "loss": 2.0706, + "step": 17174 + }, + { + "epoch": 3.2193064667291473, + "grad_norm": 53863.703125, + "learning_rate": 4.8473639351839685e-06, + "loss": 2.073, + "step": 17175 + }, + { + "epoch": 3.219493908153702, + "grad_norm": 57546.890625, + "learning_rate": 4.84398928664836e-06, + "loss": 2.0594, + "step": 17176 + }, + { + "epoch": 3.2196813495782566, + "grad_norm": 52158.38671875, + "learning_rate": 4.840615753407679e-06, + "loss": 2.0447, + "step": 17177 + }, + { + "epoch": 3.2198687910028116, + "grad_norm": 55517.69140625, + "learning_rate": 4.837243335545239e-06, + "loss": 2.0361, + "step": 17178 + }, + { + "epoch": 3.2200562324273663, + "grad_norm": 53248.73046875, + "learning_rate": 4.833872033144327e-06, + "loss": 2.0731, + "step": 17179 + }, + { + "epoch": 3.2202436738519213, + "grad_norm": 54216.59765625, + "learning_rate": 4.8305018462882266e-06, + "loss": 2.101, + "step": 17180 + }, + { + "epoch": 3.220431115276476, + "grad_norm": 54490.73046875, + "learning_rate": 4.827132775060156e-06, + "loss": 2.103, + "step": 17181 + }, + { + "epoch": 3.220618556701031, + "grad_norm": 53484.734375, + "learning_rate": 4.823764819543347e-06, + "loss": 2.1336, + "step": 17182 + }, + { + "epoch": 3.2208059981255857, + "grad_norm": 54821.453125, + "learning_rate": 4.820397979820967e-06, + "loss": 2.1014, + "step": 17183 + }, + { + "epoch": 3.2209934395501407, + "grad_norm": 53717.8203125, + "learning_rate": 4.817032255976195e-06, + "loss": 2.0464, + "step": 17184 + }, + { + "epoch": 3.2211808809746953, + "grad_norm": 55448.01953125, + "learning_rate": 4.81366764809213e-06, + "loss": 2.0891, + "step": 17185 + }, + { + "epoch": 3.2213683223992504, + "grad_norm": 61698.5078125, + "learning_rate": 4.810304156251899e-06, + "loss": 2.1203, + "step": 17186 + }, + { + "epoch": 3.221555763823805, + "grad_norm": 56761.60546875, + "learning_rate": 4.806941780538571e-06, + "loss": 2.1234, + "step": 17187 + }, + { + "epoch": 3.2217432052483597, + "grad_norm": 52687.9375, + "learning_rate": 4.803580521035172e-06, + "loss": 2.0924, + "step": 17188 + }, + { + "epoch": 3.2219306466729147, + "grad_norm": 57430.3515625, + "learning_rate": 4.800220377824749e-06, + "loss": 2.0673, + "step": 17189 + }, + { + "epoch": 3.2221180880974694, + "grad_norm": 58742.125, + "learning_rate": 4.7968613509902725e-06, + "loss": 2.1178, + "step": 17190 + }, + { + "epoch": 3.2223055295220244, + "grad_norm": 55755.33203125, + "learning_rate": 4.793503440614716e-06, + "loss": 2.1123, + "step": 17191 + }, + { + "epoch": 3.222492970946579, + "grad_norm": 53869.359375, + "learning_rate": 4.790146646781013e-06, + "loss": 2.024, + "step": 17192 + }, + { + "epoch": 3.222680412371134, + "grad_norm": 54989.1875, + "learning_rate": 4.7867909695720744e-06, + "loss": 2.1453, + "step": 17193 + }, + { + "epoch": 3.2228678537956887, + "grad_norm": 58640.04296875, + "learning_rate": 4.783436409070785e-06, + "loss": 2.0855, + "step": 17194 + }, + { + "epoch": 3.223055295220244, + "grad_norm": 52786.92578125, + "learning_rate": 4.780082965359977e-06, + "loss": 2.0758, + "step": 17195 + }, + { + "epoch": 3.2232427366447984, + "grad_norm": 59543.26171875, + "learning_rate": 4.776730638522497e-06, + "loss": 2.0027, + "step": 17196 + }, + { + "epoch": 3.2234301780693535, + "grad_norm": 56613.375, + "learning_rate": 4.773379428641151e-06, + "loss": 2.013, + "step": 17197 + }, + { + "epoch": 3.223617619493908, + "grad_norm": 58302.55078125, + "learning_rate": 4.770029335798681e-06, + "loss": 2.1102, + "step": 17198 + }, + { + "epoch": 3.2238050609184628, + "grad_norm": 58644.17578125, + "learning_rate": 4.766680360077841e-06, + "loss": 2.0829, + "step": 17199 + }, + { + "epoch": 3.223992502343018, + "grad_norm": 59744.3828125, + "learning_rate": 4.7633325015613624e-06, + "loss": 2.0733, + "step": 17200 + }, + { + "epoch": 3.2241799437675724, + "grad_norm": 53995.046875, + "learning_rate": 4.75998576033192e-06, + "loss": 2.0331, + "step": 17201 + }, + { + "epoch": 3.2243673851921275, + "grad_norm": 56622.0390625, + "learning_rate": 4.756640136472168e-06, + "loss": 2.0179, + "step": 17202 + }, + { + "epoch": 3.224554826616682, + "grad_norm": 55666.11328125, + "learning_rate": 4.753295630064742e-06, + "loss": 2.077, + "step": 17203 + }, + { + "epoch": 3.224742268041237, + "grad_norm": 51599.21484375, + "learning_rate": 4.749952241192274e-06, + "loss": 2.0346, + "step": 17204 + }, + { + "epoch": 3.224929709465792, + "grad_norm": 54598.30078125, + "learning_rate": 4.746609969937293e-06, + "loss": 2.1492, + "step": 17205 + }, + { + "epoch": 3.225117150890347, + "grad_norm": 54529.3515625, + "learning_rate": 4.743268816382379e-06, + "loss": 2.1072, + "step": 17206 + }, + { + "epoch": 3.2253045923149015, + "grad_norm": 55517.87109375, + "learning_rate": 4.739928780610053e-06, + "loss": 2.1156, + "step": 17207 + }, + { + "epoch": 3.2254920337394566, + "grad_norm": 55887.0546875, + "learning_rate": 4.7365898627028116e-06, + "loss": 2.096, + "step": 17208 + }, + { + "epoch": 3.225679475164011, + "grad_norm": 56311.22265625, + "learning_rate": 4.7332520627431046e-06, + "loss": 2.0466, + "step": 17209 + }, + { + "epoch": 3.2258669165885663, + "grad_norm": 58770.734375, + "learning_rate": 4.729915380813388e-06, + "loss": 2.0493, + "step": 17210 + }, + { + "epoch": 3.226054358013121, + "grad_norm": 56198.92578125, + "learning_rate": 4.726579816996063e-06, + "loss": 2.0057, + "step": 17211 + }, + { + "epoch": 3.2262417994376755, + "grad_norm": 54389.69921875, + "learning_rate": 4.72324537137353e-06, + "loss": 2.0158, + "step": 17212 + }, + { + "epoch": 3.2264292408622306, + "grad_norm": 56139.71875, + "learning_rate": 4.7199120440281185e-06, + "loss": 2.0859, + "step": 17213 + }, + { + "epoch": 3.226616682286785, + "grad_norm": 55159.390625, + "learning_rate": 4.7165798350421845e-06, + "loss": 2.1381, + "step": 17214 + }, + { + "epoch": 3.2268041237113403, + "grad_norm": 56232.95703125, + "learning_rate": 4.713248744498022e-06, + "loss": 2.0953, + "step": 17215 + }, + { + "epoch": 3.226991565135895, + "grad_norm": 59201.73828125, + "learning_rate": 4.709918772477884e-06, + "loss": 2.089, + "step": 17216 + }, + { + "epoch": 3.22717900656045, + "grad_norm": 61228.81640625, + "learning_rate": 4.706589919064047e-06, + "loss": 2.1108, + "step": 17217 + }, + { + "epoch": 3.2273664479850046, + "grad_norm": 50646.0703125, + "learning_rate": 4.703262184338703e-06, + "loss": 2.111, + "step": 17218 + }, + { + "epoch": 3.2275538894095597, + "grad_norm": 50162.3671875, + "learning_rate": 4.6999355683840676e-06, + "loss": 2.0554, + "step": 17219 + }, + { + "epoch": 3.2277413308341143, + "grad_norm": 51724.2890625, + "learning_rate": 4.696610071282281e-06, + "loss": 2.0928, + "step": 17220 + }, + { + "epoch": 3.2279287722586694, + "grad_norm": 51955.06640625, + "learning_rate": 4.693285693115496e-06, + "loss": 2.0872, + "step": 17221 + }, + { + "epoch": 3.228116213683224, + "grad_norm": 52885.51953125, + "learning_rate": 4.689962433965805e-06, + "loss": 2.0827, + "step": 17222 + }, + { + "epoch": 3.2283036551077786, + "grad_norm": 56355.39453125, + "learning_rate": 4.686640293915307e-06, + "loss": 1.945, + "step": 17223 + }, + { + "epoch": 3.2284910965323337, + "grad_norm": 60128.11328125, + "learning_rate": 4.683319273046044e-06, + "loss": 2.0776, + "step": 17224 + }, + { + "epoch": 3.2286785379568883, + "grad_norm": 56823.98046875, + "learning_rate": 4.679999371440031e-06, + "loss": 2.0263, + "step": 17225 + }, + { + "epoch": 3.2288659793814434, + "grad_norm": 52281.390625, + "learning_rate": 4.676680589179277e-06, + "loss": 2.112, + "step": 17226 + }, + { + "epoch": 3.229053420805998, + "grad_norm": 59783.19140625, + "learning_rate": 4.673362926345764e-06, + "loss": 2.102, + "step": 17227 + }, + { + "epoch": 3.229240862230553, + "grad_norm": 58006.1796875, + "learning_rate": 4.670046383021403e-06, + "loss": 2.0348, + "step": 17228 + }, + { + "epoch": 3.2294283036551077, + "grad_norm": 52897.76953125, + "learning_rate": 4.666730959288129e-06, + "loss": 2.1001, + "step": 17229 + }, + { + "epoch": 3.2296157450796628, + "grad_norm": 55136.609375, + "learning_rate": 4.663416655227832e-06, + "loss": 2.0899, + "step": 17230 + }, + { + "epoch": 3.2298031865042174, + "grad_norm": 52658.1953125, + "learning_rate": 4.660103470922361e-06, + "loss": 2.1115, + "step": 17231 + }, + { + "epoch": 3.2299906279287725, + "grad_norm": 55409.98046875, + "learning_rate": 4.656791406453548e-06, + "loss": 2.106, + "step": 17232 + }, + { + "epoch": 3.230178069353327, + "grad_norm": 55270.78515625, + "learning_rate": 4.653480461903204e-06, + "loss": 2.0788, + "step": 17233 + }, + { + "epoch": 3.230365510777882, + "grad_norm": 56552.37109375, + "learning_rate": 4.650170637353096e-06, + "loss": 2.005, + "step": 17234 + }, + { + "epoch": 3.2305529522024368, + "grad_norm": 54128.984375, + "learning_rate": 4.6468619328849715e-06, + "loss": 2.0764, + "step": 17235 + }, + { + "epoch": 3.2307403936269914, + "grad_norm": 54567.98828125, + "learning_rate": 4.643554348580553e-06, + "loss": 2.1018, + "step": 17236 + }, + { + "epoch": 3.2309278350515465, + "grad_norm": 53911.41796875, + "learning_rate": 4.640247884521548e-06, + "loss": 2.1615, + "step": 17237 + }, + { + "epoch": 3.231115276476101, + "grad_norm": 58291.953125, + "learning_rate": 4.636942540789613e-06, + "loss": 2.0589, + "step": 17238 + }, + { + "epoch": 3.231302717900656, + "grad_norm": 52144.86328125, + "learning_rate": 4.633638317466371e-06, + "loss": 2.1086, + "step": 17239 + }, + { + "epoch": 3.2314901593252108, + "grad_norm": 55423.03125, + "learning_rate": 4.630335214633452e-06, + "loss": 2.0397, + "step": 17240 + }, + { + "epoch": 3.231677600749766, + "grad_norm": 53751.7890625, + "learning_rate": 4.6270332323724245e-06, + "loss": 2.0795, + "step": 17241 + }, + { + "epoch": 3.2318650421743205, + "grad_norm": 55780.59765625, + "learning_rate": 4.623732370764855e-06, + "loss": 2.049, + "step": 17242 + }, + { + "epoch": 3.2320524835988755, + "grad_norm": 55995.75, + "learning_rate": 4.620432629892257e-06, + "loss": 2.0484, + "step": 17243 + }, + { + "epoch": 3.23223992502343, + "grad_norm": 55429.32421875, + "learning_rate": 4.61713400983615e-06, + "loss": 2.0466, + "step": 17244 + }, + { + "epoch": 3.2324273664479852, + "grad_norm": 58403.09765625, + "learning_rate": 4.613836510677988e-06, + "loss": 2.1489, + "step": 17245 + }, + { + "epoch": 3.23261480787254, + "grad_norm": 52635.71875, + "learning_rate": 4.610540132499214e-06, + "loss": 2.084, + "step": 17246 + }, + { + "epoch": 3.2328022492970945, + "grad_norm": 55188.4375, + "learning_rate": 4.607244875381261e-06, + "loss": 2.0488, + "step": 17247 + }, + { + "epoch": 3.2329896907216495, + "grad_norm": 55922.3984375, + "learning_rate": 4.603950739405494e-06, + "loss": 2.0774, + "step": 17248 + }, + { + "epoch": 3.233177132146204, + "grad_norm": 54089.54296875, + "learning_rate": 4.6006577246533015e-06, + "loss": 2.1053, + "step": 17249 + }, + { + "epoch": 3.2333645735707592, + "grad_norm": 55078.65625, + "learning_rate": 4.597365831205996e-06, + "loss": 2.0666, + "step": 17250 + }, + { + "epoch": 3.233552014995314, + "grad_norm": 49500.24609375, + "learning_rate": 4.594075059144881e-06, + "loss": 2.0655, + "step": 17251 + }, + { + "epoch": 3.233739456419869, + "grad_norm": 58022.328125, + "learning_rate": 4.590785408551246e-06, + "loss": 2.1025, + "step": 17252 + }, + { + "epoch": 3.2339268978444236, + "grad_norm": 58222.8515625, + "learning_rate": 4.587496879506342e-06, + "loss": 1.9958, + "step": 17253 + }, + { + "epoch": 3.2341143392689786, + "grad_norm": 51079.58203125, + "learning_rate": 4.5842094720913946e-06, + "loss": 2.0408, + "step": 17254 + }, + { + "epoch": 3.2343017806935332, + "grad_norm": 54012.88671875, + "learning_rate": 4.580923186387576e-06, + "loss": 2.1066, + "step": 17255 + }, + { + "epoch": 3.2344892221180883, + "grad_norm": 58111.25, + "learning_rate": 4.5776380224760775e-06, + "loss": 2.0614, + "step": 17256 + }, + { + "epoch": 3.234676663542643, + "grad_norm": 59245.6484375, + "learning_rate": 4.574353980438029e-06, + "loss": 2.1802, + "step": 17257 + }, + { + "epoch": 3.2348641049671976, + "grad_norm": 56427.9765625, + "learning_rate": 4.571071060354531e-06, + "loss": 2.0756, + "step": 17258 + }, + { + "epoch": 3.2350515463917526, + "grad_norm": 57226.44140625, + "learning_rate": 4.567789262306682e-06, + "loss": 2.0176, + "step": 17259 + }, + { + "epoch": 3.2352389878163073, + "grad_norm": 62376.87890625, + "learning_rate": 4.564508586375538e-06, + "loss": 2.0669, + "step": 17260 + }, + { + "epoch": 3.2354264292408623, + "grad_norm": 56956.80078125, + "learning_rate": 4.561229032642128e-06, + "loss": 2.0211, + "step": 17261 + }, + { + "epoch": 3.235613870665417, + "grad_norm": 58627.45703125, + "learning_rate": 4.557950601187438e-06, + "loss": 2.0505, + "step": 17262 + }, + { + "epoch": 3.235801312089972, + "grad_norm": 59122.55859375, + "learning_rate": 4.554673292092465e-06, + "loss": 2.0363, + "step": 17263 + }, + { + "epoch": 3.2359887535145266, + "grad_norm": 57522.9765625, + "learning_rate": 4.551397105438138e-06, + "loss": 2.017, + "step": 17264 + }, + { + "epoch": 3.2361761949390817, + "grad_norm": 57955.87890625, + "learning_rate": 4.548122041305369e-06, + "loss": 2.0899, + "step": 17265 + }, + { + "epoch": 3.2363636363636363, + "grad_norm": 55835.515625, + "learning_rate": 4.5448480997750565e-06, + "loss": 2.1139, + "step": 17266 + }, + { + "epoch": 3.2365510777881914, + "grad_norm": 50288.94921875, + "learning_rate": 4.541575280928068e-06, + "loss": 2.1594, + "step": 17267 + }, + { + "epoch": 3.236738519212746, + "grad_norm": 58314.65234375, + "learning_rate": 4.538303584845233e-06, + "loss": 2.1201, + "step": 17268 + }, + { + "epoch": 3.2369259606373006, + "grad_norm": 58588.34765625, + "learning_rate": 4.535033011607353e-06, + "loss": 2.0439, + "step": 17269 + }, + { + "epoch": 3.2371134020618557, + "grad_norm": 57717.97265625, + "learning_rate": 4.531763561295216e-06, + "loss": 2.0668, + "step": 17270 + }, + { + "epoch": 3.2373008434864103, + "grad_norm": 54873.98828125, + "learning_rate": 4.5284952339895715e-06, + "loss": 1.9642, + "step": 17271 + }, + { + "epoch": 3.2374882849109654, + "grad_norm": 53234.09765625, + "learning_rate": 4.525228029771128e-06, + "loss": 2.0793, + "step": 17272 + }, + { + "epoch": 3.23767572633552, + "grad_norm": 56951.2109375, + "learning_rate": 4.5219619487206045e-06, + "loss": 2.1361, + "step": 17273 + }, + { + "epoch": 3.237863167760075, + "grad_norm": 57668.33984375, + "learning_rate": 4.518696990918653e-06, + "loss": 2.0457, + "step": 17274 + }, + { + "epoch": 3.2380506091846297, + "grad_norm": 57906.375, + "learning_rate": 4.515433156445919e-06, + "loss": 1.9895, + "step": 17275 + }, + { + "epoch": 3.238238050609185, + "grad_norm": 54971.91015625, + "learning_rate": 4.512170445383013e-06, + "loss": 1.95, + "step": 17276 + }, + { + "epoch": 3.2384254920337394, + "grad_norm": 55391.0234375, + "learning_rate": 4.508908857810523e-06, + "loss": 2.0427, + "step": 17277 + }, + { + "epoch": 3.2386129334582945, + "grad_norm": 58383.375, + "learning_rate": 4.505648393808998e-06, + "loss": 2.0108, + "step": 17278 + }, + { + "epoch": 3.238800374882849, + "grad_norm": 55972.3203125, + "learning_rate": 4.502389053458983e-06, + "loss": 2.0613, + "step": 17279 + }, + { + "epoch": 3.2389878163074037, + "grad_norm": 53184.09375, + "learning_rate": 4.49913083684097e-06, + "loss": 2.0934, + "step": 17280 + }, + { + "epoch": 3.239175257731959, + "grad_norm": 57506.94140625, + "learning_rate": 4.495873744035423e-06, + "loss": 2.0858, + "step": 17281 + }, + { + "epoch": 3.2393626991565134, + "grad_norm": 56625.453125, + "learning_rate": 4.492617775122798e-06, + "loss": 2.1145, + "step": 17282 + }, + { + "epoch": 3.2395501405810685, + "grad_norm": 53637.25390625, + "learning_rate": 4.489362930183521e-06, + "loss": 2.1096, + "step": 17283 + }, + { + "epoch": 3.239737582005623, + "grad_norm": 56637.6796875, + "learning_rate": 4.4861092092979785e-06, + "loss": 2.0305, + "step": 17284 + }, + { + "epoch": 3.239925023430178, + "grad_norm": 57256.12109375, + "learning_rate": 4.4828566125465154e-06, + "loss": 2.0563, + "step": 17285 + }, + { + "epoch": 3.240112464854733, + "grad_norm": 53329.9375, + "learning_rate": 4.479605140009491e-06, + "loss": 2.0585, + "step": 17286 + }, + { + "epoch": 3.240299906279288, + "grad_norm": 53407.953125, + "learning_rate": 4.476354791767201e-06, + "loss": 2.0702, + "step": 17287 + }, + { + "epoch": 3.2404873477038425, + "grad_norm": 54164.9765625, + "learning_rate": 4.473105567899916e-06, + "loss": 2.1523, + "step": 17288 + }, + { + "epoch": 3.2406747891283976, + "grad_norm": 57745.8515625, + "learning_rate": 4.469857468487898e-06, + "loss": 2.0157, + "step": 17289 + }, + { + "epoch": 3.240862230552952, + "grad_norm": 57603.1328125, + "learning_rate": 4.466610493611384e-06, + "loss": 2.0252, + "step": 17290 + }, + { + "epoch": 3.241049671977507, + "grad_norm": 54917.3828125, + "learning_rate": 4.463364643350543e-06, + "loss": 2.079, + "step": 17291 + }, + { + "epoch": 3.241237113402062, + "grad_norm": 54915.8828125, + "learning_rate": 4.4601199177855545e-06, + "loss": 2.061, + "step": 17292 + }, + { + "epoch": 3.2414245548266165, + "grad_norm": 56999.94140625, + "learning_rate": 4.456876316996567e-06, + "loss": 2.0314, + "step": 17293 + }, + { + "epoch": 3.2416119962511716, + "grad_norm": 54271.33203125, + "learning_rate": 4.453633841063687e-06, + "loss": 2.0796, + "step": 17294 + }, + { + "epoch": 3.241799437675726, + "grad_norm": 54510.01171875, + "learning_rate": 4.4503924900669955e-06, + "loss": 2.0814, + "step": 17295 + }, + { + "epoch": 3.2419868791002813, + "grad_norm": 53281.14453125, + "learning_rate": 4.4471522640865515e-06, + "loss": 2.0706, + "step": 17296 + }, + { + "epoch": 3.242174320524836, + "grad_norm": 55029.13671875, + "learning_rate": 4.443913163202401e-06, + "loss": 2.0691, + "step": 17297 + }, + { + "epoch": 3.242361761949391, + "grad_norm": 56135.94140625, + "learning_rate": 4.440675187494514e-06, + "loss": 1.9686, + "step": 17298 + }, + { + "epoch": 3.2425492033739456, + "grad_norm": 58808.42578125, + "learning_rate": 4.437438337042882e-06, + "loss": 2.0819, + "step": 17299 + }, + { + "epoch": 3.2427366447985007, + "grad_norm": 54364.4453125, + "learning_rate": 4.434202611927463e-06, + "loss": 2.0935, + "step": 17300 + }, + { + "epoch": 3.2429240862230553, + "grad_norm": 52673.58984375, + "learning_rate": 4.430968012228159e-06, + "loss": 2.0337, + "step": 17301 + }, + { + "epoch": 3.24311152764761, + "grad_norm": 57976.609375, + "learning_rate": 4.4277345380248525e-06, + "loss": 2.0716, + "step": 17302 + }, + { + "epoch": 3.243298969072165, + "grad_norm": 59962.203125, + "learning_rate": 4.424502189397434e-06, + "loss": 2.0724, + "step": 17303 + }, + { + "epoch": 3.2434864104967196, + "grad_norm": 57760.89453125, + "learning_rate": 4.4212709664257055e-06, + "loss": 2.1096, + "step": 17304 + }, + { + "epoch": 3.2436738519212747, + "grad_norm": 56428.48046875, + "learning_rate": 4.418040869189505e-06, + "loss": 2.1068, + "step": 17305 + }, + { + "epoch": 3.2438612933458293, + "grad_norm": 54285.0546875, + "learning_rate": 4.414811897768584e-06, + "loss": 2.0115, + "step": 17306 + }, + { + "epoch": 3.2440487347703844, + "grad_norm": 55512.35546875, + "learning_rate": 4.411584052242718e-06, + "loss": 2.0617, + "step": 17307 + }, + { + "epoch": 3.244236176194939, + "grad_norm": 52123.57421875, + "learning_rate": 4.40835733269161e-06, + "loss": 2.0437, + "step": 17308 + }, + { + "epoch": 3.244423617619494, + "grad_norm": 56361.00390625, + "learning_rate": 4.405131739194973e-06, + "loss": 2.1007, + "step": 17309 + }, + { + "epoch": 3.2446110590440487, + "grad_norm": 52615.85546875, + "learning_rate": 4.401907271832472e-06, + "loss": 2.0981, + "step": 17310 + }, + { + "epoch": 3.2447985004686037, + "grad_norm": 53335.921875, + "learning_rate": 4.398683930683728e-06, + "loss": 2.105, + "step": 17311 + }, + { + "epoch": 3.2449859418931584, + "grad_norm": 53847.5390625, + "learning_rate": 4.39546171582838e-06, + "loss": 2.0472, + "step": 17312 + }, + { + "epoch": 3.245173383317713, + "grad_norm": 58331.35546875, + "learning_rate": 4.3922406273459945e-06, + "loss": 2.0805, + "step": 17313 + }, + { + "epoch": 3.245360824742268, + "grad_norm": 57975.79296875, + "learning_rate": 4.38902066531614e-06, + "loss": 2.069, + "step": 17314 + }, + { + "epoch": 3.2455482661668227, + "grad_norm": 58744.22265625, + "learning_rate": 4.385801829818331e-06, + "loss": 2.1074, + "step": 17315 + }, + { + "epoch": 3.2457357075913777, + "grad_norm": 50178.4609375, + "learning_rate": 4.382584120932082e-06, + "loss": 2.0367, + "step": 17316 + }, + { + "epoch": 3.2459231490159324, + "grad_norm": 52490.12109375, + "learning_rate": 4.379367538736867e-06, + "loss": 2.0256, + "step": 17317 + }, + { + "epoch": 3.2461105904404874, + "grad_norm": 56884.86328125, + "learning_rate": 4.376152083312113e-06, + "loss": 2.1071, + "step": 17318 + }, + { + "epoch": 3.246298031865042, + "grad_norm": 52983.703125, + "learning_rate": 4.3729377547372495e-06, + "loss": 2.0427, + "step": 17319 + }, + { + "epoch": 3.246485473289597, + "grad_norm": 52247.76953125, + "learning_rate": 4.369724553091692e-06, + "loss": 2.0112, + "step": 17320 + }, + { + "epoch": 3.2466729147141518, + "grad_norm": 56129.05078125, + "learning_rate": 4.3665124784547536e-06, + "loss": 2.0955, + "step": 17321 + }, + { + "epoch": 3.246860356138707, + "grad_norm": 55575.75, + "learning_rate": 4.363301530905795e-06, + "loss": 2.0036, + "step": 17322 + }, + { + "epoch": 3.2470477975632615, + "grad_norm": 55661.109375, + "learning_rate": 4.360091710524128e-06, + "loss": 2.1068, + "step": 17323 + }, + { + "epoch": 3.247235238987816, + "grad_norm": 53757.3515625, + "learning_rate": 4.356883017389019e-06, + "loss": 2.0964, + "step": 17324 + }, + { + "epoch": 3.247422680412371, + "grad_norm": 54987.328125, + "learning_rate": 4.35367545157972e-06, + "loss": 2.0439, + "step": 17325 + }, + { + "epoch": 3.2476101218369258, + "grad_norm": 52478.1796875, + "learning_rate": 4.350469013175456e-06, + "loss": 2.0463, + "step": 17326 + }, + { + "epoch": 3.247797563261481, + "grad_norm": 54367.41015625, + "learning_rate": 4.347263702255433e-06, + "loss": 1.9787, + "step": 17327 + }, + { + "epoch": 3.2479850046860355, + "grad_norm": 56637.34765625, + "learning_rate": 4.344059518898791e-06, + "loss": 2.1082, + "step": 17328 + }, + { + "epoch": 3.2481724461105905, + "grad_norm": 56600.5, + "learning_rate": 4.340856463184684e-06, + "loss": 2.0384, + "step": 17329 + }, + { + "epoch": 3.248359887535145, + "grad_norm": 57416.3125, + "learning_rate": 4.3376545351922374e-06, + "loss": 2.0846, + "step": 17330 + }, + { + "epoch": 3.2485473289597, + "grad_norm": 57968.53515625, + "learning_rate": 4.334453735000521e-06, + "loss": 2.1009, + "step": 17331 + }, + { + "epoch": 3.248734770384255, + "grad_norm": 55720.296875, + "learning_rate": 4.331254062688578e-06, + "loss": 2.1032, + "step": 17332 + }, + { + "epoch": 3.24892221180881, + "grad_norm": 57849.5703125, + "learning_rate": 4.328055518335461e-06, + "loss": 2.1139, + "step": 17333 + }, + { + "epoch": 3.2491096532333645, + "grad_norm": 55911.6484375, + "learning_rate": 4.3248581020201515e-06, + "loss": 2.1176, + "step": 17334 + }, + { + "epoch": 3.2492970946579196, + "grad_norm": 54297.34765625, + "learning_rate": 4.321661813821637e-06, + "loss": 2.059, + "step": 17335 + }, + { + "epoch": 3.2494845360824742, + "grad_norm": 53666.86328125, + "learning_rate": 4.318466653818842e-06, + "loss": 2.0499, + "step": 17336 + }, + { + "epoch": 3.249671977507029, + "grad_norm": 56631.87890625, + "learning_rate": 4.3152726220906994e-06, + "loss": 2.1555, + "step": 17337 + }, + { + "epoch": 3.249859418931584, + "grad_norm": 56660.36328125, + "learning_rate": 4.3120797187160885e-06, + "loss": 2.0716, + "step": 17338 + }, + { + "epoch": 3.2500468603561385, + "grad_norm": 55986.28125, + "learning_rate": 4.308887943773882e-06, + "loss": 2.0748, + "step": 17339 + }, + { + "epoch": 3.2502343017806936, + "grad_norm": 56092.0859375, + "learning_rate": 4.3056972973428975e-06, + "loss": 2.1388, + "step": 17340 + }, + { + "epoch": 3.2504217432052482, + "grad_norm": 52237.33984375, + "learning_rate": 4.302507779501941e-06, + "loss": 2.0518, + "step": 17341 + }, + { + "epoch": 3.2506091846298033, + "grad_norm": 55470.109375, + "learning_rate": 4.299319390329809e-06, + "loss": 2.0643, + "step": 17342 + }, + { + "epoch": 3.250796626054358, + "grad_norm": 56131.64453125, + "learning_rate": 4.2961321299052286e-06, + "loss": 2.1349, + "step": 17343 + }, + { + "epoch": 3.250984067478913, + "grad_norm": 60607.625, + "learning_rate": 4.292945998306924e-06, + "loss": 2.0113, + "step": 17344 + }, + { + "epoch": 3.2511715089034676, + "grad_norm": 58464.33203125, + "learning_rate": 4.28976099561359e-06, + "loss": 2.0647, + "step": 17345 + }, + { + "epoch": 3.2513589503280222, + "grad_norm": 53753.04296875, + "learning_rate": 4.286577121903906e-06, + "loss": 2.0502, + "step": 17346 + }, + { + "epoch": 3.2515463917525773, + "grad_norm": 54716.77734375, + "learning_rate": 4.2833943772565e-06, + "loss": 2.0505, + "step": 17347 + }, + { + "epoch": 3.2517338331771324, + "grad_norm": 56207.94140625, + "learning_rate": 4.280212761749969e-06, + "loss": 2.0718, + "step": 17348 + }, + { + "epoch": 3.251921274601687, + "grad_norm": 55767.5859375, + "learning_rate": 4.277032275462905e-06, + "loss": 1.9697, + "step": 17349 + }, + { + "epoch": 3.2521087160262416, + "grad_norm": 59705.4296875, + "learning_rate": 4.2738529184738865e-06, + "loss": 1.9716, + "step": 17350 + }, + { + "epoch": 3.2522961574507967, + "grad_norm": 57052.765625, + "learning_rate": 4.270674690861393e-06, + "loss": 2.0971, + "step": 17351 + }, + { + "epoch": 3.2524835988753513, + "grad_norm": 54528.28515625, + "learning_rate": 4.26749759270395e-06, + "loss": 2.074, + "step": 17352 + }, + { + "epoch": 3.2526710402999064, + "grad_norm": 60861.75, + "learning_rate": 4.264321624080031e-06, + "loss": 2.1377, + "step": 17353 + }, + { + "epoch": 3.252858481724461, + "grad_norm": 63630.15234375, + "learning_rate": 4.261146785068065e-06, + "loss": 2.0359, + "step": 17354 + }, + { + "epoch": 3.253045923149016, + "grad_norm": 55633.546875, + "learning_rate": 4.2579730757464685e-06, + "loss": 2.1022, + "step": 17355 + }, + { + "epoch": 3.2532333645735707, + "grad_norm": 52733.38671875, + "learning_rate": 4.25480049619364e-06, + "loss": 2.0639, + "step": 17356 + }, + { + "epoch": 3.2534208059981258, + "grad_norm": 55742.1171875, + "learning_rate": 4.251629046487931e-06, + "loss": 2.0611, + "step": 17357 + }, + { + "epoch": 3.2536082474226804, + "grad_norm": 57188.1875, + "learning_rate": 4.24845872670766e-06, + "loss": 2.0883, + "step": 17358 + }, + { + "epoch": 3.2537956888472355, + "grad_norm": 64683.21875, + "learning_rate": 4.245289536931141e-06, + "loss": 2.0928, + "step": 17359 + }, + { + "epoch": 3.25398313027179, + "grad_norm": 59657.92578125, + "learning_rate": 4.2421214772366605e-06, + "loss": 2.0466, + "step": 17360 + }, + { + "epoch": 3.2541705716963447, + "grad_norm": 55375.40625, + "learning_rate": 4.23895454770245e-06, + "loss": 2.1127, + "step": 17361 + }, + { + "epoch": 3.2543580131209, + "grad_norm": 50466.5703125, + "learning_rate": 4.235788748406722e-06, + "loss": 2.0692, + "step": 17362 + }, + { + "epoch": 3.2545454545454544, + "grad_norm": 56849.76171875, + "learning_rate": 4.232624079427689e-06, + "loss": 2.0975, + "step": 17363 + }, + { + "epoch": 3.2547328959700095, + "grad_norm": 55046.1796875, + "learning_rate": 4.229460540843494e-06, + "loss": 2.0731, + "step": 17364 + }, + { + "epoch": 3.254920337394564, + "grad_norm": 55903.15234375, + "learning_rate": 4.226298132732292e-06, + "loss": 2.1462, + "step": 17365 + }, + { + "epoch": 3.255107778819119, + "grad_norm": 57517.3828125, + "learning_rate": 4.223136855172177e-06, + "loss": 2.0656, + "step": 17366 + }, + { + "epoch": 3.255295220243674, + "grad_norm": 57864.328125, + "learning_rate": 4.219976708241225e-06, + "loss": 2.1114, + "step": 17367 + }, + { + "epoch": 3.255482661668229, + "grad_norm": 58459.87109375, + "learning_rate": 4.2168176920175e-06, + "loss": 2.0373, + "step": 17368 + }, + { + "epoch": 3.2556701030927835, + "grad_norm": 55840.76171875, + "learning_rate": 4.213659806579006e-06, + "loss": 2.2955, + "step": 17369 + }, + { + "epoch": 3.2558575445173386, + "grad_norm": 53603.59765625, + "learning_rate": 4.210503052003767e-06, + "loss": 2.0311, + "step": 17370 + }, + { + "epoch": 3.256044985941893, + "grad_norm": 54887.16796875, + "learning_rate": 4.207347428369723e-06, + "loss": 2.0169, + "step": 17371 + }, + { + "epoch": 3.256232427366448, + "grad_norm": 53498.46484375, + "learning_rate": 4.204192935754842e-06, + "loss": 2.0788, + "step": 17372 + }, + { + "epoch": 3.256419868791003, + "grad_norm": 59929.17578125, + "learning_rate": 4.201039574237015e-06, + "loss": 2.0318, + "step": 17373 + }, + { + "epoch": 3.2566073102155575, + "grad_norm": 57189.44921875, + "learning_rate": 4.197887343894119e-06, + "loss": 2.0999, + "step": 17374 + }, + { + "epoch": 3.2567947516401126, + "grad_norm": 57976.3125, + "learning_rate": 4.194736244804032e-06, + "loss": 2.0095, + "step": 17375 + }, + { + "epoch": 3.256982193064667, + "grad_norm": 52381.28125, + "learning_rate": 4.191586277044573e-06, + "loss": 2.068, + "step": 17376 + }, + { + "epoch": 3.2571696344892223, + "grad_norm": 53291.29296875, + "learning_rate": 4.188437440693544e-06, + "loss": 2.087, + "step": 17377 + }, + { + "epoch": 3.257357075913777, + "grad_norm": 55216.203125, + "learning_rate": 4.185289735828707e-06, + "loss": 2.0947, + "step": 17378 + }, + { + "epoch": 3.257544517338332, + "grad_norm": 55029.30859375, + "learning_rate": 4.182143162527818e-06, + "loss": 2.0688, + "step": 17379 + }, + { + "epoch": 3.2577319587628866, + "grad_norm": 53859.87890625, + "learning_rate": 4.178997720868594e-06, + "loss": 2.0909, + "step": 17380 + }, + { + "epoch": 3.2579194001874416, + "grad_norm": 57747.046875, + "learning_rate": 4.175853410928709e-06, + "loss": 2.0385, + "step": 17381 + }, + { + "epoch": 3.2581068416119963, + "grad_norm": 53576.1171875, + "learning_rate": 4.172710232785831e-06, + "loss": 2.1961, + "step": 17382 + }, + { + "epoch": 3.258294283036551, + "grad_norm": 50812.03125, + "learning_rate": 4.169568186517614e-06, + "loss": 2.0457, + "step": 17383 + }, + { + "epoch": 3.258481724461106, + "grad_norm": 59326.92578125, + "learning_rate": 4.166427272201628e-06, + "loss": 2.0423, + "step": 17384 + }, + { + "epoch": 3.2586691658856606, + "grad_norm": 53023.2265625, + "learning_rate": 4.163287489915457e-06, + "loss": 2.1031, + "step": 17385 + }, + { + "epoch": 3.2588566073102156, + "grad_norm": 53513.31640625, + "learning_rate": 4.160148839736671e-06, + "loss": 2.0733, + "step": 17386 + }, + { + "epoch": 3.2590440487347703, + "grad_norm": 52081.453125, + "learning_rate": 4.157011321742777e-06, + "loss": 2.0871, + "step": 17387 + }, + { + "epoch": 3.2592314901593253, + "grad_norm": 55948.046875, + "learning_rate": 4.1538749360112536e-06, + "loss": 2.0739, + "step": 17388 + }, + { + "epoch": 3.25941893158388, + "grad_norm": 57613.94140625, + "learning_rate": 4.150739682619581e-06, + "loss": 2.1011, + "step": 17389 + }, + { + "epoch": 3.259606373008435, + "grad_norm": 53784.84375, + "learning_rate": 4.147605561645201e-06, + "loss": 2.0421, + "step": 17390 + }, + { + "epoch": 3.2597938144329897, + "grad_norm": 54076.34375, + "learning_rate": 4.14447257316552e-06, + "loss": 2.0232, + "step": 17391 + }, + { + "epoch": 3.2599812558575447, + "grad_norm": 57096.875, + "learning_rate": 4.141340717257902e-06, + "loss": 2.0169, + "step": 17392 + }, + { + "epoch": 3.2601686972820993, + "grad_norm": 51417.05078125, + "learning_rate": 4.13820999399972e-06, + "loss": 2.0783, + "step": 17393 + }, + { + "epoch": 3.260356138706654, + "grad_norm": 57560.67578125, + "learning_rate": 4.135080403468289e-06, + "loss": 2.0017, + "step": 17394 + }, + { + "epoch": 3.260543580131209, + "grad_norm": 57840.31640625, + "learning_rate": 4.131951945740903e-06, + "loss": 2.1306, + "step": 17395 + }, + { + "epoch": 3.2607310215557637, + "grad_norm": 52833.7890625, + "learning_rate": 4.128824620894839e-06, + "loss": 2.0939, + "step": 17396 + }, + { + "epoch": 3.2609184629803187, + "grad_norm": 60307.4296875, + "learning_rate": 4.12569842900733e-06, + "loss": 2.0809, + "step": 17397 + }, + { + "epoch": 3.2611059044048734, + "grad_norm": 58723.59375, + "learning_rate": 4.1225733701555965e-06, + "loss": 2.0834, + "step": 17398 + }, + { + "epoch": 3.2612933458294284, + "grad_norm": 57561.09375, + "learning_rate": 4.119449444416817e-06, + "loss": 2.0617, + "step": 17399 + }, + { + "epoch": 3.261480787253983, + "grad_norm": 59332.46484375, + "learning_rate": 4.116326651868152e-06, + "loss": 2.0012, + "step": 17400 + }, + { + "epoch": 3.261668228678538, + "grad_norm": 53529.76953125, + "learning_rate": 4.113204992586728e-06, + "loss": 1.9853, + "step": 17401 + }, + { + "epoch": 3.2618556701030927, + "grad_norm": 58104.23828125, + "learning_rate": 4.110084466649655e-06, + "loss": 2.1239, + "step": 17402 + }, + { + "epoch": 3.262043111527648, + "grad_norm": 56078.53125, + "learning_rate": 4.106965074133995e-06, + "loss": 2.1194, + "step": 17403 + }, + { + "epoch": 3.2622305529522024, + "grad_norm": 57814.44921875, + "learning_rate": 4.103846815116791e-06, + "loss": 2.0632, + "step": 17404 + }, + { + "epoch": 3.262417994376757, + "grad_norm": 50745.93359375, + "learning_rate": 4.100729689675065e-06, + "loss": 2.1193, + "step": 17405 + }, + { + "epoch": 3.262605435801312, + "grad_norm": 59268.3984375, + "learning_rate": 4.097613697885816e-06, + "loss": 2.0668, + "step": 17406 + }, + { + "epoch": 3.2627928772258668, + "grad_norm": 56919.2578125, + "learning_rate": 4.094498839825994e-06, + "loss": 2.0035, + "step": 17407 + }, + { + "epoch": 3.262980318650422, + "grad_norm": 55696.82421875, + "learning_rate": 4.091385115572521e-06, + "loss": 2.0593, + "step": 17408 + }, + { + "epoch": 3.2631677600749764, + "grad_norm": 58112.39453125, + "learning_rate": 4.08827252520233e-06, + "loss": 2.033, + "step": 17409 + }, + { + "epoch": 3.2633552014995315, + "grad_norm": 58498.828125, + "learning_rate": 4.085161068792276e-06, + "loss": 2.035, + "step": 17410 + }, + { + "epoch": 3.263542642924086, + "grad_norm": 56771.6171875, + "learning_rate": 4.082050746419208e-06, + "loss": 2.0176, + "step": 17411 + }, + { + "epoch": 3.263730084348641, + "grad_norm": 55706.171875, + "learning_rate": 4.078941558159955e-06, + "loss": 2.0405, + "step": 17412 + }, + { + "epoch": 3.263917525773196, + "grad_norm": 57367.3203125, + "learning_rate": 4.075833504091325e-06, + "loss": 2.0016, + "step": 17413 + }, + { + "epoch": 3.264104967197751, + "grad_norm": 56311.6953125, + "learning_rate": 4.072726584290043e-06, + "loss": 2.079, + "step": 17414 + }, + { + "epoch": 3.2642924086223055, + "grad_norm": 58575.16796875, + "learning_rate": 4.069620798832874e-06, + "loss": 2.087, + "step": 17415 + }, + { + "epoch": 3.26447985004686, + "grad_norm": 50930.5703125, + "learning_rate": 4.066516147796529e-06, + "loss": 2.0135, + "step": 17416 + }, + { + "epoch": 3.264667291471415, + "grad_norm": 54280.28515625, + "learning_rate": 4.063412631257685e-06, + "loss": 2.0582, + "step": 17417 + }, + { + "epoch": 3.26485473289597, + "grad_norm": 56082.984375, + "learning_rate": 4.060310249292975e-06, + "loss": 2.0384, + "step": 17418 + }, + { + "epoch": 3.265042174320525, + "grad_norm": 53843.90625, + "learning_rate": 4.057209001979056e-06, + "loss": 2.0314, + "step": 17419 + }, + { + "epoch": 3.2652296157450795, + "grad_norm": 54969.40234375, + "learning_rate": 4.054108889392499e-06, + "loss": 2.0199, + "step": 17420 + }, + { + "epoch": 3.2654170571696346, + "grad_norm": 57597.89453125, + "learning_rate": 4.051009911609887e-06, + "loss": 2.0679, + "step": 17421 + }, + { + "epoch": 3.265604498594189, + "grad_norm": 55540.4609375, + "learning_rate": 4.0479120687077485e-06, + "loss": 2.0104, + "step": 17422 + }, + { + "epoch": 3.2657919400187443, + "grad_norm": 57641.3515625, + "learning_rate": 4.044815360762616e-06, + "loss": 2.0721, + "step": 17423 + }, + { + "epoch": 3.265979381443299, + "grad_norm": 52344.69921875, + "learning_rate": 4.041719787850962e-06, + "loss": 2.0869, + "step": 17424 + }, + { + "epoch": 3.266166822867854, + "grad_norm": 58124.54296875, + "learning_rate": 4.038625350049235e-06, + "loss": 2.0396, + "step": 17425 + }, + { + "epoch": 3.2663542642924086, + "grad_norm": 54259.984375, + "learning_rate": 4.035532047433882e-06, + "loss": 2.0322, + "step": 17426 + }, + { + "epoch": 3.2665417057169632, + "grad_norm": 53864.140625, + "learning_rate": 4.032439880081279e-06, + "loss": 2.1276, + "step": 17427 + }, + { + "epoch": 3.2667291471415183, + "grad_norm": 54286.08203125, + "learning_rate": 4.029348848067827e-06, + "loss": 2.016, + "step": 17428 + }, + { + "epoch": 3.266916588566073, + "grad_norm": 57678.37890625, + "learning_rate": 4.026258951469846e-06, + "loss": 2.1927, + "step": 17429 + }, + { + "epoch": 3.267104029990628, + "grad_norm": 56901.62890625, + "learning_rate": 4.023170190363673e-06, + "loss": 2.0841, + "step": 17430 + }, + { + "epoch": 3.2672914714151826, + "grad_norm": 54065.72265625, + "learning_rate": 4.020082564825578e-06, + "loss": 2.0585, + "step": 17431 + }, + { + "epoch": 3.2674789128397377, + "grad_norm": 57485.609375, + "learning_rate": 4.016996074931833e-06, + "loss": 2.0749, + "step": 17432 + }, + { + "epoch": 3.2676663542642923, + "grad_norm": 56254.06640625, + "learning_rate": 4.013910720758674e-06, + "loss": 1.9944, + "step": 17433 + }, + { + "epoch": 3.2678537956888474, + "grad_norm": 52411.671875, + "learning_rate": 4.010826502382287e-06, + "loss": 2.0924, + "step": 17434 + }, + { + "epoch": 3.268041237113402, + "grad_norm": 61135.51953125, + "learning_rate": 4.007743419878857e-06, + "loss": 2.0983, + "step": 17435 + }, + { + "epoch": 3.268228678537957, + "grad_norm": 55151.03125, + "learning_rate": 4.004661473324555e-06, + "loss": 2.1209, + "step": 17436 + }, + { + "epoch": 3.2684161199625117, + "grad_norm": 53638.765625, + "learning_rate": 4.0015806627954625e-06, + "loss": 2.1076, + "step": 17437 + }, + { + "epoch": 3.2686035613870663, + "grad_norm": 59029.765625, + "learning_rate": 3.9985009883676925e-06, + "loss": 2.0368, + "step": 17438 + }, + { + "epoch": 3.2687910028116214, + "grad_norm": 59410.30859375, + "learning_rate": 3.995422450117315e-06, + "loss": 2.0987, + "step": 17439 + }, + { + "epoch": 3.268978444236176, + "grad_norm": 53959.40234375, + "learning_rate": 3.992345048120355e-06, + "loss": 2.0859, + "step": 17440 + }, + { + "epoch": 3.269165885660731, + "grad_norm": 55493.3515625, + "learning_rate": 3.98926878245281e-06, + "loss": 2.0755, + "step": 17441 + }, + { + "epoch": 3.2693533270852857, + "grad_norm": 56138.48046875, + "learning_rate": 3.986193653190679e-06, + "loss": 2.0694, + "step": 17442 + }, + { + "epoch": 3.2695407685098408, + "grad_norm": 61790.7578125, + "learning_rate": 3.983119660409923e-06, + "loss": 2.081, + "step": 17443 + }, + { + "epoch": 3.2697282099343954, + "grad_norm": 53534.95703125, + "learning_rate": 3.980046804186427e-06, + "loss": 1.9776, + "step": 17444 + }, + { + "epoch": 3.2699156513589505, + "grad_norm": 51279.671875, + "learning_rate": 3.976975084596113e-06, + "loss": 2.0377, + "step": 17445 + }, + { + "epoch": 3.270103092783505, + "grad_norm": 57437.81640625, + "learning_rate": 3.973904501714854e-06, + "loss": 2.0718, + "step": 17446 + }, + { + "epoch": 3.27029053420806, + "grad_norm": 60604.53515625, + "learning_rate": 3.970835055618477e-06, + "loss": 2.0472, + "step": 17447 + }, + { + "epoch": 3.2704779756326148, + "grad_norm": 52356.0625, + "learning_rate": 3.967766746382795e-06, + "loss": 2.071, + "step": 17448 + }, + { + "epoch": 3.2706654170571694, + "grad_norm": 56844.64453125, + "learning_rate": 3.964699574083591e-06, + "loss": 2.0621, + "step": 17449 + }, + { + "epoch": 3.2708528584817245, + "grad_norm": 56088.19921875, + "learning_rate": 3.9616335387966275e-06, + "loss": 2.0632, + "step": 17450 + }, + { + "epoch": 3.2710402999062795, + "grad_norm": 53215.9375, + "learning_rate": 3.958568640597621e-06, + "loss": 2.0799, + "step": 17451 + }, + { + "epoch": 3.271227741330834, + "grad_norm": 54787.015625, + "learning_rate": 3.9555048795622716e-06, + "loss": 2.0851, + "step": 17452 + }, + { + "epoch": 3.271415182755389, + "grad_norm": 52682.5546875, + "learning_rate": 3.952442255766259e-06, + "loss": 2.0735, + "step": 17453 + }, + { + "epoch": 3.271602624179944, + "grad_norm": 57295.00390625, + "learning_rate": 3.949380769285227e-06, + "loss": 2.1271, + "step": 17454 + }, + { + "epoch": 3.2717900656044985, + "grad_norm": 59990.1953125, + "learning_rate": 3.946320420194772e-06, + "loss": 2.0172, + "step": 17455 + }, + { + "epoch": 3.2719775070290535, + "grad_norm": 56138.6953125, + "learning_rate": 3.943261208570504e-06, + "loss": 2.1072, + "step": 17456 + }, + { + "epoch": 3.272164948453608, + "grad_norm": 55808.1875, + "learning_rate": 3.940203134487963e-06, + "loss": 2.0357, + "step": 17457 + }, + { + "epoch": 3.2723523898781632, + "grad_norm": 55758.06640625, + "learning_rate": 3.9371461980226954e-06, + "loss": 1.9942, + "step": 17458 + }, + { + "epoch": 3.272539831302718, + "grad_norm": 54843.83984375, + "learning_rate": 3.9340903992501955e-06, + "loss": 2.0855, + "step": 17459 + }, + { + "epoch": 3.2727272727272725, + "grad_norm": 57696.109375, + "learning_rate": 3.931035738245925e-06, + "loss": 2.0217, + "step": 17460 + }, + { + "epoch": 3.2729147141518276, + "grad_norm": 53493.61328125, + "learning_rate": 3.927982215085346e-06, + "loss": 2.0682, + "step": 17461 + }, + { + "epoch": 3.2731021555763826, + "grad_norm": 56234.91796875, + "learning_rate": 3.924929829843882e-06, + "loss": 2.0518, + "step": 17462 + }, + { + "epoch": 3.2732895970009372, + "grad_norm": 63287.78515625, + "learning_rate": 3.921878582596911e-06, + "loss": 2.0815, + "step": 17463 + }, + { + "epoch": 3.273477038425492, + "grad_norm": 55242.54296875, + "learning_rate": 3.918828473419789e-06, + "loss": 2.0625, + "step": 17464 + }, + { + "epoch": 3.273664479850047, + "grad_norm": 54418.2421875, + "learning_rate": 3.915779502387868e-06, + "loss": 2.0298, + "step": 17465 + }, + { + "epoch": 3.2738519212746016, + "grad_norm": 56815.00390625, + "learning_rate": 3.912731669576447e-06, + "loss": 2.0682, + "step": 17466 + }, + { + "epoch": 3.2740393626991566, + "grad_norm": 52050.27734375, + "learning_rate": 3.90968497506079e-06, + "loss": 2.0218, + "step": 17467 + }, + { + "epoch": 3.2742268041237113, + "grad_norm": 53325.0390625, + "learning_rate": 3.906639418916158e-06, + "loss": 2.0897, + "step": 17468 + }, + { + "epoch": 3.2744142455482663, + "grad_norm": 58321.56640625, + "learning_rate": 3.90359500121778e-06, + "loss": 2.1368, + "step": 17469 + }, + { + "epoch": 3.274601686972821, + "grad_norm": 54977.3828125, + "learning_rate": 3.900551722040841e-06, + "loss": 2.1123, + "step": 17470 + }, + { + "epoch": 3.2747891283973756, + "grad_norm": 56072.84765625, + "learning_rate": 3.897509581460496e-06, + "loss": 2.0852, + "step": 17471 + }, + { + "epoch": 3.2749765698219306, + "grad_norm": 56222.32421875, + "learning_rate": 3.894468579551896e-06, + "loss": 2.0632, + "step": 17472 + }, + { + "epoch": 3.2751640112464857, + "grad_norm": 56131.03125, + "learning_rate": 3.891428716390155e-06, + "loss": 2.0111, + "step": 17473 + }, + { + "epoch": 3.2753514526710403, + "grad_norm": 52770.16796875, + "learning_rate": 3.888389992050329e-06, + "loss": 2.0647, + "step": 17474 + }, + { + "epoch": 3.275538894095595, + "grad_norm": 56074.0078125, + "learning_rate": 3.885352406607484e-06, + "loss": 2.0869, + "step": 17475 + }, + { + "epoch": 3.27572633552015, + "grad_norm": 57125.97265625, + "learning_rate": 3.8823159601366564e-06, + "loss": 2.1142, + "step": 17476 + }, + { + "epoch": 3.2759137769447046, + "grad_norm": 53361.21484375, + "learning_rate": 3.879280652712836e-06, + "loss": 2.0927, + "step": 17477 + }, + { + "epoch": 3.2761012183692597, + "grad_norm": 54737.4296875, + "learning_rate": 3.876246484410978e-06, + "loss": 2.006, + "step": 17478 + }, + { + "epoch": 3.2762886597938143, + "grad_norm": 54022.01953125, + "learning_rate": 3.873213455306035e-06, + "loss": 2.0133, + "step": 17479 + }, + { + "epoch": 3.2764761012183694, + "grad_norm": 55495.41015625, + "learning_rate": 3.8701815654729186e-06, + "loss": 1.9942, + "step": 17480 + }, + { + "epoch": 3.276663542642924, + "grad_norm": 58087.13671875, + "learning_rate": 3.8671508149864974e-06, + "loss": 2.1237, + "step": 17481 + }, + { + "epoch": 3.276850984067479, + "grad_norm": 55355.24609375, + "learning_rate": 3.864121203921645e-06, + "loss": 2.0467, + "step": 17482 + }, + { + "epoch": 3.2770384254920337, + "grad_norm": 52890.2734375, + "learning_rate": 3.861092732353189e-06, + "loss": 2.1, + "step": 17483 + }, + { + "epoch": 3.277225866916589, + "grad_norm": 54602.546875, + "learning_rate": 3.85806540035592e-06, + "loss": 2.1107, + "step": 17484 + }, + { + "epoch": 3.2774133083411434, + "grad_norm": 54054.62109375, + "learning_rate": 3.855039208004607e-06, + "loss": 2.0699, + "step": 17485 + }, + { + "epoch": 3.277600749765698, + "grad_norm": 58943.0703125, + "learning_rate": 3.852014155374001e-06, + "loss": 2.1331, + "step": 17486 + }, + { + "epoch": 3.277788191190253, + "grad_norm": 57160.48828125, + "learning_rate": 3.848990242538808e-06, + "loss": 2.0173, + "step": 17487 + }, + { + "epoch": 3.2779756326148077, + "grad_norm": 56489.59765625, + "learning_rate": 3.84596746957373e-06, + "loss": 2.1104, + "step": 17488 + }, + { + "epoch": 3.278163074039363, + "grad_norm": 53078.50390625, + "learning_rate": 3.842945836553413e-06, + "loss": 2.0746, + "step": 17489 + }, + { + "epoch": 3.2783505154639174, + "grad_norm": 58782.36328125, + "learning_rate": 3.839925343552486e-06, + "loss": 2.014, + "step": 17490 + }, + { + "epoch": 3.2785379568884725, + "grad_norm": 58201.93359375, + "learning_rate": 3.8369059906455615e-06, + "loss": 2.0595, + "step": 17491 + }, + { + "epoch": 3.278725398313027, + "grad_norm": 52873.56640625, + "learning_rate": 3.833887777907203e-06, + "loss": 2.0663, + "step": 17492 + }, + { + "epoch": 3.278912839737582, + "grad_norm": 60247.24609375, + "learning_rate": 3.8308707054119654e-06, + "loss": 1.9719, + "step": 17493 + }, + { + "epoch": 3.279100281162137, + "grad_norm": 54798.65625, + "learning_rate": 3.8278547732343515e-06, + "loss": 2.1452, + "step": 17494 + }, + { + "epoch": 3.279287722586692, + "grad_norm": 54316.62890625, + "learning_rate": 3.824839981448874e-06, + "loss": 2.0361, + "step": 17495 + }, + { + "epoch": 3.2794751640112465, + "grad_norm": 50654.37109375, + "learning_rate": 3.82182633012998e-06, + "loss": 2.039, + "step": 17496 + }, + { + "epoch": 3.279662605435801, + "grad_norm": 57437.81640625, + "learning_rate": 3.818813819352096e-06, + "loss": 2.1459, + "step": 17497 + }, + { + "epoch": 3.279850046860356, + "grad_norm": 57196.69140625, + "learning_rate": 3.815802449189637e-06, + "loss": 2.0903, + "step": 17498 + }, + { + "epoch": 3.280037488284911, + "grad_norm": 65298.22265625, + "learning_rate": 3.8127922197169875e-06, + "loss": 1.9912, + "step": 17499 + }, + { + "epoch": 3.280224929709466, + "grad_norm": 56165.0, + "learning_rate": 3.8097831310084875e-06, + "loss": 2.0934, + "step": 17500 + }, + { + "epoch": 3.280224929709466, + "eval_loss": 2.258958578109741, + "eval_runtime": 127.2667, + "eval_samples_per_second": 39.673, + "eval_steps_per_second": 1.988, + "step": 17500 + }, + { + "epoch": 3.2804123711340205, + "grad_norm": 58049.171875, + "learning_rate": 3.80677518313845e-06, + "loss": 2.0313, + "step": 17501 + }, + { + "epoch": 3.2805998125585756, + "grad_norm": 50840.78125, + "learning_rate": 3.803768376181183e-06, + "loss": 2.0161, + "step": 17502 + }, + { + "epoch": 3.28078725398313, + "grad_norm": 51390.5703125, + "learning_rate": 3.8007627102109423e-06, + "loss": 2.0318, + "step": 17503 + }, + { + "epoch": 3.2809746954076853, + "grad_norm": 55700.0234375, + "learning_rate": 3.7977581853019585e-06, + "loss": 2.0884, + "step": 17504 + }, + { + "epoch": 3.28116213683224, + "grad_norm": 52002.10546875, + "learning_rate": 3.794754801528444e-06, + "loss": 2.0552, + "step": 17505 + }, + { + "epoch": 3.281349578256795, + "grad_norm": 61395.73046875, + "learning_rate": 3.791752558964595e-06, + "loss": 2.1, + "step": 17506 + }, + { + "epoch": 3.2815370196813496, + "grad_norm": 57208.59375, + "learning_rate": 3.7887514576845306e-06, + "loss": 2.1068, + "step": 17507 + }, + { + "epoch": 3.281724461105904, + "grad_norm": 53998.88671875, + "learning_rate": 3.7857514977623965e-06, + "loss": 2.0359, + "step": 17508 + }, + { + "epoch": 3.2819119025304593, + "grad_norm": 59605.37109375, + "learning_rate": 3.7827526792722834e-06, + "loss": 1.9872, + "step": 17509 + }, + { + "epoch": 3.282099343955014, + "grad_norm": 54653.85546875, + "learning_rate": 3.7797550022882655e-06, + "loss": 2.0236, + "step": 17510 + }, + { + "epoch": 3.282286785379569, + "grad_norm": 53459.51171875, + "learning_rate": 3.7767584668843616e-06, + "loss": 2.0908, + "step": 17511 + }, + { + "epoch": 3.2824742268041236, + "grad_norm": 54116.69140625, + "learning_rate": 3.7737630731346007e-06, + "loss": 2.0129, + "step": 17512 + }, + { + "epoch": 3.2826616682286787, + "grad_norm": 56991.8203125, + "learning_rate": 3.770768821112952e-06, + "loss": 2.1202, + "step": 17513 + }, + { + "epoch": 3.2828491096532333, + "grad_norm": 62740.78515625, + "learning_rate": 3.7677757108933842e-06, + "loss": 2.0885, + "step": 17514 + }, + { + "epoch": 3.2830365510777884, + "grad_norm": 53630.16015625, + "learning_rate": 3.76478374254981e-06, + "loss": 2.0605, + "step": 17515 + }, + { + "epoch": 3.283223992502343, + "grad_norm": 58175.08203125, + "learning_rate": 3.7617929161561372e-06, + "loss": 2.0497, + "step": 17516 + }, + { + "epoch": 3.283411433926898, + "grad_norm": 53255.01953125, + "learning_rate": 3.758803231786223e-06, + "loss": 2.0212, + "step": 17517 + }, + { + "epoch": 3.2835988753514527, + "grad_norm": 51446.6015625, + "learning_rate": 3.7558146895139257e-06, + "loss": 2.0761, + "step": 17518 + }, + { + "epoch": 3.2837863167760073, + "grad_norm": 56559.828125, + "learning_rate": 3.752827289413047e-06, + "loss": 1.9964, + "step": 17519 + }, + { + "epoch": 3.2839737582005624, + "grad_norm": 57564.15234375, + "learning_rate": 3.7498410315573663e-06, + "loss": 2.11, + "step": 17520 + }, + { + "epoch": 3.284161199625117, + "grad_norm": 55714.30078125, + "learning_rate": 3.7468559160206585e-06, + "loss": 2.0386, + "step": 17521 + }, + { + "epoch": 3.284348641049672, + "grad_norm": 56637.31640625, + "learning_rate": 3.743871942876631e-06, + "loss": 2.1229, + "step": 17522 + }, + { + "epoch": 3.2845360824742267, + "grad_norm": 54107.38671875, + "learning_rate": 3.740889112199003e-06, + "loss": 2.0702, + "step": 17523 + }, + { + "epoch": 3.2847235238987817, + "grad_norm": 58489.7890625, + "learning_rate": 3.737907424061432e-06, + "loss": 2.0663, + "step": 17524 + }, + { + "epoch": 3.2849109653233364, + "grad_norm": 57497.37890625, + "learning_rate": 3.7349268785375757e-06, + "loss": 2.0545, + "step": 17525 + }, + { + "epoch": 3.2850984067478914, + "grad_norm": 53620.0546875, + "learning_rate": 3.731947475701042e-06, + "loss": 2.0331, + "step": 17526 + }, + { + "epoch": 3.285285848172446, + "grad_norm": 57998.54296875, + "learning_rate": 3.728969215625405e-06, + "loss": 2.0847, + "step": 17527 + }, + { + "epoch": 3.285473289597001, + "grad_norm": 59547.984375, + "learning_rate": 3.7259920983842454e-06, + "loss": 2.0689, + "step": 17528 + }, + { + "epoch": 3.2856607310215558, + "grad_norm": 58509.66015625, + "learning_rate": 3.7230161240510988e-06, + "loss": 2.0654, + "step": 17529 + }, + { + "epoch": 3.2858481724461104, + "grad_norm": 56047.75, + "learning_rate": 3.720041292699433e-06, + "loss": 2.0327, + "step": 17530 + }, + { + "epoch": 3.2860356138706655, + "grad_norm": 60716.21875, + "learning_rate": 3.717067604402752e-06, + "loss": 2.0617, + "step": 17531 + }, + { + "epoch": 3.28622305529522, + "grad_norm": 53954.65234375, + "learning_rate": 3.7140950592345016e-06, + "loss": 2.0621, + "step": 17532 + }, + { + "epoch": 3.286410496719775, + "grad_norm": 56250.375, + "learning_rate": 3.71112365726809e-06, + "loss": 2.0686, + "step": 17533 + }, + { + "epoch": 3.2865979381443298, + "grad_norm": 52196.69140625, + "learning_rate": 3.7081533985769023e-06, + "loss": 2.1776, + "step": 17534 + }, + { + "epoch": 3.286785379568885, + "grad_norm": 56996.8203125, + "learning_rate": 3.7051842832343033e-06, + "loss": 2.0008, + "step": 17535 + }, + { + "epoch": 3.2869728209934395, + "grad_norm": 54347.86328125, + "learning_rate": 3.70221631131365e-06, + "loss": 2.0443, + "step": 17536 + }, + { + "epoch": 3.2871602624179945, + "grad_norm": 55129.3125, + "learning_rate": 3.6992494828882117e-06, + "loss": 2.0825, + "step": 17537 + }, + { + "epoch": 3.287347703842549, + "grad_norm": 61219.6015625, + "learning_rate": 3.696283798031286e-06, + "loss": 2.0752, + "step": 17538 + }, + { + "epoch": 3.287535145267104, + "grad_norm": 58871.2421875, + "learning_rate": 3.693319256816119e-06, + "loss": 2.0836, + "step": 17539 + }, + { + "epoch": 3.287722586691659, + "grad_norm": 50982.2890625, + "learning_rate": 3.6903558593159305e-06, + "loss": 2.0566, + "step": 17540 + }, + { + "epoch": 3.2879100281162135, + "grad_norm": 55649.859375, + "learning_rate": 3.687393605603906e-06, + "loss": 2.0953, + "step": 17541 + }, + { + "epoch": 3.2880974695407685, + "grad_norm": 64192.3984375, + "learning_rate": 3.6844324957532152e-06, + "loss": 2.0413, + "step": 17542 + }, + { + "epoch": 3.288284910965323, + "grad_norm": 56452.78125, + "learning_rate": 3.6814725298369935e-06, + "loss": 2.0659, + "step": 17543 + }, + { + "epoch": 3.2884723523898782, + "grad_norm": 51617.35546875, + "learning_rate": 3.6785137079283493e-06, + "loss": 2.0361, + "step": 17544 + }, + { + "epoch": 3.288659793814433, + "grad_norm": 52479.86328125, + "learning_rate": 3.675556030100352e-06, + "loss": 2.0371, + "step": 17545 + }, + { + "epoch": 3.288847235238988, + "grad_norm": 54510.5546875, + "learning_rate": 3.672599496426071e-06, + "loss": 2.0592, + "step": 17546 + }, + { + "epoch": 3.2890346766635425, + "grad_norm": 56690.1953125, + "learning_rate": 3.6696441069785146e-06, + "loss": 2.0859, + "step": 17547 + }, + { + "epoch": 3.2892221180880976, + "grad_norm": 51394.375, + "learning_rate": 3.6666898618306746e-06, + "loss": 2.1168, + "step": 17548 + }, + { + "epoch": 3.2894095595126522, + "grad_norm": 55876.8671875, + "learning_rate": 3.6637367610555306e-06, + "loss": 2.1049, + "step": 17549 + }, + { + "epoch": 3.2895970009372073, + "grad_norm": 60857.421875, + "learning_rate": 3.6607848047260086e-06, + "loss": 2.0199, + "step": 17550 + }, + { + "epoch": 3.289784442361762, + "grad_norm": 56263.9140625, + "learning_rate": 3.6578339929150217e-06, + "loss": 2.0368, + "step": 17551 + }, + { + "epoch": 3.2899718837863166, + "grad_norm": 55381.8203125, + "learning_rate": 3.654884325695457e-06, + "loss": 1.9955, + "step": 17552 + }, + { + "epoch": 3.2901593252108716, + "grad_norm": 59805.6875, + "learning_rate": 3.6519358031401553e-06, + "loss": 2.0338, + "step": 17553 + }, + { + "epoch": 3.2903467666354262, + "grad_norm": 54512.74609375, + "learning_rate": 3.6489884253219474e-06, + "loss": 2.0614, + "step": 17554 + }, + { + "epoch": 3.2905342080599813, + "grad_norm": 57154.4765625, + "learning_rate": 3.646042192313637e-06, + "loss": 2.0089, + "step": 17555 + }, + { + "epoch": 3.290721649484536, + "grad_norm": 54945.9609375, + "learning_rate": 3.643097104187987e-06, + "loss": 2.0301, + "step": 17556 + }, + { + "epoch": 3.290909090909091, + "grad_norm": 54057.87109375, + "learning_rate": 3.6401531610177285e-06, + "loss": 2.0935, + "step": 17557 + }, + { + "epoch": 3.2910965323336456, + "grad_norm": 59758.69921875, + "learning_rate": 3.637210362875576e-06, + "loss": 2.0483, + "step": 17558 + }, + { + "epoch": 3.2912839737582007, + "grad_norm": 57717.4296875, + "learning_rate": 3.6342687098342376e-06, + "loss": 2.037, + "step": 17559 + }, + { + "epoch": 3.2914714151827553, + "grad_norm": 55790.90625, + "learning_rate": 3.631328201966333e-06, + "loss": 2.0936, + "step": 17560 + }, + { + "epoch": 3.2916588566073104, + "grad_norm": 62053.90625, + "learning_rate": 3.6283888393444986e-06, + "loss": 2.0101, + "step": 17561 + }, + { + "epoch": 3.291846298031865, + "grad_norm": 53524.93359375, + "learning_rate": 3.625450622041349e-06, + "loss": 2.0524, + "step": 17562 + }, + { + "epoch": 3.2920337394564196, + "grad_norm": 54749.20703125, + "learning_rate": 3.6225135501294472e-06, + "loss": 2.1295, + "step": 17563 + }, + { + "epoch": 3.2922211808809747, + "grad_norm": 55275.74609375, + "learning_rate": 3.6195776236813193e-06, + "loss": 2.0536, + "step": 17564 + }, + { + "epoch": 3.2924086223055293, + "grad_norm": 53652.57421875, + "learning_rate": 3.616642842769502e-06, + "loss": 2.0681, + "step": 17565 + }, + { + "epoch": 3.2925960637300844, + "grad_norm": 57077.00390625, + "learning_rate": 3.6137092074664647e-06, + "loss": 2.1293, + "step": 17566 + }, + { + "epoch": 3.292783505154639, + "grad_norm": 61294.61328125, + "learning_rate": 3.6107767178446663e-06, + "loss": 2.0321, + "step": 17567 + }, + { + "epoch": 3.292970946579194, + "grad_norm": 55673.77734375, + "learning_rate": 3.6078453739765317e-06, + "loss": 2.0265, + "step": 17568 + }, + { + "epoch": 3.2931583880037487, + "grad_norm": 52617.26953125, + "learning_rate": 3.604915175934481e-06, + "loss": 2.0215, + "step": 17569 + }, + { + "epoch": 3.293345829428304, + "grad_norm": 57923.26171875, + "learning_rate": 3.6019861237908735e-06, + "loss": 2.0333, + "step": 17570 + }, + { + "epoch": 3.2935332708528584, + "grad_norm": 56442.4296875, + "learning_rate": 3.5990582176180456e-06, + "loss": 2.1163, + "step": 17571 + }, + { + "epoch": 3.2937207122774135, + "grad_norm": 58434.31640625, + "learning_rate": 3.5961314574883275e-06, + "loss": 2.1223, + "step": 17572 + }, + { + "epoch": 3.293908153701968, + "grad_norm": 51975.0390625, + "learning_rate": 3.59320584347399e-06, + "loss": 2.0809, + "step": 17573 + }, + { + "epoch": 3.2940955951265227, + "grad_norm": 59622.1015625, + "learning_rate": 3.5902813756473086e-06, + "loss": 2.0595, + "step": 17574 + }, + { + "epoch": 3.294283036551078, + "grad_norm": 54855.28125, + "learning_rate": 3.5873580540804976e-06, + "loss": 2.0616, + "step": 17575 + }, + { + "epoch": 3.294470477975633, + "grad_norm": 52819.01953125, + "learning_rate": 3.5844358788457766e-06, + "loss": 2.0575, + "step": 17576 + }, + { + "epoch": 3.2946579194001875, + "grad_norm": 54721.23046875, + "learning_rate": 3.5815148500153108e-06, + "loss": 2.0612, + "step": 17577 + }, + { + "epoch": 3.294845360824742, + "grad_norm": 59978.375, + "learning_rate": 3.5785949676612363e-06, + "loss": 2.0726, + "step": 17578 + }, + { + "epoch": 3.295032802249297, + "grad_norm": 61454.328125, + "learning_rate": 3.5756762318556902e-06, + "loss": 2.124, + "step": 17579 + }, + { + "epoch": 3.295220243673852, + "grad_norm": 53582.03515625, + "learning_rate": 3.5727586426707425e-06, + "loss": 2.0719, + "step": 17580 + }, + { + "epoch": 3.295407685098407, + "grad_norm": 57654.0, + "learning_rate": 3.5698422001784693e-06, + "loss": 2.0964, + "step": 17581 + }, + { + "epoch": 3.2955951265229615, + "grad_norm": 55752.24609375, + "learning_rate": 3.566926904450896e-06, + "loss": 2.0874, + "step": 17582 + }, + { + "epoch": 3.2957825679475166, + "grad_norm": 69035.2578125, + "learning_rate": 3.56401275556002e-06, + "loss": 2.0534, + "step": 17583 + }, + { + "epoch": 3.295970009372071, + "grad_norm": 52980.83203125, + "learning_rate": 3.5610997535778234e-06, + "loss": 2.0602, + "step": 17584 + }, + { + "epoch": 3.296157450796626, + "grad_norm": 57420.5859375, + "learning_rate": 3.5581878985762594e-06, + "loss": 2.0949, + "step": 17585 + }, + { + "epoch": 3.296344892221181, + "grad_norm": 55455.10546875, + "learning_rate": 3.5552771906272488e-06, + "loss": 1.995, + "step": 17586 + }, + { + "epoch": 3.296532333645736, + "grad_norm": 57273.27734375, + "learning_rate": 3.552367629802661e-06, + "loss": 2.0373, + "step": 17587 + }, + { + "epoch": 3.2967197750702906, + "grad_norm": 54128.90234375, + "learning_rate": 3.5494592161743833e-06, + "loss": 2.0748, + "step": 17588 + }, + { + "epoch": 3.296907216494845, + "grad_norm": 56448.13671875, + "learning_rate": 3.5465519498142364e-06, + "loss": 2.0362, + "step": 17589 + }, + { + "epoch": 3.2970946579194003, + "grad_norm": 56468.09765625, + "learning_rate": 3.5436458307940234e-06, + "loss": 2.0907, + "step": 17590 + }, + { + "epoch": 3.297282099343955, + "grad_norm": 54430.8828125, + "learning_rate": 3.5407408591855264e-06, + "loss": 2.1248, + "step": 17591 + }, + { + "epoch": 3.29746954076851, + "grad_norm": 52048.87890625, + "learning_rate": 3.5378370350604985e-06, + "loss": 2.0598, + "step": 17592 + }, + { + "epoch": 3.2976569821930646, + "grad_norm": 55276.15234375, + "learning_rate": 3.53493435849066e-06, + "loss": 2.091, + "step": 17593 + }, + { + "epoch": 3.2978444236176196, + "grad_norm": 58999.140625, + "learning_rate": 3.5320328295476934e-06, + "loss": 2.0963, + "step": 17594 + }, + { + "epoch": 3.2980318650421743, + "grad_norm": 58064.546875, + "learning_rate": 3.529132448303274e-06, + "loss": 2.1305, + "step": 17595 + }, + { + "epoch": 3.2982193064667293, + "grad_norm": 63347.92578125, + "learning_rate": 3.5262332148290333e-06, + "loss": 1.9585, + "step": 17596 + }, + { + "epoch": 3.298406747891284, + "grad_norm": 51634.84375, + "learning_rate": 3.5233351291965757e-06, + "loss": 2.0892, + "step": 17597 + }, + { + "epoch": 3.298594189315839, + "grad_norm": 63362.6640625, + "learning_rate": 3.5204381914774766e-06, + "loss": 2.0027, + "step": 17598 + }, + { + "epoch": 3.2987816307403937, + "grad_norm": 55204.37890625, + "learning_rate": 3.5175424017433012e-06, + "loss": 2.0552, + "step": 17599 + }, + { + "epoch": 3.2989690721649483, + "grad_norm": 58587.08984375, + "learning_rate": 3.514647760065565e-06, + "loss": 2.1426, + "step": 17600 + }, + { + "epoch": 3.2991565135895033, + "grad_norm": 57314.9296875, + "learning_rate": 3.511754266515749e-06, + "loss": 2.0091, + "step": 17601 + }, + { + "epoch": 3.299343955014058, + "grad_norm": 54022.0703125, + "learning_rate": 3.5088619211653406e-06, + "loss": 2.1788, + "step": 17602 + }, + { + "epoch": 3.299531396438613, + "grad_norm": 54800.85546875, + "learning_rate": 3.5059707240857664e-06, + "loss": 2.0326, + "step": 17603 + }, + { + "epoch": 3.2997188378631677, + "grad_norm": 56410.44921875, + "learning_rate": 3.5030806753484247e-06, + "loss": 2.0363, + "step": 17604 + }, + { + "epoch": 3.2999062792877227, + "grad_norm": 57597.69140625, + "learning_rate": 3.5001917750247138e-06, + "loss": 2.0411, + "step": 17605 + }, + { + "epoch": 3.3000937207122774, + "grad_norm": 54145.04296875, + "learning_rate": 3.497304023185971e-06, + "loss": 2.0578, + "step": 17606 + }, + { + "epoch": 3.3002811621368324, + "grad_norm": 53599.3828125, + "learning_rate": 3.4944174199035395e-06, + "loss": 2.0307, + "step": 17607 + }, + { + "epoch": 3.300468603561387, + "grad_norm": 55529.9765625, + "learning_rate": 3.49153196524869e-06, + "loss": 1.9976, + "step": 17608 + }, + { + "epoch": 3.300656044985942, + "grad_norm": 59443.5625, + "learning_rate": 3.4886476592927097e-06, + "loss": 2.025, + "step": 17609 + }, + { + "epoch": 3.3008434864104967, + "grad_norm": 53153.64453125, + "learning_rate": 3.4857645021068197e-06, + "loss": 2.0311, + "step": 17610 + }, + { + "epoch": 3.3010309278350514, + "grad_norm": 52132.67578125, + "learning_rate": 3.4828824937622518e-06, + "loss": 2.093, + "step": 17611 + }, + { + "epoch": 3.3012183692596064, + "grad_norm": 53261.8984375, + "learning_rate": 3.4800016343301768e-06, + "loss": 2.0981, + "step": 17612 + }, + { + "epoch": 3.301405810684161, + "grad_norm": 51526.8671875, + "learning_rate": 3.4771219238817322e-06, + "loss": 2.0602, + "step": 17613 + }, + { + "epoch": 3.301593252108716, + "grad_norm": 59014.6875, + "learning_rate": 3.4742433624880664e-06, + "loss": 2.0703, + "step": 17614 + }, + { + "epoch": 3.3017806935332707, + "grad_norm": 61287.65625, + "learning_rate": 3.4713659502202733e-06, + "loss": 1.9665, + "step": 17615 + }, + { + "epoch": 3.301968134957826, + "grad_norm": 56054.33984375, + "learning_rate": 3.4684896871494123e-06, + "loss": 2.0884, + "step": 17616 + }, + { + "epoch": 3.3021555763823804, + "grad_norm": 54701.2109375, + "learning_rate": 3.4656145733465205e-06, + "loss": 2.0139, + "step": 17617 + }, + { + "epoch": 3.3023430178069355, + "grad_norm": 57855.45703125, + "learning_rate": 3.462740608882631e-06, + "loss": 2.0181, + "step": 17618 + }, + { + "epoch": 3.30253045923149, + "grad_norm": 57728.765625, + "learning_rate": 3.459867793828703e-06, + "loss": 2.0872, + "step": 17619 + }, + { + "epoch": 3.302717900656045, + "grad_norm": 56315.03125, + "learning_rate": 3.456996128255696e-06, + "loss": 1.9819, + "step": 17620 + }, + { + "epoch": 3.3029053420806, + "grad_norm": 53709.859375, + "learning_rate": 3.4541256122345433e-06, + "loss": 2.0736, + "step": 17621 + }, + { + "epoch": 3.3030927835051545, + "grad_norm": 56953.140625, + "learning_rate": 3.451256245836154e-06, + "loss": 2.0313, + "step": 17622 + }, + { + "epoch": 3.3032802249297095, + "grad_norm": 58882.17578125, + "learning_rate": 3.4483880291313663e-06, + "loss": 2.1102, + "step": 17623 + }, + { + "epoch": 3.303467666354264, + "grad_norm": 54319.26171875, + "learning_rate": 3.44552096219104e-06, + "loss": 2.0589, + "step": 17624 + }, + { + "epoch": 3.303655107778819, + "grad_norm": 54674.140625, + "learning_rate": 3.4426550450859963e-06, + "loss": 2.0293, + "step": 17625 + }, + { + "epoch": 3.303842549203374, + "grad_norm": 52109.39453125, + "learning_rate": 3.4397902778870115e-06, + "loss": 2.0739, + "step": 17626 + }, + { + "epoch": 3.304029990627929, + "grad_norm": 55537.9140625, + "learning_rate": 3.43692666066483e-06, + "loss": 2.0115, + "step": 17627 + }, + { + "epoch": 3.3042174320524835, + "grad_norm": 57415.16015625, + "learning_rate": 3.4340641934901884e-06, + "loss": 2.0582, + "step": 17628 + }, + { + "epoch": 3.3044048734770386, + "grad_norm": 52275.5703125, + "learning_rate": 3.431202876433798e-06, + "loss": 2.0442, + "step": 17629 + }, + { + "epoch": 3.304592314901593, + "grad_norm": 59699.80859375, + "learning_rate": 3.4283427095663234e-06, + "loss": 2.0433, + "step": 17630 + }, + { + "epoch": 3.3047797563261483, + "grad_norm": 53554.484375, + "learning_rate": 3.4254836929583866e-06, + "loss": 2.0499, + "step": 17631 + }, + { + "epoch": 3.304967197750703, + "grad_norm": 58361.703125, + "learning_rate": 3.422625826680631e-06, + "loss": 2.0511, + "step": 17632 + }, + { + "epoch": 3.3051546391752575, + "grad_norm": 55033.31640625, + "learning_rate": 3.419769110803628e-06, + "loss": 2.1349, + "step": 17633 + }, + { + "epoch": 3.3053420805998126, + "grad_norm": 55820.62890625, + "learning_rate": 3.4169135453979316e-06, + "loss": 2.0458, + "step": 17634 + }, + { + "epoch": 3.3055295220243672, + "grad_norm": 54092.359375, + "learning_rate": 3.4140591305340753e-06, + "loss": 2.086, + "step": 17635 + }, + { + "epoch": 3.3057169634489223, + "grad_norm": 51568.66015625, + "learning_rate": 3.4112058662825574e-06, + "loss": 2.1279, + "step": 17636 + }, + { + "epoch": 3.305904404873477, + "grad_norm": 55473.51953125, + "learning_rate": 3.4083537527138554e-06, + "loss": 2.0794, + "step": 17637 + }, + { + "epoch": 3.306091846298032, + "grad_norm": 63096.015625, + "learning_rate": 3.4055027898984014e-06, + "loss": 2.0725, + "step": 17638 + }, + { + "epoch": 3.3062792877225866, + "grad_norm": 60426.84765625, + "learning_rate": 3.4026529779066285e-06, + "loss": 2.128, + "step": 17639 + }, + { + "epoch": 3.3064667291471417, + "grad_norm": 52474.1953125, + "learning_rate": 3.399804316808902e-06, + "loss": 2.0912, + "step": 17640 + }, + { + "epoch": 3.3066541705716963, + "grad_norm": 56533.06640625, + "learning_rate": 3.3969568066755995e-06, + "loss": 2.0572, + "step": 17641 + }, + { + "epoch": 3.3068416119962514, + "grad_norm": 56069.3046875, + "learning_rate": 3.3941104475770426e-06, + "loss": 2.0724, + "step": 17642 + }, + { + "epoch": 3.307029053420806, + "grad_norm": 58239.89453125, + "learning_rate": 3.3912652395835242e-06, + "loss": 2.0797, + "step": 17643 + }, + { + "epoch": 3.3072164948453606, + "grad_norm": 61432.02734375, + "learning_rate": 3.3884211827653332e-06, + "loss": 2.0246, + "step": 17644 + }, + { + "epoch": 3.3074039362699157, + "grad_norm": 53711.80859375, + "learning_rate": 3.3855782771927027e-06, + "loss": 2.0995, + "step": 17645 + }, + { + "epoch": 3.3075913776944703, + "grad_norm": 54636.6171875, + "learning_rate": 3.3827365229358477e-06, + "loss": 2.0758, + "step": 17646 + }, + { + "epoch": 3.3077788191190254, + "grad_norm": 54352.85546875, + "learning_rate": 3.379895920064963e-06, + "loss": 2.0799, + "step": 17647 + }, + { + "epoch": 3.30796626054358, + "grad_norm": 49884.90625, + "learning_rate": 3.377056468650208e-06, + "loss": 2.0955, + "step": 17648 + }, + { + "epoch": 3.308153701968135, + "grad_norm": 57297.7421875, + "learning_rate": 3.3742181687617114e-06, + "loss": 2.1048, + "step": 17649 + }, + { + "epoch": 3.3083411433926897, + "grad_norm": 53496.45703125, + "learning_rate": 3.3713810204695716e-06, + "loss": 2.1614, + "step": 17650 + }, + { + "epoch": 3.3085285848172448, + "grad_norm": 57544.0234375, + "learning_rate": 3.3685450238438608e-06, + "loss": 2.0211, + "step": 17651 + }, + { + "epoch": 3.3087160262417994, + "grad_norm": 57435.0078125, + "learning_rate": 3.365710178954645e-06, + "loss": 2.0231, + "step": 17652 + }, + { + "epoch": 3.3089034676663545, + "grad_norm": 55365.9453125, + "learning_rate": 3.362876485871913e-06, + "loss": 2.0915, + "step": 17653 + }, + { + "epoch": 3.309090909090909, + "grad_norm": 57870.0234375, + "learning_rate": 3.360043944665664e-06, + "loss": 2.1553, + "step": 17654 + }, + { + "epoch": 3.3092783505154637, + "grad_norm": 53320.4609375, + "learning_rate": 3.35721255540587e-06, + "loss": 2.083, + "step": 17655 + }, + { + "epoch": 3.3094657919400188, + "grad_norm": 61755.1328125, + "learning_rate": 3.3543823181624525e-06, + "loss": 2.0202, + "step": 17656 + }, + { + "epoch": 3.3096532333645734, + "grad_norm": 55879.20703125, + "learning_rate": 3.3515532330053057e-06, + "loss": 2.0889, + "step": 17657 + }, + { + "epoch": 3.3098406747891285, + "grad_norm": 56068.18359375, + "learning_rate": 3.3487253000043183e-06, + "loss": 2.1046, + "step": 17658 + }, + { + "epoch": 3.310028116213683, + "grad_norm": 56147.4375, + "learning_rate": 3.3458985192293344e-06, + "loss": 2.038, + "step": 17659 + }, + { + "epoch": 3.310215557638238, + "grad_norm": 54703.50390625, + "learning_rate": 3.3430728907501594e-06, + "loss": 2.0841, + "step": 17660 + }, + { + "epoch": 3.310402999062793, + "grad_norm": 55573.12890625, + "learning_rate": 3.3402484146365985e-06, + "loss": 2.0893, + "step": 17661 + }, + { + "epoch": 3.310590440487348, + "grad_norm": 58456.984375, + "learning_rate": 3.337425090958407e-06, + "loss": 2.0603, + "step": 17662 + }, + { + "epoch": 3.3107778819119025, + "grad_norm": 54652.6015625, + "learning_rate": 3.3346029197853235e-06, + "loss": 2.0342, + "step": 17663 + }, + { + "epoch": 3.3109653233364575, + "grad_norm": 60943.03125, + "learning_rate": 3.331781901187031e-06, + "loss": 2.0845, + "step": 17664 + }, + { + "epoch": 3.311152764761012, + "grad_norm": 61193.73046875, + "learning_rate": 3.3289620352332295e-06, + "loss": 2.1212, + "step": 17665 + }, + { + "epoch": 3.311340206185567, + "grad_norm": 55212.0, + "learning_rate": 3.326143321993547e-06, + "loss": 2.0153, + "step": 17666 + }, + { + "epoch": 3.311527647610122, + "grad_norm": 56168.5546875, + "learning_rate": 3.3233257615376213e-06, + "loss": 2.0562, + "step": 17667 + }, + { + "epoch": 3.3117150890346765, + "grad_norm": 56568.4921875, + "learning_rate": 3.320509353935025e-06, + "loss": 2.1605, + "step": 17668 + }, + { + "epoch": 3.3119025304592316, + "grad_norm": 54939.89453125, + "learning_rate": 3.3176940992553308e-06, + "loss": 2.0771, + "step": 17669 + }, + { + "epoch": 3.312089971883786, + "grad_norm": 58245.3671875, + "learning_rate": 3.3148799975680656e-06, + "loss": 2.0243, + "step": 17670 + }, + { + "epoch": 3.3122774133083412, + "grad_norm": 54500.5625, + "learning_rate": 3.312067048942741e-06, + "loss": 2.1183, + "step": 17671 + }, + { + "epoch": 3.312464854732896, + "grad_norm": 52376.3203125, + "learning_rate": 3.309255253448823e-06, + "loss": 2.1385, + "step": 17672 + }, + { + "epoch": 3.312652296157451, + "grad_norm": 58753.7578125, + "learning_rate": 3.306444611155762e-06, + "loss": 2.2857, + "step": 17673 + }, + { + "epoch": 3.3128397375820056, + "grad_norm": 58049.4296875, + "learning_rate": 3.303635122132992e-06, + "loss": 2.0108, + "step": 17674 + }, + { + "epoch": 3.3130271790065606, + "grad_norm": 59555.22265625, + "learning_rate": 3.3008267864498844e-06, + "loss": 2.1023, + "step": 17675 + }, + { + "epoch": 3.3132146204311153, + "grad_norm": 59355.75390625, + "learning_rate": 3.298019604175806e-06, + "loss": 2.0405, + "step": 17676 + }, + { + "epoch": 3.31340206185567, + "grad_norm": 52623.765625, + "learning_rate": 3.2952135753800904e-06, + "loss": 2.0368, + "step": 17677 + }, + { + "epoch": 3.313589503280225, + "grad_norm": 58025.3359375, + "learning_rate": 3.292408700132055e-06, + "loss": 2.0299, + "step": 17678 + }, + { + "epoch": 3.3137769447047796, + "grad_norm": 60596.11328125, + "learning_rate": 3.2896049785009652e-06, + "loss": 2.0908, + "step": 17679 + }, + { + "epoch": 3.3139643861293346, + "grad_norm": 55787.54296875, + "learning_rate": 3.286802410556067e-06, + "loss": 2.0965, + "step": 17680 + }, + { + "epoch": 3.3141518275538893, + "grad_norm": 49372.671875, + "learning_rate": 3.284000996366582e-06, + "loss": 2.097, + "step": 17681 + }, + { + "epoch": 3.3143392689784443, + "grad_norm": 54033.3125, + "learning_rate": 3.2812007360017217e-06, + "loss": 2.039, + "step": 17682 + }, + { + "epoch": 3.314526710402999, + "grad_norm": 57540.92578125, + "learning_rate": 3.278401629530614e-06, + "loss": 2.1238, + "step": 17683 + }, + { + "epoch": 3.314714151827554, + "grad_norm": 55353.515625, + "learning_rate": 3.275603677022415e-06, + "loss": 2.0575, + "step": 17684 + }, + { + "epoch": 3.3149015932521086, + "grad_norm": 57389.98046875, + "learning_rate": 3.272806878546231e-06, + "loss": 2.059, + "step": 17685 + }, + { + "epoch": 3.3150890346766637, + "grad_norm": 55788.421875, + "learning_rate": 3.270011234171133e-06, + "loss": 2.0561, + "step": 17686 + }, + { + "epoch": 3.3152764761012183, + "grad_norm": 58492.125, + "learning_rate": 3.267216743966167e-06, + "loss": 2.0978, + "step": 17687 + }, + { + "epoch": 3.315463917525773, + "grad_norm": 52808.38671875, + "learning_rate": 3.264423408000361e-06, + "loss": 2.0128, + "step": 17688 + }, + { + "epoch": 3.315651358950328, + "grad_norm": 57313.30859375, + "learning_rate": 3.261631226342704e-06, + "loss": 2.1087, + "step": 17689 + }, + { + "epoch": 3.3158388003748827, + "grad_norm": 56666.015625, + "learning_rate": 3.258840199062152e-06, + "loss": 2.0509, + "step": 17690 + }, + { + "epoch": 3.3160262417994377, + "grad_norm": 51851.046875, + "learning_rate": 3.2560503262276444e-06, + "loss": 2.0717, + "step": 17691 + }, + { + "epoch": 3.3162136832239923, + "grad_norm": 58692.73828125, + "learning_rate": 3.253261607908098e-06, + "loss": 2.0868, + "step": 17692 + }, + { + "epoch": 3.3164011246485474, + "grad_norm": 52484.58984375, + "learning_rate": 3.250474044172386e-06, + "loss": 2.0223, + "step": 17693 + }, + { + "epoch": 3.316588566073102, + "grad_norm": 52737.41796875, + "learning_rate": 3.2476876350893416e-06, + "loss": 2.0523, + "step": 17694 + }, + { + "epoch": 3.316776007497657, + "grad_norm": 57900.32421875, + "learning_rate": 3.2449023807278044e-06, + "loss": 1.9913, + "step": 17695 + }, + { + "epoch": 3.3169634489222117, + "grad_norm": 55118.7421875, + "learning_rate": 3.2421182811565533e-06, + "loss": 2.0789, + "step": 17696 + }, + { + "epoch": 3.317150890346767, + "grad_norm": 54697.70703125, + "learning_rate": 3.2393353364443657e-06, + "loss": 1.9861, + "step": 17697 + }, + { + "epoch": 3.3173383317713214, + "grad_norm": 52092.8515625, + "learning_rate": 3.2365535466599707e-06, + "loss": 2.0491, + "step": 17698 + }, + { + "epoch": 3.317525773195876, + "grad_norm": 54837.3359375, + "learning_rate": 3.2337729118720627e-06, + "loss": 2.0828, + "step": 17699 + }, + { + "epoch": 3.317713214620431, + "grad_norm": 54700.84375, + "learning_rate": 3.230993432149343e-06, + "loss": 2.0468, + "step": 17700 + }, + { + "epoch": 3.317900656044986, + "grad_norm": 56659.84765625, + "learning_rate": 3.228215107560434e-06, + "loss": 2.0711, + "step": 17701 + }, + { + "epoch": 3.318088097469541, + "grad_norm": 54302.08984375, + "learning_rate": 3.225437938173981e-06, + "loss": 2.0466, + "step": 17702 + }, + { + "epoch": 3.3182755388940954, + "grad_norm": 55806.515625, + "learning_rate": 3.222661924058562e-06, + "loss": 2.0855, + "step": 17703 + }, + { + "epoch": 3.3184629803186505, + "grad_norm": 56546.80859375, + "learning_rate": 3.2198870652827505e-06, + "loss": 2.1564, + "step": 17704 + }, + { + "epoch": 3.318650421743205, + "grad_norm": 52952.61328125, + "learning_rate": 3.2171133619150808e-06, + "loss": 2.0559, + "step": 17705 + }, + { + "epoch": 3.31883786316776, + "grad_norm": 54360.1328125, + "learning_rate": 3.214340814024047e-06, + "loss": 2.0819, + "step": 17706 + }, + { + "epoch": 3.319025304592315, + "grad_norm": 54108.9609375, + "learning_rate": 3.2115694216781344e-06, + "loss": 2.133, + "step": 17707 + }, + { + "epoch": 3.31921274601687, + "grad_norm": 58330.15625, + "learning_rate": 3.2087991849458043e-06, + "loss": 1.9835, + "step": 17708 + }, + { + "epoch": 3.3194001874414245, + "grad_norm": 57964.8984375, + "learning_rate": 3.2060301038954688e-06, + "loss": 2.1142, + "step": 17709 + }, + { + "epoch": 3.319587628865979, + "grad_norm": 54116.25390625, + "learning_rate": 3.2032621785955117e-06, + "loss": 2.1155, + "step": 17710 + }, + { + "epoch": 3.319775070290534, + "grad_norm": 55985.16796875, + "learning_rate": 3.200495409114318e-06, + "loss": 2.0549, + "step": 17711 + }, + { + "epoch": 3.3199625117150893, + "grad_norm": 55806.4921875, + "learning_rate": 3.1977297955202045e-06, + "loss": 2.112, + "step": 17712 + }, + { + "epoch": 3.320149953139644, + "grad_norm": 49858.65625, + "learning_rate": 3.1949653378814837e-06, + "loss": 2.0537, + "step": 17713 + }, + { + "epoch": 3.3203373945641985, + "grad_norm": 52830.80078125, + "learning_rate": 3.192202036266434e-06, + "loss": 2.0966, + "step": 17714 + }, + { + "epoch": 3.3205248359887536, + "grad_norm": 60461.05078125, + "learning_rate": 3.1894398907433176e-06, + "loss": 2.116, + "step": 17715 + }, + { + "epoch": 3.320712277413308, + "grad_norm": 55429.6953125, + "learning_rate": 3.1866789013803355e-06, + "loss": 2.1244, + "step": 17716 + }, + { + "epoch": 3.3208997188378633, + "grad_norm": 54460.34765625, + "learning_rate": 3.183919068245683e-06, + "loss": 2.0471, + "step": 17717 + }, + { + "epoch": 3.321087160262418, + "grad_norm": 53815.4453125, + "learning_rate": 3.181160391407545e-06, + "loss": 2.1171, + "step": 17718 + }, + { + "epoch": 3.321274601686973, + "grad_norm": 51217.578125, + "learning_rate": 3.178402870934044e-06, + "loss": 2.0935, + "step": 17719 + }, + { + "epoch": 3.3214620431115276, + "grad_norm": 53519.23828125, + "learning_rate": 3.175646506893276e-06, + "loss": 1.9952, + "step": 17720 + }, + { + "epoch": 3.3216494845360827, + "grad_norm": 52140.4609375, + "learning_rate": 3.172891299353331e-06, + "loss": 2.0781, + "step": 17721 + }, + { + "epoch": 3.3218369259606373, + "grad_norm": 53402.68359375, + "learning_rate": 3.170137248382271e-06, + "loss": 2.0673, + "step": 17722 + }, + { + "epoch": 3.3220243673851924, + "grad_norm": 62105.42578125, + "learning_rate": 3.1673843540480972e-06, + "loss": 2.1198, + "step": 17723 + }, + { + "epoch": 3.322211808809747, + "grad_norm": 53336.640625, + "learning_rate": 3.164632616418811e-06, + "loss": 2.1624, + "step": 17724 + }, + { + "epoch": 3.3223992502343016, + "grad_norm": 55241.51171875, + "learning_rate": 3.1618820355623802e-06, + "loss": 2.0735, + "step": 17725 + }, + { + "epoch": 3.3225866916588567, + "grad_norm": 57126.6328125, + "learning_rate": 3.1591326115467278e-06, + "loss": 2.0888, + "step": 17726 + }, + { + "epoch": 3.3227741330834113, + "grad_norm": 55216.640625, + "learning_rate": 3.1563843444397777e-06, + "loss": 2.1443, + "step": 17727 + }, + { + "epoch": 3.3229615745079664, + "grad_norm": 55042.94140625, + "learning_rate": 3.1536372343094033e-06, + "loss": 2.1195, + "step": 17728 + }, + { + "epoch": 3.323149015932521, + "grad_norm": 58886.74609375, + "learning_rate": 3.150891281223439e-06, + "loss": 2.0958, + "step": 17729 + }, + { + "epoch": 3.323336457357076, + "grad_norm": 56328.85546875, + "learning_rate": 3.148146485249731e-06, + "loss": 2.0654, + "step": 17730 + }, + { + "epoch": 3.3235238987816307, + "grad_norm": 56966.63671875, + "learning_rate": 3.1454028464560526e-06, + "loss": 2.0174, + "step": 17731 + }, + { + "epoch": 3.3237113402061857, + "grad_norm": 53317.234375, + "learning_rate": 3.142660364910183e-06, + "loss": 2.0743, + "step": 17732 + }, + { + "epoch": 3.3238987816307404, + "grad_norm": 56626.7265625, + "learning_rate": 3.139919040679845e-06, + "loss": 2.05, + "step": 17733 + }, + { + "epoch": 3.3240862230552954, + "grad_norm": 58250.87109375, + "learning_rate": 3.137178873832758e-06, + "loss": 1.9601, + "step": 17734 + }, + { + "epoch": 3.32427366447985, + "grad_norm": 55591.375, + "learning_rate": 3.134439864436595e-06, + "loss": 2.0014, + "step": 17735 + }, + { + "epoch": 3.3244611059044047, + "grad_norm": 57174.44921875, + "learning_rate": 3.1317020125589957e-06, + "loss": 2.0455, + "step": 17736 + }, + { + "epoch": 3.3246485473289598, + "grad_norm": 54115.6015625, + "learning_rate": 3.1289653182675906e-06, + "loss": 2.078, + "step": 17737 + }, + { + "epoch": 3.3248359887535144, + "grad_norm": 55437.8359375, + "learning_rate": 3.126229781629991e-06, + "loss": 2.089, + "step": 17738 + }, + { + "epoch": 3.3250234301780695, + "grad_norm": 52616.6953125, + "learning_rate": 3.1234954027137276e-06, + "loss": 2.0683, + "step": 17739 + }, + { + "epoch": 3.325210871602624, + "grad_norm": 59151.1015625, + "learning_rate": 3.1207621815863507e-06, + "loss": 2.2725, + "step": 17740 + }, + { + "epoch": 3.325398313027179, + "grad_norm": 53497.4765625, + "learning_rate": 3.118030118315374e-06, + "loss": 2.1234, + "step": 17741 + }, + { + "epoch": 3.3255857544517338, + "grad_norm": 58548.515625, + "learning_rate": 3.1152992129682757e-06, + "loss": 2.0613, + "step": 17742 + }, + { + "epoch": 3.325773195876289, + "grad_norm": 55538.62109375, + "learning_rate": 3.112569465612486e-06, + "loss": 1.971, + "step": 17743 + }, + { + "epoch": 3.3259606373008435, + "grad_norm": 54103.94140625, + "learning_rate": 3.1098408763154396e-06, + "loss": 2.0607, + "step": 17744 + }, + { + "epoch": 3.3261480787253985, + "grad_norm": 52585.8828125, + "learning_rate": 3.1071134451445494e-06, + "loss": 2.0386, + "step": 17745 + }, + { + "epoch": 3.326335520149953, + "grad_norm": 55531.73046875, + "learning_rate": 3.1043871721671445e-06, + "loss": 2.0903, + "step": 17746 + }, + { + "epoch": 3.3265229615745078, + "grad_norm": 61377.43359375, + "learning_rate": 3.1016620574505718e-06, + "loss": 2.1062, + "step": 17747 + }, + { + "epoch": 3.326710402999063, + "grad_norm": 60046.703125, + "learning_rate": 3.0989381010621486e-06, + "loss": 2.0812, + "step": 17748 + }, + { + "epoch": 3.3268978444236175, + "grad_norm": 56889.42578125, + "learning_rate": 3.09621530306915e-06, + "loss": 2.088, + "step": 17749 + }, + { + "epoch": 3.3270852858481725, + "grad_norm": 56174.37109375, + "learning_rate": 3.09349366353881e-06, + "loss": 2.0324, + "step": 17750 + }, + { + "epoch": 3.327272727272727, + "grad_norm": 56953.6796875, + "learning_rate": 3.090773182538376e-06, + "loss": 2.049, + "step": 17751 + }, + { + "epoch": 3.3274601686972822, + "grad_norm": 55455.74609375, + "learning_rate": 3.0880538601350095e-06, + "loss": 2.0443, + "step": 17752 + }, + { + "epoch": 3.327647610121837, + "grad_norm": 56896.08203125, + "learning_rate": 3.085335696395902e-06, + "loss": 2.0435, + "step": 17753 + }, + { + "epoch": 3.327835051546392, + "grad_norm": 57130.984375, + "learning_rate": 3.082618691388173e-06, + "loss": 2.0191, + "step": 17754 + }, + { + "epoch": 3.3280224929709465, + "grad_norm": 54313.19140625, + "learning_rate": 3.0799028451789336e-06, + "loss": 2.122, + "step": 17755 + }, + { + "epoch": 3.3282099343955016, + "grad_norm": 58512.890625, + "learning_rate": 3.0771881578352648e-06, + "loss": 2.0223, + "step": 17756 + }, + { + "epoch": 3.3283973758200562, + "grad_norm": 54642.8515625, + "learning_rate": 3.0744746294242065e-06, + "loss": 2.0833, + "step": 17757 + }, + { + "epoch": 3.328584817244611, + "grad_norm": 58207.70703125, + "learning_rate": 3.071762260012789e-06, + "loss": 2.0465, + "step": 17758 + }, + { + "epoch": 3.328772258669166, + "grad_norm": 52109.5, + "learning_rate": 3.0690510496679924e-06, + "loss": 2.0738, + "step": 17759 + }, + { + "epoch": 3.3289597000937206, + "grad_norm": 56539.65234375, + "learning_rate": 3.066340998456796e-06, + "loss": 2.1301, + "step": 17760 + }, + { + "epoch": 3.3291471415182756, + "grad_norm": 52389.0546875, + "learning_rate": 3.0636321064461238e-06, + "loss": 2.0366, + "step": 17761 + }, + { + "epoch": 3.3293345829428302, + "grad_norm": 64885.31640625, + "learning_rate": 3.060924373702889e-06, + "loss": 1.982, + "step": 17762 + }, + { + "epoch": 3.3295220243673853, + "grad_norm": 57559.08203125, + "learning_rate": 3.0582178002939556e-06, + "loss": 2.0348, + "step": 17763 + }, + { + "epoch": 3.32970946579194, + "grad_norm": 60106.85546875, + "learning_rate": 3.0555123862861913e-06, + "loss": 2.1263, + "step": 17764 + }, + { + "epoch": 3.329896907216495, + "grad_norm": 57355.15625, + "learning_rate": 3.052808131746404e-06, + "loss": 2.0706, + "step": 17765 + }, + { + "epoch": 3.3300843486410496, + "grad_norm": 60769.30859375, + "learning_rate": 3.0501050367413854e-06, + "loss": 2.1381, + "step": 17766 + }, + { + "epoch": 3.3302717900656047, + "grad_norm": 51552.7109375, + "learning_rate": 3.0474031013378924e-06, + "loss": 2.0232, + "step": 17767 + }, + { + "epoch": 3.3304592314901593, + "grad_norm": 57418.4296875, + "learning_rate": 3.0447023256026886e-06, + "loss": 2.0717, + "step": 17768 + }, + { + "epoch": 3.330646672914714, + "grad_norm": 61888.84765625, + "learning_rate": 3.0420027096024427e-06, + "loss": 2.0471, + "step": 17769 + }, + { + "epoch": 3.330834114339269, + "grad_norm": 64094.01953125, + "learning_rate": 3.0393042534038517e-06, + "loss": 2.1301, + "step": 17770 + }, + { + "epoch": 3.3310215557638236, + "grad_norm": 62397.6484375, + "learning_rate": 3.0366069570735623e-06, + "loss": 1.9587, + "step": 17771 + }, + { + "epoch": 3.3312089971883787, + "grad_norm": 55559.71875, + "learning_rate": 3.0339108206781987e-06, + "loss": 2.0561, + "step": 17772 + }, + { + "epoch": 3.3313964386129333, + "grad_norm": 55806.4765625, + "learning_rate": 3.0312158442843297e-06, + "loss": 1.9521, + "step": 17773 + }, + { + "epoch": 3.3315838800374884, + "grad_norm": 62626.40625, + "learning_rate": 3.0285220279585413e-06, + "loss": 2.0376, + "step": 17774 + }, + { + "epoch": 3.331771321462043, + "grad_norm": 57646.21875, + "learning_rate": 3.0258293717673693e-06, + "loss": 2.1246, + "step": 17775 + }, + { + "epoch": 3.331958762886598, + "grad_norm": 56387.44140625, + "learning_rate": 3.023137875777293e-06, + "loss": 2.108, + "step": 17776 + }, + { + "epoch": 3.3321462043111527, + "grad_norm": 58392.59375, + "learning_rate": 3.0204475400548106e-06, + "loss": 2.0153, + "step": 17777 + }, + { + "epoch": 3.332333645735708, + "grad_norm": 55266.94140625, + "learning_rate": 3.0177583646663675e-06, + "loss": 2.0641, + "step": 17778 + }, + { + "epoch": 3.3325210871602624, + "grad_norm": 55282.1796875, + "learning_rate": 3.015070349678378e-06, + "loss": 2.0678, + "step": 17779 + }, + { + "epoch": 3.332708528584817, + "grad_norm": 52019.6953125, + "learning_rate": 3.0123834951572285e-06, + "loss": 2.0933, + "step": 17780 + }, + { + "epoch": 3.332895970009372, + "grad_norm": 56343.4375, + "learning_rate": 3.0096978011692924e-06, + "loss": 2.0037, + "step": 17781 + }, + { + "epoch": 3.3330834114339267, + "grad_norm": 60159.7734375, + "learning_rate": 3.0070132677809006e-06, + "loss": 2.019, + "step": 17782 + }, + { + "epoch": 3.333270852858482, + "grad_norm": 55736.87109375, + "learning_rate": 3.004329895058339e-06, + "loss": 2.0927, + "step": 17783 + }, + { + "epoch": 3.3334582942830364, + "grad_norm": 49907.21484375, + "learning_rate": 3.001647683067904e-06, + "loss": 2.0445, + "step": 17784 + }, + { + "epoch": 3.3336457357075915, + "grad_norm": 54808.24609375, + "learning_rate": 2.9989666318758437e-06, + "loss": 2.1362, + "step": 17785 + }, + { + "epoch": 3.333833177132146, + "grad_norm": 51702.890625, + "learning_rate": 2.996286741548371e-06, + "loss": 2.056, + "step": 17786 + }, + { + "epoch": 3.334020618556701, + "grad_norm": 54816.59375, + "learning_rate": 2.993608012151666e-06, + "loss": 2.0279, + "step": 17787 + }, + { + "epoch": 3.334208059981256, + "grad_norm": 55687.9609375, + "learning_rate": 2.990930443751905e-06, + "loss": 2.0243, + "step": 17788 + }, + { + "epoch": 3.334395501405811, + "grad_norm": 50130.9765625, + "learning_rate": 2.9882540364152058e-06, + "loss": 2.052, + "step": 17789 + }, + { + "epoch": 3.3345829428303655, + "grad_norm": 55965.22265625, + "learning_rate": 2.985578790207688e-06, + "loss": 2.0975, + "step": 17790 + }, + { + "epoch": 3.33477038425492, + "grad_norm": 63504.5703125, + "learning_rate": 2.982904705195422e-06, + "loss": 2.0299, + "step": 17791 + }, + { + "epoch": 3.334957825679475, + "grad_norm": 55400.8359375, + "learning_rate": 2.9802317814444368e-06, + "loss": 2.0751, + "step": 17792 + }, + { + "epoch": 3.33514526710403, + "grad_norm": 54872.28515625, + "learning_rate": 2.9775600190207697e-06, + "loss": 2.0322, + "step": 17793 + }, + { + "epoch": 3.335332708528585, + "grad_norm": 58872.984375, + "learning_rate": 2.974889417990412e-06, + "loss": 2.1326, + "step": 17794 + }, + { + "epoch": 3.3355201499531395, + "grad_norm": 56818.98046875, + "learning_rate": 2.972219978419316e-06, + "loss": 2.0821, + "step": 17795 + }, + { + "epoch": 3.3357075913776946, + "grad_norm": 56240.015625, + "learning_rate": 2.9695517003734073e-06, + "loss": 2.0877, + "step": 17796 + }, + { + "epoch": 3.335895032802249, + "grad_norm": 51972.45703125, + "learning_rate": 2.9668845839186055e-06, + "loss": 1.9968, + "step": 17797 + }, + { + "epoch": 3.3360824742268043, + "grad_norm": 53692.38671875, + "learning_rate": 2.9642186291207683e-06, + "loss": 2.1102, + "step": 17798 + }, + { + "epoch": 3.336269915651359, + "grad_norm": 55454.92578125, + "learning_rate": 2.9615538360457495e-06, + "loss": 2.1555, + "step": 17799 + }, + { + "epoch": 3.336457357075914, + "grad_norm": 51965.03125, + "learning_rate": 2.9588902047593568e-06, + "loss": 2.0892, + "step": 17800 + }, + { + "epoch": 3.3366447985004686, + "grad_norm": 53083.6796875, + "learning_rate": 2.956227735327399e-06, + "loss": 2.0969, + "step": 17801 + }, + { + "epoch": 3.336832239925023, + "grad_norm": 54690.58984375, + "learning_rate": 2.9535664278156185e-06, + "loss": 2.1193, + "step": 17802 + }, + { + "epoch": 3.3370196813495783, + "grad_norm": 58134.48046875, + "learning_rate": 2.9509062822897447e-06, + "loss": 2.0938, + "step": 17803 + }, + { + "epoch": 3.337207122774133, + "grad_norm": 57057.13671875, + "learning_rate": 2.9482472988154986e-06, + "loss": 2.0862, + "step": 17804 + }, + { + "epoch": 3.337394564198688, + "grad_norm": 57615.15625, + "learning_rate": 2.9455894774585325e-06, + "loss": 2.1097, + "step": 17805 + }, + { + "epoch": 3.3375820056232426, + "grad_norm": 54472.78515625, + "learning_rate": 2.9429328182844994e-06, + "loss": 2.0869, + "step": 17806 + }, + { + "epoch": 3.3377694470477977, + "grad_norm": 52910.43359375, + "learning_rate": 2.9402773213590084e-06, + "loss": 2.0317, + "step": 17807 + }, + { + "epoch": 3.3379568884723523, + "grad_norm": 55248.0625, + "learning_rate": 2.9376229867476677e-06, + "loss": 2.0672, + "step": 17808 + }, + { + "epoch": 3.3381443298969073, + "grad_norm": 61472.8671875, + "learning_rate": 2.9349698145160144e-06, + "loss": 2.0582, + "step": 17809 + }, + { + "epoch": 3.338331771321462, + "grad_norm": 58092.9765625, + "learning_rate": 2.932317804729584e-06, + "loss": 2.0731, + "step": 17810 + }, + { + "epoch": 3.338519212746017, + "grad_norm": 52071.39453125, + "learning_rate": 2.929666957453886e-06, + "loss": 2.1222, + "step": 17811 + }, + { + "epoch": 3.3387066541705717, + "grad_norm": 54089.640625, + "learning_rate": 2.92701727275439e-06, + "loss": 2.1224, + "step": 17812 + }, + { + "epoch": 3.3388940955951263, + "grad_norm": 51054.33984375, + "learning_rate": 2.924368750696521e-06, + "loss": 2.0742, + "step": 17813 + }, + { + "epoch": 3.3390815370196814, + "grad_norm": 52856.89453125, + "learning_rate": 2.9217213913457166e-06, + "loss": 2.0279, + "step": 17814 + }, + { + "epoch": 3.3392689784442364, + "grad_norm": 56425.82421875, + "learning_rate": 2.9190751947673565e-06, + "loss": 2.0172, + "step": 17815 + }, + { + "epoch": 3.339456419868791, + "grad_norm": 57673.65625, + "learning_rate": 2.9164301610268062e-06, + "loss": 2.0475, + "step": 17816 + }, + { + "epoch": 3.3396438612933457, + "grad_norm": 52805.671875, + "learning_rate": 2.913786290189374e-06, + "loss": 2.0256, + "step": 17817 + }, + { + "epoch": 3.3398313027179007, + "grad_norm": 57576.0546875, + "learning_rate": 2.9111435823203803e-06, + "loss": 2.0187, + "step": 17818 + }, + { + "epoch": 3.3400187441424554, + "grad_norm": 53530.671875, + "learning_rate": 2.908502037485078e-06, + "loss": 2.1014, + "step": 17819 + }, + { + "epoch": 3.3402061855670104, + "grad_norm": 55331.9609375, + "learning_rate": 2.9058616557487317e-06, + "loss": 2.0325, + "step": 17820 + }, + { + "epoch": 3.340393626991565, + "grad_norm": 57131.5625, + "learning_rate": 2.9032224371765394e-06, + "loss": 2.0532, + "step": 17821 + }, + { + "epoch": 3.34058106841612, + "grad_norm": 56866.50390625, + "learning_rate": 2.9005843818336873e-06, + "loss": 2.0861, + "step": 17822 + }, + { + "epoch": 3.3407685098406747, + "grad_norm": 55428.25390625, + "learning_rate": 2.8979474897853355e-06, + "loss": 2.0855, + "step": 17823 + }, + { + "epoch": 3.3409559512652294, + "grad_norm": 57845.3984375, + "learning_rate": 2.8953117610966196e-06, + "loss": 2.137, + "step": 17824 + }, + { + "epoch": 3.3411433926897844, + "grad_norm": 56977.55859375, + "learning_rate": 2.8926771958326328e-06, + "loss": 2.0796, + "step": 17825 + }, + { + "epoch": 3.3413308341143395, + "grad_norm": 52650.41015625, + "learning_rate": 2.8900437940584335e-06, + "loss": 2.0775, + "step": 17826 + }, + { + "epoch": 3.341518275538894, + "grad_norm": 55720.33203125, + "learning_rate": 2.887411555839081e-06, + "loss": 2.0823, + "step": 17827 + }, + { + "epoch": 3.3417057169634488, + "grad_norm": 59992.3515625, + "learning_rate": 2.8847804812395796e-06, + "loss": 2.129, + "step": 17828 + }, + { + "epoch": 3.341893158388004, + "grad_norm": 56770.0234375, + "learning_rate": 2.8821505703249097e-06, + "loss": 2.0754, + "step": 17829 + }, + { + "epoch": 3.3420805998125585, + "grad_norm": 58779.29296875, + "learning_rate": 2.8795218231600306e-06, + "loss": 2.0545, + "step": 17830 + }, + { + "epoch": 3.3422680412371135, + "grad_norm": 55876.08984375, + "learning_rate": 2.8768942398098854e-06, + "loss": 2.0439, + "step": 17831 + }, + { + "epoch": 3.342455482661668, + "grad_norm": 61381.87109375, + "learning_rate": 2.8742678203393437e-06, + "loss": 2.0113, + "step": 17832 + }, + { + "epoch": 3.342642924086223, + "grad_norm": 60147.484375, + "learning_rate": 2.8716425648132816e-06, + "loss": 2.1475, + "step": 17833 + }, + { + "epoch": 3.342830365510778, + "grad_norm": 53459.96875, + "learning_rate": 2.8690184732965585e-06, + "loss": 2.0668, + "step": 17834 + }, + { + "epoch": 3.3430178069353325, + "grad_norm": 55276.1015625, + "learning_rate": 2.8663955458539672e-06, + "loss": 2.091, + "step": 17835 + }, + { + "epoch": 3.3432052483598875, + "grad_norm": 61926.80859375, + "learning_rate": 2.863773782550294e-06, + "loss": 2.0557, + "step": 17836 + }, + { + "epoch": 3.3433926897844426, + "grad_norm": 52238.06640625, + "learning_rate": 2.8611531834502936e-06, + "loss": 2.1123, + "step": 17837 + }, + { + "epoch": 3.343580131208997, + "grad_norm": 53654.94140625, + "learning_rate": 2.858533748618708e-06, + "loss": 2.1143, + "step": 17838 + }, + { + "epoch": 3.343767572633552, + "grad_norm": 55255.7734375, + "learning_rate": 2.8559154781202025e-06, + "loss": 2.0421, + "step": 17839 + }, + { + "epoch": 3.343955014058107, + "grad_norm": 53695.1484375, + "learning_rate": 2.853298372019464e-06, + "loss": 2.0689, + "step": 17840 + }, + { + "epoch": 3.3441424554826615, + "grad_norm": 57030.17578125, + "learning_rate": 2.8506824303811353e-06, + "loss": 2.0416, + "step": 17841 + }, + { + "epoch": 3.3443298969072166, + "grad_norm": 51211.15234375, + "learning_rate": 2.8480676532698204e-06, + "loss": 2.0136, + "step": 17842 + }, + { + "epoch": 3.3445173383317712, + "grad_norm": 58522.82421875, + "learning_rate": 2.845454040750095e-06, + "loss": 2.0521, + "step": 17843 + }, + { + "epoch": 3.3447047797563263, + "grad_norm": 53018.0, + "learning_rate": 2.842841592886525e-06, + "loss": 2.0583, + "step": 17844 + }, + { + "epoch": 3.344892221180881, + "grad_norm": 56233.5078125, + "learning_rate": 2.8402303097436188e-06, + "loss": 1.9822, + "step": 17845 + }, + { + "epoch": 3.345079662605436, + "grad_norm": 52493.0546875, + "learning_rate": 2.837620191385887e-06, + "loss": 2.0792, + "step": 17846 + }, + { + "epoch": 3.3452671040299906, + "grad_norm": 61925.0078125, + "learning_rate": 2.8350112378777827e-06, + "loss": 2.0792, + "step": 17847 + }, + { + "epoch": 3.3454545454545457, + "grad_norm": 52301.25, + "learning_rate": 2.83240344928376e-06, + "loss": 2.0085, + "step": 17848 + }, + { + "epoch": 3.3456419868791003, + "grad_norm": 60029.53125, + "learning_rate": 2.829796825668207e-06, + "loss": 2.0228, + "step": 17849 + }, + { + "epoch": 3.345829428303655, + "grad_norm": 56614.953125, + "learning_rate": 2.8271913670955275e-06, + "loss": 2.0508, + "step": 17850 + }, + { + "epoch": 3.34601686972821, + "grad_norm": 56770.1171875, + "learning_rate": 2.8245870736300583e-06, + "loss": 2.0727, + "step": 17851 + }, + { + "epoch": 3.3462043111527646, + "grad_norm": 55816.31640625, + "learning_rate": 2.8219839453361153e-06, + "loss": 2.0761, + "step": 17852 + }, + { + "epoch": 3.3463917525773197, + "grad_norm": 58606.3828125, + "learning_rate": 2.8193819822780133e-06, + "loss": 2.0554, + "step": 17853 + }, + { + "epoch": 3.3465791940018743, + "grad_norm": 55976.0625, + "learning_rate": 2.8167811845199954e-06, + "loss": 2.0622, + "step": 17854 + }, + { + "epoch": 3.3467666354264294, + "grad_norm": 56024.828125, + "learning_rate": 2.814181552126316e-06, + "loss": 2.0573, + "step": 17855 + }, + { + "epoch": 3.346954076850984, + "grad_norm": 52029.41015625, + "learning_rate": 2.8115830851611678e-06, + "loss": 2.0895, + "step": 17856 + }, + { + "epoch": 3.347141518275539, + "grad_norm": 54733.87890625, + "learning_rate": 2.8089857836887444e-06, + "loss": 2.0266, + "step": 17857 + }, + { + "epoch": 3.3473289597000937, + "grad_norm": 57760.54296875, + "learning_rate": 2.806389647773189e-06, + "loss": 2.0958, + "step": 17858 + }, + { + "epoch": 3.3475164011246488, + "grad_norm": 53629.09765625, + "learning_rate": 2.803794677478616e-06, + "loss": 2.0728, + "step": 17859 + }, + { + "epoch": 3.3477038425492034, + "grad_norm": 54094.84375, + "learning_rate": 2.8012008728691195e-06, + "loss": 2.0993, + "step": 17860 + }, + { + "epoch": 3.347891283973758, + "grad_norm": 59769.546875, + "learning_rate": 2.7986082340087873e-06, + "loss": 2.0832, + "step": 17861 + }, + { + "epoch": 3.348078725398313, + "grad_norm": 56340.90234375, + "learning_rate": 2.7960167609616117e-06, + "loss": 2.0597, + "step": 17862 + }, + { + "epoch": 3.3482661668228677, + "grad_norm": 57556.3359375, + "learning_rate": 2.7934264537916255e-06, + "loss": 2.1425, + "step": 17863 + }, + { + "epoch": 3.3484536082474228, + "grad_norm": 59721.06640625, + "learning_rate": 2.7908373125628105e-06, + "loss": 2.1607, + "step": 17864 + }, + { + "epoch": 3.3486410496719774, + "grad_norm": 54470.21484375, + "learning_rate": 2.788249337339105e-06, + "loss": 2.0764, + "step": 17865 + }, + { + "epoch": 3.3488284910965325, + "grad_norm": 53884.46484375, + "learning_rate": 2.785662528184424e-06, + "loss": 2.0728, + "step": 17866 + }, + { + "epoch": 3.349015932521087, + "grad_norm": 56731.58203125, + "learning_rate": 2.783076885162661e-06, + "loss": 2.0822, + "step": 17867 + }, + { + "epoch": 3.349203373945642, + "grad_norm": 52909.26953125, + "learning_rate": 2.7804924083376984e-06, + "loss": 1.99, + "step": 17868 + }, + { + "epoch": 3.349390815370197, + "grad_norm": 51126.0546875, + "learning_rate": 2.7779090977733347e-06, + "loss": 2.0266, + "step": 17869 + }, + { + "epoch": 3.349578256794752, + "grad_norm": 61432.66015625, + "learning_rate": 2.775326953533397e-06, + "loss": 2.0836, + "step": 17870 + }, + { + "epoch": 3.3497656982193065, + "grad_norm": 58689.24609375, + "learning_rate": 2.7727459756816622e-06, + "loss": 2.1958, + "step": 17871 + }, + { + "epoch": 3.349953139643861, + "grad_norm": 57278.20703125, + "learning_rate": 2.770166164281868e-06, + "loss": 2.0581, + "step": 17872 + }, + { + "epoch": 3.350140581068416, + "grad_norm": 53511.73046875, + "learning_rate": 2.767587519397724e-06, + "loss": 2.0429, + "step": 17873 + }, + { + "epoch": 3.350328022492971, + "grad_norm": 52113.13671875, + "learning_rate": 2.7650100410929467e-06, + "loss": 2.0566, + "step": 17874 + }, + { + "epoch": 3.350515463917526, + "grad_norm": 54101.125, + "learning_rate": 2.7624337294311684e-06, + "loss": 1.9945, + "step": 17875 + }, + { + "epoch": 3.3507029053420805, + "grad_norm": 54717.44921875, + "learning_rate": 2.759858584476038e-06, + "loss": 2.0679, + "step": 17876 + }, + { + "epoch": 3.3508903467666356, + "grad_norm": 51374.5234375, + "learning_rate": 2.7572846062911495e-06, + "loss": 2.1269, + "step": 17877 + }, + { + "epoch": 3.35107778819119, + "grad_norm": 55090.9609375, + "learning_rate": 2.754711794940085e-06, + "loss": 2.1189, + "step": 17878 + }, + { + "epoch": 3.3512652296157452, + "grad_norm": 52849.11328125, + "learning_rate": 2.7521401504863765e-06, + "loss": 2.0664, + "step": 17879 + }, + { + "epoch": 3.3514526710403, + "grad_norm": 56494.97265625, + "learning_rate": 2.7495696729935574e-06, + "loss": 2.0865, + "step": 17880 + }, + { + "epoch": 3.351640112464855, + "grad_norm": 55696.7734375, + "learning_rate": 2.7470003625251096e-06, + "loss": 2.1283, + "step": 17881 + }, + { + "epoch": 3.3518275538894096, + "grad_norm": 51828.3203125, + "learning_rate": 2.744432219144477e-06, + "loss": 2.0425, + "step": 17882 + }, + { + "epoch": 3.352014995313964, + "grad_norm": 59041.88671875, + "learning_rate": 2.7418652429151093e-06, + "loss": 2.0013, + "step": 17883 + }, + { + "epoch": 3.3522024367385193, + "grad_norm": 53556.64453125, + "learning_rate": 2.7392994339003995e-06, + "loss": 2.0916, + "step": 17884 + }, + { + "epoch": 3.352389878163074, + "grad_norm": 55114.3359375, + "learning_rate": 2.7367347921637086e-06, + "loss": 2.1107, + "step": 17885 + }, + { + "epoch": 3.352577319587629, + "grad_norm": 51639.5, + "learning_rate": 2.734171317768397e-06, + "loss": 2.1118, + "step": 17886 + }, + { + "epoch": 3.3527647610121836, + "grad_norm": 57652.52734375, + "learning_rate": 2.7316090107777746e-06, + "loss": 2.0948, + "step": 17887 + }, + { + "epoch": 3.3529522024367386, + "grad_norm": 58205.375, + "learning_rate": 2.729047871255125e-06, + "loss": 2.0498, + "step": 17888 + }, + { + "epoch": 3.3531396438612933, + "grad_norm": 60278.328125, + "learning_rate": 2.726487899263702e-06, + "loss": 2.0171, + "step": 17889 + }, + { + "epoch": 3.3533270852858483, + "grad_norm": 58685.46484375, + "learning_rate": 2.723929094866734e-06, + "loss": 2.0382, + "step": 17890 + }, + { + "epoch": 3.353514526710403, + "grad_norm": 55096.57421875, + "learning_rate": 2.721371458127442e-06, + "loss": 1.9982, + "step": 17891 + }, + { + "epoch": 3.353701968134958, + "grad_norm": 58837.484375, + "learning_rate": 2.7188149891089587e-06, + "loss": 2.1138, + "step": 17892 + }, + { + "epoch": 3.3538894095595126, + "grad_norm": 57179.67578125, + "learning_rate": 2.716259687874445e-06, + "loss": 2.1093, + "step": 17893 + }, + { + "epoch": 3.3540768509840673, + "grad_norm": 54470.1484375, + "learning_rate": 2.7137055544870172e-06, + "loss": 2.0804, + "step": 17894 + }, + { + "epoch": 3.3542642924086223, + "grad_norm": 59156.75, + "learning_rate": 2.7111525890097577e-06, + "loss": 2.0046, + "step": 17895 + }, + { + "epoch": 3.354451733833177, + "grad_norm": 58105.58984375, + "learning_rate": 2.708600791505711e-06, + "loss": 2.1004, + "step": 17896 + }, + { + "epoch": 3.354639175257732, + "grad_norm": 56161.015625, + "learning_rate": 2.7060501620379154e-06, + "loss": 2.0812, + "step": 17897 + }, + { + "epoch": 3.3548266166822867, + "grad_norm": 57158.62109375, + "learning_rate": 2.703500700669365e-06, + "loss": 2.0916, + "step": 17898 + }, + { + "epoch": 3.3550140581068417, + "grad_norm": 52740.7265625, + "learning_rate": 2.70095240746302e-06, + "loss": 2.0665, + "step": 17899 + }, + { + "epoch": 3.3552014995313963, + "grad_norm": 53619.12109375, + "learning_rate": 2.698405282481825e-06, + "loss": 2.0392, + "step": 17900 + }, + { + "epoch": 3.3553889409559514, + "grad_norm": 56464.37109375, + "learning_rate": 2.695859325788702e-06, + "loss": 2.0666, + "step": 17901 + }, + { + "epoch": 3.355576382380506, + "grad_norm": 61134.48046875, + "learning_rate": 2.693314537446523e-06, + "loss": 2.0524, + "step": 17902 + }, + { + "epoch": 3.355763823805061, + "grad_norm": 55837.78125, + "learning_rate": 2.6907709175181317e-06, + "loss": 2.1001, + "step": 17903 + }, + { + "epoch": 3.3559512652296157, + "grad_norm": 54188.0234375, + "learning_rate": 2.6882284660663672e-06, + "loss": 2.0159, + "step": 17904 + }, + { + "epoch": 3.3561387066541704, + "grad_norm": 58860.171875, + "learning_rate": 2.685687183154012e-06, + "loss": 2.0613, + "step": 17905 + }, + { + "epoch": 3.3563261480787254, + "grad_norm": 59600.2734375, + "learning_rate": 2.6831470688438496e-06, + "loss": 2.0699, + "step": 17906 + }, + { + "epoch": 3.35651358950328, + "grad_norm": 55884.4609375, + "learning_rate": 2.6806081231985968e-06, + "loss": 2.0534, + "step": 17907 + }, + { + "epoch": 3.356701030927835, + "grad_norm": 56050.1484375, + "learning_rate": 2.6780703462809808e-06, + "loss": 2.1785, + "step": 17908 + }, + { + "epoch": 3.3568884723523897, + "grad_norm": 52951.12109375, + "learning_rate": 2.6755337381536796e-06, + "loss": 2.0837, + "step": 17909 + }, + { + "epoch": 3.357075913776945, + "grad_norm": 58368.55078125, + "learning_rate": 2.67299829887932e-06, + "loss": 2.2366, + "step": 17910 + }, + { + "epoch": 3.3572633552014994, + "grad_norm": 54664.609375, + "learning_rate": 2.6704640285205585e-06, + "loss": 2.1217, + "step": 17911 + }, + { + "epoch": 3.3574507966260545, + "grad_norm": 56471.1015625, + "learning_rate": 2.667930927139961e-06, + "loss": 2.0161, + "step": 17912 + }, + { + "epoch": 3.357638238050609, + "grad_norm": 57781.859375, + "learning_rate": 2.6653989948001114e-06, + "loss": 2.1139, + "step": 17913 + }, + { + "epoch": 3.357825679475164, + "grad_norm": 58366.21875, + "learning_rate": 2.662868231563531e-06, + "loss": 2.0576, + "step": 17914 + }, + { + "epoch": 3.358013120899719, + "grad_norm": 53852.6171875, + "learning_rate": 2.660338637492732e-06, + "loss": 2.0801, + "step": 17915 + }, + { + "epoch": 3.3582005623242734, + "grad_norm": 53891.16015625, + "learning_rate": 2.657810212650186e-06, + "loss": 2.1197, + "step": 17916 + }, + { + "epoch": 3.3583880037488285, + "grad_norm": 52157.49609375, + "learning_rate": 2.6552829570983597e-06, + "loss": 2.0659, + "step": 17917 + }, + { + "epoch": 3.358575445173383, + "grad_norm": 58281.80078125, + "learning_rate": 2.6527568708996585e-06, + "loss": 2.117, + "step": 17918 + }, + { + "epoch": 3.358762886597938, + "grad_norm": 50231.21484375, + "learning_rate": 2.6502319541164667e-06, + "loss": 2.0408, + "step": 17919 + }, + { + "epoch": 3.358950328022493, + "grad_norm": 58923.83984375, + "learning_rate": 2.647708206811167e-06, + "loss": 2.0602, + "step": 17920 + }, + { + "epoch": 3.359137769447048, + "grad_norm": 62113.01953125, + "learning_rate": 2.6451856290460764e-06, + "loss": 2.0888, + "step": 17921 + }, + { + "epoch": 3.3593252108716025, + "grad_norm": 55109.00390625, + "learning_rate": 2.6426642208835006e-06, + "loss": 2.0723, + "step": 17922 + }, + { + "epoch": 3.3595126522961576, + "grad_norm": 50275.24609375, + "learning_rate": 2.6401439823857176e-06, + "loss": 2.092, + "step": 17923 + }, + { + "epoch": 3.359700093720712, + "grad_norm": 54881.2578125, + "learning_rate": 2.6376249136149942e-06, + "loss": 2.0548, + "step": 17924 + }, + { + "epoch": 3.3598875351452673, + "grad_norm": 58301.64453125, + "learning_rate": 2.635107014633509e-06, + "loss": 2.099, + "step": 17925 + }, + { + "epoch": 3.360074976569822, + "grad_norm": 57058.19921875, + "learning_rate": 2.6325902855034724e-06, + "loss": 2.0282, + "step": 17926 + }, + { + "epoch": 3.3602624179943765, + "grad_norm": 57076.765625, + "learning_rate": 2.630074726287052e-06, + "loss": 2.0506, + "step": 17927 + }, + { + "epoch": 3.3604498594189316, + "grad_norm": 56563.359375, + "learning_rate": 2.6275603370463696e-06, + "loss": 2.0939, + "step": 17928 + }, + { + "epoch": 3.360637300843486, + "grad_norm": 58806.41796875, + "learning_rate": 2.625047117843521e-06, + "loss": 2.073, + "step": 17929 + }, + { + "epoch": 3.3608247422680413, + "grad_norm": 58794.140625, + "learning_rate": 2.622535068740584e-06, + "loss": 2.1018, + "step": 17930 + }, + { + "epoch": 3.361012183692596, + "grad_norm": 58501.63671875, + "learning_rate": 2.620024189799614e-06, + "loss": 2.0363, + "step": 17931 + }, + { + "epoch": 3.361199625117151, + "grad_norm": 55484.2265625, + "learning_rate": 2.6175144810826114e-06, + "loss": 2.0175, + "step": 17932 + }, + { + "epoch": 3.3613870665417056, + "grad_norm": 52872.3828125, + "learning_rate": 2.6150059426515662e-06, + "loss": 2.029, + "step": 17933 + }, + { + "epoch": 3.3615745079662607, + "grad_norm": 57125.41015625, + "learning_rate": 2.6124985745684507e-06, + "loss": 2.1001, + "step": 17934 + }, + { + "epoch": 3.3617619493908153, + "grad_norm": 60800.59765625, + "learning_rate": 2.609992376895176e-06, + "loss": 2.0723, + "step": 17935 + }, + { + "epoch": 3.3619493908153704, + "grad_norm": 56062.4921875, + "learning_rate": 2.607487349693638e-06, + "loss": 2.0619, + "step": 17936 + }, + { + "epoch": 3.362136832239925, + "grad_norm": 55258.546875, + "learning_rate": 2.604983493025731e-06, + "loss": 2.0903, + "step": 17937 + }, + { + "epoch": 3.3623242736644796, + "grad_norm": 55186.84375, + "learning_rate": 2.6024808069532726e-06, + "loss": 2.111, + "step": 17938 + }, + { + "epoch": 3.3625117150890347, + "grad_norm": 54776.375, + "learning_rate": 2.599979291538096e-06, + "loss": 2.0245, + "step": 17939 + }, + { + "epoch": 3.3626991565135897, + "grad_norm": 58964.56640625, + "learning_rate": 2.5974789468419636e-06, + "loss": 2.0721, + "step": 17940 + }, + { + "epoch": 3.3628865979381444, + "grad_norm": 52404.8125, + "learning_rate": 2.5949797729266532e-06, + "loss": 2.0782, + "step": 17941 + }, + { + "epoch": 3.363074039362699, + "grad_norm": 56587.4296875, + "learning_rate": 2.592481769853872e-06, + "loss": 2.0545, + "step": 17942 + }, + { + "epoch": 3.363261480787254, + "grad_norm": 55923.02734375, + "learning_rate": 2.5899849376853357e-06, + "loss": 2.0479, + "step": 17943 + }, + { + "epoch": 3.3634489222118087, + "grad_norm": 55668.57421875, + "learning_rate": 2.587489276482702e-06, + "loss": 2.0348, + "step": 17944 + }, + { + "epoch": 3.3636363636363638, + "grad_norm": 56457.02734375, + "learning_rate": 2.5849947863076095e-06, + "loss": 2.073, + "step": 17945 + }, + { + "epoch": 3.3638238050609184, + "grad_norm": 57231.3984375, + "learning_rate": 2.5825014672216653e-06, + "loss": 2.0756, + "step": 17946 + }, + { + "epoch": 3.3640112464854734, + "grad_norm": 58621.94921875, + "learning_rate": 2.580009319286464e-06, + "loss": 2.0983, + "step": 17947 + }, + { + "epoch": 3.364198687910028, + "grad_norm": 60546.23828125, + "learning_rate": 2.577518342563556e-06, + "loss": 2.0693, + "step": 17948 + }, + { + "epoch": 3.3643861293345827, + "grad_norm": 52246.69140625, + "learning_rate": 2.5750285371144545e-06, + "loss": 2.041, + "step": 17949 + }, + { + "epoch": 3.3645735707591378, + "grad_norm": 55503.91796875, + "learning_rate": 2.572539903000665e-06, + "loss": 2.0813, + "step": 17950 + }, + { + "epoch": 3.364761012183693, + "grad_norm": 59493.55078125, + "learning_rate": 2.570052440283649e-06, + "loss": 2.0325, + "step": 17951 + }, + { + "epoch": 3.3649484536082475, + "grad_norm": 54034.3671875, + "learning_rate": 2.567566149024836e-06, + "loss": 2.0596, + "step": 17952 + }, + { + "epoch": 3.365135895032802, + "grad_norm": 58374.5859375, + "learning_rate": 2.565081029285643e-06, + "loss": 2.0931, + "step": 17953 + }, + { + "epoch": 3.365323336457357, + "grad_norm": 55499.734375, + "learning_rate": 2.5625970811274604e-06, + "loss": 2.0432, + "step": 17954 + }, + { + "epoch": 3.3655107778819118, + "grad_norm": 56719.29296875, + "learning_rate": 2.560114304611616e-06, + "loss": 2.1197, + "step": 17955 + }, + { + "epoch": 3.365698219306467, + "grad_norm": 53944.8359375, + "learning_rate": 2.5576326997994394e-06, + "loss": 2.0601, + "step": 17956 + }, + { + "epoch": 3.3658856607310215, + "grad_norm": 60677.69140625, + "learning_rate": 2.555152266752231e-06, + "loss": 2.1123, + "step": 17957 + }, + { + "epoch": 3.3660731021555765, + "grad_norm": 52805.34765625, + "learning_rate": 2.552673005531253e-06, + "loss": 2.1413, + "step": 17958 + }, + { + "epoch": 3.366260543580131, + "grad_norm": 56322.234375, + "learning_rate": 2.5501949161977233e-06, + "loss": 2.1494, + "step": 17959 + }, + { + "epoch": 3.3664479850046862, + "grad_norm": 56959.7734375, + "learning_rate": 2.5477179988128654e-06, + "loss": 2.146, + "step": 17960 + }, + { + "epoch": 3.366635426429241, + "grad_norm": 56279.68359375, + "learning_rate": 2.5452422534378517e-06, + "loss": 2.1906, + "step": 17961 + }, + { + "epoch": 3.366822867853796, + "grad_norm": 55819.375, + "learning_rate": 2.5427676801338283e-06, + "loss": 2.083, + "step": 17962 + }, + { + "epoch": 3.3670103092783505, + "grad_norm": 61537.9375, + "learning_rate": 2.540294278961908e-06, + "loss": 2.0881, + "step": 17963 + }, + { + "epoch": 3.367197750702905, + "grad_norm": 54009.296875, + "learning_rate": 2.5378220499831964e-06, + "loss": 2.1186, + "step": 17964 + }, + { + "epoch": 3.3673851921274602, + "grad_norm": 49925.60546875, + "learning_rate": 2.5353509932587447e-06, + "loss": 2.0557, + "step": 17965 + }, + { + "epoch": 3.367572633552015, + "grad_norm": 55019.4296875, + "learning_rate": 2.532881108849572e-06, + "loss": 2.0896, + "step": 17966 + }, + { + "epoch": 3.36776007497657, + "grad_norm": 54740.91015625, + "learning_rate": 2.530412396816706e-06, + "loss": 2.0765, + "step": 17967 + }, + { + "epoch": 3.3679475164011246, + "grad_norm": 53587.046875, + "learning_rate": 2.5279448572210984e-06, + "loss": 2.0406, + "step": 17968 + }, + { + "epoch": 3.3681349578256796, + "grad_norm": 50620.2890625, + "learning_rate": 2.525478490123717e-06, + "loss": 2.0654, + "step": 17969 + }, + { + "epoch": 3.3683223992502342, + "grad_norm": 54123.1796875, + "learning_rate": 2.5230132955854525e-06, + "loss": 2.0319, + "step": 17970 + }, + { + "epoch": 3.3685098406747893, + "grad_norm": 56926.4609375, + "learning_rate": 2.520549273667211e-06, + "loss": 2.0794, + "step": 17971 + }, + { + "epoch": 3.368697282099344, + "grad_norm": 52597.95703125, + "learning_rate": 2.518086424429844e-06, + "loss": 2.0737, + "step": 17972 + }, + { + "epoch": 3.368884723523899, + "grad_norm": 56407.01953125, + "learning_rate": 2.515624747934181e-06, + "loss": 2.0802, + "step": 17973 + }, + { + "epoch": 3.3690721649484536, + "grad_norm": 53619.359375, + "learning_rate": 2.5131642442410285e-06, + "loss": 2.0726, + "step": 17974 + }, + { + "epoch": 3.3692596063730083, + "grad_norm": 59636.44921875, + "learning_rate": 2.5107049134111382e-06, + "loss": 2.0402, + "step": 17975 + }, + { + "epoch": 3.3694470477975633, + "grad_norm": 58024.4609375, + "learning_rate": 2.5082467555052724e-06, + "loss": 2.0721, + "step": 17976 + }, + { + "epoch": 3.369634489222118, + "grad_norm": 58499.81640625, + "learning_rate": 2.5057897705841437e-06, + "loss": 2.0998, + "step": 17977 + }, + { + "epoch": 3.369821930646673, + "grad_norm": 51051.9765625, + "learning_rate": 2.5033339587084257e-06, + "loss": 2.0781, + "step": 17978 + }, + { + "epoch": 3.3700093720712276, + "grad_norm": 53202.4453125, + "learning_rate": 2.5008793199387705e-06, + "loss": 2.1067, + "step": 17979 + }, + { + "epoch": 3.3701968134957827, + "grad_norm": 50506.640625, + "learning_rate": 2.498425854335823e-06, + "loss": 2.0628, + "step": 17980 + }, + { + "epoch": 3.3703842549203373, + "grad_norm": 57769.54296875, + "learning_rate": 2.495973561960174e-06, + "loss": 2.0792, + "step": 17981 + }, + { + "epoch": 3.3705716963448924, + "grad_norm": 56808.16015625, + "learning_rate": 2.493522442872376e-06, + "loss": 2.0016, + "step": 17982 + }, + { + "epoch": 3.370759137769447, + "grad_norm": 56066.71875, + "learning_rate": 2.491072497132979e-06, + "loss": 2.0558, + "step": 17983 + }, + { + "epoch": 3.370946579194002, + "grad_norm": 58221.21484375, + "learning_rate": 2.4886237248025136e-06, + "loss": 2.1716, + "step": 17984 + }, + { + "epoch": 3.3711340206185567, + "grad_norm": 58748.9921875, + "learning_rate": 2.4861761259414307e-06, + "loss": 2.1012, + "step": 17985 + }, + { + "epoch": 3.3713214620431113, + "grad_norm": 56672.71875, + "learning_rate": 2.483729700610188e-06, + "loss": 2.0874, + "step": 17986 + }, + { + "epoch": 3.3715089034676664, + "grad_norm": 55112.5234375, + "learning_rate": 2.4812844488692254e-06, + "loss": 2.1281, + "step": 17987 + }, + { + "epoch": 3.371696344892221, + "grad_norm": 58856.91015625, + "learning_rate": 2.4788403707789285e-06, + "loss": 2.0943, + "step": 17988 + }, + { + "epoch": 3.371883786316776, + "grad_norm": 58602.171875, + "learning_rate": 2.4763974663996603e-06, + "loss": 2.0446, + "step": 17989 + }, + { + "epoch": 3.3720712277413307, + "grad_norm": 63469.81640625, + "learning_rate": 2.473955735791761e-06, + "loss": 2.0408, + "step": 17990 + }, + { + "epoch": 3.372258669165886, + "grad_norm": 61044.66015625, + "learning_rate": 2.4715151790155376e-06, + "loss": 2.1273, + "step": 17991 + }, + { + "epoch": 3.3724461105904404, + "grad_norm": 55870.0703125, + "learning_rate": 2.4690757961312593e-06, + "loss": 2.1029, + "step": 17992 + }, + { + "epoch": 3.3726335520149955, + "grad_norm": 55010.2421875, + "learning_rate": 2.4666375871991886e-06, + "loss": 2.0409, + "step": 17993 + }, + { + "epoch": 3.37282099343955, + "grad_norm": 51678.06640625, + "learning_rate": 2.464200552279544e-06, + "loss": 2.0744, + "step": 17994 + }, + { + "epoch": 3.373008434864105, + "grad_norm": 56142.671875, + "learning_rate": 2.4617646914325164e-06, + "loss": 2.0875, + "step": 17995 + }, + { + "epoch": 3.37319587628866, + "grad_norm": 57395.15625, + "learning_rate": 2.4593300047182576e-06, + "loss": 2.0633, + "step": 17996 + }, + { + "epoch": 3.3733833177132144, + "grad_norm": 54749.75, + "learning_rate": 2.4568964921969196e-06, + "loss": 1.9951, + "step": 17997 + }, + { + "epoch": 3.3735707591377695, + "grad_norm": 50884.421875, + "learning_rate": 2.4544641539285872e-06, + "loss": 2.0656, + "step": 17998 + }, + { + "epoch": 3.373758200562324, + "grad_norm": 57815.28515625, + "learning_rate": 2.4520329899733574e-06, + "loss": 2.1897, + "step": 17999 + }, + { + "epoch": 3.373945641986879, + "grad_norm": 53299.14453125, + "learning_rate": 2.4496030003912594e-06, + "loss": 2.0676, + "step": 18000 + }, + { + "epoch": 3.373945641986879, + "eval_loss": 2.2577755451202393, + "eval_runtime": 131.5858, + "eval_samples_per_second": 38.37, + "eval_steps_per_second": 1.923, + "step": 18000 + }, + { + "epoch": 3.374133083411434, + "grad_norm": 56610.58203125, + "learning_rate": 2.4471741852423237e-06, + "loss": 2.0706, + "step": 18001 + }, + { + "epoch": 3.374320524835989, + "grad_norm": 54635.4296875, + "learning_rate": 2.4447465445865237e-06, + "loss": 2.0641, + "step": 18002 + }, + { + "epoch": 3.3745079662605435, + "grad_norm": 53644.09375, + "learning_rate": 2.442320078483834e-06, + "loss": 2.1569, + "step": 18003 + }, + { + "epoch": 3.3746954076850986, + "grad_norm": 55079.88671875, + "learning_rate": 2.4398947869941847e-06, + "loss": 2.0119, + "step": 18004 + }, + { + "epoch": 3.374882849109653, + "grad_norm": 52856.625, + "learning_rate": 2.4374706701774607e-06, + "loss": 2.0568, + "step": 18005 + }, + { + "epoch": 3.3750702905342083, + "grad_norm": 54243.77734375, + "learning_rate": 2.435047728093548e-06, + "loss": 2.0304, + "step": 18006 + }, + { + "epoch": 3.375257731958763, + "grad_norm": 52665.2890625, + "learning_rate": 2.4326259608022926e-06, + "loss": 2.1053, + "step": 18007 + }, + { + "epoch": 3.3754451733833175, + "grad_norm": 54114.22265625, + "learning_rate": 2.4302053683634974e-06, + "loss": 2.0752, + "step": 18008 + }, + { + "epoch": 3.3756326148078726, + "grad_norm": 54823.2734375, + "learning_rate": 2.4277859508369528e-06, + "loss": 2.0221, + "step": 18009 + }, + { + "epoch": 3.375820056232427, + "grad_norm": 58492.9609375, + "learning_rate": 2.4253677082824222e-06, + "loss": 2.098, + "step": 18010 + }, + { + "epoch": 3.3760074976569823, + "grad_norm": 59173.62890625, + "learning_rate": 2.42295064075963e-06, + "loss": 2.2052, + "step": 18011 + }, + { + "epoch": 3.376194939081537, + "grad_norm": 57202.046875, + "learning_rate": 2.4205347483282683e-06, + "loss": 2.1186, + "step": 18012 + }, + { + "epoch": 3.376382380506092, + "grad_norm": 53193.08203125, + "learning_rate": 2.4181200310480154e-06, + "loss": 2.044, + "step": 18013 + }, + { + "epoch": 3.3765698219306466, + "grad_norm": 56814.3671875, + "learning_rate": 2.415706488978503e-06, + "loss": 1.9947, + "step": 18014 + }, + { + "epoch": 3.3767572633552017, + "grad_norm": 52972.14453125, + "learning_rate": 2.413294122179344e-06, + "loss": 2.0537, + "step": 18015 + }, + { + "epoch": 3.3769447047797563, + "grad_norm": 52177.5859375, + "learning_rate": 2.4108829307101234e-06, + "loss": 2.0773, + "step": 18016 + }, + { + "epoch": 3.3771321462043113, + "grad_norm": 57510.890625, + "learning_rate": 2.4084729146303942e-06, + "loss": 1.9791, + "step": 18017 + }, + { + "epoch": 3.377319587628866, + "grad_norm": 53174.73828125, + "learning_rate": 2.4060640739996866e-06, + "loss": 2.0729, + "step": 18018 + }, + { + "epoch": 3.3775070290534206, + "grad_norm": 57634.76171875, + "learning_rate": 2.403656408877486e-06, + "loss": 2.0903, + "step": 18019 + }, + { + "epoch": 3.3776944704779757, + "grad_norm": 61941.98828125, + "learning_rate": 2.401249919323262e-06, + "loss": 2.1179, + "step": 18020 + }, + { + "epoch": 3.3778819119025303, + "grad_norm": 59213.06640625, + "learning_rate": 2.398844605396455e-06, + "loss": 2.1467, + "step": 18021 + }, + { + "epoch": 3.3780693533270854, + "grad_norm": 54486.3125, + "learning_rate": 2.396440467156469e-06, + "loss": 2.0384, + "step": 18022 + }, + { + "epoch": 3.37825679475164, + "grad_norm": 53051.71875, + "learning_rate": 2.3940375046626775e-06, + "loss": 2.0275, + "step": 18023 + }, + { + "epoch": 3.378444236176195, + "grad_norm": 54725.3125, + "learning_rate": 2.3916357179744495e-06, + "loss": 2.0354, + "step": 18024 + }, + { + "epoch": 3.3786316776007497, + "grad_norm": 54138.74609375, + "learning_rate": 2.389235107151089e-06, + "loss": 2.1102, + "step": 18025 + }, + { + "epoch": 3.3788191190253047, + "grad_norm": 62872.81640625, + "learning_rate": 2.3868356722518915e-06, + "loss": 2.128, + "step": 18026 + }, + { + "epoch": 3.3790065604498594, + "grad_norm": 53611.28125, + "learning_rate": 2.3844374133361214e-06, + "loss": 2.174, + "step": 18027 + }, + { + "epoch": 3.3791940018744144, + "grad_norm": 59031.4453125, + "learning_rate": 2.382040330463009e-06, + "loss": 1.9607, + "step": 18028 + }, + { + "epoch": 3.379381443298969, + "grad_norm": 58172.703125, + "learning_rate": 2.3796444236917734e-06, + "loss": 2.0426, + "step": 18029 + }, + { + "epoch": 3.3795688847235237, + "grad_norm": 63553.0703125, + "learning_rate": 2.377249693081579e-06, + "loss": 2.0214, + "step": 18030 + }, + { + "epoch": 3.3797563261480787, + "grad_norm": 61452.015625, + "learning_rate": 2.374856138691561e-06, + "loss": 2.1618, + "step": 18031 + }, + { + "epoch": 3.3799437675726334, + "grad_norm": 58732.00390625, + "learning_rate": 2.372463760580862e-06, + "loss": 2.1553, + "step": 18032 + }, + { + "epoch": 3.3801312089971884, + "grad_norm": 54560.56640625, + "learning_rate": 2.3700725588085503e-06, + "loss": 2.0452, + "step": 18033 + }, + { + "epoch": 3.380318650421743, + "grad_norm": 60225.55078125, + "learning_rate": 2.367682533433696e-06, + "loss": 2.0672, + "step": 18034 + }, + { + "epoch": 3.380506091846298, + "grad_norm": 52529.421875, + "learning_rate": 2.365293684515324e-06, + "loss": 2.021, + "step": 18035 + }, + { + "epoch": 3.3806935332708528, + "grad_norm": 51583.64453125, + "learning_rate": 2.362906012112448e-06, + "loss": 2.0703, + "step": 18036 + }, + { + "epoch": 3.380880974695408, + "grad_norm": 61635.5, + "learning_rate": 2.3605195162840264e-06, + "loss": 2.0903, + "step": 18037 + }, + { + "epoch": 3.3810684161199624, + "grad_norm": 51694.50390625, + "learning_rate": 2.3581341970890013e-06, + "loss": 2.1091, + "step": 18038 + }, + { + "epoch": 3.3812558575445175, + "grad_norm": 54962.45703125, + "learning_rate": 2.355750054586292e-06, + "loss": 2.0842, + "step": 18039 + }, + { + "epoch": 3.381443298969072, + "grad_norm": 58339.49609375, + "learning_rate": 2.3533670888347957e-06, + "loss": 2.0323, + "step": 18040 + }, + { + "epoch": 3.3816307403936268, + "grad_norm": 57194.29296875, + "learning_rate": 2.3509852998933544e-06, + "loss": 2.0745, + "step": 18041 + }, + { + "epoch": 3.381818181818182, + "grad_norm": 55560.65625, + "learning_rate": 2.348604687820788e-06, + "loss": 2.1152, + "step": 18042 + }, + { + "epoch": 3.3820056232427365, + "grad_norm": 57868.1640625, + "learning_rate": 2.3462252526759156e-06, + "loss": 2.0485, + "step": 18043 + }, + { + "epoch": 3.3821930646672915, + "grad_norm": 55903.1171875, + "learning_rate": 2.3438469945174955e-06, + "loss": 2.0538, + "step": 18044 + }, + { + "epoch": 3.382380506091846, + "grad_norm": 54033.390625, + "learning_rate": 2.341469913404265e-06, + "loss": 2.0275, + "step": 18045 + }, + { + "epoch": 3.382567947516401, + "grad_norm": 55971.109375, + "learning_rate": 2.339094009394932e-06, + "loss": 2.1451, + "step": 18046 + }, + { + "epoch": 3.382755388940956, + "grad_norm": 58184.73046875, + "learning_rate": 2.336719282548194e-06, + "loss": 2.0783, + "step": 18047 + }, + { + "epoch": 3.382942830365511, + "grad_norm": 55003.37890625, + "learning_rate": 2.334345732922688e-06, + "loss": 2.1246, + "step": 18048 + }, + { + "epoch": 3.3831302717900655, + "grad_norm": 51926.4609375, + "learning_rate": 2.331973360577039e-06, + "loss": 2.051, + "step": 18049 + }, + { + "epoch": 3.3833177132146206, + "grad_norm": 58710.91015625, + "learning_rate": 2.3296021655698497e-06, + "loss": 2.0327, + "step": 18050 + }, + { + "epoch": 3.3835051546391752, + "grad_norm": 53846.0390625, + "learning_rate": 2.3272321479596847e-06, + "loss": 2.0916, + "step": 18051 + }, + { + "epoch": 3.38369259606373, + "grad_norm": 59511.54296875, + "learning_rate": 2.3248633078050696e-06, + "loss": 2.0625, + "step": 18052 + }, + { + "epoch": 3.383880037488285, + "grad_norm": 59563.59375, + "learning_rate": 2.3224956451645188e-06, + "loss": 2.0294, + "step": 18053 + }, + { + "epoch": 3.3840674789128395, + "grad_norm": 58536.7890625, + "learning_rate": 2.320129160096518e-06, + "loss": 2.0825, + "step": 18054 + }, + { + "epoch": 3.3842549203373946, + "grad_norm": 56036.96484375, + "learning_rate": 2.3177638526595102e-06, + "loss": 2.0693, + "step": 18055 + }, + { + "epoch": 3.3844423617619492, + "grad_norm": 53889.26171875, + "learning_rate": 2.315399722911904e-06, + "loss": 2.1116, + "step": 18056 + }, + { + "epoch": 3.3846298031865043, + "grad_norm": 61585.375, + "learning_rate": 2.3130367709121083e-06, + "loss": 2.1023, + "step": 18057 + }, + { + "epoch": 3.384817244611059, + "grad_norm": 52637.6953125, + "learning_rate": 2.3106749967184704e-06, + "loss": 2.0493, + "step": 18058 + }, + { + "epoch": 3.385004686035614, + "grad_norm": 55143.125, + "learning_rate": 2.308314400389339e-06, + "loss": 2.0368, + "step": 18059 + }, + { + "epoch": 3.3851921274601686, + "grad_norm": 55372.66796875, + "learning_rate": 2.3059549819830106e-06, + "loss": 2.0772, + "step": 18060 + }, + { + "epoch": 3.3853795688847237, + "grad_norm": 51261.7265625, + "learning_rate": 2.303596741557745e-06, + "loss": 2.0242, + "step": 18061 + }, + { + "epoch": 3.3855670103092783, + "grad_norm": 54884.74609375, + "learning_rate": 2.3012396791718127e-06, + "loss": 2.038, + "step": 18062 + }, + { + "epoch": 3.385754451733833, + "grad_norm": 56352.16015625, + "learning_rate": 2.298883794883411e-06, + "loss": 2.0458, + "step": 18063 + }, + { + "epoch": 3.385941893158388, + "grad_norm": 53725.4765625, + "learning_rate": 2.296529088750737e-06, + "loss": 2.0191, + "step": 18064 + }, + { + "epoch": 3.386129334582943, + "grad_norm": 54532.140625, + "learning_rate": 2.2941755608319404e-06, + "loss": 2.0118, + "step": 18065 + }, + { + "epoch": 3.3863167760074977, + "grad_norm": 57142.8984375, + "learning_rate": 2.2918232111851625e-06, + "loss": 2.1172, + "step": 18066 + }, + { + "epoch": 3.3865042174320523, + "grad_norm": 60568.73828125, + "learning_rate": 2.289472039868501e-06, + "loss": 2.082, + "step": 18067 + }, + { + "epoch": 3.3866916588566074, + "grad_norm": 55466.234375, + "learning_rate": 2.2871220469400103e-06, + "loss": 2.0924, + "step": 18068 + }, + { + "epoch": 3.386879100281162, + "grad_norm": 56017.66015625, + "learning_rate": 2.2847732324577485e-06, + "loss": 2.0546, + "step": 18069 + }, + { + "epoch": 3.387066541705717, + "grad_norm": 60788.1171875, + "learning_rate": 2.2824255964797316e-06, + "loss": 2.0165, + "step": 18070 + }, + { + "epoch": 3.3872539831302717, + "grad_norm": 53462.66015625, + "learning_rate": 2.2800791390639288e-06, + "loss": 2.0633, + "step": 18071 + }, + { + "epoch": 3.3874414245548268, + "grad_norm": 55096.90234375, + "learning_rate": 2.2777338602682996e-06, + "loss": 2.1163, + "step": 18072 + }, + { + "epoch": 3.3876288659793814, + "grad_norm": 62043.0390625, + "learning_rate": 2.2753897601507756e-06, + "loss": 2.01, + "step": 18073 + }, + { + "epoch": 3.387816307403936, + "grad_norm": 55432.97265625, + "learning_rate": 2.2730468387692494e-06, + "loss": 2.2108, + "step": 18074 + }, + { + "epoch": 3.388003748828491, + "grad_norm": 55106.40234375, + "learning_rate": 2.2707050961815856e-06, + "loss": 2.0845, + "step": 18075 + }, + { + "epoch": 3.388191190253046, + "grad_norm": 63626.5546875, + "learning_rate": 2.268364532445616e-06, + "loss": 1.9778, + "step": 18076 + }, + { + "epoch": 3.388378631677601, + "grad_norm": 56459.84375, + "learning_rate": 2.2660251476191775e-06, + "loss": 2.0145, + "step": 18077 + }, + { + "epoch": 3.3885660731021554, + "grad_norm": 56876.60546875, + "learning_rate": 2.2636869417600126e-06, + "loss": 2.0455, + "step": 18078 + }, + { + "epoch": 3.3887535145267105, + "grad_norm": 55078.35546875, + "learning_rate": 2.261349914925892e-06, + "loss": 1.9641, + "step": 18079 + }, + { + "epoch": 3.388940955951265, + "grad_norm": 53410.1015625, + "learning_rate": 2.2590140671745364e-06, + "loss": 2.0915, + "step": 18080 + }, + { + "epoch": 3.38912839737582, + "grad_norm": 59282.3359375, + "learning_rate": 2.2566793985636436e-06, + "loss": 2.2214, + "step": 18081 + }, + { + "epoch": 3.389315838800375, + "grad_norm": 54610.23046875, + "learning_rate": 2.2543459091508567e-06, + "loss": 2.0602, + "step": 18082 + }, + { + "epoch": 3.38950328022493, + "grad_norm": 54627.375, + "learning_rate": 2.2520135989938295e-06, + "loss": 2.0944, + "step": 18083 + }, + { + "epoch": 3.3896907216494845, + "grad_norm": 52132.03125, + "learning_rate": 2.249682468150155e-06, + "loss": 2.0704, + "step": 18084 + }, + { + "epoch": 3.3898781630740396, + "grad_norm": 54705.234375, + "learning_rate": 2.24735251667742e-06, + "loss": 2.0182, + "step": 18085 + }, + { + "epoch": 3.390065604498594, + "grad_norm": 53627.8984375, + "learning_rate": 2.2450237446331625e-06, + "loss": 2.0335, + "step": 18086 + }, + { + "epoch": 3.3902530459231492, + "grad_norm": 57416.578125, + "learning_rate": 2.242696152074908e-06, + "loss": 2.0855, + "step": 18087 + }, + { + "epoch": 3.390440487347704, + "grad_norm": 55192.921875, + "learning_rate": 2.2403697390601387e-06, + "loss": 2.0296, + "step": 18088 + }, + { + "epoch": 3.3906279287722585, + "grad_norm": 58350.328125, + "learning_rate": 2.238044505646314e-06, + "loss": 2.0597, + "step": 18089 + }, + { + "epoch": 3.3908153701968136, + "grad_norm": 57717.62890625, + "learning_rate": 2.235720451890866e-06, + "loss": 2.0853, + "step": 18090 + }, + { + "epoch": 3.391002811621368, + "grad_norm": 58003.09765625, + "learning_rate": 2.233397577851193e-06, + "loss": 2.0089, + "step": 18091 + }, + { + "epoch": 3.3911902530459233, + "grad_norm": 58574.01171875, + "learning_rate": 2.2310758835846713e-06, + "loss": 2.0683, + "step": 18092 + }, + { + "epoch": 3.391377694470478, + "grad_norm": 55533.9453125, + "learning_rate": 2.2287553691486385e-06, + "loss": 2.0506, + "step": 18093 + }, + { + "epoch": 3.391565135895033, + "grad_norm": 57622.65234375, + "learning_rate": 2.226436034600421e-06, + "loss": 2.0599, + "step": 18094 + }, + { + "epoch": 3.3917525773195876, + "grad_norm": 60077.640625, + "learning_rate": 2.224117879997284e-06, + "loss": 2.0571, + "step": 18095 + }, + { + "epoch": 3.3919400187441426, + "grad_norm": 56248.6875, + "learning_rate": 2.2218009053964985e-06, + "loss": 2.0844, + "step": 18096 + }, + { + "epoch": 3.3921274601686973, + "grad_norm": 54261.5, + "learning_rate": 2.2194851108552903e-06, + "loss": 2.0737, + "step": 18097 + }, + { + "epoch": 3.3923149015932523, + "grad_norm": 53475.390625, + "learning_rate": 2.2171704964308427e-06, + "loss": 2.093, + "step": 18098 + }, + { + "epoch": 3.392502343017807, + "grad_norm": 56140.16796875, + "learning_rate": 2.2148570621803313e-06, + "loss": 2.0818, + "step": 18099 + }, + { + "epoch": 3.3926897844423616, + "grad_norm": 56379.1328125, + "learning_rate": 2.2125448081609103e-06, + "loss": 2.0801, + "step": 18100 + }, + { + "epoch": 3.3928772258669166, + "grad_norm": 61322.05078125, + "learning_rate": 2.210233734429662e-06, + "loss": 2.0447, + "step": 18101 + }, + { + "epoch": 3.3930646672914713, + "grad_norm": 56087.8125, + "learning_rate": 2.2079238410436743e-06, + "loss": 2.0694, + "step": 18102 + }, + { + "epoch": 3.3932521087160263, + "grad_norm": 57855.2421875, + "learning_rate": 2.2056151280600177e-06, + "loss": 2.0579, + "step": 18103 + }, + { + "epoch": 3.393439550140581, + "grad_norm": 55697.42578125, + "learning_rate": 2.203307595535703e-06, + "loss": 2.1203, + "step": 18104 + }, + { + "epoch": 3.393626991565136, + "grad_norm": 50037.66015625, + "learning_rate": 2.201001243527712e-06, + "loss": 2.0911, + "step": 18105 + }, + { + "epoch": 3.3938144329896907, + "grad_norm": 58870.125, + "learning_rate": 2.1986960720930207e-06, + "loss": 2.0315, + "step": 18106 + }, + { + "epoch": 3.3940018744142457, + "grad_norm": 56473.2421875, + "learning_rate": 2.196392081288562e-06, + "loss": 1.9973, + "step": 18107 + }, + { + "epoch": 3.3941893158388003, + "grad_norm": 60324.7890625, + "learning_rate": 2.19408927117124e-06, + "loss": 2.0438, + "step": 18108 + }, + { + "epoch": 3.3943767572633554, + "grad_norm": 54192.0078125, + "learning_rate": 2.1917876417979265e-06, + "loss": 2.0906, + "step": 18109 + }, + { + "epoch": 3.39456419868791, + "grad_norm": 59676.3984375, + "learning_rate": 2.1894871932254814e-06, + "loss": 1.9805, + "step": 18110 + }, + { + "epoch": 3.3947516401124647, + "grad_norm": 62310.78515625, + "learning_rate": 2.187187925510714e-06, + "loss": 2.058, + "step": 18111 + }, + { + "epoch": 3.3949390815370197, + "grad_norm": 50874.046875, + "learning_rate": 2.184889838710413e-06, + "loss": 2.075, + "step": 18112 + }, + { + "epoch": 3.3951265229615744, + "grad_norm": 56308.15625, + "learning_rate": 2.1825929328813444e-06, + "loss": 2.0078, + "step": 18113 + }, + { + "epoch": 3.3953139643861294, + "grad_norm": 64591.1328125, + "learning_rate": 2.180297208080223e-06, + "loss": 2.1511, + "step": 18114 + }, + { + "epoch": 3.395501405810684, + "grad_norm": 56428.63671875, + "learning_rate": 2.178002664363776e-06, + "loss": 2.1329, + "step": 18115 + }, + { + "epoch": 3.395688847235239, + "grad_norm": 53643.55078125, + "learning_rate": 2.1757093017886478e-06, + "loss": 2.0888, + "step": 18116 + }, + { + "epoch": 3.3958762886597937, + "grad_norm": 53969.01171875, + "learning_rate": 2.1734171204115027e-06, + "loss": 2.0934, + "step": 18117 + }, + { + "epoch": 3.396063730084349, + "grad_norm": 57442.99609375, + "learning_rate": 2.1711261202889464e-06, + "loss": 1.9963, + "step": 18118 + }, + { + "epoch": 3.3962511715089034, + "grad_norm": 54815.72265625, + "learning_rate": 2.1688363014775616e-06, + "loss": 2.005, + "step": 18119 + }, + { + "epoch": 3.3964386129334585, + "grad_norm": 52855.328125, + "learning_rate": 2.1665476640339076e-06, + "loss": 2.0526, + "step": 18120 + }, + { + "epoch": 3.396626054358013, + "grad_norm": 52471.46875, + "learning_rate": 2.164260208014507e-06, + "loss": 2.0479, + "step": 18121 + }, + { + "epoch": 3.3968134957825677, + "grad_norm": 54625.53125, + "learning_rate": 2.1619739334758635e-06, + "loss": 2.0888, + "step": 18122 + }, + { + "epoch": 3.397000937207123, + "grad_norm": 58602.5078125, + "learning_rate": 2.159688840474439e-06, + "loss": 2.0263, + "step": 18123 + }, + { + "epoch": 3.3971883786316774, + "grad_norm": 56654.625, + "learning_rate": 2.15740492906667e-06, + "loss": 2.0156, + "step": 18124 + }, + { + "epoch": 3.3973758200562325, + "grad_norm": 50950.55078125, + "learning_rate": 2.155122199308973e-06, + "loss": 2.0554, + "step": 18125 + }, + { + "epoch": 3.397563261480787, + "grad_norm": 51962.63671875, + "learning_rate": 2.152840651257726e-06, + "loss": 2.1148, + "step": 18126 + }, + { + "epoch": 3.397750702905342, + "grad_norm": 52669.75390625, + "learning_rate": 2.150560284969283e-06, + "loss": 2.112, + "step": 18127 + }, + { + "epoch": 3.397938144329897, + "grad_norm": 52636.62890625, + "learning_rate": 2.1482811004999602e-06, + "loss": 2.066, + "step": 18128 + }, + { + "epoch": 3.398125585754452, + "grad_norm": 57227.4921875, + "learning_rate": 2.1460030979060576e-06, + "loss": 2.0391, + "step": 18129 + }, + { + "epoch": 3.3983130271790065, + "grad_norm": 53045.1328125, + "learning_rate": 2.1437262772438348e-06, + "loss": 2.0374, + "step": 18130 + }, + { + "epoch": 3.3985004686035616, + "grad_norm": 50398.37890625, + "learning_rate": 2.14145063856952e-06, + "loss": 2.089, + "step": 18131 + }, + { + "epoch": 3.398687910028116, + "grad_norm": 54720.671875, + "learning_rate": 2.139176181939323e-06, + "loss": 2.0439, + "step": 18132 + }, + { + "epoch": 3.398875351452671, + "grad_norm": 62591.79296875, + "learning_rate": 2.1369029074094272e-06, + "loss": 2.1212, + "step": 18133 + }, + { + "epoch": 3.399062792877226, + "grad_norm": 57373.203125, + "learning_rate": 2.134630815035976e-06, + "loss": 2.0676, + "step": 18134 + }, + { + "epoch": 3.3992502343017805, + "grad_norm": 58419.52734375, + "learning_rate": 2.1323599048750807e-06, + "loss": 2.0139, + "step": 18135 + }, + { + "epoch": 3.3994376757263356, + "grad_norm": 56319.546875, + "learning_rate": 2.130090176982841e-06, + "loss": 2.1429, + "step": 18136 + }, + { + "epoch": 3.39962511715089, + "grad_norm": 53282.4375, + "learning_rate": 2.1278216314153053e-06, + "loss": 2.1083, + "step": 18137 + }, + { + "epoch": 3.3998125585754453, + "grad_norm": 57540.265625, + "learning_rate": 2.125554268228508e-06, + "loss": 2.0607, + "step": 18138 + }, + { + "epoch": 3.4, + "grad_norm": 54788.24609375, + "learning_rate": 2.1232880874784476e-06, + "loss": 2.116, + "step": 18139 + }, + { + "epoch": 3.400187441424555, + "grad_norm": 53367.64453125, + "learning_rate": 2.1210230892211026e-06, + "loss": 2.1323, + "step": 18140 + }, + { + "epoch": 3.4003748828491096, + "grad_norm": 54237.23046875, + "learning_rate": 2.118759273512411e-06, + "loss": 2.0679, + "step": 18141 + }, + { + "epoch": 3.4005623242736647, + "grad_norm": 54379.81640625, + "learning_rate": 2.116496640408283e-06, + "loss": 2.0531, + "step": 18142 + }, + { + "epoch": 3.4007497656982193, + "grad_norm": 58764.984375, + "learning_rate": 2.114235189964614e-06, + "loss": 2.0712, + "step": 18143 + }, + { + "epoch": 3.400937207122774, + "grad_norm": 54213.43359375, + "learning_rate": 2.1119749222372477e-06, + "loss": 2.0477, + "step": 18144 + }, + { + "epoch": 3.401124648547329, + "grad_norm": 54198.765625, + "learning_rate": 2.1097158372820057e-06, + "loss": 2.1215, + "step": 18145 + }, + { + "epoch": 3.4013120899718836, + "grad_norm": 58147.078125, + "learning_rate": 2.107457935154694e-06, + "loss": 2.0715, + "step": 18146 + }, + { + "epoch": 3.4014995313964387, + "grad_norm": 54660.76953125, + "learning_rate": 2.1052012159110844e-06, + "loss": 2.0473, + "step": 18147 + }, + { + "epoch": 3.4016869728209933, + "grad_norm": 56207.04296875, + "learning_rate": 2.1029456796069046e-06, + "loss": 2.1165, + "step": 18148 + }, + { + "epoch": 3.4018744142455484, + "grad_norm": 60077.6953125, + "learning_rate": 2.1006913262978656e-06, + "loss": 2.1231, + "step": 18149 + }, + { + "epoch": 3.402061855670103, + "grad_norm": 53309.75, + "learning_rate": 2.0984381560396505e-06, + "loss": 2.0683, + "step": 18150 + }, + { + "epoch": 3.402249297094658, + "grad_norm": 55329.08203125, + "learning_rate": 2.0961861688879037e-06, + "loss": 2.056, + "step": 18151 + }, + { + "epoch": 3.4024367385192127, + "grad_norm": 54074.54296875, + "learning_rate": 2.0939353648982585e-06, + "loss": 2.0811, + "step": 18152 + }, + { + "epoch": 3.4026241799437678, + "grad_norm": 52950.1171875, + "learning_rate": 2.091685744126293e-06, + "loss": 2.0851, + "step": 18153 + }, + { + "epoch": 3.4028116213683224, + "grad_norm": 54690.4296875, + "learning_rate": 2.089437306627573e-06, + "loss": 2.0644, + "step": 18154 + }, + { + "epoch": 3.402999062792877, + "grad_norm": 56231.68359375, + "learning_rate": 2.0871900524576384e-06, + "loss": 1.9955, + "step": 18155 + }, + { + "epoch": 3.403186504217432, + "grad_norm": 54723.39453125, + "learning_rate": 2.0849439816719884e-06, + "loss": 2.0742, + "step": 18156 + }, + { + "epoch": 3.4033739456419867, + "grad_norm": 58050.62109375, + "learning_rate": 2.0826990943261016e-06, + "loss": 2.1604, + "step": 18157 + }, + { + "epoch": 3.4035613870665418, + "grad_norm": 57819.74609375, + "learning_rate": 2.0804553904754163e-06, + "loss": 2.0735, + "step": 18158 + }, + { + "epoch": 3.4037488284910964, + "grad_norm": 58973.14453125, + "learning_rate": 2.078212870175361e-06, + "loss": 2.0379, + "step": 18159 + }, + { + "epoch": 3.4039362699156515, + "grad_norm": 52793.33203125, + "learning_rate": 2.075971533481319e-06, + "loss": 2.1268, + "step": 18160 + }, + { + "epoch": 3.404123711340206, + "grad_norm": 54587.16015625, + "learning_rate": 2.073731380448635e-06, + "loss": 2.0736, + "step": 18161 + }, + { + "epoch": 3.404311152764761, + "grad_norm": 69746.640625, + "learning_rate": 2.071492411132647e-06, + "loss": 2.064, + "step": 18162 + }, + { + "epoch": 3.4044985941893158, + "grad_norm": 54716.23046875, + "learning_rate": 2.0692546255886734e-06, + "loss": 2.0595, + "step": 18163 + }, + { + "epoch": 3.404686035613871, + "grad_norm": 51935.9375, + "learning_rate": 2.0670180238719528e-06, + "loss": 2.1079, + "step": 18164 + }, + { + "epoch": 3.4048734770384255, + "grad_norm": 55402.5078125, + "learning_rate": 2.0647826060377407e-06, + "loss": 2.0547, + "step": 18165 + }, + { + "epoch": 3.40506091846298, + "grad_norm": 57052.265625, + "learning_rate": 2.062548372141254e-06, + "loss": 2.0958, + "step": 18166 + }, + { + "epoch": 3.405248359887535, + "grad_norm": 53316.8203125, + "learning_rate": 2.060315322237677e-06, + "loss": 2.076, + "step": 18167 + }, + { + "epoch": 3.40543580131209, + "grad_norm": 57306.29296875, + "learning_rate": 2.0580834563821427e-06, + "loss": 2.0493, + "step": 18168 + }, + { + "epoch": 3.405623242736645, + "grad_norm": 53758.1875, + "learning_rate": 2.055852774629796e-06, + "loss": 2.0478, + "step": 18169 + }, + { + "epoch": 3.4058106841611995, + "grad_norm": 53935.16796875, + "learning_rate": 2.0536232770357268e-06, + "loss": 2.0357, + "step": 18170 + }, + { + "epoch": 3.4059981255857545, + "grad_norm": 56249.01953125, + "learning_rate": 2.0513949636550013e-06, + "loss": 2.0998, + "step": 18171 + }, + { + "epoch": 3.406185567010309, + "grad_norm": 61418.47265625, + "learning_rate": 2.049167834542648e-06, + "loss": 2.0924, + "step": 18172 + }, + { + "epoch": 3.4063730084348642, + "grad_norm": 53814.8203125, + "learning_rate": 2.046941889753684e-06, + "loss": 2.0446, + "step": 18173 + }, + { + "epoch": 3.406560449859419, + "grad_norm": 56375.95703125, + "learning_rate": 2.0447171293430876e-06, + "loss": 2.0579, + "step": 18174 + }, + { + "epoch": 3.406747891283974, + "grad_norm": 57027.1796875, + "learning_rate": 2.042493553365793e-06, + "loss": 2.1033, + "step": 18175 + }, + { + "epoch": 3.4069353327085286, + "grad_norm": 56850.6484375, + "learning_rate": 2.040271161876739e-06, + "loss": 2.083, + "step": 18176 + }, + { + "epoch": 3.407122774133083, + "grad_norm": 54689.6953125, + "learning_rate": 2.0380499549307985e-06, + "loss": 2.0863, + "step": 18177 + }, + { + "epoch": 3.4073102155576382, + "grad_norm": 56372.65625, + "learning_rate": 2.0358299325828444e-06, + "loss": 2.1455, + "step": 18178 + }, + { + "epoch": 3.4074976569821933, + "grad_norm": 58667.37890625, + "learning_rate": 2.0336110948876995e-06, + "loss": 2.02, + "step": 18179 + }, + { + "epoch": 3.407685098406748, + "grad_norm": 52589.0078125, + "learning_rate": 2.0313934419001813e-06, + "loss": 2.0074, + "step": 18180 + }, + { + "epoch": 3.4078725398313026, + "grad_norm": 56112.77734375, + "learning_rate": 2.0291769736750397e-06, + "loss": 2.055, + "step": 18181 + }, + { + "epoch": 3.4080599812558576, + "grad_norm": 54492.69921875, + "learning_rate": 2.0269616902670428e-06, + "loss": 2.0483, + "step": 18182 + }, + { + "epoch": 3.4082474226804123, + "grad_norm": 55834.03515625, + "learning_rate": 2.0247475917308965e-06, + "loss": 2.0698, + "step": 18183 + }, + { + "epoch": 3.4084348641049673, + "grad_norm": 56811.8125, + "learning_rate": 2.022534678121274e-06, + "loss": 2.0566, + "step": 18184 + }, + { + "epoch": 3.408622305529522, + "grad_norm": 60020.59375, + "learning_rate": 2.020322949492848e-06, + "loss": 2.1239, + "step": 18185 + }, + { + "epoch": 3.408809746954077, + "grad_norm": 57498.1015625, + "learning_rate": 2.01811240590023e-06, + "loss": 2.1395, + "step": 18186 + }, + { + "epoch": 3.4089971883786316, + "grad_norm": 56283.4140625, + "learning_rate": 2.015903047398038e-06, + "loss": 2.0667, + "step": 18187 + }, + { + "epoch": 3.4091846298031863, + "grad_norm": 56096.23828125, + "learning_rate": 2.013694874040817e-06, + "loss": 2.1936, + "step": 18188 + }, + { + "epoch": 3.4093720712277413, + "grad_norm": 54983.98046875, + "learning_rate": 2.011487885883129e-06, + "loss": 2.0721, + "step": 18189 + }, + { + "epoch": 3.4095595126522964, + "grad_norm": 51867.5625, + "learning_rate": 2.009282082979469e-06, + "loss": 2.0202, + "step": 18190 + }, + { + "epoch": 3.409746954076851, + "grad_norm": 53380.125, + "learning_rate": 2.0070774653843104e-06, + "loss": 2.0884, + "step": 18191 + }, + { + "epoch": 3.4099343955014056, + "grad_norm": 54444.9453125, + "learning_rate": 2.004874033152121e-06, + "loss": 2.1096, + "step": 18192 + }, + { + "epoch": 3.4101218369259607, + "grad_norm": 56523.82421875, + "learning_rate": 2.0026717863373235e-06, + "loss": 2.0549, + "step": 18193 + }, + { + "epoch": 3.4103092783505153, + "grad_norm": 54788.12890625, + "learning_rate": 2.0004707249942967e-06, + "loss": 2.0713, + "step": 18194 + }, + { + "epoch": 3.4104967197750704, + "grad_norm": 56034.609375, + "learning_rate": 1.998270849177408e-06, + "loss": 2.0491, + "step": 18195 + }, + { + "epoch": 3.410684161199625, + "grad_norm": 59589.50390625, + "learning_rate": 1.996072158940998e-06, + "loss": 2.0545, + "step": 18196 + }, + { + "epoch": 3.41087160262418, + "grad_norm": 58601.5078125, + "learning_rate": 1.993874654339373e-06, + "loss": 2.0978, + "step": 18197 + }, + { + "epoch": 3.4110590440487347, + "grad_norm": 56801.05078125, + "learning_rate": 1.9916783354267943e-06, + "loss": 2.0401, + "step": 18198 + }, + { + "epoch": 3.41124648547329, + "grad_norm": 52918.4296875, + "learning_rate": 1.989483202257519e-06, + "loss": 2.1013, + "step": 18199 + }, + { + "epoch": 3.4114339268978444, + "grad_norm": 61247.50390625, + "learning_rate": 1.987289254885766e-06, + "loss": 2.0467, + "step": 18200 + }, + { + "epoch": 3.4116213683223995, + "grad_norm": 56678.01171875, + "learning_rate": 1.9850964933657126e-06, + "loss": 2.0285, + "step": 18201 + }, + { + "epoch": 3.411808809746954, + "grad_norm": 51326.3203125, + "learning_rate": 1.982904917751527e-06, + "loss": 2.0908, + "step": 18202 + }, + { + "epoch": 3.4119962511715087, + "grad_norm": 53505.4453125, + "learning_rate": 1.980714528097338e-06, + "loss": 2.0726, + "step": 18203 + }, + { + "epoch": 3.412183692596064, + "grad_norm": 59330.375, + "learning_rate": 1.978525324457242e-06, + "loss": 2.0505, + "step": 18204 + }, + { + "epoch": 3.4123711340206184, + "grad_norm": 57300.671875, + "learning_rate": 1.9763373068853007e-06, + "loss": 2.0557, + "step": 18205 + }, + { + "epoch": 3.4125585754451735, + "grad_norm": 51630.2421875, + "learning_rate": 1.9741504754355766e-06, + "loss": 2.0377, + "step": 18206 + }, + { + "epoch": 3.412746016869728, + "grad_norm": 54038.12890625, + "learning_rate": 1.97196483016206e-06, + "loss": 2.1287, + "step": 18207 + }, + { + "epoch": 3.412933458294283, + "grad_norm": 53046.73828125, + "learning_rate": 1.9697803711187458e-06, + "loss": 2.1268, + "step": 18208 + }, + { + "epoch": 3.413120899718838, + "grad_norm": 52099.03515625, + "learning_rate": 1.9675970983595806e-06, + "loss": 2.1076, + "step": 18209 + }, + { + "epoch": 3.413308341143393, + "grad_norm": 59500.49609375, + "learning_rate": 1.9654150119384983e-06, + "loss": 2.0622, + "step": 18210 + }, + { + "epoch": 3.4134957825679475, + "grad_norm": 50375.35546875, + "learning_rate": 1.9632341119093846e-06, + "loss": 2.0639, + "step": 18211 + }, + { + "epoch": 3.4136832239925026, + "grad_norm": 55524.76171875, + "learning_rate": 1.9610543983261065e-06, + "loss": 2.0552, + "step": 18212 + }, + { + "epoch": 3.413870665417057, + "grad_norm": 54123.46875, + "learning_rate": 1.9588758712425104e-06, + "loss": 2.047, + "step": 18213 + }, + { + "epoch": 3.414058106841612, + "grad_norm": 53504.69921875, + "learning_rate": 1.9566985307123808e-06, + "loss": 2.1255, + "step": 18214 + }, + { + "epoch": 3.414245548266167, + "grad_norm": 53235.37890625, + "learning_rate": 1.9545223767895194e-06, + "loss": 2.0081, + "step": 18215 + }, + { + "epoch": 3.4144329896907215, + "grad_norm": 53820.234375, + "learning_rate": 1.9523474095276607e-06, + "loss": 2.0379, + "step": 18216 + }, + { + "epoch": 3.4146204311152766, + "grad_norm": 57242.234375, + "learning_rate": 1.9501736289805173e-06, + "loss": 2.0929, + "step": 18217 + }, + { + "epoch": 3.414807872539831, + "grad_norm": 56114.58984375, + "learning_rate": 1.9480010352017906e-06, + "loss": 2.0938, + "step": 18218 + }, + { + "epoch": 3.4149953139643863, + "grad_norm": 61151.42578125, + "learning_rate": 1.9458296282451495e-06, + "loss": 2.0401, + "step": 18219 + }, + { + "epoch": 3.415182755388941, + "grad_norm": 53986.1015625, + "learning_rate": 1.9436594081642056e-06, + "loss": 2.0724, + "step": 18220 + }, + { + "epoch": 3.415370196813496, + "grad_norm": 57749.80859375, + "learning_rate": 1.941490375012567e-06, + "loss": 2.0396, + "step": 18221 + }, + { + "epoch": 3.4155576382380506, + "grad_norm": 59103.48046875, + "learning_rate": 1.939322528843807e-06, + "loss": 2.0653, + "step": 18222 + }, + { + "epoch": 3.4157450796626057, + "grad_norm": 49128.5859375, + "learning_rate": 1.9371558697114765e-06, + "loss": 2.0023, + "step": 18223 + }, + { + "epoch": 3.4159325210871603, + "grad_norm": 58727.2578125, + "learning_rate": 1.934990397669073e-06, + "loss": 2.0296, + "step": 18224 + }, + { + "epoch": 3.416119962511715, + "grad_norm": 54320.40625, + "learning_rate": 1.932826112770092e-06, + "loss": 2.0914, + "step": 18225 + }, + { + "epoch": 3.41630740393627, + "grad_norm": 52995.0859375, + "learning_rate": 1.930663015067985e-06, + "loss": 2.0139, + "step": 18226 + }, + { + "epoch": 3.4164948453608246, + "grad_norm": 57914.9453125, + "learning_rate": 1.9285011046161818e-06, + "loss": 2.0515, + "step": 18227 + }, + { + "epoch": 3.4166822867853797, + "grad_norm": 56057.2265625, + "learning_rate": 1.9263403814680725e-06, + "loss": 2.0495, + "step": 18228 + }, + { + "epoch": 3.4168697282099343, + "grad_norm": 54349.98828125, + "learning_rate": 1.9241808456770318e-06, + "loss": 2.0583, + "step": 18229 + }, + { + "epoch": 3.4170571696344894, + "grad_norm": 60095.34375, + "learning_rate": 1.922022497296394e-06, + "loss": 2.0868, + "step": 18230 + }, + { + "epoch": 3.417244611059044, + "grad_norm": 59017.09765625, + "learning_rate": 1.919865336379456e-06, + "loss": 2.0451, + "step": 18231 + }, + { + "epoch": 3.417432052483599, + "grad_norm": 60712.7421875, + "learning_rate": 1.917709362979514e-06, + "loss": 1.9972, + "step": 18232 + }, + { + "epoch": 3.4176194939081537, + "grad_norm": 54176.0390625, + "learning_rate": 1.915554577149814e-06, + "loss": 2.1039, + "step": 18233 + }, + { + "epoch": 3.4178069353327087, + "grad_norm": 53362.0546875, + "learning_rate": 1.913400978943569e-06, + "loss": 2.0462, + "step": 18234 + }, + { + "epoch": 3.4179943767572634, + "grad_norm": 56969.62109375, + "learning_rate": 1.911248568413976e-06, + "loss": 2.0641, + "step": 18235 + }, + { + "epoch": 3.418181818181818, + "grad_norm": 57596.27734375, + "learning_rate": 1.909097345614197e-06, + "loss": 2.1168, + "step": 18236 + }, + { + "epoch": 3.418369259606373, + "grad_norm": 58732.05078125, + "learning_rate": 1.9069473105973569e-06, + "loss": 2.0348, + "step": 18237 + }, + { + "epoch": 3.4185567010309277, + "grad_norm": 59776.9609375, + "learning_rate": 1.9047984634165738e-06, + "loss": 1.9609, + "step": 18238 + }, + { + "epoch": 3.4187441424554827, + "grad_norm": 53855.1328125, + "learning_rate": 1.9026508041249059e-06, + "loss": 1.9684, + "step": 18239 + }, + { + "epoch": 3.4189315838800374, + "grad_norm": 51811.51953125, + "learning_rate": 1.9005043327754046e-06, + "loss": 2.0249, + "step": 18240 + }, + { + "epoch": 3.4191190253045924, + "grad_norm": 60144.01953125, + "learning_rate": 1.898359049421089e-06, + "loss": 2.063, + "step": 18241 + }, + { + "epoch": 3.419306466729147, + "grad_norm": 55812.7890625, + "learning_rate": 1.8962149541149332e-06, + "loss": 2.061, + "step": 18242 + }, + { + "epoch": 3.419493908153702, + "grad_norm": 52848.65234375, + "learning_rate": 1.8940720469099061e-06, + "loss": 2.0619, + "step": 18243 + }, + { + "epoch": 3.4196813495782568, + "grad_norm": 52161.25, + "learning_rate": 1.8919303278589262e-06, + "loss": 2.0703, + "step": 18244 + }, + { + "epoch": 3.419868791002812, + "grad_norm": 56752.25390625, + "learning_rate": 1.889789797014896e-06, + "loss": 2.1206, + "step": 18245 + }, + { + "epoch": 3.4200562324273664, + "grad_norm": 52828.1953125, + "learning_rate": 1.8876504544306839e-06, + "loss": 2.0371, + "step": 18246 + }, + { + "epoch": 3.420243673851921, + "grad_norm": 62476.9453125, + "learning_rate": 1.8855123001591201e-06, + "loss": 2.0899, + "step": 18247 + }, + { + "epoch": 3.420431115276476, + "grad_norm": 56238.25, + "learning_rate": 1.8833753342530235e-06, + "loss": 2.0968, + "step": 18248 + }, + { + "epoch": 3.4206185567010308, + "grad_norm": 56990.1953125, + "learning_rate": 1.8812395567651742e-06, + "loss": 2.0197, + "step": 18249 + }, + { + "epoch": 3.420805998125586, + "grad_norm": 57236.55859375, + "learning_rate": 1.879104967748324e-06, + "loss": 2.0496, + "step": 18250 + }, + { + "epoch": 3.4209934395501405, + "grad_norm": 56475.1875, + "learning_rate": 1.8769715672551868e-06, + "loss": 1.9727, + "step": 18251 + }, + { + "epoch": 3.4211808809746955, + "grad_norm": 58766.40625, + "learning_rate": 1.874839355338459e-06, + "loss": 2.0871, + "step": 18252 + }, + { + "epoch": 3.42136832239925, + "grad_norm": 53424.24609375, + "learning_rate": 1.87270833205081e-06, + "loss": 2.0677, + "step": 18253 + }, + { + "epoch": 3.421555763823805, + "grad_norm": 56744.625, + "learning_rate": 1.870578497444858e-06, + "loss": 2.0808, + "step": 18254 + }, + { + "epoch": 3.42174320524836, + "grad_norm": 53343.51171875, + "learning_rate": 1.8684498515732173e-06, + "loss": 2.0516, + "step": 18255 + }, + { + "epoch": 3.421930646672915, + "grad_norm": 54292.4296875, + "learning_rate": 1.866322394488479e-06, + "loss": 2.088, + "step": 18256 + }, + { + "epoch": 3.4221180880974695, + "grad_norm": 54838.86328125, + "learning_rate": 1.8641961262431506e-06, + "loss": 2.0616, + "step": 18257 + }, + { + "epoch": 3.422305529522024, + "grad_norm": 53541.40625, + "learning_rate": 1.862071046889774e-06, + "loss": 2.0715, + "step": 18258 + }, + { + "epoch": 3.4224929709465792, + "grad_norm": 54219.09375, + "learning_rate": 1.8599471564808347e-06, + "loss": 2.1388, + "step": 18259 + }, + { + "epoch": 3.422680412371134, + "grad_norm": 61612.02734375, + "learning_rate": 1.8578244550687906e-06, + "loss": 2.0555, + "step": 18260 + }, + { + "epoch": 3.422867853795689, + "grad_norm": 58634.84375, + "learning_rate": 1.8557029427060557e-06, + "loss": 1.9961, + "step": 18261 + }, + { + "epoch": 3.4230552952202435, + "grad_norm": 56768.984375, + "learning_rate": 1.8535826194450378e-06, + "loss": 2.0455, + "step": 18262 + }, + { + "epoch": 3.4232427366447986, + "grad_norm": 53621.7578125, + "learning_rate": 1.8514634853381118e-06, + "loss": 2.0707, + "step": 18263 + }, + { + "epoch": 3.4234301780693532, + "grad_norm": 58584.8359375, + "learning_rate": 1.8493455404376137e-06, + "loss": 2.0706, + "step": 18264 + }, + { + "epoch": 3.4236176194939083, + "grad_norm": 60128.703125, + "learning_rate": 1.8472287847958458e-06, + "loss": 1.9745, + "step": 18265 + }, + { + "epoch": 3.423805060918463, + "grad_norm": 55941.953125, + "learning_rate": 1.8451132184650999e-06, + "loss": 2.0152, + "step": 18266 + }, + { + "epoch": 3.423992502343018, + "grad_norm": 53553.41796875, + "learning_rate": 1.8429988414976174e-06, + "loss": 2.0971, + "step": 18267 + }, + { + "epoch": 3.4241799437675726, + "grad_norm": 58486.33984375, + "learning_rate": 1.8408856539456398e-06, + "loss": 2.1018, + "step": 18268 + }, + { + "epoch": 3.4243673851921272, + "grad_norm": 56062.921875, + "learning_rate": 1.8387736558613422e-06, + "loss": 2.0716, + "step": 18269 + }, + { + "epoch": 3.4245548266166823, + "grad_norm": 57641.375, + "learning_rate": 1.8366628472968884e-06, + "loss": 2.1301, + "step": 18270 + }, + { + "epoch": 3.424742268041237, + "grad_norm": 58828.99609375, + "learning_rate": 1.8345532283044198e-06, + "loss": 2.0376, + "step": 18271 + }, + { + "epoch": 3.424929709465792, + "grad_norm": 61029.48046875, + "learning_rate": 1.8324447989360393e-06, + "loss": 1.9996, + "step": 18272 + }, + { + "epoch": 3.4251171508903466, + "grad_norm": 56487.2578125, + "learning_rate": 1.8303375592438276e-06, + "loss": 2.0858, + "step": 18273 + }, + { + "epoch": 3.4253045923149017, + "grad_norm": 54187.96484375, + "learning_rate": 1.8282315092798152e-06, + "loss": 2.1108, + "step": 18274 + }, + { + "epoch": 3.4254920337394563, + "grad_norm": 53578.5, + "learning_rate": 1.8261266490960383e-06, + "loss": 2.0601, + "step": 18275 + }, + { + "epoch": 3.4256794751640114, + "grad_norm": 51372.26171875, + "learning_rate": 1.8240229787444718e-06, + "loss": 2.069, + "step": 18276 + }, + { + "epoch": 3.425866916588566, + "grad_norm": 52982.4296875, + "learning_rate": 1.8219204982770688e-06, + "loss": 2.1208, + "step": 18277 + }, + { + "epoch": 3.426054358013121, + "grad_norm": 57755.03125, + "learning_rate": 1.819819207745771e-06, + "loss": 1.9895, + "step": 18278 + }, + { + "epoch": 3.4262417994376757, + "grad_norm": 53029.1171875, + "learning_rate": 1.8177191072024758e-06, + "loss": 2.0256, + "step": 18279 + }, + { + "epoch": 3.4264292408622303, + "grad_norm": 59530.55859375, + "learning_rate": 1.8156201966990527e-06, + "loss": 2.0662, + "step": 18280 + }, + { + "epoch": 3.4266166822867854, + "grad_norm": 57750.48828125, + "learning_rate": 1.813522476287327e-06, + "loss": 2.1, + "step": 18281 + }, + { + "epoch": 3.42680412371134, + "grad_norm": 56493.5546875, + "learning_rate": 1.8114259460191296e-06, + "loss": 2.0147, + "step": 18282 + }, + { + "epoch": 3.426991565135895, + "grad_norm": 54941.140625, + "learning_rate": 1.8093306059462357e-06, + "loss": 2.1091, + "step": 18283 + }, + { + "epoch": 3.4271790065604497, + "grad_norm": 57646.4375, + "learning_rate": 1.807236456120387e-06, + "loss": 2.0354, + "step": 18284 + }, + { + "epoch": 3.427366447985005, + "grad_norm": 57219.28515625, + "learning_rate": 1.80514349659332e-06, + "loss": 2.0925, + "step": 18285 + }, + { + "epoch": 3.4275538894095594, + "grad_norm": 57930.01953125, + "learning_rate": 1.8030517274167324e-06, + "loss": 2.0045, + "step": 18286 + }, + { + "epoch": 3.4277413308341145, + "grad_norm": 50274.4375, + "learning_rate": 1.8009611486422662e-06, + "loss": 2.0277, + "step": 18287 + }, + { + "epoch": 3.427928772258669, + "grad_norm": 61135.1953125, + "learning_rate": 1.7988717603215687e-06, + "loss": 2.0785, + "step": 18288 + }, + { + "epoch": 3.428116213683224, + "grad_norm": 56263.60546875, + "learning_rate": 1.7967835625062491e-06, + "loss": 2.1181, + "step": 18289 + }, + { + "epoch": 3.428303655107779, + "grad_norm": 58273.703125, + "learning_rate": 1.7946965552478823e-06, + "loss": 2.0811, + "step": 18290 + }, + { + "epoch": 3.4284910965323334, + "grad_norm": 50113.37890625, + "learning_rate": 1.792610738598005e-06, + "loss": 2.0427, + "step": 18291 + }, + { + "epoch": 3.4286785379568885, + "grad_norm": 52953.3359375, + "learning_rate": 1.7905261126081429e-06, + "loss": 2.0563, + "step": 18292 + }, + { + "epoch": 3.428865979381443, + "grad_norm": 54467.47265625, + "learning_rate": 1.7884426773297768e-06, + "loss": 2.1607, + "step": 18293 + }, + { + "epoch": 3.429053420805998, + "grad_norm": 54454.51953125, + "learning_rate": 1.7863604328143769e-06, + "loss": 2.1464, + "step": 18294 + }, + { + "epoch": 3.429240862230553, + "grad_norm": 53978.7734375, + "learning_rate": 1.7842793791133627e-06, + "loss": 2.0934, + "step": 18295 + }, + { + "epoch": 3.429428303655108, + "grad_norm": 61358.28125, + "learning_rate": 1.7821995162781324e-06, + "loss": 1.9936, + "step": 18296 + }, + { + "epoch": 3.4296157450796625, + "grad_norm": 55371.98046875, + "learning_rate": 1.780120844360067e-06, + "loss": 2.0933, + "step": 18297 + }, + { + "epoch": 3.4298031865042176, + "grad_norm": 56214.5546875, + "learning_rate": 1.7780433634104866e-06, + "loss": 2.044, + "step": 18298 + }, + { + "epoch": 3.429990627928772, + "grad_norm": 50136.171875, + "learning_rate": 1.775967073480722e-06, + "loss": 2.0734, + "step": 18299 + }, + { + "epoch": 3.4301780693533273, + "grad_norm": 56097.12890625, + "learning_rate": 1.773891974622044e-06, + "loss": 2.044, + "step": 18300 + }, + { + "epoch": 3.430365510777882, + "grad_norm": 56079.90625, + "learning_rate": 1.771818066885711e-06, + "loss": 2.1131, + "step": 18301 + }, + { + "epoch": 3.4305529522024365, + "grad_norm": 56058.14453125, + "learning_rate": 1.7697453503229378e-06, + "loss": 2.0697, + "step": 18302 + }, + { + "epoch": 3.4307403936269916, + "grad_norm": 55093.81640625, + "learning_rate": 1.7676738249849277e-06, + "loss": 2.0609, + "step": 18303 + }, + { + "epoch": 3.4309278350515466, + "grad_norm": 55362.16015625, + "learning_rate": 1.7656034909228403e-06, + "loss": 2.1141, + "step": 18304 + }, + { + "epoch": 3.4311152764761013, + "grad_norm": 54345.41796875, + "learning_rate": 1.7635343481878065e-06, + "loss": 1.96, + "step": 18305 + }, + { + "epoch": 3.431302717900656, + "grad_norm": 55023.734375, + "learning_rate": 1.7614663968309409e-06, + "loss": 2.127, + "step": 18306 + }, + { + "epoch": 3.431490159325211, + "grad_norm": 51730.41796875, + "learning_rate": 1.7593996369033083e-06, + "loss": 2.0437, + "step": 18307 + }, + { + "epoch": 3.4316776007497656, + "grad_norm": 56282.7734375, + "learning_rate": 1.757334068455957e-06, + "loss": 2.1013, + "step": 18308 + }, + { + "epoch": 3.4318650421743206, + "grad_norm": 58454.38671875, + "learning_rate": 1.755269691539918e-06, + "loss": 2.0785, + "step": 18309 + }, + { + "epoch": 3.4320524835988753, + "grad_norm": 53029.3671875, + "learning_rate": 1.7532065062061565e-06, + "loss": 2.1105, + "step": 18310 + }, + { + "epoch": 3.4322399250234303, + "grad_norm": 57646.2265625, + "learning_rate": 1.7511445125056424e-06, + "loss": 2.0584, + "step": 18311 + }, + { + "epoch": 3.432427366447985, + "grad_norm": 51692.51953125, + "learning_rate": 1.7490837104893075e-06, + "loss": 2.0265, + "step": 18312 + }, + { + "epoch": 3.4326148078725396, + "grad_norm": 56657.1171875, + "learning_rate": 1.7470241002080446e-06, + "loss": 2.0303, + "step": 18313 + }, + { + "epoch": 3.4328022492970947, + "grad_norm": 51099.61328125, + "learning_rate": 1.7449656817127235e-06, + "loss": 2.0288, + "step": 18314 + }, + { + "epoch": 3.4329896907216497, + "grad_norm": 56442.6015625, + "learning_rate": 1.7429084550541819e-06, + "loss": 2.0428, + "step": 18315 + }, + { + "epoch": 3.4331771321462043, + "grad_norm": 56536.59765625, + "learning_rate": 1.7408524202832511e-06, + "loss": 2.1176, + "step": 18316 + }, + { + "epoch": 3.433364573570759, + "grad_norm": 55224.0859375, + "learning_rate": 1.7387975774506848e-06, + "loss": 2.0985, + "step": 18317 + }, + { + "epoch": 3.433552014995314, + "grad_norm": 57332.03515625, + "learning_rate": 1.7367439266072482e-06, + "loss": 2.0799, + "step": 18318 + }, + { + "epoch": 3.4337394564198687, + "grad_norm": 56399.7421875, + "learning_rate": 1.7346914678036618e-06, + "loss": 2.051, + "step": 18319 + }, + { + "epoch": 3.4339268978444237, + "grad_norm": 53135.875, + "learning_rate": 1.732640201090624e-06, + "loss": 2.0377, + "step": 18320 + }, + { + "epoch": 3.4341143392689784, + "grad_norm": 56083.5, + "learning_rate": 1.7305901265187886e-06, + "loss": 2.0933, + "step": 18321 + }, + { + "epoch": 3.4343017806935334, + "grad_norm": 51350.8046875, + "learning_rate": 1.7285412441387982e-06, + "loss": 2.1628, + "step": 18322 + }, + { + "epoch": 3.434489222118088, + "grad_norm": 55938.91796875, + "learning_rate": 1.7264935540012572e-06, + "loss": 2.0891, + "step": 18323 + }, + { + "epoch": 3.434676663542643, + "grad_norm": 55384.0, + "learning_rate": 1.7244470561567305e-06, + "loss": 2.1051, + "step": 18324 + }, + { + "epoch": 3.4348641049671977, + "grad_norm": 57527.59765625, + "learning_rate": 1.722401750655772e-06, + "loss": 2.0708, + "step": 18325 + }, + { + "epoch": 3.435051546391753, + "grad_norm": 54718.234375, + "learning_rate": 1.7203576375489028e-06, + "loss": 2.083, + "step": 18326 + }, + { + "epoch": 3.4352389878163074, + "grad_norm": 55673.91796875, + "learning_rate": 1.7183147168866044e-06, + "loss": 2.0268, + "step": 18327 + }, + { + "epoch": 3.435426429240862, + "grad_norm": 54313.94140625, + "learning_rate": 1.7162729887193252e-06, + "loss": 2.1315, + "step": 18328 + }, + { + "epoch": 3.435613870665417, + "grad_norm": 53059.40625, + "learning_rate": 1.714232453097514e-06, + "loss": 2.0559, + "step": 18329 + }, + { + "epoch": 3.4358013120899717, + "grad_norm": 58080.81640625, + "learning_rate": 1.712193110071547e-06, + "loss": 2.1147, + "step": 18330 + }, + { + "epoch": 3.435988753514527, + "grad_norm": 50811.98046875, + "learning_rate": 1.710154959691812e-06, + "loss": 1.9996, + "step": 18331 + }, + { + "epoch": 3.4361761949390814, + "grad_norm": 60265.5390625, + "learning_rate": 1.7081180020086352e-06, + "loss": 2.0399, + "step": 18332 + }, + { + "epoch": 3.4363636363636365, + "grad_norm": 54626.08203125, + "learning_rate": 1.7060822370723374e-06, + "loss": 2.0395, + "step": 18333 + }, + { + "epoch": 3.436551077788191, + "grad_norm": 59925.08984375, + "learning_rate": 1.7040476649331893e-06, + "loss": 2.0792, + "step": 18334 + }, + { + "epoch": 3.436738519212746, + "grad_norm": 56264.5234375, + "learning_rate": 1.702014285641451e-06, + "loss": 2.0428, + "step": 18335 + }, + { + "epoch": 3.436925960637301, + "grad_norm": 56101.12109375, + "learning_rate": 1.6999820992473436e-06, + "loss": 2.1333, + "step": 18336 + }, + { + "epoch": 3.437113402061856, + "grad_norm": 54304.5234375, + "learning_rate": 1.6979511058010489e-06, + "loss": 2.0556, + "step": 18337 + }, + { + "epoch": 3.4373008434864105, + "grad_norm": 53394.9375, + "learning_rate": 1.6959213053527433e-06, + "loss": 2.0469, + "step": 18338 + }, + { + "epoch": 3.437488284910965, + "grad_norm": 60640.69921875, + "learning_rate": 1.6938926979525538e-06, + "loss": 2.0164, + "step": 18339 + }, + { + "epoch": 3.43767572633552, + "grad_norm": 57203.21484375, + "learning_rate": 1.6918652836505844e-06, + "loss": 2.0568, + "step": 18340 + }, + { + "epoch": 3.437863167760075, + "grad_norm": 57108.05078125, + "learning_rate": 1.6898390624969063e-06, + "loss": 2.0719, + "step": 18341 + }, + { + "epoch": 3.43805060918463, + "grad_norm": 51942.640625, + "learning_rate": 1.6878140345415683e-06, + "loss": 2.073, + "step": 18342 + }, + { + "epoch": 3.4382380506091845, + "grad_norm": 51647.02734375, + "learning_rate": 1.6857901998345915e-06, + "loss": 2.0952, + "step": 18343 + }, + { + "epoch": 3.4384254920337396, + "grad_norm": 54633.12109375, + "learning_rate": 1.6837675584259527e-06, + "loss": 2.0945, + "step": 18344 + }, + { + "epoch": 3.438612933458294, + "grad_norm": 56470.65625, + "learning_rate": 1.6817461103656174e-06, + "loss": 2.0731, + "step": 18345 + }, + { + "epoch": 3.4388003748828493, + "grad_norm": 53919.73828125, + "learning_rate": 1.6797258557035012e-06, + "loss": 2.0423, + "step": 18346 + }, + { + "epoch": 3.438987816307404, + "grad_norm": 54490.8984375, + "learning_rate": 1.6777067944895087e-06, + "loss": 2.0164, + "step": 18347 + }, + { + "epoch": 3.439175257731959, + "grad_norm": 52194.19921875, + "learning_rate": 1.6756889267735054e-06, + "loss": 2.1224, + "step": 18348 + }, + { + "epoch": 3.4393626991565136, + "grad_norm": 55411.96484375, + "learning_rate": 1.6736722526053405e-06, + "loss": 2.0406, + "step": 18349 + }, + { + "epoch": 3.4395501405810682, + "grad_norm": 54996.0234375, + "learning_rate": 1.6716567720348132e-06, + "loss": 2.0381, + "step": 18350 + }, + { + "epoch": 3.4397375820056233, + "grad_norm": 56212.05859375, + "learning_rate": 1.6696424851116999e-06, + "loss": 1.9773, + "step": 18351 + }, + { + "epoch": 3.439925023430178, + "grad_norm": 54282.80859375, + "learning_rate": 1.6676293918857611e-06, + "loss": 2.1045, + "step": 18352 + }, + { + "epoch": 3.440112464854733, + "grad_norm": 57769.515625, + "learning_rate": 1.6656174924067125e-06, + "loss": 2.046, + "step": 18353 + }, + { + "epoch": 3.4402999062792876, + "grad_norm": 54615.06640625, + "learning_rate": 1.6636067867242366e-06, + "loss": 2.069, + "step": 18354 + }, + { + "epoch": 3.4404873477038427, + "grad_norm": 55203.10546875, + "learning_rate": 1.661597274888005e-06, + "loss": 2.0615, + "step": 18355 + }, + { + "epoch": 3.4406747891283973, + "grad_norm": 51108.48046875, + "learning_rate": 1.6595889569476608e-06, + "loss": 2.0943, + "step": 18356 + }, + { + "epoch": 3.4408622305529524, + "grad_norm": 57237.390625, + "learning_rate": 1.657581832952787e-06, + "loss": 2.0233, + "step": 18357 + }, + { + "epoch": 3.441049671977507, + "grad_norm": 51375.99609375, + "learning_rate": 1.655575902952966e-06, + "loss": 2.0504, + "step": 18358 + }, + { + "epoch": 3.441237113402062, + "grad_norm": 55154.86328125, + "learning_rate": 1.6535711669977417e-06, + "loss": 2.0048, + "step": 18359 + }, + { + "epoch": 3.4414245548266167, + "grad_norm": 57220.62109375, + "learning_rate": 1.651567625136624e-06, + "loss": 2.0506, + "step": 18360 + }, + { + "epoch": 3.4416119962511713, + "grad_norm": 58799.75, + "learning_rate": 1.6495652774191072e-06, + "loss": 2.0691, + "step": 18361 + }, + { + "epoch": 3.4417994376757264, + "grad_norm": 57418.34375, + "learning_rate": 1.6475641238946405e-06, + "loss": 2.087, + "step": 18362 + }, + { + "epoch": 3.441986879100281, + "grad_norm": 55750.81640625, + "learning_rate": 1.6455641646126452e-06, + "loss": 2.1827, + "step": 18363 + }, + { + "epoch": 3.442174320524836, + "grad_norm": 55769.67578125, + "learning_rate": 1.643565399622521e-06, + "loss": 2.0342, + "step": 18364 + }, + { + "epoch": 3.4423617619493907, + "grad_norm": 55881.6953125, + "learning_rate": 1.6415678289736447e-06, + "loss": 2.1156, + "step": 18365 + }, + { + "epoch": 3.4425492033739458, + "grad_norm": 54381.0078125, + "learning_rate": 1.6395714527153383e-06, + "loss": 2.0956, + "step": 18366 + }, + { + "epoch": 3.4427366447985004, + "grad_norm": 55655.87109375, + "learning_rate": 1.6375762708969178e-06, + "loss": 2.0096, + "step": 18367 + }, + { + "epoch": 3.4429240862230555, + "grad_norm": 53888.28125, + "learning_rate": 1.6355822835676659e-06, + "loss": 2.1157, + "step": 18368 + }, + { + "epoch": 3.44311152764761, + "grad_norm": 53846.0625, + "learning_rate": 1.6335894907768267e-06, + "loss": 2.0096, + "step": 18369 + }, + { + "epoch": 3.443298969072165, + "grad_norm": 55496.37890625, + "learning_rate": 1.6315978925736109e-06, + "loss": 2.007, + "step": 18370 + }, + { + "epoch": 3.4434864104967198, + "grad_norm": 55412.0, + "learning_rate": 1.6296074890072178e-06, + "loss": 2.0752, + "step": 18371 + }, + { + "epoch": 3.4436738519212744, + "grad_norm": 54067.30078125, + "learning_rate": 1.6276182801268082e-06, + "loss": 2.141, + "step": 18372 + }, + { + "epoch": 3.4438612933458295, + "grad_norm": 60177.54296875, + "learning_rate": 1.6256302659815148e-06, + "loss": 2.0799, + "step": 18373 + }, + { + "epoch": 3.444048734770384, + "grad_norm": 57247.70703125, + "learning_rate": 1.623643446620432e-06, + "loss": 2.0541, + "step": 18374 + }, + { + "epoch": 3.444236176194939, + "grad_norm": 56972.55859375, + "learning_rate": 1.621657822092637e-06, + "loss": 2.0317, + "step": 18375 + }, + { + "epoch": 3.444423617619494, + "grad_norm": 58231.51953125, + "learning_rate": 1.619673392447174e-06, + "loss": 2.0882, + "step": 18376 + }, + { + "epoch": 3.444611059044049, + "grad_norm": 58337.60546875, + "learning_rate": 1.6176901577330428e-06, + "loss": 2.0255, + "step": 18377 + }, + { + "epoch": 3.4447985004686035, + "grad_norm": 54948.390625, + "learning_rate": 1.6157081179992373e-06, + "loss": 2.1738, + "step": 18378 + }, + { + "epoch": 3.4449859418931585, + "grad_norm": 55382.13671875, + "learning_rate": 1.6137272732947239e-06, + "loss": 2.1063, + "step": 18379 + }, + { + "epoch": 3.445173383317713, + "grad_norm": 53013.03515625, + "learning_rate": 1.6117476236683971e-06, + "loss": 2.0792, + "step": 18380 + }, + { + "epoch": 3.4453608247422682, + "grad_norm": 51458.48828125, + "learning_rate": 1.609769169169173e-06, + "loss": 2.1318, + "step": 18381 + }, + { + "epoch": 3.445548266166823, + "grad_norm": 58732.44921875, + "learning_rate": 1.6077919098459126e-06, + "loss": 2.1145, + "step": 18382 + }, + { + "epoch": 3.4457357075913775, + "grad_norm": 56916.0859375, + "learning_rate": 1.6058158457474548e-06, + "loss": 2.0428, + "step": 18383 + }, + { + "epoch": 3.4459231490159326, + "grad_norm": 54170.1328125, + "learning_rate": 1.6038409769225938e-06, + "loss": 2.0587, + "step": 18384 + }, + { + "epoch": 3.446110590440487, + "grad_norm": 55760.51171875, + "learning_rate": 1.6018673034201181e-06, + "loss": 2.134, + "step": 18385 + }, + { + "epoch": 3.4462980318650422, + "grad_norm": 55800.3515625, + "learning_rate": 1.599894825288767e-06, + "loss": 2.1352, + "step": 18386 + }, + { + "epoch": 3.446485473289597, + "grad_norm": 56256.0234375, + "learning_rate": 1.5979235425772676e-06, + "loss": 2.0404, + "step": 18387 + }, + { + "epoch": 3.446672914714152, + "grad_norm": 57002.546875, + "learning_rate": 1.5959534553342982e-06, + "loss": 2.067, + "step": 18388 + }, + { + "epoch": 3.4468603561387066, + "grad_norm": 57359.7265625, + "learning_rate": 1.5939845636085304e-06, + "loss": 2.1297, + "step": 18389 + }, + { + "epoch": 3.4470477975632616, + "grad_norm": 54298.23046875, + "learning_rate": 1.5920168674485703e-06, + "loss": 2.1516, + "step": 18390 + }, + { + "epoch": 3.4472352389878163, + "grad_norm": 55778.046875, + "learning_rate": 1.5900503669030453e-06, + "loss": 1.9868, + "step": 18391 + }, + { + "epoch": 3.4474226804123713, + "grad_norm": 54116.8203125, + "learning_rate": 1.5880850620205057e-06, + "loss": 2.1264, + "step": 18392 + }, + { + "epoch": 3.447610121836926, + "grad_norm": 59321.84765625, + "learning_rate": 1.5861209528494958e-06, + "loss": 2.101, + "step": 18393 + }, + { + "epoch": 3.4477975632614806, + "grad_norm": 53334.22265625, + "learning_rate": 1.5841580394385324e-06, + "loss": 2.0455, + "step": 18394 + }, + { + "epoch": 3.4479850046860356, + "grad_norm": 61448.48828125, + "learning_rate": 1.582196321836088e-06, + "loss": 2.1295, + "step": 18395 + }, + { + "epoch": 3.4481724461105903, + "grad_norm": 57354.75, + "learning_rate": 1.5802358000906291e-06, + "loss": 2.0803, + "step": 18396 + }, + { + "epoch": 3.4483598875351453, + "grad_norm": 53091.12890625, + "learning_rate": 1.5782764742505619e-06, + "loss": 2.0371, + "step": 18397 + }, + { + "epoch": 3.4485473289597, + "grad_norm": 56932.63671875, + "learning_rate": 1.5763183443642859e-06, + "loss": 2.0211, + "step": 18398 + }, + { + "epoch": 3.448734770384255, + "grad_norm": 59387.2265625, + "learning_rate": 1.5743614104801685e-06, + "loss": 2.1116, + "step": 18399 + }, + { + "epoch": 3.4489222118088096, + "grad_norm": 58312.046875, + "learning_rate": 1.5724056726465375e-06, + "loss": 2.005, + "step": 18400 + }, + { + "epoch": 3.4491096532333647, + "grad_norm": 49758.33984375, + "learning_rate": 1.5704511309116931e-06, + "loss": 2.0782, + "step": 18401 + }, + { + "epoch": 3.4492970946579193, + "grad_norm": 55996.99609375, + "learning_rate": 1.5684977853239302e-06, + "loss": 2.1009, + "step": 18402 + }, + { + "epoch": 3.4494845360824744, + "grad_norm": 56779.26171875, + "learning_rate": 1.5665456359314711e-06, + "loss": 2.05, + "step": 18403 + }, + { + "epoch": 3.449671977507029, + "grad_norm": 54394.78125, + "learning_rate": 1.564594682782533e-06, + "loss": 2.0409, + "step": 18404 + }, + { + "epoch": 3.4498594189315837, + "grad_norm": 58070.9375, + "learning_rate": 1.5626449259253218e-06, + "loss": 2.042, + "step": 18405 + }, + { + "epoch": 3.4500468603561387, + "grad_norm": 55175.03125, + "learning_rate": 1.5606963654079766e-06, + "loss": 2.0643, + "step": 18406 + }, + { + "epoch": 3.4502343017806933, + "grad_norm": 52086.8984375, + "learning_rate": 1.5587490012786255e-06, + "loss": 2.1058, + "step": 18407 + }, + { + "epoch": 3.4504217432052484, + "grad_norm": 51374.5234375, + "learning_rate": 1.5568028335853636e-06, + "loss": 2.0972, + "step": 18408 + }, + { + "epoch": 3.450609184629803, + "grad_norm": 59055.3046875, + "learning_rate": 1.55485786237628e-06, + "loss": 2.0915, + "step": 18409 + }, + { + "epoch": 3.450796626054358, + "grad_norm": 58278.9375, + "learning_rate": 1.5529140876993864e-06, + "loss": 2.1019, + "step": 18410 + }, + { + "epoch": 3.4509840674789127, + "grad_norm": 56119.1796875, + "learning_rate": 1.5509715096027e-06, + "loss": 2.0613, + "step": 18411 + }, + { + "epoch": 3.451171508903468, + "grad_norm": 53655.21875, + "learning_rate": 1.5490301281342102e-06, + "loss": 2.057, + "step": 18412 + }, + { + "epoch": 3.4513589503280224, + "grad_norm": 55225.50390625, + "learning_rate": 1.5470899433418562e-06, + "loss": 2.1611, + "step": 18413 + }, + { + "epoch": 3.4515463917525775, + "grad_norm": 51625.15234375, + "learning_rate": 1.5451509552735555e-06, + "loss": 2.0723, + "step": 18414 + }, + { + "epoch": 3.451733833177132, + "grad_norm": 61397.109375, + "learning_rate": 1.5432131639772085e-06, + "loss": 2.0809, + "step": 18415 + }, + { + "epoch": 3.4519212746016867, + "grad_norm": 58826.55078125, + "learning_rate": 1.5412765695006714e-06, + "loss": 2.1271, + "step": 18416 + }, + { + "epoch": 3.452108716026242, + "grad_norm": 52689.19140625, + "learning_rate": 1.5393411718917728e-06, + "loss": 2.0735, + "step": 18417 + }, + { + "epoch": 3.4522961574507964, + "grad_norm": 58083.33984375, + "learning_rate": 1.5374069711983187e-06, + "loss": 2.06, + "step": 18418 + }, + { + "epoch": 3.4524835988753515, + "grad_norm": 58331.28515625, + "learning_rate": 1.535473967468082e-06, + "loss": 2.1371, + "step": 18419 + }, + { + "epoch": 3.452671040299906, + "grad_norm": 56779.39453125, + "learning_rate": 1.5335421607488022e-06, + "loss": 2.0959, + "step": 18420 + }, + { + "epoch": 3.452858481724461, + "grad_norm": 58566.75, + "learning_rate": 1.5316115510881856e-06, + "loss": 2.0508, + "step": 18421 + }, + { + "epoch": 3.453045923149016, + "grad_norm": 53056.0234375, + "learning_rate": 1.5296821385339333e-06, + "loss": 2.0331, + "step": 18422 + }, + { + "epoch": 3.453233364573571, + "grad_norm": 52863.140625, + "learning_rate": 1.527753923133679e-06, + "loss": 2.0576, + "step": 18423 + }, + { + "epoch": 3.4534208059981255, + "grad_norm": 53410.9453125, + "learning_rate": 1.525826904935068e-06, + "loss": 2.0534, + "step": 18424 + }, + { + "epoch": 3.4536082474226806, + "grad_norm": 54507.62890625, + "learning_rate": 1.5239010839856793e-06, + "loss": 2.0402, + "step": 18425 + }, + { + "epoch": 3.453795688847235, + "grad_norm": 57675.875, + "learning_rate": 1.5219764603330855e-06, + "loss": 2.0149, + "step": 18426 + }, + { + "epoch": 3.45398313027179, + "grad_norm": 57037.1875, + "learning_rate": 1.5200530340248153e-06, + "loss": 2.0386, + "step": 18427 + }, + { + "epoch": 3.454170571696345, + "grad_norm": 59419.1875, + "learning_rate": 1.518130805108392e-06, + "loss": 2.0946, + "step": 18428 + }, + { + "epoch": 3.4543580131209, + "grad_norm": 57579.4375, + "learning_rate": 1.516209773631272e-06, + "loss": 2.142, + "step": 18429 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 53281.6328125, + "learning_rate": 1.5142899396409116e-06, + "loss": 2.0701, + "step": 18430 + }, + { + "epoch": 3.454732895970009, + "grad_norm": 53227.81640625, + "learning_rate": 1.5123713031847232e-06, + "loss": 2.1124, + "step": 18431 + }, + { + "epoch": 3.4549203373945643, + "grad_norm": 63789.0703125, + "learning_rate": 1.5104538643101074e-06, + "loss": 2.0655, + "step": 18432 + }, + { + "epoch": 3.455107778819119, + "grad_norm": 55219.6796875, + "learning_rate": 1.5085376230644044e-06, + "loss": 2.0899, + "step": 18433 + }, + { + "epoch": 3.455295220243674, + "grad_norm": 50797.92578125, + "learning_rate": 1.506622579494954e-06, + "loss": 2.092, + "step": 18434 + }, + { + "epoch": 3.4554826616682286, + "grad_norm": 55454.17578125, + "learning_rate": 1.5047087336490572e-06, + "loss": 2.0621, + "step": 18435 + }, + { + "epoch": 3.4556701030927837, + "grad_norm": 62586.25, + "learning_rate": 1.502796085573982e-06, + "loss": 2.1327, + "step": 18436 + }, + { + "epoch": 3.4558575445173383, + "grad_norm": 68611.875, + "learning_rate": 1.500884635316957e-06, + "loss": 2.0903, + "step": 18437 + }, + { + "epoch": 3.456044985941893, + "grad_norm": 54560.203125, + "learning_rate": 1.4989743829252056e-06, + "loss": 2.0251, + "step": 18438 + }, + { + "epoch": 3.456232427366448, + "grad_norm": 54691.84765625, + "learning_rate": 1.4970653284459068e-06, + "loss": 2.0077, + "step": 18439 + }, + { + "epoch": 3.456419868791003, + "grad_norm": 57321.12890625, + "learning_rate": 1.4951574719262063e-06, + "loss": 2.119, + "step": 18440 + }, + { + "epoch": 3.4566073102155577, + "grad_norm": 61690.97265625, + "learning_rate": 1.4932508134132273e-06, + "loss": 2.1017, + "step": 18441 + }, + { + "epoch": 3.4567947516401123, + "grad_norm": 53875.6875, + "learning_rate": 1.4913453529540655e-06, + "loss": 2.1148, + "step": 18442 + }, + { + "epoch": 3.4569821930646674, + "grad_norm": 54046.671875, + "learning_rate": 1.4894410905957833e-06, + "loss": 2.0734, + "step": 18443 + }, + { + "epoch": 3.457169634489222, + "grad_norm": 52209.3203125, + "learning_rate": 1.48753802638541e-06, + "loss": 2.055, + "step": 18444 + }, + { + "epoch": 3.457357075913777, + "grad_norm": 54015.64453125, + "learning_rate": 1.485636160369952e-06, + "loss": 2.0554, + "step": 18445 + }, + { + "epoch": 3.4575445173383317, + "grad_norm": 53960.30078125, + "learning_rate": 1.483735492596372e-06, + "loss": 2.0664, + "step": 18446 + }, + { + "epoch": 3.4577319587628867, + "grad_norm": 58937.06640625, + "learning_rate": 1.4818360231116324e-06, + "loss": 2.0631, + "step": 18447 + }, + { + "epoch": 3.4579194001874414, + "grad_norm": 55859.17578125, + "learning_rate": 1.4799377519626346e-06, + "loss": 2.0941, + "step": 18448 + }, + { + "epoch": 3.4581068416119964, + "grad_norm": 57836.30859375, + "learning_rate": 1.4780406791962687e-06, + "loss": 2.0638, + "step": 18449 + }, + { + "epoch": 3.458294283036551, + "grad_norm": 52098.4296875, + "learning_rate": 1.4761448048593918e-06, + "loss": 2.1226, + "step": 18450 + }, + { + "epoch": 3.458481724461106, + "grad_norm": 52152.1484375, + "learning_rate": 1.4742501289988163e-06, + "loss": 2.1326, + "step": 18451 + }, + { + "epoch": 3.4586691658856608, + "grad_norm": 51053.53125, + "learning_rate": 1.4723566516613551e-06, + "loss": 2.1184, + "step": 18452 + }, + { + "epoch": 3.4588566073102154, + "grad_norm": 58897.12109375, + "learning_rate": 1.4704643728937651e-06, + "loss": 2.0595, + "step": 18453 + }, + { + "epoch": 3.4590440487347704, + "grad_norm": 54607.39453125, + "learning_rate": 1.4685732927427864e-06, + "loss": 2.0936, + "step": 18454 + }, + { + "epoch": 3.459231490159325, + "grad_norm": 63788.7578125, + "learning_rate": 1.466683411255132e-06, + "loss": 2.0843, + "step": 18455 + }, + { + "epoch": 3.45941893158388, + "grad_norm": 61293.7578125, + "learning_rate": 1.4647947284774643e-06, + "loss": 2.0409, + "step": 18456 + }, + { + "epoch": 3.4596063730084348, + "grad_norm": 56263.0625, + "learning_rate": 1.4629072444564352e-06, + "loss": 2.1445, + "step": 18457 + }, + { + "epoch": 3.45979381443299, + "grad_norm": 61973.08203125, + "learning_rate": 1.4610209592386792e-06, + "loss": 2.0525, + "step": 18458 + }, + { + "epoch": 3.4599812558575445, + "grad_norm": 53997.3046875, + "learning_rate": 1.4591358728707704e-06, + "loss": 2.1018, + "step": 18459 + }, + { + "epoch": 3.4601686972820995, + "grad_norm": 58192.66015625, + "learning_rate": 1.4572519853992661e-06, + "loss": 2.0461, + "step": 18460 + }, + { + "epoch": 3.460356138706654, + "grad_norm": 53401.56640625, + "learning_rate": 1.4553692968707068e-06, + "loss": 2.0426, + "step": 18461 + }, + { + "epoch": 3.460543580131209, + "grad_norm": 58100.5546875, + "learning_rate": 1.4534878073315938e-06, + "loss": 2.036, + "step": 18462 + }, + { + "epoch": 3.460731021555764, + "grad_norm": 52692.9921875, + "learning_rate": 1.4516075168283793e-06, + "loss": 2.0526, + "step": 18463 + }, + { + "epoch": 3.4609184629803185, + "grad_norm": 54317.83984375, + "learning_rate": 1.449728425407515e-06, + "loss": 2.0456, + "step": 18464 + }, + { + "epoch": 3.4611059044048735, + "grad_norm": 58040.58984375, + "learning_rate": 1.4478505331154135e-06, + "loss": 2.0567, + "step": 18465 + }, + { + "epoch": 3.461293345829428, + "grad_norm": 52842.41796875, + "learning_rate": 1.44597383999846e-06, + "loss": 2.1071, + "step": 18466 + }, + { + "epoch": 3.4614807872539832, + "grad_norm": 57333.609375, + "learning_rate": 1.4440983461029956e-06, + "loss": 2.039, + "step": 18467 + }, + { + "epoch": 3.461668228678538, + "grad_norm": 54945.6171875, + "learning_rate": 1.442224051475355e-06, + "loss": 2.1517, + "step": 18468 + }, + { + "epoch": 3.461855670103093, + "grad_norm": 55970.26953125, + "learning_rate": 1.4403509561618233e-06, + "loss": 2.0696, + "step": 18469 + }, + { + "epoch": 3.4620431115276475, + "grad_norm": 51581.390625, + "learning_rate": 1.4384790602086584e-06, + "loss": 2.017, + "step": 18470 + }, + { + "epoch": 3.4622305529522026, + "grad_norm": 52713.73828125, + "learning_rate": 1.4366083636621008e-06, + "loss": 2.1403, + "step": 18471 + }, + { + "epoch": 3.4624179943767572, + "grad_norm": 55476.05078125, + "learning_rate": 1.434738866568358e-06, + "loss": 2.0514, + "step": 18472 + }, + { + "epoch": 3.4626054358013123, + "grad_norm": 57666.01953125, + "learning_rate": 1.4328705689735988e-06, + "loss": 2.0999, + "step": 18473 + }, + { + "epoch": 3.462792877225867, + "grad_norm": 56026.29296875, + "learning_rate": 1.431003470923964e-06, + "loss": 2.0164, + "step": 18474 + }, + { + "epoch": 3.4629803186504216, + "grad_norm": 52860.73046875, + "learning_rate": 1.4291375724655775e-06, + "loss": 1.9978, + "step": 18475 + }, + { + "epoch": 3.4631677600749766, + "grad_norm": 51140.171875, + "learning_rate": 1.4272728736445195e-06, + "loss": 2.1245, + "step": 18476 + }, + { + "epoch": 3.4633552014995312, + "grad_norm": 58059.6953125, + "learning_rate": 1.4254093745068364e-06, + "loss": 2.0461, + "step": 18477 + }, + { + "epoch": 3.4635426429240863, + "grad_norm": 56002.3671875, + "learning_rate": 1.4235470750985747e-06, + "loss": 2.0275, + "step": 18478 + }, + { + "epoch": 3.463730084348641, + "grad_norm": 64563.56640625, + "learning_rate": 1.421685975465714e-06, + "loss": 1.9976, + "step": 18479 + }, + { + "epoch": 3.463917525773196, + "grad_norm": 51284.37890625, + "learning_rate": 1.4198260756542347e-06, + "loss": 2.1037, + "step": 18480 + }, + { + "epoch": 3.4641049671977506, + "grad_norm": 52911.38671875, + "learning_rate": 1.4179673757100554e-06, + "loss": 2.155, + "step": 18481 + }, + { + "epoch": 3.4642924086223057, + "grad_norm": 51667.37890625, + "learning_rate": 1.4161098756791057e-06, + "loss": 2.0332, + "step": 18482 + }, + { + "epoch": 3.4644798500468603, + "grad_norm": 54072.15625, + "learning_rate": 1.4142535756072438e-06, + "loss": 2.1298, + "step": 18483 + }, + { + "epoch": 3.4646672914714154, + "grad_norm": 53631.71875, + "learning_rate": 1.4123984755403275e-06, + "loss": 2.0073, + "step": 18484 + }, + { + "epoch": 3.46485473289597, + "grad_norm": 52611.84765625, + "learning_rate": 1.410544575524181e-06, + "loss": 2.1515, + "step": 18485 + }, + { + "epoch": 3.4650421743205246, + "grad_norm": 56935.4140625, + "learning_rate": 1.4086918756045787e-06, + "loss": 2.1362, + "step": 18486 + }, + { + "epoch": 3.4652296157450797, + "grad_norm": 60378.63671875, + "learning_rate": 1.4068403758272842e-06, + "loss": 2.0905, + "step": 18487 + }, + { + "epoch": 3.4654170571696343, + "grad_norm": 55813.84375, + "learning_rate": 1.4049900762380386e-06, + "loss": 2.0743, + "step": 18488 + }, + { + "epoch": 3.4656044985941894, + "grad_norm": 55594.296875, + "learning_rate": 1.4031409768825333e-06, + "loss": 2.0435, + "step": 18489 + }, + { + "epoch": 3.465791940018744, + "grad_norm": 56979.11328125, + "learning_rate": 1.401293077806437e-06, + "loss": 2.0564, + "step": 18490 + }, + { + "epoch": 3.465979381443299, + "grad_norm": 55079.6015625, + "learning_rate": 1.3994463790553914e-06, + "loss": 2.0566, + "step": 18491 + }, + { + "epoch": 3.4661668228678537, + "grad_norm": 57322.2421875, + "learning_rate": 1.3976008806750152e-06, + "loss": 2.072, + "step": 18492 + }, + { + "epoch": 3.466354264292409, + "grad_norm": 53627.94921875, + "learning_rate": 1.395756582710872e-06, + "loss": 2.0427, + "step": 18493 + }, + { + "epoch": 3.4665417057169634, + "grad_norm": 54346.25390625, + "learning_rate": 1.3939134852085313e-06, + "loss": 2.0044, + "step": 18494 + }, + { + "epoch": 3.4667291471415185, + "grad_norm": 54856.55859375, + "learning_rate": 1.3920715882135116e-06, + "loss": 2.0955, + "step": 18495 + }, + { + "epoch": 3.466916588566073, + "grad_norm": 51659.36328125, + "learning_rate": 1.390230891771299e-06, + "loss": 2.0486, + "step": 18496 + }, + { + "epoch": 3.4671040299906277, + "grad_norm": 53855.09765625, + "learning_rate": 1.3883913959273576e-06, + "loss": 2.1179, + "step": 18497 + }, + { + "epoch": 3.467291471415183, + "grad_norm": 56009.21484375, + "learning_rate": 1.3865531007271281e-06, + "loss": 2.0164, + "step": 18498 + }, + { + "epoch": 3.4674789128397374, + "grad_norm": 57571.265625, + "learning_rate": 1.3847160062160081e-06, + "loss": 2.1011, + "step": 18499 + }, + { + "epoch": 3.4676663542642925, + "grad_norm": 58352.19140625, + "learning_rate": 1.3828801124393664e-06, + "loss": 2.0446, + "step": 18500 + }, + { + "epoch": 3.4676663542642925, + "eval_loss": 2.2577602863311768, + "eval_runtime": 132.1241, + "eval_samples_per_second": 38.214, + "eval_steps_per_second": 1.915, + "step": 18500 + }, + { + "epoch": 3.467853795688847, + "grad_norm": 60530.43359375, + "learning_rate": 1.3810454194425505e-06, + "loss": 2.0268, + "step": 18501 + }, + { + "epoch": 3.468041237113402, + "grad_norm": 56342.17578125, + "learning_rate": 1.3792119272708903e-06, + "loss": 2.0617, + "step": 18502 + }, + { + "epoch": 3.468228678537957, + "grad_norm": 60345.28515625, + "learning_rate": 1.37737963596965e-06, + "loss": 2.0204, + "step": 18503 + }, + { + "epoch": 3.468416119962512, + "grad_norm": 58858.30859375, + "learning_rate": 1.3755485455840934e-06, + "loss": 2.0777, + "step": 18504 + }, + { + "epoch": 3.4686035613870665, + "grad_norm": 50692.86328125, + "learning_rate": 1.373718656159445e-06, + "loss": 2.0755, + "step": 18505 + }, + { + "epoch": 3.4687910028116216, + "grad_norm": 55922.8125, + "learning_rate": 1.3718899677409025e-06, + "loss": 2.0877, + "step": 18506 + }, + { + "epoch": 3.468978444236176, + "grad_norm": 55977.671875, + "learning_rate": 1.370062480373624e-06, + "loss": 2.0533, + "step": 18507 + }, + { + "epoch": 3.469165885660731, + "grad_norm": 57341.9921875, + "learning_rate": 1.3682361941027621e-06, + "loss": 2.0685, + "step": 18508 + }, + { + "epoch": 3.469353327085286, + "grad_norm": 56721.8046875, + "learning_rate": 1.3664111089734088e-06, + "loss": 2.0967, + "step": 18509 + }, + { + "epoch": 3.4695407685098405, + "grad_norm": 54577.8125, + "learning_rate": 1.3645872250306503e-06, + "loss": 2.0735, + "step": 18510 + }, + { + "epoch": 3.4697282099343956, + "grad_norm": 56374.671875, + "learning_rate": 1.3627645423195222e-06, + "loss": 2.0629, + "step": 18511 + }, + { + "epoch": 3.46991565135895, + "grad_norm": 53601.5234375, + "learning_rate": 1.3609430608850614e-06, + "loss": 2.1063, + "step": 18512 + }, + { + "epoch": 3.4701030927835053, + "grad_norm": 57107.17578125, + "learning_rate": 1.3591227807722373e-06, + "loss": 2.0454, + "step": 18513 + }, + { + "epoch": 3.47029053420806, + "grad_norm": 54244.546875, + "learning_rate": 1.3573037020260194e-06, + "loss": 2.0497, + "step": 18514 + }, + { + "epoch": 3.470477975632615, + "grad_norm": 57879.265625, + "learning_rate": 1.3554858246913381e-06, + "loss": 2.1012, + "step": 18515 + }, + { + "epoch": 3.4706654170571696, + "grad_norm": 54406.296875, + "learning_rate": 1.3536691488130804e-06, + "loss": 2.0751, + "step": 18516 + }, + { + "epoch": 3.4708528584817246, + "grad_norm": 54908.19140625, + "learning_rate": 1.3518536744361322e-06, + "loss": 2.1356, + "step": 18517 + }, + { + "epoch": 3.4710402999062793, + "grad_norm": 54991.7421875, + "learning_rate": 1.3500394016053186e-06, + "loss": 2.1378, + "step": 18518 + }, + { + "epoch": 3.471227741330834, + "grad_norm": 52793.25, + "learning_rate": 1.3482263303654596e-06, + "loss": 2.0593, + "step": 18519 + }, + { + "epoch": 3.471415182755389, + "grad_norm": 55172.0703125, + "learning_rate": 1.346414460761325e-06, + "loss": 2.081, + "step": 18520 + }, + { + "epoch": 3.4716026241799436, + "grad_norm": 50471.66015625, + "learning_rate": 1.3446037928376842e-06, + "loss": 2.0684, + "step": 18521 + }, + { + "epoch": 3.4717900656044987, + "grad_norm": 58683.91796875, + "learning_rate": 1.3427943266392406e-06, + "loss": 2.0865, + "step": 18522 + }, + { + "epoch": 3.4719775070290533, + "grad_norm": 60799.14453125, + "learning_rate": 1.3409860622106918e-06, + "loss": 2.0284, + "step": 18523 + }, + { + "epoch": 3.4721649484536083, + "grad_norm": 58158.3125, + "learning_rate": 1.3391789995967019e-06, + "loss": 2.0466, + "step": 18524 + }, + { + "epoch": 3.472352389878163, + "grad_norm": 55321.51953125, + "learning_rate": 1.3373731388419074e-06, + "loss": 2.1077, + "step": 18525 + }, + { + "epoch": 3.472539831302718, + "grad_norm": 64995.6796875, + "learning_rate": 1.3355684799908952e-06, + "loss": 2.0161, + "step": 18526 + }, + { + "epoch": 3.4727272727272727, + "grad_norm": 52688.14453125, + "learning_rate": 1.333765023088246e-06, + "loss": 2.0483, + "step": 18527 + }, + { + "epoch": 3.4729147141518277, + "grad_norm": 55886.5703125, + "learning_rate": 1.3319627681785075e-06, + "loss": 2.0933, + "step": 18528 + }, + { + "epoch": 3.4731021555763824, + "grad_norm": 51355.6640625, + "learning_rate": 1.330161715306194e-06, + "loss": 2.0758, + "step": 18529 + }, + { + "epoch": 3.473289597000937, + "grad_norm": 56018.1171875, + "learning_rate": 1.3283618645157757e-06, + "loss": 2.0315, + "step": 18530 + }, + { + "epoch": 3.473477038425492, + "grad_norm": 58744.80859375, + "learning_rate": 1.3265632158517228e-06, + "loss": 2.0607, + "step": 18531 + }, + { + "epoch": 3.4736644798500467, + "grad_norm": 55651.43359375, + "learning_rate": 1.3247657693584548e-06, + "loss": 2.1132, + "step": 18532 + }, + { + "epoch": 3.4738519212746017, + "grad_norm": 51789.86328125, + "learning_rate": 1.3229695250803531e-06, + "loss": 2.043, + "step": 18533 + }, + { + "epoch": 3.4740393626991564, + "grad_norm": 53022.25390625, + "learning_rate": 1.321174483061799e-06, + "loss": 2.0328, + "step": 18534 + }, + { + "epoch": 3.4742268041237114, + "grad_norm": 55613.55078125, + "learning_rate": 1.3193806433471234e-06, + "loss": 2.1139, + "step": 18535 + }, + { + "epoch": 3.474414245548266, + "grad_norm": 55944.76171875, + "learning_rate": 1.31758800598063e-06, + "loss": 2.0514, + "step": 18536 + }, + { + "epoch": 3.474601686972821, + "grad_norm": 58979.66015625, + "learning_rate": 1.3157965710065945e-06, + "loss": 2.1675, + "step": 18537 + }, + { + "epoch": 3.4747891283973757, + "grad_norm": 53609.0, + "learning_rate": 1.3140063384692647e-06, + "loss": 2.0317, + "step": 18538 + }, + { + "epoch": 3.474976569821931, + "grad_norm": 58888.5390625, + "learning_rate": 1.3122173084128553e-06, + "loss": 2.023, + "step": 18539 + }, + { + "epoch": 3.4751640112464854, + "grad_norm": 55550.6328125, + "learning_rate": 1.3104294808815531e-06, + "loss": 2.0759, + "step": 18540 + }, + { + "epoch": 3.47535145267104, + "grad_norm": 52588.109375, + "learning_rate": 1.3086428559195118e-06, + "loss": 2.0379, + "step": 18541 + }, + { + "epoch": 3.475538894095595, + "grad_norm": 53702.3828125, + "learning_rate": 1.306857433570874e-06, + "loss": 2.1269, + "step": 18542 + }, + { + "epoch": 3.47572633552015, + "grad_norm": 53887.27734375, + "learning_rate": 1.305073213879715e-06, + "loss": 2.0317, + "step": 18543 + }, + { + "epoch": 3.475913776944705, + "grad_norm": 60789.2109375, + "learning_rate": 1.3032901968901167e-06, + "loss": 2.3109, + "step": 18544 + }, + { + "epoch": 3.4761012183692594, + "grad_norm": 55659.37109375, + "learning_rate": 1.3015083826461217e-06, + "loss": 2.1354, + "step": 18545 + }, + { + "epoch": 3.4762886597938145, + "grad_norm": 52624.484375, + "learning_rate": 1.2997277711917222e-06, + "loss": 2.0436, + "step": 18546 + }, + { + "epoch": 3.476476101218369, + "grad_norm": 60000.0390625, + "learning_rate": 1.297948362570911e-06, + "loss": 2.1032, + "step": 18547 + }, + { + "epoch": 3.476663542642924, + "grad_norm": 54440.140625, + "learning_rate": 1.2961701568276307e-06, + "loss": 2.0292, + "step": 18548 + }, + { + "epoch": 3.476850984067479, + "grad_norm": 50883.03515625, + "learning_rate": 1.2943931540057962e-06, + "loss": 2.0596, + "step": 18549 + }, + { + "epoch": 3.477038425492034, + "grad_norm": 54228.53125, + "learning_rate": 1.2926173541493058e-06, + "loss": 2.0649, + "step": 18550 + }, + { + "epoch": 3.4772258669165885, + "grad_norm": 56201.44921875, + "learning_rate": 1.2908427573020244e-06, + "loss": 2.0365, + "step": 18551 + }, + { + "epoch": 3.477413308341143, + "grad_norm": 52406.125, + "learning_rate": 1.2890693635077721e-06, + "loss": 2.0811, + "step": 18552 + }, + { + "epoch": 3.477600749765698, + "grad_norm": 57713.1171875, + "learning_rate": 1.2872971728103423e-06, + "loss": 2.1525, + "step": 18553 + }, + { + "epoch": 3.4777881911902533, + "grad_norm": 52041.44140625, + "learning_rate": 1.2855261852535216e-06, + "loss": 2.0379, + "step": 18554 + }, + { + "epoch": 3.477975632614808, + "grad_norm": 53772.6875, + "learning_rate": 1.2837564008810533e-06, + "loss": 2.0604, + "step": 18555 + }, + { + "epoch": 3.4781630740393625, + "grad_norm": 55566.4921875, + "learning_rate": 1.2819878197366298e-06, + "loss": 2.1586, + "step": 18556 + }, + { + "epoch": 3.4783505154639176, + "grad_norm": 57935.453125, + "learning_rate": 1.2802204418639384e-06, + "loss": 2.0946, + "step": 18557 + }, + { + "epoch": 3.4785379568884722, + "grad_norm": 48135.953125, + "learning_rate": 1.2784542673066502e-06, + "loss": 2.0219, + "step": 18558 + }, + { + "epoch": 3.4787253983130273, + "grad_norm": 52710.79296875, + "learning_rate": 1.2766892961083632e-06, + "loss": 2.0451, + "step": 18559 + }, + { + "epoch": 3.478912839737582, + "grad_norm": 55209.640625, + "learning_rate": 1.2749255283126815e-06, + "loss": 2.088, + "step": 18560 + }, + { + "epoch": 3.479100281162137, + "grad_norm": 57714.38671875, + "learning_rate": 1.2731629639631704e-06, + "loss": 2.1085, + "step": 18561 + }, + { + "epoch": 3.4792877225866916, + "grad_norm": 56039.37890625, + "learning_rate": 1.2714016031033615e-06, + "loss": 2.0951, + "step": 18562 + }, + { + "epoch": 3.4794751640112467, + "grad_norm": 52678.74609375, + "learning_rate": 1.2696414457767425e-06, + "loss": 2.0363, + "step": 18563 + }, + { + "epoch": 3.4796626054358013, + "grad_norm": 57119.48828125, + "learning_rate": 1.2678824920268062e-06, + "loss": 2.0426, + "step": 18564 + }, + { + "epoch": 3.4798500468603564, + "grad_norm": 55753.7734375, + "learning_rate": 1.2661247418969957e-06, + "loss": 2.0628, + "step": 18565 + }, + { + "epoch": 3.480037488284911, + "grad_norm": 50101.3984375, + "learning_rate": 1.2643681954307153e-06, + "loss": 2.1021, + "step": 18566 + }, + { + "epoch": 3.4802249297094656, + "grad_norm": 53378.16015625, + "learning_rate": 1.2626128526713522e-06, + "loss": 2.0996, + "step": 18567 + }, + { + "epoch": 3.4804123711340207, + "grad_norm": 56103.1953125, + "learning_rate": 1.2608587136622662e-06, + "loss": 2.0808, + "step": 18568 + }, + { + "epoch": 3.4805998125585753, + "grad_norm": 53195.81640625, + "learning_rate": 1.259105778446773e-06, + "loss": 2.0709, + "step": 18569 + }, + { + "epoch": 3.4807872539831304, + "grad_norm": 53548.3828125, + "learning_rate": 1.2573540470681822e-06, + "loss": 2.116, + "step": 18570 + }, + { + "epoch": 3.480974695407685, + "grad_norm": 56212.0546875, + "learning_rate": 1.2556035195697479e-06, + "loss": 2.0442, + "step": 18571 + }, + { + "epoch": 3.48116213683224, + "grad_norm": 60735.87890625, + "learning_rate": 1.2538541959947025e-06, + "loss": 2.0209, + "step": 18572 + }, + { + "epoch": 3.4813495782567947, + "grad_norm": 59225.08984375, + "learning_rate": 1.2521060763862668e-06, + "loss": 2.0408, + "step": 18573 + }, + { + "epoch": 3.4815370196813498, + "grad_norm": 53840.4296875, + "learning_rate": 1.2503591607876009e-06, + "loss": 2.1079, + "step": 18574 + }, + { + "epoch": 3.4817244611059044, + "grad_norm": 57477.08984375, + "learning_rate": 1.2486134492418645e-06, + "loss": 2.0539, + "step": 18575 + }, + { + "epoch": 3.4819119025304595, + "grad_norm": 50992.60546875, + "learning_rate": 1.246868941792162e-06, + "loss": 2.1296, + "step": 18576 + }, + { + "epoch": 3.482099343955014, + "grad_norm": 54194.5, + "learning_rate": 1.2451256384815924e-06, + "loss": 2.0515, + "step": 18577 + }, + { + "epoch": 3.4822867853795687, + "grad_norm": 51249.38671875, + "learning_rate": 1.2433835393532101e-06, + "loss": 2.0995, + "step": 18578 + }, + { + "epoch": 3.4824742268041238, + "grad_norm": 56482.59765625, + "learning_rate": 1.2416426444500362e-06, + "loss": 2.0942, + "step": 18579 + }, + { + "epoch": 3.4826616682286784, + "grad_norm": 56350.109375, + "learning_rate": 1.2399029538150697e-06, + "loss": 2.041, + "step": 18580 + }, + { + "epoch": 3.4828491096532335, + "grad_norm": 57012.57421875, + "learning_rate": 1.2381644674912874e-06, + "loss": 2.1016, + "step": 18581 + }, + { + "epoch": 3.483036551077788, + "grad_norm": 54778.57421875, + "learning_rate": 1.2364271855216215e-06, + "loss": 2.06, + "step": 18582 + }, + { + "epoch": 3.483223992502343, + "grad_norm": 55124.9765625, + "learning_rate": 1.234691107948982e-06, + "loss": 2.071, + "step": 18583 + }, + { + "epoch": 3.483411433926898, + "grad_norm": 55433.234375, + "learning_rate": 1.232956234816246e-06, + "loss": 2.0578, + "step": 18584 + }, + { + "epoch": 3.483598875351453, + "grad_norm": 51887.375, + "learning_rate": 1.231222566166268e-06, + "loss": 2.0661, + "step": 18585 + }, + { + "epoch": 3.4837863167760075, + "grad_norm": 56271.546875, + "learning_rate": 1.2294901020418525e-06, + "loss": 1.961, + "step": 18586 + }, + { + "epoch": 3.4839737582005625, + "grad_norm": 55387.5625, + "learning_rate": 1.2277588424858044e-06, + "loss": 2.098, + "step": 18587 + }, + { + "epoch": 3.484161199625117, + "grad_norm": 54636.87890625, + "learning_rate": 1.2260287875408838e-06, + "loss": 2.0882, + "step": 18588 + }, + { + "epoch": 3.484348641049672, + "grad_norm": 51369.47265625, + "learning_rate": 1.224299937249812e-06, + "loss": 2.0422, + "step": 18589 + }, + { + "epoch": 3.484536082474227, + "grad_norm": 53582.890625, + "learning_rate": 1.2225722916552883e-06, + "loss": 1.9949, + "step": 18590 + }, + { + "epoch": 3.4847235238987815, + "grad_norm": 56576.828125, + "learning_rate": 1.2208458507999954e-06, + "loss": 2.1263, + "step": 18591 + }, + { + "epoch": 3.4849109653233366, + "grad_norm": 56677.625, + "learning_rate": 1.2191206147265655e-06, + "loss": 2.1059, + "step": 18592 + }, + { + "epoch": 3.485098406747891, + "grad_norm": 56306.20703125, + "learning_rate": 1.2173965834776035e-06, + "loss": 2.0639, + "step": 18593 + }, + { + "epoch": 3.4852858481724462, + "grad_norm": 53117.5078125, + "learning_rate": 1.2156737570957033e-06, + "loss": 2.0002, + "step": 18594 + }, + { + "epoch": 3.485473289597001, + "grad_norm": 54885.328125, + "learning_rate": 1.213952135623414e-06, + "loss": 2.133, + "step": 18595 + }, + { + "epoch": 3.485660731021556, + "grad_norm": 58118.31640625, + "learning_rate": 1.2122317191032573e-06, + "loss": 2.0447, + "step": 18596 + }, + { + "epoch": 3.4858481724461106, + "grad_norm": 52466.51171875, + "learning_rate": 1.2105125075777157e-06, + "loss": 2.1003, + "step": 18597 + }, + { + "epoch": 3.4860356138706656, + "grad_norm": 56962.73828125, + "learning_rate": 1.208794501089261e-06, + "loss": 2.0498, + "step": 18598 + }, + { + "epoch": 3.4862230552952203, + "grad_norm": 56959.62109375, + "learning_rate": 1.2070776996803258e-06, + "loss": 2.0997, + "step": 18599 + }, + { + "epoch": 3.486410496719775, + "grad_norm": 56853.08203125, + "learning_rate": 1.2053621033933094e-06, + "loss": 2.0499, + "step": 18600 + }, + { + "epoch": 3.48659793814433, + "grad_norm": 52391.80859375, + "learning_rate": 1.2036477122705892e-06, + "loss": 2.113, + "step": 18601 + }, + { + "epoch": 3.4867853795688846, + "grad_norm": 56231.48828125, + "learning_rate": 1.2019345263544979e-06, + "loss": 2.0613, + "step": 18602 + }, + { + "epoch": 3.4869728209934396, + "grad_norm": 57426.1953125, + "learning_rate": 1.2002225456873572e-06, + "loss": 2.1128, + "step": 18603 + }, + { + "epoch": 3.4871602624179943, + "grad_norm": 57641.58984375, + "learning_rate": 1.1985117703114502e-06, + "loss": 2.1578, + "step": 18604 + }, + { + "epoch": 3.4873477038425493, + "grad_norm": 51462.60546875, + "learning_rate": 1.1968022002690372e-06, + "loss": 2.1775, + "step": 18605 + }, + { + "epoch": 3.487535145267104, + "grad_norm": 58286.57421875, + "learning_rate": 1.195093835602329e-06, + "loss": 2.0271, + "step": 18606 + }, + { + "epoch": 3.487722586691659, + "grad_norm": 58904.4453125, + "learning_rate": 1.1933866763535306e-06, + "loss": 2.037, + "step": 18607 + }, + { + "epoch": 3.4879100281162136, + "grad_norm": 55609.58203125, + "learning_rate": 1.1916807225647975e-06, + "loss": 2.0806, + "step": 18608 + }, + { + "epoch": 3.4880974695407687, + "grad_norm": 56118.80859375, + "learning_rate": 1.1899759742782735e-06, + "loss": 2.1652, + "step": 18609 + }, + { + "epoch": 3.4882849109653233, + "grad_norm": 54427.34375, + "learning_rate": 1.1882724315360528e-06, + "loss": 2.0413, + "step": 18610 + }, + { + "epoch": 3.488472352389878, + "grad_norm": 56445.5390625, + "learning_rate": 1.1865700943802294e-06, + "loss": 2.0698, + "step": 18611 + }, + { + "epoch": 3.488659793814433, + "grad_norm": 56246.80078125, + "learning_rate": 1.1848689628528308e-06, + "loss": 2.0975, + "step": 18612 + }, + { + "epoch": 3.4888472352389877, + "grad_norm": 55888.3359375, + "learning_rate": 1.1831690369958792e-06, + "loss": 2.0688, + "step": 18613 + }, + { + "epoch": 3.4890346766635427, + "grad_norm": 52576.359375, + "learning_rate": 1.181470316851363e-06, + "loss": 2.1246, + "step": 18614 + }, + { + "epoch": 3.4892221180880973, + "grad_norm": 52946.93359375, + "learning_rate": 1.179772802461232e-06, + "loss": 2.1342, + "step": 18615 + }, + { + "epoch": 3.4894095595126524, + "grad_norm": 58397.76953125, + "learning_rate": 1.1780764938674193e-06, + "loss": 2.0391, + "step": 18616 + }, + { + "epoch": 3.489597000937207, + "grad_norm": 53763.921875, + "learning_rate": 1.1763813911118139e-06, + "loss": 2.0835, + "step": 18617 + }, + { + "epoch": 3.489784442361762, + "grad_norm": 54969.5859375, + "learning_rate": 1.1746874942362929e-06, + "loss": 2.0449, + "step": 18618 + }, + { + "epoch": 3.4899718837863167, + "grad_norm": 53416.6328125, + "learning_rate": 1.1729948032826843e-06, + "loss": 2.0564, + "step": 18619 + }, + { + "epoch": 3.490159325210872, + "grad_norm": 50073.6953125, + "learning_rate": 1.171303318292799e-06, + "loss": 2.0805, + "step": 18620 + }, + { + "epoch": 3.4903467666354264, + "grad_norm": 63694.62109375, + "learning_rate": 1.1696130393084203e-06, + "loss": 2.051, + "step": 18621 + }, + { + "epoch": 3.490534208059981, + "grad_norm": 51152.7578125, + "learning_rate": 1.1679239663712871e-06, + "loss": 2.0198, + "step": 18622 + }, + { + "epoch": 3.490721649484536, + "grad_norm": 60435.58984375, + "learning_rate": 1.1662360995231158e-06, + "loss": 2.0873, + "step": 18623 + }, + { + "epoch": 3.4909090909090907, + "grad_norm": 51101.27734375, + "learning_rate": 1.164549438805601e-06, + "loss": 2.0442, + "step": 18624 + }, + { + "epoch": 3.491096532333646, + "grad_norm": 58349.83984375, + "learning_rate": 1.1628639842603984e-06, + "loss": 2.0446, + "step": 18625 + }, + { + "epoch": 3.4912839737582004, + "grad_norm": 56209.8203125, + "learning_rate": 1.161179735929141e-06, + "loss": 2.0161, + "step": 18626 + }, + { + "epoch": 3.4914714151827555, + "grad_norm": 61508.37109375, + "learning_rate": 1.159496693853418e-06, + "loss": 2.0376, + "step": 18627 + }, + { + "epoch": 3.49165885660731, + "grad_norm": 55748.11328125, + "learning_rate": 1.1578148580748073e-06, + "loss": 2.0495, + "step": 18628 + }, + { + "epoch": 3.491846298031865, + "grad_norm": 62185.8125, + "learning_rate": 1.1561342286348475e-06, + "loss": 2.0933, + "step": 18629 + }, + { + "epoch": 3.49203373945642, + "grad_norm": 57805.20703125, + "learning_rate": 1.1544548055750392e-06, + "loss": 2.0538, + "step": 18630 + }, + { + "epoch": 3.492221180880975, + "grad_norm": 57084.37109375, + "learning_rate": 1.1527765889368714e-06, + "loss": 2.104, + "step": 18631 + }, + { + "epoch": 3.4924086223055295, + "grad_norm": 59519.03515625, + "learning_rate": 1.1510995787617884e-06, + "loss": 2.0852, + "step": 18632 + }, + { + "epoch": 3.492596063730084, + "grad_norm": 55272.53515625, + "learning_rate": 1.1494237750912184e-06, + "loss": 2.0386, + "step": 18633 + }, + { + "epoch": 3.492783505154639, + "grad_norm": 50895.55859375, + "learning_rate": 1.1477491779665339e-06, + "loss": 2.1, + "step": 18634 + }, + { + "epoch": 3.492970946579194, + "grad_norm": 58187.12890625, + "learning_rate": 1.1460757874291184e-06, + "loss": 2.0101, + "step": 18635 + }, + { + "epoch": 3.493158388003749, + "grad_norm": 55925.265625, + "learning_rate": 1.1444036035202833e-06, + "loss": 2.0342, + "step": 18636 + }, + { + "epoch": 3.4933458294283035, + "grad_norm": 55046.328125, + "learning_rate": 1.1427326262813397e-06, + "loss": 1.9732, + "step": 18637 + }, + { + "epoch": 3.4935332708528586, + "grad_norm": 53720.2109375, + "learning_rate": 1.1410628557535608e-06, + "loss": 2.0258, + "step": 18638 + }, + { + "epoch": 3.493720712277413, + "grad_norm": 54275.58984375, + "learning_rate": 1.1393942919781742e-06, + "loss": 2.0238, + "step": 18639 + }, + { + "epoch": 3.4939081537019683, + "grad_norm": 59168.69921875, + "learning_rate": 1.1377269349964026e-06, + "loss": 2.1343, + "step": 18640 + }, + { + "epoch": 3.494095595126523, + "grad_norm": 56692.3515625, + "learning_rate": 1.1360607848494297e-06, + "loss": 2.1537, + "step": 18641 + }, + { + "epoch": 3.494283036551078, + "grad_norm": 53481.84375, + "learning_rate": 1.1343958415784006e-06, + "loss": 2.0319, + "step": 18642 + }, + { + "epoch": 3.4944704779756326, + "grad_norm": 57177.9609375, + "learning_rate": 1.1327321052244323e-06, + "loss": 2.0199, + "step": 18643 + }, + { + "epoch": 3.494657919400187, + "grad_norm": 57362.78125, + "learning_rate": 1.1310695758286304e-06, + "loss": 2.0504, + "step": 18644 + }, + { + "epoch": 3.4948453608247423, + "grad_norm": 56691.859375, + "learning_rate": 1.1294082534320515e-06, + "loss": 2.0257, + "step": 18645 + }, + { + "epoch": 3.495032802249297, + "grad_norm": 57074.77734375, + "learning_rate": 1.1277481380757238e-06, + "loss": 2.1303, + "step": 18646 + }, + { + "epoch": 3.495220243673852, + "grad_norm": 53480.98046875, + "learning_rate": 1.1260892298006476e-06, + "loss": 2.0258, + "step": 18647 + }, + { + "epoch": 3.4954076850984066, + "grad_norm": 53765.5703125, + "learning_rate": 1.1244315286478125e-06, + "loss": 2.0846, + "step": 18648 + }, + { + "epoch": 3.4955951265229617, + "grad_norm": 50246.73046875, + "learning_rate": 1.1227750346581466e-06, + "loss": 2.0601, + "step": 18649 + }, + { + "epoch": 3.4957825679475163, + "grad_norm": 55414.453125, + "learning_rate": 1.1211197478725622e-06, + "loss": 2.0606, + "step": 18650 + }, + { + "epoch": 3.4959700093720714, + "grad_norm": 57245.89453125, + "learning_rate": 1.1194656683319537e-06, + "loss": 2.0511, + "step": 18651 + }, + { + "epoch": 3.496157450796626, + "grad_norm": 52921.90625, + "learning_rate": 1.1178127960771668e-06, + "loss": 2.1145, + "step": 18652 + }, + { + "epoch": 3.496344892221181, + "grad_norm": 53480.3359375, + "learning_rate": 1.116161131149024e-06, + "loss": 2.088, + "step": 18653 + }, + { + "epoch": 3.4965323336457357, + "grad_norm": 52888.015625, + "learning_rate": 1.114510673588326e-06, + "loss": 2.07, + "step": 18654 + }, + { + "epoch": 3.4967197750702903, + "grad_norm": 55594.85546875, + "learning_rate": 1.1128614234358348e-06, + "loss": 2.0018, + "step": 18655 + }, + { + "epoch": 3.4969072164948454, + "grad_norm": 53671.03125, + "learning_rate": 1.111213380732279e-06, + "loss": 2.055, + "step": 18656 + }, + { + "epoch": 3.4970946579194, + "grad_norm": 60933.76953125, + "learning_rate": 1.1095665455183702e-06, + "loss": 2.1319, + "step": 18657 + }, + { + "epoch": 3.497282099343955, + "grad_norm": 52404.109375, + "learning_rate": 1.1079209178347816e-06, + "loss": 2.0709, + "step": 18658 + }, + { + "epoch": 3.4974695407685097, + "grad_norm": 54986.98046875, + "learning_rate": 1.1062764977221528e-06, + "loss": 2.0721, + "step": 18659 + }, + { + "epoch": 3.4976569821930648, + "grad_norm": 55670.625, + "learning_rate": 1.1046332852211073e-06, + "loss": 2.1436, + "step": 18660 + }, + { + "epoch": 3.4978444236176194, + "grad_norm": 58339.85546875, + "learning_rate": 1.1029912803722231e-06, + "loss": 2.0708, + "step": 18661 + }, + { + "epoch": 3.4980318650421744, + "grad_norm": 57835.42578125, + "learning_rate": 1.1013504832160571e-06, + "loss": 2.0085, + "step": 18662 + }, + { + "epoch": 3.498219306466729, + "grad_norm": 54513.7265625, + "learning_rate": 1.0997108937931378e-06, + "loss": 2.0998, + "step": 18663 + }, + { + "epoch": 3.498406747891284, + "grad_norm": 52890.83203125, + "learning_rate": 1.0980725121439606e-06, + "loss": 2.0523, + "step": 18664 + }, + { + "epoch": 3.4985941893158388, + "grad_norm": 54864.56640625, + "learning_rate": 1.0964353383089875e-06, + "loss": 2.0611, + "step": 18665 + }, + { + "epoch": 3.4987816307403934, + "grad_norm": 60540.53125, + "learning_rate": 1.0947993723286532e-06, + "loss": 2.0435, + "step": 18666 + }, + { + "epoch": 3.4989690721649485, + "grad_norm": 57066.5625, + "learning_rate": 1.093164614243375e-06, + "loss": 2.0393, + "step": 18667 + }, + { + "epoch": 3.4991565135895035, + "grad_norm": 53582.5859375, + "learning_rate": 1.091531064093526e-06, + "loss": 2.0945, + "step": 18668 + }, + { + "epoch": 3.499343955014058, + "grad_norm": 65668.96875, + "learning_rate": 1.0898987219194412e-06, + "loss": 2.0891, + "step": 18669 + }, + { + "epoch": 3.4995313964386128, + "grad_norm": 57528.66796875, + "learning_rate": 1.088267587761449e-06, + "loss": 2.0268, + "step": 18670 + }, + { + "epoch": 3.499718837863168, + "grad_norm": 50641.6640625, + "learning_rate": 1.0866376616598283e-06, + "loss": 2.0612, + "step": 18671 + }, + { + "epoch": 3.4999062792877225, + "grad_norm": 54184.9453125, + "learning_rate": 1.0850089436548417e-06, + "loss": 2.0684, + "step": 18672 + }, + { + "epoch": 3.5000937207122775, + "grad_norm": 55998.60546875, + "learning_rate": 1.083381433786712e-06, + "loss": 2.13, + "step": 18673 + }, + { + "epoch": 3.500281162136832, + "grad_norm": 55672.69140625, + "learning_rate": 1.0817551320956466e-06, + "loss": 2.1158, + "step": 18674 + }, + { + "epoch": 3.5004686035613872, + "grad_norm": 54711.34375, + "learning_rate": 1.0801300386218015e-06, + "loss": 2.0715, + "step": 18675 + }, + { + "epoch": 3.500656044985942, + "grad_norm": 53961.73828125, + "learning_rate": 1.0785061534053177e-06, + "loss": 2.1018, + "step": 18676 + }, + { + "epoch": 3.5008434864104965, + "grad_norm": 56126.7734375, + "learning_rate": 1.0768834764863123e-06, + "loss": 2.1089, + "step": 18677 + }, + { + "epoch": 3.5010309278350515, + "grad_norm": 50558.75, + "learning_rate": 1.075262007904848e-06, + "loss": 2.1234, + "step": 18678 + }, + { + "epoch": 3.5012183692596066, + "grad_norm": 53529.1796875, + "learning_rate": 1.073641747700982e-06, + "loss": 2.0771, + "step": 18679 + }, + { + "epoch": 3.5014058106841612, + "grad_norm": 54591.15625, + "learning_rate": 1.0720226959147263e-06, + "loss": 2.0936, + "step": 18680 + }, + { + "epoch": 3.501593252108716, + "grad_norm": 56616.4140625, + "learning_rate": 1.0704048525860822e-06, + "loss": 2.0793, + "step": 18681 + }, + { + "epoch": 3.501780693533271, + "grad_norm": 57776.2109375, + "learning_rate": 1.0687882177549958e-06, + "loss": 2.0977, + "step": 18682 + }, + { + "epoch": 3.5019681349578256, + "grad_norm": 52269.32421875, + "learning_rate": 1.067172791461396e-06, + "loss": 2.0381, + "step": 18683 + }, + { + "epoch": 3.5021555763823806, + "grad_norm": 56599.6484375, + "learning_rate": 1.0655585737451956e-06, + "loss": 2.099, + "step": 18684 + }, + { + "epoch": 3.5023430178069352, + "grad_norm": 56356.42578125, + "learning_rate": 1.0639455646462514e-06, + "loss": 2.126, + "step": 18685 + }, + { + "epoch": 3.5025304592314903, + "grad_norm": 55808.37890625, + "learning_rate": 1.0623337642043984e-06, + "loss": 2.1128, + "step": 18686 + }, + { + "epoch": 3.502717900656045, + "grad_norm": 57466.5625, + "learning_rate": 1.0607231724594547e-06, + "loss": 2.077, + "step": 18687 + }, + { + "epoch": 3.5029053420805996, + "grad_norm": 56247.54296875, + "learning_rate": 1.0591137894511993e-06, + "loss": 2.0601, + "step": 18688 + }, + { + "epoch": 3.5030927835051546, + "grad_norm": 56939.7734375, + "learning_rate": 1.057505615219384e-06, + "loss": 2.0777, + "step": 18689 + }, + { + "epoch": 3.5032802249297097, + "grad_norm": 59081.01171875, + "learning_rate": 1.0558986498037216e-06, + "loss": 2.0413, + "step": 18690 + }, + { + "epoch": 3.5034676663542643, + "grad_norm": 58353.70703125, + "learning_rate": 1.0542928932439078e-06, + "loss": 2.0688, + "step": 18691 + }, + { + "epoch": 3.503655107778819, + "grad_norm": 52420.69140625, + "learning_rate": 1.0526883455795944e-06, + "loss": 2.0663, + "step": 18692 + }, + { + "epoch": 3.503842549203374, + "grad_norm": 55165.1328125, + "learning_rate": 1.0510850068504274e-06, + "loss": 2.1371, + "step": 18693 + }, + { + "epoch": 3.5040299906279286, + "grad_norm": 57203.2734375, + "learning_rate": 1.0494828770959918e-06, + "loss": 2.0139, + "step": 18694 + }, + { + "epoch": 3.5042174320524837, + "grad_norm": 54764.8203125, + "learning_rate": 1.0478819563558618e-06, + "loss": 2.0817, + "step": 18695 + }, + { + "epoch": 3.5044048734770383, + "grad_norm": 53764.65234375, + "learning_rate": 1.046282244669583e-06, + "loss": 2.0455, + "step": 18696 + }, + { + "epoch": 3.5045923149015934, + "grad_norm": 55194.31640625, + "learning_rate": 1.044683742076663e-06, + "loss": 2.0971, + "step": 18697 + }, + { + "epoch": 3.504779756326148, + "grad_norm": 56907.23828125, + "learning_rate": 1.043086448616587e-06, + "loss": 2.0527, + "step": 18698 + }, + { + "epoch": 3.5049671977507026, + "grad_norm": 56836.50390625, + "learning_rate": 1.0414903643287954e-06, + "loss": 2.0938, + "step": 18699 + }, + { + "epoch": 3.5051546391752577, + "grad_norm": 51676.453125, + "learning_rate": 1.0398954892527235e-06, + "loss": 2.0253, + "step": 18700 + }, + { + "epoch": 3.505342080599813, + "grad_norm": 58945.36328125, + "learning_rate": 1.0383018234277508e-06, + "loss": 2.0939, + "step": 18701 + }, + { + "epoch": 3.5055295220243674, + "grad_norm": 50298.6796875, + "learning_rate": 1.0367093668932403e-06, + "loss": 2.0334, + "step": 18702 + }, + { + "epoch": 3.505716963448922, + "grad_norm": 54060.2109375, + "learning_rate": 1.0351181196885273e-06, + "loss": 2.0411, + "step": 18703 + }, + { + "epoch": 3.505904404873477, + "grad_norm": 52523.66015625, + "learning_rate": 1.0335280818529192e-06, + "loss": 2.1051, + "step": 18704 + }, + { + "epoch": 3.5060918462980317, + "grad_norm": 51727.1015625, + "learning_rate": 1.031939253425679e-06, + "loss": 2.055, + "step": 18705 + }, + { + "epoch": 3.506279287722587, + "grad_norm": 60068.93359375, + "learning_rate": 1.0303516344460473e-06, + "loss": 2.0862, + "step": 18706 + }, + { + "epoch": 3.5064667291471414, + "grad_norm": 59272.94140625, + "learning_rate": 1.028765224953243e-06, + "loss": 2.0522, + "step": 18707 + }, + { + "epoch": 3.5066541705716965, + "grad_norm": 57972.37109375, + "learning_rate": 1.0271800249864462e-06, + "loss": 2.1044, + "step": 18708 + }, + { + "epoch": 3.506841611996251, + "grad_norm": 57359.17578125, + "learning_rate": 1.025596034584808e-06, + "loss": 2.0702, + "step": 18709 + }, + { + "epoch": 3.5070290534208057, + "grad_norm": 55174.68359375, + "learning_rate": 1.024013253787448e-06, + "loss": 2.0548, + "step": 18710 + }, + { + "epoch": 3.507216494845361, + "grad_norm": 55309.828125, + "learning_rate": 1.0224316826334679e-06, + "loss": 2.0945, + "step": 18711 + }, + { + "epoch": 3.507403936269916, + "grad_norm": 53042.546875, + "learning_rate": 1.0208513211619196e-06, + "loss": 2.0733, + "step": 18712 + }, + { + "epoch": 3.5075913776944705, + "grad_norm": 57392.5625, + "learning_rate": 1.0192721694118447e-06, + "loss": 2.0028, + "step": 18713 + }, + { + "epoch": 3.507778819119025, + "grad_norm": 65865.7890625, + "learning_rate": 1.0176942274222446e-06, + "loss": 2.0395, + "step": 18714 + }, + { + "epoch": 3.50796626054358, + "grad_norm": 55878.578125, + "learning_rate": 1.0161174952320884e-06, + "loss": 2.0531, + "step": 18715 + }, + { + "epoch": 3.508153701968135, + "grad_norm": 56381.36328125, + "learning_rate": 1.0145419728803173e-06, + "loss": 2.1357, + "step": 18716 + }, + { + "epoch": 3.50834114339269, + "grad_norm": 51729.30078125, + "learning_rate": 1.0129676604058557e-06, + "loss": 2.0217, + "step": 18717 + }, + { + "epoch": 3.5085285848172445, + "grad_norm": 58822.8671875, + "learning_rate": 1.0113945578475725e-06, + "loss": 2.1846, + "step": 18718 + }, + { + "epoch": 3.5087160262417996, + "grad_norm": 54030.7421875, + "learning_rate": 1.0098226652443366e-06, + "loss": 2.0826, + "step": 18719 + }, + { + "epoch": 3.508903467666354, + "grad_norm": 58578.375, + "learning_rate": 1.0082519826349611e-06, + "loss": 2.0627, + "step": 18720 + }, + { + "epoch": 3.509090909090909, + "grad_norm": 60014.484375, + "learning_rate": 1.0066825100582434e-06, + "loss": 2.0619, + "step": 18721 + }, + { + "epoch": 3.509278350515464, + "grad_norm": 58841.15625, + "learning_rate": 1.0051142475529463e-06, + "loss": 2.1202, + "step": 18722 + }, + { + "epoch": 3.509465791940019, + "grad_norm": 57535.578125, + "learning_rate": 1.0035471951578056e-06, + "loss": 2.146, + "step": 18723 + }, + { + "epoch": 3.5096532333645736, + "grad_norm": 53782.85546875, + "learning_rate": 1.0019813529115241e-06, + "loss": 2.0321, + "step": 18724 + }, + { + "epoch": 3.509840674789128, + "grad_norm": 53124.265625, + "learning_rate": 1.000416720852776e-06, + "loss": 2.0611, + "step": 18725 + }, + { + "epoch": 3.5100281162136833, + "grad_norm": 55611.7109375, + "learning_rate": 9.988532990202083e-07, + "loss": 2.1186, + "step": 18726 + }, + { + "epoch": 3.510215557638238, + "grad_norm": 63539.44921875, + "learning_rate": 9.972910874524233e-07, + "loss": 1.9493, + "step": 18727 + }, + { + "epoch": 3.510402999062793, + "grad_norm": 51320.98046875, + "learning_rate": 9.957300861880292e-07, + "loss": 2.1212, + "step": 18728 + }, + { + "epoch": 3.5105904404873476, + "grad_norm": 53625.875, + "learning_rate": 9.941702952655563e-07, + "loss": 2.0446, + "step": 18729 + }, + { + "epoch": 3.5107778819119027, + "grad_norm": 54045.2109375, + "learning_rate": 9.926117147235459e-07, + "loss": 2.0517, + "step": 18730 + }, + { + "epoch": 3.5109653233364573, + "grad_norm": 55543.6875, + "learning_rate": 9.910543446004895e-07, + "loss": 2.0538, + "step": 18731 + }, + { + "epoch": 3.5111527647610123, + "grad_norm": 54960.43359375, + "learning_rate": 9.894981849348395e-07, + "loss": 2.0543, + "step": 18732 + }, + { + "epoch": 3.511340206185567, + "grad_norm": 50784.48046875, + "learning_rate": 9.879432357650488e-07, + "loss": 2.054, + "step": 18733 + }, + { + "epoch": 3.511527647610122, + "grad_norm": 58431.359375, + "learning_rate": 9.863894971295195e-07, + "loss": 2.0771, + "step": 18734 + }, + { + "epoch": 3.5117150890346767, + "grad_norm": 54634.2265625, + "learning_rate": 9.84836969066616e-07, + "loss": 2.0953, + "step": 18735 + }, + { + "epoch": 3.5119025304592313, + "grad_norm": 49818.1171875, + "learning_rate": 9.83285651614685e-07, + "loss": 1.9874, + "step": 18736 + }, + { + "epoch": 3.5120899718837864, + "grad_norm": 60016.87109375, + "learning_rate": 9.817355448120569e-07, + "loss": 2.0524, + "step": 18737 + }, + { + "epoch": 3.512277413308341, + "grad_norm": 55270.4375, + "learning_rate": 9.80186648697007e-07, + "loss": 2.1149, + "step": 18738 + }, + { + "epoch": 3.512464854732896, + "grad_norm": 55954.69140625, + "learning_rate": 9.786389633077875e-07, + "loss": 2.0374, + "step": 18739 + }, + { + "epoch": 3.5126522961574507, + "grad_norm": 57611.7109375, + "learning_rate": 9.770924886826238e-07, + "loss": 2.0939, + "step": 18740 + }, + { + "epoch": 3.5128397375820057, + "grad_norm": 50605.91015625, + "learning_rate": 9.755472248597298e-07, + "loss": 2.0502, + "step": 18741 + }, + { + "epoch": 3.5130271790065604, + "grad_norm": 57631.38671875, + "learning_rate": 9.740031718772524e-07, + "loss": 2.0334, + "step": 18742 + }, + { + "epoch": 3.5132146204311154, + "grad_norm": 55795.43359375, + "learning_rate": 9.724603297733282e-07, + "loss": 2.0546, + "step": 18743 + }, + { + "epoch": 3.51340206185567, + "grad_norm": 57134.30859375, + "learning_rate": 9.709186985860762e-07, + "loss": 2.0565, + "step": 18744 + }, + { + "epoch": 3.513589503280225, + "grad_norm": 59388.30859375, + "learning_rate": 9.69378278353561e-07, + "loss": 2.0188, + "step": 18745 + }, + { + "epoch": 3.5137769447047797, + "grad_norm": 56307.18359375, + "learning_rate": 9.678390691138294e-07, + "loss": 2.0537, + "step": 18746 + }, + { + "epoch": 3.5139643861293344, + "grad_norm": 57855.53125, + "learning_rate": 9.663010709049124e-07, + "loss": 2.0327, + "step": 18747 + }, + { + "epoch": 3.5141518275538894, + "grad_norm": 52804.86328125, + "learning_rate": 9.647642837647742e-07, + "loss": 2.0838, + "step": 18748 + }, + { + "epoch": 3.5143392689784445, + "grad_norm": 54179.4921875, + "learning_rate": 9.632287077313894e-07, + "loss": 2.0825, + "step": 18749 + }, + { + "epoch": 3.514526710402999, + "grad_norm": 57565.64453125, + "learning_rate": 9.616943428426784e-07, + "loss": 2.1097, + "step": 18750 + }, + { + "epoch": 3.5147141518275538, + "grad_norm": 53161.640625, + "learning_rate": 9.601611891365382e-07, + "loss": 2.0609, + "step": 18751 + }, + { + "epoch": 3.514901593252109, + "grad_norm": 60237.9375, + "learning_rate": 9.586292466508328e-07, + "loss": 2.0957, + "step": 18752 + }, + { + "epoch": 3.5150890346766634, + "grad_norm": 54918.51171875, + "learning_rate": 9.570985154234046e-07, + "loss": 2.0691, + "step": 18753 + }, + { + "epoch": 3.5152764761012185, + "grad_norm": 58291.41015625, + "learning_rate": 9.555689954920566e-07, + "loss": 2.0284, + "step": 18754 + }, + { + "epoch": 3.515463917525773, + "grad_norm": 54871.37890625, + "learning_rate": 9.540406868945696e-07, + "loss": 2.0977, + "step": 18755 + }, + { + "epoch": 3.515651358950328, + "grad_norm": 53060.36328125, + "learning_rate": 9.525135896686854e-07, + "loss": 2.059, + "step": 18756 + }, + { + "epoch": 3.515838800374883, + "grad_norm": 54148.68359375, + "learning_rate": 9.509877038521298e-07, + "loss": 2.0909, + "step": 18757 + }, + { + "epoch": 3.5160262417994375, + "grad_norm": 52317.74609375, + "learning_rate": 9.49463029482578e-07, + "loss": 2.0684, + "step": 18758 + }, + { + "epoch": 3.5162136832239925, + "grad_norm": 54235.04296875, + "learning_rate": 9.479395665976942e-07, + "loss": 2.0778, + "step": 18759 + }, + { + "epoch": 3.5164011246485476, + "grad_norm": 55663.93359375, + "learning_rate": 9.464173152351096e-07, + "loss": 2.0525, + "step": 18760 + }, + { + "epoch": 3.516588566073102, + "grad_norm": 54127.453125, + "learning_rate": 9.44896275432422e-07, + "loss": 2.0116, + "step": 18761 + }, + { + "epoch": 3.516776007497657, + "grad_norm": 60946.53515625, + "learning_rate": 9.4337644722719e-07, + "loss": 2.0547, + "step": 18762 + }, + { + "epoch": 3.516963448922212, + "grad_norm": 55494.76953125, + "learning_rate": 9.418578306569558e-07, + "loss": 2.09, + "step": 18763 + }, + { + "epoch": 3.5171508903467665, + "grad_norm": 52235.86328125, + "learning_rate": 9.403404257592341e-07, + "loss": 2.1013, + "step": 18764 + }, + { + "epoch": 3.5173383317713216, + "grad_norm": 55447.48046875, + "learning_rate": 9.388242325714946e-07, + "loss": 2.0795, + "step": 18765 + }, + { + "epoch": 3.5175257731958762, + "grad_norm": 55735.7890625, + "learning_rate": 9.373092511311854e-07, + "loss": 2.0688, + "step": 18766 + }, + { + "epoch": 3.5177132146204313, + "grad_norm": 57517.05859375, + "learning_rate": 9.357954814757319e-07, + "loss": 1.9878, + "step": 18767 + }, + { + "epoch": 3.517900656044986, + "grad_norm": 57131.2734375, + "learning_rate": 9.342829236425155e-07, + "loss": 2.0898, + "step": 18768 + }, + { + "epoch": 3.5180880974695405, + "grad_norm": 50193.37109375, + "learning_rate": 9.327715776688894e-07, + "loss": 2.0212, + "step": 18769 + }, + { + "epoch": 3.5182755388940956, + "grad_norm": 50706.97265625, + "learning_rate": 9.312614435921963e-07, + "loss": 2.1048, + "step": 18770 + }, + { + "epoch": 3.5184629803186507, + "grad_norm": 56135.8125, + "learning_rate": 9.297525214497282e-07, + "loss": 2.0662, + "step": 18771 + }, + { + "epoch": 3.5186504217432053, + "grad_norm": 55187.234375, + "learning_rate": 9.282448112787446e-07, + "loss": 2.0545, + "step": 18772 + }, + { + "epoch": 3.51883786316776, + "grad_norm": 52452.87890625, + "learning_rate": 9.267383131164986e-07, + "loss": 2.0496, + "step": 18773 + }, + { + "epoch": 3.519025304592315, + "grad_norm": 52938.73828125, + "learning_rate": 9.252330270001886e-07, + "loss": 2.069, + "step": 18774 + }, + { + "epoch": 3.5192127460168696, + "grad_norm": 54829.2265625, + "learning_rate": 9.237289529670012e-07, + "loss": 2.0873, + "step": 18775 + }, + { + "epoch": 3.5194001874414247, + "grad_norm": 54956.01171875, + "learning_rate": 9.222260910540737e-07, + "loss": 2.0633, + "step": 18776 + }, + { + "epoch": 3.5195876288659793, + "grad_norm": 57949.60546875, + "learning_rate": 9.207244412985372e-07, + "loss": 2.0312, + "step": 18777 + }, + { + "epoch": 3.5197750702905344, + "grad_norm": 61072.39453125, + "learning_rate": 9.192240037374677e-07, + "loss": 2.0081, + "step": 18778 + }, + { + "epoch": 3.519962511715089, + "grad_norm": 58409.90234375, + "learning_rate": 9.177247784079357e-07, + "loss": 2.0739, + "step": 18779 + }, + { + "epoch": 3.5201499531396436, + "grad_norm": 59595.46875, + "learning_rate": 9.162267653469614e-07, + "loss": 2.0549, + "step": 18780 + }, + { + "epoch": 3.5203373945641987, + "grad_norm": 58710.82421875, + "learning_rate": 9.147299645915541e-07, + "loss": 1.9843, + "step": 18781 + }, + { + "epoch": 3.5205248359887538, + "grad_norm": 57287.046875, + "learning_rate": 9.132343761786788e-07, + "loss": 2.0835, + "step": 18782 + }, + { + "epoch": 3.5207122774133084, + "grad_norm": 52169.7265625, + "learning_rate": 9.117400001452614e-07, + "loss": 2.0534, + "step": 18783 + }, + { + "epoch": 3.520899718837863, + "grad_norm": 59180.95703125, + "learning_rate": 9.102468365282335e-07, + "loss": 2.0272, + "step": 18784 + }, + { + "epoch": 3.521087160262418, + "grad_norm": 53004.14453125, + "learning_rate": 9.087548853644545e-07, + "loss": 2.0966, + "step": 18785 + }, + { + "epoch": 3.5212746016869727, + "grad_norm": 57658.97265625, + "learning_rate": 9.072641466907894e-07, + "loss": 2.1545, + "step": 18786 + }, + { + "epoch": 3.5214620431115278, + "grad_norm": 55356.87890625, + "learning_rate": 9.057746205440476e-07, + "loss": 2.0488, + "step": 18787 + }, + { + "epoch": 3.5216494845360824, + "grad_norm": 58329.76953125, + "learning_rate": 9.042863069610163e-07, + "loss": 2.1042, + "step": 18788 + }, + { + "epoch": 3.5218369259606375, + "grad_norm": 55264.171875, + "learning_rate": 9.027992059784606e-07, + "loss": 2.0529, + "step": 18789 + }, + { + "epoch": 3.522024367385192, + "grad_norm": 52859.7421875, + "learning_rate": 9.013133176331123e-07, + "loss": 2.1511, + "step": 18790 + }, + { + "epoch": 3.5222118088097467, + "grad_norm": 55171.24609375, + "learning_rate": 8.998286419616697e-07, + "loss": 1.9853, + "step": 18791 + }, + { + "epoch": 3.522399250234302, + "grad_norm": 53193.01171875, + "learning_rate": 8.983451790007979e-07, + "loss": 2.107, + "step": 18792 + }, + { + "epoch": 3.522586691658857, + "grad_norm": 53604.6171875, + "learning_rate": 8.968629287871344e-07, + "loss": 1.9941, + "step": 18793 + }, + { + "epoch": 3.5227741330834115, + "grad_norm": 57931.6328125, + "learning_rate": 8.953818913573053e-07, + "loss": 2.07, + "step": 18794 + }, + { + "epoch": 3.522961574507966, + "grad_norm": 59330.9375, + "learning_rate": 8.939020667478648e-07, + "loss": 2.0277, + "step": 18795 + }, + { + "epoch": 3.523149015932521, + "grad_norm": 58823.0, + "learning_rate": 8.924234549953837e-07, + "loss": 2.1068, + "step": 18796 + }, + { + "epoch": 3.523336457357076, + "grad_norm": 56883.03515625, + "learning_rate": 8.90946056136377e-07, + "loss": 2.0908, + "step": 18797 + }, + { + "epoch": 3.523523898781631, + "grad_norm": 53890.53125, + "learning_rate": 8.894698702073268e-07, + "loss": 2.1224, + "step": 18798 + }, + { + "epoch": 3.5237113402061855, + "grad_norm": 54526.390625, + "learning_rate": 8.879948972446983e-07, + "loss": 2.1032, + "step": 18799 + }, + { + "epoch": 3.5238987816307406, + "grad_norm": 63655.0859375, + "learning_rate": 8.865211372849236e-07, + "loss": 2.0361, + "step": 18800 + }, + { + "epoch": 3.524086223055295, + "grad_norm": 54326.19140625, + "learning_rate": 8.850485903643957e-07, + "loss": 2.1604, + "step": 18801 + }, + { + "epoch": 3.52427366447985, + "grad_norm": 54418.33203125, + "learning_rate": 8.835772565194911e-07, + "loss": 2.0421, + "step": 18802 + }, + { + "epoch": 3.524461105904405, + "grad_norm": 57474.79296875, + "learning_rate": 8.821071357865473e-07, + "loss": 2.0839, + "step": 18803 + }, + { + "epoch": 3.52464854732896, + "grad_norm": 56455.71875, + "learning_rate": 8.806382282018799e-07, + "loss": 2.0287, + "step": 18804 + }, + { + "epoch": 3.5248359887535146, + "grad_norm": 52797.8125, + "learning_rate": 8.791705338017597e-07, + "loss": 2.0368, + "step": 18805 + }, + { + "epoch": 3.525023430178069, + "grad_norm": 57038.20703125, + "learning_rate": 8.777040526224412e-07, + "loss": 2.0569, + "step": 18806 + }, + { + "epoch": 3.5252108716026243, + "grad_norm": 53463.33203125, + "learning_rate": 8.762387847001452e-07, + "loss": 2.0723, + "step": 18807 + }, + { + "epoch": 3.525398313027179, + "grad_norm": 61684.453125, + "learning_rate": 8.747747300710651e-07, + "loss": 2.0169, + "step": 18808 + }, + { + "epoch": 3.525585754451734, + "grad_norm": 58522.33984375, + "learning_rate": 8.733118887713498e-07, + "loss": 2.0974, + "step": 18809 + }, + { + "epoch": 3.5257731958762886, + "grad_norm": 55656.3671875, + "learning_rate": 8.718502608371426e-07, + "loss": 2.0667, + "step": 18810 + }, + { + "epoch": 3.5259606373008436, + "grad_norm": 57750.43359375, + "learning_rate": 8.703898463045368e-07, + "loss": 2.1346, + "step": 18811 + }, + { + "epoch": 3.5261480787253983, + "grad_norm": 52252.921875, + "learning_rate": 8.689306452096035e-07, + "loss": 2.1453, + "step": 18812 + }, + { + "epoch": 3.526335520149953, + "grad_norm": 57229.25, + "learning_rate": 8.674726575883862e-07, + "loss": 2.0464, + "step": 18813 + }, + { + "epoch": 3.526522961574508, + "grad_norm": 56765.48828125, + "learning_rate": 8.66015883476895e-07, + "loss": 2.0924, + "step": 18814 + }, + { + "epoch": 3.526710402999063, + "grad_norm": 57314.60546875, + "learning_rate": 8.64560322911101e-07, + "loss": 1.9818, + "step": 18815 + }, + { + "epoch": 3.5268978444236176, + "grad_norm": 56528.63671875, + "learning_rate": 8.631059759269644e-07, + "loss": 2.0952, + "step": 18816 + }, + { + "epoch": 3.5270852858481723, + "grad_norm": 55293.03515625, + "learning_rate": 8.61652842560412e-07, + "loss": 2.0787, + "step": 18817 + }, + { + "epoch": 3.5272727272727273, + "grad_norm": 64382.3984375, + "learning_rate": 8.602009228473151e-07, + "loss": 2.1062, + "step": 18818 + }, + { + "epoch": 3.527460168697282, + "grad_norm": 53211.55859375, + "learning_rate": 8.58750216823545e-07, + "loss": 2.0468, + "step": 18819 + }, + { + "epoch": 3.527647610121837, + "grad_norm": 52865.44140625, + "learning_rate": 8.573007245249343e-07, + "loss": 2.1018, + "step": 18820 + }, + { + "epoch": 3.5278350515463917, + "grad_norm": 57959.0703125, + "learning_rate": 8.558524459872874e-07, + "loss": 2.0602, + "step": 18821 + }, + { + "epoch": 3.5280224929709467, + "grad_norm": 60858.40625, + "learning_rate": 8.544053812463593e-07, + "loss": 2.0963, + "step": 18822 + }, + { + "epoch": 3.5282099343955013, + "grad_norm": 56114.6484375, + "learning_rate": 8.529595303379045e-07, + "loss": 2.0563, + "step": 18823 + }, + { + "epoch": 3.528397375820056, + "grad_norm": 54695.20703125, + "learning_rate": 8.515148932976336e-07, + "loss": 2.0196, + "step": 18824 + }, + { + "epoch": 3.528584817244611, + "grad_norm": 59513.1640625, + "learning_rate": 8.500714701612123e-07, + "loss": 2.0548, + "step": 18825 + }, + { + "epoch": 3.528772258669166, + "grad_norm": 60535.03515625, + "learning_rate": 8.486292609643065e-07, + "loss": 2.0346, + "step": 18826 + }, + { + "epoch": 3.5289597000937207, + "grad_norm": 56537.09765625, + "learning_rate": 8.471882657425379e-07, + "loss": 2.095, + "step": 18827 + }, + { + "epoch": 3.5291471415182754, + "grad_norm": 58121.203125, + "learning_rate": 8.45748484531489e-07, + "loss": 2.0827, + "step": 18828 + }, + { + "epoch": 3.5293345829428304, + "grad_norm": 52788.65625, + "learning_rate": 8.443099173667146e-07, + "loss": 2.0634, + "step": 18829 + }, + { + "epoch": 3.529522024367385, + "grad_norm": 59542.5078125, + "learning_rate": 8.428725642837643e-07, + "loss": 2.0498, + "step": 18830 + }, + { + "epoch": 3.52970946579194, + "grad_norm": 60646.91015625, + "learning_rate": 8.414364253181262e-07, + "loss": 2.1241, + "step": 18831 + }, + { + "epoch": 3.5298969072164947, + "grad_norm": 53946.0078125, + "learning_rate": 8.400015005052719e-07, + "loss": 2.0294, + "step": 18832 + }, + { + "epoch": 3.53008434864105, + "grad_norm": 56296.14453125, + "learning_rate": 8.385677898806399e-07, + "loss": 2.0669, + "step": 18833 + }, + { + "epoch": 3.5302717900656044, + "grad_norm": 58981.12890625, + "learning_rate": 8.371352934796517e-07, + "loss": 2.1274, + "step": 18834 + }, + { + "epoch": 3.530459231490159, + "grad_norm": 55532.46875, + "learning_rate": 8.35704011337679e-07, + "loss": 2.0884, + "step": 18835 + }, + { + "epoch": 3.530646672914714, + "grad_norm": 57592.7265625, + "learning_rate": 8.342739434900715e-07, + "loss": 2.0404, + "step": 18836 + }, + { + "epoch": 3.530834114339269, + "grad_norm": 56199.4765625, + "learning_rate": 8.328450899721563e-07, + "loss": 1.9808, + "step": 18837 + }, + { + "epoch": 3.531021555763824, + "grad_norm": 60463.5234375, + "learning_rate": 8.314174508192219e-07, + "loss": 2.102, + "step": 18838 + }, + { + "epoch": 3.5312089971883784, + "grad_norm": 56923.55078125, + "learning_rate": 8.299910260665234e-07, + "loss": 2.0507, + "step": 18839 + }, + { + "epoch": 3.5313964386129335, + "grad_norm": 54671.02734375, + "learning_rate": 8.285658157493048e-07, + "loss": 2.0542, + "step": 18840 + }, + { + "epoch": 3.531583880037488, + "grad_norm": 55959.7578125, + "learning_rate": 8.271418199027491e-07, + "loss": 2.0486, + "step": 18841 + }, + { + "epoch": 3.531771321462043, + "grad_norm": 53828.671875, + "learning_rate": 8.25719038562045e-07, + "loss": 2.1301, + "step": 18842 + }, + { + "epoch": 3.531958762886598, + "grad_norm": 56269.7421875, + "learning_rate": 8.242974717623197e-07, + "loss": 2.0161, + "step": 18843 + }, + { + "epoch": 3.532146204311153, + "grad_norm": 58116.6953125, + "learning_rate": 8.228771195386953e-07, + "loss": 2.0674, + "step": 18844 + }, + { + "epoch": 3.5323336457357075, + "grad_norm": 60792.5234375, + "learning_rate": 8.21457981926238e-07, + "loss": 2.0355, + "step": 18845 + }, + { + "epoch": 3.532521087160262, + "grad_norm": 56874.93359375, + "learning_rate": 8.200400589600199e-07, + "loss": 1.9898, + "step": 18846 + }, + { + "epoch": 3.532708528584817, + "grad_norm": 54427.73828125, + "learning_rate": 8.186233506750407e-07, + "loss": 2.0543, + "step": 18847 + }, + { + "epoch": 3.5328959700093723, + "grad_norm": 56519.43359375, + "learning_rate": 8.172078571063002e-07, + "loss": 2.0383, + "step": 18848 + }, + { + "epoch": 3.533083411433927, + "grad_norm": 53250.04296875, + "learning_rate": 8.157935782887593e-07, + "loss": 2.0687, + "step": 18849 + }, + { + "epoch": 3.5332708528584815, + "grad_norm": 55183.32421875, + "learning_rate": 8.143805142573568e-07, + "loss": 2.0739, + "step": 18850 + }, + { + "epoch": 3.5334582942830366, + "grad_norm": 57181.60546875, + "learning_rate": 8.129686650469815e-07, + "loss": 1.9932, + "step": 18851 + }, + { + "epoch": 3.533645735707591, + "grad_norm": 53209.8359375, + "learning_rate": 8.115580306924997e-07, + "loss": 2.0054, + "step": 18852 + }, + { + "epoch": 3.5338331771321463, + "grad_norm": 56778.78515625, + "learning_rate": 8.101486112287726e-07, + "loss": 1.9586, + "step": 18853 + }, + { + "epoch": 3.534020618556701, + "grad_norm": 56806.74609375, + "learning_rate": 8.087404066905946e-07, + "loss": 2.0685, + "step": 18854 + }, + { + "epoch": 3.534208059981256, + "grad_norm": 58584.18359375, + "learning_rate": 8.07333417112749e-07, + "loss": 2.0776, + "step": 18855 + }, + { + "epoch": 3.5343955014058106, + "grad_norm": 57466.89453125, + "learning_rate": 8.059276425299855e-07, + "loss": 2.0507, + "step": 18856 + }, + { + "epoch": 3.5345829428303657, + "grad_norm": 54848.640625, + "learning_rate": 8.045230829770434e-07, + "loss": 2.0699, + "step": 18857 + }, + { + "epoch": 3.5347703842549203, + "grad_norm": 60171.29296875, + "learning_rate": 8.031197384885835e-07, + "loss": 2.0921, + "step": 18858 + }, + { + "epoch": 3.5349578256794754, + "grad_norm": 57733.046875, + "learning_rate": 8.017176090992839e-07, + "loss": 2.0854, + "step": 18859 + }, + { + "epoch": 3.53514526710403, + "grad_norm": 53966.2265625, + "learning_rate": 8.003166948437779e-07, + "loss": 2.0931, + "step": 18860 + }, + { + "epoch": 3.5353327085285846, + "grad_norm": 58060.9140625, + "learning_rate": 7.9891699575666e-07, + "loss": 2.1137, + "step": 18861 + }, + { + "epoch": 3.5355201499531397, + "grad_norm": 59149.67578125, + "learning_rate": 7.97518511872497e-07, + "loss": 2.0337, + "step": 18862 + }, + { + "epoch": 3.5357075913776947, + "grad_norm": 58092.22265625, + "learning_rate": 7.961212432258392e-07, + "loss": 2.0847, + "step": 18863 + }, + { + "epoch": 3.5358950328022494, + "grad_norm": 60308.15625, + "learning_rate": 7.947251898511921e-07, + "loss": 2.1078, + "step": 18864 + }, + { + "epoch": 3.536082474226804, + "grad_norm": 55435.94140625, + "learning_rate": 7.933303517830338e-07, + "loss": 2.0751, + "step": 18865 + }, + { + "epoch": 3.536269915651359, + "grad_norm": 59953.65625, + "learning_rate": 7.919367290558199e-07, + "loss": 2.0819, + "step": 18866 + }, + { + "epoch": 3.5364573570759137, + "grad_norm": 55835.69921875, + "learning_rate": 7.905443217039731e-07, + "loss": 2.021, + "step": 18867 + }, + { + "epoch": 3.5366447985004688, + "grad_norm": 53323.3515625, + "learning_rate": 7.891531297618826e-07, + "loss": 1.9955, + "step": 18868 + }, + { + "epoch": 3.5368322399250234, + "grad_norm": 55738.95703125, + "learning_rate": 7.877631532638985e-07, + "loss": 2.0193, + "step": 18869 + }, + { + "epoch": 3.5370196813495784, + "grad_norm": 59241.4765625, + "learning_rate": 7.863743922443656e-07, + "loss": 1.9804, + "step": 18870 + }, + { + "epoch": 3.537207122774133, + "grad_norm": 53591.3984375, + "learning_rate": 7.849868467375788e-07, + "loss": 2.0491, + "step": 18871 + }, + { + "epoch": 3.5373945641986877, + "grad_norm": 52604.8984375, + "learning_rate": 7.836005167778049e-07, + "loss": 2.1289, + "step": 18872 + }, + { + "epoch": 3.5375820056232428, + "grad_norm": 52943.7734375, + "learning_rate": 7.82215402399289e-07, + "loss": 1.982, + "step": 18873 + }, + { + "epoch": 3.537769447047798, + "grad_norm": 49512.71875, + "learning_rate": 7.808315036362479e-07, + "loss": 2.0816, + "step": 18874 + }, + { + "epoch": 3.5379568884723525, + "grad_norm": 58872.15234375, + "learning_rate": 7.794488205228434e-07, + "loss": 2.0676, + "step": 18875 + }, + { + "epoch": 3.538144329896907, + "grad_norm": 54293.890625, + "learning_rate": 7.780673530932481e-07, + "loss": 2.0783, + "step": 18876 + }, + { + "epoch": 3.538331771321462, + "grad_norm": 65424.4609375, + "learning_rate": 7.766871013815679e-07, + "loss": 2.053, + "step": 18877 + }, + { + "epoch": 3.5385192127460168, + "grad_norm": 54918.3125, + "learning_rate": 7.75308065421898e-07, + "loss": 2.1025, + "step": 18878 + }, + { + "epoch": 3.538706654170572, + "grad_norm": 54705.69140625, + "learning_rate": 7.739302452482999e-07, + "loss": 2.0713, + "step": 18879 + }, + { + "epoch": 3.5388940955951265, + "grad_norm": 53271.46484375, + "learning_rate": 7.72553640894802e-07, + "loss": 2.0709, + "step": 18880 + }, + { + "epoch": 3.5390815370196815, + "grad_norm": 55960.8828125, + "learning_rate": 7.711782523953992e-07, + "loss": 2.1602, + "step": 18881 + }, + { + "epoch": 3.539268978444236, + "grad_norm": 51952.765625, + "learning_rate": 7.6980407978407e-07, + "loss": 2.1053, + "step": 18882 + }, + { + "epoch": 3.539456419868791, + "grad_norm": 57783.30078125, + "learning_rate": 7.684311230947538e-07, + "loss": 2.0897, + "step": 18883 + }, + { + "epoch": 3.539643861293346, + "grad_norm": 56303.59375, + "learning_rate": 7.67059382361357e-07, + "loss": 2.1196, + "step": 18884 + }, + { + "epoch": 3.539831302717901, + "grad_norm": 57271.296875, + "learning_rate": 7.656888576177634e-07, + "loss": 2.101, + "step": 18885 + }, + { + "epoch": 3.5400187441424555, + "grad_norm": 65112.1328125, + "learning_rate": 7.643195488978183e-07, + "loss": 2.0943, + "step": 18886 + }, + { + "epoch": 3.54020618556701, + "grad_norm": 54094.1171875, + "learning_rate": 7.629514562353556e-07, + "loss": 2.0348, + "step": 18887 + }, + { + "epoch": 3.5403936269915652, + "grad_norm": 59467.36328125, + "learning_rate": 7.615845796641485e-07, + "loss": 2.1985, + "step": 18888 + }, + { + "epoch": 3.54058106841612, + "grad_norm": 57352.64453125, + "learning_rate": 7.602189192179587e-07, + "loss": 2.1354, + "step": 18889 + }, + { + "epoch": 3.540768509840675, + "grad_norm": 53301.546875, + "learning_rate": 7.58854474930526e-07, + "loss": 2.0361, + "step": 18890 + }, + { + "epoch": 3.5409559512652296, + "grad_norm": 55534.04296875, + "learning_rate": 7.574912468355511e-07, + "loss": 2.107, + "step": 18891 + }, + { + "epoch": 3.5411433926897846, + "grad_norm": 55819.6328125, + "learning_rate": 7.561292349666904e-07, + "loss": 2.0475, + "step": 18892 + }, + { + "epoch": 3.5413308341143392, + "grad_norm": 53845.3046875, + "learning_rate": 7.547684393575949e-07, + "loss": 2.0339, + "step": 18893 + }, + { + "epoch": 3.541518275538894, + "grad_norm": 59399.37109375, + "learning_rate": 7.534088600418765e-07, + "loss": 2.0833, + "step": 18894 + }, + { + "epoch": 3.541705716963449, + "grad_norm": 55102.99609375, + "learning_rate": 7.520504970531028e-07, + "loss": 2.072, + "step": 18895 + }, + { + "epoch": 3.541893158388004, + "grad_norm": 56162.89453125, + "learning_rate": 7.506933504248303e-07, + "loss": 2.0618, + "step": 18896 + }, + { + "epoch": 3.5420805998125586, + "grad_norm": 51967.25, + "learning_rate": 7.493374201905823e-07, + "loss": 2.1425, + "step": 18897 + }, + { + "epoch": 3.5422680412371133, + "grad_norm": 55714.84765625, + "learning_rate": 7.479827063838485e-07, + "loss": 2.0507, + "step": 18898 + }, + { + "epoch": 3.5424554826616683, + "grad_norm": 52835.640625, + "learning_rate": 7.4662920903808e-07, + "loss": 2.0463, + "step": 18899 + }, + { + "epoch": 3.542642924086223, + "grad_norm": 56592.22265625, + "learning_rate": 7.452769281867167e-07, + "loss": 1.9594, + "step": 18900 + }, + { + "epoch": 3.542830365510778, + "grad_norm": 56908.61328125, + "learning_rate": 7.439258638631486e-07, + "loss": 2.0634, + "step": 18901 + }, + { + "epoch": 3.5430178069353326, + "grad_norm": 51064.6015625, + "learning_rate": 7.425760161007545e-07, + "loss": 2.0601, + "step": 18902 + }, + { + "epoch": 3.5432052483598877, + "grad_norm": 52638.08984375, + "learning_rate": 7.412273849328688e-07, + "loss": 2.0643, + "step": 18903 + }, + { + "epoch": 3.5433926897844423, + "grad_norm": 56540.21484375, + "learning_rate": 7.398799703928039e-07, + "loss": 2.0677, + "step": 18904 + }, + { + "epoch": 3.543580131208997, + "grad_norm": 55106.9609375, + "learning_rate": 7.385337725138331e-07, + "loss": 2.0235, + "step": 18905 + }, + { + "epoch": 3.543767572633552, + "grad_norm": 52670.51953125, + "learning_rate": 7.371887913292075e-07, + "loss": 1.9973, + "step": 18906 + }, + { + "epoch": 3.543955014058107, + "grad_norm": 55376.80078125, + "learning_rate": 7.358450268721561e-07, + "loss": 2.0442, + "step": 18907 + }, + { + "epoch": 3.5441424554826617, + "grad_norm": 56220.3515625, + "learning_rate": 7.345024791758526e-07, + "loss": 2.0878, + "step": 18908 + }, + { + "epoch": 3.5443298969072163, + "grad_norm": 56639.6875, + "learning_rate": 7.331611482734701e-07, + "loss": 2.1018, + "step": 18909 + }, + { + "epoch": 3.5445173383317714, + "grad_norm": 54887.4921875, + "learning_rate": 7.318210341981324e-07, + "loss": 2.1064, + "step": 18910 + }, + { + "epoch": 3.544704779756326, + "grad_norm": 52473.6640625, + "learning_rate": 7.304821369829351e-07, + "loss": 2.051, + "step": 18911 + }, + { + "epoch": 3.544892221180881, + "grad_norm": 54797.71484375, + "learning_rate": 7.291444566609462e-07, + "loss": 2.0804, + "step": 18912 + }, + { + "epoch": 3.5450796626054357, + "grad_norm": 53268.87890625, + "learning_rate": 7.278079932652171e-07, + "loss": 2.0377, + "step": 18913 + }, + { + "epoch": 3.545267104029991, + "grad_norm": 56282.78125, + "learning_rate": 7.264727468287436e-07, + "loss": 2.0139, + "step": 18914 + }, + { + "epoch": 3.5454545454545454, + "grad_norm": 54324.71875, + "learning_rate": 7.251387173845103e-07, + "loss": 2.0678, + "step": 18915 + }, + { + "epoch": 3.5456419868791, + "grad_norm": 55441.90234375, + "learning_rate": 7.238059049654633e-07, + "loss": 2.0503, + "step": 18916 + }, + { + "epoch": 3.545829428303655, + "grad_norm": 55080.94921875, + "learning_rate": 7.224743096045261e-07, + "loss": 2.1088, + "step": 18917 + }, + { + "epoch": 3.54601686972821, + "grad_norm": 53600.6875, + "learning_rate": 7.211439313345836e-07, + "loss": 2.0722, + "step": 18918 + }, + { + "epoch": 3.546204311152765, + "grad_norm": 56445.0546875, + "learning_rate": 7.198147701884871e-07, + "loss": 2.0487, + "step": 18919 + }, + { + "epoch": 3.5463917525773194, + "grad_norm": 52414.19921875, + "learning_rate": 7.184868261990885e-07, + "loss": 2.0477, + "step": 18920 + }, + { + "epoch": 3.5465791940018745, + "grad_norm": 56917.33203125, + "learning_rate": 7.171600993991556e-07, + "loss": 2.1035, + "step": 18921 + }, + { + "epoch": 3.546766635426429, + "grad_norm": 55896.2265625, + "learning_rate": 7.158345898214791e-07, + "loss": 2.0862, + "step": 18922 + }, + { + "epoch": 3.546954076850984, + "grad_norm": 52780.09765625, + "learning_rate": 7.145102974987882e-07, + "loss": 2.0927, + "step": 18923 + }, + { + "epoch": 3.547141518275539, + "grad_norm": 52452.8359375, + "learning_rate": 7.131872224637959e-07, + "loss": 2.0994, + "step": 18924 + }, + { + "epoch": 3.547328959700094, + "grad_norm": 59721.69140625, + "learning_rate": 7.118653647491757e-07, + "loss": 1.9906, + "step": 18925 + }, + { + "epoch": 3.5475164011246485, + "grad_norm": 55985.26171875, + "learning_rate": 7.105447243875796e-07, + "loss": 2.0261, + "step": 18926 + }, + { + "epoch": 3.547703842549203, + "grad_norm": 54604.78515625, + "learning_rate": 7.092253014116257e-07, + "loss": 2.1714, + "step": 18927 + }, + { + "epoch": 3.547891283973758, + "grad_norm": 53441.359375, + "learning_rate": 7.079070958538991e-07, + "loss": 2.1346, + "step": 18928 + }, + { + "epoch": 3.5480787253983133, + "grad_norm": 61040.95703125, + "learning_rate": 7.065901077469572e-07, + "loss": 2.0856, + "step": 18929 + }, + { + "epoch": 3.548266166822868, + "grad_norm": 54346.26171875, + "learning_rate": 7.052743371233295e-07, + "loss": 2.0765, + "step": 18930 + }, + { + "epoch": 3.5484536082474225, + "grad_norm": 53945.09765625, + "learning_rate": 7.039597840155121e-07, + "loss": 2.0754, + "step": 18931 + }, + { + "epoch": 3.5486410496719776, + "grad_norm": 58325.0078125, + "learning_rate": 7.026464484559791e-07, + "loss": 2.1115, + "step": 18932 + }, + { + "epoch": 3.548828491096532, + "grad_norm": 53551.48046875, + "learning_rate": 7.013343304771658e-07, + "loss": 2.0269, + "step": 18933 + }, + { + "epoch": 3.5490159325210873, + "grad_norm": 55066.24609375, + "learning_rate": 7.000234301114738e-07, + "loss": 2.075, + "step": 18934 + }, + { + "epoch": 3.549203373945642, + "grad_norm": 59314.06640625, + "learning_rate": 6.987137473912885e-07, + "loss": 2.0995, + "step": 18935 + }, + { + "epoch": 3.549390815370197, + "grad_norm": 50251.1875, + "learning_rate": 6.974052823489452e-07, + "loss": 2.0295, + "step": 18936 + }, + { + "epoch": 3.5495782567947516, + "grad_norm": 52630.35546875, + "learning_rate": 6.960980350167789e-07, + "loss": 2.0371, + "step": 18937 + }, + { + "epoch": 3.549765698219306, + "grad_norm": 56802.16015625, + "learning_rate": 6.947920054270641e-07, + "loss": 2.0567, + "step": 18938 + }, + { + "epoch": 3.5499531396438613, + "grad_norm": 54709.0546875, + "learning_rate": 6.934871936120635e-07, + "loss": 2.0963, + "step": 18939 + }, + { + "epoch": 3.5501405810684163, + "grad_norm": 53285.80859375, + "learning_rate": 6.921835996040015e-07, + "loss": 2.0879, + "step": 18940 + }, + { + "epoch": 3.550328022492971, + "grad_norm": 53732.17578125, + "learning_rate": 6.908812234350693e-07, + "loss": 2.1249, + "step": 18941 + }, + { + "epoch": 3.5505154639175256, + "grad_norm": 58650.37890625, + "learning_rate": 6.895800651374462e-07, + "loss": 2.108, + "step": 18942 + }, + { + "epoch": 3.5507029053420807, + "grad_norm": 58122.02734375, + "learning_rate": 6.88280124743268e-07, + "loss": 2.0382, + "step": 18943 + }, + { + "epoch": 3.5508903467666353, + "grad_norm": 55295.44921875, + "learning_rate": 6.869814022846311e-07, + "loss": 2.1147, + "step": 18944 + }, + { + "epoch": 3.5510777881911904, + "grad_norm": 57240.3828125, + "learning_rate": 6.856838977936208e-07, + "loss": 2.0835, + "step": 18945 + }, + { + "epoch": 3.551265229615745, + "grad_norm": 56297.02734375, + "learning_rate": 6.843876113022784e-07, + "loss": 2.0594, + "step": 18946 + }, + { + "epoch": 3.5514526710403, + "grad_norm": 56523.8984375, + "learning_rate": 6.830925428426283e-07, + "loss": 2.0938, + "step": 18947 + }, + { + "epoch": 3.5516401124648547, + "grad_norm": 54576.1328125, + "learning_rate": 6.817986924466502e-07, + "loss": 2.0948, + "step": 18948 + }, + { + "epoch": 3.5518275538894093, + "grad_norm": 55065.87890625, + "learning_rate": 6.805060601462965e-07, + "loss": 2.0741, + "step": 18949 + }, + { + "epoch": 3.5520149953139644, + "grad_norm": 55187.69921875, + "learning_rate": 6.792146459735083e-07, + "loss": 2.0894, + "step": 18950 + }, + { + "epoch": 3.5522024367385194, + "grad_norm": 55104.15234375, + "learning_rate": 6.779244499601711e-07, + "loss": 2.0469, + "step": 18951 + }, + { + "epoch": 3.552389878163074, + "grad_norm": 63112.46875, + "learning_rate": 6.766354721381485e-07, + "loss": 2.0526, + "step": 18952 + }, + { + "epoch": 3.5525773195876287, + "grad_norm": 57153.12109375, + "learning_rate": 6.753477125392815e-07, + "loss": 2.19, + "step": 18953 + }, + { + "epoch": 3.5527647610121837, + "grad_norm": 55342.359375, + "learning_rate": 6.740611711953837e-07, + "loss": 2.0443, + "step": 18954 + }, + { + "epoch": 3.5529522024367384, + "grad_norm": 52711.77734375, + "learning_rate": 6.727758481382129e-07, + "loss": 2.0845, + "step": 18955 + }, + { + "epoch": 3.5531396438612934, + "grad_norm": 52769.12890625, + "learning_rate": 6.714917433995272e-07, + "loss": 2.0922, + "step": 18956 + }, + { + "epoch": 3.553327085285848, + "grad_norm": 57742.76953125, + "learning_rate": 6.702088570110399e-07, + "loss": 2.0943, + "step": 18957 + }, + { + "epoch": 3.553514526710403, + "grad_norm": 53344.6484375, + "learning_rate": 6.689271890044424e-07, + "loss": 2.0384, + "step": 18958 + }, + { + "epoch": 3.5537019681349578, + "grad_norm": 52557.91796875, + "learning_rate": 6.676467394113817e-07, + "loss": 2.0677, + "step": 18959 + }, + { + "epoch": 3.5538894095595124, + "grad_norm": 57131.984375, + "learning_rate": 6.663675082634824e-07, + "loss": 2.0938, + "step": 18960 + }, + { + "epoch": 3.5540768509840674, + "grad_norm": 58607.62109375, + "learning_rate": 6.650894955923526e-07, + "loss": 2.1031, + "step": 18961 + }, + { + "epoch": 3.5542642924086225, + "grad_norm": 57644.73046875, + "learning_rate": 6.638127014295392e-07, + "loss": 2.0394, + "step": 18962 + }, + { + "epoch": 3.554451733833177, + "grad_norm": 56024.359375, + "learning_rate": 6.625371258065893e-07, + "loss": 2.1621, + "step": 18963 + }, + { + "epoch": 3.5546391752577318, + "grad_norm": 53025.6171875, + "learning_rate": 6.612627687550054e-07, + "loss": 2.0069, + "step": 18964 + }, + { + "epoch": 3.554826616682287, + "grad_norm": 54974.10546875, + "learning_rate": 6.599896303062681e-07, + "loss": 2.1196, + "step": 18965 + }, + { + "epoch": 3.5550140581068415, + "grad_norm": 56338.96484375, + "learning_rate": 6.587177104918075e-07, + "loss": 2.0315, + "step": 18966 + }, + { + "epoch": 3.5552014995313965, + "grad_norm": 56616.22265625, + "learning_rate": 6.574470093430541e-07, + "loss": 2.0847, + "step": 18967 + }, + { + "epoch": 3.555388940955951, + "grad_norm": 59394.76953125, + "learning_rate": 6.561775268913828e-07, + "loss": 2.073, + "step": 18968 + }, + { + "epoch": 3.555576382380506, + "grad_norm": 55969.6953125, + "learning_rate": 6.549092631681519e-07, + "loss": 2.0921, + "step": 18969 + }, + { + "epoch": 3.555763823805061, + "grad_norm": 59547.55859375, + "learning_rate": 6.536422182046864e-07, + "loss": 2.1033, + "step": 18970 + }, + { + "epoch": 3.555951265229616, + "grad_norm": 57258.15625, + "learning_rate": 6.523763920322779e-07, + "loss": 2.0246, + "step": 18971 + }, + { + "epoch": 3.5561387066541705, + "grad_norm": 55499.74609375, + "learning_rate": 6.511117846821956e-07, + "loss": 2.1291, + "step": 18972 + }, + { + "epoch": 3.5563261480787256, + "grad_norm": 55490.69921875, + "learning_rate": 6.498483961856705e-07, + "loss": 2.0681, + "step": 18973 + }, + { + "epoch": 3.5565135895032802, + "grad_norm": 65893.7734375, + "learning_rate": 6.485862265739051e-07, + "loss": 2.1396, + "step": 18974 + }, + { + "epoch": 3.556701030927835, + "grad_norm": 57889.55078125, + "learning_rate": 6.47325275878069e-07, + "loss": 2.3347, + "step": 18975 + }, + { + "epoch": 3.55688847235239, + "grad_norm": 55636.8046875, + "learning_rate": 6.460655441293206e-07, + "loss": 2.1008, + "step": 18976 + }, + { + "epoch": 3.5570759137769445, + "grad_norm": 51978.71484375, + "learning_rate": 6.448070313587629e-07, + "loss": 2.0562, + "step": 18977 + }, + { + "epoch": 3.5572633552014996, + "grad_norm": 54126.83984375, + "learning_rate": 6.435497375974764e-07, + "loss": 2.1001, + "step": 18978 + }, + { + "epoch": 3.5574507966260542, + "grad_norm": 60372.33203125, + "learning_rate": 6.422936628765252e-07, + "loss": 2.0156, + "step": 18979 + }, + { + "epoch": 3.5576382380506093, + "grad_norm": 54874.28515625, + "learning_rate": 6.410388072269291e-07, + "loss": 2.1013, + "step": 18980 + }, + { + "epoch": 3.557825679475164, + "grad_norm": 59257.32421875, + "learning_rate": 6.397851706796743e-07, + "loss": 2.012, + "step": 18981 + }, + { + "epoch": 3.558013120899719, + "grad_norm": 58678.1640625, + "learning_rate": 6.385327532657304e-07, + "loss": 2.0644, + "step": 18982 + }, + { + "epoch": 3.5582005623242736, + "grad_norm": 58364.76171875, + "learning_rate": 6.372815550160282e-07, + "loss": 2.0686, + "step": 18983 + }, + { + "epoch": 3.5583880037488287, + "grad_norm": 57392.84765625, + "learning_rate": 6.360315759614765e-07, + "loss": 2.0565, + "step": 18984 + }, + { + "epoch": 3.5585754451733833, + "grad_norm": 54808.78515625, + "learning_rate": 6.347828161329395e-07, + "loss": 2.1094, + "step": 18985 + }, + { + "epoch": 3.558762886597938, + "grad_norm": 58830.02734375, + "learning_rate": 6.3353527556127e-07, + "loss": 2.0409, + "step": 18986 + }, + { + "epoch": 3.558950328022493, + "grad_norm": 54154.13671875, + "learning_rate": 6.322889542772659e-07, + "loss": 2.0641, + "step": 18987 + }, + { + "epoch": 3.559137769447048, + "grad_norm": 55848.70703125, + "learning_rate": 6.310438523117246e-07, + "loss": 2.0678, + "step": 18988 + }, + { + "epoch": 3.5593252108716027, + "grad_norm": 54379.6015625, + "learning_rate": 6.297999696953882e-07, + "loss": 2.224, + "step": 18989 + }, + { + "epoch": 3.5595126522961573, + "grad_norm": 56153.70703125, + "learning_rate": 6.285573064589878e-07, + "loss": 2.0029, + "step": 18990 + }, + { + "epoch": 3.5597000937207124, + "grad_norm": 55486.13671875, + "learning_rate": 6.273158626332098e-07, + "loss": 2.0407, + "step": 18991 + }, + { + "epoch": 3.559887535145267, + "grad_norm": 54622.2578125, + "learning_rate": 6.260756382487132e-07, + "loss": 2.0029, + "step": 18992 + }, + { + "epoch": 3.560074976569822, + "grad_norm": 54047.15234375, + "learning_rate": 6.2483663333614e-07, + "loss": 2.0317, + "step": 18993 + }, + { + "epoch": 3.5602624179943767, + "grad_norm": 55943.21875, + "learning_rate": 6.235988479260768e-07, + "loss": 2.0836, + "step": 18994 + }, + { + "epoch": 3.5604498594189318, + "grad_norm": 55221.2421875, + "learning_rate": 6.22362282049116e-07, + "loss": 1.9908, + "step": 18995 + }, + { + "epoch": 3.5606373008434864, + "grad_norm": 54684.43359375, + "learning_rate": 6.21126935735783e-07, + "loss": 2.0673, + "step": 18996 + }, + { + "epoch": 3.560824742268041, + "grad_norm": 64423.83203125, + "learning_rate": 6.198928090165923e-07, + "loss": 2.013, + "step": 18997 + }, + { + "epoch": 3.561012183692596, + "grad_norm": 53573.546875, + "learning_rate": 6.186599019220251e-07, + "loss": 2.1022, + "step": 18998 + }, + { + "epoch": 3.561199625117151, + "grad_norm": 56000.19140625, + "learning_rate": 6.174282144825406e-07, + "loss": 2.0328, + "step": 18999 + }, + { + "epoch": 3.561387066541706, + "grad_norm": 53448.53125, + "learning_rate": 6.161977467285529e-07, + "loss": 2.0743, + "step": 19000 + }, + { + "epoch": 3.561387066541706, + "eval_loss": 2.257038116455078, + "eval_runtime": 128.7608, + "eval_samples_per_second": 39.212, + "eval_steps_per_second": 1.965, + "step": 19000 + }, + { + "epoch": 3.5615745079662604, + "grad_norm": 59905.1875, + "learning_rate": 6.149684986904492e-07, + "loss": 2.0783, + "step": 19001 + }, + { + "epoch": 3.5617619493908155, + "grad_norm": 57954.18359375, + "learning_rate": 6.137404703985994e-07, + "loss": 2.0498, + "step": 19002 + }, + { + "epoch": 3.56194939081537, + "grad_norm": 53360.4375, + "learning_rate": 6.125136618833293e-07, + "loss": 2.0563, + "step": 19003 + }, + { + "epoch": 3.562136832239925, + "grad_norm": 56648.52734375, + "learning_rate": 6.112880731749315e-07, + "loss": 2.0327, + "step": 19004 + }, + { + "epoch": 3.56232427366448, + "grad_norm": 57464.40234375, + "learning_rate": 6.100637043036928e-07, + "loss": 2.0956, + "step": 19005 + }, + { + "epoch": 3.562511715089035, + "grad_norm": 54531.9921875, + "learning_rate": 6.088405552998444e-07, + "loss": 2.1461, + "step": 19006 + }, + { + "epoch": 3.5626991565135895, + "grad_norm": 52373.390625, + "learning_rate": 6.076186261936012e-07, + "loss": 2.0701, + "step": 19007 + }, + { + "epoch": 3.562886597938144, + "grad_norm": 61866.125, + "learning_rate": 6.063979170151335e-07, + "loss": 2.0497, + "step": 19008 + }, + { + "epoch": 3.563074039362699, + "grad_norm": 55447.20703125, + "learning_rate": 6.051784277946005e-07, + "loss": 2.1197, + "step": 19009 + }, + { + "epoch": 3.5632614807872542, + "grad_norm": 51090.25, + "learning_rate": 6.039601585621224e-07, + "loss": 2.0646, + "step": 19010 + }, + { + "epoch": 3.563448922211809, + "grad_norm": 54981.72265625, + "learning_rate": 6.027431093477809e-07, + "loss": 2.0699, + "step": 19011 + }, + { + "epoch": 3.5636363636363635, + "grad_norm": 54642.05859375, + "learning_rate": 6.015272801816351e-07, + "loss": 2.0581, + "step": 19012 + }, + { + "epoch": 3.5638238050609186, + "grad_norm": 57062.09375, + "learning_rate": 6.003126710937279e-07, + "loss": 2.1452, + "step": 19013 + }, + { + "epoch": 3.564011246485473, + "grad_norm": 60722.87890625, + "learning_rate": 5.990992821140518e-07, + "loss": 2.0613, + "step": 19014 + }, + { + "epoch": 3.5641986879100283, + "grad_norm": 60244.890625, + "learning_rate": 5.978871132725661e-07, + "loss": 2.139, + "step": 19015 + }, + { + "epoch": 3.564386129334583, + "grad_norm": 56174.625, + "learning_rate": 5.966761645992248e-07, + "loss": 2.1016, + "step": 19016 + }, + { + "epoch": 3.564573570759138, + "grad_norm": 58926.5390625, + "learning_rate": 5.95466436123926e-07, + "loss": 2.0618, + "step": 19017 + }, + { + "epoch": 3.5647610121836926, + "grad_norm": 55405.7890625, + "learning_rate": 5.942579278765514e-07, + "loss": 2.0873, + "step": 19018 + }, + { + "epoch": 3.564948453608247, + "grad_norm": 54181.94921875, + "learning_rate": 5.930506398869496e-07, + "loss": 2.1128, + "step": 19019 + }, + { + "epoch": 3.5651358950328023, + "grad_norm": 59258.2109375, + "learning_rate": 5.918445721849464e-07, + "loss": 2.0135, + "step": 19020 + }, + { + "epoch": 3.5653233364573573, + "grad_norm": 55807.17578125, + "learning_rate": 5.906397248003237e-07, + "loss": 2.0494, + "step": 19021 + }, + { + "epoch": 3.565510777881912, + "grad_norm": 53868.46484375, + "learning_rate": 5.894360977628299e-07, + "loss": 2.1128, + "step": 19022 + }, + { + "epoch": 3.5656982193064666, + "grad_norm": 57691.2265625, + "learning_rate": 5.882336911022134e-07, + "loss": 2.001, + "step": 19023 + }, + { + "epoch": 3.5658856607310216, + "grad_norm": 52742.625, + "learning_rate": 5.870325048481506e-07, + "loss": 2.0501, + "step": 19024 + }, + { + "epoch": 3.5660731021555763, + "grad_norm": 61187.328125, + "learning_rate": 5.858325390303287e-07, + "loss": 2.0835, + "step": 19025 + }, + { + "epoch": 3.5662605435801313, + "grad_norm": 55628.23046875, + "learning_rate": 5.846337936783796e-07, + "loss": 2.048, + "step": 19026 + }, + { + "epoch": 3.566447985004686, + "grad_norm": 51830.203125, + "learning_rate": 5.834362688218963e-07, + "loss": 2.0713, + "step": 19027 + }, + { + "epoch": 3.566635426429241, + "grad_norm": 58451.6171875, + "learning_rate": 5.822399644904719e-07, + "loss": 2.0563, + "step": 19028 + }, + { + "epoch": 3.5668228678537957, + "grad_norm": 62867.67578125, + "learning_rate": 5.810448807136548e-07, + "loss": 2.0266, + "step": 19029 + }, + { + "epoch": 3.5670103092783503, + "grad_norm": 56109.75, + "learning_rate": 5.798510175209548e-07, + "loss": 2.0093, + "step": 19030 + }, + { + "epoch": 3.5671977507029053, + "grad_norm": 55664.94921875, + "learning_rate": 5.786583749418539e-07, + "loss": 2.0668, + "step": 19031 + }, + { + "epoch": 3.5673851921274604, + "grad_norm": 58585.1484375, + "learning_rate": 5.77466953005823e-07, + "loss": 2.0258, + "step": 19032 + }, + { + "epoch": 3.567572633552015, + "grad_norm": 54878.9375, + "learning_rate": 5.762767517422774e-07, + "loss": 2.0581, + "step": 19033 + }, + { + "epoch": 3.5677600749765697, + "grad_norm": 55560.59375, + "learning_rate": 5.750877711806157e-07, + "loss": 2.1089, + "step": 19034 + }, + { + "epoch": 3.5679475164011247, + "grad_norm": 54950.83203125, + "learning_rate": 5.739000113502091e-07, + "loss": 2.0846, + "step": 19035 + }, + { + "epoch": 3.5681349578256794, + "grad_norm": 56684.60546875, + "learning_rate": 5.727134722803951e-07, + "loss": 2.0361, + "step": 19036 + }, + { + "epoch": 3.5683223992502344, + "grad_norm": 57398.765625, + "learning_rate": 5.715281540004669e-07, + "loss": 2.0631, + "step": 19037 + }, + { + "epoch": 3.568509840674789, + "grad_norm": 54496.8125, + "learning_rate": 5.703440565397066e-07, + "loss": 2.1036, + "step": 19038 + }, + { + "epoch": 3.568697282099344, + "grad_norm": 55422.90234375, + "learning_rate": 5.691611799273688e-07, + "loss": 1.9972, + "step": 19039 + }, + { + "epoch": 3.5688847235238987, + "grad_norm": 51780.71484375, + "learning_rate": 5.679795241926578e-07, + "loss": 2.0641, + "step": 19040 + }, + { + "epoch": 3.5690721649484534, + "grad_norm": 55598.03125, + "learning_rate": 5.667990893647612e-07, + "loss": 2.0992, + "step": 19041 + }, + { + "epoch": 3.5692596063730084, + "grad_norm": 51826.90625, + "learning_rate": 5.656198754728392e-07, + "loss": 2.0765, + "step": 19042 + }, + { + "epoch": 3.5694470477975635, + "grad_norm": 55139.00390625, + "learning_rate": 5.644418825460129e-07, + "loss": 2.0383, + "step": 19043 + }, + { + "epoch": 3.569634489222118, + "grad_norm": 57269.55859375, + "learning_rate": 5.632651106133758e-07, + "loss": 2.1036, + "step": 19044 + }, + { + "epoch": 3.5698219306466727, + "grad_norm": 53839.4921875, + "learning_rate": 5.620895597039987e-07, + "loss": 2.0747, + "step": 19045 + }, + { + "epoch": 3.570009372071228, + "grad_norm": 53921.58984375, + "learning_rate": 5.609152298469089e-07, + "loss": 2.0672, + "step": 19046 + }, + { + "epoch": 3.5701968134957824, + "grad_norm": 54013.86328125, + "learning_rate": 5.597421210711218e-07, + "loss": 2.0959, + "step": 19047 + }, + { + "epoch": 3.5703842549203375, + "grad_norm": 56030.0234375, + "learning_rate": 5.58570233405592e-07, + "loss": 2.0442, + "step": 19048 + }, + { + "epoch": 3.570571696344892, + "grad_norm": 57398.37890625, + "learning_rate": 5.573995668792853e-07, + "loss": 2.0588, + "step": 19049 + }, + { + "epoch": 3.570759137769447, + "grad_norm": 59790.515625, + "learning_rate": 5.562301215211008e-07, + "loss": 2.1858, + "step": 19050 + }, + { + "epoch": 3.570946579194002, + "grad_norm": 56731.96484375, + "learning_rate": 5.55061897359932e-07, + "loss": 1.9762, + "step": 19051 + }, + { + "epoch": 3.5711340206185564, + "grad_norm": 54508.765625, + "learning_rate": 5.538948944246225e-07, + "loss": 2.0224, + "step": 19052 + }, + { + "epoch": 3.5713214620431115, + "grad_norm": 53546.43359375, + "learning_rate": 5.527291127440049e-07, + "loss": 2.077, + "step": 19053 + }, + { + "epoch": 3.5715089034676666, + "grad_norm": 57415.140625, + "learning_rate": 5.515645523468671e-07, + "loss": 2.0367, + "step": 19054 + }, + { + "epoch": 3.571696344892221, + "grad_norm": 57462.01953125, + "learning_rate": 5.504012132619751e-07, + "loss": 2.0148, + "step": 19055 + }, + { + "epoch": 3.571883786316776, + "grad_norm": 51663.71875, + "learning_rate": 5.492390955180616e-07, + "loss": 2.0523, + "step": 19056 + }, + { + "epoch": 3.572071227741331, + "grad_norm": 53714.1171875, + "learning_rate": 5.480781991438256e-07, + "loss": 2.0772, + "step": 19057 + }, + { + "epoch": 3.5722586691658855, + "grad_norm": 54873.28125, + "learning_rate": 5.469185241679442e-07, + "loss": 2.0829, + "step": 19058 + }, + { + "epoch": 3.5724461105904406, + "grad_norm": 56333.9765625, + "learning_rate": 5.457600706190558e-07, + "loss": 2.0612, + "step": 19059 + }, + { + "epoch": 3.572633552014995, + "grad_norm": 58141.796875, + "learning_rate": 5.446028385257818e-07, + "loss": 2.0079, + "step": 19060 + }, + { + "epoch": 3.5728209934395503, + "grad_norm": 53466.42578125, + "learning_rate": 5.434468279166938e-07, + "loss": 2.0232, + "step": 19061 + }, + { + "epoch": 3.573008434864105, + "grad_norm": 61181.3515625, + "learning_rate": 5.422920388203468e-07, + "loss": 2.0105, + "step": 19062 + }, + { + "epoch": 3.5731958762886595, + "grad_norm": 61068.48046875, + "learning_rate": 5.411384712652678e-07, + "loss": 2.0757, + "step": 19063 + }, + { + "epoch": 3.5733833177132146, + "grad_norm": 53694.0625, + "learning_rate": 5.399861252799398e-07, + "loss": 2.1812, + "step": 19064 + }, + { + "epoch": 3.5735707591377697, + "grad_norm": 52831.60546875, + "learning_rate": 5.388350008928289e-07, + "loss": 2.0711, + "step": 19065 + }, + { + "epoch": 3.5737582005623243, + "grad_norm": 54685.9921875, + "learning_rate": 5.376850981323677e-07, + "loss": 2.0825, + "step": 19066 + }, + { + "epoch": 3.573945641986879, + "grad_norm": 52799.90625, + "learning_rate": 5.36536417026956e-07, + "loss": 2.0885, + "step": 19067 + }, + { + "epoch": 3.574133083411434, + "grad_norm": 57683.75390625, + "learning_rate": 5.353889576049598e-07, + "loss": 2.1333, + "step": 19068 + }, + { + "epoch": 3.5743205248359886, + "grad_norm": 55275.203125, + "learning_rate": 5.342427198947342e-07, + "loss": 2.0708, + "step": 19069 + }, + { + "epoch": 3.5745079662605437, + "grad_norm": 57324.6875, + "learning_rate": 5.330977039245733e-07, + "loss": 2.1072, + "step": 19070 + }, + { + "epoch": 3.5746954076850983, + "grad_norm": 56682.828125, + "learning_rate": 5.319539097227655e-07, + "loss": 2.0235, + "step": 19071 + }, + { + "epoch": 3.5748828491096534, + "grad_norm": 50246.390625, + "learning_rate": 5.30811337317555e-07, + "loss": 2.103, + "step": 19072 + }, + { + "epoch": 3.575070290534208, + "grad_norm": 60469.078125, + "learning_rate": 5.296699867371801e-07, + "loss": 2.0169, + "step": 19073 + }, + { + "epoch": 3.5752577319587626, + "grad_norm": 55447.5546875, + "learning_rate": 5.285298580098075e-07, + "loss": 2.0713, + "step": 19074 + }, + { + "epoch": 3.5754451733833177, + "grad_norm": 53413.80859375, + "learning_rate": 5.273909511636033e-07, + "loss": 2.0531, + "step": 19075 + }, + { + "epoch": 3.5756326148078728, + "grad_norm": 56883.26953125, + "learning_rate": 5.262532662267061e-07, + "loss": 2.0388, + "step": 19076 + }, + { + "epoch": 3.5758200562324274, + "grad_norm": 53753.50390625, + "learning_rate": 5.251168032272103e-07, + "loss": 2.0486, + "step": 19077 + }, + { + "epoch": 3.576007497656982, + "grad_norm": 57773.83203125, + "learning_rate": 5.239815621931821e-07, + "loss": 2.0553, + "step": 19078 + }, + { + "epoch": 3.576194939081537, + "grad_norm": 54356.38671875, + "learning_rate": 5.228475431526658e-07, + "loss": 2.0845, + "step": 19079 + }, + { + "epoch": 3.5763823805060917, + "grad_norm": 55594.27734375, + "learning_rate": 5.217147461336613e-07, + "loss": 2.021, + "step": 19080 + }, + { + "epoch": 3.5765698219306468, + "grad_norm": 52985.734375, + "learning_rate": 5.205831711641573e-07, + "loss": 2.0439, + "step": 19081 + }, + { + "epoch": 3.5767572633552014, + "grad_norm": 58717.16015625, + "learning_rate": 5.194528182720926e-07, + "loss": 2.0269, + "step": 19082 + }, + { + "epoch": 3.5769447047797565, + "grad_norm": 54623.43359375, + "learning_rate": 5.183236874854003e-07, + "loss": 2.0002, + "step": 19083 + }, + { + "epoch": 3.577132146204311, + "grad_norm": 53929.51953125, + "learning_rate": 5.171957788319526e-07, + "loss": 2.0495, + "step": 19084 + }, + { + "epoch": 3.5773195876288657, + "grad_norm": 55582.7578125, + "learning_rate": 5.160690923396161e-07, + "loss": 2.075, + "step": 19085 + }, + { + "epoch": 3.5775070290534208, + "grad_norm": 54248.32421875, + "learning_rate": 5.14943628036213e-07, + "loss": 2.0351, + "step": 19086 + }, + { + "epoch": 3.577694470477976, + "grad_norm": 54518.6484375, + "learning_rate": 5.138193859495433e-07, + "loss": 2.1321, + "step": 19087 + }, + { + "epoch": 3.5778819119025305, + "grad_norm": 56342.0546875, + "learning_rate": 5.126963661073791e-07, + "loss": 2.0428, + "step": 19088 + }, + { + "epoch": 3.578069353327085, + "grad_norm": 55357.72265625, + "learning_rate": 5.11574568537454e-07, + "loss": 2.0797, + "step": 19089 + }, + { + "epoch": 3.57825679475164, + "grad_norm": 53136.69140625, + "learning_rate": 5.104539932674734e-07, + "loss": 2.0927, + "step": 19090 + }, + { + "epoch": 3.578444236176195, + "grad_norm": 53881.41015625, + "learning_rate": 5.093346403251098e-07, + "loss": 2.1063, + "step": 19091 + }, + { + "epoch": 3.57863167760075, + "grad_norm": 55601.203125, + "learning_rate": 5.082165097380243e-07, + "loss": 2.0869, + "step": 19092 + }, + { + "epoch": 3.5788191190253045, + "grad_norm": 52290.70703125, + "learning_rate": 5.07099601533817e-07, + "loss": 2.0569, + "step": 19093 + }, + { + "epoch": 3.5790065604498595, + "grad_norm": 52901.16796875, + "learning_rate": 5.059839157400825e-07, + "loss": 2.1053, + "step": 19094 + }, + { + "epoch": 3.579194001874414, + "grad_norm": 58376.89453125, + "learning_rate": 5.048694523843711e-07, + "loss": 2.0165, + "step": 19095 + }, + { + "epoch": 3.5793814432989692, + "grad_norm": 58809.9296875, + "learning_rate": 5.037562114942218e-07, + "loss": 2.0518, + "step": 19096 + }, + { + "epoch": 3.579568884723524, + "grad_norm": 55201.29296875, + "learning_rate": 5.026441930971182e-07, + "loss": 2.0813, + "step": 19097 + }, + { + "epoch": 3.579756326148079, + "grad_norm": 56222.1171875, + "learning_rate": 5.015333972205272e-07, + "loss": 2.0857, + "step": 19098 + }, + { + "epoch": 3.5799437675726336, + "grad_norm": 50074.21484375, + "learning_rate": 5.004238238918823e-07, + "loss": 2.0644, + "step": 19099 + }, + { + "epoch": 3.580131208997188, + "grad_norm": 53386.50390625, + "learning_rate": 4.993154731386007e-07, + "loss": 2.0491, + "step": 19100 + }, + { + "epoch": 3.5803186504217432, + "grad_norm": 55209.59765625, + "learning_rate": 4.982083449880381e-07, + "loss": 2.0633, + "step": 19101 + }, + { + "epoch": 3.580506091846298, + "grad_norm": 60819.6875, + "learning_rate": 4.971024394675561e-07, + "loss": 2.0312, + "step": 19102 + }, + { + "epoch": 3.580693533270853, + "grad_norm": 57101.19921875, + "learning_rate": 4.959977566044605e-07, + "loss": 2.0206, + "step": 19103 + }, + { + "epoch": 3.5808809746954076, + "grad_norm": 54893.4375, + "learning_rate": 4.94894296426035e-07, + "loss": 2.0792, + "step": 19104 + }, + { + "epoch": 3.5810684161199626, + "grad_norm": 53763.67578125, + "learning_rate": 4.937920589595413e-07, + "loss": 2.076, + "step": 19105 + }, + { + "epoch": 3.5812558575445173, + "grad_norm": 58718.30078125, + "learning_rate": 4.926910442321964e-07, + "loss": 2.1236, + "step": 19106 + }, + { + "epoch": 3.5814432989690723, + "grad_norm": 53334.51171875, + "learning_rate": 4.915912522711952e-07, + "loss": 2.097, + "step": 19107 + }, + { + "epoch": 3.581630740393627, + "grad_norm": 66854.703125, + "learning_rate": 4.904926831036938e-07, + "loss": 1.9705, + "step": 19108 + }, + { + "epoch": 3.581818181818182, + "grad_norm": 53910.90234375, + "learning_rate": 4.893953367568427e-07, + "loss": 2.0337, + "step": 19109 + }, + { + "epoch": 3.5820056232427366, + "grad_norm": 58357.265625, + "learning_rate": 4.882992132577313e-07, + "loss": 2.0476, + "step": 19110 + }, + { + "epoch": 3.5821930646672913, + "grad_norm": 56564.15625, + "learning_rate": 4.872043126334381e-07, + "loss": 2.0497, + "step": 19111 + }, + { + "epoch": 3.5823805060918463, + "grad_norm": 62401.5859375, + "learning_rate": 4.861106349110023e-07, + "loss": 1.9891, + "step": 19112 + }, + { + "epoch": 3.5825679475164014, + "grad_norm": 52866.23046875, + "learning_rate": 4.850181801174413e-07, + "loss": 2.0676, + "step": 19113 + }, + { + "epoch": 3.582755388940956, + "grad_norm": 54156.12109375, + "learning_rate": 4.839269482797338e-07, + "loss": 1.9968, + "step": 19114 + }, + { + "epoch": 3.5829428303655106, + "grad_norm": 56863.83984375, + "learning_rate": 4.8283693942483e-07, + "loss": 2.0491, + "step": 19115 + }, + { + "epoch": 3.5831302717900657, + "grad_norm": 54480.8203125, + "learning_rate": 4.817481535796531e-07, + "loss": 2.144, + "step": 19116 + }, + { + "epoch": 3.5833177132146203, + "grad_norm": 67064.6484375, + "learning_rate": 4.806605907710981e-07, + "loss": 1.9592, + "step": 19117 + }, + { + "epoch": 3.5835051546391754, + "grad_norm": 52355.51171875, + "learning_rate": 4.795742510260215e-07, + "loss": 2.0368, + "step": 19118 + }, + { + "epoch": 3.58369259606373, + "grad_norm": 56581.66796875, + "learning_rate": 4.784891343712572e-07, + "loss": 2.0582, + "step": 19119 + }, + { + "epoch": 3.583880037488285, + "grad_norm": 54572.92578125, + "learning_rate": 4.77405240833606e-07, + "loss": 2.0711, + "step": 19120 + }, + { + "epoch": 3.5840674789128397, + "grad_norm": 63189.234375, + "learning_rate": 4.7632257043983554e-07, + "loss": 2.0935, + "step": 19121 + }, + { + "epoch": 3.5842549203373943, + "grad_norm": 54897.97265625, + "learning_rate": 4.7524112321669645e-07, + "loss": 2.1143, + "step": 19122 + }, + { + "epoch": 3.5844423617619494, + "grad_norm": 59891.2109375, + "learning_rate": 4.7416089919088415e-07, + "loss": 2.1496, + "step": 19123 + }, + { + "epoch": 3.5846298031865045, + "grad_norm": 55958.984375, + "learning_rate": 4.7308189838908834e-07, + "loss": 2.0342, + "step": 19124 + }, + { + "epoch": 3.584817244611059, + "grad_norm": 57475.1796875, + "learning_rate": 4.7200412083795996e-07, + "loss": 2.0604, + "step": 19125 + }, + { + "epoch": 3.5850046860356137, + "grad_norm": 56980.98046875, + "learning_rate": 4.7092756656411664e-07, + "loss": 2.065, + "step": 19126 + }, + { + "epoch": 3.585192127460169, + "grad_norm": 59807.8125, + "learning_rate": 4.698522355941426e-07, + "loss": 2.1313, + "step": 19127 + }, + { + "epoch": 3.5853795688847234, + "grad_norm": 54641.65234375, + "learning_rate": 4.687781279546055e-07, + "loss": 2.1278, + "step": 19128 + }, + { + "epoch": 3.5855670103092785, + "grad_norm": 57646.296875, + "learning_rate": 4.6770524367202863e-07, + "loss": 1.9871, + "step": 19129 + }, + { + "epoch": 3.585754451733833, + "grad_norm": 54477.48046875, + "learning_rate": 4.666335827729129e-07, + "loss": 2.1413, + "step": 19130 + }, + { + "epoch": 3.585941893158388, + "grad_norm": 61730.6328125, + "learning_rate": 4.6556314528372615e-07, + "loss": 2.0805, + "step": 19131 + }, + { + "epoch": 3.586129334582943, + "grad_norm": 53702.44140625, + "learning_rate": 4.6449393123090824e-07, + "loss": 2.0889, + "step": 19132 + }, + { + "epoch": 3.5863167760074974, + "grad_norm": 53650.03515625, + "learning_rate": 4.6342594064086587e-07, + "loss": 2.0972, + "step": 19133 + }, + { + "epoch": 3.5865042174320525, + "grad_norm": 61847.19140625, + "learning_rate": 4.6235917353997793e-07, + "loss": 2.0067, + "step": 19134 + }, + { + "epoch": 3.5866916588566076, + "grad_norm": 54668.69921875, + "learning_rate": 4.6129362995459004e-07, + "loss": 2.2502, + "step": 19135 + }, + { + "epoch": 3.586879100281162, + "grad_norm": 56349.6640625, + "learning_rate": 4.6022930991101995e-07, + "loss": 2.0273, + "step": 19136 + }, + { + "epoch": 3.587066541705717, + "grad_norm": 53093.375, + "learning_rate": 4.5916621343556343e-07, + "loss": 2.037, + "step": 19137 + }, + { + "epoch": 3.587253983130272, + "grad_norm": 53835.65234375, + "learning_rate": 4.5810434055446605e-07, + "loss": 2.0624, + "step": 19138 + }, + { + "epoch": 3.5874414245548265, + "grad_norm": 58568.0859375, + "learning_rate": 4.5704369129396243e-07, + "loss": 2.0351, + "step": 19139 + }, + { + "epoch": 3.5876288659793816, + "grad_norm": 53860.58203125, + "learning_rate": 4.559842656802427e-07, + "loss": 2.0881, + "step": 19140 + }, + { + "epoch": 3.587816307403936, + "grad_norm": 54922.7421875, + "learning_rate": 4.5492606373948033e-07, + "loss": 2.0105, + "step": 19141 + }, + { + "epoch": 3.5880037488284913, + "grad_norm": 51625.8671875, + "learning_rate": 4.538690854978045e-07, + "loss": 2.0698, + "step": 19142 + }, + { + "epoch": 3.588191190253046, + "grad_norm": 55198.9296875, + "learning_rate": 4.528133309813276e-07, + "loss": 2.1158, + "step": 19143 + }, + { + "epoch": 3.5883786316776005, + "grad_norm": 53825.15234375, + "learning_rate": 4.5175880021612327e-07, + "loss": 2.1258, + "step": 19144 + }, + { + "epoch": 3.5885660731021556, + "grad_norm": 52403.47265625, + "learning_rate": 4.5070549322823176e-07, + "loss": 2.0905, + "step": 19145 + }, + { + "epoch": 3.5887535145267107, + "grad_norm": 58746.59765625, + "learning_rate": 4.4965341004367666e-07, + "loss": 2.1142, + "step": 19146 + }, + { + "epoch": 3.5889409559512653, + "grad_norm": 56967.29296875, + "learning_rate": 4.4860255068843727e-07, + "loss": 2.0619, + "step": 19147 + }, + { + "epoch": 3.58912839737582, + "grad_norm": 53310.76953125, + "learning_rate": 4.4755291518847054e-07, + "loss": 2.0468, + "step": 19148 + }, + { + "epoch": 3.589315838800375, + "grad_norm": 57778.6328125, + "learning_rate": 4.4650450356970574e-07, + "loss": 1.979, + "step": 19149 + }, + { + "epoch": 3.5895032802249296, + "grad_norm": 55192.93359375, + "learning_rate": 4.454573158580277e-07, + "loss": 2.0413, + "step": 19150 + }, + { + "epoch": 3.5896907216494847, + "grad_norm": 56143.7734375, + "learning_rate": 4.4441135207930476e-07, + "loss": 2.0334, + "step": 19151 + }, + { + "epoch": 3.5898781630740393, + "grad_norm": 54279.97265625, + "learning_rate": 4.4336661225937157e-07, + "loss": 2.0597, + "step": 19152 + }, + { + "epoch": 3.5900656044985944, + "grad_norm": 51361.05078125, + "learning_rate": 4.4232309642403545e-07, + "loss": 2.0554, + "step": 19153 + }, + { + "epoch": 3.590253045923149, + "grad_norm": 50873.640625, + "learning_rate": 4.4128080459906463e-07, + "loss": 2.1015, + "step": 19154 + }, + { + "epoch": 3.5904404873477036, + "grad_norm": 56401.109375, + "learning_rate": 4.4023973681020514e-07, + "loss": 2.0672, + "step": 19155 + }, + { + "epoch": 3.5906279287722587, + "grad_norm": 54736.046875, + "learning_rate": 4.391998930831698e-07, + "loss": 2.0914, + "step": 19156 + }, + { + "epoch": 3.5908153701968137, + "grad_norm": 55132.875, + "learning_rate": 4.38161273443638e-07, + "loss": 2.1313, + "step": 19157 + }, + { + "epoch": 3.5910028116213684, + "grad_norm": 56670.203125, + "learning_rate": 4.3712387791726703e-07, + "loss": 2.0588, + "step": 19158 + }, + { + "epoch": 3.591190253045923, + "grad_norm": 56875.87109375, + "learning_rate": 4.360877065296809e-07, + "loss": 2.0755, + "step": 19159 + }, + { + "epoch": 3.591377694470478, + "grad_norm": 58020.9453125, + "learning_rate": 4.3505275930645903e-07, + "loss": 2.1424, + "step": 19160 + }, + { + "epoch": 3.5915651358950327, + "grad_norm": 53796.7578125, + "learning_rate": 4.3401903627317555e-07, + "loss": 2.0592, + "step": 19161 + }, + { + "epoch": 3.5917525773195877, + "grad_norm": 55484.82421875, + "learning_rate": 4.329865374553599e-07, + "loss": 2.0604, + "step": 19162 + }, + { + "epoch": 3.5919400187441424, + "grad_norm": 57743.578125, + "learning_rate": 4.3195526287851396e-07, + "loss": 2.2016, + "step": 19163 + }, + { + "epoch": 3.5921274601686974, + "grad_norm": 53967.13671875, + "learning_rate": 4.309252125681007e-07, + "loss": 1.9842, + "step": 19164 + }, + { + "epoch": 3.592314901593252, + "grad_norm": 64637.58203125, + "learning_rate": 4.29896386549572e-07, + "loss": 1.9733, + "step": 19165 + }, + { + "epoch": 3.5925023430178067, + "grad_norm": 56529.00390625, + "learning_rate": 4.288687848483297e-07, + "loss": 2.0828, + "step": 19166 + }, + { + "epoch": 3.5926897844423618, + "grad_norm": 57834.26953125, + "learning_rate": 4.2784240748976466e-07, + "loss": 2.0631, + "step": 19167 + }, + { + "epoch": 3.592877225866917, + "grad_norm": 56538.2421875, + "learning_rate": 4.268172544992177e-07, + "loss": 1.996, + "step": 19168 + }, + { + "epoch": 3.5930646672914714, + "grad_norm": 50329.43359375, + "learning_rate": 4.2579332590201306e-07, + "loss": 2.0745, + "step": 19169 + }, + { + "epoch": 3.593252108716026, + "grad_norm": 52564.8984375, + "learning_rate": 4.247706217234415e-07, + "loss": 2.0562, + "step": 19170 + }, + { + "epoch": 3.593439550140581, + "grad_norm": 56189.5390625, + "learning_rate": 4.2374914198875514e-07, + "loss": 2.0725, + "step": 19171 + }, + { + "epoch": 3.5936269915651358, + "grad_norm": 58102.109375, + "learning_rate": 4.227288867231949e-07, + "loss": 2.0752, + "step": 19172 + }, + { + "epoch": 3.593814432989691, + "grad_norm": 51737.4921875, + "learning_rate": 4.217098559519461e-07, + "loss": 2.0182, + "step": 19173 + }, + { + "epoch": 3.5940018744142455, + "grad_norm": 49786.71875, + "learning_rate": 4.2069204970018873e-07, + "loss": 2.0993, + "step": 19174 + }, + { + "epoch": 3.5941893158388005, + "grad_norm": 57743.6953125, + "learning_rate": 4.1967546799305256e-07, + "loss": 2.0676, + "step": 19175 + }, + { + "epoch": 3.594376757263355, + "grad_norm": 57182.6875, + "learning_rate": 4.186601108556565e-07, + "loss": 2.1043, + "step": 19176 + }, + { + "epoch": 3.5945641986879098, + "grad_norm": 55496.12109375, + "learning_rate": 4.1764597831306926e-07, + "loss": 2.0195, + "step": 19177 + }, + { + "epoch": 3.594751640112465, + "grad_norm": 54259.33203125, + "learning_rate": 4.166330703903432e-07, + "loss": 2.0567, + "step": 19178 + }, + { + "epoch": 3.59493908153702, + "grad_norm": 57781.35546875, + "learning_rate": 4.15621387112497e-07, + "loss": 2.0598, + "step": 19179 + }, + { + "epoch": 3.5951265229615745, + "grad_norm": 54371.23046875, + "learning_rate": 4.146109285045108e-07, + "loss": 2.0324, + "step": 19180 + }, + { + "epoch": 3.595313964386129, + "grad_norm": 57338.84765625, + "learning_rate": 4.136016945913479e-07, + "loss": 2.1274, + "step": 19181 + }, + { + "epoch": 3.5955014058106842, + "grad_norm": 59479.078125, + "learning_rate": 4.125936853979384e-07, + "loss": 2.1077, + "step": 19182 + }, + { + "epoch": 3.595688847235239, + "grad_norm": 50271.7421875, + "learning_rate": 4.1158690094916244e-07, + "loss": 2.1145, + "step": 19183 + }, + { + "epoch": 3.595876288659794, + "grad_norm": 53901.08984375, + "learning_rate": 4.105813412699056e-07, + "loss": 2.0382, + "step": 19184 + }, + { + "epoch": 3.5960637300843485, + "grad_norm": 55360.4765625, + "learning_rate": 4.0957700638499795e-07, + "loss": 2.0087, + "step": 19185 + }, + { + "epoch": 3.5962511715089036, + "grad_norm": 55789.140625, + "learning_rate": 4.0857389631923646e-07, + "loss": 2.0471, + "step": 19186 + }, + { + "epoch": 3.5964386129334582, + "grad_norm": 57186.765625, + "learning_rate": 4.0757201109740686e-07, + "loss": 2.0386, + "step": 19187 + }, + { + "epoch": 3.596626054358013, + "grad_norm": 58027.921875, + "learning_rate": 4.0657135074425036e-07, + "loss": 2.099, + "step": 19188 + }, + { + "epoch": 3.596813495782568, + "grad_norm": 58408.42578125, + "learning_rate": 4.0557191528448614e-07, + "loss": 2.0659, + "step": 19189 + }, + { + "epoch": 3.597000937207123, + "grad_norm": 54535.82421875, + "learning_rate": 4.0457370474279444e-07, + "loss": 2.0264, + "step": 19190 + }, + { + "epoch": 3.5971883786316776, + "grad_norm": 57245.12890625, + "learning_rate": 4.0357671914382776e-07, + "loss": 2.034, + "step": 19191 + }, + { + "epoch": 3.5973758200562322, + "grad_norm": 60042.09375, + "learning_rate": 4.0258095851221647e-07, + "loss": 2.0841, + "step": 19192 + }, + { + "epoch": 3.5975632614807873, + "grad_norm": 55971.6953125, + "learning_rate": 4.0158642287255186e-07, + "loss": 2.0705, + "step": 19193 + }, + { + "epoch": 3.597750702905342, + "grad_norm": 53967.82421875, + "learning_rate": 4.005931122493922e-07, + "loss": 2.1235, + "step": 19194 + }, + { + "epoch": 3.597938144329897, + "grad_norm": 58507.359375, + "learning_rate": 3.996010266672789e-07, + "loss": 2.2, + "step": 19195 + }, + { + "epoch": 3.5981255857544516, + "grad_norm": 53853.2578125, + "learning_rate": 3.9861016615071465e-07, + "loss": 2.0383, + "step": 19196 + }, + { + "epoch": 3.5983130271790067, + "grad_norm": 54799.86328125, + "learning_rate": 3.976205307241687e-07, + "loss": 2.1187, + "step": 19197 + }, + { + "epoch": 3.5985004686035613, + "grad_norm": 59616.671875, + "learning_rate": 3.9663212041208263e-07, + "loss": 2.0036, + "step": 19198 + }, + { + "epoch": 3.598687910028116, + "grad_norm": 52164.8984375, + "learning_rate": 3.956449352388758e-07, + "loss": 2.0455, + "step": 19199 + }, + { + "epoch": 3.598875351452671, + "grad_norm": 57029.984375, + "learning_rate": 3.9465897522892315e-07, + "loss": 2.0461, + "step": 19200 + }, + { + "epoch": 3.599062792877226, + "grad_norm": 58944.16015625, + "learning_rate": 3.9367424040657743e-07, + "loss": 2.0542, + "step": 19201 + }, + { + "epoch": 3.5992502343017807, + "grad_norm": 51815.35546875, + "learning_rate": 3.9269073079616915e-07, + "loss": 2.0495, + "step": 19202 + }, + { + "epoch": 3.5994376757263353, + "grad_norm": 57510.5, + "learning_rate": 3.9170844642197893e-07, + "loss": 2.0934, + "step": 19203 + }, + { + "epoch": 3.5996251171508904, + "grad_norm": 54778.91796875, + "learning_rate": 3.9072738730827066e-07, + "loss": 2.1217, + "step": 19204 + }, + { + "epoch": 3.599812558575445, + "grad_norm": 52639.375, + "learning_rate": 3.8974755347927493e-07, + "loss": 2.1206, + "step": 19205 + }, + { + "epoch": 3.6, + "grad_norm": 56481.98828125, + "learning_rate": 3.8876894495920027e-07, + "loss": 2.1126, + "step": 19206 + }, + { + "epoch": 3.6001874414245547, + "grad_norm": 52501.578125, + "learning_rate": 3.87791561772205e-07, + "loss": 2.0804, + "step": 19207 + }, + { + "epoch": 3.60037488284911, + "grad_norm": 59307.0859375, + "learning_rate": 3.8681540394243656e-07, + "loss": 2.0381, + "step": 19208 + }, + { + "epoch": 3.6005623242736644, + "grad_norm": 57557.25, + "learning_rate": 3.858404714940089e-07, + "loss": 2.0373, + "step": 19209 + }, + { + "epoch": 3.600749765698219, + "grad_norm": 56637.66796875, + "learning_rate": 3.848667644509862e-07, + "loss": 2.0793, + "step": 19210 + }, + { + "epoch": 3.600937207122774, + "grad_norm": 52178.8046875, + "learning_rate": 3.8389428283743813e-07, + "loss": 2.0303, + "step": 19211 + }, + { + "epoch": 3.601124648547329, + "grad_norm": 53400.55859375, + "learning_rate": 3.8292302667736755e-07, + "loss": 2.0899, + "step": 19212 + }, + { + "epoch": 3.601312089971884, + "grad_norm": 52912.72265625, + "learning_rate": 3.819529959947665e-07, + "loss": 2.1088, + "step": 19213 + }, + { + "epoch": 3.6014995313964384, + "grad_norm": 57963.81640625, + "learning_rate": 3.809841908135936e-07, + "loss": 2.0695, + "step": 19214 + }, + { + "epoch": 3.6016869728209935, + "grad_norm": 58616.375, + "learning_rate": 3.800166111577852e-07, + "loss": 2.042, + "step": 19215 + }, + { + "epoch": 3.601874414245548, + "grad_norm": 57838.32421875, + "learning_rate": 3.790502570512333e-07, + "loss": 2.0481, + "step": 19216 + }, + { + "epoch": 3.602061855670103, + "grad_norm": 53051.1953125, + "learning_rate": 3.7808512851780223e-07, + "loss": 1.9905, + "step": 19217 + }, + { + "epoch": 3.602249297094658, + "grad_norm": 58373.34765625, + "learning_rate": 3.771212255813339e-07, + "loss": 2.1037, + "step": 19218 + }, + { + "epoch": 3.602436738519213, + "grad_norm": 52051.328125, + "learning_rate": 3.7615854826563713e-07, + "loss": 2.0466, + "step": 19219 + }, + { + "epoch": 3.6026241799437675, + "grad_norm": 55195.80859375, + "learning_rate": 3.751970965944818e-07, + "loss": 2.0975, + "step": 19220 + }, + { + "epoch": 3.6028116213683226, + "grad_norm": 55573.51953125, + "learning_rate": 3.742368705916155e-07, + "loss": 2.1014, + "step": 19221 + }, + { + "epoch": 3.602999062792877, + "grad_norm": 61189.96875, + "learning_rate": 3.7327787028076376e-07, + "loss": 1.9641, + "step": 19222 + }, + { + "epoch": 3.6031865042174323, + "grad_norm": 57298.87890625, + "learning_rate": 3.7232009568560765e-07, + "loss": 2.048, + "step": 19223 + }, + { + "epoch": 3.603373945641987, + "grad_norm": 55044.19921875, + "learning_rate": 3.7136354682980044e-07, + "loss": 2.0634, + "step": 19224 + }, + { + "epoch": 3.6035613870665415, + "grad_norm": 56943.515625, + "learning_rate": 3.704082237369677e-07, + "loss": 2.1414, + "step": 19225 + }, + { + "epoch": 3.6037488284910966, + "grad_norm": 57288.54296875, + "learning_rate": 3.694541264307072e-07, + "loss": 2.0629, + "step": 19226 + }, + { + "epoch": 3.6039362699156516, + "grad_norm": 55998.109375, + "learning_rate": 3.6850125493458343e-07, + "loss": 2.0797, + "step": 19227 + }, + { + "epoch": 3.6041237113402063, + "grad_norm": 53927.6640625, + "learning_rate": 3.6754960927212754e-07, + "loss": 2.0581, + "step": 19228 + }, + { + "epoch": 3.604311152764761, + "grad_norm": 52431.51953125, + "learning_rate": 3.6659918946684855e-07, + "loss": 2.0248, + "step": 19229 + }, + { + "epoch": 3.604498594189316, + "grad_norm": 57030.66796875, + "learning_rate": 3.656499955422166e-07, + "loss": 2.026, + "step": 19230 + }, + { + "epoch": 3.6046860356138706, + "grad_norm": 58104.234375, + "learning_rate": 3.6470202752167946e-07, + "loss": 2.0916, + "step": 19231 + }, + { + "epoch": 3.6048734770384256, + "grad_norm": 56487.69140625, + "learning_rate": 3.637552854286519e-07, + "loss": 2.0513, + "step": 19232 + }, + { + "epoch": 3.6050609184629803, + "grad_norm": 55920.3046875, + "learning_rate": 3.6280976928650954e-07, + "loss": 2.0756, + "step": 19233 + }, + { + "epoch": 3.6052483598875353, + "grad_norm": 58156.1875, + "learning_rate": 3.618654791186171e-07, + "loss": 2.0642, + "step": 19234 + }, + { + "epoch": 3.60543580131209, + "grad_norm": 57681.19921875, + "learning_rate": 3.609224149482893e-07, + "loss": 2.0784, + "step": 19235 + }, + { + "epoch": 3.6056232427366446, + "grad_norm": 54377.9921875, + "learning_rate": 3.599805767988129e-07, + "loss": 2.0447, + "step": 19236 + }, + { + "epoch": 3.6058106841611997, + "grad_norm": 52559.7734375, + "learning_rate": 3.5903996469345835e-07, + "loss": 2.0672, + "step": 19237 + }, + { + "epoch": 3.6059981255857547, + "grad_norm": 55099.21484375, + "learning_rate": 3.581005786554625e-07, + "loss": 2.138, + "step": 19238 + }, + { + "epoch": 3.6061855670103093, + "grad_norm": 59754.84375, + "learning_rate": 3.5716241870801805e-07, + "loss": 2.0387, + "step": 19239 + }, + { + "epoch": 3.606373008434864, + "grad_norm": 56135.48046875, + "learning_rate": 3.562254848742952e-07, + "loss": 2.0851, + "step": 19240 + }, + { + "epoch": 3.606560449859419, + "grad_norm": 48959.73828125, + "learning_rate": 3.5528977717744216e-07, + "loss": 2.09, + "step": 19241 + }, + { + "epoch": 3.6067478912839737, + "grad_norm": 51495.8515625, + "learning_rate": 3.5435529564056823e-07, + "loss": 2.0865, + "step": 19242 + }, + { + "epoch": 3.6069353327085287, + "grad_norm": 53200.8125, + "learning_rate": 3.5342204028674944e-07, + "loss": 2.0431, + "step": 19243 + }, + { + "epoch": 3.6071227741330834, + "grad_norm": 56687.82421875, + "learning_rate": 3.5249001113903944e-07, + "loss": 2.1158, + "step": 19244 + }, + { + "epoch": 3.6073102155576384, + "grad_norm": 61895.96484375, + "learning_rate": 3.515592082204533e-07, + "loss": 2.1064, + "step": 19245 + }, + { + "epoch": 3.607497656982193, + "grad_norm": 56619.3515625, + "learning_rate": 3.506296315539892e-07, + "loss": 2.0858, + "step": 19246 + }, + { + "epoch": 3.6076850984067477, + "grad_norm": 55102.8984375, + "learning_rate": 3.4970128116260105e-07, + "loss": 2.0596, + "step": 19247 + }, + { + "epoch": 3.6078725398313027, + "grad_norm": 58263.20703125, + "learning_rate": 3.4877415706922044e-07, + "loss": 2.0834, + "step": 19248 + }, + { + "epoch": 3.608059981255858, + "grad_norm": 53326.4296875, + "learning_rate": 3.478482592967403e-07, + "loss": 2.0487, + "step": 19249 + }, + { + "epoch": 3.6082474226804124, + "grad_norm": 59335.66796875, + "learning_rate": 3.469235878680366e-07, + "loss": 2.0701, + "step": 19250 + }, + { + "epoch": 3.608434864104967, + "grad_norm": 61078.8984375, + "learning_rate": 3.4600014280593564e-07, + "loss": 2.0067, + "step": 19251 + }, + { + "epoch": 3.608622305529522, + "grad_norm": 55388.14453125, + "learning_rate": 3.450779241332691e-07, + "loss": 2.0734, + "step": 19252 + }, + { + "epoch": 3.6088097469540767, + "grad_norm": 56699.73046875, + "learning_rate": 3.441569318727855e-07, + "loss": 2.0895, + "step": 19253 + }, + { + "epoch": 3.608997188378632, + "grad_norm": 60480.4609375, + "learning_rate": 3.4323716604724997e-07, + "loss": 2.0248, + "step": 19254 + }, + { + "epoch": 3.6091846298031864, + "grad_norm": 56658.24609375, + "learning_rate": 3.4231862667937765e-07, + "loss": 2.1519, + "step": 19255 + }, + { + "epoch": 3.6093720712277415, + "grad_norm": 54405.22265625, + "learning_rate": 3.414013137918559e-07, + "loss": 2.017, + "step": 19256 + }, + { + "epoch": 3.609559512652296, + "grad_norm": 54010.71484375, + "learning_rate": 3.404852274073334e-07, + "loss": 2.0855, + "step": 19257 + }, + { + "epoch": 3.6097469540768508, + "grad_norm": 53612.62109375, + "learning_rate": 3.3957036754844184e-07, + "loss": 2.0211, + "step": 19258 + }, + { + "epoch": 3.609934395501406, + "grad_norm": 54773.7578125, + "learning_rate": 3.3865673423777443e-07, + "loss": 2.0535, + "step": 19259 + }, + { + "epoch": 3.610121836925961, + "grad_norm": 51164.17578125, + "learning_rate": 3.37744327497902e-07, + "loss": 2.0444, + "step": 19260 + }, + { + "epoch": 3.6103092783505155, + "grad_norm": 53749.6328125, + "learning_rate": 3.368331473513564e-07, + "loss": 2.0771, + "step": 19261 + }, + { + "epoch": 3.61049671977507, + "grad_norm": 51433.48828125, + "learning_rate": 3.35923193820642e-07, + "loss": 2.0264, + "step": 19262 + }, + { + "epoch": 3.610684161199625, + "grad_norm": 57179.55078125, + "learning_rate": 3.350144669282351e-07, + "loss": 2.0086, + "step": 19263 + }, + { + "epoch": 3.61087160262418, + "grad_norm": 52717.5703125, + "learning_rate": 3.3410696669657906e-07, + "loss": 2.0856, + "step": 19264 + }, + { + "epoch": 3.611059044048735, + "grad_norm": 57321.6640625, + "learning_rate": 3.332006931480891e-07, + "loss": 2.1273, + "step": 19265 + }, + { + "epoch": 3.6112464854732895, + "grad_norm": 57269.62109375, + "learning_rate": 3.3229564630514744e-07, + "loss": 2.0593, + "step": 19266 + }, + { + "epoch": 3.6114339268978446, + "grad_norm": 51973.0859375, + "learning_rate": 3.3139182619010835e-07, + "loss": 2.0541, + "step": 19267 + }, + { + "epoch": 3.611621368322399, + "grad_norm": 61635.3203125, + "learning_rate": 3.3048923282529285e-07, + "loss": 2.0928, + "step": 19268 + }, + { + "epoch": 3.611808809746954, + "grad_norm": 57103.84375, + "learning_rate": 3.295878662329943e-07, + "loss": 2.05, + "step": 19269 + }, + { + "epoch": 3.611996251171509, + "grad_norm": 57668.09765625, + "learning_rate": 3.286877264354782e-07, + "loss": 2.0876, + "step": 19270 + }, + { + "epoch": 3.612183692596064, + "grad_norm": 51434.40625, + "learning_rate": 3.2778881345498225e-07, + "loss": 2.1167, + "step": 19271 + }, + { + "epoch": 3.6123711340206186, + "grad_norm": 55284.70703125, + "learning_rate": 3.268911273136943e-07, + "loss": 2.0698, + "step": 19272 + }, + { + "epoch": 3.6125585754451732, + "grad_norm": 55654.3984375, + "learning_rate": 3.2599466803379663e-07, + "loss": 2.0879, + "step": 19273 + }, + { + "epoch": 3.6127460168697283, + "grad_norm": 55844.69140625, + "learning_rate": 3.250994356374215e-07, + "loss": 2.0392, + "step": 19274 + }, + { + "epoch": 3.612933458294283, + "grad_norm": 52027.6953125, + "learning_rate": 3.242054301466957e-07, + "loss": 1.9922, + "step": 19275 + }, + { + "epoch": 3.613120899718838, + "grad_norm": 58227.5390625, + "learning_rate": 3.2331265158368483e-07, + "loss": 2.0803, + "step": 19276 + }, + { + "epoch": 3.6133083411433926, + "grad_norm": 55453.453125, + "learning_rate": 3.2242109997044354e-07, + "loss": 2.058, + "step": 19277 + }, + { + "epoch": 3.6134957825679477, + "grad_norm": 56061.51953125, + "learning_rate": 3.215307753289931e-07, + "loss": 2.0287, + "step": 19278 + }, + { + "epoch": 3.6136832239925023, + "grad_norm": 55691.28515625, + "learning_rate": 3.2064167768132703e-07, + "loss": 2.0282, + "step": 19279 + }, + { + "epoch": 3.613870665417057, + "grad_norm": 53861.59375, + "learning_rate": 3.197538070493944e-07, + "loss": 2.0731, + "step": 19280 + }, + { + "epoch": 3.614058106841612, + "grad_norm": 55675.375, + "learning_rate": 3.188671634551388e-07, + "loss": 2.0495, + "step": 19281 + }, + { + "epoch": 3.614245548266167, + "grad_norm": 53318.875, + "learning_rate": 3.1798174692044823e-07, + "loss": 2.0896, + "step": 19282 + }, + { + "epoch": 3.6144329896907217, + "grad_norm": 58626.4296875, + "learning_rate": 3.170975574671942e-07, + "loss": 2.0692, + "step": 19283 + }, + { + "epoch": 3.6146204311152763, + "grad_norm": 61789.52734375, + "learning_rate": 3.1621459511721463e-07, + "loss": 2.0614, + "step": 19284 + }, + { + "epoch": 3.6148078725398314, + "grad_norm": 55817.046875, + "learning_rate": 3.1533285989232555e-07, + "loss": 2.0824, + "step": 19285 + }, + { + "epoch": 3.614995313964386, + "grad_norm": 60872.12890625, + "learning_rate": 3.1445235181429276e-07, + "loss": 2.016, + "step": 19286 + }, + { + "epoch": 3.615182755388941, + "grad_norm": 60744.7421875, + "learning_rate": 3.1357307090486564e-07, + "loss": 2.146, + "step": 19287 + }, + { + "epoch": 3.6153701968134957, + "grad_norm": 53633.08984375, + "learning_rate": 3.1269501718577123e-07, + "loss": 2.1214, + "step": 19288 + }, + { + "epoch": 3.6155576382380508, + "grad_norm": 61115.796875, + "learning_rate": 3.11818190678681e-07, + "loss": 2.0454, + "step": 19289 + }, + { + "epoch": 3.6157450796626054, + "grad_norm": 63562.71875, + "learning_rate": 3.109425914052666e-07, + "loss": 2.0904, + "step": 19290 + }, + { + "epoch": 3.61593252108716, + "grad_norm": 57576.73046875, + "learning_rate": 3.1006821938714405e-07, + "loss": 2.0293, + "step": 19291 + }, + { + "epoch": 3.616119962511715, + "grad_norm": 57534.2265625, + "learning_rate": 3.0919507464591266e-07, + "loss": 2.0814, + "step": 19292 + }, + { + "epoch": 3.61630740393627, + "grad_norm": 59947.55859375, + "learning_rate": 3.0832315720313863e-07, + "loss": 2.0539, + "step": 19293 + }, + { + "epoch": 3.6164948453608248, + "grad_norm": 57292.10546875, + "learning_rate": 3.0745246708036026e-07, + "loss": 2.1021, + "step": 19294 + }, + { + "epoch": 3.6166822867853794, + "grad_norm": 60149.06640625, + "learning_rate": 3.0658300429907137e-07, + "loss": 1.9956, + "step": 19295 + }, + { + "epoch": 3.6168697282099345, + "grad_norm": 54110.00390625, + "learning_rate": 3.0571476888076046e-07, + "loss": 2.0947, + "step": 19296 + }, + { + "epoch": 3.617057169634489, + "grad_norm": 51233.26953125, + "learning_rate": 3.0484776084686027e-07, + "loss": 2.0871, + "step": 19297 + }, + { + "epoch": 3.617244611059044, + "grad_norm": 55445.72265625, + "learning_rate": 3.0398198021879267e-07, + "loss": 2.0696, + "step": 19298 + }, + { + "epoch": 3.617432052483599, + "grad_norm": 62522.83984375, + "learning_rate": 3.031174270179349e-07, + "loss": 2.0634, + "step": 19299 + }, + { + "epoch": 3.617619493908154, + "grad_norm": 61906.80078125, + "learning_rate": 3.022541012656477e-07, + "loss": 2.0551, + "step": 19300 + }, + { + "epoch": 3.6178069353327085, + "grad_norm": 55431.640625, + "learning_rate": 3.013920029832473e-07, + "loss": 2.092, + "step": 19301 + }, + { + "epoch": 3.617994376757263, + "grad_norm": 54901.015625, + "learning_rate": 3.005311321920334e-07, + "loss": 2.1448, + "step": 19302 + }, + { + "epoch": 3.618181818181818, + "grad_norm": 48364.58203125, + "learning_rate": 2.996714889132557e-07, + "loss": 2.0494, + "step": 19303 + }, + { + "epoch": 3.6183692596063732, + "grad_norm": 58748.49609375, + "learning_rate": 2.988130731681582e-07, + "loss": 2.0142, + "step": 19304 + }, + { + "epoch": 3.618556701030928, + "grad_norm": 57518.6484375, + "learning_rate": 2.9795588497794623e-07, + "loss": 2.0584, + "step": 19305 + }, + { + "epoch": 3.6187441424554825, + "grad_norm": 51233.83203125, + "learning_rate": 2.970999243637751e-07, + "loss": 2.0596, + "step": 19306 + }, + { + "epoch": 3.6189315838800376, + "grad_norm": 51808.55859375, + "learning_rate": 2.9624519134680006e-07, + "loss": 2.0688, + "step": 19307 + }, + { + "epoch": 3.619119025304592, + "grad_norm": 57730.390625, + "learning_rate": 2.953916859481265e-07, + "loss": 2.0206, + "step": 19308 + }, + { + "epoch": 3.6193064667291472, + "grad_norm": 54381.28125, + "learning_rate": 2.9453940818883754e-07, + "loss": 2.0875, + "step": 19309 + }, + { + "epoch": 3.619493908153702, + "grad_norm": 58173.40625, + "learning_rate": 2.936883580899774e-07, + "loss": 2.0631, + "step": 19310 + }, + { + "epoch": 3.619681349578257, + "grad_norm": 54995.578125, + "learning_rate": 2.9283853567256825e-07, + "loss": 2.058, + "step": 19311 + }, + { + "epoch": 3.6198687910028116, + "grad_norm": 54624.07421875, + "learning_rate": 2.9198994095760437e-07, + "loss": 1.9684, + "step": 19312 + }, + { + "epoch": 3.620056232427366, + "grad_norm": 52360.54296875, + "learning_rate": 2.9114257396604114e-07, + "loss": 2.0982, + "step": 19313 + }, + { + "epoch": 3.6202436738519213, + "grad_norm": 62584.84765625, + "learning_rate": 2.9029643471880643e-07, + "loss": 2.0241, + "step": 19314 + }, + { + "epoch": 3.6204311152764763, + "grad_norm": 54120.546875, + "learning_rate": 2.8945152323680003e-07, + "loss": 2.0796, + "step": 19315 + }, + { + "epoch": 3.620618556701031, + "grad_norm": 54547.86328125, + "learning_rate": 2.886078395408942e-07, + "loss": 2.2334, + "step": 19316 + }, + { + "epoch": 3.6208059981255856, + "grad_norm": 56034.9375, + "learning_rate": 2.8776538365191676e-07, + "loss": 2.0646, + "step": 19317 + }, + { + "epoch": 3.6209934395501406, + "grad_norm": 59834.6953125, + "learning_rate": 2.869241555906843e-07, + "loss": 2.0485, + "step": 19318 + }, + { + "epoch": 3.6211808809746953, + "grad_norm": 58526.59375, + "learning_rate": 2.860841553779747e-07, + "loss": 2.0574, + "step": 19319 + }, + { + "epoch": 3.6213683223992503, + "grad_norm": 56737.0390625, + "learning_rate": 2.85245383034527e-07, + "loss": 2.0435, + "step": 19320 + }, + { + "epoch": 3.621555763823805, + "grad_norm": 55167.078125, + "learning_rate": 2.8440783858106337e-07, + "loss": 2.0217, + "step": 19321 + }, + { + "epoch": 3.62174320524836, + "grad_norm": 57949.35546875, + "learning_rate": 2.835715220382673e-07, + "loss": 2.0712, + "step": 19322 + }, + { + "epoch": 3.6219306466729146, + "grad_norm": 56713.44921875, + "learning_rate": 2.8273643342680014e-07, + "loss": 2.0828, + "step": 19323 + }, + { + "epoch": 3.6221180880974693, + "grad_norm": 56526.6328125, + "learning_rate": 2.819025727672786e-07, + "loss": 2.0822, + "step": 19324 + }, + { + "epoch": 3.6223055295220243, + "grad_norm": 55238.80859375, + "learning_rate": 2.810699400803085e-07, + "loss": 2.0763, + "step": 19325 + }, + { + "epoch": 3.6224929709465794, + "grad_norm": 53397.375, + "learning_rate": 2.802385353864456e-07, + "loss": 2.0408, + "step": 19326 + }, + { + "epoch": 3.622680412371134, + "grad_norm": 55920.9296875, + "learning_rate": 2.7940835870622903e-07, + "loss": 2.1388, + "step": 19327 + }, + { + "epoch": 3.6228678537956887, + "grad_norm": 61325.45703125, + "learning_rate": 2.785794100601646e-07, + "loss": 2.1, + "step": 19328 + }, + { + "epoch": 3.6230552952202437, + "grad_norm": 57443.87109375, + "learning_rate": 2.777516894687249e-07, + "loss": 2.0624, + "step": 19329 + }, + { + "epoch": 3.6232427366447983, + "grad_norm": 61453.109375, + "learning_rate": 2.7692519695234897e-07, + "loss": 2.1433, + "step": 19330 + }, + { + "epoch": 3.6234301780693534, + "grad_norm": 59367.29296875, + "learning_rate": 2.7609993253145394e-07, + "loss": 2.0295, + "step": 19331 + }, + { + "epoch": 3.623617619493908, + "grad_norm": 59314.6796875, + "learning_rate": 2.7527589622642345e-07, + "loss": 2.0894, + "step": 19332 + }, + { + "epoch": 3.623805060918463, + "grad_norm": 52951.0, + "learning_rate": 2.7445308805760793e-07, + "loss": 2.0319, + "step": 19333 + }, + { + "epoch": 3.6239925023430177, + "grad_norm": 53699.5546875, + "learning_rate": 2.7363150804533e-07, + "loss": 2.1271, + "step": 19334 + }, + { + "epoch": 3.624179943767573, + "grad_norm": 56746.265625, + "learning_rate": 2.728111562098901e-07, + "loss": 2.0581, + "step": 19335 + }, + { + "epoch": 3.6243673851921274, + "grad_norm": 54333.37109375, + "learning_rate": 2.7199203257153305e-07, + "loss": 2.0704, + "step": 19336 + }, + { + "epoch": 3.6245548266166825, + "grad_norm": 55916.93359375, + "learning_rate": 2.7117413715050387e-07, + "loss": 2.071, + "step": 19337 + }, + { + "epoch": 3.624742268041237, + "grad_norm": 56360.46875, + "learning_rate": 2.703574699669975e-07, + "loss": 2.1011, + "step": 19338 + }, + { + "epoch": 3.6249297094657917, + "grad_norm": 60485.0703125, + "learning_rate": 2.6954203104119224e-07, + "loss": 2.002, + "step": 19339 + }, + { + "epoch": 3.625117150890347, + "grad_norm": 55018.17578125, + "learning_rate": 2.6872782039321644e-07, + "loss": 2.0722, + "step": 19340 + }, + { + "epoch": 3.6253045923149014, + "grad_norm": 57000.80859375, + "learning_rate": 2.679148380431873e-07, + "loss": 1.9533, + "step": 19341 + }, + { + "epoch": 3.6254920337394565, + "grad_norm": 53661.77734375, + "learning_rate": 2.6710308401118324e-07, + "loss": 2.1169, + "step": 19342 + }, + { + "epoch": 3.625679475164011, + "grad_norm": 55311.6796875, + "learning_rate": 2.662925583172493e-07, + "loss": 2.0803, + "step": 19343 + }, + { + "epoch": 3.625866916588566, + "grad_norm": 54374.28515625, + "learning_rate": 2.6548326098141397e-07, + "loss": 2.0691, + "step": 19344 + }, + { + "epoch": 3.626054358013121, + "grad_norm": 62142.9609375, + "learning_rate": 2.6467519202365566e-07, + "loss": 2.0734, + "step": 19345 + }, + { + "epoch": 3.626241799437676, + "grad_norm": 53745.81640625, + "learning_rate": 2.6386835146394175e-07, + "loss": 2.087, + "step": 19346 + }, + { + "epoch": 3.6264292408622305, + "grad_norm": 55859.671875, + "learning_rate": 2.6306273932219514e-07, + "loss": 2.0806, + "step": 19347 + }, + { + "epoch": 3.6266166822867856, + "grad_norm": 53511.7734375, + "learning_rate": 2.622583556183111e-07, + "loss": 2.0819, + "step": 19348 + }, + { + "epoch": 3.62680412371134, + "grad_norm": 52831.109375, + "learning_rate": 2.6145520037216263e-07, + "loss": 2.0336, + "step": 19349 + }, + { + "epoch": 3.626991565135895, + "grad_norm": 51428.3125, + "learning_rate": 2.6065327360358383e-07, + "loss": 2.065, + "step": 19350 + }, + { + "epoch": 3.62717900656045, + "grad_norm": 56059.18359375, + "learning_rate": 2.5985257533237553e-07, + "loss": 1.9863, + "step": 19351 + }, + { + "epoch": 3.627366447985005, + "grad_norm": 59529.75390625, + "learning_rate": 2.590531055783274e-07, + "loss": 2.0447, + "step": 19352 + }, + { + "epoch": 3.6275538894095596, + "grad_norm": 61035.58984375, + "learning_rate": 2.582548643611737e-07, + "loss": 2.0395, + "step": 19353 + }, + { + "epoch": 3.627741330834114, + "grad_norm": 52704.92578125, + "learning_rate": 2.5745785170063205e-07, + "loss": 2.0677, + "step": 19354 + }, + { + "epoch": 3.6279287722586693, + "grad_norm": 54041.65625, + "learning_rate": 2.5666206761639223e-07, + "loss": 2.0789, + "step": 19355 + }, + { + "epoch": 3.628116213683224, + "grad_norm": 53578.9296875, + "learning_rate": 2.558675121281051e-07, + "loss": 2.1141, + "step": 19356 + }, + { + "epoch": 3.628303655107779, + "grad_norm": 58623.3046875, + "learning_rate": 2.550741852553995e-07, + "loss": 1.9984, + "step": 19357 + }, + { + "epoch": 3.6284910965323336, + "grad_norm": 57852.1796875, + "learning_rate": 2.5428208701786527e-07, + "loss": 2.0255, + "step": 19358 + }, + { + "epoch": 3.6286785379568887, + "grad_norm": 54867.2578125, + "learning_rate": 2.53491217435059e-07, + "loss": 2.0521, + "step": 19359 + }, + { + "epoch": 3.6288659793814433, + "grad_norm": 53760.51953125, + "learning_rate": 2.527015765265317e-07, + "loss": 2.0548, + "step": 19360 + }, + { + "epoch": 3.629053420805998, + "grad_norm": 58019.53515625, + "learning_rate": 2.5191316431177337e-07, + "loss": 2.0714, + "step": 19361 + }, + { + "epoch": 3.629240862230553, + "grad_norm": 57909.859375, + "learning_rate": 2.5112598081026285e-07, + "loss": 2.1117, + "step": 19362 + }, + { + "epoch": 3.629428303655108, + "grad_norm": 53452.8046875, + "learning_rate": 2.5034002604143457e-07, + "loss": 2.1523, + "step": 19363 + }, + { + "epoch": 3.6296157450796627, + "grad_norm": 53659.4375, + "learning_rate": 2.4955530002471194e-07, + "loss": 2.0479, + "step": 19364 + }, + { + "epoch": 3.6298031865042173, + "grad_norm": 56647.89453125, + "learning_rate": 2.487718027794683e-07, + "loss": 2.0596, + "step": 19365 + }, + { + "epoch": 3.6299906279287724, + "grad_norm": 59479.82421875, + "learning_rate": 2.479895343250604e-07, + "loss": 2.094, + "step": 19366 + }, + { + "epoch": 3.630178069353327, + "grad_norm": 57833.58984375, + "learning_rate": 2.4720849468080066e-07, + "loss": 2.0947, + "step": 19367 + }, + { + "epoch": 3.630365510777882, + "grad_norm": 56559.11328125, + "learning_rate": 2.464286838659957e-07, + "loss": 1.9951, + "step": 19368 + }, + { + "epoch": 3.6305529522024367, + "grad_norm": 58581.421875, + "learning_rate": 2.456501018998858e-07, + "loss": 2.1152, + "step": 19369 + }, + { + "epoch": 3.6307403936269917, + "grad_norm": 52748.390625, + "learning_rate": 2.4487274880171664e-07, + "loss": 2.1103, + "step": 19370 + }, + { + "epoch": 3.6309278350515464, + "grad_norm": 59831.390625, + "learning_rate": 2.440966245906784e-07, + "loss": 2.0769, + "step": 19371 + }, + { + "epoch": 3.631115276476101, + "grad_norm": 66794.9140625, + "learning_rate": 2.433217292859447e-07, + "loss": 2.0697, + "step": 19372 + }, + { + "epoch": 3.631302717900656, + "grad_norm": 54544.28515625, + "learning_rate": 2.4254806290665567e-07, + "loss": 2.0785, + "step": 19373 + }, + { + "epoch": 3.631490159325211, + "grad_norm": 55366.5, + "learning_rate": 2.4177562547191837e-07, + "loss": 2.0336, + "step": 19374 + }, + { + "epoch": 3.6316776007497658, + "grad_norm": 55633.91015625, + "learning_rate": 2.4100441700080636e-07, + "loss": 2.1249, + "step": 19375 + }, + { + "epoch": 3.6318650421743204, + "grad_norm": 58879.7734375, + "learning_rate": 2.402344375123766e-07, + "loss": 2.1332, + "step": 19376 + }, + { + "epoch": 3.6320524835988754, + "grad_norm": 53294.91015625, + "learning_rate": 2.3946568702564174e-07, + "loss": 2.0781, + "step": 19377 + }, + { + "epoch": 3.63223992502343, + "grad_norm": 53925.5625, + "learning_rate": 2.3869816555958655e-07, + "loss": 2.0862, + "step": 19378 + }, + { + "epoch": 3.632427366447985, + "grad_norm": 57331.21875, + "learning_rate": 2.3793187313317368e-07, + "loss": 2.0622, + "step": 19379 + }, + { + "epoch": 3.6326148078725398, + "grad_norm": 56918.36328125, + "learning_rate": 2.3716680976532123e-07, + "loss": 2.0776, + "step": 19380 + }, + { + "epoch": 3.632802249297095, + "grad_norm": 53678.94921875, + "learning_rate": 2.3640297547493084e-07, + "loss": 2.0385, + "step": 19381 + }, + { + "epoch": 3.6329896907216495, + "grad_norm": 50991.32421875, + "learning_rate": 2.356403702808707e-07, + "loss": 2.0617, + "step": 19382 + }, + { + "epoch": 3.633177132146204, + "grad_norm": 59922.4296875, + "learning_rate": 2.3487899420197024e-07, + "loss": 1.9996, + "step": 19383 + }, + { + "epoch": 3.633364573570759, + "grad_norm": 57125.7109375, + "learning_rate": 2.3411884725703658e-07, + "loss": 2.0663, + "step": 19384 + }, + { + "epoch": 3.633552014995314, + "grad_norm": 57140.75, + "learning_rate": 2.333599294648492e-07, + "loss": 2.0668, + "step": 19385 + }, + { + "epoch": 3.633739456419869, + "grad_norm": 52620.6953125, + "learning_rate": 2.3260224084414306e-07, + "loss": 2.0428, + "step": 19386 + }, + { + "epoch": 3.6339268978444235, + "grad_norm": 51249.25, + "learning_rate": 2.318457814136421e-07, + "loss": 2.058, + "step": 19387 + }, + { + "epoch": 3.6341143392689785, + "grad_norm": 63723.8984375, + "learning_rate": 2.3109055119202028e-07, + "loss": 2.1766, + "step": 19388 + }, + { + "epoch": 3.634301780693533, + "grad_norm": 57511.69140625, + "learning_rate": 2.3033655019794042e-07, + "loss": 2.1818, + "step": 19389 + }, + { + "epoch": 3.6344892221180882, + "grad_norm": 51803.5390625, + "learning_rate": 2.2958377845001544e-07, + "loss": 2.0855, + "step": 19390 + }, + { + "epoch": 3.634676663542643, + "grad_norm": 50614.9296875, + "learning_rate": 2.288322359668471e-07, + "loss": 2.0408, + "step": 19391 + }, + { + "epoch": 3.634864104967198, + "grad_norm": 60579.34765625, + "learning_rate": 2.2808192276698725e-07, + "loss": 2.0381, + "step": 19392 + }, + { + "epoch": 3.6350515463917525, + "grad_norm": 53578.54296875, + "learning_rate": 2.2733283886897662e-07, + "loss": 2.0023, + "step": 19393 + }, + { + "epoch": 3.635238987816307, + "grad_norm": 53138.08203125, + "learning_rate": 2.2658498429131703e-07, + "loss": 2.0964, + "step": 19394 + }, + { + "epoch": 3.6354264292408622, + "grad_norm": 57918.8046875, + "learning_rate": 2.2583835905247707e-07, + "loss": 2.0802, + "step": 19395 + }, + { + "epoch": 3.6356138706654173, + "grad_norm": 54833.9765625, + "learning_rate": 2.25092963170892e-07, + "loss": 2.0653, + "step": 19396 + }, + { + "epoch": 3.635801312089972, + "grad_norm": 57121.12109375, + "learning_rate": 2.2434879666497488e-07, + "loss": 2.0269, + "step": 19397 + }, + { + "epoch": 3.6359887535145266, + "grad_norm": 54991.09765625, + "learning_rate": 2.2360585955311097e-07, + "loss": 1.9649, + "step": 19398 + }, + { + "epoch": 3.6361761949390816, + "grad_norm": 54067.29296875, + "learning_rate": 2.2286415185364673e-07, + "loss": 2.0756, + "step": 19399 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 56763.56640625, + "learning_rate": 2.2212367358489527e-07, + "loss": 2.1192, + "step": 19400 + }, + { + "epoch": 3.6365510777881913, + "grad_norm": 57226.88671875, + "learning_rate": 2.2138442476515863e-07, + "loss": 2.084, + "step": 19401 + }, + { + "epoch": 3.636738519212746, + "grad_norm": 61427.9375, + "learning_rate": 2.206464054126889e-07, + "loss": 2.1046, + "step": 19402 + }, + { + "epoch": 3.636925960637301, + "grad_norm": 55840.61328125, + "learning_rate": 2.1990961554570478e-07, + "loss": 2.1123, + "step": 19403 + }, + { + "epoch": 3.6371134020618556, + "grad_norm": 58790.953125, + "learning_rate": 2.1917405518241952e-07, + "loss": 2.0143, + "step": 19404 + }, + { + "epoch": 3.6373008434864103, + "grad_norm": 56219.89453125, + "learning_rate": 2.1843972434099082e-07, + "loss": 2.1073, + "step": 19405 + }, + { + "epoch": 3.6374882849109653, + "grad_norm": 60620.4921875, + "learning_rate": 2.1770662303955412e-07, + "loss": 2.1501, + "step": 19406 + }, + { + "epoch": 3.6376757263355204, + "grad_norm": 56240.8984375, + "learning_rate": 2.169747512962228e-07, + "loss": 2.0837, + "step": 19407 + }, + { + "epoch": 3.637863167760075, + "grad_norm": 58523.3515625, + "learning_rate": 2.1624410912907124e-07, + "loss": 2.0888, + "step": 19408 + }, + { + "epoch": 3.6380506091846296, + "grad_norm": 57931.3984375, + "learning_rate": 2.155146965561461e-07, + "loss": 2.1644, + "step": 19409 + }, + { + "epoch": 3.6382380506091847, + "grad_norm": 55233.5234375, + "learning_rate": 2.1478651359546076e-07, + "loss": 2.0575, + "step": 19410 + }, + { + "epoch": 3.6384254920337393, + "grad_norm": 51935.546875, + "learning_rate": 2.1405956026500084e-07, + "loss": 2.1034, + "step": 19411 + }, + { + "epoch": 3.6386129334582944, + "grad_norm": 53174.9609375, + "learning_rate": 2.1333383658271866e-07, + "loss": 2.1355, + "step": 19412 + }, + { + "epoch": 3.638800374882849, + "grad_norm": 58764.28125, + "learning_rate": 2.126093425665443e-07, + "loss": 2.0354, + "step": 19413 + }, + { + "epoch": 3.638987816307404, + "grad_norm": 57233.23828125, + "learning_rate": 2.11886078234369e-07, + "loss": 2.0345, + "step": 19414 + }, + { + "epoch": 3.6391752577319587, + "grad_norm": 59578.50390625, + "learning_rate": 2.1116404360405074e-07, + "loss": 2.1203, + "step": 19415 + }, + { + "epoch": 3.6393626991565133, + "grad_norm": 57027.078125, + "learning_rate": 2.104432386934363e-07, + "loss": 2.157, + "step": 19416 + }, + { + "epoch": 3.6395501405810684, + "grad_norm": 62351.55078125, + "learning_rate": 2.0972366352031701e-07, + "loss": 2.1265, + "step": 19417 + }, + { + "epoch": 3.6397375820056235, + "grad_norm": 57534.4609375, + "learning_rate": 2.090053181024676e-07, + "loss": 2.0193, + "step": 19418 + }, + { + "epoch": 3.639925023430178, + "grad_norm": 53063.9453125, + "learning_rate": 2.0828820245763492e-07, + "loss": 2.0724, + "step": 19419 + }, + { + "epoch": 3.6401124648547327, + "grad_norm": 52557.01953125, + "learning_rate": 2.0757231660352705e-07, + "loss": 2.0939, + "step": 19420 + }, + { + "epoch": 3.640299906279288, + "grad_norm": 59478.5078125, + "learning_rate": 2.068576605578243e-07, + "loss": 2.0329, + "step": 19421 + }, + { + "epoch": 3.6404873477038424, + "grad_norm": 52818.66015625, + "learning_rate": 2.0614423433818476e-07, + "loss": 2.0674, + "step": 19422 + }, + { + "epoch": 3.6406747891283975, + "grad_norm": 65842.5703125, + "learning_rate": 2.0543203796221655e-07, + "loss": 2.0611, + "step": 19423 + }, + { + "epoch": 3.640862230552952, + "grad_norm": 53909.6171875, + "learning_rate": 2.0472107144752228e-07, + "loss": 2.0886, + "step": 19424 + }, + { + "epoch": 3.641049671977507, + "grad_norm": 62255.90234375, + "learning_rate": 2.0401133481165458e-07, + "loss": 2.0599, + "step": 19425 + }, + { + "epoch": 3.641237113402062, + "grad_norm": 57534.1796875, + "learning_rate": 2.033028280721494e-07, + "loss": 2.0358, + "step": 19426 + }, + { + "epoch": 3.6414245548266164, + "grad_norm": 54596.375, + "learning_rate": 2.0259555124649832e-07, + "loss": 2.0535, + "step": 19427 + }, + { + "epoch": 3.6416119962511715, + "grad_norm": 52699.29296875, + "learning_rate": 2.0188950435217625e-07, + "loss": 2.0495, + "step": 19428 + }, + { + "epoch": 3.6417994376757266, + "grad_norm": 58157.33203125, + "learning_rate": 2.0118468740661368e-07, + "loss": 2.0787, + "step": 19429 + }, + { + "epoch": 3.641986879100281, + "grad_norm": 57185.15625, + "learning_rate": 2.0048110042723e-07, + "loss": 1.9698, + "step": 19430 + }, + { + "epoch": 3.642174320524836, + "grad_norm": 57156.40234375, + "learning_rate": 1.997787434313947e-07, + "loss": 2.0378, + "step": 19431 + }, + { + "epoch": 3.642361761949391, + "grad_norm": 55609.0625, + "learning_rate": 1.990776164364605e-07, + "loss": 2.0596, + "step": 19432 + }, + { + "epoch": 3.6425492033739455, + "grad_norm": 55121.0234375, + "learning_rate": 1.9837771945973583e-07, + "loss": 2.1077, + "step": 19433 + }, + { + "epoch": 3.6427366447985006, + "grad_norm": 59251.55859375, + "learning_rate": 1.9767905251851792e-07, + "loss": 2.0792, + "step": 19434 + }, + { + "epoch": 3.642924086223055, + "grad_norm": 58495.78125, + "learning_rate": 1.9698161563005412e-07, + "loss": 2.04, + "step": 19435 + }, + { + "epoch": 3.6431115276476103, + "grad_norm": 54187.0078125, + "learning_rate": 1.962854088115751e-07, + "loss": 2.1232, + "step": 19436 + }, + { + "epoch": 3.643298969072165, + "grad_norm": 55709.484375, + "learning_rate": 1.9559043208027262e-07, + "loss": 2.0635, + "step": 19437 + }, + { + "epoch": 3.6434864104967195, + "grad_norm": 56635.89453125, + "learning_rate": 1.9489668545332184e-07, + "loss": 2.1086, + "step": 19438 + }, + { + "epoch": 3.6436738519212746, + "grad_norm": 51307.0234375, + "learning_rate": 1.9420416894784242e-07, + "loss": 2.0726, + "step": 19439 + }, + { + "epoch": 3.6438612933458296, + "grad_norm": 52868.80859375, + "learning_rate": 1.9351288258094847e-07, + "loss": 2.05, + "step": 19440 + }, + { + "epoch": 3.6440487347703843, + "grad_norm": 57375.80078125, + "learning_rate": 1.9282282636970962e-07, + "loss": 2.0133, + "step": 19441 + }, + { + "epoch": 3.644236176194939, + "grad_norm": 56193.05859375, + "learning_rate": 1.9213400033117336e-07, + "loss": 2.0724, + "step": 19442 + }, + { + "epoch": 3.644423617619494, + "grad_norm": 59996.63671875, + "learning_rate": 1.914464044823483e-07, + "loss": 2.074, + "step": 19443 + }, + { + "epoch": 3.6446110590440486, + "grad_norm": 55000.23828125, + "learning_rate": 1.9076003884022087e-07, + "loss": 2.1016, + "step": 19444 + }, + { + "epoch": 3.6447985004686037, + "grad_norm": 57303.66015625, + "learning_rate": 1.900749034217386e-07, + "loss": 1.9997, + "step": 19445 + }, + { + "epoch": 3.6449859418931583, + "grad_norm": 58796.26171875, + "learning_rate": 1.8939099824383243e-07, + "loss": 2.0826, + "step": 19446 + }, + { + "epoch": 3.6451733833177133, + "grad_norm": 50653.62109375, + "learning_rate": 1.887083233233833e-07, + "loss": 2.1411, + "step": 19447 + }, + { + "epoch": 3.645360824742268, + "grad_norm": 53705.67578125, + "learning_rate": 1.88026878677261e-07, + "loss": 2.093, + "step": 19448 + }, + { + "epoch": 3.6455482661668226, + "grad_norm": 54431.640625, + "learning_rate": 1.8734666432228544e-07, + "loss": 1.9979, + "step": 19449 + }, + { + "epoch": 3.6457357075913777, + "grad_norm": 62778.2109375, + "learning_rate": 1.8666768027527093e-07, + "loss": 2.0284, + "step": 19450 + }, + { + "epoch": 3.6459231490159327, + "grad_norm": 52407.3515625, + "learning_rate": 1.8598992655298187e-07, + "loss": 2.0501, + "step": 19451 + }, + { + "epoch": 3.6461105904404874, + "grad_norm": 54974.3359375, + "learning_rate": 1.8531340317215483e-07, + "loss": 2.0768, + "step": 19452 + }, + { + "epoch": 3.646298031865042, + "grad_norm": 57334.19921875, + "learning_rate": 1.846381101494987e-07, + "loss": 2.0377, + "step": 19453 + }, + { + "epoch": 3.646485473289597, + "grad_norm": 58050.79296875, + "learning_rate": 1.839640475016946e-07, + "loss": 1.9833, + "step": 19454 + }, + { + "epoch": 3.6466729147141517, + "grad_norm": 56768.3359375, + "learning_rate": 1.8329121524539583e-07, + "loss": 2.1257, + "step": 19455 + }, + { + "epoch": 3.6468603561387067, + "grad_norm": 58232.36328125, + "learning_rate": 1.8261961339721134e-07, + "loss": 2.0109, + "step": 19456 + }, + { + "epoch": 3.6470477975632614, + "grad_norm": 57233.40234375, + "learning_rate": 1.8194924197373342e-07, + "loss": 2.1477, + "step": 19457 + }, + { + "epoch": 3.6472352389878164, + "grad_norm": 55168.3515625, + "learning_rate": 1.8128010099152103e-07, + "loss": 2.0544, + "step": 19458 + }, + { + "epoch": 3.647422680412371, + "grad_norm": 55608.16796875, + "learning_rate": 1.8061219046709987e-07, + "loss": 2.0593, + "step": 19459 + }, + { + "epoch": 3.647610121836926, + "grad_norm": 53810.65625, + "learning_rate": 1.7994551041696228e-07, + "loss": 2.0211, + "step": 19460 + }, + { + "epoch": 3.6477975632614807, + "grad_norm": 56480.7265625, + "learning_rate": 1.7928006085758397e-07, + "loss": 2.0269, + "step": 19461 + }, + { + "epoch": 3.647985004686036, + "grad_norm": 54733.37890625, + "learning_rate": 1.786158418053907e-07, + "loss": 2.0028, + "step": 19462 + }, + { + "epoch": 3.6481724461105904, + "grad_norm": 59668.08203125, + "learning_rate": 1.7795285327678602e-07, + "loss": 2.0992, + "step": 19463 + }, + { + "epoch": 3.648359887535145, + "grad_norm": 56147.90625, + "learning_rate": 1.772910952881568e-07, + "loss": 2.0634, + "step": 19464 + }, + { + "epoch": 3.6485473289597, + "grad_norm": 54869.91015625, + "learning_rate": 1.7663056785584554e-07, + "loss": 2.1024, + "step": 19465 + }, + { + "epoch": 3.6487347703842548, + "grad_norm": 53113.40234375, + "learning_rate": 1.7597127099615586e-07, + "loss": 1.9985, + "step": 19466 + }, + { + "epoch": 3.64892221180881, + "grad_norm": 58169.859375, + "learning_rate": 1.7531320472538028e-07, + "loss": 2.0606, + "step": 19467 + }, + { + "epoch": 3.6491096532333644, + "grad_norm": 54163.88671875, + "learning_rate": 1.746563690597669e-07, + "loss": 2.0785, + "step": 19468 + }, + { + "epoch": 3.6492970946579195, + "grad_norm": 54533.25, + "learning_rate": 1.7400076401554723e-07, + "loss": 2.1067, + "step": 19469 + }, + { + "epoch": 3.649484536082474, + "grad_norm": 60510.69140625, + "learning_rate": 1.7334638960890271e-07, + "loss": 2.0574, + "step": 19470 + }, + { + "epoch": 3.649671977507029, + "grad_norm": 50108.59765625, + "learning_rate": 1.7269324585600378e-07, + "loss": 2.1145, + "step": 19471 + }, + { + "epoch": 3.649859418931584, + "grad_norm": 54533.56640625, + "learning_rate": 1.7204133277298195e-07, + "loss": 2.0617, + "step": 19472 + }, + { + "epoch": 3.650046860356139, + "grad_norm": 51338.01171875, + "learning_rate": 1.7139065037593549e-07, + "loss": 2.0292, + "step": 19473 + }, + { + "epoch": 3.6502343017806935, + "grad_norm": 59628.84765625, + "learning_rate": 1.7074119868093485e-07, + "loss": 2.1011, + "step": 19474 + }, + { + "epoch": 3.650421743205248, + "grad_norm": 53031.0078125, + "learning_rate": 1.7009297770401721e-07, + "loss": 2.1089, + "step": 19475 + }, + { + "epoch": 3.650609184629803, + "grad_norm": 54507.46875, + "learning_rate": 1.6944598746120311e-07, + "loss": 2.034, + "step": 19476 + }, + { + "epoch": 3.6507966260543583, + "grad_norm": 59518.89453125, + "learning_rate": 1.6880022796846863e-07, + "loss": 2.0586, + "step": 19477 + }, + { + "epoch": 3.650984067478913, + "grad_norm": 61451.2109375, + "learning_rate": 1.6815569924175655e-07, + "loss": 2.0473, + "step": 19478 + }, + { + "epoch": 3.6511715089034675, + "grad_norm": 55974.5390625, + "learning_rate": 1.675124012969931e-07, + "loss": 2.0462, + "step": 19479 + }, + { + "epoch": 3.6513589503280226, + "grad_norm": 52108.91015625, + "learning_rate": 1.6687033415006547e-07, + "loss": 2.027, + "step": 19480 + }, + { + "epoch": 3.6515463917525772, + "grad_norm": 62040.12109375, + "learning_rate": 1.6622949781682773e-07, + "loss": 2.1415, + "step": 19481 + }, + { + "epoch": 3.6517338331771323, + "grad_norm": 57355.43359375, + "learning_rate": 1.6558989231311716e-07, + "loss": 2.1062, + "step": 19482 + }, + { + "epoch": 3.651921274601687, + "grad_norm": 53786.3203125, + "learning_rate": 1.6495151765471563e-07, + "loss": 2.1644, + "step": 19483 + }, + { + "epoch": 3.652108716026242, + "grad_norm": 50312.1171875, + "learning_rate": 1.643143738574049e-07, + "loss": 2.0159, + "step": 19484 + }, + { + "epoch": 3.6522961574507966, + "grad_norm": 54274.109375, + "learning_rate": 1.6367846093691685e-07, + "loss": 2.0647, + "step": 19485 + }, + { + "epoch": 3.6524835988753512, + "grad_norm": 53971.0625, + "learning_rate": 1.630437789089556e-07, + "loss": 2.0582, + "step": 19486 + }, + { + "epoch": 3.6526710402999063, + "grad_norm": 55029.0078125, + "learning_rate": 1.6241032778920307e-07, + "loss": 2.0045, + "step": 19487 + }, + { + "epoch": 3.6528584817244614, + "grad_norm": 51157.56640625, + "learning_rate": 1.617781075932967e-07, + "loss": 2.0498, + "step": 19488 + }, + { + "epoch": 3.653045923149016, + "grad_norm": 53554.94921875, + "learning_rate": 1.611471183368518e-07, + "loss": 2.1032, + "step": 19489 + }, + { + "epoch": 3.6532333645735706, + "grad_norm": 52802.5, + "learning_rate": 1.6051736003545592e-07, + "loss": 2.0464, + "step": 19490 + }, + { + "epoch": 3.6534208059981257, + "grad_norm": 58806.9375, + "learning_rate": 1.5988883270466882e-07, + "loss": 2.0944, + "step": 19491 + }, + { + "epoch": 3.6536082474226803, + "grad_norm": 53857.60546875, + "learning_rate": 1.5926153636000584e-07, + "loss": 1.9944, + "step": 19492 + }, + { + "epoch": 3.6537956888472354, + "grad_norm": 62161.5078125, + "learning_rate": 1.5863547101696573e-07, + "loss": 1.9261, + "step": 19493 + }, + { + "epoch": 3.65398313027179, + "grad_norm": 53690.6796875, + "learning_rate": 1.5801063669100834e-07, + "loss": 2.0992, + "step": 19494 + }, + { + "epoch": 3.654170571696345, + "grad_norm": 55448.51171875, + "learning_rate": 1.5738703339756579e-07, + "loss": 2.1309, + "step": 19495 + }, + { + "epoch": 3.6543580131208997, + "grad_norm": 54239.453125, + "learning_rate": 1.567646611520368e-07, + "loss": 2.0883, + "step": 19496 + }, + { + "epoch": 3.6545454545454543, + "grad_norm": 55777.09375, + "learning_rate": 1.5614351996980913e-07, + "loss": 2.022, + "step": 19497 + }, + { + "epoch": 3.6547328959700094, + "grad_norm": 55712.33203125, + "learning_rate": 1.5552360986620384e-07, + "loss": 2.0833, + "step": 19498 + }, + { + "epoch": 3.6549203373945645, + "grad_norm": 54845.7265625, + "learning_rate": 1.54904930856542e-07, + "loss": 2.0541, + "step": 19499 + }, + { + "epoch": 3.655107778819119, + "grad_norm": 56210.58984375, + "learning_rate": 1.5428748295610584e-07, + "loss": 2.1126, + "step": 19500 + }, + { + "epoch": 3.655107778819119, + "eval_loss": 2.256906747817993, + "eval_runtime": 128.3778, + "eval_samples_per_second": 39.329, + "eval_steps_per_second": 1.971, + "step": 19500 + }, + { + "epoch": 3.6552952202436737, + "grad_norm": 54056.59765625, + "learning_rate": 1.5367126618014428e-07, + "loss": 2.1405, + "step": 19501 + }, + { + "epoch": 3.6554826616682288, + "grad_norm": 56508.890625, + "learning_rate": 1.530562805438729e-07, + "loss": 2.0761, + "step": 19502 + }, + { + "epoch": 3.6556701030927834, + "grad_norm": 58800.1171875, + "learning_rate": 1.524425260624851e-07, + "loss": 2.0686, + "step": 19503 + }, + { + "epoch": 3.6558575445173385, + "grad_norm": 55153.49609375, + "learning_rate": 1.5183000275114102e-07, + "loss": 2.0375, + "step": 19504 + }, + { + "epoch": 3.656044985941893, + "grad_norm": 53148.09765625, + "learning_rate": 1.5121871062496185e-07, + "loss": 2.0843, + "step": 19505 + }, + { + "epoch": 3.656232427366448, + "grad_norm": 63056.79296875, + "learning_rate": 1.506086496990522e-07, + "loss": 2.0393, + "step": 19506 + }, + { + "epoch": 3.656419868791003, + "grad_norm": 51497.61328125, + "learning_rate": 1.4999981998847779e-07, + "loss": 2.2651, + "step": 19507 + }, + { + "epoch": 3.6566073102155574, + "grad_norm": 56857.85546875, + "learning_rate": 1.493922215082766e-07, + "loss": 2.0122, + "step": 19508 + }, + { + "epoch": 3.6567947516401125, + "grad_norm": 56787.109375, + "learning_rate": 1.487858542734588e-07, + "loss": 2.1213, + "step": 19509 + }, + { + "epoch": 3.6569821930646675, + "grad_norm": 55739.140625, + "learning_rate": 1.4818071829899027e-07, + "loss": 2.0497, + "step": 19510 + }, + { + "epoch": 3.657169634489222, + "grad_norm": 53997.12890625, + "learning_rate": 1.475768135998312e-07, + "loss": 2.0782, + "step": 19511 + }, + { + "epoch": 3.657357075913777, + "grad_norm": 61320.83984375, + "learning_rate": 1.469741401908864e-07, + "loss": 2.079, + "step": 19512 + }, + { + "epoch": 3.657544517338332, + "grad_norm": 57985.61328125, + "learning_rate": 1.463726980870439e-07, + "loss": 2.0181, + "step": 19513 + }, + { + "epoch": 3.6577319587628865, + "grad_norm": 53024.20703125, + "learning_rate": 1.4577248730316406e-07, + "loss": 1.9993, + "step": 19514 + }, + { + "epoch": 3.6579194001874415, + "grad_norm": 53683.7890625, + "learning_rate": 1.451735078540628e-07, + "loss": 2.0931, + "step": 19515 + }, + { + "epoch": 3.658106841611996, + "grad_norm": 58067.21875, + "learning_rate": 1.4457575975453942e-07, + "loss": 2.0849, + "step": 19516 + }, + { + "epoch": 3.6582942830365512, + "grad_norm": 52165.57421875, + "learning_rate": 1.4397924301935984e-07, + "loss": 2.0597, + "step": 19517 + }, + { + "epoch": 3.658481724461106, + "grad_norm": 56751.046875, + "learning_rate": 1.433839576632512e-07, + "loss": 2.034, + "step": 19518 + }, + { + "epoch": 3.6586691658856605, + "grad_norm": 54407.28515625, + "learning_rate": 1.4278990370091838e-07, + "loss": 2.0649, + "step": 19519 + }, + { + "epoch": 3.6588566073102156, + "grad_norm": 56875.6875, + "learning_rate": 1.42197081147033e-07, + "loss": 2.0017, + "step": 19520 + }, + { + "epoch": 3.6590440487347706, + "grad_norm": 52202.58984375, + "learning_rate": 1.416054900162389e-07, + "loss": 2.09, + "step": 19521 + }, + { + "epoch": 3.6592314901593253, + "grad_norm": 60950.35546875, + "learning_rate": 1.410151303231466e-07, + "loss": 1.9827, + "step": 19522 + }, + { + "epoch": 3.65941893158388, + "grad_norm": 52652.57421875, + "learning_rate": 1.4042600208233337e-07, + "loss": 2.098, + "step": 19523 + }, + { + "epoch": 3.659606373008435, + "grad_norm": 60532.90234375, + "learning_rate": 1.3983810530835972e-07, + "loss": 2.0801, + "step": 19524 + }, + { + "epoch": 3.6597938144329896, + "grad_norm": 52505.82421875, + "learning_rate": 1.3925144001574186e-07, + "loss": 2.0876, + "step": 19525 + }, + { + "epoch": 3.6599812558575446, + "grad_norm": 53714.88671875, + "learning_rate": 1.3866600621896265e-07, + "loss": 2.0772, + "step": 19526 + }, + { + "epoch": 3.6601686972820993, + "grad_norm": 52234.73046875, + "learning_rate": 1.3808180393248825e-07, + "loss": 2.0058, + "step": 19527 + }, + { + "epoch": 3.6603561387066543, + "grad_norm": 55655.828125, + "learning_rate": 1.3749883317074608e-07, + "loss": 2.0996, + "step": 19528 + }, + { + "epoch": 3.660543580131209, + "grad_norm": 61890.96875, + "learning_rate": 1.3691709394813567e-07, + "loss": 1.9886, + "step": 19529 + }, + { + "epoch": 3.6607310215557636, + "grad_norm": 52125.23828125, + "learning_rate": 1.363365862790289e-07, + "loss": 2.0951, + "step": 19530 + }, + { + "epoch": 3.6609184629803186, + "grad_norm": 50824.82421875, + "learning_rate": 1.3575731017775317e-07, + "loss": 2.0809, + "step": 19531 + }, + { + "epoch": 3.6611059044048737, + "grad_norm": 55056.71875, + "learning_rate": 1.3517926565862482e-07, + "loss": 2.0376, + "step": 19532 + }, + { + "epoch": 3.6612933458294283, + "grad_norm": 59640.703125, + "learning_rate": 1.3460245273591575e-07, + "loss": 2.0648, + "step": 19533 + }, + { + "epoch": 3.661480787253983, + "grad_norm": 62790.640625, + "learning_rate": 1.3402687142387572e-07, + "loss": 2.0988, + "step": 19534 + }, + { + "epoch": 3.661668228678538, + "grad_norm": 60395.5625, + "learning_rate": 1.3345252173672107e-07, + "loss": 2.1185, + "step": 19535 + }, + { + "epoch": 3.6618556701030927, + "grad_norm": 54421.703125, + "learning_rate": 1.3287940368863494e-07, + "loss": 2.0641, + "step": 19536 + }, + { + "epoch": 3.6620431115276477, + "grad_norm": 60579.796875, + "learning_rate": 1.3230751729377267e-07, + "loss": 2.1457, + "step": 19537 + }, + { + "epoch": 3.6622305529522023, + "grad_norm": 59300.734375, + "learning_rate": 1.3173686256626737e-07, + "loss": 2.1166, + "step": 19538 + }, + { + "epoch": 3.6624179943767574, + "grad_norm": 52368.9765625, + "learning_rate": 1.311674395201967e-07, + "loss": 2.1238, + "step": 19539 + }, + { + "epoch": 3.662605435801312, + "grad_norm": 56052.1796875, + "learning_rate": 1.3059924816963832e-07, + "loss": 2.1441, + "step": 19540 + }, + { + "epoch": 3.6627928772258667, + "grad_norm": 55018.640625, + "learning_rate": 1.3003228852862537e-07, + "loss": 2.0948, + "step": 19541 + }, + { + "epoch": 3.6629803186504217, + "grad_norm": 54000.58984375, + "learning_rate": 1.2946656061115227e-07, + "loss": 2.0511, + "step": 19542 + }, + { + "epoch": 3.663167760074977, + "grad_norm": 55831.98046875, + "learning_rate": 1.2890206443120223e-07, + "loss": 2.0983, + "step": 19543 + }, + { + "epoch": 3.6633552014995314, + "grad_norm": 54531.4140625, + "learning_rate": 1.2833880000270858e-07, + "loss": 2.0193, + "step": 19544 + }, + { + "epoch": 3.663542642924086, + "grad_norm": 57637.03125, + "learning_rate": 1.27776767339588e-07, + "loss": 2.0642, + "step": 19545 + }, + { + "epoch": 3.663730084348641, + "grad_norm": 55203.1484375, + "learning_rate": 1.272159664557182e-07, + "loss": 2.0236, + "step": 19546 + }, + { + "epoch": 3.6639175257731957, + "grad_norm": 59476.90234375, + "learning_rate": 1.2665639736495482e-07, + "loss": 2.0666, + "step": 19547 + }, + { + "epoch": 3.664104967197751, + "grad_norm": 52067.55859375, + "learning_rate": 1.2609806008112013e-07, + "loss": 2.0614, + "step": 19548 + }, + { + "epoch": 3.6642924086223054, + "grad_norm": 52004.9140625, + "learning_rate": 1.2554095461799753e-07, + "loss": 2.0948, + "step": 19549 + }, + { + "epoch": 3.6644798500468605, + "grad_norm": 53630.58984375, + "learning_rate": 1.2498508098934825e-07, + "loss": 1.9993, + "step": 19550 + }, + { + "epoch": 3.664667291471415, + "grad_norm": 54306.9609375, + "learning_rate": 1.2443043920890574e-07, + "loss": 2.1174, + "step": 19551 + }, + { + "epoch": 3.6648547328959697, + "grad_norm": 51350.66796875, + "learning_rate": 1.2387702929037014e-07, + "loss": 2.0114, + "step": 19552 + }, + { + "epoch": 3.665042174320525, + "grad_norm": 57084.74609375, + "learning_rate": 1.2332485124739724e-07, + "loss": 2.1082, + "step": 19553 + }, + { + "epoch": 3.66522961574508, + "grad_norm": 55002.4765625, + "learning_rate": 1.2277390509364273e-07, + "loss": 2.0614, + "step": 19554 + }, + { + "epoch": 3.6654170571696345, + "grad_norm": 59026.8359375, + "learning_rate": 1.2222419084270686e-07, + "loss": 2.0804, + "step": 19555 + }, + { + "epoch": 3.665604498594189, + "grad_norm": 56152.3359375, + "learning_rate": 1.2167570850816213e-07, + "loss": 2.1118, + "step": 19556 + }, + { + "epoch": 3.665791940018744, + "grad_norm": 59562.99609375, + "learning_rate": 1.2112845810355877e-07, + "loss": 2.0367, + "step": 19557 + }, + { + "epoch": 3.665979381443299, + "grad_norm": 55724.06640625, + "learning_rate": 1.205824396424138e-07, + "loss": 2.0365, + "step": 19558 + }, + { + "epoch": 3.666166822867854, + "grad_norm": 55940.5234375, + "learning_rate": 1.2003765313821637e-07, + "loss": 2.1532, + "step": 19559 + }, + { + "epoch": 3.6663542642924085, + "grad_norm": 50915.828125, + "learning_rate": 1.194940986044113e-07, + "loss": 2.072, + "step": 19560 + }, + { + "epoch": 3.6665417057169636, + "grad_norm": 52588.61328125, + "learning_rate": 1.1895177605443785e-07, + "loss": 2.0696, + "step": 19561 + }, + { + "epoch": 3.666729147141518, + "grad_norm": 57074.1328125, + "learning_rate": 1.1841068550168533e-07, + "loss": 2.0432, + "step": 19562 + }, + { + "epoch": 3.666916588566073, + "grad_norm": 53059.32421875, + "learning_rate": 1.1787082695950969e-07, + "loss": 2.0898, + "step": 19563 + }, + { + "epoch": 3.667104029990628, + "grad_norm": 56328.765625, + "learning_rate": 1.1733220044125581e-07, + "loss": 2.0168, + "step": 19564 + }, + { + "epoch": 3.667291471415183, + "grad_norm": 56640.76171875, + "learning_rate": 1.1679480596021864e-07, + "loss": 2.0541, + "step": 19565 + }, + { + "epoch": 3.6674789128397376, + "grad_norm": 54518.4765625, + "learning_rate": 1.1625864352968197e-07, + "loss": 2.0809, + "step": 19566 + }, + { + "epoch": 3.667666354264292, + "grad_norm": 55459.9921875, + "learning_rate": 1.1572371316287966e-07, + "loss": 1.9815, + "step": 19567 + }, + { + "epoch": 3.6678537956888473, + "grad_norm": 55093.3671875, + "learning_rate": 1.1519001487301784e-07, + "loss": 2.0982, + "step": 19568 + }, + { + "epoch": 3.668041237113402, + "grad_norm": 56739.3671875, + "learning_rate": 1.1465754867329148e-07, + "loss": 2.1007, + "step": 19569 + }, + { + "epoch": 3.668228678537957, + "grad_norm": 50129.69140625, + "learning_rate": 1.141263145768512e-07, + "loss": 2.097, + "step": 19570 + }, + { + "epoch": 3.6684161199625116, + "grad_norm": 53928.1171875, + "learning_rate": 1.135963125968087e-07, + "loss": 2.0541, + "step": 19571 + }, + { + "epoch": 3.6686035613870667, + "grad_norm": 56359.52734375, + "learning_rate": 1.1306754274625908e-07, + "loss": 2.0601, + "step": 19572 + }, + { + "epoch": 3.6687910028116213, + "grad_norm": 55131.0703125, + "learning_rate": 1.1254000503825856e-07, + "loss": 2.0566, + "step": 19573 + }, + { + "epoch": 3.668978444236176, + "grad_norm": 56682.640625, + "learning_rate": 1.1201369948584118e-07, + "loss": 2.0109, + "step": 19574 + }, + { + "epoch": 3.669165885660731, + "grad_norm": 54090.3203125, + "learning_rate": 1.1148862610200762e-07, + "loss": 2.0098, + "step": 19575 + }, + { + "epoch": 3.669353327085286, + "grad_norm": 52616.1171875, + "learning_rate": 1.1096478489971973e-07, + "loss": 2.0656, + "step": 19576 + }, + { + "epoch": 3.6695407685098407, + "grad_norm": 59051.80859375, + "learning_rate": 1.1044217589192275e-07, + "loss": 2.1063, + "step": 19577 + }, + { + "epoch": 3.6697282099343953, + "grad_norm": 54004.625, + "learning_rate": 1.0992079909152298e-07, + "loss": 2.0631, + "step": 19578 + }, + { + "epoch": 3.6699156513589504, + "grad_norm": 56169.0390625, + "learning_rate": 1.0940065451139347e-07, + "loss": 2.037, + "step": 19579 + }, + { + "epoch": 3.670103092783505, + "grad_norm": 56156.65234375, + "learning_rate": 1.0888174216438507e-07, + "loss": 2.0047, + "step": 19580 + }, + { + "epoch": 3.67029053420806, + "grad_norm": 54350.34765625, + "learning_rate": 1.0836406206330973e-07, + "loss": 2.0394, + "step": 19581 + }, + { + "epoch": 3.6704779756326147, + "grad_norm": 58153.5, + "learning_rate": 1.0784761422095724e-07, + "loss": 1.9517, + "step": 19582 + }, + { + "epoch": 3.6706654170571698, + "grad_norm": 51599.5078125, + "learning_rate": 1.0733239865008405e-07, + "loss": 2.1055, + "step": 19583 + }, + { + "epoch": 3.6708528584817244, + "grad_norm": 55322.24609375, + "learning_rate": 1.068184153634133e-07, + "loss": 2.0526, + "step": 19584 + }, + { + "epoch": 3.6710402999062794, + "grad_norm": 55415.4140625, + "learning_rate": 1.0630566437364042e-07, + "loss": 2.0859, + "step": 19585 + }, + { + "epoch": 3.671227741330834, + "grad_norm": 57789.1875, + "learning_rate": 1.0579414569342749e-07, + "loss": 2.0334, + "step": 19586 + }, + { + "epoch": 3.671415182755389, + "grad_norm": 57986.04296875, + "learning_rate": 1.052838593354144e-07, + "loss": 2.0331, + "step": 19587 + }, + { + "epoch": 3.6716026241799438, + "grad_norm": 57100.44921875, + "learning_rate": 1.0477480531219663e-07, + "loss": 2.0648, + "step": 19588 + }, + { + "epoch": 3.6717900656044984, + "grad_norm": 55348.71484375, + "learning_rate": 1.04266983636353e-07, + "loss": 2.0184, + "step": 19589 + }, + { + "epoch": 3.6719775070290535, + "grad_norm": 54008.49609375, + "learning_rate": 1.0376039432042351e-07, + "loss": 2.1067, + "step": 19590 + }, + { + "epoch": 3.6721649484536085, + "grad_norm": 53622.49609375, + "learning_rate": 1.0325503737692033e-07, + "loss": 2.051, + "step": 19591 + }, + { + "epoch": 3.672352389878163, + "grad_norm": 57629.30859375, + "learning_rate": 1.0275091281832794e-07, + "loss": 2.089, + "step": 19592 + }, + { + "epoch": 3.6725398313027178, + "grad_norm": 55535.8515625, + "learning_rate": 1.0224802065709193e-07, + "loss": 1.9923, + "step": 19593 + }, + { + "epoch": 3.672727272727273, + "grad_norm": 54024.7109375, + "learning_rate": 1.0174636090564127e-07, + "loss": 2.0478, + "step": 19594 + }, + { + "epoch": 3.6729147141518275, + "grad_norm": 58439.796875, + "learning_rate": 1.0124593357635493e-07, + "loss": 2.0078, + "step": 19595 + }, + { + "epoch": 3.6731021555763825, + "grad_norm": 54369.171875, + "learning_rate": 1.0074673868160634e-07, + "loss": 2.0899, + "step": 19596 + }, + { + "epoch": 3.673289597000937, + "grad_norm": 54453.7109375, + "learning_rate": 1.0024877623371343e-07, + "loss": 1.9975, + "step": 19597 + }, + { + "epoch": 3.6734770384254922, + "grad_norm": 54326.60546875, + "learning_rate": 9.975204624497747e-08, + "loss": 2.0473, + "step": 19598 + }, + { + "epoch": 3.673664479850047, + "grad_norm": 60553.3828125, + "learning_rate": 9.925654872767198e-08, + "loss": 2.1232, + "step": 19599 + }, + { + "epoch": 3.6738519212746015, + "grad_norm": 52173.4296875, + "learning_rate": 9.876228369403717e-08, + "loss": 2.078, + "step": 19600 + }, + { + "epoch": 3.6740393626991565, + "grad_norm": 59530.046875, + "learning_rate": 9.826925115626883e-08, + "loss": 2.0338, + "step": 19601 + }, + { + "epoch": 3.6742268041237116, + "grad_norm": 61789.30859375, + "learning_rate": 9.777745112655723e-08, + "loss": 2.1042, + "step": 19602 + }, + { + "epoch": 3.6744142455482662, + "grad_norm": 56906.53125, + "learning_rate": 9.728688361703709e-08, + "loss": 2.0668, + "step": 19603 + }, + { + "epoch": 3.674601686972821, + "grad_norm": 58863.01171875, + "learning_rate": 9.679754863983759e-08, + "loss": 2.1104, + "step": 19604 + }, + { + "epoch": 3.674789128397376, + "grad_norm": 64183.90625, + "learning_rate": 9.630944620703241e-08, + "loss": 2.1096, + "step": 19605 + }, + { + "epoch": 3.6749765698219305, + "grad_norm": 53299.0078125, + "learning_rate": 9.582257633068969e-08, + "loss": 2.0232, + "step": 19606 + }, + { + "epoch": 3.6751640112464856, + "grad_norm": 55694.125, + "learning_rate": 9.533693902282204e-08, + "loss": 2.0627, + "step": 19607 + }, + { + "epoch": 3.6753514526710402, + "grad_norm": 57530.0234375, + "learning_rate": 9.485253429543096e-08, + "loss": 2.0987, + "step": 19608 + }, + { + "epoch": 3.6755388940955953, + "grad_norm": 53402.78515625, + "learning_rate": 9.436936216047909e-08, + "loss": 2.0092, + "step": 19609 + }, + { + "epoch": 3.67572633552015, + "grad_norm": 53340.00390625, + "learning_rate": 9.388742262990135e-08, + "loss": 2.0849, + "step": 19610 + }, + { + "epoch": 3.6759137769447046, + "grad_norm": 56439.77734375, + "learning_rate": 9.340671571559933e-08, + "loss": 2.0612, + "step": 19611 + }, + { + "epoch": 3.6761012183692596, + "grad_norm": 55426.3515625, + "learning_rate": 9.292724142944687e-08, + "loss": 2.1228, + "step": 19612 + }, + { + "epoch": 3.6762886597938147, + "grad_norm": 58238.48828125, + "learning_rate": 9.244899978328447e-08, + "loss": 2.0697, + "step": 19613 + }, + { + "epoch": 3.6764761012183693, + "grad_norm": 58614.27734375, + "learning_rate": 9.197199078893049e-08, + "loss": 2.0751, + "step": 19614 + }, + { + "epoch": 3.676663542642924, + "grad_norm": 57157.09375, + "learning_rate": 9.149621445815881e-08, + "loss": 2.1088, + "step": 19615 + }, + { + "epoch": 3.676850984067479, + "grad_norm": 54786.49609375, + "learning_rate": 9.102167080272117e-08, + "loss": 2.0718, + "step": 19616 + }, + { + "epoch": 3.6770384254920336, + "grad_norm": 61482.16015625, + "learning_rate": 9.054835983434151e-08, + "loss": 2.1188, + "step": 19617 + }, + { + "epoch": 3.6772258669165887, + "grad_norm": 57881.25390625, + "learning_rate": 9.007628156471048e-08, + "loss": 2.0597, + "step": 19618 + }, + { + "epoch": 3.6774133083411433, + "grad_norm": 58249.53515625, + "learning_rate": 8.960543600548543e-08, + "loss": 2.033, + "step": 19619 + }, + { + "epoch": 3.6776007497656984, + "grad_norm": 55756.82421875, + "learning_rate": 8.913582316829594e-08, + "loss": 2.0937, + "step": 19620 + }, + { + "epoch": 3.677788191190253, + "grad_norm": 56601.53125, + "learning_rate": 8.866744306473829e-08, + "loss": 2.0494, + "step": 19621 + }, + { + "epoch": 3.6779756326148076, + "grad_norm": 50655.52734375, + "learning_rate": 8.820029570638655e-08, + "loss": 2.0322, + "step": 19622 + }, + { + "epoch": 3.6781630740393627, + "grad_norm": 51707.3125, + "learning_rate": 8.773438110477594e-08, + "loss": 2.1019, + "step": 19623 + }, + { + "epoch": 3.678350515463918, + "grad_norm": 53374.87109375, + "learning_rate": 8.726969927141393e-08, + "loss": 2.1601, + "step": 19624 + }, + { + "epoch": 3.6785379568884724, + "grad_norm": 54507.46484375, + "learning_rate": 8.680625021777467e-08, + "loss": 2.1405, + "step": 19625 + }, + { + "epoch": 3.678725398313027, + "grad_norm": 57607.51953125, + "learning_rate": 8.634403395531565e-08, + "loss": 2.02, + "step": 19626 + }, + { + "epoch": 3.678912839737582, + "grad_norm": 53299.21484375, + "learning_rate": 8.588305049543887e-08, + "loss": 2.0358, + "step": 19627 + }, + { + "epoch": 3.6791002811621367, + "grad_norm": 52920.59375, + "learning_rate": 8.542329984953524e-08, + "loss": 2.0445, + "step": 19628 + }, + { + "epoch": 3.679287722586692, + "grad_norm": 54133.77734375, + "learning_rate": 8.49647820289623e-08, + "loss": 2.2656, + "step": 19629 + }, + { + "epoch": 3.6794751640112464, + "grad_norm": 60705.84765625, + "learning_rate": 8.450749704504435e-08, + "loss": 2.1065, + "step": 19630 + }, + { + "epoch": 3.6796626054358015, + "grad_norm": 59440.96875, + "learning_rate": 8.405144490907235e-08, + "loss": 2.0327, + "step": 19631 + }, + { + "epoch": 3.679850046860356, + "grad_norm": 52961.984375, + "learning_rate": 8.359662563231508e-08, + "loss": 2.0675, + "step": 19632 + }, + { + "epoch": 3.6800374882849107, + "grad_norm": 55063.203125, + "learning_rate": 8.314303922600242e-08, + "loss": 2.059, + "step": 19633 + }, + { + "epoch": 3.680224929709466, + "grad_norm": 62597.9375, + "learning_rate": 8.269068570134208e-08, + "loss": 2.0576, + "step": 19634 + }, + { + "epoch": 3.680412371134021, + "grad_norm": 54083.84375, + "learning_rate": 8.223956506949737e-08, + "loss": 2.1249, + "step": 19635 + }, + { + "epoch": 3.6805998125585755, + "grad_norm": 52990.71484375, + "learning_rate": 8.178967734162046e-08, + "loss": 2.0796, + "step": 19636 + }, + { + "epoch": 3.68078725398313, + "grad_norm": 58315.9375, + "learning_rate": 8.134102252881915e-08, + "loss": 2.0711, + "step": 19637 + }, + { + "epoch": 3.680974695407685, + "grad_norm": 54873.7265625, + "learning_rate": 8.089360064216789e-08, + "loss": 2.0514, + "step": 19638 + }, + { + "epoch": 3.68116213683224, + "grad_norm": 55359.4140625, + "learning_rate": 8.044741169273007e-08, + "loss": 2.0865, + "step": 19639 + }, + { + "epoch": 3.681349578256795, + "grad_norm": 53478.0390625, + "learning_rate": 8.000245569151354e-08, + "loss": 2.0063, + "step": 19640 + }, + { + "epoch": 3.6815370196813495, + "grad_norm": 56965.16015625, + "learning_rate": 7.955873264952063e-08, + "loss": 2.1347, + "step": 19641 + }, + { + "epoch": 3.6817244611059046, + "grad_norm": 58328.91015625, + "learning_rate": 7.911624257769812e-08, + "loss": 2.041, + "step": 19642 + }, + { + "epoch": 3.681911902530459, + "grad_norm": 55593.74609375, + "learning_rate": 7.867498548698727e-08, + "loss": 2.0509, + "step": 19643 + }, + { + "epoch": 3.682099343955014, + "grad_norm": 50918.47265625, + "learning_rate": 7.823496138827935e-08, + "loss": 2.0266, + "step": 19644 + }, + { + "epoch": 3.682286785379569, + "grad_norm": 56535.11328125, + "learning_rate": 7.779617029243791e-08, + "loss": 2.1385, + "step": 19645 + }, + { + "epoch": 3.682474226804124, + "grad_norm": 56646.9765625, + "learning_rate": 7.735861221030982e-08, + "loss": 2.1622, + "step": 19646 + }, + { + "epoch": 3.6826616682286786, + "grad_norm": 59511.93359375, + "learning_rate": 7.692228715269756e-08, + "loss": 2.0803, + "step": 19647 + }, + { + "epoch": 3.682849109653233, + "grad_norm": 52774.90625, + "learning_rate": 7.648719513038138e-08, + "loss": 2.0432, + "step": 19648 + }, + { + "epoch": 3.6830365510777883, + "grad_norm": 55779.7421875, + "learning_rate": 7.605333615409716e-08, + "loss": 2.1142, + "step": 19649 + }, + { + "epoch": 3.683223992502343, + "grad_norm": 55838.421875, + "learning_rate": 7.562071023457518e-08, + "loss": 2.0937, + "step": 19650 + }, + { + "epoch": 3.683411433926898, + "grad_norm": 54873.6171875, + "learning_rate": 7.518931738249025e-08, + "loss": 2.1152, + "step": 19651 + }, + { + "epoch": 3.6835988753514526, + "grad_norm": 55202.90625, + "learning_rate": 7.475915760850605e-08, + "loss": 2.0967, + "step": 19652 + }, + { + "epoch": 3.6837863167760077, + "grad_norm": 62086.90625, + "learning_rate": 7.433023092323632e-08, + "loss": 2.0397, + "step": 19653 + }, + { + "epoch": 3.6839737582005623, + "grad_norm": 58523.984375, + "learning_rate": 7.390253733727814e-08, + "loss": 2.0439, + "step": 19654 + }, + { + "epoch": 3.684161199625117, + "grad_norm": 53911.15234375, + "learning_rate": 7.347607686120639e-08, + "loss": 2.0497, + "step": 19655 + }, + { + "epoch": 3.684348641049672, + "grad_norm": 55123.62109375, + "learning_rate": 7.305084950553486e-08, + "loss": 2.0488, + "step": 19656 + }, + { + "epoch": 3.684536082474227, + "grad_norm": 57479.015625, + "learning_rate": 7.262685528078295e-08, + "loss": 2.037, + "step": 19657 + }, + { + "epoch": 3.6847235238987817, + "grad_norm": 57952.87890625, + "learning_rate": 7.220409419740893e-08, + "loss": 2.1238, + "step": 19658 + }, + { + "epoch": 3.6849109653233363, + "grad_norm": 62356.62109375, + "learning_rate": 7.17825662658711e-08, + "loss": 2.106, + "step": 19659 + }, + { + "epoch": 3.6850984067478914, + "grad_norm": 65358.98828125, + "learning_rate": 7.136227149656672e-08, + "loss": 2.114, + "step": 19660 + }, + { + "epoch": 3.685285848172446, + "grad_norm": 58213.4140625, + "learning_rate": 7.094320989987635e-08, + "loss": 2.0896, + "step": 19661 + }, + { + "epoch": 3.685473289597001, + "grad_norm": 54967.06640625, + "learning_rate": 7.05253814861584e-08, + "loss": 2.0594, + "step": 19662 + }, + { + "epoch": 3.6856607310215557, + "grad_norm": 52816.65234375, + "learning_rate": 7.010878626572681e-08, + "loss": 2.0572, + "step": 19663 + }, + { + "epoch": 3.6858481724461107, + "grad_norm": 52820.375, + "learning_rate": 6.969342424887893e-08, + "loss": 2.0007, + "step": 19664 + }, + { + "epoch": 3.6860356138706654, + "grad_norm": 58102.98046875, + "learning_rate": 6.92792954458621e-08, + "loss": 2.0807, + "step": 19665 + }, + { + "epoch": 3.68622305529522, + "grad_norm": 57426.5703125, + "learning_rate": 6.886639986691257e-08, + "loss": 2.0365, + "step": 19666 + }, + { + "epoch": 3.686410496719775, + "grad_norm": 60950.88671875, + "learning_rate": 6.845473752222775e-08, + "loss": 2.0479, + "step": 19667 + }, + { + "epoch": 3.68659793814433, + "grad_norm": 52697.8203125, + "learning_rate": 6.804430842197174e-08, + "loss": 2.0412, + "step": 19668 + }, + { + "epoch": 3.6867853795688847, + "grad_norm": 51778.62890625, + "learning_rate": 6.763511257628086e-08, + "loss": 2.0743, + "step": 19669 + }, + { + "epoch": 3.6869728209934394, + "grad_norm": 56343.9921875, + "learning_rate": 6.722714999526924e-08, + "loss": 2.0799, + "step": 19670 + }, + { + "epoch": 3.6871602624179944, + "grad_norm": 53941.7421875, + "learning_rate": 6.682042068900663e-08, + "loss": 2.0637, + "step": 19671 + }, + { + "epoch": 3.687347703842549, + "grad_norm": 53586.16015625, + "learning_rate": 6.641492466754052e-08, + "loss": 2.1257, + "step": 19672 + }, + { + "epoch": 3.687535145267104, + "grad_norm": 54983.890625, + "learning_rate": 6.601066194088512e-08, + "loss": 2.0167, + "step": 19673 + }, + { + "epoch": 3.6877225866916588, + "grad_norm": 59438.6875, + "learning_rate": 6.56076325190269e-08, + "loss": 2.0377, + "step": 19674 + }, + { + "epoch": 3.687910028116214, + "grad_norm": 52315.12890625, + "learning_rate": 6.520583641191901e-08, + "loss": 2.0694, + "step": 19675 + }, + { + "epoch": 3.6880974695407684, + "grad_norm": 49539.83203125, + "learning_rate": 6.48052736294813e-08, + "loss": 2.1267, + "step": 19676 + }, + { + "epoch": 3.688284910965323, + "grad_norm": 52021.375, + "learning_rate": 6.440594418161694e-08, + "loss": 2.0373, + "step": 19677 + }, + { + "epoch": 3.688472352389878, + "grad_norm": 53358.7890625, + "learning_rate": 6.400784807818472e-08, + "loss": 2.0598, + "step": 19678 + }, + { + "epoch": 3.688659793814433, + "grad_norm": 61504.5, + "learning_rate": 6.361098532901011e-08, + "loss": 1.9815, + "step": 19679 + }, + { + "epoch": 3.688847235238988, + "grad_norm": 56565.640625, + "learning_rate": 6.321535594390193e-08, + "loss": 2.1108, + "step": 19680 + }, + { + "epoch": 3.6890346766635425, + "grad_norm": 52020.2890625, + "learning_rate": 6.28209599326357e-08, + "loss": 2.0289, + "step": 19681 + }, + { + "epoch": 3.6892221180880975, + "grad_norm": 53592.12890625, + "learning_rate": 6.242779730494253e-08, + "loss": 2.0494, + "step": 19682 + }, + { + "epoch": 3.689409559512652, + "grad_norm": 54216.4140625, + "learning_rate": 6.203586807053685e-08, + "loss": 2.1046, + "step": 19683 + }, + { + "epoch": 3.689597000937207, + "grad_norm": 54913.61328125, + "learning_rate": 6.164517223909982e-08, + "loss": 2.0469, + "step": 19684 + }, + { + "epoch": 3.689784442361762, + "grad_norm": 53080.96875, + "learning_rate": 6.125570982028483e-08, + "loss": 2.0191, + "step": 19685 + }, + { + "epoch": 3.689971883786317, + "grad_norm": 53467.0234375, + "learning_rate": 6.086748082370641e-08, + "loss": 2.0316, + "step": 19686 + }, + { + "epoch": 3.6901593252108715, + "grad_norm": 62831.99609375, + "learning_rate": 6.048048525895133e-08, + "loss": 2.0299, + "step": 19687 + }, + { + "epoch": 3.690346766635426, + "grad_norm": 58115.74609375, + "learning_rate": 6.009472313558418e-08, + "loss": 2.0954, + "step": 19688 + }, + { + "epoch": 3.6905342080599812, + "grad_norm": 55169.89453125, + "learning_rate": 5.97101944631251e-08, + "loss": 2.0729, + "step": 19689 + }, + { + "epoch": 3.6907216494845363, + "grad_norm": 56724.8046875, + "learning_rate": 5.9326899251077596e-08, + "loss": 2.0596, + "step": 19690 + }, + { + "epoch": 3.690909090909091, + "grad_norm": 52952.12109375, + "learning_rate": 5.8944837508906335e-08, + "loss": 2.0406, + "step": 19691 + }, + { + "epoch": 3.6910965323336455, + "grad_norm": 52561.30859375, + "learning_rate": 5.8564009246048215e-08, + "loss": 2.0599, + "step": 19692 + }, + { + "epoch": 3.6912839737582006, + "grad_norm": 51573.98046875, + "learning_rate": 5.818441447191236e-08, + "loss": 2.0633, + "step": 19693 + }, + { + "epoch": 3.6914714151827552, + "grad_norm": 56463.15625, + "learning_rate": 5.780605319586907e-08, + "loss": 2.1684, + "step": 19694 + }, + { + "epoch": 3.6916588566073103, + "grad_norm": 58729.81640625, + "learning_rate": 5.742892542726086e-08, + "loss": 2.0679, + "step": 19695 + }, + { + "epoch": 3.691846298031865, + "grad_norm": 56539.8671875, + "learning_rate": 5.705303117540806e-08, + "loss": 2.1033, + "step": 19696 + }, + { + "epoch": 3.69203373945642, + "grad_norm": 58277.1953125, + "learning_rate": 5.6678370449597675e-08, + "loss": 2.1359, + "step": 19697 + }, + { + "epoch": 3.6922211808809746, + "grad_norm": 56231.734375, + "learning_rate": 5.630494325907787e-08, + "loss": 2.1556, + "step": 19698 + }, + { + "epoch": 3.6924086223055297, + "grad_norm": 54560.3828125, + "learning_rate": 5.5932749613069045e-08, + "loss": 2.0607, + "step": 19699 + }, + { + "epoch": 3.6925960637300843, + "grad_norm": 62616.125, + "learning_rate": 5.5561789520774954e-08, + "loss": 2.0368, + "step": 19700 + }, + { + "epoch": 3.6927835051546394, + "grad_norm": 57907.46875, + "learning_rate": 5.519206299134383e-08, + "loss": 2.0703, + "step": 19701 + }, + { + "epoch": 3.692970946579194, + "grad_norm": 52179.94140625, + "learning_rate": 5.482357003391836e-08, + "loss": 2.0883, + "step": 19702 + }, + { + "epoch": 3.6931583880037486, + "grad_norm": 55106.0390625, + "learning_rate": 5.445631065759127e-08, + "loss": 2.0876, + "step": 19703 + }, + { + "epoch": 3.6933458294283037, + "grad_norm": 52439.87109375, + "learning_rate": 5.4090284871444186e-08, + "loss": 2.0567, + "step": 19704 + }, + { + "epoch": 3.6935332708528583, + "grad_norm": 55863.51953125, + "learning_rate": 5.372549268450877e-08, + "loss": 2.0322, + "step": 19705 + }, + { + "epoch": 3.6937207122774134, + "grad_norm": 52029.72265625, + "learning_rate": 5.3361934105794485e-08, + "loss": 2.1103, + "step": 19706 + }, + { + "epoch": 3.693908153701968, + "grad_norm": 56069.4453125, + "learning_rate": 5.2999609144288584e-08, + "loss": 2.1134, + "step": 19707 + }, + { + "epoch": 3.694095595126523, + "grad_norm": 57928.39453125, + "learning_rate": 5.263851780892837e-08, + "loss": 2.0299, + "step": 19708 + }, + { + "epoch": 3.6942830365510777, + "grad_norm": 60756.58203125, + "learning_rate": 5.227866010864557e-08, + "loss": 2.0635, + "step": 19709 + }, + { + "epoch": 3.6944704779756328, + "grad_norm": 52393.5390625, + "learning_rate": 5.192003605231644e-08, + "loss": 2.0975, + "step": 19710 + }, + { + "epoch": 3.6946579194001874, + "grad_norm": 53527.62109375, + "learning_rate": 5.156264564881163e-08, + "loss": 2.0254, + "step": 19711 + }, + { + "epoch": 3.6948453608247425, + "grad_norm": 52948.65625, + "learning_rate": 5.1206488906940794e-08, + "loss": 2.0115, + "step": 19712 + }, + { + "epoch": 3.695032802249297, + "grad_norm": 54117.8125, + "learning_rate": 5.085156583551909e-08, + "loss": 2.1255, + "step": 19713 + }, + { + "epoch": 3.6952202436738517, + "grad_norm": 56131.23828125, + "learning_rate": 5.0497876443295064e-08, + "loss": 2.1124, + "step": 19714 + }, + { + "epoch": 3.695407685098407, + "grad_norm": 56397.1484375, + "learning_rate": 5.014542073901729e-08, + "loss": 2.0853, + "step": 19715 + }, + { + "epoch": 3.695595126522962, + "grad_norm": 57000.23828125, + "learning_rate": 4.979419873138991e-08, + "loss": 2.0567, + "step": 19716 + }, + { + "epoch": 3.6957825679475165, + "grad_norm": 53262.16015625, + "learning_rate": 4.944421042907821e-08, + "loss": 2.0964, + "step": 19717 + }, + { + "epoch": 3.695970009372071, + "grad_norm": 56265.40625, + "learning_rate": 4.909545584073638e-08, + "loss": 2.0488, + "step": 19718 + }, + { + "epoch": 3.696157450796626, + "grad_norm": 55939.57421875, + "learning_rate": 4.874793497497421e-08, + "loss": 2.1158, + "step": 19719 + }, + { + "epoch": 3.696344892221181, + "grad_norm": 59409.203125, + "learning_rate": 4.840164784037371e-08, + "loss": 2.089, + "step": 19720 + }, + { + "epoch": 3.696532333645736, + "grad_norm": 53433.34375, + "learning_rate": 4.805659444548916e-08, + "loss": 2.0676, + "step": 19721 + }, + { + "epoch": 3.6967197750702905, + "grad_norm": 50046.0859375, + "learning_rate": 4.771277479884706e-08, + "loss": 2.0632, + "step": 19722 + }, + { + "epoch": 3.6969072164948455, + "grad_norm": 57077.7265625, + "learning_rate": 4.737018890892952e-08, + "loss": 2.068, + "step": 19723 + }, + { + "epoch": 3.6970946579194, + "grad_norm": 56883.0234375, + "learning_rate": 4.702883678420755e-08, + "loss": 2.0532, + "step": 19724 + }, + { + "epoch": 3.697282099343955, + "grad_norm": 59028.515625, + "learning_rate": 4.668871843310774e-08, + "loss": 2.0727, + "step": 19725 + }, + { + "epoch": 3.69746954076851, + "grad_norm": 53873.328125, + "learning_rate": 4.634983386402891e-08, + "loss": 2.1134, + "step": 19726 + }, + { + "epoch": 3.697656982193065, + "grad_norm": 59016.2578125, + "learning_rate": 4.601218308534771e-08, + "loss": 2.0674, + "step": 19727 + }, + { + "epoch": 3.6978444236176196, + "grad_norm": 57192.37109375, + "learning_rate": 4.5675766105396366e-08, + "loss": 2.0855, + "step": 19728 + }, + { + "epoch": 3.698031865042174, + "grad_norm": 55872.16796875, + "learning_rate": 4.534058293248489e-08, + "loss": 2.0766, + "step": 19729 + }, + { + "epoch": 3.6982193064667293, + "grad_norm": 53149.140625, + "learning_rate": 4.500663357490109e-08, + "loss": 2.0507, + "step": 19730 + }, + { + "epoch": 3.698406747891284, + "grad_norm": 53032.328125, + "learning_rate": 4.467391804087728e-08, + "loss": 2.0935, + "step": 19731 + }, + { + "epoch": 3.698594189315839, + "grad_norm": 56304.29296875, + "learning_rate": 4.4342436338645765e-08, + "loss": 2.1152, + "step": 19732 + }, + { + "epoch": 3.6987816307403936, + "grad_norm": 57926.55078125, + "learning_rate": 4.401218847638333e-08, + "loss": 2.1269, + "step": 19733 + }, + { + "epoch": 3.6989690721649486, + "grad_norm": 55228.453125, + "learning_rate": 4.368317446225567e-08, + "loss": 2.0781, + "step": 19734 + }, + { + "epoch": 3.6991565135895033, + "grad_norm": 55824.59375, + "learning_rate": 4.335539430437852e-08, + "loss": 2.101, + "step": 19735 + }, + { + "epoch": 3.699343955014058, + "grad_norm": 51183.24609375, + "learning_rate": 4.302884801085649e-08, + "loss": 2.1315, + "step": 19736 + }, + { + "epoch": 3.699531396438613, + "grad_norm": 55426.703125, + "learning_rate": 4.2703535589749823e-08, + "loss": 2.0836, + "step": 19737 + }, + { + "epoch": 3.699718837863168, + "grad_norm": 54655.046875, + "learning_rate": 4.237945704909651e-08, + "loss": 2.1125, + "step": 19738 + }, + { + "epoch": 3.6999062792877226, + "grad_norm": 53374.32421875, + "learning_rate": 4.205661239690128e-08, + "loss": 2.0829, + "step": 19739 + }, + { + "epoch": 3.7000937207122773, + "grad_norm": 55747.75390625, + "learning_rate": 4.173500164113553e-08, + "loss": 2.0642, + "step": 19740 + }, + { + "epoch": 3.7002811621368323, + "grad_norm": 54442.2578125, + "learning_rate": 4.14146247897429e-08, + "loss": 2.068, + "step": 19741 + }, + { + "epoch": 3.700468603561387, + "grad_norm": 58833.53125, + "learning_rate": 4.109548185063927e-08, + "loss": 2.0991, + "step": 19742 + }, + { + "epoch": 3.700656044985942, + "grad_norm": 55938.18359375, + "learning_rate": 4.0777572831701696e-08, + "loss": 2.0975, + "step": 19743 + }, + { + "epoch": 3.7008434864104967, + "grad_norm": 60307.49609375, + "learning_rate": 4.0460897740784983e-08, + "loss": 2.0735, + "step": 19744 + }, + { + "epoch": 3.7010309278350517, + "grad_norm": 55107.33203125, + "learning_rate": 4.014545658571067e-08, + "loss": 2.0848, + "step": 19745 + }, + { + "epoch": 3.7012183692596063, + "grad_norm": 57860.94921875, + "learning_rate": 3.9831249374266965e-08, + "loss": 2.1012, + "step": 19746 + }, + { + "epoch": 3.701405810684161, + "grad_norm": 58151.08203125, + "learning_rate": 3.951827611421988e-08, + "loss": 2.1055, + "step": 19747 + }, + { + "epoch": 3.701593252108716, + "grad_norm": 61993.36328125, + "learning_rate": 3.920653681329656e-08, + "loss": 2.2402, + "step": 19748 + }, + { + "epoch": 3.701780693533271, + "grad_norm": 64305.3828125, + "learning_rate": 3.8896031479201954e-08, + "loss": 2.0024, + "step": 19749 + }, + { + "epoch": 3.7019681349578257, + "grad_norm": 56989.5859375, + "learning_rate": 3.8586760119591056e-08, + "loss": 2.0451, + "step": 19750 + }, + { + "epoch": 3.7021555763823804, + "grad_norm": 58003.98828125, + "learning_rate": 3.8278722742113304e-08, + "loss": 2.0858, + "step": 19751 + }, + { + "epoch": 3.7023430178069354, + "grad_norm": 57628.89453125, + "learning_rate": 3.797191935437927e-08, + "loss": 2.1111, + "step": 19752 + }, + { + "epoch": 3.70253045923149, + "grad_norm": 52292.49609375, + "learning_rate": 3.766634996396068e-08, + "loss": 2.0614, + "step": 19753 + }, + { + "epoch": 3.702717900656045, + "grad_norm": 60346.859375, + "learning_rate": 3.7362014578401496e-08, + "loss": 2.0885, + "step": 19754 + }, + { + "epoch": 3.7029053420805997, + "grad_norm": 54265.48828125, + "learning_rate": 3.705891320522903e-08, + "loss": 2.071, + "step": 19755 + }, + { + "epoch": 3.703092783505155, + "grad_norm": 62850.734375, + "learning_rate": 3.675704585192063e-08, + "loss": 2.1556, + "step": 19756 + }, + { + "epoch": 3.7032802249297094, + "grad_norm": 59224.3984375, + "learning_rate": 3.6456412525931464e-08, + "loss": 2.1255, + "step": 19757 + }, + { + "epoch": 3.703467666354264, + "grad_norm": 59702.65625, + "learning_rate": 3.6157013234694445e-08, + "loss": 2.0902, + "step": 19758 + }, + { + "epoch": 3.703655107778819, + "grad_norm": 53882.390625, + "learning_rate": 3.585884798559813e-08, + "loss": 2.0554, + "step": 19759 + }, + { + "epoch": 3.703842549203374, + "grad_norm": 58241.55859375, + "learning_rate": 3.556191678601439e-08, + "loss": 2.0226, + "step": 19760 + }, + { + "epoch": 3.704029990627929, + "grad_norm": 54168.875, + "learning_rate": 3.526621964326515e-08, + "loss": 2.0458, + "step": 19761 + }, + { + "epoch": 3.7042174320524834, + "grad_norm": 54091.06640625, + "learning_rate": 3.497175656466123e-08, + "loss": 2.0569, + "step": 19762 + }, + { + "epoch": 3.7044048734770385, + "grad_norm": 54609.5703125, + "learning_rate": 3.467852755747458e-08, + "loss": 2.0653, + "step": 19763 + }, + { + "epoch": 3.704592314901593, + "grad_norm": 52015.7890625, + "learning_rate": 3.438653262894942e-08, + "loss": 2.0726, + "step": 19764 + }, + { + "epoch": 3.704779756326148, + "grad_norm": 55100.83984375, + "learning_rate": 3.409577178629109e-08, + "loss": 2.0602, + "step": 19765 + }, + { + "epoch": 3.704967197750703, + "grad_norm": 55923.1640625, + "learning_rate": 3.380624503669383e-08, + "loss": 2.0692, + "step": 19766 + }, + { + "epoch": 3.705154639175258, + "grad_norm": 59091.3984375, + "learning_rate": 3.351795238729083e-08, + "loss": 2.0854, + "step": 19767 + }, + { + "epoch": 3.7053420805998125, + "grad_norm": 52776.37890625, + "learning_rate": 3.323089384521527e-08, + "loss": 2.0518, + "step": 19768 + }, + { + "epoch": 3.705529522024367, + "grad_norm": 55121.171875, + "learning_rate": 3.294506941755593e-08, + "loss": 2.0666, + "step": 19769 + }, + { + "epoch": 3.705716963448922, + "grad_norm": 53470.265625, + "learning_rate": 3.266047911136827e-08, + "loss": 2.0842, + "step": 19770 + }, + { + "epoch": 3.7059044048734773, + "grad_norm": 52694.703125, + "learning_rate": 3.2377122933685554e-08, + "loss": 2.0746, + "step": 19771 + }, + { + "epoch": 3.706091846298032, + "grad_norm": 52129.609375, + "learning_rate": 3.209500089149664e-08, + "loss": 2.0182, + "step": 19772 + }, + { + "epoch": 3.7062792877225865, + "grad_norm": 55260.3671875, + "learning_rate": 3.181411299178483e-08, + "loss": 2.1826, + "step": 19773 + }, + { + "epoch": 3.7064667291471416, + "grad_norm": 55393.21875, + "learning_rate": 3.153445924147791e-08, + "loss": 2.0418, + "step": 19774 + }, + { + "epoch": 3.706654170571696, + "grad_norm": 59041.87109375, + "learning_rate": 3.125603964748147e-08, + "loss": 2.0767, + "step": 19775 + }, + { + "epoch": 3.7068416119962513, + "grad_norm": 55195.796875, + "learning_rate": 3.09788542166789e-08, + "loss": 2.0209, + "step": 19776 + }, + { + "epoch": 3.707029053420806, + "grad_norm": 54035.953125, + "learning_rate": 3.070290295590916e-08, + "loss": 2.0737, + "step": 19777 + }, + { + "epoch": 3.707216494845361, + "grad_norm": 55017.05859375, + "learning_rate": 3.042818587199459e-08, + "loss": 2.0369, + "step": 19778 + }, + { + "epoch": 3.7074039362699156, + "grad_norm": 54441.1953125, + "learning_rate": 3.015470297171863e-08, + "loss": 2.0828, + "step": 19779 + }, + { + "epoch": 3.7075913776944702, + "grad_norm": 53272.5703125, + "learning_rate": 2.988245426183145e-08, + "loss": 2.0517, + "step": 19780 + }, + { + "epoch": 3.7077788191190253, + "grad_norm": 56716.9375, + "learning_rate": 2.9611439749060998e-08, + "loss": 2.0998, + "step": 19781 + }, + { + "epoch": 3.7079662605435804, + "grad_norm": 60040.421875, + "learning_rate": 2.934165944010192e-08, + "loss": 2.083, + "step": 19782 + }, + { + "epoch": 3.708153701968135, + "grad_norm": 52422.984375, + "learning_rate": 2.9073113341615556e-08, + "loss": 2.1429, + "step": 19783 + }, + { + "epoch": 3.7083411433926896, + "grad_norm": 57256.78515625, + "learning_rate": 2.880580146023548e-08, + "loss": 2.066, + "step": 19784 + }, + { + "epoch": 3.7085285848172447, + "grad_norm": 58971.359375, + "learning_rate": 2.853972380256198e-08, + "loss": 2.0913, + "step": 19785 + }, + { + "epoch": 3.7087160262417993, + "grad_norm": 55287.93359375, + "learning_rate": 2.8274880375167567e-08, + "loss": 2.0009, + "step": 19786 + }, + { + "epoch": 3.7089034676663544, + "grad_norm": 53067.65234375, + "learning_rate": 2.801127118459701e-08, + "loss": 2.0039, + "step": 19787 + }, + { + "epoch": 3.709090909090909, + "grad_norm": 58182.6796875, + "learning_rate": 2.7748896237356214e-08, + "loss": 2.111, + "step": 19788 + }, + { + "epoch": 3.709278350515464, + "grad_norm": 49962.21484375, + "learning_rate": 2.7487755539928883e-08, + "loss": 2.0537, + "step": 19789 + }, + { + "epoch": 3.7094657919400187, + "grad_norm": 59120.02734375, + "learning_rate": 2.722784909875986e-08, + "loss": 2.0665, + "step": 19790 + }, + { + "epoch": 3.7096532333645733, + "grad_norm": 53586.9921875, + "learning_rate": 2.696917692027734e-08, + "loss": 2.0624, + "step": 19791 + }, + { + "epoch": 3.7098406747891284, + "grad_norm": 63509.0859375, + "learning_rate": 2.67117390108651e-08, + "loss": 2.0089, + "step": 19792 + }, + { + "epoch": 3.7100281162136834, + "grad_norm": 56429.81640625, + "learning_rate": 2.645553537687917e-08, + "loss": 2.0577, + "step": 19793 + }, + { + "epoch": 3.710215557638238, + "grad_norm": 54235.046875, + "learning_rate": 2.620056602464782e-08, + "loss": 2.1093, + "step": 19794 + }, + { + "epoch": 3.7104029990627927, + "grad_norm": 51492.6875, + "learning_rate": 2.5946830960477118e-08, + "loss": 2.0603, + "step": 19795 + }, + { + "epoch": 3.7105904404873478, + "grad_norm": 56396.8828125, + "learning_rate": 2.5694330190623174e-08, + "loss": 2.138, + "step": 19796 + }, + { + "epoch": 3.7107778819119024, + "grad_norm": 55208.359375, + "learning_rate": 2.5443063721325435e-08, + "loss": 2.0643, + "step": 19797 + }, + { + "epoch": 3.7109653233364575, + "grad_norm": 55138.07421875, + "learning_rate": 2.5193031558790047e-08, + "loss": 2.1069, + "step": 19798 + }, + { + "epoch": 3.711152764761012, + "grad_norm": 54168.9453125, + "learning_rate": 2.4944233709195408e-08, + "loss": 2.0286, + "step": 19799 + }, + { + "epoch": 3.711340206185567, + "grad_norm": 54591.6796875, + "learning_rate": 2.46966701786866e-08, + "loss": 2.087, + "step": 19800 + }, + { + "epoch": 3.7115276476101218, + "grad_norm": 52710.00390625, + "learning_rate": 2.4450340973375395e-08, + "loss": 2.0688, + "step": 19801 + }, + { + "epoch": 3.7117150890346764, + "grad_norm": 55247.72265625, + "learning_rate": 2.4205246099345824e-08, + "loss": 2.1124, + "step": 19802 + }, + { + "epoch": 3.7119025304592315, + "grad_norm": 51606.28125, + "learning_rate": 2.3961385562654148e-08, + "loss": 2.0677, + "step": 19803 + }, + { + "epoch": 3.7120899718837865, + "grad_norm": 59155.2421875, + "learning_rate": 2.3718759369317778e-08, + "loss": 2.0448, + "step": 19804 + }, + { + "epoch": 3.712277413308341, + "grad_norm": 51749.2421875, + "learning_rate": 2.3477367525331915e-08, + "loss": 2.0564, + "step": 19805 + }, + { + "epoch": 3.712464854732896, + "grad_norm": 57091.9453125, + "learning_rate": 2.3237210036664015e-08, + "loss": 2.0317, + "step": 19806 + }, + { + "epoch": 3.712652296157451, + "grad_norm": 52556.875, + "learning_rate": 2.2998286909237112e-08, + "loss": 2.0701, + "step": 19807 + }, + { + "epoch": 3.7128397375820055, + "grad_norm": 55044.7890625, + "learning_rate": 2.276059814896314e-08, + "loss": 2.1055, + "step": 19808 + }, + { + "epoch": 3.7130271790065605, + "grad_norm": 60315.6953125, + "learning_rate": 2.2524143761698537e-08, + "loss": 2.0791, + "step": 19809 + }, + { + "epoch": 3.713214620431115, + "grad_norm": 53709.66015625, + "learning_rate": 2.2288923753294167e-08, + "loss": 2.0344, + "step": 19810 + }, + { + "epoch": 3.7134020618556702, + "grad_norm": 58309.71484375, + "learning_rate": 2.2054938129556503e-08, + "loss": 2.032, + "step": 19811 + }, + { + "epoch": 3.713589503280225, + "grad_norm": 54647.046875, + "learning_rate": 2.182218689626425e-08, + "loss": 2.0901, + "step": 19812 + }, + { + "epoch": 3.7137769447047795, + "grad_norm": 56288.10546875, + "learning_rate": 2.159067005916282e-08, + "loss": 2.0893, + "step": 19813 + }, + { + "epoch": 3.7139643861293345, + "grad_norm": 54273.29296875, + "learning_rate": 2.136038762398096e-08, + "loss": 2.0623, + "step": 19814 + }, + { + "epoch": 3.7141518275538896, + "grad_norm": 53401.4296875, + "learning_rate": 2.113133959639191e-08, + "loss": 2.0452, + "step": 19815 + }, + { + "epoch": 3.7143392689784442, + "grad_norm": 53841.15625, + "learning_rate": 2.0903525982063354e-08, + "loss": 2.0661, + "step": 19816 + }, + { + "epoch": 3.714526710402999, + "grad_norm": 53609.87109375, + "learning_rate": 2.0676946786618577e-08, + "loss": 2.0999, + "step": 19817 + }, + { + "epoch": 3.714714151827554, + "grad_norm": 55936.03515625, + "learning_rate": 2.0451602015653105e-08, + "loss": 2.0656, + "step": 19818 + }, + { + "epoch": 3.7149015932521086, + "grad_norm": 58987.515625, + "learning_rate": 2.02274916747347e-08, + "loss": 2.175, + "step": 19819 + }, + { + "epoch": 3.7150890346766636, + "grad_norm": 52061.703125, + "learning_rate": 2.0004615769397827e-08, + "loss": 2.0905, + "step": 19820 + }, + { + "epoch": 3.7152764761012183, + "grad_norm": 57246.16796875, + "learning_rate": 1.9782974305149192e-08, + "loss": 2.0206, + "step": 19821 + }, + { + "epoch": 3.7154639175257733, + "grad_norm": 53213.296875, + "learning_rate": 1.9562567287456647e-08, + "loss": 2.1116, + "step": 19822 + }, + { + "epoch": 3.715651358950328, + "grad_norm": 52737.5625, + "learning_rate": 1.934339472177138e-08, + "loss": 2.0871, + "step": 19823 + }, + { + "epoch": 3.715838800374883, + "grad_norm": 54563.4609375, + "learning_rate": 1.912545661350018e-08, + "loss": 2.0226, + "step": 19824 + }, + { + "epoch": 3.7160262417994376, + "grad_norm": 58472.13671875, + "learning_rate": 1.8908752968033183e-08, + "loss": 2.0311, + "step": 19825 + }, + { + "epoch": 3.7162136832239927, + "grad_norm": 55282.91015625, + "learning_rate": 1.869328379071611e-08, + "loss": 2.0303, + "step": 19826 + }, + { + "epoch": 3.7164011246485473, + "grad_norm": 58944.91796875, + "learning_rate": 1.847904908687248e-08, + "loss": 2.1375, + "step": 19827 + }, + { + "epoch": 3.716588566073102, + "grad_norm": 53170.6953125, + "learning_rate": 1.8266048861792508e-08, + "loss": 2.0736, + "step": 19828 + }, + { + "epoch": 3.716776007497657, + "grad_norm": 56200.02734375, + "learning_rate": 1.8054283120744198e-08, + "loss": 2.0567, + "step": 19829 + }, + { + "epoch": 3.7169634489222116, + "grad_norm": 57472.44921875, + "learning_rate": 1.7843751868951153e-08, + "loss": 2.1103, + "step": 19830 + }, + { + "epoch": 3.7171508903467667, + "grad_norm": 56747.48046875, + "learning_rate": 1.7634455111609217e-08, + "loss": 2.0288, + "step": 19831 + }, + { + "epoch": 3.7173383317713213, + "grad_norm": 55085.640625, + "learning_rate": 1.7426392853897576e-08, + "loss": 2.0675, + "step": 19832 + }, + { + "epoch": 3.7175257731958764, + "grad_norm": 54993.97265625, + "learning_rate": 1.721956510095102e-08, + "loss": 2.1738, + "step": 19833 + }, + { + "epoch": 3.717713214620431, + "grad_norm": 61345.171875, + "learning_rate": 1.7013971857876564e-08, + "loss": 2.0591, + "step": 19834 + }, + { + "epoch": 3.717900656044986, + "grad_norm": 61395.6796875, + "learning_rate": 1.680961312975349e-08, + "loss": 2.0837, + "step": 19835 + }, + { + "epoch": 3.7180880974695407, + "grad_norm": 61586.56640625, + "learning_rate": 1.660648892162775e-08, + "loss": 2.1313, + "step": 19836 + }, + { + "epoch": 3.718275538894096, + "grad_norm": 59395.00390625, + "learning_rate": 1.640459923851756e-08, + "loss": 2.0429, + "step": 19837 + }, + { + "epoch": 3.7184629803186504, + "grad_norm": 53947.03125, + "learning_rate": 1.6203944085407817e-08, + "loss": 2.0586, + "step": 19838 + }, + { + "epoch": 3.718650421743205, + "grad_norm": 52973.51953125, + "learning_rate": 1.6004523467261222e-08, + "loss": 2.088, + "step": 19839 + }, + { + "epoch": 3.71883786316776, + "grad_norm": 55389.3046875, + "learning_rate": 1.5806337388990512e-08, + "loss": 2.0274, + "step": 19840 + }, + { + "epoch": 3.719025304592315, + "grad_norm": 59540.39453125, + "learning_rate": 1.5609385855502868e-08, + "loss": 2.0668, + "step": 19841 + }, + { + "epoch": 3.71921274601687, + "grad_norm": 54325.48046875, + "learning_rate": 1.541366887165552e-08, + "loss": 2.0635, + "step": 19842 + }, + { + "epoch": 3.7194001874414244, + "grad_norm": 58385.23828125, + "learning_rate": 1.5219186442283483e-08, + "loss": 2.1447, + "step": 19843 + }, + { + "epoch": 3.7195876288659795, + "grad_norm": 55372.6875, + "learning_rate": 1.502593857219403e-08, + "loss": 2.0401, + "step": 19844 + }, + { + "epoch": 3.719775070290534, + "grad_norm": 55296.1484375, + "learning_rate": 1.4833925266155568e-08, + "loss": 2.1191, + "step": 19845 + }, + { + "epoch": 3.719962511715089, + "grad_norm": 57283.0, + "learning_rate": 1.4643146528914298e-08, + "loss": 2.0584, + "step": 19846 + }, + { + "epoch": 3.720149953139644, + "grad_norm": 56223.828125, + "learning_rate": 1.4453602365177566e-08, + "loss": 2.0482, + "step": 19847 + }, + { + "epoch": 3.720337394564199, + "grad_norm": 54449.703125, + "learning_rate": 1.4265292779636064e-08, + "loss": 2.0619, + "step": 19848 + }, + { + "epoch": 3.7205248359887535, + "grad_norm": 60541.5234375, + "learning_rate": 1.4078217776924973e-08, + "loss": 2.0554, + "step": 19849 + }, + { + "epoch": 3.720712277413308, + "grad_norm": 51118.375, + "learning_rate": 1.3892377361679476e-08, + "loss": 2.0381, + "step": 19850 + }, + { + "epoch": 3.720899718837863, + "grad_norm": 52312.9609375, + "learning_rate": 1.3707771538484792e-08, + "loss": 2.1117, + "step": 19851 + }, + { + "epoch": 3.7210871602624183, + "grad_norm": 55534.5, + "learning_rate": 1.3524400311898389e-08, + "loss": 2.0939, + "step": 19852 + }, + { + "epoch": 3.721274601686973, + "grad_norm": 54585.7578125, + "learning_rate": 1.3342263686449974e-08, + "loss": 2.0802, + "step": 19853 + }, + { + "epoch": 3.7214620431115275, + "grad_norm": 56737.51953125, + "learning_rate": 1.3161361666641502e-08, + "loss": 2.1042, + "step": 19854 + }, + { + "epoch": 3.7216494845360826, + "grad_norm": 56314.3359375, + "learning_rate": 1.2981694256936073e-08, + "loss": 2.11, + "step": 19855 + }, + { + "epoch": 3.721836925960637, + "grad_norm": 53229.36328125, + "learning_rate": 1.2803261461774574e-08, + "loss": 2.0267, + "step": 19856 + }, + { + "epoch": 3.7220243673851923, + "grad_norm": 59461.30859375, + "learning_rate": 1.2626063285564594e-08, + "loss": 2.061, + "step": 19857 + }, + { + "epoch": 3.722211808809747, + "grad_norm": 55747.48046875, + "learning_rate": 1.245009973268041e-08, + "loss": 2.0647, + "step": 19858 + }, + { + "epoch": 3.722399250234302, + "grad_norm": 55148.26171875, + "learning_rate": 1.2275370807468544e-08, + "loss": 2.0092, + "step": 19859 + }, + { + "epoch": 3.7225866916588566, + "grad_norm": 55094.68359375, + "learning_rate": 1.2101876514247768e-08, + "loss": 2.0706, + "step": 19860 + }, + { + "epoch": 3.722774133083411, + "grad_norm": 62968.0, + "learning_rate": 1.1929616857292435e-08, + "loss": 2.0869, + "step": 19861 + }, + { + "epoch": 3.7229615745079663, + "grad_norm": 55600.04296875, + "learning_rate": 1.1758591840871358e-08, + "loss": 2.0182, + "step": 19862 + }, + { + "epoch": 3.7231490159325213, + "grad_norm": 52415.94921875, + "learning_rate": 1.1588801469203381e-08, + "loss": 2.0962, + "step": 19863 + }, + { + "epoch": 3.723336457357076, + "grad_norm": 52976.8671875, + "learning_rate": 1.1420245746479596e-08, + "loss": 2.0639, + "step": 19864 + }, + { + "epoch": 3.7235238987816306, + "grad_norm": 54378.796875, + "learning_rate": 1.125292467686334e-08, + "loss": 2.0373, + "step": 19865 + }, + { + "epoch": 3.7237113402061857, + "grad_norm": 56586.14453125, + "learning_rate": 1.1086838264490195e-08, + "loss": 2.0354, + "step": 19866 + }, + { + "epoch": 3.7238987816307403, + "grad_norm": 56883.3359375, + "learning_rate": 1.0921986513456883e-08, + "loss": 2.0781, + "step": 19867 + }, + { + "epoch": 3.7240862230552954, + "grad_norm": 51950.85546875, + "learning_rate": 1.0758369427843473e-08, + "loss": 2.0463, + "step": 19868 + }, + { + "epoch": 3.72427366447985, + "grad_norm": 54504.734375, + "learning_rate": 1.0595987011680075e-08, + "loss": 2.0185, + "step": 19869 + }, + { + "epoch": 3.724461105904405, + "grad_norm": 63049.953125, + "learning_rate": 1.0434839268991248e-08, + "loss": 2.0801, + "step": 19870 + }, + { + "epoch": 3.7246485473289597, + "grad_norm": 53811.7890625, + "learning_rate": 1.0274926203746039e-08, + "loss": 2.0129, + "step": 19871 + }, + { + "epoch": 3.7248359887535143, + "grad_norm": 57861.703125, + "learning_rate": 1.0116247819896841e-08, + "loss": 2.0419, + "step": 19872 + }, + { + "epoch": 3.7250234301780694, + "grad_norm": 49302.28515625, + "learning_rate": 9.958804121362742e-09, + "loss": 2.1252, + "step": 19873 + }, + { + "epoch": 3.7252108716026244, + "grad_norm": 53949.41015625, + "learning_rate": 9.802595112035073e-09, + "loss": 2.0649, + "step": 19874 + }, + { + "epoch": 3.725398313027179, + "grad_norm": 57092.796875, + "learning_rate": 9.64762079576631e-09, + "loss": 2.0348, + "step": 19875 + }, + { + "epoch": 3.7255857544517337, + "grad_norm": 59722.2109375, + "learning_rate": 9.493881176392273e-09, + "loss": 2.0564, + "step": 19876 + }, + { + "epoch": 3.7257731958762887, + "grad_norm": 57271.70703125, + "learning_rate": 9.341376257704371e-09, + "loss": 2.0594, + "step": 19877 + }, + { + "epoch": 3.7259606373008434, + "grad_norm": 54816.27734375, + "learning_rate": 9.190106043471813e-09, + "loss": 2.0841, + "step": 19878 + }, + { + "epoch": 3.7261480787253984, + "grad_norm": 57598.859375, + "learning_rate": 9.04007053742495e-09, + "loss": 2.0385, + "step": 19879 + }, + { + "epoch": 3.726335520149953, + "grad_norm": 56507.9921875, + "learning_rate": 8.891269743277474e-09, + "loss": 2.0639, + "step": 19880 + }, + { + "epoch": 3.726522961574508, + "grad_norm": 53511.7109375, + "learning_rate": 8.743703664704228e-09, + "loss": 2.094, + "step": 19881 + }, + { + "epoch": 3.7267104029990628, + "grad_norm": 60224.1875, + "learning_rate": 8.59737230534119e-09, + "loss": 2.0183, + "step": 19882 + }, + { + "epoch": 3.7268978444236174, + "grad_norm": 55664.33203125, + "learning_rate": 8.452275668813237e-09, + "loss": 2.0552, + "step": 19883 + }, + { + "epoch": 3.7270852858481724, + "grad_norm": 59039.55078125, + "learning_rate": 8.308413758695289e-09, + "loss": 2.184, + "step": 19884 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 52358.3984375, + "learning_rate": 8.165786578545608e-09, + "loss": 2.0771, + "step": 19885 + }, + { + "epoch": 3.727460168697282, + "grad_norm": 58758.85546875, + "learning_rate": 8.024394131889157e-09, + "loss": 2.1064, + "step": 19886 + }, + { + "epoch": 3.7276476101218368, + "grad_norm": 54558.859375, + "learning_rate": 7.884236422206481e-09, + "loss": 2.0567, + "step": 19887 + }, + { + "epoch": 3.727835051546392, + "grad_norm": 54126.23046875, + "learning_rate": 7.74531345297258e-09, + "loss": 2.121, + "step": 19888 + }, + { + "epoch": 3.7280224929709465, + "grad_norm": 56444.3515625, + "learning_rate": 7.607625227612492e-09, + "loss": 2.1147, + "step": 19889 + }, + { + "epoch": 3.7282099343955015, + "grad_norm": 57024.51953125, + "learning_rate": 7.471171749529049e-09, + "loss": 2.0631, + "step": 19890 + }, + { + "epoch": 3.728397375820056, + "grad_norm": 52697.125, + "learning_rate": 7.335953022091779e-09, + "loss": 2.0768, + "step": 19891 + }, + { + "epoch": 3.728584817244611, + "grad_norm": 56517.16796875, + "learning_rate": 7.201969048642454e-09, + "loss": 2.1777, + "step": 19892 + }, + { + "epoch": 3.728772258669166, + "grad_norm": 53492.1953125, + "learning_rate": 7.0692198324839866e-09, + "loss": 2.0762, + "step": 19893 + }, + { + "epoch": 3.7289597000937205, + "grad_norm": 63305.66015625, + "learning_rate": 6.9377053768970856e-09, + "loss": 2.0674, + "step": 19894 + }, + { + "epoch": 3.7291471415182755, + "grad_norm": 55524.0859375, + "learning_rate": 6.807425685134705e-09, + "loss": 2.1151, + "step": 19895 + }, + { + "epoch": 3.7293345829428306, + "grad_norm": 57038.32421875, + "learning_rate": 6.678380760410941e-09, + "loss": 2.0362, + "step": 19896 + }, + { + "epoch": 3.7295220243673852, + "grad_norm": 52682.9296875, + "learning_rate": 6.550570605912132e-09, + "loss": 2.06, + "step": 19897 + }, + { + "epoch": 3.72970946579194, + "grad_norm": 52087.22265625, + "learning_rate": 6.4239952248024145e-09, + "loss": 2.0279, + "step": 19898 + }, + { + "epoch": 3.729896907216495, + "grad_norm": 53708.19140625, + "learning_rate": 6.298654620195965e-09, + "loss": 2.0849, + "step": 19899 + }, + { + "epoch": 3.7300843486410495, + "grad_norm": 53405.17578125, + "learning_rate": 6.174548795201407e-09, + "loss": 2.038, + "step": 19900 + }, + { + "epoch": 3.7302717900656046, + "grad_norm": 58525.64453125, + "learning_rate": 6.051677752871854e-09, + "loss": 2.034, + "step": 19901 + }, + { + "epoch": 3.7304592314901592, + "grad_norm": 58860.734375, + "learning_rate": 5.930041496249317e-09, + "loss": 2.0589, + "step": 19902 + }, + { + "epoch": 3.7306466729147143, + "grad_norm": 54521.109375, + "learning_rate": 5.809640028336949e-09, + "loss": 2.0068, + "step": 19903 + }, + { + "epoch": 3.730834114339269, + "grad_norm": 57440.609375, + "learning_rate": 5.690473352110148e-09, + "loss": 2.0336, + "step": 19904 + }, + { + "epoch": 3.7310215557638235, + "grad_norm": 57179.12890625, + "learning_rate": 5.572541470505455e-09, + "loss": 2.0417, + "step": 19905 + }, + { + "epoch": 3.7312089971883786, + "grad_norm": 52146.95703125, + "learning_rate": 5.455844386448305e-09, + "loss": 2.0548, + "step": 19906 + }, + { + "epoch": 3.7313964386129337, + "grad_norm": 55260.4921875, + "learning_rate": 5.3403821028030764e-09, + "loss": 2.0432, + "step": 19907 + }, + { + "epoch": 3.7315838800374883, + "grad_norm": 56832.40234375, + "learning_rate": 5.226154622439694e-09, + "loss": 2.0139, + "step": 19908 + }, + { + "epoch": 3.731771321462043, + "grad_norm": 53523.26171875, + "learning_rate": 5.113161948167023e-09, + "loss": 2.0862, + "step": 19909 + }, + { + "epoch": 3.731958762886598, + "grad_norm": 54821.78515625, + "learning_rate": 5.001404082777273e-09, + "loss": 2.1099, + "step": 19910 + }, + { + "epoch": 3.7321462043111526, + "grad_norm": 59981.45703125, + "learning_rate": 4.8908810290349e-09, + "loss": 2.0572, + "step": 19911 + }, + { + "epoch": 3.7323336457357077, + "grad_norm": 57062.01171875, + "learning_rate": 4.781592789671052e-09, + "loss": 2.2718, + "step": 19912 + }, + { + "epoch": 3.7325210871602623, + "grad_norm": 55440.49609375, + "learning_rate": 4.6735393673835726e-09, + "loss": 2.0786, + "step": 19913 + }, + { + "epoch": 3.7327085285848174, + "grad_norm": 55174.33984375, + "learning_rate": 4.566720764831445e-09, + "loss": 2.0523, + "step": 19914 + }, + { + "epoch": 3.732895970009372, + "grad_norm": 57380.9921875, + "learning_rate": 4.4611369846681015e-09, + "loss": 2.0846, + "step": 19915 + }, + { + "epoch": 3.7330834114339266, + "grad_norm": 65806.8984375, + "learning_rate": 4.356788029491465e-09, + "loss": 2.0319, + "step": 19916 + }, + { + "epoch": 3.7332708528584817, + "grad_norm": 55742.01171875, + "learning_rate": 4.2536739018883556e-09, + "loss": 2.1156, + "step": 19917 + }, + { + "epoch": 3.7334582942830368, + "grad_norm": 61331.453125, + "learning_rate": 4.15179460439008e-09, + "loss": 2.1352, + "step": 19918 + }, + { + "epoch": 3.7336457357075914, + "grad_norm": 53254.9765625, + "learning_rate": 4.051150139527948e-09, + "loss": 2.0967, + "step": 19919 + }, + { + "epoch": 3.733833177132146, + "grad_norm": 54293.0234375, + "learning_rate": 3.9517405097777575e-09, + "loss": 2.0805, + "step": 19920 + }, + { + "epoch": 3.734020618556701, + "grad_norm": 56245.43359375, + "learning_rate": 3.8535657176042016e-09, + "loss": 2.0539, + "step": 19921 + }, + { + "epoch": 3.7342080599812557, + "grad_norm": 55648.7890625, + "learning_rate": 3.756625765422017e-09, + "loss": 2.0966, + "step": 19922 + }, + { + "epoch": 3.734395501405811, + "grad_norm": 58430.2890625, + "learning_rate": 3.660920655629285e-09, + "loss": 2.0414, + "step": 19923 + }, + { + "epoch": 3.7345829428303654, + "grad_norm": 52044.03125, + "learning_rate": 3.5664503905907808e-09, + "loss": 2.0725, + "step": 19924 + }, + { + "epoch": 3.7347703842549205, + "grad_norm": 58547.765625, + "learning_rate": 3.4732149726435235e-09, + "loss": 2.0557, + "step": 19925 + }, + { + "epoch": 3.734957825679475, + "grad_norm": 58752.0234375, + "learning_rate": 3.3812144040856753e-09, + "loss": 2.1156, + "step": 19926 + }, + { + "epoch": 3.7351452671040297, + "grad_norm": 56132.875, + "learning_rate": 3.290448687187642e-09, + "loss": 2.0955, + "step": 19927 + }, + { + "epoch": 3.735332708528585, + "grad_norm": 52723.59375, + "learning_rate": 3.2009178241920735e-09, + "loss": 2.0659, + "step": 19928 + }, + { + "epoch": 3.73552014995314, + "grad_norm": 54836.10546875, + "learning_rate": 3.1126218173138656e-09, + "loss": 2.0779, + "step": 19929 + }, + { + "epoch": 3.7357075913776945, + "grad_norm": 55081.375, + "learning_rate": 3.025560668734606e-09, + "loss": 2.0791, + "step": 19930 + }, + { + "epoch": 3.735895032802249, + "grad_norm": 59222.390625, + "learning_rate": 2.939734380602577e-09, + "loss": 2.0559, + "step": 19931 + }, + { + "epoch": 3.736082474226804, + "grad_norm": 56313.61328125, + "learning_rate": 2.8551429550327524e-09, + "loss": 2.1938, + "step": 19932 + }, + { + "epoch": 3.736269915651359, + "grad_norm": 57768.34375, + "learning_rate": 2.7717863941179035e-09, + "loss": 2.0401, + "step": 19933 + }, + { + "epoch": 3.736457357075914, + "grad_norm": 53721.046875, + "learning_rate": 2.6896646999230447e-09, + "loss": 2.0713, + "step": 19934 + }, + { + "epoch": 3.7366447985004685, + "grad_norm": 56116.19140625, + "learning_rate": 2.608777874468782e-09, + "loss": 2.1345, + "step": 19935 + }, + { + "epoch": 3.7368322399250236, + "grad_norm": 53385.87109375, + "learning_rate": 2.5291259197535167e-09, + "loss": 2.0391, + "step": 19936 + }, + { + "epoch": 3.737019681349578, + "grad_norm": 54231.96875, + "learning_rate": 2.450708837747895e-09, + "loss": 2.0277, + "step": 19937 + }, + { + "epoch": 3.737207122774133, + "grad_norm": 52934.421875, + "learning_rate": 2.3735266303837045e-09, + "loss": 2.0891, + "step": 19938 + }, + { + "epoch": 3.737394564198688, + "grad_norm": 56941.4609375, + "learning_rate": 2.2975792995760803e-09, + "loss": 2.137, + "step": 19939 + }, + { + "epoch": 3.737582005623243, + "grad_norm": 55785.50390625, + "learning_rate": 2.222866847190197e-09, + "loss": 2.0829, + "step": 19940 + }, + { + "epoch": 3.7377694470477976, + "grad_norm": 56315.88671875, + "learning_rate": 2.149389275080127e-09, + "loss": 2.0201, + "step": 19941 + }, + { + "epoch": 3.737956888472352, + "grad_norm": 57183.46875, + "learning_rate": 2.077146585055534e-09, + "loss": 2.1225, + "step": 19942 + }, + { + "epoch": 3.7381443298969073, + "grad_norm": 55466.11328125, + "learning_rate": 2.0061387788983255e-09, + "loss": 2.0275, + "step": 19943 + }, + { + "epoch": 3.738331771321462, + "grad_norm": 55094.01171875, + "learning_rate": 1.9363658583682053e-09, + "loss": 2.0637, + "step": 19944 + }, + { + "epoch": 3.738519212746017, + "grad_norm": 58266.359375, + "learning_rate": 1.8678278251915704e-09, + "loss": 2.1564, + "step": 19945 + }, + { + "epoch": 3.7387066541705716, + "grad_norm": 55887.94921875, + "learning_rate": 1.8005246810504084e-09, + "loss": 1.9833, + "step": 19946 + }, + { + "epoch": 3.7388940955951266, + "grad_norm": 53594.2890625, + "learning_rate": 1.734456427615605e-09, + "loss": 2.0309, + "step": 19947 + }, + { + "epoch": 3.7390815370196813, + "grad_norm": 55641.9453125, + "learning_rate": 1.6696230665136369e-09, + "loss": 2.0776, + "step": 19948 + }, + { + "epoch": 3.7392689784442363, + "grad_norm": 56149.60546875, + "learning_rate": 1.6060245993487766e-09, + "loss": 2.0347, + "step": 19949 + }, + { + "epoch": 3.739456419868791, + "grad_norm": 61821.7265625, + "learning_rate": 1.5436610276919893e-09, + "loss": 2.0117, + "step": 19950 + }, + { + "epoch": 3.739643861293346, + "grad_norm": 55909.296875, + "learning_rate": 1.4825323530864855e-09, + "loss": 2.1025, + "step": 19951 + }, + { + "epoch": 3.7398313027179007, + "grad_norm": 60123.12890625, + "learning_rate": 1.4226385770310658e-09, + "loss": 2.0619, + "step": 19952 + }, + { + "epoch": 3.7400187441424553, + "grad_norm": 53343.3359375, + "learning_rate": 1.3639797010189804e-09, + "loss": 2.037, + "step": 19953 + }, + { + "epoch": 3.7402061855670103, + "grad_norm": 55827.12109375, + "learning_rate": 1.306555726487968e-09, + "loss": 2.0576, + "step": 19954 + }, + { + "epoch": 3.7403936269915654, + "grad_norm": 51763.73828125, + "learning_rate": 1.2503666548646654e-09, + "loss": 2.084, + "step": 19955 + }, + { + "epoch": 3.74058106841612, + "grad_norm": 54496.5859375, + "learning_rate": 1.1954124875257488e-09, + "loss": 2.0282, + "step": 19956 + }, + { + "epoch": 3.7407685098406747, + "grad_norm": 54097.1015625, + "learning_rate": 1.141693225842344e-09, + "loss": 2.0966, + "step": 19957 + }, + { + "epoch": 3.7409559512652297, + "grad_norm": 57775.66796875, + "learning_rate": 1.0892088711356163e-09, + "loss": 2.0, + "step": 19958 + }, + { + "epoch": 3.7411433926897844, + "grad_norm": 53652.24609375, + "learning_rate": 1.0379594246934243e-09, + "loss": 2.0721, + "step": 19959 + }, + { + "epoch": 3.7413308341143394, + "grad_norm": 56439.328125, + "learning_rate": 9.879448877980757e-10, + "loss": 2.0671, + "step": 19960 + }, + { + "epoch": 3.741518275538894, + "grad_norm": 56836.09765625, + "learning_rate": 9.391652616708157e-10, + "loss": 2.0575, + "step": 19961 + }, + { + "epoch": 3.741705716963449, + "grad_norm": 57677.015625, + "learning_rate": 8.916205475217876e-10, + "loss": 2.1106, + "step": 19962 + }, + { + "epoch": 3.7418931583880037, + "grad_norm": 55787.47265625, + "learning_rate": 8.453107465222765e-10, + "loss": 2.0547, + "step": 19963 + }, + { + "epoch": 3.7420805998125584, + "grad_norm": 56944.45703125, + "learning_rate": 8.002358598213633e-10, + "loss": 2.0789, + "step": 19964 + }, + { + "epoch": 3.7422680412371134, + "grad_norm": 51668.33984375, + "learning_rate": 7.56395888529271e-10, + "loss": 1.9781, + "step": 19965 + }, + { + "epoch": 3.7424554826616685, + "grad_norm": 55572.99609375, + "learning_rate": 7.137908337340182e-10, + "loss": 2.0732, + "step": 19966 + }, + { + "epoch": 3.742642924086223, + "grad_norm": 57435.58203125, + "learning_rate": 6.724206964792146e-10, + "loss": 2.0033, + "step": 19967 + }, + { + "epoch": 3.7428303655107777, + "grad_norm": 53893.0546875, + "learning_rate": 6.322854777862652e-10, + "loss": 2.0116, + "step": 19968 + }, + { + "epoch": 3.743017806935333, + "grad_norm": 53548.82421875, + "learning_rate": 5.933851786543709e-10, + "loss": 2.0323, + "step": 19969 + }, + { + "epoch": 3.7432052483598874, + "grad_norm": 49593.73828125, + "learning_rate": 5.557198000438745e-10, + "loss": 2.0292, + "step": 19970 + }, + { + "epoch": 3.7433926897844425, + "grad_norm": 53899.4921875, + "learning_rate": 5.192893428762613e-10, + "loss": 2.0716, + "step": 19971 + }, + { + "epoch": 3.743580131208997, + "grad_norm": 54508.4453125, + "learning_rate": 4.840938080619139e-10, + "loss": 2.0353, + "step": 19972 + }, + { + "epoch": 3.743767572633552, + "grad_norm": 54312.15625, + "learning_rate": 4.501331964612554e-10, + "loss": 2.0721, + "step": 19973 + }, + { + "epoch": 3.743955014058107, + "grad_norm": 63377.22265625, + "learning_rate": 4.1740750891805513e-10, + "loss": 2.058, + "step": 19974 + }, + { + "epoch": 3.7441424554826614, + "grad_norm": 58156.484375, + "learning_rate": 3.85916746242776e-10, + "loss": 2.0426, + "step": 19975 + }, + { + "epoch": 3.7443298969072165, + "grad_norm": 49341.43359375, + "learning_rate": 3.556609092070229e-10, + "loss": 2.1004, + "step": 19976 + }, + { + "epoch": 3.7445173383317716, + "grad_norm": 57138.16015625, + "learning_rate": 3.2663999856019647e-10, + "loss": 2.115, + "step": 19977 + }, + { + "epoch": 3.744704779756326, + "grad_norm": 55996.08984375, + "learning_rate": 2.988540150183905e-10, + "loss": 2.1085, + "step": 19978 + }, + { + "epoch": 3.744892221180881, + "grad_norm": 57562.94921875, + "learning_rate": 2.723029592699433e-10, + "loss": 2.0646, + "step": 19979 + }, + { + "epoch": 3.745079662605436, + "grad_norm": 56048.8671875, + "learning_rate": 2.4698683197543757e-10, + "loss": 2.1138, + "step": 19980 + }, + { + "epoch": 3.7452671040299905, + "grad_norm": 57400.87109375, + "learning_rate": 2.229056337510471e-10, + "loss": 2.1033, + "step": 19981 + }, + { + "epoch": 3.7454545454545456, + "grad_norm": 59131.4765625, + "learning_rate": 2.000593651907412e-10, + "loss": 2.0982, + "step": 19982 + }, + { + "epoch": 3.7456419868791, + "grad_norm": 53547.5546875, + "learning_rate": 1.7844802687183583e-10, + "loss": 1.9826, + "step": 19983 + }, + { + "epoch": 3.7458294283036553, + "grad_norm": 56271.7421875, + "learning_rate": 1.5807161931058467e-10, + "loss": 2.2172, + "step": 19984 + }, + { + "epoch": 3.74601686972821, + "grad_norm": 53915.46875, + "learning_rate": 1.389301430287926e-10, + "loss": 2.094, + "step": 19985 + }, + { + "epoch": 3.7462043111527645, + "grad_norm": 54150.19921875, + "learning_rate": 1.2102359848165102e-10, + "loss": 2.1056, + "step": 19986 + }, + { + "epoch": 3.7463917525773196, + "grad_norm": 50660.07421875, + "learning_rate": 1.0435198612435137e-10, + "loss": 2.0363, + "step": 19987 + }, + { + "epoch": 3.7465791940018747, + "grad_norm": 54897.84375, + "learning_rate": 8.891530636212509e-11, + "loss": 1.9721, + "step": 19988 + }, + { + "epoch": 3.7467666354264293, + "grad_norm": 60145.34375, + "learning_rate": 7.471355957799908e-11, + "loss": 2.0128, + "step": 19989 + }, + { + "epoch": 3.746954076850984, + "grad_norm": 53051.89453125, + "learning_rate": 6.174674612169362e-11, + "loss": 1.9819, + "step": 19990 + }, + { + "epoch": 3.747141518275539, + "grad_norm": 57339.6640625, + "learning_rate": 5.001486631517338e-11, + "loss": 2.0733, + "step": 19991 + }, + { + "epoch": 3.7473289597000936, + "grad_norm": 60197.375, + "learning_rate": 3.951792044709635e-11, + "loss": 2.0944, + "step": 19992 + }, + { + "epoch": 3.7475164011246487, + "grad_norm": 52966.98828125, + "learning_rate": 3.0255908772813815e-11, + "loss": 1.9821, + "step": 19993 + }, + { + "epoch": 3.7477038425492033, + "grad_norm": 56442.92578125, + "learning_rate": 2.222883153102373e-11, + "loss": 2.0393, + "step": 19994 + }, + { + "epoch": 3.7478912839737584, + "grad_norm": 54434.30078125, + "learning_rate": 1.5436688910464014e-11, + "loss": 2.0806, + "step": 19995 + }, + { + "epoch": 3.748078725398313, + "grad_norm": 51262.70703125, + "learning_rate": 9.879481088770348e-12, + "loss": 2.0165, + "step": 19996 + }, + { + "epoch": 3.7482661668228676, + "grad_norm": 58336.93359375, + "learning_rate": 5.55720819361838e-12, + "loss": 2.1269, + "step": 19997 + }, + { + "epoch": 3.7484536082474227, + "grad_norm": 54157.1953125, + "learning_rate": 2.4698703304792957e-12, + "loss": 2.129, + "step": 19998 + }, + { + "epoch": 3.7486410496719778, + "grad_norm": 62931.515625, + "learning_rate": 6.174675881709391e-13, + "loss": 2.0879, + "step": 19999 + }, + { + "epoch": 3.7488284910965324, + "grad_norm": 53209.32421875, + "learning_rate": 0.0, + "loss": 2.052, + "step": 20000 + }, + { + "epoch": 3.7488284910965324, + "eval_loss": 2.256967544555664, + "eval_runtime": 127.8375, + "eval_samples_per_second": 39.495, + "eval_steps_per_second": 1.979, + "step": 20000 + } + ], + "logging_steps": 1, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.573828591348613e+17, + "train_batch_size": 20, + "trial_name": null, + "trial_params": null +}