diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10360 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997456549385333, + "eval_steps": 500, + "global_step": 1474, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006782534972445952, + "grad_norm": 4.1065765912581975, + "learning_rate": 8.88888888888889e-07, + "loss": 1.6818, + "step": 1 + }, + { + "epoch": 0.0013565069944891904, + "grad_norm": 3.1768641966000803, + "learning_rate": 1.777777777777778e-06, + "loss": 1.6016, + "step": 2 + }, + { + "epoch": 0.0020347604917337857, + "grad_norm": 4.208842639816025, + "learning_rate": 2.666666666666667e-06, + "loss": 1.694, + "step": 3 + }, + { + "epoch": 0.0027130139889783808, + "grad_norm": 4.144597866071873, + "learning_rate": 3.555555555555556e-06, + "loss": 1.6743, + "step": 4 + }, + { + "epoch": 0.003391267486222976, + "grad_norm": 4.367675036005987, + "learning_rate": 4.444444444444444e-06, + "loss": 1.7215, + "step": 5 + }, + { + "epoch": 0.004069520983467571, + "grad_norm": 3.77451956658442, + "learning_rate": 5.333333333333334e-06, + "loss": 1.7412, + "step": 6 + }, + { + "epoch": 0.004747774480712166, + "grad_norm": 2.828341566261712, + "learning_rate": 6.222222222222223e-06, + "loss": 1.5605, + "step": 7 + }, + { + "epoch": 0.0054260279779567615, + "grad_norm": 2.1739524057415145, + "learning_rate": 7.111111111111112e-06, + "loss": 1.6683, + "step": 8 + }, + { + "epoch": 0.006104281475201356, + "grad_norm": 1.6980194638479158, + "learning_rate": 8.000000000000001e-06, + "loss": 1.6685, + "step": 9 + }, + { + "epoch": 0.006782534972445952, + "grad_norm": 1.5011794096474325, + "learning_rate": 8.888888888888888e-06, + "loss": 1.648, + "step": 10 + }, + { + "epoch": 0.007460788469690547, + "grad_norm": 2.322587769441417, + "learning_rate": 9.777777777777779e-06, + "loss": 1.686, + "step": 11 + }, + { + "epoch": 0.008139041966935143, + "grad_norm": 1.705227880799364, + "learning_rate": 1.0666666666666667e-05, + "loss": 1.4842, + "step": 12 + }, + { + "epoch": 0.008817295464179737, + "grad_norm": 2.7772036835193217, + "learning_rate": 1.1555555555555556e-05, + "loss": 1.6403, + "step": 13 + }, + { + "epoch": 0.009495548961424332, + "grad_norm": 2.921483047712852, + "learning_rate": 1.2444444444444446e-05, + "loss": 1.6296, + "step": 14 + }, + { + "epoch": 0.010173802458668928, + "grad_norm": 2.8535382585231237, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.6325, + "step": 15 + }, + { + "epoch": 0.010852055955913523, + "grad_norm": 2.6523988159853955, + "learning_rate": 1.4222222222222224e-05, + "loss": 1.6585, + "step": 16 + }, + { + "epoch": 0.011530309453158118, + "grad_norm": 2.4275237868017268, + "learning_rate": 1.5111111111111112e-05, + "loss": 1.6138, + "step": 17 + }, + { + "epoch": 0.012208562950402712, + "grad_norm": 1.9022283612641862, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.6117, + "step": 18 + }, + { + "epoch": 0.012886816447647309, + "grad_norm": 1.4961663445429363, + "learning_rate": 1.688888888888889e-05, + "loss": 1.5869, + "step": 19 + }, + { + "epoch": 0.013565069944891903, + "grad_norm": 1.2530261801072107, + "learning_rate": 1.7777777777777777e-05, + "loss": 1.6136, + "step": 20 + }, + { + "epoch": 0.014243323442136498, + "grad_norm": 1.3729319190613, + "learning_rate": 1.866666666666667e-05, + "loss": 1.5903, + "step": 21 + }, + { + "epoch": 0.014921576939381094, + "grad_norm": 1.6164121258819937, + "learning_rate": 1.9555555555555557e-05, + "loss": 1.5962, + "step": 22 + }, + { + "epoch": 0.015599830436625689, + "grad_norm": 1.868473824482503, + "learning_rate": 2.0444444444444446e-05, + "loss": 1.5636, + "step": 23 + }, + { + "epoch": 0.016278083933870285, + "grad_norm": 1.7300543140348683, + "learning_rate": 2.1333333333333335e-05, + "loss": 1.5771, + "step": 24 + }, + { + "epoch": 0.01695633743111488, + "grad_norm": 1.5738361080005434, + "learning_rate": 2.2222222222222227e-05, + "loss": 1.5466, + "step": 25 + }, + { + "epoch": 0.017634590928359475, + "grad_norm": 1.3215383127512057, + "learning_rate": 2.3111111111111112e-05, + "loss": 1.5627, + "step": 26 + }, + { + "epoch": 0.01831284442560407, + "grad_norm": 1.1786255386020765, + "learning_rate": 2.4e-05, + "loss": 1.5864, + "step": 27 + }, + { + "epoch": 0.018991097922848664, + "grad_norm": 1.0543471785756497, + "learning_rate": 2.4888888888888893e-05, + "loss": 1.515, + "step": 28 + }, + { + "epoch": 0.01966935142009326, + "grad_norm": 1.044341262007584, + "learning_rate": 2.577777777777778e-05, + "loss": 1.5272, + "step": 29 + }, + { + "epoch": 0.020347604917337857, + "grad_norm": 1.1429084132740057, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.5751, + "step": 30 + }, + { + "epoch": 0.02102585841458245, + "grad_norm": 1.1720155421313065, + "learning_rate": 2.755555555555556e-05, + "loss": 1.5623, + "step": 31 + }, + { + "epoch": 0.021704111911827046, + "grad_norm": 1.0700523128168293, + "learning_rate": 2.8444444444444447e-05, + "loss": 1.5182, + "step": 32 + }, + { + "epoch": 0.02238236540907164, + "grad_norm": 0.9851762418398425, + "learning_rate": 2.9333333333333333e-05, + "loss": 1.5367, + "step": 33 + }, + { + "epoch": 0.023060618906316235, + "grad_norm": 0.9364787529061317, + "learning_rate": 3.0222222222222225e-05, + "loss": 1.5335, + "step": 34 + }, + { + "epoch": 0.02373887240356083, + "grad_norm": 0.8677156668511112, + "learning_rate": 3.111111111111112e-05, + "loss": 1.53, + "step": 35 + }, + { + "epoch": 0.024417125900805425, + "grad_norm": 0.9026523779646922, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.497, + "step": 36 + }, + { + "epoch": 0.02509537939805002, + "grad_norm": 0.8913970222309001, + "learning_rate": 3.288888888888889e-05, + "loss": 1.4952, + "step": 37 + }, + { + "epoch": 0.025773632895294617, + "grad_norm": 0.8834806762413843, + "learning_rate": 3.377777777777778e-05, + "loss": 1.486, + "step": 38 + }, + { + "epoch": 0.02645188639253921, + "grad_norm": 0.8387084982910075, + "learning_rate": 3.466666666666667e-05, + "loss": 1.5111, + "step": 39 + }, + { + "epoch": 0.027130139889783807, + "grad_norm": 0.8002050518805109, + "learning_rate": 3.555555555555555e-05, + "loss": 1.4901, + "step": 40 + }, + { + "epoch": 0.027808393387028403, + "grad_norm": 0.8499910835361242, + "learning_rate": 3.644444444444445e-05, + "loss": 1.5206, + "step": 41 + }, + { + "epoch": 0.028486646884272996, + "grad_norm": 0.8341993828007787, + "learning_rate": 3.733333333333334e-05, + "loss": 1.5024, + "step": 42 + }, + { + "epoch": 0.029164900381517592, + "grad_norm": 0.880517862113079, + "learning_rate": 3.8222222222222226e-05, + "loss": 1.5103, + "step": 43 + }, + { + "epoch": 0.02984315387876219, + "grad_norm": 0.8096099002799838, + "learning_rate": 3.9111111111111115e-05, + "loss": 1.4883, + "step": 44 + }, + { + "epoch": 0.03052140737600678, + "grad_norm": 0.8050025671092632, + "learning_rate": 4e-05, + "loss": 1.4605, + "step": 45 + }, + { + "epoch": 0.031199660873251378, + "grad_norm": 0.8660853245091175, + "learning_rate": 3.999995166796149e-05, + "loss": 1.5147, + "step": 46 + }, + { + "epoch": 0.031877914370495974, + "grad_norm": 0.7562387786559109, + "learning_rate": 3.999980667207955e-05, + "loss": 1.4858, + "step": 47 + }, + { + "epoch": 0.03255616786774057, + "grad_norm": 0.7875069445374301, + "learning_rate": 3.9999565013054966e-05, + "loss": 1.4917, + "step": 48 + }, + { + "epoch": 0.03323442136498516, + "grad_norm": 0.80585752298433, + "learning_rate": 3.999922669205574e-05, + "loss": 1.4537, + "step": 49 + }, + { + "epoch": 0.03391267486222976, + "grad_norm": 0.8472273049831842, + "learning_rate": 3.9998791710717044e-05, + "loss": 1.4683, + "step": 50 + }, + { + "epoch": 0.03459092835947435, + "grad_norm": 0.8143071540357201, + "learning_rate": 3.999826007114122e-05, + "loss": 1.5072, + "step": 51 + }, + { + "epoch": 0.03526918185671895, + "grad_norm": 0.7861158344158029, + "learning_rate": 3.99976317758978e-05, + "loss": 1.468, + "step": 52 + }, + { + "epoch": 0.035947435353963546, + "grad_norm": 0.734537069409861, + "learning_rate": 3.999690682802347e-05, + "loss": 1.493, + "step": 53 + }, + { + "epoch": 0.03662568885120814, + "grad_norm": 0.7716856308294497, + "learning_rate": 3.9996085231022037e-05, + "loss": 1.4558, + "step": 54 + }, + { + "epoch": 0.03730394234845273, + "grad_norm": 0.8134849509429027, + "learning_rate": 3.9995166988864454e-05, + "loss": 1.4799, + "step": 55 + }, + { + "epoch": 0.03798219584569733, + "grad_norm": 0.7510660632610676, + "learning_rate": 3.999415210598877e-05, + "loss": 1.4794, + "step": 56 + }, + { + "epoch": 0.038660449342941924, + "grad_norm": 0.7615105320958904, + "learning_rate": 3.999304058730012e-05, + "loss": 1.4873, + "step": 57 + }, + { + "epoch": 0.03933870284018652, + "grad_norm": 0.7548985780285249, + "learning_rate": 3.9991832438170706e-05, + "loss": 1.4626, + "step": 58 + }, + { + "epoch": 0.04001695633743112, + "grad_norm": 0.7707075108441955, + "learning_rate": 3.999052766443975e-05, + "loss": 1.4955, + "step": 59 + }, + { + "epoch": 0.040695209834675714, + "grad_norm": 0.7796824365420911, + "learning_rate": 3.99891262724135e-05, + "loss": 1.4633, + "step": 60 + }, + { + "epoch": 0.0413734633319203, + "grad_norm": 1.6634166289908014, + "learning_rate": 3.998762826886516e-05, + "loss": 1.5228, + "step": 61 + }, + { + "epoch": 0.0420517168291649, + "grad_norm": 0.780351811457274, + "learning_rate": 3.998603366103489e-05, + "loss": 1.4538, + "step": 62 + }, + { + "epoch": 0.042729970326409496, + "grad_norm": 0.8635411707323375, + "learning_rate": 3.9984342456629754e-05, + "loss": 1.4477, + "step": 63 + }, + { + "epoch": 0.04340822382365409, + "grad_norm": 0.8326874431336305, + "learning_rate": 3.998255466382369e-05, + "loss": 1.4486, + "step": 64 + }, + { + "epoch": 0.04408647732089869, + "grad_norm": 0.7723770463913483, + "learning_rate": 3.998067029125746e-05, + "loss": 1.45, + "step": 65 + }, + { + "epoch": 0.04476473081814328, + "grad_norm": 0.7740863591041413, + "learning_rate": 3.9978689348038635e-05, + "loss": 1.4506, + "step": 66 + }, + { + "epoch": 0.045442984315387874, + "grad_norm": 0.7484072365132953, + "learning_rate": 3.99766118437415e-05, + "loss": 1.4502, + "step": 67 + }, + { + "epoch": 0.04612123781263247, + "grad_norm": 0.7800723070156931, + "learning_rate": 3.997443778840707e-05, + "loss": 1.4607, + "step": 68 + }, + { + "epoch": 0.04679949130987707, + "grad_norm": 0.7720573548329548, + "learning_rate": 3.997216719254298e-05, + "loss": 1.4343, + "step": 69 + }, + { + "epoch": 0.04747774480712166, + "grad_norm": 0.7419949116021992, + "learning_rate": 3.9969800067123503e-05, + "loss": 1.4575, + "step": 70 + }, + { + "epoch": 0.04815599830436626, + "grad_norm": 0.7757764657684928, + "learning_rate": 3.9967336423589425e-05, + "loss": 1.4346, + "step": 71 + }, + { + "epoch": 0.04883425180161085, + "grad_norm": 0.7372814045835184, + "learning_rate": 3.9964776273848044e-05, + "loss": 1.4572, + "step": 72 + }, + { + "epoch": 0.049512505298855446, + "grad_norm": 0.7861776737925739, + "learning_rate": 3.996211963027309e-05, + "loss": 1.4342, + "step": 73 + }, + { + "epoch": 0.05019075879610004, + "grad_norm": 0.7705824940277366, + "learning_rate": 3.9959366505704646e-05, + "loss": 1.4077, + "step": 74 + }, + { + "epoch": 0.05086901229334464, + "grad_norm": 0.7550842878169225, + "learning_rate": 3.995651691344914e-05, + "loss": 1.4187, + "step": 75 + }, + { + "epoch": 0.051547265790589235, + "grad_norm": 0.7935364668605709, + "learning_rate": 3.9953570867279225e-05, + "loss": 1.4359, + "step": 76 + }, + { + "epoch": 0.05222551928783383, + "grad_norm": 0.7682454471435735, + "learning_rate": 3.995052838143375e-05, + "loss": 1.4539, + "step": 77 + }, + { + "epoch": 0.05290377278507842, + "grad_norm": 0.7339626264298819, + "learning_rate": 3.994738947061766e-05, + "loss": 1.4596, + "step": 78 + }, + { + "epoch": 0.05358202628232302, + "grad_norm": 0.8142559965687818, + "learning_rate": 3.9944154150001956e-05, + "loss": 1.4757, + "step": 79 + }, + { + "epoch": 0.05426027977956761, + "grad_norm": 0.8023956266998655, + "learning_rate": 3.9940822435223596e-05, + "loss": 1.4541, + "step": 80 + }, + { + "epoch": 0.05493853327681221, + "grad_norm": 0.727937829181823, + "learning_rate": 3.993739434238545e-05, + "loss": 1.4323, + "step": 81 + }, + { + "epoch": 0.055616786774056806, + "grad_norm": 0.7209236130647971, + "learning_rate": 3.993386988805617e-05, + "loss": 1.435, + "step": 82 + }, + { + "epoch": 0.056295040271301396, + "grad_norm": 0.7707170493909989, + "learning_rate": 3.9930249089270185e-05, + "loss": 1.4307, + "step": 83 + }, + { + "epoch": 0.05697329376854599, + "grad_norm": 0.8208154253181018, + "learning_rate": 3.992653196352754e-05, + "loss": 1.4446, + "step": 84 + }, + { + "epoch": 0.05765154726579059, + "grad_norm": 0.7446657999243042, + "learning_rate": 3.9922718528793866e-05, + "loss": 1.4177, + "step": 85 + }, + { + "epoch": 0.058329800763035185, + "grad_norm": 0.7523922030736101, + "learning_rate": 3.991880880350026e-05, + "loss": 1.3962, + "step": 86 + }, + { + "epoch": 0.05900805426027978, + "grad_norm": 0.7154458001918853, + "learning_rate": 3.991480280654323e-05, + "loss": 1.4148, + "step": 87 + }, + { + "epoch": 0.05968630775752438, + "grad_norm": 0.7787531051472882, + "learning_rate": 3.991070055728458e-05, + "loss": 1.4224, + "step": 88 + }, + { + "epoch": 0.06036456125476897, + "grad_norm": 0.7284970405300326, + "learning_rate": 3.9906502075551314e-05, + "loss": 1.3971, + "step": 89 + }, + { + "epoch": 0.06104281475201356, + "grad_norm": 0.7290304582501952, + "learning_rate": 3.9902207381635544e-05, + "loss": 1.3923, + "step": 90 + }, + { + "epoch": 0.06172106824925816, + "grad_norm": 0.7250605798466435, + "learning_rate": 3.989781649629441e-05, + "loss": 1.4145, + "step": 91 + }, + { + "epoch": 0.062399321746502756, + "grad_norm": 0.8072790558594046, + "learning_rate": 3.989332944074995e-05, + "loss": 1.4557, + "step": 92 + }, + { + "epoch": 0.06307757524374735, + "grad_norm": 0.7814578631907936, + "learning_rate": 3.9888746236689014e-05, + "loss": 1.4243, + "step": 93 + }, + { + "epoch": 0.06375582874099195, + "grad_norm": 0.7348756701651429, + "learning_rate": 3.988406690626317e-05, + "loss": 1.4493, + "step": 94 + }, + { + "epoch": 0.06443408223823655, + "grad_norm": 0.7763605868250318, + "learning_rate": 3.987929147208857e-05, + "loss": 1.459, + "step": 95 + }, + { + "epoch": 0.06511233573548114, + "grad_norm": 0.7335967317515184, + "learning_rate": 3.987441995724587e-05, + "loss": 1.4323, + "step": 96 + }, + { + "epoch": 0.06579058923272574, + "grad_norm": 0.7970588880633319, + "learning_rate": 3.986945238528009e-05, + "loss": 1.431, + "step": 97 + }, + { + "epoch": 0.06646884272997032, + "grad_norm": 0.7882918020664678, + "learning_rate": 3.9864388780200514e-05, + "loss": 1.444, + "step": 98 + }, + { + "epoch": 0.06714709622721492, + "grad_norm": 0.7709405021384093, + "learning_rate": 3.985922916648058e-05, + "loss": 1.4113, + "step": 99 + }, + { + "epoch": 0.06782534972445951, + "grad_norm": 0.7372128644657954, + "learning_rate": 3.985397356905774e-05, + "loss": 1.4276, + "step": 100 + }, + { + "epoch": 0.06850360322170411, + "grad_norm": 0.8402512733533295, + "learning_rate": 3.984862201333339e-05, + "loss": 1.4214, + "step": 101 + }, + { + "epoch": 0.0691818567189487, + "grad_norm": 0.7775767209846817, + "learning_rate": 3.9843174525172686e-05, + "loss": 1.4241, + "step": 102 + }, + { + "epoch": 0.0698601102161933, + "grad_norm": 0.8803258285981507, + "learning_rate": 3.983763113090444e-05, + "loss": 1.5175, + "step": 103 + }, + { + "epoch": 0.0705383637134379, + "grad_norm": 0.9085210303547249, + "learning_rate": 3.9831991857321e-05, + "loss": 1.4321, + "step": 104 + }, + { + "epoch": 0.0712166172106825, + "grad_norm": 0.8413616396974392, + "learning_rate": 3.982625673167814e-05, + "loss": 1.4639, + "step": 105 + }, + { + "epoch": 0.07189487070792709, + "grad_norm": 0.7801398149333048, + "learning_rate": 3.982042578169489e-05, + "loss": 1.4014, + "step": 106 + }, + { + "epoch": 0.07257312420517169, + "grad_norm": 0.7866846001446234, + "learning_rate": 3.981449903555341e-05, + "loss": 1.4121, + "step": 107 + }, + { + "epoch": 0.07325137770241628, + "grad_norm": 0.7708520359423757, + "learning_rate": 3.980847652189888e-05, + "loss": 1.3839, + "step": 108 + }, + { + "epoch": 0.07392963119966087, + "grad_norm": 0.830761051025023, + "learning_rate": 3.980235826983933e-05, + "loss": 1.3914, + "step": 109 + }, + { + "epoch": 0.07460788469690546, + "grad_norm": 0.8253907698028418, + "learning_rate": 3.979614430894553e-05, + "loss": 1.3954, + "step": 110 + }, + { + "epoch": 0.07528613819415006, + "grad_norm": 0.7112323297589688, + "learning_rate": 3.9789834669250804e-05, + "loss": 1.4181, + "step": 111 + }, + { + "epoch": 0.07596439169139466, + "grad_norm": 0.8057491331242786, + "learning_rate": 3.978342938125094e-05, + "loss": 1.4377, + "step": 112 + }, + { + "epoch": 0.07664264518863925, + "grad_norm": 0.7298327963697059, + "learning_rate": 3.9776928475904e-05, + "loss": 1.4706, + "step": 113 + }, + { + "epoch": 0.07732089868588385, + "grad_norm": 0.9346906876193559, + "learning_rate": 3.9770331984630176e-05, + "loss": 1.4209, + "step": 114 + }, + { + "epoch": 0.07799915218312845, + "grad_norm": 0.800555245189771, + "learning_rate": 3.9763639939311664e-05, + "loss": 1.4042, + "step": 115 + }, + { + "epoch": 0.07867740568037304, + "grad_norm": 0.7820837234183521, + "learning_rate": 3.9756852372292475e-05, + "loss": 1.3983, + "step": 116 + }, + { + "epoch": 0.07935565917761764, + "grad_norm": 0.7759352883815379, + "learning_rate": 3.974996931637831e-05, + "loss": 1.4158, + "step": 117 + }, + { + "epoch": 0.08003391267486223, + "grad_norm": 0.8440621723868824, + "learning_rate": 3.974299080483638e-05, + "loss": 1.4007, + "step": 118 + }, + { + "epoch": 0.08071216617210683, + "grad_norm": 0.6400735452279184, + "learning_rate": 3.973591687139526e-05, + "loss": 1.4793, + "step": 119 + }, + { + "epoch": 0.08139041966935143, + "grad_norm": 0.8313238775827508, + "learning_rate": 3.97287475502447e-05, + "loss": 1.3954, + "step": 120 + }, + { + "epoch": 0.08206867316659601, + "grad_norm": 0.806251290610579, + "learning_rate": 3.97214828760355e-05, + "loss": 1.4117, + "step": 121 + }, + { + "epoch": 0.0827469266638406, + "grad_norm": 0.7712317181214216, + "learning_rate": 3.971412288387931e-05, + "loss": 1.3757, + "step": 122 + }, + { + "epoch": 0.0834251801610852, + "grad_norm": 0.7590372689247342, + "learning_rate": 3.970666760934846e-05, + "loss": 1.3929, + "step": 123 + }, + { + "epoch": 0.0841034336583298, + "grad_norm": 0.8632264568669091, + "learning_rate": 3.969911708847583e-05, + "loss": 1.4347, + "step": 124 + }, + { + "epoch": 0.0847816871555744, + "grad_norm": 0.8379449866062499, + "learning_rate": 3.9691471357754616e-05, + "loss": 1.3956, + "step": 125 + }, + { + "epoch": 0.08545994065281899, + "grad_norm": 0.8080362798715246, + "learning_rate": 3.9683730454138195e-05, + "loss": 1.3934, + "step": 126 + }, + { + "epoch": 0.08613819415006359, + "grad_norm": 0.7552807716389017, + "learning_rate": 3.967589441503994e-05, + "loss": 1.4253, + "step": 127 + }, + { + "epoch": 0.08681644764730818, + "grad_norm": 0.7685649759662768, + "learning_rate": 3.9667963278333006e-05, + "loss": 1.4015, + "step": 128 + }, + { + "epoch": 0.08749470114455278, + "grad_norm": 0.7984044689432437, + "learning_rate": 3.9659937082350214e-05, + "loss": 1.3644, + "step": 129 + }, + { + "epoch": 0.08817295464179738, + "grad_norm": 0.7882181163488372, + "learning_rate": 3.9651815865883794e-05, + "loss": 1.3886, + "step": 130 + }, + { + "epoch": 0.08885120813904197, + "grad_norm": 0.7755502826098504, + "learning_rate": 3.9643599668185246e-05, + "loss": 1.3836, + "step": 131 + }, + { + "epoch": 0.08952946163628656, + "grad_norm": 0.7595962684622689, + "learning_rate": 3.963528852896513e-05, + "loss": 1.3684, + "step": 132 + }, + { + "epoch": 0.09020771513353115, + "grad_norm": 0.771937384457462, + "learning_rate": 3.9626882488392864e-05, + "loss": 1.382, + "step": 133 + }, + { + "epoch": 0.09088596863077575, + "grad_norm": 0.7642977324026196, + "learning_rate": 3.961838158709657e-05, + "loss": 1.3969, + "step": 134 + }, + { + "epoch": 0.09156422212802034, + "grad_norm": 0.7782206736594208, + "learning_rate": 3.960978586616283e-05, + "loss": 1.3762, + "step": 135 + }, + { + "epoch": 0.09224247562526494, + "grad_norm": 0.7520850703006238, + "learning_rate": 3.9601095367136506e-05, + "loss": 1.4047, + "step": 136 + }, + { + "epoch": 0.09292072912250954, + "grad_norm": 0.7517437279702506, + "learning_rate": 3.959231013202057e-05, + "loss": 1.3862, + "step": 137 + }, + { + "epoch": 0.09359898261975413, + "grad_norm": 0.7310919198153829, + "learning_rate": 3.958343020327585e-05, + "loss": 1.4703, + "step": 138 + }, + { + "epoch": 0.09427723611699873, + "grad_norm": 0.7567376870493908, + "learning_rate": 3.957445562382085e-05, + "loss": 1.3923, + "step": 139 + }, + { + "epoch": 0.09495548961424333, + "grad_norm": 0.7996642480637515, + "learning_rate": 3.956538643703153e-05, + "loss": 1.3866, + "step": 140 + }, + { + "epoch": 0.09563374311148792, + "grad_norm": 0.5505747838701305, + "learning_rate": 3.9556222686741136e-05, + "loss": 1.4342, + "step": 141 + }, + { + "epoch": 0.09631199660873252, + "grad_norm": 0.790082512074581, + "learning_rate": 3.9546964417239926e-05, + "loss": 1.3961, + "step": 142 + }, + { + "epoch": 0.0969902501059771, + "grad_norm": 0.7523052325850244, + "learning_rate": 3.9537611673275017e-05, + "loss": 1.3622, + "step": 143 + }, + { + "epoch": 0.0976685036032217, + "grad_norm": 0.5987075742859339, + "learning_rate": 3.9528164500050116e-05, + "loss": 1.4587, + "step": 144 + }, + { + "epoch": 0.0983467571004663, + "grad_norm": 0.8487087393690602, + "learning_rate": 3.951862294322535e-05, + "loss": 1.393, + "step": 145 + }, + { + "epoch": 0.09902501059771089, + "grad_norm": 0.7950888183946537, + "learning_rate": 3.950898704891699e-05, + "loss": 1.3926, + "step": 146 + }, + { + "epoch": 0.09970326409495549, + "grad_norm": 0.708739946597553, + "learning_rate": 3.94992568636973e-05, + "loss": 1.3969, + "step": 147 + }, + { + "epoch": 0.10038151759220008, + "grad_norm": 0.7727410734149786, + "learning_rate": 3.9489432434594224e-05, + "loss": 1.3824, + "step": 148 + }, + { + "epoch": 0.10105977108944468, + "grad_norm": 0.7945940885527554, + "learning_rate": 3.9479513809091254e-05, + "loss": 1.3958, + "step": 149 + }, + { + "epoch": 0.10173802458668928, + "grad_norm": 0.7231623401610149, + "learning_rate": 3.9469501035127115e-05, + "loss": 1.3926, + "step": 150 + }, + { + "epoch": 0.10241627808393387, + "grad_norm": 0.736920583862729, + "learning_rate": 3.945939416109559e-05, + "loss": 1.4025, + "step": 151 + }, + { + "epoch": 0.10309453158117847, + "grad_norm": 0.7601601202084843, + "learning_rate": 3.9449193235845254e-05, + "loss": 1.367, + "step": 152 + }, + { + "epoch": 0.10377278507842307, + "grad_norm": 0.7889869577399472, + "learning_rate": 3.9438898308679264e-05, + "loss": 1.3798, + "step": 153 + }, + { + "epoch": 0.10445103857566766, + "grad_norm": 0.7143512752208158, + "learning_rate": 3.942850942935511e-05, + "loss": 1.3816, + "step": 154 + }, + { + "epoch": 0.10512929207291224, + "grad_norm": 0.7817356495329524, + "learning_rate": 3.941802664808434e-05, + "loss": 1.3656, + "step": 155 + }, + { + "epoch": 0.10580754557015684, + "grad_norm": 0.7956257676424011, + "learning_rate": 3.9407450015532404e-05, + "loss": 1.4003, + "step": 156 + }, + { + "epoch": 0.10648579906740144, + "grad_norm": 0.7032546203690988, + "learning_rate": 3.93967795828183e-05, + "loss": 1.3834, + "step": 157 + }, + { + "epoch": 0.10716405256464603, + "grad_norm": 0.8491721584532255, + "learning_rate": 3.9386015401514406e-05, + "loss": 1.3764, + "step": 158 + }, + { + "epoch": 0.10784230606189063, + "grad_norm": 0.7725869502257687, + "learning_rate": 3.9375157523646215e-05, + "loss": 1.3664, + "step": 159 + }, + { + "epoch": 0.10852055955913523, + "grad_norm": 0.7524439551144759, + "learning_rate": 3.9364206001692055e-05, + "loss": 1.3558, + "step": 160 + }, + { + "epoch": 0.10919881305637982, + "grad_norm": 0.7213983940170066, + "learning_rate": 3.935316088858287e-05, + "loss": 1.3582, + "step": 161 + }, + { + "epoch": 0.10987706655362442, + "grad_norm": 0.8731554191497123, + "learning_rate": 3.9342022237701945e-05, + "loss": 1.4107, + "step": 162 + }, + { + "epoch": 0.11055532005086902, + "grad_norm": 0.7752564142114415, + "learning_rate": 3.9330790102884646e-05, + "loss": 1.3571, + "step": 163 + }, + { + "epoch": 0.11123357354811361, + "grad_norm": 0.8516471203966379, + "learning_rate": 3.931946453841817e-05, + "loss": 1.3965, + "step": 164 + }, + { + "epoch": 0.11191182704535821, + "grad_norm": 0.7520205232084405, + "learning_rate": 3.930804559904128e-05, + "loss": 1.379, + "step": 165 + }, + { + "epoch": 0.11259008054260279, + "grad_norm": 0.8043059152508056, + "learning_rate": 3.929653333994404e-05, + "loss": 1.3993, + "step": 166 + }, + { + "epoch": 0.11326833403984739, + "grad_norm": 0.7874837588805026, + "learning_rate": 3.928492781676754e-05, + "loss": 1.3894, + "step": 167 + }, + { + "epoch": 0.11394658753709198, + "grad_norm": 0.7938048672188758, + "learning_rate": 3.927322908560364e-05, + "loss": 1.3838, + "step": 168 + }, + { + "epoch": 0.11462484103433658, + "grad_norm": 0.8203451910611482, + "learning_rate": 3.9261437202994696e-05, + "loss": 1.3837, + "step": 169 + }, + { + "epoch": 0.11530309453158118, + "grad_norm": 0.8329544923845756, + "learning_rate": 3.924955222593328e-05, + "loss": 1.3669, + "step": 170 + }, + { + "epoch": 0.11598134802882577, + "grad_norm": 0.8475717705924518, + "learning_rate": 3.92375742118619e-05, + "loss": 1.4013, + "step": 171 + }, + { + "epoch": 0.11665960152607037, + "grad_norm": 0.7646694203370581, + "learning_rate": 3.922550321867276e-05, + "loss": 1.3873, + "step": 172 + }, + { + "epoch": 0.11733785502331497, + "grad_norm": 0.7826878270694271, + "learning_rate": 3.921333930470741e-05, + "loss": 1.3869, + "step": 173 + }, + { + "epoch": 0.11801610852055956, + "grad_norm": 0.812404806202415, + "learning_rate": 3.920108252875653e-05, + "loss": 1.3644, + "step": 174 + }, + { + "epoch": 0.11869436201780416, + "grad_norm": 0.7434821957582678, + "learning_rate": 3.918873295005963e-05, + "loss": 1.3521, + "step": 175 + }, + { + "epoch": 0.11937261551504876, + "grad_norm": 0.819837272291158, + "learning_rate": 3.917629062830473e-05, + "loss": 1.3969, + "step": 176 + }, + { + "epoch": 0.12005086901229335, + "grad_norm": 0.883516555460024, + "learning_rate": 3.9163755623628105e-05, + "loss": 1.4598, + "step": 177 + }, + { + "epoch": 0.12072912250953793, + "grad_norm": 1.015649489713416, + "learning_rate": 3.9151127996614e-05, + "loss": 1.4091, + "step": 178 + }, + { + "epoch": 0.12140737600678253, + "grad_norm": 0.9239836623171834, + "learning_rate": 3.913840780829429e-05, + "loss": 1.4172, + "step": 179 + }, + { + "epoch": 0.12208562950402713, + "grad_norm": 0.8280501002137555, + "learning_rate": 3.9125595120148266e-05, + "loss": 1.3671, + "step": 180 + }, + { + "epoch": 0.12276388300127172, + "grad_norm": 0.8483926444765467, + "learning_rate": 3.911268999410224e-05, + "loss": 1.3692, + "step": 181 + }, + { + "epoch": 0.12344213649851632, + "grad_norm": 0.8355083213687582, + "learning_rate": 3.909969249252933e-05, + "loss": 1.3698, + "step": 182 + }, + { + "epoch": 0.12412038999576092, + "grad_norm": 0.8486653810483453, + "learning_rate": 3.9086602678249095e-05, + "loss": 1.3583, + "step": 183 + }, + { + "epoch": 0.12479864349300551, + "grad_norm": 0.938098681006725, + "learning_rate": 3.907342061452729e-05, + "loss": 1.4199, + "step": 184 + }, + { + "epoch": 0.1254768969902501, + "grad_norm": 0.8317954566954049, + "learning_rate": 3.906014636507551e-05, + "loss": 1.4094, + "step": 185 + }, + { + "epoch": 0.1261551504874947, + "grad_norm": 1.095456794855618, + "learning_rate": 3.904677999405091e-05, + "loss": 1.3697, + "step": 186 + }, + { + "epoch": 0.1268334039847393, + "grad_norm": 0.9483196201551992, + "learning_rate": 3.9033321566055885e-05, + "loss": 1.3756, + "step": 187 + }, + { + "epoch": 0.1275116574819839, + "grad_norm": 0.8024238008384962, + "learning_rate": 3.9019771146137764e-05, + "loss": 1.4041, + "step": 188 + }, + { + "epoch": 0.1281899109792285, + "grad_norm": 0.9413726100672268, + "learning_rate": 3.900612879978848e-05, + "loss": 1.3563, + "step": 189 + }, + { + "epoch": 0.1288681644764731, + "grad_norm": 0.9097638693583244, + "learning_rate": 3.8992394592944286e-05, + "loss": 1.4097, + "step": 190 + }, + { + "epoch": 0.1295464179737177, + "grad_norm": 0.8772276516195574, + "learning_rate": 3.8978568591985397e-05, + "loss": 1.3731, + "step": 191 + }, + { + "epoch": 0.13022467147096228, + "grad_norm": 0.9098273829678695, + "learning_rate": 3.89646508637357e-05, + "loss": 1.3672, + "step": 192 + }, + { + "epoch": 0.13090292496820688, + "grad_norm": 0.8183191888811304, + "learning_rate": 3.89506414754624e-05, + "loss": 1.3835, + "step": 193 + }, + { + "epoch": 0.13158117846545148, + "grad_norm": 0.8919803577719078, + "learning_rate": 3.893654049487574e-05, + "loss": 1.3874, + "step": 194 + }, + { + "epoch": 0.13225943196269604, + "grad_norm": 0.8416172745913636, + "learning_rate": 3.892234799012862e-05, + "loss": 1.3894, + "step": 195 + }, + { + "epoch": 0.13293768545994064, + "grad_norm": 0.8447411846074018, + "learning_rate": 3.890806402981632e-05, + "loss": 1.358, + "step": 196 + }, + { + "epoch": 0.13361593895718524, + "grad_norm": 1.13463194606766, + "learning_rate": 3.889368868297613e-05, + "loss": 1.4096, + "step": 197 + }, + { + "epoch": 0.13429419245442983, + "grad_norm": 1.0136148913804681, + "learning_rate": 3.887922201908703e-05, + "loss": 1.3233, + "step": 198 + }, + { + "epoch": 0.13497244595167443, + "grad_norm": 0.9353648682943572, + "learning_rate": 3.886466410806936e-05, + "loss": 1.3681, + "step": 199 + }, + { + "epoch": 0.13565069944891903, + "grad_norm": 0.9194457839730446, + "learning_rate": 3.885001502028446e-05, + "loss": 1.3777, + "step": 200 + }, + { + "epoch": 0.13632895294616362, + "grad_norm": 0.9414436636519163, + "learning_rate": 3.883527482653436e-05, + "loss": 1.3672, + "step": 201 + }, + { + "epoch": 0.13700720644340822, + "grad_norm": 0.9477037701457524, + "learning_rate": 3.882044359806144e-05, + "loss": 1.3587, + "step": 202 + }, + { + "epoch": 0.13768545994065282, + "grad_norm": 0.8296501298762375, + "learning_rate": 3.880552140654803e-05, + "loss": 1.3871, + "step": 203 + }, + { + "epoch": 0.1383637134378974, + "grad_norm": 1.0146984451817604, + "learning_rate": 3.879050832411613e-05, + "loss": 1.3864, + "step": 204 + }, + { + "epoch": 0.139041966935142, + "grad_norm": 1.0688397482821446, + "learning_rate": 3.877540442332703e-05, + "loss": 1.3566, + "step": 205 + }, + { + "epoch": 0.1397202204323866, + "grad_norm": 0.8967082496420603, + "learning_rate": 3.876020977718096e-05, + "loss": 1.3318, + "step": 206 + }, + { + "epoch": 0.1403984739296312, + "grad_norm": 0.8597915304069587, + "learning_rate": 3.8744924459116734e-05, + "loss": 1.359, + "step": 207 + }, + { + "epoch": 0.1410767274268758, + "grad_norm": 0.9861201591743611, + "learning_rate": 3.8729548543011423e-05, + "loss": 1.3637, + "step": 208 + }, + { + "epoch": 0.1417549809241204, + "grad_norm": 0.8462602408390785, + "learning_rate": 3.8714082103179956e-05, + "loss": 1.3951, + "step": 209 + }, + { + "epoch": 0.142433234421365, + "grad_norm": 1.0342160063623491, + "learning_rate": 3.86985252143748e-05, + "loss": 1.3534, + "step": 210 + }, + { + "epoch": 0.1431114879186096, + "grad_norm": 0.9700626844877009, + "learning_rate": 3.868287795178556e-05, + "loss": 1.4074, + "step": 211 + }, + { + "epoch": 0.14378974141585418, + "grad_norm": 0.8355599061006139, + "learning_rate": 3.8667140391038646e-05, + "loss": 1.3945, + "step": 212 + }, + { + "epoch": 0.14446799491309878, + "grad_norm": 0.8588794535196841, + "learning_rate": 3.8651312608196897e-05, + "loss": 1.3439, + "step": 213 + }, + { + "epoch": 0.14514624841034338, + "grad_norm": 0.9737294357813214, + "learning_rate": 3.863539467975922e-05, + "loss": 1.4077, + "step": 214 + }, + { + "epoch": 0.14582450190758797, + "grad_norm": 0.6834058192292372, + "learning_rate": 3.86193866826602e-05, + "loss": 1.4534, + "step": 215 + }, + { + "epoch": 0.14650275540483257, + "grad_norm": 0.945727982315802, + "learning_rate": 3.860328869426975e-05, + "loss": 1.3838, + "step": 216 + }, + { + "epoch": 0.14718100890207717, + "grad_norm": 0.8345543003512037, + "learning_rate": 3.8587100792392744e-05, + "loss": 1.3811, + "step": 217 + }, + { + "epoch": 0.14785926239932173, + "grad_norm": 0.8309922574724689, + "learning_rate": 3.8570823055268605e-05, + "loss": 1.3462, + "step": 218 + }, + { + "epoch": 0.14853751589656633, + "grad_norm": 0.8519439545936296, + "learning_rate": 3.855445556157093e-05, + "loss": 1.3804, + "step": 219 + }, + { + "epoch": 0.14921576939381093, + "grad_norm": 0.8373458633419311, + "learning_rate": 3.853799839040719e-05, + "loss": 1.3872, + "step": 220 + }, + { + "epoch": 0.14989402289105552, + "grad_norm": 0.8204422865846579, + "learning_rate": 3.852145162131824e-05, + "loss": 1.357, + "step": 221 + }, + { + "epoch": 0.15057227638830012, + "grad_norm": 0.7749768398156192, + "learning_rate": 3.850481533427797e-05, + "loss": 1.3422, + "step": 222 + }, + { + "epoch": 0.15125052988554472, + "grad_norm": 0.8751657869837003, + "learning_rate": 3.848808960969296e-05, + "loss": 1.3583, + "step": 223 + }, + { + "epoch": 0.1519287833827893, + "grad_norm": 0.8599619287852476, + "learning_rate": 3.847127452840204e-05, + "loss": 1.345, + "step": 224 + }, + { + "epoch": 0.1526070368800339, + "grad_norm": 0.7790197241344738, + "learning_rate": 3.8454370171675926e-05, + "loss": 1.3525, + "step": 225 + }, + { + "epoch": 0.1532852903772785, + "grad_norm": 0.7798362164571245, + "learning_rate": 3.843737662121682e-05, + "loss": 1.3326, + "step": 226 + }, + { + "epoch": 0.1539635438745231, + "grad_norm": 0.8117309588898491, + "learning_rate": 3.842029395915803e-05, + "loss": 1.3441, + "step": 227 + }, + { + "epoch": 0.1546417973717677, + "grad_norm": 0.7712267857854244, + "learning_rate": 3.8403122268063524e-05, + "loss": 1.3814, + "step": 228 + }, + { + "epoch": 0.1553200508690123, + "grad_norm": 0.8094984928131681, + "learning_rate": 3.83858616309276e-05, + "loss": 1.3731, + "step": 229 + }, + { + "epoch": 0.1559983043662569, + "grad_norm": 0.8020318070438405, + "learning_rate": 3.836851213117443e-05, + "loss": 1.3545, + "step": 230 + }, + { + "epoch": 0.1566765578635015, + "grad_norm": 0.7792455803212042, + "learning_rate": 3.835107385265768e-05, + "loss": 1.3835, + "step": 231 + }, + { + "epoch": 0.15735481136074608, + "grad_norm": 0.7843157924687033, + "learning_rate": 3.833354687966011e-05, + "loss": 1.3248, + "step": 232 + }, + { + "epoch": 0.15803306485799068, + "grad_norm": 0.8493476520147848, + "learning_rate": 3.831593129689315e-05, + "loss": 1.3557, + "step": 233 + }, + { + "epoch": 0.15871131835523528, + "grad_norm": 0.882277870795216, + "learning_rate": 3.82982271894965e-05, + "loss": 1.3543, + "step": 234 + }, + { + "epoch": 0.15938957185247987, + "grad_norm": 0.7572047887431174, + "learning_rate": 3.828043464303773e-05, + "loss": 1.3606, + "step": 235 + }, + { + "epoch": 0.16006782534972447, + "grad_norm": 0.8062972354097508, + "learning_rate": 3.826255374351183e-05, + "loss": 1.3674, + "step": 236 + }, + { + "epoch": 0.16074607884696906, + "grad_norm": 0.8508022675381113, + "learning_rate": 3.824458457734085e-05, + "loss": 1.3741, + "step": 237 + }, + { + "epoch": 0.16142433234421366, + "grad_norm": 0.778304420325275, + "learning_rate": 3.822652723137341e-05, + "loss": 1.3494, + "step": 238 + }, + { + "epoch": 0.16210258584145826, + "grad_norm": 0.8271572580761374, + "learning_rate": 3.8208381792884374e-05, + "loss": 1.3585, + "step": 239 + }, + { + "epoch": 0.16278083933870285, + "grad_norm": 0.7771004938802668, + "learning_rate": 3.8190148349574316e-05, + "loss": 1.3623, + "step": 240 + }, + { + "epoch": 0.16345909283594742, + "grad_norm": 0.8119254585705749, + "learning_rate": 3.81718269895692e-05, + "loss": 1.3507, + "step": 241 + }, + { + "epoch": 0.16413734633319202, + "grad_norm": 0.8054041105152854, + "learning_rate": 3.81534178014199e-05, + "loss": 1.3704, + "step": 242 + }, + { + "epoch": 0.16481559983043662, + "grad_norm": 0.8341651339274286, + "learning_rate": 3.8134920874101756e-05, + "loss": 1.3525, + "step": 243 + }, + { + "epoch": 0.1654938533276812, + "grad_norm": 0.8668088532427671, + "learning_rate": 3.8116336297014195e-05, + "loss": 1.3853, + "step": 244 + }, + { + "epoch": 0.1661721068249258, + "grad_norm": 0.8904485590262735, + "learning_rate": 3.809766415998028e-05, + "loss": 1.3615, + "step": 245 + }, + { + "epoch": 0.1668503603221704, + "grad_norm": 0.7958380694134233, + "learning_rate": 3.8078904553246234e-05, + "loss": 1.3637, + "step": 246 + }, + { + "epoch": 0.167528613819415, + "grad_norm": 0.7809146687054412, + "learning_rate": 3.806005756748108e-05, + "loss": 1.3862, + "step": 247 + }, + { + "epoch": 0.1682068673166596, + "grad_norm": 0.7448426981896193, + "learning_rate": 3.804112329377613e-05, + "loss": 1.3377, + "step": 248 + }, + { + "epoch": 0.1688851208139042, + "grad_norm": 0.7503956304358436, + "learning_rate": 3.8022101823644605e-05, + "loss": 1.3715, + "step": 249 + }, + { + "epoch": 0.1695633743111488, + "grad_norm": 0.7755324225568783, + "learning_rate": 3.800299324902112e-05, + "loss": 1.3544, + "step": 250 + }, + { + "epoch": 0.1702416278083934, + "grad_norm": 0.7640633646163552, + "learning_rate": 3.7983797662261335e-05, + "loss": 1.3456, + "step": 251 + }, + { + "epoch": 0.17091988130563798, + "grad_norm": 0.7504456766281555, + "learning_rate": 3.796451515614142e-05, + "loss": 1.3534, + "step": 252 + }, + { + "epoch": 0.17159813480288258, + "grad_norm": 0.8207771425862805, + "learning_rate": 3.794514582385767e-05, + "loss": 1.3276, + "step": 253 + }, + { + "epoch": 0.17227638830012718, + "grad_norm": 0.7625453955788795, + "learning_rate": 3.792568975902601e-05, + "loss": 1.3439, + "step": 254 + }, + { + "epoch": 0.17295464179737177, + "grad_norm": 0.7351975940767888, + "learning_rate": 3.790614705568156e-05, + "loss": 1.318, + "step": 255 + }, + { + "epoch": 0.17363289529461637, + "grad_norm": 0.809184694000721, + "learning_rate": 3.7886517808278205e-05, + "loss": 1.3334, + "step": 256 + }, + { + "epoch": 0.17431114879186096, + "grad_norm": 0.8061789671241181, + "learning_rate": 3.7866802111688084e-05, + "loss": 1.3489, + "step": 257 + }, + { + "epoch": 0.17498940228910556, + "grad_norm": 0.7936271064037789, + "learning_rate": 3.784700006120119e-05, + "loss": 1.3528, + "step": 258 + }, + { + "epoch": 0.17566765578635016, + "grad_norm": 0.7552641018922746, + "learning_rate": 3.7827111752524866e-05, + "loss": 1.3287, + "step": 259 + }, + { + "epoch": 0.17634590928359475, + "grad_norm": 0.8374689219052238, + "learning_rate": 3.780713728178335e-05, + "loss": 1.3752, + "step": 260 + }, + { + "epoch": 0.17702416278083935, + "grad_norm": 0.729554288427445, + "learning_rate": 3.7787076745517353e-05, + "loss": 1.3975, + "step": 261 + }, + { + "epoch": 0.17770241627808395, + "grad_norm": 0.8419610549000647, + "learning_rate": 3.776693024068351e-05, + "loss": 1.3363, + "step": 262 + }, + { + "epoch": 0.17838066977532852, + "grad_norm": 0.7889473275556681, + "learning_rate": 3.774669786465401e-05, + "loss": 1.3525, + "step": 263 + }, + { + "epoch": 0.1790589232725731, + "grad_norm": 0.7378097589904369, + "learning_rate": 3.772637971521604e-05, + "loss": 1.3348, + "step": 264 + }, + { + "epoch": 0.1797371767698177, + "grad_norm": 0.7913075811035389, + "learning_rate": 3.770597589057136e-05, + "loss": 1.3615, + "step": 265 + }, + { + "epoch": 0.1804154302670623, + "grad_norm": 0.7520950872769981, + "learning_rate": 3.768548648933581e-05, + "loss": 1.3906, + "step": 266 + }, + { + "epoch": 0.1810936837643069, + "grad_norm": 0.8509685860713678, + "learning_rate": 3.7664911610538844e-05, + "loss": 1.338, + "step": 267 + }, + { + "epoch": 0.1817719372615515, + "grad_norm": 0.855152434807828, + "learning_rate": 3.764425135362305e-05, + "loss": 1.3232, + "step": 268 + }, + { + "epoch": 0.1824501907587961, + "grad_norm": 0.8008198108696746, + "learning_rate": 3.762350581844366e-05, + "loss": 1.3109, + "step": 269 + }, + { + "epoch": 0.1831284442560407, + "grad_norm": 0.8705798073505797, + "learning_rate": 3.7602675105268065e-05, + "loss": 1.3843, + "step": 270 + }, + { + "epoch": 0.1838066977532853, + "grad_norm": 0.844301193211463, + "learning_rate": 3.758175931477537e-05, + "loss": 1.3401, + "step": 271 + }, + { + "epoch": 0.18448495125052988, + "grad_norm": 0.8314454058935925, + "learning_rate": 3.756075854805583e-05, + "loss": 1.3741, + "step": 272 + }, + { + "epoch": 0.18516320474777448, + "grad_norm": 0.8322405908547285, + "learning_rate": 3.7539672906610445e-05, + "loss": 1.3488, + "step": 273 + }, + { + "epoch": 0.18584145824501908, + "grad_norm": 0.7929593381557983, + "learning_rate": 3.751850249235041e-05, + "loss": 1.3128, + "step": 274 + }, + { + "epoch": 0.18651971174226367, + "grad_norm": 0.7936714435193957, + "learning_rate": 3.7497247407596665e-05, + "loss": 1.3248, + "step": 275 + }, + { + "epoch": 0.18719796523950827, + "grad_norm": 0.7968038025658714, + "learning_rate": 3.747590775507936e-05, + "loss": 1.3509, + "step": 276 + }, + { + "epoch": 0.18787621873675286, + "grad_norm": 0.8409932538922028, + "learning_rate": 3.745448363793738e-05, + "loss": 1.339, + "step": 277 + }, + { + "epoch": 0.18855447223399746, + "grad_norm": 0.7837545633394652, + "learning_rate": 3.7432975159717865e-05, + "loss": 1.3483, + "step": 278 + }, + { + "epoch": 0.18923272573124206, + "grad_norm": 0.7997013180068606, + "learning_rate": 3.741138242437566e-05, + "loss": 1.3549, + "step": 279 + }, + { + "epoch": 0.18991097922848665, + "grad_norm": 0.8235720011611928, + "learning_rate": 3.738970553627287e-05, + "loss": 1.3973, + "step": 280 + }, + { + "epoch": 0.19058923272573125, + "grad_norm": 0.7698450279657325, + "learning_rate": 3.73679446001783e-05, + "loss": 1.3507, + "step": 281 + }, + { + "epoch": 0.19126748622297585, + "grad_norm": 0.8025960582420557, + "learning_rate": 3.7346099721266995e-05, + "loss": 1.3481, + "step": 282 + }, + { + "epoch": 0.19194573972022044, + "grad_norm": 0.7590548934859671, + "learning_rate": 3.7324171005119716e-05, + "loss": 1.3411, + "step": 283 + }, + { + "epoch": 0.19262399321746504, + "grad_norm": 0.8147227638445041, + "learning_rate": 3.7302158557722415e-05, + "loss": 1.3631, + "step": 284 + }, + { + "epoch": 0.19330224671470964, + "grad_norm": 0.7386164064451395, + "learning_rate": 3.728006248546573e-05, + "loss": 1.3441, + "step": 285 + }, + { + "epoch": 0.1939805002119542, + "grad_norm": 0.7890002607630037, + "learning_rate": 3.7257882895144485e-05, + "loss": 1.3517, + "step": 286 + }, + { + "epoch": 0.1946587537091988, + "grad_norm": 0.7939882773368258, + "learning_rate": 3.7235619893957175e-05, + "loss": 1.3272, + "step": 287 + }, + { + "epoch": 0.1953370072064434, + "grad_norm": 0.7612534430811938, + "learning_rate": 3.7213273589505405e-05, + "loss": 1.3538, + "step": 288 + }, + { + "epoch": 0.196015260703688, + "grad_norm": 0.8166840142238, + "learning_rate": 3.719084408979343e-05, + "loss": 1.3856, + "step": 289 + }, + { + "epoch": 0.1966935142009326, + "grad_norm": 0.9655100351840458, + "learning_rate": 3.7168331503227586e-05, + "loss": 1.4738, + "step": 290 + }, + { + "epoch": 0.1973717676981772, + "grad_norm": 0.748970675337018, + "learning_rate": 3.71457359386158e-05, + "loss": 1.3158, + "step": 291 + }, + { + "epoch": 0.19805002119542178, + "grad_norm": 0.7913831795192827, + "learning_rate": 3.712305750516704e-05, + "loss": 1.2998, + "step": 292 + }, + { + "epoch": 0.19872827469266638, + "grad_norm": 0.8313988747400662, + "learning_rate": 3.71002963124908e-05, + "loss": 1.3098, + "step": 293 + }, + { + "epoch": 0.19940652818991098, + "grad_norm": 0.7698811838943792, + "learning_rate": 3.707745247059656e-05, + "loss": 1.3304, + "step": 294 + }, + { + "epoch": 0.20008478168715557, + "grad_norm": 0.8167400009802671, + "learning_rate": 3.705452608989327e-05, + "loss": 1.3425, + "step": 295 + }, + { + "epoch": 0.20076303518440017, + "grad_norm": 0.7853365125954755, + "learning_rate": 3.7031517281188795e-05, + "loss": 1.3229, + "step": 296 + }, + { + "epoch": 0.20144128868164476, + "grad_norm": 0.7951101430292243, + "learning_rate": 3.70084261556894e-05, + "loss": 1.3465, + "step": 297 + }, + { + "epoch": 0.20211954217888936, + "grad_norm": 0.7980331185038868, + "learning_rate": 3.698525282499921e-05, + "loss": 1.3304, + "step": 298 + }, + { + "epoch": 0.20279779567613396, + "grad_norm": 0.8765891197449154, + "learning_rate": 3.696199740111964e-05, + "loss": 1.3502, + "step": 299 + }, + { + "epoch": 0.20347604917337855, + "grad_norm": 0.7838726576701436, + "learning_rate": 3.6938659996448916e-05, + "loss": 1.343, + "step": 300 + }, + { + "epoch": 0.20415430267062315, + "grad_norm": 1.536887455459171, + "learning_rate": 3.691524072378145e-05, + "loss": 1.3951, + "step": 301 + }, + { + "epoch": 0.20483255616786775, + "grad_norm": 1.1072730222345826, + "learning_rate": 3.689173969630737e-05, + "loss": 1.3276, + "step": 302 + }, + { + "epoch": 0.20551080966511234, + "grad_norm": 1.0341965163359792, + "learning_rate": 3.6868157027611935e-05, + "loss": 1.3409, + "step": 303 + }, + { + "epoch": 0.20618906316235694, + "grad_norm": 0.855269607966128, + "learning_rate": 3.684449283167499e-05, + "loss": 1.3783, + "step": 304 + }, + { + "epoch": 0.20686731665960154, + "grad_norm": 0.9711680096158264, + "learning_rate": 3.6820747222870415e-05, + "loss": 1.3349, + "step": 305 + }, + { + "epoch": 0.20754557015684613, + "grad_norm": 0.9413940555791582, + "learning_rate": 3.679692031596557e-05, + "loss": 1.3565, + "step": 306 + }, + { + "epoch": 0.20822382365409073, + "grad_norm": 0.8505193076721568, + "learning_rate": 3.677301222612077e-05, + "loss": 1.3549, + "step": 307 + }, + { + "epoch": 0.20890207715133532, + "grad_norm": 0.8654824704034854, + "learning_rate": 3.674902306888868e-05, + "loss": 1.3463, + "step": 308 + }, + { + "epoch": 0.2095803306485799, + "grad_norm": 0.9017551080146448, + "learning_rate": 3.672495296021378e-05, + "loss": 1.3675, + "step": 309 + }, + { + "epoch": 0.2102585841458245, + "grad_norm": 0.8610924662593219, + "learning_rate": 3.670080201643183e-05, + "loss": 1.3418, + "step": 310 + }, + { + "epoch": 0.2109368376430691, + "grad_norm": 0.7567901953649514, + "learning_rate": 3.667657035426924e-05, + "loss": 1.3037, + "step": 311 + }, + { + "epoch": 0.21161509114031368, + "grad_norm": 0.9227793753462682, + "learning_rate": 3.6652258090842596e-05, + "loss": 1.3307, + "step": 312 + }, + { + "epoch": 0.21229334463755828, + "grad_norm": 0.8477420323577082, + "learning_rate": 3.6627865343658004e-05, + "loss": 1.3449, + "step": 313 + }, + { + "epoch": 0.21297159813480288, + "grad_norm": 0.7928792403999231, + "learning_rate": 3.6603392230610596e-05, + "loss": 1.3504, + "step": 314 + }, + { + "epoch": 0.21364985163204747, + "grad_norm": 0.8706330117973166, + "learning_rate": 3.657883886998391e-05, + "loss": 1.3384, + "step": 315 + }, + { + "epoch": 0.21432810512929207, + "grad_norm": 0.7912639457114111, + "learning_rate": 3.6554205380449344e-05, + "loss": 1.3058, + "step": 316 + }, + { + "epoch": 0.21500635862653666, + "grad_norm": 0.8074035387342698, + "learning_rate": 3.6529491881065584e-05, + "loss": 1.3212, + "step": 317 + }, + { + "epoch": 0.21568461212378126, + "grad_norm": 0.8193503607565463, + "learning_rate": 3.6504698491277996e-05, + "loss": 1.3591, + "step": 318 + }, + { + "epoch": 0.21636286562102586, + "grad_norm": 0.7995273140530428, + "learning_rate": 3.64798253309181e-05, + "loss": 1.3558, + "step": 319 + }, + { + "epoch": 0.21704111911827045, + "grad_norm": 0.8687660030654587, + "learning_rate": 3.645487252020294e-05, + "loss": 1.3528, + "step": 320 + }, + { + "epoch": 0.21771937261551505, + "grad_norm": 0.8264620665129695, + "learning_rate": 3.642984017973454e-05, + "loss": 1.317, + "step": 321 + }, + { + "epoch": 0.21839762611275965, + "grad_norm": 2.2301642268612523, + "learning_rate": 3.640472843049931e-05, + "loss": 1.4527, + "step": 322 + }, + { + "epoch": 0.21907587961000424, + "grad_norm": 1.0084140949380813, + "learning_rate": 3.637953739386744e-05, + "loss": 1.3177, + "step": 323 + }, + { + "epoch": 0.21975413310724884, + "grad_norm": 1.0372803017251364, + "learning_rate": 3.6354267191592356e-05, + "loss": 1.3312, + "step": 324 + }, + { + "epoch": 0.22043238660449344, + "grad_norm": 0.9457280722406972, + "learning_rate": 3.63289179458101e-05, + "loss": 1.331, + "step": 325 + }, + { + "epoch": 0.22111064010173803, + "grad_norm": 0.8999936637331983, + "learning_rate": 3.630348977903873e-05, + "loss": 1.3561, + "step": 326 + }, + { + "epoch": 0.22178889359898263, + "grad_norm": 0.8861915476360576, + "learning_rate": 3.627798281417778e-05, + "loss": 1.3401, + "step": 327 + }, + { + "epoch": 0.22246714709622722, + "grad_norm": 0.8583266540025729, + "learning_rate": 3.6252397174507595e-05, + "loss": 1.3225, + "step": 328 + }, + { + "epoch": 0.22314540059347182, + "grad_norm": 0.8424597827495318, + "learning_rate": 3.622673298368879e-05, + "loss": 1.3603, + "step": 329 + }, + { + "epoch": 0.22382365409071642, + "grad_norm": 0.8314710447674626, + "learning_rate": 3.620099036576163e-05, + "loss": 1.3519, + "step": 330 + }, + { + "epoch": 0.224501907587961, + "grad_norm": 0.9112220546676985, + "learning_rate": 3.617516944514544e-05, + "loss": 1.3362, + "step": 331 + }, + { + "epoch": 0.22518016108520558, + "grad_norm": 0.7831649871164149, + "learning_rate": 3.614927034663799e-05, + "loss": 1.3413, + "step": 332 + }, + { + "epoch": 0.22585841458245018, + "grad_norm": 0.8642522956740931, + "learning_rate": 3.6123293195414907e-05, + "loss": 1.3426, + "step": 333 + }, + { + "epoch": 0.22653666807969478, + "grad_norm": 0.8860961029717345, + "learning_rate": 3.609723811702905e-05, + "loss": 1.3591, + "step": 334 + }, + { + "epoch": 0.22721492157693937, + "grad_norm": 0.8060705554104319, + "learning_rate": 3.6071105237409926e-05, + "loss": 1.3497, + "step": 335 + }, + { + "epoch": 0.22789317507418397, + "grad_norm": 0.8286691463885103, + "learning_rate": 3.6044894682863076e-05, + "loss": 1.3406, + "step": 336 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 0.8518464543991984, + "learning_rate": 3.601860658006945e-05, + "loss": 1.3464, + "step": 337 + }, + { + "epoch": 0.22924968206867316, + "grad_norm": 0.7881771078373155, + "learning_rate": 3.599224105608481e-05, + "loss": 1.3343, + "step": 338 + }, + { + "epoch": 0.22992793556591776, + "grad_norm": 0.8275811584205034, + "learning_rate": 3.59657982383391e-05, + "loss": 1.3507, + "step": 339 + }, + { + "epoch": 0.23060618906316235, + "grad_norm": 0.8533833802525195, + "learning_rate": 3.5939278254635854e-05, + "loss": 1.3532, + "step": 340 + }, + { + "epoch": 0.23128444256040695, + "grad_norm": 0.8377041485898244, + "learning_rate": 3.591268123315156e-05, + "loss": 1.3204, + "step": 341 + }, + { + "epoch": 0.23196269605765155, + "grad_norm": 0.7389800588833796, + "learning_rate": 3.5886007302435046e-05, + "loss": 1.3377, + "step": 342 + }, + { + "epoch": 0.23264094955489614, + "grad_norm": 0.8426270847369397, + "learning_rate": 3.585925659140685e-05, + "loss": 1.367, + "step": 343 + }, + { + "epoch": 0.23331920305214074, + "grad_norm": 0.7758726182250993, + "learning_rate": 3.583242922935862e-05, + "loss": 1.3328, + "step": 344 + }, + { + "epoch": 0.23399745654938534, + "grad_norm": 0.7722773102342985, + "learning_rate": 3.580552534595246e-05, + "loss": 1.3421, + "step": 345 + }, + { + "epoch": 0.23467571004662993, + "grad_norm": 0.7905307857352721, + "learning_rate": 3.577854507122032e-05, + "loss": 1.3386, + "step": 346 + }, + { + "epoch": 0.23535396354387453, + "grad_norm": 0.7243404498317927, + "learning_rate": 3.575148853556337e-05, + "loss": 1.3251, + "step": 347 + }, + { + "epoch": 0.23603221704111912, + "grad_norm": 0.8417562243598162, + "learning_rate": 3.572435586975138e-05, + "loss": 1.3504, + "step": 348 + }, + { + "epoch": 0.23671047053836372, + "grad_norm": 2.8634053953757204, + "learning_rate": 3.5697147204922026e-05, + "loss": 1.4622, + "step": 349 + }, + { + "epoch": 0.23738872403560832, + "grad_norm": 1.168899136110835, + "learning_rate": 3.5669862672580344e-05, + "loss": 1.3077, + "step": 350 + }, + { + "epoch": 0.2380669775328529, + "grad_norm": 1.2982026940957527, + "learning_rate": 3.564250240459805e-05, + "loss": 1.3518, + "step": 351 + }, + { + "epoch": 0.2387452310300975, + "grad_norm": 1.0615794582481726, + "learning_rate": 3.561506653321288e-05, + "loss": 1.3315, + "step": 352 + }, + { + "epoch": 0.2394234845273421, + "grad_norm": 0.9813507773031921, + "learning_rate": 3.5587555191028015e-05, + "loss": 1.3452, + "step": 353 + }, + { + "epoch": 0.2401017380245867, + "grad_norm": 0.9987807615736051, + "learning_rate": 3.5559968511011356e-05, + "loss": 1.3405, + "step": 354 + }, + { + "epoch": 0.24077999152183127, + "grad_norm": 0.9976124653212103, + "learning_rate": 3.5532306626494965e-05, + "loss": 1.3258, + "step": 355 + }, + { + "epoch": 0.24145824501907587, + "grad_norm": 1.0166956051519895, + "learning_rate": 3.5504569671174366e-05, + "loss": 1.3536, + "step": 356 + }, + { + "epoch": 0.24213649851632046, + "grad_norm": 0.8408913593507406, + "learning_rate": 3.547675777910791e-05, + "loss": 1.3342, + "step": 357 + }, + { + "epoch": 0.24281475201356506, + "grad_norm": 0.929344141469499, + "learning_rate": 3.544887108471616e-05, + "loss": 1.2994, + "step": 358 + }, + { + "epoch": 0.24349300551080966, + "grad_norm": 0.8734694907852633, + "learning_rate": 3.542090972278118e-05, + "loss": 1.3284, + "step": 359 + }, + { + "epoch": 0.24417125900805425, + "grad_norm": 0.9818367995858515, + "learning_rate": 3.539287382844594e-05, + "loss": 1.368, + "step": 360 + }, + { + "epoch": 0.24484951250529885, + "grad_norm": 0.8507298377945505, + "learning_rate": 3.5364763537213614e-05, + "loss": 1.3086, + "step": 361 + }, + { + "epoch": 0.24552776600254345, + "grad_norm": 0.8320634906336775, + "learning_rate": 3.533657898494699e-05, + "loss": 1.2986, + "step": 362 + }, + { + "epoch": 0.24620601949978804, + "grad_norm": 0.8998882822885591, + "learning_rate": 3.5308320307867755e-05, + "loss": 1.342, + "step": 363 + }, + { + "epoch": 0.24688427299703264, + "grad_norm": 0.88201944435703, + "learning_rate": 3.5279987642555845e-05, + "loss": 1.2974, + "step": 364 + }, + { + "epoch": 0.24756252649427724, + "grad_norm": 0.8058457005541839, + "learning_rate": 3.5251581125948806e-05, + "loss": 1.3408, + "step": 365 + }, + { + "epoch": 0.24824077999152183, + "grad_norm": 0.7771041142421848, + "learning_rate": 3.522310089534114e-05, + "loss": 1.3301, + "step": 366 + }, + { + "epoch": 0.24891903348876643, + "grad_norm": 1.5325889692952437, + "learning_rate": 3.5194547088383584e-05, + "loss": 1.3882, + "step": 367 + }, + { + "epoch": 0.24959728698601102, + "grad_norm": 0.9580037839333668, + "learning_rate": 3.516591984308253e-05, + "loss": 1.2997, + "step": 368 + }, + { + "epoch": 0.2502755404832556, + "grad_norm": 0.8748551180692621, + "learning_rate": 3.513721929779928e-05, + "loss": 1.3375, + "step": 369 + }, + { + "epoch": 0.2509537939805002, + "grad_norm": 0.7995787378451331, + "learning_rate": 3.510844559124942e-05, + "loss": 1.3261, + "step": 370 + }, + { + "epoch": 0.2516320474777448, + "grad_norm": 0.860615621983403, + "learning_rate": 3.507959886250213e-05, + "loss": 1.3141, + "step": 371 + }, + { + "epoch": 0.2523103009749894, + "grad_norm": 0.8475296631685789, + "learning_rate": 3.505067925097955e-05, + "loss": 1.3364, + "step": 372 + }, + { + "epoch": 0.252988554472234, + "grad_norm": 0.83958333049959, + "learning_rate": 3.5021686896456045e-05, + "loss": 1.3569, + "step": 373 + }, + { + "epoch": 0.2536668079694786, + "grad_norm": 0.820796767618206, + "learning_rate": 3.499262193905757e-05, + "loss": 1.2947, + "step": 374 + }, + { + "epoch": 0.2543450614667232, + "grad_norm": 0.8913128103227287, + "learning_rate": 3.4963484519261013e-05, + "loss": 1.3486, + "step": 375 + }, + { + "epoch": 0.2550233149639678, + "grad_norm": 0.8231441290517508, + "learning_rate": 3.493427477789343e-05, + "loss": 1.3342, + "step": 376 + }, + { + "epoch": 0.2557015684612124, + "grad_norm": 0.7871230510129541, + "learning_rate": 3.490499285613148e-05, + "loss": 1.3012, + "step": 377 + }, + { + "epoch": 0.256379821958457, + "grad_norm": 0.8024588319974968, + "learning_rate": 3.487563889550066e-05, + "loss": 1.3323, + "step": 378 + }, + { + "epoch": 0.2570580754557016, + "grad_norm": 0.8487882299189274, + "learning_rate": 3.4846213037874625e-05, + "loss": 1.3555, + "step": 379 + }, + { + "epoch": 0.2577363289529462, + "grad_norm": 0.8115184825040263, + "learning_rate": 3.4816715425474566e-05, + "loss": 1.3002, + "step": 380 + }, + { + "epoch": 0.2584145824501908, + "grad_norm": 0.7904471470376874, + "learning_rate": 3.478714620086844e-05, + "loss": 1.3433, + "step": 381 + }, + { + "epoch": 0.2590928359474354, + "grad_norm": 0.8085473654607371, + "learning_rate": 3.475750550697035e-05, + "loss": 1.2955, + "step": 382 + }, + { + "epoch": 0.25977108944467997, + "grad_norm": 0.7731171074438241, + "learning_rate": 3.47277934870398e-05, + "loss": 1.3215, + "step": 383 + }, + { + "epoch": 0.26044934294192457, + "grad_norm": 0.7876037215909811, + "learning_rate": 3.469801028468105e-05, + "loss": 1.3349, + "step": 384 + }, + { + "epoch": 0.26112759643916916, + "grad_norm": 0.7638575055784281, + "learning_rate": 3.4668156043842386e-05, + "loss": 1.3088, + "step": 385 + }, + { + "epoch": 0.26180584993641376, + "grad_norm": 0.7662086462010528, + "learning_rate": 3.4638230908815434e-05, + "loss": 1.3234, + "step": 386 + }, + { + "epoch": 0.26248410343365836, + "grad_norm": 0.7646960186961371, + "learning_rate": 3.460823502423448e-05, + "loss": 1.309, + "step": 387 + }, + { + "epoch": 0.26316235693090295, + "grad_norm": 0.7612754885045987, + "learning_rate": 3.457816853507575e-05, + "loss": 1.3099, + "step": 388 + }, + { + "epoch": 0.2638406104281475, + "grad_norm": 0.8200614112402728, + "learning_rate": 3.454803158665669e-05, + "loss": 1.3278, + "step": 389 + }, + { + "epoch": 0.2645188639253921, + "grad_norm": 0.7794386935749952, + "learning_rate": 3.4517824324635354e-05, + "loss": 1.3352, + "step": 390 + }, + { + "epoch": 0.2651971174226367, + "grad_norm": 0.7855863081380292, + "learning_rate": 3.448754689500957e-05, + "loss": 1.3191, + "step": 391 + }, + { + "epoch": 0.2658753709198813, + "grad_norm": 0.7680839532766452, + "learning_rate": 3.445719944411633e-05, + "loss": 1.2971, + "step": 392 + }, + { + "epoch": 0.2665536244171259, + "grad_norm": 0.8047274090370727, + "learning_rate": 3.442678211863107e-05, + "loss": 1.325, + "step": 393 + }, + { + "epoch": 0.2672318779143705, + "grad_norm": 0.7870281495612244, + "learning_rate": 3.4396295065566904e-05, + "loss": 1.3109, + "step": 394 + }, + { + "epoch": 0.26791013141161507, + "grad_norm": 1.3837649994463146, + "learning_rate": 3.4365738432273974e-05, + "loss": 1.42, + "step": 395 + }, + { + "epoch": 0.26858838490885967, + "grad_norm": 0.9524291373055933, + "learning_rate": 3.433511236643873e-05, + "loss": 1.3104, + "step": 396 + }, + { + "epoch": 0.26926663840610426, + "grad_norm": 0.8643652629290977, + "learning_rate": 3.430441701608319e-05, + "loss": 1.312, + "step": 397 + }, + { + "epoch": 0.26994489190334886, + "grad_norm": 0.8159960685032611, + "learning_rate": 3.427365252956423e-05, + "loss": 1.2978, + "step": 398 + }, + { + "epoch": 0.27062314540059346, + "grad_norm": 0.8439809279263578, + "learning_rate": 3.42428190555729e-05, + "loss": 1.3347, + "step": 399 + }, + { + "epoch": 0.27130139889783805, + "grad_norm": 0.8193404775640883, + "learning_rate": 3.421191674313365e-05, + "loss": 1.2928, + "step": 400 + }, + { + "epoch": 0.27197965239508265, + "grad_norm": 0.7966926170821803, + "learning_rate": 3.418094574160366e-05, + "loss": 1.3213, + "step": 401 + }, + { + "epoch": 0.27265790589232725, + "grad_norm": 0.8172754714007294, + "learning_rate": 3.4149906200672086e-05, + "loss": 1.3247, + "step": 402 + }, + { + "epoch": 0.27333615938957184, + "grad_norm": 0.7747521741583795, + "learning_rate": 3.4118798270359375e-05, + "loss": 1.3473, + "step": 403 + }, + { + "epoch": 0.27401441288681644, + "grad_norm": 0.7711076847067602, + "learning_rate": 3.4087622101016494e-05, + "loss": 1.3203, + "step": 404 + }, + { + "epoch": 0.27469266638406103, + "grad_norm": 0.821559468991808, + "learning_rate": 3.405637784332421e-05, + "loss": 1.3021, + "step": 405 + }, + { + "epoch": 0.27537091988130563, + "grad_norm": 0.7989113298586838, + "learning_rate": 3.402506564829239e-05, + "loss": 1.3076, + "step": 406 + }, + { + "epoch": 0.2760491733785502, + "grad_norm": 0.7851515411605702, + "learning_rate": 3.3993685667259276e-05, + "loss": 1.3526, + "step": 407 + }, + { + "epoch": 0.2767274268757948, + "grad_norm": 0.7849015476169263, + "learning_rate": 3.3962238051890684e-05, + "loss": 1.3253, + "step": 408 + }, + { + "epoch": 0.2774056803730394, + "grad_norm": 0.7673100708701389, + "learning_rate": 3.393072295417937e-05, + "loss": 1.2977, + "step": 409 + }, + { + "epoch": 0.278083933870284, + "grad_norm": 0.7576984686596503, + "learning_rate": 3.3899140526444236e-05, + "loss": 1.2995, + "step": 410 + }, + { + "epoch": 0.2787621873675286, + "grad_norm": 0.7688682595138551, + "learning_rate": 3.386749092132956e-05, + "loss": 1.3604, + "step": 411 + }, + { + "epoch": 0.2794404408647732, + "grad_norm": 0.7657672122532467, + "learning_rate": 3.383577429180436e-05, + "loss": 1.3254, + "step": 412 + }, + { + "epoch": 0.2801186943620178, + "grad_norm": 0.7550560758993546, + "learning_rate": 3.380399079116157e-05, + "loss": 1.3276, + "step": 413 + }, + { + "epoch": 0.2807969478592624, + "grad_norm": 0.7778218998319307, + "learning_rate": 3.377214057301732e-05, + "loss": 1.306, + "step": 414 + }, + { + "epoch": 0.281475201356507, + "grad_norm": 0.797574211706488, + "learning_rate": 3.374022379131021e-05, + "loss": 1.3245, + "step": 415 + }, + { + "epoch": 0.2821534548537516, + "grad_norm": 0.7301612230929835, + "learning_rate": 3.370824060030055e-05, + "loss": 1.3027, + "step": 416 + }, + { + "epoch": 0.2828317083509962, + "grad_norm": 0.7314764016540719, + "learning_rate": 3.367619115456963e-05, + "loss": 1.3009, + "step": 417 + }, + { + "epoch": 0.2835099618482408, + "grad_norm": 0.7689083615586159, + "learning_rate": 3.364407560901894e-05, + "loss": 1.3151, + "step": 418 + }, + { + "epoch": 0.2841882153454854, + "grad_norm": 0.7716865683673857, + "learning_rate": 3.361189411886947e-05, + "loss": 1.3477, + "step": 419 + }, + { + "epoch": 0.28486646884273, + "grad_norm": 0.729080211495635, + "learning_rate": 3.357964683966093e-05, + "loss": 1.333, + "step": 420 + }, + { + "epoch": 0.2855447223399746, + "grad_norm": 0.7899182195999055, + "learning_rate": 3.354733392725098e-05, + "loss": 1.3211, + "step": 421 + }, + { + "epoch": 0.2862229758372192, + "grad_norm": 0.7711518181659062, + "learning_rate": 3.3514955537814514e-05, + "loss": 1.2805, + "step": 422 + }, + { + "epoch": 0.28690122933446377, + "grad_norm": 0.7867286449711313, + "learning_rate": 3.348251182784289e-05, + "loss": 1.2831, + "step": 423 + }, + { + "epoch": 0.28757948283170837, + "grad_norm": 1.0065878825834589, + "learning_rate": 3.345000295414317e-05, + "loss": 1.4279, + "step": 424 + }, + { + "epoch": 0.28825773632895296, + "grad_norm": 0.8330374919809896, + "learning_rate": 3.3417429073837375e-05, + "loss": 1.3308, + "step": 425 + }, + { + "epoch": 0.28893598982619756, + "grad_norm": 0.8614665742850024, + "learning_rate": 3.3384790344361704e-05, + "loss": 1.3173, + "step": 426 + }, + { + "epoch": 0.28961424332344216, + "grad_norm": 0.7841205349981816, + "learning_rate": 3.335208692346579e-05, + "loss": 1.3253, + "step": 427 + }, + { + "epoch": 0.29029249682068675, + "grad_norm": 0.7900221442726821, + "learning_rate": 3.3319318969211935e-05, + "loss": 1.3228, + "step": 428 + }, + { + "epoch": 0.29097075031793135, + "grad_norm": 0.7430038452394311, + "learning_rate": 3.3286486639974336e-05, + "loss": 1.2977, + "step": 429 + }, + { + "epoch": 0.29164900381517594, + "grad_norm": 0.788096881114239, + "learning_rate": 3.325359009443834e-05, + "loss": 1.3163, + "step": 430 + }, + { + "epoch": 0.29232725731242054, + "grad_norm": 0.763056977179756, + "learning_rate": 3.322062949159965e-05, + "loss": 1.3317, + "step": 431 + }, + { + "epoch": 0.29300551080966514, + "grad_norm": 0.7528583965841259, + "learning_rate": 3.318760499076359e-05, + "loss": 1.3428, + "step": 432 + }, + { + "epoch": 0.29368376430690973, + "grad_norm": 0.8233547389855966, + "learning_rate": 3.315451675154429e-05, + "loss": 1.3128, + "step": 433 + }, + { + "epoch": 0.29436201780415433, + "grad_norm": 0.8006537555521465, + "learning_rate": 3.312136493386397e-05, + "loss": 1.3215, + "step": 434 + }, + { + "epoch": 0.29504027130139887, + "grad_norm": 0.7790959381873427, + "learning_rate": 3.308814969795211e-05, + "loss": 1.3182, + "step": 435 + }, + { + "epoch": 0.29571852479864347, + "grad_norm": 0.8156005104406299, + "learning_rate": 3.305487120434473e-05, + "loss": 1.3138, + "step": 436 + }, + { + "epoch": 0.29639677829588806, + "grad_norm": 0.7995387221213647, + "learning_rate": 3.302152961388356e-05, + "loss": 1.3033, + "step": 437 + }, + { + "epoch": 0.29707503179313266, + "grad_norm": 0.7773890592212905, + "learning_rate": 3.298812508771531e-05, + "loss": 1.3119, + "step": 438 + }, + { + "epoch": 0.29775328529037726, + "grad_norm": 0.7765905341629566, + "learning_rate": 3.295465778729086e-05, + "loss": 1.3189, + "step": 439 + }, + { + "epoch": 0.29843153878762185, + "grad_norm": 0.7395137371426747, + "learning_rate": 3.29211278743645e-05, + "loss": 1.297, + "step": 440 + }, + { + "epoch": 0.29910979228486645, + "grad_norm": 0.7686752854804622, + "learning_rate": 3.288753551099314e-05, + "loss": 1.3332, + "step": 441 + }, + { + "epoch": 0.29978804578211105, + "grad_norm": 0.7899946804799971, + "learning_rate": 3.2853880859535505e-05, + "loss": 1.3049, + "step": 442 + }, + { + "epoch": 0.30046629927935564, + "grad_norm": 0.7603115262725664, + "learning_rate": 3.28201640826514e-05, + "loss": 1.3001, + "step": 443 + }, + { + "epoch": 0.30114455277660024, + "grad_norm": 0.785989206001199, + "learning_rate": 3.278638534330087e-05, + "loss": 1.3182, + "step": 444 + }, + { + "epoch": 0.30182280627384483, + "grad_norm": 0.7428802548334874, + "learning_rate": 3.2752544804743454e-05, + "loss": 1.2838, + "step": 445 + }, + { + "epoch": 0.30250105977108943, + "grad_norm": 0.7667601472507088, + "learning_rate": 3.2718642630537374e-05, + "loss": 1.2867, + "step": 446 + }, + { + "epoch": 0.303179313268334, + "grad_norm": 0.8357980934067926, + "learning_rate": 3.268467898453875e-05, + "loss": 1.3245, + "step": 447 + }, + { + "epoch": 0.3038575667655786, + "grad_norm": 0.7511945493729915, + "learning_rate": 3.2650654030900795e-05, + "loss": 1.3121, + "step": 448 + }, + { + "epoch": 0.3045358202628232, + "grad_norm": 0.805566590221847, + "learning_rate": 3.2616567934073055e-05, + "loss": 1.2974, + "step": 449 + }, + { + "epoch": 0.3052140737600678, + "grad_norm": 0.7858413638773709, + "learning_rate": 3.2582420858800596e-05, + "loss": 1.3424, + "step": 450 + }, + { + "epoch": 0.3058923272573124, + "grad_norm": 0.7950960516687662, + "learning_rate": 3.254821297012318e-05, + "loss": 1.3151, + "step": 451 + }, + { + "epoch": 0.306570580754557, + "grad_norm": 0.7816783938657665, + "learning_rate": 3.25139444333745e-05, + "loss": 1.2928, + "step": 452 + }, + { + "epoch": 0.3072488342518016, + "grad_norm": 0.8106894175155682, + "learning_rate": 3.24796154141814e-05, + "loss": 1.2952, + "step": 453 + }, + { + "epoch": 0.3079270877490462, + "grad_norm": 0.8169374050183266, + "learning_rate": 3.2445226078463e-05, + "loss": 1.2978, + "step": 454 + }, + { + "epoch": 0.3086053412462908, + "grad_norm": 0.8107605434555087, + "learning_rate": 3.241077659243e-05, + "loss": 1.3047, + "step": 455 + }, + { + "epoch": 0.3092835947435354, + "grad_norm": 0.7854751274722761, + "learning_rate": 3.2376267122583774e-05, + "loss": 1.3084, + "step": 456 + }, + { + "epoch": 0.30996184824078, + "grad_norm": 0.8358304822334056, + "learning_rate": 3.234169783571562e-05, + "loss": 1.3186, + "step": 457 + }, + { + "epoch": 0.3106401017380246, + "grad_norm": 0.8100113368228923, + "learning_rate": 3.230706889890595e-05, + "loss": 1.3132, + "step": 458 + }, + { + "epoch": 0.3113183552352692, + "grad_norm": 0.7714940319152012, + "learning_rate": 3.227238047952348e-05, + "loss": 1.2946, + "step": 459 + }, + { + "epoch": 0.3119966087325138, + "grad_norm": 0.8032984924634528, + "learning_rate": 3.223763274522442e-05, + "loss": 1.3338, + "step": 460 + }, + { + "epoch": 0.3126748622297584, + "grad_norm": 0.8491174491833109, + "learning_rate": 3.220282586395163e-05, + "loss": 1.3083, + "step": 461 + }, + { + "epoch": 0.313353115727003, + "grad_norm": 0.7589680883885287, + "learning_rate": 3.2167960003933883e-05, + "loss": 1.2991, + "step": 462 + }, + { + "epoch": 0.31403136922424757, + "grad_norm": 0.7724678326422264, + "learning_rate": 3.2133035333684985e-05, + "loss": 1.2792, + "step": 463 + }, + { + "epoch": 0.31470962272149217, + "grad_norm": 0.8192866442851098, + "learning_rate": 3.209805202200298e-05, + "loss": 1.3157, + "step": 464 + }, + { + "epoch": 0.31538787621873676, + "grad_norm": 0.7625468583050007, + "learning_rate": 3.206301023796934e-05, + "loss": 1.2978, + "step": 465 + }, + { + "epoch": 0.31606612971598136, + "grad_norm": 0.7773487076566746, + "learning_rate": 3.202791015094817e-05, + "loss": 1.3055, + "step": 466 + }, + { + "epoch": 0.31674438321322596, + "grad_norm": 0.7720199775299047, + "learning_rate": 3.199275193058533e-05, + "loss": 1.3373, + "step": 467 + }, + { + "epoch": 0.31742263671047055, + "grad_norm": 0.7702978706001685, + "learning_rate": 3.195753574680767e-05, + "loss": 1.2876, + "step": 468 + }, + { + "epoch": 0.31810089020771515, + "grad_norm": 0.7529851813319876, + "learning_rate": 3.1922261769822185e-05, + "loss": 1.2996, + "step": 469 + }, + { + "epoch": 0.31877914370495974, + "grad_norm": 1.2344261301658803, + "learning_rate": 3.1886930170115193e-05, + "loss": 1.4106, + "step": 470 + }, + { + "epoch": 0.31945739720220434, + "grad_norm": 0.8316996725515984, + "learning_rate": 3.1851541118451524e-05, + "loss": 1.3197, + "step": 471 + }, + { + "epoch": 0.32013565069944894, + "grad_norm": 0.8224106215404863, + "learning_rate": 3.181609478587367e-05, + "loss": 1.3022, + "step": 472 + }, + { + "epoch": 0.32081390419669353, + "grad_norm": 0.8093904826429651, + "learning_rate": 3.1780591343701e-05, + "loss": 1.2954, + "step": 473 + }, + { + "epoch": 0.32149215769393813, + "grad_norm": 0.7965078928648566, + "learning_rate": 3.1745030963528875e-05, + "loss": 1.3085, + "step": 474 + }, + { + "epoch": 0.3221704111911827, + "grad_norm": 0.7828063964744848, + "learning_rate": 3.170941381722785e-05, + "loss": 1.2965, + "step": 475 + }, + { + "epoch": 0.3228486646884273, + "grad_norm": 0.8876462180955584, + "learning_rate": 3.167374007694288e-05, + "loss": 1.2933, + "step": 476 + }, + { + "epoch": 0.3235269181856719, + "grad_norm": 0.7800447657814774, + "learning_rate": 3.163800991509239e-05, + "loss": 1.3203, + "step": 477 + }, + { + "epoch": 0.3242051716829165, + "grad_norm": 0.8282634521942043, + "learning_rate": 3.1602223504367574e-05, + "loss": 1.3233, + "step": 478 + }, + { + "epoch": 0.3248834251801611, + "grad_norm": 0.8535697522844348, + "learning_rate": 3.156638101773143e-05, + "loss": 1.323, + "step": 479 + }, + { + "epoch": 0.3255616786774057, + "grad_norm": 0.7954436934176183, + "learning_rate": 3.1530482628418e-05, + "loss": 1.2734, + "step": 480 + }, + { + "epoch": 0.32623993217465025, + "grad_norm": 0.9165026567224229, + "learning_rate": 3.1494528509931525e-05, + "loss": 1.3222, + "step": 481 + }, + { + "epoch": 0.32691818567189485, + "grad_norm": 0.7503270606156048, + "learning_rate": 3.145851883604558e-05, + "loss": 1.319, + "step": 482 + }, + { + "epoch": 0.32759643916913944, + "grad_norm": 0.9473825774688579, + "learning_rate": 3.1422453780802266e-05, + "loss": 1.3094, + "step": 483 + }, + { + "epoch": 0.32827469266638404, + "grad_norm": 0.7753116526540578, + "learning_rate": 3.1386333518511346e-05, + "loss": 1.3089, + "step": 484 + }, + { + "epoch": 0.32895294616362863, + "grad_norm": 1.2406352929881868, + "learning_rate": 3.135015822374942e-05, + "loss": 1.4282, + "step": 485 + }, + { + "epoch": 0.32963119966087323, + "grad_norm": 0.8914851616392252, + "learning_rate": 3.131392807135904e-05, + "loss": 1.3157, + "step": 486 + }, + { + "epoch": 0.3303094531581178, + "grad_norm": 0.8038376927352104, + "learning_rate": 3.127764323644794e-05, + "loss": 1.3288, + "step": 487 + }, + { + "epoch": 0.3309877066553624, + "grad_norm": 0.8892964512690509, + "learning_rate": 3.124130389438811e-05, + "loss": 1.3258, + "step": 488 + }, + { + "epoch": 0.331665960152607, + "grad_norm": 0.8196194137202071, + "learning_rate": 3.120491022081501e-05, + "loss": 1.3147, + "step": 489 + }, + { + "epoch": 0.3323442136498516, + "grad_norm": 0.7444040418507955, + "learning_rate": 3.1168462391626667e-05, + "loss": 1.2869, + "step": 490 + }, + { + "epoch": 0.3330224671470962, + "grad_norm": 0.8593102441640206, + "learning_rate": 3.1131960582982884e-05, + "loss": 1.2752, + "step": 491 + }, + { + "epoch": 0.3337007206443408, + "grad_norm": 0.8307583231844464, + "learning_rate": 3.1095404971304334e-05, + "loss": 1.2886, + "step": 492 + }, + { + "epoch": 0.3343789741415854, + "grad_norm": 0.7950709207927126, + "learning_rate": 3.105879573327174e-05, + "loss": 1.2985, + "step": 493 + }, + { + "epoch": 0.33505722763883, + "grad_norm": 0.8339570103968118, + "learning_rate": 3.1022133045825024e-05, + "loss": 1.2889, + "step": 494 + }, + { + "epoch": 0.3357354811360746, + "grad_norm": 0.7492370042831757, + "learning_rate": 3.098541708616242e-05, + "loss": 1.2747, + "step": 495 + }, + { + "epoch": 0.3364137346333192, + "grad_norm": 0.8170116352141963, + "learning_rate": 3.094864803173964e-05, + "loss": 1.3251, + "step": 496 + }, + { + "epoch": 0.3370919881305638, + "grad_norm": 0.8032401198395067, + "learning_rate": 3.091182606026903e-05, + "loss": 1.3076, + "step": 497 + }, + { + "epoch": 0.3377702416278084, + "grad_norm": 0.7546853470769238, + "learning_rate": 3.087495134971867e-05, + "loss": 1.3316, + "step": 498 + }, + { + "epoch": 0.338448495125053, + "grad_norm": 0.8134389048041515, + "learning_rate": 3.083802407831158e-05, + "loss": 1.3006, + "step": 499 + }, + { + "epoch": 0.3391267486222976, + "grad_norm": 1.0717469671088464, + "learning_rate": 3.080104442452476e-05, + "loss": 1.4107, + "step": 500 + }, + { + "epoch": 0.3398050021195422, + "grad_norm": 0.8548690740301038, + "learning_rate": 3.0764012567088435e-05, + "loss": 1.3009, + "step": 501 + }, + { + "epoch": 0.3404832556167868, + "grad_norm": 0.8953045227864236, + "learning_rate": 3.0726928684985104e-05, + "loss": 1.311, + "step": 502 + }, + { + "epoch": 0.34116150911403137, + "grad_norm": 0.8364067994792729, + "learning_rate": 3.068979295744876e-05, + "loss": 1.3196, + "step": 503 + }, + { + "epoch": 0.34183976261127597, + "grad_norm": 0.7752202509932031, + "learning_rate": 3.06526055639639e-05, + "loss": 1.3337, + "step": 504 + }, + { + "epoch": 0.34251801610852056, + "grad_norm": 0.8529787549558605, + "learning_rate": 3.061536668426481e-05, + "loss": 1.2983, + "step": 505 + }, + { + "epoch": 0.34319626960576516, + "grad_norm": 0.8587010705698397, + "learning_rate": 3.0578076498334574e-05, + "loss": 1.2886, + "step": 506 + }, + { + "epoch": 0.34387452310300975, + "grad_norm": 0.7328588429654739, + "learning_rate": 3.054073518640428e-05, + "loss": 1.3107, + "step": 507 + }, + { + "epoch": 0.34455277660025435, + "grad_norm": 0.8162742121854552, + "learning_rate": 3.0503342928952073e-05, + "loss": 1.2911, + "step": 508 + }, + { + "epoch": 0.34523103009749895, + "grad_norm": 0.8794715093274978, + "learning_rate": 3.0465899906702366e-05, + "loss": 1.3158, + "step": 509 + }, + { + "epoch": 0.34590928359474354, + "grad_norm": 0.7708822837782366, + "learning_rate": 3.042840630062493e-05, + "loss": 1.3143, + "step": 510 + }, + { + "epoch": 0.34658753709198814, + "grad_norm": 0.7911466862190698, + "learning_rate": 3.0390862291933995e-05, + "loss": 1.294, + "step": 511 + }, + { + "epoch": 0.34726579058923274, + "grad_norm": 0.8538894880559746, + "learning_rate": 3.0353268062087412e-05, + "loss": 1.3222, + "step": 512 + }, + { + "epoch": 0.34794404408647733, + "grad_norm": 0.7852631755793705, + "learning_rate": 3.031562379278575e-05, + "loss": 1.2989, + "step": 513 + }, + { + "epoch": 0.34862229758372193, + "grad_norm": 0.8734426165594713, + "learning_rate": 3.027792966597145e-05, + "loss": 1.3079, + "step": 514 + }, + { + "epoch": 0.3493005510809665, + "grad_norm": 0.744360116551954, + "learning_rate": 3.0240185863827904e-05, + "loss": 1.2956, + "step": 515 + }, + { + "epoch": 0.3499788045782111, + "grad_norm": 0.76062354917638, + "learning_rate": 3.0202392568778598e-05, + "loss": 1.2919, + "step": 516 + }, + { + "epoch": 0.3506570580754557, + "grad_norm": 0.7970415706248092, + "learning_rate": 3.0164549963486238e-05, + "loss": 1.3153, + "step": 517 + }, + { + "epoch": 0.3513353115727003, + "grad_norm": 0.763709088739663, + "learning_rate": 3.012665823085185e-05, + "loss": 1.3278, + "step": 518 + }, + { + "epoch": 0.3520135650699449, + "grad_norm": 0.7546502587704492, + "learning_rate": 3.008871755401389e-05, + "loss": 1.2888, + "step": 519 + }, + { + "epoch": 0.3526918185671895, + "grad_norm": 0.8045823402075674, + "learning_rate": 3.0050728116347402e-05, + "loss": 1.3287, + "step": 520 + }, + { + "epoch": 0.3533700720644341, + "grad_norm": 0.7889243224775581, + "learning_rate": 3.0012690101463066e-05, + "loss": 1.2737, + "step": 521 + }, + { + "epoch": 0.3540483255616787, + "grad_norm": 0.7503150202965315, + "learning_rate": 2.9974603693206368e-05, + "loss": 1.3118, + "step": 522 + }, + { + "epoch": 0.3547265790589233, + "grad_norm": 0.7577330239057181, + "learning_rate": 2.9936469075656687e-05, + "loss": 1.3058, + "step": 523 + }, + { + "epoch": 0.3554048325561679, + "grad_norm": 0.7986442780873649, + "learning_rate": 2.9898286433126394e-05, + "loss": 1.3214, + "step": 524 + }, + { + "epoch": 0.3560830860534125, + "grad_norm": 0.7996687192682629, + "learning_rate": 2.9860055950159997e-05, + "loss": 1.3095, + "step": 525 + }, + { + "epoch": 0.35676133955065703, + "grad_norm": 0.7385651563498843, + "learning_rate": 2.9821777811533207e-05, + "loss": 1.3111, + "step": 526 + }, + { + "epoch": 0.3574395930479016, + "grad_norm": 0.7795490034948527, + "learning_rate": 2.9783452202252066e-05, + "loss": 1.3312, + "step": 527 + }, + { + "epoch": 0.3581178465451462, + "grad_norm": 0.8121483784755045, + "learning_rate": 2.974507930755206e-05, + "loss": 1.2942, + "step": 528 + }, + { + "epoch": 0.3587961000423908, + "grad_norm": 0.782162395784764, + "learning_rate": 2.970665931289722e-05, + "loss": 1.3046, + "step": 529 + }, + { + "epoch": 0.3594743535396354, + "grad_norm": 0.7766997025659732, + "learning_rate": 2.9668192403979198e-05, + "loss": 1.334, + "step": 530 + }, + { + "epoch": 0.36015260703688, + "grad_norm": 0.7738747592143685, + "learning_rate": 2.9629678766716418e-05, + "loss": 1.3246, + "step": 531 + }, + { + "epoch": 0.3608308605341246, + "grad_norm": 0.7648781245429924, + "learning_rate": 2.959111858725313e-05, + "loss": 1.305, + "step": 532 + }, + { + "epoch": 0.3615091140313692, + "grad_norm": 0.7418982711728055, + "learning_rate": 2.9552512051958548e-05, + "loss": 1.2996, + "step": 533 + }, + { + "epoch": 0.3621873675286138, + "grad_norm": 0.7420503991032353, + "learning_rate": 2.9513859347425925e-05, + "loss": 1.2734, + "step": 534 + }, + { + "epoch": 0.3628656210258584, + "grad_norm": 0.7367250178072308, + "learning_rate": 2.9475160660471663e-05, + "loss": 1.3182, + "step": 535 + }, + { + "epoch": 0.363543874523103, + "grad_norm": 0.7864827093672973, + "learning_rate": 2.9436416178134405e-05, + "loss": 1.3212, + "step": 536 + }, + { + "epoch": 0.3642221280203476, + "grad_norm": 0.7395342801664027, + "learning_rate": 2.9397626087674133e-05, + "loss": 1.3069, + "step": 537 + }, + { + "epoch": 0.3649003815175922, + "grad_norm": 0.7599790206682413, + "learning_rate": 2.9358790576571258e-05, + "loss": 1.2985, + "step": 538 + }, + { + "epoch": 0.3655786350148368, + "grad_norm": 0.7743513258616831, + "learning_rate": 2.9319909832525724e-05, + "loss": 1.2913, + "step": 539 + }, + { + "epoch": 0.3662568885120814, + "grad_norm": 0.7645342223650176, + "learning_rate": 2.928098404345609e-05, + "loss": 1.3443, + "step": 540 + }, + { + "epoch": 0.366935142009326, + "grad_norm": 0.8268396657883249, + "learning_rate": 2.9242013397498638e-05, + "loss": 1.2847, + "step": 541 + }, + { + "epoch": 0.3676133955065706, + "grad_norm": 0.7682172066640904, + "learning_rate": 2.9202998083006436e-05, + "loss": 1.2781, + "step": 542 + }, + { + "epoch": 0.36829164900381517, + "grad_norm": 0.77396077420331, + "learning_rate": 2.916393828854845e-05, + "loss": 1.3145, + "step": 543 + }, + { + "epoch": 0.36896990250105977, + "grad_norm": 0.7959331968002789, + "learning_rate": 2.9124834202908636e-05, + "loss": 1.2987, + "step": 544 + }, + { + "epoch": 0.36964815599830436, + "grad_norm": 0.7583473002627757, + "learning_rate": 2.908568601508501e-05, + "loss": 1.2757, + "step": 545 + }, + { + "epoch": 0.37032640949554896, + "grad_norm": 0.8118008723150614, + "learning_rate": 2.9046493914288744e-05, + "loss": 1.2702, + "step": 546 + }, + { + "epoch": 0.37100466299279355, + "grad_norm": 0.8101625355380598, + "learning_rate": 2.900725808994325e-05, + "loss": 1.2959, + "step": 547 + }, + { + "epoch": 0.37168291649003815, + "grad_norm": 0.7191114911111972, + "learning_rate": 2.8967978731683266e-05, + "loss": 1.259, + "step": 548 + }, + { + "epoch": 0.37236116998728275, + "grad_norm": 0.7891369500757057, + "learning_rate": 2.8928656029353933e-05, + "loss": 1.3053, + "step": 549 + }, + { + "epoch": 0.37303942348452734, + "grad_norm": 0.7523339999578558, + "learning_rate": 2.88892901730099e-05, + "loss": 1.2761, + "step": 550 + }, + { + "epoch": 0.37371767698177194, + "grad_norm": 0.7462057986119706, + "learning_rate": 2.8849881352914354e-05, + "loss": 1.3017, + "step": 551 + }, + { + "epoch": 0.37439593047901654, + "grad_norm": 0.7323775120616418, + "learning_rate": 2.8810429759538175e-05, + "loss": 1.285, + "step": 552 + }, + { + "epoch": 0.37507418397626113, + "grad_norm": 0.7863772550083453, + "learning_rate": 2.877093558355895e-05, + "loss": 1.3328, + "step": 553 + }, + { + "epoch": 0.37575243747350573, + "grad_norm": 0.7591796309879588, + "learning_rate": 2.873139901586008e-05, + "loss": 1.2943, + "step": 554 + }, + { + "epoch": 0.3764306909707503, + "grad_norm": 0.7427278153708342, + "learning_rate": 2.869182024752986e-05, + "loss": 1.2892, + "step": 555 + }, + { + "epoch": 0.3771089444679949, + "grad_norm": 0.7561087499786749, + "learning_rate": 2.8652199469860544e-05, + "loss": 1.2908, + "step": 556 + }, + { + "epoch": 0.3777871979652395, + "grad_norm": 0.7096955290839252, + "learning_rate": 2.8612536874347428e-05, + "loss": 1.271, + "step": 557 + }, + { + "epoch": 0.3784654514624841, + "grad_norm": 0.7528073877703341, + "learning_rate": 2.857283265268792e-05, + "loss": 1.28, + "step": 558 + }, + { + "epoch": 0.3791437049597287, + "grad_norm": 0.7314180873816569, + "learning_rate": 2.853308699678061e-05, + "loss": 1.2789, + "step": 559 + }, + { + "epoch": 0.3798219584569733, + "grad_norm": 0.7891519735832144, + "learning_rate": 2.8493300098724374e-05, + "loss": 1.3019, + "step": 560 + }, + { + "epoch": 0.3805002119542179, + "grad_norm": 0.792681198228241, + "learning_rate": 2.8453472150817382e-05, + "loss": 1.3276, + "step": 561 + }, + { + "epoch": 0.3811784654514625, + "grad_norm": 0.8203111963480278, + "learning_rate": 2.841360334555624e-05, + "loss": 1.3002, + "step": 562 + }, + { + "epoch": 0.3818567189487071, + "grad_norm": 0.7812193618738654, + "learning_rate": 2.8373693875634997e-05, + "loss": 1.2996, + "step": 563 + }, + { + "epoch": 0.3825349724459517, + "grad_norm": 0.796401799095726, + "learning_rate": 2.8333743933944268e-05, + "loss": 1.3071, + "step": 564 + }, + { + "epoch": 0.3832132259431963, + "grad_norm": 0.8106133032954245, + "learning_rate": 2.829375371357025e-05, + "loss": 1.3186, + "step": 565 + }, + { + "epoch": 0.3838914794404409, + "grad_norm": 0.7548783654866896, + "learning_rate": 2.8253723407793855e-05, + "loss": 1.2779, + "step": 566 + }, + { + "epoch": 0.3845697329376855, + "grad_norm": 0.7794102240472927, + "learning_rate": 2.8213653210089692e-05, + "loss": 1.3019, + "step": 567 + }, + { + "epoch": 0.3852479864349301, + "grad_norm": 0.7765921389066097, + "learning_rate": 2.81735433141252e-05, + "loss": 1.2782, + "step": 568 + }, + { + "epoch": 0.3859262399321747, + "grad_norm": 0.7981815510661463, + "learning_rate": 2.8133393913759684e-05, + "loss": 1.2772, + "step": 569 + }, + { + "epoch": 0.38660449342941927, + "grad_norm": 0.7543209706721942, + "learning_rate": 2.8093205203043377e-05, + "loss": 1.3033, + "step": 570 + }, + { + "epoch": 0.38728274692666387, + "grad_norm": 0.7711577947039863, + "learning_rate": 2.805297737621651e-05, + "loss": 1.2896, + "step": 571 + }, + { + "epoch": 0.3879610004239084, + "grad_norm": 0.7093645514898603, + "learning_rate": 2.801271062770838e-05, + "loss": 1.2946, + "step": 572 + }, + { + "epoch": 0.388639253921153, + "grad_norm": 0.7602736069559685, + "learning_rate": 2.7972405152136377e-05, + "loss": 1.2972, + "step": 573 + }, + { + "epoch": 0.3893175074183976, + "grad_norm": 0.7372141195181956, + "learning_rate": 2.793206114430509e-05, + "loss": 1.2788, + "step": 574 + }, + { + "epoch": 0.3899957609156422, + "grad_norm": 0.7620898187142349, + "learning_rate": 2.789167879920533e-05, + "loss": 1.4549, + "step": 575 + }, + { + "epoch": 0.3906740144128868, + "grad_norm": 0.8146306015267761, + "learning_rate": 2.7851258312013203e-05, + "loss": 1.2849, + "step": 576 + }, + { + "epoch": 0.3913522679101314, + "grad_norm": 0.7574593357968628, + "learning_rate": 2.781079987808917e-05, + "loss": 1.2847, + "step": 577 + }, + { + "epoch": 0.392030521407376, + "grad_norm": 0.7852134352094805, + "learning_rate": 2.7770303692977077e-05, + "loss": 1.3101, + "step": 578 + }, + { + "epoch": 0.3927087749046206, + "grad_norm": 0.7301643782984193, + "learning_rate": 2.7729769952403255e-05, + "loss": 1.3081, + "step": 579 + }, + { + "epoch": 0.3933870284018652, + "grad_norm": 0.5855026097478325, + "learning_rate": 2.7689198852275512e-05, + "loss": 1.4311, + "step": 580 + }, + { + "epoch": 0.3940652818991098, + "grad_norm": 0.8453462322081536, + "learning_rate": 2.764859058868228e-05, + "loss": 1.2846, + "step": 581 + }, + { + "epoch": 0.3947435353963544, + "grad_norm": 0.7932115499525736, + "learning_rate": 2.7607945357891553e-05, + "loss": 1.2994, + "step": 582 + }, + { + "epoch": 0.39542178889359897, + "grad_norm": 0.7878322433522394, + "learning_rate": 2.756726335635002e-05, + "loss": 1.305, + "step": 583 + }, + { + "epoch": 0.39610004239084357, + "grad_norm": 0.7140626220319872, + "learning_rate": 2.7526544780682083e-05, + "loss": 1.3043, + "step": 584 + }, + { + "epoch": 0.39677829588808816, + "grad_norm": 0.8063047757487747, + "learning_rate": 2.7485789827688934e-05, + "loss": 1.2635, + "step": 585 + }, + { + "epoch": 0.39745654938533276, + "grad_norm": 0.8190532547225947, + "learning_rate": 2.7444998694347547e-05, + "loss": 1.2914, + "step": 586 + }, + { + "epoch": 0.39813480288257735, + "grad_norm": 0.7347525125016025, + "learning_rate": 2.7404171577809808e-05, + "loss": 1.2758, + "step": 587 + }, + { + "epoch": 0.39881305637982195, + "grad_norm": 0.8512983415572944, + "learning_rate": 2.7363308675401478e-05, + "loss": 1.2933, + "step": 588 + }, + { + "epoch": 0.39949130987706655, + "grad_norm": 0.7655041356648618, + "learning_rate": 2.7322410184621295e-05, + "loss": 1.2713, + "step": 589 + }, + { + "epoch": 0.40016956337431114, + "grad_norm": 0.7355254071516576, + "learning_rate": 2.7281476303140014e-05, + "loss": 1.2871, + "step": 590 + }, + { + "epoch": 0.40084781687155574, + "grad_norm": 0.7414572019400784, + "learning_rate": 2.7240507228799415e-05, + "loss": 1.2798, + "step": 591 + }, + { + "epoch": 0.40152607036880034, + "grad_norm": 0.7934461535907612, + "learning_rate": 2.7199503159611396e-05, + "loss": 1.2904, + "step": 592 + }, + { + "epoch": 0.40220432386604493, + "grad_norm": 0.7512655190197016, + "learning_rate": 2.7158464293756975e-05, + "loss": 1.289, + "step": 593 + }, + { + "epoch": 0.40288257736328953, + "grad_norm": 0.7490830945846155, + "learning_rate": 2.711739082958536e-05, + "loss": 1.3159, + "step": 594 + }, + { + "epoch": 0.4035608308605341, + "grad_norm": 0.8836221765235461, + "learning_rate": 2.7076282965612967e-05, + "loss": 1.2967, + "step": 595 + }, + { + "epoch": 0.4042390843577787, + "grad_norm": 0.7625471094203554, + "learning_rate": 2.7035140900522506e-05, + "loss": 1.2746, + "step": 596 + }, + { + "epoch": 0.4049173378550233, + "grad_norm": 0.7754942595291234, + "learning_rate": 2.6993964833161937e-05, + "loss": 1.259, + "step": 597 + }, + { + "epoch": 0.4055955913522679, + "grad_norm": 0.8302288117169206, + "learning_rate": 2.6952754962543604e-05, + "loss": 1.299, + "step": 598 + }, + { + "epoch": 0.4062738448495125, + "grad_norm": 0.7458232064348682, + "learning_rate": 2.6911511487843217e-05, + "loss": 1.2856, + "step": 599 + }, + { + "epoch": 0.4069520983467571, + "grad_norm": 0.7533008733753166, + "learning_rate": 2.6870234608398872e-05, + "loss": 1.2563, + "step": 600 + }, + { + "epoch": 0.4076303518440017, + "grad_norm": 0.713259841427573, + "learning_rate": 2.682892452371017e-05, + "loss": 1.2735, + "step": 601 + }, + { + "epoch": 0.4083086053412463, + "grad_norm": 0.7407768024928815, + "learning_rate": 2.6787581433437156e-05, + "loss": 1.3237, + "step": 602 + }, + { + "epoch": 0.4089868588384909, + "grad_norm": 0.7811252651845391, + "learning_rate": 2.674620553739941e-05, + "loss": 1.2713, + "step": 603 + }, + { + "epoch": 0.4096651123357355, + "grad_norm": 0.7829564243679579, + "learning_rate": 2.6704797035575083e-05, + "loss": 1.2687, + "step": 604 + }, + { + "epoch": 0.4103433658329801, + "grad_norm": 0.7540347265094556, + "learning_rate": 2.6663356128099902e-05, + "loss": 1.2961, + "step": 605 + }, + { + "epoch": 0.4110216193302247, + "grad_norm": 0.7119414126159732, + "learning_rate": 2.6621883015266217e-05, + "loss": 1.2603, + "step": 606 + }, + { + "epoch": 0.4116998728274693, + "grad_norm": 0.7666224044912271, + "learning_rate": 2.658037789752204e-05, + "loss": 1.2717, + "step": 607 + }, + { + "epoch": 0.4123781263247139, + "grad_norm": 0.7950232772588871, + "learning_rate": 2.6538840975470064e-05, + "loss": 1.3044, + "step": 608 + }, + { + "epoch": 0.4130563798219585, + "grad_norm": 0.7396309531748422, + "learning_rate": 2.6497272449866708e-05, + "loss": 1.2887, + "step": 609 + }, + { + "epoch": 0.41373463331920307, + "grad_norm": 0.7834865102164703, + "learning_rate": 2.6455672521621112e-05, + "loss": 1.3073, + "step": 610 + }, + { + "epoch": 0.41441288681644767, + "grad_norm": 0.7303842980777212, + "learning_rate": 2.6414041391794226e-05, + "loss": 1.2773, + "step": 611 + }, + { + "epoch": 0.41509114031369226, + "grad_norm": 0.7418178497569553, + "learning_rate": 2.637237926159779e-05, + "loss": 1.2734, + "step": 612 + }, + { + "epoch": 0.41576939381093686, + "grad_norm": 0.7576949634136853, + "learning_rate": 2.633068633239335e-05, + "loss": 1.2813, + "step": 613 + }, + { + "epoch": 0.41644764730818146, + "grad_norm": 0.7651480859750399, + "learning_rate": 2.6288962805691357e-05, + "loss": 1.281, + "step": 614 + }, + { + "epoch": 0.41712590080542605, + "grad_norm": 0.7986560095498149, + "learning_rate": 2.6247208883150107e-05, + "loss": 1.2633, + "step": 615 + }, + { + "epoch": 0.41780415430267065, + "grad_norm": 0.7696603647611979, + "learning_rate": 2.6205424766574826e-05, + "loss": 1.2987, + "step": 616 + }, + { + "epoch": 0.41848240779991525, + "grad_norm": 0.7517581233977016, + "learning_rate": 2.6163610657916655e-05, + "loss": 1.2808, + "step": 617 + }, + { + "epoch": 0.4191606612971598, + "grad_norm": 0.7753358993235157, + "learning_rate": 2.6121766759271716e-05, + "loss": 1.2774, + "step": 618 + }, + { + "epoch": 0.4198389147944044, + "grad_norm": 0.7566013758054129, + "learning_rate": 2.6079893272880102e-05, + "loss": 1.28, + "step": 619 + }, + { + "epoch": 0.420517168291649, + "grad_norm": 0.7965400573019791, + "learning_rate": 2.6037990401124903e-05, + "loss": 1.3203, + "step": 620 + }, + { + "epoch": 0.4211954217888936, + "grad_norm": 0.8457299678893127, + "learning_rate": 2.5996058346531247e-05, + "loss": 1.2675, + "step": 621 + }, + { + "epoch": 0.4218736752861382, + "grad_norm": 0.7236788297057595, + "learning_rate": 2.5954097311765294e-05, + "loss": 1.2608, + "step": 622 + }, + { + "epoch": 0.42255192878338277, + "grad_norm": 0.802431150827969, + "learning_rate": 2.5912107499633278e-05, + "loss": 1.2753, + "step": 623 + }, + { + "epoch": 0.42323018228062737, + "grad_norm": 0.824996773908104, + "learning_rate": 2.5870089113080533e-05, + "loss": 1.3015, + "step": 624 + }, + { + "epoch": 0.42390843577787196, + "grad_norm": 0.7944101948619141, + "learning_rate": 2.5828042355190475e-05, + "loss": 1.3058, + "step": 625 + }, + { + "epoch": 0.42458668927511656, + "grad_norm": 0.7380566963430693, + "learning_rate": 2.5785967429183652e-05, + "loss": 1.2724, + "step": 626 + }, + { + "epoch": 0.42526494277236115, + "grad_norm": 0.7848691720302051, + "learning_rate": 2.574386453841678e-05, + "loss": 1.2688, + "step": 627 + }, + { + "epoch": 0.42594319626960575, + "grad_norm": 0.7668918737753835, + "learning_rate": 2.570173388638169e-05, + "loss": 1.2937, + "step": 628 + }, + { + "epoch": 0.42662144976685035, + "grad_norm": 0.7983440220905413, + "learning_rate": 2.5659575676704426e-05, + "loss": 1.2901, + "step": 629 + }, + { + "epoch": 0.42729970326409494, + "grad_norm": 0.761042939856984, + "learning_rate": 2.56173901131442e-05, + "loss": 1.2911, + "step": 630 + }, + { + "epoch": 0.42797795676133954, + "grad_norm": 0.7645593903471752, + "learning_rate": 2.5575177399592447e-05, + "loss": 1.2787, + "step": 631 + }, + { + "epoch": 0.42865621025858414, + "grad_norm": 0.8295918351947635, + "learning_rate": 2.5532937740071814e-05, + "loss": 1.2907, + "step": 632 + }, + { + "epoch": 0.42933446375582873, + "grad_norm": 0.7930301114347823, + "learning_rate": 2.5490671338735178e-05, + "loss": 1.2738, + "step": 633 + }, + { + "epoch": 0.43001271725307333, + "grad_norm": 0.7896063345267272, + "learning_rate": 2.5448378399864683e-05, + "loss": 1.2968, + "step": 634 + }, + { + "epoch": 0.4306909707503179, + "grad_norm": 0.7951954101418609, + "learning_rate": 2.540605912787073e-05, + "loss": 1.2791, + "step": 635 + }, + { + "epoch": 0.4313692242475625, + "grad_norm": 0.8325761502518559, + "learning_rate": 2.5363713727290974e-05, + "loss": 1.3083, + "step": 636 + }, + { + "epoch": 0.4320474777448071, + "grad_norm": 0.8414502480322035, + "learning_rate": 2.5321342402789377e-05, + "loss": 1.2975, + "step": 637 + }, + { + "epoch": 0.4327257312420517, + "grad_norm": 0.8131875545774248, + "learning_rate": 2.5278945359155183e-05, + "loss": 1.29, + "step": 638 + }, + { + "epoch": 0.4334039847392963, + "grad_norm": 0.7968566988162644, + "learning_rate": 2.5236522801301945e-05, + "loss": 1.2808, + "step": 639 + }, + { + "epoch": 0.4340822382365409, + "grad_norm": 0.797923868742449, + "learning_rate": 2.5194074934266538e-05, + "loss": 1.276, + "step": 640 + }, + { + "epoch": 0.4347604917337855, + "grad_norm": 0.8148462451491864, + "learning_rate": 2.5151601963208152e-05, + "loss": 1.2816, + "step": 641 + }, + { + "epoch": 0.4354387452310301, + "grad_norm": 0.7835535568430211, + "learning_rate": 2.510910409340732e-05, + "loss": 1.3027, + "step": 642 + }, + { + "epoch": 0.4361169987282747, + "grad_norm": 0.7496355105193699, + "learning_rate": 2.50665815302649e-05, + "loss": 1.2834, + "step": 643 + }, + { + "epoch": 0.4367952522255193, + "grad_norm": 0.7875835891135449, + "learning_rate": 2.502403447930112e-05, + "loss": 1.2517, + "step": 644 + }, + { + "epoch": 0.4374735057227639, + "grad_norm": 0.7389635498409806, + "learning_rate": 2.4981463146154546e-05, + "loss": 1.2756, + "step": 645 + }, + { + "epoch": 0.4381517592200085, + "grad_norm": 0.7604996907814987, + "learning_rate": 2.493886773658111e-05, + "loss": 1.2869, + "step": 646 + }, + { + "epoch": 0.4388300127172531, + "grad_norm": 0.7293596088195266, + "learning_rate": 2.48962484564531e-05, + "loss": 1.2868, + "step": 647 + }, + { + "epoch": 0.4395082662144977, + "grad_norm": 0.780363247786965, + "learning_rate": 2.4853605511758193e-05, + "loss": 1.3103, + "step": 648 + }, + { + "epoch": 0.4401865197117423, + "grad_norm": 0.8259123145424743, + "learning_rate": 2.4810939108598443e-05, + "loss": 1.301, + "step": 649 + }, + { + "epoch": 0.44086477320898687, + "grad_norm": 0.8247133906271952, + "learning_rate": 2.4768249453189256e-05, + "loss": 1.3135, + "step": 650 + }, + { + "epoch": 0.44154302670623147, + "grad_norm": 0.7141969140986962, + "learning_rate": 2.4725536751858453e-05, + "loss": 1.2542, + "step": 651 + }, + { + "epoch": 0.44222128020347606, + "grad_norm": 0.7521164292440358, + "learning_rate": 2.4682801211045214e-05, + "loss": 1.2935, + "step": 652 + }, + { + "epoch": 0.44289953370072066, + "grad_norm": 0.7843461839457703, + "learning_rate": 2.4640043037299135e-05, + "loss": 1.2799, + "step": 653 + }, + { + "epoch": 0.44357778719796526, + "grad_norm": 0.8020780634370414, + "learning_rate": 2.4597262437279172e-05, + "loss": 1.3121, + "step": 654 + }, + { + "epoch": 0.44425604069520985, + "grad_norm": 0.7418747882607136, + "learning_rate": 2.455445961775269e-05, + "loss": 1.2531, + "step": 655 + }, + { + "epoch": 0.44493429419245445, + "grad_norm": 0.8239106028350953, + "learning_rate": 2.451163478559444e-05, + "loss": 1.3214, + "step": 656 + }, + { + "epoch": 0.44561254768969905, + "grad_norm": 0.7157980505078755, + "learning_rate": 2.4468788147785574e-05, + "loss": 1.3993, + "step": 657 + }, + { + "epoch": 0.44629080118694364, + "grad_norm": 0.8222169942619503, + "learning_rate": 2.442591991141262e-05, + "loss": 1.2691, + "step": 658 + }, + { + "epoch": 0.44696905468418824, + "grad_norm": 0.7867959562129859, + "learning_rate": 2.4383030283666505e-05, + "loss": 1.2866, + "step": 659 + }, + { + "epoch": 0.44764730818143283, + "grad_norm": 0.7342001393007882, + "learning_rate": 2.434011947184154e-05, + "loss": 1.294, + "step": 660 + }, + { + "epoch": 0.44832556167867743, + "grad_norm": 0.5006326334519876, + "learning_rate": 2.429718768333443e-05, + "loss": 1.4205, + "step": 661 + }, + { + "epoch": 0.449003815175922, + "grad_norm": 0.8123981244739599, + "learning_rate": 2.4254235125643258e-05, + "loss": 1.2908, + "step": 662 + }, + { + "epoch": 0.44968206867316657, + "grad_norm": 0.8233710990841918, + "learning_rate": 2.421126200636649e-05, + "loss": 1.3067, + "step": 663 + }, + { + "epoch": 0.45036032217041116, + "grad_norm": 0.7474277882376499, + "learning_rate": 2.4168268533201978e-05, + "loss": 1.2803, + "step": 664 + }, + { + "epoch": 0.45103857566765576, + "grad_norm": 0.8240228952238716, + "learning_rate": 2.4125254913945933e-05, + "loss": 1.3009, + "step": 665 + }, + { + "epoch": 0.45171682916490036, + "grad_norm": 0.7825754554639681, + "learning_rate": 2.408222135649195e-05, + "loss": 1.2953, + "step": 666 + }, + { + "epoch": 0.45239508266214495, + "grad_norm": 0.7335894316828039, + "learning_rate": 2.4039168068829986e-05, + "loss": 1.2611, + "step": 667 + }, + { + "epoch": 0.45307333615938955, + "grad_norm": 0.7626283165326443, + "learning_rate": 2.399609525904536e-05, + "loss": 1.2842, + "step": 668 + }, + { + "epoch": 0.45375158965663415, + "grad_norm": 0.7958975092419973, + "learning_rate": 2.395300313531773e-05, + "loss": 1.303, + "step": 669 + }, + { + "epoch": 0.45442984315387874, + "grad_norm": 0.8028562108690003, + "learning_rate": 2.390989190592012e-05, + "loss": 1.2993, + "step": 670 + }, + { + "epoch": 0.45510809665112334, + "grad_norm": 0.7705990126826399, + "learning_rate": 2.3866761779217894e-05, + "loss": 1.2882, + "step": 671 + }, + { + "epoch": 0.45578635014836794, + "grad_norm": 0.7406487372590941, + "learning_rate": 2.3823612963667754e-05, + "loss": 1.2679, + "step": 672 + }, + { + "epoch": 0.45646460364561253, + "grad_norm": 0.8387719012384222, + "learning_rate": 2.37804456678167e-05, + "loss": 1.2859, + "step": 673 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.7747517984064684, + "learning_rate": 2.373726010030109e-05, + "loss": 1.2804, + "step": 674 + }, + { + "epoch": 0.4578211106401017, + "grad_norm": 0.7304383087457895, + "learning_rate": 2.3694056469845564e-05, + "loss": 1.2883, + "step": 675 + }, + { + "epoch": 0.4584993641373463, + "grad_norm": 0.8324980955959124, + "learning_rate": 2.3650834985262087e-05, + "loss": 1.2525, + "step": 676 + }, + { + "epoch": 0.4591776176345909, + "grad_norm": 0.7458679753068804, + "learning_rate": 2.3607595855448894e-05, + "loss": 1.2732, + "step": 677 + }, + { + "epoch": 0.4598558711318355, + "grad_norm": 0.734114365691043, + "learning_rate": 2.356433928938952e-05, + "loss": 1.277, + "step": 678 + }, + { + "epoch": 0.4605341246290801, + "grad_norm": 0.805471283388345, + "learning_rate": 2.3521065496151766e-05, + "loss": 1.3127, + "step": 679 + }, + { + "epoch": 0.4612123781263247, + "grad_norm": 0.7409705986781921, + "learning_rate": 2.347777468488669e-05, + "loss": 1.3094, + "step": 680 + }, + { + "epoch": 0.4618906316235693, + "grad_norm": 0.7838969300568545, + "learning_rate": 2.343446706482762e-05, + "loss": 1.2851, + "step": 681 + }, + { + "epoch": 0.4625688851208139, + "grad_norm": 0.7433118851032263, + "learning_rate": 2.3391142845289098e-05, + "loss": 1.2757, + "step": 682 + }, + { + "epoch": 0.4632471386180585, + "grad_norm": 0.7725357938863449, + "learning_rate": 2.3347802235665924e-05, + "loss": 1.3011, + "step": 683 + }, + { + "epoch": 0.4639253921153031, + "grad_norm": 0.7630007346948985, + "learning_rate": 2.330444544543208e-05, + "loss": 1.2868, + "step": 684 + }, + { + "epoch": 0.4646036456125477, + "grad_norm": 0.7714636610331252, + "learning_rate": 2.3261072684139787e-05, + "loss": 1.2682, + "step": 685 + }, + { + "epoch": 0.4652818991097923, + "grad_norm": 0.8215968802215311, + "learning_rate": 2.3217684161418438e-05, + "loss": 1.2654, + "step": 686 + }, + { + "epoch": 0.4659601526070369, + "grad_norm": 0.7314483643688189, + "learning_rate": 2.317428008697361e-05, + "loss": 1.242, + "step": 687 + }, + { + "epoch": 0.4666384061042815, + "grad_norm": 0.7817360509036845, + "learning_rate": 2.3130860670586036e-05, + "loss": 1.2564, + "step": 688 + }, + { + "epoch": 0.4673166596015261, + "grad_norm": 0.7364807816212218, + "learning_rate": 2.308742612211061e-05, + "loss": 1.2618, + "step": 689 + }, + { + "epoch": 0.46799491309877067, + "grad_norm": 0.781087543645866, + "learning_rate": 2.304397665147537e-05, + "loss": 1.2725, + "step": 690 + }, + { + "epoch": 0.46867316659601527, + "grad_norm": 0.7258903133686357, + "learning_rate": 2.3000512468680444e-05, + "loss": 1.2811, + "step": 691 + }, + { + "epoch": 0.46935142009325986, + "grad_norm": 0.7357562054565555, + "learning_rate": 2.29570337837971e-05, + "loss": 1.2519, + "step": 692 + }, + { + "epoch": 0.47002967359050446, + "grad_norm": 0.7073247488405348, + "learning_rate": 2.291354080696668e-05, + "loss": 1.2674, + "step": 693 + }, + { + "epoch": 0.47070792708774906, + "grad_norm": 0.7416390462669545, + "learning_rate": 2.2870033748399613e-05, + "loss": 1.2871, + "step": 694 + }, + { + "epoch": 0.47138618058499365, + "grad_norm": 0.7412100881028095, + "learning_rate": 2.2826512818374386e-05, + "loss": 1.2598, + "step": 695 + }, + { + "epoch": 0.47206443408223825, + "grad_norm": 0.7263342004136881, + "learning_rate": 2.2782978227236515e-05, + "loss": 1.2836, + "step": 696 + }, + { + "epoch": 0.47274268757948285, + "grad_norm": 0.7377485093769998, + "learning_rate": 2.273943018539755e-05, + "loss": 1.2705, + "step": 697 + }, + { + "epoch": 0.47342094107672744, + "grad_norm": 0.7505195569359884, + "learning_rate": 2.2695868903334075e-05, + "loss": 1.3118, + "step": 698 + }, + { + "epoch": 0.47409919457397204, + "grad_norm": 0.7919153501492263, + "learning_rate": 2.2652294591586625e-05, + "loss": 1.2968, + "step": 699 + }, + { + "epoch": 0.47477744807121663, + "grad_norm": 0.765268302876366, + "learning_rate": 2.2608707460758742e-05, + "loss": 1.2666, + "step": 700 + }, + { + "epoch": 0.47545570156846123, + "grad_norm": 0.747503993072507, + "learning_rate": 2.2565107721515913e-05, + "loss": 1.2705, + "step": 701 + }, + { + "epoch": 0.4761339550657058, + "grad_norm": 0.7303954806914537, + "learning_rate": 2.2521495584584567e-05, + "loss": 1.415, + "step": 702 + }, + { + "epoch": 0.4768122085629504, + "grad_norm": 0.7966191203716216, + "learning_rate": 2.247787126075105e-05, + "loss": 1.2616, + "step": 703 + }, + { + "epoch": 0.477490462060195, + "grad_norm": 0.5339493405526096, + "learning_rate": 2.243423496086061e-05, + "loss": 1.3757, + "step": 704 + }, + { + "epoch": 0.4781687155574396, + "grad_norm": 0.7954246313983925, + "learning_rate": 2.239058689581638e-05, + "loss": 1.2622, + "step": 705 + }, + { + "epoch": 0.4788469690546842, + "grad_norm": 0.76273918772253, + "learning_rate": 2.2346927276578363e-05, + "loss": 1.2958, + "step": 706 + }, + { + "epoch": 0.4795252225519288, + "grad_norm": 0.741611495665349, + "learning_rate": 2.2303256314162392e-05, + "loss": 1.2703, + "step": 707 + }, + { + "epoch": 0.4802034760491734, + "grad_norm": 0.7802891637540444, + "learning_rate": 2.2259574219639128e-05, + "loss": 1.267, + "step": 708 + }, + { + "epoch": 0.48088172954641795, + "grad_norm": 0.7359657086413339, + "learning_rate": 2.221588120413305e-05, + "loss": 1.2536, + "step": 709 + }, + { + "epoch": 0.48155998304366254, + "grad_norm": 0.7282463841135269, + "learning_rate": 2.2172177478821397e-05, + "loss": 1.2905, + "step": 710 + }, + { + "epoch": 0.48223823654090714, + "grad_norm": 0.7559767659269689, + "learning_rate": 2.212846325493319e-05, + "loss": 1.295, + "step": 711 + }, + { + "epoch": 0.48291649003815174, + "grad_norm": 0.7345869412181345, + "learning_rate": 2.208473874374818e-05, + "loss": 1.28, + "step": 712 + }, + { + "epoch": 0.48359474353539633, + "grad_norm": 0.8395369552411838, + "learning_rate": 2.2041004156595845e-05, + "loss": 1.2943, + "step": 713 + }, + { + "epoch": 0.48427299703264093, + "grad_norm": 0.7656500828016878, + "learning_rate": 2.1997259704854362e-05, + "loss": 1.3387, + "step": 714 + }, + { + "epoch": 0.4849512505298855, + "grad_norm": 0.7708038244468336, + "learning_rate": 2.1953505599949577e-05, + "loss": 1.3023, + "step": 715 + }, + { + "epoch": 0.4856295040271301, + "grad_norm": 0.7218970860061366, + "learning_rate": 2.1909742053354005e-05, + "loss": 1.266, + "step": 716 + }, + { + "epoch": 0.4863077575243747, + "grad_norm": 0.7682310473494609, + "learning_rate": 2.1865969276585787e-05, + "loss": 1.2693, + "step": 717 + }, + { + "epoch": 0.4869860110216193, + "grad_norm": 0.7471960328816069, + "learning_rate": 2.1822187481207675e-05, + "loss": 1.2534, + "step": 718 + }, + { + "epoch": 0.4876642645188639, + "grad_norm": 0.7727623743996936, + "learning_rate": 2.1778396878826008e-05, + "loss": 1.2781, + "step": 719 + }, + { + "epoch": 0.4883425180161085, + "grad_norm": 0.7065341701899533, + "learning_rate": 2.17345976810897e-05, + "loss": 1.2897, + "step": 720 + }, + { + "epoch": 0.4890207715133531, + "grad_norm": 0.8056543929766073, + "learning_rate": 2.1690790099689197e-05, + "loss": 1.2828, + "step": 721 + }, + { + "epoch": 0.4896990250105977, + "grad_norm": 0.7330818708592541, + "learning_rate": 2.1646974346355472e-05, + "loss": 1.271, + "step": 722 + }, + { + "epoch": 0.4903772785078423, + "grad_norm": 0.7846831270623186, + "learning_rate": 2.1603150632858986e-05, + "loss": 1.2793, + "step": 723 + }, + { + "epoch": 0.4910555320050869, + "grad_norm": 0.7300693718708892, + "learning_rate": 2.15593191710087e-05, + "loss": 1.293, + "step": 724 + }, + { + "epoch": 0.4917337855023315, + "grad_norm": 0.7133388247822674, + "learning_rate": 2.1515480172650983e-05, + "loss": 1.2889, + "step": 725 + }, + { + "epoch": 0.4924120389995761, + "grad_norm": 0.733479372471087, + "learning_rate": 2.1471633849668666e-05, + "loss": 1.2909, + "step": 726 + }, + { + "epoch": 0.4930902924968207, + "grad_norm": 0.7711594632629859, + "learning_rate": 2.1427780413979954e-05, + "loss": 1.2554, + "step": 727 + }, + { + "epoch": 0.4937685459940653, + "grad_norm": 0.7206642982288171, + "learning_rate": 2.1383920077537445e-05, + "loss": 1.2825, + "step": 728 + }, + { + "epoch": 0.4944467994913099, + "grad_norm": 0.7256230446585492, + "learning_rate": 2.1340053052327087e-05, + "loss": 1.3023, + "step": 729 + }, + { + "epoch": 0.49512505298855447, + "grad_norm": 0.7698155651507559, + "learning_rate": 2.1296179550367152e-05, + "loss": 1.2886, + "step": 730 + }, + { + "epoch": 0.49580330648579907, + "grad_norm": 0.7605859306746997, + "learning_rate": 2.1252299783707233e-05, + "loss": 1.2605, + "step": 731 + }, + { + "epoch": 0.49648155998304366, + "grad_norm": 0.698780430218635, + "learning_rate": 2.120841396442717e-05, + "loss": 1.2415, + "step": 732 + }, + { + "epoch": 0.49715981348028826, + "grad_norm": 0.7881374449205716, + "learning_rate": 2.116452230463608e-05, + "loss": 1.2763, + "step": 733 + }, + { + "epoch": 0.49783806697753286, + "grad_norm": 0.7347578225034074, + "learning_rate": 2.1120625016471302e-05, + "loss": 1.2901, + "step": 734 + }, + { + "epoch": 0.49851632047477745, + "grad_norm": 0.7267350175824305, + "learning_rate": 2.107672231209738e-05, + "loss": 1.254, + "step": 735 + }, + { + "epoch": 0.49919457397202205, + "grad_norm": 0.9191272469301005, + "learning_rate": 2.1032814403705028e-05, + "loss": 1.4062, + "step": 736 + }, + { + "epoch": 0.49987282746926665, + "grad_norm": 0.8087334133463818, + "learning_rate": 2.098890150351013e-05, + "loss": 1.2988, + "step": 737 + }, + { + "epoch": 0.5005510809665112, + "grad_norm": 0.7873244004928791, + "learning_rate": 2.0944983823752664e-05, + "loss": 1.2786, + "step": 738 + }, + { + "epoch": 0.5012293344637558, + "grad_norm": 0.7551947535757042, + "learning_rate": 2.0901061576695754e-05, + "loss": 1.2648, + "step": 739 + }, + { + "epoch": 0.5019075879610004, + "grad_norm": 0.7238097481098938, + "learning_rate": 2.085713497462456e-05, + "loss": 1.2745, + "step": 740 + }, + { + "epoch": 0.502585841458245, + "grad_norm": 0.7863072916144123, + "learning_rate": 2.08132042298453e-05, + "loss": 1.2938, + "step": 741 + }, + { + "epoch": 0.5032640949554896, + "grad_norm": 0.7321035315725165, + "learning_rate": 2.076926955468423e-05, + "loss": 1.2626, + "step": 742 + }, + { + "epoch": 0.5039423484527342, + "grad_norm": 0.7758679023626488, + "learning_rate": 2.072533116148658e-05, + "loss": 1.2786, + "step": 743 + }, + { + "epoch": 0.5046206019499788, + "grad_norm": 0.7620697769692747, + "learning_rate": 2.0681389262615574e-05, + "loss": 1.2546, + "step": 744 + }, + { + "epoch": 0.5052988554472234, + "grad_norm": 0.6988722404611495, + "learning_rate": 2.0637444070451346e-05, + "loss": 1.2647, + "step": 745 + }, + { + "epoch": 0.505977108944468, + "grad_norm": 0.736111947921635, + "learning_rate": 2.0593495797389994e-05, + "loss": 1.3025, + "step": 746 + }, + { + "epoch": 0.5066553624417126, + "grad_norm": 0.7598553414031834, + "learning_rate": 2.054954465584246e-05, + "loss": 1.2898, + "step": 747 + }, + { + "epoch": 0.5073336159389572, + "grad_norm": 0.77145507844893, + "learning_rate": 2.0505590858233587e-05, + "loss": 1.2901, + "step": 748 + }, + { + "epoch": 0.5080118694362018, + "grad_norm": 0.7450436756941183, + "learning_rate": 2.0461634617001022e-05, + "loss": 1.2741, + "step": 749 + }, + { + "epoch": 0.5086901229334464, + "grad_norm": 0.7506494842537246, + "learning_rate": 2.0417676144594257e-05, + "loss": 1.2665, + "step": 750 + }, + { + "epoch": 0.509368376430691, + "grad_norm": 0.8068507358614899, + "learning_rate": 2.0373715653473534e-05, + "loss": 1.2781, + "step": 751 + }, + { + "epoch": 0.5100466299279356, + "grad_norm": 0.7559844586490145, + "learning_rate": 2.032975335610888e-05, + "loss": 1.2462, + "step": 752 + }, + { + "epoch": 0.5107248834251802, + "grad_norm": 0.7382892331154312, + "learning_rate": 2.0285789464979033e-05, + "loss": 1.2425, + "step": 753 + }, + { + "epoch": 0.5114031369224248, + "grad_norm": 0.7536737034033136, + "learning_rate": 2.0241824192570446e-05, + "loss": 1.2958, + "step": 754 + }, + { + "epoch": 0.5120813904196694, + "grad_norm": 0.7821684212875851, + "learning_rate": 2.0197857751376237e-05, + "loss": 1.2565, + "step": 755 + }, + { + "epoch": 0.512759643916914, + "grad_norm": 0.8176897966736769, + "learning_rate": 2.0153890353895186e-05, + "loss": 1.2899, + "step": 756 + }, + { + "epoch": 0.5134378974141586, + "grad_norm": 0.7559675257892411, + "learning_rate": 2.010992221263068e-05, + "loss": 1.2981, + "step": 757 + }, + { + "epoch": 0.5141161509114032, + "grad_norm": 0.7468879468648103, + "learning_rate": 2.006595354008971e-05, + "loss": 1.2732, + "step": 758 + }, + { + "epoch": 0.5147944044086478, + "grad_norm": 0.7181990484784445, + "learning_rate": 2.0021984548781844e-05, + "loss": 1.2893, + "step": 759 + }, + { + "epoch": 0.5154726579058924, + "grad_norm": 0.7481524052553766, + "learning_rate": 1.9978015451218166e-05, + "loss": 1.2711, + "step": 760 + }, + { + "epoch": 0.516150911403137, + "grad_norm": 0.7676149070117987, + "learning_rate": 1.9934046459910293e-05, + "loss": 1.2782, + "step": 761 + }, + { + "epoch": 0.5168291649003816, + "grad_norm": 0.7467675834428721, + "learning_rate": 1.989007778736933e-05, + "loss": 1.2741, + "step": 762 + }, + { + "epoch": 0.5175074183976262, + "grad_norm": 0.7079588693786695, + "learning_rate": 1.9846109646104824e-05, + "loss": 1.2785, + "step": 763 + }, + { + "epoch": 0.5181856718948707, + "grad_norm": 0.7567069792021545, + "learning_rate": 1.9802142248623767e-05, + "loss": 1.2952, + "step": 764 + }, + { + "epoch": 0.5188639253921153, + "grad_norm": 0.7160899538626819, + "learning_rate": 1.9758175807429564e-05, + "loss": 1.2492, + "step": 765 + }, + { + "epoch": 0.5195421788893599, + "grad_norm": 0.7375646034553129, + "learning_rate": 1.971421053502097e-05, + "loss": 1.2725, + "step": 766 + }, + { + "epoch": 0.5202204323866045, + "grad_norm": 0.7459565281487732, + "learning_rate": 1.9670246643891125e-05, + "loss": 1.2887, + "step": 767 + }, + { + "epoch": 0.5208986858838491, + "grad_norm": 0.7241813728113365, + "learning_rate": 1.962628434652647e-05, + "loss": 1.2787, + "step": 768 + }, + { + "epoch": 0.5215769393810937, + "grad_norm": 0.7377185301313828, + "learning_rate": 1.9582323855405753e-05, + "loss": 1.2604, + "step": 769 + }, + { + "epoch": 0.5222551928783383, + "grad_norm": 0.7619328845955535, + "learning_rate": 1.953836538299898e-05, + "loss": 1.2484, + "step": 770 + }, + { + "epoch": 0.5229334463755829, + "grad_norm": 0.771598320473821, + "learning_rate": 1.9494409141766416e-05, + "loss": 1.3023, + "step": 771 + }, + { + "epoch": 0.5236116998728275, + "grad_norm": 0.7575952299434909, + "learning_rate": 1.9450455344157547e-05, + "loss": 1.2932, + "step": 772 + }, + { + "epoch": 0.5242899533700721, + "grad_norm": 0.7497620136321677, + "learning_rate": 1.940650420261001e-05, + "loss": 1.2816, + "step": 773 + }, + { + "epoch": 0.5249682068673167, + "grad_norm": 0.8152292701670396, + "learning_rate": 1.936255592954866e-05, + "loss": 1.271, + "step": 774 + }, + { + "epoch": 0.5256464603645613, + "grad_norm": 0.731035301388366, + "learning_rate": 1.9318610737384436e-05, + "loss": 1.2606, + "step": 775 + }, + { + "epoch": 0.5263247138618059, + "grad_norm": 0.7408084710649866, + "learning_rate": 1.927466883851343e-05, + "loss": 1.2823, + "step": 776 + }, + { + "epoch": 0.5270029673590505, + "grad_norm": 0.769360474224272, + "learning_rate": 1.923073044531578e-05, + "loss": 1.2809, + "step": 777 + }, + { + "epoch": 0.527681220856295, + "grad_norm": 0.790407790205139, + "learning_rate": 1.918679577015471e-05, + "loss": 1.3015, + "step": 778 + }, + { + "epoch": 0.5283594743535396, + "grad_norm": 0.7512918107436672, + "learning_rate": 1.9142865025375447e-05, + "loss": 1.2853, + "step": 779 + }, + { + "epoch": 0.5290377278507842, + "grad_norm": 0.7367728132150647, + "learning_rate": 1.9098938423304246e-05, + "loss": 1.2742, + "step": 780 + }, + { + "epoch": 0.5297159813480288, + "grad_norm": 0.7137355054556672, + "learning_rate": 1.905501617624734e-05, + "loss": 1.2466, + "step": 781 + }, + { + "epoch": 0.5303942348452734, + "grad_norm": 0.749371561195395, + "learning_rate": 1.9011098496489876e-05, + "loss": 1.2653, + "step": 782 + }, + { + "epoch": 0.531072488342518, + "grad_norm": 0.7495797022110012, + "learning_rate": 1.896718559629498e-05, + "loss": 1.2784, + "step": 783 + }, + { + "epoch": 0.5317507418397626, + "grad_norm": 0.7192581639726054, + "learning_rate": 1.8923277687902625e-05, + "loss": 1.2711, + "step": 784 + }, + { + "epoch": 0.5324289953370072, + "grad_norm": 0.7261147473593981, + "learning_rate": 1.8879374983528708e-05, + "loss": 1.2525, + "step": 785 + }, + { + "epoch": 0.5331072488342518, + "grad_norm": 0.6962500047334699, + "learning_rate": 1.8835477695363926e-05, + "loss": 1.2657, + "step": 786 + }, + { + "epoch": 0.5337855023314964, + "grad_norm": 0.7553121004344828, + "learning_rate": 1.8791586035572832e-05, + "loss": 1.2738, + "step": 787 + }, + { + "epoch": 0.534463755828741, + "grad_norm": 0.7746187801268444, + "learning_rate": 1.8747700216292774e-05, + "loss": 1.2579, + "step": 788 + }, + { + "epoch": 0.5351420093259855, + "grad_norm": 0.7464557600044285, + "learning_rate": 1.8703820449632844e-05, + "loss": 1.2936, + "step": 789 + }, + { + "epoch": 0.5358202628232301, + "grad_norm": 0.7499003825986563, + "learning_rate": 1.865994694767292e-05, + "loss": 1.28, + "step": 790 + }, + { + "epoch": 0.5364985163204747, + "grad_norm": 0.7312593356679844, + "learning_rate": 1.861607992246256e-05, + "loss": 1.2885, + "step": 791 + }, + { + "epoch": 0.5371767698177193, + "grad_norm": 0.7614005096187081, + "learning_rate": 1.8572219586020056e-05, + "loss": 1.2682, + "step": 792 + }, + { + "epoch": 0.5378550233149639, + "grad_norm": 0.7017023223046736, + "learning_rate": 1.8528366150331344e-05, + "loss": 1.254, + "step": 793 + }, + { + "epoch": 0.5385332768122085, + "grad_norm": 0.7466796945484455, + "learning_rate": 1.848451982734902e-05, + "loss": 1.2767, + "step": 794 + }, + { + "epoch": 0.5392115303094531, + "grad_norm": 0.730197813705563, + "learning_rate": 1.8440680828991308e-05, + "loss": 1.2655, + "step": 795 + }, + { + "epoch": 0.5398897838066977, + "grad_norm": 0.7661939092830227, + "learning_rate": 1.839684936714101e-05, + "loss": 1.2594, + "step": 796 + }, + { + "epoch": 0.5405680373039423, + "grad_norm": 0.6534248100019572, + "learning_rate": 1.8353025653644535e-05, + "loss": 1.4072, + "step": 797 + }, + { + "epoch": 0.5412462908011869, + "grad_norm": 0.7630832597467281, + "learning_rate": 1.830920990031081e-05, + "loss": 1.2739, + "step": 798 + }, + { + "epoch": 0.5419245442984315, + "grad_norm": 0.7981120415918996, + "learning_rate": 1.826540231891031e-05, + "loss": 1.2751, + "step": 799 + }, + { + "epoch": 0.5426027977956761, + "grad_norm": 0.7436141589265621, + "learning_rate": 1.8221603121174e-05, + "loss": 1.28, + "step": 800 + }, + { + "epoch": 0.5432810512929207, + "grad_norm": 0.7369961328790705, + "learning_rate": 1.8177812518792332e-05, + "loss": 1.2706, + "step": 801 + }, + { + "epoch": 0.5439593047901653, + "grad_norm": 0.738256573102201, + "learning_rate": 1.813403072341422e-05, + "loss": 1.2885, + "step": 802 + }, + { + "epoch": 0.5446375582874099, + "grad_norm": 0.7472209862893595, + "learning_rate": 1.8090257946645998e-05, + "loss": 1.257, + "step": 803 + }, + { + "epoch": 0.5453158117846545, + "grad_norm": 0.710354620157084, + "learning_rate": 1.804649440005043e-05, + "loss": 1.2455, + "step": 804 + }, + { + "epoch": 0.5459940652818991, + "grad_norm": 0.7340068410967466, + "learning_rate": 1.8002740295145645e-05, + "loss": 1.3025, + "step": 805 + }, + { + "epoch": 0.5466723187791437, + "grad_norm": 0.48470218124317466, + "learning_rate": 1.7958995843404165e-05, + "loss": 1.4318, + "step": 806 + }, + { + "epoch": 0.5473505722763883, + "grad_norm": 0.7600875770027712, + "learning_rate": 1.7915261256251825e-05, + "loss": 1.2491, + "step": 807 + }, + { + "epoch": 0.5480288257736329, + "grad_norm": 0.8090050109964079, + "learning_rate": 1.787153674506682e-05, + "loss": 1.2578, + "step": 808 + }, + { + "epoch": 0.5487070792708775, + "grad_norm": 0.7219551531427646, + "learning_rate": 1.782782252117861e-05, + "loss": 1.2651, + "step": 809 + }, + { + "epoch": 0.5493853327681221, + "grad_norm": 0.7882648722353363, + "learning_rate": 1.7784118795866954e-05, + "loss": 1.3079, + "step": 810 + }, + { + "epoch": 0.5500635862653667, + "grad_norm": 0.7362810674678596, + "learning_rate": 1.774042578036088e-05, + "loss": 1.2399, + "step": 811 + }, + { + "epoch": 0.5507418397626113, + "grad_norm": 0.7673342227031724, + "learning_rate": 1.769674368583761e-05, + "loss": 1.2736, + "step": 812 + }, + { + "epoch": 0.5514200932598559, + "grad_norm": 0.7485637816315667, + "learning_rate": 1.765307272342165e-05, + "loss": 1.2742, + "step": 813 + }, + { + "epoch": 0.5520983467571005, + "grad_norm": 0.7991809227370478, + "learning_rate": 1.7609413104183623e-05, + "loss": 1.3009, + "step": 814 + }, + { + "epoch": 0.552776600254345, + "grad_norm": 0.72841400665626, + "learning_rate": 1.75657650391394e-05, + "loss": 1.2415, + "step": 815 + }, + { + "epoch": 0.5534548537515896, + "grad_norm": 0.7328231976918355, + "learning_rate": 1.7522128739248956e-05, + "loss": 1.2642, + "step": 816 + }, + { + "epoch": 0.5541331072488342, + "grad_norm": 0.7648044415971349, + "learning_rate": 1.7478504415415437e-05, + "loss": 1.2465, + "step": 817 + }, + { + "epoch": 0.5548113607460788, + "grad_norm": 0.7683617384717926, + "learning_rate": 1.743489227848409e-05, + "loss": 1.2578, + "step": 818 + }, + { + "epoch": 0.5554896142433234, + "grad_norm": 0.7388908337530937, + "learning_rate": 1.739129253924126e-05, + "loss": 1.2859, + "step": 819 + }, + { + "epoch": 0.556167867740568, + "grad_norm": 0.8056916671907673, + "learning_rate": 1.734770540841338e-05, + "loss": 1.2399, + "step": 820 + }, + { + "epoch": 0.5568461212378126, + "grad_norm": 0.7547650035174724, + "learning_rate": 1.730413109666593e-05, + "loss": 1.2685, + "step": 821 + }, + { + "epoch": 0.5575243747350572, + "grad_norm": 0.7299262736595057, + "learning_rate": 1.7260569814602452e-05, + "loss": 1.2188, + "step": 822 + }, + { + "epoch": 0.5582026282323018, + "grad_norm": 0.7525499939514374, + "learning_rate": 1.7217021772763495e-05, + "loss": 1.2618, + "step": 823 + }, + { + "epoch": 0.5588808817295464, + "grad_norm": 0.7098846627583694, + "learning_rate": 1.7173487181625618e-05, + "loss": 1.2711, + "step": 824 + }, + { + "epoch": 0.559559135226791, + "grad_norm": 0.7459859867180162, + "learning_rate": 1.712996625160039e-05, + "loss": 1.2846, + "step": 825 + }, + { + "epoch": 0.5602373887240356, + "grad_norm": 0.757384152710219, + "learning_rate": 1.708645919303332e-05, + "loss": 1.2668, + "step": 826 + }, + { + "epoch": 0.5609156422212802, + "grad_norm": 0.7823318074574718, + "learning_rate": 1.7042966216202906e-05, + "loss": 1.2735, + "step": 827 + }, + { + "epoch": 0.5615938957185248, + "grad_norm": 0.706030098280657, + "learning_rate": 1.6999487531319563e-05, + "loss": 1.2406, + "step": 828 + }, + { + "epoch": 0.5622721492157694, + "grad_norm": 0.7782884788037362, + "learning_rate": 1.6956023348524643e-05, + "loss": 1.2823, + "step": 829 + }, + { + "epoch": 0.562950402713014, + "grad_norm": 0.7701685920541365, + "learning_rate": 1.6912573877889393e-05, + "loss": 1.2658, + "step": 830 + }, + { + "epoch": 0.5636286562102586, + "grad_norm": 0.7828155055979443, + "learning_rate": 1.6869139329413967e-05, + "loss": 1.2987, + "step": 831 + }, + { + "epoch": 0.5643069097075032, + "grad_norm": 0.7585431050580741, + "learning_rate": 1.6825719913026398e-05, + "loss": 1.2746, + "step": 832 + }, + { + "epoch": 0.5649851632047478, + "grad_norm": 0.7975578679187447, + "learning_rate": 1.6782315838581566e-05, + "loss": 1.3008, + "step": 833 + }, + { + "epoch": 0.5656634167019924, + "grad_norm": 0.7407472825658853, + "learning_rate": 1.6738927315860216e-05, + "loss": 1.2584, + "step": 834 + }, + { + "epoch": 0.566341670199237, + "grad_norm": 0.7400394043223196, + "learning_rate": 1.6695554554567925e-05, + "loss": 1.2766, + "step": 835 + }, + { + "epoch": 0.5670199236964816, + "grad_norm": 0.7982859253619563, + "learning_rate": 1.665219776433409e-05, + "loss": 1.2735, + "step": 836 + }, + { + "epoch": 0.5676981771937262, + "grad_norm": 0.7558919073776923, + "learning_rate": 1.6608857154710905e-05, + "loss": 1.2239, + "step": 837 + }, + { + "epoch": 0.5683764306909708, + "grad_norm": 0.7633849662408336, + "learning_rate": 1.6565532935172387e-05, + "loss": 1.2812, + "step": 838 + }, + { + "epoch": 0.5690546841882154, + "grad_norm": 0.7259209453123879, + "learning_rate": 1.6522225315113313e-05, + "loss": 1.2505, + "step": 839 + }, + { + "epoch": 0.56973293768546, + "grad_norm": 0.7219112999111875, + "learning_rate": 1.6478934503848237e-05, + "loss": 1.2573, + "step": 840 + }, + { + "epoch": 0.5704111911827046, + "grad_norm": 0.761409047706854, + "learning_rate": 1.6435660710610485e-05, + "loss": 1.2477, + "step": 841 + }, + { + "epoch": 0.5710894446799492, + "grad_norm": 0.7445867519761595, + "learning_rate": 1.639240414455111e-05, + "loss": 1.2502, + "step": 842 + }, + { + "epoch": 0.5717676981771938, + "grad_norm": 0.7156396178869799, + "learning_rate": 1.6349165014737923e-05, + "loss": 1.2299, + "step": 843 + }, + { + "epoch": 0.5724459516744383, + "grad_norm": 0.7901111725085388, + "learning_rate": 1.630594353015444e-05, + "loss": 1.2958, + "step": 844 + }, + { + "epoch": 0.5731242051716829, + "grad_norm": 0.788495518441872, + "learning_rate": 1.626273989969892e-05, + "loss": 1.2464, + "step": 845 + }, + { + "epoch": 0.5738024586689275, + "grad_norm": 0.7125175624415945, + "learning_rate": 1.6219554332183307e-05, + "loss": 1.272, + "step": 846 + }, + { + "epoch": 0.5744807121661721, + "grad_norm": 0.5312981682163216, + "learning_rate": 1.617638703633225e-05, + "loss": 1.3833, + "step": 847 + }, + { + "epoch": 0.5751589656634167, + "grad_norm": 0.7154786352168268, + "learning_rate": 1.613323822078211e-05, + "loss": 1.2373, + "step": 848 + }, + { + "epoch": 0.5758372191606613, + "grad_norm": 0.7322890290043707, + "learning_rate": 1.609010809407988e-05, + "loss": 1.2353, + "step": 849 + }, + { + "epoch": 0.5765154726579059, + "grad_norm": 0.7402441049307272, + "learning_rate": 1.604699686468228e-05, + "loss": 1.2683, + "step": 850 + }, + { + "epoch": 0.5771937261551505, + "grad_norm": 0.6934244915813785, + "learning_rate": 1.600390474095465e-05, + "loss": 1.2503, + "step": 851 + }, + { + "epoch": 0.5778719796523951, + "grad_norm": 0.7306958031183086, + "learning_rate": 1.5960831931170024e-05, + "loss": 1.2754, + "step": 852 + }, + { + "epoch": 0.5785502331496397, + "grad_norm": 0.7414223143033765, + "learning_rate": 1.5917778643508052e-05, + "loss": 1.281, + "step": 853 + }, + { + "epoch": 0.5792284866468843, + "grad_norm": 0.7367648536751301, + "learning_rate": 1.587474508605407e-05, + "loss": 1.2624, + "step": 854 + }, + { + "epoch": 0.5799067401441289, + "grad_norm": 0.7930690411423662, + "learning_rate": 1.5831731466798032e-05, + "loss": 1.2484, + "step": 855 + }, + { + "epoch": 0.5805849936413735, + "grad_norm": 0.8057908289247886, + "learning_rate": 1.578873799363351e-05, + "loss": 1.276, + "step": 856 + }, + { + "epoch": 0.5812632471386181, + "grad_norm": 0.7374427881988856, + "learning_rate": 1.574576487435675e-05, + "loss": 1.2507, + "step": 857 + }, + { + "epoch": 0.5819415006358627, + "grad_norm": 0.7534989203331487, + "learning_rate": 1.5702812316665576e-05, + "loss": 1.2524, + "step": 858 + }, + { + "epoch": 0.5826197541331073, + "grad_norm": 0.7372448874629303, + "learning_rate": 1.5659880528158464e-05, + "loss": 1.2622, + "step": 859 + }, + { + "epoch": 0.5832980076303519, + "grad_norm": 0.7697321537663581, + "learning_rate": 1.56169697163335e-05, + "loss": 1.2572, + "step": 860 + }, + { + "epoch": 0.5839762611275965, + "grad_norm": 0.8052964355736111, + "learning_rate": 1.557408008858738e-05, + "loss": 1.2446, + "step": 861 + }, + { + "epoch": 0.5846545146248411, + "grad_norm": 0.7354976869190977, + "learning_rate": 1.553121185221443e-05, + "loss": 1.2636, + "step": 862 + }, + { + "epoch": 0.5853327681220857, + "grad_norm": 0.7144607440214421, + "learning_rate": 1.548836521440556e-05, + "loss": 1.2576, + "step": 863 + }, + { + "epoch": 0.5860110216193303, + "grad_norm": 0.7752669102183751, + "learning_rate": 1.5445540382247317e-05, + "loss": 1.2706, + "step": 864 + }, + { + "epoch": 0.5866892751165749, + "grad_norm": 0.7671360295238987, + "learning_rate": 1.5402737562720835e-05, + "loss": 1.2694, + "step": 865 + }, + { + "epoch": 0.5873675286138195, + "grad_norm": 0.7339904919637064, + "learning_rate": 1.5359956962700875e-05, + "loss": 1.2265, + "step": 866 + }, + { + "epoch": 0.5880457821110641, + "grad_norm": 0.7453945953489697, + "learning_rate": 1.531719878895479e-05, + "loss": 1.2794, + "step": 867 + }, + { + "epoch": 0.5887240356083087, + "grad_norm": 0.7049362289674199, + "learning_rate": 1.5274463248141554e-05, + "loss": 1.2199, + "step": 868 + }, + { + "epoch": 0.5894022891055531, + "grad_norm": 0.7199336413075009, + "learning_rate": 1.523175054681075e-05, + "loss": 1.2329, + "step": 869 + }, + { + "epoch": 0.5900805426027977, + "grad_norm": 0.7051800273850447, + "learning_rate": 1.5189060891401564e-05, + "loss": 1.2517, + "step": 870 + }, + { + "epoch": 0.5907587961000423, + "grad_norm": 0.7253522354142042, + "learning_rate": 1.5146394488241812e-05, + "loss": 1.2464, + "step": 871 + }, + { + "epoch": 0.5914370495972869, + "grad_norm": 0.7180955256697961, + "learning_rate": 1.5103751543546906e-05, + "loss": 1.2384, + "step": 872 + }, + { + "epoch": 0.5921153030945315, + "grad_norm": 0.7370359916598486, + "learning_rate": 1.5061132263418903e-05, + "loss": 1.2977, + "step": 873 + }, + { + "epoch": 0.5927935565917761, + "grad_norm": 0.7328367261254487, + "learning_rate": 1.5018536853845461e-05, + "loss": 1.3052, + "step": 874 + }, + { + "epoch": 0.5934718100890207, + "grad_norm": 0.7140757454830974, + "learning_rate": 1.4975965520698887e-05, + "loss": 1.2721, + "step": 875 + }, + { + "epoch": 0.5941500635862653, + "grad_norm": 0.75263882554648, + "learning_rate": 1.4933418469735103e-05, + "loss": 1.23, + "step": 876 + }, + { + "epoch": 0.5948283170835099, + "grad_norm": 0.706673856380447, + "learning_rate": 1.4890895906592682e-05, + "loss": 1.2545, + "step": 877 + }, + { + "epoch": 0.5955065705807545, + "grad_norm": 0.7394045603711208, + "learning_rate": 1.4848398036791856e-05, + "loss": 1.2518, + "step": 878 + }, + { + "epoch": 0.5961848240779991, + "grad_norm": 0.7455235410630398, + "learning_rate": 1.4805925065733468e-05, + "loss": 1.2528, + "step": 879 + }, + { + "epoch": 0.5968630775752437, + "grad_norm": 0.7224215309064259, + "learning_rate": 1.4763477198698062e-05, + "loss": 1.2824, + "step": 880 + }, + { + "epoch": 0.5975413310724883, + "grad_norm": 0.7093906902341754, + "learning_rate": 1.4721054640844826e-05, + "loss": 1.2311, + "step": 881 + }, + { + "epoch": 0.5982195845697329, + "grad_norm": 0.7113504579397316, + "learning_rate": 1.4678657597210633e-05, + "loss": 1.2458, + "step": 882 + }, + { + "epoch": 0.5988978380669775, + "grad_norm": 0.8035763524682514, + "learning_rate": 1.4636286272709033e-05, + "loss": 1.2345, + "step": 883 + }, + { + "epoch": 0.5995760915642221, + "grad_norm": 0.6926055300430728, + "learning_rate": 1.4593940872129268e-05, + "loss": 1.2695, + "step": 884 + }, + { + "epoch": 0.6002543450614667, + "grad_norm": 0.7055103538930207, + "learning_rate": 1.4551621600135319e-05, + "loss": 1.2355, + "step": 885 + }, + { + "epoch": 0.6009325985587113, + "grad_norm": 0.7267167745593457, + "learning_rate": 1.4509328661264824e-05, + "loss": 1.2909, + "step": 886 + }, + { + "epoch": 0.6016108520559559, + "grad_norm": 0.7405252682023891, + "learning_rate": 1.44670622599282e-05, + "loss": 1.2457, + "step": 887 + }, + { + "epoch": 0.6022891055532005, + "grad_norm": 0.7209599678458054, + "learning_rate": 1.4424822600407558e-05, + "loss": 1.2596, + "step": 888 + }, + { + "epoch": 0.6029673590504451, + "grad_norm": 0.6906841638341872, + "learning_rate": 1.4382609886855811e-05, + "loss": 1.2629, + "step": 889 + }, + { + "epoch": 0.6036456125476897, + "grad_norm": 0.7240507304691446, + "learning_rate": 1.4340424323295579e-05, + "loss": 1.2775, + "step": 890 + }, + { + "epoch": 0.6043238660449343, + "grad_norm": 0.708461486979943, + "learning_rate": 1.4298266113618311e-05, + "loss": 1.2555, + "step": 891 + }, + { + "epoch": 0.6050021195421789, + "grad_norm": 0.7043004010120361, + "learning_rate": 1.4256135461583225e-05, + "loss": 1.2327, + "step": 892 + }, + { + "epoch": 0.6056803730394235, + "grad_norm": 0.6954193461319649, + "learning_rate": 1.4214032570816346e-05, + "loss": 1.2673, + "step": 893 + }, + { + "epoch": 0.606358626536668, + "grad_norm": 0.7259164250225965, + "learning_rate": 1.4171957644809533e-05, + "loss": 1.2707, + "step": 894 + }, + { + "epoch": 0.6070368800339127, + "grad_norm": 0.7299474390160032, + "learning_rate": 1.4129910886919472e-05, + "loss": 1.2471, + "step": 895 + }, + { + "epoch": 0.6077151335311572, + "grad_norm": 0.6744713222322375, + "learning_rate": 1.4087892500366725e-05, + "loss": 1.2568, + "step": 896 + }, + { + "epoch": 0.6083933870284018, + "grad_norm": 0.7216386104384064, + "learning_rate": 1.4045902688234711e-05, + "loss": 1.2794, + "step": 897 + }, + { + "epoch": 0.6090716405256464, + "grad_norm": 0.7658293993436277, + "learning_rate": 1.4003941653468758e-05, + "loss": 1.2536, + "step": 898 + }, + { + "epoch": 0.609749894022891, + "grad_norm": 0.7234591048031078, + "learning_rate": 1.39620095988751e-05, + "loss": 1.2694, + "step": 899 + }, + { + "epoch": 0.6104281475201356, + "grad_norm": 0.6981886633012235, + "learning_rate": 1.3920106727119901e-05, + "loss": 1.2803, + "step": 900 + }, + { + "epoch": 0.6111064010173802, + "grad_norm": 0.7248262364678507, + "learning_rate": 1.3878233240728287e-05, + "loss": 1.2675, + "step": 901 + }, + { + "epoch": 0.6117846545146248, + "grad_norm": 0.7658558357705719, + "learning_rate": 1.383638934208335e-05, + "loss": 1.2547, + "step": 902 + }, + { + "epoch": 0.6124629080118694, + "grad_norm": 0.7407751363047477, + "learning_rate": 1.3794575233425187e-05, + "loss": 1.2867, + "step": 903 + }, + { + "epoch": 0.613141161509114, + "grad_norm": 0.7305356284556889, + "learning_rate": 1.37527911168499e-05, + "loss": 1.2639, + "step": 904 + }, + { + "epoch": 0.6138194150063586, + "grad_norm": 0.754015236627855, + "learning_rate": 1.3711037194308653e-05, + "loss": 1.2414, + "step": 905 + }, + { + "epoch": 0.6144976685036032, + "grad_norm": 0.7423896758793486, + "learning_rate": 1.3669313667606655e-05, + "loss": 1.2469, + "step": 906 + }, + { + "epoch": 0.6151759220008478, + "grad_norm": 0.7073978592211552, + "learning_rate": 1.3627620738402221e-05, + "loss": 1.2865, + "step": 907 + }, + { + "epoch": 0.6158541754980924, + "grad_norm": 0.7637953954513205, + "learning_rate": 1.3585958608205779e-05, + "loss": 1.2629, + "step": 908 + }, + { + "epoch": 0.616532428995337, + "grad_norm": 0.6891146950788505, + "learning_rate": 1.3544327478378891e-05, + "loss": 1.2771, + "step": 909 + }, + { + "epoch": 0.6172106824925816, + "grad_norm": 0.6944143467859542, + "learning_rate": 1.3502727550133306e-05, + "loss": 1.2685, + "step": 910 + }, + { + "epoch": 0.6178889359898262, + "grad_norm": 0.7664699532195742, + "learning_rate": 1.3461159024529942e-05, + "loss": 1.2844, + "step": 911 + }, + { + "epoch": 0.6185671894870708, + "grad_norm": 0.7201761910665536, + "learning_rate": 1.3419622102477967e-05, + "loss": 1.2871, + "step": 912 + }, + { + "epoch": 0.6192454429843154, + "grad_norm": 0.7287096841573026, + "learning_rate": 1.3378116984733791e-05, + "loss": 1.2582, + "step": 913 + }, + { + "epoch": 0.61992369648156, + "grad_norm": 0.7374024655158494, + "learning_rate": 1.3336643871900101e-05, + "loss": 1.2479, + "step": 914 + }, + { + "epoch": 0.6206019499788046, + "grad_norm": 0.7344509915633016, + "learning_rate": 1.3295202964424925e-05, + "loss": 1.2704, + "step": 915 + }, + { + "epoch": 0.6212802034760492, + "grad_norm": 0.7182428414638428, + "learning_rate": 1.3253794462600592e-05, + "loss": 1.2547, + "step": 916 + }, + { + "epoch": 0.6219584569732938, + "grad_norm": 0.7456646350427812, + "learning_rate": 1.3212418566562857e-05, + "loss": 1.2791, + "step": 917 + }, + { + "epoch": 0.6226367104705384, + "grad_norm": 0.4978577354755168, + "learning_rate": 1.3171075476289835e-05, + "loss": 1.382, + "step": 918 + }, + { + "epoch": 0.623314963967783, + "grad_norm": 0.745274485467238, + "learning_rate": 1.3129765391601135e-05, + "loss": 1.2466, + "step": 919 + }, + { + "epoch": 0.6239932174650276, + "grad_norm": 0.7878519121126696, + "learning_rate": 1.3088488512156792e-05, + "loss": 1.2795, + "step": 920 + }, + { + "epoch": 0.6246714709622722, + "grad_norm": 0.7220277267261397, + "learning_rate": 1.3047245037456392e-05, + "loss": 1.272, + "step": 921 + }, + { + "epoch": 0.6253497244595168, + "grad_norm": 0.7432803867257498, + "learning_rate": 1.3006035166838068e-05, + "loss": 1.2744, + "step": 922 + }, + { + "epoch": 0.6260279779567614, + "grad_norm": 0.7460263032906728, + "learning_rate": 1.2964859099477499e-05, + "loss": 1.2815, + "step": 923 + }, + { + "epoch": 0.626706231454006, + "grad_norm": 0.6981401291287395, + "learning_rate": 1.2923717034387035e-05, + "loss": 1.2057, + "step": 924 + }, + { + "epoch": 0.6273844849512505, + "grad_norm": 0.7180593879124996, + "learning_rate": 1.2882609170414646e-05, + "loss": 1.2235, + "step": 925 + }, + { + "epoch": 0.6280627384484951, + "grad_norm": 0.751874092601359, + "learning_rate": 1.2841535706243039e-05, + "loss": 1.2681, + "step": 926 + }, + { + "epoch": 0.6287409919457397, + "grad_norm": 0.7238837555578427, + "learning_rate": 1.280049684038861e-05, + "loss": 1.2546, + "step": 927 + }, + { + "epoch": 0.6294192454429843, + "grad_norm": 0.7623351431358577, + "learning_rate": 1.2759492771200588e-05, + "loss": 1.274, + "step": 928 + }, + { + "epoch": 0.6300974989402289, + "grad_norm": 0.726521566712189, + "learning_rate": 1.2718523696859992e-05, + "loss": 1.2556, + "step": 929 + }, + { + "epoch": 0.6307757524374735, + "grad_norm": 0.7294458811856235, + "learning_rate": 1.2677589815378703e-05, + "loss": 1.2611, + "step": 930 + }, + { + "epoch": 0.6314540059347181, + "grad_norm": 0.7337933448476249, + "learning_rate": 1.2636691324598527e-05, + "loss": 1.2638, + "step": 931 + }, + { + "epoch": 0.6321322594319627, + "grad_norm": 0.7620793167630296, + "learning_rate": 1.2595828422190195e-05, + "loss": 1.2631, + "step": 932 + }, + { + "epoch": 0.6328105129292073, + "grad_norm": 0.7333252664894931, + "learning_rate": 1.2555001305652454e-05, + "loss": 1.2744, + "step": 933 + }, + { + "epoch": 0.6334887664264519, + "grad_norm": 0.7009821990937899, + "learning_rate": 1.2514210172311074e-05, + "loss": 1.2504, + "step": 934 + }, + { + "epoch": 0.6341670199236965, + "grad_norm": 0.7252465526498413, + "learning_rate": 1.247345521931792e-05, + "loss": 1.2772, + "step": 935 + }, + { + "epoch": 0.6348452734209411, + "grad_norm": 0.7538115830639216, + "learning_rate": 1.2432736643649985e-05, + "loss": 1.2551, + "step": 936 + }, + { + "epoch": 0.6355235269181857, + "grad_norm": 0.7276674431447986, + "learning_rate": 1.239205464210845e-05, + "loss": 1.2507, + "step": 937 + }, + { + "epoch": 0.6362017804154303, + "grad_norm": 0.7249572800476735, + "learning_rate": 1.2351409411317725e-05, + "loss": 1.2512, + "step": 938 + }, + { + "epoch": 0.6368800339126749, + "grad_norm": 0.7200866241176038, + "learning_rate": 1.2310801147724484e-05, + "loss": 1.2177, + "step": 939 + }, + { + "epoch": 0.6375582874099195, + "grad_norm": 0.6966135730154913, + "learning_rate": 1.2270230047596757e-05, + "loss": 1.238, + "step": 940 + }, + { + "epoch": 0.6382365409071641, + "grad_norm": 0.7119468935646069, + "learning_rate": 1.2229696307022926e-05, + "loss": 1.2271, + "step": 941 + }, + { + "epoch": 0.6389147944044087, + "grad_norm": 0.7384160273867104, + "learning_rate": 1.218920012191084e-05, + "loss": 1.2609, + "step": 942 + }, + { + "epoch": 0.6395930479016533, + "grad_norm": 0.7613184802931767, + "learning_rate": 1.21487416879868e-05, + "loss": 1.2668, + "step": 943 + }, + { + "epoch": 0.6402713013988979, + "grad_norm": 0.7124097044569389, + "learning_rate": 1.2108321200794672e-05, + "loss": 1.2441, + "step": 944 + }, + { + "epoch": 0.6409495548961425, + "grad_norm": 0.7095189582515088, + "learning_rate": 1.2067938855694919e-05, + "loss": 1.2373, + "step": 945 + }, + { + "epoch": 0.6416278083933871, + "grad_norm": 0.7271405420859809, + "learning_rate": 1.202759484786363e-05, + "loss": 1.2449, + "step": 946 + }, + { + "epoch": 0.6423060618906317, + "grad_norm": 0.70386838213244, + "learning_rate": 1.198728937229163e-05, + "loss": 1.2466, + "step": 947 + }, + { + "epoch": 0.6429843153878763, + "grad_norm": 0.7311311552571067, + "learning_rate": 1.1947022623783495e-05, + "loss": 1.236, + "step": 948 + }, + { + "epoch": 0.6436625688851209, + "grad_norm": 0.7445659895700161, + "learning_rate": 1.1906794796956633e-05, + "loss": 1.2718, + "step": 949 + }, + { + "epoch": 0.6443408223823655, + "grad_norm": 0.714864727873201, + "learning_rate": 1.1866606086240325e-05, + "loss": 1.2805, + "step": 950 + }, + { + "epoch": 0.64501907587961, + "grad_norm": 0.7349415371328099, + "learning_rate": 1.1826456685874801e-05, + "loss": 1.2555, + "step": 951 + }, + { + "epoch": 0.6456973293768546, + "grad_norm": 0.7234244895672503, + "learning_rate": 1.1786346789910316e-05, + "loss": 1.2613, + "step": 952 + }, + { + "epoch": 0.6463755828740992, + "grad_norm": 0.7179485946292944, + "learning_rate": 1.1746276592206147e-05, + "loss": 1.2623, + "step": 953 + }, + { + "epoch": 0.6470538363713438, + "grad_norm": 0.7391341017957087, + "learning_rate": 1.1706246286429752e-05, + "loss": 1.2486, + "step": 954 + }, + { + "epoch": 0.6477320898685884, + "grad_norm": 0.7150250700985901, + "learning_rate": 1.1666256066055739e-05, + "loss": 1.2459, + "step": 955 + }, + { + "epoch": 0.648410343365833, + "grad_norm": 0.686582064168532, + "learning_rate": 1.162630612436501e-05, + "loss": 1.2713, + "step": 956 + }, + { + "epoch": 0.6490885968630776, + "grad_norm": 0.7075187279569484, + "learning_rate": 1.1586396654443773e-05, + "loss": 1.223, + "step": 957 + }, + { + "epoch": 0.6497668503603222, + "grad_norm": 0.7178501779928439, + "learning_rate": 1.1546527849182621e-05, + "loss": 1.2227, + "step": 958 + }, + { + "epoch": 0.6504451038575668, + "grad_norm": 0.7192989993624956, + "learning_rate": 1.1506699901275633e-05, + "loss": 1.2598, + "step": 959 + }, + { + "epoch": 0.6511233573548114, + "grad_norm": 0.7301835358461147, + "learning_rate": 1.1466913003219395e-05, + "loss": 1.2332, + "step": 960 + }, + { + "epoch": 0.6518016108520559, + "grad_norm": 0.6994674329929395, + "learning_rate": 1.1427167347312093e-05, + "loss": 1.2699, + "step": 961 + }, + { + "epoch": 0.6524798643493005, + "grad_norm": 0.7078327956045938, + "learning_rate": 1.1387463125652579e-05, + "loss": 1.2414, + "step": 962 + }, + { + "epoch": 0.6531581178465451, + "grad_norm": 0.7630351678619136, + "learning_rate": 1.1347800530139463e-05, + "loss": 1.266, + "step": 963 + }, + { + "epoch": 0.6538363713437897, + "grad_norm": 0.7594897772201343, + "learning_rate": 1.1308179752470147e-05, + "loss": 1.2626, + "step": 964 + }, + { + "epoch": 0.6545146248410343, + "grad_norm": 0.6955575097724982, + "learning_rate": 1.126860098413993e-05, + "loss": 1.2517, + "step": 965 + }, + { + "epoch": 0.6551928783382789, + "grad_norm": 0.7326960738706964, + "learning_rate": 1.1229064416441055e-05, + "loss": 1.2897, + "step": 966 + }, + { + "epoch": 0.6558711318355235, + "grad_norm": 0.7312816709206589, + "learning_rate": 1.1189570240461825e-05, + "loss": 1.2711, + "step": 967 + }, + { + "epoch": 0.6565493853327681, + "grad_norm": 0.7707904224347778, + "learning_rate": 1.1150118647085653e-05, + "loss": 1.2429, + "step": 968 + }, + { + "epoch": 0.6572276388300127, + "grad_norm": 0.7181578925761877, + "learning_rate": 1.111070982699011e-05, + "loss": 1.2603, + "step": 969 + }, + { + "epoch": 0.6579058923272573, + "grad_norm": 0.7485309379333241, + "learning_rate": 1.1071343970646069e-05, + "loss": 1.241, + "step": 970 + }, + { + "epoch": 0.6585841458245019, + "grad_norm": 0.7420054340386768, + "learning_rate": 1.1032021268316742e-05, + "loss": 1.2773, + "step": 971 + }, + { + "epoch": 0.6592623993217465, + "grad_norm": 0.7406520244405697, + "learning_rate": 1.0992741910056758e-05, + "loss": 1.2649, + "step": 972 + }, + { + "epoch": 0.6599406528189911, + "grad_norm": 0.7497464155716638, + "learning_rate": 1.0953506085711258e-05, + "loss": 1.251, + "step": 973 + }, + { + "epoch": 0.6606189063162357, + "grad_norm": 0.7075855670188584, + "learning_rate": 1.0914313984914987e-05, + "loss": 1.2554, + "step": 974 + }, + { + "epoch": 0.6612971598134803, + "grad_norm": 0.7163704059152383, + "learning_rate": 1.0875165797091371e-05, + "loss": 1.2594, + "step": 975 + }, + { + "epoch": 0.6619754133107248, + "grad_norm": 0.7107842817416975, + "learning_rate": 1.0836061711451552e-05, + "loss": 1.2553, + "step": 976 + }, + { + "epoch": 0.6626536668079694, + "grad_norm": 0.7121994112451051, + "learning_rate": 1.0797001916993572e-05, + "loss": 1.2851, + "step": 977 + }, + { + "epoch": 0.663331920305214, + "grad_norm": 0.742016146380806, + "learning_rate": 1.0757986602501365e-05, + "loss": 1.2729, + "step": 978 + }, + { + "epoch": 0.6640101738024586, + "grad_norm": 0.7614468214071379, + "learning_rate": 1.071901595654392e-05, + "loss": 1.2791, + "step": 979 + }, + { + "epoch": 0.6646884272997032, + "grad_norm": 0.7137224832937834, + "learning_rate": 1.0680090167474284e-05, + "loss": 1.2646, + "step": 980 + }, + { + "epoch": 0.6653666807969478, + "grad_norm": 0.6944784698203637, + "learning_rate": 1.0641209423428745e-05, + "loss": 1.2663, + "step": 981 + }, + { + "epoch": 0.6660449342941924, + "grad_norm": 0.6926775709241877, + "learning_rate": 1.0602373912325872e-05, + "loss": 1.2559, + "step": 982 + }, + { + "epoch": 0.666723187791437, + "grad_norm": 0.7121621198396063, + "learning_rate": 1.0563583821865598e-05, + "loss": 1.259, + "step": 983 + }, + { + "epoch": 0.6674014412886816, + "grad_norm": 0.7492579217654587, + "learning_rate": 1.0524839339528344e-05, + "loss": 1.2615, + "step": 984 + }, + { + "epoch": 0.6680796947859262, + "grad_norm": 0.7234426728331795, + "learning_rate": 1.0486140652574077e-05, + "loss": 1.2353, + "step": 985 + }, + { + "epoch": 0.6687579482831708, + "grad_norm": 0.6977853252293624, + "learning_rate": 1.0447487948041454e-05, + "loss": 1.2321, + "step": 986 + }, + { + "epoch": 0.6694362017804154, + "grad_norm": 0.7180470239654396, + "learning_rate": 1.040888141274688e-05, + "loss": 1.2575, + "step": 987 + }, + { + "epoch": 0.67011445527766, + "grad_norm": 0.7166912234194147, + "learning_rate": 1.0370321233283587e-05, + "loss": 1.2592, + "step": 988 + }, + { + "epoch": 0.6707927087749046, + "grad_norm": 0.719471786130956, + "learning_rate": 1.0331807596020804e-05, + "loss": 1.2263, + "step": 989 + }, + { + "epoch": 0.6714709622721492, + "grad_norm": 0.7153381227740333, + "learning_rate": 1.029334068710279e-05, + "loss": 1.257, + "step": 990 + }, + { + "epoch": 0.6721492157693938, + "grad_norm": 0.7211376167286312, + "learning_rate": 1.0254920692447946e-05, + "loss": 1.2396, + "step": 991 + }, + { + "epoch": 0.6728274692666384, + "grad_norm": 0.7353230173357311, + "learning_rate": 1.0216547797747935e-05, + "loss": 1.2579, + "step": 992 + }, + { + "epoch": 0.673505722763883, + "grad_norm": 0.7749549540065295, + "learning_rate": 1.01782221884668e-05, + "loss": 1.2472, + "step": 993 + }, + { + "epoch": 0.6741839762611276, + "grad_norm": 0.7052582586968418, + "learning_rate": 1.013994404984001e-05, + "loss": 1.222, + "step": 994 + }, + { + "epoch": 0.6748622297583722, + "grad_norm": 0.7238807278644459, + "learning_rate": 1.0101713566873612e-05, + "loss": 1.2622, + "step": 995 + }, + { + "epoch": 0.6755404832556168, + "grad_norm": 0.7224437920546443, + "learning_rate": 1.006353092434332e-05, + "loss": 1.2417, + "step": 996 + }, + { + "epoch": 0.6762187367528614, + "grad_norm": 0.7030898940390986, + "learning_rate": 1.002539630679364e-05, + "loss": 1.2487, + "step": 997 + }, + { + "epoch": 0.676896990250106, + "grad_norm": 0.7063999825585107, + "learning_rate": 9.987309898536946e-06, + "loss": 1.2402, + "step": 998 + }, + { + "epoch": 0.6775752437473506, + "grad_norm": 0.7216994485791334, + "learning_rate": 9.949271883652605e-06, + "loss": 1.2535, + "step": 999 + }, + { + "epoch": 0.6782534972445952, + "grad_norm": 0.715290491246088, + "learning_rate": 9.911282445986115e-06, + "loss": 1.2452, + "step": 1000 + }, + { + "epoch": 0.6789317507418398, + "grad_norm": 0.7235774612430038, + "learning_rate": 9.87334176914816e-06, + "loss": 1.2694, + "step": 1001 + }, + { + "epoch": 0.6796100042390844, + "grad_norm": 0.7822215823589889, + "learning_rate": 9.835450036513772e-06, + "loss": 1.2491, + "step": 1002 + }, + { + "epoch": 0.680288257736329, + "grad_norm": 0.7435176319997631, + "learning_rate": 9.797607431221405e-06, + "loss": 1.2358, + "step": 1003 + }, + { + "epoch": 0.6809665112335735, + "grad_norm": 0.7402778365799259, + "learning_rate": 9.759814136172097e-06, + "loss": 1.2333, + "step": 1004 + }, + { + "epoch": 0.6816447647308181, + "grad_norm": 0.6991177876465452, + "learning_rate": 9.722070334028557e-06, + "loss": 1.2448, + "step": 1005 + }, + { + "epoch": 0.6823230182280627, + "grad_norm": 0.7390069128689151, + "learning_rate": 9.684376207214252e-06, + "loss": 1.2667, + "step": 1006 + }, + { + "epoch": 0.6830012717253073, + "grad_norm": 0.744668014688226, + "learning_rate": 9.646731937912596e-06, + "loss": 1.238, + "step": 1007 + }, + { + "epoch": 0.6836795252225519, + "grad_norm": 0.6847769198917258, + "learning_rate": 9.609137708066007e-06, + "loss": 1.249, + "step": 1008 + }, + { + "epoch": 0.6843577787197965, + "grad_norm": 0.7011560538142221, + "learning_rate": 9.571593699375082e-06, + "loss": 1.2518, + "step": 1009 + }, + { + "epoch": 0.6850360322170411, + "grad_norm": 0.7296332469643578, + "learning_rate": 9.534100093297637e-06, + "loss": 1.2667, + "step": 1010 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.7564106571457987, + "learning_rate": 9.49665707104793e-06, + "loss": 1.2803, + "step": 1011 + }, + { + "epoch": 0.6863925392115303, + "grad_norm": 0.7497025167281187, + "learning_rate": 9.459264813595736e-06, + "loss": 1.2676, + "step": 1012 + }, + { + "epoch": 0.6870707927087749, + "grad_norm": 0.6994523711051618, + "learning_rate": 9.421923501665426e-06, + "loss": 1.2505, + "step": 1013 + }, + { + "epoch": 0.6877490462060195, + "grad_norm": 0.7215420284145486, + "learning_rate": 9.384633315735197e-06, + "loss": 1.2526, + "step": 1014 + }, + { + "epoch": 0.6884272997032641, + "grad_norm": 0.7182576007658268, + "learning_rate": 9.3473944360361e-06, + "loss": 1.2344, + "step": 1015 + }, + { + "epoch": 0.6891055532005087, + "grad_norm": 0.7424567342468971, + "learning_rate": 9.310207042551258e-06, + "loss": 1.2352, + "step": 1016 + }, + { + "epoch": 0.6897838066977533, + "grad_norm": 0.7086078062679702, + "learning_rate": 9.273071315014897e-06, + "loss": 1.2633, + "step": 1017 + }, + { + "epoch": 0.6904620601949979, + "grad_norm": 0.7444779504149994, + "learning_rate": 9.235987432911567e-06, + "loss": 1.2711, + "step": 1018 + }, + { + "epoch": 0.6911403136922425, + "grad_norm": 0.77022707965804, + "learning_rate": 9.198955575475241e-06, + "loss": 1.2317, + "step": 1019 + }, + { + "epoch": 0.6918185671894871, + "grad_norm": 0.7276754899754154, + "learning_rate": 9.161975921688427e-06, + "loss": 1.2791, + "step": 1020 + }, + { + "epoch": 0.6924968206867317, + "grad_norm": 0.7026188417040135, + "learning_rate": 9.12504865028133e-06, + "loss": 1.2493, + "step": 1021 + }, + { + "epoch": 0.6931750741839763, + "grad_norm": 0.6926211613260367, + "learning_rate": 9.088173939730971e-06, + "loss": 1.2467, + "step": 1022 + }, + { + "epoch": 0.6938533276812209, + "grad_norm": 0.7017132659341797, + "learning_rate": 9.051351968260362e-06, + "loss": 1.2537, + "step": 1023 + }, + { + "epoch": 0.6945315811784655, + "grad_norm": 0.7157201337759609, + "learning_rate": 9.014582913837588e-06, + "loss": 1.2521, + "step": 1024 + }, + { + "epoch": 0.6952098346757101, + "grad_norm": 0.7397450250982527, + "learning_rate": 8.977866954174983e-06, + "loss": 1.2667, + "step": 1025 + }, + { + "epoch": 0.6958880881729547, + "grad_norm": 0.7381070889903316, + "learning_rate": 8.941204266728259e-06, + "loss": 1.2799, + "step": 1026 + }, + { + "epoch": 0.6965663416701993, + "grad_norm": 0.471275683966, + "learning_rate": 8.904595028695673e-06, + "loss": 1.3761, + "step": 1027 + }, + { + "epoch": 0.6972445951674439, + "grad_norm": 0.7477049967173038, + "learning_rate": 8.868039417017128e-06, + "loss": 1.264, + "step": 1028 + }, + { + "epoch": 0.6979228486646885, + "grad_norm": 0.7439285336253352, + "learning_rate": 8.831537608373337e-06, + "loss": 1.2387, + "step": 1029 + }, + { + "epoch": 0.698601102161933, + "grad_norm": 0.7372188855937621, + "learning_rate": 8.795089779185e-06, + "loss": 1.2242, + "step": 1030 + }, + { + "epoch": 0.6992793556591776, + "grad_norm": 0.7108484812069134, + "learning_rate": 8.758696105611895e-06, + "loss": 1.2376, + "step": 1031 + }, + { + "epoch": 0.6999576091564222, + "grad_norm": 0.7112819048115543, + "learning_rate": 8.72235676355207e-06, + "loss": 1.2577, + "step": 1032 + }, + { + "epoch": 0.7006358626536668, + "grad_norm": 0.46441117142961635, + "learning_rate": 8.68607192864096e-06, + "loss": 1.3853, + "step": 1033 + }, + { + "epoch": 0.7013141161509114, + "grad_norm": 0.704005301317096, + "learning_rate": 8.64984177625059e-06, + "loss": 1.2523, + "step": 1034 + }, + { + "epoch": 0.701992369648156, + "grad_norm": 0.6760654966910298, + "learning_rate": 8.613666481488658e-06, + "loss": 1.2579, + "step": 1035 + }, + { + "epoch": 0.7026706231454006, + "grad_norm": 0.6859875086712081, + "learning_rate": 8.577546219197734e-06, + "loss": 1.203, + "step": 1036 + }, + { + "epoch": 0.7033488766426452, + "grad_norm": 0.7250776709559372, + "learning_rate": 8.541481163954426e-06, + "loss": 1.2512, + "step": 1037 + }, + { + "epoch": 0.7040271301398898, + "grad_norm": 0.6828939714058161, + "learning_rate": 8.505471490068487e-06, + "loss": 1.2535, + "step": 1038 + }, + { + "epoch": 0.7047053836371344, + "grad_norm": 0.7247935847952882, + "learning_rate": 8.46951737158201e-06, + "loss": 1.2648, + "step": 1039 + }, + { + "epoch": 0.705383637134379, + "grad_norm": 0.7062728261435427, + "learning_rate": 8.433618982268575e-06, + "loss": 1.2427, + "step": 1040 + }, + { + "epoch": 0.7060618906316236, + "grad_norm": 0.708341700820802, + "learning_rate": 8.397776495632424e-06, + "loss": 1.2364, + "step": 1041 + }, + { + "epoch": 0.7067401441288682, + "grad_norm": 0.7057742843892745, + "learning_rate": 8.361990084907609e-06, + "loss": 1.2597, + "step": 1042 + }, + { + "epoch": 0.7074183976261128, + "grad_norm": 0.7134634886760335, + "learning_rate": 8.326259923057129e-06, + "loss": 1.2567, + "step": 1043 + }, + { + "epoch": 0.7080966511233574, + "grad_norm": 0.7112068813359304, + "learning_rate": 8.290586182772153e-06, + "loss": 1.2588, + "step": 1044 + }, + { + "epoch": 0.708774904620602, + "grad_norm": 0.7166082576786501, + "learning_rate": 8.254969036471133e-06, + "loss": 1.2684, + "step": 1045 + }, + { + "epoch": 0.7094531581178466, + "grad_norm": 0.6774303776112416, + "learning_rate": 8.21940865629901e-06, + "loss": 1.2518, + "step": 1046 + }, + { + "epoch": 0.7101314116150912, + "grad_norm": 0.7369725642504088, + "learning_rate": 8.183905214126331e-06, + "loss": 1.2812, + "step": 1047 + }, + { + "epoch": 0.7108096651123358, + "grad_norm": 0.7125161557801593, + "learning_rate": 8.148458881548478e-06, + "loss": 1.2579, + "step": 1048 + }, + { + "epoch": 0.7114879186095804, + "grad_norm": 0.7123133162717256, + "learning_rate": 8.11306982988481e-06, + "loss": 1.2381, + "step": 1049 + }, + { + "epoch": 0.712166172106825, + "grad_norm": 0.4556155958071684, + "learning_rate": 8.077738230177823e-06, + "loss": 1.3769, + "step": 1050 + }, + { + "epoch": 0.7128444256040696, + "grad_norm": 0.7296551165413036, + "learning_rate": 8.042464253192339e-06, + "loss": 1.2557, + "step": 1051 + }, + { + "epoch": 0.7135226791013141, + "grad_norm": 0.7153326806368623, + "learning_rate": 8.007248069414673e-06, + "loss": 1.2453, + "step": 1052 + }, + { + "epoch": 0.7142009325985587, + "grad_norm": 0.7370663314394464, + "learning_rate": 7.972089849051834e-06, + "loss": 1.2612, + "step": 1053 + }, + { + "epoch": 0.7148791860958033, + "grad_norm": 0.7071174488237575, + "learning_rate": 7.936989762030662e-06, + "loss": 1.2661, + "step": 1054 + }, + { + "epoch": 0.7155574395930479, + "grad_norm": 0.6903000911952577, + "learning_rate": 7.901947977997031e-06, + "loss": 1.2708, + "step": 1055 + }, + { + "epoch": 0.7162356930902924, + "grad_norm": 0.6865849284089116, + "learning_rate": 7.866964666315018e-06, + "loss": 1.2527, + "step": 1056 + }, + { + "epoch": 0.716913946587537, + "grad_norm": 0.7297435263907307, + "learning_rate": 7.832039996066118e-06, + "loss": 1.2346, + "step": 1057 + }, + { + "epoch": 0.7175922000847816, + "grad_norm": 0.7267674081261121, + "learning_rate": 7.797174136048373e-06, + "loss": 1.2684, + "step": 1058 + }, + { + "epoch": 0.7182704535820262, + "grad_norm": 0.770600917229959, + "learning_rate": 7.762367254775584e-06, + "loss": 1.27, + "step": 1059 + }, + { + "epoch": 0.7189487070792708, + "grad_norm": 0.6989971112730319, + "learning_rate": 7.727619520476522e-06, + "loss": 1.2327, + "step": 1060 + }, + { + "epoch": 0.7196269605765154, + "grad_norm": 0.6845992496825357, + "learning_rate": 7.692931101094055e-06, + "loss": 1.2393, + "step": 1061 + }, + { + "epoch": 0.72030521407376, + "grad_norm": 0.6972088539223147, + "learning_rate": 7.658302164284393e-06, + "loss": 1.2649, + "step": 1062 + }, + { + "epoch": 0.7209834675710046, + "grad_norm": 0.6925122480528125, + "learning_rate": 7.623732877416232e-06, + "loss": 1.2515, + "step": 1063 + }, + { + "epoch": 0.7216617210682492, + "grad_norm": 0.7035579922671705, + "learning_rate": 7.589223407570006e-06, + "loss": 1.2495, + "step": 1064 + }, + { + "epoch": 0.7223399745654938, + "grad_norm": 0.4509807827721255, + "learning_rate": 7.554773921537002e-06, + "loss": 1.3852, + "step": 1065 + }, + { + "epoch": 0.7230182280627384, + "grad_norm": 0.748892704277171, + "learning_rate": 7.520384585818608e-06, + "loss": 1.267, + "step": 1066 + }, + { + "epoch": 0.723696481559983, + "grad_norm": 0.7108147391195934, + "learning_rate": 7.486055566625503e-06, + "loss": 1.2619, + "step": 1067 + }, + { + "epoch": 0.7243747350572276, + "grad_norm": 0.7086880409021801, + "learning_rate": 7.451787029876829e-06, + "loss": 1.2643, + "step": 1068 + }, + { + "epoch": 0.7250529885544722, + "grad_norm": 0.6977432184670711, + "learning_rate": 7.4175791411994114e-06, + "loss": 1.2146, + "step": 1069 + }, + { + "epoch": 0.7257312420517168, + "grad_norm": 0.6687009426082059, + "learning_rate": 7.383432065926943e-06, + "loss": 1.2536, + "step": 1070 + }, + { + "epoch": 0.7264094955489614, + "grad_norm": 0.6879116674692699, + "learning_rate": 7.349345969099211e-06, + "loss": 1.2608, + "step": 1071 + }, + { + "epoch": 0.727087749046206, + "grad_norm": 0.7102026991932625, + "learning_rate": 7.315321015461263e-06, + "loss": 1.2416, + "step": 1072 + }, + { + "epoch": 0.7277660025434506, + "grad_norm": 0.7263982096199556, + "learning_rate": 7.281357369462632e-06, + "loss": 1.2463, + "step": 1073 + }, + { + "epoch": 0.7284442560406952, + "grad_norm": 0.7199407337871239, + "learning_rate": 7.247455195256552e-06, + "loss": 1.2308, + "step": 1074 + }, + { + "epoch": 0.7291225095379398, + "grad_norm": 0.6968257341434265, + "learning_rate": 7.21361465669914e-06, + "loss": 1.2689, + "step": 1075 + }, + { + "epoch": 0.7298007630351844, + "grad_norm": 0.7107034728723381, + "learning_rate": 7.179835917348614e-06, + "loss": 1.2153, + "step": 1076 + }, + { + "epoch": 0.730479016532429, + "grad_norm": 0.7479033842548566, + "learning_rate": 7.146119140464501e-06, + "loss": 1.2619, + "step": 1077 + }, + { + "epoch": 0.7311572700296736, + "grad_norm": 0.6977861461512729, + "learning_rate": 7.112464489006865e-06, + "loss": 1.2417, + "step": 1078 + }, + { + "epoch": 0.7318355235269182, + "grad_norm": 0.6592519646440719, + "learning_rate": 7.078872125635507e-06, + "loss": 1.2182, + "step": 1079 + }, + { + "epoch": 0.7325137770241628, + "grad_norm": 0.4570588318163805, + "learning_rate": 7.045342212709146e-06, + "loss": 1.3656, + "step": 1080 + }, + { + "epoch": 0.7331920305214074, + "grad_norm": 0.7473703638144439, + "learning_rate": 7.0118749122847e-06, + "loss": 1.2813, + "step": 1081 + }, + { + "epoch": 0.733870284018652, + "grad_norm": 0.7159820456458268, + "learning_rate": 6.978470386116445e-06, + "loss": 1.2572, + "step": 1082 + }, + { + "epoch": 0.7345485375158965, + "grad_norm": 0.694451155338397, + "learning_rate": 6.945128795655283e-06, + "loss": 1.247, + "step": 1083 + }, + { + "epoch": 0.7352267910131411, + "grad_norm": 0.6686217234195255, + "learning_rate": 6.911850302047893e-06, + "loss": 1.2527, + "step": 1084 + }, + { + "epoch": 0.7359050445103857, + "grad_norm": 0.7154649690871586, + "learning_rate": 6.878635066136032e-06, + "loss": 1.2548, + "step": 1085 + }, + { + "epoch": 0.7365832980076303, + "grad_norm": 0.7025619787414288, + "learning_rate": 6.845483248455711e-06, + "loss": 1.2515, + "step": 1086 + }, + { + "epoch": 0.7372615515048749, + "grad_norm": 0.6904241313849837, + "learning_rate": 6.812395009236416e-06, + "loss": 1.2605, + "step": 1087 + }, + { + "epoch": 0.7379398050021195, + "grad_norm": 0.7067524261458777, + "learning_rate": 6.779370508400356e-06, + "loss": 1.2371, + "step": 1088 + }, + { + "epoch": 0.7386180584993641, + "grad_norm": 0.6998978069695649, + "learning_rate": 6.7464099055616635e-06, + "loss": 1.2396, + "step": 1089 + }, + { + "epoch": 0.7392963119966087, + "grad_norm": 0.7127235331064138, + "learning_rate": 6.713513360025667e-06, + "loss": 1.2826, + "step": 1090 + }, + { + "epoch": 0.7399745654938533, + "grad_norm": 0.709084315722468, + "learning_rate": 6.680681030788072e-06, + "loss": 1.2685, + "step": 1091 + }, + { + "epoch": 0.7406528189910979, + "grad_norm": 0.7439738397680048, + "learning_rate": 6.6479130765342185e-06, + "loss": 1.2548, + "step": 1092 + }, + { + "epoch": 0.7413310724883425, + "grad_norm": 0.6913571968581045, + "learning_rate": 6.615209655638299e-06, + "loss": 1.2527, + "step": 1093 + }, + { + "epoch": 0.7420093259855871, + "grad_norm": 0.6695385106248662, + "learning_rate": 6.582570926162628e-06, + "loss": 1.245, + "step": 1094 + }, + { + "epoch": 0.7426875794828317, + "grad_norm": 0.7061064658406417, + "learning_rate": 6.549997045856835e-06, + "loss": 1.2422, + "step": 1095 + }, + { + "epoch": 0.7433658329800763, + "grad_norm": 0.7014131205678311, + "learning_rate": 6.517488172157113e-06, + "loss": 1.2609, + "step": 1096 + }, + { + "epoch": 0.7440440864773209, + "grad_norm": 0.6878508712453276, + "learning_rate": 6.485044462185492e-06, + "loss": 1.2718, + "step": 1097 + }, + { + "epoch": 0.7447223399745655, + "grad_norm": 0.6677191346667652, + "learning_rate": 6.452666072749028e-06, + "loss": 1.2492, + "step": 1098 + }, + { + "epoch": 0.7454005934718101, + "grad_norm": 0.7090961144596659, + "learning_rate": 6.4203531603390765e-06, + "loss": 1.2373, + "step": 1099 + }, + { + "epoch": 0.7460788469690547, + "grad_norm": 0.45255204902887586, + "learning_rate": 6.3881058811305264e-06, + "loss": 1.3588, + "step": 1100 + }, + { + "epoch": 0.7467571004662993, + "grad_norm": 0.7135393080533664, + "learning_rate": 6.355924390981061e-06, + "loss": 1.2277, + "step": 1101 + }, + { + "epoch": 0.7474353539635439, + "grad_norm": 0.4458247664684505, + "learning_rate": 6.323808845430379e-06, + "loss": 1.377, + "step": 1102 + }, + { + "epoch": 0.7481136074607885, + "grad_norm": 0.7185164012176813, + "learning_rate": 6.29175939969945e-06, + "loss": 1.2373, + "step": 1103 + }, + { + "epoch": 0.7487918609580331, + "grad_norm": 0.6982441752973358, + "learning_rate": 6.259776208689796e-06, + "loss": 1.2612, + "step": 1104 + }, + { + "epoch": 0.7494701144552777, + "grad_norm": 0.6953929763573126, + "learning_rate": 6.227859426982688e-06, + "loss": 1.2782, + "step": 1105 + }, + { + "epoch": 0.7501483679525223, + "grad_norm": 0.697754814965526, + "learning_rate": 6.196009208838438e-06, + "loss": 1.2536, + "step": 1106 + }, + { + "epoch": 0.7508266214497669, + "grad_norm": 0.6748304634062791, + "learning_rate": 6.164225708195642e-06, + "loss": 1.2523, + "step": 1107 + }, + { + "epoch": 0.7515048749470115, + "grad_norm": 0.6782055822317078, + "learning_rate": 6.132509078670437e-06, + "loss": 1.239, + "step": 1108 + }, + { + "epoch": 0.752183128444256, + "grad_norm": 0.6808116233647222, + "learning_rate": 6.100859473555776e-06, + "loss": 1.2442, + "step": 1109 + }, + { + "epoch": 0.7528613819415007, + "grad_norm": 0.7036659853403976, + "learning_rate": 6.069277045820625e-06, + "loss": 1.2323, + "step": 1110 + }, + { + "epoch": 0.7535396354387452, + "grad_norm": 0.6828322290009959, + "learning_rate": 6.037761948109318e-06, + "loss": 1.2374, + "step": 1111 + }, + { + "epoch": 0.7542178889359898, + "grad_norm": 0.713945854741471, + "learning_rate": 6.006314332740735e-06, + "loss": 1.2603, + "step": 1112 + }, + { + "epoch": 0.7548961424332344, + "grad_norm": 0.7020795096881565, + "learning_rate": 5.9749343517076155e-06, + "loss": 1.2399, + "step": 1113 + }, + { + "epoch": 0.755574395930479, + "grad_norm": 0.6998554795750904, + "learning_rate": 5.943622156675799e-06, + "loss": 1.2494, + "step": 1114 + }, + { + "epoch": 0.7562526494277236, + "grad_norm": 0.7122693984226326, + "learning_rate": 5.91237789898351e-06, + "loss": 1.2505, + "step": 1115 + }, + { + "epoch": 0.7569309029249682, + "grad_norm": 0.7314913158727478, + "learning_rate": 5.881201729640629e-06, + "loss": 1.2491, + "step": 1116 + }, + { + "epoch": 0.7576091564222128, + "grad_norm": 0.6878164452463644, + "learning_rate": 5.850093799327914e-06, + "loss": 1.2419, + "step": 1117 + }, + { + "epoch": 0.7582874099194574, + "grad_norm": 0.7070919917450872, + "learning_rate": 5.81905425839635e-06, + "loss": 1.2535, + "step": 1118 + }, + { + "epoch": 0.758965663416702, + "grad_norm": 0.4473660675198994, + "learning_rate": 5.788083256866357e-06, + "loss": 1.3981, + "step": 1119 + }, + { + "epoch": 0.7596439169139466, + "grad_norm": 0.6899640026759862, + "learning_rate": 5.757180944427115e-06, + "loss": 1.223, + "step": 1120 + }, + { + "epoch": 0.7603221704111912, + "grad_norm": 0.7150514267374936, + "learning_rate": 5.7263474704357715e-06, + "loss": 1.2355, + "step": 1121 + }, + { + "epoch": 0.7610004239084358, + "grad_norm": 0.7389898022126541, + "learning_rate": 5.6955829839168165e-06, + "loss": 1.2777, + "step": 1122 + }, + { + "epoch": 0.7616786774056804, + "grad_norm": 0.6796754729134707, + "learning_rate": 5.664887633561269e-06, + "loss": 1.2135, + "step": 1123 + }, + { + "epoch": 0.762356930902925, + "grad_norm": 0.6900893470393483, + "learning_rate": 5.63426156772603e-06, + "loss": 1.2354, + "step": 1124 + }, + { + "epoch": 0.7630351844001696, + "grad_norm": 0.7012314507491941, + "learning_rate": 5.603704934433107e-06, + "loss": 1.2196, + "step": 1125 + }, + { + "epoch": 0.7637134378974142, + "grad_norm": 0.7056609640562619, + "learning_rate": 5.573217881368936e-06, + "loss": 1.2807, + "step": 1126 + }, + { + "epoch": 0.7643916913946588, + "grad_norm": 0.6769326414922961, + "learning_rate": 5.54280055588367e-06, + "loss": 1.2424, + "step": 1127 + }, + { + "epoch": 0.7650699448919034, + "grad_norm": 0.6758653545849139, + "learning_rate": 5.5124531049904385e-06, + "loss": 1.2471, + "step": 1128 + }, + { + "epoch": 0.765748198389148, + "grad_norm": 0.6630169402173475, + "learning_rate": 5.4821756753646584e-06, + "loss": 1.2248, + "step": 1129 + }, + { + "epoch": 0.7664264518863926, + "grad_norm": 0.6716307852815654, + "learning_rate": 5.451968413343309e-06, + "loss": 1.2381, + "step": 1130 + }, + { + "epoch": 0.7671047053836372, + "grad_norm": 0.6890962976180042, + "learning_rate": 5.421831464924263e-06, + "loss": 1.2713, + "step": 1131 + }, + { + "epoch": 0.7677829588808818, + "grad_norm": 0.6962004345825891, + "learning_rate": 5.3917649757655275e-06, + "loss": 1.2561, + "step": 1132 + }, + { + "epoch": 0.7684612123781264, + "grad_norm": 0.7023013942987391, + "learning_rate": 5.361769091184566e-06, + "loss": 1.22, + "step": 1133 + }, + { + "epoch": 0.769139465875371, + "grad_norm": 0.6767598844140867, + "learning_rate": 5.3318439561576186e-06, + "loss": 1.2404, + "step": 1134 + }, + { + "epoch": 0.7698177193726156, + "grad_norm": 0.6771636851434627, + "learning_rate": 5.301989715318954e-06, + "loss": 1.2369, + "step": 1135 + }, + { + "epoch": 0.7704959728698602, + "grad_norm": 0.6971245037688651, + "learning_rate": 5.272206512960205e-06, + "loss": 1.2324, + "step": 1136 + }, + { + "epoch": 0.7711742263671048, + "grad_norm": 0.6765112279690619, + "learning_rate": 5.242494493029655e-06, + "loss": 1.2217, + "step": 1137 + }, + { + "epoch": 0.7718524798643494, + "grad_norm": 0.6737178423734921, + "learning_rate": 5.212853799131566e-06, + "loss": 1.2588, + "step": 1138 + }, + { + "epoch": 0.772530733361594, + "grad_norm": 0.6798938652668959, + "learning_rate": 5.183284574525444e-06, + "loss": 1.2555, + "step": 1139 + }, + { + "epoch": 0.7732089868588385, + "grad_norm": 0.4647417772652952, + "learning_rate": 5.1537869621253774e-06, + "loss": 1.4124, + "step": 1140 + }, + { + "epoch": 0.7738872403560831, + "grad_norm": 0.7075653241795719, + "learning_rate": 5.124361104499349e-06, + "loss": 1.2511, + "step": 1141 + }, + { + "epoch": 0.7745654938533277, + "grad_norm": 0.7135593274232547, + "learning_rate": 5.095007143868522e-06, + "loss": 1.2313, + "step": 1142 + }, + { + "epoch": 0.7752437473505722, + "grad_norm": 0.4479664062052932, + "learning_rate": 5.065725222106574e-06, + "loss": 1.3455, + "step": 1143 + }, + { + "epoch": 0.7759220008478168, + "grad_norm": 0.6998180607890737, + "learning_rate": 5.036515480738995e-06, + "loss": 1.2638, + "step": 1144 + }, + { + "epoch": 0.7766002543450614, + "grad_norm": 0.6780873565049669, + "learning_rate": 5.007378060942425e-06, + "loss": 1.2585, + "step": 1145 + }, + { + "epoch": 0.777278507842306, + "grad_norm": 0.6951550032542391, + "learning_rate": 4.978313103543964e-06, + "loss": 1.2609, + "step": 1146 + }, + { + "epoch": 0.7779567613395506, + "grad_norm": 0.43274514832271704, + "learning_rate": 4.949320749020454e-06, + "loss": 1.353, + "step": 1147 + }, + { + "epoch": 0.7786350148367952, + "grad_norm": 0.6959484335867067, + "learning_rate": 4.920401137497872e-06, + "loss": 1.226, + "step": 1148 + }, + { + "epoch": 0.7793132683340398, + "grad_norm": 0.6818736987116557, + "learning_rate": 4.891554408750585e-06, + "loss": 1.261, + "step": 1149 + }, + { + "epoch": 0.7799915218312844, + "grad_norm": 0.6928424892107953, + "learning_rate": 4.862780702200729e-06, + "loss": 1.2267, + "step": 1150 + }, + { + "epoch": 0.780669775328529, + "grad_norm": 0.6754834335451715, + "learning_rate": 4.8340801569174735e-06, + "loss": 1.2191, + "step": 1151 + }, + { + "epoch": 0.7813480288257736, + "grad_norm": 0.7004613308457371, + "learning_rate": 4.805452911616417e-06, + "loss": 1.2375, + "step": 1152 + }, + { + "epoch": 0.7820262823230182, + "grad_norm": 0.6806930381395735, + "learning_rate": 4.77689910465887e-06, + "loss": 1.229, + "step": 1153 + }, + { + "epoch": 0.7827045358202628, + "grad_norm": 0.6774321121177972, + "learning_rate": 4.748418874051195e-06, + "loss": 1.2545, + "step": 1154 + }, + { + "epoch": 0.7833827893175074, + "grad_norm": 0.6948812717558227, + "learning_rate": 4.720012357444162e-06, + "loss": 1.2469, + "step": 1155 + }, + { + "epoch": 0.784061042814752, + "grad_norm": 0.6889011025050227, + "learning_rate": 4.691679692132247e-06, + "loss": 1.226, + "step": 1156 + }, + { + "epoch": 0.7847392963119966, + "grad_norm": 0.6679127571883847, + "learning_rate": 4.663421015053016e-06, + "loss": 1.2602, + "step": 1157 + }, + { + "epoch": 0.7854175498092412, + "grad_norm": 0.6579398726491081, + "learning_rate": 4.63523646278639e-06, + "loss": 1.2256, + "step": 1158 + }, + { + "epoch": 0.7860958033064858, + "grad_norm": 0.6688306932896388, + "learning_rate": 4.607126171554075e-06, + "loss": 1.2536, + "step": 1159 + }, + { + "epoch": 0.7867740568037304, + "grad_norm": 0.7004512005802537, + "learning_rate": 4.579090277218825e-06, + "loss": 1.2502, + "step": 1160 + }, + { + "epoch": 0.787452310300975, + "grad_norm": 0.6694251736984261, + "learning_rate": 4.5511289152838444e-06, + "loss": 1.2204, + "step": 1161 + }, + { + "epoch": 0.7881305637982196, + "grad_norm": 0.6843283642295125, + "learning_rate": 4.523242220892092e-06, + "loss": 1.2483, + "step": 1162 + }, + { + "epoch": 0.7888088172954641, + "grad_norm": 0.7067471312697848, + "learning_rate": 4.495430328825639e-06, + "loss": 1.253, + "step": 1163 + }, + { + "epoch": 0.7894870707927087, + "grad_norm": 0.6817423205506041, + "learning_rate": 4.46769337350504e-06, + "loss": 1.235, + "step": 1164 + }, + { + "epoch": 0.7901653242899533, + "grad_norm": 0.7247217035269676, + "learning_rate": 4.440031488988647e-06, + "loss": 1.2366, + "step": 1165 + }, + { + "epoch": 0.7908435777871979, + "grad_norm": 0.6579826200974558, + "learning_rate": 4.412444808971994e-06, + "loss": 1.2285, + "step": 1166 + }, + { + "epoch": 0.7915218312844425, + "grad_norm": 0.667181720898605, + "learning_rate": 4.384933466787116e-06, + "loss": 1.2345, + "step": 1167 + }, + { + "epoch": 0.7922000847816871, + "grad_norm": 0.6899309881903906, + "learning_rate": 4.357497595401954e-06, + "loss": 1.2662, + "step": 1168 + }, + { + "epoch": 0.7928783382789317, + "grad_norm": 0.7080472474113639, + "learning_rate": 4.330137327419656e-06, + "loss": 1.2547, + "step": 1169 + }, + { + "epoch": 0.7935565917761763, + "grad_norm": 0.6877701187705579, + "learning_rate": 4.302852795077976e-06, + "loss": 1.2547, + "step": 1170 + }, + { + "epoch": 0.7942348452734209, + "grad_norm": 0.7248362233029715, + "learning_rate": 4.275644130248629e-06, + "loss": 1.2631, + "step": 1171 + }, + { + "epoch": 0.7949130987706655, + "grad_norm": 0.6786411729582461, + "learning_rate": 4.248511464436629e-06, + "loss": 1.2471, + "step": 1172 + }, + { + "epoch": 0.7955913522679101, + "grad_norm": 0.6759310563754385, + "learning_rate": 4.221454928779687e-06, + "loss": 1.225, + "step": 1173 + }, + { + "epoch": 0.7962696057651547, + "grad_norm": 0.6934587765306394, + "learning_rate": 4.1944746540475465e-06, + "loss": 1.2548, + "step": 1174 + }, + { + "epoch": 0.7969478592623993, + "grad_norm": 0.6521853026195695, + "learning_rate": 4.167570770641387e-06, + "loss": 1.2171, + "step": 1175 + }, + { + "epoch": 0.7976261127596439, + "grad_norm": 0.695071649092735, + "learning_rate": 4.140743408593158e-06, + "loss": 1.2618, + "step": 1176 + }, + { + "epoch": 0.7983043662568885, + "grad_norm": 0.6946974156403914, + "learning_rate": 4.113992697564959e-06, + "loss": 1.2202, + "step": 1177 + }, + { + "epoch": 0.7989826197541331, + "grad_norm": 0.6875018646078283, + "learning_rate": 4.0873187668484444e-06, + "loss": 1.2475, + "step": 1178 + }, + { + "epoch": 0.7996608732513777, + "grad_norm": 0.6837968878391059, + "learning_rate": 4.060721745364153e-06, + "loss": 1.2521, + "step": 1179 + }, + { + "epoch": 0.8003391267486223, + "grad_norm": 0.6999303464068111, + "learning_rate": 4.0342017616609095e-06, + "loss": 1.2452, + "step": 1180 + }, + { + "epoch": 0.8010173802458669, + "grad_norm": 0.6700378318277015, + "learning_rate": 4.007758943915197e-06, + "loss": 1.2421, + "step": 1181 + }, + { + "epoch": 0.8016956337431115, + "grad_norm": 0.6818739644010207, + "learning_rate": 3.981393419930555e-06, + "loss": 1.2235, + "step": 1182 + }, + { + "epoch": 0.8023738872403561, + "grad_norm": 0.6793140252226033, + "learning_rate": 3.955105317136929e-06, + "loss": 1.2663, + "step": 1183 + }, + { + "epoch": 0.8030521407376007, + "grad_norm": 0.6875039701252489, + "learning_rate": 3.928894762590076e-06, + "loss": 1.2192, + "step": 1184 + }, + { + "epoch": 0.8037303942348453, + "grad_norm": 0.711408992755773, + "learning_rate": 3.902761882970958e-06, + "loss": 1.2628, + "step": 1185 + }, + { + "epoch": 0.8044086477320899, + "grad_norm": 0.7195784285174972, + "learning_rate": 3.8767068045850975e-06, + "loss": 1.2556, + "step": 1186 + }, + { + "epoch": 0.8050869012293345, + "grad_norm": 0.7028743882976222, + "learning_rate": 3.850729653362018e-06, + "loss": 1.238, + "step": 1187 + }, + { + "epoch": 0.8057651547265791, + "grad_norm": 0.7195821566351972, + "learning_rate": 3.824830554854566e-06, + "loss": 1.2312, + "step": 1188 + }, + { + "epoch": 0.8064434082238237, + "grad_norm": 0.7077679764250616, + "learning_rate": 3.7990096342383775e-06, + "loss": 1.2487, + "step": 1189 + }, + { + "epoch": 0.8071216617210683, + "grad_norm": 0.6877957096422861, + "learning_rate": 3.773267016311215e-06, + "loss": 1.2407, + "step": 1190 + }, + { + "epoch": 0.8077999152183128, + "grad_norm": 0.7134703340862909, + "learning_rate": 3.7476028254924115e-06, + "loss": 1.2524, + "step": 1191 + }, + { + "epoch": 0.8084781687155574, + "grad_norm": 0.65709662877186, + "learning_rate": 3.7220171858222264e-06, + "loss": 1.214, + "step": 1192 + }, + { + "epoch": 0.809156422212802, + "grad_norm": 0.6768556017907369, + "learning_rate": 3.6965102209612667e-06, + "loss": 1.241, + "step": 1193 + }, + { + "epoch": 0.8098346757100466, + "grad_norm": 0.6839758361669105, + "learning_rate": 3.6710820541899097e-06, + "loss": 1.2783, + "step": 1194 + }, + { + "epoch": 0.8105129292072912, + "grad_norm": 0.7094232562572493, + "learning_rate": 3.645732808407647e-06, + "loss": 1.2558, + "step": 1195 + }, + { + "epoch": 0.8111911827045358, + "grad_norm": 0.6969915627207982, + "learning_rate": 3.6204626061325666e-06, + "loss": 1.2748, + "step": 1196 + }, + { + "epoch": 0.8118694362017804, + "grad_norm": 0.6971469387166436, + "learning_rate": 3.595271569500698e-06, + "loss": 1.2325, + "step": 1197 + }, + { + "epoch": 0.812547689699025, + "grad_norm": 0.6627792415071245, + "learning_rate": 3.570159820265464e-06, + "loss": 1.2179, + "step": 1198 + }, + { + "epoch": 0.8132259431962696, + "grad_norm": 0.6920530698070828, + "learning_rate": 3.545127479797068e-06, + "loss": 1.2169, + "step": 1199 + }, + { + "epoch": 0.8139041966935142, + "grad_norm": 0.7002031808124952, + "learning_rate": 3.520174669081904e-06, + "loss": 1.2683, + "step": 1200 + }, + { + "epoch": 0.8145824501907588, + "grad_norm": 0.6637127831585506, + "learning_rate": 3.4953015087220043e-06, + "loss": 1.2278, + "step": 1201 + }, + { + "epoch": 0.8152607036880034, + "grad_norm": 0.6803390774543788, + "learning_rate": 3.4705081189344214e-06, + "loss": 1.236, + "step": 1202 + }, + { + "epoch": 0.815938957185248, + "grad_norm": 0.7062394197270683, + "learning_rate": 3.4457946195506576e-06, + "loss": 1.2286, + "step": 1203 + }, + { + "epoch": 0.8166172106824926, + "grad_norm": 0.7383625500117024, + "learning_rate": 3.421161130016093e-06, + "loss": 1.2587, + "step": 1204 + }, + { + "epoch": 0.8172954641797372, + "grad_norm": 0.6688377168685776, + "learning_rate": 3.3966077693894106e-06, + "loss": 1.237, + "step": 1205 + }, + { + "epoch": 0.8179737176769818, + "grad_norm": 0.7132285176191728, + "learning_rate": 3.3721346563420033e-06, + "loss": 1.2448, + "step": 1206 + }, + { + "epoch": 0.8186519711742264, + "grad_norm": 0.6680523424443547, + "learning_rate": 3.3477419091574092e-06, + "loss": 1.2498, + "step": 1207 + }, + { + "epoch": 0.819330224671471, + "grad_norm": 0.6905435032148992, + "learning_rate": 3.3234296457307625e-06, + "loss": 1.2649, + "step": 1208 + }, + { + "epoch": 0.8200084781687156, + "grad_norm": 0.6926944873494554, + "learning_rate": 3.2991979835681788e-06, + "loss": 1.2572, + "step": 1209 + }, + { + "epoch": 0.8206867316659602, + "grad_norm": 0.6847897982257526, + "learning_rate": 3.2750470397862232e-06, + "loss": 1.233, + "step": 1210 + }, + { + "epoch": 0.8213649851632048, + "grad_norm": 0.6737318091933269, + "learning_rate": 3.2509769311113227e-06, + "loss": 1.2455, + "step": 1211 + }, + { + "epoch": 0.8220432386604494, + "grad_norm": 0.6670064588219545, + "learning_rate": 3.226987773879233e-06, + "loss": 1.2395, + "step": 1212 + }, + { + "epoch": 0.822721492157694, + "grad_norm": 0.6800698653690069, + "learning_rate": 3.2030796840344335e-06, + "loss": 1.2046, + "step": 1213 + }, + { + "epoch": 0.8233997456549386, + "grad_norm": 0.6954571148384878, + "learning_rate": 3.1792527771295934e-06, + "loss": 1.2555, + "step": 1214 + }, + { + "epoch": 0.8240779991521832, + "grad_norm": 0.6800480785017151, + "learning_rate": 3.1555071683250183e-06, + "loss": 1.2576, + "step": 1215 + }, + { + "epoch": 0.8247562526494278, + "grad_norm": 0.6731537475624323, + "learning_rate": 3.1318429723880705e-06, + "loss": 1.2664, + "step": 1216 + }, + { + "epoch": 0.8254345061466724, + "grad_norm": 0.6744093289438378, + "learning_rate": 3.1082603036926363e-06, + "loss": 1.2681, + "step": 1217 + }, + { + "epoch": 0.826112759643917, + "grad_norm": 0.66118151158709, + "learning_rate": 3.0847592762185563e-06, + "loss": 1.2482, + "step": 1218 + }, + { + "epoch": 0.8267910131411615, + "grad_norm": 0.7773444601932658, + "learning_rate": 3.061340003551092e-06, + "loss": 1.2329, + "step": 1219 + }, + { + "epoch": 0.8274692666384061, + "grad_norm": 0.6699375883984999, + "learning_rate": 3.038002598880363e-06, + "loss": 1.2248, + "step": 1220 + }, + { + "epoch": 0.8281475201356507, + "grad_norm": 0.6754455023957769, + "learning_rate": 3.014747175000794e-06, + "loss": 1.2342, + "step": 1221 + }, + { + "epoch": 0.8288257736328953, + "grad_norm": 0.6738655402613765, + "learning_rate": 2.9915738443106e-06, + "loss": 1.2322, + "step": 1222 + }, + { + "epoch": 0.8295040271301399, + "grad_norm": 0.6775994450297527, + "learning_rate": 2.9684827188112054e-06, + "loss": 1.2306, + "step": 1223 + }, + { + "epoch": 0.8301822806273845, + "grad_norm": 0.6793157607402639, + "learning_rate": 2.9454739101067376e-06, + "loss": 1.2551, + "step": 1224 + }, + { + "epoch": 0.8308605341246291, + "grad_norm": 0.7046333525864542, + "learning_rate": 2.9225475294034434e-06, + "loss": 1.2199, + "step": 1225 + }, + { + "epoch": 0.8315387876218737, + "grad_norm": 0.6956025857176882, + "learning_rate": 2.8997036875092056e-06, + "loss": 1.2423, + "step": 1226 + }, + { + "epoch": 0.8322170411191183, + "grad_norm": 0.6932938866887467, + "learning_rate": 2.8769424948329617e-06, + "loss": 1.2414, + "step": 1227 + }, + { + "epoch": 0.8328952946163629, + "grad_norm": 0.6903105564714394, + "learning_rate": 2.8542640613842043e-06, + "loss": 1.2612, + "step": 1228 + }, + { + "epoch": 0.8335735481136075, + "grad_norm": 0.6640271856515038, + "learning_rate": 2.8316684967724216e-06, + "loss": 1.2215, + "step": 1229 + }, + { + "epoch": 0.8342518016108521, + "grad_norm": 0.6924264638696419, + "learning_rate": 2.8091559102065757e-06, + "loss": 1.2574, + "step": 1230 + }, + { + "epoch": 0.8349300551080967, + "grad_norm": 0.6688585447783828, + "learning_rate": 2.7867264104946e-06, + "loss": 1.2621, + "step": 1231 + }, + { + "epoch": 0.8356083086053413, + "grad_norm": 0.6990612602692031, + "learning_rate": 2.764380106042832e-06, + "loss": 1.2473, + "step": 1232 + }, + { + "epoch": 0.8362865621025859, + "grad_norm": 0.6605622117162815, + "learning_rate": 2.7421171048555174e-06, + "loss": 1.2119, + "step": 1233 + }, + { + "epoch": 0.8369648155998305, + "grad_norm": 0.6798097456514123, + "learning_rate": 2.7199375145342723e-06, + "loss": 1.248, + "step": 1234 + }, + { + "epoch": 0.837643069097075, + "grad_norm": 0.6845646072497431, + "learning_rate": 2.6978414422775913e-06, + "loss": 1.249, + "step": 1235 + }, + { + "epoch": 0.8383213225943196, + "grad_norm": 0.4316045622329302, + "learning_rate": 2.6758289948802873e-06, + "loss": 1.3416, + "step": 1236 + }, + { + "epoch": 0.8389995760915642, + "grad_norm": 0.6920372772638658, + "learning_rate": 2.653900278733006e-06, + "loss": 1.2551, + "step": 1237 + }, + { + "epoch": 0.8396778295888088, + "grad_norm": 0.7074558805325211, + "learning_rate": 2.632055399821707e-06, + "loss": 1.2667, + "step": 1238 + }, + { + "epoch": 0.8403560830860534, + "grad_norm": 0.6764498414633098, + "learning_rate": 2.610294463727141e-06, + "loss": 1.2472, + "step": 1239 + }, + { + "epoch": 0.841034336583298, + "grad_norm": 0.6665093041129415, + "learning_rate": 2.588617575624346e-06, + "loss": 1.2397, + "step": 1240 + }, + { + "epoch": 0.8417125900805426, + "grad_norm": 0.6719411187283858, + "learning_rate": 2.5670248402821416e-06, + "loss": 1.2681, + "step": 1241 + }, + { + "epoch": 0.8423908435777872, + "grad_norm": 0.6633310055649266, + "learning_rate": 2.545516362062623e-06, + "loss": 1.2174, + "step": 1242 + }, + { + "epoch": 0.8430690970750317, + "grad_norm": 0.7009012164351331, + "learning_rate": 2.5240922449206485e-06, + "loss": 1.2499, + "step": 1243 + }, + { + "epoch": 0.8437473505722763, + "grad_norm": 0.65658740687541, + "learning_rate": 2.5027525924033393e-06, + "loss": 1.179, + "step": 1244 + }, + { + "epoch": 0.8444256040695209, + "grad_norm": 0.4369524964249072, + "learning_rate": 2.4814975076495928e-06, + "loss": 1.4163, + "step": 1245 + }, + { + "epoch": 0.8451038575667655, + "grad_norm": 0.6493205856586579, + "learning_rate": 2.460327093389563e-06, + "loss": 1.2481, + "step": 1246 + }, + { + "epoch": 0.8457821110640101, + "grad_norm": 0.6521636141722069, + "learning_rate": 2.4392414519441766e-06, + "loss": 1.2141, + "step": 1247 + }, + { + "epoch": 0.8464603645612547, + "grad_norm": 0.6986484333798653, + "learning_rate": 2.4182406852246353e-06, + "loss": 1.2385, + "step": 1248 + }, + { + "epoch": 0.8471386180584993, + "grad_norm": 0.6773567173483765, + "learning_rate": 2.3973248947319337e-06, + "loss": 1.2637, + "step": 1249 + }, + { + "epoch": 0.8478168715557439, + "grad_norm": 0.7222705415612313, + "learning_rate": 2.3764941815563456e-06, + "loss": 1.2504, + "step": 1250 + }, + { + "epoch": 0.8484951250529885, + "grad_norm": 0.663347159603606, + "learning_rate": 2.355748646376952e-06, + "loss": 1.2924, + "step": 1251 + }, + { + "epoch": 0.8491733785502331, + "grad_norm": 0.672155101613964, + "learning_rate": 2.3350883894611574e-06, + "loss": 1.2404, + "step": 1252 + }, + { + "epoch": 0.8498516320474777, + "grad_norm": 0.6724349931683334, + "learning_rate": 2.314513510664196e-06, + "loss": 1.2531, + "step": 1253 + }, + { + "epoch": 0.8505298855447223, + "grad_norm": 0.654967406341385, + "learning_rate": 2.2940241094286475e-06, + "loss": 1.2368, + "step": 1254 + }, + { + "epoch": 0.8512081390419669, + "grad_norm": 0.7050572649979452, + "learning_rate": 2.2736202847839616e-06, + "loss": 1.2503, + "step": 1255 + }, + { + "epoch": 0.8518863925392115, + "grad_norm": 0.6593550166493525, + "learning_rate": 2.2533021353459917e-06, + "loss": 1.2496, + "step": 1256 + }, + { + "epoch": 0.8525646460364561, + "grad_norm": 0.6797192175460736, + "learning_rate": 2.233069759316491e-06, + "loss": 1.2683, + "step": 1257 + }, + { + "epoch": 0.8532428995337007, + "grad_norm": 0.6594627663012708, + "learning_rate": 2.212923254482653e-06, + "loss": 1.2374, + "step": 1258 + }, + { + "epoch": 0.8539211530309453, + "grad_norm": 0.6790072404571235, + "learning_rate": 2.1928627182166527e-06, + "loss": 1.2256, + "step": 1259 + }, + { + "epoch": 0.8545994065281899, + "grad_norm": 0.6531785829339144, + "learning_rate": 2.17288824747514e-06, + "loss": 1.246, + "step": 1260 + }, + { + "epoch": 0.8552776600254345, + "grad_norm": 0.6697866159646256, + "learning_rate": 2.1529999387988164e-06, + "loss": 1.2297, + "step": 1261 + }, + { + "epoch": 0.8559559135226791, + "grad_norm": 0.6883522445090433, + "learning_rate": 2.1331978883119175e-06, + "loss": 1.2578, + "step": 1262 + }, + { + "epoch": 0.8566341670199237, + "grad_norm": 0.6489524920906364, + "learning_rate": 2.113482191721801e-06, + "loss": 1.2351, + "step": 1263 + }, + { + "epoch": 0.8573124205171683, + "grad_norm": 0.6689527801097199, + "learning_rate": 2.0938529443184395e-06, + "loss": 1.2589, + "step": 1264 + }, + { + "epoch": 0.8579906740144129, + "grad_norm": 0.6636152186845068, + "learning_rate": 2.0743102409739956e-06, + "loss": 1.2717, + "step": 1265 + }, + { + "epoch": 0.8586689275116575, + "grad_norm": 0.643077165344515, + "learning_rate": 2.0548541761423335e-06, + "loss": 1.2472, + "step": 1266 + }, + { + "epoch": 0.8593471810089021, + "grad_norm": 0.6733627311390646, + "learning_rate": 2.0354848438585793e-06, + "loss": 1.2518, + "step": 1267 + }, + { + "epoch": 0.8600254345061467, + "grad_norm": 0.6500042790933303, + "learning_rate": 2.0162023377386684e-06, + "loss": 1.2392, + "step": 1268 + }, + { + "epoch": 0.8607036880033913, + "grad_norm": 0.6711783818853033, + "learning_rate": 1.9970067509788828e-06, + "loss": 1.2434, + "step": 1269 + }, + { + "epoch": 0.8613819415006359, + "grad_norm": 0.6878334385995437, + "learning_rate": 1.977898176355404e-06, + "loss": 1.2482, + "step": 1270 + }, + { + "epoch": 0.8620601949978804, + "grad_norm": 0.6732797757524392, + "learning_rate": 1.9588767062238666e-06, + "loss": 1.2258, + "step": 1271 + }, + { + "epoch": 0.862738448495125, + "grad_norm": 0.690590706308697, + "learning_rate": 1.939942432518922e-06, + "loss": 1.2454, + "step": 1272 + }, + { + "epoch": 0.8634167019923696, + "grad_norm": 0.6742559709870798, + "learning_rate": 1.921095446753767e-06, + "loss": 1.2544, + "step": 1273 + }, + { + "epoch": 0.8640949554896142, + "grad_norm": 0.6812447470627214, + "learning_rate": 1.9023358400197267e-06, + "loss": 1.2221, + "step": 1274 + }, + { + "epoch": 0.8647732089868588, + "grad_norm": 0.6651274724851445, + "learning_rate": 1.8836637029858073e-06, + "loss": 1.2365, + "step": 1275 + }, + { + "epoch": 0.8654514624841034, + "grad_norm": 0.6968195379849065, + "learning_rate": 1.8650791258982525e-06, + "loss": 1.2246, + "step": 1276 + }, + { + "epoch": 0.866129715981348, + "grad_norm": 0.6887124005022628, + "learning_rate": 1.8465821985801113e-06, + "loss": 1.2464, + "step": 1277 + }, + { + "epoch": 0.8668079694785926, + "grad_norm": 0.677103488304023, + "learning_rate": 1.8281730104308027e-06, + "loss": 1.2849, + "step": 1278 + }, + { + "epoch": 0.8674862229758372, + "grad_norm": 0.6806202048462158, + "learning_rate": 1.809851650425689e-06, + "loss": 1.2361, + "step": 1279 + }, + { + "epoch": 0.8681644764730818, + "grad_norm": 0.6594204417492138, + "learning_rate": 1.7916182071156352e-06, + "loss": 1.2087, + "step": 1280 + }, + { + "epoch": 0.8688427299703264, + "grad_norm": 0.6775538501405443, + "learning_rate": 1.7734727686265896e-06, + "loss": 1.2521, + "step": 1281 + }, + { + "epoch": 0.869520983467571, + "grad_norm": 0.6732775057867245, + "learning_rate": 1.7554154226591591e-06, + "loss": 1.2443, + "step": 1282 + }, + { + "epoch": 0.8701992369648156, + "grad_norm": 0.670136546211293, + "learning_rate": 1.7374462564881734e-06, + "loss": 1.2578, + "step": 1283 + }, + { + "epoch": 0.8708774904620602, + "grad_norm": 0.6825310488125489, + "learning_rate": 1.7195653569622806e-06, + "loss": 1.2209, + "step": 1284 + }, + { + "epoch": 0.8715557439593048, + "grad_norm": 0.6779073695972031, + "learning_rate": 1.7017728105035037e-06, + "loss": 1.2423, + "step": 1285 + }, + { + "epoch": 0.8722339974565494, + "grad_norm": 0.7023270067247336, + "learning_rate": 1.684068703106858e-06, + "loss": 1.2305, + "step": 1286 + }, + { + "epoch": 0.872912250953794, + "grad_norm": 0.6567892230707757, + "learning_rate": 1.666453120339897e-06, + "loss": 1.2317, + "step": 1287 + }, + { + "epoch": 0.8735905044510386, + "grad_norm": 0.6761140088046478, + "learning_rate": 1.6489261473423246e-06, + "loss": 1.2294, + "step": 1288 + }, + { + "epoch": 0.8742687579482832, + "grad_norm": 0.6897471789058747, + "learning_rate": 1.6314878688255742e-06, + "loss": 1.2455, + "step": 1289 + }, + { + "epoch": 0.8749470114455278, + "grad_norm": 0.6552396693987875, + "learning_rate": 1.6141383690724e-06, + "loss": 1.2306, + "step": 1290 + }, + { + "epoch": 0.8756252649427724, + "grad_norm": 0.6606878888166857, + "learning_rate": 1.596877731936477e-06, + "loss": 1.2483, + "step": 1291 + }, + { + "epoch": 0.876303518440017, + "grad_norm": 0.6849908384639455, + "learning_rate": 1.579706040841973e-06, + "loss": 1.2554, + "step": 1292 + }, + { + "epoch": 0.8769817719372616, + "grad_norm": 0.6602215845249478, + "learning_rate": 1.5626233787831791e-06, + "loss": 1.2384, + "step": 1293 + }, + { + "epoch": 0.8776600254345062, + "grad_norm": 0.6611842345331809, + "learning_rate": 1.54562982832408e-06, + "loss": 1.237, + "step": 1294 + }, + { + "epoch": 0.8783382789317508, + "grad_norm": 0.6656283658878175, + "learning_rate": 1.5287254715979672e-06, + "loss": 1.2548, + "step": 1295 + }, + { + "epoch": 0.8790165324289954, + "grad_norm": 0.6829406117992067, + "learning_rate": 1.5119103903070476e-06, + "loss": 1.2509, + "step": 1296 + }, + { + "epoch": 0.87969478592624, + "grad_norm": 0.43142912689574664, + "learning_rate": 1.4951846657220336e-06, + "loss": 1.3619, + "step": 1297 + }, + { + "epoch": 0.8803730394234845, + "grad_norm": 0.6636443493647105, + "learning_rate": 1.4785483786817678e-06, + "loss": 1.2532, + "step": 1298 + }, + { + "epoch": 0.8810512929207291, + "grad_norm": 0.6409186280150971, + "learning_rate": 1.462001609592807e-06, + "loss": 1.2439, + "step": 1299 + }, + { + "epoch": 0.8817295464179737, + "grad_norm": 0.6567127599805216, + "learning_rate": 1.4455444384290652e-06, + "loss": 1.2565, + "step": 1300 + }, + { + "epoch": 0.8824077999152183, + "grad_norm": 0.6586436635118995, + "learning_rate": 1.429176944731403e-06, + "loss": 1.2469, + "step": 1301 + }, + { + "epoch": 0.8830860534124629, + "grad_norm": 0.701738944563103, + "learning_rate": 1.412899207607259e-06, + "loss": 1.2422, + "step": 1302 + }, + { + "epoch": 0.8837643069097075, + "grad_norm": 0.447188260094171, + "learning_rate": 1.3967113057302495e-06, + "loss": 1.3799, + "step": 1303 + }, + { + "epoch": 0.8844425604069521, + "grad_norm": 0.6751666808079548, + "learning_rate": 1.3806133173398028e-06, + "loss": 1.2505, + "step": 1304 + }, + { + "epoch": 0.8851208139041967, + "grad_norm": 0.6724348276625257, + "learning_rate": 1.3646053202407861e-06, + "loss": 1.2424, + "step": 1305 + }, + { + "epoch": 0.8857990674014413, + "grad_norm": 0.7038026791021831, + "learning_rate": 1.3486873918031096e-06, + "loss": 1.248, + "step": 1306 + }, + { + "epoch": 0.8864773208986859, + "grad_norm": 0.6493663173977285, + "learning_rate": 1.332859608961361e-06, + "loss": 1.2563, + "step": 1307 + }, + { + "epoch": 0.8871555743959305, + "grad_norm": 0.6761645748668302, + "learning_rate": 1.3171220482144452e-06, + "loss": 1.2366, + "step": 1308 + }, + { + "epoch": 0.8878338278931751, + "grad_norm": 0.6543856339684542, + "learning_rate": 1.301474785625203e-06, + "loss": 1.228, + "step": 1309 + }, + { + "epoch": 0.8885120813904197, + "grad_norm": 0.6736409903221909, + "learning_rate": 1.2859178968200437e-06, + "loss": 1.2555, + "step": 1310 + }, + { + "epoch": 0.8891903348876643, + "grad_norm": 0.7204894887302865, + "learning_rate": 1.2704514569885773e-06, + "loss": 1.2738, + "step": 1311 + }, + { + "epoch": 0.8898685883849089, + "grad_norm": 0.6585100428910327, + "learning_rate": 1.255075540883266e-06, + "loss": 1.2529, + "step": 1312 + }, + { + "epoch": 0.8905468418821535, + "grad_norm": 0.6988673581133306, + "learning_rate": 1.2397902228190483e-06, + "loss": 1.232, + "step": 1313 + }, + { + "epoch": 0.8912250953793981, + "grad_norm": 0.6808638632254461, + "learning_rate": 1.2245955766729757e-06, + "loss": 1.2578, + "step": 1314 + }, + { + "epoch": 0.8919033488766427, + "grad_norm": 0.6931340674170225, + "learning_rate": 1.2094916758838715e-06, + "loss": 1.2652, + "step": 1315 + }, + { + "epoch": 0.8925816023738873, + "grad_norm": 0.6728575515822607, + "learning_rate": 1.194478593451973e-06, + "loss": 1.2484, + "step": 1316 + }, + { + "epoch": 0.8932598558711319, + "grad_norm": 0.6808867615629325, + "learning_rate": 1.1795564019385642e-06, + "loss": 1.2242, + "step": 1317 + }, + { + "epoch": 0.8939381093683765, + "grad_norm": 0.6881827380698176, + "learning_rate": 1.1647251734656352e-06, + "loss": 1.2757, + "step": 1318 + }, + { + "epoch": 0.8946163628656211, + "grad_norm": 0.6730892890394782, + "learning_rate": 1.1499849797155438e-06, + "loss": 1.2477, + "step": 1319 + }, + { + "epoch": 0.8952946163628657, + "grad_norm": 0.6617266640968748, + "learning_rate": 1.1353358919306468e-06, + "loss": 1.2306, + "step": 1320 + }, + { + "epoch": 0.8959728698601103, + "grad_norm": 0.6551174490404256, + "learning_rate": 1.1207779809129748e-06, + "loss": 1.246, + "step": 1321 + }, + { + "epoch": 0.8966511233573549, + "grad_norm": 0.6541381635552229, + "learning_rate": 1.1063113170238715e-06, + "loss": 1.2171, + "step": 1322 + }, + { + "epoch": 0.8973293768545995, + "grad_norm": 0.6624929688825112, + "learning_rate": 1.0919359701836818e-06, + "loss": 1.2357, + "step": 1323 + }, + { + "epoch": 0.898007630351844, + "grad_norm": 0.658560451189581, + "learning_rate": 1.0776520098713838e-06, + "loss": 1.2348, + "step": 1324 + }, + { + "epoch": 0.8986858838490887, + "grad_norm": 0.655584899768396, + "learning_rate": 1.063459505124267e-06, + "loss": 1.2534, + "step": 1325 + }, + { + "epoch": 0.8993641373463331, + "grad_norm": 0.6523259218703032, + "learning_rate": 1.0493585245376048e-06, + "loss": 1.2545, + "step": 1326 + }, + { + "epoch": 0.9000423908435777, + "grad_norm": 0.6712031142934327, + "learning_rate": 1.0353491362643054e-06, + "loss": 1.2311, + "step": 1327 + }, + { + "epoch": 0.9007206443408223, + "grad_norm": 0.6929628155813327, + "learning_rate": 1.0214314080146082e-06, + "loss": 1.2531, + "step": 1328 + }, + { + "epoch": 0.9013988978380669, + "grad_norm": 0.6701710022956457, + "learning_rate": 1.0076054070557163e-06, + "loss": 1.2605, + "step": 1329 + }, + { + "epoch": 0.9020771513353115, + "grad_norm": 0.6677084081978719, + "learning_rate": 9.938712002115226e-07, + "loss": 1.2472, + "step": 1330 + }, + { + "epoch": 0.9027554048325561, + "grad_norm": 0.6549213183466188, + "learning_rate": 9.802288538622417e-07, + "loss": 1.2471, + "step": 1331 + }, + { + "epoch": 0.9034336583298007, + "grad_norm": 0.6498290628304042, + "learning_rate": 9.666784339441216e-07, + "loss": 1.2187, + "step": 1332 + }, + { + "epoch": 0.9041119118270453, + "grad_norm": 0.6607792472753872, + "learning_rate": 9.532200059490959e-07, + "loss": 1.2299, + "step": 1333 + }, + { + "epoch": 0.9047901653242899, + "grad_norm": 0.6807448721243032, + "learning_rate": 9.398536349244947e-07, + "loss": 1.2378, + "step": 1334 + }, + { + "epoch": 0.9054684188215345, + "grad_norm": 0.6834284278293586, + "learning_rate": 9.265793854727189e-07, + "loss": 1.2641, + "step": 1335 + }, + { + "epoch": 0.9061466723187791, + "grad_norm": 0.6561570607501847, + "learning_rate": 9.133973217509106e-07, + "loss": 1.2491, + "step": 1336 + }, + { + "epoch": 0.9068249258160237, + "grad_norm": 0.673378051921662, + "learning_rate": 9.003075074706791e-07, + "loss": 1.2394, + "step": 1337 + }, + { + "epoch": 0.9075031793132683, + "grad_norm": 0.6858403613997888, + "learning_rate": 8.873100058977613e-07, + "loss": 1.2538, + "step": 1338 + }, + { + "epoch": 0.9081814328105129, + "grad_norm": 0.673381363207631, + "learning_rate": 8.744048798517402e-07, + "loss": 1.238, + "step": 1339 + }, + { + "epoch": 0.9088596863077575, + "grad_norm": 0.6637418848607853, + "learning_rate": 8.615921917057069e-07, + "loss": 1.2361, + "step": 1340 + }, + { + "epoch": 0.9095379398050021, + "grad_norm": 0.6637814922635127, + "learning_rate": 8.488720033860032e-07, + "loss": 1.2446, + "step": 1341 + }, + { + "epoch": 0.9102161933022467, + "grad_norm": 0.6602838507106655, + "learning_rate": 8.362443763718953e-07, + "loss": 1.2599, + "step": 1342 + }, + { + "epoch": 0.9108944467994913, + "grad_norm": 0.659284695620839, + "learning_rate": 8.237093716952737e-07, + "loss": 1.2695, + "step": 1343 + }, + { + "epoch": 0.9115727002967359, + "grad_norm": 0.665982122184112, + "learning_rate": 8.11267049940374e-07, + "loss": 1.2202, + "step": 1344 + }, + { + "epoch": 0.9122509537939805, + "grad_norm": 0.6393656129125219, + "learning_rate": 7.989174712434677e-07, + "loss": 1.2217, + "step": 1345 + }, + { + "epoch": 0.9129292072912251, + "grad_norm": 0.6707636295089805, + "learning_rate": 7.86660695292596e-07, + "loss": 1.2275, + "step": 1346 + }, + { + "epoch": 0.9136074607884697, + "grad_norm": 0.6638358184981306, + "learning_rate": 7.744967813272475e-07, + "loss": 1.2489, + "step": 1347 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.6627214654844239, + "learning_rate": 7.624257881380992e-07, + "loss": 1.2488, + "step": 1348 + }, + { + "epoch": 0.9149639677829589, + "grad_norm": 0.6739167341497851, + "learning_rate": 7.504477740667271e-07, + "loss": 1.2447, + "step": 1349 + }, + { + "epoch": 0.9156422212802034, + "grad_norm": 0.6683330924152061, + "learning_rate": 7.385627970053088e-07, + "loss": 1.2374, + "step": 1350 + }, + { + "epoch": 0.916320474777448, + "grad_norm": 0.6592964271700535, + "learning_rate": 7.267709143963663e-07, + "loss": 1.2669, + "step": 1351 + }, + { + "epoch": 0.9169987282746926, + "grad_norm": 0.6843140708067933, + "learning_rate": 7.150721832324659e-07, + "loss": 1.246, + "step": 1352 + }, + { + "epoch": 0.9176769817719372, + "grad_norm": 0.6661572430839988, + "learning_rate": 7.034666600559647e-07, + "loss": 1.2383, + "step": 1353 + }, + { + "epoch": 0.9183552352691818, + "grad_norm": 0.665393427409558, + "learning_rate": 6.919544009587231e-07, + "loss": 1.283, + "step": 1354 + }, + { + "epoch": 0.9190334887664264, + "grad_norm": 0.669387001203815, + "learning_rate": 6.805354615818305e-07, + "loss": 1.2452, + "step": 1355 + }, + { + "epoch": 0.919711742263671, + "grad_norm": 0.43055144276040425, + "learning_rate": 6.692098971153549e-07, + "loss": 1.3544, + "step": 1356 + }, + { + "epoch": 0.9203899957609156, + "grad_norm": 0.6871511026380561, + "learning_rate": 6.579777622980565e-07, + "loss": 1.2243, + "step": 1357 + }, + { + "epoch": 0.9210682492581602, + "grad_norm": 0.6739005611348086, + "learning_rate": 6.468391114171302e-07, + "loss": 1.2756, + "step": 1358 + }, + { + "epoch": 0.9217465027554048, + "grad_norm": 0.6744433756104877, + "learning_rate": 6.357939983079453e-07, + "loss": 1.2444, + "step": 1359 + }, + { + "epoch": 0.9224247562526494, + "grad_norm": 0.6729612932904839, + "learning_rate": 6.248424763537886e-07, + "loss": 1.256, + "step": 1360 + }, + { + "epoch": 0.923103009749894, + "grad_norm": 0.6829166299546027, + "learning_rate": 6.139845984855974e-07, + "loss": 1.2518, + "step": 1361 + }, + { + "epoch": 0.9237812632471386, + "grad_norm": 0.6709060858281704, + "learning_rate": 6.032204171817068e-07, + "loss": 1.221, + "step": 1362 + }, + { + "epoch": 0.9244595167443832, + "grad_norm": 0.7078854183369467, + "learning_rate": 5.92549984467603e-07, + "loss": 1.274, + "step": 1363 + }, + { + "epoch": 0.9251377702416278, + "grad_norm": 0.693268884844903, + "learning_rate": 5.819733519156589e-07, + "loss": 1.2452, + "step": 1364 + }, + { + "epoch": 0.9258160237388724, + "grad_norm": 0.6758341411358181, + "learning_rate": 5.714905706448992e-07, + "loss": 1.2596, + "step": 1365 + }, + { + "epoch": 0.926494277236117, + "grad_norm": 0.6676882929450928, + "learning_rate": 5.611016913207379e-07, + "loss": 1.2596, + "step": 1366 + }, + { + "epoch": 0.9271725307333616, + "grad_norm": 0.6921240376093872, + "learning_rate": 5.508067641547521e-07, + "loss": 1.2504, + "step": 1367 + }, + { + "epoch": 0.9278507842306062, + "grad_norm": 0.6905148881597328, + "learning_rate": 5.406058389044178e-07, + "loss": 1.2266, + "step": 1368 + }, + { + "epoch": 0.9285290377278508, + "grad_norm": 0.6879011599996939, + "learning_rate": 5.30498964872892e-07, + "loss": 1.2458, + "step": 1369 + }, + { + "epoch": 0.9292072912250954, + "grad_norm": 0.6757393596742982, + "learning_rate": 5.204861909087511e-07, + "loss": 1.2417, + "step": 1370 + }, + { + "epoch": 0.92988554472234, + "grad_norm": 0.6926775176816642, + "learning_rate": 5.105675654057752e-07, + "loss": 1.2674, + "step": 1371 + }, + { + "epoch": 0.9305637982195846, + "grad_norm": 0.6881075927445028, + "learning_rate": 5.007431363027082e-07, + "loss": 1.2648, + "step": 1372 + }, + { + "epoch": 0.9312420517168292, + "grad_norm": 0.644620430770547, + "learning_rate": 4.91012951083012e-07, + "loss": 1.2202, + "step": 1373 + }, + { + "epoch": 0.9319203052140738, + "grad_norm": 0.6765738092957386, + "learning_rate": 4.813770567746589e-07, + "loss": 1.2685, + "step": 1374 + }, + { + "epoch": 0.9325985587113184, + "grad_norm": 0.6759248428214282, + "learning_rate": 4.718354999498864e-07, + "loss": 1.2147, + "step": 1375 + }, + { + "epoch": 0.933276812208563, + "grad_norm": 0.4172155603814433, + "learning_rate": 4.6238832672499177e-07, + "loss": 1.3648, + "step": 1376 + }, + { + "epoch": 0.9339550657058076, + "grad_norm": 0.4602491839575769, + "learning_rate": 4.5303558276007744e-07, + "loss": 1.4068, + "step": 1377 + }, + { + "epoch": 0.9346333192030521, + "grad_norm": 0.6755778465641865, + "learning_rate": 4.4377731325887075e-07, + "loss": 1.2283, + "step": 1378 + }, + { + "epoch": 0.9353115727002967, + "grad_norm": 0.6721429198493364, + "learning_rate": 4.3461356296847333e-07, + "loss": 1.2553, + "step": 1379 + }, + { + "epoch": 0.9359898261975413, + "grad_norm": 0.6699419980408854, + "learning_rate": 4.2554437617915666e-07, + "loss": 1.2443, + "step": 1380 + }, + { + "epoch": 0.9366680796947859, + "grad_norm": 0.6749882416152699, + "learning_rate": 4.165697967241511e-07, + "loss": 1.2262, + "step": 1381 + }, + { + "epoch": 0.9373463331920305, + "grad_norm": 0.6847443257768054, + "learning_rate": 4.076898679794261e-07, + "loss": 1.2253, + "step": 1382 + }, + { + "epoch": 0.9380245866892751, + "grad_norm": 0.6727955333188005, + "learning_rate": 3.989046328634927e-07, + "loss": 1.2594, + "step": 1383 + }, + { + "epoch": 0.9387028401865197, + "grad_norm": 0.6620805374528571, + "learning_rate": 3.902141338371768e-07, + "loss": 1.2142, + "step": 1384 + }, + { + "epoch": 0.9393810936837643, + "grad_norm": 0.6480772047300294, + "learning_rate": 3.8161841290343503e-07, + "loss": 1.2418, + "step": 1385 + }, + { + "epoch": 0.9400593471810089, + "grad_norm": 0.6835792051059321, + "learning_rate": 3.7311751160713947e-07, + "loss": 1.2387, + "step": 1386 + }, + { + "epoch": 0.9407376006782535, + "grad_norm": 0.6649418879644043, + "learning_rate": 3.6471147103487756e-07, + "loss": 1.2339, + "step": 1387 + }, + { + "epoch": 0.9414158541754981, + "grad_norm": 0.6842421143137674, + "learning_rate": 3.564003318147569e-07, + "loss": 1.226, + "step": 1388 + }, + { + "epoch": 0.9420941076727427, + "grad_norm": 0.6811128860613901, + "learning_rate": 3.481841341162073e-07, + "loss": 1.2558, + "step": 1389 + }, + { + "epoch": 0.9427723611699873, + "grad_norm": 0.6613640812358834, + "learning_rate": 3.400629176497905e-07, + "loss": 1.2507, + "step": 1390 + }, + { + "epoch": 0.9434506146672319, + "grad_norm": 0.6722408592326058, + "learning_rate": 3.320367216669973e-07, + "loss": 1.2396, + "step": 1391 + }, + { + "epoch": 0.9441288681644765, + "grad_norm": 0.6624694671185586, + "learning_rate": 3.2410558496006825e-07, + "loss": 1.2334, + "step": 1392 + }, + { + "epoch": 0.9448071216617211, + "grad_norm": 0.7137402370876521, + "learning_rate": 3.16269545861807e-07, + "loss": 1.2496, + "step": 1393 + }, + { + "epoch": 0.9454853751589657, + "grad_norm": 0.6449845768103436, + "learning_rate": 3.085286422453893e-07, + "loss": 1.2404, + "step": 1394 + }, + { + "epoch": 0.9461636286562103, + "grad_norm": 0.6749794928160756, + "learning_rate": 3.008829115241763e-07, + "loss": 1.2394, + "step": 1395 + }, + { + "epoch": 0.9468418821534549, + "grad_norm": 0.6526640710965668, + "learning_rate": 2.9333239065154397e-07, + "loss": 1.2561, + "step": 1396 + }, + { + "epoch": 0.9475201356506995, + "grad_norm": 0.6605969261232847, + "learning_rate": 2.858771161206986e-07, + "loss": 1.2233, + "step": 1397 + }, + { + "epoch": 0.9481983891479441, + "grad_norm": 0.656943414990126, + "learning_rate": 2.785171239645057e-07, + "loss": 1.2334, + "step": 1398 + }, + { + "epoch": 0.9488766426451887, + "grad_norm": 0.6734397241519186, + "learning_rate": 2.712524497553037e-07, + "loss": 1.2722, + "step": 1399 + }, + { + "epoch": 0.9495548961424333, + "grad_norm": 0.6663530577389908, + "learning_rate": 2.640831286047441e-07, + "loss": 1.2379, + "step": 1400 + }, + { + "epoch": 0.9502331496396779, + "grad_norm": 0.6673776018022044, + "learning_rate": 2.570091951636178e-07, + "loss": 1.2541, + "step": 1401 + }, + { + "epoch": 0.9509114031369225, + "grad_norm": 0.6653923384339829, + "learning_rate": 2.5003068362168927e-07, + "loss": 1.2273, + "step": 1402 + }, + { + "epoch": 0.9515896566341671, + "grad_norm": 0.6837729865567328, + "learning_rate": 2.431476277075251e-07, + "loss": 1.2423, + "step": 1403 + }, + { + "epoch": 0.9522679101314117, + "grad_norm": 0.6680668422360202, + "learning_rate": 2.3636006068833872e-07, + "loss": 1.2451, + "step": 1404 + }, + { + "epoch": 0.9529461636286563, + "grad_norm": 0.6799977513226729, + "learning_rate": 2.2966801536982387e-07, + "loss": 1.239, + "step": 1405 + }, + { + "epoch": 0.9536244171259008, + "grad_norm": 0.6777577598433704, + "learning_rate": 2.2307152409600575e-07, + "loss": 1.2522, + "step": 1406 + }, + { + "epoch": 0.9543026706231454, + "grad_norm": 0.6606560971096702, + "learning_rate": 2.1657061874906126e-07, + "loss": 1.2405, + "step": 1407 + }, + { + "epoch": 0.95498092412039, + "grad_norm": 0.6469835351521775, + "learning_rate": 2.1016533074919687e-07, + "loss": 1.229, + "step": 1408 + }, + { + "epoch": 0.9556591776176346, + "grad_norm": 0.6958824972510786, + "learning_rate": 2.0385569105447532e-07, + "loss": 1.2687, + "step": 1409 + }, + { + "epoch": 0.9563374311148792, + "grad_norm": 0.6740777877893196, + "learning_rate": 1.9764173016067145e-07, + "loss": 1.2389, + "step": 1410 + }, + { + "epoch": 0.9570156846121238, + "grad_norm": 0.6736611531923851, + "learning_rate": 1.915234781011255e-07, + "loss": 1.2415, + "step": 1411 + }, + { + "epoch": 0.9576939381093684, + "grad_norm": 0.6883611656465358, + "learning_rate": 1.8550096444659216e-07, + "loss": 1.2424, + "step": 1412 + }, + { + "epoch": 0.958372191606613, + "grad_norm": 0.6827067228567522, + "learning_rate": 1.7957421830511412e-07, + "loss": 1.233, + "step": 1413 + }, + { + "epoch": 0.9590504451038576, + "grad_norm": 0.6784984890612891, + "learning_rate": 1.7374326832185983e-07, + "loss": 1.2531, + "step": 1414 + }, + { + "epoch": 0.9597286986011022, + "grad_norm": 0.6633612107331113, + "learning_rate": 1.6800814267899923e-07, + "loss": 1.2398, + "step": 1415 + }, + { + "epoch": 0.9604069520983468, + "grad_norm": 0.6488222570693454, + "learning_rate": 1.6236886909556603e-07, + "loss": 1.2664, + "step": 1416 + }, + { + "epoch": 0.9610852055955914, + "grad_norm": 0.4189268521205959, + "learning_rate": 1.5682547482731791e-07, + "loss": 1.3749, + "step": 1417 + }, + { + "epoch": 0.9617634590928359, + "grad_norm": 0.6917389003668754, + "learning_rate": 1.5137798666660765e-07, + "loss": 1.2585, + "step": 1418 + }, + { + "epoch": 0.9624417125900805, + "grad_norm": 0.672451082602383, + "learning_rate": 1.4602643094225876e-07, + "loss": 1.2517, + "step": 1419 + }, + { + "epoch": 0.9631199660873251, + "grad_norm": 0.6504255430195721, + "learning_rate": 1.4077083351942799e-07, + "loss": 1.2365, + "step": 1420 + }, + { + "epoch": 0.9637982195845697, + "grad_norm": 0.6850370736126643, + "learning_rate": 1.3561121979949188e-07, + "loss": 1.2489, + "step": 1421 + }, + { + "epoch": 0.9644764730818143, + "grad_norm": 0.66162990708649, + "learning_rate": 1.3054761471991362e-07, + "loss": 1.2442, + "step": 1422 + }, + { + "epoch": 0.9651547265790589, + "grad_norm": 0.6636753229495839, + "learning_rate": 1.2558004275412983e-07, + "loss": 1.2332, + "step": 1423 + }, + { + "epoch": 0.9658329800763035, + "grad_norm": 0.6868071837214885, + "learning_rate": 1.207085279114284e-07, + "loss": 1.2472, + "step": 1424 + }, + { + "epoch": 0.9665112335735481, + "grad_norm": 0.6876603336287529, + "learning_rate": 1.1593309373683304e-07, + "loss": 1.272, + "step": 1425 + }, + { + "epoch": 0.9671894870707927, + "grad_norm": 0.6512671334666834, + "learning_rate": 1.1125376331099002e-07, + "loss": 1.2344, + "step": 1426 + }, + { + "epoch": 0.9678677405680373, + "grad_norm": 0.6715940763143174, + "learning_rate": 1.066705592500572e-07, + "loss": 1.2247, + "step": 1427 + }, + { + "epoch": 0.9685459940652819, + "grad_norm": 0.6754499531697719, + "learning_rate": 1.0218350370559515e-07, + "loss": 1.2597, + "step": 1428 + }, + { + "epoch": 0.9692242475625265, + "grad_norm": 0.6524907799318351, + "learning_rate": 9.779261836445841e-08, + "loss": 1.2156, + "step": 1429 + }, + { + "epoch": 0.969902501059771, + "grad_norm": 0.6792557779960546, + "learning_rate": 9.34979244486911e-08, + "loss": 1.3041, + "step": 1430 + }, + { + "epoch": 0.9705807545570156, + "grad_norm": 0.6823631877198995, + "learning_rate": 8.929944271542035e-08, + "loss": 1.24, + "step": 1431 + }, + { + "epoch": 0.9712590080542602, + "grad_norm": 0.6846773472067208, + "learning_rate": 8.51971934567697e-08, + "loss": 1.2623, + "step": 1432 + }, + { + "epoch": 0.9719372615515048, + "grad_norm": 0.6467976465127085, + "learning_rate": 8.119119649974361e-08, + "loss": 1.2613, + "step": 1433 + }, + { + "epoch": 0.9726155150487494, + "grad_norm": 0.6575189203134979, + "learning_rate": 7.728147120614093e-08, + "loss": 1.2143, + "step": 1434 + }, + { + "epoch": 0.973293768545994, + "grad_norm": 0.6665055528564646, + "learning_rate": 7.346803647246381e-08, + "loss": 1.2492, + "step": 1435 + }, + { + "epoch": 0.9739720220432386, + "grad_norm": 0.6486777079132526, + "learning_rate": 6.975091072981777e-08, + "loss": 1.2271, + "step": 1436 + }, + { + "epoch": 0.9746502755404832, + "grad_norm": 0.7034061981240023, + "learning_rate": 6.613011194382957e-08, + "loss": 1.2742, + "step": 1437 + }, + { + "epoch": 0.9753285290377278, + "grad_norm": 0.6686002236944355, + "learning_rate": 6.260565761455616e-08, + "loss": 1.2429, + "step": 1438 + }, + { + "epoch": 0.9760067825349724, + "grad_norm": 0.6556116978595136, + "learning_rate": 5.917756477640702e-08, + "loss": 1.2385, + "step": 1439 + }, + { + "epoch": 0.976685036032217, + "grad_norm": 0.6554439946819614, + "learning_rate": 5.584584999805076e-08, + "loss": 1.239, + "step": 1440 + }, + { + "epoch": 0.9773632895294616, + "grad_norm": 0.6575403679333993, + "learning_rate": 5.2610529382346456e-08, + "loss": 1.2633, + "step": 1441 + }, + { + "epoch": 0.9780415430267062, + "grad_norm": 0.649925623318637, + "learning_rate": 4.947161856625693e-08, + "loss": 1.2244, + "step": 1442 + }, + { + "epoch": 0.9787197965239508, + "grad_norm": 0.6605273078555803, + "learning_rate": 4.642913272077776e-08, + "loss": 1.2355, + "step": 1443 + }, + { + "epoch": 0.9793980500211954, + "grad_norm": 0.6821874686800649, + "learning_rate": 4.3483086550863974e-08, + "loss": 1.2527, + "step": 1444 + }, + { + "epoch": 0.98007630351844, + "grad_norm": 0.6766151832758971, + "learning_rate": 4.063349429535679e-08, + "loss": 1.2384, + "step": 1445 + }, + { + "epoch": 0.9807545570156846, + "grad_norm": 0.6881276161064582, + "learning_rate": 3.788036972691922e-08, + "loss": 1.246, + "step": 1446 + }, + { + "epoch": 0.9814328105129292, + "grad_norm": 0.6700454329845072, + "learning_rate": 3.522372615195835e-08, + "loss": 1.2342, + "step": 1447 + }, + { + "epoch": 0.9821110640101738, + "grad_norm": 0.6545685722297931, + "learning_rate": 3.2663576410576495e-08, + "loss": 1.2287, + "step": 1448 + }, + { + "epoch": 0.9827893175074184, + "grad_norm": 0.6947327345532218, + "learning_rate": 3.0199932876500136e-08, + "loss": 1.2424, + "step": 1449 + }, + { + "epoch": 0.983467571004663, + "grad_norm": 0.6500645029727457, + "learning_rate": 2.7832807457019995e-08, + "loss": 1.2511, + "step": 1450 + }, + { + "epoch": 0.9841458245019076, + "grad_norm": 0.6973448708269708, + "learning_rate": 2.5562211592937703e-08, + "loss": 1.2365, + "step": 1451 + }, + { + "epoch": 0.9848240779991522, + "grad_norm": 0.6555455445351246, + "learning_rate": 2.3388156258501436e-08, + "loss": 1.22, + "step": 1452 + }, + { + "epoch": 0.9855023314963968, + "grad_norm": 0.6778273501171838, + "learning_rate": 2.1310651961368167e-08, + "loss": 1.2449, + "step": 1453 + }, + { + "epoch": 0.9861805849936414, + "grad_norm": 0.4290647050125529, + "learning_rate": 1.9329708742537035e-08, + "loss": 1.3346, + "step": 1454 + }, + { + "epoch": 0.986858838490886, + "grad_norm": 0.6651972834538079, + "learning_rate": 1.744533617631161e-08, + "loss": 1.2721, + "step": 1455 + }, + { + "epoch": 0.9875370919881306, + "grad_norm": 0.6320225606844798, + "learning_rate": 1.5657543370248828e-08, + "loss": 1.2154, + "step": 1456 + }, + { + "epoch": 0.9882153454853752, + "grad_norm": 0.6817640902595111, + "learning_rate": 1.3966338965114567e-08, + "loss": 1.2607, + "step": 1457 + }, + { + "epoch": 0.9888935989826197, + "grad_norm": 0.6667804231582753, + "learning_rate": 1.237173113484591e-08, + "loss": 1.2037, + "step": 1458 + }, + { + "epoch": 0.9895718524798643, + "grad_norm": 0.6537258264830468, + "learning_rate": 1.0873727586506733e-08, + "loss": 1.2153, + "step": 1459 + }, + { + "epoch": 0.9902501059771089, + "grad_norm": 0.6598848171719655, + "learning_rate": 9.472335560254398e-09, + "loss": 1.2209, + "step": 1460 + }, + { + "epoch": 0.9909283594743535, + "grad_norm": 0.6886054526615797, + "learning_rate": 8.167561829299786e-09, + "loss": 1.2512, + "step": 1461 + }, + { + "epoch": 0.9916066129715981, + "grad_norm": 0.6664574716833053, + "learning_rate": 6.9594126998828726e-09, + "loss": 1.2156, + "step": 1462 + }, + { + "epoch": 0.9922848664688427, + "grad_norm": 0.6751343990247555, + "learning_rate": 5.847894011234978e-09, + "loss": 1.2391, + "step": 1463 + }, + { + "epoch": 0.9929631199660873, + "grad_norm": 0.6850150824771514, + "learning_rate": 4.833011135549903e-09, + "loss": 1.235, + "step": 1464 + }, + { + "epoch": 0.9936413734633319, + "grad_norm": 0.6680179651217075, + "learning_rate": 3.914768977966166e-09, + "loss": 1.2249, + "step": 1465 + }, + { + "epoch": 0.9943196269605765, + "grad_norm": 0.669886714999622, + "learning_rate": 3.093171976533693e-09, + "loss": 1.2441, + "step": 1466 + }, + { + "epoch": 0.9949978804578211, + "grad_norm": 0.6783995175305425, + "learning_rate": 2.36822410219828e-09, + "loss": 1.2329, + "step": 1467 + }, + { + "epoch": 0.9956761339550657, + "grad_norm": 0.6487853495244439, + "learning_rate": 1.7399288587816032e-09, + "loss": 1.2061, + "step": 1468 + }, + { + "epoch": 0.9963543874523103, + "grad_norm": 0.6655272487747175, + "learning_rate": 1.2082892829634596e-09, + "loss": 1.2523, + "step": 1469 + }, + { + "epoch": 0.9970326409495549, + "grad_norm": 0.6756617448317891, + "learning_rate": 7.733079442617808e-10, + "loss": 1.2319, + "step": 1470 + }, + { + "epoch": 0.9977108944467995, + "grad_norm": 0.6582339993988024, + "learning_rate": 4.349869450370747e-10, + "loss": 1.2534, + "step": 1471 + }, + { + "epoch": 0.9983891479440441, + "grad_norm": 0.6616448969554425, + "learning_rate": 1.933279204568983e-10, + "loss": 1.2107, + "step": 1472 + }, + { + "epoch": 0.9990674014412887, + "grad_norm": 0.6720706810300792, + "learning_rate": 4.8332038513621e-11, + "loss": 1.2564, + "step": 1473 + }, + { + "epoch": 0.9997456549385333, + "grad_norm": 0.5783213650920862, + "learning_rate": 0.0, + "loss": 1.2774, + "step": 1474 + }, + { + "epoch": 0.9997456549385333, + "step": 1474, + "total_flos": 7384218508197888.0, + "train_loss": 1.3047956960980895, + "train_runtime": 17436.1201, + "train_samples_per_second": 86.585, + "train_steps_per_second": 0.085 + } + ], + "logging_steps": 1.0, + "max_steps": 1474, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7384218508197888.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}