diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8780 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998000399920016, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007998400319936012, + "grad_norm": 4.033497738180381, + "learning_rate": 0.0, + "loss": -0.0349, + "step": 1 + }, + { + "epoch": 0.0015996800639872025, + "grad_norm": 3.51489589336218, + "learning_rate": 3.8110282485354675e-07, + "loss": -0.0542, + "step": 2 + }, + { + "epoch": 0.0023995200959808036, + "grad_norm": 3.185205419075282, + "learning_rate": 6.040336863117743e-07, + "loss": 0.0277, + "step": 3 + }, + { + "epoch": 0.003199360127974405, + "grad_norm": 2.6795442503962392, + "learning_rate": 7.622056497070935e-07, + "loss": 0.0346, + "step": 4 + }, + { + "epoch": 0.003999200159968006, + "grad_norm": 4.306548503833335, + "learning_rate": 8.84893356068388e-07, + "loss": -0.0123, + "step": 5 + }, + { + "epoch": 0.004799040191961607, + "grad_norm": 3.232420399829724, + "learning_rate": 9.85136511165321e-07, + "loss": -0.0277, + "step": 6 + }, + { + "epoch": 0.005598880223955209, + "grad_norm": 4.556647322380419, + "learning_rate": 1.0698908911626617e-06, + "loss": -0.0824, + "step": 7 + }, + { + "epoch": 0.00639872025594881, + "grad_norm": 4.75835426891596, + "learning_rate": 1.1433084745606403e-06, + "loss": -0.0487, + "step": 8 + }, + { + "epoch": 0.007198560287942412, + "grad_norm": 3.671579551985109, + "learning_rate": 1.2080673726235485e-06, + "loss": -0.101, + "step": 9 + }, + { + "epoch": 0.007998400319936013, + "grad_norm": 5.478829821928343, + "learning_rate": 1.2659961809219347e-06, + "loss": 0.0194, + "step": 10 + }, + { + "epoch": 0.008798240351929614, + "grad_norm": 4.697076805586441, + "learning_rate": 1.318399162250352e-06, + "loss": -0.0713, + "step": 11 + }, + { + "epoch": 0.009598080383923215, + "grad_norm": 4.668341585855885, + "learning_rate": 1.366239336018868e-06, + "loss": -0.0365, + "step": 12 + }, + { + "epoch": 0.010397920415916816, + "grad_norm": 4.0841275175924725, + "learning_rate": 1.4102480297838326e-06, + "loss": -0.0814, + "step": 13 + }, + { + "epoch": 0.011197760447910418, + "grad_norm": 3.5676724545793244, + "learning_rate": 1.4509937160162082e-06, + "loss": -0.0815, + "step": 14 + }, + { + "epoch": 0.01199760047990402, + "grad_norm": 3.467762437060136, + "learning_rate": 1.4889270423801623e-06, + "loss": -0.0654, + "step": 15 + }, + { + "epoch": 0.01279744051189762, + "grad_norm": 2.8225474286212457, + "learning_rate": 1.524411299414187e-06, + "loss": -0.062, + "step": 16 + }, + { + "epoch": 0.013597280543891222, + "grad_norm": 5.079798214426631, + "learning_rate": 1.5577436352844088e-06, + "loss": -0.1394, + "step": 17 + }, + { + "epoch": 0.014397120575884824, + "grad_norm": 4.339673002027499, + "learning_rate": 1.5891701974770953e-06, + "loss": -0.042, + "step": 18 + }, + { + "epoch": 0.015196960607878424, + "grad_norm": 4.245593535465187, + "learning_rate": 1.6188971751464532e-06, + "loss": 0.0082, + "step": 19 + }, + { + "epoch": 0.015996800639872025, + "grad_norm": 2.975134158916166, + "learning_rate": 1.6470990057754815e-06, + "loss": -0.0294, + "step": 20 + }, + { + "epoch": 0.016796640671865627, + "grad_norm": 3.331212004539707, + "learning_rate": 1.673924577474436e-06, + "loss": -0.0544, + "step": 21 + }, + { + "epoch": 0.01759648070385923, + "grad_norm": 4.545480625771349, + "learning_rate": 1.6995019871038986e-06, + "loss": -0.0731, + "step": 22 + }, + { + "epoch": 0.01839632073585283, + "grad_norm": 3.7461986348064595, + "learning_rate": 1.7239422398533632e-06, + "loss": -0.1278, + "step": 23 + }, + { + "epoch": 0.01919616076784643, + "grad_norm": 2.8335266795291543, + "learning_rate": 1.7473421608724147e-06, + "loss": -0.083, + "step": 24 + }, + { + "epoch": 0.01999600079984003, + "grad_norm": 3.68991966387542, + "learning_rate": 1.769786712136776e-06, + "loss": -0.0695, + "step": 25 + }, + { + "epoch": 0.020795840831833633, + "grad_norm": 4.133751803923456, + "learning_rate": 1.7913508546373795e-06, + "loss": -0.0445, + "step": 26 + }, + { + "epoch": 0.021595680863827234, + "grad_norm": 5.000412923928387, + "learning_rate": 1.812101058935323e-06, + "loss": 0.1318, + "step": 27 + }, + { + "epoch": 0.022395520895820836, + "grad_norm": 3.5992183048553557, + "learning_rate": 1.832096540869755e-06, + "loss": -0.1249, + "step": 28 + }, + { + "epoch": 0.023195360927814438, + "grad_norm": 3.427019545745099, + "learning_rate": 1.8513902803279621e-06, + "loss": -0.1743, + "step": 29 + }, + { + "epoch": 0.02399520095980804, + "grad_norm": 4.725372315931298, + "learning_rate": 1.8700298672337092e-06, + "loss": 0.0091, + "step": 30 + }, + { + "epoch": 0.024795040991801638, + "grad_norm": 5.254587258373332, + "learning_rate": 1.888058208767457e-06, + "loss": -0.0653, + "step": 31 + }, + { + "epoch": 0.02559488102379524, + "grad_norm": 8.849971672055872, + "learning_rate": 1.905514124267734e-06, + "loss": -0.0049, + "step": 32 + }, + { + "epoch": 0.026394721055788842, + "grad_norm": 3.4635929572339874, + "learning_rate": 1.922432848562126e-06, + "loss": -0.089, + "step": 33 + }, + { + "epoch": 0.027194561087782444, + "grad_norm": 5.9169701870000715, + "learning_rate": 1.9388464601379558e-06, + "loss": -0.1119, + "step": 34 + }, + { + "epoch": 0.027994401119776045, + "grad_norm": 5.188979707658736, + "learning_rate": 1.9547842472310495e-06, + "loss": -0.1121, + "step": 35 + }, + { + "epoch": 0.028794241151769647, + "grad_norm": 4.896152090964042, + "learning_rate": 1.970273022330642e-06, + "loss": -0.0525, + "step": 36 + }, + { + "epoch": 0.02959408118376325, + "grad_norm": 5.994962166737604, + "learning_rate": 1.9853373935840096e-06, + "loss": -0.1089, + "step": 37 + }, + { + "epoch": 0.030393921215756847, + "grad_norm": 5.576677749054259, + "learning_rate": 2e-06, + "loss": 0.0304, + "step": 38 + }, + { + "epoch": 0.03119376124775045, + "grad_norm": 4.313822225686198, + "learning_rate": 2e-06, + "loss": -0.1328, + "step": 39 + }, + { + "epoch": 0.03199360127974405, + "grad_norm": 3.81381102986674, + "learning_rate": 1.998349834983498e-06, + "loss": -0.0792, + "step": 40 + }, + { + "epoch": 0.03279344131173765, + "grad_norm": 4.839236808694631, + "learning_rate": 1.996699669966997e-06, + "loss": -0.056, + "step": 41 + }, + { + "epoch": 0.033593281343731254, + "grad_norm": 3.77305458921913, + "learning_rate": 1.995049504950495e-06, + "loss": -0.0339, + "step": 42 + }, + { + "epoch": 0.03439312137572485, + "grad_norm": 5.867440087459917, + "learning_rate": 1.9933993399339932e-06, + "loss": -0.0176, + "step": 43 + }, + { + "epoch": 0.03519296140771846, + "grad_norm": 6.4685114572374465, + "learning_rate": 1.991749174917492e-06, + "loss": 0.0402, + "step": 44 + }, + { + "epoch": 0.035992801439712056, + "grad_norm": 11.48483869174446, + "learning_rate": 1.99009900990099e-06, + "loss": -0.0702, + "step": 45 + }, + { + "epoch": 0.03679264147170566, + "grad_norm": 4.2414264593452735, + "learning_rate": 1.9884488448844884e-06, + "loss": -0.1241, + "step": 46 + }, + { + "epoch": 0.03759248150369926, + "grad_norm": 4.949735715342123, + "learning_rate": 1.9867986798679866e-06, + "loss": -0.0996, + "step": 47 + }, + { + "epoch": 0.03839232153569286, + "grad_norm": 5.06186964090094, + "learning_rate": 1.9851485148514852e-06, + "loss": -0.1133, + "step": 48 + }, + { + "epoch": 0.039192161567686463, + "grad_norm": 3.9056723205659183, + "learning_rate": 1.9834983498349835e-06, + "loss": -0.1631, + "step": 49 + }, + { + "epoch": 0.03999200159968006, + "grad_norm": 5.51223763254555, + "learning_rate": 1.9818481848184817e-06, + "loss": 0.0191, + "step": 50 + }, + { + "epoch": 0.04079184163167367, + "grad_norm": 3.7107878002289, + "learning_rate": 1.98019801980198e-06, + "loss": -0.0271, + "step": 51 + }, + { + "epoch": 0.041591681663667265, + "grad_norm": 5.618046340756691, + "learning_rate": 1.9785478547854786e-06, + "loss": -0.0227, + "step": 52 + }, + { + "epoch": 0.04239152169566087, + "grad_norm": 3.7602961019841468, + "learning_rate": 1.976897689768977e-06, + "loss": -0.126, + "step": 53 + }, + { + "epoch": 0.04319136172765447, + "grad_norm": 4.322826902384424, + "learning_rate": 1.975247524752475e-06, + "loss": -0.1196, + "step": 54 + }, + { + "epoch": 0.04399120175964807, + "grad_norm": 3.6276654934086565, + "learning_rate": 1.9735973597359733e-06, + "loss": -0.1098, + "step": 55 + }, + { + "epoch": 0.04479104179164167, + "grad_norm": 3.729759012982189, + "learning_rate": 1.971947194719472e-06, + "loss": -0.2248, + "step": 56 + }, + { + "epoch": 0.04559088182363527, + "grad_norm": 5.552584092947439, + "learning_rate": 1.97029702970297e-06, + "loss": -0.0821, + "step": 57 + }, + { + "epoch": 0.046390721855628876, + "grad_norm": 6.016002296406734, + "learning_rate": 1.9686468646864684e-06, + "loss": -0.1302, + "step": 58 + }, + { + "epoch": 0.047190561887622474, + "grad_norm": 6.7453871612622995, + "learning_rate": 1.966996699669967e-06, + "loss": -0.0652, + "step": 59 + }, + { + "epoch": 0.04799040191961608, + "grad_norm": 4.874246979289447, + "learning_rate": 1.9653465346534653e-06, + "loss": -0.0409, + "step": 60 + }, + { + "epoch": 0.04879024195160968, + "grad_norm": 3.894046979082966, + "learning_rate": 1.9636963696369635e-06, + "loss": 0.021, + "step": 61 + }, + { + "epoch": 0.049590081983603276, + "grad_norm": 3.829546481539617, + "learning_rate": 1.962046204620462e-06, + "loss": -0.246, + "step": 62 + }, + { + "epoch": 0.05038992201559688, + "grad_norm": 5.021080021581999, + "learning_rate": 1.9603960396039604e-06, + "loss": -0.0029, + "step": 63 + }, + { + "epoch": 0.05118976204759048, + "grad_norm": 4.084832649304883, + "learning_rate": 1.9587458745874586e-06, + "loss": -0.1911, + "step": 64 + }, + { + "epoch": 0.051989602079584085, + "grad_norm": 3.4567077830219595, + "learning_rate": 1.9570957095709572e-06, + "loss": 0.0388, + "step": 65 + }, + { + "epoch": 0.052789442111577684, + "grad_norm": 4.523151395245523, + "learning_rate": 1.9554455445544555e-06, + "loss": -0.0422, + "step": 66 + }, + { + "epoch": 0.05358928214357129, + "grad_norm": 4.574942149645985, + "learning_rate": 1.9537953795379537e-06, + "loss": 0.0052, + "step": 67 + }, + { + "epoch": 0.05438912217556489, + "grad_norm": 5.884212332415378, + "learning_rate": 1.952145214521452e-06, + "loss": -0.0622, + "step": 68 + }, + { + "epoch": 0.055188962207558485, + "grad_norm": 3.172106804857128, + "learning_rate": 1.95049504950495e-06, + "loss": -0.0936, + "step": 69 + }, + { + "epoch": 0.05598880223955209, + "grad_norm": 4.882587885458746, + "learning_rate": 1.948844884488449e-06, + "loss": -0.1034, + "step": 70 + }, + { + "epoch": 0.05678864227154569, + "grad_norm": 3.290096020906111, + "learning_rate": 1.947194719471947e-06, + "loss": -0.126, + "step": 71 + }, + { + "epoch": 0.057588482303539294, + "grad_norm": 4.175847937084437, + "learning_rate": 1.9455445544554453e-06, + "loss": -0.0295, + "step": 72 + }, + { + "epoch": 0.05838832233553289, + "grad_norm": 4.774862772782205, + "learning_rate": 1.943894389438944e-06, + "loss": -0.0737, + "step": 73 + }, + { + "epoch": 0.0591881623675265, + "grad_norm": 4.866413673374395, + "learning_rate": 1.942244224422442e-06, + "loss": 0.0154, + "step": 74 + }, + { + "epoch": 0.059988002399520096, + "grad_norm": 3.244110640100742, + "learning_rate": 1.9405940594059404e-06, + "loss": -0.0876, + "step": 75 + }, + { + "epoch": 0.060787842431513694, + "grad_norm": 4.94642971249312, + "learning_rate": 1.938943894389439e-06, + "loss": -0.0634, + "step": 76 + }, + { + "epoch": 0.0615876824635073, + "grad_norm": 3.1477348357592705, + "learning_rate": 1.9372937293729373e-06, + "loss": -0.0934, + "step": 77 + }, + { + "epoch": 0.0623875224955009, + "grad_norm": 4.180278871715678, + "learning_rate": 1.9356435643564355e-06, + "loss": -0.0158, + "step": 78 + }, + { + "epoch": 0.0631873625274945, + "grad_norm": 4.73751736841566, + "learning_rate": 1.933993399339934e-06, + "loss": 0.0419, + "step": 79 + }, + { + "epoch": 0.0639872025594881, + "grad_norm": 3.7318625198178577, + "learning_rate": 1.9323432343234324e-06, + "loss": -0.0981, + "step": 80 + }, + { + "epoch": 0.0647870425914817, + "grad_norm": 3.5344903982736016, + "learning_rate": 1.9306930693069306e-06, + "loss": 0.028, + "step": 81 + }, + { + "epoch": 0.0655868826234753, + "grad_norm": 3.6655427915390653, + "learning_rate": 1.9290429042904292e-06, + "loss": -0.0343, + "step": 82 + }, + { + "epoch": 0.06638672265546891, + "grad_norm": 3.8402537750787817, + "learning_rate": 1.9273927392739275e-06, + "loss": -0.0479, + "step": 83 + }, + { + "epoch": 0.06718656268746251, + "grad_norm": 6.194776167870759, + "learning_rate": 1.9257425742574257e-06, + "loss": 0.0046, + "step": 84 + }, + { + "epoch": 0.06798640271945611, + "grad_norm": 5.366879383554931, + "learning_rate": 1.924092409240924e-06, + "loss": -0.1113, + "step": 85 + }, + { + "epoch": 0.0687862427514497, + "grad_norm": 3.9997732575047547, + "learning_rate": 1.922442244224422e-06, + "loss": 0.0446, + "step": 86 + }, + { + "epoch": 0.06958608278344332, + "grad_norm": 9.73377697425672, + "learning_rate": 1.920792079207921e-06, + "loss": -0.0569, + "step": 87 + }, + { + "epoch": 0.07038592281543692, + "grad_norm": 5.0689420802437875, + "learning_rate": 1.919141914191419e-06, + "loss": -0.0352, + "step": 88 + }, + { + "epoch": 0.07118576284743051, + "grad_norm": 8.98640262446026, + "learning_rate": 1.9174917491749173e-06, + "loss": 0.041, + "step": 89 + }, + { + "epoch": 0.07198560287942411, + "grad_norm": 5.63457538673664, + "learning_rate": 1.9158415841584155e-06, + "loss": -0.0193, + "step": 90 + }, + { + "epoch": 0.07278544291141771, + "grad_norm": 4.290130537843607, + "learning_rate": 1.914191419141914e-06, + "loss": -0.0742, + "step": 91 + }, + { + "epoch": 0.07358528294341132, + "grad_norm": 4.0945792486692465, + "learning_rate": 1.9125412541254124e-06, + "loss": 0.029, + "step": 92 + }, + { + "epoch": 0.07438512297540492, + "grad_norm": 4.96670528541929, + "learning_rate": 1.9108910891089106e-06, + "loss": -0.1134, + "step": 93 + }, + { + "epoch": 0.07518496300739852, + "grad_norm": 5.027466862141088, + "learning_rate": 1.9092409240924093e-06, + "loss": -0.0508, + "step": 94 + }, + { + "epoch": 0.07598480303939212, + "grad_norm": 9.573722686429775, + "learning_rate": 1.9075907590759075e-06, + "loss": -0.1592, + "step": 95 + }, + { + "epoch": 0.07678464307138572, + "grad_norm": 5.764961349212166, + "learning_rate": 1.9059405940594057e-06, + "loss": -0.0122, + "step": 96 + }, + { + "epoch": 0.07758448310337933, + "grad_norm": 3.3502664260820247, + "learning_rate": 1.9042904290429044e-06, + "loss": -0.2179, + "step": 97 + }, + { + "epoch": 0.07838432313537293, + "grad_norm": 3.967491851586746, + "learning_rate": 1.9026402640264026e-06, + "loss": -0.0234, + "step": 98 + }, + { + "epoch": 0.07918416316736653, + "grad_norm": 4.552540817957586, + "learning_rate": 1.9009900990099008e-06, + "loss": -0.1591, + "step": 99 + }, + { + "epoch": 0.07998400319936012, + "grad_norm": 10.823710544953496, + "learning_rate": 1.8993399339933993e-06, + "loss": 0.0374, + "step": 100 + }, + { + "epoch": 0.08078384323135374, + "grad_norm": 4.806468007236691, + "learning_rate": 1.8976897689768975e-06, + "loss": -0.0565, + "step": 101 + }, + { + "epoch": 0.08158368326334733, + "grad_norm": 3.634545747480329, + "learning_rate": 1.896039603960396e-06, + "loss": -0.0762, + "step": 102 + }, + { + "epoch": 0.08238352329534093, + "grad_norm": 3.2837047295849597, + "learning_rate": 1.8943894389438944e-06, + "loss": -0.0491, + "step": 103 + }, + { + "epoch": 0.08318336332733453, + "grad_norm": 5.176653817957751, + "learning_rate": 1.8927392739273926e-06, + "loss": -0.1934, + "step": 104 + }, + { + "epoch": 0.08398320335932813, + "grad_norm": 6.107024303996945, + "learning_rate": 1.8910891089108908e-06, + "loss": -0.1129, + "step": 105 + }, + { + "epoch": 0.08478304339132174, + "grad_norm": 4.489176343037952, + "learning_rate": 1.8894389438943895e-06, + "loss": 0.0883, + "step": 106 + }, + { + "epoch": 0.08558288342331534, + "grad_norm": 4.318302618280909, + "learning_rate": 1.8877887788778877e-06, + "loss": -0.1609, + "step": 107 + }, + { + "epoch": 0.08638272345530894, + "grad_norm": 4.634209202008312, + "learning_rate": 1.886138613861386e-06, + "loss": -0.0457, + "step": 108 + }, + { + "epoch": 0.08718256348730254, + "grad_norm": 3.630881832190838, + "learning_rate": 1.8844884488448844e-06, + "loss": -0.1382, + "step": 109 + }, + { + "epoch": 0.08798240351929613, + "grad_norm": 3.886065281514502, + "learning_rate": 1.8828382838283828e-06, + "loss": -0.0535, + "step": 110 + }, + { + "epoch": 0.08878224355128975, + "grad_norm": 3.647392695144741, + "learning_rate": 1.881188118811881e-06, + "loss": -0.0809, + "step": 111 + }, + { + "epoch": 0.08958208358328335, + "grad_norm": 3.9753438884802463, + "learning_rate": 1.8795379537953795e-06, + "loss": -0.0791, + "step": 112 + }, + { + "epoch": 0.09038192361527694, + "grad_norm": 4.473252382488765, + "learning_rate": 1.8778877887788777e-06, + "loss": -0.0723, + "step": 113 + }, + { + "epoch": 0.09118176364727054, + "grad_norm": 4.928253206993449, + "learning_rate": 1.876237623762376e-06, + "loss": 0.0125, + "step": 114 + }, + { + "epoch": 0.09198160367926415, + "grad_norm": 4.557945800338486, + "learning_rate": 1.8745874587458746e-06, + "loss": -0.0889, + "step": 115 + }, + { + "epoch": 0.09278144371125775, + "grad_norm": 5.830924417742841, + "learning_rate": 1.8729372937293728e-06, + "loss": 0.0504, + "step": 116 + }, + { + "epoch": 0.09358128374325135, + "grad_norm": 6.4722171650631655, + "learning_rate": 1.8712871287128713e-06, + "loss": -0.008, + "step": 117 + }, + { + "epoch": 0.09438112377524495, + "grad_norm": 3.1676413558574428, + "learning_rate": 1.8696369636963695e-06, + "loss": -0.0483, + "step": 118 + }, + { + "epoch": 0.09518096380723855, + "grad_norm": 5.1310816710504845, + "learning_rate": 1.867986798679868e-06, + "loss": -0.0513, + "step": 119 + }, + { + "epoch": 0.09598080383923216, + "grad_norm": 3.620114816562482, + "learning_rate": 1.8663366336633664e-06, + "loss": -0.1453, + "step": 120 + }, + { + "epoch": 0.09678064387122576, + "grad_norm": 6.0676794834569865, + "learning_rate": 1.8646864686468646e-06, + "loss": -0.0194, + "step": 121 + }, + { + "epoch": 0.09758048390321936, + "grad_norm": 6.414733331241253, + "learning_rate": 1.8630363036303628e-06, + "loss": -0.0488, + "step": 122 + }, + { + "epoch": 0.09838032393521295, + "grad_norm": 4.6846628376767905, + "learning_rate": 1.8613861386138615e-06, + "loss": -0.0195, + "step": 123 + }, + { + "epoch": 0.09918016396720655, + "grad_norm": 3.235246476419315, + "learning_rate": 1.8597359735973597e-06, + "loss": -0.0942, + "step": 124 + }, + { + "epoch": 0.09998000399920016, + "grad_norm": 5.3470459527801495, + "learning_rate": 1.858085808580858e-06, + "loss": 0.0276, + "step": 125 + }, + { + "epoch": 0.10077984403119376, + "grad_norm": 3.9287996597379995, + "learning_rate": 1.8564356435643564e-06, + "loss": 0.0306, + "step": 126 + }, + { + "epoch": 0.10157968406318736, + "grad_norm": 4.995425229535215, + "learning_rate": 1.8547854785478546e-06, + "loss": 0.0087, + "step": 127 + }, + { + "epoch": 0.10237952409518096, + "grad_norm": 4.573732944820577, + "learning_rate": 1.853135313531353e-06, + "loss": -0.1424, + "step": 128 + }, + { + "epoch": 0.10317936412717456, + "grad_norm": 4.55020470630308, + "learning_rate": 1.8514851485148515e-06, + "loss": -0.0559, + "step": 129 + }, + { + "epoch": 0.10397920415916817, + "grad_norm": 4.41698840906731, + "learning_rate": 1.8498349834983497e-06, + "loss": -0.0188, + "step": 130 + }, + { + "epoch": 0.10477904419116177, + "grad_norm": 5.223224115420677, + "learning_rate": 1.848184818481848e-06, + "loss": -0.1098, + "step": 131 + }, + { + "epoch": 0.10557888422315537, + "grad_norm": 3.8011698979898005, + "learning_rate": 1.8465346534653466e-06, + "loss": -0.0328, + "step": 132 + }, + { + "epoch": 0.10637872425514897, + "grad_norm": 3.1746565925932835, + "learning_rate": 1.8448844884488448e-06, + "loss": -0.0525, + "step": 133 + }, + { + "epoch": 0.10717856428714258, + "grad_norm": 3.9995360105342903, + "learning_rate": 1.843234323432343e-06, + "loss": -0.0665, + "step": 134 + }, + { + "epoch": 0.10797840431913618, + "grad_norm": 4.722040708955319, + "learning_rate": 1.8415841584158415e-06, + "loss": -0.1398, + "step": 135 + }, + { + "epoch": 0.10877824435112977, + "grad_norm": 4.002530013096379, + "learning_rate": 1.83993399339934e-06, + "loss": -0.0609, + "step": 136 + }, + { + "epoch": 0.10957808438312337, + "grad_norm": 4.23131853552439, + "learning_rate": 1.8382838283828382e-06, + "loss": -0.0109, + "step": 137 + }, + { + "epoch": 0.11037792441511697, + "grad_norm": 5.324154803758963, + "learning_rate": 1.8366336633663366e-06, + "loss": -0.0251, + "step": 138 + }, + { + "epoch": 0.11117776444711058, + "grad_norm": 4.297402311394241, + "learning_rate": 1.8349834983498348e-06, + "loss": -0.0589, + "step": 139 + }, + { + "epoch": 0.11197760447910418, + "grad_norm": 4.454960816079389, + "learning_rate": 1.833333333333333e-06, + "loss": -0.1049, + "step": 140 + }, + { + "epoch": 0.11277744451109778, + "grad_norm": 4.709849875744532, + "learning_rate": 1.8316831683168317e-06, + "loss": -0.0279, + "step": 141 + }, + { + "epoch": 0.11357728454309138, + "grad_norm": 3.9184959414442724, + "learning_rate": 1.83003300330033e-06, + "loss": -0.1333, + "step": 142 + }, + { + "epoch": 0.11437712457508498, + "grad_norm": 4.362164005140024, + "learning_rate": 1.8283828382838282e-06, + "loss": -0.0821, + "step": 143 + }, + { + "epoch": 0.11517696460707859, + "grad_norm": 3.814336740776002, + "learning_rate": 1.8267326732673266e-06, + "loss": -0.0764, + "step": 144 + }, + { + "epoch": 0.11597680463907219, + "grad_norm": 4.1087265373281925, + "learning_rate": 1.825082508250825e-06, + "loss": 0.0595, + "step": 145 + }, + { + "epoch": 0.11677664467106579, + "grad_norm": 5.05463448309474, + "learning_rate": 1.8234323432343233e-06, + "loss": -0.0749, + "step": 146 + }, + { + "epoch": 0.11757648470305938, + "grad_norm": 7.009438010420224, + "learning_rate": 1.8217821782178217e-06, + "loss": -0.1623, + "step": 147 + }, + { + "epoch": 0.118376324735053, + "grad_norm": 5.86862518535322, + "learning_rate": 1.82013201320132e-06, + "loss": -0.1914, + "step": 148 + }, + { + "epoch": 0.1191761647670466, + "grad_norm": 8.568812361586986, + "learning_rate": 1.8184818481848184e-06, + "loss": -0.0496, + "step": 149 + }, + { + "epoch": 0.11997600479904019, + "grad_norm": 9.02774053582229, + "learning_rate": 1.8168316831683168e-06, + "loss": -0.022, + "step": 150 + }, + { + "epoch": 0.12077584483103379, + "grad_norm": 5.51491933312306, + "learning_rate": 1.815181518151815e-06, + "loss": -0.1322, + "step": 151 + }, + { + "epoch": 0.12157568486302739, + "grad_norm": 5.304215018308479, + "learning_rate": 1.8135313531353133e-06, + "loss": -0.0676, + "step": 152 + }, + { + "epoch": 0.122375524895021, + "grad_norm": 3.9922542678415565, + "learning_rate": 1.811881188118812e-06, + "loss": 0.0184, + "step": 153 + }, + { + "epoch": 0.1231753649270146, + "grad_norm": 4.724197779779715, + "learning_rate": 1.8102310231023102e-06, + "loss": -0.1204, + "step": 154 + }, + { + "epoch": 0.1239752049590082, + "grad_norm": 5.76455405608935, + "learning_rate": 1.8085808580858084e-06, + "loss": -0.1421, + "step": 155 + }, + { + "epoch": 0.1247750449910018, + "grad_norm": 10.161753692062435, + "learning_rate": 1.8069306930693068e-06, + "loss": 0.0592, + "step": 156 + }, + { + "epoch": 0.1255748850229954, + "grad_norm": 3.667923249601308, + "learning_rate": 1.805280528052805e-06, + "loss": -0.0988, + "step": 157 + }, + { + "epoch": 0.126374725054989, + "grad_norm": 4.515737987543515, + "learning_rate": 1.8036303630363035e-06, + "loss": 0.0522, + "step": 158 + }, + { + "epoch": 0.1271745650869826, + "grad_norm": 3.336996513422035, + "learning_rate": 1.801980198019802e-06, + "loss": -0.09, + "step": 159 + }, + { + "epoch": 0.1279744051189762, + "grad_norm": 3.969953099317271, + "learning_rate": 1.8003300330033002e-06, + "loss": -0.0435, + "step": 160 + }, + { + "epoch": 0.1287742451509698, + "grad_norm": 4.549949209747214, + "learning_rate": 1.7986798679867984e-06, + "loss": -0.0613, + "step": 161 + }, + { + "epoch": 0.1295740851829634, + "grad_norm": 3.759639050223288, + "learning_rate": 1.797029702970297e-06, + "loss": -0.0784, + "step": 162 + }, + { + "epoch": 0.130373925214957, + "grad_norm": 4.619365249559499, + "learning_rate": 1.7953795379537953e-06, + "loss": -0.0111, + "step": 163 + }, + { + "epoch": 0.1311737652469506, + "grad_norm": 4.114791027895229, + "learning_rate": 1.7937293729372935e-06, + "loss": -0.0327, + "step": 164 + }, + { + "epoch": 0.13197360527894422, + "grad_norm": 3.8956026767168836, + "learning_rate": 1.792079207920792e-06, + "loss": -0.1106, + "step": 165 + }, + { + "epoch": 0.13277344531093782, + "grad_norm": 4.818435179721396, + "learning_rate": 1.7904290429042904e-06, + "loss": -0.0034, + "step": 166 + }, + { + "epoch": 0.13357328534293142, + "grad_norm": 6.763152130893218, + "learning_rate": 1.7887788778877888e-06, + "loss": 0.0651, + "step": 167 + }, + { + "epoch": 0.13437312537492502, + "grad_norm": 4.097132792098502, + "learning_rate": 1.787128712871287e-06, + "loss": -0.0269, + "step": 168 + }, + { + "epoch": 0.13517296540691862, + "grad_norm": 4.706830462846675, + "learning_rate": 1.7854785478547853e-06, + "loss": 0.0558, + "step": 169 + }, + { + "epoch": 0.13597280543891221, + "grad_norm": 4.254134691338051, + "learning_rate": 1.783828382838284e-06, + "loss": -0.0046, + "step": 170 + }, + { + "epoch": 0.1367726454709058, + "grad_norm": 5.457939580250951, + "learning_rate": 1.7821782178217822e-06, + "loss": -0.0379, + "step": 171 + }, + { + "epoch": 0.1375724855028994, + "grad_norm": 3.2577166280201544, + "learning_rate": 1.7805280528052804e-06, + "loss": -0.0993, + "step": 172 + }, + { + "epoch": 0.138372325534893, + "grad_norm": 5.551040160162887, + "learning_rate": 1.7788778877887789e-06, + "loss": -0.0543, + "step": 173 + }, + { + "epoch": 0.13917216556688664, + "grad_norm": 3.69149537962834, + "learning_rate": 1.777227722772277e-06, + "loss": -0.0443, + "step": 174 + }, + { + "epoch": 0.13997200559888023, + "grad_norm": 4.4643620642536455, + "learning_rate": 1.7755775577557755e-06, + "loss": -0.0449, + "step": 175 + }, + { + "epoch": 0.14077184563087383, + "grad_norm": 3.5240643064279977, + "learning_rate": 1.773927392739274e-06, + "loss": -0.0928, + "step": 176 + }, + { + "epoch": 0.14157168566286743, + "grad_norm": 5.981016645991625, + "learning_rate": 1.7722772277227722e-06, + "loss": 0.0686, + "step": 177 + }, + { + "epoch": 0.14237152569486103, + "grad_norm": 4.336791468199441, + "learning_rate": 1.7706270627062704e-06, + "loss": -0.0617, + "step": 178 + }, + { + "epoch": 0.14317136572685463, + "grad_norm": 3.678032699373225, + "learning_rate": 1.768976897689769e-06, + "loss": -0.1058, + "step": 179 + }, + { + "epoch": 0.14397120575884823, + "grad_norm": 8.431078918847803, + "learning_rate": 1.7673267326732673e-06, + "loss": -0.034, + "step": 180 + }, + { + "epoch": 0.14477104579084182, + "grad_norm": 4.90238148952107, + "learning_rate": 1.7656765676567655e-06, + "loss": 0.0263, + "step": 181 + }, + { + "epoch": 0.14557088582283542, + "grad_norm": 4.1587161441545115, + "learning_rate": 1.764026402640264e-06, + "loss": -0.0128, + "step": 182 + }, + { + "epoch": 0.14637072585482905, + "grad_norm": 4.255313468888732, + "learning_rate": 1.7623762376237624e-06, + "loss": -0.0138, + "step": 183 + }, + { + "epoch": 0.14717056588682265, + "grad_norm": 6.24454443290786, + "learning_rate": 1.7607260726072606e-06, + "loss": -0.0941, + "step": 184 + }, + { + "epoch": 0.14797040591881624, + "grad_norm": 4.293655354485335, + "learning_rate": 1.759075907590759e-06, + "loss": -0.0621, + "step": 185 + }, + { + "epoch": 0.14877024595080984, + "grad_norm": 4.224321769134034, + "learning_rate": 1.7574257425742573e-06, + "loss": -0.0214, + "step": 186 + }, + { + "epoch": 0.14957008598280344, + "grad_norm": 3.7629471117165827, + "learning_rate": 1.7557755775577555e-06, + "loss": -0.0735, + "step": 187 + }, + { + "epoch": 0.15036992601479704, + "grad_norm": 4.511985288731285, + "learning_rate": 1.7541254125412542e-06, + "loss": -0.1376, + "step": 188 + }, + { + "epoch": 0.15116976604679064, + "grad_norm": 4.701449783409153, + "learning_rate": 1.7524752475247524e-06, + "loss": -0.1208, + "step": 189 + }, + { + "epoch": 0.15196960607878424, + "grad_norm": 7.169693891516351, + "learning_rate": 1.7508250825082506e-06, + "loss": 0.0103, + "step": 190 + }, + { + "epoch": 0.15276944611077783, + "grad_norm": 3.6302391864591126, + "learning_rate": 1.749174917491749e-06, + "loss": -0.036, + "step": 191 + }, + { + "epoch": 0.15356928614277143, + "grad_norm": 8.15707311459662, + "learning_rate": 1.7475247524752475e-06, + "loss": -0.0226, + "step": 192 + }, + { + "epoch": 0.15436912617476506, + "grad_norm": 4.001526302961896, + "learning_rate": 1.7458745874587458e-06, + "loss": -0.0587, + "step": 193 + }, + { + "epoch": 0.15516896620675866, + "grad_norm": 4.468601251007179, + "learning_rate": 1.7442244224422442e-06, + "loss": -0.1388, + "step": 194 + }, + { + "epoch": 0.15596880623875226, + "grad_norm": 4.107118632559092, + "learning_rate": 1.7425742574257424e-06, + "loss": -0.0961, + "step": 195 + }, + { + "epoch": 0.15676864627074585, + "grad_norm": 3.4961373949789665, + "learning_rate": 1.7409240924092409e-06, + "loss": -0.0567, + "step": 196 + }, + { + "epoch": 0.15756848630273945, + "grad_norm": 4.144654814148264, + "learning_rate": 1.7392739273927393e-06, + "loss": -0.1309, + "step": 197 + }, + { + "epoch": 0.15836832633473305, + "grad_norm": 3.6625054473315664, + "learning_rate": 1.7376237623762375e-06, + "loss": -0.0208, + "step": 198 + }, + { + "epoch": 0.15916816636672665, + "grad_norm": 4.664494531197071, + "learning_rate": 1.7359735973597358e-06, + "loss": 0.0178, + "step": 199 + }, + { + "epoch": 0.15996800639872025, + "grad_norm": 6.383022272218445, + "learning_rate": 1.7343234323432342e-06, + "loss": -0.0616, + "step": 200 + }, + { + "epoch": 0.16076784643071385, + "grad_norm": 5.505206158317875, + "learning_rate": 1.7326732673267326e-06, + "loss": -0.0452, + "step": 201 + }, + { + "epoch": 0.16156768646270747, + "grad_norm": 3.5601606217056765, + "learning_rate": 1.7310231023102309e-06, + "loss": 0.0225, + "step": 202 + }, + { + "epoch": 0.16236752649470107, + "grad_norm": 4.408138222273653, + "learning_rate": 1.7293729372937293e-06, + "loss": -0.1139, + "step": 203 + }, + { + "epoch": 0.16316736652669467, + "grad_norm": 3.2562884601218087, + "learning_rate": 1.7277227722772275e-06, + "loss": 0.0087, + "step": 204 + }, + { + "epoch": 0.16396720655868827, + "grad_norm": 4.350781355214131, + "learning_rate": 1.726072607260726e-06, + "loss": -0.0885, + "step": 205 + }, + { + "epoch": 0.16476704659068186, + "grad_norm": 3.3568949216522475, + "learning_rate": 1.7244224422442244e-06, + "loss": -0.0134, + "step": 206 + }, + { + "epoch": 0.16556688662267546, + "grad_norm": 6.798474914966856, + "learning_rate": 1.7227722772277227e-06, + "loss": -0.0945, + "step": 207 + }, + { + "epoch": 0.16636672665466906, + "grad_norm": 4.577665859282248, + "learning_rate": 1.7211221122112209e-06, + "loss": -0.1607, + "step": 208 + }, + { + "epoch": 0.16716656668666266, + "grad_norm": 6.460632243204499, + "learning_rate": 1.7194719471947195e-06, + "loss": -0.0235, + "step": 209 + }, + { + "epoch": 0.16796640671865626, + "grad_norm": 4.306267256349224, + "learning_rate": 1.7178217821782178e-06, + "loss": -0.0059, + "step": 210 + }, + { + "epoch": 0.16876624675064986, + "grad_norm": 3.0483507543879105, + "learning_rate": 1.716171617161716e-06, + "loss": -0.111, + "step": 211 + }, + { + "epoch": 0.16956608678264348, + "grad_norm": 5.737336519193611, + "learning_rate": 1.7145214521452144e-06, + "loss": 0.018, + "step": 212 + }, + { + "epoch": 0.17036592681463708, + "grad_norm": 3.7845990191052734, + "learning_rate": 1.7128712871287127e-06, + "loss": -0.0926, + "step": 213 + }, + { + "epoch": 0.17116576684663068, + "grad_norm": 3.669531800966666, + "learning_rate": 1.711221122112211e-06, + "loss": -0.0776, + "step": 214 + }, + { + "epoch": 0.17196560687862428, + "grad_norm": 4.005323920134325, + "learning_rate": 1.7095709570957095e-06, + "loss": -0.0399, + "step": 215 + }, + { + "epoch": 0.17276544691061788, + "grad_norm": 2.8598435648570186, + "learning_rate": 1.7079207920792078e-06, + "loss": -0.0548, + "step": 216 + }, + { + "epoch": 0.17356528694261147, + "grad_norm": 4.139220262158334, + "learning_rate": 1.7062706270627062e-06, + "loss": 0.0686, + "step": 217 + }, + { + "epoch": 0.17436512697460507, + "grad_norm": 4.988425208682803, + "learning_rate": 1.7046204620462046e-06, + "loss": -0.0028, + "step": 218 + }, + { + "epoch": 0.17516496700659867, + "grad_norm": 3.4806124639328164, + "learning_rate": 1.7029702970297029e-06, + "loss": -0.0914, + "step": 219 + }, + { + "epoch": 0.17596480703859227, + "grad_norm": 6.013581164060899, + "learning_rate": 1.7013201320132013e-06, + "loss": -0.0207, + "step": 220 + }, + { + "epoch": 0.1767646470705859, + "grad_norm": 6.048232130793178, + "learning_rate": 1.6996699669966995e-06, + "loss": 0.0301, + "step": 221 + }, + { + "epoch": 0.1775644871025795, + "grad_norm": 4.206288334982141, + "learning_rate": 1.698019801980198e-06, + "loss": -0.0503, + "step": 222 + }, + { + "epoch": 0.1783643271345731, + "grad_norm": 4.383148234898824, + "learning_rate": 1.6963696369636964e-06, + "loss": 0.0425, + "step": 223 + }, + { + "epoch": 0.1791641671665667, + "grad_norm": 4.013900208301416, + "learning_rate": 1.6947194719471947e-06, + "loss": -0.0873, + "step": 224 + }, + { + "epoch": 0.1799640071985603, + "grad_norm": 3.729807083009099, + "learning_rate": 1.6930693069306929e-06, + "loss": -0.0124, + "step": 225 + }, + { + "epoch": 0.1807638472305539, + "grad_norm": 4.739805223350201, + "learning_rate": 1.6914191419141915e-06, + "loss": -0.0965, + "step": 226 + }, + { + "epoch": 0.18156368726254749, + "grad_norm": 3.684225018193131, + "learning_rate": 1.6897689768976898e-06, + "loss": -0.0899, + "step": 227 + }, + { + "epoch": 0.18236352729454108, + "grad_norm": 4.647773349022286, + "learning_rate": 1.688118811881188e-06, + "loss": -0.1433, + "step": 228 + }, + { + "epoch": 0.18316336732653468, + "grad_norm": 4.314549940205055, + "learning_rate": 1.6864686468646864e-06, + "loss": -0.0987, + "step": 229 + }, + { + "epoch": 0.1839632073585283, + "grad_norm": 6.602144366923463, + "learning_rate": 1.6848184818481847e-06, + "loss": -0.0855, + "step": 230 + }, + { + "epoch": 0.1847630473905219, + "grad_norm": 4.611073533381248, + "learning_rate": 1.683168316831683e-06, + "loss": -0.1262, + "step": 231 + }, + { + "epoch": 0.1855628874225155, + "grad_norm": 4.9020247032635655, + "learning_rate": 1.6815181518151815e-06, + "loss": -0.1706, + "step": 232 + }, + { + "epoch": 0.1863627274545091, + "grad_norm": 4.16092482080365, + "learning_rate": 1.6798679867986798e-06, + "loss": -0.009, + "step": 233 + }, + { + "epoch": 0.1871625674865027, + "grad_norm": 3.5906088992190277, + "learning_rate": 1.678217821782178e-06, + "loss": 0.0999, + "step": 234 + }, + { + "epoch": 0.1879624075184963, + "grad_norm": 4.005270108795308, + "learning_rate": 1.6765676567656767e-06, + "loss": -0.0993, + "step": 235 + }, + { + "epoch": 0.1887622475504899, + "grad_norm": 6.563769408476828, + "learning_rate": 1.6749174917491749e-06, + "loss": -0.0193, + "step": 236 + }, + { + "epoch": 0.1895620875824835, + "grad_norm": 3.380070162840573, + "learning_rate": 1.6732673267326731e-06, + "loss": -0.0809, + "step": 237 + }, + { + "epoch": 0.1903619276144771, + "grad_norm": 4.931354996369631, + "learning_rate": 1.6716171617161716e-06, + "loss": -0.0658, + "step": 238 + }, + { + "epoch": 0.1911617676464707, + "grad_norm": 4.710207450817461, + "learning_rate": 1.66996699669967e-06, + "loss": -0.1167, + "step": 239 + }, + { + "epoch": 0.19196160767846432, + "grad_norm": 3.361685025176525, + "learning_rate": 1.6683168316831682e-06, + "loss": -0.1245, + "step": 240 + }, + { + "epoch": 0.19276144771045792, + "grad_norm": 3.767676589968502, + "learning_rate": 1.6666666666666667e-06, + "loss": -0.1638, + "step": 241 + }, + { + "epoch": 0.19356128774245152, + "grad_norm": 3.7460434704410575, + "learning_rate": 1.6650165016501649e-06, + "loss": -0.1207, + "step": 242 + }, + { + "epoch": 0.1943611277744451, + "grad_norm": 3.7655100191535413, + "learning_rate": 1.6633663366336631e-06, + "loss": 0.0038, + "step": 243 + }, + { + "epoch": 0.1951609678064387, + "grad_norm": 4.3387270640143685, + "learning_rate": 1.6617161716171618e-06, + "loss": -0.0234, + "step": 244 + }, + { + "epoch": 0.1959608078384323, + "grad_norm": 4.729420704117281, + "learning_rate": 1.66006600660066e-06, + "loss": -0.1446, + "step": 245 + }, + { + "epoch": 0.1967606478704259, + "grad_norm": 11.46352939122658, + "learning_rate": 1.6584158415841582e-06, + "loss": -0.0447, + "step": 246 + }, + { + "epoch": 0.1975604879024195, + "grad_norm": 4.6392172787916355, + "learning_rate": 1.6567656765676567e-06, + "loss": -0.1279, + "step": 247 + }, + { + "epoch": 0.1983603279344131, + "grad_norm": 7.81945174107532, + "learning_rate": 1.6551155115511551e-06, + "loss": -0.1788, + "step": 248 + }, + { + "epoch": 0.19916016796640673, + "grad_norm": 4.257894476705108, + "learning_rate": 1.6534653465346533e-06, + "loss": -0.0386, + "step": 249 + }, + { + "epoch": 0.19996000799840033, + "grad_norm": 3.9255930993081094, + "learning_rate": 1.6518151815181518e-06, + "loss": -0.0204, + "step": 250 + }, + { + "epoch": 0.20075984803039393, + "grad_norm": 8.61324493331346, + "learning_rate": 1.65016501650165e-06, + "loss": 0.0872, + "step": 251 + }, + { + "epoch": 0.20155968806238753, + "grad_norm": 3.7965562474708525, + "learning_rate": 1.6485148514851484e-06, + "loss": -0.0834, + "step": 252 + }, + { + "epoch": 0.20235952809438112, + "grad_norm": 4.327305685228189, + "learning_rate": 1.6468646864686469e-06, + "loss": -0.0639, + "step": 253 + }, + { + "epoch": 0.20315936812637472, + "grad_norm": 3.461407011747761, + "learning_rate": 1.6452145214521451e-06, + "loss": -0.1243, + "step": 254 + }, + { + "epoch": 0.20395920815836832, + "grad_norm": 5.164636623307167, + "learning_rate": 1.6435643564356433e-06, + "loss": -0.0318, + "step": 255 + }, + { + "epoch": 0.20475904819036192, + "grad_norm": 4.411537190722961, + "learning_rate": 1.641914191419142e-06, + "loss": -0.1533, + "step": 256 + }, + { + "epoch": 0.20555888822235552, + "grad_norm": 4.832045065537041, + "learning_rate": 1.6402640264026402e-06, + "loss": -0.0931, + "step": 257 + }, + { + "epoch": 0.20635872825434912, + "grad_norm": 4.133203614158014, + "learning_rate": 1.6386138613861385e-06, + "loss": -0.0951, + "step": 258 + }, + { + "epoch": 0.20715856828634274, + "grad_norm": 4.649558155027992, + "learning_rate": 1.636963696369637e-06, + "loss": -0.0459, + "step": 259 + }, + { + "epoch": 0.20795840831833634, + "grad_norm": 3.7050574045200126, + "learning_rate": 1.6353135313531351e-06, + "loss": -0.1324, + "step": 260 + }, + { + "epoch": 0.20875824835032994, + "grad_norm": 4.406446520163225, + "learning_rate": 1.6336633663366336e-06, + "loss": -0.0903, + "step": 261 + }, + { + "epoch": 0.20955808838232354, + "grad_norm": 4.150658998676116, + "learning_rate": 1.632013201320132e-06, + "loss": 0.052, + "step": 262 + }, + { + "epoch": 0.21035792841431714, + "grad_norm": 4.637643800546993, + "learning_rate": 1.6303630363036302e-06, + "loss": -0.1008, + "step": 263 + }, + { + "epoch": 0.21115776844631073, + "grad_norm": 4.356392007316505, + "learning_rate": 1.6287128712871285e-06, + "loss": -0.0666, + "step": 264 + }, + { + "epoch": 0.21195760847830433, + "grad_norm": 4.2232050225914675, + "learning_rate": 1.6270627062706271e-06, + "loss": -0.034, + "step": 265 + }, + { + "epoch": 0.21275744851029793, + "grad_norm": 4.621467065766651, + "learning_rate": 1.6254125412541253e-06, + "loss": -0.1166, + "step": 266 + }, + { + "epoch": 0.21355728854229153, + "grad_norm": 3.6996328893459385, + "learning_rate": 1.6237623762376238e-06, + "loss": -0.0651, + "step": 267 + }, + { + "epoch": 0.21435712857428516, + "grad_norm": 6.476232651431598, + "learning_rate": 1.622112211221122e-06, + "loss": -0.087, + "step": 268 + }, + { + "epoch": 0.21515696860627875, + "grad_norm": 3.3774511125642115, + "learning_rate": 1.6204620462046205e-06, + "loss": -0.0541, + "step": 269 + }, + { + "epoch": 0.21595680863827235, + "grad_norm": 8.039893341875281, + "learning_rate": 1.6188118811881189e-06, + "loss": 0.0426, + "step": 270 + }, + { + "epoch": 0.21675664867026595, + "grad_norm": 3.45288250792369, + "learning_rate": 1.6171617161716171e-06, + "loss": -0.071, + "step": 271 + }, + { + "epoch": 0.21755648870225955, + "grad_norm": 5.813145099240533, + "learning_rate": 1.6155115511551154e-06, + "loss": 0.0774, + "step": 272 + }, + { + "epoch": 0.21835632873425315, + "grad_norm": 3.4988010260216202, + "learning_rate": 1.6138613861386138e-06, + "loss": -0.0378, + "step": 273 + }, + { + "epoch": 0.21915616876624675, + "grad_norm": 4.136529473287242, + "learning_rate": 1.6122112211221122e-06, + "loss": 0.0215, + "step": 274 + }, + { + "epoch": 0.21995600879824034, + "grad_norm": 3.9538185204867884, + "learning_rate": 1.6105610561056105e-06, + "loss": -0.1014, + "step": 275 + }, + { + "epoch": 0.22075584883023394, + "grad_norm": 4.987429074808495, + "learning_rate": 1.608910891089109e-06, + "loss": -0.0326, + "step": 276 + }, + { + "epoch": 0.22155568886222757, + "grad_norm": 3.1011116987800595, + "learning_rate": 1.6072607260726071e-06, + "loss": -0.0742, + "step": 277 + }, + { + "epoch": 0.22235552889422117, + "grad_norm": 4.87646247250274, + "learning_rate": 1.6056105610561056e-06, + "loss": -0.0488, + "step": 278 + }, + { + "epoch": 0.22315536892621476, + "grad_norm": 5.406703510997709, + "learning_rate": 1.603960396039604e-06, + "loss": -0.0998, + "step": 279 + }, + { + "epoch": 0.22395520895820836, + "grad_norm": 4.199602090060885, + "learning_rate": 1.6023102310231022e-06, + "loss": 0.0787, + "step": 280 + }, + { + "epoch": 0.22475504899020196, + "grad_norm": 6.262166502823287, + "learning_rate": 1.6006600660066005e-06, + "loss": 0.0123, + "step": 281 + }, + { + "epoch": 0.22555488902219556, + "grad_norm": 5.4085402736640225, + "learning_rate": 1.5990099009900991e-06, + "loss": -0.1219, + "step": 282 + }, + { + "epoch": 0.22635472905418916, + "grad_norm": 14.35407252989058, + "learning_rate": 1.5973597359735973e-06, + "loss": -0.0427, + "step": 283 + }, + { + "epoch": 0.22715456908618276, + "grad_norm": 5.560237467243524, + "learning_rate": 1.5957095709570956e-06, + "loss": -0.0363, + "step": 284 + }, + { + "epoch": 0.22795440911817635, + "grad_norm": 5.376214533362693, + "learning_rate": 1.594059405940594e-06, + "loss": -0.1198, + "step": 285 + }, + { + "epoch": 0.22875424915016995, + "grad_norm": 7.872347430401011, + "learning_rate": 1.5924092409240922e-06, + "loss": 0.02, + "step": 286 + }, + { + "epoch": 0.22955408918216358, + "grad_norm": 4.079731942515135, + "learning_rate": 1.5907590759075907e-06, + "loss": -0.1465, + "step": 287 + }, + { + "epoch": 0.23035392921415718, + "grad_norm": 4.054081807256331, + "learning_rate": 1.5891089108910891e-06, + "loss": -0.097, + "step": 288 + }, + { + "epoch": 0.23115376924615078, + "grad_norm": 5.668828140611865, + "learning_rate": 1.5874587458745874e-06, + "loss": -0.0113, + "step": 289 + }, + { + "epoch": 0.23195360927814437, + "grad_norm": 4.222209049226612, + "learning_rate": 1.5858085808580856e-06, + "loss": -0.0565, + "step": 290 + }, + { + "epoch": 0.23275344931013797, + "grad_norm": 3.7308714963795735, + "learning_rate": 1.5841584158415842e-06, + "loss": -0.0311, + "step": 291 + }, + { + "epoch": 0.23355328934213157, + "grad_norm": 3.9384379405107914, + "learning_rate": 1.5825082508250825e-06, + "loss": -0.0921, + "step": 292 + }, + { + "epoch": 0.23435312937412517, + "grad_norm": 4.208635426370359, + "learning_rate": 1.5808580858085807e-06, + "loss": -0.1037, + "step": 293 + }, + { + "epoch": 0.23515296940611877, + "grad_norm": 4.471661666164002, + "learning_rate": 1.5792079207920791e-06, + "loss": -0.048, + "step": 294 + }, + { + "epoch": 0.23595280943811237, + "grad_norm": 5.922123322526879, + "learning_rate": 1.5775577557755776e-06, + "loss": -0.0822, + "step": 295 + }, + { + "epoch": 0.236752649470106, + "grad_norm": 3.9336004171911596, + "learning_rate": 1.5759075907590758e-06, + "loss": -0.0751, + "step": 296 + }, + { + "epoch": 0.2375524895020996, + "grad_norm": 2.9881202405051086, + "learning_rate": 1.5742574257425742e-06, + "loss": -0.0694, + "step": 297 + }, + { + "epoch": 0.2383523295340932, + "grad_norm": 7.04293625200489, + "learning_rate": 1.5726072607260725e-06, + "loss": -0.1209, + "step": 298 + }, + { + "epoch": 0.2391521695660868, + "grad_norm": 4.3791375350104165, + "learning_rate": 1.5709570957095707e-06, + "loss": -0.0704, + "step": 299 + }, + { + "epoch": 0.23995200959808038, + "grad_norm": 4.4299869604327835, + "learning_rate": 1.5693069306930694e-06, + "loss": 0.0578, + "step": 300 + }, + { + "epoch": 0.24075184963007398, + "grad_norm": 5.721847612449816, + "learning_rate": 1.5676567656765676e-06, + "loss": 0.0124, + "step": 301 + }, + { + "epoch": 0.24155168966206758, + "grad_norm": 4.346519849517093, + "learning_rate": 1.5660066006600658e-06, + "loss": -0.0676, + "step": 302 + }, + { + "epoch": 0.24235152969406118, + "grad_norm": 4.166900068739509, + "learning_rate": 1.5643564356435643e-06, + "loss": -0.0379, + "step": 303 + }, + { + "epoch": 0.24315136972605478, + "grad_norm": 4.171740126126224, + "learning_rate": 1.5627062706270627e-06, + "loss": -0.0369, + "step": 304 + }, + { + "epoch": 0.2439512097580484, + "grad_norm": 4.571373866809776, + "learning_rate": 1.561056105610561e-06, + "loss": -0.0423, + "step": 305 + }, + { + "epoch": 0.244751049790042, + "grad_norm": 4.687528076087793, + "learning_rate": 1.5594059405940594e-06, + "loss": -0.0427, + "step": 306 + }, + { + "epoch": 0.2455508898220356, + "grad_norm": 4.099266935733802, + "learning_rate": 1.5577557755775576e-06, + "loss": -0.1168, + "step": 307 + }, + { + "epoch": 0.2463507298540292, + "grad_norm": 4.76705500925925, + "learning_rate": 1.556105610561056e-06, + "loss": -0.0726, + "step": 308 + }, + { + "epoch": 0.2471505698860228, + "grad_norm": 7.726027050692815, + "learning_rate": 1.5544554455445545e-06, + "loss": -0.0858, + "step": 309 + }, + { + "epoch": 0.2479504099180164, + "grad_norm": 4.588817621333979, + "learning_rate": 1.5528052805280527e-06, + "loss": 0.0889, + "step": 310 + }, + { + "epoch": 0.24875024995001, + "grad_norm": 5.351566242300243, + "learning_rate": 1.551155115511551e-06, + "loss": -0.143, + "step": 311 + }, + { + "epoch": 0.2495500899820036, + "grad_norm": 6.279051438632601, + "learning_rate": 1.5495049504950496e-06, + "loss": 0.0312, + "step": 312 + }, + { + "epoch": 0.2503499300139972, + "grad_norm": 4.251123392069971, + "learning_rate": 1.5478547854785478e-06, + "loss": -0.0477, + "step": 313 + }, + { + "epoch": 0.2511497700459908, + "grad_norm": 4.255617580398947, + "learning_rate": 1.546204620462046e-06, + "loss": -0.1445, + "step": 314 + }, + { + "epoch": 0.2519496100779844, + "grad_norm": 3.979778076387235, + "learning_rate": 1.5445544554455445e-06, + "loss": -0.0937, + "step": 315 + }, + { + "epoch": 0.252749450109978, + "grad_norm": 5.547095237980292, + "learning_rate": 1.5429042904290427e-06, + "loss": -0.0091, + "step": 316 + }, + { + "epoch": 0.2535492901419716, + "grad_norm": 5.863554498962612, + "learning_rate": 1.5412541254125414e-06, + "loss": -0.0883, + "step": 317 + }, + { + "epoch": 0.2543491301739652, + "grad_norm": 3.9832799266815533, + "learning_rate": 1.5396039603960396e-06, + "loss": -0.1173, + "step": 318 + }, + { + "epoch": 0.2551489702059588, + "grad_norm": 4.961222194402448, + "learning_rate": 1.5379537953795378e-06, + "loss": -0.0647, + "step": 319 + }, + { + "epoch": 0.2559488102379524, + "grad_norm": 2.7901444246654945, + "learning_rate": 1.5363036303630363e-06, + "loss": -0.0873, + "step": 320 + }, + { + "epoch": 0.25674865026994603, + "grad_norm": 4.6616454131308265, + "learning_rate": 1.5346534653465347e-06, + "loss": -0.1283, + "step": 321 + }, + { + "epoch": 0.2575484903019396, + "grad_norm": 4.32603696177896, + "learning_rate": 1.533003300330033e-06, + "loss": -0.0748, + "step": 322 + }, + { + "epoch": 0.25834833033393323, + "grad_norm": 4.653928241866685, + "learning_rate": 1.5313531353135314e-06, + "loss": -0.1215, + "step": 323 + }, + { + "epoch": 0.2591481703659268, + "grad_norm": 4.476046494175142, + "learning_rate": 1.5297029702970296e-06, + "loss": -0.0247, + "step": 324 + }, + { + "epoch": 0.2599480103979204, + "grad_norm": 5.41605277862076, + "learning_rate": 1.528052805280528e-06, + "loss": 0.0055, + "step": 325 + }, + { + "epoch": 0.260747850429914, + "grad_norm": 7.359939974664472, + "learning_rate": 1.5264026402640265e-06, + "loss": -0.0994, + "step": 326 + }, + { + "epoch": 0.2615476904619076, + "grad_norm": 3.7953460503418794, + "learning_rate": 1.5247524752475247e-06, + "loss": 0.0777, + "step": 327 + }, + { + "epoch": 0.2623475304939012, + "grad_norm": 4.375620334787856, + "learning_rate": 1.523102310231023e-06, + "loss": -0.1532, + "step": 328 + }, + { + "epoch": 0.2631473705258948, + "grad_norm": 5.690054518936246, + "learning_rate": 1.5214521452145214e-06, + "loss": -0.0744, + "step": 329 + }, + { + "epoch": 0.26394721055788845, + "grad_norm": 4.095859129867475, + "learning_rate": 1.5198019801980198e-06, + "loss": -0.1342, + "step": 330 + }, + { + "epoch": 0.264747050589882, + "grad_norm": 5.261928086906211, + "learning_rate": 1.518151815181518e-06, + "loss": -0.0327, + "step": 331 + }, + { + "epoch": 0.26554689062187564, + "grad_norm": 3.723958703243353, + "learning_rate": 1.5165016501650165e-06, + "loss": -0.0919, + "step": 332 + }, + { + "epoch": 0.2663467306538692, + "grad_norm": 7.064342427249925, + "learning_rate": 1.5148514851485147e-06, + "loss": -0.3055, + "step": 333 + }, + { + "epoch": 0.26714657068586284, + "grad_norm": 5.094162064249706, + "learning_rate": 1.5132013201320131e-06, + "loss": -0.1551, + "step": 334 + }, + { + "epoch": 0.2679464107178564, + "grad_norm": 5.182464177568643, + "learning_rate": 1.5115511551155116e-06, + "loss": -0.1656, + "step": 335 + }, + { + "epoch": 0.26874625074985004, + "grad_norm": 4.205631232130195, + "learning_rate": 1.5099009900990098e-06, + "loss": -0.1339, + "step": 336 + }, + { + "epoch": 0.2695460907818436, + "grad_norm": 3.4595951551287234, + "learning_rate": 1.508250825082508e-06, + "loss": -0.0259, + "step": 337 + }, + { + "epoch": 0.27034593081383723, + "grad_norm": 4.040524953973991, + "learning_rate": 1.5066006600660067e-06, + "loss": 0.0424, + "step": 338 + }, + { + "epoch": 0.27114577084583086, + "grad_norm": 3.3792775209230044, + "learning_rate": 1.504950495049505e-06, + "loss": -0.0252, + "step": 339 + }, + { + "epoch": 0.27194561087782443, + "grad_norm": 5.329937489556339, + "learning_rate": 1.5033003300330032e-06, + "loss": -0.1064, + "step": 340 + }, + { + "epoch": 0.27274545090981805, + "grad_norm": 3.8366500907383, + "learning_rate": 1.5016501650165016e-06, + "loss": -0.1327, + "step": 341 + }, + { + "epoch": 0.2735452909418116, + "grad_norm": 3.4211397121327334, + "learning_rate": 1.5e-06, + "loss": -0.0469, + "step": 342 + }, + { + "epoch": 0.27434513097380525, + "grad_norm": 7.190396728605877, + "learning_rate": 1.4983498349834983e-06, + "loss": -0.0455, + "step": 343 + }, + { + "epoch": 0.2751449710057988, + "grad_norm": 5.208941899667468, + "learning_rate": 1.4966996699669967e-06, + "loss": -0.0118, + "step": 344 + }, + { + "epoch": 0.27594481103779245, + "grad_norm": 5.0666099160345635, + "learning_rate": 1.495049504950495e-06, + "loss": -0.08, + "step": 345 + }, + { + "epoch": 0.276744651069786, + "grad_norm": 3.445240945570377, + "learning_rate": 1.4933993399339932e-06, + "loss": 0.0043, + "step": 346 + }, + { + "epoch": 0.27754449110177964, + "grad_norm": 6.719396089938185, + "learning_rate": 1.4917491749174918e-06, + "loss": -0.0528, + "step": 347 + }, + { + "epoch": 0.27834433113377327, + "grad_norm": 4.948551220275233, + "learning_rate": 1.49009900990099e-06, + "loss": -0.0351, + "step": 348 + }, + { + "epoch": 0.27914417116576684, + "grad_norm": 4.198757242081244, + "learning_rate": 1.4884488448844883e-06, + "loss": -0.1767, + "step": 349 + }, + { + "epoch": 0.27994401119776047, + "grad_norm": 4.020517893591624, + "learning_rate": 1.4867986798679867e-06, + "loss": -0.0777, + "step": 350 + }, + { + "epoch": 0.28074385122975404, + "grad_norm": 7.665385125345826, + "learning_rate": 1.4851485148514852e-06, + "loss": -0.0675, + "step": 351 + }, + { + "epoch": 0.28154369126174766, + "grad_norm": 4.359035902610134, + "learning_rate": 1.4834983498349834e-06, + "loss": 0.1022, + "step": 352 + }, + { + "epoch": 0.28234353129374123, + "grad_norm": 4.515833866344382, + "learning_rate": 1.4818481848184818e-06, + "loss": -0.1318, + "step": 353 + }, + { + "epoch": 0.28314337132573486, + "grad_norm": 10.48643307447715, + "learning_rate": 1.48019801980198e-06, + "loss": -0.0624, + "step": 354 + }, + { + "epoch": 0.28394321135772843, + "grad_norm": 3.9055137245563167, + "learning_rate": 1.4785478547854785e-06, + "loss": 0.0131, + "step": 355 + }, + { + "epoch": 0.28474305138972206, + "grad_norm": 5.064555563223541, + "learning_rate": 1.476897689768977e-06, + "loss": -0.0848, + "step": 356 + }, + { + "epoch": 0.2855428914217157, + "grad_norm": 6.403904331900866, + "learning_rate": 1.4752475247524752e-06, + "loss": -0.1231, + "step": 357 + }, + { + "epoch": 0.28634273145370925, + "grad_norm": 4.4680198659839405, + "learning_rate": 1.4735973597359734e-06, + "loss": -0.0751, + "step": 358 + }, + { + "epoch": 0.2871425714857029, + "grad_norm": 7.88048544071049, + "learning_rate": 1.4719471947194718e-06, + "loss": -0.0111, + "step": 359 + }, + { + "epoch": 0.28794241151769645, + "grad_norm": 4.041245481168213, + "learning_rate": 1.4702970297029703e-06, + "loss": -0.0219, + "step": 360 + }, + { + "epoch": 0.2887422515496901, + "grad_norm": 3.2378522821181748, + "learning_rate": 1.4686468646864685e-06, + "loss": 0.0154, + "step": 361 + }, + { + "epoch": 0.28954209158168365, + "grad_norm": 5.187324980575399, + "learning_rate": 1.466996699669967e-06, + "loss": 0.0768, + "step": 362 + }, + { + "epoch": 0.2903419316136773, + "grad_norm": 3.892629574264858, + "learning_rate": 1.4653465346534652e-06, + "loss": -0.1757, + "step": 363 + }, + { + "epoch": 0.29114177164567084, + "grad_norm": 4.66291997005039, + "learning_rate": 1.4636963696369636e-06, + "loss": -0.0897, + "step": 364 + }, + { + "epoch": 0.29194161167766447, + "grad_norm": 4.490266870328807, + "learning_rate": 1.462046204620462e-06, + "loss": -0.1638, + "step": 365 + }, + { + "epoch": 0.2927414517096581, + "grad_norm": 7.248644471878, + "learning_rate": 1.4603960396039603e-06, + "loss": -0.1413, + "step": 366 + }, + { + "epoch": 0.29354129174165167, + "grad_norm": 13.65496051906939, + "learning_rate": 1.4587458745874585e-06, + "loss": -0.0144, + "step": 367 + }, + { + "epoch": 0.2943411317736453, + "grad_norm": 3.0336626027850593, + "learning_rate": 1.4570957095709572e-06, + "loss": -0.008, + "step": 368 + }, + { + "epoch": 0.29514097180563886, + "grad_norm": 7.873854054225054, + "learning_rate": 1.4554455445544554e-06, + "loss": 0.1034, + "step": 369 + }, + { + "epoch": 0.2959408118376325, + "grad_norm": 3.727594324731175, + "learning_rate": 1.4537953795379538e-06, + "loss": -0.1401, + "step": 370 + }, + { + "epoch": 0.29674065186962606, + "grad_norm": 5.229701446706082, + "learning_rate": 1.452145214521452e-06, + "loss": -0.1203, + "step": 371 + }, + { + "epoch": 0.2975404919016197, + "grad_norm": 4.492128268970922, + "learning_rate": 1.4504950495049503e-06, + "loss": 0.0004, + "step": 372 + }, + { + "epoch": 0.29834033193361326, + "grad_norm": 5.077090301738471, + "learning_rate": 1.448844884488449e-06, + "loss": 0.0219, + "step": 373 + }, + { + "epoch": 0.2991401719656069, + "grad_norm": 4.796744776644939, + "learning_rate": 1.4471947194719472e-06, + "loss": -0.0064, + "step": 374 + }, + { + "epoch": 0.29994001199760045, + "grad_norm": 7.42447528462134, + "learning_rate": 1.4455445544554454e-06, + "loss": 0.07, + "step": 375 + }, + { + "epoch": 0.3007398520295941, + "grad_norm": 3.848638759590051, + "learning_rate": 1.4438943894389438e-06, + "loss": -0.0777, + "step": 376 + }, + { + "epoch": 0.3015396920615877, + "grad_norm": 4.256980996790008, + "learning_rate": 1.4422442244224423e-06, + "loss": -0.1766, + "step": 377 + }, + { + "epoch": 0.3023395320935813, + "grad_norm": 3.961327287203466, + "learning_rate": 1.4405940594059405e-06, + "loss": -0.0571, + "step": 378 + }, + { + "epoch": 0.3031393721255749, + "grad_norm": 5.478690567895318, + "learning_rate": 1.438943894389439e-06, + "loss": 0.0013, + "step": 379 + }, + { + "epoch": 0.3039392121575685, + "grad_norm": 3.8685538296119106, + "learning_rate": 1.4372937293729372e-06, + "loss": -0.0135, + "step": 380 + }, + { + "epoch": 0.3047390521895621, + "grad_norm": 3.712350805091167, + "learning_rate": 1.4356435643564356e-06, + "loss": -0.0965, + "step": 381 + }, + { + "epoch": 0.30553889222155567, + "grad_norm": 4.12545866294737, + "learning_rate": 1.433993399339934e-06, + "loss": 0.0192, + "step": 382 + }, + { + "epoch": 0.3063387322535493, + "grad_norm": 3.9826126090375085, + "learning_rate": 1.4323432343234323e-06, + "loss": 0.0096, + "step": 383 + }, + { + "epoch": 0.30713857228554287, + "grad_norm": 5.253969236088526, + "learning_rate": 1.4306930693069305e-06, + "loss": 0.0596, + "step": 384 + }, + { + "epoch": 0.3079384123175365, + "grad_norm": 4.369221167744991, + "learning_rate": 1.4290429042904292e-06, + "loss": -0.0586, + "step": 385 + }, + { + "epoch": 0.3087382523495301, + "grad_norm": 3.386456014084215, + "learning_rate": 1.4273927392739274e-06, + "loss": -0.1952, + "step": 386 + }, + { + "epoch": 0.3095380923815237, + "grad_norm": 4.175162288229841, + "learning_rate": 1.4257425742574256e-06, + "loss": -0.1559, + "step": 387 + }, + { + "epoch": 0.3103379324135173, + "grad_norm": 4.07269720996871, + "learning_rate": 1.424092409240924e-06, + "loss": -0.0591, + "step": 388 + }, + { + "epoch": 0.3111377724455109, + "grad_norm": 3.873233515579836, + "learning_rate": 1.4224422442244223e-06, + "loss": -0.0649, + "step": 389 + }, + { + "epoch": 0.3119376124775045, + "grad_norm": 5.33165026969968, + "learning_rate": 1.4207920792079207e-06, + "loss": -0.0568, + "step": 390 + }, + { + "epoch": 0.3127374525094981, + "grad_norm": 5.644618937197355, + "learning_rate": 1.4191419141914192e-06, + "loss": -0.0425, + "step": 391 + }, + { + "epoch": 0.3135372925414917, + "grad_norm": 4.609038777130941, + "learning_rate": 1.4174917491749174e-06, + "loss": -0.0991, + "step": 392 + }, + { + "epoch": 0.3143371325734853, + "grad_norm": 5.362814464107483, + "learning_rate": 1.4158415841584156e-06, + "loss": -0.0377, + "step": 393 + }, + { + "epoch": 0.3151369726054789, + "grad_norm": 4.1100020129716315, + "learning_rate": 1.4141914191419143e-06, + "loss": -0.0176, + "step": 394 + }, + { + "epoch": 0.31593681263747253, + "grad_norm": 3.6462471572713198, + "learning_rate": 1.4125412541254125e-06, + "loss": 0.0183, + "step": 395 + }, + { + "epoch": 0.3167366526694661, + "grad_norm": 3.425535847868438, + "learning_rate": 1.4108910891089107e-06, + "loss": -0.1166, + "step": 396 + }, + { + "epoch": 0.3175364927014597, + "grad_norm": 4.023065583159361, + "learning_rate": 1.4092409240924092e-06, + "loss": -0.069, + "step": 397 + }, + { + "epoch": 0.3183363327334533, + "grad_norm": 4.435192529053884, + "learning_rate": 1.4075907590759076e-06, + "loss": -0.1024, + "step": 398 + }, + { + "epoch": 0.3191361727654469, + "grad_norm": 4.351874787170239, + "learning_rate": 1.4059405940594058e-06, + "loss": -0.1381, + "step": 399 + }, + { + "epoch": 0.3199360127974405, + "grad_norm": 5.114118048590294, + "learning_rate": 1.4042904290429043e-06, + "loss": -0.027, + "step": 400 + }, + { + "epoch": 0.3207358528294341, + "grad_norm": 6.62264310550409, + "learning_rate": 1.4026402640264025e-06, + "loss": -0.1372, + "step": 401 + }, + { + "epoch": 0.3215356928614277, + "grad_norm": 5.515472496453124, + "learning_rate": 1.4009900990099007e-06, + "loss": 0.0028, + "step": 402 + }, + { + "epoch": 0.3223355328934213, + "grad_norm": 5.43524070368167, + "learning_rate": 1.3993399339933994e-06, + "loss": -0.1192, + "step": 403 + }, + { + "epoch": 0.32313537292541494, + "grad_norm": 4.309916510249054, + "learning_rate": 1.3976897689768976e-06, + "loss": -0.049, + "step": 404 + }, + { + "epoch": 0.3239352129574085, + "grad_norm": 4.392826058059571, + "learning_rate": 1.3960396039603959e-06, + "loss": -0.1248, + "step": 405 + }, + { + "epoch": 0.32473505298940214, + "grad_norm": 5.384606404349416, + "learning_rate": 1.3943894389438943e-06, + "loss": -0.0248, + "step": 406 + }, + { + "epoch": 0.3255348930213957, + "grad_norm": 5.369884451931867, + "learning_rate": 1.3927392739273927e-06, + "loss": 0.0453, + "step": 407 + }, + { + "epoch": 0.32633473305338934, + "grad_norm": 3.799887635426884, + "learning_rate": 1.391089108910891e-06, + "loss": 0.0924, + "step": 408 + }, + { + "epoch": 0.3271345730853829, + "grad_norm": 5.151153079821819, + "learning_rate": 1.3894389438943894e-06, + "loss": -0.1524, + "step": 409 + }, + { + "epoch": 0.32793441311737653, + "grad_norm": 4.9429474730234935, + "learning_rate": 1.3877887788778876e-06, + "loss": -0.0066, + "step": 410 + }, + { + "epoch": 0.3287342531493701, + "grad_norm": 3.8669767688401637, + "learning_rate": 1.386138613861386e-06, + "loss": -0.0998, + "step": 411 + }, + { + "epoch": 0.32953409318136373, + "grad_norm": 4.1249285605053165, + "learning_rate": 1.3844884488448845e-06, + "loss": -0.1198, + "step": 412 + }, + { + "epoch": 0.33033393321335736, + "grad_norm": 4.264021911092433, + "learning_rate": 1.3828382838283827e-06, + "loss": -0.027, + "step": 413 + }, + { + "epoch": 0.3311337732453509, + "grad_norm": 8.192155984704781, + "learning_rate": 1.381188118811881e-06, + "loss": 0.0037, + "step": 414 + }, + { + "epoch": 0.33193361327734455, + "grad_norm": 4.842071045333458, + "learning_rate": 1.3795379537953794e-06, + "loss": -0.0183, + "step": 415 + }, + { + "epoch": 0.3327334533093381, + "grad_norm": 5.69008602834876, + "learning_rate": 1.3778877887788779e-06, + "loss": -0.037, + "step": 416 + }, + { + "epoch": 0.33353329334133175, + "grad_norm": 3.4506588237689044, + "learning_rate": 1.376237623762376e-06, + "loss": -0.1827, + "step": 417 + }, + { + "epoch": 0.3343331333733253, + "grad_norm": 4.276859677588479, + "learning_rate": 1.3745874587458745e-06, + "loss": -0.1397, + "step": 418 + }, + { + "epoch": 0.33513297340531895, + "grad_norm": 5.137955642134524, + "learning_rate": 1.3729372937293728e-06, + "loss": 0.0313, + "step": 419 + }, + { + "epoch": 0.3359328134373125, + "grad_norm": 5.626427193889533, + "learning_rate": 1.3712871287128714e-06, + "loss": 0.0585, + "step": 420 + }, + { + "epoch": 0.33673265346930614, + "grad_norm": 4.259015114708382, + "learning_rate": 1.3696369636963696e-06, + "loss": -0.1352, + "step": 421 + }, + { + "epoch": 0.3375324935012997, + "grad_norm": 3.900501996524311, + "learning_rate": 1.3679867986798679e-06, + "loss": -0.1541, + "step": 422 + }, + { + "epoch": 0.33833233353329334, + "grad_norm": 21.342155698599925, + "learning_rate": 1.3663366336633663e-06, + "loss": 0.0163, + "step": 423 + }, + { + "epoch": 0.33913217356528697, + "grad_norm": 9.376314198251674, + "learning_rate": 1.3646864686468647e-06, + "loss": -0.1147, + "step": 424 + }, + { + "epoch": 0.33993201359728054, + "grad_norm": 3.9556694436435773, + "learning_rate": 1.363036303630363e-06, + "loss": 0.0607, + "step": 425 + }, + { + "epoch": 0.34073185362927416, + "grad_norm": 4.413407376716041, + "learning_rate": 1.3613861386138614e-06, + "loss": 0.0269, + "step": 426 + }, + { + "epoch": 0.34153169366126773, + "grad_norm": 4.5745629523971285, + "learning_rate": 1.3597359735973596e-06, + "loss": -0.1232, + "step": 427 + }, + { + "epoch": 0.34233153369326136, + "grad_norm": 6.482169711595175, + "learning_rate": 1.3580858085808579e-06, + "loss": -0.0933, + "step": 428 + }, + { + "epoch": 0.34313137372525493, + "grad_norm": 4.614948794989073, + "learning_rate": 1.3564356435643565e-06, + "loss": -0.0881, + "step": 429 + }, + { + "epoch": 0.34393121375724856, + "grad_norm": 4.902443350581836, + "learning_rate": 1.3547854785478547e-06, + "loss": -0.0492, + "step": 430 + }, + { + "epoch": 0.3447310537892421, + "grad_norm": 4.293832374460016, + "learning_rate": 1.353135313531353e-06, + "loss": -0.0988, + "step": 431 + }, + { + "epoch": 0.34553089382123575, + "grad_norm": 4.239300667652253, + "learning_rate": 1.3514851485148514e-06, + "loss": -0.0395, + "step": 432 + }, + { + "epoch": 0.3463307338532294, + "grad_norm": 6.000658634911202, + "learning_rate": 1.3498349834983499e-06, + "loss": -0.0687, + "step": 433 + }, + { + "epoch": 0.34713057388522295, + "grad_norm": 4.533327665512432, + "learning_rate": 1.348184818481848e-06, + "loss": -0.0457, + "step": 434 + }, + { + "epoch": 0.3479304139172166, + "grad_norm": 4.966203144811649, + "learning_rate": 1.3465346534653465e-06, + "loss": -0.0891, + "step": 435 + }, + { + "epoch": 0.34873025394921014, + "grad_norm": 3.160979702375991, + "learning_rate": 1.3448844884488448e-06, + "loss": 0.0509, + "step": 436 + }, + { + "epoch": 0.34953009398120377, + "grad_norm": 7.163626654459487, + "learning_rate": 1.3432343234323432e-06, + "loss": -0.1026, + "step": 437 + }, + { + "epoch": 0.35032993401319734, + "grad_norm": 4.10929586240042, + "learning_rate": 1.3415841584158416e-06, + "loss": -0.0346, + "step": 438 + }, + { + "epoch": 0.35112977404519097, + "grad_norm": 7.444864169509166, + "learning_rate": 1.3399339933993399e-06, + "loss": -0.084, + "step": 439 + }, + { + "epoch": 0.35192961407718454, + "grad_norm": 4.279436158804133, + "learning_rate": 1.338283828382838e-06, + "loss": -0.0753, + "step": 440 + }, + { + "epoch": 0.35272945410917816, + "grad_norm": 7.0310221317242965, + "learning_rate": 1.3366336633663367e-06, + "loss": 0.0822, + "step": 441 + }, + { + "epoch": 0.3535292941411718, + "grad_norm": 3.546380500099962, + "learning_rate": 1.334983498349835e-06, + "loss": -0.0826, + "step": 442 + }, + { + "epoch": 0.35432913417316536, + "grad_norm": 3.978575910618056, + "learning_rate": 1.3333333333333332e-06, + "loss": -0.0183, + "step": 443 + }, + { + "epoch": 0.355128974205159, + "grad_norm": 4.893702894351932, + "learning_rate": 1.3316831683168316e-06, + "loss": 0.0513, + "step": 444 + }, + { + "epoch": 0.35592881423715256, + "grad_norm": 4.712476792012751, + "learning_rate": 1.3300330033003299e-06, + "loss": 0.0161, + "step": 445 + }, + { + "epoch": 0.3567286542691462, + "grad_norm": 4.363095681693482, + "learning_rate": 1.3283828382838283e-06, + "loss": -0.0878, + "step": 446 + }, + { + "epoch": 0.35752849430113975, + "grad_norm": 3.6779713769559206, + "learning_rate": 1.3267326732673268e-06, + "loss": -0.0884, + "step": 447 + }, + { + "epoch": 0.3583283343331334, + "grad_norm": 4.691244638726057, + "learning_rate": 1.325082508250825e-06, + "loss": 0.0164, + "step": 448 + }, + { + "epoch": 0.35912817436512695, + "grad_norm": 3.9918624835208574, + "learning_rate": 1.3234323432343232e-06, + "loss": -0.0623, + "step": 449 + }, + { + "epoch": 0.3599280143971206, + "grad_norm": 4.3423857158760475, + "learning_rate": 1.3217821782178219e-06, + "loss": -0.0166, + "step": 450 + }, + { + "epoch": 0.3607278544291142, + "grad_norm": 3.3557272335230266, + "learning_rate": 1.32013201320132e-06, + "loss": -0.1243, + "step": 451 + }, + { + "epoch": 0.3615276944611078, + "grad_norm": 4.121010209091045, + "learning_rate": 1.3184818481848183e-06, + "loss": 0.0157, + "step": 452 + }, + { + "epoch": 0.3623275344931014, + "grad_norm": 5.61777014754645, + "learning_rate": 1.3168316831683168e-06, + "loss": 0.0009, + "step": 453 + }, + { + "epoch": 0.36312737452509497, + "grad_norm": 3.9497241442966673, + "learning_rate": 1.3151815181518152e-06, + "loss": -0.0748, + "step": 454 + }, + { + "epoch": 0.3639272145570886, + "grad_norm": 3.78165099484685, + "learning_rate": 1.3135313531353134e-06, + "loss": 0.0137, + "step": 455 + }, + { + "epoch": 0.36472705458908217, + "grad_norm": 3.8365555088656573, + "learning_rate": 1.3118811881188119e-06, + "loss": 0.0, + "step": 456 + }, + { + "epoch": 0.3655268946210758, + "grad_norm": 3.9613296946642933, + "learning_rate": 1.31023102310231e-06, + "loss": -0.068, + "step": 457 + }, + { + "epoch": 0.36632673465306936, + "grad_norm": 3.558717962079936, + "learning_rate": 1.3085808580858083e-06, + "loss": -0.1112, + "step": 458 + }, + { + "epoch": 0.367126574685063, + "grad_norm": 4.93902023669042, + "learning_rate": 1.306930693069307e-06, + "loss": -0.0433, + "step": 459 + }, + { + "epoch": 0.3679264147170566, + "grad_norm": 4.69421251966819, + "learning_rate": 1.3052805280528052e-06, + "loss": 0.0994, + "step": 460 + }, + { + "epoch": 0.3687262547490502, + "grad_norm": 6.3721851791610336, + "learning_rate": 1.3036303630363034e-06, + "loss": -0.1078, + "step": 461 + }, + { + "epoch": 0.3695260947810438, + "grad_norm": 3.389060929800596, + "learning_rate": 1.3019801980198019e-06, + "loss": -0.088, + "step": 462 + }, + { + "epoch": 0.3703259348130374, + "grad_norm": 5.748513070947605, + "learning_rate": 1.3003300330033003e-06, + "loss": -0.029, + "step": 463 + }, + { + "epoch": 0.371125774845031, + "grad_norm": 4.690511727792042, + "learning_rate": 1.2986798679867985e-06, + "loss": -0.0756, + "step": 464 + }, + { + "epoch": 0.3719256148770246, + "grad_norm": 5.217441052047622, + "learning_rate": 1.297029702970297e-06, + "loss": -0.0748, + "step": 465 + }, + { + "epoch": 0.3727254549090182, + "grad_norm": 4.240980113487688, + "learning_rate": 1.2953795379537952e-06, + "loss": -0.0008, + "step": 466 + }, + { + "epoch": 0.3735252949410118, + "grad_norm": 4.743889341456478, + "learning_rate": 1.2937293729372937e-06, + "loss": -0.0671, + "step": 467 + }, + { + "epoch": 0.3743251349730054, + "grad_norm": 4.473362389672442, + "learning_rate": 1.292079207920792e-06, + "loss": 0.0101, + "step": 468 + }, + { + "epoch": 0.375124975004999, + "grad_norm": 4.197750015674087, + "learning_rate": 1.2904290429042903e-06, + "loss": 0.0175, + "step": 469 + }, + { + "epoch": 0.3759248150369926, + "grad_norm": 6.425414954415456, + "learning_rate": 1.2887788778877888e-06, + "loss": -0.0783, + "step": 470 + }, + { + "epoch": 0.3767246550689862, + "grad_norm": 2.9864850798252855, + "learning_rate": 1.2871287128712872e-06, + "loss": -0.0884, + "step": 471 + }, + { + "epoch": 0.3775244951009798, + "grad_norm": 5.261385424958508, + "learning_rate": 1.2854785478547854e-06, + "loss": -0.09, + "step": 472 + }, + { + "epoch": 0.3783243351329734, + "grad_norm": 4.958045993888585, + "learning_rate": 1.2838283828382839e-06, + "loss": -0.0682, + "step": 473 + }, + { + "epoch": 0.379124175164967, + "grad_norm": 4.7190018138263605, + "learning_rate": 1.282178217821782e-06, + "loss": -0.1395, + "step": 474 + }, + { + "epoch": 0.3799240151969606, + "grad_norm": 5.882864051380202, + "learning_rate": 1.2805280528052803e-06, + "loss": -0.1829, + "step": 475 + }, + { + "epoch": 0.3807238552289542, + "grad_norm": 3.7556665205378352, + "learning_rate": 1.278877887788779e-06, + "loss": -0.021, + "step": 476 + }, + { + "epoch": 0.3815236952609478, + "grad_norm": 4.086321431606577, + "learning_rate": 1.2772277227722772e-06, + "loss": -0.0382, + "step": 477 + }, + { + "epoch": 0.3823235352929414, + "grad_norm": 4.616776862820448, + "learning_rate": 1.2755775577557754e-06, + "loss": -0.1779, + "step": 478 + }, + { + "epoch": 0.383123375324935, + "grad_norm": 4.004332580198827, + "learning_rate": 1.2739273927392739e-06, + "loss": -0.0252, + "step": 479 + }, + { + "epoch": 0.38392321535692864, + "grad_norm": 4.624789258949781, + "learning_rate": 1.2722772277227723e-06, + "loss": -0.0274, + "step": 480 + }, + { + "epoch": 0.3847230553889222, + "grad_norm": 4.107644532644881, + "learning_rate": 1.2706270627062705e-06, + "loss": -0.0706, + "step": 481 + }, + { + "epoch": 0.38552289542091583, + "grad_norm": 5.606536912327608, + "learning_rate": 1.268976897689769e-06, + "loss": -0.1579, + "step": 482 + }, + { + "epoch": 0.3863227354529094, + "grad_norm": 3.661768864377637, + "learning_rate": 1.2673267326732672e-06, + "loss": -0.0483, + "step": 483 + }, + { + "epoch": 0.38712257548490303, + "grad_norm": 4.163789722318428, + "learning_rate": 1.2656765676567657e-06, + "loss": -0.1628, + "step": 484 + }, + { + "epoch": 0.3879224155168966, + "grad_norm": 5.862521290689618, + "learning_rate": 1.264026402640264e-06, + "loss": -0.0378, + "step": 485 + }, + { + "epoch": 0.3887222555488902, + "grad_norm": 4.451191371926914, + "learning_rate": 1.2623762376237623e-06, + "loss": 0.041, + "step": 486 + }, + { + "epoch": 0.3895220955808838, + "grad_norm": 6.554041470323983, + "learning_rate": 1.2607260726072606e-06, + "loss": -0.0089, + "step": 487 + }, + { + "epoch": 0.3903219356128774, + "grad_norm": 4.958459911280161, + "learning_rate": 1.259075907590759e-06, + "loss": -0.0351, + "step": 488 + }, + { + "epoch": 0.39112177564487105, + "grad_norm": 5.5754285433841595, + "learning_rate": 1.2574257425742574e-06, + "loss": -0.0866, + "step": 489 + }, + { + "epoch": 0.3919216156768646, + "grad_norm": 4.927561354349523, + "learning_rate": 1.2557755775577557e-06, + "loss": 0.0114, + "step": 490 + }, + { + "epoch": 0.39272145570885825, + "grad_norm": 4.275369657183623, + "learning_rate": 1.2541254125412541e-06, + "loss": 0.0731, + "step": 491 + }, + { + "epoch": 0.3935212957408518, + "grad_norm": 4.553288397020381, + "learning_rate": 1.2524752475247523e-06, + "loss": -0.0366, + "step": 492 + }, + { + "epoch": 0.39432113577284544, + "grad_norm": 4.3640356820358415, + "learning_rate": 1.2508250825082508e-06, + "loss": -0.026, + "step": 493 + }, + { + "epoch": 0.395120975804839, + "grad_norm": 6.781778763227194, + "learning_rate": 1.2491749174917492e-06, + "loss": 0.0272, + "step": 494 + }, + { + "epoch": 0.39592081583683264, + "grad_norm": 4.147600624744722, + "learning_rate": 1.2475247524752474e-06, + "loss": -0.0533, + "step": 495 + }, + { + "epoch": 0.3967206558688262, + "grad_norm": 7.925587764087279, + "learning_rate": 1.2458745874587457e-06, + "loss": -0.0023, + "step": 496 + }, + { + "epoch": 0.39752049590081984, + "grad_norm": 3.9471683782785267, + "learning_rate": 1.2442244224422443e-06, + "loss": -0.0624, + "step": 497 + }, + { + "epoch": 0.39832033593281346, + "grad_norm": 5.046330000323796, + "learning_rate": 1.2425742574257426e-06, + "loss": -0.1152, + "step": 498 + }, + { + "epoch": 0.39912017596480703, + "grad_norm": 3.797212185428219, + "learning_rate": 1.2409240924092408e-06, + "loss": -0.0566, + "step": 499 + }, + { + "epoch": 0.39992001599680066, + "grad_norm": 5.219397955775355, + "learning_rate": 1.2392739273927392e-06, + "loss": -0.0473, + "step": 500 + }, + { + "epoch": 0.40071985602879423, + "grad_norm": 4.888043487068187, + "learning_rate": 1.2376237623762375e-06, + "loss": -0.0766, + "step": 501 + }, + { + "epoch": 0.40151969606078786, + "grad_norm": 4.812490488018197, + "learning_rate": 1.2359735973597359e-06, + "loss": -0.0063, + "step": 502 + }, + { + "epoch": 0.4023195360927814, + "grad_norm": 4.137421603194797, + "learning_rate": 1.2343234323432343e-06, + "loss": -0.0894, + "step": 503 + }, + { + "epoch": 0.40311937612477505, + "grad_norm": 4.446466976487968, + "learning_rate": 1.2326732673267326e-06, + "loss": -0.1032, + "step": 504 + }, + { + "epoch": 0.4039192161567686, + "grad_norm": 4.517828908408806, + "learning_rate": 1.2310231023102308e-06, + "loss": -0.0366, + "step": 505 + }, + { + "epoch": 0.40471905618876225, + "grad_norm": 4.025363379714323, + "learning_rate": 1.2293729372937294e-06, + "loss": -0.0322, + "step": 506 + }, + { + "epoch": 0.4055188962207559, + "grad_norm": 3.0065985273378026, + "learning_rate": 1.2277227722772277e-06, + "loss": -0.1451, + "step": 507 + }, + { + "epoch": 0.40631873625274945, + "grad_norm": 6.102992165053075, + "learning_rate": 1.226072607260726e-06, + "loss": -0.0947, + "step": 508 + }, + { + "epoch": 0.4071185762847431, + "grad_norm": 5.373085923158729, + "learning_rate": 1.2244224422442243e-06, + "loss": -0.038, + "step": 509 + }, + { + "epoch": 0.40791841631673664, + "grad_norm": 6.507842394701745, + "learning_rate": 1.2227722772277228e-06, + "loss": -0.0416, + "step": 510 + }, + { + "epoch": 0.40871825634873027, + "grad_norm": 6.211682775156014, + "learning_rate": 1.221122112211221e-06, + "loss": -0.0155, + "step": 511 + }, + { + "epoch": 0.40951809638072384, + "grad_norm": 3.253438304578216, + "learning_rate": 1.2194719471947194e-06, + "loss": -0.0971, + "step": 512 + }, + { + "epoch": 0.41031793641271747, + "grad_norm": 3.562574820820311, + "learning_rate": 1.2178217821782177e-06, + "loss": -0.0982, + "step": 513 + }, + { + "epoch": 0.41111777644471104, + "grad_norm": 3.5746182911507067, + "learning_rate": 1.216171617161716e-06, + "loss": -0.023, + "step": 514 + }, + { + "epoch": 0.41191761647670466, + "grad_norm": 3.488381447372906, + "learning_rate": 1.2145214521452146e-06, + "loss": 0.1215, + "step": 515 + }, + { + "epoch": 0.41271745650869823, + "grad_norm": 3.1641673883077788, + "learning_rate": 1.2128712871287128e-06, + "loss": -0.0269, + "step": 516 + }, + { + "epoch": 0.41351729654069186, + "grad_norm": 4.13780306256476, + "learning_rate": 1.211221122112211e-06, + "loss": -0.08, + "step": 517 + }, + { + "epoch": 0.4143171365726855, + "grad_norm": 5.23630954806011, + "learning_rate": 1.2095709570957095e-06, + "loss": 0.0683, + "step": 518 + }, + { + "epoch": 0.41511697660467906, + "grad_norm": 5.08041039318514, + "learning_rate": 1.207920792079208e-06, + "loss": -0.0634, + "step": 519 + }, + { + "epoch": 0.4159168166366727, + "grad_norm": 4.328106931793288, + "learning_rate": 1.2062706270627063e-06, + "loss": -0.1251, + "step": 520 + }, + { + "epoch": 0.41671665666866625, + "grad_norm": 3.8685318893058978, + "learning_rate": 1.2046204620462046e-06, + "loss": -0.0397, + "step": 521 + }, + { + "epoch": 0.4175164967006599, + "grad_norm": 4.228873306898751, + "learning_rate": 1.2029702970297028e-06, + "loss": -0.0097, + "step": 522 + }, + { + "epoch": 0.41831633673265345, + "grad_norm": 8.545101114191558, + "learning_rate": 1.2013201320132014e-06, + "loss": 0.0257, + "step": 523 + }, + { + "epoch": 0.4191161767646471, + "grad_norm": 4.940963303084406, + "learning_rate": 1.1996699669966997e-06, + "loss": -0.0218, + "step": 524 + }, + { + "epoch": 0.41991601679664065, + "grad_norm": 4.486186548838174, + "learning_rate": 1.198019801980198e-06, + "loss": -0.1274, + "step": 525 + }, + { + "epoch": 0.42071585682863427, + "grad_norm": 6.117424152809813, + "learning_rate": 1.1963696369636963e-06, + "loss": -0.0412, + "step": 526 + }, + { + "epoch": 0.4215156968606279, + "grad_norm": 6.026088581435606, + "learning_rate": 1.1947194719471948e-06, + "loss": -0.1461, + "step": 527 + }, + { + "epoch": 0.42231553689262147, + "grad_norm": 4.391032244166686, + "learning_rate": 1.193069306930693e-06, + "loss": 0.0956, + "step": 528 + }, + { + "epoch": 0.4231153769246151, + "grad_norm": 5.139889742785653, + "learning_rate": 1.1914191419141915e-06, + "loss": -0.0998, + "step": 529 + }, + { + "epoch": 0.42391521695660866, + "grad_norm": 5.63979191849408, + "learning_rate": 1.1897689768976897e-06, + "loss": -0.0672, + "step": 530 + }, + { + "epoch": 0.4247150569886023, + "grad_norm": 8.323803093358931, + "learning_rate": 1.188118811881188e-06, + "loss": -0.1988, + "step": 531 + }, + { + "epoch": 0.42551489702059586, + "grad_norm": 3.5224991117629263, + "learning_rate": 1.1864686468646866e-06, + "loss": -0.0976, + "step": 532 + }, + { + "epoch": 0.4263147370525895, + "grad_norm": 3.3222865799787407, + "learning_rate": 1.1848184818481848e-06, + "loss": -0.1625, + "step": 533 + }, + { + "epoch": 0.42711457708458306, + "grad_norm": 4.473219337166838, + "learning_rate": 1.183168316831683e-06, + "loss": -0.102, + "step": 534 + }, + { + "epoch": 0.4279144171165767, + "grad_norm": 4.255445918061684, + "learning_rate": 1.1815181518151815e-06, + "loss": 0.0347, + "step": 535 + }, + { + "epoch": 0.4287142571485703, + "grad_norm": 5.273596279438336, + "learning_rate": 1.17986798679868e-06, + "loss": -0.0233, + "step": 536 + }, + { + "epoch": 0.4295140971805639, + "grad_norm": 3.2928329204452167, + "learning_rate": 1.1782178217821781e-06, + "loss": -0.1795, + "step": 537 + }, + { + "epoch": 0.4303139372125575, + "grad_norm": 3.1943187707330676, + "learning_rate": 1.1765676567656766e-06, + "loss": -0.1193, + "step": 538 + }, + { + "epoch": 0.4311137772445511, + "grad_norm": 4.457107636902936, + "learning_rate": 1.1749174917491748e-06, + "loss": -0.0256, + "step": 539 + }, + { + "epoch": 0.4319136172765447, + "grad_norm": 4.508728040150466, + "learning_rate": 1.1732673267326732e-06, + "loss": -0.1272, + "step": 540 + }, + { + "epoch": 0.4327134573085383, + "grad_norm": 4.679176366336832, + "learning_rate": 1.1716171617161717e-06, + "loss": -0.0107, + "step": 541 + }, + { + "epoch": 0.4335132973405319, + "grad_norm": 4.572704243632147, + "learning_rate": 1.16996699669967e-06, + "loss": -0.0189, + "step": 542 + }, + { + "epoch": 0.43431313737252547, + "grad_norm": 3.823996049360206, + "learning_rate": 1.1683168316831681e-06, + "loss": 0.0071, + "step": 543 + }, + { + "epoch": 0.4351129774045191, + "grad_norm": 4.448417665137879, + "learning_rate": 1.1666666666666668e-06, + "loss": 0.0018, + "step": 544 + }, + { + "epoch": 0.4359128174365127, + "grad_norm": 3.7067784825161625, + "learning_rate": 1.165016501650165e-06, + "loss": -0.0643, + "step": 545 + }, + { + "epoch": 0.4367126574685063, + "grad_norm": 4.304960211061566, + "learning_rate": 1.1633663366336632e-06, + "loss": 0.0434, + "step": 546 + }, + { + "epoch": 0.4375124975004999, + "grad_norm": 4.809624741919171, + "learning_rate": 1.1617161716171617e-06, + "loss": -0.1175, + "step": 547 + }, + { + "epoch": 0.4383123375324935, + "grad_norm": 3.572402442577118, + "learning_rate": 1.16006600660066e-06, + "loss": 0.0641, + "step": 548 + }, + { + "epoch": 0.4391121775644871, + "grad_norm": 3.1323439706728173, + "learning_rate": 1.1584158415841584e-06, + "loss": -0.1315, + "step": 549 + }, + { + "epoch": 0.4399120175964807, + "grad_norm": 6.63310206919076, + "learning_rate": 1.1567656765676568e-06, + "loss": -0.1572, + "step": 550 + }, + { + "epoch": 0.4407118576284743, + "grad_norm": 5.7194336862922475, + "learning_rate": 1.155115511551155e-06, + "loss": -0.0498, + "step": 551 + }, + { + "epoch": 0.4415116976604679, + "grad_norm": 5.0355458371512976, + "learning_rate": 1.1534653465346533e-06, + "loss": -0.0343, + "step": 552 + }, + { + "epoch": 0.4423115376924615, + "grad_norm": 4.479813545297925, + "learning_rate": 1.151815181518152e-06, + "loss": -0.052, + "step": 553 + }, + { + "epoch": 0.44311137772445514, + "grad_norm": 5.515043665694904, + "learning_rate": 1.1501650165016501e-06, + "loss": -0.0962, + "step": 554 + }, + { + "epoch": 0.4439112177564487, + "grad_norm": 4.35349503345848, + "learning_rate": 1.1485148514851484e-06, + "loss": -0.1718, + "step": 555 + }, + { + "epoch": 0.44471105778844233, + "grad_norm": 4.324313582265245, + "learning_rate": 1.1468646864686468e-06, + "loss": -0.1059, + "step": 556 + }, + { + "epoch": 0.4455108978204359, + "grad_norm": 6.603357917371442, + "learning_rate": 1.1452145214521452e-06, + "loss": -0.0179, + "step": 557 + }, + { + "epoch": 0.44631073785242953, + "grad_norm": 4.731847308612818, + "learning_rate": 1.1435643564356435e-06, + "loss": -0.0285, + "step": 558 + }, + { + "epoch": 0.4471105778844231, + "grad_norm": 3.2387655192879, + "learning_rate": 1.141914191419142e-06, + "loss": 0.0157, + "step": 559 + }, + { + "epoch": 0.4479104179164167, + "grad_norm": 4.11046977381839, + "learning_rate": 1.1402640264026401e-06, + "loss": -0.0738, + "step": 560 + }, + { + "epoch": 0.4487102579484103, + "grad_norm": 4.7767170879491765, + "learning_rate": 1.1386138613861384e-06, + "loss": -0.1826, + "step": 561 + }, + { + "epoch": 0.4495100979804039, + "grad_norm": 4.712736226037487, + "learning_rate": 1.136963696369637e-06, + "loss": -0.0453, + "step": 562 + }, + { + "epoch": 0.45030993801239755, + "grad_norm": 5.2823586287855795, + "learning_rate": 1.1353135313531353e-06, + "loss": -0.0112, + "step": 563 + }, + { + "epoch": 0.4511097780443911, + "grad_norm": 4.313375854007458, + "learning_rate": 1.1336633663366335e-06, + "loss": -0.1529, + "step": 564 + }, + { + "epoch": 0.45190961807638474, + "grad_norm": 3.661113275988152, + "learning_rate": 1.132013201320132e-06, + "loss": 0.0182, + "step": 565 + }, + { + "epoch": 0.4527094581083783, + "grad_norm": 3.3641831166871015, + "learning_rate": 1.1303630363036304e-06, + "loss": -0.0967, + "step": 566 + }, + { + "epoch": 0.45350929814037194, + "grad_norm": 3.5187461731044634, + "learning_rate": 1.1287128712871286e-06, + "loss": -0.0115, + "step": 567 + }, + { + "epoch": 0.4543091381723655, + "grad_norm": 5.055238201441692, + "learning_rate": 1.127062706270627e-06, + "loss": -0.0701, + "step": 568 + }, + { + "epoch": 0.45510897820435914, + "grad_norm": 5.048998878882335, + "learning_rate": 1.1254125412541253e-06, + "loss": -0.0982, + "step": 569 + }, + { + "epoch": 0.4559088182363527, + "grad_norm": 6.706995417966311, + "learning_rate": 1.123762376237624e-06, + "loss": -0.0323, + "step": 570 + }, + { + "epoch": 0.45670865826834633, + "grad_norm": 5.717361031375047, + "learning_rate": 1.1221122112211221e-06, + "loss": -0.0251, + "step": 571 + }, + { + "epoch": 0.4575084983003399, + "grad_norm": 5.073568794377317, + "learning_rate": 1.1204620462046204e-06, + "loss": -0.2165, + "step": 572 + }, + { + "epoch": 0.45830833833233353, + "grad_norm": 4.340499430104141, + "learning_rate": 1.1188118811881188e-06, + "loss": -0.1008, + "step": 573 + }, + { + "epoch": 0.45910817836432716, + "grad_norm": 11.188522206922801, + "learning_rate": 1.117161716171617e-06, + "loss": -0.0362, + "step": 574 + }, + { + "epoch": 0.45990801839632073, + "grad_norm": 8.96889120914533, + "learning_rate": 1.1155115511551155e-06, + "loss": 0.0492, + "step": 575 + }, + { + "epoch": 0.46070785842831435, + "grad_norm": 4.7623433805729825, + "learning_rate": 1.113861386138614e-06, + "loss": -0.0462, + "step": 576 + }, + { + "epoch": 0.4615076984603079, + "grad_norm": 4.809552169223393, + "learning_rate": 1.1122112211221121e-06, + "loss": -0.0298, + "step": 577 + }, + { + "epoch": 0.46230753849230155, + "grad_norm": 4.199314588295882, + "learning_rate": 1.1105610561056104e-06, + "loss": -0.1501, + "step": 578 + }, + { + "epoch": 0.4631073785242951, + "grad_norm": 3.531450924178731, + "learning_rate": 1.108910891089109e-06, + "loss": -0.0589, + "step": 579 + }, + { + "epoch": 0.46390721855628875, + "grad_norm": 4.247443159328166, + "learning_rate": 1.1072607260726073e-06, + "loss": -0.0362, + "step": 580 + }, + { + "epoch": 0.4647070585882823, + "grad_norm": 3.4845953217374346, + "learning_rate": 1.1056105610561055e-06, + "loss": -0.0171, + "step": 581 + }, + { + "epoch": 0.46550689862027594, + "grad_norm": 6.269891746353069, + "learning_rate": 1.103960396039604e-06, + "loss": -0.0552, + "step": 582 + }, + { + "epoch": 0.46630673865226957, + "grad_norm": 4.110925186237378, + "learning_rate": 1.1023102310231024e-06, + "loss": -0.1525, + "step": 583 + }, + { + "epoch": 0.46710657868426314, + "grad_norm": 4.027706037445169, + "learning_rate": 1.1006600660066006e-06, + "loss": -0.1283, + "step": 584 + }, + { + "epoch": 0.46790641871625677, + "grad_norm": 6.085436348609565, + "learning_rate": 1.099009900990099e-06, + "loss": 0.0152, + "step": 585 + }, + { + "epoch": 0.46870625874825034, + "grad_norm": 4.403688809188704, + "learning_rate": 1.0973597359735973e-06, + "loss": -0.0462, + "step": 586 + }, + { + "epoch": 0.46950609878024396, + "grad_norm": 3.7204758942669924, + "learning_rate": 1.0957095709570955e-06, + "loss": -0.0472, + "step": 587 + }, + { + "epoch": 0.47030593881223753, + "grad_norm": 4.16282077045935, + "learning_rate": 1.0940594059405941e-06, + "loss": -0.1098, + "step": 588 + }, + { + "epoch": 0.47110577884423116, + "grad_norm": 4.490618781800236, + "learning_rate": 1.0924092409240924e-06, + "loss": -0.0323, + "step": 589 + }, + { + "epoch": 0.47190561887622473, + "grad_norm": 3.83450769320751, + "learning_rate": 1.0907590759075906e-06, + "loss": -0.0482, + "step": 590 + }, + { + "epoch": 0.47270545890821836, + "grad_norm": 3.951497100856045, + "learning_rate": 1.089108910891089e-06, + "loss": -0.1206, + "step": 591 + }, + { + "epoch": 0.473505298940212, + "grad_norm": 4.798754196622245, + "learning_rate": 1.0874587458745875e-06, + "loss": 0.0584, + "step": 592 + }, + { + "epoch": 0.47430513897220555, + "grad_norm": 4.437199971976538, + "learning_rate": 1.0858085808580857e-06, + "loss": -0.0666, + "step": 593 + }, + { + "epoch": 0.4751049790041992, + "grad_norm": 5.877945506525689, + "learning_rate": 1.0841584158415842e-06, + "loss": -0.0795, + "step": 594 + }, + { + "epoch": 0.47590481903619275, + "grad_norm": 4.9827179740392165, + "learning_rate": 1.0825082508250824e-06, + "loss": -0.145, + "step": 595 + }, + { + "epoch": 0.4767046590681864, + "grad_norm": 4.579893011650921, + "learning_rate": 1.0808580858085808e-06, + "loss": 0.0438, + "step": 596 + }, + { + "epoch": 0.47750449910017995, + "grad_norm": 4.6741113317873975, + "learning_rate": 1.0792079207920793e-06, + "loss": -0.0471, + "step": 597 + }, + { + "epoch": 0.4783043391321736, + "grad_norm": 4.45102253856279, + "learning_rate": 1.0775577557755775e-06, + "loss": -0.0705, + "step": 598 + }, + { + "epoch": 0.47910417916416714, + "grad_norm": 7.735572429403455, + "learning_rate": 1.0759075907590757e-06, + "loss": -0.0004, + "step": 599 + }, + { + "epoch": 0.47990401919616077, + "grad_norm": 4.3890790831054645, + "learning_rate": 1.0742574257425744e-06, + "loss": -0.11, + "step": 600 + }, + { + "epoch": 0.4807038592281544, + "grad_norm": 3.750715441802859, + "learning_rate": 1.0726072607260726e-06, + "loss": -0.0943, + "step": 601 + }, + { + "epoch": 0.48150369926014797, + "grad_norm": 3.65650652124133, + "learning_rate": 1.0709570957095708e-06, + "loss": -0.169, + "step": 602 + }, + { + "epoch": 0.4823035392921416, + "grad_norm": 4.55201247427091, + "learning_rate": 1.0693069306930693e-06, + "loss": -0.0753, + "step": 603 + }, + { + "epoch": 0.48310337932413516, + "grad_norm": 4.55776208974576, + "learning_rate": 1.0676567656765675e-06, + "loss": -0.1347, + "step": 604 + }, + { + "epoch": 0.4839032193561288, + "grad_norm": 4.48864117831728, + "learning_rate": 1.066006600660066e-06, + "loss": -0.0974, + "step": 605 + }, + { + "epoch": 0.48470305938812236, + "grad_norm": 3.9363005491507668, + "learning_rate": 1.0643564356435644e-06, + "loss": -0.0079, + "step": 606 + }, + { + "epoch": 0.485502899420116, + "grad_norm": 4.252862078789536, + "learning_rate": 1.0627062706270626e-06, + "loss": -0.0136, + "step": 607 + }, + { + "epoch": 0.48630273945210956, + "grad_norm": 4.543019341197776, + "learning_rate": 1.0610561056105608e-06, + "loss": -0.1351, + "step": 608 + }, + { + "epoch": 0.4871025794841032, + "grad_norm": 5.655845163937271, + "learning_rate": 1.0594059405940595e-06, + "loss": -0.0266, + "step": 609 + }, + { + "epoch": 0.4879024195160968, + "grad_norm": 11.681523554473147, + "learning_rate": 1.0577557755775577e-06, + "loss": -0.1126, + "step": 610 + }, + { + "epoch": 0.4887022595480904, + "grad_norm": 4.307554643653013, + "learning_rate": 1.056105610561056e-06, + "loss": -0.0647, + "step": 611 + }, + { + "epoch": 0.489502099580084, + "grad_norm": 5.732899988046993, + "learning_rate": 1.0544554455445544e-06, + "loss": -0.0934, + "step": 612 + }, + { + "epoch": 0.4903019396120776, + "grad_norm": 4.150445541916088, + "learning_rate": 1.0528052805280528e-06, + "loss": -0.1406, + "step": 613 + }, + { + "epoch": 0.4911017796440712, + "grad_norm": 5.1082669144378725, + "learning_rate": 1.051155115511551e-06, + "loss": 0.0558, + "step": 614 + }, + { + "epoch": 0.49190161967606477, + "grad_norm": 7.67678160227598, + "learning_rate": 1.0495049504950495e-06, + "loss": 0.0088, + "step": 615 + }, + { + "epoch": 0.4927014597080584, + "grad_norm": 3.354349182723264, + "learning_rate": 1.0478547854785477e-06, + "loss": -0.1359, + "step": 616 + }, + { + "epoch": 0.49350129974005197, + "grad_norm": 4.725187848913473, + "learning_rate": 1.046204620462046e-06, + "loss": -0.0762, + "step": 617 + }, + { + "epoch": 0.4943011397720456, + "grad_norm": 5.7805399690672825, + "learning_rate": 1.0445544554455446e-06, + "loss": 0.0578, + "step": 618 + }, + { + "epoch": 0.49510097980403917, + "grad_norm": 4.179300735975294, + "learning_rate": 1.0429042904290428e-06, + "loss": 0.005, + "step": 619 + }, + { + "epoch": 0.4959008198360328, + "grad_norm": 5.098901682751034, + "learning_rate": 1.0412541254125413e-06, + "loss": -0.0156, + "step": 620 + }, + { + "epoch": 0.4967006598680264, + "grad_norm": 8.248543813099444, + "learning_rate": 1.0396039603960395e-06, + "loss": -0.0457, + "step": 621 + }, + { + "epoch": 0.49750049990002, + "grad_norm": 4.864094199749138, + "learning_rate": 1.037953795379538e-06, + "loss": 0.0829, + "step": 622 + }, + { + "epoch": 0.4983003399320136, + "grad_norm": 4.692602610129244, + "learning_rate": 1.0363036303630364e-06, + "loss": -0.0396, + "step": 623 + }, + { + "epoch": 0.4991001799640072, + "grad_norm": 3.696934414641692, + "learning_rate": 1.0346534653465346e-06, + "loss": -0.1939, + "step": 624 + }, + { + "epoch": 0.4999000199960008, + "grad_norm": 5.491055932283412, + "learning_rate": 1.0330033003300328e-06, + "loss": -0.0639, + "step": 625 + }, + { + "epoch": 0.5006998600279944, + "grad_norm": 3.92108657024522, + "learning_rate": 1.0313531353135315e-06, + "loss": -0.0969, + "step": 626 + }, + { + "epoch": 0.501499700059988, + "grad_norm": 5.243975134143899, + "learning_rate": 1.0297029702970297e-06, + "loss": -0.0206, + "step": 627 + }, + { + "epoch": 0.5022995400919816, + "grad_norm": 4.328763949057532, + "learning_rate": 1.028052805280528e-06, + "loss": -0.1081, + "step": 628 + }, + { + "epoch": 0.5030993801239753, + "grad_norm": 4.197218364380273, + "learning_rate": 1.0264026402640264e-06, + "loss": -0.0932, + "step": 629 + }, + { + "epoch": 0.5038992201559688, + "grad_norm": 4.300285400855226, + "learning_rate": 1.0247524752475248e-06, + "loss": -0.1016, + "step": 630 + }, + { + "epoch": 0.5046990601879624, + "grad_norm": 4.4977354073528675, + "learning_rate": 1.023102310231023e-06, + "loss": -0.0846, + "step": 631 + }, + { + "epoch": 0.505498900219956, + "grad_norm": 4.303063503427107, + "learning_rate": 1.0214521452145215e-06, + "loss": -0.0724, + "step": 632 + }, + { + "epoch": 0.5062987402519497, + "grad_norm": 6.073276027125735, + "learning_rate": 1.0198019801980197e-06, + "loss": -0.0326, + "step": 633 + }, + { + "epoch": 0.5070985802839432, + "grad_norm": 4.973392453051079, + "learning_rate": 1.018151815181518e-06, + "loss": 0.0189, + "step": 634 + }, + { + "epoch": 0.5078984203159368, + "grad_norm": 4.811288854611089, + "learning_rate": 1.0165016501650166e-06, + "loss": -0.1415, + "step": 635 + }, + { + "epoch": 0.5086982603479304, + "grad_norm": 4.853499316182358, + "learning_rate": 1.0148514851485148e-06, + "loss": -0.0896, + "step": 636 + }, + { + "epoch": 0.509498100379924, + "grad_norm": 3.839359784599905, + "learning_rate": 1.013201320132013e-06, + "loss": -0.0358, + "step": 637 + }, + { + "epoch": 0.5102979404119176, + "grad_norm": 4.783165043297262, + "learning_rate": 1.0115511551155115e-06, + "loss": -0.0102, + "step": 638 + }, + { + "epoch": 0.5110977804439112, + "grad_norm": 5.184943614862812, + "learning_rate": 1.00990099009901e-06, + "loss": -0.1354, + "step": 639 + }, + { + "epoch": 0.5118976204759048, + "grad_norm": 39.49636862757923, + "learning_rate": 1.0082508250825082e-06, + "loss": 0.0023, + "step": 640 + }, + { + "epoch": 0.5126974605078984, + "grad_norm": 4.083973866781674, + "learning_rate": 1.0066006600660066e-06, + "loss": -0.0357, + "step": 641 + }, + { + "epoch": 0.5134973005398921, + "grad_norm": 5.817761080874723, + "learning_rate": 1.0049504950495048e-06, + "loss": -0.0441, + "step": 642 + }, + { + "epoch": 0.5142971405718856, + "grad_norm": 8.811705641420119, + "learning_rate": 1.0033003300330033e-06, + "loss": 0.1013, + "step": 643 + }, + { + "epoch": 0.5150969806038792, + "grad_norm": 4.211618456869653, + "learning_rate": 1.0016501650165017e-06, + "loss": -0.1781, + "step": 644 + }, + { + "epoch": 0.5158968206358728, + "grad_norm": 6.430696021299668, + "learning_rate": 1e-06, + "loss": -0.0475, + "step": 645 + }, + { + "epoch": 0.5166966606678665, + "grad_norm": 4.287574273625528, + "learning_rate": 9.983498349834984e-07, + "loss": -0.0304, + "step": 646 + }, + { + "epoch": 0.51749650069986, + "grad_norm": 2.916483591782696, + "learning_rate": 9.966996699669966e-07, + "loss": -0.1574, + "step": 647 + }, + { + "epoch": 0.5182963407318536, + "grad_norm": 5.071406876730617, + "learning_rate": 9.95049504950495e-07, + "loss": -0.0227, + "step": 648 + }, + { + "epoch": 0.5190961807638472, + "grad_norm": 3.863107212570657, + "learning_rate": 9.933993399339933e-07, + "loss": -0.0452, + "step": 649 + }, + { + "epoch": 0.5198960207958409, + "grad_norm": 4.234640528387632, + "learning_rate": 9.917491749174917e-07, + "loss": -0.0816, + "step": 650 + }, + { + "epoch": 0.5206958608278345, + "grad_norm": 4.255603238118902, + "learning_rate": 9.9009900990099e-07, + "loss": -0.1311, + "step": 651 + }, + { + "epoch": 0.521495700859828, + "grad_norm": 4.723143092604518, + "learning_rate": 9.884488448844884e-07, + "loss": -0.1076, + "step": 652 + }, + { + "epoch": 0.5222955408918216, + "grad_norm": 4.609676855516043, + "learning_rate": 9.867986798679866e-07, + "loss": -0.0468, + "step": 653 + }, + { + "epoch": 0.5230953809238152, + "grad_norm": 4.648497611546731, + "learning_rate": 9.85148514851485e-07, + "loss": 0.066, + "step": 654 + }, + { + "epoch": 0.5238952209558089, + "grad_norm": 4.444890919088204, + "learning_rate": 9.834983498349835e-07, + "loss": -0.0954, + "step": 655 + }, + { + "epoch": 0.5246950609878024, + "grad_norm": 3.789960680030435, + "learning_rate": 9.818481848184817e-07, + "loss": -0.0846, + "step": 656 + }, + { + "epoch": 0.525494901019796, + "grad_norm": 7.20767352956141, + "learning_rate": 9.801980198019802e-07, + "loss": -0.0689, + "step": 657 + }, + { + "epoch": 0.5262947410517896, + "grad_norm": 3.8251645221108883, + "learning_rate": 9.785478547854786e-07, + "loss": -0.0026, + "step": 658 + }, + { + "epoch": 0.5270945810837833, + "grad_norm": 10.44191187281191, + "learning_rate": 9.768976897689768e-07, + "loss": -0.1835, + "step": 659 + }, + { + "epoch": 0.5278944211157769, + "grad_norm": 6.355832474610427, + "learning_rate": 9.75247524752475e-07, + "loss": -0.0168, + "step": 660 + }, + { + "epoch": 0.5286942611477704, + "grad_norm": 3.864986648832606, + "learning_rate": 9.735973597359735e-07, + "loss": -0.0699, + "step": 661 + }, + { + "epoch": 0.529494101179764, + "grad_norm": 9.07974895718424, + "learning_rate": 9.71947194719472e-07, + "loss": -0.0013, + "step": 662 + }, + { + "epoch": 0.5302939412117577, + "grad_norm": 10.325195266318097, + "learning_rate": 9.702970297029702e-07, + "loss": -0.1641, + "step": 663 + }, + { + "epoch": 0.5310937812437513, + "grad_norm": 4.441568363287549, + "learning_rate": 9.686468646864686e-07, + "loss": 0.0397, + "step": 664 + }, + { + "epoch": 0.5318936212757448, + "grad_norm": 5.1229672005813605, + "learning_rate": 9.66996699669967e-07, + "loss": -0.1918, + "step": 665 + }, + { + "epoch": 0.5326934613077384, + "grad_norm": 4.721345907443594, + "learning_rate": 9.653465346534653e-07, + "loss": -0.0215, + "step": 666 + }, + { + "epoch": 0.533493301339732, + "grad_norm": 7.118557074848867, + "learning_rate": 9.636963696369637e-07, + "loss": -0.0039, + "step": 667 + }, + { + "epoch": 0.5342931413717257, + "grad_norm": 4.7832103026691755, + "learning_rate": 9.62046204620462e-07, + "loss": -0.0221, + "step": 668 + }, + { + "epoch": 0.5350929814037193, + "grad_norm": 6.3240275362924505, + "learning_rate": 9.603960396039604e-07, + "loss": -0.0409, + "step": 669 + }, + { + "epoch": 0.5358928214357128, + "grad_norm": 4.027979744126303, + "learning_rate": 9.587458745874586e-07, + "loss": -0.0503, + "step": 670 + }, + { + "epoch": 0.5366926614677064, + "grad_norm": 4.74302864470449, + "learning_rate": 9.57095709570957e-07, + "loss": -0.0521, + "step": 671 + }, + { + "epoch": 0.5374925014997001, + "grad_norm": 4.985380734947703, + "learning_rate": 9.554455445544553e-07, + "loss": -0.082, + "step": 672 + }, + { + "epoch": 0.5382923415316937, + "grad_norm": 4.472849852523853, + "learning_rate": 9.537953795379537e-07, + "loss": 0.1016, + "step": 673 + }, + { + "epoch": 0.5390921815636872, + "grad_norm": 3.308647938974776, + "learning_rate": 9.521452145214522e-07, + "loss": -0.0457, + "step": 674 + }, + { + "epoch": 0.5398920215956808, + "grad_norm": 4.677054809392709, + "learning_rate": 9.504950495049504e-07, + "loss": -0.1658, + "step": 675 + }, + { + "epoch": 0.5406918616276745, + "grad_norm": 4.067875284374342, + "learning_rate": 9.488448844884487e-07, + "loss": 0.0196, + "step": 676 + }, + { + "epoch": 0.5414917016596681, + "grad_norm": 3.730643734647644, + "learning_rate": 9.471947194719472e-07, + "loss": -0.0894, + "step": 677 + }, + { + "epoch": 0.5422915416916617, + "grad_norm": 5.204622803431674, + "learning_rate": 9.455445544554454e-07, + "loss": -0.0538, + "step": 678 + }, + { + "epoch": 0.5430913817236552, + "grad_norm": 5.261361679954622, + "learning_rate": 9.438943894389439e-07, + "loss": -0.026, + "step": 679 + }, + { + "epoch": 0.5438912217556489, + "grad_norm": 4.057248991454938, + "learning_rate": 9.422442244224422e-07, + "loss": -0.0785, + "step": 680 + }, + { + "epoch": 0.5446910617876425, + "grad_norm": 5.580346613410825, + "learning_rate": 9.405940594059405e-07, + "loss": -0.0335, + "step": 681 + }, + { + "epoch": 0.5454909018196361, + "grad_norm": 3.287439521262259, + "learning_rate": 9.389438943894389e-07, + "loss": -0.014, + "step": 682 + }, + { + "epoch": 0.5462907418516296, + "grad_norm": 4.383770462305995, + "learning_rate": 9.372937293729373e-07, + "loss": -0.1349, + "step": 683 + }, + { + "epoch": 0.5470905818836233, + "grad_norm": 3.570830393151609, + "learning_rate": 9.356435643564356e-07, + "loss": -0.1603, + "step": 684 + }, + { + "epoch": 0.5478904219156169, + "grad_norm": 3.9301633755259076, + "learning_rate": 9.33993399339934e-07, + "loss": -0.0728, + "step": 685 + }, + { + "epoch": 0.5486902619476105, + "grad_norm": 4.577722525237458, + "learning_rate": 9.323432343234323e-07, + "loss": -0.0686, + "step": 686 + }, + { + "epoch": 0.5494901019796041, + "grad_norm": 3.4290177162671704, + "learning_rate": 9.306930693069307e-07, + "loss": -0.0002, + "step": 687 + }, + { + "epoch": 0.5502899420115976, + "grad_norm": 3.7664714194362574, + "learning_rate": 9.29042904290429e-07, + "loss": 0.0181, + "step": 688 + }, + { + "epoch": 0.5510897820435913, + "grad_norm": 3.53486758576286, + "learning_rate": 9.273927392739273e-07, + "loss": 0.0046, + "step": 689 + }, + { + "epoch": 0.5518896220755849, + "grad_norm": 5.533353761076084, + "learning_rate": 9.257425742574257e-07, + "loss": -0.1142, + "step": 690 + }, + { + "epoch": 0.5526894621075785, + "grad_norm": 3.618813464410366, + "learning_rate": 9.24092409240924e-07, + "loss": -0.0533, + "step": 691 + }, + { + "epoch": 0.553489302139572, + "grad_norm": 3.2399813948203064, + "learning_rate": 9.224422442244224e-07, + "loss": -0.2655, + "step": 692 + }, + { + "epoch": 0.5542891421715657, + "grad_norm": 4.777938394039966, + "learning_rate": 9.207920792079208e-07, + "loss": -0.1287, + "step": 693 + }, + { + "epoch": 0.5550889822035593, + "grad_norm": 4.753607333838816, + "learning_rate": 9.191419141914191e-07, + "loss": -0.0197, + "step": 694 + }, + { + "epoch": 0.5558888222355529, + "grad_norm": 4.574962995202333, + "learning_rate": 9.174917491749174e-07, + "loss": -0.0882, + "step": 695 + }, + { + "epoch": 0.5566886622675465, + "grad_norm": 5.716789868568477, + "learning_rate": 9.158415841584159e-07, + "loss": -0.0978, + "step": 696 + }, + { + "epoch": 0.5574885022995401, + "grad_norm": 5.1621141783698805, + "learning_rate": 9.141914191419141e-07, + "loss": -0.0492, + "step": 697 + }, + { + "epoch": 0.5582883423315337, + "grad_norm": 5.066721910041668, + "learning_rate": 9.125412541254125e-07, + "loss": -0.1249, + "step": 698 + }, + { + "epoch": 0.5590881823635273, + "grad_norm": 6.6614345364199, + "learning_rate": 9.108910891089109e-07, + "loss": -0.0784, + "step": 699 + }, + { + "epoch": 0.5598880223955209, + "grad_norm": 7.651991800546116, + "learning_rate": 9.092409240924092e-07, + "loss": 0.0583, + "step": 700 + }, + { + "epoch": 0.5606878624275144, + "grad_norm": 5.556872832637128, + "learning_rate": 9.075907590759075e-07, + "loss": -0.0337, + "step": 701 + }, + { + "epoch": 0.5614877024595081, + "grad_norm": 10.302249877529787, + "learning_rate": 9.05940594059406e-07, + "loss": -0.0318, + "step": 702 + }, + { + "epoch": 0.5622875424915017, + "grad_norm": 6.446831319626775, + "learning_rate": 9.042904290429042e-07, + "loss": -0.032, + "step": 703 + }, + { + "epoch": 0.5630873825234953, + "grad_norm": 3.1431425262284156, + "learning_rate": 9.026402640264025e-07, + "loss": -0.1257, + "step": 704 + }, + { + "epoch": 0.563887222555489, + "grad_norm": 6.8613589813033755, + "learning_rate": 9.00990099009901e-07, + "loss": -0.0899, + "step": 705 + }, + { + "epoch": 0.5646870625874825, + "grad_norm": 4.727075571003651, + "learning_rate": 8.993399339933992e-07, + "loss": -0.0103, + "step": 706 + }, + { + "epoch": 0.5654869026194761, + "grad_norm": 3.757033554841126, + "learning_rate": 8.976897689768976e-07, + "loss": -0.1201, + "step": 707 + }, + { + "epoch": 0.5662867426514697, + "grad_norm": 3.6568326567325586, + "learning_rate": 8.96039603960396e-07, + "loss": -0.0442, + "step": 708 + }, + { + "epoch": 0.5670865826834633, + "grad_norm": 4.422118271245446, + "learning_rate": 8.943894389438944e-07, + "loss": -0.1491, + "step": 709 + }, + { + "epoch": 0.5678864227154569, + "grad_norm": 4.090528079313399, + "learning_rate": 8.927392739273927e-07, + "loss": -0.1213, + "step": 710 + }, + { + "epoch": 0.5686862627474505, + "grad_norm": 6.203118235394611, + "learning_rate": 8.910891089108911e-07, + "loss": -0.0415, + "step": 711 + }, + { + "epoch": 0.5694861027794441, + "grad_norm": 4.192997526379617, + "learning_rate": 8.894389438943894e-07, + "loss": -0.0378, + "step": 712 + }, + { + "epoch": 0.5702859428114377, + "grad_norm": 4.532567536428998, + "learning_rate": 8.877887788778878e-07, + "loss": -0.0275, + "step": 713 + }, + { + "epoch": 0.5710857828434314, + "grad_norm": 4.168577112275988, + "learning_rate": 8.861386138613861e-07, + "loss": -0.0494, + "step": 714 + }, + { + "epoch": 0.5718856228754249, + "grad_norm": 7.436541583714728, + "learning_rate": 8.844884488448845e-07, + "loss": 0.0338, + "step": 715 + }, + { + "epoch": 0.5726854629074185, + "grad_norm": 4.3341221213745555, + "learning_rate": 8.828382838283828e-07, + "loss": -0.1824, + "step": 716 + }, + { + "epoch": 0.5734853029394121, + "grad_norm": 5.774496226610818, + "learning_rate": 8.811881188118812e-07, + "loss": -0.0764, + "step": 717 + }, + { + "epoch": 0.5742851429714058, + "grad_norm": 4.53339855951246, + "learning_rate": 8.795379537953795e-07, + "loss": -0.0494, + "step": 718 + }, + { + "epoch": 0.5750849830033993, + "grad_norm": 3.7011611817540118, + "learning_rate": 8.778877887788778e-07, + "loss": -0.1227, + "step": 719 + }, + { + "epoch": 0.5758848230353929, + "grad_norm": 4.117237995569142, + "learning_rate": 8.762376237623762e-07, + "loss": -0.0782, + "step": 720 + }, + { + "epoch": 0.5766846630673865, + "grad_norm": 3.347109381610254, + "learning_rate": 8.745874587458745e-07, + "loss": -0.1517, + "step": 721 + }, + { + "epoch": 0.5774845030993802, + "grad_norm": 3.9587273384893447, + "learning_rate": 8.729372937293729e-07, + "loss": -0.0647, + "step": 722 + }, + { + "epoch": 0.5782843431313738, + "grad_norm": 4.073013317525639, + "learning_rate": 8.712871287128712e-07, + "loss": -0.0354, + "step": 723 + }, + { + "epoch": 0.5790841831633673, + "grad_norm": 4.360786018214523, + "learning_rate": 8.696369636963697e-07, + "loss": -0.0845, + "step": 724 + }, + { + "epoch": 0.5798840231953609, + "grad_norm": 4.911325926898916, + "learning_rate": 8.679867986798679e-07, + "loss": -0.0581, + "step": 725 + }, + { + "epoch": 0.5806838632273545, + "grad_norm": 7.374565499647674, + "learning_rate": 8.663366336633663e-07, + "loss": 0.0876, + "step": 726 + }, + { + "epoch": 0.5814837032593482, + "grad_norm": 5.228899924882716, + "learning_rate": 8.646864686468647e-07, + "loss": -0.0188, + "step": 727 + }, + { + "epoch": 0.5822835432913417, + "grad_norm": 5.076190474367137, + "learning_rate": 8.63036303630363e-07, + "loss": -0.0482, + "step": 728 + }, + { + "epoch": 0.5830833833233353, + "grad_norm": 3.8391392043031067, + "learning_rate": 8.613861386138613e-07, + "loss": -0.0351, + "step": 729 + }, + { + "epoch": 0.5838832233553289, + "grad_norm": 4.720664943150823, + "learning_rate": 8.597359735973598e-07, + "loss": -0.0919, + "step": 730 + }, + { + "epoch": 0.5846830633873226, + "grad_norm": 3.696825036479519, + "learning_rate": 8.58085808580858e-07, + "loss": -0.0491, + "step": 731 + }, + { + "epoch": 0.5854829034193162, + "grad_norm": 4.957633580857494, + "learning_rate": 8.564356435643563e-07, + "loss": -0.0084, + "step": 732 + }, + { + "epoch": 0.5862827434513097, + "grad_norm": 4.145591204807576, + "learning_rate": 8.547854785478548e-07, + "loss": -0.0777, + "step": 733 + }, + { + "epoch": 0.5870825834833033, + "grad_norm": 6.008926427229853, + "learning_rate": 8.531353135313531e-07, + "loss": -0.0285, + "step": 734 + }, + { + "epoch": 0.587882423515297, + "grad_norm": 6.457940104008622, + "learning_rate": 8.514851485148514e-07, + "loss": -0.1126, + "step": 735 + }, + { + "epoch": 0.5886822635472906, + "grad_norm": 4.150286303835989, + "learning_rate": 8.498349834983498e-07, + "loss": -0.0633, + "step": 736 + }, + { + "epoch": 0.5894821035792841, + "grad_norm": 4.765817707105298, + "learning_rate": 8.481848184818482e-07, + "loss": -0.1274, + "step": 737 + }, + { + "epoch": 0.5902819436112777, + "grad_norm": 2.944307272093047, + "learning_rate": 8.465346534653464e-07, + "loss": -0.1143, + "step": 738 + }, + { + "epoch": 0.5910817836432714, + "grad_norm": 4.301068842918969, + "learning_rate": 8.448844884488449e-07, + "loss": -0.0303, + "step": 739 + }, + { + "epoch": 0.591881623675265, + "grad_norm": 3.5345181895694724, + "learning_rate": 8.432343234323432e-07, + "loss": -0.0369, + "step": 740 + }, + { + "epoch": 0.5926814637072585, + "grad_norm": 3.6758939784537477, + "learning_rate": 8.415841584158416e-07, + "loss": -0.0435, + "step": 741 + }, + { + "epoch": 0.5934813037392521, + "grad_norm": 9.280032916175081, + "learning_rate": 8.399339933993399e-07, + "loss": -0.077, + "step": 742 + }, + { + "epoch": 0.5942811437712457, + "grad_norm": 3.691324223005634, + "learning_rate": 8.382838283828383e-07, + "loss": 0.0384, + "step": 743 + }, + { + "epoch": 0.5950809838032394, + "grad_norm": 5.13228312974853, + "learning_rate": 8.366336633663366e-07, + "loss": -0.0101, + "step": 744 + }, + { + "epoch": 0.595880823835233, + "grad_norm": 3.127958499467288, + "learning_rate": 8.34983498349835e-07, + "loss": 0.0036, + "step": 745 + }, + { + "epoch": 0.5966806638672265, + "grad_norm": 4.371581567674568, + "learning_rate": 8.333333333333333e-07, + "loss": -0.088, + "step": 746 + }, + { + "epoch": 0.5974805038992201, + "grad_norm": 3.7498910252313786, + "learning_rate": 8.316831683168316e-07, + "loss": -0.0249, + "step": 747 + }, + { + "epoch": 0.5982803439312138, + "grad_norm": 9.554152491664782, + "learning_rate": 8.3003300330033e-07, + "loss": -0.0718, + "step": 748 + }, + { + "epoch": 0.5990801839632074, + "grad_norm": 4.450073267301403, + "learning_rate": 8.283828382838283e-07, + "loss": -0.038, + "step": 749 + }, + { + "epoch": 0.5998800239952009, + "grad_norm": 4.885796549111672, + "learning_rate": 8.267326732673267e-07, + "loss": -0.0279, + "step": 750 + }, + { + "epoch": 0.6006798640271945, + "grad_norm": 4.729710058959354, + "learning_rate": 8.25082508250825e-07, + "loss": -0.0415, + "step": 751 + }, + { + "epoch": 0.6014797040591882, + "grad_norm": 5.310403387692374, + "learning_rate": 8.234323432343234e-07, + "loss": -0.1964, + "step": 752 + }, + { + "epoch": 0.6022795440911818, + "grad_norm": 4.462129183196535, + "learning_rate": 8.217821782178217e-07, + "loss": -0.0289, + "step": 753 + }, + { + "epoch": 0.6030793841231754, + "grad_norm": 4.343559206058792, + "learning_rate": 8.201320132013201e-07, + "loss": 0.0155, + "step": 754 + }, + { + "epoch": 0.6038792241551689, + "grad_norm": 7.14111860643498, + "learning_rate": 8.184818481848184e-07, + "loss": 0.076, + "step": 755 + }, + { + "epoch": 0.6046790641871626, + "grad_norm": 5.741261351757093, + "learning_rate": 8.168316831683168e-07, + "loss": -0.0684, + "step": 756 + }, + { + "epoch": 0.6054789042191562, + "grad_norm": 4.841100743497433, + "learning_rate": 8.151815181518151e-07, + "loss": 0.0251, + "step": 757 + }, + { + "epoch": 0.6062787442511498, + "grad_norm": 5.776828704222559, + "learning_rate": 8.135313531353136e-07, + "loss": 0.0922, + "step": 758 + }, + { + "epoch": 0.6070785842831433, + "grad_norm": 5.171546395456714, + "learning_rate": 8.118811881188119e-07, + "loss": 0.0378, + "step": 759 + }, + { + "epoch": 0.607878424315137, + "grad_norm": 4.5497593071961475, + "learning_rate": 8.102310231023102e-07, + "loss": -0.0091, + "step": 760 + }, + { + "epoch": 0.6086782643471306, + "grad_norm": 3.4008260116242837, + "learning_rate": 8.085808580858086e-07, + "loss": -0.0513, + "step": 761 + }, + { + "epoch": 0.6094781043791242, + "grad_norm": 5.9528170929025475, + "learning_rate": 8.069306930693069e-07, + "loss": -0.0846, + "step": 762 + }, + { + "epoch": 0.6102779444111178, + "grad_norm": 8.59254741230532, + "learning_rate": 8.052805280528052e-07, + "loss": 0.0454, + "step": 763 + }, + { + "epoch": 0.6110777844431113, + "grad_norm": 5.92388092933109, + "learning_rate": 8.036303630363036e-07, + "loss": -0.0677, + "step": 764 + }, + { + "epoch": 0.611877624475105, + "grad_norm": 4.5071343981279375, + "learning_rate": 8.01980198019802e-07, + "loss": -0.0598, + "step": 765 + }, + { + "epoch": 0.6126774645070986, + "grad_norm": 5.095952967655762, + "learning_rate": 8.003300330033002e-07, + "loss": -0.1077, + "step": 766 + }, + { + "epoch": 0.6134773045390922, + "grad_norm": 3.686767117360266, + "learning_rate": 7.986798679867987e-07, + "loss": -0.0509, + "step": 767 + }, + { + "epoch": 0.6142771445710857, + "grad_norm": 4.709313867244328, + "learning_rate": 7.97029702970297e-07, + "loss": -0.0384, + "step": 768 + }, + { + "epoch": 0.6150769846030794, + "grad_norm": 4.606485786769665, + "learning_rate": 7.953795379537953e-07, + "loss": -0.0639, + "step": 769 + }, + { + "epoch": 0.615876824635073, + "grad_norm": 4.126308811511686, + "learning_rate": 7.937293729372937e-07, + "loss": -0.1454, + "step": 770 + }, + { + "epoch": 0.6166766646670666, + "grad_norm": 5.0891862328923985, + "learning_rate": 7.920792079207921e-07, + "loss": 0.0552, + "step": 771 + }, + { + "epoch": 0.6174765046990602, + "grad_norm": 4.348447825302712, + "learning_rate": 7.904290429042903e-07, + "loss": -0.0914, + "step": 772 + }, + { + "epoch": 0.6182763447310538, + "grad_norm": 4.069062786121328, + "learning_rate": 7.887788778877888e-07, + "loss": -0.0, + "step": 773 + }, + { + "epoch": 0.6190761847630474, + "grad_norm": 4.293180451800697, + "learning_rate": 7.871287128712871e-07, + "loss": -0.2021, + "step": 774 + }, + { + "epoch": 0.619876024795041, + "grad_norm": 6.832958614340714, + "learning_rate": 7.854785478547854e-07, + "loss": -0.049, + "step": 775 + }, + { + "epoch": 0.6206758648270346, + "grad_norm": 4.62295713929861, + "learning_rate": 7.838283828382838e-07, + "loss": -0.0099, + "step": 776 + }, + { + "epoch": 0.6214757048590281, + "grad_norm": 16.021103293232393, + "learning_rate": 7.821782178217821e-07, + "loss": -0.0229, + "step": 777 + }, + { + "epoch": 0.6222755448910218, + "grad_norm": 6.878826117062365, + "learning_rate": 7.805280528052805e-07, + "loss": 0.0852, + "step": 778 + }, + { + "epoch": 0.6230753849230154, + "grad_norm": 3.5925006851057595, + "learning_rate": 7.788778877887788e-07, + "loss": -0.0838, + "step": 779 + }, + { + "epoch": 0.623875224955009, + "grad_norm": 4.312199015410901, + "learning_rate": 7.772277227722772e-07, + "loss": -0.1938, + "step": 780 + }, + { + "epoch": 0.6246750649870026, + "grad_norm": 4.427437777822434, + "learning_rate": 7.755775577557755e-07, + "loss": -0.1088, + "step": 781 + }, + { + "epoch": 0.6254749050189962, + "grad_norm": 3.391720502521415, + "learning_rate": 7.739273927392739e-07, + "loss": -0.1204, + "step": 782 + }, + { + "epoch": 0.6262747450509898, + "grad_norm": 3.3125718306904512, + "learning_rate": 7.722772277227722e-07, + "loss": -0.0828, + "step": 783 + }, + { + "epoch": 0.6270745850829834, + "grad_norm": 5.402117257600779, + "learning_rate": 7.706270627062707e-07, + "loss": -0.054, + "step": 784 + }, + { + "epoch": 0.627874425114977, + "grad_norm": 4.489840589382479, + "learning_rate": 7.689768976897689e-07, + "loss": -0.0368, + "step": 785 + }, + { + "epoch": 0.6286742651469706, + "grad_norm": 3.848546702562119, + "learning_rate": 7.673267326732673e-07, + "loss": -0.0842, + "step": 786 + }, + { + "epoch": 0.6294741051789642, + "grad_norm": 4.3003301981629765, + "learning_rate": 7.656765676567657e-07, + "loss": -0.1575, + "step": 787 + }, + { + "epoch": 0.6302739452109578, + "grad_norm": 3.234095759477404, + "learning_rate": 7.64026402640264e-07, + "loss": -0.1448, + "step": 788 + }, + { + "epoch": 0.6310737852429514, + "grad_norm": 4.587363650091271, + "learning_rate": 7.623762376237624e-07, + "loss": -0.0806, + "step": 789 + }, + { + "epoch": 0.6318736252749451, + "grad_norm": 4.91701458923025, + "learning_rate": 7.607260726072607e-07, + "loss": 0.0289, + "step": 790 + }, + { + "epoch": 0.6326734653069386, + "grad_norm": 4.26117605640356, + "learning_rate": 7.59075907590759e-07, + "loss": -0.0442, + "step": 791 + }, + { + "epoch": 0.6334733053389322, + "grad_norm": 4.714328336316601, + "learning_rate": 7.574257425742574e-07, + "loss": -0.1182, + "step": 792 + }, + { + "epoch": 0.6342731453709258, + "grad_norm": 4.8043256421789975, + "learning_rate": 7.557755775577558e-07, + "loss": -0.0532, + "step": 793 + }, + { + "epoch": 0.6350729854029195, + "grad_norm": 4.981158806362152, + "learning_rate": 7.54125412541254e-07, + "loss": -0.0322, + "step": 794 + }, + { + "epoch": 0.635872825434913, + "grad_norm": 6.600526915815604, + "learning_rate": 7.524752475247525e-07, + "loss": -0.1059, + "step": 795 + }, + { + "epoch": 0.6366726654669066, + "grad_norm": 4.989184316503347, + "learning_rate": 7.508250825082508e-07, + "loss": -0.0888, + "step": 796 + }, + { + "epoch": 0.6374725054989002, + "grad_norm": 4.683126825596764, + "learning_rate": 7.491749174917491e-07, + "loss": -0.0425, + "step": 797 + }, + { + "epoch": 0.6382723455308938, + "grad_norm": 2.835338855117646, + "learning_rate": 7.475247524752475e-07, + "loss": -0.1188, + "step": 798 + }, + { + "epoch": 0.6390721855628875, + "grad_norm": 3.334782349034145, + "learning_rate": 7.458745874587459e-07, + "loss": -0.0085, + "step": 799 + }, + { + "epoch": 0.639872025594881, + "grad_norm": 3.8950834045490677, + "learning_rate": 7.442244224422441e-07, + "loss": -0.0866, + "step": 800 + }, + { + "epoch": 0.6406718656268746, + "grad_norm": 3.7954504535302047, + "learning_rate": 7.425742574257426e-07, + "loss": -0.1718, + "step": 801 + }, + { + "epoch": 0.6414717056588682, + "grad_norm": 3.9147882759458907, + "learning_rate": 7.409240924092409e-07, + "loss": -0.1172, + "step": 802 + }, + { + "epoch": 0.6422715456908619, + "grad_norm": 4.14195212922553, + "learning_rate": 7.392739273927392e-07, + "loss": -0.2055, + "step": 803 + }, + { + "epoch": 0.6430713857228554, + "grad_norm": 5.564167929906255, + "learning_rate": 7.376237623762376e-07, + "loss": -0.0587, + "step": 804 + }, + { + "epoch": 0.643871225754849, + "grad_norm": 5.104003509197404, + "learning_rate": 7.359735973597359e-07, + "loss": -0.1599, + "step": 805 + }, + { + "epoch": 0.6446710657868426, + "grad_norm": 3.249110466926901, + "learning_rate": 7.343234323432343e-07, + "loss": 0.0254, + "step": 806 + }, + { + "epoch": 0.6454709058188363, + "grad_norm": 5.330488201062819, + "learning_rate": 7.326732673267326e-07, + "loss": -0.0932, + "step": 807 + }, + { + "epoch": 0.6462707458508299, + "grad_norm": 3.391881050876262, + "learning_rate": 7.31023102310231e-07, + "loss": -0.0045, + "step": 808 + }, + { + "epoch": 0.6470705858828234, + "grad_norm": 4.729176906400958, + "learning_rate": 7.293729372937293e-07, + "loss": 0.0309, + "step": 809 + }, + { + "epoch": 0.647870425914817, + "grad_norm": 4.873305784391995, + "learning_rate": 7.277227722772277e-07, + "loss": -0.016, + "step": 810 + }, + { + "epoch": 0.6486702659468107, + "grad_norm": 4.737325724513948, + "learning_rate": 7.26072607260726e-07, + "loss": -0.0555, + "step": 811 + }, + { + "epoch": 0.6494701059788043, + "grad_norm": 4.700178573137915, + "learning_rate": 7.244224422442245e-07, + "loss": -0.0054, + "step": 812 + }, + { + "epoch": 0.6502699460107978, + "grad_norm": 3.8303301007119375, + "learning_rate": 7.227722772277227e-07, + "loss": -0.0237, + "step": 813 + }, + { + "epoch": 0.6510697860427914, + "grad_norm": 3.9787912774229404, + "learning_rate": 7.211221122112211e-07, + "loss": -0.0158, + "step": 814 + }, + { + "epoch": 0.651869626074785, + "grad_norm": 4.3995808661843805, + "learning_rate": 7.194719471947195e-07, + "loss": 0.0276, + "step": 815 + }, + { + "epoch": 0.6526694661067787, + "grad_norm": 4.052268422009291, + "learning_rate": 7.178217821782178e-07, + "loss": -0.0178, + "step": 816 + }, + { + "epoch": 0.6534693061387723, + "grad_norm": 4.162149014725009, + "learning_rate": 7.161716171617161e-07, + "loss": -0.0343, + "step": 817 + }, + { + "epoch": 0.6542691461707658, + "grad_norm": 4.733808124710197, + "learning_rate": 7.145214521452146e-07, + "loss": -0.0719, + "step": 818 + }, + { + "epoch": 0.6550689862027594, + "grad_norm": 4.367810691084756, + "learning_rate": 7.128712871287128e-07, + "loss": -0.1031, + "step": 819 + }, + { + "epoch": 0.6558688262347531, + "grad_norm": 3.393663864026175, + "learning_rate": 7.112211221122111e-07, + "loss": -0.0469, + "step": 820 + }, + { + "epoch": 0.6566686662667467, + "grad_norm": 6.593078579096907, + "learning_rate": 7.095709570957096e-07, + "loss": -0.0492, + "step": 821 + }, + { + "epoch": 0.6574685062987402, + "grad_norm": 5.90593512770552, + "learning_rate": 7.079207920792078e-07, + "loss": 0.017, + "step": 822 + }, + { + "epoch": 0.6582683463307338, + "grad_norm": 4.280214281867913, + "learning_rate": 7.062706270627063e-07, + "loss": -0.0432, + "step": 823 + }, + { + "epoch": 0.6590681863627275, + "grad_norm": 8.423741404535653, + "learning_rate": 7.046204620462046e-07, + "loss": -0.0291, + "step": 824 + }, + { + "epoch": 0.6598680263947211, + "grad_norm": 5.038317153573228, + "learning_rate": 7.029702970297029e-07, + "loss": -0.093, + "step": 825 + }, + { + "epoch": 0.6606678664267147, + "grad_norm": 6.706344720432834, + "learning_rate": 7.013201320132013e-07, + "loss": -0.1315, + "step": 826 + }, + { + "epoch": 0.6614677064587082, + "grad_norm": 3.716489601764274, + "learning_rate": 6.996699669966997e-07, + "loss": -0.0825, + "step": 827 + }, + { + "epoch": 0.6622675464907019, + "grad_norm": 4.158836764412884, + "learning_rate": 6.980198019801979e-07, + "loss": -0.0161, + "step": 828 + }, + { + "epoch": 0.6630673865226955, + "grad_norm": 3.733283485919958, + "learning_rate": 6.963696369636964e-07, + "loss": -0.0594, + "step": 829 + }, + { + "epoch": 0.6638672265546891, + "grad_norm": 6.968567713247902, + "learning_rate": 6.947194719471947e-07, + "loss": -0.0441, + "step": 830 + }, + { + "epoch": 0.6646670665866826, + "grad_norm": 5.008730323701448, + "learning_rate": 6.93069306930693e-07, + "loss": -0.1307, + "step": 831 + }, + { + "epoch": 0.6654669066186762, + "grad_norm": 4.2175650733942955, + "learning_rate": 6.914191419141914e-07, + "loss": -0.0771, + "step": 832 + }, + { + "epoch": 0.6662667466506699, + "grad_norm": 5.402322742920563, + "learning_rate": 6.897689768976897e-07, + "loss": -0.0841, + "step": 833 + }, + { + "epoch": 0.6670665866826635, + "grad_norm": 5.646942573991696, + "learning_rate": 6.88118811881188e-07, + "loss": -0.0622, + "step": 834 + }, + { + "epoch": 0.667866426714657, + "grad_norm": 13.649360926832344, + "learning_rate": 6.864686468646864e-07, + "loss": -0.0637, + "step": 835 + }, + { + "epoch": 0.6686662667466506, + "grad_norm": 5.461268948386568, + "learning_rate": 6.848184818481848e-07, + "loss": -0.0241, + "step": 836 + }, + { + "epoch": 0.6694661067786443, + "grad_norm": 4.011621520471584, + "learning_rate": 6.831683168316831e-07, + "loss": -0.0422, + "step": 837 + }, + { + "epoch": 0.6702659468106379, + "grad_norm": 20.835014010983784, + "learning_rate": 6.815181518151815e-07, + "loss": -0.126, + "step": 838 + }, + { + "epoch": 0.6710657868426315, + "grad_norm": 5.041368060556288, + "learning_rate": 6.798679867986798e-07, + "loss": -0.1016, + "step": 839 + }, + { + "epoch": 0.671865626874625, + "grad_norm": 5.834292995896152, + "learning_rate": 6.782178217821783e-07, + "loss": -0.0738, + "step": 840 + }, + { + "epoch": 0.6726654669066187, + "grad_norm": 3.5120723151753985, + "learning_rate": 6.765676567656765e-07, + "loss": -0.0504, + "step": 841 + }, + { + "epoch": 0.6734653069386123, + "grad_norm": 2.901517891733533, + "learning_rate": 6.749174917491749e-07, + "loss": -0.0698, + "step": 842 + }, + { + "epoch": 0.6742651469706059, + "grad_norm": 3.465841087435974, + "learning_rate": 6.732673267326733e-07, + "loss": -0.1227, + "step": 843 + }, + { + "epoch": 0.6750649870025994, + "grad_norm": 4.536588693958206, + "learning_rate": 6.716171617161716e-07, + "loss": -0.0602, + "step": 844 + }, + { + "epoch": 0.675864827034593, + "grad_norm": 10.903656834330391, + "learning_rate": 6.699669966996699e-07, + "loss": -0.1289, + "step": 845 + }, + { + "epoch": 0.6766646670665867, + "grad_norm": 7.296365266758308, + "learning_rate": 6.683168316831684e-07, + "loss": -0.1561, + "step": 846 + }, + { + "epoch": 0.6774645070985803, + "grad_norm": 4.412331570876947, + "learning_rate": 6.666666666666666e-07, + "loss": -0.0958, + "step": 847 + }, + { + "epoch": 0.6782643471305739, + "grad_norm": 2.8672230897612345, + "learning_rate": 6.650165016501649e-07, + "loss": -0.1643, + "step": 848 + }, + { + "epoch": 0.6790641871625674, + "grad_norm": 4.5674033793568904, + "learning_rate": 6.633663366336634e-07, + "loss": -0.1336, + "step": 849 + }, + { + "epoch": 0.6798640271945611, + "grad_norm": 4.381540695320094, + "learning_rate": 6.617161716171616e-07, + "loss": -0.0132, + "step": 850 + }, + { + "epoch": 0.6806638672265547, + "grad_norm": 5.664059132542103, + "learning_rate": 6.6006600660066e-07, + "loss": -0.0285, + "step": 851 + }, + { + "epoch": 0.6814637072585483, + "grad_norm": 5.7461780617115, + "learning_rate": 6.584158415841584e-07, + "loss": 0.0047, + "step": 852 + }, + { + "epoch": 0.6822635472905418, + "grad_norm": 4.320720191192789, + "learning_rate": 6.567656765676567e-07, + "loss": -0.1447, + "step": 853 + }, + { + "epoch": 0.6830633873225355, + "grad_norm": 3.108737285911658, + "learning_rate": 6.55115511551155e-07, + "loss": -0.1611, + "step": 854 + }, + { + "epoch": 0.6838632273545291, + "grad_norm": 5.023876212557061, + "learning_rate": 6.534653465346535e-07, + "loss": -0.1529, + "step": 855 + }, + { + "epoch": 0.6846630673865227, + "grad_norm": 4.545192341611211, + "learning_rate": 6.518151815181517e-07, + "loss": -0.0456, + "step": 856 + }, + { + "epoch": 0.6854629074185163, + "grad_norm": 5.267884265007784, + "learning_rate": 6.501650165016502e-07, + "loss": -0.0288, + "step": 857 + }, + { + "epoch": 0.6862627474505099, + "grad_norm": 5.101939165542976, + "learning_rate": 6.485148514851485e-07, + "loss": -0.0036, + "step": 858 + }, + { + "epoch": 0.6870625874825035, + "grad_norm": 3.55089136672625, + "learning_rate": 6.468646864686468e-07, + "loss": -0.0762, + "step": 859 + }, + { + "epoch": 0.6878624275144971, + "grad_norm": 4.121292066096188, + "learning_rate": 6.452145214521452e-07, + "loss": -0.0858, + "step": 860 + }, + { + "epoch": 0.6886622675464907, + "grad_norm": 4.411771304555708, + "learning_rate": 6.435643564356436e-07, + "loss": -0.0359, + "step": 861 + }, + { + "epoch": 0.6894621075784843, + "grad_norm": 4.163024887578695, + "learning_rate": 6.419141914191419e-07, + "loss": -0.1036, + "step": 862 + }, + { + "epoch": 0.6902619476104779, + "grad_norm": 3.9253552561550307, + "learning_rate": 6.402640264026402e-07, + "loss": -0.092, + "step": 863 + }, + { + "epoch": 0.6910617876424715, + "grad_norm": 3.3756432096953906, + "learning_rate": 6.386138613861386e-07, + "loss": 0.0002, + "step": 864 + }, + { + "epoch": 0.6918616276744651, + "grad_norm": 4.4942197763700245, + "learning_rate": 6.369636963696369e-07, + "loss": 0.0044, + "step": 865 + }, + { + "epoch": 0.6926614677064588, + "grad_norm": 4.469003096662444, + "learning_rate": 6.353135313531353e-07, + "loss": -0.0027, + "step": 866 + }, + { + "epoch": 0.6934613077384523, + "grad_norm": 3.212779461175308, + "learning_rate": 6.336633663366336e-07, + "loss": -0.0255, + "step": 867 + }, + { + "epoch": 0.6942611477704459, + "grad_norm": 3.473986685130551, + "learning_rate": 6.32013201320132e-07, + "loss": -0.0171, + "step": 868 + }, + { + "epoch": 0.6950609878024395, + "grad_norm": 4.216176017792983, + "learning_rate": 6.303630363036303e-07, + "loss": -0.0673, + "step": 869 + }, + { + "epoch": 0.6958608278344331, + "grad_norm": 12.098304541476889, + "learning_rate": 6.287128712871287e-07, + "loss": 0.0742, + "step": 870 + }, + { + "epoch": 0.6966606678664267, + "grad_norm": 8.083305542363899, + "learning_rate": 6.270627062706271e-07, + "loss": 0.0401, + "step": 871 + }, + { + "epoch": 0.6974605078984203, + "grad_norm": 4.685734734341067, + "learning_rate": 6.254125412541254e-07, + "loss": -0.0235, + "step": 872 + }, + { + "epoch": 0.6982603479304139, + "grad_norm": 5.85956379904162, + "learning_rate": 6.237623762376237e-07, + "loss": -0.1556, + "step": 873 + }, + { + "epoch": 0.6990601879624075, + "grad_norm": 3.61321328588953, + "learning_rate": 6.221122112211222e-07, + "loss": -0.093, + "step": 874 + }, + { + "epoch": 0.6998600279944012, + "grad_norm": 3.528560458155755, + "learning_rate": 6.204620462046204e-07, + "loss": -0.0322, + "step": 875 + }, + { + "epoch": 0.7006598680263947, + "grad_norm": 6.51982029204985, + "learning_rate": 6.188118811881187e-07, + "loss": 0.0667, + "step": 876 + }, + { + "epoch": 0.7014597080583883, + "grad_norm": 4.530213336089527, + "learning_rate": 6.171617161716172e-07, + "loss": -0.0434, + "step": 877 + }, + { + "epoch": 0.7022595480903819, + "grad_norm": 6.616076811947681, + "learning_rate": 6.155115511551154e-07, + "loss": 0.0219, + "step": 878 + }, + { + "epoch": 0.7030593881223756, + "grad_norm": 4.1562171877376075, + "learning_rate": 6.138613861386138e-07, + "loss": -0.0648, + "step": 879 + }, + { + "epoch": 0.7038592281543691, + "grad_norm": 4.419647626337558, + "learning_rate": 6.122112211221122e-07, + "loss": -0.1112, + "step": 880 + }, + { + "epoch": 0.7046590681863627, + "grad_norm": 4.940388667457944, + "learning_rate": 6.105610561056105e-07, + "loss": -0.0162, + "step": 881 + }, + { + "epoch": 0.7054589082183563, + "grad_norm": 6.023947292183416, + "learning_rate": 6.089108910891088e-07, + "loss": -0.0435, + "step": 882 + }, + { + "epoch": 0.70625874825035, + "grad_norm": 7.1901426018379935, + "learning_rate": 6.072607260726073e-07, + "loss": -0.001, + "step": 883 + }, + { + "epoch": 0.7070585882823436, + "grad_norm": 5.858167518812244, + "learning_rate": 6.056105610561055e-07, + "loss": -0.0017, + "step": 884 + }, + { + "epoch": 0.7078584283143371, + "grad_norm": 4.6643313740940835, + "learning_rate": 6.03960396039604e-07, + "loss": -0.0808, + "step": 885 + }, + { + "epoch": 0.7086582683463307, + "grad_norm": 3.964900755824796, + "learning_rate": 6.023102310231023e-07, + "loss": -0.1257, + "step": 886 + }, + { + "epoch": 0.7094581083783243, + "grad_norm": 3.6312620074127797, + "learning_rate": 6.006600660066007e-07, + "loss": -0.044, + "step": 887 + }, + { + "epoch": 0.710257948410318, + "grad_norm": 4.46804227760141, + "learning_rate": 5.99009900990099e-07, + "loss": -0.0864, + "step": 888 + }, + { + "epoch": 0.7110577884423115, + "grad_norm": 3.9877331513072884, + "learning_rate": 5.973597359735974e-07, + "loss": -0.0555, + "step": 889 + }, + { + "epoch": 0.7118576284743051, + "grad_norm": 5.71452000001456, + "learning_rate": 5.957095709570957e-07, + "loss": -0.0516, + "step": 890 + }, + { + "epoch": 0.7126574685062987, + "grad_norm": 4.729225707280767, + "learning_rate": 5.94059405940594e-07, + "loss": -0.1252, + "step": 891 + }, + { + "epoch": 0.7134573085382924, + "grad_norm": 10.531702048903348, + "learning_rate": 5.924092409240924e-07, + "loss": -0.0143, + "step": 892 + }, + { + "epoch": 0.714257148570286, + "grad_norm": 4.337061940699697, + "learning_rate": 5.907590759075907e-07, + "loss": -0.0256, + "step": 893 + }, + { + "epoch": 0.7150569886022795, + "grad_norm": 4.553139268045056, + "learning_rate": 5.891089108910891e-07, + "loss": 0.0592, + "step": 894 + }, + { + "epoch": 0.7158568286342731, + "grad_norm": 3.65289282929829, + "learning_rate": 5.874587458745874e-07, + "loss": 0.0011, + "step": 895 + }, + { + "epoch": 0.7166566686662668, + "grad_norm": 2.969949901428, + "learning_rate": 5.858085808580858e-07, + "loss": -0.1288, + "step": 896 + }, + { + "epoch": 0.7174565086982604, + "grad_norm": 5.316965178875907, + "learning_rate": 5.841584158415841e-07, + "loss": -0.0688, + "step": 897 + }, + { + "epoch": 0.7182563487302539, + "grad_norm": 6.112915886146603, + "learning_rate": 5.825082508250825e-07, + "loss": -0.1923, + "step": 898 + }, + { + "epoch": 0.7190561887622475, + "grad_norm": 4.072265156624673, + "learning_rate": 5.808580858085808e-07, + "loss": -0.1749, + "step": 899 + }, + { + "epoch": 0.7198560287942412, + "grad_norm": 4.286524287381163, + "learning_rate": 5.792079207920792e-07, + "loss": -0.0736, + "step": 900 + }, + { + "epoch": 0.7206558688262348, + "grad_norm": 6.654813369667659, + "learning_rate": 5.775577557755775e-07, + "loss": -0.1185, + "step": 901 + }, + { + "epoch": 0.7214557088582284, + "grad_norm": 5.075962580453491, + "learning_rate": 5.75907590759076e-07, + "loss": 0.0889, + "step": 902 + }, + { + "epoch": 0.7222555488902219, + "grad_norm": 5.581410015072146, + "learning_rate": 5.742574257425742e-07, + "loss": -0.0718, + "step": 903 + }, + { + "epoch": 0.7230553889222155, + "grad_norm": 7.760040178489886, + "learning_rate": 5.726072607260726e-07, + "loss": -0.0227, + "step": 904 + }, + { + "epoch": 0.7238552289542092, + "grad_norm": 4.491264765964933, + "learning_rate": 5.70957095709571e-07, + "loss": -0.0838, + "step": 905 + }, + { + "epoch": 0.7246550689862028, + "grad_norm": 4.876358038696258, + "learning_rate": 5.693069306930692e-07, + "loss": -0.172, + "step": 906 + }, + { + "epoch": 0.7254549090181963, + "grad_norm": 3.9354286195012422, + "learning_rate": 5.676567656765676e-07, + "loss": -0.0793, + "step": 907 + }, + { + "epoch": 0.7262547490501899, + "grad_norm": 3.51016598192195, + "learning_rate": 5.66006600660066e-07, + "loss": 0.0052, + "step": 908 + }, + { + "epoch": 0.7270545890821836, + "grad_norm": 3.6895775842146166, + "learning_rate": 5.643564356435643e-07, + "loss": -0.0167, + "step": 909 + }, + { + "epoch": 0.7278544291141772, + "grad_norm": 4.773443293841103, + "learning_rate": 5.627062706270626e-07, + "loss": -0.0359, + "step": 910 + }, + { + "epoch": 0.7286542691461708, + "grad_norm": 4.278237278247243, + "learning_rate": 5.610561056105611e-07, + "loss": -0.054, + "step": 911 + }, + { + "epoch": 0.7294541091781643, + "grad_norm": 10.27415077431224, + "learning_rate": 5.594059405940594e-07, + "loss": -0.0075, + "step": 912 + }, + { + "epoch": 0.730253949210158, + "grad_norm": 4.2766288821859755, + "learning_rate": 5.577557755775577e-07, + "loss": 0.0493, + "step": 913 + }, + { + "epoch": 0.7310537892421516, + "grad_norm": 3.5281016400546275, + "learning_rate": 5.561056105610561e-07, + "loss": -0.0766, + "step": 914 + }, + { + "epoch": 0.7318536292741452, + "grad_norm": 3.8674946364382223, + "learning_rate": 5.544554455445545e-07, + "loss": -0.0442, + "step": 915 + }, + { + "epoch": 0.7326534693061387, + "grad_norm": 5.4465871711884395, + "learning_rate": 5.528052805280527e-07, + "loss": -0.0841, + "step": 916 + }, + { + "epoch": 0.7334533093381324, + "grad_norm": 3.530741427097772, + "learning_rate": 5.511551155115512e-07, + "loss": -0.0629, + "step": 917 + }, + { + "epoch": 0.734253149370126, + "grad_norm": 3.965321298788348, + "learning_rate": 5.495049504950495e-07, + "loss": -0.0573, + "step": 918 + }, + { + "epoch": 0.7350529894021196, + "grad_norm": 4.295307109186891, + "learning_rate": 5.478547854785477e-07, + "loss": -0.1381, + "step": 919 + }, + { + "epoch": 0.7358528294341132, + "grad_norm": 3.8500617084264257, + "learning_rate": 5.462046204620462e-07, + "loss": -0.049, + "step": 920 + }, + { + "epoch": 0.7366526694661067, + "grad_norm": 4.8663143152337005, + "learning_rate": 5.445544554455445e-07, + "loss": -0.0553, + "step": 921 + }, + { + "epoch": 0.7374525094981004, + "grad_norm": 3.4389426238655476, + "learning_rate": 5.429042904290429e-07, + "loss": -0.1896, + "step": 922 + }, + { + "epoch": 0.738252349530094, + "grad_norm": 5.019665193069423, + "learning_rate": 5.412541254125412e-07, + "loss": 0.1007, + "step": 923 + }, + { + "epoch": 0.7390521895620876, + "grad_norm": 3.9100586900916126, + "learning_rate": 5.396039603960396e-07, + "loss": -0.0697, + "step": 924 + }, + { + "epoch": 0.7398520295940811, + "grad_norm": 3.919136094051066, + "learning_rate": 5.379537953795379e-07, + "loss": -0.0907, + "step": 925 + }, + { + "epoch": 0.7406518696260748, + "grad_norm": 4.124863593418168, + "learning_rate": 5.363036303630363e-07, + "loss": -0.0569, + "step": 926 + }, + { + "epoch": 0.7414517096580684, + "grad_norm": 3.9668145454046977, + "learning_rate": 5.346534653465346e-07, + "loss": -0.0045, + "step": 927 + }, + { + "epoch": 0.742251549690062, + "grad_norm": 4.794421236275003, + "learning_rate": 5.33003300330033e-07, + "loss": -0.1284, + "step": 928 + }, + { + "epoch": 0.7430513897220556, + "grad_norm": 5.586609443850984, + "learning_rate": 5.313531353135313e-07, + "loss": -0.0346, + "step": 929 + }, + { + "epoch": 0.7438512297540492, + "grad_norm": 5.408030486873039, + "learning_rate": 5.297029702970297e-07, + "loss": -0.1473, + "step": 930 + }, + { + "epoch": 0.7446510697860428, + "grad_norm": 3.506888694865617, + "learning_rate": 5.28052805280528e-07, + "loss": -0.0018, + "step": 931 + }, + { + "epoch": 0.7454509098180364, + "grad_norm": 4.703351899310227, + "learning_rate": 5.264026402640264e-07, + "loss": 0.0077, + "step": 932 + }, + { + "epoch": 0.74625074985003, + "grad_norm": 5.030173808558858, + "learning_rate": 5.247524752475247e-07, + "loss": -0.1189, + "step": 933 + }, + { + "epoch": 0.7470505898820236, + "grad_norm": 6.14459022838033, + "learning_rate": 5.23102310231023e-07, + "loss": 0.0135, + "step": 934 + }, + { + "epoch": 0.7478504299140172, + "grad_norm": 3.162063833925453, + "learning_rate": 5.214521452145214e-07, + "loss": -0.0648, + "step": 935 + }, + { + "epoch": 0.7486502699460108, + "grad_norm": 3.322990934843452, + "learning_rate": 5.198019801980198e-07, + "loss": -0.0444, + "step": 936 + }, + { + "epoch": 0.7494501099780044, + "grad_norm": 7.281580577762579, + "learning_rate": 5.181518151815182e-07, + "loss": -0.079, + "step": 937 + }, + { + "epoch": 0.750249950009998, + "grad_norm": 4.995689441346887, + "learning_rate": 5.165016501650164e-07, + "loss": -0.0494, + "step": 938 + }, + { + "epoch": 0.7510497900419916, + "grad_norm": 3.662981016059356, + "learning_rate": 5.148514851485149e-07, + "loss": -0.1306, + "step": 939 + }, + { + "epoch": 0.7518496300739852, + "grad_norm": 4.564675844346983, + "learning_rate": 5.132013201320132e-07, + "loss": 0.0849, + "step": 940 + }, + { + "epoch": 0.7526494701059788, + "grad_norm": 4.127114880881102, + "learning_rate": 5.115511551155115e-07, + "loss": -0.1003, + "step": 941 + }, + { + "epoch": 0.7534493101379725, + "grad_norm": 7.580284959172436, + "learning_rate": 5.099009900990099e-07, + "loss": 0.0127, + "step": 942 + }, + { + "epoch": 0.754249150169966, + "grad_norm": 3.8477764594959223, + "learning_rate": 5.082508250825083e-07, + "loss": -0.0247, + "step": 943 + }, + { + "epoch": 0.7550489902019596, + "grad_norm": 5.134480523239938, + "learning_rate": 5.066006600660065e-07, + "loss": -0.064, + "step": 944 + }, + { + "epoch": 0.7558488302339532, + "grad_norm": 4.184980403074917, + "learning_rate": 5.04950495049505e-07, + "loss": -0.0374, + "step": 945 + }, + { + "epoch": 0.7566486702659468, + "grad_norm": 4.095109087896494, + "learning_rate": 5.033003300330033e-07, + "loss": -0.0667, + "step": 946 + }, + { + "epoch": 0.7574485102979404, + "grad_norm": 4.493509273952016, + "learning_rate": 5.016501650165016e-07, + "loss": -0.046, + "step": 947 + }, + { + "epoch": 0.758248350329934, + "grad_norm": 4.37283031614021, + "learning_rate": 5e-07, + "loss": -0.0662, + "step": 948 + }, + { + "epoch": 0.7590481903619276, + "grad_norm": 4.509842387488209, + "learning_rate": 4.983498349834983e-07, + "loss": -0.0951, + "step": 949 + }, + { + "epoch": 0.7598480303939212, + "grad_norm": 5.100099502789577, + "learning_rate": 4.966996699669966e-07, + "loss": -0.1086, + "step": 950 + }, + { + "epoch": 0.7606478704259149, + "grad_norm": 4.034320460393353, + "learning_rate": 4.95049504950495e-07, + "loss": -0.0263, + "step": 951 + }, + { + "epoch": 0.7614477104579084, + "grad_norm": 4.593555887493731, + "learning_rate": 4.933993399339933e-07, + "loss": -0.1636, + "step": 952 + }, + { + "epoch": 0.762247550489902, + "grad_norm": 4.526550802808929, + "learning_rate": 4.917491749174918e-07, + "loss": -0.0163, + "step": 953 + }, + { + "epoch": 0.7630473905218956, + "grad_norm": 5.515431862710494, + "learning_rate": 4.900990099009901e-07, + "loss": 0.0348, + "step": 954 + }, + { + "epoch": 0.7638472305538893, + "grad_norm": 4.2009128577609145, + "learning_rate": 4.884488448844884e-07, + "loss": -0.2542, + "step": 955 + }, + { + "epoch": 0.7646470705858828, + "grad_norm": 4.271019064733657, + "learning_rate": 4.867986798679868e-07, + "loss": -0.1869, + "step": 956 + }, + { + "epoch": 0.7654469106178764, + "grad_norm": 2.77377368877347, + "learning_rate": 4.851485148514851e-07, + "loss": -0.0735, + "step": 957 + }, + { + "epoch": 0.76624675064987, + "grad_norm": 6.0608792055595995, + "learning_rate": 4.834983498349835e-07, + "loss": -0.0099, + "step": 958 + }, + { + "epoch": 0.7670465906818636, + "grad_norm": 4.371986441183748, + "learning_rate": 4.818481848184819e-07, + "loss": -0.0748, + "step": 959 + }, + { + "epoch": 0.7678464307138573, + "grad_norm": 3.611071563569357, + "learning_rate": 4.801980198019802e-07, + "loss": -0.085, + "step": 960 + }, + { + "epoch": 0.7686462707458508, + "grad_norm": 5.1052303944455915, + "learning_rate": 4.785478547854785e-07, + "loss": 0.0156, + "step": 961 + }, + { + "epoch": 0.7694461107778444, + "grad_norm": 4.3911870956367505, + "learning_rate": 4.768976897689769e-07, + "loss": -0.0685, + "step": 962 + }, + { + "epoch": 0.770245950809838, + "grad_norm": 4.259527634413816, + "learning_rate": 4.752475247524752e-07, + "loss": -0.0837, + "step": 963 + }, + { + "epoch": 0.7710457908418317, + "grad_norm": 4.970224413650343, + "learning_rate": 4.735973597359736e-07, + "loss": -0.1512, + "step": 964 + }, + { + "epoch": 0.7718456308738252, + "grad_norm": 5.7445596582962395, + "learning_rate": 4.7194719471947193e-07, + "loss": -0.0713, + "step": 965 + }, + { + "epoch": 0.7726454709058188, + "grad_norm": 4.543014932858551, + "learning_rate": 4.7029702970297026e-07, + "loss": -0.1337, + "step": 966 + }, + { + "epoch": 0.7734453109378124, + "grad_norm": 4.451634421927441, + "learning_rate": 4.6864686468646865e-07, + "loss": -0.1269, + "step": 967 + }, + { + "epoch": 0.7742451509698061, + "grad_norm": 5.379063157263456, + "learning_rate": 4.66996699669967e-07, + "loss": -0.0862, + "step": 968 + }, + { + "epoch": 0.7750449910017997, + "grad_norm": 10.70668137767473, + "learning_rate": 4.6534653465346537e-07, + "loss": 0.0881, + "step": 969 + }, + { + "epoch": 0.7758448310337932, + "grad_norm": 3.7167032202960177, + "learning_rate": 4.6369636963696365e-07, + "loss": -0.017, + "step": 970 + }, + { + "epoch": 0.7766446710657868, + "grad_norm": 4.634801839565637, + "learning_rate": 4.62046204620462e-07, + "loss": -0.1011, + "step": 971 + }, + { + "epoch": 0.7774445110977805, + "grad_norm": 3.7911455543889354, + "learning_rate": 4.603960396039604e-07, + "loss": -0.1149, + "step": 972 + }, + { + "epoch": 0.7782443511297741, + "grad_norm": 3.824528464380088, + "learning_rate": 4.587458745874587e-07, + "loss": 0.0263, + "step": 973 + }, + { + "epoch": 0.7790441911617676, + "grad_norm": 3.978602397648478, + "learning_rate": 4.5709570957095705e-07, + "loss": -0.0273, + "step": 974 + }, + { + "epoch": 0.7798440311937612, + "grad_norm": 3.9315702808225206, + "learning_rate": 4.5544554455445543e-07, + "loss": -0.1133, + "step": 975 + }, + { + "epoch": 0.7806438712257548, + "grad_norm": 4.4594711325427845, + "learning_rate": 4.5379537953795377e-07, + "loss": -0.0891, + "step": 976 + }, + { + "epoch": 0.7814437112577485, + "grad_norm": 4.03305817498308, + "learning_rate": 4.521452145214521e-07, + "loss": -0.1069, + "step": 977 + }, + { + "epoch": 0.7822435512897421, + "grad_norm": 4.444308363632601, + "learning_rate": 4.504950495049505e-07, + "loss": -0.1274, + "step": 978 + }, + { + "epoch": 0.7830433913217356, + "grad_norm": 3.8559207670610953, + "learning_rate": 4.488448844884488e-07, + "loss": -0.1089, + "step": 979 + }, + { + "epoch": 0.7838432313537292, + "grad_norm": 6.852327449815295, + "learning_rate": 4.471947194719472e-07, + "loss": -0.0608, + "step": 980 + }, + { + "epoch": 0.7846430713857229, + "grad_norm": 5.172264061437722, + "learning_rate": 4.4554455445544555e-07, + "loss": -0.0064, + "step": 981 + }, + { + "epoch": 0.7854429114177165, + "grad_norm": 4.338134868672705, + "learning_rate": 4.438943894389439e-07, + "loss": 0.0261, + "step": 982 + }, + { + "epoch": 0.78624275144971, + "grad_norm": 4.7120432908120975, + "learning_rate": 4.4224422442244227e-07, + "loss": -0.006, + "step": 983 + }, + { + "epoch": 0.7870425914817036, + "grad_norm": 4.093388665246742, + "learning_rate": 4.405940594059406e-07, + "loss": -0.1195, + "step": 984 + }, + { + "epoch": 0.7878424315136973, + "grad_norm": 3.9779679788074565, + "learning_rate": 4.389438943894389e-07, + "loss": -0.0318, + "step": 985 + }, + { + "epoch": 0.7886422715456909, + "grad_norm": 5.681211915009173, + "learning_rate": 4.3729372937293727e-07, + "loss": -0.0715, + "step": 986 + }, + { + "epoch": 0.7894421115776845, + "grad_norm": 3.6961307708427875, + "learning_rate": 4.356435643564356e-07, + "loss": -0.1238, + "step": 987 + }, + { + "epoch": 0.790241951609678, + "grad_norm": 5.551081220452864, + "learning_rate": 4.3399339933993394e-07, + "loss": -0.0353, + "step": 988 + }, + { + "epoch": 0.7910417916416717, + "grad_norm": 4.1540822277204725, + "learning_rate": 4.3234323432343233e-07, + "loss": -0.1044, + "step": 989 + }, + { + "epoch": 0.7918416316736653, + "grad_norm": 5.690644788629102, + "learning_rate": 4.3069306930693066e-07, + "loss": -0.0824, + "step": 990 + }, + { + "epoch": 0.7926414717056589, + "grad_norm": 5.090727863244342, + "learning_rate": 4.29042904290429e-07, + "loss": -0.0275, + "step": 991 + }, + { + "epoch": 0.7934413117376524, + "grad_norm": 4.9291753971455705, + "learning_rate": 4.273927392739274e-07, + "loss": -0.0339, + "step": 992 + }, + { + "epoch": 0.794241151769646, + "grad_norm": 6.803186332065206, + "learning_rate": 4.257425742574257e-07, + "loss": -0.0342, + "step": 993 + }, + { + "epoch": 0.7950409918016397, + "grad_norm": 5.804775232239933, + "learning_rate": 4.240924092409241e-07, + "loss": -0.0866, + "step": 994 + }, + { + "epoch": 0.7958408318336333, + "grad_norm": 4.750722930505078, + "learning_rate": 4.2244224422442244e-07, + "loss": -0.1836, + "step": 995 + }, + { + "epoch": 0.7966406718656269, + "grad_norm": 6.716244820472627, + "learning_rate": 4.207920792079208e-07, + "loss": -0.1024, + "step": 996 + }, + { + "epoch": 0.7974405118976204, + "grad_norm": 3.722729260908509, + "learning_rate": 4.1914191419141916e-07, + "loss": -0.0993, + "step": 997 + }, + { + "epoch": 0.7982403519296141, + "grad_norm": 3.311724877987371, + "learning_rate": 4.174917491749175e-07, + "loss": -0.104, + "step": 998 + }, + { + "epoch": 0.7990401919616077, + "grad_norm": 5.521959854449801, + "learning_rate": 4.158415841584158e-07, + "loss": -0.1918, + "step": 999 + }, + { + "epoch": 0.7998400319936013, + "grad_norm": 5.022786099690795, + "learning_rate": 4.1419141914191417e-07, + "loss": -0.044, + "step": 1000 + }, + { + "epoch": 0.8006398720255948, + "grad_norm": 3.85176033301189, + "learning_rate": 4.125412541254125e-07, + "loss": -0.1389, + "step": 1001 + }, + { + "epoch": 0.8014397120575885, + "grad_norm": 3.850320054659798, + "learning_rate": 4.1089108910891084e-07, + "loss": -0.0621, + "step": 1002 + }, + { + "epoch": 0.8022395520895821, + "grad_norm": 5.125417738846334, + "learning_rate": 4.092409240924092e-07, + "loss": 0.024, + "step": 1003 + }, + { + "epoch": 0.8030393921215757, + "grad_norm": 5.550789325018317, + "learning_rate": 4.0759075907590756e-07, + "loss": -0.0596, + "step": 1004 + }, + { + "epoch": 0.8038392321535693, + "grad_norm": 2.987341879008025, + "learning_rate": 4.0594059405940595e-07, + "loss": 0.0174, + "step": 1005 + }, + { + "epoch": 0.8046390721855629, + "grad_norm": 3.840920872539408, + "learning_rate": 4.042904290429043e-07, + "loss": -0.1812, + "step": 1006 + }, + { + "epoch": 0.8054389122175565, + "grad_norm": 4.5904346361674495, + "learning_rate": 4.026402640264026e-07, + "loss": -0.153, + "step": 1007 + }, + { + "epoch": 0.8062387522495501, + "grad_norm": 4.716177536008103, + "learning_rate": 4.00990099009901e-07, + "loss": -0.0347, + "step": 1008 + }, + { + "epoch": 0.8070385922815437, + "grad_norm": 3.8819770758540106, + "learning_rate": 3.9933993399339934e-07, + "loss": -0.0491, + "step": 1009 + }, + { + "epoch": 0.8078384323135372, + "grad_norm": 4.337114713855018, + "learning_rate": 3.9768976897689767e-07, + "loss": -0.076, + "step": 1010 + }, + { + "epoch": 0.8086382723455309, + "grad_norm": 5.085958876323165, + "learning_rate": 3.9603960396039606e-07, + "loss": -0.1292, + "step": 1011 + }, + { + "epoch": 0.8094381123775245, + "grad_norm": 4.225428966888881, + "learning_rate": 3.943894389438944e-07, + "loss": -0.0349, + "step": 1012 + }, + { + "epoch": 0.8102379524095181, + "grad_norm": 3.132492848210798, + "learning_rate": 3.927392739273927e-07, + "loss": 0.0267, + "step": 1013 + }, + { + "epoch": 0.8110377924415118, + "grad_norm": 4.728427487496938, + "learning_rate": 3.9108910891089106e-07, + "loss": 0.0251, + "step": 1014 + }, + { + "epoch": 0.8118376324735053, + "grad_norm": 3.231169950869779, + "learning_rate": 3.894389438943894e-07, + "loss": -0.0048, + "step": 1015 + }, + { + "epoch": 0.8126374725054989, + "grad_norm": 5.4315342239443645, + "learning_rate": 3.8778877887788773e-07, + "loss": 0.0017, + "step": 1016 + }, + { + "epoch": 0.8134373125374925, + "grad_norm": 4.1495886173643015, + "learning_rate": 3.861386138613861e-07, + "loss": -0.0606, + "step": 1017 + }, + { + "epoch": 0.8142371525694861, + "grad_norm": 4.571814448385221, + "learning_rate": 3.8448844884488445e-07, + "loss": 0.0305, + "step": 1018 + }, + { + "epoch": 0.8150369926014797, + "grad_norm": 5.376749508040782, + "learning_rate": 3.8283828382838284e-07, + "loss": -0.0529, + "step": 1019 + }, + { + "epoch": 0.8158368326334733, + "grad_norm": 4.234529949886336, + "learning_rate": 3.811881188118812e-07, + "loss": -0.0884, + "step": 1020 + }, + { + "epoch": 0.8166366726654669, + "grad_norm": 5.103075536241735, + "learning_rate": 3.795379537953795e-07, + "loss": -0.0217, + "step": 1021 + }, + { + "epoch": 0.8174365126974605, + "grad_norm": 4.089650205096865, + "learning_rate": 3.778877887788779e-07, + "loss": -0.1912, + "step": 1022 + }, + { + "epoch": 0.8182363527294542, + "grad_norm": 4.315512669100621, + "learning_rate": 3.7623762376237623e-07, + "loss": -0.1096, + "step": 1023 + }, + { + "epoch": 0.8190361927614477, + "grad_norm": 5.730342268116768, + "learning_rate": 3.7458745874587457e-07, + "loss": -0.0613, + "step": 1024 + }, + { + "epoch": 0.8198360327934413, + "grad_norm": 4.482232090208855, + "learning_rate": 3.7293729372937295e-07, + "loss": -0.0025, + "step": 1025 + }, + { + "epoch": 0.8206358728254349, + "grad_norm": 4.242971330310737, + "learning_rate": 3.712871287128713e-07, + "loss": -0.0786, + "step": 1026 + }, + { + "epoch": 0.8214357128574286, + "grad_norm": 4.11563596499524, + "learning_rate": 3.696369636963696e-07, + "loss": -0.082, + "step": 1027 + }, + { + "epoch": 0.8222355528894221, + "grad_norm": 9.332422439085308, + "learning_rate": 3.6798679867986796e-07, + "loss": 0.0022, + "step": 1028 + }, + { + "epoch": 0.8230353929214157, + "grad_norm": 4.067536269426566, + "learning_rate": 3.663366336633663e-07, + "loss": -0.0361, + "step": 1029 + }, + { + "epoch": 0.8238352329534093, + "grad_norm": 6.044500944552922, + "learning_rate": 3.6468646864686463e-07, + "loss": 0.1496, + "step": 1030 + }, + { + "epoch": 0.824635072985403, + "grad_norm": 4.30205099295344, + "learning_rate": 3.63036303630363e-07, + "loss": -0.045, + "step": 1031 + }, + { + "epoch": 0.8254349130173965, + "grad_norm": 4.012282824952302, + "learning_rate": 3.6138613861386135e-07, + "loss": -0.1718, + "step": 1032 + }, + { + "epoch": 0.8262347530493901, + "grad_norm": 4.159359564634324, + "learning_rate": 3.5973597359735974e-07, + "loss": -0.0454, + "step": 1033 + }, + { + "epoch": 0.8270345930813837, + "grad_norm": 3.814387620245282, + "learning_rate": 3.5808580858085807e-07, + "loss": -0.0466, + "step": 1034 + }, + { + "epoch": 0.8278344331133773, + "grad_norm": 6.904216207979962, + "learning_rate": 3.564356435643564e-07, + "loss": -0.1518, + "step": 1035 + }, + { + "epoch": 0.828634273145371, + "grad_norm": 5.250865501632598, + "learning_rate": 3.547854785478548e-07, + "loss": -0.0118, + "step": 1036 + }, + { + "epoch": 0.8294341131773645, + "grad_norm": 5.846194811534598, + "learning_rate": 3.5313531353135313e-07, + "loss": 0.0351, + "step": 1037 + }, + { + "epoch": 0.8302339532093581, + "grad_norm": 4.460630924092106, + "learning_rate": 3.5148514851485146e-07, + "loss": -0.2443, + "step": 1038 + }, + { + "epoch": 0.8310337932413517, + "grad_norm": 4.1447157895119995, + "learning_rate": 3.4983498349834985e-07, + "loss": -0.0134, + "step": 1039 + }, + { + "epoch": 0.8318336332733454, + "grad_norm": 3.5252220895483517, + "learning_rate": 3.481848184818482e-07, + "loss": -0.0577, + "step": 1040 + }, + { + "epoch": 0.8326334733053389, + "grad_norm": 4.361902741479118, + "learning_rate": 3.465346534653465e-07, + "loss": 0.0104, + "step": 1041 + }, + { + "epoch": 0.8334333133373325, + "grad_norm": 4.461915879260683, + "learning_rate": 3.4488448844884485e-07, + "loss": -0.0156, + "step": 1042 + }, + { + "epoch": 0.8342331533693261, + "grad_norm": 4.834838939615413, + "learning_rate": 3.432343234323432e-07, + "loss": -0.0306, + "step": 1043 + }, + { + "epoch": 0.8350329934013198, + "grad_norm": 4.457492333115142, + "learning_rate": 3.415841584158416e-07, + "loss": -0.0158, + "step": 1044 + }, + { + "epoch": 0.8358328334333134, + "grad_norm": 6.418129824325349, + "learning_rate": 3.399339933993399e-07, + "loss": 0.023, + "step": 1045 + }, + { + "epoch": 0.8366326734653069, + "grad_norm": 5.631846859681406, + "learning_rate": 3.3828382838283824e-07, + "loss": -0.0842, + "step": 1046 + }, + { + "epoch": 0.8374325134973005, + "grad_norm": 4.893647743608584, + "learning_rate": 3.3663366336633663e-07, + "loss": 0.0169, + "step": 1047 + }, + { + "epoch": 0.8382323535292941, + "grad_norm": 3.327224537992695, + "learning_rate": 3.3498349834983497e-07, + "loss": 0.0234, + "step": 1048 + }, + { + "epoch": 0.8390321935612878, + "grad_norm": 3.334769765979331, + "learning_rate": 3.333333333333333e-07, + "loss": -0.0878, + "step": 1049 + }, + { + "epoch": 0.8398320335932813, + "grad_norm": 4.036389763362471, + "learning_rate": 3.316831683168317e-07, + "loss": -0.0402, + "step": 1050 + }, + { + "epoch": 0.8406318736252749, + "grad_norm": 3.653874204118681, + "learning_rate": 3.3003300330033e-07, + "loss": -0.0631, + "step": 1051 + }, + { + "epoch": 0.8414317136572685, + "grad_norm": 4.88359310166619, + "learning_rate": 3.2838283828382836e-07, + "loss": -0.0544, + "step": 1052 + }, + { + "epoch": 0.8422315536892622, + "grad_norm": 6.462333703622296, + "learning_rate": 3.2673267326732674e-07, + "loss": -0.0678, + "step": 1053 + }, + { + "epoch": 0.8430313937212558, + "grad_norm": 7.009021395345441, + "learning_rate": 3.250825082508251e-07, + "loss": 0.0099, + "step": 1054 + }, + { + "epoch": 0.8438312337532493, + "grad_norm": 4.2767377032125875, + "learning_rate": 3.234323432343234e-07, + "loss": 0.0297, + "step": 1055 + }, + { + "epoch": 0.8446310737852429, + "grad_norm": 9.79641552202019, + "learning_rate": 3.217821782178218e-07, + "loss": 0.0968, + "step": 1056 + }, + { + "epoch": 0.8454309138172366, + "grad_norm": 4.669605737417231, + "learning_rate": 3.201320132013201e-07, + "loss": 0.0097, + "step": 1057 + }, + { + "epoch": 0.8462307538492302, + "grad_norm": 3.1819061861624807, + "learning_rate": 3.1848184818481847e-07, + "loss": -0.0776, + "step": 1058 + }, + { + "epoch": 0.8470305938812237, + "grad_norm": 4.1774987880629695, + "learning_rate": 3.168316831683168e-07, + "loss": -0.0806, + "step": 1059 + }, + { + "epoch": 0.8478304339132173, + "grad_norm": 4.454569906758588, + "learning_rate": 3.1518151815181514e-07, + "loss": -0.0526, + "step": 1060 + }, + { + "epoch": 0.848630273945211, + "grad_norm": 3.527299815228531, + "learning_rate": 3.1353135313531353e-07, + "loss": -0.1065, + "step": 1061 + }, + { + "epoch": 0.8494301139772046, + "grad_norm": 4.481801002071373, + "learning_rate": 3.1188118811881186e-07, + "loss": -0.0022, + "step": 1062 + }, + { + "epoch": 0.8502299540091982, + "grad_norm": 4.3147168197624755, + "learning_rate": 3.102310231023102e-07, + "loss": -0.011, + "step": 1063 + }, + { + "epoch": 0.8510297940411917, + "grad_norm": 3.812340279657359, + "learning_rate": 3.085808580858086e-07, + "loss": -0.2093, + "step": 1064 + }, + { + "epoch": 0.8518296340731853, + "grad_norm": 5.097295358094463, + "learning_rate": 3.069306930693069e-07, + "loss": -0.0048, + "step": 1065 + }, + { + "epoch": 0.852629474105179, + "grad_norm": 5.088642578790314, + "learning_rate": 3.0528052805280525e-07, + "loss": -0.1315, + "step": 1066 + }, + { + "epoch": 0.8534293141371726, + "grad_norm": 4.052723785754238, + "learning_rate": 3.0363036303630364e-07, + "loss": -0.1133, + "step": 1067 + }, + { + "epoch": 0.8542291541691661, + "grad_norm": 5.193579179546016, + "learning_rate": 3.01980198019802e-07, + "loss": -0.0787, + "step": 1068 + }, + { + "epoch": 0.8550289942011597, + "grad_norm": 3.030054387526671, + "learning_rate": 3.0033003300330036e-07, + "loss": -0.0677, + "step": 1069 + }, + { + "epoch": 0.8558288342331534, + "grad_norm": 9.135373007054904, + "learning_rate": 2.986798679867987e-07, + "loss": -0.1171, + "step": 1070 + }, + { + "epoch": 0.856628674265147, + "grad_norm": 3.3785319743939013, + "learning_rate": 2.97029702970297e-07, + "loss": -0.0803, + "step": 1071 + }, + { + "epoch": 0.8574285142971406, + "grad_norm": 3.9077882713350762, + "learning_rate": 2.9537953795379537e-07, + "loss": -0.0513, + "step": 1072 + }, + { + "epoch": 0.8582283543291341, + "grad_norm": 4.038560493011451, + "learning_rate": 2.937293729372937e-07, + "loss": -0.0518, + "step": 1073 + }, + { + "epoch": 0.8590281943611278, + "grad_norm": 3.800775478942818, + "learning_rate": 2.9207920792079203e-07, + "loss": -0.1613, + "step": 1074 + }, + { + "epoch": 0.8598280343931214, + "grad_norm": 5.948071515082444, + "learning_rate": 2.904290429042904e-07, + "loss": -0.0834, + "step": 1075 + }, + { + "epoch": 0.860627874425115, + "grad_norm": 4.190899280558429, + "learning_rate": 2.8877887788778876e-07, + "loss": -0.103, + "step": 1076 + }, + { + "epoch": 0.8614277144571085, + "grad_norm": 4.910742773988997, + "learning_rate": 2.871287128712871e-07, + "loss": -0.0388, + "step": 1077 + }, + { + "epoch": 0.8622275544891022, + "grad_norm": 7.707730353888358, + "learning_rate": 2.854785478547855e-07, + "loss": -0.1477, + "step": 1078 + }, + { + "epoch": 0.8630273945210958, + "grad_norm": 4.297638426499125, + "learning_rate": 2.838283828382838e-07, + "loss": 0.0134, + "step": 1079 + }, + { + "epoch": 0.8638272345530894, + "grad_norm": 6.145968919540754, + "learning_rate": 2.8217821782178215e-07, + "loss": 0.0021, + "step": 1080 + }, + { + "epoch": 0.864627074585083, + "grad_norm": 3.7854604687393296, + "learning_rate": 2.8052805280528054e-07, + "loss": -0.1547, + "step": 1081 + }, + { + "epoch": 0.8654269146170765, + "grad_norm": 3.991674461171312, + "learning_rate": 2.7887788778877887e-07, + "loss": -0.0668, + "step": 1082 + }, + { + "epoch": 0.8662267546490702, + "grad_norm": 4.795940892228349, + "learning_rate": 2.7722772277227726e-07, + "loss": 0.0146, + "step": 1083 + }, + { + "epoch": 0.8670265946810638, + "grad_norm": 4.944098976058084, + "learning_rate": 2.755775577557756e-07, + "loss": -0.0523, + "step": 1084 + }, + { + "epoch": 0.8678264347130574, + "grad_norm": 5.005504629817167, + "learning_rate": 2.7392739273927387e-07, + "loss": 0.0065, + "step": 1085 + }, + { + "epoch": 0.8686262747450509, + "grad_norm": 4.33635141957305, + "learning_rate": 2.7227722772277226e-07, + "loss": 0.0051, + "step": 1086 + }, + { + "epoch": 0.8694261147770446, + "grad_norm": 6.827221005304679, + "learning_rate": 2.706270627062706e-07, + "loss": -0.1068, + "step": 1087 + }, + { + "epoch": 0.8702259548090382, + "grad_norm": 3.763012118037954, + "learning_rate": 2.6897689768976893e-07, + "loss": 0.0675, + "step": 1088 + }, + { + "epoch": 0.8710257948410318, + "grad_norm": 3.476547412625268, + "learning_rate": 2.673267326732673e-07, + "loss": -0.1067, + "step": 1089 + }, + { + "epoch": 0.8718256348730254, + "grad_norm": 3.622631746348685, + "learning_rate": 2.6567656765676565e-07, + "loss": -0.0829, + "step": 1090 + }, + { + "epoch": 0.872625474905019, + "grad_norm": 4.835180762619133, + "learning_rate": 2.64026402640264e-07, + "loss": -0.0761, + "step": 1091 + }, + { + "epoch": 0.8734253149370126, + "grad_norm": 4.188641976033946, + "learning_rate": 2.623762376237624e-07, + "loss": -0.114, + "step": 1092 + }, + { + "epoch": 0.8742251549690062, + "grad_norm": 4.833712113544916, + "learning_rate": 2.607260726072607e-07, + "loss": -0.1158, + "step": 1093 + }, + { + "epoch": 0.8750249950009998, + "grad_norm": 3.6494839656219935, + "learning_rate": 2.590759075907591e-07, + "loss": -0.048, + "step": 1094 + }, + { + "epoch": 0.8758248350329934, + "grad_norm": 3.9306302162750857, + "learning_rate": 2.5742574257425743e-07, + "loss": -0.0928, + "step": 1095 + }, + { + "epoch": 0.876624675064987, + "grad_norm": 3.7010390446563517, + "learning_rate": 2.5577557755775576e-07, + "loss": 0.0242, + "step": 1096 + }, + { + "epoch": 0.8774245150969806, + "grad_norm": 3.641273539002507, + "learning_rate": 2.5412541254125415e-07, + "loss": -0.1014, + "step": 1097 + }, + { + "epoch": 0.8782243551289742, + "grad_norm": 4.233409363271656, + "learning_rate": 2.524752475247525e-07, + "loss": -0.0404, + "step": 1098 + }, + { + "epoch": 0.8790241951609679, + "grad_norm": 4.188973466495453, + "learning_rate": 2.508250825082508e-07, + "loss": -0.0684, + "step": 1099 + }, + { + "epoch": 0.8798240351929614, + "grad_norm": 5.017584397195866, + "learning_rate": 2.4917491749174916e-07, + "loss": -0.0368, + "step": 1100 + }, + { + "epoch": 0.880623875224955, + "grad_norm": 3.9510700176873566, + "learning_rate": 2.475247524752475e-07, + "loss": -0.1375, + "step": 1101 + }, + { + "epoch": 0.8814237152569486, + "grad_norm": 5.84233851394486, + "learning_rate": 2.458745874587459e-07, + "loss": -0.1067, + "step": 1102 + }, + { + "epoch": 0.8822235552889423, + "grad_norm": 5.406949565806744, + "learning_rate": 2.442244224422442e-07, + "loss": -0.0962, + "step": 1103 + }, + { + "epoch": 0.8830233953209358, + "grad_norm": 5.315262379239265, + "learning_rate": 2.4257425742574255e-07, + "loss": -0.066, + "step": 1104 + }, + { + "epoch": 0.8838232353529294, + "grad_norm": 4.836530658291514, + "learning_rate": 2.4092409240924093e-07, + "loss": -0.0646, + "step": 1105 + }, + { + "epoch": 0.884623075384923, + "grad_norm": 3.293455547222145, + "learning_rate": 2.3927392739273927e-07, + "loss": -0.1701, + "step": 1106 + }, + { + "epoch": 0.8854229154169166, + "grad_norm": 4.709525078481242, + "learning_rate": 2.376237623762376e-07, + "loss": -0.0504, + "step": 1107 + }, + { + "epoch": 0.8862227554489103, + "grad_norm": 4.295657231556702, + "learning_rate": 2.3597359735973596e-07, + "loss": -0.1419, + "step": 1108 + }, + { + "epoch": 0.8870225954809038, + "grad_norm": 3.682029286721376, + "learning_rate": 2.3432343234323433e-07, + "loss": -0.0927, + "step": 1109 + }, + { + "epoch": 0.8878224355128974, + "grad_norm": 7.500929711256007, + "learning_rate": 2.3267326732673269e-07, + "loss": -0.001, + "step": 1110 + }, + { + "epoch": 0.888622275544891, + "grad_norm": 3.370577280876358, + "learning_rate": 2.31023102310231e-07, + "loss": -0.0219, + "step": 1111 + }, + { + "epoch": 0.8894221155768847, + "grad_norm": 4.603247549338215, + "learning_rate": 2.2937293729372936e-07, + "loss": -0.0407, + "step": 1112 + }, + { + "epoch": 0.8902219556088782, + "grad_norm": 3.033292259385364, + "learning_rate": 2.2772277227722772e-07, + "loss": -0.0612, + "step": 1113 + }, + { + "epoch": 0.8910217956408718, + "grad_norm": 5.654397566299044, + "learning_rate": 2.2607260726072605e-07, + "loss": -0.0081, + "step": 1114 + }, + { + "epoch": 0.8918216356728654, + "grad_norm": 4.7463861069291235, + "learning_rate": 2.244224422442244e-07, + "loss": 0.0324, + "step": 1115 + }, + { + "epoch": 0.8926214757048591, + "grad_norm": 3.6795751469461697, + "learning_rate": 2.2277227722772277e-07, + "loss": -0.082, + "step": 1116 + }, + { + "epoch": 0.8934213157368527, + "grad_norm": 6.7304826361036385, + "learning_rate": 2.2112211221122113e-07, + "loss": -0.1421, + "step": 1117 + }, + { + "epoch": 0.8942211557688462, + "grad_norm": 4.6084848352584, + "learning_rate": 2.1947194719471944e-07, + "loss": -0.0154, + "step": 1118 + }, + { + "epoch": 0.8950209958008398, + "grad_norm": 3.3241186250160673, + "learning_rate": 2.178217821782178e-07, + "loss": -0.0463, + "step": 1119 + }, + { + "epoch": 0.8958208358328335, + "grad_norm": 4.285630706698749, + "learning_rate": 2.1617161716171616e-07, + "loss": -0.0102, + "step": 1120 + }, + { + "epoch": 0.8966206758648271, + "grad_norm": 3.7442923024099266, + "learning_rate": 2.145214521452145e-07, + "loss": -0.0737, + "step": 1121 + }, + { + "epoch": 0.8974205158968206, + "grad_norm": 4.067618329578387, + "learning_rate": 2.1287128712871286e-07, + "loss": -0.0694, + "step": 1122 + }, + { + "epoch": 0.8982203559288142, + "grad_norm": 4.012101702069505, + "learning_rate": 2.1122112211221122e-07, + "loss": 0.013, + "step": 1123 + }, + { + "epoch": 0.8990201959608078, + "grad_norm": 3.927004556177739, + "learning_rate": 2.0957095709570958e-07, + "loss": 0.0032, + "step": 1124 + }, + { + "epoch": 0.8998200359928015, + "grad_norm": 4.153485289274271, + "learning_rate": 2.079207920792079e-07, + "loss": 0.0691, + "step": 1125 + }, + { + "epoch": 0.9006198760247951, + "grad_norm": 3.0852405388784936, + "learning_rate": 2.0627062706270625e-07, + "loss": -0.152, + "step": 1126 + }, + { + "epoch": 0.9014197160567886, + "grad_norm": 5.668232907029322, + "learning_rate": 2.046204620462046e-07, + "loss": -0.0051, + "step": 1127 + }, + { + "epoch": 0.9022195560887822, + "grad_norm": 4.016353081034237, + "learning_rate": 2.0297029702970297e-07, + "loss": 0.004, + "step": 1128 + }, + { + "epoch": 0.9030193961207759, + "grad_norm": 5.222331571223233, + "learning_rate": 2.013201320132013e-07, + "loss": -0.0398, + "step": 1129 + }, + { + "epoch": 0.9038192361527695, + "grad_norm": 6.050674073050146, + "learning_rate": 1.9966996699669967e-07, + "loss": -0.0201, + "step": 1130 + }, + { + "epoch": 0.904619076184763, + "grad_norm": 3.6129712440915336, + "learning_rate": 1.9801980198019803e-07, + "loss": -0.0873, + "step": 1131 + }, + { + "epoch": 0.9054189162167566, + "grad_norm": 3.9706646963831527, + "learning_rate": 1.9636963696369634e-07, + "loss": -0.0425, + "step": 1132 + }, + { + "epoch": 0.9062187562487503, + "grad_norm": 3.575800911347383, + "learning_rate": 1.947194719471947e-07, + "loss": -0.0728, + "step": 1133 + }, + { + "epoch": 0.9070185962807439, + "grad_norm": 4.9127144958415165, + "learning_rate": 1.9306930693069306e-07, + "loss": -0.1456, + "step": 1134 + }, + { + "epoch": 0.9078184363127374, + "grad_norm": 4.630716245217967, + "learning_rate": 1.9141914191419142e-07, + "loss": -0.1385, + "step": 1135 + }, + { + "epoch": 0.908618276344731, + "grad_norm": 3.7522928222148413, + "learning_rate": 1.8976897689768976e-07, + "loss": -0.0473, + "step": 1136 + }, + { + "epoch": 0.9094181163767247, + "grad_norm": 3.702942791411621, + "learning_rate": 1.8811881188118812e-07, + "loss": -0.0115, + "step": 1137 + }, + { + "epoch": 0.9102179564087183, + "grad_norm": 4.215778349737591, + "learning_rate": 1.8646864686468648e-07, + "loss": 0.0369, + "step": 1138 + }, + { + "epoch": 0.9110177964407119, + "grad_norm": 4.312725558809124, + "learning_rate": 1.848184818481848e-07, + "loss": -0.0821, + "step": 1139 + }, + { + "epoch": 0.9118176364727054, + "grad_norm": 4.321449833697151, + "learning_rate": 1.8316831683168315e-07, + "loss": -0.1315, + "step": 1140 + }, + { + "epoch": 0.912617476504699, + "grad_norm": 3.2649224360601234, + "learning_rate": 1.815181518151815e-07, + "loss": -0.1925, + "step": 1141 + }, + { + "epoch": 0.9134173165366927, + "grad_norm": 3.760790622253671, + "learning_rate": 1.7986798679867987e-07, + "loss": -0.0013, + "step": 1142 + }, + { + "epoch": 0.9142171565686863, + "grad_norm": 4.157207407424451, + "learning_rate": 1.782178217821782e-07, + "loss": -0.0581, + "step": 1143 + }, + { + "epoch": 0.9150169966006798, + "grad_norm": 5.763265152647982, + "learning_rate": 1.7656765676567656e-07, + "loss": -0.0789, + "step": 1144 + }, + { + "epoch": 0.9158168366326734, + "grad_norm": 5.000391563342132, + "learning_rate": 1.7491749174917492e-07, + "loss": 0.017, + "step": 1145 + }, + { + "epoch": 0.9166166766646671, + "grad_norm": 5.503835207911807, + "learning_rate": 1.7326732673267326e-07, + "loss": -0.0498, + "step": 1146 + }, + { + "epoch": 0.9174165166966607, + "grad_norm": 4.892439922885906, + "learning_rate": 1.716171617161716e-07, + "loss": -0.0188, + "step": 1147 + }, + { + "epoch": 0.9182163567286543, + "grad_norm": 4.1251081972670915, + "learning_rate": 1.6996699669966995e-07, + "loss": -0.1601, + "step": 1148 + }, + { + "epoch": 0.9190161967606478, + "grad_norm": 5.234413078511215, + "learning_rate": 1.6831683168316832e-07, + "loss": -0.0751, + "step": 1149 + }, + { + "epoch": 0.9198160367926415, + "grad_norm": 5.21079968866447, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0173, + "step": 1150 + }, + { + "epoch": 0.9206158768246351, + "grad_norm": 4.287315805109782, + "learning_rate": 1.65016501650165e-07, + "loss": -0.0245, + "step": 1151 + }, + { + "epoch": 0.9214157168566287, + "grad_norm": 2.89403897319458, + "learning_rate": 1.6336633663366337e-07, + "loss": -0.1234, + "step": 1152 + }, + { + "epoch": 0.9222155568886222, + "grad_norm": 5.421448264794362, + "learning_rate": 1.617161716171617e-07, + "loss": -0.0738, + "step": 1153 + }, + { + "epoch": 0.9230153969206158, + "grad_norm": 5.340841829330172, + "learning_rate": 1.6006600660066004e-07, + "loss": -0.0978, + "step": 1154 + }, + { + "epoch": 0.9238152369526095, + "grad_norm": 5.139659477858416, + "learning_rate": 1.584158415841584e-07, + "loss": -0.0053, + "step": 1155 + }, + { + "epoch": 0.9246150769846031, + "grad_norm": 3.5692046134784676, + "learning_rate": 1.5676567656765676e-07, + "loss": -0.078, + "step": 1156 + }, + { + "epoch": 0.9254149170165967, + "grad_norm": 3.6214949664373624, + "learning_rate": 1.551155115511551e-07, + "loss": -0.1948, + "step": 1157 + }, + { + "epoch": 0.9262147570485902, + "grad_norm": 3.6462100170455516, + "learning_rate": 1.5346534653465346e-07, + "loss": -0.1466, + "step": 1158 + }, + { + "epoch": 0.9270145970805839, + "grad_norm": 4.042490583513813, + "learning_rate": 1.5181518151815182e-07, + "loss": -0.0494, + "step": 1159 + }, + { + "epoch": 0.9278144371125775, + "grad_norm": 3.570518304081052, + "learning_rate": 1.5016501650165018e-07, + "loss": -0.0203, + "step": 1160 + }, + { + "epoch": 0.9286142771445711, + "grad_norm": 4.860609480391736, + "learning_rate": 1.485148514851485e-07, + "loss": -0.0608, + "step": 1161 + }, + { + "epoch": 0.9294141171765646, + "grad_norm": 3.6503782154768336, + "learning_rate": 1.4686468646864685e-07, + "loss": -0.1209, + "step": 1162 + }, + { + "epoch": 0.9302139572085583, + "grad_norm": 5.2679977730736915, + "learning_rate": 1.452145214521452e-07, + "loss": -0.2231, + "step": 1163 + }, + { + "epoch": 0.9310137972405519, + "grad_norm": 4.077205627405786, + "learning_rate": 1.4356435643564355e-07, + "loss": -0.0918, + "step": 1164 + }, + { + "epoch": 0.9318136372725455, + "grad_norm": 4.7895431899614245, + "learning_rate": 1.419141914191419e-07, + "loss": -0.0527, + "step": 1165 + }, + { + "epoch": 0.9326134773045391, + "grad_norm": 5.279018314307402, + "learning_rate": 1.4026402640264027e-07, + "loss": 0.0147, + "step": 1166 + }, + { + "epoch": 0.9334133173365327, + "grad_norm": 4.336258829943017, + "learning_rate": 1.3861386138613863e-07, + "loss": -0.0938, + "step": 1167 + }, + { + "epoch": 0.9342131573685263, + "grad_norm": 4.54870086400182, + "learning_rate": 1.3696369636963694e-07, + "loss": -0.1337, + "step": 1168 + }, + { + "epoch": 0.9350129974005199, + "grad_norm": 4.5996184978678105, + "learning_rate": 1.353135313531353e-07, + "loss": -0.0564, + "step": 1169 + }, + { + "epoch": 0.9358128374325135, + "grad_norm": 3.580175353715861, + "learning_rate": 1.3366336633663366e-07, + "loss": -0.0917, + "step": 1170 + }, + { + "epoch": 0.936612677464507, + "grad_norm": 4.236520821746979, + "learning_rate": 1.32013201320132e-07, + "loss": -0.0952, + "step": 1171 + }, + { + "epoch": 0.9374125174965007, + "grad_norm": 4.45059914145225, + "learning_rate": 1.3036303630363035e-07, + "loss": -0.0501, + "step": 1172 + }, + { + "epoch": 0.9382123575284943, + "grad_norm": 3.895112295625407, + "learning_rate": 1.2871287128712872e-07, + "loss": -0.0248, + "step": 1173 + }, + { + "epoch": 0.9390121975604879, + "grad_norm": 3.446049410323525, + "learning_rate": 1.2706270627062708e-07, + "loss": -0.0609, + "step": 1174 + }, + { + "epoch": 0.9398120375924816, + "grad_norm": 4.440477903266653, + "learning_rate": 1.254125412541254e-07, + "loss": -0.1183, + "step": 1175 + }, + { + "epoch": 0.9406118776244751, + "grad_norm": 8.879875026201496, + "learning_rate": 1.2376237623762375e-07, + "loss": 0.0741, + "step": 1176 + }, + { + "epoch": 0.9414117176564687, + "grad_norm": 3.251289551995566, + "learning_rate": 1.221122112211221e-07, + "loss": -0.2012, + "step": 1177 + }, + { + "epoch": 0.9422115576884623, + "grad_norm": 5.090866489665643, + "learning_rate": 1.2046204620462047e-07, + "loss": -0.0937, + "step": 1178 + }, + { + "epoch": 0.943011397720456, + "grad_norm": 5.634824692335556, + "learning_rate": 1.188118811881188e-07, + "loss": -0.0674, + "step": 1179 + }, + { + "epoch": 0.9438112377524495, + "grad_norm": 7.237946586468722, + "learning_rate": 1.1716171617161716e-07, + "loss": 0.0063, + "step": 1180 + }, + { + "epoch": 0.9446110777844431, + "grad_norm": 5.322093424679851, + "learning_rate": 1.155115511551155e-07, + "loss": 0.0182, + "step": 1181 + }, + { + "epoch": 0.9454109178164367, + "grad_norm": 5.685444219272491, + "learning_rate": 1.1386138613861386e-07, + "loss": -0.0409, + "step": 1182 + }, + { + "epoch": 0.9462107578484303, + "grad_norm": 4.89343356574685, + "learning_rate": 1.122112211221122e-07, + "loss": -0.1669, + "step": 1183 + }, + { + "epoch": 0.947010597880424, + "grad_norm": 2.976462375028197, + "learning_rate": 1.1056105610561057e-07, + "loss": -0.1014, + "step": 1184 + }, + { + "epoch": 0.9478104379124175, + "grad_norm": 5.789816947507454, + "learning_rate": 1.089108910891089e-07, + "loss": -0.0268, + "step": 1185 + }, + { + "epoch": 0.9486102779444111, + "grad_norm": 4.547707602704605, + "learning_rate": 1.0726072607260725e-07, + "loss": -0.0252, + "step": 1186 + }, + { + "epoch": 0.9494101179764047, + "grad_norm": 3.669232395567316, + "learning_rate": 1.0561056105610561e-07, + "loss": -0.0721, + "step": 1187 + }, + { + "epoch": 0.9502099580083984, + "grad_norm": 4.725560206551925, + "learning_rate": 1.0396039603960394e-07, + "loss": -0.0374, + "step": 1188 + }, + { + "epoch": 0.9510097980403919, + "grad_norm": 5.642201380367948, + "learning_rate": 1.023102310231023e-07, + "loss": -0.0524, + "step": 1189 + }, + { + "epoch": 0.9518096380723855, + "grad_norm": 5.1228456388588555, + "learning_rate": 1.0066006600660065e-07, + "loss": -0.0122, + "step": 1190 + }, + { + "epoch": 0.9526094781043791, + "grad_norm": 3.5927188886776995, + "learning_rate": 9.900990099009901e-08, + "loss": -0.1424, + "step": 1191 + }, + { + "epoch": 0.9534093181363728, + "grad_norm": 3.815501173104961, + "learning_rate": 9.735973597359735e-08, + "loss": -0.1578, + "step": 1192 + }, + { + "epoch": 0.9542091581683664, + "grad_norm": 3.474045980664194, + "learning_rate": 9.570957095709571e-08, + "loss": -0.0597, + "step": 1193 + }, + { + "epoch": 0.9550089982003599, + "grad_norm": 4.782969446108987, + "learning_rate": 9.405940594059406e-08, + "loss": -0.0957, + "step": 1194 + }, + { + "epoch": 0.9558088382323535, + "grad_norm": 4.74265001734386, + "learning_rate": 9.24092409240924e-08, + "loss": -0.0267, + "step": 1195 + }, + { + "epoch": 0.9566086782643471, + "grad_norm": 3.661437568587583, + "learning_rate": 9.075907590759075e-08, + "loss": -0.1227, + "step": 1196 + }, + { + "epoch": 0.9574085182963408, + "grad_norm": 4.279576775134146, + "learning_rate": 8.91089108910891e-08, + "loss": 0.0111, + "step": 1197 + }, + { + "epoch": 0.9582083583283343, + "grad_norm": 4.169597103863264, + "learning_rate": 8.745874587458746e-08, + "loss": 0.0377, + "step": 1198 + }, + { + "epoch": 0.9590081983603279, + "grad_norm": 6.207479567569039, + "learning_rate": 8.58085808580858e-08, + "loss": 0.0014, + "step": 1199 + }, + { + "epoch": 0.9598080383923215, + "grad_norm": 3.5423075974898586, + "learning_rate": 8.415841584158416e-08, + "loss": -0.1664, + "step": 1200 + }, + { + "epoch": 0.9606078784243152, + "grad_norm": 3.6165897542858887, + "learning_rate": 8.25082508250825e-08, + "loss": -0.0884, + "step": 1201 + }, + { + "epoch": 0.9614077184563088, + "grad_norm": 4.517734549618362, + "learning_rate": 8.085808580858085e-08, + "loss": -0.1076, + "step": 1202 + }, + { + "epoch": 0.9622075584883023, + "grad_norm": 4.307866136219069, + "learning_rate": 7.92079207920792e-08, + "loss": -0.115, + "step": 1203 + }, + { + "epoch": 0.9630073985202959, + "grad_norm": 4.922270840667124, + "learning_rate": 7.755775577557755e-08, + "loss": -0.055, + "step": 1204 + }, + { + "epoch": 0.9638072385522896, + "grad_norm": 4.385179502669176, + "learning_rate": 7.590759075907591e-08, + "loss": -0.1254, + "step": 1205 + }, + { + "epoch": 0.9646070785842832, + "grad_norm": 3.9178495988004443, + "learning_rate": 7.425742574257424e-08, + "loss": -0.1016, + "step": 1206 + }, + { + "epoch": 0.9654069186162767, + "grad_norm": 3.3585705170911515, + "learning_rate": 7.26072607260726e-08, + "loss": -0.0209, + "step": 1207 + }, + { + "epoch": 0.9662067586482703, + "grad_norm": 3.815494549495067, + "learning_rate": 7.095709570957095e-08, + "loss": -0.0635, + "step": 1208 + }, + { + "epoch": 0.967006598680264, + "grad_norm": 8.403899931437618, + "learning_rate": 6.930693069306931e-08, + "loss": -0.0465, + "step": 1209 + }, + { + "epoch": 0.9678064387122576, + "grad_norm": 4.29568964473992, + "learning_rate": 6.765676567656765e-08, + "loss": -0.0332, + "step": 1210 + }, + { + "epoch": 0.9686062787442512, + "grad_norm": 3.4280415193587235, + "learning_rate": 6.6006600660066e-08, + "loss": -0.0803, + "step": 1211 + }, + { + "epoch": 0.9694061187762447, + "grad_norm": 4.6120423955763625, + "learning_rate": 6.435643564356436e-08, + "loss": -0.0619, + "step": 1212 + }, + { + "epoch": 0.9702059588082383, + "grad_norm": 6.424876752925553, + "learning_rate": 6.27062706270627e-08, + "loss": -0.1442, + "step": 1213 + }, + { + "epoch": 0.971005798840232, + "grad_norm": 5.485217081397391, + "learning_rate": 6.105610561056105e-08, + "loss": -0.0939, + "step": 1214 + }, + { + "epoch": 0.9718056388722256, + "grad_norm": 6.774111317136949, + "learning_rate": 5.94059405940594e-08, + "loss": -0.0439, + "step": 1215 + }, + { + "epoch": 0.9726054789042191, + "grad_norm": 5.096515115630733, + "learning_rate": 5.775577557755775e-08, + "loss": -0.0734, + "step": 1216 + }, + { + "epoch": 0.9734053189362127, + "grad_norm": 4.159248360440637, + "learning_rate": 5.61056105610561e-08, + "loss": -0.0121, + "step": 1217 + }, + { + "epoch": 0.9742051589682064, + "grad_norm": 7.334455086425638, + "learning_rate": 5.445544554455445e-08, + "loss": -0.1328, + "step": 1218 + }, + { + "epoch": 0.9750049990002, + "grad_norm": 5.126425754126674, + "learning_rate": 5.2805280528052805e-08, + "loss": -0.123, + "step": 1219 + }, + { + "epoch": 0.9758048390321936, + "grad_norm": 4.215981033934002, + "learning_rate": 5.115511551155115e-08, + "loss": -0.0326, + "step": 1220 + }, + { + "epoch": 0.9766046790641871, + "grad_norm": 5.727858996419284, + "learning_rate": 4.950495049504951e-08, + "loss": -0.078, + "step": 1221 + }, + { + "epoch": 0.9774045190961808, + "grad_norm": 4.2278775409875475, + "learning_rate": 4.7854785478547855e-08, + "loss": -0.1444, + "step": 1222 + }, + { + "epoch": 0.9782043591281744, + "grad_norm": 5.1475401401668455, + "learning_rate": 4.62046204620462e-08, + "loss": -0.0758, + "step": 1223 + }, + { + "epoch": 0.979004199160168, + "grad_norm": 4.979540737683743, + "learning_rate": 4.455445544554455e-08, + "loss": -0.0669, + "step": 1224 + }, + { + "epoch": 0.9798040391921615, + "grad_norm": 3.4515888370253385, + "learning_rate": 4.29042904290429e-08, + "loss": -0.0937, + "step": 1225 + }, + { + "epoch": 0.9806038792241552, + "grad_norm": 6.288776500193402, + "learning_rate": 4.125412541254125e-08, + "loss": 0.1102, + "step": 1226 + }, + { + "epoch": 0.9814037192561488, + "grad_norm": 5.48415137375722, + "learning_rate": 3.96039603960396e-08, + "loss": -0.0964, + "step": 1227 + }, + { + "epoch": 0.9822035592881424, + "grad_norm": 4.355868334038742, + "learning_rate": 3.7953795379537955e-08, + "loss": -0.0328, + "step": 1228 + }, + { + "epoch": 0.9830033993201359, + "grad_norm": 4.721491339476331, + "learning_rate": 3.63036303630363e-08, + "loss": -0.0365, + "step": 1229 + }, + { + "epoch": 0.9838032393521295, + "grad_norm": 6.339518521675752, + "learning_rate": 3.465346534653466e-08, + "loss": -0.0573, + "step": 1230 + }, + { + "epoch": 0.9846030793841232, + "grad_norm": 4.434922235230731, + "learning_rate": 3.3003300330033e-08, + "loss": -0.1461, + "step": 1231 + }, + { + "epoch": 0.9854029194161168, + "grad_norm": 5.310987908083999, + "learning_rate": 3.135313531353135e-08, + "loss": -0.0746, + "step": 1232 + }, + { + "epoch": 0.9862027594481104, + "grad_norm": 5.686966755780067, + "learning_rate": 2.97029702970297e-08, + "loss": -0.0133, + "step": 1233 + }, + { + "epoch": 0.9870025994801039, + "grad_norm": 4.108463781012627, + "learning_rate": 2.805280528052805e-08, + "loss": 0.0749, + "step": 1234 + }, + { + "epoch": 0.9878024395120976, + "grad_norm": 3.6672659008615764, + "learning_rate": 2.6402640264026403e-08, + "loss": -0.1047, + "step": 1235 + }, + { + "epoch": 0.9886022795440912, + "grad_norm": 3.9834854628962146, + "learning_rate": 2.4752475247524754e-08, + "loss": -0.0921, + "step": 1236 + }, + { + "epoch": 0.9894021195760848, + "grad_norm": 3.5139198067318054, + "learning_rate": 2.31023102310231e-08, + "loss": -0.1611, + "step": 1237 + }, + { + "epoch": 0.9902019596080783, + "grad_norm": 4.1541924223616356, + "learning_rate": 2.145214521452145e-08, + "loss": -0.0147, + "step": 1238 + }, + { + "epoch": 0.991001799640072, + "grad_norm": 6.337275466101498, + "learning_rate": 1.98019801980198e-08, + "loss": -0.0767, + "step": 1239 + }, + { + "epoch": 0.9918016396720656, + "grad_norm": 82.20527671342789, + "learning_rate": 1.815181518151815e-08, + "loss": -0.0431, + "step": 1240 + }, + { + "epoch": 0.9926014797040592, + "grad_norm": 2.6922514851959494, + "learning_rate": 1.65016501650165e-08, + "loss": -0.0195, + "step": 1241 + }, + { + "epoch": 0.9934013197360528, + "grad_norm": 3.6925978256211747, + "learning_rate": 1.485148514851485e-08, + "loss": -0.1541, + "step": 1242 + }, + { + "epoch": 0.9942011597680463, + "grad_norm": 4.79635865852686, + "learning_rate": 1.3201320132013201e-08, + "loss": -0.1104, + "step": 1243 + }, + { + "epoch": 0.99500099980004, + "grad_norm": 4.327272847702339, + "learning_rate": 1.155115511551155e-08, + "loss": -0.0807, + "step": 1244 + }, + { + "epoch": 0.9958008398320336, + "grad_norm": 4.256644720520306, + "learning_rate": 9.9009900990099e-09, + "loss": -0.0278, + "step": 1245 + }, + { + "epoch": 0.9966006798640272, + "grad_norm": 3.586841344680467, + "learning_rate": 8.25082508250825e-09, + "loss": 0.0033, + "step": 1246 + }, + { + "epoch": 0.9974005198960207, + "grad_norm": 4.084312918321821, + "learning_rate": 6.600660066006601e-09, + "loss": -0.0053, + "step": 1247 + }, + { + "epoch": 0.9982003599280144, + "grad_norm": 3.983696015790867, + "learning_rate": 4.95049504950495e-09, + "loss": -0.0229, + "step": 1248 + }, + { + "epoch": 0.999000199960008, + "grad_norm": 4.1770415014644104, + "learning_rate": 3.3003300330033003e-09, + "loss": -0.1898, + "step": 1249 + }, + { + "epoch": 0.9998000399920016, + "grad_norm": 4.645732496504415, + "learning_rate": 1.6501650165016502e-09, + "loss": 0.0221, + "step": 1250 + }, + { + "epoch": 0.9998000399920016, + "step": 1250, + "total_flos": 208730583859200.0, + "train_loss": -0.05900815903544426, + "train_runtime": 14539.7464, + "train_samples_per_second": 11.005, + "train_steps_per_second": 0.086 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 208730583859200.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}