diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,29533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 4216, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004743833017077799, + "grad_norm": 12.117147445678711, + "learning_rate": 1.5748031496062994e-07, + "loss": 0.6555, + "step": 1 + }, + { + "epoch": 0.0009487666034155598, + "grad_norm": 12.216944694519043, + "learning_rate": 3.149606299212599e-07, + "loss": 0.7042, + "step": 2 + }, + { + "epoch": 0.0014231499051233396, + "grad_norm": 11.745461463928223, + "learning_rate": 4.724409448818898e-07, + "loss": 0.7974, + "step": 3 + }, + { + "epoch": 0.0018975332068311196, + "grad_norm": 14.210430145263672, + "learning_rate": 6.299212598425198e-07, + "loss": 0.6868, + "step": 4 + }, + { + "epoch": 0.0023719165085388993, + "grad_norm": 13.679463386535645, + "learning_rate": 7.874015748031496e-07, + "loss": 0.7468, + "step": 5 + }, + { + "epoch": 0.0028462998102466793, + "grad_norm": 12.984026908874512, + "learning_rate": 9.448818897637796e-07, + "loss": 0.7443, + "step": 6 + }, + { + "epoch": 0.003320683111954459, + "grad_norm": 15.170774459838867, + "learning_rate": 1.1023622047244096e-06, + "loss": 0.7255, + "step": 7 + }, + { + "epoch": 0.003795066413662239, + "grad_norm": 12.011886596679688, + "learning_rate": 1.2598425196850396e-06, + "loss": 0.6684, + "step": 8 + }, + { + "epoch": 0.004269449715370019, + "grad_norm": 29.63150405883789, + "learning_rate": 1.4173228346456693e-06, + "loss": 0.6935, + "step": 9 + }, + { + "epoch": 0.004743833017077799, + "grad_norm": 12.307829856872559, + "learning_rate": 1.5748031496062992e-06, + "loss": 0.5554, + "step": 10 + }, + { + "epoch": 0.005218216318785579, + "grad_norm": 15.281328201293945, + "learning_rate": 1.7322834645669292e-06, + "loss": 0.5863, + "step": 11 + }, + { + "epoch": 0.0056925996204933585, + "grad_norm": 10.656610488891602, + "learning_rate": 1.8897637795275591e-06, + "loss": 0.4246, + "step": 12 + }, + { + "epoch": 0.006166982922201139, + "grad_norm": 9.216360092163086, + "learning_rate": 2.0472440944881893e-06, + "loss": 0.3654, + "step": 13 + }, + { + "epoch": 0.006641366223908918, + "grad_norm": 7.334237575531006, + "learning_rate": 2.2047244094488192e-06, + "loss": 0.3389, + "step": 14 + }, + { + "epoch": 0.007115749525616698, + "grad_norm": 9.264925003051758, + "learning_rate": 2.362204724409449e-06, + "loss": 0.3582, + "step": 15 + }, + { + "epoch": 0.007590132827324478, + "grad_norm": 7.431726455688477, + "learning_rate": 2.519685039370079e-06, + "loss": 0.3778, + "step": 16 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 14.502622604370117, + "learning_rate": 2.677165354330709e-06, + "loss": 0.4209, + "step": 17 + }, + { + "epoch": 0.008538899430740038, + "grad_norm": 8.857325553894043, + "learning_rate": 2.8346456692913386e-06, + "loss": 0.3823, + "step": 18 + }, + { + "epoch": 0.009013282732447819, + "grad_norm": 10.866437911987305, + "learning_rate": 2.992125984251969e-06, + "loss": 0.4257, + "step": 19 + }, + { + "epoch": 0.009487666034155597, + "grad_norm": 7.098900318145752, + "learning_rate": 3.1496062992125985e-06, + "loss": 0.324, + "step": 20 + }, + { + "epoch": 0.009962049335863378, + "grad_norm": 7.2487616539001465, + "learning_rate": 3.307086614173229e-06, + "loss": 0.3815, + "step": 21 + }, + { + "epoch": 0.010436432637571158, + "grad_norm": 6.616808891296387, + "learning_rate": 3.4645669291338583e-06, + "loss": 0.2939, + "step": 22 + }, + { + "epoch": 0.010910815939278937, + "grad_norm": 6.747620582580566, + "learning_rate": 3.6220472440944887e-06, + "loss": 0.2914, + "step": 23 + }, + { + "epoch": 0.011385199240986717, + "grad_norm": 5.690284729003906, + "learning_rate": 3.7795275590551182e-06, + "loss": 0.2879, + "step": 24 + }, + { + "epoch": 0.011859582542694497, + "grad_norm": 6.056181907653809, + "learning_rate": 3.937007874015748e-06, + "loss": 0.2785, + "step": 25 + }, + { + "epoch": 0.012333965844402278, + "grad_norm": 7.560486316680908, + "learning_rate": 4.0944881889763785e-06, + "loss": 0.3008, + "step": 26 + }, + { + "epoch": 0.012808349146110056, + "grad_norm": 7.066635608673096, + "learning_rate": 4.251968503937008e-06, + "loss": 0.3298, + "step": 27 + }, + { + "epoch": 0.013282732447817837, + "grad_norm": 5.725413799285889, + "learning_rate": 4.4094488188976384e-06, + "loss": 0.2901, + "step": 28 + }, + { + "epoch": 0.013757115749525617, + "grad_norm": 7.520142555236816, + "learning_rate": 4.566929133858268e-06, + "loss": 0.3461, + "step": 29 + }, + { + "epoch": 0.014231499051233396, + "grad_norm": 6.214694976806641, + "learning_rate": 4.724409448818898e-06, + "loss": 0.3216, + "step": 30 + }, + { + "epoch": 0.014705882352941176, + "grad_norm": 6.9384870529174805, + "learning_rate": 4.881889763779528e-06, + "loss": 0.3779, + "step": 31 + }, + { + "epoch": 0.015180265654648957, + "grad_norm": 6.863812446594238, + "learning_rate": 5.039370078740158e-06, + "loss": 0.3194, + "step": 32 + }, + { + "epoch": 0.015654648956356737, + "grad_norm": 7.183291912078857, + "learning_rate": 5.196850393700788e-06, + "loss": 0.3293, + "step": 33 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 6.995275497436523, + "learning_rate": 5.354330708661418e-06, + "loss": 0.4026, + "step": 34 + }, + { + "epoch": 0.016603415559772294, + "grad_norm": 6.775660514831543, + "learning_rate": 5.511811023622048e-06, + "loss": 0.3289, + "step": 35 + }, + { + "epoch": 0.017077798861480076, + "grad_norm": 6.413484573364258, + "learning_rate": 5.669291338582677e-06, + "loss": 0.3315, + "step": 36 + }, + { + "epoch": 0.017552182163187855, + "grad_norm": 7.173760414123535, + "learning_rate": 5.8267716535433075e-06, + "loss": 0.3633, + "step": 37 + }, + { + "epoch": 0.018026565464895637, + "grad_norm": 4.803355693817139, + "learning_rate": 5.984251968503938e-06, + "loss": 0.3607, + "step": 38 + }, + { + "epoch": 0.018500948766603416, + "grad_norm": 5.451982498168945, + "learning_rate": 6.141732283464567e-06, + "loss": 0.3503, + "step": 39 + }, + { + "epoch": 0.018975332068311195, + "grad_norm": 5.767213821411133, + "learning_rate": 6.299212598425197e-06, + "loss": 0.2794, + "step": 40 + }, + { + "epoch": 0.019449715370018977, + "grad_norm": 6.687100410461426, + "learning_rate": 6.456692913385827e-06, + "loss": 0.3668, + "step": 41 + }, + { + "epoch": 0.019924098671726755, + "grad_norm": 6.374785900115967, + "learning_rate": 6.614173228346458e-06, + "loss": 0.3366, + "step": 42 + }, + { + "epoch": 0.020398481973434534, + "grad_norm": 6.196499347686768, + "learning_rate": 6.771653543307087e-06, + "loss": 0.3421, + "step": 43 + }, + { + "epoch": 0.020872865275142316, + "grad_norm": 6.357893943786621, + "learning_rate": 6.929133858267717e-06, + "loss": 0.3663, + "step": 44 + }, + { + "epoch": 0.021347248576850095, + "grad_norm": 5.325692653656006, + "learning_rate": 7.086614173228347e-06, + "loss": 0.3341, + "step": 45 + }, + { + "epoch": 0.021821631878557873, + "grad_norm": 6.0895538330078125, + "learning_rate": 7.2440944881889774e-06, + "loss": 0.3348, + "step": 46 + }, + { + "epoch": 0.022296015180265655, + "grad_norm": 5.129356384277344, + "learning_rate": 7.401574803149607e-06, + "loss": 0.294, + "step": 47 + }, + { + "epoch": 0.022770398481973434, + "grad_norm": 5.093695163726807, + "learning_rate": 7.5590551181102365e-06, + "loss": 0.3223, + "step": 48 + }, + { + "epoch": 0.023244781783681213, + "grad_norm": 5.516068935394287, + "learning_rate": 7.716535433070867e-06, + "loss": 0.3455, + "step": 49 + }, + { + "epoch": 0.023719165085388995, + "grad_norm": 5.486409664154053, + "learning_rate": 7.874015748031496e-06, + "loss": 0.3349, + "step": 50 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 5.390769004821777, + "learning_rate": 8.031496062992128e-06, + "loss": 0.3474, + "step": 51 + }, + { + "epoch": 0.024667931688804556, + "grad_norm": 5.160117149353027, + "learning_rate": 8.188976377952757e-06, + "loss": 0.3048, + "step": 52 + }, + { + "epoch": 0.025142314990512334, + "grad_norm": 7.194066047668457, + "learning_rate": 8.346456692913387e-06, + "loss": 0.4043, + "step": 53 + }, + { + "epoch": 0.025616698292220113, + "grad_norm": 5.130771636962891, + "learning_rate": 8.503937007874016e-06, + "loss": 0.3362, + "step": 54 + }, + { + "epoch": 0.026091081593927895, + "grad_norm": 5.493372440338135, + "learning_rate": 8.661417322834647e-06, + "loss": 0.3842, + "step": 55 + }, + { + "epoch": 0.026565464895635674, + "grad_norm": 6.465633392333984, + "learning_rate": 8.818897637795277e-06, + "loss": 0.3974, + "step": 56 + }, + { + "epoch": 0.027039848197343452, + "grad_norm": 7.080130100250244, + "learning_rate": 8.976377952755906e-06, + "loss": 0.3579, + "step": 57 + }, + { + "epoch": 0.027514231499051234, + "grad_norm": 5.774561882019043, + "learning_rate": 9.133858267716536e-06, + "loss": 0.374, + "step": 58 + }, + { + "epoch": 0.027988614800759013, + "grad_norm": 5.477121353149414, + "learning_rate": 9.291338582677165e-06, + "loss": 0.3664, + "step": 59 + }, + { + "epoch": 0.028462998102466792, + "grad_norm": 5.975531101226807, + "learning_rate": 9.448818897637797e-06, + "loss": 0.4218, + "step": 60 + }, + { + "epoch": 0.028937381404174574, + "grad_norm": 5.1018967628479, + "learning_rate": 9.606299212598426e-06, + "loss": 0.3879, + "step": 61 + }, + { + "epoch": 0.029411764705882353, + "grad_norm": 5.107121467590332, + "learning_rate": 9.763779527559056e-06, + "loss": 0.3992, + "step": 62 + }, + { + "epoch": 0.02988614800759013, + "grad_norm": 6.47650671005249, + "learning_rate": 9.921259842519685e-06, + "loss": 0.4705, + "step": 63 + }, + { + "epoch": 0.030360531309297913, + "grad_norm": 6.037809371948242, + "learning_rate": 1.0078740157480316e-05, + "loss": 0.4264, + "step": 64 + }, + { + "epoch": 0.030834914611005692, + "grad_norm": 5.349067211151123, + "learning_rate": 1.0236220472440946e-05, + "loss": 0.384, + "step": 65 + }, + { + "epoch": 0.031309297912713474, + "grad_norm": 5.131612777709961, + "learning_rate": 1.0393700787401575e-05, + "loss": 0.3905, + "step": 66 + }, + { + "epoch": 0.03178368121442125, + "grad_norm": 5.476340293884277, + "learning_rate": 1.0551181102362205e-05, + "loss": 0.427, + "step": 67 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 6.4385480880737305, + "learning_rate": 1.0708661417322836e-05, + "loss": 0.5252, + "step": 68 + }, + { + "epoch": 0.03273244781783681, + "grad_norm": 5.29050874710083, + "learning_rate": 1.0866141732283466e-05, + "loss": 0.453, + "step": 69 + }, + { + "epoch": 0.03320683111954459, + "grad_norm": 4.675810813903809, + "learning_rate": 1.1023622047244095e-05, + "loss": 0.412, + "step": 70 + }, + { + "epoch": 0.033681214421252374, + "grad_norm": 5.531217575073242, + "learning_rate": 1.1181102362204725e-05, + "loss": 0.4553, + "step": 71 + }, + { + "epoch": 0.03415559772296015, + "grad_norm": 6.203500747680664, + "learning_rate": 1.1338582677165354e-05, + "loss": 0.4826, + "step": 72 + }, + { + "epoch": 0.03462998102466793, + "grad_norm": 5.102776050567627, + "learning_rate": 1.1496062992125985e-05, + "loss": 0.3427, + "step": 73 + }, + { + "epoch": 0.03510436432637571, + "grad_norm": 4.565568923950195, + "learning_rate": 1.1653543307086615e-05, + "loss": 0.3734, + "step": 74 + }, + { + "epoch": 0.03557874762808349, + "grad_norm": 4.548391819000244, + "learning_rate": 1.1811023622047245e-05, + "loss": 0.3647, + "step": 75 + }, + { + "epoch": 0.036053130929791274, + "grad_norm": 5.1189398765563965, + "learning_rate": 1.1968503937007876e-05, + "loss": 0.4211, + "step": 76 + }, + { + "epoch": 0.03652751423149905, + "grad_norm": 6.630745887756348, + "learning_rate": 1.2125984251968505e-05, + "loss": 0.5191, + "step": 77 + }, + { + "epoch": 0.03700189753320683, + "grad_norm": 5.194817066192627, + "learning_rate": 1.2283464566929135e-05, + "loss": 0.3779, + "step": 78 + }, + { + "epoch": 0.03747628083491461, + "grad_norm": 5.23333740234375, + "learning_rate": 1.2440944881889764e-05, + "loss": 0.4235, + "step": 79 + }, + { + "epoch": 0.03795066413662239, + "grad_norm": 4.812337398529053, + "learning_rate": 1.2598425196850394e-05, + "loss": 0.3958, + "step": 80 + }, + { + "epoch": 0.03842504743833017, + "grad_norm": 4.991153240203857, + "learning_rate": 1.2755905511811025e-05, + "loss": 0.4543, + "step": 81 + }, + { + "epoch": 0.03889943074003795, + "grad_norm": 5.6834025382995605, + "learning_rate": 1.2913385826771655e-05, + "loss": 0.4253, + "step": 82 + }, + { + "epoch": 0.03937381404174573, + "grad_norm": 4.659487247467041, + "learning_rate": 1.3070866141732284e-05, + "loss": 0.3582, + "step": 83 + }, + { + "epoch": 0.03984819734345351, + "grad_norm": 6.373027324676514, + "learning_rate": 1.3228346456692915e-05, + "loss": 0.4211, + "step": 84 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 4.993396759033203, + "learning_rate": 1.3385826771653545e-05, + "loss": 0.417, + "step": 85 + }, + { + "epoch": 0.04079696394686907, + "grad_norm": 5.25504207611084, + "learning_rate": 1.3543307086614174e-05, + "loss": 0.4558, + "step": 86 + }, + { + "epoch": 0.04127134724857685, + "grad_norm": 5.829912185668945, + "learning_rate": 1.3700787401574804e-05, + "loss": 0.473, + "step": 87 + }, + { + "epoch": 0.04174573055028463, + "grad_norm": 5.047908782958984, + "learning_rate": 1.3858267716535433e-05, + "loss": 0.4056, + "step": 88 + }, + { + "epoch": 0.04222011385199241, + "grad_norm": 5.062896251678467, + "learning_rate": 1.4015748031496063e-05, + "loss": 0.4264, + "step": 89 + }, + { + "epoch": 0.04269449715370019, + "grad_norm": 5.646873950958252, + "learning_rate": 1.4173228346456694e-05, + "loss": 0.4772, + "step": 90 + }, + { + "epoch": 0.04316888045540797, + "grad_norm": 4.573570728302002, + "learning_rate": 1.4330708661417324e-05, + "loss": 0.4483, + "step": 91 + }, + { + "epoch": 0.04364326375711575, + "grad_norm": 5.0274128913879395, + "learning_rate": 1.4488188976377955e-05, + "loss": 0.4248, + "step": 92 + }, + { + "epoch": 0.04411764705882353, + "grad_norm": 5.646190643310547, + "learning_rate": 1.4645669291338584e-05, + "loss": 0.5235, + "step": 93 + }, + { + "epoch": 0.04459203036053131, + "grad_norm": 5.01361083984375, + "learning_rate": 1.4803149606299214e-05, + "loss": 0.4591, + "step": 94 + }, + { + "epoch": 0.04506641366223909, + "grad_norm": 5.54431676864624, + "learning_rate": 1.4960629921259843e-05, + "loss": 0.4361, + "step": 95 + }, + { + "epoch": 0.04554079696394687, + "grad_norm": 4.712576389312744, + "learning_rate": 1.5118110236220473e-05, + "loss": 0.3923, + "step": 96 + }, + { + "epoch": 0.04601518026565465, + "grad_norm": 4.848332405090332, + "learning_rate": 1.5275590551181102e-05, + "loss": 0.4677, + "step": 97 + }, + { + "epoch": 0.046489563567362426, + "grad_norm": 5.39124870300293, + "learning_rate": 1.5433070866141734e-05, + "loss": 0.4506, + "step": 98 + }, + { + "epoch": 0.04696394686907021, + "grad_norm": 5.186285972595215, + "learning_rate": 1.559055118110236e-05, + "loss": 0.4037, + "step": 99 + }, + { + "epoch": 0.04743833017077799, + "grad_norm": 4.8791913986206055, + "learning_rate": 1.5748031496062993e-05, + "loss": 0.4467, + "step": 100 + }, + { + "epoch": 0.04791271347248577, + "grad_norm": 5.618642330169678, + "learning_rate": 1.5905511811023624e-05, + "loss": 0.4959, + "step": 101 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 4.833317279815674, + "learning_rate": 1.6062992125984255e-05, + "loss": 0.4752, + "step": 102 + }, + { + "epoch": 0.048861480075901326, + "grad_norm": 4.536538600921631, + "learning_rate": 1.6220472440944883e-05, + "loss": 0.4186, + "step": 103 + }, + { + "epoch": 0.04933586337760911, + "grad_norm": 5.135328769683838, + "learning_rate": 1.6377952755905514e-05, + "loss": 0.4471, + "step": 104 + }, + { + "epoch": 0.04981024667931689, + "grad_norm": 5.110005855560303, + "learning_rate": 1.6535433070866142e-05, + "loss": 0.4585, + "step": 105 + }, + { + "epoch": 0.05028462998102467, + "grad_norm": 5.040478229522705, + "learning_rate": 1.6692913385826773e-05, + "loss": 0.4656, + "step": 106 + }, + { + "epoch": 0.05075901328273245, + "grad_norm": 4.914176940917969, + "learning_rate": 1.68503937007874e-05, + "loss": 0.4783, + "step": 107 + }, + { + "epoch": 0.051233396584440226, + "grad_norm": 4.72114372253418, + "learning_rate": 1.7007874015748032e-05, + "loss": 0.4365, + "step": 108 + }, + { + "epoch": 0.051707779886148005, + "grad_norm": 4.7739362716674805, + "learning_rate": 1.7165354330708663e-05, + "loss": 0.4766, + "step": 109 + }, + { + "epoch": 0.05218216318785579, + "grad_norm": 4.787454128265381, + "learning_rate": 1.7322834645669295e-05, + "loss": 0.4542, + "step": 110 + }, + { + "epoch": 0.05265654648956357, + "grad_norm": 5.132379055023193, + "learning_rate": 1.7480314960629923e-05, + "loss": 0.4978, + "step": 111 + }, + { + "epoch": 0.05313092979127135, + "grad_norm": 6.812975883483887, + "learning_rate": 1.7637795275590554e-05, + "loss": 0.4559, + "step": 112 + }, + { + "epoch": 0.053605313092979126, + "grad_norm": 5.09624719619751, + "learning_rate": 1.779527559055118e-05, + "loss": 0.5481, + "step": 113 + }, + { + "epoch": 0.054079696394686905, + "grad_norm": 5.495835781097412, + "learning_rate": 1.7952755905511813e-05, + "loss": 0.5282, + "step": 114 + }, + { + "epoch": 0.05455407969639469, + "grad_norm": 6.671237468719482, + "learning_rate": 1.811023622047244e-05, + "loss": 0.5136, + "step": 115 + }, + { + "epoch": 0.05502846299810247, + "grad_norm": 7.128397464752197, + "learning_rate": 1.8267716535433072e-05, + "loss": 0.5009, + "step": 116 + }, + { + "epoch": 0.05550284629981025, + "grad_norm": 4.9932026863098145, + "learning_rate": 1.8425196850393703e-05, + "loss": 0.4473, + "step": 117 + }, + { + "epoch": 0.055977229601518026, + "grad_norm": 5.382380485534668, + "learning_rate": 1.858267716535433e-05, + "loss": 0.5041, + "step": 118 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 4.284328937530518, + "learning_rate": 1.8740157480314962e-05, + "loss": 0.4731, + "step": 119 + }, + { + "epoch": 0.056925996204933584, + "grad_norm": 6.466545104980469, + "learning_rate": 1.8897637795275593e-05, + "loss": 0.5566, + "step": 120 + }, + { + "epoch": 0.05740037950664137, + "grad_norm": 4.7638959884643555, + "learning_rate": 1.905511811023622e-05, + "loss": 0.5096, + "step": 121 + }, + { + "epoch": 0.05787476280834915, + "grad_norm": 5.770228862762451, + "learning_rate": 1.9212598425196852e-05, + "loss": 0.5628, + "step": 122 + }, + { + "epoch": 0.058349146110056926, + "grad_norm": 5.023980140686035, + "learning_rate": 1.937007874015748e-05, + "loss": 0.4875, + "step": 123 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 5.017863750457764, + "learning_rate": 1.952755905511811e-05, + "loss": 0.5002, + "step": 124 + }, + { + "epoch": 0.059297912713472484, + "grad_norm": 5.194025039672852, + "learning_rate": 1.9685039370078743e-05, + "loss": 0.4668, + "step": 125 + }, + { + "epoch": 0.05977229601518026, + "grad_norm": 5.176628589630127, + "learning_rate": 1.984251968503937e-05, + "loss": 0.4949, + "step": 126 + }, + { + "epoch": 0.06024667931688805, + "grad_norm": 5.499045372009277, + "learning_rate": 2e-05, + "loss": 0.502, + "step": 127 + }, + { + "epoch": 0.06072106261859583, + "grad_norm": 4.964819431304932, + "learning_rate": 1.999999704854948e-05, + "loss": 0.5428, + "step": 128 + }, + { + "epoch": 0.061195445920303605, + "grad_norm": 4.875455379486084, + "learning_rate": 1.9999988194199653e-05, + "loss": 0.473, + "step": 129 + }, + { + "epoch": 0.061669829222011384, + "grad_norm": 4.419773578643799, + "learning_rate": 1.999997343695575e-05, + "loss": 0.4555, + "step": 130 + }, + { + "epoch": 0.06214421252371916, + "grad_norm": 5.253462314605713, + "learning_rate": 1.999995277682648e-05, + "loss": 0.563, + "step": 131 + }, + { + "epoch": 0.06261859582542695, + "grad_norm": 4.706827640533447, + "learning_rate": 1.999992621382404e-05, + "loss": 0.5296, + "step": 132 + }, + { + "epoch": 0.06309297912713473, + "grad_norm": 4.565430641174316, + "learning_rate": 1.9999893747964108e-05, + "loss": 0.4412, + "step": 133 + }, + { + "epoch": 0.0635673624288425, + "grad_norm": 4.702053070068359, + "learning_rate": 1.9999855379265855e-05, + "loss": 0.5513, + "step": 134 + }, + { + "epoch": 0.06404174573055028, + "grad_norm": 4.838067531585693, + "learning_rate": 1.999981110775192e-05, + "loss": 0.5562, + "step": 135 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 4.499827861785889, + "learning_rate": 1.9999760933448443e-05, + "loss": 0.549, + "step": 136 + }, + { + "epoch": 0.06499051233396584, + "grad_norm": 4.42749547958374, + "learning_rate": 1.9999704856385037e-05, + "loss": 0.5549, + "step": 137 + }, + { + "epoch": 0.06546489563567362, + "grad_norm": 4.693006992340088, + "learning_rate": 1.9999642876594806e-05, + "loss": 0.4435, + "step": 138 + }, + { + "epoch": 0.0659392789373814, + "grad_norm": 4.892571449279785, + "learning_rate": 1.9999574994114336e-05, + "loss": 0.5113, + "step": 139 + }, + { + "epoch": 0.06641366223908918, + "grad_norm": 4.446290493011475, + "learning_rate": 1.9999501208983692e-05, + "loss": 0.5456, + "step": 140 + }, + { + "epoch": 0.06688804554079697, + "grad_norm": 5.799952507019043, + "learning_rate": 1.999942152124644e-05, + "loss": 0.517, + "step": 141 + }, + { + "epoch": 0.06736242884250475, + "grad_norm": 4.448901653289795, + "learning_rate": 1.9999335930949612e-05, + "loss": 0.4831, + "step": 142 + }, + { + "epoch": 0.06783681214421253, + "grad_norm": 5.490376949310303, + "learning_rate": 1.999924443814373e-05, + "loss": 0.5247, + "step": 143 + }, + { + "epoch": 0.0683111954459203, + "grad_norm": 4.813867568969727, + "learning_rate": 1.9999147042882803e-05, + "loss": 0.4864, + "step": 144 + }, + { + "epoch": 0.06878557874762808, + "grad_norm": 4.6280293464660645, + "learning_rate": 1.9999043745224324e-05, + "loss": 0.5794, + "step": 145 + }, + { + "epoch": 0.06925996204933586, + "grad_norm": 4.530418872833252, + "learning_rate": 1.9998934545229266e-05, + "loss": 0.5069, + "step": 146 + }, + { + "epoch": 0.06973434535104364, + "grad_norm": 5.701735019683838, + "learning_rate": 1.9998819442962088e-05, + "loss": 0.657, + "step": 147 + }, + { + "epoch": 0.07020872865275142, + "grad_norm": 5.44976282119751, + "learning_rate": 1.999869843849074e-05, + "loss": 0.5559, + "step": 148 + }, + { + "epoch": 0.0706831119544592, + "grad_norm": 4.931896686553955, + "learning_rate": 1.999857153188664e-05, + "loss": 0.527, + "step": 149 + }, + { + "epoch": 0.07115749525616698, + "grad_norm": 4.865047931671143, + "learning_rate": 1.999843872322471e-05, + "loss": 0.5177, + "step": 150 + }, + { + "epoch": 0.07163187855787476, + "grad_norm": 4.696810245513916, + "learning_rate": 1.9998300012583333e-05, + "loss": 0.5715, + "step": 151 + }, + { + "epoch": 0.07210626185958255, + "grad_norm": 4.350542068481445, + "learning_rate": 1.99981554000444e-05, + "loss": 0.4627, + "step": 152 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 5.0633111000061035, + "learning_rate": 1.999800488569327e-05, + "loss": 0.5614, + "step": 153 + }, + { + "epoch": 0.0730550284629981, + "grad_norm": 4.741815090179443, + "learning_rate": 1.999784846961879e-05, + "loss": 0.5093, + "step": 154 + }, + { + "epoch": 0.07352941176470588, + "grad_norm": 6.608360290527344, + "learning_rate": 1.9997686151913297e-05, + "loss": 0.497, + "step": 155 + }, + { + "epoch": 0.07400379506641366, + "grad_norm": 5.2583441734313965, + "learning_rate": 1.9997517932672592e-05, + "loss": 0.5925, + "step": 156 + }, + { + "epoch": 0.07447817836812144, + "grad_norm": 4.322223663330078, + "learning_rate": 1.9997343811995985e-05, + "loss": 0.5369, + "step": 157 + }, + { + "epoch": 0.07495256166982922, + "grad_norm": 4.527962684631348, + "learning_rate": 1.9997163789986255e-05, + "loss": 0.5313, + "step": 158 + }, + { + "epoch": 0.075426944971537, + "grad_norm": 4.953525066375732, + "learning_rate": 1.999697786674966e-05, + "loss": 0.5229, + "step": 159 + }, + { + "epoch": 0.07590132827324478, + "grad_norm": 5.049784183502197, + "learning_rate": 1.999678604239596e-05, + "loss": 0.4546, + "step": 160 + }, + { + "epoch": 0.07637571157495256, + "grad_norm": 5.497480392456055, + "learning_rate": 1.9996588317038382e-05, + "loss": 0.5097, + "step": 161 + }, + { + "epoch": 0.07685009487666034, + "grad_norm": 4.220514297485352, + "learning_rate": 1.9996384690793634e-05, + "loss": 0.471, + "step": 162 + }, + { + "epoch": 0.07732447817836813, + "grad_norm": 4.465171813964844, + "learning_rate": 1.999617516378193e-05, + "loss": 0.4938, + "step": 163 + }, + { + "epoch": 0.0777988614800759, + "grad_norm": 5.333194255828857, + "learning_rate": 1.999595973612694e-05, + "loss": 0.5676, + "step": 164 + }, + { + "epoch": 0.07827324478178369, + "grad_norm": 4.7594780921936035, + "learning_rate": 1.999573840795583e-05, + "loss": 0.5586, + "step": 165 + }, + { + "epoch": 0.07874762808349146, + "grad_norm": 4.380119800567627, + "learning_rate": 1.9995511179399253e-05, + "loss": 0.5206, + "step": 166 + }, + { + "epoch": 0.07922201138519924, + "grad_norm": 5.4240922927856445, + "learning_rate": 1.9995278050591334e-05, + "loss": 0.6165, + "step": 167 + }, + { + "epoch": 0.07969639468690702, + "grad_norm": 5.2482075691223145, + "learning_rate": 1.9995039021669692e-05, + "loss": 0.5472, + "step": 168 + }, + { + "epoch": 0.0801707779886148, + "grad_norm": 4.465966701507568, + "learning_rate": 1.9994794092775418e-05, + "loss": 0.5151, + "step": 169 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 4.838151454925537, + "learning_rate": 1.9994543264053093e-05, + "loss": 0.5031, + "step": 170 + }, + { + "epoch": 0.08111954459203036, + "grad_norm": 4.365642070770264, + "learning_rate": 1.9994286535650782e-05, + "loss": 0.4621, + "step": 171 + }, + { + "epoch": 0.08159392789373814, + "grad_norm": 5.1148152351379395, + "learning_rate": 1.9994023907720027e-05, + "loss": 0.5432, + "step": 172 + }, + { + "epoch": 0.08206831119544591, + "grad_norm": 4.51958703994751, + "learning_rate": 1.9993755380415854e-05, + "loss": 0.526, + "step": 173 + }, + { + "epoch": 0.0825426944971537, + "grad_norm": 3.9636707305908203, + "learning_rate": 1.999348095389677e-05, + "loss": 0.4614, + "step": 174 + }, + { + "epoch": 0.08301707779886149, + "grad_norm": 5.010396480560303, + "learning_rate": 1.999320062832477e-05, + "loss": 0.5862, + "step": 175 + }, + { + "epoch": 0.08349146110056926, + "grad_norm": 4.869819164276123, + "learning_rate": 1.9992914403865327e-05, + "loss": 0.5009, + "step": 176 + }, + { + "epoch": 0.08396584440227704, + "grad_norm": 5.5586838722229, + "learning_rate": 1.9992622280687395e-05, + "loss": 0.5472, + "step": 177 + }, + { + "epoch": 0.08444022770398482, + "grad_norm": 4.700314998626709, + "learning_rate": 1.9992324258963414e-05, + "loss": 0.5833, + "step": 178 + }, + { + "epoch": 0.0849146110056926, + "grad_norm": 5.759016990661621, + "learning_rate": 1.99920203388693e-05, + "loss": 0.6045, + "step": 179 + }, + { + "epoch": 0.08538899430740038, + "grad_norm": 5.080048084259033, + "learning_rate": 1.999171052058445e-05, + "loss": 0.5256, + "step": 180 + }, + { + "epoch": 0.08586337760910816, + "grad_norm": 4.277463912963867, + "learning_rate": 1.999139480429176e-05, + "loss": 0.5763, + "step": 181 + }, + { + "epoch": 0.08633776091081594, + "grad_norm": 5.613553524017334, + "learning_rate": 1.999107319017758e-05, + "loss": 0.5423, + "step": 182 + }, + { + "epoch": 0.08681214421252371, + "grad_norm": 4.811156272888184, + "learning_rate": 1.9990745678431765e-05, + "loss": 0.4214, + "step": 183 + }, + { + "epoch": 0.0872865275142315, + "grad_norm": 4.72564697265625, + "learning_rate": 1.9990412269247637e-05, + "loss": 0.466, + "step": 184 + }, + { + "epoch": 0.08776091081593927, + "grad_norm": 7.244020938873291, + "learning_rate": 1.999007296282201e-05, + "loss": 0.5073, + "step": 185 + }, + { + "epoch": 0.08823529411764706, + "grad_norm": 4.737941741943359, + "learning_rate": 1.9989727759355164e-05, + "loss": 0.5523, + "step": 186 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 4.7859086990356445, + "learning_rate": 1.9989376659050878e-05, + "loss": 0.5216, + "step": 187 + }, + { + "epoch": 0.08918406072106262, + "grad_norm": 4.781891345977783, + "learning_rate": 1.9989019662116395e-05, + "loss": 0.5328, + "step": 188 + }, + { + "epoch": 0.0896584440227704, + "grad_norm": 4.595061779022217, + "learning_rate": 1.9988656768762455e-05, + "loss": 0.5584, + "step": 189 + }, + { + "epoch": 0.09013282732447818, + "grad_norm": 4.338557720184326, + "learning_rate": 1.9988287979203264e-05, + "loss": 0.4839, + "step": 190 + }, + { + "epoch": 0.09060721062618596, + "grad_norm": 4.124628067016602, + "learning_rate": 1.9987913293656515e-05, + "loss": 0.5612, + "step": 191 + }, + { + "epoch": 0.09108159392789374, + "grad_norm": 3.821711778640747, + "learning_rate": 1.9987532712343388e-05, + "loss": 0.4442, + "step": 192 + }, + { + "epoch": 0.09155597722960152, + "grad_norm": 3.7175636291503906, + "learning_rate": 1.9987146235488532e-05, + "loss": 0.5021, + "step": 193 + }, + { + "epoch": 0.0920303605313093, + "grad_norm": 4.476022720336914, + "learning_rate": 1.9986753863320077e-05, + "loss": 0.524, + "step": 194 + }, + { + "epoch": 0.09250474383301707, + "grad_norm": 4.444040298461914, + "learning_rate": 1.998635559606964e-05, + "loss": 0.4565, + "step": 195 + }, + { + "epoch": 0.09297912713472485, + "grad_norm": 4.800974369049072, + "learning_rate": 1.9985951433972313e-05, + "loss": 0.5845, + "step": 196 + }, + { + "epoch": 0.09345351043643264, + "grad_norm": 4.884645462036133, + "learning_rate": 1.9985541377266675e-05, + "loss": 0.5319, + "step": 197 + }, + { + "epoch": 0.09392789373814042, + "grad_norm": 4.790307521820068, + "learning_rate": 1.998512542619477e-05, + "loss": 0.505, + "step": 198 + }, + { + "epoch": 0.0944022770398482, + "grad_norm": 4.581803321838379, + "learning_rate": 1.998470358100213e-05, + "loss": 0.5228, + "step": 199 + }, + { + "epoch": 0.09487666034155598, + "grad_norm": 4.337541580200195, + "learning_rate": 1.9984275841937776e-05, + "loss": 0.5135, + "step": 200 + }, + { + "epoch": 0.09535104364326376, + "grad_norm": 4.175999641418457, + "learning_rate": 1.9983842209254182e-05, + "loss": 0.5219, + "step": 201 + }, + { + "epoch": 0.09582542694497154, + "grad_norm": 3.8409905433654785, + "learning_rate": 1.9983402683207334e-05, + "loss": 0.4495, + "step": 202 + }, + { + "epoch": 0.09629981024667932, + "grad_norm": 3.7136104106903076, + "learning_rate": 1.9982957264056667e-05, + "loss": 0.5357, + "step": 203 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 4.796589374542236, + "learning_rate": 1.9982505952065115e-05, + "loss": 0.5567, + "step": 204 + }, + { + "epoch": 0.09724857685009487, + "grad_norm": 5.0165886878967285, + "learning_rate": 1.9982048747499082e-05, + "loss": 0.5025, + "step": 205 + }, + { + "epoch": 0.09772296015180265, + "grad_norm": 4.150423049926758, + "learning_rate": 1.9981585650628447e-05, + "loss": 0.5405, + "step": 206 + }, + { + "epoch": 0.09819734345351043, + "grad_norm": 3.635175943374634, + "learning_rate": 1.9981116661726575e-05, + "loss": 0.4784, + "step": 207 + }, + { + "epoch": 0.09867172675521822, + "grad_norm": 4.339625358581543, + "learning_rate": 1.998064178107031e-05, + "loss": 0.5831, + "step": 208 + }, + { + "epoch": 0.099146110056926, + "grad_norm": 4.854864120483398, + "learning_rate": 1.9980161008939957e-05, + "loss": 0.6303, + "step": 209 + }, + { + "epoch": 0.09962049335863378, + "grad_norm": 4.287778377532959, + "learning_rate": 1.9979674345619322e-05, + "loss": 0.467, + "step": 210 + }, + { + "epoch": 0.10009487666034156, + "grad_norm": 3.7619311809539795, + "learning_rate": 1.997918179139567e-05, + "loss": 0.5403, + "step": 211 + }, + { + "epoch": 0.10056925996204934, + "grad_norm": 3.486117124557495, + "learning_rate": 1.9978683346559762e-05, + "loss": 0.4819, + "step": 212 + }, + { + "epoch": 0.10104364326375712, + "grad_norm": 4.301611423492432, + "learning_rate": 1.9978179011405814e-05, + "loss": 0.5293, + "step": 213 + }, + { + "epoch": 0.1015180265654649, + "grad_norm": 4.88173770904541, + "learning_rate": 1.9977668786231536e-05, + "loss": 0.5506, + "step": 214 + }, + { + "epoch": 0.10199240986717267, + "grad_norm": 4.954184532165527, + "learning_rate": 1.99771526713381e-05, + "loss": 0.5209, + "step": 215 + }, + { + "epoch": 0.10246679316888045, + "grad_norm": 4.852926254272461, + "learning_rate": 1.9976630667030175e-05, + "loss": 0.5922, + "step": 216 + }, + { + "epoch": 0.10294117647058823, + "grad_norm": 5.19386100769043, + "learning_rate": 1.9976102773615894e-05, + "loss": 0.5261, + "step": 217 + }, + { + "epoch": 0.10341555977229601, + "grad_norm": 4.068961143493652, + "learning_rate": 1.997556899140686e-05, + "loss": 0.5104, + "step": 218 + }, + { + "epoch": 0.1038899430740038, + "grad_norm": 3.790403366088867, + "learning_rate": 1.997502932071816e-05, + "loss": 0.4915, + "step": 219 + }, + { + "epoch": 0.10436432637571158, + "grad_norm": 5.95817756652832, + "learning_rate": 1.997448376186836e-05, + "loss": 0.4801, + "step": 220 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 4.333256244659424, + "learning_rate": 1.9973932315179502e-05, + "loss": 0.5602, + "step": 221 + }, + { + "epoch": 0.10531309297912714, + "grad_norm": 3.932823419570923, + "learning_rate": 1.997337498097709e-05, + "loss": 0.5351, + "step": 222 + }, + { + "epoch": 0.10578747628083492, + "grad_norm": 4.296048164367676, + "learning_rate": 1.9972811759590117e-05, + "loss": 0.5502, + "step": 223 + }, + { + "epoch": 0.1062618595825427, + "grad_norm": 4.166548252105713, + "learning_rate": 1.997224265135105e-05, + "loss": 0.5277, + "step": 224 + }, + { + "epoch": 0.10673624288425047, + "grad_norm": 5.517498016357422, + "learning_rate": 1.9971667656595824e-05, + "loss": 0.5458, + "step": 225 + }, + { + "epoch": 0.10721062618595825, + "grad_norm": 4.525831699371338, + "learning_rate": 1.9971086775663856e-05, + "loss": 0.5935, + "step": 226 + }, + { + "epoch": 0.10768500948766603, + "grad_norm": 4.5020012855529785, + "learning_rate": 1.997050000889803e-05, + "loss": 0.4852, + "step": 227 + }, + { + "epoch": 0.10815939278937381, + "grad_norm": 4.161679744720459, + "learning_rate": 1.9969907356644716e-05, + "loss": 0.5776, + "step": 228 + }, + { + "epoch": 0.10863377609108159, + "grad_norm": 5.031361103057861, + "learning_rate": 1.996930881925374e-05, + "loss": 0.5912, + "step": 229 + }, + { + "epoch": 0.10910815939278938, + "grad_norm": 4.38935661315918, + "learning_rate": 1.9968704397078422e-05, + "loss": 0.5005, + "step": 230 + }, + { + "epoch": 0.10958254269449716, + "grad_norm": 4.4222412109375, + "learning_rate": 1.996809409047554e-05, + "loss": 0.5669, + "step": 231 + }, + { + "epoch": 0.11005692599620494, + "grad_norm": 4.336076736450195, + "learning_rate": 1.996747789980536e-05, + "loss": 0.5975, + "step": 232 + }, + { + "epoch": 0.11053130929791272, + "grad_norm": 4.671880722045898, + "learning_rate": 1.9966855825431605e-05, + "loss": 0.6065, + "step": 233 + }, + { + "epoch": 0.1110056925996205, + "grad_norm": 3.9240012168884277, + "learning_rate": 1.996622786772148e-05, + "loss": 0.474, + "step": 234 + }, + { + "epoch": 0.11148007590132827, + "grad_norm": 4.048487663269043, + "learning_rate": 1.9965594027045668e-05, + "loss": 0.5361, + "step": 235 + }, + { + "epoch": 0.11195445920303605, + "grad_norm": 3.9906532764434814, + "learning_rate": 1.996495430377831e-05, + "loss": 0.6107, + "step": 236 + }, + { + "epoch": 0.11242884250474383, + "grad_norm": 4.5680084228515625, + "learning_rate": 1.996430869829704e-05, + "loss": 0.4812, + "step": 237 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 4.888277530670166, + "learning_rate": 1.9963657210982947e-05, + "loss": 0.479, + "step": 238 + }, + { + "epoch": 0.11337760910815939, + "grad_norm": 3.874725103378296, + "learning_rate": 1.9962999842220596e-05, + "loss": 0.5418, + "step": 239 + }, + { + "epoch": 0.11385199240986717, + "grad_norm": 4.379036903381348, + "learning_rate": 1.9962336592398027e-05, + "loss": 0.4977, + "step": 240 + }, + { + "epoch": 0.11432637571157495, + "grad_norm": 4.229637145996094, + "learning_rate": 1.9961667461906743e-05, + "loss": 0.5353, + "step": 241 + }, + { + "epoch": 0.11480075901328274, + "grad_norm": 4.034006118774414, + "learning_rate": 1.9960992451141737e-05, + "loss": 0.5121, + "step": 242 + }, + { + "epoch": 0.11527514231499052, + "grad_norm": 4.3000288009643555, + "learning_rate": 1.9960311560501457e-05, + "loss": 0.6312, + "step": 243 + }, + { + "epoch": 0.1157495256166983, + "grad_norm": 4.055355072021484, + "learning_rate": 1.995962479038782e-05, + "loss": 0.5626, + "step": 244 + }, + { + "epoch": 0.11622390891840607, + "grad_norm": 4.0171990394592285, + "learning_rate": 1.9958932141206224e-05, + "loss": 0.4723, + "step": 245 + }, + { + "epoch": 0.11669829222011385, + "grad_norm": 4.504859924316406, + "learning_rate": 1.9958233613365534e-05, + "loss": 0.5482, + "step": 246 + }, + { + "epoch": 0.11717267552182163, + "grad_norm": 4.141243934631348, + "learning_rate": 1.9957529207278082e-05, + "loss": 0.4875, + "step": 247 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 4.3180389404296875, + "learning_rate": 1.9956818923359677e-05, + "loss": 0.5515, + "step": 248 + }, + { + "epoch": 0.11812144212523719, + "grad_norm": 3.9524307250976562, + "learning_rate": 1.995610276202958e-05, + "loss": 0.5466, + "step": 249 + }, + { + "epoch": 0.11859582542694497, + "grad_norm": 3.8721537590026855, + "learning_rate": 1.995538072371055e-05, + "loss": 0.6034, + "step": 250 + }, + { + "epoch": 0.11907020872865275, + "grad_norm": 4.435497283935547, + "learning_rate": 1.9954652808828793e-05, + "loss": 0.5693, + "step": 251 + }, + { + "epoch": 0.11954459203036052, + "grad_norm": 3.3445560932159424, + "learning_rate": 1.9953919017813985e-05, + "loss": 0.3989, + "step": 252 + }, + { + "epoch": 0.12001897533206832, + "grad_norm": 3.7608346939086914, + "learning_rate": 1.9953179351099276e-05, + "loss": 0.5034, + "step": 253 + }, + { + "epoch": 0.1204933586337761, + "grad_norm": 4.122411727905273, + "learning_rate": 1.995243380912129e-05, + "loss": 0.5092, + "step": 254 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 4.1877007484436035, + "learning_rate": 1.995168239232011e-05, + "loss": 0.5847, + "step": 255 + }, + { + "epoch": 0.12144212523719165, + "grad_norm": 4.213406562805176, + "learning_rate": 1.9950925101139292e-05, + "loss": 0.4321, + "step": 256 + }, + { + "epoch": 0.12191650853889943, + "grad_norm": 3.9307830333709717, + "learning_rate": 1.995016193602585e-05, + "loss": 0.5166, + "step": 257 + }, + { + "epoch": 0.12239089184060721, + "grad_norm": 4.194809913635254, + "learning_rate": 1.9949392897430283e-05, + "loss": 0.5193, + "step": 258 + }, + { + "epoch": 0.12286527514231499, + "grad_norm": 5.011254787445068, + "learning_rate": 1.994861798580654e-05, + "loss": 0.6044, + "step": 259 + }, + { + "epoch": 0.12333965844402277, + "grad_norm": 3.800550937652588, + "learning_rate": 1.9947837201612046e-05, + "loss": 0.5104, + "step": 260 + }, + { + "epoch": 0.12381404174573055, + "grad_norm": 4.209667682647705, + "learning_rate": 1.9947050545307693e-05, + "loss": 0.5558, + "step": 261 + }, + { + "epoch": 0.12428842504743833, + "grad_norm": 4.35175085067749, + "learning_rate": 1.994625801735783e-05, + "loss": 0.4245, + "step": 262 + }, + { + "epoch": 0.1247628083491461, + "grad_norm": 4.295873165130615, + "learning_rate": 1.9945459618230282e-05, + "loss": 0.5044, + "step": 263 + }, + { + "epoch": 0.1252371916508539, + "grad_norm": 4.467810153961182, + "learning_rate": 1.9944655348396336e-05, + "loss": 0.5308, + "step": 264 + }, + { + "epoch": 0.12571157495256166, + "grad_norm": 3.797988176345825, + "learning_rate": 1.9943845208330742e-05, + "loss": 0.4543, + "step": 265 + }, + { + "epoch": 0.12618595825426945, + "grad_norm": 3.5803213119506836, + "learning_rate": 1.9943029198511724e-05, + "loss": 0.4598, + "step": 266 + }, + { + "epoch": 0.12666034155597722, + "grad_norm": 3.6839261054992676, + "learning_rate": 1.9942207319420962e-05, + "loss": 0.5384, + "step": 267 + }, + { + "epoch": 0.127134724857685, + "grad_norm": 3.6129837036132812, + "learning_rate": 1.9941379571543597e-05, + "loss": 0.5078, + "step": 268 + }, + { + "epoch": 0.12760910815939278, + "grad_norm": 4.039849281311035, + "learning_rate": 1.9940545955368247e-05, + "loss": 0.6595, + "step": 269 + }, + { + "epoch": 0.12808349146110057, + "grad_norm": 4.292903423309326, + "learning_rate": 1.993970647138699e-05, + "loss": 0.6002, + "step": 270 + }, + { + "epoch": 0.12855787476280836, + "grad_norm": 3.4465274810791016, + "learning_rate": 1.9938861120095353e-05, + "loss": 0.4465, + "step": 271 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 5.027604579925537, + "learning_rate": 1.993800990199235e-05, + "loss": 0.5447, + "step": 272 + }, + { + "epoch": 0.12950664136622392, + "grad_norm": 4.433981418609619, + "learning_rate": 1.993715281758044e-05, + "loss": 0.5327, + "step": 273 + }, + { + "epoch": 0.12998102466793168, + "grad_norm": 4.061509609222412, + "learning_rate": 1.9936289867365557e-05, + "loss": 0.4561, + "step": 274 + }, + { + "epoch": 0.13045540796963948, + "grad_norm": 4.023436546325684, + "learning_rate": 1.9935421051857088e-05, + "loss": 0.5113, + "step": 275 + }, + { + "epoch": 0.13092979127134724, + "grad_norm": 4.111091613769531, + "learning_rate": 1.9934546371567888e-05, + "loss": 0.5168, + "step": 276 + }, + { + "epoch": 0.13140417457305503, + "grad_norm": 4.130946636199951, + "learning_rate": 1.9933665827014272e-05, + "loss": 0.5131, + "step": 277 + }, + { + "epoch": 0.1318785578747628, + "grad_norm": 4.0225348472595215, + "learning_rate": 1.9932779418716012e-05, + "loss": 0.4993, + "step": 278 + }, + { + "epoch": 0.1323529411764706, + "grad_norm": 4.16855525970459, + "learning_rate": 1.9931887147196355e-05, + "loss": 0.4827, + "step": 279 + }, + { + "epoch": 0.13282732447817835, + "grad_norm": 4.637106418609619, + "learning_rate": 1.9930989012981992e-05, + "loss": 0.4109, + "step": 280 + }, + { + "epoch": 0.13330170777988615, + "grad_norm": 3.4839680194854736, + "learning_rate": 1.993008501660309e-05, + "loss": 0.4779, + "step": 281 + }, + { + "epoch": 0.13377609108159394, + "grad_norm": 4.075927734375, + "learning_rate": 1.9929175158593262e-05, + "loss": 0.531, + "step": 282 + }, + { + "epoch": 0.1342504743833017, + "grad_norm": 4.005972385406494, + "learning_rate": 1.992825943948959e-05, + "loss": 0.5608, + "step": 283 + }, + { + "epoch": 0.1347248576850095, + "grad_norm": 3.9188387393951416, + "learning_rate": 1.9927337859832617e-05, + "loss": 0.5646, + "step": 284 + }, + { + "epoch": 0.13519924098671726, + "grad_norm": 3.8422210216522217, + "learning_rate": 1.9926410420166343e-05, + "loss": 0.4807, + "step": 285 + }, + { + "epoch": 0.13567362428842505, + "grad_norm": 4.120503902435303, + "learning_rate": 1.9925477121038218e-05, + "loss": 0.6104, + "step": 286 + }, + { + "epoch": 0.13614800759013282, + "grad_norm": 4.473390579223633, + "learning_rate": 1.992453796299917e-05, + "loss": 0.5511, + "step": 287 + }, + { + "epoch": 0.1366223908918406, + "grad_norm": 4.302347183227539, + "learning_rate": 1.9923592946603573e-05, + "loss": 0.4618, + "step": 288 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 7.132138729095459, + "learning_rate": 1.992264207240925e-05, + "loss": 0.5165, + "step": 289 + }, + { + "epoch": 0.13757115749525617, + "grad_norm": 4.486799716949463, + "learning_rate": 1.9921685340977506e-05, + "loss": 0.459, + "step": 290 + }, + { + "epoch": 0.13804554079696393, + "grad_norm": 4.0745038986206055, + "learning_rate": 1.992072275287308e-05, + "loss": 0.4965, + "step": 291 + }, + { + "epoch": 0.13851992409867173, + "grad_norm": 4.047938346862793, + "learning_rate": 1.991975430866419e-05, + "loss": 0.5792, + "step": 292 + }, + { + "epoch": 0.13899430740037952, + "grad_norm": 4.198129177093506, + "learning_rate": 1.9918780008922484e-05, + "loss": 0.5546, + "step": 293 + }, + { + "epoch": 0.13946869070208728, + "grad_norm": 4.39850378036499, + "learning_rate": 1.9917799854223093e-05, + "loss": 0.4974, + "step": 294 + }, + { + "epoch": 0.13994307400379508, + "grad_norm": 5.012327194213867, + "learning_rate": 1.9916813845144587e-05, + "loss": 0.5658, + "step": 295 + }, + { + "epoch": 0.14041745730550284, + "grad_norm": 4.509042263031006, + "learning_rate": 1.9915821982269002e-05, + "loss": 0.5774, + "step": 296 + }, + { + "epoch": 0.14089184060721063, + "grad_norm": 4.060871124267578, + "learning_rate": 1.9914824266181818e-05, + "loss": 0.4573, + "step": 297 + }, + { + "epoch": 0.1413662239089184, + "grad_norm": 3.875030040740967, + "learning_rate": 1.9913820697471988e-05, + "loss": 0.4239, + "step": 298 + }, + { + "epoch": 0.1418406072106262, + "grad_norm": 4.114848613739014, + "learning_rate": 1.9912811276731895e-05, + "loss": 0.5666, + "step": 299 + }, + { + "epoch": 0.14231499051233396, + "grad_norm": 3.5044398307800293, + "learning_rate": 1.9911796004557397e-05, + "loss": 0.5189, + "step": 300 + }, + { + "epoch": 0.14278937381404175, + "grad_norm": 3.8690919876098633, + "learning_rate": 1.9910774881547803e-05, + "loss": 0.5543, + "step": 301 + }, + { + "epoch": 0.1432637571157495, + "grad_norm": 3.6126976013183594, + "learning_rate": 1.9909747908305866e-05, + "loss": 0.4635, + "step": 302 + }, + { + "epoch": 0.1437381404174573, + "grad_norm": 3.76629900932312, + "learning_rate": 1.99087150854378e-05, + "loss": 0.5394, + "step": 303 + }, + { + "epoch": 0.1442125237191651, + "grad_norm": 4.235199451446533, + "learning_rate": 1.990767641355327e-05, + "loss": 0.6441, + "step": 304 + }, + { + "epoch": 0.14468690702087286, + "grad_norm": 3.634746551513672, + "learning_rate": 1.9906631893265393e-05, + "loss": 0.5005, + "step": 305 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 3.812934160232544, + "learning_rate": 1.990558152519074e-05, + "loss": 0.4656, + "step": 306 + }, + { + "epoch": 0.14563567362428842, + "grad_norm": 3.4849369525909424, + "learning_rate": 1.9904525309949332e-05, + "loss": 0.4462, + "step": 307 + }, + { + "epoch": 0.1461100569259962, + "grad_norm": 4.69333553314209, + "learning_rate": 1.9903463248164643e-05, + "loss": 0.5539, + "step": 308 + }, + { + "epoch": 0.14658444022770398, + "grad_norm": 4.567856788635254, + "learning_rate": 1.99023953404636e-05, + "loss": 0.5745, + "step": 309 + }, + { + "epoch": 0.14705882352941177, + "grad_norm": 3.9516940116882324, + "learning_rate": 1.9901321587476573e-05, + "loss": 0.5849, + "step": 310 + }, + { + "epoch": 0.14753320683111953, + "grad_norm": 4.662928581237793, + "learning_rate": 1.9900241989837395e-05, + "loss": 0.57, + "step": 311 + }, + { + "epoch": 0.14800759013282733, + "grad_norm": 3.7832348346710205, + "learning_rate": 1.9899156548183332e-05, + "loss": 0.5359, + "step": 312 + }, + { + "epoch": 0.1484819734345351, + "grad_norm": 3.7341370582580566, + "learning_rate": 1.9898065263155118e-05, + "loss": 0.5341, + "step": 313 + }, + { + "epoch": 0.14895635673624288, + "grad_norm": 4.346981048583984, + "learning_rate": 1.9896968135396924e-05, + "loss": 0.5166, + "step": 314 + }, + { + "epoch": 0.14943074003795068, + "grad_norm": 4.063964366912842, + "learning_rate": 1.9895865165556375e-05, + "loss": 0.5726, + "step": 315 + }, + { + "epoch": 0.14990512333965844, + "grad_norm": 3.6817450523376465, + "learning_rate": 1.989475635428454e-05, + "loss": 0.4844, + "step": 316 + }, + { + "epoch": 0.15037950664136623, + "grad_norm": 3.254460096359253, + "learning_rate": 1.9893641702235946e-05, + "loss": 0.4731, + "step": 317 + }, + { + "epoch": 0.150853889943074, + "grad_norm": 4.255951404571533, + "learning_rate": 1.9892521210068552e-05, + "loss": 0.5561, + "step": 318 + }, + { + "epoch": 0.1513282732447818, + "grad_norm": 4.263412952423096, + "learning_rate": 1.9891394878443783e-05, + "loss": 0.5309, + "step": 319 + }, + { + "epoch": 0.15180265654648956, + "grad_norm": 3.499624252319336, + "learning_rate": 1.9890262708026497e-05, + "loss": 0.5453, + "step": 320 + }, + { + "epoch": 0.15227703984819735, + "grad_norm": 4.114099502563477, + "learning_rate": 1.9889124699485e-05, + "loss": 0.55, + "step": 321 + }, + { + "epoch": 0.1527514231499051, + "grad_norm": 3.8856418132781982, + "learning_rate": 1.988798085349105e-05, + "loss": 0.4584, + "step": 322 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 3.403284788131714, + "learning_rate": 1.9886831170719852e-05, + "loss": 0.4941, + "step": 323 + }, + { + "epoch": 0.15370018975332067, + "grad_norm": 3.322474479675293, + "learning_rate": 1.9885675651850044e-05, + "loss": 0.4564, + "step": 324 + }, + { + "epoch": 0.15417457305502846, + "grad_norm": 4.788414001464844, + "learning_rate": 1.9884514297563722e-05, + "loss": 0.6303, + "step": 325 + }, + { + "epoch": 0.15464895635673626, + "grad_norm": 3.2986533641815186, + "learning_rate": 1.9883347108546424e-05, + "loss": 0.4885, + "step": 326 + }, + { + "epoch": 0.15512333965844402, + "grad_norm": 3.582542896270752, + "learning_rate": 1.9882174085487125e-05, + "loss": 0.4793, + "step": 327 + }, + { + "epoch": 0.1555977229601518, + "grad_norm": 5.127254486083984, + "learning_rate": 1.9880995229078253e-05, + "loss": 0.5257, + "step": 328 + }, + { + "epoch": 0.15607210626185958, + "grad_norm": 3.8393657207489014, + "learning_rate": 1.9879810540015674e-05, + "loss": 0.4924, + "step": 329 + }, + { + "epoch": 0.15654648956356737, + "grad_norm": 4.303122043609619, + "learning_rate": 1.9878620018998696e-05, + "loss": 0.5073, + "step": 330 + }, + { + "epoch": 0.15702087286527514, + "grad_norm": 3.3921568393707275, + "learning_rate": 1.9877423666730075e-05, + "loss": 0.4634, + "step": 331 + }, + { + "epoch": 0.15749525616698293, + "grad_norm": 4.370551109313965, + "learning_rate": 1.9876221483916006e-05, + "loss": 0.641, + "step": 332 + }, + { + "epoch": 0.1579696394686907, + "grad_norm": 7.480087757110596, + "learning_rate": 1.9875013471266124e-05, + "loss": 0.5516, + "step": 333 + }, + { + "epoch": 0.15844402277039848, + "grad_norm": 3.2443172931671143, + "learning_rate": 1.9873799629493507e-05, + "loss": 0.4536, + "step": 334 + }, + { + "epoch": 0.15891840607210625, + "grad_norm": 3.3742051124572754, + "learning_rate": 1.9872579959314675e-05, + "loss": 0.4506, + "step": 335 + }, + { + "epoch": 0.15939278937381404, + "grad_norm": 4.108852386474609, + "learning_rate": 1.987135446144959e-05, + "loss": 0.5022, + "step": 336 + }, + { + "epoch": 0.15986717267552183, + "grad_norm": 3.776676893234253, + "learning_rate": 1.9870123136621638e-05, + "loss": 0.4441, + "step": 337 + }, + { + "epoch": 0.1603415559772296, + "grad_norm": 4.488317966461182, + "learning_rate": 1.9868885985557675e-05, + "loss": 0.5622, + "step": 338 + }, + { + "epoch": 0.1608159392789374, + "grad_norm": 4.163279056549072, + "learning_rate": 1.986764300898797e-05, + "loss": 0.5567, + "step": 339 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 3.326791524887085, + "learning_rate": 1.986639420764624e-05, + "loss": 0.4917, + "step": 340 + }, + { + "epoch": 0.16176470588235295, + "grad_norm": 4.889130115509033, + "learning_rate": 1.9865139582269642e-05, + "loss": 0.5238, + "step": 341 + }, + { + "epoch": 0.16223908918406071, + "grad_norm": 3.6789140701293945, + "learning_rate": 1.986387913359877e-05, + "loss": 0.6143, + "step": 342 + }, + { + "epoch": 0.1627134724857685, + "grad_norm": 3.560218572616577, + "learning_rate": 1.9862612862377652e-05, + "loss": 0.4937, + "step": 343 + }, + { + "epoch": 0.16318785578747627, + "grad_norm": 3.0971603393554688, + "learning_rate": 1.9861340769353753e-05, + "loss": 0.4614, + "step": 344 + }, + { + "epoch": 0.16366223908918406, + "grad_norm": 3.41616153717041, + "learning_rate": 1.9860062855277982e-05, + "loss": 0.4853, + "step": 345 + }, + { + "epoch": 0.16413662239089183, + "grad_norm": 3.4327688217163086, + "learning_rate": 1.985877912090468e-05, + "loss": 0.4745, + "step": 346 + }, + { + "epoch": 0.16461100569259962, + "grad_norm": 3.024824380874634, + "learning_rate": 1.9857489566991614e-05, + "loss": 0.5228, + "step": 347 + }, + { + "epoch": 0.1650853889943074, + "grad_norm": 4.104083061218262, + "learning_rate": 1.9856194194300005e-05, + "loss": 0.5777, + "step": 348 + }, + { + "epoch": 0.16555977229601518, + "grad_norm": 4.0848798751831055, + "learning_rate": 1.9854893003594492e-05, + "loss": 0.5047, + "step": 349 + }, + { + "epoch": 0.16603415559772297, + "grad_norm": 3.6772310733795166, + "learning_rate": 1.9853585995643158e-05, + "loss": 0.5242, + "step": 350 + }, + { + "epoch": 0.16650853889943074, + "grad_norm": 3.716475009918213, + "learning_rate": 1.9852273171217518e-05, + "loss": 0.4722, + "step": 351 + }, + { + "epoch": 0.16698292220113853, + "grad_norm": 4.485456466674805, + "learning_rate": 1.9850954531092515e-05, + "loss": 0.4973, + "step": 352 + }, + { + "epoch": 0.1674573055028463, + "grad_norm": 3.9249846935272217, + "learning_rate": 1.9849630076046536e-05, + "loss": 0.506, + "step": 353 + }, + { + "epoch": 0.16793168880455409, + "grad_norm": 3.6390221118927, + "learning_rate": 1.9848299806861385e-05, + "loss": 0.4511, + "step": 354 + }, + { + "epoch": 0.16840607210626185, + "grad_norm": 4.2053937911987305, + "learning_rate": 1.984696372432231e-05, + "loss": 0.523, + "step": 355 + }, + { + "epoch": 0.16888045540796964, + "grad_norm": 3.690425157546997, + "learning_rate": 1.984562182921799e-05, + "loss": 0.5692, + "step": 356 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 3.777768611907959, + "learning_rate": 1.9844274122340534e-05, + "loss": 0.4881, + "step": 357 + }, + { + "epoch": 0.1698292220113852, + "grad_norm": 3.9081265926361084, + "learning_rate": 1.9842920604485474e-05, + "loss": 0.5767, + "step": 358 + }, + { + "epoch": 0.170303605313093, + "grad_norm": 4.124914169311523, + "learning_rate": 1.984156127645178e-05, + "loss": 0.5526, + "step": 359 + }, + { + "epoch": 0.17077798861480076, + "grad_norm": 3.433190107345581, + "learning_rate": 1.9840196139041853e-05, + "loss": 0.385, + "step": 360 + }, + { + "epoch": 0.17125237191650855, + "grad_norm": 3.6542704105377197, + "learning_rate": 1.9838825193061518e-05, + "loss": 0.4004, + "step": 361 + }, + { + "epoch": 0.17172675521821631, + "grad_norm": 4.064217567443848, + "learning_rate": 1.9837448439320027e-05, + "loss": 0.4996, + "step": 362 + }, + { + "epoch": 0.1722011385199241, + "grad_norm": 3.451735019683838, + "learning_rate": 1.9836065878630074e-05, + "loss": 0.4737, + "step": 363 + }, + { + "epoch": 0.17267552182163187, + "grad_norm": 3.747659206390381, + "learning_rate": 1.983467751180776e-05, + "loss": 0.5078, + "step": 364 + }, + { + "epoch": 0.17314990512333966, + "grad_norm": 3.898174524307251, + "learning_rate": 1.983328333967263e-05, + "loss": 0.5549, + "step": 365 + }, + { + "epoch": 0.17362428842504743, + "grad_norm": 3.5523483753204346, + "learning_rate": 1.983188336304765e-05, + "loss": 0.5202, + "step": 366 + }, + { + "epoch": 0.17409867172675522, + "grad_norm": 3.90126633644104, + "learning_rate": 1.9830477582759213e-05, + "loss": 0.5718, + "step": 367 + }, + { + "epoch": 0.174573055028463, + "grad_norm": 4.264011383056641, + "learning_rate": 1.9829065999637134e-05, + "loss": 0.561, + "step": 368 + }, + { + "epoch": 0.17504743833017078, + "grad_norm": 3.489223003387451, + "learning_rate": 1.982764861451466e-05, + "loss": 0.5515, + "step": 369 + }, + { + "epoch": 0.17552182163187854, + "grad_norm": 3.144134998321533, + "learning_rate": 1.9826225428228455e-05, + "loss": 0.3927, + "step": 370 + }, + { + "epoch": 0.17599620493358634, + "grad_norm": 3.7182862758636475, + "learning_rate": 1.9824796441618617e-05, + "loss": 0.5222, + "step": 371 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 3.7761785984039307, + "learning_rate": 1.9823361655528658e-05, + "loss": 0.5393, + "step": 372 + }, + { + "epoch": 0.1769449715370019, + "grad_norm": 3.673797369003296, + "learning_rate": 1.9821921070805522e-05, + "loss": 0.5508, + "step": 373 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 5.075558662414551, + "learning_rate": 1.9820474688299566e-05, + "loss": 0.4923, + "step": 374 + }, + { + "epoch": 0.17789373814041745, + "grad_norm": 2.8637001514434814, + "learning_rate": 1.9819022508864582e-05, + "loss": 0.4581, + "step": 375 + }, + { + "epoch": 0.17836812144212524, + "grad_norm": 3.7829291820526123, + "learning_rate": 1.9817564533357775e-05, + "loss": 0.4989, + "step": 376 + }, + { + "epoch": 0.178842504743833, + "grad_norm": 3.2799975872039795, + "learning_rate": 1.9816100762639773e-05, + "loss": 0.513, + "step": 377 + }, + { + "epoch": 0.1793168880455408, + "grad_norm": 3.540116310119629, + "learning_rate": 1.9814631197574626e-05, + "loss": 0.4851, + "step": 378 + }, + { + "epoch": 0.17979127134724857, + "grad_norm": 3.821983814239502, + "learning_rate": 1.98131558390298e-05, + "loss": 0.4854, + "step": 379 + }, + { + "epoch": 0.18026565464895636, + "grad_norm": 7.540534973144531, + "learning_rate": 1.981167468787619e-05, + "loss": 0.5211, + "step": 380 + }, + { + "epoch": 0.18074003795066412, + "grad_norm": 3.578125, + "learning_rate": 1.98101877449881e-05, + "loss": 0.5135, + "step": 381 + }, + { + "epoch": 0.18121442125237192, + "grad_norm": 3.8135201930999756, + "learning_rate": 1.980869501124326e-05, + "loss": 0.551, + "step": 382 + }, + { + "epoch": 0.1816888045540797, + "grad_norm": 3.3921446800231934, + "learning_rate": 1.9807196487522818e-05, + "loss": 0.5174, + "step": 383 + }, + { + "epoch": 0.18216318785578747, + "grad_norm": 3.473004102706909, + "learning_rate": 1.9805692174711337e-05, + "loss": 0.5091, + "step": 384 + }, + { + "epoch": 0.18263757115749527, + "grad_norm": 3.449249505996704, + "learning_rate": 1.9804182073696793e-05, + "loss": 0.5027, + "step": 385 + }, + { + "epoch": 0.18311195445920303, + "grad_norm": 3.633528470993042, + "learning_rate": 1.980266618537059e-05, + "loss": 0.5025, + "step": 386 + }, + { + "epoch": 0.18358633776091082, + "grad_norm": 3.325308084487915, + "learning_rate": 1.980114451062754e-05, + "loss": 0.4739, + "step": 387 + }, + { + "epoch": 0.1840607210626186, + "grad_norm": 3.829240083694458, + "learning_rate": 1.979961705036587e-05, + "loss": 0.4921, + "step": 388 + }, + { + "epoch": 0.18453510436432638, + "grad_norm": 3.1751868724823, + "learning_rate": 1.9798083805487228e-05, + "loss": 0.4318, + "step": 389 + }, + { + "epoch": 0.18500948766603414, + "grad_norm": 3.521556854248047, + "learning_rate": 1.979654477689667e-05, + "loss": 0.4898, + "step": 390 + }, + { + "epoch": 0.18548387096774194, + "grad_norm": 3.7403318881988525, + "learning_rate": 1.979499996550267e-05, + "loss": 0.5121, + "step": 391 + }, + { + "epoch": 0.1859582542694497, + "grad_norm": 4.491858959197998, + "learning_rate": 1.9793449372217123e-05, + "loss": 0.5375, + "step": 392 + }, + { + "epoch": 0.1864326375711575, + "grad_norm": 3.4098191261291504, + "learning_rate": 1.9791892997955317e-05, + "loss": 0.5462, + "step": 393 + }, + { + "epoch": 0.1869070208728653, + "grad_norm": 3.658325672149658, + "learning_rate": 1.9790330843635967e-05, + "loss": 0.4567, + "step": 394 + }, + { + "epoch": 0.18738140417457305, + "grad_norm": 4.077053546905518, + "learning_rate": 1.97887629101812e-05, + "loss": 0.5194, + "step": 395 + }, + { + "epoch": 0.18785578747628084, + "grad_norm": 3.680041551589966, + "learning_rate": 1.9787189198516553e-05, + "loss": 0.4515, + "step": 396 + }, + { + "epoch": 0.1883301707779886, + "grad_norm": 3.7228457927703857, + "learning_rate": 1.9785609709570973e-05, + "loss": 0.4504, + "step": 397 + }, + { + "epoch": 0.1888045540796964, + "grad_norm": 4.01034688949585, + "learning_rate": 1.9784024444276812e-05, + "loss": 0.4637, + "step": 398 + }, + { + "epoch": 0.18927893738140417, + "grad_norm": 2.7363219261169434, + "learning_rate": 1.9782433403569836e-05, + "loss": 0.4619, + "step": 399 + }, + { + "epoch": 0.18975332068311196, + "grad_norm": 3.8097171783447266, + "learning_rate": 1.9780836588389225e-05, + "loss": 0.578, + "step": 400 + }, + { + "epoch": 0.19022770398481972, + "grad_norm": 3.741736650466919, + "learning_rate": 1.9779233999677563e-05, + "loss": 0.5731, + "step": 401 + }, + { + "epoch": 0.19070208728652752, + "grad_norm": 3.690082311630249, + "learning_rate": 1.9777625638380838e-05, + "loss": 0.5538, + "step": 402 + }, + { + "epoch": 0.19117647058823528, + "grad_norm": 3.4858877658843994, + "learning_rate": 1.9776011505448455e-05, + "loss": 0.4858, + "step": 403 + }, + { + "epoch": 0.19165085388994307, + "grad_norm": 3.8378117084503174, + "learning_rate": 1.977439160183322e-05, + "loss": 0.5408, + "step": 404 + }, + { + "epoch": 0.19212523719165087, + "grad_norm": 3.4993057250976562, + "learning_rate": 1.977276592849134e-05, + "loss": 0.4522, + "step": 405 + }, + { + "epoch": 0.19259962049335863, + "grad_norm": 3.624081611633301, + "learning_rate": 1.9771134486382436e-05, + "loss": 0.4739, + "step": 406 + }, + { + "epoch": 0.19307400379506642, + "grad_norm": 3.2720398902893066, + "learning_rate": 1.9769497276469538e-05, + "loss": 0.5003, + "step": 407 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 3.429849624633789, + "learning_rate": 1.9767854299719073e-05, + "loss": 0.5161, + "step": 408 + }, + { + "epoch": 0.19402277039848198, + "grad_norm": 3.6602742671966553, + "learning_rate": 1.976620555710087e-05, + "loss": 0.5546, + "step": 409 + }, + { + "epoch": 0.19449715370018975, + "grad_norm": 4.340588092803955, + "learning_rate": 1.9764551049588165e-05, + "loss": 0.5262, + "step": 410 + }, + { + "epoch": 0.19497153700189754, + "grad_norm": 3.39455509185791, + "learning_rate": 1.97628907781576e-05, + "loss": 0.4408, + "step": 411 + }, + { + "epoch": 0.1954459203036053, + "grad_norm": 3.521057605743408, + "learning_rate": 1.976122474378922e-05, + "loss": 0.4413, + "step": 412 + }, + { + "epoch": 0.1959203036053131, + "grad_norm": 4.244317531585693, + "learning_rate": 1.9759552947466462e-05, + "loss": 0.5525, + "step": 413 + }, + { + "epoch": 0.19639468690702086, + "grad_norm": 3.8756752014160156, + "learning_rate": 1.9757875390176177e-05, + "loss": 0.5267, + "step": 414 + }, + { + "epoch": 0.19686907020872865, + "grad_norm": 3.6845438480377197, + "learning_rate": 1.9756192072908605e-05, + "loss": 0.5128, + "step": 415 + }, + { + "epoch": 0.19734345351043645, + "grad_norm": 4.302131652832031, + "learning_rate": 1.9754502996657395e-05, + "loss": 0.5089, + "step": 416 + }, + { + "epoch": 0.1978178368121442, + "grad_norm": 3.6274499893188477, + "learning_rate": 1.975280816241959e-05, + "loss": 0.4639, + "step": 417 + }, + { + "epoch": 0.198292220113852, + "grad_norm": 3.127408742904663, + "learning_rate": 1.975110757119564e-05, + "loss": 0.4801, + "step": 418 + }, + { + "epoch": 0.19876660341555977, + "grad_norm": 9.724570274353027, + "learning_rate": 1.9749401223989376e-05, + "loss": 0.6008, + "step": 419 + }, + { + "epoch": 0.19924098671726756, + "grad_norm": 3.2324442863464355, + "learning_rate": 1.9747689121808045e-05, + "loss": 0.4913, + "step": 420 + }, + { + "epoch": 0.19971537001897532, + "grad_norm": 2.958690881729126, + "learning_rate": 1.9745971265662286e-05, + "loss": 0.3945, + "step": 421 + }, + { + "epoch": 0.20018975332068312, + "grad_norm": 3.4928455352783203, + "learning_rate": 1.9744247656566125e-05, + "loss": 0.4863, + "step": 422 + }, + { + "epoch": 0.20066413662239088, + "grad_norm": 3.0363330841064453, + "learning_rate": 1.9742518295536997e-05, + "loss": 0.4847, + "step": 423 + }, + { + "epoch": 0.20113851992409867, + "grad_norm": 3.126666307449341, + "learning_rate": 1.9740783183595726e-05, + "loss": 0.4281, + "step": 424 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 3.345747232437134, + "learning_rate": 1.9739042321766533e-05, + "loss": 0.4699, + "step": 425 + }, + { + "epoch": 0.20208728652751423, + "grad_norm": 3.1891050338745117, + "learning_rate": 1.973729571107703e-05, + "loss": 0.4362, + "step": 426 + }, + { + "epoch": 0.20256166982922202, + "grad_norm": 3.56644606590271, + "learning_rate": 1.973554335255822e-05, + "loss": 0.4374, + "step": 427 + }, + { + "epoch": 0.2030360531309298, + "grad_norm": 3.70436692237854, + "learning_rate": 1.9733785247244506e-05, + "loss": 0.4582, + "step": 428 + }, + { + "epoch": 0.20351043643263758, + "grad_norm": 3.7359845638275146, + "learning_rate": 1.9732021396173682e-05, + "loss": 0.5247, + "step": 429 + }, + { + "epoch": 0.20398481973434535, + "grad_norm": 3.910747528076172, + "learning_rate": 1.973025180038693e-05, + "loss": 0.5534, + "step": 430 + }, + { + "epoch": 0.20445920303605314, + "grad_norm": 3.243649482727051, + "learning_rate": 1.9728476460928828e-05, + "loss": 0.53, + "step": 431 + }, + { + "epoch": 0.2049335863377609, + "grad_norm": 3.5252304077148438, + "learning_rate": 1.9726695378847332e-05, + "loss": 0.5136, + "step": 432 + }, + { + "epoch": 0.2054079696394687, + "grad_norm": 3.3857786655426025, + "learning_rate": 1.972490855519381e-05, + "loss": 0.4836, + "step": 433 + }, + { + "epoch": 0.20588235294117646, + "grad_norm": 3.3901121616363525, + "learning_rate": 1.9723115991022997e-05, + "loss": 0.4585, + "step": 434 + }, + { + "epoch": 0.20635673624288425, + "grad_norm": 3.816561460494995, + "learning_rate": 1.972131768739303e-05, + "loss": 0.5757, + "step": 435 + }, + { + "epoch": 0.20683111954459202, + "grad_norm": 3.4764530658721924, + "learning_rate": 1.9719513645365426e-05, + "loss": 0.4892, + "step": 436 + }, + { + "epoch": 0.2073055028462998, + "grad_norm": 3.6275835037231445, + "learning_rate": 1.97177038660051e-05, + "loss": 0.4744, + "step": 437 + }, + { + "epoch": 0.2077798861480076, + "grad_norm": 3.73422908782959, + "learning_rate": 1.971588835038034e-05, + "loss": 0.6116, + "step": 438 + }, + { + "epoch": 0.20825426944971537, + "grad_norm": 3.751572370529175, + "learning_rate": 1.971406709956283e-05, + "loss": 0.534, + "step": 439 + }, + { + "epoch": 0.20872865275142316, + "grad_norm": 2.7028439044952393, + "learning_rate": 1.9712240114627637e-05, + "loss": 0.4065, + "step": 440 + }, + { + "epoch": 0.20920303605313093, + "grad_norm": 2.995476007461548, + "learning_rate": 1.971040739665321e-05, + "loss": 0.4091, + "step": 441 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 2.9523215293884277, + "learning_rate": 1.970856894672139e-05, + "loss": 0.465, + "step": 442 + }, + { + "epoch": 0.21015180265654648, + "grad_norm": 3.5105068683624268, + "learning_rate": 1.9706724765917384e-05, + "loss": 0.459, + "step": 443 + }, + { + "epoch": 0.21062618595825428, + "grad_norm": 2.8509361743927, + "learning_rate": 1.97048748553298e-05, + "loss": 0.4519, + "step": 444 + }, + { + "epoch": 0.21110056925996204, + "grad_norm": 3.4983131885528564, + "learning_rate": 1.9703019216050627e-05, + "loss": 0.5233, + "step": 445 + }, + { + "epoch": 0.21157495256166983, + "grad_norm": 3.1177706718444824, + "learning_rate": 1.970115784917523e-05, + "loss": 0.4847, + "step": 446 + }, + { + "epoch": 0.2120493358633776, + "grad_norm": 3.1251273155212402, + "learning_rate": 1.9699290755802344e-05, + "loss": 0.4727, + "step": 447 + }, + { + "epoch": 0.2125237191650854, + "grad_norm": 2.4966578483581543, + "learning_rate": 1.9697417937034106e-05, + "loss": 0.4165, + "step": 448 + }, + { + "epoch": 0.21299810246679318, + "grad_norm": 3.8288350105285645, + "learning_rate": 1.969553939397602e-05, + "loss": 0.553, + "step": 449 + }, + { + "epoch": 0.21347248576850095, + "grad_norm": 3.0275819301605225, + "learning_rate": 1.9693655127736974e-05, + "loss": 0.3849, + "step": 450 + }, + { + "epoch": 0.21394686907020874, + "grad_norm": 3.2554712295532227, + "learning_rate": 1.9691765139429227e-05, + "loss": 0.5852, + "step": 451 + }, + { + "epoch": 0.2144212523719165, + "grad_norm": 3.404175043106079, + "learning_rate": 1.968986943016842e-05, + "loss": 0.5296, + "step": 452 + }, + { + "epoch": 0.2148956356736243, + "grad_norm": 3.853860378265381, + "learning_rate": 1.9687968001073575e-05, + "loss": 0.6205, + "step": 453 + }, + { + "epoch": 0.21537001897533206, + "grad_norm": 3.2675604820251465, + "learning_rate": 1.9686060853267088e-05, + "loss": 0.5094, + "step": 454 + }, + { + "epoch": 0.21584440227703985, + "grad_norm": 4.281660079956055, + "learning_rate": 1.9684147987874725e-05, + "loss": 0.514, + "step": 455 + }, + { + "epoch": 0.21631878557874762, + "grad_norm": 3.5292105674743652, + "learning_rate": 1.9682229406025635e-05, + "loss": 0.4982, + "step": 456 + }, + { + "epoch": 0.2167931688804554, + "grad_norm": 3.585477352142334, + "learning_rate": 1.9680305108852335e-05, + "loss": 0.4612, + "step": 457 + }, + { + "epoch": 0.21726755218216318, + "grad_norm": 3.200887680053711, + "learning_rate": 1.9678375097490717e-05, + "loss": 0.5057, + "step": 458 + }, + { + "epoch": 0.21774193548387097, + "grad_norm": 3.3191189765930176, + "learning_rate": 1.9676439373080056e-05, + "loss": 0.4977, + "step": 459 + }, + { + "epoch": 0.21821631878557876, + "grad_norm": 3.9757373332977295, + "learning_rate": 1.9674497936762984e-05, + "loss": 0.5366, + "step": 460 + }, + { + "epoch": 0.21869070208728653, + "grad_norm": 3.3760838508605957, + "learning_rate": 1.9672550789685512e-05, + "loss": 0.4957, + "step": 461 + }, + { + "epoch": 0.21916508538899432, + "grad_norm": 3.371440887451172, + "learning_rate": 1.9670597932997023e-05, + "loss": 0.5172, + "step": 462 + }, + { + "epoch": 0.21963946869070208, + "grad_norm": 5.2982001304626465, + "learning_rate": 1.966863936785027e-05, + "loss": 0.5267, + "step": 463 + }, + { + "epoch": 0.22011385199240988, + "grad_norm": 3.9646644592285156, + "learning_rate": 1.966667509540137e-05, + "loss": 0.4803, + "step": 464 + }, + { + "epoch": 0.22058823529411764, + "grad_norm": 3.059048891067505, + "learning_rate": 1.9664705116809823e-05, + "loss": 0.4096, + "step": 465 + }, + { + "epoch": 0.22106261859582543, + "grad_norm": 3.714001178741455, + "learning_rate": 1.9662729433238477e-05, + "loss": 0.6027, + "step": 466 + }, + { + "epoch": 0.2215370018975332, + "grad_norm": 3.6565237045288086, + "learning_rate": 1.9660748045853567e-05, + "loss": 0.4968, + "step": 467 + }, + { + "epoch": 0.222011385199241, + "grad_norm": 3.092435836791992, + "learning_rate": 1.965876095582468e-05, + "loss": 0.4037, + "step": 468 + }, + { + "epoch": 0.22248576850094876, + "grad_norm": 3.3902924060821533, + "learning_rate": 1.965676816432478e-05, + "loss": 0.5345, + "step": 469 + }, + { + "epoch": 0.22296015180265655, + "grad_norm": 3.4829235076904297, + "learning_rate": 1.9654769672530186e-05, + "loss": 0.504, + "step": 470 + }, + { + "epoch": 0.22343453510436434, + "grad_norm": 3.3810994625091553, + "learning_rate": 1.9652765481620596e-05, + "loss": 0.4718, + "step": 471 + }, + { + "epoch": 0.2239089184060721, + "grad_norm": 4.006764888763428, + "learning_rate": 1.965075559277906e-05, + "loss": 0.5748, + "step": 472 + }, + { + "epoch": 0.2243833017077799, + "grad_norm": 3.318103313446045, + "learning_rate": 1.9648740007191994e-05, + "loss": 0.4585, + "step": 473 + }, + { + "epoch": 0.22485768500948766, + "grad_norm": 3.561944007873535, + "learning_rate": 1.9646718726049187e-05, + "loss": 0.5574, + "step": 474 + }, + { + "epoch": 0.22533206831119545, + "grad_norm": 3.0337014198303223, + "learning_rate": 1.964469175054377e-05, + "loss": 0.4363, + "step": 475 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 2.986750364303589, + "learning_rate": 1.964265908187225e-05, + "loss": 0.4626, + "step": 476 + }, + { + "epoch": 0.226280834914611, + "grad_norm": 3.6310508251190186, + "learning_rate": 1.9640620721234488e-05, + "loss": 0.4709, + "step": 477 + }, + { + "epoch": 0.22675521821631878, + "grad_norm": 3.44527530670166, + "learning_rate": 1.963857666983372e-05, + "loss": 0.4663, + "step": 478 + }, + { + "epoch": 0.22722960151802657, + "grad_norm": 4.240223407745361, + "learning_rate": 1.963652692887652e-05, + "loss": 0.6681, + "step": 479 + }, + { + "epoch": 0.22770398481973433, + "grad_norm": 3.381404161453247, + "learning_rate": 1.9634471499572826e-05, + "loss": 0.4429, + "step": 480 + }, + { + "epoch": 0.22817836812144213, + "grad_norm": 3.138699769973755, + "learning_rate": 1.9632410383135946e-05, + "loss": 0.4803, + "step": 481 + }, + { + "epoch": 0.2286527514231499, + "grad_norm": 3.806920051574707, + "learning_rate": 1.9630343580782538e-05, + "loss": 0.4943, + "step": 482 + }, + { + "epoch": 0.22912713472485768, + "grad_norm": 3.408698320388794, + "learning_rate": 1.9628271093732605e-05, + "loss": 0.5208, + "step": 483 + }, + { + "epoch": 0.22960151802656548, + "grad_norm": 2.962362051010132, + "learning_rate": 1.9626192923209524e-05, + "loss": 0.4449, + "step": 484 + }, + { + "epoch": 0.23007590132827324, + "grad_norm": 3.10517954826355, + "learning_rate": 1.9624109070440017e-05, + "loss": 0.4865, + "step": 485 + }, + { + "epoch": 0.23055028462998103, + "grad_norm": 3.3021395206451416, + "learning_rate": 1.9622019536654154e-05, + "loss": 0.5415, + "step": 486 + }, + { + "epoch": 0.2310246679316888, + "grad_norm": 3.686612606048584, + "learning_rate": 1.961992432308538e-05, + "loss": 0.4875, + "step": 487 + }, + { + "epoch": 0.2314990512333966, + "grad_norm": 3.50701904296875, + "learning_rate": 1.961782343097047e-05, + "loss": 0.459, + "step": 488 + }, + { + "epoch": 0.23197343453510436, + "grad_norm": 3.291677713394165, + "learning_rate": 1.9615716861549557e-05, + "loss": 0.5038, + "step": 489 + }, + { + "epoch": 0.23244781783681215, + "grad_norm": 3.3820300102233887, + "learning_rate": 1.9613604616066137e-05, + "loss": 0.4445, + "step": 490 + }, + { + "epoch": 0.2329222011385199, + "grad_norm": 3.5324206352233887, + "learning_rate": 1.9611486695767037e-05, + "loss": 0.5252, + "step": 491 + }, + { + "epoch": 0.2333965844402277, + "grad_norm": 3.8190603256225586, + "learning_rate": 1.9609363101902456e-05, + "loss": 0.4666, + "step": 492 + }, + { + "epoch": 0.23387096774193547, + "grad_norm": 4.703067779541016, + "learning_rate": 1.960723383572592e-05, + "loss": 0.4529, + "step": 493 + }, + { + "epoch": 0.23434535104364326, + "grad_norm": 3.1753973960876465, + "learning_rate": 1.960509889849432e-05, + "loss": 0.4731, + "step": 494 + }, + { + "epoch": 0.23481973434535106, + "grad_norm": 3.4943294525146484, + "learning_rate": 1.960295829146789e-05, + "loss": 0.5578, + "step": 495 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 3.4101996421813965, + "learning_rate": 1.9600812015910203e-05, + "loss": 0.5255, + "step": 496 + }, + { + "epoch": 0.2357685009487666, + "grad_norm": 3.5300912857055664, + "learning_rate": 1.9598660073088186e-05, + "loss": 0.5781, + "step": 497 + }, + { + "epoch": 0.23624288425047438, + "grad_norm": 3.204566478729248, + "learning_rate": 1.959650246427211e-05, + "loss": 0.5813, + "step": 498 + }, + { + "epoch": 0.23671726755218217, + "grad_norm": 3.192633628845215, + "learning_rate": 1.9594339190735594e-05, + "loss": 0.4062, + "step": 499 + }, + { + "epoch": 0.23719165085388993, + "grad_norm": 3.062784194946289, + "learning_rate": 1.9592170253755594e-05, + "loss": 0.4764, + "step": 500 + }, + { + "epoch": 0.23766603415559773, + "grad_norm": 2.8469433784484863, + "learning_rate": 1.9589995654612412e-05, + "loss": 0.4091, + "step": 501 + }, + { + "epoch": 0.2381404174573055, + "grad_norm": 3.5228500366210938, + "learning_rate": 1.958781539458969e-05, + "loss": 0.51, + "step": 502 + }, + { + "epoch": 0.23861480075901328, + "grad_norm": 3.872974395751953, + "learning_rate": 1.9585629474974413e-05, + "loss": 0.5438, + "step": 503 + }, + { + "epoch": 0.23908918406072105, + "grad_norm": 3.1505889892578125, + "learning_rate": 1.9583437897056915e-05, + "loss": 0.4181, + "step": 504 + }, + { + "epoch": 0.23956356736242884, + "grad_norm": 3.113068103790283, + "learning_rate": 1.958124066213086e-05, + "loss": 0.4512, + "step": 505 + }, + { + "epoch": 0.24003795066413663, + "grad_norm": 3.0535147190093994, + "learning_rate": 1.957903777149325e-05, + "loss": 0.3913, + "step": 506 + }, + { + "epoch": 0.2405123339658444, + "grad_norm": 3.152808427810669, + "learning_rate": 1.957682922644443e-05, + "loss": 0.4813, + "step": 507 + }, + { + "epoch": 0.2409867172675522, + "grad_norm": 3.2799031734466553, + "learning_rate": 1.957461502828809e-05, + "loss": 0.4559, + "step": 508 + }, + { + "epoch": 0.24146110056925996, + "grad_norm": 3.305764675140381, + "learning_rate": 1.957239517833124e-05, + "loss": 0.4553, + "step": 509 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 2.7665562629699707, + "learning_rate": 1.957016967788424e-05, + "loss": 0.4145, + "step": 510 + }, + { + "epoch": 0.2424098671726755, + "grad_norm": 3.1011545658111572, + "learning_rate": 1.9567938528260778e-05, + "loss": 0.5109, + "step": 511 + }, + { + "epoch": 0.2428842504743833, + "grad_norm": 3.0363271236419678, + "learning_rate": 1.9565701730777883e-05, + "loss": 0.4649, + "step": 512 + }, + { + "epoch": 0.24335863377609107, + "grad_norm": 3.1920406818389893, + "learning_rate": 1.9563459286755914e-05, + "loss": 0.5797, + "step": 513 + }, + { + "epoch": 0.24383301707779886, + "grad_norm": 3.131014347076416, + "learning_rate": 1.9561211197518564e-05, + "loss": 0.4634, + "step": 514 + }, + { + "epoch": 0.24430740037950663, + "grad_norm": 3.515669107437134, + "learning_rate": 1.9558957464392853e-05, + "loss": 0.5281, + "step": 515 + }, + { + "epoch": 0.24478178368121442, + "grad_norm": 3.187274932861328, + "learning_rate": 1.9556698088709144e-05, + "loss": 0.443, + "step": 516 + }, + { + "epoch": 0.2452561669829222, + "grad_norm": 3.458089828491211, + "learning_rate": 1.9554433071801117e-05, + "loss": 0.4375, + "step": 517 + }, + { + "epoch": 0.24573055028462998, + "grad_norm": 3.4186503887176514, + "learning_rate": 1.9552162415005797e-05, + "loss": 0.5475, + "step": 518 + }, + { + "epoch": 0.24620493358633777, + "grad_norm": 3.319234848022461, + "learning_rate": 1.954988611966352e-05, + "loss": 0.4545, + "step": 519 + }, + { + "epoch": 0.24667931688804554, + "grad_norm": 3.491206169128418, + "learning_rate": 1.9547604187117974e-05, + "loss": 0.4993, + "step": 520 + }, + { + "epoch": 0.24715370018975333, + "grad_norm": 3.1041345596313477, + "learning_rate": 1.954531661871615e-05, + "loss": 0.4866, + "step": 521 + }, + { + "epoch": 0.2476280834914611, + "grad_norm": 4.381217956542969, + "learning_rate": 1.954302341580838e-05, + "loss": 0.5897, + "step": 522 + }, + { + "epoch": 0.24810246679316889, + "grad_norm": 3.8608601093292236, + "learning_rate": 1.9540724579748323e-05, + "loss": 0.4814, + "step": 523 + }, + { + "epoch": 0.24857685009487665, + "grad_norm": 3.7535743713378906, + "learning_rate": 1.953842011189295e-05, + "loss": 0.5214, + "step": 524 + }, + { + "epoch": 0.24905123339658444, + "grad_norm": 3.2677531242370605, + "learning_rate": 1.9536110013602578e-05, + "loss": 0.3946, + "step": 525 + }, + { + "epoch": 0.2495256166982922, + "grad_norm": 3.734286069869995, + "learning_rate": 1.9533794286240828e-05, + "loss": 0.4848, + "step": 526 + }, + { + "epoch": 0.25, + "grad_norm": 3.121314764022827, + "learning_rate": 1.953147293117465e-05, + "loss": 0.4554, + "step": 527 + }, + { + "epoch": 0.2504743833017078, + "grad_norm": 4.03399658203125, + "learning_rate": 1.9529145949774322e-05, + "loss": 0.5657, + "step": 528 + }, + { + "epoch": 0.2509487666034156, + "grad_norm": 3.444110631942749, + "learning_rate": 1.952681334341343e-05, + "loss": 0.4767, + "step": 529 + }, + { + "epoch": 0.2514231499051233, + "grad_norm": 3.272854804992676, + "learning_rate": 1.9524475113468897e-05, + "loss": 0.5218, + "step": 530 + }, + { + "epoch": 0.2518975332068311, + "grad_norm": 3.146151542663574, + "learning_rate": 1.9522131261320952e-05, + "loss": 0.47, + "step": 531 + }, + { + "epoch": 0.2523719165085389, + "grad_norm": 3.1734254360198975, + "learning_rate": 1.9519781788353148e-05, + "loss": 0.4846, + "step": 532 + }, + { + "epoch": 0.2528462998102467, + "grad_norm": 4.805948257446289, + "learning_rate": 1.9517426695952358e-05, + "loss": 0.4689, + "step": 533 + }, + { + "epoch": 0.25332068311195444, + "grad_norm": 3.5787010192871094, + "learning_rate": 1.9515065985508766e-05, + "loss": 0.4505, + "step": 534 + }, + { + "epoch": 0.25379506641366223, + "grad_norm": 3.29832124710083, + "learning_rate": 1.9512699658415882e-05, + "loss": 0.4994, + "step": 535 + }, + { + "epoch": 0.25426944971537, + "grad_norm": 3.159043312072754, + "learning_rate": 1.951032771607052e-05, + "loss": 0.5383, + "step": 536 + }, + { + "epoch": 0.2547438330170778, + "grad_norm": 3.565232515335083, + "learning_rate": 1.9507950159872814e-05, + "loss": 0.4824, + "step": 537 + }, + { + "epoch": 0.25521821631878555, + "grad_norm": 3.3344407081604004, + "learning_rate": 1.9505566991226214e-05, + "loss": 0.4797, + "step": 538 + }, + { + "epoch": 0.25569259962049334, + "grad_norm": 3.4726908206939697, + "learning_rate": 1.9503178211537483e-05, + "loss": 0.5105, + "step": 539 + }, + { + "epoch": 0.25616698292220114, + "grad_norm": 3.1351394653320312, + "learning_rate": 1.9500783822216693e-05, + "loss": 0.4824, + "step": 540 + }, + { + "epoch": 0.25664136622390893, + "grad_norm": 3.049128532409668, + "learning_rate": 1.9498383824677223e-05, + "loss": 0.518, + "step": 541 + }, + { + "epoch": 0.2571157495256167, + "grad_norm": 3.9116055965423584, + "learning_rate": 1.9495978220335774e-05, + "loss": 0.4997, + "step": 542 + }, + { + "epoch": 0.25759013282732446, + "grad_norm": 3.9851927757263184, + "learning_rate": 1.949356701061235e-05, + "loss": 0.4598, + "step": 543 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 3.269645929336548, + "learning_rate": 1.9491150196930258e-05, + "loss": 0.4539, + "step": 544 + }, + { + "epoch": 0.25853889943074004, + "grad_norm": 3.270271062850952, + "learning_rate": 1.9488727780716125e-05, + "loss": 0.4371, + "step": 545 + }, + { + "epoch": 0.25901328273244784, + "grad_norm": 3.121974468231201, + "learning_rate": 1.948629976339988e-05, + "loss": 0.4952, + "step": 546 + }, + { + "epoch": 0.2594876660341556, + "grad_norm": 2.9817652702331543, + "learning_rate": 1.9483866146414756e-05, + "loss": 0.4515, + "step": 547 + }, + { + "epoch": 0.25996204933586337, + "grad_norm": 3.0989949703216553, + "learning_rate": 1.9481426931197293e-05, + "loss": 0.4139, + "step": 548 + }, + { + "epoch": 0.26043643263757116, + "grad_norm": 4.340046405792236, + "learning_rate": 1.9478982119187333e-05, + "loss": 0.6325, + "step": 549 + }, + { + "epoch": 0.26091081593927895, + "grad_norm": 3.455524444580078, + "learning_rate": 1.9476531711828027e-05, + "loss": 0.5801, + "step": 550 + }, + { + "epoch": 0.26138519924098674, + "grad_norm": 3.185102939605713, + "learning_rate": 1.9474075710565825e-05, + "loss": 0.5409, + "step": 551 + }, + { + "epoch": 0.2618595825426945, + "grad_norm": 3.527089834213257, + "learning_rate": 1.9471614116850482e-05, + "loss": 0.44, + "step": 552 + }, + { + "epoch": 0.2623339658444023, + "grad_norm": 3.273226499557495, + "learning_rate": 1.946914693213505e-05, + "loss": 0.4909, + "step": 553 + }, + { + "epoch": 0.26280834914611007, + "grad_norm": 3.4487481117248535, + "learning_rate": 1.946667415787589e-05, + "loss": 0.4712, + "step": 554 + }, + { + "epoch": 0.26328273244781786, + "grad_norm": 3.9401209354400635, + "learning_rate": 1.9464195795532648e-05, + "loss": 0.4435, + "step": 555 + }, + { + "epoch": 0.2637571157495256, + "grad_norm": 3.349949836730957, + "learning_rate": 1.946171184656828e-05, + "loss": 0.4909, + "step": 556 + }, + { + "epoch": 0.2642314990512334, + "grad_norm": 4.190546989440918, + "learning_rate": 1.9459222312449036e-05, + "loss": 0.5163, + "step": 557 + }, + { + "epoch": 0.2647058823529412, + "grad_norm": 3.7838876247406006, + "learning_rate": 1.9456727194644465e-05, + "loss": 0.5356, + "step": 558 + }, + { + "epoch": 0.265180265654649, + "grad_norm": 3.253169536590576, + "learning_rate": 1.945422649462741e-05, + "loss": 0.5241, + "step": 559 + }, + { + "epoch": 0.2656546489563567, + "grad_norm": 3.7193899154663086, + "learning_rate": 1.9451720213874007e-05, + "loss": 0.4999, + "step": 560 + }, + { + "epoch": 0.2661290322580645, + "grad_norm": 3.82112717628479, + "learning_rate": 1.9449208353863693e-05, + "loss": 0.4958, + "step": 561 + }, + { + "epoch": 0.2666034155597723, + "grad_norm": 3.6052136421203613, + "learning_rate": 1.944669091607919e-05, + "loss": 0.5644, + "step": 562 + }, + { + "epoch": 0.2670777988614801, + "grad_norm": 3.2630624771118164, + "learning_rate": 1.9444167902006516e-05, + "loss": 0.4038, + "step": 563 + }, + { + "epoch": 0.2675521821631879, + "grad_norm": 3.3602817058563232, + "learning_rate": 1.944163931313499e-05, + "loss": 0.4928, + "step": 564 + }, + { + "epoch": 0.2680265654648956, + "grad_norm": 2.9780235290527344, + "learning_rate": 1.94391051509572e-05, + "loss": 0.4451, + "step": 565 + }, + { + "epoch": 0.2685009487666034, + "grad_norm": 3.799266815185547, + "learning_rate": 1.9436565416969045e-05, + "loss": 0.5159, + "step": 566 + }, + { + "epoch": 0.2689753320683112, + "grad_norm": 3.299086093902588, + "learning_rate": 1.9434020112669706e-05, + "loss": 0.4683, + "step": 567 + }, + { + "epoch": 0.269449715370019, + "grad_norm": 3.3280222415924072, + "learning_rate": 1.9431469239561646e-05, + "loss": 0.4656, + "step": 568 + }, + { + "epoch": 0.26992409867172673, + "grad_norm": 3.7164220809936523, + "learning_rate": 1.9428912799150624e-05, + "loss": 0.4551, + "step": 569 + }, + { + "epoch": 0.2703984819734345, + "grad_norm": 3.125343084335327, + "learning_rate": 1.9426350792945676e-05, + "loss": 0.4851, + "step": 570 + }, + { + "epoch": 0.2708728652751423, + "grad_norm": 2.9615533351898193, + "learning_rate": 1.9423783222459135e-05, + "loss": 0.4318, + "step": 571 + }, + { + "epoch": 0.2713472485768501, + "grad_norm": 2.992379903793335, + "learning_rate": 1.942121008920661e-05, + "loss": 0.4402, + "step": 572 + }, + { + "epoch": 0.2718216318785579, + "grad_norm": 4.136736869812012, + "learning_rate": 1.9418631394706998e-05, + "loss": 0.5978, + "step": 573 + }, + { + "epoch": 0.27229601518026564, + "grad_norm": 3.9404451847076416, + "learning_rate": 1.941604714048247e-05, + "loss": 0.5606, + "step": 574 + }, + { + "epoch": 0.27277039848197343, + "grad_norm": 3.232971668243408, + "learning_rate": 1.9413457328058495e-05, + "loss": 0.5066, + "step": 575 + }, + { + "epoch": 0.2732447817836812, + "grad_norm": 3.407205820083618, + "learning_rate": 1.941086195896381e-05, + "loss": 0.5584, + "step": 576 + }, + { + "epoch": 0.273719165085389, + "grad_norm": 3.10275936126709, + "learning_rate": 1.940826103473043e-05, + "loss": 0.4627, + "step": 577 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 3.237243413925171, + "learning_rate": 1.9405654556893667e-05, + "loss": 0.4897, + "step": 578 + }, + { + "epoch": 0.27466793168880455, + "grad_norm": 3.3360302448272705, + "learning_rate": 1.9403042526992087e-05, + "loss": 0.5143, + "step": 579 + }, + { + "epoch": 0.27514231499051234, + "grad_norm": 3.802823305130005, + "learning_rate": 1.9400424946567552e-05, + "loss": 0.492, + "step": 580 + }, + { + "epoch": 0.27561669829222013, + "grad_norm": 2.7487895488739014, + "learning_rate": 1.9397801817165192e-05, + "loss": 0.4175, + "step": 581 + }, + { + "epoch": 0.27609108159392787, + "grad_norm": 3.0970826148986816, + "learning_rate": 1.9395173140333413e-05, + "loss": 0.4156, + "step": 582 + }, + { + "epoch": 0.27656546489563566, + "grad_norm": 4.566722393035889, + "learning_rate": 1.93925389176239e-05, + "loss": 0.5653, + "step": 583 + }, + { + "epoch": 0.27703984819734345, + "grad_norm": 2.8217339515686035, + "learning_rate": 1.9389899150591605e-05, + "loss": 0.4451, + "step": 584 + }, + { + "epoch": 0.27751423149905124, + "grad_norm": 3.9668426513671875, + "learning_rate": 1.938725384079476e-05, + "loss": 0.5351, + "step": 585 + }, + { + "epoch": 0.27798861480075904, + "grad_norm": 2.601659059524536, + "learning_rate": 1.9384602989794868e-05, + "loss": 0.3897, + "step": 586 + }, + { + "epoch": 0.2784629981024668, + "grad_norm": 4.612437725067139, + "learning_rate": 1.938194659915669e-05, + "loss": 0.436, + "step": 587 + }, + { + "epoch": 0.27893738140417457, + "grad_norm": 3.368626356124878, + "learning_rate": 1.9379284670448274e-05, + "loss": 0.3943, + "step": 588 + }, + { + "epoch": 0.27941176470588236, + "grad_norm": 3.076282262802124, + "learning_rate": 1.937661720524093e-05, + "loss": 0.4004, + "step": 589 + }, + { + "epoch": 0.27988614800759015, + "grad_norm": 2.78485107421875, + "learning_rate": 1.9373944205109236e-05, + "loss": 0.35, + "step": 590 + }, + { + "epoch": 0.2803605313092979, + "grad_norm": 3.2779622077941895, + "learning_rate": 1.9371265671631038e-05, + "loss": 0.4068, + "step": 591 + }, + { + "epoch": 0.2808349146110057, + "grad_norm": 2.9968180656433105, + "learning_rate": 1.9368581606387442e-05, + "loss": 0.4667, + "step": 592 + }, + { + "epoch": 0.2813092979127135, + "grad_norm": 3.2578439712524414, + "learning_rate": 1.9365892010962834e-05, + "loss": 0.4826, + "step": 593 + }, + { + "epoch": 0.28178368121442127, + "grad_norm": 3.467496395111084, + "learning_rate": 1.936319688694485e-05, + "loss": 0.503, + "step": 594 + }, + { + "epoch": 0.28225806451612906, + "grad_norm": 2.929417848587036, + "learning_rate": 1.9360496235924396e-05, + "loss": 0.4527, + "step": 595 + }, + { + "epoch": 0.2827324478178368, + "grad_norm": 3.066240072250366, + "learning_rate": 1.935779005949564e-05, + "loss": 0.4604, + "step": 596 + }, + { + "epoch": 0.2832068311195446, + "grad_norm": 2.88321852684021, + "learning_rate": 1.935507835925601e-05, + "loss": 0.4544, + "step": 597 + }, + { + "epoch": 0.2836812144212524, + "grad_norm": 2.9748549461364746, + "learning_rate": 1.93523611368062e-05, + "loss": 0.4272, + "step": 598 + }, + { + "epoch": 0.2841555977229602, + "grad_norm": 2.744473695755005, + "learning_rate": 1.9349638393750156e-05, + "loss": 0.4874, + "step": 599 + }, + { + "epoch": 0.2846299810246679, + "grad_norm": 3.6597301959991455, + "learning_rate": 1.9346910131695084e-05, + "loss": 0.4996, + "step": 600 + }, + { + "epoch": 0.2851043643263757, + "grad_norm": 2.5389652252197266, + "learning_rate": 1.9344176352251456e-05, + "loss": 0.3724, + "step": 601 + }, + { + "epoch": 0.2855787476280835, + "grad_norm": 3.2770135402679443, + "learning_rate": 1.934143705703299e-05, + "loss": 0.4549, + "step": 602 + }, + { + "epoch": 0.2860531309297913, + "grad_norm": 2.7468864917755127, + "learning_rate": 1.933869224765667e-05, + "loss": 0.4371, + "step": 603 + }, + { + "epoch": 0.286527514231499, + "grad_norm": 2.662562370300293, + "learning_rate": 1.933594192574272e-05, + "loss": 0.4146, + "step": 604 + }, + { + "epoch": 0.2870018975332068, + "grad_norm": 2.997250556945801, + "learning_rate": 1.933318609291464e-05, + "loss": 0.485, + "step": 605 + }, + { + "epoch": 0.2874762808349146, + "grad_norm": 2.8720083236694336, + "learning_rate": 1.9330424750799165e-05, + "loss": 0.4197, + "step": 606 + }, + { + "epoch": 0.2879506641366224, + "grad_norm": 3.827880620956421, + "learning_rate": 1.9327657901026284e-05, + "loss": 0.4941, + "step": 607 + }, + { + "epoch": 0.2884250474383302, + "grad_norm": 3.460559368133545, + "learning_rate": 1.9324885545229248e-05, + "loss": 0.4842, + "step": 608 + }, + { + "epoch": 0.28889943074003793, + "grad_norm": 2.8307604789733887, + "learning_rate": 1.932210768504455e-05, + "loss": 0.4564, + "step": 609 + }, + { + "epoch": 0.2893738140417457, + "grad_norm": 3.346304416656494, + "learning_rate": 1.9319324322111928e-05, + "loss": 0.5572, + "step": 610 + }, + { + "epoch": 0.2898481973434535, + "grad_norm": 3.4799015522003174, + "learning_rate": 1.931653545807438e-05, + "loss": 0.542, + "step": 611 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 2.8453822135925293, + "learning_rate": 1.931374109457814e-05, + "loss": 0.3665, + "step": 612 + }, + { + "epoch": 0.29079696394686905, + "grad_norm": 3.6189804077148438, + "learning_rate": 1.9310941233272698e-05, + "loss": 0.4945, + "step": 613 + }, + { + "epoch": 0.29127134724857684, + "grad_norm": 3.2037627696990967, + "learning_rate": 1.9308135875810778e-05, + "loss": 0.4499, + "step": 614 + }, + { + "epoch": 0.29174573055028463, + "grad_norm": 3.068361520767212, + "learning_rate": 1.930532502384836e-05, + "loss": 0.4529, + "step": 615 + }, + { + "epoch": 0.2922201138519924, + "grad_norm": 2.5314414501190186, + "learning_rate": 1.9302508679044662e-05, + "loss": 0.3866, + "step": 616 + }, + { + "epoch": 0.2926944971537002, + "grad_norm": 3.3053672313690186, + "learning_rate": 1.929968684306214e-05, + "loss": 0.5091, + "step": 617 + }, + { + "epoch": 0.29316888045540795, + "grad_norm": 3.5351502895355225, + "learning_rate": 1.9296859517566505e-05, + "loss": 0.5416, + "step": 618 + }, + { + "epoch": 0.29364326375711575, + "grad_norm": 3.617424964904785, + "learning_rate": 1.929402670422669e-05, + "loss": 0.5325, + "step": 619 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 2.799778938293457, + "learning_rate": 1.9291188404714876e-05, + "loss": 0.514, + "step": 620 + }, + { + "epoch": 0.29459203036053133, + "grad_norm": 3.1185672283172607, + "learning_rate": 1.9288344620706493e-05, + "loss": 0.5391, + "step": 621 + }, + { + "epoch": 0.29506641366223907, + "grad_norm": 3.1492013931274414, + "learning_rate": 1.9285495353880187e-05, + "loss": 0.4585, + "step": 622 + }, + { + "epoch": 0.29554079696394686, + "grad_norm": 3.211419105529785, + "learning_rate": 1.928264060591786e-05, + "loss": 0.4983, + "step": 623 + }, + { + "epoch": 0.29601518026565465, + "grad_norm": 3.3846914768218994, + "learning_rate": 1.927978037850464e-05, + "loss": 0.5182, + "step": 624 + }, + { + "epoch": 0.29648956356736245, + "grad_norm": 2.692244052886963, + "learning_rate": 1.927691467332889e-05, + "loss": 0.4298, + "step": 625 + }, + { + "epoch": 0.2969639468690702, + "grad_norm": 2.660977363586426, + "learning_rate": 1.9274043492082205e-05, + "loss": 0.396, + "step": 626 + }, + { + "epoch": 0.297438330170778, + "grad_norm": 3.270251989364624, + "learning_rate": 1.9271166836459418e-05, + "loss": 0.499, + "step": 627 + }, + { + "epoch": 0.29791271347248577, + "grad_norm": 3.6545050144195557, + "learning_rate": 1.926828470815859e-05, + "loss": 0.5114, + "step": 628 + }, + { + "epoch": 0.29838709677419356, + "grad_norm": 3.5217530727386475, + "learning_rate": 1.9265397108881015e-05, + "loss": 0.4573, + "step": 629 + }, + { + "epoch": 0.29886148007590135, + "grad_norm": 3.2720742225646973, + "learning_rate": 1.9262504040331208e-05, + "loss": 0.4423, + "step": 630 + }, + { + "epoch": 0.2993358633776091, + "grad_norm": 3.0071029663085938, + "learning_rate": 1.9259605504216922e-05, + "loss": 0.4364, + "step": 631 + }, + { + "epoch": 0.2998102466793169, + "grad_norm": 4.148307800292969, + "learning_rate": 1.925670150224914e-05, + "loss": 0.4924, + "step": 632 + }, + { + "epoch": 0.3002846299810247, + "grad_norm": 3.19143009185791, + "learning_rate": 1.9253792036142052e-05, + "loss": 0.5972, + "step": 633 + }, + { + "epoch": 0.30075901328273247, + "grad_norm": 2.9321649074554443, + "learning_rate": 1.92508771076131e-05, + "loss": 0.3967, + "step": 634 + }, + { + "epoch": 0.3012333965844402, + "grad_norm": 3.0709781646728516, + "learning_rate": 1.9247956718382933e-05, + "loss": 0.5369, + "step": 635 + }, + { + "epoch": 0.301707779886148, + "grad_norm": 2.6908938884735107, + "learning_rate": 1.9245030870175427e-05, + "loss": 0.4135, + "step": 636 + }, + { + "epoch": 0.3021821631878558, + "grad_norm": 2.776761054992676, + "learning_rate": 1.9242099564717683e-05, + "loss": 0.519, + "step": 637 + }, + { + "epoch": 0.3026565464895636, + "grad_norm": 3.2145349979400635, + "learning_rate": 1.9239162803740016e-05, + "loss": 0.4982, + "step": 638 + }, + { + "epoch": 0.3031309297912713, + "grad_norm": 2.7437076568603516, + "learning_rate": 1.9236220588975976e-05, + "loss": 0.4159, + "step": 639 + }, + { + "epoch": 0.3036053130929791, + "grad_norm": 3.104196310043335, + "learning_rate": 1.9233272922162318e-05, + "loss": 0.493, + "step": 640 + }, + { + "epoch": 0.3040796963946869, + "grad_norm": 3.3658387660980225, + "learning_rate": 1.9230319805039022e-05, + "loss": 0.5201, + "step": 641 + }, + { + "epoch": 0.3045540796963947, + "grad_norm": 2.732059955596924, + "learning_rate": 1.922736123934928e-05, + "loss": 0.4961, + "step": 642 + }, + { + "epoch": 0.3050284629981025, + "grad_norm": 3.038705825805664, + "learning_rate": 1.922439722683951e-05, + "loss": 0.4819, + "step": 643 + }, + { + "epoch": 0.3055028462998102, + "grad_norm": 2.702758312225342, + "learning_rate": 1.9221427769259333e-05, + "loss": 0.4632, + "step": 644 + }, + { + "epoch": 0.305977229601518, + "grad_norm": 3.554279327392578, + "learning_rate": 1.9218452868361597e-05, + "loss": 0.4135, + "step": 645 + }, + { + "epoch": 0.3064516129032258, + "grad_norm": 3.9966790676116943, + "learning_rate": 1.921547252590235e-05, + "loss": 0.537, + "step": 646 + }, + { + "epoch": 0.3069259962049336, + "grad_norm": 3.105196475982666, + "learning_rate": 1.9212486743640864e-05, + "loss": 0.474, + "step": 647 + }, + { + "epoch": 0.30740037950664134, + "grad_norm": 3.0190954208374023, + "learning_rate": 1.9209495523339614e-05, + "loss": 0.4715, + "step": 648 + }, + { + "epoch": 0.30787476280834913, + "grad_norm": 3.061038017272949, + "learning_rate": 1.920649886676429e-05, + "loss": 0.5493, + "step": 649 + }, + { + "epoch": 0.3083491461100569, + "grad_norm": 3.0604310035705566, + "learning_rate": 1.9203496775683787e-05, + "loss": 0.4818, + "step": 650 + }, + { + "epoch": 0.3088235294117647, + "grad_norm": 3.209660768508911, + "learning_rate": 1.9200489251870207e-05, + "loss": 0.3923, + "step": 651 + }, + { + "epoch": 0.3092979127134725, + "grad_norm": 3.1261727809906006, + "learning_rate": 1.9197476297098868e-05, + "loss": 0.4662, + "step": 652 + }, + { + "epoch": 0.30977229601518025, + "grad_norm": 3.9394774436950684, + "learning_rate": 1.919445791314828e-05, + "loss": 0.4693, + "step": 653 + }, + { + "epoch": 0.31024667931688804, + "grad_norm": 2.708280086517334, + "learning_rate": 1.9191434101800174e-05, + "loss": 0.4855, + "step": 654 + }, + { + "epoch": 0.31072106261859583, + "grad_norm": 3.1161880493164062, + "learning_rate": 1.9188404864839465e-05, + "loss": 0.5513, + "step": 655 + }, + { + "epoch": 0.3111954459203036, + "grad_norm": 3.3281567096710205, + "learning_rate": 1.918537020405429e-05, + "loss": 0.4403, + "step": 656 + }, + { + "epoch": 0.31166982922201136, + "grad_norm": 3.432615041732788, + "learning_rate": 1.9182330121235978e-05, + "loss": 0.5654, + "step": 657 + }, + { + "epoch": 0.31214421252371916, + "grad_norm": 3.0545363426208496, + "learning_rate": 1.917928461817906e-05, + "loss": 0.4361, + "step": 658 + }, + { + "epoch": 0.31261859582542695, + "grad_norm": 2.9954404830932617, + "learning_rate": 1.917623369668126e-05, + "loss": 0.4687, + "step": 659 + }, + { + "epoch": 0.31309297912713474, + "grad_norm": 3.5407347679138184, + "learning_rate": 1.9173177358543512e-05, + "loss": 0.5126, + "step": 660 + }, + { + "epoch": 0.3135673624288425, + "grad_norm": 3.2592506408691406, + "learning_rate": 1.9170115605569945e-05, + "loss": 0.5198, + "step": 661 + }, + { + "epoch": 0.31404174573055027, + "grad_norm": 2.86240291595459, + "learning_rate": 1.9167048439567876e-05, + "loss": 0.4064, + "step": 662 + }, + { + "epoch": 0.31451612903225806, + "grad_norm": 2.8558406829833984, + "learning_rate": 1.9163975862347824e-05, + "loss": 0.4397, + "step": 663 + }, + { + "epoch": 0.31499051233396586, + "grad_norm": 2.9497201442718506, + "learning_rate": 1.9160897875723505e-05, + "loss": 0.447, + "step": 664 + }, + { + "epoch": 0.31546489563567365, + "grad_norm": 2.7483532428741455, + "learning_rate": 1.915781448151182e-05, + "loss": 0.4017, + "step": 665 + }, + { + "epoch": 0.3159392789373814, + "grad_norm": 3.7263011932373047, + "learning_rate": 1.9154725681532867e-05, + "loss": 0.5402, + "step": 666 + }, + { + "epoch": 0.3164136622390892, + "grad_norm": 3.7040700912475586, + "learning_rate": 1.9151631477609932e-05, + "loss": 0.4836, + "step": 667 + }, + { + "epoch": 0.31688804554079697, + "grad_norm": 2.6552894115448, + "learning_rate": 1.9148531871569496e-05, + "loss": 0.4124, + "step": 668 + }, + { + "epoch": 0.31736242884250476, + "grad_norm": 3.1435623168945312, + "learning_rate": 1.9145426865241224e-05, + "loss": 0.4807, + "step": 669 + }, + { + "epoch": 0.3178368121442125, + "grad_norm": 2.9633023738861084, + "learning_rate": 1.9142316460457974e-05, + "loss": 0.4609, + "step": 670 + }, + { + "epoch": 0.3183111954459203, + "grad_norm": 3.0346839427948, + "learning_rate": 1.9139200659055785e-05, + "loss": 0.4028, + "step": 671 + }, + { + "epoch": 0.3187855787476281, + "grad_norm": 3.140936851501465, + "learning_rate": 1.913607946287388e-05, + "loss": 0.4631, + "step": 672 + }, + { + "epoch": 0.3192599620493359, + "grad_norm": 2.8173251152038574, + "learning_rate": 1.9132952873754675e-05, + "loss": 0.4232, + "step": 673 + }, + { + "epoch": 0.31973434535104367, + "grad_norm": 2.9139256477355957, + "learning_rate": 1.9129820893543766e-05, + "loss": 0.4401, + "step": 674 + }, + { + "epoch": 0.3202087286527514, + "grad_norm": 3.9275453090667725, + "learning_rate": 1.912668352408992e-05, + "loss": 0.5251, + "step": 675 + }, + { + "epoch": 0.3206831119544592, + "grad_norm": 2.808434247970581, + "learning_rate": 1.9123540767245107e-05, + "loss": 0.4207, + "step": 676 + }, + { + "epoch": 0.321157495256167, + "grad_norm": 2.854140043258667, + "learning_rate": 1.912039262486446e-05, + "loss": 0.4516, + "step": 677 + }, + { + "epoch": 0.3216318785578748, + "grad_norm": 3.1188645362854004, + "learning_rate": 1.9117239098806296e-05, + "loss": 0.473, + "step": 678 + }, + { + "epoch": 0.3221062618595825, + "grad_norm": 2.9418070316314697, + "learning_rate": 1.911408019093211e-05, + "loss": 0.4666, + "step": 679 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 7.523387908935547, + "learning_rate": 1.9110915903106577e-05, + "loss": 0.4657, + "step": 680 + }, + { + "epoch": 0.3230550284629981, + "grad_norm": 3.0443685054779053, + "learning_rate": 1.9107746237197542e-05, + "loss": 0.5032, + "step": 681 + }, + { + "epoch": 0.3235294117647059, + "grad_norm": 3.1826181411743164, + "learning_rate": 1.910457119507603e-05, + "loss": 0.4716, + "step": 682 + }, + { + "epoch": 0.32400379506641364, + "grad_norm": 2.9390838146209717, + "learning_rate": 1.9101390778616232e-05, + "loss": 0.3951, + "step": 683 + }, + { + "epoch": 0.32447817836812143, + "grad_norm": 3.393476963043213, + "learning_rate": 1.909820498969552e-05, + "loss": 0.5147, + "step": 684 + }, + { + "epoch": 0.3249525616698292, + "grad_norm": 3.6798181533813477, + "learning_rate": 1.9095013830194432e-05, + "loss": 0.4758, + "step": 685 + }, + { + "epoch": 0.325426944971537, + "grad_norm": 3.2658169269561768, + "learning_rate": 1.9091817301996684e-05, + "loss": 0.4902, + "step": 686 + }, + { + "epoch": 0.3259013282732448, + "grad_norm": 3.256455898284912, + "learning_rate": 1.9088615406989146e-05, + "loss": 0.5238, + "step": 687 + }, + { + "epoch": 0.32637571157495254, + "grad_norm": 3.187185287475586, + "learning_rate": 1.908540814706187e-05, + "loss": 0.4681, + "step": 688 + }, + { + "epoch": 0.32685009487666034, + "grad_norm": 2.9497554302215576, + "learning_rate": 1.9082195524108068e-05, + "loss": 0.4091, + "step": 689 + }, + { + "epoch": 0.32732447817836813, + "grad_norm": 3.060279607772827, + "learning_rate": 1.907897754002412e-05, + "loss": 0.504, + "step": 690 + }, + { + "epoch": 0.3277988614800759, + "grad_norm": 2.83272123336792, + "learning_rate": 1.9075754196709574e-05, + "loss": 0.4661, + "step": 691 + }, + { + "epoch": 0.32827324478178366, + "grad_norm": 3.2421908378601074, + "learning_rate": 1.9072525496067128e-05, + "loss": 0.3761, + "step": 692 + }, + { + "epoch": 0.32874762808349145, + "grad_norm": 3.3531394004821777, + "learning_rate": 1.9069291440002665e-05, + "loss": 0.553, + "step": 693 + }, + { + "epoch": 0.32922201138519924, + "grad_norm": 3.3758304119110107, + "learning_rate": 1.9066052030425206e-05, + "loss": 0.5256, + "step": 694 + }, + { + "epoch": 0.32969639468690703, + "grad_norm": 3.2974579334259033, + "learning_rate": 1.9062807269246945e-05, + "loss": 0.4929, + "step": 695 + }, + { + "epoch": 0.3301707779886148, + "grad_norm": 3.126060962677002, + "learning_rate": 1.9059557158383234e-05, + "loss": 0.4732, + "step": 696 + }, + { + "epoch": 0.33064516129032256, + "grad_norm": 3.069438934326172, + "learning_rate": 1.9056301699752578e-05, + "loss": 0.4659, + "step": 697 + }, + { + "epoch": 0.33111954459203036, + "grad_norm": 3.131999969482422, + "learning_rate": 1.9053040895276652e-05, + "loss": 0.486, + "step": 698 + }, + { + "epoch": 0.33159392789373815, + "grad_norm": 3.255291700363159, + "learning_rate": 1.904977474688026e-05, + "loss": 0.4963, + "step": 699 + }, + { + "epoch": 0.33206831119544594, + "grad_norm": 3.3611719608306885, + "learning_rate": 1.9046503256491395e-05, + "loss": 0.5017, + "step": 700 + }, + { + "epoch": 0.3325426944971537, + "grad_norm": 2.538780689239502, + "learning_rate": 1.904322642604117e-05, + "loss": 0.405, + "step": 701 + }, + { + "epoch": 0.33301707779886147, + "grad_norm": 2.767956495285034, + "learning_rate": 1.9039944257463875e-05, + "loss": 0.3662, + "step": 702 + }, + { + "epoch": 0.33349146110056926, + "grad_norm": 2.7074410915374756, + "learning_rate": 1.903665675269694e-05, + "loss": 0.4577, + "step": 703 + }, + { + "epoch": 0.33396584440227706, + "grad_norm": 3.3300936222076416, + "learning_rate": 1.9033363913680944e-05, + "loss": 0.4979, + "step": 704 + }, + { + "epoch": 0.3344402277039848, + "grad_norm": 2.8957359790802, + "learning_rate": 1.9030065742359618e-05, + "loss": 0.359, + "step": 705 + }, + { + "epoch": 0.3349146110056926, + "grad_norm": 3.4609487056732178, + "learning_rate": 1.9026762240679843e-05, + "loss": 0.5508, + "step": 706 + }, + { + "epoch": 0.3353889943074004, + "grad_norm": 3.0877790451049805, + "learning_rate": 1.902345341059164e-05, + "loss": 0.5167, + "step": 707 + }, + { + "epoch": 0.33586337760910817, + "grad_norm": 3.2066214084625244, + "learning_rate": 1.9020139254048174e-05, + "loss": 0.4389, + "step": 708 + }, + { + "epoch": 0.33633776091081596, + "grad_norm": 3.8554136753082275, + "learning_rate": 1.9016819773005774e-05, + "loss": 0.5015, + "step": 709 + }, + { + "epoch": 0.3368121442125237, + "grad_norm": 3.0794146060943604, + "learning_rate": 1.901349496942388e-05, + "loss": 0.4256, + "step": 710 + }, + { + "epoch": 0.3372865275142315, + "grad_norm": 3.269186496734619, + "learning_rate": 1.9010164845265103e-05, + "loss": 0.4864, + "step": 711 + }, + { + "epoch": 0.3377609108159393, + "grad_norm": 3.30047607421875, + "learning_rate": 1.9006829402495174e-05, + "loss": 0.5635, + "step": 712 + }, + { + "epoch": 0.3382352941176471, + "grad_norm": 3.331386089324951, + "learning_rate": 1.9003488643082978e-05, + "loss": 0.5245, + "step": 713 + }, + { + "epoch": 0.3387096774193548, + "grad_norm": 3.2732012271881104, + "learning_rate": 1.9000142569000524e-05, + "loss": 0.4657, + "step": 714 + }, + { + "epoch": 0.3391840607210626, + "grad_norm": 2.6173768043518066, + "learning_rate": 1.8996791182222977e-05, + "loss": 0.4839, + "step": 715 + }, + { + "epoch": 0.3396584440227704, + "grad_norm": 2.7928354740142822, + "learning_rate": 1.899343448472862e-05, + "loss": 0.5419, + "step": 716 + }, + { + "epoch": 0.3401328273244782, + "grad_norm": 2.8685014247894287, + "learning_rate": 1.899007247849888e-05, + "loss": 0.3946, + "step": 717 + }, + { + "epoch": 0.340607210626186, + "grad_norm": 2.6877050399780273, + "learning_rate": 1.8986705165518318e-05, + "loss": 0.457, + "step": 718 + }, + { + "epoch": 0.3410815939278937, + "grad_norm": 3.0826516151428223, + "learning_rate": 1.898333254777462e-05, + "loss": 0.4636, + "step": 719 + }, + { + "epoch": 0.3415559772296015, + "grad_norm": 4.633234024047852, + "learning_rate": 1.897995462725862e-05, + "loss": 0.4727, + "step": 720 + }, + { + "epoch": 0.3420303605313093, + "grad_norm": 2.6432247161865234, + "learning_rate": 1.8976571405964258e-05, + "loss": 0.4449, + "step": 721 + }, + { + "epoch": 0.3425047438330171, + "grad_norm": 5.404833793640137, + "learning_rate": 1.8973182885888626e-05, + "loss": 0.4411, + "step": 722 + }, + { + "epoch": 0.34297912713472484, + "grad_norm": 2.931230306625366, + "learning_rate": 1.8969789069031927e-05, + "loss": 0.4184, + "step": 723 + }, + { + "epoch": 0.34345351043643263, + "grad_norm": 3.3786165714263916, + "learning_rate": 1.8966389957397503e-05, + "loss": 0.4311, + "step": 724 + }, + { + "epoch": 0.3439278937381404, + "grad_norm": 2.9587435722351074, + "learning_rate": 1.896298555299181e-05, + "loss": 0.4465, + "step": 725 + }, + { + "epoch": 0.3444022770398482, + "grad_norm": 2.8019826412200928, + "learning_rate": 1.895957585782444e-05, + "loss": 0.4498, + "step": 726 + }, + { + "epoch": 0.34487666034155595, + "grad_norm": 3.1993303298950195, + "learning_rate": 1.8956160873908097e-05, + "loss": 0.4221, + "step": 727 + }, + { + "epoch": 0.34535104364326374, + "grad_norm": 2.909705877304077, + "learning_rate": 1.895274060325862e-05, + "loss": 0.4789, + "step": 728 + }, + { + "epoch": 0.34582542694497154, + "grad_norm": 3.371626138687134, + "learning_rate": 1.8949315047894956e-05, + "loss": 0.5294, + "step": 729 + }, + { + "epoch": 0.34629981024667933, + "grad_norm": 3.704002618789673, + "learning_rate": 1.8945884209839172e-05, + "loss": 0.5202, + "step": 730 + }, + { + "epoch": 0.3467741935483871, + "grad_norm": 2.856421709060669, + "learning_rate": 1.8942448091116464e-05, + "loss": 0.4168, + "step": 731 + }, + { + "epoch": 0.34724857685009486, + "grad_norm": 3.03263783454895, + "learning_rate": 1.8939006693755138e-05, + "loss": 0.4592, + "step": 732 + }, + { + "epoch": 0.34772296015180265, + "grad_norm": 3.3531453609466553, + "learning_rate": 1.8935560019786618e-05, + "loss": 0.4228, + "step": 733 + }, + { + "epoch": 0.34819734345351044, + "grad_norm": 2.8518269062042236, + "learning_rate": 1.8932108071245435e-05, + "loss": 0.4481, + "step": 734 + }, + { + "epoch": 0.34867172675521824, + "grad_norm": 3.01810622215271, + "learning_rate": 1.8928650850169246e-05, + "loss": 0.5109, + "step": 735 + }, + { + "epoch": 0.349146110056926, + "grad_norm": 3.7409400939941406, + "learning_rate": 1.8925188358598815e-05, + "loss": 0.5323, + "step": 736 + }, + { + "epoch": 0.34962049335863377, + "grad_norm": 2.571120262145996, + "learning_rate": 1.892172059857801e-05, + "loss": 0.4073, + "step": 737 + }, + { + "epoch": 0.35009487666034156, + "grad_norm": 3.2061607837677, + "learning_rate": 1.8918247572153822e-05, + "loss": 0.5185, + "step": 738 + }, + { + "epoch": 0.35056925996204935, + "grad_norm": 2.933711528778076, + "learning_rate": 1.8914769281376345e-05, + "loss": 0.3978, + "step": 739 + }, + { + "epoch": 0.3510436432637571, + "grad_norm": 3.2886109352111816, + "learning_rate": 1.8911285728298778e-05, + "loss": 0.4505, + "step": 740 + }, + { + "epoch": 0.3515180265654649, + "grad_norm": 3.886824607849121, + "learning_rate": 1.8907796914977422e-05, + "loss": 0.5427, + "step": 741 + }, + { + "epoch": 0.3519924098671727, + "grad_norm": 3.2919833660125732, + "learning_rate": 1.8904302843471692e-05, + "loss": 0.409, + "step": 742 + }, + { + "epoch": 0.35246679316888047, + "grad_norm": 3.437626600265503, + "learning_rate": 1.8900803515844107e-05, + "loss": 0.4457, + "step": 743 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 3.2604973316192627, + "learning_rate": 1.8897298934160285e-05, + "loss": 0.4763, + "step": 744 + }, + { + "epoch": 0.353415559772296, + "grad_norm": 2.8597803115844727, + "learning_rate": 1.8893789100488945e-05, + "loss": 0.4451, + "step": 745 + }, + { + "epoch": 0.3538899430740038, + "grad_norm": 2.9492034912109375, + "learning_rate": 1.8890274016901905e-05, + "loss": 0.4771, + "step": 746 + }, + { + "epoch": 0.3543643263757116, + "grad_norm": 3.0947840213775635, + "learning_rate": 1.888675368547409e-05, + "loss": 0.4063, + "step": 747 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 2.7532737255096436, + "learning_rate": 1.888322810828351e-05, + "loss": 0.461, + "step": 748 + }, + { + "epoch": 0.3553130929791271, + "grad_norm": 3.891683340072632, + "learning_rate": 1.887969728741128e-05, + "loss": 0.4789, + "step": 749 + }, + { + "epoch": 0.3557874762808349, + "grad_norm": 3.2587778568267822, + "learning_rate": 1.8876161224941607e-05, + "loss": 0.4776, + "step": 750 + }, + { + "epoch": 0.3562618595825427, + "grad_norm": 2.7931885719299316, + "learning_rate": 1.8872619922961802e-05, + "loss": 0.4503, + "step": 751 + }, + { + "epoch": 0.3567362428842505, + "grad_norm": 3.225916862487793, + "learning_rate": 1.886907338356225e-05, + "loss": 0.4879, + "step": 752 + }, + { + "epoch": 0.3572106261859583, + "grad_norm": 2.8587422370910645, + "learning_rate": 1.8865521608836446e-05, + "loss": 0.4243, + "step": 753 + }, + { + "epoch": 0.357685009487666, + "grad_norm": 3.333045482635498, + "learning_rate": 1.8861964600880963e-05, + "loss": 0.4953, + "step": 754 + }, + { + "epoch": 0.3581593927893738, + "grad_norm": 2.7636733055114746, + "learning_rate": 1.885840236179547e-05, + "loss": 0.4484, + "step": 755 + }, + { + "epoch": 0.3586337760910816, + "grad_norm": 2.7610294818878174, + "learning_rate": 1.8854834893682722e-05, + "loss": 0.4055, + "step": 756 + }, + { + "epoch": 0.3591081593927894, + "grad_norm": 3.245123863220215, + "learning_rate": 1.8851262198648555e-05, + "loss": 0.5173, + "step": 757 + }, + { + "epoch": 0.35958254269449713, + "grad_norm": 2.8020386695861816, + "learning_rate": 1.88476842788019e-05, + "loss": 0.4727, + "step": 758 + }, + { + "epoch": 0.3600569259962049, + "grad_norm": 3.41428542137146, + "learning_rate": 1.8844101136254768e-05, + "loss": 0.4924, + "step": 759 + }, + { + "epoch": 0.3605313092979127, + "grad_norm": 2.779858350753784, + "learning_rate": 1.884051277312225e-05, + "loss": 0.4455, + "step": 760 + }, + { + "epoch": 0.3610056925996205, + "grad_norm": 2.965505838394165, + "learning_rate": 1.8836919191522526e-05, + "loss": 0.5216, + "step": 761 + }, + { + "epoch": 0.36148007590132825, + "grad_norm": 2.897125005722046, + "learning_rate": 1.8833320393576847e-05, + "loss": 0.4364, + "step": 762 + }, + { + "epoch": 0.36195445920303604, + "grad_norm": 3.3312103748321533, + "learning_rate": 1.8829716381409545e-05, + "loss": 0.541, + "step": 763 + }, + { + "epoch": 0.36242884250474383, + "grad_norm": 3.1323423385620117, + "learning_rate": 1.8826107157148042e-05, + "loss": 0.4879, + "step": 764 + }, + { + "epoch": 0.3629032258064516, + "grad_norm": 2.554779052734375, + "learning_rate": 1.882249272292282e-05, + "loss": 0.3459, + "step": 765 + }, + { + "epoch": 0.3633776091081594, + "grad_norm": 2.9526796340942383, + "learning_rate": 1.8818873080867445e-05, + "loss": 0.4802, + "step": 766 + }, + { + "epoch": 0.36385199240986715, + "grad_norm": 2.4592695236206055, + "learning_rate": 1.8815248233118558e-05, + "loss": 0.3779, + "step": 767 + }, + { + "epoch": 0.36432637571157495, + "grad_norm": 2.9448318481445312, + "learning_rate": 1.881161818181587e-05, + "loss": 0.4546, + "step": 768 + }, + { + "epoch": 0.36480075901328274, + "grad_norm": 3.2894225120544434, + "learning_rate": 1.8807982929102164e-05, + "loss": 0.4668, + "step": 769 + }, + { + "epoch": 0.36527514231499053, + "grad_norm": 2.9263877868652344, + "learning_rate": 1.8804342477123292e-05, + "loss": 0.4316, + "step": 770 + }, + { + "epoch": 0.36574952561669827, + "grad_norm": 2.8721306324005127, + "learning_rate": 1.880069682802818e-05, + "loss": 0.4152, + "step": 771 + }, + { + "epoch": 0.36622390891840606, + "grad_norm": 2.6576414108276367, + "learning_rate": 1.879704598396882e-05, + "loss": 0.3802, + "step": 772 + }, + { + "epoch": 0.36669829222011385, + "grad_norm": 2.7889490127563477, + "learning_rate": 1.879338994710026e-05, + "loss": 0.3833, + "step": 773 + }, + { + "epoch": 0.36717267552182165, + "grad_norm": 3.141726016998291, + "learning_rate": 1.8789728719580632e-05, + "loss": 0.4202, + "step": 774 + }, + { + "epoch": 0.36764705882352944, + "grad_norm": 3.9903249740600586, + "learning_rate": 1.878606230357112e-05, + "loss": 0.5108, + "step": 775 + }, + { + "epoch": 0.3681214421252372, + "grad_norm": 2.6597206592559814, + "learning_rate": 1.878239070123597e-05, + "loss": 0.4236, + "step": 776 + }, + { + "epoch": 0.36859582542694497, + "grad_norm": 3.9152090549468994, + "learning_rate": 1.8778713914742494e-05, + "loss": 0.5953, + "step": 777 + }, + { + "epoch": 0.36907020872865276, + "grad_norm": 2.510295867919922, + "learning_rate": 1.8775031946261065e-05, + "loss": 0.389, + "step": 778 + }, + { + "epoch": 0.36954459203036055, + "grad_norm": 2.6154842376708984, + "learning_rate": 1.877134479796511e-05, + "loss": 0.4015, + "step": 779 + }, + { + "epoch": 0.3700189753320683, + "grad_norm": 3.2904458045959473, + "learning_rate": 1.8767652472031118e-05, + "loss": 0.4731, + "step": 780 + }, + { + "epoch": 0.3704933586337761, + "grad_norm": 3.2095184326171875, + "learning_rate": 1.8763954970638628e-05, + "loss": 0.5158, + "step": 781 + }, + { + "epoch": 0.3709677419354839, + "grad_norm": 2.783874034881592, + "learning_rate": 1.8760252295970245e-05, + "loss": 0.4831, + "step": 782 + }, + { + "epoch": 0.37144212523719167, + "grad_norm": 2.9339380264282227, + "learning_rate": 1.8756544450211614e-05, + "loss": 0.5171, + "step": 783 + }, + { + "epoch": 0.3719165085388994, + "grad_norm": 3.8392632007598877, + "learning_rate": 1.875283143555145e-05, + "loss": 0.4464, + "step": 784 + }, + { + "epoch": 0.3723908918406072, + "grad_norm": 2.6166434288024902, + "learning_rate": 1.8749113254181498e-05, + "loss": 0.4343, + "step": 785 + }, + { + "epoch": 0.372865275142315, + "grad_norm": 2.91298246383667, + "learning_rate": 1.874538990829657e-05, + "loss": 0.5344, + "step": 786 + }, + { + "epoch": 0.3733396584440228, + "grad_norm": 2.931877374649048, + "learning_rate": 1.874166140009452e-05, + "loss": 0.4732, + "step": 787 + }, + { + "epoch": 0.3738140417457306, + "grad_norm": 2.885974407196045, + "learning_rate": 1.8737927731776245e-05, + "loss": 0.4594, + "step": 788 + }, + { + "epoch": 0.3742884250474383, + "grad_norm": 3.037022352218628, + "learning_rate": 1.8734188905545697e-05, + "loss": 0.4378, + "step": 789 + }, + { + "epoch": 0.3747628083491461, + "grad_norm": 3.06512713432312, + "learning_rate": 1.8730444923609865e-05, + "loss": 0.4882, + "step": 790 + }, + { + "epoch": 0.3752371916508539, + "grad_norm": 3.42930269241333, + "learning_rate": 1.872669578817879e-05, + "loss": 0.5288, + "step": 791 + }, + { + "epoch": 0.3757115749525617, + "grad_norm": 2.3780622482299805, + "learning_rate": 1.872294150146554e-05, + "loss": 0.3665, + "step": 792 + }, + { + "epoch": 0.3761859582542694, + "grad_norm": 3.4545505046844482, + "learning_rate": 1.8719182065686242e-05, + "loss": 0.5605, + "step": 793 + }, + { + "epoch": 0.3766603415559772, + "grad_norm": 2.9837419986724854, + "learning_rate": 1.871541748306005e-05, + "loss": 0.4979, + "step": 794 + }, + { + "epoch": 0.377134724857685, + "grad_norm": 2.7663657665252686, + "learning_rate": 1.871164775580916e-05, + "loss": 0.4332, + "step": 795 + }, + { + "epoch": 0.3776091081593928, + "grad_norm": 3.1293842792510986, + "learning_rate": 1.8707872886158806e-05, + "loss": 0.4708, + "step": 796 + }, + { + "epoch": 0.3780834914611006, + "grad_norm": 2.830396890640259, + "learning_rate": 1.870409287633726e-05, + "loss": 0.5119, + "step": 797 + }, + { + "epoch": 0.37855787476280833, + "grad_norm": 2.5124335289001465, + "learning_rate": 1.8700307728575813e-05, + "loss": 0.3997, + "step": 798 + }, + { + "epoch": 0.3790322580645161, + "grad_norm": 2.8015406131744385, + "learning_rate": 1.8696517445108807e-05, + "loss": 0.4406, + "step": 799 + }, + { + "epoch": 0.3795066413662239, + "grad_norm": 2.6684601306915283, + "learning_rate": 1.8692722028173612e-05, + "loss": 0.4578, + "step": 800 + }, + { + "epoch": 0.3799810246679317, + "grad_norm": 2.6726903915405273, + "learning_rate": 1.868892148001062e-05, + "loss": 0.3832, + "step": 801 + }, + { + "epoch": 0.38045540796963945, + "grad_norm": 2.7872049808502197, + "learning_rate": 1.868511580286326e-05, + "loss": 0.4706, + "step": 802 + }, + { + "epoch": 0.38092979127134724, + "grad_norm": 2.7430388927459717, + "learning_rate": 1.8681304998977988e-05, + "loss": 0.4383, + "step": 803 + }, + { + "epoch": 0.38140417457305503, + "grad_norm": 2.5697414875030518, + "learning_rate": 1.8677489070604274e-05, + "loss": 0.4179, + "step": 804 + }, + { + "epoch": 0.3818785578747628, + "grad_norm": 2.9383411407470703, + "learning_rate": 1.8673668019994632e-05, + "loss": 0.4117, + "step": 805 + }, + { + "epoch": 0.38235294117647056, + "grad_norm": 3.752838373184204, + "learning_rate": 1.866984184940459e-05, + "loss": 0.5366, + "step": 806 + }, + { + "epoch": 0.38282732447817835, + "grad_norm": 4.9944024085998535, + "learning_rate": 1.866601056109269e-05, + "loss": 0.5161, + "step": 807 + }, + { + "epoch": 0.38330170777988615, + "grad_norm": 2.624593734741211, + "learning_rate": 1.8662174157320515e-05, + "loss": 0.3766, + "step": 808 + }, + { + "epoch": 0.38377609108159394, + "grad_norm": 2.987598419189453, + "learning_rate": 1.8658332640352647e-05, + "loss": 0.5166, + "step": 809 + }, + { + "epoch": 0.38425047438330173, + "grad_norm": 2.785848379135132, + "learning_rate": 1.8654486012456704e-05, + "loss": 0.4269, + "step": 810 + }, + { + "epoch": 0.38472485768500947, + "grad_norm": 2.538492202758789, + "learning_rate": 1.8650634275903304e-05, + "loss": 0.4338, + "step": 811 + }, + { + "epoch": 0.38519924098671726, + "grad_norm": 2.889648914337158, + "learning_rate": 1.864677743296609e-05, + "loss": 0.4407, + "step": 812 + }, + { + "epoch": 0.38567362428842505, + "grad_norm": 2.803192138671875, + "learning_rate": 1.8642915485921726e-05, + "loss": 0.4189, + "step": 813 + }, + { + "epoch": 0.38614800759013285, + "grad_norm": 2.4680614471435547, + "learning_rate": 1.8639048437049875e-05, + "loss": 0.3655, + "step": 814 + }, + { + "epoch": 0.3866223908918406, + "grad_norm": 3.2701048851013184, + "learning_rate": 1.8635176288633218e-05, + "loss": 0.4297, + "step": 815 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 3.7797579765319824, + "learning_rate": 1.8631299042957448e-05, + "loss": 0.5702, + "step": 816 + }, + { + "epoch": 0.38757115749525617, + "grad_norm": 3.297485113143921, + "learning_rate": 1.862741670231126e-05, + "loss": 0.5324, + "step": 817 + }, + { + "epoch": 0.38804554079696396, + "grad_norm": 2.463757276535034, + "learning_rate": 1.8623529268986366e-05, + "loss": 0.3748, + "step": 818 + }, + { + "epoch": 0.38851992409867175, + "grad_norm": 2.859805107116699, + "learning_rate": 1.861963674527748e-05, + "loss": 0.5214, + "step": 819 + }, + { + "epoch": 0.3889943074003795, + "grad_norm": 3.135437488555908, + "learning_rate": 1.8615739133482315e-05, + "loss": 0.4381, + "step": 820 + }, + { + "epoch": 0.3894686907020873, + "grad_norm": 3.0858075618743896, + "learning_rate": 1.8611836435901595e-05, + "loss": 0.4133, + "step": 821 + }, + { + "epoch": 0.3899430740037951, + "grad_norm": 2.585782527923584, + "learning_rate": 1.860792865483905e-05, + "loss": 0.3963, + "step": 822 + }, + { + "epoch": 0.39041745730550287, + "grad_norm": 3.3522262573242188, + "learning_rate": 1.8604015792601395e-05, + "loss": 0.478, + "step": 823 + }, + { + "epoch": 0.3908918406072106, + "grad_norm": 2.940312147140503, + "learning_rate": 1.860009785149836e-05, + "loss": 0.5005, + "step": 824 + }, + { + "epoch": 0.3913662239089184, + "grad_norm": 3.0102758407592773, + "learning_rate": 1.8596174833842664e-05, + "loss": 0.4627, + "step": 825 + }, + { + "epoch": 0.3918406072106262, + "grad_norm": 2.446089744567871, + "learning_rate": 1.8592246741950027e-05, + "loss": 0.3507, + "step": 826 + }, + { + "epoch": 0.392314990512334, + "grad_norm": 2.569380283355713, + "learning_rate": 1.858831357813916e-05, + "loss": 0.3988, + "step": 827 + }, + { + "epoch": 0.3927893738140417, + "grad_norm": 3.037994861602783, + "learning_rate": 1.8584375344731777e-05, + "loss": 0.3629, + "step": 828 + }, + { + "epoch": 0.3932637571157495, + "grad_norm": 3.2176268100738525, + "learning_rate": 1.8580432044052567e-05, + "loss": 0.4764, + "step": 829 + }, + { + "epoch": 0.3937381404174573, + "grad_norm": 2.4146931171417236, + "learning_rate": 1.8576483678429234e-05, + "loss": 0.3852, + "step": 830 + }, + { + "epoch": 0.3942125237191651, + "grad_norm": 3.2689833641052246, + "learning_rate": 1.8572530250192453e-05, + "loss": 0.444, + "step": 831 + }, + { + "epoch": 0.3946869070208729, + "grad_norm": 3.194239377975464, + "learning_rate": 1.8568571761675893e-05, + "loss": 0.4141, + "step": 832 + }, + { + "epoch": 0.3951612903225806, + "grad_norm": 2.9334359169006348, + "learning_rate": 1.8564608215216212e-05, + "loss": 0.4693, + "step": 833 + }, + { + "epoch": 0.3956356736242884, + "grad_norm": 2.812944173812866, + "learning_rate": 1.8560639613153056e-05, + "loss": 0.436, + "step": 834 + }, + { + "epoch": 0.3961100569259962, + "grad_norm": 2.986809730529785, + "learning_rate": 1.855666595782904e-05, + "loss": 0.4789, + "step": 835 + }, + { + "epoch": 0.396584440227704, + "grad_norm": 2.4972949028015137, + "learning_rate": 1.8552687251589786e-05, + "loss": 0.4695, + "step": 836 + }, + { + "epoch": 0.39705882352941174, + "grad_norm": 2.675363779067993, + "learning_rate": 1.8548703496783877e-05, + "loss": 0.4012, + "step": 837 + }, + { + "epoch": 0.39753320683111953, + "grad_norm": 2.956235885620117, + "learning_rate": 1.854471469576289e-05, + "loss": 0.503, + "step": 838 + }, + { + "epoch": 0.3980075901328273, + "grad_norm": 2.4831976890563965, + "learning_rate": 1.8540720850881372e-05, + "loss": 0.3945, + "step": 839 + }, + { + "epoch": 0.3984819734345351, + "grad_norm": 2.747809886932373, + "learning_rate": 1.8536721964496846e-05, + "loss": 0.431, + "step": 840 + }, + { + "epoch": 0.3989563567362429, + "grad_norm": 3.124937057495117, + "learning_rate": 1.853271803896982e-05, + "loss": 0.505, + "step": 841 + }, + { + "epoch": 0.39943074003795065, + "grad_norm": 3.363137722015381, + "learning_rate": 1.8528709076663772e-05, + "loss": 0.4563, + "step": 842 + }, + { + "epoch": 0.39990512333965844, + "grad_norm": 2.7867400646209717, + "learning_rate": 1.8524695079945154e-05, + "loss": 0.4487, + "step": 843 + }, + { + "epoch": 0.40037950664136623, + "grad_norm": 2.9408624172210693, + "learning_rate": 1.8520676051183377e-05, + "loss": 0.4014, + "step": 844 + }, + { + "epoch": 0.400853889943074, + "grad_norm": 3.224708318710327, + "learning_rate": 1.851665199275085e-05, + "loss": 0.4478, + "step": 845 + }, + { + "epoch": 0.40132827324478176, + "grad_norm": 2.7404086589813232, + "learning_rate": 1.8512622907022924e-05, + "loss": 0.4689, + "step": 846 + }, + { + "epoch": 0.40180265654648956, + "grad_norm": 2.713667154312134, + "learning_rate": 1.8508588796377936e-05, + "loss": 0.4486, + "step": 847 + }, + { + "epoch": 0.40227703984819735, + "grad_norm": 3.0665454864501953, + "learning_rate": 1.8504549663197175e-05, + "loss": 0.4629, + "step": 848 + }, + { + "epoch": 0.40275142314990514, + "grad_norm": 3.2429921627044678, + "learning_rate": 1.8500505509864903e-05, + "loss": 0.4945, + "step": 849 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 3.3584725856781006, + "learning_rate": 1.8496456338768345e-05, + "loss": 0.4229, + "step": 850 + }, + { + "epoch": 0.40370018975332067, + "grad_norm": 2.912458896636963, + "learning_rate": 1.8492402152297688e-05, + "loss": 0.4418, + "step": 851 + }, + { + "epoch": 0.40417457305502846, + "grad_norm": 2.935781478881836, + "learning_rate": 1.8488342952846074e-05, + "loss": 0.3275, + "step": 852 + }, + { + "epoch": 0.40464895635673626, + "grad_norm": 3.8504891395568848, + "learning_rate": 1.848427874280961e-05, + "loss": 0.413, + "step": 853 + }, + { + "epoch": 0.40512333965844405, + "grad_norm": 2.4456560611724854, + "learning_rate": 1.8480209524587363e-05, + "loss": 0.419, + "step": 854 + }, + { + "epoch": 0.4055977229601518, + "grad_norm": 3.666505813598633, + "learning_rate": 1.8476135300581347e-05, + "loss": 0.4197, + "step": 855 + }, + { + "epoch": 0.4060721062618596, + "grad_norm": 2.4932539463043213, + "learning_rate": 1.847205607319654e-05, + "loss": 0.3565, + "step": 856 + }, + { + "epoch": 0.40654648956356737, + "grad_norm": 2.7682738304138184, + "learning_rate": 1.8467971844840864e-05, + "loss": 0.4519, + "step": 857 + }, + { + "epoch": 0.40702087286527516, + "grad_norm": 2.8969831466674805, + "learning_rate": 1.8463882617925208e-05, + "loss": 0.4616, + "step": 858 + }, + { + "epoch": 0.4074952561669829, + "grad_norm": 3.2404887676239014, + "learning_rate": 1.8459788394863388e-05, + "loss": 0.5208, + "step": 859 + }, + { + "epoch": 0.4079696394686907, + "grad_norm": 2.699791669845581, + "learning_rate": 1.8455689178072197e-05, + "loss": 0.438, + "step": 860 + }, + { + "epoch": 0.4084440227703985, + "grad_norm": 3.718968152999878, + "learning_rate": 1.8451584969971358e-05, + "loss": 0.5273, + "step": 861 + }, + { + "epoch": 0.4089184060721063, + "grad_norm": 2.4565632343292236, + "learning_rate": 1.8447475772983542e-05, + "loss": 0.3949, + "step": 862 + }, + { + "epoch": 0.409392789373814, + "grad_norm": 2.7938921451568604, + "learning_rate": 1.8443361589534366e-05, + "loss": 0.3812, + "step": 863 + }, + { + "epoch": 0.4098671726755218, + "grad_norm": 2.8506662845611572, + "learning_rate": 1.84392424220524e-05, + "loss": 0.4237, + "step": 864 + }, + { + "epoch": 0.4103415559772296, + "grad_norm": 2.958847999572754, + "learning_rate": 1.8435118272969135e-05, + "loss": 0.5132, + "step": 865 + }, + { + "epoch": 0.4108159392789374, + "grad_norm": 3.2671656608581543, + "learning_rate": 1.8430989144719028e-05, + "loss": 0.4776, + "step": 866 + }, + { + "epoch": 0.4112903225806452, + "grad_norm": 2.673391103744507, + "learning_rate": 1.8426855039739454e-05, + "loss": 0.4367, + "step": 867 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 2.532644271850586, + "learning_rate": 1.8422715960470737e-05, + "loss": 0.4003, + "step": 868 + }, + { + "epoch": 0.4122390891840607, + "grad_norm": 2.612393856048584, + "learning_rate": 1.8418571909356138e-05, + "loss": 0.4027, + "step": 869 + }, + { + "epoch": 0.4127134724857685, + "grad_norm": 2.484384775161743, + "learning_rate": 1.8414422888841844e-05, + "loss": 0.4135, + "step": 870 + }, + { + "epoch": 0.4131878557874763, + "grad_norm": 3.7062361240386963, + "learning_rate": 1.8410268901376983e-05, + "loss": 0.4552, + "step": 871 + }, + { + "epoch": 0.41366223908918404, + "grad_norm": 2.7248425483703613, + "learning_rate": 1.8406109949413614e-05, + "loss": 0.4411, + "step": 872 + }, + { + "epoch": 0.41413662239089183, + "grad_norm": 2.9587390422821045, + "learning_rate": 1.8401946035406723e-05, + "loss": 0.5239, + "step": 873 + }, + { + "epoch": 0.4146110056925996, + "grad_norm": 3.329129457473755, + "learning_rate": 1.839777716181423e-05, + "loss": 0.4036, + "step": 874 + }, + { + "epoch": 0.4150853889943074, + "grad_norm": 2.5802624225616455, + "learning_rate": 1.8393603331096974e-05, + "loss": 0.4146, + "step": 875 + }, + { + "epoch": 0.4155597722960152, + "grad_norm": 3.270162582397461, + "learning_rate": 1.8389424545718733e-05, + "loss": 0.4765, + "step": 876 + }, + { + "epoch": 0.41603415559772294, + "grad_norm": 2.688945770263672, + "learning_rate": 1.8385240808146197e-05, + "loss": 0.4338, + "step": 877 + }, + { + "epoch": 0.41650853889943074, + "grad_norm": 2.7994351387023926, + "learning_rate": 1.838105212084899e-05, + "loss": 0.4464, + "step": 878 + }, + { + "epoch": 0.41698292220113853, + "grad_norm": 2.3464057445526123, + "learning_rate": 1.837685848629965e-05, + "loss": 0.3783, + "step": 879 + }, + { + "epoch": 0.4174573055028463, + "grad_norm": 3.1224400997161865, + "learning_rate": 1.837265990697364e-05, + "loss": 0.5672, + "step": 880 + }, + { + "epoch": 0.41793168880455406, + "grad_norm": 3.112441301345825, + "learning_rate": 1.8368456385349333e-05, + "loss": 0.4204, + "step": 881 + }, + { + "epoch": 0.41840607210626185, + "grad_norm": 3.453056812286377, + "learning_rate": 1.8364247923908033e-05, + "loss": 0.4365, + "step": 882 + }, + { + "epoch": 0.41888045540796964, + "grad_norm": 3.2397301197052, + "learning_rate": 1.8360034525133953e-05, + "loss": 0.4454, + "step": 883 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 2.897355794906616, + "learning_rate": 1.8355816191514216e-05, + "loss": 0.4647, + "step": 884 + }, + { + "epoch": 0.4198292220113852, + "grad_norm": 3.6154603958129883, + "learning_rate": 1.8351592925538865e-05, + "loss": 0.6002, + "step": 885 + }, + { + "epoch": 0.42030360531309297, + "grad_norm": 2.3313372135162354, + "learning_rate": 1.8347364729700857e-05, + "loss": 0.4218, + "step": 886 + }, + { + "epoch": 0.42077798861480076, + "grad_norm": 2.8984670639038086, + "learning_rate": 1.8343131606496046e-05, + "loss": 0.4025, + "step": 887 + }, + { + "epoch": 0.42125237191650855, + "grad_norm": 3.124525785446167, + "learning_rate": 1.8338893558423207e-05, + "loss": 0.441, + "step": 888 + }, + { + "epoch": 0.42172675521821634, + "grad_norm": 2.574047565460205, + "learning_rate": 1.833465058798402e-05, + "loss": 0.4107, + "step": 889 + }, + { + "epoch": 0.4222011385199241, + "grad_norm": 2.7786552906036377, + "learning_rate": 1.8330402697683067e-05, + "loss": 0.4836, + "step": 890 + }, + { + "epoch": 0.42267552182163187, + "grad_norm": 2.599820137023926, + "learning_rate": 1.832614989002783e-05, + "loss": 0.4446, + "step": 891 + }, + { + "epoch": 0.42314990512333966, + "grad_norm": 2.800710678100586, + "learning_rate": 1.8321892167528707e-05, + "loss": 0.4315, + "step": 892 + }, + { + "epoch": 0.42362428842504746, + "grad_norm": 2.935514450073242, + "learning_rate": 1.831762953269898e-05, + "loss": 0.4062, + "step": 893 + }, + { + "epoch": 0.4240986717267552, + "grad_norm": 2.597562551498413, + "learning_rate": 1.8313361988054853e-05, + "loss": 0.3958, + "step": 894 + }, + { + "epoch": 0.424573055028463, + "grad_norm": 2.7025997638702393, + "learning_rate": 1.8309089536115406e-05, + "loss": 0.4288, + "step": 895 + }, + { + "epoch": 0.4250474383301708, + "grad_norm": 2.58357310295105, + "learning_rate": 1.8304812179402626e-05, + "loss": 0.4734, + "step": 896 + }, + { + "epoch": 0.42552182163187857, + "grad_norm": 2.8267996311187744, + "learning_rate": 1.83005299204414e-05, + "loss": 0.4395, + "step": 897 + }, + { + "epoch": 0.42599620493358636, + "grad_norm": 2.934715986251831, + "learning_rate": 1.82962427617595e-05, + "loss": 0.4381, + "step": 898 + }, + { + "epoch": 0.4264705882352941, + "grad_norm": 2.855116128921509, + "learning_rate": 1.829195070588759e-05, + "loss": 0.4122, + "step": 899 + }, + { + "epoch": 0.4269449715370019, + "grad_norm": 2.7135651111602783, + "learning_rate": 1.8287653755359228e-05, + "loss": 0.4812, + "step": 900 + }, + { + "epoch": 0.4274193548387097, + "grad_norm": 2.9477922916412354, + "learning_rate": 1.8283351912710867e-05, + "loss": 0.4166, + "step": 901 + }, + { + "epoch": 0.4278937381404175, + "grad_norm": 3.2937138080596924, + "learning_rate": 1.827904518048184e-05, + "loss": 0.4541, + "step": 902 + }, + { + "epoch": 0.4283681214421252, + "grad_norm": 2.7012522220611572, + "learning_rate": 1.8274733561214368e-05, + "loss": 0.4235, + "step": 903 + }, + { + "epoch": 0.428842504743833, + "grad_norm": 3.3364474773406982, + "learning_rate": 1.8270417057453554e-05, + "loss": 0.4433, + "step": 904 + }, + { + "epoch": 0.4293168880455408, + "grad_norm": 2.75413179397583, + "learning_rate": 1.826609567174739e-05, + "loss": 0.4105, + "step": 905 + }, + { + "epoch": 0.4297912713472486, + "grad_norm": 2.27841854095459, + "learning_rate": 1.826176940664675e-05, + "loss": 0.3656, + "step": 906 + }, + { + "epoch": 0.43026565464895633, + "grad_norm": 3.102612018585205, + "learning_rate": 1.8257438264705382e-05, + "loss": 0.4927, + "step": 907 + }, + { + "epoch": 0.4307400379506641, + "grad_norm": 2.6770787239074707, + "learning_rate": 1.825310224847992e-05, + "loss": 0.4812, + "step": 908 + }, + { + "epoch": 0.4312144212523719, + "grad_norm": 3.230632781982422, + "learning_rate": 1.8248761360529864e-05, + "loss": 0.4326, + "step": 909 + }, + { + "epoch": 0.4316888045540797, + "grad_norm": 3.148620843887329, + "learning_rate": 1.8244415603417603e-05, + "loss": 0.4355, + "step": 910 + }, + { + "epoch": 0.4321631878557875, + "grad_norm": 3.1337029933929443, + "learning_rate": 1.8240064979708397e-05, + "loss": 0.5506, + "step": 911 + }, + { + "epoch": 0.43263757115749524, + "grad_norm": 2.7768473625183105, + "learning_rate": 1.8235709491970366e-05, + "loss": 0.4683, + "step": 912 + }, + { + "epoch": 0.43311195445920303, + "grad_norm": 2.6969683170318604, + "learning_rate": 1.8231349142774525e-05, + "loss": 0.4588, + "step": 913 + }, + { + "epoch": 0.4335863377609108, + "grad_norm": 2.7988765239715576, + "learning_rate": 1.8226983934694732e-05, + "loss": 0.399, + "step": 914 + }, + { + "epoch": 0.4340607210626186, + "grad_norm": 3.1512439250946045, + "learning_rate": 1.8222613870307735e-05, + "loss": 0.5061, + "step": 915 + }, + { + "epoch": 0.43453510436432635, + "grad_norm": 2.885368824005127, + "learning_rate": 1.8218238952193136e-05, + "loss": 0.4037, + "step": 916 + }, + { + "epoch": 0.43500948766603414, + "grad_norm": 2.4949498176574707, + "learning_rate": 1.8213859182933407e-05, + "loss": 0.3942, + "step": 917 + }, + { + "epoch": 0.43548387096774194, + "grad_norm": 2.3052561283111572, + "learning_rate": 1.820947456511388e-05, + "loss": 0.4208, + "step": 918 + }, + { + "epoch": 0.43595825426944973, + "grad_norm": 2.642266273498535, + "learning_rate": 1.8205085101322754e-05, + "loss": 0.465, + "step": 919 + }, + { + "epoch": 0.4364326375711575, + "grad_norm": 3.87248158454895, + "learning_rate": 1.8200690794151087e-05, + "loss": 0.5536, + "step": 920 + }, + { + "epoch": 0.43690702087286526, + "grad_norm": 2.5663421154022217, + "learning_rate": 1.819629164619279e-05, + "loss": 0.4441, + "step": 921 + }, + { + "epoch": 0.43738140417457305, + "grad_norm": 2.330423593521118, + "learning_rate": 1.8191887660044646e-05, + "loss": 0.36, + "step": 922 + }, + { + "epoch": 0.43785578747628084, + "grad_norm": 2.401970386505127, + "learning_rate": 1.8187478838306273e-05, + "loss": 0.3381, + "step": 923 + }, + { + "epoch": 0.43833017077798864, + "grad_norm": 3.270221710205078, + "learning_rate": 1.818306518358016e-05, + "loss": 0.5003, + "step": 924 + }, + { + "epoch": 0.4388045540796964, + "grad_norm": 2.4524645805358887, + "learning_rate": 1.817864669847165e-05, + "loss": 0.4074, + "step": 925 + }, + { + "epoch": 0.43927893738140417, + "grad_norm": 3.223022222518921, + "learning_rate": 1.817422338558892e-05, + "loss": 0.5338, + "step": 926 + }, + { + "epoch": 0.43975332068311196, + "grad_norm": 2.6757102012634277, + "learning_rate": 1.8169795247543014e-05, + "loss": 0.4344, + "step": 927 + }, + { + "epoch": 0.44022770398481975, + "grad_norm": 3.086233139038086, + "learning_rate": 1.8165362286947817e-05, + "loss": 0.4607, + "step": 928 + }, + { + "epoch": 0.4407020872865275, + "grad_norm": 2.754612922668457, + "learning_rate": 1.8160924506420064e-05, + "loss": 0.3898, + "step": 929 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 4.099813461303711, + "learning_rate": 1.8156481908579326e-05, + "loss": 0.4333, + "step": 930 + }, + { + "epoch": 0.4416508538899431, + "grad_norm": 2.578651189804077, + "learning_rate": 1.815203449604803e-05, + "loss": 0.3853, + "step": 931 + }, + { + "epoch": 0.44212523719165087, + "grad_norm": 2.7206499576568604, + "learning_rate": 1.8147582271451443e-05, + "loss": 0.4229, + "step": 932 + }, + { + "epoch": 0.44259962049335866, + "grad_norm": 3.497983694076538, + "learning_rate": 1.814312523741766e-05, + "loss": 0.5558, + "step": 933 + }, + { + "epoch": 0.4430740037950664, + "grad_norm": 3.077042579650879, + "learning_rate": 1.8138663396577633e-05, + "loss": 0.4388, + "step": 934 + }, + { + "epoch": 0.4435483870967742, + "grad_norm": 2.824192523956299, + "learning_rate": 1.813419675156514e-05, + "loss": 0.4736, + "step": 935 + }, + { + "epoch": 0.444022770398482, + "grad_norm": 2.8267297744750977, + "learning_rate": 1.8129725305016793e-05, + "loss": 0.4546, + "step": 936 + }, + { + "epoch": 0.4444971537001898, + "grad_norm": 2.9733405113220215, + "learning_rate": 1.8125249059572042e-05, + "loss": 0.5409, + "step": 937 + }, + { + "epoch": 0.4449715370018975, + "grad_norm": 3.0438060760498047, + "learning_rate": 1.8120768017873178e-05, + "loss": 0.4126, + "step": 938 + }, + { + "epoch": 0.4454459203036053, + "grad_norm": 2.6118757724761963, + "learning_rate": 1.8116282182565313e-05, + "loss": 0.3854, + "step": 939 + }, + { + "epoch": 0.4459203036053131, + "grad_norm": 2.909747838973999, + "learning_rate": 1.8111791556296386e-05, + "loss": 0.5156, + "step": 940 + }, + { + "epoch": 0.4463946869070209, + "grad_norm": 2.6982431411743164, + "learning_rate": 1.8107296141717175e-05, + "loss": 0.3597, + "step": 941 + }, + { + "epoch": 0.4468690702087287, + "grad_norm": 3.628803253173828, + "learning_rate": 1.8102795941481277e-05, + "loss": 0.5069, + "step": 942 + }, + { + "epoch": 0.4473434535104364, + "grad_norm": 2.5725972652435303, + "learning_rate": 1.8098290958245116e-05, + "loss": 0.3675, + "step": 943 + }, + { + "epoch": 0.4478178368121442, + "grad_norm": 3.47405743598938, + "learning_rate": 1.8093781194667935e-05, + "loss": 0.4533, + "step": 944 + }, + { + "epoch": 0.448292220113852, + "grad_norm": 2.7190101146698, + "learning_rate": 1.808926665341181e-05, + "loss": 0.3759, + "step": 945 + }, + { + "epoch": 0.4487666034155598, + "grad_norm": 4.446865558624268, + "learning_rate": 1.8084747337141622e-05, + "loss": 0.3144, + "step": 946 + }, + { + "epoch": 0.44924098671726753, + "grad_norm": 2.6146459579467773, + "learning_rate": 1.8080223248525087e-05, + "loss": 0.4736, + "step": 947 + }, + { + "epoch": 0.4497153700189753, + "grad_norm": 3.1823534965515137, + "learning_rate": 1.8075694390232725e-05, + "loss": 0.5324, + "step": 948 + }, + { + "epoch": 0.4501897533206831, + "grad_norm": 3.1742212772369385, + "learning_rate": 1.8071160764937875e-05, + "loss": 0.4744, + "step": 949 + }, + { + "epoch": 0.4506641366223909, + "grad_norm": 2.7472152709960938, + "learning_rate": 1.8066622375316695e-05, + "loss": 0.3879, + "step": 950 + }, + { + "epoch": 0.45113851992409865, + "grad_norm": 2.5368971824645996, + "learning_rate": 1.8062079224048146e-05, + "loss": 0.4705, + "step": 951 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 2.812864065170288, + "learning_rate": 1.805753131381401e-05, + "loss": 0.4219, + "step": 952 + }, + { + "epoch": 0.45208728652751423, + "grad_norm": 2.2089133262634277, + "learning_rate": 1.8052978647298873e-05, + "loss": 0.387, + "step": 953 + }, + { + "epoch": 0.452561669829222, + "grad_norm": 2.8962409496307373, + "learning_rate": 1.804842122719013e-05, + "loss": 0.5197, + "step": 954 + }, + { + "epoch": 0.4530360531309298, + "grad_norm": 2.630946397781372, + "learning_rate": 1.8043859056177976e-05, + "loss": 0.3808, + "step": 955 + }, + { + "epoch": 0.45351043643263755, + "grad_norm": 2.4841771125793457, + "learning_rate": 1.803929213695542e-05, + "loss": 0.4099, + "step": 956 + }, + { + "epoch": 0.45398481973434535, + "grad_norm": 2.3919498920440674, + "learning_rate": 1.803472047221827e-05, + "loss": 0.3677, + "step": 957 + }, + { + "epoch": 0.45445920303605314, + "grad_norm": 2.786252498626709, + "learning_rate": 1.8030144064665127e-05, + "loss": 0.4811, + "step": 958 + }, + { + "epoch": 0.45493358633776093, + "grad_norm": 2.7596981525421143, + "learning_rate": 1.802556291699741e-05, + "loss": 0.4529, + "step": 959 + }, + { + "epoch": 0.45540796963946867, + "grad_norm": 2.7790467739105225, + "learning_rate": 1.8020977031919315e-05, + "loss": 0.4452, + "step": 960 + }, + { + "epoch": 0.45588235294117646, + "grad_norm": 3.088778018951416, + "learning_rate": 1.801638641213785e-05, + "loss": 0.4797, + "step": 961 + }, + { + "epoch": 0.45635673624288425, + "grad_norm": 2.6321218013763428, + "learning_rate": 1.801179106036281e-05, + "loss": 0.4205, + "step": 962 + }, + { + "epoch": 0.45683111954459205, + "grad_norm": 2.7974629402160645, + "learning_rate": 1.8007190979306793e-05, + "loss": 0.4557, + "step": 963 + }, + { + "epoch": 0.4573055028462998, + "grad_norm": 2.396785259246826, + "learning_rate": 1.800258617168517e-05, + "loss": 0.4265, + "step": 964 + }, + { + "epoch": 0.4577798861480076, + "grad_norm": 2.8968255519866943, + "learning_rate": 1.799797664021612e-05, + "loss": 0.3469, + "step": 965 + }, + { + "epoch": 0.45825426944971537, + "grad_norm": 2.8797080516815186, + "learning_rate": 1.7993362387620602e-05, + "loss": 0.4216, + "step": 966 + }, + { + "epoch": 0.45872865275142316, + "grad_norm": 3.1519381999969482, + "learning_rate": 1.798874341662237e-05, + "loss": 0.5129, + "step": 967 + }, + { + "epoch": 0.45920303605313095, + "grad_norm": 2.6369104385375977, + "learning_rate": 1.7984119729947944e-05, + "loss": 0.4251, + "step": 968 + }, + { + "epoch": 0.4596774193548387, + "grad_norm": 2.4390084743499756, + "learning_rate": 1.797949133032665e-05, + "loss": 0.4245, + "step": 969 + }, + { + "epoch": 0.4601518026565465, + "grad_norm": 2.8855037689208984, + "learning_rate": 1.7974858220490586e-05, + "loss": 0.4015, + "step": 970 + }, + { + "epoch": 0.4606261859582543, + "grad_norm": 2.9386088848114014, + "learning_rate": 1.7970220403174626e-05, + "loss": 0.4222, + "step": 971 + }, + { + "epoch": 0.46110056925996207, + "grad_norm": 2.18912410736084, + "learning_rate": 1.796557788111643e-05, + "loss": 0.3247, + "step": 972 + }, + { + "epoch": 0.4615749525616698, + "grad_norm": 3.183178663253784, + "learning_rate": 1.796093065705644e-05, + "loss": 0.5049, + "step": 973 + }, + { + "epoch": 0.4620493358633776, + "grad_norm": 2.8676347732543945, + "learning_rate": 1.7956278733737855e-05, + "loss": 0.458, + "step": 974 + }, + { + "epoch": 0.4625237191650854, + "grad_norm": 5.36901330947876, + "learning_rate": 1.7951622113906663e-05, + "loss": 0.4597, + "step": 975 + }, + { + "epoch": 0.4629981024667932, + "grad_norm": 2.528803586959839, + "learning_rate": 1.7946960800311623e-05, + "loss": 0.3229, + "step": 976 + }, + { + "epoch": 0.463472485768501, + "grad_norm": 2.7239816188812256, + "learning_rate": 1.7942294795704265e-05, + "loss": 0.4284, + "step": 977 + }, + { + "epoch": 0.4639468690702087, + "grad_norm": 2.4139273166656494, + "learning_rate": 1.7937624102838878e-05, + "loss": 0.3585, + "step": 978 + }, + { + "epoch": 0.4644212523719165, + "grad_norm": 2.762474775314331, + "learning_rate": 1.793294872447253e-05, + "loss": 0.3938, + "step": 979 + }, + { + "epoch": 0.4648956356736243, + "grad_norm": 2.676497459411621, + "learning_rate": 1.792826866336505e-05, + "loss": 0.4188, + "step": 980 + }, + { + "epoch": 0.4653700189753321, + "grad_norm": 3.0633938312530518, + "learning_rate": 1.792358392227903e-05, + "loss": 0.3947, + "step": 981 + }, + { + "epoch": 0.4658444022770398, + "grad_norm": 2.3310940265655518, + "learning_rate": 1.791889450397983e-05, + "loss": 0.3666, + "step": 982 + }, + { + "epoch": 0.4663187855787476, + "grad_norm": 3.144381523132324, + "learning_rate": 1.7914200411235562e-05, + "loss": 0.4274, + "step": 983 + }, + { + "epoch": 0.4667931688804554, + "grad_norm": 2.8709566593170166, + "learning_rate": 1.7909501646817108e-05, + "loss": 0.4486, + "step": 984 + }, + { + "epoch": 0.4672675521821632, + "grad_norm": 2.4950621128082275, + "learning_rate": 1.79047982134981e-05, + "loss": 0.4079, + "step": 985 + }, + { + "epoch": 0.46774193548387094, + "grad_norm": 2.85103178024292, + "learning_rate": 1.7900090114054925e-05, + "loss": 0.3948, + "step": 986 + }, + { + "epoch": 0.46821631878557873, + "grad_norm": 2.7091755867004395, + "learning_rate": 1.7895377351266737e-05, + "loss": 0.4352, + "step": 987 + }, + { + "epoch": 0.4686907020872865, + "grad_norm": 2.969698905944824, + "learning_rate": 1.7890659927915418e-05, + "loss": 0.4333, + "step": 988 + }, + { + "epoch": 0.4691650853889943, + "grad_norm": 2.8901190757751465, + "learning_rate": 1.7885937846785633e-05, + "loss": 0.4314, + "step": 989 + }, + { + "epoch": 0.4696394686907021, + "grad_norm": 2.5206494331359863, + "learning_rate": 1.7881211110664767e-05, + "loss": 0.4896, + "step": 990 + }, + { + "epoch": 0.47011385199240985, + "grad_norm": 2.556368589401245, + "learning_rate": 1.7876479722342972e-05, + "loss": 0.4165, + "step": 991 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 2.626251459121704, + "learning_rate": 1.7871743684613137e-05, + "loss": 0.5195, + "step": 992 + }, + { + "epoch": 0.47106261859582543, + "grad_norm": 2.428295373916626, + "learning_rate": 1.78670030002709e-05, + "loss": 0.4136, + "step": 993 + }, + { + "epoch": 0.4715370018975332, + "grad_norm": 2.7496650218963623, + "learning_rate": 1.786225767211464e-05, + "loss": 0.4622, + "step": 994 + }, + { + "epoch": 0.47201138519924096, + "grad_norm": 2.3996787071228027, + "learning_rate": 1.7857507702945472e-05, + "loss": 0.3481, + "step": 995 + }, + { + "epoch": 0.47248576850094876, + "grad_norm": 2.776946544647217, + "learning_rate": 1.7852753095567266e-05, + "loss": 0.4187, + "step": 996 + }, + { + "epoch": 0.47296015180265655, + "grad_norm": 2.833527088165283, + "learning_rate": 1.7847993852786612e-05, + "loss": 0.4635, + "step": 997 + }, + { + "epoch": 0.47343453510436434, + "grad_norm": 2.589299201965332, + "learning_rate": 1.7843229977412844e-05, + "loss": 0.4279, + "step": 998 + }, + { + "epoch": 0.47390891840607213, + "grad_norm": 2.5228946208953857, + "learning_rate": 1.7838461472258035e-05, + "loss": 0.4273, + "step": 999 + }, + { + "epoch": 0.47438330170777987, + "grad_norm": 2.441437244415283, + "learning_rate": 1.7833688340136982e-05, + "loss": 0.3756, + "step": 1000 + }, + { + "epoch": 0.47485768500948766, + "grad_norm": 3.258878231048584, + "learning_rate": 1.782891058386722e-05, + "loss": 0.4789, + "step": 1001 + }, + { + "epoch": 0.47533206831119545, + "grad_norm": 2.807469367980957, + "learning_rate": 1.782412820626901e-05, + "loss": 0.4335, + "step": 1002 + }, + { + "epoch": 0.47580645161290325, + "grad_norm": 2.546851873397827, + "learning_rate": 1.7819341210165347e-05, + "loss": 0.3704, + "step": 1003 + }, + { + "epoch": 0.476280834914611, + "grad_norm": 2.9630987644195557, + "learning_rate": 1.781454959838194e-05, + "loss": 0.4367, + "step": 1004 + }, + { + "epoch": 0.4767552182163188, + "grad_norm": 2.7907612323760986, + "learning_rate": 1.7809753373747235e-05, + "loss": 0.4591, + "step": 1005 + }, + { + "epoch": 0.47722960151802657, + "grad_norm": 3.0077595710754395, + "learning_rate": 1.7804952539092393e-05, + "loss": 0.4258, + "step": 1006 + }, + { + "epoch": 0.47770398481973436, + "grad_norm": 2.507061243057251, + "learning_rate": 1.78001470972513e-05, + "loss": 0.4579, + "step": 1007 + }, + { + "epoch": 0.4781783681214421, + "grad_norm": 2.3923323154449463, + "learning_rate": 1.7795337051060562e-05, + "loss": 0.3826, + "step": 1008 + }, + { + "epoch": 0.4786527514231499, + "grad_norm": 2.3353378772735596, + "learning_rate": 1.77905224033595e-05, + "loss": 0.3645, + "step": 1009 + }, + { + "epoch": 0.4791271347248577, + "grad_norm": 2.9613916873931885, + "learning_rate": 1.7785703156990153e-05, + "loss": 0.4481, + "step": 1010 + }, + { + "epoch": 0.4796015180265655, + "grad_norm": 3.13313889503479, + "learning_rate": 1.7780879314797278e-05, + "loss": 0.4703, + "step": 1011 + }, + { + "epoch": 0.48007590132827327, + "grad_norm": 3.504654884338379, + "learning_rate": 1.7776050879628338e-05, + "loss": 0.4535, + "step": 1012 + }, + { + "epoch": 0.480550284629981, + "grad_norm": 2.6365973949432373, + "learning_rate": 1.777121785433351e-05, + "loss": 0.3529, + "step": 1013 + }, + { + "epoch": 0.4810246679316888, + "grad_norm": 3.4946820735931396, + "learning_rate": 1.7766380241765682e-05, + "loss": 0.4124, + "step": 1014 + }, + { + "epoch": 0.4814990512333966, + "grad_norm": 2.5579440593719482, + "learning_rate": 1.776153804478045e-05, + "loss": 0.4045, + "step": 1015 + }, + { + "epoch": 0.4819734345351044, + "grad_norm": 2.6704046726226807, + "learning_rate": 1.7756691266236114e-05, + "loss": 0.4014, + "step": 1016 + }, + { + "epoch": 0.4824478178368121, + "grad_norm": 2.637950897216797, + "learning_rate": 1.7751839908993677e-05, + "loss": 0.4664, + "step": 1017 + }, + { + "epoch": 0.4829222011385199, + "grad_norm": 2.9768013954162598, + "learning_rate": 1.7746983975916852e-05, + "loss": 0.5027, + "step": 1018 + }, + { + "epoch": 0.4833965844402277, + "grad_norm": 2.8544719219207764, + "learning_rate": 1.7742123469872043e-05, + "loss": 0.479, + "step": 1019 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 2.228623151779175, + "learning_rate": 1.7737258393728363e-05, + "loss": 0.3347, + "step": 1020 + }, + { + "epoch": 0.4843453510436433, + "grad_norm": 2.553727865219116, + "learning_rate": 1.7732388750357618e-05, + "loss": 0.3965, + "step": 1021 + }, + { + "epoch": 0.484819734345351, + "grad_norm": 3.6200978755950928, + "learning_rate": 1.7727514542634308e-05, + "loss": 0.532, + "step": 1022 + }, + { + "epoch": 0.4852941176470588, + "grad_norm": 2.9352922439575195, + "learning_rate": 1.772263577343563e-05, + "loss": 0.428, + "step": 1023 + }, + { + "epoch": 0.4857685009487666, + "grad_norm": 2.7596585750579834, + "learning_rate": 1.7717752445641473e-05, + "loss": 0.4415, + "step": 1024 + }, + { + "epoch": 0.4862428842504744, + "grad_norm": 2.538545608520508, + "learning_rate": 1.771286456213442e-05, + "loss": 0.3995, + "step": 1025 + }, + { + "epoch": 0.48671726755218214, + "grad_norm": 2.592538833618164, + "learning_rate": 1.7707972125799738e-05, + "loss": 0.3906, + "step": 1026 + }, + { + "epoch": 0.48719165085388993, + "grad_norm": 2.813682794570923, + "learning_rate": 1.770307513952538e-05, + "loss": 0.409, + "step": 1027 + }, + { + "epoch": 0.4876660341555977, + "grad_norm": 2.8214876651763916, + "learning_rate": 1.7698173606201994e-05, + "loss": 0.4843, + "step": 1028 + }, + { + "epoch": 0.4881404174573055, + "grad_norm": 2.7187728881835938, + "learning_rate": 1.7693267528722907e-05, + "loss": 0.4048, + "step": 1029 + }, + { + "epoch": 0.48861480075901326, + "grad_norm": 3.217090368270874, + "learning_rate": 1.7688356909984125e-05, + "loss": 0.5723, + "step": 1030 + }, + { + "epoch": 0.48908918406072105, + "grad_norm": 2.610750675201416, + "learning_rate": 1.7683441752884337e-05, + "loss": 0.3835, + "step": 1031 + }, + { + "epoch": 0.48956356736242884, + "grad_norm": 2.7182836532592773, + "learning_rate": 1.7678522060324918e-05, + "loss": 0.4672, + "step": 1032 + }, + { + "epoch": 0.49003795066413663, + "grad_norm": 11.401104927062988, + "learning_rate": 1.7673597835209904e-05, + "loss": 0.3387, + "step": 1033 + }, + { + "epoch": 0.4905123339658444, + "grad_norm": 2.622774124145508, + "learning_rate": 1.766866908044602e-05, + "loss": 0.4282, + "step": 1034 + }, + { + "epoch": 0.49098671726755216, + "grad_norm": 2.8863613605499268, + "learning_rate": 1.7663735798942666e-05, + "loss": 0.4643, + "step": 1035 + }, + { + "epoch": 0.49146110056925996, + "grad_norm": 2.4292235374450684, + "learning_rate": 1.7658797993611907e-05, + "loss": 0.3813, + "step": 1036 + }, + { + "epoch": 0.49193548387096775, + "grad_norm": 2.83770751953125, + "learning_rate": 1.7653855667368474e-05, + "loss": 0.5212, + "step": 1037 + }, + { + "epoch": 0.49240986717267554, + "grad_norm": 2.406994581222534, + "learning_rate": 1.7648908823129788e-05, + "loss": 0.429, + "step": 1038 + }, + { + "epoch": 0.4928842504743833, + "grad_norm": 2.8495097160339355, + "learning_rate": 1.7643957463815904e-05, + "loss": 0.4773, + "step": 1039 + }, + { + "epoch": 0.49335863377609107, + "grad_norm": 2.946979522705078, + "learning_rate": 1.7639001592349575e-05, + "loss": 0.4433, + "step": 1040 + }, + { + "epoch": 0.49383301707779886, + "grad_norm": 2.636714220046997, + "learning_rate": 1.7634041211656193e-05, + "loss": 0.3856, + "step": 1041 + }, + { + "epoch": 0.49430740037950666, + "grad_norm": 2.825824737548828, + "learning_rate": 1.7629076324663827e-05, + "loss": 0.4985, + "step": 1042 + }, + { + "epoch": 0.49478178368121445, + "grad_norm": 2.4185476303100586, + "learning_rate": 1.7624106934303202e-05, + "loss": 0.4149, + "step": 1043 + }, + { + "epoch": 0.4952561669829222, + "grad_norm": 2.691755533218384, + "learning_rate": 1.7619133043507694e-05, + "loss": 0.4321, + "step": 1044 + }, + { + "epoch": 0.49573055028463, + "grad_norm": 2.816511392593384, + "learning_rate": 1.7614154655213343e-05, + "loss": 0.4777, + "step": 1045 + }, + { + "epoch": 0.49620493358633777, + "grad_norm": 2.25382924079895, + "learning_rate": 1.760917177235885e-05, + "loss": 0.4227, + "step": 1046 + }, + { + "epoch": 0.49667931688804556, + "grad_norm": 3.144374132156372, + "learning_rate": 1.7604184397885554e-05, + "loss": 0.4284, + "step": 1047 + }, + { + "epoch": 0.4971537001897533, + "grad_norm": 2.518986940383911, + "learning_rate": 1.7599192534737453e-05, + "loss": 0.4029, + "step": 1048 + }, + { + "epoch": 0.4976280834914611, + "grad_norm": 2.7416396141052246, + "learning_rate": 1.7594196185861198e-05, + "loss": 0.4459, + "step": 1049 + }, + { + "epoch": 0.4981024667931689, + "grad_norm": 3.26064395904541, + "learning_rate": 1.7589195354206082e-05, + "loss": 0.3789, + "step": 1050 + }, + { + "epoch": 0.4985768500948767, + "grad_norm": 3.0012292861938477, + "learning_rate": 1.7584190042724047e-05, + "loss": 0.5016, + "step": 1051 + }, + { + "epoch": 0.4990512333965844, + "grad_norm": 3.3961806297302246, + "learning_rate": 1.7579180254369674e-05, + "loss": 0.4325, + "step": 1052 + }, + { + "epoch": 0.4995256166982922, + "grad_norm": 2.4713120460510254, + "learning_rate": 1.75741659921002e-05, + "loss": 0.3976, + "step": 1053 + }, + { + "epoch": 0.5, + "grad_norm": 2.7608566284179688, + "learning_rate": 1.756914725887549e-05, + "loss": 0.4458, + "step": 1054 + }, + { + "epoch": 0.5004743833017078, + "grad_norm": 2.3729867935180664, + "learning_rate": 1.7564124057658057e-05, + "loss": 0.3763, + "step": 1055 + }, + { + "epoch": 0.5009487666034156, + "grad_norm": 2.746999979019165, + "learning_rate": 1.755909639141304e-05, + "loss": 0.4111, + "step": 1056 + }, + { + "epoch": 0.5014231499051234, + "grad_norm": 2.6905953884124756, + "learning_rate": 1.755406426310822e-05, + "loss": 0.4423, + "step": 1057 + }, + { + "epoch": 0.5018975332068312, + "grad_norm": 2.955181837081909, + "learning_rate": 1.754902767571402e-05, + "loss": 0.33, + "step": 1058 + }, + { + "epoch": 0.5023719165085389, + "grad_norm": 2.381403684616089, + "learning_rate": 1.754398663220348e-05, + "loss": 0.4346, + "step": 1059 + }, + { + "epoch": 0.5028462998102466, + "grad_norm": 2.5319292545318604, + "learning_rate": 1.7538941135552286e-05, + "loss": 0.4469, + "step": 1060 + }, + { + "epoch": 0.5033206831119544, + "grad_norm": 2.7511696815490723, + "learning_rate": 1.7533891188738738e-05, + "loss": 0.4318, + "step": 1061 + }, + { + "epoch": 0.5037950664136622, + "grad_norm": 2.6034233570098877, + "learning_rate": 1.7528836794743776e-05, + "loss": 0.3468, + "step": 1062 + }, + { + "epoch": 0.50426944971537, + "grad_norm": 2.702395439147949, + "learning_rate": 1.752377795655095e-05, + "loss": 0.4786, + "step": 1063 + }, + { + "epoch": 0.5047438330170778, + "grad_norm": 2.741300582885742, + "learning_rate": 1.751871467714645e-05, + "loss": 0.3856, + "step": 1064 + }, + { + "epoch": 0.5052182163187856, + "grad_norm": 2.5689239501953125, + "learning_rate": 1.751364695951908e-05, + "loss": 0.3626, + "step": 1065 + }, + { + "epoch": 0.5056925996204934, + "grad_norm": 2.544797897338867, + "learning_rate": 1.7508574806660256e-05, + "loss": 0.4253, + "step": 1066 + }, + { + "epoch": 0.5061669829222012, + "grad_norm": 2.194748640060425, + "learning_rate": 1.7503498221564026e-05, + "loss": 0.322, + "step": 1067 + }, + { + "epoch": 0.5066413662239089, + "grad_norm": 2.4700746536254883, + "learning_rate": 1.7498417207227046e-05, + "loss": 0.3716, + "step": 1068 + }, + { + "epoch": 0.5071157495256167, + "grad_norm": 3.115464687347412, + "learning_rate": 1.7493331766648588e-05, + "loss": 0.4376, + "step": 1069 + }, + { + "epoch": 0.5075901328273245, + "grad_norm": 2.6482186317443848, + "learning_rate": 1.748824190283054e-05, + "loss": 0.3741, + "step": 1070 + }, + { + "epoch": 0.5080645161290323, + "grad_norm": 2.6619374752044678, + "learning_rate": 1.7483147618777393e-05, + "loss": 0.4422, + "step": 1071 + }, + { + "epoch": 0.50853889943074, + "grad_norm": 2.94618558883667, + "learning_rate": 1.7478048917496255e-05, + "loss": 0.4338, + "step": 1072 + }, + { + "epoch": 0.5090132827324478, + "grad_norm": 3.4346418380737305, + "learning_rate": 1.7472945801996842e-05, + "loss": 0.5294, + "step": 1073 + }, + { + "epoch": 0.5094876660341556, + "grad_norm": 2.2107574939727783, + "learning_rate": 1.7467838275291467e-05, + "loss": 0.3451, + "step": 1074 + }, + { + "epoch": 0.5099620493358634, + "grad_norm": 2.622593641281128, + "learning_rate": 1.746272634039506e-05, + "loss": 0.4888, + "step": 1075 + }, + { + "epoch": 0.5104364326375711, + "grad_norm": 2.8133835792541504, + "learning_rate": 1.745761000032514e-05, + "loss": 0.45, + "step": 1076 + }, + { + "epoch": 0.5109108159392789, + "grad_norm": 1.8900046348571777, + "learning_rate": 1.745248925810183e-05, + "loss": 0.3165, + "step": 1077 + }, + { + "epoch": 0.5113851992409867, + "grad_norm": 2.4775540828704834, + "learning_rate": 1.744736411674786e-05, + "loss": 0.3891, + "step": 1078 + }, + { + "epoch": 0.5118595825426945, + "grad_norm": 2.9898617267608643, + "learning_rate": 1.7442234579288543e-05, + "loss": 0.4028, + "step": 1079 + }, + { + "epoch": 0.5123339658444023, + "grad_norm": 2.6482555866241455, + "learning_rate": 1.74371006487518e-05, + "loss": 0.445, + "step": 1080 + }, + { + "epoch": 0.5128083491461101, + "grad_norm": 3.1871161460876465, + "learning_rate": 1.743196232816814e-05, + "loss": 0.3826, + "step": 1081 + }, + { + "epoch": 0.5132827324478179, + "grad_norm": 2.722008466720581, + "learning_rate": 1.7426819620570655e-05, + "loss": 0.4419, + "step": 1082 + }, + { + "epoch": 0.5137571157495257, + "grad_norm": 2.5276153087615967, + "learning_rate": 1.7421672528995043e-05, + "loss": 0.4635, + "step": 1083 + }, + { + "epoch": 0.5142314990512334, + "grad_norm": 2.950514554977417, + "learning_rate": 1.7416521056479577e-05, + "loss": 0.3816, + "step": 1084 + }, + { + "epoch": 0.5147058823529411, + "grad_norm": 2.382774829864502, + "learning_rate": 1.741136520606512e-05, + "loss": 0.3931, + "step": 1085 + }, + { + "epoch": 0.5151802656546489, + "grad_norm": 2.6314048767089844, + "learning_rate": 1.7406204980795124e-05, + "loss": 0.4377, + "step": 1086 + }, + { + "epoch": 0.5156546489563567, + "grad_norm": 3.256890296936035, + "learning_rate": 1.740104038371561e-05, + "loss": 0.5105, + "step": 1087 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 2.571377992630005, + "learning_rate": 1.7395871417875198e-05, + "loss": 0.4192, + "step": 1088 + }, + { + "epoch": 0.5166034155597723, + "grad_norm": 2.945387601852417, + "learning_rate": 1.7390698086325072e-05, + "loss": 0.3837, + "step": 1089 + }, + { + "epoch": 0.5170777988614801, + "grad_norm": 3.032877206802368, + "learning_rate": 1.7385520392118998e-05, + "loss": 0.4706, + "step": 1090 + }, + { + "epoch": 0.5175521821631879, + "grad_norm": 3.0032753944396973, + "learning_rate": 1.7380338338313322e-05, + "loss": 0.4447, + "step": 1091 + }, + { + "epoch": 0.5180265654648957, + "grad_norm": 2.387190103530884, + "learning_rate": 1.7375151927966954e-05, + "loss": 0.386, + "step": 1092 + }, + { + "epoch": 0.5185009487666035, + "grad_norm": 3.4106404781341553, + "learning_rate": 1.7369961164141383e-05, + "loss": 0.5009, + "step": 1093 + }, + { + "epoch": 0.5189753320683111, + "grad_norm": 2.958599805831909, + "learning_rate": 1.736476604990067e-05, + "loss": 0.4794, + "step": 1094 + }, + { + "epoch": 0.5194497153700189, + "grad_norm": 2.795013189315796, + "learning_rate": 1.735956658831143e-05, + "loss": 0.4323, + "step": 1095 + }, + { + "epoch": 0.5199240986717267, + "grad_norm": 2.330531120300293, + "learning_rate": 1.7354362782442864e-05, + "loss": 0.3815, + "step": 1096 + }, + { + "epoch": 0.5203984819734345, + "grad_norm": 2.7681827545166016, + "learning_rate": 1.734915463536672e-05, + "loss": 0.4358, + "step": 1097 + }, + { + "epoch": 0.5208728652751423, + "grad_norm": 2.7465672492980957, + "learning_rate": 1.7343942150157315e-05, + "loss": 0.4384, + "step": 1098 + }, + { + "epoch": 0.5213472485768501, + "grad_norm": 2.6306185722351074, + "learning_rate": 1.7338725329891532e-05, + "loss": 0.4233, + "step": 1099 + }, + { + "epoch": 0.5218216318785579, + "grad_norm": 2.5246400833129883, + "learning_rate": 1.7333504177648806e-05, + "loss": 0.3583, + "step": 1100 + }, + { + "epoch": 0.5222960151802657, + "grad_norm": 2.543013334274292, + "learning_rate": 1.7328278696511135e-05, + "loss": 0.4036, + "step": 1101 + }, + { + "epoch": 0.5227703984819735, + "grad_norm": 3.1978096961975098, + "learning_rate": 1.7323048889563062e-05, + "loss": 0.489, + "step": 1102 + }, + { + "epoch": 0.5232447817836812, + "grad_norm": 2.8943567276000977, + "learning_rate": 1.73178147598917e-05, + "loss": 0.5043, + "step": 1103 + }, + { + "epoch": 0.523719165085389, + "grad_norm": 2.635658025741577, + "learning_rate": 1.7312576310586692e-05, + "loss": 0.4701, + "step": 1104 + }, + { + "epoch": 0.5241935483870968, + "grad_norm": 2.4131529331207275, + "learning_rate": 1.730733354474025e-05, + "loss": 0.4144, + "step": 1105 + }, + { + "epoch": 0.5246679316888045, + "grad_norm": 2.3183786869049072, + "learning_rate": 1.730208646544713e-05, + "loss": 0.4114, + "step": 1106 + }, + { + "epoch": 0.5251423149905123, + "grad_norm": 2.472311496734619, + "learning_rate": 1.729683507580462e-05, + "loss": 0.3944, + "step": 1107 + }, + { + "epoch": 0.5256166982922201, + "grad_norm": 2.4466824531555176, + "learning_rate": 1.7291579378912574e-05, + "loss": 0.4335, + "step": 1108 + }, + { + "epoch": 0.5260910815939279, + "grad_norm": 2.289536237716675, + "learning_rate": 1.7286319377873374e-05, + "loss": 0.3746, + "step": 1109 + }, + { + "epoch": 0.5265654648956357, + "grad_norm": 2.8903002738952637, + "learning_rate": 1.7281055075791946e-05, + "loss": 0.4114, + "step": 1110 + }, + { + "epoch": 0.5270398481973435, + "grad_norm": 2.633312225341797, + "learning_rate": 1.7275786475775757e-05, + "loss": 0.3702, + "step": 1111 + }, + { + "epoch": 0.5275142314990512, + "grad_norm": 3.3855140209198, + "learning_rate": 1.7270513580934805e-05, + "loss": 0.4827, + "step": 1112 + }, + { + "epoch": 0.527988614800759, + "grad_norm": 2.293537139892578, + "learning_rate": 1.7265236394381634e-05, + "loss": 0.3591, + "step": 1113 + }, + { + "epoch": 0.5284629981024668, + "grad_norm": 2.2649781703948975, + "learning_rate": 1.725995491923131e-05, + "loss": 0.3496, + "step": 1114 + }, + { + "epoch": 0.5289373814041746, + "grad_norm": 2.7540788650512695, + "learning_rate": 1.725466915860144e-05, + "loss": 0.4094, + "step": 1115 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 2.8190548419952393, + "learning_rate": 1.7249379115612154e-05, + "loss": 0.442, + "step": 1116 + }, + { + "epoch": 0.5298861480075902, + "grad_norm": 2.757744789123535, + "learning_rate": 1.7244084793386108e-05, + "loss": 0.3825, + "step": 1117 + }, + { + "epoch": 0.530360531309298, + "grad_norm": 3.2028768062591553, + "learning_rate": 1.7238786195048493e-05, + "loss": 0.4394, + "step": 1118 + }, + { + "epoch": 0.5308349146110057, + "grad_norm": 2.5741443634033203, + "learning_rate": 1.7233483323727018e-05, + "loss": 0.3955, + "step": 1119 + }, + { + "epoch": 0.5313092979127134, + "grad_norm": 3.050013303756714, + "learning_rate": 1.7228176182551917e-05, + "loss": 0.4598, + "step": 1120 + }, + { + "epoch": 0.5317836812144212, + "grad_norm": 2.7212207317352295, + "learning_rate": 1.722286477465594e-05, + "loss": 0.4926, + "step": 1121 + }, + { + "epoch": 0.532258064516129, + "grad_norm": 2.9433910846710205, + "learning_rate": 1.721754910317436e-05, + "loss": 0.4738, + "step": 1122 + }, + { + "epoch": 0.5327324478178368, + "grad_norm": 2.578136444091797, + "learning_rate": 1.7212229171244966e-05, + "loss": 0.3956, + "step": 1123 + }, + { + "epoch": 0.5332068311195446, + "grad_norm": 2.4337382316589355, + "learning_rate": 1.7206904982008058e-05, + "loss": 0.3345, + "step": 1124 + }, + { + "epoch": 0.5336812144212524, + "grad_norm": 2.611060619354248, + "learning_rate": 1.7201576538606458e-05, + "loss": 0.4081, + "step": 1125 + }, + { + "epoch": 0.5341555977229602, + "grad_norm": 2.9734725952148438, + "learning_rate": 1.7196243844185488e-05, + "loss": 0.4356, + "step": 1126 + }, + { + "epoch": 0.534629981024668, + "grad_norm": 2.1246142387390137, + "learning_rate": 1.7190906901892986e-05, + "loss": 0.412, + "step": 1127 + }, + { + "epoch": 0.5351043643263758, + "grad_norm": 2.533374547958374, + "learning_rate": 1.7185565714879295e-05, + "loss": 0.4051, + "step": 1128 + }, + { + "epoch": 0.5355787476280834, + "grad_norm": 2.476228713989258, + "learning_rate": 1.718022028629727e-05, + "loss": 0.4775, + "step": 1129 + }, + { + "epoch": 0.5360531309297912, + "grad_norm": 2.6208913326263428, + "learning_rate": 1.7174870619302263e-05, + "loss": 0.3889, + "step": 1130 + }, + { + "epoch": 0.536527514231499, + "grad_norm": 2.2168285846710205, + "learning_rate": 1.7169516717052122e-05, + "loss": 0.3205, + "step": 1131 + }, + { + "epoch": 0.5370018975332068, + "grad_norm": 2.6198856830596924, + "learning_rate": 1.7164158582707215e-05, + "loss": 0.3413, + "step": 1132 + }, + { + "epoch": 0.5374762808349146, + "grad_norm": 2.9443211555480957, + "learning_rate": 1.715879621943038e-05, + "loss": 0.5137, + "step": 1133 + }, + { + "epoch": 0.5379506641366224, + "grad_norm": 2.5952861309051514, + "learning_rate": 1.7153429630386985e-05, + "loss": 0.4471, + "step": 1134 + }, + { + "epoch": 0.5384250474383302, + "grad_norm": 2.5865321159362793, + "learning_rate": 1.714805881874486e-05, + "loss": 0.3915, + "step": 1135 + }, + { + "epoch": 0.538899430740038, + "grad_norm": 2.849581003189087, + "learning_rate": 1.7142683787674353e-05, + "loss": 0.4406, + "step": 1136 + }, + { + "epoch": 0.5393738140417458, + "grad_norm": 2.4955079555511475, + "learning_rate": 1.713730454034828e-05, + "loss": 0.4032, + "step": 1137 + }, + { + "epoch": 0.5398481973434535, + "grad_norm": 2.804607391357422, + "learning_rate": 1.7131921079941965e-05, + "loss": 0.354, + "step": 1138 + }, + { + "epoch": 0.5403225806451613, + "grad_norm": 2.3211519718170166, + "learning_rate": 1.7126533409633214e-05, + "loss": 0.3228, + "step": 1139 + }, + { + "epoch": 0.540796963946869, + "grad_norm": 2.563812732696533, + "learning_rate": 1.7121141532602306e-05, + "loss": 0.407, + "step": 1140 + }, + { + "epoch": 0.5412713472485768, + "grad_norm": 2.5948188304901123, + "learning_rate": 1.7115745452032023e-05, + "loss": 0.3885, + "step": 1141 + }, + { + "epoch": 0.5417457305502846, + "grad_norm": 2.253960371017456, + "learning_rate": 1.711034517110761e-05, + "loss": 0.4199, + "step": 1142 + }, + { + "epoch": 0.5422201138519924, + "grad_norm": 2.6741251945495605, + "learning_rate": 1.7104940693016803e-05, + "loss": 0.4642, + "step": 1143 + }, + { + "epoch": 0.5426944971537002, + "grad_norm": 2.9957103729248047, + "learning_rate": 1.709953202094981e-05, + "loss": 0.3952, + "step": 1144 + }, + { + "epoch": 0.543168880455408, + "grad_norm": 2.8041434288024902, + "learning_rate": 1.7094119158099318e-05, + "loss": 0.3991, + "step": 1145 + }, + { + "epoch": 0.5436432637571158, + "grad_norm": 2.4647340774536133, + "learning_rate": 1.708870210766049e-05, + "loss": 0.3622, + "step": 1146 + }, + { + "epoch": 0.5441176470588235, + "grad_norm": 3.1950418949127197, + "learning_rate": 1.708328087283095e-05, + "loss": 0.4606, + "step": 1147 + }, + { + "epoch": 0.5445920303605313, + "grad_norm": 2.3248064517974854, + "learning_rate": 1.7077855456810803e-05, + "loss": 0.3685, + "step": 1148 + }, + { + "epoch": 0.5450664136622391, + "grad_norm": 3.027299404144287, + "learning_rate": 1.7072425862802618e-05, + "loss": 0.4978, + "step": 1149 + }, + { + "epoch": 0.5455407969639469, + "grad_norm": 3.0360865592956543, + "learning_rate": 1.706699209401143e-05, + "loss": 0.3587, + "step": 1150 + }, + { + "epoch": 0.5460151802656547, + "grad_norm": 2.7599403858184814, + "learning_rate": 1.7061554153644743e-05, + "loss": 0.4213, + "step": 1151 + }, + { + "epoch": 0.5464895635673624, + "grad_norm": 3.035600423812866, + "learning_rate": 1.7056112044912513e-05, + "loss": 0.4591, + "step": 1152 + }, + { + "epoch": 0.5469639468690702, + "grad_norm": 2.4583334922790527, + "learning_rate": 1.705066577102717e-05, + "loss": 0.4092, + "step": 1153 + }, + { + "epoch": 0.547438330170778, + "grad_norm": 2.438873529434204, + "learning_rate": 1.704521533520359e-05, + "loss": 0.4513, + "step": 1154 + }, + { + "epoch": 0.5479127134724858, + "grad_norm": 2.434335708618164, + "learning_rate": 1.703976074065911e-05, + "loss": 0.3763, + "step": 1155 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 2.7896952629089355, + "learning_rate": 1.703430199061353e-05, + "loss": 0.4478, + "step": 1156 + }, + { + "epoch": 0.5488614800759013, + "grad_norm": 3.330437183380127, + "learning_rate": 1.7028839088289092e-05, + "loss": 0.4884, + "step": 1157 + }, + { + "epoch": 0.5493358633776091, + "grad_norm": 2.2704224586486816, + "learning_rate": 1.702337203691049e-05, + "loss": 0.3599, + "step": 1158 + }, + { + "epoch": 0.5498102466793169, + "grad_norm": 2.6077935695648193, + "learning_rate": 1.7017900839704877e-05, + "loss": 0.439, + "step": 1159 + }, + { + "epoch": 0.5502846299810247, + "grad_norm": 2.6503183841705322, + "learning_rate": 1.7012425499901842e-05, + "loss": 0.443, + "step": 1160 + }, + { + "epoch": 0.5507590132827325, + "grad_norm": 2.3090269565582275, + "learning_rate": 1.7006946020733426e-05, + "loss": 0.4096, + "step": 1161 + }, + { + "epoch": 0.5512333965844403, + "grad_norm": 2.1434450149536133, + "learning_rate": 1.700146240543411e-05, + "loss": 0.373, + "step": 1162 + }, + { + "epoch": 0.551707779886148, + "grad_norm": 2.830927610397339, + "learning_rate": 1.699597465724082e-05, + "loss": 0.4866, + "step": 1163 + }, + { + "epoch": 0.5521821631878557, + "grad_norm": 2.427565336227417, + "learning_rate": 1.6990482779392918e-05, + "loss": 0.3733, + "step": 1164 + }, + { + "epoch": 0.5526565464895635, + "grad_norm": 2.7859416007995605, + "learning_rate": 1.6984986775132202e-05, + "loss": 0.4309, + "step": 1165 + }, + { + "epoch": 0.5531309297912713, + "grad_norm": 3.880078077316284, + "learning_rate": 1.6979486647702917e-05, + "loss": 0.6485, + "step": 1166 + }, + { + "epoch": 0.5536053130929791, + "grad_norm": 2.7935032844543457, + "learning_rate": 1.6973982400351726e-05, + "loss": 0.4619, + "step": 1167 + }, + { + "epoch": 0.5540796963946869, + "grad_norm": 2.788963794708252, + "learning_rate": 1.6968474036327733e-05, + "loss": 0.3934, + "step": 1168 + }, + { + "epoch": 0.5545540796963947, + "grad_norm": 2.3804471492767334, + "learning_rate": 1.6962961558882476e-05, + "loss": 0.4006, + "step": 1169 + }, + { + "epoch": 0.5550284629981025, + "grad_norm": 2.1218841075897217, + "learning_rate": 1.6957444971269907e-05, + "loss": 0.3862, + "step": 1170 + }, + { + "epoch": 0.5555028462998103, + "grad_norm": 2.6136226654052734, + "learning_rate": 1.6951924276746425e-05, + "loss": 0.4073, + "step": 1171 + }, + { + "epoch": 0.5559772296015181, + "grad_norm": 2.710028886795044, + "learning_rate": 1.694639947857083e-05, + "loss": 0.4464, + "step": 1172 + }, + { + "epoch": 0.5564516129032258, + "grad_norm": 2.524308204650879, + "learning_rate": 1.6940870580004364e-05, + "loss": 0.4632, + "step": 1173 + }, + { + "epoch": 0.5569259962049335, + "grad_norm": 2.7076687812805176, + "learning_rate": 1.6935337584310674e-05, + "loss": 0.4706, + "step": 1174 + }, + { + "epoch": 0.5574003795066413, + "grad_norm": 2.7183334827423096, + "learning_rate": 1.6929800494755836e-05, + "loss": 0.4462, + "step": 1175 + }, + { + "epoch": 0.5578747628083491, + "grad_norm": 2.5758368968963623, + "learning_rate": 1.692425931460834e-05, + "loss": 0.3369, + "step": 1176 + }, + { + "epoch": 0.5583491461100569, + "grad_norm": 2.9988672733306885, + "learning_rate": 1.691871404713909e-05, + "loss": 0.5041, + "step": 1177 + }, + { + "epoch": 0.5588235294117647, + "grad_norm": 2.278681516647339, + "learning_rate": 1.69131646956214e-05, + "loss": 0.4227, + "step": 1178 + }, + { + "epoch": 0.5592979127134725, + "grad_norm": 2.4047083854675293, + "learning_rate": 1.6907611263331004e-05, + "loss": 0.3633, + "step": 1179 + }, + { + "epoch": 0.5597722960151803, + "grad_norm": 2.6348397731781006, + "learning_rate": 1.6902053753546026e-05, + "loss": 0.4034, + "step": 1180 + }, + { + "epoch": 0.5602466793168881, + "grad_norm": 2.6245360374450684, + "learning_rate": 1.6896492169547022e-05, + "loss": 0.4203, + "step": 1181 + }, + { + "epoch": 0.5607210626185958, + "grad_norm": 2.4060757160186768, + "learning_rate": 1.6890926514616926e-05, + "loss": 0.3561, + "step": 1182 + }, + { + "epoch": 0.5611954459203036, + "grad_norm": 4.168298244476318, + "learning_rate": 1.6885356792041107e-05, + "loss": 0.4976, + "step": 1183 + }, + { + "epoch": 0.5616698292220114, + "grad_norm": 2.4673924446105957, + "learning_rate": 1.68797830051073e-05, + "loss": 0.4339, + "step": 1184 + }, + { + "epoch": 0.5621442125237192, + "grad_norm": 2.879330635070801, + "learning_rate": 1.6874205157105667e-05, + "loss": 0.4091, + "step": 1185 + }, + { + "epoch": 0.562618595825427, + "grad_norm": 2.9739437103271484, + "learning_rate": 1.686862325132875e-05, + "loss": 0.379, + "step": 1186 + }, + { + "epoch": 0.5630929791271347, + "grad_norm": 2.463998317718506, + "learning_rate": 1.68630372910715e-05, + "loss": 0.394, + "step": 1187 + }, + { + "epoch": 0.5635673624288425, + "grad_norm": 3.4014320373535156, + "learning_rate": 1.685744727963125e-05, + "loss": 0.4995, + "step": 1188 + }, + { + "epoch": 0.5640417457305503, + "grad_norm": 3.1018409729003906, + "learning_rate": 1.685185322030772e-05, + "loss": 0.3801, + "step": 1189 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 3.028447151184082, + "learning_rate": 1.6846255116403044e-05, + "loss": 0.4118, + "step": 1190 + }, + { + "epoch": 0.5649905123339658, + "grad_norm": 3.2124314308166504, + "learning_rate": 1.6840652971221714e-05, + "loss": 0.3829, + "step": 1191 + }, + { + "epoch": 0.5654648956356736, + "grad_norm": 3.856614351272583, + "learning_rate": 1.683504678807063e-05, + "loss": 0.4024, + "step": 1192 + }, + { + "epoch": 0.5659392789373814, + "grad_norm": 2.8994805812835693, + "learning_rate": 1.6829436570259064e-05, + "loss": 0.469, + "step": 1193 + }, + { + "epoch": 0.5664136622390892, + "grad_norm": 2.968153953552246, + "learning_rate": 1.6823822321098667e-05, + "loss": 0.4357, + "step": 1194 + }, + { + "epoch": 0.566888045540797, + "grad_norm": 2.9536709785461426, + "learning_rate": 1.681820404390348e-05, + "loss": 0.4572, + "step": 1195 + }, + { + "epoch": 0.5673624288425048, + "grad_norm": 2.574279546737671, + "learning_rate": 1.6812581741989915e-05, + "loss": 0.3608, + "step": 1196 + }, + { + "epoch": 0.5678368121442126, + "grad_norm": 2.5156452655792236, + "learning_rate": 1.680695541867676e-05, + "loss": 0.402, + "step": 1197 + }, + { + "epoch": 0.5683111954459203, + "grad_norm": 3.1304829120635986, + "learning_rate": 1.680132507728518e-05, + "loss": 0.3993, + "step": 1198 + }, + { + "epoch": 0.568785578747628, + "grad_norm": 3.0311906337738037, + "learning_rate": 1.679569072113871e-05, + "loss": 0.3883, + "step": 1199 + }, + { + "epoch": 0.5692599620493358, + "grad_norm": 2.762141227722168, + "learning_rate": 1.6790052353563254e-05, + "loss": 0.3877, + "step": 1200 + }, + { + "epoch": 0.5697343453510436, + "grad_norm": 2.4210193157196045, + "learning_rate": 1.678440997788708e-05, + "loss": 0.3758, + "step": 1201 + }, + { + "epoch": 0.5702087286527514, + "grad_norm": 2.446246862411499, + "learning_rate": 1.6778763597440833e-05, + "loss": 0.3929, + "step": 1202 + }, + { + "epoch": 0.5706831119544592, + "grad_norm": 2.542310953140259, + "learning_rate": 1.6773113215557514e-05, + "loss": 0.4038, + "step": 1203 + }, + { + "epoch": 0.571157495256167, + "grad_norm": 2.6096861362457275, + "learning_rate": 1.6767458835572487e-05, + "loss": 0.421, + "step": 1204 + }, + { + "epoch": 0.5716318785578748, + "grad_norm": 2.5336506366729736, + "learning_rate": 1.6761800460823473e-05, + "loss": 0.4722, + "step": 1205 + }, + { + "epoch": 0.5721062618595826, + "grad_norm": 2.657362461090088, + "learning_rate": 1.6756138094650563e-05, + "loss": 0.4266, + "step": 1206 + }, + { + "epoch": 0.5725806451612904, + "grad_norm": 2.6421406269073486, + "learning_rate": 1.6750471740396187e-05, + "loss": 0.4215, + "step": 1207 + }, + { + "epoch": 0.573055028462998, + "grad_norm": 2.54229998588562, + "learning_rate": 1.6744801401405138e-05, + "loss": 0.4456, + "step": 1208 + }, + { + "epoch": 0.5735294117647058, + "grad_norm": 3.0632123947143555, + "learning_rate": 1.6739127081024574e-05, + "loss": 0.3799, + "step": 1209 + }, + { + "epoch": 0.5740037950664136, + "grad_norm": 2.753382921218872, + "learning_rate": 1.673344878260397e-05, + "loss": 0.4764, + "step": 1210 + }, + { + "epoch": 0.5744781783681214, + "grad_norm": 2.06011700630188, + "learning_rate": 1.6727766509495186e-05, + "loss": 0.3336, + "step": 1211 + }, + { + "epoch": 0.5749525616698292, + "grad_norm": 2.405019760131836, + "learning_rate": 1.6722080265052407e-05, + "loss": 0.4868, + "step": 1212 + }, + { + "epoch": 0.575426944971537, + "grad_norm": 2.274955987930298, + "learning_rate": 1.671639005263216e-05, + "loss": 0.3628, + "step": 1213 + }, + { + "epoch": 0.5759013282732448, + "grad_norm": 3.2761285305023193, + "learning_rate": 1.671069587559333e-05, + "loss": 0.4588, + "step": 1214 + }, + { + "epoch": 0.5763757115749526, + "grad_norm": 3.0697081089019775, + "learning_rate": 1.6704997737297134e-05, + "loss": 0.43, + "step": 1215 + }, + { + "epoch": 0.5768500948766604, + "grad_norm": 2.4201908111572266, + "learning_rate": 1.6699295641107116e-05, + "loss": 0.3929, + "step": 1216 + }, + { + "epoch": 0.5773244781783681, + "grad_norm": 2.5585038661956787, + "learning_rate": 1.6693589590389176e-05, + "loss": 0.4479, + "step": 1217 + }, + { + "epoch": 0.5777988614800759, + "grad_norm": 2.488297939300537, + "learning_rate": 1.668787958851153e-05, + "loss": 0.4259, + "step": 1218 + }, + { + "epoch": 0.5782732447817837, + "grad_norm": 2.3524911403656006, + "learning_rate": 1.6682165638844753e-05, + "loss": 0.4478, + "step": 1219 + }, + { + "epoch": 0.5787476280834914, + "grad_norm": 2.2819252014160156, + "learning_rate": 1.6676447744761715e-05, + "loss": 0.3464, + "step": 1220 + }, + { + "epoch": 0.5792220113851992, + "grad_norm": 2.497574806213379, + "learning_rate": 1.6670725909637645e-05, + "loss": 0.4008, + "step": 1221 + }, + { + "epoch": 0.579696394686907, + "grad_norm": 2.4774222373962402, + "learning_rate": 1.6665000136850076e-05, + "loss": 0.4056, + "step": 1222 + }, + { + "epoch": 0.5801707779886148, + "grad_norm": 3.251617908477783, + "learning_rate": 1.665927042977888e-05, + "loss": 0.4725, + "step": 1223 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 5.06425666809082, + "learning_rate": 1.6653536791806248e-05, + "loss": 0.4183, + "step": 1224 + }, + { + "epoch": 0.5811195445920304, + "grad_norm": 2.323410749435425, + "learning_rate": 1.6647799226316684e-05, + "loss": 0.3405, + "step": 1225 + }, + { + "epoch": 0.5815939278937381, + "grad_norm": 2.2724153995513916, + "learning_rate": 1.6642057736697023e-05, + "loss": 0.3668, + "step": 1226 + }, + { + "epoch": 0.5820683111954459, + "grad_norm": 2.63533878326416, + "learning_rate": 1.6636312326336402e-05, + "loss": 0.458, + "step": 1227 + }, + { + "epoch": 0.5825426944971537, + "grad_norm": 3.030594825744629, + "learning_rate": 1.6630562998626287e-05, + "loss": 0.3992, + "step": 1228 + }, + { + "epoch": 0.5830170777988615, + "grad_norm": 2.6020584106445312, + "learning_rate": 1.6624809756960445e-05, + "loss": 0.4317, + "step": 1229 + }, + { + "epoch": 0.5834914611005693, + "grad_norm": 2.6600024700164795, + "learning_rate": 1.6619052604734958e-05, + "loss": 0.3952, + "step": 1230 + }, + { + "epoch": 0.5839658444022771, + "grad_norm": 2.172214984893799, + "learning_rate": 1.661329154534822e-05, + "loss": 0.343, + "step": 1231 + }, + { + "epoch": 0.5844402277039848, + "grad_norm": 2.652944803237915, + "learning_rate": 1.6607526582200918e-05, + "loss": 0.4041, + "step": 1232 + }, + { + "epoch": 0.5849146110056926, + "grad_norm": 2.3528411388397217, + "learning_rate": 1.6601757718696065e-05, + "loss": 0.4106, + "step": 1233 + }, + { + "epoch": 0.5853889943074004, + "grad_norm": 2.205167770385742, + "learning_rate": 1.6595984958238952e-05, + "loss": 0.3645, + "step": 1234 + }, + { + "epoch": 0.5858633776091081, + "grad_norm": 2.9009509086608887, + "learning_rate": 1.6590208304237193e-05, + "loss": 0.5345, + "step": 1235 + }, + { + "epoch": 0.5863377609108159, + "grad_norm": 3.4057090282440186, + "learning_rate": 1.6584427760100682e-05, + "loss": 0.4928, + "step": 1236 + }, + { + "epoch": 0.5868121442125237, + "grad_norm": 2.6523916721343994, + "learning_rate": 1.657864332924162e-05, + "loss": 0.4055, + "step": 1237 + }, + { + "epoch": 0.5872865275142315, + "grad_norm": 2.7750117778778076, + "learning_rate": 1.6572855015074502e-05, + "loss": 0.4551, + "step": 1238 + }, + { + "epoch": 0.5877609108159393, + "grad_norm": 2.880966901779175, + "learning_rate": 1.656706282101611e-05, + "loss": 0.4637, + "step": 1239 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.484705924987793, + "learning_rate": 1.6561266750485517e-05, + "loss": 0.37, + "step": 1240 + }, + { + "epoch": 0.5887096774193549, + "grad_norm": 2.381371259689331, + "learning_rate": 1.655546680690409e-05, + "loss": 0.4134, + "step": 1241 + }, + { + "epoch": 0.5891840607210627, + "grad_norm": 2.261758327484131, + "learning_rate": 1.654966299369547e-05, + "loss": 0.3461, + "step": 1242 + }, + { + "epoch": 0.5896584440227703, + "grad_norm": 2.374173402786255, + "learning_rate": 1.6543855314285598e-05, + "loss": 0.4009, + "step": 1243 + }, + { + "epoch": 0.5901328273244781, + "grad_norm": 2.667855978012085, + "learning_rate": 1.6538043772102692e-05, + "loss": 0.4612, + "step": 1244 + }, + { + "epoch": 0.5906072106261859, + "grad_norm": 2.694554328918457, + "learning_rate": 1.653222837057724e-05, + "loss": 0.4593, + "step": 1245 + }, + { + "epoch": 0.5910815939278937, + "grad_norm": 2.515317678451538, + "learning_rate": 1.6526409113142022e-05, + "loss": 0.4236, + "step": 1246 + }, + { + "epoch": 0.5915559772296015, + "grad_norm": 2.1223080158233643, + "learning_rate": 1.652058600323209e-05, + "loss": 0.3854, + "step": 1247 + }, + { + "epoch": 0.5920303605313093, + "grad_norm": 2.624248504638672, + "learning_rate": 1.651475904428476e-05, + "loss": 0.3865, + "step": 1248 + }, + { + "epoch": 0.5925047438330171, + "grad_norm": 2.6333069801330566, + "learning_rate": 1.6508928239739632e-05, + "loss": 0.4257, + "step": 1249 + }, + { + "epoch": 0.5929791271347249, + "grad_norm": 2.5864479541778564, + "learning_rate": 1.6503093593038573e-05, + "loss": 0.415, + "step": 1250 + }, + { + "epoch": 0.5934535104364327, + "grad_norm": 2.4937593936920166, + "learning_rate": 1.649725510762572e-05, + "loss": 0.4392, + "step": 1251 + }, + { + "epoch": 0.5939278937381404, + "grad_norm": 2.5235211849212646, + "learning_rate": 1.6491412786947468e-05, + "loss": 0.3365, + "step": 1252 + }, + { + "epoch": 0.5944022770398482, + "grad_norm": 2.3775665760040283, + "learning_rate": 1.6485566634452483e-05, + "loss": 0.3976, + "step": 1253 + }, + { + "epoch": 0.594876660341556, + "grad_norm": 2.5280470848083496, + "learning_rate": 1.6479716653591694e-05, + "loss": 0.4174, + "step": 1254 + }, + { + "epoch": 0.5953510436432637, + "grad_norm": 2.390460968017578, + "learning_rate": 1.647386284781828e-05, + "loss": 0.3771, + "step": 1255 + }, + { + "epoch": 0.5958254269449715, + "grad_norm": 2.5851383209228516, + "learning_rate": 1.6468005220587687e-05, + "loss": 0.4512, + "step": 1256 + }, + { + "epoch": 0.5962998102466793, + "grad_norm": 2.5305631160736084, + "learning_rate": 1.646214377535762e-05, + "loss": 0.3618, + "step": 1257 + }, + { + "epoch": 0.5967741935483871, + "grad_norm": 2.88232159614563, + "learning_rate": 1.6456278515588023e-05, + "loss": 0.4459, + "step": 1258 + }, + { + "epoch": 0.5972485768500949, + "grad_norm": 2.761087656021118, + "learning_rate": 1.6450409444741112e-05, + "loss": 0.4103, + "step": 1259 + }, + { + "epoch": 0.5977229601518027, + "grad_norm": 2.098811388015747, + "learning_rate": 1.6444536566281332e-05, + "loss": 0.3358, + "step": 1260 + }, + { + "epoch": 0.5981973434535104, + "grad_norm": 2.363554000854492, + "learning_rate": 1.643865988367539e-05, + "loss": 0.3722, + "step": 1261 + }, + { + "epoch": 0.5986717267552182, + "grad_norm": 2.7632856369018555, + "learning_rate": 1.6432779400392232e-05, + "loss": 0.4257, + "step": 1262 + }, + { + "epoch": 0.599146110056926, + "grad_norm": 2.5627601146698, + "learning_rate": 1.6426895119903046e-05, + "loss": 0.4035, + "step": 1263 + }, + { + "epoch": 0.5996204933586338, + "grad_norm": 2.7176856994628906, + "learning_rate": 1.6421007045681273e-05, + "loss": 0.4467, + "step": 1264 + }, + { + "epoch": 0.6000948766603416, + "grad_norm": 2.368619441986084, + "learning_rate": 1.6415115181202576e-05, + "loss": 0.3523, + "step": 1265 + }, + { + "epoch": 0.6005692599620494, + "grad_norm": 2.4514405727386475, + "learning_rate": 1.6409219529944866e-05, + "loss": 0.3838, + "step": 1266 + }, + { + "epoch": 0.6010436432637571, + "grad_norm": 2.8423192501068115, + "learning_rate": 1.640332009538829e-05, + "loss": 0.3943, + "step": 1267 + }, + { + "epoch": 0.6015180265654649, + "grad_norm": 2.820495843887329, + "learning_rate": 1.639741688101523e-05, + "loss": 0.4247, + "step": 1268 + }, + { + "epoch": 0.6019924098671727, + "grad_norm": 2.6144824028015137, + "learning_rate": 1.6391509890310285e-05, + "loss": 0.43, + "step": 1269 + }, + { + "epoch": 0.6024667931688804, + "grad_norm": 2.3828113079071045, + "learning_rate": 1.63855991267603e-05, + "loss": 0.3844, + "step": 1270 + }, + { + "epoch": 0.6029411764705882, + "grad_norm": 2.611931324005127, + "learning_rate": 1.637968459385434e-05, + "loss": 0.3651, + "step": 1271 + }, + { + "epoch": 0.603415559772296, + "grad_norm": 2.4862523078918457, + "learning_rate": 1.6373766295083693e-05, + "loss": 0.4257, + "step": 1272 + }, + { + "epoch": 0.6038899430740038, + "grad_norm": 2.119983673095703, + "learning_rate": 1.636784423394187e-05, + "loss": 0.3721, + "step": 1273 + }, + { + "epoch": 0.6043643263757116, + "grad_norm": 2.5551223754882812, + "learning_rate": 1.6361918413924614e-05, + "loss": 0.4151, + "step": 1274 + }, + { + "epoch": 0.6048387096774194, + "grad_norm": 2.954251527786255, + "learning_rate": 1.635598883852987e-05, + "loss": 0.4131, + "step": 1275 + }, + { + "epoch": 0.6053130929791272, + "grad_norm": 2.8312759399414062, + "learning_rate": 1.6350055511257806e-05, + "loss": 0.3901, + "step": 1276 + }, + { + "epoch": 0.605787476280835, + "grad_norm": 2.543302536010742, + "learning_rate": 1.6344118435610814e-05, + "loss": 0.3597, + "step": 1277 + }, + { + "epoch": 0.6062618595825426, + "grad_norm": 2.338500738143921, + "learning_rate": 1.633817761509349e-05, + "loss": 0.3255, + "step": 1278 + }, + { + "epoch": 0.6067362428842504, + "grad_norm": 2.9333157539367676, + "learning_rate": 1.6332233053212632e-05, + "loss": 0.4025, + "step": 1279 + }, + { + "epoch": 0.6072106261859582, + "grad_norm": 2.388615608215332, + "learning_rate": 1.6326284753477267e-05, + "loss": 0.4042, + "step": 1280 + }, + { + "epoch": 0.607685009487666, + "grad_norm": 2.1982688903808594, + "learning_rate": 1.6320332719398612e-05, + "loss": 0.341, + "step": 1281 + }, + { + "epoch": 0.6081593927893738, + "grad_norm": 2.6067326068878174, + "learning_rate": 1.6314376954490097e-05, + "loss": 0.441, + "step": 1282 + }, + { + "epoch": 0.6086337760910816, + "grad_norm": 2.442051410675049, + "learning_rate": 1.6308417462267348e-05, + "loss": 0.4034, + "step": 1283 + }, + { + "epoch": 0.6091081593927894, + "grad_norm": 2.5747456550598145, + "learning_rate": 1.6302454246248195e-05, + "loss": 0.4045, + "step": 1284 + }, + { + "epoch": 0.6095825426944972, + "grad_norm": 3.0773766040802, + "learning_rate": 1.6296487309952666e-05, + "loss": 0.4241, + "step": 1285 + }, + { + "epoch": 0.610056925996205, + "grad_norm": 2.966850519180298, + "learning_rate": 1.6290516656902985e-05, + "loss": 0.4131, + "step": 1286 + }, + { + "epoch": 0.6105313092979127, + "grad_norm": 2.5385637283325195, + "learning_rate": 1.6284542290623568e-05, + "loss": 0.3904, + "step": 1287 + }, + { + "epoch": 0.6110056925996205, + "grad_norm": 2.2452964782714844, + "learning_rate": 1.6278564214641027e-05, + "loss": 0.3438, + "step": 1288 + }, + { + "epoch": 0.6114800759013282, + "grad_norm": 3.1382503509521484, + "learning_rate": 1.6272582432484155e-05, + "loss": 0.4465, + "step": 1289 + }, + { + "epoch": 0.611954459203036, + "grad_norm": 2.5982370376586914, + "learning_rate": 1.6266596947683945e-05, + "loss": 0.4197, + "step": 1290 + }, + { + "epoch": 0.6124288425047438, + "grad_norm": 3.1609013080596924, + "learning_rate": 1.626060776377357e-05, + "loss": 0.4191, + "step": 1291 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 2.7382616996765137, + "learning_rate": 1.625461488428838e-05, + "loss": 0.4635, + "step": 1292 + }, + { + "epoch": 0.6133776091081594, + "grad_norm": 2.5566318035125732, + "learning_rate": 1.6248618312765918e-05, + "loss": 0.4796, + "step": 1293 + }, + { + "epoch": 0.6138519924098672, + "grad_norm": 2.541482448577881, + "learning_rate": 1.62426180527459e-05, + "loss": 0.4099, + "step": 1294 + }, + { + "epoch": 0.614326375711575, + "grad_norm": 3.1109683513641357, + "learning_rate": 1.6236614107770216e-05, + "loss": 0.4481, + "step": 1295 + }, + { + "epoch": 0.6148007590132827, + "grad_norm": 2.3709452152252197, + "learning_rate": 1.623060648138294e-05, + "loss": 0.4273, + "step": 1296 + }, + { + "epoch": 0.6152751423149905, + "grad_norm": 2.797222375869751, + "learning_rate": 1.622459517713031e-05, + "loss": 0.3849, + "step": 1297 + }, + { + "epoch": 0.6157495256166983, + "grad_norm": 2.5726704597473145, + "learning_rate": 1.621858019856074e-05, + "loss": 0.4335, + "step": 1298 + }, + { + "epoch": 0.6162239089184061, + "grad_norm": 2.80641508102417, + "learning_rate": 1.6212561549224818e-05, + "loss": 0.4747, + "step": 1299 + }, + { + "epoch": 0.6166982922201139, + "grad_norm": 2.7815253734588623, + "learning_rate": 1.620653923267529e-05, + "loss": 0.4144, + "step": 1300 + }, + { + "epoch": 0.6171726755218216, + "grad_norm": 2.2950119972229004, + "learning_rate": 1.6200513252467068e-05, + "loss": 0.4452, + "step": 1301 + }, + { + "epoch": 0.6176470588235294, + "grad_norm": 2.441072463989258, + "learning_rate": 1.6194483612157232e-05, + "loss": 0.4684, + "step": 1302 + }, + { + "epoch": 0.6181214421252372, + "grad_norm": 2.0913050174713135, + "learning_rate": 1.6188450315305012e-05, + "loss": 0.3706, + "step": 1303 + }, + { + "epoch": 0.618595825426945, + "grad_norm": 2.4034149646759033, + "learning_rate": 1.6182413365471815e-05, + "loss": 0.3506, + "step": 1304 + }, + { + "epoch": 0.6190702087286527, + "grad_norm": 2.8475241661071777, + "learning_rate": 1.617637276622118e-05, + "loss": 0.4507, + "step": 1305 + }, + { + "epoch": 0.6195445920303605, + "grad_norm": 2.499732255935669, + "learning_rate": 1.617032852111882e-05, + "loss": 0.368, + "step": 1306 + }, + { + "epoch": 0.6200189753320683, + "grad_norm": 2.651540517807007, + "learning_rate": 1.6164280633732594e-05, + "loss": 0.4742, + "step": 1307 + }, + { + "epoch": 0.6204933586337761, + "grad_norm": 2.2591097354888916, + "learning_rate": 1.6158229107632507e-05, + "loss": 0.3422, + "step": 1308 + }, + { + "epoch": 0.6209677419354839, + "grad_norm": 2.7106447219848633, + "learning_rate": 1.6152173946390715e-05, + "loss": 0.4175, + "step": 1309 + }, + { + "epoch": 0.6214421252371917, + "grad_norm": 3.118277072906494, + "learning_rate": 1.6146115153581523e-05, + "loss": 0.5013, + "step": 1310 + }, + { + "epoch": 0.6219165085388995, + "grad_norm": 2.603504180908203, + "learning_rate": 1.6140052732781373e-05, + "loss": 0.3475, + "step": 1311 + }, + { + "epoch": 0.6223908918406073, + "grad_norm": 2.1159276962280273, + "learning_rate": 1.6133986687568854e-05, + "loss": 0.3465, + "step": 1312 + }, + { + "epoch": 0.622865275142315, + "grad_norm": 2.8673932552337646, + "learning_rate": 1.612791702152469e-05, + "loss": 0.4357, + "step": 1313 + }, + { + "epoch": 0.6233396584440227, + "grad_norm": 2.5344109535217285, + "learning_rate": 1.6121843738231748e-05, + "loss": 0.3729, + "step": 1314 + }, + { + "epoch": 0.6238140417457305, + "grad_norm": 2.3215112686157227, + "learning_rate": 1.6115766841275027e-05, + "loss": 0.3495, + "step": 1315 + }, + { + "epoch": 0.6242884250474383, + "grad_norm": 2.229944944381714, + "learning_rate": 1.6109686334241655e-05, + "loss": 0.3594, + "step": 1316 + }, + { + "epoch": 0.6247628083491461, + "grad_norm": 3.3155102729797363, + "learning_rate": 1.6103602220720897e-05, + "loss": 0.3867, + "step": 1317 + }, + { + "epoch": 0.6252371916508539, + "grad_norm": 2.2139594554901123, + "learning_rate": 1.609751450430415e-05, + "loss": 0.374, + "step": 1318 + }, + { + "epoch": 0.6257115749525617, + "grad_norm": 2.4204013347625732, + "learning_rate": 1.6091423188584926e-05, + "loss": 0.3767, + "step": 1319 + }, + { + "epoch": 0.6261859582542695, + "grad_norm": 2.2802765369415283, + "learning_rate": 1.608532827715887e-05, + "loss": 0.3397, + "step": 1320 + }, + { + "epoch": 0.6266603415559773, + "grad_norm": 2.470691204071045, + "learning_rate": 1.607922977362375e-05, + "loss": 0.3941, + "step": 1321 + }, + { + "epoch": 0.627134724857685, + "grad_norm": 2.650733232498169, + "learning_rate": 1.607312768157945e-05, + "loss": 0.3187, + "step": 1322 + }, + { + "epoch": 0.6276091081593927, + "grad_norm": 2.694547176361084, + "learning_rate": 1.606702200462798e-05, + "loss": 0.4531, + "step": 1323 + }, + { + "epoch": 0.6280834914611005, + "grad_norm": 2.8122143745422363, + "learning_rate": 1.6060912746373453e-05, + "loss": 0.4155, + "step": 1324 + }, + { + "epoch": 0.6285578747628083, + "grad_norm": 2.1507198810577393, + "learning_rate": 1.6054799910422106e-05, + "loss": 0.3287, + "step": 1325 + }, + { + "epoch": 0.6290322580645161, + "grad_norm": 3.3852505683898926, + "learning_rate": 1.604868350038229e-05, + "loss": 0.4204, + "step": 1326 + }, + { + "epoch": 0.6295066413662239, + "grad_norm": 2.315032482147217, + "learning_rate": 1.6042563519864452e-05, + "loss": 0.3035, + "step": 1327 + }, + { + "epoch": 0.6299810246679317, + "grad_norm": 2.5337111949920654, + "learning_rate": 1.603643997248117e-05, + "loss": 0.4118, + "step": 1328 + }, + { + "epoch": 0.6304554079696395, + "grad_norm": 2.549182653427124, + "learning_rate": 1.60303128618471e-05, + "loss": 0.3673, + "step": 1329 + }, + { + "epoch": 0.6309297912713473, + "grad_norm": 2.420851230621338, + "learning_rate": 1.6024182191579024e-05, + "loss": 0.3595, + "step": 1330 + }, + { + "epoch": 0.631404174573055, + "grad_norm": 2.3073275089263916, + "learning_rate": 1.601804796529581e-05, + "loss": 0.3316, + "step": 1331 + }, + { + "epoch": 0.6318785578747628, + "grad_norm": 2.652170181274414, + "learning_rate": 1.6011910186618433e-05, + "loss": 0.3742, + "step": 1332 + }, + { + "epoch": 0.6323529411764706, + "grad_norm": 2.5604469776153564, + "learning_rate": 1.6005768859169965e-05, + "loss": 0.3906, + "step": 1333 + }, + { + "epoch": 0.6328273244781784, + "grad_norm": 2.7032854557037354, + "learning_rate": 1.5999623986575565e-05, + "loss": 0.3651, + "step": 1334 + }, + { + "epoch": 0.6333017077798861, + "grad_norm": 2.5799543857574463, + "learning_rate": 1.5993475572462498e-05, + "loss": 0.3985, + "step": 1335 + }, + { + "epoch": 0.6337760910815939, + "grad_norm": 2.3732707500457764, + "learning_rate": 1.5987323620460106e-05, + "loss": 0.3936, + "step": 1336 + }, + { + "epoch": 0.6342504743833017, + "grad_norm": 2.5276899337768555, + "learning_rate": 1.598116813419983e-05, + "loss": 0.3924, + "step": 1337 + }, + { + "epoch": 0.6347248576850095, + "grad_norm": 2.2890419960021973, + "learning_rate": 1.597500911731519e-05, + "loss": 0.4306, + "step": 1338 + }, + { + "epoch": 0.6351992409867173, + "grad_norm": 2.1135501861572266, + "learning_rate": 1.5968846573441794e-05, + "loss": 0.281, + "step": 1339 + }, + { + "epoch": 0.635673624288425, + "grad_norm": 2.5915725231170654, + "learning_rate": 1.596268050621733e-05, + "loss": 0.4047, + "step": 1340 + }, + { + "epoch": 0.6361480075901328, + "grad_norm": 2.0635995864868164, + "learning_rate": 1.5956510919281564e-05, + "loss": 0.3489, + "step": 1341 + }, + { + "epoch": 0.6366223908918406, + "grad_norm": 2.7874855995178223, + "learning_rate": 1.5950337816276347e-05, + "loss": 0.3854, + "step": 1342 + }, + { + "epoch": 0.6370967741935484, + "grad_norm": 2.3711447715759277, + "learning_rate": 1.5944161200845595e-05, + "loss": 0.3981, + "step": 1343 + }, + { + "epoch": 0.6375711574952562, + "grad_norm": 2.6661829948425293, + "learning_rate": 1.5937981076635313e-05, + "loss": 0.4276, + "step": 1344 + }, + { + "epoch": 0.638045540796964, + "grad_norm": 2.2618393898010254, + "learning_rate": 1.5931797447293553e-05, + "loss": 0.3637, + "step": 1345 + }, + { + "epoch": 0.6385199240986718, + "grad_norm": 2.272256851196289, + "learning_rate": 1.5925610316470462e-05, + "loss": 0.3746, + "step": 1346 + }, + { + "epoch": 0.6389943074003795, + "grad_norm": 2.7523257732391357, + "learning_rate": 1.5919419687818235e-05, + "loss": 0.4538, + "step": 1347 + }, + { + "epoch": 0.6394686907020873, + "grad_norm": 2.0002548694610596, + "learning_rate": 1.5913225564991142e-05, + "loss": 0.352, + "step": 1348 + }, + { + "epoch": 0.639943074003795, + "grad_norm": 3.1848556995391846, + "learning_rate": 1.590702795164551e-05, + "loss": 0.4523, + "step": 1349 + }, + { + "epoch": 0.6404174573055028, + "grad_norm": 2.4870452880859375, + "learning_rate": 1.5900826851439734e-05, + "loss": 0.4132, + "step": 1350 + }, + { + "epoch": 0.6408918406072106, + "grad_norm": 2.299504041671753, + "learning_rate": 1.5894622268034258e-05, + "loss": 0.3974, + "step": 1351 + }, + { + "epoch": 0.6413662239089184, + "grad_norm": 2.126984119415283, + "learning_rate": 1.588841420509159e-05, + "loss": 0.377, + "step": 1352 + }, + { + "epoch": 0.6418406072106262, + "grad_norm": 2.595367193222046, + "learning_rate": 1.588220266627628e-05, + "loss": 0.375, + "step": 1353 + }, + { + "epoch": 0.642314990512334, + "grad_norm": 2.4395711421966553, + "learning_rate": 1.5875987655254947e-05, + "loss": 0.3947, + "step": 1354 + }, + { + "epoch": 0.6427893738140418, + "grad_norm": 2.616209030151367, + "learning_rate": 1.5869769175696243e-05, + "loss": 0.3637, + "step": 1355 + }, + { + "epoch": 0.6432637571157496, + "grad_norm": 2.404573917388916, + "learning_rate": 1.586354723127088e-05, + "loss": 0.3973, + "step": 1356 + }, + { + "epoch": 0.6437381404174574, + "grad_norm": 2.426798105239868, + "learning_rate": 1.585732182565161e-05, + "loss": 0.3932, + "step": 1357 + }, + { + "epoch": 0.644212523719165, + "grad_norm": 2.2638189792633057, + "learning_rate": 1.5851092962513224e-05, + "loss": 0.3432, + "step": 1358 + }, + { + "epoch": 0.6446869070208728, + "grad_norm": 3.1538186073303223, + "learning_rate": 1.5844860645532563e-05, + "loss": 0.3715, + "step": 1359 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 2.5172009468078613, + "learning_rate": 1.5838624878388498e-05, + "loss": 0.3756, + "step": 1360 + }, + { + "epoch": 0.6456356736242884, + "grad_norm": 2.5488150119781494, + "learning_rate": 1.5832385664761943e-05, + "loss": 0.4415, + "step": 1361 + }, + { + "epoch": 0.6461100569259962, + "grad_norm": 2.245295763015747, + "learning_rate": 1.582614300833585e-05, + "loss": 0.3377, + "step": 1362 + }, + { + "epoch": 0.646584440227704, + "grad_norm": 2.691495180130005, + "learning_rate": 1.5819896912795186e-05, + "loss": 0.3952, + "step": 1363 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 2.450929880142212, + "learning_rate": 1.5813647381826968e-05, + "loss": 0.436, + "step": 1364 + }, + { + "epoch": 0.6475332068311196, + "grad_norm": 2.736772298812866, + "learning_rate": 1.5807394419120224e-05, + "loss": 0.4265, + "step": 1365 + }, + { + "epoch": 0.6480075901328273, + "grad_norm": 2.9314868450164795, + "learning_rate": 1.5801138028366026e-05, + "loss": 0.353, + "step": 1366 + }, + { + "epoch": 0.6484819734345351, + "grad_norm": 3.4517641067504883, + "learning_rate": 1.579487821325745e-05, + "loss": 0.4646, + "step": 1367 + }, + { + "epoch": 0.6489563567362429, + "grad_norm": 2.4029881954193115, + "learning_rate": 1.5788614977489612e-05, + "loss": 0.3996, + "step": 1368 + }, + { + "epoch": 0.6494307400379506, + "grad_norm": 2.305480718612671, + "learning_rate": 1.578234832475963e-05, + "loss": 0.3684, + "step": 1369 + }, + { + "epoch": 0.6499051233396584, + "grad_norm": 2.9141790866851807, + "learning_rate": 1.5776078258766654e-05, + "loss": 0.4405, + "step": 1370 + }, + { + "epoch": 0.6503795066413662, + "grad_norm": 2.265993356704712, + "learning_rate": 1.5769804783211837e-05, + "loss": 0.3726, + "step": 1371 + }, + { + "epoch": 0.650853889943074, + "grad_norm": 2.6375434398651123, + "learning_rate": 1.576352790179835e-05, + "loss": 0.3809, + "step": 1372 + }, + { + "epoch": 0.6513282732447818, + "grad_norm": 2.3192057609558105, + "learning_rate": 1.5757247618231378e-05, + "loss": 0.4016, + "step": 1373 + }, + { + "epoch": 0.6518026565464896, + "grad_norm": 2.4729015827178955, + "learning_rate": 1.5750963936218104e-05, + "loss": 0.4239, + "step": 1374 + }, + { + "epoch": 0.6522770398481973, + "grad_norm": 2.1907765865325928, + "learning_rate": 1.574467685946773e-05, + "loss": 0.3452, + "step": 1375 + }, + { + "epoch": 0.6527514231499051, + "grad_norm": 2.9924540519714355, + "learning_rate": 1.573838639169145e-05, + "loss": 0.5011, + "step": 1376 + }, + { + "epoch": 0.6532258064516129, + "grad_norm": 2.7867164611816406, + "learning_rate": 1.5732092536602466e-05, + "loss": 0.4015, + "step": 1377 + }, + { + "epoch": 0.6537001897533207, + "grad_norm": 1.9842467308044434, + "learning_rate": 1.572579529791598e-05, + "loss": 0.2994, + "step": 1378 + }, + { + "epoch": 0.6541745730550285, + "grad_norm": 2.6319007873535156, + "learning_rate": 1.571949467934919e-05, + "loss": 0.4522, + "step": 1379 + }, + { + "epoch": 0.6546489563567363, + "grad_norm": 2.6779913902282715, + "learning_rate": 1.5713190684621285e-05, + "loss": 0.3868, + "step": 1380 + }, + { + "epoch": 0.655123339658444, + "grad_norm": 2.5989339351654053, + "learning_rate": 1.5706883317453455e-05, + "loss": 0.3916, + "step": 1381 + }, + { + "epoch": 0.6555977229601518, + "grad_norm": 2.8662238121032715, + "learning_rate": 1.5700572581568875e-05, + "loss": 0.4695, + "step": 1382 + }, + { + "epoch": 0.6560721062618596, + "grad_norm": 2.4219655990600586, + "learning_rate": 1.569425848069271e-05, + "loss": 0.4111, + "step": 1383 + }, + { + "epoch": 0.6565464895635673, + "grad_norm": 2.6834099292755127, + "learning_rate": 1.568794101855211e-05, + "loss": 0.4125, + "step": 1384 + }, + { + "epoch": 0.6570208728652751, + "grad_norm": 2.470796585083008, + "learning_rate": 1.568162019887621e-05, + "loss": 0.4157, + "step": 1385 + }, + { + "epoch": 0.6574952561669829, + "grad_norm": 2.3338913917541504, + "learning_rate": 1.567529602539613e-05, + "loss": 0.3847, + "step": 1386 + }, + { + "epoch": 0.6579696394686907, + "grad_norm": 2.07490873336792, + "learning_rate": 1.5668968501844966e-05, + "loss": 0.3439, + "step": 1387 + }, + { + "epoch": 0.6584440227703985, + "grad_norm": 2.4413368701934814, + "learning_rate": 1.5662637631957793e-05, + "loss": 0.4105, + "step": 1388 + }, + { + "epoch": 0.6589184060721063, + "grad_norm": 2.3040366172790527, + "learning_rate": 1.565630341947166e-05, + "loss": 0.3204, + "step": 1389 + }, + { + "epoch": 0.6593927893738141, + "grad_norm": 2.9284796714782715, + "learning_rate": 1.564996586812559e-05, + "loss": 0.4296, + "step": 1390 + }, + { + "epoch": 0.6598671726755219, + "grad_norm": 2.683302164077759, + "learning_rate": 1.5643624981660573e-05, + "loss": 0.3965, + "step": 1391 + }, + { + "epoch": 0.6603415559772297, + "grad_norm": 2.779085397720337, + "learning_rate": 1.563728076381958e-05, + "loss": 0.4836, + "step": 1392 + }, + { + "epoch": 0.6608159392789373, + "grad_norm": 2.2810940742492676, + "learning_rate": 1.5630933218347536e-05, + "loss": 0.3765, + "step": 1393 + }, + { + "epoch": 0.6612903225806451, + "grad_norm": 2.6218605041503906, + "learning_rate": 1.5624582348991327e-05, + "loss": 0.3954, + "step": 1394 + }, + { + "epoch": 0.6617647058823529, + "grad_norm": 2.3885746002197266, + "learning_rate": 1.5618228159499823e-05, + "loss": 0.3817, + "step": 1395 + }, + { + "epoch": 0.6622390891840607, + "grad_norm": 3.312493085861206, + "learning_rate": 1.5611870653623826e-05, + "loss": 0.4092, + "step": 1396 + }, + { + "epoch": 0.6627134724857685, + "grad_norm": 3.737312078475952, + "learning_rate": 1.5605509835116115e-05, + "loss": 0.4746, + "step": 1397 + }, + { + "epoch": 0.6631878557874763, + "grad_norm": 2.460756778717041, + "learning_rate": 1.5599145707731417e-05, + "loss": 0.415, + "step": 1398 + }, + { + "epoch": 0.6636622390891841, + "grad_norm": 2.247875690460205, + "learning_rate": 1.5592778275226413e-05, + "loss": 0.3812, + "step": 1399 + }, + { + "epoch": 0.6641366223908919, + "grad_norm": 2.569827079772949, + "learning_rate": 1.558640754135974e-05, + "loss": 0.4392, + "step": 1400 + }, + { + "epoch": 0.6646110056925996, + "grad_norm": 2.2257590293884277, + "learning_rate": 1.558003350989197e-05, + "loss": 0.3763, + "step": 1401 + }, + { + "epoch": 0.6650853889943074, + "grad_norm": 2.3620169162750244, + "learning_rate": 1.5573656184585643e-05, + "loss": 0.4499, + "step": 1402 + }, + { + "epoch": 0.6655597722960152, + "grad_norm": 2.4123148918151855, + "learning_rate": 1.5567275569205216e-05, + "loss": 0.3642, + "step": 1403 + }, + { + "epoch": 0.6660341555977229, + "grad_norm": 2.121957302093506, + "learning_rate": 1.556089166751712e-05, + "loss": 0.3786, + "step": 1404 + }, + { + "epoch": 0.6665085388994307, + "grad_norm": 2.452988624572754, + "learning_rate": 1.555450448328969e-05, + "loss": 0.4058, + "step": 1405 + }, + { + "epoch": 0.6669829222011385, + "grad_norm": 2.358628988265991, + "learning_rate": 1.554811402029323e-05, + "loss": 0.4026, + "step": 1406 + }, + { + "epoch": 0.6674573055028463, + "grad_norm": 2.781123638153076, + "learning_rate": 1.554172028229997e-05, + "loss": 0.455, + "step": 1407 + }, + { + "epoch": 0.6679316888045541, + "grad_norm": 3.405427932739258, + "learning_rate": 1.5535323273084062e-05, + "loss": 0.456, + "step": 1408 + }, + { + "epoch": 0.6684060721062619, + "grad_norm": 2.7615528106689453, + "learning_rate": 1.55289229964216e-05, + "loss": 0.3644, + "step": 1409 + }, + { + "epoch": 0.6688804554079696, + "grad_norm": 3.0365653038024902, + "learning_rate": 1.5522519456090604e-05, + "loss": 0.5515, + "step": 1410 + }, + { + "epoch": 0.6693548387096774, + "grad_norm": 2.3187596797943115, + "learning_rate": 1.551611265587102e-05, + "loss": 0.3391, + "step": 1411 + }, + { + "epoch": 0.6698292220113852, + "grad_norm": 2.320058584213257, + "learning_rate": 1.5509702599544723e-05, + "loss": 0.3337, + "step": 1412 + }, + { + "epoch": 0.670303605313093, + "grad_norm": 2.3811447620391846, + "learning_rate": 1.55032892908955e-05, + "loss": 0.3742, + "step": 1413 + }, + { + "epoch": 0.6707779886148008, + "grad_norm": 2.2313289642333984, + "learning_rate": 1.5496872733709067e-05, + "loss": 0.3389, + "step": 1414 + }, + { + "epoch": 0.6712523719165086, + "grad_norm": 2.9275193214416504, + "learning_rate": 1.5490452931773053e-05, + "loss": 0.4493, + "step": 1415 + }, + { + "epoch": 0.6717267552182163, + "grad_norm": 2.3053927421569824, + "learning_rate": 1.5484029888877004e-05, + "loss": 0.3616, + "step": 1416 + }, + { + "epoch": 0.6722011385199241, + "grad_norm": 2.178098201751709, + "learning_rate": 1.547760360881238e-05, + "loss": 0.3492, + "step": 1417 + }, + { + "epoch": 0.6726755218216319, + "grad_norm": 2.574260950088501, + "learning_rate": 1.547117409537254e-05, + "loss": 0.4382, + "step": 1418 + }, + { + "epoch": 0.6731499051233396, + "grad_norm": 2.7682528495788574, + "learning_rate": 1.546474135235278e-05, + "loss": 0.3529, + "step": 1419 + }, + { + "epoch": 0.6736242884250474, + "grad_norm": 5.3996500968933105, + "learning_rate": 1.5458305383550275e-05, + "loss": 0.3958, + "step": 1420 + }, + { + "epoch": 0.6740986717267552, + "grad_norm": 2.8296844959259033, + "learning_rate": 1.545186619276411e-05, + "loss": 0.4987, + "step": 1421 + }, + { + "epoch": 0.674573055028463, + "grad_norm": 2.638801097869873, + "learning_rate": 1.5445423783795283e-05, + "loss": 0.4888, + "step": 1422 + }, + { + "epoch": 0.6750474383301708, + "grad_norm": 2.253319025039673, + "learning_rate": 1.5438978160446684e-05, + "loss": 0.3512, + "step": 1423 + }, + { + "epoch": 0.6755218216318786, + "grad_norm": 2.3527333736419678, + "learning_rate": 1.543252932652309e-05, + "loss": 0.3054, + "step": 1424 + }, + { + "epoch": 0.6759962049335864, + "grad_norm": 2.6959636211395264, + "learning_rate": 1.5426077285831195e-05, + "loss": 0.3844, + "step": 1425 + }, + { + "epoch": 0.6764705882352942, + "grad_norm": 2.884432792663574, + "learning_rate": 1.5419622042179575e-05, + "loss": 0.4787, + "step": 1426 + }, + { + "epoch": 0.676944971537002, + "grad_norm": 3.0761313438415527, + "learning_rate": 1.541316359937869e-05, + "loss": 0.4422, + "step": 1427 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 2.4010064601898193, + "learning_rate": 1.54067019612409e-05, + "loss": 0.4129, + "step": 1428 + }, + { + "epoch": 0.6778937381404174, + "grad_norm": 2.568399429321289, + "learning_rate": 1.5400237131580443e-05, + "loss": 0.3751, + "step": 1429 + }, + { + "epoch": 0.6783681214421252, + "grad_norm": 3.373952627182007, + "learning_rate": 1.539376911421344e-05, + "loss": 0.404, + "step": 1430 + }, + { + "epoch": 0.678842504743833, + "grad_norm": 2.3124148845672607, + "learning_rate": 1.5387297912957907e-05, + "loss": 0.3388, + "step": 1431 + }, + { + "epoch": 0.6793168880455408, + "grad_norm": 2.5099921226501465, + "learning_rate": 1.5380823531633727e-05, + "loss": 0.3389, + "step": 1432 + }, + { + "epoch": 0.6797912713472486, + "grad_norm": 2.23146915435791, + "learning_rate": 1.537434597406266e-05, + "loss": 0.3419, + "step": 1433 + }, + { + "epoch": 0.6802656546489564, + "grad_norm": 2.684089183807373, + "learning_rate": 1.5367865244068346e-05, + "loss": 0.445, + "step": 1434 + }, + { + "epoch": 0.6807400379506642, + "grad_norm": 2.252448320388794, + "learning_rate": 1.53613813454763e-05, + "loss": 0.3182, + "step": 1435 + }, + { + "epoch": 0.681214421252372, + "grad_norm": 2.7252156734466553, + "learning_rate": 1.5354894282113892e-05, + "loss": 0.4379, + "step": 1436 + }, + { + "epoch": 0.6816888045540797, + "grad_norm": 2.7150869369506836, + "learning_rate": 1.5348404057810383e-05, + "loss": 0.405, + "step": 1437 + }, + { + "epoch": 0.6821631878557874, + "grad_norm": 2.2281618118286133, + "learning_rate": 1.534191067639688e-05, + "loss": 0.3822, + "step": 1438 + }, + { + "epoch": 0.6826375711574952, + "grad_norm": 2.0423622131347656, + "learning_rate": 1.5335414141706366e-05, + "loss": 0.3505, + "step": 1439 + }, + { + "epoch": 0.683111954459203, + "grad_norm": 2.7963783740997314, + "learning_rate": 1.5328914457573683e-05, + "loss": 0.4724, + "step": 1440 + }, + { + "epoch": 0.6835863377609108, + "grad_norm": 2.374962568283081, + "learning_rate": 1.5322411627835526e-05, + "loss": 0.3721, + "step": 1441 + }, + { + "epoch": 0.6840607210626186, + "grad_norm": 2.619636058807373, + "learning_rate": 1.531590565633045e-05, + "loss": 0.4549, + "step": 1442 + }, + { + "epoch": 0.6845351043643264, + "grad_norm": 2.1424806118011475, + "learning_rate": 1.530939654689887e-05, + "loss": 0.3219, + "step": 1443 + }, + { + "epoch": 0.6850094876660342, + "grad_norm": 2.061220407485962, + "learning_rate": 1.5302884303383046e-05, + "loss": 0.358, + "step": 1444 + }, + { + "epoch": 0.6854838709677419, + "grad_norm": 3.31339955329895, + "learning_rate": 1.5296368929627097e-05, + "loss": 0.5536, + "step": 1445 + }, + { + "epoch": 0.6859582542694497, + "grad_norm": 2.542325973510742, + "learning_rate": 1.528985042947697e-05, + "loss": 0.4265, + "step": 1446 + }, + { + "epoch": 0.6864326375711575, + "grad_norm": 2.6769909858703613, + "learning_rate": 1.5283328806780488e-05, + "loss": 0.4561, + "step": 1447 + }, + { + "epoch": 0.6869070208728653, + "grad_norm": 2.1639342308044434, + "learning_rate": 1.527680406538729e-05, + "loss": 0.3672, + "step": 1448 + }, + { + "epoch": 0.687381404174573, + "grad_norm": 2.8890748023986816, + "learning_rate": 1.5270276209148867e-05, + "loss": 0.3952, + "step": 1449 + }, + { + "epoch": 0.6878557874762808, + "grad_norm": 2.939438819885254, + "learning_rate": 1.526374524191855e-05, + "loss": 0.4082, + "step": 1450 + }, + { + "epoch": 0.6883301707779886, + "grad_norm": 2.6139535903930664, + "learning_rate": 1.52572111675515e-05, + "loss": 0.4213, + "step": 1451 + }, + { + "epoch": 0.6888045540796964, + "grad_norm": 2.3408873081207275, + "learning_rate": 1.5250673989904728e-05, + "loss": 0.3499, + "step": 1452 + }, + { + "epoch": 0.6892789373814042, + "grad_norm": 2.349148988723755, + "learning_rate": 1.524413371283705e-05, + "loss": 0.3571, + "step": 1453 + }, + { + "epoch": 0.6897533206831119, + "grad_norm": 2.3099985122680664, + "learning_rate": 1.5237590340209139e-05, + "loss": 0.3955, + "step": 1454 + }, + { + "epoch": 0.6902277039848197, + "grad_norm": 2.275421380996704, + "learning_rate": 1.5231043875883474e-05, + "loss": 0.3267, + "step": 1455 + }, + { + "epoch": 0.6907020872865275, + "grad_norm": 2.5383238792419434, + "learning_rate": 1.5224494323724374e-05, + "loss": 0.355, + "step": 1456 + }, + { + "epoch": 0.6911764705882353, + "grad_norm": 2.6643269062042236, + "learning_rate": 1.5217941687597976e-05, + "loss": 0.4296, + "step": 1457 + }, + { + "epoch": 0.6916508538899431, + "grad_norm": 2.6603057384490967, + "learning_rate": 1.5211385971372233e-05, + "loss": 0.3736, + "step": 1458 + }, + { + "epoch": 0.6921252371916509, + "grad_norm": 2.079129695892334, + "learning_rate": 1.5204827178916919e-05, + "loss": 0.3354, + "step": 1459 + }, + { + "epoch": 0.6925996204933587, + "grad_norm": 2.422872304916382, + "learning_rate": 1.5198265314103626e-05, + "loss": 0.4841, + "step": 1460 + }, + { + "epoch": 0.6930740037950665, + "grad_norm": 2.4735107421875, + "learning_rate": 1.5191700380805754e-05, + "loss": 0.3615, + "step": 1461 + }, + { + "epoch": 0.6935483870967742, + "grad_norm": 2.0199763774871826, + "learning_rate": 1.5185132382898524e-05, + "loss": 0.2761, + "step": 1462 + }, + { + "epoch": 0.6940227703984819, + "grad_norm": 2.425189733505249, + "learning_rate": 1.5178561324258956e-05, + "loss": 0.4204, + "step": 1463 + }, + { + "epoch": 0.6944971537001897, + "grad_norm": 2.212663173675537, + "learning_rate": 1.5171987208765884e-05, + "loss": 0.35, + "step": 1464 + }, + { + "epoch": 0.6949715370018975, + "grad_norm": 3.0007028579711914, + "learning_rate": 1.5165410040299942e-05, + "loss": 0.424, + "step": 1465 + }, + { + "epoch": 0.6954459203036053, + "grad_norm": 2.555004835128784, + "learning_rate": 1.5158829822743567e-05, + "loss": 0.4113, + "step": 1466 + }, + { + "epoch": 0.6959203036053131, + "grad_norm": 2.6518445014953613, + "learning_rate": 1.5152246559980994e-05, + "loss": 0.3866, + "step": 1467 + }, + { + "epoch": 0.6963946869070209, + "grad_norm": 2.7971367835998535, + "learning_rate": 1.5145660255898262e-05, + "loss": 0.3921, + "step": 1468 + }, + { + "epoch": 0.6968690702087287, + "grad_norm": 2.154005765914917, + "learning_rate": 1.5139070914383198e-05, + "loss": 0.334, + "step": 1469 + }, + { + "epoch": 0.6973434535104365, + "grad_norm": 2.5318751335144043, + "learning_rate": 1.5132478539325428e-05, + "loss": 0.3787, + "step": 1470 + }, + { + "epoch": 0.6978178368121443, + "grad_norm": 2.6897404193878174, + "learning_rate": 1.5125883134616363e-05, + "loss": 0.3783, + "step": 1471 + }, + { + "epoch": 0.698292220113852, + "grad_norm": 2.307647943496704, + "learning_rate": 1.5119284704149211e-05, + "loss": 0.3814, + "step": 1472 + }, + { + "epoch": 0.6987666034155597, + "grad_norm": 2.4573044776916504, + "learning_rate": 1.5112683251818952e-05, + "loss": 0.42, + "step": 1473 + }, + { + "epoch": 0.6992409867172675, + "grad_norm": 2.418071985244751, + "learning_rate": 1.5106078781522363e-05, + "loss": 0.3043, + "step": 1474 + }, + { + "epoch": 0.6997153700189753, + "grad_norm": 2.5575947761535645, + "learning_rate": 1.509947129715799e-05, + "loss": 0.4268, + "step": 1475 + }, + { + "epoch": 0.7001897533206831, + "grad_norm": 2.537424087524414, + "learning_rate": 1.5092860802626179e-05, + "loss": 0.4167, + "step": 1476 + }, + { + "epoch": 0.7006641366223909, + "grad_norm": 2.9543752670288086, + "learning_rate": 1.5086247301829028e-05, + "loss": 0.407, + "step": 1477 + }, + { + "epoch": 0.7011385199240987, + "grad_norm": 2.10909104347229, + "learning_rate": 1.507963079867043e-05, + "loss": 0.3587, + "step": 1478 + }, + { + "epoch": 0.7016129032258065, + "grad_norm": 2.046889543533325, + "learning_rate": 1.5073011297056033e-05, + "loss": 0.3019, + "step": 1479 + }, + { + "epoch": 0.7020872865275142, + "grad_norm": 2.1816864013671875, + "learning_rate": 1.5066388800893266e-05, + "loss": 0.349, + "step": 1480 + }, + { + "epoch": 0.702561669829222, + "grad_norm": 1.9779382944107056, + "learning_rate": 1.5059763314091326e-05, + "loss": 0.3138, + "step": 1481 + }, + { + "epoch": 0.7030360531309298, + "grad_norm": 3.019711971282959, + "learning_rate": 1.505313484056117e-05, + "loss": 0.466, + "step": 1482 + }, + { + "epoch": 0.7035104364326376, + "grad_norm": 2.450096607208252, + "learning_rate": 1.5046503384215521e-05, + "loss": 0.3534, + "step": 1483 + }, + { + "epoch": 0.7039848197343453, + "grad_norm": 2.3313405513763428, + "learning_rate": 1.503986894896886e-05, + "loss": 0.4373, + "step": 1484 + }, + { + "epoch": 0.7044592030360531, + "grad_norm": 2.4880897998809814, + "learning_rate": 1.5033231538737432e-05, + "loss": 0.4231, + "step": 1485 + }, + { + "epoch": 0.7049335863377609, + "grad_norm": 2.491865873336792, + "learning_rate": 1.502659115743923e-05, + "loss": 0.4453, + "step": 1486 + }, + { + "epoch": 0.7054079696394687, + "grad_norm": 2.641712188720703, + "learning_rate": 1.501994780899401e-05, + "loss": 0.4031, + "step": 1487 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 2.309870719909668, + "learning_rate": 1.5013301497323274e-05, + "loss": 0.3533, + "step": 1488 + }, + { + "epoch": 0.7063567362428842, + "grad_norm": 2.5076959133148193, + "learning_rate": 1.5006652226350272e-05, + "loss": 0.3699, + "step": 1489 + }, + { + "epoch": 0.706831119544592, + "grad_norm": 2.0265212059020996, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.3459, + "step": 1490 + }, + { + "epoch": 0.7073055028462998, + "grad_norm": 2.6799657344818115, + "learning_rate": 1.499334482219921e-05, + "loss": 0.4195, + "step": 1491 + }, + { + "epoch": 0.7077798861480076, + "grad_norm": 2.5318443775177, + "learning_rate": 1.4986686696876381e-05, + "loss": 0.3655, + "step": 1492 + }, + { + "epoch": 0.7082542694497154, + "grad_norm": 2.371875047683716, + "learning_rate": 1.498002562796174e-05, + "loss": 0.3385, + "step": 1493 + }, + { + "epoch": 0.7087286527514232, + "grad_norm": 3.2066407203674316, + "learning_rate": 1.497336161938725e-05, + "loss": 0.3932, + "step": 1494 + }, + { + "epoch": 0.709203036053131, + "grad_norm": 2.56129789352417, + "learning_rate": 1.4966694675086611e-05, + "loss": 0.3833, + "step": 1495 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 1.934271216392517, + "learning_rate": 1.4960024798995252e-05, + "loss": 0.3025, + "step": 1496 + }, + { + "epoch": 0.7101518026565465, + "grad_norm": 2.200591564178467, + "learning_rate": 1.4953351995050336e-05, + "loss": 0.3285, + "step": 1497 + }, + { + "epoch": 0.7106261859582542, + "grad_norm": 2.4421536922454834, + "learning_rate": 1.4946676267190751e-05, + "loss": 0.3932, + "step": 1498 + }, + { + "epoch": 0.711100569259962, + "grad_norm": 2.842991590499878, + "learning_rate": 1.4939997619357116e-05, + "loss": 0.4339, + "step": 1499 + }, + { + "epoch": 0.7115749525616698, + "grad_norm": 2.2917048931121826, + "learning_rate": 1.493331605549177e-05, + "loss": 0.3514, + "step": 1500 + }, + { + "epoch": 0.7120493358633776, + "grad_norm": 2.323843002319336, + "learning_rate": 1.4926631579538775e-05, + "loss": 0.3419, + "step": 1501 + }, + { + "epoch": 0.7125237191650854, + "grad_norm": 2.5014545917510986, + "learning_rate": 1.4919944195443908e-05, + "loss": 0.46, + "step": 1502 + }, + { + "epoch": 0.7129981024667932, + "grad_norm": 2.414332389831543, + "learning_rate": 1.4913253907154666e-05, + "loss": 0.3509, + "step": 1503 + }, + { + "epoch": 0.713472485768501, + "grad_norm": 2.879897117614746, + "learning_rate": 1.4906560718620265e-05, + "loss": 0.318, + "step": 1504 + }, + { + "epoch": 0.7139468690702088, + "grad_norm": 2.6578876972198486, + "learning_rate": 1.4899864633791623e-05, + "loss": 0.3298, + "step": 1505 + }, + { + "epoch": 0.7144212523719166, + "grad_norm": 2.390251636505127, + "learning_rate": 1.4893165656621371e-05, + "loss": 0.3778, + "step": 1506 + }, + { + "epoch": 0.7148956356736242, + "grad_norm": 3.521209478378296, + "learning_rate": 1.4886463791063854e-05, + "loss": 0.4522, + "step": 1507 + }, + { + "epoch": 0.715370018975332, + "grad_norm": 2.280897617340088, + "learning_rate": 1.4879759041075117e-05, + "loss": 0.3888, + "step": 1508 + }, + { + "epoch": 0.7158444022770398, + "grad_norm": 2.56290340423584, + "learning_rate": 1.4873051410612905e-05, + "loss": 0.3578, + "step": 1509 + }, + { + "epoch": 0.7163187855787476, + "grad_norm": 2.607135534286499, + "learning_rate": 1.486634090363666e-05, + "loss": 0.4305, + "step": 1510 + }, + { + "epoch": 0.7167931688804554, + "grad_norm": 2.685330629348755, + "learning_rate": 1.4859627524107538e-05, + "loss": 0.4462, + "step": 1511 + }, + { + "epoch": 0.7172675521821632, + "grad_norm": 2.119518280029297, + "learning_rate": 1.4852911275988379e-05, + "loss": 0.3338, + "step": 1512 + }, + { + "epoch": 0.717741935483871, + "grad_norm": 2.616091251373291, + "learning_rate": 1.4846192163243713e-05, + "loss": 0.387, + "step": 1513 + }, + { + "epoch": 0.7182163187855788, + "grad_norm": 2.2375452518463135, + "learning_rate": 1.483947018983977e-05, + "loss": 0.3961, + "step": 1514 + }, + { + "epoch": 0.7186907020872866, + "grad_norm": 2.1125147342681885, + "learning_rate": 1.483274535974446e-05, + "loss": 0.3652, + "step": 1515 + }, + { + "epoch": 0.7191650853889943, + "grad_norm": 2.8368351459503174, + "learning_rate": 1.482601767692739e-05, + "loss": 0.4423, + "step": 1516 + }, + { + "epoch": 0.719639468690702, + "grad_norm": 2.951655864715576, + "learning_rate": 1.4819287145359837e-05, + "loss": 0.4042, + "step": 1517 + }, + { + "epoch": 0.7201138519924098, + "grad_norm": 2.661517381668091, + "learning_rate": 1.481255376901477e-05, + "loss": 0.3976, + "step": 1518 + }, + { + "epoch": 0.7205882352941176, + "grad_norm": 2.2923014163970947, + "learning_rate": 1.4805817551866839e-05, + "loss": 0.354, + "step": 1519 + }, + { + "epoch": 0.7210626185958254, + "grad_norm": 2.025585174560547, + "learning_rate": 1.479907849789236e-05, + "loss": 0.3351, + "step": 1520 + }, + { + "epoch": 0.7215370018975332, + "grad_norm": 2.3796160221099854, + "learning_rate": 1.4792336611069335e-05, + "loss": 0.4202, + "step": 1521 + }, + { + "epoch": 0.722011385199241, + "grad_norm": 2.3229618072509766, + "learning_rate": 1.478559189537743e-05, + "loss": 0.3969, + "step": 1522 + }, + { + "epoch": 0.7224857685009488, + "grad_norm": 2.162994861602783, + "learning_rate": 1.4778844354797985e-05, + "loss": 0.302, + "step": 1523 + }, + { + "epoch": 0.7229601518026565, + "grad_norm": 2.562293767929077, + "learning_rate": 1.4772093993314005e-05, + "loss": 0.3537, + "step": 1524 + }, + { + "epoch": 0.7234345351043643, + "grad_norm": 2.323808431625366, + "learning_rate": 1.4765340814910163e-05, + "loss": 0.3961, + "step": 1525 + }, + { + "epoch": 0.7239089184060721, + "grad_norm": 2.3642516136169434, + "learning_rate": 1.4758584823572792e-05, + "loss": 0.3491, + "step": 1526 + }, + { + "epoch": 0.7243833017077799, + "grad_norm": 2.2448556423187256, + "learning_rate": 1.4751826023289889e-05, + "loss": 0.3941, + "step": 1527 + }, + { + "epoch": 0.7248576850094877, + "grad_norm": 2.7379980087280273, + "learning_rate": 1.4745064418051107e-05, + "loss": 0.4637, + "step": 1528 + }, + { + "epoch": 0.7253320683111955, + "grad_norm": 2.4983396530151367, + "learning_rate": 1.4738300011847752e-05, + "loss": 0.3884, + "step": 1529 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 2.730942964553833, + "learning_rate": 1.4731532808672785e-05, + "loss": 0.4037, + "step": 1530 + }, + { + "epoch": 0.726280834914611, + "grad_norm": 2.594155788421631, + "learning_rate": 1.4724762812520825e-05, + "loss": 0.4064, + "step": 1531 + }, + { + "epoch": 0.7267552182163188, + "grad_norm": 2.7857630252838135, + "learning_rate": 1.4717990027388129e-05, + "loss": 0.3294, + "step": 1532 + }, + { + "epoch": 0.7272296015180265, + "grad_norm": 2.5247364044189453, + "learning_rate": 1.4711214457272611e-05, + "loss": 0.4019, + "step": 1533 + }, + { + "epoch": 0.7277039848197343, + "grad_norm": 2.9784631729125977, + "learning_rate": 1.4704436106173813e-05, + "loss": 0.4528, + "step": 1534 + }, + { + "epoch": 0.7281783681214421, + "grad_norm": 2.051332950592041, + "learning_rate": 1.4697654978092935e-05, + "loss": 0.3219, + "step": 1535 + }, + { + "epoch": 0.7286527514231499, + "grad_norm": 2.4342894554138184, + "learning_rate": 1.4690871077032808e-05, + "loss": 0.4505, + "step": 1536 + }, + { + "epoch": 0.7291271347248577, + "grad_norm": 2.270163059234619, + "learning_rate": 1.4684084406997903e-05, + "loss": 0.3858, + "step": 1537 + }, + { + "epoch": 0.7296015180265655, + "grad_norm": 2.4693052768707275, + "learning_rate": 1.4677294971994325e-05, + "loss": 0.3749, + "step": 1538 + }, + { + "epoch": 0.7300759013282733, + "grad_norm": 2.4099597930908203, + "learning_rate": 1.4670502776029804e-05, + "loss": 0.328, + "step": 1539 + }, + { + "epoch": 0.7305502846299811, + "grad_norm": 3.063075304031372, + "learning_rate": 1.4663707823113717e-05, + "loss": 0.4566, + "step": 1540 + }, + { + "epoch": 0.7310246679316889, + "grad_norm": 1.9556933641433716, + "learning_rate": 1.4656910117257049e-05, + "loss": 0.3263, + "step": 1541 + }, + { + "epoch": 0.7314990512333965, + "grad_norm": 2.0928752422332764, + "learning_rate": 1.4650109662472422e-05, + "loss": 0.314, + "step": 1542 + }, + { + "epoch": 0.7319734345351043, + "grad_norm": 2.1672606468200684, + "learning_rate": 1.4643306462774071e-05, + "loss": 0.2977, + "step": 1543 + }, + { + "epoch": 0.7324478178368121, + "grad_norm": 2.769050121307373, + "learning_rate": 1.4636500522177868e-05, + "loss": 0.3666, + "step": 1544 + }, + { + "epoch": 0.7329222011385199, + "grad_norm": 2.6004836559295654, + "learning_rate": 1.4629691844701288e-05, + "loss": 0.3975, + "step": 1545 + }, + { + "epoch": 0.7333965844402277, + "grad_norm": 2.554579019546509, + "learning_rate": 1.462288043436342e-05, + "loss": 0.4653, + "step": 1546 + }, + { + "epoch": 0.7338709677419355, + "grad_norm": 2.2475826740264893, + "learning_rate": 1.461606629518498e-05, + "loss": 0.4302, + "step": 1547 + }, + { + "epoch": 0.7343453510436433, + "grad_norm": 2.2661335468292236, + "learning_rate": 1.460924943118828e-05, + "loss": 0.3663, + "step": 1548 + }, + { + "epoch": 0.7348197343453511, + "grad_norm": 3.354585647583008, + "learning_rate": 1.4602429846397254e-05, + "loss": 0.385, + "step": 1549 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 2.5863516330718994, + "learning_rate": 1.4595607544837435e-05, + "loss": 0.4031, + "step": 1550 + }, + { + "epoch": 0.7357685009487666, + "grad_norm": 2.85939359664917, + "learning_rate": 1.4588782530535955e-05, + "loss": 0.4245, + "step": 1551 + }, + { + "epoch": 0.7362428842504743, + "grad_norm": 2.234726667404175, + "learning_rate": 1.4581954807521555e-05, + "loss": 0.314, + "step": 1552 + }, + { + "epoch": 0.7367172675521821, + "grad_norm": 2.442960262298584, + "learning_rate": 1.4575124379824574e-05, + "loss": 0.3825, + "step": 1553 + }, + { + "epoch": 0.7371916508538899, + "grad_norm": 3.2431604862213135, + "learning_rate": 1.4568291251476944e-05, + "loss": 0.4371, + "step": 1554 + }, + { + "epoch": 0.7376660341555977, + "grad_norm": 2.6331787109375, + "learning_rate": 1.4561455426512192e-05, + "loss": 0.4168, + "step": 1555 + }, + { + "epoch": 0.7381404174573055, + "grad_norm": 2.5787758827209473, + "learning_rate": 1.455461690896544e-05, + "loss": 0.3975, + "step": 1556 + }, + { + "epoch": 0.7386148007590133, + "grad_norm": 2.2691023349761963, + "learning_rate": 1.45477757028734e-05, + "loss": 0.3795, + "step": 1557 + }, + { + "epoch": 0.7390891840607211, + "grad_norm": 2.3956029415130615, + "learning_rate": 1.4540931812274359e-05, + "loss": 0.393, + "step": 1558 + }, + { + "epoch": 0.7395635673624289, + "grad_norm": 2.2934021949768066, + "learning_rate": 1.4534085241208206e-05, + "loss": 0.3465, + "step": 1559 + }, + { + "epoch": 0.7400379506641366, + "grad_norm": 2.020655632019043, + "learning_rate": 1.4527235993716402e-05, + "loss": 0.2692, + "step": 1560 + }, + { + "epoch": 0.7405123339658444, + "grad_norm": 2.299783945083618, + "learning_rate": 1.4520384073841991e-05, + "loss": 0.3394, + "step": 1561 + }, + { + "epoch": 0.7409867172675522, + "grad_norm": 2.876800298690796, + "learning_rate": 1.4513529485629591e-05, + "loss": 0.3533, + "step": 1562 + }, + { + "epoch": 0.74146110056926, + "grad_norm": 2.770765542984009, + "learning_rate": 1.4506672233125398e-05, + "loss": 0.3855, + "step": 1563 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 2.479526996612549, + "learning_rate": 1.4499812320377182e-05, + "loss": 0.4417, + "step": 1564 + }, + { + "epoch": 0.7424098671726755, + "grad_norm": 2.534569025039673, + "learning_rate": 1.4492949751434282e-05, + "loss": 0.346, + "step": 1565 + }, + { + "epoch": 0.7428842504743833, + "grad_norm": 2.4352223873138428, + "learning_rate": 1.4486084530347604e-05, + "loss": 0.3262, + "step": 1566 + }, + { + "epoch": 0.7433586337760911, + "grad_norm": 2.7315893173217773, + "learning_rate": 1.4479216661169618e-05, + "loss": 0.3896, + "step": 1567 + }, + { + "epoch": 0.7438330170777988, + "grad_norm": 2.1865360736846924, + "learning_rate": 1.4472346147954356e-05, + "loss": 0.4102, + "step": 1568 + }, + { + "epoch": 0.7443074003795066, + "grad_norm": 1.9840688705444336, + "learning_rate": 1.4465472994757429e-05, + "loss": 0.2967, + "step": 1569 + }, + { + "epoch": 0.7447817836812144, + "grad_norm": 2.338481903076172, + "learning_rate": 1.4458597205635973e-05, + "loss": 0.3746, + "step": 1570 + }, + { + "epoch": 0.7452561669829222, + "grad_norm": 2.240323543548584, + "learning_rate": 1.445171878464871e-05, + "loss": 0.3847, + "step": 1571 + }, + { + "epoch": 0.74573055028463, + "grad_norm": 2.151249408721924, + "learning_rate": 1.4444837735855896e-05, + "loss": 0.3707, + "step": 1572 + }, + { + "epoch": 0.7462049335863378, + "grad_norm": 2.2230353355407715, + "learning_rate": 1.4437954063319352e-05, + "loss": 0.3636, + "step": 1573 + }, + { + "epoch": 0.7466793168880456, + "grad_norm": 2.0856575965881348, + "learning_rate": 1.4431067771102443e-05, + "loss": 0.2867, + "step": 1574 + }, + { + "epoch": 0.7471537001897534, + "grad_norm": 2.0492308139801025, + "learning_rate": 1.442417886327007e-05, + "loss": 0.3734, + "step": 1575 + }, + { + "epoch": 0.7476280834914611, + "grad_norm": 2.09089994430542, + "learning_rate": 1.4417287343888698e-05, + "loss": 0.3346, + "step": 1576 + }, + { + "epoch": 0.7481024667931688, + "grad_norm": 2.4338037967681885, + "learning_rate": 1.4410393217026317e-05, + "loss": 0.3388, + "step": 1577 + }, + { + "epoch": 0.7485768500948766, + "grad_norm": 2.734534740447998, + "learning_rate": 1.4403496486752465e-05, + "loss": 0.4177, + "step": 1578 + }, + { + "epoch": 0.7490512333965844, + "grad_norm": 3.0047781467437744, + "learning_rate": 1.4396597157138205e-05, + "loss": 0.3927, + "step": 1579 + }, + { + "epoch": 0.7495256166982922, + "grad_norm": 2.755629062652588, + "learning_rate": 1.4389695232256151e-05, + "loss": 0.4591, + "step": 1580 + }, + { + "epoch": 0.75, + "grad_norm": 2.1415135860443115, + "learning_rate": 1.4382790716180446e-05, + "loss": 0.2856, + "step": 1581 + }, + { + "epoch": 0.7504743833017078, + "grad_norm": 2.5595362186431885, + "learning_rate": 1.4375883612986744e-05, + "loss": 0.3146, + "step": 1582 + }, + { + "epoch": 0.7509487666034156, + "grad_norm": 3.115945339202881, + "learning_rate": 1.4368973926752248e-05, + "loss": 0.4559, + "step": 1583 + }, + { + "epoch": 0.7514231499051234, + "grad_norm": 2.1239240169525146, + "learning_rate": 1.4362061661555675e-05, + "loss": 0.3491, + "step": 1584 + }, + { + "epoch": 0.7518975332068312, + "grad_norm": 2.0848984718322754, + "learning_rate": 1.435514682147727e-05, + "loss": 0.3327, + "step": 1585 + }, + { + "epoch": 0.7523719165085389, + "grad_norm": 2.5240261554718018, + "learning_rate": 1.4348229410598791e-05, + "loss": 0.3467, + "step": 1586 + }, + { + "epoch": 0.7528462998102466, + "grad_norm": 2.0627050399780273, + "learning_rate": 1.4341309433003518e-05, + "loss": 0.3109, + "step": 1587 + }, + { + "epoch": 0.7533206831119544, + "grad_norm": 2.349839210510254, + "learning_rate": 1.4334386892776246e-05, + "loss": 0.3552, + "step": 1588 + }, + { + "epoch": 0.7537950664136622, + "grad_norm": 3.1062917709350586, + "learning_rate": 1.4327461794003284e-05, + "loss": 0.4663, + "step": 1589 + }, + { + "epoch": 0.75426944971537, + "grad_norm": 2.2856526374816895, + "learning_rate": 1.4320534140772447e-05, + "loss": 0.3425, + "step": 1590 + }, + { + "epoch": 0.7547438330170778, + "grad_norm": 2.193957567214966, + "learning_rate": 1.4313603937173058e-05, + "loss": 0.3448, + "step": 1591 + }, + { + "epoch": 0.7552182163187856, + "grad_norm": 2.4013428688049316, + "learning_rate": 1.4306671187295948e-05, + "loss": 0.3795, + "step": 1592 + }, + { + "epoch": 0.7556925996204934, + "grad_norm": 2.1507675647735596, + "learning_rate": 1.4299735895233457e-05, + "loss": 0.307, + "step": 1593 + }, + { + "epoch": 0.7561669829222012, + "grad_norm": 2.046177864074707, + "learning_rate": 1.4292798065079413e-05, + "loss": 0.3221, + "step": 1594 + }, + { + "epoch": 0.7566413662239089, + "grad_norm": 1.7904236316680908, + "learning_rate": 1.428585770092915e-05, + "loss": 0.3009, + "step": 1595 + }, + { + "epoch": 0.7571157495256167, + "grad_norm": 2.811075210571289, + "learning_rate": 1.4278914806879494e-05, + "loss": 0.4872, + "step": 1596 + }, + { + "epoch": 0.7575901328273245, + "grad_norm": 2.26833176612854, + "learning_rate": 1.4271969387028773e-05, + "loss": 0.3738, + "step": 1597 + }, + { + "epoch": 0.7580645161290323, + "grad_norm": 2.7194955348968506, + "learning_rate": 1.4265021445476794e-05, + "loss": 0.4348, + "step": 1598 + }, + { + "epoch": 0.75853889943074, + "grad_norm": 2.353123664855957, + "learning_rate": 1.4258070986324859e-05, + "loss": 0.3861, + "step": 1599 + }, + { + "epoch": 0.7590132827324478, + "grad_norm": 4.482807636260986, + "learning_rate": 1.4251118013675758e-05, + "loss": 0.3265, + "step": 1600 + }, + { + "epoch": 0.7594876660341556, + "grad_norm": 2.567732810974121, + "learning_rate": 1.424416253163376e-05, + "loss": 0.4312, + "step": 1601 + }, + { + "epoch": 0.7599620493358634, + "grad_norm": 2.412712335586548, + "learning_rate": 1.4237204544304616e-05, + "loss": 0.4085, + "step": 1602 + }, + { + "epoch": 0.7604364326375711, + "grad_norm": 2.1643178462982178, + "learning_rate": 1.423024405579556e-05, + "loss": 0.3099, + "step": 1603 + }, + { + "epoch": 0.7609108159392789, + "grad_norm": 2.2639448642730713, + "learning_rate": 1.4223281070215297e-05, + "loss": 0.3819, + "step": 1604 + }, + { + "epoch": 0.7613851992409867, + "grad_norm": 2.341059446334839, + "learning_rate": 1.421631559167401e-05, + "loss": 0.4073, + "step": 1605 + }, + { + "epoch": 0.7618595825426945, + "grad_norm": 2.276641845703125, + "learning_rate": 1.4209347624283352e-05, + "loss": 0.4333, + "step": 1606 + }, + { + "epoch": 0.7623339658444023, + "grad_norm": 2.5862913131713867, + "learning_rate": 1.4202377172156443e-05, + "loss": 0.3642, + "step": 1607 + }, + { + "epoch": 0.7628083491461101, + "grad_norm": 2.1797101497650146, + "learning_rate": 1.4195404239407873e-05, + "loss": 0.35, + "step": 1608 + }, + { + "epoch": 0.7632827324478179, + "grad_norm": 2.534806251525879, + "learning_rate": 1.4188428830153698e-05, + "loss": 0.4342, + "step": 1609 + }, + { + "epoch": 0.7637571157495257, + "grad_norm": 2.4096391201019287, + "learning_rate": 1.4181450948511431e-05, + "loss": 0.3694, + "step": 1610 + }, + { + "epoch": 0.7642314990512334, + "grad_norm": 2.1816611289978027, + "learning_rate": 1.4174470598600048e-05, + "loss": 0.3174, + "step": 1611 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 2.178921699523926, + "learning_rate": 1.4167487784539973e-05, + "loss": 0.3492, + "step": 1612 + }, + { + "epoch": 0.7651802656546489, + "grad_norm": 2.4309422969818115, + "learning_rate": 1.4160502510453103e-05, + "loss": 0.3701, + "step": 1613 + }, + { + "epoch": 0.7656546489563567, + "grad_norm": 2.0590929985046387, + "learning_rate": 1.4153514780462767e-05, + "loss": 0.305, + "step": 1614 + }, + { + "epoch": 0.7661290322580645, + "grad_norm": 2.736320972442627, + "learning_rate": 1.4146524598693758e-05, + "loss": 0.4535, + "step": 1615 + }, + { + "epoch": 0.7666034155597723, + "grad_norm": 2.418962001800537, + "learning_rate": 1.4139531969272313e-05, + "loss": 0.3975, + "step": 1616 + }, + { + "epoch": 0.7670777988614801, + "grad_norm": 2.187948703765869, + "learning_rate": 1.4132536896326105e-05, + "loss": 0.3178, + "step": 1617 + }, + { + "epoch": 0.7675521821631879, + "grad_norm": 2.1725316047668457, + "learning_rate": 1.4125539383984264e-05, + "loss": 0.3138, + "step": 1618 + }, + { + "epoch": 0.7680265654648957, + "grad_norm": 2.384047508239746, + "learning_rate": 1.4118539436377345e-05, + "loss": 0.3081, + "step": 1619 + }, + { + "epoch": 0.7685009487666035, + "grad_norm": 2.3366310596466064, + "learning_rate": 1.4111537057637353e-05, + "loss": 0.335, + "step": 1620 + }, + { + "epoch": 0.7689753320683111, + "grad_norm": 2.152684211730957, + "learning_rate": 1.4104532251897722e-05, + "loss": 0.3145, + "step": 1621 + }, + { + "epoch": 0.7694497153700189, + "grad_norm": 3.2571208477020264, + "learning_rate": 1.409752502329332e-05, + "loss": 0.297, + "step": 1622 + }, + { + "epoch": 0.7699240986717267, + "grad_norm": 2.3170127868652344, + "learning_rate": 1.409051537596044e-05, + "loss": 0.3196, + "step": 1623 + }, + { + "epoch": 0.7703984819734345, + "grad_norm": 2.442521810531616, + "learning_rate": 1.4083503314036813e-05, + "loss": 0.318, + "step": 1624 + }, + { + "epoch": 0.7708728652751423, + "grad_norm": 3.705756902694702, + "learning_rate": 1.4076488841661588e-05, + "loss": 0.3793, + "step": 1625 + }, + { + "epoch": 0.7713472485768501, + "grad_norm": 2.7555148601531982, + "learning_rate": 1.406947196297534e-05, + "loss": 0.3927, + "step": 1626 + }, + { + "epoch": 0.7718216318785579, + "grad_norm": 2.6326348781585693, + "learning_rate": 1.4062452682120056e-05, + "loss": 0.4274, + "step": 1627 + }, + { + "epoch": 0.7722960151802657, + "grad_norm": 2.276700258255005, + "learning_rate": 1.4055431003239156e-05, + "loss": 0.3913, + "step": 1628 + }, + { + "epoch": 0.7727703984819735, + "grad_norm": 2.765024423599243, + "learning_rate": 1.4048406930477465e-05, + "loss": 0.4786, + "step": 1629 + }, + { + "epoch": 0.7732447817836812, + "grad_norm": 2.5013036727905273, + "learning_rate": 1.4041380467981225e-05, + "loss": 0.4325, + "step": 1630 + }, + { + "epoch": 0.773719165085389, + "grad_norm": 2.752293348312378, + "learning_rate": 1.4034351619898088e-05, + "loss": 0.3946, + "step": 1631 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 2.66115665435791, + "learning_rate": 1.4027320390377106e-05, + "loss": 0.4486, + "step": 1632 + }, + { + "epoch": 0.7746679316888045, + "grad_norm": 2.023422956466675, + "learning_rate": 1.4020286783568753e-05, + "loss": 0.3411, + "step": 1633 + }, + { + "epoch": 0.7751423149905123, + "grad_norm": 2.1489474773406982, + "learning_rate": 1.4013250803624894e-05, + "loss": 0.3351, + "step": 1634 + }, + { + "epoch": 0.7756166982922201, + "grad_norm": 2.9013800621032715, + "learning_rate": 1.4006212454698798e-05, + "loss": 0.4143, + "step": 1635 + }, + { + "epoch": 0.7760910815939279, + "grad_norm": 2.3680756092071533, + "learning_rate": 1.3999171740945132e-05, + "loss": 0.3711, + "step": 1636 + }, + { + "epoch": 0.7765654648956357, + "grad_norm": 2.0781219005584717, + "learning_rate": 1.3992128666519961e-05, + "loss": 0.3542, + "step": 1637 + }, + { + "epoch": 0.7770398481973435, + "grad_norm": 1.984551191329956, + "learning_rate": 1.3985083235580743e-05, + "loss": 0.315, + "step": 1638 + }, + { + "epoch": 0.7775142314990512, + "grad_norm": 1.8971257209777832, + "learning_rate": 1.3978035452286325e-05, + "loss": 0.2914, + "step": 1639 + }, + { + "epoch": 0.777988614800759, + "grad_norm": 2.8102481365203857, + "learning_rate": 1.3970985320796943e-05, + "loss": 0.3875, + "step": 1640 + }, + { + "epoch": 0.7784629981024668, + "grad_norm": 2.2260053157806396, + "learning_rate": 1.396393284527422e-05, + "loss": 0.3428, + "step": 1641 + }, + { + "epoch": 0.7789373814041746, + "grad_norm": 2.3798251152038574, + "learning_rate": 1.3956878029881167e-05, + "loss": 0.3244, + "step": 1642 + }, + { + "epoch": 0.7794117647058824, + "grad_norm": 2.634979009628296, + "learning_rate": 1.3949820878782166e-05, + "loss": 0.317, + "step": 1643 + }, + { + "epoch": 0.7798861480075902, + "grad_norm": 2.1837527751922607, + "learning_rate": 1.3942761396142982e-05, + "loss": 0.3588, + "step": 1644 + }, + { + "epoch": 0.780360531309298, + "grad_norm": 2.22882080078125, + "learning_rate": 1.3935699586130767e-05, + "loss": 0.3373, + "step": 1645 + }, + { + "epoch": 0.7808349146110057, + "grad_norm": 2.3200697898864746, + "learning_rate": 1.3928635452914028e-05, + "loss": 0.3661, + "step": 1646 + }, + { + "epoch": 0.7813092979127134, + "grad_norm": 2.158055543899536, + "learning_rate": 1.3921569000662658e-05, + "loss": 0.3346, + "step": 1647 + }, + { + "epoch": 0.7817836812144212, + "grad_norm": 2.5481321811676025, + "learning_rate": 1.3914500233547909e-05, + "loss": 0.3935, + "step": 1648 + }, + { + "epoch": 0.782258064516129, + "grad_norm": 2.6886203289031982, + "learning_rate": 1.3907429155742414e-05, + "loss": 0.4086, + "step": 1649 + }, + { + "epoch": 0.7827324478178368, + "grad_norm": 2.635003089904785, + "learning_rate": 1.3900355771420151e-05, + "loss": 0.4449, + "step": 1650 + }, + { + "epoch": 0.7832068311195446, + "grad_norm": 2.5500478744506836, + "learning_rate": 1.3893280084756468e-05, + "loss": 0.3725, + "step": 1651 + }, + { + "epoch": 0.7836812144212524, + "grad_norm": 2.0035715103149414, + "learning_rate": 1.3886202099928083e-05, + "loss": 0.3206, + "step": 1652 + }, + { + "epoch": 0.7841555977229602, + "grad_norm": 2.1694624423980713, + "learning_rate": 1.387912182111305e-05, + "loss": 0.3578, + "step": 1653 + }, + { + "epoch": 0.784629981024668, + "grad_norm": 2.505841016769409, + "learning_rate": 1.3872039252490796e-05, + "loss": 0.395, + "step": 1654 + }, + { + "epoch": 0.7851043643263758, + "grad_norm": 1.9906131029129028, + "learning_rate": 1.386495439824208e-05, + "loss": 0.3343, + "step": 1655 + }, + { + "epoch": 0.7855787476280834, + "grad_norm": 2.116349458694458, + "learning_rate": 1.3857867262549032e-05, + "loss": 0.3224, + "step": 1656 + }, + { + "epoch": 0.7860531309297912, + "grad_norm": 2.3191020488739014, + "learning_rate": 1.3850777849595114e-05, + "loss": 0.4305, + "step": 1657 + }, + { + "epoch": 0.786527514231499, + "grad_norm": 2.867265224456787, + "learning_rate": 1.3843686163565138e-05, + "loss": 0.4917, + "step": 1658 + }, + { + "epoch": 0.7870018975332068, + "grad_norm": 2.6565215587615967, + "learning_rate": 1.3836592208645252e-05, + "loss": 0.3741, + "step": 1659 + }, + { + "epoch": 0.7874762808349146, + "grad_norm": 2.156074047088623, + "learning_rate": 1.382949598902295e-05, + "loss": 0.352, + "step": 1660 + }, + { + "epoch": 0.7879506641366224, + "grad_norm": 2.667464017868042, + "learning_rate": 1.3822397508887063e-05, + "loss": 0.4249, + "step": 1661 + }, + { + "epoch": 0.7884250474383302, + "grad_norm": 2.4767210483551025, + "learning_rate": 1.381529677242775e-05, + "loss": 0.3748, + "step": 1662 + }, + { + "epoch": 0.788899430740038, + "grad_norm": 2.4702491760253906, + "learning_rate": 1.38081937838365e-05, + "loss": 0.3526, + "step": 1663 + }, + { + "epoch": 0.7893738140417458, + "grad_norm": 2.1653172969818115, + "learning_rate": 1.3801088547306149e-05, + "loss": 0.393, + "step": 1664 + }, + { + "epoch": 0.7898481973434535, + "grad_norm": 2.3579320907592773, + "learning_rate": 1.379398106703084e-05, + "loss": 0.4005, + "step": 1665 + }, + { + "epoch": 0.7903225806451613, + "grad_norm": 1.9658737182617188, + "learning_rate": 1.3786871347206053e-05, + "loss": 0.2901, + "step": 1666 + }, + { + "epoch": 0.790796963946869, + "grad_norm": 2.903672695159912, + "learning_rate": 1.377975939202858e-05, + "loss": 0.4421, + "step": 1667 + }, + { + "epoch": 0.7912713472485768, + "grad_norm": 3.15793514251709, + "learning_rate": 1.377264520569654e-05, + "loss": 0.3416, + "step": 1668 + }, + { + "epoch": 0.7917457305502846, + "grad_norm": 2.4560675621032715, + "learning_rate": 1.3765528792409368e-05, + "loss": 0.3839, + "step": 1669 + }, + { + "epoch": 0.7922201138519924, + "grad_norm": 2.24422550201416, + "learning_rate": 1.3758410156367812e-05, + "loss": 0.33, + "step": 1670 + }, + { + "epoch": 0.7926944971537002, + "grad_norm": 2.6136462688446045, + "learning_rate": 1.375128930177393e-05, + "loss": 0.3796, + "step": 1671 + }, + { + "epoch": 0.793168880455408, + "grad_norm": 2.220956325531006, + "learning_rate": 1.3744166232831093e-05, + "loss": 0.343, + "step": 1672 + }, + { + "epoch": 0.7936432637571158, + "grad_norm": 2.2082831859588623, + "learning_rate": 1.3737040953743981e-05, + "loss": 0.3344, + "step": 1673 + }, + { + "epoch": 0.7941176470588235, + "grad_norm": 2.327711582183838, + "learning_rate": 1.3729913468718574e-05, + "loss": 0.3889, + "step": 1674 + }, + { + "epoch": 0.7945920303605313, + "grad_norm": 2.5623528957366943, + "learning_rate": 1.3722783781962155e-05, + "loss": 0.3582, + "step": 1675 + }, + { + "epoch": 0.7950664136622391, + "grad_norm": 2.1435022354125977, + "learning_rate": 1.3715651897683306e-05, + "loss": 0.3027, + "step": 1676 + }, + { + "epoch": 0.7955407969639469, + "grad_norm": 2.650327205657959, + "learning_rate": 1.3708517820091912e-05, + "loss": 0.3932, + "step": 1677 + }, + { + "epoch": 0.7960151802656547, + "grad_norm": 2.224015235900879, + "learning_rate": 1.3701381553399147e-05, + "loss": 0.3401, + "step": 1678 + }, + { + "epoch": 0.7964895635673624, + "grad_norm": 2.3124001026153564, + "learning_rate": 1.3694243101817475e-05, + "loss": 0.3646, + "step": 1679 + }, + { + "epoch": 0.7969639468690702, + "grad_norm": 2.3588428497314453, + "learning_rate": 1.3687102469560656e-05, + "loss": 0.3372, + "step": 1680 + }, + { + "epoch": 0.797438330170778, + "grad_norm": 2.7870490550994873, + "learning_rate": 1.3679959660843736e-05, + "loss": 0.47, + "step": 1681 + }, + { + "epoch": 0.7979127134724858, + "grad_norm": 2.235140562057495, + "learning_rate": 1.3672814679883044e-05, + "loss": 0.3261, + "step": 1682 + }, + { + "epoch": 0.7983870967741935, + "grad_norm": 2.4782466888427734, + "learning_rate": 1.3665667530896189e-05, + "loss": 0.4113, + "step": 1683 + }, + { + "epoch": 0.7988614800759013, + "grad_norm": 2.6865344047546387, + "learning_rate": 1.3658518218102064e-05, + "loss": 0.4226, + "step": 1684 + }, + { + "epoch": 0.7993358633776091, + "grad_norm": 2.3782565593719482, + "learning_rate": 1.3651366745720837e-05, + "loss": 0.3715, + "step": 1685 + }, + { + "epoch": 0.7998102466793169, + "grad_norm": 2.305004119873047, + "learning_rate": 1.3644213117973954e-05, + "loss": 0.4011, + "step": 1686 + }, + { + "epoch": 0.8002846299810247, + "grad_norm": 2.5552866458892822, + "learning_rate": 1.3637057339084125e-05, + "loss": 0.3444, + "step": 1687 + }, + { + "epoch": 0.8007590132827325, + "grad_norm": 2.5058846473693848, + "learning_rate": 1.3629899413275342e-05, + "loss": 0.3723, + "step": 1688 + }, + { + "epoch": 0.8012333965844403, + "grad_norm": 2.0107834339141846, + "learning_rate": 1.3622739344772853e-05, + "loss": 0.2992, + "step": 1689 + }, + { + "epoch": 0.801707779886148, + "grad_norm": 1.9664660692214966, + "learning_rate": 1.361557713780318e-05, + "loss": 0.3447, + "step": 1690 + }, + { + "epoch": 0.8021821631878557, + "grad_norm": 2.476154088973999, + "learning_rate": 1.3608412796594096e-05, + "loss": 0.4195, + "step": 1691 + }, + { + "epoch": 0.8026565464895635, + "grad_norm": 2.669809103012085, + "learning_rate": 1.360124632537465e-05, + "loss": 0.4169, + "step": 1692 + }, + { + "epoch": 0.8031309297912713, + "grad_norm": 2.68930983543396, + "learning_rate": 1.3594077728375129e-05, + "loss": 0.418, + "step": 1693 + }, + { + "epoch": 0.8036053130929791, + "grad_norm": 2.4470651149749756, + "learning_rate": 1.3586907009827093e-05, + "loss": 0.4022, + "step": 1694 + }, + { + "epoch": 0.8040796963946869, + "grad_norm": 2.488619565963745, + "learning_rate": 1.3579734173963343e-05, + "loss": 0.3691, + "step": 1695 + }, + { + "epoch": 0.8045540796963947, + "grad_norm": 2.0860743522644043, + "learning_rate": 1.3572559225017932e-05, + "loss": 0.3586, + "step": 1696 + }, + { + "epoch": 0.8050284629981025, + "grad_norm": 2.9916765689849854, + "learning_rate": 1.3565382167226162e-05, + "loss": 0.3821, + "step": 1697 + }, + { + "epoch": 0.8055028462998103, + "grad_norm": 2.3840274810791016, + "learning_rate": 1.3558203004824581e-05, + "loss": 0.3548, + "step": 1698 + }, + { + "epoch": 0.8059772296015181, + "grad_norm": 5.072951316833496, + "learning_rate": 1.3551021742050974e-05, + "loss": 0.3889, + "step": 1699 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 1.943625569343567, + "learning_rate": 1.3543838383144374e-05, + "loss": 0.3432, + "step": 1700 + }, + { + "epoch": 0.8069259962049335, + "grad_norm": 1.8006517887115479, + "learning_rate": 1.3536652932345043e-05, + "loss": 0.2821, + "step": 1701 + }, + { + "epoch": 0.8074003795066413, + "grad_norm": 2.0171427726745605, + "learning_rate": 1.3529465393894485e-05, + "loss": 0.3287, + "step": 1702 + }, + { + "epoch": 0.8078747628083491, + "grad_norm": 2.133181095123291, + "learning_rate": 1.3522275772035426e-05, + "loss": 0.3074, + "step": 1703 + }, + { + "epoch": 0.8083491461100569, + "grad_norm": 1.9669580459594727, + "learning_rate": 1.3515084071011837e-05, + "loss": 0.3211, + "step": 1704 + }, + { + "epoch": 0.8088235294117647, + "grad_norm": 2.351128101348877, + "learning_rate": 1.3507890295068902e-05, + "loss": 0.3666, + "step": 1705 + }, + { + "epoch": 0.8092979127134725, + "grad_norm": 2.2970938682556152, + "learning_rate": 1.3500694448453038e-05, + "loss": 0.3401, + "step": 1706 + }, + { + "epoch": 0.8097722960151803, + "grad_norm": 2.5522122383117676, + "learning_rate": 1.3493496535411885e-05, + "loss": 0.3661, + "step": 1707 + }, + { + "epoch": 0.8102466793168881, + "grad_norm": 3.098694324493408, + "learning_rate": 1.3486296560194292e-05, + "loss": 0.3709, + "step": 1708 + }, + { + "epoch": 0.8107210626185958, + "grad_norm": 2.7200305461883545, + "learning_rate": 1.347909452705034e-05, + "loss": 0.4361, + "step": 1709 + }, + { + "epoch": 0.8111954459203036, + "grad_norm": 2.124351978302002, + "learning_rate": 1.3471890440231319e-05, + "loss": 0.411, + "step": 1710 + }, + { + "epoch": 0.8116698292220114, + "grad_norm": 2.3741860389709473, + "learning_rate": 1.3464684303989723e-05, + "loss": 0.36, + "step": 1711 + }, + { + "epoch": 0.8121442125237192, + "grad_norm": 2.1634681224823, + "learning_rate": 1.3457476122579268e-05, + "loss": 0.3456, + "step": 1712 + }, + { + "epoch": 0.812618595825427, + "grad_norm": 2.8045156002044678, + "learning_rate": 1.3450265900254868e-05, + "loss": 0.472, + "step": 1713 + }, + { + "epoch": 0.8130929791271347, + "grad_norm": 3.158280372619629, + "learning_rate": 1.3443053641272656e-05, + "loss": 0.4005, + "step": 1714 + }, + { + "epoch": 0.8135673624288425, + "grad_norm": 2.369152784347534, + "learning_rate": 1.3435839349889945e-05, + "loss": 0.3169, + "step": 1715 + }, + { + "epoch": 0.8140417457305503, + "grad_norm": 2.4343760013580322, + "learning_rate": 1.3428623030365267e-05, + "loss": 0.3438, + "step": 1716 + }, + { + "epoch": 0.8145161290322581, + "grad_norm": 2.882737636566162, + "learning_rate": 1.342140468695834e-05, + "loss": 0.3749, + "step": 1717 + }, + { + "epoch": 0.8149905123339658, + "grad_norm": 2.584279775619507, + "learning_rate": 1.3414184323930082e-05, + "loss": 0.3517, + "step": 1718 + }, + { + "epoch": 0.8154648956356736, + "grad_norm": 2.2734382152557373, + "learning_rate": 1.3406961945542605e-05, + "loss": 0.2909, + "step": 1719 + }, + { + "epoch": 0.8159392789373814, + "grad_norm": 2.0772268772125244, + "learning_rate": 1.3399737556059203e-05, + "loss": 0.3331, + "step": 1720 + }, + { + "epoch": 0.8164136622390892, + "grad_norm": 2.1270837783813477, + "learning_rate": 1.3392511159744364e-05, + "loss": 0.286, + "step": 1721 + }, + { + "epoch": 0.816888045540797, + "grad_norm": 2.7782678604125977, + "learning_rate": 1.3385282760863758e-05, + "loss": 0.3716, + "step": 1722 + }, + { + "epoch": 0.8173624288425048, + "grad_norm": 2.4752237796783447, + "learning_rate": 1.3378052363684238e-05, + "loss": 0.3222, + "step": 1723 + }, + { + "epoch": 0.8178368121442126, + "grad_norm": 2.3941702842712402, + "learning_rate": 1.3370819972473832e-05, + "loss": 0.3384, + "step": 1724 + }, + { + "epoch": 0.8183111954459203, + "grad_norm": 2.5002641677856445, + "learning_rate": 1.3363585591501751e-05, + "loss": 0.3646, + "step": 1725 + }, + { + "epoch": 0.818785578747628, + "grad_norm": 2.297147750854492, + "learning_rate": 1.3356349225038384e-05, + "loss": 0.3964, + "step": 1726 + }, + { + "epoch": 0.8192599620493358, + "grad_norm": 2.366065740585327, + "learning_rate": 1.334911087735528e-05, + "loss": 0.3453, + "step": 1727 + }, + { + "epoch": 0.8197343453510436, + "grad_norm": 2.4925296306610107, + "learning_rate": 1.3341870552725166e-05, + "loss": 0.3305, + "step": 1728 + }, + { + "epoch": 0.8202087286527514, + "grad_norm": 2.4606728553771973, + "learning_rate": 1.3334628255421932e-05, + "loss": 0.3727, + "step": 1729 + }, + { + "epoch": 0.8206831119544592, + "grad_norm": 2.2296173572540283, + "learning_rate": 1.3327383989720639e-05, + "loss": 0.3335, + "step": 1730 + }, + { + "epoch": 0.821157495256167, + "grad_norm": 2.3122830390930176, + "learning_rate": 1.3320137759897505e-05, + "loss": 0.3168, + "step": 1731 + }, + { + "epoch": 0.8216318785578748, + "grad_norm": 2.9335269927978516, + "learning_rate": 1.3312889570229901e-05, + "loss": 0.391, + "step": 1732 + }, + { + "epoch": 0.8221062618595826, + "grad_norm": 1.8690437078475952, + "learning_rate": 1.3305639424996369e-05, + "loss": 0.3191, + "step": 1733 + }, + { + "epoch": 0.8225806451612904, + "grad_norm": 2.195049524307251, + "learning_rate": 1.3298387328476594e-05, + "loss": 0.3354, + "step": 1734 + }, + { + "epoch": 0.823055028462998, + "grad_norm": 2.337287187576294, + "learning_rate": 1.3291133284951418e-05, + "loss": 0.3784, + "step": 1735 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 2.4185826778411865, + "learning_rate": 1.328387729870283e-05, + "loss": 0.3549, + "step": 1736 + }, + { + "epoch": 0.8240037950664136, + "grad_norm": 2.838695526123047, + "learning_rate": 1.3276619374013965e-05, + "loss": 0.4295, + "step": 1737 + }, + { + "epoch": 0.8244781783681214, + "grad_norm": 2.432915687561035, + "learning_rate": 1.3269359515169113e-05, + "loss": 0.2935, + "step": 1738 + }, + { + "epoch": 0.8249525616698292, + "grad_norm": 3.295152187347412, + "learning_rate": 1.326209772645369e-05, + "loss": 0.473, + "step": 1739 + }, + { + "epoch": 0.825426944971537, + "grad_norm": 3.3393542766571045, + "learning_rate": 1.3254834012154259e-05, + "loss": 0.2849, + "step": 1740 + }, + { + "epoch": 0.8259013282732448, + "grad_norm": 2.1521685123443604, + "learning_rate": 1.3247568376558515e-05, + "loss": 0.3756, + "step": 1741 + }, + { + "epoch": 0.8263757115749526, + "grad_norm": 2.7531650066375732, + "learning_rate": 1.3240300823955297e-05, + "loss": 0.3388, + "step": 1742 + }, + { + "epoch": 0.8268500948766604, + "grad_norm": 2.021456241607666, + "learning_rate": 1.3233031358634566e-05, + "loss": 0.3245, + "step": 1743 + }, + { + "epoch": 0.8273244781783681, + "grad_norm": 2.3515677452087402, + "learning_rate": 1.3225759984887416e-05, + "loss": 0.412, + "step": 1744 + }, + { + "epoch": 0.8277988614800759, + "grad_norm": 1.9360193014144897, + "learning_rate": 1.3218486707006069e-05, + "loss": 0.3299, + "step": 1745 + }, + { + "epoch": 0.8282732447817837, + "grad_norm": 1.9744075536727905, + "learning_rate": 1.3211211529283867e-05, + "loss": 0.3216, + "step": 1746 + }, + { + "epoch": 0.8287476280834914, + "grad_norm": 2.255706548690796, + "learning_rate": 1.3203934456015275e-05, + "loss": 0.3296, + "step": 1747 + }, + { + "epoch": 0.8292220113851992, + "grad_norm": 2.0415632724761963, + "learning_rate": 1.3196655491495877e-05, + "loss": 0.2996, + "step": 1748 + }, + { + "epoch": 0.829696394686907, + "grad_norm": 2.2202017307281494, + "learning_rate": 1.3189374640022372e-05, + "loss": 0.3678, + "step": 1749 + }, + { + "epoch": 0.8301707779886148, + "grad_norm": 2.691748857498169, + "learning_rate": 1.3182091905892581e-05, + "loss": 0.4671, + "step": 1750 + }, + { + "epoch": 0.8306451612903226, + "grad_norm": 2.277745008468628, + "learning_rate": 1.3174807293405427e-05, + "loss": 0.3606, + "step": 1751 + }, + { + "epoch": 0.8311195445920304, + "grad_norm": 2.287468910217285, + "learning_rate": 1.3167520806860943e-05, + "loss": 0.4041, + "step": 1752 + }, + { + "epoch": 0.8315939278937381, + "grad_norm": 2.397675037384033, + "learning_rate": 1.3160232450560268e-05, + "loss": 0.4329, + "step": 1753 + }, + { + "epoch": 0.8320683111954459, + "grad_norm": 2.1399128437042236, + "learning_rate": 1.3152942228805651e-05, + "loss": 0.3077, + "step": 1754 + }, + { + "epoch": 0.8325426944971537, + "grad_norm": 2.231016159057617, + "learning_rate": 1.314565014590044e-05, + "loss": 0.3496, + "step": 1755 + }, + { + "epoch": 0.8330170777988615, + "grad_norm": 1.9128320217132568, + "learning_rate": 1.3138356206149069e-05, + "loss": 0.2968, + "step": 1756 + }, + { + "epoch": 0.8334914611005693, + "grad_norm": 2.082197666168213, + "learning_rate": 1.3131060413857087e-05, + "loss": 0.323, + "step": 1757 + }, + { + "epoch": 0.8339658444022771, + "grad_norm": 2.4280385971069336, + "learning_rate": 1.3123762773331127e-05, + "loss": 0.3908, + "step": 1758 + }, + { + "epoch": 0.8344402277039848, + "grad_norm": 2.383655071258545, + "learning_rate": 1.3116463288878914e-05, + "loss": 0.3721, + "step": 1759 + }, + { + "epoch": 0.8349146110056926, + "grad_norm": 2.2262542247772217, + "learning_rate": 1.3109161964809256e-05, + "loss": 0.368, + "step": 1760 + }, + { + "epoch": 0.8353889943074004, + "grad_norm": 2.4055254459381104, + "learning_rate": 1.310185880543206e-05, + "loss": 0.3321, + "step": 1761 + }, + { + "epoch": 0.8358633776091081, + "grad_norm": 2.6039791107177734, + "learning_rate": 1.3094553815058304e-05, + "loss": 0.3922, + "step": 1762 + }, + { + "epoch": 0.8363377609108159, + "grad_norm": 1.9806139469146729, + "learning_rate": 1.3087246998000054e-05, + "loss": 0.3628, + "step": 1763 + }, + { + "epoch": 0.8368121442125237, + "grad_norm": 2.162916660308838, + "learning_rate": 1.307993835857045e-05, + "loss": 0.3516, + "step": 1764 + }, + { + "epoch": 0.8372865275142315, + "grad_norm": 2.237438917160034, + "learning_rate": 1.307262790108371e-05, + "loss": 0.3383, + "step": 1765 + }, + { + "epoch": 0.8377609108159393, + "grad_norm": 2.004232406616211, + "learning_rate": 1.3065315629855124e-05, + "loss": 0.3077, + "step": 1766 + }, + { + "epoch": 0.8382352941176471, + "grad_norm": 2.4037442207336426, + "learning_rate": 1.3058001549201056e-05, + "loss": 0.3527, + "step": 1767 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 2.4244558811187744, + "learning_rate": 1.3050685663438931e-05, + "loss": 0.3533, + "step": 1768 + }, + { + "epoch": 0.8391840607210627, + "grad_norm": 2.3665969371795654, + "learning_rate": 1.3043367976887251e-05, + "loss": 0.3269, + "step": 1769 + }, + { + "epoch": 0.8396584440227703, + "grad_norm": 2.266602039337158, + "learning_rate": 1.3036048493865567e-05, + "loss": 0.3345, + "step": 1770 + }, + { + "epoch": 0.8401328273244781, + "grad_norm": 2.609055757522583, + "learning_rate": 1.3028727218694503e-05, + "loss": 0.3333, + "step": 1771 + }, + { + "epoch": 0.8406072106261859, + "grad_norm": 2.1717774868011475, + "learning_rate": 1.3021404155695728e-05, + "loss": 0.3405, + "step": 1772 + }, + { + "epoch": 0.8410815939278937, + "grad_norm": 2.0103063583374023, + "learning_rate": 1.301407930919198e-05, + "loss": 0.2846, + "step": 1773 + }, + { + "epoch": 0.8415559772296015, + "grad_norm": 2.3277881145477295, + "learning_rate": 1.300675268350704e-05, + "loss": 0.331, + "step": 1774 + }, + { + "epoch": 0.8420303605313093, + "grad_norm": 2.5206899642944336, + "learning_rate": 1.2999424282965747e-05, + "loss": 0.3298, + "step": 1775 + }, + { + "epoch": 0.8425047438330171, + "grad_norm": 2.3512508869171143, + "learning_rate": 1.299209411189398e-05, + "loss": 0.3856, + "step": 1776 + }, + { + "epoch": 0.8429791271347249, + "grad_norm": 2.2688114643096924, + "learning_rate": 1.2984762174618664e-05, + "loss": 0.3016, + "step": 1777 + }, + { + "epoch": 0.8434535104364327, + "grad_norm": 2.555619239807129, + "learning_rate": 1.2977428475467773e-05, + "loss": 0.317, + "step": 1778 + }, + { + "epoch": 0.8439278937381404, + "grad_norm": 2.4550106525421143, + "learning_rate": 1.2970093018770318e-05, + "loss": 0.3679, + "step": 1779 + }, + { + "epoch": 0.8444022770398482, + "grad_norm": 2.240664005279541, + "learning_rate": 1.2962755808856341e-05, + "loss": 0.3785, + "step": 1780 + }, + { + "epoch": 0.844876660341556, + "grad_norm": 2.236607313156128, + "learning_rate": 1.295541685005693e-05, + "loss": 0.3545, + "step": 1781 + }, + { + "epoch": 0.8453510436432637, + "grad_norm": 2.1329894065856934, + "learning_rate": 1.2948076146704195e-05, + "loss": 0.3287, + "step": 1782 + }, + { + "epoch": 0.8458254269449715, + "grad_norm": 2.1090683937072754, + "learning_rate": 1.2940733703131287e-05, + "loss": 0.3241, + "step": 1783 + }, + { + "epoch": 0.8462998102466793, + "grad_norm": 2.2232136726379395, + "learning_rate": 1.293338952367237e-05, + "loss": 0.3764, + "step": 1784 + }, + { + "epoch": 0.8467741935483871, + "grad_norm": 2.0039751529693604, + "learning_rate": 1.2926043612662646e-05, + "loss": 0.3344, + "step": 1785 + }, + { + "epoch": 0.8472485768500949, + "grad_norm": 2.570286273956299, + "learning_rate": 1.291869597443833e-05, + "loss": 0.3779, + "step": 1786 + }, + { + "epoch": 0.8477229601518027, + "grad_norm": 2.4321069717407227, + "learning_rate": 1.2911346613336666e-05, + "loss": 0.4621, + "step": 1787 + }, + { + "epoch": 0.8481973434535104, + "grad_norm": 2.2266476154327393, + "learning_rate": 1.2903995533695904e-05, + "loss": 0.3687, + "step": 1788 + }, + { + "epoch": 0.8486717267552182, + "grad_norm": 1.9736994504928589, + "learning_rate": 1.2896642739855311e-05, + "loss": 0.3805, + "step": 1789 + }, + { + "epoch": 0.849146110056926, + "grad_norm": 3.032970666885376, + "learning_rate": 1.2889288236155177e-05, + "loss": 0.403, + "step": 1790 + }, + { + "epoch": 0.8496204933586338, + "grad_norm": 2.4022488594055176, + "learning_rate": 1.2881932026936785e-05, + "loss": 0.405, + "step": 1791 + }, + { + "epoch": 0.8500948766603416, + "grad_norm": 2.144449234008789, + "learning_rate": 1.2874574116542439e-05, + "loss": 0.3194, + "step": 1792 + }, + { + "epoch": 0.8505692599620494, + "grad_norm": 2.102857828140259, + "learning_rate": 1.2867214509315434e-05, + "loss": 0.339, + "step": 1793 + }, + { + "epoch": 0.8510436432637571, + "grad_norm": 2.4417030811309814, + "learning_rate": 1.2859853209600075e-05, + "loss": 0.3896, + "step": 1794 + }, + { + "epoch": 0.8515180265654649, + "grad_norm": 2.425147533416748, + "learning_rate": 1.2852490221741669e-05, + "loss": 0.2993, + "step": 1795 + }, + { + "epoch": 0.8519924098671727, + "grad_norm": 2.761335849761963, + "learning_rate": 1.284512555008651e-05, + "loss": 0.3798, + "step": 1796 + }, + { + "epoch": 0.8524667931688804, + "grad_norm": 2.5409250259399414, + "learning_rate": 1.2837759198981894e-05, + "loss": 0.3064, + "step": 1797 + }, + { + "epoch": 0.8529411764705882, + "grad_norm": 2.241783380508423, + "learning_rate": 1.28303911727761e-05, + "loss": 0.3359, + "step": 1798 + }, + { + "epoch": 0.853415559772296, + "grad_norm": 2.344348907470703, + "learning_rate": 1.2823021475818408e-05, + "loss": 0.3305, + "step": 1799 + }, + { + "epoch": 0.8538899430740038, + "grad_norm": 2.3390181064605713, + "learning_rate": 1.2815650112459075e-05, + "loss": 0.3343, + "step": 1800 + }, + { + "epoch": 0.8543643263757116, + "grad_norm": 1.938061237335205, + "learning_rate": 1.2808277087049338e-05, + "loss": 0.2944, + "step": 1801 + }, + { + "epoch": 0.8548387096774194, + "grad_norm": 2.33855938911438, + "learning_rate": 1.2800902403941429e-05, + "loss": 0.4172, + "step": 1802 + }, + { + "epoch": 0.8553130929791272, + "grad_norm": 2.146927833557129, + "learning_rate": 1.279352606748855e-05, + "loss": 0.3548, + "step": 1803 + }, + { + "epoch": 0.855787476280835, + "grad_norm": 2.442716598510742, + "learning_rate": 1.278614808204487e-05, + "loss": 0.3444, + "step": 1804 + }, + { + "epoch": 0.8562618595825426, + "grad_norm": 2.412533760070801, + "learning_rate": 1.277876845196555e-05, + "loss": 0.4137, + "step": 1805 + }, + { + "epoch": 0.8567362428842504, + "grad_norm": 2.463116407394409, + "learning_rate": 1.2771387181606709e-05, + "loss": 0.4336, + "step": 1806 + }, + { + "epoch": 0.8572106261859582, + "grad_norm": 2.2430520057678223, + "learning_rate": 1.2764004275325443e-05, + "loss": 0.2363, + "step": 1807 + }, + { + "epoch": 0.857685009487666, + "grad_norm": 2.2711448669433594, + "learning_rate": 1.2756619737479801e-05, + "loss": 0.4087, + "step": 1808 + }, + { + "epoch": 0.8581593927893738, + "grad_norm": 2.3729405403137207, + "learning_rate": 1.2749233572428805e-05, + "loss": 0.3526, + "step": 1809 + }, + { + "epoch": 0.8586337760910816, + "grad_norm": 2.3605003356933594, + "learning_rate": 1.2741845784532436e-05, + "loss": 0.3231, + "step": 1810 + }, + { + "epoch": 0.8591081593927894, + "grad_norm": 2.177626848220825, + "learning_rate": 1.2734456378151636e-05, + "loss": 0.3287, + "step": 1811 + }, + { + "epoch": 0.8595825426944972, + "grad_norm": 2.164935350418091, + "learning_rate": 1.272706535764829e-05, + "loss": 0.2948, + "step": 1812 + }, + { + "epoch": 0.860056925996205, + "grad_norm": 2.3316850662231445, + "learning_rate": 1.2719672727385249e-05, + "loss": 0.3226, + "step": 1813 + }, + { + "epoch": 0.8605313092979127, + "grad_norm": 2.731200933456421, + "learning_rate": 1.271227849172631e-05, + "loss": 0.358, + "step": 1814 + }, + { + "epoch": 0.8610056925996205, + "grad_norm": 3.0866591930389404, + "learning_rate": 1.270488265503622e-05, + "loss": 0.4681, + "step": 1815 + }, + { + "epoch": 0.8614800759013282, + "grad_norm": 2.4756972789764404, + "learning_rate": 1.2697485221680663e-05, + "loss": 0.3664, + "step": 1816 + }, + { + "epoch": 0.861954459203036, + "grad_norm": 2.456561803817749, + "learning_rate": 1.269008619602627e-05, + "loss": 0.334, + "step": 1817 + }, + { + "epoch": 0.8624288425047438, + "grad_norm": 2.1399877071380615, + "learning_rate": 1.2682685582440614e-05, + "loss": 0.3091, + "step": 1818 + }, + { + "epoch": 0.8629032258064516, + "grad_norm": 2.5643470287323, + "learning_rate": 1.2675283385292212e-05, + "loss": 0.4395, + "step": 1819 + }, + { + "epoch": 0.8633776091081594, + "grad_norm": 2.543917179107666, + "learning_rate": 1.2667879608950494e-05, + "loss": 0.3285, + "step": 1820 + }, + { + "epoch": 0.8638519924098672, + "grad_norm": 2.08056902885437, + "learning_rate": 1.2660474257785844e-05, + "loss": 0.3604, + "step": 1821 + }, + { + "epoch": 0.864326375711575, + "grad_norm": 2.5211234092712402, + "learning_rate": 1.2653067336169566e-05, + "loss": 0.326, + "step": 1822 + }, + { + "epoch": 0.8648007590132827, + "grad_norm": 1.8964157104492188, + "learning_rate": 1.2645658848473894e-05, + "loss": 0.2578, + "step": 1823 + }, + { + "epoch": 0.8652751423149905, + "grad_norm": 1.9530812501907349, + "learning_rate": 1.2638248799071985e-05, + "loss": 0.3031, + "step": 1824 + }, + { + "epoch": 0.8657495256166983, + "grad_norm": 2.3713808059692383, + "learning_rate": 1.2630837192337913e-05, + "loss": 0.3138, + "step": 1825 + }, + { + "epoch": 0.8662239089184061, + "grad_norm": 2.222252607345581, + "learning_rate": 1.2623424032646683e-05, + "loss": 0.3455, + "step": 1826 + }, + { + "epoch": 0.8666982922201139, + "grad_norm": 2.9525506496429443, + "learning_rate": 1.2616009324374205e-05, + "loss": 0.4101, + "step": 1827 + }, + { + "epoch": 0.8671726755218216, + "grad_norm": 2.140566825866699, + "learning_rate": 1.2608593071897311e-05, + "loss": 0.3621, + "step": 1828 + }, + { + "epoch": 0.8676470588235294, + "grad_norm": 2.7032320499420166, + "learning_rate": 1.2601175279593737e-05, + "loss": 0.4193, + "step": 1829 + }, + { + "epoch": 0.8681214421252372, + "grad_norm": 2.723836660385132, + "learning_rate": 1.2593755951842134e-05, + "loss": 0.4037, + "step": 1830 + }, + { + "epoch": 0.868595825426945, + "grad_norm": 2.102358102798462, + "learning_rate": 1.2586335093022064e-05, + "loss": 0.3605, + "step": 1831 + }, + { + "epoch": 0.8690702087286527, + "grad_norm": 2.0517046451568604, + "learning_rate": 1.2578912707513977e-05, + "loss": 0.3291, + "step": 1832 + }, + { + "epoch": 0.8695445920303605, + "grad_norm": 2.466214418411255, + "learning_rate": 1.257148879969924e-05, + "loss": 0.3382, + "step": 1833 + }, + { + "epoch": 0.8700189753320683, + "grad_norm": 2.1797330379486084, + "learning_rate": 1.2564063373960109e-05, + "loss": 0.3483, + "step": 1834 + }, + { + "epoch": 0.8704933586337761, + "grad_norm": 2.182875394821167, + "learning_rate": 1.2556636434679744e-05, + "loss": 0.3307, + "step": 1835 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 1.9578689336776733, + "learning_rate": 1.254920798624219e-05, + "loss": 0.284, + "step": 1836 + }, + { + "epoch": 0.8714421252371917, + "grad_norm": 2.1100683212280273, + "learning_rate": 1.2541778033032383e-05, + "loss": 0.3402, + "step": 1837 + }, + { + "epoch": 0.8719165085388995, + "grad_norm": 2.199645757675171, + "learning_rate": 1.2534346579436158e-05, + "loss": 0.3736, + "step": 1838 + }, + { + "epoch": 0.8723908918406073, + "grad_norm": 2.371626853942871, + "learning_rate": 1.2526913629840228e-05, + "loss": 0.4038, + "step": 1839 + }, + { + "epoch": 0.872865275142315, + "grad_norm": 2.2260429859161377, + "learning_rate": 1.2519479188632184e-05, + "loss": 0.3058, + "step": 1840 + }, + { + "epoch": 0.8733396584440227, + "grad_norm": 2.0778400897979736, + "learning_rate": 1.2512043260200506e-05, + "loss": 0.34, + "step": 1841 + }, + { + "epoch": 0.8738140417457305, + "grad_norm": 1.6490452289581299, + "learning_rate": 1.2504605848934552e-05, + "loss": 0.2596, + "step": 1842 + }, + { + "epoch": 0.8742884250474383, + "grad_norm": 2.222475051879883, + "learning_rate": 1.2497166959224546e-05, + "loss": 0.2611, + "step": 1843 + }, + { + "epoch": 0.8747628083491461, + "grad_norm": 2.1947946548461914, + "learning_rate": 1.2489726595461598e-05, + "loss": 0.3622, + "step": 1844 + }, + { + "epoch": 0.8752371916508539, + "grad_norm": 2.6763131618499756, + "learning_rate": 1.2482284762037675e-05, + "loss": 0.3425, + "step": 1845 + }, + { + "epoch": 0.8757115749525617, + "grad_norm": 1.7346967458724976, + "learning_rate": 1.247484146334562e-05, + "loss": 0.243, + "step": 1846 + }, + { + "epoch": 0.8761859582542695, + "grad_norm": 2.4290475845336914, + "learning_rate": 1.246739670377914e-05, + "loss": 0.4306, + "step": 1847 + }, + { + "epoch": 0.8766603415559773, + "grad_norm": 2.096940517425537, + "learning_rate": 1.2459950487732804e-05, + "loss": 0.3498, + "step": 1848 + }, + { + "epoch": 0.877134724857685, + "grad_norm": 1.9826620817184448, + "learning_rate": 1.2452502819602035e-05, + "loss": 0.2844, + "step": 1849 + }, + { + "epoch": 0.8776091081593927, + "grad_norm": 2.250736951828003, + "learning_rate": 1.2445053703783118e-05, + "loss": 0.3541, + "step": 1850 + }, + { + "epoch": 0.8780834914611005, + "grad_norm": 2.070547342300415, + "learning_rate": 1.2437603144673198e-05, + "loss": 0.2969, + "step": 1851 + }, + { + "epoch": 0.8785578747628083, + "grad_norm": 2.3527519702911377, + "learning_rate": 1.2430151146670261e-05, + "loss": 0.3719, + "step": 1852 + }, + { + "epoch": 0.8790322580645161, + "grad_norm": 2.063654661178589, + "learning_rate": 1.242269771417315e-05, + "loss": 0.315, + "step": 1853 + }, + { + "epoch": 0.8795066413662239, + "grad_norm": 2.292386531829834, + "learning_rate": 1.2415242851581552e-05, + "loss": 0.3539, + "step": 1854 + }, + { + "epoch": 0.8799810246679317, + "grad_norm": 2.2170569896698, + "learning_rate": 1.2407786563296e-05, + "loss": 0.2982, + "step": 1855 + }, + { + "epoch": 0.8804554079696395, + "grad_norm": 2.0592644214630127, + "learning_rate": 1.2400328853717862e-05, + "loss": 0.3065, + "step": 1856 + }, + { + "epoch": 0.8809297912713473, + "grad_norm": 1.8755029439926147, + "learning_rate": 1.2392869727249358e-05, + "loss": 0.3065, + "step": 1857 + }, + { + "epoch": 0.881404174573055, + "grad_norm": 1.9304600954055786, + "learning_rate": 1.2385409188293528e-05, + "loss": 0.3204, + "step": 1858 + }, + { + "epoch": 0.8818785578747628, + "grad_norm": 2.4600729942321777, + "learning_rate": 1.2377947241254263e-05, + "loss": 0.3721, + "step": 1859 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 2.047233819961548, + "learning_rate": 1.2370483890536271e-05, + "loss": 0.3081, + "step": 1860 + }, + { + "epoch": 0.8828273244781784, + "grad_norm": 1.9848384857177734, + "learning_rate": 1.2363019140545096e-05, + "loss": 0.2854, + "step": 1861 + }, + { + "epoch": 0.8833017077798861, + "grad_norm": 2.357391595840454, + "learning_rate": 1.2355552995687104e-05, + "loss": 0.4203, + "step": 1862 + }, + { + "epoch": 0.8837760910815939, + "grad_norm": 1.8160470724105835, + "learning_rate": 1.2348085460369488e-05, + "loss": 0.236, + "step": 1863 + }, + { + "epoch": 0.8842504743833017, + "grad_norm": 2.142580509185791, + "learning_rate": 1.234061653900026e-05, + "loss": 0.3978, + "step": 1864 + }, + { + "epoch": 0.8847248576850095, + "grad_norm": 2.4461870193481445, + "learning_rate": 1.2333146235988251e-05, + "loss": 0.3631, + "step": 1865 + }, + { + "epoch": 0.8851992409867173, + "grad_norm": 2.1813673973083496, + "learning_rate": 1.2325674555743106e-05, + "loss": 0.3192, + "step": 1866 + }, + { + "epoch": 0.885673624288425, + "grad_norm": 2.124764919281006, + "learning_rate": 1.2318201502675285e-05, + "loss": 0.3734, + "step": 1867 + }, + { + "epoch": 0.8861480075901328, + "grad_norm": 1.9253191947937012, + "learning_rate": 1.2310727081196054e-05, + "loss": 0.3054, + "step": 1868 + }, + { + "epoch": 0.8866223908918406, + "grad_norm": 1.8508185148239136, + "learning_rate": 1.2303251295717495e-05, + "loss": 0.2451, + "step": 1869 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 2.3445682525634766, + "learning_rate": 1.2295774150652486e-05, + "loss": 0.3997, + "step": 1870 + }, + { + "epoch": 0.8875711574952562, + "grad_norm": 2.4285199642181396, + "learning_rate": 1.2288295650414716e-05, + "loss": 0.3243, + "step": 1871 + }, + { + "epoch": 0.888045540796964, + "grad_norm": 1.8846771717071533, + "learning_rate": 1.2280815799418666e-05, + "loss": 0.2579, + "step": 1872 + }, + { + "epoch": 0.8885199240986718, + "grad_norm": 3.0957839488983154, + "learning_rate": 1.2273334602079621e-05, + "loss": 0.4514, + "step": 1873 + }, + { + "epoch": 0.8889943074003795, + "grad_norm": 2.224541425704956, + "learning_rate": 1.2265852062813652e-05, + "loss": 0.3709, + "step": 1874 + }, + { + "epoch": 0.8894686907020873, + "grad_norm": 2.923872470855713, + "learning_rate": 1.2258368186037638e-05, + "loss": 0.4537, + "step": 1875 + }, + { + "epoch": 0.889943074003795, + "grad_norm": 2.0478696823120117, + "learning_rate": 1.2250882976169228e-05, + "loss": 0.3222, + "step": 1876 + }, + { + "epoch": 0.8904174573055028, + "grad_norm": 2.2573540210723877, + "learning_rate": 1.2243396437626866e-05, + "loss": 0.37, + "step": 1877 + }, + { + "epoch": 0.8908918406072106, + "grad_norm": 2.3689069747924805, + "learning_rate": 1.2235908574829792e-05, + "loss": 0.3659, + "step": 1878 + }, + { + "epoch": 0.8913662239089184, + "grad_norm": 2.4283788204193115, + "learning_rate": 1.2228419392198008e-05, + "loss": 0.3417, + "step": 1879 + }, + { + "epoch": 0.8918406072106262, + "grad_norm": 2.9532792568206787, + "learning_rate": 1.2220928894152313e-05, + "loss": 0.2853, + "step": 1880 + }, + { + "epoch": 0.892314990512334, + "grad_norm": 2.0899689197540283, + "learning_rate": 1.2213437085114263e-05, + "loss": 0.3315, + "step": 1881 + }, + { + "epoch": 0.8927893738140418, + "grad_norm": 2.3636410236358643, + "learning_rate": 1.2205943969506207e-05, + "loss": 0.4148, + "step": 1882 + }, + { + "epoch": 0.8932637571157496, + "grad_norm": 2.356285810470581, + "learning_rate": 1.2198449551751255e-05, + "loss": 0.3218, + "step": 1883 + }, + { + "epoch": 0.8937381404174574, + "grad_norm": 2.254668951034546, + "learning_rate": 1.219095383627329e-05, + "loss": 0.3474, + "step": 1884 + }, + { + "epoch": 0.894212523719165, + "grad_norm": 2.1036298274993896, + "learning_rate": 1.2183456827496951e-05, + "loss": 0.3559, + "step": 1885 + }, + { + "epoch": 0.8946869070208728, + "grad_norm": 2.2919163703918457, + "learning_rate": 1.2175958529847654e-05, + "loss": 0.3481, + "step": 1886 + }, + { + "epoch": 0.8951612903225806, + "grad_norm": 2.041938066482544, + "learning_rate": 1.216845894775157e-05, + "loss": 0.2727, + "step": 1887 + }, + { + "epoch": 0.8956356736242884, + "grad_norm": 2.564687728881836, + "learning_rate": 1.2160958085635628e-05, + "loss": 0.4146, + "step": 1888 + }, + { + "epoch": 0.8961100569259962, + "grad_norm": 2.3934693336486816, + "learning_rate": 1.2153455947927509e-05, + "loss": 0.2986, + "step": 1889 + }, + { + "epoch": 0.896584440227704, + "grad_norm": 2.548827886581421, + "learning_rate": 1.2145952539055654e-05, + "loss": 0.4007, + "step": 1890 + }, + { + "epoch": 0.8970588235294118, + "grad_norm": 2.382176399230957, + "learning_rate": 1.213844786344925e-05, + "loss": 0.3523, + "step": 1891 + }, + { + "epoch": 0.8975332068311196, + "grad_norm": 2.39910626411438, + "learning_rate": 1.2130941925538237e-05, + "loss": 0.3493, + "step": 1892 + }, + { + "epoch": 0.8980075901328273, + "grad_norm": 2.387073278427124, + "learning_rate": 1.2123434729753287e-05, + "loss": 0.3532, + "step": 1893 + }, + { + "epoch": 0.8984819734345351, + "grad_norm": 2.265078544616699, + "learning_rate": 1.211592628052583e-05, + "loss": 0.3608, + "step": 1894 + }, + { + "epoch": 0.8989563567362429, + "grad_norm": 2.8354647159576416, + "learning_rate": 1.2108416582288027e-05, + "loss": 0.3634, + "step": 1895 + }, + { + "epoch": 0.8994307400379506, + "grad_norm": 2.2116317749023438, + "learning_rate": 1.210090563947278e-05, + "loss": 0.31, + "step": 1896 + }, + { + "epoch": 0.8999051233396584, + "grad_norm": 2.5344135761260986, + "learning_rate": 1.2093393456513724e-05, + "loss": 0.3346, + "step": 1897 + }, + { + "epoch": 0.9003795066413662, + "grad_norm": 2.046262502670288, + "learning_rate": 1.2085880037845223e-05, + "loss": 0.3562, + "step": 1898 + }, + { + "epoch": 0.900853889943074, + "grad_norm": 1.9915024042129517, + "learning_rate": 1.2078365387902379e-05, + "loss": 0.3242, + "step": 1899 + }, + { + "epoch": 0.9013282732447818, + "grad_norm": 2.071885108947754, + "learning_rate": 1.2070849511121014e-05, + "loss": 0.3272, + "step": 1900 + }, + { + "epoch": 0.9018026565464896, + "grad_norm": 2.1949217319488525, + "learning_rate": 1.2063332411937672e-05, + "loss": 0.3419, + "step": 1901 + }, + { + "epoch": 0.9022770398481973, + "grad_norm": 2.311913013458252, + "learning_rate": 1.2055814094789625e-05, + "loss": 0.3776, + "step": 1902 + }, + { + "epoch": 0.9027514231499051, + "grad_norm": 2.1613712310791016, + "learning_rate": 1.2048294564114859e-05, + "loss": 0.3538, + "step": 1903 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 2.3244266510009766, + "learning_rate": 1.2040773824352088e-05, + "loss": 0.366, + "step": 1904 + }, + { + "epoch": 0.9037001897533207, + "grad_norm": 3.7160706520080566, + "learning_rate": 1.2033251879940716e-05, + "loss": 0.4384, + "step": 1905 + }, + { + "epoch": 0.9041745730550285, + "grad_norm": 1.9680372476577759, + "learning_rate": 1.2025728735320878e-05, + "loss": 0.2941, + "step": 1906 + }, + { + "epoch": 0.9046489563567363, + "grad_norm": 1.8036983013153076, + "learning_rate": 1.2018204394933417e-05, + "loss": 0.2192, + "step": 1907 + }, + { + "epoch": 0.905123339658444, + "grad_norm": 2.109776496887207, + "learning_rate": 1.201067886321987e-05, + "loss": 0.3297, + "step": 1908 + }, + { + "epoch": 0.9055977229601518, + "grad_norm": 2.50654935836792, + "learning_rate": 1.2003152144622493e-05, + "loss": 0.4017, + "step": 1909 + }, + { + "epoch": 0.9060721062618596, + "grad_norm": 2.427208185195923, + "learning_rate": 1.1995624243584219e-05, + "loss": 0.3476, + "step": 1910 + }, + { + "epoch": 0.9065464895635673, + "grad_norm": 3.9568138122558594, + "learning_rate": 1.1988095164548706e-05, + "loss": 0.3642, + "step": 1911 + }, + { + "epoch": 0.9070208728652751, + "grad_norm": 2.389507293701172, + "learning_rate": 1.198056491196029e-05, + "loss": 0.4409, + "step": 1912 + }, + { + "epoch": 0.9074952561669829, + "grad_norm": 2.369760751724243, + "learning_rate": 1.1973033490264e-05, + "loss": 0.3742, + "step": 1913 + }, + { + "epoch": 0.9079696394686907, + "grad_norm": 2.787485361099243, + "learning_rate": 1.1965500903905571e-05, + "loss": 0.3947, + "step": 1914 + }, + { + "epoch": 0.9084440227703985, + "grad_norm": 2.2195568084716797, + "learning_rate": 1.1957967157331404e-05, + "loss": 0.3326, + "step": 1915 + }, + { + "epoch": 0.9089184060721063, + "grad_norm": 2.2224323749542236, + "learning_rate": 1.1950432254988604e-05, + "loss": 0.4156, + "step": 1916 + }, + { + "epoch": 0.9093927893738141, + "grad_norm": 2.9090452194213867, + "learning_rate": 1.1942896201324938e-05, + "loss": 0.4651, + "step": 1917 + }, + { + "epoch": 0.9098671726755219, + "grad_norm": 2.5390193462371826, + "learning_rate": 1.1935359000788873e-05, + "loss": 0.3556, + "step": 1918 + }, + { + "epoch": 0.9103415559772297, + "grad_norm": 2.103929281234741, + "learning_rate": 1.192782065782954e-05, + "loss": 0.3225, + "step": 1919 + }, + { + "epoch": 0.9108159392789373, + "grad_norm": 2.0904929637908936, + "learning_rate": 1.1920281176896752e-05, + "loss": 0.2711, + "step": 1920 + }, + { + "epoch": 0.9112903225806451, + "grad_norm": 2.126173257827759, + "learning_rate": 1.1912740562440988e-05, + "loss": 0.2951, + "step": 1921 + }, + { + "epoch": 0.9117647058823529, + "grad_norm": 1.8137263059616089, + "learning_rate": 1.1905198818913393e-05, + "loss": 0.2783, + "step": 1922 + }, + { + "epoch": 0.9122390891840607, + "grad_norm": 2.573413133621216, + "learning_rate": 1.1897655950765789e-05, + "loss": 0.3153, + "step": 1923 + }, + { + "epoch": 0.9127134724857685, + "grad_norm": 1.9147273302078247, + "learning_rate": 1.189011196245066e-05, + "loss": 0.3062, + "step": 1924 + }, + { + "epoch": 0.9131878557874763, + "grad_norm": 2.3643062114715576, + "learning_rate": 1.1882566858421137e-05, + "loss": 0.4424, + "step": 1925 + }, + { + "epoch": 0.9136622390891841, + "grad_norm": 2.178414821624756, + "learning_rate": 1.1875020643131028e-05, + "loss": 0.3204, + "step": 1926 + }, + { + "epoch": 0.9141366223908919, + "grad_norm": 2.598742961883545, + "learning_rate": 1.1867473321034786e-05, + "loss": 0.4002, + "step": 1927 + }, + { + "epoch": 0.9146110056925996, + "grad_norm": 2.4741811752319336, + "learning_rate": 1.1859924896587528e-05, + "loss": 0.3557, + "step": 1928 + }, + { + "epoch": 0.9150853889943074, + "grad_norm": 2.3986568450927734, + "learning_rate": 1.1852375374245003e-05, + "loss": 0.3971, + "step": 1929 + }, + { + "epoch": 0.9155597722960152, + "grad_norm": 1.920324683189392, + "learning_rate": 1.1844824758463626e-05, + "loss": 0.2705, + "step": 1930 + }, + { + "epoch": 0.9160341555977229, + "grad_norm": 2.353745937347412, + "learning_rate": 1.183727305370045e-05, + "loss": 0.3579, + "step": 1931 + }, + { + "epoch": 0.9165085388994307, + "grad_norm": 2.8409640789031982, + "learning_rate": 1.1829720264413169e-05, + "loss": 0.3981, + "step": 1932 + }, + { + "epoch": 0.9169829222011385, + "grad_norm": 2.5938303470611572, + "learning_rate": 1.1822166395060124e-05, + "loss": 0.3382, + "step": 1933 + }, + { + "epoch": 0.9174573055028463, + "grad_norm": 2.2392194271087646, + "learning_rate": 1.1814611450100286e-05, + "loss": 0.3603, + "step": 1934 + }, + { + "epoch": 0.9179316888045541, + "grad_norm": 2.44313907623291, + "learning_rate": 1.1807055433993266e-05, + "loss": 0.3935, + "step": 1935 + }, + { + "epoch": 0.9184060721062619, + "grad_norm": 2.418811559677124, + "learning_rate": 1.1799498351199303e-05, + "loss": 0.3081, + "step": 1936 + }, + { + "epoch": 0.9188804554079696, + "grad_norm": 2.3761041164398193, + "learning_rate": 1.179194020617927e-05, + "loss": 0.3717, + "step": 1937 + }, + { + "epoch": 0.9193548387096774, + "grad_norm": 2.0884335041046143, + "learning_rate": 1.1784381003394669e-05, + "loss": 0.3512, + "step": 1938 + }, + { + "epoch": 0.9198292220113852, + "grad_norm": 1.974050760269165, + "learning_rate": 1.1776820747307615e-05, + "loss": 0.3303, + "step": 1939 + }, + { + "epoch": 0.920303605313093, + "grad_norm": 2.162907600402832, + "learning_rate": 1.1769259442380862e-05, + "loss": 0.3726, + "step": 1940 + }, + { + "epoch": 0.9207779886148008, + "grad_norm": 2.2310690879821777, + "learning_rate": 1.1761697093077762e-05, + "loss": 0.3117, + "step": 1941 + }, + { + "epoch": 0.9212523719165086, + "grad_norm": 2.247523307800293, + "learning_rate": 1.1754133703862302e-05, + "loss": 0.3046, + "step": 1942 + }, + { + "epoch": 0.9217267552182163, + "grad_norm": 2.2049553394317627, + "learning_rate": 1.1746569279199073e-05, + "loss": 0.2433, + "step": 1943 + }, + { + "epoch": 0.9222011385199241, + "grad_norm": 2.9283275604248047, + "learning_rate": 1.1739003823553286e-05, + "loss": 0.3686, + "step": 1944 + }, + { + "epoch": 0.9226755218216319, + "grad_norm": 1.9516762495040894, + "learning_rate": 1.1731437341390748e-05, + "loss": 0.2654, + "step": 1945 + }, + { + "epoch": 0.9231499051233396, + "grad_norm": 2.1916255950927734, + "learning_rate": 1.172386983717788e-05, + "loss": 0.3597, + "step": 1946 + }, + { + "epoch": 0.9236242884250474, + "grad_norm": 2.2968626022338867, + "learning_rate": 1.1716301315381706e-05, + "loss": 0.3295, + "step": 1947 + }, + { + "epoch": 0.9240986717267552, + "grad_norm": 2.341371774673462, + "learning_rate": 1.170873178046985e-05, + "loss": 0.3365, + "step": 1948 + }, + { + "epoch": 0.924573055028463, + "grad_norm": 2.3758347034454346, + "learning_rate": 1.1701161236910532e-05, + "loss": 0.3714, + "step": 1949 + }, + { + "epoch": 0.9250474383301708, + "grad_norm": 2.347499132156372, + "learning_rate": 1.1693589689172566e-05, + "loss": 0.3345, + "step": 1950 + }, + { + "epoch": 0.9255218216318786, + "grad_norm": 2.4397614002227783, + "learning_rate": 1.1686017141725367e-05, + "loss": 0.295, + "step": 1951 + }, + { + "epoch": 0.9259962049335864, + "grad_norm": 1.9351602792739868, + "learning_rate": 1.167844359903894e-05, + "loss": 0.2817, + "step": 1952 + }, + { + "epoch": 0.9264705882352942, + "grad_norm": 2.63478684425354, + "learning_rate": 1.1670869065583856e-05, + "loss": 0.4525, + "step": 1953 + }, + { + "epoch": 0.926944971537002, + "grad_norm": 3.3862996101379395, + "learning_rate": 1.1663293545831302e-05, + "loss": 0.3677, + "step": 1954 + }, + { + "epoch": 0.9274193548387096, + "grad_norm": 2.883392333984375, + "learning_rate": 1.1655717044253025e-05, + "loss": 0.41, + "step": 1955 + }, + { + "epoch": 0.9278937381404174, + "grad_norm": 2.115570068359375, + "learning_rate": 1.164813956532136e-05, + "loss": 0.3187, + "step": 1956 + }, + { + "epoch": 0.9283681214421252, + "grad_norm": 1.8709588050842285, + "learning_rate": 1.1640561113509222e-05, + "loss": 0.2728, + "step": 1957 + }, + { + "epoch": 0.928842504743833, + "grad_norm": 2.141753911972046, + "learning_rate": 1.1632981693290089e-05, + "loss": 0.2978, + "step": 1958 + }, + { + "epoch": 0.9293168880455408, + "grad_norm": 2.3228015899658203, + "learning_rate": 1.1625401309138025e-05, + "loss": 0.3375, + "step": 1959 + }, + { + "epoch": 0.9297912713472486, + "grad_norm": 2.2550671100616455, + "learning_rate": 1.161781996552765e-05, + "loss": 0.3075, + "step": 1960 + }, + { + "epoch": 0.9302656546489564, + "grad_norm": 2.196171998977661, + "learning_rate": 1.1610237666934158e-05, + "loss": 0.3969, + "step": 1961 + }, + { + "epoch": 0.9307400379506642, + "grad_norm": 3.4995615482330322, + "learning_rate": 1.1602654417833305e-05, + "loss": 0.349, + "step": 1962 + }, + { + "epoch": 0.931214421252372, + "grad_norm": 2.3877415657043457, + "learning_rate": 1.1595070222701408e-05, + "loss": 0.3497, + "step": 1963 + }, + { + "epoch": 0.9316888045540797, + "grad_norm": 2.260338306427002, + "learning_rate": 1.1587485086015346e-05, + "loss": 0.2895, + "step": 1964 + }, + { + "epoch": 0.9321631878557874, + "grad_norm": 2.212602138519287, + "learning_rate": 1.1579899012252543e-05, + "loss": 0.3157, + "step": 1965 + }, + { + "epoch": 0.9326375711574952, + "grad_norm": 1.9429450035095215, + "learning_rate": 1.1572312005890986e-05, + "loss": 0.3059, + "step": 1966 + }, + { + "epoch": 0.933111954459203, + "grad_norm": 2.017580509185791, + "learning_rate": 1.1564724071409213e-05, + "loss": 0.2694, + "step": 1967 + }, + { + "epoch": 0.9335863377609108, + "grad_norm": 3.207867383956909, + "learning_rate": 1.1557135213286303e-05, + "loss": 0.3742, + "step": 1968 + }, + { + "epoch": 0.9340607210626186, + "grad_norm": 2.253946542739868, + "learning_rate": 1.1549545436001888e-05, + "loss": 0.3292, + "step": 1969 + }, + { + "epoch": 0.9345351043643264, + "grad_norm": 2.34022855758667, + "learning_rate": 1.1541954744036131e-05, + "loss": 0.388, + "step": 1970 + }, + { + "epoch": 0.9350094876660342, + "grad_norm": 2.2387092113494873, + "learning_rate": 1.153436314186975e-05, + "loss": 0.344, + "step": 1971 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 2.4496212005615234, + "learning_rate": 1.152677063398399e-05, + "loss": 0.3724, + "step": 1972 + }, + { + "epoch": 0.9359582542694497, + "grad_norm": 2.5698015689849854, + "learning_rate": 1.1519177224860632e-05, + "loss": 0.3821, + "step": 1973 + }, + { + "epoch": 0.9364326375711575, + "grad_norm": 1.8094062805175781, + "learning_rate": 1.151158291898199e-05, + "loss": 0.3154, + "step": 1974 + }, + { + "epoch": 0.9369070208728653, + "grad_norm": 2.1138343811035156, + "learning_rate": 1.1503987720830908e-05, + "loss": 0.347, + "step": 1975 + }, + { + "epoch": 0.937381404174573, + "grad_norm": 2.825183153152466, + "learning_rate": 1.1496391634890758e-05, + "loss": 0.4074, + "step": 1976 + }, + { + "epoch": 0.9378557874762808, + "grad_norm": 2.231473922729492, + "learning_rate": 1.1488794665645434e-05, + "loss": 0.3262, + "step": 1977 + }, + { + "epoch": 0.9383301707779886, + "grad_norm": 2.0554375648498535, + "learning_rate": 1.1481196817579352e-05, + "loss": 0.2908, + "step": 1978 + }, + { + "epoch": 0.9388045540796964, + "grad_norm": 2.4389944076538086, + "learning_rate": 1.1473598095177443e-05, + "loss": 0.2931, + "step": 1979 + }, + { + "epoch": 0.9392789373814042, + "grad_norm": 3.037283420562744, + "learning_rate": 1.1465998502925161e-05, + "loss": 0.4337, + "step": 1980 + }, + { + "epoch": 0.9397533206831119, + "grad_norm": 2.06538987159729, + "learning_rate": 1.1458398045308471e-05, + "loss": 0.2837, + "step": 1981 + }, + { + "epoch": 0.9402277039848197, + "grad_norm": 2.118652820587158, + "learning_rate": 1.1450796726813844e-05, + "loss": 0.3299, + "step": 1982 + }, + { + "epoch": 0.9407020872865275, + "grad_norm": 2.2635416984558105, + "learning_rate": 1.1443194551928267e-05, + "loss": 0.3512, + "step": 1983 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 2.088435173034668, + "learning_rate": 1.1435591525139228e-05, + "loss": 0.3404, + "step": 1984 + }, + { + "epoch": 0.9416508538899431, + "grad_norm": 2.3585917949676514, + "learning_rate": 1.1427987650934717e-05, + "loss": 0.3526, + "step": 1985 + }, + { + "epoch": 0.9421252371916509, + "grad_norm": 1.858132243156433, + "learning_rate": 1.1420382933803225e-05, + "loss": 0.2791, + "step": 1986 + }, + { + "epoch": 0.9425996204933587, + "grad_norm": 2.3248212337493896, + "learning_rate": 1.1412777378233742e-05, + "loss": 0.3954, + "step": 1987 + }, + { + "epoch": 0.9430740037950665, + "grad_norm": 2.187311887741089, + "learning_rate": 1.1405170988715752e-05, + "loss": 0.3678, + "step": 1988 + }, + { + "epoch": 0.9435483870967742, + "grad_norm": 2.151273488998413, + "learning_rate": 1.1397563769739235e-05, + "loss": 0.3197, + "step": 1989 + }, + { + "epoch": 0.9440227703984819, + "grad_norm": 2.1821095943450928, + "learning_rate": 1.1389955725794655e-05, + "loss": 0.3604, + "step": 1990 + }, + { + "epoch": 0.9444971537001897, + "grad_norm": 2.963883876800537, + "learning_rate": 1.138234686137296e-05, + "loss": 0.3858, + "step": 1991 + }, + { + "epoch": 0.9449715370018975, + "grad_norm": 2.512237310409546, + "learning_rate": 1.1374737180965593e-05, + "loss": 0.3751, + "step": 1992 + }, + { + "epoch": 0.9454459203036053, + "grad_norm": 2.063708543777466, + "learning_rate": 1.1367126689064473e-05, + "loss": 0.315, + "step": 1993 + }, + { + "epoch": 0.9459203036053131, + "grad_norm": 2.0706629753112793, + "learning_rate": 1.1359515390161996e-05, + "loss": 0.3446, + "step": 1994 + }, + { + "epoch": 0.9463946869070209, + "grad_norm": 2.1223249435424805, + "learning_rate": 1.1351903288751038e-05, + "loss": 0.3453, + "step": 1995 + }, + { + "epoch": 0.9468690702087287, + "grad_norm": 2.1654293537139893, + "learning_rate": 1.1344290389324949e-05, + "loss": 0.2958, + "step": 1996 + }, + { + "epoch": 0.9473434535104365, + "grad_norm": 2.032048225402832, + "learning_rate": 1.1336676696377542e-05, + "loss": 0.3034, + "step": 1997 + }, + { + "epoch": 0.9478178368121443, + "grad_norm": 2.133802890777588, + "learning_rate": 1.1329062214403106e-05, + "loss": 0.3172, + "step": 1998 + }, + { + "epoch": 0.948292220113852, + "grad_norm": 2.323798656463623, + "learning_rate": 1.13214469478964e-05, + "loss": 0.3271, + "step": 1999 + }, + { + "epoch": 0.9487666034155597, + "grad_norm": 2.347034454345703, + "learning_rate": 1.1313830901352634e-05, + "loss": 0.3382, + "step": 2000 + }, + { + "epoch": 0.9492409867172675, + "grad_norm": 2.104706048965454, + "learning_rate": 1.130621407926749e-05, + "loss": 0.3714, + "step": 2001 + }, + { + "epoch": 0.9497153700189753, + "grad_norm": 2.216869354248047, + "learning_rate": 1.12985964861371e-05, + "loss": 0.3052, + "step": 2002 + }, + { + "epoch": 0.9501897533206831, + "grad_norm": 2.0643813610076904, + "learning_rate": 1.1290978126458054e-05, + "loss": 0.2865, + "step": 2003 + }, + { + "epoch": 0.9506641366223909, + "grad_norm": 1.9597926139831543, + "learning_rate": 1.1283359004727397e-05, + "loss": 0.3136, + "step": 2004 + }, + { + "epoch": 0.9511385199240987, + "grad_norm": 2.2669873237609863, + "learning_rate": 1.1275739125442618e-05, + "loss": 0.3544, + "step": 2005 + }, + { + "epoch": 0.9516129032258065, + "grad_norm": 1.7317811250686646, + "learning_rate": 1.1268118493101654e-05, + "loss": 0.2356, + "step": 2006 + }, + { + "epoch": 0.9520872865275142, + "grad_norm": 2.301215887069702, + "learning_rate": 1.1260497112202895e-05, + "loss": 0.3188, + "step": 2007 + }, + { + "epoch": 0.952561669829222, + "grad_norm": 1.8703101873397827, + "learning_rate": 1.1252874987245163e-05, + "loss": 0.2773, + "step": 2008 + }, + { + "epoch": 0.9530360531309298, + "grad_norm": 2.490936517715454, + "learning_rate": 1.1245252122727726e-05, + "loss": 0.363, + "step": 2009 + }, + { + "epoch": 0.9535104364326376, + "grad_norm": 2.491692066192627, + "learning_rate": 1.123762852315028e-05, + "loss": 0.3159, + "step": 2010 + }, + { + "epoch": 0.9539848197343453, + "grad_norm": 2.416424036026001, + "learning_rate": 1.1230004193012964e-05, + "loss": 0.3757, + "step": 2011 + }, + { + "epoch": 0.9544592030360531, + "grad_norm": 2.583618402481079, + "learning_rate": 1.1222379136816347e-05, + "loss": 0.3097, + "step": 2012 + }, + { + "epoch": 0.9549335863377609, + "grad_norm": 2.5698044300079346, + "learning_rate": 1.1214753359061418e-05, + "loss": 0.3456, + "step": 2013 + }, + { + "epoch": 0.9554079696394687, + "grad_norm": 2.5504226684570312, + "learning_rate": 1.1207126864249604e-05, + "loss": 0.3817, + "step": 2014 + }, + { + "epoch": 0.9558823529411765, + "grad_norm": 2.0663864612579346, + "learning_rate": 1.1199499656882747e-05, + "loss": 0.2627, + "step": 2015 + }, + { + "epoch": 0.9563567362428842, + "grad_norm": 2.0620248317718506, + "learning_rate": 1.1191871741463112e-05, + "loss": 0.3134, + "step": 2016 + }, + { + "epoch": 0.956831119544592, + "grad_norm": 2.855698585510254, + "learning_rate": 1.1184243122493381e-05, + "loss": 0.3152, + "step": 2017 + }, + { + "epoch": 0.9573055028462998, + "grad_norm": 1.9541351795196533, + "learning_rate": 1.1176613804476655e-05, + "loss": 0.3101, + "step": 2018 + }, + { + "epoch": 0.9577798861480076, + "grad_norm": 2.0520036220550537, + "learning_rate": 1.1168983791916442e-05, + "loss": 0.2803, + "step": 2019 + }, + { + "epoch": 0.9582542694497154, + "grad_norm": 2.2084736824035645, + "learning_rate": 1.1161353089316664e-05, + "loss": 0.3272, + "step": 2020 + }, + { + "epoch": 0.9587286527514232, + "grad_norm": 2.3713338375091553, + "learning_rate": 1.115372170118165e-05, + "loss": 0.3432, + "step": 2021 + }, + { + "epoch": 0.959203036053131, + "grad_norm": 2.585407257080078, + "learning_rate": 1.1146089632016132e-05, + "loss": 0.4149, + "step": 2022 + }, + { + "epoch": 0.9596774193548387, + "grad_norm": 2.2623908519744873, + "learning_rate": 1.1138456886325247e-05, + "loss": 0.3292, + "step": 2023 + }, + { + "epoch": 0.9601518026565465, + "grad_norm": 1.627692461013794, + "learning_rate": 1.1130823468614525e-05, + "loss": 0.2295, + "step": 2024 + }, + { + "epoch": 0.9606261859582542, + "grad_norm": 2.19572114944458, + "learning_rate": 1.1123189383389901e-05, + "loss": 0.3843, + "step": 2025 + }, + { + "epoch": 0.961100569259962, + "grad_norm": 2.066941499710083, + "learning_rate": 1.1115554635157698e-05, + "loss": 0.3033, + "step": 2026 + }, + { + "epoch": 0.9615749525616698, + "grad_norm": 2.1408028602600098, + "learning_rate": 1.1107919228424631e-05, + "loss": 0.3627, + "step": 2027 + }, + { + "epoch": 0.9620493358633776, + "grad_norm": 2.3943800926208496, + "learning_rate": 1.110028316769781e-05, + "loss": 0.3059, + "step": 2028 + }, + { + "epoch": 0.9625237191650854, + "grad_norm": 2.0022356510162354, + "learning_rate": 1.1092646457484721e-05, + "loss": 0.3247, + "step": 2029 + }, + { + "epoch": 0.9629981024667932, + "grad_norm": 2.043557643890381, + "learning_rate": 1.1085009102293243e-05, + "loss": 0.2876, + "step": 2030 + }, + { + "epoch": 0.963472485768501, + "grad_norm": 2.535658836364746, + "learning_rate": 1.1077371106631625e-05, + "loss": 0.3627, + "step": 2031 + }, + { + "epoch": 0.9639468690702088, + "grad_norm": 1.851802945137024, + "learning_rate": 1.1069732475008504e-05, + "loss": 0.2793, + "step": 2032 + }, + { + "epoch": 0.9644212523719166, + "grad_norm": 2.064366340637207, + "learning_rate": 1.1062093211932891e-05, + "loss": 0.3435, + "step": 2033 + }, + { + "epoch": 0.9648956356736242, + "grad_norm": 1.589677333831787, + "learning_rate": 1.1054453321914162e-05, + "loss": 0.2602, + "step": 2034 + }, + { + "epoch": 0.965370018975332, + "grad_norm": 2.559872627258301, + "learning_rate": 1.1046812809462073e-05, + "loss": 0.3856, + "step": 2035 + }, + { + "epoch": 0.9658444022770398, + "grad_norm": 2.204385995864868, + "learning_rate": 1.1039171679086739e-05, + "loss": 0.336, + "step": 2036 + }, + { + "epoch": 0.9663187855787476, + "grad_norm": 2.258500337600708, + "learning_rate": 1.1031529935298651e-05, + "loss": 0.3388, + "step": 2037 + }, + { + "epoch": 0.9667931688804554, + "grad_norm": 2.894205331802368, + "learning_rate": 1.1023887582608645e-05, + "loss": 0.368, + "step": 2038 + }, + { + "epoch": 0.9672675521821632, + "grad_norm": 3.6994946002960205, + "learning_rate": 1.1016244625527933e-05, + "loss": 0.3644, + "step": 2039 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 2.2392802238464355, + "learning_rate": 1.1008601068568074e-05, + "loss": 0.3416, + "step": 2040 + }, + { + "epoch": 0.9682163187855788, + "grad_norm": 2.4265706539154053, + "learning_rate": 1.1000956916240985e-05, + "loss": 0.3332, + "step": 2041 + }, + { + "epoch": 0.9686907020872866, + "grad_norm": 1.7920218706130981, + "learning_rate": 1.0993312173058934e-05, + "loss": 0.2237, + "step": 2042 + }, + { + "epoch": 0.9691650853889943, + "grad_norm": 2.4036221504211426, + "learning_rate": 1.0985666843534534e-05, + "loss": 0.3508, + "step": 2043 + }, + { + "epoch": 0.969639468690702, + "grad_norm": 2.5761871337890625, + "learning_rate": 1.097802093218075e-05, + "loss": 0.3949, + "step": 2044 + }, + { + "epoch": 0.9701138519924098, + "grad_norm": 2.6531662940979004, + "learning_rate": 1.0970374443510891e-05, + "loss": 0.3703, + "step": 2045 + }, + { + "epoch": 0.9705882352941176, + "grad_norm": 1.9129854440689087, + "learning_rate": 1.0962727382038598e-05, + "loss": 0.2939, + "step": 2046 + }, + { + "epoch": 0.9710626185958254, + "grad_norm": 2.6273584365844727, + "learning_rate": 1.0955079752277859e-05, + "loss": 0.418, + "step": 2047 + }, + { + "epoch": 0.9715370018975332, + "grad_norm": 2.4543769359588623, + "learning_rate": 1.0947431558742991e-05, + "loss": 0.2984, + "step": 2048 + }, + { + "epoch": 0.972011385199241, + "grad_norm": 2.275509834289551, + "learning_rate": 1.0939782805948653e-05, + "loss": 0.3299, + "step": 2049 + }, + { + "epoch": 0.9724857685009488, + "grad_norm": 2.039682626724243, + "learning_rate": 1.0932133498409821e-05, + "loss": 0.3081, + "step": 2050 + }, + { + "epoch": 0.9729601518026565, + "grad_norm": 2.2457706928253174, + "learning_rate": 1.0924483640641808e-05, + "loss": 0.3862, + "step": 2051 + }, + { + "epoch": 0.9734345351043643, + "grad_norm": 2.2409019470214844, + "learning_rate": 1.0916833237160252e-05, + "loss": 0.3165, + "step": 2052 + }, + { + "epoch": 0.9739089184060721, + "grad_norm": 1.9504499435424805, + "learning_rate": 1.0909182292481108e-05, + "loss": 0.3161, + "step": 2053 + }, + { + "epoch": 0.9743833017077799, + "grad_norm": 2.0921568870544434, + "learning_rate": 1.0901530811120655e-05, + "loss": 0.3617, + "step": 2054 + }, + { + "epoch": 0.9748576850094877, + "grad_norm": 1.8210068941116333, + "learning_rate": 1.0893878797595484e-05, + "loss": 0.2962, + "step": 2055 + }, + { + "epoch": 0.9753320683111955, + "grad_norm": 1.7719886302947998, + "learning_rate": 1.0886226256422501e-05, + "loss": 0.2733, + "step": 2056 + }, + { + "epoch": 0.9758064516129032, + "grad_norm": 2.5448601245880127, + "learning_rate": 1.0878573192118933e-05, + "loss": 0.2718, + "step": 2057 + }, + { + "epoch": 0.976280834914611, + "grad_norm": 1.8214176893234253, + "learning_rate": 1.0870919609202301e-05, + "loss": 0.2664, + "step": 2058 + }, + { + "epoch": 0.9767552182163188, + "grad_norm": 2.0862033367156982, + "learning_rate": 1.0863265512190444e-05, + "loss": 0.3164, + "step": 2059 + }, + { + "epoch": 0.9772296015180265, + "grad_norm": 1.6361185312271118, + "learning_rate": 1.0855610905601495e-05, + "loss": 0.217, + "step": 2060 + }, + { + "epoch": 0.9777039848197343, + "grad_norm": 2.2029919624328613, + "learning_rate": 1.0847955793953899e-05, + "loss": 0.3134, + "step": 2061 + }, + { + "epoch": 0.9781783681214421, + "grad_norm": 2.551486015319824, + "learning_rate": 1.0840300181766383e-05, + "loss": 0.4083, + "step": 2062 + }, + { + "epoch": 0.9786527514231499, + "grad_norm": 2.5572237968444824, + "learning_rate": 1.0832644073557987e-05, + "loss": 0.4408, + "step": 2063 + }, + { + "epoch": 0.9791271347248577, + "grad_norm": 2.1839964389801025, + "learning_rate": 1.0824987473848037e-05, + "loss": 0.362, + "step": 2064 + }, + { + "epoch": 0.9796015180265655, + "grad_norm": 2.5323362350463867, + "learning_rate": 1.0817330387156142e-05, + "loss": 0.3513, + "step": 2065 + }, + { + "epoch": 0.9800759013282733, + "grad_norm": 2.2569549083709717, + "learning_rate": 1.0809672818002209e-05, + "loss": 0.3858, + "step": 2066 + }, + { + "epoch": 0.9805502846299811, + "grad_norm": 2.198728561401367, + "learning_rate": 1.080201477090642e-05, + "loss": 0.36, + "step": 2067 + }, + { + "epoch": 0.9810246679316889, + "grad_norm": 2.281818389892578, + "learning_rate": 1.079435625038925e-05, + "loss": 0.3084, + "step": 2068 + }, + { + "epoch": 0.9814990512333965, + "grad_norm": 2.4596810340881348, + "learning_rate": 1.0786697260971449e-05, + "loss": 0.3617, + "step": 2069 + }, + { + "epoch": 0.9819734345351043, + "grad_norm": 2.167909622192383, + "learning_rate": 1.0779037807174032e-05, + "loss": 0.3021, + "step": 2070 + }, + { + "epoch": 0.9824478178368121, + "grad_norm": 1.8683514595031738, + "learning_rate": 1.0771377893518314e-05, + "loss": 0.2457, + "step": 2071 + }, + { + "epoch": 0.9829222011385199, + "grad_norm": 2.5630767345428467, + "learning_rate": 1.0763717524525854e-05, + "loss": 0.3483, + "step": 2072 + }, + { + "epoch": 0.9833965844402277, + "grad_norm": 2.544394016265869, + "learning_rate": 1.0756056704718498e-05, + "loss": 0.3901, + "step": 2073 + }, + { + "epoch": 0.9838709677419355, + "grad_norm": 2.037989377975464, + "learning_rate": 1.0748395438618353e-05, + "loss": 0.3368, + "step": 2074 + }, + { + "epoch": 0.9843453510436433, + "grad_norm": 1.889345407485962, + "learning_rate": 1.074073373074778e-05, + "loss": 0.2303, + "step": 2075 + }, + { + "epoch": 0.9848197343453511, + "grad_norm": 2.1819489002227783, + "learning_rate": 1.0733071585629423e-05, + "loss": 0.31, + "step": 2076 + }, + { + "epoch": 0.9852941176470589, + "grad_norm": 2.640437126159668, + "learning_rate": 1.0725409007786161e-05, + "loss": 0.4647, + "step": 2077 + }, + { + "epoch": 0.9857685009487666, + "grad_norm": 2.5758280754089355, + "learning_rate": 1.0717746001741139e-05, + "loss": 0.3326, + "step": 2078 + }, + { + "epoch": 0.9862428842504743, + "grad_norm": 1.8008846044540405, + "learning_rate": 1.0710082572017753e-05, + "loss": 0.2491, + "step": 2079 + }, + { + "epoch": 0.9867172675521821, + "grad_norm": 2.1113157272338867, + "learning_rate": 1.0702418723139654e-05, + "loss": 0.3475, + "step": 2080 + }, + { + "epoch": 0.9871916508538899, + "grad_norm": 2.860546827316284, + "learning_rate": 1.0694754459630732e-05, + "loss": 0.3269, + "step": 2081 + }, + { + "epoch": 0.9876660341555977, + "grad_norm": 2.7859246730804443, + "learning_rate": 1.0687089786015126e-05, + "loss": 0.3732, + "step": 2082 + }, + { + "epoch": 0.9881404174573055, + "grad_norm": 2.2017409801483154, + "learning_rate": 1.0679424706817221e-05, + "loss": 0.3555, + "step": 2083 + }, + { + "epoch": 0.9886148007590133, + "grad_norm": 1.9931174516677856, + "learning_rate": 1.0671759226561631e-05, + "loss": 0.3141, + "step": 2084 + }, + { + "epoch": 0.9890891840607211, + "grad_norm": 2.228123664855957, + "learning_rate": 1.0664093349773222e-05, + "loss": 0.3647, + "step": 2085 + }, + { + "epoch": 0.9895635673624289, + "grad_norm": 2.191070795059204, + "learning_rate": 1.0656427080977077e-05, + "loss": 0.3575, + "step": 2086 + }, + { + "epoch": 0.9900379506641366, + "grad_norm": 2.1509921550750732, + "learning_rate": 1.0648760424698521e-05, + "loss": 0.3169, + "step": 2087 + }, + { + "epoch": 0.9905123339658444, + "grad_norm": 1.9772475957870483, + "learning_rate": 1.0641093385463108e-05, + "loss": 0.3271, + "step": 2088 + }, + { + "epoch": 0.9909867172675522, + "grad_norm": 2.0178329944610596, + "learning_rate": 1.0633425967796614e-05, + "loss": 0.3091, + "step": 2089 + }, + { + "epoch": 0.99146110056926, + "grad_norm": 3.558840751647949, + "learning_rate": 1.0625758176225038e-05, + "loss": 0.3339, + "step": 2090 + }, + { + "epoch": 0.9919354838709677, + "grad_norm": 2.146780014038086, + "learning_rate": 1.0618090015274603e-05, + "loss": 0.298, + "step": 2091 + }, + { + "epoch": 0.9924098671726755, + "grad_norm": 2.24959659576416, + "learning_rate": 1.0610421489471748e-05, + "loss": 0.383, + "step": 2092 + }, + { + "epoch": 0.9928842504743833, + "grad_norm": 2.0573537349700928, + "learning_rate": 1.0602752603343127e-05, + "loss": 0.3116, + "step": 2093 + }, + { + "epoch": 0.9933586337760911, + "grad_norm": 2.7625508308410645, + "learning_rate": 1.059508336141561e-05, + "loss": 0.3006, + "step": 2094 + }, + { + "epoch": 0.9938330170777988, + "grad_norm": 2.205559492111206, + "learning_rate": 1.0587413768216273e-05, + "loss": 0.3381, + "step": 2095 + }, + { + "epoch": 0.9943074003795066, + "grad_norm": 2.6509761810302734, + "learning_rate": 1.05797438282724e-05, + "loss": 0.3785, + "step": 2096 + }, + { + "epoch": 0.9947817836812144, + "grad_norm": 2.5124902725219727, + "learning_rate": 1.0572073546111485e-05, + "loss": 0.3773, + "step": 2097 + }, + { + "epoch": 0.9952561669829222, + "grad_norm": 2.2544424533843994, + "learning_rate": 1.0564402926261216e-05, + "loss": 0.3657, + "step": 2098 + }, + { + "epoch": 0.99573055028463, + "grad_norm": 1.9105042219161987, + "learning_rate": 1.0556731973249486e-05, + "loss": 0.2686, + "step": 2099 + }, + { + "epoch": 0.9962049335863378, + "grad_norm": 1.9611090421676636, + "learning_rate": 1.0549060691604376e-05, + "loss": 0.3119, + "step": 2100 + }, + { + "epoch": 0.9966793168880456, + "grad_norm": 2.110646963119507, + "learning_rate": 1.0541389085854177e-05, + "loss": 0.3229, + "step": 2101 + }, + { + "epoch": 0.9971537001897534, + "grad_norm": 2.017954111099243, + "learning_rate": 1.0533717160527357e-05, + "loss": 0.2829, + "step": 2102 + }, + { + "epoch": 0.9976280834914611, + "grad_norm": 1.9769190549850464, + "learning_rate": 1.0526044920152578e-05, + "loss": 0.2776, + "step": 2103 + }, + { + "epoch": 0.9981024667931688, + "grad_norm": 2.1241579055786133, + "learning_rate": 1.051837236925869e-05, + "loss": 0.3468, + "step": 2104 + }, + { + "epoch": 0.9985768500948766, + "grad_norm": 2.512434720993042, + "learning_rate": 1.051069951237472e-05, + "loss": 0.2764, + "step": 2105 + }, + { + "epoch": 0.9990512333965844, + "grad_norm": 2.3992347717285156, + "learning_rate": 1.0503026354029882e-05, + "loss": 0.3268, + "step": 2106 + }, + { + "epoch": 0.9995256166982922, + "grad_norm": 2.2171542644500732, + "learning_rate": 1.0495352898753563e-05, + "loss": 0.329, + "step": 2107 + }, + { + "epoch": 1.0, + "grad_norm": 1.9980618953704834, + "learning_rate": 1.0487679151075332e-05, + "loss": 0.3509, + "step": 2108 + }, + { + "epoch": 1.0004743833017078, + "grad_norm": 2.0853278636932373, + "learning_rate": 1.0480005115524923e-05, + "loss": 0.2764, + "step": 2109 + }, + { + "epoch": 1.0009487666034156, + "grad_norm": 1.904092788696289, + "learning_rate": 1.0472330796632244e-05, + "loss": 0.26, + "step": 2110 + }, + { + "epoch": 1.0014231499051234, + "grad_norm": 2.1483476161956787, + "learning_rate": 1.0464656198927373e-05, + "loss": 0.2206, + "step": 2111 + }, + { + "epoch": 1.0018975332068312, + "grad_norm": 1.696155309677124, + "learning_rate": 1.0456981326940541e-05, + "loss": 0.1763, + "step": 2112 + }, + { + "epoch": 1.002371916508539, + "grad_norm": 1.9953058958053589, + "learning_rate": 1.0449306185202155e-05, + "loss": 0.1922, + "step": 2113 + }, + { + "epoch": 1.0028462998102468, + "grad_norm": 1.8557651042938232, + "learning_rate": 1.0441630778242775e-05, + "loss": 0.2421, + "step": 2114 + }, + { + "epoch": 1.0033206831119545, + "grad_norm": 1.7121390104293823, + "learning_rate": 1.0433955110593115e-05, + "loss": 0.2016, + "step": 2115 + }, + { + "epoch": 1.0037950664136623, + "grad_norm": 1.799052357673645, + "learning_rate": 1.042627918678405e-05, + "loss": 0.1668, + "step": 2116 + }, + { + "epoch": 1.00426944971537, + "grad_norm": 1.8632630109786987, + "learning_rate": 1.04186030113466e-05, + "loss": 0.2101, + "step": 2117 + }, + { + "epoch": 1.0047438330170777, + "grad_norm": 2.4004900455474854, + "learning_rate": 1.0410926588811931e-05, + "loss": 0.2088, + "step": 2118 + }, + { + "epoch": 1.0052182163187855, + "grad_norm": 1.4884165525436401, + "learning_rate": 1.0403249923711365e-05, + "loss": 0.1716, + "step": 2119 + }, + { + "epoch": 1.0056925996204933, + "grad_norm": 1.6962333917617798, + "learning_rate": 1.0395573020576357e-05, + "loss": 0.1693, + "step": 2120 + }, + { + "epoch": 1.006166982922201, + "grad_norm": 1.9333925247192383, + "learning_rate": 1.038789588393851e-05, + "loss": 0.1512, + "step": 2121 + }, + { + "epoch": 1.0066413662239089, + "grad_norm": 2.6267874240875244, + "learning_rate": 1.0380218518329564e-05, + "loss": 0.2462, + "step": 2122 + }, + { + "epoch": 1.0071157495256167, + "grad_norm": 1.9119246006011963, + "learning_rate": 1.037254092828139e-05, + "loss": 0.1687, + "step": 2123 + }, + { + "epoch": 1.0075901328273245, + "grad_norm": 2.043225049972534, + "learning_rate": 1.0364863118325988e-05, + "loss": 0.225, + "step": 2124 + }, + { + "epoch": 1.0080645161290323, + "grad_norm": 2.4549620151519775, + "learning_rate": 1.0357185092995499e-05, + "loss": 0.2173, + "step": 2125 + }, + { + "epoch": 1.00853889943074, + "grad_norm": 1.932741403579712, + "learning_rate": 1.0349506856822184e-05, + "loss": 0.2033, + "step": 2126 + }, + { + "epoch": 1.0090132827324478, + "grad_norm": 2.0784912109375, + "learning_rate": 1.0341828414338431e-05, + "loss": 0.1974, + "step": 2127 + }, + { + "epoch": 1.0094876660341556, + "grad_norm": 1.9300061464309692, + "learning_rate": 1.0334149770076747e-05, + "loss": 0.1846, + "step": 2128 + }, + { + "epoch": 1.0099620493358634, + "grad_norm": 1.8968406915664673, + "learning_rate": 1.0326470928569758e-05, + "loss": 0.2126, + "step": 2129 + }, + { + "epoch": 1.0104364326375712, + "grad_norm": 2.385936975479126, + "learning_rate": 1.0318791894350217e-05, + "loss": 0.2034, + "step": 2130 + }, + { + "epoch": 1.010910815939279, + "grad_norm": 2.1592063903808594, + "learning_rate": 1.0311112671950969e-05, + "loss": 0.2947, + "step": 2131 + }, + { + "epoch": 1.0113851992409868, + "grad_norm": 1.7248077392578125, + "learning_rate": 1.030343326590499e-05, + "loss": 0.158, + "step": 2132 + }, + { + "epoch": 1.0118595825426946, + "grad_norm": 2.433410167694092, + "learning_rate": 1.0295753680745359e-05, + "loss": 0.226, + "step": 2133 + }, + { + "epoch": 1.0123339658444024, + "grad_norm": 1.5119600296020508, + "learning_rate": 1.0288073921005258e-05, + "loss": 0.1824, + "step": 2134 + }, + { + "epoch": 1.01280834914611, + "grad_norm": 1.8853365182876587, + "learning_rate": 1.028039399121797e-05, + "loss": 0.2013, + "step": 2135 + }, + { + "epoch": 1.0132827324478177, + "grad_norm": 1.7288734912872314, + "learning_rate": 1.0272713895916884e-05, + "loss": 0.1868, + "step": 2136 + }, + { + "epoch": 1.0137571157495255, + "grad_norm": 1.6384727954864502, + "learning_rate": 1.0265033639635483e-05, + "loss": 0.178, + "step": 2137 + }, + { + "epoch": 1.0142314990512333, + "grad_norm": 1.747512936592102, + "learning_rate": 1.0257353226907349e-05, + "loss": 0.1839, + "step": 2138 + }, + { + "epoch": 1.0147058823529411, + "grad_norm": 2.3721799850463867, + "learning_rate": 1.0249672662266148e-05, + "loss": 0.1944, + "step": 2139 + }, + { + "epoch": 1.015180265654649, + "grad_norm": 2.0248069763183594, + "learning_rate": 1.0241991950245648e-05, + "loss": 0.2256, + "step": 2140 + }, + { + "epoch": 1.0156546489563567, + "grad_norm": 2.1707992553710938, + "learning_rate": 1.0234311095379694e-05, + "loss": 0.1732, + "step": 2141 + }, + { + "epoch": 1.0161290322580645, + "grad_norm": 2.033766746520996, + "learning_rate": 1.0226630102202221e-05, + "loss": 0.1923, + "step": 2142 + }, + { + "epoch": 1.0166034155597723, + "grad_norm": 1.76777184009552, + "learning_rate": 1.0218948975247238e-05, + "loss": 0.1628, + "step": 2143 + }, + { + "epoch": 1.01707779886148, + "grad_norm": 1.886703372001648, + "learning_rate": 1.021126771904884e-05, + "loss": 0.1511, + "step": 2144 + }, + { + "epoch": 1.0175521821631879, + "grad_norm": 1.8403862714767456, + "learning_rate": 1.0203586338141202e-05, + "loss": 0.1826, + "step": 2145 + }, + { + "epoch": 1.0180265654648957, + "grad_norm": 2.151848793029785, + "learning_rate": 1.0195904837058563e-05, + "loss": 0.2328, + "step": 2146 + }, + { + "epoch": 1.0185009487666035, + "grad_norm": 1.8675678968429565, + "learning_rate": 1.0188223220335238e-05, + "loss": 0.1969, + "step": 2147 + }, + { + "epoch": 1.0189753320683113, + "grad_norm": 2.2719991207122803, + "learning_rate": 1.0180541492505605e-05, + "loss": 0.2024, + "step": 2148 + }, + { + "epoch": 1.019449715370019, + "grad_norm": 3.2726147174835205, + "learning_rate": 1.0172859658104117e-05, + "loss": 0.2026, + "step": 2149 + }, + { + "epoch": 1.0199240986717268, + "grad_norm": 1.7774338722229004, + "learning_rate": 1.0165177721665284e-05, + "loss": 0.1555, + "step": 2150 + }, + { + "epoch": 1.0203984819734346, + "grad_norm": 1.7717763185501099, + "learning_rate": 1.0157495687723675e-05, + "loss": 0.1982, + "step": 2151 + }, + { + "epoch": 1.0208728652751424, + "grad_norm": 1.9706634283065796, + "learning_rate": 1.0149813560813924e-05, + "loss": 0.1834, + "step": 2152 + }, + { + "epoch": 1.02134724857685, + "grad_norm": 1.683834433555603, + "learning_rate": 1.0142131345470704e-05, + "loss": 0.1814, + "step": 2153 + }, + { + "epoch": 1.0218216318785578, + "grad_norm": 1.992972731590271, + "learning_rate": 1.0134449046228764e-05, + "loss": 0.2124, + "step": 2154 + }, + { + "epoch": 1.0222960151802656, + "grad_norm": 1.861265778541565, + "learning_rate": 1.0126766667622878e-05, + "loss": 0.2043, + "step": 2155 + }, + { + "epoch": 1.0227703984819734, + "grad_norm": 2.0793933868408203, + "learning_rate": 1.0119084214187882e-05, + "loss": 0.2147, + "step": 2156 + }, + { + "epoch": 1.0232447817836812, + "grad_norm": 1.6746715307235718, + "learning_rate": 1.0111401690458655e-05, + "loss": 0.2066, + "step": 2157 + }, + { + "epoch": 1.023719165085389, + "grad_norm": 1.7119837999343872, + "learning_rate": 1.0103719100970115e-05, + "loss": 0.161, + "step": 2158 + }, + { + "epoch": 1.0241935483870968, + "grad_norm": 2.014586925506592, + "learning_rate": 1.0096036450257214e-05, + "loss": 0.1854, + "step": 2159 + }, + { + "epoch": 1.0246679316888045, + "grad_norm": 1.4165558815002441, + "learning_rate": 1.0088353742854943e-05, + "loss": 0.1476, + "step": 2160 + }, + { + "epoch": 1.0251423149905123, + "grad_norm": 1.4397838115692139, + "learning_rate": 1.0080670983298335e-05, + "loss": 0.1388, + "step": 2161 + }, + { + "epoch": 1.0256166982922201, + "grad_norm": 2.0614216327667236, + "learning_rate": 1.0072988176122445e-05, + "loss": 0.2106, + "step": 2162 + }, + { + "epoch": 1.026091081593928, + "grad_norm": 2.1988351345062256, + "learning_rate": 1.0065305325862354e-05, + "loss": 0.1489, + "step": 2163 + }, + { + "epoch": 1.0265654648956357, + "grad_norm": 1.5647263526916504, + "learning_rate": 1.0057622437053178e-05, + "loss": 0.1348, + "step": 2164 + }, + { + "epoch": 1.0270398481973435, + "grad_norm": 1.8337029218673706, + "learning_rate": 1.0049939514230045e-05, + "loss": 0.1559, + "step": 2165 + }, + { + "epoch": 1.0275142314990513, + "grad_norm": 1.588962435722351, + "learning_rate": 1.0042256561928115e-05, + "loss": 0.1418, + "step": 2166 + }, + { + "epoch": 1.027988614800759, + "grad_norm": 1.8072073459625244, + "learning_rate": 1.003457358468255e-05, + "loss": 0.1858, + "step": 2167 + }, + { + "epoch": 1.0284629981024669, + "grad_norm": 2.0580496788024902, + "learning_rate": 1.0026890587028544e-05, + "loss": 0.2082, + "step": 2168 + }, + { + "epoch": 1.0289373814041747, + "grad_norm": 2.101830244064331, + "learning_rate": 1.0019207573501287e-05, + "loss": 0.2083, + "step": 2169 + }, + { + "epoch": 1.0294117647058822, + "grad_norm": 1.8399953842163086, + "learning_rate": 1.001152454863599e-05, + "loss": 0.1713, + "step": 2170 + }, + { + "epoch": 1.02988614800759, + "grad_norm": 1.6996338367462158, + "learning_rate": 1.000384151696787e-05, + "loss": 0.1551, + "step": 2171 + }, + { + "epoch": 1.0303605313092978, + "grad_norm": 1.8706146478652954, + "learning_rate": 9.996158483032137e-06, + "loss": 0.2203, + "step": 2172 + }, + { + "epoch": 1.0308349146110056, + "grad_norm": 2.099688768386841, + "learning_rate": 9.988475451364013e-06, + "loss": 0.2117, + "step": 2173 + }, + { + "epoch": 1.0313092979127134, + "grad_norm": 2.176435947418213, + "learning_rate": 9.980792426498717e-06, + "loss": 0.2196, + "step": 2174 + }, + { + "epoch": 1.0317836812144212, + "grad_norm": 2.1011524200439453, + "learning_rate": 9.97310941297146e-06, + "loss": 0.1895, + "step": 2175 + }, + { + "epoch": 1.032258064516129, + "grad_norm": 1.816577434539795, + "learning_rate": 9.965426415317451e-06, + "loss": 0.1829, + "step": 2176 + }, + { + "epoch": 1.0327324478178368, + "grad_norm": 2.274826765060425, + "learning_rate": 9.95774343807189e-06, + "loss": 0.167, + "step": 2177 + }, + { + "epoch": 1.0332068311195446, + "grad_norm": 1.8276028633117676, + "learning_rate": 9.950060485769958e-06, + "loss": 0.1813, + "step": 2178 + }, + { + "epoch": 1.0336812144212524, + "grad_norm": 1.7824639081954956, + "learning_rate": 9.942377562946825e-06, + "loss": 0.1577, + "step": 2179 + }, + { + "epoch": 1.0341555977229602, + "grad_norm": 1.696703314781189, + "learning_rate": 9.934694674137648e-06, + "loss": 0.1729, + "step": 2180 + }, + { + "epoch": 1.034629981024668, + "grad_norm": 1.851335048675537, + "learning_rate": 9.927011823877559e-06, + "loss": 0.2252, + "step": 2181 + }, + { + "epoch": 1.0351043643263758, + "grad_norm": 1.9400562047958374, + "learning_rate": 9.919329016701668e-06, + "loss": 0.2241, + "step": 2182 + }, + { + "epoch": 1.0355787476280836, + "grad_norm": 1.9708709716796875, + "learning_rate": 9.91164625714506e-06, + "loss": 0.2549, + "step": 2183 + }, + { + "epoch": 1.0360531309297913, + "grad_norm": 1.7907429933547974, + "learning_rate": 9.90396354974279e-06, + "loss": 0.1807, + "step": 2184 + }, + { + "epoch": 1.0365275142314991, + "grad_norm": 1.606693148612976, + "learning_rate": 9.896280899029887e-06, + "loss": 0.1502, + "step": 2185 + }, + { + "epoch": 1.037001897533207, + "grad_norm": 2.3423125743865967, + "learning_rate": 9.888598309541347e-06, + "loss": 0.1645, + "step": 2186 + }, + { + "epoch": 1.0374762808349147, + "grad_norm": 1.9627526998519897, + "learning_rate": 9.88091578581212e-06, + "loss": 0.1701, + "step": 2187 + }, + { + "epoch": 1.0379506641366223, + "grad_norm": 1.9357240200042725, + "learning_rate": 9.873233332377125e-06, + "loss": 0.2047, + "step": 2188 + }, + { + "epoch": 1.03842504743833, + "grad_norm": 2.164787530899048, + "learning_rate": 9.865550953771237e-06, + "loss": 0.2643, + "step": 2189 + }, + { + "epoch": 1.0388994307400379, + "grad_norm": 2.26877498626709, + "learning_rate": 9.857868654529296e-06, + "loss": 0.2058, + "step": 2190 + }, + { + "epoch": 1.0393738140417457, + "grad_norm": 1.9010010957717896, + "learning_rate": 9.850186439186083e-06, + "loss": 0.2138, + "step": 2191 + }, + { + "epoch": 1.0398481973434535, + "grad_norm": 1.56143057346344, + "learning_rate": 9.842504312276326e-06, + "loss": 0.1677, + "step": 2192 + }, + { + "epoch": 1.0403225806451613, + "grad_norm": 1.7175490856170654, + "learning_rate": 9.834822278334718e-06, + "loss": 0.2035, + "step": 2193 + }, + { + "epoch": 1.040796963946869, + "grad_norm": 1.5723693370819092, + "learning_rate": 9.827140341895885e-06, + "loss": 0.1521, + "step": 2194 + }, + { + "epoch": 1.0412713472485768, + "grad_norm": 1.6594966650009155, + "learning_rate": 9.819458507494395e-06, + "loss": 0.2055, + "step": 2195 + }, + { + "epoch": 1.0417457305502846, + "grad_norm": 2.062875747680664, + "learning_rate": 9.811776779664767e-06, + "loss": 0.1937, + "step": 2196 + }, + { + "epoch": 1.0422201138519924, + "grad_norm": 4.114375591278076, + "learning_rate": 9.804095162941439e-06, + "loss": 0.3022, + "step": 2197 + }, + { + "epoch": 1.0426944971537002, + "grad_norm": 2.6715476512908936, + "learning_rate": 9.7964136618588e-06, + "loss": 0.2538, + "step": 2198 + }, + { + "epoch": 1.043168880455408, + "grad_norm": 2.0732991695404053, + "learning_rate": 9.788732280951158e-06, + "loss": 0.2196, + "step": 2199 + }, + { + "epoch": 1.0436432637571158, + "grad_norm": 2.420642375946045, + "learning_rate": 9.781051024752762e-06, + "loss": 0.2222, + "step": 2200 + }, + { + "epoch": 1.0441176470588236, + "grad_norm": 1.8388843536376953, + "learning_rate": 9.773369897797784e-06, + "loss": 0.2209, + "step": 2201 + }, + { + "epoch": 1.0445920303605314, + "grad_norm": 1.8891000747680664, + "learning_rate": 9.76568890462031e-06, + "loss": 0.1758, + "step": 2202 + }, + { + "epoch": 1.0450664136622392, + "grad_norm": 1.7700624465942383, + "learning_rate": 9.758008049754353e-06, + "loss": 0.1815, + "step": 2203 + }, + { + "epoch": 1.045540796963947, + "grad_norm": 2.6104068756103516, + "learning_rate": 9.750327337733852e-06, + "loss": 0.2194, + "step": 2204 + }, + { + "epoch": 1.0460151802656545, + "grad_norm": 1.9325884580612183, + "learning_rate": 9.742646773092656e-06, + "loss": 0.1886, + "step": 2205 + }, + { + "epoch": 1.0464895635673623, + "grad_norm": 2.4706602096557617, + "learning_rate": 9.734966360364519e-06, + "loss": 0.2789, + "step": 2206 + }, + { + "epoch": 1.0469639468690701, + "grad_norm": 2.077040195465088, + "learning_rate": 9.72728610408312e-06, + "loss": 0.2351, + "step": 2207 + }, + { + "epoch": 1.047438330170778, + "grad_norm": 1.9420323371887207, + "learning_rate": 9.719606008782031e-06, + "loss": 0.2085, + "step": 2208 + }, + { + "epoch": 1.0479127134724857, + "grad_norm": 1.9051023721694946, + "learning_rate": 9.711926078994744e-06, + "loss": 0.2164, + "step": 2209 + }, + { + "epoch": 1.0483870967741935, + "grad_norm": 1.6898927688598633, + "learning_rate": 9.704246319254643e-06, + "loss": 0.1834, + "step": 2210 + }, + { + "epoch": 1.0488614800759013, + "grad_norm": 1.6880271434783936, + "learning_rate": 9.696566734095012e-06, + "loss": 0.1787, + "step": 2211 + }, + { + "epoch": 1.049335863377609, + "grad_norm": 1.8380780220031738, + "learning_rate": 9.688887328049034e-06, + "loss": 0.1836, + "step": 2212 + }, + { + "epoch": 1.0498102466793169, + "grad_norm": 1.8687678575515747, + "learning_rate": 9.681208105649786e-06, + "loss": 0.2263, + "step": 2213 + }, + { + "epoch": 1.0502846299810247, + "grad_norm": 2.063650608062744, + "learning_rate": 9.673529071430242e-06, + "loss": 0.1881, + "step": 2214 + }, + { + "epoch": 1.0507590132827325, + "grad_norm": 2.072175979614258, + "learning_rate": 9.665850229923258e-06, + "loss": 0.1931, + "step": 2215 + }, + { + "epoch": 1.0512333965844403, + "grad_norm": 2.0182647705078125, + "learning_rate": 9.658171585661572e-06, + "loss": 0.2125, + "step": 2216 + }, + { + "epoch": 1.051707779886148, + "grad_norm": 1.7542519569396973, + "learning_rate": 9.650493143177817e-06, + "loss": 0.1789, + "step": 2217 + }, + { + "epoch": 1.0521821631878558, + "grad_norm": 1.796041488647461, + "learning_rate": 9.642814907004505e-06, + "loss": 0.1936, + "step": 2218 + }, + { + "epoch": 1.0526565464895636, + "grad_norm": 2.394157648086548, + "learning_rate": 9.635136881674013e-06, + "loss": 0.2391, + "step": 2219 + }, + { + "epoch": 1.0531309297912714, + "grad_norm": 2.1615726947784424, + "learning_rate": 9.627459071718617e-06, + "loss": 0.2023, + "step": 2220 + }, + { + "epoch": 1.0536053130929792, + "grad_norm": 2.2835464477539062, + "learning_rate": 9.619781481670437e-06, + "loss": 0.2261, + "step": 2221 + }, + { + "epoch": 1.054079696394687, + "grad_norm": 1.822164535522461, + "learning_rate": 9.612104116061491e-06, + "loss": 0.2148, + "step": 2222 + }, + { + "epoch": 1.0545540796963946, + "grad_norm": 1.8478764295578003, + "learning_rate": 9.604426979423645e-06, + "loss": 0.1656, + "step": 2223 + }, + { + "epoch": 1.0550284629981024, + "grad_norm": 2.215240001678467, + "learning_rate": 9.596750076288642e-06, + "loss": 0.2309, + "step": 2224 + }, + { + "epoch": 1.0555028462998102, + "grad_norm": 1.7701280117034912, + "learning_rate": 9.589073411188074e-06, + "loss": 0.2403, + "step": 2225 + }, + { + "epoch": 1.055977229601518, + "grad_norm": 1.9893231391906738, + "learning_rate": 9.581396988653404e-06, + "loss": 0.2326, + "step": 2226 + }, + { + "epoch": 1.0564516129032258, + "grad_norm": 1.8492276668548584, + "learning_rate": 9.573720813215954e-06, + "loss": 0.1864, + "step": 2227 + }, + { + "epoch": 1.0569259962049335, + "grad_norm": 1.7757240533828735, + "learning_rate": 9.566044889406885e-06, + "loss": 0.2256, + "step": 2228 + }, + { + "epoch": 1.0574003795066413, + "grad_norm": 1.9123115539550781, + "learning_rate": 9.558369221757229e-06, + "loss": 0.2317, + "step": 2229 + }, + { + "epoch": 1.0578747628083491, + "grad_norm": 2.3356640338897705, + "learning_rate": 9.550693814797847e-06, + "loss": 0.2255, + "step": 2230 + }, + { + "epoch": 1.058349146110057, + "grad_norm": 1.911953330039978, + "learning_rate": 9.543018673059462e-06, + "loss": 0.192, + "step": 2231 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 2.0029706954956055, + "learning_rate": 9.53534380107263e-06, + "loss": 0.2193, + "step": 2232 + }, + { + "epoch": 1.0592979127134725, + "grad_norm": 1.5258057117462158, + "learning_rate": 9.527669203367756e-06, + "loss": 0.1815, + "step": 2233 + }, + { + "epoch": 1.0597722960151803, + "grad_norm": 1.8638172149658203, + "learning_rate": 9.51999488447508e-06, + "loss": 0.1792, + "step": 2234 + }, + { + "epoch": 1.060246679316888, + "grad_norm": 2.022976875305176, + "learning_rate": 9.512320848924672e-06, + "loss": 0.2382, + "step": 2235 + }, + { + "epoch": 1.060721062618596, + "grad_norm": 1.830183744430542, + "learning_rate": 9.504647101246438e-06, + "loss": 0.207, + "step": 2236 + }, + { + "epoch": 1.0611954459203037, + "grad_norm": 1.8549655675888062, + "learning_rate": 9.49697364597012e-06, + "loss": 0.2103, + "step": 2237 + }, + { + "epoch": 1.0616698292220115, + "grad_norm": 2.056288719177246, + "learning_rate": 9.489300487625283e-06, + "loss": 0.2282, + "step": 2238 + }, + { + "epoch": 1.0621442125237193, + "grad_norm": 1.720819354057312, + "learning_rate": 9.481627630741315e-06, + "loss": 0.1979, + "step": 2239 + }, + { + "epoch": 1.0626185958254268, + "grad_norm": 1.9592931270599365, + "learning_rate": 9.473955079847426e-06, + "loss": 0.2263, + "step": 2240 + }, + { + "epoch": 1.0630929791271346, + "grad_norm": 1.7493445873260498, + "learning_rate": 9.466282839472645e-06, + "loss": 0.1799, + "step": 2241 + }, + { + "epoch": 1.0635673624288424, + "grad_norm": 2.2582602500915527, + "learning_rate": 9.458610914145826e-06, + "loss": 0.1552, + "step": 2242 + }, + { + "epoch": 1.0640417457305502, + "grad_norm": 1.6923972368240356, + "learning_rate": 9.450939308395629e-06, + "loss": 0.1921, + "step": 2243 + }, + { + "epoch": 1.064516129032258, + "grad_norm": 1.8049591779708862, + "learning_rate": 9.443268026750521e-06, + "loss": 0.2018, + "step": 2244 + }, + { + "epoch": 1.0649905123339658, + "grad_norm": 1.8883256912231445, + "learning_rate": 9.435597073738787e-06, + "loss": 0.1884, + "step": 2245 + }, + { + "epoch": 1.0654648956356736, + "grad_norm": 2.0094857215881348, + "learning_rate": 9.427926453888518e-06, + "loss": 0.2109, + "step": 2246 + }, + { + "epoch": 1.0659392789373814, + "grad_norm": 1.975786566734314, + "learning_rate": 9.4202561717276e-06, + "loss": 0.1851, + "step": 2247 + }, + { + "epoch": 1.0664136622390892, + "grad_norm": 1.8108450174331665, + "learning_rate": 9.41258623178373e-06, + "loss": 0.1696, + "step": 2248 + }, + { + "epoch": 1.066888045540797, + "grad_norm": 2.1892595291137695, + "learning_rate": 9.404916638584394e-06, + "loss": 0.1848, + "step": 2249 + }, + { + "epoch": 1.0673624288425048, + "grad_norm": 2.6319334506988525, + "learning_rate": 9.397247396656875e-06, + "loss": 0.217, + "step": 2250 + }, + { + "epoch": 1.0678368121442126, + "grad_norm": 1.9110881090164185, + "learning_rate": 9.389578510528256e-06, + "loss": 0.1634, + "step": 2251 + }, + { + "epoch": 1.0683111954459203, + "grad_norm": 1.8657715320587158, + "learning_rate": 9.381909984725399e-06, + "loss": 0.1685, + "step": 2252 + }, + { + "epoch": 1.0687855787476281, + "grad_norm": 1.8046596050262451, + "learning_rate": 9.374241823774967e-06, + "loss": 0.1784, + "step": 2253 + }, + { + "epoch": 1.069259962049336, + "grad_norm": 1.8186533451080322, + "learning_rate": 9.36657403220339e-06, + "loss": 0.1822, + "step": 2254 + }, + { + "epoch": 1.0697343453510437, + "grad_norm": 2.0945181846618652, + "learning_rate": 9.358906614536895e-06, + "loss": 0.2373, + "step": 2255 + }, + { + "epoch": 1.0702087286527515, + "grad_norm": 2.1683075428009033, + "learning_rate": 9.351239575301479e-06, + "loss": 0.2144, + "step": 2256 + }, + { + "epoch": 1.0706831119544593, + "grad_norm": 1.9169727563858032, + "learning_rate": 9.343572919022924e-06, + "loss": 0.2189, + "step": 2257 + }, + { + "epoch": 1.0711574952561669, + "grad_norm": 1.7258111238479614, + "learning_rate": 9.335906650226783e-06, + "loss": 0.1937, + "step": 2258 + }, + { + "epoch": 1.0716318785578747, + "grad_norm": 2.077392816543579, + "learning_rate": 9.328240773438372e-06, + "loss": 0.1898, + "step": 2259 + }, + { + "epoch": 1.0721062618595825, + "grad_norm": 1.833132028579712, + "learning_rate": 9.320575293182782e-06, + "loss": 0.2103, + "step": 2260 + }, + { + "epoch": 1.0725806451612903, + "grad_norm": 1.7154309749603271, + "learning_rate": 9.312910213984876e-06, + "loss": 0.1733, + "step": 2261 + }, + { + "epoch": 1.073055028462998, + "grad_norm": 1.8700604438781738, + "learning_rate": 9.305245540369273e-06, + "loss": 0.2193, + "step": 2262 + }, + { + "epoch": 1.0735294117647058, + "grad_norm": 1.6208839416503906, + "learning_rate": 9.297581276860353e-06, + "loss": 0.1841, + "step": 2263 + }, + { + "epoch": 1.0740037950664136, + "grad_norm": 1.4968012571334839, + "learning_rate": 9.28991742798225e-06, + "loss": 0.16, + "step": 2264 + }, + { + "epoch": 1.0744781783681214, + "grad_norm": 1.8390696048736572, + "learning_rate": 9.282253998258865e-06, + "loss": 0.2012, + "step": 2265 + }, + { + "epoch": 1.0749525616698292, + "grad_norm": 1.7694889307022095, + "learning_rate": 9.274590992213844e-06, + "loss": 0.1982, + "step": 2266 + }, + { + "epoch": 1.075426944971537, + "grad_norm": 1.4749209880828857, + "learning_rate": 9.26692841437058e-06, + "loss": 0.1581, + "step": 2267 + }, + { + "epoch": 1.0759013282732448, + "grad_norm": 1.6622833013534546, + "learning_rate": 9.259266269252221e-06, + "loss": 0.1919, + "step": 2268 + }, + { + "epoch": 1.0763757115749526, + "grad_norm": 1.8222259283065796, + "learning_rate": 9.25160456138165e-06, + "loss": 0.1752, + "step": 2269 + }, + { + "epoch": 1.0768500948766604, + "grad_norm": 2.1095898151397705, + "learning_rate": 9.243943295281505e-06, + "loss": 0.1706, + "step": 2270 + }, + { + "epoch": 1.0773244781783682, + "grad_norm": 1.9356722831726074, + "learning_rate": 9.236282475474146e-06, + "loss": 0.2063, + "step": 2271 + }, + { + "epoch": 1.077798861480076, + "grad_norm": 2.4067156314849854, + "learning_rate": 9.228622106481691e-06, + "loss": 0.188, + "step": 2272 + }, + { + "epoch": 1.0782732447817838, + "grad_norm": 3.1411545276641846, + "learning_rate": 9.22096219282597e-06, + "loss": 0.2492, + "step": 2273 + }, + { + "epoch": 1.0787476280834916, + "grad_norm": 1.8042513132095337, + "learning_rate": 9.213302739028555e-06, + "loss": 0.1861, + "step": 2274 + }, + { + "epoch": 1.0792220113851991, + "grad_norm": 1.792336106300354, + "learning_rate": 9.205643749610751e-06, + "loss": 0.1982, + "step": 2275 + }, + { + "epoch": 1.079696394686907, + "grad_norm": 3.020674228668213, + "learning_rate": 9.19798522909358e-06, + "loss": 0.1797, + "step": 2276 + }, + { + "epoch": 1.0801707779886147, + "grad_norm": 1.7047805786132812, + "learning_rate": 9.190327181997796e-06, + "loss": 0.1772, + "step": 2277 + }, + { + "epoch": 1.0806451612903225, + "grad_norm": 2.018634080886841, + "learning_rate": 9.182669612843861e-06, + "loss": 0.1895, + "step": 2278 + }, + { + "epoch": 1.0811195445920303, + "grad_norm": 1.8824384212493896, + "learning_rate": 9.175012526151968e-06, + "loss": 0.1718, + "step": 2279 + }, + { + "epoch": 1.081593927893738, + "grad_norm": 1.511277437210083, + "learning_rate": 9.167355926442013e-06, + "loss": 0.1604, + "step": 2280 + }, + { + "epoch": 1.0820683111954459, + "grad_norm": 1.670013427734375, + "learning_rate": 9.15969981823362e-06, + "loss": 0.1973, + "step": 2281 + }, + { + "epoch": 1.0825426944971537, + "grad_norm": 1.8747272491455078, + "learning_rate": 9.152044206046106e-06, + "loss": 0.1461, + "step": 2282 + }, + { + "epoch": 1.0830170777988615, + "grad_norm": 1.7594618797302246, + "learning_rate": 9.144389094398508e-06, + "loss": 0.1442, + "step": 2283 + }, + { + "epoch": 1.0834914611005693, + "grad_norm": 2.149158477783203, + "learning_rate": 9.136734487809559e-06, + "loss": 0.2489, + "step": 2284 + }, + { + "epoch": 1.083965844402277, + "grad_norm": 1.746847152709961, + "learning_rate": 9.129080390797699e-06, + "loss": 0.1697, + "step": 2285 + }, + { + "epoch": 1.0844402277039848, + "grad_norm": 1.7937593460083008, + "learning_rate": 9.12142680788107e-06, + "loss": 0.166, + "step": 2286 + }, + { + "epoch": 1.0849146110056926, + "grad_norm": 2.1846930980682373, + "learning_rate": 9.113773743577502e-06, + "loss": 0.1949, + "step": 2287 + }, + { + "epoch": 1.0853889943074004, + "grad_norm": 2.1543169021606445, + "learning_rate": 9.106121202404521e-06, + "loss": 0.2145, + "step": 2288 + }, + { + "epoch": 1.0858633776091082, + "grad_norm": 1.6700936555862427, + "learning_rate": 9.098469188879348e-06, + "loss": 0.164, + "step": 2289 + }, + { + "epoch": 1.086337760910816, + "grad_norm": 1.5210555791854858, + "learning_rate": 9.090817707518893e-06, + "loss": 0.1588, + "step": 2290 + }, + { + "epoch": 1.0868121442125238, + "grad_norm": 1.8209552764892578, + "learning_rate": 9.083166762839751e-06, + "loss": 0.2126, + "step": 2291 + }, + { + "epoch": 1.0872865275142316, + "grad_norm": 1.954764485359192, + "learning_rate": 9.075516359358195e-06, + "loss": 0.1862, + "step": 2292 + }, + { + "epoch": 1.0877609108159392, + "grad_norm": 1.4143950939178467, + "learning_rate": 9.067866501590182e-06, + "loss": 0.1454, + "step": 2293 + }, + { + "epoch": 1.088235294117647, + "grad_norm": 2.105008363723755, + "learning_rate": 9.06021719405135e-06, + "loss": 0.2585, + "step": 2294 + }, + { + "epoch": 1.0887096774193548, + "grad_norm": 2.2465155124664307, + "learning_rate": 9.05256844125701e-06, + "loss": 0.2133, + "step": 2295 + }, + { + "epoch": 1.0891840607210626, + "grad_norm": 2.175455331802368, + "learning_rate": 9.044920247722146e-06, + "loss": 0.2389, + "step": 2296 + }, + { + "epoch": 1.0896584440227703, + "grad_norm": 1.8581993579864502, + "learning_rate": 9.037272617961405e-06, + "loss": 0.2035, + "step": 2297 + }, + { + "epoch": 1.0901328273244781, + "grad_norm": 1.7983685731887817, + "learning_rate": 9.02962555648911e-06, + "loss": 0.2225, + "step": 2298 + }, + { + "epoch": 1.090607210626186, + "grad_norm": 1.98671293258667, + "learning_rate": 9.021979067819252e-06, + "loss": 0.2191, + "step": 2299 + }, + { + "epoch": 1.0910815939278937, + "grad_norm": 1.4009077548980713, + "learning_rate": 9.014333156465467e-06, + "loss": 0.1482, + "step": 2300 + }, + { + "epoch": 1.0915559772296015, + "grad_norm": 1.6995691061019897, + "learning_rate": 9.00668782694107e-06, + "loss": 0.1738, + "step": 2301 + }, + { + "epoch": 1.0920303605313093, + "grad_norm": 1.6954809427261353, + "learning_rate": 8.999043083759016e-06, + "loss": 0.1835, + "step": 2302 + }, + { + "epoch": 1.092504743833017, + "grad_norm": 1.936194896697998, + "learning_rate": 8.99139893143193e-06, + "loss": 0.2194, + "step": 2303 + }, + { + "epoch": 1.092979127134725, + "grad_norm": 1.9008138179779053, + "learning_rate": 8.983755374472069e-06, + "loss": 0.2544, + "step": 2304 + }, + { + "epoch": 1.0934535104364327, + "grad_norm": 1.8978139162063599, + "learning_rate": 8.976112417391358e-06, + "loss": 0.2117, + "step": 2305 + }, + { + "epoch": 1.0939278937381405, + "grad_norm": 1.6767897605895996, + "learning_rate": 8.968470064701354e-06, + "loss": 0.1708, + "step": 2306 + }, + { + "epoch": 1.0944022770398483, + "grad_norm": 2.310898780822754, + "learning_rate": 8.960828320913263e-06, + "loss": 0.2313, + "step": 2307 + }, + { + "epoch": 1.094876660341556, + "grad_norm": 1.91811203956604, + "learning_rate": 8.953187190537929e-06, + "loss": 0.1858, + "step": 2308 + }, + { + "epoch": 1.0953510436432639, + "grad_norm": 1.711073398590088, + "learning_rate": 8.945546678085838e-06, + "loss": 0.183, + "step": 2309 + }, + { + "epoch": 1.0958254269449714, + "grad_norm": 1.898019552230835, + "learning_rate": 8.937906788067114e-06, + "loss": 0.1654, + "step": 2310 + }, + { + "epoch": 1.0962998102466792, + "grad_norm": 1.8163573741912842, + "learning_rate": 8.9302675249915e-06, + "loss": 0.1461, + "step": 2311 + }, + { + "epoch": 1.096774193548387, + "grad_norm": 2.2861268520355225, + "learning_rate": 8.922628893368378e-06, + "loss": 0.2169, + "step": 2312 + }, + { + "epoch": 1.0972485768500948, + "grad_norm": 1.6889331340789795, + "learning_rate": 8.91499089770676e-06, + "loss": 0.1876, + "step": 2313 + }, + { + "epoch": 1.0977229601518026, + "grad_norm": 1.7860161066055298, + "learning_rate": 8.90735354251528e-06, + "loss": 0.1808, + "step": 2314 + }, + { + "epoch": 1.0981973434535104, + "grad_norm": 1.4944920539855957, + "learning_rate": 8.899716832302193e-06, + "loss": 0.142, + "step": 2315 + }, + { + "epoch": 1.0986717267552182, + "grad_norm": 2.013535737991333, + "learning_rate": 8.89208077157537e-06, + "loss": 0.2055, + "step": 2316 + }, + { + "epoch": 1.099146110056926, + "grad_norm": 1.811508059501648, + "learning_rate": 8.884445364842304e-06, + "loss": 0.199, + "step": 2317 + }, + { + "epoch": 1.0996204933586338, + "grad_norm": 2.4114105701446533, + "learning_rate": 8.8768106166101e-06, + "loss": 0.2022, + "step": 2318 + }, + { + "epoch": 1.1000948766603416, + "grad_norm": 2.2680139541625977, + "learning_rate": 8.869176531385476e-06, + "loss": 0.2317, + "step": 2319 + }, + { + "epoch": 1.1005692599620494, + "grad_norm": 1.6140555143356323, + "learning_rate": 8.861543113674758e-06, + "loss": 0.1574, + "step": 2320 + }, + { + "epoch": 1.1010436432637571, + "grad_norm": 1.9578766822814941, + "learning_rate": 8.853910367983871e-06, + "loss": 0.1904, + "step": 2321 + }, + { + "epoch": 1.101518026565465, + "grad_norm": 2.459045886993408, + "learning_rate": 8.846278298818352e-06, + "loss": 0.1852, + "step": 2322 + }, + { + "epoch": 1.1019924098671727, + "grad_norm": 1.5891822576522827, + "learning_rate": 8.838646910683338e-06, + "loss": 0.1727, + "step": 2323 + }, + { + "epoch": 1.1024667931688805, + "grad_norm": 1.923692226409912, + "learning_rate": 8.831016208083563e-06, + "loss": 0.1874, + "step": 2324 + }, + { + "epoch": 1.1029411764705883, + "grad_norm": 1.6137306690216064, + "learning_rate": 8.82338619552335e-06, + "loss": 0.1609, + "step": 2325 + }, + { + "epoch": 1.103415559772296, + "grad_norm": 2.0651705265045166, + "learning_rate": 8.815756877506622e-06, + "loss": 0.1954, + "step": 2326 + }, + { + "epoch": 1.103889943074004, + "grad_norm": 1.705381155014038, + "learning_rate": 8.808128258536893e-06, + "loss": 0.2241, + "step": 2327 + }, + { + "epoch": 1.1043643263757117, + "grad_norm": 1.756785273551941, + "learning_rate": 8.800500343117255e-06, + "loss": 0.1729, + "step": 2328 + }, + { + "epoch": 1.1048387096774193, + "grad_norm": 2.024177312850952, + "learning_rate": 8.7928731357504e-06, + "loss": 0.223, + "step": 2329 + }, + { + "epoch": 1.105313092979127, + "grad_norm": 1.7107657194137573, + "learning_rate": 8.785246640938584e-06, + "loss": 0.1882, + "step": 2330 + }, + { + "epoch": 1.1057874762808348, + "grad_norm": 2.2373104095458984, + "learning_rate": 8.777620863183658e-06, + "loss": 0.2527, + "step": 2331 + }, + { + "epoch": 1.1062618595825426, + "grad_norm": 1.823649525642395, + "learning_rate": 8.769995806987037e-06, + "loss": 0.2164, + "step": 2332 + }, + { + "epoch": 1.1067362428842504, + "grad_norm": 1.7617485523223877, + "learning_rate": 8.762371476849722e-06, + "loss": 0.1942, + "step": 2333 + }, + { + "epoch": 1.1072106261859582, + "grad_norm": 1.7329638004302979, + "learning_rate": 8.754747877272279e-06, + "loss": 0.1631, + "step": 2334 + }, + { + "epoch": 1.107685009487666, + "grad_norm": 1.7115880250930786, + "learning_rate": 8.747125012754839e-06, + "loss": 0.174, + "step": 2335 + }, + { + "epoch": 1.1081593927893738, + "grad_norm": 1.9890222549438477, + "learning_rate": 8.739502887797108e-06, + "loss": 0.1825, + "step": 2336 + }, + { + "epoch": 1.1086337760910816, + "grad_norm": 1.9389824867248535, + "learning_rate": 8.731881506898348e-06, + "loss": 0.1788, + "step": 2337 + }, + { + "epoch": 1.1091081593927894, + "grad_norm": 1.6447457075119019, + "learning_rate": 8.724260874557384e-06, + "loss": 0.1562, + "step": 2338 + }, + { + "epoch": 1.1095825426944972, + "grad_norm": 1.9594886302947998, + "learning_rate": 8.716640995272607e-06, + "loss": 0.2101, + "step": 2339 + }, + { + "epoch": 1.110056925996205, + "grad_norm": 1.9339709281921387, + "learning_rate": 8.70902187354195e-06, + "loss": 0.2179, + "step": 2340 + }, + { + "epoch": 1.1105313092979128, + "grad_norm": 1.7604994773864746, + "learning_rate": 8.701403513862901e-06, + "loss": 0.1864, + "step": 2341 + }, + { + "epoch": 1.1110056925996206, + "grad_norm": 1.8655658960342407, + "learning_rate": 8.69378592073251e-06, + "loss": 0.1842, + "step": 2342 + }, + { + "epoch": 1.1114800759013284, + "grad_norm": 1.4417341947555542, + "learning_rate": 8.68616909864737e-06, + "loss": 0.1523, + "step": 2343 + }, + { + "epoch": 1.1119544592030361, + "grad_norm": 1.832533836364746, + "learning_rate": 8.678553052103605e-06, + "loss": 0.1856, + "step": 2344 + }, + { + "epoch": 1.1124288425047437, + "grad_norm": 2.599240303039551, + "learning_rate": 8.670937785596897e-06, + "loss": 0.1896, + "step": 2345 + }, + { + "epoch": 1.1129032258064515, + "grad_norm": 1.5967415571212769, + "learning_rate": 8.663323303622462e-06, + "loss": 0.1631, + "step": 2346 + }, + { + "epoch": 1.1133776091081593, + "grad_norm": 1.8945375680923462, + "learning_rate": 8.655709610675056e-06, + "loss": 0.1659, + "step": 2347 + }, + { + "epoch": 1.113851992409867, + "grad_norm": 2.1004648208618164, + "learning_rate": 8.648096711248967e-06, + "loss": 0.2273, + "step": 2348 + }, + { + "epoch": 1.114326375711575, + "grad_norm": 1.748862624168396, + "learning_rate": 8.640484609838007e-06, + "loss": 0.1857, + "step": 2349 + }, + { + "epoch": 1.1148007590132827, + "grad_norm": 1.9971057176589966, + "learning_rate": 8.632873310935528e-06, + "loss": 0.1852, + "step": 2350 + }, + { + "epoch": 1.1152751423149905, + "grad_norm": 1.7587655782699585, + "learning_rate": 8.625262819034408e-06, + "loss": 0.1937, + "step": 2351 + }, + { + "epoch": 1.1157495256166983, + "grad_norm": 1.5481939315795898, + "learning_rate": 8.61765313862704e-06, + "loss": 0.1684, + "step": 2352 + }, + { + "epoch": 1.116223908918406, + "grad_norm": 1.8051375150680542, + "learning_rate": 8.610044274205352e-06, + "loss": 0.2023, + "step": 2353 + }, + { + "epoch": 1.1166982922201139, + "grad_norm": 2.1944355964660645, + "learning_rate": 8.602436230260768e-06, + "loss": 0.1854, + "step": 2354 + }, + { + "epoch": 1.1171726755218216, + "grad_norm": 2.0533533096313477, + "learning_rate": 8.59482901128425e-06, + "loss": 0.2236, + "step": 2355 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 2.361926555633545, + "learning_rate": 8.58722262176626e-06, + "loss": 0.2015, + "step": 2356 + }, + { + "epoch": 1.1181214421252372, + "grad_norm": 1.664730191230774, + "learning_rate": 8.579617066196777e-06, + "loss": 0.1824, + "step": 2357 + }, + { + "epoch": 1.118595825426945, + "grad_norm": 1.7999298572540283, + "learning_rate": 8.572012349065288e-06, + "loss": 0.1814, + "step": 2358 + }, + { + "epoch": 1.1190702087286528, + "grad_norm": 2.117183208465576, + "learning_rate": 8.564408474860774e-06, + "loss": 0.2215, + "step": 2359 + }, + { + "epoch": 1.1195445920303606, + "grad_norm": 1.658808946609497, + "learning_rate": 8.556805448071736e-06, + "loss": 0.1568, + "step": 2360 + }, + { + "epoch": 1.1200189753320684, + "grad_norm": 2.2438066005706787, + "learning_rate": 8.549203273186156e-06, + "loss": 0.1864, + "step": 2361 + }, + { + "epoch": 1.1204933586337762, + "grad_norm": 1.6112077236175537, + "learning_rate": 8.541601954691534e-06, + "loss": 0.162, + "step": 2362 + }, + { + "epoch": 1.120967741935484, + "grad_norm": 1.4597880840301514, + "learning_rate": 8.534001497074842e-06, + "loss": 0.1589, + "step": 2363 + }, + { + "epoch": 1.1214421252371916, + "grad_norm": 1.9932173490524292, + "learning_rate": 8.52640190482256e-06, + "loss": 0.2256, + "step": 2364 + }, + { + "epoch": 1.1219165085388993, + "grad_norm": 1.9947072267532349, + "learning_rate": 8.518803182420651e-06, + "loss": 0.2217, + "step": 2365 + }, + { + "epoch": 1.1223908918406071, + "grad_norm": 1.7771036624908447, + "learning_rate": 8.511205334354566e-06, + "loss": 0.1938, + "step": 2366 + }, + { + "epoch": 1.122865275142315, + "grad_norm": 1.7289372682571411, + "learning_rate": 8.503608365109247e-06, + "loss": 0.1559, + "step": 2367 + }, + { + "epoch": 1.1233396584440227, + "grad_norm": 1.8934062719345093, + "learning_rate": 8.496012279169097e-06, + "loss": 0.1868, + "step": 2368 + }, + { + "epoch": 1.1238140417457305, + "grad_norm": 1.762656569480896, + "learning_rate": 8.488417081018015e-06, + "loss": 0.1465, + "step": 2369 + }, + { + "epoch": 1.1242884250474383, + "grad_norm": 1.8933993577957153, + "learning_rate": 8.480822775139371e-06, + "loss": 0.2391, + "step": 2370 + }, + { + "epoch": 1.124762808349146, + "grad_norm": 1.5721724033355713, + "learning_rate": 8.473229366016014e-06, + "loss": 0.1591, + "step": 2371 + }, + { + "epoch": 1.125237191650854, + "grad_norm": 1.9659096002578735, + "learning_rate": 8.465636858130255e-06, + "loss": 0.1897, + "step": 2372 + }, + { + "epoch": 1.1257115749525617, + "grad_norm": 2.4556431770324707, + "learning_rate": 8.45804525596387e-06, + "loss": 0.2287, + "step": 2373 + }, + { + "epoch": 1.1261859582542695, + "grad_norm": 1.9868581295013428, + "learning_rate": 8.450454563998117e-06, + "loss": 0.2112, + "step": 2374 + }, + { + "epoch": 1.1266603415559773, + "grad_norm": 1.849770188331604, + "learning_rate": 8.4428647867137e-06, + "loss": 0.1841, + "step": 2375 + }, + { + "epoch": 1.127134724857685, + "grad_norm": 1.5524128675460815, + "learning_rate": 8.435275928590789e-06, + "loss": 0.1456, + "step": 2376 + }, + { + "epoch": 1.1276091081593929, + "grad_norm": 1.8452872037887573, + "learning_rate": 8.427687994109017e-06, + "loss": 0.1729, + "step": 2377 + }, + { + "epoch": 1.1280834914611007, + "grad_norm": 2.2624077796936035, + "learning_rate": 8.42010098774746e-06, + "loss": 0.2093, + "step": 2378 + }, + { + "epoch": 1.1285578747628084, + "grad_norm": 1.6121922731399536, + "learning_rate": 8.412514913984657e-06, + "loss": 0.157, + "step": 2379 + }, + { + "epoch": 1.129032258064516, + "grad_norm": 1.7655012607574463, + "learning_rate": 8.404929777298592e-06, + "loss": 0.1697, + "step": 2380 + }, + { + "epoch": 1.1295066413662238, + "grad_norm": 1.811368703842163, + "learning_rate": 8.3973455821667e-06, + "loss": 0.1537, + "step": 2381 + }, + { + "epoch": 1.1299810246679316, + "grad_norm": 1.8110166788101196, + "learning_rate": 8.389762333065847e-06, + "loss": 0.2188, + "step": 2382 + }, + { + "epoch": 1.1304554079696394, + "grad_norm": 1.7781898975372314, + "learning_rate": 8.382180034472353e-06, + "loss": 0.2, + "step": 2383 + }, + { + "epoch": 1.1309297912713472, + "grad_norm": 1.581636905670166, + "learning_rate": 8.374598690861978e-06, + "loss": 0.1607, + "step": 2384 + }, + { + "epoch": 1.131404174573055, + "grad_norm": 2.6846189498901367, + "learning_rate": 8.367018306709913e-06, + "loss": 0.1924, + "step": 2385 + }, + { + "epoch": 1.1318785578747628, + "grad_norm": 2.4494681358337402, + "learning_rate": 8.359438886490783e-06, + "loss": 0.2339, + "step": 2386 + }, + { + "epoch": 1.1323529411764706, + "grad_norm": 1.944273591041565, + "learning_rate": 8.351860434678641e-06, + "loss": 0.186, + "step": 2387 + }, + { + "epoch": 1.1328273244781784, + "grad_norm": 1.9705138206481934, + "learning_rate": 8.344282955746978e-06, + "loss": 0.186, + "step": 2388 + }, + { + "epoch": 1.1333017077798861, + "grad_norm": 1.6467750072479248, + "learning_rate": 8.336706454168701e-06, + "loss": 0.1791, + "step": 2389 + }, + { + "epoch": 1.133776091081594, + "grad_norm": 1.7570585012435913, + "learning_rate": 8.329130934416142e-06, + "loss": 0.1567, + "step": 2390 + }, + { + "epoch": 1.1342504743833017, + "grad_norm": 1.4821341037750244, + "learning_rate": 8.321556400961067e-06, + "loss": 0.1465, + "step": 2391 + }, + { + "epoch": 1.1347248576850095, + "grad_norm": 2.2223289012908936, + "learning_rate": 8.313982858274634e-06, + "loss": 0.2004, + "step": 2392 + }, + { + "epoch": 1.1351992409867173, + "grad_norm": 1.931081771850586, + "learning_rate": 8.306410310827435e-06, + "loss": 0.1863, + "step": 2393 + }, + { + "epoch": 1.135673624288425, + "grad_norm": 1.7010200023651123, + "learning_rate": 8.298838763089471e-06, + "loss": 0.1667, + "step": 2394 + }, + { + "epoch": 1.136148007590133, + "grad_norm": 2.051765203475952, + "learning_rate": 8.291268219530153e-06, + "loss": 0.1978, + "step": 2395 + }, + { + "epoch": 1.1366223908918407, + "grad_norm": 1.797257900238037, + "learning_rate": 8.2836986846183e-06, + "loss": 0.1652, + "step": 2396 + }, + { + "epoch": 1.1370967741935485, + "grad_norm": 1.760908603668213, + "learning_rate": 8.276130162822124e-06, + "loss": 0.2005, + "step": 2397 + }, + { + "epoch": 1.1375711574952563, + "grad_norm": 1.7077192068099976, + "learning_rate": 8.268562658609254e-06, + "loss": 0.2053, + "step": 2398 + }, + { + "epoch": 1.1380455407969639, + "grad_norm": 2.186999797821045, + "learning_rate": 8.260996176446716e-06, + "loss": 0.2174, + "step": 2399 + }, + { + "epoch": 1.1385199240986716, + "grad_norm": 1.788298487663269, + "learning_rate": 8.253430720800928e-06, + "loss": 0.1705, + "step": 2400 + }, + { + "epoch": 1.1389943074003794, + "grad_norm": 2.1071958541870117, + "learning_rate": 8.245866296137701e-06, + "loss": 0.1797, + "step": 2401 + }, + { + "epoch": 1.1394686907020872, + "grad_norm": 1.7165755033493042, + "learning_rate": 8.238302906922242e-06, + "loss": 0.2184, + "step": 2402 + }, + { + "epoch": 1.139943074003795, + "grad_norm": 1.7405943870544434, + "learning_rate": 8.230740557619142e-06, + "loss": 0.1772, + "step": 2403 + }, + { + "epoch": 1.1404174573055028, + "grad_norm": 2.0307819843292236, + "learning_rate": 8.223179252692385e-06, + "loss": 0.2016, + "step": 2404 + }, + { + "epoch": 1.1408918406072106, + "grad_norm": 1.6938095092773438, + "learning_rate": 8.215618996605336e-06, + "loss": 0.1615, + "step": 2405 + }, + { + "epoch": 1.1413662239089184, + "grad_norm": 1.7445240020751953, + "learning_rate": 8.208059793820731e-06, + "loss": 0.2164, + "step": 2406 + }, + { + "epoch": 1.1418406072106262, + "grad_norm": 2.3769569396972656, + "learning_rate": 8.200501648800698e-06, + "loss": 0.1711, + "step": 2407 + }, + { + "epoch": 1.142314990512334, + "grad_norm": 1.518480658531189, + "learning_rate": 8.192944566006737e-06, + "loss": 0.1561, + "step": 2408 + }, + { + "epoch": 1.1427893738140418, + "grad_norm": 1.9066368341445923, + "learning_rate": 8.185388549899715e-06, + "loss": 0.2031, + "step": 2409 + }, + { + "epoch": 1.1432637571157496, + "grad_norm": 2.3878681659698486, + "learning_rate": 8.17783360493988e-06, + "loss": 0.2288, + "step": 2410 + }, + { + "epoch": 1.1437381404174574, + "grad_norm": 2.5900661945343018, + "learning_rate": 8.170279735586833e-06, + "loss": 0.1964, + "step": 2411 + }, + { + "epoch": 1.1442125237191652, + "grad_norm": 1.9122105836868286, + "learning_rate": 8.162726946299556e-06, + "loss": 0.1934, + "step": 2412 + }, + { + "epoch": 1.144686907020873, + "grad_norm": 2.024298667907715, + "learning_rate": 8.155175241536377e-06, + "loss": 0.2132, + "step": 2413 + }, + { + "epoch": 1.1451612903225807, + "grad_norm": 1.898661732673645, + "learning_rate": 8.147624625754999e-06, + "loss": 0.2355, + "step": 2414 + }, + { + "epoch": 1.1456356736242883, + "grad_norm": 1.5798265933990479, + "learning_rate": 8.140075103412477e-06, + "loss": 0.1495, + "step": 2415 + }, + { + "epoch": 1.146110056925996, + "grad_norm": 1.6848498582839966, + "learning_rate": 8.132526678965215e-06, + "loss": 0.1671, + "step": 2416 + }, + { + "epoch": 1.146584440227704, + "grad_norm": 1.576760172843933, + "learning_rate": 8.124979356868976e-06, + "loss": 0.1631, + "step": 2417 + }, + { + "epoch": 1.1470588235294117, + "grad_norm": 2.446101427078247, + "learning_rate": 8.117433141578865e-06, + "loss": 0.253, + "step": 2418 + }, + { + "epoch": 1.1475332068311195, + "grad_norm": 1.735353946685791, + "learning_rate": 8.109888037549346e-06, + "loss": 0.1542, + "step": 2419 + }, + { + "epoch": 1.1480075901328273, + "grad_norm": 1.9680674076080322, + "learning_rate": 8.102344049234213e-06, + "loss": 0.2229, + "step": 2420 + }, + { + "epoch": 1.148481973434535, + "grad_norm": 1.939218521118164, + "learning_rate": 8.094801181086612e-06, + "loss": 0.188, + "step": 2421 + }, + { + "epoch": 1.1489563567362429, + "grad_norm": 1.6669310331344604, + "learning_rate": 8.087259437559017e-06, + "loss": 0.1843, + "step": 2422 + }, + { + "epoch": 1.1494307400379506, + "grad_norm": 1.6978830099105835, + "learning_rate": 8.079718823103251e-06, + "loss": 0.1643, + "step": 2423 + }, + { + "epoch": 1.1499051233396584, + "grad_norm": 1.4840154647827148, + "learning_rate": 8.072179342170461e-06, + "loss": 0.1328, + "step": 2424 + }, + { + "epoch": 1.1503795066413662, + "grad_norm": 1.5778673887252808, + "learning_rate": 8.06464099921113e-06, + "loss": 0.1554, + "step": 2425 + }, + { + "epoch": 1.150853889943074, + "grad_norm": 1.7717269659042358, + "learning_rate": 8.057103798675063e-06, + "loss": 0.1731, + "step": 2426 + }, + { + "epoch": 1.1513282732447818, + "grad_norm": 1.752498984336853, + "learning_rate": 8.0495677450114e-06, + "loss": 0.1794, + "step": 2427 + }, + { + "epoch": 1.1518026565464896, + "grad_norm": 1.9130120277404785, + "learning_rate": 8.042032842668598e-06, + "loss": 0.1863, + "step": 2428 + }, + { + "epoch": 1.1522770398481974, + "grad_norm": 1.86540687084198, + "learning_rate": 8.034499096094434e-06, + "loss": 0.1823, + "step": 2429 + }, + { + "epoch": 1.1527514231499052, + "grad_norm": 2.4312217235565186, + "learning_rate": 8.026966509736001e-06, + "loss": 0.1981, + "step": 2430 + }, + { + "epoch": 1.153225806451613, + "grad_norm": 1.7719708681106567, + "learning_rate": 8.019435088039714e-06, + "loss": 0.1733, + "step": 2431 + }, + { + "epoch": 1.1537001897533208, + "grad_norm": 2.0009617805480957, + "learning_rate": 8.011904835451298e-06, + "loss": 0.178, + "step": 2432 + }, + { + "epoch": 1.1541745730550286, + "grad_norm": 2.0233676433563232, + "learning_rate": 8.004375756415783e-06, + "loss": 0.1689, + "step": 2433 + }, + { + "epoch": 1.1546489563567364, + "grad_norm": 1.6531426906585693, + "learning_rate": 7.996847855377514e-06, + "loss": 0.1642, + "step": 2434 + }, + { + "epoch": 1.155123339658444, + "grad_norm": 2.1204326152801514, + "learning_rate": 7.989321136780131e-06, + "loss": 0.1987, + "step": 2435 + }, + { + "epoch": 1.1555977229601517, + "grad_norm": 1.7340539693832397, + "learning_rate": 7.981795605066585e-06, + "loss": 0.1949, + "step": 2436 + }, + { + "epoch": 1.1560721062618595, + "grad_norm": 1.8401095867156982, + "learning_rate": 7.974271264679122e-06, + "loss": 0.1563, + "step": 2437 + }, + { + "epoch": 1.1565464895635673, + "grad_norm": 1.637509822845459, + "learning_rate": 7.966748120059286e-06, + "loss": 0.1538, + "step": 2438 + }, + { + "epoch": 1.157020872865275, + "grad_norm": 2.007936954498291, + "learning_rate": 7.959226175647919e-06, + "loss": 0.1921, + "step": 2439 + }, + { + "epoch": 1.157495256166983, + "grad_norm": 1.9487900733947754, + "learning_rate": 7.951705435885143e-06, + "loss": 0.1798, + "step": 2440 + }, + { + "epoch": 1.1579696394686907, + "grad_norm": 1.8263933658599854, + "learning_rate": 7.944185905210377e-06, + "loss": 0.1991, + "step": 2441 + }, + { + "epoch": 1.1584440227703985, + "grad_norm": 2.152977466583252, + "learning_rate": 7.93666758806233e-06, + "loss": 0.1981, + "step": 2442 + }, + { + "epoch": 1.1589184060721063, + "grad_norm": 1.8188046216964722, + "learning_rate": 7.929150488878991e-06, + "loss": 0.1388, + "step": 2443 + }, + { + "epoch": 1.159392789373814, + "grad_norm": 1.7171995639801025, + "learning_rate": 7.921634612097623e-06, + "loss": 0.1689, + "step": 2444 + }, + { + "epoch": 1.1598671726755219, + "grad_norm": 1.9360337257385254, + "learning_rate": 7.914119962154779e-06, + "loss": 0.1858, + "step": 2445 + }, + { + "epoch": 1.1603415559772297, + "grad_norm": 1.8121434450149536, + "learning_rate": 7.906606543486278e-06, + "loss": 0.169, + "step": 2446 + }, + { + "epoch": 1.1608159392789374, + "grad_norm": 2.7344167232513428, + "learning_rate": 7.89909436052722e-06, + "loss": 0.2118, + "step": 2447 + }, + { + "epoch": 1.1612903225806452, + "grad_norm": 2.18343186378479, + "learning_rate": 7.891583417711975e-06, + "loss": 0.21, + "step": 2448 + }, + { + "epoch": 1.161764705882353, + "grad_norm": 1.4035104513168335, + "learning_rate": 7.884073719474174e-06, + "loss": 0.152, + "step": 2449 + }, + { + "epoch": 1.1622390891840606, + "grad_norm": 1.8150465488433838, + "learning_rate": 7.876565270246715e-06, + "loss": 0.1535, + "step": 2450 + }, + { + "epoch": 1.1627134724857684, + "grad_norm": 1.967027187347412, + "learning_rate": 7.869058074461766e-06, + "loss": 0.1987, + "step": 2451 + }, + { + "epoch": 1.1631878557874762, + "grad_norm": 1.906870722770691, + "learning_rate": 7.86155213655075e-06, + "loss": 0.1772, + "step": 2452 + }, + { + "epoch": 1.163662239089184, + "grad_norm": 1.788644790649414, + "learning_rate": 7.85404746094435e-06, + "loss": 0.1803, + "step": 2453 + }, + { + "epoch": 1.1641366223908918, + "grad_norm": 2.3639960289001465, + "learning_rate": 7.846544052072494e-06, + "loss": 0.1969, + "step": 2454 + }, + { + "epoch": 1.1646110056925996, + "grad_norm": 1.8786990642547607, + "learning_rate": 7.839041914364375e-06, + "loss": 0.2137, + "step": 2455 + }, + { + "epoch": 1.1650853889943074, + "grad_norm": 1.9279520511627197, + "learning_rate": 7.831541052248433e-06, + "loss": 0.1631, + "step": 2456 + }, + { + "epoch": 1.1655597722960152, + "grad_norm": 1.9649664163589478, + "learning_rate": 7.824041470152346e-06, + "loss": 0.1736, + "step": 2457 + }, + { + "epoch": 1.166034155597723, + "grad_norm": 2.580247402191162, + "learning_rate": 7.816543172503052e-06, + "loss": 0.2557, + "step": 2458 + }, + { + "epoch": 1.1665085388994307, + "grad_norm": 1.8188077211380005, + "learning_rate": 7.809046163726715e-06, + "loss": 0.1678, + "step": 2459 + }, + { + "epoch": 1.1669829222011385, + "grad_norm": 1.8238269090652466, + "learning_rate": 7.801550448248746e-06, + "loss": 0.2021, + "step": 2460 + }, + { + "epoch": 1.1674573055028463, + "grad_norm": 1.9044618606567383, + "learning_rate": 7.794056030493793e-06, + "loss": 0.1528, + "step": 2461 + }, + { + "epoch": 1.1679316888045541, + "grad_norm": 1.5531914234161377, + "learning_rate": 7.78656291488574e-06, + "loss": 0.1613, + "step": 2462 + }, + { + "epoch": 1.168406072106262, + "grad_norm": 2.1100800037384033, + "learning_rate": 7.779071105847692e-06, + "loss": 0.2401, + "step": 2463 + }, + { + "epoch": 1.1688804554079697, + "grad_norm": 1.6972509622573853, + "learning_rate": 7.771580607801994e-06, + "loss": 0.2089, + "step": 2464 + }, + { + "epoch": 1.1693548387096775, + "grad_norm": 1.727967381477356, + "learning_rate": 7.76409142517021e-06, + "loss": 0.1693, + "step": 2465 + }, + { + "epoch": 1.1698292220113853, + "grad_norm": 2.0903284549713135, + "learning_rate": 7.756603562373134e-06, + "loss": 0.1983, + "step": 2466 + }, + { + "epoch": 1.170303605313093, + "grad_norm": 1.5043290853500366, + "learning_rate": 7.749117023830779e-06, + "loss": 0.1714, + "step": 2467 + }, + { + "epoch": 1.1707779886148009, + "grad_norm": 1.7228398323059082, + "learning_rate": 7.741631813962367e-06, + "loss": 0.1587, + "step": 2468 + }, + { + "epoch": 1.1712523719165087, + "grad_norm": 1.9864158630371094, + "learning_rate": 7.73414793718635e-06, + "loss": 0.1873, + "step": 2469 + }, + { + "epoch": 1.1717267552182162, + "grad_norm": 1.7577637434005737, + "learning_rate": 7.72666539792038e-06, + "loss": 0.1599, + "step": 2470 + }, + { + "epoch": 1.172201138519924, + "grad_norm": 1.9289382696151733, + "learning_rate": 7.719184200581334e-06, + "loss": 0.1762, + "step": 2471 + }, + { + "epoch": 1.1726755218216318, + "grad_norm": 1.6133570671081543, + "learning_rate": 7.711704349585287e-06, + "loss": 0.1568, + "step": 2472 + }, + { + "epoch": 1.1731499051233396, + "grad_norm": 1.882628083229065, + "learning_rate": 7.704225849347517e-06, + "loss": 0.1661, + "step": 2473 + }, + { + "epoch": 1.1736242884250474, + "grad_norm": 2.2073872089385986, + "learning_rate": 7.696748704282507e-06, + "loss": 0.1734, + "step": 2474 + }, + { + "epoch": 1.1740986717267552, + "grad_norm": 1.9427272081375122, + "learning_rate": 7.689272918803946e-06, + "loss": 0.1713, + "step": 2475 + }, + { + "epoch": 1.174573055028463, + "grad_norm": 1.8138381242752075, + "learning_rate": 7.681798497324717e-06, + "loss": 0.1864, + "step": 2476 + }, + { + "epoch": 1.1750474383301708, + "grad_norm": 2.078590154647827, + "learning_rate": 7.674325444256899e-06, + "loss": 0.1737, + "step": 2477 + }, + { + "epoch": 1.1755218216318786, + "grad_norm": 2.3086998462677, + "learning_rate": 7.666853764011752e-06, + "loss": 0.2696, + "step": 2478 + }, + { + "epoch": 1.1759962049335864, + "grad_norm": 2.107063055038452, + "learning_rate": 7.659383460999742e-06, + "loss": 0.1974, + "step": 2479 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 1.7796244621276855, + "learning_rate": 7.651914539630515e-06, + "loss": 0.194, + "step": 2480 + }, + { + "epoch": 1.176944971537002, + "grad_norm": 2.3119170665740967, + "learning_rate": 7.644447004312903e-06, + "loss": 0.2256, + "step": 2481 + }, + { + "epoch": 1.1774193548387097, + "grad_norm": 1.837838053703308, + "learning_rate": 7.63698085945491e-06, + "loss": 0.1657, + "step": 2482 + }, + { + "epoch": 1.1778937381404175, + "grad_norm": 1.628775954246521, + "learning_rate": 7.629516109463732e-06, + "loss": 0.1785, + "step": 2483 + }, + { + "epoch": 1.1783681214421253, + "grad_norm": 1.6993663311004639, + "learning_rate": 7.622052758745741e-06, + "loss": 0.1829, + "step": 2484 + }, + { + "epoch": 1.178842504743833, + "grad_norm": 2.071762800216675, + "learning_rate": 7.614590811706473e-06, + "loss": 0.2427, + "step": 2485 + }, + { + "epoch": 1.1793168880455407, + "grad_norm": 1.9026820659637451, + "learning_rate": 7.607130272750647e-06, + "loss": 0.2126, + "step": 2486 + }, + { + "epoch": 1.1797912713472485, + "grad_norm": 1.604719638824463, + "learning_rate": 7.59967114628214e-06, + "loss": 0.1347, + "step": 2487 + }, + { + "epoch": 1.1802656546489563, + "grad_norm": 1.5981990098953247, + "learning_rate": 7.592213436704004e-06, + "loss": 0.1716, + "step": 2488 + }, + { + "epoch": 1.180740037950664, + "grad_norm": 1.558847188949585, + "learning_rate": 7.58475714841845e-06, + "loss": 0.1558, + "step": 2489 + }, + { + "epoch": 1.1812144212523719, + "grad_norm": 1.6636430025100708, + "learning_rate": 7.577302285826851e-06, + "loss": 0.1693, + "step": 2490 + }, + { + "epoch": 1.1816888045540797, + "grad_norm": 1.62824285030365, + "learning_rate": 7.569848853329742e-06, + "loss": 0.1649, + "step": 2491 + }, + { + "epoch": 1.1821631878557874, + "grad_norm": 1.6984463930130005, + "learning_rate": 7.562396855326805e-06, + "loss": 0.1345, + "step": 2492 + }, + { + "epoch": 1.1826375711574952, + "grad_norm": 2.12964129447937, + "learning_rate": 7.554946296216884e-06, + "loss": 0.1844, + "step": 2493 + }, + { + "epoch": 1.183111954459203, + "grad_norm": 1.8614979982376099, + "learning_rate": 7.547497180397968e-06, + "loss": 0.203, + "step": 2494 + }, + { + "epoch": 1.1835863377609108, + "grad_norm": 1.7780121564865112, + "learning_rate": 7.540049512267197e-06, + "loss": 0.199, + "step": 2495 + }, + { + "epoch": 1.1840607210626186, + "grad_norm": 2.061229705810547, + "learning_rate": 7.532603296220862e-06, + "loss": 0.2002, + "step": 2496 + }, + { + "epoch": 1.1845351043643264, + "grad_norm": 1.7762293815612793, + "learning_rate": 7.525158536654382e-06, + "loss": 0.1714, + "step": 2497 + }, + { + "epoch": 1.1850094876660342, + "grad_norm": 1.525127649307251, + "learning_rate": 7.517715237962328e-06, + "loss": 0.1557, + "step": 2498 + }, + { + "epoch": 1.185483870967742, + "grad_norm": 1.9348129034042358, + "learning_rate": 7.510273404538404e-06, + "loss": 0.1971, + "step": 2499 + }, + { + "epoch": 1.1859582542694498, + "grad_norm": 1.6467643976211548, + "learning_rate": 7.502833040775457e-06, + "loss": 0.1551, + "step": 2500 + }, + { + "epoch": 1.1864326375711576, + "grad_norm": 1.4035449028015137, + "learning_rate": 7.4953941510654535e-06, + "loss": 0.1521, + "step": 2501 + }, + { + "epoch": 1.1869070208728654, + "grad_norm": 1.7778912782669067, + "learning_rate": 7.487956739799496e-06, + "loss": 0.1788, + "step": 2502 + }, + { + "epoch": 1.1873814041745732, + "grad_norm": 1.626543402671814, + "learning_rate": 7.480520811367817e-06, + "loss": 0.1382, + "step": 2503 + }, + { + "epoch": 1.187855787476281, + "grad_norm": 2.147493362426758, + "learning_rate": 7.473086370159776e-06, + "loss": 0.2429, + "step": 2504 + }, + { + "epoch": 1.1883301707779885, + "grad_norm": 1.7285178899765015, + "learning_rate": 7.465653420563846e-06, + "loss": 0.1818, + "step": 2505 + }, + { + "epoch": 1.1888045540796963, + "grad_norm": 1.7863154411315918, + "learning_rate": 7.45822196696762e-06, + "loss": 0.1791, + "step": 2506 + }, + { + "epoch": 1.189278937381404, + "grad_norm": 2.1906211376190186, + "learning_rate": 7.4507920137578146e-06, + "loss": 0.2307, + "step": 2507 + }, + { + "epoch": 1.189753320683112, + "grad_norm": 3.06583309173584, + "learning_rate": 7.443363565320259e-06, + "loss": 0.1962, + "step": 2508 + }, + { + "epoch": 1.1902277039848197, + "grad_norm": 1.7111692428588867, + "learning_rate": 7.435936626039891e-06, + "loss": 0.1744, + "step": 2509 + }, + { + "epoch": 1.1907020872865275, + "grad_norm": 3.7655081748962402, + "learning_rate": 7.428511200300765e-06, + "loss": 0.2345, + "step": 2510 + }, + { + "epoch": 1.1911764705882353, + "grad_norm": 2.3031506538391113, + "learning_rate": 7.421087292486027e-06, + "loss": 0.208, + "step": 2511 + }, + { + "epoch": 1.191650853889943, + "grad_norm": 2.341597318649292, + "learning_rate": 7.41366490697794e-06, + "loss": 0.2249, + "step": 2512 + }, + { + "epoch": 1.1921252371916509, + "grad_norm": 2.380882501602173, + "learning_rate": 7.406244048157867e-06, + "loss": 0.1982, + "step": 2513 + }, + { + "epoch": 1.1925996204933587, + "grad_norm": 2.25535249710083, + "learning_rate": 7.398824720406265e-06, + "loss": 0.2163, + "step": 2514 + }, + { + "epoch": 1.1930740037950665, + "grad_norm": 1.5992552042007446, + "learning_rate": 7.391406928102695e-06, + "loss": 0.1696, + "step": 2515 + }, + { + "epoch": 1.1935483870967742, + "grad_norm": 1.729461908340454, + "learning_rate": 7.383990675625797e-06, + "loss": 0.1747, + "step": 2516 + }, + { + "epoch": 1.194022770398482, + "grad_norm": 2.1388728618621826, + "learning_rate": 7.37657596735332e-06, + "loss": 0.1818, + "step": 2517 + }, + { + "epoch": 1.1944971537001898, + "grad_norm": 1.8862483501434326, + "learning_rate": 7.369162807662087e-06, + "loss": 0.1946, + "step": 2518 + }, + { + "epoch": 1.1949715370018976, + "grad_norm": 1.9275808334350586, + "learning_rate": 7.36175120092802e-06, + "loss": 0.2447, + "step": 2519 + }, + { + "epoch": 1.1954459203036052, + "grad_norm": 2.302499294281006, + "learning_rate": 7.354341151526107e-06, + "loss": 0.1424, + "step": 2520 + }, + { + "epoch": 1.195920303605313, + "grad_norm": 2.07778263092041, + "learning_rate": 7.3469326638304365e-06, + "loss": 0.2255, + "step": 2521 + }, + { + "epoch": 1.1963946869070208, + "grad_norm": 1.843159794807434, + "learning_rate": 7.339525742214157e-06, + "loss": 0.1705, + "step": 2522 + }, + { + "epoch": 1.1968690702087286, + "grad_norm": 2.0549871921539307, + "learning_rate": 7.332120391049506e-06, + "loss": 0.2071, + "step": 2523 + }, + { + "epoch": 1.1973434535104364, + "grad_norm": 1.5816240310668945, + "learning_rate": 7.324716614707794e-06, + "loss": 0.1605, + "step": 2524 + }, + { + "epoch": 1.1978178368121442, + "grad_norm": 1.4971600770950317, + "learning_rate": 7.317314417559389e-06, + "loss": 0.1392, + "step": 2525 + }, + { + "epoch": 1.198292220113852, + "grad_norm": 1.5572508573532104, + "learning_rate": 7.309913803973734e-06, + "loss": 0.1518, + "step": 2526 + }, + { + "epoch": 1.1987666034155597, + "grad_norm": 2.0520272254943848, + "learning_rate": 7.302514778319341e-06, + "loss": 0.2069, + "step": 2527 + }, + { + "epoch": 1.1992409867172675, + "grad_norm": 1.7996068000793457, + "learning_rate": 7.295117344963782e-06, + "loss": 0.2041, + "step": 2528 + }, + { + "epoch": 1.1997153700189753, + "grad_norm": 1.6315232515335083, + "learning_rate": 7.287721508273691e-06, + "loss": 0.1681, + "step": 2529 + }, + { + "epoch": 1.2001897533206831, + "grad_norm": 1.8578884601593018, + "learning_rate": 7.280327272614753e-06, + "loss": 0.1671, + "step": 2530 + }, + { + "epoch": 1.200664136622391, + "grad_norm": 1.7982726097106934, + "learning_rate": 7.272934642351712e-06, + "loss": 0.1849, + "step": 2531 + }, + { + "epoch": 1.2011385199240987, + "grad_norm": 2.00294828414917, + "learning_rate": 7.265543621848368e-06, + "loss": 0.1644, + "step": 2532 + }, + { + "epoch": 1.2016129032258065, + "grad_norm": 1.6262898445129395, + "learning_rate": 7.2581542154675654e-06, + "loss": 0.1612, + "step": 2533 + }, + { + "epoch": 1.2020872865275143, + "grad_norm": 1.8584097623825073, + "learning_rate": 7.2507664275712e-06, + "loss": 0.1822, + "step": 2534 + }, + { + "epoch": 1.202561669829222, + "grad_norm": 2.0192692279815674, + "learning_rate": 7.243380262520203e-06, + "loss": 0.2231, + "step": 2535 + }, + { + "epoch": 1.2030360531309299, + "grad_norm": 2.2712655067443848, + "learning_rate": 7.23599572467456e-06, + "loss": 0.2046, + "step": 2536 + }, + { + "epoch": 1.2035104364326377, + "grad_norm": 2.379418134689331, + "learning_rate": 7.228612818393292e-06, + "loss": 0.2186, + "step": 2537 + }, + { + "epoch": 1.2039848197343455, + "grad_norm": 1.9519847631454468, + "learning_rate": 7.221231548034451e-06, + "loss": 0.1875, + "step": 2538 + }, + { + "epoch": 1.2044592030360532, + "grad_norm": 1.7326889038085938, + "learning_rate": 7.2138519179551335e-06, + "loss": 0.153, + "step": 2539 + }, + { + "epoch": 1.2049335863377608, + "grad_norm": 1.4056307077407837, + "learning_rate": 7.206473932511455e-06, + "loss": 0.1415, + "step": 2540 + }, + { + "epoch": 1.2054079696394686, + "grad_norm": 1.8842997550964355, + "learning_rate": 7.199097596058573e-06, + "loss": 0.1948, + "step": 2541 + }, + { + "epoch": 1.2058823529411764, + "grad_norm": 1.9680218696594238, + "learning_rate": 7.1917229129506626e-06, + "loss": 0.1902, + "step": 2542 + }, + { + "epoch": 1.2063567362428842, + "grad_norm": 6.604353427886963, + "learning_rate": 7.1843498875409315e-06, + "loss": 0.1912, + "step": 2543 + }, + { + "epoch": 1.206831119544592, + "grad_norm": 1.8714125156402588, + "learning_rate": 7.176978524181595e-06, + "loss": 0.1609, + "step": 2544 + }, + { + "epoch": 1.2073055028462998, + "grad_norm": 1.6578891277313232, + "learning_rate": 7.169608827223902e-06, + "loss": 0.1785, + "step": 2545 + }, + { + "epoch": 1.2077798861480076, + "grad_norm": 1.9563171863555908, + "learning_rate": 7.16224080101811e-06, + "loss": 0.2015, + "step": 2546 + }, + { + "epoch": 1.2082542694497154, + "grad_norm": 1.5768934488296509, + "learning_rate": 7.154874449913492e-06, + "loss": 0.1484, + "step": 2547 + }, + { + "epoch": 1.2087286527514232, + "grad_norm": 1.6119853258132935, + "learning_rate": 7.147509778258334e-06, + "loss": 0.2121, + "step": 2548 + }, + { + "epoch": 1.209203036053131, + "grad_norm": 1.5381999015808105, + "learning_rate": 7.1401467903999285e-06, + "loss": 0.1711, + "step": 2549 + }, + { + "epoch": 1.2096774193548387, + "grad_norm": 1.6348845958709717, + "learning_rate": 7.1327854906845706e-06, + "loss": 0.1632, + "step": 2550 + }, + { + "epoch": 1.2101518026565465, + "grad_norm": 1.7592006921768188, + "learning_rate": 7.125425883457564e-06, + "loss": 0.1786, + "step": 2551 + }, + { + "epoch": 1.2106261859582543, + "grad_norm": 1.6525636911392212, + "learning_rate": 7.118067973063216e-06, + "loss": 0.1704, + "step": 2552 + }, + { + "epoch": 1.2111005692599621, + "grad_norm": 1.930760383605957, + "learning_rate": 7.110711763844826e-06, + "loss": 0.1789, + "step": 2553 + }, + { + "epoch": 1.21157495256167, + "grad_norm": 1.6949514150619507, + "learning_rate": 7.10335726014469e-06, + "loss": 0.2061, + "step": 2554 + }, + { + "epoch": 1.2120493358633775, + "grad_norm": 1.494983434677124, + "learning_rate": 7.096004466304099e-06, + "loss": 0.1317, + "step": 2555 + }, + { + "epoch": 1.2125237191650853, + "grad_norm": 1.5275888442993164, + "learning_rate": 7.088653386663335e-06, + "loss": 0.1481, + "step": 2556 + }, + { + "epoch": 1.212998102466793, + "grad_norm": 2.201962471008301, + "learning_rate": 7.081304025561668e-06, + "loss": 0.2149, + "step": 2557 + }, + { + "epoch": 1.2134724857685009, + "grad_norm": 1.78719961643219, + "learning_rate": 7.073956387337357e-06, + "loss": 0.1635, + "step": 2558 + }, + { + "epoch": 1.2139468690702087, + "grad_norm": 1.9768500328063965, + "learning_rate": 7.066610476327632e-06, + "loss": 0.1727, + "step": 2559 + }, + { + "epoch": 1.2144212523719164, + "grad_norm": 2.0566301345825195, + "learning_rate": 7.059266296868715e-06, + "loss": 0.2103, + "step": 2560 + }, + { + "epoch": 1.2148956356736242, + "grad_norm": 1.8030273914337158, + "learning_rate": 7.051923853295805e-06, + "loss": 0.1743, + "step": 2561 + }, + { + "epoch": 1.215370018975332, + "grad_norm": 1.7541704177856445, + "learning_rate": 7.044583149943076e-06, + "loss": 0.191, + "step": 2562 + }, + { + "epoch": 1.2158444022770398, + "grad_norm": 1.8001971244812012, + "learning_rate": 7.037244191143662e-06, + "loss": 0.1788, + "step": 2563 + }, + { + "epoch": 1.2163187855787476, + "grad_norm": 1.620387315750122, + "learning_rate": 7.029906981229686e-06, + "loss": 0.1421, + "step": 2564 + }, + { + "epoch": 1.2167931688804554, + "grad_norm": 1.8180466890335083, + "learning_rate": 7.022571524532229e-06, + "loss": 0.1778, + "step": 2565 + }, + { + "epoch": 1.2172675521821632, + "grad_norm": 2.036728620529175, + "learning_rate": 7.015237825381338e-06, + "loss": 0.2076, + "step": 2566 + }, + { + "epoch": 1.217741935483871, + "grad_norm": 1.8897072076797485, + "learning_rate": 7.007905888106026e-06, + "loss": 0.1871, + "step": 2567 + }, + { + "epoch": 1.2182163187855788, + "grad_norm": 1.585066556930542, + "learning_rate": 7.000575717034256e-06, + "loss": 0.163, + "step": 2568 + }, + { + "epoch": 1.2186907020872866, + "grad_norm": 1.9996800422668457, + "learning_rate": 6.993247316492962e-06, + "loss": 0.1774, + "step": 2569 + }, + { + "epoch": 1.2191650853889944, + "grad_norm": 2.1412925720214844, + "learning_rate": 6.985920690808021e-06, + "loss": 0.1793, + "step": 2570 + }, + { + "epoch": 1.2196394686907022, + "grad_norm": 1.8670066595077515, + "learning_rate": 6.978595844304272e-06, + "loss": 0.1692, + "step": 2571 + }, + { + "epoch": 1.22011385199241, + "grad_norm": 1.7688610553741455, + "learning_rate": 6.971272781305503e-06, + "loss": 0.175, + "step": 2572 + }, + { + "epoch": 1.2205882352941178, + "grad_norm": 1.6328133344650269, + "learning_rate": 6.963951506134434e-06, + "loss": 0.1546, + "step": 2573 + }, + { + "epoch": 1.2210626185958255, + "grad_norm": 1.8396352529525757, + "learning_rate": 6.956632023112751e-06, + "loss": 0.1753, + "step": 2574 + }, + { + "epoch": 1.2215370018975331, + "grad_norm": 1.9134464263916016, + "learning_rate": 6.949314336561068e-06, + "loss": 0.1688, + "step": 2575 + }, + { + "epoch": 1.222011385199241, + "grad_norm": 1.5007470846176147, + "learning_rate": 6.941998450798946e-06, + "loss": 0.172, + "step": 2576 + }, + { + "epoch": 1.2224857685009487, + "grad_norm": 1.7491124868392944, + "learning_rate": 6.934684370144879e-06, + "loss": 0.2002, + "step": 2577 + }, + { + "epoch": 1.2229601518026565, + "grad_norm": 1.8310550451278687, + "learning_rate": 6.927372098916294e-06, + "loss": 0.1743, + "step": 2578 + }, + { + "epoch": 1.2234345351043643, + "grad_norm": 1.72085702419281, + "learning_rate": 6.9200616414295525e-06, + "loss": 0.1635, + "step": 2579 + }, + { + "epoch": 1.223908918406072, + "grad_norm": 1.723044514656067, + "learning_rate": 6.912753001999948e-06, + "loss": 0.1717, + "step": 2580 + }, + { + "epoch": 1.2243833017077799, + "grad_norm": 1.7026135921478271, + "learning_rate": 6.9054461849417e-06, + "loss": 0.1621, + "step": 2581 + }, + { + "epoch": 1.2248576850094877, + "grad_norm": 1.7875248193740845, + "learning_rate": 6.898141194567944e-06, + "loss": 0.1824, + "step": 2582 + }, + { + "epoch": 1.2253320683111955, + "grad_norm": 1.8562390804290771, + "learning_rate": 6.890838035190747e-06, + "loss": 0.1861, + "step": 2583 + }, + { + "epoch": 1.2258064516129032, + "grad_norm": 1.7400999069213867, + "learning_rate": 6.88353671112109e-06, + "loss": 0.189, + "step": 2584 + }, + { + "epoch": 1.226280834914611, + "grad_norm": 1.8128386735916138, + "learning_rate": 6.876237226668876e-06, + "loss": 0.1825, + "step": 2585 + }, + { + "epoch": 1.2267552182163188, + "grad_norm": 1.9619581699371338, + "learning_rate": 6.868939586142917e-06, + "loss": 0.191, + "step": 2586 + }, + { + "epoch": 1.2272296015180266, + "grad_norm": 1.6164292097091675, + "learning_rate": 6.861643793850934e-06, + "loss": 0.1718, + "step": 2587 + }, + { + "epoch": 1.2277039848197344, + "grad_norm": 1.7258304357528687, + "learning_rate": 6.854349854099565e-06, + "loss": 0.1814, + "step": 2588 + }, + { + "epoch": 1.2281783681214422, + "grad_norm": 2.029466390609741, + "learning_rate": 6.847057771194351e-06, + "loss": 0.1718, + "step": 2589 + }, + { + "epoch": 1.2286527514231498, + "grad_norm": 2.171515464782715, + "learning_rate": 6.839767549439733e-06, + "loss": 0.1829, + "step": 2590 + }, + { + "epoch": 1.2291271347248576, + "grad_norm": 1.7947821617126465, + "learning_rate": 6.832479193139062e-06, + "loss": 0.1668, + "step": 2591 + }, + { + "epoch": 1.2296015180265654, + "grad_norm": 1.5067108869552612, + "learning_rate": 6.8251927065945755e-06, + "loss": 0.1532, + "step": 2592 + }, + { + "epoch": 1.2300759013282732, + "grad_norm": 1.7323521375656128, + "learning_rate": 6.81790809410742e-06, + "loss": 0.1754, + "step": 2593 + }, + { + "epoch": 1.230550284629981, + "grad_norm": 1.8438886404037476, + "learning_rate": 6.8106253599776275e-06, + "loss": 0.173, + "step": 2594 + }, + { + "epoch": 1.2310246679316887, + "grad_norm": 1.911361813545227, + "learning_rate": 6.803344508504124e-06, + "loss": 0.1942, + "step": 2595 + }, + { + "epoch": 1.2314990512333965, + "grad_norm": 2.2166242599487305, + "learning_rate": 6.796065543984729e-06, + "loss": 0.1846, + "step": 2596 + }, + { + "epoch": 1.2319734345351043, + "grad_norm": 1.8407729864120483, + "learning_rate": 6.788788470716136e-06, + "loss": 0.1958, + "step": 2597 + }, + { + "epoch": 1.2324478178368121, + "grad_norm": 1.4400639533996582, + "learning_rate": 6.781513292993933e-06, + "loss": 0.1268, + "step": 2598 + }, + { + "epoch": 1.23292220113852, + "grad_norm": 1.7160131931304932, + "learning_rate": 6.774240015112583e-06, + "loss": 0.1838, + "step": 2599 + }, + { + "epoch": 1.2333965844402277, + "grad_norm": 1.9042142629623413, + "learning_rate": 6.766968641365437e-06, + "loss": 0.1965, + "step": 2600 + }, + { + "epoch": 1.2338709677419355, + "grad_norm": 2.0009496212005615, + "learning_rate": 6.759699176044705e-06, + "loss": 0.1894, + "step": 2601 + }, + { + "epoch": 1.2343453510436433, + "grad_norm": 1.4664678573608398, + "learning_rate": 6.752431623441488e-06, + "loss": 0.1356, + "step": 2602 + }, + { + "epoch": 1.234819734345351, + "grad_norm": 2.177931547164917, + "learning_rate": 6.745165987845744e-06, + "loss": 0.1661, + "step": 2603 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 1.4767887592315674, + "learning_rate": 6.73790227354631e-06, + "loss": 0.161, + "step": 2604 + }, + { + "epoch": 1.2357685009487667, + "grad_norm": 1.9112201929092407, + "learning_rate": 6.73064048483089e-06, + "loss": 0.1683, + "step": 2605 + }, + { + "epoch": 1.2362428842504745, + "grad_norm": 1.6860960721969604, + "learning_rate": 6.7233806259860355e-06, + "loss": 0.183, + "step": 2606 + }, + { + "epoch": 1.2367172675521823, + "grad_norm": 1.788388967514038, + "learning_rate": 6.716122701297173e-06, + "loss": 0.1649, + "step": 2607 + }, + { + "epoch": 1.23719165085389, + "grad_norm": 1.9071511030197144, + "learning_rate": 6.708866715048586e-06, + "loss": 0.1961, + "step": 2608 + }, + { + "epoch": 1.2376660341555978, + "grad_norm": 1.8968346118927002, + "learning_rate": 6.70161267152341e-06, + "loss": 0.1793, + "step": 2609 + }, + { + "epoch": 1.2381404174573054, + "grad_norm": 1.6861655712127686, + "learning_rate": 6.694360575003637e-06, + "loss": 0.145, + "step": 2610 + }, + { + "epoch": 1.2386148007590132, + "grad_norm": 1.9473601579666138, + "learning_rate": 6.687110429770103e-06, + "loss": 0.1736, + "step": 2611 + }, + { + "epoch": 1.239089184060721, + "grad_norm": 1.6562045812606812, + "learning_rate": 6.679862240102499e-06, + "loss": 0.1866, + "step": 2612 + }, + { + "epoch": 1.2395635673624288, + "grad_norm": 1.8242155313491821, + "learning_rate": 6.672616010279362e-06, + "loss": 0.1956, + "step": 2613 + }, + { + "epoch": 1.2400379506641366, + "grad_norm": 1.7770357131958008, + "learning_rate": 6.6653717445780675e-06, + "loss": 0.1963, + "step": 2614 + }, + { + "epoch": 1.2405123339658444, + "grad_norm": 1.7528514862060547, + "learning_rate": 6.658129447274838e-06, + "loss": 0.176, + "step": 2615 + }, + { + "epoch": 1.2409867172675522, + "grad_norm": 1.6112436056137085, + "learning_rate": 6.650889122644723e-06, + "loss": 0.1464, + "step": 2616 + }, + { + "epoch": 1.24146110056926, + "grad_norm": 1.8748856782913208, + "learning_rate": 6.6436507749616195e-06, + "loss": 0.1667, + "step": 2617 + }, + { + "epoch": 1.2419354838709677, + "grad_norm": 1.466968297958374, + "learning_rate": 6.636414408498249e-06, + "loss": 0.1546, + "step": 2618 + }, + { + "epoch": 1.2424098671726755, + "grad_norm": 1.6490691900253296, + "learning_rate": 6.629180027526174e-06, + "loss": 0.1427, + "step": 2619 + }, + { + "epoch": 1.2428842504743833, + "grad_norm": 1.4727083444595337, + "learning_rate": 6.6219476363157685e-06, + "loss": 0.145, + "step": 2620 + }, + { + "epoch": 1.2433586337760911, + "grad_norm": 1.5241156816482544, + "learning_rate": 6.614717239136246e-06, + "loss": 0.1576, + "step": 2621 + }, + { + "epoch": 1.243833017077799, + "grad_norm": 1.842430830001831, + "learning_rate": 6.6074888402556405e-06, + "loss": 0.1815, + "step": 2622 + }, + { + "epoch": 1.2443074003795067, + "grad_norm": 2.016418933868408, + "learning_rate": 6.600262443940799e-06, + "loss": 0.227, + "step": 2623 + }, + { + "epoch": 1.2447817836812145, + "grad_norm": 1.7062424421310425, + "learning_rate": 6.593038054457402e-06, + "loss": 0.1792, + "step": 2624 + }, + { + "epoch": 1.2452561669829223, + "grad_norm": 1.7208788394927979, + "learning_rate": 6.58581567606992e-06, + "loss": 0.1858, + "step": 2625 + }, + { + "epoch": 1.2457305502846299, + "grad_norm": 1.6579339504241943, + "learning_rate": 6.578595313041665e-06, + "loss": 0.1828, + "step": 2626 + }, + { + "epoch": 1.2462049335863377, + "grad_norm": 1.732631802558899, + "learning_rate": 6.571376969634738e-06, + "loss": 0.1919, + "step": 2627 + }, + { + "epoch": 1.2466793168880455, + "grad_norm": 1.7477668523788452, + "learning_rate": 6.564160650110057e-06, + "loss": 0.1894, + "step": 2628 + }, + { + "epoch": 1.2471537001897532, + "grad_norm": 1.7273080348968506, + "learning_rate": 6.556946358727349e-06, + "loss": 0.194, + "step": 2629 + }, + { + "epoch": 1.247628083491461, + "grad_norm": 1.438499927520752, + "learning_rate": 6.5497340997451335e-06, + "loss": 0.1533, + "step": 2630 + }, + { + "epoch": 1.2481024667931688, + "grad_norm": 1.645349383354187, + "learning_rate": 6.5425238774207345e-06, + "loss": 0.1597, + "step": 2631 + }, + { + "epoch": 1.2485768500948766, + "grad_norm": 2.0088460445404053, + "learning_rate": 6.535315696010278e-06, + "loss": 0.2085, + "step": 2632 + }, + { + "epoch": 1.2490512333965844, + "grad_norm": 2.179716110229492, + "learning_rate": 6.528109559768685e-06, + "loss": 0.1696, + "step": 2633 + }, + { + "epoch": 1.2495256166982922, + "grad_norm": 1.4562453031539917, + "learning_rate": 6.520905472949664e-06, + "loss": 0.145, + "step": 2634 + }, + { + "epoch": 1.25, + "grad_norm": 1.8796703815460205, + "learning_rate": 6.51370343980571e-06, + "loss": 0.1897, + "step": 2635 + }, + { + "epoch": 1.2504743833017078, + "grad_norm": 1.4412729740142822, + "learning_rate": 6.506503464588119e-06, + "loss": 0.1843, + "step": 2636 + }, + { + "epoch": 1.2509487666034156, + "grad_norm": 1.632536768913269, + "learning_rate": 6.499305551546964e-06, + "loss": 0.1567, + "step": 2637 + }, + { + "epoch": 1.2514231499051234, + "grad_norm": 1.5959841012954712, + "learning_rate": 6.492109704931101e-06, + "loss": 0.1611, + "step": 2638 + }, + { + "epoch": 1.2518975332068312, + "grad_norm": 1.659533977508545, + "learning_rate": 6.484915928988167e-06, + "loss": 0.1776, + "step": 2639 + }, + { + "epoch": 1.252371916508539, + "grad_norm": 2.8352959156036377, + "learning_rate": 6.4777242279645754e-06, + "loss": 0.1758, + "step": 2640 + }, + { + "epoch": 1.2528462998102468, + "grad_norm": 1.6581546068191528, + "learning_rate": 6.470534606105519e-06, + "loss": 0.1767, + "step": 2641 + }, + { + "epoch": 1.2533206831119545, + "grad_norm": 2.0722496509552, + "learning_rate": 6.463347067654959e-06, + "loss": 0.1892, + "step": 2642 + }, + { + "epoch": 1.2537950664136623, + "grad_norm": 1.7481244802474976, + "learning_rate": 6.456161616855631e-06, + "loss": 0.1726, + "step": 2643 + }, + { + "epoch": 1.2542694497153701, + "grad_norm": 2.0864098072052, + "learning_rate": 6.448978257949028e-06, + "loss": 0.2157, + "step": 2644 + }, + { + "epoch": 1.254743833017078, + "grad_norm": 1.7608370780944824, + "learning_rate": 6.441796995175422e-06, + "loss": 0.1735, + "step": 2645 + }, + { + "epoch": 1.2552182163187855, + "grad_norm": 1.9006822109222412, + "learning_rate": 6.4346178327738405e-06, + "loss": 0.1814, + "step": 2646 + }, + { + "epoch": 1.2556925996204933, + "grad_norm": 1.6309512853622437, + "learning_rate": 6.42744077498207e-06, + "loss": 0.1562, + "step": 2647 + }, + { + "epoch": 1.256166982922201, + "grad_norm": 1.9142560958862305, + "learning_rate": 6.420265826036663e-06, + "loss": 0.1902, + "step": 2648 + }, + { + "epoch": 1.2566413662239089, + "grad_norm": 2.259213447570801, + "learning_rate": 6.41309299017291e-06, + "loss": 0.2249, + "step": 2649 + }, + { + "epoch": 1.2571157495256167, + "grad_norm": 1.8461946249008179, + "learning_rate": 6.405922271624874e-06, + "loss": 0.1775, + "step": 2650 + }, + { + "epoch": 1.2575901328273245, + "grad_norm": 2.4497241973876953, + "learning_rate": 6.398753674625353e-06, + "loss": 0.1998, + "step": 2651 + }, + { + "epoch": 1.2580645161290323, + "grad_norm": 1.6050024032592773, + "learning_rate": 6.391587203405903e-06, + "loss": 0.1655, + "step": 2652 + }, + { + "epoch": 1.25853889943074, + "grad_norm": 1.7544989585876465, + "learning_rate": 6.384422862196824e-06, + "loss": 0.1495, + "step": 2653 + }, + { + "epoch": 1.2590132827324478, + "grad_norm": 1.697287917137146, + "learning_rate": 6.37726065522715e-06, + "loss": 0.1454, + "step": 2654 + }, + { + "epoch": 1.2594876660341556, + "grad_norm": 3.1953728199005127, + "learning_rate": 6.37010058672466e-06, + "loss": 0.2283, + "step": 2655 + }, + { + "epoch": 1.2599620493358634, + "grad_norm": 1.9299657344818115, + "learning_rate": 6.362942660915875e-06, + "loss": 0.2082, + "step": 2656 + }, + { + "epoch": 1.2604364326375712, + "grad_norm": 1.5635534524917603, + "learning_rate": 6.3557868820260495e-06, + "loss": 0.1593, + "step": 2657 + }, + { + "epoch": 1.260910815939279, + "grad_norm": 2.0182600021362305, + "learning_rate": 6.348633254279166e-06, + "loss": 0.1987, + "step": 2658 + }, + { + "epoch": 1.2613851992409868, + "grad_norm": 1.7452116012573242, + "learning_rate": 6.341481781897939e-06, + "loss": 0.205, + "step": 2659 + }, + { + "epoch": 1.2618595825426944, + "grad_norm": 2.041940450668335, + "learning_rate": 6.334332469103814e-06, + "loss": 0.2476, + "step": 2660 + }, + { + "epoch": 1.2623339658444022, + "grad_norm": 1.419898509979248, + "learning_rate": 6.3271853201169594e-06, + "loss": 0.1227, + "step": 2661 + }, + { + "epoch": 1.26280834914611, + "grad_norm": 1.834674596786499, + "learning_rate": 6.320040339156267e-06, + "loss": 0.1672, + "step": 2662 + }, + { + "epoch": 1.2632827324478177, + "grad_norm": 1.685529351234436, + "learning_rate": 6.312897530439348e-06, + "loss": 0.1822, + "step": 2663 + }, + { + "epoch": 1.2637571157495255, + "grad_norm": 1.5215164422988892, + "learning_rate": 6.305756898182529e-06, + "loss": 0.1613, + "step": 2664 + }, + { + "epoch": 1.2642314990512333, + "grad_norm": 16.828428268432617, + "learning_rate": 6.298618446600856e-06, + "loss": 0.233, + "step": 2665 + }, + { + "epoch": 1.2647058823529411, + "grad_norm": 2.161356210708618, + "learning_rate": 6.29148217990809e-06, + "loss": 0.1777, + "step": 2666 + }, + { + "epoch": 1.265180265654649, + "grad_norm": 1.8170483112335205, + "learning_rate": 6.2843481023166975e-06, + "loss": 0.1898, + "step": 2667 + }, + { + "epoch": 1.2656546489563567, + "grad_norm": 1.768537998199463, + "learning_rate": 6.27721621803785e-06, + "loss": 0.1702, + "step": 2668 + }, + { + "epoch": 1.2661290322580645, + "grad_norm": 1.892989993095398, + "learning_rate": 6.270086531281428e-06, + "loss": 0.205, + "step": 2669 + }, + { + "epoch": 1.2666034155597723, + "grad_norm": 1.9306186437606812, + "learning_rate": 6.262959046256021e-06, + "loss": 0.2097, + "step": 2670 + }, + { + "epoch": 1.26707779886148, + "grad_norm": 1.8183523416519165, + "learning_rate": 6.255833767168907e-06, + "loss": 0.1883, + "step": 2671 + }, + { + "epoch": 1.2675521821631879, + "grad_norm": 1.4449431896209717, + "learning_rate": 6.248710698226074e-06, + "loss": 0.1465, + "step": 2672 + }, + { + "epoch": 1.2680265654648957, + "grad_norm": 1.5579372644424438, + "learning_rate": 6.241589843632192e-06, + "loss": 0.1579, + "step": 2673 + }, + { + "epoch": 1.2685009487666035, + "grad_norm": 1.6067235469818115, + "learning_rate": 6.234471207590636e-06, + "loss": 0.1597, + "step": 2674 + }, + { + "epoch": 1.2689753320683113, + "grad_norm": 1.6292030811309814, + "learning_rate": 6.227354794303461e-06, + "loss": 0.2124, + "step": 2675 + }, + { + "epoch": 1.269449715370019, + "grad_norm": 1.8330110311508179, + "learning_rate": 6.220240607971422e-06, + "loss": 0.1691, + "step": 2676 + }, + { + "epoch": 1.2699240986717268, + "grad_norm": 1.4492367506027222, + "learning_rate": 6.213128652793952e-06, + "loss": 0.1557, + "step": 2677 + }, + { + "epoch": 1.2703984819734346, + "grad_norm": 1.5724751949310303, + "learning_rate": 6.206018932969162e-06, + "loss": 0.142, + "step": 2678 + }, + { + "epoch": 1.2708728652751424, + "grad_norm": 1.4839202165603638, + "learning_rate": 6.1989114526938535e-06, + "loss": 0.1237, + "step": 2679 + }, + { + "epoch": 1.2713472485768502, + "grad_norm": 1.7914515733718872, + "learning_rate": 6.1918062161635005e-06, + "loss": 0.185, + "step": 2680 + }, + { + "epoch": 1.271821631878558, + "grad_norm": 1.844212532043457, + "learning_rate": 6.184703227572257e-06, + "loss": 0.1892, + "step": 2681 + }, + { + "epoch": 1.2722960151802656, + "grad_norm": 1.94633948802948, + "learning_rate": 6.1776024911129414e-06, + "loss": 0.1789, + "step": 2682 + }, + { + "epoch": 1.2727703984819734, + "grad_norm": 1.849686622619629, + "learning_rate": 6.170504010977053e-06, + "loss": 0.1831, + "step": 2683 + }, + { + "epoch": 1.2732447817836812, + "grad_norm": 1.323569893836975, + "learning_rate": 6.163407791354751e-06, + "loss": 0.1487, + "step": 2684 + }, + { + "epoch": 1.273719165085389, + "grad_norm": 2.2079055309295654, + "learning_rate": 6.156313836434864e-06, + "loss": 0.1744, + "step": 2685 + }, + { + "epoch": 1.2741935483870968, + "grad_norm": 1.6516516208648682, + "learning_rate": 6.149222150404889e-06, + "loss": 0.1772, + "step": 2686 + }, + { + "epoch": 1.2746679316888045, + "grad_norm": 1.9557101726531982, + "learning_rate": 6.142132737450971e-06, + "loss": 0.2113, + "step": 2687 + }, + { + "epoch": 1.2751423149905123, + "grad_norm": 1.9967963695526123, + "learning_rate": 6.135045601757921e-06, + "loss": 0.1954, + "step": 2688 + }, + { + "epoch": 1.2756166982922201, + "grad_norm": 1.9493542909622192, + "learning_rate": 6.127960747509207e-06, + "loss": 0.1692, + "step": 2689 + }, + { + "epoch": 1.276091081593928, + "grad_norm": 2.279658794403076, + "learning_rate": 6.120878178886951e-06, + "loss": 0.2131, + "step": 2690 + }, + { + "epoch": 1.2765654648956357, + "grad_norm": 1.827585220336914, + "learning_rate": 6.113797900071923e-06, + "loss": 0.1943, + "step": 2691 + }, + { + "epoch": 1.2770398481973435, + "grad_norm": 1.7768232822418213, + "learning_rate": 6.106719915243533e-06, + "loss": 0.1943, + "step": 2692 + }, + { + "epoch": 1.2775142314990513, + "grad_norm": 1.574704885482788, + "learning_rate": 6.099644228579852e-06, + "loss": 0.1617, + "step": 2693 + }, + { + "epoch": 1.277988614800759, + "grad_norm": 2.0819928646087646, + "learning_rate": 6.092570844257589e-06, + "loss": 0.2005, + "step": 2694 + }, + { + "epoch": 1.2784629981024667, + "grad_norm": 2.360511064529419, + "learning_rate": 6.08549976645209e-06, + "loss": 0.2153, + "step": 2695 + }, + { + "epoch": 1.2789373814041745, + "grad_norm": 1.7095462083816528, + "learning_rate": 6.078430999337346e-06, + "loss": 0.1612, + "step": 2696 + }, + { + "epoch": 1.2794117647058822, + "grad_norm": 2.0695948600769043, + "learning_rate": 6.071364547085974e-06, + "loss": 0.1911, + "step": 2697 + }, + { + "epoch": 1.27988614800759, + "grad_norm": 1.8348363637924194, + "learning_rate": 6.064300413869237e-06, + "loss": 0.1883, + "step": 2698 + }, + { + "epoch": 1.2803605313092978, + "grad_norm": 1.8135762214660645, + "learning_rate": 6.057238603857018e-06, + "loss": 0.1442, + "step": 2699 + }, + { + "epoch": 1.2808349146110056, + "grad_norm": 2.002474546432495, + "learning_rate": 6.050179121217839e-06, + "loss": 0.2274, + "step": 2700 + }, + { + "epoch": 1.2813092979127134, + "grad_norm": 2.7125346660614014, + "learning_rate": 6.043121970118837e-06, + "loss": 0.1427, + "step": 2701 + }, + { + "epoch": 1.2817836812144212, + "grad_norm": 1.8385379314422607, + "learning_rate": 6.0360671547257825e-06, + "loss": 0.1609, + "step": 2702 + }, + { + "epoch": 1.282258064516129, + "grad_norm": 1.4639102220535278, + "learning_rate": 6.029014679203059e-06, + "loss": 0.1462, + "step": 2703 + }, + { + "epoch": 1.2827324478178368, + "grad_norm": 1.8368364572525024, + "learning_rate": 6.0219645477136764e-06, + "loss": 0.1943, + "step": 2704 + }, + { + "epoch": 1.2832068311195446, + "grad_norm": 1.7069745063781738, + "learning_rate": 6.014916764419261e-06, + "loss": 0.1726, + "step": 2705 + }, + { + "epoch": 1.2836812144212524, + "grad_norm": 1.7310154438018799, + "learning_rate": 6.007871333480041e-06, + "loss": 0.1615, + "step": 2706 + }, + { + "epoch": 1.2841555977229602, + "grad_norm": 1.5262956619262695, + "learning_rate": 6.000828259054872e-06, + "loss": 0.1396, + "step": 2707 + }, + { + "epoch": 1.284629981024668, + "grad_norm": 1.7818677425384521, + "learning_rate": 5.993787545301204e-06, + "loss": 0.1845, + "step": 2708 + }, + { + "epoch": 1.2851043643263758, + "grad_norm": 2.1896657943725586, + "learning_rate": 5.986749196375108e-06, + "loss": 0.2015, + "step": 2709 + }, + { + "epoch": 1.2855787476280836, + "grad_norm": 2.082186698913574, + "learning_rate": 5.97971321643125e-06, + "loss": 0.2172, + "step": 2710 + }, + { + "epoch": 1.2860531309297913, + "grad_norm": 2.232640504837036, + "learning_rate": 5.972679609622897e-06, + "loss": 0.2158, + "step": 2711 + }, + { + "epoch": 1.2865275142314991, + "grad_norm": 1.818824291229248, + "learning_rate": 5.965648380101916e-06, + "loss": 0.1672, + "step": 2712 + }, + { + "epoch": 1.287001897533207, + "grad_norm": 2.0493948459625244, + "learning_rate": 5.958619532018775e-06, + "loss": 0.1805, + "step": 2713 + }, + { + "epoch": 1.2874762808349147, + "grad_norm": 2.046306610107422, + "learning_rate": 5.951593069522535e-06, + "loss": 0.1763, + "step": 2714 + }, + { + "epoch": 1.2879506641366225, + "grad_norm": 1.797914743423462, + "learning_rate": 5.944568996760847e-06, + "loss": 0.2099, + "step": 2715 + }, + { + "epoch": 1.2884250474383303, + "grad_norm": 1.5372848510742188, + "learning_rate": 5.937547317879946e-06, + "loss": 0.1515, + "step": 2716 + }, + { + "epoch": 1.2888994307400379, + "grad_norm": 1.811116337776184, + "learning_rate": 5.930528037024664e-06, + "loss": 0.151, + "step": 2717 + }, + { + "epoch": 1.2893738140417457, + "grad_norm": 1.7281832695007324, + "learning_rate": 5.923511158338415e-06, + "loss": 0.1729, + "step": 2718 + }, + { + "epoch": 1.2898481973434535, + "grad_norm": 1.5258969068527222, + "learning_rate": 5.916496685963191e-06, + "loss": 0.1412, + "step": 2719 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 1.6176496744155884, + "learning_rate": 5.909484624039563e-06, + "loss": 0.1712, + "step": 2720 + }, + { + "epoch": 1.290796963946869, + "grad_norm": 1.8939071893692017, + "learning_rate": 5.9024749767066835e-06, + "loss": 0.1806, + "step": 2721 + }, + { + "epoch": 1.2912713472485768, + "grad_norm": 1.8788526058197021, + "learning_rate": 5.89546774810228e-06, + "loss": 0.17, + "step": 2722 + }, + { + "epoch": 1.2917457305502846, + "grad_norm": 2.107088327407837, + "learning_rate": 5.888462942362647e-06, + "loss": 0.18, + "step": 2723 + }, + { + "epoch": 1.2922201138519924, + "grad_norm": 1.6402045488357544, + "learning_rate": 5.881460563622659e-06, + "loss": 0.1535, + "step": 2724 + }, + { + "epoch": 1.2926944971537002, + "grad_norm": 1.9502211809158325, + "learning_rate": 5.87446061601574e-06, + "loss": 0.1858, + "step": 2725 + }, + { + "epoch": 1.293168880455408, + "grad_norm": 1.5377591848373413, + "learning_rate": 5.867463103673898e-06, + "loss": 0.1621, + "step": 2726 + }, + { + "epoch": 1.2936432637571158, + "grad_norm": 2.0564045906066895, + "learning_rate": 5.8604680307276906e-06, + "loss": 0.1743, + "step": 2727 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 1.4501993656158447, + "learning_rate": 5.853475401306241e-06, + "loss": 0.1646, + "step": 2728 + }, + { + "epoch": 1.2945920303605314, + "grad_norm": 2.006211042404175, + "learning_rate": 5.846485219537237e-06, + "loss": 0.2069, + "step": 2729 + }, + { + "epoch": 1.295066413662239, + "grad_norm": 2.675589084625244, + "learning_rate": 5.8394974895469015e-06, + "loss": 0.2098, + "step": 2730 + }, + { + "epoch": 1.2955407969639468, + "grad_norm": 1.6392362117767334, + "learning_rate": 5.83251221546003e-06, + "loss": 0.1624, + "step": 2731 + }, + { + "epoch": 1.2960151802656545, + "grad_norm": 1.847262978553772, + "learning_rate": 5.825529401399956e-06, + "loss": 0.2169, + "step": 2732 + }, + { + "epoch": 1.2964895635673623, + "grad_norm": 1.3665305376052856, + "learning_rate": 5.818549051488569e-06, + "loss": 0.1345, + "step": 2733 + }, + { + "epoch": 1.2969639468690701, + "grad_norm": 1.4444960355758667, + "learning_rate": 5.811571169846304e-06, + "loss": 0.1572, + "step": 2734 + }, + { + "epoch": 1.297438330170778, + "grad_norm": 1.631394624710083, + "learning_rate": 5.804595760592127e-06, + "loss": 0.1435, + "step": 2735 + }, + { + "epoch": 1.2979127134724857, + "grad_norm": 2.4109082221984863, + "learning_rate": 5.797622827843561e-06, + "loss": 0.2491, + "step": 2736 + }, + { + "epoch": 1.2983870967741935, + "grad_norm": 1.8289345502853394, + "learning_rate": 5.790652375716653e-06, + "loss": 0.1797, + "step": 2737 + }, + { + "epoch": 1.2988614800759013, + "grad_norm": 1.665688157081604, + "learning_rate": 5.7836844083259954e-06, + "loss": 0.1651, + "step": 2738 + }, + { + "epoch": 1.299335863377609, + "grad_norm": 1.573738932609558, + "learning_rate": 5.776718929784707e-06, + "loss": 0.156, + "step": 2739 + }, + { + "epoch": 1.2998102466793169, + "grad_norm": 1.9333720207214355, + "learning_rate": 5.769755944204443e-06, + "loss": 0.1613, + "step": 2740 + }, + { + "epoch": 1.3002846299810247, + "grad_norm": 1.6634092330932617, + "learning_rate": 5.762795455695385e-06, + "loss": 0.198, + "step": 2741 + }, + { + "epoch": 1.3007590132827325, + "grad_norm": 1.8421812057495117, + "learning_rate": 5.755837468366241e-06, + "loss": 0.1715, + "step": 2742 + }, + { + "epoch": 1.3012333965844403, + "grad_norm": 1.5852497816085815, + "learning_rate": 5.748881986324245e-06, + "loss": 0.1821, + "step": 2743 + }, + { + "epoch": 1.301707779886148, + "grad_norm": 1.8755565881729126, + "learning_rate": 5.741929013675143e-06, + "loss": 0.182, + "step": 2744 + }, + { + "epoch": 1.3021821631878558, + "grad_norm": 1.483273983001709, + "learning_rate": 5.7349785545232115e-06, + "loss": 0.1509, + "step": 2745 + }, + { + "epoch": 1.3026565464895636, + "grad_norm": 1.5257030725479126, + "learning_rate": 5.728030612971231e-06, + "loss": 0.1342, + "step": 2746 + }, + { + "epoch": 1.3031309297912714, + "grad_norm": 1.8776187896728516, + "learning_rate": 5.721085193120507e-06, + "loss": 0.1959, + "step": 2747 + }, + { + "epoch": 1.3036053130929792, + "grad_norm": 1.5106158256530762, + "learning_rate": 5.714142299070856e-06, + "loss": 0.1563, + "step": 2748 + }, + { + "epoch": 1.304079696394687, + "grad_norm": 1.8498852252960205, + "learning_rate": 5.7072019349205925e-06, + "loss": 0.216, + "step": 2749 + }, + { + "epoch": 1.3045540796963948, + "grad_norm": 1.8534907102584839, + "learning_rate": 5.700264104766547e-06, + "loss": 0.1699, + "step": 2750 + }, + { + "epoch": 1.3050284629981026, + "grad_norm": 1.647162675857544, + "learning_rate": 5.6933288127040505e-06, + "loss": 0.1455, + "step": 2751 + }, + { + "epoch": 1.3055028462998102, + "grad_norm": 1.767333745956421, + "learning_rate": 5.686396062826946e-06, + "loss": 0.1483, + "step": 2752 + }, + { + "epoch": 1.305977229601518, + "grad_norm": 1.979466438293457, + "learning_rate": 5.679465859227561e-06, + "loss": 0.1747, + "step": 2753 + }, + { + "epoch": 1.3064516129032258, + "grad_norm": 1.9015872478485107, + "learning_rate": 5.6725382059967205e-06, + "loss": 0.2133, + "step": 2754 + }, + { + "epoch": 1.3069259962049335, + "grad_norm": 1.6040949821472168, + "learning_rate": 5.665613107223755e-06, + "loss": 0.158, + "step": 2755 + }, + { + "epoch": 1.3074003795066413, + "grad_norm": 1.9205068349838257, + "learning_rate": 5.658690566996483e-06, + "loss": 0.1565, + "step": 2756 + }, + { + "epoch": 1.3078747628083491, + "grad_norm": 1.864329218864441, + "learning_rate": 5.651770589401209e-06, + "loss": 0.1957, + "step": 2757 + }, + { + "epoch": 1.308349146110057, + "grad_norm": 2.1258440017700195, + "learning_rate": 5.644853178522734e-06, + "loss": 0.1696, + "step": 2758 + }, + { + "epoch": 1.3088235294117647, + "grad_norm": 1.630826711654663, + "learning_rate": 5.637938338444325e-06, + "loss": 0.1278, + "step": 2759 + }, + { + "epoch": 1.3092979127134725, + "grad_norm": 1.273641586303711, + "learning_rate": 5.631026073247752e-06, + "loss": 0.1296, + "step": 2760 + }, + { + "epoch": 1.3097722960151803, + "grad_norm": 1.7420580387115479, + "learning_rate": 5.624116387013259e-06, + "loss": 0.1666, + "step": 2761 + }, + { + "epoch": 1.310246679316888, + "grad_norm": 1.7752727270126343, + "learning_rate": 5.617209283819562e-06, + "loss": 0.1868, + "step": 2762 + }, + { + "epoch": 1.310721062618596, + "grad_norm": 1.8602519035339355, + "learning_rate": 5.61030476774385e-06, + "loss": 0.1966, + "step": 2763 + }, + { + "epoch": 1.3111954459203037, + "grad_norm": 1.8290077447891235, + "learning_rate": 5.603402842861797e-06, + "loss": 0.1749, + "step": 2764 + }, + { + "epoch": 1.3116698292220113, + "grad_norm": 1.944825291633606, + "learning_rate": 5.5965035132475395e-06, + "loss": 0.2062, + "step": 2765 + }, + { + "epoch": 1.312144212523719, + "grad_norm": 1.7076294422149658, + "learning_rate": 5.589606782973683e-06, + "loss": 0.1738, + "step": 2766 + }, + { + "epoch": 1.3126185958254268, + "grad_norm": 1.9629321098327637, + "learning_rate": 5.5827126561113045e-06, + "loss": 0.1418, + "step": 2767 + }, + { + "epoch": 1.3130929791271346, + "grad_norm": 1.9150692224502563, + "learning_rate": 5.575821136729929e-06, + "loss": 0.1978, + "step": 2768 + }, + { + "epoch": 1.3135673624288424, + "grad_norm": 1.7870982885360718, + "learning_rate": 5.568932228897563e-06, + "loss": 0.1764, + "step": 2769 + }, + { + "epoch": 1.3140417457305502, + "grad_norm": 2.10677170753479, + "learning_rate": 5.562045936680649e-06, + "loss": 0.2014, + "step": 2770 + }, + { + "epoch": 1.314516129032258, + "grad_norm": 2.482201337814331, + "learning_rate": 5.555162264144105e-06, + "loss": 0.2658, + "step": 2771 + }, + { + "epoch": 1.3149905123339658, + "grad_norm": 1.6711809635162354, + "learning_rate": 5.548281215351297e-06, + "loss": 0.1548, + "step": 2772 + }, + { + "epoch": 1.3154648956356736, + "grad_norm": 2.137512683868408, + "learning_rate": 5.54140279436403e-06, + "loss": 0.2021, + "step": 2773 + }, + { + "epoch": 1.3159392789373814, + "grad_norm": 1.648645281791687, + "learning_rate": 5.534527005242575e-06, + "loss": 0.1697, + "step": 2774 + }, + { + "epoch": 1.3164136622390892, + "grad_norm": 1.3213448524475098, + "learning_rate": 5.52765385204564e-06, + "loss": 0.1197, + "step": 2775 + }, + { + "epoch": 1.316888045540797, + "grad_norm": 1.4370523691177368, + "learning_rate": 5.520783338830386e-06, + "loss": 0.1534, + "step": 2776 + }, + { + "epoch": 1.3173624288425048, + "grad_norm": 1.695040225982666, + "learning_rate": 5.5139154696524025e-06, + "loss": 0.1957, + "step": 2777 + }, + { + "epoch": 1.3178368121442126, + "grad_norm": 1.5011173486709595, + "learning_rate": 5.5070502485657216e-06, + "loss": 0.157, + "step": 2778 + }, + { + "epoch": 1.3183111954459203, + "grad_norm": 1.3054955005645752, + "learning_rate": 5.500187679622819e-06, + "loss": 0.1141, + "step": 2779 + }, + { + "epoch": 1.3187855787476281, + "grad_norm": 1.8683662414550781, + "learning_rate": 5.4933277668746036e-06, + "loss": 0.1725, + "step": 2780 + }, + { + "epoch": 1.319259962049336, + "grad_norm": 1.2149678468704224, + "learning_rate": 5.486470514370415e-06, + "loss": 0.1239, + "step": 2781 + }, + { + "epoch": 1.3197343453510437, + "grad_norm": 1.8600114583969116, + "learning_rate": 5.479615926158013e-06, + "loss": 0.1712, + "step": 2782 + }, + { + "epoch": 1.3202087286527515, + "grad_norm": 1.914152979850769, + "learning_rate": 5.4727640062836e-06, + "loss": 0.1836, + "step": 2783 + }, + { + "epoch": 1.3206831119544593, + "grad_norm": 1.7095372676849365, + "learning_rate": 5.465914758791794e-06, + "loss": 0.17, + "step": 2784 + }, + { + "epoch": 1.321157495256167, + "grad_norm": 1.9404311180114746, + "learning_rate": 5.459068187725644e-06, + "loss": 0.1771, + "step": 2785 + }, + { + "epoch": 1.321631878557875, + "grad_norm": 1.4658271074295044, + "learning_rate": 5.452224297126607e-06, + "loss": 0.1525, + "step": 2786 + }, + { + "epoch": 1.3221062618595825, + "grad_norm": 1.9287587404251099, + "learning_rate": 5.445383091034564e-06, + "loss": 0.2087, + "step": 2787 + }, + { + "epoch": 1.3225806451612903, + "grad_norm": 1.8362351655960083, + "learning_rate": 5.438544573487811e-06, + "loss": 0.1848, + "step": 2788 + }, + { + "epoch": 1.323055028462998, + "grad_norm": 1.9103182554244995, + "learning_rate": 5.431708748523058e-06, + "loss": 0.1918, + "step": 2789 + }, + { + "epoch": 1.3235294117647058, + "grad_norm": 1.722016453742981, + "learning_rate": 5.424875620175427e-06, + "loss": 0.1852, + "step": 2790 + }, + { + "epoch": 1.3240037950664136, + "grad_norm": 1.559799313545227, + "learning_rate": 5.4180451924784475e-06, + "loss": 0.1548, + "step": 2791 + }, + { + "epoch": 1.3244781783681214, + "grad_norm": 2.0171709060668945, + "learning_rate": 5.4112174694640475e-06, + "loss": 0.2102, + "step": 2792 + }, + { + "epoch": 1.3249525616698292, + "grad_norm": 1.49323570728302, + "learning_rate": 5.404392455162571e-06, + "loss": 0.1358, + "step": 2793 + }, + { + "epoch": 1.325426944971537, + "grad_norm": 1.6861432790756226, + "learning_rate": 5.397570153602747e-06, + "loss": 0.1531, + "step": 2794 + }, + { + "epoch": 1.3259013282732448, + "grad_norm": 2.1216015815734863, + "learning_rate": 5.39075056881172e-06, + "loss": 0.1996, + "step": 2795 + }, + { + "epoch": 1.3263757115749526, + "grad_norm": 1.6811705827713013, + "learning_rate": 5.383933704815025e-06, + "loss": 0.1685, + "step": 2796 + }, + { + "epoch": 1.3268500948766604, + "grad_norm": 1.721708059310913, + "learning_rate": 5.377119565636584e-06, + "loss": 0.1591, + "step": 2797 + }, + { + "epoch": 1.3273244781783682, + "grad_norm": 1.8749873638153076, + "learning_rate": 5.370308155298716e-06, + "loss": 0.177, + "step": 2798 + }, + { + "epoch": 1.327798861480076, + "grad_norm": 1.5626192092895508, + "learning_rate": 5.363499477822132e-06, + "loss": 0.1364, + "step": 2799 + }, + { + "epoch": 1.3282732447817835, + "grad_norm": 1.7851414680480957, + "learning_rate": 5.35669353722593e-06, + "loss": 0.1724, + "step": 2800 + }, + { + "epoch": 1.3287476280834913, + "grad_norm": 1.9391238689422607, + "learning_rate": 5.3498903375275815e-06, + "loss": 0.161, + "step": 2801 + }, + { + "epoch": 1.3292220113851991, + "grad_norm": 2.0420241355895996, + "learning_rate": 5.3430898827429555e-06, + "loss": 0.1735, + "step": 2802 + }, + { + "epoch": 1.329696394686907, + "grad_norm": 1.4055426120758057, + "learning_rate": 5.336292176886287e-06, + "loss": 0.1211, + "step": 2803 + }, + { + "epoch": 1.3301707779886147, + "grad_norm": 1.658674716949463, + "learning_rate": 5.329497223970195e-06, + "loss": 0.1562, + "step": 2804 + }, + { + "epoch": 1.3306451612903225, + "grad_norm": 2.3525173664093018, + "learning_rate": 5.32270502800568e-06, + "loss": 0.2096, + "step": 2805 + }, + { + "epoch": 1.3311195445920303, + "grad_norm": 1.7249343395233154, + "learning_rate": 5.3159155930021e-06, + "loss": 0.1569, + "step": 2806 + }, + { + "epoch": 1.331593927893738, + "grad_norm": 2.1588501930236816, + "learning_rate": 5.309128922967194e-06, + "loss": 0.2027, + "step": 2807 + }, + { + "epoch": 1.3320683111954459, + "grad_norm": 2.3709404468536377, + "learning_rate": 5.302345021907066e-06, + "loss": 0.2227, + "step": 2808 + }, + { + "epoch": 1.3325426944971537, + "grad_norm": 1.7174904346466064, + "learning_rate": 5.295563893826191e-06, + "loss": 0.1723, + "step": 2809 + }, + { + "epoch": 1.3330170777988615, + "grad_norm": 1.9293659925460815, + "learning_rate": 5.288785542727397e-06, + "loss": 0.1562, + "step": 2810 + }, + { + "epoch": 1.3334914611005693, + "grad_norm": 1.5283536911010742, + "learning_rate": 5.282009972611873e-06, + "loss": 0.1329, + "step": 2811 + }, + { + "epoch": 1.333965844402277, + "grad_norm": 1.8217898607254028, + "learning_rate": 5.275237187479176e-06, + "loss": 0.1772, + "step": 2812 + }, + { + "epoch": 1.3344402277039848, + "grad_norm": 2.461566686630249, + "learning_rate": 5.268467191327214e-06, + "loss": 0.2084, + "step": 2813 + }, + { + "epoch": 1.3349146110056926, + "grad_norm": 1.7789011001586914, + "learning_rate": 5.261699988152249e-06, + "loss": 0.1837, + "step": 2814 + }, + { + "epoch": 1.3353889943074004, + "grad_norm": 1.8147854804992676, + "learning_rate": 5.254935581948897e-06, + "loss": 0.2358, + "step": 2815 + }, + { + "epoch": 1.3358633776091082, + "grad_norm": 1.6862436532974243, + "learning_rate": 5.248173976710111e-06, + "loss": 0.1419, + "step": 2816 + }, + { + "epoch": 1.336337760910816, + "grad_norm": 1.3814226388931274, + "learning_rate": 5.2414151764272116e-06, + "loss": 0.1232, + "step": 2817 + }, + { + "epoch": 1.3368121442125238, + "grad_norm": 1.4221547842025757, + "learning_rate": 5.23465918508984e-06, + "loss": 0.1346, + "step": 2818 + }, + { + "epoch": 1.3372865275142316, + "grad_norm": 2.043062686920166, + "learning_rate": 5.227906006686001e-06, + "loss": 0.1743, + "step": 2819 + }, + { + "epoch": 1.3377609108159394, + "grad_norm": 2.2925965785980225, + "learning_rate": 5.22115564520202e-06, + "loss": 0.1989, + "step": 2820 + }, + { + "epoch": 1.3382352941176472, + "grad_norm": 1.5758484601974487, + "learning_rate": 5.214408104622573e-06, + "loss": 0.154, + "step": 2821 + }, + { + "epoch": 1.3387096774193548, + "grad_norm": 1.688869833946228, + "learning_rate": 5.207663388930666e-06, + "loss": 0.158, + "step": 2822 + }, + { + "epoch": 1.3391840607210626, + "grad_norm": 1.5997257232666016, + "learning_rate": 5.200921502107638e-06, + "loss": 0.1363, + "step": 2823 + }, + { + "epoch": 1.3396584440227703, + "grad_norm": 2.4265429973602295, + "learning_rate": 5.194182448133163e-06, + "loss": 0.1848, + "step": 2824 + }, + { + "epoch": 1.3401328273244781, + "grad_norm": 1.8538546562194824, + "learning_rate": 5.187446230985229e-06, + "loss": 0.1652, + "step": 2825 + }, + { + "epoch": 1.340607210626186, + "grad_norm": 1.5969922542572021, + "learning_rate": 5.180712854640168e-06, + "loss": 0.1649, + "step": 2826 + }, + { + "epoch": 1.3410815939278937, + "grad_norm": 1.623705267906189, + "learning_rate": 5.173982323072615e-06, + "loss": 0.1676, + "step": 2827 + }, + { + "epoch": 1.3415559772296015, + "grad_norm": 1.4985768795013428, + "learning_rate": 5.167254640255542e-06, + "loss": 0.1731, + "step": 2828 + }, + { + "epoch": 1.3420303605313093, + "grad_norm": 1.6000432968139648, + "learning_rate": 5.160529810160235e-06, + "loss": 0.15, + "step": 2829 + }, + { + "epoch": 1.342504743833017, + "grad_norm": 1.7188128232955933, + "learning_rate": 5.153807836756288e-06, + "loss": 0.151, + "step": 2830 + }, + { + "epoch": 1.342979127134725, + "grad_norm": 2.0611462593078613, + "learning_rate": 5.147088724011622e-06, + "loss": 0.2002, + "step": 2831 + }, + { + "epoch": 1.3434535104364327, + "grad_norm": 1.8624550104141235, + "learning_rate": 5.14037247589246e-06, + "loss": 0.1607, + "step": 2832 + }, + { + "epoch": 1.3439278937381405, + "grad_norm": 1.335152506828308, + "learning_rate": 5.133659096363341e-06, + "loss": 0.118, + "step": 2833 + }, + { + "epoch": 1.3444022770398483, + "grad_norm": 1.6288925409317017, + "learning_rate": 5.126948589387104e-06, + "loss": 0.184, + "step": 2834 + }, + { + "epoch": 1.3448766603415558, + "grad_norm": 2.3706490993499756, + "learning_rate": 5.120240958924888e-06, + "loss": 0.2334, + "step": 2835 + }, + { + "epoch": 1.3453510436432636, + "grad_norm": 1.5236918926239014, + "learning_rate": 5.113536208936147e-06, + "loss": 0.1429, + "step": 2836 + }, + { + "epoch": 1.3458254269449714, + "grad_norm": 1.7775214910507202, + "learning_rate": 5.106834343378629e-06, + "loss": 0.1757, + "step": 2837 + }, + { + "epoch": 1.3462998102466792, + "grad_norm": 1.7919654846191406, + "learning_rate": 5.100135366208383e-06, + "loss": 0.1604, + "step": 2838 + }, + { + "epoch": 1.346774193548387, + "grad_norm": 2.141319990158081, + "learning_rate": 5.093439281379738e-06, + "loss": 0.2079, + "step": 2839 + }, + { + "epoch": 1.3472485768500948, + "grad_norm": 1.5796895027160645, + "learning_rate": 5.086746092845334e-06, + "loss": 0.1522, + "step": 2840 + }, + { + "epoch": 1.3477229601518026, + "grad_norm": 1.6436305046081543, + "learning_rate": 5.080055804556097e-06, + "loss": 0.15, + "step": 2841 + }, + { + "epoch": 1.3481973434535104, + "grad_norm": 2.3595945835113525, + "learning_rate": 5.073368420461229e-06, + "loss": 0.2145, + "step": 2842 + }, + { + "epoch": 1.3486717267552182, + "grad_norm": 2.0658891201019287, + "learning_rate": 5.066683944508235e-06, + "loss": 0.1851, + "step": 2843 + }, + { + "epoch": 1.349146110056926, + "grad_norm": 1.6966054439544678, + "learning_rate": 5.060002380642887e-06, + "loss": 0.1432, + "step": 2844 + }, + { + "epoch": 1.3496204933586338, + "grad_norm": 1.6572761535644531, + "learning_rate": 5.053323732809252e-06, + "loss": 0.1847, + "step": 2845 + }, + { + "epoch": 1.3500948766603416, + "grad_norm": 1.654388427734375, + "learning_rate": 5.046648004949667e-06, + "loss": 0.1683, + "step": 2846 + }, + { + "epoch": 1.3505692599620494, + "grad_norm": 1.6657936573028564, + "learning_rate": 5.0399752010047495e-06, + "loss": 0.2028, + "step": 2847 + }, + { + "epoch": 1.3510436432637571, + "grad_norm": 1.6080611944198608, + "learning_rate": 5.033305324913392e-06, + "loss": 0.1479, + "step": 2848 + }, + { + "epoch": 1.351518026565465, + "grad_norm": 1.4048763513565063, + "learning_rate": 5.0266383806127514e-06, + "loss": 0.1322, + "step": 2849 + }, + { + "epoch": 1.3519924098671727, + "grad_norm": 1.806875467300415, + "learning_rate": 5.019974372038265e-06, + "loss": 0.2062, + "step": 2850 + }, + { + "epoch": 1.3524667931688805, + "grad_norm": 1.8327056169509888, + "learning_rate": 5.0133133031236215e-06, + "loss": 0.1663, + "step": 2851 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 1.5724437236785889, + "learning_rate": 5.006655177800792e-06, + "loss": 0.1578, + "step": 2852 + }, + { + "epoch": 1.353415559772296, + "grad_norm": 1.6653856039047241, + "learning_rate": 5.000000000000003e-06, + "loss": 0.1481, + "step": 2853 + }, + { + "epoch": 1.353889943074004, + "grad_norm": 2.099308729171753, + "learning_rate": 4.993347773649732e-06, + "loss": 0.2375, + "step": 2854 + }, + { + "epoch": 1.3543643263757117, + "grad_norm": 1.8074994087219238, + "learning_rate": 4.9866985026767276e-06, + "loss": 0.1593, + "step": 2855 + }, + { + "epoch": 1.3548387096774195, + "grad_norm": 1.7318613529205322, + "learning_rate": 4.980052191005989e-06, + "loss": 0.1456, + "step": 2856 + }, + { + "epoch": 1.355313092979127, + "grad_norm": 1.5684703588485718, + "learning_rate": 4.973408842560772e-06, + "loss": 0.1847, + "step": 2857 + }, + { + "epoch": 1.3557874762808348, + "grad_norm": 2.352754831314087, + "learning_rate": 4.966768461262573e-06, + "loss": 0.1362, + "step": 2858 + }, + { + "epoch": 1.3562618595825426, + "grad_norm": 1.6529217958450317, + "learning_rate": 4.960131051031143e-06, + "loss": 0.1486, + "step": 2859 + }, + { + "epoch": 1.3567362428842504, + "grad_norm": 1.7900093793869019, + "learning_rate": 4.953496615784482e-06, + "loss": 0.1411, + "step": 2860 + }, + { + "epoch": 1.3572106261859582, + "grad_norm": 1.4834645986557007, + "learning_rate": 4.94686515943883e-06, + "loss": 0.143, + "step": 2861 + }, + { + "epoch": 1.357685009487666, + "grad_norm": 1.7863253355026245, + "learning_rate": 4.940236685908677e-06, + "loss": 0.1657, + "step": 2862 + }, + { + "epoch": 1.3581593927893738, + "grad_norm": 1.8910566568374634, + "learning_rate": 4.933611199106736e-06, + "loss": 0.1838, + "step": 2863 + }, + { + "epoch": 1.3586337760910816, + "grad_norm": 1.7281635999679565, + "learning_rate": 4.9269887029439686e-06, + "loss": 0.1884, + "step": 2864 + }, + { + "epoch": 1.3591081593927894, + "grad_norm": 1.703221321105957, + "learning_rate": 4.920369201329575e-06, + "loss": 0.1579, + "step": 2865 + }, + { + "epoch": 1.3595825426944972, + "grad_norm": 1.3635776042938232, + "learning_rate": 4.913752698170972e-06, + "loss": 0.137, + "step": 2866 + }, + { + "epoch": 1.360056925996205, + "grad_norm": 1.9177613258361816, + "learning_rate": 4.907139197373827e-06, + "loss": 0.1918, + "step": 2867 + }, + { + "epoch": 1.3605313092979128, + "grad_norm": 1.5520260334014893, + "learning_rate": 4.900528702842011e-06, + "loss": 0.1742, + "step": 2868 + }, + { + "epoch": 1.3610056925996206, + "grad_norm": 1.8986750841140747, + "learning_rate": 4.893921218477642e-06, + "loss": 0.1707, + "step": 2869 + }, + { + "epoch": 1.3614800759013281, + "grad_norm": 1.954786777496338, + "learning_rate": 4.8873167481810516e-06, + "loss": 0.1707, + "step": 2870 + }, + { + "epoch": 1.361954459203036, + "grad_norm": 1.7505497932434082, + "learning_rate": 4.880715295850791e-06, + "loss": 0.1829, + "step": 2871 + }, + { + "epoch": 1.3624288425047437, + "grad_norm": 1.406683087348938, + "learning_rate": 4.874116865383638e-06, + "loss": 0.1356, + "step": 2872 + }, + { + "epoch": 1.3629032258064515, + "grad_norm": 1.3671265840530396, + "learning_rate": 4.867521460674573e-06, + "loss": 0.1402, + "step": 2873 + }, + { + "epoch": 1.3633776091081593, + "grad_norm": 1.9116820096969604, + "learning_rate": 4.860929085616804e-06, + "loss": 0.1588, + "step": 2874 + }, + { + "epoch": 1.363851992409867, + "grad_norm": 1.5119657516479492, + "learning_rate": 4.85433974410174e-06, + "loss": 0.1318, + "step": 2875 + }, + { + "epoch": 1.364326375711575, + "grad_norm": 1.674631953239441, + "learning_rate": 4.8477534400190075e-06, + "loss": 0.1778, + "step": 2876 + }, + { + "epoch": 1.3648007590132827, + "grad_norm": 1.8323768377304077, + "learning_rate": 4.841170177256439e-06, + "loss": 0.2063, + "step": 2877 + }, + { + "epoch": 1.3652751423149905, + "grad_norm": 2.152963638305664, + "learning_rate": 4.834589959700061e-06, + "loss": 0.2062, + "step": 2878 + }, + { + "epoch": 1.3657495256166983, + "grad_norm": 1.483198881149292, + "learning_rate": 4.828012791234117e-06, + "loss": 0.1412, + "step": 2879 + }, + { + "epoch": 1.366223908918406, + "grad_norm": 1.3623414039611816, + "learning_rate": 4.821438675741044e-06, + "loss": 0.1237, + "step": 2880 + }, + { + "epoch": 1.3666982922201139, + "grad_norm": 1.7617155313491821, + "learning_rate": 4.814867617101479e-06, + "loss": 0.1496, + "step": 2881 + }, + { + "epoch": 1.3671726755218216, + "grad_norm": 1.808595061302185, + "learning_rate": 4.808299619194251e-06, + "loss": 0.1625, + "step": 2882 + }, + { + "epoch": 1.3676470588235294, + "grad_norm": 1.750730276107788, + "learning_rate": 4.80173468589638e-06, + "loss": 0.1531, + "step": 2883 + }, + { + "epoch": 1.3681214421252372, + "grad_norm": 1.9921399354934692, + "learning_rate": 4.795172821083084e-06, + "loss": 0.2081, + "step": 2884 + }, + { + "epoch": 1.368595825426945, + "grad_norm": 1.9898700714111328, + "learning_rate": 4.788614028627769e-06, + "loss": 0.1826, + "step": 2885 + }, + { + "epoch": 1.3690702087286528, + "grad_norm": 1.715350866317749, + "learning_rate": 4.782058312402027e-06, + "loss": 0.1736, + "step": 2886 + }, + { + "epoch": 1.3695445920303606, + "grad_norm": 1.990676760673523, + "learning_rate": 4.7755056762756255e-06, + "loss": 0.1491, + "step": 2887 + }, + { + "epoch": 1.3700189753320684, + "grad_norm": 1.4953407049179077, + "learning_rate": 4.768956124116526e-06, + "loss": 0.1616, + "step": 2888 + }, + { + "epoch": 1.3704933586337762, + "grad_norm": 1.76144278049469, + "learning_rate": 4.762409659790866e-06, + "loss": 0.1715, + "step": 2889 + }, + { + "epoch": 1.370967741935484, + "grad_norm": 1.379292368888855, + "learning_rate": 4.755866287162952e-06, + "loss": 0.1385, + "step": 2890 + }, + { + "epoch": 1.3714421252371918, + "grad_norm": 2.018620014190674, + "learning_rate": 4.74932601009528e-06, + "loss": 0.1926, + "step": 2891 + }, + { + "epoch": 1.3719165085388993, + "grad_norm": 2.3270931243896484, + "learning_rate": 4.742788832448501e-06, + "loss": 0.2674, + "step": 2892 + }, + { + "epoch": 1.3723908918406071, + "grad_norm": 1.6892091035842896, + "learning_rate": 4.736254758081454e-06, + "loss": 0.191, + "step": 2893 + }, + { + "epoch": 1.372865275142315, + "grad_norm": 1.5555603504180908, + "learning_rate": 4.729723790851135e-06, + "loss": 0.1301, + "step": 2894 + }, + { + "epoch": 1.3733396584440227, + "grad_norm": 1.7797441482543945, + "learning_rate": 4.723195934612711e-06, + "loss": 0.1627, + "step": 2895 + }, + { + "epoch": 1.3738140417457305, + "grad_norm": 2.268540859222412, + "learning_rate": 4.7166711932195155e-06, + "loss": 0.1985, + "step": 2896 + }, + { + "epoch": 1.3742884250474383, + "grad_norm": 1.6203911304473877, + "learning_rate": 4.7101495705230285e-06, + "loss": 0.1565, + "step": 2897 + }, + { + "epoch": 1.374762808349146, + "grad_norm": 1.598213791847229, + "learning_rate": 4.703631070372909e-06, + "loss": 0.179, + "step": 2898 + }, + { + "epoch": 1.375237191650854, + "grad_norm": 1.5291484594345093, + "learning_rate": 4.697115696616955e-06, + "loss": 0.1816, + "step": 2899 + }, + { + "epoch": 1.3757115749525617, + "grad_norm": 1.719058871269226, + "learning_rate": 4.690603453101134e-06, + "loss": 0.1626, + "step": 2900 + }, + { + "epoch": 1.3761859582542695, + "grad_norm": 1.810861587524414, + "learning_rate": 4.684094343669554e-06, + "loss": 0.1542, + "step": 2901 + }, + { + "epoch": 1.3766603415559773, + "grad_norm": 1.7553555965423584, + "learning_rate": 4.677588372164479e-06, + "loss": 0.1787, + "step": 2902 + }, + { + "epoch": 1.377134724857685, + "grad_norm": 1.9552857875823975, + "learning_rate": 4.6710855424263205e-06, + "loss": 0.1996, + "step": 2903 + }, + { + "epoch": 1.3776091081593929, + "grad_norm": 1.9977998733520508, + "learning_rate": 4.6645858582936345e-06, + "loss": 0.1963, + "step": 2904 + }, + { + "epoch": 1.3780834914611007, + "grad_norm": 1.5562477111816406, + "learning_rate": 4.658089323603123e-06, + "loss": 0.1483, + "step": 2905 + }, + { + "epoch": 1.3785578747628082, + "grad_norm": 1.7609916925430298, + "learning_rate": 4.651595942189624e-06, + "loss": 0.2095, + "step": 2906 + }, + { + "epoch": 1.379032258064516, + "grad_norm": 1.431341528892517, + "learning_rate": 4.645105717886112e-06, + "loss": 0.1309, + "step": 2907 + }, + { + "epoch": 1.3795066413662238, + "grad_norm": 1.580350399017334, + "learning_rate": 4.638618654523705e-06, + "loss": 0.1537, + "step": 2908 + }, + { + "epoch": 1.3799810246679316, + "grad_norm": 1.886465072631836, + "learning_rate": 4.632134755931653e-06, + "loss": 0.1535, + "step": 2909 + }, + { + "epoch": 1.3804554079696394, + "grad_norm": 1.5633918046951294, + "learning_rate": 4.625654025937342e-06, + "loss": 0.1684, + "step": 2910 + }, + { + "epoch": 1.3809297912713472, + "grad_norm": 1.7328919172286987, + "learning_rate": 4.619176468366274e-06, + "loss": 0.1585, + "step": 2911 + }, + { + "epoch": 1.381404174573055, + "grad_norm": 1.3697620630264282, + "learning_rate": 4.612702087042091e-06, + "loss": 0.1405, + "step": 2912 + }, + { + "epoch": 1.3818785578747628, + "grad_norm": 2.1742119789123535, + "learning_rate": 4.606230885786557e-06, + "loss": 0.1519, + "step": 2913 + }, + { + "epoch": 1.3823529411764706, + "grad_norm": 1.6288695335388184, + "learning_rate": 4.599762868419561e-06, + "loss": 0.1662, + "step": 2914 + }, + { + "epoch": 1.3828273244781784, + "grad_norm": 1.7532272338867188, + "learning_rate": 4.5932980387591054e-06, + "loss": 0.1819, + "step": 2915 + }, + { + "epoch": 1.3833017077798861, + "grad_norm": 1.4690742492675781, + "learning_rate": 4.586836400621313e-06, + "loss": 0.1624, + "step": 2916 + }, + { + "epoch": 1.383776091081594, + "grad_norm": 1.6084843873977661, + "learning_rate": 4.580377957820427e-06, + "loss": 0.1653, + "step": 2917 + }, + { + "epoch": 1.3842504743833017, + "grad_norm": 1.5763942003250122, + "learning_rate": 4.573922714168804e-06, + "loss": 0.1519, + "step": 2918 + }, + { + "epoch": 1.3847248576850095, + "grad_norm": 2.008824586868286, + "learning_rate": 4.567470673476912e-06, + "loss": 0.1915, + "step": 2919 + }, + { + "epoch": 1.3851992409867173, + "grad_norm": 1.5423541069030762, + "learning_rate": 4.561021839553323e-06, + "loss": 0.1571, + "step": 2920 + }, + { + "epoch": 1.385673624288425, + "grad_norm": 1.8469271659851074, + "learning_rate": 4.554576216204718e-06, + "loss": 0.1971, + "step": 2921 + }, + { + "epoch": 1.386148007590133, + "grad_norm": 2.0073492527008057, + "learning_rate": 4.548133807235893e-06, + "loss": 0.1826, + "step": 2922 + }, + { + "epoch": 1.3866223908918407, + "grad_norm": 1.9065463542938232, + "learning_rate": 4.541694616449729e-06, + "loss": 0.1774, + "step": 2923 + }, + { + "epoch": 1.3870967741935485, + "grad_norm": 1.4257770776748657, + "learning_rate": 4.535258647647225e-06, + "loss": 0.1407, + "step": 2924 + }, + { + "epoch": 1.3875711574952563, + "grad_norm": 1.6374725103378296, + "learning_rate": 4.5288259046274605e-06, + "loss": 0.1694, + "step": 2925 + }, + { + "epoch": 1.388045540796964, + "grad_norm": 1.9593760967254639, + "learning_rate": 4.5223963911876265e-06, + "loss": 0.1788, + "step": 2926 + }, + { + "epoch": 1.3885199240986719, + "grad_norm": 1.881184697151184, + "learning_rate": 4.5159701111229995e-06, + "loss": 0.1875, + "step": 2927 + }, + { + "epoch": 1.3889943074003794, + "grad_norm": 1.786327838897705, + "learning_rate": 4.509547068226947e-06, + "loss": 0.171, + "step": 2928 + }, + { + "epoch": 1.3894686907020872, + "grad_norm": 1.864014983177185, + "learning_rate": 4.503127266290935e-06, + "loss": 0.1636, + "step": 2929 + }, + { + "epoch": 1.389943074003795, + "grad_norm": 1.5407509803771973, + "learning_rate": 4.496710709104504e-06, + "loss": 0.1757, + "step": 2930 + }, + { + "epoch": 1.3904174573055028, + "grad_norm": 1.7357945442199707, + "learning_rate": 4.49029740045528e-06, + "loss": 0.1297, + "step": 2931 + }, + { + "epoch": 1.3908918406072106, + "grad_norm": 1.6357675790786743, + "learning_rate": 4.48388734412898e-06, + "loss": 0.1654, + "step": 2932 + }, + { + "epoch": 1.3913662239089184, + "grad_norm": 1.856079339981079, + "learning_rate": 4.477480543909396e-06, + "loss": 0.1842, + "step": 2933 + }, + { + "epoch": 1.3918406072106262, + "grad_norm": 2.02909779548645, + "learning_rate": 4.471077003578403e-06, + "loss": 0.1907, + "step": 2934 + }, + { + "epoch": 1.392314990512334, + "grad_norm": 1.560585379600525, + "learning_rate": 4.464676726915939e-06, + "loss": 0.1355, + "step": 2935 + }, + { + "epoch": 1.3927893738140418, + "grad_norm": 1.7451188564300537, + "learning_rate": 4.458279717700031e-06, + "loss": 0.172, + "step": 2936 + }, + { + "epoch": 1.3932637571157496, + "grad_norm": 1.3806904554367065, + "learning_rate": 4.451885979706767e-06, + "loss": 0.1345, + "step": 2937 + }, + { + "epoch": 1.3937381404174574, + "grad_norm": 2.2464559078216553, + "learning_rate": 4.445495516710312e-06, + "loss": 0.1972, + "step": 2938 + }, + { + "epoch": 1.3942125237191652, + "grad_norm": 1.8251832723617554, + "learning_rate": 4.439108332482889e-06, + "loss": 0.1987, + "step": 2939 + }, + { + "epoch": 1.394686907020873, + "grad_norm": 1.6302183866500854, + "learning_rate": 4.432724430794786e-06, + "loss": 0.1673, + "step": 2940 + }, + { + "epoch": 1.3951612903225805, + "grad_norm": 1.733634114265442, + "learning_rate": 4.426343815414361e-06, + "loss": 0.1736, + "step": 2941 + }, + { + "epoch": 1.3956356736242883, + "grad_norm": 1.841150164604187, + "learning_rate": 4.419966490108028e-06, + "loss": 0.1898, + "step": 2942 + }, + { + "epoch": 1.396110056925996, + "grad_norm": 1.559160828590393, + "learning_rate": 4.413592458640264e-06, + "loss": 0.1301, + "step": 2943 + }, + { + "epoch": 1.396584440227704, + "grad_norm": 1.6111403703689575, + "learning_rate": 4.407221724773587e-06, + "loss": 0.1824, + "step": 2944 + }, + { + "epoch": 1.3970588235294117, + "grad_norm": 1.6043332815170288, + "learning_rate": 4.4008542922685834e-06, + "loss": 0.1555, + "step": 2945 + }, + { + "epoch": 1.3975332068311195, + "grad_norm": 1.6422481536865234, + "learning_rate": 4.3944901648838885e-06, + "loss": 0.155, + "step": 2946 + }, + { + "epoch": 1.3980075901328273, + "grad_norm": 1.5227971076965332, + "learning_rate": 4.388129346376177e-06, + "loss": 0.1495, + "step": 2947 + }, + { + "epoch": 1.398481973434535, + "grad_norm": 1.7671695947647095, + "learning_rate": 4.3817718405001844e-06, + "loss": 0.1753, + "step": 2948 + }, + { + "epoch": 1.3989563567362429, + "grad_norm": 1.6482768058776855, + "learning_rate": 4.3754176510086756e-06, + "loss": 0.156, + "step": 2949 + }, + { + "epoch": 1.3994307400379506, + "grad_norm": 1.7106208801269531, + "learning_rate": 4.369066781652469e-06, + "loss": 0.1681, + "step": 2950 + }, + { + "epoch": 1.3999051233396584, + "grad_norm": 1.4155083894729614, + "learning_rate": 4.362719236180422e-06, + "loss": 0.1351, + "step": 2951 + }, + { + "epoch": 1.4003795066413662, + "grad_norm": 1.725624918937683, + "learning_rate": 4.356375018339426e-06, + "loss": 0.1601, + "step": 2952 + }, + { + "epoch": 1.400853889943074, + "grad_norm": 1.8480092287063599, + "learning_rate": 4.350034131874414e-06, + "loss": 0.1942, + "step": 2953 + }, + { + "epoch": 1.4013282732447818, + "grad_norm": 1.3656977415084839, + "learning_rate": 4.343696580528343e-06, + "loss": 0.1348, + "step": 2954 + }, + { + "epoch": 1.4018026565464896, + "grad_norm": 1.9449461698532104, + "learning_rate": 4.33736236804221e-06, + "loss": 0.1573, + "step": 2955 + }, + { + "epoch": 1.4022770398481974, + "grad_norm": 1.8900002241134644, + "learning_rate": 4.331031498155035e-06, + "loss": 0.1748, + "step": 2956 + }, + { + "epoch": 1.4027514231499052, + "grad_norm": 1.8122249841690063, + "learning_rate": 4.324703974603873e-06, + "loss": 0.1863, + "step": 2957 + }, + { + "epoch": 1.403225806451613, + "grad_norm": 1.5694611072540283, + "learning_rate": 4.318379801123792e-06, + "loss": 0.1249, + "step": 2958 + }, + { + "epoch": 1.4037001897533208, + "grad_norm": 1.4751542806625366, + "learning_rate": 4.312058981447893e-06, + "loss": 0.1364, + "step": 2959 + }, + { + "epoch": 1.4041745730550286, + "grad_norm": 2.7020621299743652, + "learning_rate": 4.305741519307291e-06, + "loss": 0.1851, + "step": 2960 + }, + { + "epoch": 1.4046489563567364, + "grad_norm": 1.3586440086364746, + "learning_rate": 4.2994274184311245e-06, + "loss": 0.1246, + "step": 2961 + }, + { + "epoch": 1.4051233396584442, + "grad_norm": 1.8559433221817017, + "learning_rate": 4.293116682546546e-06, + "loss": 0.154, + "step": 2962 + }, + { + "epoch": 1.4055977229601517, + "grad_norm": 1.997769832611084, + "learning_rate": 4.28680931537872e-06, + "loss": 0.1688, + "step": 2963 + }, + { + "epoch": 1.4060721062618595, + "grad_norm": 1.8411399126052856, + "learning_rate": 4.280505320650814e-06, + "loss": 0.1979, + "step": 2964 + }, + { + "epoch": 1.4065464895635673, + "grad_norm": 1.8652249574661255, + "learning_rate": 4.2742047020840214e-06, + "loss": 0.1975, + "step": 2965 + }, + { + "epoch": 1.407020872865275, + "grad_norm": 1.6203731298446655, + "learning_rate": 4.2679074633975345e-06, + "loss": 0.149, + "step": 2966 + }, + { + "epoch": 1.407495256166983, + "grad_norm": 1.7112479209899902, + "learning_rate": 4.261613608308553e-06, + "loss": 0.1555, + "step": 2967 + }, + { + "epoch": 1.4079696394686907, + "grad_norm": 1.9960196018218994, + "learning_rate": 4.255323140532272e-06, + "loss": 0.1732, + "step": 2968 + }, + { + "epoch": 1.4084440227703985, + "grad_norm": 1.8311996459960938, + "learning_rate": 4.2490360637818965e-06, + "loss": 0.158, + "step": 2969 + }, + { + "epoch": 1.4089184060721063, + "grad_norm": 1.7324796915054321, + "learning_rate": 4.242752381768626e-06, + "loss": 0.1525, + "step": 2970 + }, + { + "epoch": 1.409392789373814, + "grad_norm": 1.6953574419021606, + "learning_rate": 4.236472098201651e-06, + "loss": 0.1494, + "step": 2971 + }, + { + "epoch": 1.4098671726755219, + "grad_norm": 1.5616384744644165, + "learning_rate": 4.230195216788168e-06, + "loss": 0.1523, + "step": 2972 + }, + { + "epoch": 1.4103415559772297, + "grad_norm": 1.4512946605682373, + "learning_rate": 4.223921741233349e-06, + "loss": 0.1483, + "step": 2973 + }, + { + "epoch": 1.4108159392789374, + "grad_norm": 1.525667667388916, + "learning_rate": 4.217651675240371e-06, + "loss": 0.1236, + "step": 2974 + }, + { + "epoch": 1.4112903225806452, + "grad_norm": 1.9730256795883179, + "learning_rate": 4.211385022510389e-06, + "loss": 0.175, + "step": 2975 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 1.4136416912078857, + "learning_rate": 4.205121786742552e-06, + "loss": 0.1287, + "step": 2976 + }, + { + "epoch": 1.4122390891840606, + "grad_norm": 1.5880212783813477, + "learning_rate": 4.198861971633977e-06, + "loss": 0.1426, + "step": 2977 + }, + { + "epoch": 1.4127134724857684, + "grad_norm": 1.5319970846176147, + "learning_rate": 4.1926055808797765e-06, + "loss": 0.1749, + "step": 2978 + }, + { + "epoch": 1.4131878557874762, + "grad_norm": 1.677323579788208, + "learning_rate": 4.186352618173037e-06, + "loss": 0.1692, + "step": 2979 + }, + { + "epoch": 1.413662239089184, + "grad_norm": 1.9051989316940308, + "learning_rate": 4.180103087204817e-06, + "loss": 0.1202, + "step": 2980 + }, + { + "epoch": 1.4141366223908918, + "grad_norm": 1.7331345081329346, + "learning_rate": 4.1738569916641555e-06, + "loss": 0.1551, + "step": 2981 + }, + { + "epoch": 1.4146110056925996, + "grad_norm": 1.4977072477340698, + "learning_rate": 4.167614335238058e-06, + "loss": 0.1707, + "step": 2982 + }, + { + "epoch": 1.4150853889943074, + "grad_norm": 2.1823627948760986, + "learning_rate": 4.161375121611504e-06, + "loss": 0.1984, + "step": 2983 + }, + { + "epoch": 1.4155597722960152, + "grad_norm": 1.9357411861419678, + "learning_rate": 4.155139354467439e-06, + "loss": 0.1967, + "step": 2984 + }, + { + "epoch": 1.416034155597723, + "grad_norm": 1.4335790872573853, + "learning_rate": 4.1489070374867765e-06, + "loss": 0.1393, + "step": 2985 + }, + { + "epoch": 1.4165085388994307, + "grad_norm": 1.459723949432373, + "learning_rate": 4.142678174348395e-06, + "loss": 0.1365, + "step": 2986 + }, + { + "epoch": 1.4169829222011385, + "grad_norm": 1.581613302230835, + "learning_rate": 4.136452768729126e-06, + "loss": 0.1507, + "step": 2987 + }, + { + "epoch": 1.4174573055028463, + "grad_norm": 1.6380817890167236, + "learning_rate": 4.130230824303761e-06, + "loss": 0.1611, + "step": 2988 + }, + { + "epoch": 1.4179316888045541, + "grad_norm": 1.8408558368682861, + "learning_rate": 4.1240123447450575e-06, + "loss": 0.1885, + "step": 2989 + }, + { + "epoch": 1.418406072106262, + "grad_norm": 1.6198720932006836, + "learning_rate": 4.117797333723721e-06, + "loss": 0.1538, + "step": 2990 + }, + { + "epoch": 1.4188804554079697, + "grad_norm": 1.727500319480896, + "learning_rate": 4.1115857949084145e-06, + "loss": 0.1834, + "step": 2991 + }, + { + "epoch": 1.4193548387096775, + "grad_norm": 2.777367353439331, + "learning_rate": 4.105377731965743e-06, + "loss": 0.1844, + "step": 2992 + }, + { + "epoch": 1.4198292220113853, + "grad_norm": 1.618923306465149, + "learning_rate": 4.099173148560265e-06, + "loss": 0.1926, + "step": 2993 + }, + { + "epoch": 1.420303605313093, + "grad_norm": 1.8688693046569824, + "learning_rate": 4.092972048354491e-06, + "loss": 0.1688, + "step": 2994 + }, + { + "epoch": 1.4207779886148009, + "grad_norm": 1.7127189636230469, + "learning_rate": 4.08677443500886e-06, + "loss": 0.1505, + "step": 2995 + }, + { + "epoch": 1.4212523719165087, + "grad_norm": 1.893202543258667, + "learning_rate": 4.08058031218177e-06, + "loss": 0.1643, + "step": 2996 + }, + { + "epoch": 1.4217267552182165, + "grad_norm": 1.7235002517700195, + "learning_rate": 4.074389683529542e-06, + "loss": 0.1644, + "step": 2997 + }, + { + "epoch": 1.422201138519924, + "grad_norm": 1.580626368522644, + "learning_rate": 4.0682025527064486e-06, + "loss": 0.1579, + "step": 2998 + }, + { + "epoch": 1.4226755218216318, + "grad_norm": 1.6993354558944702, + "learning_rate": 4.06201892336469e-06, + "loss": 0.1617, + "step": 2999 + }, + { + "epoch": 1.4231499051233396, + "grad_norm": 1.7193957567214966, + "learning_rate": 4.055838799154406e-06, + "loss": 0.1584, + "step": 3000 + }, + { + "epoch": 1.4236242884250474, + "grad_norm": 1.6037389039993286, + "learning_rate": 4.049662183723655e-06, + "loss": 0.1507, + "step": 3001 + }, + { + "epoch": 1.4240986717267552, + "grad_norm": 1.4931392669677734, + "learning_rate": 4.043489080718437e-06, + "loss": 0.1296, + "step": 3002 + }, + { + "epoch": 1.424573055028463, + "grad_norm": 1.6914727687835693, + "learning_rate": 4.037319493782674e-06, + "loss": 0.1486, + "step": 3003 + }, + { + "epoch": 1.4250474383301708, + "grad_norm": 1.8355352878570557, + "learning_rate": 4.031153426558209e-06, + "loss": 0.1778, + "step": 3004 + }, + { + "epoch": 1.4255218216318786, + "grad_norm": 1.8768749237060547, + "learning_rate": 4.024990882684815e-06, + "loss": 0.1356, + "step": 3005 + }, + { + "epoch": 1.4259962049335864, + "grad_norm": 1.700304388999939, + "learning_rate": 4.018831865800174e-06, + "loss": 0.157, + "step": 3006 + }, + { + "epoch": 1.4264705882352942, + "grad_norm": 2.2423648834228516, + "learning_rate": 4.012676379539896e-06, + "loss": 0.1578, + "step": 3007 + }, + { + "epoch": 1.426944971537002, + "grad_norm": 1.7538981437683105, + "learning_rate": 4.006524427537504e-06, + "loss": 0.1863, + "step": 3008 + }, + { + "epoch": 1.4274193548387097, + "grad_norm": 2.0132992267608643, + "learning_rate": 4.0003760134244355e-06, + "loss": 0.1697, + "step": 3009 + }, + { + "epoch": 1.4278937381404175, + "grad_norm": 1.686045527458191, + "learning_rate": 3.9942311408300395e-06, + "loss": 0.1412, + "step": 3010 + }, + { + "epoch": 1.428368121442125, + "grad_norm": 1.5621978044509888, + "learning_rate": 3.9880898133815724e-06, + "loss": 0.1428, + "step": 3011 + }, + { + "epoch": 1.428842504743833, + "grad_norm": 1.520124912261963, + "learning_rate": 3.981952034704194e-06, + "loss": 0.1541, + "step": 3012 + }, + { + "epoch": 1.4293168880455407, + "grad_norm": 1.930535912513733, + "learning_rate": 3.975817808420978e-06, + "loss": 0.1873, + "step": 3013 + }, + { + "epoch": 1.4297912713472485, + "grad_norm": 1.5178654193878174, + "learning_rate": 3.969687138152899e-06, + "loss": 0.1246, + "step": 3014 + }, + { + "epoch": 1.4302656546489563, + "grad_norm": 1.252669095993042, + "learning_rate": 3.9635600275188335e-06, + "loss": 0.1106, + "step": 3015 + }, + { + "epoch": 1.430740037950664, + "grad_norm": 1.629313349723816, + "learning_rate": 3.957436480135547e-06, + "loss": 0.1768, + "step": 3016 + }, + { + "epoch": 1.4312144212523719, + "grad_norm": 1.6184428930282593, + "learning_rate": 3.951316499617711e-06, + "loss": 0.1696, + "step": 3017 + }, + { + "epoch": 1.4316888045540797, + "grad_norm": 1.6965394020080566, + "learning_rate": 3.9452000895778964e-06, + "loss": 0.1764, + "step": 3018 + }, + { + "epoch": 1.4321631878557874, + "grad_norm": 1.5129637718200684, + "learning_rate": 3.93908725362655e-06, + "loss": 0.1325, + "step": 3019 + }, + { + "epoch": 1.4326375711574952, + "grad_norm": 1.4176459312438965, + "learning_rate": 3.932977995372025e-06, + "loss": 0.1385, + "step": 3020 + }, + { + "epoch": 1.433111954459203, + "grad_norm": 1.6068745851516724, + "learning_rate": 3.926872318420551e-06, + "loss": 0.1298, + "step": 3021 + }, + { + "epoch": 1.4335863377609108, + "grad_norm": 1.753598928451538, + "learning_rate": 3.920770226376251e-06, + "loss": 0.1772, + "step": 3022 + }, + { + "epoch": 1.4340607210626186, + "grad_norm": 1.7061421871185303, + "learning_rate": 3.9146717228411305e-06, + "loss": 0.1549, + "step": 3023 + }, + { + "epoch": 1.4345351043643264, + "grad_norm": 1.6681634187698364, + "learning_rate": 3.908576811415078e-06, + "loss": 0.1601, + "step": 3024 + }, + { + "epoch": 1.4350094876660342, + "grad_norm": 1.7460962533950806, + "learning_rate": 3.902485495695853e-06, + "loss": 0.1574, + "step": 3025 + }, + { + "epoch": 1.435483870967742, + "grad_norm": 1.7344130277633667, + "learning_rate": 3.896397779279102e-06, + "loss": 0.1805, + "step": 3026 + }, + { + "epoch": 1.4359582542694498, + "grad_norm": 1.7097632884979248, + "learning_rate": 3.890313665758348e-06, + "loss": 0.1652, + "step": 3027 + }, + { + "epoch": 1.4364326375711576, + "grad_norm": 1.7743875980377197, + "learning_rate": 3.884233158724976e-06, + "loss": 0.1655, + "step": 3028 + }, + { + "epoch": 1.4369070208728654, + "grad_norm": 1.5984655618667603, + "learning_rate": 3.8781562617682555e-06, + "loss": 0.1579, + "step": 3029 + }, + { + "epoch": 1.4373814041745732, + "grad_norm": 1.6356760263442993, + "learning_rate": 3.872082978475312e-06, + "loss": 0.1618, + "step": 3030 + }, + { + "epoch": 1.437855787476281, + "grad_norm": 1.7311320304870605, + "learning_rate": 3.866013312431148e-06, + "loss": 0.1827, + "step": 3031 + }, + { + "epoch": 1.4383301707779887, + "grad_norm": 1.62131929397583, + "learning_rate": 3.859947267218627e-06, + "loss": 0.1412, + "step": 3032 + }, + { + "epoch": 1.4388045540796963, + "grad_norm": 1.5495834350585938, + "learning_rate": 3.8538848464184766e-06, + "loss": 0.1318, + "step": 3033 + }, + { + "epoch": 1.439278937381404, + "grad_norm": 2.08760404586792, + "learning_rate": 3.847826053609286e-06, + "loss": 0.1718, + "step": 3034 + }, + { + "epoch": 1.439753320683112, + "grad_norm": 1.9067436456680298, + "learning_rate": 3.841770892367497e-06, + "loss": 0.1776, + "step": 3035 + }, + { + "epoch": 1.4402277039848197, + "grad_norm": 1.3851908445358276, + "learning_rate": 3.83571936626741e-06, + "loss": 0.1305, + "step": 3036 + }, + { + "epoch": 1.4407020872865275, + "grad_norm": 1.6880590915679932, + "learning_rate": 3.82967147888118e-06, + "loss": 0.157, + "step": 3037 + }, + { + "epoch": 1.4411764705882353, + "grad_norm": 1.8479284048080444, + "learning_rate": 3.823627233778824e-06, + "loss": 0.1465, + "step": 3038 + }, + { + "epoch": 1.441650853889943, + "grad_norm": 1.678410291671753, + "learning_rate": 3.8175866345281895e-06, + "loss": 0.1558, + "step": 3039 + }, + { + "epoch": 1.4421252371916509, + "grad_norm": 1.8292676210403442, + "learning_rate": 3.8115496846949885e-06, + "loss": 0.1587, + "step": 3040 + }, + { + "epoch": 1.4425996204933587, + "grad_norm": 1.7389070987701416, + "learning_rate": 3.8055163878427703e-06, + "loss": 0.1605, + "step": 3041 + }, + { + "epoch": 1.4430740037950665, + "grad_norm": 1.9918736219406128, + "learning_rate": 3.7994867475329346e-06, + "loss": 0.1665, + "step": 3042 + }, + { + "epoch": 1.4435483870967742, + "grad_norm": 1.6637475490570068, + "learning_rate": 3.7934607673247116e-06, + "loss": 0.1636, + "step": 3043 + }, + { + "epoch": 1.444022770398482, + "grad_norm": 1.8318939208984375, + "learning_rate": 3.787438450775185e-06, + "loss": 0.1563, + "step": 3044 + }, + { + "epoch": 1.4444971537001898, + "grad_norm": 1.7426759004592896, + "learning_rate": 3.781419801439261e-06, + "loss": 0.1709, + "step": 3045 + }, + { + "epoch": 1.4449715370018974, + "grad_norm": 1.7280033826828003, + "learning_rate": 3.775404822869694e-06, + "loss": 0.1878, + "step": 3046 + }, + { + "epoch": 1.4454459203036052, + "grad_norm": 1.8315585851669312, + "learning_rate": 3.7693935186170638e-06, + "loss": 0.1904, + "step": 3047 + }, + { + "epoch": 1.445920303605313, + "grad_norm": 1.3828792572021484, + "learning_rate": 3.7633858922297885e-06, + "loss": 0.1232, + "step": 3048 + }, + { + "epoch": 1.4463946869070208, + "grad_norm": 1.6559396982192993, + "learning_rate": 3.757381947254104e-06, + "loss": 0.1733, + "step": 3049 + }, + { + "epoch": 1.4468690702087286, + "grad_norm": 1.8938626050949097, + "learning_rate": 3.7513816872340826e-06, + "loss": 0.1362, + "step": 3050 + }, + { + "epoch": 1.4473434535104364, + "grad_norm": 2.071521282196045, + "learning_rate": 3.745385115711623e-06, + "loss": 0.1934, + "step": 3051 + }, + { + "epoch": 1.4478178368121442, + "grad_norm": 1.8912732601165771, + "learning_rate": 3.739392236226432e-06, + "loss": 0.1588, + "step": 3052 + }, + { + "epoch": 1.448292220113852, + "grad_norm": 1.723662257194519, + "learning_rate": 3.7334030523160582e-06, + "loss": 0.1656, + "step": 3053 + }, + { + "epoch": 1.4487666034155597, + "grad_norm": 1.6426823139190674, + "learning_rate": 3.7274175675158477e-06, + "loss": 0.1391, + "step": 3054 + }, + { + "epoch": 1.4492409867172675, + "grad_norm": 1.779732346534729, + "learning_rate": 3.7214357853589765e-06, + "loss": 0.1712, + "step": 3055 + }, + { + "epoch": 1.4497153700189753, + "grad_norm": 1.4211159944534302, + "learning_rate": 3.7154577093764334e-06, + "loss": 0.1523, + "step": 3056 + }, + { + "epoch": 1.4501897533206831, + "grad_norm": 2.087869167327881, + "learning_rate": 3.7094833430970188e-06, + "loss": 0.2148, + "step": 3057 + }, + { + "epoch": 1.450664136622391, + "grad_norm": 1.6574203968048096, + "learning_rate": 3.7035126900473363e-06, + "loss": 0.1594, + "step": 3058 + }, + { + "epoch": 1.4511385199240987, + "grad_norm": 1.7676061391830444, + "learning_rate": 3.69754575375181e-06, + "loss": 0.161, + "step": 3059 + }, + { + "epoch": 1.4516129032258065, + "grad_norm": 1.7256070375442505, + "learning_rate": 3.691582537732655e-06, + "loss": 0.1791, + "step": 3060 + }, + { + "epoch": 1.4520872865275143, + "grad_norm": 1.9738215208053589, + "learning_rate": 3.6856230455099053e-06, + "loss": 0.1804, + "step": 3061 + }, + { + "epoch": 1.452561669829222, + "grad_norm": 1.850088357925415, + "learning_rate": 3.6796672806013913e-06, + "loss": 0.173, + "step": 3062 + }, + { + "epoch": 1.4530360531309299, + "grad_norm": 1.7364604473114014, + "learning_rate": 3.6737152465227355e-06, + "loss": 0.1691, + "step": 3063 + }, + { + "epoch": 1.4535104364326377, + "grad_norm": 1.4408142566680908, + "learning_rate": 3.667766946787369e-06, + "loss": 0.1409, + "step": 3064 + }, + { + "epoch": 1.4539848197343455, + "grad_norm": 1.6814855337142944, + "learning_rate": 3.6618223849065126e-06, + "loss": 0.1652, + "step": 3065 + }, + { + "epoch": 1.4544592030360532, + "grad_norm": 1.748642921447754, + "learning_rate": 3.655881564389184e-06, + "loss": 0.1565, + "step": 3066 + }, + { + "epoch": 1.454933586337761, + "grad_norm": 1.5492746829986572, + "learning_rate": 3.649944488742194e-06, + "loss": 0.1557, + "step": 3067 + }, + { + "epoch": 1.4554079696394686, + "grad_norm": 1.3638722896575928, + "learning_rate": 3.644011161470136e-06, + "loss": 0.1359, + "step": 3068 + }, + { + "epoch": 1.4558823529411764, + "grad_norm": 1.5125813484191895, + "learning_rate": 3.6380815860753904e-06, + "loss": 0.1492, + "step": 3069 + }, + { + "epoch": 1.4563567362428842, + "grad_norm": 1.7100872993469238, + "learning_rate": 3.632155766058131e-06, + "loss": 0.1598, + "step": 3070 + }, + { + "epoch": 1.456831119544592, + "grad_norm": 1.9752591848373413, + "learning_rate": 3.6262337049163088e-06, + "loss": 0.1696, + "step": 3071 + }, + { + "epoch": 1.4573055028462998, + "grad_norm": 1.8669570684432983, + "learning_rate": 3.6203154061456648e-06, + "loss": 0.1718, + "step": 3072 + }, + { + "epoch": 1.4577798861480076, + "grad_norm": 1.8326796293258667, + "learning_rate": 3.614400873239703e-06, + "loss": 0.203, + "step": 3073 + }, + { + "epoch": 1.4582542694497154, + "grad_norm": 1.6635417938232422, + "learning_rate": 3.6084901096897163e-06, + "loss": 0.1458, + "step": 3074 + }, + { + "epoch": 1.4587286527514232, + "grad_norm": 1.6087132692337036, + "learning_rate": 3.602583118984776e-06, + "loss": 0.1431, + "step": 3075 + }, + { + "epoch": 1.459203036053131, + "grad_norm": 1.7828035354614258, + "learning_rate": 3.596679904611715e-06, + "loss": 0.1735, + "step": 3076 + }, + { + "epoch": 1.4596774193548387, + "grad_norm": 1.9378790855407715, + "learning_rate": 3.5907804700551385e-06, + "loss": 0.1872, + "step": 3077 + }, + { + "epoch": 1.4601518026565465, + "grad_norm": 1.23013174533844, + "learning_rate": 3.5848848187974294e-06, + "loss": 0.114, + "step": 3078 + }, + { + "epoch": 1.4606261859582543, + "grad_norm": 1.618354320526123, + "learning_rate": 3.5789929543187317e-06, + "loss": 0.147, + "step": 3079 + }, + { + "epoch": 1.4611005692599621, + "grad_norm": 2.332402229309082, + "learning_rate": 3.5731048800969536e-06, + "loss": 0.1879, + "step": 3080 + }, + { + "epoch": 1.4615749525616697, + "grad_norm": 1.8381061553955078, + "learning_rate": 3.5672205996077726e-06, + "loss": 0.1654, + "step": 3081 + }, + { + "epoch": 1.4620493358633775, + "grad_norm": 1.7428001165390015, + "learning_rate": 3.5613401163246118e-06, + "loss": 0.1625, + "step": 3082 + }, + { + "epoch": 1.4625237191650853, + "grad_norm": 1.8108954429626465, + "learning_rate": 3.555463433718671e-06, + "loss": 0.1954, + "step": 3083 + }, + { + "epoch": 1.462998102466793, + "grad_norm": 1.7431238889694214, + "learning_rate": 3.549590555258892e-06, + "loss": 0.1849, + "step": 3084 + }, + { + "epoch": 1.4634724857685009, + "grad_norm": 1.3837710618972778, + "learning_rate": 3.543721484411976e-06, + "loss": 0.1412, + "step": 3085 + }, + { + "epoch": 1.4639468690702087, + "grad_norm": 1.585083246231079, + "learning_rate": 3.537856224642385e-06, + "loss": 0.1452, + "step": 3086 + }, + { + "epoch": 1.4644212523719164, + "grad_norm": 1.6242449283599854, + "learning_rate": 3.5319947794123153e-06, + "loss": 0.157, + "step": 3087 + }, + { + "epoch": 1.4648956356736242, + "grad_norm": 1.61245596408844, + "learning_rate": 3.5261371521817247e-06, + "loss": 0.1665, + "step": 3088 + }, + { + "epoch": 1.465370018975332, + "grad_norm": 1.7214654684066772, + "learning_rate": 3.5202833464083096e-06, + "loss": 0.1626, + "step": 3089 + }, + { + "epoch": 1.4658444022770398, + "grad_norm": 1.7213759422302246, + "learning_rate": 3.514433365547517e-06, + "loss": 0.1907, + "step": 3090 + }, + { + "epoch": 1.4663187855787476, + "grad_norm": 1.6483070850372314, + "learning_rate": 3.5085872130525345e-06, + "loss": 0.1634, + "step": 3091 + }, + { + "epoch": 1.4667931688804554, + "grad_norm": 1.7148456573486328, + "learning_rate": 3.5027448923742845e-06, + "loss": 0.1782, + "step": 3092 + }, + { + "epoch": 1.4672675521821632, + "grad_norm": 1.4626739025115967, + "learning_rate": 3.496906406961428e-06, + "loss": 0.1386, + "step": 3093 + }, + { + "epoch": 1.467741935483871, + "grad_norm": 1.7289766073226929, + "learning_rate": 3.491071760260368e-06, + "loss": 0.1474, + "step": 3094 + }, + { + "epoch": 1.4682163187855788, + "grad_norm": 1.4813730716705322, + "learning_rate": 3.4852409557152432e-06, + "loss": 0.1572, + "step": 3095 + }, + { + "epoch": 1.4686907020872866, + "grad_norm": 1.7407090663909912, + "learning_rate": 3.479413996767913e-06, + "loss": 0.1711, + "step": 3096 + }, + { + "epoch": 1.4691650853889944, + "grad_norm": 1.5457485914230347, + "learning_rate": 3.473590886857977e-06, + "loss": 0.1327, + "step": 3097 + }, + { + "epoch": 1.4696394686907022, + "grad_norm": 1.932664394378662, + "learning_rate": 3.4677716294227583e-06, + "loss": 0.1631, + "step": 3098 + }, + { + "epoch": 1.47011385199241, + "grad_norm": 1.4631233215332031, + "learning_rate": 3.4619562278973105e-06, + "loss": 0.1312, + "step": 3099 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.3594050407409668, + "learning_rate": 3.4561446857144054e-06, + "loss": 0.1175, + "step": 3100 + }, + { + "epoch": 1.4710626185958255, + "grad_norm": 1.4718384742736816, + "learning_rate": 3.4503370063045338e-06, + "loss": 0.1438, + "step": 3101 + }, + { + "epoch": 1.4715370018975333, + "grad_norm": 1.6673498153686523, + "learning_rate": 3.444533193095917e-06, + "loss": 0.1628, + "step": 3102 + }, + { + "epoch": 1.472011385199241, + "grad_norm": 1.9447062015533447, + "learning_rate": 3.4387332495144866e-06, + "loss": 0.2165, + "step": 3103 + }, + { + "epoch": 1.4724857685009487, + "grad_norm": 1.9136441946029663, + "learning_rate": 3.4329371789838916e-06, + "loss": 0.1972, + "step": 3104 + }, + { + "epoch": 1.4729601518026565, + "grad_norm": 2.037196636199951, + "learning_rate": 3.4271449849255003e-06, + "loss": 0.1881, + "step": 3105 + }, + { + "epoch": 1.4734345351043643, + "grad_norm": 1.9500868320465088, + "learning_rate": 3.42135667075838e-06, + "loss": 0.1469, + "step": 3106 + }, + { + "epoch": 1.473908918406072, + "grad_norm": 1.8555630445480347, + "learning_rate": 3.4155722398993175e-06, + "loss": 0.1753, + "step": 3107 + }, + { + "epoch": 1.4743833017077799, + "grad_norm": 1.6183701753616333, + "learning_rate": 3.4097916957628108e-06, + "loss": 0.1391, + "step": 3108 + }, + { + "epoch": 1.4748576850094877, + "grad_norm": 1.8170623779296875, + "learning_rate": 3.4040150417610483e-06, + "loss": 0.1789, + "step": 3109 + }, + { + "epoch": 1.4753320683111955, + "grad_norm": 1.6441960334777832, + "learning_rate": 3.3982422813039407e-06, + "loss": 0.1682, + "step": 3110 + }, + { + "epoch": 1.4758064516129032, + "grad_norm": 1.511757254600525, + "learning_rate": 3.3924734177990847e-06, + "loss": 0.1337, + "step": 3111 + }, + { + "epoch": 1.476280834914611, + "grad_norm": 1.9087467193603516, + "learning_rate": 3.3867084546517847e-06, + "loss": 0.1792, + "step": 3112 + }, + { + "epoch": 1.4767552182163188, + "grad_norm": 1.632246732711792, + "learning_rate": 3.3809473952650427e-06, + "loss": 0.1387, + "step": 3113 + }, + { + "epoch": 1.4772296015180266, + "grad_norm": 1.7560805082321167, + "learning_rate": 3.3751902430395558e-06, + "loss": 0.1753, + "step": 3114 + }, + { + "epoch": 1.4777039848197344, + "grad_norm": 1.700164556503296, + "learning_rate": 3.3694370013737153e-06, + "loss": 0.1838, + "step": 3115 + }, + { + "epoch": 1.478178368121442, + "grad_norm": 1.6236701011657715, + "learning_rate": 3.3636876736636013e-06, + "loss": 0.1711, + "step": 3116 + }, + { + "epoch": 1.4786527514231498, + "grad_norm": 1.9994237422943115, + "learning_rate": 3.3579422633029813e-06, + "loss": 0.1642, + "step": 3117 + }, + { + "epoch": 1.4791271347248576, + "grad_norm": 1.9831045866012573, + "learning_rate": 3.352200773683317e-06, + "loss": 0.1701, + "step": 3118 + }, + { + "epoch": 1.4796015180265654, + "grad_norm": 1.4581242799758911, + "learning_rate": 3.3464632081937567e-06, + "loss": 0.1479, + "step": 3119 + }, + { + "epoch": 1.4800759013282732, + "grad_norm": 1.944639801979065, + "learning_rate": 3.3407295702211217e-06, + "loss": 0.1544, + "step": 3120 + }, + { + "epoch": 1.480550284629981, + "grad_norm": 1.6618354320526123, + "learning_rate": 3.3349998631499247e-06, + "loss": 0.1325, + "step": 3121 + }, + { + "epoch": 1.4810246679316887, + "grad_norm": 2.091897964477539, + "learning_rate": 3.3292740903623567e-06, + "loss": 0.1895, + "step": 3122 + }, + { + "epoch": 1.4814990512333965, + "grad_norm": 1.445960283279419, + "learning_rate": 3.323552255238286e-06, + "loss": 0.1448, + "step": 3123 + }, + { + "epoch": 1.4819734345351043, + "grad_norm": 1.5881474018096924, + "learning_rate": 3.317834361155252e-06, + "loss": 0.136, + "step": 3124 + }, + { + "epoch": 1.4824478178368121, + "grad_norm": 1.9209330081939697, + "learning_rate": 3.3121204114884696e-06, + "loss": 0.1705, + "step": 3125 + }, + { + "epoch": 1.48292220113852, + "grad_norm": 1.59219491481781, + "learning_rate": 3.3064104096108287e-06, + "loss": 0.1403, + "step": 3126 + }, + { + "epoch": 1.4833965844402277, + "grad_norm": 1.7049628496170044, + "learning_rate": 3.3007043588928866e-06, + "loss": 0.1602, + "step": 3127 + }, + { + "epoch": 1.4838709677419355, + "grad_norm": 1.7211378812789917, + "learning_rate": 3.295002262702869e-06, + "loss": 0.1711, + "step": 3128 + }, + { + "epoch": 1.4843453510436433, + "grad_norm": 1.8670153617858887, + "learning_rate": 3.2893041244066704e-06, + "loss": 0.1732, + "step": 3129 + }, + { + "epoch": 1.484819734345351, + "grad_norm": 1.462051510810852, + "learning_rate": 3.2836099473678384e-06, + "loss": 0.1409, + "step": 3130 + }, + { + "epoch": 1.4852941176470589, + "grad_norm": 1.5482720136642456, + "learning_rate": 3.2779197349475933e-06, + "loss": 0.1419, + "step": 3131 + }, + { + "epoch": 1.4857685009487667, + "grad_norm": 2.3937394618988037, + "learning_rate": 3.2722334905048146e-06, + "loss": 0.1878, + "step": 3132 + }, + { + "epoch": 1.4862428842504745, + "grad_norm": 1.6191223859786987, + "learning_rate": 3.266551217396029e-06, + "loss": 0.1706, + "step": 3133 + }, + { + "epoch": 1.4867172675521823, + "grad_norm": 1.8945509195327759, + "learning_rate": 3.260872918975432e-06, + "loss": 0.1589, + "step": 3134 + }, + { + "epoch": 1.48719165085389, + "grad_norm": 1.668412685394287, + "learning_rate": 3.255198598594862e-06, + "loss": 0.1757, + "step": 3135 + }, + { + "epoch": 1.4876660341555978, + "grad_norm": 1.9364664554595947, + "learning_rate": 3.2495282596038156e-06, + "loss": 0.1189, + "step": 3136 + }, + { + "epoch": 1.4881404174573056, + "grad_norm": 1.442252278327942, + "learning_rate": 3.243861905349439e-06, + "loss": 0.1294, + "step": 3137 + }, + { + "epoch": 1.4886148007590132, + "grad_norm": 1.5097479820251465, + "learning_rate": 3.2381995391765288e-06, + "loss": 0.1502, + "step": 3138 + }, + { + "epoch": 1.489089184060721, + "grad_norm": 1.8824775218963623, + "learning_rate": 3.2325411644275164e-06, + "loss": 0.1816, + "step": 3139 + }, + { + "epoch": 1.4895635673624288, + "grad_norm": 1.5267640352249146, + "learning_rate": 3.22688678444249e-06, + "loss": 0.1391, + "step": 3140 + }, + { + "epoch": 1.4900379506641366, + "grad_norm": 1.582728385925293, + "learning_rate": 3.221236402559169e-06, + "loss": 0.168, + "step": 3141 + }, + { + "epoch": 1.4905123339658444, + "grad_norm": 1.3848261833190918, + "learning_rate": 3.215590022112921e-06, + "loss": 0.1533, + "step": 3142 + }, + { + "epoch": 1.4909867172675522, + "grad_norm": 1.7733631134033203, + "learning_rate": 3.209947646436752e-06, + "loss": 0.1689, + "step": 3143 + }, + { + "epoch": 1.49146110056926, + "grad_norm": 1.8000030517578125, + "learning_rate": 3.204309278861294e-06, + "loss": 0.1392, + "step": 3144 + }, + { + "epoch": 1.4919354838709677, + "grad_norm": 1.5423595905303955, + "learning_rate": 3.1986749227148215e-06, + "loss": 0.1553, + "step": 3145 + }, + { + "epoch": 1.4924098671726755, + "grad_norm": 1.7993470430374146, + "learning_rate": 3.19304458132324e-06, + "loss": 0.1598, + "step": 3146 + }, + { + "epoch": 1.4928842504743833, + "grad_norm": 1.7650487422943115, + "learning_rate": 3.1874182580100874e-06, + "loss": 0.1995, + "step": 3147 + }, + { + "epoch": 1.4933586337760911, + "grad_norm": 2.178863525390625, + "learning_rate": 3.181795956096522e-06, + "loss": 0.1757, + "step": 3148 + }, + { + "epoch": 1.493833017077799, + "grad_norm": 1.823729395866394, + "learning_rate": 3.1761776789013365e-06, + "loss": 0.1441, + "step": 3149 + }, + { + "epoch": 1.4943074003795067, + "grad_norm": 1.6294647455215454, + "learning_rate": 3.1705634297409404e-06, + "loss": 0.1846, + "step": 3150 + }, + { + "epoch": 1.4947817836812145, + "grad_norm": 1.7616844177246094, + "learning_rate": 3.1649532119293713e-06, + "loss": 0.169, + "step": 3151 + }, + { + "epoch": 1.495256166982922, + "grad_norm": 1.9334604740142822, + "learning_rate": 3.1593470287782847e-06, + "loss": 0.1989, + "step": 3152 + }, + { + "epoch": 1.4957305502846299, + "grad_norm": 1.5212819576263428, + "learning_rate": 3.15374488359696e-06, + "loss": 0.1652, + "step": 3153 + }, + { + "epoch": 1.4962049335863377, + "grad_norm": 1.8870230913162231, + "learning_rate": 3.1481467796922804e-06, + "loss": 0.1775, + "step": 3154 + }, + { + "epoch": 1.4966793168880455, + "grad_norm": 1.5474931001663208, + "learning_rate": 3.1425527203687543e-06, + "loss": 0.152, + "step": 3155 + }, + { + "epoch": 1.4971537001897532, + "grad_norm": 1.711269497871399, + "learning_rate": 3.1369627089285036e-06, + "loss": 0.1527, + "step": 3156 + }, + { + "epoch": 1.497628083491461, + "grad_norm": 1.6961112022399902, + "learning_rate": 3.131376748671253e-06, + "loss": 0.1761, + "step": 3157 + }, + { + "epoch": 1.4981024667931688, + "grad_norm": 1.4259470701217651, + "learning_rate": 3.1257948428943375e-06, + "loss": 0.1497, + "step": 3158 + }, + { + "epoch": 1.4985768500948766, + "grad_norm": 1.8194340467453003, + "learning_rate": 3.120216994892702e-06, + "loss": 0.1785, + "step": 3159 + }, + { + "epoch": 1.4990512333965844, + "grad_norm": 1.514940619468689, + "learning_rate": 3.1146432079588963e-06, + "loss": 0.1585, + "step": 3160 + }, + { + "epoch": 1.4995256166982922, + "grad_norm": 1.442490577697754, + "learning_rate": 3.1090734853830718e-06, + "loss": 0.1337, + "step": 3161 + }, + { + "epoch": 1.5, + "grad_norm": 1.7581517696380615, + "learning_rate": 3.103507830452982e-06, + "loss": 0.1771, + "step": 3162 + }, + { + "epoch": 1.5004743833017078, + "grad_norm": 2.3333330154418945, + "learning_rate": 3.0979462464539744e-06, + "loss": 0.1982, + "step": 3163 + }, + { + "epoch": 1.5009487666034156, + "grad_norm": 1.7055302858352661, + "learning_rate": 3.092388736669002e-06, + "loss": 0.1643, + "step": 3164 + }, + { + "epoch": 1.5014231499051234, + "grad_norm": 1.49795401096344, + "learning_rate": 3.0868353043786004e-06, + "loss": 0.1416, + "step": 3165 + }, + { + "epoch": 1.5018975332068312, + "grad_norm": 1.4336013793945312, + "learning_rate": 3.0812859528609106e-06, + "loss": 0.1412, + "step": 3166 + }, + { + "epoch": 1.502371916508539, + "grad_norm": 1.6154780387878418, + "learning_rate": 3.0757406853916627e-06, + "loss": 0.1667, + "step": 3167 + }, + { + "epoch": 1.5028462998102468, + "grad_norm": 1.670455813407898, + "learning_rate": 3.0701995052441658e-06, + "loss": 0.1467, + "step": 3168 + }, + { + "epoch": 1.5033206831119545, + "grad_norm": 1.86698317527771, + "learning_rate": 3.064662415689328e-06, + "loss": 0.2076, + "step": 3169 + }, + { + "epoch": 1.5037950664136623, + "grad_norm": 1.5714526176452637, + "learning_rate": 3.059129419995638e-06, + "loss": 0.155, + "step": 3170 + }, + { + "epoch": 1.5042694497153701, + "grad_norm": 1.567219853401184, + "learning_rate": 3.053600521429172e-06, + "loss": 0.1538, + "step": 3171 + }, + { + "epoch": 1.504743833017078, + "grad_norm": 1.410211205482483, + "learning_rate": 3.0480757232535773e-06, + "loss": 0.1173, + "step": 3172 + }, + { + "epoch": 1.5052182163187857, + "grad_norm": 1.7881250381469727, + "learning_rate": 3.0425550287300943e-06, + "loss": 0.1827, + "step": 3173 + }, + { + "epoch": 1.5056925996204935, + "grad_norm": 1.595481514930725, + "learning_rate": 3.037038441117528e-06, + "loss": 0.1373, + "step": 3174 + }, + { + "epoch": 1.5061669829222013, + "grad_norm": 1.6253283023834229, + "learning_rate": 3.031525963672267e-06, + "loss": 0.1438, + "step": 3175 + }, + { + "epoch": 1.5066413662239089, + "grad_norm": 1.9808628559112549, + "learning_rate": 3.0260175996482787e-06, + "loss": 0.1479, + "step": 3176 + }, + { + "epoch": 1.5071157495256167, + "grad_norm": 2.177093744277954, + "learning_rate": 3.0205133522970865e-06, + "loss": 0.2158, + "step": 3177 + }, + { + "epoch": 1.5075901328273245, + "grad_norm": 1.7438995838165283, + "learning_rate": 3.0150132248677976e-06, + "loss": 0.1905, + "step": 3178 + }, + { + "epoch": 1.5080645161290323, + "grad_norm": 1.5714632272720337, + "learning_rate": 3.0095172206070833e-06, + "loss": 0.1403, + "step": 3179 + }, + { + "epoch": 1.50853889943074, + "grad_norm": 1.7036011219024658, + "learning_rate": 3.0040253427591827e-06, + "loss": 0.139, + "step": 3180 + }, + { + "epoch": 1.5090132827324478, + "grad_norm": 1.967517375946045, + "learning_rate": 2.9985375945658934e-06, + "loss": 0.1712, + "step": 3181 + }, + { + "epoch": 1.5094876660341556, + "grad_norm": 1.4856235980987549, + "learning_rate": 2.9930539792665767e-06, + "loss": 0.1472, + "step": 3182 + }, + { + "epoch": 1.5099620493358634, + "grad_norm": 1.9669393301010132, + "learning_rate": 2.9875745000981603e-06, + "loss": 0.1836, + "step": 3183 + }, + { + "epoch": 1.510436432637571, + "grad_norm": 1.9737735986709595, + "learning_rate": 2.9820991602951255e-06, + "loss": 0.1955, + "step": 3184 + }, + { + "epoch": 1.5109108159392788, + "grad_norm": 1.8585445880889893, + "learning_rate": 2.97662796308951e-06, + "loss": 0.1881, + "step": 3185 + }, + { + "epoch": 1.5113851992409866, + "grad_norm": 1.8154902458190918, + "learning_rate": 2.971160911710913e-06, + "loss": 0.1274, + "step": 3186 + }, + { + "epoch": 1.5118595825426944, + "grad_norm": 1.5582470893859863, + "learning_rate": 2.965698009386473e-06, + "loss": 0.1438, + "step": 3187 + }, + { + "epoch": 1.5123339658444022, + "grad_norm": 2.245789051055908, + "learning_rate": 2.9602392593408933e-06, + "loss": 0.2352, + "step": 3188 + }, + { + "epoch": 1.51280834914611, + "grad_norm": 1.8019410371780396, + "learning_rate": 2.954784664796414e-06, + "loss": 0.181, + "step": 3189 + }, + { + "epoch": 1.5132827324478177, + "grad_norm": 1.7983249425888062, + "learning_rate": 2.9493342289728334e-06, + "loss": 0.1187, + "step": 3190 + }, + { + "epoch": 1.5137571157495255, + "grad_norm": 1.7953065633773804, + "learning_rate": 2.94388795508749e-06, + "loss": 0.1749, + "step": 3191 + }, + { + "epoch": 1.5142314990512333, + "grad_norm": 1.9157280921936035, + "learning_rate": 2.93844584635526e-06, + "loss": 0.2026, + "step": 3192 + }, + { + "epoch": 1.5147058823529411, + "grad_norm": 1.7939811944961548, + "learning_rate": 2.9330079059885708e-06, + "loss": 0.1734, + "step": 3193 + }, + { + "epoch": 1.515180265654649, + "grad_norm": 1.3667829036712646, + "learning_rate": 2.927574137197383e-06, + "loss": 0.1348, + "step": 3194 + }, + { + "epoch": 1.5156546489563567, + "grad_norm": 1.706744909286499, + "learning_rate": 2.9221445431892003e-06, + "loss": 0.1574, + "step": 3195 + }, + { + "epoch": 1.5161290322580645, + "grad_norm": 1.9317013025283813, + "learning_rate": 2.916719127169053e-06, + "loss": 0.1782, + "step": 3196 + }, + { + "epoch": 1.5166034155597723, + "grad_norm": 1.5074424743652344, + "learning_rate": 2.911297892339516e-06, + "loss": 0.1355, + "step": 3197 + }, + { + "epoch": 1.51707779886148, + "grad_norm": 1.852922797203064, + "learning_rate": 2.9058808419006834e-06, + "loss": 0.1614, + "step": 3198 + }, + { + "epoch": 1.5175521821631879, + "grad_norm": 1.6857560873031616, + "learning_rate": 2.9004679790501922e-06, + "loss": 0.1859, + "step": 3199 + }, + { + "epoch": 1.5180265654648957, + "grad_norm": 1.6432700157165527, + "learning_rate": 2.895059306983201e-06, + "loss": 0.1501, + "step": 3200 + }, + { + "epoch": 1.5185009487666035, + "grad_norm": 2.039566993713379, + "learning_rate": 2.889654828892393e-06, + "loss": 0.1988, + "step": 3201 + }, + { + "epoch": 1.5189753320683113, + "grad_norm": 1.716520071029663, + "learning_rate": 2.8842545479679796e-06, + "loss": 0.1636, + "step": 3202 + }, + { + "epoch": 1.519449715370019, + "grad_norm": 1.249931812286377, + "learning_rate": 2.878858467397693e-06, + "loss": 0.1183, + "step": 3203 + }, + { + "epoch": 1.5199240986717268, + "grad_norm": 1.6915541887283325, + "learning_rate": 2.8734665903667892e-06, + "loss": 0.155, + "step": 3204 + }, + { + "epoch": 1.5203984819734346, + "grad_norm": 1.4063469171524048, + "learning_rate": 2.8680789200580373e-06, + "loss": 0.1344, + "step": 3205 + }, + { + "epoch": 1.5208728652751424, + "grad_norm": 1.345933198928833, + "learning_rate": 2.862695459651722e-06, + "loss": 0.1286, + "step": 3206 + }, + { + "epoch": 1.5213472485768502, + "grad_norm": 1.5287388563156128, + "learning_rate": 2.8573162123256504e-06, + "loss": 0.1617, + "step": 3207 + }, + { + "epoch": 1.521821631878558, + "grad_norm": 1.7381535768508911, + "learning_rate": 2.851941181255139e-06, + "loss": 0.1772, + "step": 3208 + }, + { + "epoch": 1.5222960151802658, + "grad_norm": 1.8976391553878784, + "learning_rate": 2.8465703696130142e-06, + "loss": 0.1806, + "step": 3209 + }, + { + "epoch": 1.5227703984819736, + "grad_norm": 1.5034171342849731, + "learning_rate": 2.841203780569618e-06, + "loss": 0.1625, + "step": 3210 + }, + { + "epoch": 1.5232447817836812, + "grad_norm": 1.4791839122772217, + "learning_rate": 2.835841417292788e-06, + "loss": 0.1411, + "step": 3211 + }, + { + "epoch": 1.523719165085389, + "grad_norm": 1.7350883483886719, + "learning_rate": 2.8304832829478802e-06, + "loss": 0.1606, + "step": 3212 + }, + { + "epoch": 1.5241935483870968, + "grad_norm": 1.4422385692596436, + "learning_rate": 2.825129380697741e-06, + "loss": 0.1368, + "step": 3213 + }, + { + "epoch": 1.5246679316888045, + "grad_norm": 1.8611582517623901, + "learning_rate": 2.8197797137027338e-06, + "loss": 0.1495, + "step": 3214 + }, + { + "epoch": 1.5251423149905123, + "grad_norm": 2.3095667362213135, + "learning_rate": 2.8144342851207076e-06, + "loss": 0.2357, + "step": 3215 + }, + { + "epoch": 1.5256166982922201, + "grad_norm": 1.809204339981079, + "learning_rate": 2.8090930981070176e-06, + "loss": 0.1656, + "step": 3216 + }, + { + "epoch": 1.526091081593928, + "grad_norm": 1.5061122179031372, + "learning_rate": 2.8037561558145154e-06, + "loss": 0.1525, + "step": 3217 + }, + { + "epoch": 1.5265654648956357, + "grad_norm": 1.6025121212005615, + "learning_rate": 2.7984234613935434e-06, + "loss": 0.137, + "step": 3218 + }, + { + "epoch": 1.5270398481973435, + "grad_norm": 1.6906027793884277, + "learning_rate": 2.7930950179919438e-06, + "loss": 0.1781, + "step": 3219 + }, + { + "epoch": 1.527514231499051, + "grad_norm": 1.538794994354248, + "learning_rate": 2.7877708287550366e-06, + "loss": 0.145, + "step": 3220 + }, + { + "epoch": 1.5279886148007589, + "grad_norm": 1.517501711845398, + "learning_rate": 2.7824508968256435e-06, + "loss": 0.1444, + "step": 3221 + }, + { + "epoch": 1.5284629981024667, + "grad_norm": 1.737318515777588, + "learning_rate": 2.777135225344063e-06, + "loss": 0.1274, + "step": 3222 + }, + { + "epoch": 1.5289373814041745, + "grad_norm": 1.742997407913208, + "learning_rate": 2.771823817448085e-06, + "loss": 0.1682, + "step": 3223 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 1.806666612625122, + "learning_rate": 2.7665166762729856e-06, + "loss": 0.1467, + "step": 3224 + }, + { + "epoch": 1.52988614800759, + "grad_norm": 2.0361621379852295, + "learning_rate": 2.7612138049515102e-06, + "loss": 0.1591, + "step": 3225 + }, + { + "epoch": 1.5303605313092978, + "grad_norm": 1.6584888696670532, + "learning_rate": 2.755915206613895e-06, + "loss": 0.1443, + "step": 3226 + }, + { + "epoch": 1.5308349146110056, + "grad_norm": 1.9782226085662842, + "learning_rate": 2.75062088438785e-06, + "loss": 0.1732, + "step": 3227 + }, + { + "epoch": 1.5313092979127134, + "grad_norm": 1.7193796634674072, + "learning_rate": 2.7453308413985635e-06, + "loss": 0.1448, + "step": 3228 + }, + { + "epoch": 1.5317836812144212, + "grad_norm": 1.6462289094924927, + "learning_rate": 2.740045080768694e-06, + "loss": 0.1444, + "step": 3229 + }, + { + "epoch": 1.532258064516129, + "grad_norm": 1.397916316986084, + "learning_rate": 2.73476360561837e-06, + "loss": 0.1503, + "step": 3230 + }, + { + "epoch": 1.5327324478178368, + "grad_norm": 2.2624073028564453, + "learning_rate": 2.7294864190651972e-06, + "loss": 0.2172, + "step": 3231 + }, + { + "epoch": 1.5332068311195446, + "grad_norm": 1.890386939048767, + "learning_rate": 2.724213524224246e-06, + "loss": 0.1538, + "step": 3232 + }, + { + "epoch": 1.5336812144212524, + "grad_norm": 2.1407437324523926, + "learning_rate": 2.7189449242080557e-06, + "loss": 0.1784, + "step": 3233 + }, + { + "epoch": 1.5341555977229602, + "grad_norm": 1.7359429597854614, + "learning_rate": 2.7136806221266286e-06, + "loss": 0.1851, + "step": 3234 + }, + { + "epoch": 1.534629981024668, + "grad_norm": 1.662856101989746, + "learning_rate": 2.7084206210874277e-06, + "loss": 0.1352, + "step": 3235 + }, + { + "epoch": 1.5351043643263758, + "grad_norm": 1.425294280052185, + "learning_rate": 2.7031649241953826e-06, + "loss": 0.134, + "step": 3236 + }, + { + "epoch": 1.5355787476280836, + "grad_norm": 1.6383399963378906, + "learning_rate": 2.697913534552875e-06, + "loss": 0.1595, + "step": 3237 + }, + { + "epoch": 1.5360531309297913, + "grad_norm": 1.8296263217926025, + "learning_rate": 2.6926664552597537e-06, + "loss": 0.1928, + "step": 3238 + }, + { + "epoch": 1.5365275142314991, + "grad_norm": 1.6858452558517456, + "learning_rate": 2.687423689413312e-06, + "loss": 0.1304, + "step": 3239 + }, + { + "epoch": 1.537001897533207, + "grad_norm": 1.7636797428131104, + "learning_rate": 2.6821852401083048e-06, + "loss": 0.167, + "step": 3240 + }, + { + "epoch": 1.5374762808349147, + "grad_norm": 1.6068103313446045, + "learning_rate": 2.6769511104369384e-06, + "loss": 0.1536, + "step": 3241 + }, + { + "epoch": 1.5379506641366225, + "grad_norm": 1.8157026767730713, + "learning_rate": 2.6717213034888656e-06, + "loss": 0.1601, + "step": 3242 + }, + { + "epoch": 1.5384250474383303, + "grad_norm": 1.6704944372177124, + "learning_rate": 2.6664958223511948e-06, + "loss": 0.165, + "step": 3243 + }, + { + "epoch": 1.538899430740038, + "grad_norm": 1.5348975658416748, + "learning_rate": 2.661274670108469e-06, + "loss": 0.1695, + "step": 3244 + }, + { + "epoch": 1.539373814041746, + "grad_norm": 1.7109907865524292, + "learning_rate": 2.6560578498426883e-06, + "loss": 0.151, + "step": 3245 + }, + { + "epoch": 1.5398481973434535, + "grad_norm": 1.4413361549377441, + "learning_rate": 2.6508453646332845e-06, + "loss": 0.1458, + "step": 3246 + }, + { + "epoch": 1.5403225806451613, + "grad_norm": 1.3164259195327759, + "learning_rate": 2.645637217557139e-06, + "loss": 0.1393, + "step": 3247 + }, + { + "epoch": 1.540796963946869, + "grad_norm": 1.6673977375030518, + "learning_rate": 2.640433411688572e-06, + "loss": 0.1613, + "step": 3248 + }, + { + "epoch": 1.5412713472485768, + "grad_norm": 1.5010228157043457, + "learning_rate": 2.635233950099334e-06, + "loss": 0.1433, + "step": 3249 + }, + { + "epoch": 1.5417457305502846, + "grad_norm": 1.5055230855941772, + "learning_rate": 2.630038835858617e-06, + "loss": 0.146, + "step": 3250 + }, + { + "epoch": 1.5422201138519924, + "grad_norm": 1.621572494506836, + "learning_rate": 2.624848072033046e-06, + "loss": 0.1707, + "step": 3251 + }, + { + "epoch": 1.5426944971537002, + "grad_norm": 2.2330636978149414, + "learning_rate": 2.6196616616866822e-06, + "loss": 0.1665, + "step": 3252 + }, + { + "epoch": 1.543168880455408, + "grad_norm": 1.9100672006607056, + "learning_rate": 2.6144796078810065e-06, + "loss": 0.1728, + "step": 3253 + }, + { + "epoch": 1.5436432637571158, + "grad_norm": 1.8044493198394775, + "learning_rate": 2.609301913674933e-06, + "loss": 0.1741, + "step": 3254 + }, + { + "epoch": 1.5441176470588234, + "grad_norm": 1.4403407573699951, + "learning_rate": 2.6041285821248064e-06, + "loss": 0.1417, + "step": 3255 + }, + { + "epoch": 1.5445920303605312, + "grad_norm": 1.5496286153793335, + "learning_rate": 2.598959616284391e-06, + "loss": 0.1356, + "step": 3256 + }, + { + "epoch": 1.545066413662239, + "grad_norm": 2.311690092086792, + "learning_rate": 2.5937950192048823e-06, + "loss": 0.1931, + "step": 3257 + }, + { + "epoch": 1.5455407969639468, + "grad_norm": 2.143083095550537, + "learning_rate": 2.588634793934882e-06, + "loss": 0.1902, + "step": 3258 + }, + { + "epoch": 1.5460151802656545, + "grad_norm": 2.0627899169921875, + "learning_rate": 2.5834789435204245e-06, + "loss": 0.1883, + "step": 3259 + }, + { + "epoch": 1.5464895635673623, + "grad_norm": 2.2060842514038086, + "learning_rate": 2.57832747100496e-06, + "loss": 0.2299, + "step": 3260 + }, + { + "epoch": 1.5469639468690701, + "grad_norm": 1.7852225303649902, + "learning_rate": 2.5731803794293465e-06, + "loss": 0.1833, + "step": 3261 + }, + { + "epoch": 1.547438330170778, + "grad_norm": 1.6334515810012817, + "learning_rate": 2.5680376718318657e-06, + "loss": 0.1617, + "step": 3262 + }, + { + "epoch": 1.5479127134724857, + "grad_norm": 1.4657764434814453, + "learning_rate": 2.5628993512482013e-06, + "loss": 0.15, + "step": 3263 + }, + { + "epoch": 1.5483870967741935, + "grad_norm": 1.775926947593689, + "learning_rate": 2.557765420711458e-06, + "loss": 0.1747, + "step": 3264 + }, + { + "epoch": 1.5488614800759013, + "grad_norm": 1.7715890407562256, + "learning_rate": 2.5526358832521424e-06, + "loss": 0.1488, + "step": 3265 + }, + { + "epoch": 1.549335863377609, + "grad_norm": 1.91201651096344, + "learning_rate": 2.5475107418981692e-06, + "loss": 0.1736, + "step": 3266 + }, + { + "epoch": 1.5498102466793169, + "grad_norm": 1.8647923469543457, + "learning_rate": 2.5423899996748636e-06, + "loss": 0.1741, + "step": 3267 + }, + { + "epoch": 1.5502846299810247, + "grad_norm": 1.4479807615280151, + "learning_rate": 2.5372736596049417e-06, + "loss": 0.1228, + "step": 3268 + }, + { + "epoch": 1.5507590132827325, + "grad_norm": 1.4096297025680542, + "learning_rate": 2.532161724708534e-06, + "loss": 0.1368, + "step": 3269 + }, + { + "epoch": 1.5512333965844403, + "grad_norm": 1.8053456544876099, + "learning_rate": 2.5270541980031603e-06, + "loss": 0.1674, + "step": 3270 + }, + { + "epoch": 1.551707779886148, + "grad_norm": 1.699059009552002, + "learning_rate": 2.521951082503746e-06, + "loss": 0.1305, + "step": 3271 + }, + { + "epoch": 1.5521821631878558, + "grad_norm": 1.85928213596344, + "learning_rate": 2.516852381222612e-06, + "loss": 0.1696, + "step": 3272 + }, + { + "epoch": 1.5526565464895636, + "grad_norm": 1.6062089204788208, + "learning_rate": 2.5117580971694644e-06, + "loss": 0.1237, + "step": 3273 + }, + { + "epoch": 1.5531309297912714, + "grad_norm": 1.8484961986541748, + "learning_rate": 2.5066682333514136e-06, + "loss": 0.1724, + "step": 3274 + }, + { + "epoch": 1.5536053130929792, + "grad_norm": 1.7327786684036255, + "learning_rate": 2.5015827927729554e-06, + "loss": 0.1314, + "step": 3275 + }, + { + "epoch": 1.554079696394687, + "grad_norm": 1.7604756355285645, + "learning_rate": 2.496501778435977e-06, + "loss": 0.1538, + "step": 3276 + }, + { + "epoch": 1.5545540796963948, + "grad_norm": 1.8276807069778442, + "learning_rate": 2.491425193339748e-06, + "loss": 0.1595, + "step": 3277 + }, + { + "epoch": 1.5550284629981026, + "grad_norm": 1.5220667123794556, + "learning_rate": 2.4863530404809253e-06, + "loss": 0.1231, + "step": 3278 + }, + { + "epoch": 1.5555028462998104, + "grad_norm": 2.1537914276123047, + "learning_rate": 2.4812853228535515e-06, + "loss": 0.1959, + "step": 3279 + }, + { + "epoch": 1.5559772296015182, + "grad_norm": 1.3721140623092651, + "learning_rate": 2.4762220434490504e-06, + "loss": 0.1232, + "step": 3280 + }, + { + "epoch": 1.5564516129032258, + "grad_norm": 1.8437730073928833, + "learning_rate": 2.4711632052562283e-06, + "loss": 0.1598, + "step": 3281 + }, + { + "epoch": 1.5569259962049335, + "grad_norm": 1.4485067129135132, + "learning_rate": 2.466108811261263e-06, + "loss": 0.161, + "step": 3282 + }, + { + "epoch": 1.5574003795066413, + "grad_norm": 1.9926724433898926, + "learning_rate": 2.461058864447716e-06, + "loss": 0.1368, + "step": 3283 + }, + { + "epoch": 1.5578747628083491, + "grad_norm": 3.5024831295013428, + "learning_rate": 2.456013367796519e-06, + "loss": 0.1822, + "step": 3284 + }, + { + "epoch": 1.558349146110057, + "grad_norm": 1.5812478065490723, + "learning_rate": 2.450972324285984e-06, + "loss": 0.1446, + "step": 3285 + }, + { + "epoch": 1.5588235294117647, + "grad_norm": 1.8127206563949585, + "learning_rate": 2.445935736891785e-06, + "loss": 0.1643, + "step": 3286 + }, + { + "epoch": 1.5592979127134725, + "grad_norm": 1.8025965690612793, + "learning_rate": 2.4409036085869665e-06, + "loss": 0.1455, + "step": 3287 + }, + { + "epoch": 1.5597722960151803, + "grad_norm": 1.6995849609375, + "learning_rate": 2.4358759423419476e-06, + "loss": 0.1461, + "step": 3288 + }, + { + "epoch": 1.560246679316888, + "grad_norm": 1.7094231843948364, + "learning_rate": 2.43085274112451e-06, + "loss": 0.1504, + "step": 3289 + }, + { + "epoch": 1.5607210626185957, + "grad_norm": 1.381062626838684, + "learning_rate": 2.425834007899799e-06, + "loss": 0.1279, + "step": 3290 + }, + { + "epoch": 1.5611954459203035, + "grad_norm": 1.921175241470337, + "learning_rate": 2.420819745630326e-06, + "loss": 0.1932, + "step": 3291 + }, + { + "epoch": 1.5616698292220113, + "grad_norm": 1.5312777757644653, + "learning_rate": 2.4158099572759564e-06, + "loss": 0.1378, + "step": 3292 + }, + { + "epoch": 1.562144212523719, + "grad_norm": 2.1276259422302246, + "learning_rate": 2.4108046457939215e-06, + "loss": 0.1814, + "step": 3293 + }, + { + "epoch": 1.5626185958254268, + "grad_norm": 1.3188360929489136, + "learning_rate": 2.405803814138804e-06, + "loss": 0.1254, + "step": 3294 + }, + { + "epoch": 1.5630929791271346, + "grad_norm": 1.5824406147003174, + "learning_rate": 2.40080746526255e-06, + "loss": 0.1318, + "step": 3295 + }, + { + "epoch": 1.5635673624288424, + "grad_norm": 1.6440813541412354, + "learning_rate": 2.3958156021144495e-06, + "loss": 0.1395, + "step": 3296 + }, + { + "epoch": 1.5640417457305502, + "grad_norm": 1.85137140750885, + "learning_rate": 2.390828227641152e-06, + "loss": 0.171, + "step": 3297 + }, + { + "epoch": 1.564516129032258, + "grad_norm": 1.7361198663711548, + "learning_rate": 2.385845344786656e-06, + "loss": 0.1485, + "step": 3298 + }, + { + "epoch": 1.5649905123339658, + "grad_norm": 1.779136061668396, + "learning_rate": 2.380866956492307e-06, + "loss": 0.1833, + "step": 3299 + }, + { + "epoch": 1.5654648956356736, + "grad_norm": 1.568875789642334, + "learning_rate": 2.3758930656968025e-06, + "loss": 0.144, + "step": 3300 + }, + { + "epoch": 1.5659392789373814, + "grad_norm": 1.6992019414901733, + "learning_rate": 2.3709236753361777e-06, + "loss": 0.1576, + "step": 3301 + }, + { + "epoch": 1.5664136622390892, + "grad_norm": 1.7569388151168823, + "learning_rate": 2.3659587883438106e-06, + "loss": 0.1876, + "step": 3302 + }, + { + "epoch": 1.566888045540797, + "grad_norm": 1.6685066223144531, + "learning_rate": 2.36099840765043e-06, + "loss": 0.1738, + "step": 3303 + }, + { + "epoch": 1.5673624288425048, + "grad_norm": 1.763706922531128, + "learning_rate": 2.3560425361840976e-06, + "loss": 0.1597, + "step": 3304 + }, + { + "epoch": 1.5678368121442126, + "grad_norm": 1.954229712486267, + "learning_rate": 2.3510911768702184e-06, + "loss": 0.1906, + "step": 3305 + }, + { + "epoch": 1.5683111954459203, + "grad_norm": 1.7269539833068848, + "learning_rate": 2.346144332631526e-06, + "loss": 0.1543, + "step": 3306 + }, + { + "epoch": 1.5687855787476281, + "grad_norm": 1.9077547788619995, + "learning_rate": 2.3412020063880957e-06, + "loss": 0.1752, + "step": 3307 + }, + { + "epoch": 1.569259962049336, + "grad_norm": 1.3996086120605469, + "learning_rate": 2.336264201057333e-06, + "loss": 0.1241, + "step": 3308 + }, + { + "epoch": 1.5697343453510437, + "grad_norm": 1.5195519924163818, + "learning_rate": 2.331330919553981e-06, + "loss": 0.1282, + "step": 3309 + }, + { + "epoch": 1.5702087286527515, + "grad_norm": 1.725934624671936, + "learning_rate": 2.3264021647901014e-06, + "loss": 0.1716, + "step": 3310 + }, + { + "epoch": 1.5706831119544593, + "grad_norm": 1.739709734916687, + "learning_rate": 2.3214779396750885e-06, + "loss": 0.149, + "step": 3311 + }, + { + "epoch": 1.571157495256167, + "grad_norm": 1.6437108516693115, + "learning_rate": 2.3165582471156643e-06, + "loss": 0.1608, + "step": 3312 + }, + { + "epoch": 1.571631878557875, + "grad_norm": 2.1839261054992676, + "learning_rate": 2.311643090015877e-06, + "loss": 0.1525, + "step": 3313 + }, + { + "epoch": 1.5721062618595827, + "grad_norm": 1.8370964527130127, + "learning_rate": 2.3067324712770967e-06, + "loss": 0.1484, + "step": 3314 + }, + { + "epoch": 1.5725806451612905, + "grad_norm": 1.8021410703659058, + "learning_rate": 2.301826393798008e-06, + "loss": 0.1618, + "step": 3315 + }, + { + "epoch": 1.573055028462998, + "grad_norm": 1.7016773223876953, + "learning_rate": 2.296924860474621e-06, + "loss": 0.137, + "step": 3316 + }, + { + "epoch": 1.5735294117647058, + "grad_norm": 1.573151707649231, + "learning_rate": 2.2920278742002677e-06, + "loss": 0.1449, + "step": 3317 + }, + { + "epoch": 1.5740037950664136, + "grad_norm": 1.4972699880599976, + "learning_rate": 2.287135437865583e-06, + "loss": 0.1369, + "step": 3318 + }, + { + "epoch": 1.5744781783681214, + "grad_norm": 1.9173699617385864, + "learning_rate": 2.282247554358531e-06, + "loss": 0.1329, + "step": 3319 + }, + { + "epoch": 1.5749525616698292, + "grad_norm": 1.7459148168563843, + "learning_rate": 2.2773642265643734e-06, + "loss": 0.1782, + "step": 3320 + }, + { + "epoch": 1.575426944971537, + "grad_norm": 1.479753851890564, + "learning_rate": 2.272485457365695e-06, + "loss": 0.1241, + "step": 3321 + }, + { + "epoch": 1.5759013282732448, + "grad_norm": 1.529350996017456, + "learning_rate": 2.267611249642383e-06, + "loss": 0.1352, + "step": 3322 + }, + { + "epoch": 1.5763757115749526, + "grad_norm": 2.032287120819092, + "learning_rate": 2.2627416062716366e-06, + "loss": 0.1704, + "step": 3323 + }, + { + "epoch": 1.5768500948766604, + "grad_norm": 1.399459719657898, + "learning_rate": 2.257876530127958e-06, + "loss": 0.1306, + "step": 3324 + }, + { + "epoch": 1.577324478178368, + "grad_norm": 1.9230834245681763, + "learning_rate": 2.25301602408315e-06, + "loss": 0.2053, + "step": 3325 + }, + { + "epoch": 1.5777988614800758, + "grad_norm": 1.5427366495132446, + "learning_rate": 2.248160091006326e-06, + "loss": 0.137, + "step": 3326 + }, + { + "epoch": 1.5782732447817835, + "grad_norm": 1.7453058958053589, + "learning_rate": 2.243308733763889e-06, + "loss": 0.1422, + "step": 3327 + }, + { + "epoch": 1.5787476280834913, + "grad_norm": 1.7460287809371948, + "learning_rate": 2.2384619552195518e-06, + "loss": 0.1742, + "step": 3328 + }, + { + "epoch": 1.5792220113851991, + "grad_norm": 1.7111773490905762, + "learning_rate": 2.233619758234321e-06, + "loss": 0.1552, + "step": 3329 + }, + { + "epoch": 1.579696394686907, + "grad_norm": 1.5118764638900757, + "learning_rate": 2.2287821456664926e-06, + "loss": 0.1266, + "step": 3330 + }, + { + "epoch": 1.5801707779886147, + "grad_norm": 1.605214238166809, + "learning_rate": 2.2239491203716644e-06, + "loss": 0.1548, + "step": 3331 + }, + { + "epoch": 1.5806451612903225, + "grad_norm": 1.5546764135360718, + "learning_rate": 2.2191206852027225e-06, + "loss": 0.1369, + "step": 3332 + }, + { + "epoch": 1.5811195445920303, + "grad_norm": 1.529728889465332, + "learning_rate": 2.214296843009848e-06, + "loss": 0.1205, + "step": 3333 + }, + { + "epoch": 1.581593927893738, + "grad_norm": 1.7725614309310913, + "learning_rate": 2.2094775966405045e-06, + "loss": 0.1509, + "step": 3334 + }, + { + "epoch": 1.5820683111954459, + "grad_norm": 1.8418389558792114, + "learning_rate": 2.2046629489394422e-06, + "loss": 0.1339, + "step": 3335 + }, + { + "epoch": 1.5825426944971537, + "grad_norm": 1.8874013423919678, + "learning_rate": 2.199852902748704e-06, + "loss": 0.1873, + "step": 3336 + }, + { + "epoch": 1.5830170777988615, + "grad_norm": 1.5304529666900635, + "learning_rate": 2.19504746090761e-06, + "loss": 0.1311, + "step": 3337 + }, + { + "epoch": 1.5834914611005693, + "grad_norm": 2.1148695945739746, + "learning_rate": 2.19024662625277e-06, + "loss": 0.1458, + "step": 3338 + }, + { + "epoch": 1.583965844402277, + "grad_norm": 1.3242015838623047, + "learning_rate": 2.185450401618062e-06, + "loss": 0.1238, + "step": 3339 + }, + { + "epoch": 1.5844402277039848, + "grad_norm": 2.047175645828247, + "learning_rate": 2.1806587898346553e-06, + "loss": 0.1712, + "step": 3340 + }, + { + "epoch": 1.5849146110056926, + "grad_norm": 1.6021418571472168, + "learning_rate": 2.17587179373099e-06, + "loss": 0.1615, + "step": 3341 + }, + { + "epoch": 1.5853889943074004, + "grad_norm": 1.975977897644043, + "learning_rate": 2.1710894161327813e-06, + "loss": 0.1399, + "step": 3342 + }, + { + "epoch": 1.5858633776091082, + "grad_norm": 1.3944038152694702, + "learning_rate": 2.1663116598630207e-06, + "loss": 0.1451, + "step": 3343 + }, + { + "epoch": 1.586337760910816, + "grad_norm": 1.7378476858139038, + "learning_rate": 2.1615385277419687e-06, + "loss": 0.1514, + "step": 3344 + }, + { + "epoch": 1.5868121442125238, + "grad_norm": 1.55166494846344, + "learning_rate": 2.156770022587157e-06, + "loss": 0.1545, + "step": 3345 + }, + { + "epoch": 1.5872865275142316, + "grad_norm": 2.1811869144439697, + "learning_rate": 2.1520061472133903e-06, + "loss": 0.1508, + "step": 3346 + }, + { + "epoch": 1.5877609108159394, + "grad_norm": 1.4423187971115112, + "learning_rate": 2.147246904432735e-06, + "loss": 0.1401, + "step": 3347 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 1.590010166168213, + "learning_rate": 2.1424922970545283e-06, + "loss": 0.1466, + "step": 3348 + }, + { + "epoch": 1.588709677419355, + "grad_norm": 1.5334136486053467, + "learning_rate": 2.1377423278853627e-06, + "loss": 0.1193, + "step": 3349 + }, + { + "epoch": 1.5891840607210628, + "grad_norm": 1.605485200881958, + "learning_rate": 2.1329969997291035e-06, + "loss": 0.1445, + "step": 3350 + }, + { + "epoch": 1.5896584440227703, + "grad_norm": 1.5612473487854004, + "learning_rate": 2.128256315386865e-06, + "loss": 0.1313, + "step": 3351 + }, + { + "epoch": 1.5901328273244781, + "grad_norm": 1.702793002128601, + "learning_rate": 2.1235202776570297e-06, + "loss": 0.158, + "step": 3352 + }, + { + "epoch": 1.590607210626186, + "grad_norm": 1.6103143692016602, + "learning_rate": 2.118788889335236e-06, + "loss": 0.1707, + "step": 3353 + }, + { + "epoch": 1.5910815939278937, + "grad_norm": 1.7173638343811035, + "learning_rate": 2.11406215321437e-06, + "loss": 0.1728, + "step": 3354 + }, + { + "epoch": 1.5915559772296015, + "grad_norm": 1.4429662227630615, + "learning_rate": 2.1093400720845813e-06, + "loss": 0.1154, + "step": 3355 + }, + { + "epoch": 1.5920303605313093, + "grad_norm": 1.5256520509719849, + "learning_rate": 2.1046226487332655e-06, + "loss": 0.1471, + "step": 3356 + }, + { + "epoch": 1.592504743833017, + "grad_norm": 1.6516417264938354, + "learning_rate": 2.099909885945075e-06, + "loss": 0.1527, + "step": 3357 + }, + { + "epoch": 1.592979127134725, + "grad_norm": 1.8592281341552734, + "learning_rate": 2.0952017865019036e-06, + "loss": 0.1506, + "step": 3358 + }, + { + "epoch": 1.5934535104364327, + "grad_norm": 1.5824545621871948, + "learning_rate": 2.0904983531828947e-06, + "loss": 0.1468, + "step": 3359 + }, + { + "epoch": 1.5939278937381403, + "grad_norm": 1.5531175136566162, + "learning_rate": 2.085799588764439e-06, + "loss": 0.1386, + "step": 3360 + }, + { + "epoch": 1.594402277039848, + "grad_norm": 1.6820399761199951, + "learning_rate": 2.081105496020173e-06, + "loss": 0.157, + "step": 3361 + }, + { + "epoch": 1.5948766603415558, + "grad_norm": 1.3576366901397705, + "learning_rate": 2.076416077720973e-06, + "loss": 0.1285, + "step": 3362 + }, + { + "epoch": 1.5953510436432636, + "grad_norm": 1.8949618339538574, + "learning_rate": 2.0717313366349534e-06, + "loss": 0.1585, + "step": 3363 + }, + { + "epoch": 1.5958254269449714, + "grad_norm": 1.572543978691101, + "learning_rate": 2.067051275527472e-06, + "loss": 0.1509, + "step": 3364 + }, + { + "epoch": 1.5962998102466792, + "grad_norm": 1.6558380126953125, + "learning_rate": 2.0623758971611252e-06, + "loss": 0.1409, + "step": 3365 + }, + { + "epoch": 1.596774193548387, + "grad_norm": 1.892311930656433, + "learning_rate": 2.0577052042957378e-06, + "loss": 0.189, + "step": 3366 + }, + { + "epoch": 1.5972485768500948, + "grad_norm": 1.8912239074707031, + "learning_rate": 2.0530391996883782e-06, + "loss": 0.15, + "step": 3367 + }, + { + "epoch": 1.5977229601518026, + "grad_norm": 1.8770484924316406, + "learning_rate": 2.0483778860933377e-06, + "loss": 0.1802, + "step": 3368 + }, + { + "epoch": 1.5981973434535104, + "grad_norm": 1.7015761137008667, + "learning_rate": 2.0437212662621477e-06, + "loss": 0.1437, + "step": 3369 + }, + { + "epoch": 1.5986717267552182, + "grad_norm": 1.6395004987716675, + "learning_rate": 2.0390693429435626e-06, + "loss": 0.1361, + "step": 3370 + }, + { + "epoch": 1.599146110056926, + "grad_norm": 1.7334997653961182, + "learning_rate": 2.0344221188835667e-06, + "loss": 0.1372, + "step": 3371 + }, + { + "epoch": 1.5996204933586338, + "grad_norm": 1.7484663724899292, + "learning_rate": 2.0297795968253753e-06, + "loss": 0.173, + "step": 3372 + }, + { + "epoch": 1.6000948766603416, + "grad_norm": 1.5254170894622803, + "learning_rate": 2.0251417795094166e-06, + "loss": 0.1516, + "step": 3373 + }, + { + "epoch": 1.6005692599620494, + "grad_norm": 1.8212133646011353, + "learning_rate": 2.020508669673352e-06, + "loss": 0.151, + "step": 3374 + }, + { + "epoch": 1.6010436432637571, + "grad_norm": 1.369913101196289, + "learning_rate": 2.0158802700520576e-06, + "loss": 0.1338, + "step": 3375 + }, + { + "epoch": 1.601518026565465, + "grad_norm": 1.2662460803985596, + "learning_rate": 2.0112565833776364e-06, + "loss": 0.1228, + "step": 3376 + }, + { + "epoch": 1.6019924098671727, + "grad_norm": 1.7709726095199585, + "learning_rate": 2.0066376123793984e-06, + "loss": 0.1726, + "step": 3377 + }, + { + "epoch": 1.6024667931688805, + "grad_norm": 1.5085747241973877, + "learning_rate": 2.0020233597838813e-06, + "loss": 0.1034, + "step": 3378 + }, + { + "epoch": 1.6029411764705883, + "grad_norm": 1.3893810510635376, + "learning_rate": 1.99741382831483e-06, + "loss": 0.131, + "step": 3379 + }, + { + "epoch": 1.603415559772296, + "grad_norm": 2.0741636753082275, + "learning_rate": 1.9928090206932083e-06, + "loss": 0.2064, + "step": 3380 + }, + { + "epoch": 1.603889943074004, + "grad_norm": 1.5764415264129639, + "learning_rate": 1.9882089396371896e-06, + "loss": 0.1558, + "step": 3381 + }, + { + "epoch": 1.6043643263757117, + "grad_norm": 1.7845758199691772, + "learning_rate": 1.983613587862153e-06, + "loss": 0.1599, + "step": 3382 + }, + { + "epoch": 1.6048387096774195, + "grad_norm": 1.610253095626831, + "learning_rate": 1.9790229680806883e-06, + "loss": 0.1682, + "step": 3383 + }, + { + "epoch": 1.6053130929791273, + "grad_norm": 1.7443877458572388, + "learning_rate": 1.9744370830025937e-06, + "loss": 0.155, + "step": 3384 + }, + { + "epoch": 1.605787476280835, + "grad_norm": 1.7004719972610474, + "learning_rate": 1.9698559353348735e-06, + "loss": 0.1436, + "step": 3385 + }, + { + "epoch": 1.6062618595825426, + "grad_norm": 1.878106713294983, + "learning_rate": 1.9652795277817348e-06, + "loss": 0.181, + "step": 3386 + }, + { + "epoch": 1.6067362428842504, + "grad_norm": 1.6228288412094116, + "learning_rate": 1.960707863044582e-06, + "loss": 0.1511, + "step": 3387 + }, + { + "epoch": 1.6072106261859582, + "grad_norm": 1.7127445936203003, + "learning_rate": 1.9561409438220245e-06, + "loss": 0.1445, + "step": 3388 + }, + { + "epoch": 1.607685009487666, + "grad_norm": 1.5435030460357666, + "learning_rate": 1.9515787728098733e-06, + "loss": 0.1298, + "step": 3389 + }, + { + "epoch": 1.6081593927893738, + "grad_norm": 1.4933137893676758, + "learning_rate": 1.9470213527011282e-06, + "loss": 0.1415, + "step": 3390 + }, + { + "epoch": 1.6086337760910816, + "grad_norm": 1.9454411268234253, + "learning_rate": 1.9424686861859933e-06, + "loss": 0.174, + "step": 3391 + }, + { + "epoch": 1.6091081593927894, + "grad_norm": 1.5008386373519897, + "learning_rate": 1.937920775951857e-06, + "loss": 0.1465, + "step": 3392 + }, + { + "epoch": 1.6095825426944972, + "grad_norm": 1.5946168899536133, + "learning_rate": 1.9333776246833092e-06, + "loss": 0.1379, + "step": 3393 + }, + { + "epoch": 1.610056925996205, + "grad_norm": 1.4043335914611816, + "learning_rate": 1.9288392350621275e-06, + "loss": 0.1265, + "step": 3394 + }, + { + "epoch": 1.6105313092979125, + "grad_norm": 1.7210297584533691, + "learning_rate": 1.9243056097672796e-06, + "loss": 0.1668, + "step": 3395 + }, + { + "epoch": 1.6110056925996203, + "grad_norm": 1.7458949089050293, + "learning_rate": 1.9197767514749156e-06, + "loss": 0.16, + "step": 3396 + }, + { + "epoch": 1.6114800759013281, + "grad_norm": 1.8597474098205566, + "learning_rate": 1.915252662858378e-06, + "loss": 0.1811, + "step": 3397 + }, + { + "epoch": 1.611954459203036, + "grad_norm": 1.412137508392334, + "learning_rate": 1.910733346588194e-06, + "loss": 0.1204, + "step": 3398 + }, + { + "epoch": 1.6124288425047437, + "grad_norm": 1.6285221576690674, + "learning_rate": 1.9062188053320663e-06, + "loss": 0.1567, + "step": 3399 + }, + { + "epoch": 1.6129032258064515, + "grad_norm": 2.3471789360046387, + "learning_rate": 1.901709041754889e-06, + "loss": 0.1562, + "step": 3400 + }, + { + "epoch": 1.6133776091081593, + "grad_norm": 1.5829018354415894, + "learning_rate": 1.8972040585187256e-06, + "loss": 0.1545, + "step": 3401 + }, + { + "epoch": 1.613851992409867, + "grad_norm": 1.5271830558776855, + "learning_rate": 1.8927038582828261e-06, + "loss": 0.1294, + "step": 3402 + }, + { + "epoch": 1.614326375711575, + "grad_norm": 1.8951029777526855, + "learning_rate": 1.8882084437036142e-06, + "loss": 0.1618, + "step": 3403 + }, + { + "epoch": 1.6148007590132827, + "grad_norm": 1.707849383354187, + "learning_rate": 1.8837178174346882e-06, + "loss": 0.1544, + "step": 3404 + }, + { + "epoch": 1.6152751423149905, + "grad_norm": 1.8298168182373047, + "learning_rate": 1.8792319821268223e-06, + "loss": 0.1755, + "step": 3405 + }, + { + "epoch": 1.6157495256166983, + "grad_norm": 1.6558297872543335, + "learning_rate": 1.8747509404279595e-06, + "loss": 0.1473, + "step": 3406 + }, + { + "epoch": 1.616223908918406, + "grad_norm": 1.5755760669708252, + "learning_rate": 1.8702746949832117e-06, + "loss": 0.1447, + "step": 3407 + }, + { + "epoch": 1.6166982922201139, + "grad_norm": 1.9705390930175781, + "learning_rate": 1.8658032484348632e-06, + "loss": 0.1738, + "step": 3408 + }, + { + "epoch": 1.6171726755218216, + "grad_norm": 2.1337902545928955, + "learning_rate": 1.8613366034223668e-06, + "loss": 0.2303, + "step": 3409 + }, + { + "epoch": 1.6176470588235294, + "grad_norm": 1.5434858798980713, + "learning_rate": 1.8568747625823403e-06, + "loss": 0.1602, + "step": 3410 + }, + { + "epoch": 1.6181214421252372, + "grad_norm": 2.1457488536834717, + "learning_rate": 1.8524177285485588e-06, + "loss": 0.1905, + "step": 3411 + }, + { + "epoch": 1.618595825426945, + "grad_norm": 1.500415563583374, + "learning_rate": 1.8479655039519683e-06, + "loss": 0.1368, + "step": 3412 + }, + { + "epoch": 1.6190702087286528, + "grad_norm": 1.6900932788848877, + "learning_rate": 1.8435180914206763e-06, + "loss": 0.1541, + "step": 3413 + }, + { + "epoch": 1.6195445920303606, + "grad_norm": 1.5415065288543701, + "learning_rate": 1.8390754935799404e-06, + "loss": 0.1665, + "step": 3414 + }, + { + "epoch": 1.6200189753320684, + "grad_norm": 1.7073814868927002, + "learning_rate": 1.8346377130521864e-06, + "loss": 0.1549, + "step": 3415 + }, + { + "epoch": 1.6204933586337762, + "grad_norm": 1.7535865306854248, + "learning_rate": 1.8302047524569888e-06, + "loss": 0.1497, + "step": 3416 + }, + { + "epoch": 1.620967741935484, + "grad_norm": 1.7559723854064941, + "learning_rate": 1.8257766144110823e-06, + "loss": 0.1579, + "step": 3417 + }, + { + "epoch": 1.6214421252371918, + "grad_norm": 1.74555242061615, + "learning_rate": 1.8213533015283524e-06, + "loss": 0.1379, + "step": 3418 + }, + { + "epoch": 1.6219165085388996, + "grad_norm": 2.0303471088409424, + "learning_rate": 1.8169348164198408e-06, + "loss": 0.1545, + "step": 3419 + }, + { + "epoch": 1.6223908918406074, + "grad_norm": 1.6806904077529907, + "learning_rate": 1.8125211616937289e-06, + "loss": 0.1171, + "step": 3420 + }, + { + "epoch": 1.6228652751423152, + "grad_norm": 1.7993685007095337, + "learning_rate": 1.8081123399553569e-06, + "loss": 0.1789, + "step": 3421 + }, + { + "epoch": 1.6233396584440227, + "grad_norm": 1.8345065116882324, + "learning_rate": 1.8037083538072109e-06, + "loss": 0.16, + "step": 3422 + }, + { + "epoch": 1.6238140417457305, + "grad_norm": 1.875479817390442, + "learning_rate": 1.7993092058489158e-06, + "loss": 0.1429, + "step": 3423 + }, + { + "epoch": 1.6242884250474383, + "grad_norm": 1.7342585325241089, + "learning_rate": 1.794914898677249e-06, + "loss": 0.1624, + "step": 3424 + }, + { + "epoch": 1.624762808349146, + "grad_norm": 1.4515552520751953, + "learning_rate": 1.7905254348861235e-06, + "loss": 0.1347, + "step": 3425 + }, + { + "epoch": 1.625237191650854, + "grad_norm": 1.9419041872024536, + "learning_rate": 1.7861408170665961e-06, + "loss": 0.1561, + "step": 3426 + }, + { + "epoch": 1.6257115749525617, + "grad_norm": 2.1055121421813965, + "learning_rate": 1.7817610478068659e-06, + "loss": 0.1867, + "step": 3427 + }, + { + "epoch": 1.6261859582542695, + "grad_norm": 1.6690064668655396, + "learning_rate": 1.7773861296922657e-06, + "loss": 0.1776, + "step": 3428 + }, + { + "epoch": 1.6266603415559773, + "grad_norm": 2.282166004180908, + "learning_rate": 1.7730160653052685e-06, + "loss": 0.205, + "step": 3429 + }, + { + "epoch": 1.6271347248576848, + "grad_norm": 1.7570334672927856, + "learning_rate": 1.7686508572254802e-06, + "loss": 0.1546, + "step": 3430 + }, + { + "epoch": 1.6276091081593926, + "grad_norm": 1.8625390529632568, + "learning_rate": 1.7642905080296346e-06, + "loss": 0.1668, + "step": 3431 + }, + { + "epoch": 1.6280834914611004, + "grad_norm": 1.4064631462097168, + "learning_rate": 1.7599350202916066e-06, + "loss": 0.1402, + "step": 3432 + }, + { + "epoch": 1.6285578747628082, + "grad_norm": 1.3597134351730347, + "learning_rate": 1.7555843965823992e-06, + "loss": 0.1314, + "step": 3433 + }, + { + "epoch": 1.629032258064516, + "grad_norm": 1.76325523853302, + "learning_rate": 1.7512386394701386e-06, + "loss": 0.1729, + "step": 3434 + }, + { + "epoch": 1.6295066413662238, + "grad_norm": 1.6423853635787964, + "learning_rate": 1.7468977515200835e-06, + "loss": 0.1287, + "step": 3435 + }, + { + "epoch": 1.6299810246679316, + "grad_norm": 1.6672308444976807, + "learning_rate": 1.7425617352946178e-06, + "loss": 0.1701, + "step": 3436 + }, + { + "epoch": 1.6304554079696394, + "grad_norm": 1.6599950790405273, + "learning_rate": 1.7382305933532494e-06, + "loss": 0.1618, + "step": 3437 + }, + { + "epoch": 1.6309297912713472, + "grad_norm": 1.5585594177246094, + "learning_rate": 1.7339043282526103e-06, + "loss": 0.149, + "step": 3438 + }, + { + "epoch": 1.631404174573055, + "grad_norm": 1.8305972814559937, + "learning_rate": 1.7295829425464494e-06, + "loss": 0.1789, + "step": 3439 + }, + { + "epoch": 1.6318785578747628, + "grad_norm": 1.763118028640747, + "learning_rate": 1.7252664387856367e-06, + "loss": 0.1734, + "step": 3440 + }, + { + "epoch": 1.6323529411764706, + "grad_norm": 1.743260383605957, + "learning_rate": 1.7209548195181625e-06, + "loss": 0.1609, + "step": 3441 + }, + { + "epoch": 1.6328273244781784, + "grad_norm": 1.892848014831543, + "learning_rate": 1.7166480872891333e-06, + "loss": 0.1606, + "step": 3442 + }, + { + "epoch": 1.6333017077798861, + "grad_norm": 1.9057481288909912, + "learning_rate": 1.7123462446407746e-06, + "loss": 0.1659, + "step": 3443 + }, + { + "epoch": 1.633776091081594, + "grad_norm": 1.7405368089675903, + "learning_rate": 1.7080492941124139e-06, + "loss": 0.1593, + "step": 3444 + }, + { + "epoch": 1.6342504743833017, + "grad_norm": 1.713452696800232, + "learning_rate": 1.7037572382405031e-06, + "loss": 0.1283, + "step": 3445 + }, + { + "epoch": 1.6347248576850095, + "grad_norm": 1.595138430595398, + "learning_rate": 1.6994700795586027e-06, + "loss": 0.1319, + "step": 3446 + }, + { + "epoch": 1.6351992409867173, + "grad_norm": 1.756667137145996, + "learning_rate": 1.6951878205973738e-06, + "loss": 0.1677, + "step": 3447 + }, + { + "epoch": 1.635673624288425, + "grad_norm": 1.6868460178375244, + "learning_rate": 1.6909104638845986e-06, + "loss": 0.1546, + "step": 3448 + }, + { + "epoch": 1.636148007590133, + "grad_norm": 1.5798834562301636, + "learning_rate": 1.686638011945151e-06, + "loss": 0.1525, + "step": 3449 + }, + { + "epoch": 1.6366223908918407, + "grad_norm": 1.3165028095245361, + "learning_rate": 1.682370467301021e-06, + "loss": 0.1139, + "step": 3450 + }, + { + "epoch": 1.6370967741935485, + "grad_norm": 1.6068346500396729, + "learning_rate": 1.6781078324712973e-06, + "loss": 0.1436, + "step": 3451 + }, + { + "epoch": 1.6375711574952563, + "grad_norm": 1.4109083414077759, + "learning_rate": 1.6738501099721737e-06, + "loss": 0.1366, + "step": 3452 + }, + { + "epoch": 1.638045540796964, + "grad_norm": 1.7669192552566528, + "learning_rate": 1.6695973023169375e-06, + "loss": 0.1751, + "step": 3453 + }, + { + "epoch": 1.6385199240986719, + "grad_norm": 1.5503531694412231, + "learning_rate": 1.6653494120159842e-06, + "loss": 0.1462, + "step": 3454 + }, + { + "epoch": 1.6389943074003797, + "grad_norm": 1.9344103336334229, + "learning_rate": 1.6611064415767941e-06, + "loss": 0.2062, + "step": 3455 + }, + { + "epoch": 1.6394686907020875, + "grad_norm": 1.5301547050476074, + "learning_rate": 1.6568683935039554e-06, + "loss": 0.1375, + "step": 3456 + }, + { + "epoch": 1.639943074003795, + "grad_norm": 1.9043666124343872, + "learning_rate": 1.6526352702991478e-06, + "loss": 0.199, + "step": 3457 + }, + { + "epoch": 1.6404174573055028, + "grad_norm": 1.684772253036499, + "learning_rate": 1.6484070744611358e-06, + "loss": 0.1337, + "step": 3458 + }, + { + "epoch": 1.6408918406072106, + "grad_norm": 1.598937749862671, + "learning_rate": 1.6441838084857863e-06, + "loss": 0.1523, + "step": 3459 + }, + { + "epoch": 1.6413662239089184, + "grad_norm": 1.815584421157837, + "learning_rate": 1.6399654748660498e-06, + "loss": 0.1498, + "step": 3460 + }, + { + "epoch": 1.6418406072106262, + "grad_norm": 1.7786113023757935, + "learning_rate": 1.6357520760919675e-06, + "loss": 0.1683, + "step": 3461 + }, + { + "epoch": 1.642314990512334, + "grad_norm": 1.3854840993881226, + "learning_rate": 1.6315436146506702e-06, + "loss": 0.1122, + "step": 3462 + }, + { + "epoch": 1.6427893738140418, + "grad_norm": 1.40729558467865, + "learning_rate": 1.6273400930263672e-06, + "loss": 0.1333, + "step": 3463 + }, + { + "epoch": 1.6432637571157496, + "grad_norm": 1.844829797744751, + "learning_rate": 1.6231415137003536e-06, + "loss": 0.145, + "step": 3464 + }, + { + "epoch": 1.6437381404174574, + "grad_norm": 2.1833529472351074, + "learning_rate": 1.6189478791510116e-06, + "loss": 0.189, + "step": 3465 + }, + { + "epoch": 1.644212523719165, + "grad_norm": 1.6294044256210327, + "learning_rate": 1.614759191853803e-06, + "loss": 0.1264, + "step": 3466 + }, + { + "epoch": 1.6446869070208727, + "grad_norm": 1.6471272706985474, + "learning_rate": 1.6105754542812702e-06, + "loss": 0.1396, + "step": 3467 + }, + { + "epoch": 1.6451612903225805, + "grad_norm": 1.920088768005371, + "learning_rate": 1.6063966689030275e-06, + "loss": 0.1704, + "step": 3468 + }, + { + "epoch": 1.6456356736242883, + "grad_norm": 1.8987756967544556, + "learning_rate": 1.6022228381857729e-06, + "loss": 0.1184, + "step": 3469 + }, + { + "epoch": 1.646110056925996, + "grad_norm": 1.6714553833007812, + "learning_rate": 1.5980539645932802e-06, + "loss": 0.1568, + "step": 3470 + }, + { + "epoch": 1.646584440227704, + "grad_norm": 1.6025445461273193, + "learning_rate": 1.5938900505863886e-06, + "loss": 0.143, + "step": 3471 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 1.8814153671264648, + "learning_rate": 1.5897310986230196e-06, + "loss": 0.1493, + "step": 3472 + }, + { + "epoch": 1.6475332068311195, + "grad_norm": 1.7132648229599, + "learning_rate": 1.5855771111581586e-06, + "loss": 0.1695, + "step": 3473 + }, + { + "epoch": 1.6480075901328273, + "grad_norm": 1.6808983087539673, + "learning_rate": 1.5814280906438639e-06, + "loss": 0.1711, + "step": 3474 + }, + { + "epoch": 1.648481973434535, + "grad_norm": 2.184779405593872, + "learning_rate": 1.5772840395292632e-06, + "loss": 0.1722, + "step": 3475 + }, + { + "epoch": 1.6489563567362429, + "grad_norm": 1.5792295932769775, + "learning_rate": 1.5731449602605487e-06, + "loss": 0.152, + "step": 3476 + }, + { + "epoch": 1.6494307400379506, + "grad_norm": 1.4124369621276855, + "learning_rate": 1.5690108552809746e-06, + "loss": 0.1496, + "step": 3477 + }, + { + "epoch": 1.6499051233396584, + "grad_norm": 1.8069303035736084, + "learning_rate": 1.5648817270308648e-06, + "loss": 0.1382, + "step": 3478 + }, + { + "epoch": 1.6503795066413662, + "grad_norm": 1.832484245300293, + "learning_rate": 1.5607575779476047e-06, + "loss": 0.148, + "step": 3479 + }, + { + "epoch": 1.650853889943074, + "grad_norm": 2.0292952060699463, + "learning_rate": 1.556638410465635e-06, + "loss": 0.1711, + "step": 3480 + }, + { + "epoch": 1.6513282732447818, + "grad_norm": 1.6780633926391602, + "learning_rate": 1.552524227016462e-06, + "loss": 0.1237, + "step": 3481 + }, + { + "epoch": 1.6518026565464896, + "grad_norm": 1.6623648405075073, + "learning_rate": 1.548415030028645e-06, + "loss": 0.1589, + "step": 3482 + }, + { + "epoch": 1.6522770398481974, + "grad_norm": 1.652055263519287, + "learning_rate": 1.5443108219278036e-06, + "loss": 0.1615, + "step": 3483 + }, + { + "epoch": 1.6527514231499052, + "grad_norm": 1.6934820413589478, + "learning_rate": 1.5402116051366111e-06, + "loss": 0.1585, + "step": 3484 + }, + { + "epoch": 1.653225806451613, + "grad_norm": 1.5433070659637451, + "learning_rate": 1.5361173820747942e-06, + "loss": 0.1395, + "step": 3485 + }, + { + "epoch": 1.6537001897533208, + "grad_norm": 1.812530755996704, + "learning_rate": 1.5320281551591366e-06, + "loss": 0.1592, + "step": 3486 + }, + { + "epoch": 1.6541745730550286, + "grad_norm": 2.096843957901001, + "learning_rate": 1.5279439268034634e-06, + "loss": 0.1543, + "step": 3487 + }, + { + "epoch": 1.6546489563567364, + "grad_norm": 1.825169563293457, + "learning_rate": 1.5238646994186546e-06, + "loss": 0.153, + "step": 3488 + }, + { + "epoch": 1.6551233396584442, + "grad_norm": 1.7507545948028564, + "learning_rate": 1.519790475412638e-06, + "loss": 0.1509, + "step": 3489 + }, + { + "epoch": 1.655597722960152, + "grad_norm": 1.6845128536224365, + "learning_rate": 1.515721257190389e-06, + "loss": 0.1752, + "step": 3490 + }, + { + "epoch": 1.6560721062618597, + "grad_norm": 1.793799638748169, + "learning_rate": 1.5116570471539294e-06, + "loss": 0.1689, + "step": 3491 + }, + { + "epoch": 1.6565464895635673, + "grad_norm": 1.5899111032485962, + "learning_rate": 1.5075978477023156e-06, + "loss": 0.1291, + "step": 3492 + }, + { + "epoch": 1.657020872865275, + "grad_norm": 1.5500413179397583, + "learning_rate": 1.5035436612316567e-06, + "loss": 0.1548, + "step": 3493 + }, + { + "epoch": 1.657495256166983, + "grad_norm": 1.3642367124557495, + "learning_rate": 1.4994944901351006e-06, + "loss": 0.1452, + "step": 3494 + }, + { + "epoch": 1.6579696394686907, + "grad_norm": 1.7189817428588867, + "learning_rate": 1.4954503368028305e-06, + "loss": 0.1784, + "step": 3495 + }, + { + "epoch": 1.6584440227703985, + "grad_norm": 1.5453788042068481, + "learning_rate": 1.4914112036220696e-06, + "loss": 0.1572, + "step": 3496 + }, + { + "epoch": 1.6589184060721063, + "grad_norm": 1.3796250820159912, + "learning_rate": 1.4873770929770782e-06, + "loss": 0.1385, + "step": 3497 + }, + { + "epoch": 1.659392789373814, + "grad_norm": 1.3950660228729248, + "learning_rate": 1.4833480072491524e-06, + "loss": 0.1264, + "step": 3498 + }, + { + "epoch": 1.6598671726755219, + "grad_norm": 1.7917977571487427, + "learning_rate": 1.4793239488166222e-06, + "loss": 0.1483, + "step": 3499 + }, + { + "epoch": 1.6603415559772297, + "grad_norm": 1.921996831893921, + "learning_rate": 1.4753049200548519e-06, + "loss": 0.2267, + "step": 3500 + }, + { + "epoch": 1.6608159392789372, + "grad_norm": 1.6829748153686523, + "learning_rate": 1.4712909233362304e-06, + "loss": 0.1554, + "step": 3501 + }, + { + "epoch": 1.661290322580645, + "grad_norm": 1.645816445350647, + "learning_rate": 1.4672819610301802e-06, + "loss": 0.1642, + "step": 3502 + }, + { + "epoch": 1.6617647058823528, + "grad_norm": 1.2264525890350342, + "learning_rate": 1.4632780355031573e-06, + "loss": 0.1073, + "step": 3503 + }, + { + "epoch": 1.6622390891840606, + "grad_norm": 1.5682717561721802, + "learning_rate": 1.459279149118632e-06, + "loss": 0.1596, + "step": 3504 + }, + { + "epoch": 1.6627134724857684, + "grad_norm": 1.7140616178512573, + "learning_rate": 1.455285304237114e-06, + "loss": 0.1499, + "step": 3505 + }, + { + "epoch": 1.6631878557874762, + "grad_norm": 2.096163749694824, + "learning_rate": 1.4512965032161242e-06, + "loss": 0.1381, + "step": 3506 + }, + { + "epoch": 1.663662239089184, + "grad_norm": 1.4052388668060303, + "learning_rate": 1.4473127484102157e-06, + "loss": 0.1269, + "step": 3507 + }, + { + "epoch": 1.6641366223908918, + "grad_norm": 1.667314052581787, + "learning_rate": 1.4433340421709597e-06, + "loss": 0.1574, + "step": 3508 + }, + { + "epoch": 1.6646110056925996, + "grad_norm": 1.8331180810928345, + "learning_rate": 1.4393603868469464e-06, + "loss": 0.1621, + "step": 3509 + }, + { + "epoch": 1.6650853889943074, + "grad_norm": 1.3572697639465332, + "learning_rate": 1.4353917847837883e-06, + "loss": 0.1127, + "step": 3510 + }, + { + "epoch": 1.6655597722960152, + "grad_norm": 1.5188183784484863, + "learning_rate": 1.4314282383241097e-06, + "loss": 0.1292, + "step": 3511 + }, + { + "epoch": 1.666034155597723, + "grad_norm": 1.5787996053695679, + "learning_rate": 1.4274697498075495e-06, + "loss": 0.1352, + "step": 3512 + }, + { + "epoch": 1.6665085388994307, + "grad_norm": 1.8923653364181519, + "learning_rate": 1.423516321570767e-06, + "loss": 0.1811, + "step": 3513 + }, + { + "epoch": 1.6669829222011385, + "grad_norm": 1.3783034086227417, + "learning_rate": 1.419567955947434e-06, + "loss": 0.1151, + "step": 3514 + }, + { + "epoch": 1.6674573055028463, + "grad_norm": 1.7358620166778564, + "learning_rate": 1.4156246552682274e-06, + "loss": 0.178, + "step": 3515 + }, + { + "epoch": 1.6679316888045541, + "grad_norm": 1.7342180013656616, + "learning_rate": 1.4116864218608416e-06, + "loss": 0.1696, + "step": 3516 + }, + { + "epoch": 1.668406072106262, + "grad_norm": 1.8265262842178345, + "learning_rate": 1.4077532580499753e-06, + "loss": 0.1312, + "step": 3517 + }, + { + "epoch": 1.6688804554079697, + "grad_norm": 1.7043464183807373, + "learning_rate": 1.4038251661573387e-06, + "loss": 0.1454, + "step": 3518 + }, + { + "epoch": 1.6693548387096775, + "grad_norm": 1.6283822059631348, + "learning_rate": 1.3999021485016429e-06, + "loss": 0.1431, + "step": 3519 + }, + { + "epoch": 1.6698292220113853, + "grad_norm": 1.8391207456588745, + "learning_rate": 1.3959842073986085e-06, + "loss": 0.1677, + "step": 3520 + }, + { + "epoch": 1.670303605313093, + "grad_norm": 1.350468635559082, + "learning_rate": 1.3920713451609535e-06, + "loss": 0.1179, + "step": 3521 + }, + { + "epoch": 1.6707779886148009, + "grad_norm": 2.141695976257324, + "learning_rate": 1.3881635640984048e-06, + "loss": 0.1947, + "step": 3522 + }, + { + "epoch": 1.6712523719165087, + "grad_norm": 1.6811950206756592, + "learning_rate": 1.384260866517686e-06, + "loss": 0.1595, + "step": 3523 + }, + { + "epoch": 1.6717267552182165, + "grad_norm": 1.6273670196533203, + "learning_rate": 1.3803632547225242e-06, + "loss": 0.1674, + "step": 3524 + }, + { + "epoch": 1.6722011385199242, + "grad_norm": 1.6808335781097412, + "learning_rate": 1.376470731013636e-06, + "loss": 0.1413, + "step": 3525 + }, + { + "epoch": 1.672675521821632, + "grad_norm": 1.9226058721542358, + "learning_rate": 1.372583297688741e-06, + "loss": 0.1372, + "step": 3526 + }, + { + "epoch": 1.6731499051233396, + "grad_norm": 1.719659686088562, + "learning_rate": 1.368700957042557e-06, + "loss": 0.1393, + "step": 3527 + }, + { + "epoch": 1.6736242884250474, + "grad_norm": 1.4148011207580566, + "learning_rate": 1.3648237113667839e-06, + "loss": 0.144, + "step": 3528 + }, + { + "epoch": 1.6740986717267552, + "grad_norm": 1.5885848999023438, + "learning_rate": 1.3609515629501279e-06, + "loss": 0.1342, + "step": 3529 + }, + { + "epoch": 1.674573055028463, + "grad_norm": 1.828921914100647, + "learning_rate": 1.3570845140782752e-06, + "loss": 0.1664, + "step": 3530 + }, + { + "epoch": 1.6750474383301708, + "grad_norm": 1.8899797201156616, + "learning_rate": 1.3532225670339095e-06, + "loss": 0.1795, + "step": 3531 + }, + { + "epoch": 1.6755218216318786, + "grad_norm": 1.7065174579620361, + "learning_rate": 1.3493657240966974e-06, + "loss": 0.1541, + "step": 3532 + }, + { + "epoch": 1.6759962049335864, + "grad_norm": 1.6612739562988281, + "learning_rate": 1.3455139875433e-06, + "loss": 0.1717, + "step": 3533 + }, + { + "epoch": 1.6764705882352942, + "grad_norm": 2.0222203731536865, + "learning_rate": 1.3416673596473528e-06, + "loss": 0.1909, + "step": 3534 + }, + { + "epoch": 1.676944971537002, + "grad_norm": 1.3110690116882324, + "learning_rate": 1.3378258426794888e-06, + "loss": 0.1174, + "step": 3535 + }, + { + "epoch": 1.6774193548387095, + "grad_norm": 1.338375449180603, + "learning_rate": 1.3339894389073104e-06, + "loss": 0.1189, + "step": 3536 + }, + { + "epoch": 1.6778937381404173, + "grad_norm": 1.7583532333374023, + "learning_rate": 1.3301581505954131e-06, + "loss": 0.174, + "step": 3537 + }, + { + "epoch": 1.678368121442125, + "grad_norm": 1.7339435815811157, + "learning_rate": 1.3263319800053698e-06, + "loss": 0.1568, + "step": 3538 + }, + { + "epoch": 1.678842504743833, + "grad_norm": 1.5173304080963135, + "learning_rate": 1.3225109293957272e-06, + "loss": 0.1379, + "step": 3539 + }, + { + "epoch": 1.6793168880455407, + "grad_norm": 1.5127456188201904, + "learning_rate": 1.3186950010220156e-06, + "loss": 0.1355, + "step": 3540 + }, + { + "epoch": 1.6797912713472485, + "grad_norm": 1.6670554876327515, + "learning_rate": 1.3148841971367387e-06, + "loss": 0.1832, + "step": 3541 + }, + { + "epoch": 1.6802656546489563, + "grad_norm": 1.6429007053375244, + "learning_rate": 1.3110785199893806e-06, + "loss": 0.1324, + "step": 3542 + }, + { + "epoch": 1.680740037950664, + "grad_norm": 1.5100871324539185, + "learning_rate": 1.3072779718263884e-06, + "loss": 0.1413, + "step": 3543 + }, + { + "epoch": 1.6812144212523719, + "grad_norm": 1.6783661842346191, + "learning_rate": 1.3034825548911944e-06, + "loss": 0.1182, + "step": 3544 + }, + { + "epoch": 1.6816888045540797, + "grad_norm": 1.708293080329895, + "learning_rate": 1.29969227142419e-06, + "loss": 0.1654, + "step": 3545 + }, + { + "epoch": 1.6821631878557874, + "grad_norm": 1.9683902263641357, + "learning_rate": 1.295907123662744e-06, + "loss": 0.1813, + "step": 3546 + }, + { + "epoch": 1.6826375711574952, + "grad_norm": 1.7525516748428345, + "learning_rate": 1.2921271138411927e-06, + "loss": 0.1453, + "step": 3547 + }, + { + "epoch": 1.683111954459203, + "grad_norm": 1.5966471433639526, + "learning_rate": 1.2883522441908403e-06, + "loss": 0.123, + "step": 3548 + }, + { + "epoch": 1.6835863377609108, + "grad_norm": 1.6996803283691406, + "learning_rate": 1.2845825169399506e-06, + "loss": 0.1626, + "step": 3549 + }, + { + "epoch": 1.6840607210626186, + "grad_norm": 1.50482177734375, + "learning_rate": 1.2808179343137583e-06, + "loss": 0.1239, + "step": 3550 + }, + { + "epoch": 1.6845351043643264, + "grad_norm": 1.7944713830947876, + "learning_rate": 1.2770584985344613e-06, + "loss": 0.1743, + "step": 3551 + }, + { + "epoch": 1.6850094876660342, + "grad_norm": 1.602755069732666, + "learning_rate": 1.2733042118212157e-06, + "loss": 0.1602, + "step": 3552 + }, + { + "epoch": 1.685483870967742, + "grad_norm": 1.5536739826202393, + "learning_rate": 1.2695550763901376e-06, + "loss": 0.1252, + "step": 3553 + }, + { + "epoch": 1.6859582542694498, + "grad_norm": 1.7096506357192993, + "learning_rate": 1.2658110944543055e-06, + "loss": 0.1481, + "step": 3554 + }, + { + "epoch": 1.6864326375711576, + "grad_norm": 1.5002373456954956, + "learning_rate": 1.2620722682237575e-06, + "loss": 0.1237, + "step": 3555 + }, + { + "epoch": 1.6869070208728654, + "grad_norm": 1.692552924156189, + "learning_rate": 1.258338599905482e-06, + "loss": 0.176, + "step": 3556 + }, + { + "epoch": 1.6873814041745732, + "grad_norm": 1.6304970979690552, + "learning_rate": 1.2546100917034322e-06, + "loss": 0.1477, + "step": 3557 + }, + { + "epoch": 1.687855787476281, + "grad_norm": 1.4163672924041748, + "learning_rate": 1.2508867458185037e-06, + "loss": 0.111, + "step": 3558 + }, + { + "epoch": 1.6883301707779887, + "grad_norm": 1.4730753898620605, + "learning_rate": 1.2471685644485543e-06, + "loss": 0.1274, + "step": 3559 + }, + { + "epoch": 1.6888045540796965, + "grad_norm": 1.3261274099349976, + "learning_rate": 1.2434555497883872e-06, + "loss": 0.1217, + "step": 3560 + }, + { + "epoch": 1.6892789373814043, + "grad_norm": 1.5221738815307617, + "learning_rate": 1.239747704029758e-06, + "loss": 0.1276, + "step": 3561 + }, + { + "epoch": 1.689753320683112, + "grad_norm": 1.6323662996292114, + "learning_rate": 1.2360450293613757e-06, + "loss": 0.1552, + "step": 3562 + }, + { + "epoch": 1.6902277039848197, + "grad_norm": 1.506781816482544, + "learning_rate": 1.2323475279688869e-06, + "loss": 0.1304, + "step": 3563 + }, + { + "epoch": 1.6907020872865275, + "grad_norm": 1.473595380783081, + "learning_rate": 1.228655202034893e-06, + "loss": 0.1362, + "step": 3564 + }, + { + "epoch": 1.6911764705882353, + "grad_norm": 2.067612409591675, + "learning_rate": 1.2249680537389375e-06, + "loss": 0.1776, + "step": 3565 + }, + { + "epoch": 1.691650853889943, + "grad_norm": 2.5935635566711426, + "learning_rate": 1.2212860852575093e-06, + "loss": 0.1547, + "step": 3566 + }, + { + "epoch": 1.6921252371916509, + "grad_norm": 1.8970015048980713, + "learning_rate": 1.217609298764033e-06, + "loss": 0.1734, + "step": 3567 + }, + { + "epoch": 1.6925996204933587, + "grad_norm": 1.6494086980819702, + "learning_rate": 1.213937696428885e-06, + "loss": 0.1458, + "step": 3568 + }, + { + "epoch": 1.6930740037950665, + "grad_norm": 2.007638454437256, + "learning_rate": 1.2102712804193705e-06, + "loss": 0.2278, + "step": 3569 + }, + { + "epoch": 1.6935483870967742, + "grad_norm": 1.6985396146774292, + "learning_rate": 1.2066100528997415e-06, + "loss": 0.169, + "step": 3570 + }, + { + "epoch": 1.6940227703984818, + "grad_norm": 2.0554983615875244, + "learning_rate": 1.2029540160311859e-06, + "loss": 0.1328, + "step": 3571 + }, + { + "epoch": 1.6944971537001896, + "grad_norm": 1.8407206535339355, + "learning_rate": 1.1993031719718217e-06, + "loss": 0.1437, + "step": 3572 + }, + { + "epoch": 1.6949715370018974, + "grad_norm": 1.4634854793548584, + "learning_rate": 1.1956575228767087e-06, + "loss": 0.1477, + "step": 3573 + }, + { + "epoch": 1.6954459203036052, + "grad_norm": 1.489542007446289, + "learning_rate": 1.1920170708978374e-06, + "loss": 0.1353, + "step": 3574 + }, + { + "epoch": 1.695920303605313, + "grad_norm": 1.7846124172210693, + "learning_rate": 1.1883818181841323e-06, + "loss": 0.1611, + "step": 3575 + }, + { + "epoch": 1.6963946869070208, + "grad_norm": 1.4704738855361938, + "learning_rate": 1.1847517668814456e-06, + "loss": 0.1405, + "step": 3576 + }, + { + "epoch": 1.6968690702087286, + "grad_norm": 1.5131449699401855, + "learning_rate": 1.181126919132557e-06, + "loss": 0.1282, + "step": 3577 + }, + { + "epoch": 1.6973434535104364, + "grad_norm": 1.477185845375061, + "learning_rate": 1.1775072770771833e-06, + "loss": 0.1477, + "step": 3578 + }, + { + "epoch": 1.6978178368121442, + "grad_norm": 1.6830008029937744, + "learning_rate": 1.1738928428519603e-06, + "loss": 0.1419, + "step": 3579 + }, + { + "epoch": 1.698292220113852, + "grad_norm": 1.5827606916427612, + "learning_rate": 1.1702836185904543e-06, + "loss": 0.1486, + "step": 3580 + }, + { + "epoch": 1.6987666034155597, + "grad_norm": 1.622440218925476, + "learning_rate": 1.1666796064231566e-06, + "loss": 0.1392, + "step": 3581 + }, + { + "epoch": 1.6992409867172675, + "grad_norm": 1.690509557723999, + "learning_rate": 1.1630808084774758e-06, + "loss": 0.1346, + "step": 3582 + }, + { + "epoch": 1.6997153700189753, + "grad_norm": 2.058816432952881, + "learning_rate": 1.1594872268777513e-06, + "loss": 0.2435, + "step": 3583 + }, + { + "epoch": 1.7001897533206831, + "grad_norm": 1.905711054801941, + "learning_rate": 1.155898863745234e-06, + "loss": 0.1724, + "step": 3584 + }, + { + "epoch": 1.700664136622391, + "grad_norm": 1.4442384243011475, + "learning_rate": 1.1523157211981006e-06, + "loss": 0.1424, + "step": 3585 + }, + { + "epoch": 1.7011385199240987, + "grad_norm": 1.9293603897094727, + "learning_rate": 1.1487378013514483e-06, + "loss": 0.1443, + "step": 3586 + }, + { + "epoch": 1.7016129032258065, + "grad_norm": 1.8342009782791138, + "learning_rate": 1.145165106317282e-06, + "loss": 0.195, + "step": 3587 + }, + { + "epoch": 1.7020872865275143, + "grad_norm": 1.8110581636428833, + "learning_rate": 1.1415976382045313e-06, + "loss": 0.1736, + "step": 3588 + }, + { + "epoch": 1.702561669829222, + "grad_norm": 1.952837347984314, + "learning_rate": 1.1380353991190373e-06, + "loss": 0.1521, + "step": 3589 + }, + { + "epoch": 1.7030360531309299, + "grad_norm": 2.2737579345703125, + "learning_rate": 1.1344783911635538e-06, + "loss": 0.1737, + "step": 3590 + }, + { + "epoch": 1.7035104364326377, + "grad_norm": 1.83051335811615, + "learning_rate": 1.130926616437751e-06, + "loss": 0.1311, + "step": 3591 + }, + { + "epoch": 1.7039848197343455, + "grad_norm": 1.8009090423583984, + "learning_rate": 1.1273800770382027e-06, + "loss": 0.1798, + "step": 3592 + }, + { + "epoch": 1.7044592030360532, + "grad_norm": 1.824048399925232, + "learning_rate": 1.1238387750583945e-06, + "loss": 0.1588, + "step": 3593 + }, + { + "epoch": 1.704933586337761, + "grad_norm": 1.5769997835159302, + "learning_rate": 1.1203027125887235e-06, + "loss": 0.1491, + "step": 3594 + }, + { + "epoch": 1.7054079696394688, + "grad_norm": 1.6240620613098145, + "learning_rate": 1.1167718917164961e-06, + "loss": 0.1457, + "step": 3595 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 1.4127658605575562, + "learning_rate": 1.1132463145259144e-06, + "loss": 0.124, + "step": 3596 + }, + { + "epoch": 1.7063567362428842, + "grad_norm": 1.866065502166748, + "learning_rate": 1.1097259830980956e-06, + "loss": 0.1289, + "step": 3597 + }, + { + "epoch": 1.706831119544592, + "grad_norm": 2.1026611328125, + "learning_rate": 1.1062108995110566e-06, + "loss": 0.182, + "step": 3598 + }, + { + "epoch": 1.7073055028462998, + "grad_norm": 1.580645203590393, + "learning_rate": 1.1027010658397175e-06, + "loss": 0.1433, + "step": 3599 + }, + { + "epoch": 1.7077798861480076, + "grad_norm": 1.5717014074325562, + "learning_rate": 1.0991964841558955e-06, + "loss": 0.151, + "step": 3600 + }, + { + "epoch": 1.7082542694497154, + "grad_norm": 1.5343966484069824, + "learning_rate": 1.0956971565283114e-06, + "loss": 0.1533, + "step": 3601 + }, + { + "epoch": 1.7087286527514232, + "grad_norm": 1.4576107263565063, + "learning_rate": 1.092203085022583e-06, + "loss": 0.1411, + "step": 3602 + }, + { + "epoch": 1.709203036053131, + "grad_norm": 1.7388747930526733, + "learning_rate": 1.0887142717012266e-06, + "loss": 0.1765, + "step": 3603 + }, + { + "epoch": 1.7096774193548387, + "grad_norm": 1.668588638305664, + "learning_rate": 1.0852307186236554e-06, + "loss": 0.1562, + "step": 3604 + }, + { + "epoch": 1.7101518026565465, + "grad_norm": 1.5516544580459595, + "learning_rate": 1.0817524278461777e-06, + "loss": 0.1373, + "step": 3605 + }, + { + "epoch": 1.710626185958254, + "grad_norm": 1.614884614944458, + "learning_rate": 1.07827940142199e-06, + "loss": 0.1378, + "step": 3606 + }, + { + "epoch": 1.711100569259962, + "grad_norm": 1.7359789609909058, + "learning_rate": 1.074811641401189e-06, + "loss": 0.1462, + "step": 3607 + }, + { + "epoch": 1.7115749525616697, + "grad_norm": 1.587064504623413, + "learning_rate": 1.071349149830756e-06, + "loss": 0.1173, + "step": 3608 + }, + { + "epoch": 1.7120493358633775, + "grad_norm": 1.8435542583465576, + "learning_rate": 1.0678919287545663e-06, + "loss": 0.1849, + "step": 3609 + }, + { + "epoch": 1.7125237191650853, + "grad_norm": 1.614603042602539, + "learning_rate": 1.0644399802133876e-06, + "loss": 0.1441, + "step": 3610 + }, + { + "epoch": 1.712998102466793, + "grad_norm": 1.7255760431289673, + "learning_rate": 1.060993306244864e-06, + "loss": 0.1506, + "step": 3611 + }, + { + "epoch": 1.7134724857685009, + "grad_norm": 2.0267391204833984, + "learning_rate": 1.0575519088835374e-06, + "loss": 0.1878, + "step": 3612 + }, + { + "epoch": 1.7139468690702087, + "grad_norm": 1.741645336151123, + "learning_rate": 1.054115790160829e-06, + "loss": 0.1536, + "step": 3613 + }, + { + "epoch": 1.7144212523719164, + "grad_norm": 1.8468014001846313, + "learning_rate": 1.0506849521050478e-06, + "loss": 0.1775, + "step": 3614 + }, + { + "epoch": 1.7148956356736242, + "grad_norm": 2.1939375400543213, + "learning_rate": 1.0472593967413813e-06, + "loss": 0.2245, + "step": 3615 + }, + { + "epoch": 1.715370018975332, + "grad_norm": 1.7620271444320679, + "learning_rate": 1.0438391260919034e-06, + "loss": 0.1369, + "step": 3616 + }, + { + "epoch": 1.7158444022770398, + "grad_norm": 1.5060416460037231, + "learning_rate": 1.0404241421755623e-06, + "loss": 0.1334, + "step": 3617 + }, + { + "epoch": 1.7163187855787476, + "grad_norm": 1.5071996450424194, + "learning_rate": 1.037014447008191e-06, + "loss": 0.1385, + "step": 3618 + }, + { + "epoch": 1.7167931688804554, + "grad_norm": 1.741341471672058, + "learning_rate": 1.0336100426025008e-06, + "loss": 0.1588, + "step": 3619 + }, + { + "epoch": 1.7172675521821632, + "grad_norm": 1.5700851678848267, + "learning_rate": 1.0302109309680752e-06, + "loss": 0.1558, + "step": 3620 + }, + { + "epoch": 1.717741935483871, + "grad_norm": 1.5904638767242432, + "learning_rate": 1.0268171141113769e-06, + "loss": 0.1772, + "step": 3621 + }, + { + "epoch": 1.7182163187855788, + "grad_norm": 1.7748823165893555, + "learning_rate": 1.0234285940357424e-06, + "loss": 0.1702, + "step": 3622 + }, + { + "epoch": 1.7186907020872866, + "grad_norm": 1.6879323720932007, + "learning_rate": 1.020045372741384e-06, + "loss": 0.1319, + "step": 3623 + }, + { + "epoch": 1.7191650853889944, + "grad_norm": 1.3660778999328613, + "learning_rate": 1.0166674522253817e-06, + "loss": 0.1377, + "step": 3624 + }, + { + "epoch": 1.7196394686907022, + "grad_norm": 1.907179594039917, + "learning_rate": 1.0132948344816863e-06, + "loss": 0.1655, + "step": 3625 + }, + { + "epoch": 1.72011385199241, + "grad_norm": 1.814266562461853, + "learning_rate": 1.0099275215011227e-06, + "loss": 0.1645, + "step": 3626 + }, + { + "epoch": 1.7205882352941178, + "grad_norm": 1.505477786064148, + "learning_rate": 1.0065655152713828e-06, + "loss": 0.1253, + "step": 3627 + }, + { + "epoch": 1.7210626185958255, + "grad_norm": 2.314615249633789, + "learning_rate": 1.003208817777025e-06, + "loss": 0.1994, + "step": 3628 + }, + { + "epoch": 1.7215370018975333, + "grad_norm": 1.3844680786132812, + "learning_rate": 9.99857430999478e-07, + "loss": 0.1154, + "step": 3629 + }, + { + "epoch": 1.7220113851992411, + "grad_norm": 1.7867406606674194, + "learning_rate": 9.965113569170258e-07, + "loss": 0.1278, + "step": 3630 + }, + { + "epoch": 1.722485768500949, + "grad_norm": 1.7978324890136719, + "learning_rate": 9.931705975048279e-07, + "loss": 0.1922, + "step": 3631 + }, + { + "epoch": 1.7229601518026565, + "grad_norm": 1.2707836627960205, + "learning_rate": 9.898351547349005e-07, + "loss": 0.1291, + "step": 3632 + }, + { + "epoch": 1.7234345351043643, + "grad_norm": 1.5729351043701172, + "learning_rate": 9.865050305761226e-07, + "loss": 0.1654, + "step": 3633 + }, + { + "epoch": 1.723908918406072, + "grad_norm": 1.5746902227401733, + "learning_rate": 9.831802269942304e-07, + "loss": 0.1423, + "step": 3634 + }, + { + "epoch": 1.7243833017077799, + "grad_norm": 1.864458680152893, + "learning_rate": 9.79860745951825e-07, + "loss": 0.1778, + "step": 3635 + }, + { + "epoch": 1.7248576850094877, + "grad_norm": 1.7163097858428955, + "learning_rate": 9.765465894083637e-07, + "loss": 0.1507, + "step": 3636 + }, + { + "epoch": 1.7253320683111955, + "grad_norm": 1.3414804935455322, + "learning_rate": 9.73237759320159e-07, + "loss": 0.1321, + "step": 3637 + }, + { + "epoch": 1.7258064516129032, + "grad_norm": 2.13075852394104, + "learning_rate": 9.699342576403847e-07, + "loss": 0.1707, + "step": 3638 + }, + { + "epoch": 1.726280834914611, + "grad_norm": 1.949949860572815, + "learning_rate": 9.666360863190583e-07, + "loss": 0.1592, + "step": 3639 + }, + { + "epoch": 1.7267552182163188, + "grad_norm": 9.197535514831543, + "learning_rate": 9.633432473030635e-07, + "loss": 0.1525, + "step": 3640 + }, + { + "epoch": 1.7272296015180264, + "grad_norm": 1.6272482872009277, + "learning_rate": 9.600557425361269e-07, + "loss": 0.1586, + "step": 3641 + }, + { + "epoch": 1.7277039848197342, + "grad_norm": 1.5397499799728394, + "learning_rate": 9.567735739588314e-07, + "loss": 0.1292, + "step": 3642 + }, + { + "epoch": 1.728178368121442, + "grad_norm": 1.5450812578201294, + "learning_rate": 9.534967435086095e-07, + "loss": 0.1174, + "step": 3643 + }, + { + "epoch": 1.7286527514231498, + "grad_norm": 1.3916116952896118, + "learning_rate": 9.502252531197398e-07, + "loss": 0.1326, + "step": 3644 + }, + { + "epoch": 1.7291271347248576, + "grad_norm": 1.4877862930297852, + "learning_rate": 9.469591047233517e-07, + "loss": 0.1328, + "step": 3645 + }, + { + "epoch": 1.7296015180265654, + "grad_norm": 1.4843299388885498, + "learning_rate": 9.436983002474209e-07, + "loss": 0.1508, + "step": 3646 + }, + { + "epoch": 1.7300759013282732, + "grad_norm": 1.532395601272583, + "learning_rate": 9.404428416167688e-07, + "loss": 0.1403, + "step": 3647 + }, + { + "epoch": 1.730550284629981, + "grad_norm": 2.052851915359497, + "learning_rate": 9.371927307530593e-07, + "loss": 0.1483, + "step": 3648 + }, + { + "epoch": 1.7310246679316887, + "grad_norm": 1.7346041202545166, + "learning_rate": 9.339479695747988e-07, + "loss": 0.1573, + "step": 3649 + }, + { + "epoch": 1.7314990512333965, + "grad_norm": 1.8105442523956299, + "learning_rate": 9.307085599973387e-07, + "loss": 0.1597, + "step": 3650 + }, + { + "epoch": 1.7319734345351043, + "grad_norm": 2.083927631378174, + "learning_rate": 9.274745039328725e-07, + "loss": 0.164, + "step": 3651 + }, + { + "epoch": 1.7324478178368121, + "grad_norm": 1.740408182144165, + "learning_rate": 9.242458032904311e-07, + "loss": 0.1351, + "step": 3652 + }, + { + "epoch": 1.73292220113852, + "grad_norm": 1.4160617589950562, + "learning_rate": 9.210224599758811e-07, + "loss": 0.1228, + "step": 3653 + }, + { + "epoch": 1.7333965844402277, + "grad_norm": 1.3447182178497314, + "learning_rate": 9.178044758919336e-07, + "loss": 0.1153, + "step": 3654 + }, + { + "epoch": 1.7338709677419355, + "grad_norm": 2.0321731567382812, + "learning_rate": 9.145918529381314e-07, + "loss": 0.1511, + "step": 3655 + }, + { + "epoch": 1.7343453510436433, + "grad_norm": 1.9544363021850586, + "learning_rate": 9.113845930108567e-07, + "loss": 0.1599, + "step": 3656 + }, + { + "epoch": 1.734819734345351, + "grad_norm": 1.5864086151123047, + "learning_rate": 9.081826980033215e-07, + "loss": 0.1562, + "step": 3657 + }, + { + "epoch": 1.7352941176470589, + "grad_norm": 1.6874994039535522, + "learning_rate": 9.049861698055696e-07, + "loss": 0.1619, + "step": 3658 + }, + { + "epoch": 1.7357685009487667, + "grad_norm": 1.6411479711532593, + "learning_rate": 9.017950103044826e-07, + "loss": 0.1461, + "step": 3659 + }, + { + "epoch": 1.7362428842504745, + "grad_norm": 1.355292797088623, + "learning_rate": 8.986092213837705e-07, + "loss": 0.0838, + "step": 3660 + }, + { + "epoch": 1.7367172675521823, + "grad_norm": 2.7029168605804443, + "learning_rate": 8.954288049239734e-07, + "loss": 0.2356, + "step": 3661 + }, + { + "epoch": 1.73719165085389, + "grad_norm": 1.5269354581832886, + "learning_rate": 8.922537628024608e-07, + "loss": 0.1512, + "step": 3662 + }, + { + "epoch": 1.7376660341555978, + "grad_norm": 1.495399832725525, + "learning_rate": 8.890840968934244e-07, + "loss": 0.1317, + "step": 3663 + }, + { + "epoch": 1.7381404174573056, + "grad_norm": 1.3357588052749634, + "learning_rate": 8.859198090678923e-07, + "loss": 0.1108, + "step": 3664 + }, + { + "epoch": 1.7386148007590134, + "grad_norm": 1.5755971670150757, + "learning_rate": 8.827609011937066e-07, + "loss": 0.1393, + "step": 3665 + }, + { + "epoch": 1.7390891840607212, + "grad_norm": 1.437064528465271, + "learning_rate": 8.796073751355417e-07, + "loss": 0.117, + "step": 3666 + }, + { + "epoch": 1.739563567362429, + "grad_norm": 1.7386492490768433, + "learning_rate": 8.764592327548948e-07, + "loss": 0.16, + "step": 3667 + }, + { + "epoch": 1.7400379506641366, + "grad_norm": 1.099024772644043, + "learning_rate": 8.733164759100809e-07, + "loss": 0.0981, + "step": 3668 + }, + { + "epoch": 1.7405123339658444, + "grad_norm": 1.9566211700439453, + "learning_rate": 8.701791064562382e-07, + "loss": 0.157, + "step": 3669 + }, + { + "epoch": 1.7409867172675522, + "grad_norm": 1.458828091621399, + "learning_rate": 8.670471262453251e-07, + "loss": 0.1359, + "step": 3670 + }, + { + "epoch": 1.74146110056926, + "grad_norm": 1.4478492736816406, + "learning_rate": 8.639205371261217e-07, + "loss": 0.1664, + "step": 3671 + }, + { + "epoch": 1.7419354838709677, + "grad_norm": 1.6490832567214966, + "learning_rate": 8.607993409442173e-07, + "loss": 0.1427, + "step": 3672 + }, + { + "epoch": 1.7424098671726755, + "grad_norm": 3.0351619720458984, + "learning_rate": 8.57683539542028e-07, + "loss": 0.1973, + "step": 3673 + }, + { + "epoch": 1.7428842504743833, + "grad_norm": 1.4072052240371704, + "learning_rate": 8.54573134758776e-07, + "loss": 0.1351, + "step": 3674 + }, + { + "epoch": 1.7433586337760911, + "grad_norm": 1.6349687576293945, + "learning_rate": 8.514681284305048e-07, + "loss": 0.1405, + "step": 3675 + }, + { + "epoch": 1.7438330170777987, + "grad_norm": 1.8339117765426636, + "learning_rate": 8.483685223900706e-07, + "loss": 0.1787, + "step": 3676 + }, + { + "epoch": 1.7443074003795065, + "grad_norm": 1.4211890697479248, + "learning_rate": 8.452743184671363e-07, + "loss": 0.1482, + "step": 3677 + }, + { + "epoch": 1.7447817836812143, + "grad_norm": 1.6451423168182373, + "learning_rate": 8.421855184881822e-07, + "loss": 0.1287, + "step": 3678 + }, + { + "epoch": 1.745256166982922, + "grad_norm": 1.9902081489562988, + "learning_rate": 8.391021242764962e-07, + "loss": 0.1923, + "step": 3679 + }, + { + "epoch": 1.7457305502846299, + "grad_norm": 1.723975658416748, + "learning_rate": 8.360241376521772e-07, + "loss": 0.1539, + "step": 3680 + }, + { + "epoch": 1.7462049335863377, + "grad_norm": 1.7645349502563477, + "learning_rate": 8.329515604321281e-07, + "loss": 0.15, + "step": 3681 + }, + { + "epoch": 1.7466793168880455, + "grad_norm": 1.3427985906600952, + "learning_rate": 8.298843944300583e-07, + "loss": 0.113, + "step": 3682 + }, + { + "epoch": 1.7471537001897532, + "grad_norm": 1.576675295829773, + "learning_rate": 8.268226414564895e-07, + "loss": 0.1492, + "step": 3683 + }, + { + "epoch": 1.747628083491461, + "grad_norm": 1.534294605255127, + "learning_rate": 8.237663033187426e-07, + "loss": 0.135, + "step": 3684 + }, + { + "epoch": 1.7481024667931688, + "grad_norm": 1.7654008865356445, + "learning_rate": 8.207153818209446e-07, + "loss": 0.1498, + "step": 3685 + }, + { + "epoch": 1.7485768500948766, + "grad_norm": 1.6576253175735474, + "learning_rate": 8.176698787640247e-07, + "loss": 0.1322, + "step": 3686 + }, + { + "epoch": 1.7490512333965844, + "grad_norm": 1.7810031175613403, + "learning_rate": 8.146297959457116e-07, + "loss": 0.1576, + "step": 3687 + }, + { + "epoch": 1.7495256166982922, + "grad_norm": 1.609955906867981, + "learning_rate": 8.115951351605378e-07, + "loss": 0.1367, + "step": 3688 + }, + { + "epoch": 1.75, + "grad_norm": 1.7572312355041504, + "learning_rate": 8.085658981998312e-07, + "loss": 0.1378, + "step": 3689 + }, + { + "epoch": 1.7504743833017078, + "grad_norm": 1.3481042385101318, + "learning_rate": 8.055420868517227e-07, + "loss": 0.1182, + "step": 3690 + }, + { + "epoch": 1.7509487666034156, + "grad_norm": 2.354755401611328, + "learning_rate": 8.025237029011368e-07, + "loss": 0.201, + "step": 3691 + }, + { + "epoch": 1.7514231499051234, + "grad_norm": 1.7035918235778809, + "learning_rate": 7.995107481297948e-07, + "loss": 0.1286, + "step": 3692 + }, + { + "epoch": 1.7518975332068312, + "grad_norm": 1.6000009775161743, + "learning_rate": 7.965032243162163e-07, + "loss": 0.1307, + "step": 3693 + }, + { + "epoch": 1.752371916508539, + "grad_norm": 1.5351053476333618, + "learning_rate": 7.935011332357113e-07, + "loss": 0.1476, + "step": 3694 + }, + { + "epoch": 1.7528462998102468, + "grad_norm": 1.4815095663070679, + "learning_rate": 7.905044766603876e-07, + "loss": 0.1445, + "step": 3695 + }, + { + "epoch": 1.7533206831119545, + "grad_norm": 1.7617803812026978, + "learning_rate": 7.875132563591382e-07, + "loss": 0.153, + "step": 3696 + }, + { + "epoch": 1.7537950664136623, + "grad_norm": 1.4343780279159546, + "learning_rate": 7.845274740976527e-07, + "loss": 0.135, + "step": 3697 + }, + { + "epoch": 1.7542694497153701, + "grad_norm": 1.5575170516967773, + "learning_rate": 7.815471316384071e-07, + "loss": 0.1198, + "step": 3698 + }, + { + "epoch": 1.754743833017078, + "grad_norm": 1.9399551153182983, + "learning_rate": 7.785722307406685e-07, + "loss": 0.1777, + "step": 3699 + }, + { + "epoch": 1.7552182163187857, + "grad_norm": 1.8243434429168701, + "learning_rate": 7.756027731604943e-07, + "loss": 0.1456, + "step": 3700 + }, + { + "epoch": 1.7556925996204935, + "grad_norm": 1.7485800981521606, + "learning_rate": 7.726387606507224e-07, + "loss": 0.1182, + "step": 3701 + }, + { + "epoch": 1.7561669829222013, + "grad_norm": 1.8186720609664917, + "learning_rate": 7.696801949609811e-07, + "loss": 0.1679, + "step": 3702 + }, + { + "epoch": 1.7566413662239089, + "grad_norm": 2.1172068119049072, + "learning_rate": 7.667270778376834e-07, + "loss": 0.2089, + "step": 3703 + }, + { + "epoch": 1.7571157495256167, + "grad_norm": 1.8532425165176392, + "learning_rate": 7.637794110240259e-07, + "loss": 0.1866, + "step": 3704 + }, + { + "epoch": 1.7575901328273245, + "grad_norm": 1.3793625831604004, + "learning_rate": 7.608371962599847e-07, + "loss": 0.1241, + "step": 3705 + }, + { + "epoch": 1.7580645161290323, + "grad_norm": 1.5093717575073242, + "learning_rate": 7.579004352823205e-07, + "loss": 0.1404, + "step": 3706 + }, + { + "epoch": 1.75853889943074, + "grad_norm": 2.2254135608673096, + "learning_rate": 7.549691298245754e-07, + "loss": 0.1306, + "step": 3707 + }, + { + "epoch": 1.7590132827324478, + "grad_norm": 1.539953351020813, + "learning_rate": 7.520432816170686e-07, + "loss": 0.1244, + "step": 3708 + }, + { + "epoch": 1.7594876660341556, + "grad_norm": 1.3199529647827148, + "learning_rate": 7.491228923868999e-07, + "loss": 0.1135, + "step": 3709 + }, + { + "epoch": 1.7599620493358634, + "grad_norm": 1.6439963579177856, + "learning_rate": 7.462079638579489e-07, + "loss": 0.124, + "step": 3710 + }, + { + "epoch": 1.760436432637571, + "grad_norm": 1.6355787515640259, + "learning_rate": 7.432984977508639e-07, + "loss": 0.1471, + "step": 3711 + }, + { + "epoch": 1.7609108159392788, + "grad_norm": 1.7077136039733887, + "learning_rate": 7.4039449578308e-07, + "loss": 0.1542, + "step": 3712 + }, + { + "epoch": 1.7613851992409866, + "grad_norm": 1.7139383554458618, + "learning_rate": 7.374959596687948e-07, + "loss": 0.1534, + "step": 3713 + }, + { + "epoch": 1.7618595825426944, + "grad_norm": 1.5415153503417969, + "learning_rate": 7.346028911189895e-07, + "loss": 0.1537, + "step": 3714 + }, + { + "epoch": 1.7623339658444022, + "grad_norm": 1.2754297256469727, + "learning_rate": 7.317152918414116e-07, + "loss": 0.1296, + "step": 3715 + }, + { + "epoch": 1.76280834914611, + "grad_norm": 1.7097855806350708, + "learning_rate": 7.288331635405832e-07, + "loss": 0.1495, + "step": 3716 + }, + { + "epoch": 1.7632827324478177, + "grad_norm": 1.5699681043624878, + "learning_rate": 7.259565079177966e-07, + "loss": 0.1435, + "step": 3717 + }, + { + "epoch": 1.7637571157495255, + "grad_norm": 1.5883891582489014, + "learning_rate": 7.230853266711124e-07, + "loss": 0.142, + "step": 3718 + }, + { + "epoch": 1.7642314990512333, + "grad_norm": 1.6266002655029297, + "learning_rate": 7.202196214953616e-07, + "loss": 0.1283, + "step": 3719 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 1.50413978099823, + "learning_rate": 7.173593940821411e-07, + "loss": 0.1272, + "step": 3720 + }, + { + "epoch": 1.765180265654649, + "grad_norm": 1.3868768215179443, + "learning_rate": 7.145046461198146e-07, + "loss": 0.1253, + "step": 3721 + }, + { + "epoch": 1.7656546489563567, + "grad_norm": 1.3816770315170288, + "learning_rate": 7.11655379293511e-07, + "loss": 0.1432, + "step": 3722 + }, + { + "epoch": 1.7661290322580645, + "grad_norm": 1.8432587385177612, + "learning_rate": 7.088115952851238e-07, + "loss": 0.1617, + "step": 3723 + }, + { + "epoch": 1.7666034155597723, + "grad_norm": 1.6249598264694214, + "learning_rate": 7.059732957733145e-07, + "loss": 0.1462, + "step": 3724 + }, + { + "epoch": 1.76707779886148, + "grad_norm": 1.4073657989501953, + "learning_rate": 7.031404824334986e-07, + "loss": 0.126, + "step": 3725 + }, + { + "epoch": 1.7675521821631879, + "grad_norm": 1.3608930110931396, + "learning_rate": 7.003131569378586e-07, + "loss": 0.1174, + "step": 3726 + }, + { + "epoch": 1.7680265654648957, + "grad_norm": 1.6498671770095825, + "learning_rate": 6.97491320955338e-07, + "loss": 0.1563, + "step": 3727 + }, + { + "epoch": 1.7685009487666035, + "grad_norm": 1.5158390998840332, + "learning_rate": 6.946749761516402e-07, + "loss": 0.1455, + "step": 3728 + }, + { + "epoch": 1.7689753320683113, + "grad_norm": 1.5832974910736084, + "learning_rate": 6.918641241892243e-07, + "loss": 0.1263, + "step": 3729 + }, + { + "epoch": 1.769449715370019, + "grad_norm": 1.628330945968628, + "learning_rate": 6.890587667273064e-07, + "loss": 0.1484, + "step": 3730 + }, + { + "epoch": 1.7699240986717268, + "grad_norm": 1.4703549146652222, + "learning_rate": 6.862589054218616e-07, + "loss": 0.1334, + "step": 3731 + }, + { + "epoch": 1.7703984819734346, + "grad_norm": 1.8686482906341553, + "learning_rate": 6.834645419256225e-07, + "loss": 0.1633, + "step": 3732 + }, + { + "epoch": 1.7708728652751424, + "grad_norm": 1.9761866331100464, + "learning_rate": 6.806756778880752e-07, + "loss": 0.1436, + "step": 3733 + }, + { + "epoch": 1.7713472485768502, + "grad_norm": 1.775627851486206, + "learning_rate": 6.77892314955454e-07, + "loss": 0.1542, + "step": 3734 + }, + { + "epoch": 1.771821631878558, + "grad_norm": 1.536481261253357, + "learning_rate": 6.751144547707533e-07, + "loss": 0.1278, + "step": 3735 + }, + { + "epoch": 1.7722960151802658, + "grad_norm": 2.131673812866211, + "learning_rate": 6.723420989737184e-07, + "loss": 0.1947, + "step": 3736 + }, + { + "epoch": 1.7727703984819736, + "grad_norm": 1.6633423566818237, + "learning_rate": 6.695752492008389e-07, + "loss": 0.1661, + "step": 3737 + }, + { + "epoch": 1.7732447817836812, + "grad_norm": 1.718865990638733, + "learning_rate": 6.66813907085363e-07, + "loss": 0.1533, + "step": 3738 + }, + { + "epoch": 1.773719165085389, + "grad_norm": 1.7374719381332397, + "learning_rate": 6.64058074257281e-07, + "loss": 0.1821, + "step": 3739 + }, + { + "epoch": 1.7741935483870968, + "grad_norm": 1.5633835792541504, + "learning_rate": 6.613077523433342e-07, + "loss": 0.1299, + "step": 3740 + }, + { + "epoch": 1.7746679316888045, + "grad_norm": 1.8587796688079834, + "learning_rate": 6.585629429670115e-07, + "loss": 0.1701, + "step": 3741 + }, + { + "epoch": 1.7751423149905123, + "grad_norm": 1.9442617893218994, + "learning_rate": 6.558236477485458e-07, + "loss": 0.1211, + "step": 3742 + }, + { + "epoch": 1.7756166982922201, + "grad_norm": 1.8159979581832886, + "learning_rate": 6.53089868304917e-07, + "loss": 0.1365, + "step": 3743 + }, + { + "epoch": 1.776091081593928, + "grad_norm": 1.4900180101394653, + "learning_rate": 6.503616062498464e-07, + "loss": 0.1381, + "step": 3744 + }, + { + "epoch": 1.7765654648956357, + "grad_norm": 1.4936360120773315, + "learning_rate": 6.476388631938024e-07, + "loss": 0.1397, + "step": 3745 + }, + { + "epoch": 1.7770398481973435, + "grad_norm": 1.368524432182312, + "learning_rate": 6.449216407439906e-07, + "loss": 0.1033, + "step": 3746 + }, + { + "epoch": 1.777514231499051, + "grad_norm": 1.771744728088379, + "learning_rate": 6.422099405043613e-07, + "loss": 0.1609, + "step": 3747 + }, + { + "epoch": 1.7779886148007589, + "grad_norm": 1.7894208431243896, + "learning_rate": 6.395037640756074e-07, + "loss": 0.1453, + "step": 3748 + }, + { + "epoch": 1.7784629981024667, + "grad_norm": 1.6382660865783691, + "learning_rate": 6.368031130551533e-07, + "loss": 0.1232, + "step": 3749 + }, + { + "epoch": 1.7789373814041745, + "grad_norm": 1.9367256164550781, + "learning_rate": 6.341079890371682e-07, + "loss": 0.183, + "step": 3750 + }, + { + "epoch": 1.7794117647058822, + "grad_norm": 1.570556640625, + "learning_rate": 6.314183936125584e-07, + "loss": 0.1278, + "step": 3751 + }, + { + "epoch": 1.77988614800759, + "grad_norm": 1.5493059158325195, + "learning_rate": 6.287343283689662e-07, + "loss": 0.1329, + "step": 3752 + }, + { + "epoch": 1.7803605313092978, + "grad_norm": 1.3096294403076172, + "learning_rate": 6.260557948907664e-07, + "loss": 0.1195, + "step": 3753 + }, + { + "epoch": 1.7808349146110056, + "grad_norm": 2.0561397075653076, + "learning_rate": 6.233827947590709e-07, + "loss": 0.1499, + "step": 3754 + }, + { + "epoch": 1.7813092979127134, + "grad_norm": 1.4813035726547241, + "learning_rate": 6.207153295517265e-07, + "loss": 0.1271, + "step": 3755 + }, + { + "epoch": 1.7817836812144212, + "grad_norm": 1.5294699668884277, + "learning_rate": 6.180534008433114e-07, + "loss": 0.1312, + "step": 3756 + }, + { + "epoch": 1.782258064516129, + "grad_norm": 1.5780595541000366, + "learning_rate": 6.153970102051354e-07, + "loss": 0.1269, + "step": 3757 + }, + { + "epoch": 1.7827324478178368, + "grad_norm": 1.5694869756698608, + "learning_rate": 6.127461592052397e-07, + "loss": 0.1167, + "step": 3758 + }, + { + "epoch": 1.7832068311195446, + "grad_norm": 1.5196350812911987, + "learning_rate": 6.101008494083948e-07, + "loss": 0.1269, + "step": 3759 + }, + { + "epoch": 1.7836812144212524, + "grad_norm": 1.7971737384796143, + "learning_rate": 6.074610823761029e-07, + "loss": 0.1443, + "step": 3760 + }, + { + "epoch": 1.7841555977229602, + "grad_norm": 1.4723081588745117, + "learning_rate": 6.048268596665896e-07, + "loss": 0.1194, + "step": 3761 + }, + { + "epoch": 1.784629981024668, + "grad_norm": 1.2657811641693115, + "learning_rate": 6.021981828348123e-07, + "loss": 0.1006, + "step": 3762 + }, + { + "epoch": 1.7851043643263758, + "grad_norm": 1.2274057865142822, + "learning_rate": 5.995750534324518e-07, + "loss": 0.1081, + "step": 3763 + }, + { + "epoch": 1.7855787476280836, + "grad_norm": 1.5427498817443848, + "learning_rate": 5.969574730079164e-07, + "loss": 0.1491, + "step": 3764 + }, + { + "epoch": 1.7860531309297913, + "grad_norm": 1.7528079748153687, + "learning_rate": 5.943454431063367e-07, + "loss": 0.1405, + "step": 3765 + }, + { + "epoch": 1.7865275142314991, + "grad_norm": 1.619002342224121, + "learning_rate": 5.917389652695693e-07, + "loss": 0.1516, + "step": 3766 + }, + { + "epoch": 1.787001897533207, + "grad_norm": 2.0210466384887695, + "learning_rate": 5.891380410361947e-07, + "loss": 0.1638, + "step": 3767 + }, + { + "epoch": 1.7874762808349147, + "grad_norm": 1.4195687770843506, + "learning_rate": 5.865426719415068e-07, + "loss": 0.1199, + "step": 3768 + }, + { + "epoch": 1.7879506641366225, + "grad_norm": 1.5800368785858154, + "learning_rate": 5.839528595175314e-07, + "loss": 0.1301, + "step": 3769 + }, + { + "epoch": 1.7884250474383303, + "grad_norm": 1.5409759283065796, + "learning_rate": 5.813686052930068e-07, + "loss": 0.1405, + "step": 3770 + }, + { + "epoch": 1.788899430740038, + "grad_norm": 1.754284143447876, + "learning_rate": 5.787899107933936e-07, + "loss": 0.1502, + "step": 3771 + }, + { + "epoch": 1.789373814041746, + "grad_norm": 1.586987853050232, + "learning_rate": 5.762167775408678e-07, + "loss": 0.1244, + "step": 3772 + }, + { + "epoch": 1.7898481973434535, + "grad_norm": 1.611585021018982, + "learning_rate": 5.736492070543265e-07, + "loss": 0.1423, + "step": 3773 + }, + { + "epoch": 1.7903225806451613, + "grad_norm": 1.8729734420776367, + "learning_rate": 5.710872008493795e-07, + "loss": 0.1478, + "step": 3774 + }, + { + "epoch": 1.790796963946869, + "grad_norm": 1.65080988407135, + "learning_rate": 5.685307604383561e-07, + "loss": 0.1352, + "step": 3775 + }, + { + "epoch": 1.7912713472485768, + "grad_norm": 1.5004171133041382, + "learning_rate": 5.659798873302968e-07, + "loss": 0.1132, + "step": 3776 + }, + { + "epoch": 1.7917457305502846, + "grad_norm": 1.4567018747329712, + "learning_rate": 5.634345830309563e-07, + "loss": 0.1126, + "step": 3777 + }, + { + "epoch": 1.7922201138519924, + "grad_norm": 1.566493034362793, + "learning_rate": 5.608948490428023e-07, + "loss": 0.1527, + "step": 3778 + }, + { + "epoch": 1.7926944971537002, + "grad_norm": 1.4656519889831543, + "learning_rate": 5.583606868650138e-07, + "loss": 0.1614, + "step": 3779 + }, + { + "epoch": 1.793168880455408, + "grad_norm": 1.7221026420593262, + "learning_rate": 5.558320979934839e-07, + "loss": 0.1515, + "step": 3780 + }, + { + "epoch": 1.7936432637571158, + "grad_norm": 1.484522819519043, + "learning_rate": 5.533090839208133e-07, + "loss": 0.1394, + "step": 3781 + }, + { + "epoch": 1.7941176470588234, + "grad_norm": 1.717825174331665, + "learning_rate": 5.507916461363094e-07, + "loss": 0.1472, + "step": 3782 + }, + { + "epoch": 1.7945920303605312, + "grad_norm": 1.826720952987671, + "learning_rate": 5.482797861259937e-07, + "loss": 0.1465, + "step": 3783 + }, + { + "epoch": 1.795066413662239, + "grad_norm": 1.4461770057678223, + "learning_rate": 5.45773505372591e-07, + "loss": 0.1204, + "step": 3784 + }, + { + "epoch": 1.7955407969639468, + "grad_norm": 1.7469446659088135, + "learning_rate": 5.43272805355537e-07, + "loss": 0.1501, + "step": 3785 + }, + { + "epoch": 1.7960151802656545, + "grad_norm": 1.5683156251907349, + "learning_rate": 5.407776875509663e-07, + "loss": 0.1272, + "step": 3786 + }, + { + "epoch": 1.7964895635673623, + "grad_norm": 1.7831333875656128, + "learning_rate": 5.382881534317231e-07, + "loss": 0.1567, + "step": 3787 + }, + { + "epoch": 1.7969639468690701, + "grad_norm": 1.6136335134506226, + "learning_rate": 5.35804204467355e-07, + "loss": 0.1244, + "step": 3788 + }, + { + "epoch": 1.797438330170778, + "grad_norm": 2.210510015487671, + "learning_rate": 5.333258421241127e-07, + "loss": 0.1729, + "step": 3789 + }, + { + "epoch": 1.7979127134724857, + "grad_norm": 1.5324962139129639, + "learning_rate": 5.308530678649504e-07, + "loss": 0.1455, + "step": 3790 + }, + { + "epoch": 1.7983870967741935, + "grad_norm": 2.263535737991333, + "learning_rate": 5.283858831495192e-07, + "loss": 0.2427, + "step": 3791 + }, + { + "epoch": 1.7988614800759013, + "grad_norm": 1.561944603919983, + "learning_rate": 5.259242894341765e-07, + "loss": 0.1292, + "step": 3792 + }, + { + "epoch": 1.799335863377609, + "grad_norm": 1.5755988359451294, + "learning_rate": 5.234682881719766e-07, + "loss": 0.1342, + "step": 3793 + }, + { + "epoch": 1.7998102466793169, + "grad_norm": 1.7627415657043457, + "learning_rate": 5.210178808126698e-07, + "loss": 0.1818, + "step": 3794 + }, + { + "epoch": 1.8002846299810247, + "grad_norm": 1.6085638999938965, + "learning_rate": 5.185730688027124e-07, + "loss": 0.166, + "step": 3795 + }, + { + "epoch": 1.8007590132827325, + "grad_norm": 1.764938473701477, + "learning_rate": 5.161338535852467e-07, + "loss": 0.204, + "step": 3796 + }, + { + "epoch": 1.8012333965844403, + "grad_norm": 1.4336494207382202, + "learning_rate": 5.137002366001209e-07, + "loss": 0.1188, + "step": 3797 + }, + { + "epoch": 1.801707779886148, + "grad_norm": 1.5212832689285278, + "learning_rate": 5.112722192838748e-07, + "loss": 0.1357, + "step": 3798 + }, + { + "epoch": 1.8021821631878558, + "grad_norm": 1.5787197351455688, + "learning_rate": 5.08849803069743e-07, + "loss": 0.1273, + "step": 3799 + }, + { + "epoch": 1.8026565464895636, + "grad_norm": 1.7871791124343872, + "learning_rate": 5.064329893876541e-07, + "loss": 0.1423, + "step": 3800 + }, + { + "epoch": 1.8031309297912714, + "grad_norm": 1.4555021524429321, + "learning_rate": 5.0402177966423e-07, + "loss": 0.1356, + "step": 3801 + }, + { + "epoch": 1.8036053130929792, + "grad_norm": 1.40778648853302, + "learning_rate": 5.016161753227799e-07, + "loss": 0.1366, + "step": 3802 + }, + { + "epoch": 1.804079696394687, + "grad_norm": 1.7542588710784912, + "learning_rate": 4.992161777833116e-07, + "loss": 0.1631, + "step": 3803 + }, + { + "epoch": 1.8045540796963948, + "grad_norm": 1.4451662302017212, + "learning_rate": 4.968217884625182e-07, + "loss": 0.1096, + "step": 3804 + }, + { + "epoch": 1.8050284629981026, + "grad_norm": 1.571986198425293, + "learning_rate": 4.944330087737881e-07, + "loss": 0.1449, + "step": 3805 + }, + { + "epoch": 1.8055028462998104, + "grad_norm": 1.793839693069458, + "learning_rate": 4.92049840127189e-07, + "loss": 0.1494, + "step": 3806 + }, + { + "epoch": 1.8059772296015182, + "grad_norm": 1.783685564994812, + "learning_rate": 4.896722839294843e-07, + "loss": 0.1737, + "step": 3807 + }, + { + "epoch": 1.8064516129032258, + "grad_norm": 1.5244357585906982, + "learning_rate": 4.873003415841215e-07, + "loss": 0.1348, + "step": 3808 + }, + { + "epoch": 1.8069259962049335, + "grad_norm": 1.776548147201538, + "learning_rate": 4.849340144912363e-07, + "loss": 0.1773, + "step": 3809 + }, + { + "epoch": 1.8074003795066413, + "grad_norm": 1.6930649280548096, + "learning_rate": 4.825733040476465e-07, + "loss": 0.1131, + "step": 3810 + }, + { + "epoch": 1.8078747628083491, + "grad_norm": 1.6838048696517944, + "learning_rate": 4.802182116468556e-07, + "loss": 0.1474, + "step": 3811 + }, + { + "epoch": 1.808349146110057, + "grad_norm": 1.8660080432891846, + "learning_rate": 4.778687386790515e-07, + "loss": 0.1772, + "step": 3812 + }, + { + "epoch": 1.8088235294117647, + "grad_norm": 1.328919768333435, + "learning_rate": 4.7552488653110575e-07, + "loss": 0.1042, + "step": 3813 + }, + { + "epoch": 1.8092979127134725, + "grad_norm": 1.7662233114242554, + "learning_rate": 4.731866565865717e-07, + "loss": 0.1594, + "step": 3814 + }, + { + "epoch": 1.8097722960151803, + "grad_norm": 1.4320173263549805, + "learning_rate": 4.7085405022568196e-07, + "loss": 0.1277, + "step": 3815 + }, + { + "epoch": 1.810246679316888, + "grad_norm": 1.8004564046859741, + "learning_rate": 4.685270688253507e-07, + "loss": 0.1586, + "step": 3816 + }, + { + "epoch": 1.8107210626185957, + "grad_norm": 1.5289167165756226, + "learning_rate": 4.6620571375917356e-07, + "loss": 0.1296, + "step": 3817 + }, + { + "epoch": 1.8111954459203035, + "grad_norm": 1.3791871070861816, + "learning_rate": 4.638899863974222e-07, + "loss": 0.1142, + "step": 3818 + }, + { + "epoch": 1.8116698292220113, + "grad_norm": 1.5164226293563843, + "learning_rate": 4.615798881070499e-07, + "loss": 0.157, + "step": 3819 + }, + { + "epoch": 1.812144212523719, + "grad_norm": 1.3034436702728271, + "learning_rate": 4.5927542025168025e-07, + "loss": 0.1208, + "step": 3820 + }, + { + "epoch": 1.8126185958254268, + "grad_norm": 1.3577231168746948, + "learning_rate": 4.5697658419162183e-07, + "loss": 0.1204, + "step": 3821 + }, + { + "epoch": 1.8130929791271346, + "grad_norm": 1.599459171295166, + "learning_rate": 4.5468338128385247e-07, + "loss": 0.1329, + "step": 3822 + }, + { + "epoch": 1.8135673624288424, + "grad_norm": 1.78299081325531, + "learning_rate": 4.523958128820283e-07, + "loss": 0.1555, + "step": 3823 + }, + { + "epoch": 1.8140417457305502, + "grad_norm": 2.62516450881958, + "learning_rate": 4.501138803364802e-07, + "loss": 0.2332, + "step": 3824 + }, + { + "epoch": 1.814516129032258, + "grad_norm": 1.7529395818710327, + "learning_rate": 4.478375849942063e-07, + "loss": 0.1568, + "step": 3825 + }, + { + "epoch": 1.8149905123339658, + "grad_norm": 1.5792466402053833, + "learning_rate": 4.4556692819888504e-07, + "loss": 0.1675, + "step": 3826 + }, + { + "epoch": 1.8154648956356736, + "grad_norm": 1.464310646057129, + "learning_rate": 4.4330191129085873e-07, + "loss": 0.1283, + "step": 3827 + }, + { + "epoch": 1.8159392789373814, + "grad_norm": 1.2922571897506714, + "learning_rate": 4.4104253560714794e-07, + "loss": 0.1136, + "step": 3828 + }, + { + "epoch": 1.8164136622390892, + "grad_norm": 1.4315319061279297, + "learning_rate": 4.3878880248143904e-07, + "loss": 0.1296, + "step": 3829 + }, + { + "epoch": 1.816888045540797, + "grad_norm": 1.5491856336593628, + "learning_rate": 4.3654071324408685e-07, + "loss": 0.1488, + "step": 3830 + }, + { + "epoch": 1.8173624288425048, + "grad_norm": 1.3317991495132446, + "learning_rate": 4.342982692221165e-07, + "loss": 0.1164, + "step": 3831 + }, + { + "epoch": 1.8178368121442126, + "grad_norm": 1.8910882472991943, + "learning_rate": 4.3206147173922133e-07, + "loss": 0.176, + "step": 3832 + }, + { + "epoch": 1.8183111954459203, + "grad_norm": 1.317647933959961, + "learning_rate": 4.298303221157618e-07, + "loss": 0.1262, + "step": 3833 + }, + { + "epoch": 1.8187855787476281, + "grad_norm": 1.5027046203613281, + "learning_rate": 4.276048216687634e-07, + "loss": 0.1289, + "step": 3834 + }, + { + "epoch": 1.819259962049336, + "grad_norm": 1.5207945108413696, + "learning_rate": 4.25384971711913e-07, + "loss": 0.1304, + "step": 3835 + }, + { + "epoch": 1.8197343453510437, + "grad_norm": 2.8607802391052246, + "learning_rate": 4.231707735555701e-07, + "loss": 0.1902, + "step": 3836 + }, + { + "epoch": 1.8202087286527515, + "grad_norm": 1.8455995321273804, + "learning_rate": 4.209622285067516e-07, + "loss": 0.1518, + "step": 3837 + }, + { + "epoch": 1.8206831119544593, + "grad_norm": 1.3427563905715942, + "learning_rate": 4.187593378691435e-07, + "loss": 0.1137, + "step": 3838 + }, + { + "epoch": 1.821157495256167, + "grad_norm": 1.9709796905517578, + "learning_rate": 4.165621029430855e-07, + "loss": 0.1531, + "step": 3839 + }, + { + "epoch": 1.821631878557875, + "grad_norm": 2.085360527038574, + "learning_rate": 4.1437052502558693e-07, + "loss": 0.1924, + "step": 3840 + }, + { + "epoch": 1.8221062618595827, + "grad_norm": 1.684356927871704, + "learning_rate": 4.1218460541031404e-07, + "loss": 0.1503, + "step": 3841 + }, + { + "epoch": 1.8225806451612905, + "grad_norm": 1.6647909879684448, + "learning_rate": 4.1000434538759235e-07, + "loss": 0.1498, + "step": 3842 + }, + { + "epoch": 1.823055028462998, + "grad_norm": 1.9033278226852417, + "learning_rate": 4.078297462444092e-07, + "loss": 0.1553, + "step": 3843 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 1.4398269653320312, + "learning_rate": 4.0566080926440765e-07, + "loss": 0.128, + "step": 3844 + }, + { + "epoch": 1.8240037950664136, + "grad_norm": 1.634743571281433, + "learning_rate": 4.034975357278903e-07, + "loss": 0.1454, + "step": 3845 + }, + { + "epoch": 1.8244781783681214, + "grad_norm": 1.7548015117645264, + "learning_rate": 4.013399269118157e-07, + "loss": 0.1535, + "step": 3846 + }, + { + "epoch": 1.8249525616698292, + "grad_norm": 1.925663948059082, + "learning_rate": 3.991879840897994e-07, + "loss": 0.1483, + "step": 3847 + }, + { + "epoch": 1.825426944971537, + "grad_norm": 1.2624423503875732, + "learning_rate": 3.970417085321143e-07, + "loss": 0.1018, + "step": 3848 + }, + { + "epoch": 1.8259013282732448, + "grad_norm": 1.4971909523010254, + "learning_rate": 3.949011015056803e-07, + "loss": 0.1474, + "step": 3849 + }, + { + "epoch": 1.8263757115749526, + "grad_norm": 1.7054779529571533, + "learning_rate": 3.9276616427408207e-07, + "loss": 0.1422, + "step": 3850 + }, + { + "epoch": 1.8268500948766604, + "grad_norm": 1.506289005279541, + "learning_rate": 3.9063689809754837e-07, + "loss": 0.1337, + "step": 3851 + }, + { + "epoch": 1.827324478178368, + "grad_norm": 1.4159045219421387, + "learning_rate": 3.8851330423296476e-07, + "loss": 0.1259, + "step": 3852 + }, + { + "epoch": 1.8277988614800758, + "grad_norm": 1.5308058261871338, + "learning_rate": 3.8639538393386854e-07, + "loss": 0.1109, + "step": 3853 + }, + { + "epoch": 1.8282732447817835, + "grad_norm": 1.6169154644012451, + "learning_rate": 3.842831384504453e-07, + "loss": 0.1385, + "step": 3854 + }, + { + "epoch": 1.8287476280834913, + "grad_norm": 1.4959263801574707, + "learning_rate": 3.821765690295343e-07, + "loss": 0.1364, + "step": 3855 + }, + { + "epoch": 1.8292220113851991, + "grad_norm": 1.550232172012329, + "learning_rate": 3.8007567691462187e-07, + "loss": 0.1415, + "step": 3856 + }, + { + "epoch": 1.829696394686907, + "grad_norm": 1.8426754474639893, + "learning_rate": 3.77980463345845e-07, + "loss": 0.1443, + "step": 3857 + }, + { + "epoch": 1.8301707779886147, + "grad_norm": 1.846116065979004, + "learning_rate": 3.758909295599877e-07, + "loss": 0.1743, + "step": 3858 + }, + { + "epoch": 1.8306451612903225, + "grad_norm": 2.2521839141845703, + "learning_rate": 3.738070767904778e-07, + "loss": 0.1485, + "step": 3859 + }, + { + "epoch": 1.8311195445920303, + "grad_norm": 1.8919899463653564, + "learning_rate": 3.7172890626739566e-07, + "loss": 0.1651, + "step": 3860 + }, + { + "epoch": 1.831593927893738, + "grad_norm": 1.477987289428711, + "learning_rate": 3.696564192174645e-07, + "loss": 0.1399, + "step": 3861 + }, + { + "epoch": 1.8320683111954459, + "grad_norm": 1.4697281122207642, + "learning_rate": 3.675896168640536e-07, + "loss": 0.114, + "step": 3862 + }, + { + "epoch": 1.8325426944971537, + "grad_norm": 1.2176628112792969, + "learning_rate": 3.655285004271747e-07, + "loss": 0.1069, + "step": 3863 + }, + { + "epoch": 1.8330170777988615, + "grad_norm": 1.5733720064163208, + "learning_rate": 3.634730711234835e-07, + "loss": 0.1506, + "step": 3864 + }, + { + "epoch": 1.8334914611005693, + "grad_norm": 2.247699737548828, + "learning_rate": 3.6142333016628286e-07, + "loss": 0.1617, + "step": 3865 + }, + { + "epoch": 1.833965844402277, + "grad_norm": 1.3761510848999023, + "learning_rate": 3.593792787655115e-07, + "loss": 0.1083, + "step": 3866 + }, + { + "epoch": 1.8344402277039848, + "grad_norm": 1.6789445877075195, + "learning_rate": 3.573409181277554e-07, + "loss": 0.1403, + "step": 3867 + }, + { + "epoch": 1.8349146110056926, + "grad_norm": 1.8942904472351074, + "learning_rate": 3.553082494562354e-07, + "loss": 0.1702, + "step": 3868 + }, + { + "epoch": 1.8353889943074004, + "grad_norm": 1.7401469945907593, + "learning_rate": 3.5328127395081736e-07, + "loss": 0.1493, + "step": 3869 + }, + { + "epoch": 1.8358633776091082, + "grad_norm": 1.4459508657455444, + "learning_rate": 3.5125999280800517e-07, + "loss": 0.1332, + "step": 3870 + }, + { + "epoch": 1.836337760910816, + "grad_norm": 1.4247829914093018, + "learning_rate": 3.492444072209411e-07, + "loss": 0.1384, + "step": 3871 + }, + { + "epoch": 1.8368121442125238, + "grad_norm": 1.431828260421753, + "learning_rate": 3.472345183794046e-07, + "loss": 0.1207, + "step": 3872 + }, + { + "epoch": 1.8372865275142316, + "grad_norm": 1.6826831102371216, + "learning_rate": 3.4523032746981434e-07, + "loss": 0.1252, + "step": 3873 + }, + { + "epoch": 1.8377609108159394, + "grad_norm": 1.5859261751174927, + "learning_rate": 3.43231835675224e-07, + "loss": 0.1273, + "step": 3874 + }, + { + "epoch": 1.8382352941176472, + "grad_norm": 1.906586766242981, + "learning_rate": 3.4123904417532325e-07, + "loss": 0.1509, + "step": 3875 + }, + { + "epoch": 1.838709677419355, + "grad_norm": 1.7677314281463623, + "learning_rate": 3.3925195414643677e-07, + "loss": 0.1258, + "step": 3876 + }, + { + "epoch": 1.8391840607210628, + "grad_norm": 1.9575536251068115, + "learning_rate": 3.37270566761525e-07, + "loss": 0.1903, + "step": 3877 + }, + { + "epoch": 1.8396584440227703, + "grad_norm": 1.5801265239715576, + "learning_rate": 3.3529488319017924e-07, + "loss": 0.123, + "step": 3878 + }, + { + "epoch": 1.8401328273244781, + "grad_norm": 1.6547151803970337, + "learning_rate": 3.3332490459862865e-07, + "loss": 0.1577, + "step": 3879 + }, + { + "epoch": 1.840607210626186, + "grad_norm": 1.688603401184082, + "learning_rate": 3.313606321497309e-07, + "loss": 0.1416, + "step": 3880 + }, + { + "epoch": 1.8410815939278937, + "grad_norm": 1.4524204730987549, + "learning_rate": 3.294020670029785e-07, + "loss": 0.1121, + "step": 3881 + }, + { + "epoch": 1.8415559772296015, + "grad_norm": 1.886781930923462, + "learning_rate": 3.2744921031448997e-07, + "loss": 0.1375, + "step": 3882 + }, + { + "epoch": 1.8420303605313093, + "grad_norm": 1.9067200422286987, + "learning_rate": 3.255020632370176e-07, + "loss": 0.182, + "step": 3883 + }, + { + "epoch": 1.842504743833017, + "grad_norm": 1.91090989112854, + "learning_rate": 3.235606269199454e-07, + "loss": 0.1711, + "step": 3884 + }, + { + "epoch": 1.842979127134725, + "grad_norm": 2.2755870819091797, + "learning_rate": 3.2162490250928103e-07, + "loss": 0.1389, + "step": 3885 + }, + { + "epoch": 1.8434535104364327, + "grad_norm": 1.7041386365890503, + "learning_rate": 3.1969489114766715e-07, + "loss": 0.1558, + "step": 3886 + }, + { + "epoch": 1.8439278937381403, + "grad_norm": 1.5372099876403809, + "learning_rate": 3.1777059397436693e-07, + "loss": 0.1196, + "step": 3887 + }, + { + "epoch": 1.844402277039848, + "grad_norm": 1.4192637205123901, + "learning_rate": 3.1585201212527507e-07, + "loss": 0.1458, + "step": 3888 + }, + { + "epoch": 1.8448766603415558, + "grad_norm": 2.2096190452575684, + "learning_rate": 3.1393914673291335e-07, + "loss": 0.1621, + "step": 3889 + }, + { + "epoch": 1.8453510436432636, + "grad_norm": 1.54441499710083, + "learning_rate": 3.120319989264242e-07, + "loss": 0.1255, + "step": 3890 + }, + { + "epoch": 1.8458254269449714, + "grad_norm": 1.6018781661987305, + "learning_rate": 3.101305698315815e-07, + "loss": 0.1354, + "step": 3891 + }, + { + "epoch": 1.8462998102466792, + "grad_norm": 2.1039795875549316, + "learning_rate": 3.082348605707752e-07, + "loss": 0.1501, + "step": 3892 + }, + { + "epoch": 1.846774193548387, + "grad_norm": 1.5671883821487427, + "learning_rate": 3.06344872263028e-07, + "loss": 0.1483, + "step": 3893 + }, + { + "epoch": 1.8472485768500948, + "grad_norm": 1.5464131832122803, + "learning_rate": 3.0446060602397965e-07, + "loss": 0.1455, + "step": 3894 + }, + { + "epoch": 1.8477229601518026, + "grad_norm": 1.674782156944275, + "learning_rate": 3.0258206296589487e-07, + "loss": 0.158, + "step": 3895 + }, + { + "epoch": 1.8481973434535104, + "grad_norm": 1.4829376935958862, + "learning_rate": 3.007092441976567e-07, + "loss": 0.1428, + "step": 3896 + }, + { + "epoch": 1.8486717267552182, + "grad_norm": 1.6832263469696045, + "learning_rate": 2.988421508247741e-07, + "loss": 0.1594, + "step": 3897 + }, + { + "epoch": 1.849146110056926, + "grad_norm": 1.5657440423965454, + "learning_rate": 2.9698078394937325e-07, + "loss": 0.1129, + "step": 3898 + }, + { + "epoch": 1.8496204933586338, + "grad_norm": 1.9195455312728882, + "learning_rate": 2.951251446701997e-07, + "loss": 0.1411, + "step": 3899 + }, + { + "epoch": 1.8500948766603416, + "grad_norm": 1.7796403169631958, + "learning_rate": 2.932752340826195e-07, + "loss": 0.1521, + "step": 3900 + }, + { + "epoch": 1.8505692599620494, + "grad_norm": 1.448384404182434, + "learning_rate": 2.914310532786158e-07, + "loss": 0.1057, + "step": 3901 + }, + { + "epoch": 1.8510436432637571, + "grad_norm": 1.8354464769363403, + "learning_rate": 2.8959260334679107e-07, + "loss": 0.1109, + "step": 3902 + }, + { + "epoch": 1.851518026565465, + "grad_norm": 1.9993449449539185, + "learning_rate": 2.87759885372364e-07, + "loss": 0.194, + "step": 3903 + }, + { + "epoch": 1.8519924098671727, + "grad_norm": 1.7258479595184326, + "learning_rate": 2.859329004371703e-07, + "loss": 0.1546, + "step": 3904 + }, + { + "epoch": 1.8524667931688805, + "grad_norm": 1.6797391176223755, + "learning_rate": 2.8411164961966164e-07, + "loss": 0.1433, + "step": 3905 + }, + { + "epoch": 1.8529411764705883, + "grad_norm": 1.4090880155563354, + "learning_rate": 2.8229613399490265e-07, + "loss": 0.1238, + "step": 3906 + }, + { + "epoch": 1.853415559772296, + "grad_norm": 1.9143239259719849, + "learning_rate": 2.8048635463457485e-07, + "loss": 0.1373, + "step": 3907 + }, + { + "epoch": 1.853889943074004, + "grad_norm": 1.5854467153549194, + "learning_rate": 2.7868231260697267e-07, + "loss": 0.1538, + "step": 3908 + }, + { + "epoch": 1.8543643263757117, + "grad_norm": 1.3684797286987305, + "learning_rate": 2.768840089770053e-07, + "loss": 0.0993, + "step": 3909 + }, + { + "epoch": 1.8548387096774195, + "grad_norm": 1.3780193328857422, + "learning_rate": 2.750914448061925e-07, + "loss": 0.0877, + "step": 3910 + }, + { + "epoch": 1.8553130929791273, + "grad_norm": 1.7516002655029297, + "learning_rate": 2.73304621152668e-07, + "loss": 0.1584, + "step": 3911 + }, + { + "epoch": 1.855787476280835, + "grad_norm": 2.2483954429626465, + "learning_rate": 2.7152353907117566e-07, + "loss": 0.2224, + "step": 3912 + }, + { + "epoch": 1.8562618595825426, + "grad_norm": 1.5955204963684082, + "learning_rate": 2.697481996130713e-07, + "loss": 0.127, + "step": 3913 + }, + { + "epoch": 1.8567362428842504, + "grad_norm": 1.3224315643310547, + "learning_rate": 2.6797860382631993e-07, + "loss": 0.1119, + "step": 3914 + }, + { + "epoch": 1.8572106261859582, + "grad_norm": 1.8325411081314087, + "learning_rate": 2.6621475275549593e-07, + "loss": 0.1764, + "step": 3915 + }, + { + "epoch": 1.857685009487666, + "grad_norm": 1.426345705986023, + "learning_rate": 2.644566474417831e-07, + "loss": 0.1312, + "step": 3916 + }, + { + "epoch": 1.8581593927893738, + "grad_norm": 1.5736240148544312, + "learning_rate": 2.627042889229736e-07, + "loss": 0.1433, + "step": 3917 + }, + { + "epoch": 1.8586337760910816, + "grad_norm": 1.5317918062210083, + "learning_rate": 2.609576782334688e-07, + "loss": 0.1323, + "step": 3918 + }, + { + "epoch": 1.8591081593927894, + "grad_norm": 1.8912248611450195, + "learning_rate": 2.592168164042741e-07, + "loss": 0.14, + "step": 3919 + }, + { + "epoch": 1.8595825426944972, + "grad_norm": 1.658992052078247, + "learning_rate": 2.57481704463004e-07, + "loss": 0.1498, + "step": 3920 + }, + { + "epoch": 1.860056925996205, + "grad_norm": 1.9872993230819702, + "learning_rate": 2.5575234343387603e-07, + "loss": 0.1534, + "step": 3921 + }, + { + "epoch": 1.8605313092979125, + "grad_norm": 1.5195602178573608, + "learning_rate": 2.5402873433771793e-07, + "loss": 0.1353, + "step": 3922 + }, + { + "epoch": 1.8610056925996203, + "grad_norm": 1.7144728899002075, + "learning_rate": 2.52310878191957e-07, + "loss": 0.1466, + "step": 3923 + }, + { + "epoch": 1.8614800759013281, + "grad_norm": 1.6180931329727173, + "learning_rate": 2.5059877601062655e-07, + "loss": 0.1519, + "step": 3924 + }, + { + "epoch": 1.861954459203036, + "grad_norm": 1.5871514081954956, + "learning_rate": 2.488924288043648e-07, + "loss": 0.1451, + "step": 3925 + }, + { + "epoch": 1.8624288425047437, + "grad_norm": 1.9041343927383423, + "learning_rate": 2.4719183758041056e-07, + "loss": 0.1313, + "step": 3926 + }, + { + "epoch": 1.8629032258064515, + "grad_norm": 1.3394379615783691, + "learning_rate": 2.454970033426052e-07, + "loss": 0.1156, + "step": 3927 + }, + { + "epoch": 1.8633776091081593, + "grad_norm": 1.8027927875518799, + "learning_rate": 2.4380792709139513e-07, + "loss": 0.1709, + "step": 3928 + }, + { + "epoch": 1.863851992409867, + "grad_norm": 1.628211498260498, + "learning_rate": 2.4212460982382503e-07, + "loss": 0.1529, + "step": 3929 + }, + { + "epoch": 1.864326375711575, + "grad_norm": 1.6816056966781616, + "learning_rate": 2.4044705253353897e-07, + "loss": 0.171, + "step": 3930 + }, + { + "epoch": 1.8648007590132827, + "grad_norm": 1.5056520700454712, + "learning_rate": 2.387752562107826e-07, + "loss": 0.1233, + "step": 3931 + }, + { + "epoch": 1.8652751423149905, + "grad_norm": 1.4784669876098633, + "learning_rate": 2.3710922184239983e-07, + "loss": 0.103, + "step": 3932 + }, + { + "epoch": 1.8657495256166983, + "grad_norm": 1.5319194793701172, + "learning_rate": 2.3544895041183736e-07, + "loss": 0.131, + "step": 3933 + }, + { + "epoch": 1.866223908918406, + "grad_norm": 1.5678507089614868, + "learning_rate": 2.3379444289913344e-07, + "loss": 0.1255, + "step": 3934 + }, + { + "epoch": 1.8666982922201139, + "grad_norm": 1.686842441558838, + "learning_rate": 2.321457002809302e-07, + "loss": 0.1496, + "step": 3935 + }, + { + "epoch": 1.8671726755218216, + "grad_norm": 1.3267841339111328, + "learning_rate": 2.3050272353046244e-07, + "loss": 0.0997, + "step": 3936 + }, + { + "epoch": 1.8676470588235294, + "grad_norm": 1.3671629428863525, + "learning_rate": 2.2886551361756326e-07, + "loss": 0.1393, + "step": 3937 + }, + { + "epoch": 1.8681214421252372, + "grad_norm": 1.4104747772216797, + "learning_rate": 2.2723407150866295e-07, + "loss": 0.117, + "step": 3938 + }, + { + "epoch": 1.868595825426945, + "grad_norm": 1.6997783184051514, + "learning_rate": 2.2560839816678447e-07, + "loss": 0.1638, + "step": 3939 + }, + { + "epoch": 1.8690702087286528, + "grad_norm": 1.3959341049194336, + "learning_rate": 2.2398849455154693e-07, + "loss": 0.1404, + "step": 3940 + }, + { + "epoch": 1.8695445920303606, + "grad_norm": 1.8219751119613647, + "learning_rate": 2.2237436161916204e-07, + "loss": 0.163, + "step": 3941 + }, + { + "epoch": 1.8700189753320684, + "grad_norm": 1.9857854843139648, + "learning_rate": 2.2076600032243766e-07, + "loss": 0.1881, + "step": 3942 + }, + { + "epoch": 1.8704933586337762, + "grad_norm": 1.7564761638641357, + "learning_rate": 2.1916341161077547e-07, + "loss": 0.1525, + "step": 3943 + }, + { + "epoch": 1.870967741935484, + "grad_norm": 1.8521637916564941, + "learning_rate": 2.175665964301643e-07, + "loss": 0.16, + "step": 3944 + }, + { + "epoch": 1.8714421252371918, + "grad_norm": 1.7699942588806152, + "learning_rate": 2.1597555572319017e-07, + "loss": 0.1384, + "step": 3945 + }, + { + "epoch": 1.8719165085388996, + "grad_norm": 1.621044635772705, + "learning_rate": 2.143902904290296e-07, + "loss": 0.1489, + "step": 3946 + }, + { + "epoch": 1.8723908918406074, + "grad_norm": 1.7144032716751099, + "learning_rate": 2.1281080148344734e-07, + "loss": 0.1488, + "step": 3947 + }, + { + "epoch": 1.8728652751423152, + "grad_norm": 1.5608563423156738, + "learning_rate": 2.1123708981880097e-07, + "loss": 0.133, + "step": 3948 + }, + { + "epoch": 1.8733396584440227, + "grad_norm": 1.6698534488677979, + "learning_rate": 2.0966915636403518e-07, + "loss": 0.1631, + "step": 3949 + }, + { + "epoch": 1.8738140417457305, + "grad_norm": 1.595363974571228, + "learning_rate": 2.0810700204468737e-07, + "loss": 0.1502, + "step": 3950 + }, + { + "epoch": 1.8742884250474383, + "grad_norm": 1.279978632926941, + "learning_rate": 2.0655062778288103e-07, + "loss": 0.1074, + "step": 3951 + }, + { + "epoch": 1.874762808349146, + "grad_norm": 1.4724317789077759, + "learning_rate": 2.050000344973302e-07, + "loss": 0.1373, + "step": 3952 + }, + { + "epoch": 1.875237191650854, + "grad_norm": 1.8594779968261719, + "learning_rate": 2.0345522310333154e-07, + "loss": 0.1505, + "step": 3953 + }, + { + "epoch": 1.8757115749525617, + "grad_norm": 1.8657863140106201, + "learning_rate": 2.0191619451277568e-07, + "loss": 0.1591, + "step": 3954 + }, + { + "epoch": 1.8761859582542695, + "grad_norm": 1.5500346422195435, + "learning_rate": 2.0038294963413251e-07, + "loss": 0.1185, + "step": 3955 + }, + { + "epoch": 1.8766603415559773, + "grad_norm": 2.1276473999023438, + "learning_rate": 1.9885548937246259e-07, + "loss": 0.1951, + "step": 3956 + }, + { + "epoch": 1.8771347248576848, + "grad_norm": 1.9857920408248901, + "learning_rate": 1.9733381462941237e-07, + "loss": 0.1628, + "step": 3957 + }, + { + "epoch": 1.8776091081593926, + "grad_norm": 1.5573140382766724, + "learning_rate": 1.9581792630320784e-07, + "loss": 0.142, + "step": 3958 + }, + { + "epoch": 1.8780834914611004, + "grad_norm": 1.2747137546539307, + "learning_rate": 1.9430782528866655e-07, + "loss": 0.0852, + "step": 3959 + }, + { + "epoch": 1.8785578747628082, + "grad_norm": 1.549103856086731, + "learning_rate": 1.928035124771832e-07, + "loss": 0.1298, + "step": 3960 + }, + { + "epoch": 1.879032258064516, + "grad_norm": 1.432792067527771, + "learning_rate": 1.9130498875673975e-07, + "loss": 0.1194, + "step": 3961 + }, + { + "epoch": 1.8795066413662238, + "grad_norm": 1.63291335105896, + "learning_rate": 1.8981225501190193e-07, + "loss": 0.1418, + "step": 3962 + }, + { + "epoch": 1.8799810246679316, + "grad_norm": 1.7714699506759644, + "learning_rate": 1.8832531212381378e-07, + "loss": 0.1696, + "step": 3963 + }, + { + "epoch": 1.8804554079696394, + "grad_norm": 1.4121677875518799, + "learning_rate": 1.8684416097020318e-07, + "loss": 0.1029, + "step": 3964 + }, + { + "epoch": 1.8809297912713472, + "grad_norm": 2.2035553455352783, + "learning_rate": 1.853688024253786e-07, + "loss": 0.1971, + "step": 3965 + }, + { + "epoch": 1.881404174573055, + "grad_norm": 1.4989498853683472, + "learning_rate": 1.8389923736022886e-07, + "loss": 0.1317, + "step": 3966 + }, + { + "epoch": 1.8818785578747628, + "grad_norm": 1.5162180662155151, + "learning_rate": 1.824354666422268e-07, + "loss": 0.1308, + "step": 3967 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 1.9501628875732422, + "learning_rate": 1.8097749113541897e-07, + "loss": 0.1492, + "step": 3968 + }, + { + "epoch": 1.8828273244781784, + "grad_norm": 1.5917279720306396, + "learning_rate": 1.7952531170043474e-07, + "loss": 0.1331, + "step": 3969 + }, + { + "epoch": 1.8833017077798861, + "grad_norm": 1.2856277227401733, + "learning_rate": 1.7807892919448178e-07, + "loss": 0.1212, + "step": 3970 + }, + { + "epoch": 1.883776091081594, + "grad_norm": 1.876271367073059, + "learning_rate": 1.7663834447134488e-07, + "loss": 0.184, + "step": 3971 + }, + { + "epoch": 1.8842504743833017, + "grad_norm": 2.1068382263183594, + "learning_rate": 1.7520355838138603e-07, + "loss": 0.1388, + "step": 3972 + }, + { + "epoch": 1.8847248576850095, + "grad_norm": 1.8254120349884033, + "learning_rate": 1.7377457177154554e-07, + "loss": 0.1451, + "step": 3973 + }, + { + "epoch": 1.8851992409867173, + "grad_norm": 2.037038564682007, + "learning_rate": 1.7235138548534202e-07, + "loss": 0.1183, + "step": 3974 + }, + { + "epoch": 1.885673624288425, + "grad_norm": 1.3412225246429443, + "learning_rate": 1.7093400036286567e-07, + "loss": 0.1199, + "step": 3975 + }, + { + "epoch": 1.886148007590133, + "grad_norm": 1.9562183618545532, + "learning_rate": 1.6952241724078723e-07, + "loss": 0.1583, + "step": 3976 + }, + { + "epoch": 1.8866223908918407, + "grad_norm": 1.518302321434021, + "learning_rate": 1.6811663695234903e-07, + "loss": 0.1554, + "step": 3977 + }, + { + "epoch": 1.8870967741935485, + "grad_norm": 1.9183396100997925, + "learning_rate": 1.6671666032736845e-07, + "loss": 0.1279, + "step": 3978 + }, + { + "epoch": 1.8875711574952563, + "grad_norm": 2.229943037033081, + "learning_rate": 1.6532248819223995e-07, + "loss": 0.1979, + "step": 3979 + }, + { + "epoch": 1.888045540796964, + "grad_norm": 1.8653277158737183, + "learning_rate": 1.639341213699286e-07, + "loss": 0.1692, + "step": 3980 + }, + { + "epoch": 1.8885199240986719, + "grad_norm": 1.746902346611023, + "learning_rate": 1.6255156067997325e-07, + "loss": 0.1305, + "step": 3981 + }, + { + "epoch": 1.8889943074003797, + "grad_norm": 1.417380928993225, + "learning_rate": 1.6117480693848442e-07, + "loss": 0.1378, + "step": 3982 + }, + { + "epoch": 1.8894686907020875, + "grad_norm": 1.7626910209655762, + "learning_rate": 1.598038609581487e-07, + "loss": 0.1556, + "step": 3983 + }, + { + "epoch": 1.889943074003795, + "grad_norm": 1.6275900602340698, + "learning_rate": 1.5843872354822099e-07, + "loss": 0.1346, + "step": 3984 + }, + { + "epoch": 1.8904174573055028, + "grad_norm": 2.031325340270996, + "learning_rate": 1.5707939551452778e-07, + "loss": 0.1854, + "step": 3985 + }, + { + "epoch": 1.8908918406072106, + "grad_norm": 1.9907748699188232, + "learning_rate": 1.5572587765946833e-07, + "loss": 0.1645, + "step": 3986 + }, + { + "epoch": 1.8913662239089184, + "grad_norm": 1.80055570602417, + "learning_rate": 1.5437817078201024e-07, + "loss": 0.1441, + "step": 3987 + }, + { + "epoch": 1.8918406072106262, + "grad_norm": 1.7448289394378662, + "learning_rate": 1.5303627567769043e-07, + "loss": 0.1543, + "step": 3988 + }, + { + "epoch": 1.892314990512334, + "grad_norm": 1.6348083019256592, + "learning_rate": 1.517001931386175e-07, + "loss": 0.1384, + "step": 3989 + }, + { + "epoch": 1.8927893738140418, + "grad_norm": 1.6425923109054565, + "learning_rate": 1.5036992395346838e-07, + "loss": 0.1273, + "step": 3990 + }, + { + "epoch": 1.8932637571157496, + "grad_norm": 1.3642024993896484, + "learning_rate": 1.4904546890748606e-07, + "loss": 0.123, + "step": 3991 + }, + { + "epoch": 1.8937381404174574, + "grad_norm": 2.2443578243255615, + "learning_rate": 1.477268287824829e-07, + "loss": 0.1664, + "step": 3992 + }, + { + "epoch": 1.894212523719165, + "grad_norm": 1.5792819261550903, + "learning_rate": 1.4641400435684184e-07, + "loss": 0.1412, + "step": 3993 + }, + { + "epoch": 1.8946869070208727, + "grad_norm": 1.4068413972854614, + "learning_rate": 1.4510699640550852e-07, + "loss": 0.1264, + "step": 3994 + }, + { + "epoch": 1.8951612903225805, + "grad_norm": 1.7615771293640137, + "learning_rate": 1.438058056999969e-07, + "loss": 0.1467, + "step": 3995 + }, + { + "epoch": 1.8956356736242883, + "grad_norm": 1.797843098640442, + "learning_rate": 1.4251043300838706e-07, + "loss": 0.1566, + "step": 3996 + }, + { + "epoch": 1.896110056925996, + "grad_norm": 1.5007370710372925, + "learning_rate": 1.412208790953229e-07, + "loss": 0.1244, + "step": 3997 + }, + { + "epoch": 1.896584440227704, + "grad_norm": 1.8236267566680908, + "learning_rate": 1.399371447220188e-07, + "loss": 0.1535, + "step": 3998 + }, + { + "epoch": 1.8970588235294117, + "grad_norm": 1.9256770610809326, + "learning_rate": 1.3865923064624753e-07, + "loss": 0.1759, + "step": 3999 + }, + { + "epoch": 1.8975332068311195, + "grad_norm": 1.9802489280700684, + "learning_rate": 1.3738713762235124e-07, + "loss": 0.1589, + "step": 4000 + }, + { + "epoch": 1.8980075901328273, + "grad_norm": 1.7271060943603516, + "learning_rate": 1.3612086640123257e-07, + "loss": 0.1405, + "step": 4001 + }, + { + "epoch": 1.898481973434535, + "grad_norm": 1.7396737337112427, + "learning_rate": 1.348604177303592e-07, + "loss": 0.1848, + "step": 4002 + }, + { + "epoch": 1.8989563567362429, + "grad_norm": 1.5223190784454346, + "learning_rate": 1.3360579235376148e-07, + "loss": 0.1409, + "step": 4003 + }, + { + "epoch": 1.8994307400379506, + "grad_norm": 1.359562635421753, + "learning_rate": 1.323569910120326e-07, + "loss": 0.1323, + "step": 4004 + }, + { + "epoch": 1.8999051233396584, + "grad_norm": 1.640760064125061, + "learning_rate": 1.311140144423273e-07, + "loss": 0.1185, + "step": 4005 + }, + { + "epoch": 1.9003795066413662, + "grad_norm": 1.469146966934204, + "learning_rate": 1.2987686337836202e-07, + "loss": 0.1387, + "step": 4006 + }, + { + "epoch": 1.900853889943074, + "grad_norm": 1.6434742212295532, + "learning_rate": 1.2864553855041484e-07, + "loss": 0.159, + "step": 4007 + }, + { + "epoch": 1.9013282732447818, + "grad_norm": 1.5248254537582397, + "learning_rate": 1.2742004068532544e-07, + "loss": 0.1352, + "step": 4008 + }, + { + "epoch": 1.9018026565464896, + "grad_norm": 1.5884958505630493, + "learning_rate": 1.2620037050649404e-07, + "loss": 0.1388, + "step": 4009 + }, + { + "epoch": 1.9022770398481974, + "grad_norm": 1.4101170301437378, + "learning_rate": 1.2498652873387696e-07, + "loss": 0.1303, + "step": 4010 + }, + { + "epoch": 1.9027514231499052, + "grad_norm": 1.5898159742355347, + "learning_rate": 1.237785160839955e-07, + "loss": 0.1243, + "step": 4011 + }, + { + "epoch": 1.903225806451613, + "grad_norm": 1.8250586986541748, + "learning_rate": 1.225763332699259e-07, + "loss": 0.1687, + "step": 4012 + }, + { + "epoch": 1.9037001897533208, + "grad_norm": 1.5457115173339844, + "learning_rate": 1.21379981001305e-07, + "loss": 0.137, + "step": 4013 + }, + { + "epoch": 1.9041745730550286, + "grad_norm": 2.082979917526245, + "learning_rate": 1.2018945998433007e-07, + "loss": 0.1687, + "step": 4014 + }, + { + "epoch": 1.9046489563567364, + "grad_norm": 1.4070978164672852, + "learning_rate": 1.190047709217501e-07, + "loss": 0.0983, + "step": 4015 + }, + { + "epoch": 1.9051233396584442, + "grad_norm": 1.6972084045410156, + "learning_rate": 1.1782591451287795e-07, + "loss": 0.1345, + "step": 4016 + }, + { + "epoch": 1.905597722960152, + "grad_norm": 1.9089219570159912, + "learning_rate": 1.1665289145357916e-07, + "loss": 0.1535, + "step": 4017 + }, + { + "epoch": 1.9060721062618597, + "grad_norm": 1.5392357110977173, + "learning_rate": 1.1548570243627988e-07, + "loss": 0.1201, + "step": 4018 + }, + { + "epoch": 1.9065464895635673, + "grad_norm": 1.7641630172729492, + "learning_rate": 1.1432434814995897e-07, + "loss": 0.1616, + "step": 4019 + }, + { + "epoch": 1.907020872865275, + "grad_norm": 1.4591128826141357, + "learning_rate": 1.131688292801525e-07, + "loss": 0.125, + "step": 4020 + }, + { + "epoch": 1.907495256166983, + "grad_norm": 1.7952362298965454, + "learning_rate": 1.1201914650895152e-07, + "loss": 0.1476, + "step": 4021 + }, + { + "epoch": 1.9079696394686907, + "grad_norm": 1.4461826086044312, + "learning_rate": 1.1087530051500206e-07, + "loss": 0.1369, + "step": 4022 + }, + { + "epoch": 1.9084440227703985, + "grad_norm": 1.7202457189559937, + "learning_rate": 1.0973729197350514e-07, + "loss": 0.1535, + "step": 4023 + }, + { + "epoch": 1.9089184060721063, + "grad_norm": 1.419438362121582, + "learning_rate": 1.0860512155621783e-07, + "loss": 0.1482, + "step": 4024 + }, + { + "epoch": 1.909392789373814, + "grad_norm": 1.293198585510254, + "learning_rate": 1.0747878993144667e-07, + "loss": 0.1156, + "step": 4025 + }, + { + "epoch": 1.9098671726755219, + "grad_norm": 1.2855961322784424, + "learning_rate": 1.0635829776405537e-07, + "loss": 0.1263, + "step": 4026 + }, + { + "epoch": 1.9103415559772297, + "grad_norm": 1.5536890029907227, + "learning_rate": 1.0524364571546042e-07, + "loss": 0.1277, + "step": 4027 + }, + { + "epoch": 1.9108159392789372, + "grad_norm": 1.8725106716156006, + "learning_rate": 1.041348344436277e-07, + "loss": 0.1618, + "step": 4028 + }, + { + "epoch": 1.911290322580645, + "grad_norm": 1.922491431236267, + "learning_rate": 1.0303186460307813e-07, + "loss": 0.1475, + "step": 4029 + }, + { + "epoch": 1.9117647058823528, + "grad_norm": 1.6635925769805908, + "learning_rate": 1.019347368448842e-07, + "loss": 0.1678, + "step": 4030 + }, + { + "epoch": 1.9122390891840606, + "grad_norm": 1.6013275384902954, + "learning_rate": 1.0084345181666899e-07, + "loss": 0.1227, + "step": 4031 + }, + { + "epoch": 1.9127134724857684, + "grad_norm": 1.4993422031402588, + "learning_rate": 9.975801016260834e-08, + "loss": 0.1227, + "step": 4032 + }, + { + "epoch": 1.9131878557874762, + "grad_norm": 1.8088953495025635, + "learning_rate": 9.867841252342747e-08, + "loss": 0.1511, + "step": 4033 + }, + { + "epoch": 1.913662239089184, + "grad_norm": 1.366092562675476, + "learning_rate": 9.76046595364022e-08, + "loss": 0.1118, + "step": 4034 + }, + { + "epoch": 1.9141366223908918, + "grad_norm": 1.6480141878128052, + "learning_rate": 9.65367518353577e-08, + "loss": 0.1707, + "step": 4035 + }, + { + "epoch": 1.9146110056925996, + "grad_norm": 1.374222755432129, + "learning_rate": 9.547469005066979e-08, + "loss": 0.1033, + "step": 4036 + }, + { + "epoch": 1.9150853889943074, + "grad_norm": 1.432600736618042, + "learning_rate": 9.441847480926247e-08, + "loss": 0.1277, + "step": 4037 + }, + { + "epoch": 1.9155597722960152, + "grad_norm": 1.8599258661270142, + "learning_rate": 9.336810673460928e-08, + "loss": 0.1714, + "step": 4038 + }, + { + "epoch": 1.916034155597723, + "grad_norm": 1.7460694313049316, + "learning_rate": 9.232358644673311e-08, + "loss": 0.141, + "step": 4039 + }, + { + "epoch": 1.9165085388994307, + "grad_norm": 1.5455800294876099, + "learning_rate": 9.12849145622019e-08, + "loss": 0.1159, + "step": 4040 + }, + { + "epoch": 1.9169829222011385, + "grad_norm": 1.9061877727508545, + "learning_rate": 9.025209169413629e-08, + "loss": 0.1449, + "step": 4041 + }, + { + "epoch": 1.9174573055028463, + "grad_norm": 1.8176791667938232, + "learning_rate": 8.922511845219972e-08, + "loss": 0.1377, + "step": 4042 + }, + { + "epoch": 1.9179316888045541, + "grad_norm": 1.5344781875610352, + "learning_rate": 8.820399544260283e-08, + "loss": 0.1394, + "step": 4043 + }, + { + "epoch": 1.918406072106262, + "grad_norm": 1.6512079238891602, + "learning_rate": 8.71887232681079e-08, + "loss": 0.1129, + "step": 4044 + }, + { + "epoch": 1.9188804554079697, + "grad_norm": 2.159686803817749, + "learning_rate": 8.617930252801665e-08, + "loss": 0.1642, + "step": 4045 + }, + { + "epoch": 1.9193548387096775, + "grad_norm": 1.8301466703414917, + "learning_rate": 8.517573381818245e-08, + "loss": 0.1814, + "step": 4046 + }, + { + "epoch": 1.9198292220113853, + "grad_norm": 1.6383099555969238, + "learning_rate": 8.417801773100032e-08, + "loss": 0.1308, + "step": 4047 + }, + { + "epoch": 1.920303605313093, + "grad_norm": 1.4430229663848877, + "learning_rate": 8.31861548554147e-08, + "loss": 0.1266, + "step": 4048 + }, + { + "epoch": 1.9207779886148009, + "grad_norm": 1.6547898054122925, + "learning_rate": 8.220014577690949e-08, + "loss": 0.1496, + "step": 4049 + }, + { + "epoch": 1.9212523719165087, + "grad_norm": 1.4231595993041992, + "learning_rate": 8.1219991077518e-08, + "loss": 0.1162, + "step": 4050 + }, + { + "epoch": 1.9217267552182165, + "grad_norm": 1.9546478986740112, + "learning_rate": 8.024569133581517e-08, + "loss": 0.1404, + "step": 4051 + }, + { + "epoch": 1.9222011385199242, + "grad_norm": 1.541332483291626, + "learning_rate": 7.927724712692098e-08, + "loss": 0.1339, + "step": 4052 + }, + { + "epoch": 1.922675521821632, + "grad_norm": 2.084918260574341, + "learning_rate": 7.831465902249701e-08, + "loss": 0.1533, + "step": 4053 + }, + { + "epoch": 1.9231499051233396, + "grad_norm": 1.7567437887191772, + "learning_rate": 7.7357927590751e-08, + "loss": 0.1356, + "step": 4054 + }, + { + "epoch": 1.9236242884250474, + "grad_norm": 1.616256833076477, + "learning_rate": 7.640705339643118e-08, + "loss": 0.1404, + "step": 4055 + }, + { + "epoch": 1.9240986717267552, + "grad_norm": 1.6991674900054932, + "learning_rate": 7.546203700082966e-08, + "loss": 0.1553, + "step": 4056 + }, + { + "epoch": 1.924573055028463, + "grad_norm": 1.4188369512557983, + "learning_rate": 7.452287896178134e-08, + "loss": 0.1166, + "step": 4057 + }, + { + "epoch": 1.9250474383301708, + "grad_norm": 1.779510498046875, + "learning_rate": 7.358957983365944e-08, + "loss": 0.1478, + "step": 4058 + }, + { + "epoch": 1.9255218216318786, + "grad_norm": 1.4632707834243774, + "learning_rate": 7.266214016738326e-08, + "loss": 0.1639, + "step": 4059 + }, + { + "epoch": 1.9259962049335864, + "grad_norm": 2.640392303466797, + "learning_rate": 7.174056051041045e-08, + "loss": 0.1873, + "step": 4060 + }, + { + "epoch": 1.9264705882352942, + "grad_norm": 2.0957512855529785, + "learning_rate": 7.082484140674029e-08, + "loss": 0.1609, + "step": 4061 + }, + { + "epoch": 1.926944971537002, + "grad_norm": 1.6926616430282593, + "learning_rate": 6.99149833969126e-08, + "loss": 0.1288, + "step": 4062 + }, + { + "epoch": 1.9274193548387095, + "grad_norm": 1.697201132774353, + "learning_rate": 6.901098701800779e-08, + "loss": 0.1569, + "step": 4063 + }, + { + "epoch": 1.9278937381404173, + "grad_norm": 1.5255944728851318, + "learning_rate": 6.811285280364677e-08, + "loss": 0.1255, + "step": 4064 + }, + { + "epoch": 1.928368121442125, + "grad_norm": 1.38019859790802, + "learning_rate": 6.722058128398768e-08, + "loss": 0.1255, + "step": 4065 + }, + { + "epoch": 1.928842504743833, + "grad_norm": 1.9084728956222534, + "learning_rate": 6.633417298573142e-08, + "loss": 0.1617, + "step": 4066 + }, + { + "epoch": 1.9293168880455407, + "grad_norm": 1.2430599927902222, + "learning_rate": 6.54536284321139e-08, + "loss": 0.1083, + "step": 4067 + }, + { + "epoch": 1.9297912713472485, + "grad_norm": 1.3795536756515503, + "learning_rate": 6.457894814291376e-08, + "loss": 0.1128, + "step": 4068 + }, + { + "epoch": 1.9302656546489563, + "grad_norm": 1.5916107892990112, + "learning_rate": 6.371013263444469e-08, + "loss": 0.1367, + "step": 4069 + }, + { + "epoch": 1.930740037950664, + "grad_norm": 1.4485095739364624, + "learning_rate": 6.284718241956089e-08, + "loss": 0.1246, + "step": 4070 + }, + { + "epoch": 1.9312144212523719, + "grad_norm": 1.8069593906402588, + "learning_rate": 6.199009800765265e-08, + "loss": 0.1763, + "step": 4071 + }, + { + "epoch": 1.9316888045540797, + "grad_norm": 1.608431339263916, + "learning_rate": 6.113887990464862e-08, + "loss": 0.1233, + "step": 4072 + }, + { + "epoch": 1.9321631878557874, + "grad_norm": 1.4641162157058716, + "learning_rate": 6.029352861301462e-08, + "loss": 0.127, + "step": 4073 + }, + { + "epoch": 1.9326375711574952, + "grad_norm": 1.5734367370605469, + "learning_rate": 5.945404463175375e-08, + "loss": 0.1413, + "step": 4074 + }, + { + "epoch": 1.933111954459203, + "grad_norm": 1.354025959968567, + "learning_rate": 5.862042845640403e-08, + "loss": 0.1094, + "step": 4075 + }, + { + "epoch": 1.9335863377609108, + "grad_norm": 1.6101434230804443, + "learning_rate": 5.779268057904186e-08, + "loss": 0.1551, + "step": 4076 + }, + { + "epoch": 1.9340607210626186, + "grad_norm": 1.3197808265686035, + "learning_rate": 5.6970801488276385e-08, + "loss": 0.1253, + "step": 4077 + }, + { + "epoch": 1.9345351043643264, + "grad_norm": 1.7028498649597168, + "learning_rate": 5.61547916692573e-08, + "loss": 0.1485, + "step": 4078 + }, + { + "epoch": 1.9350094876660342, + "grad_norm": 1.6578190326690674, + "learning_rate": 5.534465160366598e-08, + "loss": 0.1275, + "step": 4079 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 1.70431649684906, + "learning_rate": 5.454038176971987e-08, + "loss": 0.1387, + "step": 4080 + }, + { + "epoch": 1.9359582542694498, + "grad_norm": 1.3666795492172241, + "learning_rate": 5.3741982642173675e-08, + "loss": 0.1215, + "step": 4081 + }, + { + "epoch": 1.9364326375711576, + "grad_norm": 1.2814855575561523, + "learning_rate": 5.294945469231039e-08, + "loss": 0.1162, + "step": 4082 + }, + { + "epoch": 1.9369070208728654, + "grad_norm": 1.7028858661651611, + "learning_rate": 5.2162798387954686e-08, + "loss": 0.1455, + "step": 4083 + }, + { + "epoch": 1.9373814041745732, + "grad_norm": 1.8575924634933472, + "learning_rate": 5.1382014193461783e-08, + "loss": 0.1607, + "step": 4084 + }, + { + "epoch": 1.937855787476281, + "grad_norm": 1.5589218139648438, + "learning_rate": 5.0607102569718566e-08, + "loss": 0.1597, + "step": 4085 + }, + { + "epoch": 1.9383301707779887, + "grad_norm": 1.469462275505066, + "learning_rate": 4.9838063974150255e-08, + "loss": 0.1403, + "step": 4086 + }, + { + "epoch": 1.9388045540796965, + "grad_norm": 1.900517225265503, + "learning_rate": 4.9074898860711485e-08, + "loss": 0.1851, + "step": 4087 + }, + { + "epoch": 1.9392789373814043, + "grad_norm": 1.6202280521392822, + "learning_rate": 4.83176076798908e-08, + "loss": 0.1457, + "step": 4088 + }, + { + "epoch": 1.939753320683112, + "grad_norm": 1.608129620552063, + "learning_rate": 4.7566190878710615e-08, + "loss": 0.1075, + "step": 4089 + }, + { + "epoch": 1.9402277039848197, + "grad_norm": 1.531738519668579, + "learning_rate": 4.6820648900725016e-08, + "loss": 0.1195, + "step": 4090 + }, + { + "epoch": 1.9407020872865275, + "grad_norm": 1.6448765993118286, + "learning_rate": 4.608098218601864e-08, + "loss": 0.1508, + "step": 4091 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 1.4417756795883179, + "learning_rate": 4.5347191171211114e-08, + "loss": 0.1439, + "step": 4092 + }, + { + "epoch": 1.941650853889943, + "grad_norm": 1.6759575605392456, + "learning_rate": 4.4619276289450394e-08, + "loss": 0.1487, + "step": 4093 + }, + { + "epoch": 1.9421252371916509, + "grad_norm": 1.6265183687210083, + "learning_rate": 4.3897237970418336e-08, + "loss": 0.1338, + "step": 4094 + }, + { + "epoch": 1.9425996204933587, + "grad_norm": 1.5453293323516846, + "learning_rate": 4.318107664032622e-08, + "loss": 0.1366, + "step": 4095 + }, + { + "epoch": 1.9430740037950665, + "grad_norm": 1.4695607423782349, + "learning_rate": 4.247079272191812e-08, + "loss": 0.1329, + "step": 4096 + }, + { + "epoch": 1.9435483870967742, + "grad_norm": 1.945720911026001, + "learning_rate": 4.1766386634467523e-08, + "loss": 0.169, + "step": 4097 + }, + { + "epoch": 1.9440227703984818, + "grad_norm": 1.592432975769043, + "learning_rate": 4.1067858793777394e-08, + "loss": 0.1179, + "step": 4098 + }, + { + "epoch": 1.9444971537001896, + "grad_norm": 1.6521458625793457, + "learning_rate": 4.037520961218233e-08, + "loss": 0.1522, + "step": 4099 + }, + { + "epoch": 1.9449715370018974, + "grad_norm": 1.886146903038025, + "learning_rate": 3.96884394985475e-08, + "loss": 0.1636, + "step": 4100 + }, + { + "epoch": 1.9454459203036052, + "grad_norm": 1.8324472904205322, + "learning_rate": 3.900754885826419e-08, + "loss": 0.1596, + "step": 4101 + }, + { + "epoch": 1.945920303605313, + "grad_norm": 1.935429573059082, + "learning_rate": 3.833253809325643e-08, + "loss": 0.171, + "step": 4102 + }, + { + "epoch": 1.9463946869070208, + "grad_norm": 1.5601022243499756, + "learning_rate": 3.766340760197662e-08, + "loss": 0.1629, + "step": 4103 + }, + { + "epoch": 1.9468690702087286, + "grad_norm": 1.2988762855529785, + "learning_rate": 3.700015777940547e-08, + "loss": 0.1098, + "step": 4104 + }, + { + "epoch": 1.9473434535104364, + "grad_norm": 1.6322436332702637, + "learning_rate": 3.634278901705424e-08, + "loss": 0.167, + "step": 4105 + }, + { + "epoch": 1.9478178368121442, + "grad_norm": 1.707542061805725, + "learning_rate": 3.56913017029592e-08, + "loss": 0.1659, + "step": 4106 + }, + { + "epoch": 1.948292220113852, + "grad_norm": 1.5153127908706665, + "learning_rate": 3.50456962216883e-08, + "loss": 0.1431, + "step": 4107 + }, + { + "epoch": 1.9487666034155597, + "grad_norm": 1.735724687576294, + "learning_rate": 3.4405972954334454e-08, + "loss": 0.1478, + "step": 4108 + }, + { + "epoch": 1.9492409867172675, + "grad_norm": 1.4995523691177368, + "learning_rate": 3.3772132278522276e-08, + "loss": 0.1218, + "step": 4109 + }, + { + "epoch": 1.9497153700189753, + "grad_norm": 1.6257517337799072, + "learning_rate": 3.3144174568399135e-08, + "loss": 0.1306, + "step": 4110 + }, + { + "epoch": 1.9501897533206831, + "grad_norm": 1.814677357673645, + "learning_rate": 3.252210019464408e-08, + "loss": 0.1423, + "step": 4111 + }, + { + "epoch": 1.950664136622391, + "grad_norm": 1.7298712730407715, + "learning_rate": 3.190590952446115e-08, + "loss": 0.1366, + "step": 4112 + }, + { + "epoch": 1.9511385199240987, + "grad_norm": 1.5716830492019653, + "learning_rate": 3.129560292158051e-08, + "loss": 0.1091, + "step": 4113 + }, + { + "epoch": 1.9516129032258065, + "grad_norm": 1.3613415956497192, + "learning_rate": 3.069118074626176e-08, + "loss": 0.1238, + "step": 4114 + }, + { + "epoch": 1.9520872865275143, + "grad_norm": 1.6825555562973022, + "learning_rate": 3.0092643355287274e-08, + "loss": 0.1441, + "step": 4115 + }, + { + "epoch": 1.952561669829222, + "grad_norm": 1.650367021560669, + "learning_rate": 2.9499991101969995e-08, + "loss": 0.156, + "step": 4116 + }, + { + "epoch": 1.9530360531309299, + "grad_norm": 1.4215662479400635, + "learning_rate": 2.8913224336145628e-08, + "loss": 0.1157, + "step": 4117 + }, + { + "epoch": 1.9535104364326377, + "grad_norm": 1.7951689958572388, + "learning_rate": 2.8332343404177122e-08, + "loss": 0.1527, + "step": 4118 + }, + { + "epoch": 1.9539848197343455, + "grad_norm": 1.3944616317749023, + "learning_rate": 2.7757348648951298e-08, + "loss": 0.1123, + "step": 4119 + }, + { + "epoch": 1.9544592030360532, + "grad_norm": 1.5503982305526733, + "learning_rate": 2.7188240409883325e-08, + "loss": 0.1412, + "step": 4120 + }, + { + "epoch": 1.954933586337761, + "grad_norm": 1.899158000946045, + "learning_rate": 2.6625019022912256e-08, + "loss": 0.1543, + "step": 4121 + }, + { + "epoch": 1.9554079696394688, + "grad_norm": 1.7116118669509888, + "learning_rate": 2.606768482050215e-08, + "loss": 0.1359, + "step": 4122 + }, + { + "epoch": 1.9558823529411766, + "grad_norm": 1.7955915927886963, + "learning_rate": 2.5516238131640945e-08, + "loss": 0.1608, + "step": 4123 + }, + { + "epoch": 1.9563567362428842, + "grad_norm": 1.3384437561035156, + "learning_rate": 2.4970679281842715e-08, + "loss": 0.1349, + "step": 4124 + }, + { + "epoch": 1.956831119544592, + "grad_norm": 1.5795972347259521, + "learning_rate": 2.443100859314429e-08, + "loss": 0.131, + "step": 4125 + }, + { + "epoch": 1.9573055028462998, + "grad_norm": 1.428765892982483, + "learning_rate": 2.389722638410974e-08, + "loss": 0.1195, + "step": 4126 + }, + { + "epoch": 1.9577798861480076, + "grad_norm": 1.754357933998108, + "learning_rate": 2.3369332969824798e-08, + "loss": 0.1282, + "step": 4127 + }, + { + "epoch": 1.9582542694497154, + "grad_norm": 1.8691462278366089, + "learning_rate": 2.2847328661900203e-08, + "loss": 0.1735, + "step": 4128 + }, + { + "epoch": 1.9587286527514232, + "grad_norm": 1.630744218826294, + "learning_rate": 2.2331213768468363e-08, + "loss": 0.119, + "step": 4129 + }, + { + "epoch": 1.959203036053131, + "grad_norm": 1.8132566213607788, + "learning_rate": 2.1820988594187796e-08, + "loss": 0.1588, + "step": 4130 + }, + { + "epoch": 1.9596774193548387, + "grad_norm": 1.4721542596817017, + "learning_rate": 2.131665344023981e-08, + "loss": 0.1301, + "step": 4131 + }, + { + "epoch": 1.9601518026565465, + "grad_norm": 1.846588373184204, + "learning_rate": 2.0818208604328482e-08, + "loss": 0.1776, + "step": 4132 + }, + { + "epoch": 1.960626185958254, + "grad_norm": 1.6939235925674438, + "learning_rate": 2.032565438067957e-08, + "loss": 0.163, + "step": 4133 + }, + { + "epoch": 1.961100569259962, + "grad_norm": 1.7470312118530273, + "learning_rate": 1.9838991060043833e-08, + "loss": 0.1481, + "step": 4134 + }, + { + "epoch": 1.9615749525616697, + "grad_norm": 1.7479218244552612, + "learning_rate": 1.9358218929693695e-08, + "loss": 0.1545, + "step": 4135 + }, + { + "epoch": 1.9620493358633775, + "grad_norm": 1.5178728103637695, + "learning_rate": 1.8883338273425478e-08, + "loss": 0.134, + "step": 4136 + }, + { + "epoch": 1.9625237191650853, + "grad_norm": 1.7611669301986694, + "learning_rate": 1.8414349371553842e-08, + "loss": 0.1345, + "step": 4137 + }, + { + "epoch": 1.962998102466793, + "grad_norm": 1.5717504024505615, + "learning_rate": 1.7951252500920668e-08, + "loss": 0.1209, + "step": 4138 + }, + { + "epoch": 1.9634724857685009, + "grad_norm": 1.52717924118042, + "learning_rate": 1.7494047934885073e-08, + "loss": 0.1201, + "step": 4139 + }, + { + "epoch": 1.9639468690702087, + "grad_norm": 1.8689799308776855, + "learning_rate": 1.7042735943333388e-08, + "loss": 0.1422, + "step": 4140 + }, + { + "epoch": 1.9644212523719164, + "grad_norm": 1.4372001886367798, + "learning_rate": 1.659731679266807e-08, + "loss": 0.123, + "step": 4141 + }, + { + "epoch": 1.9648956356736242, + "grad_norm": 1.98843514919281, + "learning_rate": 1.6157790745817692e-08, + "loss": 0.1896, + "step": 4142 + }, + { + "epoch": 1.965370018975332, + "grad_norm": 1.4792766571044922, + "learning_rate": 1.5724158062228046e-08, + "loss": 0.1424, + "step": 4143 + }, + { + "epoch": 1.9658444022770398, + "grad_norm": 1.795911192893982, + "learning_rate": 1.5296418997869932e-08, + "loss": 0.1412, + "step": 4144 + }, + { + "epoch": 1.9663187855787476, + "grad_norm": 1.8548011779785156, + "learning_rate": 1.4874573805232495e-08, + "loss": 0.1624, + "step": 4145 + }, + { + "epoch": 1.9667931688804554, + "grad_norm": 2.306521415710449, + "learning_rate": 1.4458622733327654e-08, + "loss": 0.1668, + "step": 4146 + }, + { + "epoch": 1.9672675521821632, + "grad_norm": 2.005544900894165, + "learning_rate": 1.4048566027685673e-08, + "loss": 0.1667, + "step": 4147 + }, + { + "epoch": 1.967741935483871, + "grad_norm": 2.2230448722839355, + "learning_rate": 1.3644403930360706e-08, + "loss": 0.1654, + "step": 4148 + }, + { + "epoch": 1.9682163187855788, + "grad_norm": 1.799142837524414, + "learning_rate": 1.3246136679925249e-08, + "loss": 0.138, + "step": 4149 + }, + { + "epoch": 1.9686907020872866, + "grad_norm": 1.5519241094589233, + "learning_rate": 1.2853764511471245e-08, + "loss": 0.114, + "step": 4150 + }, + { + "epoch": 1.9691650853889944, + "grad_norm": 1.872344732284546, + "learning_rate": 1.2467287656613425e-08, + "loss": 0.1696, + "step": 4151 + }, + { + "epoch": 1.9696394686907022, + "grad_norm": 2.0996015071868896, + "learning_rate": 1.2086706343484855e-08, + "loss": 0.1841, + "step": 4152 + }, + { + "epoch": 1.97011385199241, + "grad_norm": 1.880236268043518, + "learning_rate": 1.1712020796738056e-08, + "loss": 0.1657, + "step": 4153 + }, + { + "epoch": 1.9705882352941178, + "grad_norm": 1.773377537727356, + "learning_rate": 1.1343231237548324e-08, + "loss": 0.1686, + "step": 4154 + }, + { + "epoch": 1.9710626185958255, + "grad_norm": 2.0221047401428223, + "learning_rate": 1.0980337883605973e-08, + "loss": 0.1789, + "step": 4155 + }, + { + "epoch": 1.9715370018975333, + "grad_norm": 1.4797416925430298, + "learning_rate": 1.0623340949125204e-08, + "loss": 0.1164, + "step": 4156 + }, + { + "epoch": 1.9720113851992411, + "grad_norm": 1.4807063341140747, + "learning_rate": 1.027224064483745e-08, + "loss": 0.1294, + "step": 4157 + }, + { + "epoch": 1.972485768500949, + "grad_norm": 1.8948291540145874, + "learning_rate": 9.927037177993593e-09, + "loss": 0.1705, + "step": 4158 + }, + { + "epoch": 1.9729601518026565, + "grad_norm": 1.3457918167114258, + "learning_rate": 9.587730752362855e-09, + "loss": 0.1067, + "step": 4159 + }, + { + "epoch": 1.9734345351043643, + "grad_norm": 1.3780224323272705, + "learning_rate": 9.254321568236135e-09, + "loss": 0.1239, + "step": 4160 + }, + { + "epoch": 1.973908918406072, + "grad_norm": 1.7696219682693481, + "learning_rate": 8.926809822420446e-09, + "loss": 0.1581, + "step": 4161 + }, + { + "epoch": 1.9743833017077799, + "grad_norm": 1.508495807647705, + "learning_rate": 8.605195708242254e-09, + "loss": 0.1172, + "step": 4162 + }, + { + "epoch": 1.9748576850094877, + "grad_norm": 1.7179173231124878, + "learning_rate": 8.289479415548585e-09, + "loss": 0.1668, + "step": 4163 + }, + { + "epoch": 1.9753320683111955, + "grad_norm": 1.8635634183883667, + "learning_rate": 7.979661130703697e-09, + "loss": 0.148, + "step": 4164 + }, + { + "epoch": 1.9758064516129032, + "grad_norm": 1.8939173221588135, + "learning_rate": 7.67574103658797e-09, + "loss": 0.1754, + "step": 4165 + }, + { + "epoch": 1.976280834914611, + "grad_norm": 1.3412449359893799, + "learning_rate": 7.377719312605669e-09, + "loss": 0.1145, + "step": 4166 + }, + { + "epoch": 1.9767552182163188, + "grad_norm": 1.482282280921936, + "learning_rate": 7.085596134673855e-09, + "loss": 0.1171, + "step": 4167 + }, + { + "epoch": 1.9772296015180264, + "grad_norm": 1.7333260774612427, + "learning_rate": 6.799371675230149e-09, + "loss": 0.154, + "step": 4168 + }, + { + "epoch": 1.9777039848197342, + "grad_norm": 2.1031572818756104, + "learning_rate": 6.5190461032305085e-09, + "loss": 0.1674, + "step": 4169 + }, + { + "epoch": 1.978178368121442, + "grad_norm": 2.2297379970550537, + "learning_rate": 6.244619584148126e-09, + "loss": 0.1709, + "step": 4170 + }, + { + "epoch": 1.9786527514231498, + "grad_norm": 1.7986024618148804, + "learning_rate": 5.976092279974533e-09, + "loss": 0.1359, + "step": 4171 + }, + { + "epoch": 1.9791271347248576, + "grad_norm": 1.8482614755630493, + "learning_rate": 5.713464349218489e-09, + "loss": 0.1645, + "step": 4172 + }, + { + "epoch": 1.9796015180265654, + "grad_norm": 1.9954028129577637, + "learning_rate": 5.456735946907099e-09, + "loss": 0.15, + "step": 4173 + }, + { + "epoch": 1.9800759013282732, + "grad_norm": 1.6475908756256104, + "learning_rate": 5.205907224583584e-09, + "loss": 0.1412, + "step": 4174 + }, + { + "epoch": 1.980550284629981, + "grad_norm": 1.6490331888198853, + "learning_rate": 4.960978330310618e-09, + "loss": 0.1739, + "step": 4175 + }, + { + "epoch": 1.9810246679316887, + "grad_norm": 1.4961657524108887, + "learning_rate": 4.721949408666993e-09, + "loss": 0.1371, + "step": 4176 + }, + { + "epoch": 1.9814990512333965, + "grad_norm": 2.557554006576538, + "learning_rate": 4.488820600749844e-09, + "loss": 0.2002, + "step": 4177 + }, + { + "epoch": 1.9819734345351043, + "grad_norm": 1.2743830680847168, + "learning_rate": 4.261592044171314e-09, + "loss": 0.1035, + "step": 4178 + }, + { + "epoch": 1.9824478178368121, + "grad_norm": 2.122196674346924, + "learning_rate": 4.040263873063e-09, + "loss": 0.1985, + "step": 4179 + }, + { + "epoch": 1.98292220113852, + "grad_norm": 2.3841426372528076, + "learning_rate": 3.824836218072614e-09, + "loss": 0.1923, + "step": 4180 + }, + { + "epoch": 1.9833965844402277, + "grad_norm": 1.5701267719268799, + "learning_rate": 3.615309206365103e-09, + "loss": 0.12, + "step": 4181 + }, + { + "epoch": 1.9838709677419355, + "grad_norm": 1.7213834524154663, + "learning_rate": 3.411682961621532e-09, + "loss": 0.1528, + "step": 4182 + }, + { + "epoch": 1.9843453510436433, + "grad_norm": 1.8331454992294312, + "learning_rate": 3.2139576040413067e-09, + "loss": 0.1536, + "step": 4183 + }, + { + "epoch": 1.984819734345351, + "grad_norm": 1.744813084602356, + "learning_rate": 3.0221332503399534e-09, + "loss": 0.1609, + "step": 4184 + }, + { + "epoch": 1.9852941176470589, + "grad_norm": 1.6270570755004883, + "learning_rate": 2.8362100137491187e-09, + "loss": 0.1473, + "step": 4185 + }, + { + "epoch": 1.9857685009487667, + "grad_norm": 1.7102569341659546, + "learning_rate": 2.656188004016569e-09, + "loss": 0.1371, + "step": 4186 + }, + { + "epoch": 1.9862428842504745, + "grad_norm": 1.3755574226379395, + "learning_rate": 2.4820673274095207e-09, + "loss": 0.1176, + "step": 4187 + }, + { + "epoch": 1.9867172675521823, + "grad_norm": 1.6135125160217285, + "learning_rate": 2.3138480867079814e-09, + "loss": 0.1593, + "step": 4188 + }, + { + "epoch": 1.98719165085389, + "grad_norm": 1.431528925895691, + "learning_rate": 2.1515303812091883e-09, + "loss": 0.1384, + "step": 4189 + }, + { + "epoch": 1.9876660341555978, + "grad_norm": 1.3943769931793213, + "learning_rate": 1.9951143067309385e-09, + "loss": 0.1049, + "step": 4190 + }, + { + "epoch": 1.9881404174573056, + "grad_norm": 1.5564050674438477, + "learning_rate": 1.8445999556016003e-09, + "loss": 0.1207, + "step": 4191 + }, + { + "epoch": 1.9886148007590134, + "grad_norm": 1.4701181650161743, + "learning_rate": 1.6999874166678809e-09, + "loss": 0.1306, + "step": 4192 + }, + { + "epoch": 1.9890891840607212, + "grad_norm": 2.03861141204834, + "learning_rate": 1.561276775295939e-09, + "loss": 0.1344, + "step": 4193 + }, + { + "epoch": 1.989563567362429, + "grad_norm": 1.9190205335617065, + "learning_rate": 1.4284681133625022e-09, + "loss": 0.1536, + "step": 4194 + }, + { + "epoch": 1.9900379506641366, + "grad_norm": 1.4114086627960205, + "learning_rate": 1.301561509263749e-09, + "loss": 0.1194, + "step": 4195 + }, + { + "epoch": 1.9905123339658444, + "grad_norm": 1.5192102193832397, + "learning_rate": 1.1805570379130882e-09, + "loss": 0.1374, + "step": 4196 + }, + { + "epoch": 1.9909867172675522, + "grad_norm": 1.6114188432693481, + "learning_rate": 1.0654547707367179e-09, + "loss": 0.1747, + "step": 4197 + }, + { + "epoch": 1.99146110056926, + "grad_norm": 1.4619154930114746, + "learning_rate": 9.562547756780672e-10, + "loss": 0.1304, + "step": 4198 + }, + { + "epoch": 1.9919354838709677, + "grad_norm": 1.5712562799453735, + "learning_rate": 8.529571171977946e-10, + "loss": 0.1691, + "step": 4199 + }, + { + "epoch": 1.9924098671726755, + "grad_norm": 1.8165565729141235, + "learning_rate": 7.555618562715695e-10, + "loss": 0.1524, + "step": 4200 + }, + { + "epoch": 1.9928842504743833, + "grad_norm": 1.9051848649978638, + "learning_rate": 6.640690503889601e-10, + "loss": 0.1687, + "step": 4201 + }, + { + "epoch": 1.9933586337760911, + "grad_norm": 1.6580159664154053, + "learning_rate": 5.784787535600966e-10, + "loss": 0.1748, + "step": 4202 + }, + { + "epoch": 1.9938330170777987, + "grad_norm": 1.8379294872283936, + "learning_rate": 4.987910163067878e-10, + "loss": 0.127, + "step": 4203 + }, + { + "epoch": 1.9943074003795065, + "grad_norm": 1.5118616819381714, + "learning_rate": 4.2500588566696254e-10, + "loss": 0.1409, + "step": 4204 + }, + { + "epoch": 1.9947817836812143, + "grad_norm": 1.5703513622283936, + "learning_rate": 3.5712340519689083e-10, + "loss": 0.1385, + "step": 4205 + }, + { + "epoch": 1.995256166982922, + "grad_norm": 1.7153023481369019, + "learning_rate": 2.9514361496563173e-10, + "loss": 0.1148, + "step": 4206 + }, + { + "epoch": 1.9957305502846299, + "grad_norm": 1.4218897819519043, + "learning_rate": 2.39066551560585e-10, + "loss": 0.1276, + "step": 4207 + }, + { + "epoch": 1.9962049335863377, + "grad_norm": 1.6402268409729004, + "learning_rate": 1.8889224808193995e-10, + "loss": 0.1318, + "step": 4208 + }, + { + "epoch": 1.9966793168880455, + "grad_norm": 1.7963615655899048, + "learning_rate": 1.446207341482264e-10, + "loss": 0.1481, + "step": 4209 + }, + { + "epoch": 1.9971537001897532, + "grad_norm": 1.9880362749099731, + "learning_rate": 1.0625203589187394e-10, + "loss": 0.1913, + "step": 4210 + }, + { + "epoch": 1.997628083491461, + "grad_norm": 1.616383671760559, + "learning_rate": 7.378617596143223e-11, + "loss": 0.1222, + "step": 4211 + }, + { + "epoch": 1.9981024667931688, + "grad_norm": 1.5262385606765747, + "learning_rate": 4.722317352157113e-11, + "loss": 0.1253, + "step": 4212 + }, + { + "epoch": 1.9985768500948766, + "grad_norm": 1.7264385223388672, + "learning_rate": 2.656304425308065e-11, + "loss": 0.1395, + "step": 4213 + }, + { + "epoch": 1.9990512333965844, + "grad_norm": 1.826885461807251, + "learning_rate": 1.1805800349540264e-11, + "loss": 0.1372, + "step": 4214 + }, + { + "epoch": 1.9995256166982922, + "grad_norm": 1.5402823686599731, + "learning_rate": 2.9514505228700473e-12, + "loss": 0.1458, + "step": 4215 + }, + { + "epoch": 2.0, + "grad_norm": 1.4708012342453003, + "learning_rate": 0.0, + "loss": 0.1223, + "step": 4216 + } + ], + "logging_steps": 1.0, + "max_steps": 4216, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "total_flos": 8.236846139047936e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}